def __init__(self, large, model_name, temp_dir, finetune=False):
    super(Bert, self).__init__()
    if model_name == 'bert':
        if large:
            self.model = BertModel.from_pretrained('bert-large-uncased', cache_dir=temp_dir)
        else:
            self.model = BertModel.from_pretrained('bert-base-uncased', cache_dir=temp_dir)
    elif model_name == 'scibert':
        self.model = BertModel.from_pretrained('allenai/scibert_scivocab_uncased', cache_dir=temp_dir)
    elif model_name == 'longformer':
        if large:
            self.model = LongformerModel.from_pretrained('allenai/longformer-large-4096', cache_dir=temp_dir)
        else:
            self.model = LongformerModel.from_pretrained('allenai/longformer-base-4096', cache_dir=temp_dir)
    self.model_name = model_name
    self.finetune = finetune
def load_torch_model(model_name, device):
    torch_model_name_or_dir = (PRETRAINED_LONGFORMER_MODELS[model_name]
                               if model_name in PRETRAINED_LONGFORMER_MODELS else model_name)
    model = LongformerModel.from_pretrained(torch_model_name_or_dir)
    model.to(device)
    return model
def __init__(self, config_path):
    config = configparser.ConfigParser()
    config.read(config_path)

    self.save_dir = Path(config.get("general", "save_dir"))
    if not self.save_dir.exists():
        self.save_dir.mkdir(parents=True)
    self.clf_th = config.getfloat("general", "clf_th")

    self.mlp_model_path = config.get("model", "mlp")
    assert Path(self.mlp_model_path).exists()
    self.device = "cuda" if torch.cuda.is_available() else "cpu"

    bert_config_path = config.get("bert", "config_path")
    assert Path(bert_config_path).exists()
    self.bert_config = LongformerConfig.from_json_file(bert_config_path)
    self.max_seq_length = self.bert_config.max_position_embeddings - 2
    self.bert_tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
    # bert_tokenizer_path = config.get("bert", "tokenizer_path")
    # assert Path(bert_config_path).exists()
    # self.bert_tokenizer = LongformerTokenizer.from_pretrained(bert_tokenizer_path)
    bert_model_path = config.get("bert", "model_path")
    assert Path(bert_model_path).exists()
    self.bert_model = LongformerModel.from_pretrained(bert_model_path, config=self.bert_config)
    self.bert_model.to(self.device)
    self.bert_model.eval()

    gold_dir = Path(config.get("data", "gold_dir"))
    assert Path(gold_dir).exists()
    self.gold_dataset = ConllDataset(gold_dir)
    target_dir = Path(config.get("data", "target_dir"))
    assert Path(target_dir).exists()
    self.target_dataset = ConllDataset(target_dir)
def convert_longformer_qa_checkpoint_to_pytorch(longformer_model: str,
                                                longformer_question_answering_ckpt_path: str,
                                                pytorch_dump_folder_path: str):
    # load longformer model from model identifier
    longformer = LongformerModel.from_pretrained(longformer_model)
    lightning_model = LightningModel(longformer)

    ckpt = torch.load(longformer_question_answering_ckpt_path, map_location=torch.device("cpu"))
    lightning_model.load_state_dict(ckpt["state_dict"])

    # init longformer question answering model
    longformer_for_qa = LongformerForQuestionAnswering.from_pretrained(longformer_model)

    # transfer weights
    longformer_for_qa.longformer.load_state_dict(lightning_model.model.state_dict())
    longformer_for_qa.qa_outputs.load_state_dict(lightning_model.qa_outputs.state_dict())
    longformer_for_qa.eval()

    # save model
    longformer_for_qa.save_pretrained(pytorch_dump_folder_path)

    print(f"Conversion successful. Model saved under {pytorch_dump_folder_path}")
def __init__(self, pretrained: str, max_query_len: int, max_doc_len: int,
             mode: str = 'cls', task: str = 'ranking') -> None:
    super(LongformerMaxp, self).__init__()
    self._pretrained = pretrained
    self._max_query_len = max_query_len
    self._max_doc_len = max_doc_len
    self._mode = mode
    self._task = task

    self._config = LongformerConfig.from_pretrained(self._pretrained)
    self._config.attention_mode = 'sliding_chunks'
    self._config.gradient_checkpointing = True  # boolean flag, not the string 'True'
    # print("attention_mode: " + self._config.attention_mode)
    self._model = LongformerModel.from_pretrained(self._pretrained, config=self._config)
    self._activation = nn.ReLU()
    self.dense = nn.Linear(self._config.hidden_size, 128)
    self.dropout = nn.Dropout(self._config.hidden_dropout_prob)
    self.out_proj = nn.Linear(128, 2)
    if self._task == 'ranking':
        self._dense2 = nn.Linear(128, 1)
    elif self._task == 'classification':
        self._dense2 = nn.Linear(128, 2)
    else:
        raise ValueError('Task must be `ranking` or `classification`.')
def main(dataset_directory, jsonlines_filename):
    dataset, ids, images = extract_article_list(os.path.join(dataset_directory, jsonlines_filename))
    print(f'Len dataset = {len(dataset)}')

    text_model = LongformerModel.from_pretrained("allenai/longformer-base-4096").to("cuda")
    text_model.eval()
    tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")

    # pool = Pool(processes=48)
    # processed_text = list(tqdm(pool.map(process_text, dataset), total=len(dataset)))
    # pool.close()

    batch_size = 8
    # np.float is deprecated; use the built-in float (64-bit) instead
    all_embeddings_avg = np.zeros((len(dataset), 768), dtype=float)
    for i, chunk in tqdm(enumerate(chunks(dataset, batch_size)), total=len(dataset) / batch_size):
        with torch.no_grad():
            tokenized_text = tokenizer(chunk, return_tensors="pt", truncation=True, padding="max_length")
            model_out = text_model(**(tokenized_text.to("cuda")))
            all_embeddings_avg[i * batch_size:i * batch_size + len(chunk), :] = torch.mean(
                model_out[0], dim=1).cpu().numpy()

    data_df = pd.DataFrame(zip(ids, images, all_embeddings_avg))
    data_df.to_pickle(
        os.path.join(dataset_directory, f"longformer_{jsonlines_filename.split('.')[0]}.pkl"))
def __init__(self, model_name='', n_class=50, probing=False):
    super().__init__()
    # Transformers encoder
    if model_name == "Bert_base":
        self.model = BertModel.from_pretrained('bert-base-uncased')
    elif model_name == "Longformer_base":
        self.model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
    else:
        self.model = AutoModel.from_pretrained(model_name)  # !!! different layers

    self.probing = probing
    if self.probing:
        for child in self.model.children():
            for param in child.parameters():
                param.requires_grad = False

    # hyperparams
    self.model_name = model_name
    self.c = n_class
    self.hid = self.model.config.hidden_size

    # model blocks
    self.fc = nn.Linear(self.hid, self.c)
def test_layer_local_attn(self):
    model = LongformerModel.from_pretrained("patrickvonplaten/longformer-random-tiny")
    model.eval()
    layer = model.encoder.layer[0].attention.self.to(torch_device)
    hidden_states = self._get_hidden_states()
    batch_size, seq_length, hidden_size = hidden_states.size()
    attention_mask = torch.zeros((batch_size, seq_length), dtype=torch.float32, device=torch_device)
    attention_mask[:, -2:] = -10000

    is_index_masked = attention_mask < 0
    is_index_global_attn = attention_mask > 0
    is_global_attn = is_index_global_attn.flatten().any().item()

    output_hidden_states = layer(
        hidden_states,
        attention_mask=attention_mask,
        is_index_masked=is_index_masked,
        is_index_global_attn=is_index_global_attn,
        is_global_attn=is_global_attn,
    )[0]

    # assertEqual, not assertTrue: the original call never actually compared the shapes
    self.assertEqual(output_hidden_states.shape, (1, 4, 8))
    self.assertTrue(
        torch.allclose(
            output_hidden_states[0, 1],
            torch.tensor(
                [0.0019, 0.0122, -0.0171, -0.0256, -0.0300, 0.0173, -0.0115, 0.0048],
                dtype=torch.float32,
                device=torch_device,
            ),
            atol=1e-3,
        )
    )
def test_layer_global_attn(self):
    model = LongformerModel.from_pretrained("patrickvonplaten/longformer-random-tiny")
    model.eval()
    layer = model.encoder.layer[0].attention.self.to(torch_device)
    hidden_states = torch.cat([self._get_hidden_states(), self._get_hidden_states() - 0.5], dim=0)
    batch_size, seq_length, hidden_size = hidden_states.size()
    attention_mask = torch.zeros((batch_size, seq_length), dtype=torch.float32, device=torch_device)

    # create attn mask
    attention_mask[0, -2:] = 10000.0
    attention_mask[0, -1:] = -10000.0
    attention_mask[1, 1:] = 10000.0

    is_index_masked = attention_mask < 0
    is_index_global_attn = attention_mask > 0
    is_global_attn = is_index_global_attn.flatten().any().item()

    output_hidden_states = layer(
        hidden_states,
        attention_mask=attention_mask,
        is_index_masked=is_index_masked,
        is_index_global_attn=is_index_global_attn,
        is_global_attn=is_global_attn,
    )[0]

    # assertEqual, not assertTrue: the original call never actually compared the shapes
    self.assertEqual(output_hidden_states.shape, (2, 4, 8))

    self.assertTrue(
        torch.allclose(
            output_hidden_states[0, 2],
            torch.tensor(
                [-0.0651, -0.0393, 0.0309, -0.0342, -0.0066, -0.0155, -0.0209, -0.0494],
                dtype=torch.float32,
                device=torch_device,
            ),
            atol=1e-3,
        )
    )
    self.assertTrue(
        torch.allclose(
            output_hidden_states[1, -2],
            torch.tensor(
                [-0.0405, -0.0384, 0.0396, -0.0374, -0.0341, 0.0136, 0.0014, -0.0571],
                dtype=torch.float32,
                device=torch_device,
            ),
            atol=1e-3,
        )
    )
def test_inference_no_head_long(self):
    model = LongformerModel.from_pretrained("allenai/longformer-base-4096")
    model.to(torch_device)

    # 'Hello world! ' repeated 1000 times
    input_ids = torch.tensor([[0] + [20920, 232, 328, 1437] * 1000 + [2]],
                             dtype=torch.long, device=torch_device)  # long input

    attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device)
    global_attention_mask = torch.zeros(input_ids.shape, dtype=torch.long, device=input_ids.device)
    global_attention_mask[:, [1, 4, 21]] = 1  # Set global attention on a few random positions

    output = model(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)[0]

    expected_output_sum = torch.tensor(74585.8594, device=torch_device)
    expected_output_mean = torch.tensor(0.0243, device=torch_device)
    self.assertTrue(torch.allclose(output.sum(), expected_output_sum, atol=1e-4))
    self.assertTrue(torch.allclose(output.mean(), expected_output_mean, atol=1e-4))
def __init__(self):
    self.model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
    self.tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
    self.led_tokenizer = LEDTokenizer.from_pretrained('allenai/led-base-16384')
    self.led_model = LEDModel.from_pretrained('allenai/led-base-16384')
def __init__(self, h_dim=768, **kwargs):
    super().__init__(**kwargs)
    # self.data_processor = data_processor
    self.Longformer = LongformerModel.from_pretrained('allenai/longformer-base-4096')
    self.testing = False
    self.training = True
    self.dropout = nn.Dropout(0.5)
    self.proj_layer = nn.Linear(h_dim, 1)
def train(self, x, y=None):
    logging.info("Building vectorizer on " + self.__class__.__name__)
    t0 = time.time()

    processed_dataset = [clean_string_longformer(entry) for entry in x]
    train_dataset = load_custom_dataset(
        self.tokenizer, processed_dataset, y, "train", self.input_length,
    )
    test_dataset = load_custom_dataset(
        self.tokenizer, processed_dataset, y, "test", self.input_length,
    )

    # note: strftime("%H_%M_%S") on a `date` object always yields "00_00_00";
    # use datetime.now() (requires `from datetime import datetime`) for a real timestamp
    now = datetime.now()
    date_string = now.strftime("%d_%m_%Y")
    time_string = now.strftime("%H_%M_%S")

    training_args = TrainingArguments(
        output_dir=f'./results/{date_string}',
        num_train_epochs=self.epochs,
        per_device_train_batch_size=self.batch_size,
        per_device_eval_batch_size=self.batch_size,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir=f'./logs/{date_string}',
        load_best_model_at_end=True,
        fp16=False,
        fp16_opt_level="O2",
        evaluation_strategy="epoch",
        metric_for_best_model="eval_loss",
        greater_is_better=False,
    )
    trainer = Trainer(
        model=self.model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
    )
    trainer.train()
    self.model = trainer.model

    model_path = os.path.join(self.save_directory, self.dataset_name,
                              f"{self.model_name}_{time_string}")
    self.model.save_pretrained(model_path)
    self.model = LongformerModel.from_pretrained(model_path)
    self.model.to("cuda")

    elapsed = (time.time() - t0)
    logging.info("Done in %.3fsec" % elapsed)
def make_dataset(self, data_root: str) -> None:
    """ Make Dataset

    Make dataset from json files and save it as csv.

    Args:
        data_root: Root directory for document json files.
    """
    log.info("Making dataset...")
    json_paths = glob.glob(f"{data_root}/**/*.json", recursive=True)

    # nltk settings
    nltk.download('punkt')
    stemmer = PorterStemmer()
    cv = CountVectorizer()
    texts = []  # A list of tokenized texts separated by half-width characters

    # Longformer
    feature_matrix = []
    device = torch.device('cuda')
    tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
    model = LongformerModel.from_pretrained('allenai/longformer-base-4096').to(device)

    for json_path in tqdm(json_paths):
        with open(json_path) as f:
            json_obj = json.load(f)
        body = json_obj["body"]
        soup = BeautifulSoup(body, "html.parser")
        for script in soup(["script", "style"]):
            script.decompose()
        text = soup.get_text()

        with torch.no_grad():
            input_ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0).to(device)
            attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device).to(device)
            global_attention_mask = torch.zeros(input_ids.shape, dtype=torch.long, device=input_ids.device).to(device)
            outputs = model(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)
            vec = outputs.last_hidden_state[0].cpu().detach().clone().numpy().mean(0)
            # np.append(feature_matrix, vec)
            feature_matrix.append(list(vec))
            # log.info(f"Done: {len(feature_matrix)}")

    feature_matrix = np.array(feature_matrix)
    log.info(f"Longformer: {feature_matrix.shape}")

    # Calculate distance matrix
    dist_mat = squareform(pdist(feature_matrix, metric='cosine'))
    df = pd.DataFrame(dist_mat)
    df.to_csv(join(self.cache_path, "json_document_longformer.csv"), index=False)

    log.info("Successfully made dataset.")
def __init__(self, input_size):
    super().__init__()
    self.activation = torch.nn.SELU()
    self.dropout = torch.nn.Dropout(p=0.1)
    self.projector_1 = torch.nn.Linear(input_size, 512)
    self.projector_2 = torch.nn.Linear(512, 256)
    self.text_model = LongformerModel.from_pretrained("allenai/longformer-base-4096")

    # Freeze everything except the last `fine_tune_layers` encoder layers:
    # each longformer-base layer has 22 named parameters and the embeddings add 5.
    fine_tune_layers = 3
    for i, (name, param) in enumerate(self.text_model.named_parameters()):
        if i == (12 - fine_tune_layers) * 22 + 5:
            break
        param.requires_grad = False
def test_inference_no_head(self):
    model = LongformerModel.from_pretrained("allenai/longformer-base-4096")
    model.to(torch_device)

    # 'Hello world!'
    input_ids = torch.tensor([[0, 20920, 232, 328, 1437, 2]], dtype=torch.long, device=torch_device)
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)

    output = model(input_ids, attention_mask=attention_mask)[0]
    output_without_mask = model(input_ids)[0]

    expected_output_slice = torch.tensor([0.0549, 0.1087, -0.1119, -0.0368, 0.0250], device=torch_device)
    self.assertTrue(torch.allclose(output[0, 0, -5:], expected_output_slice, atol=1e-4))
    self.assertTrue(torch.allclose(output_without_mask[0, 0, -5:], expected_output_slice, atol=1e-4))
def main(args): model_name = args.model onnx_model_path = model_name + ".onnx" from transformers import LongformerModel model = LongformerModel.from_pretrained(PRETRAINED_LONGFORMER_MODELS[model_name]) export_longformer(model, onnx_model_path, args.export_padding) if args.optimize_onnx or args.precision != 'fp32': fp32_model_path = model_name + "_fp32.onnx" fp16_model_path = model_name + "_fp16.onnx" if args.precision == 'fp16' else None optimize_longformer(onnx_model_path, fp32_model_path, fp16_model_path)
def test_all(args):
    # Currently, the longformer attention operator can only run on GPU (no CPU implementation yet).
    device = torch.device('cuda:0')

    results = []
    for model_name in args.models:
        # Here we run an example input
        from transformers import LongformerModel
        torch_model_name_or_dir = MODELS[model_name]
        model = LongformerModel.from_pretrained(torch_model_name_or_dir)  # pretrained model name or directory
        model.to(device)

        # Search onnx model in the following order: optimized fp16 model, optimized fp32 model, raw model
        optimized = False
        precision = 'fp32'
        onnx_model_path = model_name + ".onnx"
        optimized_fp32_model = model_name + "_fp32.onnx"
        optimized_fp16_model = model_name + "_fp16.onnx"
        import os.path
        if os.path.isfile(optimized_fp16_model):
            onnx_model_path = optimized_fp16_model
            optimized = True
            precision = 'fp16'
        elif os.path.isfile(optimized_fp32_model):
            onnx_model_path = optimized_fp32_model
            optimized = True

        for num_threads in args.num_threads:
            if "torch" in args.engines:
                results += test_torch(device, model, model_name, args.batch_sizes, args.sequence_lengths,
                                      args.global_lengths, args.test_times, num_threads)

            if "onnxruntime" in args.engines:
                session = benchmark_helper.create_onnxruntime_session(
                    onnx_model_path, use_gpu=True, enable_all_optimization=True, num_threads=num_threads)
                results += test_onnxruntime(device, model, model_name, session, args.batch_sizes,
                                            args.sequence_lengths, args.global_lengths, args.test_times,
                                            num_threads, optimized, precision)
    return results
def main(args): model_name = args.model onnx_model_path = model_name + ".onnx" global weight_bias_format weight_bias_format = 0 if args.no_merge_qkv else 1 model = LongformerModel.from_pretrained( PRETRAINED_LONGFORMER_MODELS[model_name]) export_longformer(model, onnx_model_path, args.export_padding) if args.optimize_onnx or args.precision != "fp32": fp32_model_path = model_name + f"_f{weight_bias_format}" + "_fp32.onnx" fp16_model_path = model_name + f"_f{weight_bias_format}" + "_fp16.onnx" if args.precision == "fp16" else None optimize_longformer(onnx_model_path, fp32_model_path, fp16_model_path)
def __init__(self, params):
    super(LongEntityLinkerModule, self).__init__()
    self.params = params
    if params['use_longformer']:
        self.ctxt_encoder = LongformerModel.from_pretrained('allenai/longformer-base-4096')
        longformer_output_dim = self.ctxt_encoder.embeddings.word_embeddings.weight.size(1)
        self.NULL_IDX = 0
    else:
        self.ctxt_encoder = BertModel.from_pretrained('bert-base-uncased')
        self.NULL_IDX = 0
        longformer_output_dim = self.ctxt_encoder.embeddings.word_embeddings.weight.size(1)
    self.config = self.ctxt_encoder.config
    self.linear_compression = None
    if longformer_output_dim != self.params['cand_emb_dim']:
        self.linear_compression = nn.Linear(longformer_output_dim, self.params['cand_emb_dim'])
def __init__(self, params):
    super().__init__()
    if 'dropout' in params:
        self.dropout = nn.Dropout(p=params['dropout'])
    else:
        self.dropout = None
    # self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=False, do_basic_tokenize=False)
    # self.bert = BertModel.from_pretrained("bert-base-uncased")
    self.max_length = params['max_length'] if 'max_length' in params else 1024
    self.max_memory_size = params['max_memory_size']
    self.tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
    self.bert = LongformerModel.from_pretrained("allenai/longformer-base-4096", gradient_checkpointing=True)
    self.num_labels = params["label_length"] if 'label_length' in params else 2
    self.fc = nn.Linear(768, self.num_labels)
def __init__(self, params):
    super(LongEncoderModule, self).__init__()
    self.params = params
    if params['use_longformer']:
        self.ctxt_encoder = LongformerModel.from_pretrained('allenai/longformer-base-4096')
        longformer_output_dim = self.ctxt_encoder.embeddings.word_embeddings.weight.size(1)
        self.NULL_IDX = 0
    else:
        # temporary change to large cased SpanBert for test
        # self.ctxt_encoder = BertModel.from_pretrained('bert-base-uncased')
        self.ctxt_encoder = BertModel.from_pretrained('../models/spanbert_hf_base')
        self.NULL_IDX = 0
        longformer_output_dim = self.ctxt_encoder.embeddings.word_embeddings.weight.size(1)
    # num_tags = 4 if not self.params['end_tag'] else 5
    # num_tags = 3 if not self.params['end_tag'] else 4
    num_tags = 9 if self.params['conll'] else 3
    self.config = self.ctxt_encoder.config
    self.tagger = LongTagger(longformer_output_dim, num_tags, self.params['classifier'])
    self.linear_compression = None
    if longformer_output_dim != self.params['cand_emb_dim']:
        self.linear_compression = nn.Linear(longformer_output_dim, self.params['cand_emb_dim'])
def load_model(self):
    if 'longformer' in self.args.model_path:
        model = LongformerModel.from_pretrained(self.args.model_path)
        for layer in model.encoder.layer:
            layer.attention.self.attention_mode = self.args.attention_mode
            self.args.attention_window = 512  # layer.attention.self.attention_window
    elif self.args.model_path in ['bart.large', 'bart.base']:
        model = torch.hub.load('pytorch/fairseq', self.args.model_path)
        model.config = model.args
        model.config.hidden_size = model.config.decoder_output_dim
    elif 'bart' in self.args.model_path and 'base' in self.args.model_path:
        config = AutoConfig.from_pretrained(self.args.model_path)
        config.encoder_attention_heads = 12
        config.decoder_attention_heads = 12
        config.attention_dropout = 0.1
        if self.args.seq2seq:
            model = AutoModelWithLMHead.from_pretrained(self.args.model_path, config=config)
        else:
            model = AutoModel.from_pretrained(self.args.model_path, config=config)
    elif 'bart' in self.args.model_path and 'large' in self.args.model_path:
        config = AutoConfig.from_pretrained(self.args.model_path)
        config.attention_dropout = 0.1
        config.gradient_checkpointing = True
        if self.args.seq2seq:
            model = AutoModelWithLMHead.from_pretrained(self.args.model_path, config=config)
        else:
            model = AutoModel.from_pretrained(self.args.model_path, config=config)
    else:
        model = AutoModel.from_pretrained(self.args.model_path)
    print("Loaded model with config:")
    print(model.config)

    for p in model.parameters():
        p.requires_grad_(True)
    model.train()
    return model
def __init__(self):
    super(Model, self).__init__()
    self.model = LongformerModel.from_pretrained(model_config.pretrain_model_path, gradient_checkpointing=True)
    self.config = self.model.config
    self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
    self.classifier = nn.Linear(self.config.hidden_size, config.num_labels)
def test_layer_attn_probs(self):
    model = LongformerModel.from_pretrained("patrickvonplaten/longformer-random-tiny")
    model.eval()
    layer = model.encoder.layer[0].attention.self.to(torch_device)
    hidden_states = torch.cat([self._get_hidden_states(), self._get_hidden_states() - 0.5], dim=0)
    batch_size, seq_length, hidden_size = hidden_states.size()
    attention_mask = torch.zeros((batch_size, seq_length), dtype=torch.float32, device=torch_device)

    # create attn mask
    attention_mask[0, -2:] = 10000.0
    attention_mask[0, -1:] = -10000.0
    attention_mask[1, 1:] = 10000.0

    is_index_masked = attention_mask < 0
    is_index_global_attn = attention_mask > 0
    is_global_attn = is_index_global_attn.flatten().any().item()

    output_hidden_states, local_attentions, global_attentions = layer(
        hidden_states,
        attention_mask=attention_mask,
        is_index_masked=is_index_masked,
        is_index_global_attn=is_index_global_attn,
        is_global_attn=is_global_attn,
        output_attentions=True,
    )

    self.assertEqual(local_attentions.shape, (2, 4, 2, 8))
    self.assertEqual(global_attentions.shape, (2, 2, 3, 4))

    # All tokens with global attention have weight 0 in local attentions.
    self.assertTrue(torch.all(local_attentions[0, 2:4, :, :] == 0))
    self.assertTrue(torch.all(local_attentions[1, 1:4, :, :] == 0))

    # The weight of all tokens with local attention must sum to 1.
    self.assertTrue(torch.all(torch.abs(global_attentions[0, :, :2, :].sum(dim=-1) - 1) < 1e-6))
    self.assertTrue(torch.all(torch.abs(global_attentions[1, :, :1, :].sum(dim=-1) - 1) < 1e-6))

    self.assertTrue(
        torch.allclose(
            local_attentions[0, 0, 0, :],
            torch.tensor(
                [0.3328, 0.0000, 0.0000, 0.0000, 0.0000, 0.3355, 0.3318, 0.0000],
                dtype=torch.float32,
                device=torch_device,
            ),
            atol=1e-3,
        )
    )
    self.assertTrue(
        torch.allclose(
            local_attentions[1, 0, 0, :],
            torch.tensor(
                [0.2492, 0.2502, 0.2502, 0.0000, 0.0000, 0.2505, 0.0000, 0.0000],
                dtype=torch.float32,
                device=torch_device,
            ),
            atol=1e-3,
        )
    )

    # All the global attention weights must sum to 1.
    self.assertTrue(torch.all(torch.abs(global_attentions.sum(dim=-1) - 1) < 1e-6))

    self.assertTrue(
        torch.allclose(
            global_attentions[0, 0, 1, :],
            torch.tensor(
                [0.2500, 0.2500, 0.2500, 0.2500],
                dtype=torch.float32,
                device=torch_device,
            ),
            atol=1e-3,
        )
    )
    self.assertTrue(
        torch.allclose(
            global_attentions[1, 0, 0, :],
            torch.tensor(
                [0.2497, 0.2500, 0.2499, 0.2504],
                dtype=torch.float32,
                device=torch_device,
            ),
            atol=1e-3,
        )
    )
import en_core_web_sm
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import gensim
from torch import nn as nn

from config import SBERT_MODEL_NAME
from utils.types import FolkLoreData, FolkLoreEmb, FolkLoreEmbCoarse

nlp = en_core_web_sm.load()
sbert_model = SentenceTransformer(SBERT_MODEL_NAME)

from transformers import LongformerModel, LongformerTokenizerFast, LongformerConfig

LFconfig = LongformerConfig.from_pretrained('allenai/longformer-base-4096')
LF_model = LongformerModel.from_pretrained('allenai/longformer-base-4096', config=LFconfig)
LF_tokenizer = LongformerTokenizerFast.from_pretrained('allenai/longformer-base-4096')
LF_tokenizer.model_max_length = LF_model.config.max_position_embeddings


class MatrixVectorScaledDotProductAttention(nn.Module):

    def __init__(self, temperature, attn_dropout=0.1):
        super().__init__()
        self.temperature = temperature
        self.dropout = nn.Dropout(attn_dropout)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, q, k, v, mask=None):
        """
        q: tensor of shape (n*b, d_k)
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long,
                                device=input_ids.device)  # TODO: use random word ID. TODO: simulate masked word
    global_attention_mask = torch.zeros(input_ids.shape, dtype=torch.long, device=input_ids.device)
    if num_global_tokens > 0:
        global_token_index = list(range(num_global_tokens))
        global_attention_mask[:, global_token_index] = 1

    # TODO: support more inputs like token_type_ids, position_ids
    return input_ids, attention_mask, global_attention_mask


args = parse_arguments()
model_name = args.model
onnx_model_path = model_name + ".onnx"

from transformers import LongformerModel
model = LongformerModel.from_pretrained(MODELS[model_name])  # pretrained model name or directory

input_ids, attention_mask, global_attention_mask = get_dummy_inputs(sequence_length=args.sequence_length,
                                                                    num_global_tokens=args.global_length,
                                                                    device=torch.device('cpu'))
example_outputs = model(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)


# A new function to replace LongformerSelfAttention.forward
# For transformers 4.0
def my_longformer_self_attention_forward_4(self, hidden_states, attention_mask=None, is_index_masked=None,
                                           is_index_global_attn=None, is_global_attn=None):
    # TODO: move mask calculation to LongFormerModel class to avoid calculating it again and again in each layer.
    global_mask = is_index_global_attn.int()
    torch.masked_fill(attention_mask, is_index_global_attn, 0.0)

    weight = torch.stack((self.query.weight.transpose(0, 1), self.key.weight.transpose(0, 1),
                          self.value.weight.transpose(0, 1)), dim=1)
    weight = weight.reshape(self.embed_dim, 3 * self.embed_dim)
def __init__(self, model_name: str = "allenai/longformer-base-4096"):
    self.model = LongformerModel.from_pretrained(model_name)
    self.tokenizer = LongformerTokenizer.from_pretrained(model_name)
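# A minimal usage sketch for the wrapper above. The `embed` helper below is an
# assumption added for illustration (it is not part of the original class): it
# mean-pools the last hidden state into one fixed-size vector per document.
def embed(self, text: str) -> torch.Tensor:
    inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=4096)
    with torch.no_grad():
        outputs = self.model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze(0)  # shape: (hidden_size,)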
from transformers import ElectraForMaskedLM, ElectraTokenizer

tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator')
model = ElectraForMaskedLM.from_pretrained('google/electra-small-generator')

input_ids = torch.tensor(
    tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
outputs = model(input_ids, masked_lm_labels=input_ids)
loss, prediction_scores = outputs[:2]
print(prediction_scores)

## Longformer
from transformers import LongformerModel, LongformerTokenizer

# the Hugging Face hub identifier includes the 'allenai/' prefix
model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000)  # long input document
input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0)  # batch of size 1

# Attention mask values -- 0: no attention, 1: local attention, 2: global attention
attention_mask = torch.ones(input_ids.shape, dtype=torch.long,
                            device=input_ids.device)  # initialize to local attention
attention_mask[:, [1, 4, 21]] = 2  # Set global attention based on the task. For example,
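# Added note (not part of the original example): the 0/1/2 attention_mask convention
# above comes from early Longformer integrations. Current transformers releases pass
# global attention through a separate `global_attention_mask` instead; a rough
# equivalent of the call above under that newer API looks like this:
attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device)
global_attention_mask = torch.zeros(input_ids.shape, dtype=torch.long, device=input_ids.device)
global_attention_mask[:, [1, 4, 21]] = 1  # global attention on the same positions
outputs = model(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)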
print(df.target.value_counts())

import torch
from transformers import DistilBertTokenizerFast, DistilBertModel, DistilBertConfig
from transformers import LongformerTokenizerFast, LongformerModel, LongformerConfig

# model_name = 'distilbert-base-uncased'
model_name = 'allenai/longformer-base-4096'

tokenizer = LongformerTokenizerFast.from_pretrained(model_name)
df["vecs"] = df.text.map(lambda x: torch.LongTensor(tokenizer.encode(x)).unsqueeze(0))

config = LongformerConfig.from_pretrained(model_name, output_hidden_states=True)
model = LongformerModel.from_pretrained(model_name, config=config)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = 'cpu'
model = model.to(device)

input_tf = tokenizer.batch_encode_plus(df.text.to_list(), return_tensors='pt', padding=True)
# vecs = input_tf['input_ids'].to(device)
# granola_ids = granola_ids.to(device)

model.eval()
with torch.no_grad():
    print("and GO!!!!")