def __init__(self, config: BartConfig, **kwargs: Any):
    """The classification init is a superset of the LM init."""
    PretrainedBartModel.__init__(self, config, **kwargs)
    self.model = BartModel(config)
    self.classification_head = BartClassificationHead(
        config.d_model, config.d_model, config.num_labels, config.classif_dropout
    )
    self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
    self.model._init_weights(self.classification_head.dense)
    self.model._init_weights(self.classification_head.out_proj)
    self.model._init_weights(self.lm_head)
def test_advanced_inputs(self):
    # (config, input_ids, token_type_ids, input_mask, *unused) = \
    config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    decoder_input_ids, decoder_attn_mask = _prepare_bart_decoder_inputs(config, inputs_dict["input_ids"])
    model = BartModel(config)
    model.to(torch_device)
    model.eval()

    # test init
    self.assertTrue((model.encoder.embed_tokens.weight == model.shared.weight).all().item())

    def _check_var(module):
        """Check that we initialized various parameters from N(0, config.init_std)."""
        self.assertAlmostEqual(torch.std(module.weight).item(), config.init_std, 2)

    _check_var(model.encoder.embed_tokens)
    _check_var(model.encoder.layers[0].self_attn.k_proj)
    _check_var(model.encoder.layers[0].fc1)
    _check_var(model.encoder.embed_positions)

    decoder_features_with_created_mask = model.forward(**inputs_dict)[0]
    decoder_features_with_passed_mask = model.forward(
        decoder_attention_mask=decoder_attn_mask, decoder_input_ids=decoder_input_ids, **inputs_dict
    )[0]
    _assert_tensors_equal(decoder_features_with_passed_mask, decoder_features_with_created_mask)

    useless_mask = torch.zeros_like(decoder_attn_mask)
    decoder_features = model.forward(decoder_attention_mask=useless_mask, **inputs_dict)[0]
    self.assertTrue(isinstance(decoder_features, torch.Tensor))  # no hidden states or attentions
    self.assertEqual(
        decoder_features.size(),
        (self.model_tester.batch_size, self.model_tester.seq_length, config.d_model),
    )
    if decoder_attn_mask.min().item() < -1e3:  # some tokens were masked
        self.assertFalse((decoder_features_with_created_mask == decoder_features).all().item())

    # Test different encoder attention masks
    decoder_features_with_long_encoder_mask = model.forward(
        inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"].long()
    )[0]
    _assert_tensors_equal(decoder_features_with_long_encoder_mask, decoder_features_with_created_mask)
def main2():
    config = BartWithAdapterConfig.from_pretrained('facebook/bart-base')
    bart = MyBartWithAdapter(config)
    bart_old = BartModel.from_pretrained("facebook/bart-base")
    bart.model.load_state_dict(bart_old.state_dict(), strict=False)

    config = BartWithAdapterConfig.from_pretrained('facebook/bart-base')
    # config.adapt_layer_norm = True
    generator = ParameterGenerator(config)
    output = generator(torch.tensor([[1, 2, 3]]))
    print(output)
    print(output[0].size())

    growingbart = GrowingBart(bart, generator, config)
    output = growingbart(
        torch.tensor([[4, 1, 3, 4, 3, 5, 6, 3, 2]]),
        torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]]),
        torch.tensor([[4, 1, 3, 4, 3, 5, 6, 3, 2]]),
        torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]]),
        torch.tensor([[4, 1, 3, 4, 3, 5, 6, 3, 2]]),
        torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]]),
    )
    print(output)

    loss = output[0].sum(-1).sum(-1).sum(-1)
    print(loss)
    loss.backward()
def test_advanced_inputs(self):
    config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    config.use_cache = False
    inputs_dict["input_ids"][:, -2:] = config.pad_token_id
    decoder_input_ids, decoder_attn_mask, causal_mask = _prepare_bart_decoder_inputs(
        config, inputs_dict["input_ids"]
    )
    model = BartModel(config).to(torch_device).eval()

    decoder_features_with_created_mask = model(**inputs_dict)[0]
    decoder_features_with_passed_mask = model(
        decoder_attention_mask=invert_mask(decoder_attn_mask), decoder_input_ids=decoder_input_ids, **inputs_dict
    )[0]
    _assert_tensors_equal(decoder_features_with_passed_mask, decoder_features_with_created_mask)

    useless_mask = torch.zeros_like(decoder_attn_mask)
    decoder_features = model(decoder_attention_mask=useless_mask, **inputs_dict)[0]
    self.assertTrue(isinstance(decoder_features, torch.Tensor))  # no hidden states or attentions
    self.assertEqual(
        decoder_features.size(), (self.model_tester.batch_size, self.model_tester.seq_length, config.d_model)
    )
    if decoder_attn_mask.min().item() < -1e3:  # some tokens were masked
        self.assertFalse((decoder_features_with_created_mask == decoder_features).all().item())

    # Test different encoder attention masks
    decoder_features_with_long_encoder_mask = model(
        inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"].long()
    )[0]
    _assert_tensors_equal(decoder_features_with_long_encoder_mask, decoder_features_with_created_mask)
def main():
    config = BartWithAdapterConfig.from_pretrained('facebook/bart-base')
    bart = MyBartWithAdapter(config)
    bart_old = BartModel.from_pretrained("facebook/bart-base")
    ret = bart.model.load_state_dict(bart_old.state_dict(), strict=False)
    print(ret)
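# Hedged follow-up sketch: load_state_dict(..., strict=False) returns a named tuple of
# (missing_keys, unexpected_keys), so the `ret` printed above can be checked
# programmatically. Assumption (not stated in the original): the adapter parameters
# introduced by MyBartWithAdapter contain "adapter" in their names.
def check_adapter_only_missing(ret):
    missing_keys, unexpected_keys = ret
    non_adapter_missing = [k for k in missing_keys if "adapter" not in k]
    print(f"missing={len(missing_keys)} unexpected={len(unexpected_keys)} "
          f"non_adapter_missing={non_adapter_missing}")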
def __init__(self):
    super().__init__()
    self.config = BartConfig.from_pretrained('facebook/bart-large', use_cache=False)
    bart = BartModel(self.config)
    self.encoder = bart.encoder
    self.decoder = bart.decoder
    self.linear = nn.Linear(1024, 50265, bias=False)
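# Hedged note (assumption, not in the original): if this module is meant to decode like
# BartForConditionalGeneration, the 1024 -> 50265 projection is usually tied to the shared
# input embedding table rather than learned from scratch. Inside the __init__ above that
# would be a single line:
#
#     self.linear.weight = bart.shared.weight  # shares storage with the embedding matrix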
def __init__(self, model_name):
    super().__init__()
    bart = BartModel.from_pretrained(model_name)
    self.hidden_dim = bart.config.hidden_size
    self.bart_encoder = bart.encoder
    # Bypass token and position embeddings so the encoder consumes precomputed
    # embeddings directly in place of input_ids.
    self.bart_encoder.embed_tokens = lambda x: x
    self.bart_encoder.embed_positions = lambda x: torch.zeros(
        (x.shape[0], x.shape[1], self.hidden_dim), dtype=torch.float32)
def __init__(self):
    super(MoralClassifier, self).__init__()
    # self.l1 = DistilBertModel.from_pretrained('distilbert-base-uncased')
    self.l1 = BartModel.from_pretrained('facebook/bart-large-cnn')
    # Pooler
    self.l2 = torch.nn.Linear(1024, 1024)
    self.act = torch.nn.Tanh()
    # Classifier
    self.l3 = torch.nn.Dropout(0.3)
    self.l4 = torch.nn.Linear(1024, 11)  # 11 categories
def __init__(self, config: BartConfig):
    super().__init__(config)
    self.model = BartModel(config)
    self.pointer = BartMultiPointerHead(
        config.d_model,
        config.decoder_attention_heads,
        dropout=config.attention_dropout,
    )
    self.heads_combination = nn.Linear(config.decoder_attention_heads, 1)
    self.eos_token_id = config.eos_token_id
    self.pad_token_id = config.pad_token_id
def __init__(self, config, crf=None, output_concat=False):
    super().__init__(config)
    self.num_labels = config.num_labels
    self.bart = BartModel(config)
    self.dropout = nn.Dropout(config.dropout)
    self.classifier = nn.Linear(config.d_model, config.num_labels)
    self.loss_fct = nn.CrossEntropyLoss()
    self.use_crf = False
    self.output_concat = output_concat
    self.crf_layer = crf
    self.init_weights()
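# A hedged forward sketch for the token-classification head above. This is not the
# original author's forward: it ignores the optional CRF layer and uses only the
# fields defined in the __init__ above.
def forward(self, input_ids, attention_mask=None, labels=None):
    outputs = self.bart(input_ids, attention_mask=attention_mask)
    sequence_output = self.dropout(outputs[0])      # (batch, seq_len, d_model)
    logits = self.classifier(sequence_output)       # (batch, seq_len, num_labels)
    loss = None
    if labels is not None:
        loss = self.loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
    return (loss, logits) if loss is not None else (logits,)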
def convert_bart_checkpoint(checkpoint_path, pytorch_dump_folder_path, hf_checkpoint_name=None):
    """
    Copy/paste/tweak model's weights to our BERT structure.
    """
    if not os.path.exists(checkpoint_path):
        bart = torch.hub.load("pytorch/fairseq", checkpoint_path).eval()
    else:
        bart = load_xsum_checkpoint(checkpoint_path)

    bart.model.upgrade_state_dict(bart.model.state_dict())
    if hf_checkpoint_name is None:
        hf_checkpoint_name = checkpoint_path.replace(".", "-")
    config = BartConfig.from_pretrained(hf_checkpoint_name)
    tokens = bart.encode(SAMPLE_TEXT).unsqueeze(0)
    tokens2 = BartTokenizer.from_pretrained(hf_checkpoint_name).encode(SAMPLE_TEXT, return_tensors="pt").unsqueeze(0)
    assert torch.eq(tokens, tokens2).all()

    if checkpoint_path == "bart.large.mnli":
        state_dict = bart.state_dict()
        remove_ignore_keys_(state_dict)
        state_dict["model.shared.weight"] = state_dict["model.decoder.embed_tokens.weight"]
        for src, dest in mnli_rename_keys:
            rename_key(state_dict, src, dest)
        model = BartForSequenceClassification(config).eval()
        model.load_state_dict(state_dict)
        fairseq_output = bart.predict("mnli", tokens, return_logits=True)
        new_model_outputs = model(tokens)[0]  # logits
    else:  # no classification heads to worry about
        state_dict = bart.model.state_dict()
        remove_ignore_keys_(state_dict)
        state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"]
        fairseq_output = bart.extract_features(tokens)
        if hf_checkpoint_name == "bart-large":
            model = BartModel(config).eval()
            model.load_state_dict(state_dict)
            new_model_outputs = model(tokens).model[0]
        else:
            model = BartForConditionalGeneration(config).eval()  # an existing summarization ckpt
            model.model.load_state_dict(state_dict)
            if hasattr(model, "lm_head"):
                model.lm_head = _make_linear_from_emb(model.model.shared)
            new_model_outputs = model.model(tokens)[0]

    # Check results
    assert fairseq_output.shape == new_model_outputs.shape
    assert (fairseq_output == new_model_outputs).all().item()
    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    model.save_pretrained(pytorch_dump_folder_path)
def test_inference_no_head(self):
    model = BartModel.from_pretrained("facebook/bart-large").to(torch_device)
    input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
    inputs_dict = prepare_bart_inputs_dict(model.config, input_ids)
    with torch.no_grad():
        output = model(**inputs_dict)[0]
    expected_shape = torch.Size((1, 11, 1024))
    self.assertEqual(output.shape, expected_shape)
    expected_slice = torch.tensor(
        [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]], device=torch_device
    )
    self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE))
def convert_bart_checkpoint(checkpoint_path, pytorch_dump_folder_path):
    """
    Copy/paste/tweak model's weights to our BERT structure.
    """
    b2 = torch.hub.load("pytorch/fairseq", checkpoint_path)
    b2.eval()  # disable dropout
    b2.model.upgrade_state_dict(b2.model.state_dict())
    config = BartConfig()

    tokens = b2.encode(SAMPLE_TEXT).unsqueeze(0)
    tokens2 = BartTokenizer.from_pretrained("bart-large").encode(SAMPLE_TEXT).unsqueeze(0)
    assert torch.eq(tokens, tokens2).all()
    # assert their_output.size() == (1, 11, 1024)

    if checkpoint_path == "bart.large":
        state_dict = b2.model.state_dict()
        state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"]
        model = BartModel(config)
        their_output = b2.extract_features(tokens)
    else:  # MNLI Case
        state_dict = b2.state_dict()
        state_dict["model.shared.weight"] = state_dict["model.decoder.embed_tokens.weight"]
        for src, dest in rename_keys:
            rename_key(state_dict, src, dest)
        state_dict.pop("_float_tensor", None)
        model = BartForSequenceClassification(config)
        their_output = b2.predict("mnli", tokens, return_logits=True)

    for k in IGNORE_KEYS:
        state_dict.pop(k, None)
    model.load_state_dict(state_dict)
    model.eval()
    our_outputs = model.forward(tokens)[0]

    assert their_output.shape == our_outputs.shape
    assert (their_output == our_outputs).all().item()
    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    model.save_pretrained(pytorch_dump_folder_path)
def __init__(self, args):
    super(MoralClassifier, self).__init__()
    self.hparams = args
    self.l1 = BartModel.from_pretrained('facebook/bart-large-cnn')
    # freeze bart weights
    # for param in self.l1.parameters():
    #     param.requires_grad = False
    # Pooler
    self.l2 = torch.nn.Linear(1024, 1024)
    self.act = torch.nn.Tanh()
    # Classifier
    self.l3 = torch.nn.Dropout(0.2)
    self.l4 = torch.nn.Linear(1024, 10)  # 10 categories
def __init__(self):
    super(Model, self).__init__()
    self.model = BartModel.from_pretrained(model_config.pretrain_model_path)
    self.config = self.model.config
    self.classification_head = BartClassificationHead(
        self.config.d_model,
        self.config.d_model,
        self.config.num_labels,
        self.config.classif_dropout,
    )
    self.model._init_weights(self.classification_head.dense)
    self.model._init_weights(self.classification_head.out_proj)
def __init__(self, model_name, use_pretrained_embeddings=False):
    super().__init__()
    # or some flag that indicates the bart encoder in its entirety could be used.
    if use_pretrained_embeddings:
        # will use the entire bart encoder including all embeddings
        bart = PretrainedTransformerEmbedder(model_name, sub_module="encoder")
    else:
        bart = BartModel.from_pretrained(model_name)
    self.hidden_dim = bart.config.hidden_size
    self.bart_encoder = bart.transformer_model
    # Bypass token and position embeddings so the encoder consumes precomputed embeddings.
    self.bart_encoder.embed_tokens = lambda x: x
    self.bart_encoder.embed_positions = lambda x: torch.zeros(
        (x.shape[0], x.shape[1], self.hidden_dim), dtype=torch.float32)
def __init__(self, config: BartConfig, **kwargs):
    super().__init__(config, **kwargs)
    self.model = BartModel(config)
    self.classification_head = BartClassificationHead(
        config.d_model,
        config.d_model,
        config.num_labels,
        config.classifier_dropout,
    )
    self.metric_hidden_size = 256
    self.metric_linear = nn.Linear(config.hidden_size, self.metric_hidden_size)
    # self.label_metric_linear = nn.Linear(config.hidden_size, self.metric_hidden_size)
    # self.predict_linear = nn.Linear(self.metric_hidden_size * 2, )
    self.scl_t = 1
    self.ce_p = 0.8
    self.scl_p = 0.1
    self.lscl_p = 0.1
    self.ce_loss_fct = CrossEntropyLoss()
    self.model._init_weights(self.classification_head.dense)
    self.model._init_weights(self.classification_head.out_proj)
def __init__(self, large, temp_dir, finetune=False, bart=False):
    # def __init__(self, large, temp_dir, finetune=False):
    super(Bert, self).__init__()
    if large:
        self.model = BertModel.from_pretrained('bert-large-uncased', cache_dir=temp_dir)
    else:
        # self.model = BertModel.from_pretrained('bert-base-uncased', cache_dir=temp_dir)
        if bart:
            self.model = BartModel.from_pretrained('/home/ybai/downloads/bart', cache_dir=temp_dir, local_files_only=True)
            # self.model = BartForConditionalGeneration.from_pretrained('/home/ybai/downloads/bart', cache_dir=temp_dir, local_files_only=True)
        else:
            self.model = BertModel.from_pretrained('bert-base-multilingual-uncased', cache_dir=temp_dir, local_files_only=False)
    self.finetune = finetune
def __init__(self, config, output_concat=False):
    super().__init__(config)
    self.num_labels = config.num_labels
    self.bart = BartModel(config)
    self.dropout = nn.Dropout(config.dropout)
    self.classifier = nn.Linear(config.d_model, config.num_labels)
    self.loss_fct = nn.CrossEntropyLoss()
    self.use_crf = config.use_crf
    if self.use_crf:
        self.crf_layer = Transformer_CRF(
            num_labels=config.num_labels,
            start_label_id=config.label2idx['CLS'])
    else:
        self.crf_layer = None
    self.output_concat = output_concat
    self.init_weights()
def test_initialization_more(self):
    config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    model = BartModel(config)
    model.to(torch_device)
    model.eval()
    # test init
    self.assertTrue((model.encoder.embed_tokens.weight == model.shared.weight).all().item())

    def _check_var(module):
        """Check that we initialized various parameters from N(0, config.init_std)."""
        self.assertAlmostEqual(torch.std(module.weight).item(), config.init_std, 2)

    _check_var(model.encoder.embed_tokens)
    _check_var(model.encoder.layers[0].self_attn.k_proj)
    _check_var(model.encoder.layers[0].fc1)
    _check_var(model.encoder.embed_positions)
def __init__(self, args, use_mask=True):
    super(OneHotMoralClassifier, self).__init__()
    self.hparams = args
    self.bart = BartModel.from_pretrained('facebook/bart-large-cnn')
    self.use_mask = use_mask
    self.vocab_size = 50264
    self.onehot_embeddings = nn.Linear(self.vocab_size, 1024, bias=False)
    self.onehot_embeddings.weight = nn.Parameter(self.build_lookups())
    # self.bart.encoder.embed_tokens = nn.Identity()
    # freeze bart weights
    # self.onehot_embeddings.requires_grad = False
    # self.onehot_embeddings.weight.requires_grad = False
    # for param in self.bart.parameters():
    #     param.requires_grad = False
    # Pooler
    self.l2 = torch.nn.Linear(1024, 1024)
    self.act = torch.nn.Tanh()
    # Classifier
    self.l3 = torch.nn.Dropout(0.2)
    self.l4 = torch.nn.Linear(1024, 10)  # 10 categories
def convert_bart_checkpoint(checkpoint_path, pytorch_dump_folder_path):
    """
    Copy/paste/tweak model's weights to our BERT structure.
    """
    bart = torch.hub.load("pytorch/fairseq", checkpoint_path)
    bart.eval()  # disable dropout
    bart.model.upgrade_state_dict(bart.model.state_dict())
    hf_model_name = checkpoint_path.replace(".", "-")
    config = BartConfig.from_pretrained(hf_model_name)
    tokens = bart.encode(SAMPLE_TEXT).unsqueeze(0)
    tokens2 = BartTokenizer.from_pretrained(hf_model_name).encode(SAMPLE_TEXT, return_tensors="pt").unsqueeze(0)
    assert torch.eq(tokens, tokens2).all()

    if checkpoint_path in ["bart.large", "bart.large.cnn"]:
        state_dict = bart.model.state_dict()
        for k in IGNORE_KEYS:
            state_dict.pop(k, None)
        state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"]
        model = BartModel(config)
        their_output = bart.extract_features(tokens)
    else:  # MNLI Case
        state_dict = bart.state_dict()
        for k in IGNORE_KEYS:
            state_dict.pop(k, None)
        state_dict["model.shared.weight"] = state_dict["model.decoder.embed_tokens.weight"]
        for src, dest in rename_keys:
            rename_key(state_dict, src, dest)
        model = BartForSequenceClassification(config)
        their_output = bart.predict("mnli", tokens, return_logits=True)

    # Load state dict
    model.load_state_dict(state_dict)
    model.eval()

    # Check results
    if checkpoint_path == "bart.large.cnn":
        model = BartForConditionalGeneration(config, base_model=model)
        assert "lm_head.weight" in model.state_dict()
        assert model.lm_head.out_features == config.max_position_embeddings
        model.eval()
        our_outputs = model.model(tokens)[0]
    else:
        our_outputs = model(tokens)[0]

    assert their_output.shape == our_outputs.shape
    assert (their_output == our_outputs).all().item()
    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    model.save_pretrained(pytorch_dump_folder_path)
def test_model_from_pretrained(self):
    # Forces 1.6GB download from S3 for each model
    for model_name in list(BART_PRETRAINED_MODEL_ARCHIVE_MAP.keys()):
        model = BartModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
        self.assertIsNotNone(model)
def __init__(self, n_vocab=50264):
    self.n_vocab = n_vocab
    self.true_embedding = BartModel.from_pretrained('facebook/bart-large-cnn').encoder.embed_tokens
def Seq2Seq(df):
    model_type = 'facebook/bart-large'
    tokenizer = BartTokenizer.from_pretrained(model_type)
    model = BartModel.from_pretrained(model_type)
    mask_model = BartForConditionalGeneration.from_pretrained(model_type)
    sep_token = '</s>'
    mask_token = '<mask>'
    mask_id = tokenizer(mask_token, return_tensors='pt')['input_ids'][0][1]
    sep_id = tokenizer(sep_token, return_tensors='pt')['input_ids'][0][1]
    optimizer = AdamW(model.parameters())
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    mask_model.to(device)

    df['mask_text'] = 0
    df['auxiliary_text'] = 0
    for i in range(len(df)):
        aspect = df['aspect'].iloc[i]
        sentiment = df['sentiment'].iloc[i]
        if aspect == 'NULL' or isinstance(aspect, (int, float)):
            aspect = 'aspect'
        if DPM_type == 'Senti':
            mask_sent = 'the polarity of the ' + aspect + ' is ' + mask_token + ' ' + sep_token + ' '
            auxiliary_sent = 'the polarity of the ' + aspect + ' is ' + sentiment + ' ' + sep_token + ' '
        elif DPM_type == 'AS':
            mask_sent = 'the polarity of the ' + mask_token + ' is ' + sentiment + ' ' + sep_token + ' '
            auxiliary_sent = 'the polarity of the ' + aspect + ' is ' + sentiment + ' ' + sep_token + ' '
        df['mask_text'].iloc[i] = mask_sent + df['text'].iloc[i]
        df['auxiliary_text'].iloc[i] = auxiliary_sent + df['text'].iloc[i]

    df['distance'] = 0
    df = df.astype('object')
    for i in range(len(df)):
        tokenized = df['mask_text'][i:i + 1].apply((lambda x: tokenizer.encode(
            x, add_special_tokens=True, max_length=MAX_LEN, truncation=True)))
        sep_index = tokenized[i].index(sep_id)
        mask_index = tokenized[i].index(mask_id)
        padded = pad_sequences(tokenized, maxlen=MAX_LEN, dtype="long",
                               value=0, truncating="post", padding="post")
        attention_mask = np.where(padded != 0, 1, 0)
        input_ids = torch.tensor(padded).to(device)
        attention_mask = torch.tensor(attention_mask).to(device)
        with torch.no_grad():
            last_hidden_states = model(input_ids, attention_mask=attention_mask)
        original_mask_embedding = last_hidden_states[0][:, mask_index, :].cpu().numpy()

        distance = []
        for pertubed_index in range(sep_index + 1, MAX_LEN):
            padded = pad_sequences(tokenized, maxlen=MAX_LEN, dtype="long",
                                   value=0, truncating="post", padding="post")
            if padded[0][pertubed_index] != 0 and padded[0][pertubed_index] != sep_id:
                # print(padded.shape)
                cur_id = padded[0][pertubed_index]
                padded[0][pertubed_index] = mask_id
                cur_embedding = mask_embedding(model, padded, mask_index)
                d = dist(original_mask_embedding, cur_embedding)
                distance.append((cur_id, d))
        df['distance'].iloc[i] = distance

    df['perturbed_mask_index'] = 0
    df = df.astype('object')
    for i in range(len(df)):
        perturbed_mask_index = []
        mask_threshold = calculate_threshold(np.array(df['distance'].iloc[i])[:, 1], std_strength)
        for dis_index in range(len(df['distance'].iloc[i])):
            if df['distance'].iloc[i][dis_index][1] < mask_threshold:
                perturbed_mask_index.append(dis_index)
        df['perturbed_mask_index'].iloc[i] = perturbed_mask_index

    df['augment_token_id'] = 0
    df = df.astype('object')
    for i in range(len(df)):
        tokenized = tokenizer.encode(df['auxiliary_text'].iloc[i])
        tokenized = torch.Tensor(tokenized).unsqueeze(0).to(torch.int64).to(device)
        augment_tokenized = tokenizer.encode(df['auxiliary_text'].iloc[i])
        mask_tokenized = tokenizer.encode(df['auxiliary_text'].iloc[i])
        sep_index = mask_tokenized.index(sep_id)
        for j in range(len(df['perturbed_mask_index'].iloc[i])):
            perturbed_mask_index = df['perturbed_mask_index'].iloc[i][j] + sep_index + 1
            mask_tokenized[perturbed_mask_index] = mask_id
        mask_tokenized = torch.Tensor(mask_tokenized).unsqueeze(0).to(torch.int64).to(device)
        logits = mask_model(mask_tokenized).logits
        for j in range(len(df['perturbed_mask_index'].iloc[i])):
            perturbed_mask_index = df['perturbed_mask_index'].iloc[i][j] + sep_index + 1
            probs = logits[0, perturbed_mask_index].softmax(dim=0)
            values, predictions = probs.topk(1)
            augment_tokenized[perturbed_mask_index] = int(predictions.cpu().numpy())
        df['augment_token_id'].iloc[i] = augment_tokenized

    df['augment_text'] = 0
    df = df.astype('object')
    for i in range(len(df)):
        sep_index = df['augment_token_id'].iloc[i].index(sep_id)
        df['augment_text'].iloc[i] = tokenizer.decode(df['augment_token_id'].iloc[i][sep_index + 1:-1])

    return df
class BartMetricLearningModel(BartPretrainedModel):
    def __init__(self, config: BartConfig, **kwargs):
        super().__init__(config, **kwargs)
        self.model = BartModel(config)
        self.classification_head = BartClassificationHead(
            config.d_model,
            config.d_model,
            config.num_labels,
            config.classifier_dropout,
        )
        self.metric_hidden_size = 256
        self.metric_linear = nn.Linear(config.hidden_size, self.metric_hidden_size)
        # self.label_metric_linear = nn.Linear(config.hidden_size, self.metric_hidden_size)
        # self.predict_linear = nn.Linear(self.metric_hidden_size * 2, )
        self.scl_t = 1
        self.ce_p = 0.8
        self.scl_p = 0.1
        self.lscl_p = 0.1
        self.ce_loss_fct = CrossEntropyLoss()
        self.model._init_weights(self.classification_head.dense)
        self.model._init_weights(self.classification_head.out_proj)

    def scl_func(self, anchor_vectors, labels):
        """
        <<SUPERVISED CONTRASTIVE LEARNING FOR PRE-TRAINED LANGUAGE MODEL FINE-TUNING>>
        :param anchor_vectors: batch_size * hidden_size
        :param labels:
        :return:
        """
        total_losses = 0
        anchor_vectors = anchor_vectors.squeeze(dim=1)
        for i in range(anchor_vectors.shape[0]):
            anchor_vector = anchor_vectors[i, :]
            # other_index = torch.from_numpy(np.tile(np.array(list(filter(lambda x: x != i, range(anchor_vectors.shape[0])))),
            #                                        anchor_vectors.shape[1]).reshape(anchor_vectors.shape[1], -1))
            # other_vectors = torch.gather(anchor_vectors.transpose(1, 0), dim=1, index=other_index).transpose(1, 0)
            other_vectors = torch.from_numpy(
                np.delete(anchor_vectors.detach().cpu().numpy(), i, 0)
            ).to(anchor_vector.device)
            same_labels = torch.where(labels == labels[i])
            same_label_vectors = anchor_vectors[same_labels]
            if same_label_vectors.shape[0] > 0:
                up = torch.exp(
                    torch.cosine_similarity(same_label_vectors, anchor_vector.unsqueeze(0)) / self.scl_t)
                down = torch.sum(
                    torch.exp(
                        torch.cosine_similarity(other_vectors, anchor_vector.unsqueeze(0)) / self.scl_t))
                single_sample_loss = torch.sum(torch.log(up / down)) / -(anchor_vectors.shape[0] - 1)
                total_losses += single_sample_loss
        return total_losses

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        decoder_input_ids=None,
        decoder_attention_mask=None,
        encoder_outputs=None,
        inputs_embeds=None,
        decoder_inputs_embeds=None,
        labels=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        label_positions=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the sequence classification/regression loss. Indices should be in
            :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels > 1` a classification loss
            is computed (Cross-Entropy).
        """
        label_max_position = torch.max(label_positions[-1]).tolist()
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if labels is not None:
            use_cache = False

        if input_ids is None and inputs_embeds is not None:
            raise NotImplementedError(
                f"Passing input embeddings is currently not supported for {self.__class__.__name__}"
            )

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            encoder_outputs=encoder_outputs,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]  # last hidden state

        eos_mask = input_ids.eq(self.config.eos_token_id)
        if len(torch.unique(eos_mask.sum(1))) > 1:
            raise ValueError("All examples must have the same number of <eos> tokens.")
        sentence_representation = sequence_output[eos_mask, :].view(
            sequence_output.size(0), -1, sequence_output.size(-1))[:, -1, :]
        anchor_vector = sentence_representation.unsqueeze(dim=1)

        label_vectors = None
        for positions in label_positions:
            position = positions[0]
            label_vector = sequence_output[:, position, :]
            label_vector = torch.mean(label_vector, dim=1).unsqueeze(dim=1)
            if label_vectors is None:
                label_vectors = label_vector
            else:
                label_vectors = torch.cat([label_vectors, label_vector], dim=1)

        anchor_vector = self.metric_linear(anchor_vector)
        label_vectors = self.metric_linear(label_vectors)
        logits = torch.cosine_similarity(label_vectors, anchor_vector, dim=2)

        loss = None
        if labels is not None:
            ce_loss = self.ce_loss_fct(logits, labels)
            scl_loss = self.scl_func(anchor_vector.squeeze(dim=1), labels) / 10
            # true_label_vectors = label_vectors[range(len(labels)), labels, :]
            # scl_label_loss = self.scl_func(true_label_vectors, labels) / 10
            # center_loss = self.center_loss_fct(anchor_vector, labels)
            # label_distance_loss = self.label_distance_loss_fct(label_vectors)
            loss = ce_loss * self.ce_p + scl_loss * self.scl_p
            # loss = ce_loss * self.ce_p + scl_loss * self.scl_p + scl_label_loss * self.lscl_p
            # loss = ce_loss

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return ZeroShotOutput(
            loss=loss,
            logits=logits,
            anchor_vector=anchor_vector,
            label_vectors=label_vectors,
            hidden_states=sequence_output,
        )
def build_lookups(self):
    embeddings = BartModel.from_pretrained('facebook/bart-large-cnn').encoder.embed_tokens
    ids = torch.LongTensor([i for i in range(self.vocab_size)])
    return torch.transpose(embeddings(ids), 0, 1).detach()
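# Sanity-check sketch (not from the original code): the transposed lookup table built
# above, used as an nn.Linear weight, should reproduce a plain embedding lookup when fed
# a one-hot vector. Assumes the same facebook/bart-large-cnn checkpoint; the full vocab
# size is used here rather than the hard-coded 50264 above.
import torch
import torch.nn as nn
from transformers import BartModel

embed_tokens = BartModel.from_pretrained('facebook/bart-large-cnn').encoder.embed_tokens
vocab_size, d_model = embed_tokens.weight.shape          # (vocab_size, d_model)

onehot_proj = nn.Linear(vocab_size, d_model, bias=False)
ids = torch.arange(vocab_size)
onehot_proj.weight = nn.Parameter(torch.transpose(embed_tokens(ids), 0, 1).detach())

token_id = 42
onehot = torch.zeros(1, vocab_size)
onehot[0, token_id] = 1.0
# The projected one-hot vector should equal the embedding row for token_id.
print(torch.allclose(onehot_proj(onehot), embed_tokens(torch.tensor([token_id])), atol=1e-6))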
def get_embedding(type_embedding, data):
    if type_embedding.split('_')[0] == 'BERT' or type_embedding.split('_')[0] == 'bert':
        if type_embedding == 'BERT_portuguese_large_neural_mind':
            path = '/home/jeanfranco/Movile_project/Semi_supervised_learning/data/Brazilian_Bert/BERT_large_portuguese/'
        elif type_embedding == 'BERT_portuguese_base_neural_mind':
            path = '/home/jeanfranco/Movile_project/Semi_supervised_learning/data/Brazilian_Bert/BERT_base_portuguese/'
        elif type_embedding == 'bert_base_multilingual_cased':
            path = 'bert-base-multilingual-cased'
        elif type_embedding == 'bert_base_multilingual_uncased':
            data = [x.lower() for x in data]
            path = 'bert-base-multilingual-uncased'
        # load tokenizer and model
        tokenizer = BertTokenizer.from_pretrained(path)
        model = BertModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding.split('_')[0] == 'xlmroberta':
        if type_embedding == 'xlmroberta_base':
            path = 'xlm-roberta-base'
        elif type_embedding == 'xlmroberta_large':
            path = 'xlm-roberta-large'
        # load tokenizer and model
        tokenizer = XLMRobertaTokenizer.from_pretrained(path)
        model = XLMRobertaModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding == 'xlm':
        path = 'xlm-mlm-100-1280'
        # load tokenizer and model
        tokenizer = XLMTokenizer.from_pretrained(path)
        model = XLMModel.from_pretrained(path, output_hidden_states=True)
    #########
    # ENGLISH
    #########
    elif type_embedding == 'en_bert_base_uncased':
        path = 'bert-base-uncased'
        # load tokenizer and model
        tokenizer = BertTokenizer.from_pretrained(path)
        model = BertModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding == 'en_xlm_mlm_enfr_1024':
        path = 'xlm-mlm-enfr-1024'
        # load tokenizer and model
        tokenizer = XLMTokenizer.from_pretrained(path)
        model = XLMModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding == 'en_xlm_roberta_base':
        path = 'xlm-roberta-base'
        # load tokenizer and model
        tokenizer = XLMRobertaTokenizer.from_pretrained(path)
        model = XLMRobertaModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding == 'distilbert_base_cased':
        path = 'distilbert-base-cased'
        # load tokenizer and model
        tokenizer = DistilBertTokenizer.from_pretrained(path)
        model = DistilBertModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding == 'Mobile_Bert':
        path = 'google/mobilebert-uncased'
        # load tokenizer and model
        tokenizer = MobileBertTokenizer.from_pretrained(path)
        model = MobileBertModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding == 'Electra':
        path = 'google/electra-small-discriminator'
        # load tokenizer and model
        tokenizer = ElectraTokenizer.from_pretrained(path)
        model = ElectraModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding == 'BART':
        path = 'facebook/bart-large'
        # load tokenizer and model
        tokenizer = BartTokenizer.from_pretrained(path)
        model = BartModel.from_pretrained(path, output_hidden_states=True)

    # Set the device to GPU (cuda) if available, otherwise stick with CPU
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    list_of_four_last_embeddings = []
    list_of_mean = []

    for l in data:
        # Convert the string "granola bars" to tokenized vocabulary IDs
        input_ids = tokenizer.encode(l)
        # print(input_ids)
        # Convert the list of IDs to a tensor of IDs
        input_ids = torch.LongTensor(input_ids)
        # print(input_ids)
        model = model.to(device)
        input_ids = input_ids.to(device)
        # print(input_ids)
        model.eval()
        # unsqueeze IDs to get batch size of 1 as added dimension
        input_ids = input_ids.unsqueeze(0)
        with torch.no_grad():
            out = model(input_ids=input_ids)
        # we only want the hidden_states
        if type_embedding == 'xlm':
            hidden_states = out[1]
        else:
            hidden_states = out[2]
        # mean of layers
        sentence_embedding = torch.mean(hidden_states[-1], dim=1).squeeze()
        list_of_mean.append(sentence_embedding.tolist())
        # get last four layers
        last_four_layers = [hidden_states[i] for i in (-1, -2, -3, -4)]
        # cast layers to a tuple and concatenate over the last dimension
        cat_hidden_states = torch.cat(tuple(last_four_layers), dim=-1)
        # take the mean of the concatenated vector over the token dimension
        cat_sentence_embedding = torch.mean(cat_hidden_states, dim=1).squeeze()
        list_of_four_last_embeddings.append(cat_sentence_embedding.tolist())

    # print('list of four last embeddings', np.array(list_of_four_last_embeddings).shape)
    # print('list of mean', np.array(list_of_mean).shape)
    return list_of_mean, list_of_four_last_embeddings
def get_kobart_model():
    return BartModel.from_pretrained("hyunwoongko/kobart")
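# Minimal usage sketch for get_kobart_model(). Assumption: the hyunwoongko/kobart hub
# repo also ships tokenizer files loadable with PreTrainedTokenizerFast; if it does not,
# substitute the KoBART project's own tokenizer helper.
from transformers import PreTrainedTokenizerFast

def demo_kobart():
    model = get_kobart_model()
    tokenizer = PreTrainedTokenizerFast.from_pretrained("hyunwoongko/kobart")
    inputs = tokenizer("안녕하세요", return_tensors="pt")
    outputs = model(**inputs)
    print(outputs[0].shape)  # (1, seq_len, d_model) last hidden state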
    'tenacity': 5,
    'epoch_size': 4,
}

# Set up logger
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', default='bart-large', help='model name or path')
    args = parser.parse_args()

    config = BartConfig.from_pretrained(args.model)
    model = BartModel.from_pretrained(args.model, config=config)
    tokenizer = BartTokenizer.from_pretrained(args.model)
    params_senteval['model'] = model.cuda().eval()
    params_senteval['tokenizer'] = tokenizer
    params_senteval['config'] = config

    se = senteval.engine.SE(params_senteval, batcher, prepare)
    transfer_tasks = [
        'STS12', 'STS13', 'STS14', 'STS15', 'STS16',
        'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC',
        'SICKEntailment', 'SICKRelatedness', 'STSBenchmark',
        'Length', 'WordContent', 'Depth', 'TopConstituents', 'BigramShift',
        'Tense', 'SubjNumber', 'ObjNumber', 'OddManOut', 'CoordinationInversion',
        'ImageCaptionRetrieval', 'SNLI',
    ]
    results = se.eval(transfer_tasks)
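# Hedged sketch of the `prepare` and `batcher` callbacks referenced above but not shown
# in this snippet. These follow SentEval's expected signatures and mean-pool BART's last
# hidden state over non-padding tokens; they are plausible stand-ins, not the original
# author's implementations.
import torch

def prepare(params, samples):
    # Nothing to precompute for a fixed pretrained encoder.
    return

def batcher(params, batch):
    # SentEval passes batches of tokenized sentences (lists of words); join them back.
    sentences = [' '.join(tokens) if isinstance(tokens, list) else tokens for tokens in batch]
    inputs = params['tokenizer'](sentences, return_tensors='pt', padding=True,
                                 truncation=True, max_length=128)
    inputs = {k: v.cuda() for k, v in inputs.items()}
    with torch.no_grad():
        last_hidden = params['model'](**inputs)[0]           # (batch, seq_len, d_model)
    mask = inputs['attention_mask'].unsqueeze(-1).float()
    embeddings = (last_hidden * mask).sum(1) / mask.sum(1)   # mean over real tokens
    return embeddings.cpu().numpy()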