Example 1
    def __init__(self, config: BartConfig, **kwargs: Any):
        """The classification init is a super set of LM init"""
        PretrainedBartModel.__init__(self, config, **kwargs)
        self.model = BartModel(config)

        self.classification_head = BartClassificationHead(
            config.d_model, config.d_model, config.num_labels,
            config.classif_dropout)
        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)

        self.model._init_weights(self.classification_head.dense)
        self.model._init_weights(self.classification_head.out_proj)
        self.model._init_weights(self.lm_head)
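
For context, the sketch below (not part of the snippet above) shows how a head of this shape is typically applied to the decoder hidden state at the final <eos> token, following the BartForSequenceClassification pattern; the ClassificationHead class and toy sizes are assumptions.

import torch
import torch.nn as nn

class ClassificationHead(nn.Module):
    """Dense -> tanh -> dropout -> out_proj, mirroring BartClassificationHead's shape."""
    def __init__(self, d_model, num_labels, dropout=0.0):
        super().__init__()
        self.dense = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.out_proj = nn.Linear(d_model, num_labels)

    def forward(self, x):
        x = torch.tanh(self.dense(self.dropout(x)))
        return self.out_proj(self.dropout(x))

batch, seq_len, d_model, eos_token_id = 2, 7, 16, 2
hidden = torch.randn(batch, seq_len, d_model)    # stand-in for BartModel's last hidden state
input_ids = torch.randint(3, 100, (batch, seq_len))
input_ids[:, -1] = eos_token_id                  # make sure every row ends with <eos>
eos_mask = input_ids.eq(eos_token_id)
sentence_repr = hidden[eos_mask, :].view(batch, -1, d_model)[:, -1, :]
logits = ClassificationHead(d_model, num_labels=3)(sentence_repr)
print(logits.shape)  # torch.Size([2, 3])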
Example 2
    def test_advanced_inputs(self):
        # (config, input_ids, token_type_ids, input_mask, *unused) = \
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common(
        )
        decoder_input_ids, decoder_attn_mask = _prepare_bart_decoder_inputs(
            config, inputs_dict["input_ids"])
        model = BartModel(config)
        model.to(torch_device)
        model.eval()
        # test init
        self.assertTrue(
            (model.encoder.embed_tokens.weight == model.shared.weight
             ).all().item())

        def _check_var(module):
            """Check that we initialized various parameters from N(0, config.init_std)."""
            self.assertAlmostEqual(
                torch.std(module.weight).item(), config.init_std, 2)

        _check_var(model.encoder.embed_tokens)
        _check_var(model.encoder.layers[0].self_attn.k_proj)
        _check_var(model.encoder.layers[0].fc1)
        _check_var(model.encoder.embed_positions)

        decoder_features_with_created_mask = model.forward(**inputs_dict)[0]
        decoder_features_with_passed_mask = model.forward(
            decoder_attention_mask=decoder_attn_mask,
            decoder_input_ids=decoder_input_ids,
            **inputs_dict)[0]
        _assert_tensors_equal(decoder_features_with_passed_mask,
                              decoder_features_with_created_mask)
        useless_mask = torch.zeros_like(decoder_attn_mask)
        decoder_features = model.forward(decoder_attention_mask=useless_mask,
                                         **inputs_dict)[0]
        self.assertTrue(isinstance(
            decoder_features, torch.Tensor))  # no hidden states or attentions
        self.assertEqual(decoder_features.size(),
                         (self.model_tester.batch_size,
                          self.model_tester.seq_length, config.d_model))
        if decoder_attn_mask.min().item() < -1e3:  # some tokens were masked
            self.assertFalse(
                (decoder_features_with_created_mask == decoder_features
                 ).all().item())

        # Test different encoder attention masks
        decoder_features_with_long_encoder_mask = model.forward(
            inputs_dict["input_ids"],
            attention_mask=inputs_dict["attention_mask"].long())[0]
        _assert_tensors_equal(decoder_features_with_long_encoder_mask,
                              decoder_features_with_created_mask)
Example 3
def main2():
    config = BartWithAdapterConfig.from_pretrained('facebook/bart-base')
    bart = MyBartWithAdapter(config)

    bart_old = BartModel.from_pretrained("facebook/bart-base")
    bart.model.load_state_dict(bart_old.state_dict(), strict=False)

    config = BartWithAdapterConfig.from_pretrained('facebook/bart-base')
    # config.adapt_layer_norm = True
    generator = ParameterGenerator(config)

    output = generator(torch.tensor([[1,2,3]]))
    print(output)
    print(output[0].size())

    growingbart = GrowingBart(bart, generator, config)

    output = growingbart(torch.tensor([[4,1,3,4,3,5,6,3,2]]), torch.tensor([[1,1,1,1,1,1,1,1,1]]),
        torch.tensor([[4,1,3,4,3,5,6,3,2]]), torch.tensor([[1,1,1,1,1,1,1,1,1]]),
        torch.tensor([[4,1,3,4,3,5,6,3,2]]), torch.tensor([[1,1,1,1,1,1,1,1,1]]))

    print(output)
    
    loss = output[0].sum(-1).sum(-1).sum(-1)
    print(loss)
    loss.backward()
Example 4
    def test_advanced_inputs(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        config.use_cache = False
        inputs_dict["input_ids"][:, -2:] = config.pad_token_id
        decoder_input_ids, decoder_attn_mask, causal_mask = _prepare_bart_decoder_inputs(
            config, inputs_dict["input_ids"]
        )
        model = BartModel(config).to(torch_device).eval()

        decoder_features_with_created_mask = model(**inputs_dict)[0]
        decoder_features_with_passed_mask = model(
            decoder_attention_mask=invert_mask(decoder_attn_mask), decoder_input_ids=decoder_input_ids, **inputs_dict
        )[0]
        _assert_tensors_equal(decoder_features_with_passed_mask, decoder_features_with_created_mask)
        useless_mask = torch.zeros_like(decoder_attn_mask)
        decoder_features = model(decoder_attention_mask=useless_mask, **inputs_dict)[0]
        self.assertTrue(isinstance(decoder_features, torch.Tensor))  # no hidden states or attentions
        self.assertEqual(
            decoder_features.size(), (self.model_tester.batch_size, self.model_tester.seq_length, config.d_model)
        )
        if decoder_attn_mask.min().item() < -1e3:  # some tokens were masked
            self.assertFalse((decoder_features_with_created_mask == decoder_features).all().item())

        # Test different encoder attention masks
        decoder_features_with_long_encoder_mask = model(
            inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"].long()
        )[0]
        _assert_tensors_equal(decoder_features_with_long_encoder_mask, decoder_features_with_created_mask)
Example 5
def main():
    config = BartWithAdapterConfig.from_pretrained('facebook/bart-base')
    bart = MyBartWithAdapter(config)

    bart_old = BartModel.from_pretrained("facebook/bart-base")
    ret = bart.model.load_state_dict(bart_old.state_dict(), strict=False)

    print(ret)
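
The printed value is the missing_keys/unexpected_keys pair that load_state_dict(..., strict=False) returns, which is how adapter parameters absent from the pretrained checkpoint surface. A self-contained toy illustration (the modules below are made up):

import torch.nn as nn

src = nn.Linear(4, 4)
dst = nn.Sequential(nn.Linear(4, 4))  # parameter names gain a "0." prefix, so nothing matches
ret = dst.load_state_dict(src.state_dict(), strict=False)
print(ret.missing_keys)     # ['0.weight', '0.bias']
print(ret.unexpected_keys)  # ['weight', 'bias']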
    def __init__(self):
        super().__init__()
        self.config = BartConfig.from_pretrained('facebook/bart-large',
                                                 use_cache=False)

        bart = BartModel(self.config)
        self.encoder = bart.encoder
        self.decoder = bart.decoder
        self.linear = nn.Linear(1024, 50265, bias=False)
Example 7
    def __init__(self, model_name):
        super().__init__()

        bart = BartModel.from_pretrained(model_name)
        self.hidden_dim = bart.config.hidden_size
        self.bart_encoder = bart.encoder
        self.bart_encoder.embed_tokens = lambda x: x
        self.bart_encoder.embed_positions = lambda x: torch.zeros(
            (x.shape[0], x.shape[1], self.hidden_dim), dtype=torch.float32)
Example 8
 def __init__(self):
     super(MoralClassifier, self).__init__()
     # self.l1 = DistilBertModel.from_pretrained('distilbert-base-uncased')
     self.l1 = BartModel.from_pretrained('facebook/bart-large-cnn')
     # Pooler
     self.l2 = torch.nn.Linear(1024, 1024)
     self.act = torch.nn.Tanh()
     # Classifier
     self.l3 = torch.nn.Dropout(0.3)
     self.l4 = torch.nn.Linear(1024, 11)  # 11 categories
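
A hypothetical forward pass for a pooler/classifier of this shape; the first-token pooling choice and the toy batch are assumptions, not taken from the snippet:

import torch
import torch.nn as nn

class MoralHead(nn.Module):
    def __init__(self, hidden=1024, n_labels=11, dropout=0.3):
        super().__init__()
        self.l2 = nn.Linear(hidden, hidden)    # pooler
        self.act = nn.Tanh()
        self.l3 = nn.Dropout(dropout)
        self.l4 = nn.Linear(hidden, n_labels)  # classifier

    def forward(self, last_hidden_state):
        pooled = self.act(self.l2(last_hidden_state[:, 0]))  # pool the first token
        return self.l4(self.l3(pooled))

logits = MoralHead()(torch.randn(2, 16, 1024))
print(logits.shape)  # torch.Size([2, 11])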
Example 9
 def __init__(self, config: BartConfig):
     super().__init__(config)
     self.model = BartModel(config)
     self.pointer = BartMultiPointerHead(
         config.d_model,
         config.decoder_attention_heads,
         dropout=config.attention_dropout,
     )
     self.heads_combination = nn.Linear(config.decoder_attention_heads, 1)
     self.eos_token_id = config.eos_token_id
     self.pad_token_id = config.pad_token_id
Example 10
 def __init__(self, config, crf=None, output_concat=False):
     super().__init__(config)
     self.num_labels = config.num_labels
     self.bart = BartModel(config)
     self.dropout = nn.Dropout(config.dropout)
     self.classifier = nn.Linear(config.d_model, config.num_labels)
     self.loss_fct = nn.CrossEntropyLoss()
     self.use_crf = False
     self.output_concat = output_concat
     self.crf_layer = crf
     self.init_weights()
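
Illustration only: a per-token head of this shape is normally applied position-wise and trained with a flattened cross-entropy; the toy shapes below are assumptions.

import torch
import torch.nn as nn

d_model, num_labels, batch, seq = 16, 5, 2, 7
dropout = nn.Dropout(0.1)
classifier = nn.Linear(d_model, num_labels)
loss_fct = nn.CrossEntropyLoss()

decoder_hidden = torch.randn(batch, seq, d_model)   # stand-in for BartModel's decoder output
logits = classifier(dropout(decoder_hidden))        # (batch, seq, num_labels)
labels = torch.randint(0, num_labels, (batch, seq))
loss = loss_fct(logits.view(-1, num_labels), labels.view(-1))
print(logits.shape, float(loss))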
def convert_bart_checkpoint(checkpoint_path,
                            pytorch_dump_folder_path,
                            hf_checkpoint_name=None):
    """
    Copy/paste/tweak model's weights to our BART structure.
    """
    if not os.path.exists(checkpoint_path):
        bart = torch.hub.load("pytorch/fairseq", checkpoint_path).eval()
    else:
        bart = load_xsum_checkpoint(checkpoint_path)

    bart.model.upgrade_state_dict(bart.model.state_dict())
    if hf_checkpoint_name is None:
        hf_checkpoint_name = checkpoint_path.replace(".", "-")
    config = BartConfig.from_pretrained(hf_checkpoint_name)
    tokens = bart.encode(SAMPLE_TEXT).unsqueeze(0)
    tokens2 = BartTokenizer.from_pretrained(hf_checkpoint_name).encode(
        SAMPLE_TEXT, return_tensors="pt").unsqueeze(0)
    assert torch.eq(tokens, tokens2).all()

    if checkpoint_path == "bart.large.mnli":
        state_dict = bart.state_dict()
        remove_ignore_keys_(state_dict)
        state_dict["model.shared.weight"] = state_dict[
            "model.decoder.embed_tokens.weight"]
        for src, dest in mnli_rename_keys:
            rename_key(state_dict, src, dest)
        model = BartForSequenceClassification(config).eval()
        model.load_state_dict(state_dict)
        fairseq_output = bart.predict("mnli", tokens, return_logits=True)
        new_model_outputs = model(tokens)[0]  # logits
    else:  # no classification heads to worry about
        state_dict = bart.model.state_dict()
        remove_ignore_keys_(state_dict)
        state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"]
        fairseq_output = bart.extract_features(tokens)
        if hf_checkpoint_name == "bart-large":
            model = BartModel(config).eval()
            model.load_state_dict(state_dict)
            new_model_outputs = model(tokens)[0]
        else:
            model = BartForConditionalGeneration(
                config).eval()  # an existing summarization ckpt
            model.model.load_state_dict(state_dict)
            if hasattr(model, "lm_head"):
                model.lm_head = _make_linear_from_emb(model.model.shared)
            new_model_outputs = model.model(tokens)[0]

    # Check results
    assert fairseq_output.shape == new_model_outputs.shape
    assert (fairseq_output == new_model_outputs).all().item()
    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    model.save_pretrained(pytorch_dump_folder_path)
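
The helpers remove_ignore_keys_ and rename_key are not reproduced here; minimal versions consistent with how the script calls them (the exact ignore-key list is an assumption) would be:

def remove_ignore_keys_(state_dict):
    # fairseq bookkeeping tensors with no counterpart in the HF model
    for k in ["encoder.version", "decoder.version", "model.encoder.version",
              "model.decoder.version", "_float_tensor"]:
        state_dict.pop(k, None)


def rename_key(state_dict, old, new):
    state_dict[new] = state_dict.pop(old)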
Example 12
 def test_inference_no_head(self):
     model = BartModel.from_pretrained("facebook/bart-large").to(torch_device)
     input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
     inputs_dict = prepare_bart_inputs_dict(model.config, input_ids)
     with torch.no_grad():
         output = model(**inputs_dict)[0]
     expected_shape = torch.Size((1, 11, 1024))
     self.assertEqual(output.shape, expected_shape)
     expected_slice = torch.tensor(
         [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]], device=torch_device
     )
     self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE))
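
_long_tensor and prepare_bart_inputs_dict are defined elsewhere in the test module; versions consistent with their use here (an assumption, reusing the module-level torch_device) might look like:

import torch

def _long_tensor(tokens_list):
    return torch.tensor(tokens_list, dtype=torch.long, device=torch_device)


def prepare_bart_inputs_dict(config, input_ids, attention_mask=None):
    if attention_mask is None:
        attention_mask = input_ids.ne(config.pad_token_id)
    return {"input_ids": input_ids, "attention_mask": attention_mask}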
def convert_bart_checkpoint(checkpoint_path, pytorch_dump_folder_path):
    """
    Copy/paste/tweak model's weights to our BART structure.
    """
    b2 = torch.hub.load("pytorch/fairseq", checkpoint_path)
    b2.eval()  # disable dropout
    b2.model.upgrade_state_dict(b2.model.state_dict())
    config = BartConfig()
    tokens = b2.encode(SAMPLE_TEXT).unsqueeze(0)
    tokens2 = BartTokenizer.from_pretrained("bart-large").encode(
        SAMPLE_TEXT).unsqueeze(0)
    assert torch.eq(tokens, tokens2).all()

    # assert their_output.size() == (1, 11, 1024)

    if checkpoint_path == "bart.large":
        state_dict = b2.model.state_dict()
        state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"]
        model = BartModel(config)
        their_output = b2.extract_features(tokens)

    else:  # MNLI Case
        state_dict = b2.state_dict()
        state_dict["model.shared.weight"] = state_dict[
            "model.decoder.embed_tokens.weight"]
        for src, dest in rename_keys:
            rename_key(state_dict, src, dest)
        state_dict.pop("_float_tensor", None)
        model = BartForSequenceClassification(config)
        their_output = b2.predict("mnli", tokens, return_logits=True)
    for k in IGNORE_KEYS:
        state_dict.pop(k, None)
    model.load_state_dict(state_dict)
    model.eval()
    our_outputs = model.forward(tokens)[0]

    assert their_output.shape == our_outputs.shape
    assert (their_output == our_outputs).all().item()
    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    model.save_pretrained(pytorch_dump_folder_path)
Example 14
 def __init__(self, args):
     super(MoralClassifier, self).__init__()
     self.hparams = args
     self.l1 = BartModel.from_pretrained('facebook/bart-large-cnn')
     # freeze bart weights
     # for param in self.l1.parameters():
     #     param.requires_grad = False
     # Pooler
     self.l2 = torch.nn.Linear(1024, 1024)
     self.act = torch.nn.Tanh()
     # Classifier
     self.l3 = torch.nn.Dropout(0.2)
     self.l4 = torch.nn.Linear(1024, 10)  # 10 categories
Example 15
 def __init__(self):
     super(Model, self).__init__()
     self.model = BartModel.from_pretrained(
         model_config.pretrain_model_path)
     self.config = self.model.config
     self.classification_head = BartClassificationHead(
         self.config.d_model,
         self.config.d_model,
         self.config.num_labels,
         self.config.classif_dropout,
     )
     self.model._init_weights(self.classification_head.dense)
     self.model._init_weights(self.classification_head.out_proj)
Example 16
 def __init__(self, model_name, use_pretrained_embeddings=False):
     super().__init__()
     # or some flag that indicates the bart encoder in its entirety could be used.
     if use_pretrained_embeddings:
         # will use the entire bart encoder including all embeddings
         bart = PretrainedTransformerEmbedder(model_name,
                                              sub_module="encoder")
         self.hidden_dim = bart.config.hidden_size
         self.bart_encoder = bart.transformer_model
     else:
         bart = BartModel.from_pretrained(model_name)
         self.hidden_dim = bart.config.hidden_size
         self.bart_encoder = bart.encoder
         # bypass BART's own embeddings so pre-computed embeddings can be fed in directly
         self.bart_encoder.embed_tokens = lambda x: x
         self.bart_encoder.embed_positions = lambda x: torch.zeros(
             (x.shape[0], x.shape[1], self.hidden_dim), dtype=torch.float32)
    def __init__(self, config: BartConfig, **kwargs):
        super().__init__(config, **kwargs)
        self.model = BartModel(config)
        self.classification_head = BartClassificationHead(
            config.d_model,
            config.d_model,
            config.num_labels,
            config.classifier_dropout,
        )
        self.metric_hidden_size = 256
        self.metric_linear = nn.Linear(config.hidden_size,
                                       self.metric_hidden_size)
        # self.label_metric_linear = nn.Linear(config.hidden_size, self.metric_hidden_size)
        # self.predict_linear = nn.Linear(self.metric_hidden_size * 2, )
        self.scl_t = 1

        self.ce_p = 0.8
        self.scl_p = 0.1
        self.lscl_p = 0.1
        self.ce_loss_fct = CrossEntropyLoss()

        self.model._init_weights(self.classification_head.dense)
        self.model._init_weights(self.classification_head.out_proj)
Example 18
    def __init__(self, large, temp_dir, finetune=False, bart=False):
    # def __init__(self, large, temp_dir, finetune=False):
        super(Bert, self).__init__()
        if(large):
            self.model = BertModel.from_pretrained('bert-large-uncased', cache_dir=temp_dir)
        else:
            # self.model = BertModel.from_pretrained('bert-base-uncased', cache_dir=temp_dir)
            if bart:
                self.model = BartModel.from_pretrained('/home/ybai/downloads/bart', cache_dir=temp_dir, local_files_only=True)
                # self.model = BartForConditionalGeneration.from_pretrained('/home/ybai/downloads/bart', cache_dir=temp_dir, local_files_only=True)
            else:
                self.model = BertModel.from_pretrained('bert-base-multilingual-uncased', cache_dir=temp_dir,
                                                       local_files_only=False)

        self.finetune = finetune
Example 19
 def __init__(self, config, output_concat=False):
     super().__init__(config)
     self.num_labels = config.num_labels
     self.bart = BartModel(config)
     self.dropout = nn.Dropout(config.dropout)
     self.classifier = nn.Linear(config.d_model, config.num_labels)
     self.loss_fct = nn.CrossEntropyLoss()
     self.use_crf = config.use_crf
     if self.use_crf:
         self.crf_layer = Transformer_CRF(
             num_labels=config.num_labels,
             start_label_id=config.label2idx['CLS'])
     else:
         self.crf_layer = None
     self.output_concat = output_concat
     self.init_weights()
Example 20
    def test_initialization_more(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        model = BartModel(config)
        model.to(torch_device)
        model.eval()
        # test init
        self.assertTrue((model.encoder.embed_tokens.weight == model.shared.weight).all().item())

        def _check_var(module):
            """Check that we initialized various parameters from N(0, config.init_std)."""
            self.assertAlmostEqual(torch.std(module.weight).item(), config.init_std, 2)

        _check_var(model.encoder.embed_tokens)
        _check_var(model.encoder.layers[0].self_attn.k_proj)
        _check_var(model.encoder.layers[0].fc1)
        _check_var(model.encoder.embed_positions)
    def __init__(self, args, use_mask=True):
        super(OneHotMoralClassifier, self).__init__()
        self.hparams = args
        self.bart = BartModel.from_pretrained('facebook/bart-large-cnn')
        self.use_mask = use_mask

        self.vocab_size = 50264
        self.onehot_embeddings = nn.Linear(self.vocab_size, 1024, bias=False)
        self.onehot_embeddings.weight = nn.Parameter(self.build_lookups())

        # self.bart.encoder.embed_tokens = nn.Identity()
        # freeze bert weights
        # self.onehot_embeddings.requires_grad = False
        # self.onehot_embeddings.weight.requires_grad = False
        # for param in self.bart.parameters():
        #     param.requires_grad = False

        # Pooler
        self.l2 = torch.nn.Linear(1024, 1024)
        self.act = torch.nn.Tanh()
        # Classifier
        self.l3 = torch.nn.Dropout(0.2)
        self.l4 = torch.nn.Linear(1024, 10)  # 10 categories
def convert_bart_checkpoint(checkpoint_path, pytorch_dump_folder_path):
    """
    Copy/paste/tweak model's weights to our BART structure.
    """
    bart = torch.hub.load("pytorch/fairseq", checkpoint_path)
    bart.eval()  # disable dropout
    bart.model.upgrade_state_dict(bart.model.state_dict())
    hf_model_name = checkpoint_path.replace(".", "-")
    config = BartConfig.from_pretrained(hf_model_name)
    tokens = bart.encode(SAMPLE_TEXT).unsqueeze(0)
    tokens2 = BartTokenizer.from_pretrained(hf_model_name).encode(SAMPLE_TEXT, return_tensors="pt").unsqueeze(0)
    assert torch.eq(tokens, tokens2).all()

    if checkpoint_path in ["bart.large", "bart.large.cnn"]:
        state_dict = bart.model.state_dict()
        for k in IGNORE_KEYS:
            state_dict.pop(k, None)
        state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"]
        model = BartModel(config)
        their_output = bart.extract_features(tokens)
    else:  # MNLI Case
        state_dict = bart.state_dict()
        for k in IGNORE_KEYS:
            state_dict.pop(k, None)
        state_dict["model.shared.weight"] = state_dict["model.decoder.embed_tokens.weight"]
        for src, dest in rename_keys:
            rename_key(state_dict, src, dest)
        model = BartForSequenceClassification(config)
        their_output = bart.predict("mnli", tokens, return_logits=True)

    # Load state dict
    model.load_state_dict(state_dict)
    model.eval()
    # Check results

    if checkpoint_path == "bart.large.cnn":
        model = BartForConditionalGeneration(config, base_model=model)
        assert "lm_head.weight" in model.state_dict()
        assert model.lm_head.out_features == config.vocab_size
        model.eval()
        our_outputs = model.model(tokens)[0]
    else:
        our_outputs = model(tokens)[0]
    assert their_output.shape == our_outputs.shape
    assert (their_output == our_outputs).all().item()
    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    model.save_pretrained(pytorch_dump_folder_path)
Example 23
 def test_model_from_pretrained(self):
     # Forces 1.6GB download from S3 for each model
     for model_name in list(BART_PRETRAINED_MODEL_ARCHIVE_MAP.keys()):
         model = BartModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
         self.assertIsNotNone(model)
 def __init__(self, n_vocab=50264):
     self.n_vocab = n_vocab
     self.true_embedding = BartModel.from_pretrained(
         'facebook/bart-large-cnn').encoder.embed_tokens
Example 25
def Seq2Seq(df):
    model_type = 'facebook/bart-large'

    tokenizer = BartTokenizer.from_pretrained(model_type)
    model = BartModel.from_pretrained(model_type)
    mask_model = BartForConditionalGeneration.from_pretrained(model_type)

    sep_token = '</s>'
    mask_token = '<mask>'

    mask_id = tokenizer(mask_token, return_tensors='pt')['input_ids'][0][1]
    sep_id = tokenizer(sep_token, return_tensors='pt')['input_ids'][0][1]

    optimizer = AdamW(model.parameters())
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    mask_model.to(device)

    df['mask_text'] = 0
    df['auxiliary_text'] = 0

    for i in range(len(df)):
        aspect = df['aspect'].iloc[i]
        sentiment = df['sentiment'].iloc[i]

        if aspect == 'NULL' or isinstance(aspect, (int, float)):
            aspect = 'aspect'

        if DPM_type == 'Senti':
            mask_sent = 'the polarity of the ' + aspect + ' is ' + mask_token + ' ' + sep_token + ' '
            auxiliary_sent = 'the polarity of the ' + aspect + ' is ' + sentiment + ' ' + sep_token + ' '
        elif DPM_type == 'AS':
            mask_sent = 'the polarity of the ' + mask_token + ' is ' + sentiment + ' ' + sep_token + ' '
            auxiliary_sent = 'the polarity of the ' + aspect + ' is ' + sentiment + ' ' + sep_token + ' '

        df['mask_text'].iloc[i] = mask_sent + df['text'].iloc[i]
        df['auxiliary_text'].iloc[i] = auxiliary_sent + df['text'].iloc[i]

    df['distance'] = 0
    df = df.astype('object')

    for i in range(len(df)):

        tokenized = df['mask_text'][i:i + 1].apply((lambda x: tokenizer.encode(
            x, add_special_tokens=True, max_length=MAX_LEN, truncation=True)))

        sep_index = tokenized[i].index(sep_id)
        mask_index = tokenized[i].index(mask_id)

        padded = pad_sequences(tokenized,
                               maxlen=MAX_LEN,
                               dtype="long",
                               value=0,
                               truncating="post",
                               padding="post")

        attention_mask = np.where(padded != 0, 1, 0)

        input_ids = torch.tensor(padded).to(device)
        attention_mask = torch.tensor(attention_mask).to(device)

        with torch.no_grad():
            last_hidden_states = model(input_ids,
                                       attention_mask=attention_mask)

        original_mask_embedding = last_hidden_states[0][:, mask_index, :].cpu(
        ).numpy()

        distance = []

        for pertubed_index in range(sep_index + 1, MAX_LEN):
            padded = pad_sequences(tokenized,
                                   maxlen=MAX_LEN,
                                   dtype="long",
                                   value=0,
                                   truncating="post",
                                   padding="post")
            if padded[0][pertubed_index] != 0 and padded[0][
                    pertubed_index] != sep_id:
                #print(padded.shape)
                cur_id = padded[0][pertubed_index]
                padded[0][pertubed_index] = mask_id

                cur_embedding = mask_embedding(model, padded, mask_index)
                d = dist(original_mask_embedding, cur_embedding)
                distance.append((cur_id, d))

        df['distance'].iloc[i] = distance

    df['perturbed_mask_index'] = 0
    df = df.astype('object')

    for i in range(len(df)):
        perturbed_mask_index = []
        mask_threshold = calculate_threshold(
            np.array(df['distance'].iloc[i])[:, 1], std_strength)
        for dis_index in range(len(df['distance'].iloc[i])):
            if df['distance'].iloc[i][dis_index][1] < mask_threshold:
                perturbed_mask_index.append(dis_index)

        df['perturbed_mask_index'].iloc[i] = perturbed_mask_index

    df['augment_token_id'] = 0
    df = df.astype('object')

    for i in range(len(df)):
        tokenized = tokenizer.encode(df['auxiliary_text'].iloc[i])
        tokenized = torch.Tensor(tokenized).unsqueeze(0).to(
            torch.int64).to(device)
        augment_tokenized = tokenizer.encode(df['auxiliary_text'].iloc[i])

        mask_tokenized = tokenizer.encode(df['auxiliary_text'].iloc[i])
        sep_index = mask_tokenized.index(sep_id)

        for j in range(len(df['perturbed_mask_index'].iloc[i])):
            perturbed_mask_index = df['perturbed_mask_index'].iloc[i][
                j] + sep_index + 1
            mask_tokenized[perturbed_mask_index] = mask_id

        mask_tokenized = torch.Tensor(mask_tokenized).unsqueeze(0).to(
            torch.int64).to(device)
        logits = mask_model(mask_tokenized).logits

        for j in range(len(df['perturbed_mask_index'].iloc[i])):
            perturbed_mask_index = df['perturbed_mask_index'].iloc[i][
                j] + sep_index + 1
            probs = logits[0, perturbed_mask_index].softmax(dim=0)
            values, predictions = probs.topk(1)
            augment_tokenized[perturbed_mask_index] = int(
                predictions.cpu().numpy())

        df['augment_token_id'].iloc[i] = augment_tokenized

    df['augment_text'] = 0
    df = df.astype('object')

    for i in range(len(df)):
        sep_index = df['augment_token_id'].iloc[i].index(sep_id)
        df['augment_text'].iloc[i] = tokenizer.decode(
            df['augment_token_id'].iloc[i][sep_index + 1:-1])

    return df
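
mask_embedding, dist, and calculate_threshold are used above but not shown; the sketches below are guesses consistent with how they are called (in particular, the thresholding rule in calculate_threshold is an assumption):

import numpy as np
import torch

def mask_embedding(model, padded, mask_index):
    # hidden state at the <mask> position, mirroring the inline code above
    input_ids = torch.tensor(padded).to(next(model.parameters()).device)
    with torch.no_grad():
        last_hidden_states = model(input_ids)
    return last_hidden_states[0][:, mask_index, :].cpu().numpy()

def dist(a, b):
    # Euclidean distance between two mask embeddings (assumed metric)
    return float(np.linalg.norm(a - b))

def calculate_threshold(distances, std_strength):
    # tokens whose perturbation moves the mask embedding less than this get masked
    return distances.mean() - std_strength * distances.std()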
class BartMetricLearningModel(BartPretrainedModel):
    def __init__(self, config: BartConfig, **kwargs):
        super().__init__(config, **kwargs)
        self.model = BartModel(config)
        self.classification_head = BartClassificationHead(
            config.d_model,
            config.d_model,
            config.num_labels,
            config.classifier_dropout,
        )
        self.metric_hidden_size = 256
        self.metric_linear = nn.Linear(config.hidden_size,
                                       self.metric_hidden_size)
        # self.label_metric_linear = nn.Linear(config.hidden_size, self.metric_hidden_size)
        # self.predict_linear = nn.Linear(self.metric_hidden_size * 2, )
        self.scl_t = 1

        self.ce_p = 0.8
        self.scl_p = 0.1
        self.lscl_p = 0.1
        self.ce_loss_fct = CrossEntropyLoss()

        self.model._init_weights(self.classification_head.dense)
        self.model._init_weights(self.classification_head.out_proj)

    def scl_func(self, anchor_vectors, labels):
        """
        <<SUPERVISED CONTRASTIVE LEARNING FOR PRE-TRAINED LANGUAGE MODEL FINE-TUNING>>
        :param anchor_vector: batch_size * hidden_size
        :param labels:
        :return:
        """

        total_losses = 0
        anchor_vectors = anchor_vectors.squeeze(dim=1)
        for i in range(anchor_vectors.shape[0]):
            anchor_vector = anchor_vectors[i, :]
            # other_index = torch.from_numpy(np.tile(np.array(list(filter(lambda x: x != i, range(anchor_vectors.shape[0])))),
            #                                        anchor_vectors.shape[1]).reshape(anchor_vectors.shape[1], -1))
            # other_vectors = torch.gather(anchor_vectors.transpose(1, 0), dim=1, index=other_index).transpose(1, 0)

            other_vectors = torch.from_numpy(
                np.delete(anchor_vectors.detach().cpu().numpy(), i,
                          0)).to(anchor_vector.device)
            same_labels = torch.where(labels == labels[i])
            same_label_vectors = anchor_vectors[same_labels]
            if same_label_vectors.shape[0] > 0:
                up = torch.exp(
                    torch.cosine_similarity(same_label_vectors,
                                            anchor_vector.unsqueeze(0)) /
                    self.scl_t)
                down = torch.sum(
                    torch.exp(
                        torch.cosine_similarity(other_vectors,
                                                anchor_vector.unsqueeze(0)) /
                        self.scl_t))
                singe_sample_loss = torch.sum(torch.log(
                    up / down)) / -(anchor_vectors.shape[0] - 1)
                total_losses += singe_sample_loss
        return total_losses

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        decoder_input_ids=None,
        decoder_attention_mask=None,
        encoder_outputs=None,
        inputs_embeds=None,
        decoder_inputs_embeds=None,
        labels=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        label_positions=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
            config.num_labels - 1]`. If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        label_max_position = torch.max(label_positions[-1]).tolist()
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if labels is not None:
            use_cache = False

        if input_ids is None and inputs_embeds is not None:
            raise NotImplementedError(
                f"Passing input embeddings is currently not supported for {self.__class__.__name__}"
            )

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            encoder_outputs=encoder_outputs,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]  # last hidden state

        eos_mask = input_ids.eq(self.config.eos_token_id)

        if len(torch.unique(eos_mask.sum(1))) > 1:
            raise ValueError(
                "All examples must have the same number of <eos> tokens.")
        sentence_representation = sequence_output[eos_mask, :].view(
            sequence_output.size(0), -1, sequence_output.size(-1))[:, -1, :]

        anchor_vector = sentence_representation.unsqueeze(dim=1)

        label_vectors = None
        for positions in label_positions:
            position = positions[0]
            label_vector = sequence_output[:, position, :]
            label_vector = torch.mean(label_vector, dim=1).unsqueeze(dim=1)
            if label_vectors is None:
                label_vectors = label_vector
            else:
                label_vectors = torch.cat([label_vectors, label_vector], dim=1)

        anchor_vector = self.metric_linear(anchor_vector)
        label_vectors = self.metric_linear(label_vectors)
        logits = torch.cosine_similarity(label_vectors, anchor_vector, dim=2)

        loss = None
        if labels is not None:
            ce_loss = self.ce_loss_fct(logits, labels)
            scl_loss = self.scl_func(anchor_vector.squeeze(dim=1), labels) / 10
            # true_label_vectors = label_vectors[range(len(labels)), labels, :]
            # scl_label_loss = self.scl_func(true_label_vectors, labels) / 10

            # center_loss = self.center_loss_fct(anchor_vector, labels)
            # label_distance_loss = self.label_distance_loss_fct(label_vectors)

            loss = ce_loss * self.ce_p + scl_loss * self.scl_p
            # loss = ce_loss * self.ce_p + scl_loss * self.scl_p + scl_label_loss * self.lscl_p
            # loss = ce_loss

        if not return_dict:
            output = (logits, ) + outputs[2:]
            return ((loss, ) + output) if loss is not None else output
        return ZeroShotOutput(loss=loss,
                              logits=logits,
                              anchor_vector=anchor_vector,
                              label_vectors=label_vectors,
                              hidden_states=sequence_output)
 def build_lookups(self):
     embeddings = BartModel.from_pretrained(
         'facebook/bart-large-cnn').encoder.embed_tokens
     ids = torch.LongTensor([i for i in range(self.vocab_size)])
     return torch.transpose(embeddings(ids), 0, 1).detach()
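
The transposed table returned by build_lookups is what makes the bias-free nn.Linear over one-hot vectors behave like an embedding lookup; a toy check with small sizes (purely illustrative):

import torch
import torch.nn as nn
import torch.nn.functional as F

vocab, dim = 10, 4
emb = nn.Embedding(vocab, dim)
onehot_proj = nn.Linear(vocab, dim, bias=False)
onehot_proj.weight = nn.Parameter(emb.weight.T.detach().clone())  # (dim, vocab), as build_lookups returns

ids = torch.tensor([3, 7])
onehot = F.one_hot(ids, num_classes=vocab).float()
assert torch.allclose(onehot_proj(onehot), emb(ids))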
Example 28
def get_embedding(type_embedding, data):
    if type_embedding.split('_')[0] == 'BERT' or type_embedding.split(
            '_')[0] == 'bert':
        if type_embedding == 'BERT_portuguese_large_neural_mind':
            path = '/home/jeanfranco/Movile_project/Semi_supervised_learning/data/Brazilian_Bert/BERT_large_portuguese/'
        elif type_embedding == 'BERT_portuguese_base_neural_mind':
            path = '/home/jeanfranco/Movile_project/Semi_supervised_learning/data/Brazilian_Bert/BERT_base_portuguese/'
        elif type_embedding == 'bert_base_multilingual_cased':
            path = 'bert-base-multilingual-cased'
        elif type_embedding == 'bert_base_multilingual_uncased':
            data = [x.lower() for x in data]
            path = 'bert-base-multilingual-uncased'
        #load tokenizer and model
        tokenizer = BertTokenizer.from_pretrained(path)
        model = BertModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding.split('_')[0] == 'xlmroberta':
        if type_embedding == 'xlmroberta_base':
            path = 'xlm-roberta-base'
        elif type_embedding == 'xlmroberta_large':
            path = 'xlm-roberta-large'
        #load tokenizer and model
        tokenizer = XLMRobertaTokenizer.from_pretrained(path)
        model = XLMRobertaModel.from_pretrained(path,
                                                output_hidden_states=True)
    elif type_embedding == 'xlm':
        path = 'xlm-mlm-100-1280'
        #load tokenizer and model
        tokenizer = XLMTokenizer.from_pretrained(path)
        model = XLMModel.from_pretrained(path, output_hidden_states=True)
    #########
    #ENGLISH
    #########
    elif type_embedding == 'en_bert_base_uncased':
        path = 'bert-base-uncased'
        #load tokenizer and model
        tokenizer = BertTokenizer.from_pretrained(path)
        model = BertModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding == 'en_xlm_mlm_enfr_1024':
        path = 'xlm-mlm-enfr-1024'
        #load tokenizer and model
        tokenizer = XLMTokenizer.from_pretrained(path)
        model = XLMModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding == 'en_xlm_roberta_base':
        path = 'xlm-roberta-base'
        #load tokenizer and model
        tokenizer = XLMRobertaTokenizer.from_pretrained(path)
        model = XLMRobertaModel.from_pretrained(path,
                                                output_hidden_states=True)
    elif type_embedding == 'distilbert_base_cased':
        path = 'distilbert-base-cased'
        #load tokenizer and model
        tokenizer = DistilBertTokenizer.from_pretrained(path)
        model = DistilBertModel.from_pretrained(path,
                                                output_hidden_states=True)
    elif type_embedding == 'Mobile_Bert':
        path = 'google/mobilebert-uncased'
        #load tokenizer and model
        tokenizer = MobileBertTokenizer.from_pretrained(path)
        model = MobileBertModel.from_pretrained(path,
                                                output_hidden_states=True)
    elif type_embedding == 'Electra':
        path = 'google/electra-small-discriminator'
        #load tokenizer and model
        tokenizer = ElectraTokenizer.from_pretrained(path)
        model = ElectraModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding == 'BART':
        path = 'facebook/bart-large'
        #load tokenizer and model
        tokenizer = BartTokenizer.from_pretrained(path)
        model = BartModel.from_pretrained(path, output_hidden_states=True)

    # Set the device to GPU (cuda) if available, otherwise stick with CPU
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    list_of_four_last_embeddings = []
    list_of_mean = []

    for l in data:
        # Convert the input string to tokenized vocabulary IDs
        input_ids = tokenizer.encode(l)
        #print(input_ids)
        # Convert the list of IDs to a tensor of IDs
        input_ids = torch.LongTensor(input_ids)
        #print(input_ids)
        model = model.to(device)
        input_ids = input_ids.to(device)
        #print(input_ids)
        model.eval()

        # unsqueeze IDs to get batch size of 1 as added dimension
        input_ids = input_ids.unsqueeze(0)
        with torch.no_grad():
            out = model(input_ids=input_ids)

        # we only want the hidden_states
        if type_embedding == 'xlm':
            hidden_states = out[1]
        else:
            hidden_states = out[2]
        #mean of layers
        sentence_embedding = torch.mean(hidden_states[-1], dim=1).squeeze()
        list_of_mean.append(sentence_embedding.tolist())

        # get last four layers
        last_four_layers = [hidden_states[i] for i in (-1, -2, -3, -4)]
        # cast layers to a tuple and concatenate over the last dimension
        cat_hidden_states = torch.cat(tuple(last_four_layers), dim=-1)

        # take the mean of the concatenated vector over the token dimension
        cat_sentence_embedding = torch.mean(cat_hidden_states, dim=1).squeeze()
        list_of_four_last_embeddings.append(cat_sentence_embedding.tolist())

    #print('list of four last embeddings', np.array(list_of_four_last_embeddings).shape)
    #print('list of mean', np.array(list_of_mean).shape)

    return list_of_mean, list_of_four_last_embeddings
def get_kobart_model():
    return BartModel.from_pretrained("hyunwoongko/kobart")
Example 30
    'tenacity': 5,
    'epoch_size': 4
}

# Set up logger
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--model',
                        default='bart-large',
                        help='model name or path')
    args = parser.parse_args()

    config = BartConfig.from_pretrained(args.model)
    model = BartModel.from_pretrained(args.model, config=config)
    tokenizer = BartTokenizer.from_pretrained(args.model)

    params_senteval['model'] = model.cuda().eval()
    params_senteval['tokenizer'] = tokenizer
    params_senteval['config'] = config

    se = senteval.engine.SE(params_senteval, batcher, prepare)
    transfer_tasks = [
        'STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'MR', 'CR', 'MPQA',
        'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 'SICKEntailment',
        'SICKRelatedness', 'STSBenchmark', 'Length', 'WordContent', 'Depth',
        'TopConstituents', 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber',
        'OddManOut', 'CoordinationInversion', 'ImageCaptionRetrieval', 'SNLI'
    ]
    results = se.eval(transfer_tasks)
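
prepare, batcher, and the rest of params_senteval come from earlier in this script and are not shown; a plausible mean-pooling batcher for the model and tokenizer stored above (an assumption, following SentEval's batcher(params, batch) interface) could be:

import torch

def prepare(params, samples):
    return  # nothing to precompute for a frozen pretrained encoder

def batcher(params, batch):
    tokenizer, model = params['tokenizer'], params['model']
    sentences = [' '.join(tokens) if tokens else '.' for tokens in batch]
    enc = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)
    enc = {k: v.to(next(model.parameters()).device) for k, v in enc.items()}
    with torch.no_grad():
        hidden = model(**enc)[0]                        # (batch, seq, d_model)
    mask = enc['attention_mask'].unsqueeze(-1).float()
    embeddings = (hidden * mask).sum(1) / mask.sum(1)   # mean over non-padding tokens
    return embeddings.cpu().numpy()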