Example #1
    def __init__(self, task, bert_model, marginalization, tau_gumbel_softmax,
                 hard_gumbel_softmax, eps_gumbel_softmax, label_smoothing,
                 soft_bert_score, mixed_proportion):
        super().__init__(task)

        self.bert_model = bert_model

        self.marginalization = marginalization

        self.bert_scorer = BERTScorer(
            self.bert_model,
            soft_bert_score=soft_bert_score)  # , device='cpu')
        self.pad_token_id = self.bert_scorer._tokenizer.convert_tokens_to_ids(
            '[PAD]')

        # Gumbel-Softmax hyperparameters
        self.tau_gumbel_softmax = tau_gumbel_softmax
        self.hard_gumbel_softmax = hard_gumbel_softmax
        self.eps_gumbel_softmax = eps_gumbel_softmax

        # NLL parameters
        self.eps = label_smoothing

        # Cosine loss
        self.cos_loss = CosineEmbeddingLoss(reduction='sum')

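        # mixing coefficient from mixed_proportion, presumably weighting the NLL and BERT-score loss terms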
        self._lambda = torch.tensor(mixed_proportion).to(
            self.bert_scorer.device)

        # File
        self.loss_stats_file = open('stats_mixed_nll_bert_sparsemax.txt', 'w')
        self.loss_stats_file.write('accuracy\tF_BERT\tLoss\n')
Example #2
    def __init__(self, student_config, teacher_config, device, args):
        self.mse_loss = MSELoss()
        self.kl_loss = KLDivLoss(reduction='batchmean')
        self.cosine_loss = CosineEmbeddingLoss()
        self.distill_config = student_config.distillation_config
        self.device = device
        self.student_config = student_config
        self.teacher_config = teacher_config
        self.batch_size = args.train_batch_size
Example #3
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        input_labels=None,
    ):
        loss = defaultdict(float)
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        sequences_output = outputs[0]  # bs x seq x hidden

        syn_labels = input_labels['syn_labels']  # bs
        positions = input_labels['positions']  # bs x 4

        syn_features = self.extract_features(sequences_output,
                                             positions)  # bs x hidden
        clf = (self.syn_mse_clf
               if self.local_config['loss'] in {'mseplus_loss', 'mse_loss'}
               else self.syn_clf)
        syn_logits = clf(syn_features)  # bs x 2 or bs

        if input_labels is not None:
            if self.local_config['loss'] != 'cosine_similarity':
                y_size = syn_logits.size(-1)
            else:
                y_size = -1
            if y_size == 1:
                lossfn = (MSELoss() if self.local_config['loss'] == 'mse_loss'
                          else MSEPlusLoss())
                loss['total'] = lossfn(syn_logits,
                                       syn_labels.unsqueeze(-1).float())
            elif self.local_config['loss'] == 'crossentropy_loss':
                loss['total'] = CrossEntropyLoss()(syn_logits, syn_labels)
            else:
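                # CosineEmbeddingLoss expects targets in {-1, +1}, hence the (syn_labels * 2 - 1) remapping of {0, 1} labels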
                loss['total'] = CosineEmbeddingLoss()(syn_logits[0],
                                                      syn_logits[1],
                                                      syn_labels * 2 - 1)

        return (loss, syn_logits)
Example #4
    def __init__(
            self,
            optimizer_class=torch.optim.Adam,
            optim_wt_decay=0.,
            epochs=5,
            regularization=None,
            loss_type='cos',
            all_senses=None,
            all_supersenses=None,
            elmo_class=None,  # for sense vector in the model
            file_path="",
            device=device,
            **kwargs):

        ## Training parameters
        self.epochs = epochs
        self.elmo_class = elmo_class

        ## optimizer
        self.optimizer = optimizer_class
        self.optim_wt_decay = optim_wt_decay

        # target word index and senses list
        self.all_senses = all_senses
        self.all_supersenses = all_supersenses

        self._init_kwargs = kwargs
        self.device = device

        # loss to calculate the similarity between two tensors
        if loss_type == 'mse':
            self.loss = MSELoss().to(self.device)
        else:
            self.loss = CosineEmbeddingLoss().to(self.device)
        '''
		if regularization == "l1":
			self.regularization = L1Loss()
		elif regularization == "smoothl1":
			self.regularization = SmoothL1Loss()
		else:
			self.regularization = None
		'''
        self.best_model_file = file_path + "word_sense_model_.pth"
Example #5
    def forward(self,
                input_ids,
                token_type_ids=None,
                attention_mask=None,
                labels=None,
                sim_labels=None):
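        # token_type_ids is 0 for sentence 1 and 1 for sentence 2, so this keeps only sentence-1 tokens in the mask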
        sen1_attention_mask = (1 - token_type_ids) * attention_mask

        _, pooled_output_combined = self.bert(input_ids,
                                              token_type_ids,
                                              attention_mask,
                                              output_all_encoded_layers=False)
        pooled_output_combined = self.dropout(pooled_output_combined)

        _, pooled_output_sen1 = self.bert(input_ids,
                                          token_type_ids,
                                          sen1_attention_mask,
                                          output_all_encoded_layers=False)

        cos_sim = self.cosine(pooled_output_combined,
                              pooled_output_sen1).unsqueeze(1)

        combined = torch.cat([pooled_output_combined, cos_sim], dim=1)
        logits = self.classifier(combined)

        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss_bert = loss_fct(logits.view(-1, self.num_labels),
                                 labels.view(-1))

            #print("Labels:", labels[10:])
            #new_labels = (1.0 - labels) + (labels * -1.0)
            #print("New Labels:", new_labels[10:])

            loss_cosine = CosineEmbeddingLoss()
            loss_intent = loss_cosine(pooled_output_combined,
                                      pooled_output_sen1, sim_labels.float())

            loss = self.alpha * loss_bert + (1 - self.alpha) * loss_intent

            return loss
        else:
            return logits
Example #6
    def __init__(self, task, bert_model, marginalization, tau_gumbel_softmax, hard_gumbel_softmax, eps_gumbel_softmax,
                 soft_bert_score, force_alignment):
        super().__init__(task)

        self.bert_model = bert_model

        self.marginalization = marginalization
        self.force_alignment = force_alignment

        self.bert_scorer = BERTScorer(self.bert_model, soft_bert_score=soft_bert_score)  # , device='cpu')
        self.pad_token_id = self.bert_scorer._tokenizer.convert_tokens_to_ids('[PAD]')

        # Gumbel-Softmax hyperparameters
        self.tau_gumbel_softmax = tau_gumbel_softmax
        self.hard_gumbel_softmax = hard_gumbel_softmax
        self.eps_gumbel_softmax = eps_gumbel_softmax

        # Cosine loss
        self.cos_loss = CosineEmbeddingLoss(reduction='sum')
        # self.cos_sim = CosineSimilarity(dim=1)

        # File
        self.loss_stats_file = open('stats_aligned_bert_'+self.marginalization+'.txt', 'w')
        self.loss_stats_file.write('accuracy\tBERT_loss\n')
Example #7
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        input_ids2=None,
        attention_mask2=None,
        token_type_ids2=None,
        position_ids2=None,
        head_mask2=None,
        inputs_embeds2=None,
        labels2=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the sequence classification/regression loss.
            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).

    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import BertTokenizer, BertForSequenceClassification
        import torch

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)

        loss, logits = outputs[:2]

        """

        _, outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            #             position_ids=position_ids,
            #             head_mask=head_mask,
            #             inputs_embeds=inputs_embeds,
        )

        _, outputs2 = self.bert(
            input_ids2,
            attention_mask=attention_mask2,
            token_type_ids=token_type_ids2,
            #             position_ids=position_ids2,
            #             head_mask=head_mask2,
            #             inputs_embeds=inputs_embeds2,
        )

        pooled_output = outputs
        pooled_output2 = outputs2

        pooled_output = self.dropout(pooled_output)
        pooled_output2 = self.dropout(pooled_output2)

        #         A series of different concatenations(concat(),|minus|,multiply, ...)
        final_output_cat = torch.cat((pooled_output, pooled_output2), 1)
        final_output_minus = torch.abs(pooled_output - pooled_output2)
        final_output_mult = torch.mul(pooled_output, pooled_output2)
        #         final_output_mimu = torch.cat((final_output_minus, final_output_mult),1)
        #         final_output_camu = torch.cat((final_output_cat, final_output_mult),1)
        #         final_output_cami = torch.cat((final_output_cat, final_output_minus),1)
        final_output_camimu = torch.cat(
            (final_output_cat, final_output_minus, final_output_mult), 1)

        cos_pooled_outputs = torch.cosine_similarity(pooled_output,
                                                     pooled_output2,
                                                     dim=1)
        #         1
        #         torch.Size([hidden_size*2, 768])
        #         2
        #         torch.Size([hidden_size, 768])
        #         3
        #         torch.Size([hidden_size, 768])
        #         4
        #         torch.Size([hidden_size*2, 768])
        #         5
        #         torch.Size([hidden_size*3, 768])
        #         6
        #         torch.Size([hidden_size*3, 768])
        #         7
        #         torch.Size([hidden_size*4, 768])

        #         batch_size = list(pooled_output.size())[0]
        #         hidden_size = list(pooled_output.size())[1]

        final_output_all = torch.cat(
            (final_output_camimu, cos_pooled_outputs.unsqueeze(1)), 1)
        logits_ce = self.classifier(final_output_all)
        #         print('logits_ce:')
        #         print(logits_ce)

        #         logits_ori = self.classifier2(final_output_camimu)
        #         print('logits_ori:')
        #         print(logits_ori)

        #Calculate loss during training process
        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits_ce.view(-1), labels.view(-1))
            else:
                loss_fct_ce = CrossEntropyLoss()
                loss_ce = loss_fct_ce(logits_ce.view(-1, self.num_labels),
                                      labels.view(-1))
                #                 logger.info('loss_ce:')
                #                 logger.info(loss_ce)

                #                 loss_ori = loss_fct_ce(logits_ori.view(-1, self.num_labels), labels.view(-1))
                #                 print('loss_ori:')
                #                 print(loss_ori)
                loss_fct_cos = CosineEmbeddingLoss()

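                # CosineEmbeddingLoss targets must be in {-1, +1}: remap 0 -> -1 in place, then restore below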
                labels2[labels2 == 0] = -1
                loss_cos = loss_fct_cos(pooled_output, pooled_output2, labels2)
                labels2[labels2 == -1] = 0

                #                 labels2[labels2==1] = -1
                #                 labels2[labels2==0] = 1
                #                 loss_cos = loss_fct_cos(pooled_output, pooled_output2, labels2)
                #                 labels2[labels2== 1] = 0
                #                 labels2[labels2==-1] = 1

                #                 logger.info('loss_cos:')
                #                 logger.info(loss_cos)

                loss = loss_ce + loss_cos
                #                 logger.info('final loss:')
                #                 logger.info(loss)

                #             outputs = (loss,) + outputs
                #             outputs = (loss,) + logits_cos
                outputs = loss
                return outputs
        else:
            #Get predictions when doing evaluation
            return logits_ce
Example #8
def CosineLoss(A, B):
    lossfunc = CosineEmbeddingLoss(margin=0.5)
    y = torch.tensor(1.0)
    if A.is_cuda:
        y = y.cuda()
    return lossfunc(A, B, target=y)
Example #9
    def forward(self, input_ids, token_type_ids=None, attention_mask=None,
                labels=None, entity_labels=None, checkpoint_activations=False):

        sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        # **YD** entity branch forward.
        entity_logits = self.entity_classifier(sequence_output)
        # **YD** may not require activation function
        entity_logits = self.activate(entity_logits)

        # entity_logits = F.normalize(entity_logits, 2, 2)
        # entity_logits = torch.matmul(entity_logits, self.entity_emb.weight.T)
        # entity_logits = torch.log(entity_logits)

        if labels is not None:
            loss_fct = CrossEntropyLoss()
            entity_loss_fct = CosineEmbeddingLoss()
            # Only keep active parts of the loss
            if attention_mask is not None:
                '''
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)[active_loss]
                active_labels = labels.view(-1)[active_loss]
                loss = loss_fct(active_logits, active_labels)
                '''
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)
                active_labels = torch.where(
                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
                ner_loss = loss_fct(active_logits, active_labels)

                '''
                entity_labels[entity_labels == _OUT_DICT_ENTITY_ID] = _IGNORE_CLASSIFICATION_LABEL
                assert entity_labels.requires_grad is False
                entity_active_logits = entity_logits.view(-1, self.num_entity_labels)
                entity_active_labels = torch.where(
                    active_loss, entity_labels.view(-1),
                    torch.tensor(entity_loss_fct.ignore_index).type_as(entity_labels)
                )
                entity_loss = entity_loss_fct(entity_active_logits, entity_active_labels)
                '''

                # entity_active_loss = (labels.view(-1) == NER_LABEL_DICT['B']) | active_loss
                entity_active_loss = (entity_labels.view(-1) > 0)
                entity_active_logits = entity_logits.view(-1, self.dim_entity_emb)[entity_active_loss]
                entity_active_labels = entity_labels.view(-1)[entity_active_loss]

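                # pull each active token's entity representation toward its gold entity embedding (cosine target = +1)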
                entity_loss = entity_loss_fct(
                    entity_active_logits,
                    self.entity_emb.weight[entity_active_labels],
                    torch.tensor(1).type_as(entity_labels)
                )

                print('ner_loss', ner_loss, 'entity_loss', entity_loss)
                if torch.isnan(entity_loss):
                    loss = ner_loss
                else:
                    loss = ner_loss + entity_loss
                assert not torch.isnan(loss)
            else:
                # loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
                raise ValueError("attention_mask must not be None")

            return loss
        else:
            return logits, entity_logits
Example #10
    def __init__(self, args, src_dict, tgt_dict, src_embedding, tgt_embedding,
                 device):
        super(E2E, self).__init__(args)

        self.args = args
        self.src_dict = src_dict
        self.tgt_dict = tgt_dict

        # src_flow: assume tgt embeddings are transformed from the src mog space
        self.register_buffer('src_embedding', src_embedding)
        self.register_buffer('tgt_embedding', tgt_embedding)

        if args.init_var:
            # initialize with gaussian variance
            self.register_buffer("s2t_s_var", src_dict.var)
            self.register_buffer("s2t_t_var", tgt_dict.var)
            self.register_buffer("t2s_s_var", src_dict.var)
            self.register_buffer("t2s_t_var", tgt_dict.var)
        else:
            self.s2t_s_var = args.s_var
            self.s2t_t_var = args.s2t_t_var
            self.t2s_t_var = args.t_var
            self.t2s_s_var = args.t2s_s_var

        self.register_buffer('src_freqs',
                             torch.tensor(src_dict.freqs, dtype=torch.float))
        self.register_buffer('tgt_freqs',
                             torch.tensor(tgt_dict.freqs, dtype=torch.float))

        # backward: t2s
        self.src_flow = MogFlow_batch(args, self.t2s_s_var)
        # backward: s2t
        self.tgt_flow = MogFlow_batch(args, self.s2t_t_var)

        self.s2t_valid_dico = None
        self.t2s_valid_dico = None

        self.device = device
        # use dict pairs from train data (supervise) or identical words (supervise_id) as supervisions
        self.supervise = args.supervise_id
        if self.supervise:
            self.load_training_dico()
            if args.sup_obj == 'mse':
                self.sup_loss_func = nn.MSELoss()
            elif args.sup_obj == 'cosine':
                self.sup_loss_func = CosineEmbeddingLoss()

        optim_fn, optim_params = get_optimizer(args.flow_opt_params)
        self.flow_optimizer = optim_fn(
            list(self.src_flow.parameters()) +
            list(self.tgt_flow.parameters()), **optim_params)
        self.flow_scheduler = torch.optim.lr_scheduler.ExponentialLR(
            self.flow_optimizer, gamma=args.lr_decay)

        self.best_valid_metric = 1e-12

        self.sup_sw = args.sup_s_weight
        self.sup_tw = args.sup_t_weight

        self.mse_loss = nn.MSELoss()
        self.cos_loss = CosineEmbeddingLoss()

        # Evaluation on trained model
        if args.load_from_pretrain_s2t != "" or args.load_from_pretrain_t2s != "":
            self.load_from_pretrain()
Example #11
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        input_ids2=None,
        attention_mask2=None,
        token_type_ids2=None,
        position_ids2=None,
        head_mask2=None,
        inputs_embeds2=None,
        labels2=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the sequence classification/regression loss.
            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).

    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import BertTokenizer, BertForSequenceClassification
        import torch

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)

        loss, logits = outputs[:2]

        """

        _, outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
#             position_ids=position_ids,
#             head_mask=head_mask,
#             inputs_embeds=inputs_embeds,
        )

        _, outputsC = self.bert(
            input_ids2,
            attention_mask=attention_mask2,
            token_type_ids=token_type_ids2,
#             position_ids=position_ids2,
#             head_mask=head_mask2,
#             inputs_embeds=inputs_embeds2,
        )
#         print("Careful, outputs:")
#         print(outputs)
#         print(outputsC)
        pooled_output = outputs
        pooled_outputC = outputsC

        pooled_output = self.dropout(pooled_output)
#         pooled_outputC = self.dropout(pooled_outputC)
        
        cos_pooled_outputs = torch.cosine_similarity(pooled_output, pooled_outputC, dim=1)
        
#         print('pooled_output size:')
#         print(pooled_output.size())
#         print(pooled_output)
#         print('cos_pooled_outputs size:')
#         print(cos_pooled_outputs.size())
#         print(cos_pooled_outputs)
        batch_size = list(pooled_output.size())[0]
        hidden_size = list(pooled_output.size())[1]

#         logits_ce = self.classifier2(pooled_outputC)
#         print('logits_ce:')
#         print(logits_ce)
        
    
        ## v2: concat
        ## v3: multiply
        ## v4: v2 & ce_cos_similarity
        ## v5: v3 & ce_cos_similarity
        
#         print(torch.cat((pooled_output, cos_pooled_outputs.unsqueeze(1)),1))
#         print((pooled_output*cos_pooled_outputs.unsqueeze(1)))
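        # append the scalar cosine similarity as one extra feature alongside the pooled representation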
        logits_cos = self.classifier(torch.cat((pooled_output, cos_pooled_outputs.unsqueeze(1)),1))
        logits_final = logits_cos
#         logits_cos = self.classifier2((pooled_output*cos_pooled_outputs.unsqueeze(1)))
#         self.classifier = torch.nn.Linear(hidden_size+batch_size, 2).to(device)
#         logits_cos = self.classifier(torch.cat((pooled_output, cos_pooled_outputs.repeat(batch_size,1)),1))
#         print('logits_cos:')
#         print(logits_cos)
        
#         logits = self.classifier(pooled_output)
#         logitsC = self.classifier(pooled_outputC)

#         outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
#         print(logits)
#         print('xd')
#         print(outputs[2:])
#         outputs = (logits,) + outputs[2:]
#         print("labels:")
#         print(labels)
        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits_final.view(-1), labels.view(-1))
            else:
#                 loss_fct_ce = CrossEntropyLoss()
#                 loss_ce = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

                loss_fct_ce = CrossEntropyLoss()
#                 print('pooled_output size:')
#                 print(pooled_output.size())
                loss_ce = loss_fct_ce(logits_final.view(-1, self.num_labels), labels.view(-1))
#                 loss_ce = loss_fct_ce(pooled_output.view(-1), labels.view(-1))
#                 print('loss_ce:')
#                 print(loss_ce)

                loss_fct_cos = CosineEmbeddingLoss()
                labels2[labels2==0] = -1
                loss_cos = loss_fct_cos(pooled_output, pooled_outputC, labels2)
                labels2[labels2==-1] = 0
                
#                 loss_fct_cos = CosineEmbeddingLoss()
# #                 print(labels)
# #                 print(pooled_outputC)


# #                 labels2[labels2==0] = -1
#                 loss_cos = loss_fct_cos(pooled_output, pooled_outputC, labels2)
        
        
#                 loss_cos = loss_fct_cos(logits_ce, logits_cos, labels2)
#                 loss_cos = loss_fct_ce(logits_cos.view(-1, self.num_labels), labels2.view(-1))
#                 print('loss_cos:')
#                 print(loss_cos)
            
                loss = loss_cos+loss_ce
#                 print('final loss:')
#                 print(loss)
#                 logits = self.classifier(loss)
#             outputs = (loss,) + outputs
#             outputs = (loss,) + logits_cos 
                outputs = loss
                return outputs
        else:
            return logits_final
Example #12
def getPredictionLossFn(cl=None, net=None):
    kldivLoss = KLDivLoss()
    mseLoss = MSELoss()
    smoothl1Loss = SmoothL1Loss()
    tripletLoss = TripletMarginLoss()  #TripletLoss()
    cosineLoss = CosineEmbeddingLoss(margin=0.5)
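    # with margin=0.5, the cosine loss penalizes negative pairs only when cos(x1, x2) > 0.5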
    if PREDICTION_LOSS == 'MSE':

        def prediction_loss(predFeature, nextFeature):
            return mseLoss(predFeature, nextFeature)
    elif PREDICTION_LOSS == 'SMOOTHL1':

        def prediction_loss(predFeature, nextFeature):
            return smoothl1Loss(predFeature, nextFeature)
    elif PREDICTION_LOSS == 'TRIPLET':

        def prediction_loss(predFeature,
                            nextFeature,
                            negativeFeature=None,
                            cl=cl,
                            net=net):
            if not negativeFeature:
                negatives, _, _ = cl.randomSamples(1)  #predFeature.size(0))
                negativeFeature = net(
                    Variable(negatives[0], requires_grad=False).cuda(),
                    Variable(negatives[1],
                             requires_grad=False).cuda()).detach()
            return tripletLoss(predFeature.unsqueeze(0),
                               nextFeature.unsqueeze(0), negativeFeature)
    elif PREDICTION_LOSS == 'COSINE':

        def prediction_loss(predFeature,
                            nextFeature,
                            negativeFeature=None,
                            cl=cl,
                            net=net):
            if not negativeFeature:
                negatives, _, _ = cl.randomSamples(1)  #predFeature.size(0))
                negativeFeature = net(
                    Variable(negatives[0], requires_grad=False).cuda(),
                    Variable(negatives[1],
                             requires_grad=False).cuda()).detach()
            else:
                negativeFeature = negativeFeature.unsqueeze(0)
            predFeature = predFeature.unsqueeze(0)
            nextFeature = nextFeature.unsqueeze(0)
            # concat positive and negative features
            # create targets for concatenated positives and negatives
            input1 = torch.cat([predFeature, predFeature], dim=0)
            input2 = torch.cat([nextFeature, negativeFeature], dim=0)
            target1 = Variable(torch.ones(predFeature.size(0)),
                               requires_grad=False).detach().cuda()
            target2 = -target1
            target = torch.cat([target1, target2], dim=0)
            return cosineLoss(input1, input2, target)
    else:

        def prediction_loss(predFeature, nextFeature):
            return kldivLoss(F.log_softmax(predFeature),
                             F.softmax(nextFeature))

    return prediction_loss
Example #13
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        input_ids2=None,
        attention_mask2=None,
        token_type_ids2=None,
        position_ids2=None,
        head_mask2=None,
        inputs_embeds2=None,
        labels2=None,
        input_ids3=None,
        attention_mask3=None,
        token_type_ids3=None,
        position_ids3=None,
        head_mask3=None,
        inputs_embeds3=None,
        labels3=None

        #         input_ids4=None,
        #         attention_mask4=None,
        #         token_type_ids4=None,
        #         position_ids4=None,
        #         head_mask4=None,
        #         inputs_embeds4=None,
        #         labels4=None
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the sequence classification/regression loss.
            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).

    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import BertTokenizer, BertForSequenceClassification
        import torch

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)

        loss, logits = outputs[:2]

        """
        # Pers rep
        _, outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            #             position_ids=position_ids,
            #             head_mask=head_mask,
            #             inputs_embeds=inputs_embeds,
        )
        # Claim rep
        _, outputs2 = self.bert(
            input_ids2,
            attention_mask=attention_mask2,
            token_type_ids=token_type_ids2,
            #             position_ids=position_ids2,
            #             head_mask=head_mask2,
            #             inputs_embeds=inputs_embeds2,
        )
        # Opp Pers rep
        _, outputs3 = self.bert(
            input_ids3,
            attention_mask=attention_mask3,
            token_type_ids=token_type_ids3,
            #             position_ids=position_ids2,
            #             head_mask=head_mask2,
            #             inputs_embeds=inputs_embeds2,
        )
        # Opp Claim rep
        #         _, outputs4 = self.bert(
        #             input_ids4,
        #             attention_mask=attention_mask4,
        #             token_type_ids=token_type_ids4,
        # #             position_ids=position_ids2,
        # #             head_mask=head_mask2,
        # #             inputs_embeds=inputs_embeds2,
        #         )

        pooled_output = outputs
        pooled_output2 = outputs2
        pooled_output3 = outputs3
        #         pooled_output4 = outputs4

        pooled_output = self.dropout(pooled_output)
        pooled_output2 = self.dropout(pooled_output2)
        pooled_output3 = self.dropout(pooled_output3)
        #         pooled_output4 = self.dropout(pooled_output4)

        #         A series of different concatenations(concat(),|minus|,multiply, ...)
        final_output_cat = torch.cat((pooled_output2, pooled_output), 1)
        final_output_minus = torch.abs(pooled_output2 - pooled_output)
        final_output_mult = torch.mul(pooled_output2, pooled_output)
        #         final_output_mimu = torch.cat((final_output_minus, final_output_mult),1)
        #         final_output_camu = torch.cat((final_output_cat, final_output_mult),1)
        #         final_output_cami = torch.cat((final_output_cat, final_output_minus),1)
        final_output_camimu = torch.cat(
            (final_output_cat, final_output_minus, final_output_mult), 1)
        cos_pooled_outputs = torch.cosine_similarity(pooled_output2,
                                                     pooled_output,
                                                     dim=1)

        #         ocop_final_output_cat = torch.cat((pooled_output4, pooled_output3),1)
        #         ocop_final_output_minus = torch.abs(pooled_output4-pooled_output3)
        #         ocop_final_output_mult = torch.mul(pooled_output4, pooled_output3)
        #         final_output_mimu = torch.cat((final_output_minus, final_output_mult),1)
        #         final_output_camu = torch.cat((final_output_cat, final_output_mult),1)
        #         final_output_cami = torch.cat((final_output_cat, final_output_minus),1)
        #         ocop_final_output_camimu = torch.cat((ocop_final_output_cat, ocop_final_output_minus, ocop_final_output_mult),1)
        #         ocop_cos_pooled_outputs = torch.cosine_similarity(pooled_output4, pooled_output3, dim=1)

        cop_final_output_cat = torch.cat((pooled_output2, pooled_output3), 1)
        cop_final_output_minus = torch.abs(pooled_output2 - pooled_output3)
        cop_final_output_mult = torch.mul(pooled_output2, pooled_output3)
        #         final_output_mimu = torch.cat((final_output_minus, final_output_mult),1)
        #         final_output_camu = torch.cat((final_output_cat, final_output_mult),1)
        #         final_output_cami = torch.cat((final_output_cat, final_output_minus),1)
        cop_final_output_camimu = torch.cat(
            (cop_final_output_cat, cop_final_output_minus,
             cop_final_output_mult), 1)
        cop_cos_pooled_outputs = torch.cosine_similarity(pooled_output2,
                                                         pooled_output3,
                                                         dim=1)

        #         ocp_final_output_cat = torch.cat((pooled_output4, pooled_output),1)
        #         ocp_final_output_minus = torch.abs(pooled_output4-pooled_output)
        #         ocp_final_output_mult = torch.mul(pooled_output4, pooled_output)
        #         final_output_mimu = torch.cat((final_output_minus, final_output_mult),1)
        #         final_output_camu = torch.cat((final_output_cat, final_output_mult),1)
        #         final_output_cami = torch.cat((final_output_cat, final_output_minus),1)
        #         ocp_final_output_camimu = torch.cat((ocp_final_output_cat, ocp_final_output_minus, ocp_final_output_mult),1)
        #         ocp_cos_pooled_outputs = torch.cosine_similarity(pooled_output4, pooled_output, dim=1)

        #         1
        #         torch.Size([hidden_size*2, 768])
        #         2
        #         torch.Size([hidden_size, 768])
        #         3
        #         torch.Size([hidden_size, 768])
        #         4
        #         torch.Size([hidden_size*2, 768])
        #         5
        #         torch.Size([hidden_size*3, 768])
        #         6
        #         torch.Size([hidden_size*3, 768])
        #         7
        #         torch.Size([hidden_size*4, 768])

        batch_size = list(pooled_output.size())[0]
        hidden_size = list(pooled_output.size())[1]

        final_output_all = torch.cat(
            (final_output_camimu, cos_pooled_outputs.unsqueeze(1)), 1)
        cop_final_output_all = torch.cat(
            (cop_final_output_camimu, cop_cos_pooled_outputs.unsqueeze(1)), 1)
        #         ocp_final_output_all = torch.cat((ocp_final_output_camimu, ocp_cos_pooled_outputs.unsqueeze(1)),1)
        #         ocop_final_output_all = torch.cat((ocop_final_output_camimu, ocop_cos_pooled_outputs.unsqueeze(1)),1)

        logits_ce = self.classifier(final_output_all)

        #         ocop_logits_ce = self.classifier(ocop_final_output_all)
        cop_logits_ce = self.classifier(cop_final_output_all)
        #         ocp_logits_ce = self.classifier(ocp_final_output_all)

        #         best_score = 0
        #         logits_grid = []
        #         for ori in (list(np.arange(0,2.5,0.5))+[10,100,1000]):
        #             for cop in (list(np.arange(0,2.5,0.5))+[10,100,1000]):
        #                 for ocp in (list(np.arange(0,2.5,0.5))+[10,100,1000]):
        #                     for ocop in (list(np.arange(0,2.5,0.5))+[10,100,1000]):
        #                         logits_grid.append((ori*logits_ce)-(cop*cop_logits_ce)-(ocp*ocp_logits_ce)+(ocop*ocop_logits_ce))

        ####   grid search end
        #         if input_ids4 and input_ids3:
        final_logits = (1 * logits_ce) - (1 * cop_logits_ce)
        #         elif input_ids3:
        #             final_logits = logits_ce-(0.33*cop_logits_ce)
        #         elif input_ids4:
        #             final_logits = logits_ce-(0.33*ocp_logits_ce)
        #         else:
        #             final_logits = logits_ce
        #         print('logits_ce:')
        #         print(logits_ce)

        #         logits_ori = self.classifier2(final_output_camimu)
        #         print('logits_ori:')
        #         print(logits_ori)

        #Calculate loss during training process
        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(final_logits.view(-1), labels.view(-1))
            else:
                loss_fct_ce = CrossEntropyLoss()
                loss_ce = loss_fct_ce(final_logits.view(-1, self.num_labels),
                                      labels.view(-1))
                #                 logger.info('loss_ce:')
                #                 logger.info(loss_ce)

                #                 loss_ori = loss_fct_ce(logits_ori.view(-1, self.num_labels), labels.view(-1))
                #                 print('loss_ori:')
                #                 print(loss_ori)
                loss_fct_cos = CosineEmbeddingLoss()
                loss_fct_tri = TripletLoss()

                #                 labels2[labels2==0] = -1
                #                 loss_cos = loss_fct_cos(pooled_output, pooled_output2, labels2)
                #                 labels2[labels2==-1] = 0
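                # Build triplet inputs: for examples with label 0, swap the persona and opposing-persona
                # vectors so that pooled_output_inter holds the positive and pooled_output3_inter2 the
                # negative for the claim anchor (pooled_output2) in the triplet loss below.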
                k = 0
                index = []
                for i in labels:
                    k = k + 1
                    if i == 0:
                        index.append(k)
                pooled_output_inter = pooled_output.clone().detach()
                pooled_output3_inter = pooled_output3.clone().detach()

                pooled_output_inter2 = pooled_output.clone().detach()
                pooled_output3_inter2 = pooled_output3.clone().detach()

                for l in index:
                    pooled_output_inter[l - 1], pooled_output3_inter[l - 1] = \
                        pooled_output3_inter[l - 1], pooled_output_inter[l - 1]

                for l in index:
                    pooled_output3_inter2[l - 1], pooled_output_inter2[l - 1] = \
                        pooled_output_inter2[l - 1], pooled_output3_inter2[l - 1]

                loss_tri = loss_fct_tri(pooled_output2, pooled_output_inter,
                                        pooled_output3_inter2)

                loss = loss_ce + loss_tri
                #                 logger.info('final loss:')
                #                 logger.info(loss)

                #             outputs = (loss,) + outputs
                #             outputs = (loss,) + logits_cos
                outputs = loss
                return outputs
        else:
            #Get predictions when doing evaluation
            return final_logits
Example #14
    def forward(self,
                input_ids,
                token_type_ids=None,
                attention_mask=None,
                masked_lm_labels=None,
                **kwargs):
        sequence_output, _ = self._bert_model.bert(
            input_ids,
            token_type_ids,
            attention_mask,
            output_all_encoded_layers=False)
        prediction_scores = self._bert_model.cls(sequence_output)

        if masked_lm_labels is not None:
            loss_fct = CrossEntropyLoss(ignore_index=-1, reduction='sum')
            masked_lm_loss = loss_fct(
                prediction_scores.view(-1, self.bert_config.vocab_size),
                masked_lm_labels.view(-1))

            ## YS
            loss = masked_lm_loss

            ## YS
            if 'input_ref_ids' in kwargs:
                input_ref_ids = kwargs['input_ref_ids']
                sequence_ref_output, _ = self._bert_model.bert(
                    input_ref_ids,
                    token_type_ids,
                    attention_mask,
                    output_all_encoded_layers=False)

                ## Similarity loss between [CLS] tokens. Cosine similarity with in-batch negative samples
                sim_loss_fct = CosineEmbeddingLoss(margin=0, reduction='mean')

                _bs, _seq_len, _bert_dim = sequence_output.size()

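                # Form all B x B pairs between the original and reference [CLS] vectors; targets are +1 on
                # the diagonal (matching original/reference pair) and -1 elsewhere (in-batch negatives).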
                _t_output = sequence_output[:, 0, :].unsqueeze(0).expand(
                    _bs, _bs, _bert_dim).reshape(_bs * _bs, _bert_dim)
                _t_ref_output = sequence_ref_output[:, 0, :].unsqueeze(1).expand(
                    _bs, _bs, _bert_dim).reshape(_bs * _bs, _bert_dim)
                _y = torch.tensor(np.eye(_bs) * 2 - np.ones((_bs, _bs)),
                                  dtype=sequence_output.dtype,
                                  device=sequence_output.device).view(_bs *
                                                                      _bs)
                sim_loss = sim_loss_fct(_t_output, _t_ref_output, _y)

                loss += sim_loss

            sample_size = masked_lm_labels.ne(-1).sum().item()
            logging_output = {
                'sample_size': sample_size,
                'mlm_loss': masked_lm_loss.item(),
                'loss': loss.item()
            }

            if 'input_ref_ids' in kwargs:
                logging_output['sim_loss'] = sim_loss.item()

            return loss, logging_output
        else:
            return prediction_scores
Example #15
    def forward(self, output1, output2, is_diff):
        target = (1. - 2. * is_diff).float()  # map [0,1] -> [1, -1]
        cos = CosineEmbeddingLoss(margin=self.margin, reduction=self.reduction)

        return cos(output1, output2, target)