Example #1
    def forward(self, input_ids=None, attention_mask=None, labels=None):

        # GPT-2
        outputs = self.gpt2_layer(input_ids, attention_mask=attention_mask)

        # outputs[0] = last hidden state: (batch 6, tokens 512, hidden 768)
        # outputs[2] = hidden states of all 13 layers (embeddings + 12 blocks),
        #              each of shape (batch 6, tokens 512, hidden 768)
        sequence_output = outputs[2]  # tuple of per-layer hidden states

        # average the first num_layer_sum hidden states; note that despite the
        # variable name, sequence_output[:4] is the embedding output plus the
        # first three layers (the last four would be sequence_output[-4:])
        num_layer_sum = 4
        summed_last_4_layers = torch.stack(
            sequence_output[:num_layer_sum]).mean(0)

        summed_last_4_layers = self.dropout(
            summed_last_4_layers)  # newly added dropout

        # lstm with masks (same as attention masks)
        packed_input, perm_idx, seq_lengths = get_packed_padded_output(
            summed_last_4_layers, input_ids, attention_mask, self.tokenizer)
        packed_output, (ht, ct) = self.lstm_layer(packed_input)

        # Unpack and reorder the output
        output, input_sizes = pad_packed_sequence(packed_output,
                                                  batch_first=True)
        _, unperm_idx = perm_idx.sort(0)
        lstm_output = output[
            unperm_idx]  # lstm_output.shape = shorter than the padded torch.Size([6, 388, 512])
        seq_lengths_ordered = seq_lengths[unperm_idx]

        # truncate the labels to the (shorter) packed sequence length
        labels = labels[:, :lstm_output.shape[1]]

        # mask out unimportant tokens before the classifier:
        # 50256 is GPT-2's <|endoftext|> id (used here as both pad and eos),
        # and label id 100 marks positions to ignore
        mask = ((input_ids[:, :lstm_output.shape[1]] != 50256)
                & (labels != 100))

        # always keep the first timestep of every sequence unmasked
        mask[:, 0] = True

        mask_expanded = mask.unsqueeze(-1).expand(lstm_output.size())
        lstm_output *= mask_expanded.float()
        labels *= mask.long()

        # linear projection to tag space (CRF emission scores)
        probablities = self.hidden2tag(lstm_output)

        # CRF loss over the emissions
        loss = self.crf_layer(probablities, labels, reduction='token_mean')

        emissions_ = self.crf_layer.decode(probablities)
        emissions = [item for sublist in emissions_
                     for item in sublist]  # flatten the nested list of emissions

        return loss, torch.Tensor(emissions_), labels, mask
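
Note: get_packed_padded_output is called in every snippet here but never defined. Below is a minimal sketch of what such a helper could look like, assuming it sorts the batch by true (unpadded) length and packs the embeddings for the LSTM; the input_ids and tokenizer arguments are accepted only so the signature matches the call sites and are otherwise unused in this sketch.

import torch
from torch.nn.utils.rnn import pack_padded_sequence


def get_packed_padded_output(embeddings, input_ids, attention_mask, tokenizer):
    """Sketch of a packing helper consistent with the callers above."""
    # true (unpadded) length of every sequence in the batch
    seq_lengths = attention_mask.sum(dim=1).long()

    # sort the batch by descending length, as pack_padded_sequence expects
    seq_lengths, perm_idx = seq_lengths.sort(0, descending=True)
    embeddings = embeddings[perm_idx]

    packed_input = pack_padded_sequence(embeddings, seq_lengths.cpu(),
                                        batch_first=True)
    return packed_input, perm_idx, seq_lengths

The callers recover the original order with perm_idx.sort(0), which is consistent with returning the sort permutation and the sorted lengths here.
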
Example #2
    def forward(self, input_ids=None, attention_mask=None, labels=None):

        # BERT
        outputs = self.bert_layer(input_ids, attention_mask=attention_mask)

        # outputs[0] = last hidden state: (batch 6, tokens 512, hidden 768)
        # outputs[1] = pooled [CLS] representation: (batch 6, hidden 768)
        # outputs[2] = hidden states of all 13 layers (embeddings + 12 blocks),
        #              each of shape (batch 6, tokens 512, hidden 768)
        sequence_output = outputs[2]  # tuple of per-layer hidden states

        num_layer_sum = 4
        summed_last_4_layers = torch.stack(
            sequence_output[:num_layer_sum]).mean(0)

        # lstm with masks (same as attention masks)
        packed_input, perm_idx, seq_lengths = get_packed_padded_output(
            summed_last_4_layers, input_ids, attention_mask, self.tokenizer)
        packed_output, (ht, ct) = self.lstm_layer(packed_input)

        # Unpack and reorder the output
        output, input_sizes = pad_packed_sequence(packed_output,
                                                  batch_first=True)
        _, unperm_idx = perm_idx.sort(0)
        lstm_output = output[
            unperm_idx]  # lstm_output.shape = shorter than the padded torch.Size([6, 388, 512])
        seq_lengths_ordered = seq_lengths[unperm_idx]

        # truncate the labels to the (shorter) packed sequence length
        labels = labels[:, :lstm_output.shape[1]]

        # mask out padding, [SEP] and ignored-label (100) tokens before the classifier
        mask = (
            (input_ids[:, :lstm_output.shape[1]] !=
             self.tokenizer.pad_token_id)
            & (input_ids[:, :lstm_output.shape[1]] !=
               self.tokenizer.convert_tokens_to_ids(self.tokenizer.sep_token))
            & (labels != 100))

        mask_expanded = mask.unsqueeze(-1).expand(lstm_output.size())
        lstm_output *= mask_expanded.float()
        labels *= mask.long()

        # linear layer for the fine-grained tags
        probablity = F.relu(self.hidden2tag_fine(lstm_output))
        max_probs = torch.max(probablity, dim=2)
        logits = max_probs.indices.flatten()  # argmax over tags, i.e. predicted tag ids

        loss = self.loss_fct(probablity.view(-1, clf_P_fine_num_labels),
                             labels.flatten())

        return loss, logits.to('cpu').numpy(), labels, mask
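
For context, here is a hypothetical module skeleton that would make the forward() of Example #2 run end to end (together with the packing helper sketched after Example #1). The checkpoint name, hidden sizes and the clf_P_fine_num_labels value are assumptions for illustration, not taken from the original code.

import torch.nn as nn
from transformers import BertModel, BertTokenizer

clf_P_fine_num_labels = 5  # assumption: number of fine-grained tags


class BertLSTMFineTagger(nn.Module):
    """Hypothetical skeleton consistent with the forward() of Example #2."""

    def __init__(self, bert_dim=768, lstm_hidden=256):
        super().__init__()
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        # output_hidden_states=True makes outputs[2] the per-layer hidden states
        self.bert_layer = BertModel.from_pretrained('bert-base-uncased',
                                                    output_hidden_states=True)
        # bidirectional LSTM -> output size 2 * lstm_hidden = 512
        self.lstm_layer = nn.LSTM(bert_dim, lstm_hidden,
                                  batch_first=True, bidirectional=True)
        self.hidden2tag_fine = nn.Linear(2 * lstm_hidden, clf_P_fine_num_labels)
        self.loss_fct = nn.CrossEntropyLoss()
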
Example #3
    def forward(self,
                args,
                input_ids=None,
                attention_mask=None,
                P_labels=None,
                P_f_labels=None):

        # BERT
        outputs = self.bert_layer(input_ids, attention_mask=attention_mask)

        # outputs[0] = last hidden state: (batch 6, tokens 512, hidden 768)
        # outputs[1] = pooled [CLS] representation: (batch 6, hidden 768)
        # outputs[2] = hidden states of all 13 layers (embeddings + 12 blocks),
        #              each of shape (batch 6, tokens 512, hidden 768)
        sequence_output = outputs[2]  # tuple of per-layer hidden states

        num_layer_sum = 4
        summed_last_4_layers = torch.stack(
            sequence_output[:num_layer_sum]).mean(0)

        # lstm with masks (same as attention masks)
        packed_input, perm_idx, seq_lengths = get_packed_padded_output(
            summed_last_4_layers, input_ids, attention_mask, self.tokenizer)
        packed_output, (ht, ct) = self.lstm_layer(packed_input)

        # Unpack and reorder the output
        output, input_sizes = pad_packed_sequence(packed_output,
                                                  batch_first=True)
        _, unperm_idx = perm_idx.sort(0)
        lstm_output = output[
            unperm_idx]  # lstm_output.shape = shorter than the padded torch.Size([6, 388, 512])
        seq_lengths_ordered = seq_lengths[unperm_idx]

        # truncate the labels to the (shorter) packed sequence length
        P_labels = P_labels[:, :lstm_output.shape[1]]
        P_f_labels = P_f_labels[:, :lstm_output.shape[1]]

        # Truncate the attention mask to the packed length; note that it is built
        # here but not passed to the attention calls below (key_padding_mask=None)
        attention_mask_ = attention_mask[:, :lstm_output.shape[1]]
        attention_mask_ = attention_mask_.bool()

        # Apply self-attention here
        attention_applied, attention_weights = self.self_attention(
            lstm_output,
            lstm_output,
            lstm_output,
            key_padding_mask=None,
            need_weights=True,
            attn_mask=None)

        # Second self-attention layer (used for the fine-grained branch)
        multi_attention_applied, multi_attention_weights = self.self_attention2(
            lstm_output,
            lstm_output,
            lstm_output,
            key_padding_mask=None,
            need_weights=True,
            attn_mask=None)

        # mask the unimportant tokens after attention has been applied
        mask = (
            (input_ids[:, :attention_applied.shape[1]] !=
             self.tokenizer.pad_token_id)
            & (input_ids[:, :attention_applied.shape[1]] !=
               self.tokenizer.convert_tokens_to_ids(self.tokenizer.sep_token))
            & (P_labels != 100))

        mask_expanded = mask.unsqueeze(-1).expand(attention_applied.size())
        attention_applied *= mask_expanded.float()
        P_labels *= mask.long()
        P_f_labels *= mask.long()

        # coarse-grained tag scores
        probablity = F.relu(self.hidden2tag(attention_applied))
        max_probs = torch.max(probablity, dim=2)
        logits = max_probs.indices  # argmax over tags = predicted coarse tag ids

        # fine-grained tag scores (CRF emissions)
        probablity_fine = F.relu(self.hidden2tag_fine(multi_attention_applied))
        max_probs_fine = torch.max(probablity_fine, dim=2)
        logits_fine = max_probs_fine.indices  # predicted fine tag ids (unused in the return)

        # calculate coarse loss
        loss_coarse = self.loss_fct_coarse(
            probablity.view(-1, clf_P_num_labels), P_labels.flatten())

        # CRF emissions (fine)
        loss_fine = self.crf_layer_fine(probablity_fine,
                                        P_f_labels,
                                        mask=mask,
                                        reduction='token_mean')

        emissions_fine = self.crf_layer_fine.decode(probablity_fine)
        emissions_f = [item for sublist in emissions_fine
                       for item in sublist]  # flatten the nested list of emissions

        # loss_coarse is a cross-entropy (>= 0) and loss_fine a CRF log-likelihood (<= 0),
        # so abs() turns both into positive terms before summing
        loss = abs(loss_coarse) + abs(loss_fine)

        return loss_coarse, logits, P_labels, loss_fine, torch.Tensor(
            emissions_fine), P_f_labels, mask, loss
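
The self_attention / self_attention2 calls in Examples #3 and #5 match the query, key, value, key_padding_mask, need_weights and attn_mask arguments of torch.nn.MultiheadAttention. A sketch of how those layers could be constructed is below; the embedding size and head counts are assumptions. Note that the (batch, seq_len, hidden) LSTM output only lines up if the layer is created with batch_first=True (available from PyTorch 1.9); otherwise the input would have to be permuted to (seq_len, batch, hidden) first.

import torch
import torch.nn as nn

hidden_dim = 512  # assumption: matches the bidirectional LSTM output size
self_attention = nn.MultiheadAttention(hidden_dim, num_heads=1,
                                       batch_first=True)
self_attention2 = nn.MultiheadAttention(hidden_dim, num_heads=8,
                                        batch_first=True)

lstm_output = torch.randn(6, 388, hidden_dim)  # (batch, seq_len, hidden)
attention_applied, attention_weights = self_attention(
    lstm_output, lstm_output, lstm_output,
    key_padding_mask=None, need_weights=True, attn_mask=None)
print(attention_applied.shape)  # torch.Size([6, 388, 512])
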
Example #4
    def forward(self, args, input_ids=None, attention_mask=None, P_labels=None, P_f_labels=None):

        # GPT-2
        outputs = self.gpt2_layer(input_ids, attention_mask=attention_mask)

        # outputs[0] = last hidden state: (batch 6, tokens 512, hidden 768)
        # outputs[2] = hidden states of all 13 layers (embeddings + 12 blocks),
        #              each of shape (batch 6, tokens 512, hidden 768)
        sequence_output = outputs[2]  # tuple of per-layer hidden states

        num_layer_sum = 4
        summed_last_4_layers = torch.stack(sequence_output[:num_layer_sum]).mean(0)

        # summed_last_4_layers = self.dropout(summed_last_4_layers)

        # lstm with masks (same as attention masks)
        packed_input, perm_idx, seq_lengths = get_packed_padded_output(summed_last_4_layers, input_ids, attention_mask, self.tokenizer)
        packed_output, (ht, ct) = self.lstm_layer(packed_input)

        # Unpack and reorder the output
        output, input_sizes = pad_packed_sequence(packed_output, batch_first=True)
        _, unperm_idx = perm_idx.sort(0)
        lstm_output = output[unperm_idx] # lstm_output.shape = shorter than the padded torch.Size([6, 388, 512])
        seq_lengths_ordered = seq_lengths[unperm_idx]

        # truncate the labels to the (shorter) packed sequence length
        P_labels = P_labels[:, :lstm_output.shape[1]]
        P_f_labels = P_f_labels[:, :lstm_output.shape[1]]

        # mask out unimportant tokens before the classifiers:
        # 50256 is GPT-2's <|endoftext|> id (used here as both pad and eos),
        # and label id 100 marks positions to ignore
        mask = (
            (input_ids[:, :lstm_output.shape[1]] != 50256)
            & (P_labels != 100)
        )

        mask_expanded = mask.unsqueeze(-1).expand(lstm_output.size())
        lstm_output *= mask_expanded.float()
        P_labels *= mask.long()
        P_f_labels *= mask.long()

        # coarse-grained tag scores (CRF emissions)
        probablity = F.relu(self.hidden2tag(lstm_output))
        # fine-grained tag scores (CRF emissions)
        probablity_fine = F.relu(self.hidden2tag_fine(lstm_output))

        # CRF loss (coarse)
        loss_coarse = self.crf_layer(probablity, P_labels, reduction='token_mean')
        # print('Coarse-grained loss: ', loss_coarse)

        emissions_coarse = self.crf_layer.decode(probablity)
        emissions_c = [item for sublist in emissions_coarse for item in sublist]  # flatten the nested list of emissions

        # CRF loss (fine)
        loss_fine = self.crf_layer_fine(probablity_fine, P_f_labels, reduction='token_mean')
        # print('Fine-grained loss: ', loss_fine)

        emissions_fine = self.crf_layer_fine.decode(probablity_fine)
        emissions_f = [item for sublist in emissions_fine for item in sublist]  # flatten the nested list of emissions

        # both CRF terms are log-likelihoods (<= 0), so abs() turns them into positive losses
        loss = abs(loss_coarse) + abs(loss_fine)

        return loss_coarse, torch.Tensor(emissions_coarse), P_labels, loss_fine, torch.Tensor(emissions_fine), P_f_labels, mask, loss
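
The crf_layer / crf_layer_fine calls above match the interface of the pytorch-crf package (torchcrf.CRF): its forward() returns a log-likelihood, which is at most zero, and decode() returns the Viterbi paths as nested lists. That is presumably why the code takes abs() of the CRF terms before summing them into one loss. A small self-contained sketch under that assumption, with made-up tag counts and shapes:

import torch
from torchcrf import CRF  # assumption: pip install pytorch-crf

num_coarse_tags = 5  # assumption for illustration
crf_layer = CRF(num_coarse_tags, batch_first=True)

emissions = torch.randn(6, 388, num_coarse_tags)  # (batch, seq_len, num_tags)
tags = torch.zeros(6, 388, dtype=torch.long)

# forward() returns the (token-averaged) log-likelihood, a value <= 0,
# so negating it (or taking abs, as above) yields a positive training loss
log_likelihood = crf_layer(emissions, tags, reduction='token_mean')
loss = -log_likelihood

best_paths = crf_layer.decode(emissions)  # nested list: one tag sequence per example
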
Example #5
    def forward(self, input_ids=None, attention_mask=None, labels=None):
    # def forward(self, args, input_ids=None, attention_mask=None, labels=None):

        # BERT
        outputs = self.bert_layer(input_ids, attention_mask=attention_mask)

        # outputs[0] = last hidden state: (batch 6, tokens 512, hidden 768)
        # outputs[1] = pooled [CLS] representation: (batch 6, hidden 768)
        # outputs[2] = hidden states of all 13 layers (embeddings + 12 blocks),
        #              each of shape (batch 6, tokens 512, hidden 768)
        sequence_output = outputs[2]  # tuple of per-layer hidden states

        num_layer_sum = 4
        summed_last_4_layers = torch.stack(sequence_output[:num_layer_sum]).mean(0)

        # lstm with masks (same as attention masks)
        packed_input, perm_idx, seq_lengths = get_packed_padded_output(summed_last_4_layers, input_ids, attention_mask, self.tokenizer)
        packed_output, (ht, ct) = self.lstm_layer(packed_input)

        # Unpack and reorder the output
        output, input_sizes = pad_packed_sequence(packed_output, batch_first=True)
        _, unperm_idx = perm_idx.sort(0)
        lstm_output = output[unperm_idx] # lstm_output.shape = shorter than the padded torch.Size([6, 388, 512])
        seq_lengths_ordered = seq_lengths[unperm_idx]

        # truncate the labels to the (shorter) packed sequence length
        labels = labels[:, :lstm_output.shape[1]]

        # Apply the initialized dropout layer
        # lstm_output = self.drop_layer(lstm_output)

        # Truncate the attention mask to the packed length; note that it is built
        # here but not passed to the attention call below (key_padding_mask=None)
        attention_mask_ = attention_mask[:, :lstm_output.shape[1]]
        attention_mask_ = attention_mask_.bool()

        # Apply self-attention over the LSTM outputs
        attention_applied, attention_weights = self.self_attention(
            lstm_output, lstm_output, lstm_output,
            key_padding_mask=None, need_weights=True, attn_mask=None)

        # mask the unimportant tokens after attention has been applied
        mask = (
            (input_ids[:, :attention_applied.shape[1]] != self.tokenizer.pad_token_id)
            & (input_ids[:, :attention_applied.shape[1]] != self.tokenizer.convert_tokens_to_ids(self.tokenizer.sep_token))
            & (labels != 100)
        )

        mask_expanded = mask.unsqueeze(-1).expand(attention_applied.size())
        attention_applied *= mask_expanded.float()
        labels *= mask.long()

        # linear projection to tag space (CRF emission scores)
        probablity = F.relu(self.hidden2tag(attention_applied))

        # CRF loss; note that when a mask is passed, the CRF requires the first
        # timestep of every sequence to be unmasked
        loss = self.crf_layer(probablity, labels, mask=mask, reduction='token_mean')

        emissions_ = self.crf_layer.decode(probablity)
        emissions = [item for sublist in emissions_ for item in sublist]  # flatten the nested list of emissions

        # # mask labels here according to masks
        # labels_masked = labels[mask]

        return loss, torch.Tensor(emissions_), labels, mask
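
In the forward() above (and in Example #3), attention_mask_ is truncated and cast to bool but key_padding_mask=None is then passed, so padding is never actually masked inside the attention. Below is a sketch of how that mask could be fed to nn.MultiheadAttention, assuming a batch_first layer as in the earlier sketch; key_padding_mask marks positions to ignore, so a HuggingFace-style mask (1 = real token) has to be inverted first.

import torch
import torch.nn as nn

hidden_dim = 512  # assumption
self_attention = nn.MultiheadAttention(hidden_dim, num_heads=1,
                                       batch_first=True)

lstm_output = torch.randn(6, 388, hidden_dim)           # (batch, seq_len, hidden)
attention_mask_ = torch.ones(6, 388, dtype=torch.bool)  # True = real token

attention_applied, attention_weights = self_attention(
    lstm_output, lstm_output, lstm_output,
    key_padding_mask=~attention_mask_,  # True = padding position to ignore
    need_weights=True)
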
    def forward(self,
                args,
                input_ids=None,
                attention_mask=None,
                P_labels=None,
                P_f_labels=None):

        # GPT-2
        outputs = self.gpt2_layer(input_ids, attention_mask=attention_mask)

        # outputs[0] = last hidden state: (batch 6, tokens 512, hidden 768)
        # outputs[2] = hidden states of all 13 layers (embeddings + 12 blocks),
        #              each of shape (batch 6, tokens 512, hidden 768)
        sequence_output = outputs[2]  # tuple of per-layer hidden states

        num_layer_sum = 4
        summed_last_4_layers = torch.stack(
            sequence_output[:num_layer_sum]).mean(0)

        # lstm with masks (same as attention masks)
        packed_input, perm_idx, seq_lengths = get_packed_padded_output(
            summed_last_4_layers, input_ids, attention_mask, self.tokenizer)
        packed_output, (ht, ct) = self.lstm_layer(packed_input)

        # Unpack and reorder the output
        output, input_sizes = pad_packed_sequence(packed_output,
                                                  batch_first=True)
        _, unperm_idx = perm_idx.sort(0)
        lstm_output = output[
            unperm_idx]  # lstm_output.shape = shorter than the padded torch.Size([6, 388, 512])
        seq_lengths_ordered = seq_lengths[unperm_idx]

        # truncate the labels to the (shorter) packed sequence length
        P_labels = P_labels[:, :lstm_output.shape[1]]
        P_f_labels = P_f_labels[:, :lstm_output.shape[1]]

        # Truncate the attention mask to the packed length; note that it is built
        # here but not passed to the attention call below (key_padding_mask=None)
        attention_mask_ = attention_mask[:, :lstm_output.shape[1]]
        attention_mask_ = attention_mask_.bool()

        # Self-attention over the LSTM outputs (the commented-out lines below are
        # earlier variants that passed the padding mask explicitly)
        # attention_applied, attention_weights = self.attn(lstm_output, attention_mask.float())
        # attention_applied, attention_weights = self.self_attention( lstm_output, lstm_output, lstm_output, key_padding_mask=attention_mask_.permute(1, 0), need_weights=True, attn_mask=None )
        attention_applied, attention_weights = self.self_attention(
            lstm_output,
            lstm_output,
            lstm_output,
            key_padding_mask=None,
            need_weights=True,
            attn_mask=None)

        # mask the unimportant tokens after attention is applied:
        # 50256 is GPT-2's <|endoftext|> id (used here as both pad and eos),
        # and label id 100 marks positions to ignore
        mask = ((input_ids[:, :lstm_output.shape[1]] != 50256)
                & (P_labels != 100))

        # the CRF requires the mask to be on at the first timestep of every sequence
        mask[:, 0] = True

        mask_expanded = mask.unsqueeze(-1).expand(attention_applied.size())
        attention_applied *= mask_expanded.float()
        P_labels *= mask.long()
        P_f_labels *= mask.long()

        # coarse-grained tag scores (CRF emissions)
        probablity = F.relu(self.hidden2tag(attention_applied))
        # fine-grained tag scores (CRF emissions)
        probablity_fine = F.relu(self.hidden2tag_fine(attention_applied))

        # CRF loss (coarse)
        #loss_coarse = self.crf_layer(probablity, reduction='token_mean')
        loss_coarse = self.crf_layer(probablity,
                                     P_labels,
                                     mask=mask,
                                     reduction='token_mean')

        emissions_coarse = self.crf_layer.decode(probablity)
        emissions_c = [
            item for sublist in emissions_coarse for item in sublist
        ]  # flatten the nested list of emissions

        # CRF loss (fine)
        #loss_fine = self.crf_layer_fine(probablity_fine, reduction='token_mean')
        loss_fine = self.crf_layer_fine(probablity_fine,
                                        P_f_labels,
                                        mask=mask,
                                        reduction='token_mean')

        emissions_fine = self.crf_layer_fine.decode(probablity_fine)
        emissions_f = [item for sublist in emissions_fine
                       for item in sublist]  # flatten the nested list of emissions

        # both CRF terms are log-likelihoods (<= 0), so abs() turns them into positive losses
        loss = abs(loss_coarse) + abs(loss_fine)

        return loss_coarse, torch.Tensor(
            emissions_coarse), P_labels, loss_fine, torch.Tensor(
                emissions_fine), P_f_labels, mask, loss
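
The lines that force the first timestep of the mask to True (here and in Example #1) line up with a constraint of torchcrf: when a mask is passed, the first timestep of every sequence must be unmasked, otherwise the CRF rejects it. A minimal sketch of that behaviour, with assumed tag counts and shapes:

import torch
from torchcrf import CRF  # assumption: pip install pytorch-crf

crf = CRF(num_tags=5, batch_first=True)
emissions = torch.randn(6, 388, 5)
tags = torch.zeros(6, 388, dtype=torch.long)

mask = torch.zeros(6, 388, dtype=torch.bool)
mask[:, :100] = True  # pretend only the first 100 tokens of each sequence are real
mask[:, 0] = True     # the first timestep must always be on for torchcrf

log_likelihood = crf(emissions, tags, mask=mask, reduction='token_mean')
best_paths = crf.decode(emissions, mask=mask)
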