import torch
from torch.nn import Dropout, LSTM, Linear
from torchcrf import CRF  # CRF layer from the pytorch-crf package
from transformers import BertModel, BertPreTrainedModel
from transformers.modeling_outputs import TokenClassifierOutput


class BertBiLSTMCRFSLModel(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.return_dict = config.return_dict if hasattr(
            config, "return_dict") else False
        self.bert = BertModel(config)
        self.dropout = Dropout(config.hidden_dropout_prob)
        self.lstm = LSTM(input_size=config.hidden_size,
                         hidden_size=config.hidden_size,
                         batch_first=True,
                         bidirectional=True)
        self.classifier = Linear(config.hidden_size * 2, config.num_labels)
        self.crf = CRF(num_tags=config.num_labels, batch_first=True)
        self.init_weights()

    def forward(self,
                input_ids=None,
                attention_mask=None,
                token_type_ids=None,
                labels=None,
                return_dict=None):

        self.lstm.flatten_parameters()

        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            return_dict=self.return_dict)

        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        lstm_output = self.lstm(sequence_output)
        lstm_output = lstm_output[0]
        logits = self.classifier(lstm_output)

        loss = None
        if labels is not None:
            # [TBD] remap label_id -100 ([CLS], [SEP], [PAD]) to label_id 32, the "O" tag;
            # as written, these positions still contribute to the CRF loss, which needs improvement.
            active_idx = labels != -100
            active_labels = torch.where(active_idx, labels,
                                        torch.tensor(0).type_as(labels))
            loss = self.crf(emissions=logits,
                            tags=active_labels,
                            mask=attention_mask.type(torch.uint8))
            loss = -1 * loss  # the CRF returns a log-likelihood; negate it to get a loss to minimize

        if self.return_dict:
            return TokenClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )
        else:
            output = (logits, ) + outputs[2:]
            return ((loss, ) + output) if loss is not None else output
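
A minimal usage sketch for the model above, assuming the pytorch-crf package and a standard bert-base-cased checkpoint; the label count and the dummy inputs are placeholders. Since forward() only returns emission scores, Viterbi decoding goes through the underlying CRF layer.

import torch
from transformers import BertConfig

config = BertConfig.from_pretrained("bert-base-cased", num_labels=5)
model = BertBiLSTMCRFSLModel.from_pretrained("bert-base-cased", config=config)
model.eval()

input_ids = torch.randint(0, config.vocab_size, (2, 16))        # dummy token ids
attention_mask = torch.ones(2, 16, dtype=torch.long)

with torch.no_grad():
    logits = model(input_ids=input_ids, attention_mask=attention_mask)[0]
    # torchcrf's CRF.decode returns one list of predicted tag ids per sequence
    best_paths = model.crf.decode(logits, mask=attention_mask.type(torch.uint8))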
Example #2
import torch
from torch.nn import LSTM, Sequential

# ConvBlock is a project-specific module assumed to be defined elsewhere in the codebase.


class Encoder(torch.nn.Module):
    """Vanilla Tacotron 2 encoder.
    
    Details:
        stack of 3 convolutional layers (5 × 1) with batch normalization, ReLU and dropout
        output is passed into a Bi-LSTM layer

    Arguments:
        input_dim -- size of the input (typically a character embedding)
        output_dim -- number of channels of the convolutional blocks and last Bi-LSTM
        num_blocks -- number of the convolutional blocks (at least one)
        kernel_size -- kernel size of the encoder's convolutional blocks
        dropout -- dropout rate to be applied after each convolutional block
    Keyword arguments:
        generated -- just for convenience
    """
    def __init__(self,
                 input_dim,
                 output_dim,
                 num_blocks,
                 kernel_size,
                 dropout,
                 generated=False):
        super(Encoder, self).__init__()
        assert num_blocks > 0, (
            'There must be at least one convolutional block in the encoder.')
        assert output_dim % 2 == 0, (
            'Bidirectional LSTM output dimension must be divisible by 2.')
        convs = [ConvBlock(input_dim, output_dim, kernel_size, dropout, 'relu')] + \
                [ConvBlock(output_dim, output_dim, kernel_size, dropout, 'relu') for _ in range(num_blocks - 1)]
        self._convs = Sequential(*convs)
        self._lstm = LSTM(output_dim,
                          output_dim // 2,
                          batch_first=True,
                          bidirectional=True)

    def forward(self, x, x_lengths, x_langs=None):
        # x_langs argument is there just for convenience
        x = x.transpose(1, 2)
        x = self._convs(x)
        x = x.transpose(1, 2)
        ml = x.size(1)
        x = torch.nn.utils.rnn.pack_padded_sequence(x,
                                                    x_lengths,
                                                    batch_first=True)
        self._lstm.flatten_parameters()
        x, _ = self._lstm(x)
        x, _ = torch.nn.utils.rnn.pad_packed_sequence(x,
                                                      batch_first=True,
                                                      total_length=ml)
        return x
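
The Encoder relies on the standard pack/pad round-trip to run padded batches through the Bi-LSTM. A self-contained sketch of that pattern (with made-up dimensions and lengths already sorted in decreasing order) is shown below, including total_length to restore the original padding.

import torch
from torch.nn import LSTM
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

lstm = LSTM(input_size=8, hidden_size=4, batch_first=True, bidirectional=True)

x = torch.randn(3, 10, 8)            # (batch, max_len, features), already padded
lengths = torch.tensor([10, 7, 5])   # true lengths, sorted in decreasing order

packed = pack_padded_sequence(x, lengths, batch_first=True)
out, _ = lstm(packed)
out, _ = pad_packed_sequence(out, batch_first=True, total_length=x.size(1))
print(out.shape)                     # torch.Size([3, 10, 8]) -- 4 hidden units * 2 directions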
Example #3
import torch
from torch import nn
from torch.nn import LSTM
from transformers import DistilBertConfig, DistilBertModel, DistilBertPreTrainedModel

# DistilBertPooler is a project-specific module assumed to be defined elsewhere; the
# config is also expected to carry a custom `bert_batch_size` attribute.


class DocumentDistilBertLSTM(DistilBertPreTrainedModel):
    """
    Pools DistilBERT outputs over the sequences of a document and feeds them through an LSTM.
    """
    def __init__(self, bert_model_config: DistilBertConfig):
        super(DocumentDistilBertLSTM, self).__init__(bert_model_config)
        self.distilbert = DistilBertModel(bert_model_config)
        self.pooler = DistilBertPooler(bert_model_config)
        self.bert_batch_size = self.distilbert.config.bert_batch_size
        self.dropout = nn.Dropout(p=bert_model_config.dropout)
        self.lstm = LSTM(
            bert_model_config.hidden_size,
            bert_model_config.hidden_size,
        )
        self.classifier = nn.Sequential(
            nn.Dropout(p=bert_model_config.dropout),
            nn.Linear(bert_model_config.hidden_size,
                      bert_model_config.num_labels), nn.Tanh())
        self.init_weights()

    # document_batch stacks (input_ids, token_type_ids, attention_mask) along dim 2
    def forward(self,
                document_batch: torch.Tensor,
                document_sequence_lengths: list,
                device='cuda'):

        # collects the pooled DistilBERT output of every sequence in every document;
        # shape: (batch_size i.e. number of documents, num_sequences, bert_hidden_size)
        distilbert_output = torch.zeros(
            size=(document_batch.shape[0],
                  min(document_batch.shape[1], self.bert_batch_size),
                  self.distilbert.config.hidden_size),
            dtype=torch.float,
            device=device)

        # only the first bert_batch_size sequences of each document are passed through
        # DistilBERT, so the tail of longer documents may be cut off

        for doc_id in range(document_batch.shape[0]):

            hidden_states = self.distilbert(
                input_ids=document_batch[doc_id][:self.bert_batch_size, 0],
                attention_mask=document_batch[doc_id][:self.bert_batch_size,
                                                      2])[0]
            #Output of distilbert is a tuple of length 1. First element (hidden_states) is of shape:
            #( num_sequences(i.e. nr of sequences per document), nr_of_tokens(512) (i.e. nr of tokens per sequence), bert_hidden_size )

            pooled_output = self.pooler(
                hidden_states
            )  # (num_sequences (i.e. nr of sequences per document), bert_hidden_size)

            distilbert_output[doc_id][:self.bert_batch_size] = self.dropout(
                pooled_output
            )  #( #batch_size(i.e. number of documents) ,num_sequences (i.e. nr of sequences per document), bert_hidden_size)

        #lstm expects a ( num_sequences, batch_size (i.e. number of documents) , bert_hidden_size )
        self.lstm.flatten_parameters()
        output, (_, _) = self.lstm(distilbert_output.permute(1, 0, 2))

        last_layer = output[-1]  # LSTM output at the last time step (last sequence of each document)

        prediction = self.classifier(last_layer)
        assert prediction.shape[0] == document_batch.shape[0]
        return prediction

    def freeze_bert_encoder(self):
        for param in self.distilbert.parameters():
            param.requires_grad = False

    def unfreeze_bert_encoder(self):
        for param in self.distilbert.parameters():
            param.requires_grad = True

    def unfreeze_bert_encoder_last_layers(self):
        for name, param in self.distilbert.named_parameters():
            if "layer.5" in name or "pooler" in name:
                param.requires_grad = True

    def unfreeze_bert_encoder_pooler_layer(self):
        for name, param in self.distilbert.named_parameters():
            if "pooler" in name:
                param.requires_grad = True
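
A small usage sketch of the freezing helpers above; the num_labels, bert_batch_size value, optimizer and learning rate are placeholders, and the config is patched with the custom attribute the model expects.

import torch
from transformers import DistilBertConfig

# hypothetical config: DistilBertConfig patched with the custom bert_batch_size attribute
bert_model_config = DistilBertConfig(num_labels=4)
bert_model_config.bert_batch_size = 8

model = DocumentDistilBertLSTM(bert_model_config)
model.freeze_bert_encoder()          # train only the LSTM and the classifier head

optimizer = torch.optim.Adam(
    filter(lambda p: p.requires_grad, model.parameters()), lr=1e-4)

# later in training, optionally unfreeze the top transformer block as well
model.unfreeze_bert_encoder_last_layers()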
Example #4
import torch
from torch.nn import (Conv2d, LSTM, LeakyReLU, Linear, MaxPool2d, Module,
                      Sequential)


class NetWork(Module):
    def __init__(self, h):
        self.size_after_conv = 128
        self.hidden_size = h
        self.current_state = None

        super(NetWork, self).__init__()

        self.color = Sequential(
            Conv2d(in_channels=3, out_channels=8, kernel_size=1, stride=1),
            LeakyReLU(),
            Conv2d(in_channels=8, out_channels=16, kernel_size=4, stride=2),
            LeakyReLU(),
        )

        self.conv1 = Sequential(
            Conv2d(in_channels=16, out_channels=32, kernel_size=4, stride=1),
            MaxPool2d(2, 2, padding=0),
            LeakyReLU(),
            Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
            LeakyReLU(),
            Conv2d(in_channels=64, out_channels=64, kernel_size=4, stride=1),
            LeakyReLU(),
            Conv2d(in_channels=64,
                   out_channels=self.size_after_conv,
                   kernel_size=3,
                   stride=1),
            LeakyReLU(),
        )

        self.fromEncoder = Sequential(
            Conv2d(in_channels=12, out_channels=32, kernel_size=4, stride=1),
            LeakyReLU(),
            Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=1),
            LeakyReLU(),
            Conv2d(in_channels=64,
                   out_channels=self.size_after_conv,
                   kernel_size=4,
                   stride=1),
            LeakyReLU(),
        )

        self.lstm = LSTM(self.size_after_conv, self.hidden_size, 1)

        self.linear = Sequential(
            LeakyReLU(),
            Linear(self.hidden_size, 15),
        )

        self.exploration_network = Sequential(
            LeakyReLU(),
            Linear(self.hidden_size, self.hidden_size),
            LeakyReLU(),
            Linear(self.hidden_size, self.hidden_size // 2),
            LeakyReLU(),
            Linear(self.hidden_size // 2, 15),
        )

        self.state_difference_network = Sequential(
            Linear(self.size_after_conv, self.size_after_conv),
            LeakyReLU(),
            Linear(self.size_after_conv, self.hidden_size),
            LeakyReLU(),
            Linear(self.hidden_size, 15),
        )

    def forward(self, x):
        self.lstm.flatten_parameters()
        # `hasEncoder` and the recurrent state (`hn`, `cn`) are never set in __init__;
        # they are expected to be assigned by the caller before forward() is invoked.
        if self.hasEncoder:
            x = self.fromEncoder(x)
        else:
            x = self.color(x)
            x = self.conv1(x)
        x = x.view(1, -1, self.size_after_conv)

        p = x.view(-1, 1, self.size_after_conv)
        x, (self.hn, self.cn) = self.lstm(x, (self.hn, self.cn))
        x = x.view(-1, self.hidden_size)
        y = self.exploration_network(x.detach())
        x = self.linear(x)
        return x, y, p.detach()

    def avoid_similar_state(self, x):
        return self.state_difference_network(x.detach())
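
forward() uses hasEncoder, hn and cn, none of which are set in __init__, so the caller has to initialize them. A minimal sketch is given below; the input resolution is a guess, and the state shapes follow the single-layer, unidirectional LSTM defined above.

import torch

h = 64
net = NetWork(h)
net.hasEncoder = False                 # use the raw-image branch (color + conv1)

# single-layer, unidirectional LSTM -> state shape (num_layers, batch, hidden_size)
net.hn = torch.zeros(1, 1, h)
net.cn = torch.zeros(1, 1, h)

frame = torch.randn(1, 3, 64, 64)      # guessed input resolution
actions, exploration, features = net(frame)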