import torch
from torch.nn import Dropout, LSTM, Linear
from torchcrf import CRF
from transformers import BertModel, BertPreTrainedModel
from transformers.modeling_outputs import TokenClassifierOutput


class BertBiLSTMCRFSLModel(BertPreTrainedModel):
    """BERT encoder followed by a Bi-LSTM and a CRF layer for sequence labeling."""

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.return_dict = config.return_dict if hasattr(config, "return_dict") else False
        self.bert = BertModel(config)
        self.dropout = Dropout(config.hidden_dropout_prob)
        self.lstm = LSTM(input_size=config.hidden_size,
                         hidden_size=config.hidden_size,
                         batch_first=True,
                         bidirectional=True)
        # The Bi-LSTM doubles the feature dimension, hence hidden_size * 2.
        self.classifier = Linear(config.hidden_size * 2, config.num_labels)
        self.crf = CRF(num_tags=config.num_labels, batch_first=True)
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
                labels=None, return_dict=None):
        self.lstm.flatten_parameters()
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            return_dict=self.return_dict)
        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        lstm_output, _ = self.lstm(sequence_output)
        logits = self.classifier(lstm_output)

        loss = None
        if labels is not None:
            # [TBD] change {label_id: -100 for [CLS], [SEP], [PAD]} into {label_id: 32 for "O"}.
            # These positions still contribute to the loss, so this needs to be improved.
            active_idx = labels != -100
            active_labels = torch.where(active_idx, labels, torch.tensor(0).type_as(labels))
            # The CRF returns a log-likelihood; negate it to obtain a loss to minimize.
            loss = self.crf(emissions=logits,
                            tags=active_labels,
                            mask=attention_mask.type(torch.uint8))
            loss = -1 * loss

        if self.return_dict:
            return TokenClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )
        else:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output
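# A minimal usage sketch, not part of the original module: it assumes a standard
# bert-base-uncased checkpoint and a hypothetical 33-tag label set, and only
# illustrates how the model above is driven during a training step.
if __name__ == "__main__":
    from transformers import BertConfig, BertTokenizerFast

    config = BertConfig.from_pretrained("bert-base-uncased", num_labels=33)
    tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
    model = BertBiLSTMCRFSLModel.from_pretrained("bert-base-uncased", config=config)

    encoding = tokenizer(["John lives in Berlin"], return_tensors="pt", padding=True)
    labels = torch.zeros_like(encoding["input_ids"])  # dummy tag ids, one per token
    outputs = model(input_ids=encoding["input_ids"],
                    attention_mask=encoding["attention_mask"],
                    token_type_ids=encoding["token_type_ids"],
                    labels=labels)
    loss = outputs[0]          # negative CRF log-likelihood
    loss.backward()
    # At inference time, tag sequences would be decoded with model.crf.decode(logits, mask).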
import torch
from torch.nn import LSTM, Sequential


class Encoder(torch.nn.Module):
    """Vanilla Tacotron 2 encoder.

    Details:
        Stack of 3 convolutional layers (5 x 1) with batch normalization, ReLU and dropout,
        whose output is passed into a bidirectional LSTM layer.

    Arguments:
        input_dim -- size of the input (supposed character embedding)
        output_dim -- number of channels of the convolutional blocks and the final Bi-LSTM
        num_blocks -- number of convolutional blocks (at least one)
        kernel_size -- kernel size of the encoder's convolutional blocks
        dropout -- dropout rate applied after each convolutional block

    Keyword arguments:
        generated -- just for convenience
    """

    def __init__(self, input_dim, output_dim, num_blocks, kernel_size, dropout, generated=False):
        super(Encoder, self).__init__()
        assert num_blocks > 0, (
            'There must be at least one convolutional block in the encoder.')
        assert output_dim % 2 == 0, (
            'Bidirectional LSTM output dimension must be divisible by 2.')
        # ConvBlock (conv + BN + activation + dropout) is defined elsewhere in the project.
        convs = [ConvBlock(input_dim, output_dim, kernel_size, dropout, 'relu')] + \
                [ConvBlock(output_dim, output_dim, kernel_size, dropout, 'relu')
                 for _ in range(num_blocks - 1)]
        self._convs = Sequential(*convs)
        self._lstm = LSTM(output_dim, output_dim // 2, batch_first=True, bidirectional=True)

    def forward(self, x, x_lengths, x_langs=None):
        # The x_langs argument is there just for convenience.
        x = x.transpose(1, 2)
        x = self._convs(x)
        x = x.transpose(1, 2)
        ml = x.size(1)
        x = torch.nn.utils.rnn.pack_padded_sequence(x, x_lengths, batch_first=True)
        self._lstm.flatten_parameters()
        x, _ = self._lstm(x)
        x, _ = torch.nn.utils.rnn.pad_packed_sequence(x, batch_first=True, total_length=ml)
        return x
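# A minimal usage sketch with assumed values, not from the original code. It presumes the
# project's ConvBlock (a 1-D convolution + BN + ReLU + dropout wrapper) is in scope, and that
# the batch is sorted by decreasing length, as pack_padded_sequence expects by default.
if __name__ == "__main__":
    encoder = Encoder(input_dim=512, output_dim=512, num_blocks=3, kernel_size=5, dropout=0.5)
    embeddings = torch.randn(2, 37, 512)   # (batch, max_time, input_dim) character embeddings
    lengths = [37, 25]                     # valid time steps per example, descending order
    encoded = encoder(embeddings, lengths) # (batch, max_time, output_dim)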
import torch
import torch.nn as nn
from torch.nn import LSTM
from transformers import DistilBertConfig, DistilBertModel, DistilBertPreTrainedModel


class DocumentDistilBertLSTM(DistilBertPreTrainedModel):
    """DistilBERT output over a document fed into an LSTM."""

    def __init__(self, bert_model_config: DistilBertConfig):
        super(DocumentDistilBertLSTM, self).__init__(bert_model_config)
        self.distilbert = DistilBertModel(bert_model_config)
        # DistilBertPooler is a project-level pooling head defined elsewhere.
        self.pooler = DistilBertPooler(bert_model_config)
        # bert_batch_size is a custom attribute expected on the config.
        self.bert_batch_size = self.distilbert.config.bert_batch_size
        self.dropout = nn.Dropout(p=bert_model_config.dropout)
        self.lstm = LSTM(bert_model_config.hidden_size, bert_model_config.hidden_size)
        self.classifier = nn.Sequential(
            nn.Dropout(p=bert_model_config.dropout),
            nn.Linear(bert_model_config.hidden_size, bert_model_config.num_labels),
            nn.Tanh())
        self.init_weights()

    def forward(self, document_batch: torch.Tensor, document_sequence_lengths: list, device='cuda'):
        # document_batch contains, per document, the (input_ids, token_type_ids, attention_masks)
        # of all its BERT sequences. DistilBERT output is collected into a tensor of shape
        # (batch_size (i.e. number of documents), num_sequences, bert_hidden_size).
        distilbert_output = torch.zeros(
            size=(document_batch.shape[0],
                  min(document_batch.shape[1], self.bert_batch_size),
                  self.distilbert.config.hidden_size),
            dtype=torch.float,
            device=device)

        # Only pass bert_batch_size sequences per document through DistilBERT,
        # which means the tail of long documents is possibly cut off.
        for doc_id in range(document_batch.shape[0]):
            hidden_states = self.distilbert(
                input_ids=document_batch[doc_id][:self.bert_batch_size, 0],
                attention_mask=document_batch[doc_id][:self.bert_batch_size, 2])[0]
            # The DistilBERT output is a tuple of length 1; its first element (hidden_states)
            # has shape (num_sequences per document, nr_of_tokens (512), bert_hidden_size).
            pooled_output = self.pooler(hidden_states)  # (num_sequences per document, bert_hidden_size)
            distilbert_output[doc_id][:self.bert_batch_size] = self.dropout(pooled_output)

        # distilbert_output: (number of documents, num_sequences per document, bert_hidden_size).
        # The LSTM expects (num_sequences, number of documents, bert_hidden_size).
        self.lstm.flatten_parameters()
        output, (_, _) = self.lstm(distilbert_output.permute(1, 0, 2))
        last_layer = output[-1]
        prediction = self.classifier(last_layer)
        assert prediction.shape[0] == document_batch.shape[0]
        return prediction

    def freeze_bert_encoder(self):
        for param in self.distilbert.parameters():
            param.requires_grad = False

    def unfreeze_bert_encoder(self):
        for param in self.distilbert.parameters():
            param.requires_grad = True

    def unfreeze_bert_encoder_last_layers(self):
        for name, param in self.distilbert.named_parameters():
            if "layer.5" in name or "pooler" in name:
                param.requires_grad = True

    def unfreeze_bert_encoder_pooler_layer(self):
        for name, param in self.distilbert.named_parameters():
            if "pooler" in name:
                param.requires_grad = True
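# A minimal usage sketch with assumed values, not from the original repo. It presumes the
# project's DistilBertPooler is in scope and that the config carries the custom
# bert_batch_size attribute. document_batch stacks, per document, the
# (input_ids, token_type_ids, attention_mask) triplet of each 512-token sequence.
if __name__ == "__main__":
    config = DistilBertConfig.from_pretrained("distilbert-base-uncased", num_labels=4)
    config.bert_batch_size = 2                           # custom attribute read by the model
    model = DocumentDistilBertLSTM(config)

    docs = torch.zeros(3, 2, 3, 512, dtype=torch.long)   # (documents, sequences, triplet, tokens)
    docs[:, :, 2, :20] = 1                                # mark the first 20 tokens as real
    lengths = [2, 2, 1]                                   # sequences per document
    scores = model(docs, lengths, device='cpu')           # (documents, num_labels)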
import torch
from torch.nn import Module, Sequential, Conv2d, LeakyReLU, MaxPool2d, LSTM, Linear


class NetWork(Module):

    def __init__(self, h):
        super(NetWork, self).__init__()
        self.size_after_conv = 128
        self.hidden_size = h
        self.current_state = None
        # Assumption: hasEncoder selects the encoder input branch (set externally when encoder
        # features are fed in) and the recurrent state (hn, cn) starts as zeros and is carried
        # across forward calls.
        self.hasEncoder = False
        self.hn = torch.zeros(1, 1, self.hidden_size)
        self.cn = torch.zeros(1, 1, self.hidden_size)

        self.color = Sequential(
            Conv2d(in_channels=3, out_channels=8, kernel_size=1, stride=1),
            LeakyReLU(),
            Conv2d(in_channels=8, out_channels=16, kernel_size=4, stride=2),
            LeakyReLU(),
        )
        self.conv1 = Sequential(
            Conv2d(in_channels=16, out_channels=32, kernel_size=4, stride=1),
            MaxPool2d(2, 2, padding=0),
            LeakyReLU(),
            Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
            LeakyReLU(),
            Conv2d(in_channels=64, out_channels=64, kernel_size=4, stride=1),
            LeakyReLU(),
            Conv2d(in_channels=64, out_channels=self.size_after_conv, kernel_size=3, stride=1),
            LeakyReLU(),
        )
        self.fromEncoder = Sequential(
            Conv2d(in_channels=12, out_channels=32, kernel_size=4, stride=1),
            LeakyReLU(),
            Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=1),
            LeakyReLU(),
            Conv2d(in_channels=64, out_channels=self.size_after_conv, kernel_size=4, stride=1),
            LeakyReLU(),
        )
        self.lstm = LSTM(self.size_after_conv, self.hidden_size, 1)
        self.linear = Sequential(
            LeakyReLU(),
            Linear(self.hidden_size, 15),
        )
        self.exploration_network = Sequential(
            LeakyReLU(),
            Linear(self.hidden_size, self.hidden_size),
            LeakyReLU(),
            Linear(self.hidden_size, self.hidden_size // 2),
            LeakyReLU(),
            Linear(self.hidden_size // 2, 15),
        )
        self.state_difference_network = Sequential(
            Linear(self.size_after_conv, self.size_after_conv),
            LeakyReLU(),
            Linear(self.size_after_conv, self.hidden_size),
            LeakyReLU(),
            Linear(self.hidden_size, 15),
        )

    def forward(self, x):
        self.lstm.flatten_parameters()
        # Pick the input branch: encoder features (12 channels) or raw RGB frames (3 channels).
        if self.hasEncoder:
            x = self.fromEncoder(x)
        else:
            x = self.color(x)
            x = self.conv1(x)
        x = x.view(1, -1, self.size_after_conv)
        p = x.view(-1, 1, self.size_after_conv)
        # Stateful LSTM: the hidden state is kept on the module between calls.
        x, (self.hn, self.cn) = self.lstm(x, (self.hn, self.cn))
        x = x.view(-1, self.hidden_size)
        y = self.exploration_network(x.detach())
        x = self.linear(x)
        return x, y, p.detach()

    def avoid_similar_state(self, x):
        return self.state_difference_network(x.detach())
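# A minimal usage sketch with assumed shapes, not part of the original code: a single
# 3 x 64 x 64 RGB frame flows through the color stem and conv1 down to a 1 x 1 feature map
# with 128 channels, which matches the flattened LSTM input expected by forward().
if __name__ == "__main__":
    net = NetWork(h=256)
    frame = torch.randn(1, 3, 64, 64)
    actions, exploration, features = net(frame)
    print(actions.shape, exploration.shape, features.shape)  # (1, 15), (1, 15), (1, 1, 128)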