class Pooler_for_title_and_desc(Seq2VecEncoder):
    def __init__(self, args, word_embedder):
        super(Pooler_for_title_and_desc, self).__init__()
        self.args = args
        self.huggingface_nameloader()
        self.bertpooler_sec2vec = BertPooler(pretrained_model=self.bert_weight_filepath)
        self.word_embedder = word_embedder
        self.word_embedding_dropout = nn.Dropout(self.args.word_embedding_dropout)
        # Linear layer applied to the [CLS] embedding for the "CLSLinear" pooling mode.
        self.linear_for_entity_encoding = nn.Linear(
            self.bertpooler_sec2vec.get_output_dim(),
            self.bertpooler_sec2vec.get_output_dim())
        # Optional projection of the entity vector down to a smaller dimension.
        self.linear_for_dimentionReduction = nn.Linear(
            self.bertpooler_sec2vec.get_output_dim(),
            self.args.dimentionReductionToThisDim)

    def huggingface_nameloader(self):
        if self.args.bert_name == 'bert-base-uncased':
            self.bert_weight_filepath = 'bert-base-uncased'
        else:
            self.bert_weight_filepath = 'dummy'
            print('Currently not supported:', self.args.bert_name)
            exit()

    def forward(self, title_and_desc_concatnated_text):
        mask_sent = get_text_field_mask(title_and_desc_concatnated_text)
        entity_emb = self.word_embedder(title_and_desc_concatnated_text)
        entity_emb = self.word_embedding_dropout(entity_emb)

        if self.args.entityPooling == "CLSLinear":
            # [CLS] token embedding followed by a linear layer.
            entity_emb = entity_emb[:, 0, :]
            entity_emb = self.linear_for_entity_encoding(entity_emb)
        elif self.args.entityPooling == 'CLS':
            # Raw [CLS] token embedding.
            entity_emb = entity_emb[:, 0, :]
        else:
            # BertPooler: [CLS] embedding through the pretrained linear + tanh pooler.
            assert self.args.entityPooling == "CLSLinearTanh"
            entity_emb = self.bertpooler_sec2vec(entity_emb, mask_sent)

        if self.args.dimentionReduction:
            return self.linear_for_dimentionReduction(entity_emb)
        else:
            return entity_emb
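A minimal, self-contained sketch (plain PyTorch, not the repository's code; the tensor shapes are illustrative) of what the three `entityPooling` branches above compute for a (batch, seq_len, hidden) tensor of BERT token embeddings. Note that the real BertPooler reuses the pretrained pooler weights, while the linear layer here is randomly initialized.

import torch
import torch.nn as nn

batch, seq_len, hidden = 2, 16, 768
token_embeddings = torch.rand(batch, seq_len, hidden)    # stand-in for self.word_embedder(...)

cls_vec = token_embeddings[:, 0, :]                      # 'CLS': raw [CLS] embedding
linear = nn.Linear(hidden, hidden)
cls_linear = linear(cls_vec)                             # 'CLSLinear': [CLS] through a linear layer
cls_linear_tanh = torch.tanh(linear(cls_vec))            # 'CLSLinearTanh': linear + tanh, as in BertPooler
print(cls_vec.shape, cls_linear.shape, cls_linear_tanh.shape)   # all torch.Size([2, 768])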
def test_encoder(self):
    encoder = BertPooler("bert-base-uncased")
    assert encoder.get_input_dim() == encoder.get_output_dim()

    embedding = torch.rand(8, 24, encoder.get_input_dim())
    pooled1 = encoder(embedding)
    assert pooled1.size() == (8, encoder.get_input_dim())

    # BertPooler only looks at the first ([CLS]) position, so zeroing out every
    # other token must leave the pooled output unchanged.
    embedding[:, 1:, :] = 0
    pooled2 = encoder(embedding)
    numpy.testing.assert_array_almost_equal(
        pooled1.detach().numpy(), pooled2.detach().numpy())
class SimpleBertClassifier(BaseModel):
    """
    Model that encodes input using BERT, takes the embedding for the CLS token
    (using BertPooler) and puts the output through a FFN to get the
    probabilities.
    """

    def __init__(self,
                 bert_path: Path,
                 vocab: Vocabulary,
                 train_bert: bool = False) -> None:
        # We have to pass the vocabulary to the constructor.
        super().__init__(vocab)
        self.word_embeddings = bert_embeddings(pretrained_model=bert_path,
                                               training=train_bert)
        self.pooler = BertPooler(pretrained_model=str(bert_path))

        hidden_dim = self.pooler.get_output_dim()
        self.hidden2logit = torch.nn.Linear(
            in_features=hidden_dim,
            out_features=1
        )

    # This is the computation bit of the model. The arguments of this function
    # are the fields from the `Instance` we created, as that's what's going to
    # be passed to this. We also have the optional `label`, which is only
    # available at training time, used to calculate the loss.
    def forward(self,
                metadata: Dict[str, torch.Tensor],
                bert0: Dict[str, torch.Tensor],
                bert1: Dict[str, torch.Tensor],
                label: Optional[torch.Tensor] = None
                ) -> Dict[str, torch.Tensor]:
        # Every sample in a batch has to have the same size (as it's a tensor),
        # so smaller entries are padded. The mask is used to counteract this
        # padding.
        t0_masks = util.get_text_field_mask(bert0)
        t1_masks = util.get_text_field_mask(bert1)

        # We create the embeddings from the input text
        t0_embs = self.word_embeddings(bert0)
        t1_embs = self.word_embeddings(bert1)

        # Then we use those embeddings (along with the masks) as inputs for
        # our encoders
        enc0_outs = self.pooler(t0_embs, t0_masks)
        enc1_outs = self.pooler(t1_embs, t1_masks)

        # Finally, we pass each encoded output tensor to the feedforward layer
        # to produce logits corresponding to each class.
        logit0 = self.hidden2logit(enc0_outs).squeeze(-1)
        logit1 = self.hidden2logit(enc1_outs).squeeze(-1)
        logit0, _ = torch.max(logit0, dim=1)
        logit1, _ = torch.max(logit1, dim=1)
        logits = torch.stack((logit0, logit1), dim=-1)

        # We also compute the class with highest likelihood (our prediction)
        prob = torch.softmax(logits, dim=-1)
        output = {"logits": logits, "prob": prob}

        # Labels are optional. If they're present, we calculate the accuracy
        # and the loss function.
        if label is not None:
            self.accuracy(prob, label)
            output["loss"] = self.loss(logits, label)

        # The output is the dict we've been building, with the logits, loss
        # and the prediction.
        return output
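A small, self-contained sketch (plain PyTorch; the names and shapes are illustrative, and the per-sentence `torch.max` from the forward pass above is omitted) of the pairwise scoring head: each candidate text is pooled to a single vector, scored with the same linear layer, and the two scores are turned into class probabilities.

import torch

batch, hidden = 4, 768
pooled0 = torch.rand(batch, hidden)   # stand-in for self.pooler(t0_embs, t0_masks)
pooled1 = torch.rand(batch, hidden)   # stand-in for self.pooler(t1_embs, t1_masks)

hidden2logit = torch.nn.Linear(hidden, 1)
logit0 = hidden2logit(pooled0).squeeze(-1)       # (batch,)
logit1 = hidden2logit(pooled1).squeeze(-1)       # (batch,)

logits = torch.stack((logit0, logit1), dim=-1)   # (batch, 2)
prob = torch.softmax(logits, dim=-1)             # probability of candidate 0 vs. candidate 1
prediction = prob.argmax(dim=-1)                 # predicted class per example
print(logits.shape, prob.shape, prediction.shape)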
class AdvancedAttentionBertClassifier(BaseModel):
    """
    Model similar to the AttentiveClassifier with BERT, but without external
    features. SimpleTrian is this with the attention before the encoders.
    """

    def __init__(self,
                 bert_path: Path,
                 encoder: Seq2SeqEncoder,
                 vocab: Vocabulary,
                 hidden_dim: int = 100,
                 encoder_dropout: float = 0.0,
                 train_bert: bool = False) -> None:
        # We have to pass the vocabulary to the constructor.
        super().__init__(vocab)
        self.word_embeddings = bert_embeddings(pretrained_model=bert_path,
                                               training=train_bert)

        self.encoder_dropout: torch.nn.Module
        if encoder_dropout > 0:
            self.encoder_dropout = torch.nn.Dropout(p=encoder_dropout)
        else:
            self.encoder_dropout = torch.nn.Identity()

        self.pooler = BertPooler(pretrained_model=str(bert_path))
        self.dense1 = torch.nn.Linear(in_features=self.pooler.get_output_dim(),
                                      out_features=hidden_dim)
        self.encoder = encoder
        self.self_attn = LinearSelfAttention(
            input_dim=self.encoder.get_output_dim(),
            bias=True)
        self.dense2 = torch.nn.Linear(
            in_features=self.encoder.get_output_dim(),
            out_features=1)

    # This is the computation bit of the model. The arguments of this function
    # are the fields from the `Instance` we created, as that's what's going to
    # be passed to this. We also have the optional `label`, which is only
    # available at training time, used to calculate the loss.
    def forward(self,
                metadata: Dict[str, torch.Tensor],
                bert0: Dict[str, torch.Tensor],
                bert1: Dict[str, torch.Tensor],
                label: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
        # Every sample in a batch has to have the same size (as it's a tensor),
        # so smaller entries are padded. The mask is used to counteract this
        # padding.

        # We create the embeddings from the input text
        t0_embs = self.word_embeddings(bert0)
        t1_embs = self.word_embeddings(bert1)

        t0_pooled = self.pooler(t0_embs)
        t1_pooled = self.pooler(t1_embs)

        t0_transformed = self.dense1(t0_pooled)
        t1_transformed = self.dense1(t1_pooled)

        t0_enc_hiddens = self.encoder_dropout(
            self.encoder(t0_transformed, mask=None))
        t1_enc_hiddens = self.encoder_dropout(
            self.encoder(t1_transformed, mask=None))

        t0_enc_attn = self.self_attn(t0_enc_hiddens, t0_enc_hiddens)
        t1_enc_attn = self.self_attn(t1_enc_hiddens, t1_enc_hiddens)

        t0_enc_out = util.weighted_sum(t0_enc_hiddens, t0_enc_attn)
        t1_enc_out = util.weighted_sum(t1_enc_hiddens, t1_enc_attn)

        logit0 = self.dense2(t0_enc_out).squeeze(-1)
        logit1 = self.dense2(t1_enc_out).squeeze(-1)
        logits = torch.stack((logit0, logit1), dim=-1)

        # We also compute the class with highest likelihood (our prediction)
        prob = torch.softmax(logits, dim=-1)
        output = {"logits": logits, "prob": prob}

        # Labels are optional. If they're present, we calculate the accuracy
        # and the loss function.
        if label is not None:
            self.accuracy(prob, label)
            output["loss"] = self.loss(logits, label)

        # The output is the dict we've been building, with the logits, loss
        # and the prediction.
        return output
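A simplified, self-contained sketch (plain PyTorch; it is not the repository's LinearSelfAttention, which scores pairs of encoder states) of the attention pooling idea used above: score each encoder state, normalize the scores, and take the attention-weighted sum of the states, which is what `util.weighted_sum` produces.

import torch

batch, steps, dim = 4, 10, 100
enc_hiddens = torch.rand(batch, steps, dim)      # stand-in for the (dropped-out) encoder output

scorer = torch.nn.Linear(dim, 1, bias=True)      # plays the role of the attention scorer
scores = scorer(enc_hiddens).squeeze(-1)         # (batch, steps)
weights = torch.softmax(scores, dim=-1)          # attention distribution over the steps
pooled = torch.bmm(weights.unsqueeze(1), enc_hiddens).squeeze(1)   # (batch, dim) weighted sum
print(pooled.shape)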