def __init__(self, args, word_embedder):
    super(Pooler_for_mention, self).__init__()
    self.args = args
    self.huggingface_nameloader()
    self.bertpooler_sec2vec = BertPooler(pretrained_model=self.bert_weight_filepath)
    self.word_embedder = word_embedder
    self.word_embedding_dropout = nn.Dropout(self.args.word_embedding_dropout)
    self.linear_for_mention_encoding = nn.Linear(self.bertpooler_sec2vec.get_output_dim(),
                                                 self.bertpooler_sec2vec.get_output_dim())
    self.linear_for_dimentionReduction = nn.Linear(self.bertpooler_sec2vec.get_output_dim(),
                                                   self.args.dimentionReductionToThisDim)
def __init__(
    self,
    vocab: Vocabulary,
    transformer_model: str = "roberta-large",
    override_weights_file: Optional[str] = None,
    override_weights_strip_prefix: Optional[str] = None,
    **kwargs,
) -> None:
    super().__init__(vocab, **kwargs)
    self._text_field_embedder = PretrainedTransformerEmbedder(
        transformer_model,
        override_weights_file=override_weights_file,
        override_weights_strip_prefix=override_weights_strip_prefix,
    )
    self._text_field_embedder = BasicTextFieldEmbedder({"tokens": self._text_field_embedder})
    self._pooler = BertPooler(
        transformer_model,
        override_weights_file=override_weights_file,
        override_weights_strip_prefix=override_weights_strip_prefix,
        dropout=0.1,
    )
    self._linear_layer = torch.nn.Linear(self._text_field_embedder.get_output_dim(), 1)
    self._linear_layer.weight.data.normal_(mean=0.0, std=0.02)
    self._linear_layer.bias.data.zero_()
    self._loss = torch.nn.CrossEntropyLoss()
    self._accuracy = CategoricalAccuracy()
def build_model(vocab: Vocabulary, bert_model: Optional[str] = None) -> Model:
    if bert_model:
        embedder = BasicTextFieldEmbedder(
            {"bert": PretrainedTransformerEmbedder(model_name=bert_model, train_parameters=True)}
        )
        encoder = BertPooler(pretrained_model=bert_model, requires_grad=True)
    else:
        # (3) How to get vectors for each Token ID:
        # (3.1) embed each token
        token_embedding = Embedding(
            embedding_dim=10, num_embeddings=vocab.get_vocab_size("token_vocab")
        )
        # pretrained_file='https://allennlp.s3.amazonaws.com/datasets/glove/glove.6B.50d.txt.gz'

        # (3.2) embed each character in each token
        character_embedding = Embedding(
            embedding_dim=3, num_embeddings=vocab.get_vocab_size("character_vocab")
        )
        cnn_encoder = CnnEncoder(embedding_dim=3, num_filters=4, ngram_filter_sizes=[3])
        token_encoder = TokenCharactersEncoder(character_embedding, cnn_encoder)

        # (3.3) embed the POS tag of each token
        pos_tag_embedding = Embedding(
            embedding_dim=10, num_embeddings=vocab.get_vocab_size("pos_tag_vocab")
        )

        # Each TokenEmbedder embeds its input, and the results are concatenated
        # in an arbitrary (but consistent) order.
        # cf. https://docs.allennlp.org/master/api/modules/text_field_embedders/basic_text_field_embedder/
        embedder = BasicTextFieldEmbedder(
            token_embedders={
                "tokens": token_embedding,
                "token_characters": token_encoder,
                "pos_tags": pos_tag_embedding,
            }
        )  # emb_dim = 10 + 4 + 10 = 24

        # Average the embeddings across time rather than simply summing them
        # (i.e. divide the summed embeddings by the length of the sentence).
        encoder = BagOfEmbeddingsEncoder(embedding_dim=24, averaged=True)

    return SimpleClassifier(vocab, embedder, encoder)
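# Sanity-check sketch for the non-BERT branch above: BasicTextFieldEmbedder concatenates
# its token embedders, so its output dim should be 10 + 4 + 10 = 24, matching the
# embedding_dim given to BagOfEmbeddingsEncoder. The empty Vocabulary and namespace
# names here are illustrative assumptions, not part of the original project.
from allennlp.data import Vocabulary
from allennlp.modules.seq2vec_encoders import CnnEncoder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding, TokenCharactersEncoder

vocab = Vocabulary()  # hypothetical: real code would build this from the dataset
token_embedding = Embedding(embedding_dim=10, num_embeddings=vocab.get_vocab_size("token_vocab"))
character_embedding = Embedding(embedding_dim=3, num_embeddings=vocab.get_vocab_size("character_vocab"))
token_encoder = TokenCharactersEncoder(
    character_embedding, CnnEncoder(embedding_dim=3, num_filters=4, ngram_filter_sizes=[3]))
pos_tag_embedding = Embedding(embedding_dim=10, num_embeddings=vocab.get_vocab_size("pos_tag_vocab"))
embedder = BasicTextFieldEmbedder(
    {"tokens": token_embedding, "token_characters": token_encoder, "pos_tags": pos_tag_embedding})
assert embedder.get_output_dim() == 10 + 4 + 10  # 24, the value used above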
def build_adversarial_transformer_model(vocab: Vocabulary, transformer_model: str) -> Model:
    print("Building the model")
    vocab_size = vocab.get_vocab_size("tokens")
    embedding = PretrainedTransformerEmbedder(model_name=transformer_model)
    embedder = BasicTextFieldEmbedder(token_embedders={"bert_tokens": embedding})
    encoder = BertPooler(transformer_model)
    return SimpleClassifier(vocab, embedder, encoder)
class Pooler_for_title_and_desc(Seq2VecEncoder):
    def __init__(self, args, word_embedder):
        super(Pooler_for_title_and_desc, self).__init__()
        self.args = args
        self.huggingface_nameloader()
        self.bertpooler_sec2vec = BertPooler(pretrained_model=self.bert_weight_filepath)
        self.word_embedder = word_embedder
        self.word_embedding_dropout = nn.Dropout(self.args.word_embedding_dropout)
        self.linear_for_entity_encoding = nn.Linear(self.bertpooler_sec2vec.get_output_dim(),
                                                    self.bertpooler_sec2vec.get_output_dim())
        self.linear_for_dimentionReduction = nn.Linear(self.bertpooler_sec2vec.get_output_dim(),
                                                       self.args.dimentionReductionToThisDim)

    def huggingface_nameloader(self):
        if self.args.bert_name == 'bert-base-uncased':
            self.bert_weight_filepath = 'bert-base-uncased'
        else:
            self.bert_weight_filepath = 'dummy'
            print('Currently not supported', self.args.bert_name)
            exit()

    def forward(self, title_and_desc_concatnated_text):
        mask_sent = get_text_field_mask(title_and_desc_concatnated_text)
        entity_emb = self.word_embedder(title_and_desc_concatnated_text)
        entity_emb = self.word_embedding_dropout(entity_emb)

        if self.args.entityPooling == "CLSLinear":
            # [CLS] embedding followed by a linear projection
            entity_emb = entity_emb[:, 0, :]
            entity_emb = self.linear_for_entity_encoding(entity_emb)
        elif self.args.entityPooling == 'CLS':
            # raw [CLS] embedding
            entity_emb = entity_emb[:, 0, :]
        else:
            # BertPooler: linear projection of [CLS] followed by tanh
            assert self.args.entityPooling == "CLSLinearTanh"
            entity_emb = self.bertpooler_sec2vec(entity_emb, mask_sent)

        if self.args.dimentionReduction:
            return self.linear_for_dimentionReduction(entity_emb)
        else:
            return entity_emb
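# Standalone sketch of the three entityPooling variants above using plain torch tensors.
# The shapes and the single Linear layer are illustrative assumptions; BertPooler uses
# the pretrained model's own pooler weights rather than a fresh Linear.
import torch

batch, seq_len, dim = 2, 16, 768
token_embeddings = torch.randn(batch, seq_len, dim)
linear = torch.nn.Linear(dim, dim)

cls_vec = token_embeddings[:, 0, :]               # 'CLS': first-token embedding
cls_linear = linear(cls_vec)                      # 'CLSLinear': linear projection of it
cls_linear_tanh = torch.tanh(linear(cls_vec))     # 'CLSLinearTanh': what BertPooler computes
assert cls_linear_tanh.shape == (batch, dim)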
def __init__(self, args, word_embedder):
    super(Pooler_for_blink_mention, self).__init__()
    self.args = args
    self.huggingface_nameloader()
    self.bertpooler_sec2vec = BertPooler(pretrained_model=self.bert_weight_filepath)
    self.word_embedder = word_embedder
    self.word_embedding_dropout = nn.Dropout(self.args.word_embedding_dropout)
def __init__(self, args, input_dim, word_embedder):
    super(Concatenate_Right_and_Left_MentionEncoder, self).__init__()
    self.config = args
    self.args = args
    self.input_dim = input_dim
    self.word_embedder = word_embedder
    self.word_embedding_dropout = nn.Dropout(self.args.word_embedding_dropout)
    self.ff_seq2vecs = nn.Linear(input_dim * 4, input_dim)
    self.huggingface_nameloader()
    self.bertpooler_sec2vec = BertPooler(pretrained_model=self.bert_weight_filepath)
def __init__(
        self,
        word_embedding_dropout: float = 0.05,
        bert_model_name: str = 'japanese_bert',
        word_embedder: BasicTextFieldEmbedder = BasicTextFieldEmbedder(
            {'tokens': PretrainedTransformerEmbedder(model_name='cl-tohoku/bert-base-japanese')})):
    super(BertPoolerForMention, self).__init__()
    self.bert_model_name = bert_model_name
    self.huggingface_nameloader()
    self.bertpooler_sec2vec = BertPooler(pretrained_model=self.bert_weight_filepath)
    self.word_embedder = word_embedder
    self.word_embedding_dropout = nn.Dropout(word_embedding_dropout)
def test_encoder(self):
    encoder = BertPooler("bert-base-uncased")
    assert encoder.get_input_dim() == encoder.get_output_dim()

    embedding = torch.rand(8, 24, encoder.get_input_dim())
    pooled1 = encoder(embedding)
    assert pooled1.size() == (8, encoder.get_input_dim())

    # BertPooler only looks at the first ([CLS]) position, so zeroing out every
    # other position must leave the pooled output unchanged.
    embedding[:, 1:, :] = 0
    pooled2 = encoder(embedding)
    numpy.testing.assert_array_almost_equal(pooled1.detach().numpy(),
                                            pooled2.detach().numpy())
def __init__(self, vocab, pretrained_model: str = "bert-base-uncased",
             requires_grad: bool = True):
    super(ChatClassification, self).__init__()
    self.vocab = vocab
    self.turn_pooler = BertPooler(pretrained_model, requires_grad, dropout=0.0)
    self.chat_encoder = StackedBidirectionalLstm(hidden_size=400,
                                                 input_size=768,
                                                 num_layers=1,
                                                 recurrent_dropout_probability=0.3,
                                                 use_highway=True)
    self.classif_layer = torch.nn.Linear(in_features=self.chat_encoder.hidden_size,
                                         out_features=2)
    self.accuracy = CategoricalAccuracy()
def build_model_Transformer(vocab: Vocabulary, use_reg: bool = True) -> Model:
    print("Building the model")
    vocab_size = vocab.get_vocab_size("tokens")
    EMBED_DIMS = 300  # unused by the transformer pipeline below

    # Turn the tokens into contextual embeddings, then pool the [CLS] vector.
    embedder = PretrainedTransformerEmbedder(BERT_MODEL_NAME)
    encoder = BertPooler(BERT_MODEL_NAME)

    # Construct the regularizer applicator.
    regularizer_applicator = None
    if use_reg:
        l2_reg = L2Regularizer()
        regexes = [("embedder", l2_reg), ("encoder", l2_reg), ("classifier", l2_reg)]
        regularizer_applicator = RegularizerApplicator(regexes)

    return MortalityClassifier(vocab, embedder, encoder, regularizer_applicator)
def train_bert(train_dataset, validation_dataset, batch_size, pretrained_model,
               double_input=False, dense_vector=False, col_name=None,
               epochs=100, patience=None, learning_rate=3e-4, num_classes=2, use_gpu=False):
    """
    Trains BERT on train_dataset, with optional early stopping on validation_dataset.

    Parameters
    ----------
    train_dataset: List[Instance]
        Instances for the training set
    validation_dataset: List[Instance]
        Instances for the validation set
    batch_size: int
        number of Instances to process in a batch
    pretrained_model: str
        pretrained BERT model to use
    double_input: bool
        True to run the DoubleInput classifier | False (default) for the SingleInput classifier
    dense_vector: bool
        True to concatenate a dense feature vector before feeding to the FeedForward layer
    col_name: str
        'reply_text' or 'question' (for calculating the dense feature vector) | only applicable when dense_vector is True
    epochs: int
        total number of epochs to train for (default=100)
    patience: int or None
        early stopping - number of epochs to wait for validation loss to improve.
        If None (default): disables early stopping and trains on train+validation set
    learning_rate: float
        learning rate for the Adam optimizer
    num_classes: int
        default=2 for binary classification
    use_gpu: bool
        True to use the GPU

    Returns
    -------
    Trained Model, Vocabulary, Number of actual training epochs
    """
    vocab = Vocabulary()

    if double_input:  # need context_tokens as well
        iterator = BucketIterator(batch_size=batch_size,
                                  sorting_keys=[("reply_tokens", "num_tokens"),
                                                ("context_tokens", "num_tokens")])
    else:  # only reply_tokens
        iterator = BucketIterator(batch_size=batch_size,
                                  sorting_keys=[("reply_tokens", "num_tokens")])

    iterator.index_with(vocab)  # numericalize the data

    word_embeddings: TextFieldEmbedder = load_bert_embeddings(pretrained_model)
    encoder: Seq2VecEncoder = BertPooler(pretrained_model=pretrained_model,
                                         requires_grad=True)

    if double_input:  # consider the preceding 'comment_text'
        if dense_vector:
            # add the length of the dense vector to the input dimension of the FeedForward layer
            ff_input_dim = 2 * (encoder.get_output_dim() + DENSE_VECTOR_LEN)
            classifier_feedforward: FeedForward = nn.Linear(ff_input_dim, num_classes)
            model = models.DenseDoubleClassifier(vocab=vocab,
                                                 word_embeddings=word_embeddings,
                                                 reply_encoder=encoder,
                                                 context_encoder=encoder,
                                                 classifier_feedforward=classifier_feedforward,
                                                 col_name=col_name)
        else:
            classifier_feedforward: FeedForward = nn.Linear(2 * encoder.get_output_dim(), num_classes)
            model = models.DoubleInputClassifier(vocab=vocab,
                                                 word_embeddings=word_embeddings,
                                                 reply_encoder=encoder,
                                                 context_encoder=encoder,
                                                 classifier_feedforward=classifier_feedforward)
    else:  # only 'reply_text' or 'question'
        if dense_vector:
            # add the length of the dense vector to the input dimension of the FeedForward layer
            ff_input_dim = encoder.get_output_dim() + DENSE_VECTOR_LEN
            classifier_feedforward: FeedForward = nn.Linear(ff_input_dim, num_classes)
            model = models.DenseSingleClassifier(vocab=vocab,
                                                 word_embeddings=word_embeddings,
                                                 encoder=encoder,
                                                 classifier_feedforward=classifier_feedforward,
                                                 col_name=col_name)
        else:
            classifier_feedforward: FeedForward = nn.Linear(encoder.get_output_dim(), num_classes)
            model = models.SingleInputClassifier(vocab=vocab,
                                                 word_embeddings=word_embeddings,
                                                 encoder=encoder,
                                                 classifier_feedforward=classifier_feedforward)

    if use_gpu:
        model.cuda()

    optimizer = optim.Adam(model.parameters(), learning_rate)

    if patience is None:
        # No early stopping: train on the combined train+validation dataset
        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          iterator=iterator,
                          train_dataset=train_dataset + validation_dataset,
                          cuda_device=0 if use_gpu else -1,
                          num_epochs=epochs)
    else:
        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          iterator=iterator,
                          train_dataset=train_dataset,
                          validation_dataset=validation_dataset,
                          cuda_device=0 if use_gpu else -1,
                          patience=patience,  # stop if loss does not improve for 'patience' epochs
                          num_epochs=epochs)

    metrics = trainer.train()
    print(metrics)

    return model, vocab, metrics['training_epochs']
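# Hypothetical usage sketch of train_bert: the instance lists, model name, and
# hyperparameters below are assumptions, not values from the original project.
# train_instances and val_instances would come from the project's dataset reader.
model, vocab, n_epochs = train_bert(train_dataset=train_instances,
                                    validation_dataset=val_instances,
                                    batch_size=16,
                                    pretrained_model="bert-base-uncased",
                                    patience=5,
                                    epochs=30,
                                    use_gpu=False)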
class SimpleBertClassifier(BaseModel):
    """
    Model that encodes input using BERT, takes the embedding for the CLS token
    (using BertPooler) and puts the output through a FFN to get the probabilities.
    """

    def __init__(self,
                 bert_path: Path,
                 vocab: Vocabulary,
                 train_bert: bool = False) -> None:
        # We have to pass the vocabulary to the constructor.
        super().__init__(vocab)

        self.word_embeddings = bert_embeddings(pretrained_model=bert_path,
                                               training=train_bert)
        self.pooler = BertPooler(pretrained_model=str(bert_path))

        hidden_dim = self.pooler.get_output_dim()
        self.hidden2logit = torch.nn.Linear(in_features=hidden_dim, out_features=1)

    # This is the computation bit of the model. The arguments of this function
    # are the fields from the `Instance` we created, as that's what's going to
    # be passed to this. We also have the optional `label`, which is only
    # available at training time, used to calculate the loss.
    def forward(self,
                metadata: Dict[str, torch.Tensor],
                bert0: Dict[str, torch.Tensor],
                bert1: Dict[str, torch.Tensor],
                label: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
        # Every sample in a batch has to have the same size (as it's a tensor),
        # so smaller entries are padded. The mask is used to counteract this
        # padding.
        t0_masks = util.get_text_field_mask(bert0)
        t1_masks = util.get_text_field_mask(bert1)

        # We create the embeddings from the input text
        t0_embs = self.word_embeddings(bert0)
        t1_embs = self.word_embeddings(bert1)

        # Then we use those embeddings (along with the masks) as inputs for
        # our encoders
        enc0_outs = self.pooler(t0_embs, t0_masks)
        enc1_outs = self.pooler(t1_embs, t1_masks)

        # Finally, we pass each encoded output tensor to the feedforward layer
        # to produce logits corresponding to each class.
        logit0 = self.hidden2logit(enc0_outs).squeeze(-1)
        logit1 = self.hidden2logit(enc1_outs).squeeze(-1)
        logit0, _ = torch.max(logit0, dim=1)
        logit1, _ = torch.max(logit1, dim=1)
        logits = torch.stack((logit0, logit1), dim=-1)

        # We also compute the class with highest likelihood (our prediction)
        prob = torch.softmax(logits, dim=-1)
        output = {"logits": logits, "prob": prob}

        # Labels are optional. If they're present, we calculate the accuracy
        # and the loss function.
        if label is not None:
            self.accuracy(prob, label)
            output["loss"] = self.loss(logits, label)

        # The output is the dict we've been building, with the logits, loss
        # and the prediction.
        return output
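# Minimal sketch (assumed shapes) of the two-candidate scoring pattern used above:
# score each pooled encoding with the same Linear head, stack the two scores, and
# treat the pair as a 2-way classification problem. The per-instance max over an
# extra dimension used in the model above is omitted for clarity.
import torch

batch, dim = 4, 768
enc0, enc1 = torch.randn(batch, dim), torch.randn(batch, dim)   # pooled encodings
hidden2logit = torch.nn.Linear(dim, 1)

logit0 = hidden2logit(enc0).squeeze(-1)          # (batch,)
logit1 = hidden2logit(enc1).squeeze(-1)          # (batch,)
logits = torch.stack((logit0, logit1), dim=-1)   # (batch, 2)
prob = torch.softmax(logits, dim=-1)

label = torch.randint(0, 2, (batch,))            # hypothetical gold choice
loss = torch.nn.functional.cross_entropy(logits, label)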
class AdvancedAttentionBertClassifier(BaseModel):
    """
    Model similar to the AttentiveClassifier with BERT, but without external
    features. SimpleTrian is this with the attention before the encoders.
    """

    def __init__(self,
                 bert_path: Path,
                 encoder: Seq2SeqEncoder,
                 vocab: Vocabulary,
                 hidden_dim: int = 100,
                 encoder_dropout: float = 0.0,
                 train_bert: bool = False) -> None:
        # We have to pass the vocabulary to the constructor.
        super().__init__(vocab)

        self.word_embeddings = bert_embeddings(pretrained_model=bert_path,
                                               training=train_bert)

        self.encoder_dropout: torch.nn.Module
        if encoder_dropout > 0:
            self.encoder_dropout = torch.nn.Dropout(p=encoder_dropout)
        else:
            self.encoder_dropout = torch.nn.Identity()

        self.pooler = BertPooler(pretrained_model=str(bert_path))
        self.dense1 = torch.nn.Linear(in_features=self.pooler.get_output_dim(),
                                      out_features=hidden_dim)
        self.encoder = encoder
        self.self_attn = LinearSelfAttention(input_dim=self.encoder.get_output_dim(),
                                             bias=True)
        self.dense2 = torch.nn.Linear(in_features=self.encoder.get_output_dim(),
                                      out_features=1)

    # This is the computation bit of the model. The arguments of this function
    # are the fields from the `Instance` we created, as that's what's going to
    # be passed to this. We also have the optional `label`, which is only
    # available at training time, used to calculate the loss.
    def forward(self,
                metadata: Dict[str, torch.Tensor],
                bert0: Dict[str, torch.Tensor],
                bert1: Dict[str, torch.Tensor],
                label: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
        # Every sample in a batch has to have the same size (as it's a tensor),
        # so smaller entries are padded. The mask is used to counteract this
        # padding.

        # We create the embeddings from the input text
        t0_embs = self.word_embeddings(bert0)
        t1_embs = self.word_embeddings(bert1)

        t0_pooled = self.pooler(t0_embs)
        t1_pooled = self.pooler(t1_embs)

        t0_transformed = self.dense1(t0_pooled)
        t1_transformed = self.dense1(t1_pooled)

        t0_enc_hiddens = self.encoder_dropout(self.encoder(t0_transformed, mask=None))
        t1_enc_hiddens = self.encoder_dropout(self.encoder(t1_transformed, mask=None))

        t0_enc_attn = self.self_attn(t0_enc_hiddens, t0_enc_hiddens)
        t1_enc_attn = self.self_attn(t1_enc_hiddens, t1_enc_hiddens)

        t0_enc_out = util.weighted_sum(t0_enc_hiddens, t0_enc_attn)
        t1_enc_out = util.weighted_sum(t1_enc_hiddens, t1_enc_attn)

        logit0 = self.dense2(t0_enc_out).squeeze(-1)
        logit1 = self.dense2(t1_enc_out).squeeze(-1)
        logits = torch.stack((logit0, logit1), dim=-1)

        # We also compute the class with highest likelihood (our prediction)
        prob = torch.softmax(logits, dim=-1)
        output = {"logits": logits, "prob": prob}

        # Labels are optional. If they're present, we calculate the accuracy
        # and the loss function.
        if label is not None:
            self.accuracy(prob, label)
            output["loss"] = self.loss(logits, label)

        # The output is the dict we've been building, with the logits, loss
        # and the prediction.
        return output
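# Generic sketch of attention-weighted pooling in plain torch, illustrating the
# attention-then-weighted-sum idea behind self_attn + util.weighted_sum above.
# The scoring function here (a single learned vector) and the shapes are assumptions;
# the project's LinearSelfAttention and its batching may differ.
import torch

batch, seq_len, dim = 4, 12, 256
hiddens = torch.randn(batch, seq_len, dim)
scorer = torch.nn.Linear(dim, 1, bias=True)

scores = scorer(hiddens).squeeze(-1)                          # (batch, seq_len)
weights = torch.softmax(scores, dim=-1)                       # attention over positions
pooled = torch.bmm(weights.unsqueeze(1), hiddens).squeeze(1)  # (batch, dim) weighted sum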