def __init__(
    self,
    vocab: Vocabulary,
    text_field_embedder: TextFieldEmbedder,
    final_feedforward: FeedForward,
    initializer: InitializerApplicator = InitializerApplicator(),
    regularizer: Optional[RegularizerApplicator] = None,
) -> None:
    super().__init__(vocab, regularizer)

    # Model components
    self._embedder = text_field_embedder
    self._feed_forward = final_feedforward
    self._claim_encoder = CnnEncoder(embedding_dim=50, num_filters=100, output_dim=50)
    self._evidence_encoder = CnnEncoder(embedding_dim=50, num_filters=100, output_dim=50)

    # Accuracy and loss for training/evaluation of the model
    self._accuracy = CategoricalAccuracy()
    self._loss = nn.CrossEntropyLoss()

    # Initialize weights
    initializer(self)
def test_forward_respects_masking(self):
    # seed 1 fails on the old cnn encoder code
    torch.manual_seed(1)
    encoder = CnnEncoder(embedding_dim=7, num_filters=13, ngram_filter_sizes=(1, 2, 3, 4, 5))
    init = Initializer.from_params(Params({"type": "normal", "mean": 0.0, "std": 10}))
    initializer = InitializerApplicator([(".*", init)])
    initializer(encoder)
    tokens = torch.ones(4, 8, 7)
    padded_tokens = torch.nn.functional.pad(tokens.transpose(1, 2), (0, 2), value=5).transpose(1, 2)
    mask = (
        torch.where(padded_tokens == 5, torch.zeros_like(padded_tokens), torch.ones_like(padded_tokens))
        .bool()
        .any(dim=2)
    )
    regular_output = encoder.forward(tokens=tokens, mask=None)
    masked_output = encoder.forward(tokens=padded_tokens, mask=mask)
    assert_almost_equal(regular_output.data.numpy(), masked_output.data.numpy(), decimal=6)
def test_can_construct_from_params(self):
    params = Params({"embedding_dim": 5, "num_filters": 4, "ngram_filter_sizes": [3, 5]})
    encoder = CnnEncoder.from_params(params)
    assert encoder.get_output_dim() == 8

    params = Params(
        {"embedding_dim": 5, "num_filters": 4, "ngram_filter_sizes": [3, 5], "output_dim": 7}
    )
    encoder = CnnEncoder.from_params(params)
    assert encoder.get_output_dim() == 7
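# A minimal standalone sketch (assuming AllenNLP is installed; not from the
# original test file) of the rule the test above exercises: with no output_dim,
# CnnEncoder's output dimension is num_filters * len(ngram_filter_sizes);
# passing output_dim adds a linear projection to that size.
from allennlp.modules.seq2vec_encoders import CnnEncoder

encoder = CnnEncoder(embedding_dim=5, num_filters=4, ngram_filter_sizes=(3, 5))
assert encoder.get_output_dim() == 4 * 2  # one pooled value per filter, concatenated

projected = CnnEncoder(embedding_dim=5, num_filters=4, ngram_filter_sizes=(3, 5), output_dim=7)
assert projected.get_output_dim() == 7  # an extra Linear(8, 7) on top of the pooled features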
def __init__(self,
             embedder: TextFieldEmbedder,
             embedding_size: int,
             num_filters: int,
             vocab: Vocabulary) -> None:
    super().__init__(vocab)
    self.embedder = embedder
    self.encoder = CnnEncoder(embedding_size, num_filters=num_filters)
    self.linear = torch.nn.Linear(
        in_features=self.encoder.get_output_dim(),
        out_features=vocab.get_vocab_size('labels'))
    self.loss_function = torch.nn.CrossEntropyLoss()
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             dropout: float = 0.0,
             input_dropout: float = 0.0,
             label_smoothing: float = 0.1,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super(SentimentClassifier, self).__init__(vocab, regularizer)

    self._text_field_embedder = text_field_embedder
    share_rnn = nn.LSTM(input_size=self._text_field_embedder.get_output_dim(),
                        hidden_size=150,
                        batch_first=True,
                        # dropout=dropout,
                        bidirectional=True)
    share_encoder = PytorchSeq2SeqWrapper(share_rnn)

    self._encoder = RNNEncoder(vocab, share_encoder, input_dropout, regularizer)
    self._seq_vec = CnnEncoder(self._encoder.get_output_dim(), 25)
    self._de_dim = len(TASKS_NAME)
    weight = torch.empty(self._de_dim, self._text_field_embedder.get_output_dim())
    torch.nn.init.orthogonal_(weight)
    self._domain_embeddings = Embedding(
        self._de_dim, self._text_field_embedder.get_output_dim(), weight=weight)
    self._de_attention = BilinearAttention(self._seq_vec.get_output_dim(),
                                           self._domain_embeddings.get_output_dim())
    self._de_feedforward = FeedForward(self._domain_embeddings.get_output_dim(), 1,
                                       self._seq_vec.get_output_dim(),
                                       Activation.by_name("elu")())

    self._num_classes = self.vocab.get_vocab_size("label")
    self._sentiment_discriminator = Discriminator(self._seq_vec.get_output_dim(),
                                                  self._num_classes)
    self._s_domain_discriminator = Discriminator(self._seq_vec.get_output_dim(),
                                                 len(TASKS_NAME))
    self._valid_discriminator = Discriminator(self._domain_embeddings.get_output_dim(), 2)
    self._dropout = InputVariationalDropout(dropout)
    self._input_dropout = Dropout(input_dropout)
    self._label_smoothing = label_smoothing

    self.metrics = {
        "s_domain_acc": CategoricalAccuracy(),
        "valid_acc": CategoricalAccuracy()
    }
    for task_name in TASKS_NAME:
        self.metrics["{}_stm_acc".format(task_name)] = CategoricalAccuracy()

    self._loss = torch.nn.CrossEntropyLoss()
    self._domain_loss = torch.nn.CrossEntropyLoss()
    # TODO torch.nn.BCELoss
    self._valid_loss = torch.nn.BCEWithLogitsLoss()

    initializer(self)
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             pos_tag_embedding: Embedding = None,
             users_embedding: Embedding = None,
             dropout: float = 0.1,
             label_namespace: str = "labels",
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: RegularizerApplicator = None) -> None:
    super().__init__(vocab, regularizer)

    self._label_namespace = label_namespace
    self._dropout = Dropout(dropout)
    self._text_field_embedder = text_field_embedder
    self._pos_tag_embedding = pos_tag_embedding or None
    representation_dim = self._text_field_embedder.get_output_dim()
    if pos_tag_embedding is not None:
        representation_dim += self._pos_tag_embedding.get_output_dim()
    self._report_cnn = CnnEncoder(representation_dim, 25)
    self._comment_cnn = CnnEncoder(representation_dim, 25)
    lstm_input_dim = self._comment_cnn.get_output_dim()
    self._user_embedding = users_embedding or None
    if users_embedding is not None:
        lstm_input_dim += self._user_embedding.get_output_dim()
    rnn = nn.LSTM(input_size=lstm_input_dim,
                  hidden_size=150,
                  batch_first=True,
                  bidirectional=True)
    self._encoder = PytorchSeq2SeqWrapper(rnn)
    self._seq2vec = CnnEncoder(self._encoder.get_output_dim(), 25)
    self._num_class = self.vocab.get_vocab_size(self._label_namespace)
    self._bilinear_sim = BilinearSimilarity(self._encoder.get_output_dim(),
                                            self._encoder.get_output_dim())
    self._projector = FeedForward(self._seq2vec.get_output_dim(), 2,
                                  [50, self._num_class],
                                  Activation.by_name("sigmoid")(),
                                  dropout)
    self._golden_instances = None
    self._golden_instances_labels = None
    self._golden_instances_id = None
    self._metrics = {
        "accuracy": CategoricalAccuracy(),
        "f-measure": F1Measure(
            positive_label=vocab.get_token_index("feature", "labels")),
    }
    self._loss = torch.nn.CrossEntropyLoss()
    self._contrastive_loss = ContrastiveLoss()
    self._mse_loss = torch.nn.MSELoss()

    initializer(self)
def build_model(vocab: Vocabulary, bert_model: str = None) -> Model:
    if bert_model:
        embedder = BasicTextFieldEmbedder(
            {"bert": PretrainedTransformerEmbedder(model_name=bert_model,
                                                   train_parameters=True)})
        encoder = BertPooler(pretrained_model=bert_model, requires_grad=True)
    else:
        # (3) How to get vectors for each Token ID:
        # (3.1) embed each token
        token_embedding = Embedding(embedding_dim=10,
                                    num_embeddings=vocab.get_vocab_size("token_vocab"))
        # pretrained_file='https://allennlp.s3.amazonaws.com/datasets/glove/glove.6B.50d.txt.gz'

        # (3.2) embed each character in each token
        character_embedding = Embedding(embedding_dim=3,
                                        num_embeddings=vocab.get_vocab_size("character_vocab"))
        cnn_encoder = CnnEncoder(embedding_dim=3, num_filters=4, ngram_filter_sizes=[3])
        token_encoder = TokenCharactersEncoder(character_embedding, cnn_encoder)

        # (3.3) embed the POS tag of each token
        pos_tag_embedding = Embedding(embedding_dim=10,
                                      num_embeddings=vocab.get_vocab_size("pos_tag_vocab"))

        # Each TokenEmbedder embeds its input, and the results are concatenated
        # in an arbitrary (but consistent) order.
        # cf: https://docs.allennlp.org/master/api/modules/text_field_embedders/basic_text_field_embedder/
        embedder = BasicTextFieldEmbedder(
            token_embedders={"tokens": token_embedding,
                             "token_characters": token_encoder,
                             "pos_tags": pos_tag_embedding}
        )  # emb_dim = 10 + 4 + 10 = 24

        # Average the embeddings across time, rather than simply summing
        # (i.e. divide the summed embeddings by the length of the sentence).
        encoder = BagOfEmbeddingsEncoder(embedding_dim=24, averaged=True)

    return SimpleClassifier(vocab, embedder, encoder)
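# A hedged check (not part of the original snippet) of the "emb_dim = 10 + 4 + 10 = 24"
# arithmetic above: the character-level CnnEncoder contributes
# num_filters * len(ngram_filter_sizes) = 4 * 1 = 4 dims per token, and
# BasicTextFieldEmbedder concatenates its three branches.
from allennlp.modules.seq2vec_encoders import CnnEncoder

char_cnn = CnnEncoder(embedding_dim=3, num_filters=4, ngram_filter_sizes=[3])
assert char_cnn.get_output_dim() == 4
assert 10 + char_cnn.get_output_dim() + 10 == 24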
def get_encoder(input_dim, output_dim, encoder_type, args):
    if encoder_type == "bag":
        return BagOfEmbeddingsEncoder(input_dim)
    if encoder_type == "bilstm":
        return PytorchSeq2VecWrapper(
            AllenNLPSequential(torch.nn.ModuleList(
                [get_encoder(input_dim, output_dim, "bilstm-unwrapped", args)]),
                input_dim, output_dim, bidirectional=True,
                residual_connection=args.residual_connection,
                dropout=args.dropout))
    if encoder_type == "bilstm-unwrapped":
        return torch.nn.LSTM(
            input_dim,
            output_dim,
            batch_first=True,
            bidirectional=True,
            dropout=args.dropout,
        )
    if encoder_type == "cnn":
        return CnnEncoder(embedding_dim=input_dim, num_filters=output_dim)
    if encoder_type == "cnn_highway":
        filter_size: int = output_dim // 4
        return CnnHighwayEncoder(
            embedding_dim=input_dim,
            filters=[(2, filter_size), (3, filter_size), (4, filter_size), (5, filter_size)],
            projection_dim=output_dim,
            num_highway=3,
            do_layer_norm=True,
        )
    raise RuntimeError(f"Unknown encoder type={encoder_type}")
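# Hedged usage sketch for get_encoder; the `args` fields below are assumptions
# inferred from the branches above. Note a quirk of the "cnn" branch: CnnEncoder's
# default ngram_filter_sizes is (2, 3, 4, 5), so the returned encoder's output dim
# is output_dim * 4 rather than output_dim, unlike the other branches.
from types import SimpleNamespace

args = SimpleNamespace(residual_connection=False, dropout=0.1)
cnn = get_encoder(input_dim=100, output_dim=64, encoder_type="cnn", args=args)
assert cnn.get_output_dim() == 64 * 4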
def build_model(vocab: Vocabulary, args, **kwargs) -> Model:
    print("Building the model")
    vocab_size = vocab.get_vocab_size("tokens")
    EMBED_DIMS = 200
    # Turn the tokens into 200-dim embeddings, then turn the embeddings into encodings.
    if args.pretrained_WE_path:
        embedder = BasicTextFieldEmbedder(
            {"tokens": Embedding(embedding_dim=EMBED_DIMS,
                                 num_embeddings=vocab_size,
                                 pretrained_file=args.pretrained_WE_path,
                                 vocab=vocab)})
    else:
        embedder = BasicTextFieldEmbedder(
            {"tokens": Embedding(embedding_dim=EMBED_DIMS, num_embeddings=vocab_size)})

    encoder = CnnEncoder(embedding_dim=EMBED_DIMS, ngram_filter_sizes=(2, 3, 5), num_filters=5)
    # num_filters is a tad bit dangerous: the reason is that we have this many
    # filters for EACH ngram filter size.
    # encoder = BertPooler("bert-base-cased")
    # The output dim is just num_filters * len(ngram_filter_sizes).

    # Construct the regularizer applicator.
    regularizer_applicator = None
    if args.use_reg:
        l2_reg = L2Regularizer()
        regexes = [("embedder", l2_reg),
                   ("encoder", l2_reg),
                   ("classifier", l2_reg)]
        regularizer_applicator = RegularizerApplicator(regexes)

    return MortalityClassifier(vocab, embedder, encoder, regularizer_applicator, **kwargs)
def __init__(self, vocab_size, embedding_size, char_vocab_size, char_embedding_size,
             num_filter, ngram_filter_size, num_classes, bert_weight_path=False):
    super().__init__()
    self.char_embedding = nn.Embedding(char_vocab_size, char_embedding_size)
    init.uniform_(self.char_embedding.weight, -0.1, 0.1)
    if bert_weight_path:
        self.bert = PretrainedBertEmbedder(bert_weight_path)
    else:
        self.embedding = nn.Embedding(vocab_size, embedding_dim=embedding_size)
        init.uniform_(self.embedding.weight, -0.1, 0.1)
        self.bert = None
    self.cnn_encoder = CnnEncoder(char_embedding_size,
                                  num_filters=num_filter,
                                  ngram_filter_sizes=ngram_filter_size)
    self.char_encoder = TokenCharactersEncoder(self.char_embedding, self.cnn_encoder)
    if bert_weight_path:
        embedding_size = 768
    self.linear_layer = nn.Linear(embedding_size + num_filter, num_classes)
    init.xavier_normal_(self.linear_layer.weight)
def test_can_construct_from_params(self):
    params = Params({
        'embedding_dim': 5,
        'num_filters': 4,
        'ngram_filter_sizes': [3, 5]
    })
    encoder = CnnEncoder.from_params(params)
    assert encoder.get_output_dim() == 8

    params = Params({
        'embedding_dim': 5,
        'num_filters': 4,
        'ngram_filter_sizes': [3, 5],
        'output_dim': 7
    })
    encoder = CnnEncoder.from_params(params)
    assert encoder.get_output_dim() == 7
def build_model(vocab: Vocabulary, use_reg: bool = True) -> Model:
    print("Building the model")
    vocab_size = vocab.get_vocab_size("tokens")
    EMBED_DIMS = 300
    # Turn the tokens into 300-dim embeddings, then turn the embeddings into encodings.
    embedder = BasicTextFieldEmbedder({
        "tokens": Embedding(embedding_dim=EMBED_DIMS, num_embeddings=vocab_size)
    })
    encoder = CnnEncoder(
        embedding_dim=EMBED_DIMS, ngram_filter_sizes=(2, 3, 4, 5), num_filters=5
    )
    # num_filters is a tad bit dangerous: the reason is that we have this many
    # filters for EACH ngram filter size.
    # encoder = BertPooler("bert-base-cased")
    # The output dim is just num_filters * len(ngram_filter_sizes).

    # Construct the regularizer applicator.
    regularizer_applicator = None
    if use_reg:
        l2_reg = L2Regularizer()
        regexes = [("embedder", l2_reg), ("encoder", l2_reg), ("classifier", l2_reg)]
        regularizer_applicator = RegularizerApplicator(regexes)

    return DecompensationClassifier(vocab, embedder, encoder, regularizer_applicator)
def test_forward_runs_with_larger_input(self):
    encoder = CnnEncoder(embedding_dim=7,
                         num_filters=13,
                         ngram_filter_sizes=(1, 2, 3, 4, 5),
                         output_dim=30)
    tensor = torch.rand(4, 8, 7)
    assert encoder(tensor, None).size() == (4, 30)
def create_model(
    vocab: Vocabulary,
    embedding_dim: int,
    max_filter_size: int,
    num_filters: int,
    output_dim: int,
    dropout: float,
):
    model = BasicClassifier(
        text_field_embedder=BasicTextFieldEmbedder(
            {
                "tokens": Embedding(
                    embedding_dim=embedding_dim, trainable=True, vocab=vocab
                )
            }
        ),
        seq2vec_encoder=CnnEncoder(
            ngram_filter_sizes=range(2, max_filter_size + 1),  # include max_filter_size
            num_filters=num_filters,
            embedding_dim=embedding_dim,
            output_dim=output_dim,
        ),
        dropout=dropout,
        vocab=vocab,
    )
    return model
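# Hedged usage sketch for create_model; the hyperparameter values are illustrative
# assumptions, not from the original source. With max_filter_size=4 the encoder
# learns filters of widths 2, 3, and 4 and projects the 3 * 64 pooled features
# down to output_dim=128.
# model = create_model(vocab, embedding_dim=128, max_filter_size=4,
#                      num_filters=64, output_dim=128, dropout=0.2)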
def prepare_model(args, vocab):
    text_field_embedder = prepare_text_field_embedder(args, vocab)
    seq2seq_encoder = prepare_context_encoder(
        encoder_type=args.encoder_type,
        input_size=text_field_embedder.get_output_dim(),
        encoder_layer_num=args.encoder_layer,
        encoder_size=args.encoder_size,
        encoder_dropout=args.encoder_dropout)
    seq2vec_encoder = CnnEncoder(
        embedding_dim=seq2seq_encoder.get_output_dim(),
        num_filters=args.cnn_hidden,
        ngram_filter_sizes=args.cnn_window,
        conv_layer_activation=Activation.by_name('linear')())
    model = Seq2VecClassificationModel(
        vocab=vocab,
        text_field_embedder=text_field_embedder,
        seq2seq_encoder=seq2seq_encoder,
        seq2vec_encoder=seq2vec_encoder,
        dropout=args.classifier_dropout,
        classification_type=args.classification_type,
        pos_label=args.positive_label,
    )
    return model
class Discriminator(Model):
    def __init__(self,
                 embedder: TextFieldEmbedder,
                 embedding_size: int,
                 num_filters: int,
                 vocab: Vocabulary) -> None:
        super().__init__(vocab)
        self.embedder = embedder
        self.encoder = CnnEncoder(embedding_size, num_filters=num_filters)
        self.linear = torch.nn.Linear(
            in_features=self.encoder.get_output_dim(),
            out_features=vocab.get_vocab_size('labels'))
        self.loss_function = torch.nn.CrossEntropyLoss()

    def forward(self,
                tokens: Dict[str, torch.Tensor],
                label: torch.Tensor = None) -> Dict[str, torch.Tensor]:
        mask = get_text_field_mask(tokens)
        embeddings = self.embedder(tokens)
        encoder_out = self.encoder(embeddings, mask)
        logits = self.linear(encoder_out)
        output = {"logits": logits}
        # label defaults to None, so only compute the loss when one is given.
        if label is not None:
            output["loss"] = self.loss_function(logits, label)
        return output
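# Minimal shape sketch (assumed sizes, not from the original source) of what
# Discriminator.forward feeds its encoder: with the default ngram_filter_sizes
# (2, 3, 4, 5), CnnEncoder maps (batch, num_tokens, embedding_size) to
# (batch, num_filters * 4).
import torch
from allennlp.modules.seq2vec_encoders import CnnEncoder

enc = CnnEncoder(embedding_dim=50, num_filters=100)
embeddings = torch.rand(2, 9, 50)           # (batch, num_tokens, embedding_dim)
mask = torch.ones(2, 9, dtype=torch.bool)   # True for real tokens, False for padding
assert enc(embeddings, mask).shape == (2, 400)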
def test_get_dimension_is_correct(self):
    encoder = CnnEncoder(embedding_dim=5, num_filters=4, ngram_filter_sizes=(3, 5))
    assert encoder.get_output_dim() == 8
    assert encoder.get_input_dim() == 5
    encoder = CnnEncoder(
        embedding_dim=5, num_filters=4, ngram_filter_sizes=(3, 5), output_dim=7
    )
    assert encoder.get_output_dim() == 7
    assert encoder.get_input_dim() == 5
def __init__(self, vocab, encoder, attention, g, out_dim):
    super(LabelEmbedModel, self).__init__(vocab)
    _label_out_dim = 100
    self.text_encoder = encoder
    self.label_encoder = LabelEmbedding(g.num_features, _label_out_dim)
    self.tl_attn = attention(self.text_encoder.get_output_dim(),
                             self.label_encoder.get_output_dim())
    self.encoder = CnnEncoder(self.text_encoder.get_output_dim()
                              + self.label_encoder.get_output_dim(),
                              num_filters=100)
    # self.encoder = CnnEncoder(self.text_encoder.get_output_dim(),
    #                           num_filters=100)
    self._classification_layer = torch.nn.Linear(
        self.encoder.get_output_dim(), out_dim)
    self._metric = MultiLabelMetric()
    self._loss = torch.nn.BCEWithLogitsLoss()
def test_forward_does_correct_computation(self):
    encoder = CnnEncoder(embedding_dim=2, num_filters=1, ngram_filter_sizes=(1, 2))
    constant_init = Initializer.from_params(Params({"type": "constant", "val": 1.0}))
    initializer = InitializerApplicator([(".*", constant_init)])
    initializer(encoder)
    input_tensor = torch.FloatTensor([[[0.7, 0.8], [0.1, 1.5]]])
    encoder_output = encoder(input_tensor, None)
    assert_almost_equal(
        encoder_output.data.numpy(), numpy.asarray([[1.6 + 1.0, 3.1 + 1.0]]), decimal=6
    )
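# Why the expected values are 1.6 + 1.0 and 3.1 + 1.0: with every weight and bias
# set to 1.0, the width-1 filter computes sum(x_t) + 1 at each position, giving
# ReLU(0.7 + 0.8 + 1) = 2.5 and ReLU(0.1 + 1.5 + 1) = 2.6; max-pooling over time
# keeps 2.6 = 1.6 + 1.0. The width-2 filter sums its single full window:
# ReLU(0.7 + 0.8 + 0.1 + 1.5 + 1) = 4.1 = 3.1 + 1.0. The two pooled values are
# then concatenated, one per filter.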
def test_forward_does_correct_computation(self):
    encoder = CnnEncoder(embedding_dim=2, num_filters=1, ngram_filter_sizes=(1, 2))
    constant_init = lambda tensor: torch.nn.init.constant_(tensor, 1.)
    initializer = InitializerApplicator([(".*", constant_init)])
    initializer(encoder)
    input_tensor = torch.FloatTensor([[[.7, .8], [.1, 1.5]]])
    encoder_output = encoder(input_tensor, None)
    assert_almost_equal(encoder_output.data.numpy(),
                        numpy.asarray([[1.6 + 1.0, 3.1 + 1.0]]))
def test_get_dimension_is_correct(self):
    encoder = CnnEncoder(embedding_dim=5, num_filters=4, ngram_filter_sizes=(3, 5))
    assert encoder.get_output_dim() == 8
    assert encoder.get_input_dim() == 5
    encoder = CnnEncoder(embedding_dim=5, num_filters=4, ngram_filter_sizes=(3, 5),
                         output_dim=7)
    assert encoder.get_output_dim() == 7
    assert encoder.get_input_dim() == 5
def __init__(self, vocab_size, emb_dim, hid_dim):
    super().__init__()
    self.embed = nn.Embedding(vocab_size, emb_dim)
    self.cnn_encoder = CnnEncoder(emb_dim, 64, output_dim=hid_dim)
    self.fc = nn.Sequential(
        nn.Linear(hid_dim, hid_dim // 2),
        nn.LeakyReLU(),
        nn.Dropout(),
        nn.Linear(hid_dim // 2, 9)
    )
def build_simple_cnn_model(vocab: Vocabulary,
                           emb_size: int = 256,
                           output_dim: int = 256,
                           num_filters: int = 16,
                           ngram_filter_sizes: Tuple[int, ...] = (2, 3, 4, 5, 6)) -> Model:
    print("Building the model")
    vocab_size = vocab.get_vocab_size("tokens")
    embedder = BasicTextFieldEmbedder(
        {"bert_tokens": Embedding(embedding_dim=emb_size, num_embeddings=vocab_size)}
    )
    encoder = CnnEncoder(
        embedding_dim=emb_size,
        ngram_filter_sizes=ngram_filter_sizes,
        output_dim=output_dim,
        num_filters=num_filters,
    )
    return SimpleClassifier(vocab, embedder, encoder)
class LabelEmbedModel(Model):
    def __init__(self, vocab, encoder, attention, g, out_dim):
        super(LabelEmbedModel, self).__init__(vocab)
        _label_out_dim = 100
        self.text_encoder = encoder
        self.label_encoder = LabelEmbedding(g.num_features, _label_out_dim)
        self.tl_attn = attention(self.text_encoder.get_output_dim(),
                                 self.label_encoder.get_output_dim())
        self.encoder = CnnEncoder(self.text_encoder.get_output_dim()
                                  + self.label_encoder.get_output_dim(),
                                  num_filters=100)
        # self.encoder = CnnEncoder(self.text_encoder.get_output_dim(),
        #                           num_filters=100)
        self._classification_layer = torch.nn.Linear(
            self.encoder.get_output_dim(), out_dim)
        self._metric = MultiLabelMetric()
        self._loss = torch.nn.BCEWithLogitsLoss()

    def forward(self, text, label, graph):
        text_vec = self.text_encoder(text)
        label_vec = self.label_encoder(graph[0])
        att_vec = self.tl_attn(text_vec, label_vec)
        vec = torch.cat([text_vec, att_vec], dim=-1)
        vec = self.encoder(vec, None)
        logits = self._classification_layer(vec)
        probs = torch.softmax(logits, dim=-1)
        output_dict = {"logits": logits, "probs": probs}
        if label is not None:
            self._metric(predictions=logits, gold_labels=label)
            output_dict['loss'] = self._loss(input=logits, target=label.float())
        return output_dict

    def get_metrics(self, reset: bool = False):
        metrics = {'f-score': self._metric.get_metric(reset)}
        return metrics
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             shared_encoder: Seq2SeqEncoder,
             private_encoder: Seq2SeqEncoder,
             input_dropout: float = 0.0,
             regularizer: RegularizerApplicator = None) -> None:
    super(RNNEncoder, self).__init__(vocab, regularizer)
    self._text_field_embedder = text_field_embedder
    self._shared_encoder = shared_encoder
    self._private_encoder = private_encoder
    self._seq2vec = CnnEncoder(embedding_dim=self._shared_encoder.get_output_dim(),
                               num_filters=int(shared_encoder.get_output_dim() / 4))
    self._input_dropout = Dropout(input_dropout)
    self._s_query = nn.Parameter(torch.randn(1, 100))
    self._s_att = Attention(heads=3, attn_size=300, query_size=100,
                            value_size=shared_encoder.get_output_dim(),
                            key_size=shared_encoder.get_output_dim(),
                            dropout=0.1)
    self._p_query = nn.Parameter(torch.randn(1, 100))
    self._p_att = Attention(heads=3, attn_size=300, query_size=100,
                            value_size=private_encoder.get_output_dim(),
                            key_size=private_encoder.get_output_dim(),
                            dropout=0.1)
def classification():
    # Index each token with a single id
    token_indexers = {"tokens": SingleIdTokenIndexer()}

    # Read the data
    reader = ClassificationDatasetReader(token_indexers)
    training_data = reader.read(path='data/classification/train.txt')
    validation_data = reader.read(path='data/classification/test.txt')
    test_data = reader.read(path='data/classification/test.txt')

    # Create a vocabulary
    vocabulary = Vocabulary.from_instances(training_data + validation_data + test_data)

    # Create an "Embedder" from a randomly initialized embedding layer
    embedding_layer = torch.nn.Embedding(
        num_embeddings=vocabulary.get_vocab_size('tokens'),
        embedding_dim=EMBEDDING_DIM)
    embedder = BasicTextFieldEmbedder(
        token_embedders={"tokens": embedding_layer})

    # Our text classifier will use a CNN encoder
    cnn_encoder = CnnEncoder(embedding_dim=EMBEDDING_DIM,
                             num_filters=NUM_FILTERS,
                             ngram_filter_sizes=FILTER_SIZES,
                             output_dim=NUM_FILTERS * len(FILTER_SIZES))
    model = TextClassifier(vocabulary=vocabulary,
                           embedder=embedder,
                           encoder=cnn_encoder)
    print("\nModel :\n")
    print(model)

    # Training
    train_model(model, training_data, validation_data, vocabulary)

    # Evaluation
    evaluate_classification_model(model, test_data)
def get_embedder(self, vocab, Word_embedding_dim, char_embeddedng_dim,
                 CNN_num_filters, CNN_encoder_dim):
    # The word embedding transforms every word into a "Word_embedding_dim"
    # real-valued vector, giving a tensor
    # (batch_size, max_sentence_length, Word_embedding_dim).
    indexers_dict = dict()
    if (Word_embedding_dim > 0):
        word_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size("token_ids"),
            embedding_dim=Word_embedding_dim)
        word_embedding = word_embedding.to(device=self.cf_a.device, dtype=self.cf_a.dtype)
        indexers_dict["tokens"] = word_embedding
    if (CNN_encoder_dim > 0):
        # The char embedding transforms every character into a "char_embeddedng_dim"
        # real-valued vector, giving a tensor
        # (batch_size, max_sentence_length, max_word_length, char_embeddedng_dim).
        char_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size("token_chars"),
            embedding_dim=char_embeddedng_dim)
        # The encoder applies the CNN over the max_word_length dimension,
        # giving a tensor (batch_size, max_sentence_length, num_filters * ngram_filter_sizes).
        character_cnn = CnnEncoder(ngram_filter_sizes=(1, 1),
                                   embedding_dim=char_embeddedng_dim,
                                   num_filters=CNN_num_filters,
                                   output_dim=CNN_encoder_dim)
        # We concatenate the char embedding and encoding.
        token_character_encoder = TokenCharactersEncoder(
            embedding=char_embedding, encoder=character_cnn)
        token_character_encoder = token_character_encoder.to(
            device=self.cf_a.device, dtype=self.cf_a.dtype)
        indexers_dict["chars"] = token_character_encoder

    # Finally, create the embedder, indicating which token ids it embeds.
    text_field_embedder = BasicTextFieldEmbedder(indexers_dict)
    return text_field_embedder
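# Shape flow of the character branch above, using the function's own parameters:
#   token_characters ids: (batch_size, max_sentence_length, max_word_length)
#   -> char Embedding:    (batch_size, max_sentence_length, max_word_length, char_embeddedng_dim)
#   -> CnnEncoder pools over max_word_length and projects to CNN_encoder_dim:
#                         (batch_size, max_sentence_length, CNN_encoder_dim)
# TokenCharactersEncoder applies the CnnEncoder to every token independently
# (via TimeDistributed under the hood).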
def construct_model(vocab, args):
    # token embedding
    word_embedding = Embedding.from_params(vocab=vocab, params=Params({
        "pretrained_file": "glove\\glove.vocab.100d.txt",
        "embedding_dim": 100,
        "trainable": True,
        "padding_index": 0
    }))
    word_embedding = BasicTextFieldEmbedder({
        "token_words": word_embedding
    })
    char_embedding = BasicTextFieldEmbedder({
        "token_characters": TokenCharactersEncoder(
            embedding=Embedding(embedding_dim=20, num_embeddings=262),
            encoder=CnnEncoder(embedding_dim=20, ngram_filter_sizes=[5], num_filters=50)),
    })
    lstm = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(input_size=100, num_layers=1, hidden_size=100,
                      bidirectional=True, batch_first=True))
    model = FollowUpSnippetModel(vocab=vocab,
                                 word_embedder=word_embedding,
                                 char_embedder=char_embedding,
                                 tokens_encoder=lstm,
                                 model_args=args)
    return model
def build_embeddings(args, vocab, tasks, pretrained_embs=None):
    '''Build embeddings according to options in args.'''
    d_emb, d_char = 0, args.d_char
    token_embedders = {}

    # Word embeddings
    n_token_vocab = vocab.get_vocab_size('tokens')
    if args.word_embs != 'none':
        if args.word_embs in ['glove', 'fastText'] and pretrained_embs is not None:
            word_embs = pretrained_embs
            assert word_embs.size()[0] == n_token_vocab
            d_word = word_embs.size()[1]
            log.info("\tUsing pre-trained word embeddings: %s", str(word_embs.size()))
        else:
            log.info("\tLearning word embeddings from scratch!")
            word_embs = None
            d_word = args.d_word
        embeddings = Embedding(
            num_embeddings=n_token_vocab,
            embedding_dim=d_word,
            weight=word_embs,
            trainable=False,
            padding_index=vocab.get_token_index('@@PADDING@@'))
        token_embedders["words"] = embeddings
        d_emb += d_word
    else:
        embeddings = None
        log.info("\tNot using word embeddings!")

    # Handle CoVe
    cove_layer = None
    if args.cove:
        assert embeddings is not None
        assert args.word_embs == "glove", "CoVe requires GloVe embeddings."
        assert d_word == 300, "CoVe expects 300-dimensional GloVe embeddings."
        try:
            from .cove.cove import MTLSTM as cove_lstm
            # Have CoVe do an internal GloVe lookup, but don't add residual.
            # We'll do this manually in modules.py; see SentenceEncoder.forward().
            cove_layer = cove_lstm(n_vocab=n_token_vocab,
                                   vectors=embeddings.weight.data)
            # Control whether CoVe is trainable.
            for param in cove_layer.parameters():
                param.requires_grad = bool(args.cove_fine_tune)
            d_emb += 600  # 300 x 2 for biLSTM activations
            log.info("\tUsing CoVe embeddings!")
        except ImportError as e:
            log.info("Failed to import CoVe!")
            raise e

    # Character embeddings
    if args.char_embs:
        log.info("\tUsing character embeddings!")
        char_embeddings = Embedding(vocab.get_vocab_size('chars'), d_char)
        filter_sizes = tuple(
            [int(i) for i in args.char_filter_sizes.split(',')])
        char_encoder = CnnEncoder(d_char,
                                  num_filters=args.n_char_filters,
                                  ngram_filter_sizes=filter_sizes,
                                  output_dim=d_char)
        char_embedder = TokenCharactersEncoder(char_embeddings, char_encoder,
                                               dropout=args.dropout_embs)
        d_emb += d_char
        token_embedders["chars"] = char_embedder
    else:
        log.info("\tNot using character embeddings!")

    # If we want separate ELMo scalar weights (a different ELMo representation for
    # each classifier), then we need to count and reliably map each classifier to
    # an index used by AllenNLP's internal ELMo.
    if args.sep_embs_for_skip:
        # Determine a deterministic list of classifier names to use for each task.
        classifiers = sorted(set(map(lambda x: x._classifier_name, tasks)))
        # Reload the existing classifier map, if it exists.
        classifier_save_path = args.run_dir + "/classifier_task_map.json"
        if os.path.isfile(classifier_save_path):
            loaded_classifiers = json.load(
                open(args.run_dir + "/classifier_task_map.json", 'r'))
        else:
            # No file exists, so assume we are just starting to pretrain. If pretraining
            # is to be skipped, this assertion can be bypassed by explicitly allowing
            # a missing classifier task map.
            assert_for_log(
                args.do_pretrain or args.allow_missing_task_map,
                "Error: {} should already exist.".format(classifier_save_path))
            if args.allow_missing_task_map:
                log.warning("Warning: classifier task map not found in model"
                            " directory. Creating a new one from scratch.")
            loaded_classifiers = {"@pretrain@": 0}  # default is always @pretrain@
        # Add the new tasks and update the map, keeping the internal ELMo index consistent.
        max_number_classifiers = max(loaded_classifiers.values())
        offset = 1
        for classifier in classifiers:
            if classifier not in loaded_classifiers:
                loaded_classifiers[classifier] = max_number_classifiers + offset
                offset += 1
        log.info("Classifiers:{}".format(loaded_classifiers))
        open(classifier_save_path, 'w+').write(json.dumps(loaded_classifiers))
        # Every index in classifiers needs to correspond to a valid ELMo output representation.
        num_reps = 1 + max(loaded_classifiers.values())
    else:
        # All tasks share the same scalars.
        # Not used if self.elmo_chars_only = 1 (i.e. no ELMo).
        loaded_classifiers = {"@pretrain@": 0}
        num_reps = 1

    if args.elmo:
        log.info("Loading ELMo from files:")
        log.info("ELMO_OPT_PATH = %s", ELMO_OPT_PATH)
        if args.elmo_chars_only:
            log.info("\tUsing ELMo character CNN only!")
            log.info("ELMO_WEIGHTS_PATH = %s", ELMO_WEIGHTS_PATH)
            elmo_embedder = ElmoCharacterEncoder(options_file=ELMO_OPT_PATH,
                                                 weight_file=ELMO_WEIGHTS_PATH,
                                                 requires_grad=False)
            d_emb += 512
        else:
            log.info("\tUsing full ELMo! (separate scalars/task)")
            if args.elmo_weight_file_path != 'none':
                assert os.path.exists(args.elmo_weight_file_path), \
                    "ELMo weight file path \"" + args.elmo_weight_file_path + "\" does not exist."
                weight_file = args.elmo_weight_file_path
            else:
                weight_file = ELMO_WEIGHTS_PATH
            log.info("ELMO_WEIGHTS_PATH = %s", weight_file)
            elmo_embedder = ElmoTokenEmbedderWrapper(
                options_file=ELMO_OPT_PATH,
                weight_file=weight_file,
                num_output_representations=num_reps,
                # Dropout is added by the sentence encoder later.
                dropout=0.)
            d_emb += 1024
        token_embedders["elmo"] = elmo_embedder

    # Wrap ELMo and the other embedders, concatenating the resulting
    # representations along the last (vector) dimension.
    embedder = ElmoTextFieldEmbedder(token_embedders, loaded_classifiers,
                                     elmo_chars_only=args.elmo_chars_only,
                                     sep_embs_for_skip=args.sep_embs_for_skip)

    assert d_emb, "You turned off all the embeddings, ya goof!"
    return d_emb, embedder, cove_layer
def train_nli(train_dataset, validation_dataset, batch_size, num_filters, filter_sizes,
              use_elmo=False, epochs=30, patience=5, learning_rate=3e-4, num_classes=2,
              use_gpu=False):
    """
    Trains a Natural Language Inference (InferSent) inspired architecture.

    Reply and Context are separately encoded using CNN and GloVe embeddings
    (or optionally ELMo to dynamically compute embeddings). The CNN has one
    convolution layer for each ngram filter size.

    Parameters
    ----------
    train_dataset: List[Instance]
        Instances for training set
    validation_dataset: List[Instance]
        Instances for validation set
    batch_size: int
        number of Instances to process in a batch
    num_filters: int
        output dim for each convolutional layer, which is the number of
        'filters' learned by that layer
    filter_sizes: Tuple[int]
        specifies the number of convolutional layers and their sizes
    use_elmo: bool
        use ELMo embeddings (transfer learning) if True | GloVe if False
    epochs: int
        total number of epochs to train on (default=30)
    patience: int or None
        early stopping - number of epochs to wait for validation loss to improve
        (default=5). If None: disables early stopping, and uses train+validation
        set for training
    learning_rate: float
        learning rate for Adam Optimizer
    num_classes: int
        default=2 for binary classification
    use_gpu: bool
        True to use the GPU

    Returns
    -------
    Trained Model, Vocabulary, Number of actual training epochs
    """
    if use_elmo:
        vocab = Vocabulary()
        word_embeddings: TextFieldEmbedder = load_elmo_embeddings(large=True)
    else:
        vocab = Vocabulary.from_instances(train_dataset + validation_dataset)
        word_embeddings: TextFieldEmbedder = load_glove_embeddings(vocab)

    iterator = BucketIterator(batch_size=batch_size,
                              sorting_keys=[("reply_tokens", "num_tokens"),
                                            ("context_tokens", "num_tokens")])
    iterator.index_with(vocab)  # numericalize the data

    # CNN encoders:
    cnn_reply: Seq2VecEncoder = CnnEncoder(embedding_dim=word_embeddings.get_output_dim(),
                                           num_filters=num_filters,
                                           ngram_filter_sizes=filter_sizes)
    cnn_context: Seq2VecEncoder = CnnEncoder(embedding_dim=word_embeddings.get_output_dim(),
                                             num_filters=num_filters,
                                             ngram_filter_sizes=filter_sizes)

    # Feedforward: 4x because we concatenate the two encodings with their
    # element-wise absolute difference and element-wise product.
    classifier_feedforward: FeedForward = nn.Linear(4 * cnn_reply.get_output_dim(),
                                                    num_classes)

    model = models.InferModel(vocab=vocab,
                              word_embeddings=word_embeddings,
                              reply_encoder=cnn_reply,
                              context_encoder=cnn_context,
                              classifier_feedforward=classifier_feedforward)
    if use_gpu:
        model.cuda()

    optimizer = optim.Adam(model.parameters(), learning_rate)

    if patience is None:
        # No early stopping: train on both train+validation datasets.
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            iterator=iterator,
            train_dataset=train_dataset + validation_dataset,
            cuda_device=0 if use_gpu else -1,
            num_epochs=epochs)
    else:
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            iterator=iterator,
            train_dataset=train_dataset,
            validation_dataset=validation_dataset,
            cuda_device=0 if use_gpu else -1,
            patience=patience,  # stop if loss does not improve for 'patience' epochs
            num_epochs=epochs)

    metrics = trainer.train()
    print(metrics)

    return model, vocab, metrics['training_epochs']
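# Sketch (assumed names and sizes, not from the original source) of the
# InferSent-style feature vector that motivates the 4x input dim of the
# feedforward above:
import torch

u = torch.rand(2, 400)  # pooled reply encoding, e.g. cnn_reply's output
v = torch.rand(2, 400)  # pooled context encoding
features = torch.cat([u, v, torch.abs(u - v), u * v], dim=-1)
assert features.shape == (2, 4 * 400)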
def train_cnn(train_dataset, validation_dataset, batch_size, num_filters, filter_sizes,
              double_input=False, dense_vector=False, col_name=None, use_elmo=False,
              epochs=30, patience=5, learning_rate=3e-4, num_classes=2, use_gpu=False):
    """
    Trains CNN on train_dataset; optionally, performs early stopping based on
    validation loss. Initialises word embeddings with pre-trained GloVe OR uses
    a pre-trained ELMo model to dynamically compute embeddings. The CNN has one
    convolution layer for each ngram filter size.

    Can be run for (1) Single Input: reply/question, (2) Double Input: reply +
    context comment, (3) Dense Vector + reply/question, and (4) Dense Vector +
    reply + context comment.

    Parameters
    ----------
    train_dataset: List[Instance]
        Instances for training set
    validation_dataset: List[Instance]
        Instances for validation set
    batch_size: int
        number of Instances to process in a batch
    num_filters: int
        output dim for each convolutional layer, which is the number of
        'filters' learned by that layer
    filter_sizes: Tuple[int]
        specifies the number of convolutional layers and their sizes
    double_input: bool
        True to run DoubleInput classifier | False (default) for SingleInput classifier
    dense_vector: bool
        True to concatenate dense feature vector before feeding to the FeedForward layer
    col_name: str
        'reply_text' or 'question' (for calculating dense feature vector) |
        Only applicable when dense_vector is True
    use_elmo: bool
        use ELMo embeddings (transfer learning) if True | GloVe if False
    epochs: int
        total number of epochs to train on (default=30)
    patience: int or None
        early stopping - number of epochs to wait for validation loss to improve
        (default=5). If None: disables early stopping, and uses train+validation
        set for training
    learning_rate: float
        learning rate for Adam Optimizer
    num_classes: int
        default=2 for binary classification
    use_gpu: bool
        True to use the GPU

    Returns
    -------
    Trained Model, Vocabulary, Number of actual training epochs
    """
    if use_elmo:
        vocab = Vocabulary()
        word_embeddings: TextFieldEmbedder = load_elmo_embeddings()
    else:
        vocab = Vocabulary.from_instances(train_dataset + validation_dataset)
        word_embeddings: TextFieldEmbedder = load_glove_embeddings(vocab)

    if double_input:  # need context_tokens as well
        iterator = BucketIterator(batch_size=batch_size,
                                  sorting_keys=[("reply_tokens", "num_tokens"),
                                                ("context_tokens", "num_tokens")])
    else:  # only reply_tokens
        iterator = BucketIterator(batch_size=batch_size,
                                  sorting_keys=[("reply_tokens", "num_tokens")])
    iterator.index_with(vocab)  # numericalize the data

    if double_input:
        # DoubleInput classifier: two CNN encoders
        cnn_reply: Seq2VecEncoder = CnnEncoder(embedding_dim=word_embeddings.get_output_dim(),
                                               num_filters=num_filters,
                                               ngram_filter_sizes=filter_sizes)
        cnn_context: Seq2VecEncoder = CnnEncoder(embedding_dim=word_embeddings.get_output_dim(),
                                                 num_filters=num_filters,
                                                 ngram_filter_sizes=filter_sizes)
        if dense_vector:
            # Add length of dense vector to input dimension of FeedForward.
            ff_input_dim = 2 * (cnn_reply.get_output_dim() + DENSE_VECTOR_LEN)
            classifier_feedforward: FeedForward = nn.Linear(ff_input_dim, num_classes)
            model = models.DenseDoubleClassifier(vocab=vocab,
                                                 word_embeddings=word_embeddings,
                                                 reply_encoder=cnn_reply,
                                                 context_encoder=cnn_context,
                                                 classifier_feedforward=classifier_feedforward,
                                                 col_name=col_name)
        else:
            classifier_feedforward: FeedForward = nn.Linear(2 * cnn_reply.get_output_dim(),
                                                            num_classes)
            model = models.DoubleInputClassifier(vocab=vocab,
                                                 word_embeddings=word_embeddings,
                                                 reply_encoder=cnn_reply,
                                                 context_encoder=cnn_context,
                                                 classifier_feedforward=classifier_feedforward)
    else:
        # SingleInput classifier: one CNN encoder
        encoder: Seq2VecEncoder = CnnEncoder(embedding_dim=word_embeddings.get_output_dim(),
                                             num_filters=num_filters,
                                             ngram_filter_sizes=filter_sizes)
        if dense_vector:
            # Add length of dense vector to input dimension of FeedForward.
            ff_input_dim = encoder.get_output_dim() + DENSE_VECTOR_LEN
            classifier_feedforward: FeedForward = nn.Linear(ff_input_dim, num_classes)
            model = models.DenseSingleClassifier(vocab=vocab,
                                                 word_embeddings=word_embeddings,
                                                 encoder=encoder,
                                                 classifier_feedforward=classifier_feedforward,
                                                 col_name=col_name)
        else:
            classifier_feedforward: FeedForward = nn.Linear(encoder.get_output_dim(),
                                                            num_classes)
            model = models.SingleInputClassifier(vocab=vocab,
                                                 word_embeddings=word_embeddings,
                                                 encoder=encoder,
                                                 classifier_feedforward=classifier_feedforward)

    if use_gpu:
        model.cuda()

    optimizer = optim.Adam(model.parameters(), learning_rate)

    if patience is None:
        # Train on both train+validation datasets.
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            iterator=iterator,
            train_dataset=train_dataset + validation_dataset,
            cuda_device=0 if use_gpu else -1,
            num_epochs=epochs)
    else:
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            iterator=iterator,
            train_dataset=train_dataset,
            validation_dataset=validation_dataset,
            cuda_device=0 if use_gpu else -1,
            patience=patience,  # stop if loss does not improve for 'patience' epochs
            num_epochs=epochs)

    metrics = trainer.train()
    print(metrics)

    return model, vocab, metrics['training_epochs']