def __init__(
        self,
        vocab: Vocabulary,
        text_field_embedder: TextFieldEmbedder,
        final_feedforward: FeedForward,
        initializer: InitializerApplicator = InitializerApplicator(),
        regularizer: Optional[RegularizerApplicator] = None,
    ) -> None:

        super().__init__(vocab, regularizer)

        # Model components
        self._embedder = text_field_embedder
        self._feed_forward = final_feedforward

        self._claim_encoder = CnnEncoder(embedding_dim=50,
                                         num_filters=100,
                                         output_dim=50)
        self._evidence_encoder = CnnEncoder(embedding_dim=50,
                                            num_filters=100,
                                            output_dim=50)
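        # With CnnEncoder's default ngram_filter_sizes of (2, 3, 4, 5), each encoder
        # max-pools 100 filters per size (400 features) and projects them down to
        # output_dim=50.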

        # Accuracy metric and loss function for training/evaluation of the model
        self._accuracy = CategoricalAccuracy()
        self._loss = nn.CrossEntropyLoss()

        # Initialize weights
        initializer(self)
 def test_forward_respects_masking(self):
     # seed 1 fails on the old cnn encoder code
     torch.manual_seed(1)
     encoder = CnnEncoder(embedding_dim=7,
                          num_filters=13,
                          ngram_filter_sizes=(1, 2, 3, 4, 5))
     init = Initializer.from_params(
         Params({
             "type": "normal",
             "mean": 0.0,
             "std": 10
         }))
     initializer = InitializerApplicator([(".*", init)])
     initializer(encoder)
     tokens = torch.ones(4, 8, 7)
     padded_tokens = torch.nn.functional.pad(tokens.transpose(1, 2), (0, 2),
                                             value=5).transpose(1, 2)
     mask = (torch.where(padded_tokens == 5,
                         torch.zeros_like(padded_tokens),
                         torch.ones_like(padded_tokens)).bool().any(dim=2))
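     # mask is True for the 8 original positions and False for the 2 padded ones, so a
     # correctly masked encoder should reproduce the unmasked output on the real tokens.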
     regular_output = encoder.forward(tokens=tokens, mask=None)
     masked_output = encoder.forward(tokens=padded_tokens, mask=mask)
     assert_almost_equal(regular_output.data.numpy(),
                         masked_output.data.numpy(),
                         decimal=6)
 def test_can_construct_from_params(self):
     params = Params({"embedding_dim": 5, "num_filters": 4, "ngram_filter_sizes": [3, 5]})
     encoder = CnnEncoder.from_params(params)
     assert encoder.get_output_dim() == 8
     params = Params(
         {"embedding_dim": 5, "num_filters": 4, "ngram_filter_sizes": [3, 5], "output_dim": 7}
     )
     encoder = CnnEncoder.from_params(params)
     assert encoder.get_output_dim() == 7
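The expected dimensions follow from how CnnEncoder pools its filters: each ngram filter size contributes num_filters features, so without output_dim the encoder returns num_filters * len(ngram_filter_sizes), and output_dim adds a final linear projection. The arithmetic behind the assertions above, as a quick sketch:

num_filters = 4
ngram_filter_sizes = [3, 5]
assert num_filters * len(ngram_filter_sizes) == 8   # default output width
# with output_dim=7, a final linear layer projects those 8 features down to 7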
Example #4
    def __init__(self, embedder: TextFieldEmbedder, embedding_size: int,
                 num_filters: int, vocab: Vocabulary) -> None:
        super().__init__(vocab)
        self.embedder = embedder

        self.encoder = CnnEncoder(embedding_size, num_filters=num_filters)

        self.linear = torch.nn.Linear(
            in_features=self.encoder.get_output_dim(),
            out_features=vocab.get_vocab_size('labels'))

        self.loss_function = torch.nn.CrossEntropyLoss()
Example #5
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 dropout: float = 0.0,
                 input_dropout: float = 0.0,
                 label_smoothing: float = 0.1,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(SentimentClassifier, self).__init__(vocab, regularizer)

        self._text_field_embedder = text_field_embedder

        share_rnn = nn.LSTM(input_size=self._text_field_embedder.get_output_dim(),
                            hidden_size=150,
                            batch_first=True,
                            # dropout=dropout,
                            bidirectional=True)
        share_encoder = PytorchSeq2SeqWrapper(share_rnn)

        self._encoder = RNNEncoder(vocab, share_encoder, input_dropout, regularizer)
        self._seq_vec = CnnEncoder(self._encoder.get_output_dim(), 25)
        self._de_dim = len(TASKS_NAME)
        weight = torch.empty(self._de_dim, self._text_field_embedder.get_output_dim())
        torch.nn.init.orthogonal_(weight)
        self._domain_embeddings = Embedding(self._de_dim, self._text_field_embedder.get_output_dim(), weight=weight)
        self._de_attention = BilinearAttention(self._seq_vec.get_output_dim(),
                                               self._domain_embeddings.get_output_dim())
        self._de_feedforward = FeedForward(self._domain_embeddings.get_output_dim(), 1,
                                           self._seq_vec.get_output_dim(), Activation.by_name("elu")())

        self._num_classes = self.vocab.get_vocab_size("label")
        self._sentiment_discriminator = Discriminator(self._seq_vec.get_output_dim(), self._num_classes)
        self._s_domain_discriminator = Discriminator(self._seq_vec.get_output_dim(), len(TASKS_NAME))
        self._valid_discriminator = Discriminator(self._domain_embeddings.get_output_dim(), 2)
        self._dropout = InputVariationalDropout(dropout)
        self._input_dropout = Dropout(input_dropout)
        self._label_smoothing = label_smoothing

        self.metrics = {
            "s_domain_acc": CategoricalAccuracy(),
            "valid_acc": CategoricalAccuracy()
        }
        for task_name in TASKS_NAME:
            self.metrics["{}_stm_acc".format(task_name)] = CategoricalAccuracy()

        self._loss = torch.nn.CrossEntropyLoss()
        self._domain_loss = torch.nn.CrossEntropyLoss()
        # TODO torch.nn.BCELoss
        self._valid_loss = torch.nn.BCEWithLogitsLoss()

        initializer(self)
Example #6
 def __init__(self,
              vocab: Vocabulary,
              text_field_embedder: TextFieldEmbedder,
              pos_tag_embedding: Embedding = None,
              users_embedding: Embedding = None,
              dropout: float = 0.1,
              label_namespace: str = "labels",
              initializer: InitializerApplicator = InitializerApplicator(),
              regularizer: RegularizerApplicator = None) -> None:
     super().__init__(vocab, regularizer)
     self._label_namespace = label_namespace
     self._dropout = Dropout(dropout)
     self._text_field_embedder = text_field_embedder
     self._pos_tag_embedding = pos_tag_embedding or None
     representation_dim = self._text_field_embedder.get_output_dim()
     if pos_tag_embedding is not None:
         representation_dim += self._pos_tag_embedding.get_output_dim()
     self._report_cnn = CnnEncoder(representation_dim, 25)
     self._comment_cnn = CnnEncoder(representation_dim, 25)
     lstm_input_dim = self._comment_cnn.get_output_dim()
     self._user_embedding = users_embedding or None
     if users_embedding is not None:
         lstm_input_dim += self._user_embedding.get_output_dim()
     rnn = nn.LSTM(input_size=lstm_input_dim,
                   hidden_size=150,
                   batch_first=True,
                   bidirectional=True)
     self._encoder = PytorchSeq2SeqWrapper(rnn)
     self._seq2vec = CnnEncoder(self._encoder.get_output_dim(), 25)
     self._num_class = self.vocab.get_vocab_size(self._label_namespace)
     self._bilinear_sim = BilinearSimilarity(self._encoder.get_output_dim(),
                                             self._encoder.get_output_dim())
     self._projector = FeedForward(self._seq2vec.get_output_dim(), 2,
                                   [50, self._num_class],
                                   Activation.by_name("sigmoid")(), dropout)
     self._golden_instances = None
     self._golden_instances_labels = None
     self._golden_instances_id = None
     self._metrics = {
         "accuracy":
         CategoricalAccuracy(),
         "f-measure":
         F1Measure(
             positive_label=vocab.get_token_index("feature", "labels")),
     }
     self._loss = torch.nn.CrossEntropyLoss()
     self._contrastive_loss = ContrastiveLoss()
     self._mse_loss = torch.nn.MSELoss()
     initializer(self)
Example #7
def build_model(vocab: Vocabulary, bert_model: str = None) -> Model:
    if bert_model:
        embedder = BasicTextFieldEmbedder({"bert": PretrainedTransformerEmbedder(model_name=bert_model,
                                                                                 train_parameters=True)})
        encoder = BertPooler(pretrained_model=bert_model, requires_grad=True)
    else:
        # (3) How to get vectors for each Token ID:
        # (3.1) embed each token
        token_embedding = Embedding(embedding_dim=10, num_embeddings=vocab.get_vocab_size("token_vocab"))
        # pretrained_file='https://allennlp.s3.amazonaws.com/datasets/glove/glove.6B.50d.txt.gz'

        # (3.2) embed each character in each token
        character_embedding = Embedding(embedding_dim=3, num_embeddings=vocab.get_vocab_size("character_vocab"))
        cnn_encoder = CnnEncoder(embedding_dim=3, num_filters=4, ngram_filter_sizes=[3,])
        token_encoder = TokenCharactersEncoder(character_embedding, cnn_encoder)
        # (3.3) embed the POS of each token
        pos_tag_embedding = Embedding(embedding_dim=10, num_embeddings=vocab.get_vocab_size("pos_tag_vocab"))

        # Each TokenEmbedder embeds its input, and the results are concatenated in an arbitrary (but consistent) order
        # cf: https://docs.allennlp.org/master/api/modules/text_field_embedders/basic_text_field_embedder/
        embedder = BasicTextFieldEmbedder(
            token_embedders={"tokens": token_embedding,
                             "token_characters": token_encoder,
                             "pos_tags": pos_tag_embedding}
        )  # emb_dim = 10 + 4 + 10 = 24
        encoder = BagOfEmbeddingsEncoder(embedding_dim=24, averaged=True)
        #                                                  ^
        # average the embeddings across time, rather than simply summing
        # (i.e. we divide the summed embeddings by the length of the sentence).
    return SimpleClassifier(vocab, embedder, encoder)
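The 24 in the comment above includes 4 features from the character CNN; here is a quick check of that piece (a sketch, assuming the same allennlp import paths used by the snippets on this page):

from allennlp.modules.seq2vec_encoders import CnnEncoder

# With a single ngram filter size, the char CNN outputs exactly num_filters features.
char_cnn = CnnEncoder(embedding_dim=3, num_filters=4, ngram_filter_sizes=[3])
assert char_cnn.get_output_dim() == 4
# 10 (word) + 4 (char CNN) + 10 (POS tag) = 24, matching BagOfEmbeddingsEncoder(embedding_dim=24).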
def get_encoder(input_dim, output_dim, encoder_type, args):
    if encoder_type == "bag":
        return BagOfEmbeddingsEncoder(input_dim)
    if encoder_type == "bilstm":
        return PytorchSeq2VecWrapper(
            AllenNLPSequential(torch.nn.ModuleList(
                [get_encoder(input_dim, output_dim, "bilstm-unwrapped",
                             args)]),
                               input_dim,
                               output_dim,
                               bidirectional=True,
                               residual_connection=args.residual_connection,
                               dropout=args.dropout))
    if encoder_type == "bilstm-unwrapped":
        return torch.nn.LSTM(
            input_dim,
            output_dim,
            batch_first=True,
            bidirectional=True,
            dropout=args.dropout,
        )
    if encoder_type == "cnn":
        return CnnEncoder(embedding_dim=input_dim, num_filters=output_dim)
    if encoder_type == "cnn_highway":
        filter_size: int = output_dim // 4
        return CnnHighwayEncoder(
            embedding_dim=input_dim,
            filters=[(2, filter_size), (3, filter_size), (4, filter_size),
                     (5, filter_size)],
            projection_dim=output_dim,
            num_highway=3,
            do_layer_norm=True,
        )
    raise RuntimeError(f"Unknown encoder type={encoder_type}")
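A hypothetical call for the "cnn" branch (the argument values below are illustrative, not from the source; args is only consulted by the LSTM branches):

from argparse import Namespace

args = Namespace(dropout=0.2, residual_connection=False)
cnn = get_encoder(input_dim=300, output_dim=64, encoder_type="cnn", args=args)
# output_dim is used here as num_filters, so with CnnEncoder's default (2, 3, 4, 5)
# ngram sizes the encoder's real output width is 64 * 4 = 256 rather than 64.
assert cnn.get_output_dim() == 64 * 4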
def build_model(vocab: Vocabulary,
                args,
                **kwargs) -> Model:
    print("Building the model")
    vocab_size = vocab.get_vocab_size("tokens")
    EMBED_DIMS = 200

    if args.pretrained_WE_path:
        # turn the tokens into EMBED_DIMS-dimensional embeddings initialized from the pretrained file,
        # then turn the embeddings into encodings
        embedder = BasicTextFieldEmbedder(
            {"tokens": Embedding(embedding_dim=EMBED_DIMS, num_embeddings=vocab_size,
                                 pretrained_file=args.pretrained_WE_path, vocab=vocab, )})

    else:
        embedder = BasicTextFieldEmbedder(
            {"tokens": Embedding(embedding_dim=EMBED_DIMS, num_embeddings=vocab_size)})

    encoder = CnnEncoder(embedding_dim=EMBED_DIMS, ngram_filter_sizes=(2, 3, 5),
                         num_filters=5)  # num_filters is a tad bit dangerous: we get this many filters for EACH ngram filter size

    # encoder = BertPooler("bert-base-cased")
    # the output dim is just num_filters * len(ngram_filter_sizes)

    # construct the regularizer applicator
    regularizer_applicator = None
    if args.use_reg:
        l2_reg = L2Regularizer()
        regexes = [("embedder", l2_reg),
                   ("encoder", l2_reg),
                   ("classifier", l2_reg)
                   ]
        regularizer_applicator = RegularizerApplicator(regexes)

    return MortalityClassifier(vocab, embedder, encoder, regularizer_applicator, **kwargs)
Example #10
    def __init__(self,
                 vocab_size,
                 embedding_size,
                 char_vocab_size,
                 char_embedding_size,
                 num_filter,
                 ngram_filter_size,
                 num_classes,
                 bert_weight_path=False):
        super().__init__()

        self.char_embedding = nn.Embedding(char_vocab_size,
                                           char_embedding_size)
        init.uniform_(self.char_embedding.weight, -0.1, 0.1)

        if bert_weight_path:
            self.bert = PretrainedBertEmbedder(bert_weight_path)
        else:
            self.embedding = nn.Embedding(vocab_size,
                                          embedding_dim=embedding_size)
            init.uniform_(self.embedding.weight, -0.1, 0.1)
            self.bert = None
        self.cnn_encoder = CnnEncoder(char_embedding_size,
                                      num_filters=num_filter,
                                      ngram_filter_sizes=ngram_filter_size)
        self.char_encoder = TokenCharactersEncoder(self.char_embedding,
                                                   self.cnn_encoder)
        if bert_weight_path:
            embedding_size = 768
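        # Note: the char encoder contributes num_filter features per entry in
        # ngram_filter_size, so this in_features count assumes a single filter size
        # (and that the char and word/BERT features are concatenated downstream).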
        self.linear_layer = nn.Linear(embedding_size + num_filter, num_classes)
        init.xavier_normal_(self.linear_layer.weight)
Example #11
 def test_can_construct_from_params(self):
     params = Params({
             'embedding_dim': 5,
             'num_filters': 4,
             'ngram_filter_sizes': [3, 5]
             })
     encoder = CnnEncoder.from_params(params)
     assert encoder.get_output_dim() == 8
     params = Params({
             'embedding_dim': 5,
             'num_filters': 4,
             'ngram_filter_sizes': [3, 5],
             'output_dim': 7
             })
     encoder = CnnEncoder.from_params(params)
     assert encoder.get_output_dim() == 7
def build_model(vocab: Vocabulary, use_reg: bool = True) -> Model:
    print("Building the model")
    vocab_size = vocab.get_vocab_size("tokens")
    EMBED_DIMS = 300
    # turn the tokens into 300 dim embedding. Then, turn the embeddings into encodings
    embedder = BasicTextFieldEmbedder({
        "tokens":
        Embedding(embedding_dim=EMBED_DIMS, num_embeddings=vocab_size)
    })
    encoder = CnnEncoder(
        embedding_dim=EMBED_DIMS,
        ngram_filter_sizes=(2, 3, 4, 5),
        num_filters=5
    )  # num_filters is a tad bit dangerous: we get this many filters for EACH ngram filter size
    # encoder = BertPooler("bert-base-cased")
    # the output dim is just num_filters * len(ngram_filter_sizes)

    # construct the regularizer applicator
    regularizer_applicator = None
    if use_reg:
        l2_reg = L2Regularizer()
        regexes = [("embedder", l2_reg), ("encoder", l2_reg),
                   ("classifier", l2_reg)]
        regularizer_applicator = RegularizerApplicator(regexes)

    return DecompensationClassifier(vocab, embedder, encoder,
                                    regularizer_applicator)
 def test_forward_runs_with_larger_input(self):
     encoder = CnnEncoder(embedding_dim=7,
                          num_filters=13,
                          ngram_filter_sizes=(1, 2, 3, 4, 5),
                          output_dim=30)
     tensor = torch.rand(4, 8, 7)
     assert encoder(tensor, None).size() == (4, 30)
def create_model(
        vocab: Vocabulary,
        embedding_dim: int,
        max_filter_size: int,
        num_filters: int,
        output_dim: int,
        dropout: float,
):
    model = BasicClassifier(
        text_field_embedder=BasicTextFieldEmbedder(
            {
                "tokens": Embedding(
                    embedding_dim=embedding_dim,
                    trainable=True,
                    vocab=vocab
                )
            }
        ),
        seq2vec_encoder=CnnEncoder(
            ngram_filter_sizes=range(2, max_filter_size),
            num_filters=num_filters,
            embedding_dim=embedding_dim,
            output_dim=output_dim,
        ),
        dropout=dropout,
        vocab=vocab,
    )
    return model
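One detail worth noting about the encoder above: Python's range excludes its upper bound, so the largest n-gram filter actually built is max_filter_size - 1.

list(range(2, 5))   # -> [2, 3, 4]; pass max_filter_size + 1 if the top size should be included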
    def prepare_model(args, vocab):
        text_field_embedder = prepare_text_field_embedder(args, vocab)

        seq2seq_encoder = prepare_context_encoder(
            encoder_type=args.encoder_type,
            input_size=text_field_embedder.get_output_dim(),
            encoder_layer_num=args.encoder_layer,
            encoder_size=args.encoder_size,
            encoder_dropout=args.encoder_dropout)

        seq2vec_encoder = CnnEncoder(
            embedding_dim=seq2seq_encoder.get_output_dim(),
            num_filters=args.cnn_hidden,
            ngram_filter_sizes=args.cnn_window,
            conv_layer_activation=Activation.by_name('linear')())

        model = Seq2VecClassificationModel(
            vocab=vocab,
            text_field_embedder=text_field_embedder,
            seq2seq_encoder=seq2seq_encoder,
            seq2vec_encoder=seq2vec_encoder,
            dropout=args.classifier_dropout,
            classification_type=args.classification_type,
            pos_label=args.positive_label,
        )

        return model
 def test_can_construct_from_params(self):
     params = Params({
         'embedding_dim': 5,
         'num_filters': 4,
         'ngram_filter_sizes': [3, 5]
     })
     encoder = CnnEncoder.from_params(params)
     assert encoder.get_output_dim() == 8
     params = Params({
         'embedding_dim': 5,
         'num_filters': 4,
         'ngram_filter_sizes': [3, 5],
         'output_dim': 7
     })
     encoder = CnnEncoder.from_params(params)
     assert encoder.get_output_dim() == 7
Example #17
class Discriminator(Model):
    def __init__(self, embedder: TextFieldEmbedder, embedding_size: int,
                 num_filters: int, vocab: Vocabulary) -> None:
        super().__init__(vocab)
        self.embedder = embedder

        self.encoder = CnnEncoder(embedding_size, num_filters=num_filters)

        self.linear = torch.nn.Linear(
            in_features=self.encoder.get_output_dim(),
            out_features=vocab.get_vocab_size('labels'))

        self.loss_function = torch.nn.CrossEntropyLoss()

    def forward(self,
                tokens: Dict[str, torch.Tensor],
                label: torch.Tensor = None) -> Dict[str, torch.Tensor]:
        mask = get_text_field_mask(tokens)

        embeddings = self.embedder(tokens)
        encoder_out = self.encoder(embeddings, mask)
        logits = self.linear(encoder_out)

        output = {"logits": logits}
        output["loss"] = self.loss_function(logits, label)

        return output
 def test_get_dimension_is_correct(self):
     encoder = CnnEncoder(embedding_dim=5, num_filters=4, ngram_filter_sizes=(3, 5))
     assert encoder.get_output_dim() == 8
     assert encoder.get_input_dim() == 5
     encoder = CnnEncoder(
         embedding_dim=5, num_filters=4, ngram_filter_sizes=(3, 5), output_dim=7
     )
     assert encoder.get_output_dim() == 7
     assert encoder.get_input_dim() == 5
Example #19
    def __init__(self, vocab, encoder, attention, g, out_dim):
        super(LabelEmbedModel, self).__init__(vocab)

        _label_out_dim = 100
        self.text_encoder = encoder
        self.label_encoder = LabelEmbedding(g.num_features, _label_out_dim)
        self.tl_attn = attention(self.text_encoder.get_output_dim(),
                                 self.label_encoder.get_output_dim())
        self.encoder = CnnEncoder(self.text_encoder.get_output_dim() +
                                  self.label_encoder.get_output_dim(),
                                  num_filters=100)
        # self.encoder = CnnEncoder(self.text_encoder.get_output_dim(),
        #                           num_filters=100)

        self._classification_layer = torch.nn.Linear(
            self.encoder.get_output_dim(), out_dim)
        self._metric = MultiLabelMetric()
        self._loss = torch.nn.BCEWithLogitsLoss()
 def test_forward_does_correct_computation(self):
     encoder = CnnEncoder(embedding_dim=2, num_filters=1, ngram_filter_sizes=(1, 2))
     constant_init = Initializer.from_params(Params({"type": "constant", "val": 1.0}))
     initializer = InitializerApplicator([(".*", constant_init)])
     initializer(encoder)
     input_tensor = torch.FloatTensor([[[0.7, 0.8], [0.1, 1.5]]])
     encoder_output = encoder(input_tensor, None)
     assert_almost_equal(
         encoder_output.data.numpy(), numpy.asarray([[1.6 + 1.0, 3.1 + 1.0]]), decimal=6
     )
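The expected values follow directly from the constant initialization: with every weight and bias set to 1.0 and CnnEncoder's default ReLU activation, each convolution position simply sums its inputs plus 1, and max pooling keeps the largest position.

# size-1 filter: two positions, max-pooled
max(0.7 + 0.8 + 1.0, 0.1 + 1.5 + 1.0)   # = 2.6 = 1.6 + 1.0
# size-2 filter: a single position covering all four values
0.7 + 0.8 + 0.1 + 1.5 + 1.0             # = 4.1 = 3.1 + 1.0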
Example #21
 def test_forward_does_correct_computation(self):
     encoder = CnnEncoder(embedding_dim=2,
                          num_filters=1,
                          ngram_filter_sizes=(1, 2))
     constant_init = lambda tensor: torch.nn.init.constant_(tensor, 1.0)
     initializer = InitializerApplicator([(".*", constant_init)])
     initializer(encoder)
     input_tensor = torch.FloatTensor([[[0.7, 0.8], [0.1, 1.5]]])
     encoder_output = encoder(input_tensor, None)
     assert_almost_equal(encoder_output.data.numpy(),
                         numpy.asarray([[1.6 + 1.0, 3.1 + 1.0]]))
Example #22
 def test_get_dimension_is_correct(self):
     encoder = CnnEncoder(embedding_dim=5, num_filters=4, ngram_filter_sizes=(3, 5))
     assert encoder.get_output_dim() == 8
     assert encoder.get_input_dim() == 5
     encoder = CnnEncoder(embedding_dim=5, num_filters=4, ngram_filter_sizes=(3, 5), output_dim=7)
     assert encoder.get_output_dim() == 7
     assert encoder.get_input_dim() == 5
Example #23
    def __init__(self, vocab_size, emb_dim, hid_dim):
        super().__init__()

        self.embed = nn.Embedding(vocab_size, emb_dim)

        self.cnn_encoder = CnnEncoder(emb_dim, 64, output_dim=hid_dim)

        self.fc = nn.Sequential(
            nn.Linear(hid_dim, hid_dim // 2),
            nn.LeakyReLU(),
            nn.Dropout(),
            nn.Linear(hid_dim // 2, 9)
        )
def build_simple_cnn_model(vocab: Vocabulary,
                           emb_size: int = 256,
                           output_dim: int = 256,
                           num_filters: int = 16,
                           ngram_filter_sizes: Tuple[int, ...] = (2, 3, 4, 5, 6)) -> Model:
    print("Building the model")
    vocab_size = vocab.get_vocab_size("tokens")
    embedder = BasicTextFieldEmbedder(
        {"bert_tokens": Embedding(embedding_dim=emb_size, num_embeddings=vocab_size)}
    )
    encoder = CnnEncoder(
        embedding_dim=emb_size, ngram_filter_sizes=ngram_filter_sizes, output_dim=output_dim, 
        num_filters=num_filters,
    )
    return SimpleClassifier(vocab, embedder, encoder)
Example #25
class LabelEmbedModel(Model):
    def __init__(self, vocab, encoder, attention, g, out_dim):
        super(LabelEmbedModel, self).__init__(vocab)

        _label_out_dim = 100
        self.text_encoder = encoder
        self.label_encoder = LabelEmbedding(g.num_features, _label_out_dim)
        self.tl_attn = attention(self.text_encoder.get_output_dim(),
                                 self.label_encoder.get_output_dim())
        self.encoder = CnnEncoder(self.text_encoder.get_output_dim() +
                                  self.label_encoder.get_output_dim(),
                                  num_filters=100)
        # self.encoder = CnnEncoder(self.text_encoder.get_output_dim(),
        #                           num_filters=100)

        self._classification_layer = torch.nn.Linear(
            self.encoder.get_output_dim(), out_dim)
        self._metric = MultiLabelMetric()
        self._loss = torch.nn.BCEWithLogitsLoss()

    def forward(self, text, label, graph):
        text_vec = self.text_encoder(text)
        label_vec = self.label_encoder(graph[0])
        att_vec = self.tl_attn(text_vec, label_vec)
        vec = torch.cat([text_vec, att_vec], dim=-1)

        vec = self.encoder(vec, None)

        logits = self._classification_layer(vec)
        probs = torch.softmax(logits, dim=-1)
        output_dict = {"logits": logits, "probs": probs}

        if label is not None:
            self._metric(predictions=logits, gold_labels=label)
            output_dict['loss'] = self._loss(input=logits,
                                             target=label.float())

        return output_dict

    def get_metrics(self, reset: bool = False):
        metrics = {'f-score': self._metric.get_metric(reset)}
        return metrics
Example #26
 def __init__(self,
              vocab: Vocabulary,
              text_field_embedder: TextFieldEmbedder,
              shared_encoder: Seq2SeqEncoder,
              private_encoder: Seq2SeqEncoder,
              input_dropout: float = 0.0,
              regularizer: RegularizerApplicator = None) -> None:
     super(RNNEncoder, self).__init__(vocab, regularizer)
     self._text_field_embedder = text_field_embedder
     self._shared_encoder = shared_encoder
     self._private_encoder = private_encoder
     self._seq2vec = CnnEncoder(embedding_dim=self._shared_encoder.get_output_dim(),
                                num_filters=int(shared_encoder.get_output_dim() / 4))
     self._input_dropout = Dropout(input_dropout)
     self._s_query = nn.Parameter(torch.randn(1, 100))
     self._s_att = Attention(heads=3, attn_size=300, query_size=100, value_size=shared_encoder.get_output_dim(),
                             key_size=shared_encoder.get_output_dim(), dropout=0.1)
     self._p_query = nn.Parameter(torch.randn(1, 100))
     self._p_att = Attention(heads=3, attn_size=300, query_size=100, value_size=private_encoder.get_output_dim(),
                             key_size=private_encoder.get_output_dim(), dropout=0.1)
Example #27
def classification():

    # Index each token with a single Id
    token_indexers = {"tokens": SingleIdTokenIndexer()}

    # Read the data
    reader = ClassificationDatasetReader(token_indexers)
    training_data = reader.read(path='data/classification/train.txt')
    validation_data = reader.read(path='data/classification/test.txt')
    test_data = reader.read(path='data/classification/test.txt')

    # Create a vocabulary
    vocabulary = Vocabulary.from_instances(training_data + validation_data +
                                           test_data)

    # Create an "Embedder" from a randomly initialized embedding layer
    embedding_layer = torch.nn.Embedding(
        num_embeddings=vocabulary.get_vocab_size('tokens'),
        embedding_dim=EMBEDDING_DIM)
    embedder = BasicTextFieldEmbedder(
        token_embedders={"tokens": embedding_layer})

    # Our text classifier will use a CNN encoder
    cnn_encoder = CnnEncoder(embedding_dim=EMBEDDING_DIM,
                             num_filters=NUM_FILTERS,
                             ngram_filter_sizes=FILTER_SIZES,
                             output_dim=NUM_FILTERS * len(FILTER_SIZES))
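    # output_dim here equals NUM_FILTERS * len(FILTER_SIZES), which is already the width
    # of the concatenated max-pooled filters, so the optional projection keeps that size.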
    model = TextClassifier(vocabulary=vocabulary,
                           embedder=embedder,
                           encoder=cnn_encoder)

    print("\nModel :\n")
    print(model)

    # Training
    train_model(model, training_data, validation_data, vocabulary)

    # Evaluation
    evaluate_classification_model(model, test_data)
Example #28
    def get_embedder(self, vocab, Word_embedding_dim, char_embeddedng_dim,
                     CNN_num_filters, CNN_encoder_dim):
        # The word embedding will transform every word to a "Word_embedding_dim" real valued vector
        # Having a tensor (batch_size, max_sentence_length, Word_embedding_dim)

        indexers_dict = dict()
        if (Word_embedding_dim > 0):
            word_embedding = Embedding(
                num_embeddings=vocab.get_vocab_size("token_ids"),
                embedding_dim=Word_embedding_dim)

            word_embedding = word_embedding.to(device=self.cf_a.device,
                                               dtype=self.cf_a.dtype)
            indexers_dict["tokens"] = word_embedding
        if (CNN_encoder_dim > 0):
            # The char embedding will transform every character into a "char_embeddedng_dim" real-valued vector
            # Having a tensor (batch_size, max_sentence_length, max_word_length, char_embeddedng_dim)
            char_embedding = Embedding(
                num_embeddings=vocab.get_vocab_size("token_chars"),
                embedding_dim=char_embeddedng_dim)
            # The encoder will apply the CNN over the max_word_length dimension
            # Having a tensor (batch_size, max_sentence_length, num_filters * ngram_filter_sizes)
            character_cnn = CnnEncoder(ngram_filter_sizes=(1, 1),
                                       embedding_dim=char_embeddedng_dim,
                                       num_filters=CNN_num_filters,
                                       output_dim=CNN_encoder_dim)

            # We wrap the char embedding and the CNN encoder together
            token_character_encoder = TokenCharactersEncoder(
                embedding=char_embedding, encoder=character_cnn)

            token_character_encoder = token_character_encoder.to(
                device=self.cf_a.device, dtype=self.cf_a.dtype)
            indexers_dict["chars"] = token_character_encoder
        # Now we finally create the text field embedder, indicating which token ids it embeds
        text_field_embedder = BasicTextFieldEmbedder(indexers_dict)

        return text_field_embedder
Example #29
def construct_model(vocab, args):
    # token embedding

    word_embedding = Embedding.from_params(vocab=vocab, params=Params({
        "pretrained_file": "glove\\glove.vocab.100d.txt",
        "embedding_dim": 100,
        "trainable": True,
        "padding_index": 0
    }))

    word_embedding = BasicTextFieldEmbedder({
        "token_words": word_embedding
    })

    char_embedding = BasicTextFieldEmbedder({
        "token_characters": TokenCharactersEncoder(embedding=Embedding(embedding_dim=20,
                                                                       num_embeddings=262),
                                                   encoder=CnnEncoder(embedding_dim=20,
                                                                      ngram_filter_sizes=[5],
                                                                      num_filters=50)),
    })

    lstm = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(input_size=100,
                      num_layers=1,
                      hidden_size=100,
                      bidirectional=True,
                      batch_first=True))

    model = FollowUpSnippetModel(vocab=vocab,
                                 word_embedder=word_embedding,
                                 char_embedder=char_embedding,
                                 tokens_encoder=lstm,
                                 model_args=args)

    return model
Example #30
def build_embeddings(args, vocab, tasks, pretrained_embs=None):
    ''' Build embeddings according to options in args '''
    d_emb, d_char = 0, args.d_char

    token_embedders = {}
    # Word embeddings
    n_token_vocab = vocab.get_vocab_size('tokens')
    if args.word_embs != 'none':
        if args.word_embs in ['glove', 'fastText'
                              ] and pretrained_embs is not None:
            word_embs = pretrained_embs
            assert word_embs.size()[0] == n_token_vocab
            d_word = word_embs.size()[1]
            log.info("\tUsing pre-trained word embeddings: %s",
                     str(word_embs.size()))
        else:
            log.info("\tLearning word embeddings from scratch!")
            word_embs = None
            d_word = args.d_word

        embeddings = Embedding(
            num_embeddings=n_token_vocab,
            embedding_dim=d_word,
            weight=word_embs,
            trainable=False,
            padding_index=vocab.get_token_index('@@PADDING@@'))
        token_embedders["words"] = embeddings
        d_emb += d_word
    else:
        embeddings = None
        log.info("\tNot using word embeddings!")

    # Handle cove
    cove_layer = None
    if args.cove:
        assert embeddings is not None
        assert args.word_embs == "glove", "CoVe requires GloVe embeddings."
        assert d_word == 300, "CoVe expects 300-dimensional GloVe embeddings."
        try:
            from .cove.cove import MTLSTM as cove_lstm
            # Have CoVe do an internal GloVe lookup, but don't add residual.
            # We'll do this manually in modules.py; see
            # SentenceEncoder.forward().
            cove_layer = cove_lstm(n_vocab=n_token_vocab,
                                   vectors=embeddings.weight.data)
            # Control whether CoVe is trainable.
            for param in cove_layer.parameters():
                param.requires_grad = bool(args.cove_fine_tune)
            d_emb += 600  # 300 x 2 for biLSTM activations
            log.info("\tUsing CoVe embeddings!")
        except ImportError as e:
            log.info("Failed to import CoVe!")
            raise e

    # Character embeddings
    if args.char_embs:
        log.info("\tUsing character embeddings!")
        char_embeddings = Embedding(vocab.get_vocab_size('chars'), d_char)
        filter_sizes = tuple(
            [int(i) for i in args.char_filter_sizes.split(',')])
        char_encoder = CnnEncoder(d_char,
                                  num_filters=args.n_char_filters,
                                  ngram_filter_sizes=filter_sizes,
                                  output_dim=d_char)
        char_embedder = TokenCharactersEncoder(char_embeddings,
                                               char_encoder,
                                               dropout=args.dropout_embs)
        d_emb += d_char
        token_embedders["chars"] = char_embedder
    else:
        log.info("\tNot using character embeddings!")

    # If we want separate ELMo scalar weights (a different ELMo representation for each
    # classifier), then we need to count and reliably map each classifier to an index
    # used by allennlp's internal ELMo.
    if args.sep_embs_for_skip:
        # Determine a deterministic list of classifier names to use for each task.
        classifiers = sorted(set(map(lambda x: x._classifier_name, tasks)))
        # Reload existing classifier map, if it exists.
        classifier_save_path = args.run_dir + "/classifier_task_map.json"
        if os.path.isfile(classifier_save_path):
            loaded_classifiers = json.load(
                open(args.run_dir + "/classifier_task_map.json", 'r'))
        else:
            # No file exists, so assuming we are just starting to pretrain. If pretrain is to be
            # skipped, then there's a way to bypass this assertion by explicitly allowing for a missing
            # classifier task map.
            assert_for_log(
                args.do_pretrain or args.allow_missing_task_map,
                "Error: {} should already exist.".format(classifier_save_path))
            if args.allow_missing_task_map:
                log.warning("Warning: classifier task map not found in model"
                            " directory. Creating a new one from scratch.")
            loaded_classifiers = {
                "@pretrain@": 0
            }  # default is always @pretrain@
        # Add the new tasks and update map, keeping the internal ELMo index consistent.
        max_number_classifiers = max(loaded_classifiers.values())
        offset = 1
        for classifier in classifiers:
            if classifier not in loaded_classifiers:
                loaded_classifiers[
                    classifier] = max_number_classifiers + offset
                offset += 1
        log.info("Classifiers:{}".format(loaded_classifiers))
        open(classifier_save_path, 'w+').write(json.dumps(loaded_classifiers))
        # Every index in classifiers needs to correspond to a valid ELMo output representation.
        num_reps = 1 + max(loaded_classifiers.values())
    else:
        # All tasks share the same scalars.
        # Not used if self.elmo_chars_only = 1 (i.e. no elmo)
        loaded_classifiers = {"@pretrain@": 0}
        num_reps = 1
    if args.elmo:
        log.info("Loading ELMo from files:")
        log.info("ELMO_OPT_PATH = %s", ELMO_OPT_PATH)
        if args.elmo_chars_only:
            log.info("\tUsing ELMo character CNN only!")
            log.info("ELMO_WEIGHTS_PATH = %s", ELMO_WEIGHTS_PATH)
            elmo_embedder = ElmoCharacterEncoder(options_file=ELMO_OPT_PATH,
                                                 weight_file=ELMO_WEIGHTS_PATH,
                                                 requires_grad=False)
            d_emb += 512
        else:
            log.info("\tUsing full ELMo! (separate scalars/task)")
            if args.elmo_weight_file_path != 'none':
                assert os.path.exists(args.elmo_weight_file_path), "ELMo weight file path \"" + \
                    args.elmo_weight_file_path + "\" does not exist."
                weight_file = args.elmo_weight_file_path
            else:
                weight_file = ELMO_WEIGHTS_PATH
            log.info("ELMO_WEIGHTS_PATH = %s", weight_file)
            elmo_embedder = ElmoTokenEmbedderWrapper(
                options_file=ELMO_OPT_PATH,
                weight_file=weight_file,
                num_output_representations=num_reps,
                # Dropout is added by the sentence encoder later.
                dropout=0.)
            d_emb += 1024

        token_embedders["elmo"] = elmo_embedder

    # Wrap ELMo and the other embedders, and concatenate the resulting
    # representations along the last (vector) dimension.
    embedder = ElmoTextFieldEmbedder(token_embedders,
                                     loaded_classifiers,
                                     elmo_chars_only=args.elmo_chars_only,
                                     sep_embs_for_skip=args.sep_embs_for_skip)

    assert d_emb, "You turned off all the embeddings, ya goof!"
    return d_emb, embedder, cove_layer
def train_nli(train_dataset, validation_dataset, batch_size, num_filters, filter_sizes, use_elmo=False, epochs=30, patience=5,
              learning_rate=3e-4, num_classes=2, use_gpu=False):
    """
    Trains a Natural Language Inference (InferSent) inspired architecture.
    Reply and Context are separately encoded using CNN and GloVe embeddings (or optionally ELMo to dynamically compute embeddings).

    The CNN has one convolution layer for each ngram filter size.

    Parameters
    ----------
    train_dataset: List[Instance]
        Instances for training set
    validation_dataset: List[Instance]
        Instances for validation set
    batch_size: int
        number of Instances to process in a batch
    num_filters: int
        output dim for each convolutional layer, which is the number of 'filters' learned by that layer
    filter_sizes: Tuple[int]
        specifies the number of convolutional layers and their sizes
    use_elmo: bool
        use ELMo embeddings (transfer learning) if True | GloVe if False
    epochs: int
        total number of epochs to train on (default=30)
    patience: int or None
        early stopping - number of epochs to wait for validation loss to improve (default=5). If 'None': disables early stopping, and uses train+validation set for training
    learning_rate: float
        learning rate for Adam Optimizer
    num_classes: int
        default=2 for binary classification
    use_gpu: bool
        True to use the GPU

    Returns
    -------
    Trained Model, Vocabulary, Number of actual training epochs
    """
    if use_elmo:
        vocab = Vocabulary()
        word_embeddings: TextFieldEmbedder = load_elmo_embeddings(large=True)
    else:
        vocab = Vocabulary.from_instances(train_dataset + validation_dataset)
        word_embeddings: TextFieldEmbedder = load_glove_embeddings(vocab)

    iterator = BucketIterator(batch_size=batch_size,
                              sorting_keys=[("reply_tokens", "num_tokens"),
                                            ("context_tokens", "num_tokens")])

    iterator.index_with(vocab) # numericalize the data

    # CNN encoders:
    cnn_reply: Seq2VecEncoder = CnnEncoder(embedding_dim=word_embeddings.get_output_dim(),
                                           num_filters=num_filters,
                                           ngram_filter_sizes=filter_sizes)

    cnn_context: Seq2VecEncoder = CnnEncoder(embedding_dim=word_embeddings.get_output_dim(),
                                             num_filters=num_filters,
                                             ngram_filter_sizes=filter_sizes)

    # Feedforward:
    classifier_feedforward: FeedForward = nn.Linear(4 * cnn_reply.get_output_dim(), num_classes) # 4 because we perform [concatenation, element-wise subtraction (abs), element-wise multiplication]
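    # The 4x factor assumes the model combines the two encodings u (reply) and v (context)
    # InferSent-style, e.g. (a hypothetical sketch of that combination, not code from models.InferModel):
    #     features = torch.cat([u, v, torch.abs(u - v), u * v], dim=-1)
    # which has width 4 * cnn_reply.get_output_dim(), the in_features of this linear layer.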

    model = models.InferModel(vocab=vocab,
                              word_embeddings=word_embeddings,
                              reply_encoder=cnn_reply,
                              context_encoder=cnn_context,
                              classifier_feedforward=classifier_feedforward)

    if use_gpu:
        model.cuda()

    optimizer = optim.Adam(model.parameters(), learning_rate)

    if patience is None:  # No early stopping: train on the combined train+validation set
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            iterator=iterator,
            train_dataset=train_dataset + validation_dataset,
            cuda_device=0 if use_gpu else -1,
            num_epochs=epochs)

    else:
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            iterator=iterator,
            train_dataset=train_dataset,
            validation_dataset=validation_dataset,
            cuda_device=0 if use_gpu else -1,
            patience=patience, # stop if loss does not improve for 'patience' epochs
            num_epochs=epochs)

    metrics = trainer.train()
    print(metrics)

    return model, vocab, metrics['training_epochs']
def train_cnn(train_dataset, validation_dataset, batch_size, num_filters, filter_sizes, double_input=False,
              dense_vector=False, col_name=None, use_elmo=False, epochs=30, patience=5, learning_rate=3e-4, num_classes=2,
              use_gpu=False):
    """
    Trains CNN on train_dataset; optionally, perform early stopping based on validation loss. Initialises word embeddings with pre-trained GloVe OR uses pre-trained ELMo model to dynamically compute embeddings.
    The CNN has one convolution layer for each ngram filter size.

    Functionality to run it for (1) Single Input: reply/question, (2) Double Input: reply + context comment,
    (3) Dense Vector + reply/question, and (4) Dense Vector + reply + context comment.

    Parameters
    ----------
    train_dataset: List[Instance]
        Instances for training set
    validation_dataset: List[Instance]
        Instances for validation set
    batch_size: int
        number of Instances to process in a batch
    num_filters: int
        output dim for each convolutional layer, which is the number of 'filters' learned by that layer
    filter_sizes: Tuple[int]
        specifies the number of convolutional layers and their sizes
    double_input: bool
        True to run DoubleInput classifier | False (default) for SingleInput classifier
    dense_vector: bool
        True to concatenate dense feature vector before feeding to the FeedForward layer
    col_name: str
        'reply_text' or 'question' (for calculating dense feature vector) | Only applicable when dense_vector is True
    use_elmo: bool
        use ELMo embeddings (transfer learning) if True | GloVe if False
    epochs: int
        total number of epochs to train on (default=30)
    patience: int or None
        early stopping - number of epochs to wait for validation loss to improve (default=5). If 'None': disables early stopping, and uses train+validation set for training
    learning_rate: float
        learning rate for Adam Optimizer
    num_classes: int
        default=2 for binary classification
    use_gpu: bool
        True to use the GPU

    Returns
    -------
    Trained Model, Vocabulary, Number of actual training epochs
    """
    if use_elmo:
        vocab = Vocabulary()
        word_embeddings: TextFieldEmbedder = load_elmo_embeddings()
    else:
        vocab = Vocabulary.from_instances(train_dataset + validation_dataset)
        word_embeddings: TextFieldEmbedder = load_glove_embeddings(vocab)

    if double_input: # need context_tokens as well
        iterator = BucketIterator(batch_size=batch_size,
                                  sorting_keys=[("reply_tokens", "num_tokens"),
                                                ("context_tokens", "num_tokens")])

    else: # only reply_tokens
        iterator = BucketIterator(batch_size=batch_size,
                                  sorting_keys=[("reply_tokens", "num_tokens")])

    iterator.index_with(vocab) # numericalize the data

    if double_input: # DoubleInput Classifier: two CNN encoders
        cnn_reply: Seq2VecEncoder = CnnEncoder(embedding_dim=word_embeddings.get_output_dim(),
                                               num_filters=num_filters,
                                               ngram_filter_sizes=filter_sizes)

        cnn_context: Seq2VecEncoder = CnnEncoder(embedding_dim=word_embeddings.get_output_dim(),
                                                 num_filters=num_filters,
                                                 ngram_filter_sizes=filter_sizes)
        if dense_vector: # add length of dense vector to input dimension of Feedforward
            ff_input_dim = 2 * (cnn_reply.get_output_dim() + DENSE_VECTOR_LEN)
            classifier_feedforward: FeedForward = nn.Linear(ff_input_dim, num_classes)
            model = models.DenseDoubleClassifier(vocab=vocab,
                                                 word_embeddings=word_embeddings,
                                                 reply_encoder=cnn_reply,
                                                 context_encoder=cnn_context,
                                                 classifier_feedforward=classifier_feedforward,
                                                 col_name=col_name)

        else:
            classifier_feedforward: FeedForward = nn.Linear(2 * cnn_reply.get_output_dim(), num_classes)
            model = models.DoubleInputClassifier(vocab=vocab,
                                                 word_embeddings=word_embeddings,
                                                 reply_encoder=cnn_reply,
                                                 context_encoder=cnn_context,
                                                 classifier_feedforward=classifier_feedforward)


    else: # SingleInput Classifier: one CNN encoder
        encoder: Seq2VecEncoder = CnnEncoder(embedding_dim=word_embeddings.get_output_dim(),
                                             num_filters=num_filters,
                                             ngram_filter_sizes=filter_sizes)

        if dense_vector: # add length of dense vector to input dimension of Feedforward
            ff_input_dim = encoder.get_output_dim() + DENSE_VECTOR_LEN
            classifier_feedforward: FeedForward = nn.Linear(ff_input_dim, num_classes)
            model = models.DenseSingleClassifier(vocab=vocab,
                                                 word_embeddings=word_embeddings,
                                                 encoder=encoder,
                                                 classifier_feedforward=classifier_feedforward,
                                                 col_name=col_name)

        else:
            classifier_feedforward: FeedForward = nn.Linear(encoder.get_output_dim(), num_classes)
            model = models.SingleInputClassifier(vocab=vocab,
                                                 word_embeddings=word_embeddings,
                                                 encoder=encoder,
                                                 classifier_feedforward=classifier_feedforward)


    if use_gpu:
        model.cuda()

    optimizer = optim.Adam(model.parameters(), learning_rate)

    if patience is None:  # Train on the combined train+validation set
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            iterator=iterator,
            train_dataset=train_dataset + validation_dataset,
            cuda_device=0 if use_gpu else -1,
            num_epochs=epochs)

    else:
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            iterator=iterator,
            train_dataset=train_dataset,
            validation_dataset=validation_dataset,
            cuda_device=0 if use_gpu else -1,
            patience=patience, # stop if loss does not improve for 'patience' epochs
            num_epochs=epochs)

    metrics = trainer.train()
    print(metrics)

    return model, vocab, metrics['training_epochs']
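A hypothetical invocation of the single-input, GloVe-based configuration (train_dataset and validation_dataset are assumed to be the Instance lists produced elsewhere in this project):

model, vocab, n_epochs = train_cnn(train_dataset,
                                   validation_dataset,
                                   batch_size=32,
                                   num_filters=100,
                                   filter_sizes=(2, 3, 4, 5),
                                   use_elmo=False,
                                   epochs=30,
                                   patience=5)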