Example #1
    def test_registry_has_builtin_seq2vec_encoders(self):
        assert Seq2VecEncoder.by_name(u'cnn').__name__ == u'CnnEncoder'
        # pylint: disable=protected-access
        assert Seq2VecEncoder.by_name(u'gru')._module_class.__name__ == u'GRU'
        assert Seq2VecEncoder.by_name(u'lstm')._module_class.__name__ == u'LSTM'
        assert Seq2VecEncoder.by_name(u'rnn')._module_class.__name__ == u'RNN'
Example #2
    def __init__(self,
                 vocab: Vocabulary,
                 embedder: TextFieldEmbedder,
                 encoder: Seq2VecEncoder,
                 posclass_weight: float = 1.0,
                 use_power: bool = False,
                 dropout: float = 0.0) -> None:
        super().__init__(vocab)
        
        self.embedder = embedder
        self.encoder = encoder
        if use_power:
            self.classifier = torch.nn.Linear(
                in_features=encoder.get_output_dim() + 1,
                out_features=vocab.get_vocab_size('labels')
            )
        else:
            self.classifier = torch.nn.Linear(
                in_features=encoder.get_output_dim(),
                out_features=vocab.get_vocab_size('labels')
            )
        self.use_power = use_power
    
        self.f1_lie = F1Measure(vocab.get_token_index('False', 'labels'))
        self.f1_truth = F1Measure(vocab.get_token_index('True', 'labels'))
        self.micro_f1 = FBetaMeasure(average='micro')
        self.macro_f1 = FBetaMeasure(average='macro')
        
        weights = [1, 1]
        weights[vocab.get_token_index('False', 'labels')] = posclass_weight
        self.loss = torch.nn.CrossEntropyLoss(weight=torch.Tensor(weights))

        self.dropout = torch.nn.Dropout(dropout)
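
The `+ 1` on `in_features` only pays off in the forward pass, where a scalar power feature gets concatenated onto the encoded vector. Below is a minimal sketch of such a forward method; the field names `tokens`, `power`, and `label`, and the use of `get_text_field_mask` from `allennlp.nn.util`, are assumptions for illustration and are not taken from the original snippet.

    def forward(self, tokens, power=None, label=None):
        # assumes: from allennlp.nn.util import get_text_field_mask
        mask = get_text_field_mask(tokens)              # (batch_size, num_tokens)
        embedded = self.dropout(self.embedder(tokens))  # (batch_size, num_tokens, emb_dim)
        encoded = self.encoder(embedded, mask)          # (batch_size, encoder_output_dim)
        if self.use_power:
            # append the scalar power feature, matching in_features = output_dim + 1
            encoded = torch.cat([encoded, power.unsqueeze(-1)], dim=-1)
        logits = self.classifier(encoded)
        output = {'logits': logits}
        if label is not None:
            output['loss'] = self.loss(logits, label)
            self.f1_lie(logits, label)
            self.f1_truth(logits, label)
            self.micro_f1(logits, label)
            self.macro_f1(logits, label)
        return output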
Example #3
    def from_params(cls, params: Params) -> 'Seq2Seq2VecEncoder':
        seq2seq_encoder_params = params.pop("seq2seq_encoder")
        seq2vec_encoder_params = params.pop("seq2vec_encoder")
        seq2seq_encoder = Seq2SeqEncoder.from_params(seq2seq_encoder_params)
        seq2vec_encoder = Seq2VecEncoder.from_params(seq2vec_encoder_params)

        return cls(seq2seq_encoder=seq2seq_encoder,
                   seq2vec_encoder=seq2vec_encoder)
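Example #4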
    def __init__(
            self,
            # input_dim: int,
            pooler: Seq2VecEncoder):
        super().__init__()
        self._input_dim = pooler.get_output_dim()
        # we distribute the pooler across _spans_, not actual time
        self._pooler = TimeDistributed(pooler)
Example #5
    def __init__(self,
                 word_embeddings: TextFieldEmbedder,
                 encoder: Seq2VecEncoder,
                 vocab: Vocabulary) -> None:
        super().__init__(vocab)
        self.word_embeddings = word_embeddings
        self.encoder = encoder
        self.hidden2tag = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                          out_features=vocab.get_vocab_size('labels'))
        self.accuracy = CategoricalAccuracy()
        self.loss_function = torch.nn.CrossEntropyLoss()
Example #6
    def from_params(cls, vocab: Vocabulary, params: Params) -> 'TokenCharactersEncoder':
        embedding_params: Params = params.pop("embedding")
        # Embedding.from_params() uses "tokens" as the default namespace, but we need to change
        # that to be "token_bpe" by default.
        embedding_params.setdefault("vocab_namespace", "token_bpe")
        embedding = Embedding.from_params(vocab, embedding_params)
        encoder_params: Params = params.pop("encoder")
        encoder = Seq2VecEncoder.from_params(encoder_params)
        dropout = params.pop("dropout", 0.0)
        params.assert_empty(cls.__name__)
        return cls(embedding, encoder, dropout)
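Example #7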
    def from_params(cls, vocab: Vocabulary, params: Params) -> 'TokenCharactersEncoder':  # type: ignore
        # pylint: disable=arguments-differ
        embedding_params: Params = params.pop("embedding")
        # Embedding.from_params() uses "tokens" as the default namespace, but we need to change
        # that to be "token_characters" by default.
        embedding_params.setdefault("vocab_namespace", "token_characters")
        embedding = Embedding.from_params(vocab, embedding_params)
        encoder_params: Params = params.pop("encoder")
        encoder = Seq2VecEncoder.from_params(encoder_params)
        dropout = params.pop_float("dropout", 0.0)
        params.assert_empty(cls.__name__)
        return cls(embedding, encoder, dropout)
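Example #8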
    def from_params(cls, vocab, params):  # type: ignore
        # pylint: disable=arguments-differ
        embedding_params = params.pop(u"embedding")
        # Embedding.from_params() uses "tokens" as the default namespace, but we need to change
        # that to be "token_characters" by default.
        embedding_params.setdefault(u"vocab_namespace", u"token_characters")
        embedding = Embedding.from_params(vocab, embedding_params)
        encoder_params = params.pop(u"encoder")
        encoder = Seq2VecEncoder.from_params(encoder_params)
        dropout = params.pop_float(u"dropout", 0.0)
        params.assert_empty(cls.__name__)
        return cls(embedding, encoder, dropout)
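
The three from_params variants above all consume the same nested configuration: an "embedding" block, an "encoder" block resolved through the Seq2VecEncoder registry, and an optional "dropout". A minimal sketch of such a params blob, assuming a `vocab` built elsewhere; the sizes are illustrative, not taken from any of the snippets.

from allennlp.common import Params
from allennlp.modules.token_embedders import TokenCharactersEncoder

params = Params({
    "embedding": {"embedding_dim": 16},          # num_embeddings inferred from the vocab namespace
    "encoder": {"type": "cnn",                   # resolved via the Seq2VecEncoder registry
                "embedding_dim": 16,
                "num_filters": 64,
                "ngram_filter_sizes": [3]},
    "dropout": 0.2,
})
encoder = TokenCharactersEncoder.from_params(vocab, params)  # vocab: an existing Vocabulary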
Example #9
    def from_params(cls, vocab: Vocabulary,
                    params: Params) -> 'GlyphEmbeddingWrapper':
        # glyph_config
        glyph_config = GlyphEmbeddingConfig()
        glyph_config.output_size = params.pop_int("output_size", 300)
        glyph_config.use_highway = True
        glyph_config.dropout = params.pop_float("dropout", 0.0)
        glyph_config.font_channels = params.pop_int("font_channels", 8)
        glyph_config.glyph_embsize = params.pop_int("glyph_embsize", 256)
        glyph_config.use_batch_norm = params.pop_bool("use_batch_norm", False)
        # encoder_config
        encoder_params: Params = params.pop("encoder")
        encoder = Seq2VecEncoder.from_params(encoder_params)
        params.assert_empty(cls.__name__)
        return cls(vocab, glyph_config, encoder)
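Example #10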
    def from_params(  # type: ignore
            cls, vocab: Vocabulary,
            params: Params) -> "TokenCharactersEncoder":

        embedding_params: Params = params.pop("embedding")
        # Embedding.from_params() uses "tokens" as the default namespace, but we need to change
        # that to be "token_characters" by default. If num_embeddings is present, set default namespace
        # to None so that extend_vocab call doesn't misinterpret that some namespace was originally used.
        default_namespace = (None if embedding_params.get(
            "num_embeddings", None) else "token_characters")
        embedding_params.setdefault("vocab_namespace", default_namespace)
        embedding = Embedding.from_params(vocab, embedding_params)
        encoder_params: Params = params.pop("encoder")
        encoder = Seq2VecEncoder.from_params(encoder_params)
        dropout = params.pop_float("dropout", 0.0)
        params.assert_empty(cls.__name__)
        return cls(embedding, encoder, dropout)
Example #11
    def forward(self, tokens, mask=None):  #pylint: disable=arguments-differ
        if mask is not None:
            tokens = tokens * mask.unsqueeze(-1).float()

        # Our input has shape `(batch_size, num_tokens, embedding_dim)`, so we sum out the `num_tokens`
        # dimension.
        summed = tokens.sum(1)

        if self._averaged:
            if mask is not None:
                lengths = get_lengths_from_binary_sequence_mask(mask)
                length_mask = (lengths > 0)

                # Set any length 0 to 1, to avoid dividing by zero.
                lengths = torch.max(lengths, lengths.new_ones(1))
            else:
                lengths = tokens.new_full((1, ), fill_value=tokens.size(1))
                length_mask = None

            summed = summed / lengths.unsqueeze(-1).float()

            if length_mask is not None:
                summed = summed * (length_mask > 0).float().unsqueeze(-1)

        return summed


BagOfEmbeddingsEncoder = Seq2VecEncoder.register(u"boe")(
    BagOfEmbeddingsEncoder)
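
To see the masking and averaging above in action, here is a quick sketch using the standard `BagOfEmbeddingsEncoder(embedding_dim, averaged)` constructor; the shapes and values are arbitrary.

import torch
from allennlp.modules.seq2vec_encoders import BagOfEmbeddingsEncoder

encoder = BagOfEmbeddingsEncoder(embedding_dim=4, averaged=True)
tokens = torch.randn(2, 3, 4)                  # (batch_size, num_tokens, embedding_dim)
mask = torch.tensor([[1, 1, 0], [1, 1, 1]])    # first sequence has only two real tokens
pooled = encoder(tokens, mask)                 # masked positions are zeroed, then averaged
print(pooled.shape)                            # torch.Size([2, 4])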
Example #12
    work.
    """
    PYTORCH_MODELS = [torch.nn.GRU, torch.nn.LSTM, torch.nn.RNN]

    def __init__(self, module_class: Type[torch.nn.modules.RNNBase]) -> None:
        self._module_class = module_class

    def __call__(self, **kwargs) -> PytorchSeq2VecWrapper:
        return self.from_params(Params(kwargs))

    # Logic requires custom from_params
    def from_params(self, params: Params) -> PytorchSeq2VecWrapper:
        if not params.pop('batch_first', True):
            raise ConfigurationError(
                "Our encoder semantics assumes batch is always first!")
        if self._module_class in self.PYTORCH_MODELS:
            params['batch_first'] = True
        module = self._module_class(**params.as_dict())
        return PytorchSeq2VecWrapper(module)


# pylint: disable=protected-access
Seq2VecEncoder.register("gru")(_Seq2VecWrapper(torch.nn.GRU))
Seq2VecEncoder.register("lstm")(_Seq2VecWrapper(torch.nn.LSTM))
Seq2VecEncoder.register("rnn")(_Seq2VecWrapper(torch.nn.RNN))
Seq2VecEncoder.register("augmented_lstm")(_Seq2VecWrapper(AugmentedLstm))
Seq2VecEncoder.register("alternating_lstm")(
    _Seq2VecWrapper(StackedAlternatingLstm))
Seq2VecEncoder.register("stacked_bidirectional_lstm")(
    _Seq2VecWrapper(StackedBidirectionalLstm))
Example #13
    through to the ``RNNBase`` constructor, then pass the instantiated pytorch RNN to the
    ``PytorchSeq2VecWrapper``.  This lets us use this class in the registry and have everything just
    work.
    """
    PYTORCH_MODELS = [torch.nn.GRU, torch.nn.LSTM, torch.nn.RNN]

    def __init__(self, module_class):
        self._module_class = module_class

    def __call__(self, **kwargs):
        return self.from_params(Params(kwargs))

    # Logic requires custom from_params
    def from_params(self, params):
        if not params.pop(u'batch_first', True):
            raise ConfigurationError(
                u"Our encoder semantics assumes batch is always first!")
        if self._module_class in self.PYTORCH_MODELS:
            params[u'batch_first'] = True
        module = self._module_class(**params.as_dict())
        return PytorchSeq2VecWrapper(module)


# pylint: disable=protected-access
Seq2VecEncoder.register(u"gru")(_Seq2VecWrapper(torch.nn.GRU))
Seq2VecEncoder.register(u"lstm")(_Seq2VecWrapper(torch.nn.LSTM))
Seq2VecEncoder.register(u"rnn")(_Seq2VecWrapper(torch.nn.RNN))
Seq2VecEncoder.register(u"augmented_lstm")(_Seq2VecWrapper(AugmentedLstm))
Seq2VecEncoder.register(u"alternating_lstm")(
    _Seq2VecWrapper(StackedAlternatingLstm))
Example #14
        # Our input is expected to have shape `(batch_size, num_tokens, embedding_dim)`.  The
        # convolution layers expect input of shape `(batch_size, in_channels, sequence_length)`,
        # where the conv layer `in_channels` is our `embedding_dim`.  We thus need to transpose the
        # tensor first.
        tokens = torch.transpose(tokens, 1, 2)
        # Each convolution layer returns output of size `(batch_size, num_filters, pool_length)`,
        # where `pool_length = num_tokens - ngram_size + 1`.  We then do an activation function,
        # then do max pooling over each filter for the whole input sequence.  Because our max
        # pooling is simple, we just use `torch.max`.  The resultant tensor has shape
        # `(batch_size, num_conv_layers * num_filters)`, which then gets projected using the
        # projection layer, if requested.

        filter_outputs = []
        for i in range(len(self._convolution_layers)):
            convolution_layer = getattr(self, u'conv_layer_{}'.format(i))
            filter_outputs.append(
                    self._activation(convolution_layer(tokens)).max(dim=2)[0]
            )

        # Now we have a list of `num_conv_layers` tensors of shape `(batch_size, num_filters)`.
        # Concatenating them gives us a tensor of shape `(batch_size, num_filters * num_conv_layers)`.
        maxpool_output = torch.cat(filter_outputs, dim=1) if len(filter_outputs) > 1 else filter_outputs[0]

        if self.projection_layer:
            result = self.projection_layer(maxpool_output)
        else:
            result = maxpool_output
        return result

CnnEncoder = Seq2VecEncoder.register(u"cnn")(CnnEncoder)
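
Tracing the shapes in the comments above: with the standard `CnnEncoder(embedding_dim, num_filters, ngram_filter_sizes)` constructor and no projection layer, the output has size `num_filters * len(ngram_filter_sizes)`. A small sketch with arbitrary numbers:

import torch
from allennlp.modules.seq2vec_encoders import CnnEncoder

encoder = CnnEncoder(embedding_dim=8, num_filters=16, ngram_filter_sizes=(2, 3))
tokens = torch.randn(4, 10, 8)        # (batch_size, num_tokens, embedding_dim)
output = encoder(tokens, mask=None)   # (batch_size, num_filters * len(ngram_filter_sizes))
print(output.shape)                   # torch.Size([4, 32])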
Example #15
    When you instantiate a ``_Wrapper`` object, you give it an ``RNNBase`` subclass, which we save
    to ``self``.  Then when called (as if we were instantiating an actual encoder with
    ``Encoder(**params)``, or with ``Encoder.from_params(params)``), we pass those parameters
    through to the ``RNNBase`` constructor, then pass the instantiated pytorch RNN to the
    ``PytorchSeq2VecWrapper``.  This lets us use this class in the registry and have everything just
    work.
    """
    PYTORCH_MODELS = [torch.nn.GRU, torch.nn.LSTM, torch.nn.RNN]
    def __init__(self, module_class: Type[torch.nn.modules.RNNBase]) -> None:
        self._module_class = module_class

    def __call__(self, **kwargs) -> PytorchSeq2VecWrapper:
        return self.from_params(Params(kwargs))

    # Logic requires custom from_params
    def from_params(self, params: Params) -> PytorchSeq2VecWrapper:
        if not params.pop('batch_first', True):
            raise ConfigurationError("Our encoder semantics assumes batch is always first!")
        if self._module_class in self.PYTORCH_MODELS:
            params['batch_first'] = True
        module = self._module_class(**params.as_dict())
        return PytorchSeq2VecWrapper(module)

# pylint: disable=protected-access
Seq2VecEncoder.register("gru")(_Seq2VecWrapper(torch.nn.GRU))
Seq2VecEncoder.register("lstm")(_Seq2VecWrapper(torch.nn.LSTM))
Seq2VecEncoder.register("rnn")(_Seq2VecWrapper(torch.nn.RNN))
Seq2VecEncoder.register("augmented_lstm")(_Seq2VecWrapper(AugmentedLstm))
Seq2VecEncoder.register("alternating_lstm")(_Seq2VecWrapper(StackedAlternatingLstm))
Example #16
    def test_registry_has_builtin_seq2vec_encoders(self):
        assert Seq2VecEncoder.by_name("cnn").__name__ == "CnnEncoder"

        assert Seq2VecEncoder.by_name("gru")._module_class.__name__ == "GRU"
        assert Seq2VecEncoder.by_name("lstm")._module_class.__name__ == "LSTM"
        assert Seq2VecEncoder.by_name("rnn")._module_class.__name__ == "RNN"
Example #17
    def test_registry_has_builtin_seq2vec_encoders(self):
        assert Seq2VecEncoder.by_name('cnn').__name__ == 'CnnEncoder'
        # pylint: disable=protected-access
        assert Seq2VecEncoder.by_name('gru')._module_class.__name__ == 'GRU'
        assert Seq2VecEncoder.by_name('lstm')._module_class.__name__ == 'LSTM'
        assert Seq2VecEncoder.by_name('rnn')._module_class.__name__ == 'RNN'