Example #1
def build_encoder(config: Dict) -> Tuple[RecurrentEncoder, str]:
    """Build a recurrent encoder from a Nematus-style configuration.

    Returns the encoder object together with the corresponding INI
    configuration sections joined into a single string.
    """
    vocabulary = from_nematus_json(config["src_vocabulary"],
                                   max_size=config["n_words_src"],
                                   pad_to_max_size=True)

    vocabulary_ini = VOCABULARY_TEMPLATE.format("src",
                                                config["src_vocabulary"],
                                                config["n_words_src"])

    inp_seq_name = "{}_input".format(ENCODER_NAME)
    inp_seq = EmbeddedSequence(name=inp_seq_name,
                               vocabulary=vocabulary,
                               data_id="source",
                               embedding_size=config["embedding_size"])

    encoder = RecurrentEncoder(name=ENCODER_NAME,
                               input_sequence=inp_seq,
                               rnn_size=config["rnn_size"],
                               rnn_cell="NematusGRU")

    encoder_ini = ENCODER_TEMPLATE.format(ENCODER_NAME, config["rnn_size"],
                                          inp_seq_name,
                                          config["embedding_size"],
                                          config["max_length"])

    return encoder, "\n".join([vocabulary_ini, encoder_ini])
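A minimal, hedged usage sketch of the importer above; it assumes the call is
made from the same module (so ENCODER_NAME and the *_TEMPLATE constants are in
scope), and the vocabulary path and hyperparameter values are hypothetical.

# Hypothetical Nematus-style configuration mirroring the keys read above.
nematus_config = {
    "src_vocabulary": "model/vocab.src.json",   # hypothetical path
    "n_words_src": 30000,
    "embedding_size": 512,
    "rnn_size": 1024,
    "max_length": 50,
}

encoder, encoder_ini = build_encoder(nematus_config)
print(encoder_ini)  # INI sections describing the vocabulary and the encoder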
Example #2
def build_encoder(
        hparams: Dict,
        vocab_path: str) -> Tuple[TransformerEncoder, Vocabulary, str]:
    """Build a Transformer encoder from tensor2tensor hyperparameters.

    Returns the encoder, its vocabulary, and the corresponding INI
    configuration sections joined into a single string.
    """
    vocabulary = from_t2t_vocabulary(vocab_path)
    vocabulary_ini = VOCABULARY_TEMPLATE.format(vocab_path)

    inp_seq_name = "{}_input".format(ENCODER_NAME)
    inp_seq = EmbeddedSequence(
        name=inp_seq_name,
        vocabulary=vocabulary,
        data_id="source_wp",
        embedding_size=hparams["embedding_size"],
        scale_embeddings_by_depth=hparams[
            "multiply_embedding_mode"] == "sqrt_depth",
        add_end_symbol=True)

    encoder = TransformerEncoder(
        name=ENCODER_NAME,
        input_sequence=inp_seq,
        ff_hidden_size=hparams["ff_hidden_size"],
        depth=hparams["depth"],
        n_heads=hparams["n_heads"],
        target_space_id=21,
        use_att_transform_bias=True)

    encoder_ini = ENCODER_TEMPLATE.format(
        inp_seq_name, hparams["embedding_size"],
        hparams["multiply_embedding_mode"] == "sqrt_depth",
        hparams["max_length"],
        ENCODER_NAME, hparams["ff_hidden_size"], hparams["depth"],
        hparams["n_heads"])

    return encoder, vocabulary, "\n".join([vocabulary_ini, encoder_ini])
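A hedged usage sketch of the tensor2tensor importer above; the vocabulary path
and the hyperparameter values are hypothetical and only mirror the keys the
function reads.

# Hypothetical tensor2tensor hyperparameters for a base-sized Transformer.
t2t_hparams = {
    "embedding_size": 512,
    "multiply_embedding_mode": "sqrt_depth",
    "ff_hidden_size": 2048,
    "depth": 6,
    "n_heads": 8,
    "max_length": 70,
}

encoder, vocabulary, encoder_ini = build_encoder(t2t_hparams,
                                                 "model/vocab.subwords")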
Example #3
    def __init__(self,
                 name: str,
                 vocabulary: Vocabulary,
                 data_id: str,
                 embedding_size: int,
                 rnn_size: int,
                 rnn_cell: str = "GRU",
                 rnn_direction: str = "bidirectional",
                 max_input_len: int = None,
                 dropout_keep_prob: float = 1.0,
                 save_checkpoint: str = None,
                 load_checkpoint: str = None) -> None:
        """Create a new instance of the sentence encoder.

        Arguments:
            name: ModelPart name.
            vocabulary: The input vocabulary.
            data_id: The input sequence data ID.
            embedding_size: The dimension of the embedding vectors in the input
                sequence.
            rnn_size: The dimension of the RNN hidden state vector.
            rnn_cell: One of "GRU", "NematusGRU", "LSTM". Which kind of memory
                cell to use.
            rnn_direction: One of "forward", "backward", "bidirectional". In
                what order to process the input sequence. Note that choosing
                "bidirectional" will double the resulting vector dimension as
                well as the number of encoder parameters.
            max_input_len: Maximum length of the input sequence; tokens after
                this position are disregarded.
            dropout_keep_prob: The probability of keeping a unit during
                dropout, i.e. 1 minus the dropout rate.
            save_checkpoint: ModelPart save checkpoint file.
            load_checkpoint: ModelPart load checkpoint file.
        """
        check_argument_types()
        s_ckp = "input_{}".format(save_checkpoint) if save_checkpoint else None
        l_ckp = "input_{}".format(load_checkpoint) if load_checkpoint else None

        # TODO! Representation runner needs this. It is not simple to do it in
        # recurrent encoder since there may be more source data series. The
        # best way could be to enter the data_id parameter manually to the
        # representation runner
        self.data_id = data_id

        input_sequence = EmbeddedSequence(name="{}_input".format(name),
                                          vocabulary=vocabulary,
                                          data_id=data_id,
                                          embedding_size=embedding_size,
                                          max_length=max_input_len,
                                          save_checkpoint=s_ckp,
                                          load_checkpoint=l_ckp)

        RecurrentEncoder.__init__(self,
                                  name=name,
                                  input_sequence=input_sequence,
                                  rnn_size=rnn_size,
                                  rnn_cell=rnn_cell,
                                  rnn_direction=rnn_direction,
                                  dropout_keep_prob=dropout_keep_prob,
                                  save_checkpoint=save_checkpoint,
                                  load_checkpoint=load_checkpoint)
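A hedged sketch of instantiating the encoder defined by the constructor above;
it assumes the class is neuralmonkey.encoders.recurrent.SentenceEncoder
(imported in the test module later in this listing), and the data series name
and layer sizes are arbitrary.

from neuralmonkey.encoders.recurrent import SentenceEncoder
from neuralmonkey.tests.test_vocabulary import VOCABULARY

# Arguments follow the constructor signature shown above.
encoder = SentenceEncoder(name="sentence_encoder",
                          vocabulary=VOCABULARY,
                          data_id="source",
                          embedding_size=300,
                          rnn_size=256,
                          rnn_cell="GRU",
                          rnn_direction="bidirectional",
                          max_input_len=50,
                          dropout_keep_prob=0.8)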
Example #4
    def test_reuse(self):
        vocabulary = Vocabulary()
        vocabulary.add_word("a")
        vocabulary.add_word("b")

        seq1 = EmbeddedSequence(name="seq1",
                                vocabulary=vocabulary,
                                data_id="id",
                                embedding_size=10)

        seq2 = EmbeddedSequence(name="seq2",
                                vocabulary=vocabulary,
                                embedding_size=10,
                                data_id="id")

        seq3 = EmbeddedSequence(name="seq3",
                                vocabulary=vocabulary,
                                data_id="id",
                                embedding_size=10,
                                reuse=seq1)

        # Accessing the embedding matrices forces their (lazy) construction.
        self.assertIsNotNone(seq1.embedding_matrix)
        self.assertIsNotNone(seq2.embedding_matrix)
        self.assertIsNotNone(seq3.embedding_matrix)

        sess = tf.Session()
        sess.run(tf.global_variables_initializer())

        params = sess.run((seq1.embedding_matrix, seq2.embedding_matrix,
                           seq3.embedding_matrix))

        with self.assertRaises(AssertionError):
            assert_array_equal(params[0], params[1])

        assert_array_equal(params[0], params[2])
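The test above exercises the embedding-sharing contract of EmbeddedSequence:
two sequences built independently ("seq1" and "seq2") end up with different
embedding matrices, while "seq3", created with reuse=seq1, shares seq1's
matrix exactly.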
Example #5
    def __init__(self,
                 name: str,
                 vocabulary: Vocabulary,
                 data_id: str,
                 embedding_size: int,
                 rnn_size: int,
                 max_input_len: int = None,
                 dropout_keep_prob: float = 1.0,
                 rnn_cell: str = "GRU",
                 attention_type: type = None,
                 attention_fertility: int = 3,
                 attention_state_size: int = None,
                 save_checkpoint: str = None,
                 load_checkpoint: str = None) -> None:
        """Create a new instance of the sentence encoder. """

        # TODO Think this through.
        s_ckp = "input_{}".format(save_checkpoint) if save_checkpoint else None
        l_ckp = "input_{}".format(load_checkpoint) if load_checkpoint else None

        # TODO! Representation runner needs this. It is not simple to do it in
        # recurrent encoder since there may be more source data series. The
        # best way could be to enter the data_id parameter manually to the
        # representation runner
        self.data_id = data_id

        input_sequence = EmbeddedSequence(name="{}_input".format(name),
                                          vocabulary=vocabulary,
                                          data_id=data_id,
                                          embedding_size=embedding_size,
                                          max_length=max_input_len,
                                          save_checkpoint=s_ckp,
                                          load_checkpoint=l_ckp)

        RecurrentEncoder.__init__(self,
                                  name=name,
                                  input_sequence=input_sequence,
                                  rnn_size=rnn_size,
                                  dropout_keep_prob=dropout_keep_prob,
                                  rnn_cell=rnn_cell,
                                  attention_type=attention_type,
                                  attention_fertility=attention_fertility,
                                  attention_state_size=attention_state_size,
                                  save_checkpoint=save_checkpoint,
                                  load_checkpoint=load_checkpoint)
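Compared with the constructor in Example #3, this variant exposes the
attention-related arguments (attention_type, attention_fertility and
attention_state_size) and forwards them directly to RecurrentEncoder.__init__;
it also omits the rnn_direction argument.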
Example #6
#!/usr/bin/env python3.5
"""Test init methods of encoders."""

import unittest
import copy

from typing import Dict, List, Any, Iterable

from neuralmonkey.encoders.numpy_encoder import (VectorEncoder,
                                                 PostCNNImageEncoder)
from neuralmonkey.encoders.recurrent import SentenceEncoder
from neuralmonkey.encoders.sentence_cnn_encoder import SentenceCNNEncoder
from neuralmonkey.model.sequence import EmbeddedSequence
from neuralmonkey.tests.test_vocabulary import VOCABULARY

INPUT_SEQUENCE = EmbeddedSequence("seq", VOCABULARY, "marmelade", 300)

SENTENCE_ENCODER_GOOD = {
    "name": ["encoder"],
    "vocabulary": [VOCABULARY],
    "data_id": ["marmelade"],
    "embedding_size": [20],
    "rnn_size": [30],
    "max_input_len": [None, 15],
    "dropout_keep_prob": [0.5, 1.],
}

SENTENCE_ENCODER_BAD = {
    "nonexistent": ["ahoj"],
    "name": [None, 1],
    "vocabulary": [0, None, "ahoj", dict()],