def build_encoder(config: Dict) -> Tuple[RecurrentEncoder, str]:
    vocabulary = from_nematus_json(
        config["src_vocabulary"],
        max_size=config["n_words_src"],
        pad_to_max_size=True)

    vocabulary_ini = VOCABULARY_TEMPLATE.format(
        "src", config["src_vocabulary"], config["n_words_src"])

    inp_seq_name = "{}_input".format(ENCODER_NAME)
    inp_seq = EmbeddedSequence(
        name=inp_seq_name,
        vocabulary=vocabulary,
        data_id="source",
        embedding_size=config["embedding_size"])

    encoder = RecurrentEncoder(
        name=ENCODER_NAME,
        input_sequence=inp_seq,
        rnn_size=config["rnn_size"],
        rnn_cell="NematusGRU")

    encoder_ini = ENCODER_TEMPLATE.format(
        ENCODER_NAME, config["rnn_size"], inp_seq_name,
        config["embedding_size"], config["max_length"])

    return encoder, "\n".join([vocabulary_ini, encoder_ini])
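# Illustrative usage sketch (not part of the original module). The config
# values below are assumptions chosen for the example; "src_vocabulary"
# would point at an existing Nematus JSON vocabulary file.

nematus_config = {
    "src_vocabulary": "model/vocab.src.json",  # hypothetical path
    "n_words_src": 30000,
    "embedding_size": 500,
    "rnn_size": 1024,
    "max_length": 50,
}

encoder, ini_section = build_encoder(nematus_config)
# `encoder` is the assembled RecurrentEncoder; `ini_section` is the
# corresponding Neural Monkey INI snippet built from the templates.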
def build_encoder(
        hparams: Dict,
        vocab_path: str) -> Tuple[TransformerEncoder, Vocabulary, str]:
    # NOTE: The return annotation previously said Tuple[TransformerEncoder,
    # str], but the function returns three values including the vocabulary.
    vocabulary = from_t2t_vocabulary(vocab_path)
    vocabulary_ini = VOCABULARY_TEMPLATE.format(vocab_path)

    inp_seq_name = "{}_input".format(ENCODER_NAME)
    inp_seq = EmbeddedSequence(
        name=inp_seq_name,
        vocabulary=vocabulary,
        data_id="source_wp",
        embedding_size=hparams["embedding_size"],
        scale_embeddings_by_depth=(
            hparams["multiply_embedding_mode"] == "sqrt_depth"),
        add_end_symbol=True)

    encoder = TransformerEncoder(
        name=ENCODER_NAME,
        input_sequence=inp_seq,
        ff_hidden_size=hparams["ff_hidden_size"],
        depth=hparams["depth"],
        n_heads=hparams["n_heads"],
        target_space_id=21,
        use_att_transform_bias=True)

    encoder_ini = ENCODER_TEMPLATE.format(
        inp_seq_name, hparams["embedding_size"],
        hparams["multiply_embedding_mode"] == "sqrt_depth",
        hparams["max_length"], ENCODER_NAME, hparams["ff_hidden_size"],
        hparams["depth"], hparams["n_heads"])

    return encoder, vocabulary, "\n".join([vocabulary_ini, encoder_ini])
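# Illustrative usage sketch (assumed values). The hparams below mirror a
# typical tensor2tensor "base" transformer setup; none of these numbers
# come from this repository, and the vocabulary path is hypothetical.

t2t_hparams = {
    "embedding_size": 512,
    "multiply_embedding_mode": "sqrt_depth",
    "ff_hidden_size": 2048,
    "depth": 6,
    "n_heads": 8,
    "max_length": 100,
}

encoder, vocabulary, ini_section = build_encoder(
    t2t_hparams, "model/vocab.wp")  # hypothetical wordpiece vocabulary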
def __init__(self,
             name: str,
             vocabulary: Vocabulary,
             data_id: str,
             embedding_size: int,
             rnn_size: int,
             rnn_cell: str = "GRU",
             rnn_direction: str = "bidirectional",
             max_input_len: int = None,
             dropout_keep_prob: float = 1.0,
             save_checkpoint: str = None,
             load_checkpoint: str = None) -> None:
    """Create a new instance of the sentence encoder.

    Arguments:
        name: ModelPart name.
        vocabulary: The input vocabulary.
        data_id: The input sequence data ID.
        embedding_size: The dimension of the embedding vectors in the
            input sequence.
        max_input_len: Maximum length of the input sequence (disregard
            tokens after this position).
        rnn_size: The dimension of the RNN hidden state vector.
        rnn_cell: One of "GRU", "NematusGRU", "LSTM". Which kind of
            memory cell to use.
        rnn_direction: One of "forward", "backward", "bidirectional".
            In what order to process the input sequence. Note that
            choosing "bidirectional" will double the resulting vector
            dimension as well as the number of encoder parameters.
        dropout_keep_prob: 1 - dropout probability.
        save_checkpoint: ModelPart save checkpoint file.
        load_checkpoint: ModelPart load checkpoint file.
    """
    check_argument_types()

    s_ckp = "input_{}".format(save_checkpoint) if save_checkpoint else None
    l_ckp = "input_{}".format(load_checkpoint) if load_checkpoint else None

    # TODO! Representation runner needs this. It is not simple to do it in
    # recurrent encoder since there may be more source data series. The
    # best way could be to enter the data_id parameter manually to the
    # representation runner.
    self.data_id = data_id

    input_sequence = EmbeddedSequence(
        name="{}_input".format(name),
        vocabulary=vocabulary,
        data_id=data_id,
        embedding_size=embedding_size,
        max_length=max_input_len,
        save_checkpoint=s_ckp,
        load_checkpoint=l_ckp)

    RecurrentEncoder.__init__(
        self,
        name=name,
        input_sequence=input_sequence,
        rnn_size=rnn_size,
        rnn_cell=rnn_cell,
        rnn_direction=rnn_direction,
        dropout_keep_prob=dropout_keep_prob,
        save_checkpoint=save_checkpoint,
        load_checkpoint=load_checkpoint)
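# Illustrative usage sketch (hypothetical values): a bidirectional GRU
# sentence encoder built with the arguments documented in the docstring
# above. Note that with rnn_direction="bidirectional" the encoder output
# has dimension 2 * rnn_size.

encoder = SentenceEncoder(
    name="sentence_encoder",
    vocabulary=vocabulary,  # an existing Vocabulary instance is assumed
    data_id="source",
    embedding_size=300,
    rnn_size=256,
    rnn_cell="GRU",
    rnn_direction="bidirectional",
    max_input_len=50,
    dropout_keep_prob=0.8)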
def test_reuse(self):
    vocabulary = Vocabulary()
    vocabulary.add_word("a")
    vocabulary.add_word("b")

    seq1 = EmbeddedSequence(
        name="seq1",
        vocabulary=vocabulary,
        data_id="id",
        embedding_size=10)

    seq2 = EmbeddedSequence(
        name="seq2",
        vocabulary=vocabulary,
        embedding_size=10,
        data_id="id")

    seq3 = EmbeddedSequence(
        name="seq3",
        vocabulary=vocabulary,
        data_id="id",
        embedding_size=10,
        reuse=seq1)

    # "Blessing": accessing the embedding_matrix property forces the
    # underlying variables to be created.
    self.assertIsNotNone(seq1.embedding_matrix)
    self.assertIsNotNone(seq2.embedding_matrix)
    self.assertIsNotNone(seq3.embedding_matrix)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    params = sess.run((seq1.embedding_matrix, seq2.embedding_matrix,
                       seq3.embedding_matrix))

    # seq1 and seq2 own independent embeddings, so their randomly
    # initialized values should differ; seq3 reuses seq1's variables,
    # so those two matrices must be identical.
    with self.assertRaises(AssertionError):
        assert_array_equal(params[0], params[1])

    assert_array_equal(params[0], params[2])
def __init__(self,
             name: str,
             vocabulary: Vocabulary,
             data_id: str,
             embedding_size: int,
             rnn_size: int,
             max_input_len: int = None,
             dropout_keep_prob: float = 1.0,
             rnn_cell: str = "GRU",
             attention_type: type = None,
             attention_fertility: int = 3,
             attention_state_size: int = None,
             save_checkpoint: str = None,
             load_checkpoint: str = None) -> None:
    """Create a new instance of the sentence encoder."""
    # TODO Think this through.
    s_ckp = "input_{}".format(save_checkpoint) if save_checkpoint else None
    l_ckp = "input_{}".format(load_checkpoint) if load_checkpoint else None

    # TODO! Representation runner needs this. It is not simple to do it in
    # recurrent encoder since there may be more source data series. The
    # best way could be to enter the data_id parameter manually to the
    # representation runner.
    self.data_id = data_id

    input_sequence = EmbeddedSequence(
        name="{}_input".format(name),
        vocabulary=vocabulary,
        data_id=data_id,
        embedding_size=embedding_size,
        max_length=max_input_len,
        save_checkpoint=s_ckp,
        load_checkpoint=l_ckp)

    RecurrentEncoder.__init__(
        self,
        name=name,
        input_sequence=input_sequence,
        rnn_size=rnn_size,
        dropout_keep_prob=dropout_keep_prob,
        rnn_cell=rnn_cell,
        attention_type=attention_type,
        attention_fertility=attention_fertility,
        attention_state_size=attention_state_size,
        save_checkpoint=save_checkpoint,
        load_checkpoint=load_checkpoint)
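# Illustrative usage sketch (hypothetical): the same constructor with an
# attention mechanism attached. `MyAttention` is a stand-in name for an
# attention class from this codebase; it is an assumption for the example,
# not a real import.

encoder = SentenceEncoder(
    name="attentive_encoder",
    vocabulary=vocabulary,  # an existing Vocabulary instance is assumed
    data_id="source",
    embedding_size=300,
    rnn_size=256,
    attention_type=MyAttention,   # hypothetical attention class
    attention_fertility=3,        # used by coverage-style attention
    attention_state_size=256)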
#!/usr/bin/env python3.5
"""Test init methods of encoders."""

import unittest
import copy

from typing import Dict, List, Any, Iterable

from neuralmonkey.encoders.numpy_encoder import (VectorEncoder,
                                                 PostCNNImageEncoder)
from neuralmonkey.encoders.recurrent import SentenceEncoder
from neuralmonkey.encoders.sentence_cnn_encoder import SentenceCNNEncoder
from neuralmonkey.model.sequence import EmbeddedSequence
from neuralmonkey.tests.test_vocabulary import VOCABULARY

INPUT_SEQUENCE = EmbeddedSequence("seq", VOCABULARY, "marmelade", 300)

SENTENCE_ENCODER_GOOD = {
    "name": ["encoder"],
    "vocabulary": [VOCABULARY],
    "data_id": ["marmelade"],
    "embedding_size": [20],
    "rnn_size": [30],
    "max_input_len": [None, 15],
    "dropout_keep_prob": [0.5, 1.],
}

SENTENCE_ENCODER_BAD = {
    "nonexistent": ["ahoj"],
    "name": [None, 1],
    "vocabulary": [0, None, "ahoj", dict()],