def testConcatInDepthWithSequence(self):
    """Checks depth-wise concatenation of two padded sequences (TF1 session API)."""
    first = [[[1], [-1], [-1]], [[1], [2], [3]], [[1], [2], [-1]]]
    second = [[[1], [2], [3], [4]], [[1], [2], [-1], [-1]], [[1], [2], [-1], [-1]]]
    expected = [
        [[1, 1], [0, 2], [0, 3], [0, 4]],
        [[1, 1], [2, 2], [3, 0], [0, 0]],
        [[1, 1], [2, 2], [0, 0], [0, 0]],
    ]
    first_lengths = [1, 3, 2]
    second_lengths = [4, 2, 2]
    reduced, length = reducer.ConcatReducer().reduce_sequence(
        [tf.constant(first, dtype=tf.float32),
         tf.constant(second, dtype=tf.float32)],
        [tf.constant(first_lengths), tf.constant(second_lengths)])
    # Depths 1 + 1 concatenate to 2.
    self.assertEqual(2, reduced.get_shape().as_list()[-1])
    with self.test_session() as sess:
        reduced, length = sess.run([reduced, length])
    self.assertAllEqual(expected, reduced)
    self.assertAllEqual([4, 3, 2], length)
def testMixedInputter(self):
    """Tests that a word + char-conv mixed inputter concatenates embeddings."""
    vocab_file = self._makeTextFile("vocab.txt", ["the", "world", "hello", "toto"])
    vocab_alt_file = self._makeTextFile("vocab_alt.txt", ["h", "e", "l", "w", "o"])
    data_file = self._makeTextFile("data.txt", ["hello world !"])
    embedders = [
        text_inputter.WordEmbedder(embedding_size=10),
        text_inputter.CharConvEmbedder(10, 5),
    ]
    mixed_inputter = inputter.MixedInputter(embedders, reducer=reducer.ConcatReducer())
    self.assertEqual(mixed_inputter.num_outputs, 1)
    features, transformed = self._makeDataset(
        mixed_inputter,
        data_file,
        data_config={"1_vocabulary": vocab_file, "2_vocabulary": vocab_alt_file},
        shapes={
            "char_ids": [None, None, None],
            "ids": [None, None],
            "length": [None],
        },
    )
    # Word (10) and char-conv (5) embedding dims concatenate to 15.
    self.assertAllEqual([1, 3, 15], transformed.shape)
def testMixedInputter(self):
    """Tests the word + char-conv mixed inputter with the TF1 session API."""
    vocab_file = self._makeTextFile("vocab.txt", ["the", "world", "hello", "toto"])
    vocab_alt_file = self._makeTextFile("vocab_alt.txt", ["h", "e", "l", "w", "o"])
    data_file = self._makeTextFile("data.txt", ["hello world !"])
    embedders = [
        text_inputter.WordEmbedder("vocabulary_file_1", embedding_size=10),
        text_inputter.CharConvEmbedder("vocabulary_file_2", 10, 5),
    ]
    mixed_inputter = inputter.MixedInputter(embedders, reducer=reducer.ConcatReducer())
    self.assertEqual(mixed_inputter.num_outputs, 1)
    features, transformed = self._makeDataset(
        mixed_inputter,
        data_file,
        metadata={"vocabulary_file_1": vocab_file,
                  "vocabulary_file_2": vocab_alt_file},
        shapes={"char_ids": [None, None, None],
                "ids": [None, None],
                "length": [None]})
    with self.test_session() as sess:
        # Graph mode needs explicit table and variable initialization.
        sess.run(tf.tables_initializer())
        sess.run(tf.global_variables_initializer())
        features, transformed = sess.run([features, transformed])
    self.assertAllEqual([1, 3, 15], transformed.shape)
def testConcatInDepthWithSequence(self):
    """Depth-wise concatenation of two padded sequences (eager/TF2 API)."""
    values = [
        [[[1], [-1], [-1]], [[1], [2], [3]], [[1], [2], [-1]]],
        [[[1], [2], [3], [4]], [[1], [2], [-1], [-1]], [[1], [2], [-1], [-1]]],
    ]
    lengths = [[1, 3, 2], [4, 2, 2]]
    expected = [
        [[1, 1], [0, 2], [0, 3], [0, 4]],
        [[1, 1], [2, 2], [3, 0], [0, 0]],
        [[1, 1], [2, 2], [0, 0], [0, 0]],
    ]
    reduced, length = reducer.ConcatReducer()(
        [tf.constant(v, dtype=tf.float32) for v in values],
        [tf.constant(l) for l in lengths],
    )
    # Depths 1 + 1 concatenate to 2.
    self.assertEqual(2, reduced.shape[-1])
    reduced, length = self.evaluate([reduced, length])
    self.assertAllEqual(expected, reduced)
    self.assertAllEqual([4, 3, 2], length)
def testConcatInTimeWithSequenceAndMaxTimeMismatch(self):
    """Time-axis concatenation when the padded max times of the inputs differ."""
    values = [
        [[[1], [-1], [-1]], [[1], [2], [3]], [[1], [2], [-1]]],
        [
            [[1], [2], [3], [4], [-1], [-1]],
            [[1], [2], [-1], [-1], [-1], [-1]],
            [[1], [2], [-1], [-1], [-1], [-1]],
        ],
    ]
    lengths = [[1, 3, 2], [4, 2, 2]]
    expected = [
        [[1], [1], [2], [3], [4]],
        [[1], [2], [3], [1], [2]],
        [[1], [2], [1], [2], [0]],
    ]
    reduced, length = reducer.ConcatReducer(axis=1)(
        [tf.constant(v, dtype=tf.float32) for v in values],
        [tf.constant(l) for l in lengths],
    )
    self.assertEqual(1, reduced.shape[-1])
    reduced, length = self.evaluate([reduced, length])
    self.assertAllEqual(expected, reduced)
    # Output lengths are the pairwise sums of the input lengths.
    self.assertAllEqual([5, 5, 4], length)
def __init__(self,
             num_layers,
             num_units,
             bidirectional=False,
             reducer=None,
             dropout=0,
             residual_connections=False,
             **kwargs):
    """Initializes the layer.

    Args:
      num_layers: Number of stacked GRU layers.
      num_units: Dimension of the output space of each GRU.
      bidirectional: Make each layer bidirectional.
      reducer: A :class:`opennmt.layers.Reducer` instance to merge the
        bidirectional states and outputs of each layer. Defaults to a
        :class:`opennmt.layers.ConcatReducer`.
      dropout: The probability to drop units in each layer output.
      residual_connections: If ``True``, each layer input will be added to its
        output.
      **kwargs: Additional layer arguments.
    """
    super(GRU, self).__init__(**kwargs)
    # Fix: avoid instantiating a Reducer layer as a default argument value.
    # Defaults are evaluated once at import time, so the previous default
    # shared a single ConcatReducer instance across every GRU created.
    if reducer is None:
        reducer = reducer_lib.ConcatReducer()
    rnn_layers = [
        _RNNWrapper(
            tf.keras.layers.GRU(num_units, return_sequences=True, return_state=True),
            bidirectional=bidirectional,
            reducer=reducer)
        for _ in range(num_layers)
    ]
    self.layers = [
        common.LayerWrapper(
            layer, output_dropout=dropout, residual_connection=residual_connections)
        for layer in rnn_layers
    ]
def testNestedInputtersWithFlatDataFiles(self):
    """A nested ParallelInputter structure accepts a flat list of data files."""
    inner = inputter.ParallelInputter(
        [
            record_inputter.SequenceRecordInputter(10),
            record_inputter.SequenceRecordInputter(10),
        ],
        reducer=reducer.SumReducer(),
    )
    outer = inputter.ParallelInputter(
        [record_inputter.SequenceRecordInputter(10), inner],
        reducer=reducer.ConcatReducer(),
    )
    self.assertListEqual(outer._structure(), [None, [None, None]])
    empty_file = os.path.join(self.get_temp_dir(), "test.txt")
    open(empty_file, "w").close()
    # Two files cannot fill a structure that expects three inputs.
    with self.assertRaises(ValueError):
        outer.make_inference_dataset([empty_file, empty_file], batch_size=2)
    outer.make_inference_dataset([empty_file, empty_file, empty_file], batch_size=2)
def testConcatInTimeWithSequenceAndMaxTimeMismatch(self):
    """Time-axis concatenation with mismatched max times (TF1 session API)."""
    first = [[[1], [-1], [-1]], [[1], [2], [3]], [[1], [2], [-1]]]
    second = [
        [[1], [2], [3], [4], [-1], [-1]],
        [[1], [2], [-1], [-1], [-1], [-1]],
        [[1], [2], [-1], [-1], [-1], [-1]],
    ]
    expected = [
        [[1], [1], [2], [3], [4]],
        [[1], [2], [3], [1], [2]],
        [[1], [2], [1], [2], [0]],
    ]
    first_lengths = [1, 3, 2]
    second_lengths = [4, 2, 2]
    reduced, length = reducer.ConcatReducer(axis=1).reduce_sequence(
        [tf.constant(first, dtype=tf.float32),
         tf.constant(second, dtype=tf.float32)],
        [tf.constant(first_lengths), tf.constant(second_lengths)])
    self.assertEqual(1, reduced.get_shape().as_list()[-1])
    with self.test_session() as sess:
        reduced, length = sess.run([reduced, length])
    self.assertAllEqual(expected, reduced)
    # Output lengths are the pairwise sums of the input lengths.
    self.assertAllEqual([5, 5, 4], length)
def testParallelEncoder(self):
    """Two parallel encoders with time-axis concatenation of their outputs."""
    lengths = [[3, 5, 2], [6, 6, 4]]
    inputs = [tf.zeros([3, 5, 10]), tf.zeros([3, 6, 10])]
    encoder = encoders.ParallelEncoder(
        [DenseEncoder(1, 20), DenseEncoder(2, 20)],
        outputs_reducer=reducer.ConcatReducer(axis=1))
    outputs, state, encoded_length = encoder(inputs, sequence_length=lengths)
    self.assertEqual(len(state), 3)
    outputs, encoded_length = self.evaluate([outputs, encoded_length])
    # Time dims 5 + 6 concatenate to 11; lengths are summed pairwise.
    self.assertAllEqual([3, 11, 20], outputs.shape)
    self.assertAllEqual([9, 11, 6], encoded_length)
def testBRNN(self):
    """A bidirectional RNN with a ConcatReducer doubles the output depth."""
    cell = rnn.make_rnn_cell(3, 10, dropout=0.1, residual_connections=True)
    layer = rnn.RNN(cell, bidirectional=True, reducer=reducer.ConcatReducer())
    outputs, states = layer(tf.random.uniform([4, 5, 5]), training=True)
    # Forward and backward outputs of size 10 concatenate to 20.
    self.assertListEqual(outputs.shape.as_list(), [4, 5, 20])
    self.assertIsInstance(states, tuple)
    self.assertEqual(len(states), 3)
    self.assertEqual(len(states[0]), 2)
    self.assertListEqual(states[0][0].shape.as_list(), [4, 20])
def _encodeInParallel(self,
                      inputs,
                      sequence_length=None,
                      outputs_layer_fn=None,
                      combined_output_layer_fn=None):
    """Runs two dense encoders in parallel and returns the evaluated outputs."""
    encoder = encoders.ParallelEncoder(
        [DenseEncoder(1, 20), DenseEncoder(1, 20)],
        outputs_reducer=reducer.ConcatReducer(),
        outputs_layer_fn=outputs_layer_fn,
        combined_output_layer_fn=combined_output_layer_fn)
    outputs, _, _ = encoder(inputs, sequence_length=sequence_length)
    return self.evaluate(outputs)
def _encodeInParallel(self,
                      inputs,
                      sequence_length=None,
                      outputs_layer_fn=None,
                      combined_output_layer_fn=None):
    """Encodes inputs with two parallel unidirectional RNN encoders."""
    encoder = encoders.ParallelEncoder(
        [encoders.UnidirectionalRNNEncoder(1, 20),
         encoders.UnidirectionalRNNEncoder(1, 20)],
        outputs_reducer=reducer.ConcatReducer(),
        outputs_layer_fn=outputs_layer_fn,
        combined_output_layer_fn=combined_output_layer_fn)
    return encoder.encode(inputs, sequence_length=sequence_length)
def __init__(self, cell, bidirectional=False, reducer=None, **kwargs):
    """Initializes the layer.

    Args:
      cell: The RNN cell to use.
      bidirectional: Make this layer bidirectional.
      reducer: A :class:`opennmt.layers.Reducer` instance to merge
        bidirectional states and outputs. Defaults to a
        :class:`opennmt.layers.ConcatReducer`.
      **kwargs: Additional layer arguments.

    See Also:
      :func:`opennmt.layers.make_rnn_cell`
    """
    # Fix: avoid instantiating a Reducer layer as a default argument value.
    # Defaults are evaluated once at import time, so the previous default
    # shared a single ConcatReducer instance across every RNN created.
    if reducer is None:
        reducer = reducer_lib.ConcatReducer()
    rnn = tf.keras.layers.RNN(cell, return_sequences=True, return_state=True)
    super(RNN, self).__init__(rnn, bidirectional=bidirectional, reducer=reducer, **kwargs)
def testParallelEncoder(self):
    """Parallel encoder test compatible with both TF1 and TF2 execution."""
    lengths = [[3, 5, 2], [6, 6, 4]]
    inputs = [tf.zeros([3, 5, 10]), tf.zeros([3, 6, 10])]
    encoder = encoders.ParallelEncoder(
        [DenseEncoder(1, 20), DenseEncoder(2, 20)],
        outputs_reducer=reducer.ConcatReducer(axis=1))
    outputs, state, encoded_length = encoder.encode(inputs, sequence_length=lengths)
    self.assertEqual(len(state), 3)
    if not compat.is_tf2():
        # Graph mode requires explicit variable initialization.
        with self.test_session() as sess:
            sess.run(tf.global_variables_initializer())
    outputs, encoded_length = self.evaluate([outputs, encoded_length])
    # Time dims 5 + 6 concatenate to 11; lengths are summed pairwise.
    self.assertAllEqual([3, 11, 20], outputs.shape)
    self.assertAllEqual([9, 11, 6], encoded_length)
def _encodeInParallel(self,
                      inputs,
                      sequence_length=None,
                      outputs_layer_fn=None,
                      combined_output_layer_fn=None):
    """Encodes with two parallel dense encoders (TF1/TF2 compatible)."""
    encoder = encoders.ParallelEncoder(
        [DenseEncoder(1, 20), DenseEncoder(1, 20)],
        outputs_reducer=reducer.ConcatReducer(),
        outputs_layer_fn=outputs_layer_fn,
        combined_output_layer_fn=combined_output_layer_fn)
    outputs, _, _ = encoder.encode(inputs, sequence_length=sequence_length)
    if not compat.is_tf2():
        # Graph mode requires explicit variable initialization.
        with self.test_session() as sess:
            sess.run(tf.global_variables_initializer())
    return self.evaluate(outputs)
def testParallelEncoderSameInput(self):
    """Feeding one input to a parallel encoder concatenates output depths."""
    sequence_length = [17, 21, 20]
    inputs = _build_dummy_sequences(sequence_length)
    encoder = encoders.ParallelEncoder(
        [encoders.UnidirectionalRNNEncoder(1, 20),
         encoders.UnidirectionalRNNEncoder(1, 20)],
        outputs_reducer=reducer.ConcatReducer())
    outputs, _, encoded_length = encoder.encode(
        inputs, sequence_length=sequence_length)
    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())
        outputs, encoded_length = sess.run([outputs, encoded_length])
    # 20 + 20 output units concatenate to 40; lengths are unchanged.
    self.assertAllEqual([3, 21, 40], outputs.shape)
    self.assertAllEqual(sequence_length, encoded_length)
def __init__(self, rnn, bidirectional=False, reducer=None, **kwargs):
    """Initializes the layer.

    Args:
      rnn: The RNN layer to extend, built with ``return_sequences`` and
        ``return_state`` enabled.
      bidirectional: Make this layer bidirectional.
      reducer: A :class:`opennmt.layers.Reducer` instance to merge
        bidirectional states and outputs. Defaults to a
        :class:`opennmt.layers.ConcatReducer`.
      **kwargs: Additional layer arguments.
    """
    super(_RNNWrapper, self).__init__(**kwargs)
    # Fix: avoid instantiating a Reducer layer as a default argument value --
    # defaults are evaluated once at import time, so the previous default
    # shared a single ConcatReducer instance across every wrapper created.
    if reducer is None:
        reducer = reducer_lib.ConcatReducer()
    self.rnn = rnn
    self.reducer = reducer
    self.bidirectional = bidirectional
    if bidirectional:
        # merge_mode=None keeps forward and backward outputs separate so the
        # reducer can merge them explicitly.
        self.rnn = tf.keras.layers.Bidirectional(self.rnn, merge_mode=None)
def testParallelEncoder(self):
    """Two RNN encoders over two inputs, concatenated along the time axis."""
    sequence_lengths = [[17, 21, 20], [10, 9, 15]]
    inputs = [_build_dummy_sequences(length) for length in sequence_lengths]
    encoder = encoders.ParallelEncoder(
        [encoders.UnidirectionalRNNEncoder(1, 20),
         encoders.UnidirectionalRNNEncoder(1, 20)],
        outputs_reducer=reducer.ConcatReducer(axis=1))
    outputs, state, encoded_length = encoder.encode(
        inputs, sequence_length=sequence_lengths)
    self.assertEqual(2, len(state))
    for s in state:
        self.assertIsInstance(s, tf.contrib.rnn.LSTMStateTuple)
    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())
        outputs, encoded_length = sess.run([outputs, encoded_length])
    self.assertAllEqual([3, 35, 20], outputs.shape)
    # Encoded lengths are the pairwise sums of the two inputs' lengths.
    self.assertAllEqual([27, 30, 35], encoded_length)
def testMixedInputter(self):
    """Word + char-conv mixed inputter (TF1 session API, no tokens feature)."""
    vocab_file = os.path.join(self.get_temp_dir(), "vocab.txt")
    vocab_alt_file = os.path.join(self.get_temp_dir(), "vocab_alt.txt")
    data_file = os.path.join(self.get_temp_dir(), "data.txt")
    with open(vocab_file, "w") as vocab:
        vocab.write("the\nworld\nhello\ntoto\n")
    with open(vocab_alt_file, "w") as vocab_alt:
        vocab_alt.write("h\ne\nl\nw\no\n")
    with open(data_file, "w") as data:
        data.write("hello world !\n")
    mixed_inputter = inputter.MixedInputter(
        [text_inputter.WordEmbedder("vocabulary_file_1", embedding_size=10),
         text_inputter.CharConvEmbedder("vocabulary_file_2", 10, 5)],
        reducer=reducer.ConcatReducer())
    features, transformed = self._makeDataset(
        mixed_inputter,
        data_file,
        metadata={"vocabulary_file_1": vocab_file,
                  "vocabulary_file_2": vocab_alt_file},
        shapes={"char_ids": [None, None, None],
                "ids": [None, None],
                "length": [None]})
    self.assertNotIn("tokens", features)
    with self.test_session() as sess:
        sess.run(tf.tables_initializer())
        sess.run(tf.global_variables_initializer())
        features, transformed = sess.run([features, transformed])
    self.assertAllEqual([1, 3, 15], transformed.shape)
def testMixedInputter(self):
    """Word + char-conv mixed inputter: checks features and serving receiver.

    Fix: ``vocab_file``, ``vocab_alt_file`` and ``data_file`` were used
    without being defined anywhere in the function, raising a ``NameError``;
    they are now created in the test's temp directory as the sibling
    ``testMixedInputter`` variants do.
    """
    vocab_file = os.path.join(self.get_temp_dir(), "vocab.txt")
    vocab_alt_file = os.path.join(self.get_temp_dir(), "vocab_alt.txt")
    data_file = os.path.join(self.get_temp_dir(), "data.txt")
    with open(vocab_file, "w") as vocab:
        vocab.write("the\n" "world\n" "hello\n" "toto\n")
    with open(vocab_alt_file, "w") as vocab_alt:
        vocab_alt.write("h\n" "e\n" "l\n" "w\n" "o\n")
    with open(data_file, "w") as data:
        data.write("hello world !\n")
    mixed_inputter = inputter.MixedInputter(
        [text_inputter.WordEmbedder("vocabulary_file_1", embedding_size=10),
         text_inputter.CharConvEmbedder("vocabulary_file_2", 10, 5)],
        reducer=reducer.ConcatReducer())
    data, transformed = _first_element(
        mixed_inputter,
        data_file,
        {"vocabulary_file_1": vocab_file, "vocabulary_file_2": vocab_alt_file})
    input_receiver = mixed_inputter.get_serving_input_receiver()
    self.assertIn("ids", input_receiver.features)
    self.assertIn("char_ids", input_receiver.features)
    with self.test_session() as sess:
        sess.run(tf.tables_initializer())
        sess.run(tf.global_variables_initializer())
        data, transformed = sess.run([data, transformed])
    self.assertNotIn("raw", data)
    self.assertNotIn("tokens", data)
    self.assertIn("ids", data)
    self.assertIn("char_ids", data)
    # Word (10) and char-conv (5) embedding dims concatenate to 15.
    self.assertAllEqual([1, 3, 15], transformed.shape)