def test_expand_dims_for_broadcast():
    """Check the shape produced by ``expand_dims_for_broadcast``.

    A length-2 weight vector expanded against a (2, 3, 3) tensor must evaluate
    to shape (2, 1, 1).  A length-3 weight vector expanded against the same
    tensor must raise ``InvalidArgumentError`` when evaluated.
    """
    with clean_session():
        # Both outer entries of the target tensor hold the same 3x3 block.
        block = [
            [1, 2, 3],
            [4, 5, 6],
            [4, 5, 6],
        ]
        target = tf.constant([block, block], dtype=tf.float32)
        good_weights = tf.constant([1, 2], dtype=tf.float32)

        assert target.get_shape().as_list() == [2, 3, 3]
        assert good_weights.get_shape().as_list() == [2]

        expanded = expand_dims_for_broadcast(good_weights, target)
        assert expanded.eval().shape == (2, 1, 1)

        # Length 3 does not match the target's leading dimension of 2; the
        # failure is only expected at evaluation time, not at graph build time.
        mismatched = tf.constant([1, 2, 3], dtype=tf.float32)
        expanded_mismatched = expand_dims_for_broadcast(mismatched, target)
        with pytest.raises(InvalidArgumentError):
            expanded_mismatched.eval()
def test_seq_length(self):
    """Check right-aligned feeding when ``seq_length`` is fixed to 4.

    Verifies the feed dict built by ``inputs_to_feed_dict`` against expected
    index and mask arrays, and that the placeholders then report a static
    shape of ``[None, 4]``.
    """
    vocab = VocabExample(u'<unk> a b c'.split(), '<unk>')

    sequences = [
        u'a b a b c'.split(),  # more than length 4
        u'a b'.split(),
        [u'b'],
        [u'c'],
    ]

    expected_indices = np.array(
        [[2, 1, 2, 3],
         [0, 0, 1, 2],
         [0, 0, 0, 2],
         [0, 0, 0, 3]],
        dtype=np.int32)

    expected_mask = np.array(
        [[1, 1, 1, 1],
         [0, 0, 1, 1],
         [0, 0, 0, 1],
         [0, 0, 0, 1]],
        dtype=np.float32)

    with clean_session():
        feeder = FeedSequenceBatch(align='right', seq_length=4)
        actual_feed = feeder.inputs_to_feed_dict(sequences, vocab)
        expected_feed = {
            feeder.values: expected_indices,
            feeder.mask: expected_mask,
        }
        assert_array_collections_equal(expected_feed, actual_feed)

        # With a fixed seq_length the second dimension is known statically.
        values_tensor = tf.identity(feeder.values)
        mask_tensor = tf.identity(feeder.mask)
        assert values_tensor.get_shape().as_list() == [None, 4]
        assert mask_tensor.get_shape().as_list() == [None, 4]
def test(self):
    """Check ``ConcatSequenceEmbedder`` output with default settings.

    Each expected row is the concatenation of the 3-dimensional token
    embeddings of one sequence; the two-token sequence is followed by zeros.
    """
    vocab = SimpleVocab(u'a b c d'.split())
    sequences = [
        ['a', 'b', 'c', 'd'],
        ['c', 'd'],
    ]

    expected = np.array(
        [[1, 2, 0, 3, 4, 1, 5, 6, 0, 7, 8, 1],
         [5, 6, 0, 7, 8, 1, 0, 0, 0, 0, 0, 0]],
        dtype=np.float32)

    with clean_session():
        embed_matrix = tf.constant(
            [[1, 2, 0],
             [3, 4, 1],
             [5, 6, 0],
             [7, 8, 1]],
            dtype=tf.float32)
        embedder = ConcatSequenceEmbedder(embed_matrix)
        actual = embedder.compute(embedder.embeds, sequences, vocab)

    assert_array_almost_equal(expected, actual, decimal=5)
def test(self):
    """Check ``reduce_max`` on a masked batch of 3-dimensional sequences.

    The first row masks out its last step (which holds the largest values), so
    the expected maximum for that row comes from the unmasked steps only.  A
    mask whose first row is entirely zero must raise ``InvalidArgumentError``
    when the reduction is evaluated.
    """
    def as_float32(rows):
        return np.array(rows, dtype=np.float32)

    expected = as_float32([
        as_float32([3, 5, 7]),
        as_float32([3, 5, 7]),
        as_float32([9, 9, 9]),
    ])

    with clean_session():
        values = tf.constant([
            [[1., 2., 3.], [3., 5., 7.], [100., 200., 2000.]],
            [[2., 4., 6.], [3., 5., 7.], [3., 5., 7.]],
            [[9., 9., 9.], [3., 5., 7.], [1., 2., 3.]],
        ], dtype=tf.float32)
        mask = tf.constant(
            [[1, 1, 0],
             [1, 1, 1],
             [1, 1, 1]],
            dtype=tf.float32)

        maxima = reduce_max(SequenceBatch(values, mask))
        assert_almost_equal(maxima.eval(), expected, decimal=5)

        # First row has no unmasked steps at all.
        empty_row_mask = tf.constant(
            [[0, 0, 0],
             [1, 1, 1],
             [1, 1, 1]],
            dtype=tf.float32)

        # NOTE(review): the failure case is built with reduce_mean even though
        # the rest of this test exercises reduce_max -- confirm this is intended.
        failing = reduce_mean(SequenceBatch(values, empty_row_mask))
        with pytest.raises(InvalidArgumentError):
            failing.eval()
def test(self):
    """Check ``ConcatSequenceEmbedder`` with ``seq_length=3`` and right alignment.

    The four-token sequence is expected to contribute only three token
    embeddings, and the two-token sequence is expected to be preceded by one
    block of zeros.  Also verifies that the static output shape is known.
    """
    seq_length = 3
    embed_dim = 3

    vocab = SimpleVocab(u'a b c d'.split())
    sequences = [
        ['a', 'b', 'c', 'd'],
        ['c', 'd'],
    ]

    expected = np.array([
        [3, 4, 1, 5, 6, 0, 7, 8, 1],
        [0, 0, 0, 5, 6, 0, 7, 8, 1],
    ], dtype=np.float32)

    with clean_session():
        embed_matrix = tf.constant(
            [[1, 2, 0],
             [3, 4, 1],
             [5, 6, 0],
             [7, 8, 1]],
            dtype=tf.float32)
        embedder = ConcatSequenceEmbedder(
            embed_matrix, seq_length=seq_length, align='right')
        actual = embedder.compute(embedder.embeds, sequences, vocab)

        # check that static shape inference works
        static_shape = embedder.embeds.get_shape().as_list()
        assert static_shape == [None, seq_length * embed_dim]

    assert_array_almost_equal(expected, actual, decimal=5)
def test_batch_mean(self):
    """Check ``reduce_mean`` on a masked 2-D batch, including empty rows.

    Expected values are the mean of the unmasked entries in each row.  A mask
    with an all-zero row must raise ``InvalidArgumentError`` on evaluation
    unless ``allow_empty=True`` is passed, in which case that row yields 0.
    """
    expected = np.array([-2. / 3, 1., 21. / 4])
    expected_with_empty = np.array([-2. / 3, 0., 21. / 4])

    with clean_session():
        values = tf.constant(
            [[1, -8, 5, 4, 9],
             [0, 2, 7, 8, 1],
             [2, -8, 6, 4, 9]],
            dtype=tf.float32)

        mask = tf.constant(
            [[1, 1, 1, 0, 0],
             [1, 1, 0, 0, 0],
             [1, 0, 1, 1, 1]],
            dtype=tf.float32)

        # Same as ``mask`` except that the middle row is fully masked out.
        empty_row_mask = tf.constant(
            [[1, 1, 1, 0, 0],
             [0, 0, 0, 0, 0],
             [1, 0, 1, 1, 1]],
            dtype=tf.float32)

        means = reduce_mean(SequenceBatch(values, mask))
        assert_almost_equal(means.eval(), expected, decimal=5)

        strict_means = reduce_mean(SequenceBatch(values, empty_row_mask))
        with pytest.raises(InvalidArgumentError):
            strict_means.eval()

        # try allow_empty option
        lenient_means = reduce_mean(
            SequenceBatch(values, empty_row_mask), allow_empty=True)
        assert_almost_equal(lenient_means.eval(), expected_with_empty)
def test_multidim(self):
    """Check ``reduce_mean`` on a masked batch of 3-dimensional sequences.

    Each expected row is the element-wise sum of that row's unmasked steps
    divided by the number of unmasked steps.
    """
    def as_float32(rows):
        return np.array(rows, dtype=np.float32)

    expected = as_float32([
        as_float32([4, 7, 10]) / 2,
        as_float32([8, 14, 20]) / 3,
        as_float32([13, 16, 19]) / 3,
    ])

    with clean_session():
        values = tf.constant([
            [[1., 2., 3.], [3., 5., 7.], [0., 0., 0.]],
            [[2., 4., 6.], [3., 5., 7.], [3., 5., 7.]],
            [[9., 9., 9.], [3., 5., 7.], [1., 2., 3.]],
        ], dtype=tf.float32)
        mask = tf.constant(
            [[1, 1, 0],
             [1, 1, 1],
             [1, 1, 1]],
            dtype=tf.float32)

        means = reduce_mean(SequenceBatch(values, mask))
        assert_almost_equal(means.eval(), expected, decimal=5)
def test_lstm(self):
    """Test whether the mask works properly for LSTM embedder.

    Two properties are checked for the short sequence ``['c', 'd']``: its
    hidden states stop changing once the padding starts, and its results are
    the same when it is embedded by a second model with a longer
    ``seq_length`` that has been given the first model's weights.
    """
    hidden_size = 7
    vocab = SimpleVocab(u'a b c d'.split())

    sequences = [
        ['a', 'b', 'c', 'd'],
        ['c', 'd'],
        ['a', 'b', 'c', 'd'],
    ]
    # The short sequence sits at index 1 above and at index 2 below.
    sequences_alt = [
        ['a', 'b', 'c', 'd', 'a', 'b', 'd', 'c'],
        ['b', 'a', 'd'],
        ['c', 'd'],
    ]

    with clean_session():
        embed_matrix = tf.constant(
            [[1, 2, 0],
             [3, 4, 1],
             [5, 6, 0],
             [7, 8, 1]],
            dtype=tf.float32)

        small = LSTMSequenceEmbedder(
            embed_matrix, seq_length=4, hidden_size=hidden_size)
        embeds, states = small.compute(
            [small.embeds, small.hidden_states.values], sequences, vocab)
        assert embeds.shape == (3, 7)
        assert states.shape == (3, 4, 7)

        # Padded spaces should have the same hidden states
        last_real_state = states[1, 1, :]
        for padded_step in (2, 3):
            assert_array_almost_equal(
                last_real_state, states[1, padded_step, :], decimal=5)

        # Try again but with different paddings
        # Should get the same result for ['c', 'd']
        large = LSTMSequenceEmbedder(
            embed_matrix, seq_length=8, hidden_size=hidden_size)
        large.weights = small.weights  # match weights

        embeds_alt, states_alt = large.compute(
            [large.embeds, large.hidden_states.values], sequences_alt, vocab)
        assert embeds_alt.shape == (3, 7)
        assert states_alt.shape == (3, 8, 7)

    assert_array_almost_equal(embeds[1, :], embeds_alt[2, :], decimal=5)
    assert_array_almost_equal(
        states[1, :2, :], states_alt[2, :2, :], decimal=5)
def test_no_sequences(self):
    """Check that feeding an empty list of sequences yields (0, 0) arrays."""
    vocab = SimpleVocab(u'a b c'.split())

    with clean_session():
        feeder = FeedSequenceBatch()
        fetches = [tf.identity(feeder.values), tf.identity(feeder.mask)]
        values_out, mask_out = feeder.compute(fetches, [], vocab)

    # Both outputs must be empty along the batch and the time dimension.
    assert values_out.shape == mask_out.shape
    assert mask_out.shape == (0, 0)
def test_broadcast():
    """Check that a (3, 2) mask can be broadcast to a (3, 2, 2) tensor.

    The mask is first expanded to (3, 2, 1) by ``expand_dims_for_broadcast``
    and then tiled to the full shape by ``broadcast``; each mask entry must be
    repeated along the last axis.
    """
    expected = np.array(
        [[[1, 1], [0, 0]],
         [[1, 1], [1, 1]],
         [[0, 0], [1, 1]]],
        dtype=np.float32)

    with clean_session():
        values = tf.constant(
            [[[1, 2], [1, 2]],
             [[1, 2], [3, 4]],
             [[5, 6], [7, 8]]],
            dtype=tf.float32)
        flat_mask = tf.constant(
            [[1, 0],
             [1, 1],
             [0, 1]],
            dtype=tf.float32)

        assert values.get_shape().as_list() == [3, 2, 2]
        assert flat_mask.get_shape().as_list() == [3, 2]

        expanded_mask = expand_dims_for_broadcast(flat_mask, values)
        assert expanded_mask.get_shape().as_list() == [3, 2, 1]

        full_mask = broadcast(expanded_mask, values)
        assert full_mask.get_shape().as_list() == [3, 2, 2]

        assert_array_equal(full_mask.eval(), expected)
def test(self):
    """Check ``reduce_sum`` on a masked 2-D batch.

    Each expected value is the sum of the unmasked entries in that row.
    """
    expected = np.array([-2, 2, 21])

    with clean_session():
        values = tf.constant(
            [[1, -8, 5, 4, 9],
             [0, 2, 7, 8, 1],
             [2, -8, 6, 4, 9]],
            dtype=tf.float32)
        mask = tf.constant(
            [[1, 1, 1, 0, 0],
             [1, 1, 0, 0, 0],
             [1, 0, 1, 1, 1]],
            dtype=tf.float32)

        totals = reduce_sum(SequenceBatch(values, mask))
        assert_almost_equal(totals.eval(), expected, decimal=5)
def test_broadcast():
    """Check ``expand_dims_for_broadcast`` followed by ``broadcast`` on a mask.

    Starting from a (3, 2) mask and a (3, 2, 2) value tensor, the static shape
    must be (3, 2, 1) after expansion and (3, 2, 2) after broadcasting, and the
    evaluated result must repeat every mask entry along the last axis.
    """
    value_rows = [
        [[1, 2], [1, 2]],
        [[1, 2], [3, 4]],
        [[5, 6], [7, 8]],
    ]
    mask_rows = [
        [1, 0],
        [1, 1],
        [0, 1],
    ]
    expected_rows = [
        [[1, 1], [0, 0]],
        [[1, 1], [1, 1]],
        [[0, 0], [1, 1]],
    ]

    with clean_session():
        values = tf.constant(value_rows, dtype=tf.float32)
        mask_2d = tf.constant(mask_rows, dtype=tf.float32)
        expected = np.array(expected_rows, dtype=np.float32)

        assert values.get_shape().as_list() == [3, 2, 2]
        assert mask_2d.get_shape().as_list() == [3, 2]

        mask_3d = expand_dims_for_broadcast(mask_2d, values)
        assert mask_3d.get_shape().as_list() == [3, 2, 1]

        mask_full = broadcast(mask_3d, values)
        assert mask_full.get_shape().as_list() == [3, 2, 2]

        result = mask_full.eval()
        assert_array_equal(result, expected)
def test_right_align(self, inputs):
    """Check the feed dict produced by a right-aligned ``FeedSequenceBatch``.

    Args:
        inputs: a pair ``(args, kwargs)`` forwarded to ``inputs_to_feed_dict``.
            Presumably supplied by a pytest fixture defined elsewhere -- verify
            against the enclosing class.
    """
    expected_indices = np.array(
        [[1, 1, 2, 2, 3],
         [0, 0, 0, 1, 2],
         [0, 0, 0, 0, 2],
         [0, 0, 0, 0, 3]],
        dtype=np.int32)

    expected_mask = np.array(
        [[1, 1, 1, 1, 1],
         [0, 0, 0, 1, 1],
         [0, 0, 0, 0, 1],
         [0, 0, 0, 0, 1]],
        dtype=np.float32)

    with clean_session():
        feeder = FeedSequenceBatch(align='right')
        expected_feed = {
            feeder.values: expected_indices,
            feeder.mask: expected_mask,
        }

        positional, keyword = inputs
        actual_feed = feeder.inputs_to_feed_dict(*positional, **keyword)
        assert_array_collections_equal(expected_feed, actual_feed)
def test(self):
    """Check ``reduce_max`` on a masked batch of 3-dimensional sequences.

    The large values in the last step of the first row are masked out and
    must not appear in the result.  A mask with an all-zero first row must
    raise ``InvalidArgumentError`` when the reduction is evaluated.
    """
    def to_float32(data):
        return np.array(data, dtype=np.float32)

    expected_max = to_float32([
        to_float32([3, 5, 7]),
        to_float32([3, 5, 7]),
        to_float32([9, 9, 9]),
    ])

    with clean_session():
        batch_values = tf.constant(
            [
                [[1., 2., 3.], [3., 5., 7.], [100., 200., 2000.]],
                [[2., 4., 6.], [3., 5., 7.], [3., 5., 7.]],
                [[9., 9., 9.], [3., 5., 7.], [1., 2., 3.]],
            ],
            dtype=tf.float32)
        valid_mask = tf.constant([
            [1, 1, 0],
            [1, 1, 1],
            [1, 1, 1],
        ], dtype=tf.float32)

        reduced = reduce_max(SequenceBatch(batch_values, valid_mask))
        assert_almost_equal(reduced.eval(), expected_max, decimal=5)

        # The first row of this mask selects nothing.
        invalid_mask = tf.constant([
            [0, 0, 0],
            [1, 1, 1],
            [1, 1, 1],
        ], dtype=tf.float32)

        # NOTE(review): reduce_mean (not reduce_max) is used for the failure
        # case below -- confirm whether that is deliberate.
        reduced_invalid = reduce_mean(SequenceBatch(batch_values, invalid_mask))
        with pytest.raises(InvalidArgumentError):
            reduced_invalid.eval()
def test_lstm(self):
    """Test whether the mask works properly for bidi LSTM embedder.

    The first ``hidden_size`` output columns are treated as the forward half
    and the remaining columns as the backward half.  For the short sequence
    ``['c', 'd']`` the test checks where the final embedding comes from, how
    padded steps behave in each half, and that the results are the same when
    a second model with a longer ``seq_length`` is given the same weights.
    """
    hidden_size = 7
    forward = slice(None, hidden_size)   # same as [:7]
    backward = slice(hidden_size, None)  # same as [7:]

    vocab = SimpleVocab(u'a b c d'.split())
    sequences = [
        ['a', 'b', 'c', 'd'],
        ['c', 'd'],
        ['a', 'b', 'c', 'd'],
    ]
    # The short sequence sits at index 1 above and at index 2 below.
    sequences_alt = [
        ['a', 'b', 'c', 'd', 'a', 'b', 'd', 'c'],
        ['b', 'a', 'd'],
        ['c', 'd'],
    ]

    with clean_session():
        embed_matrix = tf.constant(
            [[1, 2, 0],
             [3, 4, 1],
             [5, 6, 0],
             [7, 8, 1]],
            dtype=tf.float32)

        small = BidiLSTMSequenceEmbedder(
            embed_matrix, seq_length=4, hidden_size=hidden_size)
        embeds, states = small.compute(
            [small.embeds, small.hidden_states.values], sequences, vocab)
        assert embeds.shape == (3, 14)
        assert states.shape == (3, 4, 14)

        # The embedding's forward half equals the state at the last real
        # token (step 1); its backward half equals the state at step 0.
        assert_array_almost_equal(
            embeds[1, forward], states[1, 1, forward], decimal=5)
        assert_array_almost_equal(
            embeds[1, backward], states[1, 0, backward], decimal=5)

        padded_steps = (2, 3)

        # Padded spaces should have the same forward embeddings
        for step in padded_steps:
            assert_array_almost_equal(
                states[1, 1, forward], states[1, step, forward], decimal=5)

        # Padded spaces should have 0 backward embeddings
        for step in padded_steps:
            assert_array_almost_equal(
                np.zeros((7, )), states[1, step, backward], decimal=5)

        # Other spaces should not have 0 embeddings with very high probability
        for half in (forward, backward):
            for step in (0, 1):
                assert np.linalg.norm(states[1, step, half]) > 1e-5

        # Try again but with different paddings
        # Should get the same result for ['c', 'd']
        large = BidiLSTMSequenceEmbedder(
            embed_matrix, seq_length=8, hidden_size=hidden_size)
        large.weights = small.weights  # match weights

        embeds_alt, states_alt = large.compute(
            [large.embeds, large.hidden_states.values], sequences_alt, vocab)
        assert embeds_alt.shape == (3, 14)
        assert states_alt.shape == (3, 8, 14)

    assert_array_almost_equal(embeds[1, :], embeds_alt[2, :], decimal=5)
    assert_array_almost_equal(
        states[1, :2, :], states_alt[2, :2, :], decimal=5)
def test_empty(self):
    """Check that ``reduce_mean`` handles a batch with zero rows.

    Values of shape (0, 10, 20) with a (0, 10) mask must reduce to an array
    of shape (0, 20).
    """
    with clean_session():
        # np.empty leaves contents uninitialised, which is harmless here
        # because both arrays contain zero elements.
        empty_values = tf.constant(np.empty((0, 10, 20)))
        empty_mask = tf.constant(np.empty((0, 10)))

        means = reduce_mean(SequenceBatch(empty_values, empty_mask))
        result = means.eval()

    assert result.shape == (0, 20)
def clean_test_session():
    """Yield the session opened by ``clean_session`` for the duration of a test.

    NOTE(review): presumably registered as a pytest fixture by a decorator that
    is not visible here -- confirm.
    """
    with clean_session() as session:
        yield session