def test(self):
    token_vocab = SimpleVocab(u'a b c d'.split())
    sequences = [
        ['a', 'b', 'c', 'd'],
        ['c', 'd'],
    ]
    correct_embeds = np.array([
        [1, 2, 0, 3, 4, 1, 5, 6, 0, 7, 8, 1],
        [5, 6, 0, 7, 8, 1, 0, 0, 0, 0, 0, 0],
    ], dtype=np.float32)

    with clean_session():
        token_embeds = tf.constant([
            [1, 2, 0],
            [3, 4, 1],
            [5, 6, 0],
            [7, 8, 1],
        ], dtype=tf.float32)
        model = ConcatSequenceEmbedder(token_embeds)
        test_embeds = model.compute(model.embeds, sequences, token_vocab)

    assert_array_almost_equal(correct_embeds, test_embeds, decimal=5)
def test(self):
    token_vocab = SimpleVocab(u'a b c d'.split())
    sequences = [
        ['a', 'b', 'c', 'd'],
        ['c', 'd'],
    ]
    correct_embeds = np.array([
        [3, 4, 1, 5, 6, 0, 7, 8, 1],
        [0, 0, 0, 5, 6, 0, 7, 8, 1],
    ], dtype=np.float32)

    with clean_session():
        token_embeds = tf.constant([
            [1, 2, 0],
            [3, 4, 1],
            [5, 6, 0],
            [7, 8, 1],
        ], dtype=tf.float32)
        model = ConcatSequenceEmbedder(token_embeds, seq_length=3, align='right')
        test_embeds = model.compute(model.embeds, sequences, token_vocab)

        # check that static shape inference works
        assert model.embeds.get_shape().as_list() == [None, 3 * 3]
        assert_array_almost_equal(correct_embeds, test_embeds, decimal=5)
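# A minimal NumPy sketch of the behavior both concat tests above expect: look
# up each token's embedding and concatenate along the time axis, padding with
# zero vectors up to seq_length. Left-aligned sequences pad and truncate on
# the right; right-aligned ones pad and truncate on the left. This helper is
# hypothetical, and `word2index` is assumed to be SimpleVocab's lookup method.
import numpy as np

def concat_embed_sketch(seq, vocab, embeds, seq_length, align='left'):
    kept = seq[-seq_length:] if align == 'right' else seq[:seq_length]
    vecs = [embeds[vocab.word2index(w)] for w in kept]
    pad = [np.zeros(embeds.shape[1], dtype=embeds.dtype)] * (seq_length - len(vecs))
    return np.concatenate(vecs + pad if align == 'left' else pad + vecs)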
def test_embed(self):
    sequences = [
        [],
        [1, 2, 3],
        [3, 3],
        [2],
    ]
    vocab = SimpleVocab([0, 1, 2, 3, 4])
    indices = SequenceBatch.from_sequences(sequences, vocab)

    embeds = GPUVariable(torch.FloatTensor([
        [0, 0],    # 0
        [2, 2],    # 1
        [3, 4],    # 2
        [-10, 1],  # 3
        [11, -1],  # 4
    ]))

    embedded = SequenceBatch.embed(indices, embeds)
    correct = np.array([
        [[0, 0], [0, 0], [0, 0]],
        [[2, 2], [3, 4], [-10, 1]],
        [[-10, 1], [-10, 1], [0, 0]],
        [[3, 4], [0, 0], [0, 0]],
    ], dtype=np.float32)
    assert_tensor_equal(embedded.values, correct)
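# A minimal sketch (plain NumPy; `masked_embed_sketch` is a hypothetical
# helper, not the real SequenceBatch.embed) of the behavior the test above
# checks: gather one embedding row per index, then multiply by the mask so
# positions beyond each sequence's true length keep an exact zero vector.
import numpy as np

def masked_embed_sketch(values, mask, embeds):
    # values: (batch, seq_len) int indices; mask: (batch, seq_len) of 0/1
    # embeds: (vocab_size, embed_dim); returns (batch, seq_len, embed_dim)
    return embeds[values] * mask[:, :, np.newaxis]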
def inputs(self):
    token_vocab = SimpleVocab(['<pad>'] + u'a b c d'.split())
    sequences = [
        ['a', 'c'],
        ['b', 'c', 'c'],
        ['d', 'c', 'a'],
    ]
    return self.as_args_kwargs(sequences, token_vocab)
def test_lstm(self): """Test whether the mask works properly for LSTM embedder.""" token_vocab = SimpleVocab(u'a b c d'.split()) sequences = [ ['a', 'b', 'c', 'd'], ['c', 'd'], ['a', 'b', 'c', 'd'], ] sequences_alt = [ ['a', 'b', 'c', 'd', 'a', 'b', 'd', 'c'], ['b', 'a', 'd'], ['c', 'd'], ] with clean_session(): token_embeds = tf.constant([ [1, 2, 0], [3, 4, 1], [5, 6, 0], [7, 8, 1], ], dtype=tf.float32) model = LSTMSequenceEmbedder(token_embeds, seq_length=4, hidden_size=7) test_embeds, test_hidden_states = model.compute( [model.embeds, model.hidden_states.values], sequences, token_vocab) assert test_embeds.shape == (3, 7) assert test_hidden_states.shape == (3, 4, 7) # Padded spaces should have the same hidden states assert_array_almost_equal(test_hidden_states[1, 1, :], test_hidden_states[1, 2, :], decimal=5) assert_array_almost_equal(test_hidden_states[1, 1, :], test_hidden_states[1, 3, :], decimal=5) # Try again but with different paddings # Should get the same result for ['c', 'd'] big_model = LSTMSequenceEmbedder(token_embeds, seq_length=8, hidden_size=7) big_model.weights = model.weights # match weights test_embeds_alt, test_hidden_states_alt = big_model.compute( [big_model.embeds, big_model.hidden_states.values], sequences_alt, token_vocab) assert test_embeds_alt.shape == (3, 7) assert test_hidden_states_alt.shape == (3, 8, 7) assert_array_almost_equal(test_embeds[1, :], test_embeds_alt[2, :], decimal=5) assert_array_almost_equal(test_hidden_states[1, :2, :], test_hidden_states_alt[2, :2, :], decimal=5)
def base_pred_embeddings(self):
    array = np.array([
        [0, 0, 0, 0],
        [1, 2, 3, 4],
        [0, 2, 0, 8],
    ], dtype=np.float32)
    vocab = SimpleVocab(u'<unk> b0 b1'.split())
    return SimpleEmbeddings(array, vocab)
def model(self):
    array = np.array([
        [1, 2, 3],
        [2, 4, 6],
        [3, 5, 7],
    ], dtype=np.float32)
    vocab = SimpleVocab(u'a b c'.split())
    embeddings = SimpleEmbeddings(array, vocab)
    return TokenEmbedder(embeddings, 'token_embeds')
def test_lstm(self): """Test whether the mask works properly for bidi LSTM embedder.""" token_vocab = SimpleVocab('a b c d'.split()) sequences = [ ['a', 'b', 'c', 'd'], ['c', 'd'], ['a', 'b', 'c', 'd'], ] sequences_alt = [ ['a', 'b', 'c', 'd', 'a', 'b', 'd', 'c'], ['b', 'a', 'd'], ['c', 'd'], ] with clean_session(): token_embeds = tf.constant([ [1, 2, 0], [3, 4, 1], [5, 6, 0], [7, 8, 1], ], dtype=tf.float32) model = BidiLSTMSequenceEmbedder(token_embeds, seq_length=4, hidden_size=7) test_embeds, test_hidden_states = model.compute( [model.embeds, model.hidden_states.values], sequences, token_vocab) assert test_embeds.shape == (3, 14) assert test_hidden_states.shape == (3, 4, 14) assert_array_almost_equal(test_embeds[1,:7], test_hidden_states[1,1,:7], decimal=5) assert_array_almost_equal(test_embeds[1,7:], test_hidden_states[1,0,7:], decimal=5) # Padded spaces should have the same forward embeddings assert_array_almost_equal(test_hidden_states[1,1,:7], test_hidden_states[1,2,:7], decimal=5) assert_array_almost_equal(test_hidden_states[1,1,:7], test_hidden_states[1,3,:7], decimal=5) # Padded spaces should have 0 backward embeddings assert_array_almost_equal(np.zeros((7,)), test_hidden_states[1,2,7:], decimal=5) assert_array_almost_equal(np.zeros((7,)), test_hidden_states[1,3,7:], decimal=5) # Other spaces should not have 0 embeddings with very high probability assert np.linalg.norm(test_hidden_states[1,0,:7]) > 1e-5 assert np.linalg.norm(test_hidden_states[1,1,:7]) > 1e-5 assert np.linalg.norm(test_hidden_states[1,0,7:]) > 1e-5 assert np.linalg.norm(test_hidden_states[1,1,7:]) > 1e-5 # Try again but with different paddings # Should get the same result for ['c', 'd'] big_model = BidiLSTMSequenceEmbedder(token_embeds, seq_length=8, hidden_size=7) big_model.weights = model.weights # match weights test_embeds_alt, test_hidden_states_alt = big_model.compute( [big_model.embeds, big_model.hidden_states.values], sequences_alt, token_vocab) assert test_embeds_alt.shape == (3, 14) assert test_hidden_states_alt.shape == (3, 8, 14) assert_array_almost_equal(test_embeds[1,:], test_embeds_alt[2,:], decimal=5) assert_array_almost_equal(test_hidden_states[1,:2,:], test_hidden_states_alt[2,:2,:], decimal=5)
def __init__(self, embed_dim):
    OBJECT = 'object'
    LIST = 'list'
    tokens = [
        OBJECT,
        LIST,
        'r', 'y', 'g', 'o', 'p', 'b', 'e',  # 7 colors
        'color-na',  # if an Alchemy beaker is empty or has multiple colors
        # TODO(kelvin): change the behavior of RLongAlchemyObject.color to return `color-na`
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  # 0 index is used to represent things that are not visible
        -1,
        'X1/1', '0', '1', '2', '3', '4',  # Shapes!
    ]
    vocab = SimpleVocab(tokens)
    vocab.OBJECT = OBJECT
    vocab.LIST = LIST
    array = emulate_distribution((len(vocab), embed_dim),
                                 GloveEmbeddings(5000).array, seed=3)
    super(RLongPrimitiveEmbeddings, self).__init__(array, vocab)
def test_no_sequences(self):
    vocab = SimpleVocab('a b c'.split())
    sequences = []

    with clean_session():
        model = FeedSequenceBatch()
        indices = tf.identity(model.values)
        mask = tf.identity(model.mask)
        indices_val, mask_val = model.compute([indices, mask], sequences, vocab)
        assert indices_val.shape == mask_val.shape == (0, 0)
def __init__(self, tokens, embeds): """ Args: tokens (list[unicode]) embeds (np.array) """ self.vocab = SimpleVocab(tokens) self._embeds = tf.constant(embeds, dtype=tf.float32) self._embed_dim = embeds.shape[1]
def embeddings(self):
    array = np.array([
        [0, 1, 2],
        [3, 4, 5],
        [6, 7, 8],
        [9, 10, 11],
        [12, 13, 14],
        [15, 16, 17],
    ], dtype=np.float32)
    vocab = SimpleVocab(['<pad>', 'a', 'b', 'c', 'd', 'e'])
    return SimpleEmbeddings(array, vocab)
def test_multi_vocab_indices(self):
    vocabs = [
        [SimpleVocab('a b c d e'.split()), SimpleVocab('x y z'.split())],
        [SimpleVocab('e d c b a'.split()), SimpleVocab('y z x'.split())],
    ]
    sequences = [
        'a b a e'.split(),
        'y y y x z'.split(),
    ]
    indices = SequenceBatch.multi_vocab_indices(sequences, vocabs)

    assert_tensor_equal(indices.values, [
        [[0, 4], [1, 3], [0, 4], [4, 0], [0, 0]],
        [[1, 0], [1, 0], [1, 0], [0, 2], [2, 1]],
    ])
    assert_tensor_equal(indices.mask, [
        [1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1],
    ])
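# A rough sketch of the lookup the test above verifies, inferred from the
# expected values: vocabs[k][i] acts as the k-th vocab for the i-th example,
# and every token receives one index per vocab group, giving values of shape
# (batch, seq_len, num_vocab_groups). The helper name is hypothetical and
# `word2index` is assumed to be SimpleVocab's lookup method.
def multi_vocab_indices_sketch(sequences, vocabs):
    return [[[vocab_group[i].word2index(w) for vocab_group in vocabs]
             for w in seq]
            for i, seq in enumerate(sequences)]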
def input_embeds_list(self):
    sequences = [
        [1, 2, 3],
        [8, 4, 2, 1, 1],
        [],
    ]
    # token 1 maps to embedding [1], 2 maps to [2], and so on...
    vocab = SimpleVocab([1, 2, 3, 4, 5, 6, 7, 8])
    array = np.expand_dims(np.array([1, 2, 3, 4, 5, 6, 7, 8], dtype=np.float32), 1)
    token_embedder = TokenEmbedder(Bunch(vocab=vocab, array=array))
    seq_embeds = token_embedder.embed_seq_batch(
        SequenceBatch.from_sequences(sequences, vocab))
    return seq_embeds.split()
def __init__(self, embed_dim, all_types):
    vocab = SimpleVocab(all_types)
    array = emulate_distribution((len(vocab), embed_dim),
                                 GloveEmbeddings(5000).array, seed=1)
    super(TypeEmbeddings, self).__init__(array, vocab)
def embedder(self, request):
    vocab = SimpleVocab(['<unk>', '<start>', '<stop>'] + ['a', 'b', 'c'])
    arr = np.eye(len(vocab), dtype=np.float32)
    word_embeddings = Bunch(vocab=vocab, array=arr)
    return TokenEmbedder(word_embeddings, trainable=request.param)
def vocab():
    return SimpleVocab(['a', 'b', 'c'])
def test_save_load(self, vocab, tmpdir):
    path = str(tmpdir.join('vocab.txt'))
    vocab.save(path)
    new_vocab = SimpleVocab.load(path)
    assert vocab == new_vocab
def __init__(self, embed_dim):
    bool_vocab = SimpleVocab([True, False])
    embed_matrix = np.random.uniform(
        -np.sqrt(3. / embed_dim), np.sqrt(3. / embed_dim),
        size=(len(bool_vocab), embed_dim)).astype(np.float32)
    super(BoolEmbeddings, self).__init__(embed_matrix, bool_vocab)
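# Arithmetic behind the bound above: Uniform(-sqrt(3/d), sqrt(3/d)) has
# variance (2*sqrt(3/d))**2 / 12 = 1/d, so each d-dimensional embedding
# starts with expected squared norm d * (1/d) = 1, i.e. roughly unit scale.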
def vocab(self):
    return SimpleVocab(['<unk>', 'a', 'b', 'c', '<start>', '<stop>'])
def utterances(self):
    tokens = sorted(list(self._utterance_set))
    return SimpleVocab(tokens)