def test_namespaces(self):
    data_indexer = DataIndexer()
    initial_vocab_size = data_indexer.get_vocab_size()
    word_index = data_indexer.add_word_to_index("word", namespace='1')
    assert "word" in data_indexer.words_in_index(namespace='1')
    assert data_indexer.get_word_index("word", namespace='1') == word_index
    assert data_indexer.get_word_from_index(word_index, namespace='1') == "word"
    assert data_indexer.get_vocab_size(namespace='1') == initial_vocab_size + 1

    # Now add the same word in a different namespace, along with a second word,
    # and make sure the new namespace is indexed independently.
    word2_index = data_indexer.add_word_to_index("word2", namespace='2')
    word_index = data_indexer.add_word_to_index("word", namespace='2')
    assert "word" in data_indexer.words_in_index(namespace='2')
    assert "word2" in data_indexer.words_in_index(namespace='2')
    assert data_indexer.get_word_index("word", namespace='2') == word_index
    assert data_indexer.get_word_index("word2", namespace='2') == word2_index
    assert data_indexer.get_word_from_index(word_index, namespace='2') == "word"
    assert data_indexer.get_word_from_index(word2_index, namespace='2') == "word2"
    assert data_indexer.get_vocab_size(namespace='2') == initial_vocab_size + 2

def test_works_with_word_and_character_tokenizer(self):
    answer_options_simple = "a<>a sentence<><>"
    background_simple = "a<>a sentence<><>"
    line_simple = "\t".join(str(x) for x in [answer_options_simple, background_simple, "0"])
    TextInstance.tokenizer = WordAndCharacterTokenizer(Params({}))
    data_indexer = DataIndexer()
    a_word_index = data_indexer.add_word_to_index("a", namespace='words')
    sentence_index = data_indexer.add_word_to_index("sentence", namespace='words')
    a_index = data_indexer.add_word_to_index("a", namespace='characters')
    s_index = data_indexer.add_word_to_index("s", namespace='characters')
    e_index = data_indexer.add_word_to_index("e", namespace='characters')
    new_instance = TupleInferenceInstance.read_from_line(line_simple)
    indexed = new_instance.to_indexed_instance(data_indexer)
    padding_lengths = {'num_question_tuples': 1,
                       'num_background_tuples': 1,
                       'num_slots': 2,
                       'num_sentence_words': 2,
                       'num_options': 1,
                       'num_word_characters': 3}
    indexed.pad(padding_lengths)
    expected_indexed_tuple = [[[0, 0, 0], [a_word_index, a_index, 0]],
                              [[a_word_index, a_index, 0],
                               [sentence_index, s_index, e_index]]]
    expected_answers_indexed = numpy.asarray([expected_indexed_tuple])
    expected_background_indexed = numpy.asarray(expected_indexed_tuple)
    assert numpy.all(indexed.answers_indexed == expected_answers_indexed)
    assert numpy.all(indexed.background_indexed == expected_background_indexed)
    TextInstance.tokenizer = tokenizers['words'](Params({}))

def test_get_embedding_layer_initializes_unseen_words_randomly_not_zero(self):
    data_indexer = DataIndexer()
    data_indexer.add_word_to_index("word2")
    embeddings_filename = self.TEST_DIR + "embeddings.gz"
    with gzip.open(embeddings_filename, 'wb') as embeddings_file:
        embeddings_file.write("word 1.0 2.3 -1.0\n".encode('utf-8'))
    embedding_layer = PretrainedEmbeddings.get_embedding_layer(embeddings_filename, data_indexer)
    # "word2" has no pretrained vector, so it should get a random, non-zero initialization.
    word_vector = embedding_layer._initial_weights[0][data_indexer.get_word_index("word2")]  # pylint: disable=protected-access
    assert not numpy.allclose(word_vector, numpy.asarray([0.0, 0.0, 0.0]))

def test_get_embedding_layer_actually_initializes_word_vectors_correctly(self):
    data_indexer = DataIndexer()
    data_indexer.add_word_to_index("word")
    embeddings_filename = self.TEST_DIR + "embeddings.gz"
    with gzip.open(embeddings_filename, 'wb') as embeddings_file:
        embeddings_file.write("word 1.0 2.3 -1.0\n".encode('utf-8'))
    embedding_layer = PretrainedEmbeddings.get_embedding_layer(embeddings_filename, data_indexer)
    word_vector = embedding_layer._initial_weights[0][data_indexer.get_word_index("word")]  # pylint: disable=protected-access
    assert numpy.allclose(word_vector, numpy.asarray([1.0, 2.3, -1.0]))

def test_to_indexed_instance_converts_correctly(self):
    instance = SentenceSelectionInstance("What do dogs eat?",
                                         ["Dogs eat cats.",
                                          "Dogs play with cats.",
                                          "Dogs enjoy cats."],
                                         0)
    data_indexer = DataIndexer()
    what_index = data_indexer.add_word_to_index("what")
    do_index = data_indexer.add_word_to_index("do")
    dogs_index = data_indexer.add_word_to_index("dogs")
    eat_index = data_indexer.add_word_to_index("eat")
    cats_index = data_indexer.add_word_to_index("cats")
    period_index = data_indexer.add_word_to_index(".")
    question_index = data_indexer.add_word_to_index("?")
    play_index = data_indexer.add_word_to_index("play")
    with_index = data_indexer.add_word_to_index("with")
    enjoy_index = data_indexer.add_word_to_index("enjoy")
    indexed_instance = instance.to_indexed_instance(data_indexer)
    assert indexed_instance.question_indices == [what_index, do_index, dogs_index,
                                                 eat_index, question_index]
    assert indexed_instance.sentences_indices == [[dogs_index, eat_index, cats_index,
                                                   period_index],
                                                  [dogs_index, play_index, with_index,
                                                   cats_index, period_index],
                                                  [dogs_index, enjoy_index, cats_index,
                                                   period_index]]
    assert indexed_instance.label == 0

def test_get_embedding_layer_skips_inconsistent_lines(self):
    data_indexer = DataIndexer()
    data_indexer.add_word_to_index("word1")
    data_indexer.add_word_to_index("word2")
    embeddings_filename = self.TEST_DIR + "embeddings.gz"
    with gzip.open(embeddings_filename, 'wb') as embeddings_file:
        embeddings_file.write("word1 1.0 2.3 -1.0\n".encode('utf-8'))
        # This line has only two dimensions instead of three, so it should be skipped.
        embeddings_file.write("word2 0.1 0.4 \n".encode('utf-8'))
    embedding_layer = PretrainedEmbeddings.get_embedding_layer(embeddings_filename, data_indexer)
    word_vector = embedding_layer._initial_weights[0][data_indexer.get_word_index("word2")]  # pylint: disable=protected-access
    assert not numpy.allclose(word_vector[:2], numpy.asarray([0.1, 0.4]))

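# A minimal sketch of the line-skipping behavior the test above relies on. This is a
# hypothetical helper, not deep_qa's actual PretrainedEmbeddings code: lines whose
# field count disagrees with the dimensionality implied by the first line are dropped,
# so the corresponding words fall back to random initialization.
def _read_pretrained_vectors_sketch(filename):
    vectors = {}
    dimension = None
    with gzip.open(filename, 'rt', encoding='utf-8') as embeddings_file:
        for line in embeddings_file:
            fields = line.strip().split()
            if dimension is None:
                dimension = len(fields) - 1
            if len(fields) - 1 != dimension:
                continue  # inconsistent line: skip it, as the test above expects
            vectors[fields[0]] = numpy.asarray(fields[1:], dtype='float32')
    return vectors
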
def test_to_indexed_instance_converts_correctly(self):
    instance = CharacterSpanInstance("What do dogs eat?", "Dogs eat cats.", (9, 13))
    data_indexer = DataIndexer()
    what_index = data_indexer.add_word_to_index("what")
    do_index = data_indexer.add_word_to_index("do")
    dogs_index = data_indexer.add_word_to_index("dogs")
    eat_index = data_indexer.add_word_to_index("eat")
    cats_index = data_indexer.add_word_to_index("cats")
    period_index = data_indexer.add_word_to_index(".")
    question_index = data_indexer.add_word_to_index("?")
    stop_index = data_indexer.add_word_to_index(CharacterSpanInstance.stop_token)
    indexed_instance = instance.to_indexed_instance(data_indexer)
    assert indexed_instance.question_indices == [what_index, do_index, dogs_index,
                                                 eat_index, question_index]
    assert indexed_instance.passage_indices == [dogs_index, eat_index, cats_index,
                                                period_index, stop_index]
    # The character span (9, 13) in the passage maps to the token span (2, 3).
    assert indexed_instance.label == (2, 3)

    # I put this test in here, instead of its own `test_as_training_data` test, to be sure
    # that the conversion to IndexedCharacterSpanInstance was performed correctly.
    indexed_instance.pad({'num_question_words': 3, 'num_passage_words': 6})
    (question_array, passage_array), label = indexed_instance.as_training_data()
    assert isinstance(label, tuple)
    assert numpy.all(label[0] == numpy.asarray([0, 0, 1, 0, 0, 0]))
    assert numpy.all(label[1] == numpy.asarray([0, 0, 0, 1, 0, 0]))
    # Padding the five-word question to three words truncates it from the left.
    assert numpy.all(question_array == numpy.asarray([dogs_index, eat_index, question_index]))
    assert numpy.all(passage_array == numpy.asarray([dogs_index, eat_index, cats_index,
                                                     period_index, stop_index, 0]))

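# A minimal sketch of character-span-to-token-span conversion, illustrating why
# (9, 13) becomes (2, 3) above. This is a hypothetical helper, not deep_qa's actual
# implementation; it assumes inclusive span endpoints and tokens that appear
# verbatim in the text.
def char_span_to_token_span_sketch(text, tokens, char_start, char_end):
    token_spans = []
    offset = 0
    for token in tokens:
        start = text.index(token, offset)  # character offset of this token
        token_spans.append((start, start + len(token) - 1))
        offset = start + len(token)
    start_token = next(i for i, (start, end) in enumerate(token_spans)
                       if start <= char_start <= end)
    end_token = next(i for i, (start, end) in enumerate(token_spans)
                     if start <= char_end <= end)
    return start_token, end_token

# For the passage above, characters 9-13 of "Dogs eat cats." cover "cats.", so
# char_span_to_token_span_sketch("Dogs eat cats.", ["Dogs", "eat", "cats", "."], 9, 13)
# returns (2, 3).
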
def test_add_word_to_index_gives_consistent_results(self):
    data_indexer = DataIndexer()
    initial_vocab_size = data_indexer.get_vocab_size()
    word_index = data_indexer.add_word_to_index("word")
    assert "word" in data_indexer.words_in_index()
    assert data_indexer.get_word_index("word") == word_index
    assert data_indexer.get_word_from_index(word_index) == "word"
    assert data_indexer.get_vocab_size() == initial_vocab_size + 1

    # Now add it again, and make sure nothing changes.
    data_indexer.add_word_to_index("word")
    assert "word" in data_indexer.words_in_index()
    assert data_indexer.get_word_index("word") == word_index
    assert data_indexer.get_word_from_index(word_index) == "word"
    assert data_indexer.get_vocab_size() == initial_vocab_size + 1

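# A hypothetical SimpleDataIndexer, sketching the contract the tests above and below
# assume (deep_qa's real DataIndexer differs in detail): each namespace keeps
# word -> index and index -> word dicts, with index 0 reserved for padding and
# index 1 for an OOV token, which is why get_vocab_size() starts above zero and
# re-adding a word is a no-op.
class SimpleDataIndexer:
    def __init__(self):
        self._oov_token = "@@UNKNOWN@@"  # assumed token name; the real one may differ
        self.word_indices = {}
        self.reverse_word_indices = {}

    def _namespace_dicts(self, namespace):
        if namespace not in self.word_indices:
            # Reserve the padding and OOV indices up front.
            self.word_indices[namespace] = {"@@PADDING@@": 0, self._oov_token: 1}
            self.reverse_word_indices[namespace] = {0: "@@PADDING@@", 1: self._oov_token}
        return self.word_indices[namespace], self.reverse_word_indices[namespace]

    def add_word_to_index(self, word, namespace='words'):
        words, reverse = self._namespace_dicts(namespace)
        if word not in words:
            index = len(words)
            words[word] = index
            reverse[index] = word
        return words[word]  # re-adding an existing word returns the same index

    def get_word_index(self, word, namespace='words'):
        words, _ = self._namespace_dicts(namespace)
        return words.get(word, 1)  # unseen words map to the OOV index

    def get_vocab_size(self, namespace='words'):
        words, _ = self._namespace_dicts(namespace)
        return len(words)
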
def test_to_indexed_instance_converts_correctly(self):
    data_indexer = DataIndexer()
    a_word_index = data_indexer.add_word_to_index("a", namespace='words')
    sentence_index = data_indexer.add_word_to_index("sentence", namespace='words')
    capital_a_index = data_indexer.add_word_to_index("A", namespace='characters')
    space_index = data_indexer.add_word_to_index(" ", namespace='characters')
    a_index = data_indexer.add_word_to_index("a", namespace='characters')
    s_index = data_indexer.add_word_to_index("s", namespace='characters')
    e_index = data_indexer.add_word_to_index("e", namespace='characters')
    n_index = data_indexer.add_word_to_index("n", namespace='characters')
    t_index = data_indexer.add_word_to_index("t", namespace='characters')
    c_index = data_indexer.add_word_to_index("c", namespace='characters')

    instance = TrueFalseInstance("A sentence", None).to_indexed_instance(data_indexer)
    assert instance.word_indices == [a_word_index, sentence_index]

    TextInstance.tokenizer = tokenizers['characters'](Params({}))
    instance = TrueFalseInstance("A sentence", None).to_indexed_instance(data_indexer)
    assert instance.word_indices == [capital_a_index, space_index, s_index, e_index, n_index,
                                     t_index, e_index, n_index, c_index, e_index]

    TextInstance.tokenizer = tokenizers['words and characters'](Params({}))
    instance = TrueFalseInstance("A sentence", None).to_indexed_instance(data_indexer)
    assert instance.word_indices == [[a_word_index, a_index],
                                     [sentence_index, s_index, e_index, n_index, t_index,
                                      e_index, n_index, c_index, e_index]]

    TextInstance.tokenizer = tokenizers['words'](Params({}))

def test_to_indexed_instance_converts_correctly(self):
    instance = SnliInstance("a b", "d e f", "entails")
    data_indexer = DataIndexer()
    a_index = data_indexer.add_word_to_index("a")
    d_index = data_indexer.add_word_to_index("d")
    oov_index = data_indexer.get_word_index(data_indexer._oov_token)  # pylint: disable=protected-access
    indexed_instance = instance.to_indexed_instance(data_indexer)
    # Words never added to the indexer ("b", "e", "f") map to the OOV index.
    assert indexed_instance.first_sentence_indices == [a_index, oov_index]
    assert indexed_instance.second_sentence_indices == [d_index, oov_index, oov_index]
    assert indexed_instance.label == instance.label

def test_to_indexed_instance_converts_correctly(self):
    instance = QuestionAnswerInstance("a A b", ["d", "e f D"], 1)
    data_indexer = DataIndexer()
    a_index = data_indexer.add_word_to_index("a")
    d_index = data_indexer.add_word_to_index("d")
    oov_index = data_indexer.get_word_index(data_indexer._oov_token)  # pylint: disable=protected-access
    indexed_instance = instance.to_indexed_instance(data_indexer)
    # Tokens are lowercased before lookup, so "A" maps to a_index and "D" to d_index.
    assert indexed_instance.question_indices == [a_index, a_index, oov_index]
    assert len(indexed_instance.option_indices) == 2
    assert indexed_instance.option_indices[0] == [d_index]
    assert indexed_instance.option_indices[1] == [oov_index, oov_index, d_index]

def test_to_indexed_instance_converts_correctly(self):
    data_indexer = DataIndexer()
    cats_index = data_indexer.add_word_to_index("cats")
    are_index = data_indexer.add_word_to_index("are")
    animals_index = data_indexer.add_word_to_index("animals")
    period_index = data_indexer.add_word_to_index(".")
    n_tag_index = data_indexer.add_word_to_index("N", namespace="tags")
    v_tag_index = data_indexer.add_word_to_index("V", namespace="tags")
    period_tag_index = data_indexer.add_word_to_index(".", namespace="tags")
    indexed_instance = self.instance.to_indexed_instance(data_indexer)
    expected_indices = [cats_index, are_index, animals_index, period_index]
    assert indexed_instance.text_indices == expected_indices
    expected_label = [self.one_hot(n_tag_index - 2, 3),
                      self.one_hot(v_tag_index - 2, 3),
                      self.one_hot(n_tag_index - 2, 3),
                      self.one_hot(period_tag_index - 2, 3)]
    assert_array_almost_equal(indexed_instance.label, expected_label)
    train_inputs, train_labels = indexed_instance.as_training_data()
    assert_array_almost_equal(train_labels, expected_label)
    assert_array_almost_equal(train_inputs, expected_indices)

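# A note on the "- 2" offsets in the expected labels above: this assumes the tag
# namespace reserves its first two indices (padding and OOV, as in the
# SimpleDataIndexer sketch earlier), so subtracting two converts a tag's vocabulary
# index into its 0-based position within the three-way one-hot label.
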
def test_to_indexed_instance_converts_correctly(self):
    instance = McQuestionAnswerInstance("Cats from Nevada are eaten by dogs in XXX .",
                                        "Dogs eat cats from Nevada in Washington .",
                                        ["Nevada", "Washington"],
                                        1)
    data_indexer = DataIndexer()
    cats_index = data_indexer.add_word_to_index("cats")
    are_index = data_indexer.add_word_to_index("are")
    eaten_index = data_indexer.add_word_to_index("eaten")
    by_index = data_indexer.add_word_to_index("by")
    dogs_index = data_indexer.add_word_to_index("dogs")
    in_index = data_indexer.add_word_to_index("in")
    xxx_index = data_indexer.add_word_to_index("xxx")
    period_index = data_indexer.add_word_to_index(".")
    eat_index = data_indexer.add_word_to_index("eat")
    from_index = data_indexer.add_word_to_index("from")
    nevada_index = data_indexer.add_word_to_index("nevada")
    washington_index = data_indexer.add_word_to_index("washington")
    indexed_instance = instance.to_indexed_instance(data_indexer)
    assert indexed_instance.question_indices == [cats_index, from_index, nevada_index,
                                                 are_index, eaten_index, by_index,
                                                 dogs_index, in_index, xxx_index,
                                                 period_index]
    assert indexed_instance.passage_indices == [dogs_index, eat_index, cats_index,
                                                from_index, nevada_index, in_index,
                                                washington_index, period_index]
    assert len(indexed_instance.option_indices) == 2
    assert indexed_instance.option_indices[0] == [nevada_index]
    assert indexed_instance.option_indices[1] == [washington_index]
    assert indexed_instance.label == 1

class TestSentenceInstance(DeepQaTestCase):
    def setUp(self):
        super(TestSentenceInstance, self).setUp()
        self.data_indexer = DataIndexer()
        self.this_index = self.data_indexer.add_word_to_index("this")
        self.is_index = self.data_indexer.add_word_to_index("is")
        self.a_index = self.data_indexer.add_word_to_index("a")
        self.sentence_index = self.data_indexer.add_word_to_index("sentence")
        self.start_index = self.data_indexer.add_word_to_index("<S>")
        self.end_index = self.data_indexer.add_word_to_index("</S>")
        self.space_index = self.data_indexer.add_word_to_index(' ')
        self.c_index = self.data_indexer.add_word_to_index('c')
        self.e_index = self.data_indexer.add_word_to_index('e')
        self.h_index = self.data_indexer.add_word_to_index('h')
        self.i_index = self.data_indexer.add_word_to_index('i')
        self.n_index = self.data_indexer.add_word_to_index('n')
        self.s_index = self.data_indexer.add_word_to_index('s')
        self.t_index = self.data_indexer.add_word_to_index('t')
        self.a_char_index = self.data_indexer.add_word_to_index('a', namespace='characters')
        self.c_char_index = self.data_indexer.add_word_to_index('c', namespace='characters')
        self.e_char_index = self.data_indexer.add_word_to_index('e', namespace='characters')
        self.h_char_index = self.data_indexer.add_word_to_index('h', namespace='characters')
        self.i_char_index = self.data_indexer.add_word_to_index('i', namespace='characters')
        self.n_char_index = self.data_indexer.add_word_to_index('n', namespace='characters')
        self.s_char_index = self.data_indexer.add_word_to_index('s', namespace='characters')
        self.t_char_index = self.data_indexer.add_word_to_index('t', namespace='characters')

    def tearDown(self):
        super(TestSentenceInstance, self).tearDown()
        TextInstance.tokenizer = tokenizers['words'](Params({}))

    @staticmethod
    def instance_to_line(text, index=None):
        index_str = '' if index is None else str(index) + '\t'
        return index_str + text

    def test_read_from_line_handles_one_column(self):
        text = "this is a sentence"
        instance = SentenceInstance.read_from_line(text)
        assert instance.text == text
        assert instance.label is None
        assert instance.index is None

    def test_read_from_line_handles_two_columns(self):
        index = 23
        text = "this is a sentence"
        line = self.instance_to_line(text, index)
        instance = SentenceInstance.read_from_line(line)
        assert instance.text == text
        assert instance.index == index
        assert instance.label is None

    def test_end_to_end_conversion_to_arrays(self):
        instance = SentenceInstance("this is a sentence")
        indexed_instance = instance.to_indexed_instance(self.data_indexer)
        indexed_instance.pad({'num_sentence_words': 7})
        word_array, label_array = indexed_instance.as_training_data()
        # The input is the sentence prefixed with a start-of-sentence token; the labels
        # are the same words shifted left by one, ending with the end-of-sentence token.
        assert_array_equal(word_array, [0, 0, self.start_index, self.this_index,
                                        self.is_index, self.a_index, self.sentence_index])
        assert_array_equal(label_array, [[0], [0], [self.this_index], [self.is_index],
                                         [self.a_index], [self.sentence_index],
                                         [self.end_index]])

    def test_end_to_end_conversion_to_arrays_with_character_tokenizer(self):
        TextInstance.tokenizer = tokenizers['characters'](Params({}))
        instance = SentenceInstance("a sentence")
        indexed_instance = instance.to_indexed_instance(self.data_indexer)
        indexed_instance.pad({'num_sentence_words': 13})
        word_array, label_array = indexed_instance.as_training_data()
        assert_array_equal(word_array, [0, 0, self.start_index, self.a_index,
                                        self.space_index, self.s_index, self.e_index,
                                        self.n_index, self.t_index, self.e_index,
                                        self.n_index, self.c_index, self.e_index])
        assert_array_equal(label_array, [[0], [0], [self.a_index], [self.space_index],
                                         [self.s_index], [self.e_index], [self.n_index],
                                         [self.t_index], [self.e_index], [self.n_index],
                                         [self.c_index], [self.e_index], [self.end_index]])

    def test_end_to_end_conversion_to_arrays_with_word_and_character_tokenizer(self):
        TextInstance.tokenizer = tokenizers['words and characters'](Params({}))
        instance = SentenceInstance("this is a sentence")
        indexed_instance = instance.to_indexed_instance(self.data_indexer)
        indexed_instance.pad({'num_sentence_words': 6, 'num_word_characters': 5})
        word_array, label_array = indexed_instance.as_training_data()
        assert_array_equal(word_array, [[0, 0, 0, 0, 0],
                                        [self.start_index, 0, 0, 0, 0],
                                        [self.this_index, self.t_char_index, self.h_char_index,
                                         self.i_char_index, self.s_char_index],
                                        [self.is_index, self.i_char_index, self.s_char_index, 0, 0],
                                        [self.a_index, self.a_char_index, 0, 0, 0],
                                        [self.sentence_index, self.s_char_index, self.e_char_index,
                                         self.n_char_index, self.t_char_index]])
        assert_array_equal(label_array, [[0], [self.this_index], [self.is_index],
                                         [self.a_index], [self.sentence_index],
                                         [self.end_index]])