Example #1
    def test_namespaces(self):
        data_indexer = DataIndexer()
        initial_vocab_size = data_indexer.get_vocab_size()
        word_index = data_indexer.add_word_to_index("word", namespace='1')
        assert "word" in data_indexer.words_in_index(namespace='1')
        assert data_indexer.get_word_index("word", namespace='1') == word_index
        assert data_indexer.get_word_from_index(word_index,
                                                namespace='1') == "word"
        assert data_indexer.get_vocab_size(
            namespace='1') == initial_vocab_size + 1

        # Now add the same word in a different namespace, along with a second
        # word, and make sure the new namespace starts fresh.
        word2_index = data_indexer.add_word_to_index("word2", namespace='2')
        word_index = data_indexer.add_word_to_index("word", namespace='2')
        assert "word" in data_indexer.words_in_index(namespace='2')
        assert "word2" in data_indexer.words_in_index(namespace='2')
        assert data_indexer.get_word_index("word", namespace='2') == word_index
        assert data_indexer.get_word_index("word2",
                                           namespace='2') == word2_index
        assert data_indexer.get_word_from_index(word_index,
                                                namespace='2') == "word"
        assert data_indexer.get_word_from_index(word2_index,
                                                namespace='2') == "word2"
        assert data_indexer.get_vocab_size(
            namespace='2') == initial_vocab_size + 2

    def test_works_with_word_and_character_tokenizer(self):
        answer_options_simple = "a<>a sentence<><>"
        background_simple = "a<>a sentence<><>"
        line_simple = "\t".join(
            str(x) for x in [answer_options_simple, background_simple, "0"])
        TextInstance.tokenizer = WordAndCharacterTokenizer(Params({}))
        data_indexer = DataIndexer()
        a_word_index = data_indexer.add_word_to_index("a", namespace='words')
        sentence_index = data_indexer.add_word_to_index("sentence",
                                                        namespace='words')
        a_index = data_indexer.add_word_to_index("a", namespace='characters')
        s_index = data_indexer.add_word_to_index("s", namespace='characters')
        e_index = data_indexer.add_word_to_index("e", namespace='characters')

        new_instance = TupleInferenceInstance.read_from_line(line_simple)
        indexed = new_instance.to_indexed_instance(data_indexer)

        padding_lengths = {
            'num_question_tuples': 1,
            'num_background_tuples': 1,
            'num_slots': 2,
            'num_sentence_words': 2,
            'num_options': 1,
            'num_word_characters': 3
        }
        indexed.pad(padding_lengths)
        expected_indexed_tuple = [[[0, 0, 0], [a_word_index, a_index, 0]],
                                  [[a_word_index, a_index, 0],
                                   [sentence_index, s_index, e_index]]]
        expected_answers_indexed = numpy.asarray([expected_indexed_tuple])
        expected_background_indexed = numpy.asarray(expected_indexed_tuple)
        assert numpy.all(indexed.answers_indexed == expected_answers_indexed)
        assert numpy.all(
            indexed.background_indexed == expected_background_indexed)
        TextInstance.tokenizer = tokenizers['words'](Params({}))
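
The two tests above pin down the namespace behavior of DataIndexer: each namespace keeps its own word/index mappings, adding a word twice is a no-op, and a fresh namespace starts from the same reserved entries (which is what makes `initial_vocab_size` non-zero). As a rough illustration only, not deep_qa's actual implementation, and with the reserved padding/OOV tokens and their names being assumptions, a minimal namespace-aware indexer might look like this:

class ToyDataIndexer:
    """Minimal sketch of a namespace-aware word indexer (illustrative only)."""
    def __init__(self):
        self._padding_token = "@@PADDING@@"  # assumed reserved index 0
        self._oov_token = "@@UNKNOWN@@"      # assumed reserved index 1
        self.word_indices = {}     # namespace -> {word: index}
        self.reverse_indices = {}  # namespace -> {index: word}

    def _dicts(self, namespace='words'):
        if namespace not in self.word_indices:
            # Each namespace starts with the same reserved entries, which is
            # why indexing into a fresh namespace behaves "like new".
            self.word_indices[namespace] = {self._padding_token: 0, self._oov_token: 1}
            self.reverse_indices[namespace] = {0: self._padding_token, 1: self._oov_token}
        return self.word_indices[namespace], self.reverse_indices[namespace]

    def add_word_to_index(self, word, namespace='words'):
        forward, backward = self._dicts(namespace)
        if word not in forward:  # adding the same word twice is a no-op
            index = len(forward)
            forward[word] = index
            backward[index] = word
        return forward[word]

    def get_word_index(self, word, namespace='words'):
        forward, _ = self._dicts(namespace)
        return forward.get(word, forward[self._oov_token])

    def get_word_from_index(self, index, namespace='words'):
        return self._dicts(namespace)[1][index]

    def get_vocab_size(self, namespace='words'):
        return len(self._dicts(namespace)[0])

    def words_in_index(self, namespace='words'):
        return set(self._dicts(namespace)[0])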
Example #3
    def test_get_embedding_layer_initializes_unseen_words_randomly_not_zero(self):
        data_indexer = DataIndexer()
        data_indexer.add_word_to_index("word2")
        embeddings_filename = self.TEST_DIR + "embeddings.gz"
        with gzip.open(embeddings_filename, 'wb') as embeddings_file:
            embeddings_file.write("word 1.0 2.3 -1.0\n".encode('utf-8'))
        embedding_layer = PretrainedEmbeddings.get_embedding_layer(embeddings_filename, data_indexer)
        word_vector = embedding_layer._initial_weights[0][data_indexer.get_word_index("word2")]
        assert not numpy.allclose(word_vector, numpy.asarray([0.0, 0.0, 0.0]))
Example #4
    def test_get_embedding_layer_actually_initializes_word_vectors_correctly(self):
        data_indexer = DataIndexer()
        data_indexer.add_word_to_index("word")
        embeddings_filename = self.TEST_DIR + "embeddings.gz"
        with gzip.open(embeddings_filename, 'wb') as embeddings_file:
            embeddings_file.write("word 1.0 2.3 -1.0\n".encode('utf-8'))
        embedding_layer = PretrainedEmbeddings.get_embedding_layer(embeddings_filename, data_indexer)
        word_vector = embedding_layer._initial_weights[0][data_indexer.get_word_index("word")]
        assert numpy.allclose(word_vector, numpy.asarray([1.0, 2.3, -1.0]))

    def test_to_indexed_instance_converts_correctly(self):
        instance = SentenceSelectionInstance("What do dogs eat?",
                                             ["Dogs eat cats.",
                                              "Dogs play with cats.",
                                              "Dogs enjoy cats."],
                                             0)
        data_indexer = DataIndexer()
        what_index = data_indexer.add_word_to_index("what")
        do_index = data_indexer.add_word_to_index("do")
        dogs_index = data_indexer.add_word_to_index("dogs")
        eat_index = data_indexer.add_word_to_index("eat")
        cats_index = data_indexer.add_word_to_index("cats")
        period_index = data_indexer.add_word_to_index(".")
        question_index = data_indexer.add_word_to_index("?")
        play_index = data_indexer.add_word_to_index("play")
        with_index = data_indexer.add_word_to_index("with")
        enjoy_index = data_indexer.add_word_to_index("enjoy")
        indexed_instance = instance.to_indexed_instance(data_indexer)
        assert indexed_instance.question_indices == [what_index, do_index,
                                                     dogs_index, eat_index,
                                                     question_index]
        assert indexed_instance.sentences_indices == [[dogs_index, eat_index,
                                                       cats_index, period_index],
                                                      [dogs_index, play_index,
                                                       with_index, cats_index,
                                                       period_index],
                                                      [dogs_index, enjoy_index,
                                                       cats_index, period_index]]
        assert indexed_instance.label == 0
Example #6
    def test_get_embedding_layer_skips_inconsistent_lines(self):
        data_indexer = DataIndexer()
        data_indexer.add_word_to_index("word1")
        data_indexer.add_word_to_index("word2")
        embeddings_filename = self.TEST_DIR + "embeddings.gz"
        with gzip.open(embeddings_filename, 'wb') as embeddings_file:
            embeddings_file.write("word1 1.0 2.3 -1.0\n".encode('utf-8'))
            embeddings_file.write("word2 0.1 0.4 \n".encode('utf-8'))
        embedding_layer = PretrainedEmbeddings.get_embedding_layer(embeddings_filename, data_indexer)
        word_vector = embedding_layer._initial_weights[0][data_indexer.get_word_index("word2")]
        assert not numpy.allclose(word_vector[:2], numpy.asarray([0.1, 0.4]))
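
Taken together, the three embedding tests above fix the contract of PretrainedEmbeddings.get_embedding_layer: vectors from the gzipped file are copied in for words in the vocabulary, lines whose field count disagrees with the first line are skipped, and words the file does not cover are initialized randomly (hence non-zero). A hedged sketch of that loading step, under those assumptions; this is illustrative, not the deep_qa implementation:

import gzip

import numpy

def load_embedding_weights(embeddings_filename, data_indexer):
    # Parse "word dim1 dim2 ..." lines; the first line fixes the dimension.
    embeddings = {}
    embedding_dim = None
    with gzip.open(embeddings_filename, 'rb') as embeddings_file:
        for line in embeddings_file:
            fields = line.decode('utf-8').strip().split()
            if embedding_dim is None:
                embedding_dim = len(fields) - 1
            elif len(fields) - 1 != embedding_dim:
                continue  # skip inconsistent lines like "word2 0.1 0.4" instead of crashing
            embeddings[fields[0]] = numpy.asarray(fields[1:], dtype='float32')
    # Random initialization guarantees words missing from the file still get
    # non-zero vectors, which is what the unseen-words test checks.
    weights = numpy.random.rand(data_indexer.get_vocab_size(), embedding_dim)
    for word in data_indexer.words_in_index():
        if word in embeddings:
            weights[data_indexer.get_word_index(word)] = embeddings[word]
    return weights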
Example #7
    def test_to_indexed_instance_converts_correctly(self):
        instance = CharacterSpanInstance("What do dogs eat?", "Dogs eat cats.",
                                         (9, 13))
        data_indexer = DataIndexer()
        what_index = data_indexer.add_word_to_index("what")
        do_index = data_indexer.add_word_to_index("do")
        dogs_index = data_indexer.add_word_to_index("dogs")
        eat_index = data_indexer.add_word_to_index("eat")
        cats_index = data_indexer.add_word_to_index("cats")
        period_index = data_indexer.add_word_to_index(".")
        question_index = data_indexer.add_word_to_index("?")
        stop_index = data_indexer.add_word_to_index(
            CharacterSpanInstance.stop_token)
        indexed_instance = instance.to_indexed_instance(data_indexer)
        assert indexed_instance.question_indices == [
            what_index, do_index, dogs_index, eat_index, question_index
        ]
        assert indexed_instance.passage_indices == [
            dogs_index, eat_index, cats_index, period_index, stop_index
        ]
        assert indexed_instance.label == (2, 3)

        # This check lives here, rather than in its own `test_as_training_data`
        # test, to be sure that the conversion to IndexedCharacterSpanInstance
        # was performed correctly.
        indexed_instance.pad({'num_question_words': 3, 'num_passage_words': 6})
        (question_array,
         passage_array), label = indexed_instance.as_training_data()
        assert isinstance(label, tuple)
        assert numpy.all(label[0] == numpy.asarray([0, 0, 1, 0, 0, 0]))
        assert numpy.all(label[1] == numpy.asarray([0, 0, 0, 1, 0, 0]))
        assert numpy.all(question_array == numpy.asarray(
            [dogs_index, eat_index, question_index]))
        assert numpy.all(passage_array == numpy.asarray(
            [dogs_index, eat_index, cats_index, period_index, stop_index, 0]))
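
The label conversion above, from the character span (9, 13) to the token span (2, 3), maps character offsets in the passage onto inclusive token indices. A minimal sketch of that computation, assuming lowercased tokens whose offsets can be recovered by scanning the passage; the helper below is hypothetical, not deep_qa's API:

def char_span_to_token_span(passage, tokens, span_start, span_end):
    # Walk the passage recovering each token's character offset, then return
    # the (inclusive) indices of the tokens containing the span endpoints.
    passage = passage.lower()  # the word tokenizer lowercases its tokens
    token_start = token_end = None
    search_from = 0
    for i, token in enumerate(tokens):
        offset = passage.index(token, search_from)
        if offset <= span_start < offset + len(token):
            token_start = i
        if offset <= span_end < offset + len(token):
            token_end = i
        search_from = offset + len(token)
    return token_start, token_end

# "Dogs eat cats." tokenizes to ["dogs", "eat", "cats", "."]; characters
# 9 and 13 fall inside "cats" and ".", giving the token span (2, 3):
assert char_span_to_token_span("Dogs eat cats.", ["dogs", "eat", "cats", "."], 9, 13) == (2, 3)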
Example #8
    def test_add_word_to_index_gives_consistent_results(self):
        data_indexer = DataIndexer()
        initial_vocab_size = data_indexer.get_vocab_size()
        word_index = data_indexer.add_word_to_index("word")
        assert "word" in data_indexer.words_in_index()
        assert data_indexer.get_word_index("word") == word_index
        assert data_indexer.get_word_from_index(word_index) == "word"
        assert data_indexer.get_vocab_size() == initial_vocab_size + 1

        # Now add it again, and make sure nothing changes.
        data_indexer.add_word_to_index("word")
        assert "word" in data_indexer.words_in_index()
        assert data_indexer.get_word_index("word") == word_index
        assert data_indexer.get_word_from_index(word_index) == "word"
        assert data_indexer.get_vocab_size() == initial_vocab_size + 1
Example #9
    def test_to_indexed_instance_converts_correctly(self):
        data_indexer = DataIndexer()
        a_word_index = data_indexer.add_word_to_index("a", namespace='words')
        sentence_index = data_indexer.add_word_to_index("sentence", namespace='words')
        capital_a_index = data_indexer.add_word_to_index("A", namespace='characters')
        space_index = data_indexer.add_word_to_index(" ", namespace='characters')
        a_index = data_indexer.add_word_to_index("a", namespace='characters')
        s_index = data_indexer.add_word_to_index("s", namespace='characters')
        e_index = data_indexer.add_word_to_index("e", namespace='characters')
        n_index = data_indexer.add_word_to_index("n", namespace='characters')
        t_index = data_indexer.add_word_to_index("t", namespace='characters')
        c_index = data_indexer.add_word_to_index("c", namespace='characters')

        instance = TrueFalseInstance("A sentence", None).to_indexed_instance(data_indexer)
        assert instance.word_indices == [a_word_index, sentence_index]
        TextInstance.tokenizer = tokenizers['characters'](Params({}))
        instance = TrueFalseInstance("A sentence", None).to_indexed_instance(data_indexer)
        assert instance.word_indices == [capital_a_index, space_index, s_index, e_index, n_index, t_index,
                                         e_index, n_index, c_index, e_index]
        TextInstance.tokenizer = tokenizers['words and characters'](Params({}))
        instance = TrueFalseInstance("A sentence", None).to_indexed_instance(data_indexer)
        assert instance.word_indices == [[a_word_index, a_index],
                                         [sentence_index, s_index, e_index, n_index, t_index,
                                          e_index, n_index, c_index, e_index]]
        TextInstance.tokenizer = tokenizers['words'](Params({}))
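
For each token, the 'words and characters' tokenizer emits the word's index followed by the indices of its characters, drawn from separate namespaces; that is why "a" becomes [a_word_index, a_index] above. A compact sketch of that indexing step, assuming a lowercasing whitespace splitter (illustrative only, not deep_qa's tokenizer class):

def index_words_and_characters(text, data_indexer):
    indexed = []
    for word in text.lower().split():
        word_index = data_indexer.get_word_index(word, namespace='words')
        char_indices = [data_indexer.get_word_index(char, namespace='characters')
                        for char in word]
        # "a" -> [a_word_index, a_index];
        # "sentence" -> [sentence_index, s, e, n, t, e, n, c, e]
        indexed.append([word_index] + char_indices)
    return indexed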
Example #10
    def test_to_indexed_instance_converts_correctly(self):
        instance = SnliInstance("a b", "d e f", "entails")
        data_indexer = DataIndexer()
        a_index = data_indexer.add_word_to_index("a")
        d_index = data_indexer.add_word_to_index("d")
        oov_index = data_indexer.get_word_index(data_indexer._oov_token)  # pylint: disable=protected-access
        indexed_instance = instance.to_indexed_instance(data_indexer)
        assert indexed_instance.first_sentence_indices == [a_index, oov_index]
        assert indexed_instance.second_sentence_indices == [d_index, oov_index, oov_index]
        assert indexed_instance.label == instance.label

    def test_to_indexed_instance_converts_correctly(self):
        instance = QuestionAnswerInstance("a A b", ["d", "e f D"], 1)
        data_indexer = DataIndexer()
        a_index = data_indexer.add_word_to_index("a")
        d_index = data_indexer.add_word_to_index("d")
        oov_index = data_indexer.get_word_index(data_indexer._oov_token)  # pylint: disable=protected-access
        indexed_instance = instance.to_indexed_instance(data_indexer)
        assert indexed_instance.question_indices == [a_index, a_index, oov_index]
        assert len(indexed_instance.option_indices) == 2
        assert indexed_instance.option_indices[0] == [d_index]
        assert indexed_instance.option_indices[1] == [oov_index, oov_index, d_index]
Example #12
    def test_to_indexed_instance_converts_correctly(self):
        data_indexer = DataIndexer()
        cats_index = data_indexer.add_word_to_index("cats")
        are_index = data_indexer.add_word_to_index("are")
        animals_index = data_indexer.add_word_to_index("animals")
        period_index = data_indexer.add_word_to_index(".")
        n_tag_index = data_indexer.add_word_to_index("N", namespace="tags")
        v_tag_index = data_indexer.add_word_to_index("V", namespace="tags")
        period_tag_index = data_indexer.add_word_to_index(".", namespace="tags")
        indexed_instance = self.instance.to_indexed_instance(data_indexer)
        expected_indices = [cats_index, are_index, animals_index, period_index]
        assert indexed_instance.text_indices == expected_indices
        expected_label = [self.one_hot(n_tag_index - 2, 3),
                          self.one_hot(v_tag_index - 2, 3),
                          self.one_hot(n_tag_index - 2, 3),
                          self.one_hot(period_tag_index - 2, 3)]
        assert_array_almost_equal(indexed_instance.label, expected_label)
        train_inputs, train_labels = indexed_instance.as_training_data()
        assert_array_almost_equal(train_labels, expected_label)
        assert_array_almost_equal(train_inputs, expected_indices)
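
The `- 2` offset above implies that the 'tags' namespace reserves its first two indices (presumably padding and OOV) ahead of real tags, so each label is a 3-class one-hot vector over the shifted tag index. A sketch of what the `one_hot` helper on the test case presumably does; this is an assumption, since only its call signature is visible here:

import numpy

def one_hot(index, length):
    # Build a length-`length` vector with a single 1 at position `index`.
    vector = numpy.zeros(length)
    vector[index] = 1
    return vector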
Example #13
    def test_to_indexed_instance_converts_correctly(self):
        instance = McQuestionAnswerInstance(
            "Cats from Nevada are eaten by dogs in XXX .",
            "Dogs eat cats from Nevada in Washington .",
            ["Nevada", "Washington"], 1)
        data_indexer = DataIndexer()
        cats_index = data_indexer.add_word_to_index("cats")
        are_index = data_indexer.add_word_to_index("are")
        eaten_index = data_indexer.add_word_to_index("eaten")
        by_index = data_indexer.add_word_to_index("by")
        dogs_index = data_indexer.add_word_to_index("dogs")
        in_index = data_indexer.add_word_to_index("in")
        XXX_index = data_indexer.add_word_to_index("xxx")
        period_index = data_indexer.add_word_to_index(".")
        eat_index = data_indexer.add_word_to_index("eat")
        from_index = data_indexer.add_word_to_index("from")
        nevada_index = data_indexer.add_word_to_index("nevada")
        washington_index = data_indexer.add_word_to_index("washington")
        indexed_instance = instance.to_indexed_instance(data_indexer)

        assert indexed_instance.question_indices == [
            cats_index, from_index, nevada_index, are_index, eaten_index,
            by_index, dogs_index, in_index, XXX_index, period_index
        ]
        assert indexed_instance.passage_indices == [
            dogs_index, eat_index, cats_index, from_index, nevada_index,
            in_index, washington_index, period_index
        ]
        assert len(indexed_instance.option_indices) == 2
        assert indexed_instance.option_indices[0] == [nevada_index]
        assert indexed_instance.option_indices[1] == [washington_index]
        assert indexed_instance.label == 1
Example #14
class TestSentenceInstance(DeepQaTestCase):
    def setUp(self):
        super(TestSentenceInstance, self).setUp()
        self.data_indexer = DataIndexer()
        self.this_index = self.data_indexer.add_word_to_index("this")
        self.is_index = self.data_indexer.add_word_to_index("is")
        self.a_index = self.data_indexer.add_word_to_index("a")
        self.sentence_index = self.data_indexer.add_word_to_index("sentence")
        self.start_index = self.data_indexer.add_word_to_index("<S>")
        self.end_index = self.data_indexer.add_word_to_index("</S>")
        self.space_index = self.data_indexer.add_word_to_index(' ')
        self.c_index = self.data_indexer.add_word_to_index('c')
        self.e_index = self.data_indexer.add_word_to_index('e')
        self.h_index = self.data_indexer.add_word_to_index('h')
        self.i_index = self.data_indexer.add_word_to_index('i')
        self.n_index = self.data_indexer.add_word_to_index('n')
        self.s_index = self.data_indexer.add_word_to_index('s')
        self.t_index = self.data_indexer.add_word_to_index('t')
        self.a_char_index = self.data_indexer.add_word_to_index('a', namespace='characters')
        self.c_char_index = self.data_indexer.add_word_to_index('c', namespace='characters')
        self.e_char_index = self.data_indexer.add_word_to_index('e', namespace='characters')
        self.h_char_index = self.data_indexer.add_word_to_index('h', namespace='characters')
        self.i_char_index = self.data_indexer.add_word_to_index('i', namespace='characters')
        self.n_char_index = self.data_indexer.add_word_to_index('n', namespace='characters')
        self.s_char_index = self.data_indexer.add_word_to_index('s', namespace='characters')
        self.t_char_index = self.data_indexer.add_word_to_index('t', namespace='characters')

    def tearDown(self):
        super(TestSentenceInstance, self).tearDown()
        TextInstance.tokenizer = tokenizers['words'](Params({}))

    @staticmethod
    def instance_to_line(text, index=None):
        index_str = '' if index is None else str(index) + '\t'
        return index_str + text

    def test_read_from_line_handles_one_column(self):
        text = "this is a sentence"
        instance = SentenceInstance.read_from_line(text)
        assert instance.text == text
        assert instance.label is None
        assert instance.index is None

    def test_read_from_line_handles_two_column(self):
        index = 23
        text = "this is a sentence"
        line = self.instance_to_line(text, index)

        instance = SentenceInstance.read_from_line(line)
        assert instance.text == text
        assert instance.index == index
        assert instance.label is None

    def test_end_to_end_conversion_to_arrays(self):
        instance = SentenceInstance("this is a sentence")
        indexed_instance = instance.to_indexed_instance(self.data_indexer)
        indexed_instance.pad({'num_sentence_words': 7})
        word_array, label_array = indexed_instance.as_training_data()
        assert_array_equal(word_array, [0, 0, self.start_index, self.this_index, self.is_index,
                                        self.a_index, self.sentence_index])
        assert_array_equal(label_array, [[0], [0], [self.this_index], [self.is_index],
                                         [self.a_index], [self.sentence_index], [self.end_index]])

    def test_end_to_end_conversion_to_arrays_with_character_tokenizer(self):
        TextInstance.tokenizer = tokenizers['characters'](Params({}))
        instance = SentenceInstance("a sentence")
        indexed_instance = instance.to_indexed_instance(self.data_indexer)
        indexed_instance.pad({'num_sentence_words': 13})
        word_array, label_array = indexed_instance.as_training_data()
        assert_array_equal(word_array, [0, 0, self.start_index, self.a_index, self.space_index,
                                        self.s_index, self.e_index, self.n_index, self.t_index,
                                        self.e_index, self.n_index, self.c_index, self.e_index])
        assert_array_equal(label_array, [[0], [0], [self.a_index], [self.space_index],
                                         [self.s_index], [self.e_index], [self.n_index],
                                         [self.t_index], [self.e_index], [self.n_index],
                                         [self.c_index], [self.e_index], [self.end_index]])

    def test_end_to_end_conversion_to_arrays_with_word_and_character_tokenizer(self):
        TextInstance.tokenizer = tokenizers['words and characters'](Params({}))
        instance = SentenceInstance("this is a sentence")
        indexed_instance = instance.to_indexed_instance(self.data_indexer)
        indexed_instance.pad({'num_sentence_words': 6, 'num_word_characters': 5})
        word_array, label_array = indexed_instance.as_training_data()
        assert_array_equal(word_array, [[0, 0, 0, 0, 0],
                                        [self.start_index, 0, 0, 0, 0],
                                        [self.this_index, self.t_char_index, self.h_char_index,
                                         self.i_char_index, self.s_char_index],
                                        [self.is_index, self.i_char_index, self.s_char_index, 0, 0],
                                        [self.a_index, self.a_char_index, 0, 0, 0],
                                        [self.sentence_index, self.s_char_index, self.e_char_index,
                                         self.n_char_index, self.t_char_index],
                                       ])
        assert_array_equal(label_array, [[0], [self.this_index], [self.is_index], [self.a_index],
                                         [self.sentence_index], [self.end_index]])
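
All three conversions above share a language-modeling layout: the input is <S> plus the sentence's tokens, the label is the tokens plus </S> (each wrapped in a single-element list), and both are left-padded with zeros to num_sentence_words. A small sketch of the left-padding these tests rely on; this is illustrative, deep_qa's actual pad() also handles per-word character padding, and the truncation direction here is an assumption:

def pad_sequence_left(indices, desired_length, padding_value=0):
    # Prepend padding so real tokens end flush with the final timestep,
    # then truncate (keeping the end) if the sequence is too long.
    padded = [padding_value] * max(0, desired_length - len(indices)) + list(indices)
    return padded[-desired_length:]

# pad_sequence_left([start, this, is, a, sentence], 7) -> [0, 0, start, this, is, a, sentence]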