コード例 #1
0
    def test_indexed_instance_padding(self):
        """Padding an indexed instance should force every nested dimension to the requested size."""
        indexer = DataIndexer()
        indexer.fit_word_dictionary(TextDataset([self.instance]))

        indexed = self.instance.to_indexed_instance(indexer)
        padding_lengths = {
            'num_question_tuples': 1,
            'num_background_tuples': 4,
            'num_slots': 3,
            'num_sentence_words': 6,
            'num_options': 4
        }
        indexed.pad(padding_lengths)

        # Answers: (num_options, num_question_tuples, num_slots, num_sentence_words).
        assert len(indexed.answers_indexed) == padding_lengths['num_options']
        for option_tuples in indexed.answers_indexed:
            assert len(option_tuples) == padding_lengths['num_question_tuples']
            for question_tuple in option_tuples:
                assert len(question_tuple) == padding_lengths['num_slots']
                for slot in question_tuple:
                    assert len(slot) == padding_lengths['num_sentence_words']

        # Background: (num_background_tuples, num_slots, num_sentence_words).
        assert len(indexed.background_indexed) == padding_lengths['num_background_tuples']
        for background_tuple in indexed.background_indexed:
            assert len(background_tuple) == padding_lengths['num_slots']
            for slot in background_tuple:
                assert len(slot) == padding_lengths['num_sentence_words']
コード例 #2
0
    def test_to_indexed_instance_converts_correctly(self):
        """Conversion should index question/passage words and translate the character span
        (9, 13) — "cats" — into passage token positions (2, 3)."""
        instance = CharacterSpanInstance("What do dogs eat?", "Dogs eat cats.",
                                         (9, 13))
        data_indexer = DataIndexer()
        # Insertion order determines each index value, so keep it stable.
        index = {word: data_indexer.add_word_to_index(word)
                 for word in ["what", "do", "dogs", "eat", "cats", ".", "?"]}
        index['stop'] = data_indexer.add_word_to_index(CharacterSpanInstance.stop_token)

        indexed_instance = instance.to_indexed_instance(data_indexer)
        assert indexed_instance.question_indices == [index['what'], index['do'], index['dogs'],
                                                     index['eat'], index['?']]
        assert indexed_instance.passage_indices == [index['dogs'], index['eat'], index['cats'],
                                                    index['.'], index['stop']]
        assert indexed_instance.label == (2, 3)

        # Padding / as_training_data are checked here too, to be sure the conversion to
        # the indexed form was performed correctly.
        indexed_instance.pad({'num_question_words': 3, 'num_passage_words': 6})
        (question_array, passage_array), label = indexed_instance.as_training_data()
        assert isinstance(label, tuple)
        # One-hot span begin / end over the padded passage length.
        assert numpy.all(label[0] == numpy.asarray([0, 0, 1, 0, 0, 0]))
        assert numpy.all(label[1] == numpy.asarray([0, 0, 0, 1, 0, 0]))
        assert numpy.all(question_array == numpy.asarray([index['dogs'], index['eat'],
                                                          index['?']]))
        assert numpy.all(passage_array == numpy.asarray([index['dogs'], index['eat'],
                                                         index['cats'], index['.'],
                                                         index['stop'], 0]))
コード例 #3
0
 def setUp(self):
     """Build a DataIndexer pre-populated with fixed word and character vocabularies."""
     super(TestSentenceInstance, self).setUp()
     self.data_indexer = DataIndexer()
     # Insertion order determines the index each token gets; keep it stable.
     self.this_index = self.data_indexer.add_word_to_index("this")
     self.is_index = self.data_indexer.add_word_to_index("is")
     self.a_index = self.data_indexer.add_word_to_index("a")
     self.sentence_index = self.data_indexer.add_word_to_index("sentence")
     self.start_index = self.data_indexer.add_word_to_index("<S>")
     self.end_index = self.data_indexer.add_word_to_index("</S>")
     # Single characters registered in the default (word) namespace, used by the
     # character-tokenizer tests.
     self.space_index = self.data_indexer.add_word_to_index(' ')
     for char in 'cehinst':
         setattr(self, char + '_index', self.data_indexer.add_word_to_index(char))
     # The same characters (plus 'a') in the dedicated 'characters' namespace.
     for char in 'acehinst':
         setattr(self, char + '_char_index',
                 self.data_indexer.add_word_to_index(char, namespace='characters'))
コード例 #4
0
 def test_get_embedding_layer_actually_initializes_word_vectors_correctly(self):
     """A word present in the pretrained file should get exactly the vector from the file."""
     indexer = DataIndexer()
     indexer.add_word_to_index("word")
     filename = self.TEST_DIR + "embeddings.gz"
     with gzip.open(filename, 'wb') as embeddings_file:
         embeddings_file.write("word 1.0 2.3 -1.0\n".encode('utf-8'))
     layer = PretrainedEmbeddings.get_embedding_layer(filename, indexer)
     vector = layer._initial_weights[0][indexer.get_word_index("word")]  # pylint: disable=protected-access
     assert numpy.allclose(vector, numpy.asarray([1.0, 2.3, -1.0]))
コード例 #5
0
 def test_get_embedding_layer_initializes_unseen_words_randomly_not_zero(self):
     """A vocabulary word absent from the pretrained file gets a random, non-zero vector."""
     indexer = DataIndexer()
     indexer.add_word_to_index("word2")
     filename = self.TEST_DIR + "embeddings.gz"
     with gzip.open(filename, 'wb') as embeddings_file:
         # The file only covers "word", so "word2" must be randomly initialized.
         embeddings_file.write("word 1.0 2.3 -1.0\n".encode('utf-8'))
     layer = PretrainedEmbeddings.get_embedding_layer(filename, indexer)
     vector = layer._initial_weights[0][indexer.get_word_index("word2")]  # pylint: disable=protected-access
     assert not numpy.allclose(vector, numpy.asarray([0.0, 0.0, 0.0]))
コード例 #6
0
 def test_to_indexed_instance_converts_correctly(self):
     """Seen words get their own indices; unseen words map to the OOV index."""
     instance = SnliInstance("a b", "d e f", "entails")
     data_indexer = DataIndexer()
     known = {word: data_indexer.add_word_to_index(word) for word in ("a", "d")}
     oov = data_indexer.get_word_index(data_indexer._oov_token)  # pylint: disable=protected-access
     indexed = instance.to_indexed_instance(data_indexer)
     # "b", "e", and "f" were never added, so they all resolve to OOV.
     assert indexed.first_sentence_indices == [known["a"], oov]
     assert indexed.second_sentence_indices == [known["d"], oov, oov]
     assert indexed.label == instance.label
コード例 #7
0
    def test_to_indexed_instance_converts_correctly(self):
        """Indexing should follow whichever tokenizer is installed on TextInstance."""
        data_indexer = DataIndexer()
        # Insertion order determines each index value, so keep it stable.
        word_indices = {word: data_indexer.add_word_to_index(word, namespace='words')
                        for word in ("a", "sentence")}
        char_indices = {char: data_indexer.add_word_to_index(char, namespace='characters')
                        for char in ("A", " ", "a", "s", "e", "n", "t", "c")}

        # Default word tokenizer: lower-cased word indices.
        instance = TrueFalseInstance("A sentence", None).to_indexed_instance(data_indexer)
        assert instance.word_indices == [word_indices["a"], word_indices["sentence"]]

        # Character tokenizer: one index per (case-preserving) character.
        TextInstance.tokenizer = tokenizers['characters']({})
        instance = TrueFalseInstance("A sentence", None).to_indexed_instance(data_indexer)
        assert instance.word_indices == [char_indices[char] for char in "A sentence"]

        # Combined tokenizer: [word index, character indices...] for each word.
        TextInstance.tokenizer = tokenizers['words and characters']({})
        instance = TrueFalseInstance("A sentence", None).to_indexed_instance(data_indexer)
        assert instance.word_indices == [
            [word_indices["a"], char_indices["a"]],
            [word_indices["sentence"]] + [char_indices[char] for char in "sentence"],
        ]
        # Restore the default tokenizer so later tests are unaffected.
        TextInstance.tokenizer = tokenizers['words']({})
コード例 #8
0
 def test_unknown_token(self):
     """Document the unknown-token contract: it owns index 1, and every unseen word
     resolves to it.  Solver code depends in a small way on this behavior, so any
     breaking change here should fail this test and prompt more consideration."""
     # pylint: disable=protected-access
     indexer = DataIndexer()
     unknown_index = indexer.get_word_index(indexer._oov_token)
     assert unknown_index == 1
     assert indexer.get_word_index("unseen word") == unknown_index
コード例 #9
0
 def test_to_indexed_instance_converts_correctly(self):
     """Question and option words are indexed; unknown words become OOV."""
     instance = QuestionAnswerInstance("a A b", ["d", "e f D"], 1)
     data_indexer = DataIndexer()
     a_index = data_indexer.add_word_to_index("a")
     d_index = data_indexer.add_word_to_index("d")
     unknown = data_indexer.get_word_index(data_indexer._oov_token)  # pylint: disable=protected-access
     indexed = instance.to_indexed_instance(data_indexer)
     # "A" lower-cases to a known word; "b" is unseen.
     assert indexed.question_indices == [a_index, a_index, unknown]
     assert len(indexed.option_indices) == 2
     assert indexed.option_indices[0] == [d_index]
     # "e" and "f" are unseen; "D" lower-cases to "d".
     assert indexed.option_indices[1] == [unknown, unknown, d_index]
コード例 #10
0
 def test_get_embedding_layer_skips_inconsistent_lines(self):
     """A pretrained line with the wrong number of dimensions is skipped, so that word
     falls back to random initialization instead of the truncated file values."""
     data_indexer = DataIndexer()
     data_indexer.add_word_to_index("word1")
     data_indexer.add_word_to_index("word2")
     embeddings_filename = self.TEST_DIR + "embeddings.gz"
     with gzip.open(embeddings_filename, 'wb') as embeddings_file:
         embeddings_file.write("word1 1.0 2.3 -1.0\n".encode('utf-8'))
         # Malformed: only two values where three are expected.
         embeddings_file.write("word2 0.1 0.4 \n".encode('utf-8'))
     embedding_layer = PretrainedEmbeddings.get_embedding_layer(embeddings_filename, data_indexer)
     # Fix: the attribute is `_initial_weights`, as used by the sibling embedding-layer
     # tests; `initial_weights` (no underscore) raises AttributeError.
     word_vector = embedding_layer._initial_weights[0][data_indexer.get_word_index("word2")]  # pylint: disable=protected-access
     assert not numpy.allclose(word_vector[:2], numpy.asarray([0.1, 0.4]))
コード例 #11
0
 def test_get_embedding_layer_crashes_when_embedding_dim_is_one(self):
     """A header-style first line ("dimensionality 3") parses as a 1-dimensional
     embedding, which the loader should reject by raising."""
     data_indexer = DataIndexer()
     filename = self.TEST_DIR + "embeddings.gz"
     lines = ["dimensionality 3\n", "word1 1.0 2.3 -1.0\n", "word2 0.1 0.4 -4.0\n"]
     with gzip.open(filename, 'wb') as embeddings_file:
         for line in lines:
             embeddings_file.write(line.encode('utf-8'))
     with pytest.raises(Exception):
         PretrainedEmbeddings.get_embedding_layer(filename, data_indexer)
コード例 #12
0
    def test_works_with_word_and_character_tokenizer(self):
        """End-to-end check that tuple-inference instances index and pad correctly when
        the word-and-character tokenizer is installed: each token becomes
        [word_index, char_index, ...] padded to ``num_word_characters``."""
        answer_options_simple = ("a<>a sentence<><>")
        background_simple = ("a<>a sentence<><>")
        line_simple = "\t".join(
            str(x) for x in [answer_options_simple, background_simple, "0"])
        TextInstance.tokenizer = WordAndCharacterTokenizer(Params({}))
        data_indexer = DataIndexer()
        # Word-level and character-level vocabularies live in separate namespaces;
        # insertion order determines each index value.
        a_word_index = data_indexer.add_word_to_index("a", namespace='words')
        sentence_index = data_indexer.add_word_to_index("sentence",
                                                        namespace='words')
        a_index = data_indexer.add_word_to_index("a", namespace='characters')
        s_index = data_indexer.add_word_to_index("s", namespace='characters')
        e_index = data_indexer.add_word_to_index("e", namespace='characters')

        new_instance = TupleInferenceInstance.read_from_line(line_simple)
        indexed = new_instance.to_indexed_instance(data_indexer)

        padding_lengths = {
            'num_question_tuples': 1,
            'num_background_tuples': 1,
            'num_slots': 2,
            'num_sentence_words': 2,
            'num_options': 1,
            'num_word_characters': 3
        }
        indexed.pad(padding_lengths)
        # Each word is [word_index, char_index, ...] padded/truncated to
        # num_word_characters; zeros are padding.
        expected_indexed_tuple = [[[0, 0, 0], [a_word_index, a_index, 0]],
                                  [[a_word_index, a_index, 0],
                                   [sentence_index, s_index, e_index]]]
        expected_answers_indexed = numpy.asarray([expected_indexed_tuple])
        expected_background_indexed = numpy.asarray(expected_indexed_tuple)
        assert numpy.all(indexed.answers_indexed == expected_answers_indexed)
        assert numpy.all(
            indexed.background_indexed == expected_background_indexed)
        # Restore the default tokenizer so other tests are unaffected.
        TextInstance.tokenizer = tokenizers['words'](Params({}))
コード例 #13
0
    def test_get_embedding_layer_uses_correct_embedding_dim(self):
        """The layer's output dimension should track the width of the pretrained file."""
        data_indexer = DataIndexer()
        embeddings_filename = self.TEST_DIR + "embeddings.gz"

        def write_embeddings(lines):
            # (Re)write the gzipped embeddings file with the given text lines.
            with gzip.open(embeddings_filename, 'wb') as embeddings_file:
                for line in lines:
                    embeddings_file.write(line.encode('utf-8'))

        write_embeddings(["word1 1.0 2.3 -1.0\n", "word2 0.1 0.4 -4.0\n"])
        layer = PretrainedEmbeddings.get_embedding_layer(embeddings_filename, data_indexer)
        assert layer.output_dim == 3

        write_embeddings(["word1 1.0 2.3 -1.0 3.1\n", "word2 0.1 0.4 -4.0 -1.2\n"])
        layer = PretrainedEmbeddings.get_embedding_layer(embeddings_filename, data_indexer)
        assert layer.output_dim == 4
コード例 #14
0
 def test_to_indexed_instance_converts_correctly(self):
     """Text words and tag labels are indexed in their own namespaces, and the label
     becomes a sequence of one-hot vectors over the tag vocabulary."""
     data_indexer = DataIndexer()
     # Insertion order determines each index value, so keep it stable.
     word_indices = {word: data_indexer.add_word_to_index(word)
                     for word in ("cats", "are", "animals", ".")}
     tag_indices = {tag: data_indexer.add_word_to_index(tag, namespace="tags")
                    for tag in ("N", "V", ".")}
     indexed = self.instance.to_indexed_instance(data_indexer)
     expected_indices = [word_indices[word] for word in ("cats", "are", "animals", ".")]
     assert indexed.text_indices == expected_indices
     # The "- 2" offset presumably skips two reserved tag indices (padding / OOV) —
     # NOTE(review): confirm against DataIndexer's reserved entries.
     expected_label = [self.one_hot(tag_indices[tag] - 2, 3)
                       for tag in ("N", "V", "N", ".")]
     assert_array_almost_equal(indexed.label, expected_label)
     train_inputs, train_labels = indexed.as_training_data()
     assert_array_almost_equal(train_labels, expected_label)
     assert_array_almost_equal(train_inputs, expected_indices)
コード例 #15
0
 def test_indexed_instance_padding(self):
     """Padding should force every nested dimension of answers_indexed to the requested size."""
     indexed = self.instance.to_indexed_instance(DataIndexer())
     padding_lengths = {
         'num_graphlets': 1,
         'num_alignments': 2,
         'num_features': 5,
         'num_options': 4
     }
     indexed.pad(padding_lengths)
     # Shape: (num_options, num_graphlets, num_alignments, num_features).
     assert len(indexed.answers_indexed) == padding_lengths['num_options']
     for option_graphlets in indexed.answers_indexed:
         assert len(option_graphlets) == padding_lengths['num_graphlets']
         for graphlet in option_graphlets:
             assert len(graphlet) == padding_lengths['num_alignments']
             for alignment in graphlet:
                 assert len(alignment) == padding_lengths['num_features']
コード例 #16
0
    def test_to_indexed_instance_converts_correctly(self):
        """Question, passage, and answer options are all indexed through the same vocabulary."""
        instance = McQuestionAnswerInstance(
            "Cats from Nevada are eaten by dogs in XXX .",
            "Dogs eat cats from Nevada in Washington .",
            ["Nevada", "Washington"], 1)
        data_indexer = DataIndexer()
        # Words are added lower-cased; insertion order determines each index value.
        index = {word: data_indexer.add_word_to_index(word)
                 for word in ["cats", "are", "eaten", "by", "dogs", "in", "xxx", ".",
                              "eat", "from", "nevada", "washington"]}
        indexed = instance.to_indexed_instance(data_indexer)

        assert indexed.question_indices == [
            index[word] for word in ["cats", "from", "nevada", "are", "eaten",
                                     "by", "dogs", "in", "xxx", "."]
        ]
        assert indexed.passage_indices == [
            index[word] for word in ["dogs", "eat", "cats", "from", "nevada",
                                     "in", "washington", "."]
        ]
        assert len(indexed.option_indices) == 2
        assert indexed.option_indices[0] == [index["nevada"]]
        assert indexed.option_indices[1] == [index["washington"]]
        assert indexed.label == 1
コード例 #17
0
class TestSentenceInstance(DeepQaTestCase):
    """Tests for SentenceInstance: line parsing and end-to-end conversion to training
    arrays under the word, character, and word-and-character tokenizers."""
    def setUp(self):
        """Pre-populate a DataIndexer; insertion order determines each token's index."""
        super(TestSentenceInstance, self).setUp()
        self.data_indexer = DataIndexer()
        # Word-namespace entries: words, sentence boundary markers, and the single
        # characters used by the character-tokenizer test.
        self.this_index = self.data_indexer.add_word_to_index("this")
        self.is_index = self.data_indexer.add_word_to_index("is")
        self.a_index = self.data_indexer.add_word_to_index("a")
        self.sentence_index = self.data_indexer.add_word_to_index("sentence")
        self.start_index = self.data_indexer.add_word_to_index("<S>")
        self.end_index = self.data_indexer.add_word_to_index("</S>")
        self.space_index = self.data_indexer.add_word_to_index(' ')
        self.c_index = self.data_indexer.add_word_to_index('c')
        self.e_index = self.data_indexer.add_word_to_index('e')
        self.h_index = self.data_indexer.add_word_to_index('h')
        self.i_index = self.data_indexer.add_word_to_index('i')
        self.n_index = self.data_indexer.add_word_to_index('n')
        self.s_index = self.data_indexer.add_word_to_index('s')
        self.t_index = self.data_indexer.add_word_to_index('t')
        # The same characters in the dedicated 'characters' namespace, used by the
        # word-and-character tokenizer test.
        self.a_char_index = self.data_indexer.add_word_to_index('a', namespace='characters')
        self.c_char_index = self.data_indexer.add_word_to_index('c', namespace='characters')
        self.e_char_index = self.data_indexer.add_word_to_index('e', namespace='characters')
        self.h_char_index = self.data_indexer.add_word_to_index('h', namespace='characters')
        self.i_char_index = self.data_indexer.add_word_to_index('i', namespace='characters')
        self.n_char_index = self.data_indexer.add_word_to_index('n', namespace='characters')
        self.s_char_index = self.data_indexer.add_word_to_index('s', namespace='characters')
        self.t_char_index = self.data_indexer.add_word_to_index('t', namespace='characters')

    def tearDown(self):
        """Restore the default word tokenizer so later tests are unaffected."""
        super(TestSentenceInstance, self).tearDown()
        TextInstance.tokenizer = tokenizers['words'](Params({}))

    @staticmethod
    def instance_to_line(text, index=None):
        """Format a data-file line: an optional index column (tab-separated), then the text."""
        index_str = '' if index is None else str(index) + '\t'
        return index_str + text

    def test_read_from_line_handles_one_column(self):
        """A bare text line parses with no index and no label."""
        text = "this is a sentence"
        instance = SentenceInstance.read_from_line(text)
        assert instance.text == text
        assert instance.label is None
        assert instance.index is None

    def test_read_from_line_handles_two_column(self):
        """An "index<TAB>text" line parses the index; the label stays None."""
        index = 23
        text = "this is a sentence"
        line = self.instance_to_line(text, index)

        instance = SentenceInstance.read_from_line(line)
        assert instance.text == text
        assert instance.index == index
        assert instance.label is None

    def test_end_to_end_conversion_to_arrays(self):
        """Word input is left-padded with zeros and starts with <S>; the label is the
        input shifted by one, ending with </S>."""
        instance = SentenceInstance("this is a sentence")
        indexed_instance = instance.to_indexed_instance(self.data_indexer)
        indexed_instance.pad({'num_sentence_words': 7})
        word_array, label_array = indexed_instance.as_training_data()
        assert_array_equal(word_array, [0, 0, self.start_index, self.this_index, self.is_index,
                                        self.a_index, self.sentence_index])
        assert_array_equal(label_array, [[0], [0], [self.this_index], [self.is_index],
                                         [self.a_index], [self.sentence_index], [self.end_index]])

    def test_end_to_end_conversion_to_arrays_with_character_tokenizer(self):
        """Same shifted-by-one layout, but every character (including spaces) is a token."""
        TextInstance.tokenizer = tokenizers['characters'](Params({}))
        instance = SentenceInstance("a sentence")
        indexed_instance = instance.to_indexed_instance(self.data_indexer)
        indexed_instance.pad({'num_sentence_words': 13})
        word_array, label_array = indexed_instance.as_training_data()
        assert_array_equal(word_array, [0, 0, self.start_index, self.a_index, self.space_index,
                                        self.s_index, self.e_index, self.n_index, self.t_index,
                                        self.e_index, self.n_index, self.c_index, self.e_index])
        assert_array_equal(label_array, [[0], [0], [self.a_index], [self.space_index],
                                         [self.s_index], [self.e_index], [self.n_index],
                                         [self.t_index], [self.e_index], [self.n_index],
                                         [self.c_index], [self.e_index], [self.end_index]])

    def test_end_to_end_conversion_to_arrays_with_word_and_character_tokenizer(self):
        """Each input token is [word_index, char_indices...] padded to num_word_characters;
        the label stays word-level."""
        TextInstance.tokenizer = tokenizers['words and characters'](Params({}))
        instance = SentenceInstance("this is a sentence")
        indexed_instance = instance.to_indexed_instance(self.data_indexer)
        indexed_instance.pad({'num_sentence_words': 6, 'num_word_characters': 5})
        word_array, label_array = indexed_instance.as_training_data()
        assert_array_equal(word_array, [[0, 0, 0, 0, 0],
                                        [self.start_index, 0, 0, 0, 0],
                                        [self.this_index, self.t_char_index, self.h_char_index,
                                         self.i_char_index, self.s_char_index],
                                        [self.is_index, self.i_char_index, self.s_char_index, 0, 0],
                                        [self.a_index, self.a_char_index, 0, 0, 0],
                                        [self.sentence_index, self.s_char_index, self.e_char_index,
                                         self.n_char_index, self.t_char_index],
                                       ])
        assert_array_equal(label_array, [[0], [self.this_index], [self.is_index], [self.a_index],
                                         [self.sentence_index], [self.end_index]])
コード例 #18
0
 def test_set_from_file(self):
     """Loading a vocabulary file assigns line-order indices (index 0 is reserved for
     padding) and makes the designated OOV token absorb unseen words."""
     # pylint: disable=protected-access
     vocab_filename = self.TEST_DIR + 'vocab_file'
     vocab_words = ['<S>', '</S>', '<UNK>', 'a', 'word', 'another']
     with codecs.open(vocab_filename, 'w', 'utf-8') as vocab_file:
         for word in vocab_words:
             vocab_file.write(word + '\n')
     data_indexer = DataIndexer()
     data_indexer.set_from_file(vocab_filename, oov_token="<UNK>")
     assert data_indexer._oov_token == "<UNK>"
     # Unseen words map to the OOV token's index (line 3 of the file).
     assert data_indexer.get_word_index("random string") == 3
     # Index 0 is the padding token; file words start at 1, in file order.
     assert data_indexer.get_word_from_index(0) == data_indexer._padding_token
     for position, word in enumerate(vocab_words, start=1):
         assert data_indexer.get_word_index(word) == position
         assert data_indexer.get_word_from_index(position) == word
コード例 #19
0
    def test_add_word_to_index_gives_consistent_results(self):
        """Adding a word is idempotent: repeated adds keep the same index and vocab size."""
        data_indexer = DataIndexer()
        initial_vocab_size = data_indexer.get_vocab_size()

        def check_word(expected_index):
            # The word is present, looks up to the same index, round-trips, and the
            # vocabulary grew by exactly one entry.
            assert "word" in data_indexer.words_in_index()
            assert data_indexer.get_word_index("word") == expected_index
            assert data_indexer.get_word_from_index(expected_index) == "word"
            assert data_indexer.get_vocab_size() == initial_vocab_size + 1

        word_index = data_indexer.add_word_to_index("word")
        check_word(word_index)
        # Adding the same word again must change nothing.
        data_indexer.add_word_to_index("word")
        check_word(word_index)
コード例 #20
0
 def test_to_indexed_instance_converts_correctly(self):
     """Every candidate sentence and the question get word-indexed; the label passes through."""
     instance = SentenceSelectionInstance("What do dogs eat?",
                                          ["Dogs eat cats.",
                                           "Dogs play with cats.",
                                           "Dogs enjoy cats."],
                                          0)
     data_indexer = DataIndexer()
     # Insertion order determines each index value, so keep it stable.
     index = {word: data_indexer.add_word_to_index(word)
              for word in ["what", "do", "dogs", "eat", "cats", ".", "?",
                           "play", "with", "enjoy"]}
     indexed = instance.to_indexed_instance(data_indexer)
     assert indexed.question_indices == [index[word] for word in
                                         ["what", "do", "dogs", "eat", "?"]]
     expected_sentences = [["dogs", "eat", "cats", "."],
                           ["dogs", "play", "with", "cats", "."],
                           ["dogs", "enjoy", "cats", "."]]
     assert indexed.sentences_indices == [[index[word] for word in sentence]
                                          for sentence in expected_sentences]
     assert indexed.label == 0
コード例 #21
0
    def test_namespaces(self):
        """Namespaces keep independent vocabularies: the same word gets fresh state
        in each namespace."""
        data_indexer = DataIndexer()
        initial_vocab_size = data_indexer.get_vocab_size()

        word_index = data_indexer.add_word_to_index("word", namespace='1')
        assert "word" in data_indexer.words_in_index(namespace='1')
        assert data_indexer.get_word_index("word", namespace='1') == word_index
        assert data_indexer.get_word_from_index(word_index, namespace='1') == "word"
        assert data_indexer.get_vocab_size(namespace='1') == initial_vocab_size + 1

        # A different namespace starts from scratch, even for an already-seen word.
        word2_index = data_indexer.add_word_to_index("word2", namespace='2')
        word_index = data_indexer.add_word_to_index("word", namespace='2')
        for word, index in (("word", word_index), ("word2", word2_index)):
            assert word in data_indexer.words_in_index(namespace='2')
            assert data_indexer.get_word_index(word, namespace='2') == index
            assert data_indexer.get_word_from_index(index, namespace='2') == word
        assert data_indexer.get_vocab_size(namespace='2') == initial_vocab_size + 2
コード例 #22
0
    def test_fit_word_dictionary_respects_min_count(self):
        """Words whose corpus count is below min_count are dropped from the fitted vocabulary."""
        instance = TextClassificationInstance("a a a a b b c c c", True)
        dataset = TextDataset([instance])

        def fitted_vocab(min_count):
            # Fit a fresh indexer and report which of a/b/c survived.
            indexer = DataIndexer()
            indexer.fit_word_dictionary(dataset, min_count=min_count)
            return {word for word in 'abc' if word in indexer.words_in_index()}

        # Only 'a' (count 4) meets min_count=4; 'b' (2) and 'c' (3) are dropped.
        assert fitted_vocab(min_count=4) == {'a'}
        # With min_count=1, everything is kept.
        assert fitted_vocab(min_count=1) == {'a', 'b', 'c'}