Example no. 1
0
    def test_namespaces(self):
        """Words added under distinct namespaces must be tracked independently."""
        indexer = DataIndexer()
        base_size = indexer.get_vocab_size()

        idx = indexer.add_word_to_index("word", namespace='1')
        assert "word" in indexer.words_in_index(namespace='1')
        assert indexer.get_word_index("word", namespace='1') == idx
        assert indexer.get_word_from_index(idx, namespace='1') == "word"
        assert indexer.get_vocab_size(namespace='1') == base_size + 1

        # A fresh namespace should behave like a brand-new vocabulary rather
        # than extending the first one.
        second_idx = indexer.add_word_to_index("word2", namespace='2')
        idx = indexer.add_word_to_index("word", namespace='2')
        assert "word" in indexer.words_in_index(namespace='2')
        assert "word2" in indexer.words_in_index(namespace='2')
        assert indexer.get_word_index("word", namespace='2') == idx
        assert indexer.get_word_index("word2", namespace='2') == second_idx
        assert indexer.get_word_from_index(idx, namespace='2') == "word"
        assert indexer.get_word_from_index(second_idx, namespace='2') == "word2"
        assert indexer.get_vocab_size(namespace='2') == base_size + 2
Example no. 2
0
 def test_set_from_file(self):
     """Loading a vocabulary file must assign indices in file order, with 0
     reserved for padding and the OOV token absorbing unknown words."""
     # pylint: disable=protected-access
     vocab_filename = self.TEST_DIR + 'vocab_file'
     tokens = ['<S>', '</S>', '<UNK>', 'a', 'word', 'another']
     with codecs.open(vocab_filename, 'w', 'utf-8') as vocab_file:
         vocab_file.write('\n'.join(tokens) + '\n')
     data_indexer = DataIndexer()
     data_indexer.set_from_file(vocab_filename, oov_token="<UNK>")
     assert data_indexer._oov_token == "<UNK>"
     # Unknown strings map onto the OOV token's index (position 3 in the file).
     assert data_indexer.get_word_index("random string") == 3
     # Index 0 is the padding token; file entries start at index 1.
     assert data_indexer.get_word_from_index(0) == data_indexer._padding_token
     for expected_index, token in enumerate(tokens, start=1):
         assert data_indexer.get_word_index(token) == expected_index
         assert data_indexer.get_word_from_index(expected_index) == token
Example no. 3
0
    def test_add_word_to_index_gives_consistent_results(self):
        """Inserting the same word twice must be a no-op the second time."""
        indexer = DataIndexer()
        size_before = indexer.get_vocab_size()
        index = indexer.add_word_to_index("word")

        # Pass 0 checks the state after the first insertion; pass 1 repeats
        # the insertion and verifies the exact same invariants still hold.
        for attempt in range(2):
            if attempt:
                indexer.add_word_to_index("word")
            assert "word" in indexer.words_in_index()
            assert indexer.get_word_index("word") == index
            assert indexer.get_word_from_index(index) == "word"
            assert indexer.get_vocab_size() == size_before + 1