def test_words_tokenizes_the_sentence_correctly(self):
    # The default word tokenizer lowercases tokens and splits off punctuation
    # and contractions ("isn't" -> "is", "n't").
    t = TextClassificationInstance("This is a sentence.", None)
    assert t.words() == {'words': ['this', 'is', 'a', 'sentence', '.']}
    t = TextClassificationInstance("This isn't a sentence.", None)
    assert t.words() == {'words': ['this', 'is', "n't", 'a', 'sentence', '.']}
    t = TextClassificationInstance("And, I have commas.", None)
    assert t.words() == {'words': ['and', ',', 'i', 'have', 'commas', '.']}
Example 2
def test_to_indexed_instance_converts_correctly(self):
    data_indexer = DataIndexer()
    # Word-level vocabulary entries live in the 'words' namespace.
    sentence_index = data_indexer.add_word_to_index("sentence", namespace='words')
    capital_a_index = data_indexer.add_word_to_index("A", namespace='words')
    space_index = data_indexer.add_word_to_index(" ", namespace='words')
    a_index = data_indexer.add_word_to_index("a", namespace='words')
    s_index = data_indexer.add_word_to_index("s", namespace='words')
    e_index = data_indexer.add_word_to_index("e", namespace='words')
    n_index = data_indexer.add_word_to_index("n", namespace='words')
    t_index = data_indexer.add_word_to_index("t", namespace='words')
    c_index = data_indexer.add_word_to_index("c", namespace='words')
    # Character-level entries live in a separate 'characters' namespace.
    a_char_index = data_indexer.add_word_to_index("a", namespace='characters')
    s_char_index = data_indexer.add_word_to_index("s", namespace='characters')
    e_char_index = data_indexer.add_word_to_index("e", namespace='characters')
    n_char_index = data_indexer.add_word_to_index("n", namespace='characters')
    t_char_index = data_indexer.add_word_to_index("t", namespace='characters')
    c_char_index = data_indexer.add_word_to_index("c", namespace='characters')

    # Default word tokenizer: lowercased word indices.
    instance = TextClassificationInstance("A sentence", None).to_indexed_instance(data_indexer)
    assert instance.word_indices == [a_index, sentence_index]

    # Character tokenizer: one index per character, case and spaces preserved.
    TextInstance.tokenizer = tokenizers['characters'](Params({}))
    instance = TextClassificationInstance("A sentence", None).to_indexed_instance(data_indexer)
    assert instance.word_indices == [capital_a_index, space_index, s_index, e_index, n_index,
                                     t_index, e_index, n_index, c_index, e_index]

    # Combined tokenizer: each token becomes [word index, character indices...].
    TextInstance.tokenizer = tokenizers['words and characters'](Params({}))
    instance = TextClassificationInstance("A sentence", None).to_indexed_instance(data_indexer)
    assert instance.word_indices == [[a_index, a_char_index],
                                     [sentence_index, s_char_index, e_char_index, n_char_index,
                                      t_char_index, e_char_index, n_char_index, c_char_index,
                                      e_char_index]]
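For reference, the flow above can be condensed into a small standalone sketch. It is built only from calls already shown in the test (DataIndexer.add_word_to_index, to_indexed_instance, the tokenizers registry); the concrete integer indices are whatever the DataIndexer assigns, and the imports are assumed to come from the same package as these tests.

# A minimal sketch, assuming the same imports the tests above use.
data_indexer = DataIndexer()
for word in ["a", "sentence"]:
    data_indexer.add_word_to_index(word, namespace='words')        # word-level vocabulary
for char in "asentc":
    data_indexer.add_word_to_index(char, namespace='characters')   # character-level vocabulary

# With the combined tokenizer, each token is indexed as [word_index, char_index, ...].
TextInstance.tokenizer = tokenizers['words and characters'](Params({}))
indexed = TextClassificationInstance("A sentence", None).to_indexed_instance(data_indexer)
print(indexed.word_indices)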
Example 3
def test_words_tokenizes_the_sentence_correctly(self):
    t = TextClassificationInstance("This is a sentence.", None)
    assert t.words() == {'words': ['this', 'is', 'a', 'sentence', '.']}
    # The character tokenizer keeps case and spaces, one token per character.
    TextInstance.tokenizer = tokenizers['characters'](Params({}))
    assert t.words() == {
        'words': ['T', 'h', 'i', 's', ' ', 'i', 's', ' ', 'a', ' ', 's', 'e',
                  'n', 't', 'e', 'n', 'c', 'e', '.']
    }
    # The combined tokenizer returns both keys; its characters come from the
    # lowercased word tokens, so spaces are dropped.
    TextInstance.tokenizer = tokenizers['words and characters'](Params({}))
    assert t.words() == {
        'words': ['this', 'is', 'a', 'sentence', '.'],
        'characters': ['t', 'h', 'i', 's', 'i', 's', 'a', 's', 'e', 'n', 't', 'e',
                       'n', 'c', 'e', '.']
    }
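Because TextInstance.tokenizer is a class attribute, tests like the one above leave the character-based tokenizer installed globally. A teardown sketch is shown below; it assumes the tokenizers registry also has a plain 'words' entry for the default word tokenizer, which these snippets never show directly.

def tearDown(self):
    # Restore the assumed default word tokenizer after each test.
    TextInstance.tokenizer = tokenizers['words'](Params({}))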
Example 4
def test_read_from_line_handles_two_column_with_index(self):
    index = 23
    text = "this is a sentence"
    label = None
    line = self.instance_to_line(text, label, index)

    instance = TextClassificationInstance.read_from_line(line)
    assert instance.text == text
    assert instance.label is label
    assert instance.index == index
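The instance_to_line helper belongs to the shared test base class and is not shown in these snippets. Purely as an illustration of what the two-column-with-index case might produce, a hypothetical tab-separated version could look like the following; the real helper's column order and separator may differ.

# Hypothetical helper for illustration only; not the library's actual implementation.
def instance_to_line(self, text, label=None, index=None):
    fields = []
    if index is not None:
        fields.append(str(index))   # optional leading index column
    fields.append(text)             # the sentence itself
    if label is not None:
        fields.append(str(label))   # optional trailing label column
    return '\t'.join(fields)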
Example 5
def test_get_nearest_neighbors_does_not_crash(self):
    args = Params({
        'corpus_path': self.corpus_path,
        'model_serialization_prefix': './',
        'num_sentence_words': 5,
    })
    model = self.get_model(DifferentiableSearchMemoryNetwork, args)
    # Swap in a stub encoder so the LSH can be built without a trained model.
    model.encoder_model = FakeEncoder()
    model._initialize_lsh()
    model.num_sentence_words = 5
    model.max_knowledge_length = 2
    # Smoke test: the call should simply not raise.
    model.get_nearest_neighbors(TextClassificationInstance("this is a sentence", True))
Example 6
def test_read_from_line_handles_one_column(self):
    text = "this is a sentence"
    instance = TextClassificationInstance.read_from_line(text)
    assert instance.text == text
    assert instance.label is None
    assert instance.index is None