Ejemplo n.º 1
0
    def test_to_indexed_instance_converts_correctly(self):
        """Check ``to_indexed_instance`` under the word, character and combined tokenizers.

        The test reassigns the class attribute ``TextInstance.tokenizer``, so the
        change outlives this method unless it is undone.  The reset to the
        ``'words'`` tokenizer therefore lives in a ``finally`` clause: it runs
        even when one of the assertions fails.
        """
        data_indexer = DataIndexer()
        a_word_index = data_indexer.add_word_to_index("a", namespace='words')
        sentence_index = data_indexer.add_word_to_index("sentence", namespace='words')
        capital_a_index = data_indexer.add_word_to_index("A", namespace='characters')
        space_index = data_indexer.add_word_to_index(" ", namespace='characters')
        a_index = data_indexer.add_word_to_index("a", namespace='characters')
        s_index = data_indexer.add_word_to_index("s", namespace='characters')
        e_index = data_indexer.add_word_to_index("e", namespace='characters')
        n_index = data_indexer.add_word_to_index("n", namespace='characters')
        t_index = data_indexer.add_word_to_index("t", namespace='characters')
        c_index = data_indexer.add_word_to_index("c", namespace='characters')

        # Whatever tokenizer is currently installed: expect one index per word.
        instance = TrueFalseInstance("A sentence", None).to_indexed_instance(data_indexer)
        assert instance.word_indices == [a_word_index, sentence_index]
        try:
            # Character tokenizer: one index per character, including the space.
            TextInstance.tokenizer = tokenizers['characters']({})
            instance = TrueFalseInstance("A sentence", None).to_indexed_instance(data_indexer)
            assert instance.word_indices == [capital_a_index, space_index, s_index, e_index, n_index,
                                             t_index, e_index, n_index, c_index, e_index]
            # Combined tokenizer: per word, the word index followed by character indices.
            TextInstance.tokenizer = tokenizers['words and characters']({})
            instance = TrueFalseInstance("A sentence", None).to_indexed_instance(data_indexer)
            assert instance.word_indices == [[a_word_index, a_index],
                                             [sentence_index, s_index, e_index, n_index, t_index,
                                              e_index, n_index, c_index, e_index]]
        finally:
            # Always put the word tokenizer back, even after a failed assertion.
            TextInstance.tokenizer = tokenizers['words']({})
Ejemplo n.º 2
0
 def test_merge(self):
     """Merging two one-instance datasets yields both instances, in order."""
     first = TrueFalseInstance("testing", None, None)
     second = TrueFalseInstance("testing1", None, None)
     left = Dataset([first])
     right = Dataset([second])
     combined = left.merge(right)
     assert combined.instances == [first, second]
Ejemplo n.º 3
0
 def test_words_tokenizes_the_sentence_correctly(self):
     """Check ``words()`` under the word, character and combined tokenizers.

     ``TextInstance.tokenizer`` is a class attribute that this test reassigns,
     so the reset to the ``'words'`` tokenizer is done in a ``finally`` clause
     and still happens when an assertion fails.
     """
     word_tokens = ['this', 'is', 'a', 'sentence', '.']
     character_tokens = ['T', 'h', 'i', 's', ' ', 'i', 's', ' ', 'a', ' ', 's',
                         'e', 'n', 't', 'e', 'n', 'c', 'e', '.']

     t = TrueFalseInstance("This is a sentence.", None)
     # Whatever tokenizer is currently installed: expect lower-cased word tokens.
     assert t.words() == {'words': word_tokens}
     try:
         TextInstance.tokenizer = tokenizers['characters']({})
         assert t.words() == {'characters': character_tokens}
         TextInstance.tokenizer = tokenizers['words and characters']({})
         assert t.words() == {'words': word_tokens, 'characters': character_tokens}
     finally:
         # Always put the word tokenizer back, even after a failed assertion.
         TextInstance.tokenizer = tokenizers['words']({})
Ejemplo n.º 4
0
 def test_words_tokenizes_the_sentence_correctly(self):
     """Check ``words()`` on a plain sentence, a contraction, and a comma."""
     expectations = [
         ("This is a sentence.", ['this', 'is', 'a', 'sentence', '.']),
         ("This isn't a sentence.", ['this', 'is', "n't", 'a', 'sentence', '.']),
         ("And, I have commas.", ['and', ',', 'i', 'have', 'commas', '.']),
     ]
     for sentence, expected_tokens in expectations:
         tokenized = TrueFalseInstance(sentence, None).words()
         assert tokenized == {'words': expected_tokens}
Ejemplo n.º 5
0
 def read_instance_message(self, instance_message):
     """Build an instance object from ``instance_message``.

     The message's ``type`` field selects the instance class.  Instances
     contained in a ``MULTIPLE_TRUE_FALSE`` message and any
     ``background_instances`` are converted by recursive calls to this method.

     Args:
         instance_message: message exposing ``type`` and, depending on the
             type, ``question``, ``answer_options``, ``passage``,
             ``contained_instances`` and ``background_instances``.

     Returns:
         The constructed instance, wrapped in a ``BackgroundInstance`` when
         the message carries background instances.

     Raises:
         RuntimeError: if ``instance_message.type`` is not one of the four
             handled types.
     """
     # pylint: disable=redefined-variable-type
     instance_type = instance_message.type
     if instance_type == message_pb2.TRUE_FALSE:
         text = instance_message.question
         instance = TrueFalseInstance(text, None, None)
     elif instance_type == message_pb2.MULTIPLE_TRUE_FALSE:
         options = [
             self.read_instance_message(contained)
             for contained in instance_message.contained_instances
         ]
         instance = MultipleTrueFalseInstance(options)
     elif instance_type == message_pb2.QUESTION_ANSWER:
         question = instance_message.question
         options = instance_message.answer_options
         instance = QuestionAnswerInstance(question, options, None, None)
     elif instance_type == message_pb2.CHARACTER_SPAN:
         question = instance_message.question
         passage = instance_message.passage
         instance = CharacterSpanInstance(question, passage, None, None)
     else:
         # str() is required: a non-string type value (e.g. an integer enum)
         # would make the concatenation raise TypeError instead of this error.
         raise RuntimeError("Unrecognized instance type: " + str(instance_type))
     if instance_message.background_instances:
         background_instances = [
             self.read_instance_message(background)
             for background in instance_message.background_instances
         ]
         instance = BackgroundInstance(instance, background_instances)
     return instance
Ejemplo n.º 6
0
    def test_read_from_line_handles_two_column_with_default_false(self):
        """A line built with an index but no label gets ``default_label`` as its label."""
        expected_index, expected_text = 23, "this is a sentence"
        # The label slot is None, so the parser has to fall back on default_label.
        serialized = self.instance_to_line(expected_text, None, expected_index)

        parsed = TrueFalseInstance.read_from_line(serialized, default_label=False)
        assert parsed.text == expected_text
        assert parsed.label is False
        assert parsed.index == expected_index
Ejemplo n.º 7
0
    def test_read_from_line_handles_two_column_with_label(self):
        """A line built with a label but no index parses back to the same text and label."""
        expected_text, expected_label, expected_index = "this is a sentence", True, None
        serialized = self.instance_to_line(expected_text, expected_label, expected_index)

        parsed = TrueFalseInstance.read_from_line(serialized)
        assert parsed.text == expected_text
        assert parsed.label is expected_label
        assert parsed.index == expected_index
Ejemplo n.º 8
0
 def test_get_nearest_neighbors_does_not_crash(self):
     """Smoke test: ``get_nearest_neighbors`` runs on one instance without raising."""
     params = dict(
         corpus_path=self.corpus_path,
         model_serialization_prefix='./',
         max_sentence_length=5,
     )
     network = self.get_model(DifferentiableSearchMemoryNetwork, params)
     # FakeEncoder is installed before the LSH is initialised; keep this order.
     network.encoder_model = FakeEncoder()
     network._initialize_lsh()
     network.max_sentence_length = 5
     network.max_knowledge_length = 2
     query = TrueFalseInstance("this is a sentence", True)
     network.get_nearest_neighbors(query)
Ejemplo n.º 9
0
    def test_fit_word_dictionary_respects_min_count(self):
        """``fit_word_dictionary`` keeps only words seen at least ``min_count`` times.

        The fixture sentence contains "a" four times, "b" twice and "c" three
        times.
        """
        corpus = TextDataset([TrueFalseInstance("a a a a b b c c c", True)])

        # With a threshold of 4 only "a" is frequent enough.
        strict_indexer = DataIndexer()
        strict_indexer.fit_word_dictionary(corpus, min_count=4)
        assert 'a' in strict_indexer.words_in_index()
        for rare_word in ('b', 'c'):
            assert rare_word not in strict_indexer.words_in_index()

        # With a threshold of 1 every word is kept.
        lenient_indexer = DataIndexer()
        lenient_indexer.fit_word_dictionary(corpus, min_count=1)
        for word in ('a', 'b', 'c'):
            assert word in lenient_indexer.words_in_index()
Ejemplo n.º 10
0
 def test_read_from_line_handles_one_column(self):
     """A line holding only the sentence parses with no label and no index."""
     sentence = "this is a sentence"
     parsed = TrueFalseInstance.read_from_line(sentence)
     assert parsed.text == sentence
     assert parsed.label is None
     assert parsed.index is None