def test_read_labeled_background_from_file_loads_correct_instances(self):
    """Check that labeled background rows are attached to the right instances.

    Writes a three-row instance file and a three-row background file under
    ``self.TEST_DIR``, reads both back, and verifies the wrapped instance,
    the label list and the background list of each resulting instance.
    """
    filename = self.TEST_DIR + 'test_dataset_file'
    with open(filename, 'w') as datafile:
        for row in ("1\tinstance1\t0\n", "2\tinstance2\t1\n", "3\tinstance3\n"):
            datafile.write(row)

    background_filename = self.TEST_DIR + 'test_dataset_background'
    with open(background_filename, 'w') as datafile:
        for row in ("1\t2\tb1\tb2\tb3\tb4\n",
                    "2\t1,3\tb1\tb2\tb3\tb4\n",
                    "3\t0\tb5\tb6\tb7\tb8\n"):
            datafile.write(row)

    dataset = TextDataset.read_from_file(filename, TrueFalseInstance)
    background_dataset = TextDataset.read_labeled_background_from_file(
        dataset, background_filename)
    assert len(background_dataset.instances) == 3

    # One row per expected result: (index, text, label, background).
    expectations = [
        (1, "instance1", [2], ['b1', 'b2', 'b3', 'b4']),
        (2, "instance2", [1, 3], ['b1', 'b2', 'b3', 'b4']),
        (3, "instance3", [0], ['b5', 'b6', 'b7', 'b8']),
    ]
    for position, (index, text, label, background) in enumerate(expectations):
        instance = background_dataset.instances[position]
        assert isinstance(instance, LabeledBackgroundInstance)
        assert instance.instance.index == index
        assert instance.instance.text == text
        assert instance.label == label
        assert instance.background == background
def test_indexed_instance_padding(self):
    """Pad an indexed instance and check every dimension of the result.

    After ``pad`` is called, ``answers_indexed`` must be
    options x question tuples x slots x words, and ``background_indexed``
    must be background tuples x slots x words, using the sizes requested
    in the padding dictionary.
    """
    indexer = DataIndexer()
    dataset = TextDataset([self.instance])
    indexer.fit_word_dictionary(dataset)
    indexed = self.instance.to_indexed_instance(indexer)

    padding_lengths = {
        'num_question_tuples': 1,
        'num_background_tuples': 4,
        'num_slots': 3,
        'num_sentence_words': 6,
        'num_options': 4
    }
    indexed.pad(padding_lengths)

    def assert_tuple_shape(indexed_tuple):
        # Every tuple has the requested slot count and every slot the
        # requested word count.
        assert len(indexed_tuple) == padding_lengths['num_slots']
        for slot in indexed_tuple:
            assert len(slot) == padding_lengths['num_sentence_words']

    assert len(indexed.answers_indexed) == padding_lengths['num_options']
    for option_tuples in indexed.answers_indexed:
        assert len(option_tuples) == padding_lengths['num_question_tuples']
        for answer_tuple in option_tuples:
            assert_tuple_shape(answer_tuple)

    assert len(indexed.background_indexed) == padding_lengths['num_background_tuples']
    for background_tuple in indexed.background_indexed:
        assert_tuple_shape(background_tuple)
def test_fit_word_dictionary_respects_min_count(self):
    """Check which words are indexed for two different ``min_count`` values.

    The text contains 'a' four times, 'b' twice and 'c' three times. With
    ``min_count=4`` only 'a' is expected in the index; with ``min_count=1``
    all three words are expected.
    """
    instance = TextClassificationInstance("a a a a b b c c c", True)
    dataset = TextDataset([instance])

    # Each scenario: (min_count, words expected present, words expected absent).
    scenarios = (
        (4, ('a',), ('b', 'c')),
        (1, ('a', 'b', 'c'), ()),
    )
    for min_count, present, absent in scenarios:
        # A fresh indexer per scenario so the fits do not influence each other.
        data_indexer = DataIndexer()
        data_indexer.fit_word_dictionary(dataset, min_count=min_count)
        for word in present:
            assert word in data_indexer.words_in_index()
        for word in absent:
            assert word not in data_indexer.words_in_index()
def test_read_from_file_with_no_default_label(self):
    """Check index, text and label of instances read from a tab-separated file.

    The third row has no label column; its instance is expected to carry
    ``None`` as its label, while the rows labeled 0 and 1 are expected to
    yield ``False`` and ``True``.
    """
    filename = self.TEST_DIR + 'test_dataset_file'
    with open(filename, 'w') as datafile:
        for row in ("1\tinstance1\t0\n", "2\tinstance2\t1\n", "3\tinstance3\n"):
            datafile.write(row)

    dataset = TextDataset.read_from_file(filename, TrueFalseInstance)
    assert len(dataset.instances) == 3

    # One row per expected instance: (index, text, label).
    expectations = (
        (1, "instance1", False),
        (2, "instance2", True),
        (3, "instance3", None),
    )
    for position, (index, text, label) in enumerate(expectations):
        instance = dataset.instances[position]
        assert instance.index == index
        assert instance.text == text
        # Identity check: the label must be exactly False, True or None.
        assert instance.label is label
def score_instance(self, instance):
    """Score a single instance with ``self.solver``.

    The instance is wrapped in a one-element ``TextDataset`` and the result
    of ``self.solver.score_text_dataset`` for that dataset is returned
    unchanged.
    """
    single_instance_dataset = TextDataset([instance])
    return self.solver.score_text_dataset(single_instance_dataset)