def test_indexed_instance_padding(self):
    """Pad an indexed instance and check the size of every nesting level.

    The fixture instance is indexed with a freshly fitted ``DataIndexer``,
    padded with explicit lengths, and then the answer options and the
    background tuples are verified to have exactly the requested sizes.
    """
    indexer = DataIndexer()
    indexer.fit_word_dictionary(TextDataset([self.instance]))
    padded = self.instance.to_indexed_instance(indexer)

    # Target sizes requested from ``pad``.
    question_tuple_count = 1
    background_tuple_count = 4
    slot_count = 3
    words_per_slot = 6
    option_count = 4

    padded.pad({
        'num_question_tuples': question_tuple_count,
        'num_background_tuples': background_tuple_count,
        'num_slots': slot_count,
        'num_sentence_words': words_per_slot,
        'num_options': option_count,
    })

    def check_tuple_shape(indexed_tuple):
        # A tuple must hold ``slot_count`` slots of ``words_per_slot`` entries each.
        assert len(indexed_tuple) == slot_count
        for slot in indexed_tuple:
            assert len(slot) == words_per_slot

    # Answers: options -> question tuples -> slots -> word entries.
    assert len(padded.answers_indexed) == option_count
    for option_tuples in padded.answers_indexed:
        assert len(option_tuples) == question_tuple_count
        for answer_tuple in option_tuples:
            check_tuple_shape(answer_tuple)

    # Background: tuples -> slots -> word entries.
    assert len(padded.background_indexed) == background_tuple_count
    for background_tuple in padded.background_indexed:
        check_tuple_shape(background_tuple)
def test_fit_word_dictionary_respects_min_count(self):
    """Check that ``min_count`` controls which words end up in the index.

    The sample text contains 'a' four times, 'b' twice and 'c' three times.
    With ``min_count=4`` only 'a' is expected in the index; with
    ``min_count=1`` all three words are expected.
    """
    sample = TextClassificationInstance("a a a a b b c c c", True)
    corpus = TextDataset([sample])

    # High threshold: only the word that reaches four occurrences survives.
    strict_indexer = DataIndexer()
    strict_indexer.fit_word_dictionary(corpus, min_count=4)
    assert 'a' in strict_indexer.words_in_index()
    for rare_word in ('b', 'c'):
        assert rare_word not in strict_indexer.words_in_index()

    # Threshold of one: every word in the text is kept.
    lenient_indexer = DataIndexer()
    lenient_indexer.fit_word_dictionary(corpus, min_count=1)
    for word in ('a', 'b', 'c'):
        assert word in lenient_indexer.words_in_index()