def test_indexed_instance_padding(self):
    """pad() should shape the indexed instance to the requested sizes: answers to
    (num_options, num_question_tuples, num_slots, slot_length) and background to
    (num_background_tuples, num_slots, slot_length)."""
    data_indexer = DataIndexer()
    dataset = TextDataset([self.instance])
    # Build the vocabulary from the instance itself so indexing succeeds.
    data_indexer.fit_word_dictionary(dataset)
    indexed = self.instance.to_indexed_instance(data_indexer)
    num_question_tuples = 1
    num_background_tuples = 4
    num_slots = 3
    slot_length = 6
    num_options = 4
    padding_lengths = {'num_question_tuples': num_question_tuples,
                       'num_background_tuples': num_background_tuples,
                       'num_slots': num_slots,
                       'num_sentence_words': slot_length,
                       'num_options': num_options}
    indexed.pad(padding_lengths)
    # Answers: one entry per option; each option holds question tuples, each tuple
    # holds slots, and each slot is padded to slot_length word ids.
    assert len(indexed.answers_indexed) == num_options
    for answer_option_tuples in indexed.answers_indexed:
        assert len(answer_option_tuples) == num_question_tuples
        for ans_tuple in answer_option_tuples:
            assert len(ans_tuple) == num_slots
            for slot in ans_tuple:
                assert len(slot) == slot_length
    # Background: a flat list of tuples with the same slot/word padding.
    assert len(indexed.background_indexed) == num_background_tuples
    for background_tuple in indexed.background_indexed:
        assert len(background_tuple) == num_slots
        for slot in background_tuple:
            assert len(slot) == slot_length
def test_to_indexed_instance_converts_correctly(self):
    """Indexing should map question and passage tokens to ids (lowercased, per the
    assertions below), append the stop token to the passage, and convert the
    character span (9, 13) ("cats") into token-level offsets -- here (2, 3)."""
    instance = CharacterSpanInstance("What do dogs eat?", "Dogs eat cats.", (9, 13))
    data_indexer = DataIndexer()
    what_index = data_indexer.add_word_to_index("what")
    do_index = data_indexer.add_word_to_index("do")
    dogs_index = data_indexer.add_word_to_index("dogs")
    eat_index = data_indexer.add_word_to_index("eat")
    cats_index = data_indexer.add_word_to_index("cats")
    period_index = data_indexer.add_word_to_index(".")
    question_index = data_indexer.add_word_to_index("?")
    stop_index = data_indexer.add_word_to_index(CharacterSpanInstance.stop_token)
    indexed_instance = instance.to_indexed_instance(data_indexer)
    assert indexed_instance.question_indices == [what_index, do_index, dogs_index,
                                                 eat_index, question_index]
    assert indexed_instance.passage_indices == [dogs_index, eat_index, cats_index,
                                                period_index, stop_index]
    assert indexed_instance.label == (2, 3)
    # I put this test in here, instead of its own `test_as_training_data` test, to be sure that
    # the conversion to IndexedCharacterSpanIndex was performed correctly.
    indexed_instance.pad({'num_question_words': 3, 'num_passage_words': 6})
    (question_array, passage_array), label = indexed_instance.as_training_data()
    # The label becomes a pair of one-hot vectors over passage positions (start, end).
    assert isinstance(label, tuple)
    assert numpy.all(label[0] == numpy.asarray([0, 0, 1, 0, 0, 0]))
    assert numpy.all(label[1] == numpy.asarray([0, 0, 0, 1, 0, 0]))
    # The 5-word question truncated to 3 keeps the last 3 tokens; the passage is
    # right-padded with zeros to 6.
    assert numpy.all(question_array == numpy.asarray([dogs_index, eat_index,
                                                      question_index]))
    assert numpy.all(passage_array == numpy.asarray([dogs_index, eat_index, cats_index,
                                                     period_index, stop_index, 0]))
def setUp(self):
    """Register the word- and character-namespace vocabulary ids used by the
    sentence-instance tests.

    NOTE(review): this looks like a stray duplicate of the setUp defined inside
    TestSentenceInstance later in this file -- confirm which copy is live.
    """
    super(TestSentenceInstance, self).setUp()
    self.data_indexer = DataIndexer()
    # Ids in the default (word) namespace.
    self.this_index = self.data_indexer.add_word_to_index("this")
    self.is_index = self.data_indexer.add_word_to_index("is")
    self.a_index = self.data_indexer.add_word_to_index("a")
    self.sentence_index = self.data_indexer.add_word_to_index("sentence")
    self.start_index = self.data_indexer.add_word_to_index("<S>")
    self.end_index = self.data_indexer.add_word_to_index("</S>")
    self.space_index = self.data_indexer.add_word_to_index(' ')
    self.c_index = self.data_indexer.add_word_to_index('c')
    self.e_index = self.data_indexer.add_word_to_index('e')
    self.h_index = self.data_indexer.add_word_to_index('h')
    self.i_index = self.data_indexer.add_word_to_index('i')
    self.n_index = self.data_indexer.add_word_to_index('n')
    self.s_index = self.data_indexer.add_word_to_index('s')
    self.t_index = self.data_indexer.add_word_to_index('t')
    # Separate ids in the 'characters' namespace, for the
    # words-and-characters tokenizer.
    self.a_char_index = self.data_indexer.add_word_to_index('a', namespace='characters')
    self.c_char_index = self.data_indexer.add_word_to_index('c', namespace='characters')
    self.e_char_index = self.data_indexer.add_word_to_index('e', namespace='characters')
    self.h_char_index = self.data_indexer.add_word_to_index('h', namespace='characters')
    self.i_char_index = self.data_indexer.add_word_to_index('i', namespace='characters')
    self.n_char_index = self.data_indexer.add_word_to_index('n', namespace='characters')
    self.s_char_index = self.data_indexer.add_word_to_index('s', namespace='characters')
    self.t_char_index = self.data_indexer.add_word_to_index('t', namespace='characters')
def test_get_embedding_layer_actually_initializes_word_vectors_correctly(self):
    """A vocabulary word present in the pretrained file gets exactly its file vector."""
    indexer = DataIndexer()
    indexer.add_word_to_index("word")
    embeddings_path = self.TEST_DIR + "embeddings.gz"
    # Write a single pretrained vector for "word".
    with gzip.open(embeddings_path, 'wb') as out_file:
        out_file.write("word 1.0 2.3 -1.0\n".encode('utf-8'))
    layer = PretrainedEmbeddings.get_embedding_layer(embeddings_path, indexer)
    vector = layer._initial_weights[0][indexer.get_word_index("word")]
    assert numpy.allclose(vector, numpy.asarray([1.0, 2.3, -1.0]))
def test_get_embedding_layer_initializes_unseen_words_randomly_not_zero(self):
    """A vocabulary word absent from the pretrained file must not get a zero vector."""
    indexer = DataIndexer()
    indexer.add_word_to_index("word2")
    embeddings_path = self.TEST_DIR + "embeddings.gz"
    # The file only covers "word", so "word2" must be initialized some other way.
    with gzip.open(embeddings_path, 'wb') as out_file:
        out_file.write("word 1.0 2.3 -1.0\n".encode('utf-8'))
    layer = PretrainedEmbeddings.get_embedding_layer(embeddings_path, indexer)
    unseen_vector = layer._initial_weights[0][indexer.get_word_index("word2")]
    assert not numpy.allclose(unseen_vector, numpy.asarray([0.0, 0.0, 0.0]))
def test_to_indexed_instance_converts_correctly(self):
    """Known words map to their ids; unseen words map to the OOV id."""
    data_indexer = DataIndexer()
    index_of_a = data_indexer.add_word_to_index("a")
    index_of_d = data_indexer.add_word_to_index("d")
    unknown_index = data_indexer.get_word_index(data_indexer._oov_token)  # pylint: disable=protected-access
    instance = SnliInstance("a b", "d e f", "entails")
    indexed = instance.to_indexed_instance(data_indexer)
    assert indexed.first_sentence_indices == [index_of_a, unknown_index]
    assert indexed.second_sentence_indices == [index_of_d, unknown_index, unknown_index]
    assert indexed.label == instance.label
def test_to_indexed_instance_converts_correctly(self):
    """The indexed form should depend on the globally-installed tokenizer: word
    ids only, character ids only, or -- for 'words and characters' -- each word
    as [word id, character ids...]."""
    data_indexer = DataIndexer()
    a_word_index = data_indexer.add_word_to_index("a", namespace='words')
    sentence_index = data_indexer.add_word_to_index("sentence", namespace='words')
    capital_a_index = data_indexer.add_word_to_index("A", namespace='characters')
    space_index = data_indexer.add_word_to_index(" ", namespace='characters')
    a_index = data_indexer.add_word_to_index("a", namespace='characters')
    s_index = data_indexer.add_word_to_index("s", namespace='characters')
    e_index = data_indexer.add_word_to_index("e", namespace='characters')
    n_index = data_indexer.add_word_to_index("n", namespace='characters')
    t_index = data_indexer.add_word_to_index("t", namespace='characters')
    c_index = data_indexer.add_word_to_index("c", namespace='characters')
    # Default word tokenizer: lowercased word ids (per the assertion below).
    instance = TrueFalseInstance("A sentence", None).to_indexed_instance(data_indexer)
    assert instance.word_indices == [a_word_index, sentence_index]
    # Character tokenizer: one id per character, case preserved ("A").
    TextInstance.tokenizer = tokenizers['characters']({})
    instance = TrueFalseInstance("A sentence", None).to_indexed_instance(data_indexer)
    assert instance.word_indices == [capital_a_index, space_index, s_index, e_index,
                                     n_index, t_index, e_index, n_index, c_index,
                                     e_index]
    # Words-and-characters tokenizer: [word id, char ids...] per word.
    TextInstance.tokenizer = tokenizers['words and characters']({})
    instance = TrueFalseInstance("A sentence", None).to_indexed_instance(data_indexer)
    assert instance.word_indices == [[a_word_index, a_index],
                                     [sentence_index, s_index, e_index, n_index,
                                      t_index, e_index, n_index, c_index, e_index]]
    # Restore the default tokenizer so later tests are unaffected.
    TextInstance.tokenizer = tokenizers['words']({})
def test_unknown_token(self):  # pylint: disable=protected-access
    """The OOV token sits at index 1, and every unseen word resolves to it.

    This behavior is documented as a test because solver code depends in a small
    way on how the unknown token is treated; any breaking change here should
    fail loudly so it gets more consideration.
    """
    data_indexer = DataIndexer()
    oov_index = data_indexer.get_word_index(data_indexer._oov_token)
    assert oov_index == 1
    assert data_indexer.get_word_index("unseen word") == oov_index
def test_to_indexed_instance_converts_correctly(self):
    """Question and option tokens map to known ids; unseen words map to the OOV id."""
    data_indexer = DataIndexer()
    index_of_a = data_indexer.add_word_to_index("a")
    index_of_d = data_indexer.add_word_to_index("d")
    unknown_index = data_indexer.get_word_index(data_indexer._oov_token)  # pylint: disable=protected-access
    instance = QuestionAnswerInstance("a A b", ["d", "e f D"], 1)
    indexed = instance.to_indexed_instance(data_indexer)
    # "a A b" indexes as [a, a, <oov>] -- case differences collapse to one id.
    assert indexed.question_indices == [index_of_a, index_of_a, unknown_index]
    assert len(indexed.option_indices) == 2
    assert indexed.option_indices[0] == [index_of_d]
    assert indexed.option_indices[1] == [unknown_index, unknown_index, index_of_d]
def test_get_embedding_layer_skips_inconsistent_lines(self):
    """A pretrained line whose dimensionality disagrees with the first line should
    be skipped, so the affected word falls back to non-file initialization."""
    data_indexer = DataIndexer()
    data_indexer.add_word_to_index("word1")
    data_indexer.add_word_to_index("word2")
    embeddings_filename = self.TEST_DIR + "embeddings.gz"
    with gzip.open(embeddings_filename, 'wb') as embeddings_file:
        embeddings_file.write("word1 1.0 2.3 -1.0\n".encode('utf-8'))
        # Three dimensions declared by the first line, but only two values here,
        # so this line should be dropped.
        embeddings_file.write("word2 0.1 0.4 \n".encode('utf-8'))
    embedding_layer = PretrainedEmbeddings.get_embedding_layer(embeddings_filename,
                                                               data_indexer)
    # Consistency fix: read `_initial_weights` like the other embedding tests in
    # this file do; the original used `initial_weights` (no underscore), which
    # is not the attribute its siblings read.
    word_vector = embedding_layer._initial_weights[0][data_indexer.get_word_index("word2")]
    # If the bad line had been used, the first two components would be (0.1, 0.4).
    assert not numpy.allclose(word_vector[:2], numpy.asarray([0.1, 0.4]))
def test_get_embedding_layer_crashes_when_embedding_dim_is_one(self):
    """A header-style first line ("dimensionality 3") parses as a 1-dimensional
    embedding, which the loader must reject."""
    data_indexer = DataIndexer()
    embeddings_path = self.TEST_DIR + "embeddings.gz"
    lines = ["dimensionality 3\n",
             "word1 1.0 2.3 -1.0\n",
             "word2 0.1 0.4 -4.0\n"]
    with gzip.open(embeddings_path, 'wb') as out_file:
        for line in lines:
            out_file.write(line.encode('utf-8'))
    with pytest.raises(Exception):
        PretrainedEmbeddings.get_embedding_layer(embeddings_path, data_indexer)
def test_works_with_word_and_character_tokenizer(self):
    """With the word-and-character tokenizer, each padded slot holds a word id
    followed by character ids (padded/truncated to num_word_characters), and
    pad() applies all the tuple/slot sizes at once."""
    answer_options_simple = ("a<>a sentence<><>")
    background_simple = ("a<>a sentence<><>")
    # Dataset line format: answers <TAB> background <TAB> label.
    line_simple = "\t".join(str(x) for x in [answer_options_simple,
                                             background_simple, "0"])
    TextInstance.tokenizer = WordAndCharacterTokenizer(Params({}))
    data_indexer = DataIndexer()
    a_word_index = data_indexer.add_word_to_index("a", namespace='words')
    sentence_index = data_indexer.add_word_to_index("sentence", namespace='words')
    a_index = data_indexer.add_word_to_index("a", namespace='characters')
    s_index = data_indexer.add_word_to_index("s", namespace='characters')
    e_index = data_indexer.add_word_to_index("e", namespace='characters')
    new_instance = TupleInferenceInstance.read_from_line(line_simple)
    indexed = new_instance.to_indexed_instance(data_indexer)
    padding_lengths = {'num_question_tuples': 1,
                       'num_background_tuples': 1,
                       'num_slots': 2,
                       'num_sentence_words': 2,
                       'num_options': 1,
                       'num_word_characters': 3}
    indexed.pad(padding_lengths)
    # "a" pads its characters to [a_word, a_char, 0]; "sentence" truncates to
    # [sentence_word, s_char, e_char]; all-zero rows are padding.
    expected_indexed_tuple = [[[0, 0, 0], [a_word_index, a_index, 0]],
                              [[a_word_index, a_index, 0],
                               [sentence_index, s_index, e_index]]]
    expected_answers_indexed = numpy.asarray([expected_indexed_tuple])
    expected_background_indexed = numpy.asarray(expected_indexed_tuple)
    assert numpy.all(indexed.answers_indexed == expected_answers_indexed)
    assert numpy.all(indexed.background_indexed == expected_background_indexed)
    # Restore the default word tokenizer for subsequent tests.
    TextInstance.tokenizer = tokenizers['words'](Params({}))
def test_get_embedding_layer_uses_correct_embedding_dim(self):
    """The layer's output_dim must match the vector length found in the file."""
    data_indexer = DataIndexer()
    embeddings_path = self.TEST_DIR + "embeddings.gz"

    def write_embeddings(lines):
        # Overwrite the embeddings file with the given pretrained lines.
        with gzip.open(embeddings_path, 'wb') as out_file:
            for line in lines:
                out_file.write(line.encode('utf-8'))

    write_embeddings(["word1 1.0 2.3 -1.0\n",
                      "word2 0.1 0.4 -4.0\n"])
    layer = PretrainedEmbeddings.get_embedding_layer(embeddings_path, data_indexer)
    assert layer.output_dim == 3
    write_embeddings(["word1 1.0 2.3 -1.0 3.1\n",
                      "word2 0.1 0.4 -4.0 -1.2\n"])
    layer = PretrainedEmbeddings.get_embedding_layer(embeddings_path, data_indexer)
    assert layer.output_dim == 4
def test_to_indexed_instance_converts_correctly(self):
    """Tagging instances should index words in the default namespace and produce
    one-hot labels from ids in the "tags" namespace."""
    data_indexer = DataIndexer()
    cats_index = data_indexer.add_word_to_index("cats")
    are_index = data_indexer.add_word_to_index("are")
    animals_index = data_indexer.add_word_to_index("animals")
    period_index = data_indexer.add_word_to_index(".")
    n_tag_index = data_indexer.add_word_to_index("N", namespace="tags")
    v_tag_index = data_indexer.add_word_to_index("V", namespace="tags")
    period_tag_index = data_indexer.add_word_to_index(".", namespace="tags")
    indexed_instance = self.instance.to_indexed_instance(data_indexer)
    expected_indices = [cats_index, are_index, animals_index, period_index]
    assert indexed_instance.text_indices == expected_indices
    # The "- 2" shifts tag ids past two reserved entries (presumably padding and
    # OOV -- confirm against DataIndexer) so the three tags fill a 3-way one-hot.
    expected_label = [self.one_hot(n_tag_index - 2, 3),
                      self.one_hot(v_tag_index - 2, 3),
                      self.one_hot(n_tag_index - 2, 3),
                      self.one_hot(period_tag_index - 2, 3)]
    assert_array_almost_equal(indexed_instance.label, expected_label)
    train_inputs, train_labels = indexed_instance.as_training_data()
    assert_array_almost_equal(train_labels, expected_label)
    assert_array_almost_equal(train_inputs, expected_indices)
def test_indexed_instance_padding(self):
    """pad() must shape answers_indexed to (options, graphlets, alignments, features)."""
    indexed = self.instance.to_indexed_instance(DataIndexer())
    expected_shape = {'num_graphlets': 1,
                      'num_alignments': 2,
                      'num_features': 5,
                      'num_options': 4}
    indexed.pad(expected_shape)
    assert len(indexed.answers_indexed) == expected_shape['num_options']
    for option in indexed.answers_indexed:
        assert len(option) == expected_shape['num_graphlets']
        for graphlet in option:
            assert len(graphlet) == expected_shape['num_alignments']
            for alignment in graphlet:
                assert len(alignment) == expected_shape['num_features']
def test_to_indexed_instance_converts_correctly(self):
    """Question, passage, and answer options should all be indexed (lowercased,
    per the assertions below), and the label should pass through unchanged."""
    instance = McQuestionAnswerInstance("Cats from Nevada are eaten by dogs in XXX .",
                                        "Dogs eat cats from Nevada in Washington .",
                                        ["Nevada", "Washington"],
                                        1)
    data_indexer = DataIndexer()
    cats_index = data_indexer.add_word_to_index("cats")
    are_index = data_indexer.add_word_to_index("are")
    eaten_index = data_indexer.add_word_to_index("eaten")
    by_index = data_indexer.add_word_to_index("by")
    dogs_index = data_indexer.add_word_to_index("dogs")
    in_index = data_indexer.add_word_to_index("in")
    XXX_index = data_indexer.add_word_to_index("xxx")
    period_index = data_indexer.add_word_to_index(".")
    eat_index = data_indexer.add_word_to_index("eat")
    from_index = data_indexer.add_word_to_index("from")
    nevada_index = data_indexer.add_word_to_index("nevada")
    washington_index = data_indexer.add_word_to_index("washington")
    indexed_instance = instance.to_indexed_instance(data_indexer)
    assert indexed_instance.question_indices == [cats_index, from_index, nevada_index,
                                                 are_index, eaten_index, by_index,
                                                 dogs_index, in_index, XXX_index,
                                                 period_index]
    assert indexed_instance.passage_indices == [dogs_index, eat_index, cats_index,
                                                from_index, nevada_index, in_index,
                                                washington_index, period_index]
    assert len(indexed_instance.option_indices) == 2
    assert indexed_instance.option_indices[0] == [nevada_index]
    assert indexed_instance.option_indices[1] == [washington_index]
    assert indexed_instance.label == 1
class TestSentenceInstance(DeepQaTestCase):
    """Tests for SentenceInstance: reading instances from dataset lines and
    converting them to (word, label) training arrays under the word, character,
    and words-and-characters tokenizers."""

    def setUp(self):
        # Pre-register every word and character id the assertions below rely on.
        super(TestSentenceInstance, self).setUp()
        self.data_indexer = DataIndexer()
        self.this_index = self.data_indexer.add_word_to_index("this")
        self.is_index = self.data_indexer.add_word_to_index("is")
        self.a_index = self.data_indexer.add_word_to_index("a")
        self.sentence_index = self.data_indexer.add_word_to_index("sentence")
        self.start_index = self.data_indexer.add_word_to_index("<S>")
        self.end_index = self.data_indexer.add_word_to_index("</S>")
        self.space_index = self.data_indexer.add_word_to_index(' ')
        self.c_index = self.data_indexer.add_word_to_index('c')
        self.e_index = self.data_indexer.add_word_to_index('e')
        self.h_index = self.data_indexer.add_word_to_index('h')
        self.i_index = self.data_indexer.add_word_to_index('i')
        self.n_index = self.data_indexer.add_word_to_index('n')
        self.s_index = self.data_indexer.add_word_to_index('s')
        self.t_index = self.data_indexer.add_word_to_index('t')
        # Separate ids in the 'characters' namespace, used by the
        # words-and-characters tokenizer test.
        self.a_char_index = self.data_indexer.add_word_to_index('a', namespace='characters')
        self.c_char_index = self.data_indexer.add_word_to_index('c', namespace='characters')
        self.e_char_index = self.data_indexer.add_word_to_index('e', namespace='characters')
        self.h_char_index = self.data_indexer.add_word_to_index('h', namespace='characters')
        self.i_char_index = self.data_indexer.add_word_to_index('i', namespace='characters')
        self.n_char_index = self.data_indexer.add_word_to_index('n', namespace='characters')
        self.s_char_index = self.data_indexer.add_word_to_index('s', namespace='characters')
        self.t_char_index = self.data_indexer.add_word_to_index('t', namespace='characters')

    def tearDown(self):
        # Some tests replace the global tokenizer; always restore the default.
        super(TestSentenceInstance, self).tearDown()
        TextInstance.tokenizer = tokenizers['words'](Params({}))

    @staticmethod
    def instance_to_line(text, index=None):
        # Build a dataset line: optional "<index>\t" prefix followed by the text.
        index_str = '' if index is None else str(index) + '\t'
        return index_str + text

    def test_read_from_line_handles_one_column(self):
        # A line with no index column is all text; label and index stay None.
        text = "this is a sentence"
        instance = SentenceInstance.read_from_line(text)
        assert instance.text == text
        assert instance.label is None
        assert instance.index is None

    def test_read_from_line_handles_two_column(self):
        # A "<index>\t<text>" line populates the index; the label stays None.
        index = 23
        text = "this is a sentence"
        line = self.instance_to_line(text, index)
        instance = SentenceInstance.read_from_line(line)
        assert instance.text == text
        assert instance.index == index
        assert instance.label is None

    def test_end_to_end_conversion_to_arrays(self):
        instance = SentenceInstance("this is a sentence")
        indexed_instance = instance.to_indexed_instance(self.data_indexer)
        indexed_instance.pad({'num_sentence_words': 7})
        word_array, label_array = indexed_instance.as_training_data()
        # Input is left-padded with zeros and begins with <S>; the label is the
        # input shifted by one position, ending with </S>.
        assert_array_equal(word_array, [0, 0, self.start_index, self.this_index,
                                        self.is_index, self.a_index,
                                        self.sentence_index])
        assert_array_equal(label_array, [[0], [0], [self.this_index], [self.is_index],
                                         [self.a_index], [self.sentence_index],
                                         [self.end_index]])

    def test_end_to_end_conversion_to_arrays_with_character_tokenizer(self):
        # Same shifted-by-one structure, but with one id per character.
        TextInstance.tokenizer = tokenizers['characters'](Params({}))
        instance = SentenceInstance("a sentence")
        indexed_instance = instance.to_indexed_instance(self.data_indexer)
        indexed_instance.pad({'num_sentence_words': 13})
        word_array, label_array = indexed_instance.as_training_data()
        assert_array_equal(word_array, [0, 0, self.start_index, self.a_index,
                                        self.space_index, self.s_index, self.e_index,
                                        self.n_index, self.t_index, self.e_index,
                                        self.n_index, self.c_index, self.e_index])
        assert_array_equal(label_array, [[0], [0], [self.a_index], [self.space_index],
                                         [self.s_index], [self.e_index], [self.n_index],
                                         [self.t_index], [self.e_index], [self.n_index],
                                         [self.c_index], [self.e_index],
                                         [self.end_index]])

    def test_end_to_end_conversion_to_arrays_with_word_and_character_tokenizer(self):
        TextInstance.tokenizer = tokenizers['words and characters'](Params({}))
        instance = SentenceInstance("this is a sentence")
        indexed_instance = instance.to_indexed_instance(self.data_indexer)
        indexed_instance.pad({'num_sentence_words': 6, 'num_word_characters': 5})
        word_array, label_array = indexed_instance.as_training_data()
        # Each input row is [word id, char ids...] padded/truncated to 5 entries;
        # the label stays word-level.
        assert_array_equal(word_array, [[0, 0, 0, 0, 0],
                                        [self.start_index, 0, 0, 0, 0],
                                        [self.this_index, self.t_char_index,
                                         self.h_char_index, self.i_char_index,
                                         self.s_char_index],
                                        [self.is_index, self.i_char_index,
                                         self.s_char_index, 0, 0],
                                        [self.a_index, self.a_char_index, 0, 0, 0],
                                        [self.sentence_index, self.s_char_index,
                                         self.e_char_index, self.n_char_index,
                                         self.t_char_index],
                                        ])
        assert_array_equal(label_array, [[0], [self.this_index], [self.is_index],
                                         [self.a_index], [self.sentence_index],
                                         [self.end_index]])
def test_set_from_file(self):  # pylint: disable=protected-access
    """set_from_file should assign indices in file order starting at 1 (index 0
    stays reserved for the padding token) and treat the given oov_token's line
    as the OOV index."""
    vocab_filename = self.TEST_DIR + 'vocab_file'
    with codecs.open(vocab_filename, 'w', 'utf-8') as vocab_file:
        vocab_file.write('<S>\n')
        vocab_file.write('</S>\n')
        vocab_file.write('<UNK>\n')
        vocab_file.write('a\n')
        vocab_file.write('word\n')
        vocab_file.write('another\n')
    data_indexer = DataIndexer()
    data_indexer.set_from_file(vocab_filename, oov_token="<UNK>")
    assert data_indexer._oov_token == "<UNK>"
    # Unseen words resolve to the <UNK> line's index (3).
    assert data_indexer.get_word_index("random string") == 3
    assert data_indexer.get_word_index("<S>") == 1
    assert data_indexer.get_word_index("</S>") == 2
    assert data_indexer.get_word_index("<UNK>") == 3
    assert data_indexer.get_word_index("a") == 4
    assert data_indexer.get_word_index("word") == 5
    assert data_indexer.get_word_index("another") == 6
    # The reverse mapping mirrors the file order, with padding at 0.
    assert data_indexer.get_word_from_index(0) == data_indexer._padding_token
    assert data_indexer.get_word_from_index(1) == "<S>"
    assert data_indexer.get_word_from_index(2) == "</S>"
    assert data_indexer.get_word_from_index(3) == "<UNK>"
    assert data_indexer.get_word_from_index(4) == "a"
    assert data_indexer.get_word_from_index(5) == "word"
    assert data_indexer.get_word_from_index(6) == "another"
def test_add_word_to_index_gives_consistent_results(self):
    """Adding the same word twice is idempotent: same index, same vocab size."""
    indexer = DataIndexer()
    size_before = indexer.get_vocab_size()
    first_index = indexer.add_word_to_index("word")
    for repetition in range(2):
        if repetition:
            # Re-adding the word must change nothing below.
            indexer.add_word_to_index("word")
        assert "word" in indexer.words_in_index()
        assert indexer.get_word_index("word") == first_index
        assert indexer.get_word_from_index(first_index) == "word"
        assert indexer.get_vocab_size() == size_before + 1
def test_to_indexed_instance_converts_correctly(self):
    """The question and each candidate sentence should be indexed independently;
    the label (index of the correct sentence) passes through unchanged."""
    instance = SentenceSelectionInstance("What do dogs eat?",
                                         ["Dogs eat cats.",
                                          "Dogs play with cats.",
                                          "Dogs enjoy cats."],
                                         0)
    data_indexer = DataIndexer()
    what_index = data_indexer.add_word_to_index("what")
    do_index = data_indexer.add_word_to_index("do")
    dogs_index = data_indexer.add_word_to_index("dogs")
    eat_index = data_indexer.add_word_to_index("eat")
    cats_index = data_indexer.add_word_to_index("cats")
    period_index = data_indexer.add_word_to_index(".")
    question_index = data_indexer.add_word_to_index("?")
    play_index = data_indexer.add_word_to_index("play")
    with_index = data_indexer.add_word_to_index("with")
    enjoy_index = data_indexer.add_word_to_index("enjoy")
    indexed_instance = instance.to_indexed_instance(data_indexer)
    assert indexed_instance.question_indices == [what_index, do_index, dogs_index,
                                                 eat_index, question_index]
    assert indexed_instance.sentences_indices == [[dogs_index, eat_index, cats_index,
                                                   period_index],
                                                  [dogs_index, play_index, with_index,
                                                   cats_index, period_index],
                                                  [dogs_index, enjoy_index, cats_index,
                                                   period_index]]
    assert indexed_instance.label == 0
def test_namespaces(self):
    """Vocabularies are tracked per namespace: a word added under namespace '1'
    is independent of the same word under namespace '2', and vocab sizes are
    counted separately."""
    data_indexer = DataIndexer()
    initial_vocab_size = data_indexer.get_vocab_size()
    word_index = data_indexer.add_word_to_index("word", namespace='1')
    assert "word" in data_indexer.words_in_index(namespace='1')
    assert data_indexer.get_word_index("word", namespace='1') == word_index
    assert data_indexer.get_word_from_index(word_index, namespace='1') == "word"
    assert data_indexer.get_vocab_size(namespace='1') == initial_vocab_size + 1
    # Now add it again, in a different namespace and a different word, and make sure it's like
    # new.
    word2_index = data_indexer.add_word_to_index("word2", namespace='2')
    word_index = data_indexer.add_word_to_index("word", namespace='2')
    assert "word" in data_indexer.words_in_index(namespace='2')
    assert "word2" in data_indexer.words_in_index(namespace='2')
    assert data_indexer.get_word_index("word", namespace='2') == word_index
    assert data_indexer.get_word_index("word2", namespace='2') == word2_index
    assert data_indexer.get_word_from_index(word_index, namespace='2') == "word"
    assert data_indexer.get_word_from_index(word2_index, namespace='2') == "word2"
    assert data_indexer.get_vocab_size(namespace='2') == initial_vocab_size + 2
def test_fit_word_dictionary_respects_min_count(self):
    """min_count filters out words below the threshold; min_count=1 keeps all."""
    instance = TextClassificationInstance("a a a a b b c c c", True)
    dataset = TextDataset([instance])
    # 'a' occurs 4 times, 'b' twice, 'c' three times.
    high_count_indexer = DataIndexer()
    high_count_indexer.fit_word_dictionary(dataset, min_count=4)
    assert 'a' in high_count_indexer.words_in_index()
    assert 'b' not in high_count_indexer.words_in_index()
    assert 'c' not in high_count_indexer.words_in_index()
    low_count_indexer = DataIndexer()
    low_count_indexer.fit_word_dictionary(dataset, min_count=1)
    for word in ('a', 'b', 'c'):
        assert word in low_count_indexer.words_in_index()