def test_prepare_tokenizer_profanity():
    """It should keep censored profanity ("****") as a token."""
    songs = ["ok ok ok **** go go"]
    tokenizer = util.prepare_tokenizer(songs)
    assert len(tokenizer.word_index) == 3
    assert tokenizer.word_index == {"ok": 1, "go": 2, "****": 3}
    sentences = tokenizer.texts_to_sequences(songs)
    assert sentences[0] == [1, 1, 1, 3, 2, 2]

def test_prepare_tokenizer(songs):
    """It should tokenize newlines and include all words."""
    tokenizer = util.prepare_tokenizer(songs)
    assert len(tokenizer.word_index) == 4
    assert tokenizer.word_index == {"\n": 1, "woof": 2, "meow": 3, "chorus": 4}
    sentences = tokenizer.texts_to_sequences(songs)
    # The songs fixture has been carefully crafted, didn't you notice? :-)
    # 0 is reserved, 1 is newline, 2 is woof, 3 is meow, 4 is chorus
    assert sentences[0] == [3, 1, 3]
    assert sentences[1] == [2, 1, 1, 4, 1, 2, 2]

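# The songs fixture is defined elsewhere (e.g. in conftest.py) and is not shown
# in this section. A hypothetical value consistent with the assertions above --
# an assumption for illustration, not necessarily the project's actual fixture
# -- pads newlines with spaces so that "\n" tokenizes as its own word and the
# char-level test below sees exactly 12 distinct characters:
SONGS_FIXTURE_SKETCH = ["meow \n meow", "woof \n \n chorus \n woof woof"]
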
def test_prepare_tokenizer_limit_words(songs):
    """It should only keep the most frequent words when num_words is set."""
    tokenizer = util.prepare_tokenizer(songs, num_words=2)
    # Interestingly, keras keeps track of all words; the num_words limit only
    # kicks in when turning sentences into sequences.
    assert len(tokenizer.word_index) == 4
    sentences = tokenizer.texts_to_sequences(songs)
    # 0 is reserved, 1 is newline, 2 is woof; the other words are over the
    # limit, so they are dropped from the sequences
    assert sentences[0] == [1]
    assert sentences[1] == [2, 1, 1, 1, 2, 2]

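# util.prepare_tokenizer is not shown in this section. A minimal sketch of a
# wrapper consistent with these tests (an assumption about its internals, not
# the project's actual implementation): nothing is filtered, so "\n" and
# "****" survive as tokens, and num_words is shifted by one because keras only
# keeps word indices strictly below num_words and index 0 is reserved.
def prepare_tokenizer_sketch(texts, num_words=None, char_level=False):
    from tensorflow.keras.preprocessing.text import Tokenizer
    keras_num_words = None if num_words is None else num_words + 1
    tokenizer = Tokenizer(num_words=keras_num_words, filters="",
                          lower=True, char_level=char_level)
    tokenizer.fit_on_texts(texts)
    return tokenizer
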
def test_create_embedding_matrix(songs, embedding_mapping):
    """It should create an embedding matrix from the mapping."""
    num_words = 2
    tokenizer = util.prepare_tokenizer(songs, num_words=num_words)
    embedding_matrix = embedding.create_embedding_matrix(
        tokenizer, embedding_mapping, max_num_words=num_words, embedding_dim=3)
    # Only woof is known
    np.testing.assert_array_equal(embedding_matrix, [
        [0, 0, 0],        # OOV
        [0, 0, 0],        # \n
        [0.1, 0.2, 0.3],  # woof
        # [0, 0, 0],      # meow, absent because we only choose 2 words
        # [0, 0, 0],      # chorus, absent for the same reason
    ])

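# embedding.create_embedding_matrix and the embedding_mapping fixture are not
# shown here either. A mapping consistent with the test would be
# {"woof": [0.1, 0.2, 0.3]}, and a minimal sketch of the matrix construction
# (an assumption for illustration, not the project's actual code) is:
def create_embedding_matrix_sketch(tokenizer, mapping, max_num_words, embedding_dim):
    # Row 0 stays all-zero because index 0 is reserved; words above the limit
    # get no row at all, and words without a known vector stay at zero.
    matrix = np.zeros((max_num_words + 1, embedding_dim))
    for word, index in tokenizer.word_index.items():
        if index > max_num_words:
            continue
        vector = mapping.get(word)
        if vector is not None:
            matrix[index] = vector
    return matrix
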
def test_prepare_tokenizer_char_level(songs):
    """It should tokenize at character level."""
    tokenizer = util.prepare_tokenizer(songs, char_level=True)
    # 12 characters: ['\n', ' ', 'c', 'e', 'f', 'h', 'm', 'o', 'r', 's', 'u', 'w']
    assert len(tokenizer.word_index) == 12