class TestVocabulary(unittest.TestCase):
    """Unit tests for the Vocabulary token<->index mapping class."""

    def setUp(self):
        """Build a three-token vocabulary plus a matching token/index sequence pair."""
        self.tokens = ['a', 'b', 'c']
        self.vocab = Vocabulary(self.tokens)
        a, b, c = (self.vocab.tok2idx[tok] for tok in self.tokens[-3:])
        self.tok_seq = ['a', 'a', 'b', 'c']
        self.idx_seq = [a, a, b, c]

    def test_init(self):
        """Construction exposes size, idx2tok list, and first-seen tok2idx mapping."""
        self.assertEqual(self.vocab.size, 3)
        self.assertListEqual(self.vocab.idx2tok, ['a', 'b', 'c'])
        # defaultdict() with no default_factory behaves exactly like dict here,
        # so build the expected mapping with a plain dict.
        tok2idx = {}
        for tok in self.tokens:
            tok2idx.setdefault(tok, len(tok2idx))
        self.assertDictEqual(self.vocab.tok2idx, tok2idx)

    def test_to_idx(self):
        """Token sequences map to the index sequence fixed in setUp."""
        self.assertEqual(self.vocab.to_idx(self.tok_seq), self.idx_seq)

    def test_to_text(self):
        """Index sequences render as a joined string, or a token list when sep=None."""
        self.assertEqual(self.vocab.to_text(self.idx_seq), ' '.join(self.tok_seq))
        self.assertEqual(self.vocab.to_text(self.idx_seq, sep=None), self.tok_seq)

    def test_save(self):
        """Round-trip through save/load preserves the token list."""
        import os
        path = 'tests/vocab.pkl'
        self.vocab.save(path)
        # Remove the pickle artifact even if the assertions below fail.
        self.addCleanup(os.remove, path)
        loaded_vocab = Vocabulary.load(path)
        self.assertListEqual(self.vocab.idx2tok, loaded_vocab.idx2tok)

    def test_from_tokens(self):
        """from_tokens builds the same token set as direct construction."""
        from_tokens_vocab = Vocabulary.from_tokens(self.tokens)
        self.assertCountEqual(self.vocab.idx2tok, from_tokens_vocab.idx2tok)

    def test_from_sequences(self):
        """from_sequences flattens nested sequences into the same token set."""
        from_sequences_vocab = Vocabulary.from_sequences([self.tokens])
        self.assertCountEqual(self.vocab.idx2tok, from_sequences_vocab.idx2tok)

    def test_special_tokens(self):
        """Special tokens are prepended; a token already in SPECIAL_TOKENS is not duplicated."""
        tokens = ['a', 'b', 'c', SPECIAL_TOKENS[1]]
        vocab = Vocabulary(tokens, use_special_tokens=True)
        self.assertEqual(vocab.size, len(SPECIAL_TOKENS) + 3)
        self.assertListEqual(vocab.idx2tok, SPECIAL_TOKENS + ['a', 'b', 'c'])
        # Plain dict + setdefault keeps the first index assigned to each token.
        tok2idx = {}
        for tok in SPECIAL_TOKENS + tokens:
            tok2idx.setdefault(tok, len(tok2idx))
        self.assertDictEqual(vocab.tok2idx, tok2idx)
def test_special_tokens(self):
    """Special tokens are prepended and an input token that is already special is ignored."""
    tokens = ['a', 'b', 'c', SPECIAL_TOKENS[1]]
    vocab = Vocabulary(tokens, use_special_tokens=True)
    self.assertEqual(vocab.size, len(SPECIAL_TOKENS) + 3)
    self.assertListEqual(vocab.idx2tok, SPECIAL_TOKENS + ['a', 'b', 'c'])
    # Expected mapping: first occurrence of each token gets the next index.
    expected = defaultdict()
    for tok in SPECIAL_TOKENS + tokens:
        if tok not in expected:
            expected[tok] = len(expected)
    self.assertDictEqual(vocab.tok2idx, expected)
def test_with_special_tokens(self):
    """Count matrix stays correct after swapping in a special-token vocabulary."""
    vec = CountVectorizer(max_doc_freq=2, min_freq=1, max_features=1)
    vec.fit(self.docs)
    vec.vocab = Vocabulary(vec.vocab.idx2tok, use_special_tokens=True)
    _, X = vec.transform(self.docs)
    expected = np.asarray([[0], [2], [0]])
    npt.assert_array_equal(X.A, expected)
def test_from_sequences(self):
    """Building from a list of sequences yields the same token set as setUp's vocab."""
    built = Vocabulary.from_sequences([self.tokens])
    self.assertCountEqual(built.idx2tok, self.vocab.idx2tok)
def test_from_tokens(self):
    """Building from a flat token list yields the same token set as setUp's vocab."""
    built = Vocabulary.from_tokens(self.tokens)
    self.assertCountEqual(built.idx2tok, self.vocab.idx2tok)
def test_save(self):
    """Round-trip the vocabulary through save/load and compare token lists.

    Fix: the original left ``tests/vocab.pkl`` on disk after every run;
    register a cleanup immediately after the file is written so it is
    removed even when a later assertion fails.
    """
    import os
    path = 'tests/vocab.pkl'
    self.vocab.save(path)
    self.addCleanup(os.remove, path)
    loaded_vocab = Vocabulary.load(path)
    self.assertListEqual(self.vocab.idx2tok, loaded_vocab.idx2tok)
def setUp(self):
    """Create a three-token vocabulary and a matching token/index sequence pair."""
    self.tokens = ['a', 'b', 'c']
    self.vocab = Vocabulary(self.tokens)
    idx_a, idx_b, idx_c = (self.vocab.tok2idx[t] for t in self.tokens[-3:])
    self.tok_seq = ['a', 'a', 'b', 'c']
    self.idx_seq = [idx_a, idx_a, idx_b, idx_c]