Example #1
import unittest
from collections import defaultdict

# Vocabulary and SPECIAL_TOKENS are assumed to come from cornac.data.text,
# the module this test suite appears to target.
from cornac.data.text import Vocabulary, SPECIAL_TOKENS


class TestVocabulary(unittest.TestCase):
    def setUp(self):
        self.tokens = ['a', 'b', 'c']
        self.vocab = Vocabulary(self.tokens)
        # Resolve the index assigned to each token by the vocabulary.
        a, b, c = (self.vocab.tok2idx[tok] for tok in self.tokens)
        self.tok_seq = ['a', 'a', 'b', 'c']
        self.idx_seq = [a, a, b, c]

    def test_init(self):
        self.assertEqual(self.vocab.size, 3)
        self.assertListEqual(self.vocab.idx2tok, ['a', 'b', 'c'])

        tok2idx = defaultdict()
        for tok in self.tokens:
            tok2idx.setdefault(tok, len(tok2idx))
        self.assertDictEqual(self.vocab.tok2idx, tok2idx)

    def test_to_idx(self):
        self.assertEqual(self.vocab.to_idx(self.tok_seq), self.idx_seq)

    def test_to_text(self):
        self.assertEqual(self.vocab.to_text(self.idx_seq),
                         ' '.join(self.tok_seq))
        self.assertEqual(self.vocab.to_text(self.idx_seq, sep=None),
                         self.tok_seq)

    def test_save(self):
        self.vocab.save('tests/vocab.pkl')
        loaded_vocab = Vocabulary.load('tests/vocab.pkl')
        self.assertListEqual(self.vocab.idx2tok, loaded_vocab.idx2tok)

    def test_from_tokens(self):
        from_tokens_vocab = Vocabulary.from_tokens(self.tokens)
        self.assertCountEqual(self.vocab.idx2tok, from_tokens_vocab.idx2tok)

    def test_from_sequences(self):
        from_sequences_vocab = Vocabulary.from_sequences([self.tokens])
        self.assertCountEqual(self.vocab.idx2tok, from_sequences_vocab.idx2tok)

    def test_special_tokens(self):
        tokens = ['a', 'b', 'c', SPECIAL_TOKENS[1]]
        vocab = Vocabulary(tokens, use_special_tokens=True)

        self.assertEqual(vocab.size, len(SPECIAL_TOKENS) + 3)
        self.assertListEqual(vocab.idx2tok, SPECIAL_TOKENS + ['a', 'b', 'c'])

        tok2idx = defaultdict()
        for tok in SPECIAL_TOKENS + tokens:
            tok2idx.setdefault(tok, len(tok2idx))
        self.assertDictEqual(vocab.tok2idx, tok2idx)
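
The tests above pin down the Vocabulary API (size, to_idx, to_text, special tokens). A minimal usage sketch of that API, assuming Vocabulary is importable from cornac.data.text as in the import block above:

from cornac.data.text import Vocabulary  # assumed source module, as above

vocab = Vocabulary(['a', 'b', 'c'])            # indices assigned in insertion order: a->0, b->1, c->2
indices = vocab.to_idx(['a', 'a', 'b', 'c'])   # -> [0, 0, 1, 2]
text = vocab.to_text(indices)                  # -> 'a a b c' (joined with the default ' ' separator)
tokens = vocab.to_text(indices, sep=None)      # -> ['a', 'a', 'b', 'c'] (token list when sep is None)
print(vocab.size, indices, text, tokens)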
Example #2
    def test_with_special_tokens(self):
        # `self.docs` is the document fixture built in this test class's
        # setUp, which is not included in this snippet.
        vectorizer = CountVectorizer(max_doc_freq=2,
                                     min_freq=1,
                                     max_features=1)
        vectorizer.fit(self.docs)

        # Rebuild the fitted vocabulary with the special tokens prepended
        # and swap it into the vectorizer.
        new_vocab = Vocabulary(vectorizer.vocab.idx2tok,
                               use_special_tokens=True)
        vectorizer.vocab = new_vocab

        # X is a sparse document-term count matrix; `.A` densifies it.
        sequences, X = vectorizer.transform(self.docs)
        npt.assert_array_equal(X.A, np.asarray([[0], [2], [0]]))
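
The fit/transform pattern in this example can be run on its own. A hedged sketch, assuming CountVectorizer and Vocabulary come from cornac.data.text, with an illustrative docs list standing in for the self.docs fixture that the snippet does not show:

from cornac.data.text import CountVectorizer, Vocabulary  # assumed source module

docs = ['a b c', 'b c d', 'c d e']   # illustrative documents, not the fixture from the test class
vectorizer = CountVectorizer(max_doc_freq=2, min_freq=1, max_features=1)
vectorizer.fit(docs)

# Rebuild the fitted vocabulary with special tokens prepended and swap it in,
# mirroring what the test does before transforming.
vectorizer.vocab = Vocabulary(vectorizer.vocab.idx2tok, use_special_tokens=True)

sequences, X = vectorizer.transform(docs)  # token-index sequences and a sparse count matrix
print(X.toarray())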