Exemple #1
0
    def test_build_vocab_single_instance_has_words(self, instances):
        """Check that "i", "like" and "nlp" are keys of the token map.

        The map comes from ``map_tokens_to_freq_idx`` on a ``Vocab`` created
        from the ``single_instance`` entry of the ``instances`` fixture.
        """
        builder = Vocab(
            max_num_tokens=1000, instances=instances["single_instance"]
        )
        known_tokens = builder.map_tokens_to_freq_idx().keys()

        # Same three membership checks, in the same order.
        for word in ("i", "like", "nlp"):
            assert word in known_tokens
Exemple #2
0
    def test_vocab_always_has_special_tokens(self, instances):
        """Check that the unk, pad, start and end tokens are token-map keys.

        The map comes from ``map_tokens_to_freq_idx`` on a ``Vocab`` created
        with ``min_count=1`` from the ``single_instance`` fixture entry.
        """
        builder = Vocab(
            min_count=1,
            max_num_tokens=1000,
            instances=instances["single_instance"],
        )
        token_keys = builder.map_tokens_to_freq_idx().keys()

        # Attributes are read one at a time, in the original order, so each
        # special token is only looked up when its own assertion runs.
        for attr_name in ("unk_token", "pad_token", "start_token", "end_token"):
            assert getattr(builder, attr_name) in token_keys
Exemple #3
0
    def test_single_instance_clip_on_max_num(self, instances):
        """Check the number of distinct indices left by ``clip_on_max_num``.

        With ``max_num_tokens=1`` the clipped token map is expected to hold
        exactly one distinct index plus one per entry of the builder's
        ``special_vocab``.
        """
        token_budget = 1
        builder = Vocab(
            max_num_tokens=token_budget, instances=instances["single_instance"]
        )
        builder.build_vocab()

        clipped = builder.clip_on_max_num(builder.map_tokens_to_freq_idx())

        # Each map value is a (freq, idx) pair; count the distinct indices.
        distinct_indices = {idx for _freq, idx in clipped.values()}

        assert len(distinct_indices) == token_budget + len(builder.special_vocab)
Exemple #4
0
    def test_single_instance_min_count(self, instances):
        """Check that "nlp" shares the unknown-token index after min-count clipping.

        The vocab is created with ``min_count=2`` and the raw token map is
        passed through ``clip_on_mincount``. The entry for ``"nlp"``
        (presumably seen fewer than ``min_count`` times in the fixture) must
        then carry the same index as the builder's unknown token.
        """
        single_instance = instances["single_instance"]

        vocab_builder = Vocab(
            instances=single_instance, max_num_tokens=1000, min_count=2
        )
        # NOTE(review): build_vocab() presumably populates token2idx, which is
        # read below -- confirm against Vocab.
        vocab_builder.build_vocab()
        vocab = vocab_builder.map_tokens_to_freq_idx()
        vocab = vocab_builder.clip_on_mincount(vocab)

        # Only the index of the (freq, idx) pair matters for this check.
        _, nlp_idx = vocab["nlp"]

        # Resolve the unknown token through the builder's own attribute rather
        # than a hard-coded literal, so the test follows the configured token.
        assert nlp_idx == vocab_builder.token2idx[vocab_builder.unk_token]
Exemple #5
0
    def test_build_vocab_single_instance_descending_order(self, instances):
        """Check that more frequent tokens receive smaller indices.

        Looks up "i", "like" and "nlp" in the token map and expects their
        indices to increase while their frequencies decrease, in that order.
        """
        builder = Vocab(
            min_count=1,
            max_num_tokens=1000,
            instances=instances["single_instance"],
        )
        token_map = builder.map_tokens_to_freq_idx()

        freq_i, idx_i = token_map["i"]
        freq_like, idx_like = token_map["like"]
        freq_nlp, idx_nlp = token_map["nlp"]

        # 'i' occurs more often than 'like', which occurs more often than
        # 'nlp', so the indices must ascend in that order ...
        assert idx_i < idx_like
        assert idx_like < idx_nlp
        # ... and the recorded frequencies must descend.
        assert freq_i > freq_like
        assert freq_like > freq_nlp