Example #1
0
 def test_idx2token_out_of_bounds(self, instances):
     """Looking up an index past the end of the vocabulary must raise ValueError.

     The vocab is built from a single instance, so index 100 is guaranteed
     to be outside the idx2token mapping.
     """
     single_instance = instances["single_instance"]
     MAX_NUM_WORDS = 100
     vocab_builder = Vocab(instances=single_instance, max_num_tokens=MAX_NUM_WORDS)
     vocab_builder.build_vocab()
     # NOTE(review): removed a leftover debug print of the idx2token mapping;
     # it only cluttered test output and asserted nothing.
     with pytest.raises(ValueError):
         vocab_builder.get_token_from_idx(100)
Example #2
0
 def test_idx2token_for_unk(self, instances):
     """The UNK token's own index must map back to the UNK token string.

     Many out-of-vocabulary words map to UNK. If, say, the index for UNK
     is 3, then mapping 3 back to a token should always yield "<UNK>" and
     never any other word.
     """
     # Fixed malformed docstring opener: it previously started with four
     # quote characters (""""), embedding a stray '"' in the docstring text.
     single_instance = instances["single_instance"]
     MAX_NUM_WORDS = 100
     vocab_builder = Vocab(
         instances=single_instance,
         max_num_tokens=MAX_NUM_WORDS,
         start_token="<SOS>",
         end_token="<EOS>",
         pad_token="<PAD>",
         unk_token="<UNK>",
     )
     vocab_builder.build_vocab()
     # special_vocab maps token -> (count, idx); take the index component.
     UNK_IDX = vocab_builder.special_vocab[vocab_builder.unk_token][1]
     assert vocab_builder.get_token_from_idx(UNK_IDX) == "<UNK>"
Example #3
0
 def test_add_tokens(self, instances, tmpdir):
     """Tokens added after build_vocab get registered with fresh indices.

     After adding "very" and "much" to a freshly-built vocab, each should
     appear in the vocab with count 1, receive the next free indices
     (7 and 8 here), and round-trip through both idx->token and
     token->idx lookups.
     """
     MAX_NUM_WORDS = 100
     store_path = tmpdir.mkdir("tempdir").join("vocab.json")
     vocab = Vocab(
         instances=instances["single_instance"],
         max_num_tokens=MAX_NUM_WORDS,
         store_location=store_path,
     )
     vocab.build_vocab()
     vocab.add_tokens(["very", "much"])
     # Each new token is expected at (count=1, idx) with these indices.
     expected_indices = {"very": 7, "much": 8}
     for token, idx in expected_indices.items():
         assert token in vocab.vocab.keys()
         assert vocab.vocab[token] == (1, idx)
         assert vocab.get_token_from_idx(idx) == token
         assert vocab.get_idx_from_token(token) == idx