def test_idx2token_cries_for_vocab(self, instances):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = 100
    vocab_builder = Vocab(instances=single_instance, max_num_tokens=MAX_NUM_WORDS)
    with pytest.raises(ValueError):
        vocab_builder.get_idx_from_token(1)

def set_char_vocab(self):
    self.char_instances = self.char_tokenizer.tokenize_batch(self.lines)
    self.char_vocab = Vocab(
        instances=self.char_instances,
        max_num_tokens=1e6,
        min_count=1,
        store_location=self.char_vocab_store_location,
        embedding_type=self.char_embedding_type,
        embedding_dimension=self.char_embedding_dimension,
        start_token=self.char_start_token,
        end_token=self.char_end_token,
        unk_token=self.char_unk_token,
        pad_token=self.char_pad_token,
    )
    self.char_vocab.build_vocab()

    # adding these to help conversion to characters later
    self.char_vocab.add_tokens(
        list(self.word_start_token)
        + list(self.word_end_token)
        + list(self.word_unk_token)
        + list(self.word_pad_token)
    )
    self.char_numericalizer = Numericalizer(vocabulary=self.char_vocab)
    self.char_vocab.print_stats()

def set_word_vocab(self):
    if not all(
        attribute in dir(self)
        for attribute in self.word_vocab_required_attributes
    ):
        raise ValueError(
            f"For building the word vocab, please pass these attributes in your "
            f"dataset construction: {self.word_vocab_required_attributes}"
        )
    self.word_instances = self.word_tokenizer.tokenize_batch(self.lines)
    self.word_vocab = Vocab(
        instances=self.word_instances,
        max_num_tokens=self.max_num_words,
        unk_token=self.word_unk_token,
        pad_token=self.word_pad_token,
        start_token=self.word_start_token,
        end_token=self.word_end_token,
        store_location=self.word_vocab_store_location,
        embedding_type=self.word_embedding_type,
        embedding_dimension=self.word_embedding_dimension,
    )
    self.word_numericalizer = Numericalizer(self.word_vocab)
    self.word_vocab.build_vocab()
    self.word_vocab.print_stats()

def test_build_vocab_single_instance_has_words(self, instances):
    single_instance = instances["single_instance"]
    vocab_builder = Vocab(instances=single_instance, max_num_tokens=1000)
    vocab = vocab_builder.map_tokens_to_freq_idx()
    assert "i" in vocab.keys()
    assert "like" in vocab.keys()
    assert "nlp" in vocab.keys()

def test_orig_vocab_len(self, instances):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = 0
    vocab_builder = Vocab(instances=single_instance, max_num_tokens=MAX_NUM_WORDS)
    vocab_builder.build_vocab()
    vocab_len = vocab_builder.get_orig_vocab_len()
    assert vocab_len == 3 + len(vocab_builder.special_vocab)

def test_get_topn(self, instances):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = 100
    vocab_builder = Vocab(instances=single_instance, max_num_tokens=MAX_NUM_WORDS)
    vocab_builder.build_vocab()
    words_freqs = vocab_builder.get_topn_frequent_words(n=1)
    assert words_freqs[0][0] == "i"
    assert words_freqs[0][1] == 3

def test_max_num_tokens_unset(self, instances, include_special_vocab):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = None
    vocab = Vocab(
        instances=single_instance,
        max_num_tokens=MAX_NUM_WORDS,
        include_special_vocab=include_special_vocab,
    )
    vocab.build_vocab()
    assert vocab.max_num_tokens == 3 + len(vocab.special_vocab.keys())

def setup_lstm2seqdecoder(request):
    HIDDEN_DIM = 1024
    NUM_LAYERS = request.param[0]
    BIDIRECTIONAL = request.param[1]
    TEACHER_FORCING_RATIO = request.param[3]
    MAX_LENGTH = 5
    lines = []
    words = []
    # texts = ["First", "second", "Third"]
    texts = ["First sentence", "second sentence", "Third long sentence here"]
    for text in texts:
        line = Line(text=text)
        word = Line(text=text.split()[0])
        lines.append(line)
        words.append(word)

    flat_texts = [[word for sentence in texts for word in sentence]]
    vocab = Vocab(flat_texts)
    vocab.build_vocab()

    num_direction = 2 if BIDIRECTIONAL else 1
    h0 = torch.ones(NUM_LAYERS, len(texts), num_direction * HIDDEN_DIM) * 0.1
    c0 = torch.ones(NUM_LAYERS, len(texts), num_direction * HIDDEN_DIM) * 0.2
    embedder = WordEmbedder(embedding_type="glove_6B_50")
    encoder_outputs = (
        torch.ones(len(texts), 5, num_direction * HIDDEN_DIM) * 0.5
        if request.param[2]
        else None
    )
    decoder = Lstm2SeqDecoder(
        embedder=embedder,
        vocab=vocab,
        max_length=MAX_LENGTH,
        attn_module=request.param[2],
        dropout_value=0.0,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        rnn_bias=False,
        num_layers=NUM_LAYERS,
    )
    return (
        decoder,
        {
            "HIDDEN_DIM": HIDDEN_DIM,
            "NUM_LAYERS": NUM_LAYERS,
            "MAX_LENGTH": MAX_LENGTH,
            "TEACHER_FORCING_RATIO": TEACHER_FORCING_RATIO,
            "LINES": lines,
            "WORDS": words,
            "VOCAB_SIZE": vocab.get_vocab_len(),
            "BIDIRECTIONAL": BIDIRECTIONAL,
        },
        encoder_outputs,
        (h0, c0),
    )

def test_vocab_always_has_special_tokens(self, instances):
    single_instance = instances["single_instance"]
    vocab_builder = Vocab(
        instances=single_instance, max_num_tokens=1000, min_count=1
    )
    vocab = vocab_builder.map_tokens_to_freq_idx()
    assert vocab_builder.unk_token in vocab.keys()
    assert vocab_builder.pad_token in vocab.keys()
    assert vocab_builder.start_token in vocab.keys()
    assert vocab_builder.end_token in vocab.keys()

def test_vocab_length_min_freq_1_max_words_1(self, instances):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = 1
    MIN_FREQ = 1
    vocab_builder = Vocab(
        instances=single_instance, min_count=MIN_FREQ, max_num_tokens=MAX_NUM_WORDS
    )
    vocab_builder.build_vocab()
    len_vocab = vocab_builder.get_vocab_len()
    assert len_vocab == 1 + len(vocab_builder.special_vocab)

def test_build_vocab_single_instance_min_freq_2(self, instances):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = 100
    MIN_FREQ = 2
    vocab_builder = Vocab(
        instances=single_instance, max_num_tokens=MAX_NUM_WORDS, min_count=MIN_FREQ
    )
    vocab = vocab_builder.build_vocab()
    vocab_len = len(set(idx for freq, idx in vocab.values()))
    assert vocab_len == 2 + len(vocab_builder.special_vocab)

def test_random_embedding_has_2dimensions(self, instances):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = 100
    vocab = Vocab(
        instances=single_instance,
        max_num_tokens=MAX_NUM_WORDS,
        embedding_type=None,
        embedding_dimension=300,
    )
    vocab.build_vocab()
    embeddings = vocab.load_embedding()
    assert embeddings.ndimension() == 2

def test_get_topn(self, instances, include_special_vocab):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = None
    vocab_builder = Vocab(
        instances=single_instance,
        max_num_tokens=MAX_NUM_WORDS,
        include_special_vocab=include_special_vocab,
    )
    vocab_builder.build_vocab()
    words_freqs = vocab_builder.get_topn_frequent_words(n=1)
    assert words_freqs[0][0] == "i"
    assert words_freqs[0][1] == 3

def test_disp_sentences_from_indices(self, instances, tmpdir, include_special_vocab):
    instance_dict = instances
    single_instance = instance_dict["single_instance"]
    MAX_NUM_WORDS = None
    vocab = Vocab(
        instances=single_instance,
        max_num_tokens=MAX_NUM_WORDS,
        include_special_vocab=include_special_vocab,
    )
    vocab.build_vocab()
    sent = vocab.get_disp_sentence_from_indices([0, 1, 2])
    assert type(sent) is str

def test_disp_sentences_from_indices(self, instances, tmpdir):
    instance_dict = instances
    single_instance = instance_dict["single_instance"]
    MAX_NUM_WORDS = 100
    vocab_file = tmpdir.mkdir("tempdir").join("vocab.json")
    vocab = Vocab(
        instances=single_instance,
        max_num_tokens=MAX_NUM_WORDS,
        embedding_type=None,
        embedding_dimension=300,
        store_location=vocab_file,
    )
    vocab.build_vocab()
    sent = vocab.get_disp_sentence_from_indices([0, 1, 2, 3])
    assert type(sent) is str

def test_build_vocab_single_instance_descending_order(self, instances):
    single_instance = instances["single_instance"]
    vocab_builder = Vocab(
        instances=single_instance, max_num_tokens=1000, min_count=1
    )
    vocab = vocab_builder.map_tokens_to_freq_idx()
    i_freq, i_idx = vocab["i"]
    like_freq, like_idx = vocab["like"]
    nlp_freq, nlp_idx = vocab["nlp"]
    # "i" appears more often than "like", which appears more often than "nlp",
    # so their indices should ascend while their frequencies descend
    assert i_idx < like_idx < nlp_idx
    assert i_freq > like_freq > nlp_freq

def get_numericalized_instances(get_preprocessed_instances):
    instances, labels = get_preprocessed_instances
    MAX_NUM_WORDS = 3000
    MAX_LENGTH = 15
    vocab = Vocab(instances=instances, max_num_tokens=MAX_NUM_WORDS)
    vocab.build_vocab()
    numericalizer = Numericalizer(vocabulary=vocab)
    numericalized_instances = numericalizer.numericalize_batch_instances(
        instances[:32]
    )
    return {
        "numericalized_instances": numericalized_instances,
        "labels": labels,
        "max_length": MAX_LENGTH,
        "max_num_words": MAX_NUM_WORDS,
        "vocab": vocab,
    }

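# Illustrative sketch, not part of the original suite: one way the fixture
# above might be consumed. It assumes get_numericalized_instances is a pytest
# fixture and that numericalize_batch_instances returns one list of integer
# indices per instance; both are assumptions, not confirmed in this file.
def test_numericalized_indices_within_vocab_range(get_numericalized_instances):
    numericalized = get_numericalized_instances["numericalized_instances"]
    vocab = get_numericalized_instances["vocab"]
    vocab_len = vocab.get_vocab_len()
    # every index produced by the Numericalizer should point into the vocab
    for instance in numericalized:
        assert all(0 <= idx < vocab_len for idx in instance)
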
def test_token2idx(self, instances, start_token, end_token, unk_token, pad_token):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = 100
    vocab_builder = Vocab(
        instances=single_instance,
        max_num_tokens=MAX_NUM_WORDS,
        start_token=start_token,
        end_token=end_token,
        pad_token=pad_token,
        unk_token=unk_token,
    )
    vocab_builder.build_vocab()
    token2idx = vocab_builder.token2idx
    len_indices = len(token2idx.keys())
    indices = sorted(token2idx.values())
    assert indices == list(range(len_indices))

def test_idx2token_for_unk(self, instances):
    """Many words map to UNK in the vocab. For example, say the index for UNK
    is 3; mapping 3 back to a token should always give UNK and never any
    other word.
    """
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = 100
    vocab_builder = Vocab(
        instances=single_instance,
        max_num_tokens=MAX_NUM_WORDS,
        start_token="<SOS>",
        end_token="<EOS>",
        pad_token="<PAD>",
        unk_token="<UNK>",
    )
    vocab_builder.build_vocab()
    UNK_IDX = vocab_builder.special_vocab[vocab_builder.unk_token][1]
    assert vocab_builder.get_token_from_idx(UNK_IDX) == "<UNK>"

def single_instance_setup(instances):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = 100
    vocabulary = Vocab(instances=single_instance, max_num_tokens=MAX_NUM_WORDS)
    numericalizer = Numericalizer(vocabulary=vocabulary)
    return single_instance, numericalizer, vocabulary

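# Illustrative sketch, not part of the original suite: token2idx and idx2token
# should be inverse mappings once the vocab is built, which follows from the
# contiguous-index checks in test_token2idx and test_idx2token elsewhere in
# this file. Uses only attributes that already appear in these tests.
def test_token2idx_and_idx2token_are_inverse(self, instances):
    single_instance = instances["single_instance"]
    vocab_builder = Vocab(instances=single_instance, max_num_tokens=100)
    vocab_builder.build_vocab()
    for token, idx in vocab_builder.token2idx.items():
        assert vocab_builder.idx2token[idx] == token
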
def test_idx2token_out_of_bounds(self, instances):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = 100
    vocab_builder = Vocab(instances=single_instance, max_num_tokens=MAX_NUM_WORDS)
    vocab_builder.build_vocab()
    print(vocab_builder.get_idx2token_mapping())
    with pytest.raises(ValueError):
        vocab_builder.get_token_from_idx(100)

def test_idx2token(self, instances, start_token, end_token, unk_token, pad_token):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = 100
    vocab_builder = Vocab(
        instances=single_instance,
        max_num_tokens=MAX_NUM_WORDS,
        start_token=start_token,
        end_token=end_token,
        pad_token=pad_token,
        unk_token=unk_token,
    )
    vocab_builder.build_vocab()
    idx2token = vocab_builder.idx2token
    len_idx2token = len(idx2token)
    indices = sorted(idx2token.keys())
    # tests that all indices are contiguous and in order
    assert indices == list(range(len_idx2token))

def test_single_instance_build_vocab(self, instances, include_special_vocab):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = None
    MIN_FREQ = 1
    vocab_builder = Vocab(
        instances=single_instance,
        max_num_tokens=MAX_NUM_WORDS,
        min_count=MIN_FREQ,
        include_special_vocab=include_special_vocab,
    )
    vocab = vocab_builder.build_vocab()
    assert "i" in vocab.keys()
    assert "like" in vocab.keys()
    assert "nlp" in vocab.keys()
    vocab_len = len(set(idx for freq, idx in vocab.values()))
    assert vocab_len == 3 + len(vocab_builder.special_vocab)

def test_print_stats_works(self, instances):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = 100
    vocab_builder = Vocab(instances=single_instance, max_num_tokens=MAX_NUM_WORDS)
    vocab_builder.build_vocab()
    vocab_builder.print_stats()

def test_load_vocab(self, instances, tmpdir):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = 100
    vocab_builder = Vocab(instances=single_instance, max_num_tokens=MAX_NUM_WORDS)
    vocab_builder.build_vocab()
    vocab_file = tmpdir.mkdir("tempdir").join("vocab.json")
    vocab_builder.save_to_file(vocab_file)
    vocab = Vocab.load_from_file(filename=vocab_file)
    assert vocab.get_vocab_len() == 3 + len(vocab_builder.special_vocab)

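# Illustrative sketch, not part of the original suite: beyond preserving the
# vocabulary length, a save/load round trip would be expected to preserve the
# token-to-index mapping itself. This assumes load_from_file restores
# token2idx exactly, which is not confirmed by this file.
def test_load_vocab_preserves_token2idx(self, instances, tmpdir):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = 100
    vocab_builder = Vocab(instances=single_instance, max_num_tokens=MAX_NUM_WORDS)
    vocab_builder.build_vocab()
    vocab_file = tmpdir.mkdir("tempdir").join("vocab.json")
    vocab_builder.save_to_file(vocab_file)
    loaded_vocab = Vocab.load_from_file(filename=vocab_file)
    assert loaded_vocab.token2idx == vocab_builder.token2idx
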
def test_preprocessing_lower(self, instances):
    single_instance = instances["single_instance"]
    instance_preprocessing = InstancePreprocessing()
    vocab_builder = Vocab(
        instances=single_instance,
        max_num_tokens=1000,
        preprocessing_pipeline=[instance_preprocessing.lowercase],
    )
    instances = vocab_builder.instances
    for instance in instances:
        for token in instance:
            assert token.islower()

def test_print_stats_works(self, instances, include_special_vocab):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = None
    vocab_builder = Vocab(
        instances=single_instance,
        max_num_tokens=MAX_NUM_WORDS,
        include_special_vocab=include_special_vocab,
    )
    vocab_builder.build_vocab()
    vocab_builder.print_stats()

def test_save_vocab(self, instances, tmpdir):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = 100
    vocab_builder = Vocab(instances=single_instance, max_num_tokens=MAX_NUM_WORDS)
    vocab_builder.build_vocab()
    vocab_file = tmpdir.mkdir("tempdir").join("vocab.json")
    vocab_builder.save_to_file(vocab_file)
    assert os.path.isfile(vocab_file)

def test_load_embedding_has_all_words(self, instances, embedding_type):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = 100
    vocab = Vocab(
        instances=single_instance,
        max_num_tokens=MAX_NUM_WORDS,
        embedding_type=embedding_type,
    )
    vocab.build_vocab()
    embedding = vocab.load_embedding()
    assert embedding.size(0) == vocab.get_vocab_len()

def test_single_instance_clip_on_max_num(self, instances):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = 1
    vocab_builder = Vocab(instances=single_instance, max_num_tokens=MAX_NUM_WORDS)
    vocab_builder.build_vocab()
    vocab = vocab_builder.map_tokens_to_freq_idx()
    vocab = vocab_builder.clip_on_max_num(vocab)
    vocab_len = len(set(idx for freq, idx in vocab.values()))
    assert vocab_len == MAX_NUM_WORDS + len(vocab_builder.special_vocab)