def test_idx2token_out_of_bounds(self, instances):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = 100
    vocab_builder = Vocab(instances=single_instance, max_num_tokens=MAX_NUM_WORDS)
    vocab_builder.build_vocab()
    print(vocab_builder.get_idx2token_mapping())
    with pytest.raises(ValueError):
        vocab_builder.get_token_from_idx(100)
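# The test above relies on a pytest fixture named `instances`. A minimal
# sketch of such a fixture is given below; the fixture body and tokens are
# invented for illustration, and the real fixture in the suite may differ.
import pytest

@pytest.fixture
def instances():
    single_instance = [["i", "like", "nlp", "very", "much"]]
    return {"single_instance": single_instance}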
def get_embeddings_for_vocab(self, vocab: Vocab) -> torch.FloatTensor:
    idx2item = vocab.get_idx2token_mapping()
    len_vocab = len(idx2item)
    embeddings = []
    for idx in range(len_vocab):
        item = idx2item.get(idx)
        try:
            # try getting the embedding from the embeddings dictionary
            emb = self._embeddings[item]
        except KeyError:
            try:
                # fall back to the lowercased form of the token
                emb = self._embeddings[item.lower()]
            except KeyError:
                # no pretrained vector available: fill with random floats
                # drawn from a standard normal distribution
                emb = np.random.randn(self.embedding_dimension)
        embeddings.append(emb)
    embeddings = torch.tensor(embeddings, dtype=torch.float)
    return embeddings
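# To illustrate the three-tier lookup performed above (exact match, then
# lowercased match, then a random N(0, 1) fallback), here is a minimal
# standalone sketch; the toy embedding dictionary, dimension, and tokens
# are invented for demonstration.
import numpy as np

toy_embeddings = {"cat": np.ones(4), "the": np.full(4, 2.0)}
dim = 4

for token in ["cat", "The", "qzxv"]:
    try:
        emb = toy_embeddings[token]              # exact match ("cat")
    except KeyError:
        try:
            emb = toy_embeddings[token.lower()]  # case-insensitive match ("The")
        except KeyError:
            emb = np.random.randn(dim)           # random fallback ("qzxv")
    print(token, emb[:2])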