Example #1
0
 def test_idx2token_out_of_bounds(self, instances):
     """An index at/beyond the vocabulary size must raise ValueError.

     Builds a vocab from the ``single_instance`` fixture and asserts that
     ``get_token_from_idx`` rejects an out-of-range index.
     """
     single_instance = instances["single_instance"]
     MAX_NUM_WORDS = 100
     vocab_builder = Vocab(instances=single_instance, max_num_tokens=MAX_NUM_WORDS)
     vocab_builder.build_vocab()
     # NOTE(review): index 100 is assumed to be past the last valid idx of a
     # vocab capped at 100 tokens — confirm Vocab's idx range convention.
     with pytest.raises(ValueError):
         vocab_builder.get_token_from_idx(100)
Example #2
0
    def get_embeddings_for_vocab(self, vocab: Vocab) -> torch.FloatTensor:
        """Build an embedding matrix aligned with ``vocab``'s idx->token mapping.

        For every index in the vocabulary, look the token up in
        ``self._embeddings`` — exact match first, then a lower-cased
        fallback. Tokens with no pre-trained vector are filled with values
        drawn from a standard normal distribution.

        Parameters
        ----------
        vocab : Vocab
            Vocabulary exposing ``get_idx2token_mapping()`` (idx -> token).

        Returns
        -------
        torch.FloatTensor
            Float tensor of shape ``(len(vocab), embedding_dimension)``.
        """
        idx2item = vocab.get_idx2token_mapping()
        embeddings = []
        for idx in range(len(idx2item)):
            item = idx2item.get(idx)
            # Two-step lookup via .get() replaces the previous nested
            # try/except, and also avoids an AttributeError on
            # ``None.lower()`` when an idx is absent from the mapping.
            # NOTE(review): assumes ``None`` is never a stored embedding
            # value in self._embeddings — confirm.
            emb = self._embeddings.get(item)
            if emb is None and item is not None:
                emb = self._embeddings.get(item.lower())
            if emb is None:
                # No pre-trained vector available: random normal init.
                emb = np.random.randn(self.embedding_dimension)
            embeddings.append(emb)

        # Stack with numpy first: torch.tensor on a Python list of ndarrays
        # is slow and emits a UserWarning on recent torch versions.
        return torch.tensor(np.asarray(embeddings), dtype=torch.float)