def test_contains(self):
    vocab = Vocabulary(need_default=True, max_size=None, min_freq=None)
    vocab.update(text)
    self.assertTrue(text[-1] in vocab)
    self.assertFalse("~!@#" in vocab)
    self.assertEqual(text[-1] in vocab, vocab.has_word(text[-1]))
    self.assertEqual("~!@#" in vocab, vocab.has_word("~!@#"))
def test_contains(self):
    vocab = Vocabulary(max_size=None, min_freq=None, unknown=None, padding=None)
    vocab.update(text)
    self.assertTrue(text[-1] in vocab)
    self.assertFalse("~!@#" in vocab)
    self.assertEqual(text[-1] in vocab, vocab.has_word(text[-1]))
    self.assertEqual("~!@#" in vocab, vocab.has_word("~!@#"))
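# A minimal sketch (an assumption for illustration, not the library's actual
# implementation) of why `w in vocab` mirrors `vocab.has_word(w)`: the
# `__contains__` hook can simply delegate to `has_word`, which is exactly the
# invariant the test above asserts. `_ToyVocabulary` is a hypothetical
# stand-in for the real Vocabulary class.
from collections import Counter


class _ToyVocabulary:
    """Simplified stand-in for Vocabulary, for illustration only."""

    def __init__(self):
        self.word_count = Counter()  # word -> frequency

    def update(self, words):
        # accept a single word or an iterable of words
        self.word_count.update([words] if isinstance(words, str) else words)

    def has_word(self, w):
        return w in self.word_count

    def __contains__(self, w):
        # `w in vocab` delegates to has_word
        return self.has_word(w)


_v = _ToyVocabulary()
_v.update(["the", "quick", "brown", "fox"])
assert "fox" in _v and "~!@#" not in _v
assert ("fox" in _v) == _v.has_word("fox")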
def load_embedding(emb_dim, emb_file, emb_type, vocab, emb_pkl):
    """Load the pre-trained embedding and combine it with the given vocabulary.

    :param emb_dim: int, the dimension of the embedding. Must match the dimension of the pre-trained embedding.
    :param emb_file: str, the pre-trained embedding file path.
    :param emb_type: str, the pre-trained embedding format; only glove is supported for now.
    :param vocab: Vocabulary, a mapping from word to index; can be provided by the user or built from the pre-trained embedding.
    :param emb_pkl: str, the embedding pickle file path.
    :return embedding_tensor: Tensor of shape (len(vocab), emb_dim)
            vocab: the input vocab, or a vocab built from the pre-trained embedding

    TODO: fragile code
    """
    # If the embedding pickle exists, load it and return.
    # if os.path.exists(emb_pkl):
    #     with open(emb_pkl, "rb") as f:
    #         embedding_tensor, vocab = _pickle.load(f)
    #     return embedding_tensor, vocab
    # Otherwise, load the pre-trained embedding.
    pretrain = EmbedLoader._load_pretrain(emb_file, emb_type)
    if vocab is None:
        # build the vocabulary from the pre-trained embedding
        vocab = Vocabulary()
        for w in pretrain.keys():
            vocab.update(w)
    embedding_tensor = torch.randn(len(vocab), emb_dim)
    for w, v in pretrain.items():
        if len(v.shape) > 1 or emb_dim != v.shape[0]:
            raise ValueError(
                "Pre-trained embedding dim is {}, mismatching the required {}".format(v.shape, (emb_dim,)))
        if vocab.has_word(w):
            embedding_tensor[vocab[w]] = v
    # save and return the result
    # with open(emb_pkl, "wb") as f:
    #     _pickle.dump((embedding_tensor, vocab), f)
    return embedding_tensor, vocab
def load_embedding(emb_dim, emb_file, emb_type, vocab):
    """Load the pre-trained embedding and combine it with the given vocabulary.

    :param int emb_dim: the dimension of the embedding. Must match the dimension of the pre-trained embedding.
    :param str emb_file: the pre-trained embedding file path.
    :param str emb_type: the pre-trained embedding format; only glove is supported for now.
    :param Vocabulary vocab: a mapping from word to index; can be provided by the user or built from the pre-trained embedding.
    :return embedding_tensor: Tensor of shape (len(vocab), emb_dim)
            vocab: the input vocab, or a vocab built from the pre-trained embedding
    """
    pretrain = EmbedLoader._load_pretrain(emb_file, emb_type)
    if vocab is None:
        # build the vocabulary from the pre-trained embedding
        vocab = Vocabulary()
        for w in pretrain.keys():
            vocab.add(w)
    embedding_tensor = torch.randn(len(vocab), emb_dim)
    for w, v in pretrain.items():
        if len(v.shape) > 1 or emb_dim != v.shape[0]:
            raise ValueError(
                "Pre-trained embedding dim is {}. Dimension mismatched. Required: {}".format(v.shape, (emb_dim,)))
        if vocab.has_word(w):
            embedding_tensor[vocab[w]] = v
    return embedding_tensor, vocab
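# A runnable mini-version of the combine step above (illustration only, not
# the library's API): rows are initialized randomly with torch.randn, then
# overwritten for every word that appears in the pre-trained table, so
# out-of-vocabulary words keep their random initialization. The toy vocab and
# "glove" table below are made-up data for the sketch.
import torch

emb_dim = 4
word2idx = {"the": 0, "fox": 1, "zzz_oov": 2}           # toy vocab
pretrain = {                                            # toy "glove" table
    "the": torch.tensor([0.1, 0.2, 0.3, 0.4]),
    "fox": torch.tensor([0.5, 0.6, 0.7, 0.8]),
}

embedding_tensor = torch.randn(len(word2idx), emb_dim)  # random init covers OOV words
for w, v in pretrain.items():
    if v.shape != (emb_dim,):
        raise ValueError("dim mismatch: {} vs {}".format(v.shape, (emb_dim,)))
    if w in word2idx:
        embedding_tensor[word2idx[w]] = v               # overwrite with the pre-trained row

assert torch.equal(embedding_tensor[0], pretrain["the"])  # "the" got its glove vector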