def _create_pv(self):
    try:
        vectors = KeyedVectors()
    except TypeError:
        # Newer versions of gensim require a constructor argument.
        vectors = KeyedVectors(self._wv.shape[1])
    phrases = []
    for name in self.terminology.iter_names():  # This iterates over unique names.
        vectors.vocab[name] = Vocab(index=len(vectors.vocab), count=None)
        vectors.index2word.append(name)
        phrases.append(self._phrase_vector(name))
    vectors.syn0 = vectors.syn0norm = np.array(phrases)
    return vectors
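# For reference, a hedged sketch of the same construction against the gensim 4
# API, where KeyedVectors no longer exposes vocab / index2word / syn0 and
# add_vectors() handles the bookkeeping; the names mirror the method above and
# the helper name _create_pv_gensim4 is illustrative, not from the original code.
def _create_pv_gensim4(self):
    names = list(self.terminology.iter_names())
    vectors = KeyedVectors(self._wv.shape[1])
    vectors.add_vectors(names, np.array([self._phrase_vector(name) for name in names]))
    return vectors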
def test_average_embeddings_custom():
    corpus = [
        ["the", "man", "ran"],
        ["the", "boy"],
        ["the", "man", "boy"],
    ]
    max_sent_len = 3
    model = KeyedVectors(vector_size=1)
    model.add(["the"], [np.array([1])])
    model.add(["man"], [np.array([2])])
    model.add(["ran"], [np.array([3])])
    model.add(["boy"], [np.array([6])])
    model.add([PAD_TOK], [np.array([0])])
    embedding_input = lib.embeddings.get_embedding_input(corpus, model, max_sent_len)
    avg_embedding_model = lib.mlp.get_average_embedding_model(
        input_shape=embedding_input.shape[1:],
        w2v_model=model,
    )
    avg = avg_embedding_model.predict(embedding_input)
    print(avg)
    assert avg[0][0] == (1 + 2 + 3) / 3
    assert avg[1][0] == (1 + 6) / 2
    assert avg[2][0] == (1 + 2 + 6) / 3
def main():
    config = load_config()
    with open(os.path.join(config.cooccurrence_dir, "vocab.pkl"), "rb") as f:
        vocab = pickle.load(f)
    model = GloVe(
        vocab_size=config.vocab_size,
        embedding_size=config.embedding_size,
        x_max=config.x_max,
        alpha=config.alpha,
    )
    model.load_state_dict(torch.load(config.output_filepath))
    keyed_vectors = KeyedVectors(vector_size=config.embedding_size)
    # Sum the main and context embeddings, a common trick for GloVe.
    keyed_vectors.add_vectors(
        keys=[vocab.get_token(index) for index in range(config.vocab_size)],
        weights=(model.weight.weight.detach() + model.weight_tilde.weight.detach()).numpy(),
    )
    print("How similar are man and woman:")
    print(keyed_vectors.similarity("woman", "man"))
    print("How similar are man and apple:")
    print(keyed_vectors.similarity("apple", "man"))
    print("How similar are woman and apple:")
    print(keyed_vectors.similarity("apple", "woman"))
    for word in ["computer", "united", "early"]:
        print(f"Most similar words to {word}:")
        most_similar_words = [w for w, _ in keyed_vectors.similar_by_word(word)]
        print(most_similar_words)
def test_hierarchical_pool():
    tokens: List[str] = ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち']
    kv: KeyedVectors = KeyedVectors(vector_size=200)
    word_embeds: np.ndarray = models._word_embeds(tokens, kv, (-1, 1))
    ret: np.ndarray = models._hierarchical_pool(word_embeds, num_windows=3)
    assert ret.shape == (200,)
def test_embeddings_input():
    corpus = [
        ["the", "man", "ran", "to", "boy"],
        ["the", "boy", "pig"],
        ["the", "pig", "man", "boy"],
    ]
    # "pig" and "to" are not in the vocabulary.
    model = KeyedVectors(vector_size=2)
    model.add(["the"], [np.array([1, 3])])    # idx 0
    model.add(["man"], [np.array([2, 5])])    # idx 1
    model.add(["ran"], [np.array([6, 9])])    # idx 2
    model.add(["boy"], [np.array([4, 20])])   # idx 3
    model.add([PAD_TOK], [np.array([0, 0])])  # idx 4
    embedding_input = lib.embeddings.get_embedding_input(corpus, model)
    assert (embedding_input == np.array([
        [0, 1, 2, 3],
        [0, 3, 4, 4],
        [0, 1, 3, 4],
    ])).all()
    embedding_layer = lib.embeddings.get_keras_embedding(model)
    embedded = embedding_layer(embedding_input)
    assert (embedded.numpy() == np.array([
        [[1., 3.], [2., 5.], [6., 9.], [4., 20.]],
        [[1., 3.], [4., 20.], [0., 0.], [0., 0.]],
        [[1., 3.], [2., 5.], [4., 20.], [0., 0.]],
    ])).all()
def create_keyedvector_from_matrix(self, embedding_matrix, word2id):
    """
    Imports the necessary attributes for the Embedding object from an
    embedding matrix and a word2id vocabulary. Can be used for custom
    pre-trained embeddings.

    Parameters
    ----------
    embedding_matrix: numpy.ndarray
        Embedding matrix as a numpy object
    word2id: dict
        Word vocabulary (key: word, value: word_index)
    """
    # Order the vocabulary by word index so it lines up with the matrix rows.
    vocab = {
        word: word2id[word]
        for word in sorted(word2id, key=word2id.__getitem__)
    }
    vector_size = embedding_matrix.shape[1]
    kv = KeyedVectors(vector_size)
    kv.vector_size = vector_size
    kv.vectors = embedding_matrix
    kv.index2word = list(vocab.keys())
    kv.vocab = {
        word: Vocab(index=word_id, count=0)
        for word, word_id in vocab.items()
    }
    self.embedding = kv
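# A minimal usage sketch for create_keyedvector_from_matrix, assuming a host
# class (called Embedding here purely for illustration) that exposes the method
# above, and a gensim 3.x KeyedVectors (the .vocab / index2word attributes were
# removed in gensim 4). The toy matrix and vocabulary are made up.
import numpy as np

matrix = np.array([[0.1, 0.2], [0.3, 0.4]])  # one row per word
word2id = {"hello": 0, "world": 1}
emb = Embedding()
emb.create_keyedvector_from_matrix(matrix, word2id)
print(emb.embedding["hello"])  # -> [0.1 0.2]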
def test_hierarchical_pool_raise():
    """ValueError is raised when an invalid num_windows is passed."""
    doc: str = '桃'
    kv: KeyedVectors = KeyedVectors(vector_size=200)
    word_embeds = models._word_embeds(doc, kv, (-1, 1))
    with pytest.raises(ValueError):
        # text_length: 1, num_windows: 3
        models._hierarchical_pool(word_embeds, num_windows=3)
def test_add_type(self):
    kv = KeyedVectors(2)
    assert kv.vectors.dtype == REAL

    words, vectors = ["a"], np.array([1., 1.], dtype=np.float64).reshape(1, -1)
    kv.add_vectors(words, vectors)
    assert kv.vectors.dtype == REAL
def __init__(self, max_vocab_size, min_count, sample, estimate_memory):
    self.max_vocab_size = max_vocab_size
    self.corpus_count = 0
    self.raw_vocab = None
    self.wv = KeyedVectors()
    self.min_count = min_count
    self.sample = sample
    self.estimate_memory = estimate_memory
def truncate_w2v(w2v, new_dims):
    """Limit w2v to the specified number of dimensions, selected at random."""
    old_dims = w2v.vectors.shape[1]
    new_w2v = KeyedVectors(new_dims)
    vocab = list(w2v.vocab.keys())
    cols_idx = np.random.choice(old_dims, size=new_dims, replace=False)
    weights = w2v.vectors[:, cols_idx]
    new_w2v.add(vocab, weights)
    return new_w2v
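# A short usage sketch for truncate_w2v, assuming a gensim 3.x model (the
# function relies on the .vocab attribute and .add(), both removed in gensim 4);
# the file name is illustrative.
from gensim.models import KeyedVectors

w2v = KeyedVectors.load_word2vec_format("vectors.bin", binary=True)
small = truncate_w2v(w2v, new_dims=50)
assert small.vectors.shape == (len(w2v.vocab), 50)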
def test_no_header(self):
    randkv = KeyedVectors(vector_size=100)
    count = 20
    keys = [str(i) for i in range(count)]
    weights = [pseudorandom_weak_vector(randkv.vector_size) for _ in range(count)]
    randkv.add_vectors(keys, weights)
    tmpfiletxt = gensim.test.utils.get_tmpfile("tmp_kv.txt")
    randkv.save_word2vec_format(tmpfiletxt, binary=False, write_header=False)
    reloadtxtkv = KeyedVectors.load_word2vec_format(tmpfiletxt, binary=False, no_header=True)
    self.assertEqual(randkv.index_to_key, reloadtxtkv.index_to_key)
    self.assertTrue((randkv.vectors == reloadtxtkv.vectors).all())
def test_infer_vector_functional():
    tokens: List[str] = ['私', 'は', '私', 'は']
    kv: KeyedVectors = KeyedVectors(vector_size=200)
    methods: Dict[str, Tuple[int]] = {
        'avg': (200,),
        'max': (200,),
        'concat': (400,),
        'hierarchical': (200,),
    }
    for method, shape in methods.items():
        embed: np.ndarray = swem.infer_vector(tokens, kv=kv, method=method)
        assert embed.shape == shape
def _set_keyedvector(self, attrname, keys, dim, vec=None):
    keyed_vec = KeyedVectors(dim)
    dummy_max_count = len(keys) + 1
    for i, key in enumerate(keys):
        key = str(key)
        keyed_vec.vocab[key] = Vocab(index=i, count=dummy_max_count - i)  # dummy count
        keyed_vec.index2word.append(key)
    if vec is not None:
        keyed_vec.vectors = vec
        keyed_vec.init_sims()
    setattr(self, attrname, keyed_vec)
def create_small_w2v_model(num_most_common_words=500000, cache_dir=W2VDIR):
    orig_model = load_word_vector_model(small=False, cache_dir=cache_dir)
    words = orig_model.index2entity[:num_most_common_words]
    kv = KeyedVectors(vector_size=orig_model.wv.vector_size)
    vectors = []
    for word in words:
        vectors.append(orig_model.get_vector(word))
    # Adds keys (words) & vectors as a batch.
    kv.add(words, vectors)
    w2v_small_filename = fewshot_filename(cache_dir, W2V_SMALL)
    kv.save_word2vec_format(w2v_small_filename, binary=True)
def update(self):
    wv = self.word_vectors_file.get_word_vectors()
    voc = self.vocabs_file.get_vocabs()['word']
    words_in_vocab = [
        k for k, _ in sorted(voc.items(), key=lambda i: i[1][0])
    ]
    # Give the first vocabulary entry the mean of all remaining word embeddings.
    word_embs = wv[words_in_vocab[1:]]
    unk_emb = np.mean(word_embs, 0, keepdims=True)
    embs = np.concatenate((unk_emb, word_embs), 0)
    kv = KeyedVectors(embs.shape[1])
    kv.syn0 = embs
    kv.vocab = {k: Vocab(index=v[0], count=v[1]) for k, v in voc.items()}
    kv.index2word = words_in_vocab
    kv.save(self.path)
def minimize(_log, vectors_path="wiki.en.vec", output_path="wiki.min.en.vec"):
    """Minimize the given vectors file to contain only words in the given corpus."""
    samples = {wh: list(read_samples(which=wh)) for wh in ["train", "test"]}
    try:
        samples["dev"] = list(read_samples(which="dev"))
    except FileNotFoundError:
        pass  # skip if it does not exist
    vocab = Vocab.from_samples(chain(*samples.values()))
    kv = KeyedVectors.load_word2vec_format(vectors_path)
    _log.info("Creating new, minimized word vectors")
    min_kv = KeyedVectors(kv.vector_size)
    for w in kv.vocab:
        if w in vocab["words"]:
            min_kv[w] = kv[w]
    _log.info("Saving the new word vectors to %s", output_path)
    min_kv.save_word2vec_format(output_path)
def _load_small_word_vector_model(cache_dir, num_most_common_words=500000):
    filename = fewshot_filename(cache_dir, W2V_SMALL)
    if not os.path.exists(filename):
        orig_model = _load_large_word_vector_model(cache_dir)
        words = orig_model.index2entity[:num_most_common_words]
        kv = KeyedVectors(vector_size=orig_model.wv.vector_size)
        vectors = []
        for word in words:
            vectors.append(orig_model.get_vector(word))
        # Adds keys (words) & vectors as a batch; `filename` already points at
        # the small-model cache path, so save directly to it.
        kv.add(words, vectors)
        kv.save_word2vec_format(filename, binary=True)
    return KeyedVectors.load_word2vec_format(filename, binary=True)
def get_kv(model_path, embed_size=None):
    # If not given, infer the embedding size from the digits in the model path.
    if embed_size is None:
        embed_size = int(''.join(c for c in model_path if c.isdigit()))
    w2v = KeyedVectors(embed_size)
    model = tf.keras.models.load_model(model_path)
    all_weights = model.get_weights()
    # input_weights = all_weights[0]
    # embed_bias = all_weights[1]
    output_weights = all_weights[2].T
    # output_bias = all_weights[3]
    index = list(id2lib)
    weights = output_weights
    w2v.add(index, weights)
    return w2v
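# Example call, with an illustrative path. Note the size inference concatenates
# every digit in the path, so the path must contain no digits besides the
# embedding size (e.g. "model_100.keras" yields 100, but "model_100.h5" would
# yield 1005). id2lib is assumed, as above, to be defined elsewhere in the module.
kv = get_kv("model_100.keras")
print(kv.most_similar(list(id2lib)[0]))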
def test_add_single(self):
    """Test that adding entities one at a time works correctly."""
    entities = [f'___some_entity{i}_not_present_in_keyed_vectors___' for i in range(5)]
    vectors = [np.random.randn(self.vectors.vector_size) for _ in range(5)]

    # Test `add_vectors` on an already filled kv.
    for ent, vector in zip(entities, vectors):
        self.vectors.add_vectors(ent, vector)
    for ent, vector in zip(entities, vectors):
        self.assertTrue(np.allclose(self.vectors[ent], vector))

    # Test `add_vectors` on an empty kv.
    kv = KeyedVectors(self.vectors.vector_size)
    for ent, vector in zip(entities, vectors):
        kv.add_vectors(ent, vector)
    for ent, vector in zip(entities, vectors):
        self.assertTrue(np.allclose(kv[ent], vector))
def test_add_multiple(self):
    """Test that adding entities in bulk works correctly."""
    entities = ['___some_entity{}_not_present_in_keyed_vectors___'.format(i) for i in range(5)]
    vectors = [np.random.randn(self.vectors.vector_size) for _ in range(5)]

    # Test `add_vectors` on an already filled kv.
    vocab_size = len(self.vectors)
    self.vectors.add_vectors(entities, vectors, replace=False)
    self.assertEqual(vocab_size + len(entities), len(self.vectors))
    for ent, vector in zip(entities, vectors):
        self.assertTrue(np.allclose(self.vectors[ent], vector))

    # Test bulk assignment via `__setitem__` on an empty kv.
    kv = KeyedVectors(self.vectors.vector_size)
    kv[entities] = vectors
    self.assertEqual(len(kv), len(entities))
    for ent, vector in zip(entities, vectors):
        self.assertTrue(np.allclose(kv[ent], vector))
def load_keyedvectors(**kwargs):
    embeddings = load_embeddings(**kwargs)
    vectors = KeyedVectors(int(kwargs["size"]))
    vectors.add(embeddings.index, embeddings.values)
    return vectors
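# A minimal usage sketch, assuming load_embeddings returns a pandas DataFrame
# indexed by word (the .index / .values access above implies as much); the
# keyword arguments are illustrative.
kv = load_keyedvectors(path="embeddings.csv", size=100)
vec = kv[kv.index2word[0]]  # gensim 3.x attribute, matching .add() above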
def test_word_embeds():
    tokens: List[str] = ['私', 'は']
    kv: KeyedVectors = KeyedVectors(vector_size=200)
    embed = models._word_embeds(tokens, kv=kv, uniform_range=(-0.01, 0.01))
    assert embed.shape == (2, 200)
def test_word_embed():
    token: str = '私'
    kv: KeyedVectors = KeyedVectors(vector_size=200)
    embed = models._word_embed(token, kv=kv)
    assert embed.shape == (200,)
def __init__(self, vocab):
    self.wv = KeyedVectors()
    self.wv.vocab = vocab
indices = np.array(list(freq_vocab.values()))
M_ = (M[indices].T)[indices].T

# Square root of the matrix M before Arnoldi iteration
sqrt_M = M_.sqrt()

# Construct word embeddings by Arnoldi iteration
Q_file = f"./tmp/{tag}_Q_{win_size}.npy"
if os.path.exists(Q_file):
    Q = np.load(Q_file)
    logging.info("Successfully loaded embeddings matrix from %s.", Q_file)
else:
    logging.info("Start Arnoldi iterations.")
    b = np.random.random(size=max_vocab)  # initial vector
    Q, h = arnoldi_iteration(sqrt_M, b, embed_dim)
    logging.info("Successfully extracted word embeddings from Arnoldi iteration.")
    np.save(Q_file, Q)  # save word embeddings
    logging.info("Successfully saved word embedding matrix Q to %s.", Q_file)

dim = embed_dim
word2vec_file = f"./tmp/arnoldi_{dim}_{win_size}.kv"
Q_ = Q[:, :dim]
we_ = normalize(Q_, axis=1, norm="l2")  # L2-normalize each embedding row
kv = KeyedVectors(vector_size=dim)
kv.add(list(vocab_.keys()), we_)
kv.save_word2vec_format(word2vec_file)
logging.info("Successfully saved word embeddings of dimension %d to %s.", dim, word2vec_file)
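# arnoldi_iteration is referenced above but not defined in this fragment. A
# standard textbook sketch follows, assuming A is a square (m, m) operator
# supporting @, b an initial vector of length m, and n the number of Krylov
# steps; it returns Q with orthonormal columns spanning the Krylov subspace
# {b, Ab, ..., A^(n-1) b} and the (n+1, n) Hessenberg matrix h.
def arnoldi_iteration(A, b, n):
    m = b.shape[0]
    Q = np.zeros((m, n + 1))
    h = np.zeros((n + 1, n))
    Q[:, 0] = b / np.linalg.norm(b)
    for k in range(1, n + 1):
        v = A @ Q[:, k - 1]  # new Krylov direction
        for j in range(k):   # orthogonalize against previous columns
            h[j, k - 1] = Q[:, j] @ v
            v -= h[j, k - 1] * Q[:, j]
        h[k, k - 1] = np.linalg.norm(v)
        if h[k, k - 1] < 1e-12:  # breakdown: Krylov subspace exhausted
            return Q, h
        Q[:, k] = v / h[k, k - 1]
    return Q, h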
def setUp(self):
    kv: KeyedVectors = KeyedVectors(vector_size=200)
    self.swem = models.SWEM(kv)
def initialize_word_vectors(self):
    # word
    self.wv = KeyedVectors()
    # category
    self.cv = KeyedVectors()
def get_random_kv(embed_size):
    w2v = KeyedVectors(embed_size)
    w2v.add(list(id2lib), np.random.random((len(id2lib), embed_size)))
    return w2v
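# A one-line usage sketch; id2lib is assumed, as above, to be a module-level
# mapping from library ids to names.
kv = get_random_kv(embed_size=32)
assert kv.vectors.shape == (len(id2lib), 32)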