def test_no_header(self):
    randkv = KeyedVectors(vector_size=100)
    count = 20
    keys = [str(i) for i in range(count)]
    weights = [pseudorandom_weak_vector(randkv.vector_size) for _ in range(count)]
    randkv.add_vectors(keys, weights)
    tmpfiletxt = gensim.test.utils.get_tmpfile("tmp_kv.txt")
    randkv.save_word2vec_format(tmpfiletxt, binary=False, write_header=False)
    reloadtxtkv = KeyedVectors.load_word2vec_format(tmpfiletxt, binary=False, no_header=True)
    self.assertEqual(randkv.index_to_key, reloadtxtkv.index_to_key)
    self.assertTrue((randkv.vectors == reloadtxtkv.vectors).all())
def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='*dt_', fvocab=None, binary=False):
    """Store the input-hidden weight matrix.

    `fname` is the file used to save the vectors in.
    `doctag_vec` is an optional boolean indicating whether to store document vectors.
    `word_vec` is an optional boolean indicating whether to store word vectors
    (if both doctag_vec and word_vec are True, then both vectors are stored in the same file).
    `prefix` uniquely identifies doctags from word vocab, to avoid collision
    in case of repeated strings in doctag and word vocab.
    `fvocab` is an optional file used to save the vocabulary.
    `binary` is an optional boolean indicating whether the data is to be saved
    in binary word2vec format (default: False).

    """
    total_vec = len(self.wv.vocab) + len(self.docvecs)
    # save word vectors
    if word_vec:
        if not doctag_vec:
            total_vec = len(self.wv.vocab)
        KeyedVectors.save_word2vec_format(self.wv, fname, fvocab, binary, total_vec)
    # save document vectors
    if doctag_vec:
        with utils.smart_open(fname, 'ab') as fout:
            if not word_vec:  # header was not written yet; write one for the doctag vectors
                total_vec = len(self.docvecs)
                logger.info("storing %sx%s projection weights into %s", total_vec, self.vector_size, fname)
                fout.write(utils.to_utf8("%s %s\n" % (total_vec, self.vector_size)))
            # store as in input order
            for i in range(len(self.docvecs)):
                doctag = u"%s%s" % (prefix, self.docvecs.index_to_doctag(i))
                row = self.docvecs.doctag_syn0[i]
                if binary:
                    fout.write(utils.to_utf8(doctag) + b" " + row.tostring())
                else:
                    fout.write(utils.to_utf8("%s %s\n" % (doctag, ' '.join("%f" % val for val in row))))
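# A minimal usage sketch for the method above (hypothetical corpus and file
# name; assumes the gensim 3.x-era API, where this method lives on the
# Doc2Vec model itself):
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

docs = [
    TaggedDocument(words=["hello", "world"], tags=["doc0"]),
    TaggedDocument(words=["foo", "bar"], tags=["doc1"]),
]
model = Doc2Vec(docs, vector_size=50, min_count=1, epochs=5)

# Write word vectors and doctag vectors into one word2vec-format file; the
# '*dt_' prefix keeps doctag keys from colliding with word keys.
model.save_word2vec_format("doc2vec_all.txt", doctag_vec=True, word_vec=True)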
def create_small_w2v_model(num_most_common_words=500000, cache_dir=W2VDIR):
    orig_model = load_word_vector_model(small=False, cache_dir=cache_dir)
    words = orig_model.index2entity[:num_most_common_words]
    kv = KeyedVectors(vector_size=orig_model.wv.vector_size)

    vectors = []
    for word in words:
        vectors.append(orig_model.get_vector(word))

    # adds keys (words) & vectors as batch
    kv.add(words, vectors)

    w2v_small_filename = fewshot_filename(cache_dir, W2V_SMALL)
    kv.save_word2vec_format(w2v_small_filename, binary=True)
def _load_small_word_vector_model(cache_dir, num_most_common_words=500000):
    filename = fewshot_filename(cache_dir, W2V_SMALL)
    if not os.path.exists(filename):
        orig_model = _load_large_word_vector_model(cache_dir)
        words = orig_model.index2entity[:num_most_common_words]
        kv = KeyedVectors(vector_size=orig_model.wv.vector_size)

        vectors = []
        for word in words:
            vectors.append(orig_model.get_vector(word))

        # adds keys (words) & vectors as batch
        kv.add(words, vectors)

        w2v_small_filename = fewshot_filename(cache_dir, W2V_SMALL)
        kv.save_word2vec_format(w2v_small_filename, binary=True)

    return KeyedVectors.load_word2vec_format(filename, binary=True)
def minimize(_log, vectors_path="wiki.en.vec", output_path="wiki.min.en.vec"):
    """Minimize the given vectors file to contain only words in the given corpus."""
    samples = {wh: list(read_samples(which=wh)) for wh in ["train", "test"]}
    try:
        samples["dev"] = list(read_samples(which="dev"))
    except FileNotFoundError:
        pass  # skip if not exist

    vocab = Vocab.from_samples(chain(*samples.values()))
    kv = KeyedVectors.load_word2vec_format(vectors_path)

    _log.info("Creating new, minimized word vectors")
    min_kv = KeyedVectors(kv.vector_size)
    for w in kv.vocab:
        if w in vocab["words"]:
            min_kv[w] = kv[w]

    _log.info("Saving the new word vectors to %s", output_path)
    min_kv.save_word2vec_format(output_path)
indices = np.array(list(freq_vocab.values()))
M_ = (M[indices].T)[indices].T

# Square root of the matrix M before Arnoldi iteration
sqrt_M = M_.sqrt()

# Construct word embeddings by Arnoldi iteration
Q_file = f"./tmp/{tag}_Q_{win_size}.npy"
if os.path.exists(Q_file):
    Q = np.load(Q_file)
    logging.info("Successfully loaded embeddings matrix from %s.", Q_file)
else:
    logging.info("Start Arnoldi iterations.")
    b = np.random.random(size=max_vocab)  # initial vector
    Q, h = arnoldi_iteration(sqrt_M, b, embed_dim)
    logging.info("Successfully extracted word embeddings from Arnoldi iteration.")
    np.save(Q_file, Q)  # save word embeddings
    logging.info("Successfully saved word embedding matrix Q to %s.", Q_file)

dim = embed_dim
word2vec_file = f"./tmp/arnodi_{dim}_{win_size}.kv"
Q_ = Q[:, :dim]
we_ = normalize(Q_, axis=1, norm="l2")  # L2-normalize each embedding row
kv = KeyedVectors(vector_size=dim)
kv.add(list(vocab_.keys()), we_)
kv.save_word2vec_format(word2vec_file)
logging.info("Successfully saved word embeddings of dimension %d to %s.", dim, word2vec_file)
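# The snippet above calls an arnoldi_iteration helper that is not shown.
# Below is a minimal sketch of the standard Arnoldi iteration it presumably
# refers to: build an orthonormal basis Q of the Krylov subspace
# span{b, Ab, A^2 b, ...} along with the Hessenberg matrix h. It assumes A
# is a square matrix (dense or scipy.sparse) supporting the @ operator.
import numpy as np

def arnoldi_iteration(A, b, n):
    m = b.shape[0]
    Q = np.zeros((m, n + 1))
    h = np.zeros((n + 1, n))
    Q[:, 0] = b / np.linalg.norm(b)
    for k in range(n):
        v = A @ Q[:, k]  # expand the Krylov subspace by one vector
        for j in range(k + 1):  # modified Gram-Schmidt against earlier columns
            h[j, k] = Q[:, j] @ v
            v = v - h[j, k] * Q[:, j]
        h[k + 1, k] = np.linalg.norm(v)
        if h[k + 1, k] < 1e-12:  # "happy breakdown": subspace is invariant
            return Q, h
        Q[:, k + 1] = v / h[k + 1, k]
    return Q, h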
from gensim.models.keyedvectors import KeyedVectors

if __name__ == '__main__':
    # Load GloVe vectors already converted to word2vec text format, then
    # re-save them in the more compact binary word2vec format.
    w2v = KeyedVectors.load_word2vec_format('glove500_w2v.txt', binary=False, unicode_errors='ignore')
    w2v.save_word2vec_format('glove500_w2v.bin', binary=True)
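# Optional sanity check (a hedged sketch reusing the file names above, with
# the gensim 3.x `vocab` attribute): reload both files and confirm the
# vocabulary survived the text-to-binary round trip.
from gensim.models.keyedvectors import KeyedVectors

w2v_txt = KeyedVectors.load_word2vec_format('glove500_w2v.txt', binary=False, unicode_errors='ignore')
w2v_bin = KeyedVectors.load_word2vec_format('glove500_w2v.bin', binary=True)
assert len(w2v_bin.vocab) == len(w2v_txt.vocab)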
def save_w2v_model(file_name: str, w2v_model: KeyedVectors):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    w2v_model.save_word2vec_format(file_name, binary=True)
    print("word2vec model '%s' saved" % file_name)
    return w2v_model
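# Hypothetical usage of save_w2v_model above, assuming the gensim 4.x
# KeyedVectors API (add_vectors); keys and vectors are toy values.
from gensim.models import KeyedVectors

kv = KeyedVectors(vector_size=3)
kv.add_vectors(["apple", "banana"], [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]])
save_w2v_model("tiny_w2v.bin", kv)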