def embedding_seperate(path, words, model_embedding, hParams):
    """Copy the vectors for `words` out of a trained embedding model into a
    standalone KeyedVectors file saved at `path`."""
    model_embedding_kv = KeyedVectors(hParams.embedding_size)
    for word in words:
        try:
            model_embedding_kv.add([word], [model_embedding.wv[word]])
        except KeyError:
            pass  # skip words missing from the source embedding
    model_embedding_kv.save(path)
    return model_embedding_kv
def embedding_ayir(dizin, kelimeler, model_embedding, hParams):
    """Turkish-named variant of embedding_seperate: copy the vectors for
    `kelimeler` (words) into a fresh KeyedVectors and save it to `dizin` (path)."""
    model_embedding_kv = KeyedVectors(hParams.embedding_matris_boyut)
    for kelime in kelimeler:
        try:
            model_embedding_kv.add([kelime], [model_embedding.wv[kelime]])
        except KeyError:
            pass  # skip words missing from the source embedding
    model_embedding_kv.save(dizin)
    return model_embedding_kv
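# A minimal usage sketch for embedding_seperate / embedding_ayir. The demo
# function name, the toy corpus, and the SimpleNamespace standing in for the
# original hParams object are all assumptions; assumes gensim 3.x, where
# KeyedVectors.add exists and Word2Vec takes `size`.
def _demo_embedding_seperate():
    from types import SimpleNamespace
    from gensim.models import Word2Vec

    hParams = SimpleNamespace(embedding_size=50)
    corpus = [["hello", "world"], ["hello", "gensim"]]
    full_model = Word2Vec(corpus, size=hParams.embedding_size, min_count=1)
    subset = embedding_seperate("subset.kv", ["hello", "unknown_word"],
                                full_model, hParams)
    print(subset["hello"].shape)  # (50,) -- "unknown_word" was skipped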
def save_fasttext(vocab):
    # .vec files are plain word2vec text format, so load them with
    # KeyedVectors (FastText.load_word2vec_format is not available in
    # modern gensim).
    model = KeyedVectors.load_word2vec_format('../../corpora/wiki.en.vec')
    # create a new KeyedVectors
    kmodel = KeyedVectors(300)
    loss = 0
    for word in vocab:
        try:
            vec = model[word]
        except KeyError:
            loss += 1
            continue
        kmodel.add(word, vec, replace=True)
    print('missed words:', loss)
    kmodel.save('../../corpora/fasttext.wv')
def save_gnews(vocab):
    model = KeyedVectors.load_word2vec_format(
        '../../corpora/GoogleNews-vectors-negative300.bin', binary=True)
    # create a new KeyedVectors
    kmodel = KeyedVectors(300)
    loss = 0
    for word in vocab:
        try:
            vec = model[word]
        except KeyError:
            loss += 1
            continue
        kmodel.add(word, vec, replace=True)
    print('missed words:', loss)
    kmodel.save('../../corpora/gnews.wv')
def save_glove(vocab):
    # model = KeyedVectors.load_word2vec_format('../../corpora/glove.840B.300d.txt', binary=False)
    # (not used: the raw GloVe file has no word2vec header line, so it is
    # parsed manually instead)
    kmodel = KeyedVectors(300)
    vocab = set(vocab.to_list())
    with open('../../corpora/glove.840B.300d.txt', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = str(values[0])
            if word not in vocab:
                continue
            try:
                vec = np.asarray(values[1:], dtype='float32')
            except ValueError:
                continue  # skip malformed lines (e.g. tokens containing spaces)
            kmodel.add(word, vec, replace=True)
    kmodel.save('../../corpora/glove.wv')
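# Sketch of loading one of the trimmed vector files saved above and querying
# it (gensim 3.x; the path matches save_glove's output, and 'king' is an
# arbitrary probe word):
def _demo_load_trimmed():
    from gensim.models import KeyedVectors

    kmodel = KeyedVectors.load('../../corpora/glove.wv')
    if 'king' in kmodel:  # membership check works directly on KeyedVectors
        print(kmodel.most_similar('king', topn=3))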
def handle(self, output_file, debug_output_file, **options):
    logger.info("Building definition vectors")
    definitions = Definition.objects.filter(
        auto_translation_source_id__isnull=True
    ).prefetch_related("wordform__lemma")
    count = definitions.count()

    news_vectors = google_news_vectors()

    definition_vector_keys = []
    definition_vector_vectors = []
    unknown_words = set()

    with create_debug_output(debug_output_file) as debug_output:
        for d in tqdm(definitions.iterator(), total=count):
            keys = extract_keyed_words(d.semantic_definition, news_vectors,
                                       unknown_words)
            debug_output(
                json.dumps(
                    {
                        "definition": d.text,
                        "wordform_text": d.wordform.text,
                        "extracted_keys": keys,
                    },
                    ensure_ascii=False,
                ))
            if keys:
                vec_sum = vector_for_keys(news_vectors, keys)
                definition_vector_keys.append(definition_to_cvd_key(d))
                definition_vector_vectors.append(vec_sum)

    definition_vectors = KeyedVectors(vector_size=news_vectors.vector_size)
    definition_vectors.add_vectors(definition_vector_keys,
                                   definition_vector_vectors)
    output_file.parent.mkdir(exist_ok=True)
    definition_vectors.save(fspath(output_file))
def optimize_embeddings(
    vocabulary,
    embedding_file,
    output_embedding_path,
    n_dim,
    logging,
):
    """Shrink a large W2V embedding file down to just the vocabulary words."""
    original_embeddings = KeyedVectors.load(embedding_file, mmap='r')
    embeddings_redux = KeyedVectors(n_dim)
    words = []
    weights = []

    logging.info(
        "Generating optimized W2V embedding based on vocabulary words...")
    count = 0
    for word in vocabulary:
        try:
            vector = original_embeddings[word]
            words.append(word)
            weights.append(vector)
            count += 1
        except KeyError:
            logging.info(f'Embeddings: word "{word}" not found on embeddings!')

    del original_embeddings

    embeddings_redux.add(words, weights)
    del words
    del weights

    os.makedirs(os.path.dirname(output_embedding_path), exist_ok=True)
    embeddings_redux.save(output_embedding_path)
    del embeddings_redux

    logging.info(
        f'\n\nGenerated optimized Gensim W2V embedding file at '
        f'"{output_embedding_path}"'
    )
    logging.info(f'{count}/{len(vocabulary)} words found in embeddings')
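# Hedged usage sketch for optimize_embeddings. The vocabulary list and file
# paths are placeholders; note the function takes the logging module (or a
# logger) as an explicit parameter.
def _demo_optimize_embeddings():
    import logging

    logging.basicConfig(level=logging.INFO)
    optimize_embeddings(
        vocabulary=['cat', 'dog', 'xylophone'],
        embedding_file='embeddings/full.kv',          # placeholder path
        output_embedding_path='embeddings/small.kv',  # placeholder path
        n_dim=300,
        logging=logging,
    )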
def main():
    """Entry point."""
    parser = argparse.ArgumentParser("AWD-LSTM Embeddings to Word Vectors")
    parser.add_argument("--model", required=True)
    parser.add_argument("--dictionary", required=True)
    parser.add_argument("--output", required=True)
    args = parser.parse_args()

    dictionary = torch.load(args.dictionary)
    model = torch.load(args.model, map_location='cpu')
    # The first element of the checkpoint is the model; take its encoder
    # (embedding) weights as a plain numpy matrix.
    embeddings = model[0].encoder.weight.data.cpu().numpy()

    # Build the KeyedVectors by filling its internal fields directly
    # (gensim 3.x layout: syn0 / vocab / index2word).
    kv = KeyedVectors(embeddings.shape[1])
    kv.syn0 = embeddings
    kv.vocab = {
        w: Vocab(index=i)
        for i, w in enumerate(dictionary.dictionary.idx2word)
    }
    kv.index2word = dictionary.dictionary.idx2word
    kv.save(args.output)
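# After running main(), the saved vectors can be reloaded and queried like any
# other gensim 3.x KeyedVectors. A small sketch; `path` is whatever was passed
# as --output, and 'language' is an arbitrary probe word:
def _demo_query_converted(path):
    from gensim.models import KeyedVectors

    kv = KeyedVectors.load(path)
    if 'language' in kv:
        print(kv.most_similar('language', topn=5))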
def convert(dir_path):
    with open(os.path.join(dir_path, 'data/cord19q', 'cord19-300d.txt'),
              mode="r") as txtfile:
        lines = txtfile.readlines()

    # The first line is the word2vec-format header ("<count> <dimensions>"),
    # so the matrix gets one row per remaining line.
    num_entries = len(lines)
    vecs = np.zeros((num_entries - 1, 300), float)
    words = []
    for idx, line in enumerate(lines[1:]):
        tok = line.split(" ")
        words.append(tok[0])
        vecs[idx, :] = np.array([float(item) for item in tok[1:]])

    model = KeyedVectors(vecs.shape[1])
    model.add(words, vecs)
    model.save(os.path.join(dir_path, 'data/cord19q', 'cord19-300d.wv'))

    # load the model back and verify results
    model_ = KeyedVectors.load(
        os.path.join(dir_path, 'data/cord19q', 'cord19-300d.wv'))
def rename_wikipedia2vec_entities(src, tgt_w, tgt_f):
    from collections import Counter
    from gensim.models import KeyedVectors
    import numpy as np

    old_model = KeyedVectors.load(src, mmap='r')
    words = [
        word for word in old_model.vocab.keys()
        if not word.startswith("ENTITY/")
    ]
    titles = [
        word[7:] for word in old_model.vocab.keys()  # strip "ENTITY/" prefix
        if word.startswith("ENTITY/")
    ]
    titles = [title for title in titles if "#" not in title]
    title2deviant_title = {title.split("|")[0]: title for title in titles}
    titles = [title.split("|")[0] for title in titles]

    # Map Wikipedia titles to Wikidata and Freebase identifiers.
    t2w = title2wikidata(titles)
    t2f = init_mapping(titles)
    w2f = wikidata2freebase(sum([list(x) for x in t2w.values()], []))
    for title in t2w:
        for w in t2w[title]:
            for f in w2f[w]:
                t2f[title].add(f)

    # Invert the mappings.
    w2t = init_mapping(sum([list(x) for x in t2w.values()], []))
    f2t = init_mapping(sum([list(x) for x in t2f.values()], []))
    for title in titles:
        for w in t2w[title]:
            w2t[w].add(title)
        for f in t2f[title]:
            f2t[f].add(title)

    print("Some stats", flush=True)
    print("t2f", len(t2f), Counter([len(x) for x in t2f.values()]))
    print("w2f", len(w2f), Counter([len(x) for x in w2f.values()]))
    print("t2w", len(t2w), Counter([len(x) for x in t2w.values()]))
    print("f2t", len(f2t), Counter([len(x) for x in f2t.values()]))
    print("w2t", len(w2t), Counter([len(x) for x in w2t.values()]))

    w_vecs = {word: old_model[word] for word in words}

    # Freebase model: plain words plus one vector per Freebase id, averaged
    # over all titles that map to that id.
    freebase_model = KeyedVectors(old_model.vector_size)
    freebase_model.add(words, [w_vecs[word] for word in words])
    freebase_words = list(f2t.keys())
    f_vecs = {
        f: np.mean([
            old_model["ENTITY/" + title2deviant_title[title]]
            for title in f2t[f]
        ], 0)
        for f in freebase_words
    }
    freebase_model.add(freebase_words,
                       [f_vecs[word] for word in freebase_words])
    freebase_model.save(tgt_f)
    del freebase_model

    # Wikidata model: same construction, keyed by Wikidata ids.
    wikidata_model = KeyedVectors(old_model.vector_size)
    wikidata_model.add(words, [w_vecs[word] for word in words])
    wikidata_words = list(w2t.keys())
    w_vecs = {
        w: np.mean([
            old_model["ENTITY/" + title2deviant_title[title]]
            for title in w2t[w]
        ], 0)
        for w in wikidata_words
    }
    wikidata_model.add(wikidata_words,
                       [w_vecs[word] for word in wikidata_words])
    wikidata_model.save(tgt_w)
    del wikidata_model
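# Sketch: reload the entity-keyed models saved above and run a cross-type
# query. Words and entity ids share one vector space, so similarity between a
# word and an id is meaningful; 'Q42' and 'writer' are placeholder keys, not
# guaranteed to be present.
def _demo_entity_models(tgt_w, tgt_f):
    from gensim.models import KeyedVectors

    wikidata_model = KeyedVectors.load(tgt_w, mmap='r')
    freebase_model = KeyedVectors.load(tgt_f, mmap='r')
    if 'Q42' in wikidata_model and 'writer' in wikidata_model:
        print(wikidata_model.similarity('writer', 'Q42'))
    print(len(freebase_model.vocab), 'keys in the Freebase-keyed model')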
def test_build_vocab_build_vocab_from_embeddings():
    """
    This test shows that all fields in the embeddings will be included.

    In embeddings and data: blue green yellow
    In embeddings only: purple gold
    In data only: white

    Expected vocab: blue green yellow purple gold white
    """
    model = KeyedVectors(10)
    model.add('purple', np.random.rand(10))
    model.add('gold', np.random.rand(10))
    model.add('<unk>', np.random.rand(10))
    model.add('blue', np.random.rand(10))
    model.add('green', np.random.rand(10))
    model.add('<pad>', np.random.rand(10))
    model.add('yellow', np.random.rand(10))

    with tempfile.NamedTemporaryFile() as tmpfile:
        model.save(tmpfile.name)
        field = TextField.from_embeddings(
            embeddings=tmpfile.name,
            embeddings_format='gensim',
            build_vocab_from_embeddings=True,
        )

    dummy = ["blue green", "yellow", "white"]
    field.setup(dummy)

    # assert vocab setup in expected order; 'white' has no embedding, so it
    # maps to the <unk> index
    assert field.vocab == odict([
        ('<pad>', 0),
        ('<unk>', 1),
        ('blue', 2),
        ('green', 3),
        ('yellow', 4),
        ('white', 1),
        ('purple', 5),
        ('gold', 6),
    ])

    # assert embedding matrix organized in expected order
    assert torch.equal(
        field.embedding_matrix,
        torch.stack([
            torch.tensor(model['<pad>']),
            torch.tensor(model['<unk>']),
            torch.tensor(model['blue']),
            torch.tensor(model['green']),
            torch.tensor(model['yellow']),
            torch.tensor(model['purple']),
            torch.tensor(model['gold']),
        ]),
    )
# (script fragment -- the preceding context is elided: `embeddings` has been
# loaded, `liwc_embeddings`, `words`, `weights`, `found`, and `count`
# initialized, and a loop over the LIWC dictionary keys opened)
    vector = embeddings[key]
    words.append(key)
    weights.append(vector)
    found += 1

if len(words) > 0 and len(weights) > 0:
    liwc_embeddings.add(words, weights)
else:
    del embeddings
    sys.exit("No LIWC dictionary words found in the embeddings!")

del embeddings
del words
del weights

output_path = os.path.join(OUTPUT_FOLDER, f'{args.lang}_liwc.w2v')
logging.info(f'Saving LIWC embeddings to {output_path}...')
os.makedirs(os.path.dirname(output_path), exist_ok=True)
liwc_embeddings.save(output_path)
del liwc_embeddings

logging.info('LIWC embeddings saved')
logging.info(f'{found}/{count} LIWC words were found in the embeddings')

end = time.time()
logging.info(f'Elapsed execution time: {end - start}s')