def load_saved_word_embeddings(w2v, fasttext):
    global wv_model
    if w2v:
        wv_model = KeyedVectors.load(word2vec_data(w2v))
        global vector_size
        vector_size = w2v
    elif fasttext:
        wv_model = KeyedVectors.load(fasttext_data)
    return wv_model
def train_translation(model_source_path, model_target_path, transmat_outpath):
    word_pairs = []
    source_model = KeyedVectors.load(model_source_path)
    target_model = KeyedVectors.load(model_target_path)
    # use words shared by both vocabularies as the anchor pairs
    for word in target_model.wv.vocab:
        if word in source_model.wv.vocab:
            word_pairs.append((word, word))
    trans_model = TranslationMatrix(source_model.wv, target_model.wv,
                                    word_pairs=word_pairs)
    trans_model.save(transmat_outpath)  # was `transmat_path`, an undefined name
    return trans_model
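# Hedged usage sketch (not part of the original source; paths are hypothetical).
# gensim's TranslationMatrix exposes a translate() method that maps source words
# to their top-n nearest candidates in the target space.
def _demo_train_translation():
    trans = train_translation('en_vectors.kv', 'de_vectors.kv', 'transmat.model')
    print(trans.translate(['graph'], topn=3))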
def plot_data(data_path, index, save_path, filenames_get, embedding_size):
    path_model_load = data_path + "/" + filenames_get
    print("########")
    print(path_model_load)
    model = KeyedVectors.load(path_model_load)
    words = list(model.wv.vocab.keys())
    length = len(words)
    print(filenames_get[1])
    X = np.empty((length, embedding_size))  # was hardcoded to 128
    count = 0
    un_seen_node = 0
    for node in words:
        try:
            vec_one = model[node]
        except KeyError:
            # fall back to a random vector for nodes missing from the model
            vec_one = np.random.rand(embedding_size)
            un_seen_node += 1
        X[count, :] = vec_one
        count += 1
    y_pred = KMeans(n_clusters=6, random_state=2017, max_iter=15).fit_predict(X)
    X_embedded = None
    # print("do the TSNE")
    # if not exists(save_path):
    #     X_embedded = TSNE(n_components=3).fit_transform(X)
    #     save_pickle(save_path, X_embedded)
    # else:
    #     X_embedded = load_pickle(save_path)
    return X, y_pred, X_embedded
def main(): print("good") model = models.Word2Vec.load("D:\\NTUST\\人工智慧\\final\\csv\\code250.model.bin") # print(len(model.wv.vocab)) #print(model.wv.vocab[0]) word_vectors = KeyedVectors.load("D:\\NTUST\\人工智慧\\final\\csv\\code250.model.bin") print(word_vectors["市長"]) ''' output = open('D:\\NTUST\\人工智慧\\final\\csv\\wordvec.csv', 'w') count=0; for word in model.wv.vocab: print(word+"~~", end='') output.write(word) print(len(word_vectors[word]),"~", end='') for item in word_vectors[word]: print(item, end='') output.write(",") output.write(str(item)) output.write("\n") print() #count+=1 #if(count==20): # break output.close() ''' '''
def get_embeddings():
    # build index mapping words in the embeddings set
    # to their embedding vector

    # GloVe vectors
    glove_embeddings_index = {}
    f = open(os.path.join(EMBEDDING_DIR, 'glove.6B.100d.txt'))
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        glove_embeddings_index[word] = coefs
    f.close()

    # word2vec model
    bible_embeddings_index = {}
    bible_model = KeyedVectors.load(EMBEDDING_DIR + 'bible_word2vec')
    bible_vocab = bible_model.vocab
    for word in bible_vocab:
        bible_embeddings_index[word] = bible_model[word]
        # print("WORD: ", word)
        # print("VECTOR: ", bible_model[word])

    combined_index = combine_embeddings(glove_embeddings_index, bible_embeddings_index)
    print('Found word vectors: ', len(combined_index))
    print('dimensions: ', len(combined_index["god"]))
    print('dimensions: ', len(combined_index["bottle"]))
    return combined_index
def _load_gensim_word2vec_model(self, model_uri=None, max_lru_cache_size=1024):
    """
    Loads a pre-trained Gensim word2vec keyed vector model from either local disk or Redis

    >>> from textpipe.doc import Doc
    >>> model = Doc('')._load_gensim_word2vec_model('tests/models/gensim_test_nl.kv')
    >>> type(model)
    <class 'gensim.models.keyedvectors.Word2VecKeyedVectors'>
    """
    lang = self.language if self.is_reliable_language else self.hint_language
    if not self._gensim_vectors or lang not in self._gensim_vectors:
        if urlparse(model_uri).scheme == 'redis':
            vectors = RedisKeyedVectors(model_uri, lang, max_lru_cache_size)
            if not vectors.exists:
                raise TextpipeMissingModelException(
                    f'Redis does not contain a model for language {lang}. '
                    f'The model needs to be loaded before use '
                    f'(see load_keyed_vectors_into_redis).')
        elif model_uri:
            try:
                vectors = KeyedVectors.load(model_uri, mmap='r')
            except FileNotFoundError:
                raise TextpipeMissingModelException(
                    f'Gensim keyed vector file {model_uri} is not available.')
        else:
            raise TextpipeMissingModelException(
                'Either specify a model filename or a Redis URI')
        self._gensim_vectors[lang] = vectors
    return self._gensim_vectors[lang]
def load_full_data():
    train_data = pd.read_excel(url_full_train_data, 'Sheet1')
    test_data = train_data[4415:4914]
    train_data = train_data.drop(test_data.index)
    texts = train_data.text
    tokenizer = Tokenizer(num_words=NUM_WORDS,
                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'',
                          lower=True)
    tokenizer.fit_on_texts(texts)
    sequences_train = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    X_train = pad_sequences(sequences_train, maxlen=max_length, padding=pad[0])
    word_vectors = KeyedVectors.load(url_word2vec_full, mmap='r')
    vocabulary_size = min(len(word_index) + 1, NUM_WORDS)
    embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
    for word, i in word_index.items():
        if i >= NUM_WORDS:
            continue
        try:
            embedding_vector = word_vectors[word]
            embedding_matrix[i] = embedding_vector
        except KeyError:
            embedding_matrix[i] = np.random.normal(0, np.sqrt(0.25), EMBEDDING_DIM)
    del word_vectors
    return tokenizer, sequences_train, X_train
def __init__(self, emb_dim, hid_dim, z_dim,
             word2vec_file='data/word2vec_recipes.bin',
             with_attention=True):
    super(IngredientsEncoderRNN, self).__init__()
    wv = KeyedVectors.load(word2vec_file, mmap='r')
    vec = torch.from_numpy(np.copy(wv.vectors)).float()
    # the first two indices have special meaning, see load_dict() in utils.py
    emb = nn.Embedding(vec.shape[0] + 2, vec.shape[1], padding_idx=0)
    emb.weight.data[2:].copy_(vec)
    # for p in emb.parameters():
    #     p.requires_grad = False
    self.embed_layer = emb
    print('IngredientsEncoderRNN:', emb)
    self.rnn = nn.GRU(input_size=emb_dim, hidden_size=hid_dim,
                      bidirectional=True, batch_first=True)
    self.with_attention = with_attention
    if with_attention:
        self.atten_layer = AttentionLayer(2 * hid_dim, with_attention)
def main(args):
    print('Loading datasets...')
    X, y = load_data_and_labels(args.data_path)
    x_train, x_valid, y_train, y_valid = train_test_split(X, y,
                                                          test_size=0.1,
                                                          random_state=42)
    embeddings = KeyedVectors.load(args.embedding_path).wv

    print('Transforming datasets...')
    p = IndexTransformer()
    p.fit(X, y)
    embeddings = filter_embeddings(embeddings, p._word_vocab, embeddings.vector_size)

    print('Building a model...')
    model = BiLSTMCRF(char_vocab_size=p.char_vocab_size,
                      word_vocab_size=p.word_vocab_size,
                      num_labels=p.label_size,
                      embeddings=embeddings,
                      char_embedding_dim=50)
    model.build()

    print('Training the model...')
    trainer = Trainer(model, preprocessor=p)
    trainer.train(x_train, y_train, x_valid, y_valid)

    print('Saving the model...')
    model.save(args.weights_file, args.params_file)
    p.save(args.preprocessor_file)
def word2vec(request):
    model_path = '/home/hasher/Documents/textbook_trained'
    w2v_model = KeyedVectors.load(model_path)
    ds = DocSim(w2v_model)

    with open(MEDIA_ROOT + "/texts/blueprint.txt", "r") as df:
        blueprint = df.read()

    # target_docs = [source_doc, 'Beggar was so poor that he had no money',
    #                'Beggar was very rich and had lots of money',
    #                'Beggar had nothing to buy slippers']
    with open(MEDIA_ROOT + "/texts/answersheet.txt", "r") as df:
        answer = df.read()
    answer = nltk.sent_tokenize(answer)
    print(answer)
    answer.append(blueprint)

    # This will return the target docs with similarity scores
    sim_scores = ds.calculate_similarity(blueprint, answer)
    output = {
        'model': 'Word2vec',
        'blueprint': blueprint,
        'output': sim_scores,
    }
    request = HttpRequest()
    request.output = output
    return generate_model_report(request)
def build_w2v(
        path_train_union_test,
        path_words_vectors,
        w2v_model_path='/home/penglu/LewPeng/TranSummary/lcsts_data/word2vec/embedding/w2v.model',
        min_count=10):
    w2v = Word2Vec(sentences=LineSentence(path_train_union_test),
                   size=512, window=5, min_count=min_count, iter=5)
    w2v.save(w2v_model_path)
    model = Word2Vec.load(w2v_model_path)
    model.wv.save_word2vec_format(
        '/home/penglu/LewPeng/TranSummary/lcsts_data/word2vec/embedding/w2v.vector'
    )
    logging.info('Corpus count: {}'.format(model.corpus_count))
    logging.info('Vocabulary size: {}'.format(len(model.wv.vocab)))
    model = KeyedVectors.load(w2v_model_path)
    words_vectors = {}
    for word in model.wv.vocab:
        words_vectors[word] = model[word]
    dump_pkl(words_vectors, path_words_vectors, overwrite=True)
def trim_wemb(conf, econf):
    '''
    Trim embeddings to the minimally required size and load them.
    '''
    # Load the embeddings from disk.
    fn = econf.embedding_fn
    if fn.endswith('.kv'):
        wv = KeyedVectors.load(fn, mmap='r')
    else:
        wv = KeyedVectors.load_word2vec_format(fn, binary=fn.endswith('.bin'))
    # Account for mapping changes due to preprocessing (e.g. stemming).
    vocab = _adapt_mapping(econf, wv)
    vocab = {w: e.index for w, e in vocab.items()}
    # Reduce the matrix to the actual vocabulary of the dataset.
    used = _get_dataset_vocab(conf, econf, vocab)
    mapping = {old: new for new, old in enumerate(sorted(used))}  # preserve order
    ds_vocab = [None] * len(mapping)
    # Add two rows in the beginning: one for padding and one for unknown words.
    shape = len(mapping) + 2, wv.vectors.shape[1]
    matrix = np.zeros(shape, dtype=wv.vectors.dtype)
    matrix[1] = np.random.standard_normal(shape[1])  # unknown words
    for w, i in vocab.items():
        n = mapping.get(i)
        if n is not None:
            ds_vocab[n] = w
            matrix[n + 2] = wv.vectors[i]
    return ds_vocab, matrix
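# Hedged companion sketch (not part of the original source): illustrates the row
# layout produced by trim_wemb, where row 0 is the padding vector, row 1 is the
# unknown-word vector, and the n-th dataset word lives at row n + 2.
def _lookup_trimmed(word, ds_vocab, matrix):
    try:
        return matrix[ds_vocab.index(word) + 2]  # known word
    except ValueError:
        return matrix[1]  # fall back to the unknown-word row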
def __init__(self, path_out: str) -> None:
    self.path_out = path_out
    self.classifier = HypernymClassifier(path_out)
    self.classifier.load()
    self.path_emb = os.path.join(
        path_out, 'embeddings/embs_token_global_Word2Vec.vec')
    self.term_id_to_emb = KeyedVectors.load(self.path_emb)
def gensim_model2txt():
    from gensim.models.keyedvectors import KeyedVectors
    # https://bit.ly/3pXwIDx
    model = KeyedVectors.load(
        './trained_model/wiki_model/word2vec_wiki_zh.model.bin')
    model.wv.save_word2vec_format(
        './trained_model/wiki_model/word2vec_wiki_zh.model.txt', binary=False)
def __init__(self):
    # self.model = Word2Vec.load(GENSIM_MODEL)
    self.model = KeyedVectors.load_word2vec_format(GENSIM_MODEL, binary=False)
    self.gloveModel = KeyedVectors.load(GLOVE_MODEL)
    self.wordLemmas = pd.read_csv(INPUT_FILE, index_col=0)
    self.data = pd.read_csv(FILE_TO_UPDATE, index_col=0)
def open_word2vec(w2v_bin_path, binary=True):
    try:
        model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=binary)
    except Exception:
        # Fall back to gensim's native (KeyedVectors.save) format,
        # whose load() takes no `binary` argument.
        print('Loading original word2vec format failed. Trying Gensim format.')
        model = KeyedVectors.load(w2v_bin_path)
    return model
def create_embedding_matrix(self):
    model_ug_cbow = KeyedVectors.load('w2v_model_ug_cbow.word2vec')
    model_ug_sg = KeyedVectors.load('w2v_model_ug_sg.word2vec')
    # concatenate the CBOW and skip-gram vectors into one 200-d vector per word
    embeddings_index = {}
    for w in model_ug_cbow.wv.vocab.keys():
        embeddings_index[w] = np.append(model_ug_cbow.wv[w], model_ug_sg.wv[w])
    print('Found %s word vectors.' % len(embeddings_index))
    embedding_matrix = np.zeros((self.vocabulary_size, 200))
    for word, i in self.tokenizer.word_index.items():
        if i > self.vocabulary_size - 1:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix
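# Hedged usage sketch (not part of the original source): a matrix like the one
# built by create_embedding_matrix is typically passed to a Keras Embedding layer
# as frozen initial weights. Assumes the TensorFlow 2.x Keras API; `seq_len` is a
# hypothetical parameter name.
def _make_embedding_layer(embedding_matrix, seq_len):
    from tensorflow.keras.layers import Embedding
    vocab_size, dim = embedding_matrix.shape
    return Embedding(vocab_size, dim, weights=[embedding_matrix],
                     input_length=seq_len, trainable=False)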
def load_gensim_kv(filename=None, path=None, mmap=None):
    if path is not None:
        return KeyedVectors.load(path, mmap=mmap)
    elif filename is not None:
        for dir_path in ASSET_SEARCH_DIRS:
            try:
                path = os.path.join(dir_path, filename)
                return KeyedVectors.load(path, mmap=mmap)
            except FileNotFoundError:
                continue
        raise FileNotFoundError(
            "Please make sure that 'filename' specifies the word vector "
            "binary name in the default search paths or that 'path' "
            "specifies the file path of the binary.")
    else:
        raise TypeError(
            "load_gensim_kv() requires either 'filename' or 'path' to be set.")
def test_word_2vec():
    from gensim.models.keyedvectors import KeyedVectors
    model_id = "honeypot_clean_model_revised"  # renamed from `id`, a builtin
    mymodel = KeyedVectors.load(model_id)
    n_dim = mymodel.wv.syn0.shape[1]
    print(n_dim)
    print(mymodel.wv.most_similar("today"))
def loadModel(model):
    # loadedModel = gensim.models.KeyedVectors.load_word2vec_format(model)
    # loadedModel = gensim.models.KeyedVectors.load_word2vec_format(model, binary=True)
    # loadedModel.save("2ndmodel.txt")
    print("SAVED")
    loadedModel = KeyedVectors.load(model)
    print("LOADED")
    return loadedModel
def obtainModel(path):
    # file = basename(normpath(path))
    file = path + '.prep'  # saving in the same folder
    if exists(file):
        return KeyedVectors.load(file)
    else:
        model = KeyedVectors.load_word2vec_format(path, binary=False)
        model.save(file)
        return model
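# Hedged usage sketch (hypothetical file name): the first call pays the slow
# load_word2vec_format cost and caches a native-format copy; repeat calls are fast.
def _demo_obtain_model():
    kv = obtainModel('vectors.txt')  # converts and writes vectors.txt.prep
    kv = obtainModel('vectors.txt')  # loads the cached .prep file directly
    return kv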
def create_db_fasttext(file_name, path_to_db):
    documents1 = []
    documents2 = []
    related = {}
    morph = pymorphy2.MorphAnalyzer()
    df = pd.read_csv("quora.csv")
    for i, row in df[:5000].iterrows():
        id1 = "q" + str(i)
        id2 = "d" + str(i)
        doc1 = str(row["question1"]).lower()
        doc1 = re.split(r"[^а-яё]+", doc1)
        doc1 = [morph.parse(word)[0].normal_form for word in doc1]
        documents1.append(doc1)
        doc2 = str(row["question2"]).lower()
        doc2 = re.split(r"[^а-яё]+", doc2)
        doc2 = [morph.parse(word)[0].normal_form for word in doc2]
        documents2.append(doc2)
        if row["is_duplicate"] == 1:
            if id1 not in related:
                related[id1] = []
            related[id1].append(id2)
    model_file = './fasttext/model.model'
    model = KeyedVectors.load(model_file)
    dimensions = model.vector_size
    data = []
    for document in documents2:
        # average the fastText vectors of the words in each document
        doc_embedding = np.zeros(dimensions, dtype="float64")
        for word in document:
            if word in model:
                doc_embedding += model[word]
        doc_embedding = doc_embedding / len(document)
        data.append(doc_embedding)
    data = np.array(data)
    data = data.reshape((5000, dimensions))
    path = os.path.join(path_to_db, "fasttext")
    if not os.path.exists(path):
        os.mkdir(path)
    with open(os.path.join(path, "documents.pickle"), "wb") as pickle_file:
        pickle.dump(documents2, pickle_file)
    with open(os.path.join(path, "data.pickle"), "wb") as pickle_file:
        pickle.dump(data, pickle_file)
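# Hedged companion sketch (not part of the original source): ranking the pickled
# averaged-fastText document matrix against a query vector by cosine similarity.
def _rank_by_cosine(query_embedding, data):
    norms = np.linalg.norm(data, axis=1) * np.linalg.norm(query_embedding)
    scores = data @ query_embedding / np.where(norms == 0, 1, norms)
    return np.argsort(scores)[::-1]  # document indices, best match first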
def __init__(self, indexing=False):
    # load the model once (the original loaded it twice) and make sure
    # indexed_corpora always exists so the final check cannot raise AttributeError
    self.model = KeyedVectors.load(fast_model)
    self.indexed_corpora = None
    if indexing:
        log.info("Indexing starts")
        with open("Lemmatized_corpus.pickle", 'rb') as f:
            texts = pickle.load(f)
        indexed = []
        for doc in texts:
            indexed.append(lookup(doc.split(' '), self.model.wv))
        with open("Fasttext_matrix.pickle", 'wb') as f:
            pickle.dump(indexed, f)
        self.indexed_corpora = indexed
    if not self.indexed_corpora:
        with open("Fasttext_matrix.pickle", 'rb') as f:
            self.indexed_corpora = pickle.load(f)
def load_model(filepath, keyed_vec=False):
    """
    Instantiate a pre-trained model located at `filepath`.

    If read-only model vectors were trained by another application,
    set `keyed_vec=True`. Otherwise, a word2vec model is assumed.
    """
    if keyed_vec:
        model = KeyedVectors.load(filepath)
    else:
        model = Word2Vec.load(filepath)
    return model
def __init__(self, filename):
    super().__init__()
    self.fasttext_model = KeyedVectors.load(filename)
    self.vocab = self.fasttext_model.wv.vocab
    self.wv = self.fasttext_model.wv
    self.vector_size = self.fasttext_model.vector_size
    # accumulate the total token count across the vocabulary
    self.Z = 0
    for k in self.vocab:
        self.Z += self.vocab[k].count
def get_init_parameters(path, ext=None):
    if ext == 'vec':
        word_model = KeyedVectors.load_word2vec_format(path).wv
    else:
        word_model = KeyedVectors.load(path).wv
    n_words = len(word_model.vocab)
    vocab_dim = word_model[word_model.index2word[0]].shape[0]
    index_dict = dict()
    for i in range(n_words):
        # start word indices at 1 so that index 0 stays reserved (e.g. for padding)
        index_dict[word_model.index2word[i]] = i + 1
    return word_model, index_dict, n_words, vocab_dim
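# Hedged companion sketch (not part of the original source): turning the output of
# get_init_parameters into a dense matrix whose row 0 stays reserved for padding,
# matching the i + 1 offset used in index_dict.
def _build_matrix(word_model, index_dict, n_words, vocab_dim):
    matrix = np.zeros((n_words + 1, vocab_dim), dtype=np.float32)
    for word, idx in index_dict.items():
        matrix[idx] = word_model[word]
    return matrix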
def load_embedding(embd_file_path):
    from gensim.models.keyedvectors import KeyedVectors
    model = KeyedVectors.load(embd_file_path)
    vocab = model.vocab
    embd = []
    for word in vocab:
        embd.append(model[word])
    return vocab, embd
def test_ft_kv_backward_compat_w_360(self):
    kv = EuclideanKeyedVectors.load(datapath("ft_kv_3.6.0.model.gz"))
    ft_kv = FastTextKeyedVectors.load(datapath("ft_kv_3.6.0.model.gz"))

    expected = ['trees', 'survey', 'system', 'graph', 'interface']

    actual = [word for (word, similarity) in kv.most_similar("human", topn=5)]
    self.assertEqual(actual, expected)

    actual = [word for (word, similarity) in ft_kv.most_similar("human", topn=5)]
    self.assertEqual(actual, expected)
def __init__(self, **kwargs):
    """
    :param model_file: path of the model file. If not supplied, it will be downloaded.
    """
    super().__init__(**kwargs)
    self._model_file = self._options.get("model_file")
    self._model = KeyedVectors.load(str(self._model_file), mmap="r")
    self._vector_size = 512
    self._zero_vector = np.zeros(self._vector_size, dtype=np.float32)
    self._window_size = 3
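# Hedged companion sketch (not part of the original source): the constructor above
# keeps a zero vector around, presumably as an out-of-vocabulary fallback; a lookup
# helper in that spirit might look like this.
def _vector_or_zero(model, word, zero_vector):
    return model[word] if word in model else zero_vector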
def build_w2v(train_union_test_path, path_words_vectors,
              w2v_model_path='w2v.model', min_count=5):
    w2v = Word2Vec(sentences=LineSentence(train_union_test_path),
                   size=128, window=5, min_count=min_count, iter=5)
    w2v.save(w2v_model_path)
    model = KeyedVectors.load(w2v_model_path)
    words_vectors = {}
    for word in model.wv.vocab:
        words_vectors[word] = model[word]
    print(len(words_vectors))
    dump_pkl(words_vectors, path_words_vectors, overwrite=True)