def load_word2vec(
        # model_filename=word2vec_model_dir + "GoogleNews-vectors-negative300.bin.gz",
        # model_type="c_word2vec",
        # compact_name="google300",
        model_filename=word2vec_model_dir + 'en_1000_no_stem/en.model',
        model_type="gensim",
        compact_name="wiki1000"):
    """
    :param model_filename: path to the word2vec model file
    :param model_type: can be "c_word2vec" or "gensim"
    :param compact_name: short name attached to the returned model
    :return: the loaded Word2Vec model
    """
    print >> sys.stderr, "loading word2vec model ", model_filename, \
        "(may take a few minutes) ..."
    start_time = time.time()
    if "c_word2vec" == model_type:
        model = Word2Vec.load_word2vec_format(model_filename, binary=True)
    elif "gensim" == model_type:
        model = Word2Vec.load(model_filename)
    else:
        raise ValueError("The specified model_type '" + str(model_type) +
                         "' is not matched!")
    model.compact_name = compact_name
    elapsed = time.time() - start_time
    print >> sys.stderr, "word2vec model loading finished.", elapsed, "s"
    return model
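# A minimal usage sketch for the loader above (not from the original source):
# it assumes `word2vec_model_dir` points at a directory containing the
# pre-trained GoogleNews binary mentioned in the commented-out defaults, and
# that this runs under Python 2 with a gensim version where
# Word2Vec.load_word2vec_format is still available.
if __name__ == "__main__":
    model = load_word2vec(
        model_filename=word2vec_model_dir + "GoogleNews-vectors-negative300.bin.gz",
        model_type="c_word2vec",
        compact_name="google300")
    print >> sys.stderr, model.most_similar(positive=["king"], topn=3)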
def getEmbeddingsByTerm(self, dim=50, win=1, pretrained_file=None,
                        binary=False, word_cleaning=False, op='doc'):
    # op = {doc, avg, sum}
    d2v = []
    if op == 'doc':  # doc2vec
        d2v_model = TextCorpus._getDoc2Vec(self._word_count.toarray(), dim, win)
        # w2v = []
        # for i in xrange(len(d2v_model.vocab)):
        #     w2v.append(d2v_model[str(1)])
        for i in xrange(len(d2v_model.docvecs.doctags)):
            d2v.append(d2v_model.docvecs[str(i)])
    else:  # word2vec, then apply op over the words of each doc
        w2v = None
        if pretrained_file is not None:
            w2v = Word2Vec.load_word2vec_format(pretrained_file, binary=binary)
        else:
            w2v = Word2Vec(TextCorpus._getDocsByBagOfTokenIds(
                self._word_count.toarray()),
                size=dim, window=win, min_count=0,
                workers=multiprocessing.cpu_count())
        if op == 'sum':
            func = np.sum
        elif op == 'avg':
            func = np.average
        if word_cleaning:  # should not be used when there is no pretrained model!
            np.apply_along_axis(lambda x: d2v.append(
                func([w2v[re.sub("[^a-zA-Z]", " ", self.inv_words[i]).strip()]
                      for i in x.nonzero()[0]
                      if w2v.__contains__(
                          re.sub("[^a-zA-Z]", " ", self.inv_words[i]).strip())],
                     axis=0) if len(x.nonzero()[0]) > 0 else np.zeros(dim)),
                arr=self._word_count.toarray(), axis=1)
        else:
            np.apply_along_axis(lambda x: d2v.append(
                func([w2v[str(i)] for i in x.nonzero()[0]
                      if w2v.__contains__(str(i))],
                     axis=0) if len(x.nonzero()[0]) > 0 else np.zeros(dim)),
                arr=self._word_count.toarray(), axis=1)
    return np.array(d2v)
def trainWord2Vec(doc_list=None, buildvoc=1, passes=20, sg=1, size=100,
                  dm_mean=0, window=5, hs=1, negative=5, min_count=1, workers=4):
    model = Word2Vec(size=size, sg=sg, window=window, hs=hs, negative=negative,
                     min_count=min_count, workers=workers)
    if buildvoc == 1:
        print('Building Vocabulary')
        model.build_vocab(doc_list)  # build vocabulary with words + nodeID

    for epoch in range(passes):
        print('Iteration %d ....' % epoch)
        shuffle(doc_list)  # shuffling gets best results
        model.train(doc_list)

    return model
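# A hedged usage sketch (not from the original source): `doc_list` is assumed
# to be a list of pre-tokenised sentences, and the repeated train() call
# without total_examples/epochs assumes an older gensim (< 1.0); on modern
# gensim pass those arguments explicitly.
sentences = [["deep", "walk", "node1"], ["random", "walk", "node2"]]
model = trainWord2Vec(doc_list=sentences, passes=5, size=100, window=5)
print(model.most_similar("walk", topn=2))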
def getEmbeddingsByChar(self, dim=50, win=1, pre=None, op='doc'):  # avg/sum
    d2v = []
    if op == 'doc':  # doc2vec
        d2v_model = TextCorpus._getDoc2Vec(self._char_count.toarray(), dim, win)
        for i in xrange(len(d2v_model.docvecs.doctags)):
            d2v.append(d2v_model.docvecs[str(i)])
        return np.array(d2v)  # , np.array(w2v)
    else:  # word2vec, then apply op over the chars of each doc
        w2v = Word2Vec(TextCorpus._getDocsByBagOfTokenIds(
            self._char_count.toarray()),
            size=dim, window=win, min_count=0,
            workers=multiprocessing.cpu_count())
        if op == 'sum':
            func = np.sum
        elif op == 'avg':
            func = np.average
        np.apply_along_axis(lambda x: d2v.append(
            func([w2v[str(i)] for i in x.nonzero()[0]
                  if w2v.__contains__(str(i))],
                 axis=0) if len(x.nonzero()[0]) > 0 else np.zeros(dim)),
            arr=self._char_count.toarray(), axis=1)
        return np.array(d2v)
def train_model(doc_path, output_path, dim=100):
    """
    Training a model. Reading the file, building a vocabulary,
    training, and saving the model.

    Args:
        doc_path - str: path to a doc file
        output_path - str: path to the model
    """
    print "Reading a file ..."
    sentences = read_file(doc_path)
    print "Training a model ..."
    model = Word2Vec(sentences, min_count=0, size=dim, window=10)
    print "Saving the model ..."
    model.save(output_path)
    print "Done."
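# Hypothetical invocation of the trainer above (the file names are assumed,
# not from the original source): `read_file` is expected to return a list of
# tokenised sentences, and the saved model can be reloaded with Word2Vec.load().
train_model("corpus.txt", "corpus.w2v", dim=100)
reloaded = Word2Vec.load("corpus.w2v")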
def eval_model():
    w2v = Word2Vec.load_word2vec_format(args.save_path, binary=False)
    word2id = dict([(w, i) for i, w in enumerate(w2v.index2word)])
    analogy_questions = read_analogies(args.eval_data, word2id)
    correct = 0
    total = len(analogy_questions)
    for question in analogy_questions:
        a, b, c, d = question  # E.g. [Athens, Greece, Baghdad, Iraq]
        analogies = w2v.most_similar(positive=[b, c], negative=[a], topn=4)
        for analogy in analogies:
            word, _ = analogy
            if d == word:
                # Predicted correctly!
                correct += 1
                break
    print('Eval %4d/%d accuracy = %4.1f%%' %
          (correct, total, correct * 100.0 / total))
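# A hedged illustration of the check above (paths reused from eval_model, the
# example words are illustrative, not from the original source): for the
# analogy Athens : Greece :: Baghdad : ?, the prediction counts as correct if
# "Iraq" appears among the top-4 neighbours of (Greece + Baghdad - Athens).
w2v = Word2Vec.load_word2vec_format(args.save_path, binary=False)
guesses = [w for w, _ in w2v.most_similar(positive=["Greece", "Baghdad"],
                                          negative=["Athens"], topn=4)]
print("correct" if "Iraq" in guesses else "incorrect", guesses)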
def train(self, size=500, min_count=3, iter=4, window=6, workers=3, **kwargs):
    """Train an embedding model, build a lookup table and model metadata.
    After training, they will be saved to S3.

    Args:
        kwargs: all arguments that gensim.models.doc2vec.Doc2Vec will take.
    """
    job_postings_generator = job_postings_chain(self.s3_conn, self.quarters,
                                                self.jp_s3_path, source=self.source)
    if self.model_type == 'word2vec':
        if not self._model:
            model = Word2Vec(size=size, min_count=min_count, iter=iter,
                             window=window, workers=workers, **kwargs)
        else:
            logging.info("Model already exists")
            model = self._model
            self.update = True

        batch_iter = 1
        batch_gen = batches_generator(
            Word2VecGensimCorpusCreator(job_postings_generator), self.batch_size)
        for batch in batch_gen:
            batch = Reiterable(batch)
            logging.info("Training batch #{} ".format(batch_iter))
            if not self.update:
                model.build_vocab(batch, update=False)
                self.update = True
            else:
                model.build_vocab(batch, update=True)
            model.train(batch, total_examples=model.corpus_count, epochs=model.iter)
            self.vocab_size_cumu.append(len(model.wv.vocab))
            batch_iter += 1
            logging.info('\n')

    elif self.model_type == 'doc2vec':
        model = Doc2Vec(size=size, min_count=min_count, iter=iter,
                        window=window, workers=workers, **kwargs)
        corpus_gen = Doc2VecGensimCorpusCreator(job_postings_generator)
        reiter_corpus_gen = Reiterable(corpus_gen)
        model.build_vocab(reiter_corpus_gen)
        model.train(reiter_corpus_gen, total_examples=model.corpus_count,
                    epochs=model.iter)
        self._lookup = corpus_gen.lookup

    self._model = model
    self._upload()
# In[16]:

### Create the output file
f = open("vectors.txt", "w")

### Write the vocabulary size and the dimensionality of the feature vectors
f.write(" ".join([str(V - 1), str(dim)]))
f.write("\n")


# In[17]:

vectors = cbow.get_weights()[0]

### Write the word feature vectors learned during training
for word, i in tokenizer.word_index.items():
    f.write(word)
    f.write(" ")
    f.write(" ".join(map(str, list(vectors[i, :]))))
    f.write("\n")
f.close()


# In[18]:

w2v = Word2Vec.load_word2vec_format('./vectors.txt', binary=False)


# In[19]:

w2v.most_similar(positive=['alice'])


# In[ ]:
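# Illustrative note (not from the original notebook): the file written above
# follows the word2vec text format that load_word2vec_format expects, i.e. a
# "vocab_size dim" header followed by one "word v1 v2 ... v_dim" line per
# word, for example:
#
#   3 4
#   alice 0.1 -0.2 0.3 0.05
#   rabbit 0.0 0.4 -0.1 0.2
#   queen -0.3 0.1 0.2 -0.1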
revvocab = [i + 4 for i, x in enumerate(vocab)]
# print vocab
train_datax = [i for i, x in enumerate(vocab[:top_words])]
train_datay = [langdict[i] for i in vocab[:top_words]]
test_datax = [i for i, x in enumerate(vocab[:500])]
test_datay = [langdict[i] for i in vocab[:500]]
# vocab = train_data

totaldata = []
for line in data:
    x = line.split('\t')
    if (len(x) == 3):
        y = char_ngram_generator(x[0])
        totaldata.append(y)

w2vmodel = Word2Vec(totaldata, min_count=1)

# Sum the character n-gram vectors of each entry; the result is keyed by the
# last element of the n-gram list
vectdict = {}
for i in totaldata:
    newlist = [j for j in w2vmodel[i[0]]]
    for j in i[1:]:
        for k in range(len(w2vmodel[j])):
            newlist[k] += w2vmodel[j][k]
    vectdict[i[-1]] = newlist

train_datax = [vectdict[x] for i, x in enumerate(vocab[:top_words])]
train_datay = [langdict[i] for i in vocab[:top_words]]
test_datax = [vectdict[x] for i, x in enumerate(vocab[:500])]
test_datay = [langdict[i] for i in vocab[:500]]
print train_datax[0]

# create the model
model = Sequential()
def compute_distance_features():
    # Load data
    train = pd.read_csv("./data/train.csv",
                        names=['row_ID', 'text_a_ID', 'text_b_ID',
                               'text_a_text', 'text_b_text', 'have_same_meaning'],
                        index_col=0)
    test = pd.read_csv("./data/test.csv",
                       names=['row_ID', 'text_a_ID', 'text_b_ID',
                              'text_a_text', 'text_b_text', 'have_same_meaning'],
                       index_col=0)

    en_stop = set(stopwords.words('english'))
    glove_file = "./data/glove.840B.300d.w2vformat.txt"

    def clean(q):
        # Adapted from https://github.com/aerdem4/kaggle-quora-dup
        q = str(q).lower()
        q = q.replace(",000,000", "m").replace(",000", "k") \
             .replace("′", "'").replace("’", "'") \
             .replace("won't", "will not").replace("cannot", "can not") \
             .replace("can't", "can not").replace("n't", " not") \
             .replace("what's", "what is").replace("it's", "it is") \
             .replace("'ve", " have").replace("i'm", "i am") \
             .replace("'re", " are").replace("he's", "he is") \
             .replace("she's", "she is").replace("'s", " own") \
             .replace("%", " percent ").replace("₹", " rupee ") \
             .replace("$", " dollar ").replace("€", " euro ") \
             .replace("'ll", " will")
        q = re.sub(r"([0-9]+)000000", r"\1m", q)
        q = re.sub(r"([0-9]+)000", r"\1k", q)
        return q

    # Start computation
    all_questions = pd.concat([train["text_a_text"], train["text_b_text"],
                               test["text_a_text"], test["text_b_text"]])
    question_counts = all_questions.value_counts()
    questions = [clean(q) for q in all_questions]
    questions_token = [[w for w in q.split(' ') if w not in en_stop]
                       for q in questions]

    print("Fit TFIDF Model...")
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    tfidf_vectorizer.fit(all_questions)

    print("Load Glove Model...")
    glove_model = KeyedVectors.load_word2vec_format(glove_file)

    print("Fit Word2Vec Model...")
    word2Vec = Word2Vec(size=100, window=5, min_count=2, sg=1, workers=10)
    word2Vec.build_vocab(questions_token)  # prepare the model vocabulary
    word2Vec.train(sentences=questions_token,
                   total_examples=len(questions_token),
                   epochs=word2Vec.iter)

    print("Fit LSI Model...")
    dictionary = corpora.Dictionary(questions_token)
    corpus = [dictionary.doc2bow(text) for text in questions_token]
    lsi = LsiModel(corpus, num_topics=200, id2word=dictionary)

    print("Fit doc2vec Model...")
    q2idx_dict = {tuple(q): idx for idx, q in enumerate(questions_token)}
    d2v_training_data = []
    for idx, doc in enumerate(questions_token):
        d2v_training_data.append(TaggedDocument(doc, [idx]))

    d2v_dm = Doc2Vec(d2v_training_data, size=100, window=4, min_count=3,
                     workers=16, iter=5)
    d2v_dm.delete_temporary_training_data(keep_doctags_vectors=True,
                                          keep_inference=True)
    d2v_bow = Doc2Vec(d2v_training_data, size=100, window=4, min_count=3,
                      dm=0, workers=16, iter=5)
    d2v_bow.delete_temporary_training_data(keep_doctags_vectors=True,
                                           keep_inference=True)

    def preprocess(df):
        df_features = pd.DataFrame(index=df.index)
        df_intermediate = pd.DataFrame(index=df.index)

        print("--> Compute tokens...")
        df_intermediate["clean_a"] = df.text_a_text.apply(lambda x: clean(x))
        df_intermediate["clean_b"] = df.text_b_text.apply(lambda x: clean(x))
        df_intermediate["words_a"] = df_intermediate.apply(
            lambda row: row.clean_a.split(" "), axis=1)
        df_intermediate["words_b"] = df_intermediate.apply(
            lambda row: row.clean_b.split(" "), axis=1)
        df_intermediate["words_clean_a"] = df_intermediate.apply(
            lambda row: [w for w in row.words_a if w not in en_stop], axis=1)
        df_intermediate["words_clean_b"] = df_intermediate.apply(
            lambda row: [w for w in row.words_b if w not in en_stop], axis=1)
        df_intermediate["stop_a"] = df_intermediate.apply(
            lambda row: [w for w in row.words_a if w in en_stop], axis=1)
        df_intermediate["stop_b"] = df_intermediate.apply(
            lambda row: [w for w in row.words_b if w in en_stop], axis=1)

        print("--> Compute tfidf distance...")
        tfidf_a = tfidf_vectorizer.transform(df_intermediate["clean_a"])
        tfidf_b = tfidf_vectorizer.transform(df_intermediate["clean_b"])
        df_features["tfidf_dist_cosine"] = paired_cosine_distances(tfidf_a, tfidf_b)
        df_features["tfidf_dist_euclidean"] = paired_euclidean_distances(
            tfidf_a, tfidf_b)

        print("--> Compute glove distance...")
        glove_emb_a = df_intermediate["words_clean_a"].apply(
            lambda q: np.array([glove_model.wv[w] for w in q if w in glove_model.wv]))
        glove_emb_b = df_intermediate["words_clean_b"].apply(
            lambda q: np.array([glove_model.wv[w] for w in q if w in glove_model.wv]))
        glove_emb_a[glove_emb_a.apply(lambda x: len(x) == 0)] = glove_emb_a[
            glove_emb_a.apply(lambda x: len(x) == 0)].apply(
                lambda y: np.zeros((1, 300)))
        glove_emb_b[glove_emb_b.apply(lambda x: len(x) == 0)] = glove_emb_b[
            glove_emb_b.apply(lambda x: len(x) == 0)].apply(
                lambda y: np.zeros((1, 300)))
        glove_emb_a = glove_emb_a.apply(lambda x: np.mean(x, axis=0))
        glove_emb_b = glove_emb_b.apply(lambda x: np.mean(x, axis=0))
        glove_emb_a = np.vstack(glove_emb_a.values)
        glove_emb_b = np.vstack(glove_emb_b.values)
        df_features["glove_dist_cosine"] = paired_cosine_distances(
            glove_emb_a, glove_emb_b)
        df_features["glove_dist_euclidean"] = paired_euclidean_distances(
            glove_emb_a, glove_emb_b)
        df_features["glove_word_mover_dist"] = df_intermediate.apply(
            lambda row: glove_model.wv.wmdistance(row["words_clean_a"],
                                                  row["words_clean_b"]), axis=1)

        print("--> Compute lsi distance...")
        lsi_emb_a = df_intermediate["words_clean_a"].apply(
            lambda x: np.array(lsi[dictionary.doc2bow(x)]))
        lsi_emb_b = df_intermediate["words_clean_b"].apply(
            lambda x: np.array(lsi[dictionary.doc2bow(x)]))
        lsi_emb_a[lsi_emb_a.apply(
            lambda x: len(x) == 0 or x.shape[0] != 200)] = lsi_emb_a[
                lsi_emb_a.apply(lambda x: len(x) == 0 or x.shape[0] != 200)].apply(
                    lambda x: np.zeros((200, 2)))
        lsi_emb_b[lsi_emb_b.apply(
            lambda x: len(x) == 0 or x.shape[0] != 200)] = lsi_emb_b[
                lsi_emb_b.apply(lambda x: len(x) == 0 or x.shape[0] != 200)].apply(
                    lambda x: np.zeros((200, 2)))
        # Derive question representations from single lsi vectors
        lsi_emb_a = lsi_emb_a.apply(lambda x: np.mean(x, axis=0))
        lsi_emb_b = lsi_emb_b.apply(lambda x: np.mean(x, axis=0))
        lsi_emb_a = np.vstack(lsi_emb_a.values)
        lsi_emb_b = np.vstack(lsi_emb_b.values)
        df_features["lsi_dist_cosine"] = paired_cosine_distances(
            lsi_emb_a, lsi_emb_b)
        df_features["lsi_dist_euclidean"] = paired_euclidean_distances(
            lsi_emb_a, lsi_emb_b)

        print("--> Compute word2vec distance...")
        word2Vec_emb_a = df_intermediate["words_clean_a"].apply(
            lambda q: np.array([word2Vec.wv[w] for w in q if w in word2Vec.wv]))
        word2Vec_emb_b = df_intermediate["words_clean_b"].apply(
            lambda q: np.array([word2Vec.wv[w] for w in q if w in word2Vec.wv]))
        word2Vec_emb_a[word2Vec_emb_a.apply(
            lambda x: len(x) == 0)] = word2Vec_emb_a[word2Vec_emb_a.apply(
                lambda x: len(x) == 0)].apply(lambda y: np.zeros((1, 100)))
        word2Vec_emb_b[word2Vec_emb_b.apply(
            lambda x: len(x) == 0)] = word2Vec_emb_b[word2Vec_emb_b.apply(
                lambda x: len(x) == 0)].apply(lambda y: np.zeros((1, 100)))
        word2Vec_emb_a = word2Vec_emb_a.apply(lambda x: np.mean(x, axis=0))
        word2Vec_emb_b = word2Vec_emb_b.apply(lambda x: np.mean(x, axis=0))
        word2Vec_emb_a = np.vstack(word2Vec_emb_a.values)
        word2Vec_emb_b = np.vstack(word2Vec_emb_b.values)
        df_features["w2v_dist_cosine"] = paired_cosine_distances(
            word2Vec_emb_a, word2Vec_emb_b)
        df_features["w2v_dist_euclidean"] = paired_euclidean_distances(
            word2Vec_emb_a, word2Vec_emb_b)
        df_features["word2vec_word_mover_dist"] = df_intermediate.apply(
            lambda row: word2Vec.wv.wmdistance(row["words_clean_a"],
                                               row["words_clean_b"]), axis=1)

        print("--> Compute doc2vec distance...")
        doc_vec_dm_emb_a = df_intermediate["words_clean_a"].apply(
            lambda q: d2v_dm.docvecs[q2idx_dict[tuple(q)]])
        doc_vec_dm_emb_b = df_intermediate["words_clean_b"].apply(
            lambda q: d2v_dm.docvecs[q2idx_dict[tuple(q)]])
        doc_vec_bow_emb_a = df_intermediate["words_clean_a"].apply(
            lambda q: d2v_bow.docvecs[q2idx_dict[tuple(q)]])
        doc_vec_bow_emb_b = df_intermediate["words_clean_b"].apply(
            lambda q: d2v_bow.docvecs[q2idx_dict[tuple(q)]])
        doc_vec_dm_emb_a = np.vstack(doc_vec_dm_emb_a.values)
        doc_vec_dm_emb_b = np.vstack(doc_vec_dm_emb_b.values)
        doc_vec_bow_emb_a = np.vstack(doc_vec_bow_emb_a.values)
        doc_vec_bow_emb_b = np.vstack(doc_vec_bow_emb_b.values)
        df_features["dm_dist_cosine"] = paired_cosine_distances(
            doc_vec_dm_emb_a, doc_vec_dm_emb_b)
        df_features["dm_dist_euclidean"] = paired_euclidean_distances(
            doc_vec_dm_emb_a, doc_vec_dm_emb_b)
        df_features["dm_word_mover_dist"] = df_intermediate.apply(
            lambda row: d2v_dm.wv.wmdistance(row["words_clean_a"],
                                             row["words_clean_b"]), axis=1)
        df_features["bow_dist_cosine"] = paired_cosine_distances(
            doc_vec_bow_emb_a, doc_vec_bow_emb_b)
        df_features["bow_dist_euclidean"] = paired_euclidean_distances(
            doc_vec_bow_emb_a, doc_vec_bow_emb_b)
        df_features["bow_word_mover_dist"] = df_intermediate.apply(
            lambda row: d2v_bow.wv.wmdistance(row["words_clean_a"],
                                              row["words_clean_b"]), axis=1)

        print("--> Compute edit distance...")
        df_features["edit_distance"] = df_intermediate.apply(
            lambda x: nltk.edit_distance(x["clean_a"], x["clean_b"]), axis=1)

        return df_features

    print("Compute train features...")
    train_features = preprocess(train)
    print("Compute test features...")
    test_features = preprocess(test)

    print("Store features...")
    train_features.to_csv("./data/distance_features_train.csv", index=False)
    test_features.to_csv("./data/distance_features_test.csv", index=False)
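# Hypothetical driver for the feature pipeline above (not part of the original
# source): it assumes ./data/train.csv, ./data/test.csv and the GloVe file
# referenced in compute_distance_features() are already in place.
if __name__ == "__main__":
    compute_distance_features()
    features = pd.read_csv("./data/distance_features_train.csv")
    print(features.describe())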
import numpy as np
import gensim
from gensim import corpora, utils, similarities
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models.doc2vec import Doc2Vec, Word2Vec
#from read_wiki import stream
import pdb

dir_path = './'
#doc2vec_path = dir_path + 'wiki_model2.doc2vec'
#doc2vec300_path = dir_path + 'wiki_model5.doc2vec'
word2vec_path = dir_path + 'pretrained_word2vec.bin'

print("Loading word2vec and doc2vec models")
#doc2vec_model = Doc2Vec.load(doc2vec_path)
word2vec_model = Word2Vec.load_word2vec_format(word2vec_path, binary=True)
#doc2vec300_model = Doc2Vec.load(doc2vec300_path)
print("Models loaded, proceeding...")

np.random.seed(42)
random_word = np.random.uniform(low=-0.25, high=0.25, size=(300,))


def tokenize(text, k):
    return [token for token in utils.simple_preprocess(text)
            if token not in STOPWORDS]


def lda(text):
    dict_path = dir_path + 'cs_lda6.dict'
    lda_path = dir_path + 'wiki_model6.ldamodel'
    dictionary = corpora.Dictionary.load(dict_path)
    model = gensim.models.ldamodel.LdaModel.load(lda_path)