def loadGloVe(co_occur, dim_embed, vocab, is_fine):
    # Either fine-tune pretrained GloVe vectors with Mittens, or train
    # GloVe embeddings from scratch on the co-occurrence counts.
    if is_fine:
        glove_original = glove2dict(glove_filename)
        mittens_model = Mittens(n=dim_embed, max_iter=5000)
        embeddings = mittens_model.fit(
            np.asarray(co_occur),
            vocab=vocab,
            initial_embedding_dict=glove_original)
    else:
        glove_model = GloVe(n=dim_embed, max_iter=5000)
        embeddings = glove_model.fit(np.asarray(co_occur))
    return embeddings
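# A minimal usage sketch for loadGloVe, assuming `mittens` is installed. The
# tiny vocabulary and random counts below are illustrative stand-ins, not
# values from the original code; with is_fine=False no pretrained file is read.
import numpy as np
from mittens import GloVe, Mittens

vocab = ['the', 'cat', 'sat']
co_occur = np.random.randint(1, 10, size=(3, 3))
embeddings = loadGloVe(co_occur, dim_embed=50, vocab=vocab, is_fine=False)
print(embeddings.shape)  # -> (3, 50)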
def run_small_glove_evals():
    from mittens import GloVe

    ##### YOUR CODE HERE
    giga20 = pd.read_csv(
        os.path.join(VSM_HOME, 'giga_window20-flat.csv.gz'), index_col=0)
    result = {}
    for i in [10, 100, 200]:
        glv = GloVe(max_iter=i)
        X = glv.fit(giga20)
        glv_df = pd.DataFrame(X, index=giga20.index)
        evaluate = full_word_similarity_evaluation(glv_df)
        result[i] = evaluate['Macro-average']
        glv.sess.close()  # release the session (TensorFlow backend only)
    return result
def run_small_glove_evals():
    from mittens import GloVe

    scores = {}
    giga20 = pd.read_csv(
        os.path.join(VSM_HOME, "giga_window20-flat.csv.gz"), index_col=0)
    for iteration in [10, 100, 200]:
        glv = GloVe(max_iter=iteration)
        giga20_glv = glv.fit(giga20.values)
        giga20_glv = pd.DataFrame(giga20_glv, index=giga20.index)
        scores[iteration] = full_word_similarity_evaluation(
            giga20_glv)['Macro-average']
    return scores
def run_small_glove_evals(iters=(10, 100, 200)):
    from mittens import GloVe

    ##### YOUR CODE HERE
    giga20 = pd.read_csv(
        os.path.join(VSM_HOME, "giga_window20-flat.csv.gz"), index_col=0)
    results = {}
    for max_iter in iters:
        model = GloVe(max_iter=max_iter)
        glove_matrix = model.fit(giga20.values)
        glove_df = pd.DataFrame(glove_matrix, index=giga20.index)
        similarity_eval = full_word_similarity_evaluation(glove_df)
        display(similarity_eval)
        results[max_iter] = similarity_eval["Macro-average"]
    return results
def __init__(self, words):
    self.words = np.array(words)
    self.dictionary = corpora.Dictionary(words)
    self.corpus = [self.dictionary.doc2bow(word) for word in words]
    # Passing the dictionary here initializes its id2token mapping.
    self.tfidf = models.TfidfModel(dictionary=self.dictionary)
    self.lsi = models.LsiModel(
        self.tfidf[self.corpus], id2word=self.dictionary, num_topics=300)
    self.similarity = similarities.MatrixSimilarity(self.lsi[self.corpus])
    # size 200-300 works best; window 8 works best
    self.word2vec = Word2Vec(words, min_count=3, window=5, size=200, iter=10)
    words = [word for doc in self.words for word in doc]
    global_corpus = self.get_corpus(words)
    self.init_global_bows(global_corpus)
    self.init_global_tfidfs(global_corpus)
    self.init_cooccurrence_matrix()
    glove_model = GloVe(n=200, max_iter=500)
    self.glove = glove_model.fit(self.cooccurrence_matrix)
def run_small_glove_evals():
    from mittens import GloVe

    results = {}
    # part 1
    giga20 = pd.read_csv(
        os.path.join(VSM_HOME, "giga_window20-flat.csv.gz"), index_col=0)
    # parts 2 & 3
    for max_iter in (10, 100, 200):
        # This is the model; note we haven't yet given it any data.
        glove = GloVe(max_iter=max_iter)
        embeddings = glove.fit(giga20)
        giga20_glove = pd.DataFrame(embeddings, index=giga20.index)
        results[max_iter] = full_word_similarity_evaluation(
            giga20_glove)['Macro-average']
    return results
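# A brief usage sketch for the run_small_glove_evals variants above (assumes
# VSM_HOME and full_word_similarity_evaluation from the course setup are in
# scope). More iterations should generally yield a better macro-average.
scores = run_small_glove_evals()
for max_iter, score in sorted(scores.items()):
    print(f"max_iter={max_iter}: macro-average={score:.3f}")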
gc.collect()

# Save the co-occurrence statistics
nowTime = tic.getNow().strftime('%Y%m%d_%H%M%S')
coocPath = r"...\Data\Reg_leave_one_out\cooccurrence_%s.csv" % (coWindow)
writer = use.csvWrite(coocPath)
for item in cooccurrence:
    writer.writerow(item)
print("The co-occurrence matrix is derived, taking %s" % (tic.timmingGet()))

# GloVe
print("Start GloVe calculation")
coocMatric = np.array(cooccurrence, "float32")
glove_model = GloVe(
    n=vecLength, max_iter=max_iter, display_progress=display_progress)
embeddings = glove_model.fit(coocMatric)
del cooccurrence, coocMatric
gc.collect()

# Output the calculation result
dicIndex = 0
# result = []
nowTime = tic.getNow().strftime('%Y%m%d_%H%M%S')
GlovePath = r"...\Data\Reg_leave_one_out\keras_GloVeVec_%s_%s_%s.csv" % (
    coWindow, vecLength, max_iter)
writer = use.csvWrite(GlovePath)
for embeddingsItem in embeddings:
    item = np.array([dicIndex])
    row_log_prob = np.outer(row_log_prob, np.ones(true.shape[1]))
    prob = log_cooccur - row_log_prob
    return np.corrcoef(prob[mask], M[mask])[0, 1]


# In[25]:

correlation_test(glove_test_count_df.values, glove_test_df.values)


# ### Applying GloVe to real VSMs

# The `vsm.glove` implementation is too slow to use on real matrices. The distribution in the `mittens` package is significantly faster, making its use possible even without a GPU (and it will be very fast indeed on a GPU machine):

# In[26]:

glove_model = GloVe()

imdb5_glv = glove_model.fit(imdb5.values)

imdb5_glv = pd.DataFrame(imdb5_glv, index=imdb5.index)


# In[27]:

vsm.neighbors('superb', imdb5_glv).head()


# ## Autoencoders
#
# An autoencoder is a machine learning model that seeks to learn parameters that predict its own input. This is meaningful when there are intermediate representations that have lower dimensionality than the inputs. These provide a reduced-dimensional view of the data akin to those learned by LSA, but now we have a lot more design choices and a lot more potential to learn higher-order associations in the underlying data; a minimal sketch appears just below.

# ### Overview of the autoencoder method
#
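# A minimal illustrative sketch of the autoencoder idea, not the notebook's
# own implementation: a single hidden layer of dimension k is trained to
# reconstruct its input, and the hidden activations serve as the
# reduced-dimensional representation. Uses PyTorch; the layer sizes and
# training settings are assumptions chosen for the example.

import torch
import torch.nn as nn

X = torch.randn(100, 50)   # toy input matrix (rows = examples)
k = 10                     # bottleneck dimension
encoder = nn.Linear(50, k)
decoder = nn.Linear(k, 50)
opt = torch.optim.Adam(
    list(encoder.parameters()) + list(decoder.parameters()), lr=0.01)
loss_fn = nn.MSELoss()

for epoch in range(100):
    opt.zero_grad()
    hidden = torch.tanh(encoder(X))       # reduced-dimensional view of X
    loss = loss_fn(decoder(hidden), X)    # reconstruct the input from it
    loss.backward()
    opt.step()

reduced = torch.tanh(encoder(X)).detach()  # analogous to an LSA projection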
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from mittens import GloVe
import pandas as pd
from scipy import sparse
import json

# embedding dimension
d = 50

config = json.load(open('../config.json', 'r'))
DATA_DIR = config['OUTPUT_DIR']

print('Loading file...')
coocc = sparse.load_npz(DATA_DIR + 'glove_cooccurrence.npz')
coocc = coocc.toarray()
with open(DATA_DIR + 'joint_vocab.txt', 'r') as f:
    vocab = f.read().splitlines()

print('Training model...')
glove_model = GloVe(d, max_iter=5000, learning_rate=0.1)
embeddings = glove_model.fit(coocc)

embeddings = pd.DataFrame(embeddings, index=vocab)
embeddings.to_csv(DATA_DIR + 'glove.' + str(d) + 'd.csv', sep='\t')
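# A hypothetical follow-up, not part of the original script: reading the saved
# embeddings back for lookup. The path simply mirrors the to_csv call above.
vectors = pd.read_csv(DATA_DIR + 'glove.' + str(d) + 'd.csv',
                      sep='\t', index_col=0)
print(vectors.loc[vocab[0]].values[:5])  # first 5 dimensions of one word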
print(" ".join( [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])) ## 主題模型歸類結果對應檢查----------------------------------- for i in range(0, 1500, 100): y_str = y_tataol[i] start = y_str.find("why:") end = y_str.find("judgement:") key_reson = y_str[start + 20:end - 4] print( '模型推論Topic:', lad_result[i] + 1, ' 案由:', key_reson, ) # print(corpus) # count = CountVectorizer() # bag = count.fit_transform(seg_corpus) glove_model = GloVe(n=2, max_iter=1000) cooccurrence_matrix = np.dot(X.toarray().transpose(), X.toarray()) embeddings = glove_model.fit(cooccurrence_matrix) print(embeddings.shape) plt.scatter(embeddings[:, 0], embeddings[:, 1], marker="o") for i in range(0, len(feature_names)): plt.text(embeddings[i, 0], embeddings[i, 1], feature_names[i]) plt.show()
# In[ ]:

# alternative test
if 'IS_GRADESCOPE_ENV' not in os.environ:
    from mittens import GloVe

    # subword + glove
    #giga = pd.read_csv(os.path.join(VSM_HOME, "giga_window20-flat.csv.gz"), index_col=0)
    giga = pd.read_csv(
        os.path.join(VSM_HOME, "giga_window5-scaled.csv.gz"), index_col=0)
    giga = vsm.pmi(giga)
    #giga = vsm.lsa(giga20, k=50)
    glove = GloVe(max_iter=100)
    embeddings = glove.fit(giga)
    giga = pd.DataFrame(embeddings, index=giga.index)
    output = full_word_similarity_evaluation(giga)
    display(output)


# ## Bake-off [1 point]
#
# For the bake-off, we will release two additional datasets. The announcement will go out on the discussion forum. We will also release reader code for these datasets that you can paste into this notebook. You will evaluate your custom model $M$ (from the previous question) on these new datasets using `full_word_similarity_evaluation`. Rules:
#
# 1. Only one evaluation is permitted.
# 2. No additional system tuning is permitted once the bake-off has started.
#
# The cells below this one constitute your bake-off entry.
#
## Start token ids at 1, reserving one slot (index 0) for UNK
word2idx = dict(
    zip(entities_all_count_keys, range(1, len(entities_all_count_keys) + 1)))

# Tokenize the sentences into a matrix of token ids
sentences_tokens, _ = make_deepLearn_data(sentences, word2idx)

# Build the co-occurrence matrix
cooccurrence_matrix = generate_co_occurrence(
    sentences_tokens,
    len(word2idx.keys()),
    window_size=5,
)

# Train GloVe and write the output file
glove_model = GloVe(n=dim_n, max_iter=itter_n)
embedMatrix = glove_model.fit(cooccurrence_matrix)

print('load word2vec model...')
model = KeyedVectors.load_word2vec_format('train_vec_byTencent_word.bin',
                                          binary=True)

print('build embedding matrix...')
new_embedMatrix = merge_glove_word2vec_embedding(word2idx, embedMatrix, model)

with open('word2idx_embedMatrix_glove_word2vec.pkl', 'wb') as f:
    pickle.dump([word2idx, new_embedMatrix], f)

#%% 3. Build the training and test sets
## no title
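# The definition of merge_glove_word2vec_embedding is not shown above. Purely
# as an assumption about what such a merge might do, the sketch below
# concatenates each word's GloVe vector with its word2vec vector,
# zero-padding words the word2vec model lacks; the idx-1 row offset assumes
# GloVe rows follow the 1-based token ids used for word2idx.
import numpy as np

def merge_glove_word2vec_embedding_sketch(word2idx, embedMatrix, model):
    w2v_dim = model.vector_size
    merged = np.zeros((len(word2idx) + 1, embedMatrix.shape[1] + w2v_dim))
    for word, idx in word2idx.items():
        w2v = model[word] if word in model else np.zeros(w2v_dim)
        merged[idx] = np.concatenate([embedMatrix[idx - 1], w2v])
    return merged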
for context_index in range(context_start_index, context_end_index):
    if center_index != context_index:
        context_word = doc_words[context_index]
        context_id = dictionary.token2id[context_word]
        print(center_id, center_word, context_id, context_word)
        matrix[center_id, context_id] += 1
print(matrix)

# Singular Value Decomposition
U, s, Vh = np.linalg.svd(matrix, full_matrices=False)

# Cluster the words in the 2-d SVD space
X = -U[:, 0:2]
labels = KMeans(n_clusters=2).fit(X).labels_
colors = ('y', 'g')

glove_model = GloVe(n=2, max_iter=200)
embeddings = glove_model.fit(matrix)
X = embeddings

mp.rcParams['font.sans-serif'] = ['SimHei']  # display Chinese characters
for word in dictionary.token2id.keys():
    i = dictionary.token2id[word]
    mp.scatter(X[i, 1], X[i, 0], c=colors[labels[i]], s=400, alpha=0.4)
    mp.text(X[i, 1], X[i, 0], word,
            ha='center', va='center', fontproperties=font)
mp.show()
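# A small illustrative add-on, not in the original: cosine similarity between
# two words in the learned 2-d GloVe space, using the same dictionary mapping.
from scipy.spatial.distance import cosine

def glove_similarity(w1, w2):
    i, j = dictionary.token2id[w1], dictionary.token2id[w2]
    return 1 - cosine(embeddings[i], embeddings[j])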