def knock_88(vec):
    """Print the 10 words most cosine-similar to "England".

    vec maps token -> word vector.  "England" itself is excluded from the
    candidates.  Output is one "token<TAB>similarity" line per hit,
    best first.
    """
    sim_dic = {}
    for token, vector in sorted(vec.items()):
        if token != "England":
            sim_dic[token] = cos_sim(vec["England"], vector)
    # Slice the ten best directly instead of the zip(..., range(0, 10)) trick.
    for token, sim in sorted(sim_dic.items(), key=lambda x: x[1], reverse=True)[:10]:
        print("{}\t{}".format(token, sim))
def knock_89(vec):
    """Print the 10 words closest to vec["Spain"] - vec["Madrid"] + vec["Athens"].

    Classic analogy query (capital-of relation).  vec maps token -> word
    vector.  Note the query words themselves are NOT excluded, matching the
    original behavior.  Output is "token<TAB>similarity", best first.
    """
    calc_vector = vec["Spain"] - vec["Madrid"] + vec["Athens"]
    sim_dic = {}
    for token, vector in sorted(vec.items()):
        sim_dic[token] = cos_sim(calc_vector, vector)
    # Slice the ten best directly instead of the zip(..., range(0, 10)) trick.
    for token, sim in sorted(sim_dic.items(), key=lambda x: x[1], reverse=True)[:10]:
        print("{}\t{}".format(token, sim))
def knock_88(vec):
    """Show the top-10 tokens by cosine similarity to the "England" vector.

    Every token except "England" itself is scored against vec["England"];
    the ten highest scores are printed as "token<TAB>similarity".
    """
    query = vec["England"]
    scores = {
        token: cos_sim(query, vector)
        for token, vector in sorted(vec.items())
        if token != "England"
    }
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    for rank, (token, sim) in enumerate(ranked):
        if rank == 10:
            break
        print("{}\t{}".format(token, sim))
def main():
    """Print the 10 words most similar to "England" in the 300-dim PPMI space.

    Reads the token->row-id dictionary and the PCA-reduced PPMI matrix from
    disk, scores every vocabulary word against the "England" row vector, and
    prints the ten best as "word<TAB>similarity".
    """
    # 単語→IDの辞書の読み込み — use a context manager so the handle is
    # closed (the original pickle.load(open(...)) leaked it).
    with open('t_id.pickle', 'rb') as f:
        t_id = pickle.load(f)
    # ppmi行列読み込み
    ppmi_mat_300 = io.loadmat('knock85_300')['ppmi_mat_300']
    # ベクトル取得
    v = ppmi_mat_300[t_id['England']]
    # 全単語とのコサイン類似度を計算
    t_list = list(t_id)
    sims = [(cos_sim(v, ppmi_mat_300[i]), t_list[i]) for i in range(len(t_id))]
    # 上位十件表示 — [-2:-12:-1] skips the best hit ("England" itself)
    # and walks down through the next ten.
    for sim, word in sorted(sims)[-2:-12:-1]:
        print(f'{word}\t{sim}')
def make_rank(word_vec, word_name, pca_matrix, ex_name=''):
    """Return the 10 (word, similarity) pairs most cosine-similar to word_vec.

    word_name maps a word to its row index in pca_matrix; ex_name names one
    word to skip (typically the query word itself).  Result is sorted by
    similarity, best first.

    The original maintained a manual top-10 buffer that was re-sorted at the
    TOP of each loop iteration, so the element swapped into slot 9 on the
    final iteration could be returned out of order.  heapq.nlargest gives a
    correctly sorted top-10 in one pass.
    """
    import heapq
    candidates = (
        (word, knock87.cos_sim(word_vec, pca_matrix[word_name[word]]))
        for word in word_name
        if word != ex_name
    )
    return heapq.nlargest(10, candidates, key=lambda pair: pair[1])
def main():
    """Print the 20 best matches for the analogy Spain - Madrid + Athens.

    Loads the token->row-id dictionary and the 300-dim PPMI matrix, forms the
    analogy vector, scores every vocabulary word against it, and prints the
    top twenty as "word<TAB>similarity".
    """
    # 単語→IDの辞書の読み込み — context manager closes the handle
    # (the original pickle.load(open(...)) leaked it).
    with open('t_id.pickle', 'rb') as f:
        t_id = pickle.load(f)
    # ppmi行列読み込み
    ppmi_mat_300 = io.loadmat('knock85_300')['ppmi_mat_300']
    # ベクトル取得
    v_spain = ppmi_mat_300[t_id['Spain']]
    v_madrid = ppmi_mat_300[t_id['Madrid']]
    v_athens = ppmi_mat_300[t_id['Athens']]
    v = v_spain - v_madrid + v_athens
    # 全単語とのコサイン類似度を計算
    t_list = list(t_id)
    sims = [(cos_sim(v, ppmi_mat_300[i]), t_list[i]) for i in range(len(t_id))]
    # 上位十件表示 — [-1:-21:-1] walks the 20 highest scores, best first
    # (query words are not excluded here, matching the original).
    for sim, word in sorted(sims)[-1:-21:-1]:
        print(f'{word}\t{sim}')
from gensim.models import word2vec
import pickle
import sys

sys.path.append("/Users/Yukio/work/100kcnok2016/yukio/chapter09/")
from knock87 import cos_sim

# Word vectors from knock 85 (PPMI/PCA) and knock 90 (word2vec).
with open("../chapter09/word_vec_85.pickle", "rb") as f:
    vec_85 = pickle.load(f)
vec_90 = word2vec.Word2Vec.load("word2vec")

# Score every WordSim-353 pair with both vector sets.  All three file
# handles go through context managers — the original left the two result
# files and combined.tab unclosed on any exception.
with open("combined.tab") as tab, \
        open("result_94_v85.txt", "w") as f_85, \
        open("result_94_v90.txt", "w") as f_90:
    for i, line in enumerate(tab):
        if i == 0:  # skip the header line
            continue
        word1, word2, val = line.strip().split()
        if word1 in vec_85 and word2 in vec_85:
            f_85.write("{}\t{}\n".format(
                line.strip(), cos_sim(vec_85[word1], vec_85[word2])))
        if word1 in vec_90.vocab.keys() and word2 in vec_90.vocab.keys():
            f_90.write("{}\t{}\n".format(
                line.strip(), cos_sim(vec_90[word1], vec_90[word2])))
import pickle
import sys

from gensim.models import word2vec

sys.path.append("/Users/Yukio/work/100kcnok2016/yukio/chapter09/")
from knock87 import cos_sim

# NOTE: the original used pickle and word2vec without importing them,
# which raises NameError at runtime; the imports above fix that.
with open("../chapter09/word_vec_85.pickle", "rb") as f:
    vec_85 = pickle.load(f)
vec_90 = word2vec.Word2Vec.load("word2vec")

# Answer each analogy "word1 : word2 = word3 : ?" with both vector sets.
# Context managers close every file even on an exception.
with open("data_91.txt") as data, \
        open("result_92_v85.txt", "w") as f_85, \
        open("result_92_v90.txt", "w") as f_90:
    for line in data:
        word1, word2, word3, word4 = line.strip().split()
        if word1 in vec_85 and word2 in vec_85 and word3 in vec_85:
            # Analogy vector, computed once instead of per candidate.
            target = vec_85[word2] - vec_85[word1] + vec_85[word3]
            sim_dic = {}
            for token, vector in sorted(vec_85.items()):
                sim_dic[token] = cos_sim(target, vector)
            # Only the single best candidate is written (the original's
            # zip(..., range(0, 1)) truncation).
            best = sorted(sim_dic.items(), key=lambda x: x[1], reverse=True)[:1]
            for word, sim in best:
                f_85.write("{} {} {}\n".format(line.strip(), word, sim))
        if word1 in vec_90.vocab.keys() and word2 in vec_90.vocab.keys() \
                and word3 in vec_90.vocab.keys():
            result = vec_90.most_similar(positive=[word2, word3],
                                         negative=[word1])
            for word, sim in result[:1]:
                f_90.write("{} {} {}\n".format(line.strip(), word, sim))
# NOTE(review): this chunk starts mid-file — pickle, word2vec and cos_sim are
# presumably imported above this point; confirm against the full file.
with open("../chapter09/word_vec_85.pickle", "rb") as f:
    vec_85 = pickle.load(f)
vec_90 = word2vec.Word2Vec.load("word2vec")

# Answer each analogy "word1 : word2 = word3 : ?" with both vector sets.
# Context managers close every file even on an exception (the original
# left result_92_v85.txt, result_92_v90.txt and data_91.txt unclosed
# on any error).
with open("data_91.txt") as data, \
        open("result_92_v85.txt", "w") as f_85, \
        open("result_92_v90.txt", "w") as f_90:
    for line in data:
        word1, word2, word3, word4 = line.strip().split()
        if word1 in vec_85 and word2 in vec_85 and word3 in vec_85:
            # Analogy vector, computed once instead of per candidate.
            target = vec_85[word2] - vec_85[word1] + vec_85[word3]
            sim_dic = {}
            for token, vector in sorted(vec_85.items()):
                sim_dic[token] = cos_sim(target, vector)
            # Only the single best candidate is written.
            best = sorted(sim_dic.items(), key=lambda x: x[1], reverse=True)[:1]
            for word, sim in best:
                f_85.write("{} {} {}\n".format(line.strip(), word, sim))
        if word1 in vec_90.vocab.keys() and word2 in vec_90.vocab.keys() \
                and word3 in vec_90.vocab.keys():
            result = vec_90.most_similar(positive=[word2, word3],
                                         negative=[word1])
            for word, sim in result[:1]:
                f_90.write("{} {} {}\n".format(line.strip(), word, sim))
import pickle

import knock87
import knock90
from gensim.models import word2vec

# NOTE: the original used pickle and knock87 without importing them,
# which raises NameError at runtime; the imports above fix that.

if __name__ == '__main__':
    model = knock90.load_emb()
    sim_file = 'wordsim353/combined.tab'
    # PCA-reduced PPMI vectors from chapter 09.
    with open('../chapter09/word_context_pca.dump', 'rb') as john:
        word_name, pca_matrix = pickle.load(john)

    # WordSim-353 similarities with the word2vec model.
    with open(sim_file) as i_f, open('similarity.90', 'w') as o_f:
        for i, line in enumerate(i_f):
            if i == 0:  # skip the header line
                continue
            one, zwei, mean = line.strip().split()
            if not (one in model and zwei in model):
                continue
            o_f.write('{}\t{}\n'.format(
                line.strip(), knock87.cos_sim(model[one], model[zwei])))

    # Same pairs scored with the PCA/PPMI vectors.
    with open(sim_file) as i_f, open('similarity.85', 'w') as o_f:
        for i, line in enumerate(i_f):
            if i == 0:  # skip the header line
                continue
            one, zwei, mean = line.strip().split()
            if not (one in word_name and zwei in word_name):
                continue
            vec_one = pca_matrix[word_name[one]]
            vec_zwei = pca_matrix[word_name[zwei]]
            o_f.write('{}\t{}\n'.format(line.strip(),
                                        knock87.cos_sim(vec_one, vec_zwei)))