Example #1
def knock_88(vec):
    # Cosine similarity between "England" and every other word vector.
    sim_dic = {}
    for token, vector in sorted(vec.items()):
        if token != "England":
            sim_dic[token] = cos_sim(vec["England"], vector)

    # Print the ten most similar words, highest similarity first.
    for token, sim in sorted(sim_dic.items(), key=lambda x: x[1], reverse=True)[:10]:
        print("{}\t{}".format(token, sim))
Example #2
def knock_89(vec):
    # Analogy vector: vec("Spain") - vec("Madrid") + vec("Athens").
    calc_vector = vec["Spain"] - vec["Madrid"] + vec["Athens"]
    sim_dic = {}
    for token, vector in sorted(vec.items()):
        sim_dic[token] = cos_sim(calc_vector, vector)

    # Print the ten words most similar to the analogy vector.
    for token, sim in sorted(sim_dic.items(), key=lambda x: x[1], reverse=True)[:10]:
        print("{}\t{}".format(token, sim))
Example #3
def knock_88(vec):
    sim_dic = {}
    for token, vector in sorted(vec.items()):
        if token != "England":
            sim_dic[token] = cos_sim(vec["England"], vector)

    for token, sim in sorted(
            sim_dic.items(), key=lambda x: x[1], reverse=True)[:10]:
        print("{}\t{}".format(token, sim))
Example #4
import pickle
from scipy import io  # assumed: io refers to scipy.io, which provides loadmat
# cos_sim is assumed to be defined elsewhere in the same script (cf. knock87).


def main():
    # Load the word -> ID dictionary.
    t_id = pickle.load(open('t_id.pickle', 'rb'))
    # Load the 300-dimensional PPMI matrix.
    ppmi_mat_300 = io.loadmat('knock85_300')['ppmi_mat_300']
    # Get the vector for "England".
    v = ppmi_mat_300[t_id['England']]
    # Compute the cosine similarity with every word.
    t_list = list(t_id)
    sims = [(cos_sim(v, ppmi_mat_300[i]), t_list[i]) for i in range(len(t_id))]
    # Print the top ten, skipping the top hit ("England" itself).
    for sim, word in sorted(sims)[-2:-12:-1]:
        print(f'{word}\t{sim}')
Example #5
def make_rank(word_vec, word_name, pca_matrix, ex_name=''):
    # Collect the ten words whose PCA vectors are most similar to word_vec,
    # optionally excluding ex_name (e.g. the query word itself).
    top_list = list()
    for word in word_name:
        if word == ex_name:
            continue
        v = pca_matrix[word_name[word]]
        temp_sim = knock87.cos_sim(word_vec, v)
        if len(top_list) < 10:
            top_list.append((word, temp_sim))
        elif top_list[9][1] < temp_sim:
            # Replace the current weakest of the ten entries.
            top_list[9] = (word, temp_sim)
        # Keep the list sorted by similarity (descending) so index 9 is always
        # the weakest entry and the returned ranking is in order.
        top_list.sort(key=lambda x: x[1], reverse=True)
    return top_list
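A hypothetical call site for make_rank, assuming knock87 is importable and the (word_name, pca_matrix) pair pickled in chapter09; the file path and the query word "England" are taken from the other examples here, purely for illustration:

import pickle

with open('../chapter09/word_context_pca.dump', 'rb') as f:
    word_name, pca_matrix = pickle.load(f)

# Rank the ten words most similar to "England", excluding "England" itself.
query_vec = pca_matrix[word_name['England']]
for word, sim in make_rank(query_vec, word_name, pca_matrix, ex_name='England'):
    print('{}\t{}'.format(word, sim))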
Example #6
import pickle
from scipy import io  # assumed: io refers to scipy.io, which provides loadmat
# cos_sim is assumed to be defined elsewhere in the same script (cf. knock87).


def main():
    # Load the word -> ID dictionary.
    t_id = pickle.load(open('t_id.pickle', 'rb'))
    # Load the 300-dimensional PPMI matrix.
    ppmi_mat_300 = io.loadmat('knock85_300')['ppmi_mat_300']
    # Build the analogy vector vec(Spain) - vec(Madrid) + vec(Athens).
    v_spain = ppmi_mat_300[t_id['Spain']]
    v_madrid = ppmi_mat_300[t_id['Madrid']]
    v_athens = ppmi_mat_300[t_id['Athens']]
    v = v_spain - v_madrid + v_athens
    # Compute the cosine similarity with every word.
    t_list = list(t_id)
    sims = [(cos_sim(v, ppmi_mat_300[i]), t_list[i]) for i in range(len(t_id))]
    # Print the top 20 results, highest similarity first.
    for sim, word in sorted(sims)[-1:-21:-1]:
        print(f'{word}\t{sim}')
Example #7
from gensim.models import word2vec
import pickle
import sys
sys.path.append("/Users/Yukio/work/100kcnok2016/yukio/chapter09/")
from knock87 import cos_sim

with open("../chapter09/word_vec_85.pickle", "rb") as f:
    vec_85 = pickle.load(f)

vec_90 = word2vec.Word2Vec.load("word2vec")

f_85 = open("result_94_v85.txt", "w")
f_90 = open("result_94_v90.txt", "w")

# combined.tab is the WordSim-353 similarity data; the first line is a header.
for i, line in enumerate(open("combined.tab")):
    if i != 0:
        word1, word2, val = line.strip().split()

        if word1 in vec_85 and word2 in vec_85:
            f_85.write("{}\t{}\n".format(line.strip(),
                                         cos_sim(vec_85[word1],
                                                 vec_85[word2])))

        if word1 in vec_90.vocab and word2 in vec_90.vocab:
            f_90.write("{}\t{}\n".format(line.strip(),
                                         cos_sim(vec_90[word1],
                                                 vec_90[word2])))

f_85.close()
f_90.close()
Example #8
from gensim.models import word2vec
import pickle
import sys
sys.path.append("/Users/Yukio/work/100kcnok2016/yukio/chapter09/")
from knock87 import cos_sim

with open("../chapter09/word_vec_85.pickle", "rb") as f:
    vec_85 = pickle.load(f)

vec_90 = word2vec.Word2Vec.load("word2vec")

f_85 = open("result_92_v85.txt", "w")
f_90 = open("result_92_v90.txt", "w")

# data_91.txt holds the analogy questions prepared in knock91: four words per
# line, where word4 is the expected answer to "word1 : word2 = word3 : ?".
for line in open("data_91.txt"):
    word1, word2, word3, word4 = line.strip().split()

    if word1 in vec_85 and word2 in vec_85 and word3 in vec_85:
        # Similarity of every word to the analogy vector word2 - word1 + word3.
        analogy_vec = vec_85[word2] - vec_85[word1] + vec_85[word3]
        sim_dic = {}
        for token, vector in sorted(vec_85.items()):
            sim_dic[token] = cos_sim(analogy_vec, vector)
        # Keep only the single most similar word.
        word, sim = max(sim_dic.items(), key=lambda x: x[1])
        f_85.write("{} {} {}\n".format(line.strip(), word, sim))

    if word1 in vec_90.vocab and word2 in vec_90.vocab and word3 in vec_90.vocab:
        # most_similar already returns (word, similarity) pairs sorted by similarity.
        word, sim = vec_90.most_similar(positive=[word2, word3], negative=[word1])[0]
        f_90.write("{} {} {}\n".format(line.strip(), word, sim))

f_85.close()
f_90.close()
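Since each line written above contains the four question words followed by the predicted word and its similarity, analogy accuracy can be estimated by comparing the fourth column (the expected word) with the fifth (the prediction). A hedged sketch; the file name result_92_v90.txt and the column positions are assumptions read off the write calls above:

total = 0
correct = 0
with open("result_92_v90.txt") as f:
    for line in f:
        cols = line.split()
        # cols[3] is the expected word (word4), cols[4] the predicted word.
        total += 1
        correct += (cols[3] == cols[4])
print(correct / total if total else 0.0)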

Example #9
with open("../chapter09/word_vec_85.pickle", "rb") as f:
    vec_85 = pickle.load(f)

vec_90 = word2vec.Word2Vec.load("word2vec")

f_85 = open("result_92_v85.txt", "w")
f_90 = open("result_92_v90.txt", "w")

for line in open("data_91.txt"):
    word1, word2, word3, word4 = line.strip().split()

    if word1 in vec_85 and word2 in vec_85 and word3 in vec_85:
        sim_dic = {}
        for token, vector in sorted(vec_85.items()):
            sim_dic[token] = cos_sim(
                vec_85[word2] - vec_85[word1] + vec_85[word3], vector)
        for (word, sim), i in zip(
                sorted(sim_dic.items(), key=lambda x: x[1], reverse=True),
                range(0, 1)):
            f_85.write("{} {} {}\n".format(line.strip(), word, sim))

    if word1 in vec_90.vocab and word2 in vec_90.vocab and word3 in vec_90.vocab:
        result = vec_90.most_similar(positive=[word2, word3], negative=[word1])
        for (word, sim), i in zip(result, range(0, 1)):
            f_90.write("{} {} {}\n".format(line.strip(), word, sim))

f_85.close()
f_90.close()
Example #10
from gensim.models import word2vec
import pickle
import sys
sys.path.append("/Users/Yukio/work/100kcnok2016/yukio/chapter09/")
from knock87 import cos_sim

with open("../chapter09/word_vec_85.pickle", "rb") as f:
    vec_85 = pickle.load(f)

vec_90 = word2vec.Word2Vec.load("word2vec")

f_85 = open("result_94_v85.txt", "w")
f_90 = open("result_94_v90.txt", "w")

for i, line in enumerate(open("combined.tab")):
    if i != 0:
        word1, word2, val = line.strip().split()

        if word1 in vec_85 and word2 in vec_85:
            f_85.write("{}\t{}\n".format(line.strip(), cos_sim(vec_85[word1], vec_85[word2])))

        if word1 in vec_90.vocab and word2 in vec_90.vocab:
            f_90.write("{}\t{}\n".format(line.strip(), cos_sim(vec_90[word1], vec_90[word2])))

f_85.close()
f_90.close()

Example #11
import pickle

import knock87
import knock90
from gensim.models import word2vec

if __name__ == '__main__':
    model = knock90.load_emb()
    sim_file = 'wordsim353/combined.tab'
    # (word -> row index, PCA-reduced matrix) pair saved in chapter09.
    with open('../chapter09/word_context_pca.dump', 'rb') as john:
        word_name, pca_matrix = pickle.load(john)

    # Evaluate the knock90 word2vec model: each output line is the original
    # "word1 word2 human-rating" record plus the model's cosine similarity.
    with open(sim_file) as i_f, open('similarity.90', 'w') as o_f:
        for i, line in enumerate(i_f):
            if i == 0:
                continue
            one, zwei, mean = line.strip().split()
            if not (one in model and zwei in model):
                continue
            o_f.write('{}\t{}\n'.format(
                line.strip(), knock87.cos_sim(model[one], model[zwei])))

    # The same evaluation with the chapter09 PCA vectors.
    with open(sim_file) as i_f, open('similarity.85', 'w') as o_f:
        for i, line in enumerate(i_f):
            if i == 0:
                continue
            one, zwei, mean = line.strip().split()
            if not (one in word_name and zwei in word_name):
                continue
            vec_one = pca_matrix[word_name[one]]
            vec_zwei = pca_matrix[word_name[zwei]]
            o_f.write('{}\t{}\n'.format(line.strip(),
                                        knock87.cos_sim(vec_one, vec_zwei)))
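The similarity files written above pair each human rating with a cosine similarity, so a natural next step is a rank correlation. A minimal sketch using scipy, assuming the tab-separated layout produced by the write calls above; the file name similarity.90 is taken from this example, and any of the other output files would work the same way:

from scipy.stats import spearmanr

human, predicted = [], []
with open('similarity.90') as f:
    for line in f:
        # Each line: word1, word2, human rating, cosine similarity (tab-separated).
        cols = line.strip().split('\t')
        human.append(float(cols[2]))
        predicted.append(float(cols[3]))

rho, pvalue = spearmanr(human, predicted)
print('Spearman rho: {}'.format(rho))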