def print_ppmi(line: str, t1: str, t2: str, f):
    """Write ``line`` and the PPMI cosine similarity of ``t1`` and ``t2`` to ``f``.

    The output is ``line`` with trailing whitespace stripped, a tab, and the
    value returned by ``cosine_similarity`` for the two vectors found via
    ``ppmi[ppmi_index[t1]]`` and ``ppmi[ppmi_index[t2]]``.

    If a ``KeyError`` is raised (presumably because a word is missing from
    ``ppmi_index``), ``-`` is written in place of the similarity.

    Args:
        line: Input line to echo back (trailing whitespace is removed).
        t1: First word.
        t2: Second word.
        f: Writable file-like object passed to ``print(..., file=f)``.
    """
    prefix = line.rstrip()
    try:
        first_vec = ppmi[ppmi_index[t1]]
        second_vec = ppmi[ppmi_index[t2]]
        similarity = cosine_similarity(first_vec, second_vec)
        print(f"{prefix}\t{similarity}", file=f)
    except KeyError:
        # No vector available for at least one of the words.
        print(f"{prefix}\t-", file=f)
def print_w2v(line: str, t1: str, t2: str, f):
    """Write ``line`` and the word2vec cosine similarity of ``t1`` and ``t2`` to ``f``.

    The output is ``line`` with trailing whitespace stripped, a tab, and the
    value returned by ``cosine_similarity`` for the two vectors found via
    ``w2v[w2v_index[t1]]`` and ``w2v[w2v_index[t2]]``.

    If a ``KeyError`` is raised (presumably because a word is missing from
    ``w2v_index``), ``-`` is written in place of the similarity.

    Args:
        line: Input line to echo back (trailing whitespace is removed).
        t1: First word.
        t2: Second word.
        f: Writable file-like object passed to ``print(..., file=f)``.
    """
    prefix = line.rstrip()
    try:
        first_vec = w2v[w2v_index[t1]]
        second_vec = w2v[w2v_index[t2]]
        similarity = cosine_similarity(first_vec, second_vec)
        print(f"{prefix}\t{similarity}", file=f)
    except KeyError:
        # No vector available for at least one of the words.
        print(f"{prefix}\t-", file=f)
def similar_list(mat: M, t_idx: Dict[str, int], v: np.ndarray) -> Rank:
    """Rank every word in ``t_idx`` by its cosine similarity to ``v``.

    Args:
        mat: Row-indexable matrix; ``mat[row]`` is passed to
            ``cosine_similarity`` as the vector of a word.
        t_idx: Mapping from word to the row of ``mat`` holding its vector.
        v: Query vector that every row is compared against.

    Returns:
        A list of ``(word, similarity)`` pairs covering all of ``t_idx``,
        sorted by descending similarity. The list is not truncated; callers
        slice out the range they need.
    """
    # Use the row index stored in t_idx for each word instead of assuming
    # that the dict's iteration order coincides with the row order of mat.
    scored = (
        (word, cosine_similarity(v, mat[row])) for word, row in t_idx.items()
    )
    return sorted(scored, key=itemgetter(1), reverse=True)
def main():
    """Print up to ten words ranked just below the top hit for "England".

    Loads the matrix stored under the ``"knock85.matrix"`` key of the
    ``knock85.matrix`` MAT-file and the pickled object stored in Redis under
    ``"knock83.t"``. The i-th key of that object is treated as the label of
    row ``i`` of the matrix. Each output line is ``word<TAB>similarity``.

    Raises:
        KeyError: If "England" is not among the keys.
    """
    r = redis.Redis(host="localhost", port=6379, db=0)
    matrix = loadmat("knock85.matrix")["knock85.matrix"]
    # NOTE(review): pickle.loads can execute arbitrary code contained in the
    # payload; this is only safe while the Redis instance is fully trusted.
    t = pickle.loads(r.get("knock83.t"))
    t_keys = list(t.keys())
    t_index = OrderedDict((key, i) for i, key in enumerate(t_keys))

    eng = matrix[t_index["England"]]
    ranking = sorted(
        ((word, cosine_similarity(eng, matrix[i])) for i, word in enumerate(t_keys)),
        key=itemgetter(1),
        reverse=True,
    )

    # The top entry is skipped (presumably the query word itself). Slicing
    # instead of indexing 1..10 avoids an IndexError on a small vocabulary.
    for word, score in ranking[1:11]:
        print(f"{word}\t{score}")
def main():
    """Print up to ten words ranked just below the top hit for Spain - Madrid + Athens.

    Loads the matrix stored under the ``"knock85.matrix"`` key of the
    ``knock85.matrix`` MAT-file and the pickled object stored in Redis under
    ``"knock83.t"``. The i-th key of that object is treated as the label of
    row ``i`` of the matrix. Each output line is ``word<TAB>similarity``.

    Raises:
        KeyError: If "Spain", "Madrid" or "Athens" is not among the keys.
    """
    r = redis.Redis(host="localhost", port=6379, db=0)
    matrix = loadmat("knock85.matrix")["knock85.matrix"]
    # NOTE(review): pickle.loads can execute arbitrary code contained in the
    # payload; this is only safe while the Redis instance is fully trusted.
    t = pickle.loads(r.get("knock83.t"))
    t_keys = list(t.keys())
    t_index = OrderedDict((key, i) for i, key in enumerate(t_keys))

    # Spain - Madrid (its capital) + Athens (the capital of Greece)
    vec = (
        matrix[t_index["Spain"]]
        - matrix[t_index["Madrid"]]
        + matrix[t_index["Athens"]]
    )
    ranking = sorted(
        ((word, cosine_similarity(vec, matrix[i])) for i, word in enumerate(t_keys)),
        key=itemgetter(1),
        reverse=True,
    )

    # NOTE(review): the top-ranked entry is always skipped, as in the original;
    # for a composed query vector it is not guaranteed to be one of the input
    # words - confirm this is intended. Slicing instead of indexing 1..10
    # avoids an IndexError on a small vocabulary.
    for word, score in ranking[1:11]:
        print(f"{word}\t{score}")
def main():
    """Run the word2vec exercises knock86 through knock89 and print the results.

    Calls ``load_word2vec`` on ``../data/w2v.txt``, then obtains ``matrix``
    and ``t_index`` through ``deserialize`` (presumably the data prepared by
    ``load_word2vec`` - confirm against those helpers). It then prints:

    * the cosine similarity between the "U.S" and "United_States" vectors,
    * entries 1..10 of ``similar_list`` for the "England" vector,
    * entries 1..10 of ``multi_vec`` for "Spain", "Madrid", "Athens".
    """

    def _print_ranking(ranking):
        # Show entries 1..10; the first entry of the ranking is skipped.
        for entry in ranking[1:11]:
            print(f"{entry[0].ljust(10)} : {entry[1]}")

    word2vec_filepath = "../data/w2v.txt"
    load_word2vec(word2vec_filepath)
    matrix = deserialize("matrix")
    t_index = deserialize("t_index")

    # knock86
    united_states_vec = matrix[t_index["United_States"]]

    # knock87
    us_vec = matrix[t_index["U.S"]]
    print(cosine_similarity(us_vec, united_states_vec))

    # knock88
    england_vec = matrix[t_index["England"]]
    _print_ranking(similar_list(matrix, t_index, england_vec))

    # knock89
    _print_ranking(multi_vec(matrix, t_index, "Spain", "Madrid", "Athens"))