def test_cos_similarity(self):
    """cos_similarity on the demo corpus should match the reference values."""
    text = 'you say goodbye and I say hello.'
    corpus, w2id, id2w = preprocess(text)
    vocab_size = len(w2id)
    C = create_co_matrix(corpus, vocab_size)
    c0 = C[w2id['you']]
    c1 = C[w2id['i']]
    expected_c0 = 0.9999999800000005
    expected_c1 = 0.7071067691154799
    # Floating-point results must not be compared with exact equality:
    # assertEqual can fail from platform/library rounding differences.
    # assertAlmostEqual (default: 7 decimal places) is the correct check.
    self.assertAlmostEqual(cos_similarity(c0, c0), expected_c0)
    self.assertAlmostEqual(cos_similarity(c0, c1), expected_c1)
def most_similar(query, word_to_id, id_to_word, word_matrix, top=5):
    '''Search for the words most similar to a query word.

    :param query: query word (text)
    :param word_to_id: dictionary mapping word -> word ID
    :param id_to_word: dictionary mapping word ID -> word
    :param word_matrix: matrix holding the word vectors; row i is assumed
        to be the vector for word ID i
    :param top: how many of the highest-ranked words to display
    '''
    if query not in word_to_id:
        print('%s is not found' % query)
        return

    print('\n[query] ' + query)
    query_vec = word_matrix[word_to_id[query]]

    # Cosine similarity between the query vector and every word vector.
    vocab_size = len(id_to_word)
    similarity = np.zeros(vocab_size)
    for word_id in range(vocab_size):
        similarity[word_id] = cos_similarity(word_matrix[word_id], query_vec)

    # argsort sorts ascending, so negate the scores to walk from the most
    # similar word down; skip the query itself.
    shown = 0
    for word_id in (-1 * similarity).argsort():
        if id_to_word[word_id] == query:
            continue
        print(' %s: %s' % (id_to_word[word_id], similarity[word_id]))
        shown += 1
        if shown >= top:
            return
def test_cos_similarity(self):
    """cos_similarity: zero vectors give ~0, parallel vectors give ~1."""
    cases = [
        (np.array([0, 0]), np.array([0, 0]), 0),
        (np.array([3, 3, 3, 3]), np.array([4, 4, 4, 4]), 1),
    ]
    for x, y, expected in cases:
        self.assertAlmostEqual(cos_similarity(x, y), expected)
def most_similar(query, word_to_id, id_to_word, word_matrix, top=5):
    """Print up to `top` words whose row vectors in `word_matrix` are most
    similar (by cosine similarity) to the vector of `query`."""
    if query not in word_to_id:
        print('%s is not found' % query)
        return

    print('\n[query] ' + query)
    query_vec = word_matrix[word_to_id[query]]

    # Score every vocabulary word against the query vector.
    n_words = len(id_to_word)
    similarity = np.zeros(n_words)
    for word_id in range(n_words):
        similarity[word_id] = cos_similarity(word_matrix[word_id], query_vec)

    # Negating the scores makes ascending argsort yield descending similarity.
    printed = 0
    for word_id in (-similarity).argsort():
        if id_to_word[word_id] == query:
            continue  # never list the query word itself
        print(' %s: %s' % (id_to_word[word_id], similarity[word_id]))
        printed += 1
        if printed >= top:
            return
# coding: utf-8
import sys
sys.path.append('..')
from common.util import preprocess, create_co_matrix, cos_similarity

# Measure how similar "you" and "i" are in a toy corpus, using
# co-occurrence count vectors compared by cosine similarity.
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
co_matrix = create_co_matrix(corpus, len(word_to_id))

vec_you = co_matrix[word_to_id['you']]  # word vector for "you"
vec_i = co_matrix[word_to_id['i']]      # word vector for "i"
print(cos_similarity(vec_you, vec_i))
# NOTE(review): C and word_to_id are defined in earlier notebook cells that are
# not visible here — these cells must run after them.
print(C[0])
print(C[word_to_id["goodbye"]])

#%% [markdown]
# ### Similarity between vectors
# Measure the similarity of vectors taken from the co-occurrence matrix.
# Dot products and Euclidean distance also exist, but for word-vector
# representations, cosine similarity is the usual choice:
# $$
# similarity(x, y) = \frac{xy}{||x|| ||y||} = \frac{x_1y_1 + ... + x_ny_n}{\sqrt{x_1^2 + ... + x_n^2} \sqrt{y_1^2 + ... + y_n^2}}
# $$

#%%
from common import util

text = "You say goodbye and I say hello."
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)
C = util.create_co_matrix(corpus, vocab_size)
c0 = C[word_to_id["you"]]  # word vector for "you"
c1 = C[word_to_id["i"]]    # word vector for "i"
print(c0, c1)
print(util.cos_similarity(c0, c1))

#%%
# argsort returns the indices that would sort the array ascending,
# e.g. [100, -20, 2] -> [1, 2, 0].
import numpy as np
x = np.array([100, -20, 2])
print(x.argsort())
# coding: utf-8
import sys
sys.path.append(r'C:\Users\pc\Desktop\고영국\개발\AI\DeepLearning\Scratch2')
from common.util import preprocess, create_co_matrix, cos_similarity

# Compare the co-occurrence vectors of "you" and "i" via cosine similarity.
# NOTE(review): 'goodbey' looks like a typo for 'goodbye', but it is corpus
# data, so changing it would change the computed matrix — confirm intent.
text = 'You say goodbey and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
co_matrix = create_co_matrix(corpus, len(word_to_id))

vec_you = co_matrix[word_to_id['you']]  # word vector for "you"
vec_i = co_matrix[word_to_id['i']]      # word vector for "i"

# ~0.70...; the closer to 1, the more similar the words
print(cos_similarity(vec_you, vec_i))
# -*- coding: utf-8 -*-
"""
Created on Sat Jan 23 00:36:44 2021

@author: ghqls
"""
import sys
sys.path.append('..')
from common.util import preprocess, create_co_matrix, cos_similarity

# Build the co-occurrence matrix for the toy corpus and print the cosine
# similarity of the "you" and "i" word vectors.
text = "You say goodbye and I say hello."
corpus, word_to_id, id_to_word = preprocess(text)
co_matrix = create_co_matrix(corpus, len(word_to_id))

vec_you = co_matrix[word_to_id['you']]
vec_i = co_matrix[word_to_id['i']]
print(cos_similarity(vec_you, vec_i))
# NOTE(review): corpus, word_to_id, id_to_word, create_co_matrix,
# cos_similarity, most_similar, ppmi and np are expected to be defined
# earlier in this file/session — not visible in this fragment.
vocabulary_size = len(word_to_id)
C = create_co_matrix(corpus=corpus, vocabulary_size=vocabulary_size)
print(vocabulary_size)
print(C)

# Cosine similarity of "you" against every other content word, printed
# as a "you, <word>" header followed by the score (same output as the
# unrolled print pairs).
vec_you = C[word_to_id['you']]
for other in ('i', 'hello', 'say', 'goodbye', 'and'):
    print('you, %s' % other)
    print(cos_similarity(vec_you, C[word_to_id[other]]))

most_similar('you', word_to_id, id_to_word, C, top=5)

# Reweight the raw co-occurrence counts with positive PMI.
W = ppmi(C, verbose=True)
np.set_printoptions(precision=3)
print('covariance matrix')
print(C)
# NOTE(review): this fragment starts mid-function — the enclosing def of
# create_co_occurence_matrix (and the outer loop that binds idx/word_id,
# plus matrix/corpus_size/window_size) is outside this view. Indentation
# below is reconstructed on the assumption that the window loop sits inside
# a loop over corpus positions — confirm against the full source.
        # Count co-occurrences within `window_size` positions on each side,
        # clipping at both ends of the corpus.
        for i in range(1, window_size + 1):
            left_idx = idx - i
            right_idx = idx + i
            if left_idx >= 0:
                left_word_id = corpus[left_idx]
                matrix[word_id, left_word_id] += 1
            if right_idx < corpus_size:
                right_word_id = corpus[right_idx]
                matrix[word_id, right_word_id] += 1
    return matrix


if __name__ == '__main__':
    # Demo: build the co-occurrence matrix for a toy sentence and compare
    # a few word vectors by cosine similarity.
    text = "You say goodbye and I say hello."
    corpus, word_to_id, id_to_word = preprocess(text)
    print("corpus:", corpus)
    co_matrix = create_co_occurence_matrix(corpus, len(word_to_id))
    print(co_matrix)

    # Compute some cosine similarities.
    print('cosine similarities...')
    vec_you = co_matrix[word_to_id['you']]
    vec_I = co_matrix[word_to_id['i']]
    print('you and i:', cos_similarity(vec_you, vec_I))
    vec_hello = co_matrix[word_to_id['hello']]
    # This equals the you-and-i score because the corpus is too small.
    print('you and hello:', cos_similarity(vec_you, vec_hello))
    print('i and hello:', cos_similarity(vec_I, vec_hello))