#!/usr/bin/env python
# Demo: build a co-occurrence matrix for a one-sentence corpus and print the
# cosine similarity between the rows of a few word pairs.
#
# The statements were previously all on the shebang line, which made the
# interpreter treat the entire script as a comment; they are restored to one
# statement per line here.
import sys

# Make the parent directory importable so the project-local `common` package
# resolves when this script is run from its own directory.
sys.path.append('..')

from common.util import preprocess, create_co_matrix, cos_similarity, most_similar, ppmi
import numpy as np

text = 'You say goodbye and I say hello.'.lower()
corpus, word_to_id, id_to_word = preprocess(text)
print(corpus)
print(id_to_word)

vocabulary_size = len(word_to_id)
C = create_co_matrix(corpus=corpus, vocabulary_size=vocabulary_size)
print(vocabulary_size)
print(C)

# Row of C for each word, looked up through its id.
vec_you = C[word_to_id['you']]
vec_i = C[word_to_id['i']]
vec_hello = C[word_to_id['hello']]
vec_say = C[word_to_id['say']]
vec_goodbye = C[word_to_id['goodbye']]
vec_and = C[word_to_id['and']]

print('you, i')
print(cos_similarity(vec_you, vec_i))

print('you, hello')
print(cos_similarity(vec_you, vec_hello))

# NOTE(review): this label is not followed by a similarity value, and
# vec_say / vec_goodbye / vec_and are never used -- the snippet looks cut
# short here. Confirm against the full script before extending it.
print('you, say')
# cos_similarity below relies on `np`; nothing in this snippet imported it,
# so bring it into scope explicitly.
import numpy as np


def cos_similarity(x, y, eps=1e-8):
    """Return the cosine similarity of vectors *x* and *y*.

    Each vector is divided by the square root of (its sum of squares plus
    *eps*) and the dot product of the two scaled vectors is returned.

    Args:
        x: First vector (used with ``**``, ``/`` and ``np.sum``/``np.dot``).
        y: Second vector, same requirements as *x*.
        eps: Small constant added under the square root so an all-zero
            vector does not cause a division by zero.

    Returns:
        The dot product of the two normalized vectors.
    """
    nx = x / np.sqrt(np.sum(x**2) + eps)
    ny = y / np.sqrt(np.sum(y**2) + eps)
    return np.dot(nx, ny)


import sys

# Make the parent directory importable so the project-local `common` package
# resolves when this script is run from its own directory.
sys.path.append('..')

# NOTE: this import rebinds the name `cos_similarity`, so the call below uses
# the version from common.util rather than the definition above.
from common.util import preprocess, create_co_matrix, cos_similarity

text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)
C = create_co_matrix(corpus, vocab_size)

c0 = C[word_to_id['you']]
c1 = C[word_to_id['i']]
print(cos_similarity(c0, c1))


def most_similar(query, word_to_id, id_to_word, word_matrix, top=5):
    """Look up *query* and fetch its row from *word_matrix*.

    Prints a "not found" message and returns early when *query* is not a key
    of *word_to_id*; otherwise prints a header line for the query.

    NOTE(review): the body visible here stops after fetching the query
    vector -- `id_to_word`, `top` and `query_vec` are not used yet and
    nothing is ranked or returned. The remainder of the function appears to
    be missing from this snippet; it is intentionally not reconstructed.
    """
    if query not in word_to_id:
        print('%s is not found' % query)
        return

    print('\n[query] ' + query)
    query_id = word_to_id[query]
    query_vec = word_matrix[query_id]
# Demo: compare the co-occurrence rows of "say" and "goodbye", then list the
# words most similar to "hello".
#
# The statements were previously collapsed onto a single line (a syntax
# error); they are restored to one statement per line here.
import sys

# Make the parent directory importable so the project-local `common` package
# resolves when this script is run from its own directory.
sys.path.append('..')

import numpy as np
from common.util import preprocess, create_co_matrix, cos_similarity, most_similar

# NOTE(review): this sentence has no "and", unlike the corpus used by the
# other snippets in this file -- confirm that is intentional.
text = 'You say goodbye I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)
C = create_co_matrix(corpus, vocab_size)

# Row of C for each word, looked up through its id.
c0 = C[word_to_id['say']]
c1 = C[word_to_id['goodbye']]
print(cos_similarity(c0, c1))

most_similar('hello', word_to_id, id_to_word, C, top=5)