import sys
sys.path.append('..')
import numpy as np
from common.util import preprocess, create_co_matrix, ppmi


def main():
    text = 'You say goodbye and I say hello.'
    corpus, word_to_id, id_to_word = preprocess(text)
    vocab_size = len(word_to_id)
    C = create_co_matrix(corpus, vocab_size)
    W = ppmi(C)

    np.set_printoptions(precision=3)  # display to 3 significant digits
    print('co-occurrence matrix')
    print(C)
    print('-' * 50)
    print('PPMI')
    print(W)


if __name__ == '__main__':
    main()
import sys
sys.path.append('..')
import unittest
import numpy as np
from common.util import preprocess, create_co_matrix, ppmi


class TestUtil(unittest.TestCase):
    def test_ppmi(self):
        text = 'you say goodbye and I say hello.'
        corpus, w2id, id2w = preprocess(text)
        vocab_size = len(w2id)
        C = create_co_matrix(corpus, vocab_size)
        W = ppmi(C)
        W = np.around(W, 3)
        expected = np.array([[0., 1.807, 0., 0., 0., 0., 0.],
                             [1.807, 0., 0.807, 0., 0.807, 0.807, 0.],
                             [0., 0.807, 0., 1.807, 0., 0., 0.],
                             [0., 0., 1.807, 0., 1.807, 0., 0.],
                             [0., 0.807, 0., 1.807, 0., 0., 0.],
                             [0., 0.807, 0., 0., 0., 0., 2.807],
                             [0., 0., 0., 0., 0., 2.807, 0.]])
        np.testing.assert_array_almost_equal(W, expected)


if __name__ == '__main__':
    unittest.main()
import sys
sys.path.append('..')
import numpy as np
import matplotlib.pyplot as plt
from common.util import preprocess, create_co_matrix, ppmi


def main():
    text = 'You say goodbye and I say hello.'
    corpus, word_to_id, id_to_word = preprocess(text)
    vocab_size = len(word_to_id)
    C = create_co_matrix(corpus, vocab_size, window_size=1)
    W = ppmi(C)

    # SVD
    U, S, V = np.linalg.svd(W)

    print(C[0])      # co-occurrence matrix
    print(W[0])      # PPMI matrix
    print(U[0])      # SVD
    print(U[0, :2])  # first two components = the 2-D word vector

    for word, word_id in word_to_id.items():
        plt.annotate(word, (U[word_id, 0], U[word_id, 1]))
    plt.scatter(U[:, 0], U[:, 1], alpha=0.5)
    plt.show()


if __name__ == '__main__':
    main()
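# Sanity check for the SVD step above (a minimal, self-contained sketch,
# not part of the original script): np.linalg.svd factors a matrix as
# W == U @ diag(S) @ V, so the product should reconstruct W exactly, and
# truncating U's columns gives the dense low-dimensional word vectors
# that the scatter plot visualizes.
import numpy as np

W = np.array([[0., 1.807, 0.],
              [1.807, 0., 0.807],
              [0., 0.807, 0.]])  # toy PPMI-like matrix for illustration
U, S, V = np.linalg.svd(W)
assert np.allclose(np.dot(U * S, V), W)  # U * S scales U's columns by the singular values
word_vecs = U[:, :2]  # keep the two directions with the largest singular values
print(word_vecs)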
import sys
sys.path.append('..')
import numpy as np
from common.util import preprocess, create_co_matrix, cos_similarity, ppmi

text = "You say goodbye and I say hello."
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)
C = create_co_matrix(corpus, vocab_size)
W = ppmi(C)

np.set_printoptions(precision=3)  # display to 3 significant digits
print('co-occurrence matrix')
print(C)
print('-' * 50)
print('PPMI')
print(W)
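# cos_similarity is imported above but never called. A short follow-up
# (a sketch appended to the same script, assuming W and word_to_id from
# above are in scope) shows what it is for: comparing two rows of the
# PPMI matrix as word vectors.
c0 = W[word_to_id['you']]  # PPMI vector for "you"
c1 = W[word_to_id['i']]    # PPMI vector for "i"
print(cos_similarity(c0, c1))  # cosine similarity, in [-1, 1]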
# coding: utf-8
import sys
sys.path.append('/home/hiromasa/deep-learning-from-scratch-2')
import numpy as np
from common.util import most_similar, create_co_matrix, ppmi
from dataset import ptb

window_size = 2
wordvec_size = 100

corpus, word_to_id, id_to_word = ptb.load_data('train')
vocab_size = len(word_to_id)
print('counting co-occurrence ...')
C = create_co_matrix(corpus, vocab_size, window_size)
print('calculating PPMI ...')
W = ppmi(C, verbose=True)

print('calculating SVD ...')
try:
    # truncated SVD (fast!)
    from sklearn.utils.extmath import randomized_svd
    U, S, V = randomized_svd(W, n_components=wordvec_size, n_iter=5,
                             random_state=None)
except ImportError:
    # SVD (slow)
    U, S, V = np.linalg.svd(W)

word_vecs = U[:, :wordvec_size]
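# most_similar is imported but never called above. One plausible finish
# (a sketch, assuming the book's most_similar(query, word_to_id, id_to_word,
# word_matrix, top) helper) queries the nearest neighbors of a few words
# in the reduced vector space:
querys = ['you', 'year', 'car', 'toyota']
for query in querys:
    most_similar(query, word_to_id, id_to_word, word_vecs, top=5)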
def ppmi(C, verbose=False, eps=1e-8):
    M = np.zeros_like(C, dtype=np.float32)  # output PPMI matrix
    N = np.sum(C)                           # total number of co-occurrences
    S = np.sum(C, axis=0)                   # per-word occurrence counts
    total = C.shape[0] * C.shape[1]
    cnt = 0

    for i in range(C.shape[0]):
        for j in range(C.shape[1]):
            pmi = np.log2(C[i, j] * N / (S[j] * S[i]) + eps)
            M[i, j] = max(0, pmi)

            if verbose:
                cnt += 1
                if cnt % (total // 100) == 0:
                    print('%.1f%% done' % (100 * cnt / total))
    return M


text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)
C = create_co_matrix(corpus, vocab_size)
W = ppmi(C)

np.set_printoptions(precision=3)  # display to 3 significant digits
print('co-occurrence matrix')
print(C)
# [[0 1 0 0 0 0 0]
#  [1 0 1 0 1 1 0]
#  [0 1 0 1 0 0 0]
#  [0 0 1 0 1 0 0]
#  [0 1 0 1 0 0 0]
#  [0 1 0 0 0 0 1]
#  [0 0 0 0 0 1 0]]
print('-' * 50)
print('PPMI')
print(W)
# [[0.    1.807 0.    0.    0.    0.    0.   ]
#  [1.807 0.    0.807 0.    0.807 0.807 0.   ]
#  [0.    0.807 0.    1.807 0.    0.    0.   ]
#  [0.    0.    1.807 0.    1.807 0.    0.   ]
#  [0.    0.807 0.    1.807 0.    0.    0.   ]
#  [0.    0.807 0.    0.    0.    0.    2.807]
#  [0.    0.    0.    0.    0.    2.807 0.   ]]
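# The double loop in ppmi is easy to read but runs in Python for every
# (i, j) pair, which is slow for a large vocabulary such as PTB. An
# equivalent vectorized version (a sketch, not from the book; it drops
# the verbose progress reporting) computes the same matrix with numpy
# broadcasting:
import numpy as np

def ppmi_vectorized(C, eps=1e-8):
    N = np.sum(C)
    S = np.sum(C, axis=0)
    # np.outer(S, S)[i, j] == S[i] * S[j], the denominator for every pair
    pmi = np.log2(C * N / np.outer(S, S) + eps)  # same formula as the loop
    return np.maximum(0, pmi).astype(np.float32)

# usage: np.allclose(ppmi(C), ppmi_vectorized(C)) should hold for the
# small co-occurrence matrix built above.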