Example 1
import pickle

# most_similar and analogy are assumed to come from the book's common utilities.
from common.util import most_similar, analogy


def main() -> None:
    with open('cbow_params.pkl', 'rb') as f:
        params = pickle.load(f)
        word_vecs = params['word_vecs']
        word_to_id = params['word_to_id']
        id_to_word = params['id_to_word']

    queries = ['you', 'year', 'car', 'toyota']
    for query in queries:
        most_similar(query, word_to_id, id_to_word, word_vecs, top=5)

    analogy('king', 'man', 'queen', word_to_id, id_to_word, word_vecs)
    analogy('take', 'took', 'go', word_to_id, id_to_word, word_vecs)
    analogy('car', 'cars', 'child', word_to_id, id_to_word, word_vecs)
    analogy('good', 'better', 'bad', word_to_id, id_to_word, word_vecs)


if __name__ == '__main__':
    main()
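For reference, `analogy(a, b, c, ...)` solves the proportion a : b = c : ? by ranking the vocabulary by cosine similarity to vec(b) - vec(a) + vec(c); analogy('king', 'man', 'queen') should therefore rank 'woman' highly. A minimal sketch of the core computation (a hypothetical helper; the book's common.util version adds the ranking and output formatting):

import numpy as np

def analogy_vector(a, b, c, word_to_id, word_vecs):
    # a : b = c : ?  ->  the answer should lie near vec(b) - vec(a) + vec(c)
    return (word_vecs[word_to_id[b]]
            - word_vecs[word_to_id[a]]
            + word_vecs[word_to_id[c]])

Ranking every word vector against this query vector by cosine similarity, as `most_similar` does, yields the printed candidates.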
Example 2
import numpy as np

# most_similar is assumed to come from the accompanying utility module.
from util import most_similar


def main(args):
    """Display the top-k most similar words for each word in the vocabulary."""
    print(args)

    word_vec_svd = np.load(args.model_path)

    with open(args.id2word_path) as f:
        pairs = f.readlines()
        word_to_id = {}
        id_to_word = {}
        for p in pairs:
            idx, word = p.rstrip("\n").split("\t")
            idx = int(idx)
            id_to_word[idx] = word
            word_to_id[word] = idx

    for word in word_to_id:
        most_similar(word, word_to_id, id_to_word, word_vec_svd)
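For context, `main(args)` expects an argparse-style namespace with `model_path` and `id2word_path` attributes; a plausible entry point looks like the following (the flag names and defaults are assumptions, not taken from the original script):

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Display most similar words')
    # Attribute names match what main() reads above; defaults are illustrative.
    parser.add_argument('--model_path', default='word_vec_svd.npy')
    parser.add_argument('--id2word_path', default='id2word.txt')
    main(parser.parse_args())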
Example 3
from sklearn.utils.extmath import randomized_svd

# The PTB loader and the co-occurrence/PPMI/most_similar helpers are assumed
# to come from the book's accompanying code.
from dataset import ptb
from common.util import create_co_matrix, most_similar, ppmi


def main():
    window_size = 2
    wordvec_size = 100

    corpus, word_to_id, id_to_word = ptb.load_data('train')
    vocab_size = len(word_to_id)

    print('counting co-occurrence...')
    co_matrix = create_co_matrix(corpus, vocab_size, window_size)
    print('calculating PPMI...')
    W = ppmi(co_matrix, verbose=True)

    print('calculating SVD...')
    U, S, V = randomized_svd(W,
                             n_components=wordvec_size,
                             n_iter=5,
                             random_state=42)

    word_vecs = U[:, :wordvec_size]

    queries = ['you', 'year', 'car', 'toyota']
    for query in queries:
        most_similar(query, word_to_id, id_to_word, word_vecs, top=5)
    print('DONE')
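Here `ppmi` converts raw co-occurrence counts into positive pointwise mutual information, PPMI(x, y) = max(0, log2(C(x, y) * N / (C(x) * C(y)))), where N is the total co-occurrence count. A minimal vectorized sketch of such a function (the book's version iterates elementwise and supports a verbose flag):

import numpy as np

def ppmi_sketch(C, eps=1e-8):
    # C: co-occurrence matrix; N: total co-occurrence count; S: per-word counts.
    N = np.sum(C)
    S = np.sum(C, axis=0)
    # PPMI(x, y) = max(0, log2(C(x, y) * N / (C(x) * C(y))))
    pmi = np.log2(C * N / (np.outer(S, S) + eps) + eps)
    return np.maximum(0, pmi)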
Example 4
# %%
import numpy as np
from util import create_co_matrix, preprocess, cos_similarity

text = "You say goodbye and I say hellow."
corpus, word_to_id, id_to_word = preprocess(text)
C = create_co_matrix(corpus, len(word_to_id))
c0 = C[word_to_id['you']]
c2 = C[word_to_id['i']]
print(cos_similarity(c0, c2))

# %%
from util import most_similar
most_similar('hello', word_to_id, id_to_word, C)
# %%
# Convert the co-occurrence matrix into a PPMI matrix
import numpy as np
from util import create_co_matrix, preprocess, cos_similarity, ppmi
text = "You say goodbye and I say hellow."
corpus, word_to_id, id_to_word = preprocess(text)
W = ppmi(C, True)

np.set_printoptions(precision=3)
print('co-occurrence matrix')
print(C)
print('-' * 50)
print('PPMI')
print(W)
# %%
# Dimensionality reduction with SVD (p. 84)
import numpy as np
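The cell above is cut off after the import. Following the book's page-84 example, it plausibly continued with a full SVD of the PPMI matrix and a 2-D plot of the toy vocabulary (a sketch, not the original code):

# Full SVD; the leading columns of U serve as dense word vectors.
U, S, V = np.linalg.svd(W)

import matplotlib.pyplot as plt

# Plot each word at its first two SVD coordinates.
for word, word_id in word_to_id.items():
    plt.annotate(word, (U[word_id, 0], U[word_id, 1]))
plt.scatter(U[:, 0], U[:, 1], alpha=0.5)
plt.show()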
Example 5
import pickle
from util import most_similar, analogy, analogy_text

if __name__ == '__main__':
    #fname = 'cbow_params.pkl'
    fname = 'skip_gram_params.pkl'
    with open(fname, 'rb') as f:
        params = pickle.load(f)
    word_vecs = params['word_vecs']
    word_to_id = params['word_to_id']
    id_to_word = params['id_to_word']

    # most similar
    queries = ['you', 'year', 'car', 'toyota', 'cat', 'music']
    for query in queries:
        most_similar(query, word_to_id, id_to_word, word_vecs, top=5)

    # analogy
    print('\n-' + ' (distance) ' + '-' * 46)
    analogy('king', 'man', 'queen', word_to_id, id_to_word, word_vecs, top=30)
    analogy('take', 'took', 'go', word_to_id, id_to_word, word_vecs)
    analogy('car', 'cars', 'child', word_to_id, id_to_word, word_vecs, top=10)
    analogy('good', 'better', 'bad', word_to_id, id_to_word, word_vecs, top=30)
    print('\n-' + ' (text) ' + '-' * 50)
    analogy_text('king',
                 'man',
                 'queen',
                 word_to_id,
                 id_to_word,
                 word_vecs,
                 top=30)
Example 6
word1 = "猫"
word2 = "ライオン"
word3 = "犬"

emb1 = embedding[v_to_i[word1]]
emb2 = embedding[v_to_i[word2]]
emb3 = embedding[v_to_i[word3]]

cos_sim_1_2 = U.cos_similarity(emb1, emb2)
cos_sim_1_3 = U.cos_similarity(emb1, emb3)
cos_sim_2_3 = U.cos_similarity(emb2, emb3)

print("===== cosine similarity =====")
print("{} : {} = {}".format(word1, word2, cos_sim_1_2))
print("{} : {} = {}".format(word1, word3, cos_sim_1_3))
print("{} : {} = {}".format(word2, word3, cos_sim_2_3))

### Top-5 similar words

print("\n===== most similar =====")
query = '猫'  # cat
print(U.most_similar(query, v_to_i, i_to_v, embedding, top=5))

### Analogy

print("\n===== analogy =====")
U.analogy('日本', '東京', 'アメリカ', v_to_i, i_to_v, embedding)  # Japan : Tokyo = America : ?
U.analogy('王', '男', '女王', v_to_i, i_to_v, embedding)  # king : man = queen : ?
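`U.cos_similarity` above presumably computes standard cosine similarity; a minimal sketch:

import numpy as np

def cos_similarity_sketch(x, y, eps=1e-8):
    # Normalize each vector (eps guards against zero vectors), then take the dot product.
    nx = x / (np.sqrt(np.sum(x ** 2)) + eps)
    ny = y / (np.sqrt(np.sum(y ** 2)) + eps)
    return np.dot(nx, ny)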
Example 7
# The opening of this example is missing; assuming the same PTB setup as
# Example 3, it plausibly began along these lines:
from sklearn.utils.extmath import randomized_svd

from dataset import ptb  # assumed, as in Example 3
from common.util import create_co_matrix, most_similar, ppmi  # assumed
from util import ppmi_text  # assumed: a second, reference PPMI implementation


def main():
    window_size = 2
    vec_size = 100

    corpus, word_to_id, id_to_word = ptb.load_data('train')

    print('vocab size: {0}'.format(len(word_to_id)))
    print('corpus size: {0}'.format(len(corpus)))

    # 共起行列
    print('counting co-occurrence..')
    c = create_co_matrix(corpus,
                         vocab_size=len(word_to_id),
                         window_size=window_size)

    # ppmi
    print('calculating ppmi (t) ..')
    m_t = ppmi_text(c, verbose=True)

    print('calculating ppmi (self) ..')
    m = ppmi(c)

    # 次元削減 SVD
    print('calculating svd..')
    U, S, V = randomized_svd(m, n_components=vec_size)

    U_t, S_t, V_t = randomized_svd(m_t, n_components=vec_size)

    # Evaluation
    queries = ['you', 'year', 'car', 'toyota']
    for q in queries:
        print('SVD (self ppmi)')
        most_similar(q, word_to_id, id_to_word, U)
        print('SVD (t ppmi)')
        most_similar(q, word_to_id, id_to_word, U_t)


if __name__ == '__main__':
    main()
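`most_similar`, used throughout these examples, ranks the whole vocabulary by cosine similarity to the query word's vector. A minimal sketch of that behavior (hypothetical; the book's common.util version differs in output formatting):

import numpy as np

def most_similar_sketch(query, word_to_id, id_to_word, word_vecs, top=5):
    if query not in word_to_id:
        print('%s is not found' % query)
        return
    query_vec = word_vecs[word_to_id[query]]
    # Cosine similarity between the query vector and every word vector.
    norms = np.linalg.norm(word_vecs, axis=1) * np.linalg.norm(query_vec) + 1e-8
    similarity = word_vecs @ query_vec / norms
    print('\n[query] ' + query)
    shown = 0
    for i in (-similarity).argsort():
        if id_to_word[i] == query:
            continue  # skip the query word itself
        print(' %s: %.4f' % (id_to_word[i], similarity[i]))
        shown += 1
        if shown >= top:
            break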