def test_word2vec_n_closest():
    """A cosine ``n_closest`` query for 'anwar' must return a non-empty result."""
    vectors = malaya.malaya_word2vec(256)
    model = malaya.Word2Vec(vectors['nce_weights'], vectors['dictionary'])
    query = 'anwar'
    neighbours = model.n_closest(word=query, num_closest=8, metric='cosine')
    assert len(neighbours) > 0
def test_word2vec_n_closest_without_similarity():
    """``n_closest`` with ``return_similarity=False`` must still return a non-empty result."""
    vectors = malaya.malaya_word2vec(256)
    model = malaya.Word2Vec(vectors['nce_weights'], vectors['dictionary'])
    query = 'anwar'
    neighbours = model.n_closest(
        word=query,
        num_closest=8,
        metric='cosine',
        return_similarity=False,
    )
    assert len(neighbours) > 0
def test_word2vec_tsne():
    """``project_2d(0, 100)`` must return a pair whose first item has a second axis of size 2."""
    vectors = malaya.malaya_word2vec(32)
    model = malaya.Word2Vec(vectors['nce_weights'], vectors['dictionary'])
    # The second element of the returned pair is not inspected by this test.
    projection, _ = model.project_2d(0, 100)
    assert projection.shape[1] == 2
def test_word2vec_analogy():
    """``analogy('anwar', 'penjara', 'kerajaan', 5)`` must return exactly five items."""
    vectors = malaya.malaya_word2vec(256)
    model = malaya.Word2Vec(vectors['nce_weights'], vectors['dictionary'])
    answers = model.analogy('anwar', 'penjara', 'kerajaan', 5)
    assert len(answers) == 5
def word_count(str):
    """Return a dict mapping each whitespace-separated word of ``str`` to its count.

    The parameter name shadows the builtin ``str``; it is kept unchanged so
    that existing keyword callers keep working.
    """
    tally = {}
    for token in str.split():
        tally[token] = tally.get(token, 0) + 1
    return tally


''' TEST DIFFERENT EMBEDDING PERFORMANCE ON DIFFERENT DATASET '''
embedded = malaya.malaya_word2vec(256)
print(len(embedded['dictionary']), embedded['nce_weights'].shape)
word_vector = malaya.Word2Vec(
    embedded['nce_weights'],
    embedded['dictionary'],
)

''' FEATURE SELECTION '''
# NOTE(review): train_x / train_y are not defined in the visible part of the
# file -- presumably set up earlier; confirm.
tvec = TfidfVectorizer(max_features=100000, ngram_range=(1, 3))
x_train_tfidf = tvec.fit_transform(train_x)
# Only the first element of the chi2(...) result is kept.
chi2score = chi2(x_train_tfidf, train_y)[0]
plt.figure(figsize=(15, 10))
# NOTE(review): get_feature_names() is deprecated/removed in newer
# scikit-learn releases -- confirm the pinned version before upgrading.
wscores = zip(tvec.get_feature_names(), chi2score)
# Sort (feature, score) pairs by ascending score, then keep the last 20.
wchi2 = sorted(wscores, key=lambda pair: pair[1])
topchi2 = list(zip(*wchi2[-20:]))
x = range(len(topchi2[1]))
labels = topchi2[0]
def test_word2vec_calculator_n_closest_without_similarity():
    """``calculator('anwar + mahathir', ...)`` without similarities must return a non-empty result."""
    vectors = malaya.malaya_word2vec(256)
    model = malaya.Word2Vec(vectors['nce_weights'], vectors['dictionary'])
    outcome = model.calculator(
        'anwar + mahathir',
        num_closest=8,
        metric='cosine',
        return_similarity=False,
    )
    # len() is never negative, so "> 0" is the same check as bare truthiness.
    assert len(outcome) > 0
def test_word2vec_calculator_bracket():
    """``calculator`` must return a non-empty result for a bracketed expression."""
    vectors = malaya.malaya_word2vec(256)
    model = malaya.Word2Vec(vectors['nce_weights'], vectors['dictionary'])
    outcome = model.calculator(
        '(anwar+hadi) * mahathir',
        num_closest=8,
        metric='cosine',
    )
    # len() is never negative, so "> 0" is the same check as bare truthiness.
    assert len(outcome) > 0