import numpy as np
from mittens import GloVe, Mittens


def loadGloVe(co_occur, dim_embed, vocab, is_fine):
    if is_fine:
        # Fine-tune pretrained GloVe vectors with Mittens on the new
        # co-occurrence counts; glove2dict and glove_filename are defined elsewhere.
        glove_original = glove2dict(glove_filename)

        mittens_model = Mittens(n=dim_embed, max_iter=5000)

        embeddings = mittens_model.fit(np.asarray(co_occur),
                                       vocab=vocab,
                                       initial_embedding_dict=glove_original)
    else:
        # Train GloVe from scratch on the co-occurrence matrix.
        glove_model = GloVe(n=dim_embed, max_iter=5000)
        embeddings = glove_model.fit(np.asarray(co_occur))

    return embeddings
def run_small_glove_evals():

    from mittens import GloVe
    ##### YOUR CODE HERE
    giga20 = pd.read_csv(os.path.join(VSM_HOME, 'giga_window20-flat.csv.gz'),
                         index_col=0)
    result = {}
    for i in [10, 100, 200]:
        glv = GloVe(max_iter=i)
        X = glv.fit(giga20.values)
        glv_df = pd.DataFrame(X, index=giga20.index)
        evaluate = full_word_similarity_evaluation(glv_df)
        result[i] = evaluate['Macro-average']
    # Close the last model's TensorFlow session, if that backend is in use.
    if hasattr(glv, 'sess'):
        glv.sess.close()
    return result
Example #3
def run_small_glove_evals():

    from mittens import GloVe

    scores = {}
    giga20 = pd.read_csv(os.path.join(VSM_HOME, "giga_window20-flat.csv.gz"),
                         index_col=0)

    for iteration in [10, 100, 200]:
        glv = GloVe(max_iter=iteration)
        giga20_glv = glv.fit(giga20.values)
        giga20_glv = pd.DataFrame(giga20_glv, index=giga20.index)
        scores[iteration] = full_word_similarity_evaluation(
            giga20_glv)['Macro-average']
    return scores
Example #4
def run_small_glove_evals(iters=(10, 100, 200)):

    from mittens import GloVe

    ##### YOUR CODE HERE
    giga20 = pd.read_csv(os.path.join(VSM_HOME, "giga_window20-flat.csv.gz"),
                         index_col=0)
    results = {}
    for max_iter in iters:
        model = GloVe(max_iter=max_iter)
        glove_matrix = model.fit(giga20.values)
        glove_df = pd.DataFrame(glove_matrix, index=giga20.index)
        similarity_eval = full_word_similarity_evaluation(glove_df)
        display(similarity_eval)
        results[max_iter] = similarity_eval["Macro-average"]

    return results
Example #5
def __init__(self, words):
    self.words = np.array(words)
    self.dictionary = corpora.Dictionary(words)
    self.corpus = [self.dictionary.doc2bow(word) for word in words]
    self.tfidf = models.TfidfModel(dictionary=self.dictionary)
    # Passing the dictionary here also initializes its id2token mapping.
    self.lsi = models.LsiModel(self.tfidf[self.corpus], id2word=self.dictionary, num_topics=300)
    self.similarity = similarities.MatrixSimilarity(self.lsi[self.corpus])
    # A vector size of 200-300 works best; a window of 8 works best.
    # (gensim < 4.0 argument names; 4.x renamed size/iter to vector_size/epochs.)
    self.word2vec = Word2Vec(words, min_count=3, window=5, size=200, iter=10)
    words = [word for doc in self.words for word in doc]
    global_corpus = self.get_corpus(words)
    self.init_global_bows(global_corpus)
    self.init_global_tfidfs(global_corpus)
    self.init_cooccurrence_matrix()
    glove_model = GloVe(n=200, max_iter=500)
    self.glove = glove_model.fit(self.cooccurrence_matrix)
Example #6
def run_small_glove_evals():

    from mittens import GloVe

    results = {}

    #part 1
    giga20 = pd.read_csv(os.path.join(VSM_HOME, "giga_window20-flat.csv.gz"),
                         index_col=0)

    #part 2 & 3
    for max_iter in (10, 100, 200):
        glove = GloVe(
            max_iter=max_iter
        )  # this is the model; note we haven't yet given it any data
        embeddings = glove.fit(giga20.values)
        giga20_glove = pd.DataFrame(embeddings, index=giga20.index)
        results[max_iter] = full_word_similarity_evaluation(
            giga20_glove)['Macro-average']

    return results
# Display of statistical results
nowTime = tic.getNow().strftime('%Y%m%d_%H%M%S')
coocPath = r"...\Data\Reg_leave_one_out\cooccurrence_%s.csv" % coWindow
writer = use.csvWrite(coocPath)
for item in cooccurrence:
    writer.writerow(item)
print("The co-occurrence matrix is derived, taking %s" % tic.timmingGet())

# GloVe
print("Start GloVe calculation")
coocMatrix = np.array(cooccurrence, "float32")

glove_model = GloVe(n=vecLength,
                    max_iter=max_iter,
                    display_progress=display_progress)
embeddings = glove_model.fit(coocMatrix)

# Free the raw co-occurrence data once the embeddings are trained.
del cooccurrence, coocMatrix
gc.collect()

# Output calculation result: one row per word, index followed by its vector.
dicIndex = 0
nowTime = tic.getNow().strftime('%Y%m%d_%H%M%S')
GlovePath = r"...\Data\Reg_leave_one_out\keras_GloVeVec_%s_%s_%s.csv" % (
    coWindow, vecLength, max_iter)
writer = use.csvWrite(GlovePath)
for embeddingsItem in embeddings:
    item = np.array([dicIndex])
    item = np.append(item, embeddingsItem)
    writer.writerow(item)
    dicIndex += 1
Example #8
    return np.corrcoef(prob[mask], M[mask])[0, 1]


# In[25]:

correlation_test(glove_test_count_df.values, glove_test_df.values)

# ### Applying GloVe to real VSMs

# The `vsm.glove` implementation is too slow to use on real matrices. The distribution in the `mittens` package is significantly faster, making its use possible even without a GPU (and it will be very fast indeed on a GPU machine):

# In[26]:

glove_model = GloVe()

imdb5_glv = glove_model.fit(imdb5.values)

imdb5_glv = pd.DataFrame(imdb5_glv, index=imdb5.index)

# In[27]:

vsm.neighbors('superb', imdb5_glv).head()

# ## Autoencoders
#
# An autoencoder is a machine learning model that seeks to learn parameters that predict its own input. This is meaningful when there are intermediate representations that have lower dimensionality than the inputs. These provide a reduced-dimensional view of the data akin to those learned by LSA, but now we have a lot more design choices and a lot more potential to learn higher-order associations in the underlying data.

# ### Overview of the autoencoder method
#
# The module `torch_autoencoder` uses PyTorch to implement a simple one-layer autoencoder:
#
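# Since that module is not reproduced here, the cell below gives a minimal
# sketch of the same idea; the class and function names are illustrative,
# not the module's actual API.

# In[ ]:

import torch
import torch.nn as nn


class TinyAutoencoder(nn.Module):
    """One hidden layer of dimension k < n_features; the hidden
    activations serve as the reduced-dimensional representation."""

    def __init__(self, n_features, k=100):
        super().__init__()
        self.encoder = nn.Sequential(nn.Linear(n_features, k), nn.Tanh())
        self.decoder = nn.Linear(k, n_features)

    def forward(self, X):
        return self.decoder(self.encoder(X))


def fit_autoencoder(X, k=100, n_epochs=100, lr=0.01):
    """X: 2d float array, e.g. a reweighted count matrix. Returns the
    learned k-dimensional codes, one row per row of X."""
    X = torch.tensor(X, dtype=torch.float)
    model = TinyAutoencoder(X.shape[1], k=k)
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.MSELoss()
    for _ in range(n_epochs):
        opt.zero_grad()
        # The target is the input itself: minimize reconstruction error.
        loss = loss_fn(model(X), X)
        loss.backward()
        opt.step()
    with torch.no_grad():
        return model.encoder(X).numpy()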
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from mittens import GloVe
import pandas as pd
from scipy import sparse
import json

# embedding dimension
d = 50
with open('../config.json', 'r') as f:
    config = json.load(f)
DATA_DIR = config['OUTPUT_DIR']

print('Loading file...')
coocc = sparse.load_npz(DATA_DIR + 'glove_cooccurrence.npz')
coocc = coocc.toarray()

with open(DATA_DIR + 'joint_vocab.txt', 'r') as f:
    vocab = f.read().splitlines()

print('Training model...')
glove_model = GloVe(d, max_iter=5000, learning_rate=0.1)
embeddings = glove_model.fit(coocc)

embeddings = pd.DataFrame(embeddings, index=vocab)

embeddings.to_csv(DATA_DIR + 'glove.' + str(d) + 'd.csv', sep='\t')
Example #10
    print(" ".join(
        [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))

## 主題模型歸類結果對應檢查-----------------------------------
for i in range(0, 1500, 100):
    y_str = y_tataol[i]
    start = y_str.find("why:")
    end = y_str.find("judgement:")
    key_reson = y_str[start + 20:end - 4]
    print(
        '模型推論Topic:',
        lad_result[i] + 1,
        '   案由:',
        key_reson,
    )
    # print(corpus)

# count = CountVectorizer()
# bag = count.fit_transform(seg_corpus)

glove_model = GloVe(n=2, max_iter=1000)
cooccurrence_matrix = np.dot(X.toarray().transpose(), X.toarray())
embeddings = glove_model.fit(cooccurrence_matrix)
print(embeddings.shape)

plt.scatter(embeddings[:, 0], embeddings[:, 1], marker="o")
for i in range(0, len(feature_names)):
    plt.text(embeddings[i, 0], embeddings[i, 1], feature_names[i])

plt.show()
Example #11
# In[ ]:

# alternative test
if 'IS_GRADESCOPE_ENV' not in os.environ:

    from mittens import GloVe

    # subword + glove
    #giga = pd.read_csv(os.path.join(VSM_HOME, "giga_window20-flat.csv.gz"), index_col=0)
    giga = pd.read_csv(os.path.join(VSM_HOME, "giga_window5-scaled.csv.gz"),
                       index_col=0)
    giga = vsm.pmi(giga)
    #giga = vsm.lsa(giga, k=50)
    glove = GloVe(max_iter=100)
    embeddings = glove.fit(giga.values)
    giga = pd.DataFrame(embeddings, index=giga.index)
    output = full_word_similarity_evaluation(giga)
    display(output)

# ## Bake-off [1 point]
#
# For the bake-off, we will release two additional datasets. The announcement will go out on the discussion forum. We will also release reader code for these datasets that you can paste into this notebook. You will evaluate your custom model $M$ (from the previous question) on these new datasets using `full_word_similarity_evaluation`. Rules:
#
# 1. Only one evaluation is permitted.
# 1. No additional system tuning is permitted once the bake-off has started.
#
# The cells below this one constitute your bake-off entry.
#
# People who enter will receive the additional homework point, and people whose systems achieve the top score will receive an additional 0.5 points. We will test the top-performing systems ourselves, and only systems for which we can reproduce the reported results will win the extra 0.5 points.
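# For concreteness, an entry cell could look like the following sketch, where `M` is the custom model's `DataFrame` from the previous question; it assumes the released reader code has already been pasted in above it.

# In[ ]:

if 'IS_GRADESCOPE_ENV' not in os.environ:
    # One evaluation only, per the rules above.
    bakeoff_results = full_word_similarity_evaluation(M)
    display(bakeoff_results)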
Example #12
    ## Start word indices at 1, reserving index 0 for UNK.
    word2idx = dict(
        zip(entities_all_count_keys, range(1,
                                           len(entities_all_count_keys) + 1)))

    # Tokenize the sentences to produce the token matrix.
    sentences_tokens, _ = make_deepLearn_data(sentences, word2idx)
    # Build the co-occurrence matrix.
    cooccurrence_matrix = generate_co_occurrence(
        sentences_tokens,
        len(word2idx.keys()),
        window_size=5,
    )
    # Train GloVe and write out the embeddings.
    glove_model = GloVe(n=dim_n, max_iter=itter_n)
    embedMatrix = glove_model.fit(cooccurrence_matrix)

    print('load word2vec model...')
    model = KeyedVectors.load_word2vec_format('train_vec_byTencent_word.bin',
                                              binary=True)

    print('build embedding matrix...')
    new_embedMatrix = merge_glove_word2vec_embedding(word2idx, embedMatrix,
                                                     model)

    with open('word2idx_embedMatrix_glove_word2vec.pkl', 'wb') as f:
        pickle.dump([word2idx, new_embedMatrix], f)

#%% 3. Build the training and test sets
    print('produce training set...')
Example #13
            for context_index in range(context_start_index, context_end_index):
                if center_index != context_index:
                    context_word = doc_words[context_index]
                    context_id = dictionary.token2id[context_word]
                    print(center_id, center_word, context_id, context_word)
                    matrix[center_id, context_id] += 1
    print(matrix)
    # Singular Value Decomposition
    U, s, Vh = np.linalg.svd(matrix, full_matrices=False)

    # Clustering
    X = -U[:, 0:2]

    labels = KMeans(n_clusters=2).fit(X).labels_
    colors = ('y', 'g')

    glove_model = GloVe(n=2, max_iter=200)
    embeddings = glove_model.fit(matrix)
    X = embeddings
    mp.rcParams['font.sans-serif'] = ['SimHei']  # display Chinese labels
    for word in dictionary.token2id.keys():
        i = dictionary.token2id[word]
        mp.scatter(X[i, 1], X[i, 0], c=colors[labels[i]], s=400, alpha=0.4)
        mp.text(X[i, 1],
                X[i, 0],
                word,
                ha='center',
                va='center',
                fontproperties=font)
    mp.show()