Example #1
def build_common_words_vectors(vecs='word2vec'):
    words_list = load_Bing_Liu('negative')
    word_vectors = load_embeddings(
        'google_news', '/home/hs/Data/Word_Embeddings/google_news.bin')
    if vecs == 'GloVe':
        word_vectors = load_embeddings(arg='glove')
    common_words(word_vectors, words_list)
def build_keras_input():
    filename_data, filename_w = './tmp/indexed_data.p', './tmp/Weight.p'

    if os.path.isfile(filename_data) and os.path.isfile(filename_w):
        data = load_pickle(filename_data)
        W = load_pickle(filename_w)
        print('Load OK.')
        return (data, W)

    # load data from pickle
    texts, valence, arousal = load_CVAT_2('./resources/CVAT2.0(sigma=1.5).csv')

    vocab = get_vocab(texts)

    # using word2vec vectors
    # word_vecs = load_embeddings('google_news', '/home/hs/Data/Word_Embeddings/google_news.bin')
    word_vecs = load_embeddings('zh',
                                '/home/hs/Data/wikipedia/word2vec_word/traditional_wordvecs/wiki.zh.text.traditional_wordvecs.txt')

    # load glove vectors
    # word_vecs = load_embeddings(arg='glove')

    word_vecs = add_unknown_words(word_vecs, vocab)
    W, word_idx_map = build_embedding_matrix(word_vecs, vocab)

    idx_data = make_idx_data(texts, word_idx_map)

    data = (idx_data, valence, arousal)

    dump_picle(data, filename_data)
    dump_picle(W, filename_w)
    return (data, W)
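A minimal usage sketch (hypothetical; it assumes this module's helpers are importable and that make_idx_data returns a NumPy array, as the CVAT snippets further down suggest):

data, W = build_keras_input()  # results are cached under ./tmp after the first run
idx_data, valence, arousal = data
print(idx_data.shape, len(valence), len(arousal))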
def main():
    global args
    args = parser.parse_args()
    use_gpu = torch.cuda.is_available()

    # Load and process data
    print('Loading dataset')
    time_data = time.time()
    POS, NEG, train_iter_pos, train_iter_neg, val_iter_pos, val_iter_neg = preprocess(args.v, args.b)
    print('Loaded data. |POS| = {}, |NEG| = {}. Time: {:.2f}.'.format(len(POS.vocab), len(NEG.vocab), time.time() - time_data))

    # Load embeddings if available
    LOAD_EMBEDDINGS = False
    if LOAD_EMBEDDINGS:
        np_emb_e1_file = 'scripts/emb-{}-de.npy'.format(len(POS.vocab))
        np_emb_e2_file = 'scripts/emb-{}-de.npy'.format(len(NEG.vocab))
        np_emb_d1_file = 'scripts/emb-{}-de.npy'.format(len(POS.vocab))
        np_emb_d2_file = 'scripts/emb-{}-de.npy'.format(len(NEG.vocab))
        embedding_e1, embedding_e2, embedding_d1, embedding_d2 = load_embeddings(np_emb_e1_file, np_emb_e2_file, np_emb_d1_file, np_emb_d2_file)
        print('Loaded embedding vectors from np files')
    else:
        embedding_e1 = (torch.rand(len(POS.vocab), args.emb) - 0.5) * 2
        embedding_e2 = (torch.rand(len(NEG.vocab), args.emb) - 0.5) * 2
        embedding_d1 = (torch.rand(len(POS.vocab), args.emb) - 0.5) * 2
        embedding_d2 = (torch.rand(len(NEG.vocab), args.emb) - 0.5) * 2
        print('Initialized embedding vectors')

    # Create model
    tokens = [NEG.vocab.stoi[x] for x in ['<s>', '</s>', '<pad>', '<unk>']]
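For reference, (torch.rand(n, d) - 0.5) * 2 draws uniformly from [-1, 1); a toy check of that initialization recipe, separate from main():

import torch

emb = (torch.rand(10, 4) - 0.5) * 2  # same recipe as the embedding init above
assert emb.min().item() >= -1.0 and emb.max().item() < 1.0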
Example #5
def check_same_words(words):
    model = load_embeddings('google_news',
                            '/home/hs/Data/Word_Embeddings/google_news.bin')
    full_words = model.vocab.keys()
    same_words = set(words).intersection(full_words)
    print(set(words) - same_words)
    print(len(same_words))
Example #6
def check_same_words(words):
    model = load_embeddings(
        arg='zh_tw',
        filename="D:\Word_Embeddings\English\glove.6B\glove.6B.300d.txt")
    full_words = model.vocab.keys()
    same_words = set(words).intersection(full_words)
    print(set(words) - same_words)
    print(len(same_words))
Example #7
def check_same_words(words):
    model = load_embeddings(
        'google_news',
        'D:\Word_Embeddings\English\GoogleNews-vectors-negative300.bin')
    full_words = model.vocab.keys()
    same_words = set(words).intersection(full_words)
    print(set(words) - same_words)
    print(len(same_words))
def run_build_docvecs():
    model = load_embeddings('twitter')
    simple_evaluate(model)
    _, ratings = load_vader('./resource/tweets.txt')

    # Skip the 1240th and 3516th items
    # r = ratings[:1240] + ratings[1241:3516] + ratings[3517:]

    build_docvecs(model, ratings)
def build_ori_anew_vectors(words):
    filename = './tmp/anew_vectors_glove.p'
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings(arg='zh_tw', filename="D:\Word_Embeddings\English\glove.6B\glove.6B.300d.txt")
    vecs = []
    for w in words:
        vecs.append(model[w])
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
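A usage sketch for the function above, assuming the 300-dimensional glove.6B.300d vectors loaded there and that every query word is in vocabulary (a gensim model raises KeyError otherwise):

vecs = build_ori_anew_vectors(['happy', 'sad'])  # hypothetical words
print(vecs.shape)  # (2, 300)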
Example #11
def build_ori_anew_vectors(words):
    filename = "./tmp/anew_vectors.p"
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings("google_news", "/home/hs/Data/Word_Embeddings/google_news.bin")
    vecs = []
    for w in words:
        vecs.append(model[w])
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
def build_keras_input(texts, scores, test, new=True):
    dims = 300

    # texts, scores are dict type, key: train, dev, devtest.
    keys = ["train", "dev", "devtest"]
    train, train_scores = texts[keys[0]], scores[keys[0]]
    dev, dev_scores = texts[keys[1]], scores[keys[1]]
    devtest, devtest_scores = texts[keys[2]], scores[keys[2]]

    filename_data, filename_w = './tmp/indexed_data.p', './tmp/Weight.p'

    test_filename = './tmp/test_data.p'

    if os.path.isfile(filename_data) and os.path.isfile(filename_w) and new == False:
        data = load_pickle(filename_data)
        W = load_pickle(filename_w)

        test_data = load_pickle(test_filename)

        print('Use existing data. Load OK.')
        return (data, W, test_data)

    print("Construct new data.")
    # load data from pickle

    vocab = get_vocab(train)

    # using word2vec vectors
    # word_vecs = load_embeddings('google_news', '/home/hs/Data/Word_Embeddings/google_news.bin')
    # word_vecs = load_embeddings('D:/Word_Embeddings/glove.840B.300d.txt.w2v')
    word_vecs = load_embeddings('/home/hs/Data/Word_Embeddings/glove.840B.300d.txt.w2v')
    # word_vecs = load_embeddings('/home/hs/Data/Word_Embeddings/word2vec_twitter_model/word2vec_twitter_model.bin',
    #                             binary=True)

    word_vecs = add_unknown_words(word_vecs, vocab, k=dims)
    W, word_idx_map = build_embedding_matrix(word_vecs, vocab, k=dims)

    idx_data_train = make_idx_data(train, word_idx_map)
    idx_data_dev = make_idx_data(dev, word_idx_map)
    idx_data_devtest = make_idx_data(devtest, word_idx_map)

    idx_data_test = make_idx_data(test[2], word_idx_map)

    data = (idx_data_train, idx_data_dev, idx_data_devtest, train_scores, dev_scores, devtest_scores)

    test_data = (test[0], test[1], idx_data_test)

    dump_picle(data, filename_data)
    dump_picle(W, filename_w)
    dump_picle(test_data, test_filename)
    print("Saved: data and W are saved into: %s, and %s." % (filename_data, filename_w))

    return (data, W, test_data)
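A usage sketch of the three-part return value (texts, scores, and test are hypothetical inputs prepared by the caller, shaped as described in the comments above):

data, W, test_data = build_keras_input(texts, scores, test, new=True)
(idx_train, idx_dev, idx_devtest,
 train_scores, dev_scores, devtest_scores) = data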
Example #14
def build_ori_anew_vectors(words):
    filename = './tmp/anew_vectors_retrofitted_glove.p'
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings(
        arg='zh_tw',
        filename="D:\Word_Embeddings\English\glove.6B\GloVe_out_vec_file.txt")
    vecs = []
    for w in words:
        vecs.append(model[w])
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
Example #15
def build_ori_anew_vectors(words):
    filename = './tmp/anew_vectors.p'
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings(
        'google_news',
        'D:\Word_Embeddings\English\GoogleNews-vectors-negative300.bin')
    vecs = []
    for w in words:
        vecs.append(model[w])
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
def build_amended_anew_vectors(words):
    filename = './tmp/amended_anew_vectors_glove.p'
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings(arg='zh_tw', filename="D:\Word_Embeddings\English\glove.6B\glove.6B.300d.txt")
    amended_pos = load_pickle('./tmp/amended_GloVe_pos.p')
    amended_neg = load_pickle('./tmp/amended_GloVe_neg.p')
    vecs = []
    for w in words:
        vec = amended_neg.get(w)
        if vec is None:
            vec = amended_pos.get(w)
        if vec is None:
            vec = model[w]
        vecs.append(vec)
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
def build_amended_anew_vectors(words):
    filename = './tmp/retrofitted_anew_vectors_word2vec.p'
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings(arg='zh_tw', filename="D:\Word_Embeddings\English\word2vec_out_vec_file.txt")
    amended_pos = load_pickle('./tmp/retrofitted_word2vec_pos.p')
    amended_neg = load_pickle('./tmp/retrofitted_word2vec_neg.p')
    vecs = []
    for w in words:
        vec = amended_neg.get(w)
        if vec is None:
            vec = amended_pos.get(w)
        if vec is None:
            vec = model[w]
        vecs.append(vec)
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
Example #18
def build_amended_anew_vectors(words):
    filename = "./tmp/amended_anew_vectors.p"
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings("google_news", "/home/hs/Data/Word_Embeddings/google_news.bin")
    amended_pos = load_pickle("./tmp/amended_pos.p")
    amended_neg = load_pickle("./tmp/amended_neg.p")
    vecs = []
    for w in words:
        vec = amended_neg.get(w)
        if vec is None:
            vec = amended_pos.get(w)
        if vec is None:
            vec = model[w]
        vecs.append(vec)
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
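The lookup order above in miniature: amended negative vectors first, then amended positive, then the base model (toy dicts with made-up values):

amended_neg = {'bad': [-0.9]}
amended_pos = {'good': [0.8]}
model = {'table': [0.1]}
for w in ['bad', 'good', 'table']:
    vec = amended_neg.get(w)
    if vec is None:
        vec = amended_pos.get(w)
    if vec is None:
        vec = model[w]
    print(w, vec)  # bad [-0.9] / good [0.8] / table [0.1]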
def build_keras_input_amended():
    filename_data, filename_w = './tmp/amended_indexed_data.p', './tmp/amended_Weight.p'

    if os.path.isfile(filename_data) and os.path.isfile(filename_w):
        data = load_pickle(filename_data)
        W = load_pickle(filename_w)
        print('Load OK.')
        return (data, W)

    # load data from pickle
    (x_train, y_train_valence, y_train_labels,
     x_test, y_test_valence, y_test_labels,
     x_valid, y_valid_valence, y_valid_labels,
     x_train_polarity, y_train_polarity,
     x_test_polarity, y_test_polarity,
     x_valid_polarity, y_valid_polarity) = load_sst(path='./resources/stanfordSentimentTreebank/')

    vocab = get_vocab(x_train)
    # word_vecs = load_embeddings('google_news', '/home/hs/Data/Word_Embeddings/google_news.bin')
    # word_vecs = load_embeddings('glove')

    # load amended word vectors
    word_vecs = load_embeddings('amended_word2vec')

    word_vecs = add_unknown_words(word_vecs, vocab)
    W, word_idx_map = build_embedding_matrix(word_vecs, vocab)

    x_train_idx_data = make_idx_data(x_train, word_idx_map)
    x_test_idx_data = make_idx_data(x_test, word_idx_map)
    x_valid_idx_data = make_idx_data(x_valid, word_idx_map)
    x_train_polarity_idx_data = make_idx_data(x_train_polarity, word_idx_map)
    x_test_polarity_idx_data = make_idx_data(x_test_polarity, word_idx_map)
    x_valid_polarity_idx_data = make_idx_data(x_valid_polarity, word_idx_map)

    data = (x_train_idx_data, y_train_valence, y_train_labels,
            x_test_idx_data, y_test_valence, y_test_labels,
            x_valid_idx_data, y_valid_valence, y_valid_labels,
            x_train_polarity_idx_data, y_train_polarity,
            x_test_polarity_idx_data, y_test_polarity,
            x_valid_polarity_idx_data, y_valid_polarity)

    dump_picle(data, filename_data)
    dump_picle(W, filename_w)
    return (data, W)
vec_dim = 300
############################################## all ###############################################
corpus, ratings = load_vader(['tweets', 'movie_reviews', 'product_reviews', 'news_articles'])
# corpus, ratings = load_vader(['movie_reviews'])
# # name: tweets, movie_reviews, product_reviews, news_articles
lexicon_name = get_file_path('anew')
words, valences, _ = load_anew(lexicon_name)
corpus, ratings = screen_data(corpus, ratings, words)
ratings = np.array(ratings) + np.ones(len(ratings), dtype=float) * 5
print(len(corpus), len(ratings))

vocab = get_vocab(corpus)
# dump_picle(vocab, './data/corpus/vader/vocab_moview_reviews.p')
# vocab = load_pickle('./data/corpus/vader/vocab_moview_reviews.p')
print(len(vocab))
W, word_idx_map = build_embedding_matrix(load_embeddings('google_news'), vocab, k=300)
# dump_picle(word_idx_map, './data/corpus/vader/word_idx_map_movie_reviews.p')
# print('dump word_idx_map successful')
dump_picle(W, './data/corpus/vader/embedding_matrix_all.p')
print('dump embedding matrix file OK')
# word_idx_map = load_pickle('./data/corpus/vader/word_idx_map_movie_reviews.p')
idx_data = make_idx_data(corpus, word_idx_map, max_len=200, kernel_size=5)
dump_picle([idx_data, ratings], './data/corpus/vader/vader_processed_data_all.p')
print(idx_data[0])
print(ratings[0])
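The + 5 shift above presumably maps VADER's signed ratings onto a non-negative scale; a toy check of the arithmetic:

import numpy as np

r = np.array([-3.2, 0.0, 2.5])
print(r + np.ones(len(r), dtype=float) * 5)  # [1.8 5.  7.5]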

############################################## tweets ###############################################
# corpus, ratings = load_vader(['tweets', 'movie_reviews', 'product_reviews', 'news_articles'])
corpus, ratings = load_vader(['tweets'])
# # name: tweets, movie_reviews, product_reviews, news_articles
lexicon_name = get_file_path('anew')
Example #26
__author__ = 'NLP-PC'
import gensim
import os
import time
from file_name import get_file_path
from load_data import load_corpus, load_lexicon, load_mark
from load_data import load_embeddings
from word2vec_fn import buill_word_vector
from word2vec_fn import gold_valence_arousal
import numpy as np
from sklearn import cross_validation
from cross_validation import cv
from word2vec_fn import build_doc_vector
# '''
model = load_embeddings('CVAT_docvecs')
print(model.docvecs[1])
print(model.docvecs['SENT_23'])
print(len(model.vocab.keys()))

corpus = load_corpus(get_file_path('cn_corpus'))
mark = load_mark(get_file_path('mark'))
vecs = build_doc_vector(corpus, model)

valence, arousal = gold_valence_arousal(corpus, mark)
cv(vecs, valence, multivariant=True)
cv(vecs, arousal, multivariant=True)
# '''
# from save_data import dump_picle
# dump_picle(model.key(), get_file_path('words_in_wordvec'))
# print('ok')
#

# The purpose of this file is to simplify word2vec pre-trained models,
# as they are too big to load on an 8 GB memory laptop,
# and for fairer comparison, because GloVe contains far fewer words than the
# vocabularies in word2vec models.
# The simplifying method is to just keep the word2vec vocabulary entries which also appear in GloVe.


def save_word_vecs(wordVectors, outFileName):
    # wordVectors is a dict: word -> vector (iterable of floats)
    print('\nWriting down the vectors in ' + outFileName + '\n')
    with open(outFileName, 'w', encoding='utf-8') as outFile:
        for word, values in wordVectors.items():
            outFile.write(word + ' ')
            for val in values:
                outFile.write('%.4f' % val + ' ')
            outFile.write('\n')
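A toy round trip for save_word_vecs with a made-up two-word vocabulary:

toy = {'hello': [0.1, 0.2], 'world': [0.3, 0.4]}
save_word_vecs(toy, 'toy_vecs.txt')
# toy_vecs.txt now holds one word per line, e.g. 'hello 0.1000 0.2000 '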


from load_data import load_embeddings

glove = load_embeddings(
    "zh_tw", 'D:\Word_Embeddings\English\glove.6B\glove.6B.300d.txt')
vocabularies = glove.vocab.keys()
word2vec = load_embeddings(
    'google_news',
    'D:\Word_Embeddings\English\GoogleNews-vectors-negative300.bin')
common_keys = set(word2vec.vocab.keys()) & set(vocabularies)
wordVectors = dict()
for word in common_keys:
    wordVectors[word] = word2vec[word]
save_word_vecs(wordVectors,
               "D:\Word_Embeddings\English\simplified_word2vecs.txt")
Example #28
# dump_picle(word_idx_map, get_file_path('word_idx_map_CVAT'))
# print('dump word_idx_map successful')
# dump_picle(W, '/home/hs/Data/embedding_matrix_CVAT.p')
# print('OK')

word_idx_map = load_pickle(get_file_path('word_idx_map_CVAT'))
mark = load_mark(get_file_path('mark'))
valence, arousal = gold_valence_arousal(corpus, mark)
idx_data = make_idx_data(corpus, word_idx_map, max_len=200, kernel_size=5)

# dump_picle([idx_data, valence, arousal], get_file_path('CVAT_processed_data'))
idx_data, valence, arousal = load_pickle(get_file_path('CVAT_processed_data'))
print(idx_data.shape)
exit()

word_vecs = load_embeddings('zh_tw')

dim = len(word_vecs['我們'])  # 400

embedding_matrix, idx_map = build_embedding_matrix(word_vecs, k=dim)
print(embedding_matrix[1])
print(idx_map['我們'])

print(len(word_vecs['我們']))
print(word_vecs['我們'].shape)

print(build_sentence_matrix(model=word_vecs, sententces=corpus[:2], dim=dim))

print('Result')
sentence_embedding_matrix = build_sentence_matrix(word_vecs, corpus, dim=dim)
print(sentence_embedding_matrix.shape)
import tensorflow as tf
from load_data import load_embeddings
import numpy as np

# directory = 'glove/glove.twitter.27B/'
directory = 'glove/glove.6B/'
# filename = 'glove.twitter.27B.200d.txt'
filename = 'glove.6B.100d.txt'
embedding_dim = 100

wordsList, wordVectors = load_embeddings(directory, filename, embedding_dim)

maxSeqLength = 10  #Maximum length of sentence
numDimensions = 100  #Dimensions for each word vector

firstSentence = np.zeros((maxSeqLength), dtype='int32')
firstSentence[0] = wordsList.index("i")
firstSentence[1] = wordsList.index("thought")
firstSentence[2] = wordsList.index("the")
firstSentence[3] = wordsList.index("movie")
firstSentence[4] = wordsList.index("was")
firstSentence[5] = wordsList.index("incredible")
firstSentence[6] = wordsList.index("and")
firstSentence[7] = wordsList.index("inspiring")
#firstSentence[8] and firstSentence[9] are going to be 0
print(firstSentence.shape)
print(firstSentence)  #Shows the row index for each word

with tf.Session() as sess:
    print(tf.nn.embedding_lookup(wordVectors, firstSentence).eval().shape)
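tf.nn.embedding_lookup gathers rows by index, so the printed shape above is (maxSeqLength, numDimensions) = (10, 100); the same gather in plain NumPy with toy data:

import numpy as np

toy_vectors = np.random.rand(5, 100).astype('float32')  # 5-word toy vocabulary
toy_sentence = np.array([1, 3, 0], dtype='int32')  # row indices
print(toy_vectors[toy_sentence].shape)  # (3, 100)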
Example #31
# Note: this is the file that builds the CNN input data for CVAT
########################################## config ########################################
vec_dim = 400
##########################################################################################
corpus = load_corpus(get_file_path('cn_corpus'))
print(corpus[:2])
vocab = get_vocab(corpus)
dump_picle(vocab, get_file_path('CVAT_Vocab'))
print('Dump CVAT vocab OK')
# vocab = load_pickle(get_file_path('CVAT_Vocab'))
for i in vocab:
    print(i)
print(len(vocab))

W, word_idx_map = build_embedding_matrix(load_embeddings('zh_tw'),
                                         vocab,
                                         k=400)
dump_picle(word_idx_map, get_file_path('word_idx_map_CVAT'))
print('dump word_idx_map successful')
dump_picle(W, './data/tmp/embedding_matrix_CVAT.p')
print('OK')

# word_idx_map = load_pickle(get_file_path('word_idx_map_CVAT'))
mark = load_mark(get_file_path('mark'))
valence, arousal = gold_valence_arousal(corpus, mark)
idx_data = make_idx_data(corpus, word_idx_map, max_len=200, kernel_size=5)

dump_picle([idx_data, valence, arousal], get_file_path('CVAT_processed_data'))
# idx_data, valence, arousal = load_pickle(get_file_path('CVAT_processed_data'))
print(idx_data.shape)
Example #32
__author__ = 'NLP-PC'
import random

import numpy as np

from file_name import get_file_path
from load_data import load_embeddings
from word2vec_fn import buill_word_vector
from cross_validation import cv
from load_data import load_vader
from affective_score_vader import screen_data
from load_data import load_anew

print('start')
model = load_embeddings('google_news')

corpus, ratings = load_vader(['tweets', 'movie_reviews', 'product_reviews', 'news_articles'])
lexicon_name = get_file_path('anew')
words, valences, _ = load_anew(lexicon_name)
corpus, ratings = screen_data(corpus, ratings, words)
ratings = np.array(ratings) + np.ones(len(ratings), dtype=float) * 5
print(np.histogram(ratings, bins=range(10)))
print(len(model.vocab.keys()))
vecs = np.concatenate([buill_word_vector(text, model, size=300) for text in corpus])
print(vecs[1])
cv(vecs, ratings, multivariant=True)

vecs = None
ratings = None
corpus, ratings = load_vader(['tweets'])
lexicon_name = get_file_path('anew')
def keras_nn_input(word_vectors_model, amending):
    if word_vectors_model == 'word2vec':
        if amending == True:
            filename_data, filename_w = './tmp/amended_w2v_indexed_data.p', './tmp/amended_w2v_Weight.p'
        elif amending == False:
            filename_data, filename_w = './tmp/w2v_indexed_data.p', './tmp/w2v_Weight.p'
        else:
            raise Exception('Wrong!')
    elif word_vectors_model == 'GloVe':
        if amending == True:
            filename_data, filename_w = './tmp/amended_GloVe_indexed_data.p', './tmp/amended_GloVe_Weight.p'
        elif amending == False:
            filename_data, filename_w = './tmp/GloVe_indexed_data.p', './tmp/GloVe_Weight.p'
        else:
            raise Exception('Wrong!')
    else:
        raise Exception('Wrong parameter!')

    if os.path.isfile(filename_data) and os.path.isfile(filename_w):
        data = load_pickle(filename_data)
        W = load_pickle(filename_w)
        print('Load OK, parameters: word_vectors_model = %s, amending = %s'%(word_vectors_model, amending))
        return (data, W)

    # load data from pickle
    (x_train, y_train_valence, y_train_labels,
     x_test, y_test_valence, y_test_labels,
     x_valid, y_valid_valence, y_valid_labels,
     x_train_polarity, y_train_polarity,
     x_test_polarity, y_test_polarity,
     x_valid_polarity, y_valid_polarity) = load_sst(path='./resources/stanfordSentimentTreebank/')

    vocab = get_vocab(x_train)

    if word_vectors_model == 'word2vec':
        if amending == True:
            word_vecs = load_embeddings('amended_word2vec')
        elif amending == False:
            word_vecs = load_embeddings('google_news', '/home/hs/Data/Word_Embeddings/google_news.bin')
        else:
            raise Exception('Wrong!')
    elif word_vectors_model == 'GloVe':
        if amending == True:
            word_vecs = load_embeddings('amended_glove')
        elif amending == False:
            word_vecs = load_embeddings('glove')
        else:
            raise Exception('Wrong!')
    else:
        raise Exception('Wrong parameter!')

    word_vecs = add_unknown_words(word_vecs, vocab)
    W, word_idx_map = build_embedding_matrix(word_vecs, vocab)

    x_train_idx_data = make_idx_data(x_train, word_idx_map)
    x_test_idx_data = make_idx_data(x_test, word_idx_map)
    x_valid_idx_data = make_idx_data(x_valid, word_idx_map)
    x_train_polarity_idx_data = make_idx_data(x_train_polarity, word_idx_map)
    x_test_polarity_idx_data = make_idx_data(x_test_polarity, word_idx_map)
    x_valid_polarity_idx_data = make_idx_data(x_valid_polarity, word_idx_map)

    data = (x_train_idx_data, y_train_valence, y_train_labels,
            x_test_idx_data, y_test_valence, y_test_labels,
            x_valid_idx_data, y_valid_valence, y_valid_labels,
            x_train_polarity_idx_data, y_train_polarity,
            x_test_polarity_idx_data, y_test_polarity,
            x_valid_polarity_idx_data, y_valid_polarity)

    dump_picle(data, filename_data)
    dump_picle(W, filename_w)
    print('Construction and dump OK, parameters: word_vectors_model = %s, amending = %s'%(word_vectors_model, amending))
    return (data, W)
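A usage sketch; on later calls the pickles written under ./tmp are reused:

data, W = keras_nn_input('word2vec', amending=False)
(x_train_idx, y_train_valence, y_train_labels, *rest) = data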
Example #37
def keras_nn_input(word_vectors_model, amending):
    if word_vectors_model == 'word2vec':
        if amending == True:
            filename_data, filename_w = './tmp/amended_w2v_indexed_data.p', './tmp/amended_w2v_Weight.p'
        elif amending == False:
            filename_data, filename_w = './tmp/w2v_indexed_data.p', './tmp/w2v_Weight.p'
        else:
            raise Exception('Wrong!')
    elif word_vectors_model == 'GloVe':
        if amending == True:
            filename_data, filename_w = './tmp/amended_GloVe_indexed_data.p', './tmp/amended_GloVe_Weight.p'
        elif amending == False:
            filename_data, filename_w = './tmp/GloVe_indexed_data.p', './tmp/GloVe_Weight.p'
        else:
            raise Exception('Wrong!')

    elif word_vectors_model == 'retrofitted_GloVe':
        filename_data, filename_w = './tmp/retrofitted_GloVe_indexed_data.p', './tmp/retrofitted_GloVe_Weight.p'
    elif word_vectors_model == 'retrofitted_word2vec':
        filename_data, filename_w = './tmp/retrofitted_word2vec_indexed_data.p', './tmp/retrofitted_word2vec_Weight.p'

    else:
        raise Exception('Wrong parameter!')

    if os.path.isfile(filename_data) and os.path.isfile(filename_w):
        data = load_pickle(filename_data)
        W = load_pickle(filename_w)
        print('Load OK, parameters: word_vectors_model = %s, amending = %s'%(word_vectors_model, amending))
        return (data, W)

    # load data from pickle
    (x_train, y_train_valence, y_train_labels,
     x_test, y_test_valence, y_test_labels,
     x_valid, y_valid_valence, y_valid_labels,
     x_train_polarity, y_train_polarity,
     x_test_polarity, y_test_polarity,
     x_valid_polarity, y_valid_polarity) = load_sst(path='./resources/stanfordSentimentTreebank/')

    vocab = get_vocab(x_train)

    if word_vectors_model == 'word2vec':
        if amending == True:
            word_vecs = load_embeddings('amended_word2vec')
        elif amending == False:
            word_vecs = load_embeddings('google_news', '/home/hs/Data/Word_Embeddings/google_news.bin')
        else:
            raise Exception('Wrong!')
    elif word_vectors_model == 'GloVe':
        if amending == True:
            word_vecs = load_embeddings('amended_glove')
        elif amending == False:
            word_vecs = load_embeddings('glove')
        else:
            raise Exception('Wrong!')
    elif word_vectors_model == 'retrofitted_GloVe':
        word_vecs = load_embeddings('zh_tw', 'D:\Word_Embeddings\English\glove.6B\GloVe_out_vec_file.txt')
        # convert gensim model to dict type
        w2v = dict()
        for key in word_vecs.vocab.keys():
            w2v[key] = word_vecs[key]
        word_vecs = w2v

    elif word_vectors_model == 'retrofitted_word2vec':
        word_vecs = load_embeddings('zh_tw', 'D:\Word_Embeddings\English\word2vec_out_vec_file.txt')
        # convert gensim model to dict type
        w2v = dict()
        for key in word_vecs.vocab.keys():
            w2v[key] = word_vecs[key]
        word_vecs = w2v

    else:
        raise Exception('Wrong parameter!')

    word_vecs = add_unknown_words(word_vecs, vocab)
    W, word_idx_map = build_embedding_matrix(word_vecs, vocab)

    x_train_idx_data = make_idx_data(x_train, word_idx_map)
    x_test_idx_data = make_idx_data(x_test, word_idx_map)
    x_valid_idx_data = make_idx_data(x_valid, word_idx_map)
    x_train_polarity_idx_data = make_idx_data(x_train_polarity, word_idx_map)
    x_test_polarity_idx_data = make_idx_data(x_test_polarity, word_idx_map)
    x_valid_polarity_idx_data = make_idx_data(x_valid_polarity, word_idx_map)

    data = (x_train_idx_data, y_train_valence, y_train_labels,
            x_test_idx_data, y_test_valence, y_test_labels,
            x_valid_idx_data, y_valid_valence, y_valid_labels,
            x_train_polarity_idx_data, y_train_polarity,
            x_test_polarity_idx_data, y_test_polarity,
            x_valid_polarity_idx_data, y_valid_polarity)

    dump_picle(data, filename_data)
    dump_picle(W, filename_w)
    print('Construction and dump OK, parameters: word_vectors_model = %s, amending = %s'%(word_vectors_model, amending))
    return (data, W)
__author__ = 'NLP-PC'
import gensim
import os
import time
from file_name import get_file_path
from load_data import load_corpus, load_lexicon, load_mark
from load_data import load_embeddings
from word2vec_fn import buill_word_vector
from word2vec_fn import gold_valence_arousal
import numpy as np
from sklearn import cross_validation
from cross_validation import cv

model = load_embeddings('zh_tw')

print(len(model.vocab.keys()))
corpus = load_corpus(get_file_path('cn_corpus'))
mark = load_mark(get_file_path('mark'))
vecs = np.concatenate([buill_word_vector(text, model) for text in corpus])
valence, arousal = gold_valence_arousal(corpus, mark)
cv(vecs, valence, multivariant=True)
cv(vecs, arousal, multivariant=True)
exit()
# from save_data import dump_picle
# dump_picle(model.key(), get_file_path('words_in_wordvec'))
# print('ok')
#
# # print(model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1))
# # print(model.doesnt_match("breakfast cereal dinner lunch".split()))
# # print(model.similarity('woman', 'man'))
# # print(model.most_similar_cosmul(positive=['baghdad', 'england'], negative=['london'], topn=10))