def build_common_words_vectors(vecs='word2vec'):
    words_list = load_Bing_Liu('negative')
    word_vectors = load_embeddings('google_news', '/home/hs/Data/Word_Embeddings/google_news.bin')
    if vecs == 'GloVe':
        word_vectors = load_embeddings(arg='glove')
    common_words(word_vectors, words_list)
def build_keras_input():
    filename_data, filename_w = './tmp/indexed_data.p', './tmp/Weight.p'

    if os.path.isfile(filename_data) and os.path.isfile(filename_w):
        data = load_pickle(filename_data)
        W = load_pickle(filename_w)
        print('Load OK.')
        return (data, W)

    # load data from pickle
    texts, valence, arousal = load_CVAT_2('./resources/CVAT2.0(sigma=1.5).csv')
    vocab = get_vocab(texts)

    # using word2vec vectors
    # word_vecs = load_embeddings('google_news', '/home/hs/Data/Word_Embeddings/google_news.bin')
    word_vecs = load_embeddings('zh', '/home/hs/Data/wikipedia/word2vec_word/traditional_wordvecs/wiki.zh.text.traditional_wordvecs.txt')

    # load glove vectors
    # word_vecs = load_embeddings(arg='glove')

    word_vecs = add_unknown_words(word_vecs, vocab)
    W, word_idx_map = build_embedding_matrix(word_vecs, vocab)
    idx_data = make_idx_data(texts, word_idx_map)

    data = (idx_data, valence, arousal)

    dump_picle(data, filename_data)
    dump_picle(W, filename_w)
    return (data, W)
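# Added usage sketch (not from the original sources): a minimal illustration of how
# the (data, W) pair returned by build_keras_input() is typically consumed, assuming
# a Keras model is built downstream. `maxlen` is a hypothetical padding length chosen
# by the caller; Embedding and its parameters are standard Keras API.
from keras.models import Sequential
from keras.layers import Embedding

def embedding_layer_from_W(W, maxlen=200):
    """Seed a Keras Embedding layer with the pre-built weight matrix W."""
    model = Sequential()
    # W.shape[0] = vocabulary size (including padding/unknown rows),
    # W.shape[1] = word-vector dimensionality.
    model.add(Embedding(input_dim=W.shape[0], output_dim=W.shape[1],
                        weights=[W], input_length=maxlen, trainable=False))
    return model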
def main():
    global args
    args = parser.parse_args()
    use_gpu = torch.cuda.is_available()

    # Load and process data
    print('Loading dataset')
    time_data = time.time()
    POS, NEG, train_iter_pos, train_iter_neg, val_iter_pos, val_iter_neg = preprocess(args.v, args.b)
    print('Loaded data. |POS| = {}, |NEG| = {}. Time: {:.2f}.'.format(
        len(POS.vocab), len(NEG.vocab), time.time() - time_data))

    # Load embeddings if available
    LOAD_EMBEDDINGS = False
    if LOAD_EMBEDDINGS:
        np_emb_e1_file = 'scripts/emb-{}-de.npy'.format(len(POS.vocab))
        np_emb_e2_file = 'scripts/emb-{}-de.npy'.format(len(NEG.vocab))
        np_emb_d1_file = 'scripts/emb-{}-de.npy'.format(len(POS.vocab))
        np_emb_d2_file = 'scripts/emb-{}-de.npy'.format(len(NEG.vocab))
        embedding_e1, embedding_e2, embedding_d1, embedding_d2 = load_embeddings(
            np_emb_e1_file, np_emb_e2_file, np_emb_d1_file, np_emb_d2_file)
        print('Loaded embedding vectors from np files')
    else:
        embedding_e1 = (torch.rand(len(POS.vocab), args.emb) - 0.5) * 2
        embedding_e2 = (torch.rand(len(NEG.vocab), args.emb) - 0.5) * 2
        embedding_d1 = (torch.rand(len(POS.vocab), args.emb) - 0.5) * 2
        embedding_d2 = (torch.rand(len(NEG.vocab), args.emb) - 0.5) * 2
        print('Initialized embedding vectors')

    # Create model
    tokens = [NEG.vocab.stoi[x] for x in ['<s>', '</s>', '<pad>', '<unk>']]
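# Added sketch (not part of the original file): how the embedding tensors built in
# main() are usually wrapped in nn.Embedding layers before being handed to the
# encoder/decoder modules; those model classes are not shown here, so this helper is
# only illustrative.
import torch
import torch.nn as nn

def embedding_layer_from_matrix(weight_matrix):
    """Copy a pre-built (vocab_size, emb_dim) tensor into an nn.Embedding layer."""
    vocab_size, emb_dim = weight_matrix.size()
    layer = nn.Embedding(vocab_size, emb_dim)
    layer.weight.data.copy_(weight_matrix)
    return layer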
def check_same_words(words):
    model = load_embeddings('google_news', '/home/hs/Data/Word_Embeddings/google_news.bin')
    full_words = model.vocab.keys()
    same_words = set(words).intersection(full_words)
    print(set(words) - same_words)
    print(len(same_words))
def check_same_words(words):
    model = load_embeddings(arg='zh_tw', filename="D:\Word_Embeddings\English\glove.6B\glove.6B.300d.txt")
    full_words = model.vocab.keys()
    same_words = set(words).intersection(full_words)
    print(set(words) - same_words)
    print(len(same_words))
def check_same_words(words):
    model = load_embeddings('google_news', 'D:\Word_Embeddings\English\GoogleNews-vectors-negative300.bin')
    full_words = model.vocab.keys()
    same_words = set(words).intersection(full_words)
    print(set(words) - same_words)
    print(len(same_words))
def run_build_docvecs():
    model = load_embeddings('twitter')
    simple_evaluate(model)
    _, ratings = load_vader('./resource/tweets.txt')
    # Do not count the 1240-th and 3516-th items
    # r = ratings[:1240] + ratings[1241:3516] + ratings[3517:]
    build_docvecs(model, ratings)
def build_ori_anew_vectors(words):
    filename = './tmp/anew_vectors_glove.p'
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings(arg='zh_tw', filename="D:\Word_Embeddings\English\glove.6B\glove.6B.300d.txt")
    vecs = []
    for w in words:
        vecs.append(model[w])
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
def build_ori_anew_vectors(words):
    filename = "./tmp/anew_vectors.p"
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings("google_news", "/home/hs/Data/Word_Embeddings/google_news.bin")
    vecs = []
    for w in words:
        vecs.append(model[w])
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
def build_keras_input(texts, scores, test, new=True):
    dims = 300

    # texts, scores are dict type, key: train, dev, devtest.
    keys = ["train", "dev", "devtest"]
    train, train_scores = texts[keys[0]], scores[keys[0]]
    dev, dev_scores = texts[keys[1]], scores[keys[1]]
    devtest, devtest_scores = texts[keys[2]], scores[keys[2]]

    filename_data, filename_w = './tmp/indexed_data.p', './tmp/Weight.p'
    test_filename = './tmp/test_data.p'

    if os.path.isfile(filename_data) and os.path.isfile(filename_w) and new == False:
        data = load_pickle(filename_data)
        W = load_pickle(filename_w)
        test_data = load_pickle(test_filename)
        print('Use existing data. Load OK.')
        return (data, W, test_data)

    print("Construct new data.")
    # load data from pickle
    vocab = get_vocab(train)

    # using word2vec vectors
    # word_vecs = load_embeddings('google_news', '/home/hs/Data/Word_Embeddings/google_news.bin')
    # word_vecs = load_embeddings('D:/Word_Embeddings/glove.840B.300d.txt.w2v')
    word_vecs = load_embeddings('/home/hs/Data/Word_Embeddings/glove.840B.300d.txt.w2v')
    # word_vecs = load_embeddings('/home/hs/Data/Word_Embeddings/word2vec_twitter_model/word2vec_twitter_model.bin',
    #                             binary=True)

    word_vecs = add_unknown_words(word_vecs, vocab, k=dims)
    W, word_idx_map = build_embedding_matrix(word_vecs, vocab, k=dims)

    idx_data_train = make_idx_data(train, word_idx_map)
    idx_data_dev = make_idx_data(dev, word_idx_map)
    idx_data_devtest = make_idx_data(devtest, word_idx_map)
    idx_data_test = make_idx_data(test[2], word_idx_map)

    data = (idx_data_train, idx_data_dev, idx_data_devtest, train_scores, dev_scores, devtest_scores)
    test_data = (test[0], test[1], idx_data_test)

    dump_picle(data, filename_data)
    dump_picle(W, filename_w)
    dump_picle(test_data, test_filename)
    print("Saved: data and W are saved into: %s, and %s." % (filename_data, filename_w))
    return (data, W, test_data)
def build_ori_anew_vectors(words):
    filename = './tmp/anew_vectors_retrofitted_glove.p'
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings(arg='zh_tw', filename="D:\Word_Embeddings\English\glove.6B\GloVe_out_vec_file.txt")
    vecs = []
    for w in words:
        vecs.append(model[w])
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
def build_ori_anew_vectors(words):
    filename = './tmp/anew_vectors.p'
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings('google_news', 'D:\Word_Embeddings\English\GoogleNews-vectors-negative300.bin')
    vecs = []
    for w in words:
        vecs.append(model[w])
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
def build_amended_anew_vectors(words):
    filename = './tmp/amended_anew_vectors_glove.p'
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings(arg='zh_tw', filename="D:\Word_Embeddings\English\glove.6B\glove.6B.300d.txt")
    amended_pos = load_pickle('./tmp/amended_GloVe_pos.p')
    amended_neg = load_pickle('./tmp/amended_GloVe_neg.p')
    vecs = []
    for w in words:
        vec = amended_neg.get(w)
        if vec is None:
            vec = amended_pos.get(w)
        if vec is None:
            vec = model[w]
        vecs.append(vec)
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
def build_amended_anew_vectors(words):
    filename = './tmp/retrofitted_anew_vectors_word2vec.p'
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings(arg='zh_tw', filename="D:\Word_Embeddings\English\word2vec_out_vec_file.txt")
    amended_pos = load_pickle('./tmp/retrofitted_word2vec_pos.p')
    amended_neg = load_pickle('./tmp/retrofitted_word2vec_neg.p')
    vecs = []
    for w in words:
        vec = amended_neg.get(w)
        if vec is None:
            vec = amended_pos.get(w)
        if vec is None:
            vec = model[w]
        vecs.append(vec)
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
def build_amended_anew_vectors(words):
    filename = "./tmp/amended_anew_vectors.p"
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings("google_news", "/home/hs/Data/Word_Embeddings/google_news.bin")
    amended_pos = load_pickle("./tmp/amended_pos.p")
    amended_neg = load_pickle("./tmp/amended_neg.p")
    vecs = []
    for w in words:
        vec = amended_neg.get(w)
        if vec is None:
            vec = amended_pos.get(w)
        if vec is None:
            vec = model[w]
        vecs.append(vec)
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
def build_keras_input_amended():
    filename_data, filename_w = './tmp/amended_indexed_data.p', './tmp/amended_Weight.p'

    if os.path.isfile(filename_data) and os.path.isfile(filename_w):
        data = load_pickle(filename_data)
        W = load_pickle(filename_w)
        print('Load OK.')
        return (data, W)

    # load data from pickle
    (x_train, y_train_valence, y_train_labels,
     x_test, y_test_valence, y_test_labels,
     x_valid, y_valid_valence, y_valid_labels,
     x_train_polarity, y_train_polarity,
     x_test_polarity, y_test_polarity,
     x_valid_polarity, y_valid_polarity) = load_sst(path='./resources/stanfordSentimentTreebank/')

    vocab = get_vocab(x_train)
    # word_vecs = load_embeddings('google_news', '/home/hs/Data/Word_Embeddings/google_news.bin')
    # word_vecs = load_embeddings('glove')

    # load amended word vectors
    word_vecs = load_embeddings('amended_word2vec')

    word_vecs = add_unknown_words(word_vecs, vocab)
    W, word_idx_map = build_embedding_matrix(word_vecs, vocab)

    x_train_idx_data = make_idx_data(x_train, word_idx_map)
    x_test_idx_data = make_idx_data(x_test, word_idx_map)
    x_valid_idx_data = make_idx_data(x_valid, word_idx_map)
    x_train_polarity_idx_data = make_idx_data(x_train_polarity, word_idx_map)
    x_test_polarity_idx_data = make_idx_data(x_test_polarity, word_idx_map)
    x_valid_polarity_idx_data = make_idx_data(x_valid_polarity, word_idx_map)

    data = (x_train_idx_data, y_train_valence, y_train_labels,
            x_test_idx_data, y_test_valence, y_test_labels,
            x_valid_idx_data, y_valid_valence, y_valid_labels,
            x_train_polarity_idx_data, y_train_polarity,
            x_test_polarity_idx_data, y_test_polarity,
            x_valid_polarity_idx_data, y_valid_polarity)

    dump_picle(data, filename_data)
    dump_picle(W, filename_w)
    return (data, W)
vec_dim = 300

############################################## all ###############################################
corpus, ratings = load_vader(['tweets', 'movie_reviews', 'product_reviews', 'news_articles'])
# corpus, ratings = load_vader(['movie_reviews'])
# # name: tweets, movie_reviews, product_reviews, news_articles
lexicon_name = get_file_path('anew')
words, valences, _ = load_anew(lexicon_name)
corpus, ratings = screen_data(corpus, ratings, words)
ratings = np.array(ratings) + np.ones(len(ratings), dtype=float) * 5
print(len(corpus), len(corpus))

vocab = get_vocab(corpus)
# dump_picle(vocab, './data/corpus/vader/vocab_moview_reviews.p')
# vocab = load_pickle('./data/corpus/vader/vocab_moview_reviews.p')
print(len(vocab))

W, word_idx_map = build_embedding_matrix(load_embeddings('google_news'), vocab, k=300)
# dump_picle(word_idx_map, './data/corpus/vader/word_idx_map_movie_reviews.p')
# print('dump word_idx_map successful')
dump_picle(W, './data/corpus/vader/embedding_matrix_all.p')
print('dump embedding matrix file OK')

# word_idx_map = load_pickle('./data/corpus/vader/word_idx_map_movie_reviews.p')
idx_data = make_idx_data(corpus, word_idx_map, max_len=200, kernel_size=5)
dump_picle([idx_data, ratings], './data/corpus/vader/vader_processed_data_all.p')
print(idx_data[0])
print(ratings[0])

############################################## tweets ###############################################
# corpus, ratings = load_vader(['tweets', 'movie_reviews', 'product_reviews', 'news_articles'])
corpus, ratings = load_vader(['tweets'])
# # name: tweets, movie_reviews, product_reviews, news_articles
lexicon_name = get_file_path('anew')
__author__ = 'NLP-PC'
import gensim
import os
import time
from file_name import get_file_path
from load_data import load_corpus, load_lexicon, load_mark
from load_data import load_embeddings
from word2vec_fn import buill_word_vector
from word2vec_fn import gold_valence_arousal
import numpy as np
from sklearn import cross_validation
from cross_validation import cv
from word2vec_fn import build_doc_vector

# '''
model = load_embeddings('CVAT_docvecs')
print(model.docvecs[1])
print(model.docvecs['SENT_23'])
print(len(model.vocab.keys()))

corpus = load_corpus(get_file_path('cn_corpus'))
mark = load_mark(get_file_path('mark'))
vecs = build_doc_vector(corpus, model)
valence, arousal = gold_valence_arousal(corpus, mark)
cv(vecs, valence, multivariant=True)
cv(vecs, arousal, multivariant=True)
# '''

# from save_data import dump_picle
# dump_picle(model.key(), get_file_path('words_in_wordvec'))
# print('ok')
# dump_picle(word_idx_map, get_file_path('word_idx_map_CVAT'))
# print('dump word_idx_map successful')
# dump_picle(W, '/home/hs/Data/embedding_matrix_CVAT.p')
# print('OK')
word_idx_map = load_pickle(get_file_path('word_idx_map_CVAT'))
mark = load_mark(get_file_path('mark'))
valence, arousal = gold_valence_arousal(corpus, mark)
idx_data = make_idx_data(corpus, word_idx_map, max_len=200, kernel_size=5)
# dump_picle([idx_data, valence, arousal], get_file_path('CVAT_processed_data'))
idx_data, valence, arousal = load_pickle(get_file_path('CVAT_processed_data'))
print(idx_data.shape)
exit()

word_vecs = load_embeddings('zh_tw')
dim = len(word_vecs['我們'])  # 400
embedding_matrix, idx_map = build_embedding_matrix(word_vecs, k=dim)
print(embedding_matrix[1])
print(idx_map['我們'])
print(len(word_vecs['我們']))
print(word_vecs['我們'].shape)
print(build_sentence_matrix(model=word_vecs, sententces=corpus[:2], dim=dim))

print('Result')
sentence_embedding_matrix = build_sentence_matrix(word_vecs, corpus, dim=dim)
print(sentence_embedding_matrix.shape)
import tensorflow as tf
from load_data import load_embeddings
import numpy as np

# directory = 'glove/glove.twitter.27B/'
directory = 'glove/glove.6B/'
# filename = 'glove.twitter.27B.200d.txt'
filename = 'glove.6B.100d.txt'
embedding_dim = 100
wordsList, wordVectors = load_embeddings(directory, filename, embedding_dim)

maxSeqLength = 10  # Maximum length of sentence
numDimensions = 100  # Dimensions for each word vector
firstSentence = np.zeros((maxSeqLength), dtype='int32')
firstSentence[0] = wordsList.index("i")
firstSentence[1] = wordsList.index("thought")
firstSentence[2] = wordsList.index("the")
firstSentence[3] = wordsList.index("movie")
firstSentence[4] = wordsList.index("was")
firstSentence[5] = wordsList.index("incredible")
firstSentence[6] = wordsList.index("and")
firstSentence[7] = wordsList.index("inspiring")
# firstSentence[8] and firstSentence[9] are going to be 0
print(firstSentence.shape)
print(firstSentence)  # Shows the row index for each word

with tf.Session() as sess:
    print(tf.nn.embedding_lookup(wordVectors, firstSentence).eval().shape)
# Note: this file builds the CNN input data for the CVAT corpus.
########################################## config ########################################
vec_dim = 400
##########################################################################################
corpus = load_corpus(get_file_path('cn_corpus'))
print(corpus[:2])
vocab = get_vocab(corpus)
dump_picle(vocab, get_file_path('CVAT_Vocab'))
print('Dump CVAT vocab OK')
# vocab = load_pickle(get_file_path('CVAT_Vocab'))
for i in vocab:
    print(i)
print(len(vocab))

W, word_idx_map = build_embedding_matrix(load_embeddings('zh_tw'), vocab, k=400)
dump_picle(word_idx_map, get_file_path('word_idx_map_CVAT'))
print('dump word_idx_map successful')
dump_picle(W, './data/tmp/embedding_matrix_CVAT.p')
print('OK')
# word_idx_map = load_pickle(get_file_path('word_idx_map_CVAT'))

mark = load_mark(get_file_path('mark'))
valence, arousal = gold_valence_arousal(corpus, mark)
idx_data = make_idx_data(corpus, word_idx_map, max_len=200, kernel_size=5)
dump_picle([idx_data, valence, arousal], get_file_path('CVAT_processed_data'))
# idx_data, valence, arousal = load_pickle(get_file_path('CVAT_processed_data'))
print(idx_data.shape)
__author__ = 'NLP-PC'
import random
import numpy as np
from file_name import get_file_path
from load_data import load_embeddings
from word2vec_fn import buill_word_vector
from cross_validation import cv
from load_data import load_vader
from affective_score_vader import screen_data
from load_data import load_anew

print('start')
model = load_embeddings('google_news')

corpus, ratings = load_vader(['tweets', 'movie_reviews', 'product_reviews', 'news_articles'])
lexicon_name = get_file_path('anew')
words, valences, _ = load_anew(lexicon_name)
corpus, ratings = screen_data(corpus, ratings, words)
ratings = np.array(ratings) + np.ones(len(ratings), dtype=float) * 5
print(np.histogram(ratings, bins=range(10)))

print(len(model.vocab.keys()))
vecs = np.concatenate([buill_word_vector(text, model, size=300) for text in corpus])
print(vecs[1])
cv(vecs, ratings, multivariant=True)

vecs = None
ratings = None
corpus, ratings = load_vader(['tweets'])
lexicon_name = get_file_path('anew')
# The purpose of this file is to simplify word2vec pre-trained models,
# as they are too big to load on an 8 GB memory laptop.
# Another reason is fair comparison: the GloVe vectors cover far fewer words than the
# vocabularies of the word2vec models.
# The simplification just keeps the word2vec vocabulary entries that also appear in GloVe.
def save_word_vecs(wordVectors, outFileName):
    # the type of wordVectors is: dict()
    print('\nWriting down the vectors in ' + outFileName + '\n')
    outFile = open(outFileName, 'w', encoding='utf-8')
    for word, values in wordVectors.items():
        outFile.write(word + ' ')
        for val in wordVectors[word]:
            outFile.write('%.4f' % (val) + ' ')
        outFile.write('\n')
    outFile.close()


from load_data import load_embeddings

glove = load_embeddings("zh_tw", 'D:\Word_Embeddings\English\glove.6B\glove.6B.300d.txt')
vocabularies = glove.vocab.keys()
word2vec = load_embeddings('google_news', 'D:\Word_Embeddings\English\GoogleNews-vectors-negative300.bin')

common_keys = set(word2vec.vocab.keys()) & set(vocabularies)
wordVectors = dict()
for word in common_keys:
    wordVectors[word] = word2vec[word]

save_word_vecs(wordVectors, "D:\Word_Embeddings\English\simplified_word2vecs.txt")
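# Added sketch (not part of the original file): save_word_vecs() writes one word per
# line followed by its space-separated values and no word2vec header line, so the
# simplified file can be read back with a plain parser like the one below
# (load_embeddings in this code base may expect a different format).
def read_word_vecs(fileName):
    vectors = {}
    with open(fileName, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            word, values = parts[0], parts[1:]
            if values:
                vectors[word] = [float(v) for v in values]
    return vectors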
def keras_nn_input(word_vectors_model, amending):
    if word_vectors_model == 'word2vec':
        if amending == True:
            filename_data, filename_w = './tmp/amended_w2v_indexed_data.p', './tmp/amended_w2v_Weight.p'
        elif amending == False:
            filename_data, filename_w = './tmp/w2v_indexed_data.p', './tmp/w2v_Weight.p'
        else:
            raise Exception('Wrong!')
    elif word_vectors_model == 'GloVe':
        if amending == True:
            filename_data, filename_w = './tmp/amended_GloVe_indexed_data.p', './tmp/amended_GloVe_Weight.p'
        elif amending == False:
            filename_data, filename_w = './tmp/GloVe_indexed_data.p', './tmp/GloVe_Weight.p'
        else:
            raise Exception('Wrong!')
    else:
        raise Exception('Wrong parameter!')

    if os.path.isfile(filename_data) and os.path.isfile(filename_w):
        data = load_pickle(filename_data)
        W = load_pickle(filename_w)
        print('Load OK, parameters: word_vectors_model = %s, amending = %s' % (word_vectors_model, amending))
        return (data, W)

    # load data from pickle
    (x_train, y_train_valence, y_train_labels,
     x_test, y_test_valence, y_test_labels,
     x_valid, y_valid_valence, y_valid_labels,
     x_train_polarity, y_train_polarity,
     x_test_polarity, y_test_polarity,
     x_valid_polarity, y_valid_polarity) = load_sst(path='./resources/stanfordSentimentTreebank/')

    vocab = get_vocab(x_train)

    if word_vectors_model == 'word2vec':
        if amending == True:
            word_vecs = load_embeddings('amended_word2vec')
        elif amending == False:
            word_vecs = load_embeddings('google_news', '/home/hs/Data/Word_Embeddings/google_news.bin')
        else:
            raise Exception('Wrong!')
    elif word_vectors_model == 'GloVe':
        if amending == True:
            word_vecs = load_embeddings('amended_glove')
        elif amending == False:
            word_vecs = load_embeddings('glove')
        else:
            raise Exception('Wrong!')
    else:
        raise Exception('Wrong parameter!')

    word_vecs = add_unknown_words(word_vecs, vocab)
    W, word_idx_map = build_embedding_matrix(word_vecs, vocab)

    x_train_idx_data = make_idx_data(x_train, word_idx_map)
    x_test_idx_data = make_idx_data(x_test, word_idx_map)
    x_valid_idx_data = make_idx_data(x_valid, word_idx_map)
    x_train_polarity_idx_data = make_idx_data(x_train_polarity, word_idx_map)
    x_test_polarity_idx_data = make_idx_data(x_test_polarity, word_idx_map)
    x_valid_polarity_idx_data = make_idx_data(x_valid_polarity, word_idx_map)

    data = (x_train_idx_data, y_train_valence, y_train_labels,
            x_test_idx_data, y_test_valence, y_test_labels,
            x_valid_idx_data, y_valid_valence, y_valid_labels,
            x_train_polarity_idx_data, y_train_polarity,
            x_test_polarity_idx_data, y_test_polarity,
            x_valid_polarity_idx_data, y_valid_polarity)

    dump_picle(data, filename_data)
    dump_picle(W, filename_w)
    print('Load OK, parameters: word_vectors_model = %s, amending = %s' % (word_vectors_model, amending))
    return (data, W)
def keras_nn_input(word_vectors_model, amending):
    if word_vectors_model == 'word2vec':
        if amending == True:
            filename_data, filename_w = './tmp/amended_w2v_indexed_data.p', './tmp/amended_w2v_Weight.p'
        elif amending == False:
            filename_data, filename_w = './tmp/w2v_indexed_data.p', './tmp/w2v_Weight.p'
        else:
            raise Exception('Wrong!')
    elif word_vectors_model == 'GloVe':
        if amending == True:
            filename_data, filename_w = './tmp/amended_GloVe_indexed_data.p', './tmp/amended_GloVe_Weight.p'
        elif amending == False:
            filename_data, filename_w = './tmp/GloVe_indexed_data.p', './tmp/GloVe_Weight.p'
        else:
            raise Exception('Wrong!')
    elif word_vectors_model == 'retrofitted_GloVe':
        filename_data, filename_w = './tmp/retrofitted_GloVe_indexed_data.p', './tmp/retrofitted_GloVe_Weight.p'
    elif word_vectors_model == 'retrofitted_word2vec':
        filename_data, filename_w = './tmp/retrofitted_word2vec_indexed_data.p', './tmp/retrofitted_word2vec_Weight.p'
    else:
        raise Exception('Wrong parameter!')

    if os.path.isfile(filename_data) and os.path.isfile(filename_w):
        data = load_pickle(filename_data)
        W = load_pickle(filename_w)
        print('Load OK, parameters: word_vectors_model = %s, amending = %s' % (word_vectors_model, amending))
        return (data, W)

    # load data from pickle
    (x_train, y_train_valence, y_train_labels,
     x_test, y_test_valence, y_test_labels,
     x_valid, y_valid_valence, y_valid_labels,
     x_train_polarity, y_train_polarity,
     x_test_polarity, y_test_polarity,
     x_valid_polarity, y_valid_polarity) = load_sst(path='./resources/stanfordSentimentTreebank/')

    vocab = get_vocab(x_train)

    if word_vectors_model == 'word2vec':
        if amending == True:
            word_vecs = load_embeddings('amended_word2vec')
        elif amending == False:
            word_vecs = load_embeddings('google_news', '/home/hs/Data/Word_Embeddings/google_news.bin')
        else:
            raise Exception('Wrong!')
    elif word_vectors_model == 'GloVe':
        if amending == True:
            word_vecs = load_embeddings('amended_glove')
        elif amending == False:
            word_vecs = load_embeddings('glove')
        else:
            raise Exception('Wrong!')
    elif word_vectors_model == 'retrofitted_GloVe':
        word_vecs = load_embeddings('zh_tw', 'D:\Word_Embeddings\English\glove.6B\GloVe_out_vec_file.txt')
        # convert gensim model to dict type
        w2v = dict()
        for key in word_vecs.vocab.keys():
            w2v[key] = word_vecs[key]
        word_vecs = w2v
    elif word_vectors_model == 'retrofitted_word2vec':
        word_vecs = load_embeddings('zh_tw', 'D:\Word_Embeddings\English\word2vec_out_vec_file.txt')
        # convert gensim model to dict type
        w2v = dict()
        for key in word_vecs.vocab.keys():
            w2v[key] = word_vecs[key]
        word_vecs = w2v
    else:
        raise Exception('Wrong parameter!')

    word_vecs = add_unknown_words(word_vecs, vocab)
    W, word_idx_map = build_embedding_matrix(word_vecs, vocab)

    x_train_idx_data = make_idx_data(x_train, word_idx_map)
    x_test_idx_data = make_idx_data(x_test, word_idx_map)
    x_valid_idx_data = make_idx_data(x_valid, word_idx_map)
    x_train_polarity_idx_data = make_idx_data(x_train_polarity, word_idx_map)
    x_test_polarity_idx_data = make_idx_data(x_test_polarity, word_idx_map)
    x_valid_polarity_idx_data = make_idx_data(x_valid_polarity, word_idx_map)

    data = (x_train_idx_data, y_train_valence, y_train_labels,
            x_test_idx_data, y_test_valence, y_test_labels,
            x_valid_idx_data, y_valid_valence, y_valid_labels,
            x_train_polarity_idx_data, y_train_polarity,
            x_test_polarity_idx_data, y_test_polarity,
            x_valid_polarity_idx_data, y_valid_polarity)

    dump_picle(data, filename_data)
    dump_picle(W, filename_w)
    print('Load OK, parameters: word_vectors_model = %s, amending = %s' % (word_vectors_model, amending))
    return (data, W)
__author__ = 'NLP-PC'
import gensim
import os
import time
from file_name import get_file_path
from load_data import load_corpus, load_lexicon, load_mark
from load_data import load_embeddings
from word2vec_fn import buill_word_vector
from word2vec_fn import gold_valence_arousal
import numpy as np
from sklearn import cross_validation
from cross_validation import cv

model = load_embeddings('zh_tw')
print(len(model.vocab.keys()))

corpus = load_corpus(get_file_path('cn_corpus'))
mark = load_mark(get_file_path('mark'))
vecs = np.concatenate([buill_word_vector(text, model) for text in corpus])
valence, arousal = gold_valence_arousal(corpus, mark)
cv(vecs, valence, multivariant=True)
cv(vecs, arousal, multivariant=True)
exit()

# from save_data import dump_picle
# dump_picle(model.key(), get_file_path('words_in_wordvec'))
# print('ok')

# print(model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1))
# print(model.doesnt_match("breakfast cereal dinner lunch".split()))
# print(model.similarity('woman', 'man'))
# print(model.most_similar_cosmul(positive=['baghdad', 'england'], negative=['london'], topn=10))