def build_keras_input():
    filename_data, filename_w = './tmp/indexed_data.p', './tmp/Weight.p'

    if os.path.isfile(filename_data) and os.path.isfile(filename_w):
        data = load_pickle(filename_data)
        W = load_pickle(filename_w)
        print('Load OK.')
        return (data, W)

    # load data from pickle
    texts, valence, arousal = load_CVAT_2('./resources/CVAT2.0(sigma=1.5).csv')
    vocab = get_vocab(texts)

    # using word2vec vectors
    # word_vecs = load_embeddings('google_news', '/home/hs/Data/Word_Embeddings/google_news.bin')
    word_vecs = load_embeddings(
        'zh',
        '/home/hs/Data/wikipedia/word2vec_word/traditional_wordvecs/wiki.zh.text.traditional_wordvecs.txt')
    # load glove vectors
    # word_vecs = load_embeddings(arg='glove')

    word_vecs = add_unknown_words(word_vecs, vocab)
    W, word_idx_map = build_embedding_matrix(word_vecs, vocab)

    idx_data = make_idx_data(texts, word_idx_map)

    data = (idx_data, valence, arousal)

    dump_picle(data, filename_data)
    dump_picle(W, filename_w)
    return (data, W)
def train_doc2vec():
    # def isEnglish(s):
    #     try:
    #         s.encode('ascii')
    #     except UnicodeEncodeError:
    #         return False
    #     else:
    #         return True

    labeled_data, _ = load_vader('./resource/tweets.txt')
    # for i, d in enumerate(labeled_data):
    #     print(i)
    #     if not isEnglish(d):
    #         print('*' * 111)
    #         print(i, d)
    # exit()
    unlabeled_data, _ = load_sentiment140('/home/hs/Data/Corpus/training.csv')

    labeled_data = preprocess(labeled_data, replace=True)
    dump_picle(labeled_data, './data/acc/labeled_data.p')
    unlabeled_data = preprocess(unlabeled_data, replace=True)
    dump_picle(unlabeled_data, './data/acc/unlabeled_data.p')

    # labeled_data = load_pickle('./data/acc/labeled_data.p')
    # unlabeled_data = load_pickle('./data/acc/unlabeled_data.p')

    sentence = TaggedLineSentence(labeled_data, unlabeled_data)
    train_docvecs(sentence)
def kNN(train_data, train_labels, test):
    log_state('Use kNN classifier')
    clf = KNeighborsClassifier(n_neighbors=5)
    clf.fit(train_data, train_labels)
    predict_labels = clf.predict(test)
    dump_picle(predict_labels, './data/predict_labels/predict_labels.p')
    logger.info('kNN classifier training complete, saved predict labels to pickle')
    return
def logit(train_data, train_labels, test):
    log_state('Use logistic regression classifier')
    clf = linear_model.LogisticRegression(C=1e5)
    clf.fit(train_data, train_labels)
    predict_labels = clf.predict(test)
    dump_picle(predict_labels, './data/predict_labels/predict_labels.p')
    logger.info('MaxEnt classifier training complete, saved predict labels to pickle')
    return
def svm_classify(train_data, train_labels, test):
    log_state('Use SVM classifier')
    clf = svm.SVC(C=5.0, kernel='linear')
    clf.fit(train_data, train_labels)
    predict_labels = clf.predict(test)
    dump_picle(predict_labels, './data/predict_labels/predict_labels.p')
    logger.info('SVM classifier training complete, saved predict labels to pickle')
    return
def build_docvecs(model, ratings):
    nb_text = len(ratings)  # 4200
    size = len(model.docvecs['L_SENT_0'])  # 50
    vecs = [model.docvecs['L_SENT_%s' % id].reshape((1, size)) for id in range(nb_text)]
    dump_picle((np.concatenate(vecs), ratings), './data/acc/twitter_docvecs.p')
def build_ori_anew_vectors(words):
    filename = './tmp/anew_vectors.p'
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings('google_news', '/home/hs/Data/Word_Embeddings/google_news.bin')
    vecs = []
    for w in words:
        vecs.append(model[w])
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
def gNB(train_data, train_labels, test, save_result=False):
    log_state('Use Gaussian Naive Bayes classifier')
    clf = GaussianNB()
    clf.fit(train_data, train_labels)
    predict_labels = clf.predict(test)
    predict_proba = clf.predict_proba(test)
    if save_result == True:
        dump_picle(predict_labels, './data/predict_labels/predict_labels.p')
        dump_picle(predict_proba, './data/predict_labels/predict_proba.p')
    logger.info('Classifier training complete, saved predict labels to pickle')
    return predict_labels
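# Illustrative usage sketch (an assumption, not code from the original project): the classifier
# helpers above take an already-vectorised training matrix, its labels, and a test matrix.
# The TF-IDF pickles here are the ones written by the feature-extraction script further below,
# and load_train_data is assumed to return (texts, labels).
if __name__ == '__main__':
    _, train_labels = load_train_data('Sentiment140')
    transformed_train = load_pickle('./data/transformed_data/transformed_train.p')
    transformed_test = load_pickle('./data/transformed_data/transformed_test.p')
    predict_labels = gNB(transformed_train, train_labels, transformed_test, save_result=True)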
def build_ori_anew_vectors(words):
    filename = './tmp/anew_vectors_glove.p'
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings(arg='zh_tw',
                            filename=r"D:\Word_Embeddings\English\glove.6B\glove.6B.300d.txt")
    vecs = []
    for w in words:
        vecs.append(model[w])
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
def build_ori_anew_vectors(words):
    filename = './tmp/anew_vectors.p'
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings('google_news',
                            r'D:\Word_Embeddings\English\GoogleNews-vectors-negative300.bin')
    vecs = []
    for w in words:
        vecs.append(model[w])
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
def build_amended_vectors(arg='word2vec'):
    # prefix of the pickled vector files: no prefix for word2vec, 'GloVe_' for GloVe
    prefix = '' if arg == 'word2vec' else 'GloVe_'
    pos_vectors = load_pickle('./tmp/' + prefix + 'common_positive_words.p')
    neg_vectors = load_pickle('./tmp/' + prefix + 'common_negative_words.p')
    size = len(pos_vectors[list(pos_vectors.keys())[0]])
    print('The dimension of word vectors: %s.' % size)
    for k in pos_vectors:
        pos_vectors[k] = np.array(pos_vectors[k]).reshape((1, size))
    for k in neg_vectors:
        neg_vectors[k] = np.array(neg_vectors[k]).reshape((1, size))
    amended_pos, amended_neg = amend(pos_vectors, neg_vectors)
    dump_picle(amended_pos, './tmp/amended_' + prefix + 'pos.p')
    dump_picle(amended_neg, './tmp/amended_' + prefix + 'neg.p')
def build_ori_anew_vectors(words):
    filename = './tmp/anew_vectors_retrofitted_glove.p'
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings(arg='zh_tw',
                            filename=r"D:\Word_Embeddings\English\glove.6B\GloVe_out_vec_file.txt")
    vecs = []
    for w in words:
        vecs.append(model[w])
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
def common_words(word_vectors, words_list):
    ####################################
    # for word2vec model:
    # full_words = word_vectors.vocab.keys()
    # for GloVe model:
    full_words = word_vectors.keys()
    ####################################
    same_words = set(words_list).intersection(full_words)
    print('Total Number: %s, same word number: %s.' % (len(words_list), len(same_words)))
    vector_dict = dict()
    for w in same_words:
        vector_dict[w] = word_vectors[w]
    dump_picle(vector_dict, './tmp/GloVe_common_negative_words.p')
def build_amended_anew_vectors(words):
    filename = './tmp/amended_anew_vectors_glove.p'
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings(arg='zh_tw',
                            filename=r"D:\Word_Embeddings\English\glove.6B\glove.6B.300d.txt")
    amended_pos = load_pickle('./tmp/amended_GloVe_pos.p')
    amended_neg = load_pickle('./tmp/amended_GloVe_neg.p')
    vecs = []
    for w in words:
        vec = amended_neg.get(w)
        if vec is None:
            vec = amended_pos.get(w)
        if vec is None:
            vec = model[w]
        vecs.append(vec)
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
def build_amended_anew_vectors(words):
    filename = './tmp/retrofitted_anew_vectors_word2vec.p'
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings(arg='zh_tw',
                            filename=r"D:\Word_Embeddings\English\word2vec_out_vec_file.txt")
    amended_pos = load_pickle('./tmp/retrofitted_word2vec_pos.p')
    amended_neg = load_pickle('./tmp/retrofitted_word2vec_neg.p')
    vecs = []
    for w in words:
        vec = amended_neg.get(w)
        if vec is None:
            vec = amended_pos.get(w)
        if vec is None:
            vec = model[w]
        vecs.append(vec)
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
def build_amended_anew_vectors(words):
    filename = './tmp/amended_anew_vectors.p'
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings('google_news', '/home/hs/Data/Word_Embeddings/google_news.bin')
    amended_pos = load_pickle('./tmp/amended_pos.p')
    amended_neg = load_pickle('./tmp/amended_neg.p')
    vecs = []
    for w in words:
        vec = amended_neg.get(w)
        if vec is None:
            vec = amended_pos.get(w)
        if vec is None:
            vec = model[w]
        vecs.append(vec)
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
def build_keras_input_amended():
    filename_data, filename_w = './tmp/amended_indexed_data.p', './tmp/amended_Weight.p'

    if os.path.isfile(filename_data) and os.path.isfile(filename_w):
        data = load_pickle(filename_data)
        W = load_pickle(filename_w)
        print('Load OK.')
        return (data, W)

    # load data from pickle
    (x_train, y_train_valence, y_train_labels,
     x_test, y_test_valence, y_test_labels,
     x_valid, y_valid_valence, y_valid_labels,
     x_train_polarity, y_train_polarity,
     x_test_polarity, y_test_polarity,
     x_valid_polarity, y_valid_polarity) = load_sst(path='./resources/stanfordSentimentTreebank/')

    vocab = get_vocab(x_train)
    # word_vecs = load_embeddings('google_news', '/home/hs/Data/Word_Embeddings/google_news.bin')
    # word_vecs = load_embeddings('glove')
    # load amended word vectors
    word_vecs = load_embeddings('amended_word2vec')

    word_vecs = add_unknown_words(word_vecs, vocab)
    W, word_idx_map = build_embedding_matrix(word_vecs, vocab)

    x_train_idx_data = make_idx_data(x_train, word_idx_map)
    x_test_idx_data = make_idx_data(x_test, word_idx_map)
    x_valid_idx_data = make_idx_data(x_valid, word_idx_map)
    x_train_polarity_idx_data = make_idx_data(x_train_polarity, word_idx_map)
    x_test_polarity_idx_data = make_idx_data(x_test_polarity, word_idx_map)
    x_valid_polarity_idx_data = make_idx_data(x_valid_polarity, word_idx_map)

    data = (x_train_idx_data, y_train_valence, y_train_labels,
            x_test_idx_data, y_test_valence, y_test_labels,
            x_valid_idx_data, y_valid_valence, y_valid_labels,
            x_train_polarity_idx_data, y_train_polarity,
            x_test_polarity_idx_data, y_test_polarity,
            x_valid_polarity_idx_data, y_valid_polarity)

    dump_picle(data, filename_data)
    dump_picle(W, filename_w)
    return (data, W)
def build_keras_input(texts, scores, test, new=True):
    dims = 300

    # texts, scores are dict type, key: train, dev, devtest.
    keys = ["train", "dev", "devtest"]
    train, train_scores = texts[keys[0]], scores[keys[0]]
    dev, dev_scores = texts[keys[1]], scores[keys[1]]
    devtest, devtest_scores = texts[keys[2]], scores[keys[2]]

    filename_data, filename_w = './tmp/indexed_data.p', './tmp/Weight.p'
    test_filename = './tmp/test_data.p'

    if os.path.isfile(filename_data) and os.path.isfile(filename_w) and new == False:
        data = load_pickle(filename_data)
        W = load_pickle(filename_w)
        test_data = load_pickle(test_filename)
        print('Use existing data. Load OK.')
        return (data, W, test_data)

    print("Construct new data.")
    # load data from pickle
    vocab = get_vocab(train)

    # using word2vec vectors
    # word_vecs = load_embeddings('google_news', '/home/hs/Data/Word_Embeddings/google_news.bin')
    # word_vecs = load_embeddings('D:/Word_Embeddings/glove.840B.300d.txt.w2v')
    word_vecs = load_embeddings('/home/hs/Data/Word_Embeddings/glove.840B.300d.txt.w2v')
    # word_vecs = load_embeddings('/home/hs/Data/Word_Embeddings/word2vec_twitter_model/word2vec_twitter_model.bin',
    #                             binary=True)

    word_vecs = add_unknown_words(word_vecs, vocab, k=dims)
    W, word_idx_map = build_embedding_matrix(word_vecs, vocab, k=dims)

    idx_data_train = make_idx_data(train, word_idx_map)
    idx_data_dev = make_idx_data(dev, word_idx_map)
    idx_data_devtest = make_idx_data(devtest, word_idx_map)
    idx_data_test = make_idx_data(test[2], word_idx_map)

    data = (idx_data_train, idx_data_dev, idx_data_devtest, train_scores, dev_scores, devtest_scores)
    test_data = (test[0], test[1], idx_data_test)

    dump_picle(data, filename_data)
    dump_picle(W, filename_w)
    dump_picle(test_data, test_filename)
    print("Saved: data and W are saved into: %s, and %s." % (filename_data, filename_w))
    return (data, W, test_data)
def load_sst(path=None, level=None):
    filename = './tmp/SST.p'
    if os.path.isfile(filename):
        print('Load OK.')
        return load_pickle(filename)

    def cleanStr(string):
        # keep only alphanumerics and basic punctuation, collapse repeated whitespace
        string = re.sub(r'[^A-Za-z0-9(),!?\'\`]', ' ', string)
        string = re.sub(r'\s{2,}', ' ', string)
        # repair double-encoded UTF-8 sequences left in the raw SST files
        string = string.replace('á', 'á').replace('é', 'é').replace('ñ', 'ñ').replace('Â', '').replace('ï', 'ï')
        string = string.replace('ü', 'ü').replace('â', 'â').replace('è', 'è').replace('ö', 'ö').replace('æ', 'æ')
        string = string.replace('ó', 'ó').replace('û', 'û').replace('ô', 'ô').replace('ã', 'ã').replace('ç', 'ç')
        string = string.replace('à ', 'à ').replace('Ã', 'í').replace('í', 'í')
        return string

    # sentiment label
    sentiment_file = open(path + 'sentiment_labels.txt', 'r')
    sentiment_label = {}
    n = 0
    for line in sentiment_file:
        lines = line.strip().split('|')
        if n > 0:
            sentiment_label[int(lines[0])] = float(lines[1])
        n += 1
    sentiment_file.close()

    # phrase dict
    dict_file = open(path + 'dictionary.txt', 'r')
    phrase_dict = {}
    for line in dict_file:
        # line = line.decode('utf-8')
        lines = line.strip().split('|')
        phrase_dict[lines[0]] = int(lines[1])
    dict_file.close()

    # sentence dict
    sentence_file = open(path + 'datasetSentences.txt', 'r')
    sentence_dict = {}
    n = 0
    for line in sentence_file:
        # line = line.decode('utf-8')
        line = line.replace('-LRB-', '(')
        line = line.replace('-RRB-', ')')
        lines = line.strip().split('\t')
        if n > 0:
            sentence_dict[int(lines[0])] = lines[1]
        n += 1
    sentence_file.close()

    # datasplit
    datasplit_file = open(path + 'datasetSplit.txt', 'r')
    split_dict = {}
    n = 0
    for line in datasplit_file:
        lines = line.strip().split(',')
        if n > 0:
            split_dict[int(lines[0])] = int(lines[1])
        n += 1
    datasplit_file.close()

    size = len(sentence_dict)  # size = 11855

    # for i in range(1000):
    #     senti = sentiment_label[phrase_dict[cleanStr(sentence_dict[i + 1])]]
    #     print(i, senti, cleanStr(sentence_dict[i + 1]))
    # exit()

    x_train, y_train_valence, y_train_labels = [], [], []
    x_test, y_test_valence, y_test_labels = [], [], []
    x_valid, y_valid_valence, y_valid_labels = [], [], []

    x_train_polarity, y_train_polarity = [], []
    x_test_polarity, y_test_polarity = [], []
    x_valid_polarity, y_valid_polarity = [], []

    for i in range(size):
        # print sentence_dict[i+1].encode('utf-8')
        sentence = cleanStr(sentence_dict[i + 1])
        senti = sentiment_label[phrase_dict[sentence]]
        # print(senti, sentence)

        labels, polarity = None, None
        if 0 <= senti <= 0.2:
            labels = 1
            polarity = 0
        if 0.2 < senti <= 0.4:
            labels = 2
            polarity = 0
        if 0.4 < senti <= 0.6:
            labels = 3
        if 0.6 < senti <= 0.8:
            labels = 4
            polarity = 1
        if 0.8 < senti <= 1:
            labels = 5
            polarity = 1
        if labels is None:
            raise Exception('Sentiment Error !')

        if split_dict[i + 1] == 1:
            x_train.append(sentence)
            y_train_valence.append(senti)
            y_train_labels.append(labels)
            if polarity is not None:
                x_train_polarity.append(sentence)
                y_train_polarity.append(polarity)
        elif split_dict[i + 1] == 2:
            x_test.append(sentence)
            y_test_valence.append(senti)
            y_test_labels.append(labels)
            if polarity is not None:
                x_test_polarity.append(sentence)
                y_test_polarity.append(polarity)
        else:
            x_valid.append(sentence)
            y_valid_valence.append(senti)
            y_valid_labels.append(labels)
            if polarity is not None:
                x_valid_polarity.append(sentence)
                y_valid_polarity.append(polarity)

    print("Fine-grained: #training: %s, #valid: %s, #test: %s" % (len(x_train), len(x_valid), len(x_test)))
    print("Binary classification: #train: %s, #valid: %s, #test: %s" % (
        len(x_train_polarity), len(x_valid_polarity), len(x_test_polarity)))

    # t = zip(x_train, y_train)
    # random.shuffle(t)
    # x_train, y_train = zip(*t)

    output = (x_train, y_train_valence, y_train_labels,
              x_test, y_test_valence, y_test_labels,
              x_valid, y_valid_valence, y_valid_labels,
              x_train_polarity, y_train_polarity,
              x_test_polarity, y_test_polarity,
              x_valid_polarity, y_valid_polarity)
    dump_picle(output, filename)
    print('Data saved and loaded successfully.')
    return output
# corpus, ratings = load_vader(['movie_reviews'])
# # name: tweets, movie_reviews, product_reviews, news_articles
lexicon_name = get_file_path('anew')
words, valences, _ = load_anew(lexicon_name)
corpus, ratings = screen_data(corpus, ratings, words)
ratings = np.array(ratings) + np.ones(len(ratings), dtype=float) * 5
print(len(corpus), len(corpus))

vocab = get_vocab(corpus)
# dump_picle(vocab, './data/corpus/vader/vocab_moview_reviews.p')
# vocab = load_pickle('./data/corpus/vader/vocab_moview_reviews.p')
print(len(vocab))

W, word_idx_map = build_embedding_matrix(load_embeddings('google_news'), vocab, k=300)
# dump_picle(word_idx_map, './data/corpus/vader/word_idx_map_movie_reviews.p')
# print('dump word_idx_map successful')
dump_picle(W, './data/corpus/vader/embedding_matrix_all.p')
print('dump embedding matrix file OK')
# word_idx_map = load_pickle('./data/corpus/vader/word_idx_map_movie_reviews.p')

idx_data = make_idx_data(corpus, word_idx_map, max_len=200, kernel_size=5)
dump_picle([idx_data, ratings], './data/corpus/vader/vader_processed_data_all.p')
print(idx_data[0])
print(ratings[0])

############################################## tweets ###############################################
# corpus, ratings = load_vader(['tweets', 'movie_reviews', 'product_reviews', 'news_articles'])
corpus, ratings = load_vader(['tweets'])
# # name: tweets, movie_reviews, product_reviews, news_articles
lexicon_name = get_file_path('anew')
words, valences, _ = load_anew(lexicon_name)
corpus, ratings = screen_data(corpus, ratings, words)
ratings = np.array(ratings) + np.ones(len(ratings), dtype=float) * 5
from load_data import load_CVAT_2

filename = './resources/CVAT (utf-8).csv'
texts, valence, arousal = load_CVAT_2(filename, categorical="all")
len_text = []

from CKIP_tokenizer import segsentence

out = []
for idx, i in enumerate(texts):
    # print(list(i))
    print(idx)
    out.append(" ".join(segsentence(i)))
    # len_text.append(len(.split()))

from save_data import dump_picle

dump_picle(out, "tokenized_texts_(newest3.31).p")
print("The tokenized text is saved.")
vectorizer_param = {'preprocessor': preprocessor, 'ngram_range': parameters['ngram_range'], 'analyzer': 'word',
                    'min_df': parameters['min_df'], 'max_df': parameters['max_df'],
                    'binary': parameters['TF_binary'], 'norm': parameters['norm'],
                    'sublinear_tf': parameters['sublinear_tf'], 'max_features': parameters['max_features']}

if __name__ == "__main__":
    unigram = StemmedTfidfVectorizer(**vectorizer_param)
    anew = anew_vectorizer()
    pct = punctuation_estimator()
    strength = strength_vectorizer()
    avg_strength = avg_affective_vectorizer()

    log_state('combine unigram and avg strength features')
    combined_features = FeatureUnion([('unigram', unigram), ('avg_strength', avg_strength)])
    # log_state('combine unigram and strength features')
    # combined_features = FeatureUnion([('unigram', unigram), ('strength', strength)])
    # log_state('combine unigram and anew features')
    # combined_features = FeatureUnion([('unigram', unigram), ('anew', anew)])
    # log_state('combine unigram and punctuation features')
    # combined_features = FeatureUnion([('unigram', unigram), ('pct', pct)])

    texts, _ = load_train_data('Sentiment140')
    transformed_train = combined_features.fit_transform(texts)
    testdata, _ = load_test_data()
    transformed_test = combined_features.transform(testdata)

    dump_picle(combined_features.get_feature_names(), './data/features/feature_names.p')
    dump_picle(transformed_train, "./data/transformed_data/transformed_train.p")
    dump_picle(transformed_test, "./data/transformed_data/transformed_test.p")
    neg_idx_data, neg_length = load_data(file_dir + 'neg/')
    print(neg_idx_data.shape, neg_length)
    data = np.concatenate((pos_idx_data, neg_idx_data), axis=0)
    print(data.shape)
    return data, pos_length, neg_length


if __name__ == '__main__':
    ########################################## config ########################################
    file_dir = 'E:/研究/Data/IMDB/aclImdb/train/' if os.name == 'nt' else '/home/hs/Data/imdb/aclImdb/train/'
    vec_dim = 300
    ##########################################################################################

    # get vocab and save to pickle
    vocab = get_vocab(file_dir)
    dump_picle(vocab, './data/tmp/vocab.p')
    print('OK')
    a = load_pickle('./data/tmp/vocab.p')
    for i in a:
        print(i)
    print(len(a))
    exit()
    # end

    # make word index map
    vocab = load_pickle('./data/tmp/vocab.p')
    W, word_idx_map = build_embedding_matrix(load_embeddings('google_news'), vocab, k=300)
    dump_picle(word_idx_map, get_file_path('word_idx_map'))
    print('dump word_idx_map successful')
    dump_picle(W, '/home/hs/Data/embedding_matrix.p')
    print('OK')
def keras_nn_input(word_vectors_model, amending):
    if word_vectors_model == 'word2vec':
        if amending == True:
            filename_data, filename_w = './tmp/amended_w2v_indexed_data.p', './tmp/amended_w2v_Weight.p'
        elif amending == False:
            filename_data, filename_w = './tmp/w2v_indexed_data.p', './tmp/w2v_Weight.p'
        else:
            raise Exception('Wrong!')
    elif word_vectors_model == 'GloVe':
        if amending == True:
            filename_data, filename_w = './tmp/amended_GloVe_indexed_data.p', './tmp/amended_GloVe_Weight.p'
        elif amending == False:
            filename_data, filename_w = './tmp/GloVe_indexed_data.p', './tmp/GloVe_Weight.p'
        else:
            raise Exception('Wrong!')
    elif word_vectors_model == 'retrofitted_GloVe':
        filename_data, filename_w = './tmp/retrofitted_GloVe_indexed_data.p', './tmp/retrofitted_GloVe_Weight.p'
    elif word_vectors_model == 'retrofitted_word2vec':
        filename_data, filename_w = './tmp/retrofitted_word2vec_indexed_data.p', './tmp/retrofitted_word2vec_Weight.p'
    else:
        raise Exception('Wrong parameter!')

    if os.path.isfile(filename_data) and os.path.isfile(filename_w):
        data = load_pickle(filename_data)
        W = load_pickle(filename_w)
        print('Load OK, parameters: word_vectors_model = %s, amending = %s' % (word_vectors_model, amending))
        return (data, W)

    # load data from pickle
    (x_train, y_train_valence, y_train_labels,
     x_test, y_test_valence, y_test_labels,
     x_valid, y_valid_valence, y_valid_labels,
     x_train_polarity, y_train_polarity,
     x_test_polarity, y_test_polarity,
     x_valid_polarity, y_valid_polarity) = load_sst(path='./resources/stanfordSentimentTreebank/')

    vocab = get_vocab(x_train)

    if word_vectors_model == 'word2vec':
        if amending == True:
            word_vecs = load_embeddings('amended_word2vec')
        elif amending == False:
            word_vecs = load_embeddings('google_news', '/home/hs/Data/Word_Embeddings/google_news.bin')
        else:
            raise Exception('Wrong!')
    elif word_vectors_model == 'GloVe':
        if amending == True:
            word_vecs = load_embeddings('amended_glove')
        elif amending == False:
            word_vecs = load_embeddings('glove')
        else:
            raise Exception('Wrong!')
    elif word_vectors_model == 'retrofitted_GloVe':
        word_vecs = load_embeddings('zh_tw', r'D:\Word_Embeddings\English\glove.6B\GloVe_out_vec_file.txt')
        # convert gensim model to dict type
        w2v = dict()
        for key in word_vecs.vocab.keys():
            w2v[key] = word_vecs[key]
        word_vecs = w2v
    elif word_vectors_model == 'retrofitted_word2vec':
        word_vecs = load_embeddings('zh_tw', r'D:\Word_Embeddings\English\word2vec_out_vec_file.txt')
        # convert gensim model to dict type
        w2v = dict()
        for key in word_vecs.vocab.keys():
            w2v[key] = word_vecs[key]
        word_vecs = w2v
    else:
        raise Exception('Wrong parameter!')

    word_vecs = add_unknown_words(word_vecs, vocab)
    W, word_idx_map = build_embedding_matrix(word_vecs, vocab)

    x_train_idx_data = make_idx_data(x_train, word_idx_map)
    x_test_idx_data = make_idx_data(x_test, word_idx_map)
    x_valid_idx_data = make_idx_data(x_valid, word_idx_map)
    x_train_polarity_idx_data = make_idx_data(x_train_polarity, word_idx_map)
    x_test_polarity_idx_data = make_idx_data(x_test_polarity, word_idx_map)
    x_valid_polarity_idx_data = make_idx_data(x_valid_polarity, word_idx_map)

    data = (x_train_idx_data, y_train_valence, y_train_labels,
            x_test_idx_data, y_test_valence, y_test_labels,
            x_valid_idx_data, y_valid_valence, y_valid_labels,
            x_train_polarity_idx_data, y_train_polarity,
            x_test_polarity_idx_data, y_test_polarity,
            x_valid_polarity_idx_data, y_valid_polarity)

    dump_picle(data, filename_data)
    dump_picle(W, filename_w)
    print('Load OK, parameters: word_vectors_model = %s, amending = %s' % (word_vectors_model, amending))
    return (data, W)
def convert(source_file):
    s = load_pickle(source_file)
    dump_picle(s, str(source_file)[:-2] + '_v2.7.p', protocol=2)
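# Illustrative usage (an assumption, not from the original file): convert() re-saves an existing
# pickle with protocol 2 so it can also be read from Python 2.7. The input path below is only an
# example taken from elsewhere in this repo.
if __name__ == '__main__':
    convert('./tmp/SST.p')  # would write './tmp/SST_v2.7.p'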
from vectorizers import TFIDF_estimator, anew_estimator

# class StemmedTfidfVectorizer(TfidfVectorizer):
#     def build_analyzer(self):
#         # use NLTK's SnowballStemmer to stem each token
#         english_stemmer = nltk.stem.SnowballStemmer('english')
#         analyzer = super(TfidfVectorizer, self).build_analyzer()
#         return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))
#
#
# vectorizer_param = {'preprocessor': preprocessor, 'ngram_range': parameters['ngram_range'], 'analyzer': 'word',
#                     'min_df': parameters['min_df'], 'max_df': parameters['max_df'],
#                     'binary': parameters['TF_binary'], 'norm': parameters['norm'],
#                     'sublinear_tf': parameters['sublinear_tf'], 'max_features': parameters['max_features']}

# log_state('Start generating features')
# log_state((sorted(list(vectorizer_param.items()))))
# log_state('Training data size: ' + str(parameters['test_data_size']))

if __name__ == "__main__":
    vectorizer = anew_estimator()
    train_type = 'Sentiment140'
    texts, _ = load_train_data(train_type)

    transformed_train = vectorizer.fit_transform(texts)
    testdata, _ = load_test_data()
    transformed_test = vectorizer.transform(testdata)

    dump_picle(vectorizer.get_feature_names(), './data/features/feature_names.p')
    dump_picle(transformed_train, "./data/transformed_data/transformed_train.p")
    dump_picle(transformed_test, "./data/transformed_data/transformed_test.p")
print(embedding_matrix[1])
print(idx_map['我們'])
print(len(word_vecs['我們']))
print(word_vecs['我們'].shape)
print(build_sentence_matrix(model=word_vecs, sententces=corpus[:2], dim=dim))

print('Result')
sentence_embedding_matrix = build_sentence_matrix(word_vecs, corpus, dim=dim)
print(sentence_embedding_matrix.shape)
print(sentence_embedding_matrix[3], valence[3], arousal[3])

from save_data import dump_picle

dump_picle((sentence_embedding_matrix, valence), get_file_path('CVAT_sentence_matrix_400'))
exit()

'''
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(sentence_embedding_matrix, valence,
                                                                     test_size=0.2, random_state=0)
print(X_train.shape)
print(len(Y_test))

maxlen = 200
size = 50

X_train = X_train.reshape(X_train.shape[0], 1, maxlen, size)
X_test = X_test.reshape(X_test.shape[0], 1, maxlen, size)
print(X_train.shape)

batch_size = 128
lexicon_name = get_file_path('anew')
logger.info(r"loading lexicon from: " + lexicon_name)
words, valences, _ = load_anew(lexicon_name)
corpus, ratings = screen_data(corpus, ratings, words)
ratings = np.array(ratings) + np.ones(len(ratings), dtype=float) * 5
print(len(corpus))
# for i in corpus[:100]:
#     print(i)

lexicon = dict()
for i, word in enumerate(words):
    lexicon[word] = valences[i]

mean_ratings, tf_means, tfidf_means, geos, tf_geos, tfidf_geos = calculate_ratings(corpus, ratings, lexicon)
dump_picle([mean_ratings, tf_means, tfidf_means, geos, tf_geos, tfidf_geos, ratings], './data/vader_out.p')
exit()

from collections import defaultdict

# idf = sp.log(float(len(D)) / (len([doc.split() for doc in D if t in doc.split()])))
# vocab = load_pickle('./data/corpus/vader/vocab_moview_reviews.p')
# length = len(vocab)

##################################### IDF ####################################################
# idf = dict()
# for i, word in enumerate(words):
#     denominator = sum(1 for doc in corpus if word in doc.split())
#     if denominator != 0:
#         idf[word] = sp.log(float(len(corpus)) / denominator)
#     if i % 50 == 0:
    for i in top_n_ind:
        print('Parameter setting: %s, acc: %s' % (str(list(grid)[i]), param_fitness[i]))


if __name__ == '__main__':
    result_analysis('./tmp/grid_search_result.p')
    print('OK')
    exit()

    scope = [0., 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    param_grid = {'a': scope, 'b': scope, 'c': scope}

    param_fitness = []
    grid = ParameterGrid(param_grid)
    for params in grid:
        print('calculating... parameter: %s' % str(params))
        score = my_function(params['a'], params['b'], params['c'])
        print('Score: %s' % score)
        param_fitness.append(score)
    print('grid search complete.')

    # return the best fitness value and its settings
    best_fitness = np.min(np.array(param_fitness))
    best_ind = int(np.where(np.array(param_fitness) == best_fitness)[0][0])  # first index with the best score
    print('best fitness: %s' % best_fitness)
    print('best setting: %s' % str(list(grid)[best_ind]))
    dump_picle((param_grid, param_fitness), './tmp/grid_search_result.p')
def get_vocab(corpus):
    vocab = defaultdict(float)
    for sent in corpus:
        for word in sent:
            vocab[word] += 1
    print(len(vocab))
    return vocab


# Note: this is the script that builds the CNN input data for CVAT
########################################## config ########################################
vec_dim = 400
##########################################################################################
corpus = load_corpus(get_file_path('cn_corpus'))
print(corpus[:2])
vocab = get_vocab(corpus)
dump_picle(vocab, get_file_path('CVAT_Vocab'))
print('Dump CVAT vocab OK')
# vocab = load_pickle(get_file_path('CVAT_Vocab'))
for i in vocab:
    print(i)
print(len(vocab))

W, word_idx_map = build_embedding_matrix(load_embeddings('zh_tw'), vocab, k=400)
dump_picle(word_idx_map, get_file_path('word_idx_map_CVAT'))
print('dump word_idx_map successful')
dump_picle(W, './data/tmp/embedding_matrix_CVAT.p')
print('OK')
# word_idx_map = load_pickle(get_file_path('word_idx_map_CVAT'))

mark = load_mark(get_file_path('mark'))
valence, arousal = gold_valence_arousal(corpus, mark)
    return [clean_str(sent) for sent in corpus]


vec_dim = 300

############################################## tweets ###############################################
# corpus, ratings = load_vader(['tweets', 'movie_reviews', 'product_reviews', 'news_articles'])
corpus, ratings = load_vader(['tweets'])
# # name: tweets, movie_reviews, product_reviews, news_articles
lexicon_name = get_file_path('anew')
words, valences, _ = load_anew(lexicon_name)
corpus, ratings = screen_data(corpus, ratings, words)
ratings = np.array(ratings) + np.ones(len(ratings), dtype=float) * 5
print(len(corpus), len(corpus))

vocab = get_vocab(corpus)
dump_picle(vocab, './data/corpus/vader/vocab_moview_tweets.p')
# vocab = load_pickle('./data/corpus/vader/vocab_moview_reviews.p')
print('Vocabulary size: %s' % str(len(vocab)))

W, word_idx_map = build_embedding_matrix(load_embeddings('google_news'), vocab, k=300)
dump_picle(word_idx_map, './data/corpus/vader/word_idx_map_tweets.p')
print('dump word_idx_map successful')
dump_picle(W, './data/corpus/vader/embedding_matrix_tweets.p')
print('dump embedding matrix file OK')
# word_idx_map = load_pickle('./data/corpus/vader/word_idx_map_movie_reviews.p')

idx_data = make_idx_data(corpus, word_idx_map, max_len=200, kernel_size=5)
dump_picle([idx_data, ratings], './data/corpus/vader/vader_processed_data_tweets.p')
print(idx_data[0])
print(ratings[0])
print('OK')