build_cnn_input_data_tweets.py
__author__ = 'NLP-PC'
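# Build CNN input data for the VADER tweets corpus: a vocabulary, a word-index
# map, an embedding matrix from pre-trained word vectors, and padded index
# sequences with their shifted ratings, all pickled for later training.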
from load_data import load_vader, load_anew, load_pickle, load_embeddings
import numpy as np
from collections import defaultdict
from imdb_processing import clean_str
from save_data import dump_picle
from word2vec_fn import build_embedding_matrix, make_idx_data
from affective_score_vader import screen_data
from file_name import get_file_path
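# Count raw term frequencies over the cleaned corpus to form the vocabulary.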
def get_vocab(corpus):
    vocab = defaultdict(float)
    for sent in corpus:
        for word in clean_str(sent).split():
            vocab[word] += 1
    print(len(vocab))
    return vocab
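# Clean every sentence with the same tokeniser used when building the vocabulary.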
def process(corpus):
    return [clean_str(sent) for sent in corpus]
vec_dim = 300
############################################## tweets ###############################################
# corpus, ratings = load_vader(['tweets', 'movie_reviews', 'product_reviews', 'news_articles'])
corpus, ratings = load_vader(['tweets'])
# available names: tweets, movie_reviews, product_reviews, news_articles
lexicon_name = get_file_path('anew')
words, valences, _ = load_anew(lexicon_name)
corpus, ratings = screen_data(corpus, ratings, words)
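# Shift the ratings by +5: VADER ratings lie in [-4, 4], so this presumably
# maps them onto a 1-9 scale (the same range as ANEW valence scores).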
ratings = np.array(ratings, dtype=float) + 5
print(len(corpus), len(ratings))
vocab = get_vocab(corpus)
dump_picle(vocab, './data/corpus/vader/vocab_moview_tweets.p')
# vocab = load_pickle('./data/corpus/vader/vocab_moview_reviews.p')
print('Vocabulary size: %s' % str(len(vocab)))
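# Map each vocabulary word to its 300-d Google News word2vec vector, producing
# the embedding matrix W and a word -> row-index map for lookup at train time.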
W, word_idx_map = build_embedding_matrix(load_embeddings('google_news'), vocab, k=vec_dim)
dump_picle(word_idx_map, './data/corpus/vader/word_idx_map_tweets.p')
print('dump word_idx_map successful')
dump_picle(W, './data/corpus/vader/embedding_matrix_tweets.p')
print('dump embedding matrix file OK')
# word_idx_map = load_pickle('./data/corpus/vader/word_idx_map_movie_reviews.p')
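# Convert each sentence into a fixed-length sequence of word indices
# (max_len=200); kernel_size=5 presumably sizes the zero-padding at the borders.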
idx_data = make_idx_data(corpus, word_idx_map, max_len=200, kernel_size=5)
dump_picle([idx_data, ratings], './data/corpus/vader/vader_processed_data_tweets.p')
print(idx_data[0])
print(ratings[0])
print('OK')