import codecs
import os

def load_data(file_dir):
    """Read every file in file_dir, clean it, and convert it to index data."""
    file_names = os.listdir(file_dir)
    data = []
    length = len(file_names)
    for file_name in file_names:
        # Join the file's lines into one string before cleaning.
        with codecs.open(os.path.join(file_dir, file_name), 'r', 'utf-8') as f:
            text = ' '.join(f.readlines())
        data.append(clean_str(text))
    # clean_str, make_idx_data, and word_idx_map are defined elsewhere in the project.
    idx_data = make_idx_data(data, word_idx_map, max_len=200, kernel_size=5)
    return idx_data, length
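make_idx_data is a project helper that is not shown on this page. A minimal sketch of what it plausibly does, assuming word_idx_map maps tokens to integer row indices and (kernel_size - 1) zeros of padding are added on each side so a convolution of width kernel_size can cover the whole sequence (the names and padding scheme here are assumptions, not the project's actual code):

import numpy as np

def make_idx_data(texts, word_idx_map, max_len=200, kernel_size=5):
    # Hypothetical sketch: turn each whitespace-tokenized text into a
    # fixed-length row of word indices; index 0 is reserved for padding.
    pad = kernel_size - 1
    rows = []
    for text in texts:
        row = [0] * pad
        for word in text.split()[:max_len]:
            if word in word_idx_map:
                row.append(word_idx_map[word])
        row += [0] * (max_len + 2 * pad - len(row))
        rows.append(row)
    return np.array(rows, dtype='int32')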
Example #3
print('Dump CVAT vocab OK')  # the vocab is built and dumped earlier in the script
# vocab = load_pickle(get_file_path('CVAT_Vocab'))
for i in vocab:  # print every vocabulary entry for inspection
    print(i)
print(len(vocab))

# Build a 400-dimensional embedding matrix for the vocabulary from the
# pretrained Traditional Chinese ('zh_tw') word vectors.
W, word_idx_map = build_embedding_matrix(load_embeddings('zh_tw'), vocab, k=400)
dump_picle(word_idx_map, get_file_path('word_idx_map_CVAT'))
print('dump word_idx_map successful')
dump_picle(W, './data/tmp/embedding_matrix_CVAT.p')
print('OK')

# word_idx_map = load_pickle(get_file_path('word_idx_map_CVAT'))
mark = load_mark(get_file_path('mark'))
valence, arousal = gold_valence_arousal(corpus, mark)  # corpus is built earlier in the script
idx_data = make_idx_data(corpus, word_idx_map, max_len=200, kernel_size=5)

dump_picle([idx_data, valence, arousal], get_file_path('CVAT_processed_data'))
# idx_data, valence, arousal = load_pickle(get_file_path('CVAT_processed_data'))
print(idx_data.shape)
exit()  # the script ends here; everything below is unreachable scratch code

word_vecs = load_embeddings('zh_tw')

dim = len(word_vecs['我們'])  # 400

# Note: unlike the calls above, no vocab argument is passed here.
embedding_matrix, idx_map = build_embedding_matrix(word_vecs, k=dim)
print(embedding_matrix[1])
print(idx_map['我們'])

print(len(word_vecs['我們']))
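build_embedding_matrix is another project helper not shown here. A plausible sketch, assuming it stacks the pretrained k-dimensional vector for each vocabulary word into a matrix W, with row 0 reserved for padding, and returns the word-to-row mapping (the random fallback for missing words is an assumption):

import numpy as np

def build_embedding_matrix(word_vecs, vocab, k=300):
    # Hypothetical sketch: W[0] stays all-zero for padding; each vocab word
    # gets its pretrained vector, or a small random one if it is missing.
    W = np.zeros((len(vocab) + 1, k), dtype='float32')
    word_idx_map = {}
    for i, word in enumerate(vocab, start=1):
        vec = word_vecs.get(word)
        W[i] = vec if vec is not None else np.random.uniform(-0.25, 0.25, k)
        word_idx_map[word] = i
    return W, word_idx_map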
Example #4
# Shift the raw scores into a positive range (roughly -4..4 becomes 1..9).
ratings = np.array(ratings) + np.ones(len(ratings), dtype=float) * 5
print(len(corpus), len(ratings))

vocab = get_vocab(corpus)
# dump_picle(vocab, './data/corpus/vader/vocab_moview_reviews.p')
# vocab = load_pickle('./data/corpus/vader/vocab_moview_reviews.p')
print(len(vocab))
# Build a 300-dimensional embedding matrix from the Google News vectors.
W, word_idx_map = build_embedding_matrix(load_embeddings('google_news'),
                                         vocab,
                                         k=300)
# dump_picle(word_idx_map, './data/corpus/vader/word_idx_map_movie_reviews.p')
# print('dump word_idx_map successful')
dump_picle(W, './data/corpus/vader/embedding_matrix_all.p')
print('dump embedding matrix file OK')
# word_idx_map = load_pickle('./data/corpus/vader/word_idx_map_movie_reviews.p')
idx_data = make_idx_data(corpus, word_idx_map, max_len=200, kernel_size=5)
dump_picle([idx_data, ratings],
           './data/corpus/vader/vader_processed_data_all.p')
print(idx_data[0])
print(ratings[0])

############################################## tweets ###############################################
# corpus, ratings = load_vader(['tweets', 'movie_reviews', 'product_reviews', 'news_articles'])
corpus, ratings = load_vader(['tweets'])
# available names: tweets, movie_reviews, product_reviews, news_articles
lexicon_name = get_file_path('anew')
words, valences, _ = load_anew(lexicon_name)
corpus, ratings = screen_data(corpus, ratings, words)
ratings = np.array(ratings) + np.ones(len(ratings), dtype=float) * 5  # same shift as above
print(len(corpus), len(ratings))
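screen_data is also project-specific; from its use here it appears to keep only the texts that contain at least one word from the ANEW lexicon, keeping corpus and ratings aligned. A minimal sketch under that assumption (the filtering rule is guessed from context):

def screen_data(corpus, ratings, lexicon_words):
    # Hypothetical sketch: drop any text that shares no token with the lexicon.
    word_set = set(lexicon_words)
    kept_texts, kept_ratings = [], []
    for text, rating in zip(corpus, ratings):
        tokens = text if isinstance(text, list) else text.split()
        if word_set.intersection(tokens):
            kept_texts.append(text)
            kept_ratings.append(rating)
    return kept_texts, kept_ratings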