Exemple #1
0
# 注意: 这个文件就是CVAT构造cnn输入数据的代码
########################################## config ########################################
vec_dim = 400
##########################################################################################
corpus = load_corpus(get_file_path('cn_corpus'))
print(corpus[:2])
vocab = get_vocab(corpus)
dump_picle(vocab, get_file_path('CVAT_Vocab'))
print('Dump CVAT vocab OK')
# vocab = load_pickle(get_file_path('CVAT_Vocab'))
for i in vocab:
    print(i)
print(len(vocab))

W, word_idx_map = build_embedding_matrix(load_embeddings('zh_tw'), vocab, k=400)
dump_picle(word_idx_map, get_file_path('word_idx_map_CVAT'))
print('dump word_idx_map successful')
dump_picle(W, './data/tmp/embedding_matrix_CVAT.p')
print('OK')

# word_idx_map = load_pickle(get_file_path('word_idx_map_CVAT'))
mark = load_mark(get_file_path('mark'))
valence, arousal = gold_valence_arousal(corpus, mark)
idx_data = make_idx_data(corpus, word_idx_map, max_len=200, kernel_size=5)

dump_picle([idx_data, valence, arousal], get_file_path('CVAT_processed_data'))
# idx_data, valence, arousal = load_pickle(get_file_path('CVAT_processed_data'))
print(idx_data.shape)
exit()
vec_dim = 300
############################################## all ###############################################
corpus, ratings = load_vader(['tweets', 'movie_reviews', 'product_reviews', 'news_articles'])
# corpus, ratings = load_vader(['movie_reviews'])
# # name: tweets, movie_reviews, product_reviews, news_articles
lexicon_name = get_file_path('anew')
words, valences, _ = load_anew(lexicon_name)
corpus, ratings = screen_data(corpus, ratings, words)
ratings = np.array(ratings) + np.ones(len(ratings), dtype=float) * 5
print(len(corpus), len(corpus))

vocab = get_vocab(corpus)
# dump_picle(vocab, './data/corpus/vader/vocab_moview_reviews.p')
# vocab = load_pickle('./data/corpus/vader/vocab_moview_reviews.p')
print(len(vocab))
W, word_idx_map = build_embedding_matrix(load_embeddings('google_news'), vocab, k=300)
# dump_picle(word_idx_map, './data/corpus/vader/word_idx_map_movie_reviews.p')
# print('dump word_idx_map successful')
dump_picle(W, './data/corpus/vader/embedding_matrix_all.p')
print('dump embedding matrix file OK')
c
idx_data = make_idx_data(corpus, word_idx_map, max_len=200, kernel_size=5)
dump_picle([idx_data, ratings], './data/corpus/vader/vader_processed_data_all.p')
print(idx_data[0])
print(ratings[0])

############################################## tweets ###############################################
# corpus, ratings = load_vader(['tweets', 'movie_reviews', 'product_reviews', 'news_articles'])
corpus, ratings = load_vader(['tweets'])
# # name: tweets, movie_reviews, product_reviews, news_articles
lexicon_name = get_file_path('anew')
Exemple #3
0
corpus, ratings = load_vader(
    ['tweets', 'movie_reviews', 'product_reviews', 'news_articles'])
# corpus, ratings = load_vader(['movie_reviews'])
# # name: tweets, movie_reviews, product_reviews, news_articles
lexicon_name = get_file_path('anew')
words, valences, _ = load_anew(lexicon_name)
corpus, ratings = screen_data(corpus, ratings, words)
ratings = np.array(ratings) + np.ones(len(ratings), dtype=float) * 5
print(len(corpus), len(corpus))

vocab = get_vocab(corpus)
# dump_picle(vocab, './data/corpus/vader/vocab_moview_reviews.p')
# vocab = load_pickle('./data/corpus/vader/vocab_moview_reviews.p')
print(len(vocab))
W, word_idx_map = build_embedding_matrix(load_embeddings('google_news'),
                                         vocab,
                                         k=300)
# dump_picle(word_idx_map, './data/corpus/vader/word_idx_map_movie_reviews.p')
# print('dump word_idx_map successful')
dump_picle(W, './data/corpus/vader/embedding_matrix_all.p')
print('dump embedding matrix file OK')
# word_idx_map = load_pickle('./data/corpus/vader/word_idx_map_movie_reviews.p')
idx_data = make_idx_data(corpus, word_idx_map, max_len=200, kernel_size=5)
dump_picle([idx_data, ratings],
           './data/corpus/vader/vader_processed_data_all.p')
print(idx_data[0])
print(ratings[0])

############################################## tweets ###############################################
# corpus, ratings = load_vader(['tweets', 'movie_reviews', 'product_reviews', 'news_articles'])
corpus, ratings = load_vader(['tweets'])
word_idx_map = load_pickle(get_file_path('word_idx_map_CVAT'))
mark = load_mark(get_file_path('mark'))
valence, arousal = gold_valence_arousal(corpus, mark)
idx_data = make_idx_data(corpus, word_idx_map, max_len=200, kernel_size=5)

# dump_picle([idx_data, valence, arousal], get_file_path('CVAT_processed_data'))
idx_data, valence, arousal = load_pickle(get_file_path('CVAT_processed_data'))
print(idx_data.shape)
exit()

word_vecs = load_embeddings('zh_tw')

dim = len(word_vecs['我們'])  # 400

embedding_matrix, idx_map = build_embedding_matrix(word_vecs, k=dim)
print(embedding_matrix[1])
print(idx_map['我們'])

print(len(word_vecs['我們']))
print(word_vecs['我們'].shape)

print(build_sentence_matrix(model=word_vecs, sententces=corpus[:2], dim=dim))

print('Result')
sentence_embedding_matrix = build_sentence_matrix(word_vecs, corpus, dim=dim)
print(sentence_embedding_matrix.shape)
print(sentence_embedding_matrix[3], valence[3], arousal[3])

from save_data import dump_picle
Exemple #5
0
# 注意: 这个文件就是CVAT构造cnn输入数据的代码
########################################## config ########################################
vec_dim = 400
##########################################################################################
corpus = load_corpus(get_file_path('cn_corpus'))
print(corpus[:2])
vocab = get_vocab(corpus)
dump_picle(vocab, get_file_path('CVAT_Vocab'))
print('Dump CVAT vocab OK')
# vocab = load_pickle(get_file_path('CVAT_Vocab'))
for i in vocab:
    print(i)
print(len(vocab))

W, word_idx_map = build_embedding_matrix(load_embeddings('zh_tw'),
                                         vocab,
                                         k=400)
dump_picle(word_idx_map, get_file_path('word_idx_map_CVAT'))
print('dump word_idx_map successful')
dump_picle(W, './data/tmp/embedding_matrix_CVAT.p')
print('OK')

# word_idx_map = load_pickle(get_file_path('word_idx_map_CVAT'))
mark = load_mark(get_file_path('mark'))
valence, arousal = gold_valence_arousal(corpus, mark)
idx_data = make_idx_data(corpus, word_idx_map, max_len=200, kernel_size=5)

dump_picle([idx_data, valence, arousal], get_file_path('CVAT_processed_data'))
# idx_data, valence, arousal = load_pickle(get_file_path('CVAT_processed_data'))
print(idx_data.shape)
exit()
word_idx_map = load_pickle(get_file_path('word_idx_map_CVAT'))
mark = load_mark(get_file_path('mark'))
valence, arousal = gold_valence_arousal(corpus, mark)
idx_data = make_idx_data(corpus, word_idx_map, max_len=200, kernel_size=5)

# dump_picle([idx_data, valence, arousal], get_file_path('CVAT_processed_data'))
idx_data, valence, arousal = load_pickle(get_file_path('CVAT_processed_data'))
print(idx_data.shape)
exit()

word_vecs = load_embeddings('zh_tw')

dim = len(word_vecs['我們'])  # 400

embedding_matrix, idx_map = build_embedding_matrix(word_vecs, k=dim)
print(embedding_matrix[1])
print(idx_map['我們'])

print(len(word_vecs['我們']))
print(word_vecs['我們'].shape)

print(build_sentence_matrix(model=word_vecs, sententces=corpus[:2], dim=dim))

print('Result')
sentence_embedding_matrix = build_sentence_matrix(word_vecs, corpus, dim=dim)
print(sentence_embedding_matrix.shape)
print(sentence_embedding_matrix[3], valence[3], arousal[3])

from save_data import dump_picle