# NOTE: this file builds the CNN input data for the CVAT corpus.

########################################## config ########################################
vec_dim = 400
##########################################################################################

# Load the Chinese corpus and show a small sample for a sanity check.
corpus = load_corpus(get_file_path('cn_corpus'))
print(corpus[:2])

# Build and persist the vocabulary.
vocab = get_vocab(corpus)
dump_picle(vocab, get_file_path('CVAT_Vocab'))
print('Dump CVAT vocab OK')
# vocab = load_pickle(get_file_path('CVAT_Vocab'))
for i in vocab:
    print(i)
print(len(vocab))

# Build the embedding matrix (k=400 matches vec_dim above) and persist both
# the matrix and the word -> row-index mapping.
W, word_idx_map = build_embedding_matrix(load_embeddings('zh_tw'), vocab, k=400)
dump_picle(word_idx_map, get_file_path('word_idx_map_CVAT'))
print('dump word_idx_map successful')
dump_picle(W, './data/tmp/embedding_matrix_CVAT.p')
print('OK')
# word_idx_map = load_pickle(get_file_path('word_idx_map_CVAT'))

# Pair each text with its gold valence/arousal annotations, convert texts to
# padded index sequences, and persist the processed dataset.
mark = load_mark(get_file_path('mark'))
valence, arousal = gold_valence_arousal(corpus, mark)
idx_data = make_idx_data(corpus, word_idx_map, max_len=200, kernel_size=5)
dump_picle([idx_data, valence, arousal], get_file_path('CVAT_processed_data'))
# idx_data, valence, arousal = load_pickle(get_file_path('CVAT_processed_data'))
print(idx_data.shape)
exit()
vec_dim = 300

############################################## all ###############################################
# Build CNN input data over the combined VADER sub-corpora.
corpus, ratings = load_vader(['tweets', 'movie_reviews', 'product_reviews', 'news_articles'])
# corpus, ratings = load_vader(['movie_reviews'])
# # name: tweets, movie_reviews, product_reviews, news_articles

# Keep only texts covered by the ANEW lexicon, then shift ratings from the
# [-5, 5] VADER scale onto [0, 10] by adding 5 to every entry.
lexicon_name = get_file_path('anew')
words, valences, _ = load_anew(lexicon_name)
corpus, ratings = screen_data(corpus, ratings, words)
ratings = np.array(ratings) + np.ones(len(ratings), dtype=float) * 5
# BUGFIX: was print(len(corpus), len(corpus)) — the second value was meant to
# be the ratings count (corpus and ratings must stay aligned).
print(len(corpus), len(ratings))

vocab = get_vocab(corpus)
# dump_picle(vocab, './data/corpus/vader/vocab_moview_reviews.p')
# vocab = load_pickle('./data/corpus/vader/vocab_moview_reviews.p')
print(len(vocab))

# Embedding matrix from pre-trained Google News vectors (k=300 matches vec_dim).
W, word_idx_map = build_embedding_matrix(load_embeddings('google_news'), vocab, k=300)
# dump_picle(word_idx_map, './data/corpus/vader/word_idx_map_movie_reviews.p')
# print('dump word_idx_map successful')
dump_picle(W, './data/corpus/vader/embedding_matrix_all.p')
print('dump embedding matrix file OK')

# BUGFIX: a stray bare token `c` sat between the print above and the call
# below, which is a SyntaxError; it has been removed.
idx_data = make_idx_data(corpus, word_idx_map, max_len=200, kernel_size=5)
dump_picle([idx_data, ratings], './data/corpus/vader/vader_processed_data_all.p')
print(idx_data[0])
print(ratings[0])

############################################## tweets ###############################################
# corpus, ratings = load_vader(['tweets', 'movie_reviews', 'product_reviews', 'news_articles'])
corpus, ratings = load_vader(['tweets'])
# # name: tweets, movie_reviews, product_reviews, news_articles
lexicon_name = get_file_path('anew')
# Build CNN input data over the combined VADER sub-corpora.
corpus, ratings = load_vader(['tweets', 'movie_reviews', 'product_reviews', 'news_articles'])
# corpus, ratings = load_vader(['movie_reviews'])
# # name: tweets, movie_reviews, product_reviews, news_articles

# Keep only texts covered by the ANEW lexicon, then shift ratings from the
# [-5, 5] VADER scale onto [0, 10] by adding 5 to every entry.
lexicon_name = get_file_path('anew')
words, valences, _ = load_anew(lexicon_name)
corpus, ratings = screen_data(corpus, ratings, words)
ratings = np.array(ratings) + np.ones(len(ratings), dtype=float) * 5
# BUGFIX: was print(len(corpus), len(corpus)) — the second value was meant to
# be the ratings count (corpus and ratings must stay aligned).
print(len(corpus), len(ratings))

vocab = get_vocab(corpus)
# dump_picle(vocab, './data/corpus/vader/vocab_moview_reviews.p')
# vocab = load_pickle('./data/corpus/vader/vocab_moview_reviews.p')
print(len(vocab))

# Embedding matrix from pre-trained Google News vectors.
W, word_idx_map = build_embedding_matrix(load_embeddings('google_news'), vocab, k=300)
# dump_picle(word_idx_map, './data/corpus/vader/word_idx_map_movie_reviews.p')
# print('dump word_idx_map successful')
dump_picle(W, './data/corpus/vader/embedding_matrix_all.p')
print('dump embedding matrix file OK')
# word_idx_map = load_pickle('./data/corpus/vader/word_idx_map_movie_reviews.p')

# Convert texts to padded index sequences and persist alongside ratings.
idx_data = make_idx_data(corpus, word_idx_map, max_len=200, kernel_size=5)
dump_picle([idx_data, ratings], './data/corpus/vader/vader_processed_data_all.p')
print(idx_data[0])
print(ratings[0])

############################################## tweets ###############################################
# corpus, ratings = load_vader(['tweets', 'movie_reviews', 'product_reviews', 'news_articles'])
corpus, ratings = load_vader(['tweets'])
# Reload the previously dumped CVAT artifacts and inspect them.
word_idx_map = load_pickle(get_file_path('word_idx_map_CVAT'))
mark = load_mark(get_file_path('mark'))
valence, arousal = gold_valence_arousal(corpus, mark)
idx_data = make_idx_data(corpus, word_idx_map, max_len=200, kernel_size=5)
# dump_picle([idx_data, valence, arousal], get_file_path('CVAT_processed_data'))
idx_data, valence, arousal = load_pickle(get_file_path('CVAT_processed_data'))
print(idx_data.shape)
exit()

# --- dead code below: unreachable after exit(), kept as-is for reference ---
word_vecs = load_embeddings('zh_tw')
# Probe the embedding dimensionality via a known word (expected to be 400).
dim = len(word_vecs['我們'])
embedding_matrix, idx_map = build_embedding_matrix(word_vecs, k=dim)
print(embedding_matrix[1])
print(idx_map['我們'])
print(len(word_vecs['我們']))
print(word_vecs['我們'].shape)
# NOTE(review): 'sententces' is presumably the callee's (misspelled) keyword
# parameter name — preserved verbatim; confirm against build_sentence_matrix.
print(build_sentence_matrix(model=word_vecs, sententces=corpus[:2], dim=dim))
print('Result')
sentence_embedding_matrix = build_sentence_matrix(word_vecs, corpus, dim=dim)
print(sentence_embedding_matrix.shape)
print(sentence_embedding_matrix[3], valence[3], arousal[3])
from save_data import dump_picle