corpus_name = get_file_path('cn_corpus')
logger.info(r"loading corpus from : " + corpus_name)
lexicon_name = get_file_path('lexicon')
logger.info(r"loading lexicon from : " + lexicon_name)
expand_name = get_file_path('neural_cand')
logger.info(r"loading expand_word from : " + expand_name)
mark_name = get_file_path('mark')
logger.info(r"loading mark from : " + mark_name)

corpus = load_corpus(corpus_name)
lexicon = load_lexicon(lexicon_name)
mark = load_mark(mark_name)

# log_state('use extend lexicon')
lexicon = combine_lexicon(lexicon_name, expand_name)

log_state('mean')
evaluate_mean(corpus, lexicon, mark)
log_state('tf_mean')
evaluate_tf_mean(corpus, lexicon, mark)
log_state('tfidf_mean')
evaluate_tfidf_mean(corpus, lexicon, mark)
log_state('geo')
evaluate_geo(corpus, lexicon, mark)
log_state('tfidf_geo')
evaluate_tfidf_geo(corpus, lexicon, mark)
log_state('tf_geo')
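# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the project code): how the 'mean' and 'geo'
# fusion strategies evaluated above can be computed for a single tokenized
# document, assuming the lexicon maps each word to a positive valence score.
# The helper name `predict_valence` and the data layout are hypothetical.
# ---------------------------------------------------------------------------
import math


def predict_valence(tokens, lexicon, strategy='mean'):
    """Fuse per-word lexicon scores into one document-level score."""
    scores = [lexicon[w] for w in tokens if w in lexicon]
    if not scores:
        return None  # no lexicon coverage for this document
    if strategy == 'mean':  # arithmetic mean of matched word scores
        return sum(scores) / len(scores)
    if strategy == 'geo':   # geometric mean of matched word scores
        return math.exp(sum(math.log(s) for s in scores) / len(scores))
    raise ValueError('unknown strategy: %s' % strategy)

# e.g. predict_valence(['快樂', '電影'], {'快樂': 7.5, '電影': 5.0}, strategy='geo')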
vocab = get_vocab(corpus)
dump_picle(vocab, get_file_path('CVAT_Vocab'))
print('Dump CVAT vocab OK')
# vocab = load_pickle(get_file_path('CVAT_Vocab'))
for i in vocab:
    print(i)
print(len(vocab))

W, word_idx_map = build_embedding_matrix(load_embeddings('zh_tw'), vocab, k=400)
dump_picle(word_idx_map, get_file_path('word_idx_map_CVAT'))
print('dump word_idx_map successful')
dump_picle(W, './data/tmp/embedding_matrix_CVAT.p')
print('OK')

# word_idx_map = load_pickle(get_file_path('word_idx_map_CVAT'))
mark = load_mark(get_file_path('mark'))
valence, arousal = gold_valence_arousal(corpus, mark)
idx_data = make_idx_data(corpus, word_idx_map, max_len=200, kernel_size=5)
dump_picle([idx_data, valence, arousal], get_file_path('CVAT_processed_data'))
# idx_data, valence, arousal = load_pickle(get_file_path('CVAT_processed_data'))
print(idx_data.shape)
exit()

word_vecs = load_embeddings('zh_tw')
dim = len(word_vecs['我們'])  # 400
embedding_matrix, idx_map = build_embedding_matrix(word_vecs, k=dim)
print(embedding_matrix[1])
print(idx_map['我們'])
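# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the project code): a plausible reading of
# what `make_idx_data(corpus, word_idx_map, max_len=200, kernel_size=5)`
# produces for a Kim-style CNN -- each document becomes a fixed-width row of
# indices into the embedding matrix W, zero-padded so a convolution kernel of
# width `kernel_size` can slide over the borders. The project's own helper
# may differ in its details.
# ---------------------------------------------------------------------------
import numpy as np


def make_idx_sequence(tokens, word_idx_map, max_len=200, kernel_size=5):
    pad = kernel_size - 1
    idx = [0] * pad                                  # leading padding for the kernel
    idx += [word_idx_map.get(w, 0) for w in tokens[:max_len]]
    idx += [0] * (max_len + 2 * pad - len(idx))      # trailing padding to fixed width
    return np.array(idx, dtype='int32')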
def cv(data, target, multivariant=False):
    X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(
        data, target, test_size=0.1, random_state=10)
    if multivariant is False:
        linear_regression(X_train, X_test, Y_train, Y_test, plot=False)
    else:
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='ordinary_least_squares')
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='Ridge_Regression')
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='Bayesian_Regression')
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='SVR')
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='KNN_Reg')


if __name__ == '__main__':
    normalize = True
    corpus = load_corpus(get_file_path('cn_corpus'))
    # lexicon = load_lexicon(get_file_path('lexicon'))
    mark = load_mark(get_file_path('mark'))
    lexicon = combine_lexicon(get_file_path('lexicon'), get_file_path('neural_cand'))

    # The following can be used to check which words appear in both the corpus and the lexicon:
    # from visualization import show_common_term
    # show_common_term(corpus, lexicon)
    # exit()

    valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion(corpus, lexicon, mark)
    print('start.....')
    cv(valence_mean, valence_true, multivariant=False)
    cv(arousal_mean, arousal_true, multivariant=False)
    print('OK')

    valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_sqr(corpus, lexicon, mark)
    print('start.....')
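# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the project code): one way the
# `linear_regression` baseline called by cv() could be realised with
# scikit-learn, treating the fused score as a single feature. Note that
# `sklearn.cross_validation` is the pre-0.20 module name; newer releases ship
# the same train_test_split in `sklearn.model_selection`.
# ---------------------------------------------------------------------------
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


def simple_linear_regression(X_train, X_test, Y_train, Y_test):
    X_train = np.asarray(X_train, dtype=float).reshape(-1, 1)
    X_test = np.asarray(X_test, dtype=float).reshape(-1, 1)
    model = LinearRegression().fit(X_train, Y_train)
    pred = model.predict(X_test)
    rmse = mean_squared_error(Y_test, pred) ** 0.5
    r = np.corrcoef(pred, np.asarray(Y_test, dtype=float))[0, 1]  # Pearson correlation
    return rmse, r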
from positive_negative_split import get_pos_neg_va


def cv(data, target, multivariant=False):
    X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(
        data, target, test_size=0.2, random_state=0)
    if multivariant is False:
        linear_regression(X_train, X_test, Y_train, Y_test, plot=False)
    else:
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='Ridge_Regression')


normalize = True
corpus = load_corpus(get_file_path('cn_corpus'))
lexicon = load_lexicon(get_file_path('normalized_onezero_lexicon'))
mark = load_mark(get_file_path('normalized_onezero_mark'))

# The following can be used to check which words appear in both the corpus and the lexicon:
# from visualization import show_common_term
# show_common_term(corpus, lexicon)
# exit()

# valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion(corpus, lexicon, mark)
# valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_sqr(corpus, lexicon, mark)
# valence_mean, valence_true, arousal_mean, arousal_true = nonlinear_max_fusion(corpus, lexicon, mark)
# valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_tf(corpus, lexicon, mark)
# valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_tfidf(corpus, lexicon, mark)
valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_geo(corpus, lexicon, mark)
# valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_geo_tf(corpus, lexicon, mark)
# valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_geo_tfidf(corpus, lexicon, mark)
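# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the project code): the scripts above rate
# each fusion strategy with a single train/test split; a k-fold estimate over
# the fused scores would look roughly like this (using the newer
# `sklearn.model_selection` module rather than `sklearn.cross_validation`).
# ---------------------------------------------------------------------------
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score


def cv_kfold(data, target, n_splits=10):
    X = np.asarray(data, dtype=float).reshape(-1, 1)  # one fused-score feature
    y = np.asarray(target, dtype=float)
    mse = -cross_val_score(LinearRegression(), X, y, cv=n_splits,
                           scoring='neg_mean_squared_error').mean()
    return float(np.sqrt(mse))  # root of the fold-averaged MSE

# e.g. cv_kfold(valence_mean, valence_true)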