def cv(data, target, multivariant=False):
    """Hold out 20% of the data and fit a linear-regression model on the rest.

    Args:
        data: feature matrix accepted by sklearn's ``train_test_split``.
        target: regression targets aligned row-for-row with ``data``.
        multivariant: when True, fit the multivariant ridge-regression
            variant instead of the plain single-variable regression.
    """
    X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(
        data, target, test_size=0.2, random_state=0)
    # PEP 8: test truthiness directly rather than comparing to False with `is`.
    if not multivariant:
        linear_regression(X_train, X_test, Y_train, Y_test, plot=False)
    else:
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test,
                                       cost_fun='Ridge_Regression')


normalize = True
corpus = load_corpus(get_file_path('cn_corpus'))
lexicon = load_lexicon(get_file_path('normalized_onezero_lexicon'))
mark = load_mark(get_file_path('normalized_onezero_mark'))

# # the following could use to check the same words in corpus and lexicon
# from visualization import show_common_term
# show_common_term(corpus, lexicon)
# exit()

# Alternative fusion strategies, kept for experimentation:
# valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion(corpus, lexicon, mark)
# valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_sqr(corpus, lexicon, mark)
# valence_mean, valence_true, arousal_mean, arousal_true = nonlinear_max_fusion(corpus, lexicon, mark)
# valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_tf(corpus, lexicon, mark)
# valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_tfidf(corpus, lexicon, mark)
valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_geo(
    corpus, lexicon, mark)
# valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_geo_tf(corpus, lexicon, mark)
# Log the invocation command line. Bug fix: the original used ''.join(sys.argv),
# which concatenates the script name and every argument with no separator;
# ' '.join reproduces the actual command line.
logger.info(r"running %s" % ' '.join(sys.argv))

corpus_name = get_file_path('cn_corpus')
logger.info(r"loading corpus from : " + corpus_name)
lexicon_name = get_file_path('lexicon')
# Bug fix: log message said "form" instead of "from".
logger.info(r"loading lexicon from : " + lexicon_name)
expand_name = get_file_path('neural_cand')
logger.info(r"loading expand_word from : " + expand_name)
mark_name = get_file_path('mark')
logger.info(r"loading mark from : " + mark_name)

corpus = load_corpus(corpus_name)
lexicon = load_lexicon(lexicon_name)
mark = load_mark(mark_name)

# log_state('use extend lexicon')
# Merge the expansion candidates into the base lexicon before evaluating.
lexicon = combine_lexicon(lexicon_name, expand_name)

# Run every scoring strategy over the same corpus/lexicon/mark triple,
# logging which strategy is active before each evaluation.
log_state('mean')
evaluate_mean(corpus, lexicon, mark)
log_state('tf_mean')
evaluate_tf_mean(corpus, lexicon, mark)
log_state('tfidf_mean')
evaluate_tfidf_mean(corpus, lexicon, mark)
log_state('geo')
evaluate_geo(corpus, lexicon, mark)
log_state('tfidf_geo')
evaluate_tfidf_geo(corpus, lexicon, mark)
from regression import linear_regression, linear_regression_multivariant
from positive_negative_split import get_pos_neg_va


def cv(data, target, multivariant=False):
    """Hold out 20% of the data and fit a linear-regression model on the rest.

    Args:
        data: feature matrix accepted by sklearn's ``train_test_split``.
        target: regression targets aligned row-for-row with ``data``.
        multivariant: when True, fit the multivariant ridge-regression
            variant instead of the plain single-variable regression.
    """
    X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(
        data, target, test_size=0.2, random_state=0)
    # PEP 8: test truthiness directly rather than comparing to False with `is`.
    if not multivariant:
        linear_regression(X_train, X_test, Y_train, Y_test, plot=False)
    else:
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test,
                                       cost_fun='Ridge_Regression')


normalize = True
corpus = load_corpus(get_file_path('cn_corpus'))
lexicon = load_lexicon(get_file_path('normalized_onezero_lexicon'))
mark = load_mark(get_file_path('normalized_onezero_mark'))

# # the following could use to check the same words in corpus and lexicon
# from visualization import show_common_term
# show_common_term(corpus, lexicon)
# exit()

# Alternative fusion strategies, kept for experimentation:
# valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion(corpus, lexicon, mark)
# valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_sqr(corpus, lexicon, mark)
# valence_mean, valence_true, arousal_mean, arousal_true = nonlinear_max_fusion(corpus, lexicon, mark)
# valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_tf(corpus, lexicon, mark)
# valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_tfidf(corpus, lexicon, mark)
valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_geo(
    corpus, lexicon, mark)
# valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_geo_tf(corpus, lexicon, mark)
# valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_geo_tfidf(corpus, lexicon, mark)
__author__ = "NLP-PC"
# coding: utf-8
# Script: print the project lexicon, then translate the ANEW word list
# to Traditional Chinese with goslate and print each translation.
import goslate
from load_data import load_lexicon
from file_name import get_file_path
from load_data import load_anew
import os

# Work relative to the project root, one level above this script.
os.chdir("..")

print(load_lexicon(get_file_path("lexicon")))

words, valence, arousal = load_anew(get_file_path("anew"))
translator = goslate.Goslate()

# translate() on a word list yields the translations one by one.
for translated_word in translator.translate(words, "zh-tw"):
    print(translated_word)

print(translator.translate("Hi", "zh-TW"))

# get_languages() maps language codes to language names.
languages = translator.get_languages()
print(languages["zh-TW"])
return result def scaling_onezero(num_list): # Note: the type of the parameter is np.array # Function: To normalize data result = [] for num in num_list: result.append(num / 9) return result if __name__ == '__main__': from load_data import load_lexicon from load_data import load_mark from file_name import get_file_path from save_data import save_csv lexicon = load_lexicon(get_file_path('lexicon')) mark = load_mark(get_file_path('mark')) lexicon = np.array(lexicon) mark = np.array(mark) ##################################### lexicon[:, 1] = scaling_onezero(np.array(lexicon[:, 1], dtype=float)) lexicon[:, 2] = scaling_onezero(np.array(lexicon[:, 2], dtype=float)) mark[:, 1] = scaling_onezero(np.array(mark[:, 1], dtype=float)) mark[:, 2] = scaling_onezero(np.array(mark[:, 2], dtype=float)) ###################################### save_csv(lexicon, get_file_path('normalized_onezero_lexicon')) save_csv(mark, get_file_path('normalized_onezero_mark'))
__author__ = 'NLP-PC'
# coding: utf-8
# Script: print the project lexicon, then translate the ANEW word list
# to Traditional Chinese with goslate and print each translation.
import goslate
from load_data import load_lexicon
from file_name import get_file_path
from load_data import load_anew
import os

# Work relative to the project root, one level above this script.
os.chdir('..')

print(load_lexicon(get_file_path('lexicon')))

words, valence, arousal = load_anew(get_file_path('anew'))
translator = goslate.Goslate()

# translate() on a word list yields the translations one by one.
for translated_word in translator.translate(words, 'zh-tw'):
    print(translated_word)

print(translator.translate('Hi', 'zh-TW'))

# get_languages() maps language codes to language names.
languages = translator.get_languages()
print(languages['zh-TW'])