# Beispiel #1
# 0
    def cv(data, target, multivariant=False):
        """Split *data*/*target* 80/20 and fit a linear regression on the split.

        Args:
            data: feature matrix passed straight to ``train_test_split``.
            target: target values aligned with *data*.
            multivariant: when False, run plain ``linear_regression`` (no plot);
                otherwise run ``linear_regression_multivariant`` with a
                ridge-regression cost function.

        NOTE(review): ``cross_validation`` looks like the old
        ``sklearn.cross_validation`` module, which was removed in favour of
        ``sklearn.model_selection`` — confirm which module is imported upstream.
        ``random_state=0`` makes the split deterministic across runs.
        """
        X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(
            data, target, test_size=0.2, random_state=0)
        if multivariant is False:
            linear_regression(X_train, X_test, Y_train, Y_test, plot=False)
        else:
            linear_regression_multivariant(X_train,
                                           X_test,
                                           Y_train,
                                           Y_test,
                                           cost_fun='Ridge_Regression')

    # --- Fusion / evaluation pipeline over the Chinese corpus. ---
    # NOTE(review): `normalize` is not read anywhere in this chunk — it may be
    # used further down in the enclosing scope; left in place.
    normalize = True
    corpus = load_corpus(get_file_path('cn_corpus'))
    lexicon = load_lexicon(get_file_path('normalized_onezero_lexicon'))
    mark = load_mark(get_file_path('normalized_onezero_mark'))

    # # the following could use to check the same words in corpus and lexicon
    # from visualization import show_common_term
    # show_common_term(corpus, lexicon)
    # exit()

    # Alternative fusion strategies, kept for experimentation; only the
    # geometric-mean fusion below is active.
    # valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion(corpus, lexicon, mark)
    # valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_sqr(corpus, lexicon, mark)
    # valence_mean, valence_true, arousal_mean, arousal_true = nonlinear_max_fusion(corpus, lexicon, mark)
    # valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_tf(corpus, lexicon, mark)
    # valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_tfidf(corpus, lexicon, mark)
    valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_geo(
        corpus, lexicon, mark)
    # valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_geo_tf(corpus, lexicon, mark)
    logger.info(r"running %s" % ''.join(sys.argv))

    # Resolve input file paths and log them before (re)loading.
    corpus_name = get_file_path('cn_corpus')
    logger.info(r"loading corpus from : " + corpus_name)

    lexicon_name = get_file_path('lexicon')
    # Fixed log-message typo: "form" -> "from".
    logger.info(r"loading lexicon from : " + lexicon_name)

    expand_name = get_file_path('neural_cand')
    logger.info(r"loading expand_word from : " + expand_name)

    mark_name = get_file_path('mark')
    logger.info(r"loading mark from : " + mark_name)

    corpus = load_corpus(corpus_name)
    lexicon = load_lexicon(lexicon_name)
    mark = load_mark(mark_name)
    # log_state('use extend lexicon')
    # The expanded (neural candidate) lexicon replaces the plain one just loaded.
    lexicon = combine_lexicon(lexicon_name, expand_name)

    # Evaluate each aggregation strategy; log_state tags the run before each.
    log_state('mean')
    evaluate_mean(corpus, lexicon, mark)
    log_state('tf_mean')
    evaluate_tf_mean(corpus, lexicon, mark)
    log_state('tfidf_mean')
    evaluate_tfidf_mean(corpus, lexicon, mark)

    log_state('geo')
    evaluate_geo(corpus, lexicon, mark)
    log_state('tfidf_geo')
    evaluate_tfidf_geo(corpus, lexicon, mark)
    from regression import linear_regression, linear_regression_multivariant
    from positive_negative_split import get_pos_neg_va


    def cv(data, target, multivariant=False):
        """Evaluate a linear model on a deterministic 80/20 train/test split.

        *multivariant* False selects the plain ``linear_regression`` (plotting
        disabled); any other value selects the multivariant version with a
        ridge-regression cost function.
        """
        split = cross_validation.train_test_split(
            data, target, test_size=0.2, random_state=0)
        x_tr, x_te, y_tr, y_te = split
        # Strict identity check preserved: only the literal False takes the
        # simple-regression branch.
        if multivariant is False:
            linear_regression(x_tr, x_te, y_tr, y_te, plot=False)
        else:
            linear_regression_multivariant(
                x_tr, x_te, y_tr, y_te, cost_fun='Ridge_Regression')


    # --- Fusion / evaluation pipeline (second copy of the experiment). ---
    # NOTE(review): `normalize` is not read anywhere in this chunk — left as-is.
    normalize = True
    corpus = load_corpus(get_file_path('cn_corpus'))
    lexicon = load_lexicon(get_file_path('normalized_onezero_lexicon'))
    mark = load_mark(get_file_path('normalized_onezero_mark'))

    # # the following could use to check the same words in corpus and lexicon
    # from visualization import show_common_term
    # show_common_term(corpus, lexicon)
    # exit()

    # Alternative fusion strategies, kept for experimentation; only the
    # geometric-mean fusion below is active.
    # valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion(corpus, lexicon, mark)
    # valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_sqr(corpus, lexicon, mark)
    # valence_mean, valence_true, arousal_mean, arousal_true = nonlinear_max_fusion(corpus, lexicon, mark)
    # valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_tf(corpus, lexicon, mark)
    # valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_tfidf(corpus, lexicon, mark)
    valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_geo(corpus, lexicon, mark)
    # valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_geo_tf(corpus, lexicon, mark)
    # valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_geo_tfidf(corpus, lexicon, mark)
    logger.info(r"running %s" % ''.join(sys.argv))

    # Resolve input file paths and log them before (re)loading.
    corpus_name = get_file_path('cn_corpus')
    logger.info(r"loading corpus from : " + corpus_name)

    lexicon_name = get_file_path('lexicon')
    # Fixed log-message typo: "form" -> "from".
    logger.info(r"loading lexicon from : " + lexicon_name)

    expand_name = get_file_path('neural_cand')
    logger.info(r"loading expand_word from : " + expand_name)

    mark_name = get_file_path('mark')
    logger.info(r"loading mark from : " + mark_name)

    corpus = load_corpus(corpus_name)
    lexicon = load_lexicon(lexicon_name)
    mark = load_mark(mark_name)
    # log_state('use extend lexicon')
    # The expanded (neural candidate) lexicon replaces the plain one just loaded.
    lexicon = combine_lexicon(lexicon_name, expand_name)

    # Evaluate each aggregation strategy; log_state tags the run before each.
    log_state('mean')
    evaluate_mean(corpus, lexicon, mark)
    log_state('tf_mean')
    evaluate_tf_mean(corpus, lexicon, mark)
    log_state('tfidf_mean')
    evaluate_tfidf_mean(corpus, lexicon, mark)

    log_state('geo')
    evaluate_geo(corpus, lexicon, mark)
    log_state('tfidf_geo')
    evaluate_tfidf_geo(corpus, lexicon, mark)
__author__ = "NLP-PC"
# coding: utf-8  (NOTE(review): ineffective here — a coding declaration must be
# on line 1 or 2 of the file to be honored by the interpreter)
import goslate
from load_data import load_lexicon
from file_name import get_file_path
from load_data import load_anew
import os

# Run relative to the project root so get_file_path results resolve.
os.chdir("..")
# Quick sanity dump of the lexicon.
print(load_lexicon(get_file_path("lexicon")))
# Translate every ANEW word to traditional Chinese via Google Translate
# (network call through goslate) and print each translation.
words, valence, arousal = load_anew(get_file_path("anew"))
gs = goslate.Goslate()
for tw_text in gs.translate(words, "zh-tw"):
    print(tw_text)
print(gs.translate("Hi", "zh-TW"))
# You could get all supported language list through get_languages
languages = gs.get_languages()
print(languages["zh-TW"])
# Beispiel #6
# 0
    return result


def scaling_onezero(num_list, max_value=9):
    """Scale numeric values onto a zero-to-one range.

    Generalized from the original hard-coded divisor: the scale maximum is now
    a parameter defaulting to 9, so existing callers (9-point affect ratings)
    behave exactly as before.

    Parameters
    ----------
    num_list : iterable of numbers
        Values assumed to lie in ``[0, max_value]`` (callers here pass a
        ``np.array`` of floats, but any iterable of numbers works).
    max_value : number, optional
        Maximum of the source scale; each value is divided by it.

    Returns
    -------
    list
        ``num / max_value`` for each input value, in input order.
    """
    return [num / max_value for num in num_list]


if __name__ == '__main__':
    from load_data import load_lexicon
    from load_data import load_mark
    from file_name import get_file_path
    from save_data import save_csv

    # Load the raw lexicon and mark tables and wrap them as numpy arrays so
    # column slicing works below.
    # NOTE(review): `np` must be imported elsewhere in this file — the import
    # is not visible in this chunk; confirm `import numpy as np` exists.
    lexicon = load_lexicon(get_file_path('lexicon'))
    mark = load_mark(get_file_path('mark'))
    lexicon = np.array(lexicon)
    mark = np.array(mark)
    #####################################
    # Columns 1 and 2 (presumably valence and arousal scores on a 0-9 scale —
    # verify against the loaders) are normalized into [0, 1].
    lexicon[:, 1] = scaling_onezero(np.array(lexicon[:, 1], dtype=float))
    lexicon[:, 2] = scaling_onezero(np.array(lexicon[:, 2], dtype=float))
    mark[:, 1] = scaling_onezero(np.array(mark[:, 1], dtype=float))
    mark[:, 2] = scaling_onezero(np.array(mark[:, 2], dtype=float))
    ######################################
    # Persist the normalized tables for the downstream fusion experiments.
    save_csv(lexicon, get_file_path('normalized_onezero_lexicon'))
    save_csv(mark, get_file_path('normalized_onezero_mark'))
# Beispiel #7
# 0
__author__ = 'NLP-PC'
# coding: utf-8
import goslate
from load_data import load_lexicon
from file_name import get_file_path
from load_data import load_anew
import os

# Work relative to the project root so get_file_path results resolve.
os.chdir('..')

# Sanity-check: dump the lexicon contents.
print(load_lexicon(get_file_path('lexicon')))

# Translate the ANEW word list into traditional Chinese (network call via
# Google Translate through goslate) and echo every translation.
words, valence, arousal = load_anew(get_file_path('anew'))
translator = goslate.Goslate()
for translated_word in translator.translate(words, 'zh-tw'):
    print(translated_word)
print(translator.translate('Hi', 'zh-TW'))

# You could get all supported language list through get_languages
supported_languages = translator.get_languages()
print(supported_languages['zh-TW'])