# NOTE(review): orphaned fragment — the enclosing definition is not visible
# here, and line 1 is unindented while the rest sits one level deep.
corpus_name = get_file_path('cn_corpus')
    logger.info(r"loading corpus from : " + corpus_name)

    lexicon_name = get_file_path('lexicon')
    # NOTE(review): "form" in this log message looks like a typo for "from";
    # left unchanged because log text is runtime output.
    logger.info(r"loading lexicon form : " + lexicon_name)

    expand_name = get_file_path('neural_cand')
    logger.info(r"loading expand_word from : " + expand_name)

    mark_name = get_file_path('mark')
    logger.info(r"loading mark from : " + mark_name)

    # Load corpus, lexicon and gold-standard marks from the resolved paths.
    corpus = load_corpus(corpus_name)
    lexicon = load_lexicon(lexicon_name)
    mark = load_mark(mark_name)
    # log_state('use extend lexicon')
    # NOTE(review): this overwrites the plain lexicon loaded two lines up
    # with the expanded (lexicon + neural candidates) one.
    lexicon = combine_lexicon(lexicon_name, expand_name)

    # Run each fusion/evaluation strategy in turn, logging which is active.
    log_state('mean')
    evaluate_mean(corpus, lexicon, mark)
    log_state('tf_mean')
    evaluate_tf_mean(corpus, lexicon, mark)
    log_state('tfidf_mean')
    evaluate_tfidf_mean(corpus, lexicon, mark)

    log_state('geo')
    evaluate_geo(corpus, lexicon, mark)
    log_state('tfidf_geo')
    evaluate_tfidf_geo(corpus, lexicon, mark)
    # NOTE(review): 'tf_geo' is logged but the matching evaluate call is not
    # in this fragment — confirm against the full file.
    log_state('tf_geo')
# --- Example #2 (vote count: 0) — scraped code-example separator ---
# Build the CVAT corpus vocabulary and cache it to disk.
# NOTE(review): `dump_picle` (sic) is the project helper's actual name.
vocab = get_vocab(corpus)
dump_picle(vocab, get_file_path('CVAT_Vocab'))
print('Dump CVAT vocab OK')
# vocab = load_pickle(get_file_path('CVAT_Vocab'))
for i in vocab:
    print(i)
print(len(vocab))

# Build a k=400-dimensional embedding matrix over the vocab from the
# pre-trained Traditional-Chinese ('zh_tw') embeddings, together with the
# word -> row-index map, and cache both artifacts.
W, word_idx_map = build_embedding_matrix(load_embeddings('zh_tw'), vocab, k=400)
dump_picle(word_idx_map, get_file_path('word_idx_map_CVAT'))
print('dump word_idx_map successful')
dump_picle(W, './data/tmp/embedding_matrix_CVAT.p')
print('OK')

# word_idx_map = load_pickle(get_file_path('word_idx_map_CVAT'))
# Convert the corpus into padded index sequences paired with the gold
# valence/arousal ratings, then cache the processed data.
mark = load_mark(get_file_path('mark'))
valence, arousal = gold_valence_arousal(corpus, mark)
idx_data = make_idx_data(corpus, word_idx_map, max_len=200, kernel_size=5)

dump_picle([idx_data, valence, arousal], get_file_path('CVAT_processed_data'))
# idx_data, valence, arousal = load_pickle(get_file_path('CVAT_processed_data'))
print(idx_data.shape)
# NOTE(review): everything after this exit() is dead code in this script.
exit()

word_vecs = load_embeddings('zh_tw')

dim = len(word_vecs['我們'])  # 400

# NOTE(review): unlike the call above, no vocab argument is passed here —
# confirm build_embedding_matrix supports this form.
embedding_matrix, idx_map = build_embedding_matrix(word_vecs, k=dim)
print(embedding_matrix[1])
print(idx_map['我們'])
    # NOTE(review): orphaned fragment — `data`, `target` and `multivariant`
    # are bound outside this excerpt. Hold out 10% as a fixed test split.
    X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(data, target, test_size=0.1, random_state=10)
    if multivariant is False:
        # Single-feature linear regression baseline.
        linear_regression(X_train, X_test, Y_train, Y_test, plot=False)
    else:
        # Compare several multivariant regression cost functions on the
        # same train/test split.
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='ordinary_least_squares')
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='Ridge_Regression')
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='Bayesian_Regression')
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='SVR')
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='KNN_Reg')


if __name__ == '__main__':
    normalize = True
    # Load corpus and gold marks; the lexicon is the base lexicon combined
    # with the neural candidate expansion words.
    corpus = load_corpus(get_file_path('cn_corpus'))
    # lexicon = load_lexicon(get_file_path('lexicon'))
    mark = load_mark(get_file_path('mark'))
    lexicon = combine_lexicon(get_file_path('lexicon'), get_file_path('neural_cand'))

    # # the following could use to check the same words in corpus and lexicon
    # from visualization import show_common_term
    # show_common_term(corpus, lexicon)
    # exit()

    # Cross-validate valence and arousal predictions from the linear fusion.
    valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion(corpus, lexicon, mark)
    print('start.....')
    cv(valence_mean, valence_true, multivariant=False)
    cv(arousal_mean, arousal_true, multivariant=False)
    print('OK')

    # Same evaluation with the squared fusion variant; the fragment is cut
    # off after this point.
    valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_sqr(corpus, lexicon, mark)
    print('start.....')
# --- Example #4 (vote count: 0) — scraped code-example separator ---
    def cv(data, target, multivariant=False):
        """Evaluate a regression on a fixed 80/20 hold-out split.

        When multivariant is False a plain linear regression is fitted;
        any other value selects the multivariant ridge-regression path.
        """
        split = cross_validation.train_test_split(
            data, target, test_size=0.2, random_state=0)
        x_tr, x_te, y_tr, y_te = split
        if multivariant is False:
            linear_regression(x_tr, x_te, y_tr, y_te, plot=False)
            return
        linear_regression_multivariant(
            x_tr, x_te, y_tr, y_te, cost_fun='Ridge_Regression')

    normalize = True
    # Use the [0, 1]-normalized lexicon and marks for this run.
    corpus = load_corpus(get_file_path('cn_corpus'))
    lexicon = load_lexicon(get_file_path('normalized_onezero_lexicon'))
    mark = load_mark(get_file_path('normalized_onezero_mark'))

    # # the following could use to check the same words in corpus and lexicon
    # from visualization import show_common_term
    # show_common_term(corpus, lexicon)
    # exit()

    # Alternative fusion strategies kept for reference; only the
    # geometric-mean fusion is active.
    # valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion(corpus, lexicon, mark)
    # valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_sqr(corpus, lexicon, mark)
    # valence_mean, valence_true, arousal_mean, arousal_true = nonlinear_max_fusion(corpus, lexicon, mark)
    # valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_tf(corpus, lexicon, mark)
    # valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_tfidf(corpus, lexicon, mark)
    valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_geo(
        corpus, lexicon, mark)
    # valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_geo_tf(corpus, lexicon, mark)
    # valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_geo_tfidf(corpus, lexicon, mark)
    # NOTE(review): imported but not used within this excerpt.
    from positive_negative_split import get_pos_neg_va


    def cv(data, target, multivariant=False):
        """Run a regression experiment on an 80/20 train-test split.

        multivariant=False fits the plain linear regression; anything
        else fits the multivariant model with a ridge cost function.
        """
        xtr, xte, ytr, yte = cross_validation.train_test_split(
            data, target, test_size=0.2, random_state=0)
        if multivariant is not False:
            linear_regression_multivariant(xtr, xte, ytr, yte,
                                           cost_fun='Ridge_Regression')
        else:
            linear_regression(xtr, xte, ytr, yte, plot=False)


    normalize = True
    # Load the [0, 1]-normalized lexicon and marks for this experiment.
    corpus = load_corpus(get_file_path('cn_corpus'))
    lexicon = load_lexicon(get_file_path('normalized_onezero_lexicon'))
    mark = load_mark(get_file_path('normalized_onezero_mark'))

    # # the following could use to check the same words in corpus and lexicon
    # from visualization import show_common_term
    # show_common_term(corpus, lexicon)
    # exit()

    # Only the geometric-mean fusion is enabled; the commented lines are
    # the alternative strategies that were tried.
    # valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion(corpus, lexicon, mark)
    # valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_sqr(corpus, lexicon, mark)
    # valence_mean, valence_true, arousal_mean, arousal_true = nonlinear_max_fusion(corpus, lexicon, mark)
    # valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_tf(corpus, lexicon, mark)
    # valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_tfidf(corpus, lexicon, mark)
    valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_geo(corpus, lexicon, mark)
    # valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_geo_tf(corpus, lexicon, mark)
    # valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_geo_tfidf(corpus, lexicon, mark)
    # NOTE(review): orphaned fragment — `data`, `target` and `multivariant`
    # come from outside this excerpt. 10% of the data is held out for test.
    X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(data, target, test_size=0.1, random_state=10)
    if multivariant is False:
        # Single-feature linear regression baseline.
        linear_regression(X_train, X_test, Y_train, Y_test, plot=False)
    else:
        # Sweep several multivariant regression cost functions on the
        # identical split for comparison.
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun="ordinary_least_squares")
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun="Ridge_Regression")
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun="Bayesian_Regression")
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun="SVR")
        linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun="KNN_Reg")


if __name__ == "__main__":
    normalize = True
    # Corpus and gold marks are loaded directly; the lexicon is the base
    # lexicon combined with the neural candidate expansion words.
    corpus = load_corpus(get_file_path("cn_corpus"))
    # lexicon = load_lexicon(get_file_path('lexicon'))
    mark = load_mark(get_file_path("mark"))
    lexicon = combine_lexicon(get_file_path("lexicon"), get_file_path("neural_cand"))

    # # the following could use to check the same words in corpus and lexicon
    # from visualization import show_common_term
    # show_common_term(corpus, lexicon)
    # exit()

    # Cross-validate valence and arousal predicted by the linear fusion.
    valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion(corpus, lexicon, mark)
    print("start.....")
    cv(valence_mean, valence_true, multivariant=False)
    cv(arousal_mean, arousal_true, multivariant=False)
    print("OK")

    # Repeat with the squared fusion variant; the fragment is truncated
    # after this point.
    valence_mean, valence_true, arousal_mean, arousal_true = linear_fusion_sqr(corpus, lexicon, mark)
    print("start.....")
    # Resolve input paths and log each one as it is loaded.
    corpus_name = get_file_path('cn_corpus')
    logger.info(r"loading corpus from : " + corpus_name)

    lexicon_name = get_file_path('lexicon')
    # NOTE(review): "form" in this log message looks like a typo for "from";
    # left unchanged because log text is runtime output.
    logger.info(r"loading lexicon form : " + lexicon_name)

    expand_name = get_file_path('neural_cand')
    logger.info(r"loading expand_word from : " + expand_name)

    mark_name = get_file_path('mark')
    logger.info(r"loading mark from : " + mark_name)

    # Load corpus, lexicon and gold-standard marks.
    corpus = load_corpus(corpus_name)
    lexicon = load_lexicon(lexicon_name)
    mark = load_mark(mark_name)
    # log_state('use extend lexicon')
    # NOTE(review): overwrites the plain lexicon loaded above with the
    # expanded (lexicon + neural candidates) one.
    lexicon = combine_lexicon(lexicon_name, expand_name)

    # Evaluate each fusion strategy in turn, logging which is running.
    log_state('mean')
    evaluate_mean(corpus, lexicon, mark)
    log_state('tf_mean')
    evaluate_tf_mean(corpus, lexicon, mark)
    log_state('tfidf_mean')
    evaluate_tfidf_mean(corpus, lexicon, mark)

    log_state('geo')
    evaluate_geo(corpus, lexicon, mark)
    log_state('tfidf_geo')
    evaluate_tfidf_geo(corpus, lexicon, mark)
    # NOTE(review): 'tf_geo' is logged but its evaluate call is beyond the
    # end of this excerpt — confirm against the full file.
    log_state('tf_geo')