Code Example #1
    num_cluster = clustering_control_param['num_training_cluster']
    num_pos_cluster = num_cluster
    num_neg_cluster = num_cluster
    clustering_testdata = clustering_control_param['training_clustering_method']
    clustered_pos = clustering_testdata(pos, num_pos_cluster)
    clustered_neg = clustering_testdata(neg, num_neg_cluster)
    X_train, Y_train = clustered_pos + clustered_neg, [1] * num_pos_cluster + [0] * num_neg_cluster
    X_train, Y_train = np.array(X_train), np.array(Y_train)
    train_vec = vectorizer.fit_transform(X_train)

    # clustering test data
    from candidate_content import get_candidate

    expanding_pos_content, expanding_neg_content = get_candidate()
    expanding_pos_content, expanding_neg_content = np.array(expanding_pos_content), np.array(expanding_neg_content)
    expanding_pos_content_vec, expanding_neg_content_vec = vectorizer.transform(
        expanding_pos_content), vectorizer.transform(expanding_neg_content)

    # load the test data
    from Utils import load_test_data

    X_test, Y_test = load_test_data()

    # The code below is only run temporarily to generate the expanded test data; enable it when needed and keep it commented out otherwise.
    # from candidate_content import get_candidate_dynamic
    # get_candidate_dynamic(X_test, neg, 5, 'neg')
    # get_candidate_dynamic(X_test, pos, 5, 'pos')
    # exit()
    # End of the temporary code; to keep the code organised, this file is also saved separately as dynamic_classifer.py

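Example #1 builds its training set from a clustering function looked up via clustering_control_param['training_clustering_method'], which is defined elsewhere and not shown; judging by the labels built right after the call, it has to return exactly num_cluster merged documents per class. A minimal sketch of such a function, assuming a KMeans-over-tf-idf implementation (the name cluster_texts and the use of KMeans are illustrative, not taken from the source):

# Sketch only: stands in for clustering_control_param['training_clustering_method'].
# Assumption: the function returns one merged pseudo-document per cluster.
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

def cluster_texts(texts, num_cluster):
    features = TfidfVectorizer().fit_transform(texts)
    labels = KMeans(n_clusters=num_cluster, random_state=0).fit_predict(features)
    clusters = [[] for _ in range(num_cluster)]
    for text, label in zip(texts, labels):
        clusters[label].append(text)
    # merge each cluster into a single pseudo-document
    return [' '.join(cluster) for cluster in clusters]
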
Code Example #2
# load the model
clf = pickle.load(open("./acc_tmp/predict/classifier.p", "rb"))
logger.info('Classifier model loaded successfully')
# vectorize
# NOTE: fit_transform was originally called separately on each data set, which
# would produce feature spaces that do not match the loaded classifier. The
# vectorizer must instead be fitted once on the classifier's training corpus
# (that step is not part of this extract) and then used only via transform.
from customed_vectorizer import StemmedTfidfVectorizer
from parameters import vectorizer_param as param
vectorizer = StemmedTfidfVectorizer(**param)
neg_clustered_texts_vec = vectorizer.transform(neg_clustered_texts)
neg_extantion_content_vec = vectorizer.transform(neg_extantion_content)
pos_clustered_texts_vec = vectorizer.transform(pos_clustered_texts)
pos_extantion_content_vec = vectorizer.transform(pos_extantion_content)
logger.info('Vectorization finished')

# predict
predict_neg_clustered_texts_vec = clf.predict_proba(neg_clustered_texts_vec)[:, 1]
predict_neg_extantion_content_vec = clf.predict_proba(neg_extantion_content_vec)[:, 1]
predict_pos_clustered_texts_vec = clf.predict_proba(pos_clustered_texts_vec)[:, 1]
predict_pos_extantion_content_vec = clf.predict_proba(pos_extantion_content_vec)[:, 1]
from Utils import load_test_data

text, _ = load_test_data()
predict_testdata_without_clustering = clf.predict_proba(vectorizer.transform(text))[:, 1]
logger.info('Prediction finished, saving the results')

# save the results
pickle.dump(predict_neg_clustered_texts_vec, open("./data/predict_dynamics/predict_neg_clustered_texts_vec.p", "wb"))
pickle.dump(predict_neg_extantion_content_vec, open("./data/predict_dynamics/predict_neg_extantion_content_vec.p", "wb"))
pickle.dump(predict_pos_clustered_texts_vec, open("./data/predict_dynamics/predict_pos_clustered_texts_vec.p", "wb"))
pickle.dump(predict_pos_extantion_content_vec, open("./data/predict_dynamics/predict_pos_extantion_content_vec.p", "wb"))
pickle.dump(predict_testdata_without_clustering, open("./data/predict_dynamics/predict_testdata_without_clustering.p", "wb"))
logger.info('Results saved')
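
The script above pickles only the classifier, which is why the vectorizer has to be re-created and properly fitted again at prediction time. One way to avoid that, sketched below, is to persist the fitted vectorizer next to the classifier; the vectorizer path is hypothetical, only ./acc_tmp/predict/classifier.p appears in the source.

# Sketch only: persist and restore the classifier together with its fitted vectorizer.
import pickle

def save_model(clf, vectorizer,
               clf_path="./acc_tmp/predict/classifier.p",
               vec_path="./acc_tmp/predict/vectorizer.p"):   # hypothetical path
    with open(clf_path, "wb") as f:
        pickle.dump(clf, f)
    with open(vec_path, "wb") as f:
        pickle.dump(vectorizer, f)

def load_model(clf_path="./acc_tmp/predict/classifier.p",
               vec_path="./acc_tmp/predict/vectorizer.p"):   # hypothetical path
    with open(clf_path, "rb") as f:
        clf = pickle.load(f)
    with open(vec_path, "rb") as f:
        vectorizer = pickle.load(f)
    return clf, vectorizer
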
Code Example #3

import matplotlib.pyplot as plt


# signature inferred from the call near the end of this example; the beginning of
# the function body (apparently a pos/neg distance scatter plot) is missing from
# the extract, only the tail of that plot survives below
def analysis_dynamics(expand_with_pos, expand_with_neg, pos_expand_content, neg_expand_content):
    plt.xlabel('pos distance')
    plt.ylabel('neg distance')
    plt.show()

    # red lines: predicted positive-class scores for the pos-expanded variants, green
    # lines for the neg-expanded variants; scatter colour marks the true label
    plt.plot(range(0, length), expand_with_pos, 'r-', alpha=0.8)
    plt.scatter(range(0, length), expand_with_pos, marker='o', c=colors)
    plt.plot(range(0, length), pos_expand_content, 'r-', alpha=0.4)
    plt.scatter(range(0, length), expand_with_neg, marker='o', c=colors)
    plt.axhline(0.5, color='black')
    plt.plot(range(0, length), expand_with_neg, 'g-', alpha=0.8)
    plt.plot(range(0, length), neg_expand_content, 'g-', alpha=0.4)
    plt.show()

import pickle
from Utils import load_test_data

_, true = load_test_data()
length = len(true)
colors = ['red' if true[i] == 1 else 'green' for i in range(0, length)]
g = {'c': colors, 'alpha': 0.7}
expand_with_pos = pickle.load(open("./data/predict_dynamics/predict_pos_clustered_texts_vec.p", "rb"))
expand_with_neg = pickle.load(open("./data/predict_dynamics/predict_neg_clustered_texts_vec.p", "rb"))
pos_expand_content = pickle.load(open("./data/predict_dynamics/predict_pos_extantion_content_vec.p", "rb"))
neg_expand_content = pickle.load(open("./data/predict_dynamics/predict_neg_extantion_content_vec.p", "rb"))
analysis_dynamics(expand_with_pos, expand_with_neg, pos_expand_content, neg_expand_content)

# predict: label 0 (negative) when the negative probability of the pos-expanded
# variant exceeds the positive probability of the neg-expanded variant
predict1 = [0 if ele else 1 for ele in ((1 - expand_with_pos) > expand_with_neg)]
from analysis import analysis_result as ar
ar(predict1, true)
# assumption: the non-clustered baseline labels are derived from the probabilities
# saved in Code Example #2, thresholded at 0.5
predict_proba_without_clustering = pickle.load(
    open("./data/predict_dynamics/predict_testdata_without_clustering.p", "rb"))
predict_lable_testdata_without_clustering = [1 if p > 0.5 else 0 for p in predict_proba_without_clustering]
ar(predict_lable_testdata_without_clustering, true)
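
analysis_result is imported from analysis.py, which is not part of this extract. A minimal sketch of what it might report, assuming the usual binary-classification metrics (the name analysis_result_sketch is illustrative):

# Sketch only: stands in for analysis.analysis_result.
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def analysis_result_sketch(predict, true):
    accuracy = accuracy_score(true, predict)
    precision, recall, f_score, _ = precision_recall_fscore_support(
        true, predict, average='binary')
    print('Accuracy: %.3f  Precision: %.3f  Recall: %.3f  F: %.3f'
          % (accuracy, precision, recall, f_score))
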
Code Example #4
            else:
                pos.append(X_train[i])
        num_cluster = clustering_control_param['num_training_cluster']
        num_pos_cluster = num_cluster
        num_neg_cluster = num_cluster
        clustering_testdata = clustering_control_param['training_clustering_method']
        clustered_pos = clustering_testdata(pos, num_pos_cluster)
        clustered_neg = clustering_testdata(neg, num_neg_cluster)
        X_train, Y_train = (clustered_pos + clustered_neg,
                            [1] * num_pos_cluster + [0] * num_neg_cluster)
        X_train, Y_train = np.array(X_train), np.array(Y_train)

    from Utils import load_test_data
    X_test, Y_test = load_test_data()

    if (parameters['clustering_test_data'] == True
            and clustering_control_param['use_additional_texts'] == False):
        clustering_test_data_method = clustering_control_param['clustering_test_data_method']
        X_test, X_test_labels = clustering_test_data_method(
            X_test, clustering_control_param['num_test_cluster'])
    elif (parameters['clustering_test_data'] == True
          and clustering_control_param['use_additional_texts'] == True):
        # alternative approach: clustering_texts_using_trainingset
        clustering_test_data_method = clustering_control_param['clustering_test_data_method']
        cluster_size = clustering_control_param['cluster_size']
        if clustering_control_param['additional_texts'] == 'test_data':
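
This branch (cut off at the line above) is driven entirely by two configuration dictionaries from parameters.py, which are not shown. The sketch below reconstructs their shape from the keys read above; every value is an illustrative placeholder, not the project's actual setting.

# Sketch only: the configuration shape implied by the keys accessed above.
def placeholder_clustering(texts, num_cluster):
    # stands in for the project's clustering functions (e.g. a KMeans-based one)
    raise NotImplementedError

clustering_control_param = {
    'num_training_cluster': 10,               # placeholder value
    'training_clustering_method': placeholder_clustering,
    'clustering_test_data_method': placeholder_clustering,
    'num_test_cluster': 10,                   # placeholder value
    'use_additional_texts': False,
    'additional_texts': 'test_data',
    'cluster_size': 5,                        # placeholder value
}
parameters = {
    'clustering_test_data': True,             # only the key read in this branch
}
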
Code Example #5
                     alpha=0.6,
                     color='b')
    plt.xlabel('Result')
    plt.ylabel('Scores')
    plt.title('Experiment analysis')
    plt.xticks(index + bar_width,
               ('Accuracy', 'F', 'Precision', 'Recall', 'F'))
    plt.ylim(0, 1)
    plt.tight_layout()
    plt.show()


if __name__ == "__main__":
    # TRUE LABELS
    from Utils import load_test_data
    _, true = load_test_data()
    # Predict labels
    predict_label_1 = pickle.load(
        open("./acc_tmp/predict/predict_label_1.p", "rb"))
    predict_label_2 = pickle.load(
        open("./acc_tmp/predict/predict_label_2.p", "rb"))
    predict_label_3 = pickle.load(
        open("./acc_tmp/predict/predict_label_3.p", "rb"))
    predict_label_4 = pickle.load(
        open("./acc_tmp/predict/predict_label_4.p", "rb"))
    analysis_result(predict_label_1, true)
    analysis_result(predict_label_2, true)
    analysis_result(predict_label_3, true)
    analysis_result(predict_label_4, true)

    predict_without_clustering = pickle.load(
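
The plotting code at the top of this example is cut off (only the tail of a plt.bar call survives) and the final pickle.load above is truncated. Below is a self-contained sketch of the grouped bar-chart pattern the surviving calls imply; the score values are placeholders, not experimental results, and the tick labels are copied verbatim from the snippet.

# Sketch only: grouped bar chart in the style of the calls above.
import numpy as np
import matplotlib.pyplot as plt

scores_run1 = [0.80, 0.75, 0.78, 0.74, 0.76]   # placeholder values
scores_run2 = [0.83, 0.79, 0.81, 0.77, 0.79]   # placeholder values
index = np.arange(len(scores_run1))
bar_width = 0.35

plt.bar(index, scores_run1, bar_width, alpha=0.6, color='b', label='run 1')
plt.bar(index + bar_width, scores_run2, bar_width, alpha=0.6, color='r', label='run 2')
plt.xlabel('Result')
plt.ylabel('Scores')
plt.title('Experiment analysis')
# tick labels copied from the snippet above (including the repeated 'F')
plt.xticks(index + bar_width, ('Accuracy', 'F', 'Precision', 'Recall', 'F'))
plt.ylim(0, 1)
plt.legend()
plt.tight_layout()
plt.show()
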
Code Example #6
def train_model(clc_factory, X, Y):
    print('start train_model...')
    # fix the random state for deterministic behaviour
    cv = ShuffleSplit(n=len(X), n_iter=1, test_size=0.0, random_state=0)

    # accuracy
    scores = []
    # AUC
    pr_scores = []
    # F1 score
    f1 = []

    # use a for loop for now; it iterates only once
    for train, test in cv:
        # test method 1: do not cluster the test data or the training data
        # '''
        X_train, Y_train = X[train], Y[train]
        X_test, Y_test = load_test_data()
        X_test, Y_test = np.array(X_test), np.array(Y_test)

        # clf = pickle.load(open("./acc_tmp/clf_all_data_noclustering.p", "rb"))
        clf = clc_factory()
        clf.fit(X_train, Y_train)

        pickle.dump(clf, open("./acc_tmp/clf.p", "wb"))

        # accuracy
        test_score = clf.score(X_test, Y_test)

        scores.append(test_score)
        proba = clf.predict_proba(X_test)

        precision, recall, pr_thresholds = precision_recall_curve(Y_test, proba[:, 1])
        # AUC
        aera_uc = auc(recall, precision)
        pr_scores.append(aera_uc)

        #F1_score
        f1.append(f1_score(Y_test, clf.predict(X_test), average='macro'))

        summary = (np.mean(scores), np.mean(pr_scores), np.mean(f1))
        # Area Under Curve
        print('Accuracy: %.3f\nP/R AUC: %.3f\nMacro-F score: %.3f' % summary)

        # plot
        pl.clf()
        pl.plot(recall, precision, label='Precision-Recall curve')
        pl.xlabel('Recall')
        pl.ylabel('Precision')
        pl.ylim([0.0, 1.05])
        pl.xlim([0.0, 1.0])
        pl.title('Precision-Recall Curve (AUC=%0.3f)' % aera_uc)
        pl.legend(loc="lower left")
        pl.show()
        # '''
        '''
        # test method 2: do not cluster the test data; use the specified test data instead
        X_train, Y_train = X[train], Y[train]
        clf = clc_factory()
        clf.fit(X_train, Y_train)

        pickle.dump(clf, open("./acc_tmp/clf_all_data.p", "wb"))

        X_test, Y_test=load_test_data()
        X_test,X_test_labels=build_clustered_testdata(X_test)
        # print(X_test_labels,len(X_test_labels))

        X_test, Y_test=np.array(X_test), np.array(Y_test)

        true_labels=Y_test
        predict_labels=np.array(sentiment_map_cluster2tweets(clf.predict(X_test),X_test_labels))
        precision,recall,fbeta_score,support=precision_recall_fscore_support(true_labels, predict_labels, average='binary')
        print('Precision: %.3f\nRecall: %.3f\nF score: %.3f' % (precision, recall, fbeta_score))
        # '''

        '''
    X_train, Y_train = np.array(X_train), np.array(Y_train)
    train_vec = vectorizer.fit_transform(X_train)

    # clustering test data
    from candidate_content import get_candidate

    expanding_pos_content, expanding_neg_content = get_candidate()
    expanding_pos_content, expanding_neg_content = np.array(
        expanding_pos_content), np.array(expanding_neg_content)
    expanding_pos_content_vec, expanding_neg_content_vec = vectorizer.transform(
        expanding_pos_content), vectorizer.transform(expanding_neg_content)

    # load the test data
    from Utils import load_test_data

    X_test, Y_test = load_test_data()

    # The code below is only run temporarily to generate the expanded test data; enable it when needed and keep it commented out otherwise.
    from candidate_content import get_candidate_dynamic
    # get_candidate_dynamic(X_test, neg, 8, 'neg')
    # get_candidate_dynamic(X_test, pos, 8, 'pos')
    # exit()
    # End of the temporary code; to keep the code organised, this file is also saved separately as dynamic_classifer.py

    from test_data_clustering import expand_text_list as expanding_method

    expanded_texts_with_pos, expanded_texts_with_neg = expanding_method(
        X_test,
        expanding_pos_content), expanding_method(X_test, expanding_neg_content)
    expanded_texts_vec_with_pos, expanded_texts_vec_with_neg = vectorizer.transform(
        expanded_texts_with_pos), vectorizer.transform(expanded_texts_with_neg)
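
train_model above is written against the pre-0.18 scikit-learn API (ShuffleSplit taking n and n_iter and being iterated directly). A rough sketch of the same single-split setup against the current sklearn.model_selection API follows; note that the modern ShuffleSplit rejects test_size=0.0, so a small held-out fraction is used here even though the snippet evaluates on a separately loaded test set. The helper name single_split_fit is illustrative.

# Sketch only: modern equivalent of the single ShuffleSplit iteration above.
from sklearn.model_selection import ShuffleSplit

def single_split_fit(clc_factory, X, Y):
    # one random split; modern ShuffleSplit requires a non-zero test_size
    cv = ShuffleSplit(n_splits=1, test_size=0.1, random_state=0)
    for train_idx, _ in cv.split(X):
        clf = clc_factory()
        clf.fit(X[train_idx], Y[train_idx])
        return clf
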
Code Example #8
def train_model(clc_factory, X, Y):
    print('start train_model...')
    # fix the random state for deterministic behaviour
    cv = ShuffleSplit(n=len(X), n_iter=1, test_size=0.0, random_state=0)

    # accuracy
    scores = []
    # AUC
    pr_scores = []
    # F1 score
    f1 = []

    # use a for loop for now; it iterates only once
    for train, test in cv:
        # test method 1: do not cluster the test data or the training data
        # '''
        X_train, Y_train = X[train], Y[train]
        X_test, Y_test = load_test_data()
        X_test, Y_test = np.array(X_test), np.array(Y_test)

        # clf = pickle.load(open("./acc_tmp/clf_all_data_noclustering.p", "rb"))
        clf = clc_factory()
        clf.fit(X_train, Y_train)

        pickle.dump(clf, open("./acc_tmp/clf.p", "wb"))

        # accuracy
        test_score = clf.score(X_test, Y_test)

        scores.append(test_score)
        proba = clf.predict_proba(X_test)

        precision, recall, pr_thresholds = precision_recall_curve(
            Y_test, proba[:, 1])
        # AUC
        aera_uc = auc(recall, precision)
        pr_scores.append(aera_uc)

        #F1_score
        f1.append(f1_score(Y_test, clf.predict(X_test), average='macro'))

        summary = (np.mean(scores), np.mean(pr_scores), np.mean(f1))
        # Area Under Curve
        print('Accuracy: %.3f\nP/R AUC: %.3f\nMacro-F score: %.3f' % summary)

        # plot
        pl.clf()
        pl.plot(recall, precision, label='Precision-Recall curve')
        pl.xlabel('Recall')
        pl.ylabel('Precision')
        pl.ylim([0.0, 1.05])
        pl.xlim([0.0, 1.0])
        pl.title('Precision-Recall Curve (AUC=%0.3f)' % aera_uc)
        pl.legend(loc="lower left")
        pl.show()
        # '''
        '''
        # test method 2: do not cluster the test data; use the specified test data instead
        X_train, Y_train = X[train], Y[train]
        clf = clc_factory()
        clf.fit(X_train, Y_train)

        pickle.dump(clf, open("./acc_tmp/clf_all_data.p", "wb"))

        X_test, Y_test=load_test_data()
        X_test,X_test_labels=build_clustered_testdata(X_test)
        # print(X_test_labels,len(X_test_labels))

        X_test, Y_test=np.array(X_test), np.array(Y_test)

        true_labels=Y_test
        predict_labels=np.array(sentiment_map_cluster2tweets(clf.predict(X_test),X_test_labels))
        precision,recall,fbeta_score,support=precision_recall_fscore_support(true_labels, predict_labels, average='binary')
        print('Precision: %.3f\nRecall: %.3f\nF score: %.3f' % (precision, recall, fbeta_score))
        # '''
        '''
    X_train, Y_train = clustered_pos + clustered_neg, [1] * num_pos_cluster + [0] * num_neg_cluster
    X_train, Y_train = np.array(X_train), np.array(Y_train)
    train_vec = vectorizer.fit_transform(X_train)

    # clustering test data
    from candidate_content import get_candidate

    expanding_pos_content, expanding_neg_content = get_candidate()
    expanding_pos_content, expanding_neg_content = np.array(expanding_pos_content), np.array(expanding_neg_content)
    expanding_pos_content_vec, expanding_neg_content_vec = vectorizer.transform(
        expanding_pos_content), vectorizer.transform(expanding_neg_content)

    # load the test data
    from Utils import load_test_data

    X_test, Y_test = load_test_data()

    # The code below is only run temporarily to generate the expanded test data; enable it when needed and keep it commented out otherwise.
    from candidate_content import get_candidate_dynamic
    # get_candidate_dynamic(X_test, neg, 8, 'neg')
    # get_candidate_dynamic(X_test, pos, 8, 'pos')
    # exit()
    # End of the temporary code; to keep the code organised, this file is also saved separately as dynamic_classifer.py

    from test_data_clustering import expand_text_list as expanding_method

    expanded_texts_with_pos, expanded_texts_with_neg = expanding_method(
        X_test, expanding_pos_content), expanding_method(X_test, expanding_neg_content)
    expanded_texts_vec_with_pos, expanded_texts_vec_with_neg = vectorizer.transform(
        expanded_texts_with_pos), vectorizer.transform(expanded_texts_with_neg)
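
The tail of this example expands every test text with candidate content via expand_text_list from test_data_clustering, which is not part of this extract. A minimal sketch of what that helper might do, judging only from how it is called here and assuming one candidate string per test text (the name expand_text_list_sketch is illustrative):

# Sketch only: stands in for test_data_clustering.expand_text_list.
def expand_text_list_sketch(texts, expanding_content):
    # assumption: one candidate string per test text; each text is concatenated
    # with its candidate so the expanded version can be vectorized like the original
    return [text + ' ' + extra for text, extra in zip(texts, expanding_content)]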