# Cluster the positive/negative training texts into a fixed number of
# clusters each, label one sample per cluster, vectorize, and load the
# candidate expansion texts plus the external test set.
# Module-level script fragment: names bound here (X_train, trian_vec,
# X_test, ...) may be used by code outside this chunk, so none are renamed.
num_cluster = clustering_control_param['num_training_cluster']
num_pos_cluster = num_cluster
num_neg_cluster = num_cluster
# 'training_clustering_method' is a callable: (texts, k) -> k cluster texts.
clustering_testdata = clustering_control_param['training_clustering_method']
clustered_pos = clustering_testdata(pos, num_pos_cluster)
clustered_neg = clustering_testdata(neg, num_neg_cluster)
# One label per cluster: 1 for positive clusters, 0 for negative clusters.
X_train, Y_train = clustered_pos + clustered_neg, [1] * num_pos_cluster + [0] * num_neg_cluster
X_train, Y_train = np.array(X_train), np.array(Y_train)
# NOTE(review): 'trian_vec' looks like a typo for 'train_vec'; left as-is in
# case later (unseen) code refers to this exact spelling.
trian_vec = vectorizer.fit_transform(X_train)
# clustering test data
from candidate_content import get_candidate
expanding_pos_content, expanding_neg_content = get_candidate()
expanding_pos_content, expanding_neg_content = np.array(expanding_pos_content), np.array(expanding_neg_content)
# Reuse the vocabulary fitted on the training clusters (transform, not fit).
expanding_pos_content_vec, expanding_neg_content_vec = vectorizer.transform(
    expanding_pos_content), vectorizer.transform(expanding_neg_content)
# Load the test data.
from Utils import load_test_data
X_test, Y_test = load_test_data()
# The code below is only run on demand to generate the expanded test data;
# keep it commented out when not needed.
# from candidate_content import get_candidate_dynamic
# get_candidate_dynamic(X_test, neg, 5, 'neg')
# get_candidate_dynamic(X_test, pos, 5, 'pos')
# exit()
# End of the temporary code. For better code organisation this file was also
# saved as a copy named dynamic_classifer.py.
# 加载模型 clf = pickle.load(open("./acc_tmp/predict/classifier.p", "rb")) logger.info('成功加载分类器模型') # 向量化 from customed_vectorizer import StemmedTfidfVectorizer from parameters import vectorizer_param as param vectorizer = StemmedTfidfVectorizer(**param) neg_clustered_texts_vec = vectorizer.fit_transform(neg_clustered_texts) neg_extantion_content_vec = vectorizer.fit_transform(neg_extantion_content) pos_clustered_texts_vec = vectorizer.fit_transform(pos_clustered_texts) pos_extantion_content_vec = vectorizer.fit_transform(pos_extantion_content) logger.info('向量化完成') # 预测 predict_neg_clustered_texts_vec = clf.predict_proba(neg_clustered_texts_vec)[:, 1] predict_neg_extantion_content_vec = clf.predict_proba(neg_extantion_content_vec)[:, 1] predict_pos_clustered_texts_vec = clf.predict_proba(pos_clustered_texts_vec)[:, 1] predict_pos_extantion_content_vec = clf.predict_proba(pos_extantion_content_vec)[:, 1] from Utils import load_test_data text, _ = load_test_data() predict_testdata_without_clustering = clf.predict_proba(vectorizer.fit_transform(text))[:, 1] logger.info('完成预测,即将保存') # 保存结果 pickle.dump(predict_neg_clustered_texts_vec, open("./data/predict_dynamics/predict_neg_clustered_texts_vec.p", "wb")) pickle.dump(predict_neg_extantion_content_vec, open("./data/predict_dynamics/predict_neg_extantion_content_vec.p", "wb")) pickle.dump(predict_pos_clustered_texts_vec, open("./data/predict_dynamics/predict_pos_clustered_texts_vec.p", "wb")) pickle.dump(predict_pos_extantion_content_vec, open("./data/predict_dynamics/predict_pos_extantion_content_vec.p", "wb")) pickle.dump(predict_testdata_without_clustering, open("./data/predict_dynamics/predict_testdata_without_clustering.p", "wb")) logger.info('完成保存')
    # Tail of analysis_dynamics(...): the 'def' line lies above this chunk.
    # Overlays the four probability series, with scatter points coloured by
    # the true label (module-level 'colors' below).
    plt.xlabel('pos distance')
    plt.ylabel('neg distance')
    # NOTE(review): this show() fires before the plots below are drawn --
    # confirm whether earlier plotting code precedes this chunk.
    plt.show()
    plt.plot(range(0, length), expand_with_pos, 'r-', alpha=0.8)
    plt.scatter(range(0, length), expand_with_pos, marker='o', c=colors)
    plt.plot(range(0, length), pos_expand_content, 'r-', alpha=0.4)
    plt.scatter(range(0, length), expand_with_neg, marker='o', c=colors)
    plt.axhline(0.5, color='black')  # 0.5 decision-threshold reference line
    plt.plot(range(0, length), expand_with_neg, 'g-', alpha=0.8)
    plt.plot(range(0, length), neg_expand_content, 'g-', alpha=0.4)
    plt.show()


import pickle
from Utils import load_test_data

_, true = load_test_data()
length = len(true)
# red = positive ground-truth label, green = negative.
colors = ['red' if true[i] == 1 else 'green' for i in range(0, length)]
g = {'c': colors, 'alpha': 0.7}
# Probability series produced and pickled by the prediction script.
expand_with_pos = pickle.load(open("./data/predict_dynamics/predict_pos_clustered_texts_vec.p", "rb"))
expand_with_neg = pickle.load(open("./data/predict_dynamics/predict_neg_clustered_texts_vec.p", "rb"))
pos_expand_content = pickle.load(open("./data/predict_dynamics/predict_pos_extantion_content_vec.p", "rb"))
neg_expand_content = pickle.load(open("./data/predict_dynamics/predict_neg_extantion_content_vec.p", "rb"))
analysis_dynamics(expand_with_pos, expand_with_neg, pos_expand_content, neg_expand_content)
# predict: label 0 where (1 - P_pos) > P_neg, else 1.
predict1 = [0 if ele == True else 1 for ele in ((1 - expand_with_pos) > expand_with_neg)]
from analysis import analysis_result as ar
ar(predict1, true)
# NOTE(review): predict_lable_testdata_without_clustering is not defined
# anywhere in this chunk -- presumably bound above/elsewhere; otherwise this
# line raises NameError. (The prediction script saves a pickle named
# 'predict_testdata_without_clustering.p'.)
ar(predict_lable_testdata_without_clustering, true)
    # Tail of a partition loop whose start lies above this chunk: samples
    # not routed to 'neg' are appended to 'pos'.
    else:
        pos.append(X_train[i])
# Cluster each sentiment class into the configured number of clusters and
# build one label per cluster (1 = positive cluster, 0 = negative cluster).
num_cluster = clustering_control_param['num_training_cluster']
num_pos_cluster = num_cluster
num_neg_cluster = num_cluster
# 'training_clustering_method' is a callable: (texts, k) -> k cluster texts.
clustering_testdata = clustering_control_param['training_clustering_method']
clustered_pos = clustering_testdata(pos, num_pos_cluster)
clustered_neg = clustering_testdata(neg, num_neg_cluster)
X_train, Y_train = clustered_pos + clustered_neg, [1] * num_pos_cluster + [0] * num_neg_cluster
X_train, Y_train = np.array(X_train), np.array(Y_train)
from Utils import load_test_data
X_test, Y_test = load_test_data()
# Optionally cluster the test data; which branch runs is driven entirely by
# the 'parameters' / 'clustering_control_param' configuration dicts.
if parameters['clustering_test_data'] == True and clustering_control_param['use_additional_texts'] == False:
    clustering_test_data_method = clustering_control_param['clustering_test_data_method']
    X_test, X_test_labels = clustering_test_data_method(
        X_test, clustering_control_param['num_test_cluster'])
elif parameters['clustering_test_data'] == True and clustering_control_param['use_additional_texts'] == True:
    # Alternative method: clustering_texts_using_trainingset.
    clustering_test_data_method = clustering_control_param['clustering_test_data_method']
    cluster_size = clustering_control_param['cluster_size']
    # Chunk is cut here: the body of this branch continues beyond this view.
    if clustering_control_param['additional_texts'] == 'test_data':
            alpha=0.6, color='b')  # tail of a plt.bar(...) call started above this chunk
    plt.xlabel('Result')
    plt.ylabel('Scores')
    plt.title('Experiment analysis')
    # NOTE(review): five tick labels with 'F' appearing twice -- confirm the
    # intended metric names for the bar groups.
    plt.xticks(index + bar_width, ('Accuracy', 'F', 'Precision', 'Recall', 'F'))
    plt.ylim(0, 1)
    plt.tight_layout()
    plt.show()


if __name__ == "__main__":
    # TRUE LABELS
    from Utils import load_test_data
    _, true = load_test_data()
    # Predicted labels from the four experiments, pickled elsewhere.
    predict_label_1 = pickle.load(
        open("./acc_tmp/predict/predict_label_1.p", "rb"))
    predict_label_2 = pickle.load(
        open("./acc_tmp/predict/predict_label_2.p", "rb"))
    predict_label_3 = pickle.load(
        open("./acc_tmp/predict/predict_label_3.p", "rb"))
    predict_label_4 = pickle.load(
        open("./acc_tmp/predict/predict_label_4.p", "rb"))
    # Score each experiment's predictions against the true labels.
    analysis_result(predict_label_1, true)
    analysis_result(predict_label_2, true)
    analysis_result(predict_label_3, true)
    analysis_result(predict_label_4, true)
    # Chunk is cut mid-call: the open(...) argument continues beyond this view.
    predict_without_clustering = pickle.load(
def train_model(clc_factory, X, Y): print('start train_model...') # 设置随机状态,来得到确定性的行为 cv = ShuffleSplit(n=len(X), n_iter=1, test_size=0.0, random_state=0) # accuracy scores = [] # AUC pr_scores = [] # F1 score f1 = [] # 暂时先用for吧,仅仅迭代一次 for train, test in cv: # 测试方法一: do not clustering test data and training data # ''' X_train, Y_train = X[train], Y[train] X_test, Y_test = load_test_data() X_test, Y_test = np.array(X_test), np.array(Y_test) # clf = pickle.load(open("./acc_tmp/clf_all_data_noclustering.p", "rb")) clf = clc_factory() clf.fit(X_train, Y_train) pickle.dump(clf, open("./acc_tmp/clf.p", "wb")) # accuracy test_score = clf.score(X_test, Y_test) scores.append(test_score) proba = clf.predict_proba(X_test) precision, recall, pr_thresholds = precision_recall_curve(Y_test, proba[:, 1]) # AUC aera_uc = auc(recall, precision) pr_scores.append(aera_uc) #F1_score f1.append(f1_score(Y_test, clf.predict(X_test), average='macro')) summary = (np.mean(scores), np.mean(pr_scores), np.mean(f1)) # Area Under Curve (曲线下面的面积) print('正确率(Accuracy):%.3f\nP/R AUC值:%.3f\nF值(Macro-F score):%.3f' % (summary)) # 画图 pl.clf() pl.plot(recall, precision, label='Precision-Recall curve') pl.xlabel('Recall') pl.ylabel('Precision') pl.ylim([0.0, 1.05]) pl.xlim([0.0, 1.0]) pl.title('Precision-Recall Curve (AUC=%0.3f)' % aera_uc) pl.legend(loc="lower left") pl.show() # ''' ''' # 测试方法2, do not use clustering in test data, changed to specified test data X_train, Y_train = X[train], Y[train] clf = clc_factory() clf.fit(X_train, Y_train) pickle.dump(clf, open("./acc_tmp/clf_all_data.p", "wb")) X_test, Y_test=load_test_data() X_test,X_test_labels=build_clustered_testdata(X_test) # print(X_test_labels,len(X_test_labels)) X_test, Y_test=np.array(X_test), np.array(Y_test) true_labels=Y_test predict_labels=np.array(sentiment_map_cluster2tweets(clf.predict(X_test),X_test_labels)) precision,recall,fbeta_score,support=precision_recall_fscore_support(true_labels, predict_labels, average='binary') 
print('精确度(Precision):%.3f\n召回率:%.3f\nF值: %.3f'%(precision,recall,fbeta_score)) # ''' '''
# Vectorize the clustered training texts, load the candidate expansion
# texts, and build the pos-/neg-expanded versions of the test set.
# Module-level script fragment: names bound here may be used by unseen
# later code, so none are renamed.
X_train, Y_train = np.array(X_train), np.array(Y_train)
# NOTE(review): 'trian_vec' looks like a typo for 'train_vec'; kept as-is in
# case later (unseen) code refers to this exact spelling.
trian_vec = vectorizer.fit_transform(X_train)
# clustering test data
from candidate_content import get_candidate
expanding_pos_content, expanding_neg_content = get_candidate()
expanding_pos_content, expanding_neg_content = np.array(
    expanding_pos_content), np.array(expanding_neg_content)
# Reuse the vocabulary fitted on the training clusters (transform, not fit).
expanding_pos_content_vec, expanding_neg_content_vec = vectorizer.transform(
    expanding_pos_content), vectorizer.transform(expanding_neg_content)
# Load the test data.
from Utils import load_test_data
X_test, Y_test = load_test_data()
# The code below is only run on demand to generate the expanded test data;
# keep it commented out when not needed.
from candidate_content import get_candidate_dynamic
# get_candidate_dynamic(X_test, neg, 8, 'neg')
# get_candidate_dynamic(X_test, pos, 8, 'pos')
# exit()
# End of the temporary code. For better code organisation this file was also
# saved as a copy named dynamic_classifer.py.
from test_data_clustering import expand_text_list as expanding_method
# Expand every test text once with positive and once with negative
# candidate content, then vectorize both expanded corpora.
expanded_texts_with_pos, expanded_texts_with_neg = expanding_method(
    X_test, expanding_pos_content), expanding_method(X_test, expanding_neg_content)
expanded_texts_vec_with_pos, expanded_texts_vec_with_neg = vectorizer.transform(
    expanded_texts_with_pos), vectorizer.transform(expanded_texts_with_neg)
def train_model(clc_factory, X, Y): print('start train_model...') # 设置随机状态,来得到确定性的行为 cv = ShuffleSplit(n=len(X), n_iter=1, test_size=0.0, random_state=0) # accuracy scores = [] # AUC pr_scores = [] # F1 score f1 = [] # 暂时先用for吧,仅仅迭代一次 for train, test in cv: # 测试方法一: do not clustering test data and training data # ''' X_train, Y_train = X[train], Y[train] X_test, Y_test = load_test_data() X_test, Y_test = np.array(X_test), np.array(Y_test) # clf = pickle.load(open("./acc_tmp/clf_all_data_noclustering.p", "rb")) clf = clc_factory() clf.fit(X_train, Y_train) pickle.dump(clf, open("./acc_tmp/clf.p", "wb")) # accuracy test_score = clf.score(X_test, Y_test) scores.append(test_score) proba = clf.predict_proba(X_test) precision, recall, pr_thresholds = precision_recall_curve( Y_test, proba[:, 1]) # AUC aera_uc = auc(recall, precision) pr_scores.append(aera_uc) #F1_score f1.append(f1_score(Y_test, clf.predict(X_test), average='macro')) summary = (np.mean(scores), np.mean(pr_scores), np.mean(f1)) # Area Under Curve (曲线下面的面积) print('正确率(Accuracy):%.3f\nP/R AUC值:%.3f\nF值(Macro-F score):%.3f' % (summary)) # 画图 pl.clf() pl.plot(recall, precision, label='Precision-Recall curve') pl.xlabel('Recall') pl.ylabel('Precision') pl.ylim([0.0, 1.05]) pl.xlim([0.0, 1.0]) pl.title('Precision-Recall Curve (AUC=%0.3f)' % aera_uc) pl.legend(loc="lower left") pl.show() # ''' ''' # 测试方法2, do not use clustering in test data, changed to specified test data X_train, Y_train = X[train], Y[train] clf = clc_factory() clf.fit(X_train, Y_train) pickle.dump(clf, open("./acc_tmp/clf_all_data.p", "wb")) X_test, Y_test=load_test_data() X_test,X_test_labels=build_clustered_testdata(X_test) # print(X_test_labels,len(X_test_labels)) X_test, Y_test=np.array(X_test), np.array(Y_test) true_labels=Y_test predict_labels=np.array(sentiment_map_cluster2tweets(clf.predict(X_test),X_test_labels)) precision,recall,fbeta_score,support=precision_recall_fscore_support(true_labels, predict_labels, average='binary') 
print('精确度(Precision):%.3f\n召回率:%.3f\nF值: %.3f'%(precision,recall,fbeta_score)) # ''' '''
# Build the cluster-level training set, vectorize it, load the candidate
# expansion texts, and produce the pos-/neg-expanded test corpora.
# Module-level script fragment: names bound here may be used by unseen
# later code, so none are renamed.
# One label per cluster: 1 for positive clusters, 0 for negative clusters.
X_train, Y_train = clustered_pos + clustered_neg, [1] * num_pos_cluster + [0] * num_neg_cluster
X_train, Y_train = np.array(X_train), np.array(Y_train)
# NOTE(review): 'trian_vec' looks like a typo for 'train_vec'; kept as-is in
# case later (unseen) code refers to this exact spelling.
trian_vec = vectorizer.fit_transform(X_train)
# clustering test data
from candidate_content import get_candidate
expanding_pos_content, expanding_neg_content = get_candidate()
expanding_pos_content, expanding_neg_content = np.array(expanding_pos_content), np.array(expanding_neg_content)
# Reuse the vocabulary fitted on the training clusters (transform, not fit).
expanding_pos_content_vec, expanding_neg_content_vec = vectorizer.transform(
    expanding_pos_content), vectorizer.transform(expanding_neg_content)
# Load the test data.
from Utils import load_test_data
X_test, Y_test = load_test_data()
# The code below is only run on demand to generate the expanded test data;
# keep it commented out when not needed.
from candidate_content import get_candidate_dynamic
# get_candidate_dynamic(X_test, neg, 8, 'neg')
# get_candidate_dynamic(X_test, pos, 8, 'pos')
# exit()
# End of the temporary code. For better code organisation this file was also
# saved as a copy named dynamic_classifer.py.
from test_data_clustering import expand_text_list as expanding_method
# Expand every test text once with positive and once with negative
# candidate content, then vectorize both expanded corpora.
expanded_texts_with_pos, expanded_texts_with_neg = expanding_method(X_test, expanding_pos_content), expanding_method(X_test, expanding_neg_content)
expanded_texts_vec_with_pos, expanded_texts_vec_with_neg = vectorizer.transform(
    expanded_texts_with_pos), vectorizer.transform(expanded_texts_with_neg)