def store_classifier(clf, trainset, filepath):
    """Train a classifier and write per-sample class probabilities to a file.

    ``clf`` is wrapped in ``SklearnClassifier`` and trained on ``trainset``.
    The trained classifier then scores the features extracted from the
    module-level ``sentiment`` data. One tab-separated line is written for
    each (prediction, ``sen_cur`` item) pair:
    probability of 'pos', probability of 'neg', the ``sen_cur`` item.

    Args:
        clf: Estimator handed to ``SklearnClassifier``.
        trainset: Training data passed to ``SklearnClassifier.train``.
        filepath: Path of the output file; it is truncated if it exists.
    """
    classifier = SklearnClassifier(clf)
    classifier.train(trainset)
    predictions = classifier.prob_classify_many(extract_features(sentiment))

    # Write the results to the output file. The context manager guarantees
    # the handle is closed even if a prob()/write() call raises part-way.
    with open(filepath, 'w+') as out_file:
        # zip() stops at the shorter of the two sequences.
        for dist, text in zip(predictions, sen_cur):
            out_file.write(
                str(dist.prob('pos')) + '\t'
                + str(dist.prob('neg')) + '\t'
                + text + '\n'
            )
def plot_ROC(classifier):
    """Plot per-fold and mean ROC curves for ``classifier`` and save the figure.

    The module-level data ``dev_roc`` / ``tag_dev_roc`` is split into 6
    stratified folds. For every fold the wrapped classifier is trained on the
    training indices of ``for_roc_plot`` and scored on the test indices of
    ``dev_roc``; the probability of the 'pos' label is used as the ROC score.
    Each fold's curve, the chance diagonal and the mean curve are drawn on
    the current matplotlib figure, which is shown and then saved under
    ``../out/ROC_curves/``.

    Side effects: reads and increments the module-level counter ``k``, which
    numbers the saved figures.

    Args:
        classifier: Estimator handed to ``SklearnClassifier``.
    """
    global k

    cv = StratifiedKFold(n_splits=6)  # splits the data into 6 folds below
    wrapped = SklearnClassifier(classifier)

    # Loop-invariant conversions: build the index-able arrays once instead of
    # once per fold.
    labelled_samples = np.array(for_roc_plot)
    features = np.array(dev_roc)
    labels = np.array(tag_dev_roc)

    # Accumulators for the mean ROC curve, sampled on a common FPR grid.
    mean_fpr = np.linspace(0, 1, 100)
    mean_tpr = 0.0
    fold_count = 0

    # cv.split yields matching train/test index arrays for each fold.
    for fold, (train, test) in enumerate(cv.split(dev_roc, tag_dev_roc)):
        fold_count += 1
        wrapped.train(labelled_samples[train])

        # Probability of the 'pos' label for every test sample.
        pos_probs = [
            dist.prob('pos')
            for dist in wrapped.prob_classify_many(features[test])
        ]

        # roc_curve returns false-positive rates, true-positive rates and
        # thresholds; only the first two are used here.
        fpr, tpr, _ = roc_curve(labels[test], pos_probs, pos_label='pos')

        # Interpolate this fold's TPR onto the common FPR grid and add it to
        # the running sum (averaged after the loop).
        mean_tpr += np.interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0  # force the curve to start at (0, 0)

        roc_auc = auc(fpr, tpr)  # area under this fold's curve
        # The fold index is an integer, so format it as-is rather than
        # with '.2f' (which rendered "fold 0.00").
        plt.plot(fpr, tpr, lw=1,
                 label='ROC fold {0} (area = {1:.2f})'.format(fold, roc_auc))

    # Chance-level diagonal.
    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')

    mean_tpr /= fold_count  # average over the folds
    mean_tpr[-1] = 1.0  # force the curve to end at (1, 1)
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k--',
             label='Mean ROC (area = {0:.2f})'.format(mean_auc), lw=2)

    # Slightly widened axis limits keep the curves off the plot border.
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(
        'Receiver operating characteristic curve using {0}'.format(wrapped))
    plt.legend(loc="lower right")

    # Grab the figure before show() so it can still be saved afterwards.
    fig = plt.gcf()
    plt.show()
    fig.savefig('../out/ROC_curves/第{0}张'.format(k + 1), dpi=400)  # save image
    k = k + 1
print("Labels :", LogisticRegression_classifier.labels()) print("Type : ", type(LogisticRegression_classifier)) dictum = [tupl[0] for tupl in testing_set] print("dictum : ", dictum) try: print("classify many:", LogisticRegression_classifier.classify_many(dictum)) except: print("classify many erreur \n", "Type testing_set: ", type(dictum), "\n testing_set :", dictum) try: print("prob_classify_many:", LogisticRegression_classifier.prob_classify_many(dictum)) for probdisti in LogisticRegression_classifier.prob_classify_many( dictum): list_of_samples = probdisti.samples() for sample in list_of_samples: print("Sample: ", sample, " Prob : ", probdisti.prob(sample)) except: print("prob_classify_many erreur \n", "Type testing_set:", type(dictum)) save_classifier = open("LogisticRegression_classifier.pickle", "wb") pickle.dump(LogisticRegression_classifier, save_classifier) save_classifier.close() '''