def store_classifier(clf, trainset, filepath):
    """Train a wrapped classifier and write its predictions to a text file.

    ``clf`` is wrapped in ``SklearnClassifier`` and trained on ``trainset``.
    The trained classifier then scores ``extract_features(sentiment)``, where
    ``sentiment`` is a module-level variable. Despite the function name, the
    classifier object itself is not persisted; only its predictions are.

    One line is written per (prediction, ``sen_cur`` entry) pair, in the form
    ``<P('pos')>\\t<P('neg')>\\t<entry>\\n``. ``sen_cur`` is a module-level
    sequence; its entries must be strings. If the two sequences differ in
    length, ``zip`` stops at the shorter one.

    Args:
        clf: Estimator handed to ``SklearnClassifier``.
        trainset: Training data passed unchanged to ``classifier.train``.
        filepath: Destination path; an existing file is truncated.
    """
    classifier = SklearnClassifier(clf)
    classifier.train(trainset)

    pred = classifier.prob_classify_many(extract_features(sentiment))

    # Write the results to the output file. The context manager closes the
    # handle even when a probability lookup or a write raises part-way through.
    # NOTE(review): the platform default encoding is kept for backward
    # compatibility; consider an explicit encoding if entries are non-ASCII.
    with open(filepath, 'w') as p_file:
        for dist, sentence in zip(pred, sen_cur):
            fields = [str(dist.prob('pos')), str(dist.prob('neg')), sentence]
            p_file.write('\t'.join(fields) + '\n')
def plot_ROC(classifier):
    """Plot 6-fold cross-validated ROC curves for ``classifier`` and save them.

    ``classifier`` is wrapped in ``SklearnClassifier``. The module-level
    ``dev_roc`` / ``tag_dev_roc`` pair is split into 6 stratified folds. For
    each fold the wrapper is trained on ``for_roc_plot[train]`` and scored on
    ``dev_roc[test]``, and that fold's ROC curve (positive label ``'pos'``) is
    drawn. The mean curve over all folds and a diagonal reference line are
    drawn as well.

    The figure is saved under ``../out/ROC_curves/`` using the module-level
    counter ``k`` in the file name, then shown; ``k`` is incremented
    afterwards.

    Args:
        classifier: Estimator handed to ``SklearnClassifier``.

    NOTE(review): assumes ``for_roc_plot`` is index-aligned with ``dev_roc``
    and ``tag_dev_roc`` (the same fold indices are applied to all three), and
    that the output directory already exists - confirm against the caller.
    """
    global k

    cv = StratifiedKFold(n_splits=6)  # the data is split into 6 folds below
    classifier = SklearnClassifier(classifier)

    # These conversions do not depend on the fold, so build them once rather
    # than on every iteration.
    labeled_samples = np.array(for_roc_plot)
    dev1 = np.array(dev_roc)
    tag_dev1 = np.array(tag_dev_roc)

    # Accumulators for the mean ROC curve: TPR summed on a fixed FPR grid.
    mean_fpr = np.linspace(0, 1, 100)
    mean_tpr = np.zeros_like(mean_fpr)
    n_folds = 0

    # cv.split yields matching index arrays for the samples and their labels.
    for i, (train, test) in enumerate(cv.split(dev_roc, tag_dev_roc)):
        n_folds += 1
        classifier.train(labeled_samples[train])

        # Probability of the 'pos' label for every held-out sample.
        pred_ = classifier.prob_classify_many(dev1[test])
        probas_ = [dist.prob('pos') for dist in pred_]

        # roc_curve also returns thresholds; only FPR and TPR are used here.
        fpr, tpr, _ = roc_curve(tag_dev1[test], probas_, pos_label='pos')

        # Interpolate this fold's TPR onto the shared FPR grid and pin the
        # curve's start to 0 before adding it to the running sum.
        fold_tpr = np.interp(mean_fpr, fpr, tpr)
        fold_tpr[0] = 0.0
        mean_tpr += fold_tpr

        roc_auc = auc(fpr, tpr)  # area under this fold's curve
        # The fold index is an integer, so it is formatted without decimals.
        plt.plot(fpr,
                 tpr,
                 lw=1,
                 label='ROC fold {0} (area = {1:.2f})'.format(i, roc_auc))

    # Diagonal reference line.
    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')

    mean_tpr /= n_folds  # average over the folds
    mean_tpr[-1] = 1.0  # force the mean curve to end at (1, 1)
    mean_auc = auc(mean_fpr, mean_tpr)

    plt.plot(mean_fpr,
             mean_tpr,
             'k--',
             label='Mean ROC (area = {0:.2f})'.format(mean_auc),
             lw=2)

    # Slightly widened axis limits keep the curves off the plot border.
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(
        'Receiver operating characteristic curve using {0}'.format(classifier))
    plt.legend(loc="lower right")

    # Save before showing so the written image does not depend on the state
    # of the figure after show() returns.
    fig = plt.gcf()
    fig.savefig('../out/ROC_curves/第{0}张'.format(k + 1), dpi=400)
    plt.show()
    k = k + 1
Ejemplo n.º 3
0
            print("Labels :", LogisticRegression_classifier.labels())
            print("Type : ", type(LogisticRegression_classifier))

            dictum = [tupl[0] for tupl in testing_set]
            print("dictum : ", dictum)
            try:
                print("classify many:",
                      LogisticRegression_classifier.classify_many(dictum))
            except:
                print("classify many erreur \n", "Type testing_set: ",
                      type(dictum), "\n testing_set :", dictum)

            try:
                print("prob_classify_many:",
                      LogisticRegression_classifier.prob_classify_many(dictum))
                for probdisti in LogisticRegression_classifier.prob_classify_many(
                        dictum):
                    list_of_samples = probdisti.samples()
                    for sample in list_of_samples:
                        print("Sample: ", sample, " Prob : ",
                              probdisti.prob(sample))
            except:
                print("prob_classify_many erreur \n", "Type testing_set:",
                      type(dictum))

            save_classifier = open("LogisticRegression_classifier.pickle",
                                   "wb")
            pickle.dump(LogisticRegression_classifier, save_classifier)
            save_classifier.close()
            '''