Ejemplo n.º 1
0
    def test_model(self, feature_list, testing_type="cv", outpath="roc.png"):
        """
        Evaluate the configured classifiers and plot their ROC/AUC results.

        feature_list is forwarded unchanged to ``self.get_features``.

        testing_type selects the evaluation routine:
            "cv": ``self.test_with_cv`` (cross validation)
            anything else: ``self.test_on_train`` (test on the training data)

        outpath is handed to ``self.display_output`` as ``path``.
        """
        features = self.get_features(feature_list)
        labels = self.get_numeric_label().values

        # Only these two classifiers are enabled; the alternatives that were
        # experimented with (decision tree, SVM, logistic regression,
        # gradient boosting, naive bayes) are switched off.
        forest = ensemble.RandomForestClassifier(n_estimators=100,
                                                 max_depth=2,
                                                 random_state=0)
        baseline = simpleclassifier.Simple1DClassifier()
        classifiers = {"random forest": forest, "simple": baseline}

        # Both routines take the same arguments and return the same triple.
        evaluate = (self.test_with_cv
                    if testing_type == "cv" else self.test_on_train)
        fpr_list, tpr_list, auc_list = evaluate(classifiers, features, labels)

        self.display_output(fpr_list, tpr_list, auc_list, path=outpath)
Ejemplo n.º 2
0
    def boxplot_auc_distance(self):
        """
        Collect 100 cross-validated AUC values per distance feature and plot them.

        For each of "distance-numeric" and "distance-categorical" the features
        and labels are rebuilt and ``self.test_with_cv`` is run 100 times; the
        first entry of the returned AUC collection is recorded each time.
        The resulting dict (feature name -> list of AUCs) is passed to
        ``self.scatter_boxplot_dict``.
        """
        forest = ensemble.RandomForestClassifier(n_estimators=100,
                                                 max_depth=2,
                                                 random_state=0)
        baseline = simpleclassifier.Simple1DClassifier()
        classifiers = {"random forest": forest, "simple": baseline}

        auc_by_feature = {}
        for feature_name in ("distance-numeric", "distance-categorical"):
            scores = []
            auc_by_feature[feature_name] = scores
            for _ in range(100):
                features = self.get_features(feature_name)
                labels = self.get_numeric_label().values
                _, _, aucs = self.test_with_cv(classifiers, features, labels)
                # NOTE(review): index 0 is presumably the random forest
                # (first classifier in the dict) -- confirm against
                # test_with_cv's return format.
                scores.append(aucs[0])

        self.scatter_boxplot_dict(auc_by_feature)
Ejemplo n.º 3
0
    def compare_distance_features(self, iter=10, fpr_lim=100):
        """
        Compare random-forest AUCs of the numeric vs. categorical distance feature.

        iter: number of cross-validation runs per feature type.
        fpr_lim: forwarded to ``self.test_with_cv`` as ``fpr_lim``.

        The AUCs of the "random forest" classifier are gathered per feature
        type, drawn with ``utils.scatter_boxplot_dict`` and then compared with
        ``utils.wilcox_test`` (two-sided, "greater" and "less"); the p-values
        are printed.
        """
        forest = ensemble.RandomForestClassifier(n_estimators=100,
                                                 max_depth=2,
                                                 random_state=0)
        baseline = simpleclassifier.Simple1DClassifier()
        classifiers = {"random forest": forest, "simple": baseline}

        auc_by_feature = {}
        for feature_list in (["dist-numeric"], ["dist-categorical"]):
            feature_name = feature_list[0]
            scores = []
            auc_by_feature[feature_name] = scores
            for round_no in range(1, iter + 1):
                print("Processing using %s, iteration %d" %
                      (feature_list, round_no))
                features = self.get_features(feature_list)
                labels = self.get_numeric_label().values
                _, _, aucs = self.test_with_cv(classifiers,
                                               features,
                                               labels,
                                               fpr_lim=fpr_lim)
                scores.append(aucs['random forest'])

        print("Making scatter boxplot for each feature...")
        utils.scatter_boxplot_dict(auc_by_feature, ylabel="AUC")

        numeric_aucs = auc_by_feature["dist-numeric"]
        categorical_aucs = auc_by_feature["dist-categorical"]

        # (message template, extra keyword arguments for wilcox_test)
        reports = (
            ("Two sided wilcox test, pval: %.4f", {}),
            ("Numeric > Categorical test, pval: %.4f", {
                "alternative": "greater"
            }),
            ("Numeric < Categorical test, pval: %.4f", {
                "alternative": "less"
            }),
        )
        for template, options in reports:
            print(template % utils.wilcox_test(numeric_aucs, categorical_aucs,
                                               **options))
Ejemplo n.º 4
0
    def roc_simple_clf(self, n_splits=1):
        """
        Build ROC curves for the simple 1D classifier by sweeping its threshold.

        Every unique value of the "distance" column is used, in ascending
        order, as a threshold for ``Simple1DClassifier.fit_on_thres``; the
        resulting (fpr, tpr) points are framed by (0, 0) and (1, 1).

        n_splits: when greater than 1, a shuffled ``KFold`` with that many
            splits is used; otherwise the classifier is fitted and tested on
            the full data set once.

        Returns three lists with one entry per fold: the fpr lists, the tpr
        lists and the AUC values from ``metrics.auc``.
        """
        # still numeric for now
        x_train = self.training["distance"].values
        y_train = self.get_numeric_label().values
        thresholds = sorted(self.training['distance'].unique())

        if n_splits > 1:
            folds = model_selection.KFold(n_splits=n_splits,
                                          shuffle=True).split(x_train, y_train)
        else:
            # Single "fold": train and test on every sample.
            folds = [(range(len(x_train)), range(len(y_train)))]

        fpr_all, tpr_all, auc_all = [], [], []

        for train_idx, test_idx in folds:
            roc_points = []
            for threshold in thresholds:
                clf = simpleclassifier.Simple1DClassifier()
                clf.fit_on_thres(x_train[train_idx], y_train[train_idx],
                                 threshold)
                predicted = clf.test(x_train[test_idx])
                roc_points.append(
                    calculate_fpr_tpr(y_train[test_idx], predicted))

            # Anchor the curve at both corners of the ROC square.
            fold_fpr = [0] + [fpr for fpr, _ in roc_points] + [1]
            fold_tpr = [0] + [tpr for _, tpr in roc_points] + [1]

            auc_all.append(metrics.auc(fold_fpr, fold_tpr))
            fpr_all.append(fold_fpr)
            tpr_all.append(fold_tpr)

        return fpr_all, tpr_all, auc_all
Ejemplo n.º 5
0
    def test_seq_features(self, outpath="auc.png"):
        """
        Plot cross-validated ROC curves for every feature configuration.

        The configurations are the two plain distance features followed by
        every combination of span_in (1-8), span_out (1-3) and distance mode
        ("distance-categorical", "distance-numeric", "None") passed to
        ``self.get_features_custom``.  Each configuration gets one subplot
        with one ROC curve per classifier; subplots are laid out on pages of
        ``numrow`` x ``numcol`` axes.

        outpath: file the pages are written to through ``PdfPages``.
            NOTE(review): the output is therefore a multi-page PDF even though
            the default name ends in ".png" -- confirm whether the default
            should be changed.
        """
        clfs = {
            "decision tree":
            tree.DecisionTreeClassifier(),
            "random forest":
            ensemble.RandomForestClassifier(n_estimators=100,
                                            max_depth=2,
                                            random_state=0),
            "simple":
            simpleclassifier.Simple1DClassifier(),
        }

        span_in_list = [1, 2, 3, 4, 5, 6, 7, 8]
        span_out_list = [1, 2, 3]
        dlist = ["distance-categorical", "distance-numeric", "None"]
        combs = [{
            "distance": "distance-numeric"
        }, {
            "distance": "distance-categorical"
        }]

        for x1 in span_in_list:
            for x2 in span_out_list:
                for x3 in dlist:
                    combs.append({"span_in": x1, "span_out": x2, "wdist": x3})

        classifier_names = list(clfs.keys())

        # we only need to make y_train once
        y_train = self.get_numeric_label().values

        numcol = 2  # adjust the number of columns in the plot
        numrow = 2  # adjust the number of rows in the plot
        per_page = numrow * numcol

        # ``fig`` is the page currently being filled; None means there is no
        # unsaved page.
        fig = None
        with PdfPages(outpath) as pdf:
            for idx, comb in enumerate(combs):
                # 1-based position of this subplot on the current page
                slot = idx % per_page + 1
                if slot == 1:
                    fig = plt.figure(figsize=(12, 12))
                    fig.subplots_adjust(hspace=0.4, wspace=0.5)

                if "distance" in comb:
                    x_train = self.get_features(comb["distance"])
                else:
                    x_train = self.get_features_custom(comb["span_out"],
                                                       comb["span_in"],
                                                       comb["wdist"])

                fpr_list, tpr_list, auc_list = self.test_with_cv(
                    clfs, x_train, y_train)

                # add_subplot expects (nrows, ncols, index)
                ax = fig.add_subplot(numrow, numcol, slot)

                ax.plot([0, 1], [0, 1], linestyle="--", color="red", alpha=0.1)
                # Show the ROC curves for all classifiers on the same plot
                for clf_name, fpr, tpr, auc in zip(classifier_names, fpr_list,
                                                   tpr_list, auc_list):
                    ax.plot(fpr,
                            tpr,
                            lw=2,
                            alpha=0.4,
                            label='%s, AUC %f' % (clf_name, auc))

                ax.xaxis.set_label_text('False Positive Rate')
                ax.yaxis.set_label_text('True Positive Rate')
                if "distance" in comb:
                    ax.set_title("distance type %s" % comb["distance"])
                else:
                    ax.set_title(
                        'span_out %d, span_in %d, with_dist %s' %
                        (comb["span_out"], comb["span_in"], comb["wdist"]))
                ax.legend(loc="lower right")

                if slot == per_page:
                    # Page is full: write it out and release the figure.
                    pdf.savefig(fig)
                    plt.close(fig)
                    fig = None

            # Flush a partially filled last page only; a full page has
            # already been saved above, and there is no page at all when
            # combs is empty.
            if fig is not None:
                pdf.savefig(fig)
                plt.close(fig)