def test_model(self, feature_list, testing_type="cv", outpath="roc.png"):
    """Evaluate the configured classifiers and write their ROC curves.

    Args:
        feature_list: feature specification forwarded to ``self.get_features``.
        testing_type: ``"cv"`` runs cross validation; any other value tests
            directly on the training data.
        outpath: file path for the ROC plot produced by ``display_output``.
    """
    x_train = self.get_features(feature_list)
    y_train = self.get_numeric_label().values
    # Classifiers currently under comparison (others were disabled upstream).
    clfs = {
        "random forest": ensemble.RandomForestClassifier(
            n_estimators=100, max_depth=2, random_state=0),
        "simple": simpleclassifier.Simple1DClassifier(),
    }
    # Pick the evaluation strategy, then run it once.
    evaluate = self.test_with_cv if testing_type == "cv" else self.test_on_train
    fpr_list, tpr_list, auc_list = evaluate(clfs, x_train, y_train)
    self.display_output(fpr_list, tpr_list, auc_list, path=outpath)
def boxplot_auc_distance(self):
    """Compare AUC distributions for numeric vs categorical distance features.

    Runs 100 cross-validated evaluations per feature type and draws a
    scatter/box plot of the collected AUC values.
    """
    clfs = {
        "random forest": ensemble.RandomForestClassifier(
            n_estimators=100, max_depth=2, random_state=0),
        "simple": simpleclassifier.Simple1DClassifier(),
    }
    # NOTE(review): compare_distance_features uses the names "dist-numeric" /
    # "dist-categorical", indexes auc_list by classifier name, and calls
    # utils.scatter_boxplot_dict — this method differs on all three points;
    # confirm which convention matches the current test_with_cv API.
    auc_dict = {"distance-numeric": [], "distance-categorical": []}
    for dist, scores in auc_dict.items():
        for _ in range(100):
            x_train = self.get_features(dist)
            y_train = self.get_numeric_label().values
            _, _, auc_list = self.test_with_cv(clfs, x_train, y_train)
            scores.append(auc_list[0])
    self.scatter_boxplot_dict(auc_dict)
def compare_distance_features(self, iter=10, fpr_lim=100):
    """Test whether numeric distance features outperform categorical ones.

    For each feature type, runs ``iter`` cross-validated random-forest
    evaluations, box-plots the AUC distributions, then prints two-sided
    and one-sided Wilcoxon test p-values.

    Args:
        iter: number of CV repetitions per feature type. (Name shadows the
            builtin but is kept for backward compatibility with callers
            passing it by keyword.)
        fpr_lim: FPR cutoff forwarded to ``test_with_cv``.
    """
    clfs = {
        "random forest": ensemble.RandomForestClassifier(
            n_estimators=100, max_depth=2, random_state=0),
        "simple": simpleclassifier.Simple1DClassifier(),
    }
    auc_dict = {}
    for dist_type in (["dist-numeric"], ["dist-categorical"]):
        dname = dist_type[0]
        scores = []
        for i in range(iter):
            print("Processing using %s, iteration %d" % (dist_type, i + 1))
            x_train = self.get_features(dist_type)
            y_train = self.get_numeric_label().values
            _, _, auc_list = self.test_with_cv(
                clfs, x_train, y_train, fpr_lim=fpr_lim)
            # Only the random-forest AUC enters the comparison.
            scores.append(auc_list['random forest'])
        auc_dict[dname] = scores
    print("Making scatter boxplot for each feature...")
    utils.scatter_boxplot_dict(auc_dict, ylabel="AUC")
    numeric = auc_dict["dist-numeric"]
    categorical = auc_dict["dist-categorical"]
    print("Two sided wilcox test, pval: %.4f"
          % utils.wilcox_test(numeric, categorical))
    print("Numeric > Categorical test, pval: %.4f"
          % utils.wilcox_test(numeric, categorical, alternative="greater"))
    print("Numeric < Categorical test, pval: %.4f"
          % utils.wilcox_test(numeric, categorical, alternative="less"))
def roc_simple_clf(self, n_splits=1):
    """Build ROC curves for the simple 1-D threshold classifier.

    Sweeps every unique training distance as a decision threshold and
    records one (FPR, TPR) point per threshold, per CV split.

    Args:
        n_splits: if > 1, evaluate over a shuffled KFold; otherwise a
            single degenerate split that trains and tests on all rows.

    Returns:
        (fpr_all, tpr_all, auc_all): per-split lists of FPR points,
        TPR points, and the AUC of each resulting curve.
    """
    # still numeric for now
    x_train = self.training["distance"].values
    y_train = self.get_numeric_label().values
    # Candidate thresholds: every distinct distance seen in training.
    distances = self.training['distance'].unique()
    if n_splits > 1:
        cv = model_selection.KFold(n_splits=n_splits, shuffle=True)
        split = cv.split(x_train, y_train)
    else:
        # Degenerate "split": both train and test index the full data set.
        split = [(range(len(x_train)), range(len(y_train)))]
    fpr_all = []
    tpr_all = []
    auc_all = []
    for train, test in split:
        # Anchor the curve at (0, 0) ... (1, 1) so AUC integrates the
        # full range.
        fpr_list = [0]
        tpr_list = [0]
        for dist in sorted(distances):
            scf = simpleclassifier.Simple1DClassifier()
            scf.fit_on_thres(x_train[train], y_train[train], dist)
            y_pred = scf.test(x_train[test])
            #print("Accuracy %f" % metrics.accuracy_score(ytrain, ypred))
            # NOTE(review): points are appended in threshold order, which is
            # assumed to yield monotonically increasing FPR for metrics.auc —
            # confirm calculate_fpr_tpr guarantees this ordering.
            fpr, tpr = calculate_fpr_tpr(y_train[test], y_pred)
            fpr_list.append(fpr)
            tpr_list.append(tpr)
        fpr_list.append(1)
        tpr_list.append(1)
        auc = metrics.auc(fpr_list, tpr_list)
        auc_all.append(auc)
        fpr_all.append(fpr_list)
        tpr_all.append(tpr_list)
    return fpr_all, tpr_all, auc_all
def test_seq_features(self, outpath="auc.png"):
    """Evaluate sequence-feature combinations and write ROC plots to a PDF.

    Builds one ROC subplot per feature combination (the two distance-only
    features plus every span_in x span_out x distance-type combination),
    paginating numrow x numcol subplots per PDF page.

    Args:
        outpath: path of the multi-page PDF written via PdfPages.
    """
    clfs = {
        "decision tree": tree.DecisionTreeClassifier(),
        "random forest": ensemble.RandomForestClassifier(
            n_estimators=100, max_depth=2, random_state=0),
        "simple": simpleclassifier.Simple1DClassifier(),
    }
    span_in_list = [1, 2, 3, 4, 5, 6, 7, 8]
    span_out_list = [1, 2, 3]
    dlist = ["distance-categorical", "distance-numeric", "None"]
    combs = [{"distance": "distance-numeric"},
             {"distance": "distance-categorical"}]
    for x1 in span_in_list:
        for x2 in span_out_list:
            for x3 in dlist:
                combs.append({"span_in": x1, "span_out": x2, "wdist": x3})
    classifier_names = list(clfs.keys())
    # we only need to make y_train once
    y_train = self.get_numeric_label().values
    n = 0        # subplot slot on the current page; 0 means "start a new page"
    numcol = 2   # adjust the number of columns in the plot
    numrow = 2   # adjust the number of rows in the plot
    with PdfPages(outpath) as pdf:
        fig = None
        for comb in combs:
            # n is reset after each full page, so open a fresh figure here.
            if n == 0:
                fig = plt.figure(figsize=(12, 12))
                fig.subplots_adjust(hspace=0.4, wspace=0.5)
            n += 1
            if "distance" in comb:
                x_train = self.get_features(comb["distance"])
            else:
                x_train = self.get_features_custom(
                    comb["span_out"], comb["span_in"], comb["wdist"])
            fpr_list, tpr_list, auc_list = self.test_with_cv(
                clfs, x_train, y_train)
            # add_subplot signature is (nrows, ncols, index).
            ax = fig.add_subplot(numrow, numcol, n)
            # Diagonal chance line for reference.
            ax.plot([0, 1], [0, 1], linestyle="--", color="red", alpha=0.1)
            # Show the ROC curves for all classifiers on the same plot.
            for i in range(len(fpr_list)):
                ax.plot(fpr_list[i], tpr_list[i], lw=2, alpha=0.4,
                        label='%s, AUC %f' % (classifier_names[i],
                                              auc_list[i]))
            ax.xaxis.set_label_text('False Positive Rate')
            ax.yaxis.set_label_text('True Positive Rate')
            if "distance" in comb:
                # BUG FIX: the title literal previously contained a raw line
                # break inside the string (a syntax error); use an escaped \n.
                ax.set_title("distance type\n%s" % comb["distance"])
            else:
                ax.set_title('span_out %d, span_in %d, with_dist %s'
                             % (comb["span_out"], comb["span_in"],
                                comb["wdist"]))
            ax.legend(loc="lower right")
            if n == numcol * numrow:
                pdf.savefig(fig)
                plt.close(fig)
                n = 0
        # BUG FIX: save only a trailing, partially filled page. The original
        # unconditionally saved again after the loop, re-saving an already
        # closed figure whenever the last page came out exactly full.
        if n != 0:
            pdf.savefig(fig)
            plt.close(fig)