def model(timestamps, predictors, classes, classifier=None,
          prediction_attribute='predict_proba', hyperparams=None,
          roc_bounds=None, verbose=False):
    '''
    Creates several models using leave-one-year-out cross validation.

    For every year between the earliest year in timestamps and
    min(latest year, 2014), a fresh classifier is fit on all rows that are
    not from that year and scored on the rows that are. Two new matplotlib
    figures are created, and each year's ROC and PR curves are drawn onto
    them as a side-effect (via viz.roc and viz.precision_recall).

    Parameters
    ----------
    timestamps : Nx1 pandas series of timestamps. Each element should have
                 a "year" attribute.
    predictors : NxM pandas DataFrame, all values should be numeric,
                 and there should be no NaN values.
    classes    : Nx1 array like of binary outcomes, e.g. True or False.
    classifier : sklearn classifier, should have the attributes "fit"
                 and "predict_proba" at the least. It is called as
                 classifier(**hyperparams), so pass the class (or a
                 factory), not an instance. Defaults to
                 sklearn.ensemble.GradientBoostingClassifier.
    prediction_attribute : Name of the attribute of the classifier that
                 returns the probability of belonging to the positive
                 class. Column 1 of its output is used.
    hyperparams : Dictionary of hyper parameters to pass to the classifier
                 method. Defaults to an empty dictionary.
    roc_bounds : [min, max] values to use when computing partial AUC.
                 Forwarded unchanged to viz.roc as "bounds".
    verbose    : True if the clf.feature_importances_ should be printed.

    Returns
    -------
    clfs     : Dictionary of (year, classifier) pairs, where the classifier
               is the model found by leaving the specified year out of the
               training set.
    auc_rocs : List of the ROC AUC values (floats) reported by viz.roc,
               one per held-out year, in increasing year order.
    roc_ax   : the matplotlib axes object containing all of the ROC curves.
    pr_ax    : the matplotlib axes object containing all of the PR curves.
    '''
    if classifier is None:
        classifier = sklearn.ensemble.GradientBoostingClassifier
    if hyperparams is None:
        hyperparams = {}

    # Only the year is needed to build the cross validation folds.
    timestamps = timestamps.map(lambda x: x.year)

    start = timestamps.min()
    stop = timestamps.max()
    stop = min(stop, 2014)  # do not include 2015

    roc_fig, roc_ax = plt.subplots(1, figsize=[12, 9])
    pr_fig, pr_ax = plt.subplots(1, figsize=[12, 9])

    # Keep the axes in the left part of each figure; the free space on the
    # right is presumably reserved for a legend drawn by the caller.
    roc_fig.subplots_adjust(left=0.07, right=0.67)
    pr_fig.subplots_adjust(left=0.07, right=0.67)

    clfs = dict()
    auc_rocs = []

    for yr in range(start, stop + 1):
        is_not_yr = timestamps != yr
        train_indices = np.array(is_not_yr)
        test_indices = np.array(~is_not_yr)

        clf = classifier(**hyperparams)
        # .loc with a boolean mask replaces the removed DataFrame.ix indexer.
        clf.fit(predictors.loc[train_indices, :], classes[train_indices])
        clfs[yr] = clf

        predict = getattr(clf, prediction_attribute)
        # Column 1 holds the score for the positive class.
        predictions = predict(predictors.loc[test_indices, :])[:, 1]

        # Element 3 of each viz result is the area under the curve.
        auc_roc = viz.roc(predictions, classes[test_indices],
                          block_show=False, ax=roc_ax, bounds=roc_bounds)[3]
        auc_pr = viz.precision_recall(predictions, classes[test_indices],
                                      block_show=False, ax=pr_ax)[3]

        auc_roc = float(auc_roc)
        auc_rocs.append(auc_roc)
        auc_pr = float(auc_pr)

        # The curve just drawn is expected to be the second-to-last line on
        # each axes (the viz helpers appear to add one more line after it).
        roc_ax.get_lines()[-2].set_label(
            str(yr) + ' - AUC: {0:.5f}'.format(auc_roc))
        pr_ax.get_lines()[-2].set_label(
            str(yr) + ' - AUC: {0:.5f}'.format(auc_pr))

        if verbose:
            print('Year ' + str(yr))
            print('Feature importances:')
            feat_imps = clf.feature_importances_
            # Most important feature first.
            idxs = np.argsort(feat_imps)[::-1]
            max_width = max(len(c) for c in predictors.columns)
            for c, fi in zip(predictors.columns[idxs], feat_imps[idxs]):
                print(' {0:<{1}} : {2:.5f}'.format(c, max_width + 1, fi))

    return clfs, auc_rocs, roc_ax, pr_ax
'auto') # we are going to be zooming around, set it to auto roc_ax.grid(True, which='major') roc_ax.set_title('ROC - mean pAUC: {0:.7f} $\pm$ {1:.7f}'.format( np.mean(auc_rocs), np.std(auc_rocs))) roc_ax.set_ylim([0, .5]) ## Precision-Recall curve ### Make PR yearly lines more transparent c = pr_ax.get_children() for line in c: line.set_alpha(.7) ### Plot PR curve for EPA model tpr, ppv, threshes, auc_pr = viz.precision_recall( epa_model_df['Drek_Prediction'], epa_model_df['Escherichia.coli'] > 235, ax=pr_ax, block_show=False) ### Format the EPA line auc_pr = float(auc_pr) epa_line = pr_ax.get_lines()[-2] epa_line.set_color([0, 0, 0]) epa_line.set_ls('--') epa_line.set_linewidth(3) epa_line.set_alpha(.85) epa_line.set_label('EPA Model - AUC: {0:.5f}'.format(auc_pr)) ### Plot an X where the current model is performing i = np.where(threshes < 235.0)[0][0] pr_ax.plot(tpr[i], ppv[i],
ax=roc_ax, block_show=False)[3] auc_roc = float(auc_roc) epa_line = roc_ax.get_lines()[-2] epa_line.set_color([0,0,0]) epa_line.set_ls('--') epa_line.set_linewidth(3) epa_line.set_alpha(.85) epa_line.set_label('EPA Model - AUC: {0:.4f}'.format(auc_roc)) roc_ax.legend(loc=4) roc_ax.grid(True, which='major') c = pr_ax.get_children() for line in c: line.set_alpha(.75) auc_pr = viz.precision_recall(epa_model_df['Drek_Prediction'], epa_model_df['Escherichia.coli'] > 235, ax=pr_ax, block_show=False)[3] auc_pr = float(auc_pr) epa_line = pr_ax.get_lines()[-2] epa_line.set_color([0,0,0]) epa_line.set_ls('--') epa_line.set_linewidth(3) epa_line.set_alpha(.85) epa_line.set_label('EPA Model - AUC: {0:.4f}'.format(auc_pr)) pr_ax.legend(loc=1) pr_ax.grid(True, which='major') plt.draw() plt.show(block=True)
def model(timestamps, predictors, classes, classifier=None, hyperparams=None,
          verbose=False):
    '''
    Creates several GBMs using leave-one-year-out cross validation.

    For every year between the earliest year in timestamps and
    min(latest year, 2014), a fresh classifier is fit on all rows that are
    not from that year and scored on the remaining rows. Two new matplotlib
    figures are created, and each year's ROC and PR curves are drawn onto
    them as a side-effect (via viz.roc and viz.precision_recall).

    Parameters
    ----------
    timestamps : Nx1 pandas series of timestamps. Each element should have
                 a "year" attribute.
    predictors : NxM pandas DataFrame, all values should be numeric,
                 and there should be no NaN values.
    classes    : Nx1 array like of binary outcomes, e.g. True or False.
    classifier : sklearn classifier, should have the attributes "fit"
                 and "predict_proba" at the least. It is called as
                 classifier(**hyperparams), so pass the class (or a
                 factory), not an instance. Defaults to
                 sklearn.ensemble.GradientBoostingClassifier.
    hyperparams : Dictionary of hyper parameters to pass to the classifier
                 method. Defaults to an empty dictionary.
    verbose    : True if the clf.feature_importances_ should be printed.

    Returns
    -------
    clfs   : Dictionary of (year, classifier) pairs, where the classifier
             is the model found by leaving the specified year out of the
             training set.
    roc_ax : the matplotlib axes object containing all of the ROC curves.
    pr_ax  : the matplotlib axes object containing all of the PR curves.
    '''
    if classifier is None:
        classifier = sklearn.ensemble.GradientBoostingClassifier
    if hyperparams is None:
        hyperparams = {}

    # Only the year is needed to build the cross validation folds.
    timestamps = timestamps.map(lambda x: x.year)

    start = timestamps.min()
    stop = timestamps.max()
    stop = min(stop, 2014)  # do not include 2015

    roc_ax = plt.subplots(1)[1]
    pr_ax = plt.subplots(1)[1]

    clfs = dict()

    for yr in range(start, stop + 1):
        # True for every row strictly before or after the held-out year;
        # the complement of this mask is used as the test set.
        train_indices = np.array((timestamps < yr) | (timestamps > yr))
        test_indices = ~train_indices

        clf = classifier(**hyperparams)
        # .loc with a boolean mask replaces the removed DataFrame.ix indexer.
        clf.fit(predictors.loc[train_indices, :], classes[train_indices])
        clfs[yr] = clf

        # Column 1 holds the probability of the positive class.
        predictions = clf.predict_proba(
            predictors.loc[test_indices, :])[:, 1]

        # Element 3 of each viz result is the area under the curve.
        auc_roc = viz.roc(predictions, classes[test_indices],
                          block_show=False, ax=roc_ax)[3]
        auc_pr = viz.precision_recall(predictions, classes[test_indices],
                                      block_show=False, ax=pr_ax)[3]

        auc_roc = float(auc_roc)
        auc_pr = float(auc_pr)

        # The curve just drawn is expected to be the second-to-last line on
        # each axes (the viz helpers appear to add one more line after it).
        roc_ax.get_lines()[-2].set_label(
            str(yr) + ' - AUC: {0:.4f}'.format(auc_roc))
        pr_ax.get_lines()[-2].set_label(
            str(yr) + ' - AUC: {0:.4f}'.format(auc_pr))

        if verbose:
            print('Year ' + str(yr))
            print('Feature importances:')
            feat_imps = clf.feature_importances_
            # Most important feature first.
            idxs = np.argsort(feat_imps)[::-1]
            max_width = max(len(c) for c in predictors.columns)
            for c, fi in zip(predictors.columns[idxs], feat_imps[idxs]):
                print(' {0:<{1}} : {2:.5f}'.format(c, max_width + 1, fi))

    return clfs, roc_ax, pr_ax
def model(timestamps, predictors, classes, classifier=None,
          prediction_attribute='predict_proba', hyperparams=None,
          roc_bounds=None, verbose=False):
    '''
    Creates several models using leave-one-year-out cross validation.

    One classifier is trained per held-out year, for each year from the
    earliest year in timestamps up to min(latest year, 2014). Each
    classifier is fit on the rows outside its year and evaluated on the
    rows inside it. ROC and PR curves for every year are plotted on two
    newly created matplotlib figures as a side-effect.

    Parameters
    ----------
    timestamps : Nx1 pandas series of timestamps. Each element should have
                 a "year" attribute.
    predictors : NxM pandas DataFrame, all values should be numeric,
                 and there should be no NaN values.
    classes    : Nx1 array like of binary outcomes, e.g. True or False.
    classifier : sklearn classifier, should have the attributes "fit"
                 and "predict_proba" at the least. It is invoked as
                 classifier(**hyperparams) once per year, so it must be a
                 class or factory rather than a fitted instance. Defaults
                 to sklearn.ensemble.GradientBoostingClassifier.
    prediction_attribute : Name of the attribute of the classifier that
                 returns the probability of belonging to the positive
                 class. Column 1 of its output is used.
    hyperparams : Dictionary of hyper parameters to pass to the classifier
                 method. Defaults to an empty dictionary.
    roc_bounds : [min, max] values to use when computing partial AUC.
                 Forwarded unchanged to viz.roc as "bounds".
    verbose    : True if the clf.feature_importances_ should be printed.

    Returns
    -------
    clfs     : Dictionary of (year, classifier) pairs, where the classifier
               is the model found by leaving the specified year out of the
               training set.
    auc_rocs : List of the ROC AUC values (floats) reported by viz.roc,
               one per held-out year, in increasing year order.
    roc_ax   : the matplotlib axes object containing all of the ROC curves.
    pr_ax    : the matplotlib axes object containing all of the PR curves.
    '''
    if classifier is None:
        classifier = sklearn.ensemble.GradientBoostingClassifier
    if hyperparams is None:
        hyperparams = {}

    # Reduce each timestamp to its year; folds are defined per year.
    timestamps = timestamps.map(lambda x: x.year)

    start = timestamps.min()
    stop = timestamps.max()
    stop = min(stop, 2014)  # do not include 2015

    roc_fig, roc_ax = plt.subplots(1, figsize=[12, 9])
    pr_fig, pr_ax = plt.subplots(1, figsize=[12, 9])

    # Axes occupy the left part of each figure only; the remaining space on
    # the right is presumably meant for a legend added by the caller.
    roc_fig.subplots_adjust(left=0.07, right=0.67)
    pr_fig.subplots_adjust(left=0.07, right=0.67)

    clfs = dict()
    auc_rocs = []

    for yr in range(start, stop + 1):
        is_not_yr = timestamps != yr
        train_indices = np.array(is_not_yr)
        test_indices = np.array(~is_not_yr)

        clf = classifier(**hyperparams)
        # DataFrame.ix no longer exists in pandas; a boolean mask with .loc
        # selects the same rows.
        clf.fit(predictors.loc[train_indices, :], classes[train_indices])
        clfs[yr] = clf

        predict = getattr(clf, prediction_attribute)
        # Column 1 holds the score for the positive class.
        predictions = predict(predictors.loc[test_indices, :])[:, 1]

        # Element 3 of each viz result is the area under the curve.
        auc_roc = viz.roc(predictions, classes[test_indices],
                          block_show=False, ax=roc_ax, bounds=roc_bounds)[3]
        auc_pr = viz.precision_recall(predictions, classes[test_indices],
                                      block_show=False, ax=pr_ax)[3]

        auc_roc = float(auc_roc)
        auc_rocs.append(auc_roc)
        auc_pr = float(auc_pr)

        # The curve just drawn is expected to be the second-to-last line on
        # each axes (the viz helpers appear to add one more line after it).
        roc_ax.get_lines()[-2].set_label(
            str(yr) + ' - AUC: {0:.5f}'.format(auc_roc))
        pr_ax.get_lines()[-2].set_label(
            str(yr) + ' - AUC: {0:.5f}'.format(auc_pr))

        if verbose:
            print('Year ' + str(yr))
            print('Feature importances:')
            feat_imps = clf.feature_importances_
            # Most important feature first.
            idxs = np.argsort(feat_imps)[::-1]
            max_width = max(len(c) for c in predictors.columns)
            for c, fi in zip(predictors.columns[idxs], feat_imps[idxs]):
                print(' {0:<{1}} : {2:.5f}'.format(c, max_width + 1, fi))

    return clfs, auc_rocs, roc_ax, pr_ax