def train_rfg(X_train, y_train, X_test, y_test, sample_weight=None, uncertainty=False):
    """Fit a 300-tree random forest and collect predictions, variances and weights.

    Returns:
        ``(preds, variance, sws)``; each is a two-element list ordered
        ``[train, test]``. ``sws`` holds the forestci variance estimates when
        ``uncertainty`` is true, otherwise the squared residuals of the
        predictions against ``y_train`` / ``y_test``.
    """
    forest = RandomForestRegressor(n_estimators=300, random_state=0)
    forest = forest.fit(X_train, y_train, sample_weight)

    splits = (X_train, X_test)
    preds = [forest.predict(split) for split in splits]
    variance = [fci.random_forest_error(forest, X_train, split) for split in splits]

    if uncertainty:
        sws = list(variance)
    else:
        sws = [
            (pred - truth) ** 2
            for pred, truth in zip(preds, (y_train, y_test))
        ]
    return preds, variance, sws
def confidence_interval(model, Xtrain, Xtest):
    """Return forestci's error estimate for ``Xtest`` using an explicit in-bag matrix.

    ``Xtrain`` and ``Xtest`` must expose ``.values`` (and ``Xtrain`` a
    ``.shape``); the underlying arrays are what is handed to forestci.
    """
    in_bag_counts = fci.calc_inbag(Xtrain.shape[0], model)
    train_array = Xtrain.values
    test_array = Xtest.values
    return fci.random_forest_error(model, train_array, test_array, inbag=in_bag_counts)
def do_fci(n_trees):
    """Refit the module-level forest with ``n_trees`` trees and export a summary.

    Works entirely on module-level state (``rfr``, ``X_train``, ``Y_train``,
    ``X_test``, ``df_test``, ``calibration_scale``): adds the rounded
    predictions, the rounded variance estimate and its rounded square root to
    ``df_test``, prints ``df_test.describe()`` and writes the same summary to a
    CSV file named after ``n_trees`` and ``calibration_scale``.
    """
    rfr.n_estimators = n_trees
    rfr.fit(X_train, Y_train)

    rounded_pred = rfr.predict(X_test).round(0).astype(int)
    raw_variance = fci.random_forest_error(
        rfr,
        X_train,
        X_test,
        memory_constrained=True,
        memory_limit=1024,
        calibration_scale=calibration_scale,
    )
    rounded_variance = raw_variance.round(0).astype(int)

    df_test['pred_test'] = rounded_pred
    df_test['mpg_V_IJ_unbiased'] = rounded_variance
    df_test['mpg_V_IJ_unbiased_sqrt'] = np.sqrt(rounded_variance).round(0).astype(int)

    # Widen the display so every column of the summary is printed
    pd.options.display.max_columns = df_test.shape[1]
    summary = df_test.describe()
    print(summary)

    out_csv = r"out.rs/out.{0}.{1}.csv".format(n_trees, calibration_scale)
    summary.to_csv(out_csv, index=True, header=True, sep=',', float_format='%.0f')
def test_random_forest_error():
    """Check the length of the variance estimate and ``calc_inbag`` validation."""
    features = np.array([[5, 2], [5, 5], [3, 3], [6, 4], [6, 6]])
    targets = np.array([70, 100, 60, 100, 120])
    train_rows, test_rows = [2, 3, 4], [0, 1]

    y_test = targets[test_rows]
    y_train = targets[train_rows]
    X_test = features[test_rows]
    X_train = features[train_rows]

    n_trees = 4
    forest = RandomForestRegressor(n_estimators=n_trees)
    forest.fit(X_train, y_train)

    # The in-bag matrix is computed here but not passed to random_forest_error
    fci.calc_inbag(X_train.shape[0], forest)
    variance = fci.random_forest_error(forest, X_train, X_test)
    npt.assert_equal(variance.shape[0], y_test.shape[0])

    # We cannot calculate inbag from a non-bootstrapped forest. This is because
    # Scikit-learn trees do not store their own sample weights. If you did this
    # some other way, you can still use your own inbag
    unbootstrapped = RandomForestRegressor(n_estimators=n_trees, bootstrap=False)
    npt.assert_raises(ValueError, fci.calc_inbag, X_train.shape[0], unbootstrapped)
def calibration_isotonic_regression(model_name, model, prob_model, X_calibration, y_calibration, X_train):
    """Calibrate model probabilities with an isotonic regressor.

    1. Predictions and their standard errors are computed on the calibration
       data, and an isotonic regressor is fitted on the probabilities derived
       from them.
    2. The fitted regressor is applied to ``prob_model`` (the model's
       probabilities on the test data) to produce calibrated probabilities for
       the later calculation of a calibrated standard deviation.

    Reference: https://arxiv.org/abs/1807.00263

    Args:
        model_name: ``'Bayes_Ridge_model'`` or ``'RF_model'``; selects how the
            standard error is obtained.
        model: fitted estimator matching ``model_name``.
        prob_model: probabilities to calibrate.
        X_calibration: calibration features.
        y_calibration: calibration targets.
        X_train: training features; only used for ``'RF_model'``.

    Returns:
        The result of ``IsotonicRegression.transform(prob_model)``.

    Raises:
        ValueError: if ``model_name`` is not one of the supported names.
            Previously this case only printed a message and then failed later
            with an unbound-variable error.
    """
    if model_name == 'Bayes_Ridge_model':
        y_hat_calibration, sem_hat_calibration = model.predict(X_calibration, return_std=True)
    elif model_name == 'RF_model':
        y_hat_calibration = model.predict(X_calibration)
        sem_hat_calibration = np.sqrt(
            fci.random_forest_error(model, X_train, X_calibration))
    else:
        raise ValueError(
            'Not able to calculate variance for model_name={!r}'.format(model_name))

    # Only the second returned value (fit target of the regressor) is needed
    _, prob_y_calibration, _, _ = count_entries_per_interval(
        y_calibration, y_hat_calibration, sem_hat_calibration)
    prob_model_y_calibration = predict_prob(y_calibration, y_hat_calibration,
                                            sem_hat_calibration)

    # isotonic regression
    from sklearn.isotonic import IsotonicRegression as IR
    ir = IR(out_of_bounds='clip')
    ir.fit(prob_model_y_calibration, prob_y_calibration)
    return ir.transform(prob_model)
def confidence_cal(self, train_data, test_data, rf):
    """Return the two values produced by ``fci.random_forest_error`` for ``test_data``.

    NOTE(review): the result is unpacked into exactly two names, so the
    forestci build in use must return a ``(V_IJ, V_IJ_unbiased)`` pair - confirm.
    """
    import forestci as fci

    # The in-bag counts are computed but not passed on to random_forest_error
    n_train = train_data.shape[0]
    fci.calc_inbag(n_train, rf)

    biased, unbiased = fci.random_forest_error(rf, train_data, test_data)
    return biased, unbiased
def predict(self, X):
    """Return ``(y_bar, y_std)``: the parent-class prediction and its standard deviation.

    The standard deviation is the square root of forestci's variance estimate
    computed against ``self.X_train``.
    """
    parent = super(RandomForestRegressorWithIntervals, self)
    point_estimate = parent.predict(X)
    spread = np.sqrt(forestci.random_forest_error(self, self.X_train, X))
    return point_estimate, spread
def confidence_cal(train_data, test_data, rf):
    """Return the two values produced by ``fci.random_forest_error`` for ``test_data``.

    NOTE(review): the result is unpacked into exactly two names, so the
    forestci build in use must return a ``(V_IJ, V_IJ_unbiased)`` pair - confirm.
    """
    import forestci as fci
    from matplotlib import pyplot as plt
    import numpy as np

    # The in-bag counts are computed but not passed on to random_forest_error
    n_train = train_data.shape[0]
    fci.calc_inbag(n_train, rf)

    estimates = fci.random_forest_error(rf, train_data, test_data)
    biased, unbiased = estimates
    return biased, unbiased
def rf_predict_proba(self, x, return_var=False, train_x=None):
    """Return class probabilities, optionally with forestci's variance estimate.

    Args:
        x: samples to score; forwarded to ``self.predict_proba_orig``.
        return_var: when true, also compute ``fci.random_forest_error``.
        train_x: training samples; required when ``return_var`` is true.

    Returns:
        ``predictions`` alone, or ``(predictions, var)`` when ``return_var``
        is true.

    Raises:
        ValueError: if ``return_var`` is true and ``train_x`` is ``None``.
    """
    predictions = self.predict_proba_orig(x)
    if not return_var:
        return predictions

    # Explicit check instead of ``assert`` so it still runs under ``python -O``
    if train_x is None:
        raise ValueError("train_x is required when return_var=True")

    # Imported lazily so plain probability calls do not need forestci installed
    import forestci as fci
    var = fci.random_forest_error(self, train_x, x)
    return predictions, var
def pred_int_calc(self, calcV_IJ=False):
    """Rebuild ``self.df`` and return the fraction of values inside their interval.

    ``self.df`` receives the columns ``v`` (from ``self.yP``), optionally
    ``V_IJ_unbiased`` (forestci estimate, when ``calcV_IJ`` is true), ``p_d``,
    ``p_u`` and ``p_m``. The return value is one minus the share of rows where
    ``v`` lies above ``p_u`` or below ``p_d``.
    """
    self.df = frame = pd.DataFrame()
    frame['v'] = self.yP
    if calcV_IJ:
        frame['V_IJ_unbiased'] = fci.random_forest_error(self.clf, self.X, self.XP)
    frame['p_d'] = self.err_dn
    frame['p_u'] = self.err_up
    frame['p_m'] = self.err_mean

    n_above = np.sum(frame.v > frame.p_u)
    n_below = np.sum(frame.v < frame.p_d)
    miss_rate = (n_above + n_below) / frame.shape[0]
    return 1 - miss_rate
def confidence_cal(train_data, test_data, rf):
    """Compute, print and return the in-bag matrix and forestci's error estimate.

    Returns:
        ``(inbag, V_IJ_unbiased)``.
    """
    import forestci as fci
    from matplotlib import pyplot as plt
    import numpy as np

    n_train = train_data.shape[0]
    in_bag_counts = fci.calc_inbag(n_train, rf)
    variance = fci.random_forest_error(rf, train_data, test_data)

    print("inbag: {}".format(in_bag_counts))
    print("V_IJ_unbiased: {}".format(variance))
    return in_bag_counts, variance
def do_fci():
    """Print forestci's output for the module-level forest and a summary table.

    Uses the module-level ``rfr``, ``X_train`` and ``X_test``.

    NOTE(review): the forestci result is unpacked into two names (variance and
    a mean prediction), which presumes a build that returns a pair - confirm.
    """
    variance, tree_mean = fci.random_forest_error(rfr, X_train, X_test)
    print(variance.shape)
    print(variance)

    forest_pred = rfr.predict(X_test)

    import pandas as pd
    table = pd.DataFrame()
    columns = (
        ('pred_rf', forest_pred),
        ('pred_mean_t', tree_mean),
        ('mpg_V_IJ_unbiased', variance),
        ('mpg_V_IJ_unbiased_sqrt', np.sqrt(variance)),
    )
    for column_name, values in columns:
        table[column_name] = values

    # Widen the display so every column of the summary is printed
    pd.options.display.max_columns = table.shape[1]
    print(table.describe())
def test_random_forest_error():
    """Check that the variance estimate has one entry per test sample."""
    X = np.array([[5, 2], [5, 5], [3, 3], [6, 4], [6, 6]])
    y = np.array([70, 100, 60, 100, 120])
    train_idx = [2, 3, 4]
    test_idx = [0, 1]
    y_test = y[test_idx]
    y_train = y[train_idx]
    X_test = X[test_idx]
    X_train = X[train_idx]

    n_trees = 4
    forest = RandomForestRegressor(n_estimators=n_trees)
    forest.fit(X_train, y_train)
    inbag = fci.calc_inbag(X_train.shape[0], forest)

    # Passed by keyword: the positional order (forest, inbag, X_train, X_test)
    # only matches the legacy signature, while other callers in this file use
    # (forest, X_train, X_test, inbag=...).
    # NOTE(review): assumes the installed forestci names these parameters
    # forest / inbag / X_train / X_test - confirm against the pinned version.
    V_IJ_unbiased = fci.random_forest_error(
        forest=forest, inbag=inbag, X_train=X_train, X_test=X_test)
    npt.assert_equal(V_IJ_unbiased.shape[0], y_test.shape[0])
def test_with_calibration():
    """Check the output length of ``random_forest_error`` on two random data sizes."""
    # Test both with and without interpolation:
    for n_rows in (25, 205):
        X = np.random.rand(n_rows * 5).reshape(n_rows, 5)
        y = np.random.rand(n_rows)

        # First 75% of the rows train the forest, the remainder is the test set
        cut = int(n_rows * 0.75)
        train_rows = np.arange(cut)
        test_rows = np.arange(cut, n_rows)
        X_train, y_train = X[train_rows], y[train_rows]
        X_test, y_test = X[test_rows], y[test_rows]

        forest = RandomForestRegressor(n_estimators=4)
        forest.fit(X_train, y_train)
        variance = fci.random_forest_error(forest, X_train, X_test)
        npt.assert_equal(variance.shape[0], y_test.shape[0])
def confidenceCal(self, train_data, test_data, predictions, test_y, rf):
    """Plot predictions with error bars against the true values and save the figure.

    The figure is written to
    ``prediction/tmp/confidence_intervals_bias_patient<patient_id>.png``.

    Args:
        train_data: samples the forest was trained on.
        test_data: samples that were predicted.
        predictions: forest predictions for ``test_data``.
        test_y: true values for ``test_data``.
        rf: the fitted forest.

    Returns:
        ``(V_IJ, V_IJ_unbiased)`` as unpacked from ``fci.random_forest_error``.

    NOTE(review): the forestci result is unpacked into exactly two names, so
    the build in use must return a pair - confirm.
    """
    import forestci as fci

    axismax = max(np.amax(predictions), np.amax(test_y))

    V_IJ, V_IJ_unbiased = fci.random_forest_error(rf, train_data, test_data)

    # True values go on x and predictions on y so that the data match the axis
    # labels below and the error bars sit on the predictions they describe.
    (_, caps, _) = plt.errorbar(test_y, predictions, yerr=np.sqrt(V_IJ),
                                fmt='o', markersize=4, capsize=10,
                                mfc='red', mec='green')
    for cap in caps:
        cap.set_markeredgewidth(1)

    plt.title('Error bars for Patient: ' + str(self.patient_id))
    plt.xlabel('Actual BG')
    plt.ylabel('Predicted BG')
    plt.xlim(0, axismax)
    plt.ylim(0, axismax)
    plt.savefig(
        "prediction/tmp/confidence_intervals_bias_patient{}.png".format(
            self.patient_id))
    plt.close()
    return V_IJ, V_IJ_unbiased
def confidence_cal(train_data, train_y, test_data, test_y, predictions, rf, patientID):
    """Plot the predicted probability against its estimated standard deviation.

    One series is drawn for the rows where ``test_y == 1`` ('Hyper') and one
    for ``test_y == 0`` ('Non'); the x values come from column 1 of
    ``predictions``. ``train_y`` and ``patientID`` are not used.
    """
    import forestci as fci
    from matplotlib import pyplot as plt
    import numpy as np

    # The in-bag counts are computed but not passed on to random_forest_error
    fci.calc_inbag(train_data.shape[0], rf)
    variance = fci.random_forest_error(rf, train_data, test_data)

    for class_value, legend_label in ((1, 'Hyper'), (0, 'Non')):
        rows = np.where(test_y == class_value)[0]
        plt.errorbar(predictions[rows, 1], np.sqrt(variance[rows]),
                     fmt='.', alpha=0.75, label=legend_label)

    plt.xlabel('Prediction (hyper probability)')
    plt.ylabel('Standard deviation')
    plt.legend()
    plt.show()
def compute(self, X_test):
    """Refit the surrogate model and evaluate the acquisition function at ``X_test``.

    The surrogate is ``self.model.gp`` when ``self.model_type == "gp"``,
    otherwise ``self.model.rf`` with forestci providing the variance.
    ``self.mode`` selects the acquisition: ``"ei"``, ``"pi"``, or (any other
    value) the negated ``y_mean - trade_off * y_std``.

    Returns:
        ``(result, y_mean, y_variance)``, each passed through ``np.squeeze``.
        In ``"ei"`` mode with a standard deviation below ``1e-6`` the result
        is ``0`` and the other two values are returned unsqueezed.
    """
    if self.model_type == "gp":
        self.model.gp.fit(self.X_train, self.y_train)
        y_mean, y_std = self.model.gp.predict(X_test, return_std=True)
        y_variance = y_std**2
    else:
        # Fit on the features; the targets were previously passed as both arguments
        self.model.rf.fit(self.X_train, self.y_train)
        y_mean = self.model.rf.predict(X_test)
        y_variance = fci.random_forest_error(self.model.rf, self.X_train, X_test)
        y_std = np.sqrt(y_variance)

    if self.mode == "ei":
        # Guard before dividing so a zero spread never reaches the z-score
        if y_std < 0.000001:
            return 0, y_mean, y_variance
        z = (y_mean - self.current_optimal - self.trade_off) / y_std
        result = y_std * (z * norm.cdf(z) + norm.pdf(z))
    elif self.mode == "pi":
        z = (y_mean - self.current_optimal - self.trade_off) / y_std
        result = norm.cdf(z)
    else:
        result = -(y_mean - self.trade_off * y_std)
    return np.squeeze(result), np.squeeze(y_mean), np.squeeze(y_variance)
def test_bagging_svr_error():
    """Check the variance-estimate length for a bagged SVR over all option combinations."""
    features = np.array([[5, 2], [5, 5], [3, 3], [6, 4], [6, 6]])
    targets = np.array([70, 100, 60, 100, 120])
    train_rows, test_rows = [2, 3, 4], [0, 1]

    y_test = targets[test_rows]
    y_train = targets[train_rows]
    X_test = features[test_rows]
    X_train = features[train_rows]

    bagger = BaggingRegressor(base_estimator=SVR(), n_estimators=4)
    bagger.fit(X_train, y_train)
    inbag = fci.calc_inbag(X_train.shape[0], bagger)

    # Explicit and implicit in-bag matrix, each with and without calibration
    option_grid = [(ib, flag) for ib in (inbag, None) for flag in (True, False)]
    for ib, calibrate in option_grid:
        variance = fci.random_forest_error(
            bagger, X_train, X_test, inbag=ib, calibrate=calibrate
        )
        npt.assert_equal(variance.shape[0], y_test.shape[0])
def analyze_on_test_data(rf_model, X_test, X_train, y_test=None):
    """Plot test predictions with 1.96-sigma error bars and, optionally, a parity plot.

    The first figure shows the predictions by sample index. When ``y_test`` is
    given, a second figure compares predictions with the true values and is
    annotated with the mean absolute error and R2.
    """
    marker_style = {'c': '#ff5e78', 's': 10}
    bar_style = {'ecolor': '#ff5e78', 'elinewidth': 0.5, 'alpha': 0.4, 'fmt': 'o'}

    predicted = rf_model.predict(X_test)
    variance = fci.random_forest_error(rf_model, X_train, X_test)
    half_width = 1.96 * np.sqrt(variance)

    plt.rcParams['svg.fonttype'] = 'none'
    plt.scatter(range(len(predicted)), predicted, **marker_style, alpha=0.6)
    plt.errorbar(range(len(predicted)), predicted, yerr=half_width, **bar_style)
    plt.xlabel('Sample Index', fontsize=20)
    plt.ylabel('Predicted Response', fontsize=20)
    plt.show()

    if y_test is None:
        return

    mae = mean_absolute_error(y_true=y_test, y_pred=predicted)
    r2 = r2_score(y_true=y_test, y_pred=predicted)
    annotation = 'MAE: {:.2f} R2: {:.2f}'.format(mae, r2)
    plot_parity(x=y_test, y=predicted, xlabel='True response',
                ylabel='Predicted response', **marker_style, show_plot=False,
                text=annotation, text_x=0.1, text_y=0.9)
    plt.errorbar(y_test, predicted, yerr=half_width, **bar_style)
    plt.show()
def test_random_forest_error():
    """Check variance-estimate length for all option combinations and ``calc_inbag`` validation."""
    features = np.array([[5, 2], [5, 5], [3, 3], [6, 4], [6, 6]])
    targets = np.array([70, 100, 60, 100, 120])
    train_rows, test_rows = [2, 3, 4], [0, 1]

    y_test = targets[test_rows]
    y_train = targets[train_rows]
    X_test = features[test_rows]
    X_train = features[train_rows]

    n_trees = 4
    forest = RandomForestRegressor(n_estimators=n_trees)
    forest.fit(X_train, y_train)
    inbag = fci.calc_inbag(X_train.shape[0], forest)

    # Explicit and implicit in-bag matrix, each with and without calibration
    option_grid = [(ib, flag) for ib in (inbag, None) for flag in (True, False)]
    for ib, calibrate in option_grid:
        variance = fci.random_forest_error(forest, X_train, X_test,
                                           inbag=ib, calibrate=calibrate)
        npt.assert_equal(variance.shape[0], y_test.shape[0])

    # We cannot calculate inbag from a non-bootstrapped forest. This is because
    # Scikit-learn trees do not store their own sample weights. If you did this
    # some other way, you can still use your own inbag
    unbootstrapped = RandomForestRegressor(n_estimators=n_trees, bootstrap=False)
    npt.assert_raises(ValueError, fci.calc_inbag, X_train.shape[0], unbootstrapped)
def demo_variance_prediction(classifier, X_Train, X_Test):
    """Print a dict with the model's predictions for ``X_Test`` and forestci's variance."""
    variance = fci.random_forest_error(classifier, X_Train, X_Test)
    report = {}
    report["prediction_mean"] = classifier.predict(X_Test)
    report["prediction_variance"] = variance
    print(report)
# split mpg data into training and test set mpg_X_train, mpg_X_test, mpg_y_train, mpg_y_test = xval.train_test_split( mpg_X, mpg_y, test_size=0.25, random_state=42 ) # create RandomForestRegressor n_trees = 2000 mpg_forest = RandomForestRegressor(n_estimators=n_trees, random_state=42) mpg_forest.fit(mpg_X_train, mpg_y_train) mpg_y_hat = mpg_forest.predict(mpg_X_test) # Plot predicted MPG without error bars plt.scatter(mpg_y_test, mpg_y_hat) plt.plot([5, 45], [5, 45], 'k--') plt.xlabel('Reported MPG') plt.ylabel('Predicted MPG') plt.show() # Calculate the variance: mpg_V_IJ_unbiased = fci.random_forest_error(mpg_forest, mpg_X_train, mpg_X_test) # Plot error bars for predicted MPG using unbiased variance plt.errorbar(mpg_y_test, mpg_y_hat, yerr=np.sqrt(mpg_V_IJ_unbiased), fmt='o') plt.plot([5, 45], [5, 45], 'k--') plt.xlabel('Reported MPG') plt.ylabel('Predicted MPG') plt.show()
def predict(json):
    """Load a pickled model and predict every case listed in the settings dict.

    Keys read from ``json``:

    * required: ``model`` (model file stem), ``data_location``,
      ``data_cases``, ``save_location``
    * optional: ``prediction_accuracy`` (with ``target`` and ``type``),
      ``prediction_threshold`` (default 0.5), ``features_to_drop`` or
      ``features_to_keep`` (mutually exclusive), ``uq``, ``dist``

    For each case the features are read from ``<case>_X.pkl``, predictions
    (plus optional uncertainty, Mahalanobis distance and accuracy data) are
    attached to a VTK grid, and the grid is written to
    ``<save_location>/<name>.vtk``.

    NOTE(review): the prediction type and target are only set when
    ``prediction_accuracy`` is present, yet they are read unconditionally
    further down - the key looks effectively required; confirm with callers.

    Raises:
        SystemExit: if both ``features_to_drop`` and ``features_to_keep`` are set.
    """
    print('\n-----------------------')
    print('Started prediction')
    print('-----------------------')
    modelname = json['model']
    datloc = json['data_location']
    cases = json['data_cases']
    savloc = json['save_location']
    dist = False
    uq = False

    os.makedirs(savloc, exist_ok=True)

    compare_predict = False
    if "prediction_accuracy" in json:
        compare_predict = True
        target = json['prediction_accuracy']['target']
        # Named task_type so the builtin ``type`` is not shadowed
        task_type = json['prediction_accuracy']['type']

    thresh = 0.5
    if "prediction_threshold" in json:
        thresh = json['prediction_threshold']

    features_to_drop = json['features_to_drop'] if "features_to_drop" in json else None
    features_to_keep = json['features_to_keep'] if "features_to_keep" in json else None
    if features_to_drop and features_to_keep:
        # Same effect as quit(msg) without relying on the ``site`` module
        raise SystemExit('features_to_drop and features_to_keep both set')

    # Optional dependencies are imported only when the feature is switched on
    if "uq" in json and json["uq"] == True:
        uq = True
        import forestci as fci
    if "dist" in json and json["dist"] == True:
        dist = True
        from cfd2ml.utilities import mahalanobis

    # Read in ML model
    filename = modelname + '.p'
    print('\nReading model from ', filename)
    # NOTE: unpickling executes code stored in the file; only load trusted models.
    with open(filename, 'rb') as model_file:
        model = pickle.load(model_file)

    # Training data is needed for Mondrian forests and for random-forest UQ
    if isinstance(model, MondrianForestRegressor) or (
            isinstance(model, RandomForestRegressor) and uq is True):
        X_train = pd.read_csv(modelname + '_Xdat.csv')
        Y_train = pd.read_csv(modelname + '_Ydat.csv')[target]
        if features_to_drop is not None:
            X_train = X_train.drop(columns=features_to_drop)
        elif features_to_keep is not None:
            X_train = X_train[features_to_keep]
        # TODO - this required for now as pickle/joblib not saving fitted MF properly
        if isinstance(model, MondrianForestRegressor):
            model.fit(X_train, Y_train)

    cmap = plt.get_cmap('tab10')

    # Open a figure axes
    fig1, ax1 = plt.subplots()
    fig2, ax2 = plt.subplots()

    # Read in each X_data, predict Y, write predicted Y
    for caseno, case in enumerate(cases):
        # Read in RANS (X) data
        filename = os.path.join(datloc, case + '_X.pkl')
        X_case = CaseData(filename)

        print('\n***********************')
        print(' Case %d: %s ' % (caseno + 1, case))
        print('***********************')

        X_pred = X_case.pd
        if features_to_drop is not None:
            X_pred = X_pred.drop(columns=features_to_drop)
        elif features_to_keep is not None:
            X_pred = X_pred[features_to_keep]

        # Predict HiFi (Y) data and store add to vtk
        Y_pred = CaseData(case + '_pred')
        Y_pred.vtk = vista.UnstructuredGrid(X_case.vtk.offset, X_case.vtk.cells,
                                            X_case.vtk.celltypes, X_case.vtk.points)

        if task_type == 'classification':
            # only need as numpy ndarray but convert to pd series for consistency
            Y_prob = pd.Series(model.predict_proba(X_pred)[:, 1])
            Y_pred.pd = pd.Series(predict_with_threshold(Y_prob, thresh))
            Y_pred.vtk.point_arrays['Y_prob'] = Y_prob.to_numpy()
        elif task_type == 'regression':
            if isinstance(model, RandomForestRegressor):
                y_pred = model.predict(X_pred)
            elif isinstance(model, MondrianForestRegressor) and uq is False:
                # if uq is true the prediction is made below
                y_pred = model.predict(X_pred)

            # Uncertainty quantification
            if uq is True:
                if isinstance(model, RandomForestRegressor):
                    print('Calculating infinitesimal jackknife variance')
                    y_var = fci.random_forest_error(model, X_train, X_pred,
                                                    calibrate=True)
                    # Clamp at zero before the square root
                    y_sd = np.sqrt(np.maximum(y_var, 0))
                elif isinstance(model, MondrianForestRegressor):
                    print(
                        'Calculating mondrian forest posterior mean and standard deviation'
                    )
                    y_pred, y_sd = model.predict(X_pred, return_std=True)
                Y_pred.vtk.point_arrays['Y_std'] = y_sd

                # Print out the mean standard deviation relative to the mean prediction
                sd_mean = np.mean(y_sd)
                y_mean = np.mean(y_pred)
                print('sd_mean/y_mean = ', 100 * sd_mean / y_mean, '%')

            if dist is True:
                mah_dist = mahalanobis(x=X_pred, data=X_train)
                Y_pred.vtk.point_arrays['mah_dist'] = mah_dist
                print('Mean mahalanobis distance = ', np.mean(mah_dist))

            # only need as numpy ndarray but convert to pd series for consistency
            Y_pred.pd = pd.Series(y_pred)
            Y_pred.vtk.point_arrays['Y_pred'] = y_pred

        # Read in true HiFi (Y) data and compare to predict
        if compare_predict:
            filename = os.path.join(datloc, case + '_Y.pkl')
            Y_true = CaseData(filename)

            # Write Y_true to vtk for analysis
            index = Y_true.pd.columns.get_loc(target)
            Y_pred.vtk.point_arrays['Y_true'] = Y_true.pd.to_numpy()[:, index]

            if task_type == 'classification':
                # accuracy metrics
                predict_classifier_accuracy(Y_pred.pd, Y_true.pd[target])

                # Write TP, TN, FP, FN to vtk
                Y_pred.vtk.point_arrays['confuse'] = confusion_labels(
                    Y_pred.pd, Y_true.pd[target])

                # Calc precision, recall and decision thresholds
                precisions, recalls, thresholds = precision_recall_curve(
                    Y_true.pd[target], Y_prob)
                c = cmap(caseno)

                # Plot precision-recall curve with current decision threshold marked
                plot_precision_recall_threshold(precisions, recalls, thresholds,
                                                t=thresh, ax=ax1, c=c)

                # Plot precision and recall vs decision threshold
                plot_precision_recall_vs_threshold(precisions, recalls, thresholds,
                                                   ax=ax2, c=c, t=thresh, case=case)
            elif task_type == 'regression':
                predict_regressor_accuracy(Y_pred.pd, Y_true.pd[target])
                Y_pred.vtk.point_arrays['error'] = local_error(
                    Y_pred.pd, Y_true.pd[target])

        filename = os.path.join(savloc, Y_pred.name + '.vtk')
        Y_pred.WriteVTK(filename)

    if task_type == 'classification':
        ax1.legend()
        ax2.legend()
        plt.show()

    print('\n-----------------------')
    print('Finished prediction')
    print('-----------------------')
def predict(self, X_test):
    """Return ``(mean, std, variance)`` of the forest prediction for ``X_test``.

    The variance is forestci's estimate computed against ``self.X_train``; the
    standard deviation is its square root.
    """
    forest = self.rf
    mean = forest.predict(X_test)
    variance = fci.random_forest_error(forest, self.X_train, X_test)
    return mean, np.sqrt(variance), variance
# Fit the classifier and predict class probabilities for the test emails
spam_RFC.fit(spam_X_train, spam_y_train)
spam_y_hat = spam_RFC.predict_proba(spam_X_test)

# Row positions of the two classes in the test labels (1 = spam, 0 = not spam)
idx_spam = np.where(spam_y_test == 1)[0]
idx_ham = np.where(spam_y_test == 0)[0]

# Histogram predictions without error bars; column 1 of the probabilities is
# plotted as the spam probability
fig, ax = plt.subplots(1)
ax.hist(spam_y_hat[idx_spam, 1], histtype='step', label='spam')
ax.hist(spam_y_hat[idx_ham, 1], histtype='step', label='not spam')
ax.set_xlabel('Prediction (spam probability)')
ax.set_ylabel('Number of observations')
plt.legend()

# Calculate the variance
spam_V_IJ_unbiased = fci.random_forest_error(spam_RFC, spam_X_train, spam_X_test)

# Plot forest prediction for emails and standard deviation for estimates.
# The two classes are separate series identified by the legend labels.
fig, ax = plt.subplots(1)
ax.scatter(spam_y_hat[idx_spam, 1], np.sqrt(spam_V_IJ_unbiased[idx_spam]), label='spam')
ax.scatter(spam_y_hat[idx_ham, 1], np.sqrt(spam_V_IJ_unbiased[idx_ham]), label='not spam')
ax.set_xlabel('Prediction (spam probability)')
ax.set_ylabel('Standard deviation')
plt.legend()
def get_forest_conf_interval(rf_model, X_test, X_train):
    """Return ``get_confidence_interval_from_std`` applied to the forest's standard deviation.

    The standard deviation is the square root of forestci's variance estimate
    for ``X_test``.
    """
    variance = fci.random_forest_error(rf_model, X_train, X_test)
    std_dev = np.sqrt(variance)
    return get_confidence_interval_from_std(std_dev)
# split mpg data into training and test set mpg_X_train, mpg_X_test, mpg_y_train, mpg_y_test = xval.train_test_split( mpg_X, mpg_y, test_size=0.25, random_state=42) # Create RandomForestRegressor n_estimators = 1000 mpg_bagger = BaggingRegressor(base_estimator=SVR(), n_estimators=n_estimators, random_state=42) mpg_bagger.fit(mpg_X_train, mpg_y_train) mpg_y_hat = mpg_bagger.predict(mpg_X_test) # Plot predicted MPG without error bars plt.scatter(mpg_y_test, mpg_y_hat) plt.plot([5, 45], [5, 45], "k--") plt.xlabel("Reported MPG") plt.ylabel("Predicted MPG") plt.show() # Calculate the variance mpg_V_IJ_unbiased = fci.random_forest_error(mpg_bagger, mpg_X_train, mpg_X_test) # Plot error bars for predicted MPG using unbiased variance plt.errorbar(mpg_y_test, mpg_y_hat, yerr=np.sqrt(mpg_V_IJ_unbiased), fmt="o") plt.plot([5, 45], [5, 45], "k--") plt.xlabel("Reported MPG") plt.ylabel("Predicted MPG") plt.show()
def _get_model_errors(cls, model, X, X_train, X_test, error_method='stdev_weak_learners', remove_outlier_learners=False): err_down = list() err_up = list() indices_TF = list() X_aslist = X.values.tolist() if model.model.__class__.__name__ in ['RandomForestRegressor', 'GradientBoostingRegressor', 'ExtraTreesRegressor', 'BaggingRegressor', 'AdaBoostRegressor']: if error_method == 'jackknife_after_bootstrap': model_errors_var = random_forest_error(forest=model.model, X_test=X_test, X_train=X_train) # Wager method returns the variance. Take sqrt to turn into stdev model_errors = np.sqrt(model_errors_var) num_removed_learners = list() if remove_outlier_learners is True: print("Warning: removal of outlier learners isn't supported with jackknife after bootstrap") for _ in model_errors: num_removed_learners.append(0) elif error_method == 'stdev_weak_learners': num_removed_learners = list() for x in range(len(X_aslist)): preds = list() if model.model.__class__.__name__ == 'RandomForestRegressor': for pred in model.model.estimators_: preds.append(pred.predict(np.array(X_aslist[x]).reshape(1, -1))[0]) elif model.model.__class__.__name__ == 'BaggingRegressor': for pred in model.model.estimators_: preds.append(pred.predict(np.array(X_aslist[x]).reshape(1, -1))[0]) elif model.model.__class__.__name__ == 'ExtraTreesRegressor': for pred in model.model.estimators_: preds.append(pred.predict(np.array(X_aslist[x]).reshape(1, -1))[0]) elif model.model.__class__.__name__ == 'GradientBoostingRegressor': for pred in model.model.estimators_.tolist(): preds.append(pred[0].predict(np.array(X_aslist[x]).reshape(1, -1))[0]) elif model.model.__class__.__name__ == 'AdaBoostRegressor': for pred in model.model.estimators_: preds.append(pred.predict(np.array(X_aslist[x]).reshape(1, -1))[0]) # HERE flag outlier predictions, perhaps result of e.g. 
numerical issues in ensemble of models if remove_outlier_learners == True: preds, num_outliers = cls._remove_outlier_preds(preds=preds) num_removed_learners.append(num_outliers) else: num_removed_learners.append(0) e_down = np.std(preds) e_up = np.std(preds) err_down.append(e_down) err_up.append(e_up) nan_indices = np.where(np.isnan(err_up)) nan_indices_sorted = np.array(sorted(nan_indices[0], reverse=True)) for i, val in enumerate(list(err_up)): if i in nan_indices_sorted: indices_TF.append(False) else: indices_TF.append(True) model_errors = (np.array(err_up) + np.array(err_down)) / 2 else: print('ERROR: error_method must be one of "stdev_weak_learners" or "jackknife_after_bootstrap"') sys.exit() if model.model.__class__.__name__ == 'GaussianProcessRegressor': preds = model.model.predict(X, return_std=True)[1] # Get the stdev model error from the predictions of GPR err_up = preds err_down = preds model_errors = (np.array(err_up) + np.array(err_down)) / 2 nan_indices = np.where(np.isnan(err_up)) nan_indices_sorted = np.array(sorted(nan_indices[0], reverse=True)) num_removed_learners = list() for i, val in enumerate(list(err_up)): num_removed_learners.append(0) if i in nan_indices_sorted: indices_TF.append(False) else: indices_TF.append(True) model_errors = pd.Series(model_errors, name='model_errors') num_removed_learners = pd.Series(num_removed_learners, name='num_removed_learners') return model_errors, num_removed_learners
# compute errors errors = yhat_reais - y_reais # compute median absolute error median_abs_error = np.median(np.absolute(errors)) print('median absolute error (in R$):', median_abs_error) # compute proportional error (error / asking price) proportional_errors = errors / y_reais median_prop_error = np.median(np.absolute(proportional_errors)) mean_prop_error = np.mean(np.absolute(proportional_errors)) print('median absolute error (in %):', median_prop_error) print('mean absolute error (in %):', mean_prop_error) # estimate uncertainty variances = fci.random_forest_error(model, X_train, X_test) plt.errorbar(y_test, yhat, yerr=np.sqrt(variances), fmt='o', ecolor='red') plt.plot([10, 16], [10, 16], 'k--') plt.xlabel('actual price, in log(R$)') plt.ylabel('predicted price, in log(R$)') plt.show() # check interval predictions lower = yhat - np.sqrt(variances) upper = yhat + np.sqrt(variances) corrects = 0 for y_i, l, u in zip(y_test, lower, upper): if l <= y_i <= u: corrects += 1 print(corrects, 'corrects out of', len(yhat))
def get_RF_ci(RF_type,RF_classi,X_train,X_test,y_test,y_score,
              classes=['yes','no'],plot_fh=None):
    """
    Get confidence intervals for predicted classifications

    Draws either the per-class standard deviation against the prediction
    probability (``RF_type == 'classi'``) or the predictions with error bars
    against the test values (``RF_type == 'regress'``), then hands the figure
    to ``saveplot``. In the regression case the plotted data are also written
    to ``<plot_fh>.csv`` when ``plot_fh`` is given.

    :param RF_type: type of random forest algorithm ('classi' or 'regress')
    :param RF_classi: Classification estimator object
    :param X_train: pandas dataframe, Training data
    :param X_test: pandas dataframe, Testing data
    :param y_test: pandas dataframe with the target values
    :param y_score: pandas dataframe with the y score values
    :param classes: legend labels for the rows with target 1 and 0, in that order
    :param plot_fh: output path passed to ``saveplot``
    """
    # calculate inbag and unbiased variance
    inbag = fci.calc_inbag(X_train.shape[0], RF_classi)
    # Passed by keyword: the positional order (forest, inbag, X_train, X_test)
    # only matches the legacy signature; later releases take
    # (forest, X_train, X_test, inbag=...).
    # NOTE(review): assumes the installed forestci names these parameters
    # forest / inbag / X_train / X_test - confirm against the pinned version.
    V_IJ_unbiased = fci.random_forest_error(forest=RF_classi, inbag=inbag,
                                            X_train=X_train, X_test=X_test)
    std_dev = np.sqrt(V_IJ_unbiased)

    plt.figure(figsize=[3,3])
    ax = plt.subplot(111)

    if RF_type == 'classi':
        # Plot the prediction probability against its standard deviation, one
        # series per class
        idx = np.where(y_test == 1)[0]
        ax.errorbar(y_score[idx, 1], std_dev[idx],
                    fmt='.', alpha=0.75, label=classes[0])
        idx = np.where(y_test == 0)[0]
        ax.errorbar(y_score[idx, 1], std_dev[idx],
                    fmt='.', alpha=0.75, label=classes[1])
        ax.set_xlabel('Prediction probability')
        ax.set_ylabel('Standard deviation')
        # Stretch the y-limits by 30%
        space = 0.3
        ax.set_ylim([ax.get_ylim()[0]*(1+space),
                     ax.get_ylim()[1]*(1+space)])
        leg = ax.legend(loc='upper right',frameon=True)
        leg.get_frame().set_alpha(0.5)

    if RF_type == 'regress':
        # Plot error bars for the predictions using the unbiased variance
        ax.errorbar(y_test, y_score, yerr=std_dev, fmt='o')
        xlim,ylim = get_axlims(y_test,y_score,
                               space=0.1,equal=True)
        ax.plot(xlim,xlim, '--',color='gray')
        ax.set_xlim(xlim)
        ax.set_ylim(ylim)
        ax.set_xlabel('Test')
        ax.set_ylabel('Predicted')
        results,_,_ = get_regression_metrics(y_test,y_score)
        logging.info(results.replace('\n',' '))
        ax.text(0, 1, results,
                horizontalalignment='left',
                verticalalignment='top',
                transform=ax.transAxes)
        data_regress = pd.DataFrame({'y_test':y_test,
                                     'y_pred':y_score,
                                     'err':std_dev
                                     })
        if plot_fh is not None:
            data_regress.to_csv('%s.csv' % plot_fh)

    ax.grid(True)
    saveplot(plot_fh)
def get_RF_ci(RF_type,
              RF_classi,
              X_train,
              X_test,
              y_test,
              y_score,
              classes=['yes', 'no'],
              plot_fh=None):
    """
    Get confidence intervals for predicted classifications

    Draws either the per-class standard deviation against the prediction
    probability (``RF_type == 'classi'``) or the predictions with error bars
    against the test values (``RF_type == 'regress'``), then hands the figure
    to ``saveplot``. In the regression case the plotted data are also written
    to ``<plot_fh>.csv`` when ``plot_fh`` is given.

    :param RF_type: type of random forest algorithm ('classi' or 'regress')
    :param RF_classi: Classification estimator object
    :param X_train: pandas dataframe, Training data
    :param X_test: pandas dataframe, Testing data
    :param y_test: pandas dataframe with the target values
    :param y_score: pandas dataframe with the y score values
    :param classes: legend labels for the rows with target 1 and 0, in that order
    :param plot_fh: output path passed to ``saveplot``
    """
    # calculate inbag and unbiased variance
    inbag = fci.calc_inbag(X_train.shape[0], RF_classi)
    # Passed by keyword: the positional order (forest, inbag, X_train, X_test)
    # only matches the legacy signature; later releases take
    # (forest, X_train, X_test, inbag=...).
    # NOTE(review): assumes the installed forestci names these parameters
    # forest / inbag / X_train / X_test - confirm against the pinned version.
    V_IJ_unbiased = fci.random_forest_error(forest=RF_classi,
                                            inbag=inbag,
                                            X_train=X_train,
                                            X_test=X_test)
    std_dev = np.sqrt(V_IJ_unbiased)

    plt.figure(figsize=[3, 3])
    ax = plt.subplot(111)

    if RF_type == 'classi':
        # Plot the prediction probability against its standard deviation, one
        # series per class
        idx = np.where(y_test == 1)[0]
        ax.errorbar(y_score[idx, 1],
                    std_dev[idx],
                    fmt='.',
                    alpha=0.75,
                    label=classes[0])
        idx = np.where(y_test == 0)[0]
        ax.errorbar(y_score[idx, 1],
                    std_dev[idx],
                    fmt='.',
                    alpha=0.75,
                    label=classes[1])
        ax.set_xlabel('Prediction probability')
        ax.set_ylabel('Standard deviation')
        # Stretch the y-limits by 30%
        space = 0.3
        ax.set_ylim(
            [ax.get_ylim()[0] * (1 + space),
             ax.get_ylim()[1] * (1 + space)])
        leg = ax.legend(loc='upper right', frameon=True)
        leg.get_frame().set_alpha(0.5)

    if RF_type == 'regress':
        # Plot error bars for the predictions using the unbiased variance
        ax.errorbar(y_test, y_score, yerr=std_dev, fmt='o')
        xlim, ylim = get_axlims(y_test, y_score, space=0.1, equal=True)
        ax.plot(xlim, xlim, '--', color='gray')
        ax.set_xlim(xlim)
        ax.set_ylim(ylim)
        ax.set_xlabel('Test')
        ax.set_ylabel('Predicted')
        results, _, _ = get_regression_metrics(y_test, y_score)
        logging.info(results.replace('\n', ' '))
        ax.text(0,
                1,
                results,
                horizontalalignment='left',
                verticalalignment='top',
                transform=ax.transAxes)
        data_regress = pd.DataFrame({
            'y_test': y_test,
            'y_pred': y_score,
            'err': std_dev
        })
        if plot_fh is not None:
            data_regress.to_csv('%s.csv' % plot_fh)

    ax.grid(True)
    saveplot(plot_fh)
import forestci as fci

# Retrieve mpg data from machine learning library
mpg_data = fetch_mldata('mpg')

# Separate mpg data into predictors and outcome variable
mpg_X = mpg_data["data"]
mpg_y = mpg_data["target"]

# Split mpg data into training and test set (25% held out, fixed seed)
mpg_X_train, mpg_X_test, mpg_y_train, mpg_y_test = xval.train_test_split(
    mpg_X, mpg_y, test_size=0.25, random_state=42)

# Create and fit a RandomForestRegressor, then predict the held-out samples
n_trees = 2000
mpg_forest = RandomForestRegressor(n_estimators=n_trees, random_state=42)
mpg_forest.fit(mpg_X_train, mpg_y_train)
mpg_y_hat = mpg_forest.predict(mpg_X_test)

# Calculate inbag and unbiased variance.
# The arguments are passed by keyword: the positional order
# (forest, inbag, X_train, X_test) only matches the legacy signature, while
# later releases take (forest, X_train, X_test, inbag=...).
# NOTE(review): assumes the installed forestci names these parameters
# forest / inbag / X_train / X_test - confirm against the pinned version.
mpg_inbag = fci.calc_inbag(mpg_X_train.shape[0], mpg_forest)
mpg_V_IJ_unbiased = fci.random_forest_error(forest=mpg_forest,
                                            inbag=mpg_inbag,
                                            X_train=mpg_X_train,
                                            X_test=mpg_X_test)

# Plot error bars for predicted MPG using unbiased variance
plt.errorbar(mpg_y_test, mpg_y_hat, yerr=np.sqrt(mpg_V_IJ_unbiased), fmt='o')
plt.plot([5, 45], [5, 45], '--')
plt.xlabel('Reported MPG')
plt.ylabel('Predicted MPG')
plt.show()
def train_cv_one_fold(arg):
    """Train and evaluate one cross-validation fold.

    Args:
        arg: a tuple ``(x, y, h, one_kf, args)`` or, with group information,
            ``(x, y, h, one_kf, g, args)``. ``h`` holds the feature names (or
            ``None``), ``one_kf`` is the ``(train_idx, test_idx)`` pair of
            the fold and ``args`` carries the run options read below
            (``task``, ``train_data_sample``, ``feature_selection``,
            ``num_features``, ``grid_search``, ``param_search_splits``,
            ``opt``, ``fci``).

    Returns:
        ``(result, pipeline)``: the result dict (predictions, indices,
        optional feature-selection / grid-search / uncertainty entries, plus
        whatever ``evaluate`` adds) and the list of fitted steps.

    Rows whose training target is NaN are excluded from every fit.
    """
    g = None
    if len(arg) == 6:
        # group information is present
        x, y, h, one_kf, g, args = arg
    else:
        # no group information
        x, y, h, one_kf, args = arg
    pipeline = []

    ##
    ## Split into training and test sets
    ##
    train_idx, test_idx = one_kf
    if args.train_data_sample is not None:
        train_idx = np.random.choice(train_idx, args.train_data_sample, replace=False)
    train_x = np.copy(x[train_idx])
    train_y = y[train_idx]
    test_x = np.copy(x[test_idx])
    test_y = y[test_idx]
    test_g = g[test_idx] if g is not None else None

    ##
    ## Select the method
    ##
    if args.task == "regression":
        clf, param_grid = get_regressor_model(args)
    else:
        clf, param_grid = get_classifier_model(args)
    result = {}

    ##
    ## Perform feature selection
    ##
    selected_feature = None
    if args.feature_selection:
        ##
        ## Run feature selection; later predictions use the selected features
        ##
        if args.num_features is not None:
            # Passed by keyword: recent scikit-learn releases accept
            # n_features_to_select as a keyword argument only
            rfe = RFE(clf, n_features_to_select=args.num_features)
        else:
            rfe = RFECV(clf, cv=3)
        mask = ~np.isnan(train_y)
        rfe = rfe.fit(train_x[mask, :], train_y[mask])
        # Uncomment to also save the predictions made by the feature selector:
        # result["feature_selection_pred_y"] = rfe.predict(test_x)
        # prob_y = rfe.predict_proba(test_x) if hasattr(clf, "predict_proba") else None
        # result["feature_selection_prob_y"] = prob_y

        ##
        ## Save the selected features
        ##
        selected_feature = rfe.support_
        print("=== selected feature ===")
        if h is None:
            # Without names, report the column positions of the kept features
            selected_feature_name = [
                i for i, el in enumerate(selected_feature) if el
            ]
        else:
            selected_feature_name = [
                attr for attr, el in zip(h, selected_feature) if el
            ]
        print(len(selected_feature_name), ":", selected_feature_name)
        result["selected_feature_name"] = selected_feature_name
        result["selected_feature"] = selected_feature
        result["feature_name"] = selected_feature

        ##
        ## Restrict train/test data to the selected features within this fold
        ##
        train_x = rfe.transform(train_x)
        test_x = rfe.transform(test_x)
        pipeline.append(rfe)

    if h is not None:
        result["feature_name"] = h

    if args.grid_search:
        ##
        ## Choose hyperparameters by grid search.
        ## To evaluate hyperparameters, the training set is further split into
        ## a set used to fit parameters and a validation set used to score the
        ## hyperparameters, and cross-validation is run over those splits.
        ##
        grid_search = sklearn.model_selection.GridSearchCV(
            clf, param_grid, cv=args.param_search_splits)
        mask = ~np.isnan(train_y)
        grid_search.fit(train_x[mask, :], train_y[mask])

        ##
        ## Save the best hyperparameters and score
        ##
        print("Best parameters: {}".format(grid_search.best_params_))
        print("Best cross-validation: {}".format(grid_search.best_score_))
        result.update({
            "param": grid_search.best_params_,
            "best_score": grid_search.best_score_,
        })
        # Uncomment to evaluate the best-hyperparameter model on the test data
        # and save the results:
        # pred_y = grid_search.predict(test_x)
        # prob_y = grid_search.predict_proba(test_x) if hasattr(grid_search, "predict_proba") else None
        # result["grid_search_pred_y"] = pred_y
        # prob_y = rfe.predict_proba(test_x) if hasattr(clf, "predict_proba") else None
        # result["grid_search_prob_y"] = prob_y

        ##
        ## Continue with the estimator that has the best hyperparameters
        ##
        clf = grid_search.best_estimator_

    if args.opt:
        clf = optimize(train_x, train_y)

    ##
    ## Refit clf on the whole training data (rows with a NaN target excluded)
    ##
    mask = ~np.isnan(train_y)
    fit_x = train_x[mask, :]
    clf.fit(fit_x, train_y[mask])

    ##
    ## Emit estimator-specific results
    ##
    # Predictive standard deviation of Bayesian regression
    if isinstance(clf, sklearn.linear_model.BayesianRidge):
        pred_y, pred_y_std = clf.predict(test_x, return_std=True)
        result["pred_y_std"] = pred_y_std
    else:
        pred_y = clf.predict(test_x)

    # Feature importances
    if hasattr(clf, "feature_importances_"):
        fi = clf.feature_importances_
        result["feature_importance"] = fi
        fi_str = ",".join(map(str, fi))
        print("feature_importance", len(fi), ":", fi_str)

    # Predictive standard deviation of the random forest
    if isinstance(clf, RandomForestRegressor):
        if args.fci:
            import forestci as fci
            # forestci must see exactly the rows the forest was fitted on;
            # passing the unmasked matrix disagrees with the fitted forest
            # whenever NaN targets were dropped.
            unbiased_var = fci.random_forest_error(clf, fit_x, test_x)
            result["test_y_std"] = np.sqrt(unbiased_var)

    ##
    ## Save predictions and indices
    ##
    result["test_y"] = test_y
    result["test_idx"] = test_idx
    result["test_group"] = test_g
    result["pred_y"] = pred_y
    prob_y = None
    if hasattr(clf, "predict_proba"):
        prob_y = clf.predict_proba(test_x)
        result["prob_y"] = prob_y
    pipeline.append(clf)

    ##
    ## Evaluation
    ##
    #if test_g is not None:
    #    result=evaluate_group(test_y, pred_y, prob_y, test_g, args, result=result)
    result = evaluate(test_y, pred_y, prob_y, args, result)
    if "accuracy" in result:
        if args.task == "binary":
            print("Cross-validation test accuracy: %3f" % (result["accuracy"]))
            print("Cross-validation test AUC: %3f" % (result["auc"]))
        if args.task == "multiclass":
            for i, auc in enumerate(result["auc"]):
                print("Task %d Cross-validation test AUC: %3f" % (i, auc))
            acc = result["accuracy"]
            print("Cross-validation test accuracy: %3f" % (acc))
    else:
        print("Cross-validation r2: %3f" % (result["r2"]))
    return (result, pipeline)