def test_random_forest_error():
    """Smoke-test ``fci.random_forest_error`` on a tiny regression forest.

    A four-tree ``RandomForestRegressor`` is fitted on three rows and the
    test checks that one variance value is returned per held-out row.  It
    then checks that ``fci.calc_inbag`` raises ``ValueError`` for a forest
    created with ``bootstrap=False``.
    """
    features = np.array([[5, 2], [5, 5], [3, 3], [6, 4], [6, 6]])
    targets = np.array([70, 100, 60, 100, 120])

    train_rows, test_rows = [2, 3, 4], [0, 1]
    y_test, y_train = targets[test_rows], targets[train_rows]
    X_test, X_train = features[test_rows], features[train_rows]

    n_trees = 4
    forest = RandomForestRegressor(n_estimators=n_trees)
    forest.fit(X_train, y_train)

    # The in-bag matrix is computed but not forwarded; the call below lets
    # random_forest_error work without an explicit ``inbag`` argument.
    inbag_counts = fci.calc_inbag(X_train.shape[0], forest)
    variance = fci.random_forest_error(forest, X_train, X_test)
    npt.assert_equal(variance.shape[0], y_test.shape[0])

    # We cannot calculate inbag from a non-bootstrapped forest. This is
    # because scikit-learn trees do not store their own sample weights. If
    # you did this some other way, you can still use your own inbag.
    non_bootstrap_forest = RandomForestRegressor(
        n_estimators=n_trees, bootstrap=False
    )
    npt.assert_raises(
        ValueError, fci.calc_inbag, X_train.shape[0], non_bootstrap_forest
    )
def confidence_interval(model, Xtrain, Xtest):
    """Return the ``fci.random_forest_error`` result for *Xtest*.

    :param model: fitted ensemble accepted by ``fci.calc_inbag``
    :param Xtrain: training features; must expose ``.shape`` and ``.values``
        (presumably a pandas DataFrame -- confirm against callers)
    :param Xtest: test features; must expose ``.values``
    :return: whatever ``fci.random_forest_error`` returns for these inputs
    """
    n_train_rows = Xtrain.shape[0]
    inbag_counts = fci.calc_inbag(n_train_rows, model)
    # The precomputed in-bag matrix is handed over explicitly.
    return fci.random_forest_error(
        model, Xtrain.values, Xtest.values, inbag=inbag_counts
    )
def confidence_cal(self, train_data, test_data, rf):
    """Compute forest variance estimates for *test_data*.

    :param train_data: training features; ``train_data.shape[0]`` is used as
        the number of training samples
    :param test_data: features to estimate variance for
    :param rf: fitted forest passed to ``forestci``
    :return: tuple ``(V_IJ, V_IJ_unbiased)`` unpacked from
        ``fci.random_forest_error``
    """
    import forestci as fci

    # The in-bag matrix itself is not used afterwards; the call is kept so
    # any error it raises for an unsupported forest still surfaces here.
    fci.calc_inbag(train_data.shape[0], rf)

    # NOTE(review): this expects random_forest_error to yield exactly two
    # values; other call sites in this file treat the result as a single
    # array -- confirm which forestci build is installed.
    result = fci.random_forest_error(rf, train_data, test_data)
    V_IJ, V_IJ_unbiased = result
    return V_IJ, V_IJ_unbiased
def confidence_cal(train_data, test_data, rf):
    """Compute forest variance estimates for *test_data*.

    The previously unused function-local imports of matplotlib and numpy
    were removed, so this helper no longer requires a plotting backend.

    :param train_data: training features; ``train_data.shape[0]`` is used as
        the number of training samples
    :param test_data: features to estimate variance for
    :param rf: fitted forest passed to ``forestci``
    :return: tuple ``(V_IJ, V_IJ_unbiased)`` unpacked from
        ``fci.random_forest_error``
    """
    import forestci as fci

    # The result is not needed below; the call is kept so that any error
    # calc_inbag raises for an unsupported forest is still reported here.
    fci.calc_inbag(train_data.shape[0], rf)

    # NOTE(review): this expects random_forest_error to yield exactly two
    # values; other call sites in this file treat the result as a single
    # array -- confirm which forestci build is installed.
    V_IJ, V_IJ_unbiased = fci.random_forest_error(rf, train_data, test_data)
    return V_IJ, V_IJ_unbiased
def confidence_cal(train_data, test_data, rf):
    """Compute and print the in-bag matrix and the variance estimate.

    The in-bag matrix is computed once and handed to
    ``fci.random_forest_error`` through its ``inbag`` keyword instead of
    letting that function derive it a second time.  The unused
    function-local imports of matplotlib and numpy were removed.

    :param train_data: training features; ``train_data.shape[0]`` is used as
        the number of training samples
    :param test_data: features to estimate variance for
    :param rf: fitted forest passed to ``forestci``
    :return: tuple ``(inbag, V_IJ_unbiased)``
    """
    import forestci as fci

    inbag = fci.calc_inbag(train_data.shape[0], rf)
    V_IJ_unbiased = fci.random_forest_error(
        rf, train_data, test_data, inbag=inbag
    )

    print("inbag: {}".format(inbag))
    print("V_IJ_unbiased: {}".format(V_IJ_unbiased))

    return inbag, V_IJ_unbiased
def test_random_forest_error():
    """Check that ``fci.random_forest_error`` returns one value per test row.

    Uses the positional ``(forest, inbag, X_train, X_test)`` call form.
    """
    features = np.array([[5, 2], [5, 5], [3, 3], [6, 4], [6, 6]])
    targets = np.array([70, 100, 60, 100, 120])

    train_rows, test_rows = [2, 3, 4], [0, 1]
    y_test, y_train = targets[test_rows], targets[train_rows]
    X_test, X_train = features[test_rows], features[train_rows]

    n_trees = 4
    forest = RandomForestRegressor(n_estimators=n_trees)
    forest.fit(X_train, y_train)

    inbag_counts = fci.calc_inbag(X_train.shape[0], forest)
    # NOTE(review): other snippets in this file call
    # random_forest_error(forest, X_train, X_test, inbag=...); confirm the
    # forestci version this test targets.
    variance = fci.random_forest_error(forest, inbag_counts, X_train, X_test)

    npt.assert_equal(variance.shape[0], y_test.shape[0])
def confidenceCal(self, train_data, test_data, predictions, test_y, rf):
    """Plot predictions with error bars against actual values and save it.

    The figure is written to
    ``prediction/tmp/confidence_intervals_bias_patient<ID>.png`` using
    ``self.patient_id``.

    Fix: the actual values (*test_y*) are now drawn on the x axis and the
    predictions on the y axis, so the data matches the 'Actual BG' /
    'Predicted BG' axis labels and the error bars are attached to the
    predictions rather than to the observed values.

    :param train_data: training features; ``train_data.shape[0]`` is used as
        the number of training samples
    :param test_data: features the predictions were made for
    :param predictions: predicted values, one per test row
    :param test_y: observed values, one per test row
    :param rf: fitted forest passed to ``forestci``
    :return: tuple ``(V_IJ, V_IJ_unbiased)`` unpacked from
        ``fci.random_forest_error``
    """
    # Both axes share one upper limit so the plot is square in data units.
    pmax = np.amax(predictions)
    tmax = np.amax(test_y)
    axismax = max(pmax, tmax)

    import forestci as fci

    # The in-bag matrix is not used below; the call is kept so that any
    # error calc_inbag raises for an unsupported forest still surfaces.
    fci.calc_inbag(train_data.shape[0], rf)

    # NOTE(review): this expects random_forest_error to yield exactly two
    # values; other call sites in this file treat the result as a single
    # array -- confirm which forestci build is installed.
    V_IJ, V_IJ_unbiased = fci.random_forest_error(rf, train_data, test_data)

    # Error bars are sqrt(V_IJ), drawn vertically on the predicted values.
    (_, caps, _) = plt.errorbar(test_y, predictions, yerr=np.sqrt(V_IJ),
                                fmt='o', markersize=4, capsize=10,
                                mfc='red', mec='green')
    for cap in caps:
        cap.set_markeredgewidth(1)

    plt.title('Error bars for Patient: ' + str(self.patient_id))
    plt.xlabel('Actual BG')
    plt.ylabel('Predicted BG')
    plt.xlim(0, axismax)
    plt.ylim(0, axismax)
    plt.savefig(
        "prediction/tmp/confidence_intervals_bias_patient{}.png".format(
            self.patient_id))
    plt.close()

    return V_IJ, V_IJ_unbiased
def test_bagging_svr_error():
    """Check ``fci.random_forest_error`` on a bagged SVR ensemble.

    Every combination of explicit / omitted ``inbag`` and ``calibrate``
    on / off must return one value per test row.
    """
    features = np.array([[5, 2], [5, 5], [3, 3], [6, 4], [6, 6]])
    targets = np.array([70, 100, 60, 100, 120])

    train_rows, test_rows = [2, 3, 4], [0, 1]
    y_test, y_train = targets[test_rows], targets[train_rows]
    X_test, X_train = features[test_rows], features[train_rows]

    n_trees = 4
    bagger = BaggingRegressor(base_estimator=SVR(), n_estimators=n_trees)
    bagger.fit(X_train, y_train)

    precomputed = fci.calc_inbag(X_train.shape[0], bagger)
    expected_rows = y_test.shape[0]

    for inbag_arg in (precomputed, None):
        for use_calibration in (True, False):
            variance = fci.random_forest_error(
                bagger,
                X_train,
                X_test,
                inbag=inbag_arg,
                calibrate=use_calibration,
            )
            npt.assert_equal(variance.shape[0], expected_rows)
def confidence_cal(train_data, train_y, test_data, test_y, predictions, rf,
                   patientID):
    """Plot the estimated standard deviation against predicted probability.

    One series is drawn per class in *test_y* (``1`` labelled 'Hyper',
    ``0`` labelled 'Non') and the figure is shown with ``plt.show()``.

    The in-bag matrix is computed once and handed to
    ``fci.random_forest_error`` through its ``inbag`` keyword instead of
    being computed and then discarded.

    :param train_data: training features; ``train_data.shape[0]`` is used as
        the number of training samples
    :param train_y: not used by this function
    :param test_data: features the predictions were made for
    :param test_y: class labels of the test rows, compared against 1 and 0
    :param predictions: 2-D array indexed as ``predictions[rows, 1]``
        (presumably ``predict_proba`` output -- confirm against callers)
    :param rf: fitted forest passed to ``forestci``
    :param patientID: not used by this function
    :return: None
    """
    import forestci as fci
    from matplotlib import pyplot as plt
    import numpy as np

    inbag = fci.calc_inbag(train_data.shape[0], rf)
    V_IJ_unbiased = fci.random_forest_error(
        rf, train_data, test_data, inbag=inbag
    )

    # x: probability in column 1 of *predictions*; y: standard deviation.
    for class_value, legend_label in ((1, 'Hyper'), (0, 'Non')):
        idx = np.where(test_y == class_value)[0]
        plt.errorbar(predictions[idx, 1], np.sqrt(V_IJ_unbiased[idx]),
                     fmt='.', alpha=0.75, label=legend_label)

    plt.xlabel('Prediction (hyper probability)')
    plt.ylabel('Standard deviation')
    plt.legend()
    plt.show()
def test_random_forest_error():
    """Check ``fci.random_forest_error`` on a tiny regression forest.

    Every combination of explicit / omitted ``inbag`` and ``calibrate``
    on / off must return one value per test row.  The test also checks that
    ``fci.calc_inbag`` raises ``ValueError`` for a forest created with
    ``bootstrap=False``.
    """
    features = np.array([[5, 2], [5, 5], [3, 3], [6, 4], [6, 6]])
    targets = np.array([70, 100, 60, 100, 120])

    train_rows, test_rows = [2, 3, 4], [0, 1]
    y_test, y_train = targets[test_rows], targets[train_rows]
    X_test, X_train = features[test_rows], features[train_rows]

    n_trees = 4
    forest = RandomForestRegressor(n_estimators=n_trees)
    forest.fit(X_train, y_train)

    precomputed = fci.calc_inbag(X_train.shape[0], forest)
    expected_rows = y_test.shape[0]

    for inbag_arg in (precomputed, None):
        for use_calibration in (True, False):
            variance = fci.random_forest_error(
                forest,
                X_train,
                X_test,
                inbag=inbag_arg,
                calibrate=use_calibration,
            )
            npt.assert_equal(variance.shape[0], expected_rows)

    # We cannot calculate inbag from a non-bootstrapped forest. This is
    # because scikit-learn trees do not store their own sample weights. If
    # you did this some other way, you can still use your own inbag.
    non_bootstrap_forest = RandomForestRegressor(
        n_estimators=n_trees, bootstrap=False
    )
    npt.assert_raises(
        ValueError, fci.calc_inbag, X_train.shape[0], non_bootstrap_forest
    )
def get_RF_ci(RF_type, RF_classi, X_train, X_test, y_test, y_score,
              classes=('yes', 'no'), plot_fh=None):
    """
    Plot variance-based error estimates for random forest predictions.

    For ``RF_type == 'classi'`` the standard deviation is plotted against
    the score in column 1 of ``y_score``, one series per class.  For
    ``RF_type == 'regress'`` predictions are plotted against test values
    with error bars, regression metrics are logged and drawn on the axes,
    and the plotted data is written to ``<plot_fh>.csv`` when ``plot_fh``
    is given.  The figure is finally passed to ``saveplot(plot_fh)``.

    :param RF_type: type of random forest algorithm, 'classi' or 'regress'
    :param RF_classi: fitted estimator object passed to ``forestci``
    :param X_train: Training data; ``X_train.shape[0]`` is the sample count
    :param X_test: Testing data
    :param y_test: target values of the test set
    :param y_score: predicted scores; indexed as ``y_score[rows, 1]`` when
        ``RF_type == 'classi'``
    :param classes: legend labels for the ``y_test == 1`` and
        ``y_test == 0`` series, in that order
    :param plot_fh: output path stem handed to ``saveplot``; also used for
        the CSV file in the regression case
    """
    # calculate inbag and unbiased variance
    inbag = fci.calc_inbag(X_train.shape[0], RF_classi)
    # NOTE(review): positional (estimator, inbag, X_train, X_test) order --
    # confirm it matches the installed forestci version.
    V_IJ_unbiased = fci.random_forest_error(RF_classi, inbag, X_train, X_test)

    plt.figure(figsize=[3, 3])
    ax = plt.subplot(111)

    if RF_type == 'classi':
        # One series per class: x is the score, y is the standard deviation.
        for class_value, class_label in ((1, classes[0]), (0, classes[1])):
            idx = np.where(y_test == class_value)[0]
            ax.errorbar(y_score[idx, 1], np.sqrt(V_IJ_unbiased[idx]),
                        fmt='.', alpha=0.75, label=class_label)
        ax.set_xlabel('Prediction probability')
        ax.set_ylabel('Standard deviation')
        # Widen the y range by 30% so the legend does not cover the points.
        space = 0.3
        ax.set_ylim(
            [ax.get_ylim()[0] * (1 + space), ax.get_ylim()[1] * (1 + space)])
        leg = ax.legend(loc='upper right', frameon=True)
        leg.get_frame().set_alpha(0.5)
    elif RF_type == 'regress':
        # Plot error bars for predictions using unbiased variance
        ax.errorbar(y_test, y_score, yerr=np.sqrt(V_IJ_unbiased), fmt='o')
        xlim, ylim = get_axlims(y_test, y_score, space=0.1, equal=True)
        # Dashed diagonal drawn from (xlim[0], xlim[0]) to (xlim[1], xlim[1]).
        ax.plot(xlim, xlim, '--', color='gray')
        ax.set_xlim(xlim)
        ax.set_ylim(ylim)
        ax.set_xlabel('Test')
        ax.set_ylabel('Predicted')
        results, _, _ = get_regression_metrics(y_test, y_score)
        logging.info(results.replace('\n', ' '))
        ax.text(0, 1, results,
                horizontalalignment='left',
                verticalalignment='top',
                transform=ax.transAxes)
        data_regress = pd.DataFrame({
            'y_test': y_test,
            'y_pred': y_score,
            'err': np.sqrt(V_IJ_unbiased),
        })
        if plot_fh is not None:
            data_regress.to_csv('%s.csv' % plot_fh)

    ax.grid(True)
    saveplot(plot_fh)
import forestci as fci # retreive mpg data from machine learning library mpg_data = fetch_mldata('mpg') # separate mpg data into predictors and outcome variable mpg_X = mpg_data["data"] mpg_y = mpg_data["target"] # split mpg data into training and test set mpg_X_train, mpg_X_test, mpg_y_train, mpg_y_test = xval.train_test_split( mpg_X, mpg_y, test_size=0.25, random_state=42) # create RandomForestRegressor n_trees = 2000 mpg_forest = RandomForestRegressor(n_estimators=n_trees, random_state=42) mpg_forest.fit(mpg_X_train, mpg_y_train) mpg_y_hat = mpg_forest.predict(mpg_X_test) # calculate inbag and unbiased variance mpg_inbag = fci.calc_inbag(mpg_X_train.shape[0], mpg_forest) mpg_V_IJ_unbiased = fci.random_forest_error(mpg_forest, mpg_inbag, mpg_X_train, mpg_X_test) # Plot error bars for predicted MPG using unbiased variance plt.errorbar(mpg_y_test, mpg_y_hat, yerr=np.sqrt(mpg_V_IJ_unbiased), fmt='o') plt.plot([5, 45], [5, 45], '--') plt.xlabel('Reported MPG') plt.ylabel('Predicted MPG') plt.show()
spam_X, spam_y = make_classification(5000) # split the datainto training and test set spam_X_train, spam_X_test, spam_y_train, spam_y_test = train_test_split( spam_X, spam_y, test_size=0.2) # create RandomForestClassifier n_trees = 500 spam_RFC = RandomForestClassifier(max_features=5, n_estimators=n_trees, random_state=42) spam_RFC.fit(spam_X_train, spam_y_train) spam_y_hat = spam_RFC.predict_proba(spam_X_test) # calculate inbag and unbiased variance spam_inbag = fci.calc_inbag(spam_X_train.shape[0], spam_RFC) spam_V_IJ_unbiased = fci.random_forest_error(spam_RFC, spam_X_train, spam_X_test) # Plot forest prediction for emails and standard deviation for estimates # Blue points are spam emails; Green points are non-spam emails idx = np.where(spam_y_test == 1)[0] plt.errorbar(spam_y_hat[idx, 1], np.sqrt(spam_V_IJ_unbiased[idx]), fmt='.', alpha=0.75, label='spam') idx = np.where(spam_y_test == 0)[0] plt.errorbar(spam_y_hat[idx, 1], np.sqrt(spam_V_IJ_unbiased[idx]),
def get_RF_ci(RF_type,RF_classi,X_train,X_test,y_test,y_score,
              classes=('yes','no'),plot_fh=None):
    """
    Plot variance-based error estimates for random forest predictions.

    For ``RF_type == 'classi'`` the standard deviation is plotted against
    the score in column 1 of ``y_score``, one series per class.  For
    ``RF_type == 'regress'`` predictions are plotted against test values
    with error bars, regression metrics are logged and drawn on the axes,
    and the plotted data is written to ``<plot_fh>.csv`` when ``plot_fh``
    is given.  The figure is finally passed to ``saveplot(plot_fh)``.

    :param RF_type: type of random forest algorithm, 'classi' or 'regress'
    :param RF_classi: fitted estimator object passed to ``forestci``
    :param X_train: Training data; ``X_train.shape[0]`` is the sample count
    :param X_test: Testing data
    :param y_test: target values of the test set
    :param y_score: predicted scores; indexed as ``y_score[rows, 1]`` when
        ``RF_type == 'classi'``
    :param classes: legend labels for the ``y_test == 1`` and
        ``y_test == 0`` series, in that order
    :param plot_fh: output path stem handed to ``saveplot``; also used for
        the CSV file in the regression case
    """
    # calculate inbag and unbiased variance
    inbag = fci.calc_inbag(X_train.shape[0], RF_classi)
    # NOTE(review): positional (estimator, inbag, X_train, X_test) order --
    # confirm it matches the installed forestci version.
    V_IJ_unbiased = fci.random_forest_error(RF_classi, inbag, X_train, X_test)

    plt.figure(figsize=[3, 3])
    ax = plt.subplot(111)

    if RF_type == 'classi':
        # One series per class: x is the score, y is the standard deviation.
        for class_value, class_label in ((1, classes[0]), (0, classes[1])):
            idx = np.where(y_test == class_value)[0]
            ax.errorbar(y_score[idx, 1], np.sqrt(V_IJ_unbiased[idx]),
                        fmt='.', alpha=0.75, label=class_label)
        ax.set_xlabel('Prediction probability')
        ax.set_ylabel('Standard deviation')
        # Widen the y range by 30% so the legend does not cover the points.
        space = 0.3
        ax.set_ylim([ax.get_ylim()[0] * (1 + space),
                     ax.get_ylim()[1] * (1 + space)])
        leg = ax.legend(loc='upper right', frameon=True)
        leg.get_frame().set_alpha(0.5)
    elif RF_type == 'regress':
        # Plot error bars for predictions using unbiased variance
        ax.errorbar(y_test, y_score, yerr=np.sqrt(V_IJ_unbiased), fmt='o')
        xlim, ylim = get_axlims(y_test, y_score, space=0.1, equal=True)
        # Dashed diagonal drawn from (xlim[0], xlim[0]) to (xlim[1], xlim[1]).
        ax.plot(xlim, xlim, '--', color='gray')
        ax.set_xlim(xlim)
        ax.set_ylim(ylim)
        ax.set_xlabel('Test')
        ax.set_ylabel('Predicted')
        results, _, _ = get_regression_metrics(y_test, y_score)
        logging.info(results.replace('\n', ' '))
        ax.text(0, 1, results,
                horizontalalignment='left',
                verticalalignment='top',
                transform=ax.transAxes)
        data_regress = pd.DataFrame({'y_test': y_test,
                                     'y_pred': y_score,
                                     'err': np.sqrt(V_IJ_unbiased)})
        if plot_fh is not None:
            data_regress.to_csv('%s.csv' % plot_fh)

    ax.grid(True)
    saveplot(plot_fh)