def test_random_forest_error():
    X = np.array([[5, 2], [5, 5], [3, 3], [6, 4], [6, 6]])

    y = np.array([70, 100, 60, 100, 120])

    train_idx = [2, 3, 4]
    test_idx = [0, 1]

    y_test = y[test_idx]
    y_train = y[train_idx]
    X_test = X[test_idx]
    X_train = X[train_idx]

    n_trees = 4
    forest = RandomForestRegressor(n_estimators=n_trees)
    forest.fit(X_train, y_train)
    inbag = fci.calc_inbag(X_train.shape[0], forest)
    V_IJ_unbiased = fci.random_forest_error(forest, X_train, X_test)
    npt.assert_equal(V_IJ_unbiased.shape[0], y_test.shape[0])

    # We cannot calculate the inbag from a non-bootstrapped forest, because
    # scikit-learn trees do not store their own sample weights. If you have
    # computed the in-bag counts some other way, you can still pass your own
    # inbag matrix.
    non_bootstrap_forest = RandomForestRegressor(n_estimators=n_trees,
                                                 bootstrap=False)

    npt.assert_raises(ValueError, fci.calc_inbag, X_train.shape[0],
                      non_bootstrap_forest)
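If the in-bag counts were obtained some other way, they can be passed in
directly. A minimal sketch, reusing the names from the test above and
assuming the matrix has one row per training sample and one column per
tree; here it is simply the output of calc_inbag, but any equivalent
counts would work:

# supply a precomputed inbag matrix rather than recovering it from
# the forest's bootstrap state
my_inbag = fci.calc_inbag(X_train.shape[0], forest)
V_IJ_custom = fci.random_forest_error(forest, X_train, X_test,
                                      inbag=my_inbag)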
Example #2
def confidence_interval(model, Xtrain, Xtest):
    inbag = fci.calc_inbag(Xtrain.shape[0], model)
    ci = fci.random_forest_error(model,
                                 Xtrain.values,
                                 Xtest.values,
                                 inbag=inbag)
    return ci
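A hypothetical call, assuming Xtrain and Xtest are pandas DataFrames
(the .values accesses above require pandas input) and ytrain holds the
training targets; model, ytrain, and the tree count are illustrative,
not from the original snippet:

model = RandomForestRegressor(n_estimators=100)
model.fit(Xtrain.values, ytrain)
ci = confidence_interval(model, Xtrain, Xtest)  # one variance per test row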
Example #3
    def confidence_cal(self, train_data, test_data, rf):
        import forestci as fci
        # calculate inbag and unbiased variance
        inbag = fci.calc_inbag(train_data.shape[0], rf)
        V_IJ_unbiased = fci.random_forest_error(rf, train_data, test_data,
                                                inbag=inbag)

        return V_IJ_unbiased
Example #4
    def confidence_cal(train_data, test_data, rf):
        import forestci as fci
        # calculate inbag and unbiased variance
        spam_inbag = fci.calc_inbag(train_data.shape[0], rf)
        V_IJ_unbiased = fci.random_forest_error(rf, train_data, test_data,
                                                inbag=spam_inbag)

        return V_IJ_unbiased
Example #5
    def confidence_cal(train_data, test_data, rf):
        import forestci as fci
        from matplotlib import pyplot as plt
        import numpy as np
        # calculate inbag and unbiased variance
        inbag = fci.calc_inbag(train_data.shape[0], rf)
        V_IJ_unbiased = fci.random_forest_error(rf, train_data, test_data)

        print("inbag: {}".format(inbag))
        print("V_IJ_unbiased: {}".format(V_IJ_unbiased))
        # Plot error bars for predicted MPG using unbiased variance

        return inbag, V_IJ_unbiased
Example #6
def test_random_forest_error():
    X = np.array([[5, 2], [5, 5], [3, 3], [6, 4], [6, 6]])

    y = np.array([70, 100, 60, 100, 120])

    train_idx = [2, 3, 4]
    test_idx = [0, 1]

    y_test = y[test_idx]
    y_train = y[train_idx]
    X_test = X[test_idx]
    X_train = X[train_idx]

    n_trees = 4
    forest = RandomForestRegressor(n_estimators=n_trees)
    forest.fit(X_train, y_train)
    inbag = fci.calc_inbag(X_train.shape[0], forest)
    V_IJ_unbiased = fci.random_forest_error(forest, X_train, X_test,
                                            inbag=inbag)
    npt.assert_equal(V_IJ_unbiased.shape[0], y_test.shape[0])
Example #7
    def confidenceCal(self, train_data, test_data, predictions, test_y, rf):
        pmax = np.amax(predictions)
        tmax = np.amax(test_y)

        axismax = max(pmax, tmax)

        import forestci as fci
        # calculate inbag and unbiased variance
        inbag = fci.calc_inbag(train_data.shape[0], rf)
        V_IJ_unbiased = fci.random_forest_error(rf, train_data, test_data,
                                                inbag=inbag)

        # Plot error bars for predicted BG using the unbiased variance
        (_, caps, _) = plt.errorbar(test_y,
                                    predictions,
                                    yerr=np.sqrt(V_IJ_unbiased),
                                    fmt='o',
                                    markersize=4,
                                    capsize=10,
                                    mfc='red',
                                    mec='green')
        for cap in caps:
            cap.set_markeredgewidth(1)
        plt.title('Error bars for Patient: ' + str(self.patient_id))

        plt.xlabel('Actual BG')
        plt.ylabel('Predicted BG')
        plt.xlim(0, axismax)
        plt.ylim(0, axismax)

        plt.savefig(
            "prediction/tmp/confidence_intervals_bias_patient{}.png".format(
                self.patient_id))
        plt.close()

        return V_IJ_unbiased
Example #8
def test_bagging_svr_error():
    X = np.array([[5, 2], [5, 5], [3, 3], [6, 4], [6, 6]])

    y = np.array([70, 100, 60, 100, 120])

    train_idx = [2, 3, 4]
    test_idx = [0, 1]

    y_test = y[test_idx]
    y_train = y[train_idx]
    X_test = X[test_idx]
    X_train = X[train_idx]

    n_trees = 4
    bagger = BaggingRegressor(base_estimator=SVR(), n_estimators=n_trees)
    bagger.fit(X_train, y_train)
    inbag = fci.calc_inbag(X_train.shape[0], bagger)
    for ib in [inbag, None]:
        for calibrate in [True, False]:
            V_IJ_unbiased = fci.random_forest_error(
                bagger, X_train, X_test, inbag=ib, calibrate=calibrate
            )
            npt.assert_equal(V_IJ_unbiased.shape[0], y_test.shape[0])
Example #9
    def confidence_cal(train_data, train_y, test_data, test_y, predictions, rf, patientID):
        import forestci as fci
        from matplotlib import pyplot as plt
        import numpy as np
        # calculate inbag and unbiased variance
        inbag = fci.calc_inbag(train_data.shape[0], rf)
        V_IJ_unbiased = fci.random_forest_error(rf, train_data, test_data,
                                                inbag=inbag)

        # Plot the forest predictions and the standard deviation of the
        # estimates, one series per class
        idx = np.where(test_y == 1)[0]
        plt.errorbar(predictions[idx, 1], np.sqrt(V_IJ_unbiased[idx]),
                     fmt='.', alpha=0.75, label='Hyper')

        idx = np.where(test_y == 0)[0]
        plt.errorbar(predictions[idx, 1], np.sqrt(V_IJ_unbiased[idx]),
                     fmt='.', alpha=0.75, label='Non')

        plt.xlabel('Prediction (hyper probability)')
        plt.ylabel('Standard deviation')
        plt.legend()
        plt.show()
Example #10
def test_random_forest_error():
    X = np.array([[5, 2],
                  [5, 5],
                  [3, 3],
                  [6, 4],
                  [6, 6]])

    y = np.array([70, 100, 60, 100, 120])

    train_idx = [2, 3, 4]
    test_idx = [0, 1]

    y_test = y[test_idx]
    y_train = y[train_idx]
    X_test = X[test_idx]
    X_train = X[train_idx]

    n_trees = 4
    forest = RandomForestRegressor(n_estimators=n_trees)
    forest.fit(X_train, y_train)
    inbag = fci.calc_inbag(X_train.shape[0], forest)
    for ib in [inbag, None]:
        for calibrate in [True, False]:
            V_IJ_unbiased = fci.random_forest_error(forest, X_train, X_test,
                                                    inbag=ib,
                                                    calibrate=calibrate)
            npt.assert_equal(V_IJ_unbiased.shape[0], y_test.shape[0])

    # We cannot calculate the inbag from a non-bootstrapped forest, because
    # scikit-learn trees do not store their own sample weights. If you have
    # computed the in-bag counts some other way, you can still pass your own
    # inbag matrix.
    non_bootstrap_forest = RandomForestRegressor(n_estimators=n_trees,
                                                 bootstrap=False)

    npt.assert_raises(ValueError, fci.calc_inbag, X_train.shape[0],
                      non_bootstrap_forest)
Example #11
def get_RF_ci(RF_type,
              RF_classi,
              X_train,
              X_test,
              y_test,
              y_score,
              classes=['yes', 'no'],
              plot_fh=None):
    """
    Get confidence intervals for predicted classifications

    :param RF_type: type of random forest algorithm
    :param RF_classi: Classification estimator object
    :param X_train: pandas dataframe, Training data  
    :param X_test: pandas dataframe, Testing data
    :param y_test: pandas dataframe with the target values
    :param y_score: pandas dataframe with the y score values
    """
    # calculate inbag and unbiased variance
    inbag = fci.calc_inbag(X_train.shape[0], RF_classi)
    V_IJ_unbiased = fci.random_forest_error(RF_classi, inbag, X_train, X_test)
    # Plot forest prediction for emails and standard deviation for estimates
    # Blue points are spam emails; Green points are non-spam emails
    idx = np.where(y_test == 1)[0]
    fig = plt.figure(figsize=[3, 3])
    ax = plt.subplot(111)
    if RF_type == 'classi':
        ax.errorbar(y_score[idx, 1],
                    np.sqrt(V_IJ_unbiased[idx]),
                    fmt='.',
                    alpha=0.75,
                    label=classes[0])

        idx = np.where(y_test == 0)[0]
        ax.errorbar(y_score[idx, 1],
                    np.sqrt(V_IJ_unbiased[idx]),
                    fmt='.',
                    alpha=0.75,
                    label=classes[1])

        ax.set_xlabel('Prediction probability')
        ax.set_ylabel('Standard deviation')
        space = 0.3
        ax.set_ylim(
            [ax.get_ylim()[0] * (1 + space),
             ax.get_ylim()[1] * (1 + space)])
        leg = ax.legend(loc='upper right', frameon=True)
        leg.get_frame().set_alpha(0.5)
        # plt.axis('equal')
    if RF_type == 'regress':
        # Plot error bars for predicted MPG using unbiased variance
        ax.errorbar(y_test, y_score, yerr=np.sqrt(V_IJ_unbiased), fmt='o')
        xlim, ylim = get_axlims(y_test, y_score, space=0.1, equal=True)
        ax.plot(xlim, xlim, '--', color='gray')
        ax.set_xlim(xlim)
        ax.set_ylim(ylim)
        ax.set_xlabel('Test')
        ax.set_ylabel('Predicted')
        results, _, _ = get_regression_metrics(y_test, y_score)
        logging.info(results.replace('\n', ' '))
        ax.text(0,
                1,
                results,
                horizontalalignment='left',
                verticalalignment='top',
                transform=ax.transAxes)
        data_regress = pd.DataFrame({
            'y_test': y_test,
            'y_pred': y_score,
            'err': np.sqrt(V_IJ_unbiased)
        })
        if plot_fh is not None:
            data_regress.to_csv('%s.csv' % plot_fh)
    ax.grid(True)
    saveplot(plot_fh)
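A hypothetical call for the classification branch, assuming rf_clf is a
fitted RandomForestClassifier and y_score holds its predict_proba output;
the variable names are illustrative, not from the original source:

y_score = rf_clf.predict_proba(X_test)
get_RF_ci('classi', rf_clf, X_train, X_test, y_test, y_score,
          classes=['spam', 'not spam'], plot_fh='spam_ci')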
Example #12
import forestci as fci
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
# fetch_mldata and the cross_validation module ship with older
# scikit-learn releases, matching the API this example was written against
from sklearn.datasets import fetch_mldata
import sklearn.cross_validation as xval

# retrieve mpg data from the machine learning library
mpg_data = fetch_mldata('mpg')

# separate mpg data into predictors and outcome variable
mpg_X = mpg_data["data"]
mpg_y = mpg_data["target"]

# split mpg data into training and test set
mpg_X_train, mpg_X_test, mpg_y_train, mpg_y_test = xval.train_test_split(
    mpg_X, mpg_y, test_size=0.25, random_state=42)

# create RandomForestRegressor
n_trees = 2000
mpg_forest = RandomForestRegressor(n_estimators=n_trees, random_state=42)
mpg_forest.fit(mpg_X_train, mpg_y_train)
mpg_y_hat = mpg_forest.predict(mpg_X_test)

# calculate inbag and unbiased variance
mpg_inbag = fci.calc_inbag(mpg_X_train.shape[0], mpg_forest)
mpg_V_IJ_unbiased = fci.random_forest_error(mpg_forest, mpg_X_train,
                                            mpg_X_test, inbag=mpg_inbag)

# Plot error bars for predicted MPG using unbiased variance
plt.errorbar(mpg_y_test, mpg_y_hat, yerr=np.sqrt(mpg_V_IJ_unbiased), fmt='o')
plt.plot([5, 45], [5, 45], '--')
plt.xlabel('Reported MPG')
plt.ylabel('Predicted MPG')
plt.show()
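The error bars above span one standard deviation, i.e. the square root of
the variance estimate. A small sketch of an approximate 95% interval
instead, under a Gaussian assumption that the example itself does not
state:

# approximate 95% interval: +/- 1.96 standard errors (Gaussian assumption)
yerr_95 = 1.96 * np.sqrt(mpg_V_IJ_unbiased)
plt.errorbar(mpg_y_test, mpg_y_hat, yerr=yerr_95, fmt='o')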
Example #13
import forestci as fci
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# simulate spam-like data as a binary classification problem
spam_X, spam_y = make_classification(5000)

# split the data into training and test sets
spam_X_train, spam_X_test, spam_y_train, spam_y_test = train_test_split(
    spam_X, spam_y, test_size=0.2)

# create RandomForestClassifier
n_trees = 500
spam_RFC = RandomForestClassifier(max_features=5,
                                  n_estimators=n_trees,
                                  random_state=42)
spam_RFC.fit(spam_X_train, spam_y_train)
spam_y_hat = spam_RFC.predict_proba(spam_X_test)

# calculate inbag and unbiased variance
spam_inbag = fci.calc_inbag(spam_X_train.shape[0], spam_RFC)
spam_V_IJ_unbiased = fci.random_forest_error(spam_RFC, spam_X_train,
                                             spam_X_test, inbag=spam_inbag)

# Plot forest prediction for emails and standard deviation for estimates
# Blue points are spam emails; Green points are non-spam emails
idx = np.where(spam_y_test == 1)[0]
plt.errorbar(spam_y_hat[idx, 1],
             np.sqrt(spam_V_IJ_unbiased[idx]),
             fmt='.',
             alpha=0.75,
             label='spam')

idx = np.where(spam_y_test == 0)[0]
plt.errorbar(spam_y_hat[idx, 1],
             np.sqrt(spam_V_IJ_unbiased[idx]),
             fmt='.',
             alpha=0.75,
             label='not spam')

plt.xlabel('Prediction (spam probability)')
plt.ylabel('Standard deviation')
plt.legend()
plt.show()