Example #1
import numpy as np
from numpy.testing import assert_array_almost_equal
# Assumed import; predict_quantile requires a MondrianForestRegressor
# variant that provides it (not every scikit-garden release does).
from skgarden import MondrianForestRegressor


def test_quantile_toy_data():
    rng = np.random.RandomState(1)
    x1 = rng.randn(1, 10)
    X1 = np.tile(x1, (10000, 1))
    x2 = 20.0 * rng.randn(1, 10)
    X2 = np.tile(x2, (10000, 1))
    X = np.concatenate((X1, X2))

    y1 = rng.randn(10000)
    y2 = 5.0 + rng.randn(10000)
    y = np.concatenate((y1, y2))

    est = MondrianForestRegressor(random_state=1)

    # est.set_params(max_depth=1)
    est.fit(X, y)
    for quantile in range(10, 90, 10):
        tree_quantile = 0.01 * quantile

        assert_array_almost_equal(
            est.predict_quantile(x1, quantile=tree_quantile),
            [np.percentile(y1, quantile)], 2)
        assert_array_almost_equal(
            est.predict_quantile(x2, quantile=tree_quantile),
            [np.percentile(y2, quantile)], 2)
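A minimal self-contained sketch of the same idea, assuming a MondrianForestRegressor variant that provides the predict_quantile method used above; the data here are illustrative:

import numpy as np
from skgarden import MondrianForestRegressor  # assumed to provide predict_quantile

rng = np.random.RandomState(0)
X = np.tile(rng.randn(1, 5), (1000, 1))  # one training point, repeated
y = rng.randn(1000)                      # noisy targets for that point

est = MondrianForestRegressor(random_state=0)
est.fit(X, y)
# The predicted median should sit close to the empirical median of y.
print(est.predict_quantile(X[:1], quantile=0.5), np.percentile(y, 50))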
Example #2
from sklearn.utils.testing import assert_true, assert_false  # pre-0.24 scikit-learn
from skgarden import MondrianForestRegressor, MondrianForestClassifier


def test_forest_attributes():
    mr = MondrianForestRegressor(n_estimators=5, random_state=0)
    mr.fit([[1, 2, 3], [4, 5, 6]], [1, 2])
    assert_false(hasattr(mr, "classes_"))
    assert_false(hasattr(mr, "n_classes_"))

    mr = MondrianForestClassifier(n_estimators=5, random_state=0)
    mr.fit([[1, 2, 3], [4, 5, 6]], [1, 2])
    assert_true(hasattr(mr, "classes_"))
    assert_true(hasattr(mr, "n_classes_"))
Example #3
def test_mean_std_forest_regressor():
    # X, y: module-level training data (features scaled to [-1, 1]).
    mfr = MondrianForestRegressor(random_state=0)
    mfr.fit(X, y)

    # For points in the training data, with max_depth=None, the mean
    # should converge to the actual target value and the standard
    # deviation to 0.0.
    mean, std = mfr.predict(X, return_std=True)
    assert_array_almost_equal(mean, y, 5)
    assert_array_almost_equal(std, 0.0, 2)

    # For points far away from the training data, the prediction
    # should converge to the empirical mean and standard deviation of y.
    # X is scaled between -1.0 and 1.0, so +/-30.0 is far outside it.
    X_inf = np.vstack(
        (30.0 * np.ones(X.shape[1]), -30.0 * np.ones(X.shape[1])))
    inf_mean, inf_std = mfr.predict(X_inf, return_std=True)
    assert_array_almost_equal(inf_mean, y.mean(), 1)
    assert_array_almost_equal(inf_std, y.std(), 2)
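A self-contained sketch of the same convergence behaviour, using the standard skgarden predict(X, return_std=True) API (the synthetic data are illustrative):

import numpy as np
from skgarden import MondrianForestRegressor

rng = np.random.RandomState(0)
X = rng.uniform(-1.0, 1.0, size=(500, 3))  # features scaled to [-1, 1]
y = X[:, 0] + 0.1 * rng.randn(500)

mfr = MondrianForestRegressor(random_state=0)
mfr.fit(X, y)

# In-sample the predictive std is small; far outside the training
# range it approaches the empirical std of y.
_, std_in = mfr.predict(X[:5], return_std=True)
_, std_out = mfr.predict(30.0 * np.ones((1, 3)), return_std=True)
print(std_in.mean(), std_out[0], y.std())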
Example #4
def test_interval_scorer():
    # Fit a simple linear model
    n_samples = 200
    n_features = 10
    rng = np.random.RandomState(0)
    X = rng.normal(size=(n_samples, n_features))
    w = rng.normal(size=n_features)
    # simple linear function without noise
    y = np.dot(X, w)

    mfr = MondrianForestRegressor()
    mfr.fit(X, y)
    # Create a scorer that measures the mean interval size
    # (IntervalScorer and mean_interval_size are defined elsewhere in
    # this codebase, alongside predict_interval)
    interval_size_scorer = IntervalScorer(mean_interval_size,
                                          sign=-1,
                                          kwargs={'confidence': 0.9})
    # Get prediction intervals
    intervals = mfr.predict_interval(X, 0.9)

    interval_size = intervals[:, 1] - intervals[:, 0]
    calc_mean = np.mean(interval_size)
    # Ensure the scorer performs the correct calculation
    assert_almost_equal(interval_size_scorer(mfr, X, y), -1 * calc_mean)
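The sign=-1 exists because scikit-learn's model-selection utilities assume "greater is better", so a size-like quantity has to be negated. Continuing from the example above, the scorer's value can be reproduced directly from predict_interval:

intervals = mfr.predict_interval(X, 0.9)  # shape (n_samples, 2): [lower, upper]
mean_width = (intervals[:, 1] - intervals[:, 0]).mean()
print(-mean_width)  # equals interval_size_scorer(mfr, X, y)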
Example #5
def test_boston():
    # X, y (the Boston housing data) and check_boston are defined at
    # module level in the test suite.
    mr = MondrianForestRegressor(n_estimators=5, random_state=0)
    mr.fit(X, y)
    check_boston(mr)
    mr.partial_fit(X, y)
    check_boston(mr)
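MondrianForestRegressor supports incremental learning through partial_fit, which is what this test exercises after the initial fit. A minimal sketch on synthetic data (batch sizes are illustrative):

import numpy as np
from skgarden import MondrianForestRegressor

rng = np.random.RandomState(0)
X = rng.randn(200, 4)
y = X[:, 0] + 0.1 * rng.randn(200)

mr = MondrianForestRegressor(n_estimators=5, random_state=0)
for start in range(0, 200, 50):  # feed the data in four batches
    mr.partial_fit(X[start:start + 50], y[start:start + 50])
print(mr.score(X, y))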
Example #6
def test_mean_std_forest_regressor():
    mfr = MondrianForestRegressor(random_state=0)
    mfr.fit(X, y)
    check_mean_std_forest_regressor(mfr)
    mfr.partial_fit(X, y)
    check_mean_std_forest_regressor(mfr)
Example #7
def test_boston():
    mr = MondrianForestRegressor(n_estimators=5, random_state=0)
    mr.fit(X, y)
    score = mr.score(X, y)
    assert_greater(score, 0.94, "Failed with score = %f" % score)
Example #8
import numpy as np
from sklearn.datasets import load_boston
X_train, y_train = load_boston(return_X_y=True)
# print(X_train)
print(X_train.shape)
print(np.amax(X_train))
print(np.amin(X_train))

### Use MondrianForests for variance estimation
from skgarden import MondrianForestRegressor
mfr = MondrianForestRegressor()
mfr.fit(X_train, y_train)
y_mean, y_std = mfr.predict(X_train, return_std=True)
print(y_mean)
#print(y_std)

### Use QuantileForests for quantile estimation
#from skgarden import RandomForestQuantileRegressor
#rfqr = RandomForestQuantileRegressor(random_state=0)
#rfqr.fit(X_train, y_train)
#y_mean = rfqr.predict(X_train)
#y_median = rfqr.predict(X_train, quantile=50)
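Enabled, the quantile section could look like the following sketch, using scikit-garden's RandomForestQuantileRegressor, whose predict accepts a quantile between 0 and 100:

from skgarden import RandomForestQuantileRegressor

rfqr = RandomForestQuantileRegressor(random_state=0)
rfqr.fit(X_train, y_train)
y_mean = rfqr.predict(X_train)                 # conditional mean
y_median = rfqr.predict(X_train, quantile=50)  # conditional median
print(y_median[:5])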
Example #9
def RF_regressor(X_data, Y_data, options=None):
    import numpy as np
    import pandas as pd
    from sklearn.ensemble import RandomForestRegressor

    ####################
    # Parse user options
    ####################
    params = {}
    gridsearch   = False
    GS_settings  = {}    # empty dict so **GS_settings is valid if no settings given
    randomsearch = False
    RS_settings  = {}    # likewise for **RS_settings
    feature_selection = False
    accuracy = False
    cv_type = 'logo'
    scoring = 'neg_mean_absolute_error'
    mondrian = False
    search_std = False

    if options is not None:

        if "RF_parameters" in options:
            params = options['RF_parameters']

        if "grid_search" in options:
            from sklearn.model_selection import GridSearchCV
            import time
            gridsearch = True
            GS_params = options['grid_search']['parameter_grid']
            if "settings" in options['grid_search']: GS_settings = options['grid_search']['settings']
            if "search std" in options['grid_search']:
                search_std = options['grid_search']['search std']

        if "random_search" in options:
            from sklearn.model_selection import RandomizedSearchCV
            from cfd2ml.utilities import convert_param_dist
            import time
            randomsearch = True
            RS_params, RS_Nmax = convert_param_dist(options['random_search']['parameter_grid'])
            print('RS_Nmax = ', RS_Nmax)
            if "settings" in options['random_search']: RS_settings = options['random_search']['settings']

        if randomsearch and gridsearch: quit('********** Stopping! grid_search and random_search both set *********')

        if "feature_selection" in options:
            from cfd2ml.utilities import RFE_perm
            feature_selection = True
            feats = options['feature_selection']['feats']
#            if("step"         in options['feature_selection']): step         = options['feature_selection']['step']
#            if("min_features" in options['feature_selection']): min_features = options['feature_selection']['min_features']
            if randomsearch or gridsearch: quit('******** Stopping! grid/random_search and feature selection both set ********')

        if "accuracy" in options:
            accuracy = options['accuracy']
            if accuracy:
                from sklearn.model_selection import cross_validate
                from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

        if "scoring" in options:
            scoring = options['scoring']

        if "cv_type" in options:
            cv_type = options['cv_type']

        if "mondrian" in options:
            mondrian = options['mondrian']
            if mondrian: from skgarden import MondrianForestRegressor

    ##############
    # Prepare data
    ##############
    if cv_type == 'logo': groups = X_data['group']
    X_data = X_data.drop(columns='group')

    # Find feature and target headers
    X_headers = X_data.columns
    Y_header  = Y_data.name

    nX = X_headers.size
    print('\nFeatures:')
    for i in range(0,nX):
        print('%d/%d: %s' %(i+1,nX,X_headers[i]) )
    print('\nTarget: ', Y_header)
  
    ########################
    # Prepare other settings
    ########################
    # Set cross-validation type (either leave-one-group-out or 10-fold)
    if cv_type == 'logo':
        from sklearn.model_selection import LeaveOneGroupOut
        logo = LeaveOneGroupOut()
        ngroup = logo.get_n_splits(groups=groups)
        print('\nUsing Leave-One-Group-Out cross validation on ', ngroup, ' groups')
    elif cv_type == 'kfold':
        from sklearn.model_selection import KFold
        print('\nUsing 10-fold cross validation')
        # KFold rather than StratifiedKFold: stratification is not
        # defined for a continuous regression target.
        k_fold = KFold(n_splits=10, random_state=42, shuffle=True)
        cv = k_fold.split(X_data, Y_data)

    #########################
    # Training the regressor
    #########################
    if gridsearch:
        # Finding optimal hyperparameters with GridSearchCV
        if mondrian:
            print('\n Performing GridSearchCV to find optimal hyperparameters for mondrian forest regressor')
            regr = MondrianForestRegressor(**params,random_state=42,bootstrap=False)
            if search_std: # MESSY HACK! Ignore "best model etc" if using this
                def my_scorer(model, X, y_true):
                    y_pred, y_sd = model.predict(X,return_std=True)
                    return np.mean(y_sd)
                scoring=my_scorer
        else:            
            print('\n Performing GridSearchCV to find optimal hyperparameters for random forest regressor')
            regr = RandomForestRegressor(**params,random_state=42)
        if cv_type == 'logo': cv = logo.split(X_data, Y_data, groups)
        GS_regr = GridSearchCV(estimator=regr, param_grid=GS_params, cv=cv, scoring=scoring, iid=False, verbose=2, **GS_settings)
        GS_regr.fit(X_data,Y_data)

        # Write out results to file
        scores_df = pd.DataFrame(GS_regr.cv_results_)#.sort_values(by='rank_test_score')
        scores_df.to_csv('GridSearch_results.csv')

        # Pick out best results
        best_params = GS_regr.best_params_
        best_score  = GS_regr.best_score_
        regr = GS_regr.best_estimator_  # (this regr has been fit to all of the X_data,Y_data)

        print('\nBest hyperparameters found:', best_params)
        print('\nScore with these hyperparameters:', best_score)

    elif randomsearch:
        # Finding optimal hyperparameters with RandomSearchCV
        if mondrian:
            print('\n Performing RandomizedSearchCV to find optimal hyperparameters for mondrian forest regressor')
            regr = MondrianForestRegressor(**params,random_state=42,bootstrap=False)
        else:            
            print('\n Performing RandomizedSearchCV to find optimal hyperparameters for random forest regressor')
            regr = RandomForestRegressor(**params,random_state=42)
        if cv_type == 'logo': cv = logo.split(X_data, Y_data, groups)
        RS_regr = RandomizedSearchCV(estimator=regr, param_distributions=RS_params, cv=cv, scoring=scoring, iid=False, verbose=2, error_score=np.nan, **RS_settings)
        RS_regr.fit(X_data,Y_data)
        
        # Write out results to file
        scores_df = pd.DataFrame(RS_regr.cv_results_)#.sort_values(by='rank_test_score')
        scores_df.to_csv('RandomSearch_results.csv')

        # Pick out best results
        best_params = RS_regr.best_params_
        best_score  = RS_regr.best_score_
        regr = RS_regr.best_estimator_  # (this regr has been fit to all of the X_data,Y_data)

        print('\nBest hyperparameters found:', best_params)
        print('\nScore with these hyperparameters:', best_score)


    else:
        # Train RF regressor with hyperparameters given by user
        if mondrian:
            print('\nTraining mondrian forest regressor with given hyperparameters')
            regr = MondrianForestRegressor(**params,bootstrap=False)
        else:            
            print('\nTraining random forest regressor with given hyperparameters')
            regr = RandomForestRegressor(**params)

        # Feature selection before final fit
        if feature_selection:
            if cv_type == 'logo': cv = logo.split(X_data, Y_data, groups)
#            [nfeats,scores,traintimes,predtimes], bestscore, bestfeat, featsets = RFE_perm(regr,X_data,Y_data,cv=cv,scoring=scoring,step=step,min_features=min_features,timing=True)
            [nfeats,scores,traintimes,predtimes], bestscore, bestfeat, featsets = RFE_perm(regr,X_data,Y_data,feats,cv=cv,scoring=scoring,timing=True)

            if scoring == 'neg_mean_absolute_error':
                scores = -scores
                bestscore = -bestscore
            elif scoring == 'neg_mean_squared_error':
                scores = np.sqrt(-scores)
                bestscore = np.sqrt(-bestscore)
            import matplotlib.pyplot as plt
            plt.figure()
            plt.plot(nfeats,100*scores,lw=2)
            plt.xlabel('$N_{features}$')
            plt.ylabel('Score (%)')
            plt.figure()
            plt.plot(nfeats,traintimes,label='Training',lw=2)
            plt.plot(nfeats, predtimes,label='Prediction',lw=2)
            plt.xlabel('$N_{features}$')
            plt.ylabel('Time (s)')
            plt.legend()
            plt.show()

            print('Best score: %.2f' %(100*bestscore))
            print('Feature set:')
            print(X_headers[bestfeat])

            # Save results in CSV file
            featselect_df = pd.DataFrame(featsets,columns=X_headers)
            featselect_df['score'] = scores
            featselect_df['traintimes'] = traintimes
            featselect_df['predtimes'] = predtimes
            featselect_df['nfeats'] = nfeats
            featselect_df.to_csv('FeatSelect_results.csv')

            # cut down to optimal feature set
            X_data = X_data.iloc[:,bestfeat]

        # Fit model to data
        regr.fit(X_data,Y_data)

    # Cross validation accuracy metrics
    if accuracy:
        print('\nPerforming cross validation to determine train and test accuracy/error')

        # Get generator object depending on cv strategy
        if cv_type == 'logo':
            cv = logo.split(X_data, Y_data, groups)
        elif cv_type == 'kfold':
            cv = k_fold.split(X_data, Y_data)  # need to regenerate the generator object

        from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

        # Init lists
        train_r2  = []
        test_r2   = []
        train_MAE = []
        test_MAE  = []
        train_MSE = []
        test_MSE  = []

        # Loop through CV folds
        i = 0
        for train_index, test_index in cv:
            X_train, X_test = X_data.iloc[train_index], X_data.iloc[test_index]
            Y_train, Y_test = Y_data.iloc[train_index], Y_data.iloc[test_index]

            # Train a fresh clone of the regressor for this fold (cloning
            # prevents the per-fold refits from overwriting the model
            # fitted on the full data above)
            from sklearn.base import clone
            regr_cv = clone(regr)
            regr_cv.fit(X_train, Y_train)

            # Predict Y
            Y_pred_train = regr_cv.predict(X_train)
            Y_pred_test  = regr_cv.predict(X_test )

            # r2 scores
            r2score = r2_score(Y_test , Y_pred_test)
            train_r2.append(r2_score(Y_train, Y_pred_train) )
            test_r2.append(r2score)
            # Mean absolute error scores
            MAEscore = mean_absolute_error(Y_test , Y_pred_test)
            train_MAE.append(mean_absolute_error(Y_train, Y_pred_train) )
            test_MAE.append(MAEscore)
            # Mean squared error scores
            MSEscore = mean_squared_error(Y_test , Y_pred_test)
            train_MSE.append(mean_squared_error(Y_train, Y_pred_train) )
            test_MSE.append(MSEscore)

            # Print validation scores (training scores are stored to print mean later, but not printed for each fold)
            if cv_type == 'logo':
                print('\nTest group = ', groups.iloc[test_index[0]])
            elif cv_type == 'kfold':
                print('\nFold = ', i)
            print('-------------------')
            print('r2 score = %.2f %%' %(r2score*100) )
            print('Mean absolute error = %.2f %%' %(MAEscore*100) )
            print('Mean squared error = %.2f %%' %(MSEscore*100) )

            i += 1

        # Print performance scores
        print('\nMean training scores:')
        print('r2 score = %.2f %%' %(np.mean(train_r2)*100) )
        print('Mean absolute error = %.2f %%' %(np.mean(train_MAE)*100) )
        print('Mean squared error = %.2f %%' %(np.mean(train_MSE)*100) )
    
        print('\nMean validation scores:')
        print('r2 score = %.2f %%' %(np.mean(test_r2)*100) )
        print('Mean absolute error = %.2f %%' %(np.mean(test_MAE)*100) )
        print('Mean squared error = %.2f %%' %(np.mean(test_MSE)*100) )
        

    return regr
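A sketch of how this function might be called; the option keys match those parsed above, while the data and parameter values are illustrative:

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
X_data = pd.DataFrame(rng.randn(100, 3), columns=['f1', 'f2', 'f3'])
X_data['group'] = np.repeat([0, 1, 2, 3], 25)  # required for cv_type='logo'
Y_data = pd.Series(X_data['f1'] + 0.1 * rng.randn(100), name='target')

options = {
    'RF_parameters': {'n_estimators': 50},
    'cv_type': 'logo',
    'scoring': 'neg_mean_absolute_error',
    'accuracy': True,
    'mondrian': False,
}
regr = RF_regressor(X_data, Y_data, options=options)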