def test_quantile_toy_data():
    rng = np.random.RandomState(1)
    x1 = rng.randn(1, 10)
    X1 = np.tile(x1, (10000, 1))
    x2 = 20.0 * rng.randn(1, 10)
    X2 = np.tile(x2, (10000, 1))
    X = np.concatenate((X1, X2))

    y1 = rng.randn(10000)
    y2 = 5.0 + rng.randn(10000)
    y = np.concatenate((y1, y2))

    est = MondrianForestRegressor(random_state=1)
    # est.set_params(max_depth=1)
    est.fit(X, y)

    for quantile in range(10, 90, 10):
        tree_quantile = 0.01 * quantile
        assert_array_almost_equal(
            est.predict_quantile(x1, quantile=tree_quantile),
            [np.percentile(y1, quantile)], 2)
        assert_array_almost_equal(
            est.predict_quantile(x2, quantile=tree_quantile),
            [np.percentile(y2, quantile)], 2)
def test_forest_attributes():
    mr = MondrianForestRegressor(n_estimators=5, random_state=0)
    mr.fit([[1, 2, 3], [4, 5, 6]], [1, 2])
    assert_false(hasattr(mr, "classes_"))
    assert_false(hasattr(mr, "n_classes_"))

    mr = MondrianForestClassifier(n_estimators=5, random_state=0)
    mr.fit([[1, 2, 3], [4, 5, 6]], [1, 2])
    assert_true(hasattr(mr, "classes_"))
    assert_true(hasattr(mr, "n_classes_"))
def test_mean_std_forest_regressor():
    mfr = MondrianForestRegressor(random_state=0)
    mfr.fit(X, y)

    # For points contained in the training data (with max_depth=None),
    # the predicted mean should converge to the actual target value and
    # the predicted standard deviation should converge to 0.0.
    mean, std = mfr.predict(X, return_std=True)
    assert_array_almost_equal(mean, y, 5)
    assert_array_almost_equal(std, 0.0, 2)

    # For points far away from the training data, the predictions should
    # converge to the empirical mean and standard deviation of y.
    # X is scaled between -1.0 and 1.0.
    X_inf = np.vstack((30.0 * np.ones(X.shape[1]),
                       -30.0 * np.ones(X.shape[1])))
    inf_mean, inf_std = mfr.predict(X_inf, return_std=True)
    assert_array_almost_equal(inf_mean, y.mean(), 1)
    assert_array_almost_equal(inf_std, y.std(), 2)
def test_interval_scorer():
    # Fit a simple linear model
    n_samples = 200
    n_features = 10
    rng = np.random.RandomState(0)
    X = rng.normal(size=(n_samples, n_features))
    w = rng.normal(size=n_features)
    # simple linear function without noise
    y = np.dot(X, w)

    mfr = MondrianForestRegressor()
    mfr.fit(X, y)

    # Create a scorer that measures the mean interval size
    interval_size_scorer = IntervalScorer(
        mean_interval_size, sign=-1, kwargs={'confidence': 0.9})

    # Get prediction intervals
    intervals = mfr.predict_interval(X, 0.9)
    interval_size = intervals[:, 1] - intervals[:, 0]
    calc_mean = np.mean(interval_size)

    # Ensure the scorer performs the correct calculation
    assert_almost_equal(interval_size_scorer(mfr, X, y), -1 * calc_mean)
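Since the scorer is invoked as scorer(estimator, X, y), it follows scikit-learn's scorer-callable convention, so it can in principle drive hyperparameter search. Below is a minimal sketch, not part of the test suite, reusing the fork's IntervalScorer/mean_interval_size pair and the X, y defined in the test above; whether every sklearn version accepts this exact setup is an assumption.

# A minimal sketch, assuming IntervalScorer behaves as in the test above
# (callable with the (estimator, X, y) signature that scikit-learn accepts
# for `scoring`). Parameter values are illustrative only.
from sklearn.model_selection import GridSearchCV

interval_scorer = IntervalScorer(mean_interval_size, sign=-1,
                                 kwargs={'confidence': 0.9})
search = GridSearchCV(MondrianForestRegressor(random_state=0),
                      param_grid={'n_estimators': [5, 10, 25]},
                      scoring=interval_scorer, cv=3)
search.fit(X, y)  # best_estimator_ has the narrowest mean 90% interval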
def test_boston():
    mr = MondrianForestRegressor(n_estimators=5, random_state=0)
    mr.fit(X, y)
    check_boston(mr)
    mr.partial_fit(X, y)
    check_boston(mr)
def test_mean_std_forest_regressor():
    mfr = MondrianForestRegressor(random_state=0)
    mfr.fit(X, y)
    check_mean_std_forest_regressor(mfr)
    mfr.partial_fit(X, y)
    check_mean_std_forest_regressor(mfr)
def test_boston():
    mr = MondrianForestRegressor(n_estimators=5, random_state=0)
    mr.fit(X, y)
    score = mr.score(X, y)
    assert_greater(score, 0.94, "Failed with score = %f" % score)
import numpy as np
from sklearn.datasets import load_boston

# load_boston(return_X_y=True) returns an (X, y) tuple
X_train, y_train = load_boston(return_X_y=True)
print(X_train.shape)
print(np.amax(X_train))
print(np.amin(X_train))

### Use MondrianForests for variance estimation
from skgarden import MondrianForestRegressor

mfr = MondrianForestRegressor()
mfr.fit(X_train, y_train)
y_mean, y_std = mfr.predict(X_train, return_std=True)
print(y_mean)
# print(y_std)

### Use QuantileForests for quantile estimation
# from skgarden import RandomForestQuantileRegressor
# rfqr = RandomForestQuantileRegressor(random_state=0)
# rfqr.fit(X, y)
# y_mean = rfqr.predict(X)
# y_median = rfqr.predict(X, 50)
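The commented-out quantile block above references X and y, which this script never defines. A minimal working sketch of the same idea, using the X_train/y_train loaded earlier and assuming skgarden's RandomForestQuantileRegressor API:

# A minimal sketch of the quantile example above, substituting the
# X_train/y_train loaded earlier for the undefined X/y.
from skgarden import RandomForestQuantileRegressor

rfqr = RandomForestQuantileRegressor(random_state=0)
rfqr.fit(X_train, y_train)
y_mean = rfqr.predict(X_train)                 # mean prediction
y_median = rfqr.predict(X_train, quantile=50)  # 50th percentile
print(y_median)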
import numpy as np
import pandas as pd

def RF_regressor(X_data, Y_data, options=None):
    from sklearn.ensemble import RandomForestRegressor

    ####################
    # Parse user options
    ####################
    params = {}
    gridsearch = False
    GS_settings = {}   # default to empty dict so **GS_settings is always valid
    randomsearch = False
    RS_settings = {}   # likewise for **RS_settings
    feature_selection = False
    accuracy = False
    cv_type = 'logo'
    scoring = 'neg_mean_absolute_error'
    mondrian = False
    search_std = False

    if options is not None:
        if "RF_parameters" in options:
            params = options['RF_parameters']

        if "grid_search" in options:
            from sklearn.model_selection import GridSearchCV
            import time
            gridsearch = True
            GS_params = options['grid_search']['parameter_grid']
            if "settings" in options['grid_search']:
                GS_settings = options['grid_search']['settings']
            if "search std" in options['grid_search']:
                search_std = options['grid_search']['search std']

        if "random_search" in options:
            from sklearn.model_selection import RandomizedSearchCV
            from cfd2ml.utilities import convert_param_dist
            import time
            randomsearch = True
            RS_params, RS_Nmax = convert_param_dist(
                options['random_search']['parameter_grid'])
            print('RS_Nmax = ', RS_Nmax)
            if "settings" in options['random_search']:
                RS_settings = options['random_search']['settings']

        if randomsearch and gridsearch:
            quit('********** Stopping! grid_search and random_search both set *********')

        if "feature_selection" in options:
            from cfd2ml.utilities import RFE_perm
            feature_selection = True
            feats = options['feature_selection']['feats']
            # if "step" in options['feature_selection']:
            #     step = options['feature_selection']['step']
            # if "min_features" in options['feature_selection']:
            #     min_features = options['feature_selection']['min_features']
            if randomsearch or gridsearch:
                quit('******** Stopping! grid/random_search and feature selection both set ********')

        if "accuracy" in options:
            accuracy = options['accuracy']
            if accuracy:
                from sklearn.model_selection import cross_validate
                from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

        if "scoring" in options:
            scoring = options['scoring']

        if "cv_type" in options:
            cv_type = options['cv_type']

        if "mondrian" in options:
            mondrian = options['mondrian']
            if mondrian:
                from skgarden import MondrianForestRegressor

    ##############
    # Prepare data
    ##############
    if cv_type == 'logo':
        groups = X_data['group']
        X_data = X_data.drop(columns='group')

    # Find feature and target headers
    X_headers = X_data.columns
    Y_header = Y_data.name
    nX = X_headers.size

    print('\nFeatures:')
    for i in range(0, nX):
        print('%d/%d: %s' % (i + 1, nX, X_headers[i]))
    print('\nTarget: ', Y_header)

    ########################
    # Prepare other settings
    ########################
    # Set cross-validation type (either leave-one-group-out or 10-fold)
    if cv_type == 'logo':
        from sklearn.model_selection import LeaveOneGroupOut
        logo = LeaveOneGroupOut()
        ngroup = logo.get_n_splits(groups=groups)
        print('\nUsing Leave-One-Group-Out cross validation on ', ngroup, ' groups')
    elif cv_type == 'kfold':
        from sklearn.model_selection import StratifiedKFold
        print('\nUsing 10-fold cross validation')
        k_fold = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
        cv = k_fold.split(X_data, Y_data)

    ########################
    # Training the regressor
    ########################
    if gridsearch:
        # Find optimal hyperparameters with GridSearchCV
        if mondrian:
            print('\n Performing GridSearchCV to find optimal hyperparameters for mondrian forest regressor')
            regr = MondrianForestRegressor(**params, random_state=42, bootstrap=False)
            if search_std:
                # MESSY HACK! Ignore "best model" etc. if using this:
                # score on the mean predictive standard deviation instead.
                def my_scorer(model, X, y_true):
                    y_pred, y_sd = model.predict(X, return_std=True)
                    return np.mean(y_sd)
                scoring = my_scorer
        else:
            print('\n Performing GridSearchCV to find optimal hyperparameters for random forest regressor')
            regr = RandomForestRegressor(**params, random_state=42)

        if cv_type == 'logo':
            cv = logo.split(X_data, Y_data, groups)
        GS_regr = GridSearchCV(estimator=regr, param_grid=GS_params, cv=cv,
                               scoring=scoring, iid=False, verbose=2, **GS_settings)
        GS_regr.fit(X_data, Y_data)

        # Write out results to file
        scores_df = pd.DataFrame(GS_regr.cv_results_)  # .sort_values(by='rank_test_score')
        scores_df.to_csv('GridSearch_results.csv')

        # Pick out best results
        best_params = GS_regr.best_params_
        best_score = GS_regr.best_score_
        regr = GS_regr.best_estimator_  # (this regr has been fit to all of the X_data, Y_data)
        print('\nBest hyperparameters found:', best_params)
        print('\nScore with these hyperparameters:', best_score)

    elif randomsearch:
        # Find optimal hyperparameters with RandomizedSearchCV
        if mondrian:
            print('\n Performing RandomizedSearchCV to find optimal hyperparameters for mondrian forest regressor')
            regr = MondrianForestRegressor(**params, random_state=42, bootstrap=False)
        else:
            print('\n Performing RandomizedSearchCV to find optimal hyperparameters for random forest regressor')
            regr = RandomForestRegressor(**params, random_state=42)

        if cv_type == 'logo':
            cv = logo.split(X_data, Y_data, groups)
        RS_regr = RandomizedSearchCV(estimator=regr, param_distributions=RS_params, cv=cv,
                                     scoring=scoring, iid=False, verbose=2,
                                     error_score=np.nan, **RS_settings)
        RS_regr.fit(X_data, Y_data)

        # Write out results to file
        scores_df = pd.DataFrame(RS_regr.cv_results_)  # .sort_values(by='rank_test_score')
        scores_df.to_csv('RandomSearch_results.csv')

        # Pick out best results
        best_params = RS_regr.best_params_
        best_score = RS_regr.best_score_
        regr = RS_regr.best_estimator_  # (this regr has been fit to all of the X_data, Y_data)
        print('\nBest hyperparameters found:', best_params)
        print('\nScore with these hyperparameters:', best_score)

    else:
        # Train RF regressor with hyperparameters given by user
        if mondrian:
            print('\nTraining mondrian forest regressor with given hyperparameters')
            regr = MondrianForestRegressor(**params, bootstrap=False)
        else:
            print('\nTraining random forest regressor with given hyperparameters')
            regr = RandomForestRegressor(**params)

        # Feature selection before final fit
        if feature_selection:
            if cv_type == 'logo':
                cv = logo.split(X_data, Y_data, groups)
            # [nfeats, scores, traintimes, predtimes], bestscore, bestfeat, featsets = RFE_perm(
            #     regr, X_data, Y_data, cv=cv, scoring=scoring, step=step,
            #     min_features=min_features, timing=True)
            [nfeats, scores, traintimes, predtimes], bestscore, bestfeat, featsets = RFE_perm(
                regr, X_data, Y_data, feats, cv=cv, scoring=scoring, timing=True)

            if scoring == 'neg_mean_absolute_error':
                scores = -scores
                bestscore = -bestscore
            elif scoring == 'neg_mean_squared_error':
                scores = np.sqrt(-scores)
                bestscore = np.sqrt(-bestscore)

            import matplotlib.pyplot as plt
            plt.figure()
            plt.plot(nfeats, 100 * scores, lw=2)
            plt.xlabel('$N_{features}$')
            plt.ylabel('Score (%)')
            plt.figure()
            plt.plot(nfeats, traintimes, label='Training', lw=2)
            plt.plot(nfeats, predtimes, label='Prediction', lw=2)
            plt.xlabel('$N_{features}$')
            plt.ylabel('Time (s)')
            plt.legend()
            plt.show()

            print('Best score: %.2f' % (100 * bestscore))
            print('Feature set:')
            print(X_headers[bestfeat])

            # Save results in CSV file
            featselect_df = pd.DataFrame(featsets, columns=X_headers)
            featselect_df['score'] = scores
            featselect_df['traintimes'] = traintimes
            featselect_df['predtimes'] = predtimes
            featselect_df['nfeats'] = nfeats
            featselect_df.to_csv('FeatSelect_results.csv')

            # Cut down to the optimal feature set
            X_data = X_data.iloc[:, bestfeat]

        # Fit model to data
        regr.fit(X_data, Y_data)

    # Cross-validation accuracy metrics
    if accuracy:
        print('\nPerforming cross validation to determine train and test accuracy/error')

        # Get generator object depending on cv strategy
        if cv_type == 'logo':
            cv = logo.split(X_data, Y_data, groups)
        elif cv_type == 'kfold':
            cv = k_fold.split(X_data, Y_data)  # need to regenerate the generator object

        from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

        # Init lists
        train_r2 = []
        test_r2 = []
        train_MAE = []
        test_MAE = []
        train_MSE = []
        test_MSE = []

        # Loop through CV folds
        i = 0
        for train_index, test_index in cv:
            X_train, X_test = X_data.iloc[train_index], X_data.iloc[test_index]
            Y_train, Y_test = Y_data.iloc[train_index], Y_data.iloc[test_index]

            # Train regressor
            regr_cv = regr
            regr_cv.fit(X_train, Y_train)

            # Predict Y
            Y_pred_train = regr_cv.predict(X_train)
            Y_pred_test = regr_cv.predict(X_test)

            # r2 scores
            r2score = r2_score(Y_test, Y_pred_test)
            train_r2.append(r2_score(Y_train, Y_pred_train))
            test_r2.append(r2score)

            # Mean absolute error scores
            MAEscore = mean_absolute_error(Y_test, Y_pred_test)
            train_MAE.append(mean_absolute_error(Y_train, Y_pred_train))
            test_MAE.append(MAEscore)

            # Mean squared error scores
            MSEscore = mean_squared_error(Y_test, Y_pred_test)
            train_MSE.append(mean_squared_error(Y_train, Y_pred_train))
            test_MSE.append(MSEscore)

            # Print validation scores (training scores are stored so the mean
            # can be printed later, but are not printed per fold)
            if cv_type == 'logo':
                print('\nTest group = ', groups.iloc[test_index[0]])
            elif cv_type == 'kfold':
                print('\nFold = ', i)
            print('-------------------')
            print('r2 score = %.2f %%' % (r2score * 100))
            print('Mean absolute error = %.2f %%' % (MAEscore * 100))
            print('Mean squared error = %.2f %%' % (MSEscore * 100))
            i += 1

        # Print performance scores
        print('\nMean training scores:')
        print('r2 score = %.2f %%' % (np.mean(train_r2) * 100))
        print('Mean absolute error = %.2f %%' % (np.mean(train_MAE) * 100))
        print('Mean squared error = %.2f %%' % (np.mean(train_MSE) * 100))

        print('\nMean validation scores:')
        print('r2 score = %.2f %%' % (np.mean(test_r2) * 100))
        print('Mean absolute error = %.2f %%' % (np.mean(test_MAE) * 100))
        print('Mean squared error = %.2f %%' % (np.mean(test_MSE) * 100))

    return regr
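A hypothetical call sketch follows. The option keys match those parsed by RF_regressor above, but the data, column names, and parameter values are made up for illustration; real cfd2ml inputs would come from its own pipeline.

# Hypothetical usage sketch: synthetic data, illustrative options.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
n = 120
X_data = pd.DataFrame({
    'feat1': rng.randn(n),
    'feat2': rng.randn(n),
    # a 'group' column is required when cv_type='logo'
    'group': np.repeat(['caseA', 'caseB', 'caseC'], n // 3),
})
Y_data = pd.Series(2.0 * X_data['feat1'] + 0.1 * rng.randn(n), name='target')

options = {
    'RF_parameters': {'n_estimators': 50},
    'mondrian': True,    # MondrianForestRegressor instead of RandomForestRegressor
    'cv_type': 'logo',   # leave-one-group-out on the 'group' column
    'accuracy': True,    # report per-fold train/test metrics
}
regr = RF_regressor(X_data, Y_data, options=options)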