def __init__(self, params):
     super(%CLASS%, self).__init__(params)  # %CLASS% is a code-generator placeholder for the concrete node class
     tmp = RandomForestRegressor()
     params = tmp.get_params()
     for key in params:
         self.create_new_input(type_="data", label=key, widget_name="std line edit m", widget_pos="besides", pos=-1)
     del tmp
def get_feat_imps():

    X_train, X_test, y_train, y_test = data_for_gridsearch()
    column_names = X_train.columns

    model = RFR(max_features='auto',
                max_depth=None,
                bootstrap=True,
                min_samples_leaf=5,
                min_samples_split=10,
                n_estimators=100)

    model = model.fit(X_train, y_train)

    model_params = model.get_params()
    feat_imps = model.feature_importances_

    print('model_params', model_params)
    print('feat_imps', feat_imps)

    rmse_train, rmse_test, errors_for_plot = eval_model(
        model, X_train, y_train, X_test, y_test)
    print('RMSE train/test: ', rmse_train, rmse_test)

    return model_params, feat_imps, column_names
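# eval_model and data_for_gridsearch are project helpers not shown above. A minimal
# sketch of what eval_model might compute, assuming RMSE on both splits plus the
# test-set residuals (the helper names come from the snippet; this body is an assumption):
import numpy as np
from sklearn.metrics import mean_squared_error

def eval_model(model, X_train, y_train, X_test, y_test):
    rmse_train = np.sqrt(mean_squared_error(y_train, model.predict(X_train)))
    y_pred = model.predict(X_test)
    rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
    errors_for_plot = y_test - y_pred  # residuals for a diagnostic plot
    return rmse_train, rmse_test, errors_for_plot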
Example #3
 def createModelRFC(self):
     logger.info("DEFINITION OF THE MODEL RFC")
     # model = ExtraTreesClassifier(n_estimators=self.model_param['n_estimators'],max_features=self.model_param['max_features'],n_jobs=-1)
     # model = RandomForestClassifier(n_jobs=-1),
     model = RandomForestRegressor(n_jobs=-1)  # note: despite the "RFC" name, this builds a regressor
     logger.info("MODEL PARAMS: %s", model.get_params(deep=True))
     return model
Example #4
 def __init__(self, params):
     super(RFRGetParams_NodeInstance, self).__init__(params)
     tmp = RandomForestRegressor()
     params = tmp.get_params()
     for key in params:
         self.create_new_output(type_="data", label=key, pos=-1)
     del tmp
     self.create_new_output(type_="data", label="param dict", pos=-1)
def main():

    ## Read in csv and correct formatting that was lost in transition
    mydf = read_data_csv()

    #eliminate rows that have 1 or more missing values
    mydf = mydf.dropna(axis=0)
    #convert region to something numerical
    numeric_regions = {
        'Africa': 1,
        'Asia': 2,
        'Central America/ Caribbean': 3,
    }
    mydf['region_num'] = mydf['region'].map(numeric_regions)
    
    ###

    predictor_names = ['week_day_num_posted','day_posted','maleness','region_num', \
                      'treat_cost','patient_age','smile_scale']
    numfeat = len(predictor_names)
    Y = mydf.dollars_per_day #variable to predict
    X = mydf[predictor_names]
    
    #Build classifier using "best" random forest
    nfolds = 3 #number of folds to use for cross-validation
    #n_estimators is number of trees in forest
    #max_features is the number of features to consider when looking for best split
    parameters = {'n_estimators':[10,100,1000],  'max_features':[3,5,7]} # rf parameters to try
    njobs = 1 #number of jobs to run in parallel -- pickle problems may occur if njobs > 1
    rf_tune = grid_search.GridSearchCV(RandomForestRegressor(), parameters,
                             n_jobs = njobs, cv = nfolds)
    rf_opt = rf_tune.fit(X,Y)
    
    #Results of the grid search for optimal random forest parameters.
    print("Grid of scores:\n" + str(rf_opt.grid_scores_) + "\n")
    print("Best zero-one score: " + str(rf_opt.best_score_) + "\n")
    print("Optimal Model:\n" + str(rf_opt.best_estimator_) + "\n")
    #print "Parameters of random forest:\n " , rf_opt.get_params()

    #save optimal random forest regressor for future
    #mypickledRF = open('RF_Regressor' , 'wb') #w is for write; b is for binary
    #pickle.dump(rf_opt.best_estimator_ , mypickledRF) #Save classifier in file "RFclassifier"
    #mypickledRF.close()

    #Now use the optimal model's parameters to run random forest
    #(I couldn't get feature importances directly from the GridSearchCV fit)
    crf = RandomForestRegressor(n_jobs=njobs, max_features=3, n_estimators=1000).fit(X,Y)
    print("Parameters used in chosen RF model:\n ", crf.get_params())

    plotting_names = np.array(('Day','Date','Sex','Region','Cost','Age','Smile'))
    print(crf.feature_importances_)
    indices = np.argsort(crf.feature_importances_)[::-1][:numfeat]
    plt.bar(range(numfeat), crf.feature_importances_[indices], align='center', alpha=.5)
    plt.xticks(range(numfeat), plotting_names[indices], rotation='horizontal', fontsize=12)
    plt.xlim([-1, numfeat])
    plt.ylabel('Feature importances', fontsize=12)
    plt.title('Feature importances computed by Random Forest', fontsize=16)
    plt.savefig('03_feature_importance.png', dpi=150);
Example #6
def search_bestparam_RandomForestRegressor(X_train, y_train, df_search_best_param):
    print(f"Search best params for RandomForestRegressor ...")
    model = RandomForestRegressor()
    print("Supported params", model.get_params())
    param_grid = {
              'n_estimators': [500, 700, 1000],
              'max_depth': [None, 1, 2, 3],
              'min_samples_split': [2, 3, 4]  # must be >= 2 (or a float fraction of samples)
      }
    search_bestparam(model, param_grid, X_train, y_train, df_search_best_param)
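# search_bestparam is defined elsewhere in the project. A plausible sketch, assuming it
# wraps GridSearchCV and records the winner in the passed DataFrame (column names assumed):
from sklearn.model_selection import GridSearchCV

def search_bestparam(model, param_grid, X_train, y_train, df_search_best_param):
    grid = GridSearchCV(model, param_grid, cv=5, n_jobs=-1)
    grid.fit(X_train, y_train)
    name = type(model).__name__
    df_search_best_param.loc[name, 'best_score'] = grid.best_score_
    df_search_best_param.loc[name, 'best_params'] = str(grid.best_params_)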
Example #7
def build_model(df):
    X = df.iloc[:, :-1]  # Using all column except for the last column as X
    y = df.iloc[:, -1]  # Selecting the last column as y

    st.markdown('**1.2. Data splits**')
    st.write('Training set')
    st.info(X.shape)
    st.write('Test set')
    st.info(y.shape)

    st.markdown('**1.3. Variable details**:')
    st.write('X variable')
    st.info(list(X.columns))
    st.write('y variable')
    st.info(y.name)

    # Data splitting
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=split_size)

    rf = RandomForestRegressor(n_estimators=parameter_n_estimators,
                               max_features=parameter_max_features,
                               random_state=parameter_random_state,
                               criterion=parameter_criterion,
                               min_samples_split=parameter_min_samples_split,
                               min_samples_leaf=parameter_min_samples_leaf,
                               bootstrap=parameter_bootstrap,
                               oob_score=parameter_oob_score,
                               n_jobs=parameter_n_jobs)

    rf.fit(X_train, y_train)

    st.subheader('2. Model Performance')

    st.markdown('**2.1. Training set**')
    y_pred_train = rf.predict(X_train)
    st.write('Coefficient of determination ($R^2$):')
    st.info(r2_score(y_train, y_pred_train))

    st.write('Error (MSE or MAE):')
    st.info(mean_squared_error(y_train, y_pred_train))

    st.markdown('**2.2. Test set**')
    y_pred_test = rf.predict(X_test)
    st.write('Coefficient of determination ($R^2$):')
    st.info(r2_score(y_test, y_pred_test))

    st.write('Error (MSE or MAE):')
    st.info(mean_squared_error(y_test, y_pred_test))

    st.subheader('3. Model Parameters')
    st.write(rf.get_params())
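# The parameter_* names and split_size above are set elsewhere in the app. A minimal
# sketch of how they might be collected from the Streamlit sidebar (widget choices
# and value ranges here are assumptions):
split_size = st.sidebar.slider('Test split ratio', 0.1, 0.5, 0.2)
parameter_n_estimators = st.sidebar.slider('n_estimators', 10, 1000, 100)
parameter_max_features = st.sidebar.selectbox('max_features', ['sqrt', 'log2', None])
parameter_random_state = st.sidebar.number_input('random_state', value=42)
parameter_criterion = st.sidebar.selectbox('criterion', ['squared_error', 'absolute_error'])
parameter_min_samples_split = st.sidebar.slider('min_samples_split', 2, 10, 2)
parameter_min_samples_leaf = st.sidebar.slider('min_samples_leaf', 1, 10, 1)
parameter_bootstrap = st.sidebar.selectbox('bootstrap', [True, False])
parameter_oob_score = st.sidebar.selectbox('oob_score', [False, True])
parameter_n_jobs = st.sidebar.selectbox('n_jobs', [1, -1])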
Example #8
class RandomForestModel:
    def __init__(self):
        self.regressor = RandomForestRegressor(n_estimators=1000, max_depth=10, random_state=0)

    def get_params(self):
        return self.regressor.get_params()

    def train(self, X, y):
        self.regressor.fit(X, y)

    def predict(self, X):
        return self.regressor.predict(X.values.reshape(1, -1))
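# A brief usage sketch for the wrapper above (synthetic data; note that predict()
# reshapes its argument, so it expects a single sample such as one pandas row):
import numpy as np
import pandas as pd

X = pd.DataFrame(np.random.rand(100, 3), columns=['a', 'b', 'c'])
y = X['a'] * 2 + X['b']
m = RandomForestModel()
m.train(X, y)
print(m.get_params()['n_estimators'])  # -> 1000
print(m.predict(X.iloc[0]))            # prediction for a single sample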
Example #9
    def test_parameters(self):
        """ Testing parameters of Model class. """
#1.)
        #create an instance of each model using the Model class and a matching instance
        #   using the sklearn library, then check that both expose the same parameter keys
        pls_parameters = {"n_components": 20, "scale": False, "max_iter": 200}
        model = Model(algorithm="PlsRegression", parameters=pls_parameters)
        pls_model = PLSRegression(n_components=20, scale="svd", max_iter=200)

        for k, v in model.model.get_params().items():
            self.assertIn(k, list(pls_model.get_params()))
#2.)
        rf_parameters = {"n_estimators": 200, "max_depth": 50,"min_samples_split": 10}
        model = Model(algorithm="RandomForest", parameters=rf_parameters)
        rf_model = RandomForestRegressor(n_estimators=200, max_depth=50, min_samples_split=10)

        for k, v in model.model.get_params().items():
            self.assertIn(k, list(rf_model.get_params()))
#3.)
        knn_parameters = {"n_neighbors": 10, "weights": "distance", "algorithm": "ball_tree"}
        model = Model(algorithm="KNN", parameters=knn_parameters)
        knn_model = KNeighborsRegressor(n_neighbors=10, weights='distance', algorithm="ball_tree")

        for k, v in model.model.get_params().items():
            self.assertIn(k, list(knn_model.get_params()))
#4.)
        svr_parameters = {"kernel": "poly", "degree": 5, "coef0": 1}
        model = Model(algorithm="SVR",parameters=svr_parameters)
        svr_model = SVR(kernel='poly', degree=5, coef0=1)

        for k, v in model.model.get_params().items():
            self.assertIn(k, list(svr_model.get_params()))
#5.)
        ada_parameters = {"n_estimators": 150, "learning_rate": 1.2, "loss": "square"}
        model = Model(algorithm="AdaBoost", parameters=ada_parameters)
        ada_model = AdaBoostRegressor(n_estimators=150, learning_rate=1.2, loss="square")

        for k, v in model.model.get_params().items():
            self.assertIn(k, list(ada_model.get_params()))
#6.)
        bagging_parameters = {"n_estimators": 50, "max_samples": 1.5, "max_features": 2}
        model = Model(algorithm="Bagging", parameters=bagging_parameters)
        bagging_model = BaggingRegressor(n_estimators=50, max_samples=1.5, max_features=2)

        for k, v in model.model.get_params().items():
            self.assertIn(k, list(bagging_model.get_params()))
#7.)
        lasso_parameters = {"alpha": 1.5, "max_iter": 500, "tol": 0.004}
        model = Model(algorithm="lasso", parameters=lasso_parameters)
        lasso_model = Lasso(alpha=1.5, max_iter=500, tol=0.004)

        for k, v in model.model.get_params().items():
            self.assertIn(k, list(lasso_model.get_params()))
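# The Model class under test is project code not shown here. A minimal sketch of the
# idea it implies: map an algorithm name to the matching sklearn estimator (the
# registry keys and constructor are assumptions based on the assertions above):
class Model:
    _registry = {
        "plsregression": PLSRegression,
        "randomforest": RandomForestRegressor,
        "knn": KNeighborsRegressor,
        "svr": SVR,
        "adaboost": AdaBoostRegressor,
        "bagging": BaggingRegressor,
        "lasso": Lasso,
    }

    def __init__(self, algorithm, parameters=None):
        self.model = self._registry[algorithm.lower()](**(parameters or {}))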
def run_random_forest(mydf):

    print "\n************ Random Forest Results ************\n"

    mydf = prepare_data_for_RF(mydf)

    predictor_names = ['week_day_num_posted','day_posted','maleness','region_num', \
                      'treat_cost','patient_age','smile_scale']
    numfeat = len(predictor_names)
    Y = mydf.dollars_per_day  #variable to predict
    X = mydf[predictor_names]

    #Build classifier using "best" random forest
    nfolds = 3  #number of folds to use for cross-validation
    #n_estimators is number of trees in forest
    #max_features is the number of features to consider when looking for best split
    parameters = {
        'n_estimators': [10, 100, 1000],
        'max_features': [3, 5, 7]
    }  # to try
    njobs = 1  #number of jobs to run in parallel
    rf_tune = grid_search.GridSearchCV(RandomForestRegressor(),
                                       parameters,
                                       n_jobs=njobs,
                                       cv=nfolds)
    rf_opt = rf_tune.fit(X, Y)

    #Results of the grid search for optimal random forest parameters.
    print("Grid of scores:\n" + str(rf_opt.grid_scores_) + "\n")
    print("Best zero-one score: " + str(rf_opt.best_score_) + "\n")
    print("Optimal Model:\n" + str(rf_opt.best_estimator_) + "\n")
    #print "Parameters of random forest:\n " , rf_opt.get_params()

    #Now use the optimal model's parameters to run random forest
    #(I couldn't get feature importances directly from the GridSearchCV fit)
    crf = RandomForestRegressor(n_jobs=njobs,
                                max_features=3,
                                n_estimators=1000).fit(X, Y)
    print "Parameters used in chosen RF model:\n ", crf.get_params()

    plotting_names = np.array(
        ('Day', 'Date', 'Sex', 'Region', 'Cost', 'Age', 'Smile'))
    #print(crf.feature_importances_)
    indices = np.argsort(crf.feature_importances_)[::-1][:numfeat]
    plt.bar(range(numfeat), crf.feature_importances_[indices],
            align='center', alpha=.5)
    plt.xticks(range(numfeat), plotting_names[indices],
               rotation='horizontal', fontsize=20)
    plt.xlim([-1, numfeat])
    plt.ylabel('Feature importances', fontsize=24)
    plt.title('', fontsize=28)
    plt.savefig('03_feature_importance_v2.pdf')
Example #11
def build_model(df):
    X = df.iloc[:, :-1]  # using all columns except for the last column as X
    y = df.iloc[:, -1]  # select the last column as y

    st.markdown('**1.2. Data Splits**')
    st.write('Training Set')
    st.info(X.shape)
    st.write('Testing Set')
    st.info(y.shape)

    st.markdown('**1.3. Variable Details:**')
    st.write('X Variables')
    st.info(list(X.columns))
    st.write('Y Variables')
    st.info(y.name)

    # Data splitting
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=split_size)

    rfr = RandomForestRegressor(n_estimators=parameter_n_estimator,
                                random_state=parameter_random_state,
                                max_features=parameter_max_feature,
                                criterion=parameter_criterion,
                                min_samples_split=parameter_min_sample_split,
                                min_samples_leaf=parameter_min_sample_leaf,
                                bootstrap=parameter_bootstrap,
                                oob_score=parameter_oob_store,
                                n_jobs=parameter_n_jobs)
    rfr.fit(X_train, y_train)

    st.subheader('2. Model Performance')

    st.markdown('**2.1. Training Set**')
    y_train_pred = rfr.predict(X_train)
    st.write('Coefficient of determination ($R^2$):')
    st.info(metrics.r2_score(y_train, y_train_pred))

    st.write('Error (MSE or MAE):')
    st.write(nama_error)
    st.info(error(y_train, y_train_pred))

    st.markdown('**2.2. Testing Set**')
    y_test_pred = rfr.predict(X_test)
    st.write(nama_error)
    st.info(error(y_test, y_test_pred))

    st.subheader('3. Model Parameters')
    st.write(rfr.get_params())  # get the parameters of the model
Example #12
def get_feat_imps(X, y):
    column_names = X.columns
    model = RFR(max_features        = 'auto',
                max_depth           = None,
                bootstrap           = True,
                min_samples_leaf    = 5,
                min_samples_split   = 10,
                n_estimators        = 100
                )
    model = model.fit(X, y)
    model_params    = model.get_params()
    feat_imps       = model.feature_importances_
    print('model_params', model_params)
    print('feat_imps', feat_imps)

    return model_params, feat_imps, column_names
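# A short usage sketch: pair the returned importances with their column names and
# rank them (assumes X is a pandas DataFrame and y the matching target Series):
model_params, feat_imps, column_names = get_feat_imps(X, y)
for name, imp in sorted(zip(column_names, feat_imps), key=lambda t: t[1], reverse=True):
    print('{}: {:.3f}'.format(name, imp))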
Example #13
def Grid_Search_CV_RFR(X_train, y_train, X_test, y_test):
    #Default RandomForestRegression
    rf = RandomForestRegressor()
    rf.fit(X_train, y_train.flatten())
    y_pred_train = rf.predict(X_train)
    y_pred_test = rf.predict(X_test)
    r2_default_train = metrics.r2_score(y_train, y_pred_train)
    r2_default_test = metrics.r2_score(y_test, y_pred_test)
    #base_accuracy = evaluate(rf, X_test, y_test.flatten())
    params = rf.get_params()
    #print('Default parameters in the fit are ', params)
    #print('R2 Default Train={:0.5f}'.format(r2_default_train))
    #print('R2 Default Test={:0.5f}'.format(r2_default_test))

    #Instantiate Grid search model to optimize hyperparameters for Random Forest Regression
    param_grid = {
        'bootstrap': [True, False],    # sample with or without replacement
        'max_depth': [5, 7, 10],       # depth of each tree (typically 1-32); also try 4, 8, 12, 16, 20
        'max_features': ['auto'],      # number of features to consider at the best split
        'min_samples_leaf': [5, 7],    # minimum samples required at a leaf node
        'min_samples_split': [6, 8],   # minimum samples to split an internal node; also try 10, 12
        'n_estimators': [10, 20, 30]   # number of trees in the forest (<200); also try 40
    }

    grid_search = GridSearchCV(rf, param_grid, cv=5, refit=True, verbose=0)
    grid_results = grid_search.fit(X_train, y_train.flatten())
    best_parameters = grid_search.best_params_
    best_result = grid_search.best_score_
    best_estimator = grid_search.best_estimator_  #Optimized randomForestRegressor

    y_pred_grid_train = best_estimator.predict(X_train)
    y_pred_grid_test = best_estimator.predict(X_test)
    r2_grid_train = metrics.r2_score(y_train, y_pred_grid_train)
    r2_grid_test = metrics.r2_score(y_test, y_pred_grid_test)

    #print('Optimised parameters in the fit are ', best_parameters)
    #grid_accuracy =   evaluate(best_estimator, X_test, y_test.flatten())
    #print('base_accuracy={:0.5f}'.format(base_accuracy), 'grid_accuracy={:0.5f}'.format(grid_accuracy), '. Relative improvement of {:0.2f}%'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))

    return best_parameters, best_result, best_estimator
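# evaluate() is referenced in the commented-out lines above but not defined here. A
# plausible sketch, assuming it scores a model via mean absolute percentage error:
import numpy as np

def evaluate(model, X_test, y_test):
    predictions = model.predict(X_test)
    mape = 100 * np.mean(np.abs((y_test - predictions) / y_test))
    return 100 - mape  # accuracy in percent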
Example #15
def do_model(X_train, X_test, y_train, y_test):

    model = RFR(max_features='sqrt',
                max_depth=100,
                bootstrap=False,
                min_samples_leaf=1,
                min_samples_split=2,
                n_estimators=200)

    model = model.fit(X_train, y_train)

    ypred = model.predict(X_test)
    ytrainpred = model.predict(X_train)

    model_params = model.get_params()
    feat_imps = model.feature_importances_

    return model, model_params, feat_imps, ypred, ytrainpred
Example #16
class _RandomForestRegressorWrapper(BaseEstimator, RegressorMixin):

    _params = ()

    def __init__(self, **kwargs):
        """Base wrapper for a RandomForestRegressor class."""

        super().__init__()
        self._forest = RandomForestRegressor_(**kwargs)

    def fit(self, X, y, **kwargs):
        self._forest.fit(X, y, **kwargs)
        return self

    def predict(self, X):
        return self._forest.predict(X)

    def __getattr__(self, attr):
        # Check if own attribute.
        if attr in self.__dict__:
            return getattr(self, attr)

        # Proxy to forest.
        return getattr(self._forest, attr)

    def get_params(self, deep=True):
        # Merge own parameters with the forests.
        forest_params = self._forest.get_params(deep=deep)
        own_params = {p: getattr(self, p) for p in self._params}
        return toolz.merge(forest_params, own_params)

    def set_params(self, **parameters):
        # Copy dict.
        parameters = dict(parameters)

        # Extract own parameters.
        for own_param in self._params:
            if own_param in parameters:
                setattr(self, own_param, parameters.pop(own_param))

        # Pass others to the forest.
        self._forest.set_params(**parameters)

        return self
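# A brief usage sketch for the wrapper above, assuming RandomForestRegressor_ is the
# sklearn estimator imported under an alias and toolz is installed:
w = _RandomForestRegressorWrapper(n_estimators=50, random_state=0)
print(w.get_params()['n_estimators'])  # forest parameters are proxied through
w.set_params(n_estimators=100)         # forwarded to the underlying forest
print(w.get_params()['n_estimators'])  # -> 100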
Example #17
def hyper_par_gridcv(x, y):
    X_train, X_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.7,
                                                        random_state=42)
    model = RandomForestRegressor()
    print(model.get_params().keys())
    parameters = {  # full search space; note that only n_estimators is actually searched below
        "n_estimators":
        [100, 200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
        "min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        "max_features": ["auto", "log2", "sqrt", None],
        "max_leaf_nodes": [None, 10, 20, 30, 40, 50, 60, 70, 80, 90]
    }
    gs = GridSearchCV(
        model,
        param_grid={
            'n_estimators':
            [100, 200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
        },
        cv=10,
        n_jobs=1,
        scoring='neg_mean_squared_error')

    gs.fit(X_train, y_train)
    print(gs.best_params_)
    best_grid = gs.best_estimator_
    y_pred = best_grid.predict(X_test)
    text_out = {
        "R-squared": round(r2_score(y_test, y_pred), 3),
        "MAE": round(mean_absolute_error(y_test, y_pred), 3),
        "MSE": round(mean_squared_error(y_test, y_pred), 3)
    }
    json_out = json.dumps(text_out, sort_keys=False, indent=4)

    with open('models/rtfr.pkl', 'wb') as output_file:
        pickle.dump(best_grid, output_file)

    return json_out
Example #18
    def get_warmstart_configuration(self):
        """
        Determine the default hyperparameter configuration of the selected ML-algorithm. This configuration can be used
        as a warmstart configuration for the HPO-method.
        :return: default_params: dict
            Dictionary that contains the default HPs.
        """
        if self.ml_algorithm == 'RandomForestRegressor':
            default_model = RandomForestRegressor(
                random_state=self.random_seed)

        elif self.ml_algorithm == 'SVR':
            # SVR has no random_state parameter
            default_model = SVR()

        elif self.ml_algorithm == 'AdaBoostRegressor':
            default_model = AdaBoostRegressor(random_state=self.random_seed)

        elif self.ml_algorithm == 'DecisionTreeRegressor':
            default_model = DecisionTreeRegressor(
                random_state=self.random_seed)

        elif self.ml_algorithm == 'LinearRegression':
            # LinearRegression has no random_state parameter
            default_model = LinearRegression()

        elif self.ml_algorithm == 'KNNRegressor':
            # KNeighborsRegressor has no random_state parameter
            default_model = KNeighborsRegressor()

            # Add remaining ML-algorithms here (e.g. XGBoost, Keras)

        else:
            raise Exception('Unknown ML-algorithm!')

        # Default HPs of the ML-algorithm
        default_params = default_model.get_params()

        return default_params
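# A standalone illustration of the idea (hypothetical usage outside the class): the
# estimator defaults returned by get_params() seed the HPO run as a warmstart point.
default_model = RandomForestRegressor(random_state=0)
warmstart_config = default_model.get_params()
print(warmstart_config['n_estimators'], warmstart_config['max_depth'])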
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.1,
                                                    random_state=0)

clf = DecisionTreeRegressor(max_depth=6)

# Train the decision tree regressor
clf = clf.fit(X_train, y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

clf.score(X, y)
clf.get_depth()
clf.get_n_leaves()
clf.get_params()

# In[18]:

clf.get_depth()

# In[33]:

clf.score(X_test, y_test)

# In[31]:

# evaluate decision tree performance on train and test sets with different tree depths
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
Example #20
def surrogateRF(Xarchive,
                Farchive,
                X,
                file_loc,
                file_loc_general,
                toUpdate,
                first_iter=False,
                problem='LABS',
                index=1):
    Xnew = Xarchive.T
    X_pred = X.T
    SMAC = False

    if SMAC:
        with open("/home/naamah/Documents/CatES/result_All/smac/RF/X1.p",
                  "wb") as fp:
            pickle.dump(Xnew, fp)
        with open("/home/naamah/Documents/CatES/result_All/smac/RF/F1.p",
                  "wb") as fp:
            pickle.dump(Farchive, fp)

        anf = smac_RF.main_loop(problem)

        print("SMAC {}".format(anf))
        sys.exit("Error message")

    clf = RandomForestRegressor(criterion="mse",
                                n_estimators=49,
                                min_samples_leaf=1,
                                min_samples_split=2,
                                min_weight_fraction_leaf=0.0001060554,
                                max_leaf_nodes=1000,
                                min_impurity_decrease=0.0,
                                min_impurity_split=None,
                                warm_start=False,
                                max_depth=None,
                                max_features="auto",
                                random_state=7)  #RF_9111

    # if problem=="LABS":
    #
    #     clf = RandomForestRegressor(criterion="mse",n_estimators=49, min_samples_leaf=1, min_samples_split=2,
    #                                 min_weight_fraction_leaf=0.0001060554, max_leaf_nodes=1000,
    #                                 min_impurity_decrease=0.0,min_impurity_split=None, warm_start=False, max_depth=None,
    #                                 max_features="auto",random_state=7
    #                                 ) #RF_9111
    #
    #
    # elif problem=="NKL":
    #     clf = RandomForestRegressor(criterion="mse",n_estimators=43, min_samples_leaf=1, min_samples_split=4,
    #                                 min_weight_fraction_leaf=0.0005238657425634613, max_leaf_nodes=673, min_impurity_decrease=0.0,
    #                                 warm_start=True, max_depth=None, max_features="auto", random_state=8
    #                                 ) #RF_1_sig
    #
    #
    #
    # else: #problem=="QAP"
    #     clf = RandomForestRegressor(criterion="mse",n_estimators=38, min_samples_leaf=1, min_samples_split=2,
    #                                 min_weight_fraction_leaf=0.0002313685, max_leaf_nodes=551, min_impurity_decrease=2.86E-08,
    #                                 warm_start=False, max_depth=None, max_features="auto", random_state=None
    #                                 )#RF_9333

    if not os.path.exists(file_loc_general + "/surrogate_configuration"):
        with open(file_loc_general + "/surrogate_configuration", 'a') as file:
            file.write("clf:\n{}\n\nTuning Algorithem: {} ".format(
                clf.get_params(), "smac"))
        file.close()

    clf.fit(Xnew, Farchive)
    F_pred = clf.predict(X_pred)

    return F_pred
Example #21
# ### Displaying Results of the Random Forest Model

# In[20]:

plot_ground_truth_vs_prediction(y_valid, predictions)
plot_results_as_scatter(y_valid, predictions)
display_results(y_valid, predictions)
apply_cross_validation(randomForest, X, y)

# ### Hyperparameter Tuning
# Applying Hyperparameter Tuning to see if we can improve the results of our model

# In[21]:

print('Parameters currently in use:\n')
pprint(randomForest.get_params())

# ### Creation of all the possible features parameters
# Here we create different inputs for Hyperparameter Tuning
#
# Hyperparameter Tuning features found here: https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

# In[22]:

n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
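# The grids above feed a randomized search. A sketch of the next step, assuming the
# randomForest estimator from the earlier cells (structure follows the linked tutorial):
from sklearn.model_selection import RandomizedSearchCV

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

rf_random = RandomizedSearchCV(estimator=randomForest, param_distributions=random_grid,
                               n_iter=100, cv=3, random_state=42, n_jobs=-1)
# rf_random.fit(X, y)  # then inspect rf_random.best_params_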
Example #22
P_VALUE = 0.05

# NOTE: the original compares the same p-value twice; the second operand was
# presumably meant to be the KS-test result for the real values, not the predictions
if kstest_results_predicted_values.pvalue <= P_VALUE or kstest_results_predicted_values.pvalue <= P_VALUE:
    print(stats.kruskal(future_real_target, y_future_pred))
    print(stats.mannwhitneyu(future_real_target, y_future_pred))
else:
    levene_results = stats.levene(future_real_target, y_future_pred)
    print("levene_results", levene_results)
    print(
        stats.ttest_ind(future_real_target,
                        y_future_pred,
                        equal_var=levene_results.pvalue > P_VALUE))

## Export results
n_trees = model.get_params()['n_estimators']
n_features = future_test.shape[1]

res = {
    'MAE': test_mae,
    'RMSE': test_rmse,
    'R2': test_r2,
    'Num Trees': n_trees,
    'Num Features': n_features,
    'Time Taken': time_taken
}

res_df = pd.DataFrame(
    [res],
    columns=['MAE', 'RMSE', 'R2', 'Num Trees', 'Num Features', 'Time Taken'])
result = res_df.to_string()



## Inspecting RF Hyperparameters in sklearn
# Import RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor

# Set seed for reproducibility
SEED = 1

# Instantiate a random forests regressor 'rf'
rf = RandomForestRegressor(random_state=SEED)

# Inspect rf's hyperparameters
rf.get_params()

# Basic imports
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import GridSearchCV

# Define a grid of hyperparameter 'params_rf'
params_rf = {
                'n_estimators': [300, 400, 500],
                'max_depth': [4, 6, 8],
                'min_samples_leaf': [0.1, 0.2],
                'max_features': ['log2','sqrt']
            }

# Instantiate 'grid_rf'
grid_rf = GridSearchCV(estimator=rf,param_grid=params_rf, cv=3, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)
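# grid_rf is instantiated above but never fitted in this snippet. A sketch of the
# remaining steps, assuming the usual X_train/y_train/X_test/y_test splits exist:
grid_rf.fit(X_train, y_train)
best_model = grid_rf.best_estimator_
y_pred = best_model.predict(X_test)
rmse_test = MSE(y_test, y_pred) ** 0.5  # MSE imported above as mean_squared_error
print('Test RMSE: {:.3f}'.format(rmse_test))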

submission.to_csv('submission.csv', index = False)


# In[69]:


# Got a score of 5.35 with k-fold Linear regression 
# Now trying random Forest regressor to check if there is any improvement
from sklearn.ensemble import RandomForestRegressor
rfgModel = RandomForestRegressor()
# Trying cross validation first to check if the model is giving good results. Root mean square error is
# a good approximation of the performance of a prediction model
print("Random Forest Generator Parameters: ")
print(rfgModel.get_params() )
rfgModel.fit(X_train, y_train)
rfgModel_pred = rfgModel.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test , rfgModel_pred))
print("root mean Squared error: {}".format(rmse))
rfgModel.fit(train_data, output_data)
# Now running the model on actual data test data
rfgModel_pred = rfgModel.predict(test_data)


# In[70]:


submission = pd.DataFrame(
    {'key': test_data_with_key.key, 'fare_amount': rfgModel_pred},
    columns = ['key', 'fare_amount'])
rf = RandomForestRegressor(max_depth=20, n_estimators=50)

# Train model
rf.fit(X=x, y=y)

# Get prediction results
result = rf.predict(tX)

print "Result"
print "------"
print result

# Analyze performance
print "Performance"
print "-----------"
print "Root Mean Squared Error", mean_squared_error(tY, np.array(result)) ** 0.5
print "Mean Absolute Error", mean_absolute_error(tY, np.array(result))
print "R2", Measures.r2(tY, np.array(result))
# Dump pickle files
print df_mapper.features
print rf.get_params()

joblib.dump(df_mapper, mapper_pkl, compress = 3)
joblib.dump(rf, estimator_pkl, compress = 3)

# # Build pmml
# command = "java -jar converter-executable-1.1-SNAPSHOT.jar --pkl-mapper-input mapper.pkl --pkl-estimator-input estimator.pkl --pmml-output mapper-estimator.pmml"
# os.system(command)


sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
sc_y = StandardScaler()
y_train = y_train.values.reshape(-1, 1)
y_train = sc_y.fit_transform(y_train)

# Fitting Random Forest Regression to the dataset
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=900, random_state=0)
regressor.fit(X_train, y_train)
# Predicting a new result
y_pred = regressor.predict(X_test)
print(regressor.score(X=X_test, y=y_test))

regressor.get_params()

parameters = [{
    'random_state': [0, 1, 2, 3, 4, 5],
    'n_estimators': [900, 950, 1000]
}]

from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(estimator=regressor,
                           param_grid=parameters,
                           scoring='r2',
                           n_jobs=-1,
                           cv=3)
grid_search.fit(X_train, y_train)
grid_search.best_params_
grid_search.best_score_
# (tail of a printed rf_r.get_params() dict; the leading keys below are reconstructed
# from the old scikit-learn defaults)
{'bootstrap': True,
     'criterion': 'mse',
     'max_depth': None,
     'max_features': 'auto',
     'max_leaf_nodes': None,
     'min_impurity_decrease': 0.0,
     'min_impurity_split': None,
     'min_samples_leaf': 1,
     'min_samples_split': 2,
     'min_weight_fraction_leaf': 0.0,
     'n_estimators': 10,
     'n_jobs': 1,
     'oob_score': False,
     'random_state': 42,
     'verbose': 0,
     'warm_start': False}

print('Parameters currently in use:\n')
pprint(rf_r.get_params())

m_rf_hhb = rf_r
m_rf_hhb.fit(tuning_x_train_hhb,y_train_hhb)
m_rf_hhb.score(tuning_x_test_hhb, y_test_hhb)
Example #28
class PostRankOptimization(object):
    """

    :param balanced:
    :param visual_expansion_use:
    :param re_score_alpha:
    :param re_score_method_proportional:
    :param regions: Define which of the regions to be considered upper body and which legs. If None, not used.
           Must be of length 2 if defined.
           Example: regions=[[0, 1], [2, 3, 4]]
    :return:
    """

    def __init__(self, balanced=False, visual_expansion_use=True, re_score_alpha=0.15,
                 re_score_proportional=True, regions=None, ve_estimators=20, ve_leafs=5):  # OK
        self.subject = -1  # The order of the person to be Re-identified by the user (Initially -1)
        self.probe_name = ""
        self.probe_selected = None  # Already feature extracted
        self.target_position = 0
        self.iteration = 0
        self.strong_negatives = []
        self.weak_negatives = []
        self.visual_expanded = []
        self.new_strong_negatives = []
        self.new_weak_negatives = []
        self.new_visual_expanded = []
        self.visual_expansion = RandomForestRegressor(n_estimators=ve_estimators, min_samples_leaf=ve_leafs,
                                                      n_jobs=-1)  # As in POP

        # regions = [[0], [1]]
        if regions is None:
            self.regions = [[0]]
            self.regions_parts = 1
        elif len(regions) == 2:
            self.regions = regions
            self.regions_parts = sum([len(e) for e in regions])
        else:
            raise ValueError("Regions size must be 2 (body region and legs region)")
        self.size_for_each_region_in_fe = 0  # Initialized at initial iteration

        self.execution = None
        self.ranking_matrix = None
        self.rank_list = None
        self.comp_list = None
        self.balanced = balanced
        if not balanced:
            self.use_visual_expansion = False
        else:
            self.use_visual_expansion = visual_expansion_use
        self.re_score_alpha = re_score_alpha
        self.re_score_proportional = re_score_proportional

    def set_ex(self, ex, rm):  # OK
        self.execution = ex
        self.ranking_matrix = rm
        self.initial_iteration()

    def new_samples(self, weak_negatives_index, strong_negatives_index, absolute_index=False):  # OK
        self.new_weak_negatives = [[e, idx] for [e, idx] in weak_negatives_index if
                                   [e, idx] not in self.weak_negatives]
        self.new_strong_negatives = [[e, idx] for [e, idx] in strong_negatives_index if
                                     [e, idx] not in self.strong_negatives]
        if not absolute_index:
            self.new_weak_negatives = [[self.rank_list[e], idx] for [e, idx] in self.new_weak_negatives]
            self.new_strong_negatives = [[self.rank_list[e], idx] for [e, idx] in self.new_strong_negatives]

    def _generate_visual_expansion(self):  # OK
        n_estimators = self.visual_expansion.get_params()['n_estimators']
        selected_len = round(float(n_estimators) * (2 / 3.))
        selected = np.random.RandomState()
        selected = selected.permutation(n_estimators)
        selected = selected[:selected_len]
        expansion = np.zeros_like(self.probe_selected)
        for s in selected:
            expansion += self.visual_expansion[s].predict(self.probe_selected).flatten()
        expansion /= float(selected_len)
        return expansion

    def new_subject(self):  # OK
        if self.subject < self.execution.dataset.test_size:
            self.subject += 1
            self.probe_name = self.execution.dataset.probe.files_test[self.subject]
            self.probe_name = "/".join(self.probe_name.split("/")[-2:])
            self.probe_selected = self.execution.dataset.probe.fe_test[self.subject]
            self.rank_list = self.ranking_matrix[self.subject].copy()
            self.comp_list = self.execution.matching_matrix[self.subject].copy()
            self._calc_target_position()
            self.iteration = 0
            self.strong_negatives = []
            self.weak_negatives = []
            self.visual_expanded = []
        else:
            return  # TODO Control situation

    def initial_iteration(self):  # OK
        self.new_subject()
        self.size_for_each_region_in_fe = self.execution.dataset.gallery.fe_test.shape[1] // self.regions_parts  # integer division
        if self.use_visual_expansion:
            self.visual_expansion.fit(self.execution.dataset.probe.fe_train, self.execution.dataset.gallery.fe_train)

    def iterate(self):  # OK
        self.iteration += 1
        # print("Iteration %d" % self.iteration)
        to_expand_len = len(self.new_strong_negatives) - len(self.new_weak_negatives)
        if self.balanced:
            if to_expand_len < 0:
                return "There cannot be more weak negatives than strong negatives"
            elif to_expand_len > 0 and not self.use_visual_expansion:
                return "There must be the same number of weak negatives and strong negatives"

            for i in range(to_expand_len):
                # Randomly select if body or legs
                if len(self.regions) == 1:
                    reg = 0
                else:  # Assumes only two body parts
                    reg = random.choice([0, 1])
                self.new_visual_expanded.append([self._generate_visual_expansion(), reg])

        self.reorder()

        self._calc_target_position()

        self.strong_negatives.extend(self.new_strong_negatives)
        self.weak_negatives.extend(self.new_weak_negatives)
        self.visual_expanded.extend(self.new_visual_expanded)
        self.new_strong_negatives = []
        self.new_weak_negatives = []
        self.new_visual_expanded = []
        return "OK"

    def collage(self, name, cols=5, size=20, min_gap_size=5):  # OK
        """
        Adapted from http://answers.opencv.org/question/13876/
                     read-multiple-images-from-folder-and-concat/?answer=13890#post-id-13890

        :param name: path to save the collage image
        :param cols: num of columms for the collage
        :param size: num of images to show in collage
        :param min_gap_size: space between images
        :return:
        """
        # Add the reference image first
        imgs = []

        img = self.execution.dataset.probe.images_test[self.subject].copy()
        img[0:10, 0:10] = [0, 255, 0]
        cv2.putText(img, "Probe", (10, 10), cv2.FONT_HERSHEY_SIMPLEX, 0.4, 255, 1)
        imgs.append(img)

        elements = self.rank_list.copy()

        # Open imgs and save in list
        size = min(len(elements), (size - 1))
        for i, elem in zip(range(size), elements):
            # print files_order_list[elem]
            img = self.execution.dataset.gallery.images_test[elem].copy()
            if self.execution.dataset.same_individual_by_pos(self.subject, elem, "test"):
                img[0:10, 0:10] = [0, 255, 0]
            cv2.putText(img, str(i), (5, 15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, [0, 0, 255], 2)

            imgs.append(img)

        # let's find out the maximum dimensions
        max_width = 0
        max_height = 0

        for img in imgs:
            max_height = max(max_height, img.shape[0])  # rows
            max_width = max(max_width, img.shape[1])  # cols

        # number of images in y direction
        rows = int(math.ceil(len(imgs) / float(cols)))

        result = np.zeros(
            (rows * max_height + (rows - 1) * min_gap_size, cols * max_width + (cols - 1) * min_gap_size, 3), np.uint8)

        current_height = current_width = i = 0

        for y in range(rows):
            for x in range(cols):
                result[current_height:current_height + imgs[i].shape[0],
                current_width:current_width + imgs[i].shape[1]] = imgs[i]
                i += 1
                current_width += max_width + min_gap_size
            current_width = 0
            current_height += max_height + min_gap_size

        cv2.imwrite(name, result)
        cv2.imshow("tal", result)
        cv2.waitKey(1)

    def reorder(self):  # OK
        raise NotImplementedError("Please Implement reorder method")

    def _calc_target_position(self):  # OK
        for column, elemg in enumerate(self.rank_list):
            if self.execution.dataset.same_individual_by_pos(self.subject, elemg, selected_set="test"):
                target_position = column  # If not multiview we can exit loop here
                self.target_position = target_position
                break
Example #29
for clf in [clf_A, clf_B, clf_C, clf_D, clf_E, clf_F]:
    train_predict(clf, X_train, y_train, X_valid, y_valid)
    

# RandomForestRegressor
parameters = {'n_estimators':(10,15,20),
              'min_samples_split':(2,3,4),
              'min_samples_leaf':(1,2,3)}

rfr = RandomForestRegressor(random_state=seed, warm_start=True)
score = make_scorer(mean_squared_error, greater_is_better=False)
grid_obj = GridSearchCV(rfr, param_grid=parameters, scoring=score, verbose=1, n_jobs=4, cv=5)
grid_obj= grid_obj.fit(X_train, y_train)
rfr = grid_obj.best_estimator_
print(rfr.get_params(), '\n')
print("Tuned model has a training RMSE score of {:.4f}.".format(predict_labels(rfr, X_train, y_train)))
print("Tuned model has a testing RMSE score of {:.4f}.".format(predict_labels(rfr, X_valid, y_valid)))

# RidgeCV
ridge = RidgeCV(alphas=(1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1.0, 10.0), cv=5)
ridge = ridge.fit(X_train, y_train)
print(ridge.get_params(), '\n')
print("Tuned model has a training RMSE score of {:.4f}.".format(predict_labels(ridge, X_train, y_train)))
print("Tuned model has a testing RMSE score of {:.4f}.".format(predict_labels(ridge, X_valid, y_valid)))

# Save regressors
pickle_file = 'regressor.pickle'

try:
  f = open(pickle_file, 'wb')
Example #30
max_features = [4, 6, 8, 10]


--------------------------------------------------
# Exercise_2 
from sklearn.ensemble import RandomForestRegressor

# Fill in rfr using your variables
rfr = RandomForestRegressor(
    n_estimators=100,
    max_depth=random.choice(max_depth),
    min_samples_split=random.choice(min_samples_split),
    max_features=random.choice(max_features))

# Print out the parameters
print(rfr.get_params())

--------------------------------------------------
# Exercise_3 
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, mean_squared_error

# Finish the dictionary by adding the max_depth parameter
param_dist = {"max_depth": [2, 4, 6, 8],
              "max_features": [2, 4, 6, 8, 10],
              "min_samples_split": [2, 4, 8, 16]}

# Create a random forest regression model
rfr = RandomForestRegressor(n_estimators=10, random_state=1111)

# Create a scorer to use (use the mean squared error)
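# The snippet is cut off here; the natural completion of this exercise step would be:
scorer = make_scorer(mean_squared_error)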
Example #31
class TrainROI():
	"""Train a regressor and test it on ROI loan data

	"""

	def __init__(self):
		self.load_data()
		self.calculate_roi()
		self.convert_to_float()
		self.split_by_grade()

		self.create_targets_features()
		self.split_train_test(train_size=0.8)
		#self.balance()

		self.X_train = self.X_train.drop(['loan_status', 'total_pymnt', 'roi'], axis=1).values
		self.y_train = self.y_train.values
		self.X_test = self.X_test.drop(['loan_status', 'total_pymnt', 'roi'], axis=1).values
		self.y_test = self.y_test.values


	def load_data(self):
		fileName = 'data.pickle'
		print "Loading %s" %fileName
		f = open(fileName, 'rb')
		self.loanData = pickle.load(f)

	def calculate_roi(self):
		self.loanData['roi'] = (self.loanData['total_pymnt']-self.loanData['funded_amnt'])/self.loanData['funded_amnt']

	def convert_to_float(self):
		self.loanData = self.loanData.astype(float)

	def split_by_grade(self, grade='A'):
		self.loans = self.loanData[self.loanData[grade] == 1]
		self.loans = self.loans.drop(['A', 'B', 'C', 'D', 'E', 'F', 'G'], 1)


	def split_train_test(self, train_size=0.8):
		mask = np.random.rand(len(self.targets)) < train_size
		self.X_train = self.features[mask]
		self.y_train = self.targets[mask]
		self.X_test = self.features[~mask]
		self.y_test = self.targets[~mask]

		print "Instances in training: ", len(self.X_train)
		print "Instances in testing: ", len(self.X_test)


	def scale(self):
		self.scalerX = StandardScaler().fit(self.X_train)
		self.X_train, self.X_test = self.scalerX.transform(self.X_train), \
									self.scalerX.transform(self.X_test)

	def standardize_samples(self):
		##0 mean, unit variance
		self.X_train = preprocessing.scale(self.X_train)
		self.X_test = preprocessing.scale(self.X_test)

	def scale_samples_to_range(self):
		##Samples lie in range between 0 and 1
		minMaxScaler = preprocessing.MinMaxScaler()
		self.X_train = minMaxScaler.fit_transform(self.X_train)
		self.X_test = minMaxScaler.fit_transform(self.X_test)

	def balance(self):
		"""Balances the training default and non-default instances"""
		print "Total loans before balancing: ", len(self.X_train)
		print "Defaults before balancing: ", np.sum(self.X_train['loan_status'] == 0)
		defaults_added = 0
		for i in range(1, len(self.X_train)):
			loan = self.X_train[i-1:i]
			loan_roi = self.y_train[i-1:i]
			if int(loan['loan_status']) == 0:
				for n in range(10): 	#replicate the loan multiple times
					defaults_added += 1
					if defaults_added%100 == 0:
						print(defaults_added)
					self.X_train = self.X_train.append(loan)
					self.y_train = self.y_train.append(loan_roi)
		print "Total loans after balancing: ", len(self.y_train)
		print "Defaults after balancing: ", np.sum(self.X_train['loan_status'] == 0)

	def create_targets_features(self):
		self.targets = self.loans['roi']
		self.features = self.loans

	def define_linear_regressor(self):
		self.regr = LinearRegression()

	def define_SVR(self, C=1.0, kernel='rbf', degree=3, gamma=0.0, coef0=0.0, shrinking=True, 
				  probability=False, tol=0.01, cache_size=200, class_weight='auto', verbose=True, 
				  max_iter=-1, random_state=None):
		print "Using a Support Vector Machine Regressor ..."
		self.regr = SVR(C=C, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, shrinking=shrinking, 
				  probability=probability, tol=tol, cache_size=cache_size, verbose=verbose, 
				  max_iter=max_iter, random_state=random_state)

		print(self.regr.get_params())

	def define_rfr(self, n_estimators):
		self.regr = RandomForestRegressor(n_estimators=n_estimators)

	def train_regr(self):
		self.regr.fit(self.X_train, self.y_train)

	def score_regr(self, X, y):
		score = self.regr.score(X, y)
		print "Score: %0.3f" %score

	def predict(self, filename_label):
		print "predicting"
		self.prediction = self.regr.predict(self.X_test)
		print "Saving prdiction as A_%s.pickle" %filename_label
		self.save_pickle(fileName="A_%s_predict.pickle" %filename_label,
										 data=self.prediction)
		self.save_pickle(fileName="A_%s_test.pickle" %filename_label, 
										 data=self.y_test)

	def runPCA(self, n_components=None, copy=False, whiten=False):
		print "Running PCA Dimensionality Reduction with n_components = ", n_components
		self.pca = PCA(n_components=n_components, copy=copy, whiten=whiten)
		self.X_train = self.pca.fit_transform(self.X_train)
		print "Reduced data down to ", self.pca.n_components_, " dimensions: "
		print "Transforming test data ..."
		self.X_test = self.pca.transform(self.X_test)
		#self.X_cv = self.pca.transform(self.X_cv)

	def runRFRGridSearch(self):
		n_estimators = [10,50,100,500]
		for n in n_estimators:
			self.define_rfr(n_estimators=n)
			self.train_regr()
			self.predict(filename_label="rfr_n_est_%i" %n)

	def runSVRGridSearch(self):
		C_vals = [0.1, 0.5, 1, 10, 100]
		gamma_vals = [1E-1, 1, 1E1, 1E2, 1E3]
		degrees = [3,4,5]

		for C in C_vals:
			for gamma in gamma_vals:
				for degree in degrees:
					print "\n\n C: ", C, "  gamma: ", gamma
					self.define_SVR(C=C, gamma=gamma, degree=degree, cache_size=2000)
					self.train_regr()
					print "Training Scores:"
					self.score_regr(self.X_train, self.y_train)
					print "Testing Scores:"
					self.score_regr(self.X_test, self.y_test)
					self.predict(filename_label="svr_C_%s_gamma_%s" %(C, gamma))

	def plot_score(self):
		plt.scatter(self.prediction, self.y_test)
		plt.plot([0,1.3], [0,1.3])
		plt.xlabel('prediction')
		plt.ylabel('y_test')
		plt.show()

	def save_pickle(self, fileName, data):
		f = open(fileName, 'wb')
		pickle.dump(data, f)
		f.close()
Example #32
def main(arg1):
	#print arg1
	fname = '../EssentiaTrainFeatures/'+ arg1 #Liquids_To_UnvoicedPlosives.arff'
	fname2 = './'+ arg1	#Liquids_To_UnvoicedPlosives.arff'
	start = time()
	try:
		f = open(fname,'r')
	except:
		return('error')
	#lines = f.readlines()[:]
	#f.close()       
	#floats = []
	#for line in lines:   
	#	floats.append(shlex.split(line))
	
	#array = np.asarray(floats)
	#for (x,y), value in np.ndenumerate(array):
	#	if value == np.nan or value == 'NaN':
	#		array[x][y] = 0;
	#	elif value == np.infty:
	#		array[x][y] = 1;
	array = np.loadtxt(f)
	f.close()
	array = np.nan_to_num(array)
	#array = array.astype(np.float)
	print('Data size')
	print(np.shape(array))
	#scale = StandardScaler()
	#array = scale.fit_transform(array)
	trainY = array[:,305]
	trainX = np.delete(array, [302,303,304,305,306,307],1)
	elapsed = time() - start
	print('Training array loading time')
	print(elapsed/60)
	f = open(fname2,'r')
	#lines = f.readlines()[:]
	#f.close()       
	#floats = []
	#for line in lines:     
	#	floats.append(shlex.split(line))
	#array2 = np.asarray(floats)
	#for (x,y), value in np.ndenumerate(array2):
	#	if value == np.nan or value == 'NaN':
	#		array2[x][y] = 0;
	#	elif value == np.infty:
	#		array2[x][y] = 2;
	array2 = np.loadtxt(f)
	f.close()
	array2 = np.nan_to_num(array2)
	#array2 = array2.astype(np.float)
	print('Test size')
	print(np.shape(array2))
	#scale = StandardScaler()
	#array = scale.fit_transform(array)
	#traiY = array[:,38]
	#Position = array2[:,36]
	#Hmmboundary = array2[:,37]
	#Manualboundary = array2[:,38]
	hmm_true = array2[:,305]
	hmmX = np.delete(array2, [302,303,304,305,306,307],1)
	
	#trainY, realY, trainX, testX = train_test_split(traiY,traiX,test_size=0.8,random_state=42)
	#Cost = np.power(2,np.arange(1,12));
	#g = [0.5,0.25,0.125,0.0625,0.03125,0.015625,0.0078125,0.00390625,0.001953125,0.0009765625,0.00048828125,0.00048828125]
	#print '\nCost values'
	#print Cost
	#print '\ngamma values'
	#print g
	#scorebest = 0
	#Cbest = 0
	#gammabest = 0
	#model_to_set = NuSVR(C=32, cache_size=2048, coef0=0.0, degree=3, gamma=0.03125, kernel='rbf',
 	#  max_iter=-1, nu=0.5, probability=False, shrinking=True, tol=0.001,
  	# verbose=True)
	#parameters = {'C':Cost,'gamma':g}#,'nu':[0.5],'kernel':['rbf'],'verbose':[True]}
	#k =[0.5,1]#2,5,7,8];
	model_to_set = RandomForestRegressor(n_estimators=100, criterion='mse', max_depth=5000,
	                                     min_samples_split=2000, min_samples_leaf=10, min_density=0.1,
	                                     max_features='auto', bootstrap=True, compute_importances=False,
	                                     oob_score=False, n_jobs=3, random_state=None, verbose=0)
	# note: min_density and compute_importances exist only in very old scikit-learn releases
	#parameters = {'n_estimators':[10,100,500],'max_depth':[1,5,20,100,None],'min_samples_split':[1,5,20,100],}
	#trainY, realY, trainX, testX = train_test_split(traiY,traiX,test_size=0,random_state=42)
	print('\nparams')
	print(model_to_set.get_params())
	start = time()
	print('\ntraining start time')
	print(strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime()))
	model_to_set.fit(trainX,trainY)
	print('\ntraining end time')
	print(strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime()))
	elapsed = (time() - start)
	print(elapsed/60)

	y_pred = model_to_set.predict(trainX)
	#return(y_pred,trainY)
	#score1 = model_to_set.score(trainX,trainY)
	#print 'score1'
	#print score1
	#print 'Myscore1'
	#print MyScore(trainY,y_pred)
	
	#y_pred = model_to_set.predict(testX)
	#score2 = model_to_set.score(testX,realY)
	#print '\nscore2'
	#print score2
	#print 'Myscore2'
	#print MyScore(realY,y_pred)
	
	'''TESTING'''
	
	hmm_pred = model_to_set.predict(hmmX)
	#baseName = arg1.replace('.arff','')
	#np.savetxt((baseName+'_hmm_pred.txt'),hmm_pred)
	#np.savetxt((baseName+'_hmm_true.txt'),hmm_true)
	#np.savetxt((baseName+'_Bhmm.txt'),array2[:,306])
	#np.savetxt((baseName+'_Btrue.txt'),array2[:,307])
	#np.savetxt((baseName+'_train_pred.txt'),y_pred)
	#np.savetxt((baseName+'_train_true.txt'),hmm_pred)
	#np.savetxt((baseName+'_Btrain.txt'),hmm_pred)	
	return(hmm_pred,hmm_true,array2[:,306],array2[:,307], y_pred,trainY, array[:,307])
	#cnt = 0;
	#print 'asdasd'
	#print hmm_pred
	'''for pred in hmm_pred:
Example #33
valid_X = X_train[int(sz[0] * frac):, :]
valid_Y = Y_train[int(sz[0] * frac):]
####################################################################################
####################################################################################
####################################################################################
#regressor
RFmodel = RandomForestRegressor(
        n_estimators=1000,        #number of trees to generate
        n_jobs=1,                 #single core; set to -1 to run on all cores
        criterion="mse"
        )

#train
RFmodel = RFmodel.fit(train_X, train_Y)
#get parameters
params=RFmodel.get_params()
#score on training set
acc_rate=RFmodel.score(train_X,train_Y)
print(acc_rate)
#feature importances
feat_imp=RFmodel.feature_importances_
df_train = pd.io.parsers.read_table('X_train.csv', sep=',', header=0)  # header=0 reads column names from the first row (the original passed header=False, which pandas rejects)
col_names=list(df_train.columns)
feat_imp_dict={col_names[i]:feat_imp[i] for i in range(len(feat_imp))}
feat_imp_sort = sorted(feat_imp_dict.items(), key=operator.itemgetter(1))


y_out=RFmodel.predict(valid_X)
pred = np.array([np.max([0.0,x]) for x in y_out])
print ('prediction error=%f' % np.sqrt(sum( (pred[i]-valid_Y[i])**2 for i in range(len(valid_Y))) / float(len(valid_Y)) ))
plt.clf()

###############End: Construct Feature Importance plot for first Forest #####

############### Start: Randomized Search CV ##################################

# Look at parameters used by our current forest
# from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(random_state=42)

from pprint import pprint

# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(rf.get_params())

from sklearn.model_selection import RandomizedSearchCV

# =============================================================================
# # Randomized Search CV
#
# # Number of trees in random forest
# n_estimators = [int(x) for x in np.linspace(start = 10, stop = 1000, num = 10)]
# # Number of features to consider at every split
# max_features = ['auto', 'sqrt']
# # Maximum number of levels in tree
# max_depth = [int(x) for x in np.linspace(10, 110, num = 10)]
# max_depth.append(None)
# # Minimum number of samples required to split a node
# min_samples_split = [2, 5, 10]
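
# The commented grid above is cut off in the source. For reference, a
# minimal end-to-end RandomizedSearchCV sketch over the same hyperparameters
# could look like the following; the grid values and the X_train/y_train
# names are assumptions, not the original author's code.
import numpy as np  # may already be imported upstream

random_grid = {'n_estimators': [int(x) for x in np.linspace(10, 1000, num=10)],
               'max_features': ['auto', 'sqrt'],
               'max_depth': [int(x) for x in np.linspace(10, 110, num=10)] + [None],
               'min_samples_split': [2, 5, 10]}
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                               n_iter=20, cv=3, random_state=42, n_jobs=-1)
# rf_random.fit(X_train, y_train)
# pprint(rf_random.best_params_)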
Beispiel #35
0
              "min_samples_leaf": [1, 3, 10],
              "bootstrap" :[True,False]}

grid_search = GridSearchCV(ld_multirf1, param_grid=param_grid)
grid_search.fit(ldX_train, ldy_train)
ldy_grid_search = grid_search.predict(ldX_test)


scaler = MinMaxScaler(feature_range=(0, 1))
b = scaler.fit_transform(ldy2_test)
inv_ldyhat = scaler.inverse_transform(ldy_grid_search)

inv_ldyhat = scalef(ldy2_test,inv_ldyhat)
#grid_search1 = GridSearchCV(ld_multirf1, param_grid=param_grid)
#grid_search1.fit(ldX_train, ldy_train.iloc[:,1])
ld_multirf1.get_params().keys()


regr_multimlp.get_params()
ld_multirf1.get_params()

# Predict on new data
ldy_multirf = ld_multirf.predict(ldX_test)
ldy_multimlp = ld_multimlp.predict(ldX_test)
ldy_multiada = ld_multiada.predict(ldX_test)
ldy_grid_search = grid_search.predict(ldX_test)
grid_search.score(ldX_test, ldy_test)
#grid_search1.score(ldX_test, ldy_test.iloc[:,1])  # grid_search1 is only defined in the commented-out block above

ldysc_multirf = ld_multirf.predict(ldX_testsc)
ldysc_multimlp = ld_multimlp.predict(ldX_testsc)
Beispiel #36
0
    ,{ 'fit' : svrFit, 'predict': scaledPredict, 'scaled': True, 'idx0': 3, 'idx1': 4, 'ratio': svmRatio }
    ,{ 'fit' : svrFit, 'predict': scaledPredict, 'scaled': True, 'idx0': 4, 'idx1': 5, 'ratio': svmRatio }
    ,{ 'fit' : svrFit, 'predict': scaledPredict, 'scaled': True, 'idx0': 6, 'idx1': 7, 'ratio': svmRatio }
]

#models2 = combine.combineTrain(X_test, y_test, models)

print "Training random forest..."
forestSize = 30
print "\t# Examples: \t\t" + str(len(X_train)) 
print "\tForest Size: \t\t" + str(forestSize)
start = time.time()
clf = RandomForestRegressor(n_estimators=forestSize, n_jobs=8)
clf = clf.fit(X_train, y_train)
print "\tTraining Complete" 
print "\tTime: \t\t" + str(round(time.time() - start, 1)) + "s"

#Reset n_jobs to 1 because multicore evaluation can cause problems downstream
params = clf.get_params()
clf.set_params(n_jobs=1)
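# The params dict saved above can restore the original settings later,
# e.g. clf.set_params(**params) to return to multicore training.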

print "\tRMSE: \t\t" + str(rmse(X_test, y_test, clf.predict, True))
#results = combine.combineTest(X_test, y_test, clf, models)



#def subPredict(X):
#    return combine.combinePredict(X, clf, models)
submission(clf.predict, filters, pca.transform)

Beispiel #37
0
class UncertainTaggerBuilder:
    """Encapsulates the machinery required to
    * discover features from the countably infinite set of possible
      motif-based features, and
    * return a function that, when called on a string, returns
      a corresponding array of numbers in [0, 1]
    """
    def __init__(self,
                 texts,
                 tags,
                 motifs,
                 a=0.85,
                 n=20,
                 initial_max_separation=1,
                 sort_sample_size=2000):
        """Initialize an UncertainTaggerBuilder with the required
        training data.
        TEXTS  - a sequence of strings
        TAGS   - a sequence of sequences of numbers in [0, 1]
                corresponding to TEXTS
        MOTIFS - a sequence of substrings used to create features
        A      - the first feature's probability of being chosen
        N      - the expected value of the size of a set of features,
                if approximating the set of possible features as infinite
        INITIAL_MAX_SEPARATION - the initial number of instances of a
                motif that can be detected ahead of or behind the
                current string position
        """
        self._texts = list(texts)
        self._tags = list(tags)
        self._motifs = list(motifs)
        # This is the best OOB score yet observed.
        self._max_oob = -float('inf')
        self._a = a
        self._n = n
        self._guaranteed_n = 0
        self._features = [PositionFeature(
            True), PositionFeature(False)] + [
                MotifFeature(motif, i) for motif in self._motifs
                for i in range(-initial_max_separation, initial_max_separation)
            ]
        # Past states are stored for future recovery and for prediction
        # of the performance of future states. States are mapped to
        # performances.
        self._states = dict()

        self._model = RandomForestRegressor(random_state=random.randint(
            0, 1000),
                                            verbose=1,
                                            n_jobs=-1,
                                            max_features='sqrt',
                                            oob_score=True)
        self._importance_sort(sort_sample_size)
        # Key invariant: The features with the highest rankings are at
        # the beginning of the list.
        self._rankings = {f: -i for i, f in enumerate(self._features)}
        self.scores = list()
        # Invariant: self._best_feature_set must be in descending order
        # by importance.
        self._best_feature_set = self._features

    def _importance_sort(self, sort_sample_size):
        """Sort the features stored in SELF by their importances."""
        quickmodel = RandomForestRegressor(random_state=random.randint(
            0, 1000),
                                           verbose=1,
                                           n_jobs=-1,
                                           max_features='sqrt',
                                           max_samples=0.2,
                                           min_samples_split=8,
                                           oob_score=True)
        sample_idx = random.sample(list(range(len(self._texts))),
                                   k=min(sort_sample_size, len(self._texts)))
        sample_texts = [self._texts[i] for i in sample_idx]
        sample_y = get_y([self._tags[i] for i in sample_idx])
        quickmodel.fit(get_X(sample_texts, self._features), sample_y)
        self._max_oob = max(self._max_oob, quickmodel.oob_score_)
        self._features = sort_a_by_b(self._features,
                                     quickmodel.feature_importances_)

    def _sort(self):
        """Sort the features in descending order by their rankings."""
        self._features.sort(reverse=True, key=lambda f: self._rankings[f])

    def _get_r(self):
        # This is just the geometric sum formula with the linearity of
        # expectation
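        # Acceptance probabilities decay as a, a*r, a*r**2, ..., so the
        # expected number of features accepted is the geometric sum
        # a / (1 - r); setting a / (1 - r) = n - guaranteed_n and solving
        # for r gives the expression below.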
        return 1 - self._a / (self._n - self._guaranteed_n)

    def _random_candidate_feature_set(self):
        """Returns a random feature set of nonzero length.
        """
        ret = self._best_feature_set[:self._guaranteed_n]
        p = self._a
        r = self._get_r()
        for candidate in self._features:
            if candidate not in ret:
                if random.random() < p:
                    ret.append(candidate)
                p *= r
        if len(ret) == 0:
            return self._random_candidate_feature_set()
        return frozenset(ret)

    def _new_max_oob(self, ordered_feature_set):
        """Carries out the operations that correspond to discovering a new
        high-performing feature set. (This is a private helper function
        to IMPROVE.)
        ORDERED_FEATURE_SET must be in the same order as was used to
        most recently train the model.
        """
        self._max_oob = self._states[frozenset(ordered_feature_set)].oob_score
        # Update the ideal length to be that of the high-performing feature
        # set.
        self._n = len(ordered_feature_set)
        # Update the ordered list of best features.
        feature_importances = self._model.feature_importances_
        assert len(feature_importances) == len(ordered_feature_set)
        self._best_feature_set = ordered_feature_set
        self._best_feature_set = sort_a_by_b(self._best_feature_set,
                                             feature_importances)
        print('DEBUG: Max OOB score updated to {}'.format(self._max_oob))
        # Update the current list of features with their successors.
        for feature in ordered_feature_set:
            successor = None
            try:
                successor = feature.successor()
            except AttributeError:
                pass  # This feature does not support successors.
            if successor and successor not in self._features:
                self._features.append(successor)
                self._rankings[successor] = self._rankings[feature] - 1
        self._sort()

    def _update_rankings(self, feature_set, oob_score):
        """Update the rankings for the features in FEATURE_SET according
        to whether the OOB score associated with them is good.
        """
        for f in feature_set:
            if f not in self._best_feature_set[:self._guaranteed_n]:
                z = z_score(self.scores, oob_score)
                if np.isfinite(z):
                    self._rankings[f] += z
        self._sort()

    def _improve(self):
        """Tests a new subset of the possible features. Returns True iff
        the operation terminated successfully.
        """
        feature_set = self._random_candidate_feature_set()
        if feature_set not in self._states:
            ordered_feature_set = list(feature_set)
            X = get_X(self._texts, ordered_feature_set, cache='self._texts')
            y = get_y(self._tags)
            self._states[feature_set] = UTBStatePerformance(
                self._model.get_params())
            self._model.fit(X, y)
            oob = self._model.oob_score_
            self._states[feature_set].oob_score = oob
            if oob >= self._max_oob:
                self._new_max_oob(feature_set)
            self._update_rankings(feature_set, oob)
            self.scores.append(oob)
            return True
        return False

    def run(self, duration):
        """Improves the feature selection for DURATION minutes."""
        t0 = time.time()
        duration_s = duration * SECONDS_PER_MINUTE
        while self._guaranteed_n < self._n:
            while not self._improve():
                # If improvement attempt failed, then the set of possible
                # feature sets that could be tested is probably small;
                # therefore, more flexibility in feature set selection is
                # warranted.
                self._guaranteed_n = max(0, self._guaranteed_n - 1)
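            # Lock in more of the best feature set as time passes: the
            # guaranteed prefix grows linearly from 0 to n over the
            # allotted duration.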
            self._guaranteed_n = int(self._n * (time.time() - t0) / duration_s)
            print('N = {}. Guaranteed N = {}.\nBest OOB = {}.'.format(
                self._n, self._guaranteed_n, self._max_oob))

    def _CV(self, features, k=5, confidence=0.95):
        """Return a confidence interval for an f-score for a K-fold
        CV.
        """
        f_scores = list()
        for a in range(k):
            self._model.fit(
                get_X([
                    self._texts[i]
                    for i in range(len(self._texts)) if i % k == a
                ],
                      features,
                      cache='train partition {}/{}'.format(a, k)),
                get_y([
                    self._tags[i] for i in range(len(self._texts))
                    if i % k == a
                ]))
            actual = get_y(
                [self._tags[i] for i in range(len(self._texts)) if i % k != a])
            pred = self._model.predict(
                get_X([
                    self._texts[i]
                    for i in range(len(self._texts)) if i % k != a
                ],
                      features,
                      cache='pred partition {}/{}'.format(a, k)))
            print('DEBUG: pred: ', pred[:20])
            print('DEBUG: actual: ', actual[:20])
            pred = [round(p) for p in pred]
            print(
                'DEBUG: precision: ',
                sum(pred[i] and actual[i]
                    for i in range(len(actual))) / max(sum(pred), 0.00001))
            print(
                'DEBUG: recall: ',
                sum(actual[i] and pred[i]
                    for i in range(len(actual))) / max(sum(actual), 0.00001))
            f_scores.append(f1_score(actual, pred))
        return t_ci(f_scores, 1 - confidence)
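
# A minimal usage sketch for UncertainTaggerBuilder (texts, tags and
# motifs are hypothetical placeholders for the caller's training data):
#
#     utb = UncertainTaggerBuilder(texts, tags, motifs, n=20)
#     utb.run(duration=30)  # search for better feature sets for 30 minutes
#     ci = utb._CV(utb._best_feature_set, k=5)  # f-score confidence interval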
log_regressor_summary(rfr, X_train, X_test, y_train, y_test)

# tests
exp = neptune.get_experiment()

# check logs
correct_logs_set = {
    'evs_test_sklearn', 'me_test_sklearn', 'mae_test_sklearn',
    'r2_test_sklearn', 'charts_sklearn'
}
from_exp_logs = set(exp.get_logs().keys())
assert correct_logs_set == from_exp_logs, '{} - incorrect logs'.format(exp)

# check sklearn parameters
assert set(exp.get_properties().keys()) == set(
    rfr.get_params().keys()), '{} parameters do not match'.format(exp)

# check neptune parameters
assert set(exp.get_parameters().keys()) == set(
    parameters.keys()), '{} parameters do not match'.format(exp)

## Step 5: Stop Neptune experiment after logging summary

neptune.stop()

## Explore results

# Scikit-learn classification

## Step 1: Create and fit gradient boosting classifier
Beispiel #39
0
def main_params(x_fname, y_fname, model_names, models_dir, ncores, model_type, verbose, cv_predictions, imbalanced, input_format):

    seed = 42

    # create models subdir
    if models_dir is None:
        models_dir = os.path.join(os.path.dirname(x_fname), "models")
    if not os.path.exists(models_dir):
        os.makedirs(models_dir)
    for m in model_names:
        fpath = os.path.join(models_dir, m + ".pkl")
        if os.path.exists(fpath):
            os.remove(fpath)

    model_stat_fname = os.path.join(models_dir, "models_stat.txt")

    # load y
    y = load_y(y_fname)

    # load x
    if input_format == 'txt':
        descr_names, mol_names, x = load_sirms_txt(x_fname, names=y.keys())
    elif input_format == 'svm':
        descr_names, mol_names, x = load_sirms_svm(x_fname, names=y.keys())
    else:
        print("Illegal value of input format: " % input_format)
        exit()

    # process y
    y = np.asarray([y[n] for n in mol_names])

    # process x
    save_bound_box_constrains(x, os.path.join(models_dir, "bound_box.pkl"))
    save_object(descr_names, os.path.join(models_dir, "var_names.pkl"))

    # scale
    scale = StandardScaler().fit(x)
    save_object(scale, os.path.join(models_dir, "scale.pkl"))

    x = scale.transform(x)

    if model_type == "class":
        cv = ms.StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)
    elif model_type == "reg":
        cv = ms.KFold(n_splits=5, random_state=seed, shuffle=True)

    if model_type == "class" and imbalanced:
        subsets = make_subsets(y, seed)
        if not subsets:
            warnings.warn("The data set is balanced (ratio majority:minority < 1.5). "
                          "No multiple undersampling will be done", Warning)
            subsets = [list(range(y.shape[0]))]
    else:
        subsets = [list(range(y.shape[0]))]

    # build models

    for current_model in model_names:

        if verbose:
            print(current_model.upper() + ' model building...')

        models_lst = []  # reset for each model name; stores one model in the balanced case, or one model per subset in the imbalanced case

        for subset in subsets:

            if current_model == "rf":

                # choosing optimal parameters
                param_grid = {"max_features": [x.shape[1] // 10, x.shape[1] // 7, x.shape[1] // 5, x.shape[1] // 3],
                              "n_estimators": [500]}

                if model_type == "reg":
                    m = ms.GridSearchCV(RandomForestRegressor(random_state=seed), param_grid, n_jobs=ncores, cv=cv, refit=False, verbose=verbose)
                elif model_type == "class":
                    m = ms.GridSearchCV(RandomForestClassifier(random_state=seed), param_grid, n_jobs=ncores, cv=cv, refit=False, verbose=verbose)
                m.fit(x[subset], y[subset])
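                # refit=False: the grid search only records best_params_;
                # the final estimator is rebuilt below and fitted once per
                # subset at the end of the model loop.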

                # final model
                if model_type == "reg":
                    m = RandomForestRegressor(n_estimators=m.best_params_["n_estimators"],
                                              max_features=m.best_params_["max_features"],
                                              bootstrap=True, random_state=seed)
                elif model_type == "class":
                    m = RandomForestClassifier(n_estimators=m.best_params_["n_estimators"],
                                               max_features=m.best_params_["max_features"],
                                               bootstrap=True, random_state=seed)
                models_lst.append(m)

            if current_model == "gbm":

                # choosing optimal parameters
                param_grid = {"n_estimators": [100, 200, 300, 400, 500]}

                if model_type == "reg":
                    m = ms.GridSearchCV(GradientBoostingRegressor(subsample=0.5, max_features=0.5, random_state=seed),
                                        param_grid, n_jobs=ncores, cv=cv, refit=False, verbose=verbose)
                elif model_type == "class":
                    m = ms.GridSearchCV(GradientBoostingClassifier(subsample=0.5, max_features=0.5, random_state=seed),
                                        param_grid, n_jobs=ncores, cv=cv, refit=False, verbose=verbose)
                m.fit(x[subset], y[subset])

                # final model
                if model_type == "reg":
                    m = GradientBoostingRegressor(n_estimators=m.best_params_["n_estimators"],
                                                  subsample=0.5, max_features=0.5, random_state=seed)
                elif model_type == "class":
                    m = GradientBoostingClassifier(n_estimators=m.best_params_["n_estimators"],
                                                   subsample=0.5, max_features=0.5, random_state=seed)
                models_lst.append(m)

            if current_model == "svm":

                # choosing optimal parameters
                if model_type == "reg":
                    param_grid = {"C": [10 ** i for i in range(0, 5)],
                                  "epsilon": [0.35, 0.3, 0.25, 0.2, 0.15, 0.1, 0.05, 0.01]}
                    m = ms.GridSearchCV(svm.SVR(kernel='rbf'), param_grid, n_jobs=ncores, cv=cv, refit=False,
                                        verbose=verbose)
                elif model_type == "class":
                    param_grid = {"C": [10 ** i for i in range(0, 5)],
                                  "gamma": [10 ** i for i in range(-6, 0)]}
                    m = ms.GridSearchCV(svm.SVC(kernel='rbf', random_state=seed), param_grid, n_jobs=ncores, cv=cv, refit=False,
                                        verbose=verbose)
                m.fit(x[subset], y[subset])

                # final model
                if model_type == "reg":
                    m = svm.SVR(kernel='rbf', C=m.best_params_["C"], epsilon=m.best_params_["epsilon"])
                elif model_type == "class":
                    m = svm.SVC(kernel='rbf', C=m.best_params_["C"], gamma=m.best_params_["gamma"],
                                probability=True, random_state=seed)
                models_lst.append(m)

            if current_model == "pls" and model_type == "reg":

                # choosing optimal parameters
                param_grid = {"n_components": [i for i in range(1, 8)]}
                m = ms.GridSearchCV(PLSRegression(), param_grid, n_jobs=ncores, cv=cv, refit=False, verbose=verbose)
                m.fit(x[subset], y[subset])

                # final model
                m = PLSRegression(n_components=m.best_params_["n_components"])
                models_lst.append(m)

            if current_model == "knn":

                # choosing optimal parameters
                param_grid = {"n_neighbors": [i for i in range(3, 21)]}
                if model_type == "reg":
                    m = ms.GridSearchCV(KNeighborsRegressor(), param_grid, n_jobs=ncores, cv=cv, refit=False, verbose=verbose)
                elif model_type == "class":
                    m = ms.GridSearchCV(KNeighborsClassifier(), param_grid, n_jobs=ncores, cv=cv, refit=False, verbose=verbose)
                m.fit(x[subset], y[subset])

                # final model
                if model_type == "reg":
                    m = KNeighborsRegressor(n_neighbors=m.best_params_["n_neighbors"])
                elif model_type == "class":
                    m = KNeighborsClassifier(n_neighbors=m.best_params_["n_neighbors"])
                models_lst.append(m)

        # return cv predictions
        ncol = len(models_lst) + 1 if len(models_lst) > 1 else len(models_lst)   # +1 column if consensus
        cv_pred = np.column_stack((y, np.full((y.shape[0], ncol), np.nan)))
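        # Column 0 holds the observed values; columns 1..len(models_lst)
        # hold per-model CV predictions, and the last column is reserved
        # for the consensus when more than one model is present.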

        for i, (m, subset) in enumerate(zip(models_lst, subsets)):
            pred = ms.cross_val_predict(estimator=m, X=x[subset], y=y[subset], cv=cv)
            if current_model == 'pls':   # reshape for pls because it returns 2d array and we need 1d
                pred = pred.reshape(len(subset))
            cv_pred[subset, i + 1] = pred

            # build final model, save it and its stat
            m.fit(x[subset], y[subset])
            add_obj_to_file(os.path.join(models_dir, current_model + '.pkl'), m)
            save_model_stat_2(current_model + '_%i' % i, model_stat_fname, str(m.get_params())[1:-1],
                              y[subset],
                              cv_pred[subset, i + 1],
                              model_type,
                              verbose)

        # calc cv consensus and save stat
        if model_type == "class" and len(models_lst) > 1:
            cv_pred[:, -1] = np.apply_along_axis(get_major_vote, 1, cv_pred[:, 1:])
            # cv_pred[:, -1] = np.around(np.nanmean(cv_pred[:, 1:], axis=1))
            save_model_stat_2(current_model + "_consensus", model_stat_fname, "",
                              y,
                              cv_pred[:, -1],
                              model_type,
                              verbose)

        # save cv predictions
        if cv_predictions:
            np.savetxt(os.path.join(models_dir, current_model + "_cv_pred.txt"),
                       np.column_stack([mol_names, np.round(cv_pred, 3)]),
                       fmt="%s",
                       delimiter="\t",
                       comments="",
                       header="Mol\tObs\t" +
                              "\t".join("%s_%i" % (current_model, i) for i in range(len(models_lst))) +
                              "\t" + current_model + "_consensus")

        if verbose:
            print(current_model.upper() + ' model was built\n')