Example no. 1
    def training(grid_param, X_train, X_test, y_train, y_test):
        # Create Dask cluster and client
        cluster = make_cluster()
        client = Client(cluster)
        #Construct Dask DataFrame
        X_train = dd.from_pandas(X_train, npartitions=4)       
        y_train = dd.from_pandas(y_train, npartitions=4)
        X_test = dd.from_pandas(X_test, npartitions=4)        
        y_test = dd.from_pandas(y_test, npartitions=4)

        estimator = RandomForestRegressor()
        #Train model
        train_time = time.time()
        grid_search = GridSearchCV_dask(estimator, grid_param, cv=2, n_jobs=-1)

        with joblib.parallel_backend("dask", scatter=[X_train, y_train]):
            grid_search.fit(X_train, y_train)
        grid_search.score(X_test, y_test)
        train_time = time.time() - train_time
        #Predictions
        acc_r2 = grid_search.best_estimator_.score(X_test, y_test)
        acc_mse = mean_squared_error(grid_search.best_estimator_.predict(X_test), y_test)
        return acc_r2, acc_mse, train_time
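Examples 1 and 3 rely on imports and a make_cluster() helper that are not shown. A minimal sketch of the assumed context, using a local Dask cluster and dask-ml's GridSearchCV under the alias used above (everything here beyond the names in the snippet is illustrative):

import time

import joblib
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
from dask_ml.model_selection import GridSearchCV as GridSearchCV_dask
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


def make_cluster(n_workers=4):
    # Stand-in for the unshown make_cluster() helper: spins up a local Dask cluster.
    return LocalCluster(n_workers=n_workers, threads_per_worker=1)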
Example no. 2

def train_evaluate(job_dir, training_dataset_path, search_space, scoring_measure):
  """Runs the training pipeline.""" 

  # Load and prepare training data  
  df_train = pd.read_csv(training_dataset_path)
  num_features_type_map = (
    {feature: 'float64' for feature in df_train.columns[NUMERIC_FEATURE_INDEXES]})
  df_train = df_train.astype(num_features_type_map)
  X_train = df_train.drop('Cover_Type', axis=1)
  y_train = df_train['Cover_Type']

  # Define the training pipeline
  preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), NUMERIC_FEATURE_INDEXES),
        ('cat', OneHotEncoder(), CATEGORICAL_FEATURE_INDEXES) 
    ])

  pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', SGDClassifier())
  ])
  
  # Configure hyperparameter tuning
  grid = GridSearchCV(pipeline, cv=5,  
                      param_grid=search_space, 
                      scoring=scoring_measure)
  
  # Start training
  grid.fit(X_train, y_train)
    
  return grid
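train_evaluate assumes module-level NUMERIC_FEATURE_INDEXES and CATEGORICAL_FEATURE_INDEXES constants that are not shown. A hypothetical sketch (slice objects work both for df_train.columns[...] and as ColumnTransformer column selectors; the real indexes depend on the dataset layout):

NUMERIC_FEATURE_INDEXES = slice(0, 10)        # illustrative values only
CATEGORICAL_FEATURE_INDEXES = slice(10, 12)   # illustrative values only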
Example no. 3
    def RandomForestDask(param_grid, X_train, X_test, y_train, y_test):

        # Create Dask cluster and client
        cluster = make_cluster()
        client = Client(cluster)
        dask_X_train = dd.from_pandas(X_train,
                                      npartitions=3)  # preprocess data
        dask_y_train = dd.from_pandas(y_train, npartitions=3)

        dask_X_test = dd.from_pandas(X_test, npartitions=3)
        dask_y_test = dd.from_pandas(y_test, npartitions=3)

        estimator = RandomForestRegressor()

        grid_search_dask = GridSearchCV_dask(estimator,
                                             param_grid,
                                             cv=2,
                                             n_jobs=-1)
        with joblib.parallel_backend("dask",
                                     scatter=[dask_X_train, dask_y_train]):
            grid_search_dask.fit(dask_X_train, dask_y_train)
        grid_search_dask.score(dask_X_test, dask_y_test)
        r_2 = grid_search_dask.best_estimator_.score(dask_X_test, dask_y_test)
        mse = mean_squared_error(
            grid_search_dask.best_estimator_.predict(X_test), y_test)
        return r_2, mse
Example no. 4
    def run(self):
        self.load_data()
        self.split_data()

        # nulls = X_train.isnull().sum()
        # total_nulls = nulls.sum()
        # if total_nulls > 0:
        #    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        #        print(nulls[nulls > 0], "total: ", total_nulls)

        reg = self._create_model()
        gs = GridSearchCV(reg,
                          self.model_desc["params_grid"],
                          cv=self.model_desc["num_folds"],
                          n_jobs=-1,
                          refit=False)

        start_time = time.monotonic()
        with joblib.parallel_backend("dask",
                                     scatter=[self.X_train, self.y_train]):
            gs.fit(self.X_train, self.y_train)
        finish_time = time.monotonic()
        gs_time = finish_time - start_time
        print("Searching for marameters for [{}]".format(
            self.model_desc["algorithm_name"]))
        print("GridSearchCV time: {}".format(gs_time))
        cv_res, best_params, gs_score = _get_best_params_score(gs)
        print("GridSearchCV score: {}".format(gs_score))
        print("Best params: {}".format(best_params))

        start_time = time.monotonic()
        regr_best = self._create_model(**best_params).fit(
            self.X_train, self.y_train)
        finish_time = time.monotonic()
        test_time = finish_time - start_time
        print("Final training time: {}".format(test_time))

        test_score = float(regr_best.score(self.X_test, self.y_test))

        print("Test score: {}".format(test_score))

        ans = {
            "algorithm_name": self.model_desc["algorithm_name"],
            "gs_time": gs_time,
            "gs_score": gs_score,
            "test_time": test_time,
            "test_score": test_score
        }

        if "output_path" in self.model_desc:
            with open(self.model_desc["output_path"], "w") as fp:
                json.dump(ans, fp)

        return ans
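The _get_best_params_score helper called above is not shown. Because the search runs with refit=False, a plausible implementation reads the best candidate straight out of cv_results_:

import numpy as np


def _get_best_params_score(gs):
    # Recover the best candidate from cv_results_ (no best_estimator_ when refit=False).
    cv_res = gs.cv_results_
    best_idx = int(np.argmax(cv_res["mean_test_score"]))
    return cv_res, cv_res["params"][best_idx], float(cv_res["mean_test_score"][best_idx])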
Example no. 5
    def get_pipe(self):
        if self.inner_cv is None:
            inner_cv = RepeatedKFold(n_splits=self.cv_splits, n_repeats=self.cv_repeats, random_state=0)
        else:
            inner_cv = self.inner_cv
        gridpoints = self.gridpoints
        transformer_list = [None_T(), Log_T(), LogP1_T()]
        steps = [
            ('shrink_k1', ShrinkBigKTransformer(selector=LassoLarsCV(cv=inner_cv, max_iter=32))),   # retain a subset of the best original variables
            ('polyfeat', PolynomialFeatures(interaction_only=0, degree=2)),  # create interactions among them

            ('drop_constant', DropConst()),
            ('shrink_k2', ShrinkBigKTransformer(selector=LassoLarsCV(cv=inner_cv, max_iter=64))),   # pick from all of those options
            ('reg', LinearRegression())]
        if self.bestT:
            steps.insert(0, ('xtransform', ColumnBestTransformer(float_k=len(self.float_idx))))

        X_T_pipe = Pipeline(steps=steps)
        Y_T_X_T_pipe = Pipeline(steps=[('ttr', TransformedTargetRegressor(regressor=X_T_pipe))])
        Y_T__param_grid = {
            'ttr__transformer': transformer_list,
            'ttr__regressor__polyfeat__degree': [2],
        }
        outerpipe = GridSearchCV(Y_T_X_T_pipe, param_grid=Y_T__param_grid, cv=inner_cv)
        if self.do_prep:
            steps = [('prep', MissingValHandler(prep_dict=self.prep_dict)),
                     ('post', outerpipe)]
            outerpipe = Pipeline(steps=steps)

        return outerpipe
Example no. 6
 def grid_search_hyperparams(self,
                             model,
                             data,
                             feature_names,
                             hyperparam_grid,
                             n_leave_out=1):
     cv_splits = LeavePGroupsOut(n_groups=n_leave_out).split(
         data[feature_names],
         np.ravel(data[self.target]),
         groups=data['group_id'])
     gs_models = GridSearchCV(model,
                              hyperparam_grid,
                              cv=cv_splits,
                              scoring=self.metric,
                              n_jobs=-1)
     gs_models.fit(data[feature_names], np.ravel(data[self.target]))
     return gs_models.best_params_
Example no. 7
    def run(self):
        print("Running dataset: " + self.name)
        for exp_param in self.param_set:
            param = common_param.copy()
            param.update(exp_param["xgb_params"])
            cv_param = exp_param["search parameter"]
            print("Running cv grid for: " + str(param))
            grid_param = cv_param

            # param["reg_alpha"] = 0.001
            # model = xgb.XGBRegressor(**param)
            # model.fit(X,y)
            # print(model.get_booster().get_dump()[0])
            # exit(0)
            model = xgb.XGBRegressor(
                **param
            ) if self.objective == "reg:linear" else xgb.XGBClassifier(**param)
            clf = GridSearchCV(model, grid_param, cv=5, n_jobs=-1)
            # with parallel_backend('dask'):
            clf.fit(self.X_train, self.y_train)
            # clf.fit(X_train, y_train)
            param.update(clf.best_params_)
            model = xgb.XGBRegressor(
                **param
            ) if self.objective == "reg:linear" else xgb.XGBClassifier(**param)
            model.fit(self.X_train, self.y_train)
            pred = model.predict(self.X_test)
            if isinstance(model, xgb.XGBRegressor):
                score = np.sqrt(metrics.mean_squared_error(self.y_test, pred))
            else:
                score = 1.0 - metrics.accuracy_score(self.y_test, pred)
            df_cv_results.at[exp_param['name'],
                             (self.name,
                              self.metric)] = "{0:.4g}".format(score)
            best_param_value = list(clf.best_params_.values())[0]
            best_param_string = "{0:.4g}".format(
                best_param_value
            ) if best_param_value > 0.1 or best_param_value == 0.0 else "{0:.4e}".format(
                best_param_value)
            df_cv_results.at[exp_param['name'],
                             (self.name, "param")] = best_param_string
            print(df_cv_results.to_latex())
Example no. 8
def do_gbm_tuning(X,
                  y,
                  model,
                  grid,
                  n_cores,
                  random_state=123,
                  save_path=None,
                  verbose=True):

    start_time = timeit.default_timer()
    o_print('Finding best params with 10-fold CV', verbose)
    rf = DaskGridSearchCV(
        GradientBoostingClassifier(random_state=random_state),
        param_grid=grid,
        n_jobs=n_cores,
        cv=10)
    rf.fit(X, y)
    means = rf.cv_results_['mean_test_score']
    stds = rf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, rf.cv_results_['params']):
        o_print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params), verbose)
    elapsed_time = timeit.default_timer() - start_time
    o_print('CV time: ' + str(elapsed_time), verbose)
    o_print('', verbose)

    best_params = rf.best_params_
    best_params.update({'score': rf.best_score_})

    forest_performance = {
        'score': rf.best_score_,
        'fitting_time': elapsed_time
    }

    save_tuning_results(save_path,
                        random_state,
                        best_params,
                        forest_performance,
                        model=model)

    return (best_params, forest_performance)
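The o_print helper used above is not shown; presumably it prints only when verbose output is requested, e.g.:

def o_print(message, verbose=True):
    # Print only when verbose output is requested.
    if verbose:
        print(message)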
Example no. 9
def do_ada_tuning(X,
                  y,
                  model,
                  grid,
                  n_cores,
                  random_state=123,
                  save_path=None):

    start_time = timeit.default_timer()
    print('Finding best params with 10-fold CV')
    rf = DaskGridSearchCV(AdaBoostClassifier(random_state=random_state),
                          param_grid=grid,
                          n_jobs=n_cores,
                          cv=10)
    rf.fit(X, y)
    means = rf.cv_results_['mean_test_score']
    stds = rf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, rf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    elapsed_time = timeit.default_timer() - start_time
    print('CV time: ' + str(elapsed_time))
    print()

    best_params = rf.best_params_
    best_params.update({'score': rf.best_score_})

    forest_performance = {
        'score': rf.best_score_,
        'fitting_time': elapsed_time
    }

    print(best_params)
    best_params.update({'base_estimator': str(best_params['base_estimator'])})
    save_tuning_results(save_path,
                        random_state,
                        best_params,
                        forest_performance,
                        model=model)

    return (best_params, forest_performance)
Example no. 10
def __fit_and_score(x_train, y_train, x_test, y_test, clf, cv_repeat=None, grid_search_k=None, param_grid=None, grid_scorer=None, scorers=None):
    """Fitting and scoring a classifier through crossvalidated gridsearch over
    a parameter grid.
    """
    # copy classifier
    clf = clone(clf)
    # grid search
    grid_search_clf = GridSearchCV(
        estimator=clf,
        param_grid=param_grid,
        scoring=scorers,
        cv=grid_search_k,
        refit=grid_scorer,
        )
    grid_search_clf = grid_search_clf.fit(x_train, y_train)
    # save to results
    result = {}
    result['gridsearch'] = grid_search_clf.cv_results_
    result['best_params'] = grid_search_clf.best_params_
    result['scores'] = _multimetric_score(grid_search_clf, x_test, y_test, scorers)
    result['cv_repeat'] = cv_repeat
    result['clf'] = clf.__class__.__name__
    return result
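The _multimetric_score helper is not shown. Assuming scorers is a dict mapping metric names to sklearn scorer callables (the same form GridSearchCV's scoring= parameter accepts), a minimal sketch would be:

def _multimetric_score(estimator, x_test, y_test, scorers):
    # Apply each scorer to the held-out split and collect the results by name.
    return {name: scorer(estimator, x_test, y_test) for name, scorer in scorers.items()}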
Example no. 11
    def _fit_and_score(self, X, y, train, test):
        X_train = X.iloc[train]
        X_test = X.iloc[test]
        y_train = y.iloc[train]
        y_test = y.iloc[test]

        scores_by_params = dict.fromkeys(self.strategy_grid.keys())
        for i, (strategy,
                strategy_values) in enumerate(self.strategy_grid.items()):
            print(f'{strategy} ...')

            # inner-cv model selection
            gscv = DaskGridSearchCV(self.pipeline,
                                    param_grid=strategy_values,
                                    scoring=self.inner_scoring,
                                    cv=self.inner_cv,
                                    refit=True)
            gscv.fit(X_train, y_train)

            # outer-cv model evaluation
            scores_by_params[strategy] = self._score(gscv, X_test, y_test)

        return scores_by_params
Example no. 12
def subcount_forecast(data, feature):
    """
    Creates a new column that is the predicted value of the input feature

    Essentially an abstraction for 'prediction_forecasts'

    :param data: a pandas dataframe where each row is an hour
    :param feature: a String containing the feature that should be forecasted (one of: casual, registered)
    :return: a pandas dataframe containing the new column
    """
    var_name = feature + "_forecast"
    print("\tAdding {} variable...".format(var_name))
    df = dd.get_dummies(data.copy().drop("cnt", axis=1))
    to_predict = dd.read_csv(PATH)[feature]
    df[feature] = to_predict

    train = get_train(df)

    model = RandomForestRegressor(random_state=SEED)
    model_params = {"n_estimators": list(range(10, 110, 10))}
    #tscv = TimeSeriesSplit(n_splits=5)

    grid_search = GridSearchCV(estimator=model,
                               param_grid=model_params,
                               scoring="r2",
                               cv=None,
                               refit=True)
    grid_search.fit(train.drop(feature, axis=1), train[feature])
    print("\t\tPredictions for GridSearchCV on {}: {:.5f} +/- {:.5f}".format(
        feature, grid_search.best_score_,
        grid_search.cv_results_["std_test_score"][da.argmax(
            grid_search.cv_results_["mean_test_score"])]))

    data[var_name] = grid_search.best_estimator_.predict(
        dd.get_dummies(data.drop("cnt", axis=1)))

    return data
Example no. 13
def search(model,
           X,
           y,
           params,
           method="randomized",
           n_iter=30,
           cv=5,
           **kwargs):
    """Run a cross-validated search for hyperparameters."""
    if method.lower() == "randomized":
        search = RandomizedSearchCV(model,
                                    param_distributions=params,
                                    n_iter=n_iter,
                                    cv=cv)
    elif method.lower() == "grid":
        search = GridSearchCV(model, param_grid=params, cv=cv)
    elif method.lower() == "bayes":
        search = BayesSearchCV(model,
                               search_spaces=params,
                               n_iter=n_iter,
                               cv=cv)
    else:
        message = ("'method' must be either 'randomized', 'grid' or 'bayes'."
                   " Got method='{}'".format(method))
        LOGGER.error(message)
        raise ValueError(message)

    method_name = method.capitalize() + "SearchCV"
    LOGGER.info("Beginning " + method_name)
    when_started = time()

    progress(search.fit(X, y))

    total_time = time() - when_started
    n_settings = len(search.cv_results_['params'])
    LOGGER.warning(
        "{} took {:.2f} seconds for {} candidate parameter settings.".format(
            method_name, total_time, n_settings))
    return search
Example no. 14
def test_gridsearch():
    X, y = make_classification(n_samples=100, n_features=5, chunks=50)
    grid = {"logisticregression__C": [1000, 100, 10, 2]}
    pipe = make_pipeline(DoNothingTransformer(), LogisticRegression())
    search = GridSearchCV(pipe, grid, cv=3)
    search.fit(X, y)
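DoNothingTransformer is not defined in this test (and make_classification with a chunks= argument suggests dask_ml.datasets is in use); an identity transformer such as the following would satisfy the pipeline interface:

from sklearn.base import BaseEstimator, TransformerMixin


class DoNothingTransformer(BaseEstimator, TransformerMixin):
    # Identity transformer: passes the data through unchanged.
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X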
Example no. 15
def rbf_svr_tuning(c = [0.001, 0.01, 0.1, 1, 10], gamma = [0.001, 0.01, 0.1, 1, 10], k = 5,
             train_data_path = '../data/training_data.csv', save_model = False, tracking_uri = "http://0.0.0.0:5000"):

    # Log the parameters with mlflow
    mlflow.log_param("c", c)
    mlflow.set_tag("k", k)

    # Set random seed for reproducibility
    np.random.seed(RANDOM_SEED)
    random.seed(RANDOM_SEED)

    # Get data shuffled and split into training and test sets
    mdr = MiningDataReader(path = train_data_path)
    (variable_names, X_train, X_test, y_train, y_test) = mdr.get_splitted_data()

    pipeline = Pipeline(steps = [('scaling', StandardScaler()),
                                 ('regression', SVR(kernel = 'rbf'))])

    ### TRAINING ###
    ################

    # Generate grid search for hyperparam tuning
    hyperparams = {}
    hyperparams['regression__C'] = c
    hyperparams['regression__gamma'] = gamma

    print("Training started...\n")

    # Run a grid search over the SVR pipeline and fit it using all processors
    modelCV = GridSearchCV(estimator = pipeline,
                           param_grid = hyperparams,
                           cv = k,
                           scoring = 'neg_mean_squared_error',
                           n_jobs = -1)

    with ProgressBar():
        modelCV.fit(X_train, y_train)

    # Iterate over the results storing training error for each hyperparameter combination
    results = modelCV.cv_results_
    param_list, training_err_list, training_dev_list = [], [], []
    for i in range(len(results['params'])):
        param = results['params'][i]
        score = (-1) * results['mean_test_score'][i] # NEGATIVE MSE
        std = results['std_test_score'][i]
        param_list.append(param)
        training_err_list.append(score)
        training_dev_list.append(std)

    print(f"\nBest parameter set found for the training set:\n{modelCV.best_params_}")

    # Store the index of the best combination
    best_index = param_list.index(modelCV.best_params_)

    # Get the best values for hyperparams
    best_c = modelCV.best_params_['regression__C']
    best_gamma = modelCV.best_params_['regression__gamma']

    print("\nTraining finished. Evaluating model...\n")

    ### EVALUATION ###
    ##################

    # Criteria is C
    criteria = 'c'
    mlflow.set_tag("criteria", criteria)
    param_values = c

    # Predict test data varying the criteria param and evaluate the models
    training_err_by_criteria, training_dev_by_criteria, test_err_list = [], [], []
    rmse_score, mae_score, r2_score = -1, -1, -1
    feature_names, feature_importances = [], []
    for param_value in tqdm(param_values):
        model = Pipeline(steps = [('scaler', StandardScaler()),
                                  ('regression', SVR(
                                        C = param_value,
                                        gamma = best_gamma,
                                        kernel = 'rbf'))])
        param = {'regression__C': param_value, 'regression__gamma': best_gamma}

        # Fit model and evaluate results
        model.fit(X_train, y_train)
        prediction = model.predict(X_test)
        index = param_list.index(param)
        training_err = training_err_list[index]
        training_dev = training_dev_list[index]
        (training_mse, test_mse, rmse, mae, r2) = get_test_metrics(training_err, y_test, prediction)
        # Store metrics
        training_err_by_criteria.append(training_mse)
        training_dev_by_criteria.append(training_dev)
        test_err_list.append(test_mse)
        # Set additional metrics for the best combination
        if index == best_index:
            rmse_score = rmse
            mae_score = mae
            r2_score = r2

    # Generate the plots
    empty_img_folder()
    plot_errors(criteria, param_values, training_err_by_criteria, training_dev_by_criteria, test_err_list)

    # Once hyperparameters are selected, train and save the best model
    if save_model:
        print("\nEvaluation finished. Training final model with train + test data with the best hyperparameters...")
        final_model = Pipeline(steps = [('scaler', StandardScaler()),
                                        ('regression', SVR(
                                            C = param_list[best_index]['regression__C'],
                                            gamma = best_gamma,
                                            kernel = 'rbf'))])

        # Train the best model with all the data (training + test)
        full_X = np.vstack((X_train, X_test))
        full_y = np.concatenate((y_train, y_test))
        final_model.fit(full_X, full_y)

        # Log plots and model with mlflow
        mlflow.log_artifacts('./img')
        mlflow.sklearn.log_model(final_model, 'model')

    # Log results with mlflow
    mlflow.log_metric("train_mse", training_err_list[best_index])
    mlflow.log_metric("test_mse", min(test_err_list))
    mlflow.log_metric("rmse", rmse_score)
    mlflow.log_metric("mae", mae_score)
    mlflow.log_metric("r2", r2_score)
    mlflow.set_tag("best_params", param_list[best_index])

    # Output the results
    print(f'''
-----------------------------------------------------------------------------------------------------------------------
RESULTS
-----------------------------------------------------------------------------------------------------------------------
Best params: {param_list[best_index]}
Training MSE: {training_err_list[best_index]}
Test MSE: {min(test_err_list)}
RMSE: {rmse_score}
MAE: {mae_score}
R2: {r2_score}
-----------------------------------------------------------------------------------------------------------------------
''')
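Examples 15, 17 and 18 call a get_test_metrics helper that is not shown. A hedged sketch consistent with how its return values are used (the training MSE is passed through, the rest is computed on the test predictions):

from sklearn import metrics


def get_test_metrics(training_mse, y_test, prediction):
    # Pass the training MSE through and compute test MSE, RMSE, MAE and R2.
    test_mse = metrics.mean_squared_error(y_test, prediction)
    rmse = test_mse ** 0.5
    mae = metrics.mean_absolute_error(y_test, prediction)
    r2 = metrics.r2_score(y_test, prediction)
    return training_mse, test_mse, rmse, mae, r2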
Example no. 16

with ProgressBar():
    parallel_nb.fit(X_train, y_train, classes=np.unique(y_train.compute()))

print('\n\nNaive Bayes Classifier Score : ', parallel_nb.score(X_test, y_test))
##### OUTPUT --------> Naive Bayes Classifier Score :  0.65
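The parallel_nb estimator used above is not defined in this snippet. A plausible construction, assuming dask-ml's Incremental wrapper around scikit-learn's GaussianNB (the classes= keyword passed to fit is forwarded to partial_fit on each partition):

from sklearn.naive_bayes import GaussianNB
from dask_ml.wrappers import Incremental

parallel_nb = Incremental(GaussianNB())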

######################################################################################

# Performing GridSearch on the Logistic Regression Classifier
from dask_ml.model_selection import GridSearchCV

parameters = {'penalty': ['l1', 'l2'], 'C': [0.5, 1, 2]}

lr = LogisticRegression()

tuned_lr = GridSearchCV(lr, parameters)

with ProgressBar():
    tuned_lr.fit(X_train, y_train)

print('\n\nGrid Search Results for Logistic Regression')
print(pd.DataFrame(tuned_lr.cv_results_)[['params', 'mean_test_score']])

#### OUTPUT
#### Grid Search Results for Logistic Regression
####                         params  mean_test_score
#### 0  {'C': 0.5, 'penalty': 'l1'}         0.700778
#### 1  {'C': 0.5, 'penalty': 'l2'}         0.700306
#### 2    {'C': 1, 'penalty': 'l1'}         0.700806
#### 3    {'C': 1, 'penalty': 'l2'}         0.700500
#### 4    {'C': 2, 'penalty': 'l1'}         0.700972
Example no. 17
def cart_tuning(max_depth=None,
                min_samples_leaf=[1, 2, 1],
                min_samples_split=[2, 3, 1],
                k=5,
                train_data_path='../data/training_data.csv',
                save_model=False,
                tracking_uri="http://0.0.0.0:5000"):

    mlflow.log_param('max_depth', max_depth)
    mlflow.log_param('min_samples_leaf', min_samples_leaf)
    mlflow.log_param('min_samples_split', min_samples_split)
    mlflow.set_tag("k", k)

    # Set random seed for reproducibility
    np.random.seed(RANDOM_SEED)
    random.seed(RANDOM_SEED)

    # Get data shuffled and split into training and test sets
    mdr = MiningDataReader(path=train_data_path)
    (variable_names, X_train, X_test, y_train,
     y_test) = mdr.get_splitted_data()

    pipeline = Pipeline(steps=[('scaler', StandardScaler(
    )), ('regression', DecisionTreeRegressor(random_state=RANDOM_SEED))])

    ### TRAINING ###
    ################

    # Generate grid search for hyperparam tuning
    hyperparams = {}
    hyperparams['regression__max_depth'] = [
        None
    ] if max_depth is None else np.arange(max_depth[0], max_depth[1],
                                          max_depth[2])
    hyperparams['regression__min_samples_leaf'] = np.arange(
        min_samples_leaf[0], min_samples_leaf[1], min_samples_leaf[2])
    hyperparams['regression__min_samples_split'] = np.arange(
        min_samples_split[0], min_samples_split[1], min_samples_split[2])

    print("Training started...\n")

    # Run a grid search over the Decision Tree Regressor pipeline and fit it using all processors
    modelCV = GridSearchCV(estimator=pipeline,
                           param_grid=hyperparams,
                           cv=k,
                           scoring='neg_mean_squared_error',
                           n_jobs=-1)

    with ProgressBar():
        modelCV.fit(X_train, y_train)

    # Iterate over the results storing training error for each hyperparameter combination
    results = modelCV.cv_results_
    param_list, training_err_list, training_dev_list = [], [], []
    for i in range(len(results['params'])):
        param = results['params'][i]
        score = (-1) * results['mean_test_score'][i]  # NEGATIVE MSE
        std = results['std_test_score'][i]
        param_list.append(param)
        training_err_list.append(score)
        training_dev_list.append(std)

    print(
        f"\nBest parameters set found for the training set:\n{modelCV.best_params_}"
    )

    # Store the index of the best combination
    best_index = param_list.index(modelCV.best_params_)

    # Get the best values for hyperparams
    best_depth = modelCV.best_params_['regression__max_depth']
    best_samples_leaf = modelCV.best_params_['regression__min_samples_leaf']
    best_samples_split = modelCV.best_params_['regression__min_samples_split']

    print("\nTraining finished. Evaluating model...\n")

    ### EVALUATION ###
    ##################

    # Select the hyperparam with most values as the criteria for the study and calculate test error with the best value
    # obtained for the other hyperparameters so the individual effect of this parameter can be studied
    criteria = [('max_depth', len(hyperparams['regression__max_depth'])),
                ('min_samples_leaf',
                 len(hyperparams['regression__min_samples_leaf'])),
                ('min_samples_split',
                 len(hyperparams['regression__min_samples_split']))]
    criteria = sorted(criteria, key=lambda x: x[1], reverse=True)[0][0]

    mlflow.set_tag("criteria", criteria)

    param_values = []
    if criteria == 'max_depth':
        if max_depth is None:
            param_values = [None]
        else:
            param_values = range(max_depth[0], max_depth[1], max_depth[2])
    elif criteria == 'min_samples_leaf':
        param_values = range(min_samples_leaf[0], min_samples_leaf[1],
                             min_samples_leaf[2])
    else:
        param_values = range(min_samples_split[0], min_samples_split[1],
                             min_samples_split[2])

    # Predict test data varying the criteria param and evaluate the models
    training_err_by_criteria, training_dev_by_criteria, test_err_list = [], [], []
    rmse_score, mae_score, r2_score = -1, -1, -1
    feature_names, feature_importances = [], []
    for param_value in tqdm(param_values):
        if criteria == 'max_depth':
            model = Pipeline(steps=[('scaler', StandardScaler()),
                                    ('regression',
                                     DecisionTreeRegressor(
                                         max_depth=param_value,
                                         min_samples_leaf=best_samples_leaf,
                                         min_samples_split=best_samples_split,
                                         random_state=RANDOM_SEED))])
            param = {
                'regression__max_depth': param_value,
                'regression__min_samples_leaf': best_samples_leaf,
                'regression__min_samples_split': best_samples_split
            }
        elif criteria == 'min_samples_leaf':
            model = Pipeline(steps=[('scaler', StandardScaler()),
                                    ('regression',
                                     DecisionTreeRegressor(
                                         max_depth=best_depth,
                                         min_samples_leaf=param_value,
                                         min_samples_split=best_samples_split,
                                         random_state=RANDOM_SEED))])
            param = {
                'regression__max_depth': best_depth,
                'regression__min_samples_leaf': param_value,
                'regression__min_samples_split': best_samples_split
            }
        else:
            model = Pipeline(steps=[('scaler', StandardScaler()),
                                    ('regression',
                                     DecisionTreeRegressor(
                                         max_depth=best_depth,
                                         min_samples_leaf=best_samples_leaf,
                                         min_samples_split=param_value,
                                         random_state=RANDOM_SEED))])
            param = {
                'regression__max_depth': best_depth,
                'regression__min_samples_leaf': best_samples_leaf,
                'regression__min_samples_split': param_value
            }

        # Fit model and evaluate results
        model.fit(X_train, y_train)
        prediction = model.predict(X_test)
        index = param_list.index(param)
        training_err = training_err_list[index]
        training_dev = training_dev_list[index]
        (training_mse, test_mse, rmse, mae,
         r2) = get_test_metrics(training_err, y_test, prediction)
        # Store metrics
        training_err_by_criteria.append(training_mse)
        training_dev_by_criteria.append(training_dev)
        test_err_list.append(test_mse)
        # Set additional metrics for the best combination
        if index == best_index:
            rmse_score = rmse
            mae_score = mae
            r2_score = r2

    # Generate the plots
    empty_img_folder()
    plot_errors(criteria, param_values, training_err_by_criteria,
                training_dev_by_criteria, test_err_list)

    # Once hyperparameters are selected, train and save the best model
    if save_model:
        print(
            "\nEvaluation finished. Training final model with train + test data with the best hyperparameters..."
        )
        final_model = Pipeline(
            steps=[('scaler', StandardScaler()),
                   ('regression',
                    DecisionTreeRegressor(
                        max_depth=param_list[best_index]
                        ['regression__max_depth'],
                        min_samples_leaf=param_list[best_index]
                        ['regression__min_samples_leaf'],
                        min_samples_split=param_list[best_index]
                        ['regression__min_samples_split']))])

        # Train the best model with all the data (training + test)
        full_X = np.vstack((X_train, X_test))
        full_y = np.concatenate((y_train, y_test))
        final_model.fit(full_X, full_y)

        # Get a barplot with feature importances
        feature_importances = final_model.named_steps[
            'regression'].feature_importances_
        plot_feature_importances(feature_importances, variable_names)

        # Create a visual representation of the tree and convert it to PNG
        tree_graph = tree.export_graphviz(
            final_model.named_steps['regression'],
            out_file='/tmp/tree.dot',
            max_depth=4)
        (graph, ) = pydot.graph_from_dot_file('/tmp/tree.dot')
        graph.write_png('./img/tree.png')

        # Log plots and model with mlflow
        mlflow.log_artifacts('./img')
        mlflow.sklearn.log_model(final_model, 'model')

    # Log results with mlflow
    mlflow.log_metric("training_mse", training_err_list[best_index])
    mlflow.log_metric("test_mse", min(test_err_list))
    mlflow.log_metric("rmse", rmse_score)
    mlflow.log_metric("mae", mae_score)
    mlflow.log_metric("r2", r2_score)
    mlflow.set_tag("best_params", param_list[best_index])

    # Output the results
    print(f'''
-----------------------------------------------------------------------------------------------------------------------
RESULTS
-----------------------------------------------------------------------------------------------------------------------
Best params: {param_list[best_index]}
Training MSE: {training_err_list[best_index]}
Test MSE: {min(test_err_list)}
RMSE: {rmse_score}
MAE: {mae_score}
R2: {r2_score}
-----------------------------------------------------------------------------------------------------------------------
''')
Example no. 18
def xgb_reg(n_estimators=100,
            max_depth=6,
            learning_rate=0.05,
            k=5,
            train_data_path='../data/training_data.csv',
            save_model=False,
            tracking_uri="http://0.0.0.0:5000"):

    # Log the parameters with mlflow
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("max_depth", max_depth)
    mlflow.log_param("learning_rate", learning_rate)
    mlflow.set_tag("k", k)

    # Set random seed for reproducibility
    np.random.seed(RANDOM_SEED)
    random.seed(RANDOM_SEED)

    # Get data shuffled and split into training and test sets
    mdr = MiningDataReader(path=train_data_path)
    (variable_names, X_train, X_test, y_train,
     y_test) = mdr.get_splitted_data()

    pipeline = Pipeline(steps=[('scaling', StandardScaler()),
                               ('regression',
                                xgb.XGBRegressor(objective="reg:squarederror",
                                                 seed=RANDOM_SEED))])

    ### TRAINING ###
    ################

    # Generate grid search for hyperparam tuning
    hyperparams = {}
    hyperparams['regression__n_estimators'] = np.arange(
        n_estimators[0], n_estimators[1], n_estimators[2])
    hyperparams['regression__max_depth'] = np.arange(max_depth[0],
                                                     max_depth[1],
                                                     max_depth[2])
    hyperparams['regression__learning_rate'] = learning_rate

    print("Training started...\n")

    # Run a grid search over the XGBoost regressor pipeline and fit it using all processors
    modelCV = GridSearchCV(estimator=pipeline,
                           param_grid=hyperparams,
                           cv=k,
                           scoring='neg_mean_squared_error',
                           n_jobs=-1)

    with ProgressBar():
        modelCV.fit(X_train, y_train)

    # Iterate over the results storing training error for each hyperparameter combination
    results = modelCV.cv_results_
    param_list, training_err_list, training_dev_list = [], [], []
    for i in range(len(results['params'])):
        param = results['params'][i]
        score = (-1) * results['mean_test_score'][i]  # NEGATIVE MSE
        std = results['std_test_score'][i]
        param_list.append(param)
        training_err_list.append(score)
        training_dev_list.append(std)

    best_params = modelCV.best_params_
    print(f"\nBest parameter set found for the training set:\n{best_params}")

    # Store the index of the best combination
    best_index = param_list.index(best_params)

    # Get the best values for hyperparams
    best_n_estimators = best_params['regression__n_estimators']
    best_max_depth = best_params['regression__max_depth']
    best_learning_rate = best_params['regression__learning_rate']

    print("\nTraining finished. Evaluating model...\n")

    ### EVALUATION ###
    ##################

    # Criteria is n_estimators
    criteria = 'n_estimators'
    mlflow.set_tag("criteria", criteria)
    param_values = hyperparams['regression__n_estimators']

    # Predict test data varying the criteria param and evaluate the models
    training_err_by_criteria, training_dev_by_criteria, test_err_list = [], [], []
    rmse_score, mae_score, r2_score = -1, -1, -1
    feature_names, feature_importances = [], []
    for param_value in tqdm(param_values):
        model = Pipeline(
            steps=[('scaler', StandardScaler()),
                   ('regression',
                    xgb.XGBRegressor(objective="reg:squarederror",
                                     n_estimators=param_value,
                                     max_depth=best_max_depth,
                                     learning_rate=best_learning_rate))])

        param = {
            'regression__n_estimators': param_value,
            'regression__max_depth': best_max_depth,
            'regression__learning_rate': best_learning_rate
        }

        # Fit model and evaluate results
        model.fit(X_train, y_train)
        prediction = model.predict(X_test)
        index = param_list.index(param)
        training_err = training_err_list[index]
        training_dev = training_dev_list[index]
        (training_mse, test_mse, rmse, mae,
         r2) = get_test_metrics(training_err, y_test, prediction)
        # Store metrics
        training_err_by_criteria.append(training_mse)
        training_dev_by_criteria.append(training_dev)
        test_err_list.append(test_mse)
        # Set additional metrics for the best combination
        if index == best_index:
            rmse_score = rmse
            mae_score = mae
            r2_score = r2

    # Generate the plots
    empty_img_folder()
    plot_errors(criteria, param_values, training_err_by_criteria,
                training_dev_by_criteria, test_err_list)

    # Once hyperparameters are selected, train and save the best model
    if save_model:
        print(
            "\nEvaluation finished. Training final model with train + test data with the best hyperparameters..."
        )
        final_model = Pipeline(
            steps=[('scaler', StandardScaler()),
                   ('regression',
                    xgb.XGBRegressor(objective="reg:squarederror",
                                     n_estimators=best_n_estimators,
                                     max_depth=best_max_depth,
                                     learning_rate=best_learning_rate))])

        # Train the best model with all the data (training + test)
        full_X = np.vstack((X_train, X_test))
        full_y = np.concatenate((y_train, y_test))
        final_model.fit(full_X, full_y)

        # Plot importances and final tree
        ax = xgb.plot_importance(final_model.named_steps['regression'])
        fig = ax.figure
        fig.savefig('./img/importances.png', bbox_inches='tight')
        plt.close(fig)

        ax = xgb.plot_tree(final_model.named_steps['regression'], rankdir='LR')
        fig = ax.figure
        fig.set_size_inches(30, 15)
        fig.savefig('./img/tree.png', dpi=400, bbox_inches='tight')
        plt.close(fig)

        # Log plots and model with mlflow
        mlflow.log_artifacts('./img')
        mlflow.sklearn.log_model(final_model, 'model')

    # Log results with mlflow
    mlflow.log_metric("train_mse", training_err_list[best_index])
    mlflow.log_metric("test_mse", min(test_err_list))
    mlflow.log_metric("rmse", rmse_score)
    mlflow.log_metric("mae", mae_score)
    mlflow.log_metric("r2", r2_score)
    mlflow.set_tag("best_params", param_list[best_index])

    # Output the results
    print(f'''
-----------------------------------------------------------------------------------------------------------------------
RESULTS
-----------------------------------------------------------------------------------------------------------------------
Best params: {param_list[best_index]}
Training MSE: {training_err_list[best_index]}
Test MSE: {min(test_err_list)}
RMSE: {rmse_score}
MAE: {mae_score}
R2: {r2_score}
-----------------------------------------------------------------------------------------------------------------------
''')
Example no. 19
def train_model(x_train,
                x_test,
                y_train,
                alphas,
                l1_ratios,
                n_folds=5,
                max_iter=1000):
    """
    Build the sklearn pipeline and cross-validated grid search used to train on the x matrix given the input y

    Arguments:
    x_train - pandas DataFrame of feature matrix for training data
    x_test - pandas DataFrame of feature matrix for testing data
    y_train - pandas DataFrame of processed y matrix (output from align_matrices())
    alphas - list of alphas to perform cross validation over
    l1_ratios - list of l1 mixing parameters to perform cross validation over
    n_folds - int of how many folds of cross validation to perform
    max_iter - the maximum number of iterations to test until convergence

    Output:
    The full pipeline sklearn object and y matrix predictions for training, testing,
    and cross validation
    """
    # Setup the classifier parameters
    clf_parameters = {
        "classify__loss": ["log"],
        "classify__penalty": ["elasticnet"],
        "classify__alpha": alphas,
        "classify__l1_ratio": l1_ratios,
    }

    estimator = Pipeline(steps=[(
        "classify",
        SGDClassifier(
            random_state=0,
            class_weight="balanced",
            loss="log",
            max_iter=max_iter,
            tol=1e-3,
        ),
    )])

    cv_pipeline = GridSearchCV(
        estimator=estimator,
        param_grid=clf_parameters,
        n_jobs=-1,
        cv=n_folds,
        scoring="roc_auc",
        return_train_score=True,
    )

    # Fit the model
    cv_pipeline.fit(X=x_train, y=y_train.status)

    # Obtain cross validation results
    y_cv = cross_val_predict(
        cv_pipeline.best_estimator_,
        X=x_train,
        y=y_train.status,
        cv=n_folds,
        method="decision_function",
    )

    # Get all performance results
    y_predict_train = cv_pipeline.decision_function(x_train)
    y_predict_test = cv_pipeline.decision_function(x_test)

    return cv_pipeline, y_predict_train, y_predict_test, y_cv
Example no. 20
from sklearn.datasets import load_digits
import sklearn.metrics as skmetrics
from sklearn.svm import SVC

run = Run.get_context()

# For a bigger dataset, try
# x, y = make_classification(5000, n_classes=16, n_informative=13)
digits = load_digits()
x = digits.data
y = digits.target

# Obtain cross validation performance for single parameter setting:
param_space = {'C': [1]}

model = SVC(kernel='rbf')
if args.cv:
    search = GridSearchCV(model,
                          param_space,
                          scoring=['accuracy'],
                          refit=False,
                          cv=args.cv)
    search.fit(x, y)
    run.log('accuracy_mean', search.cv_results_['mean_test_accuracy'][0])
    run.log('accuracy_std', search.cv_results_['std_test_accuracy'][0])
else:
    x_train, x_test, y_train, y_test = train_test_split(x, y)
    model.fit(x_train, y_train)
    y_test_pred = model.predict(x_test)
    run.log('accuracy', skmetrics.accuracy_score(y_test, y_test_pred))
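Context assumed but not shown by this snippet: the Run object comes from azureml.core, GridSearchCV and train_test_split still need importing (either scikit-learn's or dask-ml's GridSearchCV fits this usage), and args is presumably parsed from the command line, e.g.:

import argparse

from azureml.core import Run
from sklearn.model_selection import GridSearchCV, train_test_split

parser = argparse.ArgumentParser()
parser.add_argument('--cv', type=int, default=0)  # 0 / unset falls through to the plain train/test split
args = parser.parse_args()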
Example no. 21
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from dask_ml.model_selection import GridSearchCV
from dask.distributed import Client
import joblib  # sklearn.externals.joblib has been removed from recent scikit-learn releases


def simple_nn(hidden_neurons):
  model = Sequential()
  model.add(Dense(hidden_neurons, activation='relu', input_dim=30))
  model.add(Dense(1, activation='sigmoid'))
  model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
  return model

param_grid = {'hidden_neurons': [100, 200, 300]}
if __name__ == '__main__':
    client = Client()
    cv = GridSearchCV(KerasClassifier(build_fn=simple_nn, epochs=100), param_grid)
    X, y = load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    with joblib.parallel_backend("dask", scatter=[X_train, y_train]):
        cv.fit(X_train, y_train)
    print(f'Best Accuracy for {cv.best_score_:.4} using {cv.best_params_}')
Example no. 22
def optimize_it(query,
                truthfile,
                db,
                function,
                taxlevel=6,
                n_jobs=-1,
                grid='simple',
                **kwargs):
    taxon = taxlevels[taxlevel]
    prefix = os.path.basename(query)
    prefix = os.path.splitext(prefix)[0]
    mock = prefix.split('_')[0]
    name = function.__name__.split('_')[1]
    pickle_pref = '%s_%s_%s' % (prefix, name, taxon)
    print('Optimizing %s in %s' % (name, taxon))
    if os.path.isfile('%s.pckl' % pickle_pref):
        print('Loading previous run')
        with open('%s.pckl' % pickle_pref, 'rb') as p:
            d = dill.load(p)
    else:
        X, y, truth = fasta2array(query, truthfile)
        params = product(*[kv for kv in kwargs.values()])
        pnames = [x[0] for x in product(kwargs)]
        if os.path.isfile('%s_split.pckl' % prefix):
            with open('%s_split.pckl' % prefix, 'rb') as f:
                X_train, X_test, y_train, y_test = dill.load(f)
        else:
            X_train, X_test, y_train, y_test = split_train(
                X.copy(deep=True), y.copy(deep=True))
            with open('%s_split.pckl' % prefix, 'wb') as f:
                dill.dump((X_train, X_test, y_train, y_test), f)
        query_train = array2fasta(X_train)
        query_test = array2fasta(X_test)
        prefix = query[:query.rfind('.fa')]  # strip the .fa extension from the query path
        f_param = dict(db=db,
                       taxlevel=taxlevel,
                       asfile=False,
                       prefix=prefix,
                       query=query_train)
        if grid == 'simple':
            asfile = False
            if os.path.isfile('%s_training.tsv' % pickle_pref):
                scores = pd.read_csv('%s_training.tsv' % pickle_pref, sep='\t')
            else:
                tax = taxlevels[taxlevel]
                delayed_results = [
                    dask.delayed(
                        compute_simple_grid(function, i, pnames, f_param,
                                            y_train, tax))
                    for i in tqdm(params)
                ]
                with ProgressBar():
                    scores = dask.compute(*delayed_results)
                scores = pd.concat(scores)
                scores.to_csv('%s_training.tsv' % pickle_pref, sep='\t')
            highcols = ['MCC']
            if "hmmufotu" in name:
                highcols = ['F1S']
            if 'blast' in name:
                highcols += ['evalue', 'max_target_seqs']
                lowcols = ['p_id']
            elif 'lca' in name:
                highcols += ['evalue']
                lowcols = ['p_id', 'm_hit', 'p_hit']
            else:
                lowcols = None
            best = scores.nlargest(1, columns=highcols, keep='all')
            if lowcols is not None:
                best = best.nsmallest(1, columns=lowcols)
            best = best.reset_index(drop=True)
            try:
                best = best.loc[0, pnames].to_dict()
            except TypeError:
                print(best)
                raise
        else:
            print('  Performing Grid Search CV')
            asfile = True
            with ProgressBar():
                tuned_parameters = [kwargs]
                p = dict(program=function,
                         db=db,
                         taxlevel=taxlevel,
                         asfile=False,
                         prefix=prefix)
                c = GridSearchCV(Estimator(**p),
                                 tuned_parameters,
                                 cv=3,
                                 n_jobs=n_jobs,
                                 scoring=mathews_scorer).fit(X, y)
            with open('%s_trainingres.pckl' % prefix, 'wb') as f:
                dill.dump(c.cv_results_, f)
            best = c.best_params_
        print('Processing best parameters in %s' % name)
        print(best)
        with ResourceProfiler(dt=0.01) as prof:
            start = time.time()
            results = function(prefix=prefix,
                               db=db,
                               query=query_test,
                               asfile=asfile,
                               taxlevel=taxlevel,
                               **best)
            elapsed = time.time() - start
            results = results.replace(r'^\s*$', np.nan, regex=True)
        with open('%s_cvresults.pckl' % pickle_pref, 'wb') as q:
            dill.dump((results, y), q)
        resources = pd.DataFrame(data=prof.results).mean()
        resources = resources.rename({'time': 'timestamp'})  # free the 'time' key for the elapsed wall-clock time set below
        resources['time'] = elapsed
        sc = [
            score(results, y_test, resources, taxa)
            for taxa in taxlevels.values()
        ]
        d = pd.DataFrame(data=sc)
        d['Method'] = name
        d['Mock'] = mock
        print(d)
        with open('%s.pckl' % pickle_pref, 'wb') as p:
            dill.dump(d, p)
    return d
Example no. 23
n_sig = y_train[y_train == 1].shape[0]
n_bkg = y_train[y_train == 0].shape[0]
spw = n_bkg / n_sig

n_sig = y[y == 1].shape[0]
n_bkg = y[y == 0].shape[0]
spw = n_bkg / n_sig
print(spw)

search_parameters = {
    "learning_rate": [0.02, 0.05, 0.1],
    "num_leaves": [20, 50, 150, 200],
    "min_child_samples": [40, 60, 100, 160, 240],
    "max_depth": [3, 4, 5, 6, 7, 8],
}

clf = lgbm.LGBMClassifier(boosting_type="gbdt",
                          scale_pos_weight=spw,
                          n_estimators=1000)

fit_params = {
    "early_stopping_rounds": 15,
    "eval_metric": "auc",
    "eval_set": [(X_test, y_test)],
    "eval_sample_weight": [w_test],
}

search = GridSearchCV(clf, param_grid=search_parameters, cv=2)
search.fit(df, y, **fit_params)
Example no. 24
# modified from https://github.com/amueller/scipy-2018-sklearn/blob/master/notebooks/15.Pipelining_Estimators.ipynb

from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from dask_ml.model_selection import GridSearchCV
from dask.distributed import Client
from sklearn.pipeline import make_pipeline
from dask_ml.preprocessing import StandardScaler
from dask_ml.linear_model import LogisticRegression

if __name__ == "__main__":
    client = Client()
    data = Path('./data')
    df = pd.read_csv(data / "01_heights_weights_genders.csv")
    y = 1 * (df.Gender == "Male").values
    X = df[['Height', 'Weight']].values
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    pipeline = make_pipeline(StandardScaler(), LogisticRegression())
    grid = GridSearchCV(pipeline,
                        param_grid={'logisticregression__C': [.1, 1, 10, 100]},
                        cv=5)
    grid.fit(X_train, y_train)
    print("Score", grid.score(X_test, y_test))