Code example #1
    # Assumes `import os` and `import pandas as pd` at module level (both are used below).
    def hpo_dask(self,
                 model,
                 params,
                 X,
                 y,
                 exp_name='exp_0',
                 joblib=True,
                 cv=2,
                 n_iter=10,
                 verbose=10,
                 n_jobs=-1,
                 random_state=0,
                 report=True):

        self.__engine_init()
        if joblib:
            from sklearn.model_selection import RandomizedSearchCV
            # Note: this import rebinds the local name `joblib` (the boolean
            # flag above) to the joblib module for the rest of this branch.
            import joblib
            search = RandomizedSearchCV(model,
                                        params,
                                        cv=cv,
                                        n_iter=n_iter,
                                        verbose=verbose,
                                        n_jobs=n_jobs,
                                        random_state=random_state)
            with joblib.parallel_backend('dask'):
                print("Using dask backend")
                print("Started fitting")
                # Fit inside the context manager so the Dask backend is
                # actually used for the parallel search.
                search.fit(X, y)
        else:
            from dask_ml.model_selection import RandomizedSearchCV
            search = RandomizedSearchCV(model,
                                        params,
                                        cv=cv,
                                        n_iter=n_iter,
                                        n_jobs=-1,
                                        random_state=random_state)
            print("Started fitting")
            search.fit(X, y)
        best = search.best_estimator_
        print("Best score {}".format(search.best_score_))
        print("Best params {}".format(search.best_params_))
        if report:
            from joblib import dump
            import json
            self.__check_dirs()
            print("Saving report and best model")
            cv_report = pd.DataFrame(search.cv_results_)
            rep_name = "{}_cv_results.csv".format(exp_name)
            path_report = os.path.join(self.report_dir, rep_name)
            cv_report.to_csv(path_report)
            best_name = "{}_best.pkl"
            path_model = os.path.join(self.models_dir, best_name)
            dump(best, path_model)
            param_name = "{}_best_params.json".format(search.best_params_)
            path_best_params = os.path.join(self.report_dir, param_name)
            with open(path_best_params, 'w') as fp:
                json.dump(search.best_params_, fp)
        return best
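
The joblib branch above relies on joblib's 'dask' backend. Below is a minimal, self-contained sketch of that pattern; the local cluster, toy dataset, and estimator are illustrative assumptions, not part of the original snippet:

# Standalone sketch of the joblib 'dask' backend pattern used above; the
# local cluster, toy dataset, and estimator are illustrative assumptions.
import joblib
from dask.distributed import Client, LocalCluster
from scipy.stats import randint
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

client = Client(LocalCluster(n_workers=2, threads_per_worker=1))

X, y = make_classification(n_samples=1000, n_features=20, random_state=0)
search = RandomizedSearchCV(RandomForestClassifier(random_state=0),
                            {'n_estimators': randint(50, 200),
                             'max_depth': randint(2, 8)},
                            n_iter=5,
                            cv=3,
                            random_state=0)

# Fitting inside the context manager routes joblib's parallelism to Dask.
with joblib.parallel_backend('dask'):
    search.fit(X, y)

print(search.best_score_, search.best_params_)
client.close()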
Code example #2
File: logreg.py Project: xanthorpe/soccer_xg
def _logreg_gridsearch_model(
    task,
    numeric_features,
    categoric_features,
    learning_rate,
    use_dask,
    n_iter,
    scoring,
):
    if learning_rate is None:
        param_space = {
            'clf__C': np.logspace(-5, 5, 100),
            'clf__class_weight': ['balanced', None],
        }
        model = LogisticRegression(max_iter=10000, fit_intercept=False)
    else:
        param_space = {
            'clf__penalty': ['l1', 'l2'],
            'clf__alpha': np.logspace(-5, 5, 100),
            'clf__class_weight': ['balanced', None],
        }
        learning_rate_schedule = ('constant' if isinstance(
            learning_rate, float) else learning_rate)
        eta0 = learning_rate if isinstance(learning_rate, float) else 0
        model = SGDClassifier(
            learning_rate=learning_rate_schedule,
            eta0=eta0,
            loss='log',  # renamed to 'log_loss' in scikit-learn >= 1.1
            max_iter=10000,
            fit_intercept=False,
        )

    pipe = Pipeline([
        (
            'preprocessing',
            simple_proc_for_linear_algoritms(numeric_features,
                                             categoric_features),
        ),
        ('clf', model),
    ])

    if use_dask:
        from dask_ml.model_selection import RandomizedSearchCV

        return RandomizedSearchCV(pipe,
                                  param_space,
                                  n_iter=n_iter,
                                  scoring=scoring,
                                  cv=5)
    else:
        from sklearn.model_selection import RandomizedSearchCV

        return RandomizedSearchCV(pipe,
                                  param_space,
                                  n_iter=n_iter,
                                  scoring=scoring,
                                  cv=5)
Code example #3
def _mlp_gridsearch_model(
    task,
    numeric_features,
    categoric_features,
    learning_rate,
    use_dask,
    n_iter,
    scoring,
):
    param_space = {
        'clf__hidden_layer_sizes': [
            (24, ),
            (12, 12),
            (6, 6, 6, 6),
            (4, 4, 4, 4, 4, 4),
            (12, 6, 3, 3),
        ],
        'clf__activation': ['relu', 'logistic', 'tanh'],
        'clf__batch_size': [16, 32, 64, 128, 256, 512],
        'clf__alpha': uniform(0.0001, 0.9),
        'clf__learning_rate': ['constant', 'adaptive'],
    }

    model = (MLPClassifier(learning_rate_init=learning_rate)
             if task == 'classification' else MLPRegressor(
                 learning_rate_init=learning_rate))

    pipe = Pipeline([
        (
            'preprocessing',
            simple_proc_for_linear_algoritms(numeric_features,
                                             categoric_features),
        ),
        ('clf', model),
    ])

    if use_dask:
        from dask_ml.model_selection import RandomizedSearchCV

        return RandomizedSearchCV(pipe,
                                  param_space,
                                  n_iter=n_iter,
                                  scoring=scoring,
                                  cv=5)
    else:
        from sklearn.model_selection import RandomizedSearchCV

        return RandomizedSearchCV(pipe,
                                  param_space,
                                  n_iter=n_iter,
                                  scoring=scoring,
                                  cv=5)
Code example #4
File: xgboost.py Project: xanthorpe/soccer_xg
def _xgboost_gridsearch_model(
    task,
    numeric_features,
    categoric_features,
    learning_rate,
    use_dask,
    n_iter,
    scoring,
):
    param_space = {
        'clf__max_depth': randint(2, 11),
        'clf__min_child_weight': randint(1, 11),
        'clf__subsample': uniform(0.5, 0.5),
        'clf__colsample_bytree': uniform(0.5, 0.5),
        'clf__colsample_bylevel': uniform(0.5, 0.5),
        'clf__gamma': uniform(0, 1),
        'clf__reg_alpha': uniform(0, 1),
        'clf__reg_lambda': uniform(0, 10),
        'clf__base_score': uniform(0.1, 0.9),
        'clf__scale_pos_weight': uniform(0.1, 9.9),
    }

    model = (xgbsk.XGBClassifier(learning_rate=learning_rate)
             if task == 'classification' else xgbsk.XGBRegressor(
                 learning_rate=learning_rate))

    pipe = Pipeline([
        (
            'preprocessing',
            simple_proc_for_tree_algoritms(numeric_features,
                                           categoric_features),
        ),
        ('clf', model),
    ])

    if use_dask:
        from dask_ml.model_selection import RandomizedSearchCV

        return RandomizedSearchCV(pipe,
                                  param_space,
                                  n_iter=n_iter,
                                  scoring=scoring,
                                  cv=5)
    else:
        from sklearn.model_selection import RandomizedSearchCV

        return RandomizedSearchCV(pipe,
                                  param_space,
                                  n_iter=n_iter,
                                  scoring=scoring,
                                  cv=5)
Code example #5
File: rf_tuning_run.py Project: TejM/Cough_Project
def main():
    print("Loading data...", end='\r')
    x, y, iterator = load_data_nozeros_bypoint()
    print("Loaded                      ")
    # Number of trees in the random forest
    n_estimators = [int(x) for x in np.linspace(start=50, stop=1000, num=10)]
    # Number of features to consider at every split
    max_features = ['auto', 'sqrt']  # note: 'auto' was removed in scikit-learn 1.3
    # Maximum number of levels in tree
    max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
    max_depth.append(None)
    # Minimum number of samples required to split a node
    min_samples_split = [2, 5, 10]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4]
    # Method of selecting samples for training each tree
    bootstrap = [True, False]
    # Create the random grid
    params = {
        'n_estimators': n_estimators,
        'max_features': max_features,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'bootstrap': bootstrap
    }
    scoring = {
        'accuracy': 'accuracy',
        'precision': 'precision',
        'recall': 'recall',
        'f1': 'f1',
        'mcc': sklearn.metrics.make_scorer(sklearn.metrics.matthews_corrcoef)
    }

    out_dir = gouda.ensure_dir(os.path.join(RESULTS_DIR, 'log_results'))
    print(out_dir)
    rf = RandomForestClassifier()
    with ProgressBar():
        # RandomizedSearchCV here is presumably dask_ml's implementation,
        # given the iid and cache_cv arguments below.
        grid_search = RandomizedSearchCV(rf,
                                         params,
                                         scoring=scoring,
                                         n_jobs=-1,
                                         cv=iterator,
                                         refit='mcc',
                                         iid=True,
                                         cache_cv=True)
        grid_search.fit(x, y)
    configs, means = get_configurations(grid_search.cv_results_)
    output_path = os.path.join(out_dir, 'new_RF.pickle')
    dump([
        'test', 'grid_search.cv_results_', grid_search.cv_results_,
        'grid_search.best_params_', grid_search.best_params_,
        'grid_search.best_score_', grid_search.best_score_,
        'grid_search.best_estimator_', grid_search.best_estimator_
    ], output_path)
Code example #6
def model_selection(pipeline: Pipeline, X, y, n_iter: int, log) -> RandomizedSearchCV:
    """Performs model selection using randomized search with cross-validation

    Parameters
    ----------
    pipeline: Pipeline
        pipeline on which the search is to be performed
    X:
        dataframe containing the features on which model selection is performed
    y:
        dataframe containing the outcome on which model selection is performed
    n_iter: int
        number of search steps to be performed
    log:
        logger object

    Returns
    -------
    RandomizedSearchCV
        the fitted randomized search object
    """

    param_dists = {
        "feature_gen__cat_avg__category points_average__min_count":
        [15, 30, 50],
        "feature_gen__cat_not_winery__category_cutoff__min_count":
        [15, 30, 50],
        "feature_gen__cat_winery__category_cutoff__min_count": [5, 10],
        "feature_gen__designation__decomposition__n_components": [5, 20, 50],
        "feature_gen__designation__vectorizer__sublinear_tf": [True, False],
        "feature_gen__description__decomposition__n_components": [20, 50, 75],
        "feature_gen__description__vectorizer__sublinear_tf": [True, False],
        "feature_gen__title__decomposition__n_components": [5, 20, 50],
        "feature_gen__title__vectorizer__sublinear_tf": [True, False],
        "regressor__min_samples_leaf": [5, 10, 25, 50, 100],
        "regressor__max_features": ['sqrt', 'log2'],
        "regressor__n_estimators": [50, 100, 300]
    }

    log.info('Running model selection')
    log.info(f'n_iter = {n_iter}')
    searchcv = RandomizedSearchCV(estimator=pipeline,
                                  param_distributions=param_dists,
                                  n_iter=int(n_iter),
                                  cv=5,
                                  scoring='neg_mean_squared_error',
                                  return_train_score=False)
    searchcv.fit(X, y)
    log.info('Model selection done')

    return searchcv
Code example #7
def search(model,
           X,
           y,
           params,
           method="randomized",
           n_iter=30,
           cv=5,
           **kwargs):
    """Run a cross-validated search for hyperparameters."""
    if method.lower() == "randomized":
        search = RandomizedSearchCV(model,
                                    param_distributions=params,
                                    n_iter=n_iter,
                                    cv=cv)
    elif method.lower() == "grid":
        search = GridSearchCV(model, param_grid=params, cv=cv)
    elif method.lower() == "bayes":
        search = BayesSearchCV(model,
                               search_spaces=params,
                               n_iter=n_iter,
                               cv=cv)
    else:
        message = ("'method' must be either 'randomized', 'grid' or 'bayes'."
                   " Got method='{}'".format(method))
        LOGGER.error(message)
        raise ValueError(message)

    method_name = method.capitalize() + "SearchCV"
    LOGGER.info("Beginning " + method_name)
    when_started = time()

    progress(search.fit(X, y))

    total_time = time() - when_started
    n_settings = len(search.cv_results_['params'])
    LOGGER.warning(
        "{} took {:.2f} seconds for {} candidate parameter settings.".format(
            method_name, total_time, n_settings))
    return search
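
A hypothetical call to the search() helper above; the estimator, data, and parameter distributions are illustrative assumptions, and it presumes the module-level names used inside search() (RandomizedSearchCV, GridSearchCV, BayesSearchCV, LOGGER, progress, time) are imported as in the source project:

# Hypothetical usage of the search() helper above. The estimator, data and
# parameter distributions are illustrative assumptions; the module-level
# names used inside search() are assumed to be imported as in the source.
from scipy.stats import randint
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=500, n_features=10, random_state=0)
params = {'n_estimators': randint(50, 300), 'max_depth': randint(2, 10)}

fitted = search(RandomForestClassifier(random_state=0),
                X,
                y,
                params,
                method='randomized',
                n_iter=10,
                cv=3)
print(fitted.best_params_)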
Code example #8
File: continuous.py Project: razeayres/sleepy
    def train(self):
        random.seed(42)  # seed the module-level RNG; random.seed() returns None
        bins = qcut(self.Y, 5, labels=False, duplicates='drop')
        X_train, self.X_test, Y_train, self.Y_test = train_test_split(
            self.X, self.Y, test_size=0.3, stratify=bins, random_state=42)
        estimator = GradientBoostingRegressor(random_state=42)
        selector = RFECV(estimator, cv=2, min_features_to_select=1)
        if self.load_save:
            self.logger.warning(
                'The predefined parameters are going to be used')
            self.model = joblib.load(self.name + '.pkl')
            self.hp = None
        else:
            self.logger.warning('The RandomizedSearchCV() is going to be used')
            grid = {
                'estimator__n_estimators':
                [int(x) for x in linspace(10, 1000, num=101)],
                'estimator__max_depth':
                [int(x) for x in linspace(1, 100, num=101)],
                'estimator__min_samples_split':
                [int(x) for x in linspace(2, 50, num=49)],
                'estimator__min_samples_leaf':
                [int(x) for x in linspace(2, 50, num=49)]
            }
            # client = Client('192.168.200.1:8786')
            self.rscv = RandomizedSearchCV(
                estimator=selector,
                param_distributions=grid,
                n_iter=50,
                scoring='r2',
                cv=2,  # <-- change the number of simulations here! 4000 is the original
                iid=False,
                random_state=42,
                n_jobs=3)  # , scheduler=client)
            self.rscv.fit(X_train, Y_train)
            self.model = self.rscv.best_estimator_
            joblib.dump(self.model, self.name + '.pkl', compress=1)
            self.hp = self.rscv.best_params_
Code example #9
parameters_for_testing = {
    'gamma': [0, 0.03, 0.1, 0.3],
    'min_child_weight': [1.5, 6, 10],
    'learning_rate': [0.1, 0.07],
    'max_depth': [3, 5],
    'n_estimators': [10000],
    'reg_alpha': [1e-5, 1e-2, 0.75],
    'reg_lambda': [1e-5, 1e-2, 0.45],
    'subsample': [0.6, 0.95]
}

print("hyperparameters")
#gb_model = xgboost.XGBRegressor(learning_rate =0.1, n_estimators=1000, max_depth=5,
#     min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, nthread=6, scale_pos_weight=1, seed=27)
cv = KFold(10, shuffle=True)

# `bst` is the XGBoost estimator defined earlier in the source (not shown here).
rsearch1 = RandomizedSearchCV(estimator=bst,
                              param_distributions=parameters_for_testing,
                              n_jobs=-1,
                              iid=False,  # removed in scikit-learn 0.24; requires an older version
                              n_iter=100,
                              cv=cv,
                              scoring='neg_mean_squared_error')
rsearch1.fit(X_train, y_train)

print('######################################################')
# grid_scores_ no longer exists on RandomizedSearchCV; cv_results_ replaces it
print(rsearch1.cv_results_)
print('best params')
print(rsearch1.best_params_)
print('best score')
print(rsearch1.best_score_)


#bst.save_model('0001.model')



Code example #10
File: train_by_subject.py Project: nshawen/beat_pd
def train_by_subject(labels_df, features_df, cohort, device, instrument,
                     subject_id, label):

    label_cols = ['on_off', 'dyskinesia', 'tremor', 'subject_id']
    id_cols = ['measurement_id', 'id']

    labels_df["subject_id"] = labels_df["subject_id"].astype(str)
    subj_means = labels_df.groupby('subject_id').mean()

    df = features_df.dropna().merge(labels_df,
                                    right_on='measurement_id',
                                    left_on='measurement_id')
    print('%d rows dropped due to nans in features' %
          (features_df.shape[0] - df.shape[0]))

    # Model

    ## Model spec
    scaler = preprocessing.RobustScaler(quantile_range=(1, 99))
    scaler_pg = {
        'scaler__quantile_range': [(.1, 99.9), (.5, 99.5), (1, 99), (5, 95),
                                   (10, 90)],
    }

    # Keep features w/ variance in top x%ile
    var = lambda X, y: np.var(X, axis=0)
    f_select = feature_selection.SelectPercentile(var, percentile=95)
    f_select_pg = {'f_select__percentile': stats.uniform(0, 100)}

    model = ensemble.RandomForestRegressor()
    model_pg = {
        'model__regressor__n_estimators': stats.randint(50, 100),
        'model__regressor__max_depth': stats.randint(10, 25),
        'model__regressor__max_features': [.25, 'auto']
    }

    clip_out = preprocessing.FunctionTransformer(np.clip,
                                                 kw_args={
                                                     'a_min': 0,
                                                     'a_max': 4
                                                 })
    clipped_model = compose.TransformedTargetRegressor(
        regressor=model, inverse_func=clip_out.transform)

    pipe = pipeline.Pipeline(
        [
            ('scaler', scaler),
            ('f_select', f_select),
            ('model', clipped_model),
        ],
        verbose=1)

    param_grid = {
        **scaler_pg,
        **f_select_pg,
        **model_pg,
    }

    metric = metrics.make_scorer(metrics.mean_squared_error,
                                 greater_is_better=False)

    cv = model_selection.StratifiedKFold(shuffle=True)

    ## Model eval

    subj_df = df
    print(f'working on {label}')

    labeled_samps = subj_df.dropna(subset=[label])
    if not labeled_samps.shape[0]:
        print(f'skipping {label}')
        return None

    print(labeled_samps.columns.values.tolist())

    y = subj_df.loc[labeled_samps.index, label].astype('int')
    X = labeled_samps.drop(columns=[*label_cols, *id_cols])

    search = RandomizedSearchCV(pipe,
                                param_grid,
                                n_iter=20,
                                scoring=metric,
                                cv=cv,
                                refit=False)
    cv_fit = search.fit(X, y)
    cv_results_df = pd.DataFrame(cv_fit.cv_results_)

    resultset_json = {
        'cohort': cohort,
        'subject_id': subject_id,
        'model_type': str(type(model).__name__),
        'label': label
    }
    win_params = cv_results_df.loc[cv_results_df.rank_test_score == 1,
                                   'params'].values[0]
    winner = pipe.set_params(**win_params)

    return winner, cv_results_df, resultset_json