Example #1
def test_searchcv_callback():
    # Test whether the callback is used in BayesSearchCV and
    # whether it can be used to interrupt the search loop

    X, y = load_iris(return_X_y=True)
    opt = BayesSearchCV(
        DecisionTreeClassifier(),
        {
            'max_depth': [3],  # additional test for single dimension
            'min_samples_split': Real(0.1, 0.9),
        },
        n_iter=5)
    total_iterations = [0]

    def callback(opt_result):
        # this simply counts iterations
        total_iterations[0] += 1

        # break the optimization loop at some point
        if total_iterations[0] > 2:
            return True  # True == stop optimization

        return False

    opt.fit(X, y, callback=callback)

    assert total_iterations[0] == 3

    # test whether final model was fit
    opt.score(X, y)
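
For reference, skopt also ships ready-made stopping callbacks in skopt.callbacks. A minimal sketch (reusing the opt object from the test above, with an assumed 60-second budget) that stops the search on elapsed time instead of an iteration count:

from skopt.callbacks import DeadlineStopper

# Stop requesting new parameter settings once 60 seconds have elapsed.
opt.fit(X, y, callback=DeadlineStopper(total_time=60))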
    def trainB(self, X_list, y_list, n_points=1, space=spaceB, cv=5):
        """
        Leave-one-season-out evaluation using BayesSearchCV.
        :param X_list: list of training sets, one per season
        :param y_list: list of targets, one per season
        :param n_points: number of parameter settings sampled in parallel
        :param space: parameter space
        :param cv: number of cross-validation folds
        :return: models and metrics
        """
        n_calls = self.n_calls

        scores = []
        val_scores = []
        best_models = []

        for j in range(len(X_list)):
            classifier = RandomForestClassifier(n_jobs=-1)
            y = y_list.copy()
            X = X_list.copy()
            y_test = y.pop(j)
            X_test = X.pop(j)
            y_train = np.concatenate(y, axis=0)
            X_train = np.concatenate(X, axis=0)

            X_train = Features().div_cols(X_train).values
            X_test = Features().div_cols(X_test).values

            start = time()
            opt = BayesSearchCV(classifier,
                                search_spaces=space,
                                scoring=self.scorer,
                                cv=cv,
                                n_points=n_points,
                                n_iter=n_calls,
                                n_jobs=-1)

            opt.fit(X_train, y_train)
            model = opt.best_estimator_
            print('Season', 2019 - j)
            print("Bayes CV search took %.2f seconds for %d candidates"
                  " parameter settings." % ((time() - start), n_calls))
            print("val. score:", opt.best_score_)
            print("test score:", opt.score(X_test, y_test))
            # print(model)
            print("")
            best_models.append(model)
            val_scores.append(opt.best_score_)
            scores.append(opt.score(X_test, y_test))
        return scores, val_scores, best_models
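
The pop/concatenate loop above is a hand-rolled leave-one-group-out split. For reference, a sketch of the same split logic using scikit-learn's LeaveOneGroupOut (illustrative, not the original code):

from sklearn.model_selection import LeaveOneGroupOut
import numpy as np

X_all = np.concatenate(X_list, axis=0)
y_all = np.concatenate(y_list, axis=0)
# label every row with the index of the list entry (season) it came from
groups = np.concatenate([[j] * len(y) for j, y in enumerate(y_list)])

for train_idx, test_idx in LeaveOneGroupOut().split(X_all, y_all, groups):
    X_train, X_test = X_all[train_idx], X_all[test_idx]
    y_train, y_test = y_all[train_idx], y_all[test_idx]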
Example #4
def run_optimization_test():

    N_iter = 100
    # log-uniform: understand as search over p = exp(x) by varying x
    opt = BayesSearchCV(
        TemplateClassifier(),
        {
            "deltaEta": Real(0.0, 4.0, prior="uniform"),
            "deltaPhi": Real(0.0, 4.0, prior="uniform"),
            "maxNRegions": Integer(2, 100),
            "maxNVertices": Integer(1, 5),
            "nSigmaZBeamSpot": Real(0.0, 30.0, prior="uniform"),
            "nSigmaZVertex": Real(-1.0, 1.0, prior="uniform"),
            "originRadius": Real(0.0, 1.0, prior="uniform"),
            "ptMin": Real(0.0, 2.0, prior="uniform"),
            "zErrorBeamSpot": Real(0.0, 1.0, prior="uniform"),
            "zErrorVetex": Real(0.0, 1.0, prior="uniform"),
        },
        n_iter=N_iter,
        # a single "fold" that trains and validates on the full dataset
        cv=[(slice(None), slice(None))],
        verbose=1,
        # scoring="accuracy"
    )

    opt.fit(np.zeros((100, 1)), np.zeros(100))

    print("After {} iterations:".format(N_iter))
    print("val. score: %s" % opt.best_score_)
    print("test score: %s" % opt.score(0.0, 0.0))
    print("Final params:")
    params = opt.best_estimator_.get_params()
    for i, (param, val) in enumerate(params.items()):
        print("{0}:\t{1:2.2f} vs {2:2.2f}".format(param, val, targets[i]))
Example #5
def _fit_svc(n_jobs=1, n_points=1, cv=None):
    """
    Utility function to fit a larger classification task with SVC
    """

    X, y = make_classification(n_samples=1000,
                               n_features=20,
                               n_redundant=0,
                               n_informative=18,
                               random_state=1,
                               n_clusters_per_class=1)

    opt = BayesSearchCV(
        SVC(),
        {
            'C': Real(1e-3, 1e+3, prior='log-uniform'),
            'gamma': Real(1e-3, 1e+1, prior='log-uniform'),
            'degree': Integer(1, 3),
        },
        n_jobs=n_jobs,
        n_iter=11,
        n_points=n_points,
        cv=cv,
        random_state=42,
    )

    opt.fit(X, y)

    assert opt.score(X, y) > 0.9
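
The helper is parametrized by n_jobs and n_points; a hedged usage sketch showing how sampling several points per iteration pairs with parallel fitting:

# Ask the optimizer for 4 parameter settings at a time and evaluate them
# across 4 worker processes, with 3-fold cross-validation.
_fit_svc(n_jobs=4, n_points=4, cv=3)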
def bayesian_optimization(model,
                          space,
                          scorer,
                          x_train,
                          y_train,
                          x_test,
                          y_test,
                          n_iter=256,
                          cv=4,
                          n_jobs=None):
    global counter
    global opt

    if n_jobs is None:
        n_jobs = cv

    opt = BayesSearchCV(model,
                        space,
                        scoring=scorer,
                        n_iter=n_iter,
                        cv=cv,
                        verbose=10,
                        n_jobs=n_jobs)

    counter = 0
    opt.fit(x_train, y_train, callback=on_step)

    print(opt.best_params_)
    print("val. score: %s" % opt.best_score_)
    print("test score: %s" % opt.score(x_test, y_test))
    def testSVMParams(self, pipe):
        print("{} - CALCULATING BEST PARAMETERS... \n".format(datetime.datetime.now()))

        X_train, X_test, y_train, y_test = train_test_split(
            self.df, self.classes.values, train_size=0.75, test_size=0.25,
            random_state=0)

        listaC = [0.001, 0.01, 0.1, 1, 10, 100]
        listaGamma = [0.001, 0.01, 0.1, 1, 10, 100]
        listaKernels = ['rbf', 'linear', 'poly', 'sigmoid']

        if self.problem == 'classification':
            paramsGrid = dict(clf__C=listaC, clf__gamma=listaGamma,
                              clf__kernel=listaKernels)
            grid = BayesSearchCV(pipe, paramsGrid, scoring='accuracy', n_iter=9)
        elif self.problem == 'regression':
            if isinstance(self.classes, pd.DataFrame):
                paramsGrid = dict(reg__estimator__C=listaC,
                                  reg__estimator__gamma=listaGamma,
                                  reg__estimator__kernel=listaKernels)
            else:
                paramsGrid = dict(reg__C=listaC, reg__gamma=listaGamma,
                                  reg__kernel=listaKernels)

            grid = BayesSearchCV(pipe, paramsGrid, scoring='r2', n_iter=9)

        # print("DF: \n {}".format(self.df))
        # print("CLASSES: \n {}".format(self.classes))
        print("{} - FITTING DATA... \n".format(datetime.datetime.now()))
        grid.fit(X_train, y_train)
        print("{} - BEST RESULTS - {}".format(datetime.datetime.now(), grid.best_score_))
        print("{} - TEST RESULTS: {}".format(datetime.datetime.now(), grid.score(X_test, y_test)))
        return grid.best_params_
def get_bayes_scikit_score_cv(X_train, y_train, X_test, y_test, X_val=None,
                              y_val=None, max_evals=25, folds=5, original=None):

    space = get_baesian_space(dictem=True)
    opt_cat = BayesSearchCV(CatBoostClassifier(logging_level='Silent'),
                            space['CAT'], n_iter=max_evals, random_state=0)
    opt_xgb = BayesSearchCV(XGBClassifier(), space['XGB'],
                            n_iter=max_evals, random_state=0)
    opt_lgbm = BayesSearchCV(LGBMClassifier(), space['LGBM'],
                             n_iter=max_evals, random_state=0)
    _ = opt_cat.fit(X_train, y_train,
                    callback=[DeltaXStopper(0.01), DeltaYStopper(0.01)])
    __ = opt_xgb.fit(X_train, y_train,
                     callback=[DeltaXStopper(0.01), DeltaYStopper(0.01)])
    ___ = opt_lgbm.fit(X_train, y_train,
                       callback=[DeltaXStopper(0.01), DeltaYStopper(0.01)])

    scores = [opt_cat.score(X_test, y_test), opt_xgb.score(X_test, y_test), opt_lgbm.score(X_test, y_test)]
    train_scores  = [opt_cat.best_score_, opt_xgb.best_score_, opt_lgbm.best_score_]
    score = max(scores)
    cross_score = max(train_scores)
    neptune.log_metric(f'skopt-{max_evals}-iterations-{folds}-folds', score)
    neptune.log_metric('skopt train holdout score', cross_score)
    return score
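
The convergence stoppers used above come from skopt.callbacks; a sketch of the import this snippet assumes:

# DeltaXStopper stops when the sampled points stop moving in parameter space;
# DeltaYStopper stops when the best objective values stop improving.
from skopt.callbacks import DeltaXStopper, DeltaYStopper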
def test_searchcv_runs(surrogate, n_jobs):
    """
    Test whether the cross validation search wrapper around sklearn
    models runs properly with available surrogates and with single
    or multiple workers.

    Parameters
    ----------

    * `surrogate` [str or None]:
        A class of the scikit-optimize surrogate used. None means
        to use default surrogate.

    * `n_jobs` [int]:
        Number of parallel processes to use for computations.

    """

    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.75, random_state=0
    )

    # None search space is only supported when only `step` function is used
    assert_raises(ValueError, BayesSearchCV(SVC(), None).fit, (X, y))

    # check if invalid dimensions are raising errors
    with pytest.raises(ValueError):
        BayesSearchCV(SVC(), {'C': '1 ... 100.0'})

    with pytest.raises(TypeError):
        BayesSearchCV(SVC(), ['C', (1.0, 1)])

    # create an instance of a surrogate if it is not a string
    if surrogate is not None:
        optimizer_kwargs = {'base_estimator': surrogate}
    else:
        optimizer_kwargs = None

    opt = BayesSearchCV(
        SVC(),
        {
            'C': Real(1e-6, 1e+6, prior='log-uniform'),
            'gamma': Real(1e-6, 1e+1, prior='log-uniform'),
            'degree': Integer(1, 8),
            'kernel': Categorical(['linear', 'poly', 'rbf']),
        },
        n_jobs=n_jobs, n_iter=11,
        optimizer_kwargs=optimizer_kwargs
    )

    opt.fit(X_train, y_train)

    # this assertion normally fails only if something is wrong
    # with the optimization procedure as such
    assert_greater(opt.score(X_test, y_test), 0.9)
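
The surrogate and n_jobs arguments suggest the test is driven by pytest parametrization; a hedged sketch of decorators that would supply them (the exact values are assumptions):

@pytest.mark.parametrize("surrogate", ['GP', None])
@pytest.mark.parametrize("n_jobs", [1, -1])
def test_searchcv_runs(surrogate, n_jobs):
    ...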
def main():

    df_train = pd.read_csv('../train_dataset.csv')
    df_test = pd.read_csv('../test_dataset.csv')

    X_train, y_train = df_train.iloc[:, 2:], df_train.iloc[:, 0]
    X_test, y_test = df_test.iloc[:, 2:], df_test.iloc[:, 0]

    opt = BayesSearchCV(
        estimator=GradientBoostingClassifier(),
        # ref: https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/gradient_boosting.py
        search_spaces={
            'learning_rate': Real(0.01, 1, 'log-uniform'),
            'n_estimators': Integer(50, 2000),
            'subsample': Real(0.01, 1.0, 'uniform'),
            'max_depth': Integer(1, 10),
            'max_features': Real(0.1, 1.0, 'uniform'),
            'min_samples_split': Integer(2, 20),
            'min_samples_leaf': Integer(1, 20),
            'criterion': ['friedman_mse', 'mse', 'mae']
        },
        cv=StratifiedKFold(n_splits=10, shuffle=True),
        n_jobs=3,
        n_iter=100,
        verbose=0,
        refit=True,
        random_state=42)

    def status_print(_):
        """Status callback durring bayesian hyperparameter search"""

        # Get all the models tested so far in DataFrame format
        all_models = pd.DataFrame(opt.cv_results_)

        best_params_copy = copy.deepcopy(opt.best_params_)
        for k, v in opt.best_params_.items():
            # json can't serialize numpy scalars; unwrap them with .item()
            best_params_copy[k] = v if isinstance(v, (str, float)) else v.item()
        param_list = []
        for each in json.dumps(best_params_copy)[1:-1].split(', '):
            param_list.append('='.join(each[1:].split('": ')))

        if hasattr(opt.estimator, 'verbose'):
            param_list.append('verbose=True')

        param = opt.estimator.__class__.__name__ + \
            '(' + ', '.join(param_list) + ')'

        # Report progress: number of models tried and the best so far
        print('Model #{}\nBest CV score: {}\nBest params: {}\n'.format(
            len(all_models), np.round(opt.best_score_, 4), param))

    opt.fit(X_train, y_train, callback=status_print)

    print("val. score: %s" % opt.best_score_)
    print("test score: %s" % opt.score(X_test, y_test))
Example #11
def main():

    df_train = pd.read_csv('../train_dataset.csv')
    df_test = pd.read_csv('../test_dataset.csv')

    X_train, y_train = df_train.iloc[:, 2:].values, df_train.iloc[:, 0].values
    X_test, y_test = df_test.iloc[:, 2:].values, df_test.iloc[:, 0].values

    # log-uniform: understand as search over p = exp(x) by varying x
    opt = BayesSearchCV(
        estimator=SVC(),

        # ref: https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/libsvm_svc.py
        search_spaces={
            'C': Real(1e-6, 1e+6, 'log-uniform'),
            'gamma': Real(3.0517578125e-05, 8, 'log-uniform'),
            'kernel': ['rbf', 'poly', 'sigmoid'],  # categorical parameter
            'decision_function_shape': ['ovo', 'ovr'],
            'degree': Integer(2, 5),
            'coef0': Real(-1, 1, 'uniform'),
            'tol': Real(1e-5, 1e-1, 'log-uniform')
        },
        cv=StratifiedKFold(n_splits=10, shuffle=True),
        n_jobs=3,
        n_iter=100,
        verbose=0,
        refit=True)

    def status_print(_):
        """Status callback durring bayesian hyperparameter search"""

        # Get all the models tested so far in DataFrame format
        all_models = pd.DataFrame(opt.cv_results_)

        best_params_copy = copy.deepcopy(opt.best_params_)
        for k, v in opt.best_params_.items():
            # json can't serialize numpy scalars; unwrap them with .item()
            best_params_copy[k] = v if isinstance(v, (str, float)) else v.item()
        param_list = []
        for each in json.dumps(best_params_copy)[1:-1].split(', '):
            param_list.append('='.join(each[1:].split('": ')))

        if hasattr(opt.estimator, 'verbose'):
            param_list.append('verbose=True')

        param = opt.estimator.__class__.__name__ + \
            '(' + ', '.join(param_list) + ')'

        # Report progress: number of models tried and the best so far
        print('Model #{}\nBest CV score: {}\nBest params: {}\n'.format(
            len(all_models), np.round(opt.best_score_, 4), param))

    opt.fit(X_train, y_train, callback=status_print)

    print("val. score: %s" % opt.best_score_)
    print("test score: %s" % opt.score(X_test, y_test))
Example #12
def test_searchcv_runs(surrogate, n_jobs, n_points, cv=None):
    """
    Test whether the cross validation search wrapper around sklearn
    models runs properly with available surrogates and with single
    or multiple workers and different number of parameter settings
    to ask from the optimizer in parallel.

    Parameters
    ----------

    * `surrogate` [str or None]:
        A class of the scikit-optimize surrogate used. None means
        to use default surrogate.

    * `n_jobs` [int]:
        Number of parallel processes to use for computations.

    * `n_points` [int]:
        Number of parameter settings to sample in parallel.

    """

    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=0.75,
                                                        random_state=0)

    # create an instance of a surrogate if it is not a string
    if surrogate is not None:
        optimizer_kwargs = {'base_estimator': surrogate}
    else:
        optimizer_kwargs = None

    opt = BayesSearchCV(
        SVC(),
        {
            'C': Real(1e-6, 1e+6, prior='log-uniform'),
            'gamma': Real(1e-6, 1e+1, prior='log-uniform'),
            'degree': Integer(1, 8),
            'kernel': Categorical(['linear', 'poly', 'rbf']),
        },
        n_jobs=n_jobs, n_iter=11, n_points=n_points, cv=cv,
        optimizer_kwargs=optimizer_kwargs,
    )

    opt.fit(X_train, y_train)

    # this assertion normally fails only if something is wrong
    # with the optimization procedure as such
    assert opt.score(X_test, y_test) > 0.9
Example #14
def hyperparam_search(model_config, train, test):
    """Perform hyperparameter search using Bayesian optimization on a given model and
    dataset.

    Args:
        model_config (dict): the model and the parameter ranges to search in. Format:
        {
            "name": str,
            "class": type,
            "model": sklearn.base.BaseEstimator,
            "params": dict
        }
        train (pandas.DataFrame): training data
        test (pandas.DataFrame): test data
    """
    X_train = train.drop("label", axis=1)
    y_train = train.label
    X_test = test.drop("label", axis=1)
    y_test = test.label

    opt = BayesSearchCV(
        model_config["model"],
        model_config["params"],
        n_jobs=4,
        cv=5,
        random_state=RANDOM_SEED,
    )
    opt.fit(X_train, y_train)
    acc = opt.score(X_test, y_test)

    print(f"{model_config['name']} results:")
    print(f"Best validation accuracy: {opt.best_score_}")
    print(f"Test set accuracy: {acc}")
    print(f"Best parameters:")

    for param, value in opt.best_params_.items():
        print(f"- {param}: {value}")

    return {
        "name": model_config["name"],
        "class": model_config["class"],
        "model": opt.best_estimator_,
        "params": opt.best_params_,
        "score": acc,
    }
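
A hedged usage sketch for hyperparam_search; the config values and the train/test frames are illustrative, not from the original project:

from sklearn.ensemble import RandomForestClassifier

config = {
    "name": "random_forest",
    "class": RandomForestClassifier,
    "model": RandomForestClassifier(),
    "params": {"n_estimators": (50, 500), "max_depth": (2, 20)},
}
# train_df and test_df are assumed DataFrames with a "label" column
result = hyperparam_search(config, train_df, test_df)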
Example #15
def test_searchcv_refit():
    """
    Test whether results of BayesSearchCV can be reproduced with a fixed
    random state.
    """

    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.75, random_state=0
    )

    random_state = 42

    opt = BayesSearchCV(
        SVC(random_state=random_state),
        {
            'C': Real(1e-6, 1e+6, prior='log-uniform'),
            'gamma': Real(1e-6, 1e+1, prior='log-uniform'),
            'degree': Integer(1, 8),
            'kernel': Categorical(['linear', 'poly', 'rbf']),
        },
        n_iter=11, random_state=random_state
    )

    opt2 = BayesSearchCV(
        SVC(random_state=random_state),
        {
            'C': Real(1e-6, 1e+6, prior='log-uniform'),
            'gamma': Real(1e-6, 1e+1, prior='log-uniform'),
            'degree': Integer(1, 8),
            'kernel': Categorical(['linear', 'poly', 'rbf']),
        },
        n_iter=11, random_state=random_state, refit=True
    )

    opt.fit(X_train, y_train)
    opt2.best_estimator_ = opt.best_estimator_

    opt2.fit(X_train, y_train)
    # this assertion normally fails only if something is wrong
    # with the optimization procedure as such
    assert opt2.score(X_test, y_test) > 0.9
    def set_model(self, model_type, hyper_parameters=None):
        """Create a surrogate model"""
        base_model = self.models[model_type]['model']
        hyper_model = self.models[model_type]['model']
        if hyper_parameters:
            hyper_model = BayesSearchCV(base_model,
                                        hyper_parameters,
                                        refit=True,
                                        n_jobs=16,
                                        cv=self.cv,
                                        n_iter=self.number_iters,
                                        random_state=self.random)
            fit = hyper_model.fit(self.scaled_var_train, self.scaled_obj_train)
            r2_score = hyper_model.score(self.scaled_var_test,
                                         self.scaled_obj_test)
            mse_score = self._get_mse(hyper_model)
            self.models[model_type].update({
                'model': hyper_model,
                'fit': fit,
                'score': r2_score,
                'mse_score': mse_score,
                'hyper_parameters': hyper_model.best_params_,
                'cv_results': hyper_model.cv_results_
            })
        else:
            fit = base_model.fit(self.scaled_var_train, self.scaled_obj_train)
            r2_score = base_model.score(self.scaled_var_test,
                                        self.scaled_obj_test)
            mse_score = self._get_mse(base_model)
            self.models[model_type].update({
                'fit': fit,
                'score': r2_score,
                'mse_score': mse_score,
                'hyper_parameters': None,
                'cv_results': None
            })
Example #18
def random_forest(attributes, prediction):
    x_train, x_test, y_train, y_test = train_test_split(attributes,
                                                        prediction,
                                                        random_state=0)

    rf = RandomForestRegressor(n_estimators=200, random_state=42)

    opt = BayesSearchCV(
        rf,
        {
            'n_estimators': Integer(200, 2000),
            'max_features': Categorical(['auto', 'sqrt']),
            'max_depth': Integer(10, 110),
            'min_samples_split': Integer(2, 10),
            'min_samples_leaf': Integer(1, 4),
            'bootstrap': Categorical([True, False]),
        },
        n_iter=32,
        cv=5,
        n_jobs=-1,
        verbose=2,
    )
    opt.fit(x_train, y_train)

    print("val. score: %s" % opt.best_score_)
    print("test score: %s" % opt.score(x_test, y_test))

    params_from_bayes = opt.best_params_

    bayes_rf = RandomForestRegressor(**params_from_bayes)

    # classification scorers would fail with a regressor; use regression metrics
    scoring = ['r2', 'neg_mean_squared_error', 'neg_mean_absolute_error']

    bayes_scores = cross_validate(bayes_rf,
                                  attributes,
                                  prediction,
                                  scoring=scoring,
                                  cv=10)

    print(simplify_scores(bayes_scores))
Example #19

# log-uniform: understand as search over p = exp(x) by varying x
opt = BayesSearchCV(SVC(), search_space, n_iter=32, cv=3, n_jobs=-1)


@timing
def dummy(func, X_train, y_train):
    func(X_train, y_train)
    return None


dummy(opt.fit, X_train, y_train)

print("val. score: {}".format(opt.best_score_))
print("test score: {}".format(opt.score(X_test, y_test)))

param_grid = {
    'C': [1e-6, 1e-3, 1, 1e3, 1e+6],
    'gamma': [1e-6, 1e-4, 1e-2, 1.0, 1e+1],
    'degree': [1, 4, 8],  # integer valued parameter
    'kernel': ['linear', 'poly', 'rbf'],  # categorical parameter
}

grid_opt = GridSearchCV(SVC(),
                        param_grid,
                        cv=3,
                        n_jobs=-1,
                        return_train_score=True)
dummy(grid_opt.fit, X_train, y_train)
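
Neither the timing decorator nor search_space is shown in this snippet; a minimal sketch of what the decorator might look like (the name and details are assumptions):

import time
from functools import wraps

def timing(func):
    """Print the wall-clock time taken by the wrapped call."""
    @wraps(func)
    def wrapper(*args, **kwargs):
        start = time.time()
        result = func(*args, **kwargs)
        print("{} took {:.2f}s".format(func.__name__, time.time() - start))
        return result
    return wrapper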
def main():

    df_train = pd.read_csv('../train_dataset.csv')
    df_test = pd.read_csv('../test_dataset.csv')

    X_train, y_train = df_train.iloc[:, 2:].values, df_train.iloc[:, 0].values
    X_test, y_test = df_test.iloc[:, 2:].values, df_test.iloc[:, 0].values

    # log-uniform: understand as search over p = exp(x) by varying x
    opt = BayesSearchCV(
        estimator=xgb.XGBClassifier(),

        # ref: https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/xgradient_boosting.py
        search_spaces={
            'learning_rate': Real(0.001, 1.0, 'log-uniform'),
            'min_child_weight': Integer(0, 20),
            'max_depth': Integer(0, 50),
            'max_delta_step': Integer(0, 20),
            'subsample': Real(0.01, 1.0, 'uniform'),
            'colsample_bytree': Real(0.01, 1.0, 'uniform'),
            'colsample_bylevel': Real(0.01, 1.0, 'uniform'),
            'reg_lambda': Real(1e-10, 1e-1, 'log-uniform'),
            'reg_alpha': Real(1e-10, 1e-1, 'log-uniform'),
            'gamma': Real(1e-9, 0.5, 'log-uniform'),
            'n_estimators': Integer(50, 512),
            'scale_pos_weight': Real(1e-6, 500, 'log-uniform'),
            'booster': ["gbtree", "dart"],
            'sample_type': ['uniform', 'weighted'],
            'normalize_type': ['tree', 'forest'],
            'rate_drop': Real(1e-10, 1 - (1e-10), 'uniform')
        },
        cv=StratifiedKFold(n_splits=10, shuffle=True),
        n_jobs=3,
        n_iter=100,
        verbose=0,
        refit=True,
        random_state=42)

    def status_print(_):
        """Status callback durring bayesian hyperparameter search"""

        # Get all the models tested so far in DataFrame format
        all_models = pd.DataFrame(opt.cv_results_)

        best_params_copy = copy.deepcopy(opt.best_params_)
        for k, v in opt.best_params_.items():
            # json can't serialize numpy scalars; unwrap them with .item()
            best_params_copy[k] = v if isinstance(v, (str, float)) else v.item()
        param_list = []
        for each in json.dumps(best_params_copy)[1:-1].split(', '):
            param_list.append('='.join(each[1:].split('": ')))

        if hasattr(opt.estimator, 'verbose'):
            param_list.append('verbose=True')

        param = opt.estimator.__class__.__name__ + \
            '(' + ', '.join(param_list) + ')'

        # Report progress: number of models tried and the best so far
        print('Model #{}\nBest CV score: {}\nBest params: {}\n'.format(
            len(all_models), np.round(opt.best_score_, 4), param))

    opt.fit(X_train, y_train, callback=status_print)

    print("val. score: %s" % opt.best_score_)
    print("test score: %s" % opt.score(X_test, y_test))
Example #21
def NestedBayesCV(tups, y_train,
                  cv_outer=StratifiedKFold(10, shuffle=True, random_state=42),
                  cv_inner=StratifiedKFold(10, shuffle=True, random_state=42),
                  nested=True, n_iter=60, random_state=42, scoring='accuracy'):
    """
    Given a tuple of algorithms/different datasets/names/parameters this will
    perform randomised cross validation using nested CV to choose the best parameters for
    each combination of model and datset.
    
    params:
    - tups : a list of tuples containing algorithm/datasets/names/parameters 
    - y_train : train labels
    - cv_outer : type of CV to be used for the outer validation
    - cv_inner : type of CV to be used for the inner validation, i.e., hyperparameter
                 optimization
    - nested : whether to performe nested CV
    - n_iter: number of iterations for the randomized search
    - random_state: seed for the randomized search
    - scoring: scoring metric to be used
    """
    results = []
    
    if nested == True:
        
        for i in range(0, len(tups)):
            for j in range(0, len(tups[i][1])):
                for train, test in cv_outer.split(tups[i][1][j], y_train):
                    tX_train = tups[i][1][j][train]
                    ty_train = y_train[train]
                    tX_test = tups[i][1][j][test]
                    ty_test = y_train[test]
                    
                
                    opt = BayesSearchCV(tups[i][0], tups[i][3], cv=cv_inner,
                                        random_state=random_state,
                                        scoring=scoring, n_jobs=-1,
                                        n_iter=n_iter)
                
                    opt.fit(tX_train, ty_train)
                
                    results.append({'Combo':tups[i][2][j], 'Best_Params': opt.best_params_,
                                   'Outer_CV': opt.score(tX_test, ty_test),
                                   'Inner_CV': opt.best_score_})
            
                print('Finished with' + ' ' + tups[i][2][j])
    
        results = pd.DataFrame(results)
        results['Best_Params'] = results['Best_Params'].apply(str)
    
        if scoring == 'neg_mean_squared_error':
            
            results['Inner_CV'] = np.sqrt(-results['Inner_CV'])
            results['Outer_CV'] = np.sqrt(-results['Outer_CV'])
        
        if scoring in ['neg_log_loss', 'neg_mean_absolute_error',
                       'neg_median_absolute_error']:
            
            results['Inner_CV'] = -results['Inner_CV']
            results['Outer_CV'] = -results['Outer_CV']
    
        grouped = results.groupby(['Combo', 'Best_Params'])
        to_aggregate = {'Inner_CV':['mean', 'sem'], 'Outer_CV':['mean', 'sem', 'count']}
        results = grouped.agg(to_aggregate)
    
    else:
        for i in range(0, len(tups)):
            for j in range(0, len(tups[i][1])):
                opt = BayesSearchCV(tups[i][0], tups[i][3], cv=cv_inner,
                                    random_state=random_state, scoring=scoring,
                                    n_jobs=-1, n_iter=n_iter)

                # the non-nested branch fits on the full dataset for this combo
                opt.fit(tups[i][1][j], y_train)
                results.append({'Combo': tups[i][2][j],
                                'Best_Params': opt.best_params_,
                                'Score_CV': opt.best_score_})

                print('Finished with' + ' ' + tups[i][2][j])
                
        results = pd.DataFrame(results)
        results['Best_Params'] = results['Best_Params'].apply(str)
    
        if scoring == 'neg_mean_squared_error':
            
            results['Score_CV'] = np.sqrt(-results['Score_CV'])
        
        if scoring in ['neg_log_loss', 'neg_mean_absolute_error',
                       'neg_median_absolute_error']:
            
            results['Score_CV'] = -results['Score_CV']
    
        grouped = results.groupby(['Combo', 'Best_Params'])
        to_aggregate = {'Score_CV':['mean', 'sem', 'count']}
        results = grouped.agg(to_aggregate)
    
    return results
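
A hedged sketch of the tups structure this function expects (the names, datasets, and search space are illustrative):

tups = [
    (LogisticRegression(max_iter=500),                    # algorithm
     [X_dense, X_reduced],                                # list of datasets
     ['LR + dense', 'LR + reduced'],                      # one name per dataset
     {'C': Real(1e-3, 1e+3, prior='log-uniform')}),       # search space
]
results = NestedBayesCV(tups, y_train, n_iter=30)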
#BayesSearch
lr_param_grid = {
    'C': Real(1e-6, 1e+6, prior='log-uniform'),
    'solver': Categorical(['newton-cg', 'lbfgs', 'liblinear'])
}

lr = LogisticRegression(max_iter=150)
lr_bs = BayesSearchCV(lr,
                      lr_param_grid,
                      scoring='accuracy',
                      cv=strat_k_fold,
                      n_iter=20)
lr_bs.fit(X, y)
print("best BS test score:", lr_bs.best_score_)
print("best BS total score:", lr_bs.score(X, y))
print("best BS params:", lr_bs.best_params_)

#%%

#RF

strat_k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)

#GridSearch
rf_param_grid = {
    'n_estimators': [5, 15, 50, 100],
    'max_depth': [5, 15, 25],
    'min_samples_leaf': [1, 3],
    'max_leaf_nodes': [10, 20, 50, 100]
}
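
For comparison with the grid above, a hedged sketch of an equivalent skopt search space for the random forest (the estimator and exact ranges are assumptions):

rf_bayes_space = {
    'n_estimators': Integer(5, 100),
    'max_depth': Integer(5, 25),
    'min_samples_leaf': Integer(1, 3),
    'max_leaf_nodes': Integer(10, 100),
}
rf_bs = BayesSearchCV(RandomForestClassifier(), rf_bayes_space,
                      scoring='accuracy', cv=strat_k_fold, n_iter=20)
rf_bs.fit(X, y)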
Example #23
cv_folds = [train_test_split(range(len(X)), train_size=0.666)]

model = BayesSearchCV(estimator=pipe,
                      search_spaces={
                          'model__latent_dim': (2, 20),
                          'model__intermediate_dim': (8, 128),
                          'model__epochs': (8, 128),
                          'model__D': (1e-3, 1e+3, 'log-uniform'),
                          'model__lr': (1e-4, 1e-2, 'log-uniform'),
                      },
                      n_iter=32,
                      cv=cv_folds,
                      refit=False,
                      error_score=-1.0)

# note: total_iterations is a property on BayesSearchCV, not a method
model.on_step = lambda x: print(
    (x, model.total_iterations, model.best_score_))
model.fit(X, Y)
model.refit = True
model._fit_best_model(X, Y)
print(model.best_params_)
print(model.score(X, Y))
"""

model = pipe
model.set_params(**{'model__D': 5.1964624423233898, 'model__lr': 0.00010138257365940301,
                    'model__epochs': 26, 'model__intermediate_dim': 125, 'model__latent_dim': 2})
model.fit(X, Y)

print(model.predict(X, Y))
"""
Example #24
# log-uniform: understand as search over p = exp(x) by varying x
opt = BayesSearchCV(
    SVC(),
    {
        'C': (1e-6, 1e+6, 'log-uniform'),
        'gamma': (1e-6, 1e+1, 'log-uniform'),
        'degree': (1, 8),  # integer valued parameter
        'kernel': ['linear', 'poly', 'rbf'],  # categorical parameter
    },
    n_iter=32,
    cv=3)

opt.fit(X_train, y_train)

print("val. score: %s" % opt.best_score_)
print("test score: %s" % opt.score(X_test, y_test))

#############################################################################
# Advanced example
# ================
#
# In practice, one wants to enumerate over multiple predictive model classes,
# with different search spaces and number of evaluations per class. An
# example of such search over parameters of Linear SVM, Kernel SVM, and
# decision trees is given below.

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from skopt.plots import plot_objective, plot_histogram

from sklearn.datasets import load_digits
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
import timeit


# Workaround: newer scikit-learn requires search classes to implement
# _run_search; older skopt releases don't, so stub it out here. It is never
# actually called.
class BayesSearchCV(BayesSearchCV):
    def _run_search(self, x):
        raise BaseException('Use newer skopt')
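
The composite search promised above (Linear SVM, Kernel SVM, decision trees) is not reproduced in this snippet; a hedged sketch following the skopt documentation, using a list of (search space, budget) pairs over a shared pipeline slot:

from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

pipe = Pipeline([('model', SVC())])

linsvc_search = {
    'model': [LinearSVC(max_iter=1000)],
    'model__C': (1e-6, 1e+6, 'log-uniform'),
}

svc_search = {
    'model': Categorical([SVC()]),
    'model__C': Real(1e-6, 1e+6, prior='log-uniform'),
    'model__gamma': Real(1e-6, 1e+1, prior='log-uniform'),
    'model__kernel': Categorical(['linear', 'poly', 'rbf']),
}

dtc_search = {
    'model': [DecisionTreeClassifier()],
    'model__max_depth': (1, 32),
}

# (search space, number of evaluations) pairs
composite_opt = BayesSearchCV(
    pipe,
    [(svc_search, 24), (linsvc_search, 8), (dtc_search, 8)],
    cv=3,
)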


X_train, y_train = mnist_reader.load_mnist('data/fashion', kind='train')
X_test, y_test = mnist_reader.load_mnist('data/fashion', kind='t10k')

X_train = X_train[:10000]
y_train = y_train[:10000]

start = timeit.default_timer()
opt = BayesSearchCV(
    SVC(),
    {
        'C': Real(0.001, 10, prior='log-uniform'),
        'gamma': Real(0.001, 1, prior='log-uniform'),
        'kernel': Categorical(['poly']),
    },
    n_iter=40,
)
opt.fit(X_train, y_train)
stop = timeit.default_timer()

print("Bayes on fashion data with kernel poly")
print(opt.score(X_test, y_test))
print(opt.best_params_)
print('Time: ', stop - start)
Example #26
# (reconstructed opening -- the original snippet begins mid-call; the
# estimator is assumed from the xgb_opt name and XGBoost-style parameters)
xgb_opt = BayesSearchCV(
    XGBClassifier(),
    {
        'max_depth': (3, 15),
        'gamma': (1e-6, 1e+1, 'log-uniform'),
        'learning_rate': (0.01, 0.4, 'log-uniform'),
        'min_child_weight': (1, 10),
        'subsample': (0.5, 1.0, 'log-uniform'),
        'colsample_bytree': (0.5, 1.0, 'log-uniform'),
        'n_estimators': (100, 1000)
    },
    n_iter=32,
    random_state=42,
    cv=3
)

xgb_opt.fit(X_train, Y_train)

xgb_opt.score(X_train, Y_train)

#Accuracy of the model on the validation set
y_pred = xgb_opt.predict(X_test)
from sklearn import metrics
print("Accuracy:", metrics.accuracy_score(Y_test, y_pred))

#Load the testSpike data
X_test = mat['testSpike']
X_test.shape

#Pre-processing of the testSpike data
l = 2
X = numpy.array([])
X = numpy.mean(X_test[:, 0: 1], axis = 1)[numpy.newaxis].T
for i in range(2, len(X_test[0])+1):
Example #27
    # (reconstructed opening -- the original snippet begins mid-call; `cb` and
    # `cb_params` are assumed to be a CatBoost estimator and its search space)
    opt = BayesSearchCV(
        cb,
        cb_params,
        n_iter=32,
        fit_params=dict(cat_features=cat_feats),
        # n_jobs=2
    )
    X = app_train.drop(['SK_ID_CURR', "TARGET"], axis=1)
    y = app_train["TARGET"]
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        random_state=0,
                                                        train_size=0.75)
    opt.fit(X_train, y_train)

    print("val. score: %s" % opt.best_score_)
    print("test score: %s" % opt.score(X_test, y_test))
    print(opt.cv_results_)

    with open("wwwww.txt", "w+") as f:
        f.write(opt.best_score_)
        f.write("\n")
        f.write(opt.score(X_test, y_test))
        f.write("\n")
        f.write(opt.cv_results_)

    # # print("start xgboost")
    # xg_params = dict(
    #     n_estimators=(100, 400),
    #     max_depth=(5, 20),
    #     colsample_bytree=(0.6, 1),
    #     reg_alpha=(0.01, 10),