コード例 #1
0
def test_deadline_stopper():
    """Check how DeadlineStopper limits the number of gp_minimize iterations.

    Two 10-call runs on ``bench3`` are made: one with a near-zero time
    budget, where a single recorded iteration is expected, and one with a
    generous budget, where all 10 iterations are expected to be recorded.
    """
    # Near-zero budget: one iteration is timed and it already exceeds the budget.
    short_stopper = DeadlineStopper(0.0001)
    gp_minimize(bench3, [(-1.0, 1.0)],
                n_calls=10,
                random_state=1,
                callback=short_stopper)
    assert len(short_stopper.iter_time) == 1
    assert np.sum(short_stopper.iter_time) > short_stopper.total_time

    # Generous budget: every call runs and the total stays below the budget.
    long_stopper = DeadlineStopper(60)
    gp_minimize(bench3, [(-1.0, 1.0)],
                n_calls=10,
                random_state=1,
                callback=long_stopper)
    assert len(long_stopper.iter_time) == 10
    assert np.sum(long_stopper.iter_time) < long_stopper.total_time
コード例 #2
0
def main():
    """Run a Gaussian-process search on the first hyperparameter subspace.

    The hyperparameter grid is handed to ``HyperSpace``, which folds and
    permutes it into subspaces. Only the first set of subspace boundaries is
    optimized, using ``gp_minimize`` with 50 calls and a ``DeadlineStopper``
    constructed with 18000.
    """
    hyperparameters = {
        'kernelSize1': np.arange(2, 10),
        'stride1': np.arange(1, 5),
        'dropout1': np.linspace(0.0, 0.8),
        'kernelSize2': np.arange(2, 10),
        'stride2': np.arange(1, 5),
        'dropout2': np.linspace(0.0, 0.8),
        'learningRate': np.linspace(0.001, 0.1)
    }

    hs = HyperSpace(hyperparameters)
    intervals = hs.fold_space()
    permuted = hs.hyper_permute(intervals)
    # Only the boundaries are needed here; the keys are discarded.
    _, boundaries = hs.format_hyperspace(permuted)
    first_subspace = boundaries[0]

    # Gaussian process minimization (see scikit-optimize skopt module for
    # other optimizers). The result is not used further in this function.
    gp_minimize(objective,
                first_subspace,
                n_calls=50,
                callback=DeadlineStopper(18000),
                random_state=0,
                verbose=True)
コード例 #3
0
ファイル: cifar_cnn_hyperdrive.py プロジェクト: yngtodd/mobil
def main():
    """Optimize one hyperparameter subspace per process and save the result.

    The process whose module-level ``rank`` is 0 builds the subspace
    boundaries with ``HyperSpace``; ``comm.scatter`` then hands one entry to
    every process. Each process runs ``gp_minimize`` on its entry and dumps
    the result to ``hyper_results/gp_subspace_<rank>``.
    """
    # Non-root processes contribute nothing to the scatter.
    subspace_boundaries = None

    if rank == 0:
        grid = {
            'kernelSize1': np.arange(2, 10),
            'stride1': np.arange(1, 5),
            'dropout1': np.linspace(0.0, 0.8),
            'kernelSize2': np.arange(2, 10),
            'stride2': np.arange(1, 5),
            'dropout2': np.linspace(0.0, 0.8),
            'learningRate': np.linspace(0.001, 0.1)
        }
        hs = HyperSpace(grid)
        permuted = hs.hyper_permute(hs.fold_space())
        _, subspace_boundaries = hs.format_hyperspace(permuted)

    space = comm.scatter(subspace_boundaries, root=0)

    stopper = DeadlineStopper(18000)
    # Gaussian process minimization (see scikit-optimize skopt module for
    # other optimizers)
    result = gp_minimize(objective,
                         space,
                         n_calls=50,
                         callback=stopper,
                         random_state=0,
                         verbose=True)

    # Each worker writes its own result file, keyed by rank.
    dump(result, 'hyper_results/gp_subspace_' + str(rank))
コード例 #4
0
ファイル: hyperdrive.py プロジェクト: yngtodd/mobil
def main():
    """Optimize one kernel/stride subspace per process and save the result.

    The process whose module-level ``rank`` is 0 builds the subspace
    boundaries with ``HyperSpace``; ``comm.scatter`` then hands one entry to
    every process. Each process runs a 20-call ``gp_minimize`` on its entry
    and dumps the result to ``hyper_results/gp_subspace_<rank>``.
    """
    # Non-root processes contribute nothing to the scatter.
    subspace_boundaries = None

    if rank == 0:
        grid = {
            'kernelSize1': np.arange(2, 12),
            'stride1': np.arange(1, 10),
            'kernelSize2': np.arange(2, 12),
            'stride2': np.arange(1, 10),
            'kernelSize3': np.arange(2, 12),
            'kernelSize4': np.arange(1, 12),
            'kernelSize5': np.arange(2, 12)
        }
        hs = HyperSpace(grid)
        permuted = hs.hyper_permute(hs.fold_space())
        _, subspace_boundaries = hs.format_hyperspace(permuted)

    space = comm.scatter(subspace_boundaries, root=0)

    stopper = DeadlineStopper(18000)
    # Gaussian process (see scikit-optimize skopt module for other optimizers)
    result = gp_minimize(objective,
                         space,
                         n_calls=20,
                         callback=stopper,
                         random_state=0,
                         verbose=True)

    # Each worker writes its own result file, keyed by rank.
    dump(result, 'hyper_results/gp_subspace_' + str(rank))
コード例 #5
0
 def bayes_search(estimator,
                  search_spaces,
                  X,
                  y,
                  fit_params=None,
                  scoring=None,
                  n_jobs=1,
                  cv=None,
                  n_points=1,
                  n_iter=50,
                  refit=False,
                  random_state=9527,
                  verbose=0,
                  deadline=60):
     """Run a GP-based BayesSearchCV through ``BatchTrainer.hyperopt_search``.

     A ``BayesSearchCV`` is configured from the given arguments (train scores
     are never returned and the base estimator is fixed to ``'GP'``). The
     search is then delegated to ``BatchTrainer.hyperopt_search`` with a
     ``VerboseCallback(verbose)`` and a ``DeadlineStopper(deadline)``.

     Returns whatever ``BatchTrainer.hyperopt_search`` returns.
     """
     search = BayesSearchCV(estimator,
                            search_spaces,
                            scoring=scoring,
                            cv=cv,
                            n_points=n_points,
                            n_iter=n_iter,
                            n_jobs=n_jobs,
                            return_train_score=False,
                            refit=refit,
                            optimizer_kwargs={'base_estimator': 'GP'},
                            random_state=random_state)

     search_callbacks = [VerboseCallback(verbose), DeadlineStopper(deadline)]

     return BatchTrainer.hyperopt_search(search,
                                         X,
                                         y,
                                         fit_params=fit_params,
                                         title='BayesSearchCV',
                                         callbacks=search_callbacks)
コード例 #6
0
def tune_with_bayes(X_train, y_train):
    """Tune a CatBoost classifier with BayesSearchCV and fit the tuned model.

    The search scores candidates with a threshold-based ROC-AUC scorer on a
    10-split ``TimeSeriesSplit`` and is run through ``report_performance``
    with a ``VerboseCallback(100)`` and a ``DeadlineStopper(60 * 10)``.
    The returned parameters get ``iterations`` overridden to 1000 before a
    fresh ``CatBoostClassifier`` is fitted on the full training data.

    Returns the fitted ``CatBoostClassifier``.
    """
    scorer = make_scorer(roc_auc_score,
                         greater_is_better=True,
                         needs_threshold=True)
    splitter = TimeSeriesSplit(n_splits=10)
    base_model = CatBoostClassifier(thread_count=2,
                                    od_type='Iter',
                                    verbose=False)

    search_spaces = {
        'iterations': Integer(10, 1000),
        'depth': Integer(1, 8),
        'learning_rate': Real(0.01, 1.0, 'log-uniform'),
        'random_strength': Real(1e-9, 10, 'log-uniform'),
        'bagging_temperature': Real(0.0, 1.0),
        'border_count': Integer(1, 255),
        'l2_leaf_reg': Integer(2, 30),
        'scale_pos_weight': Real(0.01, 1.0, 'uniform')
    }

    search = BayesSearchCV(base_model,
                           search_spaces,
                           scoring=scorer,
                           cv=splitter,
                           n_iter=100,
                           n_jobs=1,
                           return_train_score=False,
                           refit=True,
                           optimizer_kwargs={'base_estimator': 'GP'},
                           random_state=17)

    search_callbacks = [VerboseCallback(100), DeadlineStopper(60 * 10)]
    best_params = report_performance(search,
                                     X_train,
                                     y_train,
                                     'CatBoost',
                                     callbacks=search_callbacks)

    # The final model always uses 1000 iterations, whatever the search found.
    best_params['iterations'] = 1000

    # A GPU variant of this call would additionally pass task_type="GPU".
    tuned_model = CatBoostClassifier(**best_params,
                                     od_type='Iter',
                                     one_hot_max_size=10)
    tuned_model.fit(X_train, y_train)
    return tuned_model
コード例 #7
0
ファイル: xgboost_util.py プロジェクト: santi921/ML_CO2
def xgboost_grid(x, y):
    """Grid-search XGBoost regressor hyperparameters on 80% of ``(x, y)``.

    The data is split 80/20 with a fixed random state; only the training part
    is used by the search. The best parameters and best score are printed.

    Parameters:
        x: feature collection accepted by ``train_test_split`` (or convertible
            with ``list(x)`` when the first split attempt fails).
        y: target values aligned with ``x``.

    Returns:
        The fitted ``GridSearchCV`` instance.
    """
    split_options = {"test_size": 0.2, "random_state": 42}
    try:
        x_train, _, y_train, _ = train_test_split(x, y, **split_options)
    except Exception:
        # Retry with x materialized as a list. ``Exception`` (not a bare
        # ``except``) keeps KeyboardInterrupt/SystemExit from being swallowed.
        x_train, _, y_train, _ = train_test_split(list(x), y, **split_options)

    params = {
        "objective": ['reg:squarederror'],
        "colsample_bytree": [0.25, 0.5, 0.75],
        "learning_rate": [0.01, 0.1, 0.2, 0.3],
        "max_depth": [10, 20, 50],
        "gamma": [i * 0.05 for i in range(0, 5)],
        "lambda": [i * 0.05 for i in range(0, 4)],
        "alpha": [i * 0.05 for i in range(0, 4)],
        "eta": [i * 0.05 for i in range(0, 4)],
        "n_estimators": [400, 4000],
        "tree_method": ["gpu_hist"]
    }

    xgb_temp = xgb.XGBRegressor()
    reg = GridSearchCV(xgb_temp, params, verbose=5, cv=3)

    time_to_stop = 60 * 60
    ckpt_loc = "../data/train/bayes/ckpt_bayes_xgboost.pkl"
    checkpoint_callback = CheckpointSaver(ckpt_loc)
    # NOTE(review): GridSearchCV.fit forwards extra keyword arguments to the
    # estimator's fit(); these skopt callbacks look like they were meant for
    # BayesSearchCV.fit(callback=...). Confirm this call works as intended.
    reg.fit(x_train,
            y_train,
            callback=[DeadlineStopper(time_to_stop), checkpoint_callback])
    print(reg.best_params_)
    print(reg.best_score_)
    return reg
コード例 #8
0
ファイル: my_ds_methods_lib.py プロジェクト: LyubAlex/Kaggle
def get_params_SKopt(model, X, Y, space, cv_search, alg='catboost',
                     cat_features=None, eval_dataset=None, UBM=False,
                     opt_method='gbrt_minimize', verbose=True, multi=False,
                     scoring='neg_mean_squared_error', n_best=50,
                     total_time=7200):
    """Tune the parameters of ``model`` with a scikit-optimize minimizer.

    The objective sets the candidate parameters on ``model`` and returns the
    negated mean of ``cross_val_score``, so minimizing it maximizes the score.

    Parameters:
        model: estimator exposing ``set_params``; it is mutated in place.
        X, Y: data passed to ``cross_val_score``.
        space: list of named skopt dimensions; the names are the parameter
            names set on ``model``.
        cv_search: value passed as ``cv`` to ``cross_val_score``.
        alg: when ``'catboost'``, catboost-specific fit parameters
            (``eval_set``, ``use_best_model``, ``cat_features``,
            ``early_stopping_rounds=20``) are forwarded to ``fit``; otherwise
            no fit parameters are passed.
        cat_features: forwarded as the ``cat_features`` fit parameter.
        eval_dataset: forwarded as the ``eval_set`` fit parameter.
        UBM: forwarded as the ``use_best_model`` fit parameter.
        opt_method: one of ``'gbrt_minimize'``, ``'forest_minimize'`` or
            ``'gp_minimize'``.
        verbose: forwarded to the minimizer.
        multi: boolean, used when a multioutput algorithm is tuned; the
            part of each dimension name after the first ``'__'`` is then
            used as the key of the tuned parameter.
        scoring: forwarded to ``cross_val_score``.
        n_best: forwarded to ``RepeatedMinStopper``.
        total_time: forwarded to ``DeadlineStopper``.

    Returns:
        ``[tuned_params, result]`` where ``tuned_params`` maps parameter
        names to the best values found and ``result`` is the minimizer's
        return value.

    Raises:
        ValueError: if ``opt_method`` is not one of the supported names.
    """
    # Resolve the minimizer up front so an unsupported name fails with a
    # clear message instead of an unbound-variable error after the branches.
    if opt_method == 'gbrt_minimize':
        minimizer, acq_func = gbrt_minimize, 'EI'
    elif opt_method == 'forest_minimize':
        minimizer, acq_func = forest_minimize, 'EI'
    elif opt_method == 'gp_minimize':
        minimizer, acq_func = gp_minimize, 'gp_hedge'
    else:
        raise ValueError(
            "Unknown opt_method {!r}; expected 'gbrt_minimize', "
            "'forest_minimize' or 'gp_minimize'.".format(opt_method))

    if alg == 'catboost':
        fitparam = {'eval_set': eval_dataset,
                    'use_best_model': UBM,
                    'cat_features': cat_features,
                    'early_stopping_rounds': 20}
    else:
        fitparam = {}

    @use_named_args(space)
    def objective(**params):
        model.set_params(**params)
        # Negated because the skopt minimizers minimize the objective.
        return -np.mean(cross_val_score(model,
                                        X, Y,
                                        cv=cv_search,
                                        scoring=scoring,
                                        fit_params=fitparam))

    stoppers = [RepeatedMinStopper(n_best=n_best),
                DeadlineStopper(total_time=total_time)]
    if opt_method == 'gbrt_minimize':
        # Only the GBRT search additionally uses a DeltaYStopper, run first.
        stoppers.insert(0, DeltaYStopper(delta=0.01, n_best=5))

    reg_gp = minimizer(objective,
                       space,
                       n_jobs=-1,
                       verbose=verbose,
                       callback=stoppers,
                       n_calls=1000,
                       n_random_starts=20,
                       acq_func=acq_func,
                       random_state=RANDOM_STATE)

    tuned_params = {}
    for item, best_value in zip(space, reg_gp.x):
        key = item.name.split('__')[1] if multi else item.name
        tuned_params[key] = best_value

    return [tuned_params, reg_gp]
コード例 #9
0
ファイル: basetrainer.py プロジェクト: hado2020/ATOM
    def _check_parameters(self):
        """Check the validity of the input parameters.

        Fills in defaults for ``mapping`` and ``scaled``, converts the entries
        of ``self._models`` into model instances, prepares the metric,
        validates the per-model sequential parameters (``n_calls``,
        ``n_initial_points``, ``bagging``), unpacks ``bo_params`` into private
        attributes and callbacks, and distributes ``est_params`` over the
        models.

        Raises:
            ValueError: if an optional package can't be imported, a model
                doesn't match the goal, a sequential parameter has the wrong
                length or an out-of-range value, or a ``bo_params`` entry
                (``base_estimator``, ``max_time``, ``delta_x``, ``delta_y``,
                ``cv``, ``early_stopping``) is invalid.
        """
        if self.mapping is None:
            self.mapping = {str(v): v for v in sorted(self.y.unique())}

        if self.scaled is None:
            self.scaled = check_scaling(self.X)

        # Create model subclasses ================================== >>

        models = []
        for m in self._models:
            if isinstance(m, str):
                acronym = get_acronym(m, must_be_equal=False)

                # Check if packages for non-sklearn models are available
                if acronym in OPTIONAL_PACKAGES:
                    try:
                        importlib.import_module(OPTIONAL_PACKAGES[acronym])
                    except ImportError:
                        raise ValueError(
                            f"Unable to import the {OPTIONAL_PACKAGES[acronym]} "
                            "package. Make sure it is installed.")

                # Check for regression/classification-only models
                if self.goal.startswith("class") and acronym in ONLY_REG:
                    raise ValueError(
                        f"The {acronym} model can't perform classification tasks!"
                    )
                elif self.goal.startswith("reg") and acronym in ONLY_CLASS:
                    raise ValueError(
                        f"The {acronym} model can't perform regression tasks!")

                # Keep whatever the user appended after the acronym as part
                # of the model's name.
                models.append(MODEL_LIST[acronym](self,
                                                  acronym + m[len(acronym):]))

            elif not isinstance(m, BaseModel):  # Model is custom estimator
                models.append(CustomModel(self, estimator=m))

            else:  # Model is already a model subclass (can happen with reruns)
                models.append(m)

        self._models = CustomDict({m.name: m for m in models})

        # Check validity metric ==================================== >>

        if None in self._metric:
            self._metric = CustomDict(get_default_metric(self.task))

        # Ignore if it's the same metric as previous call
        elif not all(hasattr(m, "name") for m in self._metric):
            self._metric = self._prepare_metric(
                metric=self._metric,
                greater_is_better=self.greater_is_better,
                needs_proba=self.needs_proba,
                needs_threshold=self.needs_threshold,
            )

        # Check validity sequential parameters ===================== >>

        for param in ["n_calls", "n_initial_points", "bagging"]:
            p = lst(getattr(self, param))
            if len(p) != 1 and len(p) != len(self._models):
                raise ValueError(
                    f"Invalid value for the {param} parameter. Length "
                    "should be equal to the number of models, got len"
                    f"(models)={len(self._models)} and len({param})={len(p)}.")

            for i, model in enumerate(self._models):
                # The modulo lets a single value be reused for every model.
                if param in ("n_calls", "bagging") and p[i % len(p)] < 0:
                    raise ValueError(
                        f"Invalid value for the {param} parameter. "
                        f"Value should be >=0, got {p[i % len(p)]}.")
                elif param == "n_initial_points" and p[i % len(p)] <= 0:
                    raise ValueError(
                        f"Invalid value for the {param} parameter. "
                        f"Value should be >0, got {p[i % len(p)]}.")

                setattr(model, "_" + param, p[i % len(p)])

        # Prepare bo parameters ===================================== >>

        # Choose a base estimator (GP is chosen as default)
        self._base_estimator = self.bo_params.get("base_estimator", "GP")
        if isinstance(self._base_estimator, str):
            if self._base_estimator.lower() not in ("gp", "et", "rf", "gbrt"):
                raise ValueError(
                    f"Invalid value for the base_estimator parameter, got "
                    f"{self._base_estimator}. Value should be one of: 'GP', "
                    f"'ET', 'RF', 'GBRT'.")

        if self.bo_params.get("callbacks"):
            self._callbacks = lst(self.bo_params["callbacks"])

        if "max_time" in self.bo_params:
            if self.bo_params["max_time"] <= 0:
                raise ValueError(
                    "Invalid value for the max_time parameter. "
                    f"Value should be >0, got {self.bo_params['max_time']}.")
            self._callbacks.append(DeadlineStopper(self.bo_params["max_time"]))

        if "delta_x" in self.bo_params:
            if self.bo_params["delta_x"] < 0:
                raise ValueError(
                    "Invalid value for the delta_x parameter. "
                    f"Value should be >=0, got {self.bo_params['delta_x']}.")
            self._callbacks.append(DeltaXStopper(self.bo_params["delta_x"]))

        if "delta_y" in self.bo_params:
            if self.bo_params["delta_y"] < 0:
                raise ValueError(
                    "Invalid value for the delta_y parameter. "
                    f"Value should be >=0, got {self.bo_params['delta_y']}.")
            self._callbacks.append(
                DeltaYStopper(self.bo_params["delta_y"], n_best=5))

        if self.bo_params.get("plot"):
            self._callbacks.append(PlotCallback(self))

        if "cv" in self.bo_params:
            # Zero is rejected as well, so the message states a strict bound.
            if self.bo_params["cv"] <= 0:
                raise ValueError(
                    "Invalid value for the cv parameter. "
                    f"Value should be >0, got {self.bo_params['cv']}.")
            self._cv = self.bo_params["cv"]

        if "early_stopping" in self.bo_params:
            # Zero is rejected as well, so the message states a strict bound.
            if self.bo_params["early_stopping"] <= 0:
                raise ValueError(
                    "Invalid value for the early_stopping parameter. "
                    f"Value should be >0, got {self.bo_params['early_stopping']}."
                )
            self._early_stopping = self.bo_params["early_stopping"]

        # Add custom dimensions to every model subclass
        if self.bo_params.get("dimensions"):
            for name, model in self._models.items():
                # If not dict, the dimensions are for all models
                if not isinstance(self.bo_params["dimensions"], dict):
                    model._dimensions = self.bo_params["dimensions"]
                else:
                    # Dimensions for every specific model
                    for key, value in self.bo_params["dimensions"].items():
                        # Parameters for this model only
                        if key.lower() == name:
                            model._dimensions = value
                            break

        # Keys handled explicitly above; they must not reach the optimizer.
        kwargs = [
            "base_estimator",
            "max_time",
            "delta_x",
            "delta_y",
            "early_stopping",
            "cv",
            "callbacks",
            "dimensions",
            "plot",
        ]

        # The remaining bo_params are added as kwargs to the optimizer
        self._bo_kwargs = {
            k: v
            for k, v in self.bo_params.items() if k not in kwargs
        }

        # Prepare est_params ======================================= >>

        if self.est_params:
            for name, model in self._models.items():
                params = {}
                for key, value in self.est_params.items():
                    # Parameters for this model only
                    if key.lower() == name:
                        params.update(value)
                    # Parameters for all models
                    elif key.lower() not in self._models.keys():
                        params.update({key: value})

                for key, value in params.items():
                    # A "_fit" suffix marks a parameter for the fit method;
                    # the suffix is stripped before storing it.
                    if key.endswith("_fit"):
                        model._est_params_fit[key[:-4]] = value
                    else:
                        model._est_params[key] = value
コード例 #10
0
ファイル: xgboost_util.py プロジェクト: santi921/ML_CO2
def xgboost_bayes_basic(x, y, csv_loc="../data/train/bayes.csv"):
    """Tune an XGBoost regressor with BayesSearchCV and report test metrics.

    The data is split 80/20 with a fixed random state. The search runs on the
    training part with a ``DeadlineStopper`` of 47 hours' worth of seconds and
    a ``custom_skopt_scorer`` callback. MSE, MAE and R² of the refitted best
    model on the held-out part are printed.

    Parameters:
        x: feature collection accepted by ``train_test_split`` (or convertible
            with ``list(x)`` when the first split attempt fails).
        y: target values aligned with ``x``.
        csv_loc: stored in the module-level global ``csv_loc_bayes``
            (presumably read by ``custom_skopt_scorer`` - confirm).

    Returns:
        The fitted ``BayesSearchCV`` instance.
    """
    global csv_loc_bayes
    csv_loc_bayes = csv_loc

    try:
        x_train, x_test, y_train, y_test = train_test_split(x,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=42)
    except Exception:
        # Retry with x materialized as a list. ``Exception`` (not a bare
        # ``except``) keeps KeyboardInterrupt/SystemExit from being swallowed.
        # x stays rebound because it is passed to the scorer callback below.
        x = list(x)
        x_train, x_test, y_train, y_test = train_test_split(x,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=42)

    search_space = {
        "colsample_bytree": Real(0.5, 0.99),
        "max_depth": Integer(5, 25),
        "lambda": Real(0, 0.25),
        "learning_rate": Real(0.1, 0.25),
        "alpha": Real(0, 0.2),
        "eta": Real(0, 0.1),
        "gamma": Real(0, 0.1),
        "n_estimators": Integer(500, 5000),
        "objective": ["reg:squarederror"],
        "tree_method": ["gpu_hist"]
    }
    xgb_temp = xgb.XGBRegressor()
    reg = BayesSearchCV(xgb_temp, search_space, n_iter=10000, verbose=4, cv=3)

    time_to_stop = 60 * 60 * 47

    # Checkpointing through a timestamped CheckpointSaver callback was
    # disabled in the original code; only the deadline and the custom scorer
    # callbacks are active.
    custom_scorer = custom_skopt_scorer
    reg.fit(x_train,
            y_train,
            callback=[DeadlineStopper(time_to_stop),
                      custom_scorer(x, y)])

    # Predict once and pass (y_true, y_pred) in that order: MSE and MAE are
    # symmetric, but R² is not, so the order matters for the last metric.
    y_pred = reg.predict(x_test)
    print("MSE score:   " + str(mean_squared_error(y_test, y_pred)))
    print("MAE score:   " + str(mean_absolute_error(y_test, y_pred)))
    print("r2 score:   " + str(r2_score(y_test, y_pred)))
    return reg
コード例 #11
0
ファイル: hyperdrive.py プロジェクト: maxzvyagin/hyperspace
def hyperdrive(objective, hyperparameters, results_path, model="GP", n_iterations=50, verbose=False,
               checkpoints_path=None, deadline=None, sampler=None, n_samples=None, random_state=0):
    """
    Distributed optimization - one optimization per node.

    Each MPI rank optimizes the hyperspace at its own index and dumps its
    result to ``results_path`` under the name ``hyperspace<rank>`` (the rank
    is zero-padded to two digits).

    Parameters
    ----------
    * `objective` [function]:
        User defined function which calls a learner
        and returns a metric of interest.

    * `hyperparameters` [list, shape=(n_hyperparameters,)]:
        Passed to `create_hyperspace` (and `create_hyperbounds` when a
        sampler is used).

    * `results_path` [string]
        Path to save optimization results

    * `model` [string, default="GP"]
        Probabilistic learner used to model our objective function.
        Options:
        - "GP": Gaussian process
        - "RF": Random forest
        - "GBRT": Gradient boosted regression trees
        - "RAND": Random search

    * `n_iterations` [int, default=50]
        Number of optimization iterations

    * `verbose` [bool, default=False]
        Verbosity of optimization. Only applied on rank 0.

    * `checkpoints_path` [string, default=None]
        Path to previously saved results. Used to resume optimization;
        when given, a checkpoint callback is also registered.
        Cannot be combined with `sampler`.

    * `deadline` [int, optional]
        Deadline (seconds) for the optimization to finish within.

    * `sampler` [str, default=None]
        Random sampling scheme for optimizer's initial runs.
        Options:
        - "lhs": latin hypercube sampling

    * `n_samples` [int, default=None]
        Number of random samples to be drawn from the `sampler`.
        - Required if you would like to use `sampler`.
        - Must be <= the number of elements in the smallest hyperparameter bound's set.

    * `random_state` [int, default=0]
        Random state for reproducibility.

    Raises
    ------
    ValueError
        If both `checkpoints_path` and `sampler` are given, if `sampler` is
        given without `n_samples`, or if `model` is not a supported option.
    """
    # Number of initial points targeted before the model-based search starts.
    n_initial = 10

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    if checkpoints_path and sampler:
        raise ValueError('Cannot use both a restart from a previous run and ' \
                         'use latin hypercube sampling for initial search points!')

    # Setup savefile; zero-pad single-digit ranks so results sort by rank.
    filename = f'hyperspace{rank:02d}'
    savefile = os.path.join(results_path, filename)

    # Create hyperspaces, and either sampling bounds or checkpoints
    hyperspace = create_hyperspace(hyperparameters)
    space = hyperspace[rank]

    init_points = None
    init_response = None
    n_rand = n_initial

    # Latin hypercube sampling
    if sampler and not n_samples:
        raise ValueError(f'Sampler requires n_samples > 0. Got {n_samples}')
    elif sampler and n_samples:
        hyperbounds = create_hyperbounds(hyperparameters)
        bounds = hyperbounds[rank]
        # Get initial points in domain via latin hypercube sampling
        init_points = lhs_start(bounds, n_samples)
        # Clamp at zero: more supplied points than n_initial must not yield
        # a negative number of random starts.
        n_rand = max(0, n_initial - len(init_points))

    # Resuming from checkpoint
    if checkpoints_path:
        checkpoint = _load_checkpoint(checkpoints_path, rank)
        try:
            init_points = checkpoint.x_iters
            init_response = checkpoint.func_vals
            # A resumed run usually has more than n_initial points already.
            n_rand = max(0, n_initial - len(init_points))
        except AttributeError:
            # Missing saves won't have initial values.
            init_points = None
            init_response = None
            n_rand = n_initial

    callbacks = []
    if deadline:
        callbacks.append(DeadlineStopper(deadline))

    if checkpoints_path:
        callbacks.append(CheckpointSaver(checkpoints_path, filename))

    # Arguments shared by every minimizer.
    minimize_kwargs = {
        'n_calls': n_iterations,
        'callback': callbacks,
        'x0': init_points,
        'y0': init_response,
        'random_state': random_state,
    }

    if model == "GP":
        minimizer = gp_minimize
    elif model == "RF":
        minimizer = forest_minimize
    elif model == "GBRT":
        minimizer = gbrt_minimize
    elif model == "RAND":
        minimizer = dummy_minimize
    else:
        raise ValueError("Invalid model {}. Read the documentation for "
                         "supported models.".format(model))

    # Random search is the only option that is not given n_random_starts.
    if model != "RAND":
        minimize_kwargs['n_random_starts'] = n_rand

    # Verbose mode should only run on node 0.
    if verbose and rank == 0:
        minimize_kwargs['verbose'] = verbose

    result = minimizer(objective, space, **minimize_kwargs)

    # Each worker will independently write their results to disk
    dump(result, savefile)
コード例 #12
0
    def optimise_step(self,
                      df_train,
                      df_target,
                      npoints=1,
                      nrandom=1,
                      n_iter=50,
                      set_callbacks=True):
        """Evaluates the data.
        Build the pipeline. If no parameters are set, default configuration for
        each step is used
        Parameters
        ----------
        space : dict, default = None.
        df_train : pandas dataframe of shape = (n_train, n_features)
            The train dataset with numerical features.
        y_train : pandas series of shape = (n_train,)
            The numerical encoded target for classification tasks.
        max_evals : int, default = 20, max evaluation times
        set_callbacks (opt): bool,default: True
             If callable then callback(res) is called after each call to func. If list of callables, then each callable in the list is called.
        ----------
        Returns
        ---------
        result : dict
            - result['best_score'] : Best Score after Tuning
            - result['best_score_std'] : Standar Divation of best score
            - result['best_parmas'] : Best parameters
            - result['params'] : all paramsters (# = checked candicated)
            - result['time_cost(s)'] : total time of finding out the best parameters
            - result['all_cv_results'] : all cv results
            - result['mean_score_time'] : time for each cv result
        """
        # checke parallel strategy

        ce = Categorical_encoder()
        X = ce.fit_transform(df_train, df_target)

        if len(df_train.dtypes[df_train.dtypes == 'float'].index) != 0:
            scal = Scaler()
            X = scal.fit_transform(X, df_target)
            self.perform_scaling is True
        else:
            pass

        mid_result = {}
        tuning_result = {}
        if len(pd.DataFrame(X).columns) > 20:
            search_space_LGB = Classifier(
                strategy="LightGBM").get_search_spaces(
                    need_feature_selection=True)
            search_space_SVC = Classifier(strategy="SVC").get_search_spaces(
                need_feature_selection=True)
            search_spaces = [search_space_SVC, search_space_LGB]
        else:
            search_space_LGB = Classifier(
                strategy="LightGBM").get_search_spaces(
                    need_feature_selection=False)
            search_space_SVC = Classifier(strategy="SVC").get_search_spaces(
                need_feature_selection=False)
            search_spaces = [search_space_SVC, search_space_LGB]

        # Initialize a pipeline
        fs = None
        for i in range(len(search_spaces)):
            if isinstance(search_spaces, tuple):
                for p in search_spaces[i][0].keys():
                    if (p.startswith("fs__")):
                        fs = feature_selector()
                    else:
                        print(
                            ">> Number of Features < 20, ignore feature selection"
                        )
                        pass
            else:
                for p in search_spaces[i].keys():
                    if (p.startswith("fs__")):
                        fs = feature_selector()
                    else:
                        pass

        # Do we need to cache transformers?
        cache = False

        if (fs is not None):
            if ("fs__strategy" in search_spaces):
                if (search_spaces["fs__strategy"] != "variance"):
                    cache = True
                else:
                    pass
        else:
            pass
        mprint(f'Start turning Hyperparameters .... ')
        print("")
        print(">>> Categorical Features have encoded with :" +
              str({'strategy': ce.strategy}))
        print("")
        if self.perform_scaling is True:
            print(">>> Numerical Features have encoded with :" +
                  scal.__class__.__name__)
            print("")

        for baseestimator in self.baseEstimator:
            # Pipeline creation

            lgb = Classifier(strategy="LightGBM").get_estimator()
            #  rf = Classifier(strategy="RandomForest").get_estimator()
            #  svc = Classifier(strategy="SVC").get_estimator()

            if (fs is not None):
                if cache:
                    pipe = Pipeline([('fs', fs), ('model', lgb)],
                                    memory=self.to_path)
                else:
                    pipe = Pipeline([('fs', fs), ('model', lgb)])
            else:
                if cache:
                    pipe = Pipeline([('model', lgb)], memory=self.to_path)
                else:
                    pipe = Pipeline([('model', lgb)])

            if (self.parallel_strategy is True):
                opt = BayesSearchCV(pipe,
                                    search_spaces=search_spaces,
                                    scoring=self.scoring,
                                    cv=self.cv,
                                    npoints=npoints,
                                    n_jobs=-1,
                                    n_iter=n_iter,
                                    nrandom=nrandom,
                                    return_train_score=False,
                                    optimizer_kwargs={
                                        'base_estimator': baseestimator,
                                        "acq_func": "EI"
                                    },
                                    random_state=self.random_state,
                                    verbose=self.verbose,
                                    refit=self.refit)
            else:
                opt = BayesSearchCV(pipe,
                                    search_spaces=search_spaces,
                                    scoring=self.scoring,
                                    cv=self.cv,
                                    npoints=npoints,
                                    n_jobs=1,
                                    n_iter=n_iter,
                                    nrandom=nrandom,
                                    return_train_score=False,
                                    optimizer_kwargs={
                                        'base_estimator': baseestimator,
                                        "acq_func": "EI"
                                    },
                                    random_state=self.random_state,
                                    verbose=self.verbose,
                                    refit=self.refit)

            if not isinstance(baseestimator, GaussianProcessRegressor):
                if set_callbacks is True:
                    mid_result = self.report_perf(
                        opt,
                        X,
                        df_target,
                        ' with Surrogate Model:' + baseestimator,
                        callbacks=[
                            self.on_step,
                            DeadlineStopper(60 *
                                            60)  # ,DeltaYStopper(0.000001)
                        ])
                else:
                    mid_result = self.report_perf(
                        opt,
                        X,
                        df_target,
                        ' with Surrogate Model: ' + baseestimator,
                    )
                tuning_result[baseestimator] = mid_result

            else:
                if set_callbacks is True:
                    mid_result = self.report_perf(
                        opt,
                        X,
                        df_target,
                        ' with Surrogate Model:' +
                        baseestimator.__class__.__name__,
                        callbacks=[
                            self.on_step,
                            DeadlineStopper(60 *
                                            60)  # ,DeltaYStopper(0.000001)
                        ])
                else:
                    mid_result = self.report_perf(
                        opt,
                        X,
                        df_target,
                        ' with Surrogate Model: ' +
                        baseestimator.__class__.__name__,
                    )
                tuning_result[baseestimator.__class__.__name__] = mid_result

        bests = pd.DataFrame()
        for key in tuning_result.keys():
            if tuning_result[key]['best_score'] == max(
                    d['best_score'] for d in tuning_result.values()):
                bests = bests.append(
                    {
                        'best_score': tuning_result[key]['best_score'],
                        'best_SM': key,
                        'time': tuning_result[key]['Time_cost']
                    },
                    ignore_index=True)
                bests = bests.sort_values(
                    by=['time'], ascending=True).reset_index(drop=True)
                best_base_estimator = bests['best_SM'][0]
                best_param = tuning_result[best_base_estimator]['best_parmas']

        print("")
        print('######## Congratulations! Here is the Best Parameters: #######')
        print('Best Score is:',
              tuning_result[best_base_estimator]['best_score'])
        try:
            print('with Surrogate Model ' + best_base_estimator)
        except:
            print('with Surrogate Model ' +
                  best_base_estimator.__class__.__name__)
        pprint.pprint(best_param)

        self.best_param_ = best_param

        return best_param, tuning_result
コード例 #13
0
def main():
    """Search for the best ``lam`` for the Burgers network with ``gp_minimize``.

    Command-line arguments are read through ``argparser_raissi.Parser``.  The
    reference solution is loaded from ``burgers_shock.mat`` and every candidate
    ``lam`` is scored by training a fresh
    ``burgersraissilambda.PhysicsInformedNN`` and measuring how far its
    prediction is from that reference.  The search runs over ``[0.01, 1.1]``
    with a ``DeadlineStopper(args.time)`` callback, and the optimisation result
    is pickled to ``final_lambda.p`` under the key ``'lambda'``.
    """
    args = argparser_raissi.Parser().parse_args_verified()
    burgers_layers = [2, 20, 20, 20, 20, 20, 20, 20, 20, 1]

    N_u = args.N_u
    N_f = args.N_f
    N_u2 = args.N_u2
    N_f2 = args.N_f2
    m = args.m
    nsec = args.time

    # Reference solution, flattened to a column vector over the (x, t) grid.
    data = scipy.io.loadmat('burgers_shock.mat')
    t = data['t'].flatten()[:, None]
    x = data['x'].flatten()[:, None]
    X, T = np.meshgrid(x, t)
    u = np.real(data['usol']).T.flatten()[:, None]
    X_star = np.hstack((X.flatten()[:, None], T.flatten()[:, None]))
    ub = X_star.max()
    lb = X_star.min()

    # NOTE(review): `typen` is not assigned anywhere in this function (the
    # local assignment was commented out), so it must exist as a module-level
    # name or this call raises NameError -- confirm.
    inputs = interiorburgerslambda.prepare_nn_inputs_burgers(
        'burgers_shock.mat', N_u, N_f, N_u2, N_f2, m, typen, debugging=False)

    def objective(lam):
        """Train a model with weight ``lam`` and return its scaled L2 error."""
        model = burgersraissilambda.PhysicsInformedNN(
            lam, inputs.X_u_train, inputs.u_train, inputs.X_f_train,
            burgers_layers, lb, ub, inputs.nu, X_star, N_u, N_f, N_u2, N_f2, m,
            typen)
        model.train(args.epochs, args.data_loc, N_u, N_f, N_u2, N_f2, m,
                    typen, args.base_plot_dir)
        u_pred, _ = model.predict(inputs.X_star)
        # NOTE(review): 25000 is a hard-coded normaliser; presumably related
        # to the number of grid points -- confirm.
        return np.linalg.norm(u - u_pred, 2) / 25000

    print(type(t))
    result = skopt.gp_minimize(objective, [(.01, 1.1)],
                               callback=DeadlineStopper(nsec))
    print(m)
    print(N_u2)
    print(N_f2)
    print(type(result))
    print(result)

    # The skopt result keeps a reference to the objective in
    # specs['args']['func'].  A nested function cannot be pickled, so drop it
    # (this mirrors skopt.dump(..., store_objective=False)); otherwise the dump
    # below would fail after the whole optimisation has already run.
    spec_args = (getattr(result, 'specs', None) or {}).get('args')
    if isinstance(spec_args, dict):
        spec_args.pop('func', None)

    with open('final_lambda.p', 'wb') as fp:
        pickle.dump({'lambda': result}, fp, protocol=2)
コード例 #14
0
def hyperdrive(objective,
               hyperparameters,
               results_path,
               model="GP",
               n_iterations=50,
               verbose=False,
               deadline=None,
               sampler=None,
               n_samples=None,
               random_state=0):
    """
    Distributed optimization - one optimization per MPI rank.

    Rank 0 builds the search spaces and scatters one to every rank; each rank
    then runs its own optimizer and writes its result to
    ``results_path + '/hyperspace' + str(rank)``.

    Parameters
    ----------
    * `objective` [function]:
        User defined function which calls a learner
        and returns a metric of interest.

    * `hyperparameters` [list, shape=(n_hyperparameters,)]:
        Passed unchanged to `create_hyperspace` (and to `create_hyperbounds`
        when a `sampler` is used).

    * `results_path` [string]
        Path to save optimization results

    * `model` [string, default="GP"]
        Probabilistic learner used to model our objective function.
        Options:
        - "GP": Gaussian process
        - "RF": Random forest
        - "GBRT": Gradient boosted regression trees
          (the historical spelling "GRBRT" is still accepted)
        - "RAND": Random search

    * `n_iterations` [int, default=50]
        Number of optimization iterations

    * `verbose` [bool, default=False]
        Verbosity of optimization. Only honoured on rank 0.

    * `deadline` [int, optional]
        Deadline (seconds) for the optimization to finish within.

    * `sampler` [str, default=None]
        Random sampling scheme for optimizer's initial runs.
        Options:
        - "lhs": latin hypercube sampling

    * `n_samples` [int, default=None]
        Number of random samples to be drawn from the `sampler`.
        - Required if you would like to use `sampler`.
        - Must be <= the number of elements in the smallest hyperparameter bound's set.

    * `random_state` [int, default=0]
        Random state for reproducibility.

    Raises
    ------
    ValueError
        On rank 0 when `sampler` is given without `n_samples`, and on every
        rank when `model` is not one of the supported names.
    """
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    # Only the root rank builds the objects to scatter; the others pass None.
    hyperspace = None
    hyperbounds = None
    if rank == 0:
        hyperspace = create_hyperspace(hyperparameters)
        if sampler:
            if not n_samples:
                raise ValueError(
                    'Sampler requires n_samples > 0. Got {}'.format(n_samples))
            hyperbounds = create_hyperbounds(hyperparameters)

    space = comm.scatter(hyperspace, root=0)
    if sampler:
        bounds = comm.scatter(hyperbounds, root=0)
        # Get initial points in the obj. function domain via latin hypercube sampling
        init_points = lhs_start(bounds, n_samples)
        # The sampled points replace up to 10 random starts; never go negative
        # when the sampler returns more than 10 points.
        n_rand = max(0, 10 - len(init_points))
    else:
        init_points = None
        n_rand = 10

    callback = DeadlineStopper(deadline) if deadline else deadline

    # Pick the optimizer.  "GRBRT" was the spelling this function originally
    # tested for, although the documentation advertises "GBRT"; accept both.
    takes_random_starts = True
    if model == "GP":
        minimize = gp_minimize
    elif model == "RF":
        minimize = forest_minimize
    elif model in ("GBRT", "GRBRT"):
        minimize = gbrt_minimize
    elif model == "RAND":
        minimize = dummy_minimize
        # Random search has no surrogate model and therefore no
        # `n_random_starts` parameter; passing it raises TypeError.
        takes_random_starts = False
    else:
        raise ValueError("Invalid model {}. Read the documentation for "
                         "supported models.".format(model))

    search_kwargs = {
        'n_calls': n_iterations,
        'callback': callback,
        'x0': init_points,
        'random_state': random_state,
    }
    if takes_random_starts:
        search_kwargs['n_random_starts'] = n_rand
    # Verbose mode should only run on node 0.
    if verbose and rank == 0:
        search_kwargs['verbose'] = verbose

    result = minimize(objective, space, **search_kwargs)

    # Each worker will independently write their results to disk
    dump(result, results_path + '/hyperspace' + str(rank))
コード例 #15
0
#                 'boosting_type':['Ordered'],
#                 'learning_rate': Real(0.05, 1.0, 'uniform'),
#                 'border_count': Integer(1, 25),
#                 'fold_len_multiplier': Real(1.1, 1.16, prior='uniform')}

# Configure the Bayesian hyper-parameter search: `clf` is tuned over
# `search_spaces`, scored with `roc_auc` on the `skf` cross-validation splits.
opt = BayesSearchCV(
    clf,
    search_spaces,
    scoring=roc_auc,
    cv=skf,
    n_iter=5,
    n_points=100,
    n_jobs=
    1,  # use just 1 job with CatBoost in order to avoid segmentation fault
    return_train_score=False,
    refit=True,
    optimizer_kwargs={'base_estimator': 'ET'},  # alternatives noted by the author: 'GP', 'RF', 'ET'
    random_state=57)

# Run the optimization through `report_perf` (defined outside this excerpt),
# labelled 'CatBoost'.  Two callbacks are attached: VerboseCallback(20) and
# DeadlineStopper(60 * 30) -- presumably a 30-minute budget expressed in
# seconds; confirm against the scikit-optimize docs.
best_params = report_perf(
    opt,
    X,
    y,
    'CatBoost',
    callbacks=[VerboseCallback(20),
               DeadlineStopper(60 * 30)])

# Report the time elapsed since `notebookstart`, converted to minutes.
print("Notebook Runtime: %0.2f Minutes" % ((time.time() - notebookstart) / 60))
コード例 #16
0
                    cv=skf,
                    n_iter=10,
                    n_jobs=1,  # use just 1 job with CatBoost in order to avoid segmentation fault
                    return_train_score=False,
                    refit=True,
                    optimizer_kwargs={'base_estimator': 'GP'},
                    random_state=42)

# COMMAND ----------

# Run the Bayesian search through `report_perf` (defined outside this excerpt),
# labelled 'CatBoost', with VerboseCallback(100) and DeadlineStopper(60*10)
# attached -- presumably a 10-minute budget in seconds; confirm.
best_params = report_perf(optimizer = opt, 
                          X = X, 
                          y = y, 
                          title = 'CatBoost', 
                          callbacks=[VerboseCallback(100), DeadlineStopper(60*10)])


# COMMAND ----------

# Convert ordered dictionary to dictionary by round-tripping it through JSON.
import json
best_params_bayesian = json.loads(json.dumps(best_params))


# COMMAND ----------

# Manually specified parameters
best_params={'bagging_temperature': 0.41010395885331385,
 'border_count': 186,
 'depth': 8,
コード例 #17
0
    print((title + " took %.2f seconds,  candidates checked: %d, best CV score: %.3f "
           + u"\u00B1" + " %.3f") % (time() - start,
                                     len(optimizer.cv_results_['params']),
                                     best_score,
                                     best_score_std))
    print('Best parameters:')
    pprint.pprint(best_params)
    print()
    return best_params


# Converting average precision score into a scorer suitable for model selection
# (higher is better, computed from predicted probabilities).
avg_prec = make_scorer(average_precision_score, greater_is_better=True, needs_proba=True)
# Setting a 5-fold stratified cross-validation (note: shuffle=True)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
# Bayesian search for `clf` over `search_spaces`: 40 iterations, all cores
# (n_jobs=-1), 'GP' as the optimizer's base estimator, refit at the end.
opt = BayesSearchCV(clf,
                    search_spaces,
                    scoring=avg_prec,
                    cv=skf,
                    n_iter=40,
                    n_jobs=-1,
                    refit=True,
                    optimizer_kwargs={'base_estimator': 'GP'},
                    random_state=22,
                    return_train_score=False,
                    )

# Run the search labelled 'LightGBM' with two stopping callbacks:
# DeltaXStopper(0.001) and DeadlineStopper(60 * 5) -- presumably a 5-minute
# budget in seconds; confirm against the scikit-optimize docs.
best_params = report_perf(opt, X, y, 'LightGBM',
                          callbacks=[DeltaXStopper(0.001),
                                     DeadlineStopper(60 * 5)])
コード例 #18
0
# Bayesian search for `clf_lgb` over `bayes_space`: 100 iterations scored by
# ROC AUC on the `rskf` splits, using all cores and 'GP' as base estimator.
bayes_search = BayesSearchCV(estimator=clf_lgb,
                             search_spaces=bayes_space,
                             n_iter=100,
                             cv=rskf,
                             scoring='roc_auc',
                             optimizer_kwargs={'base_estimator': 'GP'},
                             verbose=-1,
                             n_jobs=-1,
                             random_state=1337)

# Fit with two stopping callbacks and time the whole search.
start_time = time.time()
bayes_search = bayes_search.fit(
    X_train,
    y_train,
    callback=[DeltaXStopper(0.0001),
              DeadlineStopper(60 * 60)])
# The elapsed time is rendered as HH:MM:SS, so label it as such rather than
# as "minutes".
print('Training time (HH:MM:SS): {}'.format(
    time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time))))
bayes_search.best_params_, bayes_search.best_score_

# Evaluate the refitted best estimator on the held-out test set.
clf_lgb_bayes = bayes_search.best_estimator_

y_pred = clf_lgb_bayes.predict(X_test)
print(classification_report(y_test, y_pred))

# Second column of predict_proba is used as the score for ROC AUC.
y_pred = clf_lgb_bayes.predict_proba(X_test)[:, 1]
print('LGB_BAYES AUC_ROC: %.3f' % roc_auc_score(y_test, y_pred))

# not the best the params======================================================
#[grid_search, random_search, bayes_search]
コード例 #19
0
    random_state=42)

# COMMAND ----------

opt

# COMMAND ----------

# Run the Bayesian search through `report_perf` (defined outside this excerpt),
# labelled 'CatBoost', with VerboseCallback(100) and DeadlineStopper(60 * 10)
# attached -- presumably a 10-minute budget in seconds; confirm.
best_params = report_perf(
    optimizer=opt,
    X=X,
    y=y,
    title='CatBoost',
    callbacks=[VerboseCallback(100),
               DeadlineStopper(60 * 10)])

# COMMAND ----------

best_params.values()

# COMMAND ----------

best_params = {
    'bagging_temperature': 0.41010395885331385,
    'border_count': 186,
    'depth': 8,
    'iterations': 323,
    'l2_leaf_reg': 21,
    'learning_rate': 0.0673344419215237,
    'random_strength': 3.230824361824754e-06,
    X_test = scaler.transform(X_test)[:100]
    y_train = y_train[:400]
    y_test = y_test[:100]
    
    joblib.dump(scaler, "dataset/%s.scalar"%name) 
    print("load data success!")

    ### test baseline model
    test_baseline()

    allmodels = ["MLP","GBT","RF","SVR"]
    for mname in allmodels:
        ### bayesian optimization selected model
        opt = get_model(mname,30)
        print("training start")
        callbacks = [DeadlineStopper(60*60*3),report_callback,VerboseCallback(50)]
        res = opt.fit(X_train, y_train,callback=callbacks)
        print("best params : %s" % opt.best_params_)
        print("best val. score: %s" % opt.best_score_)
        print("test r2: %s" % r2_score(y_test,opt.predict(X_test)))
        result = pd.DataFrame(opt.cv_results_)

        result.to_csv("result/{name}+{mtype}.csv".format(name=name,mtype=mname))
        best_model = opt.best_estimator_
        
        y_pred = best_model.predict(X_test)
        score = scoring(y_test,y_pred)
        print("test score: ",score)
        metric = "_".join([str(v) for v in score.values()])
        joblib.dump(best_model,"result/{}_{}_best_{}.model".format(name,mname,metric))