Example #1
    def hyperopt_cv(self, X, y, params, fn=None, algo=None, max_evals=10, timeout=None,
                    fmin_params=None, fn_params=None, p_last=True):
        """Hyperparameter optimization using hyperopt. Using cross-validation to evaluate hyperparameters by default.

        Args:
            X (:obj:`pd.DataFrame`, :obj:`pd.Series`): Training data.
            y (:obj:`pd.DataFrame`, :obj:`pd.Series`): Training target values.
            params (dict): Dictionary of hyperparameters passed to hyperopt.
            fn (:obj:`callable`, optional): Objective function to optimize with hyperopt.
            algo (:obj:`callable`, optional): Algorithm for hyperopt. Available choices are: hyperopt.tpe.suggest and
             hyperopt.random.suggest. Using hyperopt.tpe.suggest by default.
            max_evals (:obj:`int`, optional): Number of function evaluations before returning.
            timeout (:obj:`None`, :obj:`int`, optional): Limits search time by parametrized number of seconds.
            If None, then the search process has no time constraint. None by default.
            fmin_params (:obj:`dict`, optional): Dictionary of supplementary arguments for hyperopt.fmin function.
            fn_params (:obj:`dict`, optional):  Dictionary of supplementary arguments for custom fn objective function.
            p_last (:obj:`str`, optional): If model object is a sklearn.Pipeline then apply fit parameters to the last
             step. True by default.

        Returns:
            dict: Dictionary of best choice of hyperparameters. Also best model is fitted.
        """
        if self.backend == 'h2o':
            raise Exception('hyperopt_cv is not supported by `h2o` backend. Use `optimize_hyperparam`')

        trials = Trials()
        algo = tpe.suggest if algo is None else algo
        if isinstance(self.model, Pipeline) and ((fn_params is not None) and ('fit_params' in fn_params)) and p_last:
            fn_params['fit_params'] = {f'{self.model.steps[-1][0]}__{key}': fn_params['fit_params'].get(key)
                                       for key in fn_params['fit_params'].keys()}
        if fn is None:
            scoring = (None if not (isinstance(fn_params, dict) and ('scoring' in fn_params.keys()))
                       else fn_params.pop('scoring'))
            scoring = make_scorer(mean_squared_error) if scoring is None else scoring
            try:
                check_scoring(self, scoring)
            except ValueError:
                scoring = make_scorer(scoring)
            fn = functools.partial(self._hyperopt_obj_cv, X=X, y=y, scoring=scoring,
                                   **(fn_params if fn_params is not None else {}))
        best = fmin(fn=fn, space=params, trials=trials, algo=algo, max_evals=max_evals, timeout=timeout,
                    **(fmin_params if fmin_params is not None else {}))
        best_params = space_eval(params, best)
        best_params = {key: best_params[key] if not (isinstance(best_params[key], float) and
                                                     best_params[key].is_integer()) else int(best_params[key])
                       for key in best_params.keys()}
        self.best_params, self.trials = best_params, trials
        self.model = self.object(**self.best_params)
        self.model.fit(X, y, **({} if not ((fn_params is not None) and ('fit_params' in fn_params))
                                else fn_params['fit_params']))
        if not hasattr(self.model, 'feature_name_'):
            self.model.feature_name_ = X.columns.tolist() if isinstance(X, DataFrame) else [X.name]
        self._update_meta()
        return self.best_params
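
A minimal usage sketch for hyperopt_cv above (illustrative only: `model` stands for an instance of the surrounding wrapper class and `X_train`/`y_train` for prepared training data; none of these names appear in the original snippet):

from hyperopt import hp, tpe
from sklearn.metrics import make_scorer, mean_absolute_error

space = {
    'n_estimators': hp.quniform('n_estimators', 50, 500, 50),   # integer-valued floats are cast to int afterwards
    'learning_rate': hp.loguniform('learning_rate', -5, 0),
}

best_params = model.hyperopt_cv(
    X_train, y_train,
    params=space,
    algo=tpe.suggest,
    max_evals=20,
    timeout=600,
    fn_params={'scoring': make_scorer(mean_absolute_error)},
)
print(best_params)   # best hyperparameters; model.model has been refitted with them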
Example #2
def test_scoring_is_not_metric():
    with pytest.raises(ValueError, match="make_scorer"):
        check_scoring(LogisticRegression(), scoring=f1_score)
    with pytest.raises(ValueError, match="make_scorer"):
        check_scoring(LogisticRegression(), scoring=roc_auc_score)
    with pytest.raises(ValueError, match="make_scorer"):
        check_scoring(Ridge(), scoring=r2_score)
    with pytest.raises(ValueError, match="make_scorer"):
        check_scoring(KMeans(), scoring=cluster_module.adjusted_rand_score)
    with pytest.raises(ValueError, match="make_scorer"):
        check_scoring(KMeans(), scoring=cluster_module.rand_score)
def test_random_seeds_warm_start(GradientBoosting, X, y, rng_type):
    # Make sure the seeds for train/val split and small trainset subsampling
    # are correctly set in a warm start context.
    def _get_rng(rng_type):
        # Helper to avoid consuming rngs
        if rng_type == 'none':
            return None
        elif rng_type == 'int':
            return 42
        else:
            return np.random.RandomState(0)

    random_state = _get_rng(rng_type)
    gb_1 = GradientBoosting(early_stopping=True,
                            max_iter=2,
                            random_state=random_state)
    gb_1.set_params(scoring=check_scoring(gb_1))
    gb_1.fit(X, y)
    random_seed_1_1 = gb_1._random_seed

    gb_1.fit(X, y)
    random_seed_1_2 = gb_1._random_seed  # clear the old state, different seed

    random_state = _get_rng(rng_type)
    gb_2 = GradientBoosting(early_stopping=True,
                            max_iter=2,
                            random_state=random_state,
                            warm_start=True)
    gb_2.set_params(scoring=check_scoring(gb_2))
    gb_2.fit(X, y)  # inits state
    random_seed_2_1 = gb_2._random_seed
    gb_2.fit(X, y)  # clears old state and equals est
    random_seed_2_2 = gb_2._random_seed

    # Without warm starting, the seeds should be
    # * all different if random state is None
    # * all equal if random state is an integer
    # * different when refitting and equal with a new estimator (because
    #   the random state is mutated)
    if rng_type == 'none':
        assert random_seed_1_1 != random_seed_1_2 != random_seed_2_1
    elif rng_type == 'int':
        assert random_seed_1_1 == random_seed_1_2 == random_seed_2_1
    else:
        assert random_seed_1_1 == random_seed_2_1 != random_seed_1_2

    # With warm starting, the seeds must be equal
    assert random_seed_2_1 == random_seed_2_2
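
The first test above checks a documented scikit-learn behaviour: check_scoring rejects plain metric functions and points the user at make_scorer. A short sketch of the accepted alternatives (standard scikit-learn usage, not taken from the test module):

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import check_scoring, f1_score, make_scorer

scorer = check_scoring(LogisticRegression(), scoring=make_scorer(f1_score))   # metric wrapped into a scorer
scorer = check_scoring(LogisticRegression(), scoring="f1")                    # predefined scorer name
# check_scoring(LogisticRegression(), scoring=f1_score)   # raises ValueError, as asserted above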
Example #4
def test_parallel_fit():
    """The goal of this test is to check that results of _parallel_fit is the
    same for different controlled param_grid
    """

    X, y = make_regression(n_samples=100, n_features=20,
                           n_informative=5, noise=0.2, random_state=42)
    train = range(80)
    test = range(80, len(y))
    outputs = []
    estimator = svr
    svr_params = [[1e-1, 1e0, 1e1], [1e-1, 1e0, 5e0, 1e1]]
    scorer = check_scoring(estimator, 'r2')  #  define a scorer
    # Define a screening selector
    selector = check_feature_screening(screening_percentile=None,
                                       mask_img=None, is_classification=False)
    for params in svr_params:
        param_grid = {}
        param_grid['C'] = np.array(params)
        outputs.append(list(_parallel_fit(estimator=estimator, X=X, y=y,
                                          train=train, test=test,
                                          param_grid=param_grid,
                                          is_classification=False,
                                          scorer=scorer, mask_img=None,
                                          class_index=1,
                                          selector=selector,
                                          clustering_percentile=100)))
    # check that every element of the output tuple is the same for both tries
    for a, b in zip(outputs[0], outputs[1]):
        if isinstance(a, np.ndarray):
            np.testing.assert_array_almost_equal(a, b)
        else:
            assert a == b
Example #5
def cross_val_score(estimator,
                    X,
                    y=None,
                    groups=None,
                    scoring=None,
                    cv=None,
                    n_jobs=None,
                    verbose=0,
                    fit_params=None,
                    pre_dispatch='2*n_jobs',
                    error_score=np.nan):

    # To ensure multimetric format is not supported
    scorer = check_scoring(estimator, scoring=scoring)

    cv_results = cross_validate(estimator=estimator,
                                X=X,
                                y=y,
                                groups=groups,
                                scoring={'score': scorer},
                                cv=cv,
                                n_jobs=n_jobs,
                                verbose=verbose,
                                fit_params=fit_params,
                                pre_dispatch=pre_dispatch,
                                error_score=error_score)

    return cv_results
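
A usage sketch for the wrapper above (toy data only). Unlike sklearn.model_selection.cross_val_score, this variant returns the full cross_validate dict, with the per-fold scores under the single 'score' entry:

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge

X, y = make_regression(n_samples=200, n_features=10, random_state=0)
cv_results = cross_val_score(Ridge(), X, y, scoring="r2", cv=5)
print(cv_results["test_score"])            # per-fold R2 scores
print(np.mean(cv_results["test_score"]))   # their mean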
Example #6
 def _estimate_performances(self, X, y):
     performances = np.zeros(self.n_classifiers_)
     for idx, clf in enumerate(self.pool_classifiers_):
         scorer = check_scoring(clf, self.scoring)
         performances[idx] = scorer(clf,
                                    X[:, self.estimator_features_[idx]], y)
     return performances
Example #7
        def hyperopt_train_test(params, X_train, y_train):
            warnings.filterwarnings("ignore")

            trainable = create_instance_from_hyperopt_search_space(self.estimator, params)
            try:
                cv_score, logloss, execution_time = cross_val_score_track_trials(
                    trainable, X_train, y_train, cv=self.cv,
                    scoring=self.scoring, args_to_scorer=self.args_to_scorer)
                logger.debug("Successful trial of hyperopt with hyperparameters:{}".format(params))
            except BaseException as e:
                # If there is any error in cross validation, use the score based on
                # a random train-test split as the evaluation criterion
                if self.handle_cv_failure:
                    X_train_part, X_validation, y_train_part, y_validation = train_test_split(
                        X_train, y_train, test_size=0.20)
                    start = time.time()
                    trained = trainable.fit(X_train_part, y_train_part)
                    scorer = check_scoring(trainable, scoring=self.scoring)
                    cv_score  = scorer(trained, X_validation, y_validation, **self.args_to_scorer)
                    execution_time = time.time() - start
                    y_pred_proba = trained.predict_proba(X_validation)
                    try:
                        logloss = log_loss(y_true=y_validation, y_pred=y_pred_proba)
                    except BaseException:
                        logloss = 0
                        logger.debug("Warning, log loss cannot be computed")
                else:
                    logger.debug(e)
                    logger.debug("Error {} with pipeline:{}".format(e, trainable.to_json()))
                    raise e
            return cv_score, logloss, execution_time
Example #8
def permutations(estimator,
                 X,
                 y,
                 cv=None,
                 n_permutations=100,
                 random_state=0,
                 scoring=None):
    """
    This follows the sklearn API sklearn.inspection.permutation_test_score
    I have modified accordinlgy to accomodate filtering of features using correlation matrix
    before running cross-validation using the model
    """

    Xs, ys = indexable(X, y)
    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    scorer = check_scoring(estimator, scoring=scoring)
    random_state = check_random_state(random_state)

    # corr = CorrMatrix()
    # corr.fit(X,y)
    # Xs, ys = corr.transform()
    score = _permutations(clone(estimator), Xs, ys, cv, scorer)
    permutation_scores = np.zeros(n_permutations)
    for i in range(n_permutations):
        # corr_p = CorrMatrix()
        # corr_p.fit(X, y)
        # Xp, yp = corr_p.transform()
        yp = _safe_indexing(y, random_state.permutation(len(y)))
        permutation_scores[i] = _permutations(clone(estimator), Xs, yp, cv,
                                              scorer)

    pvalue = (np.sum(permutation_scores >= score) + 1.0) / (n_permutations + 1)

    return score, permutation_scores, pvalue
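
A usage sketch for permutations above (illustrative data; it assumes the private _permutations helper used internally is importable from the same module):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
score, permutation_scores, pvalue = permutations(
    LogisticRegression(max_iter=1000), X, y,
    cv=5, n_permutations=100, random_state=0, scoring="accuracy")
print(score, pvalue)   # a small p-value suggests the score is unlikely under permuted labels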
Example #9
    def score(self, X, y, scoring):
        """
        score

        Parameters
        ----------
        X: np.ndarray
            data.
        y: np.ndarray
            true y.
        scoring: str
            scoring method,default is "r2"

        """
        scoring = check_scoring(self, scoring=scoring)

        if not isinstance(scoring, (list, tuple)):
            scoring = [
                scoring,
            ]
        try:
            sc_all = []
            for si in scoring:
                sc = si(self, X, y)
                sc_all.append(sc)

        # except (ValueError, RuntimeWarning):
        except (RuntimeWarning):

            sc_all = None

        return sc_all
Example #10
File: smac.py Project: shinnar/lale
 def smac_train_test(trainable, X_train, y_train):
     try:
         cv_score, logloss, execution_time = cross_val_score_track_trials(
             trainable,
             X_train,
             y_train,
             cv=self.cv,
             scoring=self.scoring)
         logger.debug("Successful trial of SMAC")
     except BaseException as e:
         # If there is any error in cross validation, use the score based on a random train-test split as the evaluation criterion
         if self.handle_cv_failure:
             (
                 X_train_part,
                 X_validation,
                 y_train_part,
                 y_validation,
             ) = train_test_split(X_train, y_train, test_size=0.20)
             start = time.time()
             trained = trainable.fit(X_train_part, y_train_part)
             scorer = check_scoring(trainable, scoring=self.scoring)
             cv_score = scorer(trained, X_validation, y_validation)
             execution_time = time.time() - start
             y_pred_proba = trained.predict_proba(X_validation)
             try:
                 logloss = log_loss(y_true=y_validation,
                                    y_pred=y_pred_proba)
             except BaseException:
                 logloss = 0
                 logger.debug("Warning, log loss cannot be computed")
         else:
             logger.debug("Error {} with pipeline:{}".format(
                 e, trainable.to_json()))
             raise e
     return cv_score, logloss, execution_time
Example #11
        def hyperopt_train_test(params, X_train, y_train):
            warnings.filterwarnings("ignore")

            reg = create_instance_from_hyperopt_search_space(
                self.estimator, params)
            try:
                cv_score, _, execution_time = cross_val_score_track_trials(
                    reg,
                    X_train,
                    y_train,
                    cv=KFold(self.cv),
                    scoring=self.scoring)
                logger.debug("Successful trial of hyperopt")
            except BaseException as e:
                # If there is any error in cross validation, use the accuracy based on
                # a random train-test split as the evaluation criterion
                if self.handle_cv_failure:
                    X_train_part, X_validation, y_train_part, y_validation = train_test_split(
                        X_train, y_train, test_size=0.20)
                    start = time.time()
                    reg_trained = reg.fit(X_train_part, y_train_part)
                    scorer = check_scoring(reg, scoring=self.scoring)
                    cv_score = scorer(reg_trained, X_validation, y_validation)
                    execution_time = time.time() - start
                else:
                    logger.debug(e)
                    logger.debug("Error {} with pipeline:{}".format(
                        e, reg.to_json()))
                    raise e

            return cv_score, execution_time
Example #12
 def score(self, X, Y, *args, **kwargs):
     scores = []
     for estimator, target in zip(self.estimators_, self.targets_):
         scorer = check_scoring(estimator, self.scoring)
         score = scorer(estimator, X, Y[target], *args, **kwargs)
         scores.append(score)
     return np.average(scores, weights=self.weights)
Example #13
    def __init__(self, trained_model, validation_df, features, target,
                 scoring, n_jobs=None):
        self.trained_model = trained_model
        self.df = validation_df.copy()
        self.features = features
        self.target = target
        self.n_jobs = n_jobs
        self.scorer = check_scoring(estimator=self.trained_model, scoring=scoring)

        # FLOFO defaults
        self.num_bins = 10
        self.shuffle_func = np.random.permutation
        self.feature_group_len = 2

        min_data_needed = 10*(self.num_bins**self.feature_group_len)
        if self.df.shape[0] < min_data_needed:
            raise Exception("Small validation set (<{})".format(min_data_needed))
        if len(self.features) <= self.feature_group_len:
            raise Exception("FLOFO needs more than {} features".format(self.feature_group_len))

        if self.n_jobs is not None and self.n_jobs > 1:
            warning_str = "Warning: If your model is multithreaded, please initialise the number \
                of jobs of LOFO to be equal to 1, otherwise you may experience issues."
            warnings.warn(warning_str)

        self._bin_features()
Example #14
    def __init__(self,
                 name,
                 parent=None,
                 evaluations=None,
                 n_trials=200,
                 scoring_strategy: str = 'fold',
                 scoring: Union[str, Callable, None] = None,
                 optuna_jobs: int = 1,
                 **kwargs):
        """
        Optuna Optimization Model Feature

        Args:
            n_trials:
                total number of trials.
            scoring_strategy:
                out-of-fold scoring strategy.
                If set as `"fold"`, the score are calculated each by fold and use mean of them for optimization.
                If set as `"whole"`, the score is calculated whole data.
            scoring:
                scoring method. String or Scoring Object.
                When optimizing parameters, the best parameters in the sense of this score are chosen.
                By default (pass None)
                - for regression model, use `RMSE` metric and
                - for classifier model, use `negative log likelihood`.
                 (scoring obj must be satisfied check_scoring validation)
            optuna_jobs:
                optuna parallel jobs.
                [NOTE]
                    when set > 1, pre-post process model fail to jit input / target scaling class.
                    so it is recommend to set = 1.
        """
        super(TunerBlock, self).__init__(name=name,
                                         parent=parent,
                                         evaluations=evaluations)

        self.study = None  # type: Union[Study, None]
        self.n_trails = n_trials
        self.optuna_jobs = optuna_jobs

        if scoring_strategy not in self.SCORING_STRATEGY_CHOICES:
            raise ValueError('`scoring_strategy` must be in {}'.format(
                ','.join(self.SCORING_STRATEGY_CHOICES)))
        self.scoring_strategy = scoring_strategy
        if scoring is None:
            if self.is_regression_model:
                scoring = 'neg_root_mean_squared_error'
            else:
                scoring = 'neg_log_loss'

        try:
            scoring = check_scoring(self.model_class,
                                    scoring=scoring,
                                    allow_none=False)
        except ValueError as e:
            s = f'Invalid scoring argument: {scoring}. You can select scoring method from pre-defined as follow\n'
            s += ', '.join(SCORERS.keys())
            raise ValueError(s)
        self.scoring_method = scoring  # type: _BaseScorer
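
The validation above relies on standard scikit-learn behaviour: check_scoring resolves a scoring string (or scorer object) and raises ValueError for unknown names. A minimal sketch (get_scorer_names requires scikit-learn >= 1.0; older versions expose the SCORERS dict used in the error message above):

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import check_scoring, get_scorer_names

scorer = check_scoring(RandomForestRegressor(),
                       scoring="neg_root_mean_squared_error",
                       allow_none=False)
print(sorted(get_scorer_names())[:5])   # a few of the predefined scorer names
# check_scoring(RandomForestRegressor(), scoring="not-a-scorer")   # would raise ValueError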
Example #15
    def fit(self, X, y=None, groups=None, **fit_params):
        # type: (np.ndarray, np.ndarray, np.ndarray, Any) -> 'TPESearchCV'
        """Run fit with all sets of parameters.

        Args:
            X:
                Training data.

            y:
                Target variable.

            groups:
                Group labels for the samples used while splitting the dataset
                into train/test set.

            **fit_params:
                Parameters passed to ``fit`` on the estimator.

        Returns:
            self:
                Return self.
        """

        self._check_params()
        self._set_verbosity()

        classifier = is_classifier(self.estimator)
        cv = check_cv(self.cv, y, classifier)

        self.n_splits_ = cv.get_n_splits(X, y, groups=groups)
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
        self.study_ = study.create_study(load_if_exists=self.load_if_exists,
                                         pruner=self.pruner,
                                         sampler=self._sampler,
                                         storage=self.storage,
                                         study_name=self.study_name)

        objective = Objective(self.estimator,
                              self.param_distributions,
                              X,
                              y,
                              cv=cv,
                              error_score=self.error_score,
                              fit_params=fit_params,
                              groups=groups,
                              max_iter=self.max_iter,
                              return_train_score=self.return_train_score,
                              scoring=self.scorer_)

        self.study_.optimize(objective,
                             n_jobs=self.n_jobs,
                             n_trials=self.n_trials,
                             timeout=self.timeout)

        if self.refit:
            self._refit(X, y, **fit_params)

        return self
Example #16
def test_check_scoring_gridsearchcv():
    # test that check_scoring works on GridSearchCV and pipeline.
    # slightly redundant non-regression test.

    grid = GridSearchCV(LinearSVC(), param_grid={'C': [.1, 1]}, cv=3)
    scorer = check_scoring(grid, scoring="f1")
    assert isinstance(scorer, _PredictScorer)

    pipe = make_pipeline(LinearSVC())
    scorer = check_scoring(pipe, scoring="f1")
    assert isinstance(scorer, _PredictScorer)

    # check that cross_val_score definitely calls the scorer
    # and doesn't make any assumptions about the estimator apart from having a
    # fit.
    scores = cross_val_score(EstimatorWithFit(), [[1], [2], [3]], [1, 0, 1],
                             scoring=DummyScorer(), cv=3)
    assert_array_equal(scores, 1)
Example #17
def _check_multimetric_scoring(estimator, scoring=None):
    # TODO: See if scikit-learn 0.24 solves the need for using
    # a private method
    from sklearn.metrics import check_scoring
    from sklearn.metrics._scorer import _check_multimetric_scoring

    if callable(scoring) or isinstance(scoring, (type(None), str)):
        scorers = {"score": check_scoring(estimator, scoring=scoring)}
        return scorers, False
    return _check_multimetric_scoring(estimator, scoring), True
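
A usage sketch for the wrapper above: a string, callable, or None is wrapped into a single-entry dict (multimetric flag False), while a dict or list of scorings is delegated to scikit-learn's private multimetric check (flag True). The exact behaviour of the private function depends on the scikit-learn version, as the TODO notes:

from sklearn.linear_model import LogisticRegression

scorers, multimetric = _check_multimetric_scoring(LogisticRegression(), scoring="accuracy")
print(multimetric, list(scorers))   # False ['score']

scorers, multimetric = _check_multimetric_scoring(
    LogisticRegression(), scoring={"acc": "accuracy", "f1": "f1"})
print(multimetric, list(scorers))   # True ['acc', 'f1']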
Example #18
    def _validate_parameters(self, X, y):
        if (self.max_iter is not None) and self.max_iter < 1:
            raise ValueError(
                "Received max_iter={}. max_iter < 1 is not supported".format(
                    self.max_iter))

        X = self._check_array(X)
        y = self._check_array(y, ensure_2d=False)
        scorer = check_scoring(self.estimator, scoring=self.scoring)
        return X, y, scorer
Example #19
def _compute_scores(estimator, X_train, y_train, X_test, y_test, scoring):
    """ Given a fitted estimator and a train-test split (X_train , y_train , X_test , y_test),
    compute the trainig scores and the test scores (defined by the scoring parameter) and save 
    them in two separate dictionaries.
    
    Parameters
    ----------
    estimator : estimator object.
        This is assumed to implement the scikit-learn estimator interface.
        
    X_train : array-like, shape (n_samples , n_features)
    
    y_train : array-like, shape (n_samples , n_output)

    X_test : array-like, shape (n_samples , n_features)

    y_test : array-like, shape (n_samples , n_output)

    scoring : str, callable, dict of strings and callables
        Strategy to evaluate the performance of the estimator in the outer loop.
        A dictionary can be used for multiple scores.

    Returns
    -------
    List of two dictionaries 
    
    """
    training_scores, test_scores = {}, {}
    if isinstance(scoring, dict):
        for k in scoring.keys():
            scorer = metrics.check_scoring(estimator, scoring[k])
            training_scores[k], test_scores[k] = scorer(
                estimator, X_train, y_train), scorer(estimator, X_test, y_test)
    elif isinstance(scoring, str):
        scorer = metrics.check_scoring(estimator, scoring)
        training_scores[scoring], test_scores[scoring] = scorer(
            estimator, X_train, y_train), scorer(estimator, X_test, y_test)
    else:
        scorer = metrics.check_scoring(estimator, scoring)
        training_scores['score'], test_scores['score'] = scorer(
            estimator, X_train, y_train), scorer(estimator, X_test, y_test)
    return [training_scores, test_scores]
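
A usage sketch for _compute_scores above (toy data): passing a dict of scorings yields one entry per key in both returned dictionaries.

from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=200, n_features=10, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
est = Ridge().fit(X_train, y_train)

train_scores, test_scores = _compute_scores(
    est, X_train, y_train, X_test, y_test,
    scoring={"r2": "r2", "mae": "neg_mean_absolute_error"})
print(test_scores)   # e.g. {'r2': ..., 'mae': ...}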
Example #20
    def fit(self, X, y):
        """
        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

        y : ndarray of shape (n_samples, n_targets)
            Training data, where n_samples is the number of samples
            and n_targets is the number of target properties.
        """
        # check input parameters, can be moved at some point to a sklearn-like check function
        if self.regularization_method not in ["tikhonov", "cutoff"]:
            raise ValueError(
                f"regularization method {self.regularization_method} is not known."
            )
        if self.alpha_type not in ["absolute", "relative"]:
            raise ValueError(f"alpha type {self.alpha_type} is not known.")
        if self.alpha_type == "relative" and (np.any(self.alphas < 0)
                                              or np.any(self.alphas >= 1)):
            raise ValueError(
                "relative alphas type used, but the alphas are not within the range [0,1)"
            )

        # check_scoring uses estimators scoring function if the scorer is None, this is intercepted here
        if self.scoring is None:
            scorer = check_scoring(self,
                                   scoring="neg_root_mean_squared_error",
                                   allow_none=False)
        else:
            scorer = check_scoring(self,
                                   scoring=self.scoring,
                                   allow_none=False)
        fold1_idx, fold2_idx = next(
            KFold(n_splits=2,
                  shuffle=self.shuffle,
                  random_state=self.random_state).split(X))
        self.coef_ = self._2fold_cv(X, y, fold1_idx, fold2_idx, scorer)
        return self
Example #21
def score_estimator(scoring, estimator, coordinates, data, weights=None):
    """
    Score the given gridder against the given data using the given metric.

    If the data and predictions have more than 1 component, the scores of each
    component will be averaged.

    Parameters
    ----------
    scoring : str or callable
        A scoring specification known to scikit-learn. See
        :func:`sklearn.metrics.check_scoring`.
    estimator : a Verde gridder
        The gridder to score. Usually derived from
        :class:`verde.base.BaseGridder`.
    coordinates : tuple of arrays
        Arrays with the coordinates of each data point. Should be in the
        following order: (easting, northing, vertical, ...).
        For the specific definition of coordinate systems and what these
        names mean, see the class docstring.
    data : array or tuple of arrays
        The data values of each data point. If the data has more than one
        component, *data* must be a tuple of arrays (one for each
        component).
    weights : None or array or tuple of arrays
        If not None, then the weights assigned to each data point. If more
        than one data component is provided, you must provide a weights
        array for each data component (if not None).

    Returns
    -------
    score : float
        The score.

    """
    coordinates, data, weights = check_fit_input(
        coordinates, data, weights, unpack=False
    )
    predicted = check_data(estimator.predict(coordinates))
    scorer = check_scoring(DummyEstimator, scoring=scoring)
    result = np.mean(
        [
            scorer(
                DummyEstimator(pred.ravel()),
                coordinates,
                data[i].ravel(),
                sample_weight=weights[i],
            )
            for i, pred in enumerate(predicted)
        ]
    )
    return result
Example #22
    def _cross_val(self, X, y, scoring=None, cv=None, **kwargs):
        if self.backend != 'h2o':
            cv = KFold(n_splits=5) if cv is None else cv
            njobs = -1 if 'n_jobs' not in kwargs else kwargs.pop('n_jobs')
            if 'return_estimator' in kwargs:
                kwargs.pop('return_estimator')
            scoring = make_scorer(mean_squared_error) if scoring is None else scoring

            if callable(scoring) or isinstance(scoring, str):
                scorers = scoring
                try:
                    check_scoring(self.model, scorers)
                    scorers = {scorers.__name__.replace('_', ' '): (make_scorer(scorers) if
                               isinstance(scorers, (types.FunctionType, types.BuiltinFunctionType, functools.partial))
                               else scorers)}
                except ValueError:
                    scorers = {scorers.__name__.replace('_', ' '): make_scorer(scorers)}
            elif isinstance(scoring, (tuple, list)):
                scorers = []
                for scorer in scoring:
                    try:
                        check_scoring(self.model, scorer)
                        scorers.append([scorer.__name__.replace('_', ' '),
                                        (make_scorer(scorer) if
                                         isinstance(scorer, (types.FunctionType, types.BuiltinFunctionType,
                                                             functools.partial)) else scorer)])
                    except ValueError:
                        scorers.append([scorer.__name__.replace('_', ' '), make_scorer(scorer)])
                scorers = {scorer[0]: scorer[1] for scorer in scorers}
            else:
                raise NotImplementedError(f'Scoring of type {type(scoring)} is not supported.')

            cv_results = cross_validate(self.model, X, y=y, scoring=scorers, cv=cv, n_jobs=njobs,
                                        return_estimator=True, **kwargs)
            estimators = cv_results.pop('estimator')
            cv_results = {key.split('test_')[1]: cv_results[key] for key in cv_results if key.startswith('test_')}
            return estimators, cv_results
        else:
            raise NotImplementedError('_cross_val method is not implemented for backend=`h2o`')
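
A standalone sketch of the pattern used above, outside the wrapper class (names are illustrative): each requested scoring is first tried with check_scoring, and plain metric callables that it rejects are wrapped with make_scorer.

from sklearn.linear_model import Ridge
from sklearn.metrics import check_scoring, make_scorer, mean_absolute_error

def resolve_scorers(estimator, scorings):
    resolved = {}
    for s in scorings:
        name = s if isinstance(s, str) else s.__name__.replace('_', ' ')
        try:
            check_scoring(estimator, s)        # accepts scorer objects and predefined names
            resolved[name] = s
        except ValueError:
            resolved[name] = make_scorer(s)    # plain metric function -> wrap it
    return resolved

scorers = resolve_scorers(Ridge(), ['r2', mean_absolute_error])
print(list(scorers))   # ['r2', 'mean absolute error']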
Example #23
def validation_curve(estimator,
                     X,
                     y,
                     param_name,
                     param_range,
                     groups=None,
                     cv=None,
                     scoring=None,
                     n_jobs=None,
                     pre_dispatch="all",
                     verbose=0,
                     error_score=np.nan):
    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    scorer = check_scoring(estimator, scoring=scoring)

    parallel = Parallel(n_jobs=n_jobs,
                        pre_dispatch=pre_dispatch,
                        verbose=verbose)
    out = parallel(
        delayed(_fit_and_score)(clone(estimator),
                                X,
                                y,
                                scorer,
                                train,
                                test,
                                verbose,
                                parameters={
                                    param_name: v
                                },
                                fit_params=None,
                                return_train_score=True,
                                error_score=error_score,
                                return_estimator=True,
                                return_times=True)
        # NOTE do not change order of iteration to allow one time cv splitters
        for train, test in cv.split(X, y, groups) for v in param_range)

    out = np.asarray(out)
    estimators = out[:, 4]
    out_scores = np.asarray(out[:, :2])
    fit_time = out[:, 2]
    score_time = out[:, 3]
    n_params = len(param_range)
    n_cv_folds = out_scores.shape[0] // n_params
    out_scores = out_scores.reshape(n_cv_folds, n_params, 2).transpose(
        (2, 1, 0))

    return estimators, np.float64(out_scores[0]), np.float64(out_scores[1]), np.float64(fit_time), \
           np.float64(score_time)
Example #24
    def _validate_parameters(self, X, y):
        if (self.max_iter is not None) and self.max_iter < 1:
            raise ValueError(
                "Received max_iter={}. max_iter < 1 is not supported".format(
                    self.max_iter))

        # Make sure dask arrays are passed so error on unknown chunk size is raised
        kwargs = dict(accept_unknown_chunks=True, accept_dask_dataframe=True)
        if not isinstance(X, dd.DataFrame):
            X = self._check_array(X, **kwargs)
        if not isinstance(y, dd.Series):
            y = self._check_array(y, ensure_2d=False, **kwargs)
        scorer = check_scoring(self.estimator, scoring=self.scoring)
        return X, y, scorer
Example #25
def fit_grid_point(X,
                   y,
                   estimator,
                   parameters,
                   train,
                   test,
                   scorer,
                   verbose,
                   error_score=np.nan,
                   **fit_params):
    check_scoring(estimator, scorer)
    scores, n_samples_test = _fit_and_score(estimator,
                                            X,
                                            y,
                                            scorer,
                                            train,
                                            test,
                                            verbose,
                                            parameters,
                                            fit_params=fit_params,
                                            return_n_test_samples=True,
                                            error_score=error_score)
    return scores, parameters, n_samples_test
Example #26
def get_col_score(estimator,
                  X,
                  y,
                  col,
                  n_repeats=5,
                  scoring=None,
                  random_state=None):
    """Calculate score when `col` is permuted."""

    scorer = check_scoring(estimator, scoring=scoring)
    rstate = check_random_state(random_state)

    scores = _get_col_score(estimator, X, y, col, n_repeats, scorer, rstate)

    return scores
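
A usage sketch for get_col_score above (illustrative data; it assumes the private _get_col_score helper is importable from the same module, and that it returns one score per repeat):

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=300, n_features=8, random_state=0)
est = RandomForestClassifier(random_state=0).fit(X, y)
scores = get_col_score(est, X, y, col=3, n_repeats=10, scoring="accuracy", random_state=0)
print(scores)   # scores obtained with column 3 permuted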
Example #27
def get_group_score(estimator,
                    X,
                    y,
                    g,
                    n_repeats=5,
                    scoring=None,
                    random_state=None):
    """Calculate score when columns group `g` is permuted."""

    scorer = check_scoring(estimator, scoring=scoring)
    rstate = check_random_state(random_state)

    scores = _get_group_score(estimator, X, y, g, n_repeats, scorer, rstate)

    return scores
Example #28
    def permutation_importance(estimator,
                               X,
                               y,
                               *,
                               scoring=None,
                               n_repeats=5,
                               n_jobs=None,
                               random_state=None):
        if not DaskToolBox.is_dask_dataframe(X):
            return sk_inspect.permutation_importance(estimator,
                                                     X,
                                                     y,
                                                     scoring=scoring,
                                                     n_repeats=n_repeats,
                                                     n_jobs=n_jobs,
                                                     random_state=random_state)
        random_state = sk_utils.check_random_state(random_state)

        def shuffle_partition(df, col_idx):
            shuffling_idx = np.arange(df.shape[0])
            random_state.shuffle(shuffling_idx)
            col = df.iloc[shuffling_idx, col_idx]
            col.index = df.index
            df.iloc[:, col_idx] = col
            return df

        if DaskToolBox.is_dask_object(y):
            y = y.compute()

        scorer = sk_metrics.check_scoring(
            DaskToolBox.wrap_for_local_scorer(estimator, type_of_target(y)),
            scoring)
        baseline_score = scorer(estimator, X, y)
        scores = []

        for c in range(X.shape[1]):
            col_scores = []
            for i in range(n_repeats):
                X_permuted = X.copy().map_partitions(shuffle_partition, c)
                col_scores.append(scorer(estimator, X_permuted, y))
            if logger.is_debug_enabled():
                logger.debug(f'permuted scores [{X.columns[c]}]: {col_scores}')
            scores.append(col_scores)

        importances = baseline_score - np.array(scores)
        return sk_utils.Bunch(importances_mean=np.mean(importances, axis=1),
                              importances_std=np.std(importances, axis=1),
                              importances=importances)
Example #29
    def fit(self, X, y, groups=None, **fit_params):
        # type: (...) -> PermutationImportance
        """Compute ``feature_importances_`` attribute and optionally
        fit the base estimator.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The training input samples.

        y : array-like, shape (n_samples,)
            The target values (integers that correspond to classes in
            classification, real numbers in regression).

        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.

        **fit_params : Other estimator specific parameters

        Returns
        -------
        self : object
            Returns self.
        """
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        if pandas_available and isinstance(X, pd.DataFrame):
            self.scorer_ = self._wrap_scorer(self.scorer_, X.columns)

        if self.cv != "prefit" and self.refit:
            self.estimator_ = clone(self.estimator)
            self.estimator_.fit(X, y, **fit_params)

        X = check_array(X, force_all_finite='allow-nan')

        if self.cv not in (None, "prefit"):
            si = self._cv_scores_importances(X, y, groups=groups, **fit_params)
        else:
            si = self._non_cv_scores_importances(X, y)
        scores, results = si
        self.scores_ = np.array(scores)
        self.results_ = results
        self.feature_importances_ = np.mean(results, axis=0)
        self.feature_importances_std_ = np.std(results, axis=0)
        return self
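
A hypothetical usage sketch for the fit method above. The class constructor is not shown, so the argument names below follow the common eli5-style signature and are assumptions:

from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge

X, y = make_regression(n_samples=300, n_features=8, random_state=0)
est = Ridge().fit(X, y)
# assumption: the constructor accepts an estimator plus scoring and cv arguments
perm = PermutationImportance(est, scoring="r2", cv="prefit").fit(X, y)
print(perm.feature_importances_)   # mean importance per feature (mean of `results_` over repeats)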
Example #30
    def fit(self, X, y):
        """Fit the static selection model by select an ensemble of classifier
        containing the base classifiers with highest accuracy in the given
        dataset.

        Parameters
        ----------
        X : array of shape (n_samples, n_features)
            Data used to fit the model.

        y : array of shape (n_samples)
            class labels of each example in X.

        Returns
        -------
        self : object
            Returns self.
        """
        self._validate_parameters()

        X, y = check_X_y(X, y)

        super(StaticSelection, self).fit(X, y)

        self.n_classifiers_ensemble_ = int(self.n_classifiers_ *
                                           self.pct_classifiers)

        performances = np.zeros(self.n_classifiers_)

        if not self.base_already_encoded_:
            y_encoded = y
        else:
            y_encoded = self.enc_.transform(y)

        for clf_idx, clf in enumerate(self.pool_classifiers_):
            scorer = check_scoring(clf, self.scoring)
            performances[clf_idx] = scorer(
                clf, X[:, self.estimator_features_[clf_idx]], y_encoded)

        self.clf_indices_ = np.argsort(
            performances)[::-1][0:self.n_classifiers_ensemble_]
        self.ensemble_ = [
            self.pool_classifiers_[clf_idx] for clf_idx in self.clf_indices_
        ]

        return self
Example #31
    def fit(self, X, y):
        """Fit the gradient boosting model.

        Parameters
        ----------
        X : array-like, shape=(n_samples, n_features)
            The input samples.

        y : array-like, shape=(n_samples,)
            Target values.

        Returns
        -------
        self : object
        """

        fit_start_time = time()
        acc_find_split_time = 0.  # time spent finding the best splits
        acc_apply_split_time = 0.  # time spent splitting nodes
        acc_compute_hist_time = 0.  # time spent computing histograms
        # time spent predicting X for gradient and hessians update
        acc_prediction_time = 0.
        X, y = check_X_y(X, y, dtype=[X_DTYPE])
        y = self._encode_y(y)
        rng = check_random_state(self.random_state)

        self._validate_parameters()
        self.n_features_ = X.shape[1]  # used for validation in predict()

        # we need this stateful variable to tell raw_predict() that it was
        # called from fit() (this current method), and that the data it has
        # received is pre-binned.
        # predicting is faster on pre-binned data, so we want early stopping
        # predictions to be made on pre-binned data. Unfortunately the scorer_
        # can only call predict() or predict_proba(), not raw_predict(), and
        # there's no way to tell the scorer that it needs to predict binned
        # data.
        self._in_fit = True


        self.loss_ = self._get_loss()

        self.do_early_stopping_ = (self.n_iter_no_change is not None and
                                   self.n_iter_no_change > 0)

        # create validation data if needed
        self._use_validation_data = self.validation_fraction is not None
        if self.do_early_stopping_ and self._use_validation_data:
            # stratify for classification
            stratify = y if hasattr(self.loss_, 'predict_proba') else None

            X_train, X_val, y_train, y_val = train_test_split(
                X, y, test_size=self.validation_fraction, stratify=stratify,
                random_state=rng)
        else:
            X_train, y_train = X, y
            X_val, y_val = None, None

        # Bin the data
        self.bin_mapper_ = _BinMapper(max_bins=self.max_bins, random_state=rng)
        X_binned_train = self._bin_data(X_train, rng, is_training_data=True)
        if X_val is not None:
            X_binned_val = self._bin_data(X_val, rng, is_training_data=False)
        else:
            X_binned_val = None

        if self.verbose:
            print("Fitting gradient boosted rounds:")

        # initialize raw_predictions: those are the accumulated values
        # predicted by the trees for the training data. raw_predictions has
        # shape (n_trees_per_iteration, n_samples) where
        # n_trees_per_iterations is n_classes in multiclass classification,
        # else 1.
        n_samples = X_binned_train.shape[0]
        self._baseline_prediction = self.loss_.get_baseline_prediction(
            y_train, self.n_trees_per_iteration_
        )
        raw_predictions = np.zeros(
            shape=(self.n_trees_per_iteration_, n_samples),
            dtype=self._baseline_prediction.dtype
        )
        raw_predictions += self._baseline_prediction

        # initialize gradients and hessians (empty arrays).
        # shape = (n_trees_per_iteration, n_samples).
        gradients, hessians = self.loss_.init_gradients_and_hessians(
            n_samples=n_samples,
            prediction_dim=self.n_trees_per_iteration_
        )

        # predictors is a matrix (list of lists) of TreePredictor objects
        # with shape (n_iter_, n_trees_per_iteration)
        self._predictors = predictors = []

        # Initialize structures and attributes related to early stopping
        self.scorer_ = None  # set if scoring != loss
        raw_predictions_val = None  # set if scoring == loss and use val
        self.train_score_ = []
        self.validation_score_ = []
        if self.do_early_stopping_:
            # populate train_score and validation_score with the predictions
            # of the initial model (before the first tree)

            if self.scoring == 'loss':
                # we're going to compute scoring w.r.t the loss. As losses
                # take raw predictions as input (unlike the scorers), we can
                # optimize a bit and avoid repeating computing the predictions
                # of the previous trees. We'll re-use raw_predictions (as it's
                # needed for training anyway) for evaluating the training
                # loss, and create raw_predictions_val for storing the
                # raw predictions of the validation data.

                if self._use_validation_data:
                    raw_predictions_val = np.zeros(
                        shape=(self.n_trees_per_iteration_,
                               X_binned_val.shape[0]),
                        dtype=self._baseline_prediction.dtype
                    )

                    raw_predictions_val += self._baseline_prediction

                self._check_early_stopping_loss(raw_predictions, y_train,
                                                raw_predictions_val, y_val)
            else:
                self.scorer_ = check_scoring(self, self.scoring)
                # scorer_ is a callable with signature (est, X, y) and calls
                # est.predict() or est.predict_proba() depending on its nature.
                # Unfortunately, each call to scorer_() will compute
                # the predictions of all the trees. So we use a subset of the
                # training set to compute train scores.
                subsample_size = 10000  # should we expose this parameter?
                indices = np.arange(X_binned_train.shape[0])
                if X_binned_train.shape[0] > subsample_size:
                    # TODO: not critical but stratify using resample()
                    indices = rng.choice(indices, subsample_size,
                                         replace=False)
                X_binned_small_train = X_binned_train[indices]
                y_small_train = y_train[indices]
                # Predicting is faster on C-contiguous arrays.
                X_binned_small_train = np.ascontiguousarray(
                    X_binned_small_train)

                self._check_early_stopping_scorer(
                    X_binned_small_train, y_small_train,
                    X_binned_val, y_val,
                )

        for iteration in range(self.max_iter):

            if self.verbose:
                iteration_start_time = time()
                print("[{}/{}] ".format(iteration + 1, self.max_iter),
                      end='', flush=True)

            # Update gradients and hessians, inplace
            self.loss_.update_gradients_and_hessians(gradients, hessians,
                                                     y_train, raw_predictions)

            # Append a list since there may be more than 1 predictor per iter
            predictors.append([])

            # Build `n_trees_per_iteration` trees.
            for k in range(self.n_trees_per_iteration_):

                grower = TreeGrower(
                    X_binned_train, gradients[k, :], hessians[k, :],
                    max_bins=self.max_bins,
                    actual_n_bins=self.bin_mapper_.actual_n_bins_,
                    max_leaf_nodes=self.max_leaf_nodes,
                    max_depth=self.max_depth,
                    min_samples_leaf=self.min_samples_leaf,
                    l2_regularization=self.l2_regularization,
                    shrinkage=self.learning_rate)
                grower.grow()

                acc_apply_split_time += grower.total_apply_split_time
                acc_find_split_time += grower.total_find_split_time
                acc_compute_hist_time += grower.total_compute_hist_time

                predictor = grower.make_predictor(
                    bin_thresholds=self.bin_mapper_.bin_thresholds_
                )
                predictors[-1].append(predictor)

                # Update raw_predictions with the predictions of the newly
                # created tree.
                tic_pred = time()
                _update_raw_predictions(raw_predictions[k, :], grower)
                toc_pred = time()
                acc_prediction_time += toc_pred - tic_pred

            should_early_stop = False
            if self.do_early_stopping_:
                if self.scoring == 'loss':
                    # Update raw_predictions_val with the newest tree(s)
                    if self._use_validation_data:
                        for k, pred in enumerate(self._predictors[-1]):
                            raw_predictions_val[k, :] += (
                                pred.predict_binned(X_binned_val))

                    should_early_stop = self._check_early_stopping_loss(
                        raw_predictions, y_train,
                        raw_predictions_val, y_val
                    )

                else:
                    should_early_stop = self._check_early_stopping_scorer(
                        X_binned_small_train, y_small_train,
                        X_binned_val, y_val,
                    )

            if self.verbose:
                self._print_iteration_stats(iteration_start_time)

            # maybe we could also early stop if all the trees are stumps?
            if should_early_stop:
                break

        if self.verbose:
            duration = time() - fit_start_time
            n_total_leaves = sum(
                predictor.get_n_leaf_nodes()
                for predictors_at_ith_iteration in self._predictors
                for predictor in predictors_at_ith_iteration
            )
            n_predictors = sum(
                len(predictors_at_ith_iteration)
                for predictors_at_ith_iteration in self._predictors)
            print("Fit {} trees in {:.3f} s, ({} total leaves)".format(
                n_predictors, duration, n_total_leaves))
            print("{:<32} {:.3f}s".format('Time spent computing histograms:',
                                          acc_compute_hist_time))
            print("{:<32} {:.3f}s".format('Time spent finding best splits:',
                                          acc_find_split_time))
            print("{:<32} {:.3f}s".format('Time spent applying splits:',
                                          acc_apply_split_time))
            print("{:<32} {:.3f}s".format('Time spent predicting:',
                                          acc_prediction_time))

        self.train_score_ = np.asarray(self.train_score_)
        self.validation_score_ = np.asarray(self.validation_score_)
        del self._in_fit  # hard delete so we're sure it can't be used anymore
        return self