def hyperopt_cv(self, X, y, params, fn=None, algo=None, max_evals=10,
                timeout=None, fmin_params=None, fn_params=None, p_last=True):
    """Hyperparameter optimization using hyperopt.

    Uses cross-validation to evaluate hyperparameters by default.

    Args:
        X (:obj:`pd.DataFrame`, :obj:`pd.Series`): Training data.
        y (:obj:`pd.DataFrame`, :obj:`pd.Series`): Training target values.
        params (dict): Dictionary of hyperparameters passed to hyperopt.
        fn (:obj:`callable`, optional): Objective function to optimize with hyperopt.
        algo (:obj:`callable`, optional): Algorithm for hyperopt. Available choices are
            hyperopt.tpe.suggest and hyperopt.rand.suggest. Uses hyperopt.tpe.suggest
            by default.
        max_evals (:obj:`int`, optional): Number of function evaluations before returning.
        timeout (:obj:`None`, :obj:`int`, optional): Limits search time to the given
            number of seconds. If None, the search has no time constraint. None by default.
        fmin_params (:obj:`dict`, optional): Dictionary of supplementary arguments for
            the hyperopt.fmin function.
        fn_params (:obj:`dict`, optional): Dictionary of supplementary arguments for a
            custom fn objective function.
        p_last (:obj:`bool`, optional): If the model object is a sklearn.Pipeline, apply
            fit parameters to the last step. True by default.

    Returns:
        dict: Dictionary of the best choice of hyperparameters. The best model is
        also fitted.
    """
    if self.backend == 'h2o':
        raise Exception('hyperopt_cv is not supported by `h2o` backend. Use `optimize_hyperparam`')
    trials = Trials()
    algo = tpe.suggest if algo is None else algo
    if (isinstance(self.model, Pipeline) and fn_params is not None
            and 'fit_params' in fn_params and p_last):
        # Prefix fit parameters with the name of the final pipeline step.
        fn_params['fit_params'] = {
            f'{self.model.steps[-1][0]}__{key}': value
            for key, value in fn_params['fit_params'].items()
        }
    if fn is None:
        scoring = (fn_params.pop('scoring')
                   if isinstance(fn_params, dict) and 'scoring' in fn_params
                   else None)
        scoring = make_scorer(mean_squared_error) if scoring is None else scoring
        try:
            check_scoring(self, scoring)
        except ValueError:
            scoring = make_scorer(scoring)
        fn = functools.partial(self._hyperopt_obj_cv, X=X, y=y, scoring=scoring,
                               **(fn_params if fn_params is not None else {}))
    best = fmin(fn=fn, space=params, trials=trials, algo=algo,
                max_evals=max_evals, timeout=timeout,
                **(fmin_params if fmin_params is not None else {}))
    best_params = space_eval(params, best)
    # Cast floats that represent whole numbers back to int.
    best_params = {
        key: int(value) if isinstance(value, float) and value.is_integer() else value
        for key, value in best_params.items()
    }
    self.best_params, self.trials = best_params, trials
    self.model = self.object(**self.best_params)
    self.model.fit(X, y, **(fn_params['fit_params']
                            if fn_params is not None and 'fit_params' in fn_params
                            else {}))
    if not hasattr(self.model, 'feature_name_'):
        self.model.feature_name_ = X.columns.tolist() if isinstance(X, DataFrame) else [X.name]
    self._update_meta()
    return self.best_params
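# --- Usage sketch (not from the source): a hedged example of calling
# hyperopt_cv above. `Model` is a hypothetical wrapper class exposing this
# method and holding `self.object`; the search-space helpers (hp.loguniform,
# hp.quniform) are the real hyperopt API.
import numpy as np
from hyperopt import hp

model = Model(...)  # hypothetical wrapper; fill in the backend estimator
space = {
    'learning_rate': hp.loguniform('learning_rate', np.log(1e-3), np.log(0.3)),
    'n_estimators': hp.quniform('n_estimators', 50, 500, 50),
}
best_params = model.hyperopt_cv(X, y, space, max_evals=25, timeout=600)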
def test_scoring_is_not_metric():
    with pytest.raises(ValueError, match="make_scorer"):
        check_scoring(LogisticRegression(), scoring=f1_score)
    with pytest.raises(ValueError, match="make_scorer"):
        check_scoring(LogisticRegression(), scoring=roc_auc_score)
    with pytest.raises(ValueError, match="make_scorer"):
        check_scoring(Ridge(), scoring=r2_score)
    with pytest.raises(ValueError, match="make_scorer"):
        check_scoring(KMeans(), scoring=cluster_module.adjusted_rand_score)
    with pytest.raises(ValueError, match="make_scorer"):
        check_scoring(KMeans(), scoring=cluster_module.rand_score)
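# --- Illustration (not from the source): the invariant exercised by the test
# above. Passing a bare metric with signature (y_true, y_pred) to
# check_scoring raises a ValueError that points at make_scorer, while a
# wrapped metric is accepted; this is standard scikit-learn behavior.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import check_scoring, f1_score, make_scorer

clf = LogisticRegression()
try:
    check_scoring(clf, scoring=f1_score)  # raw metric: rejected
except ValueError as exc:
    assert "make_scorer" in str(exc)
scorer = check_scoring(clf, scoring=make_scorer(f1_score))  # scorer: accepted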
def test_random_seeds_warm_start(GradientBoosting, X, y, rng_type):
    # Make sure the seeds for train/val split and small trainset subsampling
    # are correctly set in a warm start context.
    def _get_rng(rng_type):
        # Helper to avoid consuming rngs
        if rng_type == 'none':
            return None
        elif rng_type == 'int':
            return 42
        else:
            return np.random.RandomState(0)

    random_state = _get_rng(rng_type)
    gb_1 = GradientBoosting(early_stopping=True, max_iter=2,
                            random_state=random_state)
    gb_1.set_params(scoring=check_scoring(gb_1))
    gb_1.fit(X, y)
    random_seed_1_1 = gb_1._random_seed

    gb_1.fit(X, y)
    random_seed_1_2 = gb_1._random_seed  # clear the old state, different seed

    random_state = _get_rng(rng_type)
    gb_2 = GradientBoosting(early_stopping=True, max_iter=2,
                            random_state=random_state, warm_start=True)
    gb_2.set_params(scoring=check_scoring(gb_2))
    gb_2.fit(X, y)  # inits state
    random_seed_2_1 = gb_2._random_seed
    gb_2.fit(X, y)  # clears old state and equals est
    random_seed_2_2 = gb_2._random_seed

    # Without warm starting, the seeds should be
    # * all different if random state is None
    # * all equal if random state is an integer
    # * different when refitting and equal with a new estimator (because
    #   the random state is mutated)
    if rng_type == 'none':
        assert random_seed_1_1 != random_seed_1_2 != random_seed_2_1
    elif rng_type == 'int':
        assert random_seed_1_1 == random_seed_1_2 == random_seed_2_1
    else:
        assert random_seed_1_1 == random_seed_2_1 != random_seed_1_2

    # With warm starting, the seeds must be equal
    assert random_seed_2_1 == random_seed_2_2
def test_parallel_fit():
    """The goal of this test is to check that the results of _parallel_fit
    are the same for different controlled param_grid values.
    """
    X, y = make_regression(n_samples=100, n_features=20, n_informative=5,
                           noise=0.2, random_state=42)
    train = range(80)
    test = range(80, len(y))

    outputs = []
    estimator = svr
    svr_params = [[1e-1, 1e0, 1e1], [1e-1, 1e0, 5e0, 1e1]]
    scorer = check_scoring(estimator, 'r2')  # define a scorer
    # Define a screening selector
    selector = check_feature_screening(screening_percentile=None,
                                       mask_img=None, is_classification=False)
    for params in svr_params:
        param_grid = {}
        param_grid['C'] = np.array(params)
        outputs.append(list(_parallel_fit(estimator=estimator, X=X, y=y,
                                          train=train, test=test,
                                          param_grid=param_grid,
                                          is_classification=False,
                                          scorer=scorer, mask_img=None,
                                          class_index=1, selector=selector,
                                          clustering_percentile=100)))
    # check that every element of the output tuple is the same for both tries
    for a, b in zip(outputs[0], outputs[1]):
        if isinstance(a, np.ndarray):
            np.testing.assert_array_almost_equal(a, b)
        else:
            assert a == b
def cross_val_score(estimator, X, y=None, groups=None, scoring=None, cv=None,
                    n_jobs=None, verbose=0, fit_params=None,
                    pre_dispatch='2*n_jobs', error_score=np.nan):
    # To ensure multimetric format is not supported
    scorer = check_scoring(estimator, scoring=scoring)
    cv_results = cross_validate(estimator=estimator, X=X, y=y, groups=groups,
                                scoring={'score': scorer}, cv=cv,
                                n_jobs=n_jobs, verbose=verbose,
                                fit_params=fit_params,
                                pre_dispatch=pre_dispatch,
                                error_score=error_score)
    return cv_results
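# --- Usage sketch (not from the source): unlike sklearn's cross_val_score,
# which returns only the array of test scores, the variant above returns the
# full cross_validate dict; with scoring={'score': scorer} the per-fold
# scores live under the 'test_score' key.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X_demo, y_demo = make_classification(n_samples=60, random_state=0)
results = cross_val_score(LogisticRegression(max_iter=1000), X_demo, y_demo,
                          scoring='accuracy', cv=3)
fold_scores = results['test_score']  # array of 3 per-fold accuracies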
def _estimate_performances(self, X, y):
    performances = np.zeros(self.n_classifiers_)
    for idx, clf in enumerate(self.pool_classifiers_):
        scorer = check_scoring(clf, self.scoring)
        performances[idx] = scorer(clf, X[:, self.estimator_features_[idx]], y)
    return performances
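# --- Usage sketch (not from the source): the check_scoring contract used by
# _estimate_performances above and by most snippets in this section.
# check_scoring turns a scoring name or callable into a scorer with signature
# scorer(estimator, X, y); this is standard scikit-learn API.
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.metrics import check_scoring

X_demo, y_demo = make_regression(n_samples=80, random_state=0)
est = Ridge().fit(X_demo, y_demo)
scorer = check_scoring(est, scoring="neg_mean_squared_error")
value = scorer(est, X_demo, y_demo)  # higher is better (negated MSE)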
def hyperopt_train_test(params, X_train, y_train):
    warnings.filterwarnings("ignore")
    trainable = create_instance_from_hyperopt_search_space(self.estimator, params)
    try:
        cv_score, logloss, execution_time = cross_val_score_track_trials(
            trainable, X_train, y_train, cv=self.cv, scoring=self.scoring,
            args_to_scorer=self.args_to_scorer)
        logger.debug("Successful trial of hyperopt with hyperparameters:{}".format(params))
    except BaseException as e:
        # If there is any error in cross validation, use the score based on
        # a random train-test split as the evaluation criterion
        if self.handle_cv_failure:
            X_train_part, X_validation, y_train_part, y_validation = \
                train_test_split(X_train, y_train, test_size=0.20)
            start = time.time()
            trained = trainable.fit(X_train_part, y_train_part)
            scorer = check_scoring(trainable, scoring=self.scoring)
            cv_score = scorer(trained, X_validation, y_validation,
                              **self.args_to_scorer)
            execution_time = time.time() - start
            y_pred_proba = trained.predict_proba(X_validation)
            try:
                logloss = log_loss(y_true=y_validation, y_pred=y_pred_proba)
            except BaseException:
                logloss = 0
                logger.debug("Warning, log loss cannot be computed")
        else:
            logger.debug(e)
            logger.debug("Error {} with pipeline:{}".format(e, trainable.to_json()))
            raise e
    return cv_score, logloss, execution_time
def permutations(estimator, X, y, cv=None, n_permutations=100, random_state=0,
                 scoring=None):
    """Permutation test following the sklearn API of
    sklearn.inspection.permutation_test_score, modified to accommodate
    filtering of features using a correlation matrix before running
    cross-validation with the model.
    """
    Xs, ys = indexable(X, y)
    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    scorer = check_scoring(estimator, scoring=scoring)
    random_state = check_random_state(random_state)
    # corr = CorrMatrix()
    # corr.fit(X, y)
    # Xs, ys = corr.transform()
    score = _permutations(clone(estimator), Xs, ys, cv, scorer)
    permutation_scores = np.zeros(n_permutations)
    for i in range(n_permutations):
        # corr_p = CorrMatrix()
        # corr_p.fit(X, y)
        # Xp, yp = corr_p.transform()
        yp = _safe_indexing(y, random_state.permutation(len(y)))
        permutation_scores[i] = _permutations(clone(estimator), Xs, yp, cv, scorer)
    pvalue = (np.sum(permutation_scores >= score) + 1.0) / (n_permutations + 1)
    return score, permutation_scores, pvalue
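# --- Usage sketch (not from the source): a hedged example of the permutation
# test above; assumes the module-level _permutations helper it relies on is
# importable. The p-value uses the standard (B + 1) / (N + 1) estimator.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X_demo, y_demo = make_classification(n_samples=100, random_state=0)
score, perm_scores, pvalue = permutations(
    LogisticRegression(max_iter=1000), X_demo, y_demo, cv=5,
    n_permutations=100, scoring='accuracy')
# a small pvalue means the score is unlikely under label permutation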
def score(self, X, y, scoring):
    """Score the estimator.

    Parameters
    ----------
    X : np.ndarray
        Data.
    y : np.ndarray
        True y.
    scoring : str
        Scoring method, default is "r2".
    """
    scoring = check_scoring(self, scoring=scoring)
    if not isinstance(scoring, (list, tuple)):
        scoring = [scoring, ]
    try:
        sc_all = []
        for si in scoring:
            sc = si(self, X, y)
            sc_all.append(sc)
    # except (ValueError, RuntimeWarning):
    except RuntimeWarning:
        sc_all = None
    return sc_all
def smac_train_test(trainable, X_train, y_train):
    try:
        cv_score, logloss, execution_time = cross_val_score_track_trials(
            trainable, X_train, y_train, cv=self.cv, scoring=self.scoring)
        logger.debug("Successful trial of SMAC")
    except BaseException as e:
        # If there is any error in cross validation, use the score based on
        # a random train-test split as the evaluation criterion
        if self.handle_cv_failure:
            X_train_part, X_validation, y_train_part, y_validation = \
                train_test_split(X_train, y_train, test_size=0.20)
            start = time.time()
            trained = trainable.fit(X_train_part, y_train_part)
            scorer = check_scoring(trainable, scoring=self.scoring)
            cv_score = scorer(trained, X_validation, y_validation)
            execution_time = time.time() - start
            y_pred_proba = trained.predict_proba(X_validation)
            try:
                logloss = log_loss(y_true=y_validation, y_pred=y_pred_proba)
            except BaseException:
                logloss = 0
                logger.debug("Warning, log loss cannot be computed")
        else:
            logger.debug("Error {} with pipeline:{}".format(e, trainable.to_json()))
            raise e
    return cv_score, logloss, execution_time
def hyperopt_train_test(params, X_train, y_train):
    warnings.filterwarnings("ignore")
    reg = create_instance_from_hyperopt_search_space(self.estimator, params)
    try:
        cv_score, _, execution_time = cross_val_score_track_trials(
            reg, X_train, y_train, cv=KFold(self.cv), scoring=self.scoring)
        logger.debug("Successful trial of hyperopt")
    except BaseException as e:
        # If there is any error in cross validation, use the accuracy based
        # on a random train-test split as the evaluation criterion
        if self.handle_cv_failure:
            X_train_part, X_validation, y_train_part, y_validation = \
                train_test_split(X_train, y_train, test_size=0.20)
            start = time.time()
            reg_trained = reg.fit(X_train_part, y_train_part)
            scorer = check_scoring(reg, scoring=self.scoring)
            cv_score = scorer(reg_trained, X_validation, y_validation)
            execution_time = time.time() - start
        else:
            logger.debug(e)
            logger.debug("Error {} with pipeline:{}".format(e, reg.to_json()))
            raise e
    return cv_score, execution_time
def score(self, X, Y, *args, **kwargs):
    scores = []
    for estimator, target in zip(self.estimators_, self.targets_):
        scorer = check_scoring(estimator, self.scoring)
        score = scorer(estimator, X, Y[target], *args, **kwargs)
        scores.append(score)
    return np.average(scores, weights=self.weights)
def __init__(self, trained_model, validation_df, features, target, scoring,
             n_jobs=None):
    self.trained_model = trained_model
    self.df = validation_df.copy()
    self.features = features
    self.target = target
    self.n_jobs = n_jobs
    self.scorer = check_scoring(estimator=self.trained_model, scoring=scoring)

    # FLOFO defaults
    self.num_bins = 10
    self.shuffle_func = np.random.permutation
    self.feature_group_len = 2

    min_data_needed = 10 * (self.num_bins ** self.feature_group_len)
    if self.df.shape[0] < min_data_needed:
        raise Exception("Small validation set (<{})".format(min_data_needed))
    if len(self.features) <= self.feature_group_len:
        raise Exception("FLOFO needs more than {} features".format(
            self.feature_group_len))

    if self.n_jobs is not None and self.n_jobs > 1:
        warning_str = ("Warning: If your model is multithreaded, please "
                       "initialise the number of jobs of LOFO to be equal to "
                       "1, otherwise you may experience issues.")
        warnings.warn(warning_str)

    self._bin_features()
def __init__(self, name, parent=None, evaluations=None, n_trials=200,
             scoring_strategy: str = 'fold',
             scoring: Union[str, Callable, None] = None,
             optuna_jobs: int = 1, **kwargs):
    """Optuna Optimization Model Feature

    Args:
        n_trials: total number of trials.
        scoring_strategy: out-of-fold scoring strategy. If set to `"fold"`,
            the score is calculated per fold and the mean is used for
            optimization. If set to `"whole"`, the score is calculated on
            the whole data.
        scoring: scoring method, a string or a scoring object. When
            optimizing parameters, the best parameters in the sense of this
            score are chosen. By default (pass None), a regression model
            uses the `RMSE` metric and a classifier model uses the
            `negative log likelihood`. (The scoring object must satisfy
            the check_scoring validation.)
        optuna_jobs: optuna parallel jobs. [NOTE] When set > 1, the pre/post
            process model fails to jit the input / target scaling class, so
            it is recommended to set it to 1.
    """
    super(TunerBlock, self).__init__(name=name, parent=parent,
                                     evaluations=evaluations)
    self.study = None  # type: Union[Study, None]
    self.n_trials = n_trials
    self.optuna_jobs = optuna_jobs

    if scoring_strategy not in self.SCORING_STRATEGY_CHOICES:
        raise ValueError('`scoring_strategy` must be in {}'.format(
            ','.join(self.SCORING_STRATEGY_CHOICES)))
    self.scoring_strategy = scoring_strategy

    if scoring is None:
        if self.is_regression_model:
            scoring = 'neg_root_mean_squared_error'
        else:
            scoring = 'neg_log_loss'
    try:
        scoring = check_scoring(self.model_class, scoring=scoring,
                                allow_none=False)
    except ValueError as e:
        s = (f'Invalid scoring argument: {scoring}. You can select a scoring '
             'method from the pre-defined ones as follows\n')
        s += ', '.join(SCORERS.keys())
        raise ValueError(s) from e
    self.scoring_method = scoring  # type: _BaseScorer
def fit(self, X, y=None, groups=None, **fit_params):
    # type: (np.ndarray, np.ndarray, np.ndarray, Any) -> 'TPESearchCV'
    """Run fit with all sets of parameters.

    Args:
        X: Training data.
        y: Target variable.
        groups: Group labels for the samples used while splitting the
            dataset into train/test set.
        **fit_params: Parameters passed to ``fit`` on the estimator.

    Returns:
        self: Return self.
    """
    self._check_params()
    self._set_verbosity()

    classifier = is_classifier(self.estimator)
    cv = check_cv(self.cv, y, classifier)

    self.n_splits_ = cv.get_n_splits(X, y, groups=groups)
    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
    self.study_ = study.create_study(load_if_exists=self.load_if_exists,
                                     pruner=self.pruner,
                                     sampler=self._sampler,
                                     storage=self.storage,
                                     study_name=self.study_name)

    objective = Objective(self.estimator, self.param_distributions, X, y,
                          cv=cv, error_score=self.error_score,
                          fit_params=fit_params, groups=groups,
                          max_iter=self.max_iter,
                          return_train_score=self.return_train_score,
                          scoring=self.scorer_)

    self.study_.optimize(objective, n_jobs=self.n_jobs,
                         n_trials=self.n_trials, timeout=self.timeout)

    if self.refit:
        self._refit(X, y, **fit_params)

    return self
def test_check_scoring_gridsearchcv():
    # test that check_scoring works on GridSearchCV and pipeline.
    # slightly redundant non-regression test.
    grid = GridSearchCV(LinearSVC(), param_grid={'C': [.1, 1]}, cv=3)
    scorer = check_scoring(grid, scoring="f1")
    assert isinstance(scorer, _PredictScorer)

    pipe = make_pipeline(LinearSVC())
    scorer = check_scoring(pipe, scoring="f1")
    assert isinstance(scorer, _PredictScorer)

    # check that cross_val_score definitely calls the scorer
    # and doesn't make any assumptions about the estimator apart from having
    # a fit.
    scores = cross_val_score(EstimatorWithFit(), [[1], [2], [3]], [1, 0, 1],
                             scoring=DummyScorer(), cv=3)
    assert_array_equal(scores, 1)
def _check_multimetric_scoring(estimator, scoring=None):
    # TODO: See if scikit-learn 0.24 solves the need for using
    # a private method
    from sklearn.metrics import check_scoring
    from sklearn.metrics._scorer import _check_multimetric_scoring

    if callable(scoring) or isinstance(scoring, (type(None), str)):
        scorers = {"score": check_scoring(estimator, scoring=scoring)}
        return scorers, False
    return _check_multimetric_scoring(estimator, scoring), True
def _validate_parameters(self, X, y):
    if (self.max_iter is not None) and self.max_iter < 1:
        raise ValueError(
            "Received max_iter={}. max_iter < 1 is not supported".format(
                self.max_iter))
    X = self._check_array(X)
    y = self._check_array(y, ensure_2d=False)
    scorer = check_scoring(self.estimator, scoring=self.scoring)
    return X, y, scorer
def _compute_scores(estimator, X_train, y_train, X_test, y_test, scoring):
    """Given a fitted estimator and a train-test split (X_train, y_train,
    X_test, y_test), compute the training scores and the test scores
    (defined by the scoring parameter) and save them in two separate
    dictionaries.

    Parameters
    ----------
    estimator : estimator object.
        This is assumed to implement the scikit-learn estimator interface.

    X_train : array-like, shape (n_samples, n_features)

    y_train : array-like, shape (n_samples, n_output)

    X_test : array-like, shape (n_samples, n_features)

    y_test : array-like, shape (n_samples, n_output)

    scoring : str, callable, dict of strings and callables
        Strategy to evaluate the performance of the estimator in the outer
        loop. A dictionary can be used for multiple scores.

    Returns
    -------
    List of two dictionaries
    """
    training_scores, test_scores = {}, {}
    if isinstance(scoring, dict):
        for k in scoring.keys():
            scorer = metrics.check_scoring(estimator, scoring[k])
            training_scores[k], test_scores[k] = scorer(
                estimator, X_train, y_train), scorer(estimator, X_test, y_test)
    elif isinstance(scoring, str):
        scorer = metrics.check_scoring(estimator, scoring)
        training_scores[scoring], test_scores[scoring] = scorer(
            estimator, X_train, y_train), scorer(estimator, X_test, y_test)
    else:
        scorer = metrics.check_scoring(estimator, scoring)
        training_scores['score'], test_scores['score'] = scorer(
            estimator, X_train, y_train), scorer(estimator, X_test, y_test)
    return [training_scores, test_scores]
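# --- Usage sketch (not from the source): a hedged example of _compute_scores
# above with a dict of scoring specs; each returned dictionary gets one entry
# per metric. Assumes the module imports sklearn's metrics as `metrics`.
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

X_demo, y_demo = make_regression(n_samples=100, noise=0.1, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, random_state=0)
est = Ridge().fit(X_tr, y_tr)
train_scores, test_scores = _compute_scores(
    est, X_tr, y_tr, X_te, y_te,
    scoring={'r2': 'r2', 'mse': 'neg_mean_squared_error'})
# train_scores == {'r2': ..., 'mse': ...}; test_scores has the same keys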
def fit(self, X, y):
    """
    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.
    y : ndarray of shape (n_samples, n_targets)
        Training data, where n_samples is the number of samples
        and n_targets is the number of target properties.
    """
    # check input parameters; this could be moved at some point to a
    # sklearn-like check function
    if self.regularization_method not in ["tikhonov", "cutoff"]:
        raise ValueError(
            f"regularization method {self.regularization_method} is not known."
        )
    if self.alpha_type not in ["absolute", "relative"]:
        raise ValueError(f"alpha type {self.alpha_type} is not known.")
    if self.alpha_type == "relative" and (np.any(self.alphas < 0) or
                                          np.any(self.alphas >= 1)):
        raise ValueError(
            "relative alphas type used, but the alphas are not within the "
            "range [0,1)"
        )

    # check_scoring uses the estimator's scoring function if the scorer is
    # None; this is intercepted here
    if self.scoring is None:
        scorer = check_scoring(self, scoring="neg_root_mean_squared_error",
                               allow_none=False)
    else:
        scorer = check_scoring(self, scoring=self.scoring, allow_none=False)
    fold1_idx, fold2_idx = next(
        KFold(n_splits=2, shuffle=self.shuffle,
              random_state=self.random_state).split(X))
    self.coef_ = self._2fold_cv(X, y, fold1_idx, fold2_idx, scorer)
    return self
def score_estimator(scoring, estimator, coordinates, data, weights=None):
    """
    Score the given gridder against the given data using the given metric.

    If the data and predictions have more than 1 component, the scores of
    each component will be averaged.

    Parameters
    ----------
    scoring : str or callable
        A scoring specification known to scikit-learn. See
        :func:`sklearn.metrics.check_scoring`.
    estimator : a Verde gridder
        The gridder to score. Usually derived from
        :class:`verde.base.BaseGridder`.
    coordinates : tuple of arrays
        Arrays with the coordinates of each data point. Should be in the
        following order: (easting, northing, vertical, ...). For the
        specific definition of coordinate systems and what these names
        mean, see the class docstring.
    data : array or tuple of arrays
        The data values of each data point. If the data has more than one
        component, *data* must be a tuple of arrays (one for each
        component).
    weights : None or array or tuple of arrays
        If not None, then the weights assigned to each data point. If more
        than one data component is provided, you must provide a weights
        array for each data component (if not None).

    Returns
    -------
    score : float
        The score.

    """
    coordinates, data, weights = check_fit_input(
        coordinates, data, weights, unpack=False
    )
    predicted = check_data(estimator.predict(coordinates))
    scorer = check_scoring(DummyEstimator, scoring=scoring)
    result = np.mean(
        [
            scorer(
                DummyEstimator(pred.ravel()),
                coordinates,
                data[i].ravel(),
                sample_weight=weights[i],
            )
            for i, pred in enumerate(predicted)
        ]
    )
    return result
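# --- Usage sketch (not from the source): a hedged example of score_estimator
# above, assuming verde's Trend gridder and the module-level helpers
# (check_fit_input, check_data, DummyEstimator) that the function relies on.
import numpy as np
import verde as vd

rng = np.random.RandomState(0)
easting, northing = rng.uniform(0, 10, 200), rng.uniform(0, 10, 200)
data = 2.0 * easting - 0.5 * northing + rng.normal(scale=0.1, size=200)
gridder = vd.Trend(degree=1).fit((easting, northing), data)
r2 = score_estimator("r2", gridder, (easting, northing), data)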
def _cross_val(self, X, y, scoring=None, cv=None, **kwargs):
    if self.backend == 'h2o':
        raise NotImplementedError('_cross_val method is not implemented for backend=`h2o`')

    def _scorer_name(s):
        # Plain strings have no __name__; use the string itself as the label.
        return (s if isinstance(s, str) else s.__name__).replace('_', ' ')

    cv = KFold(n_splits=5) if cv is None else cv
    njobs = -1 if 'n_jobs' not in kwargs else kwargs.pop('n_jobs')
    if 'return_estimator' in kwargs:
        kwargs.pop('return_estimator')
    scoring = make_scorer(mean_squared_error) if scoring is None else scoring
    if callable(scoring) or isinstance(scoring, str):
        try:
            check_scoring(self.model, scoring)
            scorers = {_scorer_name(scoring):
                       (make_scorer(scoring)
                        if isinstance(scoring, (types.FunctionType,
                                                types.BuiltinFunctionType,
                                                functools.partial))
                        else scoring)}
        except ValueError:
            scorers = {_scorer_name(scoring): make_scorer(scoring)}
    elif isinstance(scoring, (tuple, list)):
        scorers = {}
        for scorer in scoring:
            try:
                check_scoring(self.model, scorer)
                scorers[_scorer_name(scorer)] = (
                    make_scorer(scorer)
                    if isinstance(scorer, (types.FunctionType,
                                           types.BuiltinFunctionType,
                                           functools.partial))
                    else scorer)
            except ValueError:
                scorers[_scorer_name(scorer)] = make_scorer(scorer)
    else:
        raise NotImplementedError(f'Scoring of type {type(scoring)} is not supported.')
    cv_results = cross_validate(self.model, X, y=y, scoring=scorers, cv=cv,
                                n_jobs=njobs, return_estimator=True, **kwargs)
    estimators = cv_results.pop('estimator')
    cv_results = {key.split('test_')[1]: cv_results[key]
                  for key in cv_results if key.startswith('test_')}
    return estimators, cv_results
def validation_curve(estimator, X, y, param_name, param_range, groups=None,
                     cv=None, scoring=None, n_jobs=None, pre_dispatch="all",
                     verbose=0, error_score=np.nan):
    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    scorer = check_scoring(estimator, scoring=scoring)

    parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch,
                        verbose=verbose)
    out = parallel(
        delayed(_fit_and_score)(clone(estimator), X, y, scorer, train, test,
                                verbose, parameters={param_name: v},
                                fit_params=None, return_train_score=True,
                                error_score=error_score,
                                return_estimator=True, return_times=True)
        # NOTE do not change order of iteration to allow one time cv splitters
        for train, test in cv.split(X, y, groups) for v in param_range)

    out = np.asarray(out)
    estimators = out[:, 4]
    out_scores = np.asarray(out[:, :2])
    fit_time = out[:, 2]
    score_time = out[:, 3]
    n_params = len(param_range)
    n_cv_folds = out_scores.shape[0] // n_params
    out_scores = out_scores.reshape(n_cv_folds, n_params, 2).transpose((2, 1, 0))

    return estimators, np.float64(out_scores[0]), np.float64(out_scores[1]), \
        np.float64(fit_time), np.float64(score_time)
def _validate_parameters(self, X, y):
    if (self.max_iter is not None) and self.max_iter < 1:
        raise ValueError(
            "Received max_iter={}. max_iter < 1 is not supported".format(
                self.max_iter))
    # Make sure dask arrays are passed so error on unknown chunk size is
    # raised
    kwargs = dict(accept_unknown_chunks=True, accept_dask_dataframe=True)
    if not isinstance(X, dd.DataFrame):
        X = self._check_array(X, **kwargs)
    if not isinstance(y, dd.Series):
        y = self._check_array(y, ensure_2d=False, **kwargs)
    scorer = check_scoring(self.estimator, scoring=self.scoring)
    return X, y, scorer
def fit_grid_point(X, y, estimator, parameters, train, test, scorer, verbose,
                   error_score=np.nan, **fit_params):
    check_scoring(estimator, scorer)
    scores, n_samples_test = _fit_and_score(estimator, X, y, scorer, train,
                                            test, verbose, parameters,
                                            fit_params=fit_params,
                                            return_n_test_samples=True,
                                            error_score=error_score)
    return scores, parameters, n_samples_test
def get_col_score(estimator, X, y, col, n_repeats=5, scoring=None,
                  random_state=None):
    """Calculate score when `col` is permuted."""
    scorer = check_scoring(estimator, scoring=scoring)
    rstate = check_random_state(random_state)
    scores = _get_col_score(estimator, X, y, col, n_repeats, scorer, rstate)
    return scores
def get_group_score(estimator, X, y, g, n_repeats=5, scoring=None,
                    random_state=None):
    """Calculate score when columns group `g` is permuted."""
    scorer = check_scoring(estimator, scoring=scoring)
    rstate = check_random_state(random_state)
    scores = _get_group_score(estimator, X, y, g, n_repeats, scorer, rstate)
    return scores
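# --- Usage sketch (not from the source): a hedged example of get_col_score /
# get_group_score above; both depend on the module's private _get_col_score /
# _get_group_score helpers. Permuting an informative column should lower the
# score on most repeats.
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression

X_demo, y_demo = make_regression(n_samples=200, n_features=5, n_informative=2,
                                 random_state=0)
est = LinearRegression().fit(X_demo, y_demo)
col_scores = get_col_score(est, X_demo, y_demo, col=0, n_repeats=5,
                           scoring='r2', random_state=0)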
def permutation_importance(estimator, X, y, *, scoring=None, n_repeats=5,
                           n_jobs=None, random_state=None):
    if not DaskToolBox.is_dask_dataframe(X):
        return sk_inspect.permutation_importance(estimator, X, y,
                                                 scoring=scoring,
                                                 n_repeats=n_repeats,
                                                 n_jobs=n_jobs,
                                                 random_state=random_state)
    random_state = sk_utils.check_random_state(random_state)

    def shuffle_partition(df, col_idx):
        shuffling_idx = np.arange(df.shape[0])
        random_state.shuffle(shuffling_idx)
        col = df.iloc[shuffling_idx, col_idx]
        col.index = df.index
        df.iloc[:, col_idx] = col
        return df

    if DaskToolBox.is_dask_object(y):
        y = y.compute()

    scorer = sk_metrics.check_scoring(
        DaskToolBox.wrap_for_local_scorer(estimator, type_of_target(y)),
        scoring)
    baseline_score = scorer(estimator, X, y)
    scores = []

    for c in range(X.shape[1]):
        col_scores = []
        for i in range(n_repeats):
            X_permuted = X.copy().map_partitions(shuffle_partition, c)
            col_scores.append(scorer(estimator, X_permuted, y))
        if logger.is_debug_enabled():
            logger.debug(f'permuted scores [{X.columns[c]}]: {col_scores}')
        scores.append(col_scores)

    importances = baseline_score - np.array(scores)
    return sk_utils.Bunch(importances_mean=np.mean(importances, axis=1),
                          importances_std=np.std(importances, axis=1),
                          importances=importances)
def fit(self, X, y, groups=None, **fit_params):
    # type: (...) -> PermutationImportance
    """Compute ``feature_importances_`` attribute and optionally
    fit the base estimator.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The training input samples.

    y : array-like, shape (n_samples,)
        The target values (integers that correspond to classes in
        classification, real numbers in regression).

    groups : array-like, with shape (n_samples,), optional
        Group labels for the samples used while splitting the dataset into
        train/test set.

    **fit_params : Other estimator specific parameters

    Returns
    -------
    self : object
        Returns self.
    """
    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    if pandas_available and isinstance(X, pd.DataFrame):
        self.scorer_ = self._wrap_scorer(self.scorer_, X.columns)

    if self.cv != "prefit" and self.refit:
        self.estimator_ = clone(self.estimator)
        self.estimator_.fit(X, y, **fit_params)

    X = check_array(X, force_all_finite='allow-nan')

    if self.cv not in (None, "prefit"):
        si = self._cv_scores_importances(X, y, groups=groups, **fit_params)
    else:
        si = self._non_cv_scores_importances(X, y)
    scores, results = si
    self.scores_ = np.array(scores)
    self.results_ = results
    self.feature_importances_ = np.mean(results, axis=0)
    self.feature_importances_std_ = np.std(results, axis=0)
    return self
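# --- Usage sketch (not from the source): a hedged example of the eli5-style
# PermutationImportance class whose fit() is shown above. With cv="prefit"
# the wrapped estimator must already be fitted, and importances are averaged
# over the shuffles; treat names and defaults here as assumptions.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X_demo, y_demo = make_classification(n_samples=150, random_state=0)
base = LogisticRegression(max_iter=1000).fit(X_demo, y_demo)
perm = PermutationImportance(base, scoring="accuracy", cv="prefit")
perm.fit(X_demo, y_demo)
ranking = perm.feature_importances_.argsort()[::-1]  # most important first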
def fit(self, X, y):
    """Fit the static selection model by selecting an ensemble of
    classifiers containing the base classifiers with highest accuracy
    on the given dataset.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        Data used to fit the model.

    y : array of shape (n_samples)
        class labels of each example in X.

    Returns
    -------
    self : object
        Returns self.
    """
    self._validate_parameters()

    X, y = check_X_y(X, y)

    super(StaticSelection, self).fit(X, y)

    self.n_classifiers_ensemble_ = int(self.n_classifiers_ *
                                       self.pct_classifiers)

    performances = np.zeros(self.n_classifiers_)

    if not self.base_already_encoded_:
        y_encoded = y
    else:
        y_encoded = self.enc_.transform(y)

    for clf_idx, clf in enumerate(self.pool_classifiers_):
        scorer = check_scoring(clf, self.scoring)
        performances[clf_idx] = scorer(
            clf, X[:, self.estimator_features_[clf_idx]], y_encoded)

    self.clf_indices_ = np.argsort(
        performances)[::-1][0:self.n_classifiers_ensemble_]
    self.ensemble_ = [
        self.pool_classifiers_[clf_idx] for clf_idx in self.clf_indices_
    ]

    return self
def fit(self, X, y):
    """Fit the gradient boosting model.

    Parameters
    ----------
    X : array-like, shape=(n_samples, n_features)
        The input samples.

    y : array-like, shape=(n_samples,)
        Target values.

    Returns
    -------
    self : object
    """
    fit_start_time = time()
    acc_find_split_time = 0.  # time spent finding the best splits
    acc_apply_split_time = 0.  # time spent splitting nodes
    acc_compute_hist_time = 0.  # time spent computing histograms
    # time spent predicting X for gradient and hessians update
    acc_prediction_time = 0.
    X, y = check_X_y(X, y, dtype=[X_DTYPE])
    y = self._encode_y(y)
    rng = check_random_state(self.random_state)

    self._validate_parameters()
    self.n_features_ = X.shape[1]  # used for validation in predict()

    # we need this stateful variable to tell raw_predict() that it was
    # called from fit() (this current method), and that the data it has
    # received is pre-binned.
    # predicting is faster on pre-binned data, so we want early stopping
    # predictions to be made on pre-binned data. Unfortunately the scorer_
    # can only call predict() or predict_proba(), not raw_predict(), and
    # there's no way to tell the scorer that it needs to predict binned
    # data.
    self._in_fit = True

    self.loss_ = self._get_loss()

    self.do_early_stopping_ = (self.n_iter_no_change is not None and
                               self.n_iter_no_change > 0)

    # create validation data if needed
    self._use_validation_data = self.validation_fraction is not None
    if self.do_early_stopping_ and self._use_validation_data:
        # stratify for classification
        stratify = y if hasattr(self.loss_, 'predict_proba') else None

        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=self.validation_fraction, stratify=stratify,
            random_state=rng)
    else:
        X_train, y_train = X, y
        X_val, y_val = None, None

    # Bin the data
    self.bin_mapper_ = _BinMapper(max_bins=self.max_bins, random_state=rng)
    X_binned_train = self._bin_data(X_train, rng, is_training_data=True)
    if X_val is not None:
        X_binned_val = self._bin_data(X_val, rng, is_training_data=False)
    else:
        X_binned_val = None

    if self.verbose:
        print("Fitting gradient boosted rounds:")

    # initialize raw_predictions: those are the accumulated values
    # predicted by the trees for the training data. raw_predictions has
    # shape (n_trees_per_iteration, n_samples) where
    # n_trees_per_iterations is n_classes in multiclass classification,
    # else 1.
    n_samples = X_binned_train.shape[0]
    self._baseline_prediction = self.loss_.get_baseline_prediction(
        y_train, self.n_trees_per_iteration_
    )
    raw_predictions = np.zeros(
        shape=(self.n_trees_per_iteration_, n_samples),
        dtype=self._baseline_prediction.dtype
    )
    raw_predictions += self._baseline_prediction

    # initialize gradients and hessians (empty arrays).
    # shape = (n_trees_per_iteration, n_samples).
    gradients, hessians = self.loss_.init_gradients_and_hessians(
        n_samples=n_samples,
        prediction_dim=self.n_trees_per_iteration_
    )

    # predictors is a matrix (list of lists) of TreePredictor objects
    # with shape (n_iter_, n_trees_per_iteration)
    self._predictors = predictors = []

    # Initialize structures and attributes related to early stopping
    self.scorer_ = None  # set if scoring != loss
    raw_predictions_val = None  # set if scoring == loss and use val
    self.train_score_ = []
    self.validation_score_ = []

    if self.do_early_stopping_:
        # populate train_score and validation_score with the predictions
        # of the initial model (before the first tree)

        if self.scoring == 'loss':
            # we're going to compute scoring w.r.t the loss. As losses
            # take raw predictions as input (unlike the scorers), we can
            # optimize a bit and avoid repeating computing the predictions
            # of the previous trees. We'll re-use raw_predictions (as it's
            # needed for training anyway) for evaluating the training
            # loss, and create raw_predictions_val for storing the
            # raw predictions of the validation data.

            if self._use_validation_data:
                raw_predictions_val = np.zeros(
                    shape=(self.n_trees_per_iteration_,
                           X_binned_val.shape[0]),
                    dtype=self._baseline_prediction.dtype
                )
                raw_predictions_val += self._baseline_prediction

            self._check_early_stopping_loss(raw_predictions, y_train,
                                            raw_predictions_val, y_val)
        else:
            self.scorer_ = check_scoring(self, self.scoring)
            # scorer_ is a callable with signature (est, X, y) and calls
            # est.predict() or est.predict_proba() depending on its nature.
            # Unfortunately, each call to scorer_() will compute
            # the predictions of all the trees. So we use a subset of the
            # training set to compute train scores.
            subsample_size = 10000  # should we expose this parameter?
            indices = np.arange(X_binned_train.shape[0])
            if X_binned_train.shape[0] > subsample_size:
                # TODO: not critical but stratify using resample()
                indices = rng.choice(indices, subsample_size,
                                     replace=False)
            X_binned_small_train = X_binned_train[indices]
            y_small_train = y_train[indices]
            # Predicting is faster on C-contiguous arrays.
            X_binned_small_train = np.ascontiguousarray(
                X_binned_small_train)

            self._check_early_stopping_scorer(
                X_binned_small_train, y_small_train,
                X_binned_val, y_val,
            )

    for iteration in range(self.max_iter):

        if self.verbose:
            iteration_start_time = time()
            print("[{}/{}] ".format(iteration + 1, self.max_iter),
                  end='', flush=True)

        # Update gradients and hessians, inplace
        self.loss_.update_gradients_and_hessians(gradients, hessians,
                                                 y_train, raw_predictions)

        # Append a list since there may be more than 1 predictor per iter
        predictors.append([])

        # Build `n_trees_per_iteration` trees.
        for k in range(self.n_trees_per_iteration_):

            grower = TreeGrower(
                X_binned_train, gradients[k, :], hessians[k, :],
                max_bins=self.max_bins,
                actual_n_bins=self.bin_mapper_.actual_n_bins_,
                max_leaf_nodes=self.max_leaf_nodes,
                max_depth=self.max_depth,
                min_samples_leaf=self.min_samples_leaf,
                l2_regularization=self.l2_regularization,
                shrinkage=self.learning_rate)
            grower.grow()

            acc_apply_split_time += grower.total_apply_split_time
            acc_find_split_time += grower.total_find_split_time
            acc_compute_hist_time += grower.total_compute_hist_time

            predictor = grower.make_predictor(
                bin_thresholds=self.bin_mapper_.bin_thresholds_
            )
            predictors[-1].append(predictor)

            # Update raw_predictions with the predictions of the newly
            # created tree.
            tic_pred = time()
            _update_raw_predictions(raw_predictions[k, :], grower)
            toc_pred = time()
            acc_prediction_time += toc_pred - tic_pred

        should_early_stop = False
        if self.do_early_stopping_:
            if self.scoring == 'loss':
                # Update raw_predictions_val with the newest tree(s)
                if self._use_validation_data:
                    for k, pred in enumerate(self._predictors[-1]):
                        raw_predictions_val[k, :] += (
                            pred.predict_binned(X_binned_val))

                should_early_stop = self._check_early_stopping_loss(
                    raw_predictions, y_train,
                    raw_predictions_val, y_val
                )

            else:
                should_early_stop = self._check_early_stopping_scorer(
                    X_binned_small_train, y_small_train,
                    X_binned_val, y_val,
                )

        if self.verbose:
            self._print_iteration_stats(iteration_start_time)

        # maybe we could also early stop if all the trees are stumps?
        if should_early_stop:
            break

    if self.verbose:
        duration = time() - fit_start_time
        n_total_leaves = sum(
            predictor.get_n_leaf_nodes()
            for predictors_at_ith_iteration in self._predictors
            for predictor in predictors_at_ith_iteration
        )
        n_predictors = sum(
            len(predictors_at_ith_iteration)
            for predictors_at_ith_iteration in self._predictors)
        print("Fit {} trees in {:.3f} s, ({} total leaves)".format(
            n_predictors, duration, n_total_leaves))
        print("{:<32} {:.3f}s".format('Time spent computing histograms:',
                                      acc_compute_hist_time))
        print("{:<32} {:.3f}s".format('Time spent finding best splits:',
                                      acc_find_split_time))
        print("{:<32} {:.3f}s".format('Time spent applying splits:',
                                      acc_apply_split_time))
        print("{:<32} {:.3f}s".format('Time spent predicting:',
                                      acc_prediction_time))

    self.train_score_ = np.asarray(self.train_score_)
    self.validation_score_ = np.asarray(self.validation_score_)
    del self._in_fit  # hard delete so we're sure it can't be used anymore
    return self