def test_cv_iterable_wrapper():
    y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2])

    with warnings.catch_warnings(record=True):
        from sklearn.cross_validation import StratifiedKFold as OldSKF

    cv = OldSKF(y_multiclass, n_folds=3)
    wrapped_old_skf = _CVIterableWrapper(cv)

    # Check if split works correctly
    np.testing.assert_equal(list(cv), list(wrapped_old_skf.split()))

    # Check if get_n_splits works correctly
    assert_equal(len(cv), wrapped_old_skf.get_n_splits())

    kf_iter = KFold(n_splits=5).split(X, y)
    kf_iter_wrapped = check_cv(kf_iter)
    # Since the wrapped iterable is enlisted and stored,
    # split can be called any number of times to produce
    # consistent results.
    assert_array_equal(list(kf_iter_wrapped.split(X, y)),
                       list(kf_iter_wrapped.split(X, y)))
    # If the splits are randomized, successive calls to split yields different
    # results
    kf_randomized_iter = KFold(n_splits=5, shuffle=True).split(X, y)
    kf_randomized_iter_wrapped = check_cv(kf_randomized_iter)
    assert_array_equal(list(kf_randomized_iter_wrapped.split(X, y)),
                       list(kf_randomized_iter_wrapped.split(X, y)))
    assert_true(np.any(np.array(list(kf_iter_wrapped.split(X, y))) !=
                       np.array(list(kf_randomized_iter_wrapped.split(X, y)))))
def _set_cv(cv, X, y, classifier):
    """This method returns either a
    `sklearn.cross_validation._PartitionIterator` or
    `sklearn.model_selection.BaseCrossValidator` depending on whether
    sklearn-0.17 or sklearn-0.18 is being used.

    Parameters
    ----------
    cv : int, `_PartitionIterator` or `BaseCrossValidator`
        The CV object or int to check. If an int, will be converted into
        the appropriate class of crossvalidator.

    X : pd.DataFrame or np.ndarray, shape(n_samples, n_features)
        The dataframe or np.ndarray being fit in the grid search.

    y : np.ndarray, shape(n_samples,)
        The target being fit in the grid search.

    classifier : bool
        Whether the estimator being fit is a classifier.

    Returns
    -------
    `_PartitionIterator` or `BaseCrossValidator`
    """
    return check_cv(cv, X, y, classifier) if not SK18 \
        else check_cv(cv, y, classifier)
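# Minimal, hedged usage sketch of the check_cv signature the wrapper above
# dispatches to on sklearn >= 0.18 (the `SK18` flag is assumed to be a
# module-level constant; only the modern (cv, y, classifier) form is shown).
def _example_set_cv_usage():
    import numpy as np
    from sklearn.model_selection import check_cv

    y = np.array([0, 1, 0, 1, 0, 1])
    # An int is expanded into a concrete splitter: classifier=True yields a
    # StratifiedKFold, classifier=False yields a plain KFold.
    cv = check_cv(3, y, classifier=True)
    return [(train, test) for train, test in cv.split(np.zeros((6, 2)), y)]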
def _set_cv(cv, estimator=None, X=None, y=None):
    """Set the default cross-validation depending on whether clf
    is classifier or regressor."""
    from sklearn.base import is_classifier

    # Detect whether classification or regression
    if estimator in ['classifier', 'regressor']:
        est_is_classifier = estimator == 'classifier'
    else:
        est_is_classifier = is_classifier(estimator)
    # Setup CV
    if check_version('sklearn', '0.18'):
        from sklearn import model_selection as models
        from sklearn.model_selection import (check_cv, StratifiedKFold, KFold)
        if isinstance(cv, (int, np.int)):
            XFold = StratifiedKFold if est_is_classifier else KFold
            cv = XFold(n_splits=cv)
        elif isinstance(cv, str):
            if not hasattr(models, cv):
                raise ValueError('Unknown cross-validation')
            cv = getattr(models, cv)
            cv = cv()
        cv = check_cv(cv=cv, y=y, classifier=est_is_classifier)
    else:
        from sklearn import cross_validation as models
        from sklearn.cross_validation import (check_cv, StratifiedKFold, KFold)
        if isinstance(cv, (int, np.int)):
            if est_is_classifier:
                cv = StratifiedKFold(y=y, n_folds=cv)
            else:
                cv = KFold(n=len(y), n_folds=cv)
        elif isinstance(cv, str):
            if not hasattr(models, cv):
                raise ValueError('Unknown cross-validation')
            cv = getattr(models, cv)
            if cv.__name__ not in ['KFold', 'LeaveOneOut']:
                raise NotImplementedError('CV cannot be defined with str for'
                                          ' sklearn < 0.17.')
            cv = cv(len(y))
        cv = check_cv(cv=cv, X=X, y=y, classifier=est_is_classifier)

    # Extract train and test set to retrieve them at predict time
    if hasattr(cv, 'split'):
        cv_splits = [(train, test) for train, test in
                     cv.split(X=np.zeros_like(y), y=y)]
    else:
        # XXX support sklearn.cross_validation cv
        cv_splits = [(train, test) for train, test in cv]

    if not np.all([len(train) for train, _ in cv_splits]):
        raise ValueError('Some folds do not have any train epochs.')

    return cv, cv_splits
def _set_cv(cv, estimator=None, X=None, y=None):
    """Set the default CV depending on whether clf is classifier/regressor."""
    # Detect whether classification or regression
    if estimator in ['classifier', 'regressor']:
        est_is_classifier = estimator == 'classifier'
    else:
        est_is_classifier = is_classifier(estimator)
    # Setup CV
    from sklearn import model_selection as models
    from sklearn.model_selection import (check_cv, StratifiedKFold, KFold)
    if isinstance(cv, (int, np.int)):
        XFold = StratifiedKFold if est_is_classifier else KFold
        cv = XFold(n_splits=cv)
    elif isinstance(cv, str):
        if not hasattr(models, cv):
            raise ValueError('Unknown cross-validation')
        cv = getattr(models, cv)
        cv = cv()
    cv = check_cv(cv=cv, y=y, classifier=est_is_classifier)

    # Extract train and test set to retrieve them at predict time
    if hasattr(cv, 'split'):
        cv_splits = [(train, test) for train, test in
                     cv.split(X=np.zeros_like(y), y=y)]
    else:
        # XXX support sklearn.cross_validation cv
        cv_splits = [(train, test) for train, test in cv]

    if not np.all([len(train) for train, _ in cv_splits]):
        raise ValueError('Some folds do not have any train epochs.')
    return cv, cv_splits
def _check_cv(self, y):
    cv = check_cv(self.cv, y=y, classifier=is_classifier(self))
    if hasattr(cv, 'random_state') and cv.random_state is None:
        cv.random_state = np.random.RandomState()
    if hasattr(cv, 'shuffle') and self.shuffle_cv:
        cv.shuffle = True
    return cv
def __init__(
    self,
    alpha: float = 10,
    max_bins: int = 30,
    split: Tuple[Union[int, BaseCrossValidator]] = (3, 3),
):
    """
    Args:
        alpha (float): smoothing parameter for generalization.
        max_bins (int): maximum number of unique values in a feature.
        split (tuple[Union[int, BaseCrossValidator]]): tuple of ints or
            cross-validators.

            If the split length is 0, the features are encoded without
            cross-validation; in that case they will over-fit on the target.

            If the split length is 1, the features are encoded with
            cross-validation on the folds; you will not over-fit on the test
            folds, but the validation score may still over-fit.

            If the split length is 2, the data is first separated on the
            first set of folds and the features are then encoded with
            cross-validation on the second set. This is the best way to
            avoid over-fitting, but less data is used for encoding.
    """
    self.alpha = alpha
    self.max_bins = max_bins
    self.split = tuple(check_cv(x_split) for x_split in split)
    self._encodings = []  # type: List[BaseTargetEncoder]
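# Hedged sketch of how the `split` tuple above resolves: each element goes
# through sklearn's check_cv, so plain ints become KFold splitters while
# ready-made splitters pass through unchanged. The class owning this
# constructor is not shown here, so only the tuple handling is illustrated.
def _example_split_tuple():
    from sklearn.model_selection import KFold, check_cv

    split = (3, KFold(n_splits=5, shuffle=True, random_state=0))
    resolved = tuple(check_cv(x_split) for x_split in split)
    # resolved[0] is a KFold(n_splits=3); resolved[1] is the splitter passed in
    return resolved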
def cross_val_predict_proba(self, X, y, cv=None, scoring=None, **kwargs):
    """Performing cross validation hold out predictions for stacking.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_samples, n_features]
        Input feature matrix used for training and cv.

    y : array-like of shape = [n_samples, ] or [n_samples, n_classes] for Keras.
        The numerical encoded target for classification tasks.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use the default 3-fold cross validation,
        - integer, to specify the number of folds in a StratifiedKFold,
        - An object to be used as a cross-validation generator.
        - An iterable yielding train, test splits.

    scoring : callable, default: None
        A callable to evaluate the predictions on the cv set.
        None, accuracy score.

    **kwargs : default = None
        Additional fitting arguments accepted by model. Not tested.

    Returns
    -------
    array of shape = [n_samples, n_classes]
        The hold out probabilities for each class
    """
    y = self.__process_target(y)

    y_pred_proba = np.zeros((X.shape[0], self.__num_classes))

    cv = check_cv(cv, y, classifier=True)
    n_splits = cv.get_n_splits(X, y)

    if scoring is None:
        scoring = make_scorer(accuracy_score)

    i = 0
    score_mean = 0.0
    print("Starting hold out prediction with {} splits.".format(n_splits))
    for train_index, cv_index in cv.split(X, y):
        X_train = X[train_index]
        y_train = y[train_index]
        X_cv = X[cv_index]
        y_cv = y[cv_index]

        est = self.get_estimator_copy()
        est.fit(X_train, y_train, **kwargs)
        y_pred_proba_cv = est.predict_proba(X_cv)

        # score = scoring(y_cv, y_pred_proba_cv)
        # print("Train size: {} ::: cv size: {} score (fold {}/{}): {:.4f}"
        #       .format(len(train_index), len(cv_index), i + 1, n_splits, score))
        # score_mean += score / float(n_splits)

        y_pred_proba[cv_index] = y_pred_proba_cv
        i += 1

    # print("Mean score: {:.4f}".format(score_mean))
    return y_pred_proba
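# Self-contained, hedged sketch of the same hold-out probability scheme using
# only public scikit-learn APIs; the stacking class that owns
# cross_val_predict_proba is not shown here, so LogisticRegression stands in
# for self.get_estimator_copy().
def _example_out_of_fold_proba():
    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import check_cv

    rng = np.random.RandomState(0)
    X = rng.randn(30, 4)
    y = rng.randint(0, 2, size=30)

    cv = check_cv(3, y, classifier=True)
    proba = np.zeros((X.shape[0], 2))
    for train_index, cv_index in cv.split(X, y):
        est = LogisticRegression().fit(X[train_index], y[train_index])
        # Each row receives the prediction of the fold that held it out
        proba[cv_index] = est.predict_proba(X[cv_index])
    return proba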
def get_fold_splitting(self, X, y) -> Iterable:
    # If cv is an iterable object, convert it to a list and return
    if isinstance(self.cv, Iterable):
        return list(self.cv)
    if self._checked_cv is None:
        self._checked_cv = check_cv(self.cv, y, classifier=self.model_class)
    return list(self._checked_cv.split(X, y, self.groups))
def _score_lambda_path(est, X, y, sample_weight, relative_penalties, cv,
                       scoring, classifier, n_jobs, verbose):
    """Score each model found by glmnet using cross validation.

    Parameters
    ----------
    est : estimator
        The previously fitted estimator.

    X : array, shape (n_samples, n_features)
        Input features.

    y : array, shape (n_samples,)
        Target values.

    sample_weight : array, shape (n_samples,)
        Weight of each row in X.

    cv : int, cross-validation generator or iterable
        Number of folds (must be at least 3) or splitting strategy used for
        cross validation.

    scoring : string, callable or None
        Scoring method to apply to each model.

    n_jobs : int
        Maximum number of threads to use for scoring models.

    verbose : bool
        Emit logging data and warnings when True.

    classifier : boolean, optional, default False
        Whether the task is a classification task, in which case
        stratified KFold will be used.

    Returns
    -------
    scores : array, shape (n_lambda,)
        Scores for each value of lambda over all cv folds.
    """
    scorer = check_scoring(est, scoring)
    cv = check_cv(cv, y, classifier)
    cv = cv.split(X, y)

    # We score the model for every value of lambda, for classification
    # models, this will be an intercept-only model, meaning it predicts
    # the same class regardless of the input. Obviously, this makes some of
    # the scikit-learn metrics unhappy, so we are silencing these warnings.
    # Also note, catch_warnings is not thread safe.
    with warnings.catch_warnings():
        action = 'always' if verbose else 'ignore'
        warnings.simplefilter(action, UndefinedMetricWarning)

        scores = Parallel(n_jobs=n_jobs, verbose=verbose, backend='threading')(
            delayed(_fit_and_score)(est, scorer, X, y, sample_weight,
                                    relative_penalties, est.lambda_path_,
                                    train_idx, test_idx)
            for (train_idx, test_idx) in cv)

    return scores
def evaluate_candidates(self, x, y, groups, candidate_params, scorers,
                        fit_params):
    fit_and_score_kwargs = dict(scorer=scorers,
                                fit_params=fit_params,
                                return_train_score=self.return_train_score,
                                return_n_test_samples=True,
                                return_times=True,
                                return_parameters=False,
                                error_score=self.error_score,
                                verbose=self.verbose)

    cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))
    n_splits = cv.get_n_splits(x, y, groups)

    if self.verbose > 0:
        print("Fitting {0} folds for each of {1} candidates, totalling"
              " {2} fits".format(n_splits, len(candidate_params),
                                 len(candidate_params) * n_splits))

    param_grid = list(product(candidate_params, range(n_splits)))
    par_param_grid = self.sc.parallelize(
        list(zip(range(len(param_grid)), param_grid)), len(param_grid))

    x_bc = self.sc.broadcast(x)
    y_bc = self.sc.broadcast(y)
    groups_bc = self.sc.broadcast(groups)

    base_estimator = self.estimator

    def test_one_parameter(task):
        (index, (parameters, split_idx)) = task
        local_estimator = clone(base_estimator)
        local_x = x_bc.value
        local_y = y_bc.value
        local_groups = groups_bc.value
        train, test = next(islice(
            cv.split(local_x, local_y, local_groups),
            split_idx, split_idx + 1))
        res = _fit_and_score(local_estimator, local_x, local_y,
                             train=train, test=test,
                             parameters=parameters,
                             **fit_and_score_kwargs)
        return index, res

    out = dict(par_param_grid.map(test_one_parameter).collect())
    x_bc.unpersist()
    y_bc.unpersist()
    groups_bc.unpersist()

    out = [out[idx] for idx in range(len(param_grid))]

    # Warning: may not work for sklearn != 0.20.3
    results = self._format_results(candidate_params, scorers, n_splits, out)
    return results
def fit(self, X, y=None, groups=None, **fit_params):
    # type: (np.ndarray, np.ndarray, np.ndarray, Any) -> 'TPESearchCV'
    """Run fit with all sets of parameters.

    Args:
        X:
            Training data.

        y:
            Target variable.

        groups:
            Group labels for the samples used while splitting the dataset
            into train/test set.

        **fit_params:
            Parameters passed to ``fit`` on the estimator.

    Returns:
        self:
            Return self.
    """
    self._check_params()
    self._set_verbosity()

    classifier = is_classifier(self.estimator)
    cv = check_cv(self.cv, y, classifier)

    self.n_splits_ = cv.get_n_splits(X, y, groups=groups)
    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
    self.study_ = study.create_study(load_if_exists=self.load_if_exists,
                                     pruner=self.pruner,
                                     sampler=self._sampler,
                                     storage=self.storage,
                                     study_name=self.study_name)

    objective = Objective(self.estimator,
                          self.param_distributions,
                          X,
                          y,
                          cv=cv,
                          error_score=self.error_score,
                          fit_params=fit_params,
                          groups=groups,
                          max_iter=self.max_iter,
                          return_train_score=self.return_train_score,
                          scoring=self.scorer_)

    self.study_.optimize(objective,
                         n_jobs=self.n_jobs,
                         n_trials=self.n_trials,
                         timeout=self.timeout)

    if self.refit:
        self._refit(X, y, **fit_params)

    return self
def fit(self, X, y):
    X, y = indexable(X, y)
    cv = check_cv(self.fold, y, classifier=True)
    for idx_train, idx_eval in cv.split(X, y):
        X_train = X.loc[idx_train]
        y_train = y.loc[idx_train]
        X_eval = X.loc[idx_eval]
        y_eval = y.loc[idx_eval]
        self.models.append(
            self.train_func(X_train, X_eval, y_train, y_eval))
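# Hedged sketch of the fold loop above with the per-fold training call left
# out. Note that cv.split yields positional indices, so the .loc lookups in
# fit() only behave as intended when X and y carry a default RangeIndex
# (otherwise .iloc would be the safer choice); that assumption is made here.
def _example_positional_fold_loop():
    import numpy as np
    import pandas as pd
    from sklearn.model_selection import check_cv
    from sklearn.utils import indexable

    X = pd.DataFrame({"f": np.arange(12.0)})
    y = pd.Series(np.arange(12) % 2)
    X, y = indexable(X, y)
    cv = check_cv(3, y, classifier=True)
    # Three folds of 8 train / 4 eval rows each
    return [(len(tr), len(ev)) for tr, ev in cv.split(X, y)]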
def fit_transform(self, X=None, y=None, **kwargs):
    """Creates training meta-features for the stacking procedure
    and fits the base models.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_samples, n_features], default = None
        Input feature matrix used for training.

    y : array-like of shape = [n_samples, ], default = None
        The numerical encoded target for regression tasks.

    **kwargs : default = None
        Additional fitting arguments accepted by model. Not tested.

    Returns
    -------
    self.__X_meta_train : array-like or sparse matrix of shape
        = [n_samples, n_base_estimators * (n_classes - int(self.base_drop_first))]
        Training meta-features
    """
    self.__X_meta_train = None

    if X is not None and y is not None:
        cv = check_cv(self.base_cv, y, classifier=False)
        scoring = self.base_scoring

    for c, est in enumerate(self.base_estimators):

        if type(est) == tuple:
            if (self.stacking_verbose):
                print("\n" + "Loading estimator n°" + str(c + 1))
            y_pred = np.load(est[0])

        elif X is not None and y is not None:
            if (self.stacking_verbose):
                print("\n" + "Fitting estimator n°" + str(c + 1))
            y_pred = est.cross_val_predict(X, y, cv=cv, scoring=scoring,
                                           **kwargs)
            est.fit(X, y, **kwargs)
            if self.base_save:
                if self.base_save_files is not None:
                    np.save(self.base_save_files[c][0], y_pred)
                else:
                    np.save('est' + str(c) + '_train', y_pred)

        else:
            raise ValueError("X and y must be specified to fit_transform "
                             "base estimators.")

    if self.base_copy_idx is not None:
        self.__X_meta_train = np.column_stack((self.__X_meta_train,
                                               X[:, self.base_copy_idx]))

    self.__y = y
    self.__fittransformOK = True

    return self.__X_meta_train
def _fit(self, X, y):
    X, y = check_X_y(X, y, "csr")
    # Initialization
    cv = check_cv(self.cv, y, is_classifier(self.estimator))
    scorer = check_scoring(self.estimator, scoring=self.scoring)
    n_features = X.shape[1]

    estimator = clone(self.estimator)

    # Genetic Algorithm
    toolbox = base.Toolbox()

    toolbox.register("attr_bool", random.randint, 0, 1)
    toolbox.register("individual", tools.initRepeat, creator.Individual,
                     toolbox.attr_bool, n=n_features)
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)
    toolbox.register("evaluate", _evalFunction, gaobject=self,
                     estimator=estimator, X=X, y=y, cv=cv, scorer=scorer,
                     verbose=self.verbose, fit_params=self.fit_params,
                     caching=self.caching)
    toolbox.register("mate", tools.cxUniform,
                     indpb=self.crossover_independent_proba)
    toolbox.register("mutate", tools.mutFlipBit,
                     indpb=self.mutation_independent_proba)
    toolbox.register("select", tools.selTournament,
                     tournsize=self.tournament_size)

    if self.n_jobs > 1:
        pool = multiprocessing.Pool(processes=self.n_jobs)
        toolbox.register("map", pool.map)
    elif self.n_jobs < 0:
        pool = multiprocessing.Pool(
            processes=max(cpu_count() + 1 + self.n_jobs, 1))
        toolbox.register("map", pool.map)

    pop = toolbox.population(n=self.n_population)
    hof = tools.HallOfFame(1, similar=np.array_equal)
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean, axis=0)
    stats.register("std", np.std, axis=0)
    stats.register("min", np.min, axis=0)
    stats.register("max", np.max, axis=0)

    if self.verbose > 0:
        print("Selecting features with genetic algorithm.")

    algorithms.eaSimple(pop, toolbox, cxpb=self.crossover_proba,
                        mutpb=self.mutation_proba, ngen=self.n_generations,
                        stats=stats, halloffame=hof, verbose=self.verbose)

    if self.n_jobs != 1:
        pool.close()
        pool.join()

    # Set final attributes
    support_ = np.array(hof, dtype=np.bool)[0]
    self.estimator_ = clone(self.estimator)
    self.estimator_.fit(X[:, support_], y)

    self.n_features_ = support_.sum()
    self.support_ = support_

    return self
def split(x1, x2, n_folds=5):
    cv1 = check_cv(n_folds, x1)
    cv1_iter = list(cv1.split(x1, None, None))
    cv2 = check_cv(n_folds, x2)
    cv2_iter = list(cv2.split(x2, None, None))

    cv_iter = []
    for i in range(len(cv1_iter)):
        train1, test1 = cv1_iter[i]
        train2, test2 = cv2_iter[i]
        x1_train = [x1[index] for index in train1]
        x1_test = [x1[index] for index in test1]
        x2_train = [x2[index] for index in train2]
        x2_test = [x2[index] for index in test2]
        cv_iter.append((x1_train + x2_train, x1_test + x2_test))
    return cv_iter
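# Usage sketch for split() above: two lists are split with parallel fold
# plans and the per-fold item lists are concatenated. check_cv with a plain
# int returns an unshuffled KFold, so the folds follow the input order.
def _example_paired_split():
    x1 = list(range(10))
    x2 = list(range(100, 110))
    folds = split(x1, x2, n_folds=5)
    train_items, test_items = folds[0]
    # 8 + 8 train items and 2 + 2 test items in the first fold
    return len(train_items), len(test_items)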
def fit(self, X, y):
    """Fit the meta classifier to the base classifiers.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape(n_samples, n_features)
        Training vectors, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape(n_samples,)
        Labels for classification.

    Returns
    -------
    self : object
        Returns self
    """
    meta_features_list = []
    cv = check_cv(self.cv, y=y, classifier=True)

    for clf in self.base_estimators:
        # feels kind of clumsy, but we want the meta features in the
        # original ordering
        if self.probas:
            meta_features = np.zeros((y.shape[0], 2))
            pred = [(test, clf.fit(X[train], y[train]).predict_proba(X[test]))
                    for train, test in cv.split(X, y)]
        else:
            meta_features = np.zeros((y.shape[0], ))
            pred = [(test, clf.fit(X[train], y[train]).predict(X[test]))
                    for train, test in cv.split(X, y)]

        for index, y_pred in pred:
            meta_features[index] = y_pred

        meta_features_list.append(meta_features)

    all_meta_features = np.column_stack(meta_features_list)

    if self.use_orig_features:
        all_meta_features = np.hstack((X, all_meta_features))

    # train base estimators with whole training data set
    for clf in self.base_estimators:
        clf.fit(X, y)

    # train meta estimators
    self.meta_estimator.fit(all_meta_features, y)

    return self
def __init__(self, X, y, Model, cv=None, max_iter=1000, estimator=None):
    """
    Parameters
    ----------
    X : {ndarray, sparse matrix} of shape (n_samples, n_features)
        Data.
    y : {ndarray, sparse matrix} of shape (n_samples,)
        Target.
    Model : class
        The Model class definition (e.g. Lasso or SparseLogreg).
    cv : int, cross-validation generator or iterable, default=None
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use the default 5-fold cross-validation,
        - int, to specify the number of folds.
        - scikit-learn CV splitter
        - An iterable yielding (train, test) splits as arrays of indices.

        For int/None inputs, KFold is used.
    max_iter : int
        Maximal number of iterations for the state-of-the-art solver.
    estimator : instance of ``sklearn.base.BaseEstimator``
        An estimator that follows the scikit-learn API.
    """
    self.X = X
    self.y = y
    self.dict_crits = {}
    self.val_test = None
    self.rmse = None
    self.estimator = estimator

    cv = check_cv(cv)
    for i, (train, val) in enumerate(cv.split(X)):
        X_train = X[train, :]
        y_train = y[train]
        X_val = X[val, :]
        y_val = y[val]
        if issparse(X_train):
            X_train = X_train.tocsc()
        if issparse(X_val):
            X_val = X_val.tocsc()
        model = Model(
            X_train, y_train, max_iter=max_iter, estimator=estimator)
        criterion = HeldOutMSE(
            X_val, y_val, model, X_test=X_val, y_test=y_val)
        self.dict_crits[i] = criterion
    self.n_splits = cv.n_splits
    self.model = self.dict_crits[0].model
def _get_scores(X, y, groups, cv, estimator, scorer):
    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=None, verbose=False, pre_dispatch='2*n_jobs')
    scores = parallel(
        delayed(_fit_and_predict)(clone(estimator), X, y, train, test,
                                  groups, scorer)
        for train, test in cv.split(X, y, groups))
    return scores
def _check_cv(self, y):
    """Overrides base class _check_cv."""
    # Squeezed target should be 1-dimensional
    if len(y.shape) != 1:
        raise NotImplementedError("StackedClassifier does not currently "
                                  "support multi-column classification "
                                  "problems. If your target is a one-hot "
                                  "encoded multi-class problem, please "
                                  "recast it to a single column.")
    return check_cv(self.cv, y=y, classifier=True)
def __init__(self, cv=None, **kwargs):
    """
    See BaseLassoNet for the parameters

    cv : int, cross-validation generator or iterable, default=None
        Determines the cross-validation splitting strategy.
        Default is 5-fold cross-validation.
        See <https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.check_cv.html>
    """
    super().__init__(**kwargs)
    self.cv = check_cv(cv)
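# Hedged sketch of what self.cv = check_cv(cv) yields for the constructor
# above: None or an int becomes a KFold (5 splits on recent scikit-learn,
# 3 on older releases), while a custom splitter or an iterable of
# (train, test) pairs is passed through or wrapped.
def _example_cv_argument_resolution():
    from sklearn.model_selection import ShuffleSplit, check_cv

    default_cv = check_cv(None)   # KFold with the library's default n_splits
    custom_cv = check_cv(ShuffleSplit(n_splits=3, random_state=0))
    return default_cv, custom_cv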
def _fit_nuisances(self, Y, T, X=None, W=None, Z=None, sample_weight=None):
    # use a binary array to get stratified split in case of discrete treatment
    stratify = self._discrete_treatment or self._discrete_instrument

    if self._discrete_treatment:
        T = self._one_hot_encoder.fit_transform(reshape(T, (-1, 1)))

    if self._discrete_instrument:
        z_enc = LabelEncoder()
        Z = z_enc.fit_transform(Z.ravel())

        if self._discrete_treatment:  # need to stratify on combination of Z and T
            to_split = inverse_onehot(T) + Z * len(self._one_hot_encoder.categories_[0])
        else:
            to_split = Z  # just stratify on Z

        z_ohe = OneHotEncoder(categories='auto', sparse=False, drop='first')
        Z = z_ohe.fit_transform(reshape(Z, (-1, 1)))
        self.z_transformer = FunctionTransformer(
            func=_EncoderWrapper(z_ohe, z_enc).encode,
            validate=False)
    else:
        # stratify on T if discrete, and fine to pass T as second arg to
        # KFold.split even when not
        to_split = inverse_onehot(T) if self._discrete_treatment else T
        self.z_transformer = None

    if self._n_splits == 1:  # special case, no cross validation
        folds = None
    else:
        splitter = check_cv(self._n_splits, [0], classifier=stratify)
        # if check_cv produced a new KFold or StratifiedKFold object, we need
        # to set shuffle and random_state
        if splitter != self._n_splits and isinstance(splitter,
                                                     (KFold, StratifiedKFold)):
            splitter.shuffle = True
            splitter.random_state = self._random_state

        all_vars = [var if np.ndim(var) == 2 else var.reshape(-1, 1)
                    for var in [Z, W, X] if var is not None]
        if all_vars:
            all_vars = np.hstack(all_vars)
            folds = splitter.split(all_vars, to_split)
        else:
            folds = splitter.split(np.ones((T.shape[0], 1)), to_split)

    if self._discrete_treatment:
        self._d_t = shape(T)[1:]
        self.transformer = FunctionTransformer(
            func=_EncoderWrapper(self._one_hot_encoder).encode,
            validate=False)

    nuisances, fitted_models, fitted_inds, scores = _crossfit(
        self._model_nuisance, folds, Y, T, X=X, W=W, Z=Z,
        sample_weight=sample_weight)
    self._models_nuisance = fitted_models
    self.nuisance_scores_ = scores
    return nuisances, fitted_inds
def cross_val_score(estimator, X, y=None, fold_specific_X_extractor=None,
                    groups=None, scorings=None, cv=None,
                    n_jobs=1, verbose=0, fit_params=None,
                    pre_dispatch='2*n_jobs'):
    """
    :param estimator:
    :param X:
    :param y:
    :param fold_specific_X_extractor:
    :param groups:
    :param scorings: list of scorings (strings, callables, etc...)
    :param cv:
    :param n_jobs:
    :param verbose:
    :param fit_params:
    :param pre_dispatch:
    :return: an array of scores, shape: <folds x scores>
    """
    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    scorers = [check_scoring(estimator, scoring=scoring)
               for scoring in scorings]
    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)
    scores = parallel(
        delayed(_fe_fit_and_score)(
            clone(estimator), X, y, scorers, train, test, verbose, None,
            fit_params,
            fold_specific_X_extractor=fold_specific_X_extractor)
        for train, test in cv.split(X, y, groups))

    # here scores is a python list of shape <folds x 1 x scores>
    scores = np.array(scores)
    # eliminate middle axis
    return scores.reshape((scores.shape[0], scores.shape[2]))
def get_splitter(random_state=None, **params):
    '''Get cross-validation index generator

    Parameters:
        random_state: int or RandomState object
            seed for random number generator

        splitter: str
            name of the splitter (passed through params)

        params: keyword arguments
            extra parameters for the splitter

    Returns:
        estimator: object
            a BaseEstimator object
    '''
    from sklearn.model_selection import KFold, StratifiedKFold, ShuffleSplit, \
        LeaveOneOut, RepeatedKFold, RepeatedStratifiedKFold, \
        StratifiedShuffleSplit

    splitter = params.get('splitter')
    if splitter is None:
        return check_cv(**params)
    if splitter == 'KFold':
        from sklearn.model_selection import KFold
        return KFold(random_state=random_state,
                     **search_dict(params, ('n_splits', 'shuffle')))
    elif splitter == 'StratifiedKFold':
        from sklearn.model_selection import StratifiedKFold
        return StratifiedKFold(random_state=random_state,
                               **search_dict(params, ('n_splits', 'shuffle')))
    elif splitter == 'RepeatedStratifiedKFold':
        from sklearn.model_selection import RepeatedStratifiedKFold
        return RepeatedStratifiedKFold(
            random_state=random_state,
            **search_dict(params, ('n_splits', 'n_repeats')))
    elif splitter == 'ShuffleSplit':
        from sklearn.model_selection import ShuffleSplit
        return ShuffleSplit(
            random_state=random_state,
            **search_dict(params, ('n_splits', 'test_size', 'train_size')))
    elif splitter == 'StratifiedShuffleSplit':
        from sklearn.model_selection import StratifiedShuffleSplit
        return StratifiedShuffleSplit(
            random_state=random_state,
            **search_dict(params, ('n_splits', 'test_size', 'train_size')))
    elif splitter == 'LeaveOneOut':
        from sklearn.model_selection import LeaveOneOut
        return LeaveOneOut()
    elif splitter == 'FileSplitter':
        return UserFileSplitter(**search_dict(params, 'filename'))
    else:
        raise ValueError('unknown splitter: {}'.format(splitter))
def _cv_scores_importances(self, X, y, groups=None, **fit_params):
    assert self.cv is not None
    cv = check_cv(self.cv, y, is_classifier(self.estimator))
    feature_importances = []  # type: List
    base_scores = []  # type: List[float]
    for train, test in cv.split(X, y, groups):
        est = clone(self.estimator).fit(X[train], y[train], **fit_params)
        score_func = partial(self.scorer_, est)
        _base_score, _importances = self._get_score_importances(
            score_func, X[test], y[test])
        base_scores.extend([_base_score] * len(_importances))
        feature_importances.extend(_importances)
    return base_scores, feature_importances
def fit_transform(self, X: pd.DataFrame, y) -> pd.DataFrame:
    self.cbe_ = []
    cv = check_cv(self.cv)
    cbe = CatBoostEncoder(cols=X.columns.tolist(),
                          return_df=False,
                          **self.cbe_params)
    X_transformed = np.zeros_like(X, dtype=np.float64)
    for train_idx, valid_idx in cv.split(X, y):
        self.cbe_.append(clone(cbe).fit(X.loc[train_idx], y[train_idx]))
        X_transformed[valid_idx] = self.cbe_[-1].transform(
            X.loc[valid_idx])
    return pd.DataFrame(X_transformed, columns=X.columns)
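# Hedged sketch of the out-of-fold transform pattern used above, with a plain
# StandardScaler standing in for CatBoostEncoder so the snippet depends only
# on scikit-learn; the class that owns the fit_transform method is not shown.
def _example_out_of_fold_transform():
    import numpy as np
    import pandas as pd
    from sklearn.base import clone
    from sklearn.model_selection import check_cv
    from sklearn.preprocessing import StandardScaler

    X = pd.DataFrame({"a": np.arange(10.0), "b": np.arange(10.0) ** 2})
    y = np.arange(10) % 2

    cv = check_cv(5)
    scaler = StandardScaler()
    X_out = np.zeros_like(X.to_numpy(), dtype=np.float64)
    for train_idx, valid_idx in cv.split(X, y):
        # Fit on the training fold only, transform the held-out fold
        fitted = clone(scaler).fit(X.iloc[train_idx])
        X_out[valid_idx] = fitted.transform(X.iloc[valid_idx])
    return pd.DataFrame(X_out, columns=X.columns)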
def validation_curve(estimator, X, y, param_name, param_range, groups=None,
                     cv=None, scoring=None, n_jobs=None, pre_dispatch="all",
                     verbose=0, error_score=np.nan):
    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    scorer = check_scoring(estimator, scoring=scoring)

    parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch,
                        verbose=verbose)
    out = parallel(
        delayed(_fit_and_score)(clone(estimator), X, y, scorer, train, test,
                                verbose, parameters={param_name: v},
                                fit_params=None, return_train_score=True,
                                error_score=error_score,
                                return_estimator=True, return_times=True)
        # NOTE do not change order of iteration to allow one time cv splitters
        for train, test in cv.split(X, y, groups) for v in param_range)

    out = np.asarray(out)
    estimators = out[:, 4]
    out_scores = np.asarray(out[:, :2])
    fit_time = out[:, 2]
    score_time = out[:, 3]
    n_params = len(param_range)
    n_cv_folds = out_scores.shape[0] // n_params
    out_scores = out_scores.reshape(n_cv_folds, n_params, 2).transpose((2, 1, 0))

    return estimators, np.float64(out_scores[0]), np.float64(out_scores[1]), \
        np.float64(fit_time), np.float64(score_time)
def test_cv_iterable_wrapper():
    y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2])

    with warnings.catch_warnings(record=True):
        from sklearn.cross_validation import StratifiedKFold as OldSKF

    cv = OldSKF(y_multiclass, n_folds=3)
    wrapped_old_skf = _CVIterableWrapper(cv)

    # Check if split works correctly
    np.testing.assert_equal(list(cv), list(wrapped_old_skf.split()))

    # Check if get_n_splits works correctly
    assert_equal(len(cv), wrapped_old_skf.get_n_splits())

    kf_iter = KFold(n_splits=5).split(X, y)
    kf_iter_wrapped = check_cv(kf_iter)
    # Since the wrapped iterable is enlisted and stored,
    # split can be called any number of times to produce
    # consistent results.
    np.testing.assert_equal(list(kf_iter_wrapped.split(X, y)),
                            list(kf_iter_wrapped.split(X, y)))
    # If the splits are randomized, successive calls to split yields different
    # results
    kf_randomized_iter = KFold(n_splits=5, shuffle=True).split(X, y)
    kf_randomized_iter_wrapped = check_cv(kf_randomized_iter)
    np.testing.assert_equal(list(kf_randomized_iter_wrapped.split(X, y)),
                            list(kf_randomized_iter_wrapped.split(X, y)))

    try:
        np.testing.assert_equal(list(kf_iter_wrapped.split(X, y)),
                                list(kf_randomized_iter_wrapped.split(X, y)))
        splits_are_equal = True
    except AssertionError:
        splits_are_equal = False
    assert_false(splits_are_equal,
                 "If the splits are randomized, "
                 "successive calls to split should yield different results")
def fit(self, X, y=None, groups=None, **fit_params):
    """
    Run fit method with all sets of parameters

    Args
    ----
    X : array-like, shape = [n_samples, n_features]
        Training vector, where n_samples is the number of samples and
        n_features is the number of features

    y : array-like, shape = [n_samples] or [n_samples, n_output], optional
        Target relative to X for classification or regression;
        None for unsupervised learning

    groups : array-like, shape = [n_samples], optional
        Training vector groups for cross-validation

    **fit_params : dict of string -> object
        Parameters passed to the ``fit`` method of the estimator
    """
    # check estimator and cv methods are valid
    self.cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))

    # check for binary response
    if len(np.unique(y)) > 2:
        raise ValueError(
            'Only a binary response vector is currently supported')

    # check that scoring metric has been specified
    if self.scoring is None:
        raise ValueError('No score function is defined')

    # perform cross validation prediction
    self.y_pred_ = cross_val_predict(
        estimator=self.estimator, X=X, y=y, groups=groups, cv=self.cv,
        method='predict_proba', n_jobs=self.n_jobs, **fit_params)
    self.y_true = y

    # add fold id to the predictions
    self.test_idx_ = [indexes[1] for indexes in
                      self.cv.split(X, y, groups)]
def check_cv(cv: Union[int, Iterable, BaseCrossValidator] = 5,
             y: Optional[Union[pd.Series, np.ndarray]] = None,
             stratified: bool = False,
             random_state: int = 0):
    if cv is None:
        cv = 5
    if isinstance(cv, numbers.Integral):
        if stratified and (y is not None) and \
                (type_of_target(y) in ('binary', 'multiclass')):
            return StratifiedKFold(cv, shuffle=True, random_state=random_state)
        else:
            return KFold(cv, shuffle=True, random_state=random_state)

    return model_selection.check_cv(cv, y, stratified)
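# Usage sketch for the check_cv variant above: with stratified=True and a
# classification target, an int produces a shuffled StratifiedKFold; with a
# continuous target the same call falls back to a shuffled KFold.
def _example_stratified_check_cv():
    import numpy as np

    y_cls = np.array([0, 1, 0, 1, 0, 1, 0, 1])
    y_reg = np.linspace(0.0, 1.0, 8)
    cv_cls = check_cv(4, y_cls, stratified=True, random_state=42)
    cv_reg = check_cv(4, y_reg, stratified=True, random_state=42)
    # ('StratifiedKFold', 'KFold')
    return type(cv_cls).__name__, type(cv_reg).__name__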
def cv_score(self, X, y, cv=0.2, scoring='accuracy'):
    """
    Calculate validation score

    Parameters
    ----------
    X : iterable, shape (n_samples, )
        Sequence of tokenized documents

    y : iterable, shape (n_samples, )
        Sequence of labels

    cv : float, int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - float, to use a holdout set of this size
        - None, to use the default 3-fold cross validation,
        - integer, to specify the number of folds in a StratifiedKFold,
        - An object to be used as a cross-validation generator.
        - An iterable yielding train, test splits.

    scoring : string, callable or None, optional
        A string (see sklearn model evaluation documentation) or a scorer
        callable object

    Returns
    -------
    float
        Average value of the validation metrics
    """
    self._classes = sorted(np.unique(y))
    np.random.seed(self.random_state)
    if isinstance(cv, float):
        train_ind, __ = train_test_split(np.arange(0, X.shape[0]))
        test_fold = np.zeros((X.shape[0], ))
        test_fold[train_ind] = -1
        self._cv_split = PredefinedSplit(test_fold)
    else:
        self._cv_split = check_cv(cv, y=y, classifier=True)
    if scoring == 'neg_log_loss':
        scoring = make_scorer(log_loss, labels=self._classes,
                              greater_is_better=False, needs_proba=True)
    return cross_val_score(self._model, X, y, cv=self._cv_split,
                           scoring=scoring)
def _cv_scores_importances(self, X, y, groups=None, n_jobs=1, **fit_params):
    assert self.cv is not None
    cv = check_cv(self.cv, y, is_classifier(self.estimator))
    feature_importances = []  # type: List
    base_scores = []  # type: List[float]

    pool = Pool(self.n_jobs)  # , maxtasksperchild=1)
    result = pool.map(
        lambda train_test: self._parallel_cv_scores_sub(X, y, *train_test,
                                                        **fit_params),
        cv.split(X, y, groups),
        chunksize=1)

    # close and join the pools
    pool.close()
    pool.join()

    # unpack tuples to lists and flatten (list() around map() so this also
    # works on Python 3, where map returns an iterator)
    flatten = lambda z: [x for y in z for x in y]
    unpacked = list(map(list, zip(*result)))
    base_scores = flatten(unpacked[0])
    feature_importances = flatten(unpacked[1])

    return base_scores, feature_importances
def fit(self, X, y, tree="rf", recursive=True, cv=5):
    """
    Fits to the data (X) and target (y) to determine the selected_features.

    Args:
        X (pandas.DataFrame): input data, note that numpy matrix is NOT
            accepted since the X.columns is used for feature names
        y (pandas.Series or np.ndarray): list of outputs used for fitting
            the tree model
        tree (str or instantiated sklearn tree-based model): if a model is
            directly fed, it must have the .feature_importances_ attribute
        recursive (bool): whether to recursively reduce the features (True)
            or just do it once (False)
        cv (int or CrossValidation): sklearn's cross-validation with the
            same options (int or actual instantiated CrossValidation)

    Returns (None):
        sets the class attribute .selected_features
    """
    m0 = len(X.columns)
    if isinstance(tree, str):
        if tree.lower() in ["rf", "random forest", "randomforest"]:
            if self.mode.lower() in ["classification", "classifier"]:
                tree = RandomForestClassifier(random_state=self.rs)
            else:
                tree = RandomForestRegressor(random_state=self.rs)
        elif tree.lower() in ["gb", "gbt", "gradiet boosting"]:
            if self.mode.lower() in ["classification", "classifier"]:
                tree = GradientBoostingClassifier(random_state=self.rs)
            else:
                tree = GradientBoostingRegressor(random_state=self.rs)
        else:
            raise AutomatminerError(
                "Unsupported tree_type {}!".format(tree))

    cv = check_cv(cv=cv, y=y, classifier=is_classifier(tree))
    all_feats = []
    for train, _ in cv.split(X, y, groups=None):
        Xtrn = X.iloc[train]
        ytrn = y.iloc[train]
        all_feats += self.get_reduced_features(tree, Xtrn, ytrn, recursive)
    # take the union of selected features of each fold
    self.selected_features = list(set(all_feats))
    logger.info(
        self._log_prefix +
        "Finished tree-based feature reduction of {} initial features to "
        "{}".format(m0, len(self.selected_features)))
    return self
def nested_cross_validation(dataset, model, X, y, df, feature_list, impute,
                            feature_select):
    cv = StratifiedKFold(n_splits=5, shuffle=True)
    X, y, _ = indexable(X, y, None)
    cv = check_cv(cv, y, classifier=is_classifier(model))

    mean_scores = {}
    scores = {
        "acc_scores": [],
        "f1_scores": [],
        "p_scores": [],
        "r_scores": [],
        "auc_scores": [],
        "geometric_mean_scores": []
    }

    for train, test in cv.split(X, y):
        X_train = X[train]
        y_train = y[train]

        best_features = feature_assessment_and_selection(
            dataset=dataset, model=clone(model), X=X_train, y=y_train,
            df=df, feature_list=feature_list)

        X_train_reduced = df[best_features].to_numpy()[train]
        X_test = df[best_features].to_numpy()[test]
        y_test = y[test]

        estimator = clone(model)
        estimator.fit(X_train_reduced, y_train)
        y_pred = estimator.predict(X_test)

        acc_score = metrics.accuracy_score(y_pred=y_pred, y_true=y_test)
        auc_score = metrics.roc_auc_score(y_score=y_pred, y_true=y_test)
        r_score = metrics.recall_score(y_pred=y_pred, y_true=y_test)
        p_score = metrics.precision_score(y_pred=y_pred, y_true=y_test)
        f1_score = metrics.f1_score(y_pred=y_pred, y_true=y_test)
        gmean_score = geometric_mean_score(y_test, y_pred)

        scores["acc_scores"].append(acc_score)
        scores["auc_scores"].append(auc_score)
        scores["r_scores"].append(r_score)
        scores["p_scores"].append(p_score)
        scores["f1_scores"].append(f1_score)
        scores["geometric_mean_scores"].append(gmean_score)

    mean_scores["acc_scores"] = mean(scores["acc_scores"])
    mean_scores["auc_scores"] = mean(scores["auc_scores"])
    mean_scores["r_scores"] = mean(scores["r_scores"])
    mean_scores["p_scores"] = mean(scores["p_scores"])
    mean_scores["f1_scores"] = mean(scores["f1_scores"])
    mean_scores["geometric_mean_scores"] = mean(scores["geometric_mean_scores"])

    print(mean_scores)
    return
def check_cv(cv=3, y=None, classifier=False):
    """Dask aware version of ``sklearn.model_selection.check_cv``

    Same as the scikit-learn version, but works if ``y`` is a dask object.
    """
    if cv is None:
        cv = 3

    # If ``cv`` is not an integer, the scikit-learn implementation doesn't
    # touch the ``y`` object, so passing on a dask object is fine
    if not is_dask_collection(y) or not isinstance(cv, numbers.Integral):
        return model_selection.check_cv(cv, y, classifier)

    if classifier:
        # ``y`` is a dask object. We need to compute the target type
        target_type = delayed(type_of_target, pure=True)(y).compute()
        if target_type in ('binary', 'multiclass'):
            return StratifiedKFold(cv)
    return KFold(cv)
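# Hedged usage sketch for the dask-aware check_cv above: with a dask-backed
# classification target and an integer cv, the target type is computed via a
# delayed call and a StratifiedKFold is returned (assumes dask is installed).
def _example_dask_check_cv():
    import numpy as np
    import dask.array as da

    y = da.from_array(np.array([0, 1, 0, 1, 2, 1, 2, 0, 2]), chunks=3)
    cv = check_cv(3, y, classifier=True)
    return type(cv).__name__  # 'StratifiedKFold'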
def test_check_cv():
    X = np.ones(9)
    cv = check_cv(3, classifier=False)

    # Use numpy.testing.assert_equal which recursively compares
    # lists of lists
    np.testing.assert_equal(list(KFold(3).split(X)), list(cv.split(X)))

    y_binary = np.array([0, 1, 0, 1, 0, 0, 1, 1, 1])
    cv = check_cv(3, y_binary, classifier=True)
    np.testing.assert_equal(list(StratifiedKFold(3).split(X, y_binary)),
                            list(cv.split(X, y_binary)))

    y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2])
    cv = check_cv(3, y_multiclass, classifier=True)
    np.testing.assert_equal(list(StratifiedKFold(3).split(X, y_multiclass)),
                            list(cv.split(X, y_multiclass)))

    X = np.ones(5)
    y_multilabel = np.array([[0, 0, 0, 0], [0, 1, 1, 0], [0, 0, 0, 1],
                             [1, 1, 0, 1], [0, 0, 1, 0]])
    cv = check_cv(3, y_multilabel, classifier=True)
    np.testing.assert_equal(list(KFold(3).split(X)), list(cv.split(X)))

    y_multioutput = np.array([[1, 2], [0, 3], [0, 0], [3, 1], [2, 0]])
    cv = check_cv(3, y_multioutput, classifier=True)
    np.testing.assert_equal(list(KFold(3).split(X)), list(cv.split(X)))

    # Check if the old style classes are wrapped to have a split method
    X = np.ones(9)
    y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2])
    cv1 = check_cv(3, y_multiclass, classifier=True)

    with warnings.catch_warnings(record=True):
        from sklearn.cross_validation import StratifiedKFold as OldSKF

    cv2 = check_cv(OldSKF(y_multiclass, n_folds=3))
    np.testing.assert_equal(list(cv1.split(X, y_multiclass)),
                            list(cv2.split()))

    assert_raises(ValueError, check_cv, cv="lolo")
def fit(self, X, y, groups=None):
    """Fit the RFE model and automatically tune the number of selected
       features.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        Training vector, where `n_samples` is the number of samples and
        `n_features` is the total number of features.

    y : array-like, shape = [n_samples]
        Target values (integers for classification, real numbers for
        regression).

    groups : array-like, shape = [n_samples], optional
        Group labels for the samples used while splitting the dataset into
        train/test set.
    """
    if type(self.step) is not list:
        return super(DyRFECV, self).fit(X, y, groups)

    X, y = check_X_y(X, y, "csr")

    # Initialization
    cv = check_cv(self.cv, y, is_classifier(self.estimator))
    scorer = check_scoring(self.estimator, scoring=self.scoring)
    n_features = X.shape[1]

    step = []
    for s in self.step:
        if 0.0 < s < 1.0:
            step.append(int(max(1, s * n_features)))
        else:
            step.append(int(s))
        if s <= 0:
            raise ValueError("Step must be >0")

    # Build an RFE object, which will evaluate and score each possible
    # feature count, down to self.min_features_to_select
    rfe = DyRFE(estimator=self.estimator,
                n_features_to_select=self.min_features_to_select,
                step=self.step, verbose=self.verbose)

    # Determine the number of subsets of features by fitting across
    # the train folds and choosing the "features_to_select" parameter
    # that gives the least averaged error across all folds.

    # Note that joblib raises a non-picklable error for bound methods
    # even if n_jobs is set to 1 with the default multiprocessing
    # backend.
    # This branching is done so that to
    # make sure that user code that sets n_jobs to 1
    # and provides bound methods as scorers is not broken with the
    # addition of n_jobs parameter in version 0.18.
    if effective_n_jobs(self.n_jobs) == 1:
        parallel, func = list, _rfe_single_fit
    else:
        parallel = Parallel(n_jobs=self.n_jobs)
        func = delayed(_rfe_single_fit)

    scores = parallel(
        func(rfe, self.estimator, X, y, train, test, scorer)
        for train, test in cv.split(X, y, groups))

    scores = np.sum(scores, axis=0)
    diff = int(scores.shape[0]) - len(step)
    if diff > 0:
        step = np.r_[step, [step[-1]] * diff]
    scores_rev = scores[::-1]
    argmax_idx = len(scores) - np.argmax(scores_rev) - 1
    n_features_to_select = max(
        n_features - sum(step[:argmax_idx]),
        self.min_features_to_select)

    # Re-execute an elimination with best_k over the whole set
    rfe = DyRFE(estimator=self.estimator,
                n_features_to_select=n_features_to_select,
                step=self.step, verbose=self.verbose)

    rfe.fit(X, y)

    # Set final attributes
    self.support_ = rfe.support_
    self.n_features_ = rfe.n_features_
    self.ranking_ = rfe.ranking_
    self.estimator_ = clone(self.estimator)
    self.estimator_.fit(self.transform(X), y)

    # Fixing a normalization error, n is equal to get_n_splits(X, y) - 1
    # here, the scores are normalized by get_n_splits(X, y)
    self.grid_scores_ = scores[::-1] / cv.get_n_splits(X, y, groups)
    return self
def fit_and_score_estimator(estimator, parameters, cv, X, y=None, scoring=None,
                            iid=True, n_jobs=1, verbose=1,
                            pre_dispatch='2*n_jobs'):
    """Fit and score an estimator with cross-validation

    This function is basically a copy of sklearn's
    grid_search._BaseSearchCV._fit(), which is the core of the GridSearchCV
    fit() method. Unfortunately, that class does _not_ return the training
    set scores, which we want to save in the database, and because of the
    way it's written, you can't change it by subclassing or monkeypatching.

    This function uses some undocumented internal sklearn APIs (non-public).
    It was written against sklearn version 0.16.1. Prior versions are likely
    to fail due to changes in the design of the cross_validation module.

    Returns
    -------
    out : dict, with keys 'mean_test_score', 'test_scores', 'train_scores'
        The scores on the training and test sets, as well as the mean test set
        score.
    """
    scorer = check_scoring(estimator, scoring=scoring)
    n_samples = num_samples(X)
    X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr',
                        allow_nans=True)
    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))
    cv = check_cv(cv=cv, y=y, classifier=is_classifier(estimator))

    out = Parallel(
        n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch
    )(
        delayed(_fit_and_score)(clone(estimator), X, y, scorer,
                                train, test, verbose, parameters,
                                fit_params=None)
        for train, test in cv.split(X, y))

    assert len(out) == cv.n_splits
    train_scores, test_scores = [], []
    n_train_samples, n_test_samples = [], []
    for test_score, n_test, train_score, n_train, _ in out:
        train_scores.append(train_score)
        test_scores.append(test_score)
        n_test_samples.append(n_test)
        n_train_samples.append(n_train)

    train_scores, test_scores = map(list,
                                    check_arrays(train_scores, test_scores,
                                                 warn_nans=True,
                                                 replace_nans=True))

    if iid:
        if verbose > 0 and is_msmbuilder_estimator(estimator):
            print('[CV] Using MSMBuilder API n_samples averaging')
            print('[CV] n_train_samples: %s' % str(n_train_samples))
            print('[CV] n_test_samples: %s' % str(n_test_samples))
        mean_test_score = np.average(test_scores, weights=n_test_samples)
        mean_train_score = np.average(train_scores, weights=n_train_samples)
    else:
        mean_test_score = np.average(test_scores)
        mean_train_score = np.average(train_scores)

    grid_scores = {
        'mean_test_score': mean_test_score, 'test_scores': test_scores,
        'mean_train_score': mean_train_score, 'train_scores': train_scores,
        'n_test_samples': n_test_samples, 'n_train_samples': n_train_samples}
    return grid_scores
def _fit(self, X, y):
    X, y = check_X_y(X, y, "csr")
    # Initialization
    cv = check_cv(self.cv, y, is_classifier(self.estimator))
    scorer = check_scoring(self.estimator, scoring=self.scoring)
    n_features = X.shape[1]

    if self.max_features is not None:
        if not isinstance(self.max_features, numbers.Integral):
            raise TypeError("'max_features' should be an integer between 1"
                            " and {} features. Got {!r} instead."
                            .format(n_features, self.max_features))
        elif self.max_features < 1 or self.max_features > n_features:
            raise ValueError("'max_features' should be between 1 and {}"
                             " features. Got {} instead."
                             .format(n_features, self.max_features))
        max_features = self.max_features
    else:
        max_features = n_features

    if not isinstance(self.n_gen_no_change,
                      (numbers.Integral, np.integer, type(None))):
        raise ValueError("'n_gen_no_change' should either be None or an"
                         " integer. {} was passed."
                         .format(self.n_gen_no_change))

    estimator = clone(self.estimator)

    # Genetic Algorithm
    toolbox = base.Toolbox()

    toolbox.register("attr_bool", random.randint, 0, 1)
    toolbox.register("individual", tools.initRepeat,
                     creator.Individual, toolbox.attr_bool, n=n_features)
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)
    toolbox.register("evaluate", _evalFunction, gaobject=self,
                     estimator=estimator, X=X, y=y, cv=cv, scorer=scorer,
                     verbose=self.verbose, fit_params=self.fit_params,
                     max_features=max_features, caching=self.caching)
    toolbox.register("mate", tools.cxUniform,
                     indpb=self.crossover_independent_proba)
    toolbox.register("mutate", tools.mutFlipBit,
                     indpb=self.mutation_independent_proba)
    toolbox.register("select", tools.selTournament,
                     tournsize=self.tournament_size)

    if self.n_jobs == 0:
        raise ValueError("n_jobs == 0 has no meaning.")
    elif self.n_jobs > 1:
        pool = multiprocessing.Pool(processes=self.n_jobs)
        toolbox.register("map", pool.map)
    elif self.n_jobs < 0:
        pool = multiprocessing.Pool(
            processes=max(cpu_count() + 1 + self.n_jobs, 1))
        toolbox.register("map", pool.map)

    pop = toolbox.population(n=self.n_population)
    hof = tools.HallOfFame(1, similar=np.array_equal)
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean, axis=0)
    stats.register("std", np.std, axis=0)
    stats.register("min", np.min, axis=0)
    stats.register("max", np.max, axis=0)

    if self.verbose > 0:
        print("Selecting features with genetic algorithm.")

    _, log = _eaFunction(pop, toolbox, cxpb=self.crossover_proba,
                         mutpb=self.mutation_proba, ngen=self.n_generations,
                         ngen_no_change=self.n_gen_no_change,
                         stats=stats, halloffame=hof, verbose=self.verbose)
    if self.n_jobs != 1:
        pool.close()
        pool.join()

    # Set final attributes
    support_ = np.array(hof, dtype=np.bool)[0]
    self.estimator_ = clone(self.estimator)
    self.estimator_.fit(X[:, support_], y)

    self.generation_scores_ = np.array(
        [score for score, _ in log.select("max")])
    self.n_features_ = support_.sum()
    self.support_ = support_

    return self
def _fit(self, X, y, groups, parameter_iterable):
    estimator = self.estimator
    cv = check_cv(self.cv, y, classifier=is_classifier(estimator))
    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    X, y, groups = indexable(X, y, groups)
    n_splits = cv.get_n_splits(X, y, groups)
    if self.verbose > 0 and isinstance(parameter_iterable, Sized):
        n_candidates = len(parameter_iterable)
        print("Fitting {0} folds for each of {1} candidates, totalling"
              " {2} fits".format(n_splits, n_candidates,
                                 n_candidates * n_splits))

    base_estimator = clone(self.estimator)

    param_grid = [(parameters, train, test)
                  for parameters in parameter_iterable
                  for train, test in list(cv.split(X, y, groups))]
    # Because the original python code expects a certain order for the
    # elements, we need to respect it.
    indexed_param_grid = list(zip(range(len(param_grid)), param_grid))
    par_param_grid = self.sc.parallelize(indexed_param_grid,
                                         len(indexed_param_grid))
    X_bc = self.sc.broadcast(X)
    y_bc = self.sc.broadcast(y)

    scorer = self.scorer_
    verbose = self.verbose
    error_score = self.error_score
    fit_params = self.fit_params
    return_train_score = self.return_train_score
    fas = _fit_and_score

    def fun(tup):
        (index, (parameters, train, test)) = tup
        local_estimator = clone(base_estimator)
        local_X = X_bc.value
        local_y = y_bc.value
        res = fas(local_estimator, local_X, local_y, scorer, train, test,
                  verbose, parameters, fit_params,
                  return_train_score=return_train_score,
                  return_n_test_samples=True, return_times=True,
                  return_parameters=True, error_score=error_score)
        return (index, res)

    indexed_out0 = dict(par_param_grid.map(fun).collect())
    out = [indexed_out0[idx] for idx in range(len(param_grid))]
    if return_train_score:
        (train_scores, test_scores, test_sample_counts, fit_time,
         score_time, parameters) = zip(*out)
    else:
        (test_scores, test_sample_counts, fit_time,
         score_time, parameters) = zip(*out)

    X_bc.unpersist()
    y_bc.unpersist()

    candidate_params = parameters[::n_splits]
    n_candidates = len(candidate_params)

    results = dict()

    def _store(key_name, array, weights=None, splits=False, rank=False):
        """A small helper to store the scores/times to the cv_results_"""
        # When iterated first by splits, then by parameters
        array = np.array(array, dtype=np.float64).reshape(n_candidates,
                                                          n_splits)
        if splits:
            for split_i in range(n_splits):
                results["split%d_%s" % (split_i, key_name)] = array[:, split_i]

        array_means = np.average(array, axis=1, weights=weights)
        results['mean_%s' % key_name] = array_means
        # Weighted std is not directly available in numpy
        array_stds = np.sqrt(np.average((array -
                                         array_means[:, np.newaxis]) ** 2,
                                        axis=1, weights=weights))
        results['std_%s' % key_name] = array_stds

        if rank:
            results["rank_%s" % key_name] = np.asarray(
                rankdata(-array_means, method='min'), dtype=np.int32)

    # Computed the (weighted) mean and std for test scores alone
    # NOTE test_sample counts (weights) remain the same for all candidates
    test_sample_counts = np.array(test_sample_counts[:n_splits],
                                  dtype=np.int)

    _store('test_score', test_scores, splits=True, rank=True,
           weights=test_sample_counts if self.iid else None)
    if self.return_train_score:
        _store('train_score', train_scores, splits=True)
    _store('fit_time', fit_time)
    _store('score_time', score_time)

    best_index = np.flatnonzero(results["rank_test_score"] == 1)[0]
    best_parameters = candidate_params[best_index]

    # Use one MaskedArray and mask all the places where the param is not
    # applicable for that candidate. Use defaultdict as each candidate may
    # not contain all the params
    param_results = defaultdict(partial(MaskedArray,
                                        np.empty(n_candidates,),
                                        mask=True,
                                        dtype=object))
    for cand_i, params in enumerate(candidate_params):
        for name, value in params.items():
            # An all masked empty array gets created for the key
            # `"param_%s" % name` at the first occurence of `name`.
            # Setting the value at an index also unmasks that index
            param_results["param_%s" % name][cand_i] = value

    results.update(param_results)

    # Store a list of param dicts at the key 'params'
    results['params'] = candidate_params

    self.cv_results_ = results
    self.best_index_ = best_index
    self.n_splits_ = n_splits

    if self.refit:
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(**best_parameters)
        if y is not None:
            best_estimator.fit(X, y, **fit_params)
        else:
            best_estimator.fit(X, **fit_params)
        self.best_estimator_ = best_estimator
    return self
def cross_val_multiscore(estimator, X, y=None, groups=None, scoring=None,
                         cv=None, n_jobs=1, verbose=0, fit_params=None,
                         pre_dispatch='2*n_jobs'):
    """Evaluate a score by cross-validation.

    Parameters
    ----------
    estimator : instance of sklearn.base.BaseEstimator
        The object to use to fit the data.
        Must implement the 'fit' method.
    X : array-like, shape (n_samples, n_dimensional_features,)
        The data to fit. Can be, for example a list, or an array at
        least 2d.
    y : array-like, shape (n_samples, n_targets,)
        The target variable to try to predict in the case of
        supervised learning.
    groups : array-like, with shape (n_samples,)
        Group labels for the samples used while splitting the dataset into
        train/test set.
    scoring : string, callable | None
        A string (see model evaluation documentation) or
        a scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
    cv : int, cross-validation generator | iterable
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use the default 3-fold cross validation,
        - integer, to specify the number of folds in a ``(Stratified)KFold``,
        - An object to be used as a cross-validation generator.
        - An iterable yielding train, test splits.

        For integer/None inputs, if the estimator is a classifier and ``y``
        is either binary or multiclass,
        :class:`sklearn.model_selection.StratifiedKFold` is used. In all
        other cases, :class:`sklearn.model_selection.KFold` is used.
    n_jobs : int, optional
        The number of CPUs to use to do the computation. -1 means
        'all CPUs'.
    verbose : int, optional
        The verbosity level.
    fit_params : dict, optional
        Parameters to pass to the fit method of the estimator.
    pre_dispatch : int, or string, optional
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. This parameter can be:

        - None, in which case all the jobs are immediately
          created and spawned. Use this for lightweight and
          fast-running jobs, to avoid delays due to on-demand
          spawning of the jobs
        - An int, giving the exact number of total jobs that are
          spawned
        - A string, giving an expression as a function of n_jobs,
          as in '2*n_jobs'

    Returns
    -------
    scores : array of float, shape (n_splits,) | shape (n_splits, n_scores)
        Array of scores of the estimator for each run of the cross validation.
    """
    # This code is copied from sklearn
    from sklearn.base import clone
    from sklearn.utils import indexable
    from sklearn.metrics.scorer import check_scoring
    from sklearn.model_selection._split import check_cv

    X, y, groups = indexable(X, y, groups)
    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    cv_iter = list(cv.split(X, y, groups))
    scorer = check_scoring(estimator, scoring=scoring)
    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    # Note: this parallelization is implemented using MNE Parallel
    parallel, p_func, n_jobs = parallel_func(_fit_and_score, n_jobs,
                                             pre_dispatch=pre_dispatch)
    scores = parallel(p_func(clone(estimator), X, y, scorer, train, test,
                             verbose, None, fit_params)
                      for train, test in cv_iter)
    return np.array(scores)[:, 0, ...]  # flatten over joblib output.
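# Hedged usage sketch for cross_val_multiscore above: a plain scikit-learn
# classifier scored over a 3-fold split, returning one score per fold
# (assumes the surrounding module's MNE parallel helpers are importable).
def _example_cross_val_multiscore():
    import numpy as np
    from sklearn.linear_model import LogisticRegression

    rng = np.random.RandomState(0)
    X = rng.randn(30, 5)
    y = rng.randint(0, 2, size=30)
    scores = cross_val_multiscore(LogisticRegression(), X, y, cv=3)
    return scores.shape  # (3,)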
def fit(self, X, y=None, groups=None, **fit_params):
    """Run fit with all sets of parameters.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    y : array-like, shape = [n_samples] or [n_samples, n_output], optional
        Target relative to X for classification or regression;
        None for unsupervised learning.
    groups : array-like, with shape (n_samples,), optional
        Group labels for the samples used while splitting the dataset
        into train/test set.
    **fit_params : dict of string -> object
        Parameters passed to the ``fit`` method of the estimator
    """
    if self.fit_params is not None:
        warnings.warn('"fit_params" as a constructor argument was '
                      'deprecated in version 0.19 and will be removed '
                      'in version 0.21. Pass fit parameters to the '
                      '"fit" method instead.', DeprecationWarning)
        if fit_params:
            warnings.warn('Ignoring fit_params passed as a constructor '
                          'argument in favor of keyword arguments to '
                          'the "fit" method.', RuntimeWarning)
        else:
            fit_params = self.fit_params
    estimator = self.estimator
    cv = check_cv(self.cv, y, classifier=is_classifier(estimator))

    scorers, self.multimetric_ = _check_multimetric_scoring(
        self.estimator, scoring=self.scoring)

    if self.multimetric_:
        if self.refit is not False and (
                not isinstance(self.refit, six.string_types) or
                # This will work for both dict / list (tuple)
                self.refit not in scorers):
            raise ValueError("For multi-metric scoring, the parameter "
                             "refit must be set to a scorer key "
                             "to refit an estimator with the best "
                             "parameter setting on the whole data and "
                             "make the best_* attributes "
                             "available for that metric. If this is not "
                             "needed, refit should be set to False "
                             "explicitly. %r was passed." % self.refit)
        else:
            refit_metric = self.refit
    else:
        refit_metric = 'score'

    # X, y, groups = indexable(X, y, groups)
    if groups is not None:
        raise NotImplementedError("groups are not supported")
    # n_splits = cv.get_n_splits(X, y, groups)
    n_splits = min(cv.get_n_splits(X_.transpose(1, 2, 0), y_, None)
                   for X_, y_ in zip(X, y))

    def generate_index(X_list, y_list):
        split = [cv.split(X.transpose(1, 2, 0), y)
                 for X, y in zip(X_list, y_list)]
        for i in range(n_splits):
            yield zip(*[next(s) for s in split])

    generate_index_iter = generate_index(X, y)

    # Regenerate parameter iterable for each fit
    candidate_params = list(self._get_param_iterator())
    n_candidates = len(candidate_params)
    if self.verbose > 0:
        print("Fitting {0} folds for each of {1} candidates, totalling"
              " {2} fits".format(n_splits, n_candidates,
                                 n_candidates * n_splits))

    base_estimator = clone(self.estimator)
    pre_dispatch = self.pre_dispatch

    out = Parallel(
        n_jobs=self.n_jobs, verbose=self.verbose,
        pre_dispatch=pre_dispatch
    )(delayed(_fit_and_score)(clone(base_estimator), X, y, scorers, train,
                              test, self.verbose, parameters,
                              fit_params=fit_params,
                              return_train_score=self.return_train_score,
                              return_n_test_samples=True,
                              return_times=True, return_parameters=False,
                              error_score=self.error_score,
                              return_estimator=True, return_idx=True)
      for parameters, (train, test) in product(candidate_params,
                                               generate_index_iter))

    # if one chooses to see train scores, "out" will contain train score info
    if self.return_train_score:
        (train_score_dicts, test_score_dicts, test_sample_counts, fit_time,
         score_time, estimators, train_idxs, test_idxs) = zip(*out)
    else:
        (test_score_dicts, test_sample_counts, fit_time, score_time,
         estimators, train_idxs, test_idxs) = zip(*out)

    # test_score_dicts and train_score_dicts are lists of dictionaries and
    # we make them into dict of lists
    test_scores = _aggregate_score_dicts(test_score_dicts)
    if self.return_train_score:
        train_scores = _aggregate_score_dicts(train_score_dicts)

    # TODO: replace by a dict in 0.21
    results = (DeprecationDict() if self.return_train_score == 'warn'
               else {})

    def _store(key_name, array, weights=None, splits=False, rank=False):
        """Store the scores/times to the cv_results_."""
        # When iterated first by splits, then by parameters
        # We want `array` to have `n_candidates` rows and `n_splits` cols.
        array = np.array(array, dtype=np.float64).reshape(n_candidates,
                                                          n_splits)
        if splits:
            for split_i in range(n_splits):
                # Uses closure to alter the results
                results["split%d_%s"
                        % (split_i, key_name)] = array[:, split_i]

        array_means = np.average(array, axis=1, weights=weights)
        results['mean_%s' % key_name] = array_means
        # Weighted std is not directly available in numpy
        array_stds = np.sqrt(np.average((array -
                                         array_means[:, np.newaxis]) ** 2,
                                        axis=1, weights=weights))
        results['std_%s' % key_name] = array_stds

        if rank:
            results["rank_%s" % key_name] = np.asarray(
                rankdata(-array_means, method='min'), dtype=np.int32)

    _store('fit_time', fit_time)
    _store('score_time', score_time)
    results['estimators'] = estimators
    results['train_index'] = train_idxs
    results['test_index'] = test_idxs
    # Use one MaskedArray and mask all the places where the param is not
    # applicable for that candidate. Use defaultdict as each candidate may
    # not contain all the params
    param_results = defaultdict(partial(MaskedArray,
                                        np.empty(n_candidates,),
                                        mask=True,
                                        dtype=object))
    for cand_i, params in enumerate(candidate_params):
        for name, value in params.items():
            # An all masked empty array gets created for the key
            # `"param_%s" % name` at the first occurrence of `name`.
            # Setting the value at an index also unmasks that index
            param_results["param_%s" % name][cand_i] = value

    results.update(param_results)
    # Store a list of param dicts at the key 'params'
    results['params'] = candidate_params

    # NOTE test_sample counts (weights) remain the same for all candidates
    test_sample_counts = np.array(test_sample_counts[:n_splits],
                                  dtype=np.int)
    for scorer_name in scorers.keys():
        # Compute the (weighted) mean and std for test scores alone
        _store('test_%s' % scorer_name, test_scores[scorer_name],
               splits=True, rank=True,
               weights=test_sample_counts if self.iid else None)
        if self.return_train_score:
            prev_keys = set(results.keys())
            _store('train_%s' % scorer_name, train_scores[scorer_name],
                   splits=True)

            if self.return_train_score == 'warn':
                for key in set(results.keys()) - prev_keys:
                    message = (
                        'You are accessing a training score ({!r}), '
                        'which will not be available by default '
                        'any more in 0.21. If you need training scores, '
                        'please set return_train_score=True').format(key)
                    # warn on key access
                    results.add_warning(key, message, FutureWarning)

    # For multi-metric evaluation, store the best_index_, best_params_ and
    # best_score_ iff refit is one of the scorer names
    # In single metric evaluation, refit_metric is "score"
    if self.refit or not self.multimetric_:
        self.best_index_ = results["rank_test_%s" % refit_metric].argmin()
        self.best_params_ = candidate_params[self.best_index_]
        self.best_score_ = results["mean_test_%s" % refit_metric][
            self.best_index_]

    if self.refit:
        self.best_estimator_ = clone(base_estimator).set_params(
            **self.best_params_)
        if y is not None:
            self.best_estimator_.fit(X, y, **fit_params)
        else:
            self.best_estimator_.fit(X, **fit_params)

    # Store the only scorer not as a dict for single metric evaluation
    self.scorer_ = scorers if self.multimetric_ else scorers['score']

    self.cv_results_ = results
    self.n_splits_ = n_splits

    return self
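# Illustrative sketch (not from the original source) of the synchronized
# per-dataset splitting used by generate_index() above: one CV splitter per
# dataset, advanced in lockstep so fold i pairs up across datasets. The
# dataset sizes and the KFold choice here are assumptions for demonstration.
import numpy as np
from sklearn.model_selection import KFold

y_list = [np.arange(10), np.arange(12)]          # two hypothetical datasets
X_list = [np.random.randn(len(y_), 4) for y_ in y_list]

cv_demo = KFold(n_splits=3)
splitters = [cv_demo.split(X_, y_) for X_, y_ in zip(X_list, y_list)]
for fold in range(3):
    trains, tests = zip(*[next(s) for s in splitters])
    # trains[k] / tests[k] index into X_list[k] for this fold
    print([len(t) for t in trains], [len(t) for t in tests])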
def fit(self, subjects, y=None):
    """Compute cross-validated group-sparse precisions.

    Parameters
    ----------
    subjects : list of numpy.ndarray with shapes (n_samples, n_features)
        input subjects. Each subject is a 2D array, whose columns contain
        signals. Sample number can vary from subject to subject, but all
        subjects must have the same number of features (i.e. of columns.)

    Returns
    -------
    self: GroupSparseCovarianceCV
        the object instance itself.
    """
    # Empirical covariances
    emp_covs, n_samples = \
        empirical_covariances(subjects, assume_centered=False)
    n_subjects = emp_covs.shape[2]

    # One cv generator per subject must be created, because each subject
    # can have a different number of samples from the others.
    cv = []
    for k in range(n_subjects):
        cv.append(check_cv(
            self.cv, np.ones(subjects[k].shape[0]),
            classifier=False
        ).split(subjects[k]))
    path = list()  # List of (alpha, scores, covs)
    n_alphas = self.alphas

    if isinstance(n_alphas, collections.Sequence):
        alphas = list(self.alphas)
        n_alphas = len(alphas)
        n_refinements = 1
    else:
        n_refinements = self.n_refinements
        alpha_1, _ = compute_alpha_max(emp_covs, n_samples)
        alpha_0 = 1e-2 * alpha_1
        alphas = np.logspace(np.log10(alpha_0), np.log10(alpha_1),
                             n_alphas)[::-1]

    covs_init = itertools.repeat(None)

    # Copying the cv generators to use them n_refinements times.
    cv_ = izip(*cv)

    for i, (this_cv) in enumerate(itertools.tee(cv_, n_refinements)):
        # Compute the cross-validated loss on the current grid
        train_test_subjs = []
        for train_test in this_cv:
            assert len(train_test) == n_subjects
            train_test_subjs.append(list(zip(*[
                (subject[train, :], subject[test, :])
                for subject, (train, test) in zip(subjects, train_test)])))
        if self.early_stopping:
            probes = [EarlyStopProbe(test_subjs,
                                     verbose=max(0, self.verbose - 1))
                      for _, test_subjs in train_test_subjs]
        else:
            probes = itertools.repeat(None)

        this_path = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
            delayed(group_sparse_covariance_path)(
                train_subjs, alphas, test_subjs=test_subjs,
                max_iter=self.max_iter_cv, tol=self.tol_cv,
                verbose=max(0, self.verbose - 1), debug=self.debug,
                # Warm restart is useless with early stopping.
                precisions_init=None if self.early_stopping else prec_init,
                probe_function=probe)
            for (train_subjs, test_subjs), prec_init, probe
            in zip(train_test_subjs, covs_init, probes))

        # this_path[i] is a tuple (precisions_list, scores)
        # - scores: scores obtained with the i-th fold, for each value
        #   of alpha.
        # - precisions_list: corresponding precisions matrices, for each
        #   value of alpha.
        precisions_list, scores = list(zip(*this_path))
        # now scores[i][j] is the score for the i-th fold, j-th value of
        # alpha (analogous for precisions_list)
        precisions_list = list(zip(*precisions_list))
        scores = [np.mean(sc) for sc in zip(*scores)]
        # scores[i] is the mean score obtained for the i-th value of alpha.

        path.extend(list(zip(alphas, scores, precisions_list)))
        path = sorted(path, key=operator.itemgetter(0), reverse=True)

        # Find the maximum score (avoid using the built-in 'max' function
        # to have a fully-reproducible selection of the smallest alpha in
        # case of equality)
        best_score = -np.inf
        last_finite_idx = 0
        for index, (alpha, this_score, _) in enumerate(path):
            if this_score >= .1 / np.finfo(np.float).eps:
                this_score = np.nan
            if np.isfinite(this_score):
                last_finite_idx = index
            if this_score >= best_score:
                best_score = this_score
                best_index = index

        # Refine the grid
        if best_index == 0:
            # We do not need to go back: we have chosen
            # the highest value of alpha for which there are
            # non-zero coefficients
            alpha_1 = path[0][0]
            alpha_0 = path[1][0]
            covs_init = path[0][2]
        elif (best_index == last_finite_idx
                and not best_index == len(path) - 1):
            # We have non-converged models on the upper bound of the
            # grid, we need to refine the grid there
            alpha_1 = path[best_index][0]
            alpha_0 = path[best_index + 1][0]
            covs_init = path[best_index][2]
        elif best_index == len(path) - 1:
            alpha_1 = path[best_index][0]
            alpha_0 = 0.01 * path[best_index][0]
            covs_init = path[best_index][2]
        else:
            alpha_1 = path[best_index - 1][0]
            alpha_0 = path[best_index + 1][0]
            covs_init = path[best_index - 1][2]

        alphas = np.logspace(np.log10(alpha_1), np.log10(alpha_0),
                             len(alphas) + 2)
        alphas = alphas[1:-1]
        if n_refinements > 1:
            logger.log("[GroupSparseCovarianceCV] Done refinement "
                       "% 2i out of %i" % (i + 1, n_refinements),
                       verbose=self.verbose)

    path = list(zip(*path))
    cv_scores_ = list(path[1])
    alphas = list(path[0])

    self.cv_scores_ = np.array(cv_scores_)
    self.alpha_ = alphas[best_index]
    self.cv_alphas_ = alphas

    # Finally, fit the model with the selected alpha
    logger.log("Final optimization", verbose=self.verbose)
    self.covariances_ = emp_covs
    self.precisions_ = _group_sparse_covariance(
        emp_covs, n_samples, self.alpha_,
        tol=self.tol, max_iter=self.max_iter,
        verbose=max(0, self.verbose - 1), debug=self.debug)
    return self
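# Usage sketch (not part of the original source), assuming the nilearn
# GroupSparseCovarianceCV API; the import path, parameter values and
# synthetic subjects below are illustrative assumptions.
import numpy as np
from nilearn.connectome import GroupSparseCovarianceCV

rng = np.random.RandomState(0)
# Three hypothetical subjects with different sample counts but the same
# number of features (columns), as required by fit().
subjects_demo = [rng.randn(n, 5) for n in (40, 50, 60)]

gsc = GroupSparseCovarianceCV(cv=3, n_refinements=2, verbose=0)
gsc.fit(subjects_demo)
print(gsc.alpha_, gsc.precisions_.shape)  # precisions_: (5, 5, n_subjects)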
def _check_cv_non_float(self, y):
    return check_cv(
        self.cv,
        y=y,
        classifier=self.stratified,
    )
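# Quick illustration (not from the original source) of what the
# classifier/stratified flag changes in sklearn's check_cv: with a
# classification target it returns StratifiedKFold, otherwise KFold.
import numpy as np
from sklearn.model_selection import check_cv

y_demo = np.array([0, 0, 0, 1, 1, 1])
print(type(check_cv(3, y=y_demo, classifier=True)).__name__)   # StratifiedKFold
print(type(check_cv(3, y=y_demo, classifier=False)).__name__)  # KFold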
def _fit(self, X, y, parameter_dict):
    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    n_samples = _num_samples(X)
    X, y = indexable(X, y)

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))
    cv = check_cv(self.cv, y=y, classifier=is_classifier(self.estimator))

    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
    creator.create("Individual", list, est=clone(self.estimator),
                   fitness=creator.FitnessMax)

    toolbox = base.Toolbox()

    name_values, gene_type, maxints = _get_param_types_maxint(parameter_dict)
    if self.gene_type is None:
        self.gene_type = gene_type

    if self.verbose:
        print("Types %s and maxint %s detected" % (self.gene_type, maxints))

    toolbox.register("individual", _initIndividual, creator.Individual,
                     maxints=maxints)
    toolbox.register("population", tools.initRepeat, list,
                     toolbox.individual)

    toolbox.register("evaluate", _evalFunction, searchobj=self,
                     name_values=name_values, X=X, y=y,
                     scorer=self.scorer_, cv=cv, iid=self.iid,
                     verbose=self.verbose, error_score=self.error_score,
                     fit_params=self.fit_params)

    toolbox.register("mate", _cxIndividual,
                     indpb=self.gene_crossover_prob,
                     gene_type=self.gene_type)

    toolbox.register("mutate", _mutIndividual,
                     indpb=self.gene_mutation_prob, up=maxints)
    toolbox.register("select", tools.selTournament,
                     tournsize=self.tournament_size)

    if self.n_jobs > 1:
        pool = Pool(processes=self.n_jobs)
        toolbox.register("map", pool.map)

    pop = toolbox.population(n=self.population_size)
    hof = tools.HallOfFame(1)

    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean)
    stats.register("min", np.min)
    stats.register("max", np.max)

    if self.verbose:
        print('--- Evolve in {0} possible combinations ---'.format(
            np.prod(np.array(maxints) + 1)))

    pop, logbook = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2,
                                       ngen=self.generations_number,
                                       stats=stats, halloffame=hof,
                                       verbose=self.verbose)

    current_best_score_ = hof[0].fitness.values[0]
    current_best_params_ = _individual_to_params(hof[0], name_values)

    if self.verbose:
        print("Best individual is: %s\nwith fitness: %s" % (
            current_best_params_, current_best_score_))
        print("Scoring evaluations: %d, Cache hits: %d, Total: %d" % (
            self.num_evaluations, self.num_cache_hits,
            self.num_evaluations + self.num_cache_hits))

    if current_best_score_ > self.best_score_:
        self.best_score_ = current_best_score_
        self.best_params_ = current_best_params_
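# Usage sketch (not part of the original source), assuming the sklearn-deap
# EvolutionaryAlgorithmSearchCV API that wraps the _fit() above; the import
# path, parameter grid and data are illustrative assumptions.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from evolutionary_search import EvolutionaryAlgorithmSearchCV

X_demo, y_demo = make_classification(n_samples=200, n_features=10,
                                     random_state=0)
param_grid = {'C': np.logspace(-3, 3, 7), 'gamma': np.logspace(-3, 1, 5)}

search = EvolutionaryAlgorithmSearchCV(estimator=SVC(),
                                       params=param_grid,
                                       scoring='accuracy',
                                       cv=StratifiedKFold(n_splits=3),
                                       population_size=10,
                                       generations_number=5,
                                       verbose=0)
search.fit(X_demo, y_demo)
print(search.best_params_, search.best_score_)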