def test_cv_iterable_wrapper():
    y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2])

    with warnings.catch_warnings(record=True):
        from sklearn.cross_validation import StratifiedKFold as OldSKF

    cv = OldSKF(y_multiclass, n_folds=3)
    wrapped_old_skf = _CVIterableWrapper(cv)

    # Check if split works correctly
    np.testing.assert_equal(list(cv), list(wrapped_old_skf.split()))

    # Check if get_n_splits works correctly
    assert_equal(len(cv), wrapped_old_skf.get_n_splits())

    kf_iter = KFold(n_splits=5).split(X, y)
    kf_iter_wrapped = check_cv(kf_iter)
    # Since the wrapped iterable is enlisted and stored,
    # split can be called any number of times to produce
    # consistent results.
    assert_array_equal(list(kf_iter_wrapped.split(X, y)),
                       list(kf_iter_wrapped.split(X, y)))
    # If the splits are randomized, successive calls to split yield different
    # results
    kf_randomized_iter = KFold(n_splits=5, shuffle=True).split(X, y)
    kf_randomized_iter_wrapped = check_cv(kf_randomized_iter)
    assert_array_equal(list(kf_randomized_iter_wrapped.split(X, y)),
                       list(kf_randomized_iter_wrapped.split(X, y)))
    assert_true(np.any(np.array(list(kf_iter_wrapped.split(X, y))) !=
                       np.array(list(kf_randomized_iter_wrapped.split(X, y)))))
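For context, a minimal standalone sketch of the behaviour the test exercises, assuming only public scikit-learn (0.18+) APIs: check_cv stores a one-shot split generator as a list, so the wrapped object can be split repeatedly.

import numpy as np
from sklearn.model_selection import KFold, check_cv

X = np.ones((10, 2))
y = np.arange(10) % 2

one_shot = KFold(n_splits=5).split(X, y)   # a generator, exhausted after one pass
wrapped = check_cv(one_shot)               # the wrapper stores the splits in a list

print(wrapped.get_n_splits())              # 5
# split() can now be called repeatedly with identical results
np.testing.assert_equal(list(wrapped.split(X, y)),
                        list(wrapped.split(X, y)))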
Example #2
def _set_cv(cv, X, y, classifier):
    """This method returns either a `sklearn.cross_validation._PartitionIterator` or 
    `sklearn.model_selection.BaseCrossValidator` depending on whether sklearn-0.17
    or sklearn-0.18 is being used.

    Parameters
    ----------

    cv : int, `_PartitionIterator` or `BaseCrossValidator`
        The CV object or int to check. If an int, it will be converted
        into the appropriate class of cross-validator.

    X : pd.DataFrame or np.ndarray, shape(n_samples, n_features)
        The dataframe or np.ndarray being fit in the grid search.

    y : np.ndarray, shape(n_samples,)
        The target being fit in the grid search.

    classifier : bool
        Whether the estimator being fit is a classifier

    Returns
    -------

    `_PartitionIterator` or `BaseCrossValidator`
    """
    return check_cv(cv, X, y, classifier) if not SK18 else check_cv(cv, y, classifier)
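A hypothetical usage sketch of the wrapper above, assuming the SK18 flag, the check_cv import, and the sklearn 0.17/0.18-era environment this helper was written for:

import numpy as np

X = np.random.rand(20, 3)
y = (np.random.rand(20) > 0.5).astype(int)

cv = _set_cv(3, X, y, classifier=True)
# On sklearn >= 0.18 this is a splitter with a .split method; on 0.17 it is
# an iterable of (train, test) index pairs.
splits = cv.split(X, y) if hasattr(cv, 'split') else cv
for train_idx, test_idx in splits:
    pass  # one (train, test) pair of index arrays per fold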
Example #3
def _set_cv(cv, estimator=None, X=None, y=None):
    """ Set the default cross-validation depending on whether clf is classifier
        or regressor. """

    from sklearn.base import is_classifier

    # Detect whether classification or regression
    if estimator in ['classifier', 'regressor']:
        est_is_classifier = estimator == 'classifier'
    else:
        est_is_classifier = is_classifier(estimator)
    # Setup CV
    if check_version('sklearn', '0.18'):
        from sklearn import model_selection as models
        from sklearn.model_selection import (check_cv, StratifiedKFold, KFold)
        if isinstance(cv, (int, np.integer)):
            XFold = StratifiedKFold if est_is_classifier else KFold
            cv = XFold(n_splits=cv)
        elif isinstance(cv, str):
            if not hasattr(models, cv):
                raise ValueError('Unknown cross-validation')
            cv = getattr(models, cv)
            cv = cv()
        cv = check_cv(cv=cv, y=y, classifier=est_is_classifier)
    else:
        from sklearn import cross_validation as models
        from sklearn.cross_validation import (check_cv, StratifiedKFold, KFold)
        if isinstance(cv, (int, np.integer)):
            if est_is_classifier:
                cv = StratifiedKFold(y=y, n_folds=cv)
            else:
                cv = KFold(n=len(y), n_folds=cv)
        elif isinstance(cv, str):
            if not hasattr(models, cv):
                raise ValueError('Unknown cross-validation')
            cv = getattr(models, cv)
            if cv.__name__ not in ['KFold', 'LeaveOneOut']:
                raise NotImplementedError('CV cannot be defined with str for'
                                          ' sklearn < 0.18.')
            cv = cv(len(y))
        cv = check_cv(cv=cv, X=X, y=y, classifier=est_is_classifier)

    # Extract train and test set to retrieve them at predict time
    if hasattr(cv, 'split'):
        cv_splits = [(train, test) for train, test in
                     cv.split(X=np.zeros_like(y), y=y)]
    else:
        # XXX support sklearn.cross_validation cv
        cv_splits = [(train, test) for train, test in cv]

    if not np.all([len(train) for train, _ in cv_splits]):
        raise ValueError('Some folds do not have any train epochs.')

    return cv, cv_splits
Example #4
def _set_cv(cv, estimator=None, X=None, y=None):
    """Set the default CV depending on whether clf is classifier/regressor."""
    # Detect whether classification or regression
    if estimator in ['classifier', 'regressor']:
        est_is_classifier = estimator == 'classifier'
    else:
        est_is_classifier = is_classifier(estimator)
    # Setup CV
    from sklearn import model_selection as models
    from sklearn.model_selection import (check_cv, StratifiedKFold, KFold)
    if isinstance(cv, (int, np.integer)):
        XFold = StratifiedKFold if est_is_classifier else KFold
        cv = XFold(n_splits=cv)
    elif isinstance(cv, str):
        if not hasattr(models, cv):
            raise ValueError('Unknown cross-validation')
        cv = getattr(models, cv)
        cv = cv()
    cv = check_cv(cv=cv, y=y, classifier=est_is_classifier)

    # Extract train and test set to retrieve them at predict time
    if hasattr(cv, 'split'):
        cv_splits = [(train, test)
                     for train, test in cv.split(X=np.zeros_like(y), y=y)]
    else:
        # XXX support sklearn.cross_validation cv
        cv_splits = [(train, test) for train, test in cv]

    if not np.all([len(train) for train, _ in cv_splits]):
        raise ValueError('Some folds do not have any train epochs.')

    return cv, cv_splits
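A hypothetical usage sketch of the helper above (numpy and the function's scikit-learn imports are assumed to be available); the second return value is the materialized list of folds so they can be reused at predict time:

import numpy as np

y = np.array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1])
cv, cv_splits = _set_cv(cv=5, estimator='classifier', y=y)
print(len(cv_splits))               # 5
train_idx, test_idx = cv_splits[0]  # plain index arrays, reusable later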
Example #5
    def _check_cv(self, y):
        cv = check_cv(self.cv, y=y, classifier=is_classifier(self))
        if hasattr(cv, 'random_state') and cv.random_state is None:
            cv.random_state = np.random.RandomState()
        if hasattr(cv, 'shuffle') and self.shuffle_cv:
            cv.shuffle = True
        return cv
Example #6
    def __init__(
            self,
            alpha: float = 10,
            max_bins: int = 30,
            split: Tuple[Union[int, BaseCrossValidator], ...] = (3, 3),
    ):
        """

        Args:
            alpha (float): smoothing parameter for generalization.
            max_bins (int): maximum number of unique values in a feature.
            split (tuple[Union[int, BaseCrossValidator], ...]): tuple of ints
                or cross-validator instances.

                If the tuple is empty, the algorithm encodes features without
                cross-validation; in that case the features will over-fit on
                the target.

                If it has one element, the algorithm encodes features with
                cross-validation on those folds. The training data will not
                over-fit, but validation scores may still be optimistic.

                If it has two elements, the algorithm first separates the data
                on the first folds and then encodes features with
                cross-validation on the second folds. This is the best way to
                avoid over-fitting, but the encoding uses less data.
        """

        self.alpha = alpha
        self.max_bins = max_bins
        self.split = tuple(check_cv(x_split) for x_split in split)
        self._encodings = []  # type: List[BaseTargetEncoder]
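A minimal sketch of the split normalization done in this constructor, using only public scikit-learn APIs: ints and ready-made splitters are both accepted and turned into splitters by check_cv.

from sklearn.model_selection import KFold, check_cv

split = (3, KFold(n_splits=5, shuffle=True, random_state=0))
splitters = tuple(check_cv(s) for s in split)
print([s.get_n_splits() for s in splitters])   # [3, 5]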
Example #7
    def cross_val_predict_proba(self, X, y, cv=None, scoring=None, **kwargs):
        """Performing cross validation hold out predictions for stacking.
        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            Input feature matrix used for training and cv.
        y : array-like of shape = [n_samples, ] or [n_samples, n_classes] for Keras.
            The numerical encoded target for classification tasks.
        cv : int, cross-validation generator or an iterable, optional
            Determines the cross-validation splitting strategy.
            Possible inputs for cv are:
            - None, to use the default 3-fold cross validation,
            - integer, to specify the number of folds in a StratifiedKFold,
            - An object to be used as a cross-validation generator.
            - An iterable yielding train, test splits.
        scoring : callable, default: None
            A callable to evaluate the predictions on the cv set.
            If None, accuracy_score is used.
        **kwargs : default = None
            Additional fitting arguments accepted by the model. Not tested.
        Returns
        -------
        array of shape = [n_samples, n_classes]
            The hold out probabilities for each class
        """
        y = self.__process_target(y)

        y_pred_proba = np.zeros((X.shape[0], self.__num_classes))

        cv = check_cv(cv, y, classifier=True)
        n_splits = cv.get_n_splits(X, y)

        if scoring is None:
            scoring = make_scorer(accuracy_score)

        i = 0
        score_mean = 0.0
        print("Starting hold out prediction with {} splits.".format(n_splits))
        for train_index, cv_index in cv.split(X, y):
            X_train = X[train_index]
            y_train = y[train_index]
            X_cv = X[cv_index]
            y_cv = y[cv_index]

            est = self.get_estimator_copy()
            est.fit(X_train, y_train, **kwargs)
            y_pred_proba_cv = est.predict_proba(X_cv)

            #            score = scoring(y_cv, y_pred_proba_cv)

            #            print("Train size: {} ::: cv size: {} score (fold {}/{}): {:.4f}".format(len(train_index), len(cv_index), i + 1, n_splits, score))
            #            score_mean += score / float(n_splits)

            y_pred_proba[cv_index] = y_pred_proba_cv

            i += 1

#        print("Mean score: {:.4f}".format(score_mean))

        return y_pred_proba
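The same out-of-fold idea can be sketched with plain scikit-learn; this is a standalone illustration rather than this class's API, with LogisticRegression and make_classification as stand-ins:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import check_cv

X, y = make_classification(n_samples=60, random_state=0)
cv = check_cv(5, y, classifier=True)

oof = np.zeros((len(y), 2))                     # hold-out class probabilities
for train_idx, valid_idx in cv.split(X, y):
    est = LogisticRegression(max_iter=1000).fit(X[train_idx], y[train_idx])
    oof[valid_idx] = est.predict_proba(X[valid_idx])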
Example #8
    def get_fold_splitting(self, X, y) -> Iterable:
        # If cv is an iterable object, convert it to a list and return
        if isinstance(self.cv, Iterable):
            return list(self.cv)
        if self._checked_cv is None:
            self._checked_cv = check_cv(self.cv, y, classifier=self.model_class)
        return list(self._checked_cv.split(X, y, self.groups))
Example #9
def _score_lambda_path(est, X, y, sample_weight, relative_penalties, cv,
                       scoring, classifier, n_jobs, verbose):
    """Score each model found by glmnet using cross validation.

    Parameters
    ----------
    est : estimator
        The previously fitted estimator.

    X : array, shape (n_samples, n_features)
        Input features

    y : array, shape (n_samples,)
        Target values.

    sample_weight : array, shape (n_samples,)
        Weight of each row in X.

    cv : int, cross-validation generator or iterable
        Determines the cross-validation splitting strategy.

    scoring : string, callable or None
        Scoring method to apply to each model.

    n_jobs: int
        Maximum number of threads to use for scoring models.

    verbose : bool
        Emit logging data and warnings when True.

    classifier : boolean, optional, default False
        Whether the task is a classification task, in which case
        stratified KFold will be used.

    Returns
    -------
    scores : array, shape (n_lambda,)
        Scores for each value of lambda over all cv folds.
    """
    scorer = check_scoring(est, scoring)
    cv = check_cv(cv, y, classifier)
    cv = cv.split(X, y)

    # We score the model for every value of lambda, for classification
    # models, this will be an intercept-only model, meaning it predicts
    # the same class regardless of the input. Obviously, this makes some of
    # the scikit-learn metrics unhappy, so we are silencing these warnings.
    # Also note, catch_warnings is not thread safe.
    with warnings.catch_warnings():
        action = 'always' if verbose else 'ignore'
        warnings.simplefilter(action, UndefinedMetricWarning)

        scores = Parallel(n_jobs=n_jobs, verbose=verbose, backend='threading')(
            delayed(_fit_and_score)(est, scorer, X, y, sample_weight,
                                    relative_penalties, est.lambda_path_,
                                    train_idx, test_idx)
            for (train_idx, test_idx) in cv)

    return scores
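A minimal standalone sketch of the warning-silencing idiom used above, assuming only scikit-learn: UndefinedMetricWarning is suppressed while scoring a degenerate prediction.

import warnings
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.metrics import f1_score

with warnings.catch_warnings():
    warnings.simplefilter('ignore', UndefinedMetricWarning)
    # An intercept-only style prediction: no positive predictions at all,
    # which would normally trigger UndefinedMetricWarning.
    print(f1_score([0, 1, 1], [0, 0, 0]))   # 0.0, no warning emitted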
    def evaluate_candidates(self, x, y, groups, candidate_params, scorers,
                            fit_params):
        fit_and_score_kwargs = dict(scorer=scorers,
                                    fit_params=fit_params,
                                    return_train_score=self.return_train_score,
                                    return_n_test_samples=True,
                                    return_times=True,
                                    return_parameters=False,
                                    error_score=self.error_score,
                                    verbose=self.verbose)

        cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))
        n_splits = cv.get_n_splits(x, y, groups)

        if self.verbose > 0:
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(n_splits, len(candidate_params),
                                     len(candidate_params) * n_splits))

        param_grid = list(product(candidate_params, range(n_splits)))
        par_param_grid = self.sc.parallelize(
            list(zip(range(len(param_grid)), param_grid)), len(param_grid))

        x_bc = self.sc.broadcast(x)
        y_bc = self.sc.broadcast(y)
        groups_bc = self.sc.broadcast(groups)
        base_estimator = self.estimator

        def test_one_parameter(task):
            (index, (parameters, split_idx)) = task
            local_estimator = clone(base_estimator)
            local_x = x_bc.value
            local_y = y_bc.value
            local_groups = groups_bc.value

            train, test = next(
                islice(cv.split(local_x, local_y, local_groups), split_idx,
                       split_idx + 1))
            res = _fit_and_score(local_estimator,
                                 local_x,
                                 local_y,
                                 train=train,
                                 test=test,
                                 parameters=parameters,
                                 **fit_and_score_kwargs)

            return index, res

        out = dict(par_param_grid.map(test_one_parameter).collect())
        x_bc.unpersist()
        y_bc.unpersist()
        groups_bc.unpersist()

        out = [out[idx] for idx in range(len(param_grid))]

        # Warning: may not work for sklearn != 0.20.3
        results = self._format_results(candidate_params, scorers, n_splits,
                                       out)
        return results
Example #11
    def fit(self, X, y=None, groups=None, **fit_params):
        # type: (np.ndarray, np.ndarray, np.ndarray, Any) -> 'TPESearchCV'
        """Run fit with all sets of parameters.

        Args:
            X:
                Training data.

            y:
                Target variable.

            groups:
                Group labels for the samples used while splitting the dataset
                into train/test set.

            **fit_params:
                Parameters passed to ``fit`` on the estimator.

        Returns:
            self:
                Return self.
        """

        self._check_params()
        self._set_verbosity()

        classifier = is_classifier(self.estimator)
        cv = check_cv(self.cv, y, classifier)

        self.n_splits_ = cv.get_n_splits(X, y, groups=groups)
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
        self.study_ = study.create_study(load_if_exists=self.load_if_exists,
                                         pruner=self.pruner,
                                         sampler=self._sampler,
                                         storage=self.storage,
                                         study_name=self.study_name)

        objective = Objective(self.estimator,
                              self.param_distributions,
                              X,
                              y,
                              cv=cv,
                              error_score=self.error_score,
                              fit_params=fit_params,
                              groups=groups,
                              max_iter=self.max_iter,
                              return_train_score=self.return_train_score,
                              scoring=self.scorer_)

        self.study_.optimize(objective,
                             n_jobs=self.n_jobs,
                             n_trials=self.n_trials,
                             timeout=self.timeout)

        if self.refit:
            self._refit(X, y, **fit_params)

        return self
    def fit(self, X, y):
        X, y = indexable(X, y)
        cv = check_cv(self.fold, y, classifier=True)
        for idx_train, idx_eval in cv.split(X, y):
            X_train = X.loc[idx_train]
            y_train = y.loc[idx_train]
            X_eval = X.loc[idx_eval]
            y_eval = y.loc[idx_eval]
            self.models.append(self.train_func(X_train, X_eval, y_train, y_eval))
    def fit_transform(self, X=None, y=None, **kwargs):

        """Creates training meta-features for the stacking procedure
        and fits the base models.
        
        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features], default = None
            Input feature matrix used for training.
        y : array-like of shape = [n_samples, ], default = None
            The numerical encoded target for regression tasks.
        **kwargs : default = None
            Additional fitting arguments accepted by model. Not tested.  
            
        Returns
        -------
        self.__X_meta_train : array-like or sparse matrix of shape = [n_samples, n_base_estimators * (n_classes - int(self.base_drop_first))]
            Training meta-features
        """

        self.__X_meta_train = None
        
        if X is not None and y is not None:
            cv = check_cv(self.base_cv, y, classifier=False)
            scoring = self.base_scoring

        for c, est in enumerate(self.base_estimators):
            
            if type(est) == tuple:
                if(self.stacking_verbose):
                    print("\n" + "Loading estimator n°" + str(c+1))
                
                y_pred = np.load(est[0])  
  
            elif X is not None and y is not None:
                if(self.stacking_verbose):
                    print("\n" + "Fitting estimator n°" + str(c+1))
    
                y_pred = est.cross_val_predict(X, y, cv=cv, scoring=scoring, **kwargs)
                est.fit(X, y, **kwargs)
                
                if self.base_save:
                    if self.base_save_files is not None:
                        np.save(self.base_save_files[c][0], y_pred)
                    else:
                        np.save('est' + str(c) + '_train', y_pred)
                    
            else:
                raise ValueError("X and y must be specified to fit_transform base estimators.")

        if self.base_copy_idx is not None:
            self.__X_meta_train = np.column_stack((self.__X_meta_train, X[:,self.base_copy_idx]))
        
        self.__y = y
        self.__fittransformOK = True    
        
        return self.__X_meta_train
Example #14
    def _fit(self, X, y):
        X, y = check_X_y(X, y, "csr")
        # Initialization
        cv = check_cv(self.cv, y, is_classifier(self.estimator))
        scorer = check_scoring(self.estimator, scoring=self.scoring)
        n_features = X.shape[1]

        estimator = clone(self.estimator)

        # Genetic Algorithm
        toolbox = base.Toolbox()

        toolbox.register("attr_bool", random.randint, 0, 1)
        toolbox.register("individual", tools.initRepeat,
                         creator.Individual, toolbox.attr_bool, n=n_features)
        toolbox.register("population", tools.initRepeat, list, toolbox.individual)
        toolbox.register("evaluate", _evalFunction, gaobject=self, estimator=estimator, X=X, y=y,
                         cv=cv, scorer=scorer, verbose=self.verbose, fit_params=self.fit_params,
                         caching=self.caching)
        toolbox.register("mate", tools.cxUniform, indpb=self.crossover_independent_proba)
        toolbox.register("mutate", tools.mutFlipBit, indpb=self.mutation_independent_proba)
        toolbox.register("select", tools.selTournament, tournsize=self.tournament_size)

        if self.n_jobs > 1:
            pool = multiprocessing.Pool(processes=self.n_jobs)
            toolbox.register("map", pool.map)
        elif self.n_jobs < 0:
            pool = multiprocessing.Pool(processes=max(cpu_count() + 1 + self.n_jobs, 1))
            toolbox.register("map", pool.map)

        pop = toolbox.population(n=self.n_population)
        hof = tools.HallOfFame(1, similar=np.array_equal)
        stats = tools.Statistics(lambda ind: ind.fitness.values)
        stats.register("avg", np.mean, axis=0)
        stats.register("std", np.std, axis=0)
        stats.register("min", np.min, axis=0)
        stats.register("max", np.max, axis=0)

        if self.verbose > 0:
            print("Selecting features with genetic algorithm.")

        algorithms.eaSimple(pop, toolbox, cxpb=self.crossover_proba, mutpb=self.mutation_proba,
                            ngen=self.n_generations, stats=stats, halloffame=hof,
                            verbose=self.verbose)
        if self.n_jobs != 1:
            pool.close()
            pool.join()

        # Set final attributes
        support_ = np.array(hof, dtype=bool)[0]
        self.estimator_ = clone(self.estimator)
        self.estimator_.fit(X[:, support_], y)

        self.n_features_ = support_.sum()
        self.support_ = support_

        return self
def split(x1, x2, n_folds=5):
    cv1 = check_cv(n_folds, x1)
    cv1_iter = list(cv1.split(x1, None, None))
    cv2 = check_cv(n_folds, x2)
    cv2_iter = list(cv2.split(x2, None, None))

    cv_iter = []
    for i in range(len(cv1_iter)):
        train1, test1 = cv1_iter[i]
        train2, test2 = cv2_iter[i]

        x1_train = [x1[index] for index in train1]
        x1_test = [x1[index] for index in test1]
        x2_train = [x2[index] for index in train2]
        x2_test = [x2[index] for index in test2]

        cv_iter.append((x1_train + x2_train, x1_test + x2_test))

    return cv_iter
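A hypothetical usage of split() above (check_cv is assumed to be imported in its module): the i-th folds of the two sample lists are concatenated pairwise.

x1 = list(range(10))
x2 = list(range(100, 110))

folds = split(x1, x2, n_folds=5)
train, test = folds[0]
print(len(train), len(test))   # 16 4  (8 + 8 train samples, 2 + 2 test samples)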
Example #16
    def fit(self, X, y):
        """Fit the meta classifier to the base classifiers.

        Parameters
        ----------
        X: {array-like, sparse matrix}, shape(n_samples, n_features)
            Training vectors, where n_samples is the number samples and
            n_features is the number of features.

        y: array-like, shape(n_samples,)
            Labels for classification.

        Returns
        -------
        self: object
            Returns self

        """
        meta_features_list = []
        cv = check_cv(self.cv, y=y, classifier=True)

        for clf in self.base_estimators:
            # feels kind of clumsy, but we want the meta features in the
            # original ordering
            if self.probas:
                meta_features = np.zeros((y.shape[0], 2))

                pred = [(test, clf.fit(X[train],
                                       y[train]).predict_proba(X[test]))
                        for train, test in cv.split(X, y)]

            else:
                meta_features = np.zeros((y.shape[0], ))

                pred = [(test, clf.fit(X[train], y[train]).predict(X[test]))
                        for train, test in cv.split(X, y)]

            for index, y_pred in pred:
                meta_features[index] = y_pred

            meta_features_list.append(meta_features)

        all_meta_features = np.column_stack(meta_features_list)

        if self.use_orig_features:
            all_meta_features = np.hstack((X, all_meta_features))

        # train base estimators with whole training data set
        for clf in self.base_estimators:
            clf.fit(X, y)

        # train meta estimators
        self.meta_estimator.fit(all_meta_features, y)

        return self
Example #17
    def __init__(self, X, y, Model, cv=None, max_iter=1000, estimator=None):
        """
        Parameters
        ----------
        X : {ndarray, sparse matrix} of shape (n_samples, n_features)
            Data
        y : {ndarray, sparse matrix} of shape (n_samples,)
            Target
        Model: class
            The Model class definition (e.g. Lasso or SparseLogreg)
        cv : int, cross-validation generator or iterable, default=None
            Determines the cross-validation splitting strategy.
            Possible inputs for cv are:

            - None, to use the default 5-fold cross-validation,
            - int, to specify the number of folds.
            - scikit-learn CV splitter
            - An iterable yielding (train, test) splits as arrays of indices.

            For int/None inputs, KFold is used.
        max_iter: int
            Maximal number of iteration for the state-of-the-art solver
        estimator: instance of ``sklearn.base.BaseEstimator``
            An estimator that follows the scikit-learn API.
        """
        self.X = X
        self.y = y
        self.dict_crits = {}
        self.val_test = None
        self.rmse = None
        self.estimator = estimator

        cv = check_cv(cv)

        for i, (train, val) in enumerate(cv.split(X)):
            X_train = X[train, :]
            y_train = y[train]
            X_val = X[val, :]
            y_val = y[val]

            if issparse(X_train):
                X_train = X_train.tocsc()
            if issparse(X_val):
                X_val = X_val.tocsc()

            model = Model(
                X_train, y_train, max_iter=max_iter, estimator=estimator)

            criterion = HeldOutMSE(
                X_val, y_val, model, X_test=X_val, y_test=y_val)

            self.dict_crits[i] = criterion
        self.n_splits = cv.n_splits
        self.model = self.dict_crits[0].model
Example #18
def _get_scores(X, y, groups, cv, estimator, scorer):
    cv = check_cv(cv, y, classifier=is_classifier(estimator))

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=None, verbose=False, pre_dispatch='2*n_jobs')
    scores = parallel(
        delayed(_fit_and_predict)(clone(estimator), X, y, train, test, groups,
                                  scorer)
        for train, test in cv.split(X, y, groups))
    return scores
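A standalone sketch of the clone-per-fold joblib pattern used above; the project-specific _fit_and_predict helper is replaced here by a plain fit-and-score function, and everything else is public scikit-learn/joblib API:

import numpy as np
from joblib import Parallel, delayed
from sklearn.base import clone
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import check_cv


def fit_and_score(estimator, X, y, train, test):
    # Fit an independent clone on the train fold and score on the test fold
    estimator.fit(X[train], y[train])
    return estimator.score(X[test], y[test])


X, y = make_classification(n_samples=80, random_state=0)
cv = check_cv(5, y, classifier=True)
scores = Parallel(n_jobs=2)(
    delayed(fit_and_score)(clone(LogisticRegression(max_iter=1000)),
                           X, y, train, test)
    for train, test in cv.split(X, y))
print(np.round(scores, 3))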
Example #19
    def _check_cv(self, y):
        """Overrides base class _check_cv
        """
        # Squeezed target should be 1-dimensional
        if len(y.shape) != 1:
            raise NotImplementedError("StackedClassifier does not currently "
                                      "support multi-column classification "
                                      "problems. If your target is a one-hot "
                                      "encoded multi-class problem, please "
                                      "recast it to a single column.")
        return check_cv(self.cv, y=y, classifier=True)
Example #20
    def __init__(self, cv=None, **kwargs):
        """
        See BaseLassoNet for the parameters

        cv : int, cross-validation generator or iterable, default=None
            Determines the cross-validation splitting strategy.
            Default is 5-fold cross-validation.
            See <https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.check_cv.html>
        """
        super().__init__(**kwargs)
        self.cv = check_cv(cv)
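For reference, the default this call resolves to can be checked directly with plain scikit-learn (output shown for recent versions):

from sklearn.model_selection import check_cv

print(check_cv(None))   # KFold(n_splits=5, random_state=None, shuffle=False)
print(check_cv(10))     # KFold(n_splits=10, random_state=None, shuffle=False)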
Example #21
    def _fit_nuisances(self, Y, T, X=None, W=None, Z=None, sample_weight=None):
        # use a binary array to get stratified split in case of discrete treatment
        stratify = self._discrete_treatment or self._discrete_instrument

        if self._discrete_treatment:
            T = self._one_hot_encoder.fit_transform(reshape(T, (-1, 1)))

        if self._discrete_instrument:
            z_enc = LabelEncoder()
            Z = z_enc.fit_transform(Z.ravel())

            if self._discrete_treatment:  # need to stratify on combination of Z and T
                to_split = inverse_onehot(T) + Z * len(self._one_hot_encoder.categories_[0])
            else:
                to_split = Z  # just stratify on Z

            z_ohe = OneHotEncoder(categories='auto', sparse=False, drop='first')
            Z = z_ohe.fit_transform(reshape(Z, (-1, 1)))
            self.z_transformer = FunctionTransformer(
                func=_EncoderWrapper(z_ohe, z_enc).encode,
                validate=False)
        else:
            # stratify on T if discrete, and fine to pass T as second arg to KFold.split even when not
            to_split = inverse_onehot(T) if self._discrete_treatment else T
            self.z_transformer = None

        if self._n_splits == 1:  # special case, no cross validation
            folds = None
        else:
            splitter = check_cv(self._n_splits, [0], classifier=stratify)
            # if check_cv produced a new KFold or StratifiedKFold object, we need to set shuffle and random_state
            if splitter != self._n_splits and isinstance(splitter, (KFold, StratifiedKFold)):
                splitter.shuffle = True
                splitter.random_state = self._random_state

            all_vars = [var if np.ndim(var) == 2 else var.reshape(-1, 1) for var in [Z, W, X] if var is not None]
            if all_vars:
                all_vars = np.hstack(all_vars)
                folds = splitter.split(all_vars, to_split)
            else:
                folds = splitter.split(np.ones((T.shape[0], 1)), to_split)

        if self._discrete_treatment:
            self._d_t = shape(T)[1:]
            self.transformer = FunctionTransformer(
                func=_EncoderWrapper(self._one_hot_encoder).encode,
                validate=False)

        nuisances, fitted_models, fitted_inds, scores = _crossfit(self._model_nuisance, folds,
                                                                  Y, T, X=X, W=W, Z=Z, sample_weight=sample_weight)
        self._models_nuisance = fitted_models
        self.nuisance_scores_ = scores
        return nuisances, fitted_inds
Example #22
def cross_val_score(estimator,
                    X,
                    y=None,
                    fold_specific_X_extractor=None,
                    groups=None,
                    scorings=None,
                    cv=None,
                    n_jobs=1,
                    verbose=0,
                    fit_params=None,
                    pre_dispatch='2*n_jobs'):
    """
    :param estimator: 
    :param X: 
    :param y: 
    :param fold_specific_X_extractor: 
    :param groups: 
    :param scorings: list of scorings (strings, callables, etc...)
    :param cv: 
    :param n_jobs: 
    :param verbose: 
    :param fit_params: 
    :param pre_dispatch: 
    :return: an array of scores, shape: <folds x scores>
    """
    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    scorers = [
        check_scoring(estimator, scoring=scoring) for scoring in scorings
    ]
    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs,
                        verbose=verbose,
                        pre_dispatch=pre_dispatch)
    scores = parallel(
        delayed(_fe_fit_and_score)(
            clone(estimator),
            X,
            y,
            scorers,
            train,
            test,
            verbose,
            None,
            fit_params,
            fold_specific_X_extractor=fold_specific_X_extractor)
        for train, test in cv.split(X, y, groups))
    # here scores is python list of shape <folds x 1 x scores>
    scores = np.array(scores)
    # eliminate middle axis
    return scores.reshape((scores.shape[0], scores.shape[2]))
Example #23
def get_splitter(random_state=None, **params):
    '''Get cross-validation index generator

    Parameters:
        random_state: int or RandomState object
            seed for the random number generator

        params: keyword arguments
            'splitter' selects the splitter by name; the remaining keyword
            arguments are extra parameters for the splitter

    Returns:
        splitter: object
            a cross-validation splitter object
    '''
    from sklearn.model_selection import KFold, StratifiedKFold, ShuffleSplit, \
        LeaveOneOut, RepeatedKFold, RepeatedStratifiedKFold, StratifiedShuffleSplit

    splitter = params.get('splitter')
    if splitter is None:
        return check_cv(**params)
    if splitter == 'KFold':
        from sklearn.model_selection import KFold
        return KFold(random_state=random_state,
                     **search_dict(params, ('n_splits', 'shuffle')))
    elif splitter == 'StratifiedKFold':
        from sklearn.model_selection import StratifiedKFold
        return StratifiedKFold(random_state=random_state,
                               **search_dict(params, ('n_splits', 'shuffle')))
    elif splitter == 'RepeatedStratifiedKFold':
        from sklearn.model_selection import RepeatedStratifiedKFold
        return RepeatedStratifiedKFold(random_state=random_state,
                                       **search_dict(
                                           params, ('n_splits', 'n_repeats')))
    elif splitter == 'ShuffleSplit':
        from sklearn.model_selection import ShuffleSplit
        return ShuffleSplit(
            random_state=random_state,
            **search_dict(params, ('n_splits', 'test_size', 'train_size')))
    elif splitter == 'StratifiedShuffleSplit':
        from sklearn.model_selection import StratifiedShuffleSplit
        return StratifiedShuffleSplit(
            random_state=random_state,
            **search_dict(params, ('n_splits', 'test_size', 'train_size')))
    elif splitter == 'LeaveOneOut':
        from sklearn.model_selection import LeaveOneOut
        return LeaveOneOut()
    elif splitter == 'FileSplitter':
        return UserFileSplitter(**search_dict(params, 'filename'))
    else:
        raise ValueError('unknown splitter: {}'.format(splitter))
Example #24
    def _cv_scores_importances(self, X, y, groups=None, **fit_params):
        assert self.cv is not None
        cv = check_cv(self.cv, y, is_classifier(self.estimator))
        feature_importances = []  # type: List
        base_scores = []  # type: List[float]
        for train, test in cv.split(X, y, groups):
            est = clone(self.estimator).fit(X[train], y[train], **fit_params)
            score_func = partial(self.scorer_, est)
            _base_score, _importances = self._get_score_importances(
                score_func, X[test], y[test])
            base_scores.extend([_base_score] * len(_importances))
            feature_importances.extend(_importances)
        return base_scores, feature_importances
Example #25
    def fit_transform(self, X: pd.DataFrame, y) -> pd.DataFrame:
        self.cbe_ = []
        cv = check_cv(self.cv)
        cbe = CatBoostEncoder(cols=X.columns.tolist(),
                              return_df=False,
                              **self.cbe_params)

        X_transformed = np.zeros_like(X, dtype=np.float64)
        for train_idx, valid_idx in cv.split(X, y):
            self.cbe_.append(clone(cbe).fit(X.loc[train_idx], y[train_idx]))
            X_transformed[valid_idx] = self.cbe_[-1].transform(
                X.loc[valid_idx])
        return pd.DataFrame(X_transformed, columns=X.columns)
Example #26
def validation_curve(estimator,
                     X,
                     y,
                     param_name,
                     param_range,
                     groups=None,
                     cv=None,
                     scoring=None,
                     n_jobs=None,
                     pre_dispatch="all",
                     verbose=0,
                     error_score=np.nan):
    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    scorer = check_scoring(estimator, scoring=scoring)

    parallel = Parallel(n_jobs=n_jobs,
                        pre_dispatch=pre_dispatch,
                        verbose=verbose)
    out = parallel(
        delayed(_fit_and_score)(clone(estimator),
                                X,
                                y,
                                scorer,
                                train,
                                test,
                                verbose,
                                parameters={
                                    param_name: v
                                },
                                fit_params=None,
                                return_train_score=True,
                                error_score=error_score,
                                return_estimator=True,
                                return_times=True)
        # NOTE do not change order of iteration to allow one time cv splitters
        for train, test in cv.split(X, y, groups) for v in param_range)

    out = np.asarray(out)
    estimators = out[:, 4]
    out_scores = np.asarray(out[:, :2])
    fit_time = out[:, 2]
    score_time = out[:, 3]
    n_params = len(param_range)
    n_cv_folds = out_scores.shape[0] // n_params
    out_scores = out_scores.reshape(n_cv_folds, n_params, 2).transpose(
        (2, 1, 0))

    return (estimators, np.float64(out_scores[0]), np.float64(out_scores[1]),
            np.float64(fit_time), np.float64(score_time))
Example #27
def test_cv_iterable_wrapper():
    y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2])

    with warnings.catch_warnings(record=True):
        from sklearn.cross_validation import StratifiedKFold as OldSKF

    cv = OldSKF(y_multiclass, n_folds=3)
    wrapped_old_skf = _CVIterableWrapper(cv)

    # Check if split works correctly
    np.testing.assert_equal(list(cv), list(wrapped_old_skf.split()))

    # Check if get_n_splits works correctly
    assert_equal(len(cv), wrapped_old_skf.get_n_splits())

    kf_iter = KFold(n_splits=5).split(X, y)
    kf_iter_wrapped = check_cv(kf_iter)
    # Since the wrapped iterable is enlisted and stored,
    # split can be called any number of times to produce
    # consistent results.
    np.testing.assert_equal(list(kf_iter_wrapped.split(X, y)),
                            list(kf_iter_wrapped.split(X, y)))
    # If the splits are randomized, successive calls to split yield different
    # results
    kf_randomized_iter = KFold(n_splits=5, shuffle=True).split(X, y)
    kf_randomized_iter_wrapped = check_cv(kf_randomized_iter)
    np.testing.assert_equal(list(kf_randomized_iter_wrapped.split(X, y)),
                            list(kf_randomized_iter_wrapped.split(X, y)))

    try:
        np.testing.assert_equal(list(kf_iter_wrapped.split(X, y)),
                                list(kf_randomized_iter_wrapped.split(X, y)))
        splits_are_equal = True
    except AssertionError:
        splits_are_equal = False
    assert_false(
        splits_are_equal, "If the splits are randomized, "
        "successive calls to split should yield different results")
Example #28
    def fit(self, X, y=None, groups=None, **fit_params):
        """
        Run fit method with all sets of parameters

        Args
        ----
        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features

        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
            Target relative to X for classification or regression;
            None for unsupervised learning

        groups : array-like, shape = [n_samples], optional
            Training vector groups for cross-validation

        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of the estimator
        """

        # check estimator and cv methods are valid
        self.cv = check_cv(self.cv,
                           y,
                           classifier=is_classifier(self.estimator))

        # check for binary response
        if len(np.unique(y)) > 2:
            raise ValueError(
                'Only a binary response vector is currently supported')

        # check that scoring metric has been specified
        if self.scoring is None:
            raise ValueError('No score function is defined')

        # perform cross validation prediction
        self.y_pred_ = cross_val_predict(estimator=self.estimator,
                                         X=X,
                                         y=y,
                                         groups=groups,
                                         cv=self.cv,
                                         method='predict_proba',
                                         n_jobs=self.n_jobs,
                                         **fit_params)
        self.y_true = y

        # add fold id to the predictions
        self.test_idx_ = [
            indexes[1] for indexes in self.cv.split(X, y, groups)
        ]
Example #29
def check_cv(cv: Union[int, Iterable, BaseCrossValidator] = 5,
             y: Optional[Union[pd.Series, np.ndarray]] = None,
             stratified: bool = False,
             random_state: int = 0):
    if cv is None:
        cv = 5
    if isinstance(cv, numbers.Integral):
        if stratified and (y is not None) and (type_of_target(y)
                                               in ('binary', 'multiclass')):
            return StratifiedKFold(cv, shuffle=True, random_state=random_state)
        else:
            return KFold(cv, shuffle=True, random_state=random_state)

    return model_selection.check_cv(cv, y, stratified)
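A hypothetical usage sketch of the wrapper above, assuming its module-level imports (numbers, KFold, StratifiedKFold, type_of_target, model_selection): stratification only applies when requested and the target looks like a classification target.

import numpy as np

y = np.array([0, 1, 0, 1, 1, 0])
print(check_cv(3, y, stratified=True))    # StratifiedKFold(n_splits=3, shuffle=True, ...)
print(check_cv(3, y, stratified=False))   # KFold(n_splits=3, shuffle=True, ...)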
Example #30
    def cv_score(self, X, y, cv=0.2, scoring='accuracy'):
        """
        Calculate validation score

        Parameters
        ----------
        X: iterable, shape (n_samples, )
            Sequence of tokenized documents

        y: iterable, shape (n_samples, )
            Sequence of labels

        cv: float, int, cross-validation generator or an iterable, optional
            Determines the cross-validation splitting strategy. Possible inputs for cv are:

            - float, to use holdout set of this size
            - None, to use the default 3-fold cross validation,
            - integer, to specify the number of folds in a StratifiedKFold,
            - An object to be used as a cross-validation generator.
            - An iterable yielding train, test splits.

        scoring : string, callable or None, optional
            A string (see sklearn model evaluation documentation) or a scorer callable object

        Returns
        ----------
        float
            Average value of the validation metrics
        """
        self._classes = sorted(np.unique(y))
        np.random.seed(self.random_state)
        if isinstance(cv, float):
            train_ind, __ = train_test_split(np.arange(0, X.shape[0]))
            test_fold = np.zeros((X.shape[0], ))
            test_fold[train_ind] = -1
            self._cv_split = PredefinedSplit(test_fold)
        else:
            self._cv_split = check_cv(cv, y=y, classifier=True)

        if scoring == 'neg_log_loss':
            scoring = make_scorer(log_loss,
                                  labels=self._classes,
                                  greater_is_better=False,
                                  needs_proba=True)
        return cross_val_score(self._model,
                               X,
                               y,
                               cv=self._cv_split,
                               scoring=scoring)
Example #31
    def _cv_scores_importances(self, X, y, groups=None, n_jobs=1, **fit_params):
        assert self.cv is not None
        cv = check_cv(self.cv, y, is_classifier(self.estimator))
        pool = Pool(self.n_jobs)  # maxtasksperchild=1
        result = pool.map(
            lambda train_test: self._parallel_cv_scores_sub(X, y, *train_test,
                                                            **fit_params),
            cv.split(X, y, groups), chunksize=1)
        # close and join the pool
        pool.close()
        pool.join()
        # unpack the per-fold (scores, importances) tuples and flatten them
        flatten = lambda z: [x for y in z for x in y]
        scores_and_importances = list(zip(*result))
        base_scores = flatten(scores_and_importances[0])          # type: List[float]
        feature_importances = flatten(scores_and_importances[1])  # type: List
        return base_scores, feature_importances
    def fit(self, X, y, tree="rf", recursive=True, cv=5):
        """
        Fits to the data (X) and target (y) to determine the selected_features.

        Args:
            X (pandas.DataFrame): input data, note that numpy matrix is NOT
                accepted since the X.columns is used for feature names
            y (pandas.Series or np.ndarray): list of outputs used for fitting
                the tree model
            tree (str or instantiated sklearn tree-based model): if a model is
                directly fed, it must have the .feature_importances_ attribute
            recursive (bool): whether to recursively reduce the features (True)
                or just do it once (False)
            cv (int or CrossValidation): sklearn's cross-validation with the
                same options (int or actual instantiated CrossValidation)

        Returns (None):
            sets the class attribute .selected_features
        """
        m0 = len(X.columns)
        if isinstance(tree, str):
            if tree.lower() in ["rf", "random forest", "randomforest"]:
                if self.mode.lower() in ["classification", "classifier"]:
                    tree = RandomForestClassifier(random_state=self.rs)
                else:
                    tree = RandomForestRegressor(random_state=self.rs)
            elif tree.lower() in ["gb", "gbt", "gradiet boosting"]:
                if self.mode.lower() in ["classification", "classifier"]:
                    tree = GradientBoostingClassifier(random_state=self.rs)
                else:
                    tree = GradientBoostingRegressor(random_state=self.rs)
            else:
                raise AutomatminerError(
                    "Unsupported tree_type {}!".format(tree))

        cv = check_cv(cv=cv, y=y, classifier=is_classifier(tree))
        all_feats = []
        for train, _ in cv.split(X, y, groups=None):
            Xtrn = X.iloc[train]
            ytrn = y.iloc[train]
            all_feats += self.get_reduced_features(tree, Xtrn, ytrn, recursive)
        # take the union of selected features of each fold
        self.selected_features = list(set(all_feats))
        logger.info(
            self._log_prefix +
            "Finished tree-based feature reduction of {} initial features to "
            "{}".format(m0, len(self.selected_features)))
        return self
Example #33
def nested_cross_validation(dataset, model, X, y, df, feature_list, impute, feature_select):
    cv = StratifiedKFold(n_splits=5, shuffle=True)
    X, y, _ = indexable(X, y, None)
    cv = check_cv(cv, y, classifier=is_classifier(model))
    mean_scores = {}
    scores = {
        "acc_scores": [],
        "f1_scores": [],
        "p_scores": [],
        "r_scores": [],
        "auc_scores": [],
        "geometric_mean_scores": []
    }
    for train, test in cv.split(X, y):
        X_train = X[train]
        y_train = y[train]
        best_features = feature_assessment_and_selection(dataset=dataset, model=clone(model), X=X_train, y=y_train,
                                                         df=df, feature_list=feature_list)
        X_train_reduced = df[best_features].to_numpy()[train]
        X_test = df[best_features].to_numpy()[test]
        y_test = y[test]
        estimator = clone(model)
        estimator.fit(X_train_reduced, y_train)
        y_pred = estimator.predict(X_test)
        acc_score = metrics.accuracy_score(y_pred=y_pred, y_true=y_test)
        auc_score = metrics.roc_auc_score(y_score=y_pred, y_true=y_test)
        r_score = metrics.recall_score(y_pred=y_pred, y_true=y_test)
        p_score = metrics.precision_score(y_pred=y_pred, y_true=y_test)
        f1_score = metrics.f1_score(y_pred=y_pred, y_true=y_test)
        gmean_score = geometric_mean_score(y_test, y_pred)

        scores["acc_scores"].append(acc_score)
        scores["auc_scores"].append(auc_score)
        scores["r_scores"].append(r_score)
        scores["p_scores"].append(p_score)
        scores["f1_scores"].append(f1_score)
        scores["geometric_mean_scores"].append(gmean_score)

    mean_scores["acc_scores"] = mean(scores["acc_scores"])
    mean_scores["auc_scores"] = mean(scores["auc_scores"])
    mean_scores["r_scores"] = mean(scores["r_scores"])
    mean_scores["p_scores"] = mean(scores["p_scores"])
    mean_scores["f1_scores"] = mean(scores["f1_scores"])
    mean_scores["geometric_mean_scores"] = mean(scores["geometric_mean_scores"])

    print(mean_scores)

    return
Example #34
def check_cv(cv=3, y=None, classifier=False):
    """Dask aware version of ``sklearn.model_selection.check_cv``

    Same as the scikit-learn version, but works if ``y`` is a dask object.
    """
    if cv is None:
        cv = 3

    # If ``cv`` is not an integer, the scikit-learn implementation doesn't
    # touch the ``y`` object, so passing on a dask object is fine
    if not is_dask_collection(y) or not isinstance(cv, numbers.Integral):
        return model_selection.check_cv(cv, y, classifier)

    if classifier:
        # ``y`` is a dask object. We need to compute the target type
        target_type = delayed(type_of_target, pure=True)(y).compute()
        if target_type in ('binary', 'multiclass'):
            return StratifiedKFold(cv)
    return KFold(cv)
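A hypothetical usage sketch of the dask-aware variant above, assuming dask is installed along with the module-level imports the function relies on: an integer cv combined with a dask-backed classification target still resolves to StratifiedKFold.

import dask.array as da
import numpy as np

y = da.from_array(np.array([0, 1, 0, 1, 1, 0]), chunks=3)
print(check_cv(3, y, classifier=True))    # StratifiedKFold(n_splits=3, ...)
print(check_cv(3, y, classifier=False))   # KFold(n_splits=3, ...)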
def test_check_cv():
    X = np.ones(9)
    cv = check_cv(3, classifier=False)
    # Use numpy.testing.assert_equal which recursively compares
    # lists of lists
    np.testing.assert_equal(list(KFold(3).split(X)), list(cv.split(X)))

    y_binary = np.array([0, 1, 0, 1, 0, 0, 1, 1, 1])
    cv = check_cv(3, y_binary, classifier=True)
    np.testing.assert_equal(list(StratifiedKFold(3).split(X, y_binary)),
                            list(cv.split(X, y_binary)))

    y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2])
    cv = check_cv(3, y_multiclass, classifier=True)
    np.testing.assert_equal(list(StratifiedKFold(3).split(X, y_multiclass)),
                            list(cv.split(X, y_multiclass)))

    X = np.ones(5)
    y_multilabel = np.array([[0, 0, 0, 0], [0, 1, 1, 0], [0, 0, 0, 1],
                            [1, 1, 0, 1], [0, 0, 1, 0]])
    cv = check_cv(3, y_multilabel, classifier=True)
    np.testing.assert_equal(list(KFold(3).split(X)), list(cv.split(X)))

    y_multioutput = np.array([[1, 2], [0, 3], [0, 0], [3, 1], [2, 0]])
    cv = check_cv(3, y_multioutput, classifier=True)
    np.testing.assert_equal(list(KFold(3).split(X)), list(cv.split(X)))

    # Check if the old style classes are wrapped to have a split method
    X = np.ones(9)
    y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2])
    cv1 = check_cv(3, y_multiclass, classifier=True)

    with warnings.catch_warnings(record=True):
        from sklearn.cross_validation import StratifiedKFold as OldSKF

    cv2 = check_cv(OldSKF(y_multiclass, n_folds=3))
    np.testing.assert_equal(list(cv1.split(X, y_multiclass)),
                            list(cv2.split()))

    assert_raises(ValueError, check_cv, cv="lolo")
    def fit(self, X, y, groups=None):
        """Fit the RFE model and automatically tune the number of selected
           features.
        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vector, where `n_samples` is the number of samples and
            `n_features` is the total number of features.
        y : array-like, shape = [n_samples]
            Target values (integers for classification, real numbers for
            regression).
        groups : array-like, shape = [n_samples], optional
            Group labels for the samples used while splitting the dataset into
            train/test set.
        """
        if type(self.step) is not list:
            return super(DyRFECV, self).fit(X, y, groups)

        X, y = check_X_y(X, y, "csr")

        # Initialization
        cv = check_cv(self.cv, y, is_classifier(self.estimator))
        scorer = check_scoring(self.estimator, scoring=self.scoring)
        n_features = X.shape[1]

        step = []
        for s in self.step:
            if 0.0 < s < 1.0:
                step.append(int(max(1, s * n_features)))
            else:
                step.append(int(s))
            if s <= 0:
                raise ValueError("Step must be >0")

        # Build an RFE object, which will evaluate and score each possible
        # feature count, down to self.min_features_to_select
        rfe = DyRFE(estimator=self.estimator,
                    n_features_to_select=self.min_features_to_select,
                    step=self.step, verbose=self.verbose)

        # Determine the number of subsets of features by fitting across
        # the train folds and choosing the "features_to_select" parameter
        # that gives the least averaged error across all folds.

        # Note that joblib raises a non-picklable error for bound methods
        # even if n_jobs is set to 1 with the default multiprocessing
        # backend.
        # This branching is done so that to
        # make sure that user code that sets n_jobs to 1
        # and provides bound methods as scorers is not broken with the
        # addition of n_jobs parameter in version 0.18.

        if effective_n_jobs(self.n_jobs) == 1:
            parallel, func = list, _rfe_single_fit
        else:
            parallel = Parallel(n_jobs=self.n_jobs)
            func = delayed(_rfe_single_fit)

        scores = parallel(
            func(rfe, self.estimator, X, y, train, test, scorer)
            for train, test in cv.split(X, y, groups))

        scores = np.sum(scores, axis=0)
        diff = int(scores.shape[0]) - len(step)
        if diff > 0:
            step = np.r_[step, [step[-1]] * diff]
        scores_rev = scores[::-1]
        argmax_idx = len(scores) - np.argmax(scores_rev) - 1
        n_features_to_select = max(
            n_features - sum(step[:argmax_idx]),
            self.min_features_to_select)

        # Re-execute an elimination with best_k over the whole set
        rfe = DyRFE(estimator=self.estimator,
                    n_features_to_select=n_features_to_select, step=self.step,
                    verbose=self.verbose)

        rfe.fit(X, y)

        # Set final attributes
        self.support_ = rfe.support_
        self.n_features_ = rfe.n_features_
        self.ranking_ = rfe.ranking_
        self.estimator_ = clone(self.estimator)
        self.estimator_.fit(self.transform(X), y)

        # Fixing a normalization error, n is equal to get_n_splits(X, y) - 1
        # here, the scores are normalized by get_n_splits(X, y)
        self.grid_scores_ = scores[::-1] / cv.get_n_splits(X, y, groups)
        return self
Example #37
def fit_and_score_estimator(estimator, parameters, cv, X, y=None, scoring=None,
                            iid=True, n_jobs=1, verbose=1,
                            pre_dispatch='2*n_jobs'):
    """Fit and score an estimator with cross-validation

    This function is basically a copy of sklearn's
    grid_search._BaseSearchCV._fit(), which is the core of the GridSearchCV
    fit() method. Unfortunately, that class does _not_ return the training
    set scores, which we want to save in the database, and because of the
    way it's written, you can't change it by subclassing or monkeypatching.

    This function uses some undocumented internal sklearn APIs (non-public).
    It was written against sklearn version 0.16.1. Prior versions are likely
    to fail due to changes in the design of cross_validation module.

    Returns
    -------
    out : dict, with keys 'mean_test_score' 'test_scores', 'train_scores'
        The scores on the training and test sets, as well as the mean test set
        score.
    """

    scorer = check_scoring(estimator, scoring=scoring)
    n_samples = num_samples(X)
    X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr',
                        allow_nans=True)
    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))
    cv = check_cv(cv=cv, y=y, classifier=is_classifier(estimator))

    out = Parallel(
        n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch
    )(
        delayed(_fit_and_score)(clone(estimator), X, y, scorer,
                                train, test, verbose, parameters,
                                fit_params=None)
        for train, test in cv.split(X, y))

    assert len(out) == cv.n_splits

    train_scores, test_scores = [], []
    n_train_samples, n_test_samples = [], []
    for test_score, n_test, train_score, n_train, _ in out:
        train_scores.append(train_score)
        test_scores.append(test_score)
        n_test_samples.append(n_test)
        n_train_samples.append(n_train)

    train_scores, test_scores = map(list, check_arrays(train_scores,
                                                       test_scores,
                                                       warn_nans=True,
                                                       replace_nans=True))

    if iid:
        if verbose > 0 and is_msmbuilder_estimator(estimator):
            print('[CV] Using MSMBuilder API n_samples averaging')
            print('[CV]   n_train_samples: %s' % str(n_train_samples))
            print('[CV]   n_test_samples: %s' % str(n_test_samples))
        mean_test_score = np.average(test_scores, weights=n_test_samples)
        mean_train_score = np.average(train_scores, weights=n_train_samples)
    else:
        mean_test_score = np.average(test_scores)
        mean_train_score = np.average(train_scores)

    grid_scores = {
        'mean_test_score': mean_test_score, 'test_scores': test_scores,
        'mean_train_score': mean_train_score, 'train_scores': train_scores,
        'n_test_samples': n_test_samples, 'n_train_samples': n_train_samples}
    return grid_scores
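A small sketch of the iid averaging above, with made-up fold scores: when iid is true, the per-fold scores are weighted by the number of samples in each fold; otherwise a plain mean is used.

import numpy as np

test_scores = np.array([0.90, 0.70, 0.80])
n_test_samples = np.array([100, 50, 50])

np.average(test_scores, weights=n_test_samples)   # iid: 0.825
np.average(test_scores)                           # non-iid: 0.80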
Example #38
    def _fit(self, X, y):
        X, y = check_X_y(X, y, "csr")
        # Initialization
        cv = check_cv(self.cv, y, is_classifier(self.estimator))
        scorer = check_scoring(self.estimator, scoring=self.scoring)
        n_features = X.shape[1]

        if self.max_features is not None:
            if not isinstance(self.max_features, numbers.Integral):
                raise TypeError("'max_features' should be an integer between 1 and {} features."
                                " Got {!r} instead."
                                .format(n_features, self.max_features))
            elif self.max_features < 1 or self.max_features > n_features:
                raise ValueError("'max_features' should be between 1 and {} features."
                                 " Got {} instead."
                                 .format(n_features, self.max_features))
            max_features = self.max_features
        else:
            max_features = n_features

        if not isinstance(self.n_gen_no_change, (numbers.Integral, np.integer, type(None))):
            raise ValueError("'n_gen_no_change' should either be None or an integer."
                             " {} was passed."
                             .format(self.n_gen_no_change))

        estimator = clone(self.estimator)

        # Genetic Algorithm
        toolbox = base.Toolbox()

        toolbox.register("attr_bool", random.randint, 0, 1)
        toolbox.register("individual", tools.initRepeat,
                         creator.Individual, toolbox.attr_bool, n=n_features)
        toolbox.register("population", tools.initRepeat, list, toolbox.individual)
        toolbox.register("evaluate", _evalFunction, gaobject=self, estimator=estimator, X=X, y=y,
                         cv=cv, scorer=scorer, verbose=self.verbose, fit_params=self.fit_params,
                         max_features=max_features, caching=self.caching)
        toolbox.register("mate", tools.cxUniform, indpb=self.crossover_independent_proba)
        toolbox.register("mutate", tools.mutFlipBit, indpb=self.mutation_independent_proba)
        toolbox.register("select", tools.selTournament, tournsize=self.tournament_size)

        if self.n_jobs == 0:
            raise ValueError("n_jobs == 0 has no meaning.")
        elif self.n_jobs > 1:
            pool = multiprocessing.Pool(processes=self.n_jobs)
            toolbox.register("map", pool.map)
        elif self.n_jobs < 0:
            pool = multiprocessing.Pool(processes=max(cpu_count() + 1 + self.n_jobs, 1))
            toolbox.register("map", pool.map)

        pop = toolbox.population(n=self.n_population)
        hof = tools.HallOfFame(1, similar=np.array_equal)
        stats = tools.Statistics(lambda ind: ind.fitness.values)
        stats.register("avg", np.mean, axis=0)
        stats.register("std", np.std, axis=0)
        stats.register("min", np.min, axis=0)
        stats.register("max", np.max, axis=0)

        if self.verbose > 0:
            print("Selecting features with genetic algorithm.")

        _, log = _eaFunction(pop, toolbox, cxpb=self.crossover_proba, mutpb=self.mutation_proba,
                             ngen=self.n_generations, ngen_no_change=self.n_gen_no_change,
                             stats=stats, halloffame=hof, verbose=self.verbose)
        if self.n_jobs != 1:
            pool.close()
            pool.join()

        # Set final attributes
        support_ = np.array(hof, dtype=np.bool)[0]
        self.estimator_ = clone(self.estimator)
        self.estimator_.fit(X[:, support_], y)

        self.generation_scores_ = np.array([score for score, _ in log.select("max")])
        self.n_features_ = support_.sum()
        self.support_ = support_

        return self
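A minimal sketch of the n_jobs convention used above when sizing the multiprocessing pool; the helper name below is hypothetical and not part of the example.

from multiprocessing import cpu_count

def _n_processes(n_jobs):
    # Negative values count back from the number of CPUs: -1 -> all CPUs,
    # -2 -> all but one, never fewer than a single process.
    if n_jobs == 0:
        raise ValueError("n_jobs == 0 has no meaning.")
    return n_jobs if n_jobs > 0 else max(cpu_count() + 1 + n_jobs, 1)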
Example #39
    def _fit(self, X, y, groups, parameter_iterable):

        estimator = self.estimator
        cv = check_cv(self.cv, y, classifier=is_classifier(estimator))

        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        X, y, groups = indexable(X, y, groups)
        n_splits = cv.get_n_splits(X, y, groups)
        
        if self.verbose > 0 and isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(n_splits, n_candidates,
                                     n_candidates * n_splits))

        base_estimator = clone(self.estimator)

        param_grid = [(parameters, train, test) for parameters in parameter_iterable
                                                for train, test in list(cv.split(X, y, groups))]
        # The original Python code expects the elements in a certain order,
        # so we preserve it.
        indexed_param_grid = list(zip(range(len(param_grid)), param_grid))
        par_param_grid = self.sc.parallelize(indexed_param_grid, len(indexed_param_grid))
        X_bc = self.sc.broadcast(X)
        y_bc = self.sc.broadcast(y)

        scorer = self.scorer_
        verbose = self.verbose
        error_score = self.error_score
        fit_params = self.fit_params
        return_train_score = self.return_train_score
        fas = _fit_and_score

        def fun(tup):
            (index, (parameters, train, test)) = tup
            local_estimator = clone(base_estimator)
            local_X = X_bc.value
            local_y = y_bc.value
            res = fas(local_estimator, local_X, local_y, scorer, train, test, verbose,
                      parameters, fit_params,
                      return_train_score=return_train_score,
                      return_n_test_samples=True, return_times=True,
                      return_parameters=True, error_score=error_score)
            return (index, res)
        indexed_out0 = dict(par_param_grid.map(fun).collect())
        out = [indexed_out0[idx] for idx in range(len(param_grid))]
        if return_train_score:
            (train_scores, test_scores, test_sample_counts, fit_time,
             score_time, parameters) = zip(*out)
        else:
            (test_scores, test_sample_counts, fit_time, score_time, parameters) = zip(*out)
        X_bc.unpersist()
        y_bc.unpersist()

        candidate_params = parameters[::n_splits]
        n_candidates = len(candidate_params)

        results = dict()

        def _store(key_name, array, weights=None, splits=False, rank=False):
            """A small helper to store the scores/times to the cv_results_"""
            # When iterated first by splits, then by parameters
            array = np.array(array, dtype=np.float64).reshape(n_candidates,
                                                              n_splits)
            if splits:
                for split_i in range(n_splits):
                    results["split%d_%s"
                            % (split_i, key_name)] = array[:, split_i]

            array_means = np.average(array, axis=1, weights=weights)
            results['mean_%s' % key_name] = array_means
            # Weighted std is not directly available in numpy
            array_stds = np.sqrt(np.average((array -
                                             array_means[:, np.newaxis]) ** 2,
                                            axis=1, weights=weights))
            results['std_%s' % key_name] = array_stds

            if rank:
                results["rank_%s" % key_name] = np.asarray(
                    rankdata(-array_means, method='min'), dtype=np.int32)

        # Compute the (weighted) mean and std for the test scores alone
        # NOTE test_sample counts (weights) remain the same for all candidates
        test_sample_counts = np.array(test_sample_counts[:n_splits],
                                      dtype=np.int)

        _store('test_score', test_scores, splits=True, rank=True,
               weights=test_sample_counts if self.iid else None)
        if self.return_train_score:
            _store('train_score', train_scores, splits=True)
        _store('fit_time', fit_time)
        _store('score_time', score_time)

        best_index = np.flatnonzero(results["rank_test_score"] == 1)[0]
        best_parameters = candidate_params[best_index]

        # Use one MaskedArray and mask all the places where the param is not
        # applicable for that candidate. Use defaultdict as each candidate may
        # not contain all the params
        param_results = defaultdict(partial(MaskedArray,
                                            np.empty(n_candidates,),
                                            mask=True,
                                            dtype=object))
        for cand_i, params in enumerate(candidate_params):
            for name, value in params.items():
                # An all masked empty array gets created for the key
                # `"param_%s" % name` at the first occurence of `name`.
                # Setting the value at an index also unmasks that index
                param_results["param_%s" % name][cand_i] = value

        results.update(param_results)

        # Store a list of param dicts at the key 'params'
        results['params'] = candidate_params

        self.cv_results_ = results
        self.best_index_ = best_index
        self.n_splits_ = n_splits

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best_parameters)
            if y is not None:
                best_estimator.fit(X, y, **fit_params)
            else:
                best_estimator.fit(X, **fit_params)
            self.best_estimator_ = best_estimator
        return self
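The _store helper above computes a weighted standard deviation by hand because numpy has no direct equivalent; here is a self-contained sketch with made-up scores.

import numpy as np

array = np.array([[0.8, 0.9, 1.0],        # one row per candidate,
                  [0.5, 0.5, 0.5]])       # one column per split
weights = np.array([10, 10, 20])          # e.g. test sample counts per split

means = np.average(array, axis=1, weights=weights)
stds = np.sqrt(np.average((array - means[:, np.newaxis]) ** 2,
                          axis=1, weights=weights))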
Example #40
def cross_val_multiscore(estimator, X, y=None, groups=None, scoring=None,
                         cv=None, n_jobs=1, verbose=0, fit_params=None,
                         pre_dispatch='2*n_jobs'):
    """Evaluate a score by cross-validation.

    Parameters
    ----------
    estimator : instance of sklearn.base.BaseEstimator
        The object to use to fit the data.
        Must implement the 'fit' method.
    X : array-like, shape (n_samples, n_dimensional_features,)
        The data to fit. Can be, for example, a list or an array of at least
        2 dimensions.
    y : array-like, shape (n_samples, n_targets,)
        The target variable to try to predict in the case of
        supervised learning.
    groups : array-like, with shape (n_samples,)
        Group labels for the samples used while splitting the dataset into
        train/test set.
    scoring : string, callable | None
        A string (see model evaluation documentation) or
        a scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
    cv : int, cross-validation generator | iterable
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use the default 3-fold cross validation,
        - integer, to specify the number of folds in a ``(Stratified)KFold``,
        - An object to be used as a cross-validation generator.
        - An iterable yielding train, test splits.

        For integer/None inputs, if the estimator is a classifier and ``y`` is
        either binary or multiclass,
        :class:`sklearn.model_selection.StratifiedKFold` is used. In all
        other cases, :class:`sklearn.model_selection.KFold` is used.
    n_jobs : int, optional
        The number of CPUs to use to do the computation. -1 means
        'all CPUs'.
    verbose : int, optional
        The verbosity level.
    fit_params : dict, optional
        Parameters to pass to the fit method of the estimator.
    pre_dispatch : int, or string, optional
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. This parameter can be:

        - None, in which case all the jobs are immediately
          created and spawned. Use this for lightweight and
          fast-running jobs, to avoid delays due to on-demand
          spawning of the jobs
        - An int, giving the exact number of total jobs that are
          spawned
        - A string, giving an expression as a function of n_jobs,
          as in '2*n_jobs'

    Returns
    -------
    scores : array of float, shape (n_splits,) | shape (n_splits, n_scores)
        Array of scores of the estimator for each run of the cross validation.
    """
    # This code is copied from sklearn

    from sklearn.base import clone
    from sklearn.utils import indexable
    from sklearn.metrics.scorer import check_scoring
    from sklearn.model_selection._split import check_cv

    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    cv_iter = list(cv.split(X, y, groups))
    scorer = check_scoring(estimator, scoring=scoring)
    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    # Note: this parallelization is implemented using MNE Parallel
    parallel, p_func, n_jobs = parallel_func(_fit_and_score, n_jobs,
                                             pre_dispatch=pre_dispatch)
    scores = parallel(p_func(clone(estimator), X, y, scorer, train, test,
                             verbose, None, fit_params)
                      for train, test in cv_iter)
    return np.array(scores)[:, 0, ...]  # flatten over joblib output.
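For reference, a short sketch of how check_cv resolves the cv inputs described in the docstring above; the exact default number of folds depends on the installed scikit-learn version.

import numpy as np
from sklearn.base import is_classifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, check_cv

y = np.array([0, 1, 0, 1, 0, 1])
clf = LogisticRegression()

check_cv(None, y, classifier=is_classifier(clf))  # StratifiedKFold with default folds
check_cv(3, y, classifier=False)                  # KFold(n_splits=3)
check_cv(KFold(4), y, classifier=True)            # existing splitter passed through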
Example #41
    def fit(self, X, y=None, groups=None, **fit_params):
        """Run fit with all sets of parameters.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
            Target relative to X for classification or regression;
            None for unsupervised learning.

        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.

        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of the estimator

        """
        if self.fit_params is not None:
            warnings.warn('"fit_params" as a constructor argument was '
                          'deprecated in version 0.19 and will be removed '
                          'in version 0.21. Pass fit parameters to the '
                          '"fit" method instead.', DeprecationWarning)
            if fit_params:
                warnings.warn('Ignoring fit_params passed as a constructor '
                              'argument in favor of keyword arguments to '
                              'the "fit" method.', RuntimeWarning)
            else:
                fit_params = self.fit_params
        estimator = self.estimator
        cv = check_cv(self.cv, y, classifier=is_classifier(estimator))

        scorers, self.multimetric_ = _check_multimetric_scoring(
            self.estimator, scoring=self.scoring)

        if self.multimetric_:
            if self.refit is not False and (
                    not isinstance(self.refit, six.string_types) or
                    # This will work for both dict / list (tuple)
                    self.refit not in scorers):
                raise ValueError("For multi-metric scoring, the parameter "
                                 "refit must be set to a scorer key "
                                 "to refit an estimator with the best "
                                 "parameter setting on the whole data and "
                                 "make the best_* attributes "
                                 "available for that metric. If this is not "
                                 "needed, refit should be set to False "
                                 "explicitly. %r was passed." % self.refit)
            else:
                refit_metric = self.refit
        else:
            refit_metric = 'score'

        # X, y, groups = indexable(X, y, groups)
        if groups is not None:
            raise NotImplementedError("groups are not supported")

        # n_splits = cv.get_n_splits(X, y, groups)
        n_splits = min(cv.get_n_splits(X_.transpose(1, 2, 0), y_, None)
                       for X_, y_ in zip(X, y))

        def generate_index(X_list, y_list):
            split = [cv.split(X.transpose(1, 2, 0), y)
                     for X, y in zip(X_list, y_list)]
            for i in range(n_splits):
                yield zip(*[next(s) for s in split])

        generate_index_iter = generate_index(X, y)

        # Regenerate parameter iterable for each fit
        candidate_params = list(self._get_param_iterator())
        n_candidates = len(candidate_params)
        if self.verbose > 0:
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(n_splits, n_candidates,
                                     n_candidates * n_splits))

        base_estimator = clone(self.estimator)
        pre_dispatch = self.pre_dispatch

        out = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=pre_dispatch
        )(delayed(_fit_and_score)(clone(base_estimator), X, y, scorers, train,
                                  test, self.verbose, parameters,
                                  fit_params=fit_params,
                                  return_train_score=self.return_train_score,
                                  return_n_test_samples=True,
                                  return_times=True, return_parameters=False,
                                  error_score=self.error_score,
                                  return_estimator=True, return_idx=True)
          for parameters, (train, test) in product(
            candidate_params, generate_index_iter))

        # if one chooses to see train scores, "out" will contain train score info
        if self.return_train_score:
            (train_score_dicts, test_score_dicts, test_sample_counts, fit_time,
             score_time, estimators, train_idxs, test_idxs) = zip(*out)
        else:
            (test_score_dicts, test_sample_counts, fit_time,
             score_time, estimators, train_idxs, test_idxs) = zip(*out)

        # test_score_dicts and train_score_dicts are lists of dictionaries;
        # turn them into a dict of lists
        test_scores = _aggregate_score_dicts(test_score_dicts)
        if self.return_train_score:
            train_scores = _aggregate_score_dicts(train_score_dicts)

        # TODO: replace by a dict in 0.21
        results = (DeprecationDict() if self.return_train_score == 'warn'
                   else {})

        def _store(key_name, array, weights=None, splits=False, rank=False):
            """Store the scores/times to the cv_results_."""
            # When iterated first by splits, then by parameters
            # We want `array` to have `n_candidates` rows and `n_splits` cols.
            array = np.array(array, dtype=np.float64).reshape(n_candidates,
                                                              n_splits)
            if splits:
                for split_i in range(n_splits):
                    # Uses closure to alter the results
                    results["split%d_%s"
                            % (split_i, key_name)] = array[:, split_i]

            array_means = np.average(array, axis=1, weights=weights)
            results['mean_%s' % key_name] = array_means
            # Weighted std is not directly available in numpy
            array_stds = np.sqrt(np.average((array -
                                             array_means[:, np.newaxis]) ** 2,
                                            axis=1, weights=weights))
            results['std_%s' % key_name] = array_stds

            if rank:
                results["rank_%s" % key_name] = np.asarray(
                    rankdata(-array_means, method='min'), dtype=np.int32)

        _store('fit_time', fit_time)
        _store('score_time', score_time)
        results['estimators'] = estimators
        results['train_index'] = train_idxs
        results['test_index'] = test_idxs

        # Use one MaskedArray and mask all the places where the param is not
        # applicable for that candidate. Use defaultdict as each candidate may
        # not contain all the params
        param_results = defaultdict(partial(MaskedArray,
                                            np.empty(n_candidates,),
                                            mask=True,
                                            dtype=object))
        for cand_i, params in enumerate(candidate_params):
            for name, value in params.items():
                # An all masked empty array gets created for the key
                # `"param_%s" % name` at the first occurence of `name`.
                # Setting the value at an index also unmasks that index
                param_results["param_%s" % name][cand_i] = value

        results.update(param_results)
        # Store a list of param dicts at the key 'params'
        results['params'] = candidate_params

        # NOTE test_sample counts (weights) remain the same for all candidates
        test_sample_counts = np.array(test_sample_counts[:n_splits],
                                      dtype=np.int)
        for scorer_name in scorers.keys():
            # Compute the (weighted) mean and std for the test scores alone
            _store('test_%s' % scorer_name, test_scores[scorer_name],
                   splits=True, rank=True,
                   weights=test_sample_counts if self.iid else None)
            if self.return_train_score:
                prev_keys = set(results.keys())
                _store('train_%s' % scorer_name, train_scores[scorer_name],
                       splits=True)

                if self.return_train_score == 'warn':
                    for key in set(results.keys()) - prev_keys:
                        message = (
                            'You are accessing a training score ({!r}), '
                            'which will not be available by default '
                            'any more in 0.21. If you need training scores, '
                            'please set return_train_score=True').format(key)
                        # warn on key access
                        results.add_warning(key, message, FutureWarning)

        # For multi-metric evaluation, store the best_index_, best_params_ and
        # best_score_ iff refit is one of the scorer names
        # In single metric evaluation, refit_metric is "score"
        if self.refit or not self.multimetric_:
            self.best_index_ = results["rank_test_%s" % refit_metric].argmin()
            self.best_params_ = candidate_params[self.best_index_]
            self.best_score_ = results["mean_test_%s" % refit_metric][
                self.best_index_]

        if self.refit:
            self.best_estimator_ = clone(base_estimator).set_params(
                **self.best_params_)
            if y is not None:
                self.best_estimator_.fit(X, y, **fit_params)
            else:
                self.best_estimator_.fit(X, **fit_params)

        # Store the only scorer not as a dict for single metric evaluation
        self.scorer_ = scorers if self.multimetric_ else scorers['score']

        self.cv_results_ = results
        self.n_splits_ = n_splits

        return self
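A short sketch, with invented values, of the two aggregation steps used above: per-fold score dicts are merged into a dict of arrays, and candidates are ranked so that rank 1 is the best mean score (ties share the lowest rank).

import numpy as np
from scipy.stats import rankdata

fold_scores = [{'score': 0.7}, {'score': 0.9}, {'score': 0.8}]
aggregated = {k: np.array([d[k] for d in fold_scores]) for k in fold_scores[0]}

mean_test = np.array([0.80, 0.85, 0.85, 0.70])       # one entry per candidate
ranks = np.asarray(rankdata(-mean_test, method='min'), dtype=np.int32)
# ranks -> [3, 1, 1, 4]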
Example #42
    def fit(self, subjects, y=None):
        """Compute cross-validated group-sparse precisions.

        Parameters
        ----------
        subjects : list of numpy.ndarray with shapes (n_samples, n_features)
            Input subjects. Each subject is a 2D array whose columns contain
            signals. The number of samples can vary from subject to subject,
            but all subjects must have the same number of features (i.e. of
            columns).

        Returns
        -------
        self : GroupSparseCovarianceCV
            The instance itself.
        """
        # Empirical covariances
        emp_covs, n_samples = \
                  empirical_covariances(subjects, assume_centered=False)
        n_subjects = emp_covs.shape[2]

        # One cv generator per subject must be created, because each subject
        # can have a different number of samples from the others.
        cv = []
        for k in range(n_subjects):
            cv.append(check_cv(
                    self.cv, np.ones(subjects[k].shape[0]),
                    classifier=False
                    ).split(subjects[k])
                      )
        path = list()  # List of (alpha, scores, covs)
        n_alphas = self.alphas

        if isinstance(n_alphas, collections.Sequence):
            alphas = list(self.alphas)
            n_alphas = len(alphas)
            n_refinements = 1
        else:
            n_refinements = self.n_refinements
            alpha_1, _ = compute_alpha_max(emp_covs, n_samples)
            alpha_0 = 1e-2 * alpha_1
            alphas = np.logspace(np.log10(alpha_0), np.log10(alpha_1),
                                 n_alphas)[::-1]

        covs_init = itertools.repeat(None)

        # Copying the cv generators to use them n_refinements times.
        cv_ = izip(*cv)

        for i, (this_cv) in enumerate(itertools.tee(cv_, n_refinements)):
            # Compute the cross-validated loss on the current grid
            train_test_subjs = []
            for train_test in this_cv:
                assert(len(train_test) == n_subjects)
                train_test_subjs.append(list(zip(*[(subject[train, :],
                                                    subject[test, :])
                                             for subject, (train, test)
                                             in zip(subjects, train_test)])))
            if self.early_stopping:
                probes = [EarlyStopProbe(test_subjs,
                                         verbose=max(0, self.verbose - 1))
                          for _, test_subjs in train_test_subjs]
            else:
                probes = itertools.repeat(None)

            this_path = Parallel(n_jobs=self.n_jobs,
                                 verbose=self.verbose)(
                delayed(group_sparse_covariance_path)(
                    train_subjs, alphas, test_subjs=test_subjs,
                    max_iter=self.max_iter_cv, tol=self.tol_cv,
                    verbose=max(0, self.verbose - 1), debug=self.debug,
                    # Warm restart is useless with early stopping.
                    precisions_init=None if self.early_stopping else prec_init,
                    probe_function=probe)
                for (train_subjs, test_subjs), prec_init, probe
                in zip(train_test_subjs, covs_init, probes))

            # this_path[i] is a tuple (precisions_list, scores)
            # - scores: scores obtained with the i-th folding, for each value
            #   of alpha.
            # - precisions_list: corresponding precisions matrices, for each
            #   value of alpha.
            precisions_list, scores = list(zip(*this_path))
            # now scores[i][j] is the score for the i-th folding, j-th value of
            # alpha (analogous for precisions_list)
            precisions_list = list(zip(*precisions_list))
            scores = [np.mean(sc) for sc in zip(*scores)]
            # scores[i] is the mean score obtained for the i-th value of alpha.

            path.extend(list(zip(alphas, scores, precisions_list)))
            path = sorted(path, key=operator.itemgetter(0), reverse=True)

            # Find the maximum score (avoid using the built-in 'max' function
            # to have a fully-reproducible selection of the smallest alpha in
            # case of equality)
            best_score = -np.inf
            last_finite_idx = 0
            for index, (alpha, this_score, _) in enumerate(path):
                if this_score >= .1 / np.finfo(np.float).eps:
                    this_score = np.nan
                if np.isfinite(this_score):
                    last_finite_idx = index
                if this_score >= best_score:
                    best_score = this_score
                    best_index = index

            # Refine the grid
            if best_index == 0:
                # We do not need to go back: we have chosen
                # the highest value of alpha for which there are
                # non-zero coefficients
                alpha_1 = path[0][0]
                alpha_0 = path[1][0]
                covs_init = path[0][2]
            elif (best_index == last_finite_idx
                    and not best_index == len(path) - 1):
                # We have non-converged models on the upper bound of the
                # grid, we need to refine the grid there
                alpha_1 = path[best_index][0]
                alpha_0 = path[best_index + 1][0]
                covs_init = path[best_index][2]
            elif best_index == len(path) - 1:
                alpha_1 = path[best_index][0]
                alpha_0 = 0.01 * path[best_index][0]
                covs_init = path[best_index][2]
            else:
                alpha_1 = path[best_index - 1][0]
                alpha_0 = path[best_index + 1][0]
                covs_init = path[best_index - 1][2]
            alphas = np.logspace(np.log10(alpha_1), np.log10(alpha_0),
                                 len(alphas) + 2)
            alphas = alphas[1:-1]
            if n_refinements > 1:
                logger.log("[GroupSparseCovarianceCV] Done refinement "
                           "% 2i out of %i" % (i + 1, n_refinements),
                           verbose=self.verbose)

        path = list(zip(*path))
        cv_scores_ = list(path[1])
        alphas = list(path[0])

        self.cv_scores_ = np.array(cv_scores_)
        self.alpha_ = alphas[best_index]
        self.cv_alphas_ = alphas

        # Finally, fit the model with the selected alpha
        logger.log("Final optimization", verbose=self.verbose)
        self.covariances_ = emp_covs
        self.precisions_ = _group_sparse_covariance(
            emp_covs, n_samples, self.alpha_, tol=self.tol,
            max_iter=self.max_iter,
            verbose=max(0, self.verbose - 1), debug=self.debug)
        return self
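A sketch of the alpha grid and its refinement as performed above, with an arbitrary alpha_max standing in for compute_alpha_max: a log-spaced grid from largest to smallest, then a finer grid between the neighbours of the best value, dropping the already-evaluated end points.

import numpy as np

alpha_1 = 1.0                  # placeholder for compute_alpha_max(emp_covs, n_samples)
alpha_0 = 1e-2 * alpha_1
alphas = np.logspace(np.log10(alpha_0), np.log10(alpha_1), 4)[::-1]
# array([1.0, 0.2154..., 0.0464..., 0.01])

# Suppose alphas[1] scored best: refine between its neighbours.
refined = np.logspace(np.log10(alphas[0]), np.log10(alphas[2]),
                      len(alphas) + 2)[1:-1]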
Example #43
    def _check_cv_non_float(self, y):
        return check_cv(
            self.cv,
            y=y,
            classifier=self.stratified,
        )
Example #44
    def _fit(self, X, y, parameter_dict):

        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        n_samples = _num_samples(X)
        X, y = indexable(X, y)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)'
                                 % (len(y), n_samples))
        cv = check_cv(self.cv, y=y, classifier=is_classifier(self.estimator))

        creator.create("FitnessMax", base.Fitness, weights=(1.0,))
        creator.create("Individual", list, est=clone(self.estimator), fitness=creator.FitnessMax)

        toolbox = base.Toolbox()

        name_values, gene_type, maxints = _get_param_types_maxint(parameter_dict)
        if self.gene_type is None:
            self.gene_type = gene_type

        if self.verbose:
            print("Types %s and maxint %s detected" % (self.gene_type, maxints))

        toolbox.register("individual", _initIndividual, creator.Individual, maxints=maxints)
        toolbox.register("population", tools.initRepeat, list, toolbox.individual)

        toolbox.register("evaluate", _evalFunction, searchobj=self,
                         name_values=name_values, X=X, y=y,
                         scorer=self.scorer_, cv=cv, iid=self.iid, verbose=self.verbose,
                         error_score=self.error_score, fit_params=self.fit_params)

        toolbox.register("mate", _cxIndividual, indpb=self.gene_crossover_prob, gene_type=self.gene_type)

        toolbox.register("mutate", _mutIndividual, indpb=self.gene_mutation_prob, up=maxints)
        toolbox.register("select", tools.selTournament, tournsize=self.tournament_size)

        if self.n_jobs > 1:
            pool = Pool(processes=self.n_jobs)
            toolbox.register("map", pool.map)
        pop = toolbox.population(n=self.population_size)
        hof = tools.HallOfFame(1)
        stats = tools.Statistics(lambda ind: ind.fitness.values)
        stats.register("avg", np.mean)
        stats.register("min", np.min)
        stats.register("max", np.max)

        if self.verbose:
            print('--- Evolve in {0} possible combinations ---'.format(np.prod(np.array(maxints)+1)))

        pop, logbook = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2,
                                           ngen=self.generations_number, stats=stats,
                                           halloffame=hof, verbose=self.verbose)

        current_best_score_ = hof[0].fitness.values[0]
        current_best_params_ = _individual_to_params(hof[0], name_values)

        if self.verbose:
            print("Best individual is: %s\nwith fitness: %s" % (
                current_best_params_, current_best_score_)
                  )
            print "Scoring evaluations: %d, Cache hits: %d, Total: %d" % (
                self.num_evaluations, self.num_cache_hits, self.num_evaluations + self.num_cache_hits)

        if current_best_score_ > self.best_score_:
            self.best_score_ = current_best_score_
            self.best_params_ = current_best_params_