Example No. 1
def cross_val_score(estimator, X, y=None, groups=None, scoring=None, cv=None,
                    n_jobs=1, verbose=0, fit_params=None,
                    pre_dispatch='2*n_jobs'):
    """
    Evaluate a score by cross-validation
    """
    if not isinstance(scoring, (list, tuple)):
        scoring = [scoring]

    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    splits = list(cv.split(X, y, groups))
    scorer = [check_scoring(estimator, scoring=s) for s in scoring]
    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)
    scores = parallel(delayed(_fit_and_score)(clone(estimator), X, y, scorer,
                                              train, test, verbose, None,
                                              fit_params)
                      for train, test in splits)

    group_order = []
    if hasattr(cv, 'groups'):
        group_order = [np.array(cv.groups)[test].tolist()[0] for _, test in splits]
    return np.squeeze(np.array(scores)), group_order
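
A minimal usage sketch for this variant, assuming the function above is importable; the estimator, data and CV splitter below are placeholders. Note that ``group_order`` stays empty with stock scikit-learn splitters, since none of them expose a ``groups`` attribute.

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GroupKFold

X = np.random.RandomState(0).randn(60, 4)
y = np.tile([0, 1], 30)                  # balanced binary labels
groups = np.repeat(np.arange(6), 10)     # six groups of ten samples

# One score per fold (and per scorer, when several are passed).
scores, group_order = cross_val_score(
    LogisticRegression(), X, y, groups=groups,
    scoring=['accuracy', 'roc_auc'], cv=GroupKFold(n_splits=3))
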
Example No. 2
    def split(self, X, y=None, groups=None):
        """
        Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

        y : array-like, shape (n_samples,)
            The target variable for supervised learning problems.

        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.

        Yields
        ------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.
        """

        # check arguments
        X, y, groups = indexable(X, y, groups)

        for train, test in super(
                ShuffleSplitWithinGroups, self).split(X, y, groups):
            yield train, test
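
All of these snippets rely on the same contract from ``sklearn.utils.indexable``: every non-None argument is returned in an indexable form and all of them are checked for consistent length, while ``None`` passes through untouched. A small standalone sketch of that behaviour:

import numpy as np
from sklearn.utils import indexable

X = [[0, 1], [2, 3], [4, 5]]     # a plain list is already indexable and is kept as-is
y = np.array([0, 1, 0])
groups = None                    # None is simply passed through

X, y, groups = indexable(X, y, groups)

# Length mismatches fail fast:
# indexable(X, [0, 1])  ->  ValueError (inconsistent numbers of samples)
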
Example No. 3
def permutation_test_score(estimator, X, y, groups=None, cv=None,
                           n_permutations=100, n_jobs=1, random_state=0,
                           verbose=0, scoring=None):
    """
    Evaluate the significance of a cross-validated score with permutations,
    as in test 1 of [Ojala2010]_.

    A modification of sklearn's original permutation_test_score that leaves
    the p-value computation to the caller, so that the permutation scores
    can be reused outside this function.


    .. [Ojala2010] Ojala and Garriga. Permutation Tests for Studying Classifier
                   Performance.  The Journal of Machine Learning Research (2010)
                   vol. 11

    """
    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    scorer = check_scoring(estimator, scoring=scoring)
    random_state = check_random_state(random_state)

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose)(
        delayed(_permutation_test_score)(
            clone(estimator), X, _shuffle(y, groups, random_state),
            groups, cv, scorer)
        for _ in range(n_permutations))
    permutation_scores = np.array(permutation_scores)
    return permutation_scores
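
Since this variant returns only the permutation scores, the reference score and the p-value are computed by the caller. A sketch of that follow-up, using the [Ojala2010] estimate that sklearn itself applies (``estimator``, ``X``, ``y``, ``groups``, ``cv`` and ``scoring`` are placeholders):

import numpy as np
from sklearn.model_selection import cross_val_score

# Reference score on the un-permuted labels, with the same estimator/cv/scorer.
score = cross_val_score(estimator, X, y, groups=groups, cv=cv,
                        scoring=scoring).mean()

permutation_scores = permutation_test_score(estimator, X, y, groups=groups,
                                            cv=cv, n_permutations=100,
                                            scoring=scoring)
pvalue = (np.sum(permutation_scores >= score) + 1.0) / (len(permutation_scores) + 1.0)
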
Example No. 4
def _wrapped_cross_val_score(sklearn_pipeline, features, target,
                             cv, scoring_function, sample_weight=None, groups=None):
    """Fit estimator and compute scores for a given dataset split.
    Parameters
    ----------
    sklearn_pipeline : pipeline object implementing 'fit'
        The object to use to fit the data.
    features : array-like of shape at least 2D
        The data to fit.
    target : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.
    cv : int or cross-validation generator
        If cv is a number, it is the number of folds used to evaluate each
        pipeline in k-fold cross-validation during the TPOT optimization
        process. If it is an object, it is used directly as a
        cross-validation generator.
    scoring_function : callable
        A scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
    sample_weight : array-like, optional
        List of sample weights to balance (or un-balance) the dataset target as needed.
    groups : array-like, shape (n_samples,), optional
        Group labels for the samples used while splitting the dataset into train/test set.
    """
    sample_weight_dict = set_sample_weight(sklearn_pipeline.steps, sample_weight)

    features, target, groups = indexable(features, target, groups)

    cv = check_cv(cv, target, classifier=is_classifier(sklearn_pipeline))
    cv_iter = list(cv.split(features, target, groups))
    scorer = check_scoring(sklearn_pipeline, scoring=scoring_function)

    try:
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            scores = [_fit_and_score(estimator=clone(sklearn_pipeline),
                                    X=features,
                                    y=target,
                                    scorer=scorer,
                                    train=train,
                                    test=test,
                                    verbose=0,
                                    parameters=None,
                                    fit_params=sample_weight_dict)
                                for train, test in cv_iter]
            CV_score = np.array(scores)[:, 0]
            return np.nanmean(CV_score)
    except TimeoutException:
        return "Timeout"
    except Exception as e:
        return -float('inf')
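
A hypothetical standalone call to this helper, outside TPOT's own optimization loop that normally drives it; the pipeline and the ``features``/``target`` names below are placeholders:

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

pipeline = make_pipeline(StandardScaler(), LogisticRegression())
mean_score = _wrapped_cross_val_score(pipeline, features, target,
                                      cv=5, scoring_function='accuracy')
# Returns np.nanmean of the per-fold scores, the string "Timeout" if TPOT's
# TimeoutException fires, or -inf on any other exception (see above).
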
Example No. 5
    def _generate_sample(self, zscored=False, full=False):
        from sklearn.utils import indexable
        X = self._Xtr_zs.copy() if zscored else self._Xtrain.copy()
        sample_x = [tuple(x) for x in X[self._ftnames].values]
        labels_y = X[[self._rate_column]].values.ravel().tolist()

        if full:
            X = self._Xtest.copy()
            LOG.warning('Requested fitting in both train and test '
                        'datasets, appending %d examples', len(X))
            sample_x += [tuple(x) for x in X[self._ftnames].values]
            labels_y += X[[self._rate_column]].values.ravel().tolist()

        groups = None
        if not full:
            groups = self.get_groups()

        return indexable(np.array(sample_x), labels_y, groups)
Example No. 6
File: ifs.py Project: teopir/ifqi
def my_cross_val_predict(estimator, X, y=None, groups=None, cv=None, n_jobs=1,
                         verbose=0, fit_params=None, pre_dispatch='2*n_jobs',
                         method='predict'):
    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))

    # Ensure the estimator has implemented the passed decision function
    if not callable(getattr(estimator, method)):
        raise AttributeError('{} not implemented in estimator'
                             .format(method))

    if method in ['decision_function', 'predict_proba', 'predict_log_proba']:
        le = LabelEncoder()
        y = le.fit_transform(y)

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)
    prediction_blocks = parallel(delayed(_my_fit_and_predict)(
        clone(estimator), X, y, train, test, verbose, fit_params, method)
                                 for train, test in cv.split(X, y, groups))

    # Concatenate the predictions
    predictions = [pred_block_i for pred_block_i, _, _ in prediction_blocks]
    test_indices = np.concatenate([indices_i
                                   for _, indices_i, _ in prediction_blocks])
    scores = np.concatenate([score_i for _, _, score_i in prediction_blocks])

    if not _check_is_permutation(test_indices, _num_samples(X)):
        raise ValueError('cross_val_predict only works for partitions')

    inv_test_indices = np.empty(len(test_indices), dtype=int)
    inv_test_indices[test_indices] = np.arange(len(test_indices))

    # Check for sparse predictions
    if sp.issparse(predictions[0]):
        predictions = sp.vstack(predictions, format=predictions[0].format)
    else:
        predictions = np.concatenate(predictions)
    return predictions[inv_test_indices], scores
Example No. 7
def cross_val_score_filter_feature_selection(model, filter_function,
                                              filter_criteria, X, y,
                                              scoring=None, cv=None, n_jobs=1,
                                              verbose=0, fit_params=None,
                                              pre_dispatch='2*n_jobs'):

    X, y = indexable(X, y)

    cv = check_cv(cv, X, y, classifier=is_classifier(model))
    scorer = check_scoring(model, scoring=scoring)
    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)

    #
    scores = parallel(delayed(_fit_and_score)(clone(model), filter_function(X,y,train,filter_criteria), y, scorer,
                                              train, test, verbose, None,
                                              fit_params)
                      for train, test in cv)

    return np.array(scores)[:, 0]
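
``filter_function`` is only ever called as ``filter_function(X, y, train, filter_criteria)`` and its return value replaces ``X``, so it must keep all rows while choosing columns from the training fold alone. A hypothetical implementation (the name ``topk_filter`` and the reading of ``filter_criteria`` as a feature count are assumptions, not part of the original code):

import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif

def topk_filter(X, y, train, filter_criteria):
    # Fit the univariate filter on the training rows only, then apply the
    # same column selection to every row so train/test indices still align.
    selector = SelectKBest(f_classif, k=filter_criteria)
    selector.fit(np.asarray(X)[train], np.asarray(y)[train])
    return selector.transform(np.asarray(X))

# scores = cross_val_score_filter_feature_selection(
#     model, topk_filter, filter_criteria=10, X=X, y=y, cv=cv)
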
Example No. 8
    def split(self, X, y=None, groups=None):
        """
        Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

        y : array-like, shape (n_samples,)
            The target variable for supervised learning problems.

        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.

        Yields
        ------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.
        """

        # check arguments
        X, y, groups = indexable(X, y, groups)

        # get the number of samples
        n_samples = X.shape[0]
        if self.n_splits > n_samples:
            raise ValueError(
                ("Cannot have number of splits n_splits={0} greater"
                 " than the number of samples: n_samples={1}."
                 ).format(self.n_splits, n_samples))

        for train, test in super(KFoldWithinGroups, self).split(X, y, groups):
            yield train, test
Example No. 9
    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

        y : array-like, shape (n_samples,)
            The target variable for supervised learning problems.

        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.

        Yields
        ------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.
        """
        X, y, groups = indexable(X, y, groups)
        n_samples = X.shape[0]
        if self.n_splits > n_samples:
            raise ValueError(
                ("Cannot have number of splits n_splits={0} greater"
                 " than the number of samples: n_samples={1}."
                 ).format(self.n_splits, n_samples))

        # generate test fold
        test_fold = np.arange(n_samples, dtype=int) % self.n_splits
        cv = PredefinedSplit(test_fold)

        return cv.split()
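
For reference, the interleaved fold assignment this produces, reproduced standalone for 7 samples and 3 splits:

import numpy as np
from sklearn.model_selection import PredefinedSplit

test_fold = np.arange(7, dtype=int) % 3          # [0 1 2 0 1 2 0]
for train, test in PredefinedSplit(test_fold).split():
    print(test)                                  # [0 3 6], then [1 4], then [2 5]
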
Example No. 10
def cross_val_multiscore(estimator,
                         X,
                         y=None,
                         groups=None,
                         scoring=None,
                         cv=None,
                         n_jobs=1,
                         verbose=0,
                         fit_params=None,
                         pre_dispatch='2*n_jobs'):
    """Evaluate a score by cross-validation.

    Parameters
    ----------
    estimator : instance of sklearn.base.BaseEstimator
        The object to use to fit the data.
        Must implement the 'fit' method.
    X : array-like, shape (n_samples, n_dimensional_features,)
        The data to fit. Can be, for example a list, or an array at least 2d.
    y : array-like, shape (n_samples, n_targets,)
        The target variable to try to predict in the case of
        supervised learning.
    groups : array-like, with shape (n_samples,)
        Group labels for the samples used while splitting the dataset into
        train/test set.
    scoring : str, callable | None
        A string (see model evaluation documentation) or
        a scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
        Note that when using an estimator which inherently returns
        multidimensional output - in particular, SlidingEstimator
        or GeneralizingEstimator - you should set the scorer
        there, not here.
    cv : int, cross-validation generator | iterable
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use the default 3-fold cross validation,
        - integer, to specify the number of folds in a ``(Stratified)KFold``,
        - An object to be used as a cross-validation generator.
        - An iterable yielding train, test splits.

        For integer/None inputs, if the estimator is a classifier and ``y`` is
        either binary or multiclass,
        :class:`sklearn.model_selection.StratifiedKFold` is used. In all
        other cases, :class:`sklearn.model_selection.KFold` is used.
    %(n_jobs)s
    verbose : int, optional
        The verbosity level.
    fit_params : dict, optional
        Parameters to pass to the fit method of the estimator.
    pre_dispatch : int, or str, optional
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. This parameter can be:

        - None, in which case all the jobs are immediately
          created and spawned. Use this for lightweight and
          fast-running jobs, to avoid delays due to on-demand
          spawning of the jobs
        - An int, giving the exact number of total jobs that are
          spawned
        - A string, giving an expression as a function of n_jobs,
          as in '2*n_jobs'

    Returns
    -------
    scores : array of float, shape (n_splits,) | shape (n_splits, n_scores)
        Array of scores of the estimator for each run of the cross validation.
    """
    # This code is copied from sklearn

    from sklearn.base import clone
    from sklearn.utils import indexable
    from sklearn.model_selection._split import check_cv
    check_scoring = _get_check_scoring()

    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    cv_iter = list(cv.split(X, y, groups))
    scorer = check_scoring(estimator, scoring=scoring)
    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    # Note: this parallelization is implemented using MNE Parallel
    parallel, p_func, n_jobs = parallel_func(_fit_and_score,
                                             n_jobs,
                                             pre_dispatch=pre_dispatch)
    scores = parallel(
        p_func(clone(estimator), X, y, scorer, train, test, verbose, None,
               fit_params) for train, test in cv_iter)
    return np.array(scores)[:, 0, ...]  # flatten over joblib output.
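
A typical call following MNE's documented pattern; ``X`` here stands for an epochs-by-channels-by-times array and ``y`` for per-epoch labels, and (as the docstring notes) the scorer is set on the sliding estimator rather than passed to ``cross_val_multiscore``:

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from mne.decoding import SlidingEstimator

clf = SlidingEstimator(make_pipeline(StandardScaler(), LogisticRegression()),
                       scoring='roc_auc')
scores = cross_val_multiscore(clf, X, y, cv=5, n_jobs=1)
# scores is expected to have shape (n_splits, n_times): one time course per fold.
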
Example No. 11
    def fit(self, X, y=None, groups=None, **fit_params):
        """Run fit with all sets of parameters.

        Parameters
        ----------

        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
            Target relative to X for classification or regression;
            None for unsupervised learning.

        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.

        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of the estimator
        """
        if self.fit_params is not None:
            warnings.warn(
                '"fit_params" as a constructor argument was '
                'deprecated in version 0.19 and will be removed '
                'in version 0.21. Pass fit parameters to the '
                '"fit" method instead.', DeprecationWarning)
            if fit_params:
                warnings.warn(
                    'Ignoring fit_params passed as a constructor '
                    'argument in favor of keyword arguments to '
                    'the "fit" method.', RuntimeWarning)
            else:
                fit_params = self.fit_params
        estimator = self.estimator
        cv = check_cv(self.cv, y, classifier=is_classifier(estimator))

        scorers, self.multimetric_ = _check_multimetric_scoring(
            self.estimator, scoring=self.scoring)

        if self.multimetric_:
            if self.refit is not False and (
                    not isinstance(self.refit, six.string_types) or
                    # This will work for both dict / list (tuple)
                    self.refit not in scorers):
                raise ValueError("For multi-metric scoring, the parameter "
                                 "refit must be set to a scorer key "
                                 "to refit an estimator with the best "
                                 "parameter setting on the whole data and "
                                 "make the best_* attributes "
                                 "available for that metric. If this is not "
                                 "needed, refit should be set to False "
                                 "explicitly. %r was passed." % self.refit)
            else:
                refit_metric = self.refit
        else:
            refit_metric = 'score'

        X, y, groups = indexable(X, y, groups)
        n_splits = cv.get_n_splits(X, y, groups)
        # Regenerate parameter iterable for each fit
        candidate_params = list(self._get_param_iterator())
        n_candidates = len(candidate_params)
        if self.verbose > 0:
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(n_splits, n_candidates,
                                     n_candidates * n_splits))

        base_estimator = clone(self.estimator)

        if self.spark is None:
            out = self._run_sklearn_fit(base_estimator, X, y, scorers,
                                        fit_params, candidate_params, cv,
                                        groups)
        else:
            out = self._run_skspark_fit(base_estimator, X, y, scorers,
                                        fit_params, candidate_params, cv,
                                        groups)

        # if one chooses to see train scores, "out" will contain train score info
        if self.return_train_score:
            (train_score_dicts, test_score_dicts, test_sample_counts, fit_time,
             score_time) = zip(*out)
        else:
            (test_score_dicts, test_sample_counts, fit_time,
             score_time) = zip(*out)

        # test_score_dicts and train_score_dicts are lists of dictionaries;
        # turn them into a dict of lists
        test_scores = _aggregate_score_dicts(test_score_dicts)
        if self.return_train_score:
            train_scores = _aggregate_score_dicts(train_score_dicts)

        # TODO: replace by a dict in 0.21
        from sklearn.utils.deprecation import DeprecationDict
        results = (DeprecationDict()
                   if self.return_train_score == 'warn' else {})

        def _store(key_name, array, weights=None, splits=False, rank=False):
            """A small helper to store the scores/times to the cv_results_"""
            # When iterated first by splits, then by parameters
            # We want `array` to have `n_candidates` rows and `n_splits` cols.
            array = np.array(array,
                             dtype=np.float64).reshape(n_candidates, n_splits)
            if splits:
                for split_i in range(n_splits):
                    # Uses closure to alter the results
                    results["split%d_%s" %
                            (split_i, key_name)] = array[:, split_i]

            array_means = np.average(array, axis=1, weights=weights)
            results['mean_%s' % key_name] = array_means
            # Weighted std is not directly available in numpy
            array_stds = np.sqrt(
                np.average((array - array_means[:, np.newaxis])**2,
                           axis=1,
                           weights=weights))
            results['std_%s' % key_name] = array_stds

            if rank:
                results["rank_%s" % key_name] = np.asarray(rankdata(
                    -array_means, method='min'),
                                                           dtype=np.int32)

        _store('fit_time', fit_time)
        _store('score_time', score_time)
        # Use one MaskedArray and mask all the places where the param is not
        # applicable for that candidate. Use defaultdict as each candidate may
        # not contain all the params
        param_results = defaultdict(
            partial(MaskedArray,
                    np.empty(n_candidates, ),
                    mask=True,
                    dtype=object))
        for cand_i, params in enumerate(candidate_params):
            for name, value in params.items():
                # An all masked empty array gets created for the key
                # `"param_%s" % name` at the first occurrence of `name`.
                # Setting the value at an index also unmasks that index
                param_results["param_%s" % name][cand_i] = value

        results.update(param_results)
        # Store a list of param dicts at the key 'params'
        results['params'] = candidate_params

        # NOTE: test_sample_counts (weights) remain the same for all candidates
        test_sample_counts = np.array(test_sample_counts[:n_splits],
                                      dtype=np.int)
        for scorer_name in scorers.keys():
            # Computed the (weighted) mean and std for test scores alone
            _store('test_%s' % scorer_name,
                   test_scores[scorer_name],
                   splits=True,
                   rank=True,
                   weights=test_sample_counts if self.iid else None)
            if self.return_train_score:
                prev_keys = set(results.keys())
                _store('train_%s' % scorer_name,
                       train_scores[scorer_name],
                       splits=True)

                if self.return_train_score == 'warn':
                    for key in set(results.keys()) - prev_keys:
                        message = (
                            'You are accessing a training score ({!r}), '
                            'which will not be available by default '
                            'any more in 0.21. If you need training scores, '
                            'please set return_train_score=True').format(key)
                        # warn on key access
                        results.add_warning(key, message, FutureWarning)

        # For multi-metric evaluation, store the best_index_, best_params_ and
        # best_score_ iff refit is one of the scorer names
        # In single metric evaluation, refit_metric is "score"
        if self.refit or not self.multimetric_:
            self.best_index_ = results["rank_test_%s" % refit_metric].argmin()
            self.best_params_ = candidate_params[self.best_index_]
            self.best_score_ = results["mean_test_%s" %
                                       refit_metric][self.best_index_]

        if self.refit:
            self.best_estimator_ = clone(base_estimator).set_params(
                **self.best_params_)
            if y is not None:
                self.best_estimator_.fit(X, y, **fit_params)
            else:
                self.best_estimator_.fit(X, **fit_params)

        # Store the only scorer not as a dict for single metric evaluation
        self.scorer_ = scorers if self.multimetric_ else scorers['score']

        self.cv_results_ = results
        self.n_splits_ = n_splits

        return self
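
For orientation, the ``_store`` calls above populate ``cv_results_`` with the familiar key layout; a sketch of what a fitted instance (placeholder name ``search``) exposes for a single scorer named 'score':

# Aggregate timing keys (splits=False in _store):
#   'mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time'
# Per-split and aggregate test scores (splits=True, rank=True):
#   'split0_test_score', ..., 'mean_test_score', 'std_test_score', 'rank_test_score'
# Plus 'param_<name>' masked arrays for every hyper-parameter and 'params'.
sorted(search.cv_results_.keys())
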
Example No. 12
def one_class_learning_curve(estimator,
                             X,
                             y,
                             groups=None,
                             train_sizes=np.linspace(0.1, 1.0, 5),
                             cv=None,
                             scoring=None,
                             n_jobs=1,
                             pre_dispatch="all",
                             verbose=0,
                             shuffle=False,
                             random_state=None):
    """One-class learning curve.
    Determines cross-validated training and test scores for different one-class
    training set sizes. This should help choose the best downsampling ratio.
    A cross-validation generator splits the whole dataset k times in training
    and test data. Subsets of the training set with varying sizes will be used
    to train the estimator and a score for each training subset size and the
    test set will be computed. Afterwards, the scores will be averaged over
    all k runs for each training subset size.
    Read more in the :ref:`User Guide <learning_curve>`.
    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.
    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.
    groups : array-like, with shape (n_samples,), optional
        Group labels for the samples used while splitting the dataset into
        train/test set.
    train_sizes : array-like, shape (n_ticks,), dtype float or int
        Relative or absolute numbers of training examples that will be used to
        generate the learning curve. If the dtype is float, it is regarded as a
        fraction of the maximum size of the training set (that is determined
        by the selected validation method), i.e. it has to be within (0, 1].
        Otherwise it is interpreted as absolute sizes of the training sets.
        Note that for classification the number of samples usually has to
        be large enough to contain at least one sample from each class.
        (default: np.linspace(0.1, 1.0, 5))
    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
        - None, to use the default 3-fold cross validation,
        - integer, to specify the number of folds in a `(Stratified)KFold`,
        - An object to be used as a cross-validation generator.
        - An iterable yielding train, test splits.
        For integer/None inputs, if the estimator is a classifier and ``y`` is
        either binary or multiclass, :class:`StratifiedKFold` is used. In all
        other cases, :class:`KFold` is used.
        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.
    scoring : string, callable or None, optional, default: None
        A string (see model evaluation documentation) or
        a scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).
    pre_dispatch : integer or string, optional
        Number of predispatched jobs for parallel execution (default is
        all). The option can reduce the allocated memory. The string can
        be an expression like '2*n_jobs'.
    verbose : integer, optional
        Controls the verbosity: the higher, the more messages.
    shuffle : boolean, optional
        Whether to shuffle training data before taking prefixes of it
        based on ``train_sizes``.
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`. Used when ``shuffle`` is True.
    Returns
    -------
    train_sizes_abs : array, shape = (n_unique_ticks,), dtype int
        Numbers of training examples that have been used to generate the
        learning curve. Note that the number of ticks might be less
        than n_ticks because duplicate entries will be removed.
    train_scores : array, shape (n_ticks, n_cv_folds)
        Scores on training sets.
    test_scores : array, shape (n_ticks, n_cv_folds)
        Scores on test set.
    Notes
    -----
    See :ref:`examples/model_selection/plot_learning_curve.py
    <sphx_glr_auto_examples_model_selection_plot_learning_curve.py>`
    """
    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    # Store it as list as we will be iterating over the list multiple times
    cv_iter = list(cv.split(X, y, groups))

    scorer = check_scoring(estimator, scoring=scoring)

    n_max_training_samples = len(cv_iter[0][0])
    # Because the lengths of folds can be significantly different, it is
    # not guaranteed that we use all of the available training data when we
    # use the first 'n_max_training_samples' samples.
    train_sizes_abs = _translate_train_sizes(train_sizes,
                                             n_max_training_samples)
    n_unique_ticks = train_sizes_abs.shape[0]
    if verbose > 0:
        print("[learning_curve] Training set sizes: " + str(train_sizes_abs))

    parallel = Parallel(n_jobs=n_jobs,
                        pre_dispatch=pre_dispatch,
                        verbose=verbose)

    if shuffle:
        rng = check_random_state(random_state)
        cv_iter = ((rng.permutation(train), test) for train, test in cv_iter)

    one_class_sizes = list()

    train_test_proportions = []
    for train, test in cv_iter:
        pos = train[y.iloc[train] == 1]
        for n_train_samples in train_sizes_abs:
            train_split = train[:n_train_samples]
            neg = train_split[y.iloc[train_split] == 0]
            selected = np.concatenate((pos, neg), axis=0)
            train_test_proportions.append((selected, test))
            if len(one_class_sizes) < train_sizes_abs.shape[0]:
                one_class_sizes.append(neg.shape[0])

    out = parallel(
        delayed(_fit_and_score)(clone(estimator),
                                X,
                                y,
                                scorer,
                                train,
                                test,
                                verbose,
                                parameters=None,
                                fit_params=None,
                                return_train_score=True)
        for train, test in train_test_proportions)
    out = np.array(out)
    n_cv_folds = out.shape[0] // n_unique_ticks
    out = out.reshape(n_cv_folds, n_unique_ticks, 2)

    out = np.asarray(out).transpose((2, 1, 0))

    return np.array(one_class_sizes), out[0], out[1]
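
A hedged usage sketch; because the implementation above indexes the target with ``y.iloc``, it expects a pandas Series with labels 1 (positives, always kept) and 0 (negatives, downsampled along ``train_sizes``). The estimator and the ``X``/``y`` names are placeholders:

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

y_series = pd.Series(y)          # 0/1 labels, pandas so that .iloc works
sizes, train_scores, test_scores = one_class_learning_curve(
    LogisticRegression(), X, y_series,
    train_sizes=np.linspace(0.1, 1.0, 5), cv=5, scoring='roc_auc')
# `sizes` reports how many negative examples were actually used at each tick.
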
Example No. 13
def cross_val_decision_function(estimator,
                                X,
                                y=None,
                                cv=None,
                                n_jobs=1,
                                verbose=0,
                                fit_params=None,
                                pre_dispatch='2*n_jobs'):
    """Generate cross-validated estimates for each input data point

    Read more in the :ref:`User Guide <cross_validation>`.

    Parameters
    ----------
    estimator : estimator object implementing 'fit' and 'predict'
        The object to use to fit the data.

    X : array-like
        The data to fit. Can be, for example a list, or an array at least 2d.

    y : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use the default 3-fold cross-validation,
        - integer, to specify the number of folds.
        - An object to be used as a cross-validation generator.
        - An iterable yielding train/test splits.

        For integer/None inputs, if the estimator is a classifier and ``y`` is
        either binary or multiclass, :class:`StratifiedKFold` is used. In all
        other cases, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

    n_jobs : integer, optional
        The number of CPUs to use to do the computation. -1 means
        'all CPUs'.

    verbose : integer, optional
        The verbosity level.

    fit_params : dict, optional
        Parameters to pass to the fit method of the estimator.

    pre_dispatch : int, or string, optional
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. This parameter can be:

            - None, in which case all the jobs are immediately
              created and spawned. Use this for lightweight and
              fast-running jobs, to avoid delays due to on-demand
              spawning of the jobs

            - An int, giving the exact number of total jobs that are
              spawned

            - A string, giving an expression as a function of n_jobs,
              as in '2*n_jobs'

    Returns
    -------
    preds : ndarray
        This is the result of calling 'predict'
    """
    X, y = indexable(X, y)

    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))
    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs,
                        verbose=verbose,
                        pre_dispatch=pre_dispatch)
    preds_blocks = parallel(
        delayed(_fit_and_predict)(clone(estimator), X, y, train, test, verbose,
                                  fit_params) for train, test in cv)

    preds = [p for p, _ in preds_blocks]
    locs = np.concatenate([loc for _, loc in preds_blocks])
    if not _check_is_partition(locs, _num_samples(X)):
        raise ValueError('cross_val_predict only works for partitions')
    inv_locs = np.empty(len(locs), dtype=int)
    inv_locs[locs] = np.arange(len(locs))

    # Check for sparse predictions
    if sp.issparse(preds[0]):
        preds = sp.vstack(preds, format=preds[0].format)
    else:
        preds = np.concatenate(preds)
    return preds[inv_locs]
Example No. 14
    def fit(self, X, y, **kwargs):
        """
        Fit is the entry point for the visualizer. Given instances described
        by X and binary classes described in the target y, fit performs n
        trials by shuffling and splitting the dataset then computing the
        precision, recall, f1, and queue rate scores for each trial. The
        scores are aggregated by the quantiles expressed then drawn.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        y : ndarray or Series of length n
            An array or series of target or class values. The target y must
            be a binary classification target.

        kwargs: dict
            keyword arguments passed to Scikit-Learn API.

        Returns
        -------
        self : instance
            Returns the instance of the visualizer

        raises: YellowbrickValueError
            If the target y is not a binary classification target.
        """
        # Check target before metrics raise crazy exceptions
        if type_of_target(y) != 'binary':
            raise YellowbrickValueError("multiclass format is not supported")

        # Make arrays indexable for cross validation
        X, y = indexable(X, y)

        # TODO: parallelize trials with joblib (using sklearn utility)
        # NOTE: parallelization with matplotlib is tricky at best!
        trials = [
            metric for idx in range(self.n_trials)
            for metric in self._split_fit_score_trial(X, y, idx)
        ]

        # Compute maximum number of uniform thresholds across all trials
        n_thresholds = np.array([len(t['thresholds']) for t in trials]).min()
        self.thresholds_ = np.linspace(0.0, 1.0, num=n_thresholds)

        # Filter metrics and collect values for uniform thresholds
        metrics = frozenset(METRICS) - self._check_exclude(self.exclude)
        uniform_metrics = defaultdict(list)

        for trial in trials:
            rows = defaultdict(list)
            for t in self.thresholds_:
                idx = bisect.bisect_left(trial['thresholds'], t)
                for metric in metrics:
                    rows[metric].append(trial[metric][idx])

            for metric, row in rows.items():
                uniform_metrics[metric].append(row)

        # Convert metrics to metric arrays
        uniform_metrics = {
            metric: np.array(values)
            for metric, values in uniform_metrics.items()
        }

        # Perform aggregation and store cv_scores_
        quantiles = self._check_quantiles(self.quantiles)
        self.cv_scores_ = {}

        for metric, values in uniform_metrics.items():
            # Compute the lower, median, and upper plots
            lower, median, upper = mstats.mquantiles(values,
                                                     prob=quantiles,
                                                     axis=0)

            # Store the aggregates in cv scores
            self.cv_scores_[metric] = median
            self.cv_scores_["{}_lower".format(metric)] = lower
            self.cv_scores_["{}_upper".format(metric)] = upper

        # Draw and always return self
        self.draw()
        return self
Example No. 15
    def fit(self, X, y, sample_weight=None):
        """Fit the calibrated model

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.

        y : array-like, shape (n_samples,)
            Target values.

        sample_weight : array-like, shape = [n_samples] or None
            Sample weights. If None, then samples are equally weighted.

        Returns
        -------
        self : object
            Returns an instance of self.
        """

        X, y = check_X_y(X,
                         y,
                         accept_sparse=['csc', 'csr', 'coo'],
                         force_all_finite=False)
        X, y = indexable(X, y)

        df = self._preproc(X)

        weights = None
        if self.platts_trick:
            # Bayesian priors (see Platt end of section 2.2)
            prior0 = float(np.sum(y <= 0))
            prior1 = y.shape[0] - prior0

            weights = np.zeros_like(y).astype(float)
            weights[y > 0] = (prior1 + 1.) / (prior1 + 2.)
            weights[y <= 0] = 1. / (prior0 + 2.)
            y = np.append(np.ones_like(y), np.zeros_like(y))
            weights = np.append(weights, 1.0 - weights)
            df = np.append(df, df)

        if self.method is None:
            self.calibrator = _DummyCalibration()
        elif self.method == 'isotonic':
            self.calibrator = IsotonicRegression(out_of_bounds='clip')
        elif self.method == 'sksigmoid':
            self.calibrator = sk_sigmoid()
        elif self.method == 'sksigmoid_notrick':
            self.calibrator = sk_sigmoid_notrick()
        elif self.method == 'sigmoid':
            self.calibrator = _SigmoidCalibration()
        elif self.method == 'beta':
            self.calibrator = BetaCalibration(parameters="abm")
        elif self.method == 'beta_am':
            self.calibrator = BetaCalibration(parameters="am")
        elif self.method == 'beta_ab':
            self.calibrator = BetaCalibration(parameters="ab")
        elif self.method == 'beta_test_strict':
            self.calibrator = BetaCalibration(parameters="abm")
        elif self.method == 'beta_test_relaxed':
            self.calibrator = BetaCalibration(parameters="abm")
        elif self.method == 'beta_test':
            self.calibrator = _BetaTestedCalibration()
        else:
            raise ValueError('method should be None, "sigmoid", '
                             '"isotonic", "beta", "beta_am" or "beta_ab". '
                             'Got %s.' % self.method)
        self.calibrator.fit(df, y, weights)
        if self.method == 'beta':
            df_pos = df[y == 1]
            df_neg = df[y == 0]

            # alpha_pos_nll, beta_pos_nll = fit_beta_nll(df_pos)
            # alpha_neg_nll, beta_neg_nll = fit_beta_nll(df_neg)
            #
            # a_nll = alpha_pos_nll - alpha_neg_nll
            # b_nll = beta_neg_nll - beta_pos_nll
            # m_nll = fit_beta_midpoint(alpha_pos_nll, beta_pos_nll,
            #                           alpha_neg_nll, beta_neg_nll)

            alpha_pos_mmt, beta_pos_mmt = fit_beta_moments(df_pos)
            alpha_neg_mmt, beta_neg_mmt = fit_beta_moments(df_neg)

            a_mmt = alpha_pos_mmt - alpha_neg_mmt
            if a_mmt < 0 or np.isnan(a_mmt):
                a_mmt = 0
            b_mmt = beta_neg_mmt - beta_pos_mmt
            if b_mmt < 0 or np.isnan(b_mmt):
                b_mmt = 0
            prior_pos = len(df_pos) / len(df)
            prior_neg = len(df_neg) / len(df)
            m_mmt = fit_beta_midpoint(prior_pos, alpha_pos_mmt, beta_pos_mmt,
                                      prior_neg, alpha_neg_mmt, beta_neg_mmt)
            map = self.calibrator.calibrator_.map_
            #     if a_mmt > 4 and map[0] < 2:
            #         print [a_mmt, map[0]]
            #         print [b_mmt, map[1]]
            #         print [m_mmt, map[2]]
            #         exit()
            #     if b_mmt > 4 and map[1] < 2:
            #         print [a_mmt, map[0]]
            #         print [b_mmt, map[1]]
            #         print [m_mmt, map[2]]
            #         exit()
            self.a = [a_mmt, map[0]]
            self.b = [b_mmt, map[1]]
            self.m = [m_mmt, map[2]]
            self.df_pos = df_pos
            self.df_neg = df_neg
        return self
Example No. 16
    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape (n_samples,)
            Always ignored, exists for compatibility.
        groups : array-like, with shape (n_samples,)
            Always ignored, exists for compatibility.

        Yields
        ------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.
        """
        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        n_splits = self.n_splits
        n_folds = n_splits + 1
        gap_size = self.gap_size
        rollback_size = self.rollback_size
        if self.test_size is not None:
            test_size = self.test_size
        else:
            test_size = n_samples // n_folds

        # Make sure we have enough samples for the given split parameters
        if n_folds > n_samples:
            raise ValueError((f"Cannot have number of folds={n_folds} greater"
                              f" than the number of samples={n_samples}."))

        if rollback_size >= test_size:
            raise ValueError((f"test_size={test_size} should be strictly "
                              f"larger than rollback_size={rollback_size}"))

        first_test = n_samples - (test_size - rollback_size) * n_splits
        first_test -= rollback_size
        if first_test < 0:
            raise ValueError(
                (f"Too many splits={n_splits} for number of samples"
                 f"={n_samples} with test_size={test_size} and "
                 f"rollback_size ={rollback_size}."))

        indices = np.arange(n_samples)
        test_starts = range(first_test, n_samples, test_size - rollback_size)
        test_starts = test_starts[0:n_splits]

        for test_start in test_starts:
            train_end = test_start - gap_size
            if self.max_train_size and self.max_train_size < train_end:
                yield (indices[train_end - self.max_train_size:train_end],
                       indices[test_start:test_start + test_size])
            else:
                yield (indices[:max(train_end, 0)],
                       indices[test_start:test_start + test_size])
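
A standalone illustration of the split arithmetic above (not the splitter class itself): with ``n_samples=20``, ``n_splits=3``, ``test_size=4``, ``gap_size=1`` and ``rollback_size=1``, the test windows advance by ``test_size - rollback_size = 3`` and the gap drops the sample just before each test window from training:

import numpy as np

n_samples, n_splits, test_size, gap_size, rollback_size = 20, 3, 4, 1, 1
first_test = n_samples - (test_size - rollback_size) * n_splits - rollback_size
indices = np.arange(n_samples)
for test_start in list(range(first_test, n_samples, test_size - rollback_size))[:n_splits]:
    train_end = test_start - gap_size
    print(indices[:train_end], indices[test_start:test_start + test_size])
# -> train 0..8  / test 10..13
#    train 0..11 / test 13..16
#    train 0..14 / test 16..19
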
Example No. 17
def cross_val_predict_proba(estimator, X, y=None, cv=None, n_jobs=1,
                      verbose=0, fit_params=None, pre_dispatch='2*n_jobs'):
    """Generate cross-validated estimates for each input data point
    Parameters
    ----------
    estimator : estimator object implementing 'fit' and 'predict'
        The object to use to fit the data.
    X : array-like
        The data to fit. Can be, for example a list, or an array at least 2d.
    y : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.
    cv : cross-validation generator or int, optional, default: None
        A cross-validation generator to use. If int, determines
        the number of folds in StratifiedKFold if y is binary
        or multiclass and estimator is a classifier, or the number
        of folds in KFold otherwise. If None, it is equivalent to cv=3.
        This generator must include all elements in the test set exactly once.
        Otherwise, a ValueError is raised.
    n_jobs : integer, optional
        The number of CPUs to use to do the computation. -1 means
        'all CPUs'.
    verbose : integer, optional
        The verbosity level.
    fit_params : dict, optional
        Parameters to pass to the fit method of the estimator.
    pre_dispatch : int, or string, optional
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. This parameter can be:
            - None, in which case all the jobs are immediately
              created and spawned. Use this for lightweight and
              fast-running jobs, to avoid delays due to on-demand
              spawning of the jobs
            - An int, giving the exact number of total jobs that are
              spawned
            - A string, giving an expression as a function of n_jobs,
              as in '2*n_jobs'
    Returns
    -------
    probs : ndarray
        This is the result of calling 'predict_proba'
    """
    X, y = indexable(X, y)

    cv = _check_cv(cv, X, y, classifier=is_classifier(estimator))
    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)
    probs_blocks = parallel(delayed(_fit_and_predict_proba)(clone(estimator), X, y,
                                                      train, test, verbose,
                                                      fit_params)
                            for train, test in cv)
    p = np.concatenate([p for p, _ in probs_blocks])
    locs = np.concatenate([loc for _, loc in probs_blocks])
    if not _check_is_partition(locs, X.shape[0]):
        raise ValueError('cross_val_predict_proba only works for partitions')
    probs = p.copy()
    probs[locs] = p
    return probs
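
Once the out-of-fold probabilities are available they line up with the original sample order (that is what the partition check above enforces), so they can be scored directly; a short sketch with placeholder names:

from sklearn.metrics import roc_auc_score

probs = cross_val_predict_proba(estimator, X, y, cv=5)
# For a binary problem, take the positive-class column if a 2-D array comes back.
oof = probs if probs.ndim == 1 else probs[:, 1]
print(roc_auc_score(y, oof))
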
Example No. 18
def train_test_split(*arrays, **options):
    """Extend sklearn.model_selection.train_test_split to support group-aware splits.

    Parameters
    ----------
    *arrays : sequence of indexables with same length / shape[0]
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas dataframes.

    test_size : float, int or None, optional (default=None)
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to the
        complement of the train size. If ``train_size`` is also None, it will
        be set to 0.25.

    train_size : float, int, or None, (default=None)
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split. If
        int, represents the absolute number of train samples. If None,
        the value is automatically set to the complement of the test size.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    shuffle : None or str (default='simple')
        How to shuffle the data before splitting.
        None, no shuffle.
        For str, one of 'simple', 'stratified' and 'group', corresponding to
        `ShuffleSplit`, `StratifiedShuffleSplit` and `GroupShuffleSplit`,
        respectively.

    labels : array-like or None (default=None)
        Ignored if shuffle is None or 'simple'.
        When shuffle='stratified', this array is used as class labels.
        When shuffle='group', this array is used as groups.

    Returns
    -------
    splitting : list, length=2 * len(arrays)
        List containing train-test split of inputs.

    """
    n_arrays = len(arrays)
    if n_arrays == 0:
        raise ValueError("At least one array required as input")
    test_size = options.pop('test_size', None)
    train_size = options.pop('train_size', None)
    random_state = options.pop('random_state', None)
    shuffle = options.pop('shuffle', 'simple')
    labels = options.pop('labels', None)

    if options:
        raise TypeError("Invalid parameters passed: %s" % str(options))

    arrays = indexable(*arrays)

    n_samples = _num_samples(arrays[0])
    if shuffle == 'group':
        if labels is None:
            raise ValueError("When shuffle='group', "
                             "labels should not be None!")
        labels = check_array(labels, ensure_2d=False, dtype=None)
        uniques = np.unique(labels)
        n_samples = uniques.size

    n_train, n_test = _validate_shuffle_split(n_samples, test_size, train_size,
                                              default_test_size=0.25)

    shuffle_options = dict(test_size=n_test,
                           train_size=n_train,
                           random_state=random_state)

    if shuffle is None:
        if labels is not None:
            warnings.warn("The `labels` is ignored for "
                          "shuffle being None!")

        train = np.arange(n_train)
        test = np.arange(n_train, n_train + n_test)

    elif shuffle == 'simple':
        if labels is not None:
            warnings.warn("The `labels` is not needed and therefore "
                          "ignored for ShuffleSplit, as shuffle='simple'!")

        cv = ShuffleSplit(**shuffle_options)
        train, test = next(cv.split(X=arrays[0], y=None))

    elif shuffle == 'stratified':
        cv = StratifiedShuffleSplit(**shuffle_options)
        train, test = next(cv.split(X=arrays[0], y=labels))

    elif shuffle == 'group':
        cv = GroupShuffleSplit(**shuffle_options)
        train, test = next(cv.split(X=arrays[0], y=None, groups=labels))

    else:
        raise ValueError("The argument `shuffle` only supports None, "
                         "'simple', 'stratified' and 'group', but got `%s`!"
                         % shuffle)

    return list(chain.from_iterable((safe_indexing(a, train),
                                    safe_indexing(a, test)) for a in arrays))
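
A usage sketch of this extended ``train_test_split`` with the group-aware shuffle; with ``shuffle='group'`` the ``labels`` array is interpreted as group labels and whole groups land on one side of the split:

import numpy as np

X = np.arange(40).reshape(20, 2)
y = np.arange(20) % 2
groups = np.repeat(np.arange(5), 4)      # five groups of four samples

X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.2, shuffle='group', labels=groups, random_state=0)
# All samples of a given group end up on the same side of the split.
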
Example No. 19
    def fit(self, X, y, sample_weight=None):
        """Fit the calibrated model
        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.
        y : array-like, shape (n_samples,)
            Target values.
        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights. If None, then samples are equally weighted.
        Returns
        -------
        self : object
            Returns an instance of self.
        """
        X, y = indexable(X, y)
        le = LabelBinarizer().fit(y)
        self.classes_ = le.classes_

        # Check that each cross-validation fold can have at least one
        # example per class
        n_folds = self.cv if isinstance(self.cv, int) \
            else self.cv.n_folds if hasattr(self.cv, "n_folds") else None
        if n_folds and \
                np.any([np.sum(y == class_) < n_folds for class_ in
                        self.classes_]):
            raise ValueError("Requesting %d-fold cross-validation but provided"
                             " less than %d examples for at least one class."
                             % (n_folds, n_folds))

        self.calibrated_classifiers_ = []
        if self.base_estimator is None:
            # we want all classifiers that don't expose a random_state
            # to be deterministic (and we don't want to expose this one).
            base_estimator = LinearSVC(random_state=0)
        else:
            base_estimator = self.base_estimator

        if self.cv == "prefit":
            calibrated_classifier = _CalibratedClassifier(
                base_estimator, method=self.method)
            calibrated_classifier.fit(X, y, sample_weight)
            self.calibrated_classifiers_.append(calibrated_classifier)
        else:
            cv = check_cv(self.cv, y, classifier=True)
            fit_parameters = signature(base_estimator.fit).parameters
            base_estimator_supports_sw = "sample_weight" in fit_parameters

            if sample_weight is not None:
                sample_weight = _check_sample_weight(sample_weight, X)

                if not base_estimator_supports_sw:
                    estimator_name = type(base_estimator).__name__
                    warnings.warn("Since %s does not support sample_weights, "
                                  "sample weights will only be used for the "
                                  "calibration itself." % estimator_name)
            if self.ensemble:
                for train, test in cv.split(X, y):
                    this_estimator = clone(base_estimator)

                    if sample_weight is not None and \
                            base_estimator_supports_sw:
                        this_estimator.fit(X[train], y[train],
                                           sample_weight=sample_weight[train])
                    else:
                        this_estimator.fit(X[train], y[train])

                    calibrated_classifier = _CalibratedClassifier(
                        this_estimator, method=self.method,
                        classes=self.classes_)
                    sw = None if sample_weight is None else sample_weight[test]
                    calibrated_classifier.fit(X[test], y[test],
                                              sample_weight=sw)
                    self.calibrated_classifiers_.append(calibrated_classifier)
            else:
                if hasattr(base_estimator, "decision_function"):
                    base_estimator_method = "decision_function"
                elif hasattr(base_estimator, "predict_proba"):
                    base_estimator_method = "predict_proba"
                else:
                    raise RuntimeError('classifier has no decision_function '
                                       'or predict_proba method.')
                predictions = cross_val_predict(base_estimator, X, y, cv=cv,
                                                method=base_estimator_method)
                this_estimator = clone(base_estimator)
                if sample_weight is not None and base_estimator_supports_sw:
                    this_estimator.fit(X, y, sample_weight=sample_weight)
                else:
                    this_estimator.fit(X, y)
                calibrated_classifier = \
                    _CalibratedClassifier(this_estimator, method=self.method,
                                          classes=self.classes_,
                                          predictions_in_X=True)
                if hasattr(this_estimator, "decision_function"):
                    if predictions.ndim == 1:
                        predictions = predictions[:, np.newaxis]
                elif hasattr(this_estimator, "predict_proba"):
                    if len(self.classes_) == 2:
                        predictions = predictions[:, 1:]
                calibrated_classifier.fit(predictions, y, sample_weight)
                self.calibrated_classifiers_.append(calibrated_classifier)
        return self
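
When ensemble is False, the fit above boils down to: collect out-of-fold scores with cross_val_predict, refit the base estimator on all the data, and calibrate on the pooled scores. A rough standalone sketch of that idea using only public scikit-learn pieces (isotonic regression is just one possible calibrator; this is not the class's internal code):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

X, y = make_classification(n_samples=300, random_state=0)
base = LogisticRegression(max_iter=1000)

# Out-of-fold decision scores: each sample is scored by a model
# that never saw it during fitting.
oof_scores = cross_val_predict(base, X, y, cv=5, method="decision_function")

# Refit on all the data for later use, calibrate on the pooled OOF scores.
base.fit(X, y)
calibrator = IsotonicRegression(out_of_bounds="clip").fit(oof_scores, y)
calibrated_probs = calibrator.predict(base.decision_function(X))
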
Ejemplo n.º 20
0
def cross_validate_checkpoint(
    estimator,
    X,
    y=None,
    *,
    groups=None,
    scoring=None,
    cv=None,
    n_jobs=None,
    verbose=0,
    fit_params=None,
    pre_dispatch="2*n_jobs",
    return_train_score=False,
    return_estimator=False,
    error_score=np.nan,
    workdir=None,
    checkpoint=True,
    force_refresh=False,
    serialize_cv=False,
):
    """Evaluate metric(s) by cross-validation and also record fit/score times.

    This is a copy of :func:`sklearn:sklearn.model_selection.cross_validate`
    that uses :func:`_fit_and_score_ckpt` to checkpoint scores and estimators
    for each CV split.
    Read more in the :ref:`sklearn user guide <sklearn:multimetric_cross_validation>`.

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.

    X : array-like of shape (n_samples, n_features)
        The data to fit. Can be for example a list, or an array.

    y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None
        The target variable to try to predict in the case of
        supervised learning.

    groups : array-like of shape (n_samples,), default=None
        Group labels for the samples used while splitting the dataset into
        train/test set. Only used in conjunction with a "Group" :term:`cv`
        instance (e.g., :class:`sklearn:GroupKFold`).

    scoring : str, callable, list/tuple, or dict, default=None
        A single str (see :ref:`sklearn:scoring_parameter`) or a callable
        (see :ref:`sklearn:scoring`) to evaluate the predictions on the test set.

        For evaluating multiple metrics, either give a list of (unique) strings
        or a dict with names as keys and callables as values.

        NOTE that when using custom scorers, each scorer should return a single
        value. Metric functions returning a list/array of values can be wrapped
        into multiple scorers that return one value each.

        See :ref:`sklearn:multimetric_grid_search` for an example.

        If None, the estimator's score method is used.

    cv : int, cross-validation generator or an iterable, default=None
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use the default 5-fold cross validation,
        - int, to specify the number of folds in a `(Stratified)KFold`,
        - an sklearn `CV splitter <https://scikit-learn.org/stable/glossary.html#term-cv-splitter>`_,
        - An iterable yielding (train, test) splits as arrays of indices.

        For int/None inputs, if the estimator is a classifier and ``y`` is
        either binary or multiclass,
        :class:`sklearn.model_selection.StratifiedKFold` is used. In all
        other cases, :class:`sklearn.model_selection.KFold` is used.
        Refer :ref:`sklearn user guide <sklearn:cross_validation>` for the
        various cross-validation strategies that can be used here.

    n_jobs : int, default=None
        The number of CPUs to use to do the computation.
        ``None`` means 1 unless in a :obj:`joblib:joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`sklearn Glossary <sklearn:n_jobs>`
        for more details.

    verbose : int, default=0
        The verbosity level.

    fit_params : dict, default=None
        Parameters to pass to the fit method of the estimator.

    pre_dispatch : int or str, default='2*n_jobs'
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. This parameter can be:

            - None, in which case all the jobs are immediately
              created and spawned. Use this for lightweight and
              fast-running jobs, to avoid delays due to on-demand
              spawning of the jobs

            - An int, giving the exact number of total jobs that are
              spawned

            - A str, giving an expression as a function of n_jobs,
              as in '2*n_jobs'

    return_train_score : bool, default=False
        Whether to include train scores.
        Computing training scores is used to get insights on how different
        parameter settings impact the overfitting/underfitting trade-off.
        However computing the scores on the training set can be computationally
        expensive and is not strictly required to select the parameters that
        yield the best generalization performance.

    return_estimator : bool, default=False
        Whether to return the estimators fitted on each split.

    error_score : 'raise' or numeric
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is raised.
        If a numeric value is given, FitFailedWarning is raised. This parameter
        does not affect the refit step, which will always raise the error.

    workdir : path-like object, default=None
        A string or :term:`python:path-like-object` indicating the directory
        in which to store checkpoint files

    checkpoint : bool, default=True
        If True, checkpoint the parameters, estimators, and scores.

    force_refresh : bool, default=False
        If True, recompute scores even if the checkpoint file already exists.
        Otherwise, load scores from checkpoint files and return.

    serialize_cv : bool, default=False
        If True, do not use joblib.Parallel to evaluate each CV split.

    Returns
    -------
    scores : dict of float arrays of shape (n_splits,)
        Array of scores of the estimator for each run of the cross validation.

        A dict of arrays containing the score/time arrays for each scorer is
        returned. The possible keys for this ``dict`` are:

            ``test_score``
                The score array for test scores on each cv split.
                Suffix ``_score`` in ``test_score`` changes to a specific
                metric like ``test_r2`` or ``test_auc`` if there are
                multiple scoring metrics in the scoring parameter.
            ``train_score``
                The score array for train scores on each cv split.
                Suffix ``_score`` in ``train_score`` changes to a specific
                metric like ``train_r2`` or ``train_auc`` if there are
                multiple scoring metrics in the scoring parameter.
                This is available only if ``return_train_score`` parameter
                is ``True``.
            ``fit_time``
                The time for fitting the estimator on the train
                set for each cv split.
            ``score_time``
                The time for scoring the estimator on the test set for each
                cv split. (Note that the time for scoring on the train set is
                not included even if ``return_train_score`` is set to
                ``True``.)
            ``estimator``
                The estimator objects for each cv split.
                This is available only if ``return_estimator`` parameter
                is set to ``True``.

    Examples
    --------
    >>> import shutil
    >>> import tempfile
    >>> from sklearn import datasets, linear_model
    >>> from afqinsight import cross_validate_checkpoint
    >>> from sklearn.pipeline import make_pipeline
    >>> from sklearn.preprocessing import StandardScaler
    >>> diabetes = datasets.load_diabetes()
    >>> X = diabetes.data[:150]
    >>> y = diabetes.target[:150]
    >>> lasso = linear_model.Lasso()

    Single metric evaluation using ``cross_validate``

    >>> cv_results = cross_validate_checkpoint(lasso, X, y, cv=3, checkpoint=False)
    >>> sorted(cv_results.keys())
    ['fit_time', 'score_time', 'test_score']
    >>> cv_results['test_score']
    array([0.33150734, 0.08022311, 0.03531764])

    Multiple metric evaluation using ``cross_validate``, an estimator
    pipeline, and checkpointing (please refer the ``scoring`` parameter doc
    for more information)

    >>> tempdir = tempfile.mkdtemp()
    >>> scaler = StandardScaler()
    >>> pipeline = make_pipeline(scaler, lasso)
    >>> scores = cross_validate_checkpoint(pipeline, X, y, cv=3,
    ...                         scoring=('r2', 'neg_mean_squared_error'),
    ...                         return_train_score=True, checkpoint=True,
    ...                         workdir=tempdir, return_estimator=True)
    >>> shutil.rmtree(tempdir)
    >>> print(scores['test_neg_mean_squared_error'])
    [-2479.2... -3281.2... -3466.7...]
    >>> print(scores['train_r2'])
    [0.507... 0.602... 0.478...]

    See Also
    --------
    sklearn.model_selection.cross_val_score:
        Run cross-validation for single metric evaluation.
    sklearn.model_selection.cross_val_predict:
        Get predictions from each split of cross-validation for diagnostic
        purposes.
    sklearn.metrics.make_scorer:
        Make a scorer from a performance metric or loss function.
    """
    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    scorers, _ = _check_multimetric_scoring(estimator, scoring=scoring)

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    if serialize_cv:
        scores = [
            _fit_and_score_ckpt(
                workdir=workdir,
                checkpoint=checkpoint,
                force_refresh=force_refresh,
                estimator=clone(estimator),
                X=X,
                y=y,
                scorer=scorers,
                train=train,
                test=test,
                verbose=verbose,
                parameters=None,
                fit_params=fit_params,
                return_train_score=return_train_score,
                return_times=True,
                return_estimator=return_estimator,
                error_score=error_score,
            ) for train, test in cv.split(X, y, groups)
        ]
    else:
        parallel = Parallel(n_jobs=n_jobs,
                            verbose=verbose,
                            pre_dispatch=pre_dispatch)
        scores = parallel(
            delayed(_fit_and_score_ckpt)(
                workdir=workdir,
                checkpoint=checkpoint,
                force_refresh=force_refresh,
                estimator=clone(estimator),
                X=X,
                y=y,
                scorer=scorers,
                train=train,
                test=test,
                verbose=verbose,
                parameters=None,
                fit_params=fit_params,
                return_train_score=return_train_score,
                return_times=True,
                return_estimator=return_estimator,
                error_score=error_score,
            ) for train, test in cv.split(X, y, groups))

    zipped_scores = list(zip(*scores))
    if return_train_score:
        train_scores = zipped_scores.pop(0)
        train_scores = _aggregate_score_dicts(train_scores)
    if return_estimator:
        fitted_estimators = zipped_scores.pop()
    test_scores, fit_times, score_times = zipped_scores
    test_scores = _aggregate_score_dicts(test_scores)

    ret = {}
    ret["fit_time"] = np.array(fit_times)
    ret["score_time"] = np.array(score_times)

    if return_estimator:
        ret["estimator"] = fitted_estimators

    for name in scorers:
        ret["test_%s" % name] = np.array(test_scores[name])
        if return_train_score:
            key = "train_%s" % name
            ret[key] = np.array(train_scores[name])

    return ret
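
The checkpointing helper `_fit_and_score_ckpt` is not shown here; the pattern it implements is essentially "derive a key from the split, and skip the fit if a result file for that key already exists." A toy sketch of that pattern (the function name, file layout, and hashing scheme below are illustrative, not afqinsight's actual implementation):

import hashlib
import os
import pickle

def fit_and_score_with_cache(workdir, estimator, X, y, train, test, scorer):
    """Fit and score one CV split, reusing an on-disk result if present."""
    key_src = (tuple(map(int, train)), tuple(map(int, test)))
    key = hashlib.md5(repr(key_src).encode()).hexdigest()
    path = os.path.join(workdir, "split_%s.pkl" % key)

    if os.path.exists(path):                  # checkpoint hit: skip the fit
        with open(path, "rb") as fh:
            return pickle.load(fh)

    estimator.fit(X[train], y[train])         # checkpoint miss: fit and score
    result = {"test_score": scorer(estimator, X[test], y[test])}
    with open(path, "wb") as fh:
        pickle.dump(result, fh)
    return result
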
Ejemplo n.º 21
0
    def fit(self, X, y, **kwargs):
        """
        Fit is the entry point for the visualizer. Given instances described
        by X and binary classes described in the target y, fit performs n
        trials by shuffling and splitting the dataset, then computing the
        precision, recall, f1, and queue rate scores for each trial. The
        scores are aggregated at the requested quantiles and then drawn.

        Parameters
        ----------
        X : ndarray or DataFrame of shape n x m
            A matrix of n instances with m features

        y : ndarray or Series of length n
            An array or series of target or class values. The target y must
            be a binary classification target.

        kwargs: dict
            keyword arguments passed to Scikit-Learn API.

        Returns
        -------
        self : instance
            Returns the instance of the visualizer

        raises: YellowbrickValueError
            If the target y is not a binary classification target.
        """
        # Check target before metrics raise crazy exceptions
        if type_of_target(y) != 'binary':
            raise YellowbrickValueError("multiclass format is not supported")

        # Make arrays indexable for cross validation
        X, y = indexable(X, y)

        # TODO: parallelize trials with joblib (using sklearn utility)
        # NOTE: parallelization with matplotlib is tricky at best!
        trials = [
            metric
            for idx in range(self.n_trials)
            for metric in self._split_fit_score_trial(X, y, idx)
        ]

        # Compute maximum number of uniform thresholds across all trials
        n_thresholds = np.array([len(t['thresholds']) for t in trials]).min()
        self.thresholds_ = np.linspace(0.0, 1.0, num=n_thresholds)

        # Filter metrics and collect values for uniform thresholds
        metrics = frozenset(METRICS) - self._check_exclude(self.exclude)
        uniform_metrics = defaultdict(list)

        for trial in trials:
            rows = defaultdict(list)
            for t in self.thresholds_:
                idx = bisect.bisect_left(trial['thresholds'], t)
                for metric in metrics:
                    rows[metric].append(trial[metric][idx])

            for metric, row in rows.items():
                uniform_metrics[metric].append(row)

        # Convert metrics to metric arrays
        uniform_metrics = {
            metric: np.array(values)
            for metric, values in uniform_metrics.items()
        }

        # Perform aggregation and store cv_scores_
        quantiles = self._check_quantiles(self.quantiles)
        self.cv_scores_ = {}

        for metric, values in uniform_metrics.items():
            # Compute the lower, median, and upper plots
            lower, median, upper = mstats.mquantiles(
                values, prob=quantiles, axis=0
            )

            # Store the aggregates in cv scores
            self.cv_scores_[metric] = median
            self.cv_scores_["{}_lower".format(metric)] = lower
            self.cv_scores_["{}_upper".format(metric)] = upper

        # Draw and always return self
        self.draw()
        return self
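
Each trial can produce a different number of thresholds, so the loop above resamples every metric onto a shared uniform grid with bisect. A small standalone illustration of that resampling step (the threshold and precision values are made up):

import bisect
import numpy as np

# One trial's sorted thresholds and a metric evaluated at each of them
trial_thresholds = [0.1, 0.35, 0.6, 0.9]
trial_precision = [0.50, 0.62, 0.80, 0.95]

# Resample the metric onto a uniform threshold grid
uniform = np.linspace(0.0, 1.0, num=len(trial_thresholds))
resampled = []
for t in uniform:
    idx = bisect.bisect_left(trial_thresholds, t)
    idx = min(idx, len(trial_precision) - 1)   # clamp the right edge
    resampled.append(trial_precision[idx])

print(resampled)   # metric value at the first threshold >= each grid point
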
Ejemplo n.º 22
0
def _cross_val_predict(estimator,
                       X,
                       y=None,
                       *,
                       groups=None,
                       cv=None,
                       n_jobs=None,
                       verbose=0,
                       fit_params=None,
                       pre_dispatch='2*n_jobs',
                       method='predict',
                       safe=True):
    """This is a fork from :meth:`~sklearn.model_selection.cross_val_predict` to allow for
    non-safe cloning of the models for each fold.

    Parameters
    ----------
    estimator : estimator object implementing 'fit' and 'predict'
        The object to use to fit the data.

    X : array-like of shape (n_samples, n_features)
        The data to fit. Can be, for example a list, or an array at least 2d.

    y : array-like of shape (n_samples,) or (n_samples, n_outputs), \
            default=None
        The target variable to try to predict in the case of
        supervised learning.

    groups : array-like of shape (n_samples,), default=None
        Group labels for the samples used while splitting the dataset into
        train/test set. Only used in conjunction with a "Group" :term:`cv`
        instance (e.g., :class:`GroupKFold`).

    cv : int, cross-validation generator or an iterable, default=None
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use the default 5-fold cross validation,
        - int, to specify the number of folds in a `(Stratified)KFold`,
        - CV splitter,
        - An iterable yielding (train, test) splits as arrays of indices.

        For int/None inputs, if the estimator is a classifier and ``y`` is
        either binary or multiclass, :class:`StratifiedKFold` is used. In all
        other cases, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

        .. versionchanged:: 0.22
            ``cv`` default value if None changed from 3-fold to 5-fold.

    n_jobs : int, default=None
        The number of CPUs to use to do the computation.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    verbose : int, default=0
        The verbosity level.

    fit_params : dict, default=None
        Parameters to pass to the fit method of the estimator.

    pre_dispatch : int or str, default='2*n_jobs'
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. This parameter can be:

            - None, in which case all the jobs are immediately
              created and spawned. Use this for lightweight and
              fast-running jobs, to avoid delays due to on-demand
              spawning of the jobs

            - An int, giving the exact number of total jobs that are
              spawned

            - A str, giving an expression as a function of n_jobs,
              as in '2*n_jobs'

    method : str, default='predict'
        Invokes the passed method name of the passed estimator. For
        method='predict_proba', the columns correspond to the classes
        in sorted order.

    safe : bool, default=True
        Whether to clone with safe option.

    Returns
    -------
    predictions : ndarray
        This is the result of calling ``method``
    """
    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    splits = list(cv.split(X, y, groups))

    test_indices = np.concatenate([test for _, test in splits])
    if not _check_is_permutation(test_indices, _num_samples(X)):
        raise ValueError('cross_val_predict only works for partitions')

    # If classification methods produce multiple columns of output,
    # we need to manually encode classes to ensure consistent column ordering.
    encode = method in [
        'decision_function', 'predict_proba', 'predict_log_proba'
    ] and y is not None
    if encode:
        y = np.asarray(y)
        if y.ndim == 1:
            le = LabelEncoder()
            y = le.fit_transform(y)
        elif y.ndim == 2:
            y_enc = np.zeros_like(y, dtype=int)
            for i_label in range(y.shape[1]):
                y_enc[:, i_label] = LabelEncoder().fit_transform(y[:, i_label])
            y = y_enc

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs,
                        verbose=verbose,
                        pre_dispatch=pre_dispatch)
    predictions = parallel(
        delayed(_fit_and_predict)(clone(estimator, safe=safe), X, y, train,
                                  test, verbose, fit_params, method)
        for train, test in splits)
    from pkg_resources import parse_version
    if parse_version(sklearn.__version__) < parse_version("0.24.0"):
        # Prior to 0.24.0, this private scikit-learn method returned a tuple of two values
        predictions = [p[0] for p in predictions]

    inv_test_indices = np.empty(len(test_indices), dtype=int)
    inv_test_indices[test_indices] = np.arange(len(test_indices))

    if sp.issparse(predictions[0]):
        predictions = sp.vstack(predictions, format=predictions[0].format)
    elif encode and isinstance(predictions[0], list):
        # `predictions` is a list of method outputs from each fold.
        # If each of those is also a list, then treat this as a
        # multioutput-multiclass task. We need to separately concatenate
        # the method outputs for each label into an `n_labels` long list.
        n_labels = y.shape[1]
        concat_pred = []
        for i_label in range(n_labels):
            label_preds = np.concatenate([p[i_label] for p in predictions])
            concat_pred.append(label_preds)
        predictions = concat_pred
    else:
        predictions = np.concatenate(predictions)

    if isinstance(predictions, list):
        return [p[inv_test_indices] for p in predictions]
    else:
        return predictions[inv_test_indices]
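
The reordering at the end relies on a small indexing trick: fold predictions are concatenated in fold order, and `inv_test_indices` maps them back to the original sample order. A tiny numeric illustration of just that step:

import numpy as np

# Two folds whose test indices are not in original sample order
splits = [(None, np.array([2, 0])), (None, np.array([3, 1]))]
fold_predictions = [np.array([20.0, 0.0]), np.array([30.0, 10.0])]

test_indices = np.concatenate([test for _, test in splits])   # [2 0 3 1]
predictions = np.concatenate(fold_predictions)                # [20. 0. 30. 10.]

inv_test_indices = np.empty(len(test_indices), dtype=int)
inv_test_indices[test_indices] = np.arange(len(test_indices))

print(predictions[inv_test_indices])   # [ 0. 10. 20. 30.] -- sample order
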
Ejemplo n.º 23
0
    def fit(self, X, y, sample_weight=None):
        """Fit the calibrated model

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.

        y : array-like, shape (n_samples,)
            Target values.

        sample_weight : array-like, shape = [n_samples] or None
            Sample weights. If None, then samples are equally weighted.

        Returns
        -------
        self : object
            Returns an instance of self.
        """
        # X, y = check_X_y(X, y, accept_sparse=['csc', 'csr', 'coo'],
        # force_all_finite=False)
        X, y = indexable(X, y)
        lb = LabelBinarizer().fit(y)
        self.classes_ = lb.classes_

        # Check that each cross-validation fold can have at least one
        # example per class
        n_folds = (self.cv if isinstance(self.cv, int) else
                   self.cv.n_folds if hasattr(self.cv, 'n_folds') else None)
        if n_folds and np.any(
            [np.sum(y == class_) < n_folds for class_ in self.classes_]):
            raise ValueError('Requesting %d-fold cross-validation but provided'
                             ' less than %d examples for at least one class.' %
                             (n_folds, n_folds))

        self.calibrated_classifiers_ = []
        if self.base_estimator is None:
            # we want all classifiers that don't expose a random_state
            # to be deterministic (and we don't want to expose this one).
            base_estimator = LinearSVC(random_state=0)
        else:
            base_estimator = self.base_estimator

        if self.cv == 'prefit':
            calibrated_classifier = _CalibratedClassifier(
                base_estimator, method=self.method, score_type=self.score_type)
            if sample_weight is not None:
                calibrated_classifier.fit(X, y, sample_weight)
            else:
                calibrated_classifier.fit(X, y)
            self.calibrated_classifiers_.append(calibrated_classifier)
        else:
            cv = check_cv(self.cv, X, y, classifier=True)
            fit_parameters = signature(base_estimator.fit).parameters
            estimator_name = type(base_estimator).__name__
            if (sample_weight is not None
                    and 'sample_weight' not in fit_parameters):
                warnings.warn('%s does not support sample_weight. Samples'
                              ' weights are only used for the calibration'
                              ' itself.' % estimator_name)
                base_estimator_sample_weight = None
            else:
                base_estimator_sample_weight = sample_weight
            for train, test in cv:
                this_estimator = clone(base_estimator)
                if base_estimator_sample_weight is not None:
                    this_estimator.fit(
                        X[train],
                        y[train],
                        sample_weight=base_estimator_sample_weight[train],
                    )
                else:
                    this_estimator.fit(X[train], y[train])

                calibrated_classifier = _CalibratedClassifier(
                    this_estimator,
                    method=self.method,
                    score_type=self.score_type,
                )
                if sample_weight is not None:
                    calibrated_classifier.fit(X[test], y[test],
                                              sample_weight[test])
                else:
                    calibrated_classifier.fit(X[test], y[test])
                self.calibrated_classifiers_.append(calibrated_classifier)

        return self
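
The per-class guard near the top of fit is simple counting: with fewer than n_folds examples of some class, the n_folds disjoint test portions cannot all contain an example of that class. A tiny standalone version of that count (toy labels):

import numpy as np

y = np.array([0, 0, 0, 0, 0, 1, 1])   # only two examples of class 1
n_folds = 3

counts = {int(c): int(np.sum(y == c)) for c in np.unique(y)}
too_small = [c for c, n in counts.items() if n < n_folds]
print(counts)      # {0: 5, 1: 2}
print(too_small)   # [1] -> requesting 3 folds would trip the check above
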
Ejemplo n.º 24
0
def cross_val_predict(estimator,
                      y,
                      X=None,
                      cv=None,
                      verbose=0,
                      averaging="mean",
                      **kwargs):  # TODO: remove kwargs
    """Generate cross-validated estimates for each input data point

    Parameters
    ----------
    estimator : estimator
        An estimator object that implements the ``fit`` method

    y : array-like or iterable, shape=(n_samples,)
        The time-series array.

    X : array-like, shape=[n_obs, n_vars], optional (default=None)
        An optional 2-d array of exogenous variables.

    cv : BaseTSCrossValidator or None, optional (default=None)
        An instance of cross-validation. If None, will use a RollingForecastCV.
        Note that for cross-validation predictions, the CV step cannot exceed
        the CV horizon, or there will be a gap between fold predictions.

    verbose : integer, optional
        The verbosity level.

    averaging : str or callable, one of ["median", "mean"] (default="mean")
        Unlike normal CV, time series CV might have different folds (windows)
        forecasting the same time step. After all forecast windows are made,
        we build a matrix of y x n_folds, populating each fold's forecasts like
        so::

            nan nan nan  # training samples
            nan nan nan
            nan nan nan
            nan nan nan
              1 nan nan  # test samples
              4   3 nan
              3 2.5 3.5
            nan   6   5
            nan nan   4

        We then average each time step's forecasts to end up with our final
        prediction results.

    Examples
    --------
    >>> import pmdarima as pm
    >>> from pmdarima.model_selection import cross_val_predict,\
    ...     RollingForecastCV
    >>> y = pm.datasets.load_wineind()
    >>> cv = RollingForecastCV(h=14, step=12)
    >>> preds = cross_val_predict(
    ...     pm.ARIMA((1, 1, 2), seasonal_order=(0, 1, 1, 12)), y, cv=cv)
    >>> preds[:5]
    array([30710.45743168, 34902.94929722, 17994.16587163, 22127.71167249,
           25473.60876435])
    """
    # Temporary shim until we remove `exogenous` support completely
    X, _ = pm_compat.get_X(X, **kwargs)

    y, X = indexable(y, X)
    y = check_endog(y, copy=False)
    cv = check_cv(cv)
    avgfunc = _check_averaging(averaging)

    # need to be careful here:
    # >>> cv = RollingForecastCV(step=6, h=4)
    # >>> cv_generator = cv.split(wineind)
    # >>> next(cv_generator)
    # (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    #         15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
    #         30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
    #         45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57]),
    #  array([58, 59, 60, 61]))
    # >>> next(cv_generator)
    # (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    #         15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
    #         30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
    #         45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
    #         60, 61, 62, 63]),
    #  array([64, 65, 66, 67]))  <~~ 64 vs. 61
    if cv.step > cv.horizon:
        raise ValueError("CV step cannot be > CV horizon, or there will be a "
                         "gap in predictions between folds")

    # clone estimator to make sure all folds are independent
    prediction_blocks = [
        _fit_and_predict(fold,
                         base.clone(estimator),
                         y,
                         X,
                         train=train,
                         test=test,
                         verbose=verbose,)  # TODO: fit params?
        for fold, (train, test) in enumerate(cv.split(y, X))]

    # Unlike normal CV, time series CV might have different folds (windows)
    # forecasting the same time step. In this stage, we build a matrix of
    # y x n_folds, populating each fold's forecasts like so:

    pred_matrix = np.ones((y.shape[0], len(prediction_blocks))) * np.nan
    for i, (pred_block, test_indices) in enumerate(prediction_blocks):
        pred_matrix[test_indices, i] = pred_block

    # from there, we need to apply nanmean (or some other metric) along rows
    # to agree on a forecast for a sample.
    test_mask = ~(np.isnan(pred_matrix).all(axis=1))
    predictions = pred_matrix[test_mask]
    return avgfunc(predictions, axis=1)
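
The per-fold forecasts can be assembled exactly as the docstring's toy matrix suggests: each fold writes into its own column, and rows are averaged while ignoring NaNs. A few lines of numpy reproducing that matrix and the "mean" averaging step (avgfunc is assumed to behave like np.nanmean here):

import numpy as np

n_samples = 9
prediction_blocks = [
    (np.array([1.0, 4.0, 3.0]), np.array([4, 5, 6])),   # fold 0
    (np.array([3.0, 2.5, 6.0]), np.array([5, 6, 7])),   # fold 1
    (np.array([3.5, 5.0, 4.0]), np.array([6, 7, 8])),   # fold 2
]

pred_matrix = np.full((n_samples, len(prediction_blocks)), np.nan)
for i, (pred_block, test_indices) in enumerate(prediction_blocks):
    pred_matrix[test_indices, i] = pred_block

test_mask = ~np.isnan(pred_matrix).all(axis=1)
print(np.nanmean(pred_matrix[test_mask], axis=1))   # [1.  3.5 3.  5.5 4. ]
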
Ejemplo n.º 25
0
def repeated_cross_validate(estimator,
                            X,
                            y=None,
                            groups=None,
                            scoring=None,
                            cv=None,
                            n_jobs=1,
                            n_reps=1,
                            verbose=0,
                            fit_params=None,
                            pre_dispatch='2*n_jobs',
                            return_train_score="warn"):
    if len(cv) != n_reps:
        raise ValueError(
            "n_reps is set to {} but {} cross-validators were "
            "provided.".format(n_reps, len(cv)))

    n_folds = np.unique(
        [cross_validator.get_n_splits() for cross_validator in cv])
    if len(n_folds) != 1:
        raise ValueError(
            "Cross-validators do not all have the same number of folds: "
            "{}".format(n_folds))
    n_folds = n_folds[0]
    """Evaluate metric(s) by cross-validation and also record fit/score times.

        Read more in the :ref:`User Guide <multimetric_cross_validation>`.

        Parameters
        ----------
        estimator : estimator object implementing 'fit'
            The object to use to fit the data.

        X : array-like
            The data to fit. Can be for example a list, or an array.

        y : array-like, optional, default: None
            The target variable to try to predict in the case of
            supervised learning.

        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.

        scoring : string, callable, list/tuple, dict or None, default: None
            A single string (see :ref:`scoring_parameter`) or a callable
            (see :ref:`scoring`) to evaluate the predictions on the test set.

            For evaluating multiple metrics, either give a list of (unique) strings
            or a dict with names as keys and callables as values.

            NOTE that when using custom scorers, each scorer should return a single
            value. Metric functions returning a list/array of values can be wrapped
            into multiple scorers that return one value each.

            See :ref:`multimetric_grid_search` for an example.

            If None, the estimator's default scorer (if available) is used.

        cv : array-like, a collection of cross-validation generators, with length n_reps

            Refer :ref:`User Guide <cross_validation>` for the various
            cross-validation strategies that can be used here.

        n_jobs : integer, optional
            The number of CPUs to use to do the computation. -1 means
            'all CPUs'.

        verbose : integer, optional
            The verbosity level.

        fit_params : dict, optional
            Parameters to pass to the fit method of the estimator.

        pre_dispatch : int, or string, optional
            Controls the number of jobs that get dispatched during parallel
            execution. Reducing this number can be useful to avoid an
            explosion of memory consumption when more jobs get dispatched
            than CPUs can process. This parameter can be:

                - None, in which case all the jobs are immediately
                  created and spawned. Use this for lightweight and
                  fast-running jobs, to avoid delays due to on-demand
                  spawning of the jobs

                - An int, giving the exact number of total jobs that are
                  spawned

                - A string, giving an expression as a function of n_jobs,
                  as in '2*n_jobs'

        return_train_score : boolean, optional
            Whether to include train decision_scores.

            Current default is ``'warn'``, which behaves as ``True`` in addition
            to raising a warning when a training score is looked up.
            That default will be changed to ``False`` in 0.21.
            Computing training decision_scores is used to get insights on how different
            parameter settings impact the overfitting/underfitting trade-off.
            However computing the decision_scores on the training set can be computationally
            expensive and is not strictly required to select the parameters that
            yield the best generalization performance.

        Returns
        -------
        repeated_decision_scores : dict of `decision_scores` dicts, of shape=(n_reps,)
    """
    X, y, groups = indexable(X, y, groups)

    # cv = check_cv(cv, y, classifier=is_classifier(estimator))
    scorers, _ = _check_multimetric_scoring(estimator, scoring=scoring)

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs,
                        verbose=verbose,
                        pre_dispatch=pre_dispatch)
    # ---------------------- My Hack ----------------------- #
    # 1) Set parameter `error_score=-1` to `_fit_and_score`  #
    # 2) Created an argument `return_estimator` to           #
    #    `_fit_and_score`                                    #
    # ------------------------------------------------------ #
    tasks = [[
        delayed(_fit_and_score)(clone(estimator),
                                X,
                                y,
                                scorers,
                                train,
                                test,
                                verbose,
                                None,
                                fit_params,
                                return_train_score=return_train_score,
                                return_times=True,
                                return_estimator=True,
                                error_score=-1)
        for train, test in cross_validator.split(X, y, groups)
    ] for cross_validator in cv]
    # Flatten this list of lists into a simple list
    tasks = itertools.chain.from_iterable(tasks)
    scores = parallel(tasks)

    if return_train_score:
        train_scores, test_scores, fit_times, score_times, estimators = zip(
            *scores)
        train_scores = _aggregate_score_dicts(train_scores)
    else:
        test_scores, fit_times, score_times, estimators = zip(*scores)
    test_scores = _aggregate_score_dicts(test_scores)

    # TODO: replace by a dict in 0.21
    ret = DeprecationDict() if return_train_score == 'warn' else {}
    ret['fit_time'] = np.array(fit_times)
    ret['score_time'] = np.array(score_times)
    ret['estimator'] = list(estimators)

    for name in scorers:
        ret['test_%s' % name] = np.array(test_scores[name])
        if return_train_score:
            key = 'train_%s' % name
            ret[key] = np.array(train_scores[name])
            if return_train_score == 'warn':
                message = (
                    'You are accessing a training score ({!r}), '
                    'which will not be available by default '
                    'any more in 0.21. If you need training decision_scores, '
                    'please set return_train_score=True').format(key)
                # warn on key access
                ret.add_warning(key, message, FutureWarning)
    """
    Now `ret` is a dictionary whose values are all sequences of length `n_folds * n_reps`.
    Split it into `n_reps` sub-dictionaries whose values are of length `n_folds`
    """
    rep_rets = list(_split_dict(ret, chunk_size=n_folds))

    assert len(rep_rets) == n_reps

    for i in range(0, n_reps):
        rep_rets[i]["cross_validator"] = cv[i]

    result = dict(zip(range(0, n_reps), rep_rets))

    return result
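
`_split_dict` is not defined in this snippet; the intent is to cut every array-valued entry of `ret` into consecutive chunks of n_folds items, one chunk per repetition. A possible sketch under that assumption (not necessarily the original helper):

import numpy as np

def _split_dict(d, chunk_size):
    """Yield sub-dicts whose values are consecutive chunks of d's values."""
    n_total = len(next(iter(d.values())))
    for start in range(0, n_total, chunk_size):
        yield {key: values[start:start + chunk_size]
               for key, values in d.items()}

ret = {"fit_time": np.arange(6), "test_score": np.linspace(0.1, 0.6, 6)}
reps = list(_split_dict(ret, chunk_size=3))        # 2 repetitions x 3 folds
print(reps[0]["fit_time"], reps[1]["fit_time"])    # [0 1 2] [3 4 5]
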
Ejemplo n.º 26
0
def cross_val_predict(estimator,
                      X,
                      y=None,
                      *,
                      groups=None,
                      cv=None,
                      n_jobs=None,
                      verbose=0,
                      fit_params=None,
                      pre_dispatch='2*n_jobs',
                      method='predict'):
    """Generate cross-validated estimates for each input data point

    The data is split according to the cv parameter. Each sample belongs
    to exactly one test set, and its prediction is computed with an
    estimator fitted on the corresponding training set.

    Passing these predictions into an evaluation metric may not be a valid
    way to measure generalization performance. Results can differ from
    :func:`cross_validate` and :func:`cross_val_score` unless all tests sets
    have equal size and the metric decomposes over samples.

    Read more in the :ref:`User Guide <cross_validation>`.

    Parameters
    ----------
    estimator : estimator object implementing 'fit' and 'predict'
        The object to use to fit the data.

    X : array-like of shape (n_samples, n_features)
        The data to fit. Can be, for example a list, or an array at least 2d.

    y : array-like of shape (n_samples,) or (n_samples, n_outputs), \
            default=None
        The target variable to try to predict in the case of
        supervised learning.

    groups : array-like of shape (n_samples,), default=None
        Group labels for the samples used while splitting the dataset into
        train/test set. Only used in conjunction with a "Group" :term:`cv`
        instance (e.g., :class:`GroupKFold`).

    cv : int, cross-validation generator or an iterable, default=None
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use the default 5-fold cross validation,
        - int, to specify the number of folds in a `(Stratified)KFold`,
        - :term:`CV splitter`,
        - An iterable yielding (train, test) splits as arrays of indices.

        For int/None inputs, if the estimator is a classifier and ``y`` is
        either binary or multiclass, :class:`StratifiedKFold` is used. In all
        other cases, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

        .. versionchanged:: 0.22
            ``cv`` default value if None changed from 3-fold to 5-fold.

    n_jobs : int, default=None
        The number of CPUs to use to do the computation.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    verbose : int, default=0
        The verbosity level.

    fit_params : dict, default=None
        Parameters to pass to the fit method of the estimator.

    pre_dispatch : int or str, default='2*n_jobs'
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. This parameter can be:

            - None, in which case all the jobs are immediately
              created and spawned. Use this for lightweight and
              fast-running jobs, to avoid delays due to on-demand
              spawning of the jobs

            - An int, giving the exact number of total jobs that are
              spawned

            - A str, giving an expression as a function of n_jobs,
              as in '2*n_jobs'

    method : str, default='predict'
        Invokes the passed method name of the passed estimator. For
        method='predict_proba', the columns correspond to the classes
        in sorted order.

    Returns
    -------
    predictions : ndarray
        This is the result of calling ``method``

    See also
    --------
    cross_val_score : calculate score for each CV split

    cross_validate : calculate one or more scores and timings for each CV split

    Notes
    -----
    In the case that one or more classes are absent in a training portion, a
    default score needs to be assigned to all instances for that class if
    ``method`` produces columns per class, as in {'decision_function',
    'predict_proba', 'predict_log_proba'}.  For ``predict_proba`` this value is
    0.  In order to ensure finite output, we approximate negative infinity by
    the minimum finite float value for the dtype in other cases.

    Examples
    --------
    >>> from sklearn import datasets, linear_model
    >>> from sklearn.model_selection import cross_val_predict
    >>> diabetes = datasets.load_diabetes()
    >>> X = diabetes.data[:150]
    >>> y = diabetes.target[:150]
    >>> lasso = linear_model.Lasso()
    >>> y_pred = cross_val_predict(lasso, X, y, cv=3)
    """
    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))

    # If classification methods produce multiple columns of output,
    # we need to manually encode classes to ensure consistent column ordering.
    encode = method in [
        'decision_function', 'predict_proba', 'predict_log_proba'
    ] and y is not None
    if encode:
        y = np.asarray(y)
        if y.ndim == 1:
            le = LabelEncoder()
            y = le.fit_transform(y)
        elif y.ndim == 2:
            y_enc = np.zeros_like(y, dtype=int)
            for i_label in range(y.shape[1]):
                y_enc[:, i_label] = LabelEncoder().fit_transform(y[:, i_label])
            y = y_enc

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs,
                        verbose=verbose,
                        pre_dispatch=pre_dispatch)
    prediction_blocks = parallel(
        delayed(_fit_and_predict)(clone(estimator), X, y, train, test, verbose,
                                  fit_params, method)
        for train, test in cv.split(X, y, groups))

    # Concatenate the predictions
    predictions = [pred_block_i for pred_block_i, _, _ in prediction_blocks]
    test_indices = np.concatenate(
        [indices_i for _, indices_i, _ in prediction_blocks])
    estimators = [e for _, _, e in prediction_blocks]

    if not _check_is_permutation(test_indices, _num_samples(X)):
        raise ValueError('cross_val_predict only works for partitions')

    inv_test_indices = np.empty(len(test_indices), dtype=int)
    inv_test_indices[test_indices] = np.arange(len(test_indices))

    if sp.issparse(predictions[0]):
        predictions = sp.vstack(predictions, format=predictions[0].format)
    elif encode and isinstance(predictions[0], list):
        # `predictions` is a list of method outputs from each fold.
        # If each of those is also a list, then treat this as a
        # multioutput-multiclass task. We need to separately concatenate
        # the method outputs for each label into an `n_labels` long list.
        n_labels = y.shape[1]
        concat_pred = []
        for i_label in range(n_labels):
            label_preds = np.concatenate([p[i_label] for p in predictions])
            concat_pred.append(label_preds)
        predictions = concat_pred
    else:
        predictions = np.concatenate(predictions)

    if isinstance(predictions, list):
        return [p[inv_test_indices] for p in predictions], estimators
    else:
        return predictions[inv_test_indices], estimators
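
The partition check (`_check_is_permutation`) verifies that the concatenated test indices cover every sample exactly once; overlapping splitters such as ShuffleSplit fail it, which is why cross_val_predict rejects them. A minimal way to express the same condition:

import numpy as np
from sklearn.model_selection import KFold, ShuffleSplit

X = np.zeros((10, 2))

def is_partition(cv, X):
    test_indices = np.concatenate([test for _, test in cv.split(X)])
    return (len(test_indices) == len(X)
            and np.array_equal(np.sort(test_indices), np.arange(len(X))))

print(is_partition(KFold(n_splits=5), X))                        # True
print(is_partition(ShuffleSplit(n_splits=5, test_size=0.3), X))  # False
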
Ejemplo n.º 27
0
def cross_val_multiscore(estimator, X, y=None, groups=None, scoring=None,
                         cv=None, n_jobs=1, verbose=0, fit_params=None,
                         pre_dispatch='2*n_jobs'):
    """Evaluate a score by cross-validation.

    Parameters
    ----------
    estimator : instance of sklearn.base.BaseEstimator
        The object to use to fit the data.
        Must implement the 'fit' method.
    X : array-like, shape (n_samples, n_dimensional_features,)
        The data to fit. Can be, for example a list, or an array at least 2d.
    y : array-like, shape (n_samples, n_targets,)
        The target variable to try to predict in the case of
        supervised learning.
    groups : array-like, with shape (n_samples,)
        Group labels for the samples used while splitting the dataset into
        train/test set.
    scoring : string, callable | None
        A string (see model evaluation documentation) or
        a scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
    cv : int, cross-validation generator | iterable
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use the default 3-fold cross validation,
        - integer, to specify the number of folds in a ``(Stratified)KFold``,
        - An object to be used as a cross-validation generator.
        - An iterable yielding train, test splits.

        For integer/None inputs, if the estimator is a classifier and ``y`` is
        either binary or multiclass,
        :class:`sklearn.model_selection.StratifiedKFold` is used. In all
        other cases, :class:`sklearn.model_selection.KFold` is used.
    n_jobs : int, optional
        The number of CPUs to use to do the computation. -1 means
        'all CPUs'.
    verbose : int, optional
        The verbosity level.
    fit_params : dict, optional
        Parameters to pass to the fit method of the estimator.
    pre_dispatch : int, or string, optional
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. This parameter can be:

        - None, in which case all the jobs are immediately
          created and spawned. Use this for lightweight and
          fast-running jobs, to avoid delays due to on-demand
          spawning of the jobs
        - An int, giving the exact number of total jobs that are
          spawned
        - A string, giving an expression as a function of n_jobs,
          as in '2*n_jobs'

    Returns
    -------
    scores : array of float, shape (n_splits,) | shape (n_splits, n_scores)
        Array of scores of the estimator for each run of the cross validation.
    """
    # This code is copied from sklearn

    from sklearn.base import clone
    from sklearn.utils import indexable
    from sklearn.metrics.scorer import check_scoring
    from sklearn.model_selection._split import check_cv

    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    cv_iter = list(cv.split(X, y, groups))
    scorer = check_scoring(estimator, scoring=scoring)
    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    # Note: this parallelization is implemented using MNE Parallel
    parallel, p_func, n_jobs = parallel_func(_fit_and_score, n_jobs,
                                             pre_dispatch=pre_dispatch)
    scores = parallel(p_func(clone(estimator), X, y, scorer, train, test,
                             verbose, None, fit_params)
                      for train, test in cv_iter)
    return np.array(scores)[:, 0, ...]  # flatten over joblib output.
Ejemplo n.º 28
0
    def split(self, X, y=None, groups=None):
        X, y, groups = indexable(X, y, groups)
        for train, test, date_range in self._iter_indices(X, y, groups):
            yield train, test, date_range
Ejemplo n.º 29
0
    def fit(self, X, y, sample_weight=None):
        """Fit the calibrated model

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.

        y : array-like, shape (n_samples,)
            Target values.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights. If None, then samples are equally weighted.

        Returns
        -------
        self : object
            Returns an instance of self.
        """
        X, y = check_X_y(X,
                         y,
                         accept_sparse=['csc', 'csr', 'coo'],
                         force_all_finite=False,
                         allow_nd=True)
        X, y = indexable(X, y)
        le = LabelBinarizer().fit(y)
        self.classes_ = le.classes_

        # Check that each cross-validation fold can have at least one
        # example per class
        n_folds = self.cv if isinstance(self.cv, int) \
            else self.cv.n_folds if hasattr(self.cv, "n_folds") else None
        if n_folds and \
                np.any([np.sum(y == class_) < n_folds for class_ in
                        self.classes_]):
            raise ValueError("Requesting %d-fold cross-validation but provided"
                             " less than %d examples for at least one class." %
                             (n_folds, n_folds))

        self.calibrated_classifiers_ = []

        cv = check_cv(self.cv, y, classifier=True)
        fit_parameters = signature(self.base_estimator.fit).parameters
        estimator_name = type(self.base_estimator).__name__

        # Restructured to match the method of Platt (1999): train an
        # estimator per fold, collect the predictions into a single list,
        # then train the calibration model on the pooled predictions.

        parallel = Parallel(n_jobs=self.n_jobs)

        self.fit_estimators_ = parallel(
            delayed(_fit)(clone(self.base_estimator), X[train], y[train])
            for train, _ in cv.split(X, y))

        results = parallel(
            delayed(_predict)(estimator, X[test], y[test])
            for estimator, (_, test) in zip(self.fit_estimators_,
                                            cv.split(X, y)))

        cv_predictions = [item[0] for item in results]
        cv_targets = [item[1] for item in results]

        cv_predictions = list(itertools.chain.from_iterable(cv_predictions))
        cv_targets = list(itertools.chain.from_iterable(cv_targets))

        this_estimator = clone(self.base_estimator)

        # Re-fit base_estimator on the whole dataset
        refit_estimator = this_estimator.fit(X, y)

        calibrated_classifier = _CalibratedClassifier(refit_estimator,
                                                      method=self.method,
                                                      classes=self.classes_)

        # Fit the isotonic regression model.
        calibrated_classifier.fit(cv_predictions, cv_targets)
        self.calibrated_classifiers_.append(calibrated_classifier)

        return self
            def cross_validate(estimator,
                               X,
                               y=None,
                               groups=None,
                               scoring=None,
                               cv='warn',
                               n_jobs=None,
                               verbose=0,
                               fit_params=None,
                               pre_dispatch='2*n_jobs',
                               return_train_score=False,
                               return_estimator=False,
                               error_score='raise-deprecating'):

                X, y, groups = indexable(X, y, groups)

                cv = check_cv(cv, y, classifier=is_classifier(estimator))
                scorers, _ = _check_multimetric_scoring(estimator,
                                                        scoring=scoring)

                def _score(estimator,
                           X_test,
                           y_test,
                           scorer,
                           is_multimetric=False):

                    if is_multimetric:
                        return _multimetric_score(estimator, X_test, y_test,
                                                  scorer)
                    else:
                        if y_test is None:
                            score = scorer(estimator, X_test)
                        else:
                            score = scorer(estimator, X_test, y_test)

                        if hasattr(score, 'item'):
                            try:
                                # e.g. unwrap memmapped scalars
                                score = score.item()
                            except ValueError:
                                # non-scalar?
                                pass

                        if not isinstance(score, numbers.Number):
                            raise ValueError(
                                "scoring must return a number, got %s (%s) "
                                "instead. (scorer=%r)" %
                                (str(score), type(score), scorer))

                    return score

                def _multimetric_score(estimator, X_test, y_test, scorers):
                    """Return a dict of score for multimetric scoring."""
                    scores = {}

                    for name, scorer in scorers.items():
                        if y_test is None:
                            score = scorer(estimator, X_test)
                        else:
                            score = scorer(estimator, X_test, y_test)

                        if hasattr(score, 'item'):
                            try:
                                # e.g. unwrap memmapped scalars
                                score = score.item()
                            except ValueError:
                                # non-scalar?
                                pass
                        scores[name] = score

                        if not isinstance(score, numbers.Number):
                            raise ValueError(
                                "scoring must return a number, got %s (%s) "
                                "instead. (scorer=%s)" %
                                (str(score), type(score), name))
                    return scores

                def _aggregate_score_dicts(scores):

                    out = {}
                    for key in scores[0]:
                        out[key] = np.asarray([score[key] for score in scores])
                    return out

                def _fit_and_score(estimator,
                                   X,
                                   y,
                                   scorer,
                                   train,
                                   test,
                                   verbose,
                                   parameters,
                                   fit_params,
                                   return_train_score=False,
                                   return_parameters=False,
                                   return_n_test_samples=False,
                                   return_times=False,
                                   return_estimator=False,
                                   error_score='raise-deprecating'):

                    start_time = time.time()

                    if verbose > 1:
                        if parameters is None:
                            msg = ''
                        else:
                            msg = '%s' % (', '.join(
                                '%s=%s' % (k, v)
                                for k, v in parameters.items()))
                        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

                    # Adjust length of sample weights
                    fit_params = fit_params if fit_params is not None else {}
                    fit_params = dict([(k, _index_param_value(X, v, train))
                                       for k, v in fit_params.items()])

                    train_scores = {}
                    if parameters is not None:
                        estimator.set_params(**parameters)

                    X_train, y_train = _safe_split(estimator, X, y, train)
                    X_test, y_test = _safe_split(estimator, X, y, test, train)

                    is_multimetric = not callable(scorer)
                    n_scorers = len(scorer.keys()) if is_multimetric else 1

                    try:
                        #########################################
                        ############ FIT CALLED HERE ############
                        #########################################
                        if y_train is None:
                            estimator.fit(X_train, **fit_params)
                        else:
                            estimator.fit(X_train, y_train, **fit_params)
                        #########################################
                    except Exception as e:
                        # Note fit time as time until error
                        fit_time = time.time() - start_time
                        score_time = 0.0
                        if error_score == 'raise':
                            raise
                        elif error_score == 'raise-deprecating':
                            warnings.warn(
                                "From version 0.22, errors during fit will result "
                                "in a cross validation score of NaN by default. Use "
                                "error_score='raise' if you want an exception "
                                "raised or error_score=np.nan to adopt the "
                                "behavior from version 0.22.", FutureWarning)
                            raise
                        elif isinstance(error_score, numbers.Number):
                            if is_multimetric:
                                test_scores = dict(
                                    zip(scorer.keys(), [
                                        error_score,
                                    ] * n_scorers))
                                if return_train_score:
                                    train_scores = dict(
                                        zip(scorer.keys(), [
                                            error_score,
                                        ] * n_scorers))
                            else:
                                test_scores = error_score
                                if return_train_score:
                                    train_scores = error_score
                            warnings.warn(
                                "Estimator fit failed. The score on this train-test"
                                " partition for these parameters will be set to %f. "
                                "Details: \n%s" %
                                (error_score, format_exception_only(
                                    type(e), e)[0]), FitFailedWarning)
                        else:
                            raise ValueError(
                                "error_score must be the string 'raise' or a"
                                " numeric value. (Hint: if using 'raise', please"
                                " make sure that it has been spelled correctly.)"
                            )

                    else:
                        fit_time = time.time() - start_time
                        # _score will return dict if is_multimetric is True
                        test_scores = _score(estimator, X_test, y_test, scorer,
                                             is_multimetric)
                        score_time = time.time() - start_time - fit_time
                        if return_train_score:
                            train_scores = _score(estimator, X_train, y_train,
                                                  scorer, is_multimetric)

                    if verbose > 2:
                        if is_multimetric:
                            for scorer_name, score in test_scores.items():
                                msg += ", %s=%s" % (scorer_name, score)
                        else:
                            msg += ", score=%s" % test_scores
                    if verbose > 1:
                        total_time = score_time + fit_time
                        end_msg = "%s, total=%s" % (
                            msg, logger.short_format_time(total_time))
                        print("[CV] %s %s" %
                              ((64 - len(end_msg)) * '.', end_msg))

                    ret = [train_scores, test_scores
                           ] if return_train_score else [test_scores]

                    if return_n_test_samples:
                        ret.append(_num_samples(X_test))
                    if return_times:
                        ret.extend([fit_time, score_time])
                    if return_parameters:
                        ret.append(parameters)
                    if return_estimator:
                        ret.append(estimator)

                    return ret

                if not context:
                    parallel = Parallel(n_jobs=n_jobs,
                                        verbose=verbose,
                                        pre_dispatch=pre_dispatch)
                else:
                    parallel = cls.Parallel()

                # We clone the estimator to make sure that all the folds are
                # independent, and that it is pickle-able.
                scores = parallel(
                    delayed(_fit_and_score)(
                        clone(estimator),
                        X,
                        y,
                        scorers,
                        train,
                        test,
                        verbose,
                        None,
                        fit_params,
                        return_train_score=return_train_score,
                        return_times=True,
                        return_estimator=return_estimator,
                        error_score=error_score)
                    for train, test in cv.split(X, y, groups))

                zipped_scores = list(zip(*scores))
                if return_train_score:
                    train_scores = zipped_scores.pop(0)
                    train_scores = _aggregate_score_dicts(train_scores)
                if return_estimator:
                    fitted_estimators = zipped_scores.pop()
                test_scores, fit_times, score_times = zipped_scores
                test_scores = _aggregate_score_dicts(test_scores)

                ret = {}
                ret['fit_time'] = np.array(fit_times)
                ret['score_time'] = np.array(score_times)

                if return_estimator:
                    ret['estimator'] = fitted_estimators

                for name in scorers:
                    ret['test_%s' % name] = np.array(test_scores[name])
                    if return_train_score:
                        key = 'train_%s' % name
                        ret[key] = np.array(train_scores[name])

                return ret
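As a quick illustration of what the `_aggregate_score_dicts` helper above produces, here is its core expression applied to toy per-fold score dicts (the numbers are made up):

import numpy as np

fold_scores = [{"acc": 0.91, "f1": 0.88},
               {"acc": 0.89, "f1": 0.90},
               {"acc": 0.93, "f1": 0.91}]

aggregated = {key: np.asarray([s[key] for s in fold_scores])
              for key in fold_scores[0]}
print(aggregated)
# {'acc': array([0.91, 0.89, 0.93]), 'f1': array([0.88, 0.9 , 0.91])}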
Ejemplo n.º 31
0
def _wrapped_cross_val_score(sklearn_pipeline,
                             features,
                             target,
                             cv,
                             scoring_function,
                             sample_weight=None,
                             groups=None,
                             use_dask=False):
    """Fit estimator and compute scores for a given dataset split.

    Parameters
    ----------
    sklearn_pipeline : pipeline object implementing 'fit'
        The object to use to fit the data.
    features : array-like of shape at least 2D
        The data to fit.
    target : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.
    cv: cross-validation generator
        Object to be used as a cross-validation generator.
    scoring_function : callable
        A scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
    sample_weight : array-like, optional
        List of sample weights to balance (or un-balance) the dataset target as needed
    groups: array-like {n_samples, }, optional
        Group labels for the samples used while splitting the dataset into train/test set
    use_dask : bool, default False
        Whether to use dask
    """
    sample_weight_dict = set_sample_weight(sklearn_pipeline.steps,
                                           sample_weight)

    features, target, groups = indexable(features, target, groups)

    cv_iter = list(cv.split(features, target, groups))
    scorer = check_scoring(sklearn_pipeline, scoring=scoring_function)

    if use_dask:
        try:
            import dask_ml.model_selection  # noqa
            import dask  # noqa
            from dask.delayed import Delayed
        except Exception as e:
            msg = "'use_dask' requires the optional dask and dask-ml dependencies.\n{}".format(
                e)
            raise ImportError(msg)

        dsk, keys, n_splits = dask_ml.model_selection._search.build_graph(
            estimator=sklearn_pipeline,
            cv=cv,
            scorer=scorer,
            candidate_params=[{}],
            X=features,
            y=target,
            groups=groups,
            fit_params=sample_weight_dict,
            refit=False,
            error_score=float('-inf'),
        )

        cv_results = Delayed(keys[0], dsk)
        scores = [
            cv_results['split{}_test_score'.format(i)] for i in range(n_splits)
        ]
        CV_score = dask.delayed(np.array)(scores)[:, 0]
        return dask.delayed(np.nanmean)(CV_score)
    else:
        try:
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                scores = [
                    _fit_and_score(estimator=clone(sklearn_pipeline),
                                   X=features,
                                   y=target,
                                   scorer=scorer,
                                   train=train,
                                   test=test,
                                   verbose=0,
                                   parameters=None,
                                   error_score='raise',
                                   fit_params=sample_weight_dict)
                    for train, test in cv_iter
                ]
                CV_score = np.array(scores)[:, 0]
                CV_score_mean = np.nanmean(CV_score)
            return CV_score_mean
        except TimeoutException:
            return "Timeout"
        except Exception as e:
            return -float('inf')
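The non-dask branch above boils down to a per-fold fit/score loop followed by `np.nanmean`. A minimal, self-contained sketch of that loop using only public scikit-learn APIs (the dataset, estimator, and scorer below are illustrative):

import numpy as np
from sklearn.base import clone
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import get_scorer
from sklearn.model_selection import KFold

X, y = make_classification(n_samples=200, random_state=0)
pipeline = LogisticRegression(max_iter=1000)
scorer = get_scorer("accuracy")
cv_iter = list(KFold(n_splits=5, shuffle=True, random_state=0).split(X, y))

fold_scores = []
for train, test in cv_iter:
    est = clone(pipeline).fit(X[train], y[train])   # fresh clone per fold
    fold_scores.append(scorer(est, X[test], y[test]))
print(np.nanmean(fold_scores))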
def validation_curve(estimator, X, y, param_name, param_range, labels=None,
                     cv=None, scoring=None, n_jobs=1, pre_dispatch="all",
                     verbose=0):
    """Validation curve.

    Determine training and test scores for varying parameter values.

    Compute scores for an estimator with different values of a specified
    parameter. This is similar to grid search with one parameter. However, this
    will also compute training scores and is merely a utility for plotting the
    results.

    Read more in the :ref:`User Guide <learning_curve>`.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    param_name : string
        Name of the parameter that will be varied.

    param_range : array-like, shape (n_values,)
        The values of the parameter that will be evaluated.

    labels : array-like, with shape (n_samples,), optional
        Group labels for the samples used while splitting the dataset into
        train/test set.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross validation,
          - integer, to specify the number of folds in a `(Stratified)KFold`,
          - An object to be used as a cross-validation generator.
          - An iterable yielding train, test splits.

        For integer/None inputs, if the estimator is a classifier and ``y`` is
        either binary or multiclass, :class:`StratifiedKFold` is used. In all
        other cases, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

    scoring : string, callable or None, optional, default: None
        A string (see model evaluation documentation) or
        a scorer callable object / function with signature
        ``scorer(estimator, X, y)``.

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).

    pre_dispatch : integer or string, optional
        Number of predispatched jobs for parallel execution (default is
        all). The option can reduce the allocated memory. The string can
        be an expression like '2*n_jobs'.

    verbose : integer, optional
        Controls the verbosity: the higher, the more messages.

    Returns
    -------
    train_scores : array, shape (n_ticks, n_cv_folds)
        Scores on training sets.

    test_scores : array, shape (n_ticks, n_cv_folds)
        Scores on test set.

    Notes
    -----
    See
    :ref:`examples/model_selection/plot_validation_curve.py
    <example_model_selection_plot_validation_curve.py>`
    """
    X, y, labels = indexable(X, y, labels)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))

    scorer = check_scoring(estimator, scoring=scoring)

    parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch,
                        verbose=verbose)
    out = parallel(delayed(_fit_and_score)(
        estimator, X, y, scorer, train, test, verbose,
        parameters={param_name: v}, fit_params=None, return_train_score=True)
        for train, test in cv.split(X, y, labels) for v in param_range)

    out = np.asarray(out)[:, :2]
    n_params = len(param_range)
    n_cv_folds = out.shape[0] // n_params
    out = out.reshape(n_cv_folds, n_params, 2).transpose((2, 1, 0))

    return out[0], out[1]
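Assuming the private helpers this variant relies on (`Parallel`, `_fit_and_score`, `check_cv`, ...) are importable, it is called like the public scikit-learn `validation_curve`; the estimator and parameter grid below are illustrative:

import numpy as np
from sklearn.datasets import load_iris
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)
param_range = np.logspace(-3, 2, 6)
train_scores, test_scores = validation_curve(
    SVC(), X, y, param_name="gamma", param_range=param_range, cv=5)
print(train_scores.shape, test_scores.shape)  # (len(param_range), n_folds) each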
Ejemplo n.º 33
0
    def y(self):
        if self._y is None:
            self._x, self._y = indexable(*self._load_training_data())
        return self._y
Ejemplo n.º 34
0
def permutation_test_score(estimator,
                           X,
                           y,
                           labels=None,
                           cv=None,
                           n_permutations=100,
                           n_jobs=1,
                           random_state=0,
                           verbose=0,
                           scoring=None):
    """Evaluate the significance of a cross-validated score with permutations

    Read more in the :ref:`User Guide <cross_validation>`.

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.

    X : array-like of shape at least 2D
        The data to fit.

    y : array-like
        The target variable to try to predict in the case of
        supervised learning.

    labels : array-like, with shape (n_samples,), optional
        Group labels for the samples used while splitting the dataset into
        train/test set.

    scoring : string, callable or None, optional, default: None
        A string (see model evaluation documentation) or
        a scorer callable object / function with signature
        ``scorer(estimator, X, y)``.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross validation,
          - integer, to specify the number of folds in a `(Stratified)KFold`,
          - An object to be used as a cross-validation generator.
          - An iterable yielding train, test splits.

        For integer/None inputs, if the estimator is a classifier and ``y`` is
        either binary or multiclass, :class:`StratifiedKFold` is used. In all
        other cases, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

    n_permutations : integer, optional
        Number of times to permute ``y``.

    n_jobs : integer, optional
        The number of CPUs to use to do the computation. -1 means
        'all CPUs'.

    random_state : RandomState or an int seed (0 by default)
        A random number generator instance to define the state of the
        random permutations generator.

    verbose : integer, optional
        The verbosity level.

    Returns
    -------
    score : float
        The true score without permuting targets.

    permutation_scores : array, shape (n_permutations,)
        The scores obtained for each permutation.

    pvalue : float
        The returned value equals p-value if `scoring` returns bigger
        numbers for better scores (e.g., accuracy_score). If `scoring` is
        rather a loss function (i.e. when lower is better such as with
        `mean_squared_error`) then this is actually the complement of the
        p-value:  1 - p-value.

    Notes
    -----
    This function implements Test 1 in:

        Ojala and Garriga. Permutation Tests for Studying Classifier
        Performance.  The Journal of Machine Learning Research (2010)
        vol. 11

    """
    X, y, labels = indexable(X, y, labels)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    scorer = check_scoring(estimator, scoring=scoring)
    random_state = check_random_state(random_state)

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    score = _permutation_test_score(clone(estimator), X, y, labels, cv, scorer)
    permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose)(
        delayed(_permutation_test_score)(clone(estimator), X,
                                         _shuffle(y, labels, random_state),
                                         labels, cv, scorer)
        for _ in range(n_permutations))
    permutation_scores = np.array(permutation_scores)
    pvalue = (np.sum(permutation_scores >= score) + 1.0) / (n_permutations + 1)
    return score, permutation_scores, pvalue
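Assuming the private helpers used above (`_permutation_test_score`, `_shuffle`) are available, the call pattern mirrors scikit-learn's public function; the dataset and estimator here are illustrative:

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

X, y = load_iris(return_X_y=True)
score, perm_scores, pvalue = permutation_test_score(
    LogisticRegression(max_iter=1000), X, y,
    cv=5, n_permutations=100, scoring="accuracy", random_state=0)
print(score, perm_scores.shape, pvalue)  # true score, (100,), permutation p-value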
Ejemplo n.º 35
0
def _wrapped_cross_val_score(sklearn_pipeline, features, target,
                             cv, scoring_function, sample_weight=None, groups=None, index=None, print_individual_scores=False):
    """Fit estimator and compute scores for a given dataset split.
    Parameters
    ----------
    sklearn_pipeline : pipeline object implementing 'fit'
        The object to use to fit the data.
    features : array-like of shape at least 2D
        The data to fit.
    target : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.
    cv: int or cross-validation generator
        If cv is an integer, it is the number of folds used for k-fold
        cross-validation of each pipeline during the TPOT optimization
        process. Otherwise it is an object to be used as a cross-validation
        generator.
    scoring_function : callable
        A scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
    sample_weight : array-like, optional
        List of sample weights to balance (or un-balance) the dataset target as needed
    groups: array-like {n_samples, }, optional
        Group labels for the samples used while splitting the dataset into train/test set
    """
#    print(index, sklearn_pipeline.steps)
    sample_weight_dict = set_sample_weight(sklearn_pipeline.steps, sample_weight)

    features, target, groups = indexable(features, target, groups)

    cv = check_cv(cv, target, classifier=is_classifier(sklearn_pipeline))
    cv_iter = list(cv.split(features, target, groups))
    scorer = check_scoring(sklearn_pipeline, scoring=scoring_function)

    try:
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            scores = []
            fold = 0
            for train, test in cv_iter:
                estimator = clone(sklearn_pipeline)
                score = _fit_and_score(estimator=estimator,
                                       X=features,
                                       y=target,
                                       scorer=scorer,
                                       train=train,
                                       test=test,
                                       verbose=0,
                                       parameters=None,
                                       fit_params=sample_weight_dict)
                fold += 1
#                if print_individual_scores:
#                    print("%d (%d): %s" % (index, fold, score))
                scores.append(score)
            CV_score = np.array(scores)[:, 0]
            return np.nanmean(CV_score)
    except TimeoutException:
#        _logger.info("Evaluation timeout on %s" % sklearn_pipeline.steps)
        return "Timeout"
    except Exception as e:
        _logger.info(traceback.format_exc())
        return -float('inf')
Ejemplo n.º 36
0
def learning_curve(estimator,
                   X,
                   y,
                   labels=None,
                   train_sizes=np.linspace(0.1, 1.0, 5),
                   cv=None,
                   scoring=None,
                   exploit_incremental_learning=False,
                   n_jobs=1,
                   pre_dispatch="all",
                   verbose=0):
    """Learning curve.

    Determines cross-validated training and test scores for different training
    set sizes.

    A cross-validation generator splits the whole dataset k times in training
    and test data. Subsets of the training set with varying sizes will be used
    to train the estimator and a score for each training subset size and the
    test set will be computed. Afterwards, the scores will be averaged over
    all k runs for each training subset size.

    Read more in the :ref:`User Guide <learning_curve>`.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    labels : array-like, with shape (n_samples,), optional
        Group labels for the samples used while splitting the dataset into
        train/test set.

    train_sizes : array-like, shape (n_ticks,), dtype float or int
        Relative or absolute numbers of training examples that will be used to
        generate the learning curve. If the dtype is float, it is regarded as a
        fraction of the maximum size of the training set (that is determined
        by the selected validation method), i.e. it has to be within (0, 1].
        Otherwise it is interpreted as absolute sizes of the training sets.
        Note that for classification the number of samples usually has to
        be big enough to contain at least one sample from each class.
        (default: np.linspace(0.1, 1.0, 5))

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross validation,
          - integer, to specify the number of folds in a `(Stratified)KFold`,
          - An object to be used as a cross-validation generator.
          - An iterable yielding train, test splits.

        For integer/None inputs, if the estimator is a classifier and ``y`` is
        either binary or multiclass, :class:`StratifiedKFold` used. In all
        other cases, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

    scoring : string, callable or None, optional, default: None
        A string (see model evaluation documentation) or
        a scorer callable object / function with signature
        ``scorer(estimator, X, y)``.

    exploit_incremental_learning : boolean, optional, default: False
        If the estimator supports incremental learning, this will be
        used to speed up fitting for different training set sizes.

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).

    pre_dispatch : integer or string, optional
        Number of predispatched jobs for parallel execution (default is
        all). The option can reduce the allocated memory. The string can
        be an expression like '2*n_jobs'.

    verbose : integer, optional
        Controls the verbosity: the higher, the more messages.

    Returns
    -------
    train_sizes_abs : array, shape = (n_unique_ticks,), dtype int
        Numbers of training examples that have been used to generate the
        learning curve. Note that the number of ticks might be less
        than n_ticks because duplicate entries will be removed.

    train_scores : array, shape (n_ticks, n_cv_folds)
        Scores on training sets.

    test_scores : array, shape (n_ticks, n_cv_folds)
        Scores on test set.

    Notes
    -----
    See :ref:`examples/model_selection/plot_learning_curve.py
    <example_model_selection_plot_learning_curve.py>`
    """
    if exploit_incremental_learning and not hasattr(estimator, "partial_fit"):
        raise ValueError("An estimator must support the partial_fit interface "
                         "to exploit incremental learning")
    X, y, labels = indexable(X, y, labels)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    cv_iter = cv.split(X, y, labels)
    # Make a list since we will be iterating multiple times over the folds
    cv_iter = list(cv_iter)
    scorer = check_scoring(estimator, scoring=scoring)

    n_max_training_samples = len(cv_iter[0][0])
    # Because the lengths of folds can be significantly different, it is
    # not guaranteed that we use all of the available training data when we
    # use the first 'n_max_training_samples' samples.
    train_sizes_abs = _translate_train_sizes(train_sizes,
                                             n_max_training_samples)
    n_unique_ticks = train_sizes_abs.shape[0]
    if verbose > 0:
        print("[learning_curve] Training set sizes: " + str(train_sizes_abs))

    parallel = Parallel(n_jobs=n_jobs,
                        pre_dispatch=pre_dispatch,
                        verbose=verbose)
    if exploit_incremental_learning:
        classes = np.unique(y) if is_classifier(estimator) else None
        out = parallel(
            delayed(_incremental_fit_estimator)
            (clone(estimator), X, y, classes, train, test, train_sizes_abs,
             scorer, verbose) for train, test in cv.split(X, y, labels))
    else:
        out = parallel(
            delayed(_fit_and_score)(clone(estimator),
                                    X,
                                    y,
                                    scorer,
                                    train[:n_train_samples],
                                    test,
                                    verbose,
                                    parameters=None,
                                    fit_params=None,
                                    return_train_score=True)
            for train, test in cv_iter for n_train_samples in train_sizes_abs)
        out = np.array(out)[:, :2]
        n_cv_folds = out.shape[0] // n_unique_ticks
        out = out.reshape(n_cv_folds, n_unique_ticks, 2)

    out = np.asarray(out).transpose((2, 1, 0))

    return train_sizes_abs, out[0], out[1]
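A sketch of how this `learning_curve` would be invoked, assuming its private helpers (`_translate_train_sizes`, `_fit_and_score`, `_incremental_fit_estimator`) are importable; the dataset and estimator are illustrative:

import numpy as np
from sklearn.datasets import load_digits
from sklearn.naive_bayes import GaussianNB

X, y = load_digits(return_X_y=True)
sizes, train_scores, test_scores = learning_curve(
    GaussianNB(), X, y, train_sizes=np.linspace(0.1, 1.0, 5), cv=5)
print(sizes)                      # absolute training-set sizes actually used
print(test_scores.mean(axis=1))   # mean test score per training-set size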
def gbdt_feature_selector(data_dict,
                          gbdt_estimator,
                          feature_rank,
                          category_features,
                          cv_list=[StratifiedKFold(n_splits=5, shuffle=True, random_state=0)],
                          groups_list=[None],
                          weights_list=[1],
                          rounds=100,
                          step=1,
                          auc_diff_threshold=0,
                          auc_initial=0.55):
    '''
    Stepwise feature-selection function that adds variables incrementally, using a LightGBM-style model.
    data_dict: dict of datasets, with keys such as 'train', 'test_xxx', etc.
    gbdt_estimator: a GBDT-type estimator
    feature_rank: list of candidate features, sorted by some ranking criterion
    category_features: features to be treated as categorical variables
    cv_list: list of dataset-splitting strategies; defaults to a single 5-fold StratifiedKFold
    groups_list: list of group arguments for the splitting strategies; defaults to None
    weights_list: weights applied to the evaluation result of each splitting strategy
    rounds: total number of selection rounds
    step: number of variables added per round
    auc_diff_threshold: minimum AUC improvement required for a batch of features to be kept
    auc_initial: minimum AUC the first added variables must reach to enter the model

    return:
    feature_selected: the features finally selected into the model
    step_detail: dict of per-round CV details, keyed by 'roundx' where x is the round number
    step_outer_valid_statistic: outer_valid evaluation results of the rounds that were kept
    '''
    ## Preliminary checks
    ## Check that the three lists have the same length
    cv_list, groups_list, weights_list = indexable(cv_list, groups_list,
                                                   weights_list)
    ## Preparation
    print("Start feature selection".center(50, '='))
    each_round_start = range(0, rounds * step, step)
    ## Features already in the model before each round's evaluation
    feature_selected = []
    ## Detailed data for each round
    step_detail = {}
    ## Evaluation results of the feature batches finally kept in the model
    step_outer_valid_statistic = {}
    for i in each_round_start:
        print('*************rounds: %d****************' % (i / step + 1))
        ## Pick this round's candidate features
        feature_added = feature_rank[i:i + step]
        feature_used = feature_selected + feature_added
        category_feature_used = [
            i for i in feature_used if i in category_features
        ]
        print('Number of features in the model: ' + str(len(feature_used)))
        print('Features examined in this round:', feature_added)
        ## Store the results under each CV strategy
        cv_detail_result = {}
        cv_statistic_result = {}
        cv_best_iteration_result = {}
        cv_statistic = {}
        cv_outer_valid_statistic = {}
        for cv_index in range(len(cv_list)):
            fold_detail_result, fold_statistic_result, fold_best_iteration_result = gbdt_cv_evaluate_earlystop(
                data_dict=data_dict,
                gbdt_estimator=gbdt_estimator,
                total_features=feature_used,
                category_features=category_feature_used,
                cv=cv_list[cv_index],
                groups=groups_list[cv_index])
            cv_detail_result[cv_index] = fold_detail_result
            cv_statistic_result[cv_index] = fold_statistic_result
            cv_best_iteration_result[cv_index] = fold_best_iteration_result
            ## Extract the evaluation metrics
            fold_statistic = pd.DataFrame()
            for fold_key in fold_statistic_result.keys():
                temp_statistic = pd.DataFrame(
                    fold_statistic_result[fold_key]).T
                temp_statistic.columns = [
                    fold_key + '_' + col for col in temp_statistic.columns
                ]
                fold_statistic = pd.concat([fold_statistic, temp_statistic],
                                           axis=1)
            ## Columns containing the various evaluation metrics
            evaluation_map = {}
            ## Columns containing AUC values
            evaluation_map['auc'] = [
                i for i in fold_statistic.columns if '_auc' in i
            ]
            ## Columns containing KS values
            evaluation_map['ks'] = [
                i for i in fold_statistic.columns if '_ks' in i
            ]
            ## Compute the mean and standard deviation of each metric
            for evaluation_key in evaluation_map.keys():
                fold_statistic[evaluation_key + '_mean'] = fold_statistic[
                    evaluation_map[evaluation_key]].apply(lambda x: np.mean(x),
                                                          axis=1)
                fold_statistic[evaluation_key + '_std'] = fold_statistic[
                    evaluation_map[evaluation_key]].apply(lambda x: np.std(x),
                                                          axis=1)
            cv_statistic[cv_index] = fold_statistic
            cv_outer_valid_statistic[cv_index] = fold_statistic.loc[
                'outer_valid', [i + '_mean' for i in evaluation_map.keys()] +
                [i + '_std' for i in evaluation_map.keys()]]
        ## Evaluate the CV results on the outer_valid data, weighting the results across the different CV strategies
        outer_valid_statistic = pd.DataFrame(cv_outer_valid_statistic).apply(
            lambda x: np.dot(x, np.array(weights_list)), axis=1)
        ## Store the detailed data for this round
        current_step_detail = {}
        current_step_detail['auc_threshold'] = auc_initial
        current_step_detail['feature_initial'] = feature_selected
        current_step_detail['feature_added'] = feature_added
        current_step_detail['feature_used'] = feature_used
        current_step_detail['category_feature_used'] = category_feature_used
        current_step_detail['cv_detail_result'] = cv_detail_result
        current_step_detail[
            'cv_best_iteration_result'] = cv_best_iteration_result
        current_step_detail['cv_statistic'] = cv_statistic
        current_step_detail[
            'cv_outer_valid_statistic'] = cv_outer_valid_statistic
        current_step_detail['outer_valid_statistic'] = outer_valid_statistic
        current_step_detail['is_delete'] = (outer_valid_statistic['auc_mean'] -
                                            auc_initial) < auc_diff_threshold
        step_detail['round' + str(int(i / step + 1))] = current_step_detail
        print('Weighted evaluation result for this step: {0}; AUC threshold: {1} '
              '(includes the minimum per-step improvement {2})'.format(
                  outer_valid_statistic, auc_initial + auc_diff_threshold,
                  auc_diff_threshold))
        ## If the threshold is not met, do not add this batch of variables to the selected set
        if (outer_valid_statistic['auc_mean'] -
                auc_initial) < auc_diff_threshold:
            print('Dropping this batch of candidate features')
            continue
        ## Threshold met: add the batch to the selected set
        feature_selected = feature_used
        ## Update the AUC threshold
        auc_initial = outer_valid_statistic['auc_mean']
        ## Store the evaluation result of each round whose features entered the model
        step_outer_valid_statistic['round' +
                                   str(int(i / step +
                                           1))] = outer_valid_statistic
    return feature_selected, step_detail, step_outer_valid_statistic
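The `indexable` call at the top of `gbdt_feature_selector` doubles as a length check on the three configuration lists: `sklearn.utils.indexable` runs `check_consistent_length` on its arguments and raises if they disagree. A small sketch:

from sklearn.model_selection import StratifiedKFold
from sklearn.utils import indexable

cv_list = [StratifiedKFold(n_splits=5, shuffle=True, random_state=0)]
groups_list = [None]
weights_list = [1]

# Passes: all three lists have length 1
cv_list, groups_list, weights_list = indexable(cv_list, groups_list, weights_list)

# Would raise ValueError (inconsistent numbers of samples):
# indexable(cv_list + [None], groups_list, weights_list)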
Ejemplo n.º 38
0
def cross_val_score(estimator,
                    X,
                    y=None,
                    labels=None,
                    scoring=None,
                    cv=None,
                    n_jobs=1,
                    verbose=0,
                    fit_params=None,
                    pre_dispatch='2*n_jobs'):
    """Evaluate a score by cross-validation

    Read more in the :ref:`User Guide <cross_validation>`.

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.

    X : array-like
        The data to fit. Can be, for example a list, or an array at least 2d.

    y : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.

    labels : array-like, with shape (n_samples,), optional
        Group labels for the samples used while splitting the dataset into
        train/test set.

    scoring : string, callable or None, optional, default: None
        A string (see model evaluation documentation) or
        a scorer callable object / function with signature
        ``scorer(estimator, X, y)``.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross validation,
          - integer, to specify the number of folds in a `(Stratified)KFold`,
          - An object to be used as a cross-validation generator.
          - An iterable yielding train, test splits.

        For integer/None inputs, if the estimator is a classifier and ``y`` is
        either binary or multiclass, :class:`StratifiedKFold` is used. In all
        other cases, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

    n_jobs : integer, optional
        The number of CPUs to use to do the computation. -1 means
        'all CPUs'.

    verbose : integer, optional
        The verbosity level.

    fit_params : dict, optional
        Parameters to pass to the fit method of the estimator.

    pre_dispatch : int, or string, optional
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. This parameter can be:

            - None, in which case all the jobs are immediately
              created and spawned. Use this for lightweight and
              fast-running jobs, to avoid delays due to on-demand
              spawning of the jobs

            - An int, giving the exact number of total jobs that are
              spawned

            - A string, giving an expression as a function of n_jobs,
              as in '2*n_jobs'

    Returns
    -------
    scores : array of float, shape=(len(list(cv)),)
        Array of scores of the estimator for each run of the cross validation.

    See Also
    ---------
    :func:`sklearn.metrics.make_scorer`:
        Make a scorer from a performance metric or loss function.

    """
    X, y, labels = indexable(X, y, labels)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    scorer = check_scoring(estimator, scoring=scoring)
    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs,
                        verbose=verbose,
                        pre_dispatch=pre_dispatch)
    scores = parallel(
        delayed(_fit_and_score)(clone(estimator), X, y, scorer, train, test,
                                verbose, None, fit_params)
        for train, test in cv.split(X, y, labels))
    return np.array(scores)[:, 0]
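Assuming the private `_fit_and_score` imported earlier is available, this `cross_val_score` is called exactly like scikit-learn's public one and returns one test score per fold; the dataset and estimator below are illustrative:

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
scores = cross_val_score(DecisionTreeClassifier(random_state=0), X, y,
                         scoring="accuracy", cv=5)
print(scores)        # shape (5,): one test score per fold
print(scores.mean())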
Ejemplo n.º 39
0
def cross_validate(estimator,
                   y,
                   X=None,
                   scoring=None,
                   cv=None,
                   verbose=0,
                   error_score=np.nan,
                   **kwargs):  # TODO: remove kwargs
    """Evaluate metric(s) by cross-validation and also record fit/score times.

    Parameters
    ----------
    estimator : estimator
        An estimator object that implements the ``fit`` method

    y : array-like or iterable, shape=(n_samples,)
        The time-series array.

    X : array-like, shape=[n_obs, n_vars], optional (default=None)
        An optional 2-d array of exogenous variables.

    scoring : str or callable, optional (default=None)
        The scoring metric to use. If a callable, must adhere to the signature
        ``metric(true, predicted)``. Valid string scoring metrics include:

        - 'smape'
        - 'mean_absolute_error'
        - 'mean_squared_error'

    cv : BaseTSCrossValidator or None, optional (default=None)
        An instance of cross-validation. If None, will use a RollingForecastCV

    verbose : integer, optional
        The verbosity level.

    error_score : 'raise' or numeric
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is raised.
        If a numeric value is given, ModelFitWarning is raised. This parameter
        does not affect the refit step, which will always raise the error.
    """
    # Temporary shim until we remove `exogenous` support completely
    X, _ = pm_compat.get_X(X, **kwargs)

    y, X = indexable(y, X)
    y = check_endog(y, copy=False)

    cv = check_cv(cv)
    scoring = _check_scoring(scoring)

    # validate the error score
    if not (error_score == "raise" or isinstance(error_score, numbers.Number)):
        raise ValueError('error_score should be the string "raise" or a '
                         'numeric value')

    # TODO: in the future we might consider joblib for parallelizing, but it
    #   could cause cross-threading issues with nested parallelism.

    results = [
        _fit_and_score(fold,
                       base.clone(estimator),
                       y,
                       X,
                       scorer=scoring,
                       train=train,
                       test=test,
                       verbose=verbose,
                       error_score=error_score)
        for fold, (train, test) in enumerate(cv.split(y, X))]
    scores, fit_times, score_times = list(zip(*results))

    ret = {
        'test_score': np.array(scores),
        'fit_time': np.array(fit_times),
        'score_time': np.array(score_times),
    }
    return ret
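This variant takes the series ``y`` first and returns a plain dict of arrays. The snippet appears to follow pmdarima's ``model_selection.cross_validate``; assuming that, a call might look like the sketch below (the dataset, model order, and CV window sizes are illustrative):

import numpy as np
import pmdarima as pm
from pmdarima.model_selection import RollingForecastCV

y = pm.datasets.load_wineind()
cv = RollingForecastCV(h=12, step=12)
results = cross_validate(pm.ARIMA(order=(1, 1, 2)), y, scoring="smape", cv=cv)
print(sorted(results.keys()))           # ['fit_time', 'score_time', 'test_score']
print(np.mean(results["test_score"]))   # average SMAPE across folds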
def cross_val_score(estimator, X, y=None, labels=None, scoring=None, cv=None,
                    n_jobs=1, verbose=0, fit_params=None,
                    pre_dispatch='2*n_jobs'):
    """Evaluate a score by cross-validation

    Read more in the :ref:`User Guide <cross_validation>`.

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.

    X : array-like
        The data to fit. Can be, for example a list, or an array at least 2d.

    y : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.

    labels : array-like, with shape (n_samples,), optional
        Group labels for the samples used while splitting the dataset into
        train/test set.

    scoring : string, callable or None, optional, default: None
        A string (see model evaluation documentation) or
        a scorer callable object / function with signature
        ``scorer(estimator, X, y)``.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross validation,
          - integer, to specify the number of folds in a `(Stratified)KFold`,
          - An object to be used as a cross-validation generator.
          - An iterable yielding train, test splits.

        For integer/None inputs, if the estimator is a classifier and ``y`` is
        either binary or multiclass, :class:`StratifiedKFold` is used. In all
        other cases, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

    n_jobs : integer, optional
        The number of CPUs to use to do the computation. -1 means
        'all CPUs'.

    verbose : integer, optional
        The verbosity level.

    fit_params : dict, optional
        Parameters to pass to the fit method of the estimator.

    pre_dispatch : int, or string, optional
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. This parameter can be:

            - None, in which case all the jobs are immediately
              created and spawned. Use this for lightweight and
              fast-running jobs, to avoid delays due to on-demand
              spawning of the jobs

            - An int, giving the exact number of total jobs that are
              spawned

            - A string, giving an expression as a function of n_jobs,
              as in '2*n_jobs'

    Returns
    -------
    scores : array of float, shape=(len(list(cv)),)
        Array of scores of the estimator for each run of the cross validation.

    See Also
    ---------
    :func:`sklearn.metrics.make_scorer`:
        Make a scorer from a performance metric or loss function.

    """
    X, y, labels = indexable(X, y, labels)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    scorer = check_scoring(estimator, scoring=scoring)
    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)
    scores = parallel(delayed(_fit_and_score)(clone(estimator), X, y, scorer,
                                              train, test, verbose, None,
                                              fit_params)
                      for train, test in cv.split(X, y, labels))
    return np.array(scores)[:, 0]
Ejemplo n.º 41
0
def cross_validate(estimator,
                   X,
                   y=None,
                   groups=None,
                   scoring=None,
                   cv=None,
                   n_jobs=1,
                   verbose=0,
                   fit_params=None,
                   pre_dispatch='2*n_jobs',
                   return_train_score="warn"):
    """Evaluate metric(s) by cross-validation and also record fit/score times.

    Read more in the :ref:`User Guide <multimetric_cross_validation>`.

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.

    X : array-like
        The data to fit. Can be for example a list, or an array.

    y : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.

    groups : array-like, with shape (n_samples,), optional
        Group labels for the samples used while splitting the dataset into
        train/test set.

    scoring : string, callable, list/tuple, dict or None, default: None
        A single string (see :ref:`scoring_parameter`) or a callable
        (see :ref:`scoring`) to evaluate the predictions on the test set.

        For evaluating multiple metrics, either give a list of (unique) strings
        or a dict with names as keys and callables as values.

        NOTE that when using custom scorers, each scorer should return a single
        value. Metric functions returning a list/array of values can be wrapped
        into multiple scorers that return one value each.

        See :ref:`multimetric_grid_search` for an example.

        If None, the estimator's default scorer (if available) is used.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross validation,
          - integer, to specify the number of folds in a `(Stratified)KFold`,
          - An object to be used as a cross-validation generator.
          - An iterable yielding train, test splits.

        For integer/None inputs, if the estimator is a classifier and ``y`` is
        either binary or multiclass, :class:`StratifiedKFold` is used. In all
        other cases, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

    n_jobs : integer, optional
        The number of CPUs to use to do the computation. -1 means
        'all CPUs'.

    verbose : integer, optional
        The verbosity level.

    fit_params : dict, optional
        Parameters to pass to the fit method of the estimator.

    pre_dispatch : int, or string, optional
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. This parameter can be:

            - None, in which case all the jobs are immediately
              created and spawned. Use this for lightweight and
              fast-running jobs, to avoid delays due to on-demand
              spawning of the jobs

            - An int, giving the exact number of total jobs that are
              spawned

            - A string, giving an expression as a function of n_jobs,
              as in '2*n_jobs'

    return_train_score : boolean, optional
        Whether to include train scores.

        Current default is ``'warn'``, which behaves as ``True`` in addition
        to raising a warning when a training score is looked up.
        That default will be changed to ``False`` in 0.21.
        Computing training scores is used to get insights on how different
        parameter settings impact the overfitting/underfitting trade-off.
        However, computing the scores on the training set can be computationally
        expensive and is not strictly required to select the parameters that
        yield the best generalization performance.

    Returns
    -------
    decision_scores : dict of float arrays of shape=(n_splits,)
        Array of results of the estimator for each run of the cross validation.

        A dict of arrays containing the score/time arrays for each scorer is
        returned. The possible keys for this ``dict`` are:

            ``test_score``
                The score array for test scores on each cross-validation split.
            ``train_score``
                The score array for train scores on each cross-validation split.
                This is available only if the ``return_train_score`` parameter
                is ``True``.
            ``fit_time``
                The time for fitting the estimator on the train
                set for each cross-validation split.
            ``score_time``
                The time for scoring the estimator on the test set for each
                cross-validation split. (Note that the time for scoring on the
                train set is not included even if ``return_train_score`` is
                set to ``True``.)
            ``estimator``
                A list of estimator objects, one for each training dataset.

    Examples
    --------
    >>> from sklearn import datasets, linear_model
    >>> from sklearn.model_selection import cross_validate
    >>> from sklearn.metrics.scorer import make_scorer
    >>> from sklearn.metrics import confusion_matrix
    >>> from sklearn.svm import LinearSVC
    >>> diabetes = datasets.load_diabetes()
    >>> X = diabetes.data[:150]
    >>> y = diabetes.target[:150]
    >>> lasso = linear_model.Lasso()

    Single metric evaluation using ``cross_validate``

    >>> cv_results = cross_validate(lasso, X, y, return_train_score=False)
    >>> sorted(cv_results.keys())                         # doctest: +ELLIPSIS
    ['fit_time', 'score_time', 'test_score']
    >>> cv_results['test_score']    # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
    array([ 0.33...,  0.08...,  0.03...])

    Multiple metric evaluation using ``cross_validate``
    (please refer to the ``scoring`` parameter doc for more information)

    >>> decision_scores = cross_validate(lasso, X, y,
    ...                         scoring=('r2', 'neg_mean_squared_error'))
    >>> print(decision_scores['test_neg_mean_squared_error'])      # doctest: +ELLIPSIS
    [-3635.5... -3573.3... -6114.7...]
    >>> print(decision_scores['train_r2'])                         # doctest: +ELLIPSIS
    [ 0.28...  0.39...  0.22...]

    See Also
    ---------
    :func:`sklearn.model_selection.cross_val_score`:
        Run cross-validation for single metric evaluation.

    :func:`sklearn.metrics.make_scorer`:
        Make a scorer from a performance metric or loss function.

    """
    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    scorers, _ = _check_multimetric_scoring(estimator, scoring=scoring)

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs,
                        verbose=verbose,
                        pre_dispatch=pre_dispatch)
    # ---------------------- My Hack ----------------------- #
    # 1) Set parameter `error_score=-1` to `_fit_and_score`  #
    # 2) Created an argument `return_estimator` to           #
    #    `_fit_and_score`                                    #
    # ------------------------------------------------------ #
    scores = parallel(
        delayed(_fit_and_score)(clone(estimator),
                                X,
                                y,
                                scorers,
                                train,
                                test,
                                verbose,
                                None,
                                fit_params,
                                return_train_score=return_train_score,
                                return_times=True,
                                return_estimator=True,
                                error_score=-1)
        for train, test in cv.split(X, y, groups))

    if return_train_score:
        train_scores, test_scores, fit_times, score_times, estimators = zip(
            *scores)
        train_scores = _aggregate_score_dicts(train_scores)
    else:
        test_scores, fit_times, score_times, estimators = zip(*scores)
    test_scores = _aggregate_score_dicts(test_scores)

    # TODO: replace by a dict in 0.21
    ret = DeprecationDict() if return_train_score == 'warn' else {}
    ret['fit_time'] = np.array(fit_times)
    ret['score_time'] = np.array(score_times)
    ret['estimator'] = list(estimators)

    for name in scorers:
        ret['test_%s' % name] = np.array(test_scores[name])
        if return_train_score:
            key = 'train_%s' % name
            ret[key] = np.array(train_scores[name])
            if return_train_score == 'warn':
                message = (
                    'You are accessing a training score ({!r}), '
                    'which will not be available by default '
                    'any more in 0.21. If you need training scores, '
                    'please set return_train_score=True').format(key)
                # warn on key access
                ret.add_warning(key, message, FutureWarning)

    ret['cross_validator'] = cv

    return ret
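
A minimal usage sketch for this modified ``cross_validate`` (it relies on the hacked ``_fit_and_score`` described in the comments above; the dataset and metric below are illustrative):

# Usage sketch (assumes the modified cross_validate above is in scope).
from sklearn import datasets, linear_model

diabetes = datasets.load_diabetes()
X, y = diabetes.data[:150], diabetes.target[:150]

results = cross_validate(linear_model.Lasso(), X, y,
                         scoring=('r2',), return_train_score=False)
print(results['test_r2'])          # one score per fold
print(len(results['estimator']))   # one fitted Lasso per fold (from the hack above)
print(results['cross_validator'])  # the resolved CV splitter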
Ejemplo n.º 42
0
def permutation_test_score(estimator, X, y, data_train=None, cv=None,
                           n_permutations=100, n_jobs=1, labels=None,
                           random_state=0, verbose=0, scoring=None):
    """Evaluate the significance of a cross-validated score with permutations

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.

    X : array-like of shape at least 2D
        The data to fit.

    y : array-like
        The target variable to try to predict in the case of
        supervised learning.
        
    data_train : np.array, optional
        Data to train on, if data for training is different from X.

    scoring : string, callable or None, optional, default: None
        A string (see model evaluation documentation) or
        a scorer callable object / function with signature
        ``scorer(estimator, X, y)``.

    cv : integer or cross-validation generator, optional
        If an integer is passed, it is the number of folds (default 3).
        Specific cross-validation objects can be passed, see
        sklearn.cross_validation module for the list of possible objects.

    n_permutations : integer, optional
        Number of times to permute ``y``.

    n_jobs : integer, optional
        The number of CPUs to use to do the computation. -1 means
        'all CPUs'.

    labels : array-like of shape [n_samples] (optional)
        Labels constrain the permutation among groups of samples with
        the same label.

    random_state : RandomState or an int seed (0 by default)
        A random number generator instance to define the state of the
        random permutations generator.

    verbose : integer, optional
        The verbosity level.

    Returns
    -------
    score : float
        The true score without permuting targets.

    permutation_scores : array, shape = [n_permutations]
        The scores obtained for each permutation.

    pvalue : float
        The returned value equals p-value if `scoring` returns bigger
        numbers for better scores (e.g., accuracy_score). If `scoring` is
        rather a loss function (i.e. when lower is better such as with
        `mean_squared_error`) then this is actually the complement of the
        p-value:  1 - p-value.

    Notes
    -----
    This function implements Test 1 in:

        Ojala and Garriga. Permutation Tests for Studying Classifier
        Performance.  The Journal of Machine Learning Research (2010)
        vol. 11

    """
    X, y = indexable(X, y)
    cv = _check_cv(cv, X, y, classifier=is_classifier(estimator))
    scorer = check_scoring(estimator, scoring=scoring)
    random_state = check_random_state(random_state)
    
    if data_train is None:
        # We clone the estimator to make sure that all the folds are
        # independent, and that it is pickle-able.
        # Default behavior of sklearn permutation score
        score = _permutation_test_score(clone(estimator), X, y, cv, scorer)
        permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose)(
            delayed(_permutation_test_score)(
                clone(estimator), X, _shuffle(y, labels, random_state), cv,
                scorer)
            for _ in range(n_permutations))
    else:
        # Modification for 2pn
        # First, get the real score: train on data_train (actor), test on X (observer)
        score = []
        for train, test in cv:
            estimator.fit(data_train[train], y[train])
            score.append(scorer(estimator, X[test], y[test]))
        score = np.mean(score)
        # Then, get the permutation scores
        permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose)(
            delayed(_permutation_test_score)(
                clone(estimator), X, _shuffle(y, labels, random_state), cv,
                scorer, data_train)
            for _ in range(n_permutations))
                
    permutation_scores = np.array(permutation_scores)
    pvalue = (np.sum(permutation_scores >= score) + 1.0) / (n_permutations + 1)
    return score, permutation_scores, pvalue
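
A hypothetical call pattern for the ``data_train`` branch above; the arrays, estimator and explicit splits are illustrative, and the private helpers the snippet imports (``_check_cv``, ``_shuffle``, ``_permutation_test_score``) are assumed to be in scope and to accept these arguments:

# Sketch of the data_train branch: each fold is trained on the "actor" data,
# scored on the "observer" data, then compared against permuted labels.
import numpy as np
from sklearn.svm import SVC

rng = np.random.RandomState(0)
X_observer = rng.randn(60, 10)   # data passed as X (scored on)
X_actor = rng.randn(60, 10)      # data passed as data_train (fitted on)
y = rng.randint(0, 2, 60)

# Explicit (train, test) index pairs, so the `for train, test in cv` loop works.
idx = np.arange(60)
splits = [(idx[:40], idx[40:]),
          (idx[20:], idx[:20])]

score, perm_scores, pvalue = permutation_test_score(
    SVC(kernel='linear'), X_observer, y, data_train=X_actor,
    cv=splits, n_permutations=100, random_state=0)
print(score, pvalue)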
Ejemplo n.º 43
0
def gap_train_test_split(*arrays, **options):
    """Split arrays or matrices into random train and test subsets (with a gap)

    Parameters
    ----------
    *arrays : sequence of indexables with same length / shape[0]
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas dataframes.

    gap_size : float or int, default=0
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset between the training and the test set. If int,
        represents the absolute number of the dropped samples.

    test_size : float, int, or None, default=None
        If float, should be between 0.0 and 1.0 and equal to
        test / (train + test). If int, represents the absolute number of
        test samples. If None, the value is set to the complement of the
        train size and the gap. If `train_size` is also None,
        it will be set to 0.25.

    train_size : float, int, or None, default=None
        If float, should be between 0.0 and 1.0 and equal to
        train / (train + test). If int, represents the absolute number of
        train samples. If None, the value is automatically set to
        the complement of the test size and the gap size.

    Returns
    -------
    splitting : list, length=2 * len(arrays)
        List containing train-test split of inputs.

    Examples
    --------
    >>> import numpy as np
    >>> from tscv import gap_train_test_split
    >>> X, y = np.arange(10).reshape((5, 2)), range(5)
    >>> X
    array([[0, 1],
           [2, 3],
           [4, 5],
           [6, 7],
           [8, 9]])
    >>> list(y)
    [0, 1, 2, 3, 4]
    >>> X_train, X_test, y_train, y_test = gap_train_test_split(
    ...     X, y, test_size=0.33, gap_size=1)
    ...
    >>> X_train
    array([[0, 1],
           [2, 3],
           [4, 5]])
    >>> y_train
    [0, 1, 2]
    >>> X_test
    array([[8, 9]])
    >>> y_test
    [4]
    >>> gap_train_test_split(list(range(10)), gap_size=0.1)
    [[0, 1, 2, 3, 4, 5, 6], [8, 9]]
    """
    n_arrays = len(arrays)
    if n_arrays == 0:
        raise ValueError("At least one array required as input")
    check_consistent_length(*arrays)
    test_size = options.pop('test_size', None)
    train_size = options.pop('train_size', None)
    gap_size = options.pop('gap_size', 0)
    if not isinstance(gap_size, numbers.Real):
        raise TypeError("The gap size should be a real number.")

    if options:
        raise TypeError("Invalid parameters passed: %s. \n"
                        "Check the spelling of keyword parameters." %
                        str(options))

    arrays = indexable(*arrays)
    n_samples = _num_samples(arrays[0])

    def size_to_number(size, n):
        b, a = modf(size)
        return int(max(a, round(b * n)))

    n_gap = size_to_number(gap_size, n_samples)
    n_remain = n_samples - n_gap
    if test_size is None and train_size is None:
        test_size = 0.25
    if train_size is None:
        n_test = size_to_number(test_size, n_remain)
        n_train = n_remain - n_test
    elif test_size is None:
        n_train = size_to_number(train_size, n_remain)
        n_test = n_remain - n_train
    else:
        warnings.warn(
            "The train_size argument is overridden by test_size; "
            "in case of nonzero gap_size, "
            "an explicit value should be provided "
            "and cannot be implied by 1 - train_size - test_size.", Warning)
        n_test = size_to_number(test_size, n_remain)
        n_train = n_remain - n_test

    train = np.arange(n_train)
    test = np.arange(n_train + n_gap, n_samples)

    return list(
        chain.from_iterable((_safe_indexing(a, train), _safe_indexing(a, test))
                            for a in arrays))
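
A small sizing sketch for ``gap_train_test_split``: both ``gap_size`` and ``test_size`` may be fractions or absolute counts, and the test fraction is taken over the samples that remain once the gap has been removed.

# Sizing sketch (gap_train_test_split as defined above).
import numpy as np

X = np.arange(20).reshape(10, 2)
X_train, X_test = gap_train_test_split(X, test_size=0.25, gap_size=0.2)
# 10 samples: gap_size=0.2 -> 2 dropped rows; test_size=0.25 of the remaining
# 8 rows -> 2 test rows. Rows 0..5 train, rows 6..7 are skipped, rows 8..9 test.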
def learning_curve(estimator, X, y, labels=None,
                   train_sizes=np.linspace(0.1, 1.0, 5), cv=None, scoring=None,
                   exploit_incremental_learning=False, n_jobs=1,
                   pre_dispatch="all", verbose=0):
    """Learning curve.

    Determines cross-validated training and test scores for different training
    set sizes.

    A cross-validation generator splits the whole dataset k times in training
    and test data. Subsets of the training set with varying sizes will be used
    to train the estimator and a score for each training subset size and the
    test set will be computed. Afterwards, the scores will be averaged over
    all k runs for each training subset size.

    Read more in the :ref:`User Guide <learning_curve>`.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    labels : array-like, with shape (n_samples,), optional
        Group labels for the samples used while splitting the dataset into
        train/test set.

    train_sizes : array-like, shape (n_ticks,), dtype float or int
        Relative or absolute numbers of training examples that will be used to
        generate the learning curve. If the dtype is float, it is regarded as a
        fraction of the maximum size of the training set (that is determined
        by the selected validation method), i.e. it has to be within (0, 1].
        Otherwise it is interpreted as absolute sizes of the training sets.
        Note that for classification the number of samples usually has to
        be big enough to contain at least one sample from each class.
        (default: np.linspace(0.1, 1.0, 5))

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross validation,
          - integer, to specify the number of folds in a `(Stratified)KFold`,
          - An object to be used as a cross-validation generator.
          - An iterable yielding train, test splits.

        For integer/None inputs, if the estimator is a classifier and ``y`` is
        either binary or multiclass, :class:`StratifiedKFold` is used. In all
        other cases, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

    scoring : string, callable or None, optional, default: None
        A string (see model evaluation documentation) or
        a scorer callable object / function with signature
        ``scorer(estimator, X, y)``.

    exploit_incremental_learning : boolean, optional, default: False
        If the estimator supports incremental learning, this will be
        used to speed up fitting for different training set sizes.

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).

    pre_dispatch : integer or string, optional
        Number of predispatched jobs for parallel execution (default is
        all). The option can reduce the allocated memory. The string can
        be an expression like '2*n_jobs'.

    verbose : integer, optional
        Controls the verbosity: the higher, the more messages.

    Returns
    -------
    train_sizes_abs : array, shape = (n_unique_ticks,), dtype int
        Numbers of training examples that have been used to generate the
        learning curve. Note that the number of ticks might be less
        than n_ticks because duplicate entries will be removed.

    train_scores : array, shape (n_ticks, n_cv_folds)
        Scores on training sets.

    test_scores : array, shape (n_ticks, n_cv_folds)
        Scores on test set.

    Notes
    -----
    See :ref:`examples/model_selection/plot_learning_curve.py
    <example_model_selection_plot_learning_curve.py>`
    """
    if exploit_incremental_learning and not hasattr(estimator, "partial_fit"):
        raise ValueError("An estimator must support the partial_fit interface "
                         "to exploit incremental learning")
    X, y, labels = indexable(X, y, labels)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    cv_iter = cv.split(X, y, labels)
    # Make a list since we will be iterating multiple times over the folds
    cv_iter = list(cv_iter)
    scorer = check_scoring(estimator, scoring=scoring)

    n_max_training_samples = len(cv_iter[0][0])
    # Because the lengths of folds can be significantly different, it is
    # not guaranteed that we use all of the available training data when we
    # use the first 'n_max_training_samples' samples.
    train_sizes_abs = _translate_train_sizes(train_sizes,
                                             n_max_training_samples)
    n_unique_ticks = train_sizes_abs.shape[0]
    if verbose > 0:
        print("[learning_curve] Training set sizes: " + str(train_sizes_abs))

    parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch,
                        verbose=verbose)
    if exploit_incremental_learning:
        classes = np.unique(y) if is_classifier(estimator) else None
        out = parallel(delayed(_incremental_fit_estimator)(
            clone(estimator), X, y, classes, train, test, train_sizes_abs,
            scorer, verbose) for train, test in cv.split(X, y, labels))
    else:
        out = parallel(delayed(_fit_and_score)(
            clone(estimator), X, y, scorer, train[:n_train_samples], test,
            verbose, parameters=None, fit_params=None, return_train_score=True)
            for train, test in cv_iter
            for n_train_samples in train_sizes_abs)
        out = np.array(out)[:, :2]
        n_cv_folds = out.shape[0] // n_unique_ticks
        out = out.reshape(n_cv_folds, n_unique_ticks, 2)

    out = np.asarray(out).transpose((2, 1, 0))

    return train_sizes_abs, out[0], out[1]
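
A short usage sketch for this ``learning_curve`` (the private ``_translate_train_sizes``, ``_fit_and_score`` and ``_incremental_fit_estimator`` helpers it calls are assumed to be importable; estimator and dataset are illustrative):

# Usage sketch: scores come back with shape (n_ticks, n_cv_folds),
# so average over axis 1 before plotting.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.naive_bayes import GaussianNB

X, y = load_iris(return_X_y=True)
sizes, train_scores, test_scores = learning_curve(
    GaussianNB(), X, y, cv=5, train_sizes=np.linspace(0.1, 1.0, 5))
print(sizes)                     # absolute training-set sizes actually used
print(test_scores.mean(axis=1))  # mean validation score per training size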
def permutation_test_score(estimator, X, y, labels=None, cv=None,
                           n_permutations=100, n_jobs=1, random_state=0,
                           verbose=0, scoring=None):
    """Evaluate the significance of a cross-validated score with permutations

    Read more in the :ref:`User Guide <cross_validation>`.

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.

    X : array-like of shape at least 2D
        The data to fit.

    y : array-like
        The target variable to try to predict in the case of
        supervised learning.

    labels : array-like, with shape (n_samples,), optional
        Group labels for the samples used while splitting the dataset into
        train/test set.

    scoring : string, callable or None, optional, default: None
        A string (see model evaluation documentation) or
        a scorer callable object / function with signature
        ``scorer(estimator, X, y)``.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross validation,
          - integer, to specify the number of folds in a `(Stratified)KFold`,
          - An object to be used as a cross-validation generator.
          - An iterable yielding train, test splits.

        For integer/None inputs, if the estimator is a classifier and ``y`` is
        either binary or multiclass, :class:`StratifiedKFold` is used. In all
        other cases, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

    n_permutations : integer, optional
        Number of times to permute ``y``.

    n_jobs : integer, optional
        The number of CPUs to use to do the computation. -1 means
        'all CPUs'.

    random_state : RandomState or an int seed (0 by default)
        A random number generator instance to define the state of the
        random permutations generator.

    verbose : integer, optional
        The verbosity level.

    Returns
    -------
    score : float
        The true score without permuting targets.

    permutation_scores : array, shape (n_permutations,)
        The scores obtained for each permutation.

    pvalue : float
        The returned value equals p-value if `scoring` returns bigger
        numbers for better scores (e.g., accuracy_score). If `scoring` is
        rather a loss function (i.e. when lower is better such as with
        `mean_squared_error`) then this is actually the complement of the
        p-value:  1 - p-value.

    Notes
    -----
    This function implements Test 1 in:

        Ojala and Garriga. Permutation Tests for Studying Classifier
        Performance.  The Journal of Machine Learning Research (2010)
        vol. 11

    """
    X, y, labels = indexable(X, y, labels)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    scorer = check_scoring(estimator, scoring=scoring)
    random_state = check_random_state(random_state)

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    score = _permutation_test_score(clone(estimator), X, y, labels, cv, scorer)
    permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose)(
        delayed(_permutation_test_score)(
            clone(estimator), X, _shuffle(y, labels, random_state),
            labels, cv, scorer)
        for _ in range(n_permutations))
    permutation_scores = np.array(permutation_scores)
    pvalue = (np.sum(permutation_scores >= score) + 1.0) / (n_permutations + 1)
    return score, permutation_scores, pvalue
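
The ``+ 1.0`` in both the numerator and the denominator above is the usual permutation-test correction that counts the observed score as one of the permutations, so the p-value can never be exactly zero. A small numeric illustration:

# Numeric illustration of the p-value formula (values are made up).
import numpy as np

score = 0.92
permutation_scores = np.array([0.48, 0.55, 0.61, 0.95, 0.50])  # 5 permutations
pvalue = (np.sum(permutation_scores >= score) + 1.0) / (len(permutation_scores) + 1)
print(pvalue)  # (1 + 1) / (5 + 1) = 0.333...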
def _wrapped_cross_val_score(sklearn_pipeline, features, target,
                             cv, scoring_function, sample_weight=None,
                             groups=None, use_dask=False, predictions=None,
                             pipelines=None, features_test=None,
                             random_state=None):
    """Fit estimator and compute scores for a given dataset split.

    Parameters
    ----------
    sklearn_pipeline : pipeline object implementing 'fit'
        The object to use to fit the data.
    features : array-like of shape at least 2D
        The data to fit.
    target : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.
    cv : int or cross-validation generator
        If cv is a number, it is the number of folds over which each pipeline
        is evaluated with k-fold cross-validation during the TPOT optimization
        process. Otherwise it is an object to be used as a cross-validation
        generator.
    scoring_function : callable
        A scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
    sample_weight : array-like, optional
        List of sample weights to balance (or un-balance) the dataset target as needed
    groups: array-like {n_samples, }, optional
        Group labels for the samples used while splitting the dataset into train/test set
    use_dask : bool, default False
        Whether to use dask
    predictions : list, optional
        If provided, predictions of the fitted pipeline on ``features_test``
        are appended to this list as a side effect.
    pipelines : list, optional
        If provided, the fitted pipeline is appended to this list as a side
        effect.
    features_test : array-like, optional
        Held-out features used for the side-effect predictions above.
    random_state : int, optional
        Seed used to re-seed ``random`` and ``numpy.random`` inside the worker.
    """
    # Re-set random seeds inside the threads
    if random_state is not None:
        random.seed(random_state) # deap uses random
        np.random.seed(random_state)

    sample_weight_dict = set_sample_weight(sklearn_pipeline.steps, sample_weight)

    features, target, groups = indexable(features, target, groups)

    cv = check_cv(cv, target, classifier=is_classifier(sklearn_pipeline))
    cv_iter = list(cv.split(features, target, groups))
    scorer = check_scoring(sklearn_pipeline, scoring=scoring_function)

    # Save the sklearn predictions: the model is trained on the training set
    # (features) and evaluated on the held-out test data (features_test).
    # Note: because of the way TPOT is built, fit() is called here to check
    # that the pipeline is valid; invalid pipelines are simply skipped.
    try:
        tmp = sklearn_pipeline.fit(features, target)
        predictions.append(tmp.predict(features_test))
        pipelines.append(sklearn_pipeline)
    except Exception:
        pass

    if use_dask:
        try:
            import dask_ml.model_selection  # noqa
            import dask  # noqa
            from dask.delayed import Delayed
        except ImportError:
            msg = "'use_dask' requires the optional dask and dask-ml depedencies."
            raise ImportError(msg)

        dsk, keys, n_splits = dask_ml.model_selection._search.build_graph(
            estimator=sklearn_pipeline,
            cv=cv,
            scorer=scorer,
            candidate_params=[{}],
            X=features,
            y=target,
            groups=groups,
            fit_params=sample_weight_dict,
            refit=False,
            error_score=float('-inf'),
        )

        cv_results = Delayed(keys[0], dsk)
        scores = [cv_results['split{}_test_score'.format(i)]
                  for i in range(n_splits)]
        CV_score = dask.delayed(np.array)(scores)[:, 0]
        return dask.delayed(np.nanmean)(CV_score)
    else:
        try:
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                scores = [_fit_and_score(estimator=clone(sklearn_pipeline),
                                         X=features,
                                         y=target,
                                         scorer=scorer,
                                         train=train,
                                         test=test,
                                         verbose=0,
                                         parameters=None,
                                         fit_params=sample_weight_dict)
                          for train, test in cv_iter]
                CV_score = np.array(scores)[:, 0]
                return np.nanmean(CV_score)
        except TimeoutException:
            return "Timeout"
        except Exception:
            return -float('inf')
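
A hypothetical call pattern for this TPOT-style wrapper; ``set_sample_weight``, ``_fit_and_score`` and ``TimeoutException`` are assumed to be in scope, and the pipeline, split and scorer below are illustrative:

# Usage sketch: the wrapper returns the mean CV score and, as a side effect,
# appends the fitted pipeline and its predictions on features_test.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split as sk_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

X, y = load_iris(return_X_y=True)
X_tr, X_te, y_tr, y_te = sk_split(X, y, random_state=0)

predictions, pipelines = [], []  # filled as a side effect by the wrapper
pipe = make_pipeline(StandardScaler(), LogisticRegression())
cv_score = _wrapped_cross_val_score(
    pipe, X_tr, y_tr, cv=5,
    scoring_function='accuracy',  # check_scoring also accepts scorer names
    predictions=predictions, pipelines=pipelines,
    features_test=X_te, random_state=42)
print(cv_score)           # mean CV accuracy on the training split
print(len(predictions))   # 1: predictions on X_te from the side-effect fit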
Ejemplo n.º 47
0
def validation_curve(estimator,
                     X,
                     y,
                     param_name,
                     param_range,
                     labels=None,
                     cv=None,
                     scoring=None,
                     n_jobs=1,
                     pre_dispatch="all",
                     verbose=0):
    """Validation curve.

    Determine training and test scores for varying parameter values.

    Compute scores for an estimator with different values of a specified
    parameter. This is similar to grid search with one parameter. However, this
    will also compute training scores and is merely a utility for plotting the
    results.

    Read more in the :ref:`User Guide <learning_curve>`.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    param_name : string
        Name of the parameter that will be varied.

    param_range : array-like, shape (n_values,)
        The values of the parameter that will be evaluated.

    labels : array-like, with shape (n_samples,), optional
        Group labels for the samples used while splitting the dataset into
        train/test set.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross validation,
          - integer, to specify the number of folds in a `(Stratified)KFold`,
          - An object to be used as a cross-validation generator.
          - An iterable yielding train, test splits.

        For integer/None inputs, if the estimator is a classifier and ``y`` is
        either binary or multiclass, :class:`StratifiedKFold` is used. In all
        other cases, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

    scoring : string, callable or None, optional, default: None
        A string (see model evaluation documentation) or
        a scorer callable object / function with signature
        ``scorer(estimator, X, y)``.

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).

    pre_dispatch : integer or string, optional
        Number of predispatched jobs for parallel execution (default is
        all). The option can reduce the allocated memory. The string can
        be an expression like '2*n_jobs'.

    verbose : integer, optional
        Controls the verbosity: the higher, the more messages.

    Returns
    -------
    train_scores : array, shape (n_ticks, n_cv_folds)
        Scores on training sets.

    test_scores : array, shape (n_ticks, n_cv_folds)
        Scores on test set.

    Notes
    -----
    See
    :ref:`examples/model_selection/plot_validation_curve.py
    <example_model_selection_plot_validation_curve.py>`
    """
    X, y, labels = indexable(X, y, labels)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))

    scorer = check_scoring(estimator, scoring=scoring)

    parallel = Parallel(n_jobs=n_jobs,
                        pre_dispatch=pre_dispatch,
                        verbose=verbose)
    out = parallel(
        delayed(_fit_and_score)(clone(estimator),
                                X,
                                y,
                                scorer,
                                train,
                                test,
                                verbose,
                                parameters={param_name: v},
                                fit_params=None,
                                return_train_score=True)
        for train, test in cv.split(X, y, labels) for v in param_range)

    out = np.asarray(out)[:, :2]
    n_params = len(param_range)
    n_cv_folds = out.shape[0] // n_params
    out = out.reshape(n_cv_folds, n_params, 2).transpose((2, 1, 0))

    return out[0], out[1]
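
A short usage sketch for this ``validation_curve`` (parameter name, range, estimator and dataset are illustrative):

# Usage sketch: returned scores have shape (n_values, n_cv_folds).
import numpy as np
from sklearn.datasets import load_iris
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)
param_range = np.logspace(-3, 2, 6)
train_scores, test_scores = validation_curve(
    SVC(), X, y, param_name='C', param_range=param_range, cv=5)
print(train_scores.shape)        # (6, 5)
print(test_scores.mean(axis=1))  # mean validation score per value of C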
def cross_val_decision_function(estimator, X, y=None, cv=None, n_jobs=1,
                      verbose=0, fit_params=None, pre_dispatch='2*n_jobs'):
    """Generate cross-validated estimates for each input data point

    Read more in the :ref:`User Guide <cross_validation>`.

    Parameters
    ----------
    estimator : estimator object implementing 'fit' and 'predict'
        The object to use to fit the data.

    X : array-like
        The data to fit. Can be, for example a list, or an array at least 2d.

    y : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use the default 3-fold cross-validation,
        - integer, to specify the number of folds.
        - An object to be used as a cross-validation generator.
        - An iterable yielding train/test splits.

        For integer/None inputs, if the estimator is a classifier and ``y`` is
        either binary or multiclass, :class:`StratifiedKFold` is used. In all
        other cases, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

    n_jobs : integer, optional
        The number of CPUs to use to do the computation. -1 means
        'all CPUs'.

    verbose : integer, optional
        The verbosity level.

    fit_params : dict, optional
        Parameters to pass to the fit method of the estimator.

    pre_dispatch : int, or string, optional
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. This parameter can be:

            - None, in which case all the jobs are immediately
              created and spawned. Use this for lightweight and
              fast-running jobs, to avoid delays due to on-demand
              spawning of the jobs

            - An int, giving the exact number of total jobs that are
              spawned

            - A string, giving an expression as a function of n_jobs,
              as in '2*n_jobs'

    Returns
    -------
    preds : ndarray
        The cross-validated predictions produced by ``_fit_and_predict``,
        reordered to match the original sample order.
    """
    X, y = indexable(X, y)

    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))
    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)
    preds_blocks = parallel(delayed(_fit_and_predict)(clone(estimator), X, y,
                                                      train, test, verbose,
                                                      fit_params)
                            for train, test in cv)

    preds = [p for p, _ in preds_blocks]
    locs = np.concatenate([loc for _, loc in preds_blocks])
    if not _check_is_partition(locs, _num_samples(X)):
        raise ValueError('cross_val_decision_function only works '
                         'for partitions')
    inv_locs = np.empty(len(locs), dtype=int)
    inv_locs[locs] = np.arange(len(locs))

    # Check for sparse predictions
    if sp.issparse(preds[0]):
        preds = sp.vstack(preds, format=preds[0].format)
    else:
        preds = np.concatenate(preds)
    return preds[inv_locs]
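
A hedged usage sketch for ``cross_val_decision_function``; it relies on the legacy ``check_cv(cv, X, y, ...)`` signature and the ``_fit_and_predict`` helper the snippet imports, so the cv object must be directly iterable as ``for train, test in cv``:

# Usage sketch (estimator and dataset are illustrative).
from sklearn.datasets import load_iris
from sklearn.svm import LinearSVC

X, y = load_iris(return_X_y=True)
scores = cross_val_decision_function(LinearSVC(), X, y, cv=3)
print(scores.shape)  # one row per input sample, in the original sample order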
Ejemplo n.º 49
0
def _wrapped_cross_val_score(sklearn_pipeline, features, target,
                             cv, scoring_function, sample_weight=None,
                             groups=None, use_dask=False):
    """Fit estimator and compute scores for a given dataset split.

    Parameters
    ----------
    sklearn_pipeline : pipeline object implementing 'fit'
        The object to use to fit the data.
    features : array-like of shape at least 2D
        The data to fit.
    target : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.
    cv : int or cross-validation generator
        If cv is a number, it is the number of folds over which each pipeline
        is evaluated with k-fold cross-validation during the TPOT optimization
        process. Otherwise it is an object to be used as a cross-validation
        generator.
    scoring_function : callable
        A scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
    sample_weight : array-like, optional
        List of sample weights to balance (or un-balance) the dataset target as needed
    groups: array-like {n_samples, }, optional
        Group labels for the samples used while splitting the dataset into train/test set
    use_dask : bool, default False
        Whether to use dask
    """
    sample_weight_dict = set_sample_weight(sklearn_pipeline.steps, sample_weight)

    features, target, groups = indexable(features, target, groups)

    cv = check_cv(cv, target, classifier=is_classifier(sklearn_pipeline))
    cv_iter = list(cv.split(features, target, groups))
    scorer = check_scoring(sklearn_pipeline, scoring=scoring_function)

    if use_dask:
        try:
            import dask_ml.model_selection  # noqa
            import dask  # noqa
            from dask.delayed import Delayed
        except ImportError:
            msg = "'use_dask' requires the optional dask and dask-ml depedencies."
            raise ImportError(msg)

        dsk, keys, n_splits = dask_ml.model_selection._search.build_graph(
            estimator=sklearn_pipeline,
            cv=cv,
            scorer=scorer,
            candidate_params=[{}],
            X=features,
            y=target,
            groups=groups,
            fit_params=sample_weight_dict,
            refit=False,
            error_score=float('-inf'),
        )

        cv_results = Delayed(keys[0], dsk)
        scores = [cv_results['split{}_test_score'.format(i)]
                  for i in range(n_splits)]
        CV_score = dask.delayed(np.array)(scores)[:, 0]
        return dask.delayed(np.nanmean)(CV_score)
    else:
        try:
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                scores = [_fit_and_score(estimator=clone(sklearn_pipeline),
                                         X=features,
                                         y=target,
                                         scorer=scorer,
                                         train=train,
                                         test=test,
                                         verbose=0,
                                         parameters=None,
                                         fit_params=sample_weight_dict)
                          for train, test in cv_iter]
                CV_score = np.array(scores)[:, 0]
                return np.nanmean(CV_score)
        except TimeoutException:
            return "Timeout"
        except Exception:
            return -float('inf')
Ejemplo n.º 50
0
def train_test_split(*arrays, **options):
    """Extend sklearn.model_selection.train_test_slit to have group split.

    Parameters
    ----------
    *arrays : sequence of indexables with same length / shape[0]
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas dataframes.

    test_size : float, int or None, optional (default=None)
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to the
        complement of the train size. If ``train_size`` is also None, it will
        be set to 0.25.

    train_size : float, int, or None, (default=None)
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split. If
        int, represents the absolute number of train samples. If None,
        the value is automatically set to the complement of the test size.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    shuffle : None or str (default='simple')
        How to shuffle the data before splitting.
        If None, no shuffling is performed.
        If a str, one of 'simple', 'stratified' and 'group', corresponding to
        `ShuffleSplit`, `StratifiedShuffleSplit` and `GroupShuffleSplit`,
        respectively.

    labels : array-like or None (default=None)
        Ignored if shuffle is None or 'simple'.
        When shuffle='stratified', this array is used as class labels.
        When shuffle='group', this array is used as groups.

    Returns
    -------
    splitting : list, length=2 * len(arrays)
        List containing train-test split of inputs.

    """
    n_arrays = len(arrays)
    if n_arrays == 0:
        raise ValueError("At least one array required as input")
    test_size = options.pop('test_size', None)
    train_size = options.pop('train_size', None)
    random_state = options.pop('random_state', None)
    shuffle = options.pop('shuffle', 'simple')
    labels = options.pop('labels', None)

    if options:
        raise TypeError("Invalid parameters passed: %s" % str(options))

    arrays = indexable(*arrays)

    n_samples = _num_samples(arrays[0])
    if shuffle == 'group':
        if labels is None:
            raise ValueError("When shuffle='group', "
                             "labels should not be None!")
        labels = check_array(labels, ensure_2d=False, dtype=None)
        uniques = np.unique(labels)
        n_samples = uniques.size

    n_train, n_test = _validate_shuffle_split(n_samples, test_size, train_size,
                                              default_test_size=0.25)

    shuffle_options = dict(test_size=n_test,
                           train_size=n_train,
                           random_state=random_state)

    if shuffle is None:
        if labels is not None:
            warnings.warn("The `labels` is ignored for "
                          "shuffle being None!")

        train = np.arange(n_train)
        test = np.arange(n_train, n_train + n_test)

    elif shuffle == 'simple':
        if labels is not None:
            warnings.warn("The `labels` is not needed and therefore "
                          "ignored for ShuffleSplit, as shuffle='simple'!")

        cv = ShuffleSplit(**shuffle_options)
        train, test = next(cv.split(X=arrays[0], y=None))

    elif shuffle == 'stratified':
        cv = StratifiedShuffleSplit(**shuffle_options)
        train, test = next(cv.split(X=arrays[0], y=labels))

    elif shuffle == 'group':
        cv = GroupShuffleSplit(**shuffle_options)
        train, test = next(cv.split(X=arrays[0], y=None, groups=labels))

    else:
        raise ValueError("The argument `shuffle` only supports None, "
                         "'simple', 'stratified' and 'group', but got `%s`!"
                         % shuffle)

    return list(chain.from_iterable((safe_indexing(a, train),
                                    safe_indexing(a, test)) for a in arrays))
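
A usage sketch for the group-aware branch of this extended ``train_test_split`` (the ``GroupShuffleSplit``, ``_validate_shuffle_split`` and ``safe_indexing`` helpers it uses are assumed to be in scope; the data below are illustrative):

# Usage sketch: with shuffle='group', whole groups stay on one side of the split.
import numpy as np

X = np.arange(16).reshape(8, 2)
y = np.array([0, 0, 1, 1, 0, 1, 0, 1])
groups = np.array([1, 1, 2, 2, 3, 3, 4, 4])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle='group', labels=groups, test_size=0.25, random_state=0)
# With 4 groups and test_size=0.25, exactly one group (2 samples) is held out.
print(X_test)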