Example #1
    def _fit(self, X, y, groups, parameter_iterable):
        """Actual fitting,  performing the search over parameters."""
        X, y, groups = indexable(X, y, groups)

        cv = check_cv(self.cv, y, classifier=True)
        n_splits = cv.get_n_splits(X, y, groups)

        if self.verbose > 0 and isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            LOG.info("Fitting %d folds for each of %d candidates, totalling"
                     " %d fits", n_splits, n_candidates, n_candidates * n_splits)

        pre_dispatch = self.pre_dispatch

        cv_iter = list(cv.split(X, y, groups))
        out = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=pre_dispatch
        )(delayed(_model_fit_and_score)(
            estimator, X, y, self.scoring, train, test, self.verbose, parameters,
            fit_params=self.fit_params,
            return_train_score=self.return_train_score,
            return_n_test_samples=True,
            return_times=True, return_parameters=True,
            error_score=self.error_score)
          for estimator, parameters in parameter_iterable
          for train, test in cv_iter)

        # if one chooses to see train scores, "out" will contain train score info
        if self.return_train_score:
            (train_scores, test_scores, test_sample_counts,
             fit_time, score_time, parameters) = zip(*out)
        else:
            (test_scores, test_sample_counts,
             fit_time, score_time, parameters) = zip(*out)

        candidate_params = parameters[::n_splits]
        n_candidates = len(candidate_params)

        results = dict()

        def _store(key_name, array, weights=None, splits=False, rank=False):
            """A small helper to store the scores/times to the cv_results_"""
            array = np.array(array, dtype=np.float64).reshape(n_candidates,
                                                              n_splits)
            if splits:
                for split_i in range(n_splits):
                    results["split%d_%s"
                            % (split_i, key_name)] = array[:, split_i]

            array_means = np.average(array, axis=1, weights=weights)
            results['mean_%s' % key_name] = array_means
            # Weighted std is not directly available in numpy
            array_stds = np.sqrt(np.average((array -
                                             array_means[:, np.newaxis]) ** 2,
                                            axis=1, weights=weights))
            results['std_%s' % key_name] = array_stds

            if rank:
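                # rankdata on the negated means gives rank 1 to the best mean;
                # e.g. means [0.9, 0.8, 0.9] -> ranks [1, 3, 1] (ties share
                # the minimum rank)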
                results["rank_%s" % key_name] = np.asarray(
                    rankdata(-array_means, method='min'), dtype=np.int32)

        # Compute the (weighted) mean and std for test scores alone
        # NOTE test_sample_counts (weights) remain the same for all candidates
        test_sample_counts = np.array(test_sample_counts[:n_splits],
                                      dtype=int)

        _store('test_score', test_scores, splits=True, rank=True,
               weights=test_sample_counts if self.iid else None)
        if self.return_train_score:
            _store('train_score', train_scores, splits=True)
        _store('fit_time', fit_time)
        _store('score_time', score_time)

        best_index = np.flatnonzero(results["rank_test_score"] == 1)[0]
        best_parameters = candidate_params[best_index][1]

        # Use one MaskedArray and mask all the places where the param is not
        # applicable for that candidate. Use defaultdict as each candidate may
        # not contain all the params
        param_results = defaultdict(partial(MaskedArray,
                                            np.empty(n_candidates,),
                                            mask=True,
                                            dtype=object))
        for cand_i, params in enumerate(candidate_params):
            _, param_values = params
            for name, value in param_values.items():
                # An all-masked empty array gets created for the key
                # `"param_%s" % name` at the first occurrence of `name`.
                # Setting the value at an index also unmasks that index
                param_results["param_%s" % name][cand_i] = value

        results.update(param_results)

        # Store a list of param dicts at the key 'params'
        results['params'] = candidate_params

        self.cv_results_ = results
        self.best_index_ = best_index
        self.n_splits_ = n_splits
        self.best_model_ = candidate_params[best_index]

        if self.refit:
            # build best estimator and fit
            best_estimator = _clf_build(self.best_model_[0])
            best_estimator.set_params(**best_parameters)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
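
The weighted standard deviation that `_store` computes by hand (NumPy's `np.std` has no `weights` argument) is easy to verify in isolation. A minimal sketch, with made-up scores and weights:

import numpy as np

# One candidate scored on three CV splits, weighted by test-set size.
scores = np.array([[0.80, 0.90, 0.70]])
weights = np.array([50, 30, 20])

means = np.average(scores, axis=1, weights=weights)
# Weighted std as the square root of the weighted mean squared deviation,
# the same formula _store uses.
stds = np.sqrt(np.average((scores - means[:, np.newaxis]) ** 2,
                          axis=1, weights=weights))
print(means, stds)  # [0.81] [0.07]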
Example #2
    def _fit(self, X, y, groups, parameter_iterable):
        """Actual fitting,  performing the search over parameters."""
        X, y, groups = indexable(X, y, groups)

        cv = check_cv(self.cv, y, classifier=True)
        n_splits = cv.get_n_splits(X, y, groups)

        if self.verbose > 0 and isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            LOG.info(
                "Fitting %d folds for each of %d candidates, totalling"
                " %d fits", n_splits, n_candidates, n_candidates * n_splits)

        pre_dispatch = self.pre_dispatch

        cv_iter = list(cv.split(X, y, groups))
        out = Parallel(n_jobs=self.n_jobs,
                       verbose=self.verbose,
                       pre_dispatch=pre_dispatch)(
                           delayed(_model_fit_and_score)(
                               estimator,
                               X,
                               y,
                               self.scoring,
                               train,
                               test,
                               self.verbose,
                               parameters,
                               fit_params=self.fit_params,
                               return_train_score=self.return_train_score,
                               return_n_test_samples=True,
                               return_times=True,
                               return_parameters=True,
                               error_score=self.error_score)
                           for estimator, parameters in parameter_iterable
                           for train, test in cv_iter)

        # if one chooses to see train scores, "out" will contain train score info
        if self.return_train_score:
            (train_scores, test_scores, test_sample_counts, fit_time,
             score_time, parameters) = zip(*out)
        else:
            (test_scores, test_sample_counts, fit_time, score_time,
             parameters) = zip(*out)

        candidate_params = parameters[::n_splits]
        n_candidates = len(candidate_params)

        results = dict()

        def _store(key_name, array, weights=None, splits=False, rank=False):
            """A small helper to store the scores/times to the cv_results_"""
            array = np.array(array,
                             dtype=np.float64).reshape(n_candidates, n_splits)
            if splits:
                for split_i in range(n_splits):
                    results["split%d_%s" %
                            (split_i, key_name)] = array[:, split_i]

            array_means = np.average(array, axis=1, weights=weights)
            results['mean_%s' % key_name] = array_means
            # Weighted std is not directly available in numpy
            array_stds = np.sqrt(
                np.average((array - array_means[:, np.newaxis])**2,
                           axis=1,
                           weights=weights))
            results['std_%s' % key_name] = array_stds

            if rank:
                results["rank_%s" % key_name] = np.asarray(rankdata(
                    -array_means, method='min'),
                                                           dtype=np.int32)

        # Compute the (weighted) mean and std for test scores alone
        # NOTE test_sample_counts (weights) remain the same for all candidates
        test_sample_counts = np.array(test_sample_counts[:n_splits],
                                      dtype=int)

        _store('test_score',
               test_scores,
               splits=True,
               rank=True,
               weights=test_sample_counts if self.iid else None)
        if self.return_train_score:
            _store('train_score', train_scores, splits=True)
        _store('fit_time', fit_time)
        _store('score_time', score_time)

        best_index = np.flatnonzero(results["rank_test_score"] == 1)[0]
        best_parameters = candidate_params[best_index][1]

        # Use one MaskedArray and mask all the places where the param is not
        # applicable for that candidate. Use defaultdict as each candidate may
        # not contain all the params
        param_results = defaultdict(
            partial(MaskedArray,
                    np.empty(n_candidates, ),
                    mask=True,
                    dtype=object))
        for cand_i, params in enumerate(candidate_params):
            _, param_values = params
            for name, value in param_values.items():
                # An all-masked empty array gets created for the key
                # `"param_%s" % name` at the first occurrence of `name`.
                # Setting the value at an index also unmasks that index
                param_results["param_%s" % name][cand_i] = value

        results.update(param_results)

        # Store a list of param dicts at the key 'params'
        results['params'] = candidate_params

        self.cv_results_ = results
        self.best_index_ = best_index
        self.n_splits_ = n_splits
        self.best_model_ = candidate_params[best_index]

        if self.refit:
            # build best estimator and fit
            best_estimator = _clf_build(self.best_model_[0])
            best_estimator.set_params(**best_parameters)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
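
The `param_results` construction in both examples leans on a NumPy masked-array detail: a `MaskedArray` created with `mask=True` starts fully masked, and assigning to an element unmasks just that index, so parameters a candidate never used stay masked. A minimal sketch with illustrative values:

import numpy as np
from numpy.ma import MaskedArray

# Three candidates; every slot starts masked ("param not applicable").
param_col = MaskedArray(np.empty(3,), mask=True, dtype=object)
param_col[0] = 10      # assignment unmasks index 0
param_col[2] = 'rbf'   # object dtype allows heterogeneous values
print(param_col)       # [10 -- 'rbf']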
Example #3
def fit(self, X, y=None, groups=None, **fit_params):
    """Run fit with all sets of parameters.

    Parameters
    ----------

    X : array-like, shape = [n_samples, n_features]
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape = [n_samples] or [n_samples, n_output], optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    groups : array-like, with shape (n_samples,), optional
        Group labels for the samples used while splitting the dataset into
        train/test set.

    **fit_params : dict of string -> object
        Parameters passed to the ``fit`` method of the estimator
    """
    if self.fit_params is not None:
        warnings.warn(
            '"fit_params" as a constructor argument was '
            'deprecated in version 0.19 and will be removed '
            'in version 0.21. Pass fit parameters to the '
            '"fit" method instead.', DeprecationWarning)
        if fit_params:
            warnings.warn(
                'Ignoring fit_params passed as a constructor '
                'argument in favor of keyword arguments to '
                'the "fit" method.', RuntimeWarning)
        else:
            fit_params = self.fit_params
    estimator = self.estimator
    cv = check_cv(self.cv, y, classifier=is_classifier(estimator))

    scorers, self.multimetric_ = _check_multimetric_scoring(
        self.estimator, scoring=self.scoring)

    if self.multimetric_:
        if self.refit is not False and (
                not isinstance(self.refit, six.string_types) or
                # This will work for both dict / list (tuple)
                self.refit not in scorers):
            raise ValueError("For multi-metric scoring, the parameter "
                             "refit must be set to a scorer key "
                             "to refit an estimator with the best "
                             "parameter setting on the whole data and "
                             "make the best_* attributes "
                             "available for that metric. If this is not "
                             "needed, refit should be set to False "
                             "explicitly. %r was passed." % self.refit)
        else:
            refit_metric = self.refit
    else:
        refit_metric = 'score'
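    # e.g. with scoring={'acc': 'accuracy', 'f1': 'f1'}, refit must name one
    # of those keys (refit='acc') or be False; a bare refit=True raises above.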

    X, y, groups = indexable(X, y, groups)
    n_splits = cv.get_n_splits(X, y, groups)
    # Regenerate parameter iterable for each fit
    candidate_params = list(self._get_param_iterator())
    n_candidates = len(candidate_params)
    if self.verbose > 0:
        print("Fitting {0} folds for each of {1} candidates, totalling"
              " {2} fits".format(n_splits, n_candidates,
                                 n_candidates * n_splits))

    base_estimator = clone(self.estimator)
    pre_dispatch = self.pre_dispatch

    out = Parallel(
        n_jobs=self.n_jobs,
        verbose=self.verbose,
        pre_dispatch=pre_dispatch,
        n_tasks=n_candidates * n_splits,
    )(delayed(_fit_and_score)(clone(base_estimator),
                              X,
                              y,
                              scorers,
                              train,
                              test,
                              self.verbose,
                              parameters,
                              fit_params=fit_params,
                              return_train_score=self.return_train_score,
                              return_n_test_samples=True,
                              return_times=True,
                              return_parameters=False,
                              error_score=self.error_score)
      for parameters, (
          train, test) in product(candidate_params, cv.split(X, y, groups)))

    # if one chooses to see train scores, "out" will contain train score info
    if self.return_train_score:
        (train_score_dicts, test_score_dicts, test_sample_counts, fit_time,
         score_time) = zip(*out)
    else:
        (test_score_dicts, test_sample_counts, fit_time,
         score_time) = zip(*out)

    # test_score_dicts and train_score_dicts are lists of dictionaries and
    # we make them into a dict of lists
    test_scores = _aggregate_score_dicts(test_score_dicts)
    if self.return_train_score:
        train_scores = _aggregate_score_dicts(train_score_dicts)

    results = dict()

    def _store(key_name, array, weights=None, splits=False, rank=False):
        """A small helper to store the scores/times to the cv_results_"""
        # out iterates candidates first, then splits, so reshape
        # `array` to `n_candidates` rows and `n_splits` cols.
        array = np.array(array,
                         dtype=np.float64).reshape(n_candidates, n_splits)
        if splits:
            for split_i in range(n_splits):
                # Uses closure to alter the results
                results["split%d_%s" % (split_i, key_name)] = array[:, split_i]

        array_means = np.average(array, axis=1, weights=weights)
        results['mean_%s' % key_name] = array_means
        # Weighted std is not directly available in numpy
        array_stds = np.sqrt(
            np.average((array - array_means[:, np.newaxis])**2,
                       axis=1,
                       weights=weights))
        results['std_%s' % key_name] = array_stds

        if rank:
            results["rank_%s" % key_name] = np.asarray(rankdata(-array_means,
                                                                method='min'),
                                                       dtype=np.int32)

    _store('fit_time', fit_time)
    _store('score_time', score_time)
    # Use one MaskedArray and mask all the places where the param is not
    # applicable for that candidate. Use defaultdict as each candidate may
    # not contain all the params
    param_results = defaultdict(
        partial(MaskedArray, np.empty(n_candidates, ), mask=True,
                dtype=object))
    for cand_i, params in enumerate(candidate_params):
        for name, value in params.items():
            # An all-masked empty array gets created for the key
            # `"param_%s" % name` at the first occurrence of `name`.
            # Setting the value at an index also unmasks that index
            param_results["param_%s" % name][cand_i] = value

    results.update(param_results)
    # Store a list of param dicts at the key 'params'
    results['params'] = candidate_params

    # NOTE test_sample_counts (weights) remain the same for all candidates
    test_sample_counts = np.array(test_sample_counts[:n_splits], dtype=int)
    for scorer_name in scorers.keys():
        # Compute the (weighted) mean and std for test scores alone
        _store('test_%s' % scorer_name,
               test_scores[scorer_name],
               splits=True,
               rank=True,
               weights=test_sample_counts if self.iid else None)
        if self.return_train_score:
            _store('train_%s' % scorer_name,
                   train_scores[scorer_name],
                   splits=True)

    # For multi-metric evaluation, store the best_index_, best_params_ and
    # best_score_ iff refit is one of the scorer names
    # In single metric evaluation, refit_metric is "score"
    if self.refit or not self.multimetric_:
        self.best_index_ = results["rank_test_%s" % refit_metric].argmin()
        self.best_params_ = candidate_params[self.best_index_]
        self.best_score_ = results["mean_test_%s" %
                                   refit_metric][self.best_index_]

    if self.refit:
        self.best_estimator_ = clone(base_estimator).set_params(
            **self.best_params_)
        if y is not None:
            self.best_estimator_.fit(X, y, **fit_params)
        else:
            self.best_estimator_.fit(X, **fit_params)

    # Store the only scorer not as a dict for single metric evaluation
    self.scorer_ = scorers if self.multimetric_ else scorers['score']

    self.cv_results_ = results
    self.n_splits_ = n_splits

    return self
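
Example #3 delegates to the private helper `_aggregate_score_dicts` to turn the per-fit score dicts into a dict of arrays keyed by scorer name. A minimal sketch of what such an aggregator does (an illustrative re-implementation, not necessarily the library's exact code):

import numpy as np

def aggregate_score_dicts(scores):
    """[{'a': 1, 'b': 10}, {'a': 2, 'b': 2}]
       -> {'a': array([1, 2]), 'b': array([10, 2])}"""
    return {key: np.asarray([score[key] for score in scores])
            for key in scores[0]}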