Exemple #1
0
def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters,
                   fit_params=None):
    if verbose > 1:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                          for k, v in parameters.items()))
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    if num_samples(train) == 0 or num_samples(test) == 0:
        raise RuntimeError(
            'Cross validation error in fit_estimator. The total data set '
            'contains %d elements, which were split into a training set '
            'of %d elements and a test set of %d elements. Unfortunately, '
            'you can\'t have a %s set with 0 elements.' % (
                num_samples(X), num_samples(train), num_samples(test),
                'training' if num_samples(train) == 0 else 'test'))

    # adjust length of sample weights
    n_samples = num_samples(X)
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, np.asarray(v)[train]
                       if hasattr(v, '__len__') and len(v) == n_samples else v)
                       for k, v in fit_params.items()])

    if parameters is not None:
        estimator.set_params(**parameters)

    # fit and score
    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)
    if y_train is None:
        estimator.fit(X_train, **fit_params)
    else:
        estimator.fit(X_train, y_train, **fit_params)
    test_score = _score(estimator, X_test, y_test, scorer)
    train_score = _score(estimator, X_train, y_train, scorer)

    scoring_time = time.time() - start_time

    msmbuilder_api = is_msmbuilder_estimator(estimator)
    n_samples_test = num_samples(X_test, is_nested=msmbuilder_api)
    n_samples_train = num_samples(X_train, is_nested=msmbuilder_api)
    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    return (test_score, n_samples_test, train_score, n_samples_train,
            scoring_time)
Exemple #2
0
def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters,
                   fit_params=None):
    if verbose > 1:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                          for k, v in parameters.items()))
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # adjust length of sample weights
    n_samples = _num_samples(X)
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, np.asarray(v)[train]
                       if hasattr(v, '__len__') and len(v) == n_samples else v)
                       for k, v in fit_params.items()])

    if parameters is not None:
        estimator.set_params(**parameters)

    # fit and score
    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)
    if y_train is None:
        estimator.fit(X_train, **fit_params)
    else:
        estimator.fit(X_train, y_train, **fit_params)
    test_score = _score(estimator, X_test, y_test, scorer)
    train_score = _score(estimator, X_train, y_train, scorer)

    scoring_time = time.time() - start_time

    mixtape_api = _is_mixtape_estimator(estimator)
    n_samples_test = _num_samples(X_test, mixtape_api=mixtape_api)
    n_samples_train = _num_samples(X_train, mixtape_api=mixtape_api)
    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    return (test_score, n_samples_test, train_score, n_samples_train,
            scoring_time)
Exemple #3
0
def _extended_fit_and_score(estimator, X, y, scorer, train, test, verbose,
                   parameters, fit_params, return_train_score=False,
                   return_parameters=False, error_score='raise', extraOut="auto"):
    if verbose > 1:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                          for k, v in parameters.items()))
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, _index_param_value(X, v, train))
                      for k, v in fit_params.items()])

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            if return_train_score:
                train_score = error_score
            warnings.warn("Classifier fit failed. The score on this train-test"
                          " partition for these parameters will be set to %f. "
                          "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)"
                             )

    else:
        test_score = _score(estimator, X_test, y_test, scorer)
        if return_train_score:
            train_score = _score(estimator, X_train, y_train, scorer)

    scoring_time = time.time() - start_time

    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = [train_score] if return_train_score else []
    ret.extend([test_score, _num_samples(X_test), scoring_time])
    if return_parameters:
        ret.append(parameters)
    
    # Add additional return values
    extraRVs = {}
    if extraOut != None:
        extraRVs["counts"] = {"train":train.shape[0], "test":test.shape[0]}
        if "estimator" in extraOut:
            extraRVs["estimator"] = estimator
        if extraOut == "auto" or "predictions" in extraOut:
            assert test.shape[0] == X_test.shape[0]
            probabilities = estimator.predict_proba(X_test)
            probabilityByIndex = {}
            for exampleIndex, prediction in zip(test, probabilities):
                probabilityByIndex[exampleIndex] = prediction
            extraRVs["probabilities"] = probabilityByIndex
        if (extraOut == "auto" or "importances" in extraOut) and hasattr(estimator, "feature_importances_"):
            extraRVs["importances"] = estimator.feature_importances_
    ret.append(extraRVs)
    
    return ret
Exemple #4
0
    def fit(self, X, y):
        """Fit the RFA model and automatically tune the number of selected
           features.
        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vector, where `n_samples` is the number of samples and
            `n_features` is the total number of features.
        y : array-like, shape = [n_samples]
            Target values (integers for classification, real numbers for
            regression).
        """
        X, y = check_X_y(X, y, "csr")
        if self.estimator_params is not None:
            warnings.warn("The parameter 'estimator_params' is deprecated as of version 0.16 "
                          "and will be removed in 0.18. The parameter is no longer "
                          "necessary because the value is set via the estimator initialisation "
                          "or set_params function."
                          , DeprecationWarning)
        # Initialization
        rfa = RFA(estimator=self.estimator, n_features_to_select=1,
                  step=self.step, estimator_params=self.estimator_params,
                  verbose=self.verbose - 1)

        cv = check_cv(self.cv, X, y, is_classifier(self.estimator))
        scorer = check_scoring(self.estimator, scoring=self.scoring)
        scores = np.zeros(X.shape[1])
        n_features_to_select_by_rank = np.zeros(X.shape[1])

        # Cross-validation
        for n, (train, test) in enumerate(cv):
            X_train, y_train = _safe_split(self.estimator, X, y, train)
            X_test, y_test = _safe_split(self.estimator, X, y, test, train)

            # Compute a full ranking of the features
            # ranking_ contains the same set of values for all CV folds,
            # but perhaps reordered
            ranking_ = rfa.fit(X_train, y_train).ranking_
            # Score each subset of features
            for k in range(0, np.max(ranking_)):
                indices = np.where(ranking_ <= k + 1)[0]
                estimator = clone(self.estimator)
                estimator.fit(X_train[:, indices], y_train)
                score = _score(estimator, X_test[:, indices], y_test, scorer)

                if self.verbose > 0:
                    print("Finished fold with %d / %d feature ranks, score=%f"
                          % (k + 1, np.max(ranking_), score))
                scores[k] += score
                # n_features_to_select_by_rank[k] is being overwritten
                # multiple times, but by the same value
                n_features_to_select_by_rank[k] = indices.size

        # Select the best upper bound for feature rank. It's OK to use the
        # last ranking_, as np.max(ranking_) is the same over all CV folds.
        scores = scores[:np.max(ranking_)]
        k = np.argmax(scores)

        # Re-execute an elimination with best_k over the whole set
        rfa = RFA(estimator=self.estimator,
                  n_features_to_select=n_features_to_select_by_rank[k],
                  step=self.step, estimator_params=self.estimator_params)

        rfa.fit(X, y)

        # Set final attributes
        self.support_ = rfa.support_
        self.n_features_ = rfa.n_features_
        self.ranking_ = rfa.ranking_
        self.estimator_ = clone(self.estimator)
        if self.estimator_params:
            self.estimator_.set_params(**self.estimator_params)
        self.estimator_.fit(self.transform(X), y)

        # Fixing a normalization error, n is equal to len(cv) - 1
        # here, the scores are normalized by len(cv)
        self.grid_scores_ = scores / len(cv)
        return self
def _fit_and_score(estimator,
                   X,
                   y,
                   scorer,
                   train,
                   test,
                   verbose,
                   parameters,
                   fit_params,
                   scoring,
                   return_train_score=False,
                   return_parameters=False,
                   error_score='raise'):
    """
    Fit estimator and compute scores for a given dataset split.
    This overrides the behavior of _fit_and_score method in cross_validation.py. 
    Note that a new argument, scoring, has been added to the function.

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.
    X : array-like of shape at least 2D
        The data to fit.
    y : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.
    scorer : callable
        A scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
    train : array-like, shape (n_train_samples,)
        Indices of training samples.
    test : array-like, shape (n_test_samples,)
        Indices of test samples.
    verbose : integer
        The verbosity level.
    error_score : 'raise' (default) or numeric
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is raised. If a numeric value is given,
        FitFailedWarning is raised. This parameter does not affect the refit
        step, which will always raise the error.
    parameters : dict or None
        Parameters to be set on the estimator.
    fit_params : dict or None
        Parameters that will be passed to ``estimator.fit``.
    scoring: string
        The name of the scoring function used in cross_val_score. Default is
        accuracy.
    return_train_score : boolean, optional, default: False
        Compute and return score on training set.
    return_parameters : boolean, optional, default: False
        Return parameters that has been used for the estimator.
    Returns
    -------
    train_score : float, optional
        Score on training set, returned only if `return_train_score` is `True`.
    test_score : float
        Score on test set.
    n_test_samples : int
        Number of test samples.
    scoring_time : float
        Time spent for fitting and scoring in seconds.
    parameters : dict or None, optional
        The parameters that have been evaluated.
    """
    if verbose > 1:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                                    for k, v in parameters.items()))
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, _index_param_value(X, v, train))
                       for k, v in fit_params.items()])

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    x_train, y_train = _safe_split(estimator, X, y, train)
    x_test, y_test = _safe_split(estimator, X, y, test, train)

    try:
        if y_train is None:
            b = estimator.fit(x_train, **fit_params)
        else:
            b = estimator.fit(x_train, y_train, **fit_params)
    except Exception as e:
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            if return_train_score:
                train_score = error_score
            warnings.warn(
                "Classifier fit failed. The score on this train-test"
                " partition for these parameters will be set to %f. "
                "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")

    else:
        test_score = _score(estimator, x_test, y_test, scorer)
        if return_train_score:
            train_score = _score(estimator, x_train, y_train, scorer)

    # Addition to original scikit code:
    # Create FitEvents for each estimator fit.
    fit_event = FitEvent(b, estimator, x_train)
    ModelDbSyncer.Syncer.instance.add_to_buffer(fit_event)

    scoring_time = time.time() - start_time
    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = [train_score] if return_train_score else []
    ret.extend([test_score, _num_samples(x_test), scoring_time])

    # Addition to original scikit code:
    # Create MetricEvents for each estimator.
    metric_event = MetricEvent(x_test, estimator, "", "", scoring, test_score)
    ModelDbSyncer.Syncer.instance.add_to_buffer(metric_event)

    if return_parameters:
        ret.append(parameters)
    return ret
Exemple #6
0
def _extended_fit_and_score(estimator,
                            X,
                            y,
                            scorer,
                            train,
                            test,
                            verbose,
                            parameters,
                            fit_params,
                            return_train_score=False,
                            return_parameters=False,
                            error_score='raise',
                            extraOut="auto"):
    if verbose > 1:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                                    for k, v in parameters.items()))
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, _index_param_value(X, v, train))
                       for k, v in fit_params.items()])

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            if return_train_score:
                train_score = error_score
            warnings.warn(
                "Classifier fit failed. The score on this train-test"
                " partition for these parameters will be set to %f. "
                "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")

    else:
        test_score = _score(estimator, X_test, y_test, scorer)
        if return_train_score:
            train_score = _score(estimator, X_train, y_train, scorer)

    scoring_time = time.time() - start_time

    if verbose > 2:
        msg += " , n=" + str(X_test.shape[0]) + ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = [train_score] if return_train_score else []
    ret.extend([test_score, _num_samples(X_test), scoring_time])
    if return_parameters:
        ret.append(parameters)

    # Add additional return values
    extraRVs = {}
    if extraOut != None:
        extraRVs["counts"] = {"train": train.shape[0], "test": test.shape[0]}
        if "estimator" in extraOut:
            extraRVs["estimator"] = estimator
        if extraOut == "auto" or "predictions" in extraOut:
            assert test.shape[0] == X_test.shape[0]
            probabilities = estimator.predict_proba(X_test)
            probabilityByIndex = {}
            for exampleIndex, prediction in zip(test, probabilities):
                probabilityByIndex[exampleIndex] = prediction
            extraRVs["probabilities"] = probabilityByIndex
        if (extraOut == "auto" or "importances" in extraOut) and hasattr(
                estimator, "feature_importances_"):
            extraRVs["importances"] = estimator.feature_importances_
    ret.append(extraRVs)

    return ret
def _fit_and_score(estimator, X, y, scorer, train, test, cv, verbose, parameters,
                   fit_params, return_train_score=False,
                   return_parameters=False):
    """Fit estimator and compute scores for a given dataset split.

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.

    X : array-like of shape at least 2D
        The data to fit.

    y : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.

    scoring : callable
        A scorer callable object / function with signature
        ``scorer(estimator, X, y)``.

    train : array-like, shape = (n_train_samples,)
        Indices of training samples.

    test : array-like, shape = (n_test_samples,)
        Indices of test samples.

    verbose : integer
        The verbosity level.

    parameters : dict or None
        Parameters to be set on the estimator.

    fit_params : dict or None
        Parameters that will be passed to ``estimator.fit``.

    return_train_score : boolean, optional, default: False
        Compute and return score on training set.

    return_parameters : boolean, optional, default: False
        Return parameters that has been used for the estimator.

    Returns
    -------
    train_score : float, optional
        Score on training set, returned only if `return_train_score` is `True`.

    test_score : float
        Score on test set.

    n_test_samples : int
        Number of test samples.

    scoring_time : float
        Time spent for fitting and scoring in seconds.

    parameters : dict or None, optional
        The parameters that have been evaluated.
    """
    if verbose > 1:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                          for k, v in parameters.items()))
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust lenght of sample weights
    n_samples = _num_samples(X)

    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, np.asarray(v)[train]
                       if hasattr(v, '__len__') and len(v) == n_samples else v)
                       for k, v in fit_params.items()])
    if cv is not None:
        fit_params["cv"] = cv

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)
    if y_train is None:
        estimator.fit(X_train, **fit_params)
    else:
        estimator.fit(X_train, y_train, **fit_params)
    test_score = _score(estimator, X_test, y_test, scorer)
    if return_train_score:
        train_score = _score(estimator, X_train, y_train, scorer)

    scoring_time = time.time() - start_time

    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = [train_score] if return_train_score else []
    ret.extend([test_score, _num_samples(X_test), scoring_time])
    if return_parameters:
        ret.append(parameters)
    return ret
Exemple #8
0
def fit_and_score_n_support(estimator, X, y, scorer, train, test, verbose,
                            parameters, fit_params, return_train_score=False,
                            return_parameters=False, return_n_support=True,
                            error_score='raise'):
    """Fit estimator and compute scores for a given dataset split.

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.

    X : array-like of shape at least 2D
        The data to fit.

    y : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.

    scorer : callable
        A scorer callable object / function with signature
        ``scorer(estimator, X, y)``.

    train : array-like, shape (n_train_samples,)
        Indices of training samples.

    test : array-like, shape (n_test_samples,)
        Indices of test samples.

    verbose : integer
        The verbosity level.

    error_score : 'raise' (default) or numeric
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is raised. If a numeric value is given,
        FitFailedWarning is raised. This parameter does not affect the refit
        step, which will always raise the error.

    parameters : dict or None
        Parameters to be set on the estimator.

    fit_params : dict or None
        Parameters that will be passed to ``estimator.fit``.

    return_train_score : boolean, optional, default: False
        Compute and return score on training set.

    return_parameters : boolean, optional, default: False
        Return parameters that has been used for the estimator.

    Returns
    -------
    train_score : float, optional
        Score on training set, returned only if `return_train_score` is `True`.

    test_score : float
        Score on test set.

    n_test_samples : int
        Number of test samples.

    scoring_time : float
        Time spent for fitting and scoring in seconds.

    parameters : dict or None, optional
        The parameters that have been evaluated.
    """
    if verbose > 1:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                                    for k, v in parameters.items()))
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, _index_param_value(X, v, train))
                       for k, v in fit_params.items()])

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            if return_train_score:
                train_score = error_score
            warnings.warn("Classifier fit failed. The score on this train-test"
                          " partition for these parameters will be set to %f. "
                          "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)"
                             )

    else:
        test_score = _score(estimator, X_test, y_test, scorer)
        if return_train_score:
            train_score = _score(estimator, X_train, y_train, scorer)

    scoring_time = time.time() - start_time

    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = [train_score] if return_train_score else []
    ret.extend([test_score, _num_samples(X_test), scoring_time])
    if return_parameters:
        ret.append(parameters)
    if return_n_support:
        ret.append(estimator.n_support_)
    return ret