Code Example #1
File: grid_search.py Project: mayoub/hhana
def fit_grid_point(base_estimator, parameters, X, y, sample_weight, train,
                   test, verbose, **fit_params):
    """Run fit on one set of parameters"""
    if verbose > 1:
        start_time = time.time()
        msg = '%s' % (', '.join('%s=%s' % (k, v)
                                for k, v in parameters.iteritems()))
        print "[BoostGridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.')

    # update parameters of the classifier after a copy of its base structure
    estimator = clone(base_estimator)
    estimator.set_params(**parameters)

    X_train, y_train, sample_weight_train = _safe_split(
        estimator, X, y, sample_weight, train)
    X_test, y_test, sample_weight_test = _safe_split(estimator, X, y,
                                                     sample_weight, test,
                                                     train)

    if sample_weight is not None:
        fit_params = fit_params.copy()
        fit_params['sample_weight'] = sample_weight_train

    if y_train is None:
        estimator.fit(X_train, **fit_params)
    else:
        estimator.fit(X_train, y_train, **fit_params)

    if verbose > 1:
        end_msg = "%s -%s" % (
            msg, logger.short_format_time(time.time() - start_time))
        print "[BoostGridSearchCV] %s %s" % (
            (64 - len(end_msg)) * '.', end_msg)
    return estimator, parameters, train, test
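
For context, here is a minimal, hypothetical driver loop for the fit_grid_point above. It is not part of the hhana project; it assumes hhana's sample-weight-aware _safe_split and clone are importable, uses scikit-learn's ParameterGrid and KFold, and passes sample_weight=None for simplicity.

import numpy as np
from sklearn.model_selection import ParameterGrid, KFold
from sklearn.ensemble import AdaBoostClassifier

X = np.random.rand(100, 4)
y = np.random.randint(0, 2, size=100)
base = AdaBoostClassifier()

for params in ParameterGrid({'n_estimators': [10, 20]}):
    for train, test in KFold(n_splits=3).split(X):
        # verbose=0 suppresses the "[BoostGridSearchCV] ..." progress lines
        est, params_out, tr, te = fit_grid_point(base, params, X, y, None,
                                                 train, test, 0)
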
Code Example #2
File: grid_search.py Project: Reikyo/hhana
def fit_grid_point(base_estimator, parameters,
                   X, y, sample_weight,
                   train, test, verbose,
                   **fit_params):
    """Run fit on one set of parameters"""
    if verbose > 1:
        start_time = time.time()
        msg = '%s' % (', '.join('%s=%s' % (k, v)
                                     for k, v in parameters.iteritems()))
        print "[BoostGridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.')

    # update parameters of the classifier after a copy of its base structure
    estimator = clone(base_estimator)
    estimator.set_params(**parameters)

    X_train, y_train, sample_weight_train = _safe_split(
        estimator, X, y, sample_weight, train)
    X_test, y_test, sample_weight_test = _safe_split(
        estimator, X, y, sample_weight, test, train)

    if sample_weight is not None:
        fit_params = fit_params.copy()
        fit_params['sample_weight'] = sample_weight_train

    if y_train is None:
        estimator.fit(X_train, **fit_params)
    else:
        estimator.fit(X_train, y_train, **fit_params)

    if verbose > 1:
        end_msg = "%s -%s" % (msg,
                              logger.short_format_time(time.time() -
                                                       start_time))
        print "[BoostGridSearchCV] %s %s" % ((64 - len(end_msg)) * '.', end_msg)
    return estimator, parameters, train, test
Code Example #3
File: fit_estimator.py Project: brookehus/osprey
def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters,
                   fit_params=None):
    if verbose > 1:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                          for k, v in parameters.items()))
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    if num_samples(train) == 0 or num_samples(test) == 0:
        raise RuntimeError(
            'Cross validation error in fit_estimator. The total data set '
            'contains %d elements, which were split into a training set '
            'of %d elements and a test set of %d elements. Unfortunately, '
            'you can\'t have a %s set with 0 elements.' % (
                num_samples(X), num_samples(train), num_samples(test),
                'training' if num_samples(train) == 0 else 'test'))

    # adjust length of sample weights
    n_samples = num_samples(X)
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, np.asarray(v)[train]
                       if hasattr(v, '__len__') and len(v) == n_samples else v)
                       for k, v in fit_params.items()])

    if parameters is not None:
        estimator.set_params(**parameters)

    # fit and score
    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)
    if y_train is None:
        estimator.fit(X_train, **fit_params)
    else:
        estimator.fit(X_train, y_train, **fit_params)
    test_score = _score(estimator, X_test, y_test, scorer)
    train_score = _score(estimator, X_train, y_train, scorer)

    scoring_time = time.time() - start_time

    msmbuilder_api = is_msmbuilder_estimator(estimator)
    n_samples_test = num_samples(X_test, is_nested=msmbuilder_api)
    n_samples_train = num_samples(X_train, is_nested=msmbuilder_api)
    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    return (test_score, n_samples_test, train_score, n_samples_train,
            scoring_time)
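
The dict built near the top is the key idiom in this helper: any fit parameter whose length equals the number of samples (typically sample_weight) is sliced down to the training indices, while scalar parameters pass through untouched. A standalone sketch of the same idea (the helper name is illustrative):

import numpy as np

def slice_fit_params(fit_params, train, n_samples):
    # Per-sample array-likes are indexed by the train indices;
    # everything else is forwarded unchanged.
    return dict((k, np.asarray(v)[train]
                 if hasattr(v, '__len__') and len(v) == n_samples else v)
                for k, v in fit_params.items())

train = np.array([0, 2, 3])
out = slice_fit_params({'sample_weight': [1., 2., 3., 4.], 'verbose': True},
                       train, n_samples=4)
# out['sample_weight'] -> array([1., 3., 4.]); out['verbose'] -> True
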
Code Example #4
def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params):
    """Fit estimator and predict values for a given dataset split.

    Read more in the :ref:`User Guide <cross_validation>`.

    Parameters
    ----------
    estimator : estimator object implementing 'fit' and 'predict'
        The object to use to fit the data.

    X : array-like of shape at least 2D
        The data to fit.

    y : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.

    train : array-like, shape (n_train_samples,)
        Indices of training samples.

    test : array-like, shape (n_test_samples,)
        Indices of test samples.

    verbose : integer
        The verbosity level.

    fit_params : dict or None
        Parameters that will be passed to ``estimator.fit``.

    Returns
    -------
    preds : sequence
        Result of calling 'estimator.predict'

    test : array-like
        This is the value of the test parameter
    """
    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, _index_param_value(X, v, train))
                       for k, v in fit_params.items()])

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, _ = _safe_split(estimator, X, y, test, train)

    if y_train is None:
        estimator.fit(X_train, **fit_params)
    else:
        estimator.fit(X_train, y_train, **fit_params)
    #preds = estimator.predict(X_test)
    preds = estimator.decision_function(X_test)
    return preds, test
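
Note the commented-out estimator.predict: this variant deliberately returns the continuous output of decision_function rather than hard labels, which is what downstream ROC/AUC-style evaluation needs. A small sketch of the difference, assuming a plain SVC on synthetic binary data:

from sklearn.datasets import make_classification
from sklearn.svm import SVC

X, y = make_classification(n_samples=100, random_state=0)
clf = SVC().fit(X[:80], y[:80])
print(clf.predict(X[80:85]))            # hard 0/1 labels
print(clf.decision_function(X[80:85]))  # signed distances to the hyperplane
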
Code Example #5
File: fit_estimator.py Project: Eigenstate/osprey
def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters,
                   fit_params=None):
    if verbose > 1:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                          for k, v in parameters.items()))
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # adjust length of sample weights
    n_samples = _num_samples(X)
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, np.asarray(v)[train]
                       if hasattr(v, '__len__') and len(v) == n_samples else v)
                       for k, v in fit_params.items()])

    if parameters is not None:
        estimator.set_params(**parameters)

    # fit and score
    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)
    if y_train is None:
        estimator.fit(X_train, **fit_params)
    else:
        estimator.fit(X_train, y_train, **fit_params)
    test_score = _score(estimator, X_test, y_test, scorer)
    train_score = _score(estimator, X_train, y_train, scorer)

    scoring_time = time.time() - start_time

    mixtape_api = _is_mixtape_estimator(estimator)
    n_samples_test = _num_samples(X_test, mixtape_api=mixtape_api)
    n_samples_train = _num_samples(X_train, mixtape_api=mixtape_api)
    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    return (test_score, n_samples_test, train_score, n_samples_train,
            scoring_time)
Code Example #6
def test_safe_split_with_precomputed_kernel():
    clf = SVC()
    clfp = SVC(kernel="precomputed")

    iris = load_iris()
    X, y = iris.data, iris.target
    K = np.dot(X, X.T)

    cv = cval.ShuffleSplit(X.shape[0], test_size=0.25, random_state=0)
    tr, te = list(cv)[0]

    X_tr, y_tr = cval._safe_split(clf, X, y, tr)
    K_tr, y_tr2 = cval._safe_split(clfp, K, y, tr)
    assert_array_almost_equal(K_tr, np.dot(X_tr, X_tr.T))

    X_te, y_te = cval._safe_split(clf, X, y, te, tr)
    K_te, y_te2 = cval._safe_split(clfp, K, y, te, tr)
    assert_array_almost_equal(K_te, np.dot(X_te, X_tr.T))
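
What this test pins down is that, for an estimator with kernel="precomputed", _safe_split slices the Gram matrix on both axes: rows by the requested indices and columns by the training indices, since kernel values between test and train samples are exactly what prediction needs. The equivalent manual slicing, as a sketch:

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(10, 3)
K = np.dot(X, X.T)
tr, te = np.arange(7), np.arange(7, 10)

K_tr = K[np.ix_(tr, tr)]  # train rows x train columns
K_te = K[np.ix_(te, tr)]  # test rows x train columns
assert np.allclose(K_te, np.dot(X[te], X[tr].T))
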
Code Example #7
File: grid_search.py Project: Reikyo/hhana
def score_each_boost(estimator, parameters,
                     min_n_estimators,
                     X, y, sample_weight,
                     score_func, train, test,
                     verbose):
    """Run fit on one set of parameters

    Returns the score and the instance of the classifier
    """
    if verbose > 1:
        start_time = time.time()
        msg = '%s' % (', '.join('%s=%s' % (k, v)
                                     for k, v in parameters.iteritems()))
        print "[BoostGridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.')

    X_test, y_test, sample_weight_test = _safe_split(
        estimator, X, y, sample_weight, test, train)

    test_score_params = {}
    if sample_weight is not None:
        test_score_params['sample_weight'] = sample_weight_test

    this_n_test_samples = _num_samples(X_test)

    all_scores = []
    all_clf_params = []
    n_test_samples = []

    for i, y_pred in enumerate(estimator.staged_predict(X_test)):
        if i + 1 < min_n_estimators:
            continue
        score = score_func(y_test, y_pred, **test_score_params)
        all_scores.append(score)
        clf_para = copy(parameters)
        clf_para['n_estimators'] = i + 1
        all_clf_params.append(clf_para)
        n_test_samples.append(this_n_test_samples)

    # boosting may have stopped early
    if len(all_scores) < estimator.n_estimators - min_n_estimators + 1:
        last_score = all_scores[-1]
        last_clf_params = all_clf_params[-1]
        for i in range(len(all_scores),
                       estimator.n_estimators - min_n_estimators + 1):
            all_scores.append(last_score)
            clf_para = copy(last_clf_params)
            clf_para['n_estimators'] = i + 1
            all_clf_params.append(clf_para)
            n_test_samples.append(this_n_test_samples)

    if verbose > 1:
        end_msg = "%s -%s" % (msg,
                              logger.short_format_time(time.time() -
                                                       start_time))
        print "[BoostGridSearchCV] %s %s" % ((64 - len(end_msg)) * '.', end_msg)
    return all_scores, all_clf_params, n_test_samples
Code Example #8
    def test_kernel_precomputed(self):
        from sklearn.metrics.pairwise import pairwise_kernels
        from sklearn.cross_validation import _safe_split

        m = MinlipSurvivalAnalysis(kernel="precomputed", solver="cvxpy")
        K = pairwise_kernels(self.x, metric="rbf")

        train_idx = numpy.arange(50, self.x.shape[0])
        test_idx = numpy.arange(50)
        X_fit, y_fit = _safe_split(m, K, self.y, train_idx)
        X_test, y_test = _safe_split(m, K, self.y, test_idx, train_idx)

        m.fit(X_fit, y_fit)

        p = m.predict(X_test)
        v = concordance_index_censored(y_test['cens'], y_test['time'], p)

        expected = numpy.array([0.508748, 378, 365, 0, 0])

        assert_array_almost_equal(expected, v)
Code Example #9
File: grid_search.py Project: mayoub/hhana
def score_each_boost(estimator, parameters, min_n_estimators, X, y,
                     sample_weight, score_func, train, test, verbose):
    """Run fit on one set of parameters

    Returns the score and the instance of the classifier
    """
    if verbose > 1:
        start_time = time.time()
        msg = '%s' % (', '.join('%s=%s' % (k, v)
                                for k, v in parameters.iteritems()))
        print "[BoostGridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.')

    X_test, y_test, sample_weight_test = _safe_split(estimator, X, y,
                                                     sample_weight, test,
                                                     train)

    test_score_params = {}
    if sample_weight is not None:
        test_score_params['sample_weight'] = sample_weight_test

    this_n_test_samples = _num_samples(X_test)

    all_scores = []
    all_clf_params = []
    n_test_samples = []

    for i, y_pred in enumerate(estimator.staged_predict(X_test)):
        if i + 1 < min_n_estimators:
            continue
        score = score_func(y_test, y_pred, **test_score_params)
        all_scores.append(score)
        clf_para = copy(parameters)
        clf_para['n_estimators'] = i + 1
        all_clf_params.append(clf_para)
        n_test_samples.append(this_n_test_samples)

    # boosting may have stopped early
    if len(all_scores) < estimator.n_estimators - min_n_estimators + 1:
        last_score = all_scores[-1]
        last_clf_params = all_clf_params[-1]
        for i in range(len(all_scores),
                       estimator.n_estimators - min_n_estimators + 1):
            all_scores.append(last_score)
            clf_para = copy(last_clf_params)
            clf_para['n_estimators'] = i + 1
            all_clf_params.append(clf_para)
            n_test_samples.append(this_n_test_samples)

    if verbose > 1:
        end_msg = "%s -%s" % (
            msg, logger.short_format_time(time.time() - start_time))
        print "[BoostGridSearchCV] %s %s" % (
            (64 - len(end_msg)) * '.', end_msg)
    return all_scores, all_clf_params, n_test_samples
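
The trick score_each_boost relies on is staged_predict, which replays an already-fitted boosted ensemble one stage at a time, so every candidate n_estimators value is scored from a single fit instead of one fit per grid point. A hedged standalone sketch of that pattern with scikit-learn's AdaBoostClassifier:

from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score

X, y = make_classification(n_samples=200, random_state=0)
clf = AdaBoostClassifier(n_estimators=50).fit(X[:150], y[:150])

# One fit, then one score per boosting stage.
stage_scores = [accuracy_score(y[150:], y_pred)
                for y_pred in clf.staged_predict(X[150:])]
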
Code Example #10
File: grid_search.py Project: afcarl/dask-learn
def _fit_and_score(estimator, X, y, scorer, train, test, verbose,
                   parameters, fit_params, return_train_score=False,
                   return_parameters=False, error_score='raise'):

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    if y_train is None:
        estimator.fit(X_train, **fit_params)
    else:
        estimator.fit(X_train, y_train, **fit_params)
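    # Note: the scorer argument is never used below; scoring falls back to
    # the estimator's built-in score method.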
    test_score = estimator.score(X_test, y_test)

    scoring_time = time.time() - start_time

    ret = [test_score, _num_samples(X_test), scoring_time]
    if return_parameters:
        ret.append(parameters)
    return ret
Code Example #11
File: gridSearch.py Project: jbjorne/CAMDA2015
def _extended_fit_and_score(estimator, X, y, scorer, train, test, verbose,
                   parameters, fit_params, return_train_score=False,
                   return_parameters=False, error_score='raise', extraOut="auto"):
    if verbose > 1:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                          for k, v in parameters.items()))
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, _index_param_value(X, v, train))
                      for k, v in fit_params.items()])

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            if return_train_score:
                train_score = error_score
            warnings.warn("Classifier fit failed. The score on this train-test"
                          " partition for these parameters will be set to %f. "
                          "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)"
                             )

    else:
        test_score = _score(estimator, X_test, y_test, scorer)
        if return_train_score:
            train_score = _score(estimator, X_train, y_train, scorer)

    scoring_time = time.time() - start_time

    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = [train_score] if return_train_score else []
    ret.extend([test_score, _num_samples(X_test), scoring_time])
    if return_parameters:
        ret.append(parameters)
    
    # Add additional return values
    extraRVs = {}
    if extraOut is not None:
        extraRVs["counts"] = {"train": train.shape[0], "test": test.shape[0]}
        if "estimator" in extraOut:
            extraRVs["estimator"] = estimator
        if extraOut == "auto" or "predictions" in extraOut:
            assert test.shape[0] == X_test.shape[0]
            probabilities = estimator.predict_proba(X_test)
            probabilityByIndex = {}
            for exampleIndex, prediction in zip(test, probabilities):
                probabilityByIndex[exampleIndex] = prediction
            extraRVs["probabilities"] = probabilityByIndex
        if (extraOut == "auto" or "importances" in extraOut) and hasattr(estimator, "feature_importances_"):
            extraRVs["importances"] = estimator.feature_importances_
    ret.append(extraRVs)
    
    return ret
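
The extraRVs dict appended at the end is how this fork carries per-fold artifacts (fitted estimators, out-of-fold probabilities, feature importances) back out of the parallel workers. A hypothetical unpacking of one worker's return value, assuming return_train_score=False and return_parameters=True (the variable names are illustrative):

# ret == [test_score, n_test_samples, scoring_time, parameters, extraRVs]
test_score, n_test, elapsed, params, extra = ret
oof_probabilities = extra.get("probabilities", {})  # example index -> class probabilities
importances = extra.get("importances")              # only present for models with feature_importances_
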
Code Example #12
File: rfa.py Project: scottclowe/eat-it
    def fit(self, X, y):
        """Fit the RFA model and automatically tune the number of selected
           features.
        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vector, where `n_samples` is the number of samples and
            `n_features` is the total number of features.
        y : array-like, shape = [n_samples]
            Target values (integers for classification, real numbers for
            regression).
        """
        X, y = check_X_y(X, y, "csr")
        if self.estimator_params is not None:
            warnings.warn("The parameter 'estimator_params' is deprecated as of version 0.16 "
                          "and will be removed in 0.18. The parameter is no longer "
                          "necessary because the value is set via the estimator initialisation "
                          "or set_params function.",
                          DeprecationWarning)
        # Initialization
        rfa = RFA(estimator=self.estimator, n_features_to_select=1,
                  step=self.step, estimator_params=self.estimator_params,
                  verbose=self.verbose - 1)

        cv = check_cv(self.cv, X, y, is_classifier(self.estimator))
        scorer = check_scoring(self.estimator, scoring=self.scoring)
        scores = np.zeros(X.shape[1])
        n_features_to_select_by_rank = np.zeros(X.shape[1])

        # Cross-validation
        for n, (train, test) in enumerate(cv):
            X_train, y_train = _safe_split(self.estimator, X, y, train)
            X_test, y_test = _safe_split(self.estimator, X, y, test, train)

            # Compute a full ranking of the features
            # ranking_ contains the same set of values for all CV folds,
            # but perhaps reordered
            ranking_ = rfa.fit(X_train, y_train).ranking_
            # Score each subset of features
            for k in range(0, np.max(ranking_)):
                indices = np.where(ranking_ <= k + 1)[0]
                estimator = clone(self.estimator)
                estimator.fit(X_train[:, indices], y_train)
                score = _score(estimator, X_test[:, indices], y_test, scorer)

                if self.verbose > 0:
                    print("Finished fold with %d / %d feature ranks, score=%f"
                          % (k + 1, np.max(ranking_), score))
                scores[k] += score
                # n_features_to_select_by_rank[k] is being overwritten
                # multiple times, but by the same value
                n_features_to_select_by_rank[k] = indices.size

        # Select the best upper bound for feature rank. It's OK to use the
        # last ranking_, as np.max(ranking_) is the same over all CV folds.
        scores = scores[:np.max(ranking_)]
        k = np.argmax(scores)

        # Re-execute an elimination with best_k over the whole set
        rfa = RFA(estimator=self.estimator,
                  n_features_to_select=n_features_to_select_by_rank[k],
                  step=self.step, estimator_params=self.estimator_params)

        rfa.fit(X, y)

        # Set final attributes
        self.support_ = rfa.support_
        self.n_features_ = rfa.n_features_
        self.ranking_ = rfa.ranking_
        self.estimator_ = clone(self.estimator)
        if self.estimator_params:
            self.estimator_.set_params(**self.estimator_params)
        self.estimator_.fit(self.transform(X), y)

        # Fix a normalization error: n ends up equal to len(cv) - 1 here,
        # so the scores are normalized by len(cv)
        self.grid_scores_ = scores / len(cv)
        return self
Code Example #13
File: train_test.py Project: blauigris/htcluster
def fit_and_score_n_support(estimator, X, y, scorer, train, test, verbose,
                            parameters, fit_params, return_train_score=False,
                            return_parameters=False, return_n_support=True,
                            error_score='raise'):
    """Fit estimator and compute scores for a given dataset split.

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.

    X : array-like of shape at least 2D
        The data to fit.

    y : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.

    scorer : callable
        A scorer callable object / function with signature
        ``scorer(estimator, X, y)``.

    train : array-like, shape (n_train_samples,)
        Indices of training samples.

    test : array-like, shape (n_test_samples,)
        Indices of test samples.

    verbose : integer
        The verbosity level.

    error_score : 'raise' (default) or numeric
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is raised. If a numeric value is given,
        FitFailedWarning is raised. This parameter does not affect the refit
        step, which will always raise the error.

    parameters : dict or None
        Parameters to be set on the estimator.

    fit_params : dict or None
        Parameters that will be passed to ``estimator.fit``.

    return_train_score : boolean, optional, default: False
        Compute and return score on training set.

    return_parameters : boolean, optional, default: False
        Return parameters that have been used for the estimator.

    Returns
    -------
    train_score : float, optional
        Score on training set, returned only if `return_train_score` is `True`.

    test_score : float
        Score on test set.

    n_test_samples : int
        Number of test samples.

    scoring_time : float
        Time spent for fitting and scoring in seconds.

    parameters : dict or None, optional
        The parameters that have been evaluated.
    """
    if verbose > 1:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                                    for k, v in parameters.items()))
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, _index_param_value(X, v, train))
                       for k, v in fit_params.items()])

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            if return_train_score:
                train_score = error_score
            warnings.warn("Classifier fit failed. The score on this train-test"
                          " partition for these parameters will be set to %f. "
                          "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)"
                             )

    else:
        test_score = _score(estimator, X_test, y_test, scorer)
        if return_train_score:
            train_score = _score(estimator, X_train, y_train, scorer)

    scoring_time = time.time() - start_time

    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = [train_score] if return_train_score else []
    ret.extend([test_score, _num_samples(X_test), scoring_time])
    if return_parameters:
        ret.append(parameters)
    if return_n_support:
        ret.append(estimator.n_support_)
    return ret
Code Example #14
def _fit_and_score(estimator, X, y, scorer, train, test, cv, verbose, parameters,
                   fit_params, return_train_score=False,
                   return_parameters=False):
    """Fit estimator and compute scores for a given dataset split.

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.

    X : array-like of shape at least 2D
        The data to fit.

    y : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.

    scorer : callable
        A scorer callable object / function with signature
        ``scorer(estimator, X, y)``.

    train : array-like, shape = (n_train_samples,)
        Indices of training samples.

    test : array-like, shape = (n_test_samples,)
        Indices of test samples.
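
    cv : cross-validation generator or None
        When not None, it is forwarded to ``estimator.fit`` as ``fit_params["cv"]``.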

    verbose : integer
        The verbosity level.

    parameters : dict or None
        Parameters to be set on the estimator.

    fit_params : dict or None
        Parameters that will be passed to ``estimator.fit``.

    return_train_score : boolean, optional, default: False
        Compute and return score on training set.

    return_parameters : boolean, optional, default: False
        Return parameters that have been used for the estimator.

    Returns
    -------
    train_score : float, optional
        Score on training set, returned only if `return_train_score` is `True`.

    test_score : float
        Score on test set.

    n_test_samples : int
        Number of test samples.

    scoring_time : float
        Time spent for fitting and scoring in seconds.

    parameters : dict or None, optional
        The parameters that have been evaluated.
    """
    if verbose > 1:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                          for k, v in parameters.items()))
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    n_samples = _num_samples(X)

    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, np.asarray(v)[train]
                       if hasattr(v, '__len__') and len(v) == n_samples else v)
                       for k, v in fit_params.items()])
    if cv is not None:
        fit_params["cv"] = cv

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)
    if y_train is None:
        estimator.fit(X_train, **fit_params)
    else:
        estimator.fit(X_train, y_train, **fit_params)
    test_score = _score(estimator, X_test, y_test, scorer)
    if return_train_score:
        train_score = _score(estimator, X_train, y_train, scorer)

    scoring_time = time.time() - start_time

    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = [train_score] if return_train_score else []
    ret.extend([test_score, _num_samples(X_test), scoring_time])
    if return_parameters:
        ret.append(parameters)
    return ret
Code Example #15
File: build.py Project: tgadf/pymva
def grid_search_early_stopping(estimator,
                               param_grid,
                               verbose,
                               scoring,
                               cv,
                               X,
                               y,
                               early_stopping_rounds,
                               eval_set_size,
                               n_jobs=1,
                               iid=True,
                               refit=True,
                               pre_dispatch='2*n_jobs',
                               error_score='raise'):
    ''' This is from the scikit-learn package.
    '''

    parameter_iterable = ParameterGrid(param_grid)
    scorer_ = check_scoring(estimator, scoring=scoring)

    n_samples = _num_samples(X)
    X, y = indexable(X, y)

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)' %
                             (len(y), n_samples))
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    if verbose > 0:
        if isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(len(cv), n_candidates,
                                     n_candidates * len(cv)))

    base_estimator = clone(estimator)

    out = Parallel(
        n_jobs=n_jobs,
        verbose=2 if verbose > 0 else 0,
        pre_dispatch=pre_dispatch)(delayed(_fit_and_score)(
            clone(base_estimator),
            X,
            y,
            scorer_,
            train,
            test,
            2 if verbose > 0 else 0,
            parameters, {
                "early_stopping_rounds": early_stopping_rounds,
                "eval_metric": get_xgboost_eval_metric(scoring),
                "eval_set": [_safe_split(estimator, X, y, test, train)],
                "verbose": True if verbose > 1 else False
            },
            return_parameters=True,
            error_score=error_score) for parameters in parameter_iterable
                                   for train, test in cv)

    # Out is a list of 4-tuples: (score, n_test_samples, scoring_time, parameters)
    n_fits = len(out)
    n_folds = len(cv)

    scores = list()
    grid_scores = list()
    for grid_start in range(0, n_fits, n_folds):
        n_test_samples = 0
        score = 0
        all_scores = []
        for this_score, this_n_test_samples, _, parameters in \
                out[grid_start:grid_start + n_folds]:
            all_scores.append(this_score)
            if iid:
                this_score *= this_n_test_samples
                n_test_samples += this_n_test_samples
            score += this_score
        if iid:
            score /= float(n_test_samples)
        else:
            score /= float(n_folds)
        scores.append((score, parameters))
        # TODO: shall we also store the test_fold_sizes?
        grid_scores.append(
            _CVScoreTuple(parameters, score, np.array(all_scores)))

    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores,
                  key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    best_score_ = best.mean_validation_score

    if refit:
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(**best.parameters)

        if y is not None:
            best_estimator, _, _ = fit_estimator_early_stopping(
                best_estimator, X, y, scoring, early_stopping_rounds,
                eval_set_size, verbose)
        else:
            raise ValueError('y is required.')

    return best_estimator, best.parameters, grid_scores
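
The notable move here is routing early stopping through fit_params: the held-out fold, obtained with _safe_split, becomes XGBoost's eval_set for each candidate. A hedged sketch of the same pattern against a plain XGBClassifier (get_xgboost_eval_metric is project-specific, so "logloss" stands in; the constructor-level early_stopping_rounds assumes xgboost >= 1.6):

import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

X = np.random.rand(200, 5)
y = np.random.randint(0, 2, size=200)
X_tr, X_val, y_tr, y_val = train_test_split(X, y, random_state=0)

clf = XGBClassifier(n_estimators=500, early_stopping_rounds=10,
                    eval_metric="logloss")
clf.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)
# clf.best_iteration holds the stopping point chosen on the held-out fold
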
Code Example #16
def _fit_and_score(estimator,
                   X,
                   y,
                   scorer,
                   train,
                   test,
                   verbose,
                   parameters,
                   fit_params,
                   scoring,
                   return_train_score=False,
                   return_parameters=False,
                   error_score='raise'):
    """
    Fit estimator and compute scores for a given dataset split.
    This overrides the behavior of the _fit_and_score method in cross_validation.py.
    Note that a new argument, scoring, has been added to the function.

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.
    X : array-like of shape at least 2D
        The data to fit.
    y : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.
    scorer : callable
        A scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
    train : array-like, shape (n_train_samples,)
        Indices of training samples.
    test : array-like, shape (n_test_samples,)
        Indices of test samples.
    verbose : integer
        The verbosity level.
    error_score : 'raise' (default) or numeric
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is raised. If a numeric value is given,
        FitFailedWarning is raised. This parameter does not affect the refit
        step, which will always raise the error.
    parameters : dict or None
        Parameters to be set on the estimator.
    fit_params : dict or None
        Parameters that will be passed to ``estimator.fit``.
    scoring : string
        The name of the scoring function used in cross_val_score. Default is
        accuracy.
    return_train_score : boolean, optional, default: False
        Compute and return score on training set.
    return_parameters : boolean, optional, default: False
        Return parameters that have been used for the estimator.
    Returns
    -------
    train_score : float, optional
        Score on training set, returned only if `return_train_score` is `True`.
    test_score : float
        Score on test set.
    n_test_samples : int
        Number of test samples.
    scoring_time : float
        Time spent for fitting and scoring in seconds.
    parameters : dict or None, optional
        The parameters that have been evaluated.
    """
    if verbose > 1:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                                    for k, v in parameters.items()))
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, _index_param_value(X, v, train))
                       for k, v in fit_params.items()])

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    x_train, y_train = _safe_split(estimator, X, y, train)
    x_test, y_test = _safe_split(estimator, X, y, test, train)

    try:
        if y_train is None:
            b = estimator.fit(x_train, **fit_params)
        else:
            b = estimator.fit(x_train, y_train, **fit_params)
    except Exception as e:
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            if return_train_score:
                train_score = error_score
            warnings.warn(
                "Classifier fit failed. The score on this train-test"
                " partition for these parameters will be set to %f. "
                "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")

    else:
        test_score = _score(estimator, x_test, y_test, scorer)
        if return_train_score:
            train_score = _score(estimator, x_train, y_train, scorer)

    # Addition to original scikit code:
    # Create FitEvents for each estimator fit.
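    # NOTE: if the fit raised and error_score was numeric, `b` was never
    # bound, so this FitEvent creation would fail with a NameError.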
    fit_event = FitEvent(b, estimator, x_train)
    ModelDbSyncer.Syncer.instance.add_to_buffer(fit_event)

    scoring_time = time.time() - start_time
    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = [train_score] if return_train_score else []
    ret.extend([test_score, _num_samples(x_test), scoring_time])

    # Addition to original scikit code:
    # Create MetricEvents for each estimator.
    metric_event = MetricEvent(x_test, estimator, "", "", scoring, test_score)
    ModelDbSyncer.Syncer.instance.add_to_buffer(metric_event)

    if return_parameters:
        ret.append(parameters)
    return ret
Code Example #17
File: gridSearch.py Project: jbjorne/DiMSUM2016
def _extended_fit_and_score(estimator,
                            X,
                            y,
                            scorer,
                            train,
                            test,
                            verbose,
                            parameters,
                            fit_params,
                            return_train_score=False,
                            return_parameters=False,
                            error_score='raise',
                            extraOut="auto"):
    if verbose > 1:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                                    for k, v in parameters.items()))
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, _index_param_value(X, v, train))
                       for k, v in fit_params.items()])

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            if return_train_score:
                train_score = error_score
            warnings.warn(
                "Classifier fit failed. The score on this train-test"
                " partition for these parameters will be set to %f. "
                "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")

    else:
        test_score = _score(estimator, X_test, y_test, scorer)
        if return_train_score:
            train_score = _score(estimator, X_train, y_train, scorer)

    scoring_time = time.time() - start_time

    if verbose > 2:
        msg += " , n=" + str(X_test.shape[0]) + ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = [train_score] if return_train_score else []
    ret.extend([test_score, _num_samples(X_test), scoring_time])
    if return_parameters:
        ret.append(parameters)

    # Add additional return values
    extraRVs = {}
    if extraOut is not None:
        extraRVs["counts"] = {"train": train.shape[0], "test": test.shape[0]}
        if "estimator" in extraOut:
            extraRVs["estimator"] = estimator
        if extraOut == "auto" or "predictions" in extraOut:
            assert test.shape[0] == X_test.shape[0]
            probabilities = estimator.predict_proba(X_test)
            probabilityByIndex = {}
            for exampleIndex, prediction in zip(test, probabilities):
                probabilityByIndex[exampleIndex] = prediction
            extraRVs["probabilities"] = probabilityByIndex
        if (extraOut == "auto" or "importances" in extraOut) and hasattr(
                estimator, "feature_importances_"):
            extraRVs["importances"] = estimator.feature_importances_
    ret.append(extraRVs)

    return ret