def _grid_search_params_iter(self, train_X, train_y):
        if callable(self.inner_cv):
            inner_cv = self.inner_cv(train_X, train_y)
        else:
            inner_cv = _check_cv(self.inner_cv, train_X, train_y, classifier=is_classifier(self.estimator))

        param_iter = ParameterGrid(self.param_grid)
        LOG.info("Performing grid search over %d configurations" % len(param_iter))

        for fold_id, (train_index, test_index) in enumerate(inner_cv):
            for parameters in param_iter:
                yield fold_id + 1, train_index, test_index, parameters
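
For context, a minimal sketch of how ParameterGrid expands a param_grid dict into individual configurations; the import path shown is an assumption about the scikit-learn version being targeted.

try:
    from sklearn.model_selection import ParameterGrid  # scikit-learn >= 0.18
except ImportError:
    from sklearn.grid_search import ParameterGrid      # older releases

grid = ParameterGrid({'C': [0.1, 1.0], 'kernel': ['linear', 'rbf']})
print(len(grid))      # 4 configurations
print(list(grid)[0])  # {'C': 0.1, 'kernel': 'linear'}
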
def test_check_cv_return_types():
    X = np.ones((9, 2))
    cv = cval._check_cv(3, X, classifier=False)
    assert_true(isinstance(cv, cval.KFold))

    y_binary = np.array([0, 1, 0, 1, 0, 0, 1, 1, 1])
    cv = cval._check_cv(3, X, y_binary, classifier=True)
    assert_true(isinstance(cv, cval.StratifiedKFold))

    y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2])
    cv = cval._check_cv(3, X, y_multiclass, classifier=True)
    assert_true(isinstance(cv, cval.StratifiedKFold))

    X = np.ones((5, 2))
    y_seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]]
    cv = cval._check_cv(3, X, y_seq_of_seqs, classifier=True)
    assert_true(isinstance(cv, cval.KFold))

    y_indicator_matrix = LabelBinarizer().fit_transform(y_seq_of_seqs)
    cv = cval._check_cv(3, X, y_indicator_matrix, classifier=True)
    assert_true(isinstance(cv, cval.KFold))

    y_multioutput = np.array([[1, 2], [0, 3], [0, 0], [3, 1], [2, 0]])
    cv = cval._check_cv(3, X, y_multioutput, classifier=True)
    assert_true(isinstance(cv, cval.KFold))
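
For reference, a hedged sketch of the same return-type checks against the public check_cv in newer scikit-learn (sklearn.model_selection), whose signature drops X:

import numpy as np
from sklearn.model_selection import check_cv, KFold, StratifiedKFold

y_binary = np.array([0, 1, 0, 1, 0, 0, 1, 1, 1])
assert isinstance(check_cv(3, y_binary, classifier=True), StratifiedKFold)
assert isinstance(check_cv(3, y_binary, classifier=False), KFold)
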
Example 4
    def _grid_search(self, train_X, train_y):
        if callable(self.inner_cv):
            inner_cv = self.inner_cv(train_X, train_y)
        else:
            inner_cv = _check_cv(self.inner_cv,
                                 train_X,
                                 train_y,
                                 classifier=is_classifier(self.estimator))

        master = MPIGridSearchCVMaster(self.param_grid, inner_cv,
                                       self.estimator, self.scorer_,
                                       self.fit_params)
        return master.run(train_X, train_y)
Example 5
    def fit(self, X, y):
        X, y = check_X_y(X,
                         y,
                         force_all_finite=False,
                         multi_output=self.multi_output)
        _check_param_grid(self.param_grid)

        cv = _check_cv(self.cv, X, y, classifier=is_classifier(self.estimator))

        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        if comm_rank == 0:
            self._fit_master(X, y, cv)
        else:
            self._fit_slave()

        return self
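
The comm_rank check above assumes an MPI rank resolved at module import time; a minimal sketch of how such a module-level value could be obtained with mpi4py (the variable names are assumptions):

from mpi4py import MPI

comm = MPI.COMM_WORLD
comm_rank = comm.Get_rank()   # 0 on the master process, > 0 on the workers
comm_size = comm.Get_size()   # total number of MPI processes
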
    def fit(self, X, y=None, **fit_params):
        """Fit ensemble of models

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training data.

        y : array-like, optional
            Target data if base estimators are supervised.

        Returns
        -------
        self
        """
        self._check_params()

        cv = _check_cv(self.cv, X, y)
        self._fit(X, y, cv, **fit_params)

        return self
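
The cv object handed to self._fit iterates as (train_indices, test_indices) pairs; a minimal, hedged illustration using the pre-0.18 cross_validation API (the version is an assumption based on the private _check_cv import):

from sklearn.cross_validation import KFold  # assumes scikit-learn < 0.18

cv = KFold(n=10, n_folds=5)
for train_idx, test_idx in cv:
    # each pair partitions range(10) into 8 training and 2 test indices
    assert len(train_idx) == 8 and len(test_idx) == 2
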
Example 7
def _cross_val_predict(estimator, X, y=None, cv=None, n_jobs=1,
                      verbose=0, fit_params=None, pre_dispatch='2*n_jobs'):
    """Generate cross-validated estimates for each input data point

    Parameters
    ----------
    estimator : estimator object implementing 'fit' and 'predict'
        The object to use to fit the data.

    X : array-like
        The data to fit. Can be, for example, a list or an array of at least 2 dimensions.

    y : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.

    cv : cross-validation generator or int, optional, default: None
        A cross-validation generator to use. If int, determines
        the number of folds in StratifiedKFold if y is binary
        or multiclass and estimator is a classifier, or the number
        of folds in KFold otherwise. If None, it is equivalent to cv=3.
        This generator must include all elements in the test set exactly once.
        Otherwise, a ValueError is raised.

    n_jobs : integer, optional
        The number of CPUs to use to do the computation. -1 means
        'all CPUs'.

    verbose : integer, optional
        The verbosity level.

    fit_params : dict, optional
        Parameters to pass to the fit method of the estimator.

    pre_dispatch : int, or string, optional
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. This parameter can be:

            - None, in which case all the jobs are immediately
              created and spawned. Use this for lightweight and
              fast-running jobs, to avoid delays due to on-demand
              spawning of the jobs

            - An int, giving the exact number of total jobs that are
              spawned

            - A string, giving an expression as a function of n_jobs,
              as in '2*n_jobs'

    Returns
    -------
    preds : ndarray
        This is the result of calling 'predict'
    """
    X, y = indexable(X, y)

    cv = _check_cv(cv, X, y, classifier=is_classifier(estimator))
    # A fresh pylibfm.FM instance is created for every fold so that the folds
    # stay independent and the dispatched work is pickle-able; `estimator` is
    # only used above to pick the cross-validation strategy.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)
    preds_blocks = parallel(delayed(_fit_and_predict)(pylibfm.FM(), X, y,
                                                      train, test, verbose,
                                                      fit_params)
                            for train, test in cv)
    p = np.concatenate([p for p, _ in preds_blocks])
    locs = np.concatenate([loc for _, loc in preds_blocks])
    if not _check_is_partition(locs, X.shape[0]):
        raise ValueError('cross_val_predict only works for partitions')
    preds = p.copy()
    preds[locs] = p
    return preds
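
A minimal sketch of what the _check_is_partition helper used above needs to verify; this mirrors scikit-learn's internal helper and should be treated as an assumption rather than the exact upstream code:

import numpy as np

def _check_is_partition(locs, n):
    # True iff `locs` is a reordering of np.arange(n), i.e. every sample
    # index appears in exactly one test fold.
    if len(locs) != n:
        return False
    hit = np.zeros(n, dtype=bool)
    hit[locs] = True
    return bool(np.all(hit))
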
    def fit(self, X, y, fit_params=None, predict_params=None, X_test=None, y_test=None):
        """Do nested cross-validation.

        If ``X_test`` and ``y_test`` are not provided, nested cross-validation using
        ``X`` and ``y`` is performed, i.e., the data is first split into *K* folds, where
        *K-1* folds are used for training and hyper-parameter selection and the
        remaining fold for testing. The training portion is again split into *T* folds
        to perform a grid-search over hyper-parameters. The parameters that achieved the
        best average performance across the *T* inner cross-validation folds are selected.
        Using these parameters, a model is trained on the entire training data and applied
        to the *K*-th testing fold.

        If ``X_test`` and ``y_test`` are provided, a regular cross-validation is performed on
        ``X`` and ``y`` to determine hyper-parameters as for the inner cross-validation above.
        Using the best performing parameters, a model is trained on all of ``X`` and ``y`` and
        applied to ``X_test`` and ``y_test`` for testing.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Feature matrix.

        y : structured array, shape = [n_samples]
            A structured array containing the binary event indicator
            as first field, and time of event or time of censoring as
            second field.

        fit_params : dict
            Additional arguments passed to the fit method.

        predict_params : dict
            Additional arguments passed to the predict method.

        X_test : array-like, shape = [n_test_samples, n_features]
            Hold-out data to perform testing on.

        y_test : array-like or sequence, shape = [n_test_samples]
            Target values of hold-out test data.

        Returns
        -------
        self
        """
        if y.dtype.names is None:
            X, y = check_X_y(X, y)
        else:
            X, event, time = check_arrays_survival(X, y, force_all_finite=False)
            y = numpy.fromiter(zip(event, time), dtype=[('event', bool), ('time', numpy.float64)])

        if X_test is not None:
            X_test, event_test, time_test = check_arrays_survival(X_test, y_test, force_all_finite=False)
            y_test = numpy.fromiter(zip(event_test, time_test), dtype=[('event', bool), ('time', numpy.float64)])

        cv = _check_cv(self.cv, X, y, classifier=is_classifier(self.estimator))

        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        self._dview, self._lview = self._init_cluster()
        if X_test is None:
            self._fit(X, y, cv, fit_params, predict_params)
        else:
            self._fit_holdout(X, y, fit_params, predict_params, X_test, y_test)

        del self._dview
        del self._lview

        return self
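
For illustration, a hypothetical structured survival target in the layout the docstring describes (binary event indicator first, event or censoring time second):

import numpy as np

y = np.array([(True, 12.3), (False, 31.0), (True, 5.4)],
             dtype=[('event', bool), ('time', np.float64)])
y.dtype.names   # ('event', 'time')
y['event']      # array([ True, False,  True])
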
Example 9
def permutation_test_score(estimator, X, y, data_train=None, cv=None,
                           n_permutations=100, n_jobs=1, labels=None,
                           random_state=0, verbose=0, scoring=None):
    """Evaluate the significance of a cross-validated score with permutations

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.

    X : array-like of shape at least 2D
        The data to fit.

    y : array-like
        The target variable to try to predict in the case of
        supervised learning.
        
    data_train : np.array, optional
        Data to train on, if data for training is different from X.

    scoring : string, callable or None, optional, default: None
        A string (see model evaluation documentation) or
        a scorer callable object / function with signature
        ``scorer(estimator, X, y)``.

    cv : integer or cross-validation generator, optional
        If an integer is passed, it is the number of folds (default 3).
        Specific cross-validation objects can be passed, see
        sklearn.cross_validation module for the list of possible objects.

    n_permutations : integer, optional
        Number of times to permute ``y``.

    n_jobs : integer, optional
        The number of CPUs to use to do the computation. -1 means
        'all CPUs'.

    labels : array-like of shape [n_samples] (optional)
        Labels constrain the permutation among groups of samples with
        the same label.

    random_state : RandomState or an int seed (0 by default)
        A random number generator instance to define the state of the
        random permutations generator.

    verbose : integer, optional
        The verbosity level.

    Returns
    -------
    score : float
        The true score without permuting targets.

    permutation_scores : array, shape = [n_permutations]
        The scores obtained for each permutation.

    pvalue : float
        The returned value equals the p-value if the scorer returns bigger
        numbers for better scores (e.g., accuracy_score). If the scorer is
        rather a loss function (i.e. when lower is better, such as
        mean_squared_error), then this is actually the complement of the
        p-value: 1 - p-value.

    Notes
    -----
    This function implements Test 1 in:

        Ojala and Garriga. Permutation Tests for Studying Classifier
        Performance.  The Journal of Machine Learning Research (2010)
        vol. 11

    """
    X, y = indexable(X, y)
    cv = _check_cv(cv, X, y, classifier=is_classifier(estimator))
    scorer = check_scoring(estimator, scoring=scoring)
    random_state = check_random_state(random_state)
    
    if data_train is None:
        # We clone the estimator to make sure that all the folds are
        # independent, and that it is pickle-able.
        # Default behavior of sklearn permutation score
        score = _permutation_test_score(clone(estimator), X, y, cv, scorer)
        permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose)(
            delayed(_permutation_test_score)(
                clone(estimator), X, _shuffle(y, labels, random_state), cv,
                scorer)
            for _ in range(n_permutations))
    else:
        # Modification for 2pn
        # First get the real score, train on nii_optional (actor), test on nii_func (observer)
        score = []
        for train, test in cv:
            estimator.fit(data_train[train], y[train])
            score.append(scorer(estimator, X[test], y[test]))
        score = np.mean(score)
        # Then, get the permutation scores
        permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose)(
            delayed(_permutation_test_score)(
                clone(estimator), X, _shuffle(y, labels, random_state), cv,
                scorer, data_train)
            for _ in range(n_permutations))
                
    permutation_scores = np.array(permutation_scores)
    pvalue = (np.sum(permutation_scores >= score) + 1.0) / (n_permutations + 1)
    return score, permutation_scores, pvalue
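
A minimal sketch (an assumption, not the project's actual helper) of the modified _permutation_test_score invoked above: it mirrors scikit-learn's version but, when data_train is given, fits on data_train while still scoring on X:

import numpy as np

def _permutation_test_score(estimator, X, y, cv, scorer, data_train=None):
    fold_scores = []
    for train, test in cv:
        fit_X = X if data_train is None else data_train
        estimator.fit(fit_X[train], y[train])
        fold_scores.append(scorer(estimator, X[test], y[test]))
    return np.mean(fold_scores)
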