Example #1
def test_check_sample_weight():
    from sklearn.cluster.k_means_ import _check_sample_weight
    sample_weight = None
    checked_sample_weight = _check_sample_weight(X, sample_weight)
    assert_equal(_num_samples(X), _num_samples(checked_sample_weight))
    assert_almost_equal(checked_sample_weight.sum(), _num_samples(X))
    assert_equal(X.dtype, checked_sample_weight.dtype)
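For quick orientation (an addition, not part of the listed example), `_num_samples` can be exercised directly; this minimal sketch assumes a recent scikit-learn where the helper lives in `sklearn.utils.validation` (the test above uses an older module path):

import numpy as np
from sklearn.utils.validation import _num_samples

# _num_samples reports the first dimension for arrays and
# falls back to len() for plain Python sequences.
print(_num_samples(np.zeros((10, 3))))   # 10
print(_num_samples([[1, 2], [3, 4]]))    # 2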
Example #2
def _index_param_value(X, v, indices):
    """Private helper function for parameter value indexing."""
    if not _is_arraylike(v) or _num_samples(v) != _num_samples(X):
        # pass through: skip indexing
        return v
    if sp.issparse(v):
        v = v.tocsr()
    return safe_indexing(v, indices)
Example #3
def _fit_and_score(estimator, depthmaps, offset_points_projected,
                   direction_vectors, true_joints, scorer, train, test, verbose,
                   parameters, fit_params, return_train_score=False,
                   return_parameters=False, error_score='raise'):
    
    if verbose > 1:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v) for k, v in parameters.items()))
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))
        
    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, _index_param_value(depthmaps, v, train))
                        for k, v in fit_params.items()])
                            
    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    depth_train, offsets_train, directions_train, truths_train = _safe_split(
        depthmaps, offset_points_projected, direction_vectors, true_joints, train)
    depth_test, offsets_test, directions_test, truths_test = _safe_split(
        depthmaps, offset_points_projected, direction_vectors, true_joints, test)
    
    try:
        estimator.fit(depth_train, offsets_train, directions_train, **fit_params)
        
    except Exception as e:
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            if return_train_score:
                train_score = error_score
            warnings.warn("Classifier fit failed. The score on this train-test"
                          " partition for these parameters will be set to %f. "
                          "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)"
                             )
    
    else:
        test_score = _score(estimator, depth_test, truths_test, scorer)
        if return_train_score:
            train_score = _score(estimator, depth_train, truths_train, scorer)
        
    scoring_time = time.time() - start_time
    
    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = [train_score] if return_train_score else []
    ret.extend([test_score, _num_samples(depth_test), scoring_time])
    if return_parameters:
        ret.append(parameters)
    return ret
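The try/except branch above follows scikit-learn's error_score convention: re-raise, substitute a numeric placeholder score with a warning, or reject an invalid value. A minimal standalone sketch of the same pattern (fit_with_error_score and failing_fit are hypothetical helpers added for illustration):

import numbers
import warnings

def fit_with_error_score(fit, error_score='raise'):
    # Same branching as above: re-raise, or substitute a numeric
    # placeholder score and warn, or reject an invalid error_score.
    try:
        fit()
    except Exception as e:
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            warnings.warn("fit failed; scoring as %f. Details: %r"
                          % (error_score, e))
            return error_score
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value.")
    return None

def failing_fit():
    raise RuntimeError("boom")

print(fit_with_error_score(failing_fit, error_score=0.0))  # warns, then 0.0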
Example #4
def test_retrieve_samples_from_non_standard_shape():
    class TestNonNumericShape:
        def __init__(self):
            self.shape = ("not numeric",)

        def __len__(self):
            return len([1, 2, 3])

    X = TestNonNumericShape()
    assert _num_samples(X) == len(X)

    # check that it gives a good error if there's no __len__
    class TestNoLenWeirdShape:
        def __init__(self):
            self.shape = ("not numeric",)

    with pytest.raises(TypeError, match="Expected sequence or array-like"):
        _num_samples(TestNoLenWeirdShape())
Example #5
    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.

        :param X: array-like of shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        :param y: array-like of shape (n_samples,)
            Always ignored, exists for compatibility.
        :param groups: array-like of shape (n_samples,)
            Always ignored, exists for compatibility.

        :returns:
            train : ndarray
                The training set indices for that split.
            test : ndarray
                The testing set indices for that split.
        """
        if groups is None:
            raise ValueError("The 'groups' parameter should not be None.")
        X, y, groups = indexable(X, y, groups)
        groups = check_array(groups, ensure_2d=False, dtype=None)

        unique_groups, groups = np.unique(groups, return_inverse=True)
        n_samples_per_group = np.bincount(groups)
        n_groups = len(unique_groups)
        n_samples = _num_samples(X)
        n_splits = self.n_splits
        n_folds = n_splits + 1

        if self.n_splits > n_groups:
            raise ValueError("Cannot have number of splits n_splits=%d greater"
                             " than the number of groups: %d." %
                             (self.n_splits, n_groups))

        indices = np.arange(n_samples)
        test_size = (n_groups // n_folds)
        test_starts = range(test_size + n_groups % n_folds, n_groups,
                            test_size)
        for test_start in test_starts:
            # here we already have groups after inverse operation
            # and don't need to use unique_group
            if self.max_train_size:
                # find number of group for start not to overflow train size
                sizes = n_samples_per_group[:test_start][::-1].cumsum()
                appropriate_indices = np.where(sizes <= self.max_train_size)[0]
                if appropriate_indices.size == 0:
                    train_start = max(test_start - 1, 0)
                else:
                    train_start = test_start - appropriate_indices.max() - 1
                yield (indices[(groups < test_start)
                               & (groups >= train_start)],
                       indices[(groups >= test_start)
                               & (groups < test_start + test_size)])
            else:
                yield (indices[groups < test_start],
                       indices[(groups >= test_start)
                               & (groups < test_start + test_size)])
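The comment above about working with "groups after inverse operation" refers to the np.unique(..., return_inverse=True) idiom; a standalone illustration (my addition, with arbitrary group labels):

import numpy as np

groups = np.array(["a", "a", "b", "b", "b", "c"])
unique_groups, inv = np.unique(groups, return_inverse=True)
print(unique_groups)     # ['a' 'b' 'c']
print(inv)               # [0 0 1 1 1 2] -- group ids usable as indices
print(np.bincount(inv))  # [2 3 1]       -> n_samples_per_group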
Example #6
def _fit_and_score(estimator, Z, scorer, train, test, verbose,
                   parameters, fit_params, return_train_score=False,
                   return_parameters=False, error_score='raise'):

    if verbose > 1:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                                    for k, v in list(parameters.items())))
        print(("[CV] %s %s" % (msg, (64 - len(msg)) * '.')))

    fit_params = fit_params if fit_params is not None else {}

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    Z_train = Z[train]
    Z_test = Z[test]

    try:
        estimator.fit(Z_train, **fit_params)
    except Exception as e:
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            if return_train_score:
                train_score = error_score
            warnings.warn("Classifier fit failed. The score on this train-test"
                          " partition for these parameters will be set to %f. "
                          "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)"
                             )
    else:
        test_score = _score(estimator, Z_test, scorer)
        if return_train_score:
            train_score = _score(estimator, Z_train, scorer)

    scoring_time = time.time() - start_time

    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        print(("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg)))

    ret = [train_score] if return_train_score else []
    ret.extend([test_score, _num_samples(Z_test), scoring_time])
    if return_parameters:
        ret.append(parameters)
    return ret
Example #7
    def split(self, X, y=None, groups=None):
        n_samples = _num_samples(X)
        indices = np.arange(n_samples)

        # all events which are true are considered to be rare
        rare_event_indices = indices[np.sum(y, axis=tuple(range(1, y.ndim))) >= 0.999]

        for f, (train_idx, test_idx) in enumerate(super().split(X, y, groups)):
            yield np.hstack([train_idx, rare_event_indices]), np.hstack([test_idx, rare_event_indices])
Example #8
def _check_chunk_size(reduced, chunk_size):
    """Checks chunk is a sequence of expected size or a tuple of same
    """
    is_tuple = isinstance(reduced, tuple)
    if not is_tuple:
        reduced = (reduced,)
    if any(isinstance(r, tuple) or not hasattr(r, '__iter__')
           for r in reduced):
        raise TypeError('reduce_func returned %r. '
                        'Expected sequence(s) of length %d.' %
                        (reduced if is_tuple else reduced[0], chunk_size))
    if any(_num_samples(r) != chunk_size for r in reduced):
        # XXX: we use int(_num_samples...) because sometimes _num_samples
        #      returns a long in Python 2, even for small numbers.
        actual_size = tuple(int(_num_samples(r)) for r in reduced)
        raise ValueError('reduce_func returned object of length %s. '
                         'Expected same length as input: %d.' %
                         (actual_size if is_tuple else actual_size[0], chunk_size))
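This mirrors the guard scikit-learn applies to reduce_func results inside pairwise_distances_chunked; a short sketch (my addition) of a reduce_func that passes the check by returning one value per row of the chunk:

import numpy as np
from sklearn.metrics import pairwise_distances_chunked

X = np.random.RandomState(0).rand(20, 3)

def reduce_func(D_chunk, start):
    # one value per chunk row, so the chunk-size check passes
    return D_chunk.argmin(axis=1)

nearest = np.concatenate(list(pairwise_distances_chunked(X, reduce_func=reduce_func)))
print(nearest.shape)  # (20,)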
Example #9
    def split(self, X, y=None, groups=None):
        X, y, groups = indexable(X, y, groups)  # pylint: disable=unbalanced-tuple-unpacking
        n_samples = _num_samples(X)
        n_splits = self.n_splits
        n_folds = n_splits + 1
        delay = self.delay

        if n_folds > n_samples:
            raise ValueError(
                f'Cannot have number of folds={n_folds} greater than the number of samples: {n_samples}.'
            )

        indices = np.arange(n_samples)
        split_size = n_samples // n_folds

        train_size = self.train_size or split_size * self.n_splits
        test_size = self.test_size or n_samples // n_folds
        full_test = test_size + delay

        if full_test + n_splits > n_samples:
            raise ValueError(
                f'test_size\\({test_size}\\) + delay\\({delay}\\) = {test_size + delay} + '
                f'n_splits={n_splits} \n'
                f' greater than the number of samples: {n_samples}. Cannot create fold logic.'
            )

        # Generate logic for splits.
        # Overwrite fold test_starts ranges if force_step_size is specified.
        if self.force_step_size:
            step_size = self.force_step_size
            final_fold_start = n_samples - (train_size + full_test)
            range_start = (final_fold_start % step_size) + train_size

            test_starts = range(range_start, n_samples, step_size)

        else:
            if not self.train_size:
                step_size = split_size
                range_start = (split_size -
                               full_test) + split_size + (n_samples % n_folds)
            else:
                step_size = (n_samples - (train_size + full_test)) // n_folds
                final_fold_start = n_samples - (train_size + full_test)
                range_start = (final_fold_start -
                               (step_size * (n_splits - 1))) + train_size

            test_starts = range(range_start, n_samples, step_size)

        # Generate data splits.
        for test_start in test_starts:
            idx_start = test_start - train_size if self.train_size is not None else 0
            # Ensure we always return a test set of the same size
            if indices[test_start:test_start + full_test].size < full_test:
                continue
            yield (indices[idx_start:test_start],
                   indices[test_start + delay:test_start + full_test])
Example #10
    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

        y : array-like, shape (n_samples,)
            The target variable for supervised learning problems.

        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.

        Returns
        -------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.
        """

        if groups is not None:
            # find all indices that are at the beginning of a group
            groups_unique = np.unique(groups)
            possible_test_start = [
                np.where(i == groups)[0][0] for i in np.nditer(groups_unique)
            ]
            possible_test_start = np.asarray(possible_test_start)

        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        n_splits = self.n_splits
        n_folds = n_splits + 1
        if n_folds > n_samples:
            raise ValueError(("Cannot have number of folds ={0} greater"
                              " than the number of samples: {1}.").format(
                                  n_folds, n_samples))
        indices = np.arange(n_samples)
        test_size = (n_samples // n_folds)
        test_starts = range(test_size + n_samples % n_folds, n_samples,
                            test_size)

        if groups is not None:
            # find all possible starts that are closest to predefined test_starts
            test_starts = [
                possible_test_start[np.abs(possible_test_start - i).argmin()]
                for i in test_starts
            ]

        for test_start in test_starts:
            yield (indices[:test_start],
                   indices[test_start:test_start + test_size])
Example #11
def score_each_boost(estimator, parameters,
                     min_n_estimators,
                     X, y, sample_weight,
                     score_func, train, test,
                     verbose):
    """Run fit on one set of parameters

    Returns the score and the instance of the classifier
    """
    if verbose > 1:
        start_time = time.time()
        msg = '%s' % (', '.join('%s=%s' % (k, v)
                                for k, v in parameters.items()))
        print("[BoostGridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.'))

    X_test, y_test, sample_weight_test = _safe_split(
        estimator, X, y, sample_weight, test, train)

    test_score_params = {}
    if sample_weight is not None:
        test_score_params['sample_weight'] = sample_weight_test

    this_n_test_samples = _num_samples(X_test)

    all_scores = []
    all_clf_params = []
    n_test_samples = []

    for i, y_pred in enumerate(estimator.staged_predict(X_test)):
        if i + 1 < min_n_estimators:
            continue
        score = score_func(y_test, y_pred, **test_score_params)
        all_scores.append(score)
        clf_para = copy(parameters)
        clf_para['n_estimators'] = i + 1
        all_clf_params.append(clf_para)
        n_test_samples.append(this_n_test_samples)

    # boosting may have stopped early
    if len(all_scores) < estimator.n_estimators - min_n_estimators + 1:
        last_score = all_scores[-1]
        last_clf_params = all_clf_params[-1]
        for i in range(len(all_scores),
                       estimator.n_estimators - min_n_estimators + 1):
            all_scores.append(last_score)
            clf_para = copy(last_clf_params)
            clf_para['n_estimators'] = i + 1
            all_clf_params.append(clf_para)
            n_test_samples.append(this_n_test_samples)

    if verbose > 1:
        end_msg = "%s -%s" % (msg,
                              logger.short_format_time(time.time() -
                                                       start_time))
        print "[BoostGridSearchCV] %s %s" % ((64 - len(end_msg)) * '.', end_msg)
    return all_scores, all_clf_params, n_test_samples
Example #12
    def predict_proba(self, X):
        """
        Predict class probabilities for X.

        The predicted class probabilities of an input sample are computed as
        the mean predicted class probabilities of the trees in the forest.
        The class probability of a single tree is the fraction of samples of
        the same class in a leaf.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples. Internally, its dtype will be converted to
            ``dtype=np.float32``. If a sparse matrix is provided, it will be
            converted into a sparse ``csr_matrix``.

        Returns
        -------
        p : ndarray of shape (n_samples, n_classes), or a list of n_outputs
            such arrays if n_outputs > 1.
            The class probabilities of the input samples. The order of the
            classes corresponds to that in the attribute :term:`classes_`.
        """
        if sklearn_check_version("1.0"):
            self._check_feature_names(X, reset=False)
        if hasattr(self, 'n_features_in_'):
            try:
                num_features = _daal_num_features(X)
            except TypeError:
                num_features = _num_samples(X)
            if num_features != self.n_features_in_:
                raise ValueError(
                    (f'X has {num_features} features, '
                     f'but RandomForestClassifier is expecting '
                     f'{self.n_features_in_} features as input'))

        _patching_status = PatchingConditionsChain(
            "sklearn.ensemble.RandomForestClassifier.predict_proba")
        _dal_ready = _patching_status.and_conditions([
            (hasattr(self, 'daal_model_'), "oneDAL model was not trained."),
            (not sp.issparse(X), "X is sparse. Sparse input is not supported."),
            (daal_check_version((2021, 'P', 400)),
                "oneDAL version is lower than 2021.4.")])
        if hasattr(self, 'n_outputs_'):
            _dal_ready = _patching_status.and_conditions([
                (self.n_outputs_ == 1,
                    f"Number of outputs ({self.n_outputs_}) is not 1.")])
        _patching_status.write_log()

        if not _dal_ready:
            return super(RandomForestClassifier, self).predict_proba(X)
        X = check_array(X, dtype=[np.float64, np.float32])
        check_is_fitted(self)
        if sklearn_check_version('0.23'):
            self._check_n_features(X, reset=False)
        return _daal_predict_proba(self, X)
Example #13
def test_retrieve_samples_from_non_standard_shape():
    class TestNonNumericShape:
        def __init__(self):
            self.shape = ("not numeric",)

        def __len__(self):
            return len([1, 2, 3])

    X = TestNonNumericShape()
    assert _num_samples(X) == len(X)
Example #14
    def _iter_test_indices(self, X, y=None, groups=None):

        if self.random_state is not None:
            random.seed(self.random_state)

        n_samples = _num_samples(X)

        for i in random.sample(range(n_samples), self.iters):

            yield np.array(i)
Example #15
 def predict(self, X):
     n_samples = _num_samples(X)
     maxima = np.empty(n_samples, dtype=float)
     maxima.fill(-np.inf)
     argmaxima = np.zeros(n_samples, dtype=int)
     for i, e in enumerate(self.estimators):
         pred = np.ravel(e.decision_function(X))
         np.maximum(maxima, pred, out=maxima)
         argmaxima[maxima == pred] = i
     return self.classes[np.array(argmaxima.T)]
Example #16
 def __check_validity(self, X, y=None, groups=None):
     if X is None:
         raise ValueError("The 'X' parameter should not be None.")
     n_samples = _num_samples(X)
     gap_before, gap_after = self.gap_before, self.gap_after
     if (0 >= n_samples - gap_after - self.p
             and gap_before >= n_samples - self.p):
         raise ValueError("Not enough training samples available.")
     if n_samples - gap_after - self.p <= gap_before + 1:
         warnings.warn(SINGLETON_WARNING, Warning)
Example #17
def test_retrieve_samples_from_non_standard_shape():
    class TestNonNumericShape:
        def __init__(self):
            self.shape = ("not numeric", )

        def __len__(self):
            return len([1, 2, 3])

    X = TestNonNumericShape()
    assert _num_samples(X) == len(X)
Example #18
    def split(self, X, y=None, groups=None):
        X, y, groups = indexable(X, y, groups)

        n_samples = _num_samples(X)
        train_size = round((1 - self.test_size) * n_samples)

        train_index = np.arange(train_size - self.n_reduce)
        test_index = np.arange(train_size, n_samples)

        yield train_index, test_index
Example #19
 def split(self, X, y=None, labels=None):
     """Generate indices to split data into training and test set.
     Parameters
     ----------
     X : array-like, shape (n_samples, n_features)
         Training data, where n_samples is the number of samples
         and n_features is the number of features.
     y : array-like, of length n_samples
         The target variable for supervised learning problems.
         ignored
     labels : array-like, with shape (n_samples,), optional
         Group labels for the samples used while splitting the dataset into
         train/test set.
         ignored
     Returns
     -------
     train : ndarray
         The training set indices for that split.
     test : ndarray
         The testing set indices for that split.
     """
     n = _num_samples(X)
     n_slices = self.n_folds + 1
     # loop from the first 2 folds to the total number of folds
     for i in range(2, n_slices + 1):
         # the split is the percentage at which to split the folds into train
         # and test. For example when i = 2 we are taking the first 2 folds out
         # of the total available. In this specific case we have to split the
         # two of them in half (train on the first, test on the second),
         # so split = 1/2 = 0.5 = 50%. When i = 3 we are taking the first 3 folds
         # out of the total available, meaning that we have to split the three of them
         # in two at split = 2/3 = 0.66 = 66% (train on the first 2 and test on the
         # following)
         split = float(i - 1) / i
         # as we loop over the folds X and y are updated and increase in size.
         # This is the data that is going to be split and it increases in size
         # in the loop as we account for more folds. If k = 300, with i starting from 2
         # the result is the following in the loop
         # i = 2
         # X = X_train[:(600)]
         # y = y_train[:(600)]
         #
         # i = 3
         # X = X_train[:(900)]
         # y = y_train[:(900)]
         # ....
         n_sub = int(np.floor(float(n * i) / n_slices))
         subset = range(0, n_sub)
         # X and y contain both the folds to train and the fold to test.
         # index is the integer telling us where to split, according to the
         # split percentage we have set above
         n_train = int(np.floor(n_sub * split))
         train_index = np.arange(0, n_train)
         test_index = np.arange(n_train, n_sub)
         yield train_index, test_index
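To make the fold arithmetic in the comments above concrete, here is the same computation run standalone for a small toy size (my own illustration; n and n_folds are chosen arbitrarily):

import numpy as np

n, n_folds = 12, 3
n_slices = n_folds + 1
for i in range(2, n_slices + 1):
    split = float(i - 1) / i
    n_sub = int(np.floor(float(n * i) / n_slices))
    n_train = int(np.floor(n_sub * split))
    print("i=%d train=%s test=%s"
          % (i, np.arange(0, n_train), np.arange(n_train, n_sub)))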
Example #20
def score_each_boost(estimator, parameters, min_n_estimators, X, y,
                     sample_weight, score_func, train, test, verbose):
    """Run fit on one set of parameters

    Returns the score and the instance of the classifier
    """
    if verbose > 1:
        start_time = time.time()
        msg = '%s' % (', '.join('%s=%s' % (k, v)
                                for k, v in parameters.items()))
        print("[BoostGridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.'))

    X_test, y_test, sample_weight_test = _safe_split(estimator, X, y,
                                                     sample_weight, test,
                                                     train)

    test_score_params = {}
    if sample_weight is not None:
        test_score_params['sample_weight'] = sample_weight_test

    this_n_test_samples = _num_samples(X_test)

    all_scores = []
    all_clf_params = []
    n_test_samples = []

    for i, y_pred in enumerate(estimator.staged_predict(X_test)):
        if i + 1 < min_n_estimators:
            continue
        score = score_func(y_test, y_pred, **test_score_params)
        all_scores.append(score)
        clf_para = copy(parameters)
        clf_para['n_estimators'] = i + 1
        all_clf_params.append(clf_para)
        n_test_samples.append(this_n_test_samples)

    # boosting may have stopped early
    if len(all_scores) < estimator.n_estimators - min_n_estimators + 1:
        last_score = all_scores[-1]
        last_clf_params = all_clf_params[-1]
        for i in range(len(all_scores),
                       estimator.n_estimators - min_n_estimators + 1):
            all_scores.append(last_score)
            clf_para = copy(last_clf_params)
            clf_para['n_estimators'] = i + 1
            all_clf_params.append(clf_para)
            n_test_samples.append(this_n_test_samples)

    if verbose > 1:
        end_msg = "%s -%s" % (
            msg, logger.short_format_time(time.time() - start_time))
        print "[BoostGridSearchCV] %s %s" % (
            (64 - len(end_msg)) * '.', end_msg)
    return all_scores, all_clf_params, n_test_samples
Example #21
    def split(self, X, y=None, groups=None):
        X, y, groups = indexable(X, y, groups)
        n_samples = validation._num_samples(X)
        n_splits = self.n_splits
        n_folds = n_splits + 1

        indices = np.arange(n_samples)
        test_size = (n_samples // n_folds)
        train_starts = range(0, n_samples - 2 * test_size, test_size)
        for train_start in train_starts:
            yield (indices[train_start:-test_size], indices[-test_size:])
Example #22
    def fit(self, X, y):
        """Actual fitting,  performing the search over parameters."""

        parameter_iterable = ParameterSampler(self.param_distributions,
                                              self.n_iter,
                                              random_state=self.random_state)
        estimator = self.estimator
        cv = self.cv

        n_samples = _num_samples(X)
        X, y = indexable(X, y)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)'
                                 % (len(y), n_samples))
        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print("Fitting {0} folds for each of {1} candidates, totalling"
                      " {2} fits".format(len(cv), n_candidates,
                                         n_candidates * len(cv)))

        base_estimator = clone(self.estimator)

        pre_dispatch = self.pre_dispatch

        out = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=pre_dispatch
        )(
            delayed(cv_fit_and_score)(clone(base_estimator), X, y, self.scoring,
                                      parameters, cv=cv)
            for parameters in parameter_iterable)

        best = sorted(out, reverse=True)[0]
        self.best_params_ = best[1]
        self.best_score_ = best[0]

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best[1])
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator

        return self
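The fit above draws candidate settings from ParameterSampler; as a standalone illustration (my addition, with arbitrary distributions), sampling a few candidates looks like this:

from scipy.stats import uniform
from sklearn.model_selection import ParameterSampler

param_distributions = {"alpha": uniform(0, 1), "fit_intercept": [True, False]}
for params in ParameterSampler(param_distributions, n_iter=3, random_state=0):
    print(params)  # one dict of sampled parameters per candidate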
Example #23
 def predict(self, X):
     check_is_fitted(self, 'estimators_')
     n_samples = _num_samples(X)
     maxima = np.empty(n_samples, dtype=float)
     maxima.fill(-np.inf)
     argmaxima = np.zeros(n_samples, dtype=int)
     for i, e in enumerate(self.estimators_):
         pred = _predict_binary(e, X)
         np.maximum(maxima, pred, out=maxima)
         argmaxima[maxima == pred] = i
     return self.classes_[np.array(argmaxima.T)]
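The loop above is the standard one-vs-rest argmax decision rule; for comparison (an addition, not from the source project), scikit-learn's OneVsRestClassifier applies the same rule end to end:

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

X, y = load_iris(return_X_y=True)
clf = OneVsRestClassifier(LogisticRegression(max_iter=1000)).fit(X, y)
print(clf.predict(X[:5]))  # labels picked by the per-class maximum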
Example #24
    def fit(self, X, y):
        """Actual fitting,  performing the search over parameters."""

        parameter_iterable = ParameterSampler(self.param_distributions,
                                              self.n_iter,
                                              random_state=self.random_state)
        estimator = self.estimator
        cv = self.cv

        n_samples = _num_samples(X)
        X, y = indexable(X, y)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)'
                                 % (len(y), n_samples))
        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print("Fitting {0} folds for each of {1} candidates, totalling"
                      " {2} fits".format(len(cv), n_candidates,
                                         n_candidates * len(cv)))

        base_estimator = clone(self.estimator)

        pre_dispatch = self.pre_dispatch

        out = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=pre_dispatch
        )(
            delayed(cv_fit_and_score)(clone(base_estimator), X, y, self.scoring,
                                      parameters, cv=cv)
            for parameters in parameter_iterable)

        best = sorted(out, reverse=True)[0]
        self.best_params_ = best[1]
        self.best_score_ = best[0]

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best[1])
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator

        return self
Example #25
def mse_variance(y, y_pred, **kwargs):
    """ MSE variance estimator. **kwargs for compatibility """

    m = _num_samples(y)

    mse_hat = mean_squared_error(y, y_pred)

    loss = (y - y_pred)**2

    mse_var = sum((mse_hat - loss)**2) / (m * (m - 1))

    return mse_var
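A toy call of the estimator above (my own numbers; assumes mse_variance as defined here is in scope):

import numpy as np

y = np.array([1.0, 2.0, 3.0, 4.0])
y_pred = np.array([1.1, 1.9, 3.2, 3.8])
print(mse_variance(y, y_pred))  # small positive variance of the MSE estimate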
Example #26
 def _iter_test_indices(self, X, y=None, groups=None):
     self.__check_validity(X, y, groups)
     n_samples = _num_samples(X)
     gap_before, gap_after = self.gap_before, self.gap_after
     if n_samples - gap_after - self.p >= gap_before + 1:
         for i in range(n_samples - self.p + 1):
             yield np.arange(i, i + self.p)
     else:
         for i in range(n_samples - gap_after - self.p):
             yield np.arange(i, i + self.p)
         for i in range(gap_before + 1, n_samples - self.p + 1):
             yield np.arange(i, i + self.p)
Example #27
def _compute_chunked_score_samples(iforest, tree_idx, X):
    n_samples = _num_samples(X)
    if iforest._max_features == X.shape[1]:
        subsample_features = False
    else:
        subsample_features = True
    chunk_n_rows = get_chunk_n_rows(row_bytes=16 * iforest._max_features,
                                    max_n_rows=n_samples)
    slices = gen_batches(n_samples, chunk_n_rows)
    scores = np.zeros(n_samples, order="f")
    for sl in slices:
        scores[sl] = _compute_score_samples_single_tree(iforest, tree_idx, X[sl], subsample_features)
    return scores
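gen_batches is what produces the consecutive slices scored above; a quick illustration (my addition):

from sklearn.utils import gen_batches

# yields consecutive slice objects covering range(10) in chunks of 4
for sl in gen_batches(10, 4):
    print(sl)
# slice(0, 4, None)
# slice(4, 8, None)
# slice(8, 10, None)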
Example #28
def my_cross_val_predict(estimator,
                         X,
                         y=None,
                         groups=None,
                         cv=None,
                         n_jobs=1,
                         verbose=0,
                         fit_params=None,
                         pre_dispatch='2*n_jobs',
                         method='predict'):
    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))

    # Ensure the estimator has implemented the passed decision function
    if not callable(getattr(estimator, method)):
        raise AttributeError('{} not implemented in estimator'.format(method))

    if method in ['decision_function', 'predict_proba', 'predict_log_proba']:
        le = LabelEncoder()
        y = le.fit_transform(y)

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs,
                        verbose=verbose,
                        pre_dispatch=pre_dispatch)
    prediction_blocks = parallel(
        delayed(_my_fit_and_predict)(clone(estimator), X, y, train, test,
                                     verbose, fit_params, method)
        for train, test in cv.split(X, y, groups))

    # Concatenate the predictions
    predictions = [pred_block_i for pred_block_i, _, _ in prediction_blocks]
    test_indices = np.concatenate(
        [indices_i for _, indices_i, _ in prediction_blocks])
    scores = np.concatenate([score_i for _, _, score_i in prediction_blocks])

    if not _check_is_permutation(test_indices, _num_samples(X)):
        raise ValueError('cross_val_predict only works for partitions')

    inv_test_indices = np.empty(len(test_indices), dtype=int)
    inv_test_indices[test_indices] = np.arange(len(test_indices))

    # Check for sparse predictions
    if sp.issparse(predictions[0]):
        predictions = sp.vstack(predictions, format=predictions[0].format)
    else:
        predictions = np.concatenate(predictions)
    out_predictions = predictions[inv_test_indices]
    return out_predictions.reshape(y.shape), scores
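This helper is a variant of scikit-learn's cross_val_predict that additionally returns per-fold scores; for comparison (my addition), the stock function is used like this:

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

X, y = load_iris(return_X_y=True)
preds = cross_val_predict(LogisticRegression(max_iter=1000), X, y, cv=5)
print(preds.shape)  # (150,) -- one out-of-fold prediction per sample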
Example #29
 def _iter_indices(self, X, y=None, groups=None):
     n_samples = _num_samples(self.mapping)
     n_train, n_test = _validate_shuffle_split(n_samples,self.test_size,self.train_size)
     rng = check_random_state(self.random_state)
     for _ in range(self.n_splits):
         # random partition
         permutation = rng.permutation(n_samples)
         ind_test = []
         for it in permutation[:n_test]:
             ind_test.extend(self.mapping[it])
         ind_train = []
         for it in permutation[n_test:(n_test + n_train)]:
             ind_train.extend(self.mapping[it])
         yield ind_train, ind_test
Example #30
def data_sample(n_folds=5,
                frac=0.2,
                X=None,
                y=None,
                groups=None,
                oob=True,
                random_state=0):
    """
    把数据集划分成多份用于模型训练
    n_folds:如果是int类型 那么就做bootstrap抽样 抽取n_folds份
            如果是是包含split函数的类 那么就调用其split函数 取出valid部分
    frac:抽取的样本比例 只有到n_folds是int的时候有效 值在0到1之间
    X: X数据 
    y: Y数据
    groups: 如果根据自定义的分组情况进行CV 那么就需要这个参数 比如LeaveOneGroupOut这个数据切分方法
    oob: 是否需要同时返回out of bag的index
    random_state:随机种子
    
    return:
    index_list n个index array组成的list
    """
    train_index_list = []
    oob_index_list = []
    num_samples = _num_samples(X)
    np.random.seed(random_state)
    if isinstance(n_folds, int):
        if frac is None:
            batch_size = round(num_samples / n_folds)
        elif frac >= 0 and frac <= 1:
            batch_size = round(num_samples * frac)
        else:
            raise ValueError(
                "expected frac to be a number between 0 and 1 but got {0}".
                format(frac))
        for i in range(n_folds):
            train_index = np.random.choice(num_samples,
                                           batch_size,
                                           replace=True)
            oob_index = [i for i in range(num_samples) if i not in train_index]
            train_index_list.append(train_index)
            oob_index_list.append(oob_index)
    elif hasattr(n_folds, 'split'):
        for fold_n, (train_index,
                     valid_index) in enumerate(n_folds.split(X, y, groups)):
            train_index_list.append(valid_index)
            oob_index_list.append(train_index)
    if oob:
        return train_index_list, oob_index_list
    else:
        return train_index_list
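Example usage of data_sample (my addition; assumes the function above is in scope):

import numpy as np
from sklearn.model_selection import KFold

X = np.arange(20).reshape(10, 2)
y = np.arange(10)

# bootstrap mode: 3 resamples of half the rows, plus their out-of-bag indices
train_parts, oob_parts = data_sample(n_folds=3, frac=0.5, X=X, y=y)

# splitter mode: collect the validation side of each KFold split
valid_parts, rest_parts = data_sample(n_folds=KFold(n_splits=5), X=X, y=y)
print(len(train_parts), len(valid_parts))  # 3 5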
Example #31
    def _iter_test_indices(self, X, y=None, groups=None):
        n_samples = _num_samples(X)
        indices = np.arange(n_samples)
        if self.shuffle:
            check_random_state(self.random_state).shuffle(indices)

        n_splits = self.n_splits
        fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=int)
        fold_sizes[:n_samples % n_splits] += 1
        current = 0
        for fold_size in fold_sizes:
            start, stop = current, current + fold_size
            yield indices[start:stop]
            current = stop
Example #32
    def _iter_test_indices(self, X=None, y=None, groups=None):
        n_samples = _num_samples(X)

        _ks = _KennardStone()
        indices = _ks._get_indexes(X)

        n_splits = self.n_splits
        fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=int)
        fold_sizes[:n_samples % n_splits] += 1
        current = 0
        for fold_size in fold_sizes:
            start, stop = current, current + fold_size
            yield indices[start:stop]
            current = stop
Example #33
    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.
        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape (n_samples,)
            Always ignored, exists for compatibility.
        groups : array-like, with shape (n_samples,)
            Always ignored, exists for compatibility.
        Yields
        ------
        train : ndarray
            The training set indices for that split.
        test : ndarray
            The testing set indices for that split.
        """
        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        n_splits = self.n_splits
        n_folds = n_splits + 1
        gap_size = self.gap_size
        test_size = self.test_size if self.test_size else n_samples // n_folds

        # Make sure we have enough samples for the given split parameters
        if n_folds > n_samples:
            raise ValueError(
                ("Cannot have number of folds ={0} greater"
                 " than the number of samples: {1}.").format(n_folds,
                                                             n_samples))
        if n_samples - gap_size - (test_size * n_splits) <= 0:
            raise ValueError(
                ("Too many splits ={0} for number of samples"
                 " ={1} with test_size ={2} and gap_size ={3}."
                 "").format(n_splits, n_samples, test_size, gap_size))

        indices = np.arange(n_samples)
        test_starts = range(n_samples - n_splits * test_size,
                            n_samples, test_size)

        for test_start in test_starts:
            train_end = test_start - gap_size
            if self.max_train_size and self.max_train_size < train_end:
                yield (indices[train_end - self.max_train_size:train_end],
                       indices[test_start:test_start + test_size])
            else:
                yield (indices[:train_end],
                       indices[test_start:test_start + test_size])
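The split above mirrors scikit-learn's TimeSeriesSplit with its test_size and gap options (available since 0.24); the equivalent built-in behaves like this (my addition):

import numpy as np
from sklearn.model_selection import TimeSeriesSplit

X = np.arange(10)
for train_idx, test_idx in TimeSeriesSplit(n_splits=3, test_size=2, gap=1).split(X):
    print(train_idx, test_idx)
# [0 1 2] [4 5]
# [0 1 2 3 4] [6 7]
# [0 1 2 3 4 5 6] [8 9]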
Example #34
    def transform(self, y):
        check_is_fitted(self, 'classes_')
        # y = column_or_1d(y, warn=True)

        if _num_samples(y) == 0:
            return np.array([])

        indices = np.isin(y, self.classes_)
        if not self.ignore_unknown and not np.all(indices):
            raise ValueError("y contains new labels: %s" %
                             str(np.setdiff1d(y, self.classes_)))

        y_transformed = np.searchsorted(self.classes_, y)
        y_transformed[~indices] = self.unknown_encoded_value
        return y_transformed
Example #35
 def predict(self, X):
     """
     applying multiple estimators for prediction
     Args:
         X (numpy.ndarray): input points
     """
     n_samples = _num_samples(X)
     maxima = np.empty(n_samples, dtype=float)
     maxima.fill(-np.inf)
     argmaxima = np.zeros(n_samples, dtype=int)
     for i, e in enumerate(self.estimators):
         pred = np.ravel(e.decision_function(X))
         np.maximum(maxima, pred, out=maxima)
         argmaxima[maxima == pred] = i
     return self.classes[np.array(argmaxima.T)]
Example #36
    def __init__(self, *arrays, batch_size, shuffle=False, to_dense=True):
        self.arrays = arrays
        if self.n_arrays == 0:
            raise ValueError('At least one data array is required')

        for array in arrays[1:]:
            if _num_samples(array) != self.n_samples:
                raise ValueError(
                    'Input arrays must have the same number of elements'
                )

        self.batch_size = batch_size
        self.shuffle = shuffle
        self.to_dense = to_dense
        self.reset()
Example #37
    def _iter_indices(self, X, y=None, groups=None):
        _ks = _KennardStone()
        inds = _ks._get_indexes(X)

        n_samples = _num_samples(X)
        n_train, n_test = _validate_shuffle_split(
            n_samples,
            self.test_size,
            self.train_size,
            default_test_size=self._default_test_size)

        for _ in range(self.n_splits):
            ind_test = inds[:n_test]
            ind_train = inds[n_test:(n_test + n_train)]
            yield ind_train, ind_test
Example #38
    def _iter_test_indices(self, X, y, groups=None):
        n_samples = _num_samples(X)
        n_splits = self.n_splits
        y = np.asarray(y)
        sorted_index = np.argsort(y)
        if self.shuffle:
            current = 0
            rng = check_random_state(self.random_state)
            for i in range(n_samples // int(n_splits)):
                start, stop = current, current + n_splits
                rng.shuffle(sorted_index[start:stop])
                current = stop
            rng.shuffle(sorted_index[current:])

        for i in range(n_splits):
            yield sorted_index[i:n_samples:n_splits]
Example #39
def check_holdout(holdout, X, y, classifier=True):
    is_sparse = sp.issparse(X)
    if holdout is None:
        holdout = 0.8
    if isinstance(holdout, numbers.Integral):
        if classifier:
            if type_of_target(y) in ['binary', 'multiclass']:
                holdout = StratifiedShuffleSplit(y, train_size=holdout)
            else:
                holdout = ShuffleSplit(_num_samples(y), train_size=holdout)
        else:
            if not is_sparse:
                n_samples = len(X)
            else:
                n_samples = X.shape[0]
            holdout = ShuffleSplit(n_samples, train_size=holdout)
    return holdout
Example #40
def check_cv_coverage(cv, X, y, labels, expected_n_iter=None):
    n_samples = _num_samples(X)
    # Check that all the samples appear at least once in a test fold
    if expected_n_iter is not None:
        assert_equal(cv.get_n_splits(X, y, labels), expected_n_iter)
    else:
        expected_n_iter = cv.get_n_splits(X, y, labels)

    collected_test_samples = set()
    iterations = 0
    for train, test in cv.split(X, y, labels):
        check_valid_split(train, test, n_samples=n_samples)
        iterations += 1
        collected_test_samples.update(test)

    # Check that the accumulated test samples cover the whole dataset
    assert_equal(iterations, expected_n_iter)
    if n_samples is not None:
        assert_equal(collected_test_samples, set(range(n_samples)))
Example #41
File: ifs.py Project: teopir/ifqi
def my_cross_val_predict(estimator, X, y=None, groups=None, cv=None, n_jobs=1,
                         verbose=0, fit_params=None, pre_dispatch='2*n_jobs',
                         method='predict'):
    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))

    # Ensure the estimator has implemented the passed decision function
    if not callable(getattr(estimator, method)):
        raise AttributeError('{} not implemented in estimator'
                             .format(method))

    if method in ['decision_function', 'predict_proba', 'predict_log_proba']:
        le = LabelEncoder()
        y = le.fit_transform(y)

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)
    prediction_blocks = parallel(delayed(_my_fit_and_predict)(
        clone(estimator), X, y, train, test, verbose, fit_params, method)
                                 for train, test in cv.split(X, y, groups))

    # Concatenate the predictions
    predictions = [pred_block_i for pred_block_i, _, _ in prediction_blocks]
    test_indices = np.concatenate([indices_i
                                   for _, indices_i, _ in prediction_blocks])
    scores = np.concatenate([score_i for _, _, score_i in prediction_blocks])

    if not _check_is_permutation(test_indices, _num_samples(X)):
        raise ValueError('cross_val_predict only works for partitions')

    inv_test_indices = np.empty(len(test_indices), dtype=int)
    inv_test_indices[test_indices] = np.arange(len(test_indices))

    # Check for sparse predictions
    if sp.issparse(predictions[0]):
        predictions = sp.vstack(predictions, format=predictions[0].format)
    else:
        predictions = np.concatenate(predictions)
    return predictions[inv_test_indices], scores
Example #42
    def _fit(self, X, y, parameter_iterable):
        """Actual fitting,  performing the search over parameters."""
        estimator = self.estimator
        cv = self.cv

        n_samples = _num_samples(X)

        X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr')
        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)'
                                 % (len(y), n_samples))
            y = np.asarray(y)

        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))
        if not self.dataset_filenames:
            self.save_dataset_filename(X, y, cv)

        dataset_filenames = self.dataset_filenames

        client = Client()
        lb_view = client.load_balanced_view()

        if self.verbose > 0:
            print("Number of CPU core %d" % len(client.ids()))

        self.tasks = [([lb_view.apply(evaluate, estimator, dataset_filename, params)
                        for dataset_filename in dataset_filenames], params)
                            for params in parameter_iterable]
        if self.sync:
            self.wait()
            self.set_grid_scores()
            self.set_best_score_params()

            if self.refit:
                self.set_best_estimator(estimator)
        return self
Example #43
    def predict(self, X):
        neighbors = self.nbrs.kneighbors(X, self.n_neighbors, return_distance=False)

        neighbors_set = get_neighbors_above_threshold(self._fit_y, neighbors[0], self.threshold)

        check_is_fitted(self, 'estimators_')
        if (hasattr(self.estimators_[0], "decision_function") and
                is_classifier(self.estimators_[0])):
            thresh = 0
        else:
            thresh = .5

        n_samples = _num_samples(X)
        if self.label_binarizer_.y_type_ == "multiclass":
            maxima = np.empty(n_samples, dtype=float)
            maxima.fill(-np.inf)
            argmaxima = np.zeros(n_samples, dtype=int)
            for i, e in enumerate(self.estimators_):
                if not i in neighbors_set:
                    continue
                pred = _predict_binary(e, X)
                np.maximum(maxima, pred, out=maxima)
                argmaxima[maxima == pred] = i
            return self.label_binarizer_.classes_[np.array(argmaxima.T)]
        else:
            indices = array.array('i')
            indptr = array.array('i', [0])
            for i, e in enumerate(self.estimators_):
                if not i in neighbors_set:
                    continue
                indices.extend(np.where(_predict_binary(e, X) > thresh)[0])
                indptr.append(len(indices))
            data = np.ones(len(indices), dtype=int)
            indicator = sp.csc_matrix((data, indices, indptr),
                                      shape=(n_samples, len(self.estimators_)))
            return self.label_binarizer_.inverse_transform(indicator)
Example #44
def _fit_and_score(estimator, X, y, scorer, train, test, verbose,
                   parameters, fit_params, return_train_score=False,
                   return_parameters=False, error_score='raise'):

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    if y_train is None:
        estimator.fit(X_train, **fit_params)
    else:
        estimator.fit(X_train, y_train, **fit_params)
    test_score = estimator.score(X_test, y_test)

    scoring_time = time.time() - start_time

    ret = [test_score, _num_samples(X_test), scoring_time]
    if return_parameters:
        ret.append(parameters)
    return ret
Example #45
 def _fit(self, X, y, parameter_dict):
     self._cv_results = None  # flag that cached results need refreshing
     self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
     n_samples = _num_samples(X)
     # check that X and y lengths match
     if _num_samples(y) != n_samples:
         raise ValueError('Target [y] and data [X] have mismatched lengths')
     self.cv = check_cv(self.cv, y=y, classifier=is_classifier(self.estimator))
     toolbox = base.Toolbox()
     # name_values = list of parameters, gene_type = [1: categorical; 2: numeric], maxints = size(parameters)
     name_values, self.gene_type, maxints = _get_param_types_maxint(parameter_dict)
     if self.verbose:
         print("Types: %s, ranges: %s" % (self.gene_type, maxints))
     # register the individual-creation function
     toolbox.register("individual", _initIndividual, creator.Individual, maxints=maxints)
     # register the population-creation function
     toolbox.register("population", tools.initRepeat, list, toolbox.individual)
     # parallelism: create the worker pool
     if not isinstance(self.n_jobs, int):
         self.n_jobs = 1
     pool = Pool(self.n_jobs)
     toolbox.register("map", pool.map)
     # register the evaluation function
     toolbox.register("evaluate", _evalFunction,
                      name_values=name_values, X=X, y=y,
                      scorer=self.scorer_, cv=self.cv, uniform=self.uniform, verbose=self.verbose,
                      error_score=self.error_score, fit_params=self.fit_params,
                      score_cache=self.score_cache)
     # register the crossover function
     toolbox.register("mate", _cxIndividual, prob_cruce=self.gene_crossover_prob, gene_type=self.gene_type)
     # register the mutation function
     toolbox.register("mutate", _mutIndividual, prob_mutacion=self.gene_mutation_prob, maxints=maxints)
     # register the selection function
     toolbox.register("select", tools.selTournament, tournsize=self.tournament_size)
     # create the population
     pop = toolbox.population(n=self.population_size)
     # best individual seen so far
     hof = tools.HallOfFame(1)
     # stats
     stats = tools.Statistics(lambda ind: ind.fitness.values)
     stats.register("avg", np.nanmean)
     stats.register("min", np.nanmin)
     stats.register("max", np.nanmax)
     stats.register("std", np.nanstd)
     # genealogy
     hist = tools.History()
     # decorate the variation operators
     toolbox.decorate("mate", hist.decorator)
     toolbox.decorate("mutate", hist.decorator)
     hist.update(pop)
     # possible combinations
     if self.verbose:
         print('--- Evolve in {0} possible combinations ---'.format(np.prod(np.array(maxints) + 1)))
     pop, logbook = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2,
                                        ngen=self.generations_number, stats=stats,
                                        halloffame=hof, verbose=self.verbose)
     # pop, logbook = algorithms.eaGenerateUpdate(toolbox,
     #                                ngen=self.generations_number, stats=stats,
     #                                halloffame=hof, verbose=self.verbose)
     print(logbook)
     # save history
     self.all_history_ = hist
     self.all_logbooks_ = logbook
     # best score and parameters
     current_best_score_ = hof[0].fitness.values[0]
     current_best_params_ = _individual_to_params(hof[0], name_values)
     if self.verbose:
         print("Best individual is: %s\nwith fitness: %s" % (
             current_best_params_, current_best_score_))
     if current_best_score_ > self.best_mem_score_:
         self.best_mem_score_ = current_best_score_
         self.best_mem_params_ = current_best_params_
     # end of parallel section: close the pool
     pool.close()
     # pool.join()
     self.best_score_ = current_best_score_
     self.best_params_ = current_best_params_
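The method above assumes DEAP's creator/toolbox scaffolding is set up elsewhere in the module; a minimal, self-contained sketch of that kind of setup (my illustration, not the project's actual code):

import random
from deap import base, creator, tools

creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
toolbox.register("attr_int", random.randint, 0, 3)
toolbox.register("individual", tools.initRepeat, creator.Individual,
                 toolbox.attr_int, n=4)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

pop = toolbox.population(n=5)
print(pop[0], pop[0].fitness.valid)  # e.g. [2, 0, 3, 1] False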
Example #46
    def _fit(self, X, y, parameter_iterable):
        """Actual fitting, performing the search over parameters."""

        estimator = self.estimator
        cv = self.cv

        n_samples = _num_samples(X)
        X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr')

        self.scorer_ = _deprecate_loss_and_score_funcs(
            self.loss_func, self.score_func, self.scoring)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)'
                                 % (len(y), n_samples))
            y = np.asarray(y)
        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print("Fitting {0} folds for each of {1} candidates, totalling"
                      " {2} fits".format(len(cv), n_candidates,
                                         n_candidates * len(cv)))

        base_estimator = clone(self.estimator)

        pre_dispatch = self.pre_dispatch

        out = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=pre_dispatch)(
                delayed(fit_grid_point_extended)(
                    X, y, base_estimator, parameters, train, test,
                    self.scorer_, self.verbose, **self.fit_params)
                for parameters in parameter_iterable
                for train, test in cv)
        
#         out = []
#         for parameters in parameter_iterable:
#             fold = 1
#             for train, test in cv:
#                 print "Processing fold", fold, self.fit_params
#                 out.append(fit_grid_point_extended(X, y, base_estimator, parameters, train, test, self.scorer_, self.verbose, **self.fit_params))
#                 fold += 1

        # Out is a list of 4-tuples: (score, parameters, n_test_samples, extras)
        n_fits = len(out)
        n_folds = len(cv)

        scores = list()
        grid_extras = list()
        grid_scores = list()
        for grid_start in range(0, n_fits, n_folds):
            n_test_samples = 0
            score = 0
            all_scores = []
            all_extras = list()
            for this_score, parameters, this_n_test_samples, extra in \
                    out[grid_start:grid_start + n_folds]:
                all_scores.append(this_score)
                all_extras.append(extra)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_folds)
            scores.append((score, parameters))
            # TODO: shall we also store the test_fold_sizes?
            grid_scores.append(_CVScoreTuple(
                parameters,
                score,
                np.array(all_scores)))
            grid_extras.append(all_extras)
        # Store the computed scores
        self.grid_scores_ = grid_scores
        self.extras_ = grid_extras

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
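A hedged sketch of consuming the attributes stored by the `_fit` above; `search` is an assumed name for a fitted instance of the surrounding class, and `_CVScoreTuple` exposes `parameters`, `mean_validation_score` and `cv_validation_scores` as in the scikit-learn version this code targets.

# Assumes `search` is a fitted instance of the class defining the _fit above.
for cv_result, fold_extras in zip(search.grid_scores_, search.extras_):
    print(cv_result.parameters,
          cv_result.mean_validation_score,
          cv_result.cv_validation_scores)
    # one extras dict per CV fold, e.g. per-fold predictions or feature importances
    for extra in fold_extras:
        print(sorted(extra.keys()))
print("best:", search.best_params_, search.best_score_)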
Ejemplo n.º 47
0
def fit_grid_point_extended(X, y, base_estimator, parameters, train, test, scorer,
                   verbose, loss_func=None, extraOut="auto", **fit_params):
    """Run fit on one set of parameters.

    Parameters
    ----------
    X : array-like, sparse matrix or list
        Input data.

    y : array-like or None
        Targets for input data.

    base_estimator : estimator object
        This estimator will be cloned and then fitted.

    parameters : dict
        Parameters to be set on base_estimator clone for this grid point.

    train : ndarray, dtype int or bool
        Boolean mask or indices for training set.

    test : ndarray, dtype int or bool
        Boolean mask or indices for test set.

    scorer : callable or None
        If provided, must be a scorer callable object / function with signature
        ``scorer(estimator, X, y)``.

    verbose : int
        Verbosity level.

    **fit_params : kwargs
        Additional parameters passed to the fit function of the estimator.

    Returns
    -------
    score : float
        Score of this parameter setting on the given training / test split.

    parameters : dict
        The parameters that have been evaluated.

    n_samples_test : int
        Number of test samples in this split.
    """
    if verbose > 1:
        start_time = time.time()
        msg = '%s' % (', '.join('%s=%s' % (k, v)
                      for k, v in parameters.items()))
        print("[GridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # update parameters of the classifier after a copy of its base structure
    clf = clone(base_estimator)
    clf.set_params(**parameters)

    if hasattr(base_estimator, 'kernel') and callable(base_estimator.kernel):
        # cannot compute the kernel values with custom function
        raise ValueError("Cannot use a custom kernel function. "
                         "Precompute the kernel matrix instead.")

    if not hasattr(X, "shape"):
        if getattr(base_estimator, "_pairwise", False):
            raise ValueError("Precomputed kernels or affinity matrices have "
                             "to be passed as arrays or sparse matrices.")
        X_train = [X[idx] for idx in train]
        X_test = [X[idx] for idx in test]
    else:
        if getattr(base_estimator, "_pairwise", False):
            # X is a precomputed square kernel matrix
            if X.shape[0] != X.shape[1]:
                raise ValueError("X should be a square kernel matrix")
            X_train = X[np.ix_(train, train)]
            X_test = X[np.ix_(test, train)]
        else:
            X_train = X[safe_mask(X, train)]
            X_test = X[safe_mask(X, test)]

    if y is not None:
        y_test = y[safe_mask(y, test)]
        y_train = y[safe_mask(y, train)]
        clf.fit(X_train, y_train, **fit_params)

        if scorer is not None:
            this_score = scorer(clf, X_test, y_test)
        else:
            this_score = clf.score(X_test, y_test)
    else:
        clf.fit(X_train, **fit_params)
        if scorer is not None:
            this_score = scorer(clf, X_test)
        else:
            this_score = clf.score(X_test)

    if not isinstance(this_score, numbers.Number):
        raise ValueError("scoring must return a number, got %s (%s)"
                         " instead." % (str(this_score), type(this_score)))

    if verbose > 2:
        msg += ", score=%f" % this_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg,
                              logger.short_format_time(time.time() -
                                                       start_time))
        print("[GridSearchCV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))
    extraRVs = {}
    if extraOut is not None:
        if "estimator" in extraOut:
            extraRVs["estimator"] = clf
        if extraOut == "auto" or "predictions" in extraOut:
            # Predict on the held-out split only and key each prediction by its
            # index in the original X.
            predictions = clf.predict(X_test)
            predictionByIndex = {}
            for exampleIndex, prediction in zip(safe_mask(X, test), predictions):
                predictionByIndex[exampleIndex] = prediction
            extraRVs["predictions"] = predictionByIndex
        if (extraOut == "auto" or "importances" in extraOut) and hasattr(clf, "feature_importances_"):
            extraRVs["importances"] = clf.feature_importances_
    rvs = [this_score, parameters, _num_samples(X_test), extraRVs]
    return rvs
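A minimal usage sketch of `fit_grid_point_extended` on a toy problem. The dataset, estimator and split are illustrative; `scorer=None` falls back to the estimator's own `score` method as handled above, and the module-level helpers the function relies on (`clone`, `safe_mask`, `_num_samples`, `numbers`) are assumed to be imported where it is defined.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X_toy, y_toy = make_classification(n_samples=120, n_features=6, random_state=0)
idx = np.arange(len(y_toy))
train_idx, test_idx = train_test_split(idx, test_size=0.25, random_state=0)

score, params, n_test, extras = fit_grid_point_extended(
    X_toy, y_toy, RandomForestClassifier(random_state=0),
    {"n_estimators": 20}, train_idx, test_idx,
    scorer=None, verbose=0)
print(score, params, n_test, sorted(extras.keys()))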
Ejemplo n.º 48
0
def _fit_and_score(estimator, X, y, scorer, train, test, verbose,
                   parameters, fit_params, return_train_score=False,
                   return_parameters=False, return_n_test_samples=False,
                   return_times=False, error_score='raise'):
    """
    Fit estimator and compute scores for a given dataset split.
    """
    if verbose > 1:
        if parameters is None:
            msg = ''
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                                    for k, v in parameters.items()))
        LOG.info("[CV] %s %s", msg, (64 - len(msg)) * '.')

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, _index_param_value(X, v, train))
                       for k, v in fit_params.items()])

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            if return_train_score:
                train_score = error_score
            warnings.warn("Classifier fit failed. The score on this train-test"
                          " partition for these parameters will be set to %f. "
                          "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")

    else:
        fit_time = time.time() - start_time
        test_score = [_score(estimator, X_test, y_test, s) for s in scorer]
        score_time = time.time() - start_time - fit_time
        if return_train_score:
            train_score = [_score(estimator, X_train, y_train, s)
                           for s in scorer]

    if verbose > 2:
        msg += ", score=[%s]" % ", ".join('%f' % ts for ts in test_score)
    if verbose > 1:
        total_time = score_time + fit_time
        end_msg = "%s, total=%s" % (msg, logger.short_format_time(total_time))
        LOG.info("[CV] %s %s", (64 - len(end_msg)) * '.', end_msg)

    ret = [train_score, test_score] if return_train_score else [test_score]

    if return_n_test_samples:
        ret.append(_num_samples(X_test))
    if return_times:
        ret.extend([fit_time, score_time])
    if return_parameters:
        ret.append(parameters)
    return ret
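A hedged sketch of calling the list-of-scorers variant above directly on one CV split. The dataset, estimator and scorer names are illustrative, and the helpers the function uses (`LOG`, `_safe_split`, `_score`, `_index_param_value`) are assumed to be defined in the same module.

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import get_scorer
from sklearn.model_selection import KFold

X_iris, y_iris = load_iris(return_X_y=True)
clf = LogisticRegression(max_iter=500)
scorers = [get_scorer("accuracy"), get_scorer("f1_macro")]

train_idx, test_idx = next(KFold(n_splits=3, shuffle=True, random_state=0).split(X_iris))
test_scores, n_test, fit_time, score_time = _fit_and_score(
    clf, X_iris, y_iris, scorers, train_idx, test_idx, verbose=0,
    parameters=None, fit_params=None,
    return_n_test_samples=True, return_times=True)
print(test_scores, n_test)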
Ejemplo n.º 49
0
    def _fit(self, X, y):
        """Actual fitting, performing the search over parameters."""
        estimator = self.estimator
        cv = self.cv
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        n_samples = _num_samples(X)
        X, y = indexable(X, y)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)'
                                 % (len(y), n_samples))

        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        base_estimator = clone(self.estimator)
        pre_dispatch = self.pre_dispatch

        # setup SigOpt experiment and run optimization
        self._create_sigopt_exp()
        for jk in range(self.n_iter):
            suggestion = self.conn.experiments(self.experiment.id).suggestions().create()
            parameters = suggestion.assignments.to_json()
     
            # convert all unicode names and values to plain strings
            non_unicode_parameters = self._convert_unicode_dict(parameters)

            if self.verbose > 0:
                print("Evaluating params : ", non_unicode_parameters)

            # do CV folds in parallel using joblib
            # returns scores on test set
            out = Parallel(
                n_jobs=self.n_jobs, verbose=self.verbose,
                pre_dispatch=pre_dispatch
            )(
                delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
                                        train, test, self.verbose, non_unicode_parameters,
                                        self.fit_params, return_parameters=True,
                                        error_score=self.error_score)
                    for train, test in cv)

            # grab scores from results
            scores = [o[0] for o in out]
            self.conn.experiments(self.experiment.id).observations().create(
                suggestion=suggestion.id,
                value=numpy.mean(scores),
                value_stddev=numpy.std(scores)
            )
              
        # return best SigOpt observation so far
        best_obs = self.conn.experiments(self.experiment.id).fetch().progress.best_observation
        self.best_params_ = best_obs.assignments.to_json()
         # convert all unicode names and values to plain strings
        self.best_params_ = self._convert_unicode_dict(self.best_params_)
        self.best_score_ = best_obs.value

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **self.best_params_)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
Ejemplo n.º 50
0
def _extended_fit_and_score(estimator, X, y, scorer, train, test, verbose,
                   parameters, fit_params, return_train_score=False,
                   return_parameters=False, error_score='raise', extraOut="auto"):
    if verbose > 1:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                          for k, v in parameters.items()))
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, _index_param_value(X, v, train))
                      for k, v in fit_params.items()])

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            if return_train_score:
                train_score = error_score
            warnings.warn("Classifier fit failed. The score on this train-test"
                          " partition for these parameters will be set to %f. "
                          "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)"
                             )

    else:
        test_score = _score(estimator, X_test, y_test, scorer)
        if return_train_score:
            train_score = _score(estimator, X_train, y_train, scorer)

    scoring_time = time.time() - start_time

    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = [train_score] if return_train_score else []
    ret.extend([test_score, _num_samples(X_test), scoring_time])
    if return_parameters:
        ret.append(parameters)
    
    # Add additional return values
    extraRVs = {}
    if extraOut is not None:
        extraRVs["counts"] = {"train":train.shape[0], "test":test.shape[0]}
        if "estimator" in extraOut:
            extraRVs["estimator"] = estimator
        if extraOut == "auto" or "predictions" in extraOut:
            assert test.shape[0] == X_test.shape[0]
            probabilities = estimator.predict_proba(X_test)
            probabilityByIndex = {}
            for exampleIndex, prediction in zip(test, probabilities):
                probabilityByIndex[exampleIndex] = prediction
            extraRVs["probabilities"] = probabilityByIndex
        if (extraOut == "auto" or "importances" in extraOut) and hasattr(estimator, "feature_importances_"):
            extraRVs["importances"] = estimator.feature_importances_
    ret.append(extraRVs)
    
    return ret
Ejemplo n.º 51
0
def check_sample_weight_invariance(name, metric, y1, y2):
    rng = np.random.RandomState(0)
    sample_weight = rng.randint(1, 10, size=len(y1))

    # check that unit weights gives the same score as no weight
    unweighted_score = metric(y1, y2, sample_weight=None)

    assert_allclose(
        unweighted_score,
        metric(y1, y2, sample_weight=np.ones(shape=len(y1))),
        err_msg="For %s sample_weight=None is not equivalent to "
                "sample_weight=ones" % name)

    # check that the weighted and unweighted scores are unequal
    weighted_score = metric(y1, y2, sample_weight=sample_weight)

    # use context manager to supply custom error message
    with assert_raises(AssertionError) as cm:
        assert_allclose(unweighted_score, weighted_score)
        cm.msg = ("Unweighted and weighted scores are unexpectedly almost "
                  "equal (%s) and (%s) for %s" % (unweighted_score,
                                                  weighted_score, name))

    # check that sample_weight can be a list
    weighted_score_list = metric(y1, y2,
                                 sample_weight=sample_weight.tolist())
    assert_allclose(
        weighted_score, weighted_score_list,
        err_msg=("Weighted scores for array and list "
                 "sample_weight input are not equal (%s != %s) for %s") % (
                     weighted_score, weighted_score_list, name))

    # check that integer weights give the same result as repeating samples
    repeat_weighted_score = metric(
        np.repeat(y1, sample_weight, axis=0),
        np.repeat(y2, sample_weight, axis=0), sample_weight=None)
    assert_allclose(
        weighted_score, repeat_weighted_score,
        err_msg="Weighting %s is not equal to repeating samples" % name)

    # check that ignoring a fraction of the samples is equivalent to setting
    # the corresponding weights to zero
    sample_weight_subset = sample_weight[1::2]
    sample_weight_zeroed = np.copy(sample_weight)
    sample_weight_zeroed[::2] = 0
    y1_subset = y1[1::2]
    y2_subset = y2[1::2]
    weighted_score_subset = metric(y1_subset, y2_subset,
                                   sample_weight=sample_weight_subset)
    weighted_score_zeroed = metric(y1, y2,
                                   sample_weight=sample_weight_zeroed)
    assert_allclose(
        weighted_score_subset, weighted_score_zeroed,
        err_msg=("Zeroing weights does not give the same result as "
                 "removing the corresponding samples (%s != %s) for %s" %
                 (weighted_score_zeroed, weighted_score_subset, name)))

    if not name.startswith('unnormalized'):
        # check that the score is invariant under scaling of the weights by a
        # common factor
        for scaling in [2, 0.3]:
            assert_allclose(
                weighted_score,
                metric(y1, y2, sample_weight=sample_weight * scaling),
                err_msg="%s sample_weight is not invariant "
                        "under scaling" % name)

    # Check that if number of samples in y_true and sample_weight are not
    # equal, meaningful error is raised.
    error_message = ("Found input variables with inconsistent numbers of "
                     "samples: [{}, {}, {}]".format(
                         _num_samples(y1), _num_samples(y2),
                         _num_samples(sample_weight) * 2))
    assert_raise_message(ValueError, error_message, metric, y1, y2,
                         sample_weight=np.hstack([sample_weight,
                                                  sample_weight]))
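A hypothetical driver for the check above using a built-in metric. It assumes the test helpers referenced in the function body (`assert_allclose`, `assert_raises`, `assert_raise_message`, `_num_samples`) are already imported in the same module; the labels are constructed so that the weighted and unweighted scores genuinely differ.

import numpy as np
from sklearn.metrics import accuracy_score

y_true = np.array([0] * 30 + [1] * 30)
y_pred = y_true.copy()
y_pred[:10] = 1 - y_pred[:10]   # make the first ten predictions wrong

check_sample_weight_invariance("accuracy_score", accuracy_score, y_true, y_pred)
print("accuracy_score passed the sample_weight invariance checks")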
Ejemplo n.º 52
0
    def fit(self, X, y=None, x_is_index=False, X_name='X', y_name='y'):
        """Actual fitting, performing the search over parameters."""
        parameter_iterable = ParameterGrid(self.param_grid)

        estimator = self.estimator
        cv = self.cv
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        n_samples = _num_samples(X)

        X, y = indexable(X, y)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)'
                                 % (len(y), n_samples))
        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print("Fitting {0} folds for each of {1} candidates, totalling"
                      " {2} fits".format(len(cv), n_candidates,
                                         n_candidates * len(cv)))

        base_estimator = clone(self.estimator)


        # out = Parallel(
        #     n_jobs=self.n_jobs, verbose=self.verbose,
        #     pre_dispatch=pre_dispatch
        # )(
        #     delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
        #                             train, test, self.verbose, parameters,
        #                             self.fit_params, return_parameters=True,
        #                             error_score=self.error_score)
        #         for parameters in parameter_iterable
        #         for train, test in cv)

        train_test_parameters = ((train, test, parameters) \
                                 for parameters in parameter_iterable for train, test in cv)

        length = len(parameter_iterable) * len(cv)

        if x_is_index:
            X_to_pass = X
            y_to_pass = None
        else:
            X_to_pass = None
            y_to_pass = None

        self.view.block = False
        # print('sequences')

        # sequences = [
        #     train_test_parameters,
        #     [clone(base_estimator)] * length,
        #     [X_to_pass] * length,
        #     [y_to_pass] * length,
        #     [self.verbose] * length,
        #     [self.fit_params] * length,
        #     [True] * length,
        #     [self.scorer_] * length,
        #     [x_is_index] * length,
        # ]

        f = partial(my_fit_and_score, estimator=clone(base_estimator),
                    X=X_to_pass,
                    y=y_to_pass,
                    verbose=self.verbose,
                    fit_params=self.fit_params,
                    return_parameters=True,
                    scorer=None,
                    x_is_index=x_is_index,
                    names=(X_name, y_name))

        # print('before map')

        # import cProfile
        #
        # pr = cProfile.Profile()
        # pr.enable()
        chunksize = 10

        out = self.view.map(f, itertools.islice(train_test_parameters, 0, length),
                            ordered=False,
                            block=False,
                            chunksize=chunksize)  # length / len(self.view))
        # pr.disable()
        # pr.print_stats('cumulative')
        print('map called')
        if self.callback is not None:
            old_progress = out.progress
            while not out.ready():
                self.callback(out.progress * chunksize, length, out.elapsed)
                if old_progress == out.progress and out.progress > 0:
                    for id, info in self.view.queue_status(verbose=True).items():
                        # print(id, info)
                        if isinstance(info, dict) and 'queue' in info and len(info['queue']) > 0:
                            print(id, info['queue'])

                    pass
                old_progress = out.progress
                sleep(10)
        print('map ready')
        out = out.get()


        # Out is a list of 4-tuples: (score, n_test_samples, time, parameters)
        n_fits = len(out)
        n_folds = len(cv)

        scores = list()
        grid_scores = list()
        for grid_start in range(0, n_fits, n_folds):
            n_test_samples = 0
            score = 0
            all_scores = []
            for this_score, this_n_test_samples, _, parameters in \
                    out[grid_start:grid_start + n_folds]:
                all_scores.append(this_score)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_folds)
            scores.append((score, parameters))
            # TODO: shall we also store the test_fold_sizes?
            grid_scores.append(_CVScoreTuple(
                parameters,
                score,
                np.array(all_scores)))
        # Store the computed scores
        self.grid_scores_ = grid_scores

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
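The `self.view` consumed by the `fit` above is an IPython-parallel view. A hedged sketch of how it might be wired up, assuming an `ipcluster` is already running; the class name `SomeGridSearchClass` is hypothetical, and the surrounding class is assumed to simply store whatever view it is given.

# Hypothetical wiring of the parallel view used by fit() above.
from ipyparallel import Client

rc = Client()                     # connect to the running ipcluster
view = rc.load_balanced_view()    # supports .map(..., block=..., ordered=..., chunksize=...)

# search = SomeGridSearchClass(...)   # hypothetical; the class definition is not shown
# search.view = view
# search.fit(X, y)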
Ejemplo n.º 53
0
    def _fit(self, X, y, parameter_iterable):
        """Actual fitting,  performing the search over parameters."""

        estimator = self.estimator
        cv = self.cv
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        n_samples = _num_samples(X)
        X, y = indexable(X, y)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)'
                                 % (len(y), n_samples))
        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print("Fitting {0} folds for each of {1} candidates, totalling"
                      " {2} fits".format(len(cv), n_candidates,
                                         n_candidates * len(cv)))

        base_estimator = clone(self.estimator)

        param_grid = [(parameters, train, test)
                      for parameters in parameter_iterable
                      for (train, test) in cv]
        # Because the original python code expects a certain order for the elements, we need to
        # respect it.
        indexed_param_grid = list(zip(range(len(param_grid)), param_grid))
        par_param_grid = self.sc.parallelize(indexed_param_grid, len(indexed_param_grid))
        X_bc = self.sc.broadcast(X)
        y_bc = self.sc.broadcast(y)

        scorer = self.scorer_
        verbose = self.verbose
        fit_params = self.fit_params
        error_score = self.error_score
        fas = _fit_and_score

        def fun(tup):
            (index, (parameters, train, test)) = tup
            local_estimator = clone(base_estimator)
            local_X = X_bc.value
            local_y = y_bc.value
            res = fas(local_estimator, local_X, local_y, scorer, train, test, verbose,
                                  parameters, fit_params,
                                  return_parameters=True, error_score=error_score)
            return (index, res)
        indexed_out0 = dict(par_param_grid.map(fun).collect())
        out = [indexed_out0[idx] for idx in range(len(param_grid))]

        X_bc.unpersist()
        y_bc.unpersist()

        # Out is a list of 4-tuples: (score, n_test_samples, time, parameters)
        n_fits = len(out)
        n_folds = len(cv)

        scores = list()
        grid_scores = list()
        for grid_start in range(0, n_fits, n_folds):
            n_test_samples = 0
            score = 0
            all_scores = []
            for this_score, this_n_test_samples, _, parameters in \
                    out[grid_start:grid_start + n_folds]:
                all_scores.append(this_score)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_folds)
            scores.append((score, parameters))
            # TODO: shall we also store the test_fold_sizes?
            grid_scores.append(_CVScoreTuple(
                parameters,
                score,
                np.array(all_scores)))
        # Store the computed scores
        self.grid_scores_ = grid_scores

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
Ejemplo n.º 54
0
def _fit_and_score(estimator, X, y, scorer, train, test, verbose,
                   parameters, fit_params, return_train_score=False,
                   return_parameters=False, return_n_test_samples=False,
                   return_times=False, error_score='raise',
                   return_estimator=False, return_idx=False):
    """Fit estimator and compute scores for a given dataset split.

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.

    X : array-like of shape at least 2D
        The data to fit.

    y : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.

    scorer : A single callable or dict mapping scorer name to the callable
        If it is a single callable, the return value for ``train_scores`` and
        ``test_scores`` is a single float.

        For a dict, it should be one mapping the scorer name to the scorer
        callable object / function.

        The callable object / fn should have signature
        ``scorer(estimator, X, y)``.

    train : array-like, shape (n_train_samples,)
        Indices of training samples.

    test : array-like, shape (n_test_samples,)
        Indices of test samples.

    verbose : integer
        The verbosity level.

    error_score : 'raise' (default) or numeric
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is raised. If a numeric value is given,
        FitFailedWarning is raised. This parameter does not affect the refit
        step, which will always raise the error.

    parameters : dict or None
        Parameters to be set on the estimator.

    fit_params : dict or None
        Parameters that will be passed to ``estimator.fit``.

    return_train_score : boolean, optional, default: False
        Compute and return score on training set.

    return_parameters : boolean, optional, default: False
        Return parameters that has been used for the estimator.

    return_n_test_samples : boolean, optional, default: False
        Whether to return the ``n_test_samples``

    return_times : boolean, optional, default: False
        Whether to return the fit/score times.

    Returns
    -------
    train_scores : dict of scorer name -> float, optional
        Score on training set (for all the scorers),
        returned only if `return_train_score` is `True`.

    test_scores : dict of scorer name -> float, optional
        Score on testing set (for all the scorers).

    n_test_samples : int
        Number of test samples.

    fit_time : float
        Time spent for fitting in seconds.

    score_time : float
        Time spent for scoring in seconds.

    parameters : dict or None, optional
        The parameters that have been evaluated.

    """
    if verbose > 1:
        if parameters is None:
            msg = ''
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                          for k, v in parameters.items()))
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, _index_param_value(X, v, train))
                      for k, v in fit_params.items()])

    test_scores = {}
    train_scores = {}
    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    # do it for each patient
    X_train, y_train, X_test, y_test = _safe_split_multi(
        estimator, X, y, train, test)

    is_multimetric = not callable(scorer)
    n_scorers = len(scorer.keys()) if is_multimetric else 1

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            if is_multimetric:
                test_scores = dict(zip(scorer.keys(),
                                   [error_score, ] * n_scorers))
                if return_train_score:
                    train_scores = dict(zip(scorer.keys(),
                                        [error_score, ] * n_scorers))
            else:
                test_scores = error_score
                if return_train_score:
                    train_scores = error_score
            warnings.warn("Classifier fit failed. The score on this train-test"
                          " partition for these parameters will be set to %f. "
                          "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")

    else:
        fit_time = time.time() - start_time
        # _score will return dict if is_multimetric is True
        test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric)
        score_time = time.time() - start_time - fit_time
        if return_train_score:
            train_scores = _score(estimator, X_train, y_train, scorer,
                                  is_multimetric)

    if verbose > 2:
        if is_multimetric:
            for scorer_name, score in test_scores.items():
                msg += ", %s=%s" % (scorer_name, score)
        else:
            msg += ", score=%s" % test_scores
    if verbose > 1:
        total_time = score_time + fit_time
        end_msg = "%s, total=%s" % (msg, logger.short_format_time(total_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = [train_scores, test_scores] if return_train_score else [test_scores]

    if return_n_test_samples:
        ret.append(_num_samples(X_test))
    if return_times:
        ret.extend([fit_time, score_time])
    if return_parameters:
        ret.append(parameters)
    if return_estimator:
        ret.append(estimator)
    if return_idx:
        ret.extend([train, test])
    return ret
Ejemplo n.º 55
0
    def fit(self, X, y=None):
        super(OneClassSVM, self).fit(X, np.ones(_num_samples(X)))
Ejemplo n.º 56
0
    def _fit(self, X, y, labels, parameter_iterable):
        """Actual fitting,  performing the search over parameters."""

        estimator = self.estimator
        cv = check_cv(self.cv, y, classifier=is_classifier(estimator))
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        n_samples = _num_samples(X)
        X, y, labels = indexable(X, y, labels)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)'
                                 % (len(y), n_samples))
        n_splits = cv.get_n_splits(X, y, labels)

        if self.verbose > 0 and isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(n_splits, n_candidates,
                                     n_candidates * n_splits))

        base_estimator = clone(self.estimator)

        pre_dispatch = self.pre_dispatch

        out = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=pre_dispatch
        )(delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
                                  train, test, self.verbose, parameters,
                                  self.fit_params, return_parameters=True,
                                  error_score=self.error_score)
          for parameters in parameter_iterable
          for train, test in cv.split(X, y, labels))

        # Out is a list of 4-tuples: (score, n_test_samples, time, parameters)
        n_fits = len(out)

        scores = list()
        grid_scores = list()
        for grid_start in range(0, n_fits, n_splits):
            n_test_samples = 0
            score = 0
            all_scores = []
            for this_score, this_n_test_samples, _, parameters in \
                    out[grid_start:grid_start + n_splits]:
                all_scores.append(this_score)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_splits)
            scores.append((score, parameters))
            # TODO: shall we also store the test_fold_sizes?
            grid_scores.append(_CVScoreTuple(
                parameters,
                score,
                np.array(all_scores)))
        # Store the computed scores
        self.grid_scores_ = grid_scores

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
Ejemplo n.º 57
0
    def fit(self, X, y=None, labels=None):
        #return self._fit(
        #    X, y, labels,
        #    parameter_iterable # parameter_iterable, \in Sized, it actually does len(parameter_iterable) in _fit
        #)

        # FIXME code duplication from BaseSearchCV._fit
        estimator = self.estimator
        cv = _split.check_cv(self.cv, y, classifier=is_classifier(estimator))
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        n_samples = _num_samples(X)
        X, y, labels = indexable(X, y, labels)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                  'of samples (%i) than data (X: %i samples)'
                                  % (len(y), n_samples))

        n_splits = cv.get_n_splits(X, y, labels)

        if self.verbose > 0 and isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(n_splits, n_candidates,
                                     n_candidates * n_splits))

        base_estimator = clone(self.estimator)

        pre_dispatch = self.pre_dispatch
        # FIXME how to handle pre_dispatch


        # FIXME recursively getting new parameters to evaluate

#        parameter_iterable = ...  # the magic
#
#        # The evaluation (Parallel) stuff
#        out = Parallel(
#            n_jobs=self.n_jobs, verbose=self.verbose,
#            pre_dispatch=pre_dispatch
#        )(delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
#                                  train, test, self.verbose, parameters,
#                                  self.fit_params, return_parameters=True,
#                                  error_score=self.error_score)
#            for parameters in parameter_iterable
#            for train, test in cv.split(X, y, labels))
#

        # n_fits on each (train, test)
        def cross_validation(raw_parameters):
            parameters = dict(zip(
                self.param_grid.keys(), raw_parameters
            ))  # TODO more robust way of doing this
            print(parameters)

            return Parallel(
                n_jobs=self.n_jobs, verbose=self.verbose,
                pre_dispatch=pre_dispatch
            )(delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
                                      train, test, self.verbose, parameters,
                                      self.fit_params, return_parameters=True,
                                      error_score=self.error_score)
               for train, test in cv.split(X, y, labels))

        x = cartesian_product(*self.param_grid.values())

        # FIXME implement as non-recursive
        def bo_(x_obs, y_obs, n_iter):
            if n_iter > 0:
                kernel = kernels.Matern() + kernels.WhiteKernel()
                gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=16)
                gp.fit(x_obs, 1-y_obs)

                a = a_EI(gp, x_obs=x_obs, y_obs=1-y_obs)

                argmax_f_x_ = x[np.argmax(a(x))]

                # heavy evaluation
                f_argmax_f_x_ = cross_validation(argmax_f_x_)

                y_ob = np.atleast_2d(mean_mean_validation_scores(f_argmax_f_x_)).T

                return f_argmax_f_x_ + bo_(
                    x_obs=np.vstack((x_obs, argmax_f_x_)),
                    y_obs=np.vstack((y_obs, y_ob)),
                    n_iter=n_iter-1,
                )

            else:
                return []


        # FIXME (most informative) decision like Numerical Probabilistics stuff for integrations
        # Sobol initialization?

        sampled_x_ind = np.random.choice(
            x.shape[0],
            size=self.n_initial_points,
            replace=False,
        )
        print(sampled_x_ind)

        x_obs = x[sampled_x_ind]
        f_x_obs = list(map(cross_validation, x_obs))

        y_obs = np.atleast_2d(list(map(mean_mean_validation_scores, f_x_obs))).T

        out = sum(f_x_obs, []) + bo_(x_obs, y_obs, n_iter=self.n_iter)

        n_fits = len(out)

        scores = list()
        grid_scores = list()
        for grid_start in range(0, n_fits, n_splits):
            n_test_samples = 0
            score = 0
            all_scores = []
            for this_score, this_n_test_samples, _ , parameters in \
                    out[grid_start:grid_start + n_splits]:
                all_scores.append(this_score)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_splits)
            scores.append((score, parameters))

            grid_scores.append(_search._CVScoreTuple(
                parameters,
                score,
                np.array(all_scores)))

        self.grid_scores_ = grid_scores

        best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                      reverse=True)[0]

        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator

        return self
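The Bayesian loop above depends on helpers (`a_EI`, `cartesian_product`, `mean_mean_validation_scores`) that are not shown in this snippet. Purely as a hedged illustration, here is what an Expected Improvement acquisition like `a_EI` might compute for a GP fitted on `1 - score` (so lower is better); this is not the author's actual implementation.

import numpy as np
from scipy.stats import norm

def a_EI(gp, x_obs, y_obs, xi=0.01):
    """Expected Improvement acquisition for a GP fitted on values to minimise."""
    y_best = np.min(y_obs)

    def acquisition(x):
        mu, sigma = gp.predict(np.atleast_2d(x), return_std=True)
        mu, sigma = np.ravel(mu), np.maximum(np.ravel(sigma), 1e-12)
        improvement = y_best - mu - xi           # improvement over the incumbent
        z = improvement / sigma
        return improvement * norm.cdf(z) + sigma * norm.pdf(z)

    return acquisition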
Ejemplo n.º 58
0
    def _fit(self, depthmaps, offset_points_projected, direction_vectors, true_joints, parameter_iterable):

        cv = self.cv
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        n_samples = _num_samples(depthmaps)
        
        if _num_samples(offset_points_projected) != n_samples:
            raise ValueError('offset_points_projected has a different number '
                                'of samples ({0}) than data (depthmaps: {1} samples)'.format(_num_samples(offset_points_projected), n_samples))
        
        if _num_samples(direction_vectors) != n_samples:
            raise ValueError('direction_vectors has a different number '
                                'of samples ({0}) than data (depthmaps: {1} samples)'.format(_num_samples(direction_vectors), n_samples))
        
        cv = _check_cv(cv, n_samples)
            
        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print("Fitting {0} folds for each of {1} candidates, totalling"
                      " {2} fits".format(len(cv), n_candidates, n_candidates * len(cv)))
                      
        base_estimator = clone(self.estimator)

        pre_dispatch = self.pre_dispatch

        out = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=pre_dispatch
        )(
            delayed(_fit_and_score)(clone(base_estimator), depthmaps, offset_points_projected,
                                    direction_vectors, true_joints, self.scorer_,
                                    train, test, self.verbose, parameters,
                                    self.fit_params, return_parameters=True,
                                    error_score=self.error_score)
                for parameters in parameter_iterable
                for train, test in cv)
        
        # Out is a list of 4-tuples: (score, n_test_samples, time, parameters)
        n_fits = len(out)
        n_folds = len(cv)

        scores = list()
        grid_scores = list()
        for grid_start in range(0, n_fits, n_folds):
            n_test_samples = 0
            score = 0
            all_scores = []
            for this_score, this_n_test_samples, _, parameters in \
                    out[grid_start:grid_start + n_folds]:
                all_scores.append(this_score)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_folds)
            scores.append((score, parameters))
            
            grid_scores.append(_CVScoreTuple(
                parameters,
                score,
                np.array(all_scores)))
        # Store the computed scores
        self.grid_scores_ = grid_scores

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            best_estimator.fit(depthmaps, offset_points_projected, direction_vectors, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
Ejemplo n.º 59
0
    def _fit(self, X, y, parameter_dict):

        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        n_samples = _num_samples(X)
        X, y = indexable(X, y)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)'
                                 % (len(y), n_samples))
        cv = check_cv(self.cv, y=y, classifier=is_classifier(self.estimator))

        creator.create("FitnessMax", base.Fitness, weights=(1.0,))
        creator.create("Individual", list, est=clone(self.estimator), fitness=creator.FitnessMax)

        toolbox = base.Toolbox()

        name_values, gene_type, maxints = _get_param_types_maxint(parameter_dict)
        if self.gene_type is None:
            self.gene_type = gene_type

        if self.verbose:
            print("Types %s and maxint %s detected" % (self.gene_type, maxints))

        toolbox.register("individual", _initIndividual, creator.Individual, maxints=maxints)
        toolbox.register("population", tools.initRepeat, list, toolbox.individual)

        toolbox.register("evaluate", _evalFunction, searchobj=self,
                         name_values=name_values, X=X, y=y,
                         scorer=self.scorer_, cv=cv, iid=self.iid, verbose=self.verbose,
                         error_score=self.error_score, fit_params=self.fit_params)

        toolbox.register("mate", _cxIndividual, indpb=self.gene_crossover_prob, gene_type=self.gene_type)

        toolbox.register("mutate", _mutIndividual, indpb=self.gene_mutation_prob, up=maxints)
        toolbox.register("select", tools.selTournament, tournsize=self.tournament_size)

        if self.n_jobs > 1:
            pool = Pool(processes=self.n_jobs)
            toolbox.register("map", pool.map)
        pop = toolbox.population(n=self.population_size)
        hof = tools.HallOfFame(1)
        stats = tools.Statistics(lambda ind: ind.fitness.values)
        stats.register("avg", np.mean)
        stats.register("min", np.min)
        stats.register("max", np.max)

        if self.verbose:
            print('--- Evolve in {0} possible combinations ---'.format(np.prod(np.array(maxints)+1)))

        pop, logbook = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2,
                                           ngen=self.generations_number, stats=stats,
                                           halloffame=hof, verbose=self.verbose)

        current_best_score_ = hof[0].fitness.values[0]
        current_best_params_ = _individual_to_params(hof[0], name_values)

        if self.verbose:
            print("Best individual is: %s\nwith fitness: %s" % (
                current_best_params_, current_best_score_))
            print("Scoring evaluations: %d, Cache hits: %d, Total: %d" % (
                self.num_evaluations, self.num_cache_hits,
                self.num_evaluations + self.num_cache_hits))

        if current_best_score_ > self.best_score_:
            self.best_score_ = current_best_score_
            self.best_params_ = current_best_params_
Ejemplo n.º 60
0
    def predict(self, T):
        if self.check_X is not None:
            assert self.check_X(T)
        return self.classes_[np.zeros(_num_samples(T), dtype=int)]