Example #1
def cross_val_score_fn(estimator, X, y=None, scoring=None, cv=None, n_jobs=1,
                       verbose=0, fit_params=None, pre_dispatch='2*n_jobs'):
    """
    Evaluate a score by cross-validation.
    This overrides the cross_val_score method typically found in
    cross_validation.py. Changes are clearly marked in comments, but
    the main change is augmenting the function to store Fit and Metric Events
    for each fold.
    """
    X, y = indexable(X, y)

    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    scorer = check_scoring(estimator, scoring=scoring)

    # Default scoring scheme is 'accuracy' unless provided by user.
    if scoring is None:
        scoring = 'accuracy'
    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)

    # Change from original scikit code: adding a new argument, scoring, to the
    # _fit_and_score function to track scoring function and create
    # MetricEvents.
    scores = parallel(delayed(_fit_and_score)(clone(estimator), X, y, scorer,
                                              train, test, verbose, None,
                                              fit_params, scoring)
                      for train, test in cv)
    return np.array(scores)[:, 0]
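Note: for context, a minimal sketch of the building blocks this override relies on, written against the current public sklearn API (an assumption; the snippet above targets the older cross_validation module, where check_cv also took X and folds were iterated directly):

import numpy as np
from sklearn.base import is_classifier
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import check_scoring
from sklearn.model_selection import check_cv
from sklearn.utils import indexable

X, y = make_classification(n_samples=60, random_state=0)
est = LogisticRegression(max_iter=1000)
X, y = indexable(X, y)
cv = check_cv(5, y, classifier=is_classifier(est))   # StratifiedKFold here
scorer = check_scoring(est, scoring='accuracy')
scores = [scorer(est.fit(X[train], y[train]), X[test], y[test])
          for train, test in cv.split(X, y)]
print(np.mean(scores))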
Example #2
    def fit(self, X, y=None, groups=None, **fit_params):
        # pylint: disable=unbalanced-tuple-unpacking
        X, y, groups = indexable(X, y, groups)

        # Evaluate candidates.
        scorers = {
            "score": check_scoring(self.estimator, scoring=self.scoring)
        }
        candidate_params = list(
            ParameterSampler(self.param_distributions,
                             self.n_iter,
                             random_state=self.random_state))
        self.cv_results_ = self.evaluate_candidates(X, y, groups,
                                                    candidate_params, scorers,
                                                    fit_params)

        # Get the best parameter.
        self.best_index_ = self.cv_results_["rank_test_score"].argmin()
        self.best_score_ = self.cv_results_["mean_test_score"][
            self.best_index_]
        self.best_params_ = self.cv_results_["params"][self.best_index_]

        # Refit.
        if self.refit:
            best_estimator = clone(
                self.estimator).set_params(**self.best_params_)
            if y is not None:
                best_estimator.fit(X, y, **fit_params)
            else:
                best_estimator.fit(X, **fit_params)
            self.best_estimator_ = best_estimator

        return self
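Note: the candidate_params list above is drawn with ParameterSampler; a minimal sketch of that step (the distribution below is illustrative and assumes scipy is installed):

from scipy.stats import uniform
from sklearn.model_selection import ParameterSampler

param_distributions = {'C': uniform(0.1, 10.0), 'penalty': ['l2']}
candidate_params = list(ParameterSampler(param_distributions, n_iter=5,
                                         random_state=0))
# -> list of 5 parameter dicts sampled from the distributions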
Example #3
def fit_and_save(estimator,
                 X,
                 y=None,
                 groups=None,
                 scoring=None,
                 cv=None,
                 n_jobs=1,
                 verbose=0,
                 fit_params=None,
                 pre_dispatch='2*n_jobs',
                 return_train_score=True,
                 parameters=dict(),
                 uuid='',
                 url='http://127.0.0.1:8000'):

    import json, requests, numpy
    from sklearn.model_selection._validation import cross_validate

    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    scorers, _ = _check_multimetric_scoring(estimator, scoring=scoring)

    _base_scores = [0. for _ in range(cv.get_n_splits(X, y, groups))]

    cv_score = {}
    cv_score.update(
        {'train_%s' % s: numpy.array(_base_scores)
         for s in scorers})
    cv_score.update(
        {'test_%s' % s: numpy.array(_base_scores)
         for s in scorers})
    cv_score.update({'fit_time': _base_scores, 'score_time': _base_scores})

    try:
        cv_score = cross_validate(estimator, X, y, groups, scorers, cv, n_jobs,
                                  verbose, fit_params, pre_dispatch,
                                  return_train_score)
        error = None
    except Exception as e:
        error = '{}: {}'.format(type(e).__name__, str(e))

    try:
        for k, v in cv_score.items():
            if type(v) == type(numpy.array([])):
                cv_score[k] = v.tolist()
        response = requests.post('{url}/grids/{uuid}/results'.format(
            url=url, uuid=uuid),
                                 data={
                                     'gridsearch': uuid,
                                     'params': json.dumps(parameters),
                                     'errors': error,
                                     'cv_data': json.dumps(cv_score)
                                 })

    except requests.exceptions.ConnectionError as e:
        response = None
    if response is None:
        return
    return response
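Note: the url/uuid endpoint above is deployment-specific; a minimal sketch that only exercises the cross_validate call and the JSON serialization of its numpy arrays, with an illustrative estimator and scorer dict:

import json
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

X, y = make_classification(n_samples=60, random_state=0)
cv_score = cross_validate(LogisticRegression(max_iter=1000), X, y,
                          scoring={'acc': 'accuracy'}, cv=3,
                          return_train_score=True)
payload = {k: (v.tolist() if isinstance(v, np.ndarray) else v)
           for k, v in cv_score.items()}
print(json.dumps(payload))   # what would be posted as 'cv_data'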
Example #4
    def fit(self, X, y=None, *, groups=None, **fit_params):
        """Run fit with all sets of parameters.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.
        y : array-like of shape (n_samples, n_output) \
            or (n_samples,), default=None
            Target relative to X for classification or regression;
            None for unsupervised learning.
        groups : array-like of shape (n_samples,), default=None
            Group labels for the samples used while splitting the dataset into
            train/test set. Only used in conjunction with a "Group" :term:`cv`
            instance (e.g., :class:`~sklearn.model_selection.GroupKFold`).
        **fit_params : dict of str -> object
            Parameters passed to the ``fit`` method of the estimator
        """
        estimator = self.estimator
        cv = check_cv(self.cv, y, classifier=is_classifier(estimator))

        scorers, self.multimetric_ = _check_multimetric_scoring(
            self.estimator, scoring=self.scoring)

        X, y, groups = indexable(X, y, groups)

        self._run_search(X, y, cv)

        return self
Example #5
    def split(self, y, exogenous=None):
        """Generate indices to split data into training and test sets

        Parameters
        ----------
        y : array-like or iterable, shape=(n_samples,)
            The time-series array.

        exogenous : array-like, shape=[n_obs, n_vars], optional (default=None)
            An optional 2-d array of exogenous variables.

        Yields
        ------
        train : np.ndarray
            The training set indices for the split

        test : np.ndarray
            The test set indices for the split
        """
        y, exog = indexable(y, exogenous)
        indices = np.arange(y.shape[0])
        for train_index, test_index in self._iter_train_test_masks(y, exog):
            train_index = indices[train_index]
            test_index = indices[test_index]
            yield train_index, test_index
Example #6
    def fit(self, X, y, sample_weight=None, groups=None, missing=None, cat_cols=None,
            n_trials=10, timeout_per_estimator=None):
        # TODO check that y is regression and not classification
        # TODO: consider log-transform y?
        X, y, groups = indexable(X, y, groups)

        y = np.array(y)
        y_mean = np.mean(y)

        cv = check_cv(self.cv, y, classifier=False)
        if cv.random_state is None:
            cv.random_state = self.random_state

        # if self.sampler.seed is None:
        #     self.sampler.seed = self.random_state

        scorer, scorer_type, greater_is_better = get_scorer_type(self.scoring)

        valid_estimators = get_estimators(self.frameworks, self.model_types,
                                          objective_type="regression")

        #valid_estimators = filter_estimators(X, valid_estimators, y_mean, "regression")

        self.run_study(X, y, valid_estimators, cv, scorer, scorer_type,
                       greater_is_better, y_stats=y_mean, objective_type="regression",
                       sample_weight=sample_weight, groups=groups, missing=missing,
                       cat_cols=cat_cols, timeout_per_estimator=timeout_per_estimator,
                       n_trials=n_trials)

        self.save_results()

        if self.refit_:
            self.best_pipeline_.fit(X, y)
Example #7
    def _gen_train_val(self, X, y, cv_split_method):
        X, y, groups = indexable(X, y, None)
        Xs_tr, ys_tr, Xs_cv, ys_cv = [], [], [], []

        if isinstance(cv_split_method, BaseCrossValidator):
            for tr, cv in cv_split_method.split(X, y, groups):
                X_tr, y_tr = _safe_split(self, X, y, tr)
                X_cv, y_cv = _safe_split(self, X, y, cv, tr)
                Xs_tr.append(X_tr)
                Xs_cv.append(X_cv)
                ys_tr.append(y_tr)
                ys_cv.append(y_cv)
        elif cv_split_method.__name__ == 'train_test_split':
            X_tr, X_cv, y_tr, y_cv = train_test_split(
                X,
                y,
                random_state=self._random_state,
                test_size=self.validation_fraction)
            Xs_tr.append(X_tr)
            Xs_cv.append(X_cv)
            ys_tr.append(y_tr)
            ys_cv.append(y_cv)
        else:
            raise ValueError("Split method should be a "
                             "sklearn.model_selection splitter class...")

        return Xs_tr, ys_tr, Xs_cv, ys_cv
Example #8
    def _three_way_split(splitter: KFold,
                         X,
                         y: Optional = None,
                         groups: Optional = None) -> Generator:
        """A modified version of BaseCrossValidator.split().

        Yields (K-2/1/1) train/val/test splits.
        """
        X, y, groups = indexable(X, y, groups)
        indices = np.arange(_num_samples(X))
        test_masks_it = splitter._iter_test_masks(X, y, groups)
        first_mask = last_mask = next(test_masks_it)
        for test_mask in test_masks_it:
            train_index = indices[np.logical_not(
                np.logical_or(test_mask, last_mask))]
            val_index = indices[last_mask]
            test_index = indices[test_mask]
            yield train_index, val_index, test_index
            last_mask = test_mask
        # last fold
        test_mask = first_mask
        train_index = indices[np.logical_not(
            np.logical_or(test_mask, last_mask))]
        val_index = indices[last_mask]
        test_index = indices[test_mask]
        yield train_index, val_index, test_index
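Note: a standalone sketch of the same K-2/1/1 idea driven by a plain KFold; it leans on the same private _iter_test_masks helper as the snippet above, and the function name is illustrative:

import numpy as np
from sklearn.model_selection import KFold
from sklearn.utils import indexable
from sklearn.utils.validation import _num_samples

def three_way_split(splitter, X, y=None, groups=None):
    X, y, groups = indexable(X, y, groups)
    indices = np.arange(_num_samples(X))
    masks = list(splitter._iter_test_masks(X, y, groups))
    # pair each test fold with the preceding fold as validation
    for val_mask, test_mask in zip(masks, masks[1:] + masks[:1]):
        train = indices[~(val_mask | test_mask)]
        yield train, indices[val_mask], indices[test_mask]

for tr, va, te in three_way_split(KFold(n_splits=4), np.arange(8)):
    print(len(tr), len(va), len(te))   # 4 2 2 for each of the 4 splits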
Example #9
    def split(self, y, X=None, **kwargs):  # TODO: remove kwargs
        """Generate indices to split data into training and test sets

        Parameters
        ----------
        y : array-like or iterable, shape=(n_samples,)
            The time-series array.

        X : array-like, shape=[n_obs, n_vars], optional (default=None)
            An optional 2-d array of exogenous variables.

        Yields
        ------
        train : np.ndarray
            The training set indices for the split

        test : np.ndarray
            The test set indices for the split
        """
        # Temporary shim until we remove `exogenous` support completely
        X, _ = pm_compat.get_X(X, **kwargs)

        y, X = indexable(y, X)
        indices = np.arange(y.shape[0])
        for train_index, test_index in self._iter_train_test_masks(y, X):
            train_index = indices[train_index]
            test_index = indices[test_index]
            yield train_index, test_index
Example #10
    def split(self, X, y=None, groups=None):
        X, y, groups = indexable(X, y, groups)

        n_samples = _num_samples(X)
        indices = np.arange(n_samples)

        n_splits = self.n_splits
        fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=int)
        fold_sizes[:n_samples % n_splits] += 1

        current = 0

        for fold_size in fold_sizes:
            start, stop = current, current + fold_size

            test_index = indices[start:stop]

            start_pad = start - self.n_reduce
            stop_pad = stop + self.n_reduce

            if start_pad < 0:
                start_pad = 0

            if stop_pad > n_samples:
                stop_pad = n_samples

            block_index = indices[start_pad:stop_pad]

            block_mask = np.zeros(n_samples, dtype=bool)
            block_mask[block_index] = True
            train_index = indices[np.logical_not(block_mask)]

            yield train_index, test_index

            current = stop
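Note: a quick numeric check of the fold-size bookkeeping used above (plain numpy; the counts are illustrative):

import numpy as np
n_samples, n_splits = 10, 3
fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=int)
fold_sizes[:n_samples % n_splits] += 1
print(fold_sizes)   # [4 3 3]; fold boundaries then follow cumulatively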
Example #11
    def fit(self, X, y):
        """Actual fitting,  performing the search over parameters."""

        estimator = self.estimator
        cv = self.cv
        n_samples = _num_samples(X)
        X, y = indexable(X, y)

        param_grid = ParameterSampler(self.param_distributions,
                                      self.n_iter,
                                      random_state=self.random_state)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)' %
                                 (len(y), n_samples))
        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        if self.verbose > 0:
            if isinstance(param_grid, Sized):
                n_candidates = len(param_grid)
                print("Fitting {0} folds for each of {1} candidates, totalling"
                      " {2} fits".format(len(cv), n_candidates,
                                         n_candidates * len(cv)))

        base_estimator = clone(self.estimator)
        indexed_param_grid = list(zip(range(len(param_grid)), param_grid))
        par_param_grid = self.sc.parallelize(indexed_param_grid,
                                             len(indexed_param_grid))
        X_bc = self.sc.broadcast(X)
        y_bc = self.sc.broadcast(y)

        scorer = self.scorer_

        indexed_output = dict(
            par_param_grid.map(
                lambda i: local_fit(i[0], i[1], base_estimator, X_bc.value,
                                    y_bc.value, scorer, cv)).collect())
        out = [indexed_output[idx] for idx in range(len(param_grid))]

        X_bc.unpersist()
        y_bc.unpersist()

        best = sorted(out, key=lambda x: x[0], reverse=True)[0]

        self.best_params_ = best[1]
        self.best_score_ = best[0]

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(**best[1])
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
Example #12
    def fit(self, X, y, groups=None):
        """Actual fitting,  performing the search over parameters."""
        results = dict()

        best_index = None
        best_parameters = None

        for bracket_idx in range(self.num_brackets - 1, -1, -1):
            successive_halving_steps = bracket_idx + 1
            # TODO: num_arms should be different

            estimator = self.estimator
            cv = check_cv(self.cv, y, classifier=is_classifier(estimator))
            self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

            X, y, groups = indexable(X, y, groups)
            n_splits = cv.get_n_splits(X, y, groups)

            base_estimator = clone(self.estimator)

            arms_pulled = 0
            if 'mean_test_score' in results:
                arms_pulled = len(results['mean_test_score'])

            res = self._successive_halving(X, y, groups, cv, self.eta,
                                           successive_halving_steps - 1,
                                           self.num_brackets - 1)
            bracket_results, bracket_best_index, bracket_best_parameters = res
            for key, values in bracket_results.items():
                if key not in results:
                    results[key] = values
                else:
                    results[key] = np.append(results[key], values)

            if best_index is None:
                best_index = bracket_best_index + arms_pulled
                best_parameters = bracket_best_parameters
            elif bracket_results['mean_test_score'][
                    bracket_best_index] > results['mean_test_score'][
                        best_index]:
                best_index = bracket_best_index + arms_pulled
                best_parameters = bracket_best_parameters

        self.cv_results_ = results
        self.best_index_ = best_index
        self.n_splits_ = n_splits

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best_parameters)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
Example #13
    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.

        :param X: array-like of shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        :param y: array-like of shape (n_samples,)
            Always ignored, exists for compatibility.
        :param groups: array-like of shape (n_samples,)
            Always ignored, exists for compatibility.

        :returns:
            train : ndarray
                The training set indices for that split.
            test : ndarray
                The testing set indices for that split.
        """
        if groups is None:
            raise ValueError("The 'groups' parameter should not be None.")
        X, y, groups = indexable(X, y, groups)
        groups = check_array(groups, ensure_2d=False, dtype=None)

        unique_groups, groups = np.unique(groups, return_inverse=True)
        n_samples_per_group = np.bincount(groups)
        n_groups = len(unique_groups)
        n_samples = _num_samples(X)
        n_splits = self.n_splits
        n_folds = n_splits + 1

        if self.n_splits > n_groups:
            raise ValueError("Cannot have number of splits n_splits=%d greater"
                             " than the number of groups: %d." %
                             (self.n_splits, n_groups))

        indices = np.arange(n_samples)
        test_size = (n_groups // n_folds)
        test_starts = range(test_size + n_groups % n_folds, n_groups,
                            test_size)
        for test_start in test_starts:
            # here we already have groups after inverse operation
            # and don't need to use unique_group
            if self.max_train_size:
                # find number of group for start not to overflow train size
                sizes = n_samples_per_group[:test_start][::-1].cumsum()
                appropriate_indices = np.where(sizes <= self.max_train_size)[0]
                if appropriate_indices.size == 0:
                    train_start = max(test_start - 1, 0)
                else:
                    train_start = test_start - appropriate_indices.max() - 1
                yield (indices[(groups < test_start)
                               & (groups >= train_start)],
                       indices[(groups >= test_start)
                               & (groups < test_start + test_size)])
            else:
                yield (indices[groups < test_start],
                       indices[(groups >= test_start)
                               & (groups < test_start + test_size)])
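Note: a small worked example of the test-start arithmetic above (numbers chosen for illustration):

n_groups, n_splits = 7, 3
n_folds = n_splits + 1
test_size = n_groups // n_folds                       # 1
test_starts = range(test_size + n_groups % n_folds,   # first start at 4
                    n_groups, test_size)
print(list(test_starts))                              # [4, 5, 6]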
Example #14
    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

        y : array-like, shape (n_samples,)
            The target variable for supervised learning problems.

        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.

        Returns
        -------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.
        """

        if groups is not None:
            # find all indices that are at the beginning of a group
            groups_unique = np.unique(groups)
            possible_test_start = [
                np.where(i == groups)[0][0] for i in np.nditer(groups_unique)
            ]
            possible_test_start = np.asarray(possible_test_start)

        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        n_splits = self.n_splits
        n_folds = n_splits + 1
        if n_folds > n_samples:
            raise ValueError(("Cannot have number of folds ={0} greater"
                              " than the number of samples: {1}.").format(
                                  n_folds, n_samples))
        indices = np.arange(n_samples)
        test_size = (n_samples // n_folds)
        test_starts = range(test_size + n_samples % n_folds, n_samples,
                            test_size)

        if groups is not None:
            # find all possible starts that are closest to predefined test_starts
            test_starts = [
                possible_test_start[np.abs(possible_test_start - i).argmin()]
                for i in test_starts
            ]

        for test_start in test_starts:
            yield (indices[:test_start],
                   indices[test_start:test_start + test_size])
Example #15
    def split(self, X, y=None, groups=None):
        X, y, groups = indexable(X, y, groups)

        n_samples = _num_samples(X)
        train_size = round((1 - self.test_size) * n_samples)

        train_index = np.arange(train_size - self.n_reduce)
        test_index = np.arange(train_size, n_samples)

        yield train_index, test_index
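Note: a quick check of the hold-out arithmetic above, showing the gap of n_reduce samples left between train and test (values are illustrative):

import numpy as np
n_samples, test_size, n_reduce = 10, 0.3, 1
train_size = round((1 - test_size) * n_samples)   # 7
train_index = np.arange(train_size - n_reduce)    # [0 .. 5]
test_index = np.arange(train_size, n_samples)     # [7 8 9]; sample 6 is skipped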
Example #16
    def fit(self, X, y):
        """Actual fitting,  performing the search over parameters."""

        parameter_iterable = ParameterSampler(self.param_distributions,
                                              self.n_iter,
                                              random_state=self.random_state)
        estimator = self.estimator
        cv = self.cv

        n_samples = _num_samples(X)
        X, y = indexable(X, y)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)'
                                 % (len(y), n_samples))
        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print("Fitting {0} folds for each of {1} candidates, totalling"
                      " {2} fits".format(len(cv), n_candidates,
                                         n_candidates * len(cv)))

        base_estimator = clone(self.estimator)

        pre_dispatch = self.pre_dispatch

        out = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=pre_dispatch
        )(
            delayed(cv_fit_and_score)(clone(base_estimator), X, y, self.scoring,
                                      parameters, cv=cv)
            for parameters in parameter_iterable)

        best = sorted(out, reverse=True)[0]
        self.best_params_ = best[1]
        self.best_score_ = best[0]

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best[1])
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator

        return self
Example #18
def cross_val_score(estimator,
                    X,
                    y=None,
                    fold_specific_X_extractor=None,
                    groups=None,
                    scorings=None,
                    cv=None,
                    n_jobs=1,
                    verbose=0,
                    fit_params=None,
                    pre_dispatch='2*n_jobs'):
    """
    :param estimator: 
    :param X: 
    :param y: 
    :param fold_specific_X_extractor: 
    :param groups: 
    :param scorings: list of scorings (strings, callables, etc...)
    :param cv: 
    :param n_jobs: 
    :param verbose: 
    :param fit_params: 
    :param pre_dispatch: 
    :return: an array of scores, shape: <folds x scores>
    """
    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    scorers = [
        check_scoring(estimator, scoring=scoring) for scoring in scorings
    ]
    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs,
                        verbose=verbose,
                        pre_dispatch=pre_dispatch)
    scores = parallel(
        delayed(_fe_fit_and_score)(
            clone(estimator),
            X,
            y,
            scorers,
            train,
            test,
            verbose,
            None,
            fit_params,
            fold_specific_X_extractor=fold_specific_X_extractor)
        for train, test in cv.split(X, y, groups))
    # here scores is python list of shape <folds x 1 x scores>
    scores = np.array(scores)
    # eliminate middle axis
    return scores.reshape((scores.shape[0], scores.shape[2]))
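Note: a hedged sketch of how the per-scoring scorer list above is built (the scoring names and estimator are examples, not from the original project):

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import check_scoring

estimator = LogisticRegression()
scorings = ['accuracy', 'f1_macro']
scorers = [check_scoring(estimator, scoring=s) for s in scorings]
# each scorer is then applied per fold, giving a <folds x scores> array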
Example #19
    def __init__(self,
                 X,
                 generator,
                 y=None,
                 batch_size=32,
                 shuffle=True,
                 sample_weight=None,
                 seed=None):
        X, y, sample_weight = indexable(X, y, sample_weight)
        self.X = X
        self.generator = generator
        self.y = y
        self.sample_weight = sample_weight

        super(FastaToArrayIterator, self).__init__(X.shape[0], batch_size,
                                                   shuffle, seed)
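Note: for reference, a minimal sketch of what indexable does with the (X, y, sample_weight) triple above: it checks consistent lengths, returns the arguments in an indexable form, and passes None through unchanged:

import numpy as np
from sklearn.utils import indexable

X = np.arange(12).reshape(6, 2)
y = [0, 1, 0, 1, 0, 1]
X, y, sample_weight = indexable(X, y, None)   # sample_weight stays None
print(X.shape[0], len(y), sample_weight)      # 6 6 None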
Example #20
    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.
        Parameters
        ----------
        X : array-like, of length n_samples
            Training data, includes reaction's containers
        y : array-like, of length n_samples
            The target variable for supervised learning problems.
        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.
        Yields
        ------
        train : ndarray
            The training set indices for that split.
        test : ndarray
            The testing set indices for that split.
        """
        X, y, groups = indexable(X, y, groups)
        cgr = CGRpreparer()
        cgrs = [cgr.condense(r) for r in X]

        structure_condition = defaultdict(set)

        for structure, condition in zip(cgrs, groups):
            structure_condition[structure].add(condition)

        train_data = defaultdict(list)
        test_data = []

        for n, (structure, condition) in enumerate(zip(cgrs, groups)):
            train_data[condition].append(n)
            if len(structure_condition[structure]) > 1:
                test_data.append(n)

        for condition, indexes in train_data.items():

            test_index = [index for index in indexes if index in test_data]
            if len(test_index) == 0:
                continue

            train_index = []
            for c in train_data.keys():
                if not c == condition:
                    train_index.extend(train_data[c])

            yield array(train_index), array(test_index)
Example #21
    def split(self, X, y=None, groups=None):
        X, y, groups = indexable(X, y, groups)

        n_samples = _num_samples(X)
        indices = np.arange(n_samples)

        argsort = np.argsort(groups)

        for i in range(self.n_splits):
            train_index = indices[argsort[i::self.n_splits]]

            train_mask = np.zeros(n_samples, dtype=bool)
            train_mask[train_index] = True

            test_index = indices[np.logical_not(train_mask)]

            yield train_index, test_index
Example #22
    def split(self,
              X,
              y,
              erry=10e-4,
              groups=None,
              match_window=np.inf,
              num_pairs=None,
              closest_match=False,
              random_state=None):

        X, y, groups = indexable(X, y, groups)
        num_samples = _num_samples(X)
        if num_samples < 2:
            raise ValueError(
                'Number of samples must be greater than or equal to 2.')
        return self.generate_train_test(X, y, erry, groups, match_window,
                                        num_pairs, closest_match, random_state)
Example #23
    def fit(self, X, y, groups=None):
        """Actual fitting,  performing the search over parameters."""
        num_arms = self.eta**(self.num_steps - 1)
        parameter_iterable = ParameterSampler(self.param_distributions,
                                              num_arms,
                                              random_state=self.random_state)

        estimator = self.estimator
        cv = check_cv(self.cv, y, classifier=is_classifier(estimator))
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        X, y, groups = indexable(X, y, groups)
        n_splits = cv.get_n_splits(X, y, groups)
        if self.verbose > 0 and isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(n_splits, n_candidates,
                                     n_candidates * n_splits))

        base_estimator = clone(self.estimator)

        results, best_index, best_parameters = self._successive_halving(
            X, y, groups, cv, self.eta, self.num_steps - 1, self.num_steps - 1)

        self.cv_results_ = results
        self.best_index_ = best_index
        self.n_splits_ = n_splits

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best_parameters)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
Example #24
    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

        y : array-like, shape (n_samples,)
            Always ignored, exists for compatibility.

        groups : array-like, with shape (n_samples,), optional
            Always ignored, exists for compatibility.

        Returns
        -------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.
        """
        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        initial_train_index = max(
            0,
            _get_size(n_samples, self.initial_train_index,
                      neg_mode='subtract'))
        final_index = n_samples
        test_size = _get_size(n_samples - initial_train_index,
                              self.test_size,
                              neg_mode='subtract')

        indices = np.arange(n_samples)
        train = indices[initial_train_index:-test_size]
        test = indices[-test_size:final_index]

        yield train, test
Example #25
    def split(self, X, y=None, groups=None):
        X, y, groups = indexable(X, y, groups)
        indices = np.arange(_num_samples(X))
        X_copy = X.copy()
        X_copy.insert(0, "idx", indices)

        for test_index in self._iter_test_masks(X_copy, y, "pid"):
            train_index = indices[np.logical_not(test_index)]
            test_index = indices[test_index]

            discard_train_index = []
            # exclude days after test for specific participant
            for pid in set(X_copy.iloc[test_index].index.get_level_values("pid").tolist()):
                participant_in_train = X_copy.iloc[train_index][X_copy.iloc[train_index].index.get_level_values("pid") == pid]
                participant_in_test = X_copy.iloc[test_index][X_copy.iloc[test_index].index.get_level_values("pid") == pid]
                last_date_in_test = participant_in_test.index.max()

                discard_train_index.extend(participant_in_train[participant_in_train.index >= pd.Index([last_date_in_test]*len(participant_in_train))]["idx"].tolist())
            
            train_index = train_index[~np.isin(train_index, discard_train_index)]

            yield train_index, test_index
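Note: a small check of the discard step above, using np.isin to drop leaked indices from the training fold (the indices are illustrative):

import numpy as np
train_index = np.arange(10)
discard_train_index = [3, 7]
train_index = train_index[~np.isin(train_index, discard_train_index)]
print(train_index)   # [0 1 2 4 5 6 8 9]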
Example #26
def cross_val_score_(estimators,
                     X,
                     y=None,
                     groups=None,
                     scoring=None,
                     cv=None,
                     n_jobs=1,
                     verbose=0,
                     fit_params=None):
    X, y, groups = indexable(X, y, groups)
    cv = check_cv(cv, y, classifier=True)
    cv_iter = list(cv.split(X, y, groups))

    parallel = Parallel(n_jobs=n_jobs, verbose=0)

    scores = parallel(
        delayed(_fit_and_score)(estimators[i], X, y,
                                check_scoring(estimators[i], scoring=scoring),
                                train, test, verbose, None, fit_params)
        for i, (train, test) in enumerate(cv_iter))

    return np.array(scores)[:, 0]
Example #27
def to_indexable(*args, **kwargs):
    """Ensure that all args are an indexable type.

    Conversion runs lazily for dask objects, immediately otherwise.

    Parameters
    ----------
    args : array_like or scalar
    allow_scalars : bool, optional
        Whether to allow scalars in args. Default is False.
    """
    if kwargs.get('allow_scalars', False):
        indexable = _maybe_indexable
    else:
        indexable = _indexable
    for x in args:
        if x is None or isinstance(x, da.Array):
            yield x
        elif is_dask_collection(x):
            yield delayed(indexable, pure=True)(x)
        else:
            yield indexable(x)
Example #28
def to_indexable(*args, **kwargs):
    """Ensure that all args are an indexable type.

    Conversion runs lazily for dask objects, immediately otherwise.

    Parameters
    ----------
    args : array_like or scalar
    allow_scalars : bool, optional
        Whether to allow scalars in args. Default is False.
    """
    if kwargs.get("allow_scalars", False):
        indexable = _maybe_indexable
    else:
        indexable = _indexable
    for x in args:
        if x is None or isinstance(x, da.Array):
            yield x
        elif is_dask_collection(x):
            yield delayed(indexable, pure=True)(x)
        else:
            yield indexable(x)
Example #29
    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

        y : array-like, shape (n_samples,)
            Always ignored, exists for compatibility.

        groups : array-like, with shape (n_samples,)
            Always ignored, exists for compatibility.

        Yields
        ------
        train : ndarray
            The training set indices for that split.
        test : ndarray
            The testing set indices for that split.

        """

        X, y, groups = indexable(X, y, groups)

        n_samples = _num_samples(X)
        indices = np.arange(n_samples)

        h = self.h

        min_position = np.maximum(h, int(n_samples * (1 - self.p_to_use)))

        positions = np.flip(np.arange(min_position, n_samples - h))

        for position in positions:

            yield (indices[:position], indices[position:position + h])
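Note: a quick numeric check of the window positions generated above (n_samples, h and p_to_use are illustrative):

import numpy as np
n_samples, h, p_to_use = 20, 3, 0.5
min_position = np.maximum(h, int(n_samples * (1 - p_to_use)))   # 10
positions = np.flip(np.arange(min_position, n_samples - h))
print(positions)   # [16 15 14 13 12 11 10]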
Example #30
    def split(self, X, y=None, groups=None):
        """ Generates indices to split training and testing data

        Args:
            X:
            y: Not used, exists for compatibility
            groups: Not used, exists for compatibility

        Returns:
            train (np.ndarray): indices for training set
            test (np.ndarray): indices for testing set

        """
        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)  # len(X)
        n_splits = self.n_splits

        trainm = self.min_train_size
        trainM = self.max_train_size if self.max_train_size is not None else np.inf
        testm = self.min_test_size
        testM = self.max_test_size if self.max_test_size is not None else np.inf
        delay = self.delay

        if (n_samples - (trainm + delay + testm) < n_splits - 1):
            raise ValueError("Not enough samples")

        # The datum for each fold will be the index of the first test sample
        self.test_starts = np.linspace(trainm + delay, n_samples - testm, n_splits, dtype=int)
        indices = np.arange(n_samples)
        for test_start in self.test_starts:
            test_end = min(test_start + testM, n_samples)

            train_end = test_start - delay
            train_start = max(test_start - delay - trainM, 0)

            yield indices[train_start:train_end], indices[test_start:test_end]
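Note: a worked example of the evenly spaced test-start computation above (the sizes are illustrative):

import numpy as np
n_samples, n_splits = 30, 4
trainm, testm, delay = 10, 5, 2
test_starts = np.linspace(trainm + delay, n_samples - testm,
                          n_splits, dtype=int)
print(test_starts)   # [12 16 20 25]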
Example #31
    def fit(self, X, y, sample_weight=None, groups=None,
            missing=None, cat_cols=None, n_trials=10,
            timeout_per_estimator=None):
        X, y, groups = indexable(X, y, groups)

        ## convert labels to np.array
        le = LabelEncoder()
        y = le.fit_transform(y)
        class_counts = np.bincount(y)

        cv = check_cv(self.cv, y, classifier=True)
        if cv.random_state is None:
            cv.random_state = self.random_state

        # if self.sampler.seed is None: #TODO: check for CMA
        #     self.sampler.seed = self.random_state

        scorer, scorer_type, greater_is_better = get_scorer_type(self.scoring)
        data_tags = get_data_tags(X, y, "classification", class_counts)
        #get estimators ("name", tags, class) by installed packages + version
        #filter estimators by data & constraints

        valid_estimators = get_estimators(self.frameworks, self.model_types,
                                          objective_type="classification")

        #valid_estimators = filter_estimators(X, valid_estimators, class_counts, "classification")

        self.run_study(X, y, valid_estimators, cv, scorer, scorer_type, greater_is_better,
                       y_stats=class_counts, objective_type="classification", sample_weight=sample_weight,
                       groups=groups, missing=missing, cat_cols=cat_cols,
                       timeout_per_estimator=timeout_per_estimator, n_trials=n_trials)

        self.save_results()

        if self.refit_:
            self.best_pipeline_.fit(X, y)
Example #32
    def fit(self, X, y=None, labels=None):
        #return self._fit(
        #    X, y, labels,
        #    parameter_iterable # parameter_iterable, \in Sized, it actually does len(parameter_iterable) in _fit
        #)

        # FIXME code duplication from BaseSearchCV._fit
        estimator = self.estimator
        cv = _split.check_cv(self.cv, y, classifier=is_classifier(estimator))
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        n_samples = _num_samples(X)
        X, y, labels = indexable(X, y, labels)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                  'of samples (%i) than data (X: %i samples)'
                                  % (len(y), n_samples))

        n_splits = cv.get_n_splits(X, y, labels)

        if self.verbose > 0 and isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(n_splits, n_candidates,
                                     n_candidates * n_splits))

        base_estimator = clone(self.estimator)

        pre_dispatch = self.pre_dispatch
        # FIXME how to handle pre_dispatch


        # FIXME recursively getting new parameters to evaluate

#        parameter_iterable = ...  # the magic
#
#        # The evaluation (Parallel) stuff
#        out = Parallel(
#            n_jobs=self.n_jobs, verbose=self.verbose,
#            pre_dispatch=pre_dispatch
#        )(delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
#                                  train, test, self.verbose, parameters,
#                                  self.fit_params, return_parameters=True,
#                                  error_score=self.error_score)
#            for parameters in parameter_iterable
#            for train, test in cv.split(X, y, labels))
#

        # n_fits on each (train, test)
        def cross_validation(raw_parameters):
            parameters = dict(zip(
                self.param_grid.keys(), raw_parameters
            ))  # TODO more robust way of doing this
            print(parameters)

            return Parallel(
                n_jobs=self.n_jobs, verbose=self.verbose,
                pre_dispatch=pre_dispatch
            )(delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
                                      train, test, self.verbose, parameters,
                                      self.fit_params, return_parameters=True,
                                      error_score=self.error_score)
               for train, test in cv.split(X, y, labels))

        x = cartesian_product(*self.param_grid.values())

        # FIXME implement as non-recursive
        def bo_(x_obs, y_obs, n_iter):
            if n_iter > 0:
                kernel = kernels.Matern() + kernels.WhiteKernel()
                gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=16)
                gp.fit(x_obs, 1-y_obs)

                a = a_EI(gp, x_obs=x_obs, y_obs=1-y_obs)

                argmax_f_x_ = x[np.argmax(a(x))]

                # heavy evaluation
                f_argmax_f_x_ = cross_validation(argmax_f_x_)

                y_ob = np.atleast_2d(mean_mean_validation_scores(f_argmax_f_x_)).T

                return f_argmax_f_x_ + bo_(
                    x_obs=np.vstack((x_obs, argmax_f_x_)),
                    y_obs=np.vstack((y_obs, y_ob)),
                    n_iter=n_iter-1,
                )

            else:
                return []


        # FIXME (most informative) decision like Numerical Probabilistics stuff for integrations
        # sobol initialization?

        sampled_x_ind = np.random.choice(
            x.shape[0],
            size=self.n_initial_points,
            replace=False,
        )
        print(sampled_x_ind)

        x_obs = x[sampled_x_ind]
        f_x_obs = list(map(cross_validation, x_obs))

        y_obs = np.atleast_2d(list(map(mean_mean_validation_scores, f_x_obs))).T

        out = sum(f_x_obs, []) + bo_(x_obs, y_obs, n_iter=self.n_iter)

        n_fits = len(out)

        scores = list()
        grid_scores = list()
        for grid_start in range(0, n_fits, n_splits):
            n_test_samples = 0
            score = 0
            all_scores = []
            for this_score, this_n_test_samples, _ , parameters in \
                    out[grid_start:grid_start + n_splits]:
                all_scores.append(this_score)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_splits)
            scores.append((score, parameters))

            grid_scores.append(_search._CVScoreTuple(
                parameters,
                score,
                np.array(all_scores)))

        self.grid_scores_ = grid_scores

        best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                      reverse=True)[0]

        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator

        return self
Example #33
    def _fit(self, X, y, groups=None, parameter_iterable=None, **fit_params):
        if groups is not None:
            raise NotImplementedError('The groups argument is not supported.')
        if parameter_iterable is not None:
            raise NotImplementedError('The parameter_iterable argument is not supported.')
        if self.fit_params is not None:
            fit_params = self.fit_params

        # Actual fitting,  performing the search over parameters.
        estimator = self.estimator
        cv = self.cv
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        n_samples = _num_samples(X)
        X, y = indexable(X, y)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number of samples (%i) than data (X: %i samples)'
                                 % (len(y), n_samples))

        n_folds, cv_iter = our_check_cv(cv, X, y, classifier=is_classifier(estimator))

        base_estimator = clone(self.estimator)
        pre_dispatch = self.pre_dispatch

        # setup SigOpt experiment and run optimization
        self._create_sigopt_exp(self.sigopt_connection)

        # start tracking time to optimize estimator
        opt_start_time = time.time()
        for jk in range(0, self.n_iter, self.n_sug):
            # check for opt timeout, ensuring at least 1 observation
            # TODO : handling failure observations
            if (
                self.opt_timeout is not None and
                time.time() - opt_start_time > self.opt_timeout and
                jk >= 1
            ):
                # break out of loop and refit model with best params so far
                break

            suggestions = []
            parameter_configs = []
            for _ in range(self.n_sug):
                suggestion = self.sigopt_connection.experiments(self.experiment.id).suggestions().create()
                parameters = self._convert_sigopt_api_to_sklearn_assignments(suggestion.assignments.to_json())
                suggestions.append(suggestion)
                parameter_configs.append(parameters)

            if self.verbose > 0:
                print('Evaluating params : ', parameter_configs)


            # do CV folds in parallel using joblib
            # returns scores on test set
            obs_timed_out = False
            try:
                par_kwargs = {'n_jobs': self.n_jobs, 'verbose': self.verbose,
                              'pre_dispatch': pre_dispatch}
                # add timeout kwarg if version of joblib supports it
                if 'timeout' in getfullargspec(Parallel.__init__).args:
                    par_kwargs['timeout'] = self.cv_timeout
                out = Parallel(
                    **par_kwargs
                )(
                    delayed(_fit_and_score)(clone(base_estimator), X, y,
                                            self.scorer_, train, test,
                                            self.verbose, parameters,
                                            fit_params,
                                            return_parameters=True,
                                            error_score=self.error_score)
                        for parameters in parameter_configs
                        for train, test in cv_iter)
            except TimeoutError:
                 obs_timed_out = True

            if not obs_timed_out:
                # grab scores from results
                for sidx, suggestion in enumerate(suggestions):
                    out_idx = sidx * n_folds
                    scores = [o[0] for o in out[out_idx:out_idx+n_folds]]
                    self.sigopt_connection.experiments(self.experiment.id).observations().create(
                        suggestion=suggestion.id,
                        value=numpy.mean(scores),
                        value_stddev=numpy.std(scores)
                    )
            else:
                # observation timed out so report a failure
                self.sigopt_connection.experiments(self.experiment.id).observations().create(
                    suggestion=suggestion.id,
                    failed=True)

        # return best SigOpt assignments so far
        best_assignments = self.sigopt_connection.experiments(self.experiment.id).best_assignments().fetch().data

        if not best_assignments:
            raise RuntimeError(
                'No valid observations found. '
                'Make sure opt_timeout and cv_timeout provide sufficient time for observations to be reported.')

        self.our_best_params_ = self._convert_sigopt_api_to_sklearn_assignments(
            best_assignments[0].assignments.to_json())
        self.our_best_score_ = best_assignments[0].value

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(**self.best_params_)
            if y is not None:
                best_estimator.fit(X, y, **fit_params)
            else:
                best_estimator.fit(X, **fit_params)
            self.our_best_estimator_ = best_estimator
        return self
Example #34
    def _fit(self, X, y, groups, parameter_iterable):

        estimator = self.estimator
        cv = check_cv(self.cv, y, classifier=is_classifier(estimator))

        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        X, y, groups = indexable(X, y, groups)
        n_splits = cv.get_n_splits(X, y, groups)
        
        if self.verbose > 0 and isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(n_splits, n_candidates,
                                     n_candidates * n_splits))

        base_estimator = clone(self.estimator)

        param_grid = [(parameters, train, test) for parameters in parameter_iterable
                                                for train, test in list(cv.split(X, y, groups))]
        # Because the original python code expects a certain order for the elements, we need to
        # respect it.
        indexed_param_grid = list(zip(range(len(param_grid)), param_grid))
        par_param_grid = self.sc.parallelize(indexed_param_grid, len(indexed_param_grid))
        X_bc = self.sc.broadcast(X)
        y_bc = self.sc.broadcast(y)

        scorer = self.scorer_
        verbose = self.verbose
        error_score = self.error_score
        fit_params = self.fit_params
        return_train_score = self.return_train_score
        fas = _fit_and_score

        def fun(tup):
            (index, (parameters, train, test)) = tup
            local_estimator = clone(base_estimator)
            local_X = X_bc.value
            local_y = y_bc.value
            res = fas(local_estimator, local_X, local_y, scorer, train, test, verbose,
                      parameters, fit_params,
                      return_train_score=return_train_score,
                      return_n_test_samples=True, return_times=True,
                      return_parameters=True, error_score=error_score)
            return (index, res)
        indexed_out0 = dict(par_param_grid.map(fun).collect())
        out = [indexed_out0[idx] for idx in range(len(param_grid))]
        if return_train_score:
            (train_scores, test_scores, test_sample_counts, fit_time,
             score_time, parameters) = zip(*out)
        else:
            (test_scores, test_sample_counts, fit_time, score_time, parameters) = zip(*out)
        X_bc.unpersist()
        y_bc.unpersist()

        candidate_params = parameters[::n_splits]
        n_candidates = len(candidate_params)

        results = dict()

        def _store(key_name, array, weights=None, splits=False, rank=False):
            """A small helper to store the scores/times to the cv_results_"""
            # When iterated first by splits, then by parameters
            array = np.array(array, dtype=np.float64).reshape(n_candidates,
                                                              n_splits)
            if splits:
                for split_i in range(n_splits):
                    results["split%d_%s"
                            % (split_i, key_name)] = array[:, split_i]

            array_means = np.average(array, axis=1, weights=weights)
            results['mean_%s' % key_name] = array_means
            # Weighted std is not directly available in numpy
            array_stds = np.sqrt(np.average((array -
                                             array_means[:, np.newaxis]) ** 2,
                                            axis=1, weights=weights))
            results['std_%s' % key_name] = array_stds

            if rank:
                results["rank_%s" % key_name] = np.asarray(
                    rankdata(-array_means, method='min'), dtype=np.int32)

        # Computed the (weighted) mean and std for test scores alone
        # NOTE test_sample counts (weights) remain the same for all candidates
        test_sample_counts = np.array(test_sample_counts[:n_splits],
                                      dtype=int)

        _store('test_score', test_scores, splits=True, rank=True,
               weights=test_sample_counts if self.iid else None)
        if self.return_train_score:
            _store('train_score', train_scores, splits=True)
        _store('fit_time', fit_time)
        _store('score_time', score_time)

        best_index = np.flatnonzero(results["rank_test_score"] == 1)[0]
        best_parameters = candidate_params[best_index]

        # Use one MaskedArray and mask all the places where the param is not
        # applicable for that candidate. Use defaultdict as each candidate may
        # not contain all the params
        param_results = defaultdict(partial(MaskedArray,
                                            np.empty(n_candidates,),
                                            mask=True,
                                            dtype=object))
        for cand_i, params in enumerate(candidate_params):
            for name, value in params.items():
                # An all masked empty array gets created for the key
                # `"param_%s" % name` at the first occurrence of `name`.
                # Setting the value at an index also unmasks that index
                param_results["param_%s" % name][cand_i] = value

        results.update(param_results)

        # Store a list of param dicts at the key 'params'
        results['params'] = candidate_params

        self.cv_results_ = results
        self.best_index_ = best_index
        self.n_splits_ = n_splits

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best_parameters)
            if y is not None:
                best_estimator.fit(X, y, **fit_params)
            else:
                best_estimator.fit(X, **fit_params)
            self.best_estimator_ = best_estimator
        return self
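# A minimal, standalone sketch of the reduction that the _store helper above
# performs: hypothetical per-fold test scores for two candidates over three
# splits, reduced to the weighted mean, the manually computed weighted std,
# and the rank used to locate the best candidate.
import numpy as np
from scipy.stats import rankdata

n_candidates, n_splits = 2, 3
test_scores = np.array([0.80, 0.82, 0.78,   # candidate 0, one score per split
                        0.90, 0.88, 0.86])  # candidate 1
weights = np.array([100, 100, 50], dtype=float)  # test-fold sizes (iid=True case)

array = test_scores.reshape(n_candidates, n_splits)
means = np.average(array, axis=1, weights=weights)
# Weighted std is not available directly in numpy, hence the explicit formula.
stds = np.sqrt(np.average((array - means[:, np.newaxis]) ** 2,
                          axis=1, weights=weights))
ranks = rankdata(-means, method='min').astype(np.int32)
print(means, stds, ranks)  # rank 1 marks the best candidate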
Example #35
    def _fit(self, X, y, parameter_iterable):
        """Actual fitting,  performing the search over parameters."""

        estimator = self.estimator
        cv = self.cv
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        n_samples = _num_samples(X)
        X, y = indexable(X, y)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)'
                                 % (len(y), n_samples))
        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print("Fitting {0} folds for each of {1} candidates, totalling"
                      " {2} fits".format(len(cv), n_candidates,
                                         n_candidates * len(cv)))

        base_estimator = clone(self.estimator)

        param_grid = [(parameters, train, test)
                      for parameters in parameter_iterable
                      for (train, test) in cv]
        # Because the original python code expects a certain order for the elements, we need to
        # respect it.
        indexed_param_grid = list(zip(range(len(param_grid)), param_grid))
        par_param_grid = self.sc.parallelize(indexed_param_grid, len(indexed_param_grid))
        X_bc = self.sc.broadcast(X)
        y_bc = self.sc.broadcast(y)

        scorer = self.scorer_
        verbose = self.verbose
        fit_params = self.fit_params
        error_score = self.error_score
        fas = _fit_and_score

        def fun(tup):
            (index, (parameters, train, test)) = tup
            local_estimator = clone(base_estimator)
            local_X = X_bc.value
            local_y = y_bc.value
            res = fas(local_estimator, local_X, local_y, scorer, train, test, verbose,
                                  parameters, fit_params,
                                  return_parameters=True, error_score=error_score)
            return (index, res)
        indexed_out0 = dict(par_param_grid.map(fun).collect())
        out = [indexed_out0[idx] for idx in range(len(param_grid))]

        X_bc.unpersist()
        y_bc.unpersist()

        # Out is a list of triplet: score, estimator, n_test_samples
        n_fits = len(out)
        n_folds = len(cv)

        scores = list()
        grid_scores = list()
        for grid_start in range(0, n_fits, n_folds):
            n_test_samples = 0
            score = 0
            all_scores = []
            for this_score, this_n_test_samples, _, parameters in \
                    out[grid_start:grid_start + n_folds]:
                all_scores.append(this_score)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_folds)
            scores.append((score, parameters))
            # TODO: shall we also store the test_fold_sizes?
            grid_scores.append(_CVScoreTuple(
                parameters,
                score,
                np.array(all_scores)))
        # Store the computed scores
        self.grid_scores_ = grid_scores

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
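# A self-contained sketch of the aggregation loop above, using made-up
# (score, n_test_samples) pairs for a single parameter setting: folds are
# weighted by their test size when iid is True, otherwise averaged uniformly.
import numpy as np

n_folds, iid = 3, True
fold_results = [(0.80, 100), (0.90, 100), (0.70, 50)]  # hypothetical folds

score, n_test_samples, all_scores = 0.0, 0, []
for this_score, this_n_test_samples in fold_results:
    all_scores.append(this_score)
    if iid:
        this_score *= this_n_test_samples
        n_test_samples += this_n_test_samples
    score += this_score
score /= float(n_test_samples) if iid else float(n_folds)
print(score, np.array(all_scores))  # mean validation score and raw fold scores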
Example #36
def _fit(self, X, y, parameter_iterable, en_celery=False):
  """Actual fitting,  performing the search over parameters."""

  estimator = self.estimator
  cv = self.cv
  self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

  n_samples = _num_samples(X)
  X, y = indexable(X, y)

  if y is not None:
      if len(y) != n_samples:
          raise ValueError('Target variable (y) has a different number '
                           'of samples (%i) than data (X: %i samples)'
                           % (len(y), n_samples))
  cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

  if self.verbose > 0:
      if isinstance(parameter_iterable, Sized):
          n_candidates = len(parameter_iterable)
          print("Fitting {0} folds for each of {1} candidates, totalling"
                " {2} fits".format(len(cv), n_candidates,
                                   n_candidates * len(cv)))

  base_estimator = clone(self.estimator)

  pre_dispatch = self.pre_dispatch

  if en_celery:
    out = []
    timestamp = datetime.now().strftime("%Y%m%d%H%M%s")
    key = "sample_%s_%s" % (timestamp, int(round(random.random(), 8)*1e8))
    red.set(key, pickle.dumps({'X': X, 'y': y}))
    grp = group(cjobs.fas_mp.s(clone(base_estimator), key, self.scorer_,
                                train, test, self.verbose, parameters,
                                self.fit_params, return_parameters=True,
                                error_score=self.error_score)
            for parameters in parameter_iterable
            for train, test in cv)()
    out = grp.get()
    red.delete(key)
  else:
    out = Parallel(
        n_jobs=self.n_jobs, verbose=self.verbose,
        pre_dispatch=pre_dispatch
    )(
        delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
                                train, test, self.verbose, parameters,
                                self.fit_params, return_parameters=True,
                                error_score=self.error_score)
            for parameters in parameter_iterable
            for train, test in cv)

  # Out is a list of triplet: score, estimator, n_test_samples
  n_fits = len(out)
  n_folds = len(cv)

  scores = list()
  grid_scores = list()
  for grid_start in range(0, n_fits, n_folds):
      n_test_samples = 0
      score = 0
      all_scores = []
      for this_score, this_n_test_samples, _, parameters in \
              out[grid_start:grid_start + n_folds]:
          all_scores.append(this_score)
          if self.iid:
              this_score *= this_n_test_samples
              n_test_samples += this_n_test_samples
          score += this_score
      if self.iid:
          score /= float(n_test_samples)
      else:
          score /= float(n_folds)
      scores.append((score, parameters))
      # TODO: shall we also store the test_fold_sizes?
      grid_scores.append(_CVScoreTuple(
          parameters,
          score,
          np.array(all_scores)))
  # Store the computed scores
  self.grid_scores_ = grid_scores

  # Find the best parameters by comparing on the mean validation score:
  # note that `sorted` is deterministic in the way it breaks ties
  best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                reverse=True)[0]
  self.best_params_ = best.parameters
  self.best_score_ = best.mean_validation_score

  if self.refit:
      # fit the best estimator using the entire dataset
      # clone first to work around broken estimators
      best_estimator = clone(base_estimator).set_params(
          **best.parameters)
      if y is not None:
          best_estimator.fit(X, y, **self.fit_params)
      else:
          best_estimator.fit(X, **self.fit_params)
      self.best_estimator_ = best_estimator
  return self
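# When en_celery is False the example falls back to joblib's Parallel/delayed
# idiom, the same pattern stock scikit-learn uses. A minimal sketch of that
# idiom with a toy stand-in for _fit_and_score (one task per candidate/fold
# pair, dispatched lazily from a generator expression):
from joblib import Parallel, delayed

def toy_fit_and_score(candidate, fold):
    return {"candidate": candidate, "fold": fold, "score": 1.0 / (candidate + fold + 1)}

candidates, folds = [0, 1, 2], [0, 1]
out = Parallel(n_jobs=2, pre_dispatch="2*n_jobs")(
    delayed(toy_fit_and_score)(c, f)
    for c in candidates
    for f in folds)
print(len(out))  # n_candidates * n_folds results, in submission order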
Example #37
def _indexable(x):
    return indexable(x)[0]
    def fit(self, X, y=None, x_is_index=False, X_name='X', y_name='y'):

        """Actual fitting,  performing the search over parameters."""
        parameter_iterable = ParameterGrid(self.param_grid)

        estimator = self.estimator
        cv = self.cv
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        n_samples = _num_samples(X)

        X, y = indexable(X, y)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)'
                                 % (len(y), n_samples))
        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print("Fitting {0} folds for each of {1} candidates, totalling"
                      " {2} fits".format(len(cv), n_candidates,
                                         n_candidates * len(cv)))

        base_estimator = clone(self.estimator)


        # out = Parallel(
        #     n_jobs=self.n_jobs, verbose=self.verbose,
        #     pre_dispatch=pre_dispatch
        # )(
        #     delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
        #                             train, test, self.verbose, parameters,
        #                             self.fit_params, return_parameters=True,
        #                             error_score=self.error_score)
        #         for parameters in parameter_iterable
        #         for train, test in cv)

        train_test_parameters = ((train, test, parameters) \
                                 for parameters in parameter_iterable for train, test in cv)

        length = len(parameter_iterable) * len(cv)

        if x_is_index:
            X_to_pass = X
            y_to_pass = None
        else:
            X_to_pass = None
            y_to_pass = None

        self.view.block = False
        # print('sequences')

        # sequences = [
        #     train_test_parameters,
        #     [clone(base_estimator)] * length,
        #     [X_to_pass] * length,
        #     [y_to_pass] * length,
        #     [self.verbose] * length,
        #     [self.fit_params] * length,
        #     [True] * length,
        #     [self.scorer_] * length,
        #     [x_is_index] * length,
        # ]

        f = partial(my_fit_and_score, estimator=clone(base_estimator),
                    X=X_to_pass,
                    y=y_to_pass,
                    verbose=self.verbose,
                    fit_params=self.fit_params,
                    return_parameters=True,
                    scorer=None,
                    x_is_index=x_is_index,
                    names=(X_name, y_name))

        # print('before map')

        # import cProfile
        #
        # pr = cProfile.Profile()
        # pr.enable()
        chunksize = 10

        out = self.view.map(f, itertools.islice(train_test_parameters, 0, length),
                            ordered=False,
                            block=False,
                            chunksize=chunksize)  # length / len(self.view))
        # pr.disable()
        # pr.print_stats('cumulative')
        print('map called')
        if self.callback is not None:
            old_progress = out.progress
            while not out.ready():
                self.callback(out.progress * chunksize, length, out.elapsed)
                if old_progress == out.progress and out.progress > 0:
                    for id, info in self.view.queue_status(verbose=True).items():
                        # print(id, info)
                        if isinstance(info, dict) and 'queue' in info and len(info['queue']) > 0:
                            print(id, info['queue'])

                    pass
                old_progress = out.progress
                sleep(10)
        print('map ready')
        out = out.get()


        # Out is a list of triplet: score, estimator, n_test_samples
        n_fits = len(out)
        n_folds = len(cv)

        scores = list()
        grid_scores = list()
        for grid_start in range(0, n_fits, n_folds):
            n_test_samples = 0
            score = 0
            all_scores = []
            for this_score, this_n_test_samples, _, parameters in \
                    out[grid_start:grid_start + n_folds]:
                all_scores.append(this_score)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_folds)
            scores.append((score, parameters))
            # TODO: shall we also store the test_fold_sizes?
            grid_scores.append(_CVScoreTuple(
                parameters,
                score,
                np.array(all_scores)))
        # Store the computed scores
        self.grid_scores_ = grid_scores

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
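# The train_test_parameters generator above enumerates the cross-product of
# parameter settings and CV folds before shipping it to the engines. A short
# sketch of that enumeration with ParameterGrid, a concrete splitter, and toy
# data shows how the task count (length) comes about.
import numpy as np
from sklearn.model_selection import KFold, ParameterGrid

X = np.arange(20).reshape(10, 2)
y = np.arange(10)

param_grid = ParameterGrid({"C": [0.1, 1.0], "kernel": ["linear", "rbf"]})
cv_iter = list(KFold(n_splits=5).split(X, y))
tasks = [(train, test, params) for params in param_grid for train, test in cv_iter]
print(len(param_grid), len(cv_iter), len(tasks))  # 4 candidates * 5 folds = 20 tasks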
Example #39
    def _fit(self, X, y, parameter_dict):

        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        n_samples = _num_samples(X)
        X, y = indexable(X, y)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)'
                                 % (len(y), n_samples))
        cv = check_cv(self.cv, y=y, classifier=is_classifier(self.estimator))

        creator.create("FitnessMax", base.Fitness, weights=(1.0,))
        creator.create("Individual", list, est=clone(self.estimator), fitness=creator.FitnessMax)

        toolbox = base.Toolbox()

        name_values, gene_type, maxints = _get_param_types_maxint(parameter_dict)
        if self.gene_type is None:
            self.gene_type = gene_type

        if self.verbose:
            print("Types %s and maxint %s detected" % (self.gene_type, maxints))

        toolbox.register("individual", _initIndividual, creator.Individual, maxints=maxints)
        toolbox.register("population", tools.initRepeat, list, toolbox.individual)

        toolbox.register("evaluate", _evalFunction, searchobj=self,
                         name_values=name_values, X=X, y=y,
                         scorer=self.scorer_, cv=cv, iid=self.iid, verbose=self.verbose,
                         error_score=self.error_score, fit_params=self.fit_params)

        toolbox.register("mate", _cxIndividual, indpb=self.gene_crossover_prob, gene_type=self.gene_type)

        toolbox.register("mutate", _mutIndividual, indpb=self.gene_mutation_prob, up=maxints)
        toolbox.register("select", tools.selTournament, tournsize=self.tournament_size)

        if self.n_jobs > 1:
            pool = Pool(processes=self.n_jobs)
            toolbox.register("map", pool.map)
        pop = toolbox.population(n=self.population_size)
        hof = tools.HallOfFame(1)
        stats = tools.Statistics(lambda ind: ind.fitness.values)
        stats.register("avg", np.mean)
        stats.register("min", np.min)
        stats.register("max", np.max)

        if self.verbose:
            print('--- Evolve in {0} possible combinations ---'.format(np.prod(np.array(maxints)+1)))

        pop, logbook = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2,
                                           ngen=self.generations_number, stats=stats,
                                           halloffame=hof, verbose=self.verbose)

        current_best_score_ = hof[0].fitness.values[0]
        current_best_params_ = _individual_to_params(hof[0], name_values)

        if self.verbose:
            print("Best individual is: %s\nwith fitness: %s" % (
                current_best_params_, current_best_score_)
                  )
            print("Scoring evaluations: %d, Cache hits: %d, Total: %d" % (
                self.num_evaluations, self.num_cache_hits, self.num_evaluations + self.num_cache_hits))

        if current_best_score_ > self.best_score_:
            self.best_score_ = current_best_score_
            self.best_params_ = current_best_params_
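# _individual_to_params is not shown in this example; in the evolutionary
# search it maps an integer-encoded individual back onto concrete parameter
# values. An illustrative re-implementation under the assumption that
# name_values is a list of (name, candidate_values) pairs:
def individual_to_params(individual, name_values):
    # Each gene is an index into that parameter's list of candidate values.
    return {name: values[gene]
            for gene, (name, values) in zip(individual, name_values)}

name_values = [("C", [0.1, 1.0, 10.0]), ("kernel", ["linear", "rbf"])]
print(individual_to_params([1, 0], name_values))  # {'C': 1.0, 'kernel': 'linear'}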
Example #40
def _maybe_indexable(x):
    return indexable(x)[0] if _is_arraylike(x) else x
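# indexable() normalizes its arguments so they support indexing and len();
# _maybe_indexable simply skips that conversion for scalars and other
# non-array inputs. A quick sketch of the behaviour with toy inputs:
from sklearn.utils import indexable

X_list = [[0, 1], [2, 3], [4, 5]]
X_checked, = indexable(X_list)       # one entry returned per argument
print(len(X_checked), X_checked[0])  # 3 [0, 1]

scalar = 42                          # not array-like, so it would pass through untouched
print(hasattr(scalar, "__len__") or hasattr(scalar, "shape"))  # False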
Example #41
    def _extendedFit(self, X, y, parameter_iterable):
        estimator = self.estimator
        cv = self.cv
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        n_samples = _num_samples(X)
        X, y = indexable(X, y)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)'
                                 % (len(y), n_samples))
        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print("Fitting {0} folds for each of {1} candidates, totalling"
                      " {2} fits".format(len(cv), n_candidates,
                                         n_candidates * len(cv)))

        base_estimator = clone(self.estimator)

        pre_dispatch = self.pre_dispatch

        out = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=pre_dispatch
        )(
            delayed(_extended_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
                                    train, test, self.verbose, parameters,
                                    self.fit_params, return_parameters=True,
                                    error_score=self.error_score)
                for parameters in parameter_iterable
                for train, test in cv)

        # Out is a list of triplet: score, estimator, n_test_samples
        n_fits = len(out)
        n_folds = len(cv)

        scores = list()
        grid_scores = list()
        grid_extras = list()
        for grid_start in range(0, n_fits, n_folds):
            n_test_samples = 0
            score = 0
            all_scores = []
            all_extras = []
            for this_score, this_n_test_samples, _, parameters, extra in \
                    out[grid_start:grid_start + n_folds]:
                all_scores.append(this_score)
                all_extras.append(extra)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_folds)
            scores.append((score, parameters))
            # TODO: shall we also store the test_fold_sizes?
            grid_scores.append(_CVScoreTuple(
                parameters,
                score,
                np.array(all_scores)))
            grid_extras.append(all_extras)
        # Store the computed scores
        self.grid_scores_ = grid_scores
        self.extras_ = grid_extras

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            print("Refitting best estimator")
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
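# _CVScoreTuple comes from the pre-0.18 grid_search module; its only role here
# is to carry (parameters, mean_validation_score, cv_validation_scores) so the
# best setting can be picked with sorted(). An equivalent stand-in built from a
# plain namedtuple (illustrative, not the actual sklearn class):
from collections import namedtuple
import numpy as np

CVScoreTuple = namedtuple("CVScoreTuple",
                          ["parameters", "mean_validation_score", "cv_validation_scores"])
grid_scores = [CVScoreTuple({"C": 0.1}, 0.81, np.array([0.80, 0.82])),
               CVScoreTuple({"C": 1.0}, 0.88, np.array([0.87, 0.89]))]

# sorted() is stable, so ties keep their original (parameter) order.
best = sorted(grid_scores, key=lambda x: x.mean_validation_score, reverse=True)[0]
print(best.parameters, best.mean_validation_score)  # {'C': 1.0} 0.88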
Example #42
    def _fit(self, X, y):

        """Actual fitting,  performing the search over parameters."""
        estimator = self.estimator
        cv = self.cv
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        n_samples = _num_samples(X)
        X, y = indexable(X, y)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)'
                                 % (len(y), n_samples))

        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        base_estimator = clone(self.estimator)
        pre_dispatch = self.pre_dispatch

        # setup SigOpt experiment and run optimization
        self._create_sigopt_exp()
        for jk in range(self.n_iter):
            suggestion = self.conn.experiments(self.experiment.id).suggestions().create()
            parameters = suggestion.assignments.to_json()
     
            # convert all unicode names and values to plain strings
            non_unicode_parameters = self._convert_unicode_dict(parameters)

            if self.verbose > 0:
                print("Evaluating params : ", non_unicode_parameters)

            # do CV folds in parallel using joblib
            # returns scores on test set
            out = Parallel(
                n_jobs=self.n_jobs, verbose=self.verbose,
                pre_dispatch=pre_dispatch
            )(
                delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
                                        train, test, self.verbose, non_unicode_parameters,
                                        self.fit_params, return_parameters=True,
                                        error_score=self.error_score)
                    for train, test in cv)

            # grab scores from results
            scores = [o[0] for o in out]
            self.conn.experiments(self.experiment.id).observations().create(
                suggestion=suggestion.id,
                value=numpy.mean(scores),
                value_stddev=numpy.std(scores)
            )
              
        # return best SigOpt observation so far
        best_obs = self.conn.experiments(self.experiment.id).fetch().progress.best_observation
        self.best_params_ = best_obs.assignments.to_json()
         # convert all unicode names and values to plain strings
        self.best_params_ = self._convert_unicode_dict(self.best_params_)
        self.best_score_ = best_obs.value

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **self.best_params_)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
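# Each SigOpt observation above is just the mean and standard deviation of the
# per-fold test scores for one suggested parameter setting. A standalone sketch
# with hypothetical fold results shaped like the (score, n_test_samples,
# scoring_time, parameters) tuples unpacked elsewhere in this listing:
import numpy as np

out = [(0.84, 30, 0.01, {"C": 1.0}),
       (0.88, 30, 0.01, {"C": 1.0}),
       (0.86, 30, 0.01, {"C": 1.0})]

scores = [o[0] for o in out]
value, value_stddev = np.mean(scores), np.std(scores)
print(value, value_stddev)  # reported back as a single observation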
Example #43
    def _fit(self, X, y, groups, parameter_iterable):
        """
        Actual fitting,  performing the search over parameters.
        Taken from https://github.com/scikit-learn/scikit-learn/blob/0.18.X
                    .../sklearn/model_selection/_search.py
        """

        estimator = self.estimator
        cv = sklearn.model_selection._validation.check_cv(
            self.cv, y, classifier=is_classifier(estimator))
        self.scorer_ = check_scoring(
            self.estimator, scoring=self.scoring)

        X, y, groups = indexable(X, y, groups)
        n_splits = cv.get_n_splits(X, y, groups)
        if self.verbose > 0 and isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(n_splits, n_candidates,
                                     n_candidates * n_splits))

        base_estimator = clone(self.estimator)
        pre_dispatch = self.pre_dispatch

        cv_iter = list(cv.split(X, y, groups))
        out = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=pre_dispatch
        )(delayed(sklearn.model_selection._validation._fit_and_score)(
                clone(base_estimator),
                X, y, self.scorer_,
                train, test, self.verbose, parameters,
                fit_params=self.fit_params,
                return_train_score=self.return_train_score,
                return_n_test_samples=True,
                return_times=True, return_parameters=True,
                error_score=self.error_score
            )
            for parameters in parameter_iterable
            for train, test in cv_iter
        )

        # if one choose to see train score, "out" will contain train score info
        if self.return_train_score:
            (train_scores, test_scores, test_sample_counts,
             fit_time, score_time, parameters) = zip(*out)
        else:
            (test_scores, test_sample_counts,
             fit_time, score_time, parameters) = zip(*out)

        candidate_params = parameters[::n_splits]
        n_candidates = len(candidate_params)

        results = dict()

        def _store(key_name, array, weights=None, splits=False, rank=False):
            """A small helper to store the scores/times to the cv_results_"""
            array = np.array(array, dtype=np.float64).reshape(n_candidates,
                                                              n_splits)
            if splits:
                for split_i in range(n_splits):
                    results["split%d_%s"
                            % (split_i, key_name)] = array[:, split_i]

            array_means = np.average(array, axis=1, weights=weights)
            results['mean_%s' % key_name] = array_means
            # Weighted std is not directly available in numpy
            array_stds = np.sqrt(np.average((array -
                                             array_means[:, np.newaxis]) ** 2,
                                            axis=1, weights=weights))
            results['std_%s' % key_name] = array_stds

            if rank:
                results["rank_%s" % key_name] = np.asarray(
                    rankdata(-array_means, method='min'), dtype=np.int32)

        # Computed the (weighted) mean and std for test scores alone
        # NOTE test_sample counts (weights) remain the same for all candidates
        test_sample_counts = np.array(test_sample_counts[:n_splits],
                                      dtype=int)

        _store('test_score', test_scores, splits=True, rank=True,
               weights=test_sample_counts if self.iid else None)
        if self.return_train_score:
            _store('train_score', train_scores, splits=True)
        _store('fit_time', fit_time)
        _store('score_time', score_time)

        best_index = np.flatnonzero(results["rank_test_score"] == 1)[0]
        best_parameters = candidate_params[best_index]

        # Use one MaskedArray and mask all the places where the param is not
        # applicable for that candidate. Use defaultdict as each candidate may
        # not contain all the params
        param_results = defaultdict(partial(
                                            MaskedArray,
                                            np.empty(n_candidates,),
                                            mask=True,
                                            dtype=object))
        for cand_i, params in enumerate(candidate_params):
            for name, value in params.items():
                # An all masked empty array gets created for the key
                # `"param_%s" % name` at the first occurrence of `name`.
                # Setting the value at an index also unmasks that index
                param_results["param_%s" % name][cand_i] = value

        results.update(param_results)

        # Store a list of param dicts at the key 'params'
        results['params'] = candidate_params

        self.cv_results_ = results
        self.best_index_ = best_index
        self.n_splits_ = n_splits

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best_parameters)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
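# A compact, standalone sketch of the masked-array bookkeeping above: one fully
# masked param_* column per parameter name, so candidates that do not share the
# same parameters can still live in a single cv_results_ table. Toy candidates,
# real numpy.ma.MaskedArray.
from collections import defaultdict
from functools import partial

import numpy as np
from numpy.ma import MaskedArray

candidate_params = [{"C": 0.1}, {"C": 1.0, "gamma": 0.01}]
n_candidates = len(candidate_params)

param_results = defaultdict(partial(MaskedArray,
                                    np.empty(n_candidates,),
                                    mask=True,
                                    dtype=object))
for cand_i, params in enumerate(candidate_params):
    for name, value in params.items():
        # Setting a cell also unmasks it; absent parameters stay masked.
        param_results["param_%s" % name][cand_i] = value

print(param_results["param_gamma"])  # [-- 0.01]: masked where gamma was absent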