def test_check_scoring():
    """Test all branches of check_scoring"""
    estimator = EstimatorWithoutFit()
    pattern = (r"estimator should a be an estimator implementing 'fit' method,"
               r" .* was passed")
    assert_raises_regexp(TypeError, pattern, check_scoring, estimator)

    estimator = EstimatorWithFitAndScore()
    estimator.fit([[1]], [1])
    scorer = check_scoring(estimator)
    assert_almost_equal(scorer(estimator, [[1]], [1]), 1.0)

    estimator = EstimatorWithFitAndPredict()
    estimator.fit([[1]], [1])
    pattern = (r"If no scoring is specified, the estimator passed should have"
               r" a 'score' method\. The estimator .* does not\.")
    assert_raises_regexp(TypeError, pattern, check_scoring, estimator)

    scorer = check_scoring(estimator, "accuracy")
    assert_almost_equal(scorer(estimator, [[1]], [1]), 1.0)

    estimator = EstimatorWithFit()
    pattern = (r"The estimator passed should have a 'score'"
               r" or a 'predict' method\. The estimator .* does not\.")
    assert_raises_regexp(TypeError, pattern, check_scoring, estimator,
                         "accuracy")

    estimator = EstimatorWithFit()
    scorer = check_scoring(estimator, allow_none=True)
    assert_true(scorer is None)
Example #2
    def fit_ipp(self, X, y, grid):
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
        if self.grid_parallel:
            scores, grid_scores = grid_cv_scores(self.estimator, X, y, grid, self.scoring, self.cv,
                    self.profile, self.n_jobs, self.verbose)
        else:
            scores = []
            # grid =  grid_search.ParameterGrid(self.param_grid)
            grid_scores = []
            for parameters in grid:
                self.estimator.set_params(**parameters)
                scores_cv = cross_val_score(self.estimator, X, y, self.scoring, self.cv, profile=self.profile)
                scores.append(np.array(scores_cv).mean())
                grid_scores.append(grid_search._CVScoreTuple(
                    parameters,
                    scores_cv.mean(),
                    scores_cv))

        max_idx = np.array(scores).argmax()
        self.best_estimator_ = self.estimator.set_params(**list(grid)[max_idx])
        self.best_params_ = list(grid)[max_idx]
        self.scores_ = scores
        self.best_score_ = np.array(scores).max()
        self.grid_scores_ = grid_scores

        if self.refit:
            self.best_estimator_.fit(X, y)

        return self
Example #3
def permutation_test_score(estimator, X, y, groups=None, cv=None,
                           n_permutations=100, n_jobs=1, random_state=0,
                           verbose=0, scoring=None):
    """
    Evaluate the significance of a cross-validated score with permutations,
    as in test 1 of [Ojala2010]_.

    A modified version of scikit-learn's ``permutation_test_score`` that only
    returns the permutation scores, so that the p-value can be evaluated outside
    this function and the scores can be reused.


    .. [Ojala2010] Ojala and Garriga. Permutation Tests for Studying Classifier
                   Performance.  The Journal of Machine Learning Research (2010)
                   vol. 11

    """
    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    scorer = check_scoring(estimator, scoring=scoring)
    random_state = check_random_state(random_state)

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose)(
        delayed(_permutation_test_score)(
            clone(estimator), X, _shuffle(y, groups, random_state),
            groups, cv, scorer)
        for _ in range(n_permutations))
    permutation_scores = np.array(permutation_scores)
    return permutation_scores
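# Usage sketch (illustrative, not part of the original snippet): the docstring
# above says the p-value is computed outside this function, so a caller can
# derive it from the returned permutation distribution. The data, model, and
# alias below are demonstration assumptions; the snippet's own module-level
# imports (Parallel, _permutation_test_score, _shuffle, ...) are assumed to be
# in scope.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score as sk_cross_val_score

X_demo, y_demo = make_classification(n_samples=100, random_state=0)
clf = LogisticRegression()

# Reference score on the unpermuted labels.
reference_score = sk_cross_val_score(clf, X_demo, y_demo, cv=5).mean()

# Permutation distribution from the function defined above.
permutation_scores = permutation_test_score(clf, X_demo, y_demo, cv=5,
                                            n_permutations=100)

# Standard permutation p-value: (count of permuted scores >= reference + 1) / (n + 1).
pvalue = (np.sum(permutation_scores >= reference_score) + 1.0) / (
    permutation_scores.shape[0] + 1)
print("permutation p-value: %.3f" % pvalue)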
Example #4
def cross_val_score(estimator, X, y=None, groups=None, scoring=None, cv=None,
                    n_jobs=1, verbose=0, fit_params=None,
                    pre_dispatch='2*n_jobs'):
    """
    Evaluate a score by cross-validation
    """
    if not isinstance(scoring, (list, tuple)):
        scoring = [scoring]

    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    splits = list(cv.split(X, y, groups))
    scorer = [check_scoring(estimator, scoring=s) for s in scoring]
    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)
    scores = parallel(delayed(_fit_and_score)(clone(estimator), X, y, scorer,
                                              train, test, verbose, None,
                                              fit_params)
                      for train, test in splits)

    group_order = []
    if hasattr(cv, 'groups'):
        group_order = [np.array(cv.groups)[test].tolist()[0] for _, test in splits]
    return np.squeeze(np.array(scores)), group_order
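# Usage sketch (illustrative, not part of the original snippet): this variant
# accepts a list of scoring names and returns a (scores, group_order) tuple. It
# assumes the project's own imports (Parallel, delayed, _fit_and_score, check_cv,
# check_scoring, indexable) are in scope and that its _fit_and_score accepts a
# list of scorers. LinearSVC and make_classification are demonstration choices.
from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC

X_demo, y_demo = make_classification(n_samples=60, random_state=0)
scores, group_order = cross_val_score(LinearSVC(), X_demo, y_demo,
                                      scoring=["accuracy", "f1"], cv=3)
# One entry per fold; the exact shape depends on what this project's
# _fit_and_score returns per fold. group_order stays empty unless the CV
# splitter exposes a `groups` attribute.
print(scores, group_order)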
Example #5
def plot_auc_curve(estimator, X_test, y_test):

    y_score = estimator.predict_proba(X_test)
    scorer = check_scoring(estimator, scoring=None)

    scorer(estimator, X_test, y_test)

    fpr = dict()
    tpr = dict()
    roc_auc = dict()

    fpr[0], tpr[0], _ = roc_curve(y_test, y_score[:, 1])
    roc_auc[0] = auc(fpr[0], tpr[0])

    plt.figure()
    plt.plot(fpr[0], tpr[0], label='ROC curve (area = %0.2f)' % roc_auc[0])
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

    return roc_auc[0]
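# Usage sketch (illustrative, not part of the original snippet): the helper
# above expects a fitted binary classifier exposing predict_proba, plus the
# roc_curve/auc/plt imports it relies on. The data and model below are
# demonstration assumptions.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X_demo, y_demo = make_classification(n_samples=200, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X_demo, y_demo, random_state=0)
clf = LogisticRegression().fit(X_train, y_train)
auc_value = plot_auc_curve(clf, X_test, y_test)  # draws the ROC curve and returns its AUC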
Example #6
    def score(self, X, y):
        """Score each of the estimators on the tested dimensions.

        Parameters
        ----------
        X : array, shape (n_samples, nd_features, n_slices)
            The input samples. For each data slice, the corresponding estimator
            scores the prediction, e.g.:
            ``[estimators[ii].score(X[..., ii], y) for ii in range(n_slices)]``.
            The feature dimension can be multidimensional e.g.
            ``X.shape = (n_samples, n_features_1, n_features_2, n_estimators)``.
        y : array, shape (n_samples,) | (n_samples, n_targets)
            The target values.

        Returns
        -------
        score : array, shape (n_samples, n_estimators, n_slices)
            Score for each estimator / data slice couple.
        """  # noqa: E501
        from sklearn.metrics.scorer import check_scoring
        self._check_Xy(X)
        # For predictions/transforms the parallelization is across the data and
        # not across the estimators to avoid memory load.
        parallel, p_func, n_jobs = parallel_func(_gl_score, self.n_jobs)
        n_jobs = min(n_jobs, X.shape[-1])
        X_splits = np.array_split(X, n_jobs, axis=-1)
        scoring = check_scoring(self.base_estimator, self.scoring)
        y = _fix_auc(scoring, y)
        score = parallel(p_func(self.estimators_, scoring, x, y)
                         for x in X_splits)

        score = np.concatenate(score, axis=1)
        return score
Example #7
def test_check_scoring_gridsearchcv():
    # test that check_scoring works on GridSearchCV and pipeline.
    # slightly redundant non-regression test.

    grid = GridSearchCV(LinearSVC(), param_grid={"C": [0.1, 1]})
    scorer = check_scoring(grid, "f1")
    assert_true(isinstance(scorer, _PredictScorer))

    pipe = make_pipeline(LinearSVC())
    scorer = check_scoring(pipe, "f1")
    assert_true(isinstance(scorer, _PredictScorer))

    # check that cross_val_score definitely calls the scorer
    # and doesn't make any assumptions about the estimator apart from having a
    # fit.
    scores = cross_val_score(EstimatorWithFit(), [[1], [2], [3]], [1, 0, 1], scoring=DummyScorer())
    assert_array_equal(scores, 1)
Example #8
    def _score(self, X, y, scoring=None, clf=None):
        from sklearn.model_selection._validation import _score

        if scoring is None:
            scoring = self._scorer

        if clf is None:
            clf = self._estimator

        return _score(clf, X, y, check_scoring(clf, scoring=scoring))
Example #9
    def _scoring(self, net, X_test, y_test):
        """Resolve scoring and apply it to data. Use cached prediction
        instead of running inference again, if available."""
        scorer = check_scoring(net, self.scoring)
        scores = _score(
            estimator=net,
            X_test=X_test,
            y_test=y_test,
            scorer=scorer,
            is_multimetric=False,
        )
        return scores
Example #10
def train_predict(descriptions_models, X_train, y_train, X_valid, y_valid, scoring=None):
    """Run preliminary performance analyses of multiple machine learning models.

    Parameters
    ----------
    descriptions_models : Iterable of 2-tuples (str, object)
        Each 2-tuple element contains descriptive text and a model object.
        i.e. [('Model1 info', model1), ('Model2 info', model2), ...]

    X_train : pandas.DataFrame
        Training features data

    y_train : pandas.Series
        Training target data

    X_valid, y_valid : same as X_train, y_train, but used for validation

    scoring : str, callable or None, default=None
        See the `scoring` parameter description of
        sklearn.grid_search.GridSearchCV.

    Returns
    -------
    df_summary : pandas.DataFrame
        Performance summary of all the models
    """

    results = []
    for description, model in descriptions_models:

        scorer = check_scoring(model, scoring=scoring)
        result = {"description": description}

        # Train
        start = time.time()
        model.fit(X_train, y_train)
        result["time_train"] = time.time() - start

        # Predict train
        start = time.time()
        result["score_train"] = scorer(model, X_train, y_train)
        result["time_predict_train"] = time.time() - start

        # Predict validation
        start = time.time()
        result["score_valid"] = scorer(model, X_valid, y_valid)
        result["time_predict_valid"] = time.time() - start

        results.append(result)

    return pd.DataFrame(results)[
        ["description", "score_train", "score_valid", "time_train", "time_predict_train", "time_predict_valid"]
    ]
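# Usage sketch (illustrative, not part of the original snippet): compare two
# models on a train/validation split. The helper relies on time, pandas and
# check_scoring being importable in its module; the models and data below are
# demonstration assumptions.
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X_demo, y_demo = make_classification(n_samples=300, random_state=0)
X_df, y_sr = pd.DataFrame(X_demo), pd.Series(y_demo)
X_train, X_valid, y_train, y_valid = train_test_split(X_df, y_sr, random_state=0)

df_summary = train_predict(
    [("Logistic regression", LogisticRegression()),
     ("Decision tree", DecisionTreeClassifier())],
    X_train, y_train, X_valid, y_valid, scoring="accuracy")
print(df_summary)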
Example #11
def retrain_estimator(estimator, X, y, n,
                      scoring=None,
                      fit_params=None,
                      split_params=None):
    scorer = check_scoring(estimator, scoring=scoring)
    split_params = split_params if split_params is not None else {}
    X_train, X_test, y_train, y_test = train_test_split(X, y, **split_params)  # FIXME: shuffles
    fit_params = fit_params if fit_params is not None else {}
    estimators = [clone(estimator).set_params(nn__random_state=i).fit(X_train, y_train, **fit_params)
                  for i in range(n)]
    scores = [scorer(e, X_test, y_test) for e in estimators]
    print(scores)
    return estimators[np.argmax(scores)]
Example #12
def _wrapped_cross_val_score(sklearn_pipeline, features, target,
                             cv, scoring_function, sample_weight=None, groups=None):
    """Fit estimator and compute scores for a given dataset split.
    Parameters
    ----------
    sklearn_pipeline : pipeline object implementing 'fit'
        The object to use to fit the data.
    features : array-like of shape at least 2D
        The data to fit.
    target : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.
    cv : int or cross-validation generator
        If cv is a number, then it is the number of folds to evaluate each
        pipeline over in k-fold cross-validation during the TPOT optimization
        process. If it is an object, it is used as a cross-validation generator.
    scoring_function : callable
        A scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
    sample_weight : array-like, optional
        List of sample weights to balance (or un-balance) the dataset target as needed
    groups : array-like, shape (n_samples,), optional
        Group labels for the samples used while splitting the dataset into train/test set
    """
    sample_weight_dict = set_sample_weight(sklearn_pipeline.steps, sample_weight)

    features, target, groups = indexable(features, target, groups)

    cv = check_cv(cv, target, classifier=is_classifier(sklearn_pipeline))
    cv_iter = list(cv.split(features, target, groups))
    scorer = check_scoring(sklearn_pipeline, scoring=scoring_function)

    try:
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            scores = [_fit_and_score(estimator=clone(sklearn_pipeline),
                                    X=features,
                                    y=target,
                                    scorer=scorer,
                                    train=train,
                                    test=test,
                                    verbose=0,
                                    parameters=None,
                                    fit_params=sample_weight_dict)
                                for train, test in cv_iter]
            CV_score = np.array(scores)[:, 0]
            return np.nanmean(CV_score)
    except TimeoutException:
        return "Timeout"
    except Exception as e:
        return -float('inf')
Example #13
def test_check_scoring():
    """Test all branches of check_scoring"""
    estimator = EstimatorWithoutFit()
    assert_raise_message(TypeError, "'fit' method", check_scoring, estimator)

    estimator = EstimatorWithFitAndScore()
    estimator.fit([[1]], [1])
    scorer = check_scoring(estimator)
    assert_almost_equal(scorer(estimator, [[1]], [1]), 1.0)

    estimator = EstimatorWithFitAndPredict()
    estimator.fit([[1]], [1])
    assert_raise_message(TypeError, "no scoring", check_scoring, estimator)

    scorer = check_scoring(estimator, "accuracy")
    assert_almost_equal(scorer(estimator, [[1]], [1]), 1.0)

    estimator = EstimatorWithFit()
    assert_raise_message(TypeError, "'score' or a 'predict'", check_scoring,
                         estimator, "accuracy")

    estimator = EstimatorWithFit()
    scorer = check_scoring(estimator, allow_none=True)
    assert_true(scorer is None)
Example #14
    def fit(self, X, y):
        """Fit the model to the training data."""
        X, y = check_X_y(X, y, force_all_finite=False,
                         multi_output=self.multi_output)

        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        if RANK == 0:
            if self.experiments_folder is not None:
                assert_path(self.experiments_folder)

            self._fit_master(X, y)
        else:
            self._fit_slave(X, y)

        return self
Example #15
    def fit(self, X, y=None, groups=None, **fit_params):
        """Run fit with all sets of parameters.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.
        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
            Target relative to X for classification or regression;
            None for unsupervised learning.
        groups : array-like, shape = [n_samples], optional
            Group labels for the samples used while splitting the dataset into
            train/test set.
        **fit_params
            Parameters passed to the ``fit`` method of the estimator
        """
        estimator = self.estimator
        self.scorer_ = check_scoring(estimator, scoring=self.scoring)
        error_score = self.error_score
        if not (isinstance(error_score, numbers.Number) or
                error_score == 'raise'):
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value.")

        dsk, keys, n_splits = build_graph(estimator, self.cv, self.scorer_,
                                          list(self._get_param_iterator()),
                                          X, y, groups, fit_params,
                                          iid=self.iid,
                                          refit=self.refit,
                                          error_score=error_score,
                                          return_train_score=self.return_train_score,
                                          cache_cv=self.cache_cv)
        self.dask_graph_ = dsk
        self.n_splits_ = n_splits

        n_jobs = _normalize_n_jobs(self.n_jobs)
        scheduler = _normalize_scheduler(self.scheduler, n_jobs)

        out = scheduler(dsk, keys, num_workers=n_jobs)

        self.cv_results_ = results = out[0]
        self.best_index_ = np.flatnonzero(results["rank_test_score"] == 1)[0]

        if self.refit:
            self.best_estimator_ = out[1]
        return self
Example #16
    def fit(self, X, y):
        """Fit the model to the training data."""
        X, y = check_X_y(X, y, force_all_finite=False,
                         multi_output=self.multi_output)
        _check_param_grid(self.param_grid)

        # cv = _check_cv(self.cv, X, y, classifier=is_classifier(self.estimator))
        cv = _check_cv(self.cv, y, classifier=is_classifier(self.estimator))

        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        if comm_rank == 0:
            self._fit_master(X, y, cv)
        else:
            self._fit_slave()

        return self
Example #17
    def score(self, X, y):
        """Score each estimator on each task.

        The number of tasks in X should match the number of tasks/estimators
        given at fit time, i.e. we need
        ``X.shape[-1] == len(self.estimators_)``.

        Parameters
        ----------
        X : array, shape (n_samples, nd_features, n_tasks)
            The input samples. For each data slice, the corresponding estimator
            scores the prediction, e.g.:
            ``[estimators[ii].score(X[..., ii], y) for ii in range(n_estimators)]``.
            The feature dimension can be multidimensional e.g.
            X.shape = (n_samples, n_features_1, n_features_2, n_tasks)

        y : array, shape (n_samples,) | (n_samples, n_targets)
            The target values.

        Returns
        -------
        score : array, shape (n_samples, n_estimators)
            Score for each estimator/task.
        """  # noqa: E501
        from sklearn.metrics.scorer import check_scoring

        self._check_Xy(X)
        if X.shape[-1] != len(self.estimators_):
            raise ValueError('The number of estimators does not match '
                             'X.shape[-1]')

        scoring = check_scoring(self.base_estimator, self.scoring)
        y = _fix_auc(scoring, y)

        # For predictions/transforms the parallelization is across the data and
        # not across the estimators to avoid memory load.
        parallel, p_func, n_jobs = parallel_func(_sl_score, self.n_jobs)
        n_jobs = min(n_jobs, X.shape[-1])
        X_splits = np.array_split(X, n_jobs, axis=-1)
        est_splits = np.array_split(self.estimators_, n_jobs)
        score = parallel(p_func(est, scoring, x, y)
                         for (est, x) in zip(est_splits, X_splits))

        score = np.concatenate(score, axis=0)
        return score
Example #18
    def _optimize_n_neighbors(self, X, y):
        print('Auto optimizing n_neighbors using ' + str(self.n_neighbor_candidates))
        X_train, X_validate, y_train, y_validate = self._get_split(X, y)
        estimator = copy.copy(self)
        estimator.auto_optimize_k = False
        estimator.fit(X_train, y_train)
        scorer = check_scoring(estimator, scoring=self.scoring)
        configs = []
        for n_neighbors in self.n_neighbor_candidates:
            estimator.n_neighbors = n_neighbors
            score = scorer(estimator, X_validate, y_validate)
            print('N_neighbors = ' + str(n_neighbors) + ' score: ' + str(self.scoring) + ' ' + str(score))
            configs.append((n_neighbors, score))

        configs = sorted(configs, key=lambda i: i[1], reverse=True)
        print('Configs in order of score: ')
        pprint.pprint(configs)
        self.n_neighbors = configs[0][0]
Example #19
def get_cv_classifier(classifier, cv):
  if classifier["name"] == 'linear-ridge':
    c = RidgeClassifier()
  elif classifier["name"] == 'SVC':
    c = SVC()
  elif classifier["name"] == "l2-SVC":
    c = L2KernelClassifier()
  elif classifier["name"] == "fredholm":
    c = L2FredholmClassifier()
  elif classifier["name"] == "TSVM":
    c = SVMLight()
  elif classifier["name"] == "Lap-RLSC":
    c = LapRLSC()
  elif classifier["name"] == "fred_kernel_appr":
    c = FredholmKernelApprClassifier()
  else:
    raise NameError('Not existing classifier: ' + classifier["name"] + '.')
  return GridSearchCV(c, classifier["params_grid"], scoring=check_scoring(c),
                      fit_params={}, n_jobs=classifier["n_jobs"], cv=cv)
Example #20
def cross_val_score_filter_feature_selection(model, filter_function, filter_criteria,
                                              X, y, scoring=None, cv=None, n_jobs=1,
                                              verbose=0, fit_params=None,
                                              pre_dispatch='2*n_jobs'):

    X, y = indexable(X, y)

    cv = check_cv(cv, X, y, classifier=is_classifier(model))
    scorer = check_scoring(model, scoring=scoring)
    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)

    scores = parallel(delayed(_fit_and_score)(clone(model), filter_function(X, y, train, filter_criteria), y, scorer,
                                              train, test, verbose, None,
                                              fit_params)
                      for train, test in cv)

    return np.array(scores)[:, 0]
Example #21
    def fit(self, X, Y):
        if not self.best_subset:
            self.fshape = np.shape(X)[1]
            self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

            self.cv = check_cv(self.cv, X, Y, classifier=is_classifier(self.estimator))

            self.best_subset = tuple()
            self.best_subset_score = 0
            self.scores_ = {self.best_subset: self.best_subset_score}
            X = np.array(X)
            Y = np.array(Y)

            try:
                self.get_best_subset(X, Y)
            except KeyboardInterrupt:
                pass
        self.estimator = self.estimator.fit(X[:, self.best_subset], Y)
        return self
Example #22
    def score(self, X, y):
        """Score each of the estimators on the tested dimensions.

        Parameters
        ----------
        X : array, shape (n_samples, nd_features, n_slices)
            The input samples. For each data slice, the corresponding estimator
            scores the prediction, e.g.:
            ``[estimators[ii].score(X[..., ii], y) for ii in range(n_slices)]``.
            The feature dimension can be multidimensional e.g.
            ``X.shape = (n_samples, n_features_1, n_features_2, n_estimators)``.
        y : array, shape (n_samples,) | (n_samples, n_targets)
            The target values.

        Returns
        -------
        score : array, shape (n_samples, n_estimators, n_slices)
            Score for each estimator / data slice couple.
        """  # noqa: E501
        from sklearn.metrics.scorer import check_scoring
        self._check_Xy(X)
        # For predictions/transforms the parallelization is across the data and
        # not across the estimators to avoid memory load.
        mesg = 'Scoring %s' % (self.__class__.__name__,)
        parallel, p_func, n_jobs = parallel_func(_gl_score, self.n_jobs,
                                                 verbose=False)
        n_jobs = min(n_jobs, X.shape[-1])
        scoring = check_scoring(self.base_estimator, self.scoring)
        y = _fix_auc(scoring, y)
        with ProgressBar(X.shape[-1] * len(self.estimators_),
                         verbose_bool='auto', mesg=mesg) as pb:
            score = parallel(p_func(self.estimators_, scoring, x, y,
                                    pb.subset(pb_idx))
                             for pb_idx, x in array_split_idx(
                                 X, n_jobs, axis=-1,
                                 n_per_split=len(self.estimators_)))

        score = np.concatenate(score, axis=1)
        return score
Example #23
    def fit(self, X, y):
        """Fit KNN model by choosing the best `n_neighbors`.

        Parameters
        -----------
        X : scipy.sparse matrix, (n_samples, vocab_size)
            Data
        y : ndarray, shape (n_samples,) or (n_samples, n_targets)
            Target
        """
        if self.n_neighbors_try is None:
            n_neighbors_try = range(1, 6)
        else:
            n_neighbors_try = self.n_neighbors_try

        X = check_array(X, accept_sparse='csr', copy=True)
        X = normalize(X, norm='l1', copy=False)

        cv = check_cv(self.cv, X, y)
        knn = KNeighborsClassifier(metric='precomputed', algorithm='brute')
        scorer = check_scoring(knn, scoring=self.scoring)

        scores = []
        for train_ix, test_ix in cv:
            dist = self._pairwise_wmd(X[test_ix], X[train_ix])
            knn.fit(X[train_ix], y[train_ix])
            scores.append([
                scorer(knn.set_params(n_neighbors=k), dist, y[test_ix])
                for k in n_neighbors_try
            ])
        scores = np.array(scores)
        self.cv_scores_ = scores

        best_k_ix = np.argmax(np.mean(scores, axis=0))
        best_k = n_neighbors_try[best_k_ix]
        self.n_neighbors = self.n_neighbors_ = best_k

        return super(WordMoversKNNCV, self).fit(X, y)
Example #24
    def _fit(self, X, y, groups, parameter_iterable):
        """Actual fitting,  performing the search over parameters."""

        base_estimator = clone(self.estimator)
        cv = check_cv(self.cv, y, classifier=is_classifier(base_estimator))
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        X, y, groups = indexable(X, y, groups)
        n_splits = cv.get_n_splits(X, y, groups)
        if self.verbose > 0 and isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(n_splits, n_candidates,
                                     n_candidates * n_splits))

        cv_iter = list(cv.split(X, y, groups))

        # Original: joblib
        # out = Parallel(
        #     n_jobs=self.n_jobs, verbose=self.verbose,
        #     pre_dispatch=pre_dispatch
        # )(delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
        #                           train, test, self.verbose, parameters,
        #                           fit_params=self.fit_params,
        #                           return_train_score=self.return_train_score,
        #                           return_n_test_samples=True,
        #                           return_times=True, return_parameters=True,
        #                           error_score=self.error_score)
        #   for parameters in parameter_iterable
        #   for train, test in cv_iter)

        name = ''.join(
            random.choice(string.ascii_uppercase + string.digits)
            for _ in range(10))
        tempfolder = os.path.join(fastr.config.mounts['tmp'], 'GS', name)
        if not os.path.exists(tempfolder):
            os.makedirs(tempfolder)

        # Create the parameter files
        parameter_files = dict()
        for num, parameters in enumerate(parameter_iterable):

            parameters["Number"] = str(num)

            # Convert parameter set to json
            fname = ('settings_{}.json').format(str(num))
            sourcename = os.path.join(tempfolder, 'parameters', fname)
            if not os.path.exists(os.path.dirname(sourcename)):
                os.makedirs(os.path.dirname(sourcename))
            with open(sourcename, 'w') as fp:
                json.dump(parameters, fp, indent=4)

            parameter_files[str(num)] = ('vfs://tmp/{}/{}/{}/{}').format(
                'GS', name, 'parameters', fname)

        # Create test-train splits
        traintest_files = dict()
        # TODO: ugly numbering solution
        num = 0
        for train, test in cv_iter:
            source_labels = ['train', 'test']

            source_data = pd.Series([train, test],
                                    index=source_labels,
                                    name='Train-test data')

            fname = ('traintest_{}.hdf5').format(str(num))
            sourcename = os.path.join(tempfolder, 'traintest', fname)
            if not os.path.exists(os.path.dirname(sourcename)):
                os.makedirs(os.path.dirname(sourcename))
            traintest_files[str(num)] = ('vfs://tmp/{}/{}/{}/{}').format(
                'GS', name, 'traintest', fname)

            sourcelabel = ("Source Data Iteration {}").format(str(num))
            source_data.to_hdf(sourcename, sourcelabel)

            num += 1

        # Create the files containing the estimator and settings
        estimator_labels = [
            'base_estimator', 'X', 'y', 'scorer', 'verbose', 'fit_params',
            'return_train_score', 'return_n_test_samples', 'return_times',
            'return_parameters', 'error_score'
        ]

        estimator_data = pd.Series([
            base_estimator, X, y, self.scorer_, self.verbose, self.fit_params,
            self.return_train_score, True, True, True, self.error_score
        ],
                                   index=estimator_labels,
                                   name='estimator Data')
        fname = 'estimatordata.hdf5'
        estimatorname = os.path.join(tempfolder, fname)
        estimator_data.to_hdf(estimatorname, 'Estimator Data')

        estimatordata = ("vfs://tmp/{}/{}/{}").format('GS', name, fname)

        # Create the fastr network
        network = fastr.Network('GridSearch_' + name)
        estimator_data = network.create_source('HDF5', id_='estimator_source')
        traintest_data = network.create_source('HDF5', id_='traintest')
        parameter_data = network.create_source('JsonFile', id_='parameters')
        sink_output = network.create_sink('HDF5', id_='output')

        fitandscore = network.create_node('fitandscore',
                                          memory='2G',
                                          id_='fitandscore')
        fitandscore.inputs['estimatordata'].input_group = 'estimator'
        fitandscore.inputs['traintest'].input_group = 'traintest'
        fitandscore.inputs['parameters'].input_group = 'parameters'

        fitandscore.inputs['estimatordata'] = estimator_data.output
        fitandscore.inputs['traintest'] = traintest_data.output
        fitandscore.inputs['parameters'] = parameter_data.output
        sink_output.input = fitandscore.outputs['fittedestimator']

        source_data = {
            'estimator_source': estimatordata,
            'traintest': traintest_files,
            'parameters': parameter_files
        }
        sink_data = {
            'output':
            ("vfs://tmp/{}/{}/output_{{sample_id}}_{{cardinality}}{{ext}}"
             ).format('GS', name)
        }

        network.execute(source_data,
                        sink_data,
                        tmpdir=os.path.join(tempfolder, 'GS', name, 'tmp'))

        # Read in the output data once finished
        # TODO: expanding fastr url is probably a nicer way
        sink_files = glob.glob(
            os.path.join(fastr.config.mounts['tmp'], 'GS', name) +
            '/output*.hdf5')
        save_data = list()
        feature_labels = list()
        for output in sink_files:
            data = pd.read_hdf(output)
            save_data.append(data['RET'])
            feature_labels.append(data['feature_labels'])

        # if one choose to see train score, "out" will contain train score info
        if self.return_train_score:
            (train_scores, test_scores, test_sample_counts, fit_time,
             score_time, parameters) = zip(*save_data)
        else:
            (test_scores, test_sample_counts, fit_time, score_time,
             parameters) = zip(*save_data)

        candidate_params = parameters[::n_splits]
        n_candidates = len(candidate_params)

        results = dict()

        def _store(key_name, array, weights=None, splits=False, rank=False):
            """A small helper to store the scores/times to the cv_results_"""
            array = np.array(array,
                             dtype=np.float64).reshape(n_candidates, n_splits)
            if splits:
                for split_i in range(n_splits):
                    results["split%d_%s" %
                            (split_i, key_name)] = array[:, split_i]

            array_means = np.average(array, axis=1, weights=weights)
            results['mean_%s' % key_name] = array_means
            # Weighted std is not directly available in numpy
            array_stds = np.sqrt(
                np.average((array - array_means[:, np.newaxis])**2,
                           axis=1,
                           weights=weights))
            results['std_%s' % key_name] = array_stds

            if rank:
                results["rank_%s" % key_name] = np.asarray(rankdata(
                    -array_means, method='min'),
                                                           dtype=np.int32)

        # Computed the (weighted) mean and std for test scores alone
        # NOTE test_sample counts (weights) remain the same for all candidates
        test_sample_counts = np.array(test_sample_counts[:n_splits],
                                      dtype=np.int)

        _store('test_score',
               test_scores,
               splits=True,
               rank=True,
               weights=test_sample_counts if self.iid else None)
        if self.return_train_score:
            _store('train_score', train_scores, splits=True)
        _store('fit_time', fit_time)
        _store('score_time', score_time)

        best_index = np.flatnonzero(results["rank_test_score"] == 1)[0]
        best_parameters = candidate_params[best_index]

        # Use one MaskedArray and mask all the places where the param is not
        # applicable for that candidate. Use defaultdict as each candidate may
        # not contain all the params
        param_results = defaultdict(
            partial(MaskedArray,
                    np.empty(n_candidates, ),
                    mask=True,
                    dtype=object))
        for cand_i, params in enumerate(candidate_params):
            for name, value in params.items():
                # An all masked empty array gets created for the key
                # `"param_%s" % name` at the first occurence of `name`.
                # Setting the value at an index also unmasks that index
                param_results["param_%s" % name][cand_i] = value

        results.update(param_results)

        # Store a list of param dicts at the key 'params'
        results['params'] = candidate_params
        results['feature_labels'] = feature_labels

        self.cv_results_ = results
        self.best_index_ = best_index
        self.n_splits_ = n_splits

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best_parameters)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
Example #25
    def fit(
            self,
            X,  # type: TwoDimArrayLikeType
            y=None,  # type: Optional[Union[OneDimArrayLikeType, TwoDimArrayLikeType]]
            groups=None,  # type: Optional[OneDimArrayLikeType]
            **fit_params  # type: Any
    ):
        # type: (...) -> 'OptunaSearchCV'
        """Run fit with all sets of parameters.

        Args:
            X:
                Training data.

            y:
                Target variable.

            groups:
                Group labels for the samples used while splitting the dataset
                into train/test set.

            **fit_params:
                Parameters passed to ``fit`` on the estimator.

        Returns:
            self:
                Return self.
        """

        self._check_params()

        random_state = check_random_state(self.random_state)
        max_samples = self.subsample
        n_samples = _num_samples(X)
        old_level = logger.getEffectiveLevel()

        if self.verbose > 1:
            logger.setLevel(DEBUG)
        elif self.verbose > 0:
            logger.setLevel(INFO)
        else:
            logger.setLevel(WARNING)

        self.sample_indices_ = np.arange(n_samples)

        if type(max_samples) is float:
            max_samples = int(max_samples * n_samples)

        if max_samples < n_samples:
            self.sample_indices_ = random_state.choice(self.sample_indices_,
                                                       max_samples,
                                                       replace=False)

            self.sample_indices_.sort()

        X_res = safe_indexing(X, self.sample_indices_)
        y_res = safe_indexing(y, self.sample_indices_)
        groups_res = safe_indexing(groups, self.sample_indices_)
        fit_params_res = fit_params

        if fit_params_res is not None:
            fit_params_res = {
                key: _index_param_value(X, value, self.sample_indices_)
                for key, value in fit_params.items()
            }

        classifier = is_classifier(self.estimator)
        cv = check_cv(self.cv, y_res, classifier)

        self.n_splits_ = cv.get_n_splits(X_res, y_res, groups=groups_res)
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        if self.study is None:
            seed = random_state.randint(0, np.iinfo('int32').max)
            sampler = samplers.TPESampler(seed=seed)

            self.study_ = study_module.create_study(direction='maximize',
                                                    sampler=sampler)

        else:
            self.study_ = self.study

        objective = _Objective(self.estimator, self.param_distributions, X_res,
                               y_res, cv, self.enable_pruning,
                               self.error_score, fit_params_res, groups_res,
                               self.max_iter, self.return_train_score,
                               self.scorer_)

        logger.info('Searching the best hyperparameters using {} '
                    'samples...'.format(_num_samples(self.sample_indices_)))

        self.study_.optimize(objective,
                             n_jobs=self.n_jobs,
                             n_trials=self.n_trials,
                             timeout=self.timeout)

        logger.info('Finished hyperparameter search!')

        if self.refit:
            self._refit(X, y, **fit_params)

        logger.setLevel(old_level)

        return self
Example #26
    def _extendedFit(self, X, y, parameter_iterable):
        estimator = self.estimator
        cv = self.cv
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        n_samples = _num_samples(X)
        X, y = indexable(X, y)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)' %
                                 (len(y), n_samples))
        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print(
                    "Fitting {0} folds for each of {1} candidates, totalling"
                    " {2} fits".format(len(cv), n_candidates,
                                       n_candidates * len(cv)))

        base_estimator = clone(self.estimator)

        pre_dispatch = self.pre_dispatch

        out = Parallel(
            n_jobs=self.n_jobs,
            verbose=self.verbose,
            pre_dispatch=pre_dispatch)(
                delayed(_extended_fit_and_score)(clone(base_estimator),
                                                 X,
                                                 y,
                                                 self.scorer_,
                                                 train,
                                                 test,
                                                 self.verbose,
                                                 parameters,
                                                 self.fit_params,
                                                 return_parameters=True,
                                                 error_score=self.error_score)
                for parameters in parameter_iterable for train, test in cv)

        # Out is a list of triplets: score, estimator, n_test_samples
        n_fits = len(out)
        n_folds = len(cv)

        scores = list()
        grid_scores = list()
        grid_extras = list()
        for grid_start in range(0, n_fits, n_folds):
            n_test_samples = 0
            score = 0
            all_scores = []
            all_extras = []
            for this_score, this_n_test_samples, _, parameters, extra in \
                    out[grid_start:grid_start + n_folds]:
                all_scores.append(this_score)
                all_extras.append(extra)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_folds)
            scores.append((score, parameters))
            # TODO: shall we also store the test_fold_sizes?
            grid_scores.append(
                _CVScoreTuple(parameters, score, np.array(all_scores)))
            grid_extras.append(all_extras)
        # Store the computed scores
        self.grid_scores_ = grid_scores
        self.extras_ = grid_extras

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        best = sorted(grid_scores,
                      key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            print "Refitting best estimator"
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
Example #27
    def fit(self, X, y=None, groups=None, **fit_params):
        """Run fit on the estimator with randomly drawn parameters.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.
        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
            Target relative to X for classification or regression;
            None for unsupervised learning.
        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.
        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of the estimator
        """
        estimator = self.estimator
        cv = check_cv(self.cv, y, classifier=is_classifier(estimator))
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
        self._random_state = check_random_state(self.random_state)
        X, y, groups = indexable(X, y, groups)
        n_splits = cv.get_n_splits(X, y, groups)
        R = list(self.cost_parameter_max.values())[0]

        if self.cost_parameter_min is None:
            Rmin = 1
        else:
            Rmin = list(self.cost_parameter_min.values())[0]

        if self.verbose > 0:
            n_candidates = hyperband_num_per_run(self.eta, R, Rmin)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(n_splits, n_candidates,
                                     n_candidates * n_splits))

        base_estimator = clone(self.estimator)
        pre_dispatch = self.pre_dispatch

        cv_iter = list(cv.split(X, y, groups))

        out = []
        smax = int(np.floor(np.log(R / Rmin) / np.log(self.eta)))
        B = (smax + 1.0) * R
        for s in range(smax, -1, -1):
            n = int(np.ceil(B / R * np.power(self.eta, s) / (s + 1.0)))
            r = int(R / np.power(self.eta, s))
            T = list(
                ParameterSampler(self.param_distributions,
                                 n,
                                 random_state=self._random_state))

            for i in range(0, s + 1):
                n_i = int(np.floor(n / np.power(self.eta, i)))
                r_i = int(r * np.power(self.eta, i))

                _jobs = []
                for parameters in T:
                    _parameters = copy.deepcopy(parameters)
                    _parameters.update(
                        {list(self.cost_parameter_max.keys())[0]: r_i})
                    for train, test in cv_iter:
                        _jobs.append(
                            delayed(_fit_and_score)(
                                clone(base_estimator),
                                X,
                                y,
                                self.scorer_,
                                train,
                                test,
                                self.verbose,
                                _parameters,
                                fit_params=fit_params,
                                return_train_score=self.return_train_score,
                                return_n_test_samples=True,
                                return_times=True,
                                return_parameters=True,
                                error_score=self.error_score))

                _out = Parallel(n_jobs=self.n_jobs,
                                verbose=self.verbose,
                                pre_dispatch=pre_dispatch)(_jobs)

                results, _ = self._process_outputs(_out, n_splits)
                num_to_keep = int(np.floor(n_i / self.eta))
                sind = np.argsort(results["rank_test_score"])
                msk = np.zeros(len(results['rank_test_score']))
                msk[sind[0:num_to_keep]] = 1
                msk = msk.astype(bool)
                T = [p for k, p in enumerate(results['params']) if msk[k]]

                out += _out

        results, best_index = self._process_outputs(out, n_splits)
        self.cv_results_ = results
        self.best_index_ = best_index
        self.n_splits_ = n_splits
        self.multimetric_ = False
        if not hasattr(self, 'best_score_'):
            self.best_score_ = results['mean_test_score'][best_index]
        if not hasattr(self, 'best_params_'):
            self.best_params_ = results['params'][best_index]

        if self.refit:
            best_estimator = clone(self.estimator).set_params(
                **self.cv_results_['params'][self.best_index_])

            if y is not None:
                best_estimator.fit(X, y, **fit_params)
            else:
                best_estimator.fit(X, **fit_params)

            self.best_estimator_ = best_estimator

        return self
Example #28
    def _fit(self, X, y):
        """Actual fitting,  performing the search over parameters."""
        estimator = self.estimator
        cv = self.cv
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        n_samples = _num_samples(X)
        X, y = indexable(X, y)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)' %
                                 (len(y), n_samples))

        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        base_estimator = clone(self.estimator)
        pre_dispatch = self.pre_dispatch

        # setup SigOpt experiment and run optimization
        self._create_sigopt_exp()
        for jk in range(self.n_iter):
            suggestion = self.conn.experiments(
                self.experiment.id).suggestions().create()
            parameters = suggestion.assignments.to_json()

            # convert all unicode names and values to plain strings
            non_unicode_parameters = self._convert_unicode_dict(parameters)

            if self.verbose > 0:
                print "Evaluating params : ", non_unicode_parameters

            # do CV folds in parallel using joblib
            # returns scores on test set
            out = Parallel(
                n_jobs=self.n_jobs,
                verbose=self.verbose,
                pre_dispatch=pre_dispatch)(
                    delayed(_fit_and_score)(clone(base_estimator),
                                            X,
                                            y,
                                            self.scorer_,
                                            train,
                                            test,
                                            self.verbose,
                                            non_unicode_parameters,
                                            self.fit_params,
                                            return_parameters=True,
                                            error_score=self.error_score)
                    for train, test in cv)

            # grab scores from results
            scores = [o[0] for o in out]
            self.conn.experiments(self.experiment.id).observations().create(
                suggestion=suggestion.id,
                value=numpy.mean(scores),
                value_stddev=numpy.std(scores))

        # return best SigOpt observation so far
        best_obs = self.conn.experiments(
            self.experiment.id).fetch().progress.best_observation
        self.best_params_ = best_obs.assignments.to_json()
        # convert all unicode names and values to plain strings
        self.best_params_ = self._convert_unicode_dict(self.best_params_)
        self.best_score_ = best_obs.value

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **self.best_params_)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
Example #29
def cross_val_score_track_trials(estimator,
                                 X,
                                 y=None,
                                 scoring=accuracy_score,
                                 cv=5,
                                 args_to_scorer=None):
    """
    Use the given estimator to perform fit and predict for splits defined by 'cv' and compute the given score on 
    each of the splits.

    Parameters
    ----------

    estimator: A valid sklearn_wrapper estimator
    X, y: Valid data and target values that work with the estimator
    scoring: string or a scorer object created using 
        https://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html#sklearn.metrics.make_scorer.
        A string from sklearn.metrics.SCORERS.keys() can be used or a scorer created from one of 
        sklearn.metrics (https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics).
        A completely custom scorer object can be created from a python function following the example at 
        https://scikit-learn.org/stable/modules/model_evaluation.html
        The metric has to return a scalar value.
    cv: an integer or an object that has a split function as a generator yielding (train, test) splits as arrays of indices.
        Integer value is used as number of folds in sklearn.model_selection.StratifiedKFold, default is 5.
        Note that any of the iterators from https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators can be used here.
    args_to_scorer: A dictionary of additional keyword arguments to pass to the scorer. 
                Used for cases where the scorer has a signature such as ``scorer(estimator, X, y, **kwargs)``.
    Returns
    -------
        result: a tuple of the mean score, mean log loss, and mean execution time across the cross-validation folds
    """
    if isinstance(cv, int):
        cv = StratifiedKFold(cv)

    if args_to_scorer is None:
        args_to_scorer = {}
    scorer = check_scoring(estimator, scoring=scoring)
    cv_results: List[float] = []
    log_loss_results = []
    time_results = []
    for train, test in cv.split(X, y):
        X_train, y_train = split_with_schemas(estimator, X, y, train)
        X_test, y_test = split_with_schemas(estimator, X, y, test, train)
        start = time.time()
        #Not calling sklearn.base.clone() here, because:
        #  (1) For Lale pipelines, clone() calls the pipeline constructor
        #      with edges=None, so the resulting topology is incorrect.
        #  (2) For Lale individual operators, the fit() method already
        #      clones the impl object, so cloning again is redundant.
        trained = estimator.fit(X_train, y_train)
        score_value = scorer(trained, X_test, y_test, **args_to_scorer)
        execution_time = time.time() - start
        # not all estimators have predict probability
        try:
            y_pred_proba = trained.predict_proba(X_test)
            logloss = log_loss(y_true=y_test, y_pred=y_pred_proba)
            log_loss_results.append(logloss)
        except BaseException:
            logger.debug("Warning, log loss cannot be computed")
        cv_results.append(score_value)
        time_results.append(execution_time)
    result = np.array(cv_results).mean(), np.array(
        log_loss_results).mean(), np.array(time_results).mean()
    return result
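# Usage sketch (illustrative, not part of the original snippet): with a
# Lale-wrapped operator, a call could look like the commented lines below. The
# operator name is an assumption, and the call relies on Lale's own helpers
# (split_with_schemas, etc.) being importable in this module.
#
#     from lale.lib.sklearn import LogisticRegression
#     mean_score, mean_log_loss, mean_time = cross_val_score_track_trials(
#         LogisticRegression(), X, y, scoring="accuracy", cv=5)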
Example #30
    def _fit(self, X, y, groups, parameter_iterable):

        estimator = self.estimator
        cv = check_cv(self.cv, y, classifier=is_classifier(estimator))

        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        X, y, groups = indexable(X, y, groups)
        n_splits = cv.get_n_splits(X, y, groups)
        
        if self.verbose > 0 and isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(n_splits, n_candidates,
                                     n_candidates * n_splits))

        base_estimator = clone(self.estimator)

        param_grid = [(parameters, train, test) for parameters in parameter_iterable
                                                for train, test in list(cv.split(X, y, groups))]
        # Because the original python code expects a certain order for the elements, we need to
        # respect it.
        indexed_param_grid = list(zip(range(len(param_grid)), param_grid))
        par_param_grid = self.sc.parallelize(indexed_param_grid, len(indexed_param_grid))
        X_bc = self.sc.broadcast(X)
        y_bc = self.sc.broadcast(y)

        scorer = self.scorer_
        verbose = self.verbose
        error_score = self.error_score
        fit_params = self.fit_params
        return_train_score = self.return_train_score
        fas = _fit_and_score

        def fun(tup):
            (index, (parameters, train, test)) = tup
            local_estimator = clone(base_estimator)
            local_X = X_bc.value
            local_y = y_bc.value
            res = fas(local_estimator, local_X, local_y, scorer, train, test, verbose,
                      parameters, fit_params,
                      return_train_score=return_train_score,
                      return_n_test_samples=True, return_times=True,
                      return_parameters=True, error_score=error_score)
            return (index, res)
        indexed_out0 = dict(par_param_grid.map(fun).collect())
        out = [indexed_out0[idx] for idx in range(len(param_grid))]
        if return_train_score:
            (train_scores, test_scores, test_sample_counts, fit_time,
             score_time, parameters) = zip(*out)
        else:
            (test_scores, test_sample_counts, fit_time, score_time, parameters) = zip(*out)
        X_bc.unpersist()
        y_bc.unpersist()

        candidate_params = parameters[::n_splits]
        n_candidates = len(candidate_params)

        results = dict()

        def _store(key_name, array, weights=None, splits=False, rank=False):
            """A small helper to store the scores/times to the cv_results_"""
            # When iterated first by splits, then by parameters
            array = np.array(array, dtype=np.float64).reshape(n_candidates,
                                                              n_splits)
            if splits:
                for split_i in range(n_splits):
                    results["split%d_%s"
                            % (split_i, key_name)] = array[:, split_i]

            array_means = np.average(array, axis=1, weights=weights)
            results['mean_%s' % key_name] = array_means
            # Weighted std is not directly available in numpy
            array_stds = np.sqrt(np.average((array -
                                             array_means[:, np.newaxis]) ** 2,
                                            axis=1, weights=weights))
            results['std_%s' % key_name] = array_stds

            if rank:
                results["rank_%s" % key_name] = np.asarray(
                    rankdata(-array_means, method='min'), dtype=np.int32)

        # Computed the (weighted) mean and std for test scores alone
        # NOTE test_sample counts (weights) remain the same for all candidates
        test_sample_counts = np.array(test_sample_counts[:n_splits],
                                      dtype=int)

        _store('test_score', test_scores, splits=True, rank=True,
               weights=test_sample_counts if self.iid else None)
        if self.return_train_score:
            _store('train_score', train_scores, splits=True)
        _store('fit_time', fit_time)
        _store('score_time', score_time)

        best_index = np.flatnonzero(results["rank_test_score"] == 1)[0]
        best_parameters = candidate_params[best_index]

        # Use one MaskedArray and mask all the places where the param is not
        # applicable for that candidate. Use defaultdict as each candidate may
        # not contain all the params
        param_results = defaultdict(partial(MaskedArray,
                                            np.empty(n_candidates,),
                                            mask=True,
                                            dtype=object))
        for cand_i, params in enumerate(candidate_params):
            for name, value in params.items():
                # An all masked empty array gets created for the key
                # `"param_%s" % name` at the first occurence of `name`.
                # Setting the value at an index also unmasks that index
                param_results["param_%s" % name][cand_i] = value

        results.update(param_results)

        # Store a list of param dicts at the key 'params'
        results['params'] = candidate_params

        self.cv_results_ = results
        self.best_index_ = best_index
        self.n_splits_ = n_splits

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best_parameters)
            if y is not None:
                best_estimator.fit(X, y, **fit_params)
            else:
                best_estimator.fit(X, **fit_params)
            self.best_estimator_ = best_estimator
        return self
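
The indexing dance above (enumerate the tasks, parallelize, collect into a dict keyed by index, then read the results back in order) exists only to restore the (parameters x splits) ordering that the later zip(*out) unpacking assumes. A minimal stand-alone illustration of the reordering step, with made-up values in place of the Spark collect():

# Collected results may arrive in any order; keying them by index lets the
# caller rebuild the ordering the unpacking code expects.
indexed_out0 = {2: "fold-2 result", 0: "fold-0 result", 1: "fold-1 result"}
out = [indexed_out0[idx] for idx in range(len(indexed_out0))]
print(out)  # ['fold-0 result', 'fold-1 result', 'fold-2 result']
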
Example #31
    def _fit(self, Z, parameter_iterable):
        """Actual fitting,  performing the search over parameters."""
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        cv = self.cv
        cv = _check_cv(cv, Z)

        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print("Fitting {0} folds for each of {1} candidates, totalling"
                      " {2} fits".format(len(cv), n_candidates,
                                         n_candidates * len(cv)))

        base_estimator = clone(self.estimator)

        pre_dispatch = self.pre_dispatch

        out = Parallel(
            n_jobs=self.n_jobs,
            verbose=self.verbose,
            pre_dispatch=pre_dispatch,
            backend="threading")(
                delayed(_fit_and_score)(clone(base_estimator),
                                        Z,
                                        self.scorer_,
                                        train,
                                        test,
                                        self.verbose,
                                        parameters,
                                        self.fit_params,
                                        return_parameters=True,
                                        error_score=self.error_score)
                for parameters in parameter_iterable for train, test in cv)

        # Out is a list of triplet: score, estimator, n_test_samples
        n_fits = len(out)
        n_folds = len(cv)

        scores = list()
        grid_scores = list()
        for grid_start in range(0, n_fits, n_folds):
            n_test_samples = 0
            score = 0
            all_scores = []
            for this_score, this_n_test_samples, _, parameters in \
                    out[grid_start:grid_start + n_folds]:
                all_scores.append(this_score)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_folds)
            scores.append((score, parameters))
            # TODO: shall we also store the test_fold_sizes?
            grid_scores.append(
                _CVScoreTuple(parameters, score, np.array(all_scores)))
        # Store the computed scores
        self.grid_scores_ = grid_scores

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        best = sorted(grid_scores,
                      key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            best_estimator.fit(Z, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
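
A worked illustration of the iid branch above: when iid=True each fold's score is weighted by its number of test samples, otherwise a plain mean over folds is used (numbers are made up).

import numpy as np

fold_scores = np.array([0.90, 0.60, 0.60])
n_test_samples = np.array([100, 50, 50])

iid_score = np.average(fold_scores, weights=n_test_samples)  # (90 + 30 + 30) / 200 = 0.75
plain_score = fold_scores.mean()                             # 0.70
print(iid_score, plain_score)
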
Example #32
def _wrapped_cross_val_score(sklearn_pipeline, features, target,
                             cv, scoring_function, sample_weight=None,
                             groups=None, use_dask=False):
    """Fit estimator and compute scores for a given dataset split.

    Parameters
    ----------
    sklearn_pipeline : pipeline object implementing 'fit'
        The object to use to fit the data.
    features : array-like of shape at least 2D
        The data to fit.
    target : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.
    cv: int or cross-validation generator
        If CV is a number, then it is the number of folds to evaluate each
        pipeline over in k-fold cross-validation during the TPOT optimization
        process. If it is an object, then it is used as a cross-validation
        generator.
    scoring_function : callable
        A scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
    sample_weight : array-like, optional
        List of sample weights to balance (or un-balance) the dataset target as needed
    groups: array-like {n_samples, }, optional
        Group labels for the samples used while splitting the dataset into train/test set
    use_dask : bool, default False
        Whether to use dask
    """
    sample_weight_dict = set_sample_weight(sklearn_pipeline.steps, sample_weight)

    features, target, groups = indexable(features, target, groups)

    cv = check_cv(cv, target, classifier=is_classifier(sklearn_pipeline))
    cv_iter = list(cv.split(features, target, groups))
    scorer = check_scoring(sklearn_pipeline, scoring=scoring_function)

    if use_dask:
        try:
            import dask_ml.model_selection  # noqa
            import dask  # noqa
            from dask.delayed import Delayed
        except ImportError:
            msg = "'use_dask' requires the optional dask and dask-ml depedencies."
            raise ImportError(msg)

        dsk, keys, n_splits = dask_ml.model_selection._search.build_graph(
            estimator=sklearn_pipeline,
            cv=cv,
            scorer=scorer,
            candidate_params=[{}],
            X=features,
            y=target,
            groups=groups,
            fit_params=sample_weight_dict,
            refit=False,
            error_score=float('-inf'),
        )

        cv_results = Delayed(keys[0], dsk)
        scores = [cv_results['split{}_test_score'.format(i)]
                  for i in range(n_splits)]
        CV_score = dask.delayed(np.array)(scores)[:, 0]
        return dask.delayed(np.nanmean)(CV_score)
    else:
        try:
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                scores = [_fit_and_score(estimator=clone(sklearn_pipeline),
                                         X=features,
                                         y=target,
                                         scorer=scorer,
                                         train=train,
                                         test=test,
                                         verbose=0,
                                         parameters=None,
                                         fit_params=sample_weight_dict)
                                    for train, test in cv_iter]
                CV_score = np.array(scores)[:, 0]
                return np.nanmean(CV_score)
        except TimeoutException:
            return "Timeout"
        except Exception as e:
            return -float('inf')
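
The slice np.array(scores)[:, 0] above relies on the (older, private) _fit_and_score return format, where each per-fold result starts with the test score. An illustrative sketch of that extraction with made-up values; the exact layout is version-dependent, so treat the row structure as an assumption.

import numpy as np

# Each row mimics one _fit_and_score result: [test_score, n_test_samples, fit_time, score_time]
scores = [
    [0.92, 30, 0.011, 0.002],
    [0.88, 30, 0.010, 0.002],
    [0.90, 30, 0.012, 0.002],
]
CV_score = np.array(scores)[:, 0]
print(np.nanmean(CV_score))  # 0.9
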
    def fit(self, X, y=None, labels=None):
        #return self._fit(
        #    X, y, labels,
        #    parameter_iterable # parameter_iterable, \in Sized, it actually does len(parameter_iterable) in _fit
        #)

        # FIXME code duplication from BaseSearchCV._fit
        estimator = self.estimator
        cv = _split.check_cv(self.cv, y, classifier=is_classifier(estimator))
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        n_samples = _num_samples(X)
        X, y, labels = indexable(X, y, labels)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                  'of samples (%i) than data (X: %i samples)'
                                  % (len(y), n_samples))

        n_splits = cv.get_n_splits(X, y, labels)

        if self.verbose > 0:
            # parameter_iterable is not defined in this method; derive the
            # number of grid candidates from param_grid instead.
            n_candidates = int(np.prod([len(v) for v in self.param_grid.values()]))
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(n_splits, n_candidates,
                                     n_candidates * n_splits))

        base_estimator = clone(self.estimator)

        pre_dispatch = self.pre_dispatch
        # FIXME how to handle pre_dispatch


        # FIXME recursively getting new parameters to evaluate

#        parameter_iterable = ...  # the magic
#
#        # The evaluation (Parallel) stuff
#        out = Parallel(
#            n_jobs=self.n_jobs, verbose=self.verbose,
#            pre_dispatch=pre_dispatch
#        )(delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
#                                  train, test, self.verbose, parameters,
#                                  self.fit_params, return_parameters=True,
#                                  error_score=self.error_score)
#            for parameters in parameter_iterable
#            for train, test in cv.split(X, y, labels))
#

        # n_fits on each (train, test)
        def cross_validation(raw_parameters):
            parameters = dict(zip(
                self.param_grid.keys(), raw_parameters
            ))  # TODO more robust way of doing this
            print(parameters)

            return Parallel(
                n_jobs=self.n_jobs, verbose=self.verbose,
                pre_dispatch=pre_dispatch
            )(delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
                                      train, test, self.verbose, parameters,
                                      self.fit_params, return_parameters=True,
                                      error_score=self.error_score)
               for train, test in cv.split(X, y, labels))

        x = cartesian_product(*self.param_grid.values())

        # FIXME implement as non-recursive
        def bo_(x_obs, y_obs, n_iter):
            if n_iter > 0:
                kernel = kernels.Matern() + kernels.WhiteKernel()
                gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=16)
                gp.fit(x_obs, 1-y_obs)

                a = a_EI(gp, x_obs=x_obs, y_obs=1-y_obs)

                argmax_f_x_ = x[np.argmax(a(x))]

                # heavy evaluation
                f_argmax_f_x_ = cross_validation(argmax_f_x_)

                y_ob = np.atleast_2d(mean_mean_validation_scores(f_argmax_f_x_)).T

                return f_argmax_f_x_ + bo_(
                    x_obs=np.vstack((x_obs, argmax_f_x_)),
                    y_obs=np.vstack((y_obs, y_ob)),
                    n_iter=n_iter-1,
                )

            else:
                return []


        # FIXME (most informative) decision, like numerical-probabilistic methods for integration
        # Sobol initialization?

        sampled_x_ind = np.random.choice(
            x.shape[0],
            size=self.n_initial_points,
            replace=False,
        )
        print(sampled_x_ind)

        x_obs = x[sampled_x_ind]
        f_x_obs = list(map(cross_validation, x_obs))

        y_obs = np.atleast_2d(list(map(mean_mean_validation_scores, f_x_obs))).T

        out = sum(f_x_obs, []) + bo_(x_obs, y_obs, n_iter=self.n_iter)

        n_fits = len(out)

        scores = list()
        grid_scores = list()
        for grid_start in range(0, n_fits, n_splits):
            n_test_samples = 0
            score = 0
            all_scores = []
            for this_score, this_n_test_samples, _ , parameters in \
                    out[grid_start:grid_start + n_splits]:
                all_scores.append(this_score)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_splits)
            scores.append((score, parameters))

            grid_scores.append(_search._CVScoreTuple(
                parameters,
                score,
                np.array(all_scores)))

        self.grid_scores_ = grid_scores

        best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                      reverse=True)[0]

        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator

        return self
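
The acquisition helper a_EI used above is not shown in this excerpt. For orientation, an expected-improvement acquisition for a GP that models 1 - score (a quantity to be minimized) commonly looks like the following sketch; this is an assumption about its shape, not the author's exact implementation.

import numpy as np
from scipy.stats import norm

def a_EI_sketch(gp, x_obs, y_obs, xi=0.01):
    """Return an acquisition function a(x) giving expected improvement."""
    y_best = np.min(y_obs)

    def acquisition(x):
        mu, sigma = gp.predict(x, return_std=True)
        sigma = np.maximum(sigma, 1e-12)  # guard against zero predictive std
        z = (y_best - mu - xi) / sigma
        return (y_best - mu - xi) * norm.cdf(z) + sigma * norm.pdf(z)

    return acquisition
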
Example #34
def _fit(self, X, y, parameter_iterable):
    """Actual fitting,  performing the search over parameters."""
    estimator = self.estimator
    foldsForEstimator = {}
    cv = self.cv

    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    n_samples = _num_samples(X)
    X, y = indexable(X, y)

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)' %
                             (len(y), n_samples))

    from collections.abc import Sized
    # Splits the data based on provided cross-validation splitting strategy.
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))
    if self.verbose > 0:
        if isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling \
                {2} fits".format(len(cv), n_candidates,
                                 n_candidates * len(cv)))

    base_estimator = clone(self.estimator)

    pre_dispatch = self.pre_dispatch

    # Change from original scikit code: adding a new argument,
    # foldsForEstimator, to the _fit_and_score function to track metadata
    # for each estimator, for each fold.
    # _fit_and_score fits the estimator and computes the score for a given
    # data-split, for given parameters.
    out = Parallel(n_jobs=self.n_jobs,
                   verbose=self.verbose,
                   pre_dispatch=pre_dispatch)(
                       delayed(_fit_and_score)(clone(base_estimator),
                                               X,
                                               y,
                                               self.scorer_,
                                               train,
                                               test,
                                               self.verbose,
                                               parameters,
                                               self.fit_params,
                                               foldsForEstimator,
                                               return_parameters=True,
                                               error_score=self.error_score)
                       for parameters in parameter_iterable
                       for train, test in cv)

    # Out is a list of triplet: score, estimator, n_test_samples
    n_fits = len(out)
    n_folds = len(cv)

    # Computes the scores for each of the folds, for all the possible
    # parameters, and stores them in grid_scores.
    scores = list()
    grid_scores = list()
    for grid_start in range(0, n_fits, n_folds):
        n_test_samples = 0
        score = 0
        all_scores = []
        for this_score, this_n_test_samples, _, parameters in out[
                grid_start:grid_start + n_folds]:
            all_scores.append(this_score)
            if self.iid:
                this_score *= this_n_test_samples
                n_test_samples += this_n_test_samples
            score += this_score
        if self.iid:
            score /= float(n_test_samples)
        else:
            score /= float(n_folds)
        scores.append((score, parameters))
        # TODO: shall we also store the test_fold_sizes?
        grid_scores.append(
            CVScoreTuple(parameters, score, np.array(all_scores)))

    # Store the computed scores
    self.grid_scores_ = grid_scores

    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores,
                  key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    self.best_params_ = best.parameters
    self.best_score_ = best.mean_validation_score

    if self.refit:
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(**best.parameters)
        if y is not None:
            best_estimator.fit(X, y, **self.fit_params)
        else:
            best_estimator.fit(X, **self.fit_params)
        self.best_estimator_ = best_estimator
    else:
        # If refit is False, best_estimator_ is unavailable and further
        # predictions can't be made on this instance.
        raise Warning(
            "Note: Refit has been set to false, which makes it impossible to "
            "make predictions using this GridSearchCV instance after fitting. "
            "Change refit to true to enable this")

    # Change from original scikit code:
    # Populate new field with necessary attributes for storing
    # cross-validation event
    self.grid_cv_event = [
        X, foldsForEstimator, 0,
        type_of_target(y), self.best_estimator_, self.best_estimator_, n_folds
    ]
    return self
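
The grid_scores loop above walks the flat out list in chunks of n_folds, so each chunk holds every fold of a single parameter candidate. A small stand-alone illustration of that chunking; the tuples mirror the (score, n_test_samples, _, parameters) layout with made-up values.

n_folds = 3
out = [(0.8, 30, None, {"C": 1}), (0.9, 30, None, {"C": 1}), (0.7, 30, None, {"C": 1}),
       (0.6, 30, None, {"C": 10}), (0.5, 30, None, {"C": 10}), (0.7, 30, None, {"C": 10})]

for grid_start in range(0, len(out), n_folds):
    chunk = out[grid_start:grid_start + n_folds]
    mean_score = sum(score for score, _, _, _ in chunk) / n_folds
    print(chunk[0][-1], mean_score)  # {'C': 1} ~0.8, then {'C': 10} 0.6
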
Example #35
    def get_scores(self, X, y, row_keys, scoring, collect_n=None):
        """
        Computes cross-validated prediction scores.

        Parameters
        ----------
        X : array-like
            The data to fit. Can be, for example, a list, an array of at
            least 2 dimensions, or a dictionary.

        y : array-like, optional, default: None
            The target variable to try to predict in the case of supervised learning.

        row_keys : list of strings
            List of transformers names. ``Pipeliner`` takes transformers
            from ``named_steps`` using keys from ``row_keys`` and creates
            pipeline to transform.

        scoring : string, callable or None, default=None
            A string (see model evaluation documentation) or a scorer
            callable object / function with signature
            ``scorer(estimator, X, y)``. If None, the score method of the
            estimator is used.

        collect_n : int, optional
            Number of times to repeat cross-validated prediction with varying
            random states, collecting and scoring the predictions each time.

        Returns
        -------
        scores : array-like
            Scores calculated on cross-validation.
        """
        columns = list(self.plan_table.columns)[-len(row_keys):]
        param_key = ''.join(row_keys) + str(scoring)

        steps = list()
        for row_key, column in zip(row_keys, columns):
            steps.append((row_key, self.named_steps[column][row_key]))

        steps[-1][1].set_params(**self.best_params[param_key])

        if not collect_n:
            scores = cross_val_score(Pipeline(steps),
                                     X,
                                     y,
                                     scoring=scoring,
                                     cv=self.eval_cv,
                                     n_jobs=-1)
        else:
            init_random_state = self.eval_cv.random_state
            scores = list()
            for i in range(collect_n):
                fold_prediction = cross_val_predict(Pipeline(steps),
                                                    X,
                                                    y,
                                                    cv=self.eval_cv,
                                                    n_jobs=-1)
                metric = check_scoring(steps[-1][1],
                                       scoring=scoring).__dict__['_score_func']
                scores.append(metric(y, fold_prediction))
                self.eval_cv.random_state += 1

            self.eval_cv.random_state = init_random_state
        return scores
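
The final block above reaches into the scorer's __dict__ for the private _score_func attribute so it can apply the bare metric to cross_val_predict output. Illustratively, for a string such as "accuracy" that attribute is the plain sklearn.metrics.accuracy_score function; since this is a non-public attribute, the behaviour may change between scikit-learn versions.

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, check_scoring

scorer = check_scoring(LogisticRegression(), scoring="accuracy")
metric = scorer.__dict__['_score_func']  # relies on a private attribute
print(metric is accuracy_score)          # expected True on current versions
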
Example #36
    def fit(self, X, y=None, groups=None, **fit_params):
        """
        Run fit with all sets of parameters.

        :param X: array-like, `shape = [n_samples, n_features]`
            Training vector, where n_samples is the number of samples and n_features is the number of features.
        :param y: array-like, `shape = [n_samples]` or `[n_samples, n_output]`, optional;
            Target relative to X for classification or regression; None for unsupervised learning.
        :param groups: array-like, with shape `(n_samples,)`, optional;
            Group labels for the samples used while splitting the dataset into train/test set.
        :param fit_params: dict of `string -> object`;
            Parameters passed to the fit method of the estimator
        :return: `self`
        """
        from random import uniform
        from numpy import array, unique, sqrt
        from sklearn.base import clone, is_classifier
        from sklearn.metrics.scorer import check_scoring
        from sklearn.model_selection._search import check_cv
        from sklearn.model_selection._validation import _fit_and_score

        # from lightgbm.sklearn import LightGBMError
        radius_list = []
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
        target_classes = []
        if y is not None:
            target_classes = unique(y)
        # if type(self.cv) is int:
        #    cv = ShuffleSplit(n_splits=self.cv, test_size=.25)
        # else:
        cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))
        x0_ = []
        self.bounds = list(self.bounds)
        for param in self.params_list:
            param_num_range = self.params[param]
            if param_num_range.VType != "hdreal":
                radius_list.append(
                    (param_num_range.upper - param_num_range.lower) / 2.0)
                if param in self.init:
                    if param_num_range.VType == "categorical":
                        x0_.append(
                            param_num_range.items.index(self.init[param]))
                    else:
                        x0_.append(self.init[param])
                else:
                    x0_.append(
                        uniform(param_num_range.lower, param_num_range.upper))
                self.bounds.append(param_num_range.bound_tuple)
            else:
                for i in range(param_num_range.n):
                    radius_list.append(
                        (param_num_range.bound_tuple[i][1] -
                         param_num_range.bound_tuple[i][0]) / 2.0)
                    if (param in self.init) and (i in self.init[param]):
                        x0_.append(self.init[param][i])
                    else:
                        x0_.append(
                            uniform(
                                param_num_range.bound_tuple[i][0],
                                param_num_range.bound_tuple[i][1],
                            ))
                self.bounds = self.bounds + list(param_num_range.bound_tuple)
        if self.radius is None:
            rds = 0.0
            for r in radius_list:
                rds += r**2
            self.radius = sqrt(rds)
        self.x0 = array(x0_)
        self.bounds = tuple(self.bounds)
        cv_dat = list(cv.split(X, y))

        def obj(x):
            cand_params = {}
            _idx = 0
            for _param in self.params_list:
                _param_num_range = self.params[_param]
                if _param_num_range.VType != "hdreal":
                    if _param_num_range.VType == "integer":
                        cand_params[_param] = int(round(x[_idx]))
                    elif _param_num_range.VType == "categorical":
                        cand_params[_param] = _param_num_range.items[int(
                            round(x[_idx]))]
                    else:
                        cand_params[_param] = x[_idx]
                    _idx += 1
                else:
                    _cls_dict = {}
                    for i_ in range(_param_num_range.n):
                        _cls_dict[target_classes[i_]] = x[_idx]
                        _idx += 1
                    cand_params[_param] = _cls_dict
            #cl = clone(self.estimator)
            #cl.set_params(**cand_params)
            score = 0
            n_test = 0

            def parallel_fit_score(cl, cand_params, X, y, scorer, train, test,
                                   verbose, fit_params, error_score):
                cl.set_params(**cand_params)
                try:
                    _score = _fit_and_score(
                        estimator=cl,
                        X=X,
                        y=y,
                        scorer=scorer,  #
                        train=train,
                        test=test,
                        verbose=verbose,  #
                        parameters=cand_params,
                        fit_params=fit_params,  #
                        error_score=error_score,  #
                    )[0]
                    return _score
                except ValueError:
                    if self.verbose > 1:
                        print("Model evaluation error")
                    else:
                        pass
                except:  # LightGBMError:
                    pass
                return None

            scores = Parallel(
                n_jobs=self.n_jobs,
                verbose=self.verbose,
                pre_dispatch=self.pre_dispatch)(
                    delayed(parallel_fit_score)(clone(self.estimator),
                                                cand_params=cand_params,
                                                X=X,
                                                y=y,
                                                scorer=self.scorer_,
                                                train=train,
                                                test=test,
                                                verbose=self.verbose,
                                                fit_params=self.fit_params,
                                                error_score=self.error_score)
                    for train, test in cv_dat)
            for sc in scores:
                if sc is not None:
                    score += sc
                    n_test += 1
            score = score / float(max(n_test, 1))
            return -score

        self.OPTIM = SurrogateSearch(
            obj,
            x0=self.x0,
            max_iter=self.max_iter,
            min_evals=self.min_evals,
            ineqs=self.ineqs,
            bounds=self.bounds,
            verbose=self.verbose,
            radius=self.radius,
            regressor=self.regressor,
            sampling=self.sampling,
            search_sphere=self.search_sphere,
            contraction=self.contraction,
            max_itr_no_prog=self.max_itr_no_prog,
            optimizer=self.optimizer,
            scipy_solver=self.scipy_solver,
            optimithon_dd_method=self.optimithon_dd_method,
            optimithon_difftool=self.optimithon_difftool,
            optimithon_t_method=self.optimithon_t_method,
            optimithon_ls_method=self.optimithon_ls_method,
            optimithon_ls_bt_method=self.optimithon_ls_bt_method,
            optimithon_br_func=self.optimithon_br_func,
            optimithon_penalty=self.optimithon_penalty,
            task_name=self.task_name,
            warm_start=self.warm_start,
            Continue=self.Continue,
        )
        x, scr = self.OPTIM()
        best_params_ = {}
        idx = 0
        for param in self.params_list:
            param_num_range = self.params[param]
            if param_num_range.VType != "hdreal":
                if param_num_range.VType == "integer":
                    best_params_[param] = int(round(x[idx]))
                elif param_num_range.VType == "categorical":
                    best_params_[param] = param_num_range.items[int(
                        round(x[idx]))]
                else:
                    best_params_[param] = x[idx]
                idx += 1
            else:
                cls_dict = {}
                for i in range(param_num_range.n):
                    cls_dict[target_classes[i]] = x[idx]
                    idx += 1
                best_params_[param] = cls_dict
        self.best_estimator_ = clone(self.estimator).set_params(**best_params_)
        self.best_estimator_score = scr
        self.best_score_ = scr
        return self
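
The decoding logic that appears twice above (inside obj and again after the search) turns the optimizer's continuous vector back into mixed-type hyper-parameters: integer values are rounded, and categorical values are looked up by rounded index. A stand-alone illustration with made-up parameter names:

x = [2.7, 1.2]  # raw optimizer output
param_types = [("n_estimators", "integer"), ("criterion", "categorical")]
items = {"criterion": ["gini", "entropy", "log_loss"]}

decoded = {}
for value, (name, vtype) in zip(x, param_types):
    if vtype == "integer":
        decoded[name] = int(round(value))                # 2.7 -> 3
    elif vtype == "categorical":
        decoded[name] = items[name][int(round(value))]   # 1.2 -> index 1 -> "entropy"
print(decoded)  # {'n_estimators': 3, 'criterion': 'entropy'}
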
Example #37
    def fit(self):
        LOG.info('Start fitting ...')

        gs_cv_params = {
            'n_jobs': self.n_jobs,
            'cv': _cv_build(self.cv_inner),
            'verbose': 0
        }

        zscore_cv_auc = []
        zscore_cv_acc = []
        split_id = 0
        for dozs in [False, True]:
            LOG.info('Generate %sz-scored sample ...', '' if dozs else 'non ')
            X, y, groups = self._generate_sample(zscored=dozs)

            # The inner CV loop is a grid search on clf_params
            LOG.info('Creating ModelAndGridSearchCV')
            inner_cv = ModelAndGridSearchCV(self.param, **gs_cv_params)

            # Some sklearn's validations
            scoring = check_scoring(inner_cv, scoring='roc_auc')
            cv_outer = check_cv(_cv_build(self.cv_outer),
                                y,
                                classifier=is_classifier(inner_cv))

            # Outer CV loop
            outer_cv_scores = []
            outer_cv_acc = []
            LOG.info('Starting nested cross-validation ...')
            for train, test in list(cv_outer.split(X, y, groups)):
                # Find the groups in the train set, in case inner CV is LOSO.
                fit_params = None
                if self.cv_inner.get('type') == 'loso':
                    train_groups = [groups[i] for i in train]
                    fit_params = {'groups': train_groups}

                result = nested_fit_and_score(clone(inner_cv),
                                              X,
                                              y,
                                              scoring,
                                              train,
                                              test,
                                              fit_params=fit_params,
                                              verbose=1)

                # Test group has no positive cases
                if result is None:
                    continue

                score, clf = result
                test_group = list(set(groups[i] for i in test))[0]
                self._models.append({
                    # 'clf_type': clf_str,
                    'zscored': int(dozs),
                    'outer_split_id': split_id,
                    'left-out-sites': self.sites[test_group],
                    'best_model': clf.best_model_,
                    'best_params': clf.best_params_,
                    'best_score': clf.best_score_,
                    'best_index': clf.best_index_,
                    'cv_results': clf.cv_results_,
                    'cv_scores': score['test']['roc_auc'],
                    'cv_accuracy': score['test']['accuracy'],
                    'cv_params': clf.cv_results_['params'],
                    'cv_auc_means': clf.cv_results_['mean_test_score'],
                    'cv_splits': {
                        'split%03d' % i: clf.cv_results_['split%d_test_score' % i]
                        for i in range(clf.n_splits_)
                    }
                })

                # Store the outer loop scores
                if score['test']['roc_auc'] is not None:
                    outer_cv_scores.append(score['test']['roc_auc'])
                outer_cv_acc.append(score['test']['accuracy'])
                split_id += 1

                # LOG.info(
                #     '[%s-%szs] Outer CV: roc_auc=%f, accuracy=%f, '
                #     'Inner CV: best roc_auc=%f, params=%s. ',
                #     clf.best_model_[0], 'n' if not dozs else '',
                #     score['test']['roc_auc'] if score['test']['roc_auc'] is not None else -1.0,
                #     score['test']['accuracy'],
                #     clf.best_score_, clf.best_model_[1])

            LOG.info(
                'Outer CV loop finished, roc_auc=%f (+/-%f), accuracy=%f (+/-%f)',
                np.mean(outer_cv_scores), 2 * np.std(outer_cv_scores),
                np.mean(outer_cv_acc), 2 * np.std(outer_cv_acc))

            zscore_cv_auc.append(outer_cv_scores)
            zscore_cv_acc.append(outer_cv_acc)

        # Select best performing model
        best_inner_loops = [model['best_score'] for model in self._models]
        best_idx = np.argmax(best_inner_loops)
        self._best_model = self._models[best_idx]
        LOG.info(
            'Inner CV [%d models compared] - best model %s-%szs, score=%f, params=%s',
            len(best_inner_loops) * len(self._models[0]['cv_params']),
            self._best_model['best_model'][0],
            'n' if not self._best_model['zscored'] else '',
            self._best_model['best_score'], self._best_model['best_params'])

        # Write out evaluation result
        best_zs = 1 if self._best_model['zscored'] else 0
        LOG.info(
            'CV - estimated performance: roc_auc=%f (+/-%f), accuracy=%f (+/-%f)',
            np.mean(zscore_cv_auc[best_zs]),
            2 * np.std(zscore_cv_auc[best_zs]),
            np.mean(zscore_cv_acc[best_zs]),
            2 * np.std(zscore_cv_acc[best_zs]),
        )
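
For orientation, the nested structure above (a grid search as the inner loop, scored by an outer cross-validation loop) can be sketched with public scikit-learn APIs alone. This is illustrative only and does not reproduce the ModelAndGridSearchCV / nested_fit_and_score machinery.

from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)
inner_cv = GridSearchCV(SVC(), {"C": [0.1, 1, 10]}, cv=3)  # inner loop: model selection
outer_scores = cross_val_score(inner_cv, X, y, cv=5)       # outer loop: performance estimate
print(outer_scores.mean(), 2 * outer_scores.std())
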
Example #38
def fit_and_score_estimator(estimator, parameters, cv, X, y=None, scoring=None,
                            iid=True, n_jobs=1, verbose=1,
                            pre_dispatch='2*n_jobs'):
    """Fit and score an estimator with cross-validation

    This function is basically a copy of sklearn's
    grid_search._BaseSearchCV._fit(), which is the core of the GridSearchCV
    fit() method. Unfortunately, that class does _not_ return the training
    set scores, which we want to save in the database, and because of the
    way it's written, you can't change it by subclassing or monkeypatching.

    This function uses some undocumented internal sklearn APIs (non-public).
    It was written against sklearn version 0.16.1. Prior Versions are likely
    to fail due to changes in the design of cross_validation module.

    Returns
    -------
    out : dict, with keys 'mean_test_score' 'test_scores', 'train_scores'
        The scores on the training and test sets, as well as the mean test set
        score.
    """

    scorer = check_scoring(estimator, scoring=scoring)
    n_samples = num_samples(X)
    X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr',
                        allow_nans=True)
    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))
    cv = check_cv(cv=cv, y=y, classifier=is_classifier(estimator))

    out = Parallel(
        n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch
    )(
        delayed(_fit_and_score)(clone(estimator), X, y, scorer,
                                train, test, verbose, parameters,
                                fit_params=None)
        for train, test in cv.split(X, y))

    assert len(out) == cv.n_splits

    train_scores, test_scores = [], []
    n_train_samples, n_test_samples = [], []
    for test_score, n_test, train_score, n_train, _ in out:
        train_scores.append(train_score)
        test_scores.append(test_score)
        n_test_samples.append(n_test)
        n_train_samples.append(n_train)

    train_scores, test_scores = map(list, check_arrays(train_scores,
                                                       test_scores,
                                                       warn_nans=True,
                                                       replace_nans=True))

    if iid:
        if verbose > 0 and is_msmbuilder_estimator(estimator):
            print('[CV] Using MSMBuilder API n_samples averaging')
            print('[CV]   n_train_samples: %s' % str(n_train_samples))
            print('[CV]   n_test_samples: %s' % str(n_test_samples))
        mean_test_score = np.average(test_scores, weights=n_test_samples)
        mean_train_score = np.average(train_scores, weights=n_train_samples)
    else:
        mean_test_score = np.average(test_scores)
        mean_train_score = np.average(train_scores)

    grid_scores = {
        'mean_test_score': mean_test_score, 'test_scores': test_scores,
        'mean_train_score': mean_train_score, 'train_scores': train_scores,
        'n_test_samples': n_test_samples, 'n_train_samples': n_train_samples}
    return grid_scores
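
A hypothetical call to the helper above; the dataset and estimator are illustrative, and the module-level helpers it relies on (check_arrays, num_samples, _fit_and_score) are assumed to be importable alongside it.

from sklearn.datasets import load_iris
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)
grid_scores = fit_and_score_estimator(SVC(), parameters={"C": 1.0}, cv=5,
                                      X=X, y=y, scoring="accuracy")
print(grid_scores["mean_test_score"], grid_scores["mean_train_score"])
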
Example #39
def fit_and_score_estimator(estimator,
                            parameters,
                            cv,
                            X,
                            y=None,
                            scoring=None,
                            iid=True,
                            n_jobs=1,
                            verbose=1,
                            pre_dispatch='2*n_jobs'):
    """Fit and score an estimator with cross-validation

    This function is basically a copy of sklearn's
    grid_search._BaseSearchCV._fit(), which is the core of the GridSearchCV
    fit() method. Unfortunately, that class does _not_ return the training
    set scores, which we want to save in the database, and because of the
    way it's written, you can't change it by subclassing or monkeypatching.

    This function uses some undocumented internal sklearn APIs (non-public).
    It was written against sklearn version 0.16.1. Prior Versions are likely
    to fail due to changes in the design of cross_validation module.

    Returns
    -------
    out : dict, with keys 'mean_test_score' 'test_scores', 'train_scores'
        The scores on the training and test sets, as well as the mean test set
        score.
    """

    scorer = check_scoring(estimator, scoring=scoring)
    n_samples = num_samples(X)
    X, y = check_arrays(X,
                        y,
                        allow_lists=True,
                        sparse_format='csr',
                        allow_nans=True)
    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)' %
                             (len(y), n_samples))
    cv = check_cv(cv=cv, y=y, classifier=is_classifier(estimator))

    out = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)(
        delayed(_fit_and_score)(clone(estimator),
                                X,
                                y,
                                scorer,
                                train,
                                test,
                                verbose,
                                parameters,
                                fit_params=None)
        for train, test in cv.split(X, y))

    assert len(out) == cv.n_splits

    train_scores, test_scores = [], []
    n_train_samples, n_test_samples = [], []
    for test_score, n_test, train_score, n_train, _ in out:
        train_scores.append(train_score)
        test_scores.append(test_score)
        n_test_samples.append(n_test)
        n_train_samples.append(n_train)

    train_scores, test_scores = map(
        list,
        check_arrays(train_scores,
                     test_scores,
                     warn_nans=True,
                     replace_nans=True))

    if iid:
        if verbose > 0 and is_msmbuilder_estimator(estimator):
            print('[CV] Using MSMBuilder API n_samples averaging')
            print('[CV]   n_train_samples: %s' % str(n_train_samples))
            print('[CV]   n_test_samples: %s' % str(n_test_samples))
        mean_test_score = np.average(test_scores, weights=n_test_samples)
        mean_train_score = np.average(train_scores, weights=n_train_samples)
    else:
        mean_test_score = np.average(test_scores)
        mean_train_score = np.average(train_scores)

    grid_scores = {
        'mean_test_score': mean_test_score,
        'test_scores': test_scores,
        'mean_train_score': mean_train_score,
        'train_scores': train_scores,
        'n_test_samples': n_test_samples,
        'n_train_samples': n_train_samples
    }
    return grid_scores
def rerun_nested_for_scoring(nested: NestedCV,
                             score: str,
                             X,
                             y=None,
                             groups=None,
                             how='max',
                             n_jobs=1,
                             verbose=0,
                             pre_dispatch='2*n_jobs',
                             return_estimators=False):
    """ Rerun a nested CV grid / random hyper param run but very efficiently by using the stored scoring data
    from a previous run

    Parameters
    ----------
    nested : An already "scored" NestedCV
    score : A string of a score calculated during the scoring run of nested
    how : 'max' or 'min', optional, default='max'
        will look for the min or max of the score provided
    return_estimators : if true return a tuple with new estimators in addition to nested cross, optional, default=False
    Returns
    -------
    nested with new values, (optional, new_estimators)
    """
    sub_scores = [
        extract_score_grid(searcher) for searcher in nested.estimators_
    ]
    sub_scores_means = [sub_score[[c for c in sub_score.columns if 'test' in c and 'mean' in c]] \
                        for sub_score in sub_scores]

    def create_summary(mean_table):
        return pd.DataFrame({
            'maxidx': mean_table.idxmax(),
            'max': mean_table.max(),
            'min': mean_table.min(),
            'minidx': mean_table.idxmin()
        })

    sub_scores_summary = [
        create_summary(mean_table) for mean_table in sub_scores_means
    ]
    row = "mean_{}_test".format(score)
    col = how + "idx"
    idxs = [summary.loc[row, col] for summary in sub_scores_summary]
    params = [
        pd.DataFrame(estimator.cv_results_)['params'][idx]
        for idx, estimator in zip(idxs, nested.estimators_)
    ]
    nested.best_params_ = params
    nested.best_idxs_ = idxs
    new_estimators = [
        clone(estimator.estimator).set_params(**param)
        for param, estimator in zip(params, nested.estimators_)
    ]
    # set the random state so results can be reproduced
    for est in new_estimators:
        est.set_params(random_state=nested.random_state)
    if hasattr(nested.scoring, 'change_decision_score'):
        new_scoring = nested.scoring.change_decision_score(score)
    else:
        new_scoring = nested.scoring
    parallel = Parallel(n_jobs=n_jobs,
                        verbose=verbose,
                        pre_dispatch=pre_dispatch)
    scores = parallel(
        delayed(_fit_and_score_with_extra_data)(
            estimator,
            X,
            y,
            check_scoring(estimator, new_scoring),
            train,
            test,
            verbose,
            None,
            nested.fit_params,
            return_train_score=True,
            return_times=True,
            return_estimator=return_estimators)
        for (train, test), estimator in zip(nested.cv_iter_, new_estimators))
    if return_estimators:
        (nested.train_score_datas_, nested.train_scores_,
         nested.test_score_datas_, nested.test_scores_, nested.fit_times_,
         nested.score_times_, new_estimators) = zip(*scores)
        return nested, new_estimators
    else:
        (nested.train_score_datas_, nested.train_scores_,
         nested.test_score_datas_, nested.test_scores_, nested.fit_times_,
         nested.score_times_) = zip(*scores)
        return nested
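
The selection step above boils down to: summarize the mean test score per candidate, take idxmax (or idxmin), and read the matching entry from cv_results_['params']. A toy illustration with a made-up cv_results_-like table (pandas assumed, as in the function itself):

import pandas as pd

cv_results = pd.DataFrame({
    "mean_accuracy_test": [0.81, 0.86, 0.79],
    "params": [{"C": 0.1}, {"C": 1.0}, {"C": 10.0}],
})
best_idx = cv_results["mean_accuracy_test"].idxmax()
print(best_idx, cv_results["params"][best_idx])  # 1 {'C': 1.0}
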
Example #41
    def _fit(self, X, y, parameter_dict):
        self._cv_results = None  # To indicate to the property the need to update
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
        n_samples = _num_samples(X)
        X, y = indexable(X, y)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)' %
                                 (len(y), n_samples))
        cv = check_cv(self.cv, y=y, classifier=is_classifier(self.estimator))

        toolbox = base.Toolbox()

        name_values, gene_type, maxints = _get_param_types_maxint(
            parameter_dict)
        if self.gene_type is None:
            self.gene_type = gene_type

        if self.verbose:
            print("Types %s and maxint %s detected" %
                  (self.gene_type, maxints))

        toolbox.register("individual",
                         _initIndividual,
                         creator.Individual,
                         maxints=maxints)
        toolbox.register("population", tools.initRepeat, list,
                         toolbox.individual)

        # If n_jobs is an int greater than 1, or less than 0 (meaning "use as
        # many jobs as possible"), we create a default multiprocessing pool.
        # Windows users need to be warned about this feature, since it only works
        # properly on Linux: they must wrap their pool in an
        # 'if __name__ == "__main__":' guard so that pools are not recursively
        # created when the module is reloaded in each map.
        if isinstance(self.n_jobs, int):
            if self.n_jobs > 1 or self.n_jobs < 0:
                from multiprocessing import Pool  # Only imports if needed
                if os.name == 'nt':  # Checks if we are on Windows
                    warnings.warn((
                        "Windows requires Pools to be declared from within "
                        "an \'if __name__==\"__main__\":\' structure. In this "
                        "case, n_jobs will accept map functions as well to "
                        "facilitate custom parallelism. Please check to see "
                        "that all code is working as expected."))
                pool = Pool(self.n_jobs)
                toolbox.register("map", pool.map)

        # If it's not an int, we are going to pass it as the map directly
        else:
            try:
                toolbox.register("map", self.n_jobs)
            except Exception:
                raise TypeError(
                    "n_jobs must be either an integer or map function. Received: {}"
                    .format(type(self.n_jobs)))

        toolbox.register("evaluate",
                         _evalFunction,
                         name_values=name_values,
                         X=X,
                         y=y,
                         scorer=self.scorer_,
                         cv=cv,
                         iid=self.iid,
                         verbose=self.verbose,
                         error_score=self.error_score,
                         fit_params=self.fit_params,
                         score_cache=self.score_cache)

        toolbox.register("mate",
                         _cxIndividual,
                         indpb=self.gene_crossover_prob,
                         gene_type=self.gene_type)

        toolbox.register("mutate",
                         _mutIndividual,
                         indpb=self.gene_mutation_prob,
                         up=maxints)
        toolbox.register("select",
                         tools.selTournament,
                         tournsize=self.tournament_size)

        pop = toolbox.population(n=self.population_size)
        hof = tools.HallOfFame(1)

        # Stats
        stats = tools.Statistics(lambda ind: ind.fitness.values)
        stats.register("avg", np.nanmean)
        stats.register("min", np.nanmin)
        stats.register("max", np.nanmax)
        stats.register("std", np.nanstd)

        # History
        hist = tools.History()
        toolbox.decorate("mate", hist.decorator)
        toolbox.decorate("mutate", hist.decorator)
        hist.update(pop)

        if self.verbose:
            print('--- Evolve in {0} possible combinations ---'.format(
                np.prod(np.array(maxints) + 1)))

        pop, logbook = algorithms.eaSimple(pop,
                                           toolbox,
                                           cxpb=0.5,
                                           mutpb=0.2,
                                           ngen=self.generations_number,
                                           stats=stats,
                                           halloffame=hof,
                                           verbose=self.verbose)

        # Save History
        self.all_history_.append(hist)
        self.all_logbooks_.append(logbook)
        current_best_score_ = hof[0].fitness.values[0]
        current_best_params_ = _individual_to_params(hof[0], name_values)
        if self.verbose:
            print("Best individual is: %s\nwith fitness: %s" %
                  (current_best_params_, current_best_score_))

        if current_best_score_ > self.best_mem_score_:
            self.best_mem_score_ = current_best_score_
            self.best_mem_params_ = current_best_params_

        # Check memoization, potentially unknown bug
        # assert str(hof[0]) in self.score_cache, "Best individual not stored in score_cache for cv_results_."

        # Close your pools if you made them
        if isinstance(self.n_jobs, int) and (self.n_jobs > 1
                                             or self.n_jobs < 0):
            pool.close()
            pool.join()

        self.best_score_ = current_best_score_
        self.best_params_ = current_best_params_
Example #42
def _wrapped_cross_val_score(sklearn_pipeline,
                             features,
                             target,
                             cv,
                             scoring_function,
                             sample_weight=None,
                             groups=None,
                             use_dask=False):
    """Fit estimator and compute scores for a given dataset split.

    Parameters
    ----------
    sklearn_pipeline : pipeline object implementing 'fit'
        The object to use to fit the data.
    features : array-like of shape at least 2D
        The data to fit.
    target : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.
    cv: int or cross-validation generator
        If CV is a number, then it is the number of folds to evaluate each
        pipeline over in k-fold cross-validation during the TPOT optimization
        process. If it is an object, then it is used as a cross-validation
        generator.
    scoring_function : callable
        A scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
    sample_weight : array-like, optional
        List of sample weights to balance (or un-balance) the dataset target as needed
    groups: array-like {n_samples, }, optional
        Group labels for the samples used while splitting the dataset into train/test set
    use_dask : bool, default False
        Whether to use dask
    """
    sample_weight_dict = set_sample_weight(sklearn_pipeline.steps,
                                           sample_weight)

    features, target, groups = indexable(features, target, groups)

    cv = check_cv(cv, target, classifier=is_classifier(sklearn_pipeline))
    cv_iter = list(cv.split(features, target, groups))
    scorer = check_scoring(sklearn_pipeline, scoring=scoring_function)

    if use_dask:
        try:
            import dask_ml.model_selection  # noqa
            import dask  # noqa
            from dask.delayed import Delayed
        except ImportError:
            msg = "'use_dask' requires the optional dask and dask-ml depedencies."
            raise ImportError(msg)

        dsk, keys, n_splits = dask_ml.model_selection._search.build_graph(
            estimator=sklearn_pipeline,
            cv=cv,
            scorer=scorer,
            candidate_params=[{}],
            X=features,
            y=target,
            groups=groups,
            fit_params=sample_weight_dict,
            refit=False,
            error_score=float('-inf'),
        )

        cv_results = Delayed(keys[0], dsk)
        scores = [
            cv_results['split{}_test_score'.format(i)] for i in range(n_splits)
        ]
        CV_score = dask.delayed(np.array)(scores)[:, 0]
        return dask.delayed(np.nanmean)(CV_score)
    else:
        try:
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                scores = [
                    _fit_and_score(estimator=clone(sklearn_pipeline),
                                   X=features,
                                   y=target,
                                   scorer=scorer,
                                   train=train,
                                   test=test,
                                   verbose=0,
                                   parameters=None,
                                   fit_params=sample_weight_dict)
                    for train, test in cv_iter
                ]
                CV_score = np.array(scores)[:, 0]
                return np.nanmean(CV_score)
        except TimeoutException:
            return "Timeout"
        except Exception as e:
            return -float('inf')
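# --- Illustrative sketch (not part of the original example) ---
# A rough, self-contained approximation of what the wrapper above computes in
# the non-dask branch: the mean cross-validated test score of a pipeline under
# a resolved scorer. The pipeline and data below are hypothetical placeholders.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X_demo, y_demo = make_classification(n_samples=60, n_features=5, random_state=0)
pipe = make_pipeline(StandardScaler(), LogisticRegression())

# cross_val_score resolves the 'accuracy' string through the same scorer
# machinery (check_scoring) that the wrapper above calls explicitly.
fold_scores = cross_val_score(pipe, X_demo, y_demo, scoring="accuracy", cv=5)
print(np.nanmean(fold_scores))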
    def fit(self, X, y, sample_weight=None):
        # fitting everything except weights
        original_weights = self.weights
        # have to set self.weights because VotingClassifier.fit checks them
        self.weights = np.ones(len(self.estimators))
        # fit to the full data
        super(VotingClassifierCV, self).fit(X, y, sample_weight=sample_weight)
        estimators = self.estimators_
        self.weights = original_weights

        # generate cross_validated predictions for each classifier
        cv = check_cv(self.cv)
        scoring = check_scoring(self, self.scoring)
        parallel = Parallel(n_jobs=self.n_jobs)
        method = 'predict_proba' if self.voting == 'soft' else 'predict'
        fit_params = {}
        if sample_weight is not None:
            fit_params['sample_weight'] = sample_weight
        verbose = False
        preds = []
        for name, est in self.estimators:
            prediction_blocks = parallel(
                delayed(_fit_and_predict)(clone(est), X, y, train, test,
                                          verbose, fit_params, method)
                for train, test in cv.split(X, y))
            preds.append([pred for pred, _ in prediction_blocks])

        # prepare sample weights and targets for each fold
        test_targets = []
        test_weights = []
        for train, test in cv.split(X, y):
            test_targets.append(y[test])
            if sample_weight is not None:
                test_weights.append(sample_weight[test])
            else:
                test_weights.append(None)

        # recreate list of possible weights
        weights_array = np.array(self.weights)
        if len(weights_array.shape) == 2:
            self.weights_seq_ = self.weights
        else:
            if len(weights_array.shape) == 0:
                weight_vec = np.arange(self.weights)
            else:  # assume it is 1d
                weight_vec = self.weights
            clf_len = len(self.estimators)
            self.weights_seq_ = [
                x for x in product(*[weight_vec] * clf_len) if sum(x) > 0
            ]
        # score the classifier at different weights
        scores = []
        for weights_vector in self.weights_seq_:
            self.weights_ = weights_vector
            cv_scores = []
            for fold, pred_vectors in enumerate(zip(*preds)):
                self.estimators_ = [
                    PredefinedClassifier(pred_vector)
                    for pred_vector in pred_vectors
                ]
                test_y = test_targets[fold]
                test_x = None
                test_w = test_weights[fold]
                cv_scores.append(scoring(self, test_x, test_y, test_w))
            scores.append(cv_scores)
        self.scores_ = np.array(scores)

        # choose the best weights
        self.weights_ = self.weights_seq_[np.argmax(
            np.mean(self.scores_, axis=1))]
        self.estimators_ = estimators
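# --- Illustrative sketch (not part of the original example) ---
# A tiny demonstration, with made-up numbers, of how the weight grid searched
# above is enumerated: every combination of per-classifier weights whose sum
# is positive, built with itertools.product.
from itertools import product

weight_levels = [0, 1, 2]   # hypothetical candidate weights for each classifier
n_classifiers = 3           # hypothetical ensemble size
weight_grid = [w for w in product(*[weight_levels] * n_classifiers) if sum(w) > 0]
print(len(weight_grid))     # 26 combinations: 3**3 minus the all-zero tuple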
Example #44
    def _scoring(self, net, X_test, y_test):
        """Resolve scoring and apply it to data. Use cached prediction
        instead of running inference again, if available."""
        scorer = check_scoring(net, self.scoring_)
        return scorer(net, X_test, y_test)
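# --- Illustrative usage sketch (not part of the original example) ---
# A minimal demonstration, assuming a recent scikit-learn where check_scoring
# is exposed as sklearn.metrics.check_scoring, of how a scoring string is
# resolved into a scorer callable with the signature scorer(estimator, X, y).
# The estimator and toy data are hypothetical.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import check_scoring

X_toy = np.array([[0.0], [1.0], [2.0], [3.0]])
y_toy = np.array([0, 0, 1, 1])
clf = LogisticRegression().fit(X_toy, y_toy)

scorer = check_scoring(clf, scoring="accuracy")   # explicit metric
print(scorer(clf, X_toy, y_toy))

default_scorer = check_scoring(clf)               # falls back to clf.score
print(default_scorer(clf, X_toy, y_toy))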
    def fit(self, X, y, Xd, yd):
        """Fit the gradient boosting model.

        Parameters
        ----------
        X : array-like, shape=(n_samples, n_features)
            The input samples.

        y : array-like, shape=(n_samples, output_dim=2)
            Target values.

        Xd : array-like, shape=(n_design, n_features)
            The design samples.

        yd : array-like, shape=(n_design, output_dim=2)
            Design target values.

        Returns
        -------
        self : object
        """
        #add#
        #if (not hasattr(self, 'Xd')) or (not hasattr(self, 'yd')):
        #    raise NotImplementedError

        fit_start_time = time()
        acc_find_split_time = 0.  # time spent finding the best splits
        acc_apply_split_time = 0.  # time spent splitting nodes
        # time spent predicting X for gradient and hessians update
        acc_prediction_time = 0.
        # TODO: add support for mixed-typed (numerical + categorical) data
        # TODO: add support for missing data
        # TODO: add support for pre-binned data (pass-through)?
        # TODO: test input checking

        X, y = check_X_y(
            X,
            y,
            dtype=[np.float32, np.float64],
            ## Add by k##
            multi_output=True)
        y = self._encode_y(y)
        #add#

        rng = check_random_state(self.random_state)

        ## Add by k ##
        Xd_, yd_ = check_X_y(
            Xd,
            yd,
            dtype=[np.float32, np.float64],
            ## Add by k##
            multi_output=True)
        yd_ = self._encode_y(yd_)
        self.Xd = Xd_  # X design
        self.yd = yd_  # y design

        self.n_design = self.Xd.shape[0]
        self.output_dim = np.reshape(self.yd, (1, -1)).shape[1]
        assert (self.Xd.shape[1] == X.shape[1])
        ## Add by k ##

        self._validate_parameters()

        tic = time()
        self.bin_mapper_ = BinMapper(max_bins=self.max_bins, random_state=rng)

        # modify X
        diff_X_Xd = np.abs(X[:, :, np.newaxis] -
                           self.Xd[:, :, np.newaxis].T).reshape(
                               X.shape[0],
                               self.Xd.shape[0],
                               X.shape[1],
                               order='f')
        mean_X_Xd = .5 * (
            X[:, :, np.newaxis] + self.Xd[:, :, np.newaxis].T).reshape(
                X.shape[0], self.Xd.shape[0], X.shape[1], order='f')
        Phi_X_Xd = np.concatenate((diff_X_Xd, mean_X_Xd), axis=2)
        #print(Phi_X_Xd.shape)
        if self.verbose:
            print(f"Binning {Phi_X_Xd.nbytes / 1e9:.3f} GB of data: ",
                  end="",
                  flush=True)
        X_binned = self.bin_mapper_.fit_transform(Phi_X_Xd)
        ## add by k ##

        ########
        ## OK ##
        ########

        toc = time()
        if self.verbose:
            duration = toc - tic
            throughput = X.nbytes / duration
            print(f"{duration:.3f} s ({throughput / 1e6:.3f} MB/s)")

        self.loss_ = self._get_loss()

        if self.validation_split is not None:
            # stratify for classification
            stratify = y if hasattr(self.loss_, 'predict_proba') else None
            if hasattr(self.loss_, 'predict_proba'):
                raise NotImplementedError
            X_binned_train, X_binned_val, y_train, y_val = train_test_split(
                X_binned,
                y,
                test_size=self.validation_split,
                stratify=stratify,
                random_state=rng)
            X_binned_train = np.asfortranarray(X_binned_train)
            X_binned_val = np.asfortranarray(X_binned_val)
            # Histogram computation is faster on feature-aligned data.
        else:
            X_binned_train, y_train = X_binned, y
            X_binned_val, y_val = None, None
            X_binned_train = np.asfortranarray(X_binned_train)

        # Subsample the training set for score-based monitoring.
        subsample_size = 10000
        if X_binned_train.shape[0] < subsample_size:
            X_binned_small_train = np.asfortranarray(X_binned_train)
            y_small_train = y_train
        else:
            indices = rng.choice(np.arange(X_binned_train.shape[0]),
                                 subsample_size)
            X_binned_small_train = X_binned_train[indices]
            y_small_train = y_train[indices]
        self.X_binned_small_train = X_binned_small_train
        self.X_binned_val = X_binned_val
        if self.verbose:
            print("Fitting gradient boosted rounds:")

        #n_samples = X_binned_train.shape[0] * X_binned_train.shape[1]
        n_samples = X_binned_small_train.shape[0]
        # values predicted by the trees. Used as-is in regression, and
        # transformed into probas and / or classes for classification
        raw_predictions = np.zeros(shape=(n_samples,
                                          self.n_trees_per_iteration_),
                                   dtype=y_train.dtype)
        # gradients and hessians are 1D arrays of size
        # n_samples * n_trees_per_iteration

        # gradients and hessians have changed

        gradients, hessians = self.loss_.init_gradients_and_hessians(
            n_samples=n_samples,
            n_trees_per_iteration=self.n_trees_per_iteration_)
        #print('raw_', raw_predictions)
        #print('gradient', gradients)
        # predictors_ is a matrix of TreePredictor objects with shape
        # (n_iter_, n_trees_per_iteration)
        self.predictors_ = predictors = []
        self.train_scores_ = []
        if self.validation_split is not None:
            self.validation_scores_ = []
        scorer = check_scoring(self, self.scoring)
        gb_start_time = time()
        # TODO: compute training loss and use it for early stopping if no
        # validation data is provided?
        self.n_iter_ = 0
        while True:
            should_stop = self._stopping_criterion(gb_start_time, scorer,
                                                   X_binned_small_train,
                                                   y_small_train, X_binned_val,
                                                   y_val)
            if should_stop or self.n_iter_ == self.max_iter:
                break

            # Update gradients and hessians, inplace
            self.loss_.update_gradients_and_hessians(gradients, hessians,
                                                     y_small_train,
                                                     raw_predictions)

            #print('grad', gradients)
            predictors.append([])

            # Build `n_trees_per_iteration` trees.
            for k, (gradients_at_k, hessians_at_k) in enumerate(
                    zip(np.array_split(gradients, self.n_trees_per_iteration_),
                        np.array_split(hessians,
                                       self.n_trees_per_iteration_))):
                # the xxxx_at_k arrays are **views** on the original arrays.
                # Note that for binary classif and regressions,
                # n_trees_per_iteration is 1 and xxxx_at_k is equivalent to the
                # whole array.

                #X_binned_small_train_, _, gradients_at_k_, _, indices_subsample_, _ = \
                #train_test_split(X_binned_small_train, gradients_at_k, np.arange(len(X_binned_small_train)), \
                #train_size=subsample_ratio, shuffle=False, random_state=0)
                #X_binned_small_train_ = np.asfortranarray(X_binned_small_train_)

                grower = TreeGrower(
                    X_binned_small_train,
                    gradients_at_k,
                    hessians_at_k,
                    yd,
                    max_bins=self.max_bins,
                    n_bins_per_feature=self.bin_mapper_.n_bins_per_feature_,
                    max_leaf_nodes=self.max_leaf_nodes,
                    max_depth=self.max_depth,
                    min_samples_leaf=self.min_samples_leaf,
                    l2_regularization=self.l2_regularization,
                    shrinkage=self.learning_rate)
                grower.grow()
                #print('I grew')

                acc_apply_split_time += grower.total_apply_split_time
                acc_find_split_time += grower.total_find_split_time

                predictor = grower.make_predictor(
                    bin_thresholds=self.bin_mapper_.bin_thresholds_)
                predictors[-1].append(predictor)

                tic_pred = time()
                # prepare leaves_data so that _update_raw_predictions can be
                # @njitted
                leaves_data = [(l.value, l.sample_indices)
                               for l in grower.finalized_leaves]
                _update_raw_predictions(leaves_data, self.yd,
                                        raw_predictions[:, k])
                #print('raw_pred', raw_predictions)
                toc_pred = time()
                acc_prediction_time += toc_pred - tic_pred

            self.n_iter_ += 1
            #self.learning_rate *= 1. # maybe to set
            self.learning_rate = 1. / (self.n_iter_ + 1)
            #self.max_depth += 1
            #self.max_depth = min(5, self.max_depth)
            #print('pred', raw_predictions)
            #print('n_iter', self.n_iter_)
        if self.verbose:
            duration = time() - fit_start_time
            n_total_leaves = sum(
                predictor.get_n_leaf_nodes()
                for predictors_at_ith_iteration in self.predictors_
                for predictor in predictors_at_ith_iteration)
            n_predictors = sum(
                len(predictors_at_ith_iteration)
                for predictors_at_ith_iteration in self.predictors_)
            print(f"Fit {n_predictors} trees in {duration:.3f} s, "
                  f"({n_total_leaves} total leaves)")
            print(f"{'Time spent finding best splits:':<32} "
                  f"{acc_find_split_time:.3f}s")
            print(f"{'Time spent applying splits:':<32} "
                  f"{acc_apply_split_time:.3f}s")
            print(f"{'Time spent predicting:':<32} "
                  f"{acc_prediction_time:.3f}s")
        self.train_scores_ = np.asarray(self.train_scores_)
        if self.validation_split is not None:
            self.validation_scores_ = np.asarray(self.validation_scores_)
        return self
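# --- Illustrative sketch (not part of the original example) ---
# A small, self-contained shape check of the pairwise feature construction in
# the fit above: the broadcasted |X - Xd| and (X + Xd) / 2 blocks are reshaped
# to (n_samples, n_design, n_features) and concatenated along the last axis,
# giving (n_samples, n_design, 2 * n_features). The arrays are made up.
import numpy as np

X = np.arange(8, dtype=float).reshape(4, 2)    # 4 samples, 2 features
Xd = np.arange(6, dtype=float).reshape(3, 2)   # 3 design samples, 2 features

diff = np.abs(X[:, :, np.newaxis] - Xd[:, :, np.newaxis].T).reshape(
    X.shape[0], Xd.shape[0], X.shape[1], order='f')
mean = .5 * (X[:, :, np.newaxis] + Xd[:, :, np.newaxis].T).reshape(
    X.shape[0], Xd.shape[0], X.shape[1], order='f')
phi = np.concatenate((diff, mean), axis=2)
print(phi.shape)   # (4, 3, 4)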
Example #46
    def _fit(self, X, y, groups, parameter_iterable):
        """
        Actual fitting,  performing the search over parameters.
        Taken from https://github.com/scikit-learn/scikit-learn/blob/0.18.X
                    .../sklearn/model_selection/_search.py
        """

        estimator = self.estimator
        cv = sklearn.model_selection._validation.check_cv(
            self.cv, y, classifier=is_classifier(estimator))
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        X, y, groups = indexable(X, y, groups)
        n_splits = cv.get_n_splits(X, y, groups)
        if self.verbose > 0 and isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(n_splits, n_candidates,
                                     n_candidates * n_splits))

        base_estimator = clone(self.estimator)
        pre_dispatch = self.pre_dispatch

        cv_iter = list(cv.split(X, y, groups))
        out = Parallel(
            n_jobs=self.n_jobs,
            verbose=self.verbose,
            pre_dispatch=pre_dispatch)(
                delayed(sklearn.model_selection._validation._fit_and_score)(
                    clone(base_estimator),
                    X,
                    y,
                    self.scorer_,
                    train,
                    test,
                    self.verbose,
                    parameters,
                    fit_params=self.fit_params,
                    return_train_score=self.return_train_score,
                    return_n_test_samples=True,
                    return_times=True,
                    return_parameters=True,
                    error_score=self.error_score)
                for parameters in parameter_iterable
                for train, test in cv_iter)

        # if one chooses to see the train score, "out" will contain train score info
        if self.return_train_score:
            (train_scores, test_scores, test_sample_counts, fit_time,
             score_time, parameters) = zip(*out)
        else:
            (test_scores, test_sample_counts, fit_time, score_time,
             parameters) = zip(*out)

        candidate_params = parameters[::n_splits]
        n_candidates = len(candidate_params)

        results = dict()

        def _store(key_name, array, weights=None, splits=False, rank=False):
            """A small helper to store the scores/times to the cv_results_"""
            array = np.array(array,
                             dtype=np.float64).reshape(n_candidates, n_splits)
            if splits:
                for split_i in range(n_splits):
                    results["split%d_%s" %
                            (split_i, key_name)] = array[:, split_i]

            array_means = np.average(array, axis=1, weights=weights)
            results['mean_%s' % key_name] = array_means
            # Weighted std is not directly available in numpy
            array_stds = np.sqrt(
                np.average((array - array_means[:, np.newaxis])**2,
                           axis=1,
                           weights=weights))
            results['std_%s' % key_name] = array_stds

            if rank:
                results["rank_%s" % key_name] = np.asarray(rankdata(
                    -array_means, method='min'),
                                                           dtype=np.int32)

        # Compute the (weighted) mean and std for test scores alone
        # NOTE test_sample counts (weights) remain the same for all candidates
        test_sample_counts = np.array(test_sample_counts[:n_splits],
                                      dtype=int)

        _store('test_score',
               test_scores,
               splits=True,
               rank=True,
               weights=test_sample_counts if self.iid else None)
        if self.return_train_score:
            _store('train_score', train_scores, splits=True)
        _store('fit_time', fit_time)
        _store('score_time', score_time)

        best_index = np.flatnonzero(results["rank_test_score"] == 1)[0]
        best_parameters = candidate_params[best_index]

        # Use one MaskedArray and mask all the places where the param is not
        # applicable for that candidate. Use defaultdict as each candidate may
        # not contain all the params
        param_results = defaultdict(
            partial(MaskedArray,
                    np.empty(n_candidates, ),
                    mask=True,
                    dtype=object))
        for cand_i, params in enumerate(candidate_params):
            for name, value in params.items():
                # An all masked empty array gets created for the key
                # `"param_%s" % name` at the first occurence of `name`.
                # Setting the value at an index also unmasks that index
                param_results["param_%s" % name][cand_i] = value

        results.update(param_results)

        # Store a list of param dicts at the key 'params'
        results['params'] = candidate_params

        self.cv_results_ = results
        self.best_index_ = best_index
        self.n_splits_ = n_splits

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best_parameters)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
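# --- Illustrative sketch (not part of the original example) ---
# A toy demonstration of the aggregation performed by the _store helper above:
# per-candidate mean, standard deviation and rank over fold scores. The fold
# scores are made up.
import numpy as np
from scipy.stats import rankdata

fold_scores = np.array([[0.80, 0.90, 0.85],    # candidate 0
                        [0.70, 0.75, 0.80]])   # candidate 1
means = np.average(fold_scores, axis=1)
stds = np.sqrt(np.average((fold_scores - means[:, np.newaxis]) ** 2, axis=1))
ranks = np.asarray(rankdata(-means, method='min'), dtype=np.int32)
print(means, stds, ranks)   # candidate 0 is ranked 1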
Example #47
def _wrapped_cross_val_score(sklearn_pipeline,
                             features,
                             target,
                             cv,
                             scoring_function,
                             sample_weight=None,
                             groups=None,
                             feature_selectors=None,
                             fs_modifier=None):
    """Fit estimator and compute scores for a given dataset split.
    Parameters
    ----------
    sklearn_pipeline : pipeline object implementing 'fit'
        The object to use to fit the data.
    features : array-like of shape at least 2D
        The data to fit.
    target : array-like
        The target variable to try to predict in the case of
        supervised learning.
    cv : int or cross-validation generator
        If cv is an integer, it is the number of folds used in k-fold
        cross-validation during the TPOT optimization process. If it is an
        object, it is used directly as a cross-validation generator.
    scoring_function : callable
        A scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
    sample_weight : array-like, optional
        List of sample weights to balance (or unbalance) the dataset target as needed
    groups : array-like, shape (n_samples,), optional
        Group labels for the samples used while splitting the dataset into train/test set
    feature_selectors : iterable of str, optional
        Names of pipeline steps that perform feature selection. Only used when fs_modifier is not None
    fs_modifier : float, optional (default: None)
        Modifier controlling how much the number of selected features reduces the score. Only used when feature_selectors is not None
    """
    sample_weight_dict = set_sample_weight(sklearn_pipeline.steps,
                                           sample_weight)

    features, target, groups = indexable(features, target, groups)

    cv = check_cv(cv, target, classifier=is_classifier(sklearn_pipeline))
    cv_iter = list(cv.split(features, target, groups))
    scorer = check_scoring(sklearn_pipeline, scoring=scoring_function)

    try:
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')

            # START ADDITIONAL FS code
            # Split sklearn_pipeline steps to find out the number of features left after selection
            if fs_modifier is not None:

                fs_pipeline = None

                # Check for every pipeline step, starting with the last
                for i in reversed(range(len(sklearn_pipeline.steps))):

                    # Whether it is a feature selector
                    if (sklearn_pipeline.steps[i][0]) in feature_selectors:
                        fs_pipeline = Pipeline(
                            steps=sklearn_pipeline.steps[:i + 1])
                        break

                # If part of it is a feature selection pipeline:
                if fs_pipeline is not None:
                    n_features = fs_pipeline.fit_transform(features,
                                                           target).shape[1]
                else:
                    n_features = features.shape[1]
            # END ADDITIONAL FS code

            scores = [
                _fit_and_score(estimator=clone(sklearn_pipeline),
                               X=features,
                               y=target,
                               scorer=scorer,
                               train=train,
                               test=test,
                               verbose=0,
                               parameters=None,
                               fit_params=sample_weight_dict)
                for train, test in cv_iter
            ]

            # START ADDITIONAL FS code
            # Alter the score by removing a set number of features
            if fs_modifier is not None:
                scores = [[
                    unlisted_score * (fs_modifier**n_features)
                    for unlisted_score in score
                ] for score in scores]
                print(
                    "The final score becomes %f, after multiplying it by %.2f for %i features and modifier %.2f"
                    % (np.average(np.asarray(scores)), fs_modifier**
                       n_features, n_features, fs_modifier))
            # END ADDITIONAL FS code

            CV_score = np.array(scores)[:, 0]
            return np.nanmean(CV_score)
    except TimeoutException:
        return "Timeout"
    except Exception as e:
        return -float('inf')
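# --- Illustrative sketch (not part of the original example) ---
# The feature-count penalty applied above is simply score * fs_modifier ** n_features,
# so with 0 < fs_modifier < 1 a pipeline that keeps fewer features retains more
# of its raw score. The numbers below are made up.
fs_modifier = 0.9   # hypothetical modifier
raw_score = 0.8     # hypothetical cross-validated accuracy
for n_features in (2, 5, 10):
    print(n_features, raw_score * fs_modifier ** n_features)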
    def _fit(self, X, y, groups, parameter_iterable):

        estimator = self.estimator
        cv = check_cv(self.cv, y, classifier=is_classifier(estimator))

        if hasattr(cv, 'random_state'):
            if cv.random_state is None:
                cv.random_state = randint(1000, 9999)

        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        X, y, groups = indexable(X, y, groups)
        n_splits = cv.get_n_splits(X, y, groups)

        if self.verbose > 0 and isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(n_splits, n_candidates,
                                     n_candidates * n_splits))

        base_estimator = clone(self.estimator)

        param_grid = [(parameters, test_sequence_index)
                      for parameters in parameter_iterable
                      for test_sequence_index in range(n_splits)]
        # Because the original python code expects a certain order for the elements, we need to
        # respect it.
        indexed_param_grid = list(zip(range(len(param_grid)), param_grid))
        par_param_grid = self.sc.parallelize(indexed_param_grid,
                                             len(indexed_param_grid))
        X_bc = self.sc.broadcast(X)
        y_bc = self.sc.broadcast(y)
        groups_bc = self.sc.broadcast(groups)

        scorer = self.scorer_
        verbose = self.verbose
        error_score = self.error_score
        fit_params = self.fit_params
        return_train_score = self.return_train_score
        fas = _fit_and_score

        def fun(tup):
            (index, (parameters, test_sequence_index)) = tup
            local_estimator = clone(base_estimator)
            local_X = X_bc.value
            local_y = y_bc.value
            local_groups = groups_bc.value

            train, test = next(
                islice(cv.split(local_X, local_y, local_groups),
                       test_sequence_index, test_sequence_index + 1))
            res = fas(local_estimator,
                      local_X,
                      local_y,
                      scorer,
                      train,
                      test,
                      verbose,
                      parameters,
                      fit_params,
                      return_train_score=return_train_score,
                      return_n_test_samples=True,
                      return_times=True,
                      return_parameters=True,
                      error_score=error_score)
            return (index, res)

        indexed_out0 = dict(par_param_grid.map(fun).collect())
        out = [indexed_out0[idx] for idx in range(len(param_grid))]
        if return_train_score:
            (train_scores, test_scores, test_sample_counts, fit_time,
             score_time, parameters) = zip(*out)
        else:
            (test_scores, test_sample_counts, fit_time, score_time,
             parameters) = zip(*out)
        X_bc.unpersist()
        y_bc.unpersist()
        groups_bc.unpersist()

        candidate_params = parameters[::n_splits]
        n_candidates = len(candidate_params)

        results = dict()

        def _store(key_name, array, weights=None, splits=False, rank=False):
            """A small helper to store the scores/times to the cv_results_"""
            # When iterated first by splits, then by parameters
            array = np.array(array,
                             dtype=np.float64).reshape(n_candidates, n_splits)
            if splits:
                for split_i in range(n_splits):
                    results["split%d_%s" %
                            (split_i, key_name)] = array[:, split_i]

            array_means = np.average(array, axis=1, weights=weights)
            results['mean_%s' % key_name] = array_means
            # Weighted std is not directly available in numpy
            array_stds = np.sqrt(
                np.average((array - array_means[:, np.newaxis])**2,
                           axis=1,
                           weights=weights))
            results['std_%s' % key_name] = array_stds

            if rank:
                results["rank_%s" % key_name] = np.asarray(rankdata(
                    -array_means, method='min'),
                                                           dtype=np.int32)

        # Compute the (weighted) mean and std for test scores alone
        # NOTE test_sample counts (weights) remain the same for all candidates
        test_sample_counts = np.array(test_sample_counts[:n_splits],
                                      dtype=int)

        _store('test_score',
               test_scores,
               splits=True,
               rank=True,
               weights=test_sample_counts if self.iid else None)
        if self.return_train_score:
            _store('train_score', train_scores, splits=True)
        _store('fit_time', fit_time)
        _store('score_time', score_time)

        best_index = np.flatnonzero(results["rank_test_score"] == 1)[0]
        best_parameters = candidate_params[best_index]

        # Use one MaskedArray and mask all the places where the param is not
        # applicable for that candidate. Use defaultdict as each candidate may
        # not contain all the params
        param_results = defaultdict(
            partial(MaskedArray,
                    np.empty(n_candidates, ),
                    mask=True,
                    dtype=object))
        for cand_i, params in enumerate(candidate_params):
            for name, value in params.items():
                # An all masked empty array gets created for the key
                # `"param_%s" % name` at the first occurence of `name`.
                # Setting the value at an index also unmasks that index
                param_results["param_%s" % name][cand_i] = value

        results.update(param_results)

        # Store a list of param dicts at the key 'params'
        results['params'] = candidate_params

        self.cv_results_ = results
        self.best_index_ = best_index
        self.n_splits_ = n_splits

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best_parameters)
            if y is not None:
                best_estimator.fit(X, y, **fit_params)
            else:
                best_estimator.fit(X, **fit_params)
            self.best_estimator_ = best_estimator
        return self
    def score(self,
              X,
              y=None,
              groups=None,
              n_jobs=1,
              verbose=0,
              pre_dispatch='2*n_jobs'):
        """ Will score the estimator and score according to self.cv
        """
        X, y, groups = indexable(X, y, groups)
        if (not isinstance(self.random_state, (numbers.Integral, np.integer))
                and self.use_same_random_state):
            raise ValueError(
                "If use_same_random_state, the random state passed in must be an integer"
            )

        def clone_estimator():
            """Clone the estimator and put in the correct random state for the nested cross validation
            """
            estimator = clone(self.estimator)
            if self.use_same_random_state and (
                    'random_state' in estimator.get_params().keys()):
                estimator.set_params(random_state=self.random_state)
            return estimator

        cv = check_cv2(self.cv,
                       y,
                       classifier=is_classifier(self.estimator),
                       random_state=self.random_state)
        self.cv_iter_ = list(cv.split(X, y, groups))
        scorer = check_scoring(self.estimator, scoring=self.scoring)
        # We clone the estimator to make sure that all the folds are
        # independent, and that it is pickle-able.
        parallel = Parallel(n_jobs=n_jobs,
                            verbose=verbose,
                            pre_dispatch=pre_dispatch)
        scores = parallel(
            delayed(_fit_and_score_with_extra_data)(clone_estimator(),
                                                    X,
                                                    y,
                                                    scorer,
                                                    train,
                                                    test,
                                                    verbose,
                                                    None,
                                                    self.fit_params,
                                                    return_train_score=True,
                                                    return_times=True,
                                                    return_estimator=True)
            for train, test in self.cv_iter_)

        (self.train_score_datas_, self.train_scores_, self.test_score_datas_,
         self.test_scores_, self.fit_times_, self.score_times_,
         self.estimators_) = zip(*scores)

        if hasattr(self.estimators_[0], 'best_params_'):
            self.best_params_ = [
                estimator.best_params_ for estimator in self.estimators_
            ]
        else:
            print("WARN: NestedCV.best_params_ set to None")
            self.best_params_ = None
        if hasattr(self.estimators_[0], 'best_index_'):
            self.best_idxs_ = [
                estimator.best_index_ for estimator in self.estimators_
            ]
        else:
            print("WARN: NestedCV.best_idxs_ set to None")
            self.best_idxs_ = None

        return self.test_scores_
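# --- Illustrative sketch (not part of the original example) ---
# A minimal, plain-scikit-learn approximation of the nested cross-validation
# scored above: an inner grid search is wrapped as the estimator of an outer
# cross-validation. The data, grid and estimator are hypothetical.
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.svm import SVC

X_demo, y_demo = make_classification(n_samples=80, random_state=0)
inner_search = GridSearchCV(SVC(), {'C': [0.1, 1.0, 10.0]}, cv=3)
outer_scores = cross_val_score(inner_search, X_demo, y_demo, cv=4)
print(outer_scores.mean())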
Example #50
    def _fit(self, X, y, parameter_iterable):
        """Actual fitting,  performing the search over parameters."""

        estimator = self.estimator
        cv = self.cv
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        n_samples = _num_samples(X)
        X, y = indexable(X, y)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)' % (len(y), n_samples))

        if slversion == 18:
            cv = check_cv(cv, y, classifier=is_classifier(estimator))
            # cv is actually not the same generator
        else:
            cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print("Fitting {0} folds for each of {1} candidates, totalling"
                      " {2} fits".format(len(cv), n_candidates, n_candidates * len(cv)))

        base_estimator = clone(self.estimator)

        param_grid = [(parameters, train, test)
                      for parameters in parameter_iterable for (train, test) in cv]
        # Because the original python code expects a certain order for the elements, we need to
        # respect it.
        indexed_param_grid = list(zip(range(len(param_grid)), param_grid))
        par_param_grid = self.sc.parallelize(indexed_param_grid, len(indexed_param_grid))
        X_bc = self.sc.broadcast(X)
        y_bc = self.sc.broadcast(y)

        scorer = self.scorer_
        verbose = self.verbose
        fit_params = self.fit_params
        error_score = self.error_score
        fas = _fit_and_score

        def fun(tup):
            (index, (parameters, train, test)) = tup
            local_estimator = clone(base_estimator)
            local_X = X_bc.value
            local_y = y_bc.value
            res = fas(local_estimator,
                      local_X,
                      local_y,
                      scorer,
                      train,
                      test,
                      verbose,
                      parameters,
                      fit_params,
                      return_parameters=True,
                      error_score=error_score)
            return (index, res)

        indexed_out0 = dict(par_param_grid.map(fun).collect())
        out = [indexed_out0[idx] for idx in range(len(param_grid))]

        X_bc.unpersist()
        y_bc.unpersist()

        # Out is a list of triplet: score, estimator, n_test_samples
        n_fits = len(out)
        n_folds = len(cv)

        scores = list()
        grid_scores = list()
        for grid_start in range(0, n_fits, n_folds):
            n_test_samples = 0
            score = 0
            all_scores = []
            for this_score, this_n_test_samples, _, parameters in out[grid_start:grid_start +
                                                                      n_folds]:
                all_scores.append(this_score)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_folds)
            scores.append((score, parameters))
            # TODO: shall we also store the test_fold_sizes?
            grid_scores.append(_CVScoreTuple(parameters, score, np.array(all_scores)))
        # Store the computed scores
        self.grid_scores_ = grid_scores

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        best = sorted(grid_scores, key=lambda x: x.mean_validation_score, reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(**best.parameters)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
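# --- Illustrative sketch (not part of the original example) ---
# The per-candidate aggregation above weights each fold score by its number of
# test samples when iid=True and averages plainly otherwise. A made-up example:
fold_scores = [0.8, 0.9]
fold_test_sizes = [10, 30]

iid_mean = sum(s * n for s, n in zip(fold_scores, fold_test_sizes)) / float(sum(fold_test_sizes))
plain_mean = sum(fold_scores) / float(len(fold_scores))
print(iid_mean, plain_mean)   # 0.875 vs 0.85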
Example #51
    def _fit(self, Z, parameter_iterable):
        """Actual fitting,  performing the search over parameters."""
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        cv = self.cv
        cv = _check_cv(cv, Z)

        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print("Fitting {0} folds for each of {1} candidates, totalling"
                      " {2} fits".format(len(cv), n_candidates,
                                         n_candidates * len(cv)))

        base_estimator = clone(self.estimator)

        pre_dispatch = self.pre_dispatch

        out = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=pre_dispatch, backend="threading"
        )(
            delayed(_fit_and_score)(clone(base_estimator), Z, self.scorer_,
                                    train, test, self.verbose, parameters,
                                    self.fit_params, return_parameters=True,
                                    error_score=self.error_score)
            for parameters in parameter_iterable
            for train, test in cv)

        # Out is a list of triplet: score, estimator, n_test_samples
        n_fits = len(out)
        n_folds = len(cv)

        scores = list()
        grid_scores = list()
        for grid_start in range(0, n_fits, n_folds):
            n_test_samples = 0
            score = 0
            all_scores = []
            for this_score, this_n_test_samples, _, parameters in \
                    out[grid_start:grid_start + n_folds]:
                all_scores.append(this_score)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_folds)
            scores.append((score, parameters))
            # TODO: shall we also store the test_fold_sizes?
            grid_scores.append(_CVScoreTuple(
                parameters,
                score,
                np.array(all_scores)))
        # Store the computed scores
        self.grid_scores_ = grid_scores

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            best_estimator.fit(Z, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
Example #52
def cross_val_multiscore(estimator,
                         X,
                         y=None,
                         groups=None,
                         scoring=None,
                         cv=None,
                         n_jobs=1,
                         verbose=0,
                         fit_params=None,
                         pre_dispatch='2*n_jobs'):
    """Evaluate a score by cross-validation.

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.
    X : array-like, shape (n_samples, n_dimensional_features,)
        The data to fit. Can be, for example a list, or an array at least 2d.
    y : array-like, shape (n_samples, n_targets,)
        The target variable to try to predict in the case of
        supervised learning.
    groups : array-like, with shape (n_samples,)
        Group labels for the samples used while splitting the dataset into
        train/test set.
    scoring : string, callable | None
        A string (see model evaluation documentation) or
        a scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
    cv : int, cross-validation generator | iterable
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use the default 3-fold cross validation,
        - integer, to specify the number of folds in a ``(Stratified)KFold``,
        - An object to be used as a cross-validation generator.
        - An iterable yielding train, test splits.

        For integer/None inputs, if the estimator is a classifier and ``y`` is
        either binary or multiclass,
        :class:`sklearn.model_selection.StratifiedKFold` is used. In all
        other cases, :class:`sklearn.model_selection.KFold` is used.
    n_jobs : integer, optional
        The number of CPUs to use to do the computation. -1 means
        'all CPUs'.
    verbose : integer, optional
        The verbosity level.
    fit_params : dict, optional
        Parameters to pass to the fit method of the estimator.
    pre_dispatch : int, or string, optional
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. This parameter can be:

        - None, in which case all the jobs are immediately
          created and spawned. Use this for lightweight and
          fast-running jobs, to avoid delays due to on-demand
          spawning of the jobs
        - An int, giving the exact number of total jobs that are
          spawned
        - A string, giving an expression as a function of n_jobs,
          as in '2*n_jobs'

    Returns
    -------
    scores : array of float, shape (n_splits,) | shape (n_splits, n_scores)
        Array of scores of the estimator for each run of the cross validation.
    """
    # This code is copied from sklearn

    from sklearn.base import clone
    from sklearn.utils import indexable
    from sklearn.metrics.scorer import check_scoring
    from sklearn.model_selection._split import check_cv

    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    cv_iter = list(cv.split(X, y, groups))
    scorer = check_scoring(estimator, scoring=scoring)
    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    # Note: this parallelization is implemented using MNE Parallel
    parallel, p_func, n_jobs = parallel_func(_fit_and_score,
                                             n_jobs,
                                             pre_dispatch=pre_dispatch)
    scores = parallel(
        p_func(clone(estimator), X, y, scorer, train, test, verbose, None,
               fit_params) for train, test in cv_iter)
    return np.array(scores)[:, 0, ...]  # flatten over joblib output.
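# --- Illustrative usage sketch (not part of the original example) ---
# Assuming the helpers used above (parallel_func, _fit_and_score) are
# importable, cross_val_multiscore can be called like sklearn's
# cross_val_score; the estimator and data below are hypothetical.
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
X_demo = rng.randn(40, 3)
y_demo = (X_demo[:, 0] > 0).astype(int)
scores = cross_val_multiscore(LogisticRegression(), X_demo, y_demo,
                              scoring='accuracy', cv=5)
print(scores.shape)   # (5,): one score per fold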
Example #53
	def __init__(self,
				parameters,
				estimator,
				X=None,
				y=None,
				model='GCP',
				score_format = 'cv',
				fit_params=None,
				scoring=None,
				cv=5,
				acquisition_function = 'UCB',
				corr_kernel= 'squared_exponential',
				n_clusters=1,
				n_clusters_max=5,
				cluster_evol = 'constant',
				GCP_mapWithNoise=False,
				GCP_useAllNoisyY=False,
				model_noise=None,
				n_iter=100,
				n_init=10,
				n_final_iter = 5,
				n_candidates = 500,
				nugget=1.e-10,
				detailed_res=1,
				verbose=1):

		self.parameters = parameters
		self.n_parameters = len(parameters)
		self.n_iter = n_iter
		self.n_init = n_init
		self.n_final_iter = n_final_iter
		self.n_candidates = n_candidates
		self.param_names = sorted(parameters.keys())
		self.param_isInt = np.array([ 0 if (parameters[k][0]=='float') else 1 for k in self.param_names ]) 
		self.param_bounds = np.zeros((self.n_parameters,2))
		self.verbose = verbose
		self.scoring = scoring
		self.estimator = estimator
		self.fit_params = fit_params if fit_params is not None else {}
		self.cv = cv
		self.X = X
		self.y = y

		self.model = model
		self.score_format = score_format # 'cv' or 'avg'
		self.acquisition_function = acquisition_function
		self.corr_kernel = corr_kernel
		self.n_clusters = n_clusters
		self.n_clusters_max = n_clusters_max
		self.cluster_evol = cluster_evol
		self.GCP_mapWithNoise = GCP_mapWithNoise
		self.GCP_useAllNoisyY = GCP_useAllNoisyY
		self.model_noise = model_noise		
		self.GCP_upperBound_coef = 1.96
		self.nugget = nugget
		self.detailed_res = detailed_res

		self.best_parameter_ = None
		self.tested_parameters_ = None
		self.cv_scores_ = None

		if(cluster_evol != 'constant'):
			self.GCP_args = [corr_kernel, 1,GCP_mapWithNoise,GCP_useAllNoisyY,model_noise,nugget,self.GCP_upperBound_coef]
		else:
			self.GCP_args = [corr_kernel, n_clusters,GCP_mapWithNoise,GCP_useAllNoisyY,model_noise,nugget,self.GCP_upperBound_coef]
			
		if(callable(estimator)):
			self._callable_estimator = True
			if(verbose):
				print('Estimator is a callable and not an sklearn Estimator')
		else:
			self._callable_estimator = False

		if not self._callable_estimator:
			self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

		# init param_bounds
		for i in range(self.n_parameters):
			if(parameters[self.param_names[i]][0]=='cat'):
				self.param_bounds[i,0] = 0
				self.param_bounds[i,1] = len(parameters[self.param_names[i]][1])
			else:
				self.param_bounds[i] = np.array(parameters[self.param_names[i]][1])
				if(parameters[self.param_names[i]][0]=='int'):
					self.param_bounds[i,1] += 1

		if(self.verbose):
			print(self.parameters)
			print(self.param_names)
			print(self.param_isInt)
			print(self.param_bounds)
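# --- Illustrative sketch (not part of the original example) ---
# How the parameter dictionary format consumed above maps to bounds:
# categorical parameters are indexed from 0 to the number of categories, and
# integer ranges get +1 on the upper bound. The dictionary below is
# hypothetical.
import numpy as np

parameters = {'kernel': ('cat', ['rbf', 'linear']),
              'max_depth': ('int', [2, 6]),
              'learning_rate': ('float', [0.01, 0.3])}
param_names = sorted(parameters.keys())
param_bounds = np.zeros((len(param_names), 2))
for i, name in enumerate(param_names):
    ptype, spec = parameters[name]
    if ptype == 'cat':
        param_bounds[i] = [0, len(spec)]
    else:
        param_bounds[i] = np.array(spec)
        if ptype == 'int':
            param_bounds[i, 1] += 1
print(param_names)
print(param_bounds)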
Example #54
    def _fit(self, X, y):
        X, y = check_X_y(X, y, "csr")
        # Initialization
        cv = check_cv(self.cv, y, is_classifier(self.estimator))
        scorer = check_scoring(self.estimator, scoring=self.scoring)
        n_features = X.shape[1]

        if self.max_features is not None:
            if not isinstance(self.max_features, numbers.Integral):
                raise TypeError("'max_features' should be an integer between 1 and {} features."
                                " Got {!r} instead."
                                .format(n_features, self.max_features))
            elif self.max_features < 1 or self.max_features > n_features:
                raise ValueError("'max_features' should be between 1 and {} features."
                                 " Got {} instead."
                                 .format(n_features, self.max_features))
            max_features = self.max_features
        else:
            max_features = n_features

        estimator = clone(self.estimator)

        # Genetic Algorithm
        toolbox = base.Toolbox()

        toolbox.register("attr_bool", random.randint, 0, 1)
        toolbox.register("individual", tools.initRepeat,
                         creator.Individual, toolbox.attr_bool, n=n_features)
        toolbox.register("population", tools.initRepeat, list, toolbox.individual)
        toolbox.register("evaluate", _evalFunction, gaobject=self, estimator=estimator, X=X, y=y,
                         cv=cv, scorer=scorer, verbose=self.verbose, fit_params=self.fit_params,
                         max_features=max_features, caching=self.caching)
        toolbox.register("mate", tools.cxUniform, indpb=self.crossover_independent_proba)
        toolbox.register("mutate", tools.mutFlipBit, indpb=self.mutation_independent_proba)
        toolbox.register("select", tools.selTournament, tournsize=self.tournament_size)

        if self.n_jobs == 0:
            raise ValueError("n_jobs == 0 has no meaning.")
        elif self.n_jobs > 1:
            pool = multiprocessing.Pool(processes=self.n_jobs)
            toolbox.register("map", pool.map)
        elif self.n_jobs < 0:
            pool = multiprocessing.Pool(processes=max(cpu_count() + 1 + self.n_jobs, 1))
            toolbox.register("map", pool.map)

        pop = toolbox.population(n=self.n_population)
        hof = tools.HallOfFame(1, similar=np.array_equal)
        stats = tools.Statistics(lambda ind: ind.fitness.values)
        stats.register("avg", np.mean, axis=0)
        stats.register("std", np.std, axis=0)
        stats.register("min", np.min, axis=0)
        stats.register("max", np.max, axis=0)

        if self.verbose > 0:
            print("Selecting features with genetic algorithm.")

        _, log = algorithms.eaSimple(pop, toolbox, cxpb=self.crossover_proba,
                                     mutpb=self.mutation_proba, ngen=self.n_generations,
                                     stats=stats, halloffame=hof, verbose=self.verbose)
        if self.n_jobs != 1:
            pool.close()
            pool.join()

        # Set final attributes
        support_ = np.array(hof, dtype=bool)[0]
        self.estimator_ = clone(self.estimator)
        self.estimator_.fit(X[:, support_], y)

        self.generation_scores_ = np.array([score for score, _ in log.select("max")])
        self.n_features_ = support_.sum()
        self.support_ = support_

        return self
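# --- Illustrative sketch (not part of the original example) ---
# The final refit above keeps only the columns where the best individual's
# gene is 1. A tiny, made-up demonstration of that boolean masking:
import numpy as np

X = np.arange(12).reshape(3, 4)                    # 3 samples, 4 features
support_ = np.array([True, False, True, False])    # hypothetical best individual
print(X[:, support_])                              # columns 0 and 2 only
print(support_.sum())                              # n_features_ == 2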
Example #55
    def _fit(self, X, y, parameter_iterable=None):
        if parameter_iterable is not None:
            raise NotImplementedError('The parameter_iterable argument is not supported.')

        # Actual fitting,  performing the search over parameters.
        estimator = self.estimator
        cv = self.cv
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        n_samples = _num_samples(X)
        X, y = indexable(X, y)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number of samples (%i) than data (X: %i samples)'
                                 % (len(y), n_samples))

        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        base_estimator = clone(self.estimator)
        pre_dispatch = self.pre_dispatch

        # setup SigOpt experiment and run optimization
        n_folds = len(cv)
        self._create_sigopt_exp(self.sigopt_connection, n_folds)

        # start tracking time to optimize estimator
        opt_start_time = time.time()
        for jk in range(0, self.n_iter, self.n_sug):
            # check for opt timeout, ensuring at least 1 observation
            # TODO : handling failure observations
            if (
                self.opt_timeout is not None and
                time.time() - opt_start_time > self.opt_timeout and
                jk >= 1
            ):
                # break out of loop and refit model with best params so far
                break

            suggestions = []
            jobs = []
            for _ in range(self.n_sug):
                for train, test in cv:
                    suggestion = self.sigopt_connection.experiments(self.experiment.id).suggestions().create()
                    parameters = self._convert_sigopt_api_to_sklearn_assignments(suggestion.assignments.to_json())
                    suggestions.append(suggestion)
                    jobs.append([parameters, train, test])

            if self.verbose > 0:
                print('Evaluating params : ', [job[0] for job in jobs])


            # do CV folds in parallel using joblib
            # returns scores on test set
            obs_timed_out = False
            try:
                par_kwargs = {'n_jobs': self.n_jobs, 'verbose': self.verbose,
                              'pre_dispatch': pre_dispatch}
                # add timeout kwarg if version of joblib supports it
                if 'timeout' in getfullargspec(Parallel.__init__).args:
                    par_kwargs['timeout'] = self.cv_timeout
                out = Parallel(
                    **par_kwargs
                )(
                    delayed(_fit_and_score)(clone(base_estimator), X, y,
                                            self.scorer_, train, test,
                                            self.verbose, parameters,
                                            self.fit_params,
                                            return_parameters=True,
                                            error_score=self.error_score)
                        for parameters, train, test in jobs)
            except TimeoutError:
                obs_timed_out = True

            if not obs_timed_out:
                # grab scores from results
                for sidx, suggestion in enumerate(suggestions):
                    score = out[sidx][0]
                    self.sigopt_connection.experiments(self.experiment.id).observations().create(
                        suggestion=suggestion.id,
                        value=score)
            else:
                # observation timed out so report a failure
                self.sigopt_connection.experiments(self.experiment.id).observations().create(
                    suggestion=suggestion.id,
                    failed=True)

        # return best SigOpt assignments so far
        best_assignments = self.sigopt_connection.experiments(self.experiment.id).best_assignments().fetch().data

        if not best_assignments:
            raise RuntimeError(
                'No valid observations found. '
                'Make sure opt_timeout and cv_timeout provide sufficient time for observations to be reported.')

        self.best_params_ = self._convert_sigopt_api_to_sklearn_assignments(best_assignments[0].assignments.to_json())
        self.best_score_ = best_assignments[0].value

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(**self.best_params_)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
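All of these search implementations rely on the same scorer contract: check_scoring turns a scoring name (or None) into a callable with the signature scorer(estimator, X_test, y_test) -> float. As a minimal, self-contained sketch of that contract on hypothetical toy data (independent of the classes above, and only illustrating the call signature):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import check_scoring
from sklearn.model_selection import train_test_split

# hypothetical toy data, only to show how the returned scorer is called
X, y = make_classification(n_samples=200, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

estimator = LogisticRegression().fit(X_train, y_train)
scorer = check_scoring(estimator, scoring="accuracy")  # returns a callable scorer
print(scorer(estimator, X_test, y_test))               # scorer(estimator, X, y) -> float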
Example #56
    def _fit(self, X, y, parameter_dict):
        self._cv_results = None  # To indicate to the property the need to update
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
        n_samples = _num_samples(X)
        X, y = indexable(X, y)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)' %
                                 (len(y), n_samples))
        cv = check_cv(self.cv, y=y, classifier=is_classifier(self.estimator))

        creator.create("FitnessMax", base.Fitness, weights=(1.0, ))
        creator.create("Individual",
                       list,
                       est=clone(self.estimator),
                       fitness=creator.FitnessMax)

        toolbox = base.Toolbox()

        name_values, gene_type, maxints = _get_param_types_maxint(
            parameter_dict)
        if self.gene_type is None:
            self.gene_type = gene_type

        if self.verbose:
            print("Types %s and maxint %s detected" %
                  (self.gene_type, maxints))

        toolbox.register("individual",
                         _initIndividual,
                         creator.Individual,
                         maxints=maxints)
        toolbox.register("population", tools.initRepeat, list,
                         toolbox.individual)

        if self.n_jobs > 1:
            pool = Pool(processes=self.n_jobs)
            toolbox.register("map", pool.map)

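        # the evaluate callback decodes each individual into estimator parameters,
        # cross-validates it with self.scorer_ (built by check_scoring above) and
        # memoizes the result in score_cache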
        toolbox.register("evaluate",
                         _evalFunction,
                         name_values=name_values,
                         X=X,
                         y=y,
                         scorer=self.scorer_,
                         cv=cv,
                         iid=self.iid,
                         verbose=self.verbose,
                         error_score=self.error_score,
                         fit_params=self.fit_params,
                         score_cache=self.score_cache)

        toolbox.register("mate",
                         _cxIndividual,
                         indpb=self.gene_crossover_prob,
                         gene_type=self.gene_type)

        toolbox.register("mutate",
                         _mutIndividual,
                         indpb=self.gene_mutation_prob,
                         up=maxints)
        toolbox.register("select",
                         tools.selTournament,
                         tournsize=self.tournament_size)

        pop = toolbox.population(n=self.population_size)
        hof = tools.HallOfFame(1)

        # Stats
        stats = tools.Statistics(lambda ind: ind.fitness.values)
        stats.register("avg", np.nanmean)
        stats.register("min", np.nanmin)
        stats.register("max", np.nanmax)

        # History
        hist = tools.History()
        toolbox.decorate("mate", hist.decorator)
        toolbox.decorate("mutate", hist.decorator)
        hist.update(pop)

        if self.verbose:
            print('--- Evolve in {0} possible combinations ---'.format(
                np.prod(np.array(maxints) + 1)))

        pop, logbook = algorithms.eaSimple(pop,
                                           toolbox,
                                           cxpb=0.5,
                                           mutpb=0.2,
                                           ngen=self.generations_number,
                                           stats=stats,
                                           halloffame=hof,
                                           verbose=self.verbose)

        # Save History
        self.all_history_.append(hist)
        current_best_score_ = hof[0].fitness.values[0]
        current_best_params_ = _individual_to_params(hof[0], name_values)
        if self.verbose:
            print("Best individual is: %s\nwith fitness: %s" %
                  (current_best_params_, current_best_score_))

        if current_best_score_ > self.best_mem_score_:
            self.best_mem_score_ = current_best_score_
            self.best_mem_params_ = current_best_params_

        # Check memoization, potentially unknown bug
        assert str(hof[0]) in self.score_cache, \
            "Best individual not stored in score_cache for cv_results_."

        if self.n_jobs > 1:
            pool.close()
            pool.join()
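The private _evalFunction registered above is not shown in this example. As a rough, hypothetical sketch (not sklearn-deap's actual implementation, and assuming name_values is a list of (parameter_name, candidate_values) pairs), an evaluate callback wired up this way essentially does the following:

import numpy as np
from sklearn.base import clone
from sklearn.model_selection import cross_val_score

def eval_individual(individual, name_values, X, y, estimator, cv, scoring):
    # each gene is an index into the candidate values of the matching parameter
    params = {name: values[gene]
              for (name, values), gene in zip(name_values, individual)}
    est = clone(estimator).set_params(**params)
    scores = cross_val_score(est, X, y, scoring=scoring, cv=cv)
    return (np.mean(scores),)  # DEAP expects fitness as a tuple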
Example #57
    def _fit(self, X, y, parameter_iterable):
        """Actual fitting,  performing the search over parameters."""
        X, y = _indexable(X, y)

        # for debugging
        assert not isinstance(X, pd.DataFrame)
        assert not isinstance(y, pd.DataFrame)

        # begin sklearn code
        estimator = self.estimator
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        # n_samples = _num_samples(X)  # don't need for now...
        cv = self.cv
        cv = _set_cv(cv, X, y, classifier=is_classifier(estimator))

        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print("Fitting {0} folds for each of {1} candidates, totalling"
                      " {2} fits".format(len(cv), n_candidates,
                                         n_candidates * len(cv)))

        base_estimator = clone(self.estimator)
        pre_dispatch = self.pre_dispatch

        # get groups, add it to kwargs
        X, y, groups = _get_groups(X, y)
        kwargs = {'groups': groups}

        # test_score, n_samples, _, parameters
        out = _do_fit(self.n_jobs, self.verbose, pre_dispatch,
                      base_estimator, X, y, self.scorer_, parameter_iterable,
                      self.fit_params, self.error_score, cv, **kwargs)

        # out is a list of 4-tuples: (score, n_test_samples, _, parameters)
        n_fits = len(out)
        n_folds = _cv_len(cv, X, y)

        scores = list()
        grid_scores = list()
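        # fold scores are aggregated per parameter setting: with iid=True each fold
        # score is weighted by its number of test samples (a weighted mean over all
        # test points); otherwise a plain mean over the n_folds fold scores is taken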
        for grid_start in range(0, n_fits, n_folds):
            n_test_samples = 0
            score = 0
            all_scores = []
            for this_score, this_n_test_samples, _, parameters in \
                    out[grid_start:grid_start + n_folds]:
                all_scores.append(this_score)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_folds)
            scores.append((score, parameters))

            grid_scores.append(_CVScoreTuple(
                parameters,
                score,
                np.array(all_scores)))
        # Store the computed scores
        self.grid_scores_ = grid_scores

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
Example #58
    def fit(self, X, y):
        """Fit the RFE model and automatically tune the number of selected
           features.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vector, where `n_samples` is the number of samples and
            `n_features` is the total number of features.

        y : array-like, shape = [n_samples]
            Target values (integers for classification, real numbers for
            regression).
        """
        X, y = check_X_y(X, y, "csr")

        # Initialization
        cv = check_cv(self.cv, y, is_classifier(self.estimator))
        scorer = check_scoring(self.estimator, scoring=self.scoring)
        n_features = X.shape[1]
        n_features_to_select = 1

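        # self.step may be a fraction in (0, 1) of the feature count or an absolute
        # number of features to remove per iteration; e.g. step=0.1 with 50 features
        # removes int(max(1, 0.1 * 50)) = 5 features at a time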
        if 0.0 < self.step < 1.0:
            step = int(max(1, self.step * n_features))
        else:
            step = int(self.step)
        if step <= 0:
            raise ValueError("Step must be >0")

        rfe = RFE(estimator=self.estimator,
                  n_features_to_select=n_features_to_select,
                  step=self.step,
                  verbose=self.verbose)

        # Determine the number of subsets of features by fitting across
        # the train folds and choosing the "features_to_select" parameter
        # that gives the least averaged error across all folds.

        # Note that joblib raises a non-picklable error for bound methods
        # even if n_jobs is set to 1 with the default multiprocessing
        # backend.
        # This branching is done to make sure that user code that sets
        # n_jobs to 1 and provides bound methods as scorers is not broken
        # by the addition of the n_jobs parameter in version 0.18.

        if self.n_jobs == 1:
            parallel, func = list, _rfe_single_fit
        else:
            parallel, func = (Parallel(n_jobs=self.n_jobs),
                              delayed(_rfe_single_fit))

        scores = parallel(
            func(rfe, self.estimator, X, y, train, test, scorer)
            for train, test in cv.split(X, y))

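        # each fold returns one score per candidate number of features; summing
        # across folds gives a single curve whose argmax picks the subset size to keep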
        scores = np.sum(scores, axis=0)
        n_features_to_select = max(n_features - (np.argmax(scores) * step),
                                   n_features_to_select)

        # Re-execute an elimination with best_k over the whole set
        rfe = RFE(estimator=self.estimator,
                  n_features_to_select=n_features_to_select,
                  step=self.step)

        rfe.fit(X, y)

        # Set final attributes
        self.support_ = rfe.support_
        self.n_features_ = rfe.n_features_
        self.ranking_ = rfe.ranking_
        self.estimator_ = clone(self.estimator)
        self.estimator_.fit(self.transform(X), y)

        # Fixing a normalization error, n is equal to get_n_splits(X, y) - 1
        # here, the scores are normalized by get_n_splits(X, y)
        self.grid_scores_ = scores[::-1] / cv.get_n_splits(X, y)
        return self
Example #59
    def _fit(self, X, y):

        """Actual fitting,  performing the search over parameters."""
        estimator = self.estimator
        cv = self.cv
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        n_samples = _num_samples(X)
        X, y = indexable(X, y)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)'
                                 % (len(y), n_samples))

        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        base_estimator = clone(self.estimator)
        pre_dispatch = self.pre_dispatch

        # setup SigOpt experiment and run optimization
        self._create_sigopt_exp()
        for jk in range(self.n_iter):
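            # each SigOpt suggestion supplies one hyperparameter assignment; it is
            # evaluated on every CV fold below and the mean/std of the fold scores
            # is reported back as a single observation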
            suggestion = self.conn.experiments(self.experiment.id).suggestions().create()
            parameters = suggestion.assignments.to_json()
     
            # convert all unicode names and values to plain strings
            non_unicode_parameters = self._convert_unicode_dict(parameters)

            if self.verbose > 0:
                print "Evaluating params : ",non_unicode_parameters

            # do CV folds in parallel using joblib
            # returns scores on test set
            out = Parallel(
                n_jobs=self.n_jobs, verbose=self.verbose,
                pre_dispatch=pre_dispatch
            )(
                delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
                                        train, test, self.verbose, non_unicode_parameters,
                                        self.fit_params, return_parameters=True,
                                        error_score=self.error_score)
                    for train, test in cv)

            # grab scores from results
            scores = [o[0] for o in out]
            self.conn.experiments(self.experiment.id).observations().create(
                suggestion=suggestion.id,
                value=numpy.mean(scores),
                value_stddev=numpy.std(scores)
            )
              
        # return best SigOpt observation so far
        best_obs = self.conn.experiments(self.experiment.id).fetch().progress.best_observation
        self.best_params_ = best_obs.assignments.to_json()
        # convert all unicode names and values to plain strings
        self.best_params_ = self._convert_unicode_dict(self.best_params_)
        self.best_score_ = best_obs.value

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **self.best_params_)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
Example #60
    def _fit(self, X, y, parameter_dict):

        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        n_samples = _num_samples(X)
        X, y = indexable(X, y)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError(
                    "Target variable (y) has a different number "
                    "of samples (%i) than data (X: %i samples)" % (len(y), n_samples)
                )
        cv = self.cv

        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=RuntimeWarning)
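            # deap's creator.create() warns (RuntimeWarning) when FitnessMax and
            # Individual already exist from a previous fit, so re-creation is silenced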
            #  * y.shape[1]
            creator.create("FitnessMax", base.Fitness, weights=(1.0,))
            creator.create(
                "Individual", list, est=clone(self.estimator), fitness=creator.FitnessMax
            )

        toolbox = base.Toolbox()

        name_values, gene_type, maxints = _get_param_types_maxint(parameter_dict)
        if self.gene_type is None:
            self.gene_type = gene_type

        if self.verbose:
            print("Types %s and maxint %s detected" % (self.gene_type, maxints))

        toolbox.register("individual", _initIndividual, creator.Individual, maxints=maxints)
        toolbox.register("population", tools.initRepeat, list, toolbox.individual)

        toolbox.register(
            "evaluate",
            _evalFunction,
            name_values=name_values,
            X=X,
            y=y,
            scorer=self.scorer_,
            cv=cv,
            iid=self.iid,
            verbose=self.verbose,
            error_score=self.error_score,
            fit_params=self.fit_params,
        )

        toolbox.register(
            "mate", _cxIndividual, indpb=self.gene_crossover_prob, gene_type=self.gene_type
        )

        toolbox.register("mutate", _mutIndividual, indpb=self.gene_mutation_prob, up=maxints)
        toolbox.register("select", tools.selTournament, tournsize=self.tournament_size)

        if self.n_jobs > 1:
            pool = Pool(processes=self.n_jobs)
            toolbox.register("map", pool.map)
        pop = toolbox.population(n=self.population_size)
        hof = tools.HallOfFame(1)
        stats = tools.Statistics(lambda ind: ind.fitness.values)
        stats.register("avg", np.mean)
        stats.register("min", np.min)
        stats.register("max", np.max)

        if self.verbose:
            msg_template = "--- Evolve in {0} possible combinations ---"
            print(msg_template.format(np.prod(np.array(maxints) + 1)))

        pop, logbook = algorithms.eaSimple(
            pop,
            toolbox,
            cxpb=0.5,
            mutpb=0.2,
            ngen=self.generations_number,
            stats=stats,
            halloffame=hof,
            verbose=self.verbose,
        )
        print(hof[0].fitness.values)

        current_best_score_ = hof[0].fitness.values[0]
        current_best_params_ = _individual_to_params(hof[0], name_values)

        print("cbp", current_best_params_)
        if self.verbose:
            print(
                "Best individual is: %s\nwith fitness: %s"
                % (current_best_params_, current_best_score_)
            )

        if current_best_score_ > self.best_score_:
            self.best_score_ = current_best_score_
            self.best_params_ = current_best_params_