Example 1
def _generate_meta_estimator_instances_with_pipeline():
    """Generate instances of meta-estimators fed with a pipeline

    All estimators accepting one of "estimator", "base_estimator" or
    "estimators" are considered meta-estimators.
    """
    for _, Estimator in sorted(all_estimators()):
        sig = set(signature(Estimator).parameters)

        if "estimator" in sig or "base_estimator" in sig or "regressor" in sig:
            if is_regressor(Estimator):
                estimator = make_pipeline(TfidfVectorizer(), Ridge())
                param_grid = {"ridge__alpha": [0.1, 1.0]}
            else:
                estimator = make_pipeline(TfidfVectorizer(),
                                          LogisticRegression())
                param_grid = {"logisticregression__C": [0.1, 1.0]}

            if "param_grid" in sig or "param_distributions" in sig:
                # SearchCV estimators
                extra_params = {"n_iter": 2} if "n_iter" in sig else {}
                yield Estimator(estimator, param_grid, **extra_params)
            else:
                yield Estimator(estimator)

        elif "transformer_list" in sig:
            # FeatureUnion
            transformer_list = [
                ("trans1", make_pipeline(TfidfVectorizer(), MaxAbsScaler())),
                (
                    "trans2",
                    make_pipeline(TfidfVectorizer(),
                                  StandardScaler(with_mean=False)),
                ),
            ]
            yield Estimator(transformer_list)

        elif "estimators" in sig:
            # stacking, voting
            if is_regressor(Estimator):
                estimator = [
                    ("est1", make_pipeline(TfidfVectorizer(),
                                           Ridge(alpha=0.1))),
                    ("est2", make_pipeline(TfidfVectorizer(), Ridge(alpha=1))),
                ]
            else:
                estimator = [
                    (
                        "est1",
                        make_pipeline(TfidfVectorizer(),
                                      LogisticRegression(C=0.1)),
                    ),
                    ("est2",
                     make_pipeline(TfidfVectorizer(),
                                   LogisticRegression(C=1))),
                ]
            yield Estimator(estimator)

        else:
            continue
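
A minimal sketch of how such a generator could drive a parametrized smoke test; this is an added illustration, assuming pytest is available and the generator above is importable (all sklearn estimators expose get_params):

import pytest

@pytest.mark.parametrize(
    "estimator",
    list(_generate_meta_estimator_instances_with_pipeline()),
    ids=lambda est: type(est).__name__,
)
def test_meta_estimator_smoke(estimator):
    # Only checks construction and parameter introspection; fitting on text
    # data is exercised separately (see Example 15 below).
    assert estimator.get_params() is not None
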
Example 2
    def _convert_train_data(self, X, y, weights=None):
        """Convert the training data to a form accepted by Lolo

        Args:
            X (ndarray): Input variables
            y (ndarray): Output variables
            weights (ndarray): Weights for each sample
        Returns:
            train_data (JavaObject): Pointer to the zipped training data in Java
            weights (JavaObject): Pointer to the sample weights in Java
        """

        # Make some default weights
        if weights is None:
            weights = np.ones(len(y))

        # Convert y and w to float64 or int32 with native ordering
        y = np.array(y, dtype=np.float64 if is_regressor(self) else np.int32)
        weights = np.array(weights, dtype=np.float64)

        # Convert X, y, and w to Java Objects
        X_java = send_feature_array(self.gateway, X)
        if self._num_outputs == 1:
            y_java = send_1D_array(self.gateway, y, is_regressor(self))
        else:
            y_java = send_feature_array(self.gateway, y)

        assert y_java.length() == len(y) == len(X)
        w_java = send_1D_array(self.gateway, weights, True)
        assert w_java.length() == len(weights)

        return self.gateway.jvm.io.citrine.lolo.util.LoloPyDataLoader.zipTrainingData(
            X_java, y_java), w_java
Example 3
    def _convert_train_data(self, X, y, weights=None):
        """Convert the training data to a form accepted by Lolo

        Args:
            X (ndarray): Input variables
            y (ndarray): Output variables
            weights (ndarray): Weights for each sample
        Returns:
            train_data (JavaObject): Pointer to the zipped training data in Java
            weights (JavaObject): Pointer to the sample weights in Java
        """

        # Make some default weights
        if weights is None:
            weights = np.ones(len(y))

        # Convert X and w to float64, and y to float64 or int32, with native ordering
        X = np.array(X, dtype=np.float64)
        y = np.array(y, dtype=np.float64 if is_regressor(self) else np.int32)
        weights = np.array(weights, dtype=np.float64)
        big_end = sys.byteorder == "big"

        # Convert X and y to Java Objects
        X_java = self.gateway.jvm.io.citrine.lolo.util.LoloPyDataLoader.getFeatureArray(
            X.tobytes(), X.shape[1], big_end)
        y_java = self.gateway.jvm.io.citrine.lolo.util.LoloPyDataLoader.get1DArray(
            y.tobytes(), is_regressor(self), big_end)
        assert y_java.length() == len(y) == len(X)
        w_java = self.gateway.jvm.io.citrine.lolo.util.LoloPyDataLoader.get1DArray(
            np.array(weights).tobytes(), True, big_end)
        assert w_java.length() == len(weights)

        return self.gateway.jvm.io.citrine.lolo.util.LoloPyDataLoader.zipTrainingData(
            X_java, y_java), w_java
Example 4
def yield_all_checks(name, estimator):
    tags = estimator._get_tags()
    if "2darray" not in tags["X_types"]:
        warnings.warn("Can't test estimator {} which requires input "
                      " of type {}".format(name, tags["X_types"]),
                      SkipTestWarning)
        return
    if tags["_skip_test"]:
        warnings.warn("Explicit SKIP via _skip_test tag for estimator "
                      "{}.".format(name),
                      SkipTestWarning)
        return

    yield from _yield_checks(name, estimator)
    if is_classifier(estimator):
        yield from _yield_classifier_checks(name, estimator)
    if is_regressor(estimator):
        yield from _yield_regressor_checks(name, estimator)
    if hasattr(estimator, 'transform'):
        if not tags["allow_variable_length"]:
            # Transformer tests ensure that shapes are the same at fit and
            # transform time, hence we need to skip them for estimators that
            # allow variable-length inputs
            yield from _yield_transformer_checks(name, estimator)
    if isinstance(estimator, ClusterMixin):
        yield from _yield_clustering_checks(name, estimator)
    if is_outlier_detector(estimator):
        yield from _yield_outliers_checks(name, estimator)
    # We are not strict on presence/absence of the 3rd dimension
    # yield check_fit2d_predict1d

    if not tags["non_deterministic"]:
        yield check_methods_subset_invariance

    yield check_fit2d_1sample
    yield check_fit2d_1feature
    yield check_fit1d
    yield check_get_params_invariance
    yield check_set_params
    yield check_dict_unchanged
    yield check_dont_overwrite_parameters
    yield check_fit_idempotent

    if (is_classifier(estimator) or
            is_regressor(estimator) or
            isinstance(estimator, ClusterMixin)):
        if tags["allow_variable_length"]:
            yield check_different_length_fit_predict_transform
Example 5
def check_regressor(regressor=None, random_state=None):
    """Check if a regressor is given and if it is valid, otherwise set default regressor.

    Parameters
    ----------
    regressor : sklearn-like regressor, optional, default=None
        The regressor to validate; if None, a default regressor is used.
    random_state : int, RandomState instance or None, default=None
        Used to set random_state of the default regressor.

    Returns
    -------
    regressor : sklearn-like regressor
        The default regressor if None was given, otherwise a clone of the
        given regressor.

    Raises
    ------
    ValueError
        If the given regressor is not a valid sklearn-like regressor.
    """
    if regressor is None:
        regressor = GradientBoostingRegressor(max_depth=5,
                                              random_state=random_state)
    else:
        if not is_regressor(regressor):
            raise ValueError(
                f"`regressor` should be a sklearn-like regressor, "
                f"but found: {regressor}")
        regressor = clone(regressor)
    return regressor
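
A brief usage sketch (an illustration added here, not from the original source; Ridge is just one example of a valid sklearn regressor):

from sklearn.linear_model import Ridge

default_reg = check_regressor(random_state=0)   # GradientBoostingRegressor(max_depth=5, random_state=0)
custom_reg = check_regressor(Ridge(alpha=1.0))  # a clone of the given regressor
try:
    check_regressor("not a regressor")          # fails the is_regressor check
except ValueError as exc:
    print(exc)
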
Example 6
    def decision_function(self, x):
        """output f(x) for given samples
        
        Parameters
        ---------
        x : array-like of shape (n_samples, d)
            containing the input dataset
        Returns
        -------
        np.array of shape (n_samples,)
            containing f(x) 
        """
        check_is_fitted(self, "reg")
        x = x.copy()

        if is_regressor(self):
            pred = stats.predict(
                self.reg,
                pd.DataFrame(
                    x, columns=['x' + str(i + 1) for i in range(x.shape[1])]))
        elif is_classifier(self):
            pred = stats.predict(
                self.reg.reg,
                pd.DataFrame(
                    x, columns=['x' + str(i + 1) for i in range(x.shape[1])]))

        return pred.flatten()
Example 7
    def decision_function(self, x):
        """output f(x) for given samples

        Parameters
        ---------
        x : array-like of shape (n_samples, 1)
            containing the input dataset
        Returns
        -------
        np.array of shape (n_samples,)
            containing f(x)
        """

        check_is_fitted(self, "sm_")
        x = x.copy()
        x[x < self.xmin] = self.xmin
        x[x > self.xmax] = self.xmax
        if isinstance(self.sm_, (np.ndarray, np.integer, int, np.floating, float)):
            pred = self.sm_ * np.ones(x.shape[0])
        else:
            if is_classifier(self):
                pred = bigsplines.predict_bigssg(self.sm_,
                                                 ro.r("data.frame")(x=x))[1]
            if is_regressor(self):
                pred = bigsplines.predict_bigspline(self.sm_,
                                                    ro.r("data.frame")(x=x))
        return pred
Example 8
def get_estimator_params(estimator):
    """Get estimator parameters.

    Tip:
        Check Sklearn-Neptune integration
        `documentation <https://docs-beta.neptune.ai/essentials/integrations/machine-learning-frameworks/sklearn>`_
        for the full example.

    Args:
        estimator (:obj:`estimator`):
            | Scikit-learn estimator from which to log parameters.

    Returns:
        ``dict`` with all parameters mapped to their values.

    Examples:
        .. code:: python3

            import neptune.new.integrations.sklearn as npt_utils

            rfr = RandomForestRegressor()

            run = neptune.init(project='my_workspace/my_project')
            run['estimator/params'] = npt_utils.get_estimator_params(rfr)
    """
    assert is_regressor(estimator) or is_classifier(estimator) or isinstance(estimator, KMeans), \
        'Estimator should be sklearn regressor, classifier or kmeans clusterer.'

    return estimator.get_params()
Example 9
    def train(self, x_train, y_train):
        md = self.model(random_state=SEED)

        if ('base_estimator' in md.get_params()
                and md.get_params()['base_estimator'] is None):
            if is_classifier(self.model):
                base_param = {
                    'base_estimator': DecisionTreeClassifier(random_state=SEED)
                }

            elif is_regressor(self.model):
                base_param = {
                    'base_estimator': DecisionTreeRegressor(random_state=SEED)
                }

            md = md.set_params(**base_param)

        if self.estimator in self.hyper.keys():
            md.set_params(**self.hyper[self.estimator]['params'])

        gt_train, pd_train, md = self._cross_validation(md,
                                                        x_train,
                                                        y_train,
                                                        run_only_once=True)
        matches = self._report(gt_train, pd_train, md, prefix='train')

        return gt_train, pd_train, matches, self.output
Example 10
    def __init__(self,
                 X,
                 t,
                 y,
                 cv,
                 outcome_learner,
                 effect_learner,
                 outcome_param_grid={},
                 effect_param_grid={},
                 params_outcome={},
                 params_effect={}):

        super().__init__(X, t, y, cv)
        self.outcome_learner = copy.deepcopy(outcome_learner) \
            .set_params(**params_outcome)
        self.outcome_param_grid = outcome_param_grid
        self.effect_learner = copy.deepcopy(effect_learner) \
            .set_params(**params_effect)
        self.effect_param_grid = effect_param_grid

        # Check if outcome learner is regressor or classifier, and initialize the
        # appropriate metalearner object
        if is_regressor(self.outcome_learner):
            self.meta_learner_class = BaseRRegressor
        else:
            self.meta_learner_class = BaseRClassifier
        self._make_meta_learner()
Example 11
    def _stack_layers(self, *layers, default=None):
        if is_classifier(self):
            stack = partial(StackingClassifier, cv=self.cv,
                            n_jobs=self.n_jobs, passthrough=self.passthrough, verbose=0)
        elif is_regressor(self):
            stack = partial(StackingRegressor, cv=self.cv,
                            n_jobs=self.n_jobs, passthrough=self.passthrough, verbose=0)

        layers = list(layers)

        if len(layers) == 1:
            if isinstance(layers[0], list):
                # Name each estimator before handing the list to the stacker.
                layers[0] = [(str(i) + '_' + layers[0][i].__class__.__name__,
                              layers[0][i]) for i in range(len(layers[0]))]
                return stack(layers[0], default)
            else:
                return layers[0]

        elif len(layers) == 2:
            if not isinstance(layers[0], list):
                layers[0] = [layers[0]]

            if not isinstance(layers[1], list):
                layers[1] = [layers[1]]

            layers[0] = [(str(i) + '_' + layers[0][i].__class__.__name__, layers[0][i]) for i in range(len(layers[0]))]

            return self._stack_layers(*[stack(layers[0], estimator) for estimator in layers[1]], default=default)

        elif len(layers) > 2:
            return self._stack_layers(self._stack_layers(layers[0], layers[1], default=default),
                                      *layers[2:], default=default)

        raise Exception("at least one layer must be provided")
Example 12
    def _validate_hyperparameters(self) -> None:
        """Validate the hyperparameters."""
        if not (hasattr(self.input_to_node, "fit")
                and hasattr(self.input_to_node, "fit_transform")
                and hasattr(self.input_to_node, "transform")):
            raise TypeError("All input_to_node should be transformers and"
                            "implement fit and transform '{0}' (type {1}) "
                            "doesn't".format(self.input_to_node,
                                             type(self.input_to_node)))

        if not (hasattr(self.node_to_node, "fit")
                and hasattr(self.node_to_node, "fit_transform")
                and hasattr(self.node_to_node, "transform")):
            raise TypeError("All node_to_node should be transformers and"
                            "implement fit and transform '{0}' (type {1}) "
                            "doesn't".format(self.node_to_node,
                                             type(self.node_to_node)))

        if (self._requires_sequence != "auto"
                and not isinstance(self._requires_sequence, bool)):
            raise ValueError('Invalid value for requires_sequence, got {0}'
                             .format(self._requires_sequence))

        if not is_regressor(self._regressor):
            raise TypeError("The last step should be a regressor and "
                            "implement fit and predict '{0}' (type {1})"
                            "doesn't".format(self._regressor,
                                             type(self._regressor)))
Example 13
    def _validate_hyperparameters(self):
        """
        Validates the hyperparameters.

        Returns
        -------
        """
        self.random_state = check_random_state(self.random_state)

        if not (hasattr(self.input_to_node, "fit")
                and hasattr(self.input_to_node, "fit_transform")
                and hasattr(self.input_to_node, "transform")):
            raise TypeError("All input_to_node should be transformers "
                            "and implement fit and transform "
                            "'%s' (type %s) doesn't" %
                            (self.input_to_node, type(self.input_to_node)))

        if self._chunk_size is not None and (
                not isinstance(self._chunk_size, int) or self._chunk_size < 0):
            raise ValueError('Invalid value for chunk_size, got {0}'.format(
                self._chunk_size))

        if not is_regressor(self._regressor):
            raise TypeError("The last step should be a regressor "
                            "and implement fit and predict"
                            "'%s' (type %s) doesn't" %
                            (self._regressor, type(self._regressor)))
Example 14
 def __init__(self,
              estimator,
              n_estimators=100,
              perc=100,
              alpha=0.05,
              two_step=True,
              max_iter=100,
              random_state=None,
              verbose=0,
              importance_type='gini',
              scale_permutation_bytree=False):
     self.estimator = estimator
     self.n_estimators = n_estimators
     self.perc = perc
     self.alpha = alpha
     self.two_step = two_step
     self.max_iter = max_iter
     self.random_state = random_state
     self.verbose = verbose
     self.importance_type = importance_type
     self.scale_permutation_bytree = scale_permutation_bytree
     if is_classifier(self.estimator):
         self.task = 'classification'
     elif is_regressor(self.estimator):
         self.task = 'regression'
     else:
         self.task = 'other'
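
The same is_classifier / is_regressor dispatch shown in isolation; this is an added sketch using standard sklearn estimators (detect_task is a hypothetical helper, not part of the wrapper class above):

from sklearn.base import is_classifier, is_regressor
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

def detect_task(estimator):
    # Mirrors the branching above: classifiers, regressors, everything else.
    if is_classifier(estimator):
        return 'classification'
    elif is_regressor(estimator):
        return 'regression'
    return 'other'

assert detect_task(RandomForestClassifier()) == 'classification'
assert detect_task(RandomForestRegressor()) == 'regression'
assert detect_task(KMeans()) == 'other'  # neither classifier nor regressor
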
Example 15
def test_meta_estimators_delegate_data_validation(estimator):
    # Check that meta-estimators delegate data validation to the inner
    # estimator(s).
    rng = np.random.RandomState(0)
    set_random_state(estimator)

    n_samples = 30
    X = rng.choice(np.array(["aa", "bb", "cc"], dtype=object), size=n_samples)

    if is_regressor(estimator):
        y = rng.normal(size=n_samples)
    else:
        y = rng.randint(3, size=n_samples)

    # We convert to lists to make sure it works on array-like
    X = _enforce_estimator_tags_x(estimator, X).tolist()
    y = _enforce_estimator_tags_y(estimator, y).tolist()

    # Calling fit should not raise any data validation exception since X is a
    # valid input datastructure for the first step of the pipeline passed as
    # base estimator to the meta estimator.
    estimator.fit(X, y)

    # n_features_in_ should not be defined since data is not tabular data.
    assert not hasattr(estimator, "n_features_in_")
Example 16
def test_StackerClassifier():

    np.random.seed(123)
    X = np.random.randn(100, 10)
    y = 1 * (np.random.randn(100) > 0)

    stacker = StackerClassifier(
        models=[
            RandomForestClassifier(random_state=123),
            LogisticRegression(C=1, random_state=123)
        ],
        cv=10,
        blender=LogisticRegression(C=1, random_state=123),
    )

    stacker.fit(X, y)

    yhat = stacker.predict(X)

    assert yhat.ndim == 1
    assert yhat.shape[0] == X.shape[0]

    assert sorted(set(yhat)) == [0, 1]
    assert list(stacker.classes_) == [0, 1]

    yhat_proba = stacker.predict_proba(X)

    assert yhat_proba.shape == (y.shape[0], 2)

    assert not is_regressor(stacker)
    assert is_classifier(stacker)
Example 17
    def __init__(self,
                 X,
                 t,
                 y,
                 cv,
                 base_learner,
                 param_grid={},
                 params_treat={},
                 params_control={}):

        super().__init__(X, t, y, cv)
        self.treatment_outcome_learner = copy.deepcopy(base_learner) \
            .set_params(**params_treat)
        self.control_outcome_learner = copy.deepcopy(base_learner) \
            .set_params(**params_control)
        self.param_grid = param_grid

        # Check if base learner is regressor or classifier, and initialize the
        # appropriate metalearner object
        if is_regressor(self.treatment_outcome_learner):
            self.meta_learner_class = BaseTRegressor
            self.scoring = "r2"
        else:
            self.meta_learner_class = BaseTClassifier
            self.scoring = "neg_log_loss"
        self._make_meta_learner()
Example 18
 def __init__(self, X, t, y, cv, outcome_learner, effect_learner, 
              outcome_param_grid = {}, effect_param_grid = {},
              params_treat = {}, params_control = {},
              params_treat_effect = {}, params_control_effect = {}):
     
     super().__init__(X, t, y, cv)
     self.treatment_outcome_learner = copy.deepcopy(outcome_learner) \
                                         .set_params(**params_treat)
     self.control_outcome_learner = copy.deepcopy(outcome_learner) \
                                     .set_params(**params_control)
     self.outcome_param_grid = outcome_param_grid  # param grid for outcome learner
     self.treatment_effect_learner = copy.deepcopy(effect_learner) \
                                         .set_params(**params_treat_effect)
     self.control_effect_learner = copy.deepcopy(effect_learner) \
                                         .set_params(**params_control_effect)
     self.effect_param_grid = effect_param_grid # param grid for effect learner
     # Flag to tune the outcome learners. Set to True if params_treat or 
     # params_control is the empty dictionary. The outcome learners are tuned
     # exactly the same way as those for a T-learner, so we don't have to
     # repeat this part of the tuning.
     self.tune_outcome_learners = (len(params_treat) == 0) or \
                                         (len(params_control) == 0)
     
     # Check if outcome learner is regressor or classifier, and initialize the 
     # appropriate metalearner object
     if is_regressor(self.treatment_outcome_learner):
         self.meta_learner_class = BaseXRegressor
         self.scoring = "r2"
     else:
         self.meta_learner_class = BaseXClassifier
         self.scoring = "neg_log_loss"
     self._make_meta_learner()
Example 19
def predict(features_df, mdl, target_class_names=None, cols_to_save=None):
    from sklearn.base import is_classifier, is_regressor
    import pandas as pd

    if cols_to_save is not None:
        existing_cols_to_save = list(
            set(cols_to_save).intersection(features_df.columns))
        res_df = features_df[existing_cols_to_save].copy()
    else:
        res_df = pd.DataFrame()

    if is_classifier(mdl):
        pred = mdl.predict_proba(features_df)

        if pred.shape[1] == 2:
            res_df['target_proba'] = pred[:, 1]
        else:
            if target_class_names is None:
                target_class_names = [
                    f'class{i}' for i in range(pred.shape[1])
                ]

            for i, label in enumerate(target_class_names):
                res_df[label] = pred[:, i]
    elif is_regressor(mdl):
        res_df['pred'] = mdl.predict(features_df)
    else:
        raise AttributeError('unknown model type')

    return res_df
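
A hedged usage sketch for the helper above; the column names, toy data and LogisticRegression model are illustrative assumptions, not taken from the original project:

import pandas as pd
from sklearn.linear_model import LogisticRegression

features_df = pd.DataFrame({"x1": [0.1, 0.4, 0.9, 0.3],
                            "x2": [1.0, 0.2, 0.5, 0.8]})
mdl = LogisticRegression().fit(features_df, [0, 1, 1, 0])

res = predict(features_df, mdl, cols_to_save=["x1"])
# Binary classifier -> columns ["x1", "target_proba"];
# a regressor would instead produce a single "pred" column.
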
Example 20
    def _check_arguments(self, base_estimator, n_initial_points,
                         acq_optimizer):
        """Check arguments for sanity."""

        if isinstance(base_estimator, str):
            base_estimator = cook_estimator(base_estimator,
                                            space=self.space,
                                            random_state=self.rng)
        if not is_regressor(base_estimator):
            raise ValueError("%s has to be a regressor." % base_estimator)
        self.base_estimator_ = base_estimator

        if n_initial_points < 0:
            raise ValueError("Expected `n_initial_points` >= 0, got %d" %
                             n_initial_points)
        self._n_initial_points = n_initial_points
        self.n_initial_points_ = n_initial_points

        if acq_optimizer == "auto":
            if has_gradients(self.base_estimator_):
                acq_optimizer = "sampling"
            else:
                acq_optimizer = "lbfgs"

        if acq_optimizer not in ["lbfgs", "sampling"]:
            raise ValueError("Expected acq_optimizer to be 'lbfgs' or "
                             "'sampling', got {0}".format(acq_optimizer))

        if (not has_gradients(self.base_estimator_)
                and acq_optimizer != "sampling"):
            raise ValueError("The regressor {0} should run with "
                             "acq_optimizer='sampling'".format(
                                 type(base_estimator)))

        self.acq_optimizer = acq_optimizer
Example 21
    def _report(self, gt, pred, model, prefix):
        os.makedirs(self.output, exist_ok=True)
        pickle.dump(model, open(os.path.join(self.output, 'model.sav'), 'wb'))
        matches = None

        if is_classifier(self.model):
            with open(
                    os.path.join(self.output,
                                 f'{prefix}_classification_report.txt'),
                    'w') as f:
                matches = [gt, pred]
                report = classification_report(gt, pred, digits=4,
                                               zero_division=True)
                f.write(report)
                print(report)

        elif is_regressor(self.model):
            with open(os.path.join(self.output, f'{prefix}_rmse_accuracy.txt'),
                      'w') as f:
                matches, report = self._acc_tolerance(gt,
                                                      pred,
                                                      tolerance=[0, 1, 2, 3])
                f.write('\n'.join(report))
                print('\n'.join(report))

        return matches
Example 22
def test_linear():
    print('\ntest_linear():')
    rs = np.random.RandomState(42)
    index = range(1000)
    X = np.hstack((np.linspace(0., 10., 1000).reshape(-1, 1),
                   np.linspace(-1., 1., 1000).reshape(-1, 1),
                   rs.random(1000).reshape(-1, 1)))
    transformation = rs.random(size=(3, 2))
    y = np.matmul(X, transformation)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=10,
                                                        random_state=42)
    reg = IncrementalRegression()
    assert is_regressor(reg)

    for prt in np.array_split(index, 3):
        reg.partial_fit(X[prt, :], y[prt, :])

    y_reg = reg.predict(X_test)
    print("tests: {0}\nregr: {1}".format(y_test, y_reg))
    np.testing.assert_allclose(y_reg, y_test, rtol=.01, atol=.15)
Example 23
def get_pickled_model(estimator):
    """Get pickled estimator.

    Tip:
        Check Sklearn-Neptune integration
        `documentation <https://docs-beta.neptune.ai/essentials/integrations/machine-learning-frameworks/sklearn>`_
        for the full example.

    Args:
        estimator (:obj:`estimator`):
            | Scikit-learn estimator to pickle.

    Returns:
        ``neptune.types.File`` object that you can assign to run's ``base_namespace``.

    Examples:
        .. code:: python3

            import neptune.new.integrations.sklearn as npt_utils

            rfr = RandomForestRegressor()

            run = neptune.init(project='my_workspace/my_project')
            run['estimator/pickled_model'] = npt_utils.get_pickled_model(rfr)
    """
    assert is_regressor(estimator) or is_classifier(estimator), \
        'Estimator should be sklearn regressor or classifier.'

    return neptune.types.File.as_pickle(estimator)
Example 24
def find_sklearn_model(name, mt="regression"):
    """Given string name, find the sklearn object and module associated."""
    if isinstance(name, str):
        for pkg in skpackages():
            if hasattr(pkg, name):
                return getattr(pkg, name)(), pkg.__name__
        # if we've reached here, the thing isn't in a known package - let's search the shorthand space
        _mt = model_types()
        q = _mt.query("Short == @name")
        # if we just have one row, return
        if q.shape[0] == 1:
            return find_sklearn_model(q.index[0])
        elif q.shape[0] > 1:
            # determine whether we want a classifier or a regressor
            return find_sklearn_model(q[q["ModelType"] == mt].index[0])
        else:
            raise ValueError(
                "name '{}' does not exist as a model".format(name))
    elif is_classifier(name):
        return name, name.__module__.rsplit(".", 1)[0]
    elif is_regressor(name):
        return name, name.__module__.rsplit(".", 1)[0]
    else:
        raise TypeError(
            "model '{}' not recognized as scikit-learn model.".format(name))
Example 25
def _predict_binary(estimator, X):
    """Make predictions using a single binary estimator."""
    if is_regressor(estimator):
        return estimator.predict(X)
    # probabilities of the positive class
    score = estimator.predict_proba(X)[:, 1]
    return score
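
A short added sketch exercising both branches with standard estimators (the toy data is assumed):

import numpy as np
from sklearn.linear_model import LogisticRegression, Ridge

X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([0, 0, 1, 1])

reg_scores = _predict_binary(Ridge().fit(X, y), X)               # raw regression output
clf_scores = _predict_binary(LogisticRegression().fit(X, y), X)  # P(y == 1 | x)
assert reg_scores.shape == clf_scores.shape == (4,)
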
Example 26
def test_OutSamplerTransformer_regressor(multi_output):

    np.random.seed(123)
    X = np.random.randn(100, 10)
    if multi_output:
        y = np.random.randn(100, 2)
    else:
        y = np.random.randn(100)

    model = OutSamplerTransformer(RandomForestRegressor(n_estimators=10,
                                                        random_state=123),
                                  cv=10)
    model.fit(X, y)

    y1 = model.model.predict(X)
    y2 = model.transform(X)

    assert not is_classifier(model)
    assert not is_regressor(model)

    if multi_output:
        assert np.abs(y1[:, 0] - y2[:, 0]).max() <= 10**(-10)
        assert np.abs(y1[:, 1] - y2[:, 1]).max() <= 10**(-10)
        assert y2.shape == (100, 2)

        assert model.get_feature_names() == [
            "output%d__RandomForestRegressor__target" % d
            for d in range(y.shape[1])
        ]

    else:
        assert np.abs(y1 - y2[:, 0]).max() <= 10**(-10)
        assert y2.shape == (100, 1)

        assert model.get_feature_names() == ["RandomForestRegressor__target"]
Example 27
 def new_scorer(estimator, X, Y):
     if is_regressor(estimator):
         return regr_scorer(estimator, X, Y)
     elif is_classifier(estimator):
         return class_scorer(estimator, X, Y)
     else:
         raise ValueError("Not supported type of Estimator")
Example 28
def test_StackerRegressor():

    np.random.seed(123)
    X = np.random.randn(100, 10)
    y = np.random.randn(100)

    stacker = StackerRegressor(
        models=[
            RandomForestRegressor(n_estimators=10, random_state=123),
            Ridge(random_state=123)
        ],
        cv=10,
        blender=Ridge(random_state=123),
    )

    stacker.fit(X, y)

    yhat = stacker.predict(X)

    assert yhat.ndim == 1
    assert yhat.shape[0] == X.shape[0]

    assert is_regressor(stacker)
    assert not is_classifier(stacker)

    with pytest.raises(AttributeError):
        stacker.predict_proba(X)

    with pytest.raises(AttributeError):
        stacker.classes_
Example 29
def cook_estimator(base_estimator, space=None, **kwargs):
    """
    Cook a default estimator.

    Parameters
    ----------
    * `base_estimator` ["GP", "RF", "ET", "GBRT" or sklearn regressor, default="GP"]:
        Should inherit from `sklearn.base.RegressorMixin`.
        In addition, the `predict` method should have an optional `return_std`
        argument, which returns `std(Y | x)` along with `E[Y | x]`.
        If base_estimator is one of ["GP", "RF", "ET", "GBRT"], a default
        surrogate model of the corresponding type is used, matching what is
        used in the minimize functions.

    * `space` [Space instance]:
        Has to be provided if the base_estimator is a gaussian process.
        Ignored otherwise.

    * `kwargs` [dict]:
        Extra parameters provided to the base_estimator at init time.
    """
    if space is not None:
        n_dims = space.transformed_n_dims
        is_cat = space.is_categorical
    if isinstance(base_estimator, str):
        base_estimator = base_estimator.upper()
        if base_estimator not in ["GP", "ET", "RF", "GBRT"]:
            raise ValueError("Valid strings for the base_estimator parameter "
                             "are: 'RF', 'ET' or 'GP' not %s" % base_estimator)
    elif not is_regressor(base_estimator):
        raise ValueError("base_estimator has to be a regressor.")

    if base_estimator == "GP":
        cov_amplitude = ConstantKernel(1.0, (0.01, 1000.0))
        if is_cat:
            other_kernel = HammingKernel(length_scale=np.ones(n_dims))
        else:
            other_kernel = Matern(length_scale=np.ones(n_dims),
                                  length_scale_bounds=[(0.01, 100)] * n_dims,
                                  nu=2.5)

        base_estimator = GaussianProcessRegressor(kernel=cov_amplitude *
                                                  other_kernel,
                                                  normalize_y=True,
                                                  random_state=None,
                                                  alpha=0.0,
                                                  noise="gaussian",
                                                  n_restarts_optimizer=2)
    elif base_estimator == "RF":
        base_estimator = RandomForestRegressor(n_estimators=100,
                                               min_samples_leaf=3)
    elif base_estimator == "ET":
        base_estimator = ExtraTreesRegressor(n_estimators=100,
                                             min_samples_leaf=3)
    elif base_estimator == "GBRT":
        gbrt = GradientBoostingRegressor(n_estimators=30, loss="quantile")
        base_estimator = GradientBoostingQuantileRegressor(base_estimator=gbrt)

    base_estimator.set_params(**kwargs)
    return base_estimator
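
Illustrative calls (added here; the parameter values are arbitrary) following the signature above, where extra keyword arguments are forwarded to the surrogate via set_params:

rf = cook_estimator("RF", n_estimators=10)     # RandomForestRegressor(n_estimators=10, min_samples_leaf=3)
et = cook_estimator("ET", min_samples_leaf=1)  # ExtraTreesRegressor with the leaf size overridden
# Anything that is neither a recognized string nor a regressor is rejected:
# cook_estimator(object()) raises ValueError("base_estimator has to be a regressor.")
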
Example 30
def test_staged_predict(HistGradientBoosting, X, y):

    # Test whether staged predictor eventually gives
    # the same prediction.
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.5,
                                                        random_state=0)
    gb = HistGradientBoosting(max_iter=10)

    # test raise NotFittedError if not fitted
    with pytest.raises(NotFittedError):
        next(gb.staged_predict(X_test))

    gb.fit(X_train, y_train)

    # test if the staged predictions of each iteration
    # are equal to the corresponding predictions of the same estimator
    # trained from scratch.
    # this also tests the limit case when max_iter = 1
    method_names = (["predict"] if is_regressor(gb) else
                    ["predict", "predict_proba", "decision_function"])
    for method_name in method_names:

        staged_method = getattr(gb, "staged_" + method_name)
        staged_predictions = list(staged_method(X_test))
        assert len(staged_predictions) == gb.n_iter_
        for n_iter, staged_predictions in enumerate(staged_method(X_test), 1):
            aux = HistGradientBoosting(max_iter=n_iter)
            aux.fit(X_train, y_train)
            pred_aux = getattr(aux, method_name)(X_test)

            assert_allclose(staged_predictions, pred_aux)
            assert staged_predictions.shape == pred_aux.shape
Example 31
    def _check_arguments(self, base_estimator, n_initial_points,
                         acq_optimizer, dimensions):
        """Check arguments for sanity."""

        if isinstance(base_estimator, str):
            base_estimator = cook_estimator(
                base_estimator, space=dimensions, random_state=self.rng)

        if not is_regressor(base_estimator) and base_estimator is not None:
            raise ValueError(
                "%s has to be a regressor." % base_estimator)

        if "ps" in self.acq_func:
            self.base_estimator_ = MultiOutputRegressor(base_estimator)
        else:
            self.base_estimator_ = base_estimator

        if n_initial_points < 0:
            raise ValueError(
                "Expected `n_initial_points` >= 0, got %d" % n_initial_points)
        self._n_initial_points = n_initial_points
        self.n_initial_points_ = n_initial_points

        if acq_optimizer == "auto":
            if has_gradients(self.base_estimator_):
                acq_optimizer = "lbfgs"
            else:
                acq_optimizer = "sampling"

        if acq_optimizer not in ["lbfgs", "sampling"]:
            raise ValueError("Expected acq_optimizer to be 'lbfgs' or "
                             "'sampling', got {0}".format(acq_optimizer))

        if (not has_gradients(self.base_estimator_) and
            acq_optimizer != "sampling"):
            raise ValueError("The regressor {0} should run with "
                             "acq_optimizer"
                             "='sampling'.".format(type(base_estimator)))

        self.acq_optimizer = acq_optimizer
Example 32
    def __init__(self, models):
        """Proxy class to build an ensemble of models with an API as one

        Parameters
        ----------
        models: array
            An array of models
        """
        self._models = models if len(models) else None
        if self._models is not None:
            if is_classifier(self._models[0]):
                check_type = is_classifier
                self._scoring_fun = accuracy_score
            elif is_regressor(self._models[0]):
                check_type = is_regressor
                self._scoring_fun = r2_score
            else:
                raise ValueError('Expected regressors or classifiers,'
                                 ' got %s instead' % type(self._models[0]))
            for model in self._models:
                if not check_type(model):
                    raise ValueError('Different types of models found, provide'
                                     ' either regressors or classifiers.')
Example 33
    def fit(self, X, y):
        """Fit a receptive field model.

        Parameters
        ----------
        X : array, shape (n_times[, n_epochs], n_features)
            The input features for the model.
        y : array, shape (n_times[, n_epochs], n_outputs)
            The output features for the model.

        Returns
        -------
        self : instance
            The instance so you can chain operations.
        """
        if self.scoring not in _SCORERS.keys():
            raise ValueError('scoring must be one of %s, got '
                             '%s' % (sorted(_SCORERS.keys()), self.scoring))
        from sklearn.base import is_regressor, clone
        X, y = self._check_dimensions(X, y)

        # Initialize delays
        self.delays_ = _times_to_delays(self.tmin, self.tmax, self.sfreq)

        # Define the slice that we should use in the middle
        self.keep_samples_ = _delays_to_slice(self.delays_)

        if isinstance(self.estimator, numbers.Real):
            estimator = TimeDelayingRidge(self.tmin, self.tmax, self.sfreq,
                                          alpha=self.estimator,
                                          fit_intercept=self.fit_intercept)
        elif is_regressor(self.estimator):
            estimator = clone(self.estimator)
        else:
            raise ValueError('`estimator` must be a float or an instance'
                             ' of `BaseEstimator`,'
                             ' got type %s.' % type(self.estimator))
        self.estimator_ = estimator
        del estimator
        _check_estimator(self.estimator_)

        # Create input features
        n_times, n_epochs, n_feats = X.shape

        # Update feature names if we have none
        if self.feature_names is None:
            self.feature_names = ['feature_%s' % ii for ii in range(n_feats)]
        if len(self.feature_names) != n_feats:
            raise ValueError('n_features in X does not match feature names '
                             '(%s != %s)' % (n_feats, len(self.feature_names)))

        # Create input features
        X_del, y = self._delay_and_reshape(X, y)
        self.estimator_.fit(X_del, y)

        coefs = get_coef(self.estimator_, 'coef_')
        coefs = coefs.reshape([-1, n_feats, len(self.delays_)])
        if len(coefs) == 1:
            # Remove a singleton first dimension if only 1 output
            coefs = coefs[0]
        self.coef_ = coefs
        return self
Example 34
def cook_estimator(base_estimator, space=None, **kwargs):
    """
    Cook a default estimator.

    For the special base_estimator called "DUMMY" the return value is None.
    This corresponds to sampling points at random, hence there is no need
    for an estimator.

    Parameters
    ----------
    * `base_estimator` ["GP", "RF", "ET", "GBRT", "DUMMY"
                        or sklearn regressor, default="GP"]:
        Should inherit from `sklearn.base.RegressorMixin`.
        In addition, the `predict` method should have an optional `return_std`
        argument, which returns `std(Y | x)` along with `E[Y | x]`.
        If base_estimator is one of ["GP", "RF", "ET", "GBRT", "DUMMY"], a
        surrogate model corresponding to the relevant `X_minimize` function
        is created.

    * `space` [Space instance]:
        Has to be provided if the base_estimator is a gaussian process.
        Ignored otherwise.

    * `kwargs` [dict]:
        Extra parameters provided to the base_estimator at init time.
    """
    if isinstance(base_estimator, str):
        base_estimator = base_estimator.upper()
        if base_estimator not in ["GP", "ET", "RF", "GBRT", "DUMMY"]:
            raise ValueError("Valid strings for the base_estimator parameter "
                             " are: 'RF', 'ET', 'GP', 'GBRT' or 'DUMMY' not "
                             "%s." % base_estimator)
    elif not is_regressor(base_estimator):
        raise ValueError("base_estimator has to be a regressor.")

    if base_estimator == "GP":
        if space is not None:
            space = Space(space)
            space = Space(normalize_dimensions(space.dimensions))
            n_dims = space.transformed_n_dims
            is_cat = space.is_categorical

        else:
            raise ValueError("Expected a Space instance, not None.")

        cov_amplitude = ConstantKernel(1.0, (0.01, 1000.0))
        # only special if *all* dimensions are categorical
        if is_cat:
            other_kernel = HammingKernel(length_scale=np.ones(n_dims))
        else:
            other_kernel = Matern(
                length_scale=np.ones(n_dims),
                length_scale_bounds=[(0.01, 100)] * n_dims, nu=2.5)

        base_estimator = GaussianProcessRegressor(
            kernel=cov_amplitude * other_kernel,
            normalize_y=True, noise="gaussian",
            n_restarts_optimizer=2)
    elif base_estimator == "RF":
        base_estimator = RandomForestRegressor(n_estimators=100,
                                               min_samples_leaf=3)
    elif base_estimator == "ET":
        base_estimator = ExtraTreesRegressor(n_estimators=100,
                                             min_samples_leaf=3)
    elif base_estimator == "GBRT":
        gbrt = GradientBoostingRegressor(n_estimators=30, loss="quantile")
        base_estimator = GradientBoostingQuantileRegressor(base_estimator=gbrt)

    elif base_estimator == "DUMMY":
        return None

    base_estimator.set_params(**kwargs)
    return base_estimator
Example 35
def forest_minimize(func, dimensions, base_estimator='et', n_calls=100,
                    n_points=1000, n_random_starts=10, x0=None, y0=None,
                    n_jobs=1, random_state=None, acq="EI", xi=0.01, kappa=1.96):
    """Sequential optimisation using decision trees.

    A tree based regression model is used to model the expensive to evaluate
    function `func`. The model is improved by sequentially evaluating
    the expensive function at the next best point. Thereby finding the
    minimum of `func` with as few evaluations as possible.

    The total number of evaluations, `n_calls`, is spent as follows.
    If `x0` is provided but not `y0`, then the elements of `x0`
    are first evaluated, followed by `n_random_starts` evaluations.
    Finally, `n_calls - len(x0) - n_random_starts` evaluations are
    made guided by the surrogate model. If `x0` and `y0` are both
    provided then `n_random_starts` evaluations are first made then
    `n_calls - n_random_starts` subsequent evaluations are made
    guided by the surrogate model.

    Parameters
    ----------
    * `func` [callable]:
        Function to minimize. Should take an array of parameters and
        return the function values.

    * `dimensions` [list, shape=(n_dims,)]:
        List of search space dimensions.
        Each search dimension can be defined either as

        - a `(lower_bound, upper_bound)` tuple (for `Real` or `Integer`
          dimensions),
        - a `(lower_bound, upper_bound, "prior")` tuple (for `Real`
          dimensions),
        - as a list of categories (for `Categorical` dimensions), or
        - an instance of a `Dimension` object (`Real`, `Integer` or
          `Categorical`).

    * `base_estimator` [string or `Regressor`, default=`"et"`]:
        The regressor to use as surrogate model. Can be either

        - `"rf"` for random forest regressor
        - `"et"` for extra trees regressor
        - `"dt"` for single decision tree regressor
        - instance of regressor with support for `return_std` in its predict
          method

        The predefined models are initialized with good defaults. If you
        want to adjust the model parameters pass your own instance of
        a regressor which returns the mean and standard deviation when
        making predictions.

    * `n_calls` [int, default=100]:
        Number of calls to `func`.

    * `n_random_starts` [int, default=10]:
        Number of evaluations of `func` with random initialization points
        before approximating the `func` with `base_estimator`.

    * `n_points` [int, default=1000]:
        Number of points to sample when minimizing the acquisition function.

    * `x0` [list, list of lists or `None`]:
        Initial input points.

        - If it is a list of lists, use it as a list of input points.
        - If it is a list, use it as a single initial input point.
        - If it is `None`, no initial input points are used.

    * `y0` [list, scalar or `None`]:
        Evaluation of initial input points.

        - If it is a list, then it corresponds to evaluations of the function
          at each element of `x0` : the i-th element of `y0` corresponds
          to the function evaluated at the i-th element of `x0`.
        - If it is a scalar, then it corresponds to the evaluation of the
          function at `x0`.
        - If it is None and `x0` is provided, then the function is evaluated
          at each element of `x0`.

    * `n_jobs` [int, default=1]:
        The number of jobs to run in parallel for `fit` and `predict`.
        If -1, then the number of jobs is set to the number of cores.

    * `random_state` [int, RandomState instance, or None (default)]:
        Set random state to something other than None for reproducible
        results.

    * `acq` [string, default=`"EI"`]:
        Function to minimize over the forest posterior. Can be either

        - `"LCB"` for lower confidence bound,
        - `"EI"` for expected improvement,
        - `"PI"` for probability of improvement.

    * `xi` [float, default=0.01]:
        Controls how much improvement one wants over the previous best
        values. Used when the acquisition is either `"EI"` or `"PI"`.

    * `kappa` [float, default=1.96]:
        Controls how much of the variance in the predicted values should be
        taken into account. If set to be very high, then we are favouring
        exploration over exploitation and vice versa.
        Used when the acquisition is `"LCB"`.

    Returns
    -------
    * `res` [`OptimizeResult`, scipy object]:
        The optimization result returned as a OptimizeResult object.
        Important attributes are:

        - `x` [list]: location of the minimum.
        - `fun` [float]: function value at the minimum.
        - `models`: surrogate models used for each iteration.
        - `x_iters` [list of lists]: location of function evaluation for each
           iteration.
        - `func_vals` [array]: function value for each iteration.
        - `space` [Space]: the optimization space.
        - `specs` [dict]: the call specifications.

        For more details related to the OptimizeResult object, refer
        http://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.OptimizeResult.html
    """
    # Save call args + rng
    specs = {"args": copy.copy(inspect.currentframe().f_locals),
             "function": inspect.currentframe().f_code.co_name}

    # Check params
    rng = check_random_state(random_state)

    # Default estimator
    if isinstance(base_estimator, str):
        if base_estimator not in ("rf", "et", "dt"):
            raise ValueError(
                "Valid values for the base_estimator parameter"
                " are: 'rf', 'et' or 'dt', not '%s'" % base_estimator)

        if base_estimator == "rf":
            base_estimator = RandomForestRegressor(n_estimators=100,
                                                   min_samples_leaf=3,
                                                   n_jobs=n_jobs,
                                                   random_state=rng)

        elif base_estimator == "et":
            base_estimator = ExtraTreesRegressor(n_estimators=100,
                                                 min_samples_leaf=3,
                                                 n_jobs=n_jobs,
                                                 random_state=rng)

        elif base_estimator == "dt":
            base_estimator = DecisionTreeRegressor(min_samples_leaf=3,
                                                   random_state=rng)

    else:
        if not is_regressor(base_estimator):
            raise ValueError("The base_estimator parameter has to either"
                             " be a string or a regressor instance."
                             " '%s' is neither." % base_estimator)

    res = _tree_minimize(func, dimensions, base_estimator,
                         n_calls=n_calls,
                         n_points=n_points, n_random_starts=n_random_starts,
                         x0=x0, y0=y0, random_state=random_state, acq=acq,
                         xi=xi, kappa=kappa)
    res.specs = specs

    return res
Example 36
def analyse_results(
        regular_cv_results, permutation_cv_results, labels, estimator,
        base_folder=None, analysis_folder='analysis', feature_names=None,
        learning_task=None, vs_analysis=None,
        threshold=.75, model_assessment_options=None,
        score_surfaces_options=None):
    """Summary and plot generation."""

    # learning_task follows the convention of
    # sklearn.utils.multiclass.type_of_target
    if learning_task is None:
        if is_regressor(estimator):
            learning_task = 'continuous'
        else:
            learning_task = type_of_target(labels)

    # Create an empty dictionary which will contain the key results
    # of the analysis
    analysis_summary = dict()

    # Run the appropriate analysis according to the learning_task
    is_regression = learning_task.lower() in ('continuous', 'regression')
    if is_regression:
        # Perform regression analysis
        target = 'regression'
    elif learning_task.lower() == 'multiclass':
        target = 'multiclass'
    else:
        # Perform classification analysis
        target = 'classification'

    # Support for empty regular or permutation tests
    performance_regular = performance_metrics(
        regular_cv_results, labels, target)
    performance_permutation = performance_metrics(
        permutation_cv_results, labels, target)
    if base_folder is not None and analysis_folder is not None:
        analysis_folder = os.path.join(base_folder, analysis_folder)
        if not os.path.exists(analysis_folder):
            os.makedirs(analysis_folder)

        # ### Create two separate folders for figures in different formats
        try:
            os.mkdir(os.path.join(analysis_folder, 'figures_pdf'))
            os.mkdir(os.path.join(analysis_folder, 'figures_png'))
        except OSError:
            pass  # if folder already exists, ignore it
    else:
        analysis_folder = None

    if model_assessment_options is None:
        model_assessment_options = {}
    # Handle variable selection step
    if vs_analysis is not None:
        # Get feature names
        if feature_names is None:
            # what follows creates [feat_0, feat_1, ..., feat_d]
            # feature_names = 'feat_' + np.arange(
            #     labels.size).astype(str).astype(object)
            raise ValueError(
                "Variable selection analysis was specified, but no feature "
                "names were provided.")

        feature_names = np.array(feature_names)  # force feature names to array
        if threshold is None:
            threshold = .75
        selected = {}
        # Init variable selection containers
        selected['regular'] = dict(zip(feature_names,
                                       np.zeros(len(feature_names))))
        selected['permutation'] = selected['regular'].copy()

        n_splits_regular = len(list(regular_cv_results.values() or [[]])[0])
        n_splits_permutation = len(list(permutation_cv_results.values() or [[]])[0])
        n_jobs = {'regular': n_splits_regular,
                  'permutation': n_splits_permutation}
        names_ = ('regular', 'permutation')
        cv_results_ = (regular_cv_results, permutation_cv_results)
        for batch_name, cv_result in zip(names_, cv_results_):
            # cv_result['estimator'] is a list containing
            # the grid-search estimators
            estimators = cv_result.get('estimator', None)
            if estimators is None:
                continue  # in case of no permutations skip this iter
            for estimator in estimators:
                selected_list = get_selected_list(
                    estimator, vs_analysis)
                if len(selected_list) < 1:
                    continue
                selected_variables = feature_names[selected_list]

                for var in selected_variables:
                    selected[batch_name][var] += 1. / n_jobs[batch_name]

            # Save selected variables textual summary
            if analysis_folder is not None:
                save_signature(os.path.join(
                    analysis_folder, 'signature_%s.txt' % batch_name),
                    selected[batch_name], threshold)

            # Also save the frequency list as an entry of the analysis summary

            # Create an empty pandas dataframe to store the frequencies
            df_tmp = pd.DataFrame(columns=['Frequency'])

            for k in reversed(sorted(
                    selected[batch_name], key=selected[batch_name].__getitem__)):
                df_tmp.loc[k] = selected[batch_name][k] * 100

            # Add the dataframe to the analysis summary
            analysis_summary['selection_frequency_{}'.format(batch_name)] = df_tmp

        feat_arr_r = np.array(list(iteritems(selected['regular'])), dtype=object)
        feat_arr_p = np.array(list(iteritems(selected['permutation'])), dtype=object)

        # sort by name
        feat_arr_r = feat_arr_r[feat_arr_r[:, 0].argsort()]
        feat_arr_p = feat_arr_p[feat_arr_p[:, 0].argsort()]

        # Save graphical summary
        plotting.feature_frequencies(
            feat_arr_r, analysis_folder,
            threshold=threshold)

        plotting.features_manhattan(
            feat_arr_r, feat_arr_p, analysis_folder,
            threshold=threshold)

        plotting.select_over_threshold(
            feat_arr_r, feat_arr_p, analysis_folder,
            threshold=threshold)

    # Generate distribution plots
    # And save distributions in analysis summary
    for i, metric in enumerate(performance_regular):
        plotting.distributions(
            v_regular=performance_regular[metric],
            v_permutation=performance_permutation.get(metric, []),
            base_folder=analysis_folder,
            metric=metric,
            first_run=i == 0,
            is_regression=is_regression)

        v_regular = performance_regular[metric]
        v_permutation = performance_permutation.get(metric, [])

        metric_values = dict()
        metric_values['values_regular'] = v_regular
        metric_values['values_permutation'] = v_permutation

        r_mean, r_sd = np.nanmean(v_regular), np.nanstd(v_regular)
        p_mean, p_sd = np.nanmean(v_permutation), np.nanstd(v_permutation)
        rstest = stats.ks_2samp(v_regular, v_permutation)

        metric_values['mean_regular'] = r_mean
        metric_values['sd_regular'] = r_sd

        metric_values['mean_permutation'] = p_mean
        metric_values['sd_permutation'] = p_sd

        metric_values['rstest'] = rstest

        analysis_summary['metric_{}'.format(metric)] = metric_values

    # Generate surfaces
    # This has meaning only if the estimator is an instance of GridSearchCV
    if isinstance(estimator, BaseSearchCV):
        if score_surfaces_options is None:
            score_surfaces_options = {}
        plotting.score_surfaces(
            param_grid=estimator.param_grid,
            results=regular_cv_results,
            base_folder=analysis_folder,
            is_regression=is_regression,
            **score_surfaces_options)

    # Finally, save in the analysis folder the pickled summary
    if analysis_folder is not None:
        with open(os.path.join(analysis_folder, 'summary.pkl'), 'wb') as af:
            pkl.dump(analysis_summary, af)
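As a side note on the block above: the regular-vs-permutation comparison boils down to a two-sample Kolmogorov-Smirnov test on the metric distributions. A minimal, self-contained sketch of that step, with made-up score arrays (only numpy and scipy are needed):

# Sketch of the regular-vs-permutation comparison computed above,
# using made-up score arrays in place of real cross-validation results.
import numpy as np
from scipy import stats

v_regular = np.array([0.81, 0.79, 0.84, 0.80, 0.82])      # scores on true labels
v_permutation = np.array([0.52, 0.48, 0.55, 0.50, 0.47])  # scores on shuffled labels

summary = {
    'mean_regular': np.nanmean(v_regular),
    'sd_regular': np.nanstd(v_regular),
    'mean_permutation': np.nanmean(v_permutation),
    'sd_permutation': np.nanstd(v_permutation),
    # ks_2samp returns (statistic, p-value); a small p-value suggests the
    # regular scores do not come from the permutation (chance) distribution
    'rstest': stats.ks_2samp(v_regular, v_permutation),
}
print(summary)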
Esempio n. 37
0
    def __init__(self, dimensions, base_estimator="gp",
                 n_random_starts=None, n_initial_points=10,
                 acq_func="gp_hedge",
                 acq_optimizer="auto",
                 random_state=None, acq_func_kwargs=None,
                 acq_optimizer_kwargs=None):

        self.rng = check_random_state(random_state)

        # Configure acquisition function

        # Store and create the acquisition function set
        self.acq_func = acq_func
        self.acq_func_kwargs = acq_func_kwargs

        allowed_acq_funcs = ["gp_hedge", "EI", "LCB", "PI", "EIps", "PIps"]
        if self.acq_func not in allowed_acq_funcs:
            raise ValueError("expected acq_func to be in %s, got %s" %
                             (",".join(allowed_acq_funcs), self.acq_func))

        # treat hedging method separately
        if self.acq_func == "gp_hedge":
            self.cand_acq_funcs_ = ["EI", "LCB", "PI"]
            self.gains_ = np.zeros(3)
        else:
            self.cand_acq_funcs_ = [self.acq_func]

        if acq_func_kwargs is None:
            acq_func_kwargs = dict()
        self.eta = acq_func_kwargs.get("eta", 1.0)

        # Configure counters of points

        # Check `n_random_starts` deprecation first
        if n_random_starts is not None:
            warnings.warn(("n_random_starts will be removed in favour of "
                           "n_initial_points."),
                          DeprecationWarning)
            n_initial_points = n_random_starts

        if n_initial_points < 0:
            raise ValueError(
                "Expected `n_initial_points` >= 0, got %d" % n_initial_points)
        self._n_initial_points = n_initial_points
        self.n_initial_points_ = n_initial_points

        # Configure estimator

        # build base_estimator if doesn't exist
        if isinstance(base_estimator, str):
            base_estimator = cook_estimator(
                base_estimator, space=dimensions,
                random_state=self.rng.randint(0, np.iinfo(np.int32).max))

        # check if regressor
        if not is_regressor(base_estimator) and base_estimator is not None:
            raise ValueError(
                "%s has to be a regressor." % base_estimator)

        # treat per-second acquisition functions specially
        is_multi_regressor = isinstance(base_estimator, MultiOutputRegressor)
        if "ps" in self.acq_func and not is_multi_regressor:
            self.base_estimator_ = MultiOutputRegressor(base_estimator)
        else:
            self.base_estimator_ = base_estimator

        # Configure optimizer

        # decide optimizer based on gradient information
        if acq_optimizer == "auto":
            if has_gradients(self.base_estimator_):
                acq_optimizer = "lbfgs"
            else:
                acq_optimizer = "sampling"

        if acq_optimizer not in ["lbfgs", "sampling"]:
            raise ValueError("Expected acq_optimizer to be 'lbfgs' or "
                             "'sampling', got {0}".format(acq_optimizer))

        if (not has_gradients(self.base_estimator_) and
            acq_optimizer != "sampling"):
            raise ValueError("The regressor {0} should run with "
                             "acq_optimizer"
                             "='sampling'.".format(type(base_estimator)))
        self.acq_optimizer = acq_optimizer

        # record other arguments
        if acq_optimizer_kwargs is None:
            acq_optimizer_kwargs = dict()

        self.n_points = acq_optimizer_kwargs.get("n_points", 10000)
        self.n_restarts_optimizer = acq_optimizer_kwargs.get(
            "n_restarts_optimizer", 5)
        n_jobs = acq_optimizer_kwargs.get("n_jobs", 1)
        self.n_jobs = n_jobs
        self.acq_optimizer_kwargs = acq_optimizer_kwargs

        # Configure search space

        # normalize space if GP regressor
        if isinstance(self.base_estimator_, GaussianProcessRegressor):
            dimensions = normalize_dimensions(dimensions)
        self.space = Space(dimensions)

        # record categorical and non-categorical indices
        self._cat_inds = []
        self._non_cat_inds = []
        for ind, dim in enumerate(self.space.dimensions):
            if isinstance(dim, Categorical):
                self._cat_inds.append(ind)
            else:
                self._non_cat_inds.append(ind)

        # Initialize storage for optimization

        self.models = []
        self.Xi = []
        self.yi = []

        # Initialize cache for `ask` method responses

        # This ensures that multiple calls to `ask` with n_points set
        # return same sets of points. Reset to {} at every call to `tell`.
        self.cache_ = {}
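A hedged usage sketch of the constructor above. It assumes the class is named Optimizer (the name is not shown in the snippet) and that it exposes the usual skopt-style ask/tell interface, where ask proposes the next point and tell records an observation and refits the surrogate; the toy objective and the single real dimension are made up:

# Illustrative ask/tell loop; assumes the skopt-style interface on the
# class whose __init__ is shown above (name assumed to be Optimizer).
opt = Optimizer(dimensions=[(-2.0, 2.0)],   # one real dimension as a bounds tuple
                base_estimator="gp",
                n_initial_points=5,
                acq_func="gp_hedge",
                random_state=0)

for _ in range(15):
    x = opt.ask()                # next candidate (random during the initial phase)
    y = (x[0] - 0.3) ** 2        # made-up objective to minimize
    opt.tell(x, y)               # record the observation

best = min(zip(opt.yi, opt.Xi))  # (lowest objective value, corresponding point)
print(best)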
Esempio n. 38
0
def forest_minimize(func, dimensions, base_estimator='rf', maxiter=100,
                    n_points=100, n_start=10, random_state=None):
    """Sequential optimisation using decision trees.

    A tree-based regression model is used to model the expensive-to-evaluate
    function `func`. The model is improved by sequentially evaluating
    the expensive function at the next best point, thereby finding the
    minimum of `func` with as few evaluations as possible.

    Parameters
    ----------
    * `func` [callable]:
        Function to minimize. Should take an array of parameters and
        return the function value.

    * `dimensions` [list, shape=(n_dims,)]:
        List of search space dimensions.
        Each search dimension can be defined either as

        - a `(lower_bound, upper_bound)` tuple (for `Real` or `Integer`
          dimensions),
        - a `(lower_bound, upper_bound, "prior")` tuple (for `Real`
          dimensions),
        - as a list of categories (for `Categorical` dimensions), or
        - an instance of a `Dimension` object (`Real`, `Integer` or
          `Categorical`).

    * `base_estimator` [string or `Regressor`, default=`"rf"`]:
        The regressor to use as surrogate model. Can be either

        - `"rf"` for random forest regressor
        - `"et"` for extra trees regressor
        - `"dt"` for single decision tree regressor
        - instance of regressor with support for `return_std` in its predict
          method

        The predefined models are initialized with good defaults. If you
        want to adjust the model parameters, pass your own instance of
        a regressor that returns the mean and standard deviation when
        making predictions.

    * `maxiter` [int, default=100]:
        Number of iterations used to find the minimum. This corresponds
        to the total number of evaluations of `func`. If `n_start` > 0
        only `maxiter - n_start` additional evaluations of `func` are
        made that are guided by the surrogate model.

    * `n_start` [int, default=10]:
        Number of random points to draw before fitting `base_estimator`
        for the first time. If `n_start = maxiter` this degrades to
        a random search for the minimum.

    * `n_points` [int, default=100]:
        Number of points to sample when minimizing the acquisition function.

    * `random_state` [int, RandomState instance, or None (default)]:
        Set random state to something other than None for reproducible
        results.

    Returns
    -------
    * `res` [`OptimizeResult`, scipy object]:
        The optimization result returned as an OptimizeResult object.
        Important attributes are:

        - `x` [float]: location of the minimum.
        - `fun` [float]: function value at the minimum.
        - `models`: surrogate models used for each iteration.
        - `x_iters` [array]: location of function evaluation for each
           iteration.
        - `func_vals` [array]: function value for each iteration.
        - `space` [Space]: the optimisation space.

        For more details on the OptimizeResult object, refer to
        http://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.OptimizeResult.html
    """
    rng = check_random_state(random_state)

    if isinstance(base_estimator, str):
        if base_estimator not in ("rf", "et", "dt"):
            raise ValueError("Valid values for the base_estimator parameter"
                             " are: 'rf', 'et' or 'dt', not '%s'" % base_estimator)

        if base_estimator == "rf":
            base_estimator = RandomForestRegressor(min_samples_leaf=10,
                                                   random_state=rng)

        elif base_estimator == "et":
            base_estimator = ExtraTreesRegressor(min_samples_leaf=10,
                                                 random_state=rng)

        elif base_estimator == "dt":
            base_estimator = DecisionTreeRegressor(min_samples_leaf=10,
                                                   random_state=rng)

    else:
        if not is_regressor(base_estimator):
            raise ValueError("The base_estimator parameter has to either"
                             " be a string or a regressor instance."
                             " '%s' is neither." % base_estimator)

    return _tree_minimize(func, dimensions, base_estimator, maxiter=maxiter,
                          n_points=n_points, n_start=n_start,
                          random_state=random_state)
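A short usage sketch of forest_minimize under its documented signature; the quadratic objective and the one-dimensional search space below are made up:

# Illustrative call matching the signature documented above.
def objective(params):
    x, = params                 # params is an array with one value per dimension
    return (x - 0.5) ** 2       # made-up function to minimize

res = forest_minimize(objective,
                      dimensions=[(-1.0, 1.0)],  # one Real dimension as a bounds tuple
                      base_estimator="rf",       # random forest surrogate
                      maxiter=30,                # total evaluations of the objective
                      n_start=10,                # random points before fitting the forest
                      random_state=0)
print(res.x, res.fun)           # location and value of the best point found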