Example #1
0
def test_bound_scaler() -> None:
    """Check the data ordering of an instrumentation and the output of BoundScaler.

    The test first verifies that ``p.helpers.list_data`` lists the data parameters
    in the same order as the legacy ``split_as_data_parameters``. It then transforms
    an all-ones point and an all-zeros point with ``utils.BoundScaler`` (using the
    identity as second argument), feeds each result back as standardized data,
    and compares the resulting parameter values with the expected ones.
    """
    # NOTE: argument order matters here, since it is compared to the legacy ordering
    reference = p.Instrumentation(
        p.Array(shape=(1, 2)).set_bounds(-12, 12, method="arctan"),
        p.Array(shape=(2, )).set_bounds(-12, 12, full_range_sampling=False),
        lr=p.Log(lower=0.001, upper=1000),
        stuff=p.Scalar(lower=-1, upper=2),
        unbounded=p.Scalar(lower=-1, init=0.0),
        value=p.Scalar(),
        letter=p.Choice("abc"),
    )
    # the legacy split method must yield the data parameters in the same order
    legacy_order = [pair[1] for pair in split_as_data_parameters(reference)]
    assert p.helpers.list_data(reference) == legacy_order

    # bounds check: transform a point made of ones only
    child = reference.spawn_child()
    bound_scaler = utils.BoundScaler(child)
    ones = [1.0] * child.dimension
    child.set_standardized_data(bound_scaler.transform(ones, lambda x: x))
    (first_array, second_array), kwargs = child.value
    np.testing.assert_array_almost_equal(first_array, [[12, 12]])
    np.testing.assert_array_almost_equal(second_array, [1, 1])
    for key, expected_value in (("stuff", 2), ("unbounded", 1), ("value", 1)):
        assert kwargs[key] == expected_value
    assert kwargs["lr"] == pytest.approx(1000)

    # same procedure with a point made of zeros only (the middle point)
    zeros = [0] * child.dimension
    child.set_standardized_data(bound_scaler.transform(zeros, lambda x: x))
    _, middle_kwargs = child.value
    assert middle_kwargs["lr"] == pytest.approx(1.0)
    assert middle_kwargs["stuff"] == pytest.approx(0.5)
Example #2
0
def test_bound_scaler() -> None:
    """Check the parameter values obtained from the output of BoundScaler.

    An all-ones point and an all-zeros point are transformed with
    ``utils.BoundScaler`` (using the identity as second argument), each result
    is fed back as standardized data, and the resulting parameter values are
    compared with the expected ones.
    """
    reference = p.Instrumentation(
        p.Array(shape=(1, 2)).set_bounds(-12, 12, method="arctan"),
        p.Array(shape=(2, )).set_bounds(-12, 12, full_range_sampling=False),
        lr=p.Log(lower=0.001, upper=1000),
        stuff=p.Scalar(lower=-1, upper=2),
        unbounded=p.Scalar(lower=-1, init=0.0),
        value=p.Scalar(),
        letter=p.Choice("abc"),
    )
    child = reference.spawn_child()
    bound_scaler = utils.BoundScaler(child)

    # first, a point made of ones only
    ones = [1.0] * child.dimension
    child.set_standardized_data(bound_scaler.transform(ones, lambda x: x))
    (first_array, second_array), kwargs = child.value
    np.testing.assert_array_almost_equal(first_array, [[12, 12]])
    np.testing.assert_array_almost_equal(second_array, [1, 1])
    for key, expected_value in (("stuff", 2), ("unbounded", 1), ("value", 1)):
        assert kwargs[key] == expected_value
    np.testing.assert_almost_equal(kwargs["lr"], 1000)

    # then a point made of zeros only (the middle point)
    zeros = [0] * child.dimension
    child.set_standardized_data(bound_scaler.transform(zeros, lambda x: x))
    _, middle_kwargs = child.value
    np.testing.assert_almost_equal(middle_kwargs["lr"], 1.0)
    np.testing.assert_almost_equal(middle_kwargs["stuff"], 0.5)
Example #3
0
    def __init__(
        self,
        regressor: str,
        data_dimension: tp.Optional[int] = None,
        dataset: str = "artificial",
        overfitter: bool = False
    ) -> None:
        """Configure the tuning problem for the requested kind of regressor.

        Parameters
        ----------
        regressor: str
            selects which hyperparameters are optimized; one of
            "decision_tree_depth", "any", "decision_tree" or "mlp"
            (any other value triggers an assertion failure)
        data_dimension: int or None
            must be provided if and only if "artificial" appears in `dataset`
        dataset: str
            name of the dataset
        overfitter: bool
            the evaluation parameters use ``noise_free = not overfitter``,
            while the optimized function always uses ``noise_free=False``
        """
        # shared by the descriptors and by the initialization registration
        init_kwargs: tp.Dict[str, tp.Any] = {
            "regressor": regressor,
            "data_dimension": data_dimension,
            "dataset": dataset,
            "overfitter": overfitter,
        }
        self.regressor = regressor
        self.data_dimension = data_dimension
        self.dataset = dataset
        self.overfitter = overfitter
        self._descriptors: tp.Dict[str, tp.Any] = {}
        self.add_descriptors(**init_kwargs)
        self.name = regressor + f"Dim{data_dimension}"
        self.num_data = 120  # default for artificial function
        self._cross_val_num = 10  # number of cross validation
        # A dimension is only meaningful for artificial data, not for a real world dataset.
        assert ("artificial" in dataset) == (data_dimension is not None)

        # Training set and test set storage.
        self.X: np.ndarray = np.array([])
        self.y: np.ndarray
        self.X_train: np.ndarray
        self.y_train: np.ndarray
        self.X_test: np.ndarray
        self.y_test: np.ndarray

        # Cross-validation splits storage.
        self.X_train_cv: tp.List[tp.Any] = []  # list of training subsets
        self.X_valid_cv: tp.List[tp.Any] = []  # list of validation subsets
        self.y_train_cv: tp.List[tp.Any] = []
        self.y_valid_cv: tp.List[tp.Any] = []

        # params: keyword arguments fixed for the optimized function
        # evalparams: keyword arguments kept for evaluation (copied from params when left empty)
        params: tp.Dict[str, tp.Any]
        evalparams: tp.Dict[str, tp.Any] = {}
        if regressor == "decision_tree_depth":
            # The depth is the only optimized parameter, everything else is fixed.
            parametrization = p.Instrumentation(depth=p.Scalar(lower=1, upper=1200).set_integer_casting())
            params = {
                "noise_free": False,
                "criterion": "mse",
                "min_samples_split": 0.00001,
                "regressor": "decision_tree",
                "alpha": 1.0,
                "learning_rate": "no",
                "activation": "no",
                "solver": "no",
            }
        elif regressor == "any":
            # Every hyperparameter is optimized, including the type of regressor.
            parametrization = p.Instrumentation(
                # Depth, in case we use a decision tree.
                depth=p.Scalar(lower=1, upper=1200).set_integer_casting(),
                # Criterion for building the decision tree.
                criterion=p.Choice(["mse", "friedman_mse", "mae"]),
                # Min ratio of samples in a node for splitting.
                min_samples_split=p.Log(lower=0.0000001, upper=1),
                # Type of regressor.
                regressor=p.Choice(["mlp", "decision_tree"]),
                # Activation function, in case we use a net.
                activation=p.Choice(["identity", "logistic", "tanh", "relu"]),
                # Numerical optimizer.
                solver=p.Choice(["lbfgs", "sgd", "adam"]),
                # Learning rate schedule.
                learning_rate=p.Choice(["constant", "invscaling", "adaptive"]),
                # Complexity penalization.
                alpha=p.Log(lower=0.0000001, upper=1.),
            )
            # noise_free=False: the cross-validation loss is considered during the optimization.
            params = {"noise_free": False}
        elif regressor == "decision_tree":
            # Hyperparameters of the decision tree only.
            parametrization = p.Instrumentation(
                depth=p.Scalar(lower=1, upper=1200).set_integer_casting(),
                criterion=p.Choice(["mse", "friedman_mse", "mae"]),
                min_samples_split=p.Log(lower=0.0000001, upper=1),
                regressor="decision_tree",
            )
            params = {
                "noise_free": False,
                "alpha": 1.0,
                "learning_rate": "no",
                "regressor": "decision_tree",
                "activation": "no",
                "solver": "no",
            }
            # the evaluation parameters additionally carry a criterion and a split ratio
            evalparams = {**params, "criterion": "mse", "min_samples_split": 0.00001}
        elif regressor == "mlp":
            # Hyperparameters of the neural network only.
            parametrization = p.Instrumentation(
                activation=p.Choice(["identity", "logistic", "tanh", "relu"]),
                solver=p.Choice(["lbfgs", "sgd", "adam"]),
                regressor="mlp",
                learning_rate=p.Choice(["constant", "invscaling", "adaptive"]),
                alpha=p.Log(lower=0.0000001, upper=1.),
            )
            params = {
                "noise_free": False,
                "regressor": "mlp",
                "depth": -3,
                "criterion": "no",
                "min_samples_split": 0.1,
            }
        else:
            assert False, f"Problem type {regressor} undefined!"
        # default evaluation parameters: a copy of the optimization ones
        if not evalparams:
            evalparams = dict(params)
        # the noise is removed for the evaluation (unless overfitter)
        evalparams["noise_free"] = not overfitter
        training_function = partial(self._ml_parametrization, **params)
        super().__init__(training_function, parametrization.set_name(""))
        self._evalparams = evalparams
        self.register_initialization(**init_kwargs)
Example #4
0
    def __init__(self,
                 regressor: str,
                 data_dimension: tp.Optional[int] = None,
                 dataset: str = "artificial",
                 overfitter: bool = False) -> None:
        """Configure the tuning problem for the requested kind of regressor.

        Parameters
        ----------
        regressor: str
            selects which hyperparameters are optimized; one of
            "decision_tree_depth", "any", "decision_tree" or "mlp"
            (any other value triggers an assertion failure)
        data_dimension: int or None
            must be provided if and only if "artificial" appears in `dataset`
        dataset: str
            name of the dataset
        overfitter: bool
            the evaluation function uses ``noise_free = not overfitter``,
            while the optimized function always uses ``noise_free=False``
        """
        # shared by the descriptors and by the initialization registration
        init_kwargs: tp.Dict[str, tp.Any] = {
            "regressor": regressor,
            "data_dimension": data_dimension,
            "dataset": dataset,
            "overfitter": overfitter,
        }
        self.regressor = regressor
        self.data_dimension = data_dimension
        self.dataset = dataset
        self.overfitter = overfitter
        self._descriptors: tp.Dict[str, tp.Any] = {}
        self.add_descriptors(**init_kwargs)
        self.name = regressor + f"Dim{data_dimension}"
        self.num_data: int = 0
        # A dimension is only meaningful for artificial data, not for a real world dataset.
        assert ("artificial" in dataset) == (data_dimension is not None)

        # Training set and test set storage.
        self.X: np.ndarray = np.array([])
        self.y: np.ndarray

        # Cross-validation splits storage.
        self.X_train: tp.List[tp.Any] = []  # list of training subsets
        self.X_valid: tp.List[tp.Any] = []  # list of validation subsets
        self.y_train: tp.List[tp.Any] = []
        self.y_valid: tp.List[tp.Any] = []
        self.X_test: np.ndarray
        self.y_test: np.ndarray

        # In each branch, "fixed" holds the keyword arguments frozen (through "partial") for both
        # the optimized function and the evaluation function; only noise_free differs between them
        # (plus the extra keywords of the "decision_tree" evaluation).
        fixed: tp.Dict[str, tp.Any]
        if regressor == "decision_tree_depth":
            # The depth is the only optimized parameter, everything else is fixed.
            parametrization = p.Instrumentation(depth=p.Scalar(lower=1, upper=1200).set_integer_casting())
            fixed = {
                "criterion": "mse",
                "min_samples_split": 0.00001,
                "regressor": "decision_tree",
                "alpha": 1.0,
                "learning_rate": "no",
                "activation": "no",
                "solver": "no",
            }
            super().__init__(partial(self._ml_parametrization, noise_free=False, **fixed), parametrization)
            # For the evaluation, the noise is removed (unless overfitter).
            self.evaluation_function = partial(  # type: ignore
                self._ml_parametrization, noise_free=not overfitter, **fixed)
        elif regressor == "any":
            # Every hyperparameter is optimized, including the type of regressor.
            parametrization = p.Instrumentation(
                # Depth, in case we use a decision tree.
                depth=p.Scalar(lower=1, upper=1200).set_integer_casting(),
                # Criterion for building the decision tree.
                criterion=p.Choice(["mse", "friedman_mse", "mae"]),
                # Min ratio of samples in a node for splitting.
                min_samples_split=p.Log(lower=0.0000001, upper=1),
                # Type of regressor.
                regressor=p.Choice(["mlp", "decision_tree"]),
                # Activation function, in case we use a net.
                activation=p.Choice(["identity", "logistic", "tanh", "relu"]),
                # Numerical optimizer.
                solver=p.Choice(["lbfgs", "sgd", "adam"]),
                # Learning rate schedule.
                learning_rate=p.Choice(["constant", "invscaling", "adaptive"]),
                # Complexity penalization.
                alpha=p.Log(lower=0.0000001, upper=1.),
            )
            # Nothing but noise_free is frozen here.
            # noise_free=False: the cross-validation loss is considered during the optimization.
            super().__init__(partial(self._ml_parametrization, noise_free=False), parametrization)
            # For the evaluation, the noise is removed (unless overfitter).
            self.evaluation_function = partial(  # type: ignore
                self._ml_parametrization, noise_free=not overfitter)
        elif regressor == "decision_tree":
            # Hyperparameters of the decision tree only.
            parametrization = p.Instrumentation(
                depth=p.Scalar(lower=1, upper=1200).set_integer_casting(),
                criterion=p.Choice(["mse", "friedman_mse", "mae"]),
                min_samples_split=p.Log(lower=0.0000001, upper=1),
                regressor="decision_tree",
            )
            # The parameters of the neural network are frozen, since only the decision tree is used.
            fixed = {
                "alpha": 1.0,
                "learning_rate": "no",
                "regressor": "decision_tree",
                "activation": "no",
                "solver": "no",
            }
            super().__init__(partial(self._ml_parametrization, noise_free=False, **fixed), parametrization)
            # The evaluation additionally freezes a criterion and a split ratio.
            self.evaluation_function = partial(  # type: ignore
                self._ml_parametrization,
                criterion="mse",
                min_samples_split=0.00001,
                noise_free=not overfitter,
                **fixed)
        elif regressor == "mlp":
            # Hyperparameters of the neural network only.
            parametrization = p.Instrumentation(
                activation=p.Choice(["identity", "logistic", "tanh", "relu"]),
                solver=p.Choice(["lbfgs", "sgd", "adam"]),
                regressor="mlp",
                learning_rate=p.Choice(["constant", "invscaling", "adaptive"]),
                alpha=p.Log(lower=0.0000001, upper=1.),
            )
            # The parameters of the decision tree are frozen, since only the neural net is used.
            fixed = {
                "regressor": "mlp",
                "depth": -3,
                "criterion": "no",
                "min_samples_split": 0.1,
            }
            super().__init__(partial(self._ml_parametrization, noise_free=False, **fixed), parametrization)
            self.evaluation_function = partial(  # type: ignore
                self._ml_parametrization, noise_free=not overfitter, **fixed)
        else:
            assert False, f"Problem type {regressor} undefined!"

        # NOTE: the dataset is not loaded here (a former call to self.get_dataset is disabled).
        self.register_initialization(**init_kwargs)