Example 1
    def __init__(
        self, visualization_type: str = "points", rectify: Union[float, bool] = False, **kwargs
    ):
        """Initialize generalized function plot.

        Parameters:
            visualization_type: how to visualize generalized functions.
                Either single value or list of appropriate length.
                Possible values: "points" (default), "box-whisker", "shaded-line"
            rectify: whether and by how much each curve's values are horizontally displaced
                to visually disentangle markers from different curves at the same location.
                A non-negative number specifies the displacement; True requests automatic
                displacement; False (the default) leaves horizontal positions unchanged.
                If the horizontal axis scaling is logarithmic, the rectification factor
                is applied in log-space.

        Examples:
            # show three curves with automatic horizontal rectification
            __init__(visualization_type=("points", "points", "box-whisker"), rectify=True)
        """

        super().__init__(**kwargs)

        # parameter validation

        enum_f = lambda arg: params.enumeration(arg, {"points", "box-whisker", "shaded-line"})
        self._visualization_type = params.any_(
            visualization_type, enum_f, lambda arg: params.tuple_(arg, enum_f)
        )
        # arity can only be tested in evaluate()

        self._rectify = params.any_(rectify, lambda arg: params.real(arg, from_=0), params.boolean)
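
For reference, a minimal sketch of how the `params.any_` / `params.tuple_` pattern above behaves
(based on the `params` helpers used throughout these examples, which return the validated value
or raise `InvalidParameterError`):

    # sketch: a single value satisfies the first test, a tuple of values the second
    enum_f = lambda arg: params.enumeration(arg, {"points", "box-whisker", "shaded-line"})
    params.any_("points", enum_f, lambda arg: params.tuple_(arg, enum_f))
    # -> "points"
    params.any_(("points", "box-whisker"), enum_f, lambda arg: params.tuple_(arg, enum_f))
    # -> ("points", "box-whisker")
    params.any_("lines", enum_f, lambda arg: params.tuple_(arg, enum_f))
    # -> raises InvalidParameterError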
Example 2
    def __init__(self,
                 internal_hp_optimization: bool = True,
                 kernel: Optional[Kernel] = None,
                 alpha: Union[float, Sequence] = 1e-5,
                 optimizer="fmin_l_bfgs_b",
                 n_restarts_optimizer=0,
                 normalize_y=False,
                 random_state: int = None,
                 **kwargs):
        """Initialize state.

        sklearn-specific parameters are passed through to the implementation.

        Parameters:
            internal_hp_optimization: if True, hyperparameters are optimized "internally"
                by the Gaussian process, that is, scikit-learn optimizes hyperparameters
                and for smlb the learner has no hyperparameters;
                if False, hyperparameters are optimized by smlb (and scikit-learn does
                not optimize any hyperparameters)
            kernel: scikit-learn kernel; if None, a single Gaussian kernel is used as default
            alpha: regularization constant (scalar or vector); added as-is to the kernel matrix diagonal.
                   Equivalent to adding a "WhiteKernel"; the default is the corresponding value from
                   scikit-learn's WhiteKernel and differs from scikit-learn's GaussianProcessRegressor default.
            optimizer: hyperparameter optimization algorithm; used only if internal_hp_optimization is True
            n_restarts_optimizer: number of times optimizer is restarted; only used if internal_hp_optimization is True
            normalize_y: whether to subtract the mean of the labels
            random_state: integer seed

        See skl.gaussian_process.GaussianProcessRegressor parameters.
        """

        super().__init__(**kwargs)

        internal_hp_optimization = params.boolean(internal_hp_optimization)
        kernel = params.any_(kernel, lambda arg: params.instance(arg, Kernel),
                             params.none)
        # incomplete check for alpha as dimension becomes known only at fitting time
        alpha = params.any_(
            alpha,
            lambda arg: params.real(arg, from_=0),
            lambda arg: params.real_vector(arg, domain=[0, np.inf]),
        )
        # todo: check optimizer, requires params.union (of string and callable) and params.function
        normalize_y = params.boolean(normalize_y)
        random_state = params.integer(random_state)

        if kernel is None:
            kernel = (skl.gaussian_process.kernels.RBF()
                      + skl.gaussian_process.kernels.WhiteKernel())

        assert internal_hp_optimization is True  # external HP optimization not yet supported

        self._model = skl.gaussian_process.GaussianProcessRegressor(
            kernel=kernel,
            alpha=alpha,
            optimizer=optimizer,
            n_restarts_optimizer=n_restarts_optimizer,
            normalize_y=normalize_y,
            random_state=random_state,
        )
Example 3
def test_tuple_():
    """Tests tuple_ meta test."""

    testf = lambda arg: params.none(arg)

    # special case: no tuple
    with pytest.raises(InvalidParameterError):
        params.tuple_(None, lambda arg: arg)

    # special case: single test
    assert params.tuple_((None,), testf) == (None,)
    with pytest.raises(InvalidParameterError):
        params.tuple_(("_",), testf)

    # special case: 2-tuple
    assert params.tuple_((None, None), testf, testf) == (None, None)
    with pytest.raises(InvalidParameterError):
        params.tuple_(("_", None), testf, testf)
    with pytest.raises(InvalidParameterError):
        params.tuple_((None, "_"), testf, testf)
    with pytest.raises(InvalidParameterError):
        params.tuple_(("_", "_"), testf, testf)

    # arity parameter
    assert params.tuple_((None, None), testf, arity=2)
    with pytest.raises(InvalidParameterError):
        params.tuple_((None, None), testf, arity=3)
    with pytest.raises(InvalidParameterError):
        params.tuple_((None, None, None), testf, arity=2)

    # default parameter
    assert params.tuple_((None,), testf, arity=3, default=None) == (None, None, None)

    # no arity, no default
    assert params.tuple_((None, None, None), testf) == (None, None, None)
Example 4
    def __init__(
        self,
        target=None,
        configuration: Optional[PlotConfiguration] = None,
        axes_labels=(None, None, None, None),
        axes_scales=("linear", "linear"),
        **kwargs,
    ):
        """Initialize Evaluation.

        Parameters:
            target: rendering target that evaluation outcome is rendered to;
                can be a single filename, or a matplotlib Axes or (Figure, Axes) pair,
                or a sequence thereof; if a matplotlib Axes or (Figure, Axes) pair,
                evaluation will add to it; if None, a new rendering target is created
            configuration: optional plot configuration controlling rendering details
            axes_labels: labels for all axes (bottom, left, top, right), None to not label an axis;
                         for shorter tuples remaining entries are assumed None, so ('x', 'y') is valid
            axes_scales: scales ("linear" or "log") for horizontal and vertical axes

        Examples:
            __init__(axes_labels=("bottom", "left", "top"))  # right is None
            __init__(axes_scales=("log", "log"))
        """

        configuration = params.any_(
            configuration, lambda arg: params.instance(arg, PlotConfiguration), params.none
        )

        super().__init__(configuration=configuration, **kwargs)

        # Axes, (Figure, Axes), filename, None, or sequence (without None)
        target_f = lambda arg: params.any_(
            arg,
            lambda arg: params.instance(arg, mpl.axes.Axes),
            lambda arg: params.tuple_(
                arg,
                lambda arg: params.instance(arg, mpl.figure.Figure),
                lambda arg: params.instance(arg, mpl.axes.Axes),
                arity=2,
            ),
            params.string,
        )
        self._target = params.any_(
            target, target_f, params.none, lambda arg: params.tuple_(arg, target_f)
        )

        self._axes_labels = params.tuple_(
            axes_labels,
            lambda arg: params.any_(arg, params.string, params.none),
            arity=4,
            default=None,
        )

        self._axes_scales = params.tuple_(
            axes_scales, lambda arg: params.enumeration(arg, {"linear", "log"}), arity=2
        )

        self._figaxis = None
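
For illustration, the forms the `target` argument is validated to accept (a sketch summarizing
the checks above; `fig` and `ax` stand for matplotlib Figure and Axes objects):

    # target forms accepted by the validation above
    # - a matplotlib Axes:            target=ax
    # - a (Figure, Axes) pair:        target=(fig, ax)
    # - a filename string:            target="evaluation.png"
    # - None:                         a new rendering target is created
    # - a tuple of the above (without None), e.g. target=(ax, "copy.png")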
Example 5
    def __init__(self,
                 labels_to_load: Optional[Union[str, List[str]]] = None,
                 ignore_dubious: bool = False):
        """Initialize Ni-superalloy dataset with specified labels.

        Parameters:
            labels_to_load (str or List[str]): which labels to load. Options are
                'Yield Strength', 'Ultimate Tensile Strength', 'Stress Rupture Time',
                'Stress Rupture Stress', and 'Elongation'.
                If None, then all labels are loaded.
            ignore_dubious: whether to exclude samples that have something
                questionable about them

        """

        labels_to_load = params.optional_(
            labels_to_load,
            lambda arg: params.any_(
                arg,
                params.string,
                lambda arg: params.sequence(arg, type_=str),
            ),
        )
        ignore_dubious = params.boolean(ignore_dubious)

        filepath = self.DEFAULT_PATH
        data, labels = self._load_data_and_labels(filepath, labels_to_load,
                                                  ignore_dubious)
        super().__init__(data=data, labels=labels)
Example 6
File: java.py Project: syam-s/smlb
    def __init__(self, class_path: Optional[str] = None):
        """Initialize Java gateway.

        If derived class is initialized for the first time,
        start up JVM and create gateway. On subsequent initializations
        of derived class, the same gateway is used, except when a
        different class_path is passed. In that case,
        the JVM is shut down and restarted with the new class path.

        Parameters:
            class_path: local filesystem class path containing one
                or more directories or .jar files. If not specified,
                an empty string is passed as classpath to the JVM.

        Raises:
            BenchmarkError if the class_path is invalid.
        """

        # todo: class_path = params.optional_(class_path, params.string)
        class_path = params.any_(class_path, params.string, params.none)

        if self.__class__._gateway is None:
            # first time derived class is instantiated, create gateway
            self._launch_gateway(class_path=class_path)
        elif self.__class__._class_path != class_path:
            # if parameters changed, restart the JVM
            self._shutdown_gateway()
            self._launch_gateway(class_path=class_path)
        else:
            # subsequent instantiations use the same gateway
            pass
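
The reuse/restart behavior described in the docstring amounts to the following (a sketch;
`MyGateway` is a hypothetical subclass of this class):

    gw1 = MyGateway(class_path="/opt/jars")    # first instantiation: JVM is launched
    gw2 = MyGateway(class_path="/opt/jars")    # same class path: existing gateway is reused
    gw3 = MyGateway(class_path="/other/jars")  # different class path: JVM is shut down and relaunched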
Example 7
    def axes_labels(self, labels=(None, None, None, None), **kwargs):
        """Set axes labels.

        Parameters:
            labels: labels for the bottom, left, top, right axes;
                None indicates to keep the current value

        Examples:
            axes_labels = (None, "y")  # set only left axis label
        """

        string_or_none_f = lambda arg: params.any_(arg, params.string, params.none)
        labels = params.tuple_(labels, string_or_none_f, arity=4, default=None)

        # re-assign tuple as a whole
        self._labels = tuple(
            self.axes_labels[i] if labels[i] is None else labels[i] for i in range(4)
        )

        # set labels if specified (not None)
        # this allows passing kwargs specific to one axis
        if labels[0] is not None:
            self.ax.set_xlabel(labels[0], fontdict=self._fontdict(), **kwargs)
        if labels[1] is not None:
            self.ax.set_ylabel(labels[1], fontdict=self._fontdict(), **kwargs)
        if labels[2] is not None or labels[3] is not None:
            # todo: possible implementation via twinx/twiny, storing these axes in outcome
            raise NotImplementedError
Example 8
    def axes_scales(self, scales=(None, None), **kwargs):
        """Set axes scales.

        Parameters:
            scales: scales (None, "linear" or "log") for horizontal and vertical axes;
                None indicates to keep the current value

        Examples:
            axes_scales = (None, "log")  # change only vertical axis
        """

        scale_or_none_f = lambda arg: params.any_(
            arg, lambda arg: params.enumeration(arg, {"linear", "log"}), params.none
        )
        scales = params.tuple_(scales, scale_or_none_f, arity=2, default=None)

        # re-assign tuple as a whole
        self._scales = (
            self.axes_scales[0] if scales[0] is None else scales[0],
            self.axes_scales[1] if scales[1] is None else scales[1],
        )

        # set axes if specified (not None)
        # this allows passing kwargs specific to one axis
        if scales[0] is not None:
            self.ax.set_xscale(scales[0], **kwargs)
        if scales[1] is not None:
            self.ax.set_yscale(scales[1], **kwargs)
Example 9
    def __init__(
        self,
        source: str,
        join: Optional[Union[str, bool]] = None,
        **kwargs,
    ):
        """Loads dataset.

        All `IndexedFiniteLabeledDataPandasBackend.__init__` keyword arguments can be passed,
        in particular join, filterf, samplef, and labelf. See there for further explanation.

        Parameters:
            source: path to underlying data file (see class docstring); accepts both
                .csv and .csv.zip versions
            join: whether to join entries with the same chemical sum formula; this changes
                labels from single numbers to varying-length sequences of numbers.
                True can be passed to join by stoichiometry.
            filterf: a function that accepts a sample and returns whether to keep it
                (True) or exclude it (False). Default retains all samples
            samplef: function accepting and returning a sample; applied to all samples
                as post-processing
            labelf: function accepting and returning a label; applied to all labels
                as post-processing

        All samples have these keys:
            id: unique identifier (integer)
            SMILES: SMILES encoding
            formula: stoichiometric formula

        All labels have these keys:
            mass: weight of molecule
            PCE: power conversion efficiency
            VOC: open circuit voltage
            JSC: short-circuit current density
            HOMO: highest occupied molecular orbital
            gap: LUMO-HOMO gap
            LUMO: lowest unoccupied molecular orbital

        The identifiers and SMILES strings are unique.
        Stoichiometries are not (10,474 unique ones).

        Raises:
            InvalidParameterError: on invalid parameter values
        """

        join = params.any_(join, params.string, params.boolean, params.none)

        # parse boolean settings for join
        if join is True:
            join = "formula"
        if join is False:
            join = None

        data, labels = self._load_data(source)

        super().__init__(data=data, labels=labels, join=join, **kwargs)
Example 10
 def _indices_testf(self, indices: Sequence[Any]):
     return params.optional_(
         indices,
         lambda arg: list(
             params.any_(  # NumPy indexing expects a list
                 arg,
                 lambda arg: params.tuple_(arg, None, arity=0),  # empty set
                 lambda arg: params.tuple_(
                     arg, lambda arg: params.integer(
                         arg, from_=0, below=self.num_samples)),
             )),
     )
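
A sketch of what this validator accepts, assuming `self.num_samples == 10` (`params.optional_`
passes `None` through, as in the other examples in this listing):

    self._indices_testf(None)       # -> None; indices are optional
    self._indices_testf(())         # -> []; the empty tuple is allowed via arity=0
    self._indices_testf((0, 3, 9))  # -> [0, 3, 9]; converted to a list for NumPy indexing
    self._indices_testf((0, 10))    # raises InvalidParameterError; 10 is not below num_samples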
Example 11
    def __init__(
        self,
        rng: int = None,
        num_seeds: int = 1,
        resolution: int = 64,
        max_relative_jump: float = 1.0,
        dimensions_varied: Union[str, float, int] = "all",
        max_iters: Optional[int] = None,
        max_evals: Optional[int] = None,
        **kwargs,
    ):
        """Initialize state.

        Parameters:
            rng: pseudo-random number generator seed
            num_seeds: the number of starting points, and the number of points chosen at the end
                of each iteration
            resolution: the number of points to sample along a single dimension for a single seed
            max_relative_jump: the maximum relative step size along a single dimension. If a given
                dimension has length `L` and a seed has value `x` along that dimension, then the
                candidates are `resolution` linearly spaced points from the range
                [x - max_relative_jump * L, x + max_relative_jump * L] (clipped by the bounds).
                `max_relative_jump` must be in (0, 1].
                For a value of 1, the entire range is always considered.
            dimensions_varied: how many randomly selected dimensions to explore with each step.
                'all' indicates all dimensions. An integer directly specifies the number of
                dimensions. A float in (0, 1) specifies the fraction of dimensions to vary.
            max_iters: the maximum number of iterations
            max_evals: the maximum number of function evaluations (this is a soft maximum:
                once it is reached then the current iteration finishes)

        TODO: add tolerance stopping conditions
        """
        super().__init__(rng=rng, **kwargs)

        self._num_seeds = params.integer(num_seeds, from_=1)
        self._resolution = params.integer(resolution, from_=2)
        self._max_relative_jump = params.real(max_relative_jump,
                                              above=0.0,
                                              to=1.0)
        self._dimensions_varied = params.any_(
            dimensions_varied,
            lambda arg: params.integer(arg, above=0),
            lambda arg: params.real(arg, above=0.0, below=1.0),
            lambda arg: params.enumeration(arg, {"all"}),
        )
        self._max_iters = params.optional_(
            max_iters, lambda arg: params.integer(arg, from_=1))
        self._max_evals = params.optional_(
            max_evals, lambda arg: params.integer(arg, from_=1))
        if self._max_iters is None and self._max_evals is None:
            raise InvalidParameterError(
                "at least one stopping condition defined", "all Nones")
Example 12
    def __init__(self, configuration: Optional[EvaluationConfiguration] = None, **kwargs):
        """Initialize Evaluation.

        Parameters:
            configuration: optional configuration object controlling rendering details
        """

        super().__init__(**kwargs)

        self._configuration = params.any_(
            configuration, lambda arg: params.instance(arg, EvaluationConfiguration), params.none
        )
        if self._configuration is None:
            self._configuration = self._default_configuration()

        self._auxiliary = dict()  # internal handle on optional auxiliary outcome data
Example 13
    def __init__(self, noise_part=None, signal_part=None, **kwargs):
        """Initialize decompositions.

        Parameters:
            noise_part: estimated noise distribution; the aleatoric component
            signal_part: estimated signal distribution; the epistemic component
        """

        super().__init__(**kwargs)

        optional = lambda arg: params.any_(
            arg, lambda x: params.instance(x, PredictiveDistribution), params.none
        )
        self._noise_part = optional(noise_part)
        self._signal_part = optional(signal_part)
Example 14
 def __init__(
     self,
     input_: TabularData,
     output: PredictiveDistribution,
     scores: Sequence[float],
     **kwargs
 ):
     super().__init__(**kwargs)
     self._input: TabularData = params.instance(input_, TabularData)
     self._output: PredictiveDistribution = params.instance(output, PredictiveDistribution)
     # total number of function evaluations during this step
     self._num_evaluations: int = params.integer(self._input.num_samples, from_=1)
     self._scores: Sequence[float] = params.any_(
         scores,
         lambda arg: params.sequence(arg, length=1, type_=float),
         lambda arg: params.sequence(arg, length=self._num_evaluations, type_=float),
     )
Example 15
File: noise.py Project: syam-s/smlb
    def noise(self, shape=None):
        """Add Gaussian noise to labels.

        Parameters:
            shape: shape of noise vector, matrix or higher-order tensor

        Returns:
            a numerical array of given shape containing independent
            identically distributed Gaussian noise

        Raises:
            InvalidParameterError: for invalid parameters
        """

        # valid shapes are either a positive integer or a tuple of positive integers
        is_nonneg_int = lambda arg: params.integer(arg, from_=1)
        is_tuple = lambda arg: params.tuple_(arg, is_nonneg_int)
        shape = params.any_(shape, is_nonneg_int, is_tuple)

        return self.random.normal(self._mean, self._stddev, size=shape)
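
For example (a sketch relying only on NumPy's `size` semantics used above):

    self.noise(5)       # vector of 5 i.i.d. Gaussian draws
    self.noise((2, 3))  # 2x3 matrix of i.i.d. Gaussian draws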
Example 16
    def __init__(self, cdk_jar_path: Optional[str] = None):
        """Initialize CDK Java gateway.

        See base class JavaGateway for details.

        This class provides CDK-specific functionality,
        namely the path to the CDK .jar file.

        Parameters:
            cdk_jar_path: local filesystem path to the CDK jar, e.g.,
                '/file/path/cdk.jar'. If not specified, smlb tries to
                find the CDK jar.

        Raises:
            BenchmarkError if the CDK .jar file cannot be found.
        """

        # todo: optional_
        # cdk_jar_path = params.optional_(cdk_jar_path, params.string)  todo: valid path
        cdk_jar_path = params.any_(cdk_jar_path, params.string, params.none)

        # finding CDK .jar file logic
        if cdk_jar_path is None:
            if self._cdk_jar_path_auto is not None:
                # already detected, use stored path
                cdk_jar_path = self._cdk_jar_path_auto
            else:
                # attempt to find CDK .jar file
                # todo: find correct path for installed versions

                path = os.path.join(os.path.dirname(__file__), "../build/cdk.jar")
                if not os.access(path, os.R_OK):
                    raise BenchmarkError(
                        "Valid path to .jar file",
                        path,
                        explanation=f"Jar file {path} does not exist or is not readable.",
                    )

                cdk_jar_path = path

        super().__init__(cdk_jar_path)
Example 17
File: noise.py Project: syam-s/smlb
    def noise(self, shape=None):
        """Return no noise.

        A constant value is returned.

        Parameters:
            shape: shape of noise vector, matrix or higher-order tensor

        Returns:
            a numerical array of given shape containing a constant value

        Raises:
            InvalidParameterError: for invalid parameters
        """

        # valid shapes are either a positive integer or a tuple of positive integers
        is_nonneg_int = lambda arg: params.integer(arg, from_=1)
        is_tuple = lambda arg: params.tuple_(arg, is_nonneg_int)
        shape = params.any_(shape, is_nonneg_int, is_tuple)

        return np.full(shape, self._value)
Example 18
def test_all_():
    """Tests all_ meta test."""

    # special case: single test
    assert params.all_(None, lambda arg: params.none(arg)) is None
    with pytest.raises(InvalidParameterError):
        params.all_("_", lambda arg: params.none(arg))

    # special case: and
    assert (
        params.all_(
            2, lambda arg: params.integer(arg, above=1), lambda arg: params.integer(arg, from_=2)
        )
        == 2
    )
    assert (
        params.all_(
            3,
            lambda arg: params.integer(arg, above=1),
            lambda arg: params.integer(arg, from_=2),
            lambda arg: params.integer(arg, from_=3),
        )
        == 3
    )

    # fail in first testf
    with pytest.raises(InvalidParameterError):
        params.all_(
            1, lambda arg: params.integer(arg, above=1), lambda arg: params.integer(arg, from_=2)
        )

    # fail in last testf
    with pytest.raises(InvalidParameterError):
        params.all_(
            2,
            lambda arg: params.integer(arg, above=1),
            lambda arg: params.integer(arg, from_=2),
            lambda arg: params.integer(arg, above=2),
        )
Example 19
    def __init__(
        self,
        fits: bool = True,
        fit_lambda: float = 1e-7,
        fit_weights: Optional[str] = None,
        base=10,
        **kwargs,
    ):
        """Initialize learning curve plot.

        Parameters:
            fits: if True, show estimated asymptotic fits
            fit_lambda: regularization strength for asymptotic fits; defaults to 1e-7
            fit_weights: if and how to weight fits; either None (no weighting) or
                "variance" (weight by the variance at each training set size)
            base: base for logarithmic plotting
            All parameters from base classes, in particular GeneralizedFunctionPlot and Plot, are accepted as well.
        """

        # set learning curve-specific arguments if not explicitly set
        kwargs["axes_scales"] = kwargs.get("axes_scales", ("log", "log"))
        kwargs["axes_labels"] = kwargs.get(
            "axes_labels", ("training set size", "evaluation metric", None, None)
        )

        super().__init__(**kwargs)

        # parameters
        self._fits = params.boolean(fits)
        self._fit_lambda = params.real(fit_lambda, from_=0)
        self._fit_weights = params.any_(
            fit_weights, lambda arg: params.enumeration(arg, {"variance"}), params.none
        )
        self._base = params.real(base, from_=2)

        self._logf = lambda x: np.log(x) / np.log(self._base)
        self._powf = lambda x: np.power(self._base, x)
Example 20
    def __init__(
        self,
        data: "pandas.DataFrame",  # noqa F821
        labels: Optional[Union["pandas.DataFrame", Sequence[str]]] = None,
        dtype: Optional[dict] = None,
        join: Optional[str] = None,
        filterf: Optional[Callable[[Any], bool]] = None,
        samplef: Optional[Callable[[Any], Any]] = None,
        labelf: Optional[Callable[[Any], Any]] = None,
        **kwargs,
    ):
        """Initialize dataset.

        Parameters control loading and preprocessing of the data. Order:
        1. joining
        2. filtering
        3. sample and label transform

        Parameters:
            data: the samples in the form of a Pandas DataFrame.
            labels: the labels, either in the form of a Pandas DataFrame with same number of rows
                as data and different column names, or in the form of a list of column names,
                which are then split out from the data and used as labels. If not specified,
                the dataset is unlabeled.
            dtype: the NumPy data types to use for samples and labels, in the form of a dictionary
                with column names as keys and dtypes as values. Can be used to override dtype
                auto-detection for some or all columns.
            join: if specified, name of "column" to join by; this changes labels
                to be sequences of single-entry labels
            filterf: a function that accepts a sample and returns whether to keep it
                (True) or exclude it (False). Default retains all samples
            samplef: function accepting and returning a sample; applied to all samples
                as post-processing
            labelf: function accepting and returning a label; applied to all labels
                as post-processing

        Raises:
            InvalidParameterError: for invalid arguments. In particular, the numbers of
                data rows and labels must match. If column names are given, they must be
                unique across data and labels, if any.
        """

        import pandas as pd  # only import if class is used

        # parameter validation
        data = params.instance(data, pd.DataFrame)
        labels = params.optional_(
            labels,
            lambda arg: params.any_(
                arg,
                lambda arg: params.instance(arg, pd.DataFrame),  # before tuple_
                lambda arg: params.tuple_(arg, params.string),
            ),
        )
        dtype = params.optional_(dtype, lambda arg: params.instance(arg, dict), default={})
        join = params.optional_(join, params.string)
        singleargf = lambda arg: params.callable(arg, num_pos_or_kw=1)  # noqa: E731
        filterf = params.optional_(filterf, singleargf)
        samplef = params.optional_(samplef, singleargf)
        labelf = params.optional_(labelf, singleargf)

        if labels is None and labelf:
            raise InvalidParameterError(
                "matching labels and label function",
                "label function specified for unlabeled data")

        # process data
        data = data.reset_index(drop=True)

        # if labels are given as separate DataFrame, join them
        if isinstance(labels, pd.DataFrame):
            if len(data) != len(labels):
                raise InvalidParameterError(
                    "matching data and labels",
                    f"different number of rows ({len(data)} != {len(labels)})",
                )

            labels = labels.reset_index(drop=True)

            col_names = np.hstack((data.columns, labels.columns))
            if len(col_names) != len(pd.unique(col_names)):
                raise InvalidParameterError(
                    "unique column names",
                    f"{data.columns.values} and {labels.columns.values}")

            data = pd.concat([data, labels], axis=1)
            labels = labels.columns.values

        # 1. optional joining
        if join:
            groups = data.groupby(join, sort=False, as_index=False)
            data = groups.aggregate(lambda tdf: tdf.tolist())

        # 2. optional filtering
        if filterf:
            selection = data.apply(filterf, axis=1)
            data = data[selection]

        # split data and labels
        if labels is not None:
            # DataFrame column indexing requires list, not tuple
            data, labels = data.drop(columns=list(labels)), data[list(labels)]

        # 3. optional sample and label transform
        if samplef:
            data = data.apply(samplef, axis=1, result_type="reduce")
            if isinstance(data, pd.Series):
                data = pd.DataFrame(data, columns=["Samples"])
        if labelf:
            labels = labels.apply(labelf, axis=1, result_type="reduce")
            if isinstance(labels, pd.Series):
                labels = pd.DataFrame(labels, columns=["Labels"])

        # convert to NumPy structured array
        data = self._to_numpy(data, dtype=dtype)
        labels = self._to_numpy(labels, dtype=dtype) if labels is not None else None

        super().__init__(data=data, labels=labels, **kwargs)
Example 21
    def __init__(
        self,
        uncertainties: Optional[str] = None,
        loss: str = "ls",
        alpha: float = 0.9,
        learning_rate: float = 0.1,
        subsample: float = 1.0,
        n_estimators: int = 100,
        criterion: str = "mse",
        max_depth: int = 3,
        min_samples_split: Union[int, float] = 2,
        min_samples_leaf: Union[int, float] = 1,
        min_weight_fraction_leaf: float = 0.0,
        max_features: Union[int, float, str, None] = None,
        max_leaf_nodes: Optional[int] = None,
        min_impurity_decrease: float = 0.0,
        # min_impurity_split deprecated
        random_state: int = None,
        ccp_alpha: float = 0.0,
        init: Optional[Any] = None,
        validation_fraction: float = 0.1,
        n_iter_no_change: Optional[int] = None,
        tol: float = 0.0001,
        **kwargs,
    ):
        """Initialize state.

        sklearn-specific parameters are passed through to the implementation.

        Parameters:
            uncertainties: whether and how to compute predictive uncertainties; the only supported
                choice is None; by default, GradientBoostingRegressor does not return any predictive uncertainties
            loss: loss function to optimize; valid values are "ls" (least squares), "lad" (least absolute deviation),
                "huber" (Huber's loss), "quantile" (quantile regression). Use alpha parameter for huber and quantile.
            alpha: quantile for "huber" and "quantile" loss functions
            learning_rate: value by which to shrink contribution of consecutive trees; trade-off with num_estimators
            subsample: fraction of samples for fitting base learners; if <1 results in Stochastic Gradient Boosting.
                reducing subsample reduces variance and increases bias.
            n_estimators: number of decision trees
            criterion: split quality criterion; either Friedman improved score ("friedman_mse"),
                variance reduction ("mse", mean squared error), or mean absolute error ("mae")
            max_depth: maximum depth of a tree; default is 3
            min_samples_split: minimum number of samples required to split an internal node;
                float numbers indicate a fraction of number of training samples
            min_samples_leaf: minimum number of training samples required in a leaf node
                float numbers indicate a fraction of number of training samples
            min_weight_fraction_leaf: minimum weighted fraction of the sum of sample weights required in a leaf node
            max_features: number of features considered when splitting; integers directly specify the number,
                floating point values specify which fraction of all features to use;
                "auto" uses all features, "sqrt" and "log2" use square root and binary logarithm of number of features
            max_leaf_nodes: maximum number of leaves a tree can have
            min_impurity_decrease: minimum impurity decrease required for splitting
            random_state: pseudo-random number generator seed
            ccp_alpha: complexity parameter for minimal cost-complexity pruning.
            init: estimator for initial predictions; can be 'zero' for constant zero predictions
            validation_fraction: fraction of training data to set aside for early stopping; only with n_iter_no_change
            n_iter_no_change: set to integer to stop after no improvement (beyond tol) for that many rounds
            tol: tolerance for early stopping; only improvements larger than tol are considered

        The sklearn.GradientBoostingRegressor parameters `oob_score`, `verbose`, `warm_start` are not considered.

        See skl.ensemble.GradientBoostingRegressor parameters.
        """

        super().__init__(**kwargs)

        # validate parameters

        self._uncertainties = params.enumeration(uncertainties, {None})

        loss = params.enumeration(loss, {"ls", "lad", "huber", "quantile"})
        alpha = params.real(alpha, above=0, below=1)
        learning_rate = params.real(learning_rate, above=0, to=1)
        subsample = params.real(subsample, above=0, to=1)
        n_estimators = params.integer(n_estimators, from_=1)
        criterion = params.enumeration(criterion, {"friedman_mse", "mse", "mae"})
        max_depth = params.any_(max_depth,
                                lambda arg: params.integer(arg, from_=1),
                                params.none)
        min_samples_split = params.any_(
            min_samples_split,
            lambda arg: params.integer(arg, from_=2),
            lambda arg: params.real(arg, above=0.0, to=1.0),
        )
        min_samples_leaf = params.any_(
            min_samples_leaf,
            lambda arg: params.integer(arg, from_=1),
            lambda arg: params.real(arg, above=0.0, to=1.0),
        )
        min_weight_fraction_leaf = params.real(min_weight_fraction_leaf,
                                               from_=0.0,
                                               to=1.0)
        max_features = params.any_(
            max_features,
            lambda arg: params.integer(arg, above=0),
            lambda arg: params.real(arg, above=0.0, to=1.0),
            lambda arg: params.enumeration(arg, {"auto", "sqrt", "log2"}),
            params.none,
        )
        max_leaf_nodes = params.any_(max_leaf_nodes,
                                     lambda arg: params.integer(arg, from_=1),
                                     params.none)
        min_impurity_decrease = params.real(min_impurity_decrease, from_=0.0)
        random_state = params.integer(random_state)
        ccp_alpha = params.real(ccp_alpha, from_=0.0)
        # no validation for init (no class signature validator)
        validation_fraction = params.real(validation_fraction,
                                          above=0,
                                          below=1)
        n_iter_no_change = params.any_(
            n_iter_no_change, lambda arg: params.integer(arg, from_=0),
            params.none)
        tol = params.real(tol, from_=0)

        self._model = skl.ensemble.GradientBoostingRegressor(
            loss=loss,
            alpha=alpha,
            learning_rate=learning_rate,
            subsample=subsample,
            n_estimators=n_estimators,
            criterion=criterion,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_features=max_features,
            max_leaf_nodes=max_leaf_nodes,
            min_impurity_decrease=min_impurity_decrease,
            random_state=random_state,
            ccp_alpha=ccp_alpha,
            init=init,
            validation_fraction=validation_fraction,
            n_iter_no_change=n_iter_no_change,
            tol=tol,
        )
Example 22
    def __init__(
        self,
        num_trees: int = -1,
        use_jackknife: bool = True,
        bias_learner: Optional[BaseLoloLearner] = None,
        leaf_learner: Optional[BaseLoloLearner] = None,
        subset_strategy: Union[str, int, float] = "auto",
        min_leaf_instances: int = 1,
        max_depth: int = 2 ** 30,
        uncertainty_calibration: bool = False,
        randomize_pivot_location: bool = False,
        # randomly_rotate_features: bool = False, currently in develop branch
        **kwargs
    ):
        """Initialize random forest model.

        See lolo Scala source code for initialization parameters:
        https://github.com/CitrineInformatics/lolo/blob/develop/src/main/scala/io/citrine/lolo/learners/RandomForest.scala

        When using `uncertainty_calibration=False` (the default), the number of trees
        `num_trees` should be set to a multiple of the number n of training samples,
        `num_trees = 4 * n` or higher. When using `uncertainty_calibration=True`,
        `num_trees = 64` is sufficient.

        Parameters:
            num_trees: number of trees in the forest; -1 uses number of training samples
            use_jackknife: whether to use jackknife-based variance estimates
            bias_learner: algorithm used to model bias
            leaf_learner: algorithm used at each leaf of the random forest
            subset_strategy: strategy to determine number of features used at each split
                "auto": use the default for lolo (all features for regression, sqrt for classification)
                "log2": use the base 2 log of the number of features
                "sqrt": use the square root of the number of features
                integer: set the number of features explicitly
                float: use a certain fraction of the features
            min_leaf_instances: minimum number of training instances required in each leaf
            max_depth: maximum depth of decision trees
            uncertainty_calibration: whether to empirically re-calibrate predicted uncertainties
                based on out-of-bag residuals
            randomize_pivot_location: whether to draw pivots randomly or always select the midpoint
            randomly_rotate_features: whether to rotate real scalar features for each tree
        """

        super().__init__(**kwargs)

        # validate parameters

        num_trees = params.any_(
            num_trees,
            lambda i: params.integer(i, above=0),
            lambda i: params.integer(i, from_=-1, to=-1),
        )

        use_jackknife = params.boolean(use_jackknife)

        bias_learner = params.any_(
            bias_learner, lambda arg: params.instance(arg, BaseLoloLearner), params.none
        )

        leaf_learner = params.any_(
            leaf_learner, lambda arg: params.instance(arg, BaseLoloLearner), params.none
        )

        subset_strategy = params.any_(
            subset_strategy,
            lambda s: params.enumeration(s, {"auto", "log2", "sqrt"}),
            lambda s: params.integer(s, above=0),
            lambda s: params.real(s, above=0),
        )

        min_leaf_instances = params.integer(min_leaf_instances, above=0)

        # the default 2**30 works for 32 bit or larger architectures
        max_depth = params.integer(max_depth, above=0)

        uncertainty_calibration = params.boolean(uncertainty_calibration)

        randomize_pivot_location = params.boolean(randomize_pivot_location)

        # randomly_rotate_features = params.boolean(randomly_rotate_features)

        # set up model

        try:
            self._model = RandomForestRegressor(
                num_trees=num_trees,
                use_jackknife=use_jackknife,
                bias_learner=bias_learner,
                leaf_learner=leaf_learner,
                subset_strategy=subset_strategy,
                min_leaf_instances=min_leaf_instances,
                max_depth=max_depth,
                uncertainty_calibration=uncertainty_calibration,
                randomize_pivot_location=randomize_pivot_location,
                # randomly_rotate_features=randomly_rotate_features,
            )
        except Py4JJavaError as e:
            raise BenchmarkError("instantiating lolo model failed") from e

        self._with_uncertainties = use_jackknife  # otherwise, deviations will be zero
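
Following the sizing guidance from the docstring, two hedged example configurations (the
wrapper's class name is not shown in this excerpt, so `RandomForestLolo` is a placeholder):

    n = 500  # assumed number of training samples
    rf_plain = RandomForestLolo(num_trees=4 * n, uncertainty_calibration=False)  # default setting
    rf_calib = RandomForestLolo(num_trees=64, uncertainty_calibration=True)      # calibrated uncertainties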
Example 23
    def __init__(self,
                 rng: int = None,
                 strategy: str = "best1bin",
                 maxiter: int = 1000,
                 popsize: int = 15,
                 tol: float = 0.01,
                 mutation=(0.5, 1),
                 recombination: float = 0.7,
                 **kwargs):
        """Initialize state.

        Scipy-specific parameters are passed through.

        Parameters:
            rng: integer seed. Will be used to generate a new seed each time the optimizer is run.
            strategy: The differential evolution strategy to use. See documentation for complete
                list and explanations.
            maxiter: The maximum number of generations over which the entire population is evolved.
            popsize: A multiplier for setting the total population size.
            tol: Relative tolerance for convergence.
            mutation: The mutation constant. Either a number between 0 and 2 or a tuple (min, max)
                in which case the mutation constant is randomly selected uniformly from between
                min and max with each generation.
            recombination: The recombination constant. Must be between 0 and 1.

        """
        super().__init__(rng=rng, **kwargs)

        allowed_strategies = {
            "best1bin",
            "best1exp",
            "rand1exp",
            "randtobest1exp",
            "currenttobest1exp",
            "best2exp",
            "rand2exp",
            "randtobest1bin",
            "currenttobest1bin",
            "best2bin",
            "rand2bin",
            "rand1bin",
        }
        self._strategy = params.enumeration(strategy, allowed_strategies)

        self._maxiter = params.integer(maxiter, from_=1)
        self._popsize = params.integer(popsize, from_=1)
        self._tol = params.real(tol, above=0.0)

        def test_mutation_range(arg, low=0.0):
            return params.real(arg, from_=low, to=2.0)

        self._mutation = params.any_(
            mutation,
            test_mutation_range,
            lambda pair: params.tuple_(
                pair,
                test_mutation_range,
                lambda arg2: test_mutation_range(arg2, low=pair[0]),
                arity=2,
            ),
        )
        self._recombination = params.real(recombination, from_=0.0, to=1.0)
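
The mutation validation above accepts either a scalar or an ordered pair (a sketch of accepted
and rejected values):

    # 0.8        -> accepted: scalar in [0, 2]
    # (0.5, 1.0) -> accepted: (min, max) pair with min <= max, both in [0, 2]
    # (1.0, 0.5) -> rejected: the second element fails test_mutation_range(..., low=1.0)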
Example 24
    def __init__(
        self,
        rng: int = None,
        uncertainties: Optional[str] = None,
        n_estimators: int = 100,
        criterion: str = "mse",
        max_depth: Optional[int] = None,
        min_samples_split: Union[int, float] = 2,
        min_samples_leaf: Union[int, float] = 1,
        min_weight_fraction_leaf: float = 0.0,
        max_features: Union[int, float, str, None] = "auto",
        max_leaf_nodes: Optional[int] = None,
        min_impurity_decrease: float = 0.0,
        # min_impurity_split deprecated
        bootstrap: bool = True,
        n_jobs: Optional[int] = None,
        ccp_alpha: float = 0.0,
        max_samples: Optional[Union[int, float]] = None,
        **kwargs,
    ):
        """Initialize state.

        sklearn-specific parameters are passed through to the implementation.

        Parameters:
            uncertainties: whether and how to compute predictive uncertainties; choices are
                None (default; no predictive uncertainties are returned) and
                "naive" (use the ensemble's standard deviation)
            n_estimators: number of decision trees
            criterion: either variance reduction ("mse", mean squared error), or, mean absolute error ("mae")
            max_depth: maximum depth of a tree; default is restricted only by min_samples_leaf
            min_samples_split: minimum number of samples required to split an internal node;
                float numbers indicate a fraction of number of training samples
            min_samples_leaf: minimum number of training samples required in a leaf node
                float numbers indicate a fraction of number of training samples
            min_weight_fraction_leaf: minimum weighted fraction of the sum of sample weights required in a leaf node
            max_features: number of features considered when splitting; integers directly specify the number,
                floating point values specify which fraction of all features to use;
                "auto" uses all features, "sqrt" and "log2" use square root and binary logarithm of number of features
            max_leaf_nodes: maximum number of leaves a tree can have
            min_impurity_decrease: minimum impurity decrease required for splitting
            bootstrap: if False, the whole dataset is used to build trees
            n_jobs: number of parallel jobs; -1 to use all available processors; None means 1
            ccp_alpha: complexity parameter for minimal cost-complexity pruning.
            max_samples: number of input samples to draw during bootstrap; integers directly specify the number,
                floating point values specify which fraction of samples to use; all by default

        The sklearn.ExtraTreesRegressor parameters `oob_score`, `verbose`, `warm_start` are not considered.

        See skl.ensemble.ExtraTreesRegressor parameters.
        """

        super().__init__(rng=rng, **kwargs)

        # validate parameters

        self._uncertainties = params.enumeration(uncertainties, {None, "naive"})

        n_estimators = params.integer(n_estimators, from_=1)
        criterion = params.enumeration(criterion, {"mse", "mae"})
        max_depth = params.any_(max_depth, lambda arg: params.integer(arg, from_=1), params.none)
        min_samples_split = params.any_(
            min_samples_split,
            lambda arg: params.integer(arg, from_=2),
            lambda arg: params.real(arg, above=0.0, to=1.0),
        )
        min_samples_leaf = params.any_(
            min_samples_leaf,
            lambda arg: params.integer(arg, from_=1),
            lambda arg: params.real(arg, above=0.0, to=1.0),
        )
        min_weight_fraction_leaf = params.real(min_weight_fraction_leaf, from_=0.0, to=1.0)
        max_features = params.any_(
            max_features,
            lambda arg: params.integer(arg, above=0),
            lambda arg: params.real(arg, above=0.0, to=1.0),
            lambda arg: params.enumeration(arg, {"auto", "sqrt", "log2"}),
            params.none,
        )
        max_leaf_nodes = params.any_(
            max_leaf_nodes, lambda arg: params.integer(arg, from_=1), params.none
        )
        min_impurity_decrease = params.real(min_impurity_decrease, from_=0.0)
        bootstrap = params.boolean(bootstrap)
        n_jobs = params.any_(
            n_jobs,
            lambda arg: params.integer(arg, from_=-1, to=-1),
            lambda arg: params.integer(arg, from_=1),
            params.none,
        )
        ccp_alpha = params.real(ccp_alpha, from_=0.0)
        max_samples = params.any_(
            max_samples,
            lambda arg: params.integer(arg, from_=1),
            lambda arg: params.real(arg, from_=0.0, to=1.0),
            params.none,
        )

        self._model = ExtraTreesRegressor(
            n_estimators=n_estimators,
            criterion=criterion,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_features=max_features,
            max_leaf_nodes=max_leaf_nodes,
            min_impurity_decrease=min_impurity_decrease,
            bootstrap=bootstrap,
            n_jobs=n_jobs,
            ccp_alpha=ccp_alpha,
            max_samples=max_samples,
        )
Example 25
def test_any_():
    """Tests any_ meta test."""

    # special case: single test
    assert params.any_(None, lambda arg: params.none(arg)) is None
    with pytest.raises(InvalidParameterError):
        params.any_("_", lambda arg: params.none(arg))

    # special case: or
    assert params.any_(None, lambda arg: params.none(arg), lambda arg: params.none(arg)) is None
    assert params.any_(None, lambda arg: params.none("_"), lambda arg: params.none(arg)) is None
    assert params.any_(None, lambda arg: params.none(arg), lambda arg: params.none("_")) is None
    with pytest.raises(InvalidParameterError):
        params.any_(None, lambda arg: params.none("_"), lambda arg: params.none("_"))

    # three tests
    assert (
        params.any_(
            None,
            lambda arg: params.none(arg),
            lambda arg: params.none(arg),
            lambda arg: params.none(arg),
        )
        is None
    )
    assert (
        params.any_(
            None,
            lambda arg: params.none(arg),
            lambda arg: params.none(arg),
            lambda arg: params.none("_"),
        )
        is None
    )
    assert (
        params.any_(
            None,
            lambda arg: params.none(arg),
            lambda arg: params.none("_"),
            lambda arg: params.none(arg),
        )
        is None
    )
    assert (
        params.any_(
            None,
            lambda arg: params.none(arg),
            lambda arg: params.none("_"),
            lambda arg: params.none("_"),
        )
        is None
    )
    assert (
        params.any_(
            None,
            lambda arg: params.none("_"),
            lambda arg: params.none(arg),
            lambda arg: params.none(arg),
        )
        is None
    )
    assert (
        params.any_(
            None,
            lambda arg: params.none("_"),
            lambda arg: params.none(arg),
            lambda arg: params.none("_"),
        )
        is None
    )
    assert (
        params.any_(
            None,
            lambda arg: params.none("_"),
            lambda arg: params.none("_"),
            lambda arg: params.none(arg),
        )
        is None
    )
    with pytest.raises(InvalidParameterError):
        params.any_(
            None,
            lambda arg: params.none("_"),
            lambda arg: params.none("_"),
            lambda arg: params.none("_"),
        )