Example #1
    def apply(self, data: Data) -> PredictiveDistribution:
        """Predicts new inputs.

        Parameters:
            data: finite indexed data to predict

        Returns:
            predictive normal distributions if predictive uncertainties were requested,
            otherwise delta distributions
        """

        data = params.instance(
            data, Data
        )  # todo: params.data(..., is_labeled=True, is_finite=True)

        xpred = params.real_matrix(data.samples())

        if self._with_uncertainties:
            try:
                preds, stddevs = self._model.predict(xpred, return_std=True)
                return NormalPredictiveDistribution(mean=preds, stddev=stddevs)
            except Py4JJavaError as e:
                raise BenchmarkError("applying lolo model failed") from e
        else:
            try:
                preds = self._model.predict(xpred, return_std=False)
                return DeltaPredictiveDistribution(mean=preds)
            except Py4JJavaError as e:
                raise BenchmarkError("applying lolo model failed") from e
Example #2
    def finalize(self, data: Data) -> Data:
        """Change dataset according to registered failures and failure mode.

        Parameters:
            data: transformed Data

        Returns:
            Transformed Data after handling failures.
        """

        self.failures = sorted(list(set(
            self.failures)))  # remove duplicate indices

        if self.failmode == "raise":
            if len(self.failures) > 0:
                raise BenchmarkError(
                    "DataTransformation failed for some samples")
            return data
        elif self.failmode == "drop":
            return complement(data,
                              data.subset(self.failures))  # todo: duplicates?
        elif self.failmode == "mask":
            self.mask[self.failures] = True
            return data
        elif self.failmode == "index":
            self.index.extend(self.failures)
            return data

        raise BenchmarkError(
            f"Internal error, unrecognized failure mode '{self.failmode}'")
Example #3
    def labels(self, indices: Optional[np.ndarray] = None) -> Sequence[L]:
        """Query computed labels.

        Returns a sequence of labels or raises InvalidParameterError.

        Parameters:
            indices: optional sequence of sample indices to query; by default, all labels are returned

        Returns:
            A sequence of labels

        Raises:
            InvalidParameterError: for invalid indices
            BenchmarkError: when querying labels for unlabeled data,
                or if the label function returns too few or too many labels.
            Any exception the label function raises.
        """

        if not self.is_labeled:
            raise BenchmarkError("querying labels for unlabeled data")

        inputs = self.samples(indices)
        labels = self._function(inputs)

        if len(labels) != len(inputs):  # indices can be None (all samples)
            raise BenchmarkError(
                "Label function returned wrong number of labels")

        return labels
Example #4
    def apply(self, data: Data) -> NormalPredictiveDistribution:
        r"""Predicts new inputs.

        Parameters:
            data: finite indexed data to predict

        Returns:
            delta predictive distributions if no uncertainties were requested; otherwise
            predictive normal distributions (correlated if correlations were requested)
        """

        data = params.instance(
            data, Data
        )  # todo: params.data(..., is_finite=True, is_labeled=True)

        xpred = params.real_matrix(data.samples())

        # predict
        # scikit-learn's RandomForestRegressor.predict() method does not support
        # returning predictions for all trees in the ensemble. Therefore,
        # `preds = self._model.predict(xpred)` is insufficient.

        if self._uncertainties is None and self._correlations is None:
            preds = self._model.predict(xpred)
            return DeltaPredictiveDistribution(mean=preds)
        elif self._uncertainties == "naive":
            preds = np.asfarray([tree.predict(xpred) for tree in self._model.estimators_])
            if self._correlations is None:
                return NormalPredictiveDistribution(
                    mean=np.mean(preds, axis=0), stddev=np.std(preds, axis=0)
                )
            elif self._correlations == "naive":
                if (data.num_samples > 25000) and not self._force_corr:
                    warn(
                        "Input correlations requested for >2.5E4 predictions."
                        " Corelation matrix will not be computed, because a matrix this large may"
                        " take up too much RAM. (2.5E4^2 entries * 8 byes per entry / 1E6 bytes per MB = 3200MB)."
                        " To force computation anyway, set `force_corr = True` in learner constructor.",
                        UserWarning,
                    )
                    return NormalPredictiveDistribution(
                        mean=np.mean(preds, axis=0), stddev=np.std(preds, axis=0)
                    )
                else:
                    # Must handle single-prediction separately, as in this case np.corrcoef
                    # will return single number rather than 1x1 array.
                    if preds.shape[1] == 1:
                        corr = np.array([[1]])
                    else:
                        corr = np.corrcoef(preds, rowvar=False)
                    return CorrelatedNormalPredictiveDistribution(
                        mean=np.mean(preds, axis=0), stddev=np.std(preds, axis=0), corr=corr
                    )
            else:
                raise BenchmarkError(
                    "internal error, unknown parameter for correlations of RandomForestRegressionSklearn"
                )
        else:
            raise BenchmarkError(
                "internal error, unknown parameter for uncertainties of RandomForestRegressionSklearn"
            )
Example #5
    def handle_failure(self, i):
        """Take action according to failure mode.

        Parameters:
            i: index of failed sample
        """

        if self.failmode == "raise":
            raise BenchmarkError(f"DataTransformation failed for sample #{i}")
        elif self.failmode in ("drop", "mask", "index"):
            self.failures.append(i)
        else:
            raise BenchmarkError(
                f"Internal error, unknown failure mode '{self.failmode}'"
            )
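How the four failure modes registered by handle_failure combine with finalize (Example #2) can be shown without the smlb class; the following is an illustrative re-implementation of the semantics only, not the library code:

    samples = ["a", "b", "c", "d"]
    failures = [2]  # handle_failure(2) was called because sample "c" failed

    # "raise": finalize aborts with an error as soon as any failure was registered
    # "drop":  failed samples are removed from the result
    dropped = [s for i, s in enumerate(samples) if i not in set(failures)]   # ['a', 'b', 'd']
    # "mask":  all samples are kept; a boolean mask marks the failures for the caller
    mask = [i in set(failures) for i in range(len(samples))]                 # [False, False, True, False]
    # "index": all samples are kept; the failed indices are collected for the caller
    index = sorted(set(failures))                                            # [2]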
Example #6
    def apply(
        self, data: Data
    ) -> Union[DeltaPredictiveDistribution, NormalPredictiveDistribution]:
        r"""Predicts new inputs.

        Parameters:
            data: finite indexed data to predict

        Returns:
            delta predictive distribution if no uncertainties were requested,
            otherwise predictive normal distribution
        """

        data = params.instance(data, Data)

        xpred = params.real_matrix(data.samples())

        # predict
        # scikit-learn's ExtraTreesRegressor.predict() method does not support
        # returning predictions for all trees in the ensemble. Therefore,
        # `preds = self._model.predict(xpred)` is insufficient.

        if self._uncertainties is None:
            preds = self._model.predict(xpred)
            return DeltaPredictiveDistribution(mean=preds)
        elif self._uncertainties == "naive":
            preds = np.asfarray([tree.predict(xpred) for tree in self._model.estimators_])
            return NormalPredictiveDistribution(
                mean=np.mean(preds, axis=0), stddev=np.std(preds, axis=0)
            )
        else:
            raise BenchmarkError(
                "internal error, unknown parameter for uncertainties of ExtremelyRandomizedTreesRegressionSklearn"
            )
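The "naive" uncertainties above are simply the spread of the individual trees' predictions. A minimal scikit-learn sketch of that pattern on toy data (no smlb wrapper involved):

    import numpy as np
    from sklearn.ensemble import ExtraTreesRegressor

    rng = np.random.default_rng(0)
    X = rng.uniform(size=(50, 3))
    y = X.sum(axis=1)

    model = ExtraTreesRegressor(n_estimators=100, random_state=0).fit(X, y)
    per_tree = np.asarray([tree.predict(X[:5]) for tree in model.estimators_])  # trees x samples
    mean, stddev = per_tree.mean(axis=0), per_tree.std(axis=0)  # ensemble mean and naive uncertainty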
Example #7
File: noise.py  Project: syam-s/smlb
    def apply(self, data: Data) -> Data:
        """Transforms data.

        Parameters:
            data: labeled data to transform

        Returns:
            transformed data

        Raises:
            InvalidParameterError: if data is not labeled
        """

        data = params.instance(data, Data)
        if not data.is_labeled:
            raise InvalidParameterError("labeled data", "unlabeled data")

        # patch the labels() method of the data object (not class)
        # there is no need to store the old labels function as it is a class member, not an object member

        for name in ("_orig_labels", "labels", "_noise"):
            # if the object already defines this attribute, patching would overwrite it;
            # in that case a different (random) name would have to be chosen instead
            if name in data.__dict__:
                raise BenchmarkError(
                    f"internal error: data object already has {name} method")

        # create a copy of the dataset
        data = copy.deepcopy(data)

        # rename labels to _labels for data only
        setattr(data, "_orig_labels", getattr(data, "labels"))

        # store noise model
        setattr(data, "_noise", self._noise)

        # add wrapper as new labels() method

        def labels(self, indices=None):
            """Query labels of a sequence of samples.

            This wrapper adds noise.

            Parameters:
                indices: a sequence of sample 'indices'.
                         See 'samples()' for details.

            Returns:
                a sequence of labels
            """

            labels = self._orig_labels(indices)
            return labels + self._noise.noise(labels.shape)

        setattr(data, "labels", labels.__get__(data))

        return data
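The patching above relies on the descriptor protocol: calling `function.__get__(obj)` produces a method bound to that one object, leaving the class untouched. A minimal standalone sketch of the same idiom (class and function names are illustrative, not from smlb):

    class Dataset:
        def labels(self):
            return [1.0, 2.0]

    d = Dataset()
    d._orig_labels = d.labels                     # keep the original bound method

    def noisy_labels(self):
        return [y + 0.1 for y in self._orig_labels()]

    d.labels = noisy_labels.__get__(d)            # bind to this instance only

    print(d.labels())          # [1.1, 2.1]
    print(Dataset().labels())  # [1.0, 2.0] -- other instances are unaffected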
Example #8
File: plots.py  Project: syam-s/smlb
    def asymptotic_fit(self, fdata):
        r"""Compute asymptotic fit in log-space for a single curve.

        The asymptotic fit is computed using a simple form of linear ridge regression,
        estimating two parameters, offset b and slope a: $f(x) = b + a x$.
        In short, we augment x with a second dimension of constant value 1 to absorb the
        offset b into the weight vector, $f( (x,1) ) = \langle (a,b), (x,1) \rangle$. Then, solving
        $\argmin_{a,b} \sum_{i=1}^n (y_i - f((x_i,1)))^2 + \lambda ||(a,b)||^2$
        by rewriting in matrix notation, setting the derivative to zero and solving for (a,b) yields
        $(a,b) = (X^T X + \lambda I)^{-1} X^T y$, where the $n \times 2$-dimensional matrix X
        contains the data the fit is based on. The variance, or mean squared error (MSE),
        indicates how well empirical errors follow the asymptotic fit.

        Parameters:
            fdata: data for a single curve
        """

        # compute mean in log-space as the fit is linear in log space
        # todo: verify that this is the correct procedure
        sizes = self._logf(np.asfarray(tuple(entry[0] for entry in fdata)))
        means = np.asfarray(
            tuple(np.mean(self._logf(entry[1])) for entry in fdata))
        n = len(sizes)  # number of training set sizes

        if self._fit_weights is None:
            weights = np.ones(n)
        elif self._fit_weights == "variance":
            raise NotImplementedError  # todo: do weighting properly
            if min(len(entry[1]) for entry in fdata) < 2:
                raise InvalidParameterError(
                    "multiple values per horizontal location",
                    "fewer than two samples for at least one location",
                    explanation=
                    "weighting by variance not defined for fewer than two samples",
                )
            # todo: check for zero variance cases and replace by one
            weights = tuple(1 / np.var(entry[1]) for entry in fdata)
        else:
            raise BenchmarkError("internal error, invalid weighting scheme")
        weights /= np.sum(weights)

        X = np.ones((n, 2))  # second column is 1
        X[:, 0], y = sizes, means  # fit is in log-space
        assert y.shape == (n, ), f"loss vector has wrong dimensions {y.shape}"

        # standard linear ridge regression in log-space
        slope, offset = np.linalg.pinv(X.T @ X + self._fit_lambda *
                                       np.identity(2)) @ X.T @ y

        # variance of the fit
        residuals = y - (offset + slope * sizes)
        variance = np.mean(np.asfarray(residuals**2))

        return offset, slope, residuals, variance
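The closed-form solution used above can be checked in isolation. A minimal numpy sketch of the same two-parameter ridge fit on synthetic log-space data (names are ours, not part of smlb):

    import numpy as np

    def ridge_fit_2d(x, y, lam=1e-3):
        """Fit f(x) = b + a*x via ridge regression: (a, b) = (X^T X + lam*I)^{-1} X^T y."""
        X = np.ones((len(x), 2))
        X[:, 0] = x               # first column: inputs, second column: constant 1
        a, b = np.linalg.pinv(X.T @ X + lam * np.identity(2)) @ X.T @ y
        return a, b               # slope, offset

    x = np.log([10, 100, 1000])
    y = 2.0 - 0.5 * x             # exact line, so the fit should recover it
    print(ridge_fit_2d(x, y))     # approximately (-0.5, 2.0) for small lam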
Example #9
    def signal_part(self):
        """Query signal part of decomposition.

        Raises:
            BenchmarkError: if distribution does not provide signal part
        """

        if self._signal_part is None:
            raise BenchmarkError(
                "Distribution does not provide signal part decomposition")

        return self._signal_part
Example #10
    def _render(self, target, **kwargs):
        """Render generalized function plot.

        Parameters:
            target: rendering target to which the evaluation outcome is rendered; see Evaluation._render method
        """

        # draw curves
        for (i, pd) in enumerate(self._plotdata):
            # point markers, every single point is drawn
            if self._visualization_type[i] == "points":
                self.points(pd, color=i)
            elif self._visualization_type[i] == "box-whisker":
                self.box_whisker(pd[0], pd[1], color=i, widths=pd[2])
            else:
                raise BenchmarkError("internal error, unknown visualization type")
Example #11
    def _parse_csv_entry(self, e: str) -> dict:
        """Helper function, parse single entry from underlying data.

        After processing, the entry contains exactly these keys:
            "citation1": first citation URL
            "citation2": second citation URL if it exists, empty string otherwise
            "formula": chemical sum formula
            "Tc/K": superconducting critical temperature in K

        Parameters:
            e: line from the underlying exported data file

        Returns:
            an entry with keys as above
        """

        e = e.split(",")

        if len(e) != 6:
            # some entries have commas in the formula
            formula = ",".join(e[2:-3])
            if formula[0] == '"':
                formula = formula[1:-1]
            e = e[:2] + [
                formula,
            ] + e[-3:]

        assert len(e) == 6
        assert e[3] == "Superconducting critical temperature (Tc)"
        assert e[5] == "K"

        try:
            value = float(e[4])
        except ValueError:
            try:
                value = e[4].split(" to ")
                assert len(value) == 2
                value = (float(value[0]), float(value[1]))
            except Exception as e:
                raise BenchmarkError(f"invalid temperature '{value}'") from e

        return {
            "citation1": e[0],
            "citation2": e[1],
            "formula": e[2],
            "Tc/K": value,
        }
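A worked example of the re-joining step above: a hypothetical exported line whose formula contains a comma splits into seven fields, and the slice `e[2:-3]` glues the formula back together so that exactly six fields remain (the row content is made up for illustration):

    line = 'url1,url2,"Ba0.2,La1.8CuO4",Superconducting critical temperature (Tc),36,K'
    e = line.split(",")          # 7 fields, because of the comma inside the quoted formula
    formula = ",".join(e[2:-3])  # '"Ba0.2,La1.8CuO4"'
    if formula[0] == '"':
        formula = formula[1:-1]  # strip the surrounding quotes
    e = e[:2] + [formula] + e[-3:]
    print(e)  # ['url1', 'url2', 'Ba0.2,La1.8CuO4', 'Superconducting critical temperature (Tc)', '36', 'K']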
Example #12
    def _determine_num_dimensions(self, total_dimensions: int) -> int:
        """Apply the self._dimensions_varied argument to a total number of dimensions to
        determine the number of dimensions varied with each step.
        """
        if self._dimensions_varied == "all":
            dimensions = total_dimensions
        elif isinstance(self._dimensions_varied, float):
            dimensions = int(np.ceil(self._dimensions_varied * total_dimensions))
        elif isinstance(self._dimensions_varied, int):
            dimensions = self._dimensions_varied
        else:
            dimensions = 0
        if dimensions <= 0 or dimensions > total_dimensions:
            raise BenchmarkError(
                f"Rook design optimizer cannot vary {dimensions} dimensions "
                f"for a dataset that has {total_dimensions} dimensions")
        return dimensions
Example #13
    def apply(
        self, data: Data
    ) -> Union[DeltaPredictiveDistribution, NormalPredictiveDistribution]:
        r"""Predicts new inputs.

        Parameters:
            data: finite indexed data to predict

        Returns:
            delta predictive distribution (predictive normal distributions for
            naive uncertainties are not supported yet; see below)
        """

        data = params.instance(
            data,
            Data)  # todo: params.data(..., is_finite=True, is_labeled=True)

        xpred = params.real_matrix(data.samples())

        # predict
        # scikit-learn's ExtraTreesRegressor.predict() method does not support
        # returning predictions for all trees in the ensemble. Therefore,
        # `preds = self._model.predict(xpred)` is insufficient.

        if self._uncertainties is None:
            preds = self._model.predict(xpred)
            return DeltaPredictiveDistribution(mean=preds)
        elif self._uncertainties == "naive":
            # todo: there is a discrepancy between the ensemble mean and predictions
            #       until this has been resolved, naive uncertainties are not supported
            #       when fixing this, update parameter validation and unit tests
            raise NotImplementedError
        #     # #trees x #samples matrix of predictions of ensemble's trees
        #     staged_preds = np.asfarray(tuple(self._model.staged_predict(xpred)))

        #     # this does NOT yield the same predictions as self._model.predict(xpred)
        #     mean, stddev = (
        #         np.mean(staged_preds, axis=0),
        #         np.std(staged_preds, axis=0),
        #     )
        #     return NormalPredictiveDistribution(mean=mean, stddev=stddev)
        else:
            raise BenchmarkError(
                "internal error, unknown parameter for uncertainties of ExtremelyRandomizedTreesRegressionSklearn"
            )
Example #14
    def add_auxiliary(self, key: str, value: Any):
        """Add auxiliary information.

        Parameters:
            key: string key for retrieving information later
            value: auxiliary information to store under key

        A setter could have been used, for example, as
        `auxiliary = { key: value }`. This solution was
        considered abuse of notation as the syntax would
        have suggested assignment but would have added an entry instead.
        """

        key = params.string(key)

        if self._auxiliary.keys() & key:
            raise BenchmarkError("internal error: non-unique evaluation auxiliary data")

        self._auxiliary[key] = value
Example #15
    def apply(self, data: Data, **kwargs) -> Data:
        """Draw random vectors.

        Parameters:
            data: Data to draw from

        Returns:
            TabularData of vectors
        """

        data = params.instance(data, Data)
        if self._domain is None:
            if data.domain is None:
                domain = np.asarray([[0, 1]] * data.dimensions)
            else:
                domain = data.domain
        else:
            domain = params.hypercube_domain(
                self._domain, dimensions=data.dimensions
            )  # checks dimensionality (see __init__)

        for low, high in domain:
            if low == -np.inf or high == np.inf:
                raise BenchmarkError("can not sample from infinite domain")

        # vectors = np.transpose(
        #     np.asfarray(
        #         [
        #             self.random.uniform(low=low, high=high, size=self._size)
        #             for (low, high) in self._domain
        #         ]
        #     )
        # )

        # this version avoids the python loop for efficiency in high dimensions
        vectors = (
            self.random.uniform(size=(self._size, data.dimensions)) * (domain[:, 1] - domain[:, 0])
            + domain[:, 0]  # noqa W503
        )

        return data.subset(vectors)
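The vectorized expression above draws all coordinates in one call and then rescales them per dimension. A minimal numpy sketch of the same affine rescaling (array values are illustrative):

    import numpy as np

    rng = np.random.default_rng(0)
    domain = np.asarray([[0.0, 1.0], [-5.0, 5.0], [10.0, 20.0]])  # one (low, high) row per dimension
    size, dims = 4, domain.shape[0]

    u = rng.uniform(size=(size, dims))                           # uniform in [0, 1) per coordinate
    vectors = u * (domain[:, 1] - domain[:, 0]) + domain[:, 0]   # rescale column-wise to [low, high)
    print(vectors.shape)  # (4, 3)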
Example #16
    def __init__(self, cdk_jar_path: Optional[str] = None):
        """Initialize CDK Java gateway.

        See base class JavaGateway for details.

        This class provides CDK-specific functionality,
        namely the path to the CDK .jar file.

        Parameters:
            cdk_jar_path: local filesystem path to the CDK jar, e.g.,
                '/file/path/cdk.jar'. If not specified, smlb tries to
                find the CDK jar.

        Raises:
            BenchmarkError: if the CDK .jar file cannot be found.
        """

        # todo: optional_
        # cdk_jar_path = params.optional_(cdk_jar_path, params.string)  todo: valid path
        cdk_jar_path = params.any_(cdk_jar_path, params.string, params.none)

        # finding CDK .jar file logic
        if cdk_jar_path is None:
            if self._cdk_jar_path_auto is not None:
                # already detected, use stored path
                cdk_jar_path = self._cdk_jar_path_auto
            else:
                # attempt to find CDK .jar file
                # todo: find correct path for installed versions

                path = os.path.join(os.path.dirname(__file__), "../build/cdk.jar")
                if not os.access(path, os.R_OK):
                    raise BenchmarkError(
                        "Valid path to .jar file",
                        path,
                        explanation=f"Jar file {path} does not exist or is not readable.",
                    )

                cdk_jar_path = path

        super().__init__(cdk_jar_path)
Example #17
    def labels(self, indices: Optional[Sequence[int]] = None) -> np.ndarray:
        """Query labels.

        Returns a sequence of labels or raises InvalidParameterError.

        Parameters:
            indices: A sequence of non-negative integers in the range [0, n),
                where n is number of samples. By default, all labels are returned.

        Returns:
            NumPy ndarray of labels. Type of labels depends on the data.

        Raises:
            InvalidParameterError: for invalid indices
            BenchmarkError: if the dataset is not labeled
        """

        if not self.is_labeled:
            raise BenchmarkError("Querying labels of unlabeled data")

        indices = self._indices_testf(indices)

        return self._labels[indices] if indices is not None else self._labels
Example #18
    def fit(self, data: Data) -> "RandomForestRegressionLolo":
        """Fits the model using training data.

        Parameters:
            data: labeled tabular data to train on

        Returns:
            self (allows chaining)
        """

        data = params.instance(
            data, Data
        )  # todo: params.data(..., is_labeled=True, is_finite=True)
        n = data.num_samples

        xtrain = params.real_matrix(data.samples(), nrows=n)
        ytrain = params.real_vector(data.labels(), dimensions=n)

        try:
            self._model.fit(xtrain, ytrain)
        except Py4JJavaError as e:
            raise BenchmarkError("training lolo model failed") from e

        return self
Example #19
    def evaluate(self, results, **kwargs):
        """Compute plot data for multiple generalized (set-valued) functions.

        Multiple curves C_1, ..., C_k can be drawn.
        Each curve C_i is specified by a non-empty sequence of 2-tuples,
        where the first value is location on horizontal axis, and the
        other value is a sequence of locations on the vertical axis.

        Each curve can be drawn in a different way (points, box-whisker).

        Parameters:
            results: sequence of generalized functions data (curve data).
                     Each datum is a sequence of tuples (x,fx), where
                     x is a real number and fx is a sequence of real numbers.

        Examples:
            # two curves sharing one horizontal location
            evaluate([
                [(1,(1,0.9,1.1)), (3,(2,))],  # curve 1
                [(1,(0.7,)), (2,(3.1,2.8)), (4,(5.5,7.3,6))], # curve 2
            ])
        """

        super().evaluate(results=results, **kwargs)

        # parameter validation

        tuple_testf = lambda arg: params.tuple_(arg, params.real, params.real_vector, arity=2)
        curve_testf = lambda arg: params.tuple_(arg, tuple_testf)
        results = params.tuple_(results, curve_testf)

        # _rectify evaluates to True if True or if > 0
        if len(results) > len(self.RECTIFY_DELTAS) and self._rectify:
            raise InvalidParameterError(
                f"at most {len(self.RECTIFY_DELTAS)} curves", f"{len(results)} curves"
            )

        # finalize parameter validation for visualization_type
        if not is_sequence(self._visualization_type):
            self._visualization_type = (self._visualization_type,) * len(results)
        self._visualization_type = params.tuple_(
            self._visualization_type,
            lambda arg: params.enumeration(arg, {"points", "box-whisker", "shaded-line"}),
            arity=len(results),
            default="points",
        )

        # prepare plot

        # determine all distinct horizontal positions in the results data
        all_positions = np.unique([entry[0] for curve in results for entry in curve])

        # there is nothing to do without data to plot
        if len(all_positions) == 0:
            self._plotdata = []
            return

        # do not rectify if there is only a single horizontal position
        if len(all_positions) == 1 or self._rectify is False:
            self._rectify = 0.0

        # automatic determination of horizontal rectification factor
        #
        # the correct way to draw box-plots on a logarithmic horizontal axis is to have
        # different left-width and right-width of the boxes. However, matplotlib does not
        # support this. Because box widths are small compared to horizontal plot range,
        # it suffices to use the sum of left- and right-half widths.
        between_groups_spacing = 0.4
        in_group_spacing = 0.9  # box-whisker plots
        if self.axes_scales[0] == "linear":
            logf = lambda arg: arg
            powf = lambda arg: arg
        elif self.axes_scales[0] == "log":
            base = 10
            logf = lambda arg: np.log(arg) / np.log(base)
            powf = lambda arg: np.power(base, arg)

        if self._rectify is True:
            # diff(...) requires at least two horizontal locations; this is ensured above
            self._rectify = (
                between_groups_spacing * min(np.diff(logf(all_positions))) / len(results)
            )

        # determine positions
        self._plotdata = [None] * len(results)
        deltas = self.RECTIFY_DELTAS[len(results)] if self._rectify else np.zeros(len(results))
        for (i, curve) in enumerate(results):
            # point markers, every single point is drawn
            if self._visualization_type[i] == "points":
                positions = powf(
                    np.hstack(
                        [
                            logf(entry[0] * np.ones(len(entry[1]))) + deltas[i] * self._rectify / 2
                            for entry in curve
                        ]
                    )
                )
                values = np.hstack([entry[1] for entry in curve])
                self._plotdata[i] = np.transpose([positions, values])
            # box-whisker plots
            elif self._visualization_type[i] == "box-whisker":
                positions = np.asfarray(
                    [logf(entry[0]) + deltas[i] * self._rectify / 2 for entry in curve]
                )
                values = [entry[1] for entry in curve]
                # can't use rectify for width if 0; 1 is a wild guess
                # todo: if plot ranges have been set, a better default value could
                #       be 10% of horizontal plot range
                w = 1 if not self._rectify else self._rectify
                widths = powf((positions + w / 2) * in_group_spacing) - powf(
                    (positions - w / 2) * in_group_spacing
                )
                positions = powf(positions)
                self._plotdata[i] = (positions, values, widths)
            elif self._visualization_type[i] == "shaded-line":
                positions = np.asfarray([entry[0] for entry in curve])
                values = [entry[1] for entry in curve]
                self._plotdata[i] = (positions, values)
            else:
                raise BenchmarkError("internal error, unknown visualization type")
Example #20
    def run(self):
        """Execute workflow."""

        nlearn, ntrain = len(self._learners), len(self._training)
        ntotal = nlearn * ntrain
        self._progressf(0, ntotal)

        # 1) Validation data

        # sample validation data from dataset
        validation_data = self._validation.fit(self._data).apply(self._data)

        # remove validation data from dataset for finite datasets
        if self._data.is_finite:
            remaining_data = complement(self._data, validation_data)
        else:  # infinite
            # any finite subset has measure zero
            remaining_data = self._data

        # 2) Training sets

        # sample training sets from remaining dataset
        training_data = tuple(
            sampler.fit(remaining_data).apply(remaining_data) for sampler in self._training
        )

        # verify that the intersection between validation and all training sets is empty
        for train in training_data:
            # this assumes that both validation and training set are finite
            inters = intersection(train, validation_data)
            if inters.num_samples > 0:
                i, j, k = inters.num_samples, validation_data.num_samples, train.num_samples
                msg = f"Non-empty intersection between validation and training data ({i} shared samples out of {j} and {k})"
                raise BenchmarkError(msg)

        # 3) Featurization

        # featurize validation and training sets
        validation_data = self._features.fit(validation_data).apply(validation_data)
        training_data = tuple(self._features.fit(train).apply(train) for train in training_data)

        # 4) Training and prediction

        # train each learner on each training set and predict validation set
        predictions = np.empty((nlearn, ntrain), dtype=PredictiveDistribution)
        for i, learner in enumerate(self._learners):
            for j, training in enumerate(training_data):
                learner.fit(training)
                predictions[i, j] = learner.apply(validation_data)

                self._progressf(i * ntrain + j + 1, ntotal)  # 1-based

        # 5) Evaluate results

        # compute evaluation metric for each run
        metric = np.asfarray(
            [
                [
                    self._metric.evaluate(true=validation_data.labels(), pred=predictions[i, j])
                    for j in range(ntrain)
                ]
                for i in range(nlearn)
            ]
        )

        # render each evaluation
        eval_data = [
            [(train.num_samples, (metric[i, j],)) for j, train in enumerate(training_data)]
            for i, learner in enumerate(self._learners)
        ]
        for eval_ in self._evaluations:
            eval_.evaluate(eval_data)
            eval_.render()
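The eval_data assembled at the end has exactly the curve format that evaluate() in Example #19 expects: one curve per learner, and one (training set size, (metric value,)) point per training set. An illustrative instance with made-up numbers:

    eval_data = [
        [(100, (0.31,)), (300, (0.22,)), (1000, (0.15,))],  # learner 1: metric vs. training set size
        [(100, (0.40,)), (300, (0.28,)), (1000, (0.19,))],  # learner 2
    ]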
Example #21
    def check_arity(expected, actual):
        if expected != actual:
            raise BenchmarkError(
                f"Invalid descriptor result arity (expected {expected}, was {actual})"
            )
Example #22
    def apply(self, data: Data) -> TabularData:
        """Compute selected molecular features.

        Parameters:
            data: molecular structures given as SMILES strings.
                  Can be labeled, and labels will be retained

        Returns:
            TabularData with CDK molecular features as samples
        """

        data = params.instance(data, Data)  # todo: params.data(data, is_finite=True)

        failmode = DataTransformationFailureMode(self._failmode, data.num_samples)

        # set up molecule SMILES
        builder = self._java_gateway.jvm.org.openscience.cdk.DefaultChemObjectBuilder.getInstance()
        parser = self._java_gateway.jvm.org.openscience.cdk.smiles.SmilesParser(builder)

        def parse_smiles(s: str, i: int):
            """Return parsed SMILES string or None on failure."""
            try:
                return parser.parseSmiles(self._samplef(s))
            except py4j.protocol.Py4JJavaError:
                # expected to be raised from org.openscience.cdk.exception.InvalidSmilesException
                failmode.handle_failure(i)
                return None  # internal sentinel value

        smiles = tuple(parse_smiles(s, i) for i, s in enumerate(data.samples()))

        # compute descriptors
        # todo: the dtype of the columns could be set in advance by querying the descriptors
        #       currently, all values are stored as floating point numbers
        features = np.empty((data.num_samples, np.sum(self._arities)))
        index = 0

        def java_is_instance_of(object_, class_):
            return py4j.java_gateway.is_instance_of(
                self._java_gateway, object_, "org.openscience.cdk.qsar.result." + class_
            )

        def check_arity(expected, actual):
            if expected != actual:
                raise BenchmarkError(
                    f"Invalid descriptor result arity (expected {expected}, was {actual})"
                )

        for descriptor, arity in zip(self._descriptors, self._arities):
            for i, smile in enumerate(smiles):
                if smile is None:
                    features[i, index : index + arity] = float("nan")
                    continue

                try:
                    value = descriptor.calculate(smile).getValue()
                except py4j.protocol.Py4JJavaError:
                    failmode.handle_failure(i)
                    features[i, index : index + arity] = float("nan")
                    continue

                if java_is_instance_of(value, "IntegerResult"):
                    check_arity(arity, 1)
                    features[i, index] = int(value.intValue())
                elif java_is_instance_of(value, "DoubleResult"):
                    check_arity(arity, 1)
                    features[i, index] = float(value.doubleValue())
                elif java_is_instance_of(value, "BooleanResult"):
                    check_arity(arity, 1)
                    features[i, index] = bool(value.booleanValue())
                elif java_is_instance_of(value, "IntegerArrayResult"):
                    check_arity(arity, value.length())
                    features[i, index : index + arity] = tuple(
                        int(value.get(j)) for j in range(value.length())
                    )
                elif java_is_instance_of(value, "DoubleArrayResult"):
                    check_arity(arity, value.length())
                    features[i, index : index + arity] = tuple(
                        float(value.get(j)) for j in range(value.length())
                    )
                # there seems to be no BooleanArrayResult in CDK
                else:
                    name = value.getClass().getSimpleName()
                    raise BenchmarkError(f"Unsupported CDK result type '{name}'")
            index += arity

        result = (
            TabularData(data=features, labels=data.labels())
            if data.is_labeled
            else TabularData(data=features)
        )

        result = failmode.finalize(result)

        return result
Example #23
    def __init__(
        self,
        num_trees: int = -1,
        use_jackknife: bool = True,
        bias_learner: Optional[BaseLoloLearner] = None,
        leaf_learner: Optional[BaseLoloLearner] = None,
        subset_strategy: Union[str, int, float] = "auto",
        min_leaf_instances: int = 1,
        max_depth: int = 2 ** 30,
        uncertainty_calibration: bool = False,
        randomize_pivot_location: bool = False,
        # randomly_rotate_features: bool = False, currently in develop branch
        **kwargs
    ):
        """Initialize random forest model.

        See lolo Scala source code for initialization parameters:
        https://github.com/CitrineInformatics/lolo/blob/develop/src/main/scala/io/citrine/lolo/learners/RandomForest.scala

        When using `uncertainty_calibration=False` (the default), the number of trees
        `num_trees` should be set to a multiple of the number n of training samples,
        `num_trees = 4 * n` or higher. When using `uncertainty_calibration=True`,
        `num_trees = 64` is sufficient.

        Parameters:
            num_trees: number of trees in the forest; -1 uses number of training samples
            use_jackknife: whether to use jackknife-based variance estimates
            bias_learner: algorithm used to model bias
            leaf_learner: algorithm used at each leaf of the random forest
            subset_strategy: strategy to determine number of features used at each split
                "auto": use the default for lolo (all features for regression, sqrt for classification)
                "log2": use the base 2 log of the number of features
                "sqrt": use the square root of the number of features
                integer: set the number of features explicitly
                float: use a certain fraction of the features
            min_leaf_instances: minimum number of training samples required at each leaf
            max_depth: maximum depth of decision trees
            uncertainty_calibration: whether to empirically re-calibrate predicted uncertainties
                based on out-of-bag residuals
            randomize_pivot_location: whether to draw pivots randomly or always select the midpoint
            randomly_rotate_features: whether to rotate real scalar features for each tree
        """

        super().__init__(**kwargs)

        # validate parameters

        num_trees = params.any_(
            num_trees,
            lambda i: params.integer(i, above=0),
            lambda i: params.integer(i, from_=-1, to=-1),
        )

        use_jackknife = params.boolean(use_jackknife)

        bias_learner = params.any_(
            bias_learner, lambda arg: params.instance(arg, BaseLoloLearner), params.none
        )

        leaf_learner = params.any_(
            leaf_learner, lambda arg: params.instance(arg, BaseLoloLearner), params.none
        )

        subset_strategy = params.any_(
            subset_strategy,
            lambda s: params.enumeration(s, {"auto", "log2", "sqrt"}),
            lambda s: params.integer(s, above=0),
            lambda s: params.real(s, above=0),
        )

        min_leaf_instances = params.integer(min_leaf_instances, above=0)

        # the default 2**30 works for 32 bit or larger architectures
        max_depth = params.integer(max_depth, above=0)

        uncertainty_calibration = params.boolean(uncertainty_calibration)

        randomize_pivot_location = params.boolean(randomize_pivot_location)

        # randomly_rotate_features = params.boolean(randomly_rotate_features)

        # set up model

        try:
            self._model = RandomForestRegressor(
                num_trees=num_trees,
                use_jackknife=use_jackknife,
                bias_learner=bias_learner,
                leaf_learner=leaf_learner,
                subset_strategy=subset_strategy,
                min_leaf_instances=min_leaf_instances,
                max_depth=max_depth,
                uncertainty_calibration=uncertainty_calibration,
                randomize_pivot_location=randomize_pivot_location,
                # randomly_rotate_features=randomly_rotate_features,
            )
        except Py4JJavaError as e:
            raise BenchmarkError("instantiating lolo model failed") from e

        self._with_uncertainties = use_jackknife  # otherwise, deviations will be zero
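A usage sketch following the num_trees guidance in the docstring above; the import path and the surrounding training code are assumptions here, not taken from the listing:

    # from smlb import RandomForestRegressionLolo   (import path assumed; adjust to your installation)

    # with uncertainty calibration, a small fixed ensemble suffices
    learner = RandomForestRegressionLolo(num_trees=64, uncertainty_calibration=True)

    # without calibration, scale the ensemble with the number n of training samples
    # learner = RandomForestRegressionLolo(num_trees=4 * n, use_jackknife=True)

    # learner.fit(training_data)                     # as in Example #18
    # predictions = learner.apply(validation_data)   # as in Example #1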
Example #24
    def __init__(
            self,
            select: Union[str, Sequence[str]] = "all",
            samplef: Callable[[Any], Any] = lambda arg: arg,
            stoichiometry_p_list: Sequence[int] = (0, 2, 3, 5, 7, 10),
            elemental_preset: str = "magpie",
            ionic_fast: bool = False,
            valence_orbitals: Sequence[str] = ("s", "p", "d", "f"),
            valence_props: Sequence[str] = ("avg", "frac"),
            **kwargs,
    ):
        """Initialize state.

        Selected parameters of the wrapped matminer classes Stoichiometry, ElementProperty,
        IonProperty, ValenceOrbital can be passed through. These parameters are prefixed
        with stoichiometry, elemental, ionic, valence. For example, stoichiometry_p_list
        is the p_list parameter of Stoichiometry. For further details on these, see
        https://github.com/hackingmaterials/matminer/blob/master/matminer/featurizers/composition.py

        Parameters:
            select: which feature sets to compute (by default, all). Specifying
                multiple sets, e.g., ('stoichiometry', 'elemental'), selects all of them.
                Valid choices:
                'all': all features
                'stoichiometry': norms of stoichiometric features
                'elemental': element properties
                'ionic': ion properties
                'valence': valence orbital shell features
            samplef: a function accepting and returning a sample. This enables
                transformation of samples, for example, to select an entry by key
                if sample is a dictionary, or to turn a dictionary into a vector.
                Default is to return the sample unchanged.
            stoichiometry_p_list: list of L_p norms to compute
            elemental_preset: matminer preset to use. Valid choices include:
                'magpie', 'deml', 'matminer', 'matscholar_el', 'megnet_el'
            ionic_fast: if True, assumes that elements exist in single oxidation state
            valence_orbitals: which valence orbitals to consider
            valence_props: whether to return average properties, fractional, or both

        Requires the matminer package (see file documentation).
        """

        super().__init__(**kwargs)

        SELECT_SETS = ("stoichiometry", "elemental", "ionic", "valence")

        if select == "all":
            select = SELECT_SETS
        if isinstance(select, str):
            select = (select,
                      )  # tuple(str,) yields tuple of characters in str
        select = params.tuple_(
            select,
            lambda arg: params.enumeration(arg, set(SELECT_SETS)),
        )

        self._stoichiometry_p_list = params.tuple_(
            stoichiometry_p_list, lambda p: params.integer(p, from_=0))
        self._elemental_preset = params.enumeration(
            elemental_preset,
            {"magpie", "deml", "matminer", "matscholar_el", "megnet_el"})
        self._ionic_fast = params.boolean(ionic_fast)
        self._valence_orbitals = params.tuple_(
            valence_orbitals,
            lambda arg: params.enumeration(arg, {"s", "p", "d", "f"}))
        self._valence_props = params.tuple_(
            valence_props,
            lambda arg: params.enumeration(arg, {"avg", "frac"}))

        self.samplef = samplef  # todo: add callable to params

        # set up matminer
        try:
            import matminer
            import matminer.featurizers
            import matminer.featurizers.base
            import matminer.featurizers.composition
            import matminer.featurizers.conversions
            import pymatgen
        except ModuleNotFoundError as e:
            raise BenchmarkError(
                f"'{type(self).__name__}' requires 'matminer' and 'pymatgen' packages"
            ) from e

        self._composition = pymatgen.core.composition.Composition

        # set up features
        features = []
        if "stoichiometry" in select:
            features.append(
                matminer.featurizers.composition.Stoichiometry(
                    p_list=self._stoichiometry_p_list))
        if "elemental" in select:
            features.append(
                matminer.featurizers.composition.ElementProperty.from_preset(
                    self._elemental_preset))
        if "ionic" in select:
            features.append(
                matminer.featurizers.composition.IonProperty(
                    fast=self._ionic_fast))
        if "valence" in select:
            features.append(
                matminer.featurizers.composition.ValenceOrbital(
                    orbitals=self._valence_orbitals,
                    props=self._valence_props))

        self._mmfeatures = matminer.featurizers.base.MultipleFeaturizer(
            features)
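The same feature stack can be assembled directly with matminer; a minimal sketch using only the calls that appear above, featurizing a single composition rather than a whole dataset (toy formula, not from the listing):

    import matminer.featurizers.base
    import matminer.featurizers.composition
    import pymatgen.core.composition

    featurizer = matminer.featurizers.base.MultipleFeaturizer([
        matminer.featurizers.composition.Stoichiometry(p_list=(0, 2, 3, 5, 7, 10)),
        matminer.featurizers.composition.ElementProperty.from_preset("magpie"),
        matminer.featurizers.composition.IonProperty(fast=False),
        matminer.featurizers.composition.ValenceOrbital(orbitals=("s", "p", "d", "f"),
                                                        props=("avg", "frac")),
    ])

    composition = pymatgen.core.composition.Composition("Fe2O3")
    features = featurizer.featurize(composition)   # flat sequence of feature values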