Example #1
def setdiff2d(a, b):
    """Set difference between two sets of vectors.

    Returns the unique vectors in a that are not in b.
    The equivalent of np.setdiff1d for matrix rows.

    Parameters:
        a: NumPy 2d array (row vectors)
        b: NumPy 2d array (row vectors)
    Returns:
        NumPy 2d array, rows are unique vectors from a that are not in b
    """

    a = params.real_matrix(a)
    b = params.real_matrix(b)

    # special cases
    if len(a) == 0:
        return np.zeros((0, 0), dtype=float)

    # view each row as a single structured element so that np.setdiff1d compares whole rows
    a_rows = a.view([("", a.dtype)] * a.shape[1])
    b_rows = b.view([("", b.dtype)] * b.shape[1])
    res = np.setdiff1d(a_rows, b_rows).view(a.dtype).reshape(-1, a.shape[1])

    return res
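A minimal usage sketch (assuming `params.real_matrix` passes float arrays through unchanged; the structured-view trick requires C-contiguous input):

import numpy as np

a = np.array([[1.0, 2.0], [3.0, 4.0], [1.0, 2.0]])
b = np.array([[3.0, 4.0]])
setdiff2d(a, b)  # array([[1., 2.]]) -- the unique rows of a not in b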
Example #2
    def apply(self, data: Data) -> NormalPredictiveDistribution:
        r"""Predicts new inputs.

        Parameters:
            data: finite indexed data to predict

        Returns:
            predictive normal distribution
        """

        data = params.instance(
            data, Data
        )  # todo: params.data(..., is_finite=True, is_labeled=True)

        xpred = params.real_matrix(data.samples())

        # predict
        # scikit-learn's RandomForestRegressor.predict() method does not support
        # returning predictions for all trees in the ensemble. Therefore,
        # `preds = self._model.predict(xpred)` is insufficient.

        if self._uncertainties is None and self._correlations is None:
            preds = self._model.predict(xpred)
            return DeltaPredictiveDistribution(mean=preds)
        elif self._uncertainties == "naive":
            preds = np.asfarray([tree.predict(xpred) for tree in self._model.estimators_])
            if self._correlations is None:
                return NormalPredictiveDistribution(
                    mean=np.mean(preds, axis=0), stddev=np.std(preds, axis=0)
                )
            elif self._correlations == "naive":
                if (data.num_samples > 25000) and not self._force_corr:
                    warn(
                        "Input correlations requested for >2.5E4 predictions."
                        " Correlation matrix will not be computed, because a matrix this large may"
                        " take up too much RAM (2.5E4^2 entries * 8 bytes per entry / 1E6 bytes per MB = 5000 MB)."
                        " To force computation anyway, set `force_corr = True` in the learner constructor.",
                        UserWarning,
                    )
                    return NormalPredictiveDistribution(
                        mean=np.mean(preds, axis=0), stddev=np.std(preds, axis=0)
                    )
                else:
                    # Must handle single-prediction separately, as in this case np.corrcoef
                    # will return single number rather than 1x1 array.
                    if preds.shape[1] == 1:
                        corr = np.array([[1]])
                    else:
                        corr = np.corrcoef(preds, rowvar=False)
                    return CorrelatedNormalPredictiveDistribution(
                        mean=np.mean(preds, axis=0), stddev=np.std(preds, axis=0), corr=corr
                    )
            else:
                raise BenchmarkError(
                    "internal error, unknown parameter for correlations of RandomForestRegressionSklearn"
                )
        else:
            raise BenchmarkError(
                "internal error, unknown parameter for uncertainties of RandomForestRegressionSklearn"
            )
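For context, a standalone sketch of the "naive" per-tree uncertainty estimate above, using plain scikit-learn (data and hyperparameters are illustrative):

import numpy as np
from sklearn.ensemble import RandomForestRegressor

rng = np.random.default_rng(0)
X = rng.uniform(-1, 1, size=(200, 3))
y = X[:, 0] + np.sin(3 * X[:, 1]) + rng.normal(0, 0.1, size=200)

model = RandomForestRegressor(n_estimators=100, random_state=0).fit(X, y)

preds = np.asarray([tree.predict(X[:5]) for tree in model.estimators_])  # trees x samples
mean, stddev = preds.mean(axis=0), preds.std(axis=0)  # ensemble mean and spread
corr = np.corrcoef(preds, rowvar=False)  # samples x samples correlation matrix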
Example #3
    def apply(
        self, data: Data
    ) -> Union[DeltaPredictiveDistribution, NormalPredictiveDistribution]:
        r"""Predicts new inputs.

        Parameters:
            data: finite indexed data to predict

        Returns:
            predictive normal distribution
        """

        data = params.instance(data, Data)

        xpred = params.real_matrix(data.samples())

        # predict
        # scikit-learn's ExtraTreesRegressor.predict() method does not support
        # returning predictions for all trees in the ensemble. Therefore,
        # `preds = self._model.predict(xpred)` is insufficient.

        if self._uncertainties is None:
            preds = self._model.predict(xpred)
            return DeltaPredictiveDistribution(mean=preds)
        elif self._uncertainties == "naive":
            preds = np.asfarray([tree.predict(xpred) for tree in self._model.estimators_])
            return NormalPredictiveDistribution(
                mean=np.mean(preds, axis=0), stddev=np.std(preds, axis=0)
            )
        else:
            raise BenchmarkError(
                "internal error, unknown parameter for uncertainties of ExtremelyRandomizedTreesRegressionSklearn"
            )
Example #4
    def samples(self, indices: Optional[np.ndarray] = None) -> np.ndarray:
        """Query vector samples.

        Returns a sequence of samples or raises InvalidParameterError.

        Vectors are queried by themselves, that is, vectors are their own indices.

        Parameters:
            indices: a real matrix of appropriate dimensions (rows are vectors)

        Returns:
            real matrix (vectors are rows)

        Raises:
            InvalidParameterError: for invalid keys
        """

        samples = params.real_matrix(indices, ncols=self.dimensions)

        if self.domain is not None:
            if (samples < self._domain[:, 0]).any() or (
                    samples > self._domain[:, 1]).any():
                raise InvalidParameterError("vectors in domain",
                                            "vectors outside of domain")

        return samples
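The bounds test relies on NumPy broadcasting; a minimal sketch, assuming the domain is stored as a (d, 2) array of per-dimension [lower, upper] bounds:

import numpy as np

domain = np.array([[0.0, 1.0], [-1.0, 1.0]])  # bounds for each of d = 2 dimensions
samples = np.array([[0.5, 0.0], [2.0, 0.0]])  # n x d row vectors

# comparing (n, d) against (d,) broadcasts per column; the second sample
# violates the upper bound of the first dimension, so the check is True
outside = (samples < domain[:, 0]).any() or (samples > domain[:, 1]).any()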
Example #5
    def apply(self, data: Data) -> PredictiveDistribution:
        """Predicts new inputs.

        Parameters:
            data: finite indexed data to predict

        Returns:
            predictive normal distributions if predictive uncertainties were requested,
            otherwise delta distributions
        """

        data = params.instance(
            data, Data
        )  # todo: params.data(..., is_labeled=True, is_finite=True)

        xpred = params.real_matrix(data.samples())

        if self._with_uncertainties:
            try:
                preds, stddevs = self._model.predict(xpred, return_std=True)
                return NormalPredictiveDistribution(mean=preds, stddev=stddevs)
            except Py4JJavaError as e:
                raise BenchmarkError("applying lolo model failed") from e
        else:
            try:
                preds = self._model.predict(xpred, return_std=False)
                return DeltaPredictiveDistribution(mean=preds)
            except Py4JJavaError as e:
                raise BenchmarkError("applying lolo model failed") from e
Example #6
    def apply(self, data: Data) -> NormalPredictiveDistribution:
        r"""Predicts new inputs.

        For Gaussian processes, both the noise-free predictive (posterior)
        distribution as well as the noise estimate are normally distributed.
        The predictive distribution with noise is the sum of the former two.

        The $\alpha$ training noise specified at initialization time is not
        added at prediction time, and thus not part of the noise model.
        The current implementation considers contributions from any
        WhiteKernel or other kernel that has a hyperparameter 'noise_level'.

        Limitations:
            It is a currently accepted shortcoming that WhiteKernels that are
            not 'first-level' sum members might yield wrong noise models.
            Examples:
            WhiteKernel(...) + other kernels will work
            kernel(...) * WhiteKernel(...) will not work as intended

            Training data noise $\alpha$ is not added

        Parameters:
            data: finite indexed data to predict

        Returns:
            predictive normal distribution with the following decomposition:
                predicted: sum of model and noise distribution
                noise_part: normal distribution for estimated noise
                signal_part: normal distribution for estimated model contribution;
                             the Gaussian process' "predictive variance";
                             depends only on distance from the training data
        """

        data = params.instance(
            data,
            Data)  # todo: params.data(..., is_finite=True, is_labeled=True)

        xpred = params.real_matrix(data.samples())
        n = data.num_samples

        # predict
        preds, stddevs = self._model.predict(xpred, return_std=True)

        # noise
        # the noise is the sum of the noise_level values of all WhiteKernels,
        # where noise_level is a variance (not a standard deviation);
        # this assumes that the noise levels are independent
        noise = tuple(v for k, v in self._model.kernel_.get_params().items()
                      if k.endswith("noise_level"))
        noise = np.ones(shape=n) * np.sum(noise)
        noise_part = NormalPredictiveDistribution(mean=np.zeros(shape=n),
                                                  stddev=np.sqrt(noise))

        return NormalPredictiveDistribution(
            mean=preds,
            stddev=np.sqrt(np.square(stddevs) + noise),
            noise_part=noise_part,
            signal_part=NormalPredictiveDistribution(mean=preds,
                                                     stddev=stddevs),
        )
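A standalone sketch of the noise_level extraction on a fitted scikit-learn kernel (data is illustrative; note that endswith("noise_level") does not match the noise_level_bounds entries):

import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel

rng = np.random.default_rng(0)
X = rng.uniform(0, 1, size=(30, 1))
y = np.sin(6 * X[:, 0]) + rng.normal(0, 0.1, size=30)

gp = GaussianProcessRegressor(kernel=RBF() + WhiteKernel(noise_level=0.1)).fit(X, y)

# keys look like 'k2__noise_level'; noise_level values are variances
noise = [v for k, v in gp.kernel_.get_params().items() if k.endswith("noise_level")]
noise_stddev = np.sqrt(np.sum(noise))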
Example #7
    def line(self, line, color=0, **kwargs):
        """Draw a line.

        Parameters:
            line: n x 2 matrix of n points in two dimensions
            color: color index
        """

        line = params.real_matrix(line, ncols=2)
        color = params.integer(color, from_=0, below=len(self.configuration.color_set))

        self.ax.plot(
            line[:, 0], line[:, 1], linestyle="-", color=self.configuration.color(color), **kwargs
        )
Example #8
        def morsef(r):
            """Evaluate Morse potential at a sequence of vectors r.

            Parameters:
                r: n x 1 matrix of n one-dimensional vectors

            Returns:
                vector of Morse potential values at r
            """

            r = params.real_matrix(r, ncols=1)
            n = len(r)

            gamma = np.exp(-self._a * (r - self._r0))
            v = self._d * (np.square(gamma) - 2 * gamma)
            return v.reshape(n)
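A quick sanity check of the formula: at r = r0, gamma = 1 and the potential equals the well depth -d (parameter values here are hypothetical):

import numpy as np

d, a, r0 = 1.5, 1.0, 2.0  # hypothetical depth, width, and equilibrium distance
r = np.array([[r0]])
gamma = np.exp(-a * (r - r0))  # = 1 at the minimum
v = d * (np.square(gamma) - 2 * gamma)  # = -d = -1.5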
Example #9
    def friedman_silverman_1989(xx):
        """Computes Friedman & Silverman (1989) test function without noise.

        Parameters:
            xx: matrix, rows are input vectors

        Returns:
            vector of computed function values
        """

        xx = params.real_matrix(
            xx)  # base class verifies dimensionality and domain

        return (0.1 * np.exp(4 * xx[:, 0]) + 4 /
                (1 + np.exp(-(xx[:, 1] - 0.5) / 0.05)) + 3 * xx[:, 2] +
                2 * xx[:, 3] + xx[:, 4])
Example #10
    def friedman1979(xx):
        """Computes Friedman (1979) test function without noise term
        
        Parameters:
            xx: sequence of vectors
          
        Returns:
            sequence of computed labels
        """

        xx = params.real_matrix(
            xx)  # base class verifies dimensionality and domain

        return (10 * np.sin(np.pi * xx[:, 0] * xx[:, 1]) +
                20 * np.power(xx[:, 2] - 1 / 2, 2) + 10 * xx[:, 3] +
                5 * xx[:, 4])
Example #11
    def apply(
        self, data: Data
    ) -> Union[DeltaPredictiveDistribution, NormalPredictiveDistribution]:
        r"""Predicts new inputs.

        Parameters:
            data: finite indexed data to predict

        Returns:
            predictive normal distribution
        """

        data = params.instance(
            data,
            Data)  # todo: params.data(..., is_finite=True, is_labeled=True)

        xpred = params.real_matrix(data.samples())

        # predict
        # scikit-learn's ExtraTreesRegressor.predict() method does not support
        # returning predictions for all trees in the ensemble. Therefore,
        # `preds = self._model.predict(xpred)` is insufficient.

        if self._uncertainties is None:
            preds = self._model.predict(xpred)
            return DeltaPredictiveDistribution(mean=preds)
        elif self._uncertainties == "naive":
            # todo: there is a discrepancy between the ensemble mean and predictions
            #       until this has been resolved, naive uncertainties are not supported
            #       when fixing this, update parameter validation and unit tests
            raise NotImplementedError
        #     # #trees x #samples matrix of predictions of ensemble's trees
        #     staged_preds = np.asfarray(tuple(self._model.staged_predict(xpred)))

        #     # this does NOT yield the same predictions as self._model.predict(xpred)
        #     mean, stddev = (
        #         np.mean(staged_preds, axis=0),
        #         np.std(staged_preds, axis=0),
        #     )
        #     return NormalPredictiveDistribution(mean=mean, stddev=stddev)
        else:
            raise BenchmarkError(
                "internal error, unknown parameter for uncertainties of ExtremelyRandomizedTreesRegressionSklearn"
            )
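A likely cause of the discrepancy noted in the todo: scikit-learn's staged_predict (a gradient-boosting API) yields the cumulative prediction after each boosting stage, so the final stage equals predict() while the mean over stages does not. A sketch with GradientBoostingRegressor on illustrative data:

import numpy as np
from sklearn.ensemble import GradientBoostingRegressor

rng = np.random.default_rng(0)
X = rng.uniform(0, 1, size=(100, 2))
y = X[:, 0] + rng.normal(0, 0.05, size=100)

gb = GradientBoostingRegressor(n_estimators=20, random_state=0).fit(X, y)
staged = np.asarray(list(gb.staged_predict(X[:3])))  # stages x samples

np.allclose(staged[-1], gb.predict(X[:3]))  # True: the last stage is the final prediction
np.allclose(staged.mean(axis=0), gb.predict(X[:3]))  # False in general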
Example #12
    def points(self, points, color=0, **kwargs):
        """Draw set of points.

        Parameters:
            points: n x 2 matrix of n points in two dimensions
            color: color index
        """

        points = params.real_matrix(points, ncols=2)
        color = params.integer(color, from_=0, below=len(self.configuration.color_set))

        self.ax.plot(
            points[:, 0],
            points[:, 1],
            linestyle="",
            marker="o",
            color=self.configuration.color(color),
            **kwargs,
        )
Example #13
    def schwefel26_1981(xx):
        """Computes Schwefel (1981) test function 26.

        Parameters:
            xx: input matrix, rows are samples

        Returns:
            sequence of computed labels

        Examples:
            schwefel26_1981(np.random.uniform(-500, 500, (100,2))) # evaluate on 100 2-dimensional inputs
        """

        xx = params.real_matrix(
            xx)  # base class verifies dimensionality and domain
        d = xx.shape[1]

        return 418.9829 * d - np.sum(
            np.multiply(xx, np.sin(np.sqrt(np.abs(xx)))), axis=-1)
Example #14
    def fit(self, data: Data) -> "RandomForestRegressionSklearn":
        """Fits the model using training data.

        Parameters:
            data: tabular labeled data to train on
        
        Returns:
            self (allows chaining)
        """

        data = params.instance(
            data,
            Data)  # todo: params.data(..., is_finite=True, is_labeled=True)
        n = data.num_samples

        xtrain = params.real_matrix(data.samples(), nrows=n)
        ytrain = params.real_vector(data.labels(), dimensions=n)

        self._model.fit(xtrain, ytrain)

        return self
Example #15
    def __init__(self, mean, stddev, corr, **kwargs):
        """Initialize state.

        The correlated normal distribution is completely characterized by
        its mean, standard deviations, and correlation matrix.

        Parameters:
            mean: a sequence of means (floats)
            stddev: a sequence of standard deviations (non-negative floats)
            corr: a matrix of Pearson correlations between individual predictions (floats between -1 and 1)
        """

        super().__init__(**kwargs)

        self._mean = params.real_vector(mean)
        self._stddev = params.real_vector(stddev,
                                          dimensions=len(self._mean),
                                          domain=(0, np.inf))
        self._corr = params.real_matrix(corr,
                                        nrows=len(self._mean),
                                        ncols=len(self._mean))
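A hypothetical usage sketch for two correlated predictions (class and argument names as in the examples above):

mean = [1.0, 2.0]
stddev = [0.1, 0.2]
corr = [[1.0, 0.3],
        [0.3, 1.0]]  # symmetric Pearson correlation matrix with unit diagonal

dist = CorrelatedNormalPredictiveDistribution(mean=mean, stddev=stddev, corr=corr)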
Example #16
    def doubleWell(xx):
        """Computes double well test function without noise term.

        Parameters:
            xx: sequence of vectors

        Returns:
            sequence of computed labels
        """

        xx = params.real_matrix(xx)  # base class verifies dimensionality and domain

        x = xx[:, 0]
        y = xx[:, 1]
        return (
            1 / 4 * np.power(x, 4)
            + 1 / 3 * np.power(x, 3)
            - 2 * np.power(x, 2)
            - 4 * x
            + np.power(y, 2)
            + 28 / 3
        )
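A quick check of the constant term, assuming doubleWell is in scope as defined above: the two wells sit at x = -2 and x = 2 (with y = 0), and the 28/3 offset places the deeper well at (2, 0) exactly at zero:

import numpy as np

doubleWell(np.array([[2.0, 0.0], [-2.0, 0.0]]))  # -> [0.0, 32/3]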
Example #17
    def fit(self, data: Data) -> "GaussianProcessRegressionSklearn":
        """Fits the model using training data.

        Parameters:
            data: labeled data to train on;
                  must derive from IndexedData and LabeledData

        Returns:
            self (allows chaining)
        """

        data = params.instance(
            data,
            Data)  # todo: params.data(..., is_finite=True, is_labeled=True)
        n = data.num_samples

        xtrain = params.real_matrix(data.samples(), nrows=n)
        ytrain = params.real_vector(data.labels(), dimensions=n)

        self._model.fit(xtrain, ytrain)

        return self
    def fit(self, data: Data) -> "ExtremelyRandomizedTreesRegressionSklearn":
        """Fits the model using training data.

        Parameters:
            data: tabular labeled data to train on

        Returns:
            self (allows chaining)
        """

        data = params.instance(data, Data)

        if not data.is_labeled:
            raise InvalidParameterError("labeled data", "unlabeled data")
        n = data.num_samples

        xtrain = params.real_matrix(data.samples(), nrows=n)
        ytrain = params.real_vector(data.labels(), dimensions=n)

        self._model.fit(xtrain, ytrain)

        return self
Example #19
    def fit(self, data: Data) -> "RandomForestRegressionLolo":
        """Fits the model using training data.

        Parameters:
            data: labeled tabular data to train on

        Returns:
            self (allows chaining)
        """

        data = params.instance(
            data, Data
        )  # todo: params.data(..., is_labeled=True, is_finite=True)
        n = data.num_samples

        xtrain = params.real_matrix(data.samples(), nrows=n)
        ytrain = params.real_vector(data.labels(), dimensions=n)

        try:
            self._model.fit(xtrain, ytrain)
        except Py4JJavaError as e:
            raise BenchmarkError("training lolo model failed") from e

        return self