Example 1
    def box_whisker(self, positions, values, color=0, widths=0.5, **kwargs):
        """Draw box-whisker plots.

        Parameters:
            positions: locations of the plots on the horizontal axis
            values: samples for each location (one sequence per position)
            color: color index into the configured color set
            widths: widths of the boxes
        """

        positions = params.real_vector(positions)
        values = params.tuple_(values, params.real_vector, arity=len(positions))
        color = params.integer(color, from_=0, below=len(self.configuration.color_set))
        widths = params.real_vector(widths, dimensions=len(positions), domain=(0, 999))

        color = self.configuration.color(color)

        self.ax.boxplot(
            values,
            positions=positions,
            whis=(0, 100),
            bootstrap=None,
            widths=widths,
            notch=False,
            showmeans=True,
            boxprops={"color": color},
            whiskerprops={"color": color},
            capprops={"color": color},
            meanprops={"marker": "*", "markerfacecolor": color, "markeredgecolor": color},
            medianprops={"color": color},
            manage_ticks=False,
            **kwargs,
        )
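
A usage sketch for the method above; `plot` stands in for an instance of the class that exposes box_whisker (the class itself is not shown, so treat that name as an assumption):

# Hypothetical usage; `plot` is a stand-in for the object exposing box_whisker.
import numpy as np

rng = np.random.default_rng(0)
positions = [1.0, 2.0, 3.0]
values = tuple(rng.normal(loc=p, scale=0.3, size=50) for p in positions)
plot.box_whisker(positions, values, color=0, widths=[0.4, 0.4, 0.4])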
Example 2
def test_real_vector2():
    """Accumulated test cases."""

    # a single domain (a, b) is extended to all dimensions; a list [(a, b)] is not
    assert (
        params.real_vector([0.5, 0.5, 1], dimensions=3, domain=(0, np.inf))
        == np.asfarray([0.5, 0.5, 1])
    ).all()
    with pytest.raises(InvalidParameterError):
        params.real_vector([0.5, 0.5, 1], dimensions=3, domain=[(0, np.inf)])
Example 3
    def __init__(self,
                 internal_hp_optimization: bool = True,
                 kernel: Optional[Kernel] = None,
                 alpha: Union[float, Sequence] = 1e-5,
                 optimizer="fmin_l_bfgs_b",
                 n_restarts_optimizer=0,
                 normalize_y=False,
                 random_state: Optional[int] = None,
                 **kwargs):
        """Initialize state.

        sklearn-specific parameters are passed through to the implementation.

        Parameters:
            internal_hp_optimization: if True, hyperparameters are optimized "internally"
                by the Gaussian process, that is, scikit-learn optimizes hyperparameters
                and for smlb the learner has no hyperparameters;
                if False, hyperparameters are optimized by smlb (and scikit-learn does
                not optimize any hyperparameters)
            kernel: scikit-learn kernel; if None, a single Gaussian kernel is used as default
            alpha: regularization constant (scalar or vector); added as-is to kernel matrix diagonal.
                   Equivalent to adding a "WhiteKernel"; the default is the corresponding value from
                   scikit-learn's WhiteKernel, and different from scikit-learn's GaussianProcessRegressor.
            optimizer: hyperparameter optimization algorithm; used only if internal_hp_optimization is True
            n_restarts_optimizer: number of times optimizer is restarted; only used if internal_hp_optimization is True
            normalize_y: whether to subtract the mean of the labels
            random_state: integer seed

        See skl.gaussian_process.GaussianProcessRegressor for the remaining parameters.
        """

        super().__init__(**kwargs)

        internal_hp_optimization = params.boolean(internal_hp_optimization)
        kernel = params.any_(kernel, lambda arg: params.instance(arg, Kernel), params.none)
        # incomplete check for alpha as dimension becomes known only at fitting time
        alpha = params.any_(
            alpha,
            lambda arg: params.real(arg, from_=0),
            lambda arg: params.real_vector(arg, domain=[0, np.inf]),
        )
        # todo: check optimizer, requires params.union (of string and callable) and params.function
        normalize_y = params.boolean(normalize_y)
        random_state = params.integer(random_state)

        if kernel is None:
            kernel = skl.gaussian_process.kernels.RBF() + skl.gaussian_process.kernels.WhiteKernel()

        assert internal_hp_optimization is True  # external HP optimization not yet supported

        self._model = skl.gaussian_process.GaussianProcessRegressor(
            kernel=kernel,
            alpha=alpha,
            optimizer=optimizer,
            n_restarts_optimizer=n_restarts_optimizer,
            normalize_y=normalize_y,
            random_state=random_state,
        )
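
A construction sketch under the constraints validated above (internal_hp_optimization must stay True and random_state must be an integer). The class name comes from the return annotation in Example 13; its import path is not shown here, so this is a sketch only:

# Sketch only; assumes GaussianProcessRegressionSklearn is importable.
from sklearn.gaussian_process.kernels import Matern, WhiteKernel

learner = GaussianProcessRegressionSklearn(
    kernel=Matern(nu=2.5) + WhiteKernel(noise_level=1e-5),
    alpha=0.0,  # noise modeled by the WhiteKernel term rather than by alpha
    n_restarts_optimizer=5,
    random_state=42,
)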
Example 4
    def __init__(self, mean, stddev, **kwargs):
        """Initialize state.

        The normal distribution is completely characterized by
        its mean and standard deviation.

        Parameters:
            mean: a sequence of means (floats)
            stddev: a sequence of standard deviations (non-negative floats)
        """

        super().__init__(**kwargs)

        self._mean = params.real_vector(mean)
        self._stddev = params.real_vector(stddev,
                                          dimensions=len(self._mean),
                                          domain=(0, np.inf))
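
The two validators enforce equal lengths and non-negative standard deviations; a sketch of the resulting behavior (the class name NormalPredictiveDistribution is inferred from context and may differ):

# Class name is an assumption; illustrates the validation behavior only.
dist = NormalPredictiveDistribution(mean=[0.0, 1.0], stddev=[0.5, 0.25])

# Mismatched lengths or negative deviations raise InvalidParameterError:
# NormalPredictiveDistribution(mean=[0.0, 1.0], stddev=[0.5])
# NormalPredictiveDistribution(mean=[0.0, 1.0], stddev=[0.5, -0.1])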
Example 5
def two_sample_cumulative_distribution_function_statistic(
    sample_a,
    sample_b,
    f=lambda p, t: np.square(p - t),
    g=lambda s, w: np.sum(s * w)):
    r"""Compute a statistic of the difference between two empirical cumulative distribution functions.

    Calculate statistics of the cumulative distribution functions (CDF) of two samples.
    Let $x_1,\ldots,x_d$ be the union of the two samples, $x_i < x_{i+1}$, and let
    $w_i = x_{i+1}-x_i$, $i = 1,\ldots,d-1$ be the differences between them.
    The calculated statistics have the form $g(s,w)$ where $s_i = f(F_a(x_i), F_b(x_i))$
    and $F_a$, $F_b$ are the CDFs of the two samples.

    Here, the $x_i$ are the points where one or both of the CDFs changes, $f$ is a statistic
    that depends on the value of the two CDFs, and $g$ is an arbitrary function of $s$ and $w$.

    The default choice for $g$ is Riemann integration; as the CDFs are step functions, this is exact
    and leads to statistics of the form

    \[ \int_{-\infty}^{\infty} f(F_a(x),F_b(x)) dx . \]

    Parameters:
        sample_a: first sample; a sequence of real numbers
        sample_b: second sample; a sequence of real numbers;
                  can be of different length than first sample
        f: function accepting two same-length real vectors, returning a real vector of same length.
           This function computes a value that depends only on the two CDFs, and is thus constant
           between change points. The default is the squared difference, f(a,b) = np.square(a-b).
           The convention here is to use the left endpoint of the "steps".
        g: function accepting two same-length real vectors, returning a real number.
           Computes the statistic based on values of f and step "widths".
           The default, g(s,w) = np.sum(s * w), performs Riemann integration.
    """

    sample_a = params.real_vector(sample_a)
    sample_b = params.real_vector(sample_b)

    allx = np.union1d(sample_a, sample_b)  # all x where F_a and F_b change
    xdif = np.ediff1d(allx)  # width of Riemann integration bars
    allx = allx.reshape((len(allx), 1))
    cdfa = np.count_nonzero(np.sort(sample_a) <= allx, axis=1) / len(sample_a)
    cdfb = np.count_nonzero(np.sort(sample_b) <= allx, axis=1) / len(sample_b)
    stat = np.asfarray(f(cdfa, cdfb))

    return g(stat[:-1], xdif)
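
Choosing f as the absolute CDF difference and g as the maximum over change points (ignoring the widths) recovers the two-sample Kolmogorov-Smirnov statistic; for the samples below it equals 1/3:

# Usage sketch: the Kolmogorov-Smirnov statistic as a special case.
ks = two_sample_cumulative_distribution_function_statistic(
    [0.0, 1.0, 2.0],
    [0.5, 1.5, 2.5],
    f=lambda p, t: np.abs(p - t),
    g=lambda s, w: np.max(s),
)
assert np.isclose(ks, 1 / 3)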
Example 6
    def __init__(self, mean, **kwargs):
        """Initialize state.

        Parameters:
            mean: sequence of means (floats)
        """

        super().__init__(**kwargs)

        self._mean = params.real_vector(mean)
Example 7
    def apply(self, dist: PredictiveDistribution) -> Sequence[float]:
        """Calculate the likelihood of the given distribution improving on the target value.
        This currently only works for normal distributions. To extend to non-normal distributions,
        we should have the `PredictiveDistribution` class expose a `cdf()` method.

        Parameters:
            dist: a univariate predictive distribution

        Returns:
            The probability mass of the distribution that is above/below the target
            (depending on whether the goal is to maximize or minimize)
        """
        mean = params.real_vector(dist.mean)
        stddev = params.real_vector(dist.stddev, dimensions=len(mean), domain=(0, np.inf))

        # If the goal is to minimize, negate the target and the mean value.
        # Then, calculate the likelihood of improvement assuming maximization.
        target = self._target * self._direction
        mean = mean * self._direction
        return np.asfarray([self._calculate_li_above(m, s, target) for m, s in zip(mean, stddev)])
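
The helper _calculate_li_above is not shown here; given the docstring, it presumably computes the upper-tail mass of a normal distribution, along the lines of this hedged sketch:

# Hedged sketch of the unshown helper: P(X > target) for X ~ N(mean, stddev^2).
from scipy.stats import norm

def _calculate_li_above(mean: float, stddev: float, target: float) -> float:
    return norm.sf(target, loc=mean, scale=stddev)  # 1 - CDF at the target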
Example 8
    def shaded_line(
        self,
        positions: np.ndarray,
        values: List[np.ndarray],
        color_idx: int = 0,
        label: Optional[str] = None,
        quantile_width: float = 0.5,
        alpha: float = 0.2,
        show_extrema: bool = True,
        **kwargs,
    ):
        """Draw a line plot with shaded quantiles.

        Parameters:
            positions: 1-d array of point locations on the horizontal axis
            values: list of arrays, each one containing all of the values at a given location.
                len(values) must equal len(positions)
            color_idx: color index
            label: line label
            quantile_width: fraction of the range to shade. For the default value of 0.5,
                the shading spans the 25th to the 75th percentile.
            alpha: shading alpha level
            show_extrema: whether to draw dashed lines at the best/worst points
        """
        positions = params.real_vector(positions)
        values = params.tuple_(values, params.real_vector, arity=len(positions))
        color_idx = params.integer(color_idx, from_=0, below=len(self.configuration.color_set))
        quantile_width = params.real(quantile_width, from_=0, to=1)
        alpha = params.real(alpha, from_=0, to=1)

        color = self.configuration.color(color_idx)
        lower_bound = 0.5 - quantile_width / 2.0
        upper_bound = 0.5 + quantile_width / 2.0

        median = [np.median(samples) for samples in values]
        lower_shading = [np.quantile(samples, lower_bound) for samples in values]
        upper_shading = [np.quantile(samples, upper_bound) for samples in values]

        self.ax.plot(positions, median, linestyle="-", color=color, label=label, **kwargs)
        self.ax.fill_between(
            positions,
            lower_shading,
            upper_shading,
            color=color,
            alpha=alpha,
            **kwargs,
        )

        if show_extrema:
            min_val = [np.min(samples) for samples in values]
            max_val = [np.max(samples) for samples in values]
            self.ax.plot(positions, min_val, linestyle="--", color=color, **kwargs)
            self.ax.plot(positions, max_val, linestyle="--", color=color, **kwargs)
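
A usage sketch; as in Example 1, `plot` stands in for an instance of the unshown class that exposes shaded_line:

# Hypothetical usage; shades the 10th-90th percentile band (quantile_width=0.8).
import numpy as np

rng = np.random.default_rng(0)
positions = np.linspace(0.0, 1.0, 5)
values = [rng.normal(size=30) for _ in range(5)]
plot.shaded_line(positions, values, color_idx=1, label="score",
                 quantile_width=0.8, show_extrema=False)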
Example 9
    def __init__(self, mean, stddev, corr, **kwargs):
        """Initialize state.

        The correlated normal distribution is completely characterized by
        its mean, standard deviations, and correlation matrix.

        Parameters:
            mean: a sequence of means (floats)
            stddev: a sequence of standard deviations (non-negative floats)
            corr: a matrix of Pearson correlations between individual predictions (floats between -1 and 1)
        """

        super().__init__(**kwargs)

        self._mean = params.real_vector(mean)
        self._stddev = params.real_vector(stddev,
                                          dimensions=len(self._mean),
                                          domain=(0, np.inf))
        self._corr = params.real_matrix(corr,
                                        nrows=len(self._mean),
                                        ncols=len(self._mean))
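
For reference, these parameters determine the covariance matrix via cov_ij = stddev_i * stddev_j * corr_ij; a standalone sketch of that (assumed, not shown) relationship:

# Covariance from standard deviations and a correlation matrix.
import numpy as np

stddev = np.asarray([0.5, 0.25])
corr = np.asarray([[1.0, 0.3], [0.3, 1.0]])
cov = np.outer(stddev, stddev) * corr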
Example 10
def test_real_vector_1():
    """Tests real vectors."""

    assert np.array_equal(params.real_vector([1]), np.asfarray([1]))
    assert np.array_equal(params.real_vector([1, 2], dimensions=2), np.asfarray([1, 2]))
    assert np.array_equal(
        params.real_vector([1, 2], dimensions=2, domain=[0, 3]), np.asfarray([1, 2])
    )
    assert np.array_equal(
        params.real_vector([1, 2], domain=[[0.5, 1.5], [0, 3]]), np.asfarray([1, 2])
    )

    with pytest.raises(InvalidParameterError):
        params.real_vector([1, 2], dimensions=3)
    with pytest.raises(InvalidParameterError):
        params.real_vector([1, 2], domain=[0, 1.5])
Example 11
    def fit(self, data: Data) -> "RandomForestRegressionSklearn":
        """Fits the model using training data.

        Parameters:
            data: tabular labeled data to train on
        
        Returns:
            self (allows chaining)
        """

        # todo: params.data(..., is_finite=True, is_labeled=True)
        data = params.instance(data, Data)
        n = data.num_samples

        xtrain = params.real_matrix(data.samples(), nrows=n)
        ytrain = params.real_vector(data.labels(), dimensions=n)

        self._model.fit(xtrain, ytrain)

        return self
Example 12
    def fit(self, data: Data) -> "ExtremelyRandomizedTreesRegressionSklearn":
        """Fits the model using training data.

        Parameters:
            data: tabular labeled data to train on

        Returns:
            self (allows chaining)
        """

        data = params.instance(data, Data)

        if not data.is_labeled:
            raise InvalidParameterError("labeled data", "unlabeled data")
        n = data.num_samples

        xtrain = params.real_matrix(data.samples(), nrows=n)
        ytrain = params.real_vector(data.labels(), dimensions=n)

        self._model.fit(xtrain, ytrain)

        return self
Example 13
    def fit(self, data: Data) -> "GaussianProcessRegressionSklearn":
        """Fits the model using training data.

        Parameters:
            data: labeled data to train on;
                  must derive from IndexedData and LabeledData

        Returns:
            self (allows chaining)
        """

        # todo: params.data(..., is_finite=True, is_labeled=True)
        data = params.instance(data, Data)
        n = data.num_samples

        xtrain = params.real_matrix(data.samples(), nrows=n)
        ytrain = params.real_vector(data.labels(), dimensions=n)

        self._model.fit(xtrain, ytrain)

        return self
Example 14
    def fit(self, data: Data) -> "RandomForestRegressionLolo":
        """Fits the model using training data.

        Parameters:
            data: labeled tabular data to train on

        Returns:
            self (allows chaining)
        """

        data = params.instance(
            data, Data
        )  # todo: params.data(..., is_labeled=True, is_finite=True)
        n = data.num_samples

        xtrain = params.real_matrix(data.samples(), nrows=n)
        ytrain = params.real_vector(data.labels(), dimensions=n)

        try:
            self._model.fit(xtrain, ytrain)
        except Py4JJavaError as e:
            raise BenchmarkError("training lolo model failed") from e

        return self
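
Because fit returns self, training chains directly into prediction; a sketch assuming the smlb convention (suggested by Example 7) that learners expose an apply method returning a predictive distribution:

# Hypothetical usage; training_data and validation_data are smlb Data objects.
model = RandomForestRegressionLolo().fit(training_data)
predictions = model.apply(validation_data)  # assumed prediction entry point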