Example #1
0
def test_check_reflect_boolean():
    """Exercise `kernels._check_reflect` with `None`, `False` and `True` specifications.

    `None` and `False` must come back as `None`.  Wherever `True` is given -- for all
    dimensions at once, for a single dimension, or for one side of a single dimension --
    the returned value is compared against the expected extrema computed below, while
    explicitly given numeric bounds must be returned unchanged and all other dimensions
    must remain `None`.
    """
    np.random.seed(12456)
    ndim = 3
    tol = 1e-10

    data = np.random.uniform(-1.0, 1.0, (ndim, 100))
    data[1] *= 2.0

    # Neither `None` nor `False` should produce any reflection specification
    for disabled in (None, False):
        ref = kernels._check_reflect(disabled, data)
        tools.assert_true(ref is None)

    # Expected boundaries: per-dimension (min, max) of the data, scaled by `kale._NUM_PAD`
    extrema = np.asarray([utils.minmax(row) for row in data])
    extrema[:, 0] *= (1 - kale._NUM_PAD)
    extrema[:, 1] *= (1 + kale._NUM_PAD)

    def others_are_none(result, dim):
        """Whether every entry of `result` except index `dim` is `None`."""
        return np.all([result[jj] is None for jj in range(ndim) if jj != dim])

    # If `True` is given, values should be set to extrema
    reflect = kernels._check_reflect(True, data)
    tools.assert_true(np.allclose(reflect, extrema, atol=tol))

    # Fixed (rather than random) bound, well outside of the data range
    val = 12.34

    for dim in range(ndim):
        # `True` given for a single dimension
        spec = [None] * ndim
        spec[dim] = True
        ref = kernels._check_reflect(spec, data)

        tools.assert_true(np.allclose(ref[dim], extrema[dim], atol=tol))
        tools.assert_true(others_are_none(ref, dim))

        # `True` given as the lower or upper bound for a single dimension
        spec = [None] * ndim

        # Fixed upper-value (must be larger than lower-value!)
        spec[dim] = [True, val]
        ref = kernels._check_reflect(spec, data)
        lower, upper = ref[dim][0], ref[dim][1]
        tools.assert_true(np.isclose(lower, extrema[dim][0], atol=tol))
        tools.assert_true(upper == val)
        tools.assert_true(others_are_none(ref, dim))

        # Fixed lower-value (must be less than upper-value!)
        spec[dim] = [-val, True]
        ref = kernels._check_reflect(spec, data)
        lower, upper = ref[dim][0], ref[dim][1]
        tools.assert_true(np.isclose(upper, extrema[dim][1], atol=tol))
        tools.assert_true(lower == -val)
        tools.assert_true(others_are_none(ref, dim))
Example #2
0
def _test_check_reflect(ndim, weights_flag):
    """Check that `kernels._check_reflect` accepts valid and rejects invalid `reflect` values.

    Every combination of the "good" per-dimension entries is passed to `_check_reflect`, and
    the result must have one entry per dimension.  Then, starting from each good combination,
    a single dimension at a time is replaced by a "bad" entry and `_check_reflect` is
    required to raise a `ValueError`.

    Arguments
    ---------
    ndim : int
        Number of data dimensions to generate and test.
    weights_flag : bool
        If true, random weights are generated and passed along with the data.

    Raises
    ------
    ValueError
        If the value returned for a good specification does not have length `ndim`.

    """
    reflects_good = [
        None,
        [-1.0, 1.0],
        [-1.0, None],
        [None, 1.0],
        [None, None],
    ]

    # Malformed entries: wrong lengths, a bare scalar, and a lower bound above the upper bound
    reflects_bad = [
        [None],
        10.0,
        [12.0],
        [None, 1.0, 2.0],
        [2.0, 1.0],
    ]

    ndata = 7

    # Generate data
    data = [np.random.uniform(-1.0, 1.0, ndata) for _ in range(ndim)]

    weights = np.random.uniform(0.0, 1.0, ndata) if weights_flag else None
    nref = len(reflects_good)
    shape = [nref] * ndim
    for inds in np.ndindex(*shape):
        reflect = [reflects_good[ii] for ii in inds]
        print("---------")
        print("inds = {}, reflect = {}".format(inds, reflect))
        # Make sure good values do not raise errors, and return the appropriate object
        ref = kernels._check_reflect(reflect, data, weights=weights)
        if len(ref) != ndim:
            err = "Returned reflect has length {}, but ndim = {}!".format(
                len(ref), ndim)
            raise ValueError(err)

        # Make sure bad values raise errors.  Each bad value is inserted into a fresh copy of
        # the good specification, so that the error can only be caused by the value under
        # test and not by a bad value left over from a previous iteration.
        for jj, bad in enumerate(reflects_bad):
            trial = list(reflect)
            trial[jj % ndim] = bad
            tools.assert_raises(ValueError, kernels._check_reflect,
                                trial, data, weights=weights)
Example #3
0
    def density(self,
                points=None,
                reflect=None,
                params=None,
                grid=False,
                probability=False):
        """Evaluate the KDE distribution at the given data-points.

        This method acts as an API to the `Kernel.density` method for this instance's `kernel`.


        Arguments
        ---------
        points : ([D,]M,) array_like of float, or (D,) set of array_like point specifications
            The locations at which the PDF should be evaluated.  The number of dimensions `D` must
            match that of the `dataset` that initialized this class' instance.
            NOTE: If the `params` kwarg (see below) is given, then only those dimensions of the
            target parameters should be specified in `points`.
            If `None`, this instance's `points` attribute is used (restricted to `params`, if
            given), and the `grid` argument is overridden: a grid is used whenever more than one
            dimension is being evaluated.

            The meaning of `points` depends on the value of the `grid` argument:

            * `grid=True`  : `points` must be a set of (D,) array_like objects which each give the
              evaluation points for the corresponding dimension to produce a grid of values.
              For example, for a 2D dataset,
              `points=([0.1, 0.2, 0.3], [1, 2])`,
              would produce a grid of points with shape (3, 2):
              `[[0.1, 1], [0.1, 2]], [[0.2, 1], [0.2, 2]], [[0.3, 1], [0.3, 2]]`,
              and the returned values would be an array of the same shape (3, 2).

            * `grid=False` : `points` must be an array_like (D,M) describing the position of `M`
              sample points in each of `D` dimensions.
              For example, for a 3D dataset:
              `points=([0.1, 0.2], [1.0, 2.0], [10, 20])`,
              describes 2 sample points at the 3D locations, `(0.1, 1.0, 10)` and `(0.2, 2.0, 20)`,
              and the returned values would be an array of shape (2,).

        reflect : (D,) array_like, None
            Locations at which reflecting boundary conditions should be imposed.
            For each dimension `D` (matching the input data), a pair of boundary locations
            (lower, upper) must be specified, or `None`.  `None` can also be given as one of the
            two locations, to specify no boundary at that location.
            If the data is one-dimensional (D=1), then `reflect` may be shaped as (2,).
            If `None`, this instance's `reflect` attribute is used (restricted to `params`, if
            given).
            See class docstrings:`Reflection` for more information.

        params : int, array_like of int, None
            Only calculate the PDF for certain parameters (dimensions).
            For one-dimensional data only `params=0` is accepted, and is treated as `None`.
            See class docstrings:`Projection` for more information.

        grid : bool,
            Evaluate the KDE distribution at a grid of points specified by `points`.
            See `points` argument description above.

        probability : bool,
            If true, divide the returned values by the number of data points (if this instance
            has no weights) or by the sum of the weights.


        Returns
        -------
        points : array_like of scalar
            Locations at which the PDF is evaluated.  If a scalar `params` or one-dimensional
            `points` were given, only the first (and only) dimension is returned.
        vals : array_like of scalar
            PDF evaluated at the given points

        Raises
        ------
        ValueError
            If `params` is non-zero for one-dimensional data, or if the length of `reflect` does
            not match the number of dimensions being evaluated.

        """
        ndim = self.ndim
        data = self.dataset

        squeeze = False
        if params is not None:
            if (ndim == 1):
                if params == 0:
                    params = None
                else:
                    err = "Cannot specify `params` ('{}') > 0 for 1D data!".format(
                        params)
                    raise ValueError(err)

            if params is not None:
                squeeze = np.isscalar(params)
                params = np.atleast_1d(params)
                # Fall back to the stored reflection, keeping only the requested dimensions
                if reflect is None:
                    reflect = self.reflect
                    if reflect is not None:
                        reflect = [reflect[pp] for pp in params]

                data = data[params, :]

        if (params is None) and (reflect is None):
            reflect = self.reflect

        # Make sure `reflect` shape matches the data
        if (reflect is not None) and (len(reflect) != np.shape(data)[0]):
            # If the data is 1D, and `reflect` is (2,) --- that's okay, convert `reflect` to (1, 2)
            if (ndim == 1) and (np.shape(reflect)
                                == (2, )) and utils.really1d(reflect):
                reflect = np.atleast_2d(reflect)
            else:
                # Build a single message string (not a tuple of fragments) for the exception
                err = "length of `reflect` ({}) does not match `data` ({})!".format(
                    len(reflect), np.shape(data))
                raise ValueError(err)

        if points is None:
            points = self.points
            if params is not None:
                points = [points[pp] for pp in params]
                grid = (len(points) > 1)
            else:
                grid = (self.ndim > 1)
        elif utils.really1d(points):
            points = np.atleast_2d(points)
            squeeze = True

        if grid:
            # Remember the per-dimension specification; it is what gets returned to the caller
            _points = points
            points = utils.meshgrid(*points)
            shape = np.shape(points[0])
            points = [pp.flatten() for pp in points]

        reflect = kernels._check_reflect(reflect, data, weights=self.weights)

        values = self.kernel.density(points,
                                     data,
                                     self.weights,
                                     reflect=reflect,
                                     params=params)

        if probability:
            if self.weights is None:
                values = values / self.ndata
            else:
                values = values / np.sum(self.weights)

        if grid:
            # Restore the grid shape of the values, and the original `points` specification
            values = values.reshape(shape)
            points = _points

        if squeeze:
            points = points[0]
            values = values.squeeze()

        return points, values
Example #4
0
    def __init__(self,
                 dataset,
                 bandwidth=None,
                 weights=None,
                 kernel=None,
                 extrema=None,
                 points=None,
                 reflect=None,
                 covariance=None,
                 neff=None,
                 diagonal=False,
                 helper=True,
                 bw_rescale=None,
                 **kwargs):
        """Initialize the `KDE` class with the given dataset and optional specifications.

        Arguments
        ---------
        dataset : array_like (N,) or (D,N,)
            Dataset from which to construct the kernel-density-estimate.
            For multivariate data with `D` variables and `N` values, the data must be shaped (D,N).
            For univariate (D=1) data, this can be a single array with shape (N,).

        bandwidth : str, float, array of float, None  [optional]
            Specification for the bandwidth, or the method by which the bandwidth should be
            determined.  If a `str` is given, it must match one of the standard bandwidth
            determination methods.  If a `float` is given, it is used as the bandwidth in each
            dimension.  If an array of `float`s are given, then each value will be used as the
            bandwidth for the corresponding data dimension.
            If `None`, the module default (`_BANDWIDTH_DEFAULT`) is used.

        weights : array_like (N,), None  [optional]
            Weights corresponding to each `dataset` point.  Must match the number of points `N` in
            the `dataset`.  Values must be finite and non-negative, and not all zero.
            If `None`, the data points are treated as uniformly weighted.

        kernel : str, Distribution, None  [optional]
            The distribution function that should be used for the kernel.  This can be a `str`
            specification that must match one of the existing distribution functions, or this can
            be a `Distribution` subclass itself that overrides the `_evaluate` method.

        extrema : array_like, None  [optional]
            Extrema to use for the distribution; passed to `utils._parse_extrema` together with
            the effective extrema calculated from the data and bandwidth.
            If `None` and `reflect` is given, a copy of the (validated) `reflect` is used.

        points : array_like, None  [optional]
            Stored as given, and checked for consistency the first time it is used.

        reflect : array_like, None  [optional]
            Reflecting boundary specification; validated against the dataset with
            `kernels._check_reflect`.

        covariance : array_like, None  [optional]
            Covariance matrix of the data.  If `None`, it is calculated from `dataset` using
            `np.cov`, with `weights` as the observation weights.

        neff : int, None  [optional]
            An effective number of datapoints.  This is used in the plugin bandwidth determination
            methods.
            If `None`, `neff` is calculated from the `weights` array.  If `weights` are all
            uniform, then `neff` equals the number of datapoints `N`.

        diagonal : bool,
            Whether the bandwidth/covariance matrix should be set as a diagonal matrix
            (i.e. without covariances between parameters).
            NOTE: see `KDE` docstrings, "Dynamic Range".

        helper : bool,
            Stored on the instance and passed on to the `kernels.Kernel` constructor.

        bw_rescale : object, None  [optional]
            Passed on, together with `bandwidth`, to `self._set_bandwidth`.

        **kwargs : dict
            Additional keyword-arguments passed on to the `kernels.Kernel` constructor.

        Raises
        ------
        ValueError
            If the dataset contains no data points, or if `weights` has the wrong shape or
            contains invalid entries.

        """

        self._squeeze = (np.ndim(dataset) == 1)
        self._dataset = np.atleast_2d(dataset)
        ndim, ndata = self.dataset.shape
        # More dimensions than points usually means the input was given as (N,D) instead of (D,N)
        if ndim > ndata:
            level = logging.WARNING
            if ndim > 10 * ndata:
                level = logging.ERROR
            logging.log(
                level,
                f"data dimension {ndim} > data points {ndata}; should this be transposed?"
            )

        reflect = kernels._check_reflect(reflect, self.dataset)

        self._helper = helper
        self._ndim = ndim
        self._ndata = ndata
        self._diagonal = diagonal
        self._reflect = reflect
        # The first time `points` are used, they need to be 'checked' for consistency
        self._check_points_flag = True
        self._points = points
        if ndata == 0:
            err = "ERROR: no data points provided!  Dataset shape: ({}, {})".format(
                ndim, ndata)
            raise ValueError(err)
        if (ndata < 3) and (bandwidth is None):
            err = "WARNING: very few data points ({}, {}), recommend providing manual `bandwidth`!".format(
                ndim, ndata)
            logging.warning(err)

        # Set `weights`
        # --------------------------------
        weights_uniform = True
        if weights is not None:
            if np.shape(weights) != (ndata, ):
                err = "`weights` input (shape={}) should be shaped as (N,)=({},)!".format(
                    np.shape(weights), ndata)
                raise ValueError(err)

            # Convert to a float array *before* validating: the comparison below is only defined
            # element-wise for arrays, so plain lists/tuples would otherwise raise a `TypeError`.
            weights = np.asarray(weights).astype(float)

            # NOTE: individual zero weights are accepted; only negative, non-finite, or all-zero
            #       weights are rejected.
            invalid = ~np.isfinite(weights) | (weights < 0)
            if np.count_nonzero(weights) == 0 or np.any(invalid):
                raise ValueError(
                    "Invalid `weights` entries, all must be finite and > 0!")

            weights_uniform = False

        if neff is None:
            if weights_uniform:
                neff = ndata
            else:
                neff = np.sum(weights)**2 / np.sum(weights**2)

        self._weights = weights
        self._weights_uniform = weights_uniform  # currently unused
        self._neff = neff

        # Set covariance, bandwidth, distribution and kernel
        # -----------------------------------------------------------
        if covariance is None:
            covariance = np.cov(dataset,
                                rowvar=True,
                                bias=False,
                                aweights=weights)
        self._covariance = np.atleast_2d(covariance)

        if bandwidth is None:
            bandwidth = _BANDWIDTH_DEFAULT

        self._set_bandwidth(bandwidth, bw_rescale)

        # Convert from string, class, etc to a kernel
        dist = kernels.get_distribution_class(kernel)
        self._kernel = kernels.Kernel(distribution=dist,
                                      bandwidth=self._bandwidth,
                                      covariance=self._covariance,
                                      helper=helper,
                                      **kwargs)

        # Get Distribution Extrema
        # ------------------------------------
        # Determine the effective minima / maxima that should be used; KDE generally has support
        #   outside of the data values themselves.

        # If the Kernel is finite, then there is only support out to `bandwidth` beyond datapoints
        if self.kernel.FINITE:
            out = (1.0 + _NUM_PAD)
        # If infinite kernel, how many standard-deviations can we expect values to lie at
        else:
            out = sp.stats.norm.ppf(1.0 - 1.0 / neff)
            # Extra to be double sure...
            out *= 1.2

        # Find the effective-extrema in each dimension, to be used if `extrema` is not specified
        _bandwidth = np.sqrt(self.kernel.matrix.diagonal())
        eff_extrema = [[np.min(dd) - bw * out,
                        np.max(dd) + bw * out]
                       for bw, dd in zip(_bandwidth, self.dataset)]

        # Without explicit `extrema`, fall back to the reflection boundaries (copied, so that
        # later changes to one do not affect the other)
        if (extrema is None) and (reflect is not None):
            extrema = copy.deepcopy(reflect)

        # `eff_extrema` is, by design, outside of data limits, so don't `warn` about limits
        extrema = utils._parse_extrema(eff_extrema, extrema, warn=False)
        self._extrema = extrema

        # Finish Initialization
        # -------------------------------
        self._cdf_grid = None
        self._cdf_func = None

        self._finalize()
        return