def test_check_reflect_boolean():
    """Exercise `kernels._check_reflect` with `None`, `False` and `True` specifications.

    `None` and `False` are expected to come back as `None`.  A `True` entry (for the
    whole dataset, for one dimension, or for one side of one dimension) is expected to
    be replaced by the data extrema scaled by `kale._NUM_PAD`.
    """
    np.random.seed(12456)
    ndim = 3
    data = np.random.uniform(-1.0, 1.0, (ndim, 100))
    data[1] *= 2.0

    def assert_others_unset(result, dim):
        # Every dimension except `dim` must still be `None` in the returned value
        unset = [result[jj] is None for jj in range(ndim) if jj != dim]
        tools.assert_true(np.all(unset))

    # Both 'no reflection' spellings should return `None`
    for disabled in (None, False):
        tools.assert_true(kernels._check_reflect(disabled, data) is None)

    # Reference boundaries: per-dimension (min, max), scaled by the padding factor
    expected = np.asarray([utils.minmax(row) for row in data])
    expected[:, 0] *= (1 - kale._NUM_PAD)
    expected[:, 1] *= (1 + kale._NUM_PAD)

    # A bare `True` should produce the reference boundaries in every dimension
    everywhere = kernels._check_reflect(True, data)
    tools.assert_true(np.allclose(everywhere, expected, atol=1e-10))

    # Explicit bound used next to a `True` placeholder
    val = 12.34

    for dim in range(ndim):
        # `True` for one whole dimension
        spec = [None] * ndim
        spec[dim] = True
        result = kernels._check_reflect(spec, data)
        tools.assert_true(np.allclose(result[dim], expected[dim], atol=1e-10))
        assert_others_unset(result, dim)

        # `True` as the lower bound, with a fixed upper bound (must exceed the lower one)
        spec = [None] * ndim
        spec[dim] = [True, val]
        result = kernels._check_reflect(spec, data)
        tools.assert_true(np.isclose(result[dim][0], expected[dim][0], atol=1e-10))
        tools.assert_true(result[dim][1] == val)
        assert_others_unset(result, dim)

        # `True` as the upper bound, with a fixed lower bound (must be below the upper one)
        spec[dim] = [-val, True]
        result = kernels._check_reflect(spec, data)
        tools.assert_true(np.isclose(result[dim][1], expected[dim][1], atol=1e-10))
        tools.assert_true(result[dim][0] == -val)
        assert_others_unset(result, dim)
def _test_check_reflect(ndim, weights_flag):
    """Check `kernels._check_reflect` against valid and invalid `reflect` specifications.

    Every combination of the 'good' per-dimension specifications is passed through
    `kernels._check_reflect`, and the returned value must have length `ndim`.  Then,
    for each combination, each 'bad' specification is substituted into one dimension
    and the call must raise `ValueError`.

    Arguments
    ---------
    ndim : int
        Number of data dimensions to generate and test.
    weights_flag : bool
        If true, random weights are generated and passed along with the data;
        otherwise `weights` is `None`.

    Raises
    ------
    ValueError
        If the value returned for a good specification does not have length `ndim`.

    """
    reflects_good = [
        None,
        [-1.0, 1.0],
        [-1.0, None],
        [None, 1.0],
        [None, None],
    ]
    reflects_bad = [
        [None],
        10.0,
        [12.0],
        [None, 1.0, 2.0],
        [2.0, 1.0],
    ]

    ndata = 7
    # Generate data
    data = [np.random.uniform(-1.0, 1.0, ndata) for _ in range(ndim)]
    weights = np.random.uniform(0.0, 1.0, ndata) if weights_flag else None

    nref = len(reflects_good)
    shape = [nref] * ndim
    for inds in np.ndindex(*shape):
        reflect = [reflects_good[ii] for ii in inds]
        print("---------")
        print("inds = {}, reflect = {}".format(inds, reflect))

        # Make sure good values do not raise errors, and return the appropriate object
        ref = kernels._check_reflect(reflect, data, weights=weights)
        if len(ref) != ndim:
            err = "Returned reflect has length {}, but ndim = {}!".format(
                len(ref), ndim)
            raise ValueError(err)

        # Make sure bad values raise errors.  Each bad value is placed into a fresh copy
        # of the good specification, so that it is the only invalid entry: reusing one
        # list would leave earlier bad entries in place and mask whether the current
        # one is actually rejected.
        for jj, bad in enumerate(reflects_bad):
            corrupted = list(reflect)
            corrupted[jj % ndim] = bad
            tools.assert_raises(
                ValueError, kernels._check_reflect, corrupted, data, weights)
def density(self, points=None, reflect=None, params=None, grid=False, probability=False):
    """Evaluate the KDE distribution at the given data-points.

    This method acts as an API to the `Kernel.pdf` method for this instance's `kernel`.

    Arguments
    ---------
    points : ([D,]M,) array_like of float, or (D,) set of array_like point specifications
        The locations at which the PDF should be evaluated.  The number of dimensions `D` must
        match that of the `dataset` that initialized this class' instance.
        NOTE: If the `params` kwarg (see below) is given, then only those dimensions of the
        target parameters should be specified in `points`.
        If `points` is `None`, the instance's own `points` are used, and `grid` is then set
        automatically: to whether more than one dimension is being evaluated.

        The meaning of `points` depends on the value of the `grid` argument:

        * `grid=True`  : `points` must be a set of (D,) array_like objects which each give the
          evaluation points for the corresponding dimension to produce a grid of values.
          For example, for a 2D dataset,
          `points=([0.1, 0.2, 0.3], [1, 2])`,
          would produce a grid of points with shape (3, 2):
          `[[0.1, 1], [0.1, 2]], [[0.2, 1], [0.2, 2]], [[0.3, 1], [0.3, 2]]`,
          and the returned values would be an array of the same shape (3, 2).

        * `grid=False` : `points` must be an array_like (D,M) describing the position of `M`
          sample points in each of `D` dimensions.
          For example, for a 3D dataset:
          `points=([0.1, 0.2], [1.0, 2.0], [10, 20])`,
          describes 2 sample points at the 3D locations, `(0.1, 1.0, 10)` and `(0.2, 2.0, 20)`,
          and the returned values would be an array of shape (2,).

    reflect : (D,) array_like, None
        Locations at which reflecting boundary conditions should be imposed.
        For each dimension `D` (matching the input data), a pair of boundary locations
        (lower, upper) must be specified, or `None`.  `None` can also be given as one of the
        two locations, to specify no boundary at that location.  If the data is
        one-dimensional (D=1), then `reflect` may be shaped as (2,).
        If `None`, the instance's own `reflect` value is used.
        See class docstrings:`Reflection` for more information.

    params : int, array_like of int, None
        Only calculate the PDF for certain parameters (dimensions).  For 1D data the only
        accepted value is `0`, which is treated the same as `None`.
        See class docstrings:`Projection` for more information.

    grid : bool,
        Evaluate the KDE distribution at a grid of points specified by `points`.
        See `points` argument description above.

    probability : bool,
        If true, divide the returned values by the number of data points, or by the sum of
        the weights when this instance has weights.

    Returns
    -------
    points : array_like of scalar
        Locations at which the PDF is evaluated.
    vals : array_like of scalar
        PDF evaluated at the given points

    Raises
    ------
    ValueError
        If `params` is anything other than `0` for 1D data, or if the length of `reflect`
        does not match the number of dimensions being evaluated.

    """
    ndim = self.ndim
    data = self.dataset
    squeeze = False

    # For 1D data the only meaningful parameter is 0, i.e. no selection at all
    if (params is not None) and (ndim == 1):
        if params == 0:
            params = None
        else:
            err = "Cannot specify `params` ('{}') > 0 for 1D data!".format(
                params)
            raise ValueError(err)

    if params is not None:
        # A scalar `params` selects a single dimension, so the output is squeezed at the end
        squeeze = np.isscalar(params)
        params = np.atleast_1d(params)
        if reflect is None:
            reflect = self.reflect
        # Select the boundaries and the data of the target dimensions only
        if reflect is not None:
            reflect = [reflect[pp] for pp in params]
        data = data[params, :]
    elif reflect is None:
        reflect = self.reflect

    # Make sure `reflect` shape matches the data
    if (reflect is not None) and (len(reflect) != np.shape(data)[0]):
        # If the data is 1D, and `reflect` is (2,) --- that's okay, convert `reflect` to (1, 2)
        if (ndim == 1) and (np.shape(reflect) == (2, )) and utils.really1d(reflect):
            reflect = np.atleast_2d(reflect)
        else:
            # Build a single message string; passing a tuple of fragments would make the
            # exception carry a tuple instead of readable text
            err = "length of `reflect` ({}) does not match `data` ({})!".format(
                len(reflect), np.shape(data))
            raise ValueError(err)

    if points is None:
        # Fall back to the stored points; whether to grid them follows from the number of
        # dimensions being evaluated
        points = self.points
        if params is not None:
            points = [points[pp] for pp in params]
            grid = (len(points) > 1)
        else:
            grid = (self.ndim > 1)
    elif utils.really1d(points):
        points = np.atleast_2d(points)
        squeeze = True

    if grid:
        # Keep the per-dimension edges to return, and evaluate on the flattened mesh
        _points = points
        points = utils.meshgrid(*points)
        shape = np.shape(points[0])
        points = [pp.flatten() for pp in points]

    reflect = kernels._check_reflect(reflect, data, weights=self.weights)
    values = self.kernel.density(
        points, data, self.weights, reflect=reflect, params=params)

    if probability:
        if self.weights is None:
            values = values / self.ndata
        else:
            values = values / np.sum(self.weights)

    if grid:
        # Restore the grid shape, and hand back the original per-dimension edges
        values = values.reshape(shape)
        points = _points

    if squeeze:
        points = points[0]
        values = values.squeeze()

    return points, values
def __init__(self, dataset, bandwidth=None, weights=None, kernel=None,
             extrema=None, points=None, reflect=None, covariance=None, neff=None,
             diagonal=False, helper=True, bw_rescale=None, **kwargs):
    """Initialize the `KDE` class with the given dataset and optional specifications.

    Arguments
    ---------
    dataset : array_like (N,) or (D,N,)
        Dataset from which to construct the kernel-density-estimate.
        For multivariate data with `D` variables and `N` values, the data must be shaped (D,N).
        For univariate (D=1) data, this can be a single array with shape (N,).

    bandwidth : str, float, array of float, None  [optional]
        Specification for the bandwidth, or the method by which the bandwidth should be
        determined.  If a `str` is given, it must match one of the standard bandwidth
        determination methods.  If a `float` is given, it is used as the bandwidth in each
        dimension.  If an array of `float`s are given, then each value will be used as the
        bandwidth for the corresponding data dimension.

    weights : array_like (N,), None  [optional]
        Weights corresponding to each `dataset` point.  Must match the number of points `N` in
        the `dataset`.
        If `None`, weights are uniformly set to 1.0 for each value.

    kernel : str, Distribution, None  [optional]
        The distribution function that should be used for the kernel.  This can be a `str`
        specification that must match one of the existing distribution functions, or this can
        be a `Distribution` subclass itself that overrides the `_evaluate` method.

    extrema : array_like, None  [optional]
        Extrema specification, combined with the data-derived effective extrema by
        `utils._parse_extrema`.  If `None` and `reflect` is given, a copy of the (checked)
        `reflect` boundaries is used instead.

    points : array_like, None  [optional]
        Stored on the instance as given; checked for consistency the first time it is used.

    reflect : (D,) array_like, None  [optional]
        Reflecting-boundary specification, validated against the dataset with
        `kernels._check_reflect`.

    covariance : array_like, None  [optional]
        Covariance of the data.  If `None`, it is calculated from `dataset` using `np.cov`
        (with `weights` as `aweights`).

    neff : int, None  [optional]
        An effective number of datapoints.  This is used in the plugin bandwidth determination
        methods.
        If `None`, `neff` is calculated from the `weights` array.  If `weights` are all
        uniform, then `neff` equals the number of datapoints `N`.

    diagonal : bool,
        Whether the bandwidth/covariance matrix should be set as a diagonal matrix
        (i.e. without covariances between parameters).
        NOTE: see `KDE` docstrings, "Dynamic Range".

    helper : bool,
        Stored on the instance and forwarded to `kernels.Kernel`.

    bw_rescale : None or other  [optional]
        Forwarded, together with `bandwidth`, to `self._set_bandwidth`.

    **kwargs
        Additional keyword arguments forwarded to `kernels.Kernel`.

    Raises
    ------
    ValueError
        If the dataset contains no points, or if `weights` has the wrong shape or contains
        invalid entries (non-finite, negative, or all zero).

    """
    self._squeeze = (np.ndim(dataset) == 1)
    self._dataset = np.atleast_2d(dataset)
    ndim, ndata = self.dataset.shape
    # More dimensions than points usually means the input was given as (N, D) by mistake
    if ndim > ndata:
        level = logging.WARNING
        if ndim > 10 * ndata:
            level = logging.ERROR
        logging.log(
            level,
            f"data dimension {ndim} > data points {ndata}; should this be transposed?"
        )

    reflect = kernels._check_reflect(reflect, self.dataset)

    self._helper = helper
    self._ndim = ndim
    self._ndata = ndata
    self._diagonal = diagonal
    self._reflect = reflect
    # The first time `points` are used, they need to be 'checked' for consistency
    self._check_points_flag = True
    self._points = points

    if ndata == 0:
        err = "ERROR: no data points provided!  Dataset shape: ({}, {})".format(
            ndim, ndata)
        raise ValueError(err)

    if (ndata < 3) and (bandwidth is None):
        err = "WARNING: very few data points ({}, {}), recommend providing manual `bandwidth`!".format(
            ndim, ndata)
        logging.warning(err)

    # Set `weights`
    # --------------------------------
    weights_uniform = True
    if weights is not None:
        if np.shape(weights) != (ndata, ):
            err = "`weights` input (shape={}) should be shaped as (N,)=({},)!".format(
                np.shape(weights), ndata)
            raise ValueError(err)

        # Convert to a float array *before* validating the values: the element-wise
        # comparison below is not defined for plain sequences such as lists.
        weights = np.asarray(weights).astype(float)
        # NOTE: individual zero weights pass this check; only all-zero input is rejected
        if np.count_nonzero(weights) == 0 or np.any(~np.isfinite(weights) | (weights < 0)):
            raise ValueError(
                "Invalid `weights` entries, all must be finite and > 0!")

        weights_uniform = False

    if neff is None:
        if weights_uniform:
            neff = ndata
        else:
            neff = np.sum(weights)**2 / np.sum(weights**2)

    self._weights = weights
    self._weights_uniform = weights_uniform  # currently unused
    self._neff = neff

    # Set covariance, bandwidth, distribution and kernel
    # -----------------------------------------------------------
    if covariance is None:
        covariance = np.cov(dataset, rowvar=True, bias=False, aweights=weights)
    self._covariance = np.atleast_2d(covariance)

    if bandwidth is None:
        bandwidth = _BANDWIDTH_DEFAULT
    self._set_bandwidth(bandwidth, bw_rescale)

    # Convert from string, class, etc to a kernel
    dist = kernels.get_distribution_class(kernel)
    self._kernel = kernels.Kernel(
        distribution=dist, bandwidth=self._bandwidth, covariance=self._covariance,
        helper=helper, **kwargs)

    # Get Distribution Extrema
    # ------------------------------------
    # Determine the effective minima / maxima that should be used; KDE generally has support
    #    outside of the data values themselves.

    # If the Kernel is finite, then there is only support out to `bandwidth` beyond datapoints
    if self.kernel.FINITE:
        out = (1.0 + _NUM_PAD)
    # If infinite kernel, how many standard-deviations can we expect values to lie at
    else:
        out = sp.stats.norm.ppf(1.0 - 1.0 / neff)
        # Extra to be double sure...
        out *= 1.2

    # Find the effective-extrema in each dimension, to be used if `extrema` is not specified
    _bandwidth = np.sqrt(self.kernel.matrix.diagonal())
    eff_extrema = [[np.min(dd) - bw * out, np.max(dd) + bw * out]
                   for bw, dd in zip(_bandwidth, self.dataset)]

    if (extrema is None) and (reflect is not None):
        extrema = copy.deepcopy(reflect)

    # `eff_extrema` is, by design, outside of data limits, so don't `warn` about limits
    extrema = utils._parse_extrema(eff_extrema, extrema, warn=False)
    self._extrema = extrema

    # Finish Initialization
    # -------------------------------
    self._cdf_grid = None
    self._cdf_func = None
    self._finalize()
    return