Example #1
    def test_midpoints_log(self):
        print("\n|Test_Utils:test_midpoints_log()|")
        test = [[1e0, 1e1, 1e2, 1e3], [1e2, 1e3, 1e4, 1e5]]

        aa = np.sqrt(10.0)

        truth = [[1e1, 1e2, 1e3, 1e4],
                 [[aa * 1e0, aa * 1e1, aa * 1e2],
                  [aa * 1e2, aa * 1e3, aa * 1e4]]]

        for ii, tr in enumerate(truth):
            vals = utils.midpoints(test, 'log', axis=ii)
            assert_true(np.all(np.shape(tr) == np.shape(vals)))
            assert_true(np.allclose(tr, vals))

        shp = (4, 5)
        test_log = np.random.uniform(-2.0, 2.0, np.prod(shp)).reshape(shp)
        test_lin = 10**test_log
        for ii in range(2):
            # Make sure `midpoints` gives consistent results itself
            vals_log = utils.midpoints(test_log, 'lin', axis=ii)
            vals_lin = utils.midpoints(test_lin, 'log', axis=ii)
            assert_true(np.all(np.shape(vals_log) == np.shape(vals_lin)))
            assert_true(np.allclose(10**vals_log, vals_lin))

            # Compare log-midpoint to known values
            temp = np.moveaxis(test_lin, ii, 0)
            temp = np.log10(temp)
            true = temp[:-1, :] + 0.5 * np.diff(temp, axis=0)
            true = np.moveaxis(true, 0, ii)
            true = 10**true
            assert_true(np.all(np.shape(true) == np.shape(vals_lin)))
            assert_true(np.allclose(true, vals_lin))

        return
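The 'log' midpoints tested above are geometric means of adjacent values. A minimal numpy-only sketch of that reference computation (illustrative; this is not the kalepy implementation, just the quantity the test checks against):

import numpy as np

def geometric_midpoints(arr, axis=0):
    # Geometric mean of adjacent elements along `axis`:
    # equivalent to 10**(linear midpoints of log10(arr)).
    arr = np.moveaxis(np.asarray(arr, dtype=float), axis, 0)
    mids = np.sqrt(arr[:-1] * arr[1:])
    return np.moveaxis(mids, 0, axis)

# e.g. geometric_midpoints([1e0, 1e1, 1e2, 1e3]) gives [sqrt(10)*1e0, sqrt(10)*1e1, sqrt(10)*1e2]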
Example #2
    def test_reflect_2d(self):
        print("\n|Test_KDE_Resample:test_reflect_2d()|")

        seed = np.random.randint(int(1e4))
        # Override the random draw with a fixed seed for reproducibility
        seed = 8067
        print(seed)
        np.random.seed(seed)
        NUM = 2000
        xx = np.random.uniform(0.0, 2.0, NUM)
        yy = np.random.normal(1.0, 1.5, NUM)
        yy = yy[yy < 2.0]
        yy = np.concatenate([yy, np.random.choice(yy, NUM - yy.size)])

        data = [xx, yy]
        edges = [utils.spacing(aa, 'lin', 30) for aa in [xx, yy]]
        egrid = [utils.spacing(ee, 'lin', 100, stretch=0.5) for ee in edges]
        cgrid = [utils.midpoints(ee, 'lin') for ee in egrid]
        # width = [np.diff(ee) for ee in egrid]

        xc, yc = np.meshgrid(*cgrid, indexing='ij')

        # grid = np.vstack([xc.ravel(), yc.ravel()])

        hist, *_ = np.histogram2d(*data, bins=egrid, density=True)

        kde = kale.KDE(data)

        reflections = [[[0.0, 2.0], [None, 2.0]], [[0.0, 2.0], None],
                       [None, [None, 2.0]], None]
        for jj, reflect in enumerate(reflections):
            samps_ref = kde.resample(reflect=reflect)
            samps_nrm = kde.resample()

            if reflect is None:
                continue

            for ii, ref in enumerate(reflect):
                if ref is None:
                    continue
                if ref[0] is None:
                    ref[0] = -np.inf
                if ref[1] is None:
                    ref[1] = np.inf

                print(jj, ii, ref)
                for kk, zz in enumerate([samps_nrm[ii], samps_ref[ii]]):
                    inside = (ref[0] < zz) & (zz < ref[1])
                    outside = ((zz < ref[0]) | (ref[1] < zz))

                    print("\tin : ", kk, np.all(inside), np.any(inside))
                    print("\tout: ", kk, np.all(outside), np.any(outside))

                    if kk == 0:
                        assert_false(np.all(inside))
                        assert_true(np.any(outside))
                    else:
                        assert_true(np.all(inside))
                        assert_false(np.any(outside))

        return
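The `reflect` argument above is a per-dimension list of [lower, upper] bounds, where None disables reflection for a side (or for a whole dimension). A minimal sketch of the call pattern, assuming only the `kale.KDE(...).resample(reflect=...)` interface exercised in this test (the data here are hypothetical):

import numpy as np
import kalepy as kale

# 2D data with known boundaries: x in [0, 2], y crudely truncated at 2 (illustration only)
xx = np.random.uniform(0.0, 2.0, 1000)
yy = np.clip(np.random.normal(1.0, 1.5, 1000), None, 2.0)

kde = kale.KDE([xx, yy])
# Reflect x about both 0.0 and 2.0; reflect y only about its upper bound
samples = kde.resample(reflect=[[0.0, 2.0], [None, 2.0]])
print(samples.shape)   # (2, N): one row per dimension, as indexed in the test above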
Example #3
    def test_different_bws(self):
        print("\n|Test_KDE_Resample:test_different_bws()|")
        np.random.seed(9235)
        NUM = 1000
        a1 = np.random.normal(6.0, 1.0, NUM // 2)
        a2 = np.random.lognormal(0, 0.5, size=NUM // 2)
        aa = np.concatenate([a1, a2])

        bb = np.random.normal(3.0, 0.02, NUM) + aa / 100

        data = [aa, bb]
        edges = [utils.spacing(dd, 'lin', 100, stretch=1.0) for dd in data]
        cents = [utils.midpoints(ee, 'lin') for ee in edges]

        xe, ye = np.meshgrid(*edges, indexing='ij')
        xc, yc = np.meshgrid(*cents, indexing='ij')

        bws = [0.5, 2.0]
        kde2d = kale.KDE(data, bandwidth=bws)
        kde1d = [kale.KDE(dd, bandwidth=ss) for dd, ss in zip(data, bws)]

        for ii in range(2):
            samp_1d = kde1d[ii].resample(NUM).squeeze()
            samp_2d = kde2d.resample(NUM)[ii]

            # Make sure the two distributions resemble each other
            ks, pv = sp.stats.ks_2samp(samp_1d, samp_2d)
            # Calibrated to the above seed-value of `9235`
            print("{}, pv = {}".format(ii, pv))
            assert_true(pv > 0.05)

        return
Example #4
    def pdf_params_fixed_bandwidth(self, kernel):
        print("\n|Test_KDE_PDF:pdf_params_fixed_bandwidth()|")
        np.random.seed(124)

        NUM = 1000
        bandwidth = 0.02

        sigma = [2.5, 1.5]
        corr = 0.9

        s2 = np.square(sigma)
        cc = corr * sigma[0] * sigma[1]
        cov = [[s2[0], cc], [cc, s2[1]]]
        cov = np.array(cov)

        data = np.random.multivariate_normal([1.0, 2.0], cov, NUM).T

        sigma = [2.5, 0.5]
        corr = 0.0

        s2 = np.square(sigma)
        cc = corr * sigma[0] * sigma[1]
        cov = [[s2[0], cc], [cc, s2[1]]]
        cov = np.array(cov)
        more = np.random.multivariate_normal([1.0, 6.0], cov, NUM).T
        data = np.concatenate([data, more], axis=-1)

        kde = kale.KDE(data, bandwidth=bandwidth, kernel=kernel)

        edges = [utils.spacing(dd, 'lin', 200, stretch=0.1) for dd in data]
        cents = [utils.midpoints(ee, 'lin') for ee in edges]
        widths = [np.diff(ee) for ee in edges]
        # area = widths[0][:, np.newaxis] * widths[1][np.newaxis, :]

        xe, ye = np.meshgrid(*edges, indexing='ij')
        xc, yc = np.meshgrid(*cents, indexing='ij')
        # grid = np.vstack([xc.ravel(), yc.ravel()])

        hist, *_ = np.histogram2d(*data, bins=edges, density=True)

        for par in range(2):
            xx = cents[par]
            pdf_2d = kde.density(xx, params=par, probability=True)[1]
            kde_1d = kale.KDE(data[par, :], bandwidth=bandwidth, kernel=kernel)
            pdf_1d = kde_1d.density(xx, probability=True)[1]
            # print("matrix : ", kde.bandwidth.matrix, kde_1d.bandwidth.matrix)
            print(f"pdf_1d = {utils.stats_str(pdf_1d)}")
            print(f"pdf_2d = {utils.stats_str(pdf_2d)}")
            assert_true(np.allclose(pdf_2d, pdf_1d, rtol=1e-3))

            for pdf, ls, lw in zip([pdf_2d, pdf_1d], ['-', '--'], [1.5, 3.0]):

                tot = np.sum(pdf * widths[par])
                print("tot = {:.4e}".format(tot))
                assert_true(np.isclose(tot, 1.0, rtol=2e-2))
                vals = [xx, pdf]
                if par == 1:
                    vals = vals[::-1]

        return
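The `params` keyword used above selects a single parameter (dimension), so `density` returns that marginal distribution. A short sketch of the same call on hypothetical data, using only the interface exercised in this test:

import numpy as np
import kalepy as kale

cov = [[1.0, 0.5], [0.5, 1.0]]
data = np.random.multivariate_normal([1.0, 2.0], cov, 1000).T   # shape (2, N)
kde = kale.KDE(data)

xx = np.linspace(-3.0, 5.0, 200)
# Marginal PDF of the first parameter; `density` returns (points, values)
pdf_x = kde.density(xx, params=0, probability=True)[1]
print(np.sum(pdf_x) * (xx[1] - xx[0]))   # should be close to 1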
Example #5
    def compare_scipy_2d(self, kernel):
        print("\n|Test_KDE_PDF:test_compare_scipy_2d()|")

        NUM = 1000
        a1 = np.random.normal(6.0, 1.0, NUM // 2)
        a2 = np.random.lognormal(0, 0.5, size=NUM // 2)
        aa = np.concatenate([a1, a2])

        bb = np.random.normal(3.0, 0.02, NUM) + aa / 100

        data = [aa, bb]
        edges = [utils.spacing(dd, 'lin', 30, stretch=0.5) for dd in data]
        cents = [utils.midpoints(ee, 'lin') for ee in edges]

        xe, ye = np.meshgrid(*edges, indexing='ij')
        xc, yc = np.meshgrid(*cents, indexing='ij')
        grid = np.vstack([xc.ravel(), yc.ravel()])

        methods = ['scott', 0.04, 0.2, 0.8]
        # classes = [sp.stats.gaussian_kde, kale.KDE]
        classes = [lambda xx, bw: sp.stats.gaussian_kde(xx, bw_method=bw),
                   lambda xx, bw: kale.KDE(xx, bandwidth=bw, kernel=kernel)]
        for mm in methods:
            kdes_list = []
            for cc in classes:
                try:
                    test = cc(data, mm).density(grid, probability=True)[1].reshape(xc.shape).T
                except AttributeError:
                    test = cc(data, mm).pdf(grid).reshape(xc.shape).T

                kdes_list.append(test)

            assert_true(np.allclose(kdes_list[0], kdes_list[1]))

        return
Example #6
    def reflect_2d(self, kernel):
        print("\n|Test_KDE_PDF:test_reflect_2d()|")
        np.random.seed(124)
        NUM = 1000
        xx = np.random.uniform(0.0, 2.0, NUM)
        yy = np.random.normal(1.0, 1.0, NUM)
        yy = yy[yy < 2.0]
        yy = np.concatenate([yy, np.random.choice(yy, NUM-yy.size)])

        data = [xx, yy]
        edges = [utils.spacing(aa, 'lin', 30) for aa in [xx, yy]]
        egrid = [utils.spacing(ee, 'lin', 100, stretch=0.5) for ee in edges]
        cgrid = [utils.midpoints(ee, 'lin') for ee in egrid]
        width = [np.diff(ee) for ee in egrid]

        xc, yc = np.meshgrid(*cgrid, indexing='ij')

        grid = np.vstack([xc.ravel(), yc.ravel()])

        hist, *_ = np.histogram2d(*data, bins=egrid, density=True)

        kde = kale.KDE(data, kernel=kernel)
        inside_test_func = np.all if kernel._FINITE == 'infinite' else np.any

        reflections = [
            [[0.0, 2.0], [None, 2.0]],
            [[0.0, 2.0], None],
            [None, [None, 2.0]],
            None
        ]
        for jj, reflect in enumerate(reflections):
            pdf_1d = kde.density(grid, reflect=reflect, probability=True)[1]
            pdf = pdf_1d.reshape(hist.shape)

            inside = np.ones_like(pdf_1d, dtype=bool)
            if reflect is None:
                outside = np.zeros_like(pdf_1d, dtype=bool)
            else:
                outside = np.ones_like(pdf_1d, dtype=bool)
                for ii, ref in enumerate(reflect):
                    if ref is None:
                        ref = [-np.inf, np.inf]
                    if ref[0] is None:
                        ref[0] = -np.inf
                    if ref[1] is None:
                        ref[1] = np.inf
                    inside = inside & (ref[0] < grid[ii]) & (grid[ii] < ref[1])
                    outside = outside & ((grid[ii] < ref[0]) | (ref[1] < grid[ii]))

            assert_true(inside_test_func(pdf_1d[inside] > 0.0))
            assert_true(np.allclose(pdf_1d[outside], 0.0))

            area = width[0][:, np.newaxis] * width[1][np.newaxis, :]
            prob_tot = np.sum(pdf * area)
            print(jj, reflect, "prob_tot = {:.4e}".format(prob_tot))
            assert_true(np.isclose(prob_tot, 1.0, rtol=3e-2))

        return
Example #7
    def test_midpoints_lin(self):
        print("\n|Test_Utils:test_midpoints_lin()|")
        test = [[0, 1, 2, 3], [2, 3, 4, 5]]

        truth = [[1, 2, 3, 4], [[0.5, 1.5, 2.5], [2.5, 3.5, 4.5]]]

        for ii, tr in enumerate(truth):
            vals = utils.midpoints(test, 'lin', axis=ii)
            assert_true(np.all(np.shape(tr) == np.shape(vals)))
            assert_true(np.all(tr == vals))

        shp = (4, 5)
        test = np.random.uniform(-1.0, 1.0, np.prod(shp)).reshape(shp)
        for ii in range(2):
            vals = utils.midpoints(test, 'lin', axis=ii)

            temp = np.moveaxis(test, ii, 0)
            true = temp[:-1, :] + 0.5 * np.diff(temp, axis=0)
            true = np.moveaxis(true, 0, ii)
            assert_true(np.all(np.shape(true) == np.shape(vals)))
            assert_true(np.allclose(true, vals))

        return
Example #8
    def __init__(self, edges, dens, threshold=10.0, **kwargs):
        super().__init__(edges, dens, **kwargs)

        # Note: `dens` has already been converted from density to mass (i.e. integrating each cell)
        #       this happened in `Sample_Grid.__init__()` ==> `Sample_Outliers._init_data()`
        #       `data_edge` is still a density (at the corners of each cell)
        mass_outs = np.copy(self._mass)

        # We're only going to stochastically sample from bins below the threshold value
        #     recalc `csum` zeroing out the values above threshold
        outs = (mass_outs > threshold)
        # print(f"Outside: {np.count_nonzero(outs)/outs.size:.4f}")
        # print(f"Inside : {np.count_nonzero(~outs)/outs.size:.4f}")
        mass_outs[outs] = 0.0
        idx, csum = _data_to_cumulative(mass_outs, prefilter=False)
        self._idx = idx
        self._csum = csum

        # We'll manually sample bins above threshold, so store those for later
        mass_ins = np.copy(self._mass)
        mass_ins[~outs] = 0.0

        # Find the center-of-mass of each cell (based on density corner values)
        coms = self.grid
        dens_edge = self._dens
        dens_cent = utils.midpoints(dens_edge, log=False, axis=None)
        coms = [
            utils.midpoints(dens_edge * ll, log=False, axis=None) / dens_cent
            for ll in coms
        ]

        self._threshold = threshold
        self._mass_ins = mass_ins
        self._coms_ins = coms
        self._mass_outs = mass_outs
        return
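The center-of-mass step above weights each cell's edge coordinates by the corner density values: `midpoints(dens * x) / midpoints(dens)`. For a single 1D cell with edges x0, x1 and corner densities d0, d1 this reduces to (d0*x0 + d1*x1) / (d0 + d1). A minimal numpy sketch of the 1D case (illustrative only; the class does this over all grid dimensions at once via `utils.midpoints`):

import numpy as np

x_edges = np.array([0.0, 1.0, 2.0, 3.0])     # cell edges
dens_edge = np.array([1.0, 3.0, 3.0, 1.0])   # density at each edge

dens_cent = 0.5 * (dens_edge[:-1] + dens_edge[1:])
coms = 0.5 * (dens_edge[:-1] * x_edges[:-1] + dens_edge[1:] * x_edges[1:]) / dens_cent
print(coms)   # first cell: (1*0 + 3*1) / (1 + 3) = 0.75, pulled toward the denser edge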
Example #9
    def test_midpoints_axes(self):
        print("\n|Test_Utils:test_midpoints_axes()|")
        # NUM = 100

        shp = (12, 14, 16)
        test = np.ones(shp)
        for ii in range(test.ndim):
            vals = utils.midpoints(test, 'lin', axis=ii)
            new_shape = np.array(shp)
            new_shape[ii] -= 1
            assert_true(np.all(vals.shape == new_shape))
            assert_true(np.all(vals == 1.0))

            vals = utils.midpoints(test, 'log', axis=ii)
            new_shape = np.array(shp)
            new_shape[ii] -= 1
            assert_true(np.all(vals.shape == new_shape))
            assert_true(np.all(vals == 1.0))

        test = np.arange(10)
        vals = utils.midpoints(test, 'lin')
        true = 0.5 * (test[:-1] + test[1:])
        assert_true(np.allclose(vals, true))
        return
Example #10
    def reflect_1d(self, kernel):
        print("\n|Test_KDE_PDF:reflect_1d()|")

        np.random.seed(124)
        NUM = 1000
        EXTR = [0.0, 2.0]
        aa = np.random.uniform(*EXTR, NUM)

        egrid = utils.spacing(aa, 'lin', 2000, stretch=0.5)
        cgrid = utils.midpoints(egrid, 'lin')
        delta = np.diff(egrid)

        boundaries = [None, EXTR]
        for bnd in boundaries:
            kde = kale.KDE(aa, kernel=kernel)
            pdf = kde.density(cgrid, reflect=bnd, probability=True)[1]

            # If the kernel's support is infinite, then all points outside of boundaries should be
            # nonzero; if it's finite-supported, then only some of them (near edges) will be
            outside_test_func = np.all if kernel._FINITE == 'infinite' else np.any

            # Make sure unitarity is preserved
            tot = np.sum(pdf * delta)
            print("Boundary '{}', total = {:.4e}".format(bnd, tot))
            assert_true(np.isclose(tot, 1.0, rtol=1e-3))

            ratio_extr = np.max(pdf) / np.min(pdf[pdf > 0])
            # No reflection, then non-zero PDF everywhere, and large ratio of extrema
            if bnd is None:
                assert_true(outside_test_func(pdf[cgrid < EXTR[0]] > 0.0))
                assert_true(outside_test_func(pdf[cgrid > EXTR[1]] > 0.0))
                assert_true(ratio_extr > 10.0)
            # No lower-reflection, nonzero values below 0.0
            elif bnd[0] is None:
                assert_true(outside_test_func(pdf[cgrid < EXTR[0]] > 0.0))
                assert_true(np.all(pdf[cgrid > EXTR[1]] == 0.0))
            # No upper-reflection, nonzero values above 2.0
            elif bnd[1] is None:
                assert_true(np.all(pdf[cgrid < EXTR[0]] == 0.0))
                assert_true(outside_test_func(pdf[cgrid > EXTR[1]] > 0.0))
            else:
                assert_true(np.all(pdf[cgrid < EXTR[0]] == 0.0))
                assert_true(np.all(pdf[cgrid > EXTR[1]] == 0.0))
                assert_true(ratio_extr < 2.0)

        return
Example #11
def _grad_along(data_edge, dim):
    grad = np.diff(data_edge, axis=dim)
    nums = list(np.arange(grad.ndim))
    nums.pop(dim)
    grad = utils.midpoints(grad, log=False, axis=nums)
    return grad
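`_grad_along` takes a finite difference along one dimension and then midpoints along every other dimension, so the result lives at cell centers and each axis shrinks by one. A numpy-only sketch of the same shape logic for `dim=1` on a hypothetical (4, 5, 6) corner grid, with the 'lin' midpoints written out by hand:

import numpy as np

edge_vals = np.random.uniform(0.0, 1.0, (4, 5, 6))   # values at grid corners
grad = np.diff(edge_vals, axis=1)                     # (4, 4, 6): difference along dim=1
grad = 0.5 * (grad[:-1] + grad[1:])                   # midpoints along axis 0 -> (3, 4, 6)
grad = 0.5 * (grad[..., :-1] + grad[..., 1:])         # midpoints along axis 2 -> (3, 4, 5)
print(grad.shape)   # every axis is one smaller than the corner grid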
Example #12
    def sample(self, nsamp=None, interpolate=True, return_scalar=None):
        """Sample from the probability distribution.

        Arguments
        ---------
        nsamp : scalar or None
            Number of samples to draw; if None, the total of `self._mass` is used.
        interpolate : bool
            If True, interpolate sample positions within bins using the local density gradient.
        return_scalar : bool
            If True, also return the scalar value associated with each sample.

        Returns
        -------
        vals : (D, N) ndarray of scalar

        """
        dens = self._dens
        scalar_dens = self._scalar_dens
        edges = self._edges
        ndim = self._ndim

        # ---- initialize parameters
        if interpolate and (dens is None):
            logging.info("`dens` is None, cannot interpolate sampling")
            interpolate = False

        # If no number of samples is given, assume that `self._mass` is in units of number-of-samples,
        # and use its total as the number of samples
        if nsamp is None:
            nsamp = self._mass.sum()
        nsamp = int(nsamp)

        if return_scalar is None:
            return_scalar = (scalar_dens is not None)
        elif return_scalar and (scalar_dens is None):
            return_scalar = False
            logging.warning(
                "WARNING: no `scalar` initialized, but `return_scalar`=True!")

        # ---- Get generalized sampling locations

        # Choose random bins, proportionally to `mass`, and positions within bins (uniformly distributed)
        #     `bin_numbers_flat` (N,) are the flat indices of the chosen bin for each sample
        #     `intrabin_locs` (D, N) are positions in [0.0, 1.0] within each bin, for each sample in each dimension
        bin_numbers_flat, intrabin_locs = self._random_bins(nsamp)
        # Convert from flat (N,) indices into ND indices;  (D, N) for D dimensions, N samples (`nsamp`)
        bin_numbers = np.unravel_index(bin_numbers_flat, self._shape_bins)

        # If scalars are also being sampled: find the scalar value at each bin center (i.e. the bin average)
        #     these will be updated/improved below if `interpolate=True`
        if return_scalar:
            scalar_mass = self._scalar_mass
            scalar_values = scalar_mass[bin_numbers]

        # ---- Place samples in each dimension

        vals = np.zeros_like(intrabin_locs)
        for dim, (edge, bidx) in enumerate(zip(edges, bin_numbers)):
            # Width of bins in this dimension
            wid = np.diff(edge)

            # Random location, in this dimension, for each bin. Relative position, i.e. between [0.0, 1.0]
            loc = intrabin_locs[dim]

            # Uniform / no-interpolation :: random-uniform within each bin
            if (not interpolate):
                vals[dim, :] = edge[bidx] + wid[bidx] * loc

            # Interpolated :: random-linear proportional to bin gradients (i.e. slope across bin in each dimension)
            else:
                # Calculate the normalization for gradients; this must be done separately for each dimension
                #    so that the PDF integrates to unity along each dimension
                norm = utils.trapz_dens_to_mass(dens, edges, axis=dim)
                others = np.arange(ndim).tolist()
                others.pop(dim)
                norm = utils.midpoints(norm, axis=others)

                edge = np.asarray(edge)

                # Find the gradient along this dimension (using center-values in other dimensions)
                grad = _grad_along(dens, dim) / norm
                # get the gradient for each sample
                grad = grad.flat[bin_numbers_flat] * wid[bidx]
                # interpolate edge values in this dimension (returns values [0.0, 1.0])
                temp = _intrabin_linear_interp(loc, grad)
                # convert from intrabin positions to overall positions by linearly rescaling
                vals[dim, :] = edge[bidx] + temp * wid[bidx]

            # interpolate scalar values also
            if return_scalar and interpolate:
                grad = _grad_along(scalar_dens, dim)
                grad = grad.flat[bin_numbers_flat]
                # shift `loc` (location within bin) to center point
                scalar_values += grad * (loc - 0.5)

        if return_scalar:
            return vals, scalar_values

        return vals
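For the non-interpolating branch above, each sample is just its bin's left edge plus a uniform offset scaled by the bin width. A minimal numpy sketch of that placement step in one dimension with hypothetical values (the interpolating branch additionally tilts the intrabin distribution by the normalized density gradient):

import numpy as np

edge = np.array([0.0, 1.0, 2.5, 4.0])         # bin edges in one dimension
wid = np.diff(edge)                           # bin widths
bidx = np.array([0, 2, 2, 1])                 # bin index chosen for each sample
loc = np.random.uniform(0.0, 1.0, bidx.size)  # relative position within each bin

# Uniform / no-interpolation placement: left edge plus fractional offset
vals = edge[bidx] + wid[bidx] * loc
print(vals)   # each value lies inside its chosen bin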