Example #1
0
    def compare_scipy_1d(self, kernel):
        """Compare 1D densities from `scipy.stats.gaussian_kde` and `kale.KDE`.

        A sample of 100 points (half normal, half log-normal) is evaluated on a fine linear
        grid using both estimators for several bandwidth choices; the two results must agree
        to within `np.allclose` tolerances for each bandwidth.

        `kernel` is forwarded unchanged to `kale.KDE`.

        """
        print("\n|Test_KDE_PDF:test_compare_scipy_1d()|")
        half = 100 // 2
        normal_part = np.random.normal(6.0, 1.0, half)
        lognormal_part = np.random.lognormal(0, 0.5, size=half)
        sample = np.concatenate([normal_part, lognormal_part])

        # Coarse edges first, then a fine grid of evaluation points built from those edges
        edges = utils.spacing([-1, 14.0], 'lin', 40)
        points = utils.spacing(edges, 'lin', 3000)

        def build_scipy(xx, bw):
            return sp.stats.gaussian_kde(xx, bw_method=bw)

        def build_kale(xx, bw):
            return kale.KDE(xx, bandwidth=bw, kernel=kernel)

        def evaluate(builder, bw):
            # Prefer a `density` method; estimators lacking that attribute fall back to `pdf`
            try:
                return builder(sample, bw).density(points, probability=True)[1]
            except AttributeError:
                return builder(sample, bw).pdf(points)

        for bandwidth in ('scott', 0.04, 0.2, 0.8):
            from_scipy, from_kale = [
                evaluate(builder, bandwidth) for builder in (build_scipy, build_kale)
            ]

            print("method: {}".format(bandwidth))
            print("\t" + utils.stats_str(from_scipy))
            print("\t" + utils.stats_str(from_kale))
            assert_true(np.allclose(from_scipy, from_kale))

        return
Example #2
0
    def ppf(self, cd):
        """Percent-point function: the inverse of the cumulative distribution function.

        The inverse is a cubic interpolant (with extrapolation) of the tabulated `cdf_grid`,
        built on first use and cached in `_ppf_func`.

        NOTE: when `SYMMETRIC` is set, inputs above 0.5 are folded to `1 - cd` before the lookup
        and the corresponding outputs are negated afterwards, so that only the lower half of
        the cdf is (effectively) used.  This produces better numerical results (unclear why).
        In that case the input is promoted to at least 1D.

        A `ValueError` from the interpolant is logged with diagnostics and re-raised.

        """
        symmetric = self.SYMMETRIC

        # Lazily construct (and cache) the interpolant mapping cdf values ==> positions
        interp = self._ppf_func
        if interp is None:
            positions, cumulative = self.cdf_grid
            interp = sp.interpolate.interp1d(
                cumulative, positions, kind='cubic', fill_value='extrapolate')
            self._ppf_func = interp

        # Fold the upper half of the cdf onto the lower half (see 'note' above)
        upper = None
        if symmetric:
            folded = np.atleast_1d(cd)
            upper = (folded > 0.5)
            # Work on a copy so that the caller's array is never modified
            folded = np.copy(folded)
            folded[upper] = 1 - folded[upper]
            cd = folded

        try:
            result = interp(cd)
        except ValueError:
            logging.error("`_ppf_func` failed!")
            logging.error("input `cd` = {}  <===  {}".format(
                utils.stats_str(cd), utils.array_str(cd)))
            for grid_vals in self.cdf_grid:
                logging.error("\tcdf_grid: {} <== {}".format(
                    utils.stats_str(grid_vals), utils.array_str(grid_vals)))
            raise

        # Undo the folding: values looked up via `1 - cd` lie on the opposite side of zero
        if symmetric:
            result[upper] = -result[upper]

        return result
Example #3
0
    def test_resample_keep_params_1(self):
        """Resampling with `keep` must leave the kept parameter's values untouched.

        A row whose entries are all `+norm` or `-norm` is inserted at each of the three possible
        positions of a two-parameter dataset.  After resampling with `keep` set to that
        position, the kept row must still contain only `+/-norm`, and each remaining row must be
        consistent with its input data according to a two-sample KS test (p-value > 0.05).

        """
        print("\n|Test_KDE_Resample:test_resample_keep_params_1()|")
        np.random.seed(9235)
        num = int(1e3)
        half = num // 2

        # Construct some random data
        # ------------------------------------
        first = np.concatenate([
            np.random.normal(6.0, 1.0, half),
            np.random.lognormal(1.0, 0.5, size=half),
        ])
        # Narrow second parameter, weakly correlated with the first
        second = np.random.normal(3.0, 0.02, first.size) + first / 100

        data = [first, second]
        norm = 2.3

        # Insert a row of `+/-norm` values at position `pos`, make sure it is preserved in resample
        for pos in range(3):
            stacked = np.array(data)
            fixed = norm * np.ones_like(stacked[0])
            flip = np.random.choice(fixed.size, fixed.size // 2)
            fixed[flip] *= -1
            stacked = np.insert(stacked, pos, fixed, axis=0)

            # Construct KDE
            kde3d = kale.KDE(stacked)

            # Resample from KDE preserving the fixed row
            samples = kde3d.resample(num, keep=pos)

            # Make sure the kept values are still the same
            kept = samples[pos]
            is_plus = np.isclose(kept, norm)
            is_minus = np.isclose(kept, -norm)
            assert_true(np.all(is_plus | is_minus))

            # Make sure the other two parameters are consistent (KS-test) with input data
            others = np.delete(samples, pos, axis=0)
            for jj, original in enumerate(data):
                resampled = others[jj]
                ks, pv = sp.stats.ks_2samp(resampled, original)
                msg = "{} {} :: {:.2e} {:.2e}".format(pos, jj, ks, pv)
                print("\t" + utils.stats_str(resampled))
                print("\t" + utils.stats_str(original))
                print(msg)
                assert_true(pv > 0.05)

        return
Example #4
0
def _check_reflect(reflect, data, weights=None, helper=False):
    """Make sure the given `reflect` argument is valid given the data shape.

    Automatic (`True`) bounds are replaced by the data extrema, padded outwards by the relative
    amount `_NUM_PAD`.  The caller's `reflect` object is never modified.

    Arguments
    ---------
    reflect : None, bool, or sequence
        * `None` or `False` : no reflection, `None` is returned.
        * `True` : use automatic bounds for every parameter (row) of `data`.
        * sequence with one entry per row of `data`, each entry being `None`, `True`, or a
          `[lower, upper]` pair in which either element may be `None` or `True`.
        * for one-parameter data, a bare `[lower, upper]` pair is also accepted.
    data : array_like, two-dimensional
        Its shape is unpacked as `(ndim, nval)`, i.e. one row per parameter.
    weights : array_like or None
        Only used when `helper` is set, to report the weighted fraction of data that lie
        outside of the reflection bounds.
    helper : bool
        If set, log warnings about data points lying outside of the reflection bounds.

    Returns
    -------
    reflect : None or list
        One entry per row of `data`; each entry is `None` or a `[lower, upper]` list.

    Raises
    ------
    ValueError
        If `reflect` does not match the number of rows of `data`, if one of its entries is
        malformed, or if a `[lower, upper]` pair is not strictly increasing.

    """
    if reflect is None:
        return reflect

    if reflect is False:
        return None

    # NOTE: FIX: Should this happen in the method that calls `_check_reflect`?
    # data = np.atleast_2d(data)
    # ndim, nval = np.shape(data)
    data = np.asarray(data)
    ndim, nval = data.shape
    if reflect is True:
        reflect = [True for ii in range(ndim)]

    # For one-parameter data accept a bare `[lower, upper]` pair.  Wrap it in a plain list
    # instead of converting to an array: array conversion would coerce `True` into `1.0`,
    # silently turning a request for an automatic bound into a bound at 1.0
    if (len(reflect) == 2) and (ndim == 1) and all(np.ndim(ref) == 0 for ref in reflect):
        reflect = [reflect]

    if (len(reflect) != ndim):
        err = "`reflect` ({},) must match the data with shape ({}) parameters!".format(
            len(reflect), data.shape)
        raise ValueError(err)

    try:
        goods = [(ref is None) or (ref is True) or (len(ref) == 2) for ref in reflect]
    except TypeError as err:
        msg = "Invalid `reflect` argument: Error: '{}'".format(err)
        raise ValueError(msg)

    if not all(goods):
        err = "each row of `reflect` must be `None` or have shape (2,)!  '{}'".format(reflect)
        raise ValueError(err)

    # Work on a copy (list of lists) so that neither the caller's outer sequence nor its rows
    # are modified, and so that tuples can be given as input
    reflect = [ref if ((ref is None) or (ref is True)) else list(ref) for ref in reflect]

    # Perform additional diagnostics
    for ii in range(ndim):
        bounds = reflect[ii]
        if bounds is True:
            bounds = [True, True]

        if bounds is not None:
            # Pad automatic bounds *away* from the data: the sign of the relative padding must
            # follow the sign of the extremum, otherwise negative extrema are moved inwards
            if bounds[0] is True:
                lo = np.min(data[ii])
                bounds[0] = lo * (1 - np.sign(lo) * _NUM_PAD)
            if bounds[1] is True:
                hi = np.max(data[ii])
                bounds[1] = hi * (1 + np.sign(hi) * _NUM_PAD)

            reflect[ii] = bounds

            # Only a fully specified pair can be checked for ordering
            if (bounds[0] is not None) and (bounds[1] is not None) and (bounds[0] >= bounds[1]):
                err = "Reflect is out of order:  `reflect`[{}] = {}  !".format(ii, reflect[ii])
                raise ValueError(err)

        if helper:
            # Warn if any datapoints are outside of reflection bounds
            bads = utils.bound_indices(data[ii, :], reflect[ii], outside=True)
            if np.any(bads):
                if weights is None:
                    frac = np.count_nonzero(bads) / bads.size
                else:
                    frac = np.sum(weights[bads]) / np.sum(weights)
                msg = (
                    "A fraction {:.2e} of data[{}] ".format(frac, ii) +
                    " are outside of `reflect` bounds!"
                )
                logging.warning(msg)
                msg = (
                    "`reflect[{}]` = {}; ".format(ii, reflect[ii]) +
                    "`data[{}]` = {}".format(ii, utils.stats_str(data[ii], weights=weights))
                )
                logging.warning(msg)
                logging.warning("I hope you know what you're doing.")

    return reflect
Example #5
0
    def _resample_reflect(self, data, size, reflect, weights=None, keep=None):
        """Resample the given data using reflection.

        The data are mirrored about the reflection locations, candidate points are drawn from
        the enlarged dataset with `_resample_clear`, and only candidates falling inside of the
        bounds returned by `_reflect_data` are accepted.  Drawing is repeated (in chunks, if
        `self._chunk` is set) until `size` points have been accepted.

        Arguments
        ---------
        data : array_like, two-dimensional
            Its shape is unpacked as `(ndim, nvals)`.
        size : int
            Number of samples to return.
        reflect :
            Reflection specification, passed on to `utils.cov_keep_vars` and `_reflect_data`.
        weights : array_like or None
            Passed on to `_reflect_data`, `_truncate_reflections` and `_resample_clear`.
        keep :
            Passed to `utils.cov_keep_vars` to modify the covariance matrix; the candidate draws
            themselves are made with `keep=None` because the matrix already accounts for it.

        Returns
        -------
        samps : ndarray with shape (ndim, size)

        Raises
        ------
        RuntimeError
            If fewer than `size` valid samples were obtained within the allowed number of
            iterations (`MAX` per chunk).

        """
        # Modify covariance-matrix for any `keep` dimensions
        matrix = utils.cov_keep_vars(self.matrix, keep, reflect=reflect)

        ndim, _ = np.shape(data)

        # Actually 'reflect' (append new, mirrored points) around the given reflection points
        #   Also construct bounding box for valid data
        data, bounds, weights = self._reflect_data(data, reflect, weights=weights)

        # Remove data points outside of kernels (or truncated region)
        data, weights = self._truncate_reflections(data, bounds, weights=weights)

        chunk = self._chunk
        if (chunk is not None) and (chunk < size):
            num_chunks = int(np.ceil(size/chunk))
            chunk_size = int(np.ceil(size/num_chunks))
        else:
            chunk_size = size
            num_chunks = 1

        # Draw randomly from the given data points, proportionally to their weights
        samps = np.zeros((size, ndim))
        num_good = 0
        cnt = 0
        MAX = 10
        draw = chunk_size
        fracs = []
        while (num_good < size) and (cnt < MAX * num_chunks):
            # Draw candidate resample points
            #    set `keep` to None, `matrix` is already modified to account for it
            trial = self._resample_clear(data, draw, weights=weights, matrix=matrix, keep=None)
            # Find the (boolean) indices of values within target boundaries
            idx = utils.bound_indices(trial, bounds)

            # Record the acceptance fraction of this draw (for diagnostics on failure)
            ngd = np.count_nonzero(idx)
            fracs.append(ngd/idx.size)

            # Store good values to output array, discarding any beyond the requested `size`
            ngd = min(ngd, size - num_good)
            samps[num_good:num_good+ngd, :] = trial.T[idx, :][:ngd]

            # Increment counters
            num_good += ngd
            cnt += 1
            # Next time, draw `2**ndim` times as many as are still needed (at most a chunk's worth)
            draw = (2**ndim) * np.minimum(size - num_good, chunk_size)
            # Cap the draw at the chunk size, but only when chunking is actually enabled
            if chunk is not None:
                draw = np.minimum(draw, int(chunk))

        if num_good < size:
            err = "Failed to draw '{}' samples in {} iterations!".format(size, cnt)
            logging.error("")
            logging.error(err)
            logging.error("fracs = {}\n\t{}".format(utils.stats_str(fracs), fracs))
            logging.error("Obtained {} samples".format(num_good))
            logging.error("Reflect: {}".format(reflect))
            logging.error("Bandwidths: {}".format(np.sqrt(self.matrix.diagonal().squeeze())))
            logging.error("data = ")
            for dd in data:
                logging.error("\t{}".format(utils.stats_str(dd)))

            raise RuntimeError(err)

        samps = samps.T
        return samps