def test_different_bws(self): print("\n|Test_KDE_Resample:test_different_bws()|") np.random.seed(9235) NUM = 1000 a1 = np.random.normal(6.0, 1.0, NUM // 2) a2 = np.random.lognormal(0, 0.5, size=NUM // 2) aa = np.concatenate([a1, a2]) bb = np.random.normal(3.0, 0.02, NUM) + aa / 100 data = [aa, bb] edges = [utils.spacing(dd, 'lin', 100, stretch=1.0) for dd in data] cents = [utils.midpoints(ee, 'lin') for ee in edges] xe, ye = np.meshgrid(*edges, indexing='ij') xc, yc = np.meshgrid(*cents, indexing='ij') bws = [0.5, 2.0] kde2d = kale.KDE(data, bandwidth=bws) kde1d = [kale.KDE(dd, bandwidth=ss) for dd, ss in zip(data, bws)] for ii in range(2): samp_1d = kde1d[ii].resample(NUM).squeeze() samp_2d = kde2d.resample(NUM)[ii] # Make sure the two distributions resemble eachother ks, pv = sp.stats.ks_2samp(samp_1d, samp_2d) # Calibrated to the above seed-value of `9235` print("{}, pv = {}".format(ii, pv)) assert_true(pv > 0.05) return
def pdf_params_fixed_bandwidth(self, kernel): print("\n|Test_KDE_PDF:pdf_params_fixed_bandwidth()|") np.random.seed(124) NUM = 1000 bandwidth = 0.02 sigma = [2.5, 1.5] corr = 0.9 s2 = np.square(sigma) cc = corr * sigma[0] * sigma[1] cov = [[s2[0], cc], [cc, s2[1]]] cov = np.array(cov) data = np.random.multivariate_normal([1.0, 2.0], cov, NUM).T sigma = [2.5, 0.5] corr = 0.0 s2 = np.square(sigma) cc = corr * sigma[0] * sigma[1] cov = [[s2[0], cc], [cc, s2[1]]] cov = np.array(cov) more = np.random.multivariate_normal([1.0, 6.0], cov, NUM).T data = np.concatenate([data, more], axis=-1) kde = kale.KDE(data, bandwidth=bandwidth, kernel=kernel) edges = [utils.spacing(dd, 'lin', 200, stretch=0.1) for dd in data] cents = [utils.midpoints(ee, 'lin') for ee in edges] widths = [np.diff(ee) for ee in edges] # area = widths[0][:, np.newaxis] * widths[1][np.newaxis, :] xe, ye = np.meshgrid(*edges, indexing='ij') xc, yc = np.meshgrid(*cents, indexing='ij') # grid = np.vstack([xc.ravel(), yc.ravel()]) hist, *_ = np.histogram2d(*data, bins=edges, density=True) for par in range(2): xx = cents[par] pdf_2d = kde.density(xx, params=par, probability=True)[1] kde_1d = kale.KDE(data[par, :], bandwidth=bandwidth, kernel=kernel) pdf_1d = kde_1d.density(xx, probability=True)[1] # print("matrix : ", kde.bandwidth.matrix, kde_1d.bandwidth.matrix) print(f"pdf_1d = {utils.stats_str(pdf_1d)}") print(f"pdf_2d = {utils.stats_str(pdf_2d)}") assert_true(np.allclose(pdf_2d, pdf_1d, rtol=1e-3)) for pdf, ls, lw in zip([pdf_2d, pdf_1d], ['-', '--'], [1.5, 3.0]): tot = np.sum(pdf * widths[par]) print("tot = {:.4e}".format(tot)) assert_true(np.isclose(tot, 1.0, rtol=2e-2)) vals = [xx, pdf] if par == 1: vals = vals[::-1] return
def test_reflect_2d(self): print("\n|Test_KDE_Resample:test_reflect_2d()|") seed = np.random.randint(int(1e4)) seed = 8067 print(seed) np.random.seed(seed) NUM = 2000 xx = np.random.uniform(0.0, 2.0, NUM) yy = np.random.normal(1.0, 1.5, NUM) yy = yy[yy < 2.0] yy = np.concatenate([yy, np.random.choice(yy, NUM - yy.size)]) data = [xx, yy] edges = [utils.spacing(aa, 'lin', 30) for aa in [xx, yy]] egrid = [utils.spacing(ee, 'lin', 100, stretch=0.5) for ee in edges] cgrid = [utils.midpoints(ee, 'lin') for ee in egrid] # width = [np.diff(ee) for ee in egrid] xc, yc = np.meshgrid(*cgrid, indexing='ij') # grid = np.vstack([xc.ravel(), yc.ravel()]) hist, *_ = np.histogram2d(*data, bins=egrid, density=True) kde = kale.KDE(data) reflections = [[[0.0, 2.0], [None, 2.0]], [[0.0, 2.0], None], [None, [None, 2.0]], None] for jj, reflect in enumerate(reflections): samps_ref = kde.resample(reflect=reflect) samps_nrm = kde.resample() if reflect is None: continue for ii, ref in enumerate(reflect): if ref is None: continue if ref[0] is None: ref[0] = -np.inf if ref[1] is None: ref[1] = np.inf print(jj, ii, ref) for kk, zz in enumerate([samps_nrm[ii], samps_ref[ii]]): inside = (ref[0] < zz) & (zz < ref[1]) outside = ((zz < ref[0]) | (ref[1] < zz)) print("\tin : ", kk, np.all(inside), np.any(inside)) print("\tout: ", kk, np.all(outside), np.any(outside)) if kk == 0: assert_false(np.all(inside)) assert_true(np.any(outside)) else: assert_true(np.all(inside)) assert_false(np.any(outside)) return
def compare_scipy_1d(self, kernel): print("\n|Test_KDE_PDF:test_compare_scipy_1d()|") NUM = 100 a1 = np.random.normal(6.0, 1.0, NUM // 2) a2 = np.random.lognormal(0, 0.5, size=NUM // 2) aa = np.concatenate([a1, a2]) bins = utils.spacing([-1, 14.0], 'lin', 40) grid = utils.spacing(bins, 'lin', 3000) methods = ['scott', 0.04, 0.2, 0.8] classes = [ lambda xx, bw: sp.stats.gaussian_kde(xx, bw_method=bw), lambda xx, bw: kale.KDE(xx, bandwidth=bw, kernel=kernel) ] for mm in methods: kde_list = [] for cc in classes: try: test = cc(aa, mm).density(grid, probability=True)[1] except AttributeError: test = cc(aa, mm).pdf(grid) kde_list.append(test) print("method: {}".format(mm)) print("\t" + utils.stats_str(kde_list[0])) print("\t" + utils.stats_str(kde_list[1])) assert_true(np.allclose(kde_list[0], kde_list[1])) return
def compare_scipy_2d(self, kernel): print("\n|Test_KDE_PDF:test_compare_scipy_2d()|") NUM = 1000 a1 = np.random.normal(6.0, 1.0, NUM//2) a2 = np.random.lognormal(0, 0.5, size=NUM//2) aa = np.concatenate([a1, a2]) bb = np.random.normal(3.0, 0.02, NUM) + aa/100 data = [aa, bb] edges = [utils.spacing(dd, 'lin', 30, stretch=0.5) for dd in data] cents = [utils.midpoints(ee, 'lin') for ee in edges] xe, ye = np.meshgrid(*edges, indexing='ij') xc, yc = np.meshgrid(*cents, indexing='ij') grid = np.vstack([xc.ravel(), yc.ravel()]) methods = ['scott', 0.04, 0.2, 0.8] # classes = [sp.stats.gaussian_kde, kale.KDE] classes = [lambda xx, bw: sp.stats.gaussian_kde(xx, bw_method=bw), lambda xx, bw: kale.KDE(xx, bandwidth=bw, kernel=kernel)] for mm in methods: kdes_list = [] for cc in classes: try: test = cc(data, mm).density(grid, probability=True)[1].reshape(xc.shape).T except AttributeError: test = cc(data, mm).pdf(grid).reshape(xc.shape).T kdes_list.append(test) assert_true(np.allclose(kdes_list[0], kdes_list[1])) return
def reflect_2d(self, kernel): print("\n|Test_KDE_PDF:test_reflect_2d()|") np.random.seed(124) NUM = 1000 xx = np.random.uniform(0.0, 2.0, NUM) yy = np.random.normal(1.0, 1.0, NUM) yy = yy[yy < 2.0] yy = np.concatenate([yy, np.random.choice(yy, NUM-yy.size)]) data = [xx, yy] edges = [utils.spacing(aa, 'lin', 30) for aa in [xx, yy]] egrid = [utils.spacing(ee, 'lin', 100, stretch=0.5) for ee in edges] cgrid = [utils.midpoints(ee, 'lin') for ee in egrid] width = [np.diff(ee) for ee in egrid] xc, yc = np.meshgrid(*cgrid, indexing='ij') grid = np.vstack([xc.ravel(), yc.ravel()]) hist, *_ = np.histogram2d(*data, bins=egrid, density=True) kde = kale.KDE(data, kernel=kernel) inside_test_func = np.all if kernel._FINITE == 'infinite' else np.any reflections = [ [[0.0, 2.0], [None, 2.0]], [[0.0, 2.0], None], [None, [None, 2.0]], None ] for jj, reflect in enumerate(reflections): pdf_1d = kde.density(grid, reflect=reflect, probability=True)[1] pdf = pdf_1d.reshape(hist.shape) inside = np.ones_like(pdf_1d, dtype=bool) if reflect is None: outside = np.zeros_like(pdf_1d, dtype=bool) else: outside = np.ones_like(pdf_1d, dtype=bool) for ii, ref in enumerate(reflect): if ref is None: ref = [-np.inf, np.inf] if ref[0] is None: ref[0] = -np.inf if ref[1] is None: ref[1] = np.inf inside = inside & (ref[0] < grid[ii]) & (grid[ii] < ref[1]) outside = outside & ((grid[ii] < ref[0]) | (ref[1] < grid[ii])) assert_true(inside_test_func(pdf_1d[inside] > 0.0)) assert_true(np.allclose(pdf_1d[outside], 0.0)) area = width[0][:, np.newaxis] * width[1][np.newaxis, :] prob_tot = np.sum(pdf * area) print(jj, reflect, "prob_tot = {:.4e}".format(prob_tot)) assert_true(np.isclose(prob_tot, 1.0, rtol=3e-2)) return
def test_resample_keep_params_1(self): print("\n|Test_KDE_Resample:test_resample_keep_params_1()|") np.random.seed(9235) NUM = int(1e3) # Construct some random data # ------------------------------------ a1 = np.random.normal(6.0, 1.0, NUM // 2) a2 = np.random.lognormal(1.0, 0.5, size=NUM // 2) aa = np.concatenate([a1, a2]) # aa = a1 bb = np.random.normal(3.0, 0.02, aa.size) + aa / 100 data = [aa, bb] norm = 2.3 # Add an array of uniform values at location `ii`, make sure they are preserved in resample for ii in range(3): test = np.array(data) tt = norm * np.ones_like(test[0]) idx = np.random.choice(tt.size, tt.size // 2) tt[idx] *= -1 test = np.insert(test, ii, tt, axis=0) # Construct KDE kde3d = kale.KDE(test) # Resample from KDE preserving the uniform data samples = kde3d.resample(NUM, keep=ii) # Make sure the uniform values are still the same param_samp = samples[ii] assert_true( np.all( np.isclose(param_samp, norm) | np.isclose(param_samp, -norm))) # Make sure the other two parameters are consistent (KS-test) with input data samples = np.delete(samples, ii, axis=0) for jj in range(2): stuff = [samples[jj], data[jj]] ks, pv = sp.stats.ks_2samp(*stuff) msg = "{} {} :: {:.2e} {:.2e}".format(ii, jj, ks, pv) print("\t" + utils.stats_str(stuff[0])) print("\t" + utils.stats_str(stuff[1])) print(msg) assert_true(pv > 0.05) return
def reflect_1d(self, kernel): print("\n|Test_KDE_PDF:reflect_1d()|") np.random.seed(124) NUM = 1000 EXTR = [0.0, 2.0] aa = np.random.uniform(*EXTR, NUM) egrid = utils.spacing(aa, 'lin', 2000, stretch=0.5) cgrid = utils.midpoints(egrid, 'lin') delta = np.diff(egrid) boundaries = [None, EXTR] for bnd in boundaries: kde = kale.KDE(aa, kernel=kernel) pdf = kde.density(cgrid, reflect=bnd, probability=True)[1] # If the kernel's support is infinite, then all points outside of boundaries should be # nonzero; if it's finite-supported, then only some of them (near edges) will be outside_test_func = np.all if kernel._FINITE == 'infinite' else np.any # Make sure unitarity is preserved tot = np.sum(pdf * delta) print("Boundary '{}', total = {:.4e}".format(bnd, tot)) assert_true(np.isclose(tot, 1.0, rtol=1e-3)) ratio_extr = np.max(pdf) / np.min(pdf[pdf > 0]) # No reflection, then non-zero PDF everywhere, and large ratio of extrema if bnd is None: assert_true(outside_test_func(pdf[cgrid < EXTR[0]] > 0.0)) assert_true(outside_test_func(pdf[cgrid > EXTR[1]] > 0.0)) assert_true(ratio_extr > 10.0) # No lower-reflection, nonzero values below 0.0 elif bnd[0] is None: assert_true(outside_test_func(pdf[cgrid < EXTR[0]] > 0.0)) assert_true(np.all(pdf[cgrid > EXTR[1]] == 0.0)) # No upper-reflection, nonzero values above 2.0 elif bnd[1] is None: assert_true(np.all(pdf[cgrid < EXTR[0]] == 0.0)) assert_true(outside_test_func(pdf[cgrid > EXTR[1]] > 0.0)) else: assert_true(np.all(pdf[cgrid < EXTR[0]] == 0.0)) assert_true(np.all(pdf[cgrid > EXTR[1]] == 0.0)) assert_true(ratio_extr < 2.0) return
def test_reflect_1d(self): print("\n|Test_KDE_Resample:test_reflect_1d()|") np.random.seed(1245) NUM = 1000 extr = [0.0, 2.0] data = np.random.uniform(*extr, NUM) # fig, axes = plt.subplots(figsize=[10, 5], ncols=2) # edges = np.linspace(-0.2, 2.2, 25) # for ax in axes: # ax.scatter(data, -0.05*np.ones_like(data), alpha=0.1, marker='|') # ax.hist(data, bins=edges, density=True, edgecolor='k', alpha=0.5) kde = kale.KDE(data) reflections = [None, extr] for ii, reflect in enumerate(reflections): samp = kde.resample(NUM, reflect=reflect) # axes[ii].scatter(samp, -0.07*np.ones_like(samp), # alpha=0.1, color='r', marker='|') # axes[ii].hist(samp, bins=edges, # density=True, edgecolor='k', alpha=0.4, color='r', rwidth=0.5) some_outside = np.any((samp < extr[0]) + (extr[1] < samp)) print("reflect = '{}', outside = '{}'".format( reflect, some_outside)) if reflect is None: # There should be some samples outside assert_true(some_outside) else: # There should not be any samples outside assert_false(some_outside) ks, pv = sp.stats.ks_2samp(data, samp) print("ks = '{}', pv = '{}'".format(ks, pv)) # Check new sample is consistent assert_true(pv > 0.1) return
a1 = np.random.normal(4.0, 1.0, num_1) loc = 0.3 a2 = np.random.lognormal(loc, 0.5, size=num_2) data = np.concatenate([a1, a2]) np.random.seed(1234) np.random.shuffle(data) grid = np.linspace(-0.2, 8.0, 1000) # edges = np.linspace(-0.2, 8.0, 10) true_pdf = (num_1 / NUM) * np.power(2 * np.pi, -1 / 2) * np.exp( -(grid - 4.0)**2 / 2) true_pdf += (num_2 / NUM) * np.power( 2 * np.pi, -1 / 2) / (0.5 * grid) * np.exp(-(np.log(grid) - loc)**2 / 0.5) kde_full = kale.KDE(data, kernel='Parabola', bandwidth=0.3) pdf_full = kde_full.pdf(grid) np.random.seed(4321) samp_full = kde_full.resample(NUM) edges_full = np.linspace(-0.2, 8.0, 40) output_path = os.path.join(os.path.curdir, "anim", "") if os.path.exists(output_path) and os.path.isdir(output_path): shutil.rmtree(output_path) os.makedirs(output_path) fname_base = os.path.join(output_path, "anim_{:05d}.png") fig, axes = plt.subplots(figsize=[10, 4], ncols=2, sharey=True) plt.subplots_adjust(left=0.04, bottom=0.08, right=0.98, top=0.92, wspace=0.05)