def ks_test(a1: np.ndarray, a2: np.ndarray, b: int = 100, random_state=None) -> Dict:
    """
    Perform permutation two sample Kolmogorov-Smirnov test.

    NaN values are dropped from both samples before the test is run.

    Parameters
    ----------
    a1 : array-like
        First sample.
    a2 : array-like
        Second sample.
    b : int, optional
        Number of permutations. Must be at least 1. Default 100.
    random_state : numpy.random.Generator or int, optional
        Random number generator instance. If an integer (Python or NumPy)
        is passed, seed the numpy default generator with it. Default is to
        use `numpy.random.default_rng()`.

    Returns
    -------
    {'d': float, 'prop': float}
        ``d`` is the observed D statistic: the largest absolute difference
        between the two empirical CDFs, evaluated at every pooled
        observation. ``prop`` is the proportion of permutation D statistics
        that are strictly greater than ``d``.

    Raises
    ------
    ValueError
        If either sample has no non-NaN observations, or if ``b < 1``.
    """
    if random_state is None:
        rng = np.random.default_rng()
    elif isinstance(random_state, (int, np.integer)):
        # np.integer covers seeds such as np.int64(3), which are not
        # instances of the builtin int.
        rng = np.random.default_rng(random_state)
    else:
        rng = random_state

    a1 = np.asarray(a1)
    a2 = np.asarray(a2)
    a1 = a1[~np.isnan(a1)]
    a2 = a2[~np.isnan(a2)]
    n1 = len(a1)
    n2 = len(a2)
    if n1 == 0 or n2 == 0:
        raise ValueError("each sample needs at least one non-NaN observation")
    if b < 1:
        raise ValueError("b must be a positive number of permutations")
    n = n1 + n2

    # Sorted pooled sample: the ECDFs are step functions that only change at
    # these points, so the supremum of |F1 - F2| is attained on this grid.
    pooled = np.sort(np.append(a1, a2))

    def ecdf_on_pooled(sorted_sample):
        # Right-continuous ECDF: fraction of the sample <= each pooled value.
        counts = np.searchsorted(sorted_sample, pooled, side="right")
        return counts / len(sorted_sample)

    def max_gap(sorted_first, sorted_second):
        gaps = np.abs(ecdf_on_pooled(sorted_first) - ecdf_on_pooled(sorted_second))
        return np.max(gaps)

    # The observed and the permuted statistics go through the same routine,
    # so exact ties between them compare equal under the strict `>` below.
    d = max_gap(np.sort(a1), np.sort(a2))

    permute_d = np.empty(b)
    for k in range(b):
        # Randomly relabel n2 pooled observations as "second sample".
        # Boolean masking of the sorted pooled array keeps both parts sorted.
        in_first = np.ones(n, dtype=bool)
        in_first[rng.choice(n, size=n2, replace=False)] = False
        permute_d[k] = max_gap(pooled[in_first], pooled[~in_first])

    return {"d": d, "prop": np.mean(permute_d > d)}
def test_quantile_is_inverse_of_cdf(rng):
    """Feeding the CDF values of a sample to its quantile function returns the sample."""
    sample = rng.normal(size=30)
    cdf = cdf_gen(sample)
    probabilities = cdf(sample)
    quantile = quantile_function_gen(sample)
    recovered = quantile(probabilities)
    assert_equal(recovered, sample)
def test_cdf_on_array():
    """The CDF accepts arrays and steps up exactly at each data point."""
    points = np.arange(4)
    size = len(points)
    cdf = cdf_gen(points)
    reached = (points + 1) / size

    # At a data point, and just to its right, that point is already counted.
    assert_equal(cdf(points), reached)
    assert_equal(cdf(points + 1e-10), reached)
    # Just to the left of a data point it is not counted yet.
    assert_equal(cdf(points - 1e-10), points / size)
def test_cdf_simple_cases():
    """Check the CDF of [0, 1, 2, 3] at each of its data points."""
    cdf = cdf_gen([0, 1, 2, 3])
    expected_by_point = ((0, 0.25), (1, 0.5), (2, 0.75), (3, 1.0))
    for point, expected in expected_by_point:
        assert cdf(point) == expected
def test_cdf_at_infinity():
    """The CDF is 0 at negative infinity and 1 at positive infinity."""
    data = np.arange(10)
    cdf = cdf_gen(data)

    lower_limit = cdf(-np.inf)
    assert lower_limit == 0.0

    upper_limit = cdf(np.inf)
    assert upper_limit == 1.0
def test_cdf_increasing(rng):
    """The CDF never decreases across an evenly spaced grid spanning the sample."""
    sample = rng.normal(size=100)
    cdf = cdf_gen(sample)
    grid = np.linspace(sample.min(), sample.max(), 100)

    values = []
    for point in grid:
        values.append(cdf(point))

    steps = np.diff(values)
    assert np.all(steps >= 0)