Example #1
0
def ks_test(a1: np.ndarray, a2: np.ndarray, b: int = 100, random_state=None) -> Dict:
    """
    Perform permutation two sample Kolmogorov-Smirnov test.

    NaN values are dropped from both samples. The D statistic is the largest
    absolute difference between the two samples' CDFs (as built by `cdf_gen`),
    evaluated at every pooled observation. The permutation distribution is
    obtained by randomly re-splitting the pooled sample `b` times into groups
    of the original sizes and recomputing the statistic for each split.

    Parameters
    ----------
    a1 : array-like
        First sample.
    a2 : array-like
        Second sample.
    b : int, optional
        Number of permutations. Must be at least 1. Default 100.
    random_state : numpy.random.Generator or int, optional
        Random number generator instance. If an integer (Python or NumPy) is
        passed, seed the numpy default generator with it. Default is to use
        `numpy.random.default_rng()`.

    Returns
    -------
    {'d': float, 'prop': float}
        D statistic as well as the proportion of permuted statistics that are
        strictly greater than that statistic.

    Raises
    ------
    ValueError
        If `b` is less than 1, or if either sample is empty once NaNs have
        been removed.
    """
    if b < 1:
        raise ValueError("b must be a positive number of permutations")

    if random_state is None:
        rng = np.random.default_rng()
    elif isinstance(random_state, (int, np.integer)):
        # np.integer covers seeds such as np.int64 that are not Python ints.
        rng = np.random.default_rng(random_state)
    else:
        rng = random_state

    a1 = np.asarray(a1)
    a2 = np.asarray(a2)

    a1 = a1[~np.isnan(a1)]
    a2 = a2[~np.isnan(a2)]

    n1 = len(a1)
    n2 = len(a2)
    if n1 == 0 or n2 == 0:
        # The statistic divides by both sample sizes; fail early and clearly.
        raise ValueError("both samples must contain at least one non-NaN value")
    n = n1 + n2

    f1 = cdf_gen(a1)
    f2 = cdf_gen(a2)
    pooled = np.sort(np.append(a1, a2))
    d = np.max([abs(f1(v) - f2(v)) for v in pooled])

    # Hoisted so the index population is not rebuilt for every permutation.
    indices = np.arange(n)

    def permuted_statistic():
        # Randomly assign n2 pooled observations to the second group.
        in_first = np.ones(n, dtype=bool)
        in_first[rng.choice(indices, size=n2, replace=False)] = False

        # `pooled` is sorted, so each masked subset is sorted too and
        # searchsorted(side="right") counts the observations <= each point.
        ecdf1 = np.searchsorted(pooled[in_first], pooled, side="right") / n1
        ecdf2 = np.searchsorted(pooled[~in_first], pooled, side="right") / n2
        return np.max(np.abs(ecdf1 - ecdf2))

    permute_d = np.array([permuted_statistic() for _ in range(b)])

    return {"d": d, "prop": np.mean(permute_d > d)}
def test_quantile_is_inverse_of_cdf(rng):
    """Mapping a sample through its CDF and then its quantile function returns the sample."""
    sample = rng.normal(size=30)
    cdf = cdf_gen(sample)
    probabilities = cdf(sample)
    quantile = quantile_function_gen(sample)
    assert_equal(quantile(probabilities), sample)
def test_cdf_on_array():
    """Check array-valued CDF evaluation at, just above and just below each data point."""
    points = np.arange(4)
    size = len(points)
    ecdf = cdf_gen(points)
    eps = 1e-10

    # At a data point, and immediately after it, the point itself is counted.
    assert_equal(ecdf(points), (points + 1) / size)
    assert_equal(ecdf(points + eps), (points + 1) / size)
    # Immediately before a data point, it is not counted yet.
    assert_equal(ecdf(points - eps), points / size)
def test_cdf_simple_cases():
    """Check scalar CDF values at each of four equally weighted data points."""
    ecdf = cdf_gen([0, 1, 2, 3])
    expected = ((0, 0.25), (1, 0.5), (2, 0.75), (3, 1.0))
    for point, probability in expected:
        assert ecdf(point) == probability
def test_cdf_at_infinity():
    """Check that the CDF is 0 at negative infinity and 1 at positive infinity."""
    ecdf = cdf_gen(np.arange(10))
    for point, limit in ((-np.inf, 0.0), (np.inf, 1.0)):
        assert ecdf(point) == limit
def test_cdf_increasing(rng):
    """Check that CDF values never decrease along a grid spanning the sample range."""
    sample = rng.normal(size=100)
    ecdf = cdf_gen(sample)
    grid = np.linspace(sample.min(), sample.max(), 100)
    values = [ecdf(point) for point in grid]
    assert np.all(np.diff(values) >= 0)