Example #1
    def _cdf(self, statistic, samples):
        # Some simple, exact cases (more in Ruben & Gambino).
        if statistic <= 1 / (2 * samples):
            return 0.
        if statistic >= 1:
            return 1.
        if statistic <= 1 / samples:
            t = 2 * statistic - 1 / samples
            return exp(gammaln(samples + 1) + samples * log(t))
        if statistic >= 1 - 1 / samples:
            return 1 - 2 * (1 - statistic)**samples

        # For small sample counts we can afford an exact method.
        if samples < 150:
            # With samples = 150 the matrix calculation takes about 100 ms
            # on a ~3 GFLOPS/core processor.
            if samples * statistic**2 < 7:
                # For a small threshold the Durbin matrix will be small.
                return ks_unif_durbin_matrix(samples, statistic)
            else:
                # Double the one-sided probability; accurate when close to one.
                return 1 - 2 * smirnov(samples, statistic)

        # Beyond this point we must compromise between speed and accuracy.
        if samples < 100000 and samples * statistic**1.5 < 1.4:
            # The cost of the matrix calculation should still be acceptable.
            return ks_unif_durbin_matrix(samples, statistic)
        else:
            # No options left but to use the asymptotic approximation.
            return ks_unif_pelz_good(samples, statistic)
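A quick check of the doubled one-sided branch (a sketch, not part of the class above; scipy.stats.kstwo, available in SciPy >= 1.4, is used here only as an exact two-sided reference):

from scipy.special import smirnov
from scipy.stats import kstwo

n, d = 20, 0.6                # n * d**2 >= 7, so the doubling branch applies
print(2 * smirnov(n, d))      # doubled one-sided tail
print(kstwo.sf(d, n))         # exact two-sided tail for comparison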
Example #2
    def test_n_large(self):
        # test for large values of n
        # Probabilities should go down as n goes up
        x = 0.4
        pvals = np.array([smirnov(n, x) for n in range(400, 1100, 20)])
        dfs = np.diff(pvals)
        assert_(np.all(dfs <= 0), msg='Not all diffs negative %s' % dfs)
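As an aside, smirnov is a NumPy ufunc, so the list comprehension above could equally be written as one vectorized call (same values):

import numpy as np
from scipy.special import smirnov

pvals = smirnov(np.arange(400, 1100, 20), 0.4)
assert np.all(np.diff(pvals) <= 0)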
Example #5
import numpy as np
from scipy.special import smirnov
from scipy.stats import kstest


def concentration_pfa(membership, ms_size, trim=False):
    # Return log10 of a KS-based false-alarm probability (clipped at -300)
    # that the positive membership values deviate from uniformity.
    membership = membership[membership > 0]
    if trim:
        ones = np.where(membership > 1 - 1e-5)[0]
        membership = np.delete(membership, ones[-min(len(ones), ms_size):])
    if len(membership) > 1:
        d_min, _ = kstest(membership, 'uniform', alternative='less')
        pvalue = smirnov(len(membership), d_min)
    else:
        pvalue = 1.
    if pvalue == 0:
        return -300
    else:
        return np.log10(pvalue)
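A hypothetical call to concentration_pfa (inputs invented for illustration); values near 0 mean the memberships look uniform, strongly negative values indicate a significant deviation from uniformity:

import numpy as np

rng = np.random.default_rng(0)
print(concentration_pfa(rng.uniform(size=50), ms_size=5))      # uniform memberships
print(concentration_pfa(rng.beta(5, 1, size=50), ms_size=5))   # skewed towards 1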
Example #6
    def _sf(self, statistic, samples):
        if statistic >= 1:
            # Statistic greater than 1 results in a NaN from Cephes smirnov().
            return 0.
        if statistic >= 1 - 1 / samples:
            # The _cdf code can suffer from some cancellation in this case.
            return min(1., 2 * (1 - statistic)**samples)
        probability = 1 - self._cdf(statistic, samples)
        if probability > 1e-5:
            # Not too much precision got lost to cancellation.
            return probability
        else:
            # When the cdf float is very close to one it does not have bits
            # of small enough magnitude to express its 1-complement properly.
            # Hence, an approximate direct sf calculation may be more precise.
            return min(1., 2 * smirnov(samples, statistic))
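An illustration of the cancellation the fallback guards against (a sketch): once the CDF rounds to exactly 1.0 in float64, its 1-complement is 0.0, while the direct tail computation keeps a tiny but nonzero probability:

from scipy.special import smirnov

n, d = 1000, 0.2
cdf = 1 - 2 * smirnov(n, d)   # rounds to exactly 1.0 in float64
print(1 - cdf)                # 0.0 -- the precision is gone
print(2 * smirnov(n, d))      # tiny but nonzero direct estimate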
Example #7
    def test_n_large(self):
        # test for large values of n
        # Probabilities should go down as n goes up
        x = 0.4
        pvals = np.array([smirnov(n, x) for n in range(400, 1100, 20)])
        dfs = np.diff(pvals)
        assert_(np.all(dfs <= 0), msg='Not all diffs negative %s' % dfs)

        dataset = [(1000, 1 - 1.0/2000, np.power(2000.0, -1000))]
        dataset = np.asarray(dataset)
        FuncData(smirnov, dataset, (0, 1), 2, rtol=_rtol).check()

        # Check asymptotic behaviour
        dataset = [(n, 1.0 / np.sqrt(n), np.exp(-2)) for n in range(1000, 5000, 1000)]
        dataset = np.asarray(dataset)
        FuncData(smirnov, dataset, (0, 1), 2, rtol=.05).check()
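The asymptotic dataset encodes the classical limit P(sqrt(n) * Dn+ > t) -> exp(-2*t**2): with x = 1/sqrt(n), i.e. t = 1, smirnov(n, x) should approach exp(-2). A spot check (a sketch):

import numpy as np
from scipy.special import smirnov

n = 4000
print(smirnov(n, 1.0 / np.sqrt(n)), np.exp(-2))   # both close to 0.135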
Example #10
import numpy as np
from numpy.linalg import norm  # assumed: the ord argument matches np.linalg.norm
from scipy.special import smirnov


def ks1(data, model, x, ord=np.inf):
    vals = np.sort(data)

    # Build the continuous CDF of the model, tabulated on the grid x.
    total = np.sum(model)
    ccdf = np.zeros(vals.shape)
    for i in range(len(vals)):  # xrange is Python 2 only
        idx = x <= vals[i]
        ccdf[i] = np.sum(model[idx]) / total

    # Build the discrete (empirical) CDF of the data.
    N = len(vals)
    dcdf = np.cumsum(np.ones(vals.shape)) / N

    # Distance between the CDFs and the one-sided KS tail probability.
    d = norm(dcdf - ccdf, ord)
    p = smirnov(N, d)

    return ccdf, dcdf, d, p
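A hypothetical use of ks1 (data, model and grid invented for illustration): compare a sample against a model density tabulated on a grid x:

import numpy as np

x = np.linspace(-5, 5, 1001)
model = np.exp(-0.5 * x**2)                        # unnormalized N(0, 1) density
data = np.random.default_rng(1).normal(size=100)
ccdf, dcdf, d, p = ks1(data, model, x)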
Example #11
from scipy.special import smirnov, smirnovi


def _sm_smi(n, p):
    return smirnov(n, smirnovi(n, p))
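Since smirnovi is the inverse of smirnov, _sm_smi should return its second argument up to floating-point error; a quick check:

print(_sm_smi(10, 0.3))   # ~0.3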
Example #12
    def test_nan(self):
        assert_(np.isnan(smirnov(1, np.nan)))
Example #15
import numpy as np
from scipy.special import smirnov

# Show the probability of a gap at least as big as 0, 0.5 and 1.0 for a sample of size 5

smirnov(5, [0, 0.5, 1.0])
# array([ 1.   ,  0.056,  0.   ])

# Compare a sample of size 5 drawn from a source N(0.5, 1) distribution against
# a target N(0, 1) CDF.

from scipy.stats import norm
n = 5
gendist = norm(0.5, 1)  # Normal distribution, mean 0.5, stddev 1
np.random.seed(seed=233423)  # Set the seed for reproducibility
x = np.sort(gendist.rvs(size=n))
x
# array([-0.20946287,  0.71688765,  0.95164151,  1.44590852,  3.08880533])
target = norm(0, 1)
cdfs = target.cdf(x)
cdfs
# array([ 0.41704346,  0.76327829,  0.82936059,  0.92589857,  0.99899518])
# Construct the empirical CDF and the K-S statistics (Dn+, Dn-, Dn)
ecdfs = np.arange(n + 1, dtype=float) / n
cols = np.column_stack(
    [x, ecdfs[1:], cdfs, cdfs - ecdfs[:n], ecdfs[1:] - cdfs])
np.set_printoptions(precision=3)
cols
# array([[ -2.095e-01,   2.000e-01,   4.170e-01,   4.170e-01,  -2.170e-01],
# [  7.169e-01,   4.000e-01,   7.633e-01,   5.633e-01,  -3.633e-01],
# [  9.516e-01,   6.000e-01,   8.294e-01,   4.294e-01,  -2.294e-01],
# [  1.446e+00,   8.000e-01,   9.259e-01,   3.259e-01,  -1.259e-01],
# [  3.089e+00,   1.000e+00,   9.990e-01,   1.990e-01,   1.005e-03]])
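
# A possible continuation (a sketch, not in the original listing): extract
# the one-sided statistics from the columns above and evaluate their tail
# probabilities with smirnov.
Dnp = (cdfs - ecdfs[:n]).max()   # Dn+
Dnm = (ecdfs[1:] - cdfs).max()   # Dn-
print(smirnov(n, [Dnp, Dnm]))    # one-sided probabilities of gaps this large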