def _cdf(self, statistic, samples):
    # Some simple, exact cases (more in Ruben & Gambino).
    if statistic <= 1 / (2 * samples):
        return 0.
    if statistic >= 1:
        return 1.
    if statistic <= 1 / samples:
        t = 2 * statistic - 1 / samples
        return exp(gammaln(samples + 1) + samples * log(t))
    if statistic >= 1 - 1 / samples:
        return 1 - 2 * (1 - statistic)**samples
    # For small sample counts we can afford an exact method.
    if samples < 150:
        # With samples = 150 the matrix calculation takes about 100 ms
        # on a ~3 GFLOPS/core processor.
        if samples * statistic**2 < 7:
            # For a small threshold the Durbin matrix will be small.
            return ks_unif_durbin_matrix(samples, statistic)
        else:
            # Double the one-sided probability; accurate when close to one.
            return 1 - 2 * smirnov(samples, statistic)
    # Beyond that we need to compromise between speed and accuracy.
    if samples < 100000 and samples * statistic**1.5 < 1.4:
        # The cost of the matrix calculation should still be acceptable.
        return ks_unif_durbin_matrix(samples, statistic)
    else:
        # No options left but to use an asymptotic approximation.
        return ks_unif_pelz_good(samples, statistic)
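The branching above computes the exact two-sided KS CDF where feasible and falls back to approximations elsewhere. Newer SciPy versions expose the same distribution as scipy.stats.kstwo; a minimal sanity-check sketch, assuming that module is available:

import numpy as np
from scipy.stats import kstwo

# Reference values for the two-sided KS statistic distribution; a routine
# like _cdf above should agree with these to high accuracy.
print(kstwo.cdf(0.2, 50))                        # single point, n = 50
print(kstwo.cdf(np.linspace(0.05, 0.3, 6), 50))  # a few points across the range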
def concentration_pfa(membership, ms_size, trim=False):
    # Discard zero memberships.
    membership = membership[membership > 0]
    if trim:
        # Optionally drop up to ms_size entries that are essentially 1.
        ones = np.where(membership > 1 - 1e-5)[0]
        membership = np.delete(membership, ones[-min(len(ones), ms_size):])
    if len(membership) > 1:
        # One-sided KS statistic against the uniform distribution.
        d_min, _ = kstest(membership, 'uniform', alternative='less')
        pvalue = smirnov(len(membership), d_min)
    else:
        pvalue = 1.
    # Return log10 of the p-value, floored at -300 to avoid log10(0).
    if pvalue == 0:
        return -300
    else:
        return np.log10(pvalue)
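A hypothetical invocation, with the imports the snippet relies on and a made-up membership vector (both are illustrative assumptions, not part of the original):

import numpy as np
from scipy.stats import kstest
from scipy.special import smirnov

# The last two entries are essentially 1 and get trimmed; the rest are
# tested against uniformity and the log10 p-value is returned.
membership = np.array([0.05, 0.1, 0.2, 0.4, 0.7, 0.999999, 1.0])
print(concentration_pfa(membership, ms_size=2, trim=True))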
def _sf(self, statistic, samples):
    if statistic >= 1:
        # Statistic greater than 1 results in a NaN from Cephes smirnov().
        return 0.
    if statistic >= 1 - 1 / samples:
        # The _cdf code can suffer from some cancellation in this case.
        return min(1., 2 * (1 - statistic)**samples)
    probability = 1 - self._cdf(statistic, samples)
    if probability > 1e-5:
        # Not too much precision got lost to cancellation.
        return probability
    else:
        # When the cdf float is very close to one it does not have bits
        # of small enough magnitude to express its 1-complement properly.
        # Hence, an approximate direct sf calculation may be more precise.
        return min(1., 2 * smirnov(samples, statistic))
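The cancellation described in the final branch is easy to demonstrate. A small sketch, assuming scipy.stats.kstwo as a reference for the two-sided CDF and SF:

from scipy.special import smirnov
from scipy.stats import kstwo

n, d = 100, 0.5
# The CDF here is within ~4e-22 of 1, which rounds to exactly 1.0 in
# float64, so the naive complement loses everything to cancellation...
print(1 - kstwo.cdf(d, n))   # 0.0
# ...while doubling the one-sided probability keeps the tiny tail.
print(2 * smirnov(n, d))     # ~3.9e-22
print(kstwo.sf(d, n))        # direct survival function, same magnitude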
def test_n_large(self):
    # Test for large values of n.
    # Probabilities should go down as n goes up.
    x = 0.4
    pvals = np.array([smirnov(n, x) for n in range(400, 1100, 20)])
    dfs = np.diff(pvals)
    assert_(np.all(dfs <= 0), msg='Not all diffs negative %s' % dfs)

    dataset = [(1000, 1 - 1.0 / 2000, np.power(2000.0, -1000))]
    dataset = np.asarray(dataset)
    FuncData(smirnov, dataset, (0, 1), 2, rtol=_rtol).check()

    # Check asymptotic behaviour.
    dataset = [(n, 1.0 / np.sqrt(n), np.exp(-2)) for n in range(1000, 5000, 1000)]
    dataset = np.asarray(dataset)
    FuncData(smirnov, dataset, (0, 1), 2, rtol=.05).check()
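The asymptotic check reflects the limit smirnov(n, x) ≈ exp(-2*n*x**2) for large n, so at x = 1/sqrt(n) the value should approach exp(-2) ≈ 0.1353. A standalone check of the same fact:

import numpy as np
from scipy.special import smirnov

# The exact one-sided probability converges to the asymptotic exp(-2).
for n in (1000, 2000, 4000):
    print(n, smirnov(n, 1.0 / np.sqrt(n)), np.exp(-2))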
import numpy as np
from numpy.linalg import norm  # assumed: norm(v, np.inf) gives the max abs deviation
from scipy.special import smirnov


def ks1(data, model, x, ord=np.inf):
    vals = np.sort(data)
    # Build the continuous cdf by accumulating the model over the grid x.
    total = np.sum(model)
    ccdf = np.zeros(vals.shape)
    for i in range(len(vals)):
        idx = x <= vals[i]
        ccdf[i] = np.sum(model[idx]) / total
    # Build the discrete (empirical) cdf.
    N = len(vals)
    dcdf = np.cumsum(np.ones(vals.shape)) / N
    d = norm(dcdf - ccdf, ord)
    p = smirnov(N, d)
    return ccdf, dcdf, d, p
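A hypothetical call, with synthetic data and a model tabulated on a grid (the seed, sample size, and distribution are illustrative only):

from scipy.stats import norm as norm_dist

rng = np.random.default_rng(0)
data = rng.normal(size=200)      # sample to test
grid = np.linspace(-5, 5, 1001)  # evaluation grid
model = norm_dist.pdf(grid)      # unnormalized model density on the grid
ccdf, dcdf, d, p = ks1(data, model, grid)
print(d, p)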
def _sm_smi(n, p):
    return smirnov(n, smirnovi(n, p))
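Since scipy.special.smirnovi is the functional inverse of smirnov, this round trip should reproduce its input up to floating-point error:

from scipy.special import smirnov, smirnovi

# smirnovi(n, p) finds the statistic whose exceedance probability is p,
# so feeding it back through smirnov recovers p.
for p in (0.01, 0.25, 0.9):
    print(p, smirnov(5, smirnovi(5, p)))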
def test_nan(self):
    assert_(np.isnan(smirnov(1, np.nan)))
import numpy as np
from scipy.special import smirnov

# Show the probability of a gap at least as big as 0, 0.5 and 1.0
# for a sample of size 5.
smirnov(5, [0, 0.5, 1.0])
# array([ 1.   ,  0.056,  0.   ])

# Compare a sample of size 5 drawn from a source N(0.5, 1) distribution
# against a target N(0, 1) CDF.
from scipy.stats import norm

n = 5
gendist = norm(0.5, 1)       # Normal distribution, mean 0.5, stddev 1
np.random.seed(seed=233423)  # Set the seed for reproducibility
x = np.sort(gendist.rvs(size=n))
x
# array([-0.20946287,  0.71688765,  0.95164151,  1.44590852,  3.08880533])
target = norm(0, 1)
cdfs = target.cdf(x)
cdfs
# array([ 0.41704346,  0.76327829,  0.82936059,  0.92589857,  0.99899518])

# Construct the empirical CDF and the K-S statistics (Dn+, Dn-, Dn).
ecdfs = np.arange(n + 1, dtype=float) / n
cols = np.column_stack([x, ecdfs[1:], cdfs, cdfs - ecdfs[:n], ecdfs[1:] - cdfs])
np.set_printoptions(precision=3)
cols
# array([[ -2.095e-01,   2.000e-01,   4.170e-01,   4.170e-01,  -2.170e-01],
#        [  7.169e-01,   4.000e-01,   7.633e-01,   5.633e-01,  -3.633e-01],
#        [  9.516e-01,   6.000e-01,   8.294e-01,   4.294e-01,  -2.294e-01],
#        [  1.446e+00,   8.000e-01,   9.259e-01,   3.259e-01,  -1.259e-01],
#        [  3.089e+00,   1.000e+00,   9.990e-01,   1.990e-01,   1.005e-03]])
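A natural continuation of this example: the one-sided statistics are the largest entries of the last two columns, and smirnov() turns each into the probability of a gap at least that large under the target distribution (the variable names below are my own):

Dn_plus = np.max(cdfs - ecdfs[:n])   # 0.56327829, second row of the table
Dn_minus = np.max(ecdfs[1:] - cdfs)  # 0.00100482, last row of the table
print(smirnov(n, Dn_plus))   # small: the sample sits well above the target CDF
print(smirnov(n, Dn_minus))  # close to 1: essentially no downward gap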