# Assumed imports for the snippets below: NumPy, pandas, SciPy's stats module,
# the dc_stat_think package (dcst), and its private module (dcst_private).
# The exact import paths are assumptions based on how the names are used here.
import numpy as np
import pandas as pd
import scipy.stats as st

import dc_stat_think as dcst
import dc_stat_think.dc_stat_think as dcst_private


def test_ks_stat(x):
    theor_data = np.random.normal(0, 1, size=100)
    correct, _ = st.ks_2samp(x, theor_data)
    assert np.isclose(dcst.ks_stat(x, theor_data), correct)

    theor_data = np.random.exponential(1, size=100)
    correct, _ = st.ks_2samp(x, theor_data)
    assert np.isclose(dcst.ks_stat(x, theor_data), correct)

    theor_data = np.random.logistic(0, 1, size=100)
    correct, _ = st.ks_2samp(x, theor_data)
    assert np.isclose(dcst.ks_stat(x, theor_data), correct)
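# The test above checks dcst.ks_stat() against scipy's two-sample K-S
# statistic. For reference, here is a minimal sketch of that statistic (the
# maximum vertical distance between the two ECDFs). This is an illustration
# only, not the dc_stat_think implementation, and _ks_stat_sketch is a
# hypothetical name.
def _ks_stat_sketch(data1, data2):
    data1, data2 = np.sort(data1), np.sort(data2)
    # Evaluate both ECDFs at every observed data point
    pts = np.concatenate((data1, data2))
    cdf1 = np.searchsorted(data1, pts, side='right') / len(data1)
    cdf2 = np.searchsorted(data2, pts, side='right') / len(data2)
    # The K-S statistic is the largest absolute difference between the ECDFs
    return np.max(np.abs(cdf1 - cdf2))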
def test_pandas_conversion(seed):
    df = pd.DataFrame({'a': [3, 2, 1, 4],
                       'b': [8, 6, 7, 5],
                       'c': [9.1, 10.1, 11.1, np.nan]})

    x, y = dcst.ecdf(df.loc[:, 'a'])
    assert (x == np.array([1, 2, 3, 4])).all()
    assert (y == np.array([0.25, 0.5, 0.75, 1.0])).all()

    x, y = dcst.ecdf(df.loc[:, 'c'])
    assert np.allclose(x, np.array([9.1, 10.1, 11.1]))
    assert np.allclose(y, np.array([1 / 3, 2 / 3, 1.0]))

    df = pd.DataFrame({
        'a': np.concatenate((np.random.normal(0, 1, size=10), [np.nan] * 990)),
        'b': np.random.normal(0, 1, size=1000)})
    correct, _ = st.ks_2samp(df['a'].dropna(), df['b'])
    assert np.isclose(dcst.ks_stat(df['a'], df['b']), correct)

    df = pd.DataFrame({
        'a': np.concatenate((np.random.normal(0, 1, size=80), [np.nan] * 20)),
        'b': np.random.normal(0, 1, size=100)})

    dcst_private._seed_numba(seed)
    correct = dcst.draw_bs_reps(df['a'].values, np.mean, size=100)
    dcst_private._seed_numba(seed)
    assert np.allclose(dcst.draw_bs_reps(df['a'], np.mean, size=100),
                       correct, atol=atol)

    dcst_private._seed_numba(seed)
    correct = dcst.draw_bs_reps(df['b'].values, np.mean, size=100)
    dcst_private._seed_numba(seed)
    assert np.allclose(dcst.draw_bs_reps(df['b'], np.mean, size=100),
                       correct, atol=atol)

    dcst_private._seed_numba(seed)
    correct = dcst.draw_perm_reps(df['a'].values, df['b'].values,
                                  dcst.diff_of_means, size=100)
    dcst_private._seed_numba(seed)
    assert np.allclose(dcst.draw_perm_reps(df['a'], df['b'],
                                           dcst.diff_of_means, size=100),
                       correct, atol=atol)
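# The test above exercises pandas Series input containing NaNs: the
# dc_stat_think functions are expected to accept a Series and ignore missing
# values. A minimal sketch of that kind of conversion (an assumption for
# illustration; the package's actual helper and its name may differ) could be:
def _convert_data_sketch(data):
    # Accept a pandas Series or a NumPy array, coerce to float, and drop NaNs
    if isinstance(data, pd.Series):
        data = data.values
    data = np.asarray(data, dtype=float)
    return data[~np.isnan(data)]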
def draw_ks_reps(n, f, args=(), size=10000, n_reps=10000):
    # Generate samples from target distribution
    x_f = f(*args, size=size)

    # Initialize K-S replicates
    reps = np.empty(n_reps)

    # Draw replicates
    for i in range(n_reps):
        # Draw samples for comparison
        x_samp = f(*args, size=n)

        # Compute K-S statistic
        reps[i] = dcst.ks_stat(x_samp, x_f)

    return reps
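# Hypothetical usage of draw_ks_reps(): 1,000 K-S replicates for samples of
# size 50 drawn against a standard Normal target (parameters chosen purely for
# illustration). The exercise below applies the same function to the
# Exponential model.
reps_normal = draw_ks_reps(50, np.random.normal, args=(0, 1),
                           size=10000, n_reps=1000)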
def the_ks_test_for_exponentiality(time_gap):
    mean_time_gap = np.mean(time_gap)

    # Draw target distribution: x_f
    x_f = np.random.exponential(mean_time_gap, 10000)

    # Compute K-S stat: d
    d = dcst.ks_stat(x_f, time_gap)

    # Draw K-S replicates: reps
    reps = dcst.draw_ks_reps(len(time_gap), np.random.exponential,
                             args=(mean_time_gap,), size=10000, n_reps=10000)

    # Compute and print p-value
    p_val = np.sum(reps >= d) / 10000
    print('p =', p_val)
# Standard deviation of the time gap: std_time_gap
std_time_gap = np.std(time_gap)

'''
INSTRUCTIONS

* Draw 10,000 replicates from the Exponential distribution using
  np.random.exponential(). The mean time gap between earthquakes is stored as
  mean_time_gap, which you computed in a previous exercise. Store the result
  in x_f.
* Use these samples, x_f, along with the actual time gaps, stored in time_gap,
  to compute the Kolmogorov-Smirnov statistic using dcst.ks_stat().
* Use the function you wrote in the last exercise, now conveniently stored as
  dcst.draw_ks_reps(), to draw 10,000 K-S replicates from the Exponential
  distribution. Use the size=10000 keyword argument for drawing out of the
  target Exponential distribution. Store the replicates as reps.
* Compute and print the p-value. Remember that "at least as extreme as" is
  defined in this case as the test statistic under the null hypothesis being
  greater than or equal to what was observed.
'''

# Draw target distribution: x_f
x_f = np.random.exponential(mean_time_gap, size=10000)

# Compute K-S stat: d
d = dcst.ks_stat(x_f, time_gap)

# Draw K-S replicates: reps
reps = dcst.draw_ks_reps(len(time_gap), np.random.exponential,
                         args=(mean_time_gap,), size=10000, n_reps=10000)

# Compute and print p-value
p_val = np.sum(reps >= d) / 10000
print('p =', p_val)

'''
p = 0.2199

That's a p-value above 0.2. This means that the Parkfield sequence is not
outside the realm of possibility if earthquakes there are a Poisson process.
This does not mean that they are generated by a Poisson process, but that the
observed sequence is not incongruous with that model. The upshot is that it is
really hard to say when the next Parkfield quake will be.
'''