import numpy as np
import pandas as pd
import scipy.stats as st

import dc_stat_think as dcst


def test_ks_stat(x):
    theor_data = np.random.normal(0, 1, size=100)
    correct, _ = st.ks_2samp(x, theor_data)
    assert np.isclose(dcst.ks_stat(x, theor_data), correct)

    theor_data = np.random.exponential(1, size=100)
    correct, _ = st.ks_2samp(x, theor_data)
    assert np.isclose(dcst.ks_stat(x, theor_data), correct)

    theor_data = np.random.logistic(0, 1, size=100)
    correct, _ = st.ks_2samp(x, theor_data)
    assert np.isclose(dcst.ks_stat(x, theor_data), correct)
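
# Illustrative direct invocation with a synthetic sample (an assumption for
# demonstration; in the package's own test suite, x would typically be
# supplied by a hypothesis strategy or fixture rather than passed by hand).
test_ks_stat(np.random.normal(0, 1, size=100))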
# dcst_private (exposing the private _seed_numba helper) and atol (numerical
# tolerance) are assumed to be defined elsewhere in the original test module.
def test_pandas_conversion(seed):
    df = pd.DataFrame({
        'a': [3, 2, 1, 4],
        'b': [8, 6, 7, 5],
        'c': [9.1, 10.1, 11.1, np.nan]
    })

    x, y = dcst.ecdf(df.loc[:, 'a'])
    assert (x == np.array([1, 2, 3, 4])).all()
    assert (y == np.array([0.25, 0.5, 0.75, 1.0])).all()

    x, y = dcst.ecdf(df.loc[:, 'c'])
    assert np.allclose(x, np.array([9.1, 10.1, 11.1]))
    assert np.allclose(y, np.array([1 / 3, 2 / 3, 1.0]))

    df = pd.DataFrame({
        'a': np.concatenate((np.random.normal(0, 1, size=10),
                             [np.nan] * 990)),
        'b': np.random.normal(0, 1, size=1000)
    })
    correct, _ = st.ks_2samp(df['a'].dropna(), df['b'])
    assert np.isclose(dcst.ks_stat(df['a'], df['b']), correct)

    df = pd.DataFrame({
        'a': np.concatenate((np.random.normal(0, 1, size=80),
                             [np.nan] * 20)),
        'b': np.random.normal(0, 1, size=100)
    })
    dcst_private._seed_numba(seed)
    correct = dcst.draw_bs_reps(df['a'].values, np.mean, size=100)
    dcst_private._seed_numba(seed)
    assert np.allclose(dcst.draw_bs_reps(df['a'], np.mean, size=100),
                       correct,
                       atol=atol)

    dcst_private._seed_numba(seed)
    correct = dcst.draw_bs_reps(df['b'].values, np.mean, size=100)
    dcst_private._seed_numba(seed)
    assert np.allclose(dcst.draw_bs_reps(df['b'], np.mean, size=100),
                       correct,
                       atol=atol)

    dcst_private._seed_numba(seed)
    correct = dcst.draw_perm_reps(df['a'].values,
                                  df['b'].values,
                                  dcst.diff_of_means,
                                  size=100)
    dcst_private._seed_numba(seed)
    assert np.allclose(dcst.draw_perm_reps(df['a'],
                                           df['b'],
                                           dcst.diff_of_means,
                                           size=100),
                       correct,
                       atol=atol)
def draw_ks_reps(n, f, args=(), size=10000, n_reps=10000):
    # Generate samples from target distribution
    x_f = f(*args, size=size)

    # Initialize K-S replicates
    reps = np.empty(n_reps)

    # Draw replicates
    for i in range(n_reps):
        # Draw samples for comparison
        x_samp = f(*args, size=n)

        # Compute K-S statistic
        reps[i] = dcst.ks_stat(x_samp, x_f)

    return reps
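
# A minimal usage sketch of draw_ks_reps (not from the original exercise):
# draw K-S replicates against a standard-normal target distribution. The
# sample size and replicate count below are illustrative choices.
reps_normal = draw_ks_reps(50, np.random.normal, args=(0, 1),
                           size=10000, n_reps=1000)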
# Full K-S-test-for-exponentiality workflow wrapped as a function; time_gap
# is an array of observed inter-event times (here, gaps between earthquakes).
def the_ks_test_for_exponentiality(time_gap):
    mean_time_gap = np.mean(time_gap)

    # Draw target distribution: x_f
    x_f = np.random.exponential(mean_time_gap, 10000)

    # Compute K-S stat: d
    d = dcst.ks_stat(x_f, time_gap)

    # Draw K-S replicates: reps
    reps = dcst.draw_ks_reps(len(time_gap),
                             np.random.exponential,
                             args=(mean_time_gap, ),
                             size=10000,
                             n_reps=10000)

    # Compute and print p-value
    p_val = np.sum(reps >= d) / 10000
    print('p =', p_val)
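
# Illustrative call on synthetic data (an assumption, not part of the
# original exercise): with truly exponential gaps, the printed p-value
# should usually not be small.
simulated_gaps = np.random.exponential(25.0, size=50)
the_ks_test_for_exponentiality(simulated_gaps)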
# Standard deviation of the time gap: std_time_gap
std_time_gap = np.std(time_gap)
'''
INSTRUCTIONS

*   Draw 10,000 replicates from the Exponential distribution using np.random.exponential(). The mean time gap between earthquakes is stored as mean_time_gap, which you computed in a previous exercise. Store the result in x_f.
*   Use these samples, x_f, along with the actual time gaps, stored in time_gap, to compute the Kolmogorov-Smirnov statistic using dcst.ks_stat().
*   Use the function you wrote in the last exercise, now conveniently stored as dcst.draw_ks_reps() to draw 10,000 K-S replicates from the Exponential distribution. Use the size=10000 keyword argument for drawing out of the target Exponential distribution. Store the replicates as reps.
*   Compute and print the p-value. Remember that "at least as extreme as" is defined in this case as the test statistic under the null hypothesis being greater than or equal to what was observed.
'''

# Draw target distribution: x_f
x_f = np.random.exponential(mean_time_gap, size=10000)

# Compute K-S stat: d
d = dcst.ks_stat(x_f, time_gap)

# Draw K-S replicates: reps
reps = dcst.draw_ks_reps(len(time_gap),
                         np.random.exponential,
                         args=(mean_time_gap, ),
                         size=10000,
                         n_reps=10000)

# Compute and print p-value
p_val = np.sum(reps >= d) / 10000
print('p =', p_val)
'''
p = 0.2199

That's a p-value above 0.2. This means that the Parkfield sequence is not outside the realm of possibility if earthquakes there are a Poisson process. This does not mean that they are generated by a Poisson process, but that the observed sequence is not incongruous with that model. The upshot is that it is really hard to say when the next Parkfield quake will be.
'''