Esempio n. 1
0
def instance2features(instance):
    """Build the flat feature list describing ``instance``.

    The module-level ``settings`` object decides which features are emitted:

    * ``settings.use_N_DIM`` -- when truthy, ``instance.dim`` and the number
      of entries in ``instance.points`` come first.
    * ``settings.distances`` -- for every entry, the ``mean`` and ``dev``
      values returned by ``dist_util.distance`` are added, followed by the
      outputs of the modality tests selected in ``settings.modality_tests``.
    * ``settings.additional_statistics`` -- extra statistics appended last
      (only ``'hopkins'`` is handled here).

    Returns:
        list: the collected feature values, in the order described above.
    """
    feats = []

    if settings.use_N_DIM:
        feats.extend((instance.dim, len(instance.points)))

    for metric in settings.distances:
        dists, avg, spread = dist_util.distance(instance, metric)
        feats.extend((avg, spread))

        for test_name in settings.modality_tests:
            if test_name == 'dip':
                # The dip test runs whenever it is listed, even if none of
                # its outputs end up being selected below.
                dip_out = diptest(dists)
                wanted = settings.modality_tests[test_name]
                if 'dip_stat' in wanted:
                    feats.append(dip_out[0])
                if 'p_value' in wanted:
                    feats.append(dip_out[1])
            elif (test_name == 'silverman'
                  and 'p_value' in settings.modality_tests[test_name]):
                # The Silverman test is fed a random draw of 250 distances
                # rather than the full set.
                subsample = np.random.choice(dists, 250)
                silverman_out = modality.silverman_bwtest(subsample,
                                                          alpha=0.05)
                assert isinstance(silverman_out, float)
                feats.append(silverman_out)

    feats.extend(
        hopkins(instance.points)
        for stat in settings.additional_statistics
        if stat == 'hopkins'
    )

    return feats
Esempio n. 2
0
def dip(peak, smooth=True):
    """Run a dip test on a peak signal.

    The diptest can be used to test if a distribution is unimodal. It works
    on samples, so the peak signal is first turned into a probability
    distribution over its positions, 3000 positions are drawn from it, and
    the test is run on that simulated sample. This is a little hackish;
    there is probably a better/faster way.

    Args:
        peak: 1-D sequence of signal values, one per position.
        smooth: when true, convolve the signal with a 10-point Hamming
            window before sampling.

    Returns:
        tuple: the two values returned by ``diptest`` (unpacked here as the
        test statistic and the p-value).

    Raises:
        ValueError: if the (smoothed) signal sums to zero or to a non-finite
            value, so it cannot be normalized into probabilities.
    """
    # Float array: guarantees true division below and lets plain lists in.
    profile = np.asarray(peak, dtype=float)

    if smooth:
        # The window functions live in ``signal.windows``; the top-level
        # ``signal.hamming`` alias was removed in SciPy 1.13.
        profile = signal.convolve(profile, signal.windows.hamming(10))

    # Set up x's: one grid position per (possibly smoothed) sample
    x_grid = np.arange(0, profile.shape[0])

    # Normalize the peak section to sum to 1
    total = profile.sum()
    if total == 0 or not np.isfinite(total):
        raise ValueError(
            "cannot normalize peak signal: values sum to %r" % (total,))
    norm = profile / total

    # Simulate data from the peak distribution
    sim = choice(x_grid, size=3000, replace=True, p=norm)

    # Run diptest
    test, pval = diptest(sim)

    return test, pval
Esempio n. 3
0
def dip(peak, smooth=True):
    """Run a dip test on a peak signal.

    The diptest can be used to test if a distribution is unimodal. It works
    on samples, so the peak signal is first turned into a probability
    distribution over its positions, 3000 positions are drawn from it, and
    the test is run on that simulated sample. This is a little hackish;
    there is probably a better/faster way.

    Args:
        peak: 1-D sequence of signal values, one per position.
        smooth: when true, convolve the signal with a 10-point Hamming
            window before sampling.

    Returns:
        tuple: the two values returned by ``diptest`` (unpacked here as the
        test statistic and the p-value).

    Raises:
        ValueError: if the (smoothed) signal sums to zero or to a non-finite
            value, so it cannot be normalized into probabilities.
    """
    # Float array: guarantees true division below and lets plain lists in.
    profile = np.asarray(peak, dtype=float)

    if smooth:
        # The window functions live in ``signal.windows``; the top-level
        # ``signal.hamming`` alias was removed in SciPy 1.13.
        profile = signal.convolve(profile, signal.windows.hamming(10))

    # Set up x's: one grid position per (possibly smoothed) sample
    x_grid = np.arange(0, profile.shape[0])

    # Normalize the peak section to sum to 1
    total = profile.sum()
    if total == 0 or not np.isfinite(total):
        raise ValueError(
            "cannot normalize peak signal: values sum to %r" % (total,))
    norm = profile / total

    # Simulate data from the peak distribution
    sim = choice(x_grid, size=3000, replace=True, p=norm)

    # Run diptest
    test, pval = diptest(sim)

    return test, pval
Esempio n. 4
0
def generate_scatter_dist_plot(articles,
                               num_ideas,
                               plot_dir,
                               prefix,
                               cooccur_func=None,
                               make_plots=True,
                               write_tests=True,
                               group_by="year",
                               samples=1000):
    """Compare idea-pair PMI with the correlation of their time series.

    PMI is computed from co-occurrence counts over all ``articles``; the
    time-series correlation is computed from per-group counts after grouping
    the articles with ``get_time_grouped_articles``. Every unordered pair
    ``(i, j)`` with ``i < j`` whose two scores are both finite contributes
    one point (x = correlation, y = PMI).

    Args:
        articles: article collection handed to ``get_count_cooccur`` and
            ``get_time_grouped_articles``.
        num_ideas: number of ideas; pairs are taken from ``range(num_ideas)``.
        plot_dir: directory for the output files.
        prefix: file-name prefix for the output files.
        cooccur_func: forwarded as ``func`` to ``get_count_cooccur``.
        make_plots: when true, save a joint plot to
            ``<plot_dir>/<prefix>_joint_plot.pdf``.
        write_tests: when true, write one JSON object per line with the
            statistical test results to ``<plot_dir>/<prefix>_test.jsonlist``.
        group_by: forwarded to ``get_time_grouped_articles``.
        samples: accepted for interface compatibility; not used here.

    Returns:
        tuple: ``(pmi, ts_correlation, filename)`` where ``filename`` is the
        joint-plot path (returned even when ``make_plots`` is false).
    """

    def none_if_masked(value):
        # Masked statistics are written out as JSON null.
        return None if np.ma.is_masked(value) else value

    result = get_count_cooccur(articles, func=cooccur_func)
    pmi = get_pmi(result["cooccur"],
                  result["count"],
                  float(result["articles"]),
                  num_ideas=num_ideas)
    articles_group = get_time_grouped_articles(articles, group_by=group_by)
    info_dict = {
        k: get_count_cooccur(articles_group[k], func=cooccur_func)
        for k in articles_group
    }
    ts_correlation = get_ts_correlation(info_dict, num_ideas, normalize=True)

    # Collect one (correlation, PMI) point per pair, skipping NaN/inf scores.
    xs, ys = [], []
    for i in range(num_ideas):
        for j in range(i + 1, num_ideas):
            corr_ij, pmi_ij = ts_correlation[i, j], pmi[i, j]
            if np.isnan(pmi_ij) or np.isnan(corr_ij):
                continue
            if np.isinf(pmi_ij) or np.isinf(corr_ij):
                continue
            xs.append(corr_ij)
            ys.append(pmi_ij)

    if write_tests:
        with open("%s/%s_test.jsonlist" % (plot_dir, prefix), "w") as fout:

            def write_record(record):
                fout.write("%s\n" % json.dumps(record))

            k, p = ss.mstats.normaltest(xs)
            write_record({
                "name": "correlation normality test",
                "k2": none_if_masked(k),
                "p-value": p
            })
            k, p = ss.mstats.normaltest(ys)
            write_record({
                "name": "PMI normality test",
                "k2": none_if_masked(k),
                "p-value": p
            })
            # NOTE(review): ``unimodality_test`` is not defined in this
            # function; presumably a module-level flag -- confirm.
            if unimodality_test:
                d, p = diptest.diptest(np.array(xs))
                write_record({
                    "name": "correlation unimodality test",
                    # Check the dip statistic itself; the previous code
                    # tested the stale normality statistic ``k`` instead.
                    "d": none_if_masked(d),
                    "p-value": p
                })
                d, p = diptest.diptest(np.array(ys))
                write_record({
                    "name": "PMI unimodality test",
                    "d": none_if_masked(d),
                    "p-value": p
                })
            c, p = ss.pearsonr(xs, ys)
            write_record({
                "name": "correlation between correlation and PMI",
                "coef": c,
                "p-value": p
            })

    filename = "%s/%s_joint_plot.pdf" % (plot_dir, prefix)
    if make_plots:
        fig = pf.joint_plot(np.array(xs),
                            np.array(ys),
                            xlabel="prevalence correlation",
                            ylabel="cooccurrence",
                            xlim=(-1, 1))
        pf.savefig(fig, filename)
    return pmi, ts_correlation, filename
Esempio n. 5
0
from src.gen import break_gen
from src.dist_util import distance
from diptest.diptest import diptest
import sys
import matplotlib.pyplot as plt

# Search randomly generated instances for the one where the dip-test result
# under the alternative metric falls furthest below the Euclidean one, and
# plot every new record as it is found.
best = 0
other = 'minkowski'
for _ in range(100000):
    inst = break_gen()

    distances_euclid, _, _ = distance(inst, 'eucld')
    distances_other, _, _ = distance(inst, other)
    out = diptest(distances_euclid)
    out2 = diptest(distances_other)

    # Difference of the second diptest outputs (shown as "p values" below).
    cur = out2[1] - out[1]
    if not cur < best:
        continue

    print(out, out2)
    title = ("p values --- Euclid = " + str(round(out[1], 2)) + ", " +
             other + " = " + str(round(out2[1], 2)))

    # First figure: the generated points themselves.
    plt.title(title)
    plt.scatter(inst.points[:, 0], inst.points[:, 1])
    plt.show()

    # Second figure: both distance distributions in one histogram.
    plt.hist([distances_euclid, distances_other],
             label=['euclid', other],
             bins=30)
    plt.legend()
    plt.title(title)
    plt.show()

    # Reached only when ``cur < best``, so this is the new minimum.
    best = cur
Esempio n. 6
0
def ackerman_dist(instance):
    """Apply the dip test to the distances of ``instance``.

    At most 70000 distances are tested; larger sets are reduced to a random
    draw of that size via ``np.random.choice``.

    Returns:
        tuple: ``(flag, value)`` where ``value`` is the second element of
        the ``diptest`` result (presumably the p-value -- confirm) and
        ``flag`` is whether it is below the module-level
        ``ackerman_cutoff``.
    """
    max_samples = 70000

    sample = distance(instance)
    if len(sample) > max_samples:
        sample = np.random.choice(sample, max_samples)

    p_value = diptest(sample)[1]
    return p_value < ackerman_cutoff, p_value