Exemple #1
0
def mean_vs_pooled(rank_dist, freq_dist, save_dir):
    """Plot pooled rank-frequency observations with the reduced ones on top.

    Two hexbin layers are drawn into the current matplotlib figure: first
    every individual (rank, frequency) observation with a positive
    frequency, then (in red, labelled "mean") the relation obtained after
    passing both inputs through ``reduce_pooled``. The figure is saved as
    ``rank_freq_mean_vs_var.png`` and closed.

    Args:
        rank_dist: pooled ranks, accepted by ``merge_to_joint`` and
            ``reduce_pooled``  (presumably word -> list of ranks; confirm).
        freq_dist: pooled frequencies, same structure as ``rank_dist``.
        save_dir: path prefix the file name is appended to verbatim, so it
            needs its own trailing separator.
    """
    rank_label = r"$\log$ $r(w)$"
    freq_label = r"$\log$ $f(w)$"

    # Pooled layer: flatten the per-entry (ranks, freqs) list pairs into
    # single observations and drop those with a non-positive frequency.
    pooled_joint = merge_to_joint(rank_dist, freq_dist)
    observations = []
    for rank_ls, freq_ls in pooled_joint.values():
        for rank, freq in zip(rank_ls, freq_ls):
            if freq > 0:
                observations.append((rank, freq))
    pooled_xs, pooled_ys = list(zip(*observations))

    hexbin_plot(pooled_xs, pooled_ys, xlbl=rank_label, ylbl=freq_label,
                min_y=1)

    # Reduced layer: one (rank, frequency) pair per entry, sorted by rank.
    reduced_ranks = reduce_pooled(rank_dist)
    reduced_freqs = reduce_pooled(freq_dist)
    reduced_joint = merge_to_joint(reduced_ranks, reduced_freqs)
    reduced_pairs = sorted(reduced_joint.values())
    reduced_xs, reduced_ys = list(zip(*reduced_pairs))

    hexbin_plot(reduced_xs, reduced_ys,
                xlbl=rank_label, ylbl=freq_label,
                color="red", edgecolors="red", cmap="Reds_r",
                cbar=False, min_y=1, label="mean")

    plt.legend()
    plt.savefig(save_dir + "rank_freq_mean_vs_var.png", dpi=300)
    plt.close()
Exemple #2
0
def convergence_main(wiki, rng, m, save_dir="./"):
    """Plot how the mean rank-frequency relation changes with sample size.

    For every sample size ``n`` in ``rng`` the mean rank/frequency relation
    is obtained from ``get_mean_relationship`` and drawn as one hexbin layer.
    Two figures are produced: one based on ``compute_freqs`` (each size also
    gets a fit via ``do_mle``, written to a text file) and one based on
    ``compute_normalised_freqs``.

    Args:
        wiki: corpus handed through to ``get_mean_relationship``.
        rng: re-iterable sequence of sample sizes; it is iterated several
            times and its elements are used in the output file names.
        m: passed through to ``get_mean_relationship``
            (presumably the number of subsamples per size; confirm).
        save_dir: path prefix prepended verbatim to all output file names.
    """
    # All three output files share the same "<n1>_<n2>_..." tag.
    rng_tag = "_".join(map(str, rng))

    # The context manager guarantees the MLE report is closed even if
    # sampling, plotting or fitting raises part-way through the loop.
    with open(save_dir + "mle_mandelbrot_convergence_" + rng_tag + ".txt",
              "w") as handle:
        for i, n in enumerate(rng):
            mean_ranks, mean_freqs = get_mean_relationship(
                wiki, n, m, compute_freqs)
            joints = merge_to_joint(mean_ranks, mean_freqs)
            xs, ys = list(zip(*joints.values()))

            hexbin_plot(xs,
                        ys,
                        xlbl=r"$\log$ $r(w)$",
                        ylbl=r"$\log$ $f(w)$",
                        edgecolors=colour_palette[i],
                        color=colour_palette[i],
                        label=format_scientific(n),
                        # later (i.e. higher-index) layers are more transparent
                        alpha=1 / (i + 1)**.3,
                        linewidths=1.0,
                        # only the first layer gets a colour bar
                        cbar=(i == 0),
                        min_y=1)

            do_mle(xs, ys, n, handle)

    plt.legend()
    plt.savefig(save_dir + "convergence_" + rng_tag + ".png", dpi=300)
    plt.close()

    # Same plot again with normalised frequencies; no MLE for this variant.
    for i, n in enumerate(rng):
        mean_ranks, mean_freqs = get_mean_relationship(
            wiki, n, m, compute_normalised_freqs)
        joints = merge_to_joint(mean_ranks, mean_freqs)
        xs, ys = list(zip(*joints.values()))

        hexbin_plot(xs,
                    ys,
                    xlbl=r"$\log$ $r(w)$",
                    ylbl=r"$\log$ $P(w)$",
                    edgecolors=colour_palette[i],
                    color=colour_palette[i],
                    label=format_scientific(n),
                    alpha=1 / (i + 1)**.3,
                    linewidths=1.0,
                    cbar=(i == 0),
                    # lower y bound scales with the sample size
                    min_y=1 / n)

    plt.legend()
    plt.savefig(save_dir + "convergence_probs_" + rng_tag + ".png", dpi=300)
    plt.close()
Exemple #3
0
def within_filter_plots(sample_dict, show=True, mle_dict=None):
    """Draw one rank-frequency hexbin layer per filter parameter.

    Args:
        sample_dict: mapping from a filter parameter to a list of samples;
            each list is reduced with ``mean_rank_freq_from_samples``.
        show: if true, call ``plt.show()`` once all layers are drawn.
        mle_dict: optional mapping from parameter to a fitted model; for
            parameters present in it the model predictions are overlaid via
            ``plot_preds`` in the same colour as the data.

    Returns:
        The limits accumulated over all layers with ``get_greater_lims``,
        or ``None`` when ``sample_dict`` is empty.
    """
    plot_lims = None
    for i, (param, sample_ls) in enumerate(sample_dict.items()):
        mean_ranks, mean_freqs = mean_rank_freq_from_samples(sample_ls)
        joints = merge_to_joint(mean_ranks, mean_freqs)
        xs, ys = list(zip(*sorted(joints.values())))

        # Raw strings: "\l" is not a valid escape sequence in a normal
        # literal (same runtime value, but no Deprecation/SyntaxWarning).
        cur_plot_lims = hexbin_plot(xs, ys,
                                    xlbl=r"$\log$ $r(w)$",
                                    ylbl=r"$\log$ $f(w)$",
                                    label=str(param),
                                    color=colour_palette[i],
                                    edgecolors=colour_palette[i],
                                    linewidths=1.0, lims=None, min_y=1,
                                    cbar=False)
        if mle_dict and param in mle_dict:
            mandelbrot = mle_dict[param]
            plot_preds(mandelbrot, np.asarray(xs), color=colour_palette[i])

        plot_lims = get_greater_lims(plot_lims, cur_plot_lims)
        print(plot_lims)

    # With an empty sample_dict nothing was plotted and plot_lims is still
    # None; indexing it would raise a TypeError.
    if plot_lims is not None:
        plt.xlim(plot_lims[0])
        plt.ylim(plot_lims[1])
    plt.legend()
    if show:
        plt.show()

    return plot_lims
Exemple #4
0
def zipf_wrong(wiki, n, d):
    """Plot ranks against frequencies computed from the same subsample.

    A single subsample is drawn with ``Articles.subsample`` and used for
    both ``compute_ranks`` and ``compute_freqs``. The figure is saved as
    ``rank_freq_<n>_wrong.png``.

    Args:
        wiki: corpus to subsample from.
        n: size argument passed to ``Articles.subsample``; also used in the
            output file name.
        d: path prefix prepended verbatim to the output file name.
    """
    subcorp = Articles.subsample(wiki, n)

    ranks, freqs = compute_ranks(subcorp), compute_freqs(subcorp)

    joints = merge_to_joint(ranks, freqs)
    xs, ys = list(zip(*sorted(joints.values())))

    # Raw strings: "\l" is an invalid escape sequence in a normal literal.
    hexbin_plot(xs, ys, xlbl=r"$\log$ $r(w)$", ylbl=r"$\log$ $f(w)$")
    plt.savefig(d + "rank_freq_" + str(n) + "_wrong.png", dpi=300)
    plt.close()
Exemple #5
0
 def filter_worker(i):
     """Run one filtering pass, report its typicality and pickle the result.

     Relies on names from the enclosing scope (``mp_array``, ``zipf_model``,
     ``rank_dict``, ``auto_typ``, ``n``, ``factor``, ``epsilon_f_minus``,
     ``lt``, ``lang``).

     Args:
         i: index of this run; used in the log output and the pickle name.
     """
     print("started ", i)

     # Seed the RNG from fresh OS entropy before filtering
     # (presumably so parallel workers do not share a random stream; confirm).
     entropy = os.urandom(4)
     rand.seed(int.from_bytes(entropy, byteorder='little'))

     threshold = factor*epsilon_f_minus
     kept = list(
         filter_typicality_incremental(mp_array, zipf_model, rank_dict,
                                       auto_typ, n, threshold, lt))

     kept_freqs = compute_freqs(Sentences(kept))
     kept_typicality = typicality(zipf_model,
                                  merge_to_joint(rank_dict, kept_freqs))
     print("filtered ", i, " typicality: ", kept_typicality)

     pickle_name = "_".join(map(str, (n, factor, i)))
     corpus_to_pickle(kept, "results/" + lang + "/TF", pickle_name)
Exemple #6
0
def _fit_mean_mandelbrot(sample_ls):
    """Fit a ``Mandelbrot`` model to the mean rank-frequency relation of samples."""
    mean_ranks, mean_freqs = mean_rank_freq_from_samples(sample_ls)
    joints = merge_to_joint(mean_ranks, mean_freqs)
    xs, ys = list(zip(*sorted(joints.values())))
    # Note the argument order: frequencies first, ranks second.
    mandelbrot = Mandelbrot(ys, xs)
    mandelbrot_fit = mandelbrot.fit(start_params=np.asarray([1.0, 1.0]),
                                    method="powell",
                                    full_output=True)
    mandelbrot.register_fit(mandelbrot_fit)
    return mandelbrot


def do_mles(tf_samples, srf_samples, uni_samples):
    """Fit Mandelbrot models for all TF, SRF and UNI sample sets.

    Args:
        tf_samples: mapping from TF parameter to a list of samples.
        srf_samples: mapping from SRF parameter to a list of samples.
        uni_samples: a single list of samples.

    Returns:
        A tuple ``(tf_mles, srf_mles, uni_mandelbrot)``: two dicts mapping
        each parameter to its fitted model, and the fitted model for
        ``uni_samples``.
    """
    tf_mles = {}
    for param, sample_ls in tf_samples.items():
        print("\n TF", str(param))
        tf_mles[param] = _fit_mean_mandelbrot(sample_ls)

    srf_mles = {}
    for param, sample_ls in srf_samples.items():
        print("\n SRF", str(param))
        srf_mles[param] = _fit_mean_mandelbrot(sample_ls)

    uni_mandelbrot = _fit_mean_mandelbrot(uni_samples)

    return tf_mles, srf_mles, uni_mandelbrot
Exemple #7
0
def zipf_piantadosi(wiki, n, d):
    """Plot ranks and frequencies taken from two separate subsamples.

    Two subsamples are drawn with ``Words.subsample``; ranks come from the
    first and frequencies from the second. The figure is saved as
    ``rank_freq_<n>_piantadosi.png``.

    Args:
        wiki: corpus to subsample from.
        n: size argument passed to ``Words.subsample``; also used in the
            output file name.
        d: path prefix prepended verbatim to the output file name.
    """
    subcorp1 = Words.subsample(wiki, n)
    subcorp2 = Words.subsample(wiki, n)

    ranks = compute_ranks(subcorp1)
    freqs = compute_freqs(subcorp2)

    joints = merge_to_joint(ranks, freqs)
    xs, ys = list(zip(*sorted(joints.values())))

    # Raw strings: "\l" is an invalid escape sequence in a normal literal.
    hexbin_plot(xs, ys, xlbl=r"$\log$ $r(w)$", ylbl=r"$\log$ $f(w)$")
    plt.savefig(d + "rank_freq_" + str(n) + "_piantadosi.png", dpi=300)
    plt.close()
Exemple #8
0
def establish_typical_set(corpus, rank_dict, zipf_model, n, m):
    """Estimate mean and spread of typicality over repeated subsamples.

    Args:
        corpus: corpus handed to ``Sentences.subsample``.
        rank_dict: ranks merged with each subsample's frequencies.
        zipf_model: model passed to ``typicality``.
        n: size argument for each subsample.
        m: number of subsamples drawn.

    Returns:
        ``(mean_typ, std_typ)``: the mean of the ``m`` typicality values and
        the square root of their variance (``np.var`` defaults).
    """
    def one_typicality():
        # Draw a fresh subsample and score it against the model.
        subsample = Sentences.subsample(corpus, n)
        joint = merge_to_joint(rank_dict, compute_freqs(subsample))
        return typicality(zipf_model, joint)

    scores = [one_typicality() for _ in range(m)]

    mean_typ = np.mean(scores)
    std_typ = np.var(scores)**.5
    return mean_typ, std_typ
Exemple #9
0
def do_mles(ranks, freqs, save_dir):
    """Fit one Mandelbrot model per (ranks, freqs) pair and log the results.

    Args:
        ranks: iterable of rank mappings.
        freqs: iterable of frequency mappings, paired with ``ranks`` by
            position via ``zip``.
        save_dir: path prefix prepended verbatim to
            ``mle_mandelbrot_point_estimates.txt``.
    """
    report_path = save_dir + "mle_mandelbrot_point_estimates.txt"

    with open(report_path, "w") as handle:
        for r_dict, f_dict in zip(ranks, freqs):
            pairs = sorted(merge_to_joint(r_dict, f_dict).values())
            xs, ys = list(zip(*pairs))

            # Frequencies first, ranks second.
            model = Mandelbrot(ys, xs)
            fit_result = model.fit(start_params=np.asarray([1.0, 1.0]),
                                   method="powell",
                                   full_output=True)
            model.register_fit(fit_result)

            # One result block per pair, each followed by a newline.
            handle.write(model.print_result(string=True) + "\n")
Exemple #10
0
def get_reference_dist(wiki, n=10000000, m=10):
    """Fit a reference Mandelbrot model on the mean of several subsamples.

    Args:
        wiki: iterable corpus; it is materialised into a list once so that
            it can be subsampled ``m`` times.
        n: size argument for each ``Sentences.subsample`` call. The default
            equals the previously hard-coded ``int(10e6)``, i.e. ten million.
        m: number of subsamples to draw (default 10, as before).

    Returns:
        ``(mandelbrot, mean_ranks)``: the fitted model and the mean ranks
        it was fitted on.
    """
    wiki_ls = list(wiki)

    subsamples = [Sentences.subsample(wiki_ls, n) for _ in range(m)]
    mean_ranks, mean_freqs = mean_rank_freq_from_samples(subsamples)
    joints = merge_to_joint(mean_ranks, mean_freqs)
    xs, ys = list(zip(*sorted(joints.values())))

    # Frequencies first, ranks second.
    mandelbrot = Mandelbrot(ys, xs)
    mandelbrot_fit = mandelbrot.fit(start_params=np.asarray([1.0, 1.0]),
                                    method="powell",
                                    full_output=True)
    mandelbrot.register_fit(mandelbrot_fit)
    mandelbrot.print_result()
    return mandelbrot, mean_ranks
Exemple #11
0
def get_model(corpus, n):
    """Fit a Mandelbrot model and compute its typicality on its own data.

    Ranks and frequencies come from two separate ``Sentences.subsample``
    draws (ranks are drawn first).

    Args:
        corpus: corpus to subsample from.
        n: size argument for both subsamples.

    Returns:
        ``(big_ranks, mandelbrot, auto_typ)``: the ranks, the fitted model,
        and the typicality of the joint data the model was fitted on.
    """
    rank_sample = Sentences.subsample(corpus, n)
    big_ranks = compute_ranks(rank_sample)
    freq_sample = Sentences.subsample(corpus, n)
    sample_freqs = compute_freqs(freq_sample)

    joint = merge_to_joint(big_ranks, sample_freqs)
    rank_values, freq_values = list(zip(*sorted(joint.values())))

    # Frequencies first, ranks second.
    model = Mandelbrot(freq_values, rank_values)
    fit_result = model.fit(start_params=np.asarray([1.0, 1.0]),
                           method="powell",
                           full_output=True)
    model.register_fit(fit_result)
    model.print_result()

    return big_ranks, model, typicality(model, joint)
Exemple #12
0
def covariance_across_words(rank_dist, freq_dist, save_dir):
    """Plot a per-word dispersion value against the word's mean rank.

    For every word in the joint of ``rank_dist`` and ``freq_dist`` the
    covariance matrix of its rank and frequency lists is computed; the
    plotted value is the off-diagonal entry (rank-frequency covariance)
    divided by the word's reduced rank. The figure is saved as
    ``dispersion.png``.

    Args:
        rank_dist: pooled ranks (presumably word -> list of ranks; confirm).
        freq_dist: pooled frequencies, same structure.
        save_dir: path prefix prepended verbatim to the output file name.
    """
    joints = merge_to_joint(rank_dist, freq_dist)
    mean_ranks = reduce_pooled(rank_dist)

    def equalize_len(ls1, ls2):
        # np.cov needs equally long inputs: truncate both to the shorter one.
        shared_len = min(len(ls1), len(ls2))
        return ls1[:shared_len], ls2[:shared_len]

    cov_dict = {
        w: np.cov(*equalize_len(r_ls, f_ls))
        for w, (r_ls, f_ls) in joints.items()
    }

    fano_factor_dict = {
        w: cov_mat[0][1] / mean_ranks[w]
        for w, cov_mat in cov_dict.items()
    }

    # (word, mean rank) pairs in order of ascending mean rank.
    words_sorted = sorted(mean_ranks.items(), key=lambda tup: tup[1])

    xs, ys = list(
        zip(*[(r, fano_factor_dict[w]) for w, r in words_sorted
              if w in cov_dict]))

    hexbin_plot(xs,
                ys,
                log=False,
                xscale="log",
                bins="log",
                # raw string: "\o" is an invalid escape in a normal literal
                xlbl=r"$\overline{r}(w)$",
                ylbl="$D(w)$",
                ignore_zeros=False,
                gridsize=100)

    plt.savefig(save_dir + "dispersion.png", dpi=300)
    plt.close()
Exemple #13
0
def samples_to_typicality(samples, ref_dist, rank_dict):
    """Return the typicality of every sample under ``ref_dist``.

    Args:
        samples: iterable of samples accepted by ``compute_freqs``.
        ref_dist: model passed as first argument to ``typicality``.
        rank_dict: ranks merged with each sample's frequencies.

    Returns:
        A list with one typicality value per sample, in input order.
    """
    # Three separate passes, as before: all frequencies, then all joints,
    # then all typicalities.
    freq_dicts = list(map(compute_freqs, samples))

    joint_dicts = []
    for freq_dict in freq_dicts:
        joint_dicts.append(merge_to_joint(rank_dict, freq_dict))

    scores = []
    for joint in joint_dicts:
        scores.append(typicality(ref_dist, joint))
    return scores
Exemple #14
0
    srfs = get_filters(d + "SRF/", k, ["k", "h", "i"], "h", hist_lens)
    tfs = get_filters(d + "TF/", k, ["k", "f", "i"], "f", factors)

    highest_three_factors = factors[-3:]
    three_tfs = {k: tfs[k] for k in highest_three_factors}
    highest_three_hist_lens = hist_lens[-3:]
    three_srfs = {k: srfs[k] for k in highest_three_hist_lens}

    unis = [
        Sentences(c)
        for _, c in corpora_from_pickles(d + "UNI", names=["k", "i"])
    ]

    uni_mean_ranks, uni_mean_freqs = mean_rank_freq_from_samples(unis)
    uni_joints = merge_to_joint(uni_mean_ranks, uni_mean_freqs)
    uni_xs, uni_ys = list(zip(*sorted(uni_joints.values())))

    print("filters loaded", flush=True)

    # MLEs
    tf_mles, srf_mles, uni_mandel = do_mles(tfs, srfs, unis)

    with open(results_d + "mle_mandelbrot.txt", "w") as handle:
        for param, mandel in tf_mles.items():
            handle.write("TF " + str(param))
            handle.write("\n")
            handle.write(mandel.print_result(string=True))
            handle.write("\n\n")
        for param, mandel in srf_mles.items():
            handle.write("SRF " + str(param))
Exemple #15
0
def _sorted_joint_xy(first, second):
    """Merge two per-word mappings; return the sorted joint values as (xs, ys)."""
    joint = merge_to_joint(first, second)
    xs, ys = list(zip(*sorted(joint.values())))
    return xs, ys


def _compare_sampling_levels(art_vals, sent_vals, word_vals, xlbl, ylbl,
                             fig_path, report_path):
    """Plot texts-vs-words values and report Spearman correlations for all level pairs."""
    xs, ys = _sorted_joint_xy(art_vals, word_vals)

    hexbin_plot(xs, ys, xlbl=xlbl, ylbl=ylbl)
    plt.savefig(fig_path, dpi=300)
    plt.close()

    correlations = [
        ("Articles-Words:", scistats.spearmanr(xs, ys)),
        ("Articles-Sentences:",
         scistats.spearmanr(*_sorted_joint_xy(art_vals, sent_vals))),
        ("Sentences-Words:",
         scistats.spearmanr(*_sorted_joint_xy(sent_vals, word_vals))),
    ]

    # One tab-separated line per pair: label, correlation, p-value.
    with open(report_path, "w") as handle:
        for label, corr in correlations:
            handle.write("\t".join(
                [label, str(corr.correlation), str(corr.pvalue)]))
            handle.write("\n")


def sampling_levels_main(wiki, n, m, save_dir="./"):
    """Compare rank-frequency relations across article, sentence and word sampling.

    Steps, in order:
      1. Mean relations are obtained per level with ``get_mean_relationship``;
         the article and word levels are plotted into one figure, and all
         three levels are passed to ``do_mle``.
      2. Mean frequencies of the levels are compared: a texts-vs-words plot
         plus a text file with pairwise Spearman correlations.
      3. The same comparison is done for mean ranks.

    Args:
        wiki: corpus handed through to ``get_mean_relationship``.
        n: passed to ``get_mean_relationship``; also used in file names.
        m: passed to ``get_mean_relationship``
            (presumably the number of subsamples; confirm).
        save_dir: path prefix prepended verbatim to all output file names.
    """
    art_mean_ranks, art_mean_freqs = get_mean_relationship(
        Articles, wiki, n, m)
    art_xs, art_ys = _sorted_joint_xy(art_mean_ranks, art_mean_freqs)

    hexbin_plot(art_xs,
                art_ys,
                xlbl=r"$\log$ $r(w)$",
                ylbl=r"$\log$ $f(w)$",
                label="texts",
                min_y=1)

    do_mle(art_xs, art_ys, Articles, save_dir)

    # The sentence level is fitted but not drawn into the figure.
    sent_mean_ranks, sent_mean_freqs = get_mean_relationship(
        Sentences, wiki, n, m)
    sent_xs, sent_ys = _sorted_joint_xy(sent_mean_ranks, sent_mean_freqs)

    do_mle(sent_xs, sent_ys, Sentences, save_dir)

    word_mean_ranks, word_mean_freqs = get_mean_relationship(Words, wiki, n, m)
    word_xs, word_ys = _sorted_joint_xy(word_mean_ranks, word_mean_freqs)

    hexbin_plot(word_xs,
                word_ys,
                xlbl=r"$\log$ $r(w)$",
                ylbl=r"$\log$ $f(w)$",
                color="red",
                edgecolors="red",
                cmap="Reds_r",
                label="words",
                cbar=False,
                min_y=1)

    do_mle(word_xs, word_ys, Words, save_dir)

    plt.legend()
    plt.savefig(save_dir + "rank_freq_word_vs_article_" + str(n) + ".png",
                dpi=300)
    plt.close()

    _compare_sampling_levels(
        art_mean_freqs, sent_mean_freqs, word_mean_freqs,
        xlbl=r"$\log$ $f(w)$ from texts",
        ylbl=r"$\log$ $f(w)$ from words",
        fig_path=save_dir + "freq_correl_word_vs_article_" + str(n) + ".png",
        report_path=save_dir + "freq_sampling_level_correlations.txt")

    _compare_sampling_levels(
        art_mean_ranks, sent_mean_ranks, word_mean_ranks,
        xlbl=r"$\log$ $r(w)$ from texts",
        ylbl=r"$\log$ $r(w)$ from words",
        fig_path=save_dir + "rank_correl_word_vs_article_" + str(n) + ".png",
        report_path=save_dir + "rank_sampling_level_correlations.txt")
Exemple #16
0
def across_filter_plots(tf_samples,
                        srf_samples,
                        f,
                        h,
                        uni_samples,
                        show=False):
    """Overlay the mean rank-frequency relations of UNIF, TF and SRF samples.

    Three hexbin layers are drawn in the order UNIF, TF, SRF, and the axes
    are set to the limits accumulated with ``get_greater_lims``.

    Args:
        tf_samples: samples for the TF layer.
        srf_samples: samples for the SRF layer.
        f: value shown in the TF legend label.
        h: value shown in the SRF legend label.
        uni_samples: samples for the UNIF layer.
        show: if true, call ``plt.show()`` at the end.
    """
    # Each sample set is reduced exactly once. (The reduction used to be
    # run twice per set, with the first results never used.)
    tf_mean_rf = mean_rank_freq_from_samples(tf_samples)
    srf_mean_rf = mean_rank_freq_from_samples(srf_samples)
    uni_mean_rf = mean_rank_freq_from_samples(uni_samples)

    # (mean ranks/freqs, layer-specific plot options), in drawing order.
    # Only the UNIF layer sets axis labels, a colour map and a colour bar.
    layers = [
        (uni_mean_rf, dict(xlbl=r"$\log$ $r(w)$",
                           ylbl=r"$\log$ $f(w)$",
                           label="UNIF",
                           color="black",
                           edgecolors="black",
                           cmap="gray",
                           cbar=True)),
        (tf_mean_rf, dict(label="TF " + str(f),
                          color=colour_palette[0],
                          edgecolors=colour_palette[0],
                          cbar=False)),
        (srf_mean_rf, dict(label="SRF " + str(h),
                           color=colour_palette[1],
                           edgecolors=colour_palette[1],
                           cbar=False)),
    ]

    plot_lims = None
    for mean_rf, layer_opts in layers:
        joints = merge_to_joint(*mean_rf)
        xs, ys = list(zip(*sorted(joints.values())))

        cur_plot_lims = hexbin_plot(xs,
                                    ys,
                                    linewidths=1.0,
                                    lims=None,
                                    min_y=1,
                                    **layer_opts)
        plot_lims = get_greater_lims(plot_lims, cur_plot_lims)

    plt.xlim(plot_lims[0])
    plt.ylim(plot_lims[1])
    plt.legend()
    if show:
        plt.show()
Exemple #17
0
    setup_m = 100
    m = 10
    
    wiki = list(wiki_from_pickles("data/"+lang+"_pkl"))
    sents = [s for a in wiki for s in a]

    zipf_model, rank_dict, mean_typ, std_typ, auto_typ = setup_filtering(wiki, 
                                                                         big_n(wiki), 
                                                                         n, 
                                                                         setup_m)
    
    mean_corrected = abs(mean_typ - auto_typ)
    epsilon_f_plus = mean_corrected + std_typ*factor
    epsilon_f_minus = - epsilon_f_plus
    
    print("\nModel and Epsilon established")
    print(auto_typ, mean_typ, std_typ)
    print(epsilon_f_minus, epsilon_f_plus)
    
    
    for m_i in range(m):
        print("started ", m_i)        
        filtered = list(filter_typicality_incremental(sents, zipf_model, 
                        rank_dict, auto_typ, n, epsilon_f_minus, lt))
        filtered_freqs = compute_freqs(Sentences(filtered))
        print("filtered ", m_i, " typicality: ", 
              typicality(zipf_model, merge_to_joint(rank_dict, filtered_freqs)))

        
        name = "_".join((str(n), str(factor), str(m_i)))
        corpus_to_pickle(filtered, "results/" + lang + "/TF", name)
Exemple #18
0
    zipf_piantadosi(wiki, n, d)

    subsamples1 = (Sentences.subsample(wiki, n) for _ in range(m))
    subsamples2 = (Sentences.subsample(wiki, n) for _ in range(m))

    ranks = [compute_ranks(sub) for sub in subsamples1]
    ranks_joined = pool_ranks(ranks)
    mean_ranks = reduce_pooled(ranks_joined)

    freqs = [compute_freqs(sub) for sub in subsamples2]
    freqs_joined = pool_freqs(freqs)
    mean_freqs = reduce_pooled(freqs_joined)

    print("subsampling done")

    joints = merge_to_joint(mean_ranks, mean_freqs)
    xs, ys = list(zip(*sorted(joints.values())))

    hexbin_plot(xs, ys, xlbl="$\log$ $r(w)$", ylbl="$\log$ $f(w)$", min_y=1)

    mandelbrot = Mandelbrot(ys, xs)
    mandelbrot_fit = mandelbrot.fit(
        start_params=np.asarray([10.0, 1000.0]),  # [1.0, 1.0]
        method="powell",
        full_output=True)
    mandelbrot.register_fit(mandelbrot_fit)
    mandelbrot.print_result()
    with open(d + "mle_mandelbrot_" + str(n) + "_" + str(m) + ".txt",
              "w") as handle:
        handle.write(mandelbrot.print_result(string=True))
    plot_preds(mandelbrot, np.asarray(xs))