def mean_vs_pooled(rank_dist, freq_dist, save_dir):
    """Plot every pooled (rank, frequency) observation against the means.

    The first layer shows each individual observation from the joined
    ``rank_dist``/``freq_dist`` whose frequency is positive.  The second
    layer, drawn in red and labelled "mean", shows the relationship after
    both distributions have been passed through ``reduce_pooled``.

    The figure is written to ``save_dir + "rank_freq_mean_vs_var.png"``
    and then closed.
    """
    # Flatten the per-word observation lists into one list of points,
    # dropping observations with a non-positive frequency.
    pooled_points = []
    for rank_obs, freq_obs in merge_to_joint(rank_dist, freq_dist).values():
        for rank, freq in zip(rank_obs, freq_obs):
            if freq > 0:
                pooled_points.append((rank, freq))
    pooled_xs, pooled_ys = zip(*pooled_points)
    hexbin_plot(pooled_xs, pooled_ys,
                xlbl=r"$\log$ $r(w)$", ylbl=r"$\log$ $f(w)$",
                min_y=1)

    # Overlay the reduced (mean) relationship on the same axes.
    averaged_ranks = reduce_pooled(rank_dist)
    averaged_freqs = reduce_pooled(freq_dist)
    averaged_joint = merge_to_joint(averaged_ranks, averaged_freqs)
    averaged_xs, averaged_ys = zip(*sorted(averaged_joint.values()))
    hexbin_plot(averaged_xs, averaged_ys,
                xlbl=r"$\log$ $r(w)$", ylbl=r"$\log$ $f(w)$",
                color="red", edgecolors="red", cmap="Reds_r",
                cbar=False, min_y=1, label="mean")

    plt.legend()
    plt.savefig(save_dir + "rank_freq_mean_vs_var.png", dpi=300)
    plt.close()
def convergence_main(wiki, rng, m, save_dir="./"):
    """Plot and fit the mean rank-frequency relationship for several sizes.

    Two figures are produced, each with one hexbin layer per value in
    ``rng``:

    * ``convergence_<rng>.png`` uses ``compute_freqs``; each layer is also
      passed to ``do_mle`` together with an open report file
      ``mle_mandelbrot_convergence_<rng>.txt``.
    * ``convergence_probs_<rng>.png`` uses ``compute_normalised_freqs``
      with a lower y-limit of ``1 / n``.

    ``<rng>`` is the values of ``rng`` joined by underscores.

    Args:
        wiki: corpus handed to ``get_mean_relationship``.
        rng: sequence of sample sizes; it is iterated several times, so it
            must be re-iterable (not a one-shot iterator).
        m: forwarded to ``get_mean_relationship``.
        save_dir: prefix prepended verbatim to every output file name.
    """
    rng_tag = "_".join(map(str, rng))

    # The context manager closes the report even if a plot or fit raises.
    with open(save_dir + "mle_mandelbrot_convergence_" + rng_tag + ".txt",
              "w") as handle:
        for i, n in enumerate(rng):
            mean_ranks, mean_freqs = get_mean_relationship(
                wiki, n, m, compute_freqs)
            joints = merge_to_joint(mean_ranks, mean_freqs)
            xs, ys = list(zip(*joints.values()))
            hexbin_plot(xs, ys,
                        xlbl=r"$\log$ $r(w)$", ylbl=r"$\log$ $f(w)$",
                        edgecolors=colour_palette[i],
                        color=colour_palette[i],
                        label=format_scientific(n),
                        alpha=1 / (i + 1)**.3,  # later layers fade out
                        linewidths=1.0,
                        cbar=(i == 0),  # only the first layer draws a colour bar
                        min_y=1)
            do_mle(xs, ys, n, handle)

    plt.legend()
    plt.savefig(save_dir + "convergence_" + rng_tag + ".png", dpi=300)
    plt.close()

    for i, n in enumerate(rng):
        mean_ranks, mean_freqs = get_mean_relationship(
            wiki, n, m, compute_normalised_freqs)
        joints = merge_to_joint(mean_ranks, mean_freqs)
        xs, ys = list(zip(*joints.values()))
        hexbin_plot(xs, ys,
                    xlbl=r"$\log$ $r(w)$", ylbl=r"$\log$ $P(w)$",
                    edgecolors=colour_palette[i],
                    color=colour_palette[i],
                    label=format_scientific(n),
                    alpha=1 / (i + 1)**.3,
                    linewidths=1.0,
                    cbar=(i == 0),
                    min_y=1 / n)

    plt.legend()
    plt.savefig(save_dir + "convergence_probs_" + rng_tag + ".png", dpi=300)
    plt.close()
def within_filter_plots(sample_dict, show=True, mle_dict=None):
    """Overlay the mean rank-frequency relationship of each parameter value.

    For every ``(param, sample_ls)`` entry of ``sample_dict`` one hexbin
    layer is drawn, coloured by its position in ``colour_palette``.  If
    ``mle_dict`` holds a fitted model for ``param``, its predictions are
    drawn on top with ``plot_preds``.  The axes are finally set to the
    limits accumulated over all layers via ``get_greater_lims``.

    Args:
        sample_dict: mapping from parameter value to a collection of
            samples accepted by ``mean_rank_freq_from_samples``.
        show: call ``plt.show()`` at the end when true.
        mle_dict: optional mapping from parameter value to a fitted model.

    Returns:
        The accumulated plot limits, or ``None`` when ``sample_dict`` is
        empty.
    """
    plot_lims = None
    for i, (param, sample_ls) in enumerate(sample_dict.items()):
        mean_ranks, mean_freqs = mean_rank_freq_from_samples(sample_ls)
        joints = merge_to_joint(mean_ranks, mean_freqs)
        xs, ys = list(zip(*sorted(joints.values())))

        # Raw strings: "\l" is not a valid escape sequence in a plain literal.
        cur_plot_lims = hexbin_plot(xs, ys,
                                    xlbl=r"$\log$ $r(w)$",
                                    ylbl=r"$\log$ $f(w)$",
                                    label=str(param),
                                    color=colour_palette[i],
                                    edgecolors=colour_palette[i],
                                    linewidths=1.0, lims=None,
                                    min_y=1, cbar=False)

        if mle_dict and param in mle_dict:
            mandelbrot = mle_dict[param]
            plot_preds(mandelbrot, np.asarray(xs), color=colour_palette[i])

        plot_lims = get_greater_lims(plot_lims, cur_plot_lims)

    print(plot_lims)
    # With no layers there are no limits to apply; skip instead of
    # failing on ``None[0]``.
    if plot_lims is not None:
        plt.xlim(plot_lims[0])
        plt.ylim(plot_lims[1])
    plt.legend()
    if show:
        plt.show()
    return plot_lims
def zipf_wrong(wiki, n, d):
    """Plot rank against frequency with both taken from one subsample.

    A single ``Articles.subsample(wiki, n)`` supplies both the ranks and
    the frequencies.  The figure is written to
    ``d + "rank_freq_<n>_wrong.png"`` and closed.

    NOTE(review): the "_wrong" suffix presumably marks this as the
    estimate that reuses one sample for ranks and frequencies -- confirm
    against the callers.
    """
    subcorp = Articles.subsample(wiki, n)
    ranks, freqs = compute_ranks(subcorp), compute_freqs(subcorp)
    joints = merge_to_joint(ranks, freqs)
    xs, ys = list(zip(*sorted(joints.values())))

    # Raw strings: "\l" is not a valid escape sequence in a plain literal.
    hexbin_plot(xs, ys, xlbl=r"$\log$ $r(w)$", ylbl=r"$\log$ $f(w)$")
    plt.savefig(d + "rank_freq_" + str(n) + "_wrong.png", dpi=300)
    plt.close()
def filter_worker(i):
    """Run one typicality-filtering pass and pickle the result.

    Relies on names defined outside this function (``mp_array``,
    ``zipf_model``, ``rank_dict``, ``auto_typ``, ``n``, ``factor``,
    ``epsilon_f_minus``, ``lt``, ``lang``).  ``i`` is used only in the
    progress messages and in the output name ``<n>_<factor>_<i>``.
    """
    print("started ", i)

    # Reseed from OS entropy.
    # NOTE(review): presumably so parallel workers do not share RNG
    # state -- confirm against how this worker is launched.
    rand.seed(int.from_bytes(os.urandom(4), byteorder="little"))

    threshold = factor*epsilon_f_minus
    kept = list(filter_typicality_incremental(mp_array, zipf_model,
                                              rank_dict, auto_typ, n,
                                              threshold, lt))

    kept_freqs = compute_freqs(Sentences(kept))
    kept_typicality = typicality(zipf_model,
                                 merge_to_joint(rank_dict, kept_freqs))
    print("filtered ", i, " typicality: ", kept_typicality)

    out_name = "_".join(map(str, (n, factor, i)))
    corpus_to_pickle(kept, "results/" + lang + "/TF", out_name)
def _fit_mandelbrot_to_mean(samples):
    """Fit a Mandelbrot model to the mean rank-frequency of ``samples``."""
    mean_ranks, mean_freqs = mean_rank_freq_from_samples(samples)
    joints = merge_to_joint(mean_ranks, mean_freqs)
    xs, ys = list(zip(*sorted(joints.values())))

    mandelbrot = Mandelbrot(ys, xs)
    mandelbrot_fit = mandelbrot.fit(start_params=np.asarray([1.0, 1.0]),
                                    method="powell", full_output=True)
    mandelbrot.register_fit(mandelbrot_fit)
    return mandelbrot


def do_mles(tf_samples, srf_samples, uni_samples):
    """Fit one Mandelbrot model per TF setting, per SRF setting, and for UNI.

    Args:
        tf_samples: mapping from parameter value to a collection of samples.
        srf_samples: mapping from parameter value to a collection of samples.
        uni_samples: a single collection of samples.

    Returns:
        ``(tf_mles, srf_mles, uni_mandelbrot)``: two dicts mapping each
        parameter value to its fitted model, and the model fitted to
        ``uni_samples``.
    """
    tf_mles = {}
    for param, sample_ls in tf_samples.items():
        print("\n TF", str(param))
        tf_mles[param] = _fit_mandelbrot_to_mean(sample_ls)

    srf_mles = {}
    for param, sample_ls in srf_samples.items():
        print("\n SRF", str(param))
        srf_mles[param] = _fit_mandelbrot_to_mean(sample_ls)

    uni_mandelbrot = _fit_mandelbrot_to_mean(uni_samples)
    return tf_mles, srf_mles, uni_mandelbrot
def zipf_piantadosi(wiki, n, d):
    """Plot rank against frequency from two separate subsamples.

    Ranks come from one ``Words.subsample(wiki, n)`` and frequencies from
    a second, separately drawn one.  The figure is written to
    ``d + "rank_freq_<n>_piantadosi.png"`` and closed.
    """
    subcorp1 = Words.subsample(wiki, n)
    subcorp2 = Words.subsample(wiki, n)

    ranks = compute_ranks(subcorp1)
    freqs = compute_freqs(subcorp2)

    joints = merge_to_joint(ranks, freqs)
    xs, ys = list(zip(*sorted(joints.values())))

    # Raw strings: "\l" is not a valid escape sequence in a plain literal.
    hexbin_plot(xs, ys, xlbl=r"$\log$ $r(w)$", ylbl=r"$\log$ $f(w)$")
    plt.savefig(d + "rank_freq_" + str(n) + "_piantadosi.png", dpi=300)
    plt.close()
def establish_typical_set(corpus, rank_dict, zipf_model, n, m):
    """Estimate mean and standard deviation of subsample typicality.

    Draws ``m`` sentence-level subsamples of size ``n`` from ``corpus``,
    scores each with ``typicality`` under ``zipf_model`` (frequencies
    joined with the fixed ``rank_dict``), and summarises the scores.

    Returns:
        ``(mean_typ, std_typ)``; the standard deviation is the square
        root of ``np.var`` of the scores.
    """
    def _score_one_subsample():
        drawn = Sentences.subsample(corpus, n)
        drawn_joint = merge_to_joint(rank_dict, compute_freqs(drawn))
        return typicality(zipf_model, drawn_joint)

    scores = [_score_one_subsample() for _ in range(m)]

    mean_typ = np.mean(scores)
    std_typ = np.var(scores)**.5
    return mean_typ, std_typ
def do_mles(ranks, freqs, save_dir):
    """Fit a Mandelbrot model to each (rank dict, frequency dict) pair.

    ``ranks`` and ``freqs`` are iterated in lockstep.  The textual result
    of every fit, followed by a newline, is appended to
    ``save_dir + "mle_mandelbrot_point_estimates.txt"`` (the file is
    created or truncated first).
    """
    report_path = save_dir + "mle_mandelbrot_point_estimates.txt"
    with open(report_path, "w") as handle:
        for rank_dict, freq_dict in zip(ranks, freqs):
            pairs = sorted(merge_to_joint(rank_dict, freq_dict).values())
            xs, ys = zip(*pairs)

            model = Mandelbrot(ys, xs)
            fit_result = model.fit(start_params=np.asarray([1.0, 1.0]),
                                   method="powell", full_output=True)
            model.register_fit(fit_result)

            handle.write(model.print_result(string=True) + "\n")
def get_reference_dist(wiki, n=int(10e6), m=10):
    """Fit a reference Mandelbrot model to sentence-level subsamples.

    ``wiki`` is materialised as a list, ``m`` subsamples of size ``n`` are
    drawn with ``Sentences.subsample``, and a Mandelbrot model is fitted
    to their mean rank-frequency relationship.  The fit result is printed.

    Args:
        wiki: iterable corpus.
        n: size passed to ``Sentences.subsample``.  The default,
            ``int(10e6)``, is ten million (1e7), as previously hard-coded.
        m: number of subsamples; default 10, as previously hard-coded.

    Returns:
        ``(mandelbrot, mean_ranks)``: the fitted model and the mean ranks
        it was fitted on.
    """
    wiki_ls = list(wiki)
    subsamples = [Sentences.subsample(wiki_ls, n) for _ in range(m)]

    mean_ranks, mean_freqs = mean_rank_freq_from_samples(subsamples)
    joints = merge_to_joint(mean_ranks, mean_freqs)
    xs, ys = list(zip(*sorted(joints.values())))

    mandelbrot = Mandelbrot(ys, xs)
    mandelbrot_fit = mandelbrot.fit(start_params=np.asarray([1.0, 1.0]),
                                    method="powell", full_output=True)
    mandelbrot.register_fit(mandelbrot_fit)
    mandelbrot.print_result()
    return mandelbrot, mean_ranks
def get_model(corpus, n):
    """Fit a Mandelbrot model from two separately drawn subsamples.

    Ranks are computed on one ``Sentences.subsample(corpus, n)`` and
    frequencies on a second one.  The fit result is printed.

    Returns:
        ``(big_ranks, mandelbrot, auto_typ)``: the rank dict, the fitted
        model, and the typicality of the fitting data under that model.
    """
    # Draw and process the two subsamples one after the other, ranks first.
    rank_sample = Sentences.subsample(corpus, n)
    big_ranks = compute_ranks(rank_sample)
    freq_sample = Sentences.subsample(corpus, n)
    sample_freqs = compute_freqs(freq_sample)

    joint = merge_to_joint(big_ranks, sample_freqs)
    xs, ys = zip(*sorted(joint.values()))

    model = Mandelbrot(ys, xs)
    model.register_fit(
        model.fit(start_params=np.asarray([1.0, 1.0]),
                  method="powell", full_output=True))
    model.print_result()

    auto_typ = typicality(model, joint)
    return big_ranks, model, auto_typ
def covariance_across_words(rank_dist, freq_dist, save_dir):
    """Plot a per-word dispersion measure against mean rank.

    For every word in the joined distributions, the rank and frequency
    observation lists are truncated to their common length and their
    covariance matrix is computed.  The plotted value ``D(w)`` is the
    off-diagonal covariance entry divided by the word's mean rank.  Words
    are ordered by mean rank on a logarithmic x-axis.

    The figure is written to ``save_dir + "dispersion.png"`` and closed.

    NOTE(review): the value is named a Fano factor in the code, but it
    uses the rank-frequency covariance rather than a variance -- confirm
    this is the intended statistic.
    """
    joints = merge_to_joint(rank_dist, freq_dist)
    mean_ranks = reduce_pooled(rank_dist)

    cov_dict = {}
    for w, (r_ls, f_ls) in joints.items():
        # np.cov needs equally long inputs; keep the common prefix only.
        shared_len = min(len(r_ls), len(f_ls))
        cov_dict[w] = np.cov(r_ls[:shared_len], f_ls[:shared_len])

    fano_factor_dict = {
        w: cov_mat[0][1] / mean_ranks[w]
        for w, cov_mat in cov_dict.items()
    }

    words_sorted = sorted(mean_ranks.items(), key=lambda tup: tup[1])

    # Words without a covariance entry are skipped.
    xs, ys = list(
        zip(*[(r, fano_factor_dict[w]) for w, r in words_sorted
              if w in cov_dict]))

    # Raw string: "\o" is not a valid escape sequence in a plain literal.
    hexbin_plot(xs, ys,
                log=False, xscale="log", bins="log",
                xlbl=r"$\overline{r}(w)$", ylbl="$D(w)$",
                ignore_zeros=False, gridsize=100)

    plt.savefig(save_dir + "dispersion.png", dpi=300)
    plt.close()
def samples_to_typicality(samples, ref_dist, rank_dict):
    """Return the typicality of every sample under ``ref_dist``.

    Each sample's frequencies are joined with the fixed ``rank_dict``
    before being scored.  The result is a list in the order of
    ``samples``.
    """
    # Stage 1: frequencies for all samples.
    freq_dicts = list(map(compute_freqs, samples))

    # Stage 2: join each frequency dict with the shared ranks.
    joint_dists = []
    for freq_dict in freq_dicts:
        joint_dists.append(merge_to_joint(rank_dict, freq_dict))

    # Stage 3: score every joint distribution.
    scores = []
    for joint in joint_dists:
        scores.append(typicality(ref_dist, joint))
    return scores
# Load the SRF and TF filter results from their sub-directories of `d`.
srfs = get_filters(d + "SRF/", k, ["k", "h", "i"], "h", hist_lens)
tfs = get_filters(d + "TF/", k, ["k", "f", "i"], "f", factors)

# Keep only the entries for the last three factors / history lengths.
# NOTE(review): "highest" assumes `factors` and `hist_lens` are sorted
# ascending -- confirm where they are defined.
# The comprehension variable `k` is local to the comprehension, so the
# outer `k` used above is not modified.
highest_three_factors = factors[-3:]
three_tfs = {k: tfs[k] for k in highest_three_factors}
highest_three_hist_lens = hist_lens[-3:]
three_srfs = {k: srfs[k] for k in highest_three_hist_lens}

# Load the UNI corpora; the first element of each loaded pair is ignored.
unis = [
    Sentences(c) for _, c in corpora_from_pickles(d + "UNI", names=["k", "i"])
]

# Mean rank-frequency relationship of the UNI corpora, as sorted value pairs.
uni_mean_ranks, uni_mean_freqs = mean_rank_freq_from_samples(unis)
uni_joints = merge_to_joint(uni_mean_ranks, uni_mean_freqs)
uni_xs, uni_ys = list(zip(*sorted(uni_joints.values())))
print("filters loaded", flush=True)

# MLEs
tf_mles, srf_mles, uni_mandel = do_mles(tfs, srfs, unis)

# Write one section per fitted model: a header line, then the fit result.
with open(results_d + "mle_mandelbrot.txt", "w") as handle:
    for param, mandel in tf_mles.items():
        handle.write("TF " + str(param))
        handle.write("\n")
        handle.write(mandel.print_result(string=True))
        handle.write("\n\n")
    # NOTE(review): only the header write of the SRF loop is visible
    # here; the rest of its body may lie outside this excerpt.
    for param, mandel in srf_mles.items():
        handle.write("SRF " + str(param))
def _level_joint_xy(first, second):
    """Join two per-word dicts and return their sorted value pairs as (xs, ys)."""
    xs, ys = zip(*sorted(merge_to_joint(first, second).values()))
    return xs, ys


def _compare_sampling_levels(art_vals, sent_vals, word_vals, kind, symbol,
                             n, save_dir):
    """Plot texts-vs-words agreement and write all pairwise Spearman results.

    ``kind`` ("freq" or "rank") selects the output file names and
    ``symbol`` ("f" or "r") the axis labels.
    """
    xs, ys = _level_joint_xy(art_vals, word_vals)
    hexbin_plot(xs, ys,
                xlbl=r"$\log$ $" + symbol + r"(w)$ from texts",
                ylbl=r"$\log$ $" + symbol + r"(w)$ from words")
    plt.savefig(save_dir + kind + "_correl_word_vs_article_" + str(n) + ".png",
                dpi=300)
    plt.close()

    # The texts-vs-words pairs from the plot are reused for its correlation.
    correlations = [
        ("Articles-Words:", scistats.spearmanr(xs, ys)),
        ("Articles-Sentences:",
         scistats.spearmanr(*_level_joint_xy(art_vals, sent_vals))),
        ("Sentences-Words:",
         scistats.spearmanr(*_level_joint_xy(sent_vals, word_vals))),
    ]

    with open(save_dir + kind + "_sampling_level_correlations.txt",
              "w") as handle:
        for label, corr in correlations:
            handle.write("\t".join(
                [label, str(corr.correlation), str(corr.pvalue)]))
            handle.write("\n")


def sampling_levels_main(wiki, n, m, save_dir="./"):
    """Compare article-, sentence- and word-level subsampling.

    Steps, in order:

    1. Obtain the mean rank-frequency relationship for ``Articles``,
       ``Sentences`` and ``Words`` and pass each to ``do_mle``.  The
       article ("texts") and word ("words") relationships are drawn into
       ``rank_freq_word_vs_article_<n>.png``; the sentence-level one is
       fitted but not plotted.
    2. For mean frequencies, plot texts against words
       (``freq_correl_word_vs_article_<n>.png``) and write the Spearman
       correlation and p-value of every pair of levels to
       ``freq_sampling_level_correlations.txt``.
    3. Repeat step 2 for mean ranks (``rank_correl_word_vs_article_<n>.png``
       and ``rank_sampling_level_correlations.txt``).

    ``save_dir`` is prepended verbatim to every output file name.
    """
    art_mean_ranks, art_mean_freqs = get_mean_relationship(
        Articles, wiki, n, m)
    art_xs, art_ys = _level_joint_xy(art_mean_ranks, art_mean_freqs)
    hexbin_plot(art_xs, art_ys,
                xlbl=r"$\log$ $r(w)$", ylbl=r"$\log$ $f(w)$",
                label="texts", min_y=1)
    do_mle(art_xs, art_ys, Articles, save_dir)

    sent_mean_ranks, sent_mean_freqs = get_mean_relationship(
        Sentences, wiki, n, m)
    sent_xs, sent_ys = _level_joint_xy(sent_mean_ranks, sent_mean_freqs)
    do_mle(sent_xs, sent_ys, Sentences, save_dir)

    word_mean_ranks, word_mean_freqs = get_mean_relationship(
        Words, wiki, n, m)
    word_xs, word_ys = _level_joint_xy(word_mean_ranks, word_mean_freqs)
    hexbin_plot(word_xs, word_ys,
                xlbl=r"$\log$ $r(w)$", ylbl=r"$\log$ $f(w)$",
                color="red", edgecolors="red", cmap="Reds_r",
                label="words", cbar=False, min_y=1)
    do_mle(word_xs, word_ys, Words, save_dir)

    plt.legend()
    plt.savefig(save_dir + "rank_freq_word_vs_article_" + str(n) + ".png",
                dpi=300)
    plt.close()

    _compare_sampling_levels(art_mean_freqs, sent_mean_freqs, word_mean_freqs,
                             "freq", "f", n, save_dir)
    _compare_sampling_levels(art_mean_ranks, sent_mean_ranks, word_mean_ranks,
                             "rank", "r", n, save_dir)
def across_filter_plots(tf_samples, srf_samples, f, h, uni_samples,
                        show=False):
    """Overlay the mean rank-frequency relationships of UNIF, TF and SRF.

    Three hexbin layers are drawn on one figure, in this order: the
    uniform samples (black, with colour bar and axis labels), the TF
    samples (``colour_palette[0]``, labelled ``"TF <f>"``) and the SRF
    samples (``colour_palette[1]``, labelled ``"SRF <h>"``).  The axes are
    set to the limits accumulated over all layers via ``get_greater_lims``.

    Args:
        tf_samples, srf_samples, uni_samples: collections of samples
            accepted by ``mean_rank_freq_from_samples``.
        f: value shown in the TF legend label.
        h: value shown in the SRF legend label.
        show: call ``plt.show()`` at the end when true.
    """
    # Each mean relationship is computed exactly once.
    tf_mean_rf = mean_rank_freq_from_samples(tf_samples)
    srf_mean_rf = mean_rank_freq_from_samples(srf_samples)
    uni_mean_rf = mean_rank_freq_from_samples(uni_samples)

    # Raw strings: "\l" is not a valid escape sequence in a plain literal.
    layers = [
        (uni_mean_rf, dict(xlbl=r"$\log$ $r(w)$", ylbl=r"$\log$ $f(w)$",
                           label="UNIF", color="black", edgecolors="black",
                           cmap="gray", cbar=True)),
        (tf_mean_rf, dict(label="TF " + str(f),
                          color=colour_palette[0],
                          edgecolors=colour_palette[0], cbar=False)),
        (srf_mean_rf, dict(label="SRF " + str(h),
                           color=colour_palette[1],
                           edgecolors=colour_palette[1], cbar=False)),
    ]

    plot_lims = None
    for mean_rf, layer_style in layers:
        joints = merge_to_joint(*mean_rf)
        xs, ys = list(zip(*sorted(joints.values())))
        cur_plot_lims = hexbin_plot(xs, ys, linewidths=1.0, lims=None,
                                    min_y=1, **layer_style)
        plot_lims = get_greater_lims(plot_lims, cur_plot_lims)

    plt.xlim(plot_lims[0])
    plt.ylim(plot_lims[1])
    plt.legend()
    if show:
        plt.show()
# setup_m is passed to setup_filtering; m is the number of filtered
# corpora produced by the loop below.
setup_m = 100
m = 10

# Load the whole corpus into memory and flatten it one level:
# every item `s` of every element `a` of `wiki`.
wiki = list(wiki_from_pickles("data/"+lang+"_pkl"))
sents = [s for a in wiki for s in a]

zipf_model, rank_dict, mean_typ, std_typ, auto_typ = setup_filtering(wiki, big_n(wiki), n, setup_m)

# Tolerance band: |mean_typ - auto_typ| plus `factor` times std_typ,
# and its negation.
mean_corrected = abs(mean_typ - auto_typ)
epsilon_f_plus = mean_corrected + std_typ*factor
epsilon_f_minus = - epsilon_f_plus

print("\nModel and Epsilon established")
print(auto_typ, mean_typ, std_typ)
print(epsilon_f_minus, epsilon_f_plus)

# Produce m filtered corpora; only epsilon_f_minus is handed to the filter.
for m_i in range(m):
    print("started ", m_i)
    filtered = list(filter_typicality_incremental(sents, zipf_model, rank_dict, auto_typ, n, epsilon_f_minus, lt))
    filtered_freqs = compute_freqs(Sentences(filtered))
    # Report the typicality of the filtered corpus under the same model.
    print("filtered ", m_i, " typicality: ", typicality(zipf_model, merge_to_joint(rank_dict, filtered_freqs)))
    # Output name is "<n>_<factor>_<m_i>".
    name = "_".join((str(n), str(factor), str(m_i)))
    corpus_to_pickle(filtered, "results/" + lang + "/TF", name)
zipf_piantadosi(wiki, n, d)

# Two separate, lazily evaluated series of m sentence-level subsamples:
# one is used for ranks, the other for frequencies.
subsamples1 = (Sentences.subsample(wiki, n) for _ in range(m))
subsamples2 = (Sentences.subsample(wiki, n) for _ in range(m))

ranks = [compute_ranks(sub) for sub in subsamples1]
ranks_joined = pool_ranks(ranks)
mean_ranks = reduce_pooled(ranks_joined)

freqs = [compute_freqs(sub) for sub in subsamples2]
freqs_joined = pool_freqs(freqs)
mean_freqs = reduce_pooled(freqs_joined)

print("subsampling done")

joints = merge_to_joint(mean_ranks, mean_freqs)
xs, ys = list(zip(*sorted(joints.values())))

# Raw strings: "\l" is not a valid escape sequence in a plain literal.
hexbin_plot(xs, ys, xlbl=r"$\log$ $r(w)$", ylbl=r"$\log$ $f(w)$", min_y=1)

mandelbrot = Mandelbrot(ys, xs)
mandelbrot_fit = mandelbrot.fit(
    start_params=np.asarray([10.0, 1000.0]),  # [1.0, 1.0]
    method="powell",
    full_output=True)
mandelbrot.register_fit(mandelbrot_fit)
mandelbrot.print_result()

# Persist the same fit summary next to the figures.
with open(d + "mle_mandelbrot_" + str(n) + "_" + str(m) + ".txt",
          "w") as handle:
    handle.write(mandelbrot.print_result(string=True))

plot_preds(mandelbrot, np.asarray(xs))