def load_data_lda(relative_sizes, sample_size, iterations, namespace,
                  num_topics):
    """Load adjusted-LDA perplexity results for every relative size/iteration.

    For each (relative size, iteration) sample folder, reads the pickled
    log2-perplexity exponents for the three evaluation groups ("a", "b",
    "all") and converts them back to perplexities.

    Args:
        relative_sizes: relative corpus sizes to iterate over.
        sample_size: sample size used to locate the samples folder.
        iterations: number of iterations per relative size.
        namespace: project config dict (passed to build_basepath).
        num_topics: topic count whose evaluation dumps are loaded.

    Returns:
        (data_a, data_b, data_all): three dicts mapping relative size to a
        list of perplexity values, one per iteration.
    """
    base_path = build_basepath("samples", namespace, sample_size)
    # One result dict per evaluation group; insertion order ("a", "b",
    # "all") also fixes the per-iteration file-read order.
    groups = {"a": dict(), "b": dict(), "all": dict()}
    for rel_size, i, adj_path in rel_size_iteration_loop(
            relative_sizes, iterations, base_path):
        for suffix, data in groups.items():
            dump_path = (adj_path + "eval/adj_lda_" + str(num_topics)
                         + "_" + suffix + ".p")
            with open(dump_path, "rb") as f:
                # Dumps hold the log2-perplexity exponent; perplexity is
                # recovered as 2^(-exponent).
                perpl_exponent = pickle.load(f)
            data.setdefault(rel_size, []).append(np.exp2(-perpl_exponent))

    return groups["a"], groups["b"], groups["all"]
def load_inference_data(namespace, sample_size, num_topics):
    """Load topic-group inference data and shape it for the two plots.

    NOTE(review): ``relative_sizes`` and ``iterations`` are free variables
    here (not parameters) — they must exist at module scope for this to run;
    verify against the module top / call sites.

    Returns:
        x_values: sorted relative sizes (plot x-axis).
        data_1: (red_y, purple_y, blue_y) cumulative proportion series.
        data_2: (a_prop, shared_prop, b_prop) dicts mapping x -> stat entry.
    """
    base_path = build_basepath("samples", namespace, sample_size)
    inference_data = load_topic_group_data(base_path, relative_sizes, iterations, num_topics)

    # Init X-values (relative_size)
    x_values = sorted(list(inference_data.keys()))

    # Init plot 1 data
    red_y = []
    purple_y = []
    blue_y = []

    # Init plot 2 data
    a_prop = dict()
    b_prop = dict()
    shared_prop = dict()

    for x in x_values:
        deviation_stats = deviation_stats_inference(inference_data[x])
        print(deviation_stats)
        # Plot 1 - Data: cumulative stacking of the first elements —
        # red = A, purple = A + shared, blue = A + shared + B
        # (deviation_stats unpacks as (a, b, shared), see below).
        blue_y.append(deviation_stats[0][0] + deviation_stats[1][0] + deviation_stats[2][0])
        purple_y.append(deviation_stats[0][0] + deviation_stats[2][0])
        red_y.append(deviation_stats[0][0])

        # Plot 2 - Data
        a_prop[x], b_prop[x], shared_prop[x] = deviation_stats

    data_1 = (red_y, purple_y, blue_y)
    data_2 = (a_prop, shared_prop, b_prop)
    return x_values, data_1, data_2
def plot_corpus_dictionary(ds_name, namespace):
    """Plot the dictionary split (A-exclusive / shared / B-exclusive) as a
    single horizontal stacked bar and save it as PNG and PDF."""
    plt.rc('text', usetex=True)
    plt.style.use('seaborn-paper')

    dump_file = "../../dumps/" + ds_name + ".dump"
    with open(dump_file, "r") as f_dump:
        print("Loading Dump: " + dump_file)
        global_dict = json.load(f_dump)["global_dict"]

    out_path = build_basepath("plots", namespace, subfolder=ds_name)

    # Unique-word counts per group; the helper also dumps them to out_path.
    a_uniq, b_uniq, shared = extract_unique_words(global_dict, out_path)
    total = a_uniq + b_uniq + shared

    # Normalize counts to proportions of the whole vocabulary
    a_uniq, b_uniq, shared = a_uniq / total, b_uniq / total, shared / total

    # Plot distribution
    fig, ax = plt.subplots(figsize=(5.7, 0.7))
    ax.get_yaxis().set_visible(False)
    ax.set_xticks([tick / 10 for tick in range(11)])

    # Draw three overlapping bars, longest first, to emulate a stacked bar.
    ax.barh([0], [1], align='edge', color='blue', height=1,
            label=namespace["tag_coll_b"] + " exclusive")
    ax.barh([0], [a_uniq + shared], align='edge', color='purple', height=1,
            label="shared")
    ax.barh([0], [a_uniq], align='edge', color='red', height=1,
            label=namespace["tag_coll_a"] + " exclusive")

    ax.set_xlabel('Proportion')

    # Reverse the legend so it reads A / shared / B, matching the bar order.
    handles, labels = ax.get_legend_handles_labels()
    lgd = ax.legend(handles=handles[::-1], labels=labels[::-1],
                    bbox_to_anchor=(0., 1.02, 1., .102), loc=3,
                    ncol=3, mode="expand", borderaxespad=0.)

    # NOTE(review): "ditionary" spelling kept on purpose — existing
    # artifacts/consumers may reference this exact file name.
    fig.savefig(out_path + "ditionary_dist.png",
                bbox_extra_artists=(lgd,), bbox_inches='tight')
    fig.savefig(out_path + "ditionary_dist.pdf",
                bbox_extra_artists=(lgd,), bbox_inches='tight', format="pdf")
def plot_triple_stack(relative_sizes, sample_size, iterations, namespace,
                      num_topics):
    """Load adjusted-LDA evaluation data for each topic count and render
    the triple-stacked perplexity plots."""
    # Output folder for the generated plots
    plot_path = build_basepath("plots",
                               namespace,
                               sample_size,
                               appendix="-" + str(iterations) + "i")

    # One (data_a, data_b, data_all) triple per requested topic count
    triples = [
        load_data_lda(relative_sizes, sample_size, iterations, namespace,
                      topics)
        for topics in num_topics
    ]
    data_a_triple = [triple[0] for triple in triples]
    data_b_triple = [triple[1] for triple in triples]
    data_all_triple = [triple[2] for triple in triples]

    # Perplexity plots: per-group (adjusted) and combined
    plot_triple_stack_perpl_adj(data_a_triple, data_b_triple, data_all_triple,
                                namespace, plot_path, "lda_all")
    plot_triple_stack_perpl_all(data_all_triple, namespace, plot_path,
                                "lda_all")
def plot_vocab_mismatch(relative_sizes, sample_size, iterations, namespace, format="png"):
    """Plot mean +/- std of the per-document vocabulary mismatch proportion
    for groups A and B over the relative corpus sizes.

    Args:
        relative_sizes: relative corpus sizes to iterate over.
        sample_size: sample size used to locate the sample/plot folders.
        iterations: number of iterations per relative size.
        namespace: config dict; "tag_coll_a"/"tag_coll_b" label the groups.
        format: raster format for the first saved figure. Fix: this
            parameter was previously ignored (".png" was hard-coded); the
            default keeps the old behavior. A PDF copy is always written.

    Side effects: prints per-x mean/std tables and writes
    "vocab_miss_mean.<format>" plus "vocab_miss_mean.pdf" to the plot folder.
    """
    plt.style.use('seaborn-paper')

    # Init plot folders
    plot_path = build_basepath("plots", namespace, sample_size, appendix="-" + str(iterations) + "i")

    # Load data: rel_size -> concatenated per-document mismatch proportions
    base_path = build_basepath("samples", namespace, sample_size)
    data_a = dict()
    data_b = dict()
    for rel_size, i, adj_path in rel_size_iteration_loop(relative_sizes, iterations, base_path):
        if rel_size not in data_a:
            data_a[rel_size] = []
            data_b[rel_size] = []

        with open(adj_path + "eval/mismatch_a.p", "rb") as f_a:
            data_a[rel_size] += pickle.load(f_a)

        with open(adj_path + "eval/mismatch_b.p", "rb") as f_b:
            data_b[rel_size] += pickle.load(f_b)

    # Set x values
    x_values = sorted(data_a.keys())

    # ----------------
    # Plot Means + Std
    # ----------------

    # Calc stats for x values
    means_a, std_a = list(zip(*[calculate_deviation_stats(data_a[x]) for x in x_values]))
    means_b, std_b = list(zip(*[calculate_deviation_stats(data_b[x]) for x in x_values]))

    # Command line output
    _print_mismatch_stats(namespace["tag_coll_a"] + " Vocab Mismatch",
                          x_values, means_a, std_a)
    _print_mismatch_stats("\n" + namespace["tag_coll_b"] + " Vocab Mismatch",
                          x_values, means_b, std_b)

    plt.rc('text', usetex=True)

    # Plot overlapping mean/std bands for both groups
    fig, ax = plt.subplots(figsize=(5.7, 3))
    _plot_mismatch_band(ax, x_values, means_a, std_a, "red",
                        namespace["tag_coll_a"])
    _plot_mismatch_band(ax, x_values, means_b, std_b, "blue",
                        namespace["tag_coll_b"])

    lgd = ax.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3, ncol=4,
                    mode="expand", borderaxespad=0.)
    ax.set_xlim(0.1, 0.9)
    ax.set_ylim(0, 1)

    # Set axis labels
    ax.set_xlabel("Percentage of '"+namespace["tag_coll_a"]+"' documents in the training corpus")
    ax.set_ylabel("Proportion of Vocabulary Missmatch in a Test Document")

    # Honor the `format` parameter (defaults to the previous ".png" output)
    fig.savefig(plot_path + "vocab_miss_mean." + format,
                bbox_extra_artists=(lgd,), bbox_inches='tight', format=format)
    fig.savefig(plot_path + "vocab_miss_mean.pdf", bbox_extra_artists=(lgd,), bbox_inches='tight', format="pdf")


def _print_mismatch_stats(header, x_values, means, stds):
    # Print "<x> - <mean> - <std>" per relative size under the given header.
    print(header)
    for x, mean, std in zip(x_values, means, stds):
        print(str(x) + " - " + str(mean) + " - " + str(std))


def _plot_mismatch_band(ax, x_values, means, stds, color, tag):
    # Draw the mean curve plus a shaded +/- one-std band for one group.
    upper = np.array(means) + stds
    lower = np.array(means) - stds
    # Fill space in background
    ax.fill_between(x_values, upper, lower, alpha=0.3, facecolor=color)
    # plot mean
    ax.plot(x_values, means, color=color, linewidth=2.0, label=tag + " mean")
    # plot std (dashed upper/lower bounds; only the upper carries a label)
    ax.plot(x_values, upper, '--', color=color, linewidth=0.5,
            label=tag + " std")
    ax.plot(x_values, lower, '--', color=color, linewidth=0.5)
def plot_topic_group_distribution_triple(relative_sizes, sample_size, iterations, namespace, num_topics):
    """Render two 1x3 figures, one subplot per topic count: the topic-group
    distribution plot and the topic/corpus ratio plot.

    num_topics: iterable of topic counts; the 1x3 subplot grids assume
        exactly three entries. Also emits a LaTeX table per topic count.
    """
    plt.style.use('seaborn-paper')
    plt.rc('text', usetex=True)

    plot_path = build_basepath("plots", namespace, sample_size, appendix="-" + str(iterations) + "i")

    # Load data
    data_1_all = []
    data_2_all = []
    x_values = []
    # Load data.
    # NOTE(review): x_values is overwritten each pass, so the value from the
    # last topic count is used for every subplot — presumably identical
    # across topic counts; confirm.
    for topics in num_topics:
        x_values, data_1, data_2 = load_inference_data(namespace, sample_size, topics)
        data_1_all.append(data_1)
        data_2_all.append(data_2)
        output_latex_table(x_values, data_2, topics, namespace)

    # Plot 1
    # f, axarr = plt.subplots(3, figsize=(2.2, 7), sharex=True, sharey=True)
    f, axarr = plt.subplots(1,3, figsize=(9,2), sharey=True)

    for index, topics in enumerate(num_topics):
        current_ax = axarr[index]
        plot_topic_dist(x_values, data_1_all[index], current_ax)
        current_ax.set_title(str(topics) + ' Topics')
        #current_ax.set_xlim(0.1, 0.9)
        current_ax.set_ylim(0, 1)

    # Attach the legend to the middle subplot, entries in reversed order
    legend_ax = axarr[1]
    handles, labels = legend_ax.get_legend_handles_labels()
    handles = [handles[2], handles[1], handles[0]]
    labels = [labels[2], labels[1], labels[0]]
    # Set x limits
    lgd = legend_ax.legend(handles=handles, labels=labels, bbox_to_anchor=(0., 1.13, 1., .102), loc=3, ncol=3, mode="expand",
                          borderaxespad=0.)

    # Set axis labels
    # axarr[-1].set_xlabel("Perc. of '" + namespace["tag_coll_a"] + "' docs in training corpus")

    axarr[1].set_xlabel("Percentage of group's documents in the trainings corpus")
    axarr[0].set_ylabel("Proportion")

    # NOTE(review): the "_triple_triple" pdf suffix looks unintended —
    # confirm the expected artifact name before changing it.
    f.savefig(plot_path + "lda_all_topic_dist_triple.png", bbox_extra_artists=(lgd,), bbox_inches='tight')
    f.savefig(plot_path + "lda_all_topic_dist_triple_triple.pdf", bbox_extra_artists=(lgd,), bbox_inches='tight',
              format="pdf")

    # Plot 2
    # f2, axarr2 = plt.subplots(1,3, figsize=(5, 7), sharex=True, sharey=True)
    f2, axarr2 = plt.subplots(1,3, figsize=(9, 2) ,sharey=True)

    for index, topics in enumerate(num_topics):
        current_ax = axarr2[index]
        plot_topic_ratio(x_values, data_2_all[index], current_ax)
        current_ax.set_title(str(topics) + ' Topics')
        current_ax.set_xlim(0.1, 0.9)
        current_ax.set_ylim(0, 1)

    # Set x limits
    lgd2 = axarr2[1].legend(bbox_to_anchor=(0., 1.13, 1., .102), loc=3, ncol=2,
                           mode="expand",
                           borderaxespad=0.)
    # Set axis labels
    #axarr2[-1].set_xlabel("Percentage of group's documents in the trainings corpus")
    axarr2[0].set_ylabel("Topic/Corpus Ratio")
    axarr2[1].set_xlabel("Percentage of group's documents in the trainings corpus")

    # NOTE(review): "_triple_triple" suffix again — see note above plot 1.
    f2.savefig(plot_path + "lda_all_topic_ratio_triple.png", bbox_extra_artists=(lgd2,), bbox_inches='tight')
    f2.savefig(plot_path + "lda_all_topic_ratio_triple_triple.pdf", bbox_extra_artists=(lgd2,), bbox_inches='tight',
              format="pdf")
def plot_corpus_stats(namespace, format="png", *args, **kwargs):
    """Generate corpus statistics plots for every data set in
    namespace["data_sets"]: word-frequency (Zipf) curves, length CDFs,
    pairwise length-difference distributions and (non-wiki mode) source
    distributions. Also writes a textual stats overview per data set.

    Args:
        namespace: config dict; reads "data_sets", "tag_coll_a",
            "tag_coll_b" and "mode".
        format: raster output format for most plots (PDF copies are also
            written for several of them).
        *args, **kwargs: forwarded to load_dump_data().
    """
    plt.style.use('seaborn-paper')
    plt.rc('text', usetex=True)

    data_sets = namespace["data_sets"]
    for data_set in data_sets:
        # if data_set["path"].endswith("cleaned.tsv"):
        #    print("Skipped: "+data_set["path"])
        #    continue

        file_path = data_set["path"]

        # Plot subfolder named after the data file (extension stripped)
        base_plotpath = build_basepath("plots", namespace, subfolder = ".".join(file_path.strip().split("/")[-1].split(".")[:-1]))

        a_lens, b_lens, len_diffs, len_diffs_perc,\
        global_dict, uniq_words_a, uniq_words_b = load_dump_data(file_path=file_path,
                                                   formatted=data_set["formatted"],
                                                   index_a=data_set["index_a"],
                                                   index_b=data_set["index_b"],
                                                   *args, **kwargs)

        write_stat_overview(base_plotpath + "stats.txt", namespace, a_lens, b_lens, len_diffs, len_diffs_perc, global_dict, uniq_words_a, uniq_words_b)

        # Word Frequency Plots (log-log rank vs. frequency)
        fig, ax = plt.subplots(figsize=(5.7,3))
        # NOTE(review): plt.gca() re-fetches the axes just created above —
        # presumably the same object; the reassignment looks redundant.
        ax = plt.gca()
        ax.set_xlabel("Word Rank")
        file_name = "word_frequency_global"
        ax.set_xscale("log")
        ax.set_yscale("log")
        plot_word_frequency(ax, global_dict, key="freq_global", linewidth=2)
        fig.tight_layout()
        fig.savefig(base_plotpath + file_name + "." + format, format=format)

        # Per-group word-frequency curves (A in red, B in blue)
        fig, ax = plt.subplots(figsize=(5.7,3))
        ax = plt.gca()
        ax.set_xlabel("Word Rank")
        ax.set_xscale("log")
        ax.set_yscale("log")
        file_name = "word_frequency_" + namespace["tag_coll_a"] + "_" + namespace["tag_coll_b"]
        plot_word_frequency(ax, global_dict, key="freq_a", color="r", linewidth=2, label= namespace["tag_coll_a"] + " Words")
        plot_word_frequency(ax, global_dict, key="freq_b", color="b", linewidth=2, label= namespace["tag_coll_b"] + " Words")
        ax.legend()
        fig.tight_layout()
        fig.savefig(base_plotpath + file_name + "." + format, format=format)
        fig.savefig(base_plotpath + file_name + ".pdf", format="pdf")

        plt.clf()

        ''' Disabled
        # Document Length Distribution
        plt.figure()
        file_name = namespace["tag_coll_a"] + "_distribution_max1000"
        title = namespace["tag_coll_a"].upper() + " length distribution"
        plot_histogram(a_lens, mode="a", bin_to=1000)
        plt.xlabel("Document Length")
        # plt.title(title)
        plt.savefig(base_plotpath+file_name+"."+format,format=format)

        plt.figure()
        file_name = namespace["tag_coll_b"] + "_distribution_max1000"
        title = namespace["tag_coll_b"].upper() + " length distribution"
        plot_histogram(b_lens, mode="b", bin_to=1000)
        plt.xlabel("Document Length")
        # plt.title(title)
        plt.savefig(base_plotpath + file_name + "." + format, format=format)


        # Length Difference Distribution
        plt.figure()
        file_name = "diff_distribution_sbs1000"
        title = namespace["tag_coll_a"].upper()+"/"+namespace["tag_coll_b"].upper()+" length difference distribution"
        plot_diff_histogram(len_diffs, display_mode="sbs", namespace=namespace, bin_from=-1000, bin_to=1000, bin_gran=100)
        plt.xlabel("Pairwise length difference")
        # plt.title(title)
        plt.savefig(base_plotpath + file_name + "." + format, format=format)

        # Document Length CDF
        plt.figure()
        file_name = namespace["tag_coll_a"] + "_cdf_log"
        title = namespace["tag_coll_a"].upper() + " length CDF"
        plot_cdf(a_lens, mode="a")
        # plt.title(title)
        plt.xlabel("Document Length")
        plt.xscale("log")
        plt.yscale("log")
        plt.savefig(base_plotpath+file_name+"."+format,format=format)

        plt.figure()
        file_name = namespace["tag_coll_b"] + "_cdf_log"
        title = namespace["tag_coll_b"].upper() + " length CDF"
        plot_cdf(b_lens, mode="b")
        plt.xlabel("Document Length")
        # plt.title(title)
        plt.xscale("log")
        plt.yscale("log")
        plt.savefig(base_plotpath+file_name+"."+format,format=format)

        # Length Difference CDF
        plt.figure()
        file_name = "diffs_cdf"
        title = namespace["tag_coll_a"].upper()+"/"+namespace["tag_coll_b"].upper()+" length difference CDF"
        plot_diff_cdf(len_diffs, namespace=namespace)
        # plt.title(title)
        plt.xlabel("Pairwise length difference")
        plt.xscale("log")
        plt.yscale("log")
        plt.savefig(base_plotpath+file_name+"."+format,format=format)

        plt.figure()
        file_name = namespace["tag_coll_a"] + "_cdf_log_cut"
        title = namespace["tag_coll_a"].upper() + " length CDF"
        plot_cdf(a_lens, mode="a", cutoff=[50])
        # plt.title(title)
        plt.xlabel("Document Length")
        plt.xscale("log")
        plt.yscale("log")
        plt.savefig(base_plotpath+file_name+"."+format,format=format)

        plt.figure()
        file_name = namespace["tag_coll_b"] + "_cdf_log_cut"
        title = namespace["tag_coll_b"].upper() + " length CDF"
        plot_cdf(b_lens, mode="b", cutoff=[50])
        # plt.title(title)
        plt.xlabel("Document Length")
        plt.xscale("log")
        plt.yscale("log")
        plt.savefig(base_plotpath+file_name+"."+format,format=format)
        '''

        # Combined length CDF for both groups (log-log, B drawn first)
        plt.figure(figsize=(5.7,3))
        file_name = namespace["tag_coll_a"] + "_" + namespace["tag_coll_b"] + "_cdf_log_cut"
        plot_cdf(b_lens, mode="b", cutoff=[50],label=True)
        plot_cdf(a_lens, mode="a", label=True)
        plt.xscale("log")
        plt.yscale("log")
        plt.legend(loc="lower right")
        ax = plt.gca()
        ax.set_xlabel("Document Length")
        plt.tight_layout()
        plt.savefig(base_plotpath + file_name + "." + format, format=format)
        plt.savefig(base_plotpath + file_name + ".pdf", format="pdf")
        plt.clf()


        # Percentual Length Difference CDF
        plt.figure(figsize=(5.7,3))
        file_name = "diffs_cdf_perc_total"
        title = namespace["tag_coll_a"].upper()+"/"+namespace["tag_coll_b"].upper()+" Percentual length difference CDF"
        plot_diff_cdf(len_diffs_perc, namespace=namespace, at_x=[1])
        # plt.title(title)
        plt.xlim([0, 3])
        plt.xlabel("Pairwise percentual length difference")
        plt.tight_layout()
        plt.savefig(base_plotpath+file_name+"."+format,format=format)
        plt.savefig(base_plotpath + file_name + ".pdf", format="pdf")
        plt.clf()

        '''
        plt.figure()
        file_name = "diffs_cdf_perc_total_threshold"
        title = namespace["tag_coll_a"].upper() + "/" + namespace[
            "tag_coll_b"].upper() + " Percentual length difference CDF"
        plot_diff_cdf(len_diffs_perc, namespace=namespace)
        # plt.title(title)
        plt.xlim([0, 1])
        plt.xlabel("Pairwise percentual length difference")
        plt.savefig(base_plotpath + file_name + "." + format, format=format)
        '''

        # Difference Frequency Distribution (clipped to [-2, 2])
        plt.figure(figsize=(5.7,3))
        file_name = "diff_counts_cut-2"
        title = namespace["tag_coll_a"].upper()+"/"+namespace["tag_coll_b"].upper()+" overhead cases"
        plot_counts(len_diffs_perc, namespace, cutoff=[-2, 2])
        plt.xlim([-0.5, 2])
        # plt.title(title)
        plt.tight_layout()
        plt.savefig(base_plotpath+file_name+"."+format,format=format)
        plt.savefig(base_plotpath + file_name + ".pdf", format="pdf")
        plt.close("all")

        # Source distributions only exist outside "wiki" mode
        if namespace["mode"] != "wiki":
            sources_a, sources_b, sources_pair = load_sources(file_path,namespace)

            # Source Frequency Distribution
            plt.figure(figsize=(5.7,3))
            file_name = "source_dist_"+namespace["tag_coll_a"]
            plot_sources_dist(sources_a,namespace)
            plt.savefig(base_plotpath + file_name + "." + format, format=format)
            plt.savefig(base_plotpath + file_name + ".pdf", format="pdf")
            plt.clf()

            plt.figure(figsize=(5.7,3))
            file_name = "source_dist_" + namespace["tag_coll_b"]
            plot_sources_dist(sources_b, namespace)
            plt.savefig(base_plotpath + file_name + "." + format, format=format)
            plt.savefig(base_plotpath + file_name + ".pdf", format="pdf")
            plt.clf()

            # Pairwise source matrix (square aspect, high-dpi raster)
            plt.figure(figsize=(5.7,5))
            plt.axes().set_aspect('equal')
            file_name = "source_dist_pairs"
            plot_pair_matrix(sources_a, sources_b, sources_pair, namespace)
            plt.savefig(base_plotpath + file_name + "." + format, format=format, dpi=600)
            plt.savefig(base_plotpath + file_name + ".pdf", format="pdf")