Ejemplo n.º 1
0
 def get_0nM_enrichment_dict(self):
     """
     - Returns a dictionary of {kmer: enrichment}, pairing the kmers from
         RBNS_utils.yield_kmers(self.k) positionally with the entries of
         self.enrichments_to_0nM

     - Asserts that self.enrichments_to_0nM has exactly 4^k entries
     """
     num_enrichments = len(self.enrichments_to_0nM)
     assert num_enrichments == 4**self.k
     kmers = RBNS_utils.yield_kmers(self.k)
     return dict(zip(kmers, self.enrichments_to_0nM))
Ejemplo n.º 2
0
 def save_0nM_enrichments(self, enrich_pkl):
     """
     - Pickles self.enrichments_to_0nM to enrich_pkl as a dictionary of
         {kmer: enrichment}

     - INPUTs:
         - enrich_pkl: path of the pickle file to write (opened with 'wb')

     - k is inferred from the number of enrichments, and the kmers from
         RBNS_utils.yield_kmers(k) are paired with the enrichments by index
     """
     num_kmers = len(self.enrichments_to_0nM)
     #### NOTE(review): int() truncates, so this relies on math.log not
     ####    returning a value slightly below k when num_kmers == 4^k
     k = int(math.log(num_kmers, 4.))
     enriches_by_kmer_D = {
         kmer: self.enrichments_to_0nM[kmer_num]
         for kmer_num, kmer in enumerate(RBNS_utils.yield_kmers(k))
     }
     #### Use a context manager so the file is flushed & closed right away,
     ####    also if the dump raises
     with open(enrich_pkl, 'wb') as pkl_f:
         cPickle.dump(enriches_by_kmer_D, pkl_f)
Ejemplo n.º 3
0
 def load_0nM_enrichments(self, enrich_pkl):
     """
     - Loads a pickled dictionary of {kmer: enrichment} from enrich_pkl and
         stores the enrichments in self.enrichments_to_0nM as a numpy array,
         ordered like RBNS_utils.yield_kmers(k)

     - INPUTs:
         - enrich_pkl: path of the pickle file to read (opened with 'rb')

     - k is inferred from the number of entries in the loaded dictionary
     """
     #### NOTE: unpickling can execute arbitrary code, so enrich_pkl should
     ####    only ever be a file written by this pipeline
     #### Use a context manager so the file handle is closed right away
     with open(enrich_pkl, 'rb') as pkl_f:
         enriches_by_kmer_D = cPickle.load(pkl_f)
     k = int(math.log(len(enriches_by_kmer_D), 4.))
     enriches_L = [
         enriches_by_kmer_D[kmer] for kmer in RBNS_utils.yield_kmers(k)
     ]
     self.enrichments_to_0nM = np.array(enriches_L)
Ejemplo n.º 4
0
 def save_enrichments(self, enrich_pkl):
     """
     - Pickles self.enrichments to enrich_pkl as a dictionary of
         {kmer: enrichment}

     - INPUTs:
         - enrich_pkl: path of the pickle file to write (opened with 'wb');
             its directory is passed to RBNS_utils.make_dir() first

     - k is inferred from the number of enrichments, and the kmers from
         RBNS_utils.yield_kmers(k) are paired with the enrichments by index
     """
     RBNS_utils.make_dir(os.path.dirname(enrich_pkl))
     num_kmers = len(self.enrichments)
     #### NOTE(review): int() truncates, so this relies on math.log not
     ####    returning a value slightly below k when num_kmers == 4^k
     k = int(math.log(num_kmers, 4.))
     enriches_by_kmer_D = {
         kmer: self.enrichments[kmer_num]
         for kmer_num, kmer in enumerate(RBNS_utils.yield_kmers(k))
     }
     #### Use a context manager so the file is flushed & closed right away,
     ####    also if the dump raises
     with open(enrich_pkl, 'wb') as pkl_f:
         cPickle.dump(enriches_by_kmer_D, pkl_f)
Ejemplo n.º 5
0
def get_kmer_freqs_from_reads_F(reads_F, k, vals_sum_to="sumto1"):
    """
    - Returns the kmer counts & freqs in reads_F

    - INPUTs:
        - reads_F: path to a text file with one read per line
        - k: the kmer length
        - vals_sum_to
            "sumto1": all 4^k entries sum to 1
            "sumto4^k": all 4^k entries sum to 4^k
            "none": no normalization is done

    - RETURNS:
        - if vals_sum_to is "none": counts_by_kmer_D itself
        - otherwise:
            return_D = {"counts_by_kmer_D": counts_by_kmer_D,
                        "freqs_by_kmer_D": freqs_by_kmer_D}

    - RAISES:
        - ValueError if vals_sum_to is not one of the three values above
    """
    #### Validate before doing any work, so that a bad argument fails
    ####    immediately rather than after reading through all of reads_F
    if vals_sum_to not in ("none", "sumto1", "sumto4^k"):
        raise ValueError(
            "{0} IS NOT A VALID vals_sum_to ARGUMENT. REPLACE AND TRY AGAIN".
            format(vals_sum_to))

    #### Start every kmer from RBNS_utils.yield_kmers() at a count of 0
    counts_by_kmer_D = dict.fromkeys(RBNS_utils.yield_kmers(k), 0)

    with open(reads_F) as f:
        for line in f:
            read = line.strip()
            for start_pos in range(len(read) - k + 1):
                kmer = read[start_pos:(start_pos + k)]
                #### only count kmers that were initialized above, which
                ####    skips any kmer that, e.g., contains an N
                if kmer in counts_by_kmer_D:
                    counts_by_kmer_D[kmer] += 1

    if (vals_sum_to == "none"):
        return counts_by_kmer_D

    #### Normalize using the helper function in RBNS_utils
    if (vals_sum_to == "sumto1"):
        freqs_by_kmer_D = RBNS_utils.normalize_D(counts_by_kmer_D)
    else:
        freqs_by_kmer_D = RBNS_utils.normalize_D(counts_by_kmer_D,
                                                 vals_sum_to="sumto4^k")

    return {
        "counts_by_kmer_D": counts_by_kmer_D,
        "freqs_by_kmer_D": freqs_by_kmer_D
    }
Ejemplo n.º 6
0
 def weight_dict(self):
     """
     - Returns a dictionary of {kmer: weight}, pairing the kmers from
         RBNS_utils.yield_kmers(self.k) positionally with the entries of
         self.profile
     """
     kmers = RBNS_utils.yield_kmers(self.k)
     return dict(zip(kmers, self.profile))
Ejemplo n.º 7
0
def return_frequency_and_number_of_reads_kmer_in_reads_F(reads_F, kmer):
    """
    - For a reads_F, makes a new out_reads_F in the same directory
        in which each occurrence of the kmer is replaced with "X"s
    - Called by functions in RBNS_logos.py

    - INPUTs:
        - reads_F: path to a reads file with one read per line
        - kmer: the kmer to mask out of each read

    - RETURNS:
            return_D = {"out_reads_F": out_reads_F,
                    "tot_num_reads": tot_num_reads,
                    "num_reads_w_kmer": num_reads_w_kmer,
                    "freq_reads_w_kmer": freq_reads_w_kmer,
                    "tot_num_kmer_occurs" : tot_num_kmer_occurs,
                    "counts_by_kmer_D": counts_by_kmer_D,
                    "freqs_by_kmer_D": freqs_by_kmer_D}

        where counts_by_kmer_D / freqs_by_kmer_D are calculated from the
        reads AFTER the kmer has been masked out
    """
    k = len(kmer)
    read_len = get_readlength(reads_F)

    orig_reads_DIR = os.path.dirname(reads_F)
    orig_reads_basename = os.path.basename(reads_F)
    out_basename = orig_reads_basename.rsplit(".", 1)[0] +\
            "_{}.reads".format( kmer )
    #### If the file name is 100 characters or more, shorten it
    if (len(out_basename) >= 100):
        out_basename = "{}.reads".format(kmer)
    out_reads_F = os.path.join(orig_reads_DIR, out_basename)

    #### The number of reads and number of times a kmer was found
    tot_num_reads = 0
    num_reads_w_kmer = 0
    tot_num_kmer_occurs = 0

    #### A dictionary of kmer counts for the reads written out
    counts_by_kmer_D = dict.fromkeys(RBNS_utils.yield_kmers(k), 0)

    masked_kmer = "X" * k
    num_kmers_per_read = read_len - k + 1

    #### Context managers make sure both files get closed, also if an
    ####    exception is raised part-way through
    with open(reads_F) as reads_f, open(out_reads_F, "w") as out_reads_f:
        for line in reads_f:
            tot_num_reads += 1
            read = line.strip()

            #### str.count() and str.replace() both work on the
            ####    non-overlapping occurrences found scanning left to right.
            ####    As the "X"s can never be part of a new match (for a kmer
            ####    without an "X"), this is the same as repeatedly finding
            ####    the first remaining occurrence and masking it out
            num_occurs = read.count(kmer)
            if (num_occurs > 0):
                num_reads_w_kmer += 1
                tot_num_kmer_occurs += num_occurs
                read = read.replace(kmer, masked_kmer)

            #### Count the kmers of the masked read; windows with an "X"
            ####    (or anything else not in counts_by_kmer_D) are skipped
            for start_pos in range(num_kmers_per_read):
                this_kmer = read[start_pos:(start_pos + k)]
                if this_kmer in counts_by_kmer_D:
                    counts_by_kmer_D[this_kmer] += 1

            #### The file object buffers writes already, so no need to
            ####    collect the reads in a list first
            out_reads_f.write(read + "\n")

    #### Normalize the counts_by_kmer_D into freqs
    freqs_by_kmer_D = RBNS_utils.normalize_D(counts_by_kmer_D)
    freq_reads_w_kmer = float(num_reads_w_kmer) / tot_num_reads

    return_D = {
        "out_reads_F": out_reads_F,
        "tot_num_reads": tot_num_reads,
        "num_reads_w_kmer": num_reads_w_kmer,
        "freq_reads_w_kmer": freq_reads_w_kmer,
        "tot_num_kmer_occurs": tot_num_kmer_occurs,
        "counts_by_kmer_D": counts_by_kmer_D,
        "freqs_by_kmer_D": freqs_by_kmer_D
    }

    return return_D
def calc_Ppaired_over_top_enriched_kmers_and_flanking(reads_w_struct_F,
                                                      k,
                                                      fiveP_adapter,
                                                      threeP_adapter,
                                                      random_read_len,
                                                      num_bins=5):
    """
    - For an input reads_w_struct_F like:
            RBFOX3_input.w_struc.reads.gz
            RBFOX3_20.w_struc.reads.gz,

        gets all occurrences of each of the kmers and
        calculates the Ppaired over each position of the motif & 10 bases
        flanking it upstream & downstream, as well as the average Ppaired
        over each motif occurrence (i.e., which of the num_bins Ppaired bins
        it should go into for later calculating the R by Ppaired bin)

    - INPUTs:
        - reads_w_struct_F: read in records of 4 lines; the 1st line of a
            record is used as the read (with adapters) and the 2nd line as
            the space-separated Ppaired value of each base
        - k: the kmer length
        - fiveP_adapter / threeP_adapter: the adapter sequences (only their
            lengths are used, to cut out the random region)
        - random_read_len: length of the random region; asserted against
            the first record of reads_w_struct_F
        - num_bins: number of Ppaired bins; must be 5 or 10

    - Pickles an output dictionary in out_Ds_DIR for later loading & analysis
        (nothing is returned; if the output file already exists, returns
        immediately without recalculating):
            D = {"num_reads": number of records processed,
                 "Ppair_and_count_by_kmer_idx_D":
                    {kmer: {rel_idx: {'counts': , 'Ppaired_sum': }}}
                        for rel_idx from -10 to k + 9,
                 "counts_by_kmer_binidx_D": {kmer: {bin_idx: count}}}
    """
    assert (num_bins in [5, 10])
    starting_basename = os.path.basename(reads_w_struct_F).split('.w_st')[0]

    fiveP_len = len(fiveP_adapter)
    threeP_len = len(threeP_adapter)

    out_Ds_DIR = os.path.join(os.path.dirname(reads_w_struct_F), 'Ppaired_Ds',
                              str(k))
    RBNS_utils.make_dir(out_Ds_DIR)

    out_D_F = os.path.join(out_Ds_DIR, "{0}.D.pkl".format(starting_basename))
    if os.path.exists(out_D_F):
        return

    ##### Make sure that the adapter lengths & random read length match up
    #####   (only the first record is checked)
    for lines_L in RBNS_utils.iterNlines(reads_w_struct_F,
                                         4,
                                         strip_newlines=True):

        read_w_adapter = lines_L[0]
        calculated_random_read_len = len(
            read_w_adapter) - fiveP_len - threeP_len
        assert (calculated_random_read_len == random_read_len)
        break

    upper_index_of_random = random_read_len + fiveP_len
    num_kmers_each_read = random_read_len - k + 1
    #### number of flanking positions recorded on each side of the kmer
    flank_len = 10

    #### Pick the binning function once rather than for every kmer
    if (num_bins == 5):
        get_bin_idx = get_bin_of_5_from_mean_Ppaired
    else:
        get_bin_idx = get_bin_of_10_from_mean_Ppaired

    D = {
        "num_reads": 0,
        "Ppair_and_count_by_kmer_idx_D": {},
        "counts_by_kmer_binidx_D": {}
    }

    for kmer in RBNS_utils.yield_kmers(k):
        D["counts_by_kmer_binidx_D"][kmer] = {}
        for i in range(num_bins):
            D["counts_by_kmer_binidx_D"][kmer][i] = 0
        D["Ppair_and_count_by_kmer_idx_D"][kmer] = {}
        for idx in range(-flank_len, k + flank_len):
            D["Ppair_and_count_by_kmer_idx_D"][kmer][idx] = {
                'counts': 0,
                'Ppaired_sum': 0.
            }

    for lines_L in RBNS_utils.iterNlines(reads_w_struct_F,
                                         4,
                                         strip_newlines=True):

        #### Cut the random region out of the read & its Ppaired values
        read_w_adapter = lines_L[0]
        random_seq = read_w_adapter[fiveP_len:upper_index_of_random]

        Ppaired_L = lines_L[1].split(" ")
        pruned_Ppaired_L = [
            float(x) for x in Ppaired_L[fiveP_len:upper_index_of_random]
        ]

        D["num_reads"] += 1

        for start_idx in range(num_kmers_each_read):

            #### Get the kmers in the read
            kmer = random_seq[start_idx:(start_idx + k)]
            Ppaired_kmer_L = pruned_Ppaired_L[start_idx:(start_idx + k)]

            mean_Ppaired = np.mean(Ppaired_kmer_L)
            bin_idx = get_bin_idx(mean_Ppaired)

            #### NOTE(review): a kmer that was not initialized above (e.g.,
            ####    one containing an N) raises a KeyError here - confirm
            ####    that the reads are filtered upstream
            D["counts_by_kmer_binidx_D"][kmer][bin_idx] += 1

            ##### Go through and get all of the Ppaired flanking
            by_rel_idx_D = D["Ppair_and_count_by_kmer_idx_D"][kmer]
            for rel_idx in range(-flank_len, flank_len + k):

                this_idx = start_idx + rel_idx
                #### A bounds comparison instead of a membership test in
                ####    range( random_read_len ), which is a linear scan
                ####    of a list in Python 2
                if (0 <= this_idx < random_read_len):
                    by_rel_idx_D[rel_idx]['counts'] += 1
                    by_rel_idx_D[rel_idx]['Ppaired_sum'] +=\
                            pruned_Ppaired_L[this_idx]

    ##### Pickle to out_D_F
    RBNS_utils.pkl_with_formatfile(D, out_D_F)
def analyze_freqs_by_position_one_library(protein,
                                          main_DIR,
                                          conc_for_fastq,
                                          ks_L,
                                          make_output_Fs=True,
                                          num_controls=20,
                                          max_log2_val_colormap=None):
    """
    -   Calculates the KL divergence of
        (Uniform across read || Observed freqs. across read) for each kmer,
        and outputs a .txt table with kmers in descending order of
        KL Divergence

    - INPUTs:
            - protein:
                - used in the pickle file name & in the table/plot titles
            - main_DIR:
                - the pickled frequencies are loaded from
                    main_DIR/frequency_Ds; tables are written to
                    main_DIR/tables/by_position
            - conc_for_fastq:
                - "input", or the concentration (in nM) of the library
            - ks_L:
                - the k's to analyze
            - make_output_Fs:
                - If True, makes a .txt out file and a plot;
                - If False, doesn't make .txt/.pdf (this is used the first time
                    around to get the maximum absolute log2 value so that on
                    the second time around when plots are made, all the
                    colorbars can be coordinated together)
            - num_controls:
                - the number of evenly spaced "control" kmers (those with
                    KL divergence below the mean) to include in the plot
            - max_log2_val_colormap:
                - If passed in, the heatmap colorbar will go from
                    -max_log2_val_colormap to max_log2_val_colormap

    - RETURNS:
            return_D[k] = {"max_abs_log2_plotted": max_abs_log2_plotted}

        and, if make_output_Fs is True, additionally the keys:
            "out_F", "kmer_to_KLDiv_D", "fig"
    """
    return_D = {}

    if (conc_for_fastq == "input"):
        conc_label = "Input lib."
    else:
        conc_label = "{} nM lib.".format(conc_for_fastq)

    frequency_Ds_DIR = os.path.join(main_DIR, "frequency_Ds")
    RBNS_utils.make_dir(frequency_Ds_DIR)

    #### go through each of the k's
    for k in ks_L:

        #### Load the previously pickled dictionary of kmer frequencies at each
        ####    position (pickles are binary data, so open with "rb")
        D_F = os.path.join(
            frequency_Ds_DIR,
            "{0}_{1}.{2}mer.frequencies.by_position.pkl".format(
                protein, conc_for_fastq, k))
        with open(D_F, "rb") as f:
            freqs_by_pos_kmer_D = pickle.load(f)
        num_kmers_per_rd = len(freqs_by_pos_kmer_D)

        #### A uniform distribution over all positions in the read
        uniform_L = [1. / num_kmers_per_rd] * num_kmers_per_rd

        #### Now go through each of the kmers and get the KL divergence of
        ####    KLDiv( uniform || observed freqs. across read )
        kmer_KLDiv_tuples_L = []
        kmer_to_KLDiv_D = {}
        for kmer in RBNS_utils.yield_kmers(k):
            obs_freqs_L = [
                freqs_by_pos_kmer_D[x][kmer] for x in range(num_kmers_per_rd)
            ]
            sum_obs_freqs = sum(obs_freqs_L)
            #### Normalize the obs_freqs_L
            obs_freqs_L = [x / sum_obs_freqs for x in obs_freqs_L]

            #### Get the KL Divergence
            KL = RBNS_utils.KL_divergence(uniform_L, obs_freqs_L)
            kmer_to_KLDiv_D[kmer] = KL
            #### Also get the log2(OBSERVED/UNIFORM) at each position
            observed_over_unif_L = []
            for obs_freq, unif_freq in zip(obs_freqs_L, uniform_L):
                try:
                    log2_ratio = math.log(obs_freq / unif_freq, 2)
                except ValueError:
                    #### math.log() raises this for an observed freq. of 0
                    #### NOTE(review): the placeholder here is 1., while
                    ####    analyze_freqs_by_position_one_barcodes_ordered_kmers_to_consider()
                    ####    uses 0. - confirm which one is intended
                    log2_ratio = 1.
                observed_over_unif_L.append(log2_ratio)

            kmer_KLDiv_tuples_L.append((kmer, KL, observed_over_unif_L))

        #### Sort the kmers by decreasing KL divergence
        kmer_KLDiv_tuples_L.sort(key=lambda x: -1 * x[1])

        #### Get the mean KL divergence (the std. is not used)
        KL_divs = [x[1] for x in kmer_KLDiv_tuples_L]
        mean_KL, _ = RBNS_utils.mean_std(KL_divs)

        #### The "significant" kmers are the 30 with the highest KL
        ####    divergence (no Z-score threshold is applied)
        top_tuples_L = kmer_KLDiv_tuples_L[:30]
        sig_kmers_to_KL_Div_D = {}
        sig_kmers_to_log2_Obs_over_Exp_L_D = {}
        #### the maximum absolute log2 value plotted, so that all of the
        ####    libraries can have the same colormap scale
        max_abs_log2_plotted = 0.
        for kmer, KL, observed_over_unif_L in top_tuples_L:
            sig_kmers_to_KL_Div_D[kmer] = KL
            sig_kmers_to_log2_Obs_over_Exp_L_D[kmer] = observed_over_unif_L
            #### update max_abs_log2_plotted
            max_abs_log2_plotted = max(max_abs_log2_plotted,
                                       max(observed_over_unif_L),
                                       -1 * min(observed_over_unif_L))

        #### Get "control" kmer distributions that have KL divergence below
        ####    the mean
        ctrl_STD_tuples_L = [x for x in kmer_KLDiv_tuples_L if x[1] < mean_KL]
        control_delta = int(len(ctrl_STD_tuples_L) / float(num_controls))
        #### go through and get the num_controls evenly spaced controls to
        ####    plot
        control_tuples_L = [
            ctrl_STD_tuples_L[control_delta * x] for x in range(num_controls)
        ]
        #### Go through each of the control kmers
        low_kmers_to_KL_Div_D = {}
        low_kmers_to_log2_Obs_over_Exp_L_D = {}
        for kmer, KL, observed_over_unif_L in control_tuples_L:
            low_kmers_to_KL_Div_D[kmer] = KL
            low_kmers_to_log2_Obs_over_Exp_L_D[kmer] = observed_over_unif_L
            #### update max_abs_log2_plotted
            max_abs_log2_plotted = max(max_abs_log2_plotted,
                                       max(observed_over_unif_L),
                                       -1 * min(observed_over_unif_L))

        #### add the max_abs_log2_plotted to the return_D
        return_D[k] = {"max_abs_log2_plotted": max_abs_log2_plotted}

        if make_output_Fs:
            #### Make the out_F
            out_DIR = os.path.join(main_DIR, "tables/by_position")
            RBNS_utils.make_dir(out_DIR)

            out_basename = "{0}mers.{1}greatest_KL_div_of_freqs_across_read.txt".format(
                k, conc_label.replace(" ", "_"))
            out_F = os.path.join(out_DIR, out_basename)
            with open(out_F, "w") as f_out:
                #### write a header line
                f_out.write("{0}: {1}\n".format(protein, conc_label))
                f_out.write(
                    "\tKL Div(Uniform||Observed)\t\tlog2(Obs/Unif) at Pos 1\tPos. 2\n"
                )

                #### Go through and write out all of the kmers, one line
                ####    per kmer
                for kmer, KL, observed_over_unif_L in kmer_KLDiv_tuples_L:
                    ratios_str = "".join("{0:.3f}\t".format(ratio)
                                         for ratio in observed_over_unif_L)
                    f_out.write("\n{0}\t{1:.4g}\t\t".format(kmer, KL) +
                                ratios_str)

            #### Add to (rather than replace) the entry for this k, so that
            ####    "max_abs_log2_plotted" is not lost
            return_D[k].update({
                "out_F": out_F,
                "kmer_to_KLDiv_D": kmer_to_KLDiv_D
            })

            #### Make a plot using the helper function in RBNS_plots
            returned_fig_D = RBNS_plots.make_rectangular_heatmap_plot_RBNS_freqs(
                sig_kmers_to_KL_Div_D,
                sig_kmers_to_log2_Obs_over_Exp_L_D,
                low_kmers_to_KL_Div_D,
                low_kmers_to_log2_Obs_over_Exp_L_D,
                title="{0}: {1} {2}mer frequencies across reads".format(
                    protein.replace("_", " "), conc_label, k),
                colorbar_label=r"$log_2$(Observed / Uniform freq.)",
                max_log2_val_colormap=max_log2_val_colormap)
            return_D[k]["fig"] = returned_fig_D["fig"]

    return return_D
def analyze_freqs_by_position_one_barcodes_ordered_kmers_to_consider(
        ordered_kmers_to_consider_Ls_by_k_D,
        protein,
        main_DIR,
        conc_for_fastq,
        ks_L,
        ordered_kmers_description_fnames="",
        make_output_Fs=True,
        num_controls=20,
        max_log2_val_colormap=None):
    """
    - A helper function called by the
    analyze_freqs_by_position_all_barcodes_one_protein_top_enriched_kmers()
    function

    - Loads the previously pickled dictionaries of kmer frequencies by
        position from main_DIR/frequency_Ds and
        calculates the KL divergence of
        (Uniform across read || Observed freqs. across read) for each kmer,
        and outputs a .txt table of the kmers to consider (in the order
        passed in) with their KL Divergence

    - INPUTs:
            - ordered_kmers_to_consider_Ls_by_k_D:
                - for each k in ks_L, the ordered list of kmers to consider,
                    e.g., the sig. enriched kmers; a k with an empty list
                    is skipped (return_D[k] = {})
            - protein:
                - used in the pickle file name & in the table/plot titles
            - main_DIR:
                - the pickled frequencies are loaded from
                    main_DIR/frequency_Ds; tables are written to
                    main_DIR/tables/by_position
            - conc_for_fastq:
                - "input", or the concentration (in nM) of the library
            - ks_L:
                - the k's to analyze
            - ordered_kmers_description_fnames:
                - a name that will be added to the output .txt file name to
                    distinguish it & used to annotate the plot (e.g.,
                    "3std" to denote these kmers as those
                    with Z-score >= 3)
                    - "3.0std"
                    - "2.0std"
                    - "least"
                    - "Adapter 1" or "Adapter 2"
            - make_output_Fs:
                - If True, makes a .txt out file and a plot;
                - If False, doesn't make .txt/.pdf (this is used the first time
                    around to get the maximum absolute log2 value so that on
                    the second time around when plots are made, all the
                    colorbars can be coordinated together)
            - num_controls:
                - the number of evenly spaced "control" kmers (those with
                    KL divergence below the mean) to include in the plot
            - max_log2_val_colormap:
                - If passed in, the heatmap colorbar will go from
                    -max_log2_val_colormap to max_log2_val_colormap

    - RETURNS:
            return_D[k] = {"max_abs_log2_plotted": ,
                    "kmers_to_KL_Div_D": ,
                    "kmers_to_log2_Obs_over_Exp_L_D": ,
                    "control_kmers_to_KL_Div_D": ,
                    "control_kmers_to_log2_Obs_over_Exp_L_D": }

        and, if make_output_Fs is True, additionally the key "fig"
    """
    return_D = {}

    frequency_Ds_DIR = os.path.join(main_DIR, "frequency_Ds")
    RBNS_utils.make_dir(frequency_Ds_DIR)

    if (conc_for_fastq == "input"):
        conc_label = "Input lib."
    else:
        conc_label = "{} nM lib.".format(conc_for_fastq)

    #### go through each of the k's
    for k in ks_L:

        ordered_kmers_to_consider_L = ordered_kmers_to_consider_Ls_by_k_D[k]
        if (len(ordered_kmers_to_consider_L) == 0):
            return_D[k] = {}
            continue

        #### Load the previously pickled dictionary of kmer frequencies at each
        ####    position (pickles are binary data, so open with "rb")
        D_F = os.path.join(
            frequency_Ds_DIR,
            "{0}_{1}.{2}mer.frequencies.by_position.pkl".format(
                protein, conc_for_fastq, k))
        with open(D_F, "rb") as f:
            freqs_by_pos_kmer_D = pickle.load(f)
        num_kmers_per_rd = len(freqs_by_pos_kmer_D)

        #### A uniform distribution over all positions in the read
        uniform_L = [1. / num_kmers_per_rd] * num_kmers_per_rd

        #### Now go through each of the kmers and get the KL divergence of
        ####    KLDiv( uniform || observed freqs. across read )
        kmer_KLDiv_tuples_L = []
        #### The same tuples keyed by kmer, so that the kmers to consider
        ####    can be looked up directly instead of by scanning the list
        KLDiv_tuple_by_kmer_D = {}
        for kmer in RBNS_utils.yield_kmers(k):
            obs_freqs_L = [freqs_by_pos_kmer_D[x][kmer] for x in\
                    range( num_kmers_per_rd )]
            sum_obs_freqs = sum(obs_freqs_L)
            #### Normalize the obs_freqs_L
            obs_freqs_L = [x / sum_obs_freqs for x in obs_freqs_L]

            #### Get the KL Divergence
            KL = RBNS_utils.KL_divergence(uniform_L, obs_freqs_L)
            #### Also get the log2(OBSERVED/UNIFORM) at each position
            observed_over_unif_L = []
            for obs_freq, unif_freq in zip(obs_freqs_L, uniform_L):
                try:
                    log2_ratio = math.log(obs_freq / unif_freq, 2)
                except ValueError:
                    #### math.log() raises this for an observed freq. of 0;
                    ####    0. is used as a placeholder
                    log2_ratio = 0.
                observed_over_unif_L.append(log2_ratio)

            tupl = (kmer, KL, observed_over_unif_L)
            kmer_KLDiv_tuples_L.append(tupl)
            KLDiv_tuple_by_kmer_D[kmer] = tupl

        #### Sort the kmers by decreasing KL divergence
        kmer_KLDiv_tuples_L.sort(key=lambda x: -1 * x[1])

        #### Get the mean KL divergence (the std. is not used)
        KL_divs = [x[1] for x in kmer_KLDiv_tuples_L]
        mean_KL, _ = RBNS_utils.mean_std(KL_divs)

        #### Extract the tuples of the ordered_kmers_to_consider (in that
        ####    order); kmers without an entry are skipped
        kmer_KLDiv_tuples_desiredkmers_L = [
            KLDiv_tuple_by_kmer_D[kmer]
            for kmer in ordered_kmers_to_consider_L
            if kmer in KLDiv_tuple_by_kmer_D
        ]

        #### Go through each of the kmers to consider
        kmers_to_KL_Div_D = {}
        kmers_to_log2_Obs_over_Exp_L_D = {}
        #### the maximum absolute log2 value plotted, so that all of the
        ####    libraries can have the same colormap scale
        max_abs_log2_plotted = 0.
        for kmer, KL, observed_over_unif_L in kmer_KLDiv_tuples_desiredkmers_L:
            kmers_to_KL_Div_D[kmer] = KL
            kmers_to_log2_Obs_over_Exp_L_D[kmer] = observed_over_unif_L
            #### update max_abs_log2_plotted
            max_abs_log2_plotted = max(max_abs_log2_plotted,
                                       max(observed_over_unif_L),
                                       -1 * min(observed_over_unif_L))

        #### Get "control" kmer distributions that have KL divergence below
        ####    the mean
        ctrl_STD_tuples_L = [x for x in kmer_KLDiv_tuples_L if x[1] < mean_KL]
        control_delta = int(len(ctrl_STD_tuples_L) / float(num_controls))
        #### go through and get the num_controls evenly spaced controls to
        ####    plot
        control_tuples_L = [
            ctrl_STD_tuples_L[control_delta * x] for x in range(num_controls)
        ]
        #### Go through each of the control kmers
        control_kmers_to_KL_Div_D = {}
        control_kmers_to_log2_Obs_over_Exp_L_D = {}
        for kmer, KL, observed_over_unif_L in control_tuples_L:
            control_kmers_to_KL_Div_D[kmer] = KL
            control_kmers_to_log2_Obs_over_Exp_L_D[kmer] = observed_over_unif_L
            #### update max_abs_log2_plotted
            max_abs_log2_plotted = max(max_abs_log2_plotted,
                                       max(observed_over_unif_L),
                                       -1 * min(observed_over_unif_L))

        #### add the max_abs_log2_plotted to the return_D
        return_D[k] = {
            "max_abs_log2_plotted":
            max_abs_log2_plotted,
            "kmers_to_KL_Div_D":
            kmers_to_KL_Div_D,
            "kmers_to_log2_Obs_over_Exp_L_D":
            kmers_to_log2_Obs_over_Exp_L_D,
            "control_kmers_to_KL_Div_D":
            control_kmers_to_KL_Div_D,
            "control_kmers_to_log2_Obs_over_Exp_L_D":
            control_kmers_to_log2_Obs_over_Exp_L_D
        }

        if make_output_Fs:

            #### Make the out_F
            out_DIR = os.path.join(main_DIR, "tables/by_position")
            RBNS_utils.make_dir(out_DIR)

            #### < Make a table of KL div by decreasing R of sig. R kmers > ###
            out_basename = "{0}mers.{1}sig_R_{2}.KL_div_of_freqs_across_read.txt".format(
                k, conc_label.replace(" ", "_"),
                ordered_kmers_description_fnames.replace(" ", "_"))
            out_F = os.path.join(out_DIR, out_basename)
            with open(out_F, "w") as f_out:
                #### write a header line
                f_out.write("{0}: {1}\n".format(protein.replace("_", " "),
                                                conc_label))
                f_out.write(
                    "\tKL Div(Uniform||Observed)\t\tlog2(Obs/Unif) at Pos 1\tPos. 2\n"
                )

                #### Go through and write out the kmers to consider, one
                ####    line per kmer
                for kmer, KL, observed_over_unif_L in\
                        kmer_KLDiv_tuples_desiredkmers_L:
                    ratios_str = "".join("{0:.3f}\t".format(ratio)
                                         for ratio in observed_over_unif_L)
                    f_out.write("\n{0}\t{1:.4g}\t\t".format(kmer, KL) +
                                ratios_str)

            ### </ Make a table of KL div by decreasing R of sig. R kmers > ###

            #### try to get the Z-score threshold for sig. enrichment
            kmers_type_annot = "Enriched $k$mers"
            try:
                num_std = int(
                    float(ordered_kmers_description_fnames.split("std")[0]))
                #### a raw string, as "\g" is not a valid escape sequence
                kmers_type_annot += (r" (Z-score $\geq${})".format(num_std))
            except ValueError:
                if (ordered_kmers_description_fnames == "least"):
                    kmers_type_annot = "Least Enriched $k$mers"
                else:
                    kmers_type_annot = ordered_kmers_description_fnames

            #### Make the plot
            returned_fig_D = RBNS_plots.make_rectangular_heatmap_plot_RBNS_freqs(
                kmers_to_KL_Div_D,
                kmers_to_log2_Obs_over_Exp_L_D,
                control_kmers_to_KL_Div_D,
                control_kmers_to_log2_Obs_over_Exp_L_D,
                order_of_kmers_L=ordered_kmers_to_consider_L,
                kmers_type_annot=kmers_type_annot,
                title="{0}: {1} {2}mer frequencies across reads".format(
                    protein.replace("_", " "), conc_label, k),
                colorbar_label=r"$log_2$(Observed / Uniform freq.)",
                max_log2_val_colormap=max_log2_val_colormap)
            return_D[k]["fig"] = returned_fig_D["fig"]

    return return_D