def submit_get_suboptimal_block_sampled_DotBracket_reads_F(
        in_struct_gz_F, temp, starting_scratch_DIR, block_idx,
        num_reads_per_block):
    """
    - For an in_struct_gz_F split_reads file, submits a job to run the
        get_suboptimal_block_sampled_DotBracket_reads_F() function below
    """
    #### Get this file path
    filename = inspect.getframeinfo(inspect.currentframe()).filename
    this_script_path = os.path.abspath(filename)

    output_DIR = os.path.dirname(in_struct_gz_F)
    errors_outputs_DIR = os.path.join(output_DIR, "errors_outputs")
    RBNS_utils.make_dir(errors_outputs_DIR)

    command = ('python %(this_script_path)s '
               'get_suboptimal_block_sampled_DotBracket_reads_F '
               '%(in_struct_gz_F)s '
               '%(temp)s '
               '%(starting_scratch_DIR)s '
               '%(block_idx)s '
               '%(num_reads_per_block)s ' % locals())

    job_name = "{0}_block{1}_get_suboptimal_block_sampled_DotBracket_reads_F".format(
        os.path.basename(in_struct_gz_F).split(".")[0], block_idx)
    RBNS_cluster_utils.launch(
        command,
        out_file=os.path.join(errors_outputs_DIR, "{}.log".format(job_name)),
        jobname=job_name,
        error_DIR=errors_outputs_DIR,
        time_mins=690)  # 11.5 hours (12 hours is the limit)
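
The command string above is built with Python's '%(name)s ... % locals()'
templating idiom, which substitutes same-named local variables into the
string. A minimal self-contained sketch of the idiom (all paths and values
below are hypothetical placeholders, not the pipeline's real ones):

def build_command_sketch():
    this_script_path = "/path/to/RBNS_fold_split_reads.py"  # hypothetical
    in_struct_gz_F = "/scratch/lib_0.w_struct.reads.gz"     # hypothetical
    temp = 37
    block_idx = 0
    num_reads_per_block = 1000000
    #### Each %(var)s pulls the like-named variable out of locals()
    return ('python %(this_script_path)s '
            'get_suboptimal_block_sampled_DotBracket_reads_F '
            '%(in_struct_gz_F)s %(temp)s %(block_idx)s '
            '%(num_reads_per_block)s' % locals())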
Example #2
def launch_counter(lib_settings, count_type, k, error_DIR):
    """
    - Launches a job to perform kmer counts of count_type, calling the
        'counter' function below
    """
    split_reads = lib_settings.get_split_reads()
    out_pkl = lib_settings.counts_file(count_type, k)
    RBNS_utils.make_dir(os.path.dirname(out_pkl))
    cluster_python_script = os.path.abspath(__file__)
    barcode = lib_settings.get_barcode()
    out_file = os.path.join(error_DIR,
                            'count.%s.%s.%i.out' % (barcode, count_type, k))
    err_file = os.path.join(error_DIR,
                            'count.%s.%s.%i.err' % (barcode, count_type, k))
    command = ('python %(cluster_python_script)s '
               'counter '
               '%(count_type)s '
               '%(split_reads)s '
               '%(k)i '
               '%(out_pkl)s ' % locals())
    #'1> %(out_file)s '
    #'2> %(err_file)s ' % locals())
    conc = lib_settings.get_conc()
    jobname = '%s.%s.%i.%g' % (os.path.basename(split_reads), count_type, k,
                               conc)
    return launch(command, jobname=jobname, error_DIR=error_DIR)
Example #3
 def save_enrichments(self, enrich_pkl):
     RBNS_utils.make_dir(os.path.dirname(enrich_pkl))
     enriches_by_kmer_D = {}
     num_kmers = len(self.enrichments)
     k = int(round(math.log(num_kmers, 4)))  #### round() avoids float truncation
     for kmer_num, kmer in enumerate(RBNS_utils.yield_kmers(k)):
         enriches_by_kmer_D[kmer] = self.enrichments[kmer_num]
     cPickle.dump(enriches_by_kmer_D, open(enrich_pkl, 'wb'))
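
save_enrichments() assumes RBNS_utils.yield_kmers(k) yields all 4^k kmers
in the same fixed order used to index self.enrichments. A minimal sketch of
such a generator, assuming lexicographic ACGT order (the order implied by
the base-4 kmer indexing used elsewhere in the pipeline):

import itertools

def yield_kmers_sketch(k):
    #### Yields the 4^k kmers in lexicographic ACGT order,
    ####    e.g. for k=2: AA, AC, AG, AT, CA, ..., TT
    for tupl in itertools.product("ACGT", repeat=k):
        yield "".join(tupl)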
Example #4
 def save_0nM_enrichments(self, enrich_pkl):
     enriches_by_kmer_D = {}
     num_kmers = len(self.enrichments_to_0nM)
     k = int(round(math.log(num_kmers, 4)))  #### round() avoids float truncation
     for kmer_num, kmer in enumerate(RBNS_utils.yield_kmers(k)):
         enriches_by_kmer_D[kmer] = self.enrichments_to_0nM[kmer_num]
     cPickle.dump(enriches_by_kmer_D, open(enrich_pkl, 'wb'))
Example #5
 def load_0nM_enrichments(self, enrich_pkl):
     enriches_by_kmer_D = cPickle.load(open(enrich_pkl, 'rb'))
     k = int(round(math.log(len(enriches_by_kmer_D), 4)))  #### round() avoids float truncation
     enriches_L = []
     for kmer in RBNS_utils.yield_kmers(k):
         enriches_L.append(enriches_by_kmer_D[kmer])
     self.enrichments_to_0nM = np.array(enriches_L)
Example #6
 def get_0nM_enrichment_dict(self):
     assert len(self.enrichments_to_0nM) == 4**self.k
     return {
         kmer: enrich
         for kmer, enrich in zip(RBNS_utils.yield_kmers(self.k),
                                 self.enrichments_to_0nM)
     }
Example #7
 def split_reads_exist(self):
     """
     returns True if the split reads file for this library exists
     and is non-empty; does not check whether it is complete
     """
     return RBNS_utils.file_exists(self.get_split_reads())
Example #8
 def check_barcodes_are_separated( self,
         min_hamming_distance = 2):
     """
     - Makes sure the barcodes are all totally distinguishable (i.e., every
         pair of barcodes has a Hamming distance of at least
         min_hamming_distance)
     """
     for b1, b2 in itertools.combinations(self.settings['barcodes'], 2):
         hamming_dist = RBNS_utils.hamming_distance(b1, b2)
         if hamming_dist < min_hamming_distance:
             raise ValueError('The barcodes supplied are not well '
               'separated: %s-%s' % (b1, b2))
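
RBNS_utils.hamming_distance is assumed here to count the number of
mismatched positions between two equal-length barcodes; a minimal sketch:

def hamming_distance_sketch(s1, s2):
    #### Number of positions at which equal-length strings s1 and s2 differ
    assert len(s1) == len(s2)
    return sum(1 for c1, c2 in zip(s1, s2) if c1 != c2)

For example, hamming_distance_sketch("ACGT", "ACCA") returns 2, so that
barcode pair would fail a min_hamming_distance of 3 but pass one of 2.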
Example #9
 def calculate_enrichment(self, k, input_lib):
     enrich_pkl = os.path.join(
         self.experiment_settings.get_rdir(), 'enrichment_Ds',
         '%s_%s_to_input.%imer.enrichments.pkl' %
         (self.experiment_settings.get_property('protein_name'),
          self.lib_settings.get_conc_string(), k))
     if RBNS_utils.file_exists(enrich_pkl):
         self.type2k2counts['naive'][k].load_enrichments(enrich_pkl)
     else:
         input_profile = input_lib.type2k2counts['naive'][k]
         self.type2k2counts['naive'][k].calculate_enrichments(input_profile)
         self.type2k2counts['naive'][k].save_enrichments(enrich_pkl)
Example #10
def get_kmer_freqs_from_reads_F(reads_F, k, vals_sum_to="sumto1"):
    """
    - Returns the kmer counts & freqs in reads_F

    - INPUTs:
        - vals_sum_to
            "sumto1": all 4^k entries sum to 1
            "sumto4^k": all 4^k entries sum to 4^k
            "none": no normalization (only counts_by_kmer_D is returned)
    """
    counts_by_kmer_D = {}

    for kmer in RBNS_utils.yield_kmers(k):
        counts_by_kmer_D[kmer] = 0
    with open(reads_F) as f:
        for line in f:
            read = line.strip()
            for start_pos in range(len(read) - k + 1):
                kmer = read[start_pos:(start_pos + k)]
                # only include it if it doesn't have an N
                try:
                    counts_by_kmer_D[kmer] += 1
                except KeyError:
                    pass
    return_D = {"counts_by_kmer_D": counts_by_kmer_D}

    if (vals_sum_to == "none"):
        return counts_by_kmer_D

    #### Normalize using the helper function in dict_helpers
    elif (vals_sum_to == "sumto1"):
        freqs_by_kmer_D = RBNS_utils.normalize_D(counts_by_kmer_D)
    elif (vals_sum_to == "sumto4^k"):
        freqs_by_kmer_D = RBNS_utils.normalize_D(counts_by_kmer_D,
                                                 vals_sum_to="sumto4^k")
    else:
        raise ValueError(
            "{0} is not a valid vals_sum_to argument".format(vals_sum_to))

    return_D["freqs_by_kmer_D"] = freqs_by_kmer_D
    return return_D
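
RBNS_utils.normalize_D is assumed to rescale the dictionary's values so
they sum to 1 (for "sumto1") or to 4^k, i.e. the number of keys (for
"sumto4^k"); a minimal sketch of that behavior:

def normalize_D_sketch(counts_D, vals_sum_to="sumto1"):
    total = float(sum(counts_D.values()))
    if vals_sum_to == "sumto1":
        #### Values become frequencies summing to 1
        scale = 1. / total
    elif vals_sum_to == "sumto4^k":
        #### Values average 1 (counts_D has 4^k keys), so a kmer with a
        ####    value > 1 is over-represented relative to uniform
        scale = len(counts_D) / total
    else:
        raise ValueError("Invalid vals_sum_to: {0}".format(vals_sum_to))
    return dict((kmer, count * scale) for kmer, count in counts_D.items())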
Example #11
    def calculate_enrichment_to_0nM(self, k, zero_nM_lib):
        zero_enrich_pkl = os.path.join(
            self.experiment_settings.get_rdir(), 'enrichment_Ds',
            '%s_%s_to_0nM.%imer.enrichments.pkl' %
            (self.experiment_settings.get_property('protein_name'),
             self.lib_settings.get_conc_string(), k))

        zero_nM_profile = zero_nM_lib.type2k2counts['naive'][k]
        self.type2k2counts['naive'][k].calculate_enrichments_to_0nM(
            zero_nM_profile)

        if RBNS_utils.file_exists(zero_enrich_pkl) == False:
            self.type2k2counts['naive'][k].save_0nM_enrichments(
                zero_enrich_pkl)
Example #12
def stream_without_continual_update(k, in_weights, inFile):
    """
    - Performs streaming kmer assignment (SKA) algorithm in which the weights
        are NOT continually updated after each read, just after going through
        all of the reads (typically used from the second pass onward)
    """
    new_weights = np.ones(4**k)
    for linei, line in enumerate(RBNS_utils.aopen(inFile)):
        read_seq = line.strip()
        pk = get_kmers(read_seq, k)
        additional_weights = assign_kmer_weights(pk, in_weights)
        assert abs(sum(additional_weights) - 1.0) < 0.001  #### 1 unit of weight per read
        for kmer, weight in zip(pk, additional_weights):
            kmeri = get_index_from_kmer(kmer)
            new_weights[kmeri] += weight
    return new_weights
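
assign_kmer_weights() (defined elsewhere in this module) is assumed to
split one unit of weight per read among the read's kmers in proportion to
their current SKA weights, which is what the sum-to-1 assertion above
checks; a minimal self-contained sketch of that step:

def assign_kmer_weights_sketch(kmers_in_read_L, in_weights):
    #### in_weights is indexed by base-4 kmer index (A=0, C=1, G=2, T=3)
    def kmer_index(kmer):
        index = 0
        for nt in kmer:
            index = index * 4 + "ACGT".index(nt)
        return index
    current_weights = [in_weights[kmer_index(kmer)]
                       for kmer in kmers_in_read_L]
    total = float(sum(current_weights))
    #### One unit of weight per read, split proportionally among its kmers
    return [w / total for w in current_weights]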
Example #13
def get_sig_enriched_kmers_from_txt_R_F(txt_R_F,
                                        most_enriched_lib_conc=None,
                                        num_std_for_sig=2):
    """
    INPUT:
        - txt_R_F: a file of enrichments from the pipeline (e.g.
            /net/uorf/data/nobackup/pfreese/RBNS_results/Fox_1_7_14/tables/RBFox2_enrichment_R.6mers.txt)
        - most_enriched_lib_conc: the column header in the first row of the
            txt_R_F (e.g., "80"); can be passed in manually. If nothing is
            passed in, the concentration with the highest enrichment will be
            chosen

    RETURNS:
        - return_D =
            {"sig_enriched_kmers_L": sig_enriched_kmers_L,
            "sig_enrichments_by_kmer_D": sig_enrichments_by_kmer_D}
    """
    #### get the dictionary of enrichments
    enriches_by_kmer_D = return_D_of_enrichments_from_txt_F(
        txt_R_F, most_enriched_lib_conc=most_enriched_lib_conc)

    #### a list of enrichments
    enrichments_L = enriches_by_kmer_D.values()
    mean, std = RBNS_utils.mean_std(enrichments_L)
    sig_threshold = mean + (num_std_for_sig * std)

    #### a list of the sig. enriched kmers and R values
    sig_enriched_kmer_R_tuples_L = []
    #### a dictionary of the enrichments, containing ONLY the sig. enriched
    ####    kmers
    sig_enrichments_by_kmer_D = {}
    for kmer in enriches_by_kmer_D:
        R = enriches_by_kmer_D[kmer]
        if (R >= sig_threshold):
            sig_enriched_kmer_R_tuples_L.append((kmer, R))
            sig_enrichments_by_kmer_D[kmer] = enriches_by_kmer_D[kmer]
    sig_enriched_kmer_R_tuples_L.sort(key=lambda x: -1 * x[1])
    sig_enriched_kmers_L = [tupl[0] for tupl in sig_enriched_kmer_R_tuples_L]

    return_D = {
        "sig_enriched_kmers_L": sig_enriched_kmers_L,
        "sig_enrichments_by_kmer_D": sig_enrichments_by_kmer_D
    }

    return return_D
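
A worked example of the cutoff with made-up numbers: if the enrichments
have mean 1.2 and standard deviation 0.5, then with num_std_for_sig=2 the
threshold is sig_threshold = 1.2 + 2 * 0.5 = 2.2, and only kmers with
R >= 2.2 are returned (sorted from highest to lowest R).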
Example #14
def get_subopt_folding_of_reads(reads_L,
                                scratch_DIR,
                                temp,
                                out_F_to_append_results_to,
                                num_to_return_for_each_read=20):
    """
    - Given a list of reads (with adapters) reads_L, will get
        num_to_return_for_each_read suboptimal DotBracket structures sampled
        with probabilities equal to their Boltzmann weights
    """
    tmp_read_fasta_F = os.path.join(scratch_DIR, "reads.fa")
    out_F = os.path.join(scratch_DIR, "reads.out.txt")

    read_by_readwindex_D = {}
    with open(tmp_read_fasta_F, "w") as f:
        for idx, read in enumerate(reads_L):
            read_w_index = "read{0}".format(idx)
            read_by_readwindex_D[read_w_index] = read
            f.write(">read{0}\n{1}\n".format(idx, read))

    os.chdir(scratch_DIR)
    fold_CMD = "RNAsubopt --temp={0} --stochBT={1} < {2} > {3}".format(
        temp, num_to_return_for_each_read, tmp_read_fasta_F, out_F)
    #### Run RNAsubopt to stochastically sample structures for each read
    fold = subprocess.Popen(fold_CMD, shell=True)
    stdoutdata, stderrdata = fold.communicate()

    #### Now go through all of the reads
    this_read = ""
    out_f_to_append_results_to = gzip.open(out_F_to_append_results_to, 'ab')
    for lines_L in RBNS_utils.iterNlines(out_F,
                                         num_to_return_for_each_read + 2,
                                         strip_newlines=True):

        this_read = lines_L[1]
        out_f_to_append_results_to.write(">" + this_read + "\n")

        #### Now go through all of the num_to_return_for_each_read DotBracket
        ####    structures
        for DB_str in lines_L[2:]:
            element_string = get_elementstring_from_DotBracket(DB_str)
            out_f_to_append_results_to.write(element_string + "\n")

    out_f_to_append_results_to.close()
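
A hedged usage sketch (RNAsubopt from the ViennaRNA package must be on the
PATH, and the paths below are hypothetical placeholders):

#### Sample 20 Boltzmann-weighted structures for each of two toy reads
reads_L = ["GGGACGCATGCATCGTAACGTGCATGCTTT",
           "CCATGGATCGATCGATCGATTTAGCCGGAA"]
get_subopt_folding_of_reads(
    reads_L,
    "/tmp/fold_scratch",      # hypothetical scratch dir; must exist
    37,                       # folding temperature in degrees C
    "/tmp/folded.out.gz",     # gzipped output file, appended to
    num_to_return_for_each_read=20)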
Example #15
def stream_continual_update(k, weights, inFile):
    """
    - Performs streaming kmer assignment (SKA) algorithm in which the weights
        are continually updated after each read (typically, this is used for
        just the first pass)
    """
    total_lines = count_lines(inFile) * 2
    start_time = time.time()
    for linei, line in enumerate(RBNS_utils.aopen(inFile)):
        if linei % 10000 == 0 and linei:
            elapsed_time = time.time() - start_time
            print 'Predicted time remaining for stream_continual_update:',\
              (total_lines - linei) / linei * elapsed_time / 3600,\
              'hours'
        read_seq = line.strip()
        pk = get_kmers(read_seq, k)
        assigned_weights = assign_kmer_weights(pk, weights)
        for kmer, weight in zip(pk, assigned_weights):
            kmeri = get_index_from_kmer(kmer)
            weights[kmeri] += weight
    return weights
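
get_kmers() (used above) is assumed to return all overlapping kmers of a
read; a minimal sketch:

def get_kmers_sketch(read_seq, k):
    #### All len(read_seq) - k + 1 overlapping kmers, left to right
    return [read_seq[i:i + k] for i in range(len(read_seq) - k + 1)]

For example, get_kmers_sketch("ACGTA", 3) returns ['ACG', 'CGT', 'GTA'].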
Example #16
def stream_continual_update_with_convergence_table(k,
                                                   weights,
                                                   inFile,
                                                   out_file,
                                                   how_often_to_write=10000):
    """
    - Performs streaming kmer assignment (SKA) algorithm in which the weights
        are continually updated after each read (typically, this is used for
        just the first pass); also makes an output summary table at the end
    """
    internal_history = []
    for linei, line in enumerate(RBNS_utils.aopen(inFile)):
        if linei % how_often_to_write == 0:
            norm_weights = copy.copy(weights)
            norm_weights = normalize_mean_1(norm_weights)
            internal_history.append(norm_weights)

        read_seq = line.strip()
        pk = get_kmers(read_seq, k)
        assigned_weights = assign_kmer_weights(pk, weights)
        for kmer, weight in zip(pk, assigned_weights):
            kmeri = get_index_from_kmer(kmer)
            weights[kmeri] += weight

    of = open(out_file, 'w')
    of.write('kmer\t' + '\t'.join(
      ['reads_read_%i' % (i * how_often_to_write) for i in\
              range(len(internal_history))]) + '\n')
    for kmer_i, kmer in enumerate(yield_kmers(k)):

        of.write('%s\t' % kmer)
        for col_i in range(len(internal_history)):
            assert len(internal_history[col_i]) == 4**k
            of.write('%g\t' % internal_history[col_i][kmer_i])
        of.write('\n')

    of.close()

    return weights
Example #17
 def get_B_values(self, read_len):
     return [
         RBNS_utils.B_factor(enrich, self.k, read_len)
         for enrich in self.enrichments
     ]
Example #18
def make_temp_reads_F(orig_reads_F,
                      target_reads_DIR,
                      read_length_to_use="full_length",
                      num_reads_to_use="all",
                      target_reads_basename=None,
                      start_frac=0.0):
    """
    - Given an orig_reads_F, makes a new reads file containing only reads that
        DON'T contain any N's

    - INPUT:
        - orig_reads_F: a file (e.g. .reads) of the reads
        - target_reads_DIR: where the output .reads file will be written
        - read_length_to_use: the length of reads that will be included in the
            output reads file
            - "full_length": it will be the full read
        - num_reads_to_use:
            - "all": all reads
            - an int: will use up to that many reads
            - a float (from 0.0 to 1.0): that proportion of the total reads in
                orig_reads_F
        - target_reads_basename: the output reads basename; if not passed in,
            it will be the orig_reads_F basename with a time_stamp appended
        - start_frac: how far through the orig_reads_F to start getting reads

    - RETURNS:
        return_D = {"out_reads_F": out_reads_F,
                "num_reads_in_out_reads_F": rd_num}
    """
    #### Make the out_reads_F
    if (target_reads_basename == None):
        reads_basename = os.path.basename( orig_reads_F ).rsplit( ".", 1 )[0] +\
            ".{}.reads".format(datetime.datetime.now().strftime("%Hh_%Mm_%Ss" ))
    else:
        if (target_reads_basename[-6:] == ".reads"):
            reads_basename = target_reads_basename
        else:
            reads_basename = target_reads_basename + ".reads"

    out_reads_F = os.path.join(target_reads_DIR, reads_basename)
    os.system("mkdir -p {}".format(target_reads_DIR))
    out_reads_f = open(out_reads_F, "w")

    #### Get the num_reads_to_use
    total_lines_in_file = RBNS_utils.return_num_lines_in_F(orig_reads_F)
    if (num_reads_to_use == "all"):
        reads_to_use = 10000000000
    elif (type(num_reads_to_use) is int):
        reads_to_use = num_reads_to_use
    elif (type(num_reads_to_use) is float):
        assert (num_reads_to_use <= 1.0 and num_reads_to_use > 0.0)
        reads_to_use = int(num_reads_to_use * total_lines_in_file)
    else:
        raise ValueError(
            "num_reads_to_use must be 'all', an int, or a float")

    #### Get the line_to_start_at from the start_frac passed in
    if (start_frac == 0.0):
        line_to_start_at = 0
    else:
        line_to_start_at = int(total_lines_in_file * start_frac)

    line_lower = line_to_start_at
    line_upper = line_to_start_at + reads_to_use

    #### Get the read_length_to_use if it's "all"
    if (read_length_to_use == "full_length"):
        rd_length_to_use = get_readlength(orig_reads_F)
    else:
        assert (type(read_length_to_use) is int)
        rd_length_to_use = read_length_to_use

    #### Now populate the out_reads_F
    with open(orig_reads_F) as f:
        reads_written = 0
        this_read = -1
        for line in f:
            this_read += 1
            if (this_read >= line_to_start_at):
                ln = line.strip()
                #### Only use reads that have no N's
                if ((len(ln) >= rd_length_to_use) and (ln.find("N") == -1)):
                    out_reads_f.write(ln[:rd_length_to_use] + "\n")
                    reads_written += 1
                if (reads_written >= reads_to_use):
                    break
    out_reads_f.close()

    return_D = {
        "out_reads_F": out_reads_F,
        "num_reads_in_out_reads_F": reads_written
    }
    return return_D
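
A hedged usage sketch with hypothetical paths: write up to 100,000 N-free
reads, truncated to 20 nt, starting halfway through the original file:

result_D = make_temp_reads_F(
    "/path/to/orig.reads",        # hypothetical input reads file
    "/path/to/tmp_reads_DIR",     # hypothetical output directory
    read_length_to_use=20,
    num_reads_to_use=100000,
    start_frac=0.5)
print(result_D["num_reads_in_out_reads_F"])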
Example #19
def plot_R_by_Ppaired_bin_w_sig(readswstruct_startingbasename_myannot_L,
                                read_len,
                                k,
                                effective_R_D,
                                kmers_to_do="top_10"):
    """
    - Makes a plot of the Ppaired ratios for the desired set of kmers

    - fld_CG_match_DIR is the directory that contains the Ppaired_Ds directory:
        /net/eofe-data010/data001/burgelab/nevermind/data/nm/pfreese/RBFOX3_test/split_reads/fld_CG_match

    """
    import random
    import RBNS_plots

    assert (kmers_to_do in ["top_10"])

    fld_CG_match_DIR = os.path.dirname(
        readswstruct_startingbasename_myannot_L[0][0])
    RBP = readswstruct_startingbasename_myannot_L[0][1].split('_')[0]

    #### Get the list of all kmers and the kmer <-> motif index mappings
    all_kmers_L = RBNS_utils.return_all_kmers_L(k)
    kmer_to_motifidx_D = {}
    motifidx_to_kmer_D = {}
    for idx, kmer in enumerate(all_kmers_L):
        kmer_to_motifidx_D[kmer] = idx
        motifidx_to_kmer_D[idx] = kmer

    out_DIR = fld_CG_match_DIR.split("/split_reads")[0]
    os.system("mkdir -p {}".format(out_DIR))

    out_DIR_this_RBP_k = os.path.join(out_DIR, "{0}mer_plots".format(k))

    out_Ds_DIR = os.path.join(out_DIR_this_RBP_k, 'Ds')
    os.system("mkdir -p {}".format(out_Ds_DIR))

    tables_DIR = os.path.join(out_DIR_this_RBP_k, 'tables')
    os.system("mkdir -p {}".format(tables_DIR))

    out_F_start = os.path.join(out_DIR_this_RBP_k, RBP)

    #### Get the most enriched concentration
    most_enriched_conc_str = ""
    for T in readswstruct_startingbasename_myannot_L:
        if (T[2] == 'Most enriched'):
            most_enriched_conc_str = "{0} nM".format(T[1].split("_")[-1])

    Ds_DIR = os.path.join(fld_CG_match_DIR, "Ppaired_Ds/{0}".format(k))

    most_enriched_lib_annotation = ""
    #### annots_L will be like:
    ####    ['input', '5_nM', '20_nM', '80_nM', '320_nM', '1300_nM']
    annots_L = []
    ####    D_by_annot_D will have keys like 'input', '5_nM', etc. and values:
    ## {'AAAAAA': {-10: {'Ppaired_sum': 2.061,
    ##                    'counts': 4},
    ##              -9: {'Ppaired_sum': 1.027,
    ##                    'counts': 5},
    D_by_annot_D = {}
    for reads_w_struct_F, starting_basename, my_annot in\
        readswstruct_startingbasename_myannot_L:

        if (my_annot == "Input"):
            lib_annot = 'input'
        else:
            lib_annot = starting_basename.split("_")[-1] + "_nM"
        if (my_annot == "Most enriched"):
            most_enriched_lib_annotation = lib_annot

        annots_L.append(lib_annot)

        D_F = os.path.join(Ds_DIR, "{0}.D.pkl".format(starting_basename))
        D_by_annot_D[lib_annot] = pickle.load(
            open(D_F))['counts_by_kmer_binidx_D']

    #### Get the desired kmers (e.g., the top 10)
    if (kmers_to_do == "top_10"):
        kmer_R_T_L = [(kmer, effective_R_D[kmer]) for kmer in effective_R_D]
        kmer_R_T_L.sort(key=lambda x: -1 * x[1])
        top_kmers_L = [tupl[0] for tupl in kmer_R_T_L[:10]]

    #top_kmers_L = RBNS_exp.return_top_X_kmers( k, int( 4 ** k ) )
    for kmer_idx, kmer_to_plot in enumerate(top_kmers_L):

        R = effective_R_D[kmer_to_plot]
        title = r"{0}, {1} (\#{2}: $R={3:.2f}$)".format(
            RBP, kmer_to_plot.replace("T", "U"), kmer_idx + 1, R)
        print title

        ##### Make an output .txt table of the Ppaireds of this motif and
        ####    the Ppaired ratio for the upstream & downstream flanking
        ####    positions
        out_txt_F = os.path.join(
            tables_DIR, "{0}.{1}.R_by_Ppaired_bin.txt".format(
                RBP, kmer_to_plot.replace("T", "U")))
        out_txt_f = open(out_txt_F, 'w')
        out_txt_f.write("{0} {1} R by Ppaired bin".format(
            RBP, kmer_to_plot.replace("T", "U")))
        out_txt_f.write("\t0-0.2\t0.2-0.4\t0.4-0.6\t0.6-0.8\t0.8-1.0")

        ##### First get the INPUT frequency in each of the 5 Ppaired bins
        input_D = D_by_annot_D['input']
        input_freq_by_bin_D = {}
        input_kmer_counts_all_bins = 0.
        input_all_counts_all_bins = 0.
        for bin_idx in range(5):
            total_counts_this_bin = sum(
                [input_D[all_kmer][bin_idx] for all_kmer in input_D])
            kmer_counts_this_bin = input_D[kmer_to_plot][bin_idx]
            kmer_freq = float(kmer_counts_this_bin) / total_counts_this_bin
            input_freq_by_bin_D[bin_idx] = kmer_freq
            input_kmer_counts_all_bins += kmer_counts_this_bin
            input_all_counts_all_bins += total_counts_this_bin

        enrichments_by_kmer_conc_bin_D = {kmer_to_plot: {}}
        for lib_annot, D in D_by_annot_D.iteritems():

            if (lib_annot == 'input'):
                continue

            kmer_counts_all_bins = 0.
            all_counts_all_bins = 0.

            enrichments_by_kmer_conc_bin_D[kmer_to_plot][lib_annot] = {}
            #### The original motif_num of this kmer
            motif_num = kmer_to_motifidx_D[kmer_to_plot]

            this_D = D_by_annot_D[lib_annot]

            out_txt_f.write("\n{}".format(lib_annot))
            for bin_idx in range(5):
                total_counts_this_bin = sum(
                    [this_D[all_kmer][bin_idx] for all_kmer in this_D])
                kmer_counts_this_bin = this_D[kmer_to_plot][bin_idx]
                kmer_freq = float(kmer_counts_this_bin) / total_counts_this_bin
                kmer_counts_all_bins += kmer_counts_this_bin
                all_counts_all_bins += total_counts_this_bin

                try:
                    kmer_R = kmer_freq / input_freq_by_bin_D[bin_idx]
                except ZeroDivisionError:
                    kmer_R = 1.
                out_txt_f.write("\t{0:.3f}".format(kmer_R))

                enrichments_by_kmer_conc_bin_D[kmer_to_plot][lib_annot][
                    bin_idx] = kmer_R

            ##### Get the OVERALL (over all bins) R
            overall_R = (kmer_counts_all_bins / all_counts_all_bins) / (
                input_kmer_counts_all_bins / input_all_counts_all_bins)
            enrichments_by_kmer_conc_bin_D[kmer_to_plot][lib_annot][
                'overall'] = overall_R

        print "\nSaving to: {}".format(out_F_start)
        Ppaired_upper_bins_L = [0.2, 0.4, 0.6, 0.8, 1.]
        returned_D = RBNS_plots.plot_enrichment_by_5_Ppaired_bins(
            enrichments_by_kmer_conc_bin_D,
            annots_L,
            [kmer_to_plot],
            Ppaired_upper_bins_L,
            out_F_start,
            read_len,
            title=title,
            #plot_signif = True,
            plot_signif=False)
        sigB_by_kmer_conc_bin_D = returned_D['sigB_by_kmer_conc_bin_D']

        out_ratio_D_F = os.path.join(
            out_Ds_DIR, "{}.sigB_by_kmer_conc_bin_D.pkl".format(kmer_to_plot))
        try:
            D = sigB_by_kmer_conc_bin_D[kmer_to_plot]
            RBNS_utils.pkl_with_formatfile(D,
                                           out_ratio_D_F,
                                           num_to_include_in_format="all")
        except KeyError:
            pass

        out_D_F = os.path.join(
            out_Ds_DIR,
            "{}.enrichments_by_conc_bin_D.pkl".format(kmer_to_plot))
        RBNS_utils.pkl_with_formatfile(
            enrichments_by_kmer_conc_bin_D[kmer_to_plot],
            out_D_F,
            num_to_include_in_format="all")

        out_txt_f.close()
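
A worked example of the per-bin enrichment with made-up numbers: if the
kmer's frequency in the 0.8-1.0 Ppaired bin is 0.004 in a pulldown library
and 0.001 in the input library, then kmer_R = 0.004 / 0.001 = 4.0 for that
bin, i.e. the kmer is 4-fold enriched among highly paired occurrences.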
Example #20
def return_frequency_and_number_of_reads_kmer_in_reads_F(reads_F, kmer):
    """
    - For a reads_F, makes a new out_reads_F in the same directory
        in which each occurrence of the kmer is replaced with "X"s
    - Called by functions in RBNS_logos.py

    - RETURNS:
            return_D = {"out_reads_F": out_reads_F,
                    "tot_num_reads": tot_num_reads,
                    "num_reads_w_kmer": num_reads_w_kmer,
                    "freq_reads_w_kmer": freq_reads_w_kmer,
                    "tot_num_kmer_occurs" : tot_num_kmer_occurs,
                    "counts_by_kmer_D": counts_by_kmer_D}
                    "freqs_by_kmer_D": freqs_by_kmer_D}
    """
    k = len(kmer)
    read_len = get_readlength(reads_F)

    orig_reads_DIR = os.path.dirname(reads_F)
    orig_reads_basename = os.path.basename(reads_F)
    out_basename = orig_reads_basename.rsplit(".", 1)[0] +\
            "_{}.reads".format( kmer )
    #### If the file name is over 100 characters, shorten it
    if (len(out_basename) >= 100):
        out_basename = "{}.reads".format(kmer)
    out_reads_F = os.path.join(orig_reads_DIR, out_basename)

    #### The number of reads and number of times a kmer was found
    tot_num_reads = 0
    num_reads_w_kmer = 0
    tot_num_kmer_occurs = 0

    #### A dictionary of kmer frequencies for the reads written out
    counts_by_kmer_D = {}
    for this_kmer in RBNS_utils.yield_kmers(k):
        counts_by_kmer_D[this_kmer] = 0

    reads_f = open(reads_F)
    out_reads_f = open(out_reads_F, "w")

    reads_to_write_out_L = []

    for line in reads_f:

        tot_num_reads += 1
        if (len(reads_to_write_out_L) == 10000):
            for read in reads_to_write_out_L:
                out_reads_f.write(read + "\n")
            reads_to_write_out_L = []
        read = line.strip()

        cont = True
        found_any = False

        while (cont == True):
            kmer_pos = read.find(kmer)
            if (kmer_pos == -1):

                if (found_any == True):
                    num_reads_w_kmer += 1
                for start_pos in range(read_len - k + 1):
                    this_kmer = read[start_pos:(start_pos + k)]
                    try:
                        counts_by_kmer_D[this_kmer] += 1
                    except KeyError:
                        pass
                reads_to_write_out_L.append(read)
                #out_reads_f.write( read + "\n" )
                cont = False

            #### If an occurrence of this kmer was found, replace it with X's
            ####    and write out the read
            else:
                found_any = True
                tot_num_kmer_occurs += 1
                read = read[:kmer_pos] + "X" * k + read[(kmer_pos + k):]

    for read in reads_to_write_out_L:
        out_reads_f.write(read + "\n")

    reads_f.close()
    out_reads_f.close()

    #### Normalize the counts_by_kmer_D into freqs
    freqs_by_kmer_D = RBNS_utils.normalize_D(counts_by_kmer_D)
    freq_reads_w_kmer = float(num_reads_w_kmer) / tot_num_reads

    return_D = {
        "out_reads_F": out_reads_F,
        "tot_num_reads": tot_num_reads,
        "num_reads_w_kmer": num_reads_w_kmer,
        "freq_reads_w_kmer": freq_reads_w_kmer,
        "tot_num_kmer_occurs": tot_num_kmer_occurs,
        "counts_by_kmer_D": counts_by_kmer_D,
        "freqs_by_kmer_D": freqs_by_kmer_D
    }

    return return_D
Example #21
def get_best_match_of_kmer_to_foundingkmer(
        kmer_to_align, founding_kmer, possible_comps_by_foundingk_alignk_D):
    """
    - Will try to align the kmer_to_align to the founding_kmer (trying all
        possible sliding combinations), with the number of mismatches
        allowed specified by possible_comps_by_foundingk_alignk_D

    - Returns:
        return_D = {"best_match_offset": best_match_offset,
            "best_match_side": best_match_side,
            "best_match": best_match}
    """
    best_match_offset = None
    best_match_side = None
    best_match = None
    best_match_position_in_allowedmatchesL = 100
    ####    allowed_matches_L is like:
    ####        ["side1_mismatch0", "side0_mismatch1", "side2_mismatch0"],
    ####        where side is the number of unaligned (overhang) positions,
    ####        and mismatch is the # of mismatches among the aligned positions
    ####    - This list is ordered from best -> worst, so if multiple offsets
    ####        are in the list, we'll use the one that has the lowest
    ####        best_match_position_in_allowedmatchesL
    allowed_matches_L = possible_comps_by_foundingk_alignk_D\
            [len(founding_kmer)][len(kmer_to_align)]
    sides_allowed_L = [int(x.split("side")[-1][0]) for x in allowed_matches_L]

    for offset in range(-3, len(founding_kmer)):

        #### Get the # of nt hanging off the "side" of the founding kmer
        if (offset < 0):
            side = abs(offset)
            pos_to_align = len(kmer_to_align) - side
            kmer_to_align_for_mismatches = kmer_to_align[(-1 * pos_to_align):]
            founding_kmer_for_mismatches = founding_kmer[:pos_to_align]

        else:
            side = max(offset + len(kmer_to_align) - len(founding_kmer), 0)

            if (side == 0):
                kmer_to_align_for_mismatches = kmer_to_align
            else:
                kmer_to_align_for_mismatches = kmer_to_align[:-1 * side]

            founding_kmer_for_mismatches = founding_kmer[
                offset:offset + len(kmer_to_align_for_mismatches)]

        if side not in sides_allowed_L:
            continue

        #### Get the # of mismatches between the kmer_to_align_for_mismatches
        ####    and founding_kmer_for_mismatches
        mismatches = RBNS_utils.hamming_distance(kmer_to_align_for_mismatches,
                                                 founding_kmer_for_mismatches)
        #### See if this side/mismatch combination is allowed
        this_match = "side{0}_mismatch{1}".format(side, mismatches)

        if (this_match in allowed_matches_L):
            pos_in_allowedmatchesL = allowed_matches_L.index(this_match)
            #### If this offset is better than the previous best one (i.e.,
            ####    has a lower index in allowed_matches_L), record it
            if (pos_in_allowedmatchesL <
                    best_match_position_in_allowedmatchesL):
                best_match_position_in_allowedmatchesL = pos_in_allowedmatchesL
                best_match_offset = offset
                best_match_side = side
                best_match = this_match

    return_D = {
        "best_match_offset": best_match_offset,
        "best_match_side": best_match_side,
        "best_match": best_match
    }

    return return_D
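
A worked illustration with made-up kmers: aligning kmer_to_align "GCATG"
against founding_kmer "TGCATGC" at offset 1 gives
side = max(1 + 5 - 7, 0) = 0 overhanging positions and 0 mismatches
("GCATG" vs founding_kmer[1:6] = "GCATG"), i.e. this_match =
"side0_mismatch0"; at offset -1, side = 1 and the last 4 nt of
kmer_to_align are compared against the first 4 nt of founding_kmer.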
Example #22
def get_num_reads_by_CplusG_content(reads_w_str_F,
                                    fiveP_adapt_len,
                                    threeP_adapt_len,
                                    min_perc_reads_for_CGbin=4.):
    """
    - Given a file of reads, calculates the number of reads in each C+G bin
        (counting the number of C+G's in the random region) and determines
        which bins will be used to match PD reads to
    - Only C+G bins that have at least min_perc_reads_for_CGbin % of reads will
        be used (i.e., don't want to use very lowly populated C+G bins)
    """
    ##['GAGTTCTACAGTCCGACGATCTGAACCGAACATATTCTACGTGGAATTCTCGGGTGCCAAGG',
    ## '0.819 0.818 0.867 0.867 0.866 0.900 0.857 0.821 0.983 0.182 0.930 0.953 0.176 0.076 0.834 0.848 0.017 0.957 0.196 0.868 0.948 0.202 0.200 0.009 0.012 0.089 0.102 0.968 0.947 0.093 0.787 0.029 0.042 0.120 0.910 0.950 0.232 0.119 0.777 0.781 0.822 0.890 0.988 0.998 0.965 0.961 0.989 0.898 0.851 0.226 0.185 0.874 0.877 0.096 0.074 0.077 0.756 0.746 0.005 0.091 0.103 0.056',
    ##  '(((((((((.((..((.(.((......)).)...))..)))))))))))..((...))....',
    ##  'sssssssssissiissisisshhhhhhssisiiissiisssssssssssmmsshhhssffff']
    numreads_by_numCG_D = {}
    start_time = time.time()

    #### Get the read length
    for four_lines_L in RBNS_utils.iterNlines(reads_w_str_F,
                                              4,
                                              strip_newlines=True):
        read_len = len(four_lines_L[0]) - fiveP_adapt_len - threeP_adapt_len
        break
    for i in range(read_len + 1):
        numreads_by_numCG_D[i] = 0

    #### Go through all of the reads
    for four_lines_L in RBNS_utils.iterNlines(reads_w_str_F,
                                              4,
                                              strip_newlines=True):
        rand_read = four_lines_L[0][fiveP_adapt_len:(fiveP_adapt_len +
                                                     read_len)]
        num_CG = len([x for x in rand_read if x in ["C", "G"]])
        numreads_by_numCG_D[num_CG] += 1

    pprint.pprint(numreads_by_numCG_D)
    end_time = time.time()

    #### Get the number of reads in each CG bin
    total_num_reads = float(sum(numreads_by_numCG_D.values()))
    bin_percreads_T_L = [(num_CG, numreads_by_numCG_D[num_CG]*100./total_num_reads)\
            for num_CG in numreads_by_numCG_D ]

    #### Prune for those CG-bins that have at least x%
    bin_percreads_T_L = [
        tupl for tupl in bin_percreads_T_L
        if (tupl[1] >= min_perc_reads_for_CGbin)
    ]
    #### Renormalize so the sum of the pruned bins adds up to 1.
    total_after_pruned = sum([tupl[1] for tupl in bin_percreads_T_L])
    CGbin_normedfreqofreads_T_L = [(tupl[0], tupl[1] / total_after_pruned)\
            for tupl in bin_percreads_T_L]
    CGbin_normedfreqofreads_T_L.sort(key=lambda x: x[0])

    return_D = {
        "numreads_by_numCG_D": numreads_by_numCG_D,
        'CGbin_normedfreqofreads_T_L': CGbin_normedfreqofreads_T_L
    }

    return return_D
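
A worked example of the pruning and renormalization with made-up numbers:
if five C+G bins hold 2%, 20%, 40%, 30%, and 8% of reads and
min_perc_reads_for_CGbin=4., the 2% bin is dropped; the remaining 98% is
renormalized so the kept bins have frequencies 20/98 = 0.204,
40/98 = 0.408, 30/98 = 0.306, and 8/98 = 0.082.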
Example #23
def write_output_F_from_inputreads_props_max_given_PD_readstokeepbyCG_D(
        in_reads_w_str_F, out_reads_w_str_F, numreads_to_keep_by_bin_D,
        fiveP_adapt_len, threeP_adapt_len):
    """
    - Given a set of input reads (in_reads_w_str_F) and the number of reads
        to keep for each C+G bin, gets the required number of reads in each bin
        and writes them out to out_reads_w_str_F
    """
    #### If the out_reads_w_str_F already exists, return
    if ( os.path.exists( out_reads_w_str_F ) and\
            os.stat( out_reads_w_str_F ).st_size > 100000 ):
        return

    #### Make a copy of the numreads_to_keep_by_bin_D to run down
    cp_numreads_to_keep_by_bin_D = copy.copy(numreads_to_keep_by_bin_D)

    for four_lines_L in RBNS_utils.iterNlines(in_reads_w_str_F,
                                              4,
                                              strip_newlines=True):
        read_len = len(four_lines_L[0]) - fiveP_adapt_len - threeP_adapt_len
        break
    assert (read_len in [20, 40])

    #### For the in_reads_w_str_F, get the actual number of reads in each
    ####    C+G bin to see if there are enough; if not, we will need to
    ####    downsample each of the C+G bins
    num_reads_in_F_by_CG_bin_D = {}
    for CG_bin in cp_numreads_to_keep_by_bin_D:
        num_reads_in_F_by_CG_bin_D[CG_bin] = 0
    for four_lines_L in RBNS_utils.iterNlines(in_reads_w_str_F,
                                              4,
                                              strip_newlines=True):

        rand_read = four_lines_L[0][fiveP_adapt_len:(fiveP_adapt_len +
                                                     read_len)]
        num_CG = len([x for x in rand_read if x in ["C", "G"]])
        try:
            num_reads_in_F_by_CG_bin_D[num_CG] += 1
        except KeyError:
            pass

    #### Go through and get the lowest CG_bin factor - that is, find which
    ####    needed C+G bin is least populated; that factor is multiplied by
    ####    the values of numreads_to_keep_by_bin_D to determine how many
    ####    reads in each C+G bin can actually be written to out_reads_w_str_F
    min_CG_factor = 1.
    any_limiting_CG_bin = 'No limiting CG bin'
    for CG_bin, num_target_reads in cp_numreads_to_keep_by_bin_D.iteritems():

        num_in_reads = num_reads_in_F_by_CG_bin_D[CG_bin]
        prop_of_reads_needed_in_inF = float(num_in_reads) / num_target_reads
        if (prop_of_reads_needed_in_inF < min_CG_factor):
            any_limiting_CG_bin = 'CG bin {0}: desired {1:,}, have only {2:,} -> min_CG_factor = {3:.2f}'.format(
                CG_bin, num_target_reads, num_in_reads,
                prop_of_reads_needed_in_inF)
        min_CG_factor = min(min_CG_factor, prop_of_reads_needed_in_inF)

    #### Now go through and downsample the copied_numreads_to_keep_by_bin_D if
    ####    necessary (i.e., if the min_CG_factor is less than 1)
    copied_numreads_to_keep_by_bin_D = {}
    for CG_bin, orig_num_target_reads in cp_numreads_to_keep_by_bin_D.iteritems(
    ):
        copied_numreads_to_keep_by_bin_D[CG_bin] = int(min_CG_factor *
                                                       orig_num_target_reads)

    out_DIR = os.path.dirname(out_reads_w_str_F)
    os.system("mkdir -p {}".format(out_DIR))

    log_DIR = os.path.join(out_DIR, "logs")

    out_log_F = os.path.join(
        log_DIR,
        os.path.basename(out_reads_w_str_F).split(".gz")[0] + ".log.txt")

    out_f = gzip.open(out_reads_w_str_F, 'wb')

    #### Get the read length
    for four_lines_L in RBNS_utils.iterNlines(in_reads_w_str_F,
                                              4,
                                              strip_newlines=True):
        read_len = len(four_lines_L[0]) - fiveP_adapt_len - threeP_adapt_len
        break

    #### The total number of reads to write out
    num_reads_to_write = sum(copied_numreads_to_keep_by_bin_D.values())

    #### Go through all of the reads, writing out those in needed C+G bins
    for four_lines_L in RBNS_utils.iterNlines(in_reads_w_str_F,
                                              4,
                                              strip_newlines=True):

        rand_read = four_lines_L[0][fiveP_adapt_len:(fiveP_adapt_len +
                                                     read_len)]
        num_CG = len([x for x in rand_read if x in ["C", "G"]])
        if (num_CG in copied_numreads_to_keep_by_bin_D):
            if (copied_numreads_to_keep_by_bin_D[num_CG] > 0):
                out_f.write("\n".join(four_lines_L) + "\n")
                copied_numreads_to_keep_by_bin_D[num_CG] -= 1
                num_reads_to_write -= 1

        if (num_reads_to_write == 0):
            break

    out_f.close()
    with open(out_log_F, 'w') as f:
        CG_bins_L = copied_numreads_to_keep_by_bin_D.keys()
        CG_bins_L.sort()
        f.write(any_limiting_CG_bin)

        f.write("\n\nREMAINING READS:\n")
        for CG_bin in CG_bins_L:
            f.write("CG {0}:\t{1}\n".format(
                CG_bin, copied_numreads_to_keep_by_bin_D[CG_bin]))
Example #24
 def get_split_whandle(self):
     """
     returns a write file handle to the split reads
     """
     return RBNS_utils.aopen(self.get_split_reads(), 'w')
Example #25
def write_output_F_from_inputreads_and_numreadstokeepbyCG_D(
        in_reads_w_str_F, out_reads_w_str_F, numreads_to_keep_by_bin_D,
        fiveP_adapt_len, threeP_adapt_len):
    """
    - Given a pulldown folded reads file (in_reads_w_str_F), will write out
        a new file (out_reads_w_str_F) containing a subset of reads according
        to numreads_to_keep_by_bin_D, which dictates how many reads containing
        each number of C+G bases should be included
    """
    #### If the out_reads_w_str_F already exists, return
    if ( os.path.exists( out_reads_w_str_F ) and\
            os.stat( out_reads_w_str_F ).st_size > 100000 ):
        return

    #### Make a copy of the numreads_to_keep_by_bin_D to run down
    copied_numreads_to_keep_by_bin_D = copy.copy(numreads_to_keep_by_bin_D)

    out_DIR = os.path.dirname(out_reads_w_str_F)
    os.system("mkdir -p {}".format(out_DIR))

    log_DIR = os.path.join(out_DIR, "logs")

    out_log_F = os.path.join(
        log_DIR,
        os.path.basename(out_reads_w_str_F).split(".gz")[0] + ".log.txt")

    out_f = gzip.open(out_reads_w_str_F, 'wb')

    #### Get the read length
    for four_lines_L in RBNS_utils.iterNlines(in_reads_w_str_F,
                                              4,
                                              strip_newlines=True):
        read_len = len(four_lines_L[0]) - fiveP_adapt_len - threeP_adapt_len
        break
    assert (read_len in [20, 40])

    #### The total number of reads to write out
    num_reads_to_write = sum(copied_numreads_to_keep_by_bin_D.values())

    #### Go through all of the reads, writing out those in needed C+G bins
    for four_lines_L in RBNS_utils.iterNlines(in_reads_w_str_F,
                                              4,
                                              strip_newlines=True):

        rand_read = four_lines_L[0][fiveP_adapt_len:(fiveP_adapt_len +
                                                     read_len)]
        num_CG = len([x for x in rand_read if x in ["C", "G"]])
        if (num_CG in copied_numreads_to_keep_by_bin_D):
            if (copied_numreads_to_keep_by_bin_D[num_CG] > 0):
                out_f.write("\n".join(four_lines_L) + "\n")
                copied_numreads_to_keep_by_bin_D[num_CG] -= 1
                num_reads_to_write -= 1

        if (num_reads_to_write == 0):
            break

    out_f.close()
    with open(out_log_F, 'w') as f:
        CG_bins_L = copied_numreads_to_keep_by_bin_D.keys()
        CG_bins_L.sort()
        f.write("REMAINING READS:\n")
        for CG_bin in CG_bins_L:
            f.write("CG {0}:\t{1}\n".format(
                CG_bin, copied_numreads_to_keep_by_bin_D[CG_bin]))

        f.write("\n\n reads used:\n")
        for CG_bin in CG_bins_L:
            f.write("CG_num_reads {0}:\t{1}\n".format(
                CG_bin, numreads_to_keep_by_bin_D[CG_bin]))
Example #26
 def get_0nM_enrichment_kmer(self, kmer):
     kmer_i = RBNS_utils.get_index_from_kmer(kmer)
     return self.get_0nM_enrichment(kmer_i)
Example #27
 def kmer_value(self, kmer):
     kmeri = RBNS_utils.get_index_from_kmer(kmer)
     return self.kmeri_value(kmeri)
Example #28
 def get_B(self, kmer_i, read_len):
     assert kmer_i is not None  #### 0 (the all-A kmer) is a valid index
     enrichment = self.get_enrichment(kmer_i)
     B = RBNS_utils.B_factor(enrichment, self.k, read_len)
     return B
Example #29
 def get_B_kmer(self, kmer, read_len):
     assert len(kmer) == self.k
     enrichment = self.get_enrichment_kmer(kmer)
     B = RBNS_utils.B_factor(enrichment, self.k, read_len)
     return B
Example #30
 def weight_dict(self):
     kmer2weight = {}
     for kmer, weight in zip(RBNS_utils.yield_kmers(self.k), self.profile):
         kmer2weight[kmer] = weight
     return kmer2weight