def submit_get_suboptimal_block_sampled_DotBracket_reads_F(
        in_struct_gz_F,
        temp,
        starting_scratch_DIR,
        block_idx,
        num_reads_per_block):
    """
    - For an in_struct_gz_F split_reads file, submits a job to run the
        get_suboptimal_block_sampled_DotBracket_reads_F() function below
    """
    #### Get this file path
    filename = inspect.getframeinfo(inspect.currentframe()).filename
    this_script_path = os.path.abspath(filename)

    output_DIR = os.path.dirname(in_struct_gz_F)
    errors_outputs_DIR = os.path.join(output_DIR, "errors_outputs")
    RBNS_utils.make_dir(errors_outputs_DIR)

    command = ('python %(this_script_path)s '
               'get_suboptimal_block_sampled_DotBracket_reads_F '
               '%(in_struct_gz_F)s '
               '%(temp)s '
               '%(starting_scratch_DIR)s '
               '%(block_idx)s '
               '%(num_reads_per_block)s ' % locals())

    job_name = "{0}_block{1}_get_suboptimal_block_sampled_DotBracket_reads_F".format(
        os.path.basename(in_struct_gz_F).split(".")[0], block_idx)

    RBNS_cluster_utils.launch(
        command,
        out_file=os.path.join(errors_outputs_DIR, "{}.log".format(job_name)),
        jobname=job_name,
        error_DIR=errors_outputs_DIR,
        time_mins=690)    # 11.5 hours (12 hours is the limit)
def launch_counter(lib_settings, count_type, k, error_DIR):
    """
    - Launches a job to perform kmer counts of count_type, calling the
        'counter' function below
    """
    split_reads = lib_settings.get_split_reads()
    out_pkl = lib_settings.counts_file(count_type, k)
    RBNS_utils.make_dir(os.path.dirname(out_pkl))

    cluster_python_script = os.path.abspath(__file__)
    barcode = lib_settings.get_barcode()
    out_file = os.path.join(error_DIR,
                            'count.%s.%s.%i.out' % (barcode, count_type, k))
    err_file = os.path.join(error_DIR,
                            'count.%s.%s.%i.err' % (barcode, count_type, k))

    command = ('python %(cluster_python_script)s '
               'counter '
               '%(count_type)s '
               '%(split_reads)s '
               '%(k)i '
               '%(out_pkl)s ' % locals())
               #'1> %(out_file)s '
               #'2> %(err_file)s ' % locals())

    conc = lib_settings.get_conc()
    jobname = '%s.%s.%i.%g' % (os.path.basename(split_reads), count_type, k, conc)
    return launch(command, jobname=jobname, error_DIR=error_DIR)
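#### A minimal, self-contained sketch (not part of the pipeline) of how the
####    '%(name)s ... % locals()' pattern used above expands local variables
####    into the submitted command string; all values below are hypothetical
####    placeholders.
def _example_command_interpolation():
    cluster_python_script = "/path/to/RBNS_main.py"
    count_type = "naive"
    split_reads = "/path/to/split_reads/RBP_320.reads"
    k = 6
    out_pkl = "/path/to/counts/naive/RBP_320.6.pkl"
    command = ('python %(cluster_python_script)s '
               'counter '
               '%(count_type)s '
               '%(split_reads)s '
               '%(k)i '
               '%(out_pkl)s ' % locals())
    return command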
def save_enrichments(self, enrich_pkl):
    RBNS_utils.make_dir(os.path.dirname(enrich_pkl))
    enriches_by_kmer_D = {}
    num_kmers = len(self.enrichments)
    k = int(math.log(num_kmers, 4.))
    for kmer_num, kmer in enumerate(RBNS_utils.yield_kmers(k)):
        enriches_by_kmer_D[kmer] = self.enrichments[kmer_num]
    cPickle.dump(enriches_by_kmer_D, open(enrich_pkl, 'wb'))
def save_0nM_enrichments(self, enrich_pkl):
    enriches_by_kmer_D = {}
    num_kmers = len(self.enrichments_to_0nM)
    k = int(math.log(num_kmers, 4.))
    for kmer_num, kmer in enumerate(RBNS_utils.yield_kmers(k)):
        enriches_by_kmer_D[kmer] = self.enrichments_to_0nM[kmer_num]
    cPickle.dump(enriches_by_kmer_D, open(enrich_pkl, 'wb'))
def load_0nM_enrichments(self, enrich_pkl):
    enriches_by_kmer_D = cPickle.load(open(enrich_pkl, 'rb'))
    k = int(math.log(len(enriches_by_kmer_D), 4.))
    enriches_L = []
    for kmer in RBNS_utils.yield_kmers(k):
        enriches_L.append(enriches_by_kmer_D[kmer])
    self.enrichments_to_0nM = np.array(enriches_L)
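#### The save/load functions above recover k from the number of entries in
####    the pickled dictionary (a complete kmer dictionary has 4^k entries),
####    and they rely on RBNS_utils.yield_kmers( k ) always yielding the 4^k
####    kmers in the same fixed order so that positions in self.enrichments
####    line up on save and load. A standalone sketch of that assumed
####    (lexicographic ACGT) ordering -- a hypothetical helper, not the
####    pipeline's own implementation:
import itertools

def _example_yield_kmers(k):
    for tupl in itertools.product("ACGT", repeat=k):
        yield "".join(tupl)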
def get_0nM_enrichment_dict(self):
    assert len(self.enrichments_to_0nM) == 4 ** self.k
    return {kmer: enrich for kmer, enrich in
            zip(RBNS_utils.yield_kmers(self.k), self.enrichments_to_0nM)}
def split_reads_exist(self):
    """
    Returns True if the split reads file for this library exists and is
    non-empty; does not check whether it is complete.
    """
    return RBNS_utils.file_exists(self.get_split_reads())
def check_barcodes_are_separated(self, min_hamming_distance=2):
    """
    - Makes sure the barcodes are all totally distinguishable (i.e., every
        pair of barcodes has a Hamming distance of at least
        min_hamming_distance)
    """
    for b1, b2 in itertools.combinations(self.settings['barcodes'], 2):
        hamming_dist = RBNS_utils.hamming_distance(b1, b2)
        if hamming_dist < min_hamming_distance:
            raise ValueError('The barcodes supplied are not well '
                             'separated: %s-%s' % (b1, b2))
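#### A standalone illustration of the pairwise check above: with the default
####    min_hamming_distance of 2, two barcodes differing at only one
####    position (e.g., 'ACGT' vs. 'ACGA') would raise a ValueError. The
####    helper below is only a sketch of equal-length Hamming distance, not
####    RBNS_utils.hamming_distance itself.
import itertools

def _example_barcode_separation(barcodes_L, min_hamming_distance=2):
    def _hamming(b1, b2):
        return sum(1 for c1, c2 in zip(b1, b2) if c1 != c2)
    for b1, b2 in itertools.combinations(barcodes_L, 2):
        if _hamming(b1, b2) < min_hamming_distance:
            raise ValueError('Barcodes not well separated: %s-%s' % (b1, b2))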
def calculate_enrichment(self, k, input_lib):
    enrich_pkl = os.path.join(
        self.experiment_settings.get_rdir(),
        'enrichment_Ds',
        '%s_%s_to_input.%imer.enrichments.pkl' %
        (self.experiment_settings.get_property('protein_name'),
         self.lib_settings.get_conc_string(), k))
    if RBNS_utils.file_exists(enrich_pkl):
        self.type2k2counts['naive'][k].load_enrichments(enrich_pkl)
    else:
        input_profile = input_lib.type2k2counts['naive'][k]
        self.type2k2counts['naive'][k].calculate_enrichments(input_profile)
        self.type2k2counts['naive'][k].save_enrichments(enrich_pkl)
def get_kmer_freqs_from_reads_F(reads_F, k, vals_sum_to="sumto1"):
    """
    - Returns the kmer counts & freqs in reads_F
    - INPUTs:
        - vals_sum_to:
            "sumto1": all 4^k entries sum to 1
            "sumto4^k": all 4^k entries sum to 4^k
            "none": only the raw counts are returned (no normalization)
    """
    counts_by_kmer_D = {}
    for kmer in RBNS_utils.yield_kmers(k):
        counts_by_kmer_D[kmer] = 0

    with open(reads_F) as f:
        for line in f:
            read = line.strip()
            for start_pos in range(len(read) - k + 1):
                kmer = read[start_pos:(start_pos + k)]
                #### Only include the kmer if it doesn't contain an N
                ####    (N-containing kmers are not keys of counts_by_kmer_D)
                try:
                    counts_by_kmer_D[kmer] += 1
                except KeyError:
                    pass

    return_D = {"counts_by_kmer_D": counts_by_kmer_D}

    if (vals_sum_to == "none"):
        return counts_by_kmer_D
    #### Normalize using the helper function in RBNS_utils
    elif (vals_sum_to == "sumto1"):
        freqs_by_kmer_D = RBNS_utils.normalize_D(counts_by_kmer_D)
    elif (vals_sum_to == "sumto4^k"):
        freqs_by_kmer_D = RBNS_utils.normalize_D(
            counts_by_kmer_D, vals_sum_to="sumto4^k")
    else:
        raise ValueError(
            "{0} is not a valid vals_sum_to argument; use 'sumto1', "
            "'sumto4^k', or 'none'".format(vals_sum_to))

    return_D["freqs_by_kmer_D"] = freqs_by_kmer_D
    return return_D
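#### Hypothetical usage of get_kmer_freqs_from_reads_F() above: the returned
####    dictionary carries the raw counts plus frequencies normalized
####    according to vals_sum_to (the reads file path below is a
####    placeholder).
def _example_get_kmer_freqs():
    return_D = get_kmer_freqs_from_reads_F(
        "/path/to/split_reads/RBP_input.reads", 6, vals_sum_to="sumto1")
    counts_by_kmer_D = return_D["counts_by_kmer_D"]
    freqs_by_kmer_D = return_D["freqs_by_kmer_D"]
    #### With "sumto1", the frequencies over all 4^6 kmers should sum to ~1
    assert abs(sum(freqs_by_kmer_D.values()) - 1.0) < 1e-6
    return counts_by_kmer_D, freqs_by_kmer_D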
def calculate_enrichment_to_0nM(self, k, zero_nM_lib):
    zero_enrich_pkl = os.path.join(
        self.experiment_settings.get_rdir(),
        'enrichment_Ds',
        '%s_%s_to_0nM.%imer.enrichments.pkl' %
        (self.experiment_settings.get_property('protein_name'),
         self.lib_settings.get_conc_string(), k))
    zero_nM_profile = zero_nM_lib.type2k2counts['naive'][k]
    self.type2k2counts['naive'][k].calculate_enrichments_to_0nM(
        zero_nM_profile)
    if not RBNS_utils.file_exists(zero_enrich_pkl):
        self.type2k2counts['naive'][k].save_0nM_enrichments(zero_enrich_pkl)
def stream_without_continual_update(k, in_weights, inFile):
    """
    - Performs the streaming kmer assignment (SKA) algorithm in which the
        weights are NOT continually updated after each read, only after
        going through all of the reads (typically used from the second pass
        onward)
    """
    new_weights = np.ones(4 ** k)
    for linei, line in enumerate(RBNS_utils.aopen(inFile)):
        read_seq = line.strip()
        pk = get_kmers(read_seq, k)
        additional_weights = assign_kmer_weights(pk, in_weights)
        #### The weights assigned across a read's kmers should sum to ~1
        assert abs(sum(additional_weights) - 1.0) < 0.001
        for kmer, weight in zip(pk, additional_weights):
            kmeri = get_index_from_kmer(kmer)
            new_weights[kmeri] += weight
    return new_weights
def get_sig_enriched_kmers_from_txt_R_F(txt_R_F,
                                        most_enriched_lib_conc=None,
                                        num_std_for_sig=2):
    """
    INPUT:
        - txt_R_F: a file of enrichments from the pipeline (e.g.,
            /net/uorf/data/nobackup/pfreese/RBNS_results/Fox_1_7_14/tables/RBFox2_enrichment_R.6mers.txt)
        - most_enriched_lib_conc: the column header in the first row of the
            txt_F (e.g., "80"); can be passed in manually. If nothing is
            passed in, the concentration with the highest enrichment will
            be chosen.
    RETURNS:
        - return_D = {"sig_enriched_kmers_L": sig_enriched_kmers_L,
                      "sig_enrichments_by_kmer_D": sig_enrichments_by_kmer_D}
    """
    #### Get the dictionary of enrichments
    enriches_by_kmer_D = return_D_of_enrichments_from_txt_F(
        txt_R_F, most_enriched_lib_conc=most_enriched_lib_conc)

    #### A list of enrichments
    enrichments_L = enriches_by_kmer_D.values()
    mean, std = RBNS_utils.mean_std(enrichments_L)
    sig_threshold = mean + (num_std_for_sig * std)

    #### A list of the sig. enriched kmers and R values
    sig_enriched_kmer_R_tuples_L = []
    #### A dictionary of the enrichments, containing ONLY the sig. enriched
    ####    kmers
    sig_enrichments_by_kmer_D = {}
    for kmer in enriches_by_kmer_D:
        R = enriches_by_kmer_D[kmer]
        if (R >= sig_threshold):
            sig_enriched_kmer_R_tuples_L.append((kmer, R))
            sig_enrichments_by_kmer_D[kmer] = enriches_by_kmer_D[kmer]
    sig_enriched_kmer_R_tuples_L.sort(key=lambda x: -1 * x[1])
    sig_enriched_kmers_L = [tupl[0] for tupl in sig_enriched_kmer_R_tuples_L]

    return_D = {"sig_enriched_kmers_L": sig_enriched_kmers_L,
                "sig_enrichments_by_kmer_D": sig_enrichments_by_kmer_D}
    return return_D
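#### A standalone numeric illustration of the significance cutoff used
####    above: a kmer is kept if its R is at least num_std_for_sig standard
####    deviations above the mean R over all kmers. The values below are
####    made up, and the std here is computed directly rather than via
####    RBNS_utils.mean_std.
def _example_sig_threshold(num_std_for_sig=2):
    enrichments_L = [1.0, 1.1, 0.9, 1.2, 6.5, 1.0, 0.8]
    mean = sum(enrichments_L) / len(enrichments_L)
    var = sum((R - mean) ** 2 for R in enrichments_L) / len(enrichments_L)
    std = var ** 0.5
    sig_threshold = mean + (num_std_for_sig * std)
    #### Only the clear outlier (6.5) passes the mean + 2*std cutoff here
    return [R for R in enrichments_L if R >= sig_threshold]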
def get_subopt_folding_of_reads(reads_L,
                                scratch_DIR,
                                temp,
                                out_F_to_append_results_to,
                                num_to_return_for_each_read=20):
    """
    - Given a list of reads (with adapters) reads_L, will get
        num_to_return_for_each_read suboptimal DotBracket structures
        sampled with probabilities equal to their Boltzmann weights
    """
    tmp_read_fasta_F = os.path.join(scratch_DIR, "reads.fa")
    out_F = os.path.join(scratch_DIR, "reads.out.txt")

    read_by_readwindex_D = {}
    with open(tmp_read_fasta_F, "w") as f:
        for idx, read in enumerate(reads_L):
            read_w_index = "read{0}".format(idx)
            read_by_readwindex_D[read_w_index] = read
            f.write(">read{0}\n{1}\n".format(idx, read))

    os.chdir(scratch_DIR)
    fold_CMD = "RNAsubopt --temp={0} --stochBT={1} < {2} > {3}".format(
        temp, num_to_return_for_each_read, tmp_read_fasta_F, out_F)
    #### Run RNAsubopt to stochastically sample structures for each read
    fold = subprocess.Popen(fold_CMD, shell=True)
    stdoutdata, stderrdata = fold.communicate()

    #### Now go through all of the reads
    this_read = ""
    out_f_to_append_results_to = gzip.open(out_F_to_append_results_to, 'ab')
    for lines_L in RBNS_utils.iterNlines(out_F,
                                         num_to_return_for_each_read + 2,
                                         strip_newlines=True):
        this_read = lines_L[1]
        out_f_to_append_results_to.write(">" + this_read + "\n")
        #### Now go through all of the num_to_return_for_each_read
        ####    DotBracket structures
        for DB_str in lines_L[2:]:
            element_string = get_elementstring_from_DotBracket(DB_str)
            out_f_to_append_results_to.write(element_string + "\n")
    out_f_to_append_results_to.close()
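#### A standalone sketch of the record layout parsed above: each RNAsubopt
####    record is read as (num_to_return_for_each_read + 2) lines -- a
####    '>read<i>' header, the read sequence, then one sampled DotBracket
####    structure per line -- so lines_L[1] is the read and lines_L[2:] are
####    its structures. The record below is hypothetical (as if --stochBT=3).
def _example_parse_RNAsubopt_record():
    lines_L = [">read0",
               "GGGACUUCGGAUCC",
               "((((......))))",
               "((((.(...)))))",
               ".((((....)))).'"[:14]]
    read = lines_L[1]
    DotBracket_structures_L = lines_L[2:]
    assert len(DotBracket_structures_L) == 3
    return read, DotBracket_structures_L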
def stream_continual_update(k, weights, inFile):
    """
    - Performs the streaming kmer assignment (SKA) algorithm in which the
        weights are continually updated after each read (typically, this is
        used for just the first pass)
    """
    total_lines = count_lines(inFile) * 2
    start_time = time.time()
    for linei, line in enumerate(RBNS_utils.aopen(inFile)):
        if linei % 10000 == 0 and linei:
            elapsed_time = time.time() - start_time
            print 'Predicted time remaining for stream_continual_update:',\
                (total_lines - linei) / linei * elapsed_time / 3600,\
                'hours'
        read_seq = line.strip()
        pk = get_kmers(read_seq, k)
        assigned_weights = assign_kmer_weights(pk, weights)
        for kmer, weight in zip(pk, assigned_weights):
            kmeri = get_index_from_kmer(kmer)
            weights[kmeri] += weight
    return weights
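#### A hypothetical sketch of the multi-pass SKA flow described in the
####    docstrings above: a first pass with continually updated weights,
####    then additional passes whose weights are only refreshed after a
####    full sweep. The file path and number of passes are placeholders.
def _example_multi_pass_SKA(k, reads_F, num_additional_passes=2):
    import numpy as np
    weights = np.ones(4 ** k)
    #### First pass: weights updated after every read
    weights = stream_continual_update(k, weights, reads_F)
    #### Later passes: weights held fixed within each sweep
    for pass_num in range(num_additional_passes):
        weights = stream_without_continual_update(k, weights, reads_F)
    return weights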
def stream_continual_update_with_convergence_table(k,
                                                   weights,
                                                   inFile,
                                                   out_file,
                                                   how_often_to_write=10000):
    """
    - Performs the streaming kmer assignment (SKA) algorithm in which the
        weights are continually updated after each read (typically, this is
        used for just the first pass); also makes an output summary table
        at the end
    """
    internal_history = []
    for linei, line in enumerate(RBNS_utils.aopen(inFile)):
        if linei % how_often_to_write == 0:
            norm_weights = copy.copy(weights)
            norm_weights = normalize_mean_1(norm_weights)
            internal_history.append(norm_weights)
        read_seq = line.strip()
        pk = get_kmers(read_seq, k)
        assigned_weights = assign_kmer_weights(pk, weights)
        for kmer, weight in zip(pk, assigned_weights):
            kmeri = get_index_from_kmer(kmer)
            weights[kmeri] += weight

    of = open(out_file, 'w')
    of.write('kmer\t' + '\t'.join(
        ['reads_read_%i' % (i * how_often_to_write) for i in
         range(len(internal_history))]) + '\n')
    for kmer_i, kmer in enumerate(yield_kmers(k)):
        of.write('%s\t' % kmer)
        for col_i in range(len(internal_history)):
            assert len(internal_history[col_i]) == 4 ** k
            of.write('%g\t' % internal_history[col_i][kmer_i])
        of.write('\n')
    of.close()
    return weights
def get_B_values(self, read_len):
    return [RBNS_utils.B_factor(enrich, self.k, read_len)
            for enrich in self.enrichments]
def make_temp_reads_F(orig_reads_F,
                      target_reads_DIR,
                      read_length_to_use="full_length",
                      num_reads_to_use="all",
                      target_reads_basename=None,
                      start_frac=0.0):
    """
    - Given an orig_reads_F, makes a new reads file containing only reads
        that DON'T contain any N's
    - INPUT:
        - orig_reads_F: a file (e.g., .reads) of the reads
        - target_reads_DIR: where the output .reads file will be written
        - read_length_to_use: the length of reads that will be included in
            the output reads file
            - "full_length": the full read will be used
        - num_reads_to_use:
            - "all": all reads
            - an int: will use up to that many reads
            - a float (from 0.0 to 1.0): that proportion of the total reads
                in orig_reads_F
        - target_reads_basename: the output reads basename; if not passed
            in, it will be the orig_reads_F basename with a time stamp
            appended
        - start_frac: how far through the orig_reads_F to start getting
            reads
    - RETURNS:
        return_D = {"out_reads_F": out_reads_F,
                    "num_reads_in_out_reads_F": rd_num}
    """
    #### Make the out_reads_F
    if (target_reads_basename is None):
        reads_basename = os.path.basename(orig_reads_F).rsplit(".", 1)[0] +\
            ".{}.reads".format(
                datetime.datetime.now().strftime("%Hh_%Mm_%Ss"))
    else:
        if (target_reads_basename[-6:] == ".reads"):
            reads_basename = target_reads_basename
        else:
            reads_basename = target_reads_basename + ".reads"
    out_reads_F = os.path.join(target_reads_DIR, reads_basename)
    os.system("mkdir -p {}".format(target_reads_DIR))
    out_reads_f = open(out_reads_F, "w")

    #### Get the num_reads_to_use
    total_lines_in_file = RBNS_utils.return_num_lines_in_F(orig_reads_F)
    if (num_reads_to_use == "all"):
        reads_to_use = 10000000000
    elif (type(num_reads_to_use) is int):
        reads_to_use = num_reads_to_use
    elif (type(num_reads_to_use) is float):
        assert (num_reads_to_use <= 1.0 and num_reads_to_use > 0.0)
        reads_to_use = int(num_reads_to_use * total_lines_in_file)

    #### Get the line_to_start_at from the start_frac passed in
    if (start_frac == 0.0):
        line_to_start_at = 0
    else:
        line_to_start_at = int(total_lines_in_file * start_frac)
    line_lower = line_to_start_at
    line_upper = line_to_start_at + reads_to_use

    #### Get the read length to use if it's "full_length"
    if (read_length_to_use == "full_length"):
        rd_length_to_use = get_readlength(orig_reads_F)
    else:
        assert (type(read_length_to_use) is int)
        rd_length_to_use = read_length_to_use

    #### Now populate the out_reads_F
    with open(orig_reads_F) as f:
        reads_written = 0
        this_read = -1
        for line in f:
            this_read += 1
            if (this_read >= line_to_start_at):
                ln = line.strip()
                #### Only use reads that are long enough and have no N's
                if ((len(ln) >= rd_length_to_use) and (ln.find("N") == -1)):
                    out_reads_f.write(ln[:rd_length_to_use] + "\n")
                    reads_written += 1
                    if (reads_written >= reads_to_use):
                        break
    out_reads_f.close()

    return_D = {"out_reads_F": out_reads_F,
                "num_reads_in_out_reads_F": reads_written}
    return return_D
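#### Hypothetical usage of make_temp_reads_F() above: take ~10% of the
####    reads (starting halfway through the file), trimmed to 20 nt, with
####    all N-containing reads dropped. All paths and numbers below are
####    placeholders.
def _example_make_temp_reads_F():
    return_D = make_temp_reads_F(
        "/path/to/split_reads/RBP_320.reads",
        "/path/to/scratch_DIR",
        read_length_to_use=20,
        num_reads_to_use=0.1,
        target_reads_basename="RBP_320.subset",
        start_frac=0.5)
    return return_D["out_reads_F"], return_D["num_reads_in_out_reads_F"]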
def plot_R_by_Ppaired_bin_w_sig(readswstruct_startingbasename_myannot_L,
                                read_len,
                                k,
                                effective_R_D,
                                kmers_to_do="top_10"):
    """
    - Makes a plot of the Ppaired ratios for the desired set of kmers
    - fld_CG_match_DIR is the directory that contains the Ppaired_Ds
        directory:
        /net/eofe-data010/data001/burgelab/nevermind/data/nm/pfreese/RBFOX3_test/split_reads/fld_CG_match
    """
    import random
    import RBNS_plots

    assert (kmers_to_do in ["top_10"])

    fld_CG_match_DIR = os.path.dirname(
        readswstruct_startingbasename_myannot_L[0][0])
    RBP = readswstruct_startingbasename_myannot_L[0][1].split('_')[0]

    #### Get the list of all kmers, mapping each kmer to its motif_num
    all_kmers_L = RBNS_utils.return_all_kmers_L(k)
    kmer_to_motifidx_D = {}
    motifidx_to_kmer_D = {}
    for idx, kmer in enumerate(all_kmers_L):
        kmer_to_motifidx_D[kmer] = idx
        motifidx_to_kmer_D[idx] = kmer

    out_DIR = fld_CG_match_DIR.split("/split_reads")[0]
    os.system("mkdir -p {}".format(out_DIR))
    out_DIR_this_RBP_k = os.path.join(out_DIR, "{0}mer_plots".format(k))
    out_Ds_DIR = os.path.join(out_DIR_this_RBP_k, 'Ds')
    os.system("mkdir -p {}".format(out_Ds_DIR))
    tables_DIR = os.path.join(out_DIR_this_RBP_k, 'tables')
    os.system("mkdir -p {}".format(tables_DIR))

    out_F_start = os.path.join(out_DIR_this_RBP_k, RBP)

    #### Get the most enriched concentration
    most_enriched_conc_str = ""
    for T in readswstruct_startingbasename_myannot_L:
        if (T[2] == 'Most enriched'):
            most_enriched_conc_str = "{0} nM".format(T[1].split("_")[-1])

    Ds_DIR = os.path.join(fld_CG_match_DIR, "Ppaired_Ds/{0}".format(k))

    most_enriched_lib_annotation = ""
    #### annots_L will be like:
    ####    ['input', '5_nM', '20_nM', '80_nM', '320_nM', '1300_nM']
    annots_L = []
    #### D_by_annot_D will have keys like 'input', '5_nM', etc. and values:
    ##      {'AAAAAA': {-10: {'Ppaired_sum': 2.061,
    ##                        'counts': 4},
    ##                  -9: {'Ppaired_sum': 1.027,
    ##                       'counts': 5},
    D_by_annot_D = {}
    for reads_w_struct_F, starting_basename, my_annot in\
            readswstruct_startingbasename_myannot_L:
        if (my_annot == "Input"):
            lib_annot = 'input'
        else:
            lib_annot = starting_basename.split("_")[-1] + "_nM"
        if (my_annot == "Most enriched"):
            most_enriched_lib_annotation = lib_annot
        annots_L.append(lib_annot)
        D_F = os.path.join(Ds_DIR, "{0}.D.pkl".format(starting_basename))
        D_by_annot_D[lib_annot] = pickle.load(
            open(D_F))['counts_by_kmer_binidx_D']

    #### Get the desired kmers (e.g., the top 10)
    if (kmers_to_do == "top_10"):
        kmer_R_T_L = [(kmer, effective_R_D[kmer]) for kmer in effective_R_D]
        kmer_R_T_L.sort(key=lambda x: -1 * x[1])
        top_kmers_L = [tupl[0] for tupl in kmer_R_T_L[:10]]
        #top_kmers_L = RBNS_exp.return_top_X_kmers( k, int( 4 ** k ) )

    for kmer_idx, kmer_to_plot in enumerate(top_kmers_L):

        R = effective_R_D[kmer_to_plot]
        title = r"{0}, {1} (\#{2}: $R={3:.2f}$)".format(
            RBP, kmer_to_plot.replace("T", "U"), kmer_idx + 1, R)
        print title

        ##### Make an output .txt table of the Ppaireds of this motif and
        ####    the Ppaired ratio for the upstream & downstream flanking
        ####    positions
        out_txt_F = os.path.join(
            tables_DIR,
            "{0}.{1}.R_by_Ppaired_bin.txt".format(
                RBP, kmer_to_plot.replace("T", "U")))
        out_txt_f = open(out_txt_F, 'w')
        out_txt_f.write("{0} {1} R by Ppaired bin".format(
            RBP, kmer_to_plot.replace("T", "U")))
        out_txt_f.write("\t0-0.2\t0.2-0.4\t0.4-0.6\t0.6-0.8\t0.8-1.0")

        ##### First get the INPUT frequency in each of the 5 Ppaired bins
        input_D = D_by_annot_D['input']
        input_freq_by_bin_D = {}
        input_kmer_counts_all_bins = 0.
        input_all_counts_all_bins = 0.
        for bin_idx in range(5):
            total_counts_this_bin = sum(
                [input_D[all_kmer][bin_idx] for all_kmer in input_D])
            kmer_counts_this_bin = input_D[kmer_to_plot][bin_idx]
            kmer_freq = float(kmer_counts_this_bin) / total_counts_this_bin
            input_freq_by_bin_D[bin_idx] = kmer_freq
            input_kmer_counts_all_bins += kmer_counts_this_bin
            input_all_counts_all_bins += total_counts_this_bin

        enrichments_by_kmer_conc_bin_D = {kmer_to_plot: {}}
        for lib_annot, D in D_by_annot_D.iteritems():

            if (lib_annot == 'input'):
                continue

            kmer_counts_all_bins = 0.
            all_counts_all_bins = 0.

            enrichments_by_kmer_conc_bin_D[kmer_to_plot][lib_annot] = {}
            #### The original motif_num of this kmer
            motif_num = kmer_to_motifidx_D[kmer_to_plot]
            this_D = D_by_annot_D[lib_annot]
            out_txt_f.write("\n{}".format(lib_annot))
            for bin_idx in range(5):
                total_counts_this_bin = sum(
                    [this_D[all_kmer][bin_idx] for all_kmer in this_D])
                kmer_counts_this_bin = this_D[kmer_to_plot][bin_idx]
                kmer_freq = float(kmer_counts_this_bin) / total_counts_this_bin
                kmer_counts_all_bins += kmer_counts_this_bin
                all_counts_all_bins += total_counts_this_bin
                try:
                    kmer_R = kmer_freq / input_freq_by_bin_D[bin_idx]
                except ZeroDivisionError:
                    kmer_R = 1.
                out_txt_f.write("\t{0:.3f}".format(kmer_R))
                enrichments_by_kmer_conc_bin_D[kmer_to_plot][lib_annot][
                    bin_idx] = kmer_R

            ##### Get the OVERALL (over all bins) R
            overall_R = (kmer_counts_all_bins / all_counts_all_bins) / (
                input_kmer_counts_all_bins / input_all_counts_all_bins)
            enrichments_by_kmer_conc_bin_D[kmer_to_plot][lib_annot][
                'overall'] = overall_R

        print "\nSaving to: {}".format(out_F_start)
        Ppaired_upper_bins_L = [0.2, 0.4, 0.6, 0.8, 1.]
        returned_D = RBNS_plots.plot_enrichment_by_5_Ppaired_bins(
            enrichments_by_kmer_conc_bin_D,
            annots_L,
            [kmer_to_plot],
            Ppaired_upper_bins_L,
            out_F_start,
            read_len,
            title=title,
            #plot_signif = True,
            plot_signif=False)
        sigB_by_kmer_conc_bin_D = returned_D['sigB_by_kmer_conc_bin_D']

        out_ratio_D_F = os.path.join(
            out_Ds_DIR,
            "{}.sigB_by_kmer_conc_bin_D.pkl".format(kmer_to_plot))
        try:
            D = sigB_by_kmer_conc_bin_D[kmer_to_plot]
            RBNS_utils.pkl_with_formatfile(
                D, out_ratio_D_F, num_to_include_in_format="all")
        except KeyError:
            pass

        out_D_F = os.path.join(
            out_Ds_DIR,
            "{}.enrichments_by_conc_bin_D.pkl".format(kmer_to_plot))
        RBNS_utils.pkl_with_formatfile(
            enrichments_by_kmer_conc_bin_D[kmer_to_plot],
            out_D_F,
            num_to_include_in_format="all")

        out_txt_f.close()
def return_frequency_and_number_of_reads_kmer_in_reads_F(reads_F, kmer):
    """
    - For a reads_F, makes a new out_reads_F in the same directory in which
        each occurrence of the kmer is replaced with "X"s
    - Called by functions in RBNS_logos.py

    - RETURNS:
        return_D = {"out_reads_F": out_reads_F,
                    "tot_num_reads": tot_num_reads,
                    "num_reads_w_kmer": num_reads_w_kmer,
                    "freq_reads_w_kmer": freq_reads_w_kmer,
                    "tot_num_kmer_occurs": tot_num_kmer_occurs,
                    "counts_by_kmer_D": counts_by_kmer_D,
                    "freqs_by_kmer_D": freqs_by_kmer_D}
    """
    k = len(kmer)
    read_len = get_readlength(reads_F)

    orig_reads_DIR = os.path.dirname(reads_F)
    orig_reads_basename = os.path.basename(reads_F)
    out_basename = orig_reads_basename.rsplit(".", 1)[0] +\
        "_{}.reads".format(kmer)
    #### If the file name is over 100 characters, shorten it
    if (len(out_basename) >= 100):
        out_basename = "{}.reads".format(kmer)
    out_reads_F = os.path.join(orig_reads_DIR, out_basename)

    #### The number of reads and number of times the kmer was found
    tot_num_reads = 0
    num_reads_w_kmer = 0
    tot_num_kmer_occurs = 0

    #### A dictionary of kmer frequencies for the reads written out
    counts_by_kmer_D = {}
    for this_kmer in RBNS_utils.yield_kmers(k):
        counts_by_kmer_D[this_kmer] = 0

    reads_f = open(reads_F)
    out_reads_f = open(out_reads_F, "w")
    reads_to_write_out_L = []
    for line in reads_f:

        tot_num_reads += 1
        if (len(reads_to_write_out_L) == 10000):
            for read in reads_to_write_out_L:
                out_reads_f.write(read + "\n")
            reads_to_write_out_L = []

        read = line.strip()
        cont = True
        found_any = False
        while (cont == True):
            kmer_pos = read.find(kmer)
            #### If there are no (more) occurrences of the kmer, count up
            ####    the remaining kmers and queue the read to be written
            if (kmer_pos == -1):
                if (found_any == True):
                    num_reads_w_kmer += 1
                for start_pos in range(read_len - k + 1):
                    this_kmer = read[start_pos:(start_pos + k)]
                    try:
                        counts_by_kmer_D[this_kmer] += 1
                    except KeyError:
                        pass
                reads_to_write_out_L.append(read)
                #out_reads_f.write( read + "\n" )
                cont = False
            #### If an occurrence of this kmer was found, replace it with
            ####    X's and keep scanning the read
            else:
                found_any = True
                tot_num_kmer_occurs += 1
                read = read[:kmer_pos] + "X" * k + read[(kmer_pos + k):]

    for read in reads_to_write_out_L:
        out_reads_f.write(read + "\n")
    reads_f.close()
    out_reads_f.close()

    #### Normalize the counts_by_kmer_D into freqs
    freqs_by_kmer_D = RBNS_utils.normalize_D(counts_by_kmer_D)

    freq_reads_w_kmer = float(num_reads_w_kmer) / tot_num_reads

    return_D = {"out_reads_F": out_reads_F,
                "tot_num_reads": tot_num_reads,
                "num_reads_w_kmer": num_reads_w_kmer,
                "freq_reads_w_kmer": freq_reads_w_kmer,
                "tot_num_kmer_occurs": tot_num_kmer_occurs,
                "counts_by_kmer_D": counts_by_kmer_D,
                "freqs_by_kmer_D": freqs_by_kmer_D}
    return return_D
def get_best_match_of_kmer_to_foundingkmer(kmer_to_align,
                                           founding_kmer,
                                           possible_comps_by_foundingk_alignk_D):
    """
    - Will try to align the kmer_to_align to the founding_kmer (trying all
        possible sliding combinations), with the number of mismatches
        allowed specified by possible_comps_by_foundingk_alignk_D
    - Returns:
        return_D = {"best_match_offset": best_match_offset,
                    "best_match_side": best_match_side,
                    "best_match": best_match}
    """
    best_match_offset = None
    best_match_side = None
    best_match = None
    best_match_position_in_allowedmatchesL = 100

    #### allowed_matches_L is like:
    ####    ["side1_mismatch0", "side0_mismatch1", "side2_mismatch0"],
    ####    where side is the number of unaligned (overhang) positions,
    ####    and mismatch is the # of mismatches among the aligned positions
    ####    - This list is ordered from best -> worst, so if multiple
    ####        offsets are in the list, we'll use the one that has the
    ####        lowest best_match_position_in_allowedmatchesL
    allowed_matches_L = possible_comps_by_foundingk_alignk_D\
        [len(founding_kmer)][len(kmer_to_align)]
    sides_allowed_L = [int(x.split("side")[-1][0]) for x in allowed_matches_L]

    for offset in range(-3, len(founding_kmer)):

        #### Get the # of nt hanging off the "side" of the founding kmer
        if (offset < 0):
            side = abs(offset)
            pos_to_align = len(kmer_to_align) - side
            kmer_to_align_for_mismatches = kmer_to_align[(-1 * pos_to_align):]
            founding_kmer_for_mismatches = founding_kmer[:pos_to_align]
        else:
            side = max(offset + len(kmer_to_align) - len(founding_kmer), 0)
            if (side == 0):
                kmer_to_align_for_mismatches = kmer_to_align
            else:
                kmer_to_align_for_mismatches = kmer_to_align[:-1 * side]
            founding_kmer_for_mismatches = founding_kmer[
                offset:offset + len(kmer_to_align_for_mismatches)]

        if side not in sides_allowed_L:
            continue

        #### Get the # of mismatches between the kmer_to_align_for_mismatches
        ####    and founding_kmer_for_mismatches
        mismatches = RBNS_utils.hamming_distance(kmer_to_align_for_mismatches,
                                                 founding_kmer_for_mismatches)

        #### See if this side/mismatch combination is allowed
        this_match = "side{0}_mismatch{1}".format(side, mismatches)
        if (this_match in allowed_matches_L):
            pos_in_allowedmatchesL = allowed_matches_L.index(this_match)
            #### If this offset is better than (i.e., has a lower index in
            ####    allowed_matches_L) the previous best one, record it
            if (pos_in_allowedmatchesL < best_match_position_in_allowedmatchesL):
                best_match_position_in_allowedmatchesL = pos_in_allowedmatchesL
                best_match_offset = offset
                best_match_side = side
                best_match = this_match

    return_D = {"best_match_offset": best_match_offset,
                "best_match_side": best_match_side,
                "best_match": best_match}
    return return_D
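#### A standalone worked example of the offset/side bookkeeping above (the
####    kmers are made up): sliding the 5mer 'GCATG' to offset 1 of the
####    founding 6mer 'TGCATG' leaves 0 overhanging nt (side 0) and 0
####    mismatches, i.e. the match 'side0_mismatch0'.
def _example_offset_side():
    founding_kmer = "TGCATG"
    kmer_to_align = "GCATG"
    offset = 1
    side = max(offset + len(kmer_to_align) - len(founding_kmer), 0)
    aligned_founding = founding_kmer[offset:offset + len(kmer_to_align)]
    mismatches = sum(1 for a, b in zip(kmer_to_align, aligned_founding)
                     if a != b)
    assert (side, mismatches) == (0, 0)
    return "side{0}_mismatch{1}".format(side, mismatches)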
def get_num_reads_by_CplusG_content(reads_w_str_F,
                                    fiveP_adapt_len,
                                    threeP_adapt_len,
                                    min_perc_reads_for_CGbin=4.):
    """
    - Given a file of reads, calculates the number of reads in each C+G bin
        (counting the number of C+G's in the random region) and determines
        which bins will be used to match PD reads to
    - Only C+G bins that have at least min_perc_reads_for_CGbin % of reads
        will be used (i.e., don't want to use very lowly populated C+G bins)
    """
    #### Each record of reads_w_str_F spans 4 lines, e.g.:
    ##['GAGTTCTACAGTCCGACGATCTGAACCGAACATATTCTACGTGGAATTCTCGGGTGCCAAGG',
    ## '0.819 0.818 0.867 0.867 0.866 0.900 0.857 0.821 0.983 0.182 0.930 0.953 0.176 0.076 0.834 0.848 0.017 0.957 0.196 0.868 0.948 0.202 0.200 0.009 0.012 0.089 0.102 0.968 0.947 0.093 0.787 0.029 0.042 0.120 0.910 0.950 0.232 0.119 0.777 0.781 0.822 0.890 0.988 0.998 0.965 0.961 0.989 0.898 0.851 0.226 0.185 0.874 0.877 0.096 0.074 0.077 0.756 0.746 0.005 0.091 0.103 0.056',
    ## '(((((((((.((..((.(.((......)).)...))..)))))))))))..((...))....',
    ## 'sssssssssissiissisisshhhhhhssisiiissiisssssssssssmmsshhhssffff']
    numreads_by_numCG_D = {}
    start_time = time.time()

    #### Get the read length
    for four_lines_L in RBNS_utils.iterNlines(reads_w_str_F,
                                              4,
                                              strip_newlines=True):
        read_len = len(four_lines_L[0]) - fiveP_adapt_len - threeP_adapt_len
        break
    for i in range(read_len + 1):
        numreads_by_numCG_D[i] = 0

    #### Go through all of the reads
    for four_lines_L in RBNS_utils.iterNlines(reads_w_str_F,
                                              4,
                                              strip_newlines=True):
        rand_read = four_lines_L[0][fiveP_adapt_len:(fiveP_adapt_len +
                                                     read_len)]
        num_CG = len([x for x in rand_read if x in ["C", "G"]])
        numreads_by_numCG_D[num_CG] += 1

    pprint.pprint(numreads_by_numCG_D)
    end_time = time.time()

    #### Get the number of reads in each CG bin
    total_num_reads = float(sum(numreads_by_numCG_D.values()))
    bin_percreads_T_L = [(num_CG,
                          numreads_by_numCG_D[num_CG] * 100. / total_num_reads)
                         for num_CG in numreads_by_numCG_D]
    #### Prune for those CG-bins that have at least min_perc_reads_for_CGbin %
    bin_percreads_T_L = [tupl for tupl in bin_percreads_T_L
                         if (tupl[1] >= min_perc_reads_for_CGbin)]
    #### Renormalize so the sum of the pruned bins adds up to 1.
    total_after_pruned = sum([tupl[1] for tupl in bin_percreads_T_L])
    CGbin_normedfreqofreads_T_L = [(tupl[0], tupl[1] / total_after_pruned)
                                   for tupl in bin_percreads_T_L]
    CGbin_normedfreqofreads_T_L.sort(key=lambda x: x[0])

    return_D = {"numreads_by_numCG_D": numreads_by_numCG_D,
                'CGbin_normedfreqofreads_T_L': CGbin_normedfreqofreads_T_L}
    return return_D
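#### A standalone illustration of the pruning/renormalization above (the
####    counts are made up): bins holding < 4% of reads are dropped, and the
####    remaining bins' percentages are rescaled to sum to 1.
def _example_prune_CG_bins(min_perc_reads_for_CGbin=4.):
    numreads_by_numCG_D = {8: 300, 9: 2000, 10: 4400, 11: 3000, 12: 300}
    total = float(sum(numreads_by_numCG_D.values()))
    bin_percreads_T_L = [(b, n * 100. / total)
                         for b, n in numreads_by_numCG_D.items()]
    #### Bins 8 and 12 (3% each) fall below the 4% cutoff and are dropped
    bin_percreads_T_L = [t for t in bin_percreads_T_L
                         if t[1] >= min_perc_reads_for_CGbin]
    total_after_pruned = sum(t[1] for t in bin_percreads_T_L)
    return sorted((b, p / total_after_pruned) for b, p in bin_percreads_T_L)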
def write_output_F_from_inputreads_props_max_given_PD_readstokeepbyCG_D(
        in_reads_w_str_F,
        out_reads_w_str_F,
        numreads_to_keep_by_bin_D,
        fiveP_adapt_len,
        threeP_adapt_len):
    """
    - Given a set of input reads (in_reads_w_str_F) and the number of reads
        to keep for each C+G bin, gets the required number of reads in each
        bin and writes them out to out_reads_w_str_F
    """
    #### If the out_reads_w_str_F already exists, RETURN
    if (os.path.exists(out_reads_w_str_F) and
            os.stat(out_reads_w_str_F).st_size > 100000):
        return

    #### Make a copy of the numreads_to_keep_by_bin_D to run down
    cp_numreads_to_keep_by_bin_D = copy.copy(numreads_to_keep_by_bin_D)

    #### Get the read length
    for four_lines_L in RBNS_utils.iterNlines(in_reads_w_str_F,
                                              4,
                                              strip_newlines=True):
        read_len = len(four_lines_L[0]) - fiveP_adapt_len - threeP_adapt_len
        break
    assert (read_len in [20, 40])

    #### For the in_reads_w_str_F, get the actual number of reads in each
    ####    C+G bin to see if there are enough for each; if not, we will
    ####    need to downsample each of the CG bins
    num_reads_in_F_by_CG_bin_D = {}
    for CG_bin in cp_numreads_to_keep_by_bin_D:
        num_reads_in_F_by_CG_bin_D[CG_bin] = 0
    for four_lines_L in RBNS_utils.iterNlines(in_reads_w_str_F,
                                              4,
                                              strip_newlines=True):
        rand_read = four_lines_L[0][fiveP_adapt_len:(fiveP_adapt_len +
                                                     read_len)]
        num_CG = len([x for x in rand_read if x in ["C", "G"]])
        try:
            num_reads_in_F_by_CG_bin_D[num_CG] += 1
        except KeyError:
            pass

    ##### Go through and get the lowest CG_bin factor - that is, which
    ####    needed C+G bin is least populated; this will be the factor
    ####    multiplied by the values of numreads_to_keep_by_bin_D to
    ####    determine how many in each C+G bin will __actually__ be able to
    ####    be written to out_reads_w_str_F
    min_CG_factor = 1.
    any_limiting_CG_bin = 'No limiting CG bin'
    for CG_bin, num_target_reads in cp_numreads_to_keep_by_bin_D.iteritems():
        num_in_reads = num_reads_in_F_by_CG_bin_D[CG_bin]
        prop_of_reads_needed_in_inF = float(num_in_reads) / num_target_reads
        if (prop_of_reads_needed_in_inF < min_CG_factor):
            any_limiting_CG_bin = ('CG bin {0}: desired {1:,}, have only '
                                   '{2:,} -> min_CG_factor = {3:.2f}').format(
                CG_bin, num_target_reads, num_in_reads,
                prop_of_reads_needed_in_inF)
        min_CG_factor = min(min_CG_factor, prop_of_reads_needed_in_inF)

    #### Now go through and downsample the copied_numreads_to_keep_by_bin_D
    ####    if necessary (i.e., if the min_CG_factor is less than 1)
    copied_numreads_to_keep_by_bin_D = {}
    for CG_bin, orig_num_target_reads in\
            cp_numreads_to_keep_by_bin_D.iteritems():
        copied_numreads_to_keep_by_bin_D[CG_bin] = int(
            min_CG_factor * orig_num_target_reads)

    out_DIR = os.path.dirname(out_reads_w_str_F)
    os.system("mkdir -p {}".format(out_DIR))
    log_DIR = os.path.join(out_DIR, "logs")
    out_log_F = os.path.join(
        log_DIR,
        os.path.basename(out_reads_w_str_F).split(".gz")[0] + ".log.txt")
    out_f = gzip.open(out_reads_w_str_F, 'wb')

    #### Get the read length
    for four_lines_L in RBNS_utils.iterNlines(in_reads_w_str_F,
                                              4,
                                              strip_newlines=True):
        read_len = len(four_lines_L[0]) - fiveP_adapt_len - threeP_adapt_len
        break

    #### The total number of reads to write out
    num_reads_to_write = sum(copied_numreads_to_keep_by_bin_D.values())

    #### Go through all of the reads, writing out reads from each C+G bin
    ####    until that bin's quota is used up
    for four_lines_L in RBNS_utils.iterNlines(in_reads_w_str_F,
                                              4,
                                              strip_newlines=True):
        rand_read = four_lines_L[0][fiveP_adapt_len:(fiveP_adapt_len +
                                                     read_len)]
        num_CG = len([x for x in rand_read if x in ["C", "G"]])
        if (num_CG in copied_numreads_to_keep_by_bin_D):
            if (copied_numreads_to_keep_by_bin_D[num_CG] > 0):
                out_f.write("\n".join(four_lines_L) + "\n")
                copied_numreads_to_keep_by_bin_D[num_CG] -= 1
                num_reads_to_write -= 1
                if (num_reads_to_write == 0):
                    break
    out_f.close()

    with open(out_log_F, 'w') as f:
        CG_bins_L = copied_numreads_to_keep_by_bin_D.keys()
        CG_bins_L.sort()
        f.write(any_limiting_CG_bin)
        f.write("\n\nREMAINING READS:\n")
        for CG_bin in CG_bins_L:
            f.write("CG {0}:\t{1}\n".format(
                CG_bin, copied_numreads_to_keep_by_bin_D[CG_bin]))
def get_split_whandle(self):
    """
    Returns a write file handle to the split reads.
    """
    return RBNS_utils.aopen(self.get_split_reads(), 'w')
def write_output_F_from_inputreads_and_numreadstokeepbyCG_D(
        in_reads_w_str_F,
        out_reads_w_str_F,
        numreads_to_keep_by_bin_D,
        fiveP_adapt_len,
        threeP_adapt_len):
    """
    - Given a pulldown folded reads file (in_reads_w_str_F), will write out
        a new file (out_reads_w_str_F) containing a subset of reads
        according to numreads_to_keep_by_bin_D, which dictates how many
        reads containing each number of C+G bases should be included
    """
    #### If the out_reads_w_str_F already exists, RETURN
    if (os.path.exists(out_reads_w_str_F) and
            os.stat(out_reads_w_str_F).st_size > 100000):
        return

    #### Make a copy of the numreads_to_keep_by_bin_D to run down
    copied_numreads_to_keep_by_bin_D = copy.copy(numreads_to_keep_by_bin_D)

    out_DIR = os.path.dirname(out_reads_w_str_F)
    os.system("mkdir -p {}".format(out_DIR))
    log_DIR = os.path.join(out_DIR, "logs")
    out_log_F = os.path.join(
        log_DIR,
        os.path.basename(out_reads_w_str_F).split(".gz")[0] + ".log.txt")
    out_f = gzip.open(out_reads_w_str_F, 'wb')

    #### Get the read length
    for four_lines_L in RBNS_utils.iterNlines(in_reads_w_str_F,
                                              4,
                                              strip_newlines=True):
        read_len = len(four_lines_L[0]) - fiveP_adapt_len - threeP_adapt_len
        break
    assert (read_len in [20, 40])

    #### The total number of reads to write out
    num_reads_to_write = sum(copied_numreads_to_keep_by_bin_D.values())

    #### Go through all of the reads, writing out reads from each C+G bin
    ####    until that bin's quota is used up
    for four_lines_L in RBNS_utils.iterNlines(in_reads_w_str_F,
                                              4,
                                              strip_newlines=True):
        rand_read = four_lines_L[0][fiveP_adapt_len:(fiveP_adapt_len +
                                                     read_len)]
        num_CG = len([x for x in rand_read if x in ["C", "G"]])
        if (num_CG in copied_numreads_to_keep_by_bin_D):
            if (copied_numreads_to_keep_by_bin_D[num_CG] > 0):
                out_f.write("\n".join(four_lines_L) + "\n")
                copied_numreads_to_keep_by_bin_D[num_CG] -= 1
                num_reads_to_write -= 1
                if (num_reads_to_write == 0):
                    break
    out_f.close()

    with open(out_log_F, 'w') as f:
        CG_bins_L = copied_numreads_to_keep_by_bin_D.keys()
        CG_bins_L.sort()
        f.write("REMAINING READS:\n")
        for CG_bin in CG_bins_L:
            f.write("CG {0}:\t{1}\n".format(
                CG_bin, copied_numreads_to_keep_by_bin_D[CG_bin]))
        f.write("\n\n reads used:\n")
        for CG_bin in CG_bins_L:
            f.write("CG_num_reads {0}:\t{1}\n".format(
                CG_bin, numreads_to_keep_by_bin_D[CG_bin]))
def get_0nM_enrichment_kmer(self, kmer):
    kmer_i = RBNS_utils.get_index_from_kmer(kmer)
    return self.get_0nM_enrichment(kmer_i)
def kmer_value(self, kmer):
    kmeri = RBNS_utils.get_index_from_kmer(kmer)
    return self.kmeri_value(kmeri)
def get_B(self, kmer_i, read_len):
    #### kmer_i == 0 (the poly-A kmer) is a valid index, so check for None
    ####    rather than truthiness
    assert kmer_i is not None
    enrichment = self.get_enrichment(kmer_i)
    B = RBNS_utils.B_factor(enrichment, self.k, read_len)
    return B
def get_B_kmer(self, kmer, read_len):
    assert len(kmer) == self.k
    enrichment = self.get_enrichment_kmer(kmer)
    B = RBNS_utils.B_factor(enrichment, self.k, read_len)
    return B
def weight_dict(self):
    kmer2weight = {}
    for kmer, weight in zip(RBNS_utils.yield_kmers(self.k), self.profile):
        kmer2weight[kmer] = weight
    return kmer2weight