def chunk_case_control_files(opts, contig, j, contigs_N):
    """Process one contig's IPD matrices in column chunks using a worker pool.

    The k-mer labels for the contig are read from
    ``<opts.tmp>/<contig>_ipdskmers.tmp``. Their column indices are divided
    into contiguous ranges and, for every range, a ``cut`` shell command is
    prepared for both ``<contig>_ipds.tmp`` and ``<contig>_ipdsN.tmp`` that
    writes the selected columns to ``<file>.sub.<chunk index>``. The commands
    are handed to ``process_contig_chunk`` through ``mbin.launch_pool`` and
    the per-chunk dictionaries are merged into two contig-wide ones.

    Args:
        opts: Options object. ``tmp``, ``control_pkl_name`` and ``procs`` are
            read here; the whole object is also forwarded to each worker.
        contig: Contig name used to locate the temporary files.
        j: Zero-based index of this contig (used for progress logging and
            forwarded to the workers).
        contigs_N: Total number of contigs being chunked.

    Returns:
        Tuple ``(control_means, contig_SCp, contig_SCp_N, contig)`` where
        ``control_means`` is the object unpickled from
        ``opts.control_pkl_name`` and the two dicts are keyed by motif.
    """
    logging.info(" ...chunking contig %s (%s/%s)..." % (contig, (j + 1), contigs_N))

    # NOTE(review): unpickling executes arbitrary code; the control pickle
    # must come from a trusted source.
    with open(opts.control_pkl_name, "rb") as pkl_fh:
        control_means = pickle.load(pkl_fh)

    contig_ipds_fn = os.path.join(opts.tmp, "%s_ipds.tmp" % contig)
    contig_ipds_N_fn = os.path.join(opts.tmp, "%s_ipdsN.tmp" % contig)
    contig_ipds_kmers_fn = os.path.join(opts.tmp, "%s_ipdskmers.tmp" % contig)

    # loadtxt returns a 0-d array for a single-entry file; atleast_1d keeps
    # len() usable in that case.
    kmers = np.atleast_1d(np.loadtxt(contig_ipds_kmers_fn, dtype="str"))
    fns = [contig_ipds_fn, contig_ipds_N_fn]

    n_chunks = 99
    # Convert to float *before* dividing so the ceiling is meaningful under
    # Python 2 integer division, and never let the chunk size drop to zero.
    chunksize = max(1, int(math.ceil(float(len(kmers)) / n_chunks)))
    # chunks() is defined elsewhere; presumably it yields consecutive slices
    # of at most `chunksize` column indices.
    cols_chunks = list(chunks(range(len(kmers)), chunksize))

    args = []
    for i, cols_chunk in enumerate(cols_chunks):
        # cut(1) uses 1-based, inclusive field ranges.
        cut_cols = "%s-%s" % ((cols_chunk[0] + 1), (cols_chunk[-1] + 1))
        cut_CMDs = []
        for fn in fns:
            out_fn = fn + ".sub.%s" % i
            cut_CMDs.append("cut -d$\'\\t\' -f%s %s > %s" % (cut_cols, fn, out_fn))
        args.append((i, opts.control_pkl_name, cut_CMDs, kmers, cols_chunk,
                     j, n_chunks, contigs_N, opts))

    results = mbin.launch_pool(opts.procs, process_contig_chunk, args)

    # Each result carries (SCp dict, SCp_N dict) for the motifs of one chunk.
    contig_SCp = {}
    contig_SCp_N = {}
    for result in results:
        for motif in result[0].keys():
            contig_SCp[motif] = result[0][motif]
            contig_SCp_N[motif] = result[1][motif]

    return control_means, contig_SCp, contig_SCp_N, contig
def chunk_control_matrices(self, control_ipds_fn, control_ipds_N_fn, control_kmers_fn):
    """Process the control IPD matrices in column chunks using a worker pool.

    The k-mer labels are read from ``control_kmers_fn`` and their column
    indices are divided into contiguous ranges. For every range a ``cut``
    shell command is prepared for each of the two matrix files, writing the
    selected columns to ``<file>.sub.<chunk index>``. The commands are handed
    to ``process_contig_chunk`` through ``mbin.launch_pool`` and the
    per-chunk results are merged.

    Args:
        control_ipds_fn: Path of the first tab-delimited control matrix.
        control_ipds_N_fn: Path of the second tab-delimited control matrix
            (same column layout as ``control_ipds_fn``).
        control_kmers_fn: Path of the file listing one label per column.

    Returns:
        Tuple ``(control_means, not_found)``: a dict keyed by motif merged
        from element 0 of every chunk result, and the sum of element 1 of
        every chunk result.
    """
    # atleast_1d keeps len() usable when the file holds a single label.
    kmers = np.atleast_1d(np.loadtxt(control_kmers_fn, dtype="str"))
    fns = [control_ipds_fn, control_ipds_N_fn]

    n_chunks = 99
    # Convert to float *before* dividing so the ceiling is meaningful under
    # Python 2 integer division, and never let the chunk size drop to zero.
    chunksize = max(1, int(math.ceil(float(len(kmers)) / n_chunks)))
    # chunks() is defined elsewhere; presumably it yields consecutive slices
    # of at most `chunksize` column indices.
    cols_chunks = list(chunks(range(len(kmers)), chunksize))

    args = []
    for i, cols_chunk in enumerate(cols_chunks):
        # cut(1) uses 1-based, inclusive field ranges.
        cut_cols = "%s-%s" % ((cols_chunk[0] + 1), (cols_chunk[-1] + 1))
        cut_CMDs = []
        for fn in fns:
            out_fn = fn + ".sub.%s" % i
            cut_CMDs.append("cut -d$\'\\t\' -f%s %s > %s" % (cut_cols, fn, out_fn))
        args.append((i, cut_CMDs, kmers, cols_chunk, n_chunks,
                     self.opts.min_motif_count))

    results = mbin.launch_pool(self.opts.procs, process_contig_chunk, args)

    logging.info("Combining motifs from all chunks of control data...")
    not_found = 0
    control_means = {}
    for result in results:
        not_found += result[1]
        control_means.update(result[0])
    logging.info("Done.")

    return control_means, not_found
def bas_stream_files(self):
    """Score motifs read-by-read and return the non-redundant high scorers.

    One task is created per line of ``<tmp>/read_ipds.tmp.trans`` and run
    through ``get_motif_scores_from_read`` via ``mbin.launch_pool``. Every
    non-``None`` result is treated as a motif label of the form
    ``"<sequence>-<index>"``. Among those, a motif is discarded when one of
    the shortest motifs occurs inside its sequence at an offset that places
    both at the same index, so that only the shortest representation is kept.

    Returns:
        set: The motif labels that survive the redundancy filter (empty when
        no task returned a motif).
    """
    reads_ipds_fn = os.path.join(self.opts.tmp, "read_ipds.tmp")
    reads_ipds_kmers_fn = os.path.join(self.opts.tmp, "read_ipdskmers.tmp")
    reads_ipds_N_fn = os.path.join(self.opts.tmp, "read_ipdsN.tmp")

    logging.info("Unpickling the control IPDs...")
    # Pickles are binary data: open in "rb", matching how the control pickle
    # is read and written elsewhere.
    # NOTE(review): unpickling executes arbitrary code; the control pickle
    # must come from a trusted source.
    with open(self.opts.control_pkl_name, "rb") as pkl_fh:
        control_means = pickle.load(pkl_fh)
    logging.info("Done.")

    # Only the number of lines matters here; each worker receives its line
    # index and the file names.
    with open(reads_ipds_fn + ".trans", "r") as trans_fh:
        n_lines = sum(1 for _ in trans_fh)

    args = [(j, copy.copy(control_means), reads_ipds_fn, reads_ipds_N_fn,
             reads_ipds_kmers_fn, self.opts)
            for j in range(n_lines)]

    results = mbin.launch_pool(self.opts.procs, get_motif_scores_from_read, args)
    highscore_motifs = [x for x in results if x is not None]

    # Keep only the shortest version of the high scoring motifs (reduces redundancy)
    if not highscore_motifs:
        return set()

    # Split every label once: (label, sequence, index).
    parsed = []
    for label in highscore_motifs:
        fields = label.split("-")
        parsed.append((label, fields[0], int(fields[1])))

    shortest_len = min(len(seq) for _, seq, _ in parsed)
    shortest = [entry for entry in parsed if len(entry[1]) == shortest_len]

    to_del = set()
    for shorty, shorty_str, shorty_idx in shortest:
        for motif, motif_str, motif_idx in parsed:
            if motif == shorty:
                continue
            # The sequence is used as a regular expression, as before.
            match = re.search(shorty_str, motif_str)
            if match is not None and (shorty_idx + match.start()) == motif_idx:
                to_del.add(motif)

    return set(motif for motif in highscore_motifs if motif not in to_del)
def build_profiles(opts, h5_files, motifs, motifs_fn):
    """Build methylation profiles for the given motifs and write feature files.

    The temporary directory ``opts.tmp`` is recreated empty, the motif
    settings are stored on ``opts``, and an ``mbin.mbinRunner`` drives the
    pipeline. Which aggregation steps run depends on ``opts.h5_type``
    (``"cmp"`` or ``"bas"``). ``opts.tmp`` is removed again at the end.

    Args:
        opts: Options object. It is mutated: ``motifs_file``, ``motifs`` and
            ``bi_motifs`` are (re)assigned. ``tmp``, ``N_reads``, ``h5_type``,
            ``h5_labels``, ``procs``, ``cross_cov_bins``, ``sam`` and
            ``aligned_read_barcodes`` are read.
        h5_files: Iterable of input file paths, each loaded with
            ``mbinRunner.launch_data_loader``.
        motifs: Collection of motifs to profile (only its length is used
            here; it is stored on ``opts`` for the runner).
        motifs_fn: Path recorded as ``opts.motifs_file``.

    Returns:
        None. Results are written by the ``write_*_features`` helpers.
    """
    # Always start from an empty temporary directory.
    if os.path.exists(opts.tmp):
        shutil.rmtree(opts.tmp)
    os.mkdir(opts.tmp)

    opts.motifs_file = motifs_fn
    opts.motifs = motifs
    opts.bi_motifs = None

    logging.info("Building methylation profiles using %s motifs..." % len(opts.motifs))

    mbinRunner = mbin.mbinRunner(opts)

    ##################################################
    # Launch analysis of <N_reads> for motif filtering
    ##################################################
    for i, h5_file in enumerate(h5_files):
        logging.info("Creating %s barcodes (%s motifs) from %s..."
                     % (opts.N_reads, len(opts.motifs), h5_file))
        mbinRunner.launch_data_loader(h5_file, opts.N_reads, i, opts)

        # NOTE(review): the cmp aggregation runs once per input file because
        # it needs the current `h5_file` — confirm against intended usage
        # with more than one cmp file.
        if opts.h5_type == "cmp":
            logging.info(
                "Combining subread-level barcodes to get read-level barcodes from each contig..."
            )
            contig_labels_fns = glob.glob(os.path.join(opts.tmp, "*_labels.tmp"))
            # A real list (not a lazy map) so len() works on Python 3 too.
            contigs = [os.path.basename(fn).split("_labels.tmp")[0]
                       for fn in contig_labels_fns]
            # Use a distinct index name so the outer loop's `i` is not rebound.
            args = [(h5_file, contig, opts.tmp, opts.h5_labels, k, len(contigs))
                    for k, contig in enumerate(contigs)]
            mbin.launch_pool(opts.procs, combine_subreads_for_read_level, args)

            logging.info("Combining read-level barcodes from all contigs...")
            mbinRunner.combine_read_level_barcodes_across_contigs()
            logging.info("Done.")

            logging.info("Creating contig-level barcodes (%s motifs) from %s..."
                         % (len(opts.motifs), h5_file))
            mbinRunner.combine_subreads_for_contig_level(h5_file)
            logging.info("Done.")

            if opts.cross_cov_bins is not None:
                logging.info("Creating bin-level barcodes (%s motifs) using %s..."
                             % (len(opts.motifs), opts.cross_cov_bins))
                mbinRunner.combine_contigs_for_bin_level()
                logging.info("Done.")

    if opts.h5_type == "bas":
        # Combine subread data across multiple movies
        logging.info("Combining subread data across all movies...")
        mbinRunner.combine_subread_data_across_bas_movies()
        logging.info("Done.")

        # Combine movie-merged subreads to get read-level barcodes
        logging.info("Combining subreads to get read-level barcodes...")
        mbinRunner.bas_combine_subreads_for_read_level()
        logging.info("Done.")

        # Contig-level barcodes for unaligned reads are only built when a SAM
        # file is supplied, after the read-contig assignments are written.
        if opts.sam is not None:
            logging.info("Writing read-contig assignments based on %s..." % opts.sam)
            mbinRunner.get_read_refs_from_SAM()
            logging.info("Done.")

            for h5_file in h5_files:
                logging.info("Creating contig-level barcodes (%s motifs) from %s..."
                             % (len(opts.motifs), h5_file))
                mbinRunner.combine_subreads_for_contig_level(h5_file)
                logging.info("Done.")

    logging.info("Writing output files:")
    if opts.h5_type == "cmp":
        write_contig_features(mbinRunner, opts)
        if opts.aligned_read_barcodes:
            write_aligned_read_features(mbinRunner, opts)
    elif opts.h5_type == "bas":
        write_unaligned_read_features(mbinRunner, opts)

    logging.info("Cleaning up temp files from methylation profiling...")
    shutil.rmtree(opts.tmp)
    logging.info("Pipeline finished.")
def run(self, mbinRunner):
    """Filter the candidate motifs and write the keepers to ``opts.motifs_fn``.

    For ``opts.h5_type == "cmp"``:
      * contig lengths are read from the FASTA file ``opts.contigs`` into
        ``self.contig_fasta_lens``;
      * every ``*_ipds.tmp`` file in ``opts.tmp`` is routed by size: files
        below 25,000,000 bytes are transposed and streamed in a worker pool,
        larger ones go through ``chunk_case_control_files``;
      * the per-contig score dictionaries are optionally pooled per bin
        (when ``opts.cross_cov_bins`` is set) and passed to
        ``simplify_motifs``;
      * the control means returned by the workers are merged into the
        control dictionary, degenerate motifs are added via
        ``self.add_degen_motifs`` and, if any were added, the control pickle
        is rewritten.

    For ``opts.h5_type == "bas"`` the read matrices are transposed and the
    keepers come from ``self.bas_stream_files()``.

    In both cases ``self.motifs`` is set to the list of kept motifs and the
    motifs are written one per line to ``opts.motifs_fn``.

    Args:
        mbinRunner: Not used by this method; kept for interface
            compatibility.

    Returns:
        None.
    """
    ####################################################
    # Filter out motifs without significant signatures
    ####################################################
    logging.info("Getting top motifs from each contig...")
    if self.opts.h5_type == "cmp":
        self.contig_fasta_lens = {}
        for entry in SeqIO.parse(self.opts.contigs, "fasta"):
            # SMRT assemblies add |quiver to contig names, but this
            # gets dropped from the contig names in the cmp.h5 file.
            name = entry.id.replace("|quiver", "")
            self.contig_fasta_lens[mbin.slugify(name)] = len(entry.seq)

        contig_ipds_fns = glob.glob(os.path.join(self.opts.tmp, "*_ipds.tmp"))
        contigs = [os.path.basename(fn).split("_ipds.tmp")[0]
                   for fn in contig_ipds_fns]
        ipds_fn_dict = dict(zip(contigs, contig_ipds_fns))

        # Small matrices are transposed and streamed; large ones are chunked.
        contigs_for_transpose = []
        contigs_for_chunking = []
        maxsize_for_transpose = 25000000  # 25Mb
        for name in contigs:
            if os.path.getsize(ipds_fn_dict[name]) < maxsize_for_transpose:
                contigs_for_transpose.append(name)
            else:
                contigs_for_chunking.append(name)

        logging.info("Transposing %s case contigs..." % len(contigs_for_transpose))
        args = [(contig, self.opts) for contig in contigs_for_transpose]
        mbin.launch_pool(self.opts.procs, transpose_contig_matrix, args)

        streamed_contig_dicts = {}
        if contigs_for_transpose:
            logging.info("Streaming through %s contigs..." % len(contigs_for_transpose))
            args = [(self.opts, contig, i, len(contigs_for_transpose))
                    for i, contig in enumerate(contigs_for_transpose)]
            results = mbin.launch_pool(self.opts.procs, stream_case_control_files, args)
            # Each result is indexed as (SCp, SCp_N, contig name).
            for result in results:
                streamed_contig_dicts[result[2]] = {
                    "SCp": result[0],
                    "SCp_N": result[1],
                }

        control_means = None
        chunked_contigs_dicts = {}
        if contigs_for_chunking:
            logging.info("Chunking %s contigs..." % len(contigs_for_chunking))
            for i, contig in enumerate(contigs_for_chunking):
                control_means, contig_SCp, contig_SCp_N, contig = chunk_case_control_files(
                    self.opts, contig, i, len(contigs_for_chunking))
                chunked_contigs_dicts[contig] = {
                    "SCp": contig_SCp,
                    "SCp_N": contig_SCp_N,
                }

        # The chunking path returns the unpickled control means. When nothing
        # was chunked, load them here so the merge below always has a base.
        if control_means is None:
            # NOTE(review): unpickling executes arbitrary code; the control
            # pickle must come from a trusted source.
            with open(self.opts.control_pkl_name, "rb") as pkl_fh:
                control_means = pickle.load(pkl_fh)

        # Combine the contig dictionaries from both streaming and chunked
        # paths (chunked entries win on a name clash).
        contig_dicts = streamed_contig_dicts.copy()
        contig_dicts.update(chunked_contigs_dicts)

        args = []
        if self.opts.cross_cov_bins is not None:
            # Using contig<-->bin mappings, collect methylation data from each
            # contig and compile them into methylation scores for each bin.
            # Then discover motifs based on bin-level scores.
            bin_map = {}
            with open(self.opts.cross_cov_bins, "r") as bins_fh:
                for line in bins_fh:
                    line = line.strip()
                    if not line:
                        continue
                    fields = line.split(",")
                    bin_map[fields[0]] = int(fields[1])

            bin_ids = sorted(set(bin_map.values()))

            # Initialize the bin-level methylation dictionaries
            bin_contig_dicts = dict(
                (bin_id, {"SCp": {}, "SCp_N": {}}) for bin_id in bin_ids)

            for contig, contig_d in contig_dicts.items():
                # Membership test rather than truthiness: bin id 0 is valid.
                if contig in bin_map:
                    bin_id = bin_map[contig]
                    bin_contig_dicts[bin_id] = build_bin_dict(
                        contig, bin_id, contig_d, bin_contig_dicts[bin_id])
                else:
                    logging.info(
                        "Contig %s not found in cross-coverage binning results."
                        % contig)

            for bin_id in bin_ids:
                # For each bin with data, do motif filtering and refinement
                if bin_contig_dicts[bin_id]["SCp"]:
                    bin_copy = copy.deepcopy(bin_contig_dicts[bin_id])
                    args.append((bin_id, bin_copy["SCp"], bin_copy["SCp_N"],
                                 self.opts, len(bin_ids), bin_id, "bin"))
        else:
            n_contig_dicts = len(contig_dicts)
            for j, (contig, contig_d) in enumerate(contig_dicts.items()):
                # For each contig, do motif filtering and refinement
                contig_copy = copy.deepcopy(contig_d)
                args.append((j, contig_copy["SCp"], contig_copy["SCp_N"],
                             self.opts, n_contig_dicts, contig, "contig"))

        results = mbin.launch_pool(self.opts.procs, simplify_motifs, args)

        # Each result is indexed as (set of keeper motifs, control means dict).
        # Add the control means for these motifs to the complete set of
        # control means for all detected motifs.
        keeper_motifs = set()
        for result in results:
            keeper_motifs |= result[0]
            control_means.update(result[1])

        # Rewrite the control so that it includes the new degenerate motifs.
        control_means, n_degen = self.add_degen_motifs(keeper_motifs, control_means)
        if n_degen > 0:
            with open(self.opts.control_pkl_name, "wb") as pkl_fh:
                pickle.dump(control_means, pkl_fh)

    elif self.opts.h5_type == "bas":
        logging.info("Transposing reads...")
        files = [
            os.path.join(self.opts.tmp, "read_ipds.tmp"),
            os.path.join(self.opts.tmp, "read_ipdsN.tmp"),
        ]
        mbin.launch_pool(len(files), transpose_file, files)
        logging.info("Done.")

        logging.info("Streaming through reads for motif filtering...")
        keeper_motifs = self.bas_stream_files()

    logging.info("Keeping %s motifs for further analysis" % len(keeper_motifs))
    self.motifs = list(keeper_motifs)
    with open(self.opts.motifs_fn, "w") as f_motifs:
        for motif in keeper_motifs:
            f_motifs.write("%s\n" % motif)