def __check_input(opts, args, parser):
    """
    Make sure the input is in the form of either a cmp.h5 file of aligned
    reads or a FOFN of unaligned bas.h5 files. Also make sure that a
    reference fasta file is specified if aligned reads are used as input.
    """
    arg = args[0]
    h5_files = []
    opts.h5_labels = {}

    if arg[-6:] == "cmp.h5":
        print "Found cmp.h5 of aligned reads:"
        opts.h5_type = "cmp"
        opts.cmph5_contig_lens = {}
        opts.cmph5_contig_lens[arg] = {}
        h5_files.append(arg)
        print " -- %s" % arg
        print "Getting contig information from %s..." % arg
        reader = CmpH5Reader(arg)
        for entry in reader.referenceInfoTable:
            name = entry[3]
            length = entry[4]
            slug_name = mbin.slugify(name)
            opts.cmph5_contig_lens[arg][slug_name] = length
        opts.h5_labels[arg] = "remove"
        reader.close()

    elif arg[-6:] == "bas.h5":
        print "Found bas.h5 of unaligned reads:"
        opts.h5_type = "bas"
        h5_files.append(arg)
        opts.h5_labels[arg] = "remove"
        print " -- %s" % arg

    elif arg[-5:] == ".fofn":
        print "Found FOFN of bas.h5 files:"
        opts.h5_type = "bas"
        # Read one bas.h5 path per line, skipping blank lines
        fns = [line.strip() for line in open(arg, "r") if line.strip()]
        h5_files = fns
        for fn in fns:
            print " -- %s" % fn
            opts.h5_labels[fn] = "remove"

    if opts.h5_type == "bas":
        print "*************************************************************"
        print "* Motif filtering using unaligned reads is not recommended. *"
        print "* Aligned reads work much better for this!                  *"
        print "*************************************************************"
        print ""

    if opts.h5_type == "bas" and opts.cross_cov_bins is not None:
        parser.error(
            "Use of the --cross_cov_bins option is not compatible with bas.h5 inputs!")

    return h5_files
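# A minimal usage sketch (hypothetical, not part of the module): how
# __check_input is typically driven from an optparse-based entry point.
# Only --cross_cov_bins is taken from the checks above; the usage string,
# the helper name, and any other wiring are illustrative assumptions.
def _example_parse_and_check():
    from optparse import OptionParser
    parser = OptionParser(
        usage="%prog [options] <aligned_reads.cmp.h5 | reads.bas.h5 | bas_files.fofn>")
    parser.add_option("--cross_cov_bins", type="str", default=None)
    opts, args = parser.parse_args()
    # Returns the list of input h5 files after validation
    return __check_input(opts, args, parser)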
def scan_WGA_h5(self):
    """
    Get some necessary information about the WGA cmp.h5 being used
    to generate the control IPD data.
    """
    self.opts.h5_labels = {}
    self.opts.cmph5_contig_lens = {}
    self.opts.h5_labels[self.control_h5] = "control"
    self.opts.cmph5_contig_lens[self.control_h5] = {}

    reader = CmpH5Reader(self.control_h5)
    for entry in reader.referenceInfoTable:
        name = entry[3]
        length = entry[4]
        slug_name = mbin.slugify(name)
        self.opts.cmph5_contig_lens[self.control_h5][slug_name] = length
    reader.close()

    return self.opts
def scan_WGA_aligns(self):
    """
    Get some necessary information about the WGA alignments being used
    to generate the control IPD data.
    """
    self.opts.aln_fn_labels = {}
    self.opts.aln_fn_contig_lens = {}
    self.opts.aln_fn_labels[self.control_aln_fn] = "control"
    self.opts.aln_fn_contig_lens[self.control_aln_fn] = {}

    # reader = CmpH5Reader(self.control_aln_fn)
    reader = openIndexedAlignmentFile(self.control_aln_fn)
    for entry in reader.referenceInfoTable:
        name = entry[3]
        length = entry[4]
        slug_name = mbin.slugify(name)
        self.opts.aln_fn_contig_lens[self.control_aln_fn][slug_name] = length
    reader.close()

    return self.opts
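# A standalone sketch of the contig-length scan used by the two methods
# above. It assumes, as they do, that rows of pbcore's referenceInfoTable
# expose the contig name at index 3 and its length at index 4; the helper
# name is illustrative and not part of the module's API.
def _example_contig_lengths(aln_fn):
    from pbcore.io import openIndexedAlignmentFile
    reader = openIndexedAlignmentFile(aln_fn)
    lengths = {}
    for entry in reader.referenceInfoTable:
        # Slugified contig name --> contig length
        lengths[mbin.slugify(entry[3])] = entry[4]
    reader.close()
    return lengths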
def __check_input(opts, args, parser):
    """
    Make sure the input is in the form of either a cmp.h5 file of aligned
    reads or a FOFN of unaligned bas.h5 files. Also make sure that a
    reference fasta file is specified if aligned reads are used as input.
    """
    if len(args) != 2:
        parser.error("Expecting two arguments: "
                     "(1) input HDF5 file (cmp.h5, bas.h5, or FOFN of bas.h5 files) and "
                     "(2) file containing the motifs to analyze, separated by newlines, "
                     "e.g. GATC-1, CATG-1, CAACGA-2")

    seq_input = args[0]
    motifs_fn = args[1]
    h5_files = []
    opts.h5_labels = {}

    if seq_input[-6:] == "cmp.h5":
        print "Found cmp.h5 of aligned reads:"
        h5 = os.path.abspath(seq_input)
        opts.h5_type = "cmp"
        opts.cmph5_contig_lens = {}
        opts.cmph5_contig_lens[h5] = {}
        h5_files.append(h5)
        print " -- %s" % h5
        print "Getting contig information from %s..." % h5
        reader = CmpH5Reader(h5)
        for entry in reader.referenceInfoTable:
            name = entry[3]
            length = entry[4]
            slug_name = mbin.slugify(name)
            opts.cmph5_contig_lens[h5][slug_name] = length
        opts.h5_labels[h5] = "remove"
        reader.close()

    elif seq_input[-6:] == "bas.h5":
        print "Found bas.h5 of unaligned reads:"
        opts.h5_type = "bas"
        h5 = os.path.abspath(seq_input)
        h5_files.append(h5)
        opts.h5_labels[h5] = "remove"
        print " -- %s" % h5

    elif seq_input[-5:] == ".fofn":
        print "Found FOFN of bas.h5 files of unaligned reads:"
        opts.h5_type = "bas"
        fofn_content = open(seq_input, "r").read().strip()
        # Use absolute paths consistently for both h5_files and h5_labels
        h5_files = [os.path.abspath(fn) for fn in fofn_content.split("\n")]
        for h5 in h5_files:
            print " -- %s" % h5
            opts.h5_labels[h5] = "remove"

    if opts.h5_type == "bas" and opts.cross_cov_bins is not None:
        parser.error(
            "Use of the --cross_cov_bins option is not compatible with bas.h5 inputs!")

    if opts.h5_type == "cmp":
        # Force a full parse of the --contigs fasta to confirm it is valid
        try:
            for entry in SeqIO.parse(opts.contigs, "fasta"):
                x = entry.seq
                y = entry.id
        except:
            parser.error(
                "Please make sure the --contigs input is a valid fasta file.")

    if not os.path.exists(motifs_fn):
        parser.error(
            "Can't find file of motifs to include in methylation profile: %s" % motifs_fn)

    return h5_files, motifs_fn
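# A small sketch of reading the motifs file validated above. The expected
# format (from the usage message) is one "<MOTIF>-<methylated position>"
# entry per line, e.g. GATC-1. This parser is illustrative only; the
# function above merely checks that the file exists.
def _example_parse_motifs(motifs_fn):
    motifs = []
    for line in open(motifs_fn):
        line = line.strip()
        if line:
            # rsplit tolerates motifs that themselves contain "-"
            motif, mod_pos = line.rsplit("-", 1)
            motifs.append((motif, int(mod_pos)))
    return motifs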
def __call__(self):

    class ipd_entry:
        def __init__(self, tup):
            """
            Container for one reference position's base, IPD value, and position.
            """
            self.ref_base = tup[0]
            self.ipd = tup[1]
            # self.call = tup[2]
            # self.read_base = tup[3]
            self.ref_pos = tup[2]

    class subread:
        def __init__(self, cmph5, alignment, label, opts):
            leftAnchor = 1
            rightAnchor = 1
            self.entries = {}
            self.opts = opts

            self.subname = alignment.readName
            movieID = alignment.movieInfo[0]
            alignedLength = alignment.referenceSpan
            fps = alignment.movieInfo[2]
            self.refName = alignment.referenceInfo[3]
            zmw = alignment.HoleNumber
            self.mol = alignment.MoleculeID
            if alignment.isForwardStrand:
                self.strand = 0
            else:
                self.strand = 1
            self.ref_bases = alignment.reference()
            # self.read_bases = alignment.read()

            read_calls = alignment.transcript()
            ref_pos = list(alignment.referencePositions())
            IPD = list(alignment.IPD())
            self.label = self.opts.h5_labels[cmph5]

            error_mk = []
            for read_call in read_calls:
                # Go through all entries and flag which positions are MM/indels
                if read_call != "M":
                    # Mismatch or indel at this position!
                    error_mk.append(1)
                else:
                    error_mk.append(0)

            # Get the indices of all the non-matches
            error_idx = [i for (i, val) in enumerate(error_mk) if val == 1]
            for error_id in error_idx:
                try:
                    for j in range(leftAnchor):
                        error_mk[error_id - (j + 1)] = 1
                    for j in range(rightAnchor):
                        error_mk[error_id + (j + 1)] = 1
                except IndexError:
                    pass
            error_mk = np.array(error_mk)

            ipds = np.array(IPD) / fps
            strands = np.array([self.strand] * len(read_calls))

            self.ref_bases = np.array(list(self.ref_bases))
            # self.read_bases = np.array(list(self.read_bases))
            self.ref_pos = np.array(ref_pos)
            read_calls = np.array(list(read_calls))

            # Mark the error positions, but leave them in the sequence so
            # we can pull out intact motifs from contiguous correct bases
            self.ref_bases[error_mk == 1] = "*"
            # self.read_bases[error_mk==1] = "*"
            read_calls[error_mk == 1] = "*"
            ipds[error_mk == 1] = -9
            strands[error_mk == 1] = -9

            # Attach these IPD entries to the subread object
            # for i,tup in enumerate(zip(self.ref_bases, ipds, read_calls, self.read_bases, self.ref_pos)):
            for i, tup in enumerate(zip(self.ref_bases, ipds, self.ref_pos)):
                self.entries[self.ref_pos[i]] = ipd_entry(tup)

            # self.cap_outliers()
            self.subread_normalize()

        def cap_outliers(self, max_ipd=10):
            """
            Cap the outlier IPDs at max_ipd seconds.
            """
            for read_pos, entry in self.entries.iteritems():
                entry.ipd = min(entry.ipd, max_ipd)

        def subread_normalize(self):
            """
            Every IPD entry needs to be normalized by the mean IPD of its subread.
            """
            if len(self.entries) == 0:
                # Nothing to do here.
                return self.entries

            # First populate the list of all IPDs in this subread;
            # used to compute the normalization factor.
            subread_vals = []
            for entry in self.entries.values():
                # Only use this IPD if it is NOT from an error position
                if entry.ipd != -9:
                    subread_vals.append(entry.ipd)

            rawIPDs = np.array(map(lambda x: math.log(x + 0.001), subread_vals))
            nfs = rawIPDs.mean()

            for pos, entry in self.entries.iteritems():
                if entry.ipd == -9:
                    newIPD = -9
                else:
                    newIPD = math.log(entry.ipd + 0.001) - nfs
                entry.ipd = newIPD

        def zip_bases_and_IPDs(self):
            """
            Reassemble the read and IPD values using the subread-normalized IPDs.
            """
            od = OrderedDict(sorted(self.entries.items()))
            ref = []
            ref_pos = []
            self.ipds = []
            for read_pos, entry in od.items():
                ref.append(entry.ref_base)
                ref_pos.append(entry.ref_pos)
                self.ipds.append(entry.ipd)
            self.ref_str = "".join(ref)
            self.ref_pos = ref_pos

    reader = CmpH5Reader(self.cmph5)
    read_refs = {}
    read_SMp = {}
    read_SMp_N = {}
    read_comps = {}
    read_labs = {}
    contig_SCp = {}
    i = 0
    n_mols = 0
    cwd = os.getcwd()

    # Periodically (after <chunksize> alignments) write out data to a
    # contig-specific tmp file
    chunksize = 10
    self.chunkdir = "chunk_%s" % self.chunk_id
    if os.path.exists(os.path.join(self.opts.tmp, self.chunkdir)):
        shutil.rmtree(os.path.join(self.opts.tmp, self.chunkdir))
    os.mkdir(os.path.join(self.opts.tmp, self.chunkdir))
    to_dump = defaultdict(list)

    def dump_data_to_contig_files(refName, to_dump, read_labs):
        refName = mbin.slugify(refName)
        ref_subname_fn = "%s_readnames.tmp" % refName
        ref_label_fn = "%s_labels.tmp" % refName
        ref_length_fn = "%s_lengths.tmp" % refName
        ref_ipds_fn = "%s_ipds.tmp" % refName
        ref_ipds_N_fn = "%s_ipdsN.tmp" % refName
        ref_comp_N_fn = "%s_compN.tmp" % refName
        ref_strand_fn = "%s_strand.tmp" % refName

        self.tmp_fns.add(os.path.join(self.chunkdir, ref_subname_fn))
        self.tmp_fns.add(os.path.join(self.chunkdir, ref_label_fn))
        self.tmp_fns.add(os.path.join(self.chunkdir, ref_length_fn))
        self.tmp_fns.add(os.path.join(self.chunkdir, ref_ipds_fn))
        self.tmp_fns.add(os.path.join(self.chunkdir, ref_ipds_N_fn))
        self.tmp_fns.add(os.path.join(self.chunkdir, ref_comp_N_fn))
        self.tmp_fns.add(os.path.join(self.chunkdir, ref_strand_fn))

        f_subnames = open(os.path.join(self.opts.tmp, self.chunkdir, ref_subname_fn), "a")
        f_labels = open(os.path.join(self.opts.tmp, self.chunkdir, ref_label_fn), "a")
        f_lengths = open(os.path.join(self.opts.tmp, self.chunkdir, ref_length_fn), "a")
        f_ipds = open(os.path.join(self.opts.tmp, self.chunkdir, ref_ipds_fn), "a")
        f_ipds_N = open(os.path.join(self.opts.tmp, self.chunkdir, ref_ipds_N_fn), "a")
        f_comp_N = open(os.path.join(self.opts.tmp, self.chunkdir, ref_comp_N_fn), "a")
        f_strand = open(os.path.join(self.opts.tmp, self.chunkdir, ref_strand_fn), "a")

        self.tmp_fs.add(f_subnames)
        self.tmp_fs.add(f_labels)
        self.tmp_fs.add(f_lengths)
        self.tmp_fs.add(f_ipds)
        self.tmp_fs.add(f_ipds_N)
        self.tmp_fs.add(f_comp_N)
        self.tmp_fs.add(f_strand)

        if self.opts.motifs_file != None and self.opts.subtract_control:
            control_ipds_d = pickle.load(open(self.opts.control_pkl_name, "rb"))

        for i, (subread_ipds, subread_comps, readname, subread_length, strand) in enumerate(to_dump[refName]):
            ipd_kmers = [motif for motif in subread_ipds.iterkeys()]
            ipd_counts = [subread_ipds[motif][0] for motif in subread_ipds.iterkeys()]

            ipd_means = []
            if self.opts.motifs_file != None and self.opts.subtract_control:
                for motif in subread_ipds.iterkeys():
                    if subread_ipds[motif][1] != 0.0:
                        w_control_sub = subread_ipds[motif][1] - control_ipds_d[motif]
                        ipd_means.append(w_control_sub)
                    else:
                        # Don't subtract the control if no IPD values are
                        # available (i.e. IPD score == 0.0)
                        ipd_means.append(subread_ipds[motif][1])
            else:
                for motif in subread_ipds.iterkeys():
                    ipd_means.append(subread_ipds[motif][1])

            comp_kmers = np.array([motif for motif, ipds in subread_comps.items()])
            comp_counts = np.array([ipds for motif, ipds in subread_comps.items()])

            if i == 0 and refName not in self.refName_has_header:
                ref_ipds_kmers_fn = "%s_ipdskmers.tmp" % refName
                ref_comp_kmers_fn = "%s_compkmers.tmp" % refName
                f_ipds_kmers = open(os.path.join(self.opts.tmp, self.chunkdir, ref_ipds_kmers_fn), "a")
                f_comp_kmers = open(os.path.join(self.opts.tmp, self.chunkdir, ref_comp_kmers_fn), "a")
                ipds_kmers_str = "\t".join(ipd_kmers)
                comp_kmers_str = "\t".join(comp_kmers)
                f_ipds_kmers.write("%s\n" % ipds_kmers_str)
                f_comp_kmers.write("%s\n" % comp_kmers_str)
                f_ipds_kmers.close()
                f_comp_kmers.close()
                self.refName_has_header.add(refName)

            ipds_str = "\t".join(map(lambda x: str(round(x, 3)), ipd_means))
            ipds_N_str = "\t".join(map(lambda x: str(x), ipd_counts))
            comp_counts_str = "\t".join(map(lambda x: str(x), comp_counts))

            f_subnames.write("%s\n" % readname)
            f_labels.write("%s\n" % read_labs[readname])
            f_lengths.write("%s\n" % subread_length)
            f_ipds.write("%s\n" % ipds_str)
            f_ipds_N.write("%s\n" % ipds_N_str)
            f_comp_N.write("%s\n" % comp_counts_str)
            f_strand.write("%s\n" % strand)

        for f in self.tmp_fs:
            f.close()

    self.tmp_fs = set()
    self.tmp_fns = set()
    self.refName_has_header = set()

    to_check = reader[self.idx]
    for alignment in to_check:
        ref_contig = mbin.slugify(alignment.referenceInfo[3])
        label = self.opts.h5_labels[self.cmph5]
        ref_len = self.opts.cmph5_contig_lens[self.cmph5][ref_contig]
        if ref_len >= self.opts.minContigLength and \
           alignment.referenceSpan >= self.opts.readlength_min and \
           alignment.MapQV >= self.opts.minMapQV:
            to_get = min(self.N_target_reads, len(self.idx))
            # Guard against to_get < 10, which would make incr zero and
            # break the progress-logging modulo below
            incr = max(to_get / 10, 1)
            readname = "/".join(alignment.readName.split("/")[:-1])
            if len(read_labs.keys()) % incr == 0 and not read_labs.get(readname):
                logging.info("...chunk %s\t- mol %s/%s (%.1f%%)" %
                             (self.chunk_id, n_mols, to_get, 100 * n_mols / to_get))

            read_labs[readname] = label
            read_refs[readname] = ref_contig

            sub = subread(self.cmph5, alignment, label, self.opts)
            sub.zip_bases_and_IPDs()
            subread_ipds, subread_comps = read_scanner.scan_motifs("cmp",
                                                                   # sub.read_str,
                                                                   sub.ipds,
                                                                   sub.ref_str,
                                                                   sub.strand,
                                                                   self.motifs,
                                                                   self.bi_motifs,
                                                                   self.opts)

            to_dump[ref_contig].append((subread_ipds, subread_comps, readname,
                                        len(sub.ref_str), sub.strand))
            # Dump subread IPD and comp data to contig-specific files
            if len(to_dump[ref_contig]) % chunksize == 0 and len(to_dump[ref_contig]) != 0:
                dump_data_to_contig_files(ref_contig, to_dump, read_labs)
                to_dump[ref_contig] = []

            n_mols = len(read_labs.keys())
            i += 1

            if n_mols == self.N_target_reads:
                break

    for ref_contig in to_dump.keys():
        dump_data_to_contig_files(ref_contig, to_dump, read_labs)
    for f in self.tmp_fs:
        f.close()
    to_dump = defaultdict(list)

    if i == 0:
        logging.info("Chunk %s: no qualifying reads found!" % self.chunk_id)

    logging.info("Chunk %s: found %s alignments (%s molecules) > %sbp in %s" %
                 (self.chunk_id, i, len(read_labs.keys()),
                  self.opts.readlength_min, os.path.basename(self.cmph5)))
    reader.close()

    return self.tmp_fns
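# A standalone sketch of the subread-level normalization performed in
# subread_normalize above: each IPD is log-transformed (with a small
# pseudocount) and centered on the mean log-IPD of its subread, so values
# are comparable across subreads with different polymerase speeds. Error
# positions flagged with -9 are excluded from the mean and passed through
# unchanged. The helper name is illustrative, not part of the module.
def _example_normalize_ipds(ipds):
    import math
    valid = [ipd for ipd in ipds if ipd != -9]
    if not valid:
        return ipds
    log_ipds = [math.log(ipd + 0.001) for ipd in valid]
    mean_log = sum(log_ipds) / len(log_ipds)
    return [(-9 if ipd == -9 else math.log(ipd + 0.001) - mean_log)
            for ipd in ipds]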
def run(self, mbinRunner):
    ####################################################
    # Filter out motifs without significant signatures
    ####################################################
    logging.info("Getting top motifs from each contig...")

    if self.opts.h5_type == "cmp":
        self.contig_fasta_lens = {}
        for entry in SeqIO.parse(self.opts.contigs, "fasta"):
            name = entry.id
            if name.find("|quiver") > -1:
                # SMRT assemblies add |quiver to contig names, but this
                # gets dropped from the contig names in the cmp.h5 file.
                name = name.replace("|quiver", "")
            self.contig_fasta_lens[mbin.slugify(name)] = len(entry.seq)

        contig_ipds_fns = glob.glob(os.path.join(self.opts.tmp, "*_ipds.tmp"))
        contigs = map(lambda x: os.path.basename(x).split("_ipds.tmp")[0], contig_ipds_fns)
        ipds_fn_dict = dict([(os.path.basename(ipds_fn).split("_ipds.tmp")[0], ipds_fn)
                             for ipds_fn in contig_ipds_fns])

        contigs_for_transpose = []
        contigs_for_chunking = []
        maxsize_for_transpose = 25000000  # 25 MB
        for name in contigs:
            fsize = os.path.getsize(ipds_fn_dict[name])
            if fsize < maxsize_for_transpose:
                contigs_for_transpose.append(name)
            else:
                contigs_for_chunking.append(name)

        logging.info("Transposing %s case contigs..." % len(contigs_for_transpose))
        args = [(contig, self.opts) for contig in contigs_for_transpose]
        results = mbin.launch_pool(self.opts.procs, transpose_contig_matrix, args)

        streamed_contig_dicts = {}
        if len(contigs_for_transpose) > 0:
            logging.info("Streaming through %s contigs..." % len(contigs_for_transpose))
            args = [(self.opts, contig, i, len(contigs_for_transpose))
                    for i, contig in enumerate(contigs_for_transpose)]
            results = mbin.launch_pool(self.opts.procs, stream_case_control_files, args)
            streamed_contig_SCp = map(lambda x: x[0], results)
            streamed_contig_SCp_N = map(lambda x: x[1], results)
            streamed_contigs = map(lambda x: x[2], results)

            for i, contig in enumerate(streamed_contigs):
                streamed_contig_dicts[contig] = {"SCp": streamed_contig_SCp[i],
                                                 "SCp_N": streamed_contig_SCp_N[i]}

        chunked_contigs_dicts = {}
        if len(contigs_for_chunking) > 0:
            logging.info("Chunking %s contigs..." % len(contigs_for_chunking))
            for i, contig in enumerate(contigs_for_chunking):
                control_means, contig_SCp, contig_SCp_N, contig = chunk_case_control_files(
                    self.opts, contig, i, len(contigs_for_chunking))
                chunked_contigs_dicts[contig] = {"SCp": contig_SCp, "SCp_N": contig_SCp_N}

        # Combine the contig dictionaries from both streaming and chunked paths
        def merge_two_dicts(x, y):
            """Given two dicts, merge them into a new dict as a shallow copy."""
            z = x.copy()
            z.update(y)
            return z

        contig_dicts = merge_two_dicts(streamed_contig_dicts, chunked_contigs_dicts)

        keeper_control_ipds = {}
        keeper_motifs = set()

        if self.opts.cross_cov_bins is not None:
            """
            Using contig<-->bin mappings, collect methylation data from
            each contig and compile them into methylation scores for each
            bin. Then discover motifs based on bin-level scores.
            """
            bin_map = {}
            for line in open(self.opts.cross_cov_bins, "r"):
                line = line.strip()
                contig = line.split(",")[0]
                bin_id = int(line.split(",")[1])
                bin_map[contig] = bin_id

            bin_contig_dicts = {}
            for bin_id in bin_map.values():
                # Initialize the bin-level methylation dictionary
                bin_contig_dicts[bin_id] = {"SCp": {}, "SCp_N": {}}

            for contig, contig_d in contig_dicts.iteritems():
                # Make sure contig is binned
                if bin_map.get(contig):
                    bin_id = bin_map[contig]
                    bin_contig_dicts[bin_id] = build_bin_dict(
                        contig, bin_id, contig_d, bin_contig_dicts[bin_id])
                else:
                    logging.info(
                        "Contig %s not found in cross-coverage binning results." % contig)

            bin_ids = list(set(bin_map.values()))
            bin_ids.sort()

            args = []
            for bin_id in bin_ids:
                # For each bin, do motif filtering and refinement
                if len(bin_contig_dicts[bin_id]["SCp"].keys()) > 0:
                    bin_copy_contig_dicts = copy.deepcopy(bin_contig_dicts[bin_id])
                    args.append((bin_id,
                                 bin_copy_contig_dicts["SCp"],
                                 bin_copy_contig_dicts["SCp_N"],
                                 self.opts,
                                 len(bin_ids),
                                 bin_id,
                                 "bin"))

            results = mbin.launch_pool(self.opts.procs, simplify_motifs, args)
            bin_keeper_motifs_list = map(lambda x: x[0], results)
            control_means_list = map(lambda x: x[1], results)

            """
            Add the control means for these bin motifs to the complete
            set of control means for all detected motifs.
            """
            for bin_keeper_motifs in bin_keeper_motifs_list:
                keeper_motifs = keeper_motifs | bin_keeper_motifs
            for sub_control_means in control_means_list:
                for motif, score in sub_control_means.iteritems():
                    control_means[motif] = score
        else:
            args = []
            for j, (contig, contig_d) in enumerate(contig_dicts.iteritems()):
                # For each contig, do motif filtering and refinement
                copy_contig_dicts = copy.deepcopy(contig_d)
                args.append((j,
                             copy_contig_dicts["SCp"],
                             copy_contig_dicts["SCp_N"],
                             self.opts,
                             len(contig_dicts.keys()),
                             contig,
                             "contig"))

            results = mbin.launch_pool(self.opts.procs, simplify_motifs, args)
            contig_keeper_motifs_list = map(lambda x: x[0], results)
            control_means_list = map(lambda x: x[1], results)

            """
            Add the control means for these contig motifs to the complete
            set of control means for all detected motifs.
            """
            for contig_keeper_motifs in contig_keeper_motifs_list:
                keeper_motifs = keeper_motifs | contig_keeper_motifs
            for control_means in control_means_list:
                for motif, score in control_means.iteritems():
                    keeper_control_ipds[motif] = score

        # Rewrite the control so that it includes the new degenerate motifs.
        control_means, n_degen = self.add_degen_motifs(keeper_motifs, control_means)
        if n_degen > 0:
            pickle.dump(control_means, open(self.opts.control_pkl_name, "wb"))

    elif self.opts.h5_type == "bas":
        logging.info("Transposing reads...")
        files = [os.path.join(self.opts.tmp, "read_ipds.tmp"),
                 os.path.join(self.opts.tmp, "read_ipdsN.tmp")]
        results = mbin.launch_pool(len(files), transpose_file, files)
        logging.info("Done.")
        logging.info("Streaming through reads for motif filtering...")
        keeper_motifs = self.bas_stream_files()

    logging.info("Keeping %s motifs for further analysis" % len(keeper_motifs))
    self.motifs = list(keeper_motifs)
    n_motifs = len(keeper_motifs)
    f_motifs = open(self.opts.motifs_fn, "w")
    for motif in keeper_motifs:
        f_motifs.write("%s\n" % motif)
    f_motifs.close()
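# Sketch: the --cross_cov_bins file consumed in run() above is a CSV with
# one "contig,bin_id" pair per line. This illustrative helper mirrors the
# inline parsing and skips blank lines; it is not part of the module's API.
def _example_load_bin_map(cross_cov_bins_fn):
    bin_map = {}
    for line in open(cross_cov_bins_fn):
        line = line.strip()
        if line:
            contig, bin_id = line.split(",")[:2]
            bin_map[contig] = int(bin_id)
    return bin_map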