Example #1
0
def __check_input(opts, args, parser):
    """
    Validate the positional input and return the list of HDF5 files to use.

    The first positional argument must be one of:
      * a cmp.h5 file of aligned reads,
      * a bas.h5 file of unaligned reads, or
      * a FOFN (*.fofn) listing bas.h5 files, one path per line.

    Side effects on opts:
      * opts.h5_type is set to "cmp" or "bas".
      * opts.h5_labels maps every input file to the label "remove".
      * For cmp.h5 input, opts.cmph5_contig_lens[<file>] maps each slugified
        contig name to its length, both read from the reader's
        referenceInfoTable.

    parser.error() is called when no input is given, when the input has an
    unrecognized extension, or when --cross_cov_bins is combined with bas.h5
    input.

    Returns the list of input HDF5 file paths.
    """
    if not args:
        parser.error(
            "Expecting an input file (cmp.h5, bas.h5, or FOFN of bas.h5 files)")

    arg = args[0]
    h5_files = []
    opts.h5_labels = {}

    # print is always given one parenthesized argument: that produces the same
    # output with the print statement as with the print function.
    if arg.endswith("cmp.h5"):
        print("Found cmp.h5 of aligned reads:")

        opts.h5_type = "cmp"
        opts.cmph5_contig_lens = {arg: {}}
        # Label the file up front so it is labelled even when the reference
        # table has no entries.
        opts.h5_labels[arg] = "remove"

        h5_files.append(arg)
        print("  -- %s" % arg)
        print("Getting contig information from %s..." % arg)
        reader = CmpH5Reader(arg)
        try:
            for entry in reader.referenceInfoTable:
                # Fields 3 and 4 of each entry are used as name and length.
                slug_name = mbin.slugify(entry[3])
                opts.cmph5_contig_lens[arg][slug_name] = entry[4]
        finally:
            reader.close()

    elif arg.endswith("bas.h5"):
        print("Found bas.h5 of unaligned reads:")
        opts.h5_type = "bas"
        h5_files.append(arg)
        opts.h5_labels[arg] = "remove"
        print("  -- %s" % arg)

    elif arg.endswith(".fofn"):
        print("Found FOFN of bas.h5 files:")
        opts.h5_type = "bas"
        # One path per line; surrounding whitespace and blank lines are
        # ignored so every listed file becomes its own entry.
        with open(arg, "r") as fofn:
            h5_files = [line.strip() for line in fofn if line.strip()]
        for fn in h5_files:
            print("  -- %s" % fn)
            opts.h5_labels[fn] = "remove"

    else:
        parser.error(
            "Unrecognized input %s: expecting a cmp.h5 file, a bas.h5 file, "
            "or a FOFN of bas.h5 files" % arg)

    if opts.h5_type == "bas":
        print("*************************************************************")
        print("* Motif filtering using unaligned reads is not recommended. *")
        print("*         Aligned reads work much better for this!          *")
        print("*************************************************************")
        print("")

    if opts.h5_type == "bas" and opts.cross_cov_bins is not None:
        parser.error(
            "Use of the --cross_cov_bins option is not compatible with bas.h5 inputs!"
        )

    return h5_files
Example #2
0
	def scan_WGA_h5( self ):
		"""
		Record the label and contig lengths of the WGA cmp.h5 file
		(self.control_h5) on self.opts.

		self.opts.h5_labels is reset to map the control file to "control" and
		self.opts.cmph5_contig_lens is reset to map the control file to a
		dict of slugified contig name -> length, read from the reader's
		referenceInfoTable.

		Returns self.opts.
		"""
		control_fn  = self.control_h5
		contig_lens = {}

		self.opts.h5_labels         = {control_fn: "control"}
		self.opts.cmph5_contig_lens = {control_fn: contig_lens}

		reader = CmpH5Reader(control_fn)
		for ref_info in reader.referenceInfoTable:
			# Fields 3 and 4 of each entry are used as name and length.
			contig_lens[mbin.slugify(ref_info[3])] = ref_info[4]
		reader.close()

		return self.opts
Example #3
0
    def scan_WGA_aligns(self):
        """
        Record the label and contig lengths of the WGA alignment file
        (self.control_aln_fn) on self.opts.

        self.opts.aln_fn_labels is reset to map the control file to "control"
        and self.opts.aln_fn_contig_lens is reset to map the control file to a
        dict of slugified contig name -> length, read from the reader's
        referenceInfoTable.

        Returns self.opts.
        """
        control_fn = self.control_aln_fn
        contig_lens = {}

        self.opts.aln_fn_labels = {control_fn: "control"}
        self.opts.aln_fn_contig_lens = {control_fn: contig_lens}

        # The file is opened with openIndexedAlignmentFile (a CmpH5Reader
        # was used here previously).
        reader = openIndexedAlignmentFile(control_fn)
        for ref_info in reader.referenceInfoTable:
            # Fields 3 and 4 of each entry are used as name and length.
            contig_lens[mbin.slugify(ref_info[3])] = ref_info[4]
        reader.close()

        return self.opts
Example #4
0
def __check_input(opts, args, parser):
    """
    Validate the two positional arguments and return the inputs to use.

    Expected arguments:
      (1) the sequencing input: a cmp.h5 file of aligned reads, a bas.h5 file
          of unaligned reads, or a FOFN (*.fofn) listing bas.h5 files, one
          path per line;
      (2) a file containing the motifs to analyze, separated by newlines.

    Side effects on opts:
      * opts.h5_type is set to "cmp" or "bas".
      * opts.h5_labels maps the absolute path of every input file to the
        label "remove".
      * For cmp.h5 input, opts.cmph5_contig_lens[<file>] maps each slugified
        contig name to its length, both read from the reader's
        referenceInfoTable.

    parser.error() is called when the argument count is wrong, the input has
    an unrecognized extension, --cross_cov_bins is combined with bas.h5 input,
    opts.contigs cannot be parsed as fasta (cmp.h5 input only), or the motifs
    file does not exist.

    Returns (h5_files, motifs_fn), where h5_files holds absolute paths.
    """
    if len(args) != 2:
        # Previously this only printed a message and then failed (or silently
        # ignored extra arguments) further down.
        parser.error(
            "Expecting two arguments: "
            "(1) input hdf5 file (cmp.h5, bas.h5, or FOFN of bas.h5 files) "
            "(2) file containing the motifs to analyze, separated by "
            "newlines, e.g. GATC-1, CATG-1, CAACGA-2")

    seq_input, motifs_fn = args
    h5_files = []
    opts.h5_labels = {}

    # print is always given one parenthesized argument: that produces the same
    # output with the print statement as with the print function.
    if seq_input.endswith("cmp.h5"):
        print("Found cmp.h5 of aligned reads:")

        h5 = os.path.abspath(seq_input)
        opts.h5_type = "cmp"
        opts.cmph5_contig_lens = {h5: {}}
        # Label the file up front so it is labelled even when the reference
        # table has no entries.
        opts.h5_labels[h5] = "remove"

        h5_files.append(h5)
        print("  -- %s" % h5)
        print("Getting contig information from %s..." % h5)
        reader = CmpH5Reader(h5)
        try:
            for entry in reader.referenceInfoTable:
                # Fields 3 and 4 of each entry are used as name and length.
                slug_name = mbin.slugify(entry[3])
                opts.cmph5_contig_lens[h5][slug_name] = entry[4]
        finally:
            reader.close()

    elif seq_input.endswith("bas.h5"):
        print("Found bas.h5 of unaligned reads:")
        opts.h5_type = "bas"
        h5 = os.path.abspath(seq_input)
        h5_files.append(h5)
        opts.h5_labels[h5] = "remove"
        print("  -- %s" % h5)

    elif seq_input.endswith(".fofn"):
        print("Found FOFN of bas.h5 files of unaligned reads:")
        opts.h5_type = "bas"
        # Return absolute paths, like the other branches, so the returned
        # files match the keys of opts.h5_labels. Blank lines are skipped.
        with open(seq_input, "r") as fofn:
            h5_files = [
                os.path.abspath(line.strip()) for line in fofn if line.strip()
            ]
        for h5 in h5_files:
            print("  -- %s" % h5)
            opts.h5_labels[h5] = "remove"

    else:
        parser.error(
            "Unrecognized input %s: expecting a cmp.h5 file, a bas.h5 file, "
            "or a FOFN of bas.h5 files" % seq_input)

    if opts.h5_type == "bas" and opts.cross_cov_bins is not None:
        parser.error(
            "Use of the --cross_cov_bins option is not compatible with bas.h5 inputs!"
        )

    if opts.h5_type == "cmp":
        try:
            # Walk every record and touch its sequence and id so that parse
            # problems surface here rather than later in the run.
            for entry in SeqIO.parse(opts.contigs, "fasta"):
                _ = (entry.seq, entry.id)
        except Exception:
            parser.error(
                "Please make sure the --contigs input is a valid fasta file.")

    if not os.path.exists(motifs_fn):
        parser.error(
            "Can't find file of motifs to include in methylation profile: %s" %
            motifs_fn)

    return h5_files, motifs_fn
Example #5
0
    def __call__(self):
        """
        Process one chunk of alignments from self.cmph5 and write per-contig
        temporary files of subread motif data.

        Every alignment selected by self.idx that passes the contig-length,
        aligned-length and MapQV thresholds in self.opts is turned into a
        subread object, scanned with read_scanner.scan_motifs and buffered
        per contig. Buffers are appended to files under
        <opts.tmp>/chunk_<chunk_id>/ every `chunksize` subreads and once more
        after the last alignment. Scanning stops once self.N_target_reads
        distinct read names have been seen.

        Returns:
            self.tmp_fns -- the set of per-contig data file paths written,
            each relative to self.opts.tmp.
        """

        class ipd_entry:
            """One aligned position of a subread: (ref_base, ipd, ref_pos)."""

            def __init__(self, tup):
                self.ref_base = tup[0]
                self.ipd = tup[1]
                self.ref_pos = tup[2]

        class subread:
            """
            Per-alignment container of reference bases, positions and IPDs.

            Positions that are not a match ("M" in the transcript), together
            with their immediate neighbours, are masked: the reference base
            becomes "*" and the IPD becomes the sentinel -9.
            """

            def __init__(self, cmph5, alignment, label, opts):
                # Number of neighbouring positions masked on each side of a
                # mismatch/indel.
                leftAnchor = 1
                rightAnchor = 1
                self.entries = {}
                self.opts = opts

                self.subname = alignment.readName
                fps = alignment.movieInfo[2]
                self.refName = alignment.referenceInfo[3]
                self.mol = alignment.MoleculeID
                self.strand = 0 if alignment.isForwardStrand else 1
                self.label = self.opts.h5_labels[cmph5]

                read_calls = alignment.transcript()
                n_calls = len(read_calls)

                # Flag every non-match and its flanking positions. The flanks
                # are clipped to the read so that an error at the first
                # position no longer wraps around and masks the last one.
                error_mk = np.zeros(n_calls, dtype=bool)
                for idx, read_call in enumerate(read_calls):
                    if read_call != "M":
                        lo = max(0, idx - leftAnchor)
                        hi = min(n_calls, idx + rightAnchor + 1)
                        error_mk[lo:hi] = True

                self.ref_bases = np.array(list(alignment.reference()))
                self.ref_pos = np.array(list(alignment.referencePositions()))
                ipds = np.array(list(alignment.IPD())) / fps

                # Mark the error positions, but leave them in the sequence so
                # intact motifs can be pulled from contiguous correct bases.
                self.ref_bases[error_mk] = "*"
                ipds[error_mk] = -9

                # Index the entries by reference position; a later entry with
                # the same position replaces the earlier one.
                for tup in zip(self.ref_bases, ipds, self.ref_pos):
                    self.entries[tup[2]] = ipd_entry(tup)

                self.subread_normalize()

            def cap_outliers(self, max_ipd=10):
                """
                Cap the outlier IPDs at max_ipd seconds.
                """
                for entry in self.entries.values():
                    entry.ipd = min(entry.ipd, max_ipd)

            def subread_normalize(self):
                """
                Replace every non-masked IPD by log(ipd + 0.001) minus the
                mean of that quantity over the subread's non-masked IPDs.
                Masked entries keep the sentinel -9.
                """
                if len(self.entries) == 0:
                    # Nothing to do here.
                    return self.entries

                valid_ipds = [
                    entry.ipd for entry in self.entries.values()
                    if entry.ipd != -9
                ]
                log_ipds = np.array(
                    [math.log(val + 0.001) for val in valid_ipds])
                nfs = log_ipds.mean()

                for entry in self.entries.values():
                    if entry.ipd == -9:
                        entry.ipd = -9
                    else:
                        entry.ipd = math.log(entry.ipd + 0.001) - nfs

            def zip_bases_and_IPDs(self):
                """
                Reassemble the reference string, positions and normalized
                IPDs in order of increasing reference position.
                """
                ref = []
                ref_pos = []
                self.ipds = []
                for pos in sorted(self.entries):
                    entry = self.entries[pos]
                    ref.append(entry.ref_base)
                    ref_pos.append(entry.ref_pos)
                    self.ipds.append(entry.ipd)
                self.ref_str = "".join(ref)
                self.ref_pos = ref_pos

        def dump_data_to_contig_files(refName, to_dump, read_labs):
            """
            Append the buffered subread records of one contig to its tmp files.

            refName   -- contig name; slugified before it is used as the key
                         into to_dump and as the file name prefix
            to_dump   -- dict: contig -> list of (subread_ipds, subread_comps,
                         readname, subread_length, strand) tuples
            read_labs -- dict: readname -> label

            One line per subread is appended to each of the seven per-contig
            files (<contig>_readnames/labels/lengths/ipds/ipdsN/compN/
            strand.tmp). The motif header files (<contig>_ipdskmers.tmp and
            <contig>_compkmers.tmp) are written once per contig, tracked via
            self.refName_has_header.
            """
            refName = mbin.slugify(refName)
            out_dir = os.path.join(self.opts.tmp, self.chunkdir)
            suffixes = ("readnames", "labels", "lengths", "ipds", "ipdsN",
                        "compN", "strand")
            basenames = dict(
                (sfx, "%s_%s.tmp" % (refName, sfx)) for sfx in suffixes)
            for sfx in suffixes:
                self.tmp_fns.add(os.path.join(self.chunkdir, basenames[sfx]))

            subtract_control = (self.opts.motifs_file is not None
                                and self.opts.subtract_control)
            if subtract_control:
                with open(self.opts.control_pkl_name, "rb") as f_pkl:
                    control_ipds_d = pickle.load(f_pkl)

            handles = {}
            try:
                for sfx in suffixes:
                    handles[sfx] = open(
                        os.path.join(out_dir, basenames[sfx]), "a")
                # All seven handles are tracked (the lengths file used to be
                # left out and was therefore never closed).
                self.tmp_fs.update(handles.values())

                for i, (subread_ipds, subread_comps, readname, subread_length,
                        strand) in enumerate(to_dump[refName]):
                    # subread_ipds[motif] is indexed as (count, mean IPD).
                    motif_keys = list(subread_ipds)
                    ipd_counts = [subread_ipds[m][0] for m in motif_keys]
                    ipd_means = []
                    for m in motif_keys:
                        mean_ipd = subread_ipds[m][1]
                        # Don't subtract control if no ipd values are
                        # available (i.e. IPD score == 0.0)
                        if subtract_control and mean_ipd != 0.0:
                            mean_ipd = mean_ipd - control_ipds_d[m]
                        ipd_means.append(mean_ipd)

                    comp_kmers = list(subread_comps)
                    comp_counts = np.array(
                        [subread_comps[kmer] for kmer in comp_kmers])

                    if i == 0 and refName not in self.refName_has_header:
                        for sfx, kmers in (("ipdskmers", motif_keys),
                                           ("compkmers", comp_kmers)):
                            header_fn = os.path.join(
                                out_dir, "%s_%s.tmp" % (refName, sfx))
                            with open(header_fn, "a") as f_header:
                                f_header.write("%s\n" % "\t".join(kmers))
                        self.refName_has_header.add(refName)

                    ipds_str = "\t".join(
                        [str(round(x, 3)) for x in ipd_means])
                    ipds_N_str = "\t".join([str(x) for x in ipd_counts])
                    comp_counts_str = "\t".join([str(x) for x in comp_counts])
                    handles["readnames"].write("%s\n" % readname)
                    handles["labels"].write("%s\n" % read_labs[readname])
                    handles["lengths"].write("%s\n" % subread_length)
                    handles["ipds"].write("%s\n" % ipds_str)
                    handles["ipdsN"].write("%s\n" % ipds_N_str)
                    handles["compN"].write("%s\n" % comp_counts_str)
                    handles["strand"].write("%s\n" % strand)
            finally:
                for handle in handles.values():
                    handle.close()

        self.tmp_fs = set()
        self.tmp_fns = set()
        self.refName_has_header = set()

        read_labs = {}
        n_alignments = 0
        n_mols = 0

        # Periodically (after <chunksize> alignments of a contig) write out
        # data to contig-specific tmp files.
        chunksize = 10
        to_dump = defaultdict(list)

        reader = CmpH5Reader(self.cmph5)
        try:
            # Start from an empty chunk directory.
            self.chunkdir = "chunk_%s" % self.chunk_id
            chunk_path = os.path.join(self.opts.tmp, self.chunkdir)
            if os.path.exists(chunk_path):
                shutil.rmtree(chunk_path)
            os.mkdir(chunk_path)

            for alignment in reader[self.idx]:
                ref_contig = mbin.slugify(alignment.referenceInfo[3])
                label = self.opts.h5_labels[self.cmph5]
                ref_len = self.opts.cmph5_contig_lens[self.cmph5][ref_contig]
                if not (ref_len >= self.opts.minContigLength
                        and alignment.referenceSpan >= self.opts.readlength_min
                        and alignment.MapQV >= self.opts.minMapQV):
                    continue

                to_get = min(self.N_target_reads, len(self.idx))
                # Report progress roughly every 10%; never let the step drop
                # to zero (fewer than 10 target reads), which used to raise
                # ZeroDivisionError in the modulo below.
                incr = max(1, to_get // 10)
                readname = "/".join(alignment.readName.split("/")[:-1])
                if len(read_labs) % incr == 0 and not read_labs.get(readname):
                    logging.info(
                        "...chunk %s\t- mol %s/%s (%.1f%%)" %
                        (self.chunk_id, n_mols, to_get,
                         100.0 * n_mols / to_get))

                read_labs[readname] = label

                sub = subread(self.cmph5, alignment, label, self.opts)
                sub.zip_bases_and_IPDs()
                subread_ipds, subread_comps = read_scanner.scan_motifs(
                    "cmp", sub.ipds, sub.ref_str, sub.strand, self.motifs,
                    self.bi_motifs, self.opts)

                to_dump[ref_contig].append(
                    (subread_ipds, subread_comps, readname, len(sub.ref_str),
                     sub.strand))
                # Dump subread IPD and comp data to contig-specific file
                if len(to_dump[ref_contig]) % chunksize == 0:
                    dump_data_to_contig_files(ref_contig, to_dump, read_labs)
                    to_dump[ref_contig] = []

                n_mols = len(read_labs)
                n_alignments += 1

                if n_mols == self.N_target_reads:
                    break

            # Flush whatever is still buffered for each contig.
            for ref_contig in list(to_dump.keys()):
                dump_data_to_contig_files(ref_contig, to_dump, read_labs)
            for f in self.tmp_fs:
                f.close()
        finally:
            reader.close()

        if n_alignments == 0:
            logging.info("Chunk %s: no qualifying reads found!" %
                         self.chunk_id)

        logging.info(
            "Chunk %s: found %s alignments (%s molecules) > %sbp in %s" %
            (self.chunk_id, n_alignments, len(read_labs),
             self.opts.readlength_min, os.path.basename(self.cmph5)))

        return self.tmp_fns
Example #6
0
        def dump_data_to_contig_files(refName, to_dump, read_labs):
            """
            Append the buffered subread records of one contig to its tmp files.

            refName   -- contig name; slugified before it is used as the key
                         into to_dump and as the file name prefix
            to_dump   -- dict: contig -> list of (subread_ipds, subread_comps,
                         readname, subread_length, strand) tuples
            read_labs -- dict: readname -> label

            One line per subread is appended to each of the seven per-contig
            files (<contig>_readnames/labels/lengths/ipds/ipdsN/compN/
            strand.tmp). The motif header files (<contig>_ipdskmers.tmp and
            <contig>_compkmers.tmp) are written once per contig, tracked via
            self.refName_has_header.
            """
            refName = mbin.slugify(refName)
            out_dir = os.path.join(self.opts.tmp, self.chunkdir)
            suffixes = ("readnames", "labels", "lengths", "ipds", "ipdsN",
                        "compN", "strand")
            basenames = dict(
                (sfx, "%s_%s.tmp" % (refName, sfx)) for sfx in suffixes)
            for sfx in suffixes:
                self.tmp_fns.add(os.path.join(self.chunkdir, basenames[sfx]))

            subtract_control = (self.opts.motifs_file is not None
                                and self.opts.subtract_control)
            if subtract_control:
                with open(self.opts.control_pkl_name, "rb") as f_pkl:
                    control_ipds_d = pickle.load(f_pkl)

            handles = {}
            try:
                for sfx in suffixes:
                    handles[sfx] = open(
                        os.path.join(out_dir, basenames[sfx]), "a")
                # All seven handles are tracked (the lengths file used to be
                # left out and was therefore never closed).
                self.tmp_fs.update(handles.values())

                for i, (subread_ipds, subread_comps, readname, subread_length,
                        strand) in enumerate(to_dump[refName]):
                    # subread_ipds[motif] is indexed as (count, mean IPD).
                    motif_keys = list(subread_ipds)
                    ipd_counts = [subread_ipds[m][0] for m in motif_keys]
                    ipd_means = []
                    for m in motif_keys:
                        mean_ipd = subread_ipds[m][1]
                        # Don't subtract control if no ipd values are
                        # available (i.e. IPD score == 0.0)
                        if subtract_control and mean_ipd != 0.0:
                            mean_ipd = mean_ipd - control_ipds_d[m]
                        ipd_means.append(mean_ipd)

                    comp_kmers = list(subread_comps)
                    comp_counts = np.array(
                        [subread_comps[kmer] for kmer in comp_kmers])

                    if i == 0 and refName not in self.refName_has_header:
                        for sfx, kmers in (("ipdskmers", motif_keys),
                                           ("compkmers", comp_kmers)):
                            header_fn = os.path.join(
                                out_dir, "%s_%s.tmp" % (refName, sfx))
                            with open(header_fn, "a") as f_header:
                                f_header.write("%s\n" % "\t".join(kmers))
                        self.refName_has_header.add(refName)

                    ipds_str = "\t".join(
                        [str(round(x, 3)) for x in ipd_means])
                    ipds_N_str = "\t".join([str(x) for x in ipd_counts])
                    comp_counts_str = "\t".join([str(x) for x in comp_counts])
                    handles["readnames"].write("%s\n" % readname)
                    handles["labels"].write("%s\n" % read_labs[readname])
                    handles["lengths"].write("%s\n" % subread_length)
                    handles["ipds"].write("%s\n" % ipds_str)
                    handles["ipdsN"].write("%s\n" % ipds_N_str)
                    handles["compN"].write("%s\n" % comp_counts_str)
                    handles["strand"].write("%s\n" % strand)
            finally:
                for handle in handles.values():
                    handle.close()
0
    def run(self, mbinRunner):
        """
        Filter out motifs without significant signatures and write the
        surviving motifs, one per line, to self.opts.motifs_fn.

        With opts.h5_type == "cmp":
          * contig lengths are read from the opts.contigs fasta into
            self.contig_fasta_lens;
          * contigs whose <contig>_ipds.tmp file is under 25 MB are transposed
            and streamed through a worker pool, larger ones are processed one
            at a time with chunk_case_control_files;
          * simplify_motifs is run per bin (when opts.cross_cov_bins is given)
            or per contig, and the motifs it keeps are pooled;
          * self.add_degen_motifs is applied and, when it reports new
            degenerate motifs, the result is pickled to opts.control_pkl_name.

        With opts.h5_type == "bas", read_ipds.tmp and read_ipdsN.tmp are
        transposed and self.bas_stream_files() supplies the motifs to keep.

        The kept motifs are also stored, as a list, in self.motifs.
        mbinRunner is not used by this method.
        """
        logging.info("Getting top motifs from each contig...")

        if self.opts.h5_type == "cmp":

            self.contig_fasta_lens = {}
            for entry in SeqIO.parse(self.opts.contigs, "fasta"):
                # SMRT assemblies add |quiver to contig names, but this
                # gets dropped from the contig names in the cmp.h5 file.
                name = entry.id.replace("|quiver", "")
                self.contig_fasta_lens[mbin.slugify(name)] = len(entry.seq)

            contig_ipds_fns = glob.glob(
                os.path.join(self.opts.tmp, "*_ipds.tmp"))
            contigs = []
            ipds_fn_dict = {}
            for ipds_fn in contig_ipds_fns:
                contig_name = os.path.basename(ipds_fn).split("_ipds.tmp")[0]
                contigs.append(contig_name)
                ipds_fn_dict[contig_name] = ipds_fn

            # Small contigs are transposed and streamed in parallel; large
            # ones are chunked one at a time.
            contigs_for_transpose = []
            contigs_for_chunking = []
            maxsize_for_transpose = 25000000  #25Mb
            for name in contigs:
                if os.path.getsize(ipds_fn_dict[name]) < maxsize_for_transpose:
                    contigs_for_transpose.append(name)
                else:
                    contigs_for_chunking.append(name)

            logging.info("Transposing %s case contigs..." %
                         len(contigs_for_transpose))
            args = [(contig, self.opts) for contig in contigs_for_transpose]
            mbin.launch_pool(self.opts.procs, transpose_contig_matrix, args)

            streamed_contig_dicts = {}
            if len(contigs_for_transpose) > 0:
                logging.info("Streaming through %s contigs..." %
                             len(contigs_for_transpose))
                args = [(self.opts, contig, i, len(contigs_for_transpose))
                        for i, contig in enumerate(contigs_for_transpose)]
                results = mbin.launch_pool(self.opts.procs,
                                           stream_case_control_files, args)

                # Each result is indexed as (SCp, SCp_N, contig).
                for result in results:
                    streamed_contig_dicts[result[2]] = {
                        "SCp": result[0],
                        "SCp_N": result[1]
                    }

            # Defined up front so the code below never hits an unbound name
            # when no contig goes through the chunked path.
            control_means = {}

            chunked_contigs_dicts = {}
            if len(contigs_for_chunking) > 0:
                logging.info("Chunking %s contigs..." %
                             len(contigs_for_chunking))

                for i, contig in enumerate(contigs_for_chunking):
                    control_means, contig_SCp, contig_SCp_N, contig = chunk_case_control_files(
                        self.opts, contig, i, len(contigs_for_chunking))
                    chunked_contigs_dicts[contig] = {
                        "SCp": contig_SCp,
                        "SCp_N": contig_SCp_N
                    }

            # Combine the contig dictionaries from both streaming and chunked
            # paths (chunked entries win on a name clash).
            contig_dicts = streamed_contig_dicts.copy()
            contig_dicts.update(chunked_contigs_dicts)

            keeper_control_ipds = {}
            keeper_motifs = set()

            if self.opts.cross_cov_bins is not None:
                # Using contig<-->bin mappings, collect methylation data from
                # each contig and compile them into methylation scores for
                # each bin. Then discover motifs based on bin-level scores.
                bin_map = {}
                with open(self.opts.cross_cov_bins, "r") as f_bins:
                    for line in f_bins:
                        line = line.strip()
                        if not line:
                            continue
                        fields = line.split(",")
                        bin_map[fields[0]] = int(fields[1])

                # Initialize the bin-level methylation dictionaries
                bin_contig_dicts = {}
                for bin_id in bin_map.values():
                    bin_contig_dicts[bin_id] = {"SCp": {}, "SCp_N": {}}

                for contig, contig_d in contig_dicts.items():
                    # Make sure contig is binned. Membership is tested
                    # explicitly because bin id 0 is a valid (falsy) value.
                    if contig in bin_map:
                        bin_id = bin_map[contig]
                        bin_contig_dicts[bin_id] = build_bin_dict(
                            contig, bin_id, contig_d, bin_contig_dicts[bin_id])
                    else:
                        logging.info(
                            "Contig %s not found in cross-coverage binning results."
                            % contig)

                bin_ids = sorted(set(bin_map.values()))

                args = []
                for bin_id in bin_ids:
                    # For each bin, do motif filtering and refinement
                    if len(bin_contig_dicts[bin_id]["SCp"]) > 0:
                        bin_copy_contig_dicts = copy.deepcopy(
                            bin_contig_dicts[bin_id])
                        args.append((bin_id,
                                     bin_copy_contig_dicts["SCp"],
                                     bin_copy_contig_dicts["SCp_N"],
                                     self.opts,
                                     len(bin_ids),
                                     bin_id,
                                     "bin"))

                results = mbin.launch_pool(self.opts.procs, simplify_motifs,
                                           args)

                # Each result is indexed as (kept motifs, control means).
                for result in results:
                    keeper_motifs = keeper_motifs | result[0]

                # Add the control means for these bin motifs to the complete
                # set of control means for all detected motifs.
                for result in results:
                    control_means.update(result[1])

            else:
                args = []
                for j, (contig, contig_d) in enumerate(contig_dicts.items()):
                    # For each contig, do motif filtering and refinement
                    copy_contig_dicts = copy.deepcopy(contig_d)
                    args.append((j,
                                 copy_contig_dicts["SCp"],
                                 copy_contig_dicts["SCp_N"],
                                 self.opts,
                                 len(contig_dicts),
                                 contig,
                                 "contig"))

                results = mbin.launch_pool(self.opts.procs, simplify_motifs,
                                           args)

                # Each result is indexed as (kept motifs, control means).
                for result in results:
                    keeper_motifs = keeper_motifs | result[0]

                for result in results:
                    keeper_control_ipds.update(result[1])
                    # NOTE(review): as before, only the control means of the
                    # LAST contig result reach add_degen_motifs below, while
                    # the merged keeper_control_ipds is never read. Confirm
                    # which of the two is intended.
                    control_means = result[1]

            # Rewrite the control so that it includes the new degenerate motifs.
            control_means, n_degen = self.add_degen_motifs(
                keeper_motifs, control_means)
            if n_degen > 0:
                with open(self.opts.control_pkl_name, "wb") as f_pkl:
                    pickle.dump(control_means, f_pkl)

        elif self.opts.h5_type == "bas":
            logging.info("Transposing reads...")
            files = [
                os.path.join(self.opts.tmp, "read_ipds.tmp"),
                os.path.join(self.opts.tmp, "read_ipdsN.tmp")
            ]
            mbin.launch_pool(len(files), transpose_file, files)
            logging.info("Done.")
            logging.info("Streaming through reads for motif filtering...")
            keeper_motifs = self.bas_stream_files()

        logging.info("Keeping %s motifs for further analysis" %
                     len(keeper_motifs))
        self.motifs = list(keeper_motifs)
        with open(self.opts.motifs_fn, "w") as f_motifs:
            for motif in keeper_motifs:
                f_motifs.write("%s\n" % motif)