Example #1
def chunk_case_control_files(opts, contig, j, contigs_N):
    logging.info("   ...chunking contig %s (%s/%s)..." % (contig,
                                                          (j + 1), contigs_N))

    contig_SCp = {}
    contig_SCp_N = {}
    keeper_motifs = set()
    control_means = pickle.load(open(opts.control_pkl_name, "rb"))
    contig_ipds_fn = os.path.join(opts.tmp, "%s_ipds.tmp" % contig)
    contig_ipds_N_fn = os.path.join(opts.tmp, "%s_ipdsN.tmp" % contig)
    contig_ipds_kmers_fn = os.path.join(opts.tmp, "%s_ipdskmers.tmp" % contig)
    kmers = np.loadtxt(contig_ipds_kmers_fn, dtype="str")
    fns = [contig_ipds_fn, contig_ipds_N_fn]

    n_chunks = 99
    # float() must wrap len(kmers) itself: in Python 2, len(kmers) / n_chunks
    # truncates first, which made ceil() a no-op in the original expression.
    chunksize = int(math.ceil(float(len(kmers)) / n_chunks))
    cols_chunks = list(chunks(range(len(kmers)), chunksize))
    args = []
    for i, cols_chunk in enumerate(cols_chunks):
        cut_CMDs = []
        for fn in fns:
            cut_cols = "%s-%s" % ((cols_chunk[0] + 1), (cols_chunk[-1] + 1))
            in_fn = fn
            out_fn = fn + ".sub.%s" % i
            cut_CMD = "cut -d$\'\\t\' -f%s %s > %s" % (cut_cols, in_fn, out_fn)
            cut_CMDs.append(cut_CMD)
        args.append((i, opts.control_pkl_name, cut_CMDs, kmers, cols_chunk, j,
                     n_chunks, contigs_N, opts))
    results = mbin.launch_pool(opts.procs, process_contig_chunk, args)
    for result in results:  # result = (contig_SCp, contig_SCp_N)
        for motif in result[0].keys():
            contig_SCp[motif] = result[0][motif]
            contig_SCp_N[motif] = result[1][motif]

    return control_means, contig_SCp, contig_SCp_N, contig
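
Note: Examples #1 and #2 call a chunks helper that is not shown on this page. A minimal sketch of the slicing recipe it presumably implements (the real mbin helper may differ):

def chunks(l, n):
    # Hypothetical reconstruction: yield successive n-sized slices of l,
    # e.g. list(chunks(range(10), 4)) -> [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]].
    for i in range(0, len(l), n):
        yield l[i:i + n]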
Example #2
	def chunk_control_matrices( self, control_ipds_fn, control_ipds_N_fn, control_kmers_fn ):
		"""

		"""
		kmers       = np.atleast_1d(np.loadtxt(control_kmers_fn, dtype="str"))
		fns         = [control_ipds_fn, control_ipds_N_fn]
		n_chunks    = 99
		# float() must wrap len(kmers) itself: Python 2 integer division
		# truncates first, making ceil() a no-op in the original expression.
		chunksize   = int(math.ceil(float(len(kmers))/n_chunks))
		cols_chunks = list(chunks( range(len(kmers)), chunksize ))
		args        = []
		for i,cols_chunk in enumerate(cols_chunks):
			cut_CMDs = []
			for fn in fns:
				cut_cols = "%s-%s" % ((cols_chunk[0]+1), (cols_chunk[-1]+1))
				in_fn    = fn
				out_fn   = fn+".sub.%s" % i
				cut_CMD  = "cut -d$\'\\t\' -f%s %s > %s" % (cut_cols, in_fn, out_fn)
				cut_CMDs.append(cut_CMD)
			args.append( (i, cut_CMDs, kmers, cols_chunk, n_chunks, self.opts.min_motif_count) )
		
		results = mbin.launch_pool(self.opts.procs, process_contig_chunk, args)
		
		logging.info("Combining motifs from all chunks of control data...")
		not_found     = 0
		control_means = {}
		for result in results:  # result = (means_dict, n_not_found)
			not_found += result[1]
			for motif in result[0].keys():
				control_means[motif] = result[0][motif]
		logging.info("Done.")

		return control_means,not_found
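
Note: every example on this page dispatches its per-chunk work through mbin.launch_pool(procs, func, args), whose implementation is not shown. Judging from the call sites (a worker function plus a list of argument tuples, one result per tuple), a plausible minimal equivalent is a thin multiprocessing.Pool wrapper; this is an assumption, not the actual mbin source:

import multiprocessing

def launch_pool(procs, funct, args):
    # Assumed behavior: map each argument tuple to one worker call and
    # return the results in order. The real mbin.launch_pool may add
    # error handling, logging, or custom chunking.
    pool = multiprocessing.Pool(processes=procs)
    try:
        results = pool.map(funct, args)
    finally:
        pool.close()
        pool.join()
    return results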
Example #3
    def bas_stream_files(self):
        reads_ipds_fn = os.path.join(self.opts.tmp, "read_ipds.tmp")
        reads_ipds_kmers_fn = os.path.join(self.opts.tmp, "read_ipdskmers.tmp")
        reads_ipds_N_fn = os.path.join(self.opts.tmp, "read_ipdsN.tmp")
        all_motifs = defaultdict(list)
        logging.info("Unpickling the control IPDs...")
        # Pickles must be opened in binary mode ("rb"), as in Example #1.
        control_means = pickle.load(open(self.opts.control_pkl_name, "rb"))
        logging.info("Done.")
        args = []
        for j, line in enumerate(
                open(reads_ipds_fn + ".trans", "r")):
            args.append((j, copy.copy(control_means), reads_ipds_fn,
                         reads_ipds_N_fn, reads_ipds_kmers_fn, self.opts))

        results = mbin.launch_pool(self.opts.procs, get_motif_scores_from_read,
                                   args)
        highscore_motifs = [x for x in results if x is not None]

        # Keep only the shortest version of the high scoring motifs (reduces redundancy)
        keeper_motifs = set()
        if len(highscore_motifs) > 0:
            shortest_contiguous = min(
                [len(m.split("-")[0]) for m in highscore_motifs])
            shortest_motifs = [
                m for m in highscore_motifs
                if len(m.split("-")[0]) == shortest_contiguous
            ]
            to_del = set()
            for shorty in shortest_motifs:
                shorty_str = shorty.split("-")[0]
                shorty_idx = int(shorty.split("-")[1])
                for motif in highscore_motifs:
                    if motif != shorty:
                        motif_str = motif.split("-")[0]
                        motif_idx = int(motif.split("-")[1])
                        match = re.search(shorty_str, motif_str)
                        if match is not None:
                            if (shorty_idx + match.start()) == motif_idx:
                                to_del.add(motif)
            for motif in highscore_motifs:
                if motif not in to_del:
                    keeper_motifs.add(motif)
        return keeper_motifs
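
The pruning block above encodes each motif as "<string>-<index>", where the index is presumably the position of the methylated base within the motif. A shorter motif displaces a longer one only when the longer motif contains it at an offset that maps both indices to the same position. A toy illustration of that rule:

import re

# Hypothetical motifs in the "<string>-<index>" encoding used above.
highscore_motifs = ["GATC-1", "AGATC-2", "GGTC-2"]
shorty_str, shorty_idx = "GATC", 1
for motif in highscore_motifs:
    motif_str = motif.split("-")[0]
    motif_idx = int(motif.split("-")[1])
    match = re.search(shorty_str, motif_str)
    # "AGATC-2" is redundant: "GATC" occurs at offset 1 and 1 + 1 == 2,
    # so both motifs call the same methylated base. "GGTC-2" survives.
    if motif != "GATC-1" and match is not None \
            and shorty_idx + match.start() == motif_idx:
        print("drop %s (redundant with GATC-1)" % motif)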
Example #4
def build_profiles(opts, h5_files, motifs, motifs_fn):
    """

	"""
    if os.path.exists(opts.tmp):
        shutil.rmtree(opts.tmp)
    os.mkdir(opts.tmp)

    opts.motifs_file = motifs_fn
    opts.motifs = motifs
    opts.bi_motifs = None

    logging.info("Building methylation profiles using %s motifs..." %
                 len(opts.motifs))
    to_del = glob.glob(os.path.join(opts.tmp, "*"))
    for fn in to_del:
        os.remove(fn)

    mbinRunner = mbin.mbinRunner(opts)
    ##################################################
    # Launch analysis of <N_reads> for motif filtering
    ##################################################
    for i, h5_file in enumerate(h5_files):
        logging.info("Creating %s barcodes (%s motifs) from %s..." %
                     (opts.N_reads, len(opts.motifs), h5_file))
        mbinRunner.launch_data_loader(h5_file, opts.N_reads, i, opts)

        if opts.h5_type == "cmp":
            logging.info(
                "Combining subread-level barcodes to get read-level barcodes from each contig..."
            )
            contig_labels_fns = glob.glob(
                os.path.join(opts.tmp, "*_labels.tmp"))
            contigs = map(
                lambda x: os.path.basename(x).split("_labels.tmp")[0],
                contig_labels_fns)
            # Use a fresh index variable: Python 2 list comprehensions leak
            # their loop variable, so reusing i would clobber the h5_file counter.
            args = [(h5_file, contig, opts.tmp, opts.h5_labels, k,
                     len(contigs)) for k, contig in enumerate(contigs)]
            results = mbin.launch_pool(opts.procs,
                                       combine_subreads_for_read_level, args)

            logging.info("Combining read-level barcodes from all contigs...")
            mbinRunner.combine_read_level_barcodes_across_contigs()
            logging.info("Done.")

            logging.info(
                "Creating contig-level barcodes (%s motifs) from %s..." %
                (len(opts.motifs), h5_file))
            mbinRunner.combine_subreads_for_contig_level(h5_file)
            logging.info("Done.")
            n_contigs = len(
                np.loadtxt(os.path.join(opts.tmp,
                                        mbinRunner.fns["contig_names"]),
                           dtype="str",
                           ndmin=1))

            if opts.cross_cov_bins is not None:
                logging.info(
                    "Creating bin-level barcodes (%s motifs) using %s..." %
                    (len(opts.motifs), opts.cross_cov_bins))
                mbinRunner.combine_contigs_for_bin_level()
                logging.info("Done.")

    if opts.h5_type == "bas":
        # Combine subread data across multiple movies
        logging.info("Combining subread data across all movies...")
        results = mbinRunner.combine_subread_data_across_bas_movies()
        logging.info("Done.")
        # Combine movie-merged subreads to get read-level barcodes
        logging.info("Combining subreads to get read-level barcodes...")
        results = mbinRunner.bas_combine_subreads_for_read_level()
        logging.info("Done.")

        if opts.sam is not None:
            logging.info("Writing read-contig assignments based on %s..." %
                         opts.sam)
            mbinRunner.get_read_refs_from_SAM()
            logging.info("Done.")
            for i, h5_file in enumerate(h5_files):
                logging.info(
                    "Creating contig-level barcodes (%s motifs) from %s..." %
                    (len(opts.motifs), h5_file))
                mbinRunner.combine_subreads_for_contig_level(h5_file)
                logging.info("Done.")
            n_contigs = len(
                np.loadtxt(os.path.join(opts.tmp,
                                        mbinRunner.fns["contig_names"]),
                           dtype="str",
                           ndmin=1))

    logging.info("Writing output files:")

    if opts.h5_type == "cmp":
        write_contig_features(mbinRunner, opts)
        if opts.aligned_read_barcodes:
            write_aligned_read_features(mbinRunner, opts)
    elif opts.h5_type == "bas":
        write_unaligned_read_features(mbinRunner, opts)

    logging.info("Cleaning up temp files from methylation profiling...")
    shutil.rmtree(opts.tmp)
    logging.info("Pipeline finished.")
Example #5
    def run(self, mbinRunner):
        ####################################################
        # Filter out motifs without significant signatures
        ####################################################
        logging.info("Getting top motifs from each contig...")

        if self.opts.h5_type == "cmp":

            self.contig_fasta_lens = {}
            for entry in SeqIO.parse(self.opts.contigs, "fasta"):
                name = entry.id
                if name.find("|quiver") > -1:
                    # SMRT assemblies add |quiver to contig names, but this
                    # gets dropped from the contig names in the cmp.h5 file.
                    name = name.replace("|quiver", "")
                self.contig_fasta_lens[mbin.slugify(name)] = len(entry.seq)

            contig_ipds_fns = glob.glob(
                os.path.join(self.opts.tmp, "*_ipds.tmp"))
            contigs = map(lambda x: os.path.basename(x).split("_ipds.tmp")[0],
                          contig_ipds_fns)
            ipds_fn_dict = dict([
                (os.path.basename(ipds_fn).split("_ipds.tmp")[0], ipds_fn)
                for ipds_fn in contig_ipds_fns
            ])

            contigs_for_transpose = []
            contigs_for_chunking = []
            maxsize_for_transpose = 25000000  # 25 MB
            for name in contigs:
                fsize = os.path.getsize(ipds_fn_dict[name])
                if fsize < maxsize_for_transpose:
                    contigs_for_transpose.append(name)
                else:
                    contigs_for_chunking.append(name)

            logging.info("Transposing %s case contigs..." %
                         len(contigs_for_transpose))
            args = [(contig, self.opts) for contig in contigs_for_transpose]
            results = mbin.launch_pool(self.opts.procs,
                                       transpose_contig_matrix, args)

            streamed_contig_dicts = {}
            if len(contigs_for_transpose) > 0:
                logging.info("Streaming through %s contigs..." %
                             len(contigs_for_transpose))
                args = [(self.opts, contig, i, len(contigs_for_transpose))
                        for i, contig in enumerate(contigs_for_transpose)]
                results = mbin.launch_pool(self.opts.procs,
                                           stream_case_control_files, args)

                streamed_contig_SCp = map(lambda x: x[0], results)
                streamed_contig_SCp_N = map(lambda x: x[1], results)
                streamed_contigs = map(lambda x: x[2], results)

                for i, contig in enumerate(streamed_contigs):
                    streamed_contig_dicts[contig] = {
                        "SCp": streamed_contig_SCp[i],
                        "SCp_N": streamed_contig_SCp_N[i]
                    }

            chunked_contigs_dicts = {}
            if len(contigs_for_chunking) > 0:
                logging.info("Chunking %s contigs..." %
                             len(contigs_for_chunking))

                for i, contig in enumerate(contigs_for_chunking):
                    control_means, contig_SCp, contig_SCp_N, contig = chunk_case_control_files(
                        self.opts, contig, i, len(contigs_for_chunking))
                    chunked_contigs_dicts[contig] = {
                        "SCp": contig_SCp,
                        "SCp_N": contig_SCp_N
                    }

            # Combine the contig dictionaries from both streaming and chunked paths
            def merge_two_dicts(x, y):
                """Given two dicts, merge them into a new dict as a shallow copy."""
                z = x.copy()
                z.update(y)
                return z

            contig_dicts = merge_two_dicts(streamed_contig_dicts,
                                           chunked_contigs_dicts)

            keeper_control_ipds = {}
            keeper_motifs = set()

            if self.opts.cross_cov_bins is not None:
                """
				Using contig<-->bin mappings, collect methylation data from each
				contig and compile them into methylation scores for each bin.
				Then discover motifs based on bin-level scores.
				"""
                bin_map = {}
                # Text-mode read ("r", not "rb"); each line is "contig,bin_id".
                for line in open(self.opts.cross_cov_bins, "r"):
                    contig, bin_id = line.strip().split(",")[:2]
                    bin_map[contig] = int(bin_id)

                bin_contig_dicts = {}
                for bin_id in bin_map.values():
                    # Initialize the bin-level methylation dictionary
                    bin_contig_dicts[bin_id] = {"SCp": {}, "SCp_N": {}}

                for contig, contig_d in contig_dicts.iteritems():
                    # Membership test rather than bin_map.get(): a bin id of 0
                    # is falsy and would otherwise be skipped as unbinned.
                    if contig in bin_map:
                        bin_id = bin_map[contig]
                        bin_contig_dicts[bin_id] = build_bin_dict(
                            contig, bin_id, contig_d, bin_contig_dicts[bin_id])
                    else:
                        logging.info(
                            "Contig %s not found in cross-coverage binning results."
                            % contig)

                bin_ids = list(set(bin_map.values()))
                bin_ids.sort()

                args = []
                for bin_id in bin_ids:
                    # For each bin, do motif filtering and refinement
                    if len(bin_contig_dicts[bin_id]["SCp"].keys()) > 0:

                        bin_copy_contig_dicts = copy.deepcopy(
                            bin_contig_dicts[bin_id])

                        args.append((bin_id,
                                     bin_copy_contig_dicts["SCp"],
                                     bin_copy_contig_dicts["SCp_N"],
                                     self.opts,
                                     len(bin_ids),
                                     bin_id,
                                     "bin"))

                results = mbin.launch_pool(self.opts.procs, simplify_motifs,
                                           args)

                bin_keeper_motifs_list = map(lambda x: x[0], results)
                control_means_list = map(lambda x: x[1], results)
                """
				Add the control means for these bin motifs to the 
				complete set of control means for all detected motifs.
				"""
                for bin_keeper_motifs in bin_keeper_motifs_list:
                    keeper_motifs = keeper_motifs | bin_keeper_motifs

                for sub_control_means in control_means_list:
                    for motif, score in sub_control_means.iteritems():
                        control_means[motif] = score

            else:
                args = []
                for j, (contig,
                        contig_d) in enumerate(contig_dicts.iteritems()):
                    # For each contig, do motif filtering and refinement
                    copy_contig_dicts = copy.deepcopy(contig_d)

                    args.append((j,
                                 copy_contig_dicts["SCp"],
                                 copy_contig_dicts["SCp_N"],
                                 self.opts,
                                 len(contig_dicts.keys()),
                                 contig,
                                 "contig"))

                results = mbin.launch_pool(self.opts.procs, simplify_motifs,
                                           args)

                contig_keeper_motifs_list = map(lambda x: x[0], results)
                control_means_list = map(lambda x: x[1], results)
                """
				Add the control means for these bin motifs to the 
				complete set of control means for all detected motifs.
				"""
                for contig_keeper_motifs in contig_keeper_motifs_list:
                    keeper_motifs = keeper_motifs | contig_keeper_motifs

                # A distinct loop name: rebinding control_means here would
                # clobber the dict consumed by add_degen_motifs below.
                for sub_control_means in control_means_list:
                    for motif, score in sub_control_means.iteritems():
                        keeper_control_ipds[motif] = score

            # Rewrite the control so that it includes the new degenerate motifs.
            control_means, n_degen = self.add_degen_motifs(
                keeper_motifs, control_means)
            if n_degen > 0:
                pickle.dump(control_means,
                            open(self.opts.control_pkl_name, "wb"))

        elif self.opts.h5_type == "bas":
            logging.info("Transposing reads...")
            files = [
                os.path.join(self.opts.tmp, "read_ipds.tmp"),
                os.path.join(self.opts.tmp, "read_ipdsN.tmp")
            ]
            results = mbin.launch_pool(len(files), transpose_file, files)
            logging.info("Done.")
            logging.info("Streaming through reads for motif filtering...")
            keeper_motifs = self.bas_stream_files()

        logging.info("Keeping %s motifs for further analysis" %
                     len(keeper_motifs))
        self.motifs = list(keeper_motifs)
        n_motifs = len(keeper_motifs)
        with open(self.opts.motifs_fn, "w") as f_motifs:
            for motif in keeper_motifs:
                f_motifs.write("%s\n" % motif)
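
Example #5 only enters its bin-level branch when opts.cross_cov_bins is set; the parsing loop expects a plain CSV with one "contig,bin_id" pair per line. A hypothetical input (contig names invented for illustration):

contig_0,0
contig_1,0
contig_2,1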