Example 1
    def _detect_chimera(self, in_fasta, out_nc_fasta, out_c_fasta,
                        primer_report_fn, out_dom, num_reads, job_name):
        """Detect chimeric reads from in_fasta, call phmmer to generate a
        dom file (out_dom), save non-chimeric reads to out_nc_fasta and
        chimeric reads to out_c_fasta.
            in_fasta --- either a fasta of trimmed fl reads, or a fasta of
                         trimmed nfl reads.
            out_nc_fasta --- an output fasta of non-chimeric reads
            out_c_fasta --- an output fasta of chimeric reads
            primer_report_fn --- an output primer report
            out_dom --- phmmer output
            num_reads --- number of reads in in_fasta
            job_name --- either 'fl' or 'nfl'
        Return:
            (num_nc, num_c, num_nc_bases, num_c_bases)
        """
        if op.exists(out_dom) and self.reuse_dom:
            logging.warn(
                "Chimera detection output already exists. Parse {o}.".format(
                    o=out_dom))
        else:
            num_chunks = max(min(num_reads, self.cpus), 1)
            reads_per_chunk = int(math.ceil(num_reads / float(num_chunks)))
            num_chunks = int(math.ceil(num_reads / float(reads_per_chunk)))

            chunked_reads_fns = generateChunkedFN(
                self.out_dir, "in.{n}.trimmed.fa_split".format(n=job_name),
                num_chunks)

            chunked_dom_fns = generateChunkedFN(
                self.out_dir, "out.{n}.trimmed.hmmer_split".format(n=job_name),
                num_chunks)

            self._chunkReads(reads_fn=in_fasta,
                             reads_per_chunk=reads_per_chunk,
                             chunked_reads_fns=chunked_reads_fns,
                             extract_front_back_only=False)

            self._startPhmmers(chunked_reads_fns=chunked_reads_fns,
                               chunked_dom_fns=chunked_dom_fns,
                               out_dom_fn=out_dom,
                               primer_fn=self.primer_chimera_fn,
                               pbmatrix_fn=self.pbmatrix_fn)

        suspicous_hits = self._getChimeraRecord(out_dom,
                                                self.chimera_detection_opts)

        # Update chimera information
        (num_nc, num_c, num_nc_bases, num_c_bases) = \
            self._updateChimeraInfo(suspicous_hits=suspicous_hits,
                                    in_read_fn=in_fasta,
                                    out_nc_fn=out_nc_fasta,
                                    out_c_fn=out_c_fasta,
                                    primer_report_fn=primer_report_fn,
                                    write_report_header=(job_name == "fl"))

        return (num_nc, num_c, num_nc_bases, num_c_bases)
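
The chunk sizing above first caps the number of chunks at min(num_reads, cpus) (never below one), then recomputes num_chunks after rounding reads_per_chunk up, so that no chunk file ends up empty. A minimal standalone sketch of the same arithmetic (the helper name compute_chunking is hypothetical, not part of the module):

import math

def compute_chunking(num_reads, cpus):
    # Cap the chunk count at the smaller of read count and CPU count,
    # but always use at least one chunk.
    num_chunks = max(min(num_reads, cpus), 1)
    reads_per_chunk = int(math.ceil(num_reads / float(num_chunks)))
    # Recompute: rounding reads_per_chunk up can leave trailing chunks
    # empty, e.g. 9 reads on 4 CPUs -> 3 reads per chunk -> only 3 chunks.
    num_chunks = int(math.ceil(num_reads / float(reads_per_chunk)))
    return num_chunks, reads_per_chunk

# e.g. compute_chunking(9, 4) == (3, 3); compute_chunking(5, 8) == (5, 1)
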
Example 2
    def _detect_chimera(self, in_fasta, out_nc_fasta, out_c_fasta,
                        primer_report_fn, out_dom, num_reads, job_name):
        """Detect chimeric reads from in_fasta, call phmmer to generate a
        dom file (out_dom), save non-chimeric reads to out_nc_fasta and
        chimeric reads to out_c_fasta.
            in_fasta --- either a fasta of trimmed fl reads, or a fasta of
                         trimmed nfl reads.
            out_nc_fasta --- an output fasta of non-chimeric reads
            out_c_fasta --- an output fasta of chimeric reads
            primer_report_fn --- an output primer report
            out_dom --- phmmer output
            num_reads --- number of reads in in_fasta
            job_name --- either 'fl' or 'nfl'
        Return:
            (num_nc, num_c, num_nc_bases, num_c_bases)
        """
        if op.exists(out_dom) and self.reuse_dom:
            logging.warn("Chimera detection output already exists. Parse {o}.".
                         format(o=out_dom))
        else:
            num_chunks = max(min(num_reads, self.cpus), 1)
            reads_per_chunk = int(math.ceil(num_reads / float(num_chunks)))
            num_chunks = int(math.ceil(num_reads / float(reads_per_chunk)))

            chunked_reads_fns = generateChunkedFN(self.out_dir,
                                                  "in.{n}.trimmed.fa_split".format(n=job_name), num_chunks)

            chunked_dom_fns = generateChunkedFN(self.out_dir,
                                                "out.{n}.trimmed.hmmer_split".format(n=job_name), num_chunks)

            self._chunkReads(reads_fn=in_fasta,
                             reads_per_chunk=reads_per_chunk,
                             chunked_reads_fns=chunked_reads_fns,
                             extract_front_back_only=False)

            self._startPhmmers(chunked_reads_fns=chunked_reads_fns,
                               chunked_dom_fns=chunked_dom_fns,
                               out_dom_fn=out_dom,
                               primer_fn=self.primer_chimera_fn,
                               pbmatrix_fn=self.pbmatrix_fn)

        suspicous_hits = self._getChimeraRecord(out_dom,
                                                self.chimera_detection_opts)

        # Update chimera information
        (num_nc, num_c, num_nc_bases, num_c_bases) = \
            self._updateChimeraInfo(suspicous_hits=suspicous_hits,
                                    in_read_fn=in_fasta,
                                    out_nc_fn=out_nc_fasta,
                                    out_c_fn=out_c_fasta,
                                    primer_report_fn=primer_report_fn,
                                    write_report_header=(job_name == "fl"))

        return (num_nc, num_c, num_nc_bases, num_c_bases)
Example 3
    def runPrimerTrimmer(self):
        """Run PHMMER to identify barcodes and trim them away.
        (1) create forward/reverse primers
        (2) copy input with just the first/last k bases
        (3) run phmmer
        (4) parse phmmer DOM output, trim barcodes and output summary
        """
        logging.info("Start to find and trim 3'/5' primers and polyAs.")
        # Sanity check input primers and create forward/reverse primers
        # for primer detection.
        primer_indices = self._processPrimers(
            primer_fn=self.primer_fn,
            window_size=self.chimera_detection_opts.primer_search_window,
            primer_out_fn=self.primer_front_back_fn,
            revcmp_primers=False)

        logging.info("reuse_dom = {0}".format(self.reuse_dom))
        if op.exists(self.out_front_back_dom_fn) and self.reuse_dom:
            logging.warn("Primer detection output already exists. Parsing {0}".
                         format(self.out_front_back_dom_fn))
        else:
            # Split reads in reads_fn into smaller chunks.
            num_chunks = max(min(self.cpus, self.numReads), 1)
            reads_per_chunk = int(math.ceil(self.numReads / (float(num_chunks))))
            num_chunks = int(math.ceil(self.numReads / float(reads_per_chunk)))

            logging.debug("Split reads into {n} chunks".format(n=num_chunks))
            # Divide input reads into smaller chunks and extract only
            # the front and the end segment from each read.
            self.chunked_front_back_reads_fns = generateChunkedFN(self.out_dir,
                                                                  "in.front_end.fa_split", num_chunks)

            # Dom output of phmmer for the above front/end sequences.
            self.chunked_front_back_dom_fns = generateChunkedFN(self.out_dir,
                                                                "out.front_end.hmmer_split", num_chunks)

            # Split reads within 'reads_fn' into 'num_chunks' chunks, and only
            # extract the front and end segment from each read.
            window_size = self.chimera_detection_opts.primer_search_window
            self._chunkReads(reads_fn=self.reads_fn,
                             reads_per_chunk=reads_per_chunk,
                             chunked_reads_fns=self.chunked_front_back_reads_fns,
                             extract_front_back_only=True,
                             window_size=window_size)

            # Start n='num_chunks' phmmer.
            self._startPhmmers(
                chunked_reads_fns=self.chunked_front_back_reads_fns,
                chunked_dom_fns=self.chunked_front_back_dom_fns,
                out_dom_fn=self.out_front_back_dom_fn,
                primer_fn=self.primer_front_back_fn,
                pbmatrix_fn=self.pbmatrix_fn)

        # Parse dom file, and return dictionaries of front & back records.
        best_of_front, best_of_back = self._getBestFrontBackRecord(
            self.out_front_back_dom_fn)

        # Trim barcodes away
        self._trimBarCode(reads_fn=self.reads_fn,
                          out_fl_reads_fn=self._trimmed_fl_reads_fn,
                          out_nfl_reads_fn=self._trimmed_nfl_reads_fn,
                          primer_report_nfl_fn=self._primer_report_nfl_fn,
                          best_of_front=best_of_front,
                          best_of_back=best_of_back,
                          primer_indices=primer_indices,
                          min_seq_len=self.chimera_detection_opts.min_seq_len,
                          min_score=self.chimera_detection_opts.min_score,
                          change_read_id=self.change_read_id,
                          ignore_polyA=self.ignore_polyA)

        # Clean up intermediate files: chunked reads files and chunked dom files.
        self._cleanup(self.chunked_front_back_reads_fns)
        self._cleanup(self.chunked_front_back_dom_fns)
        logging.info("Done with finding and trimming primers and polyAs.")
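
_chunkReads itself is not shown; with extract_front_back_only=True it keeps only the first and last window_size bases of each read (the docstring's "first/last k bases"), so phmmer scans just the regions where primers can occur. A rough sketch of that extraction for a single FASTA file; the function name and the _front/_back read naming are illustrative, not the module's actual scheme:

def extract_front_back(in_fasta, out_fasta, window_size):
    # Write only the first and last window_size bases of each read.
    # Sketch only: the real _chunkReads also splits the input across
    # several chunk files.
    def write_rec(out, name, seq):
        if name is None:
            return
        out.write(">{0}_front\n{1}\n>{0}_back\n{2}\n".format(
            name, seq[:window_size], seq[-window_size:]))

    with open(in_fasta) as fin, open(out_fasta, "w") as fout:
        name, parts = None, []
        for line in fin:
            line = line.strip()
            if line.startswith(">"):
                write_rec(fout, name, "".join(parts))
                name, parts = line[1:].split()[0], []
            else:
                parts.append(line)
        write_rec(fout, name, "".join(parts))
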
Example 4
    def runPrimerTrimmer(self):
        """Run PHMMER to identify barcodes and trim them away.
        (1) create forward/reverse primers
        (2) copy input with just the first/last k bases
        (3) run phmmer
        (4) parse phmmer DOM output, trim barcodes and output summary
        """
        logging.info("Start to find and trim 3'/5' primers and polyAs.")
        # Sanity check input primers and create forward/reverse primers
        # for primer detection.
        primer_names = self._processPrimers(
            primer_fn_forward=self.primer_fn_forward,
            primer_fn_reverse=self.primer_fn_reverse,
            window_size=self.chimera_detection_opts.primer_search_window,
            primer_out_fn=self.primer_front_back_fn,
            revcmp_primers=False)

        logging.info("reuse_dom = {0}".format(self.reuse_dom))
        if op.exists(self.out_front_back_dom_fn) and self.reuse_dom:
            logging.warn("Primer detection output already exists. Parsing {0}".
                         format(self.out_front_back_dom_fn))
        else:
            # Split reads in reads_fn into smaller chunks.
            num_chunks = max(min(self.cpus, self.numReads), 1)
            reads_per_chunk = int(math.ceil(self.numReads / (float(num_chunks))))
            num_chunks = int(math.ceil(self.numReads / float(reads_per_chunk)))

            logging.debug("Split reads into {n} chunks".format(n=num_chunks))
            # Divide input reads into smaller chunks and extract only
            # the front and the end segment from each read.
            self.chunked_front_back_reads_fns = generateChunkedFN(self.out_dir,
                                                                  "in.front_end.fa_split", num_chunks)

            # Dom output of phmmer for the above front/end sequences.
            self.chunked_front_back_dom_fns = generateChunkedFN(self.out_dir,
                                                                "out.front_end.hmmer_split", num_chunks)

            # Split reads within 'reads_fn' into 'num_chunks' chunks, and only
            # extract the front and end segment from each read.
            window_size = self.chimera_detection_opts.primer_search_window
            self._chunkReads(reads_fn=self.reads_fn,
                             reads_per_chunk=reads_per_chunk,
                             chunked_reads_fns=self.chunked_front_back_reads_fns,
                             extract_front_back_only=True,
                             window_size=window_size)

            # Start n='num_chunks' phmmer.
            self._startPhmmers(
                chunked_reads_fns=self.chunked_front_back_reads_fns,
                chunked_dom_fns=self.chunked_front_back_dom_fns,
                out_dom_fn=self.out_front_back_dom_fn,
                primer_fn=self.primer_front_back_fn,
                pbmatrix_fn=self.pbmatrix_fn)

        # Parse dom file, and return dictionaries of front & back records.
        best_of_front, best_of_back = self._getBestFrontBackRecord(
            self.out_front_back_dom_fn, self.chimera_detection_opts.min_score)

        # Trim barcodes away
        self._trimBarCode(reads_fn=self.reads_fn,
                          out_fl_reads_fn=self._trimmed_fl_reads_fn,
                          out_nfl_reads_fn=self._trimmed_nfl_reads_fn,
                          primer_report_nfl_fn=self._primer_report_nfl_fn,
                          best_of_front=best_of_front,
                          best_of_back=best_of_back,
                          primer_names=primer_names,
                          min_seq_len=self.chimera_detection_opts.min_seq_len,
                          min_score=self.chimera_detection_opts.min_score,
                          change_read_id=self.change_read_id,
                          ignore_polyA=self.ignore_polyA,
                          keep_primer=self.keep_primer)

        # Clean up intermediate files: chunked reads files and chunked dom files.
        self._cleanup(self.chunked_front_back_reads_fns)
        self._cleanup(self.chunked_front_back_dom_fns)
        logging.info("Done with finding and trimming primers and polyAs.")
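
_startPhmmers is not shown either; it starts one phmmer job per chunk and apparently merges the per-chunk DOM tables into out_dom_fn. A minimal serial sketch, assuming phmmer (HMMER) is on the PATH: the real code runs the chunks in parallel, may pass additional options (e.g. a scoring matrix via pbmatrix_fn), and the query/target order shown here is only schematic.

import subprocess

def run_phmmer_jobs(chunked_reads_fns, chunked_dom_fns, out_dom_fn, primer_fn):
    for reads_fn, dom_fn in zip(chunked_reads_fns, chunked_dom_fns):
        # --domtblout writes phmmer's parseable per-domain hit table;
        # --noali omits the alignment text, which is not needed here.
        subprocess.check_call(["phmmer", "--domtblout", dom_fn, "--noali",
                               reads_fn, primer_fn])
    # Concatenate the per-chunk DOM tables into the single output file.
    with open(out_dom_fn, "w") as out:
        for dom_fn in chunked_dom_fns:
            with open(dom_fn) as f:
                out.write(f.read())
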
Example 5
    def runChimeraDetector(self):
        """Detect chimeras from trimmed reads."""
        logging.info("Start to detect chimeras from trimmed reads.")
        need_cleanup = True
        if os.path.exists(self.out_trimmed_reads_dom_fn):
            logging.info("Output already exists. Parsing {0}.".format(self.out_trimmed_reads_dom_fn))
            need_cleanup = False
        else:
            # Create forward/reverse primers for chimera detection.
            _primer_indices = self._processPrimers(
                primer_fn=self.primer_fn,
                window_size=self.chimera_detection_opts.primer_search_window,
                primer_out_fn=self.primer_chimera_fn,
                revcmp_primers=True)
    
            num_chunks = max(min(self.summary.num_fl, self.cpus), 1)
            #logging.debug("Split non-full-length reads into {n} chunks.".
            #              format(n=num_chunks))
            # Only detect chimeras on full-length reads in order to save time
            reads_per_chunk = int(math.ceil(self.summary.num_fl /
                                            (float(num_chunks))))
            num_chunks = int(math.ceil(self.summary.num_fl/float(reads_per_chunk)))
    
            self.chunked_trimmed_reads_fns = generateChunkedFN(self.out_dir,
                "in.trimmed.fa_split", num_chunks)
    
            self.chunked_trimmed_reads_dom_fns = generateChunkedFN(self.out_dir,
                "out.trimmed.hmmer_split", num_chunks)
    
            self._chunkReads(reads_fn=self._trimmed_fl_reads_fn,
                             reads_per_chunk=reads_per_chunk,
                             chunked_reads_fns=self.chunked_trimmed_reads_fns,
                             extract_front_back_only=False)
    
            self._startPhmmers(self.chunked_trimmed_reads_fns,
                               self.chunked_trimmed_reads_dom_fns,
                               self.out_trimmed_reads_dom_fn,
                               self.primer_chimera_fn,
                               self.pbmatrix_fn)

        suspicous_hits = self._getChimeraRecord(self.out_trimmed_reads_dom_fn,
                                                self.chimera_detection_opts)

        # Only detect chimeras on full-length reads in order to save time
        self._updateChimeraInfo(suspicous_hits=suspicous_hits,
                                in_read_fn=self._trimmed_fl_reads_fn,
                                out_flnc_fn=self.out_flnc_fn,
                                out_flc_fn=self.out_flc_fn,
                                primer_report_fl_fn=self._primer_report_fl_fn)
        # full-length non-chimeric reads written to out_flnc.fa
        # non-full-length reads written to out_nfl.fa
        # primer info of fl reads reported to _primer_report_fl_fn
        # primer info of nfl reads reported to _primer_report_nfl_fn
        # Need to: (1) concatenate out_flnc and out_nfl to make
        #              out_all_reads_fn
        #          (2) concatenate _primer_report_fl_fn and
        #              _primer_report_nfl_fn to make primer_report_fn
        cat_files(src=[self.out_flnc_fn, self.out_nfl_fn],
                  dst=self.out_all_reads_fn)

        cat_files(src=[self._primer_report_fl_fn, self._primer_report_nfl_fn],
                  dst=self.primer_report_fn)

        if need_cleanup:
            self._cleanup(self.chunked_trimmed_reads_fns +
                          self.chunked_trimmed_reads_dom_fns)
        logging.info("Done with chimera detection.")
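
Here _processPrimers is called with revcmp_primers=True, presumably so that primer hits inside a read can be found in either orientation when screening for chimeras. Reverse-complementing a primer is trivial; the sketch below is illustrative only, not the module's implementation:

_COMPLEMENT = {"A": "T", "C": "G", "G": "C", "T": "A",
               "a": "t", "c": "g", "g": "c", "t": "a"}

def revcmp(seq):
    # Reverse-complement a primer sequence; unknown bases map to N.
    return "".join(_COMPLEMENT.get(base, "N") for base in reversed(seq))

# e.g. revcmp("AAGCAGTGGT") == "ACCACTGCTT"
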
Example 6
    def runChimeraDetector(self):
        """Detect chimeras from trimmed reads."""
        logging.info("Start to detect chimeras from trimmed reads.")
        need_cleanup = True
        if os.path.exists(self.out_trimmed_reads_dom_fn):
            logging.info("Output already exists. Parsing {0}.".format(
                self.out_trimmed_reads_dom_fn))
            need_cleanup = False
        else:
            # Create forward/reverse primers for chimera detection.
            _primer_indices = self._processPrimers(
                primer_fn=self.primer_fn,
                window_size=self.chimera_detection_opts.primer_search_window,
                primer_out_fn=self.primer_chimera_fn,
                revcmp_primers=True)

            num_chunks = max(min(self.summary.num_fl, self.cpus), 1)
            #logging.debug("Split non-full-length reads into {n} chunks.".
            #              format(n=num_chunks))
            # Only detect chimeras on full-length reads in order to save time
            reads_per_chunk = int(
                math.ceil(self.summary.num_fl / (float(num_chunks))))
            num_chunks = int(
                math.ceil(self.summary.num_fl / float(reads_per_chunk)))

            self.chunked_trimmed_reads_fns = generateChunkedFN(
                self.out_dir, "in.trimmed.fa_split", num_chunks)

            self.chunked_trimmed_reads_dom_fns = generateChunkedFN(
                self.out_dir, "out.trimmed.hmmer_split", num_chunks)

            self._chunkReads(reads_fn=self._trimmed_fl_reads_fn,
                             reads_per_chunk=reads_per_chunk,
                             chunked_reads_fns=self.chunked_trimmed_reads_fns,
                             extract_front_back_only=False)

            self._startPhmmers(self.chunked_trimmed_reads_fns,
                               self.chunked_trimmed_reads_dom_fns,
                               self.out_trimmed_reads_dom_fn,
                               self.primer_chimera_fn, self.pbmatrix_fn)

        suspicous_hits = self._getChimeraRecord(self.out_trimmed_reads_dom_fn,
                                                self.chimera_detection_opts)

        # Only detect chimeras on full-length reads in order to save time
        self._updateChimeraInfo(suspicous_hits=suspicous_hits,
                                in_read_fn=self._trimmed_fl_reads_fn,
                                out_flnc_fn=self.out_flnc_fn,
                                out_flc_fn=self.out_flc_fn,
                                primer_report_fl_fn=self._primer_report_fl_fn)
        # full-length non-chimeric reads written to out_flnc.fa
        # non-full-length reads written to out_nfl.fa
        # primer info of fl reads reported to _primer_report_fl_fn
        # primer info of nfl reads reported to _primer_report_nfl_fn
        # Need to: (1) concatenate out_flnc and out_nfl to make
        #              out_all_reads_fn
        #          (2) concatenate _primer_report_fl_fn and
        #              _primer_report_nfl_fn to make primer_report_fn
        cat_files(src=[self.out_flnc_fn, self.out_nfl_fn],
                  dst=self.out_all_reads_fn)

        cat_files(src=[self._primer_report_fl_fn, self._primer_report_nfl_fn],
                  dst=self.primer_report_fn)

        if need_cleanup:
            self._cleanup(self.chunked_trimmed_reads_fns +
                          self.chunked_trimmed_reads_dom_fns)
        logging.info("Done with chimera detection.")
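
cat_files concatenates the listed source files into dst, in order (per the comment block above). A minimal sketch of such a helper; the package's own implementation may differ, e.g. in how repeated report headers are handled:

import shutil

def cat_files(src, dst):
    # Concatenate every file in src into dst, preserving order.
    with open(dst, "wb") as out:
        for fn in src:
            with open(fn, "rb") as fin:
                shutil.copyfileobj(fin, out)

# e.g. cat_files(src=["out_flnc.fa", "out_nfl.fa"], dst="out_all_reads.fa")
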