def _detect_chimera(self, in_fasta, out_nc_fasta, out_c_fasta,
                    primer_report_fn, out_dom, num_reads, job_name):
    """Detect chimeric reads from in_fasta.

    Calls phmmer to generate a dom file (out_dom), saves non-chimeric
    reads to out_nc_fasta and chimeric reads to out_c_fasta.

    in_fasta --- either a fasta of trimmed fl reads, or a fasta of
                 trimmed nfl reads.
    out_nc_fasta --- an output fasta of non-chimeric reads
    out_c_fasta --- an output fasta of chimeric reads
    primer_report_fn --- an output primer report
    out_dom --- phmmer output
    num_reads --- number of reads in in_fasta
    job_name --- either 'fl' or 'nfl'
    Return: (num_nc, num_c, num_nc_bases, num_c_bases)
    """
    if op.exists(out_dom) and self.reuse_dom:
        # logging.warn is a deprecated alias of logging.warning.
        logging.warning(
            "Chimera detection output already exists. Parse {o}.".format(
                o=out_dom))
    else:
        # One chunk per CPU, never more chunks than reads, and at
        # least one chunk even when num_reads == 0.
        num_chunks = max(min(num_reads, self.cpus), 1)
        # Clamp to >= 1: with num_reads == 0 the ceil is 0 and the
        # division below would raise ZeroDivisionError.
        reads_per_chunk = max(
            1, int(math.ceil(num_reads / float(num_chunks))))
        num_chunks = int(math.ceil(num_reads / float(reads_per_chunk)))
        chunked_reads_fns = generateChunkedFN(
            self.out_dir,
            "in.{n}.trimmed.fa_split".format(n=job_name),
            num_chunks)
        chunked_dom_fns = generateChunkedFN(
            self.out_dir,
            "out.{n}.trimmed.hmmer_split".format(n=job_name),
            num_chunks)
        self._chunkReads(reads_fn=in_fasta,
                         reads_per_chunk=reads_per_chunk,
                         chunked_reads_fns=chunked_reads_fns,
                         extract_front_back_only=False)
        self._startPhmmers(chunked_reads_fns=chunked_reads_fns,
                           chunked_dom_fns=chunked_dom_fns,
                           out_dom_fn=out_dom,
                           primer_fn=self.primer_chimera_fn,
                           pbmatrix_fn=self.pbmatrix_fn)

    suspicious_hits = self._getChimeraRecord(out_dom,
                                             self.chimera_detection_opts)

    # Update chimera information.  The keyword 'suspicous_hits' (sic)
    # matches the parameter name declared by _updateChimeraInfo.
    (num_nc, num_c, num_nc_bases, num_c_bases) = \
        self._updateChimeraInfo(
            suspicous_hits=suspicious_hits,
            in_read_fn=in_fasta,
            out_nc_fn=out_nc_fasta,
            out_c_fn=out_c_fasta,
            primer_report_fn=primer_report_fn,
            # Only the 'fl' job writes the report header.
            write_report_header=(job_name == "fl"))
    return (num_nc, num_c, num_nc_bases, num_c_bases)
def _detect_chimera(self, in_fasta, out_nc_fasta, out_c_fasta,
                    primer_report_fn, out_dom, num_reads, job_name):
    """Detect chimeric reads from in_fasta.

    Calls phmmer to generate a dom file (out_dom), saves non-chimeric
    reads to out_nc_fasta and chimeric reads to out_c_fasta.

    in_fasta --- either a fasta of trimmed fl reads, or a fasta of
                 trimmed nfl reads.
    out_nc_fasta --- an output fasta of non-chimeric reads
    out_c_fasta --- an output fasta of chimeric reads
    primer_report_fn --- an output primer report
    out_dom --- phmmer output
    num_reads --- number of reads in in_fasta
    job_name --- either 'fl' or 'nfl'
    Return: (num_nc, num_c, num_nc_bases, num_c_bases)
    """
    if op.exists(out_dom) and self.reuse_dom:
        # logging.warn is a deprecated alias of logging.warning.
        logging.warning(
            "Chimera detection output already exists. Parse {o}.".format(
                o=out_dom))
    else:
        # One chunk per CPU, never more chunks than reads, and at
        # least one chunk even when num_reads == 0.
        num_chunks = max(min(num_reads, self.cpus), 1)
        # Clamp to >= 1: with num_reads == 0 the ceil is 0 and the
        # division below would raise ZeroDivisionError.
        reads_per_chunk = max(
            1, int(math.ceil(num_reads / float(num_chunks))))
        num_chunks = int(math.ceil(num_reads / float(reads_per_chunk)))
        chunked_reads_fns = generateChunkedFN(
            self.out_dir,
            "in.{n}.trimmed.fa_split".format(n=job_name),
            num_chunks)
        chunked_dom_fns = generateChunkedFN(
            self.out_dir,
            "out.{n}.trimmed.hmmer_split".format(n=job_name),
            num_chunks)
        self._chunkReads(reads_fn=in_fasta,
                         reads_per_chunk=reads_per_chunk,
                         chunked_reads_fns=chunked_reads_fns,
                         extract_front_back_only=False)
        self._startPhmmers(chunked_reads_fns=chunked_reads_fns,
                           chunked_dom_fns=chunked_dom_fns,
                           out_dom_fn=out_dom,
                           primer_fn=self.primer_chimera_fn,
                           pbmatrix_fn=self.pbmatrix_fn)

    suspicious_hits = self._getChimeraRecord(out_dom,
                                             self.chimera_detection_opts)

    # Update chimera information.  The keyword 'suspicous_hits' (sic)
    # matches the parameter name declared by _updateChimeraInfo.
    (num_nc, num_c, num_nc_bases, num_c_bases) = \
        self._updateChimeraInfo(
            suspicous_hits=suspicious_hits,
            in_read_fn=in_fasta,
            out_nc_fn=out_nc_fasta,
            out_c_fn=out_c_fasta,
            primer_report_fn=primer_report_fn,
            # Only the 'fl' job writes the report header.
            write_report_header=(job_name == "fl"))
    return (num_nc, num_c, num_nc_bases, num_c_bases)
def runPrimerTrimmer(self):
    """Run PHMMER to identify barcodes and trim them away.

    (1) create forward/reverse primers
    (2) copy input with just the first/last k bases
    (3) run phmmer
    (4) parse phmmer DOM output, trim barcodes and output summary
    """
    logging.info("Start to find and trim 3'/5' primers and polyAs.")
    # Sanity check input primers and create forward/reverse primers
    # for primer detection.
    primer_indices = self._processPrimers(
        primer_fn=self.primer_fn,
        window_size=self.chimera_detection_opts.primer_search_window,
        primer_out_fn=self.primer_front_back_fn,
        revcmp_primers=False)

    logging.info("reuse_dom = {0}".format(self.reuse_dom))
    if op.exists(self.out_front_back_dom_fn) and self.reuse_dom:
        # logging.warn is a deprecated alias of logging.warning.
        logging.warning(
            "Primer detection output already exists. Parsing {0}".format(
                self.out_front_back_dom_fn))
    else:
        # Split reads in reads_fn into smaller chunks, one per CPU,
        # but at least one chunk.
        num_chunks = max(min(self.cpus, self.numReads), 1)
        # Clamp to >= 1: with numReads == 0 the ceil is 0 and the
        # division below would raise ZeroDivisionError.
        reads_per_chunk = max(
            1, int(math.ceil(self.numReads / float(num_chunks))))
        num_chunks = int(
            math.ceil(self.numReads / float(reads_per_chunk)))
        logging.debug("Split reads into {n} chunks".format(n=num_chunks))

        # Divide input reads into smaller chunks and extract only
        # the front and the end segment from each read.
        self.chunked_front_back_reads_fns = generateChunkedFN(
            self.out_dir, "in.front_end.fa_split", num_chunks)
        # Dom output of phmmer for the above front/end sequences.
        self.chunked_front_back_dom_fns = generateChunkedFN(
            self.out_dir, "out.front_end.hmmer_split", num_chunks)

        # Split reads within 'reads_fn' into 'num_chunks' chunks, and
        # only extract the front and end segment from each read.
        window_size = self.chimera_detection_opts.primer_search_window
        self._chunkReads(
            reads_fn=self.reads_fn,
            reads_per_chunk=reads_per_chunk,
            chunked_reads_fns=self.chunked_front_back_reads_fns,
            extract_front_back_only=True,
            window_size=window_size)

        # Start n='num_chunks' phmmer jobs.
        self._startPhmmers(
            chunked_reads_fns=self.chunked_front_back_reads_fns,
            chunked_dom_fns=self.chunked_front_back_dom_fns,
            out_dom_fn=self.out_front_back_dom_fn,
            primer_fn=self.primer_front_back_fn,
            pbmatrix_fn=self.pbmatrix_fn)

    # Parse dom file, and return dictionaries of front & back hits.
    best_of_front, best_of_back = self._getBestFrontBackRecord(
        self.out_front_back_dom_fn)

    # Trim barcode away.
    self._trimBarCode(reads_fn=self.reads_fn,
                      out_fl_reads_fn=self._trimmed_fl_reads_fn,
                      out_nfl_reads_fn=self._trimmed_nfl_reads_fn,
                      primer_report_nfl_fn=self._primer_report_nfl_fn,
                      best_of_front=best_of_front,
                      best_of_back=best_of_back,
                      primer_indices=primer_indices,
                      min_seq_len=self.chimera_detection_opts.min_seq_len,
                      min_score=self.chimera_detection_opts.min_score,
                      change_read_id=self.change_read_id,
                      ignore_polyA=self.ignore_polyA)

    # Clean intermediate files: chunked reads and chunked dom files.
    # Guarded: when an existing dom file was reused above, the chunk
    # file lists were never created in this call.
    if getattr(self, "chunked_front_back_reads_fns", None):
        self._cleanup(self.chunked_front_back_reads_fns)
    if getattr(self, "chunked_front_back_dom_fns", None):
        self._cleanup(self.chunked_front_back_dom_fns)
    logging.info("Done with finding and trimming primers and polyAs.")
def runPrimerTrimmer(self):
    """Run PHMMER to identify barcodes and trim them away.

    (1) create forward/reverse primers
    (2) copy input with just the first/last k bases
    (3) run phmmer
    (4) parse phmmer DOM output, trim barcodes and output summary
    """
    logging.info("Start to find and trim 3'/5' primers and polyAs.")
    # Sanity check input primers and create forward/reverse primers
    # for primer detection.
    primer_names = self._processPrimers(
        primer_fn_forward=self.primer_fn_forward,
        primer_fn_reverse=self.primer_fn_reverse,
        window_size=self.chimera_detection_opts.primer_search_window,
        primer_out_fn=self.primer_front_back_fn,
        revcmp_primers=False)

    logging.info("reuse_dom = {0}".format(self.reuse_dom))
    if op.exists(self.out_front_back_dom_fn) and self.reuse_dom:
        # logging.warn is a deprecated alias of logging.warning.
        logging.warning(
            "Primer detection output already exists. Parsing {0}".format(
                self.out_front_back_dom_fn))
    else:
        # Split reads in reads_fn into smaller chunks, one per CPU,
        # but at least one chunk.
        num_chunks = max(min(self.cpus, self.numReads), 1)
        # Clamp to >= 1: with numReads == 0 the ceil is 0 and the
        # division below would raise ZeroDivisionError.
        reads_per_chunk = max(
            1, int(math.ceil(self.numReads / float(num_chunks))))
        num_chunks = int(
            math.ceil(self.numReads / float(reads_per_chunk)))
        logging.debug("Split reads into {n} chunks".format(n=num_chunks))

        # Divide input reads into smaller chunks and extract only
        # the front and the end segment from each read.
        self.chunked_front_back_reads_fns = generateChunkedFN(
            self.out_dir, "in.front_end.fa_split", num_chunks)
        # Dom output of phmmer for the above front/end sequences.
        self.chunked_front_back_dom_fns = generateChunkedFN(
            self.out_dir, "out.front_end.hmmer_split", num_chunks)

        # Split reads within 'reads_fn' into 'num_chunks' chunks, and
        # only extract the front and end segment from each read.
        window_size = self.chimera_detection_opts.primer_search_window
        self._chunkReads(
            reads_fn=self.reads_fn,
            reads_per_chunk=reads_per_chunk,
            chunked_reads_fns=self.chunked_front_back_reads_fns,
            extract_front_back_only=True,
            window_size=window_size)

        # Start n='num_chunks' phmmer jobs.
        self._startPhmmers(
            chunked_reads_fns=self.chunked_front_back_reads_fns,
            chunked_dom_fns=self.chunked_front_back_dom_fns,
            out_dom_fn=self.out_front_back_dom_fn,
            primer_fn=self.primer_front_back_fn,
            pbmatrix_fn=self.pbmatrix_fn)

    # Parse dom file, and return dictionaries of front & back hits.
    best_of_front, best_of_back = self._getBestFrontBackRecord(
        self.out_front_back_dom_fn,
        self.chimera_detection_opts.min_score)

    # Trim barcode away.
    self._trimBarCode(reads_fn=self.reads_fn,
                      out_fl_reads_fn=self._trimmed_fl_reads_fn,
                      out_nfl_reads_fn=self._trimmed_nfl_reads_fn,
                      primer_report_nfl_fn=self._primer_report_nfl_fn,
                      best_of_front=best_of_front,
                      best_of_back=best_of_back,
                      primer_names=primer_names,
                      min_seq_len=self.chimera_detection_opts.min_seq_len,
                      min_score=self.chimera_detection_opts.min_score,
                      change_read_id=self.change_read_id,
                      ignore_polyA=self.ignore_polyA,
                      keep_primer=self.keep_primer)

    # Clean intermediate files: chunked reads and chunked dom files.
    # Guarded: when an existing dom file was reused above, the chunk
    # file lists were never created in this call.
    if getattr(self, "chunked_front_back_reads_fns", None):
        self._cleanup(self.chunked_front_back_reads_fns)
    if getattr(self, "chunked_front_back_dom_fns", None):
        self._cleanup(self.chunked_front_back_dom_fns)
    logging.info("Done with finding and trimming primers and polyAs.")
def runChimeraDetector(self):
    """Detect chimeras from trimmed full-length reads."""
    logging.info("Start to detect chimeras from trimmed reads.")
    need_cleanup = True
    # Use op.exists for consistency with the rest of this module.
    if op.exists(self.out_trimmed_reads_dom_fn):
        # Reuse existing phmmer dom output instead of rerunning it.
        logging.info("Output already exists. Parsing {0}.".format(
            self.out_trimmed_reads_dom_fn))
        need_cleanup = False
    else:
        # Create forward/reverse primers for chimera detection.  The
        # return value is unused; the call is made for its side effect
        # of writing self.primer_chimera_fn.
        _primer_indices = self._processPrimers(
            primer_fn=self.primer_fn,
            window_size=self.chimera_detection_opts.primer_search_window,
            primer_out_fn=self.primer_chimera_fn,
            revcmp_primers=True)

        # Only detect chimeras on full-length reads in order to save
        # time.  One chunk per CPU, at least one chunk.
        num_chunks = max(min(self.summary.num_fl, self.cpus), 1)
        # Clamp to >= 1: with num_fl == 0 the ceil is 0 and the
        # division below would raise ZeroDivisionError.
        reads_per_chunk = max(
            1, int(math.ceil(self.summary.num_fl / float(num_chunks))))
        num_chunks = int(
            math.ceil(self.summary.num_fl / float(reads_per_chunk)))

        self.chunked_trimmed_reads_fns = generateChunkedFN(
            self.out_dir, "in.trimmed.fa_split", num_chunks)
        self.chunked_trimmed_reads_dom_fns = generateChunkedFN(
            self.out_dir, "out.trimmed.hmmer_split", num_chunks)
        self._chunkReads(
            reads_fn=self._trimmed_fl_reads_fn,
            reads_per_chunk=reads_per_chunk,
            chunked_reads_fns=self.chunked_trimmed_reads_fns,
            extract_front_back_only=False)
        self._startPhmmers(self.chunked_trimmed_reads_fns,
                           self.chunked_trimmed_reads_dom_fns,
                           self.out_trimmed_reads_dom_fn,
                           self.primer_chimera_fn,
                           self.pbmatrix_fn)

    suspicious_hits = self._getChimeraRecord(
        self.out_trimmed_reads_dom_fn, self.chimera_detection_opts)

    # The keyword 'suspicous_hits' (sic) matches the parameter name
    # declared by _updateChimeraInfo.
    self._updateChimeraInfo(suspicous_hits=suspicious_hits,
                            in_read_fn=self._trimmed_fl_reads_fn,
                            out_flnc_fn=self.out_flnc_fn,
                            out_flc_fn=self.out_flc_fn,
                            primer_report_fl_fn=self._primer_report_fl_fn)

    # full-length non-chimeric reads written to out_flnc.fa
    # non-full-length reads written to out_nfl.fa
    # primer info of fl reads reported to _primer_report_fl_fn
    # primer info of nfl reads reported to _primer_report_nfl_fn
    # Need to: (1) concatenate out_flnc and out_nfl to make
    #              out_all_reads_fn
    #          (2) concatenate _primer_report_fl_fn and
    #              _primer_report_nfl_fn to make primer_report_fn
    cat_files(src=[self.out_flnc_fn, self.out_nfl_fn],
              dst=self.out_all_reads_fn)
    cat_files(src=[self._primer_report_fl_fn, self._primer_report_nfl_fn],
              dst=self.primer_report_fn)

    if need_cleanup:
        self._cleanup(self.chunked_trimmed_reads_fns +
                      self.chunked_trimmed_reads_dom_fns)
    logging.info("Done with chimera detection.")
def runChimeraDetector(self):
    """Detect chimeras from trimmed full-length reads."""
    logging.info("Start to detect chimeras from trimmed reads.")
    need_cleanup = True
    # Use op.exists for consistency with the rest of this module.
    if op.exists(self.out_trimmed_reads_dom_fn):
        # Reuse existing phmmer dom output instead of rerunning it.
        logging.info("Output already exists. Parsing {0}.".format(
            self.out_trimmed_reads_dom_fn))
        need_cleanup = False
    else:
        # Create forward/reverse primers for chimera detection.  The
        # return value is unused; the call is made for its side effect
        # of writing self.primer_chimera_fn.
        _primer_indices = self._processPrimers(
            primer_fn=self.primer_fn,
            window_size=self.chimera_detection_opts.primer_search_window,
            primer_out_fn=self.primer_chimera_fn,
            revcmp_primers=True)

        # Only detect chimeras on full-length reads in order to save
        # time.  One chunk per CPU, at least one chunk.
        num_chunks = max(min(self.summary.num_fl, self.cpus), 1)
        # Clamp to >= 1: with num_fl == 0 the ceil is 0 and the
        # division below would raise ZeroDivisionError.
        reads_per_chunk = max(
            1, int(math.ceil(self.summary.num_fl / float(num_chunks))))
        num_chunks = int(
            math.ceil(self.summary.num_fl / float(reads_per_chunk)))

        self.chunked_trimmed_reads_fns = generateChunkedFN(
            self.out_dir, "in.trimmed.fa_split", num_chunks)
        self.chunked_trimmed_reads_dom_fns = generateChunkedFN(
            self.out_dir, "out.trimmed.hmmer_split", num_chunks)
        self._chunkReads(
            reads_fn=self._trimmed_fl_reads_fn,
            reads_per_chunk=reads_per_chunk,
            chunked_reads_fns=self.chunked_trimmed_reads_fns,
            extract_front_back_only=False)
        self._startPhmmers(self.chunked_trimmed_reads_fns,
                           self.chunked_trimmed_reads_dom_fns,
                           self.out_trimmed_reads_dom_fn,
                           self.primer_chimera_fn,
                           self.pbmatrix_fn)

    suspicious_hits = self._getChimeraRecord(
        self.out_trimmed_reads_dom_fn, self.chimera_detection_opts)

    # The keyword 'suspicous_hits' (sic) matches the parameter name
    # declared by _updateChimeraInfo.
    self._updateChimeraInfo(suspicous_hits=suspicious_hits,
                            in_read_fn=self._trimmed_fl_reads_fn,
                            out_flnc_fn=self.out_flnc_fn,
                            out_flc_fn=self.out_flc_fn,
                            primer_report_fl_fn=self._primer_report_fl_fn)

    # full-length non-chimeric reads written to out_flnc.fa
    # non-full-length reads written to out_nfl.fa
    # primer info of fl reads reported to _primer_report_fl_fn
    # primer info of nfl reads reported to _primer_report_nfl_fn
    # Need to: (1) concatenate out_flnc and out_nfl to make
    #              out_all_reads_fn
    #          (2) concatenate _primer_report_fl_fn and
    #              _primer_report_nfl_fn to make primer_report_fn
    cat_files(src=[self.out_flnc_fn, self.out_nfl_fn],
              dst=self.out_all_reads_fn)
    cat_files(src=[self._primer_report_fl_fn, self._primer_report_nfl_fn],
              dst=self.primer_report_fn)

    if need_cleanup:
        self._cleanup(self.chunked_trimmed_reads_fns +
                      self.chunked_trimmed_reads_dom_fns)
    logging.info("Done with chimera detection.")