def run(self):
    """Write the reads of the input FASTA that pass the subset rules.

    Every read name in self.inFN is parsed with ReadAnnotation.fromString
    and checked with self.satisfy against self.rules. When
    self.printReadLengthOnly is set, only the sequence length of each
    passing read is written to self.outFN (one per line); otherwise the
    passing reads are written with FastaWriter.writeRecord.
    """
    def _flag(value):
        # Render a rule value as the text used in the log message.
        return "true" if value != 0 else "false"

    msg_head = "Extracting reads from {f} based on ".format(f=self.inFN)
    msg_tail = "rules(FullLength={fl}, nonChimeric={nc}).".format(
        fl=_flag(self.rules.FL), nc=_flag(self.rules.nonChimeric))
    logging.info(msg_head + msg_tail)

    if self.printReadLengthOnly:
        # Lengths only: read names and sequences are not emitted.
        with FastaReader(self.inFN) as fasta_in, \
                open(self.outFN, 'w') as length_out:
            for record in fasta_in:
                parsed = ReadAnnotation.fromString(record.name,
                                                   self.ignore_polyA)
                if self.satisfy(parsed, self.rules):
                    length_out.write(
                        "{rl}\n".format(rl=len(record.sequence)))
        return

    with FastaReader(self.inFN) as fasta_in, \
            FastaWriter(self.outFN) as fasta_out:
        for record in fasta_in:
            parsed = ReadAnnotation.fromString(record.name,
                                               self.ignore_polyA)
            if self.satisfy(parsed, self.rules):
                fasta_out.writeRecord(record.name, record.sequence)
def run(self):
    """Extract the reads that satisfy the subset rules.

    Reads come from the FASTA file self.inFN. A read is kept when the
    annotation parsed from its name passes self.satisfy with self.rules.
    Kept reads are written to self.outFN either as FASTA records or,
    when self.printReadLengthOnly is set, as one sequence length per line.
    """
    if self.rules.FL != 0:
        full_length_text = "true"
    else:
        full_length_text = "false"
    if self.rules.nonChimeric != 0:
        non_chimeric_text = "true"
    else:
        non_chimeric_text = "false"

    logging.info(
        "Extracting reads from {f} based on ".format(f=self.inFN) +
        "rules(FullLength={fl}, nonChimeric={nc}).".format(
            fl=full_length_text, nc=non_chimeric_text))

    def _selected(records):
        # Lazily yield only the records whose annotation passes the rules.
        for rec in records:
            rec_annotation = ReadAnnotation.fromString(rec.name,
                                                       self.ignore_polyA)
            if self.satisfy(rec_annotation, self.rules):
                yield rec

    if not self.printReadLengthOnly:
        with FastaReader(self.inFN) as source, \
                FastaWriter(self.outFN) as sink:
            for rec in _selected(source):
                sink.writeRecord(rec.name, rec.sequence)
    else:
        # Print read length only, not read names and sequences.
        with FastaReader(self.inFN) as source, \
                open(self.outFN, 'w') as sink:
            for rec in _selected(source):
                sink.write("{rl}\n".format(rl=len(rec.sequence)))
def test_satisfy(self):
    """Test function satisfy().

    Loads every read from data/test_subset.fa under self.testDir, parses
    each read name into a ReadAnnotation, and checks the per-read result
    of satisfy() under SubsetRules(1, 1) against the expected list.
    """
    inFN = op.join(self.testDir, "data/test_subset.fa")
    with FastaReader(inFN) as reader:
        reads = list(reader)

    rules = SubsetRules(1, 1)  # Full-length, non-chimeric
    obj = ReadsSubsetExtractor("in", "out", rules, True)

    annotations = [ReadAnnotation.fromString(r.name) for r in reads]
    res = [obj.satisfy(an, rules) for an in annotations]
    expected = [1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1]

    # assertEqual reports which elements differ when the check fails,
    # whereas assertTrue(res == expected) only says "False is not true".
    self.assertEqual(res, expected)
def _updateChimeraInfo(self, suspicous_hits, in_read_fn, out_nc_fn,
                       out_c_fn, primer_report_fn,
                       write_report_header=True):
    """Split reads into non-chimeric and chimeric output files.

    in_read_fn --- a fasta of full-length reads or a fasta of
                   non-full-length reads.
    A read whose id (the first whitespace-separated token of its name)
    is in suspicous_hits is marked chimeric (chimera = 1) and written to
    out_c_fn; every other read is marked non-chimeric (chimera = 0) and
    written to out_nc_fn. The annotation of every read is written to
    primer_report_fn as a comma-delimited record, preceded by a header
    line when write_report_header is true.

    Return:
        (num_nc, num_c, num_nc_bases, num_c_bases)
    """
    logging.debug(
        "Update chimera info for reads in {f} ".format(f=in_read_fn))
    logging.debug(
        "Write primer report to {rpt}".format(rpt=primer_report_fn))

    num_nc = num_c = num_nc_bases = num_c_bases = 0
    with FastaReader(in_read_fn) as fasta_in, \
            FastaWriter(out_nc_fn) as nc_out, \
            FastaWriter(out_c_fn) as c_out, \
            open(primer_report_fn, 'w') as report:
        if write_report_header:
            report.write(ReadAnnotation.header(delimiter=",") + "\n")

        for record in fasta_in:
            # e.g. record.name="movie/zmw/0_100_CCS fiveend=1;threeend=100;"
            read_id = record.name.split()[0]
            anno = ReadAnnotation.fromString(
                record.name, ignore_polyA=self.ignore_polyA)
            is_chimeric = read_id in suspicous_hits

            if is_chimeric:
                anno.chimera = 1
                num_c += 1
                num_c_bases += len(record.sequence)
                c_out.writeRecord(anno.toAnnotation(), record.sequence)
            else:
                # Primer of a primer-trimmed read can not be None.
                anno.chimera = 0
                num_nc += 1
                num_nc_bases += len(record.sequence)
                nc_out.writeRecord(anno.toAnnotation(), record.sequence)

            # Every read gets a report line, chimeric or not.
            report.write(anno.toReportRecord(delimitor=",") + "\n")

    return (num_nc, num_c, num_nc_bases, num_c_bases)
def _updateChimeraInfo(self, suspicous_hits, in_read_fn, out_nc_fn,
                       out_c_fn, primer_report_fn,
                       write_report_header=True):
    """Annotate reads as chimeric or not and tally both groups.

    in_read_fn --- a fasta of full-length reads or a fasta of
                   non-full-length reads.
    For each read in in_read_fn, the read id (first token of the read
    name) is looked up in suspicous_hits. Reads found there get
    chimera = 1 and go to out_c_fn; the rest get chimera = 0 and go to
    out_nc_fn. Each read's annotation is also written to
    primer_report_fn, after an optional header line.

    Return:
        (num_nc, num_c, num_nc_bases, num_c_bases)
    """
    debug_reads = "Update chimera info for reads in {f} "
    debug_report = "Write primer report to {rpt}"
    logging.debug(debug_reads.format(f=in_read_fn))
    logging.debug(debug_report.format(rpt=primer_report_fn))

    num_nc, num_nc_bases = 0, 0
    num_c, num_c_bases = 0, 0

    with FastaReader(in_read_fn) as reads_in, \
            FastaWriter(out_nc_fn) as non_chimeric_out, \
            FastaWriter(out_c_fn) as chimeric_out, \
            open(primer_report_fn, 'w') as report_out:
        if write_report_header:
            header_line = ReadAnnotation.header(delimiter=",")
            report_out.write(header_line + "\n")

        for read in reads_in:
            # e.g. read.name="movie/zmw/0_100_CCS fiveend=1;threeend=100;"
            name_tokens = read.name.split()
            read_key = name_tokens[0]
            read_annotation = ReadAnnotation.fromString(
                read.name, ignore_polyA=self.ignore_polyA)

            if read_key not in suspicous_hits:
                # Non-chimeric read.
                read_annotation.chimera = 0
                num_nc += 1
                num_nc_bases += len(read.sequence)
                target = non_chimeric_out
            else:
                # Chimeric read.
                read_annotation.chimera = 1
                num_c += 1
                num_c_bases += len(read.sequence)
                target = chimeric_out

            target.writeRecord(read_annotation.toAnnotation(),
                               read.sequence)
            report_line = read_annotation.toReportRecord(delimitor=",")
            report_out.write(report_line + "\n")

        return (num_nc, num_c, num_nc_bases, num_c_bases)
def _updateChimeraInfo(self, suspicous_hits, in_read_fn, out_flnc_fn,
                       out_flc_fn, primer_report_fl_fn):
    """Mark each full-length read as chimeric or non-chimeric.

    in_read_fn --- a fasta of full-length reads
    A read whose id (first whitespace-separated token of its name) is in
    suspicous_hits gets chimera = 1 and is written to out_flc_fn; any
    other read gets chimera = 0, is asserted to be full-length, and is
    written to out_flnc_fn. Counters on self.summary are updated, and a
    report record for every read is written to primer_report_fl_fn under
    a tab-separated header of ReadAnnotation.fieldsNames().
    """
    logging.info("Update chimera info to reads annotations "
                 "in the output FASTA file and the primer report.")

    with FastaReader(in_read_fn) as fasta_in, \
            FastaWriter(out_flnc_fn) as flnc_out, \
            FastaWriter(out_flc_fn) as flc_out, \
            open(primer_report_fl_fn, 'w') as report:
        report.write("\t".join(ReadAnnotation.fieldsNames()) + "\n")

        for record in fasta_in:
            # e.g. record.name="movie/zmw/0_100_CCS fiveend=1;threeend=100;"
            read_id = record.name.split()[0]
            anno = ReadAnnotation.fromString(
                record.name, ignore_polyA=self.ignore_polyA)

            if read_id in suspicous_hits:
                # Chimeric read.
                anno.chimera = 1
                self.summary.num_flc += 1
                flc_out.writeRecord(anno.toAnnotation(), record.sequence)
            else:
                # Non-chimeric read.
                # Primer of a primer-trimmed read can not be None.
                anno.chimera = 0
                assert anno.isFullLength
                self.summary.num_flnc += 1
                self.summary.num_flnc_bases += len(record.sequence)
                flnc_out.writeRecord(anno.toAnnotation(), record.sequence)

            # Every read gets a report line, chimeric or not.
            report.write(anno.toReportRecord() + "\n")
def _updateChimeraInfo(self, suspicous_hits, in_read_fn, out_flnc_fn,
                       out_flc_fn, primer_report_fl_fn):
    """Write chimera annotations for the reads of a full-length FASTA.

    in_read_fn --- a fasta of full-length reads
    The read id is the first token of the read name. Reads whose id is
    not in suspicous_hits are treated as non-chimeric: chimera is set to
    0, the read is asserted to be full-length, self.summary.num_flnc and
    self.summary.num_flnc_bases are increased, and the read goes to
    out_flnc_fn. All other reads get chimera = 1, increase
    self.summary.num_flc, and go to out_flc_fn. primer_report_fl_fn
    receives a header line followed by one report record per read.
    """
    info_text = ("Update chimera info to reads annotations " +
                 "in the output FASTA file and the primer report.")
    logging.info(info_text)

    with FastaReader(in_read_fn) as reads_in, \
            FastaWriter(out_flnc_fn) as non_chimeric_out, \
            FastaWriter(out_flc_fn) as chimeric_out, \
            open(primer_report_fl_fn, 'w') as report_out:
        column_names = ReadAnnotation.fieldsNames()
        report_out.write("\t".join(column_names) + "\n")

        for read in reads_in:
            # e.g. read.name="movie/zmw/0_100_CCS fiveend=1;threeend=100;"
            read_key = read.name.split()[0]
            read_annotation = ReadAnnotation.fromString(
                read.name, ignore_polyA=self.ignore_polyA)
            non_chimeric = read_key not in suspicous_hits

            if non_chimeric:
                # Primer of a primer-trimmed read can not be None.
                read_annotation.chimera = 0
                assert (read_annotation.isFullLength)
                self.summary.num_flnc += 1
                self.summary.num_flnc_bases += len(read.sequence)
                non_chimeric_out.writeRecord(
                    read_annotation.toAnnotation(), read.sequence)
            else:
                read_annotation.chimera = 1
                self.summary.num_flc += 1
                chimeric_out.writeRecord(
                    read_annotation.toAnnotation(), read.sequence)

            report_line = read_annotation.toReportRecord()
            report_out.write(report_line + "\n")