Esempio n. 1
0
    def run(self):
        """Extract the reads that pass the subset rules.

        Every record of the input FASTA (self.inFN) is turned into a
        ReadAnnotation built from its name and checked with
        self.satisfy() against self.rules.  Depending on
        self.printReadLengthOnly, the output file (self.outFN) receives
        either the passing records (name and sequence) or only their
        sequence lengths, one per line.
        """
        summary = "Extracting reads from {f} based on ".format(f=self.inFN)
        wantFL = "true" if self.rules.FL != 0 else "false"
        wantNC = "true" if self.rules.nonChimeric != 0 else "false"
        summary += "rules(FullLength={fl}, nonChimeric={nc}).".format(
            fl=wantFL, nc=wantNC)
        logging.info(summary)

        if self.printReadLengthOnly:
            # Length-only mode: plain text output, one length per line,
            # without read names or sequences.
            with FastaReader(self.inFN) as records, \
                    open(self.outFN, 'w') as lengthsOut:
                for rec in records:
                    anno = ReadAnnotation.fromString(rec.name,
                                                     self.ignore_polyA)
                    if not self.satisfy(anno, self.rules):
                        continue
                    lengthsOut.write("{rl}\n".format(rl=len(rec.sequence)))
        else:
            # Default mode: copy each passing record to a FASTA file.
            with FastaReader(self.inFN) as records, \
                    FastaWriter(self.outFN) as fastaOut:
                for rec in records:
                    anno = ReadAnnotation.fromString(rec.name,
                                                     self.ignore_polyA)
                    if not self.satisfy(anno, self.rules):
                        continue
                    fastaOut.writeRecord(rec.name, rec.sequence)
Esempio n. 2
0
    def run(self):
        """Subset the input reads according to the configured rules.

        Reads come from the FASTA file self.inFN.  A read is kept when
        the ReadAnnotation parsed from its name satisfies self.rules.
        Kept reads are written to self.outFN as FASTA records, or, when
        self.printReadLengthOnly is set, as bare sequence lengths (one
        per line).
        """
        logging.info(
            "Extracting reads from {f} based on ".format(f=self.inFN) +
            "rules(FullLength={fl}, nonChimeric={nc}).".format(
                fl="true" if self.rules.FL != 0 else "false",
                nc="true" if self.rules.nonChimeric != 0 else "false"))

        def passes(record):
            """Return self.satisfy()'s verdict for one FASTA record."""
            parsed = ReadAnnotation.fromString(
                record.name, self.ignore_polyA)
            return self.satisfy(parsed, self.rules)

        if not self.printReadLengthOnly:
            with FastaReader(self.inFN) as fastaIn, \
                    FastaWriter(self.outFN) as fastaOut:
                for record in fastaIn:
                    if passes(record):
                        fastaOut.writeRecord(record.name, record.sequence)
            return

        # Length-only output: no read names and no sequences.
        with FastaReader(self.inFN) as fastaIn, \
                open(self.outFN, 'w') as textOut:
            for record in fastaIn:
                if passes(record):
                    textOut.write("{rl}\n".format(rl=len(record.sequence)))
Esempio n. 3
0
    def test_satisfy(self):
        """Test function satisfy().

        Loads every read from data/test_subset.fa, parses each read
        name into a ReadAnnotation, and compares the satisfy() verdicts
        under full-length, non-chimeric rules with the expected list.
        """
        inFN = op.join(self.testDir, "data/test_subset.fa")
        with FastaReader(inFN) as reader:
            reads = list(reader)

        rules = SubsetRules(1, 1)  # Full-length, non-chimeric
        obj = ReadsSubsetExtractor("in", "out", rules, True)

        ans = [ReadAnnotation.fromString(r.name) for r in reads]
        res = [obj.satisfy(an, rules) for an in ans]
        expected = [1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1]
        # assertEqual reports which elements differ on failure, whereas
        # assertTrue(res == expected) only reports that False is not true.
        self.assertEqual(res, expected)
Esempio n. 4
0
    def _updateChimeraInfo(self,
                           suspicous_hits,
                           in_read_fn,
                           out_nc_fn,
                           out_c_fn,
                           primer_report_fn,
                           write_report_header=True):
        """
        Mark each read of in_read_fn as chimeric or non-chimeric.

        in_read_fn --- a fasta of full-length reads or a fasta of
                       non-full-length reads.
        A read whose id (first whitespace-separated token of its name)
        is in suspicous_hits is flagged chimeric and written to
        out_c_fn; every other read is flagged non-chimeric and written
        to out_nc_fn.  The updated annotation of every read is appended
        to primer_report_fn as a comma-delimited record, preceded by a
        header line when write_report_header is true.
        Return:
            (num_nc, num_c, num_nc_bases, num_c_bases)
        """
        logging.debug(
            "Update chimera info for reads in {f} ".format(f=in_read_fn))
        logging.debug(
            "Write primer report to {rpt}".format(rpt=primer_report_fn))

        num_nc = num_c = 0
        num_nc_bases = num_c_bases = 0
        with FastaReader(in_read_fn) as fasta_in, \
                FastaWriter(out_nc_fn) as nc_out, \
                FastaWriter(out_c_fn) as c_out, \
                open(primer_report_fn, 'w') as report:
            if write_report_header:
                report.write(ReadAnnotation.header(delimiter=",") + "\n")
            for read in fasta_in:
                # e.g. read.name="movie/zmw/0_100_CCS fiveend=1;threeend=100;"
                read_id = read.name.split()[0]
                annotation = ReadAnnotation.fromString(
                    read.name, ignore_polyA=self.ignore_polyA)
                is_chimeric = read_id in suspicous_hits
                annotation.chimera = 1 if is_chimeric else 0
                if is_chimeric:
                    num_c += 1
                    num_c_bases += len(read.sequence)
                    sink = c_out
                else:
                    num_nc += 1
                    num_nc_bases += len(read.sequence)
                    sink = nc_out
                # The chimera flag is set above, so the record name
                # written here already carries it.
                sink.writeRecord(annotation.toAnnotation(), read.sequence)

                report.write(annotation.toReportRecord(delimitor=",") + "\n")
            return (num_nc, num_c, num_nc_bases, num_c_bases)
Esempio n. 5
0
    def _updateChimeraInfo(self, suspicous_hits, in_read_fn, out_nc_fn,
                           out_c_fn, primer_report_fn,
                           write_report_header=True):
        """
        Split the reads of in_read_fn by chimera status and report them.

        in_read_fn --- a fasta of full-length reads or a fasta of
                       non-full-length reads.
        Reads whose id is found in suspicous_hits get chimera=1 and go
        to out_c_fn; the remaining reads get chimera=0 and go to
        out_nc_fn.  One comma-delimited annotation record per read is
        written to primer_report_fn, after an optional header line
        controlled by write_report_header.
        Return:
            (num_nc, num_c, num_nc_bases, num_c_bases)
        """
        debug_read_msg = "Update chimera info for reads in {f} "
        debug_report_msg = "Write primer report to {rpt}"
        logging.debug(debug_read_msg.format(f=in_read_fn))
        logging.debug(debug_report_msg.format(rpt=primer_report_fn))

        nc_reads, c_reads, nc_bases, c_bases = 0, 0, 0, 0
        with FastaReader(in_read_fn) as records, \
                FastaWriter(out_nc_fn) as nc_writer, \
                FastaWriter(out_c_fn) as c_writer, \
                open(primer_report_fn, 'w') as report_out:
            if write_report_header:
                header_line = ReadAnnotation.header(delimiter=",")
                report_out.write(header_line + "\n")
            for rec in records:
                # e.g. rec.name="movie/zmw/0_100_CCS fiveend=1;threeend=100;"
                rec_id = rec.name.split()[0]
                anno = ReadAnnotation.fromString(
                    rec.name, ignore_polyA=self.ignore_polyA)
                if rec_id in suspicous_hits:  # chimeric reads
                    anno.chimera = 1
                    c_reads += 1
                    c_bases += len(rec.sequence)
                    c_writer.writeRecord(anno.toAnnotation(), rec.sequence)
                else:  # Non-chimeric reads
                    anno.chimera = 0
                    nc_reads += 1
                    nc_bases += len(rec.sequence)
                    nc_writer.writeRecord(anno.toAnnotation(), rec.sequence)

                report_line = anno.toReportRecord(delimitor=",")
                report_out.write(report_line + "\n")
            return (nc_reads, c_reads, nc_bases, c_bases)
Esempio n. 6
0
    def _updateChimeraInfo(self, suspicous_hits, in_read_fn, out_flnc_fn,
                           out_flc_fn, primer_report_fl_fn):
        """
        Flag every read of in_read_fn as chimeric or non-chimeric.

        in_read_fn --- a fasta of full-length reads
        A read whose id (first whitespace-separated token of its name)
        is in suspicous_hits is marked chimeric and written to
        out_flc_fn; any other read is marked non-chimeric and written
        to out_flnc_fn.  The counters in self.summary are updated along
        the way, and the annotation of every read is written to
        primer_report_fl_fn below a tab-separated header line.
        """
        logging.info("Update chimera info to reads annotations "
                     "in the output FASTA file and the primer report.")

        with FastaReader(in_read_fn) as fl_reads, \
                FastaWriter(out_flnc_fn) as flnc_out, \
                FastaWriter(out_flc_fn) as flc_out, \
                open(primer_report_fl_fn, 'w') as report:
            header = "\t".join(ReadAnnotation.fieldsNames())
            report.write(header + "\n")
            for read in fl_reads:
                # e.g. read.name="movie/zmw/0_100_CCS fiveend=1;threeend=100;"
                read_id = read.name.split()[0]
                annotation = ReadAnnotation.fromString(
                    read.name, ignore_polyA=self.ignore_polyA)
                is_chimeric = read_id in suspicous_hits
                annotation.chimera = 1 if is_chimeric else 0
                if is_chimeric:
                    self.summary.num_flc += 1
                    sink = flc_out
                else:
                    # Non-chimeric reads from this input must be
                    # full-length.
                    assert annotation.isFullLength
                    self.summary.num_flnc += 1
                    self.summary.num_flnc_bases += len(read.sequence)
                    sink = flnc_out
                sink.writeRecord(annotation.toAnnotation(), read.sequence)

                report.write(annotation.toReportRecord() + "\n")
Esempio n. 7
0
    def _updateChimeraInfo(self, suspicous_hits, in_read_fn, out_flnc_fn,
                           out_flc_fn, primer_report_fl_fn):
        """
        Record the chimera status of the reads in in_read_fn.

        in_read_fn --- a fasta of full-length reads
        Reads whose id appears in suspicous_hits are written to
        out_flc_fn with chimera=1; all other reads are written to
        out_flnc_fn with chimera=0.  self.summary is updated with the
        read counts (and the base count of non-chimeric reads), and one
        report record per read is written to primer_report_fl_fn after
        a tab-separated header line.
        """
        progress_msg = ("Update chimera info to reads annotations " +
                        "in the output FASTA file and the primer report.")
        logging.info(progress_msg)

        with FastaReader(in_read_fn) as records, \
                FastaWriter(out_flnc_fn) as nc_writer, \
                FastaWriter(out_flc_fn) as c_writer, \
                open(primer_report_fl_fn, 'w') as report_out:
            report_out.write("\t".join(ReadAnnotation.fieldsNames()) + "\n")
            for rec in records:
                # e.g. rec.name="movie/zmw/0_100_CCS fiveend=1;threeend=100;"
                rec_id = rec.name.split()[0]
                anno = ReadAnnotation.fromString(
                    rec.name, ignore_polyA=self.ignore_polyA)
                if rec_id in suspicous_hits:  # chimeric reads
                    anno.chimera = 1
                    self.summary.num_flc += 1
                    c_writer.writeRecord(anno.toAnnotation(), rec.sequence)
                else:  # Non-chimeric reads
                    anno.chimera = 0
                    assert anno.isFullLength
                    self.summary.num_flnc += 1
                    self.summary.num_flnc_bases += len(rec.sequence)
                    nc_writer.writeRecord(anno.toAnnotation(), rec.sequence)

                report_line = anno.toReportRecord()
                report_out.write(report_line + "\n")