    def _processPrimers(self, primer_fn_forward, primer_fn_reverse,
                        window_size, primer_out_fn, revcmp_primers=False):
        """
        Do basic sanity checks that:
        (1) all forward primer names start with f_ and their sequences are unique
        (2) all reverse primer names start with r_ and their sequences are unique
        (3) no forward primer sequence also appears among the reverse primers
        (4) write the forward primers as-is and the reverse primers
            reverse-complemented, all to one primer file
        """
        def sanity_check_primers(reader, prefix):
            """
            Go through the primers, check that the prefix exists and all seqs are unique
            """
            primers = {} # primer -> sequence, but can also contain the revcmp version with _revcmp suffix
            for r in reader:
                if not r.name.startswith(prefix):
                    errMsg = "Forward primer should start with f_, but saw:", r.name
                    raise ClassifierException(errMsg)
                if len(r.sequence) > window_size:
                    errMsg = "Primer {n} has length {l} which is longer than {k}.".\
                     format(n=r.name, l=len(r.sequence), k=window_size)
                    logging.error(errMsg)
                    raise ClassifierException(errMsg)
                ss = r.sequence.upper()
                if ss in primers.itervalues():
                    errMsg = "Duplicate sequences found for {s}".format(s=ss)
                    raise ClassifierException(errMsg)
                # Store the upper-cased sequence so the duplicate check above
                # stays case-insensitive.
                primers[r.name.strip()] = ss
                # revcmp not needed because phmmer searches both strands.
                #primers[r.name.strip() + "_revcmp"] = revcmp(r.sequence)
            return primers


        logging.info("Process primers for {case}.".
                     format(case=("finding primers" if not revcmp_primers
                                  else "detecting chimeras")))

        reader_f = FastaReader(primer_fn_forward)
        reader_r = FastaReader(primer_fn_reverse)

        primers_f = sanity_check_primers(reader_f, prefix="f_")
        primers_r = sanity_check_primers(reader_r, prefix="r_")

        reader_f.close()
        reader_r.close()

        same_seqs = set(primers_f.values()).intersection(primers_r.values())
        if len(same_seqs) > 0:
            errMsg = "Identical sequences found in both Forward/Reverse!\n"
            errMsg += "\n".join(same_seqs)
            raise ClassifierException(errMsg)

        # Write Fi and reverse-complemented Ri to primer_out_fn
        with open(primer_out_fn, 'w') as f:
            for (name, seq) in primers_f.iteritems():
                f.write(">{n}\n{s}\n".format(n=name, s=seq))
            for (name, seq) in primers_r.iteritems():
                f.write(">{n}\n{s}\n".format(n=name, s=revcmp(seq)))
        return primers_f.keys() + primers_r.keys()
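
Before the next example, here is a minimal, self-contained sketch of the combined primer file this method writes, assuming a local `revcmp` stand-in and invented primer names and sequences (the real module imports its own `revcmp`): forward primers go in unchanged and reverse primers go in reverse-complemented.

# Illustration only: revcmp is a stand-in for the helper the real module
# imports, and the primer names/sequences below are invented.
def revcmp(seq):
    """Reverse-complement a DNA sequence."""
    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}
    return "".join(complement[b] for b in reversed(seq.upper()))

primers_f = {"f_K02": "AAGCAGTGGTATCAACGCAGAGTAC"}
primers_r = {"r_K02": "GTACTCTGCGTTGATACCACTGCTT"}

with open("combined_primers.fasta", 'w') as out:
    for name, seq in primers_f.items():
        out.write(">{n}\n{s}\n".format(n=name, s=seq))
    for name, seq in primers_r.items():
        # Reverse primers are written reverse-complemented, as in the
        # method above.
        out.write(">{n}\n{s}\n".format(n=name, s=revcmp(seq)))
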
Example #2
    def _chunkReads(self,
                    reads_fn,
                    reads_per_chunk,
                    chunked_reads_fns,
                    extract_front_back_only=True,
                    window_size=100):
        """Split reads within reads_fn into multiple chunks each containing
        at most 'reads_per_chunk' reads, save to files in 'chunked_reads_fns'.
        If extract_front_back_only is true, extract the first and the last
        'window_size' bases and save them as readname_front and readname_back.
        Otherwise, copy read names and sequences entirely.
        """
        logging.debug("Split {f} into ".format(f=reads_fn) +
                      "{n} chunks, ".format(n=len(chunked_reads_fns)) +
                      "each containing at most {n} reads.".format(
                          n=reads_per_chunk))
        if extract_front_back_only:
            logging.debug("Extract exactly {k} bases from front and end "
                          "of each read.".format(k=window_size))

        freader = FastaReader(reads_fn)
        chunkIndex = -1
        fwriter = None
        for i, read in enumerate(freader):
            if i % reads_per_chunk == 0:
                chunkIndex += 1
                if fwriter is not None:
                    fwriter.close()
                    fwriter = None
                fwriter = open(chunked_reads_fns[chunkIndex], 'w')
            rcseq = revcmp(read.sequence)
            if extract_front_back_only:
                fwriter.write(">{n}_front\n{s}\n>{n}_back\n{rcs}\n".format(
                    n=read.name,
                    s=read.sequence[:window_size],
                    rcs=rcseq[:window_size]))
            else:
                fwriter.write(">{n}\n{s}\n".format(n=read.name,
                                                   s=read.sequence))

        if fwriter is not None:
            fwriter.close()
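
As a standalone illustration of the chunking arithmetic above (in-memory (name, sequence) pairs instead of FastaReader records, and a local `revcmp` stand-in), read i lands in chunk i // reads_per_chunk, and in front/back mode only the first window_size bases of the read and of its reverse complement are kept.

# Illustration only: reads are plain (name, sequence) tuples rather than
# FastaReader records, and revcmp is a local stand-in.
def revcmp(seq):
    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}
    return "".join(complement[b] for b in reversed(seq.upper()))

def chunk_front_back(reads, reads_per_chunk, window_size):
    """Group reads into chunks; keep only the front/back windows of each."""
    chunks = {}
    for i, (name, seq) in enumerate(reads):
        chunk_index = i // reads_per_chunk  # same grouping as i % reads_per_chunk == 0 above
        records = chunks.setdefault(chunk_index, [])
        records.append((name + "_front", seq[:window_size]))
        records.append((name + "_back", revcmp(seq)[:window_size]))
    return chunks

reads = [("read/1", "ACGTACGTACGT"), ("read/2", "TTTTACGTCCCC"), ("read/3", "GGGGCATC")]
print(chunk_front_back(reads, reads_per_chunk=2, window_size=4))
# -> chunk 0 holds read/1 and read/2; chunk 1 holds read/3
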
Example #4
    def _trimBarCode(self, reads_fn, out_fl_reads_fn, out_nfl_reads_fn,
                     primer_report_nfl_fn,
                     best_of_front, best_of_back, primer_indices,
                     min_seq_len, min_score, change_read_id,
                     ignore_polyA):
        """Trim bar code from reads in 'reads_fn', annotate each read,
        indicating:
            whether its 5' primer, 3' primer and polyA tail are seen,
            start positions of its 5' primer, 3' primer and polyA tail,
            and primer info.
        Save non-full-length reads to 'out_nfl_reads_fn'.
        Save full-length reads to 'out_fl_reads_fn', which can later be
        used in chimera detection.
        Write primer info of nfl reads to 'primer_report_nfl_fn'.

        Note that chimera detection is not necessary for nfl reads, but
        is required for fl reads. So we only write primer info for nfl here
        and will write primer info for fl reads when chimera detection
        is done.

        best_of_front/best_of_back: {read_id: {primer_name: DOMRecord}}
        min_seq_len: minimum length to output a read.
        min_score: minimum score to output a read.
        change_read_id: if True, change read ids to 'movie/zmw/start_end'.
        """
        logging.info("Trim bar code away from reads.")
        logging.debug("Writing full-length trimmed reads to {f}".
                      format(f=out_fl_reads_fn))
        logging.debug("Writing non-full-length trimmed reads to {f}".
                      format(f=out_nfl_reads_fn))
        logging.debug("Writing primer reports before chimera detection to {f}".
                      format(f=primer_report_nfl_fn))

        with FastaReader(reads_fn) as fareader, \
                FastaWriter(out_nfl_reads_fn) as nfl_fawriter, \
                FastaWriter(out_fl_reads_fn) as fl_fawriter, \
                open(primer_report_nfl_fn, 'w') as reporter:
            for read in fareader:
                self.summary.num_reads += 1  # number of ROI reads
                pbread = PBRead(read)
                logging.debug("Pick up best primer combo for {r}".
                              format(r=read.name))
                primerIndex, strand, fw, rc = self._pickBestPrimerCombo(
                    best_of_front[read.name], best_of_back[read.name],
                    primer_indices, min_score)
                logging.debug("read={0}\n".format(read.name) +
                              "primer={0} strand={1} fw={2} rc={3}".
                              format(primerIndex, strand, fw, rc))

                if fw is None and rc is None:
                    # No primer seen in this sequence, classified
                    # as non-full-length
                    newName = pbread.name
                    if change_read_id:
                        newName = "{m}/{z}/{s1}_{e1}{isccs}".format(
                                  m=pbread.movie, z=pbread.zmw,
                                  s1=pbread.start, e1=pbread.end,
                                  isccs=("_CCS" if pbread.isCCS else ""))
                    annotation = ReadAnnotation(ID=newName)
                    # Write reports of nfl reads
                    reporter.write(annotation.toReportRecord(delimitor=",") + "\n")
                    if len(read.sequence) >= min_seq_len:
                        # output non-full-length reads to nfl.trimmed.fasta
                        nfl_fawriter.writeRecord(annotation.toAnnotation(),
                                                 read.sequence)
                        self.summary.num_nfl += 1
                    else:
                        self.summary.num_filtered_short_reads += 1
                    continue
                seq = read.sequence if strand == "+" else revcmp(read.sequence)
                five_end, three_start = None, None
                if fw is not None:
                    five_end = fw.sEnd
                    self.summary.num_5_seen += 1
                if rc is not None:
                    three_start = len(seq) - rc.sEnd
                    self.summary.num_3_seen += 1

                s, e = pbread.start, pbread.end
                # Try to find polyA tail in read
                polyAPos = self._findPolyA(seq, three_start=three_start)
                if polyAPos >= 0:  # polyA found
                    seq = seq[:polyAPos]
                    e1 = s + polyAPos if strand == "+" else e - polyAPos
                    self.summary.num_polyA_seen += 1
                elif three_start is not None:  # polyA not found, but 3' primer found
                    seq = seq[:three_start]
                    e1 = s + three_start if strand == "+" else e - three_start
                else:
                    e1 = e if strand == "+" else s

                if five_end is not None:
                    seq = seq[five_end:]
                    s1 = s + five_end if strand == "+" else e - five_end
                else:
                    s1 = s if strand == "+" else e

                newName = pbread.name
                if change_read_id:
                    newName = "{m}/{z}/{s1}_{e1}{isccs}".format(
                        m=pbread.movie, z=pbread.zmw, s1=s1, e1=e1,
                        isccs=("_CCS" if pbread.isCCS else ""))
                # Create an annotation
                annotation = ReadAnnotation(ID=newName, strand=strand,
                                            fiveend=five_end, polyAend=polyAPos,
                                            threeend=three_start, primer=primerIndex,
                                            ignore_polyA=ignore_polyA)

                # Write reports for nfl reads
                if annotation.isFullLength is not True:
                    reporter.write(annotation.toReportRecord(delimitor=",") + "\n")

                if len(seq) >= min_seq_len:
                    if annotation.isFullLength is True:
                        # Write long full-length reads
                        fl_fawriter.writeRecord(annotation.toAnnotation(), seq)
                        self.summary.num_fl += 1
                    else:
                        # Write long non-full-length reads.
                        nfl_fawriter.writeRecord(annotation.toAnnotation(), seq)
                        self.summary.num_nfl += 1
                else:
                    self.summary.num_filtered_short_reads += 1
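
The s1/e1 bookkeeping in the loop above is easy to lose track of; the following hypothetical helper (the function name and example coordinates are invented) reproduces just that arithmetic: cut positions measured in the strand-oriented sequence are mapped back to original read coordinates, counting forward from s on the plus strand and backward from e on the minus strand.

# Hypothetical helper illustrating the s1/e1 arithmetic used above.
def trimmed_coordinates(s, e, strand, five_end=None, cut_pos=None):
    """Map cut positions in the strand-oriented sequence back to the
    original read coordinates (s, e), mirroring _trimBarCode.
    five_end: end of the 5' primer match, or None if no 5' primer was seen.
    cut_pos:  polyA start or 3' primer start, or None if neither was seen.
    """
    if cut_pos is not None:
        e1 = s + cut_pos if strand == "+" else e - cut_pos
    else:
        e1 = e if strand == "+" else s
    if five_end is not None:
        s1 = s + five_end if strand == "+" else e - five_end
    else:
        s1 = s if strand == "+" else e
    return s1, e1

# A read spanning [100, 200) with a 20-bp 5' primer and a cut at position 80:
print(trimmed_coordinates(100, 200, "+", five_end=20, cut_pos=80))   # (120, 180)
print(trimmed_coordinates(100, 200, "-", five_end=20, cut_pos=80))   # (180, 120)
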
Example #5
    def _trimBarCode(self, reads_fn, out_fl_reads_fn, out_nfl_reads_fn,
                     primer_report_nfl_fn,
                     best_of_front, best_of_back, primer_names,
                     min_seq_len, min_score, change_read_id,
                     ignore_polyA, keep_primer):
        """Trim bar code from reads in 'reads_fn', annotate each read,
        indicating:
            whether its 5' primer, 3' primer and polyA tail are seen,
            start positions of its 5' primer, 3' primer and polyA tail,
            and primer info.
        Save non-full-length reads to 'out_nfl_reads_fn'.
        Save full-length reads to 'out_fl_reads_fn', which can later be
        used in chimera detection.
        Write primer info of nfl reads to 'primer_report_nfl_fn'.

        Note that chimera detection is not necessary for nfl reads, but
        is required for fl reads. So we only write primer info for nfl here
        and will write primer info for fl reads when chimera detection
        is done.

        best_of_front/best_of_back: {read_id: {primer_name: DOMRecord}}
        min_seq_len: minimum length to output a read.
        min_score: minimum score to output a read.
        change_read_id: if True, change read ids to 'movie/zmw/start_end'.
        keep_primer: if True, do not trim primers or polyA from output sequences.
        """
        logging.info("Trim bar code away from reads.")
        logging.debug("Writing full-length trimmed reads to {f}".
                      format(f=out_fl_reads_fn))
        logging.debug("Writing non-full-length trimmed reads to {f}".
                      format(f=out_nfl_reads_fn))
        logging.debug("Writing primer reports before chimera detection to {f}".
                      format(f=primer_report_nfl_fn))

        with FastaReader(reads_fn) as fareader, \
                FastaWriter(out_nfl_reads_fn) as nfl_fawriter, \
                FastaWriter(out_fl_reads_fn) as fl_fawriter, \
                open(primer_report_nfl_fn, 'w') as reporter:
            for read in fareader:
                self.summary.num_reads += 1  # number of ROI reads
                pbread = PBRead(read)
                logging.debug("Pick up best primer combo for {r}".
                              format(r=read.name))
                primerName, strand, fw, rc = self._pickBestPrimerCombo(
                    best_of_front[read.name], best_of_back[read.name],
                    primer_names, min_score)
                logging.debug("read={0}\n".format(read.name) +
                              "strand={0} fw={1} rc={2}".
                              format(strand, fw, rc))

                if (strand == '?') or (fw is None and rc is None):
                    # No primer seen in this sequence, classified
                    # as non-full-length
                    newName = pbread.name
                    if change_read_id:
                        newName = "{m}/{z}/{s1}_{e1}{isccs}".format(
                                  m=pbread.movie, z=pbread.zmw,
                                  s1=pbread.start, e1=pbread.end,
                                  isccs=("_CCS" if pbread.isCCS else ""))
                    annotation = ReadAnnotation(ID=newName, primer=primerName)
                    # Write reports of nfl reads
                    reporter.write(annotation.toReportRecord(delimitor=",") + "\n")
                    if len(read.sequence) >= min_seq_len:
                        # output non-full-length reads to nfl.trimmed.fasta
                        nfl_fawriter.writeRecord(annotation.toAnnotation(),
                                                 read.sequence)
                        self.summary.num_nfl += 1
                    else:
                        self.summary.num_filtered_short_reads += 1
                    continue
                seq = read.sequence if strand == "+" else revcmp(read.sequence)
                five_end, three_start = None, None
                if fw is not None:
                    five_end = fw.sEnd
                    self.summary.num_5_seen += 1
                if rc is not None:
                    three_start = len(seq) - rc.sEnd
                    self.summary.num_3_seen += 1

                s, e = pbread.start, pbread.end
                # Try to find polyA tail in read
                polyAPos = self._findPolyA(seq, three_start=three_start)
                if polyAPos >= 0 and not ignore_polyA:  # polyA found and not ignored
                    if not keep_primer:
                        seq = seq[:polyAPos]
                        e1 = s + polyAPos if strand == "+" else e - polyAPos
                    else:
                        e1 = e if strand == '+' else s
                    self.summary.num_polyA_seen += 1
                elif three_start is not None:  # polyA not found but 3' found
                    if not keep_primer:
                        seq = seq[:three_start]
                        e1 = s + three_start if strand == "+" else e - three_start
                    else:
                        e1 = e if strand == '+' else s
                else:  # neither polyA nor 3' primer found
                    e1 = e if strand == "+" else s

                if five_end is not None:
                    if not keep_primer:
                        seq = seq[five_end:]
                        s1 = s + five_end if strand == "+" else e - five_end
                    else:
                        s1 = s if strand == '+' else e
                else:
                    s1 = s if strand == "+" else e

                newName = pbread.name
                if change_read_id:
                    newName = "{m}/{z}/{s1}_{e1}{isccs}".format(
                        m=pbread.movie, z=pbread.zmw, s1=s1, e1=e1,
                        isccs=("_CCS" if pbread.isCCS else ""))
                # Create an annotation
                annotation = ReadAnnotation(ID=newName, strand=strand,
                                            fiveend=five_end, polyAend=polyAPos,
                                            threeend=three_start, primer=primerName,
                                            ignore_polyA=ignore_polyA)

                # Write reports for nfl reads
                if annotation.isFullLength is not True:
                    reporter.write(annotation.toReportRecord(delimitor=",") + "\n")

                if len(seq) >= min_seq_len:
                    if annotation.isFullLength is True:
                        # Write long full-length reads
                        fl_fawriter.writeRecord(annotation.toAnnotation(), seq)
                        self.summary.num_fl += 1
                    else:
                        # Write long non-full-length reads.
                        nfl_fawriter.writeRecord(annotation.toAnnotation(), seq)
                        self.summary.num_nfl += 1
                else:
                    self.summary.num_filtered_short_reads += 1
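
`_findPolyA` itself is not shown in these examples. Purely as a hypothetical sketch of such a search (not the actual implementation; the window size and A-fraction threshold below are invented), the function looks for an A-rich window just upstream of the 3' primer hit, or of the read end, and returns its start position or -1.

# Hypothetical stand-in for _findPolyA; the real method's heuristics are not
# shown in these examples, so the thresholds here are illustrative only.
def find_polyA(seq, three_start=None, window=20, min_a_fraction=0.8):
    """Return the start of an A-rich window just upstream of three_start
    (or of the read end), or -1 if no such window exists."""
    end = three_start if three_start is not None else len(seq)
    start = max(0, end - window)
    tail = seq[start:end].upper()
    if len(tail) >= window and tail.count('A') >= min_a_fraction * len(tail):
        return start
    return -1

print(find_polyA("ACGT" * 10 + "A" * 20 + "TTTCCC", three_start=60))  # 40
print(find_polyA("ACGTACGTACGTACGTACGT", three_start=None))           # -1
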
Example #7

    @property
    def numReads(self):
        """Return the number of reads in reads_fn."""
        cmd = "grep -c '>' {r}".format(r=real_upath(self.reads_fn))
        output, errCode, errMsg = backticks(cmd)
        if errCode != 0:
            raise ClassifierException("Error reading file {r}:{e}".format(
                r=self.reads_fn, e=str(errMsg)))
        return int(output[0])
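
Where shelling out to grep is undesirable, a plain line scan gives the same count. This is only an alternative sketch, not the method used above (note that `grep -c '>'` counts lines containing '>', while this counts lines that start with it).

# Sketch of an equivalent count without shelling out to grep.
def count_fasta_reads(fasta_fn):
    """Count FASTA records by counting header lines that start with '>'."""
    with open(fasta_fn) as handle:
        return sum(1 for line in handle if line.startswith('>'))
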

Example #8
    def _processPrimers(self, primer_fn, window_size, primer_out_fn,
                        revcmp_primers=False):
        """
        Check and generate primers.
        1. Check primers in primer_fn are in order F0, R0, F1, R1, ...
        Fn, Rn, and lengths are all < k, where k is the primer search
        window length.
           F0  5' NNNNNNNNNN 3'
           R0  3' NNNNNNNNNN 5'
        2. If Ri and Fi are reverse-complementarily identical,
        add a polyA tail to the 3' end of Ri.
        3. For each combo of primers Fi and Ri, save the following to
        primer_out_fn.
           3.1 If revcmp_primers is False,
              >Fi
              Fi_sequence
              >Ri
              revcmp(Ri_sequence)
           3.2 If revcmp_primers is True,
              >Fi
              Fi_sequence
              >Ri
              Ri_sequence
              >Fi_revcmp
              revcmp(Fi_sequence)
              >Ri_revcmp
              revcmp(Ri_sequence)
        4. Return the primer combo indices [0, 1, ..., n].
        """
        logging.info("Process primers for {case}.".
                     format(case=("finding primers" if not revcmp_primers
                                  else "detecting chimeras")))
        freader = FastaReader(primer_fn)
        primers = []
        primerComboId = -1
        for i, r in enumerate(freader):
            if i % 2 == 0:
                direction = "F"
                primerComboId += 1
            else:
                direction = "R"
            expectedName = "{d}{n}".format(d=direction, n=primerComboId)

            if r.name != expectedName:
                errMsg = "Primers should be placed in order F0, R0, F1, R1..."
                logging.error(errMsg)
                raise ClassifierException(errMsg)

            if len(r.sequence) > window_size:
                errMsg = "Primer {n} has length {l} which is longer than {k}.".\
                    format(n=expectedName, l=len(r.sequence), k=window_size)
                logging.error(errMsg)
                raise ClassifierException(errMsg)

            if direction == "F":
                # Save >Fi and Fi_sequence.
                primers.append([expectedName, r.sequence])
            else:  # direction is "R"
                # fwdF/fwdR is the forward sequence of Fi/Ri
                fwdF, fwdR = primers[-1][1], r.sequence
                # revcmpF/revcmpR is the reverse complement of Fi/Ri
                revcmpF, revcmpR = revcmp(fwdF), revcmp(fwdR)
                # If Fi and Ri are reverse-complementarily identical, pad with
                # a poly(A)/poly(T) tail so that Fi and Ri can be distinguished.
                if fwdF.find(revcmpR) >= 0 or revcmpR.find(fwdF) >= 0:
                    infoMsg = "Primer F{n}, R{n} ".format(n=primerComboId) + \
                        "are reverse complementarily identical. " + \
                        "Need to add 'AAAA' to 3' to distinguish them."
                    logging.info(infoMsg)
                    if revcmp_primers is False:
                        # Save primer Ri and revcmp(Ri_sequence) + TTTT
                        primers.append([expectedName, revcmpR + "T" * 4])
                    else:  # revcmp_primers is True
                        primers.append([expectedName, "A" * 4 + fwdR])
                        primers.append(['F{n}_revcmp'.format(n=primerComboId),
                                        revcmpF])
                        primers.append(['R{n}_revcmp'.format(n=primerComboId),
                                        revcmpR + "T" * 4])
                else:  # Ri and Fi are not revcmp identical
                    if revcmp_primers is False:
                        # Save >Ri and revcmp(Ri_sequence)
                        primers.append([expectedName, revcmpR])
                    else:
                        # Save >Ri and Ri_sequence
                        primers.append([expectedName, fwdR])
                        # Save >Fi_revcmp and revcmp(Fi_sequence)
                        primers.append(['F{n}_revcmp'.format(n=primerComboId),
                                        revcmpF])
                        # Save >Ri_revcmp and revcmp(Ri_sequence)
                        primers.append(['R{n}_revcmp'.format(n=primerComboId),
                                        revcmpR])
        freader.close()

        # Write Fi and reverse-complemented Ri to primer_out_fn
        with open(primer_out_fn, 'w') as f:
            for (name, seq) in primers:
                f.write(">{n}\n{s}\n".format(n=name, s=seq))
        return range(0, primerComboId + 1)
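
To make the reverse-complement-identical case concrete, here is a standalone sketch with a local `revcmp` stand-in and an invented forward primer whose reverse primer is deliberately its reverse complement; the same substring test as above fires, and the reverse entry gets a poly(T) pad so the two records stay distinguishable.

# Illustration with made-up primer sequences; revcmp is a local stand-in.
def revcmp(seq):
    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}
    return "".join(complement[b] for b in reversed(seq.upper()))

fwdF = "AAGCAGTGGTATCAACGCAGAGTAC"
fwdR = revcmp(fwdF)                 # deliberately the revcmp of F
revcmpR = revcmp(fwdR)

if fwdF.find(revcmpR) >= 0 or revcmpR.find(fwdF) >= 0:
    # Same test as above: F and revcmp(R) overlap completely, so pad R
    # with a poly(T) tail to keep the two entries distinguishable.
    entry_R = revcmpR + "T" * 4
else:
    entry_R = revcmpR

print(entry_R.endswith("TTTT"))  # True for this constructed pair
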