def _processPrimers(self, primer_fn_forward, primer_fn_reverse, window_size, primer_out_fn,
                        revcmp_primers=False):
        """
        Validate forward/reverse primer FASTA files and merge them into
        a single primer file.

        Sanity checks performed:
        (1) every primer name in the forward file starts with "f_" and
            every primer name in the reverse file starts with "r_"
        (2) no primer is longer than window_size
        (3) within each file all sequences are unique (case-insensitive)
        (4) no sequence appears in both the forward and the reverse file
            (case-insensitive)

        On success, each forward primer is written to primer_out_fn
        unchanged and each reverse primer is written as revcmp(sequence).

        Parameters:
          primer_fn_forward - FASTA file of forward primers
          primer_fn_reverse - FASTA file of reverse primers
          window_size - maximum allowed primer length
          primer_out_fn - output FASTA file to write
          revcmp_primers - only selects the wording of the log message
                           in this method

        Returns:
          list of primer names, forward names followed by reverse names.

        Raises:
          ClassifierException if any sanity check fails.
        """
        def sanity_check_primers(reader, prefix):
            """
            Go through the primers, check that every name starts with
            prefix, no sequence exceeds window_size and all sequences
            are unique. Return a dict of primer name -> sequence.
            """
            primers = {}  # primer name -> sequence (original case)
            seen = set()  # upper-cased sequences already encountered
            for r in reader:
                if not r.name.startswith(prefix):
                    errMsg = "Primer name should start with {p}, but saw: {n}".\
                        format(p=prefix, n=r.name)
                    raise ClassifierException(errMsg)
                if len(r.sequence) > window_size:
                    errMsg = "Primer {n} has length {l} which is longer than {k}.".\
                     format(n=r.name, l=len(r.sequence), k=window_size)
                    logging.error(errMsg)
                    raise ClassifierException(errMsg)
                # Compare upper-cased so that 'acgt' and 'ACGT' count as
                # the same primer sequence.
                ss = r.sequence.upper()
                if ss in seen:
                    errMsg = "Duplicate sequences found for {s}".format(s=ss)
                    raise ClassifierException(errMsg)
                seen.add(ss)
                # NOTE: per the original author, a "_revcmp" entry is not
                # stored because phmmer apparently searches both strands.
                primers[r.name.strip()] = r.sequence
            return primers


        logging.info("Process primers for {case}.".
                     format(case=("finding primers" if not revcmp_primers
                                  else "detecting chimeras")))

        # Open both readers up front (as before), but guarantee they are
        # closed even when a sanity check raises.
        reader_f = FastaReader(primer_fn_forward)
        try:
            reader_r = FastaReader(primer_fn_reverse)
            try:
                primers_f = sanity_check_primers(reader_f, prefix="f_")
                primers_r = sanity_check_primers(reader_r, prefix="r_")
            finally:
                reader_r.close()
        finally:
            reader_f.close()

        # A sequence must not be used as both a forward and a reverse primer.
        upper_f = set(seq.upper() for seq in primers_f.values())
        same_seqs = upper_f.intersection(seq.upper() for seq in primers_r.values())
        if same_seqs:
            errMsg = "Identical sequences found in both Forward/Reverse!\n"
            errMsg += "\n".join(sorted(same_seqs))
            raise ClassifierException(errMsg)

        # Write Fi and reverse-complemented Ri to primer_out_fn
        with open(primer_out_fn, 'w') as f:
            for (name, seq) in primers_f.items():
                f.write(">{n}\n{s}\n".format(n=name, s=seq))
            for (name, seq) in primers_r.items():
                f.write(">{n}\n{s}\n".format(n=name, s=revcmp(seq)))
        # list(...) keeps the return value a plain list on Python 2 and 3.
        return list(primers_f) + list(primers_r)
Beispiel #2
0
                        primers.append(['R{n}_revcmp'.format(n=primerComboId),
                                        revcmpR + "T" * 4])
                else:  # Ri and Fi are not revcmp identical
                    if revcmp_primers is False:
                        # Save >Ri and revcmp(Ri_sequence)
                        primers.append([expectedName, revcmpR])
                    else:
                        # Save >Ri and Ri_sequence
                        primers.append([expectedName, fwdR])
                        # Save >Fi_revcmp and revcmp(Fi_sequence)
                        primers.append(['F{n}_revcmp'.format(n=primerComboId),
                                        revcmpF])
                        # Save >Ri_revcmp and revcmp(Ri_sequence)
                        primers.append(['R{n}_revcmp'.format(n=primerComboId),
                                        revcmpR])
        freader.close()

        # Write Fi and reverse-complemented Ri to primer_out_fn
        f = open(primer_out_fn, 'w')
        for (name, seq) in primers:
            f.write(">{n}\n{s}\n".format(n=name, s=seq))
        f.close()
        return range(0, primerComboId + 1)

    @property
    def numReads(self):
        """Return the number of reads in reads_fn."""
        cmd = "grep -c '>' {r}".format(r=real_upath(self.reads_fn))
        output, errCode, errMsg = backticks(cmd)
        if errCode != 0:
            raise ClassifierException(
Beispiel #3
0
                    raise ClassifierException(errMsg)
                primers[r.name.strip()] = r.sequence
                # revcmp not needed becuz phmmer does both strands apparently...
                #primers[r.name.strip() + "_revcmp"] = revcmp(r.sequence)
            return primers

        logging.info("Process primers for {case}.".format(case=(
            "finding primers" if not revcmp_primers else "detecting chimeras"
        )))
        reader_f = FastaReader(primer_fn_forward)
        reader_r = FastaReader(primer_fn_reverse)

        primers_f = sanity_check_primers(reader_f, prefix="f_")
        primers_r = sanity_check_primers(reader_r, prefix="r_")

        reader_f.close()
        reader_r.close()

        same_seqs = set(primers_f.values()).intersection(primers_r.values())
        if len(same_seqs) > 0:
            errMsg = "Identical sequences found in both Forward/Reverse!\n"
            errMsg += "\n".join(same_seqs)
            raise ClassifierException(errMsg)

        # Write Fi and reverse-complemented Ri to primer_out_fn
        with open(primer_out_fn, 'w') as f:
            for (name, seq) in primers_f.iteritems():
                f.write(">{n}\n{s}\n".format(n=name, s=seq))
            for (name, seq) in primers_r.iteritems():
                f.write(">{n}\n{s}\n".format(n=name, s=revcmp(seq)))
        return primers_f.keys() + primers_r.keys()
Beispiel #4
0
    def _processPrimers(self, primer_fn, window_size, primer_out_fn,
                        revcmp_primers=False):
        """
        Check and generate primers.
        1. Check primers in primer_fn are in order F0, R0, F1, R1, ...
        Fn, Rn, and lengths are all <= k, where k (window_size) is the
        primer search window length.
           F0  5' NNNNNNNNNN 3'
           R0  3' NNNNNNNNNN 5'
        2. If Fi contains revcmp(Ri) or revcmp(Ri) contains Fi, the pair
        is treated as reverse complementarily identical and a poly-A/T
        tag is added to the Ri entries so they can be distinguished:
           revcmp_primers is False: >Ri is revcmp(Ri_sequence) + 'TTTT'
           revcmp_primers is True : >Ri is 'AAAA' + Ri_sequence and
                                    >Ri_revcmp is revcmp(Ri_sequence) + 'TTTT'
        3. For each combo of primers Fi and Ri, save the following to
        primer_out_fn.
           3.1 If revcmp_primers is False,
              >Fi
              Fi_sequence
              >Ri
              revcmp(Ri_sequence)
           3.2 Otherwise,
              >Fi
              Fi_sequence
              >Ri
              Ri_sequence
              >Fi_revcmp
              revcmp(Fi_sequence)
              >Ri_revcmp
              revcmp(Ri_sequence)
        4. return primer combo ids range(0, n), n being the number of
        F primers read.

        Raises ClassifierException if a primer is out of order/misnamed
        or longer than window_size.
        """
        logging.info("Process primers for {case}.".
                     format(case=("finding primers" if not revcmp_primers
                                  else "detecting chimeras")))
        primers = []  # list of [name, sequence] in output order
        primerComboId = -1
        freader = FastaReader(primer_fn)
        # try/finally so the reader is released even when validation fails.
        try:
            for i, r in enumerate(freader):
                # Records alternate F, R; a new combo starts on each F.
                if i % 2 == 0:
                    direction = "F"
                    primerComboId += 1
                else:
                    direction = "R"
                expectedName = "{d}{n}".format(d=direction, n=primerComboId)

                if r.name != expectedName:
                    errMsg = "Primers should be placed in order F0, R0, F1, R1..."
                    logging.error(errMsg)
                    raise ClassifierException(errMsg)

                if len(r.sequence) > window_size:
                    errMsg = "Primer {n} has length {l} which is longer than {k}.".\
                        format(n=expectedName, l=len(r.sequence), k=window_size)
                    logging.error(errMsg)
                    raise ClassifierException(errMsg)

                if direction == "F":
                    # Save >Fi and Fi_sequence.
                    primers.append([expectedName, r.sequence])
                    continue

                # direction is "R"; the matching Fi is the last entry saved.
                # fwdF/fwdR is the forward sequence of Fi/Ri
                fwdF, fwdR = primers[-1][1], r.sequence
                # revcmpF/revcmpR is the reverse complement of Fi/Ri
                revcmpF, revcmpR = revcmp(fwdF), revcmp(fwdR)
                nameFrc = 'F{n}_revcmp'.format(n=primerComboId)
                nameRrc = 'R{n}_revcmp'.format(n=primerComboId)

                # If Fi and Ri are reverse complementarily identical, a
                # poly-A/T tag is needed to distinguish Fi and Ri.
                if revcmpR in fwdF or fwdF in revcmpR:
                    infoMsg = "Primer F{n}, R{n} ".format(n=primerComboId) + \
                        "are reverse complementarily identical. " + \
                        "Need to add 'AAAA' to 3' to distinguish them."
                    logging.info(infoMsg)
                    if revcmp_primers is False:
                        # Save primer Ri and revcmp(Ri_sequence) + TTTT
                        primers.append([expectedName, revcmpR + "T" * 4])
                    else:
                        # Save >Ri as AAAA + Ri_sequence, then the
                        # reverse complements of Fi and (tagged) Ri.
                        primers.append([expectedName, "A" * 4 + fwdR])
                        primers.append([nameFrc, revcmpF])
                        primers.append([nameRrc, revcmpR + "T" * 4])
                else:  # Ri and Fi are not revcmp identical
                    if revcmp_primers is False:
                        # Save >Ri and revcmp(Ri_sequence)
                        primers.append([expectedName, revcmpR])
                    else:
                        # Save >Ri and Ri_sequence
                        primers.append([expectedName, fwdR])
                        # Save >Fi_revcmp and revcmp(Fi_sequence)
                        primers.append([nameFrc, revcmpF])
                        # Save >Ri_revcmp and revcmp(Ri_sequence)
                        primers.append([nameRrc, revcmpR])
        finally:
            freader.close()

        # Write all collected primer records to primer_out_fn; the
        # context manager closes the file even if a write fails.
        with open(primer_out_fn, 'w') as f:
            for (name, seq) in primers:
                f.write(">{n}\n{s}\n".format(n=name, s=seq))
        return range(0, primerComboId + 1)