def _processPrimers(self, primer_fn_forward, primer_fn_reverse, window_size,
                    primer_out_fn, revcmp_primers=False):
    """
    Validate forward/reverse primers and write them to a single FASTA file.

    Sanity checks performed:
    (1) all primers in primer_fn_forward are named f_xxx and no sequence
        occurs twice in that file (compared case-insensitively)
    (2) all primers in primer_fn_reverse are named r_xxx, with the same
        uniqueness rule
    (3) no primer is longer than window_size
    (4) no sequence appears in both the forward and the reverse file
        (compared exactly as read)

    When every check passes, primer_out_fn is written with each forward
    primer as read and each reverse primer reverse-complemented. No
    *_revcmp records are written.

    Parameters:
        primer_fn_forward -- FASTA file holding the forward primers
        primer_fn_reverse -- FASTA file holding the reverse primers
        window_size -- maximum allowed primer length
        primer_out_fn -- path of the combined primer file to write
        revcmp_primers -- in this method it only selects the wording of
                          the log message

    Returns:
        A list with the forward primer names followed by the reverse
        primer names.

    Raises:
        ClassifierException -- if any of the checks above fails.
    """
    def sanity_check_primers(reader, prefix):
        """
        Go through the primers, check that every name starts with prefix,
        that no primer is longer than window_size and that all sequences
        are unique. Return a dict of stripped name -> sequence as read.
        """
        primers = {}
        # Upper-cased sequences accepted so far. Kept separately so that
        # the duplicate test really is case-insensitive while the
        # sequences are still stored (and later written) as read.
        seen = set()
        for r in reader:
            if not r.name.startswith(prefix):
                errMsg = "Primer name should start with {p}, but saw: {n}".\
                    format(p=prefix, n=r.name)
                logging.error(errMsg)
                raise ClassifierException(errMsg)
            if len(r.sequence) > window_size:
                errMsg = "Primer {n} has length {l} which is longer than {k}.".\
                    format(n=r.name, l=len(r.sequence), k=window_size)
                logging.error(errMsg)
                raise ClassifierException(errMsg)
            ss = r.sequence.upper()
            if ss in seen:
                errMsg = "Duplicate sequences found for {s}".format(s=ss)
                logging.error(errMsg)
                raise ClassifierException(errMsg)
            seen.add(ss)
            # NOTE(review): a repeated primer name silently replaces the
            # earlier entry; confirm whether that should be an error.
            # *_revcmp entries are deliberately not stored: according to
            # the original author's note, phmmer apparently searches both
            # strands.
            primers[r.name.strip()] = r.sequence
        return primers

    logging.info("Process primers for {case}.".format(
        case=("finding primers" if not revcmp_primers
              else "detecting chimeras")))

    # Both readers are opened before any check runs (as before), but are
    # now closed even when a check raises.
    reader_f = FastaReader(primer_fn_forward)
    try:
        reader_r = FastaReader(primer_fn_reverse)
        try:
            primers_f = sanity_check_primers(reader_f, prefix="f_")
            primers_r = sanity_check_primers(reader_r, prefix="r_")
        finally:
            reader_r.close()
    finally:
        reader_f.close()

    same_seqs = set(primers_f.values()).intersection(primers_r.values())
    if same_seqs:
        # Sorted so the message does not depend on set iteration order.
        errMsg = "Identical sequences found in both Forward/Reverse!\n"
        errMsg += "\n".join(sorted(same_seqs))
        logging.error(errMsg)
        raise ClassifierException(errMsg)

    # Write Fi and reverse-complemented Ri to primer_out_fn
    with open(primer_out_fn, 'w') as f:
        for name, seq in primers_f.items():
            f.write(">{n}\n{s}\n".format(n=name, s=seq))
        for name, seq in primers_r.items():
            f.write(">{n}\n{s}\n".format(n=name, s=revcmp(seq)))

    return list(primers_f) + list(primers_r)
primers.append(['R{n}_revcmp'.format(n=primerComboId), revcmpR + "T" * 4]) else: # Ri and Fi are not revcmp identical if revcmp_primers is False: # Save >Ri and revcmp(Ri_sequence) primers.append([expectedName, revcmpR]) else: # Save >Ri and Ri_sequence primers.append([expectedName, fwdR]) # Save >Fi_revcmp and revcmp(Fi_sequence) primers.append(['F{n}_revcmp'.format(n=primerComboId), revcmpF]) # Save >Ri_revcmp and revcmp(Ri_sequence) primers.append(['R{n}_revcmp'.format(n=primerComboId), revcmpR]) freader.close() # Write Fi and reverse-complemented Ri to primer_out_fn f = open(primer_out_fn, 'w') for (name, seq) in primers: f.write(">{n}\n{s}\n".format(n=name, s=seq)) f.close() return range(0, primerComboId + 1) @property def numReads(self): """Return the number of reads in reads_fn.""" cmd = "grep -c '>' {r}".format(r=real_upath(self.reads_fn)) output, errCode, errMsg = backticks(cmd) if errCode != 0: raise ClassifierException(
raise ClassifierException(errMsg) primers[r.name.strip()] = r.sequence # revcmp not needed becuz phmmer does both strands apparently... #primers[r.name.strip() + "_revcmp"] = revcmp(r.sequence) return primers logging.info("Process primers for {case}.".format(case=( "finding primers" if not revcmp_primers else "detecting chimeras" ))) reader_f = FastaReader(primer_fn_forward) reader_r = FastaReader(primer_fn_reverse) primers_f = sanity_check_primers(reader_f, prefix="f_") primers_r = sanity_check_primers(reader_r, prefix="r_") reader_f.close() reader_r.close() same_seqs = set(primers_f.values()).intersection(primers_r.values()) if len(same_seqs) > 0: errMsg = "Identical sequences found in both Forward/Reverse!\n" errMsg += "\n".join(same_seqs) raise ClassifierException(errMsg) # Write Fi and reverse-complemented Ri to primer_out_fn with open(primer_out_fn, 'w') as f: for (name, seq) in primers_f.iteritems(): f.write(">{n}\n{s}\n".format(n=name, s=seq)) for (name, seq) in primers_r.iteritems(): f.write(">{n}\n{s}\n".format(n=name, s=revcmp(seq))) return primers_f.keys() + primers_r.keys()
def _processPrimers(self, primer_fn, window_size, primer_out_fn,
                    revcmp_primers=False):
    """
    Check and generate primers.

    1. Check primers in primer_fn are in order F0, R0, F1, R1, ... Fn, Rn,
       and that no primer is longer than k, where k (window_size) is the
       primer search window length.
           F0  5' NNNNNNNNNN 3'
           R0  3' NNNNNNNNNN 5'
    2. If Fi and Ri are reverse-complementarily identical (Fi contains
       revcmp(Ri) or revcmp(Ri) contains Fi), pad Ri with four bases so
       the two can be told apart: 'TTTT' is appended to
       revcmp(Ri_sequence) and, when revcmp_primers is true, 'AAAA' is
       prepended to Ri_sequence.
    3. For each combo of primers Fi and Ri, save the following to
       primer_out_fn.
       3.1 If revcmp_primers is false,
           >Fi
           Fi_sequence
           >Ri
           revcmp(Ri_sequence)
       3.2 If revcmp_primers is true,
           >Fi
           Fi_sequence
           >Ri
           Ri_sequence
           >Fi_revcmp
           revcmp(Fi_sequence)
           >Ri_revcmp
           revcmp(Ri_sequence)
    4. Return the primer combo ids, range(0, n + 1).

    Raises:
        ClassifierException -- if a primer name is out of order or a
                               primer is longer than window_size.
    """
    logging.info("Process primers for {case}.".format(
        case=("finding primers" if not revcmp_primers
              else "detecting chimeras")))

    primers = []
    primerComboId = -1

    freader = FastaReader(primer_fn)
    try:
        for i, r in enumerate(freader):
            # Even records are forward primers and open a new combo,
            # odd records are the matching reverse primers.
            if i % 2 == 0:
                direction = "F"
                primerComboId += 1
            else:
                direction = "R"
            expectedName = "{d}{n}".format(d=direction, n=primerComboId)

            if r.name != expectedName:
                errMsg = "Primers should be placed in order F0, R0, F1, R1..."
                logging.error(errMsg)
                raise ClassifierException(errMsg)

            if len(r.sequence) > window_size:
                errMsg = "Primer {n} has length {l} which is longer than {k}.".\
                    format(n=expectedName, l=len(r.sequence), k=window_size)
                logging.error(errMsg)
                raise ClassifierException(errMsg)

            if direction == "F":
                # Save >Fi and Fi_sequence.
                primers.append([expectedName, r.sequence])
                continue

            # direction is "R"; the last saved entry is the matching Fi.
            # fwdF/fwdR is the forward sequence of Fi/Ri
            fwdF, fwdR = primers[-1][1], r.sequence
            # revcmpF/revcmpR is the reverse complement of Fi/Ri
            revcmpF, revcmpR = revcmp(fwdF), revcmp(fwdR)

            if revcmpR in fwdF or fwdF in revcmpR:
                # Fi and Ri are reverse complementarily identical, so a
                # four-base tail is needed to distinguish them.
                infoMsg = "Primer F{n}, R{n} ".format(n=primerComboId) + \
                          "are reverse complementarily identical. " + \
                          "Need to add 'AAAA' to 3' to distinguish them."
                logging.info(infoMsg)
                savedFwdR = "A" * 4 + fwdR
                savedRevcmpR = revcmpR + "T" * 4
            else:
                # Ri and Fi are not revcmp identical
                savedFwdR = fwdR
                savedRevcmpR = revcmpR

            # Truthiness test, consistent with the log message above
            # (previously 'is False', which sent None/0 down the
            # revcmp_primers branch).
            if not revcmp_primers:
                # Save >Ri and revcmp(Ri_sequence)
                primers.append([expectedName, savedRevcmpR])
            else:
                # Save >Ri and Ri_sequence
                primers.append([expectedName, savedFwdR])
                # Save >Fi_revcmp and revcmp(Fi_sequence)
                primers.append(['F{n}_revcmp'.format(n=primerComboId),
                                revcmpF])
                # Save >Ri_revcmp and revcmp(Ri_sequence)
                primers.append(['R{n}_revcmp'.format(n=primerComboId),
                                savedRevcmpR])
    finally:
        # Close the reader even when a check above raises.
        freader.close()

    # NOTE(review): a trailing Fi without a matching Ri is not rejected;
    # confirm whether an odd number of records should be an error.

    # Write the collected primers to primer_out_fn
    with open(primer_out_fn, 'w') as f:
        for name, seq in primers:
            f.write(">{n}\n{s}\n".format(n=name, s=seq))

    return range(0, primerComboId + 1)