def close_enough_old(self, primer, sequence, errors):
    """Position-by-position comparison of *sequence* against *primer*.

    The two inputs are paired up with ``zip`` (so comparison stops at the
    end of the shorter one) and every pair for which ``IUPAC.equal`` is
    false counts as one mismatch.  No insertions or deletions are
    considered.

    Returns True when the number of mismatches does not exceed *errors*.
    """
    mismatches = sum(
        1
        for seq_base, primer_base in zip(sequence, primer)
        if not IUPAC.equal(seq_base, primer_base)
    )
    return mismatches <= errors
def close_enough(self, primer, sequence, diff):
    """Return True if *primer* matches the start of *sequence* within *diff* edits.

    Both inputs are walked from their first element.  At every step one of
    three moves is tried:

    * consume one element of each; free when ``IUPAC.equal(seq, primer)``
      is true, otherwise it costs 1
    * skip one primer element (cost 1)
    * skip one sequence element (cost 1)

    The walk succeeds as soon as either input is exhausted while the
    remaining budget is still non-negative, so whatever is left over in the
    other input is not charged.  A negative *diff* always yields False.

    The search works on indices and remembers, per position pair, the
    largest budget that has already failed.  Failing with some budget
    implies failing with any smaller one, so each state is explored at most
    once per distinct budget.  This replaces a slicing recursion that
    re-explored the same states and copied both inputs on every call.
    """
    primer_len = len(primer)
    seq_len = len(sequence)

    # (primer index, sequence index) -> largest budget known to fail there.
    failed = {}

    def search(i, j, budget):
        if budget < 0:
            return False
        if i == primer_len or j == seq_len:
            return True
        if failed.get((i, j), -1) >= budget:
            return False

        matched = IUPAC.equal(sequence[j], primer[i])
        # Try the (possibly free) diagonal move first so that clean matches
        # finish without exploring any skip.
        if (search(i + 1, j + 1, budget if matched else budget - 1)
                or search(i + 1, j, budget - 1)
                or search(i, j + 1, budget - 1)):
            return True

        failed[(i, j)] = budget
        return False

    return search(0, 0, diff)
def accept(self, seq):
    """Test *seq* against this object's primer and optionally clip it.

    The decision is whatever ``IUPAC.close_enough(self.primer,
    seq.sequence, self.err)`` returns.  When that result is truthy and
    ``self.clip`` is set, ``seq.remove_mid(self.len)`` is called on the
    sequence object.

    Returns the result of the ``IUPAC.close_enough`` call.
    """
    # NOTE(review): the whole of seq.sequence is handed to close_enough,
    # not just its first self.len elements (a prefix slice used to be
    # computed here but was never used) -- confirm that is intended.
    matched = IUPAC.close_enough(self.primer, seq.sequence, self.err)

    if matched and self.clip:
        # primer part of sequence may be longer or
        # shorter, but it does not really matter
        # as terminal gaps are not included in our
        # definition of identity
        seq.remove_mid(self.len)

    return matched
def read_nematodes(self, fastq_fname, fprimer, rprimer, diffs, length):
    """Collect the region following *fprimer* from 'Nematoda' records.

    Records of *fastq_fname* whose id does not contain ``'Nematoda'`` are
    skipped.  Each remaining record has ``ungap()`` and
    ``back_translate()`` called on it, and is keyed by the first
    whitespace-separated token of its id with the leading character
    removed.

    For every kept sequence ``IUPAC.seq_position(fprimer, seq, diffs)`` is
    consulted; when it does not return -1, up to *length* elements
    immediately after the primer are extracted and kept unless they
    contain ``'N'``.

    *rprimer* is accepted but currently unused.

    Returns a tuple ``(fragments, acc2name)`` where *fragments* is a list
    of ``(label, fragment)`` pairs and *acc2name* maps every label read
    (whether or not the primer was found) to the part of the original id
    starting at ``'Nematoda'``.
    """
    candidates = []
    acc2name = {}

    # read in sequences
    fq = FastqFile(fastq_fname)
    fq.open()
    try:
        for seq in fq:
            if 'Nematoda' not in seq.id:
                continue

            seq.ungap()
            seq.back_translate()

            new_id = seq.id.split()[0][1:]
            candidates.append((new_id, seq.sequence))
            acc2name[new_id] = seq.id[seq.id.find('Nematoda'):]
    finally:
        # release the file even if a record fails to parse or process
        fq.close()

    # test sequences
    p = Progress("Looking for primer sequences", len(candidates), False)
    p.start()

    fprimer_len = len(fprimer)
    fragments = []

    for label, sequence in candidates:
        findex = IUPAC.seq_position(fprimer, sequence, diffs)
        if findex != -1:
            start = findex + fprimer_len
            shortseq = sequence[start:start + length]
            if 'N' not in shortseq:
                fragments.append((label, shortseq))
        p.increment()

    p.end()

    return fragments, acc2name
def distance(self, aligned):
    """Score two aligned rows; the result grows with similarity.

    *aligned* is indexed as ``aligned[0]`` and ``aligned[1]``; the rows are
    compared column by column with ``'-'`` treated as a gap:

    * columns where both rows have a gap are ignored
    * leading gap columns are ignored
    * within a run of gap columns only the first column is compared
    * a gap run that is still open at the end of the alignment is not
      charged: whatever it added to the difference count is refunded
    * every other column counts as one difference when
      ``IUPAC.equal(c1, c2)`` is false

    Returns ``(leng - diff) / leng`` where *leng* is the length of the
    shorter row.

    Raises ZeroDivisionError if either row is empty.
    """
    leng = float(min(len(aligned[0]), len(aligned[1])))

    in_gap = True   # start "inside" a gap so that leading gaps are skipped
    gap_cost = 0    # what the currently open gap run added to diff
    diff = 0

    for c1, c2 in zip(aligned[0], aligned[1]):
        if c1 == '-' and c2 == '-':
            continue

        gap = '-' in (c1, c2)
        if in_gap and gap:
            continue

        mismatch = not IUPAC.equal(c1, c2)
        if mismatch:
            diff += 1

        gap_cost = 1 if (gap and mismatch) else 0
        in_gap = gap

    # Refund only what the trailing gap run actually cost.  Subtracting 1
    # unconditionally pushed diff to -1 (result above 1.0) whenever the
    # rows never left the leading-gap state.
    if in_gap:
        diff -= gap_cost

    return (leng - diff) / leng
def extract(self, sff, outdir, primer, primer_errors, barcode, barcode_errors, max_homopolymer):
    """Filter the reads of an SFF file and write their flowgrams to ``flows.dat``.

    Every record of ``sff.get_filename()`` is parsed with BioPython.  For
    each record:

    * the quality-clipped bases are split into a barcode part (first
      ``len(barcode)`` bases) and a primer part (the next ``len(primer)``)
    * the flow values are scanned in groups of four; scanning stops at the
      first group that has no value above 0.5 (after division by 100) or
      that has a value between 0.5 and 0.7.  The number of flows in the
      groups before that point, capped at 450, is the read's usable length.

    A read is kept when its usable length is at least 360 and both
    ``IUPAC.close_enough(barcode, ...)`` and ``IUPAC.close_enough(primer,
    ...)`` accept it.

    The output file starts with a line holding the number of kept reads
    and the longest flowgram length, followed by one line per read: id,
    usable length and all flow values (divided by 100, two decimals).

    Returns ``(number_of_reads_kept, path_of_flows_file)``, or
    ``(0, None)`` when nothing was kept (no file is written in that case).
    Exits the process with status 1 if BioPython cannot be imported.
    """
    try:
        from Bio import SeqIO
    except ImportError:
        sys.stderr.write("BioPython not installed (only required for working with SFF files)\n")
        sys.exit(1)

    barcode_len = len(barcode)
    primer_len = len(primer)

    min_flows = 360     # reads with fewer usable flows are discarded
    max_flows = 450     # usable length is truncated to this many flows

    raw_seq_total = 0
    names = []
    flows = []
    flowlens = []

    for record in SeqIO.parse(sff.get_filename(), "sff"):
        raw_seq_total += 1

        annotations = record.annotations
        good_bases = record.seq[annotations["clip_qual_left"]:annotations["clip_qual_right"]]
        barcode_seq = good_bases[:barcode_len]
        primer_seq = good_bases[barcode_len:barcode_len + primer_len]
        flow_values = annotations["flow_values"]

        clean_groups = 0
        # Only complete groups of four are examined; an incomplete trailing
        # group would otherwise raise IndexError.
        for start in range(0, len(flow_values) - 3, 4):
            signal = 0
            noise = 0

            for raw_value in flow_values[start:start + 4]:
                value = float(raw_value) / 100.0

                # NOTE(review): this leaves only the inner loop, so the
                # rest of the group is not inspected and the group is then
                # judged on the signal/noise counted so far -- confirm
                # that an over-long homopolymer is not meant to end the
                # read here.
                if int(value + 0.5) > max_homopolymer:
                    break

                if value > 0.5:
                    signal += 1
                    if value < 0.7:
                        noise += 1

            if noise > 0 or signal == 0:
                break

            clean_groups += 1

        new_length = min(clean_groups * 4, max_flows)

        if new_length >= min_flows and \
           IUPAC.close_enough(barcode, barcode_seq, barcode_errors) and \
           IUPAC.close_enough(primer, primer_seq, primer_errors):
            flows.append(flow_values)
            flowlens.append(new_length)
            names.append(record.id)

    if not flows:
        self.log.info("kept 0/%d sequences" % raw_seq_total)
        return 0, None

    # output pyronoise input file
    # see http://userweb.eng.gla.ac.uk/christopher.quince/Software/PyroNoise.html
    flows_path = join(outdir, "flows.dat")

    # 'with' guarantees the handle is closed even if a write fails
    with open(flows_path, 'w') as out:
        out.write("%d %d\n" % (len(flows), max(len(flow) for flow in flows)))

        for name, flowlen, flow in zip(names, flowlens, flows):
            fields = [name, str(flowlen)]
            fields.extend("%.2f" % (float(value) / 100.0) for value in flow)
            out.write(" ".join(fields) + "\n")

    self.log.info("kept %d/%d sequences" % (len(flows), raw_seq_total))

    return len(flows), flows_path