def clean_iterator(self, reads, minlength=400, threshold=21, windowsize=20):
    """Yield the reads that pass the length, primer and quality filters.

    Every read is processed as follows:

    1. It is dropped if it is shorter than `minlength`.
    2. It is dropped if `self.primer_regex` is not found in its sequence.
    3. It is cut at the first position where the moving average of its
       PHRED scores (window of `windowsize`) drops below `threshold`, and
       dropped if the remainder is shorter than `minlength`.
    4. It is dropped if it still contains an 'N'.
    5. Everything up to the end of the primer match is removed and the
       result is reverse complemented before being yielded.

    NOTE(review): `moving_average` is defined elsewhere. The cut position
    `i + windowsize - 1` presumably assumes that element `i` of its output
    summarizes the scores `i` to `i + windowsize - 1` -- to be confirmed.
    """
    for read in reads:
        # Too short to begin with #
        if len(read) < minlength:
            continue
        # The primer has to be present in the untrimmed sequence #
        primer_hit = self.primer_regex.search(str(read.seq))
        if not primer_hit:
            continue
        # Locate the first window with an insufficient mean quality #
        qualities = read.letter_annotations["phred_quality"]
        smoothed = moving_average(qualities, windowsize)
        first_bad = next(
            (pos for pos, mean in enumerate(smoothed) if mean < threshold),
            None,
        )
        # Cut the tail there and check what is left is still long enough #
        if first_bad is not None:
            read = read[:first_bad + windowsize - 1]
            if len(read) < minlength:
                continue
        # Undetermined bases #
        if 'N' in read:
            continue
        # Drop everything up to and including the primer #
        read = read[primer_hit.end():]
        # Flip them because 454 reads the other end #
        yield read.reverse_complement()
def trim_read(self, read):
    """Return the longest good-quality stretch of `read`, or None.

    The read is first stripped at both ends of every base whose PHRED score
    is not above `self.threshold`. A moving average of the remaining scores
    (window of `self.window_size`) is then split into consecutive runs that
    are either above the threshold or not. Among the runs that are above
    it (and, when `self.discard_N` is set, contain no 'N'), the longest one
    is selected.

    Returns the corresponding slice of the read, or None when no base is
    above the threshold, when no run qualifies, or when the selected slice
    is shorter than `self.min_length`.

    NOTE(review): `moving_average` and `Stretch` are defined elsewhere.
    The offsets computed below presumably rely on the averaged values
    lining up one-to-one with the read positions -- to be confirmed.
    """
    # Strip both ends of the scores that are not above the threshold #
    qualities = read.letter_annotations["phred_quality"]
    passing = [score > self.threshold for score in qualities]
    if True not in passing:
        return None
    left = passing.index(True)
    right = passing[::-1].index(True)
    # A slice ending at -0 would be empty, hence the special case #
    read = read[left:-right] if right != 0 else read[left:]
    if not read.seq:
        return None
    qualities = read.letter_annotations["phred_quality"]
    # Smooth the scores of the stripped read #
    smoothed = moving_average(qualities, self.window_size, 'copy_padding_and_cut')
    # Split into consecutive runs above / not above the threshold #
    # Each Stretch is built while groupby is positioned on its own run #
    runs = itertools.groupby(smoothed, lambda mean: mean > self.threshold)
    stretches = [Stretch(is_above, values) for is_above, values in runs]
    if not stretches:
        return None
    # Give every stretch its coordinates and its slice of the read #
    offset = 0
    for stretch in stretches:
        stretch.start = offset
        offset += len(stretch)
        stretch.end = offset
        stretch.seq = read[stretch.start:stretch.end]
    # Keep only the stretches that are above the threshold #
    candidates = [stretch for stretch in stretches if stretch.above]
    if not candidates:
        return None
    # Optionally drop those that contain undetermined bases #
    if self.discard_N:
        candidates = [stretch for stretch in candidates if 'N' not in stretch.seq]
    if not candidates:
        return None
    # Pick the longest one, provided it is long enough #
    best = max(candidates, key=len)
    if len(best.seq) < self.min_length:
        return None
    return best.seq
def good_qual_iterator(reads):
    """Yield only the reads whose averaged quality never drops too low.

    A read is kept when no value of the moving average of its PHRED scores
    (window of `self.qual_windowsize`) is below `self.qual_threshold`.

    NOTE(review): `self` is not a parameter of this function; it is
    presumably picked up from an enclosing scope -- to be confirmed.
    `moving_average` is defined elsewhere.
    """
    for read in reads:
        qualities = read.letter_annotations["phred_quality"]
        smoothed = moving_average(qualities, self.qual_windowsize)
        # Keep the read only if every window is at or above the threshold #
        if all(not (mean < self.qual_threshold) for mean in smoothed):
            yield read