Ejemplo n.º 1
0
 def clean_iterator(self, reads, minlength=400, threshold=21, windowsize=20):
     """Lazily filter and trim the given reads, yielding only the survivors.

     Every read goes through the following steps in order, and is dropped
     as soon as one of them fails:

     1. It must contain at least `minlength` letters.
     2. `self.primer_regex` must match its sequence (as a string).
     3. A moving average with a window of `windowsize` is computed over its
        "phred_quality" scores. At the first averaged value below
        `threshold` (at index `i`), the read is cut down to its first
        `i + windowsize - 1` letters and must still reach `minlength`.
     4. It must not contain an 'N'.

     Surviving reads have everything up to the end of the primer match
     removed and are reverse-complemented before being yielded. Both length
     checks run while the primer is still attached.

     NOTE(review): the cut position assumes `moving_average` returns one
     value per window start position -- confirm against its definition.
     """
     for record in reads:
         # Too short to begin with #
         if len(record) < minlength:
             continue
         # The primer has to be present #
         primer_hit = self.primer_regex.search(str(record.seq))
         if not primer_hit:
             continue
         # Find the first window whose averaged PHRED score is too low #
         quality = record.letter_annotations["phred_quality"]
         smoothed = moving_average(quality, windowsize)
         first_low = next(
             (pos for pos, mean in enumerate(smoothed) if mean < threshold),
             None,
         )
         # Cut the read there and make sure enough of it is left #
         if first_low is not None:
             record = record[:first_low + windowsize - 1]
             if len(record) < minlength:
                 continue
         # Undetermined bases are not tolerated #
         if 'N' in record:
             continue
         # Drop the primer, then flip because 454 reads the other end #
         yield record[primer_hit.end():].reverse_complement()
Ejemplo n.º 2
0
 def trim_read(self, read):
     """Return the longest good-quality stretch of `read`, or None.

     The read is processed in three steps:

     1. Letters at both ends whose "phred_quality" score is not strictly
        above `self.threshold` are removed (scores equal to the threshold
        are removed too).
     2. A moving average with a window of `self.window_size` is computed
        over the remaining scores.
     3. The averaged scores are grouped into consecutive runs that are
        above / not above `self.threshold`; among the runs that are above
        (and, when `self.discard_N` is set, whose slice of the read has no
        'N'), the longest one is selected.

     Returns the slice of the read covered by the selected run. Returns
     None when no score is above the threshold, when nothing is left after
     trimming, when no run qualifies, or when the selected slice is shorter
     than `self.min_length`.

     NOTE(review): mapping run positions back onto the read assumes that
     `moving_average(..., 'copy_padding_and_cut')` returns one value per
     letter and that `len()` of a `Stretch` is its number of values --
     confirm against their definitions.
     """
     # Step one: trim both ends down to the first score above the threshold #
     qualities = read.letter_annotations["phred_quality"]
     is_above = [bool(score > self.threshold) for score in qualities]
     if not any(is_above):
         return None
     leading = is_above.index(True)
     trailing = is_above[::-1].index(True)
     if trailing:
         read = read[leading:-trailing]
     else:
         # A slice ending at -0 would be empty, so leave the end open #
         read = read[leading:]
     if not read.seq:
         return None
     # Step two: smooth the scores of what is left #
     qualities = read.letter_annotations["phred_quality"]
     smoothed = moving_average(qualities, self.window_size, 'copy_padding_and_cut')
     # Step three: split the smoothed scores into runs on each side of the threshold #
     grouped = itertools.groupby(smoothed, lambda mean: mean > self.threshold)
     runs = [Stretch(flag, values) for flag, values in grouped]
     if not runs:
         return None
     # Give every run its coordinates and the matching slice of the read #
     offset = 0
     for run in runs:
         run.start = offset
         offset += len(run)
         run.end = offset
         run.seq = read[run.start:run.end]
     # Only the runs above the threshold are candidates #
     candidates = [run for run in runs if run.above]
     if not candidates:
         return None
     # Optionally reject the candidates containing N letters #
     if self.discard_N:
         candidates = [run for run in candidates if 'N' not in run.seq]
     if not candidates:
         return None
     # Keep the longest candidate, provided it is long enough #
     best = max(candidates, key=len)
     if len(best.seq) < self.min_length:
         return None
     return best.seq
Ejemplo n.º 3
0
 def good_qual_iterator(reads):
     """Yield the reads whose averaged quality never drops below the threshold.

     For every read, a moving average with a window of `self.qual_windowsize`
     is computed over its "phred_quality" scores. The read is yielded
     unchanged only if none of the averaged values is below
     `self.qual_threshold`.

     NOTE(review): `self` is not a parameter of this function, so it has to
     be resolved from an enclosing scope that is not visible here -- confirm.
     """
     for record in reads:
         quality = record.letter_annotations["phred_quality"]
         smoothed = moving_average(quality, self.qual_windowsize)
         # One flag per averaged value: True where the quality is too low #
         too_low = [mean < self.qual_threshold for mean in smoothed]
         if not any(too_low):
             yield record