def trim(reads, find_start=None, find_end=None, second_time=False):
    """Trim every read and store the removed flanks in its identifier.

    Wrapper that handles the logistics of trimming reads given functions
    find_start and find_end that take a sequence and return the positions
    that trimming should occur at.

    Args:
        reads: iterable of objects exposing ``name``, ``seq`` and ``qual``.
        find_start: callable taking ``read.seq`` and returning the index at
            which the kept region begins. When omitted, nothing is removed
            from the left (start is 0).
        find_end: callable taking ``read.seq`` and returning the exclusive
            index at which the kept region ends. When omitted, ``len`` is
            used, so nothing is removed from the right.
        second_time: when true, ``read.name`` is parsed with
            ``PayloadAnnotation.from_identifier`` and the newly removed
            flanks are added to it in a ``TrimmedTwiceAnnotation``; when
            false, a fresh ``PayloadAnnotation`` is built with ``read.name``
            as ``original_name``.

    Yields:
        A ``fastq.Read`` per input read, named ``annotation.identifier``,
        whose seq and qual are the ``[start:end]`` slices of the input.
    """
    # None is a singleton: test identity rather than equality so that a
    # callable object defining its own __eq__ can never be mistaken for
    # "argument not supplied".
    if find_start is None:
        find_start = lambda seq: 0
    if find_end is None:
        find_end = len

    for read in reads:
        start = find_start(read.seq)
        end = find_end(read.seq)

        # Everything outside [start:end] is kept as annotation, with the
        # quality strings passed through fastq.sanitize_qual first.
        left_seq = read.seq[:start]
        left_qual = fastq.sanitize_qual(read.qual[:start])
        right_seq = read.seq[end:]
        right_qual = fastq.sanitize_qual(read.qual[end:])

        if second_time:
            # The incoming name already encodes a PayloadAnnotation; its
            # fields are forwarded unchanged next to the retrimmed flanks.
            payload_annotation = PayloadAnnotation.from_identifier(read.name)
            annotation = TrimmedTwiceAnnotation(
                retrimmed_left_seq=left_seq,
                retrimmed_left_qual=left_qual,
                retrimmed_right_seq=right_seq,
                retrimmed_right_qual=right_qual,
                **payload_annotation)
        else:
            annotation = PayloadAnnotation(
                original_name=read.name,
                left_seq=left_seq,
                left_qual=left_qual,
                right_seq=right_seq,
                right_qual=right_qual,
            )

        trimmed_read = fastq.Read(
            annotation.identifier,
            read.seq[start:end],
            read.qual[start:end],
        )
        yield trimmed_read
def trim(reads, find_start=None, find_end=None, second_time=False):
    """Trim reads lazily, recording what was cut off in each read's name.

    Wrapper that handles the logistics of trimming reads given functions
    find_start and find_end that take a sequence and return the positions
    that trimming should occur at.

    Args:
        reads: iterable of objects exposing ``name``, ``seq`` and ``qual``.
        find_start: callable taking ``read.seq`` and returning the first
            index to keep. Defaults to a function returning 0.
        find_end: callable taking ``read.seq`` and returning the exclusive
            end index of the region to keep. Defaults to ``len``.
        second_time: selects the annotation type. If true, ``read.name`` is
            decoded with ``PayloadAnnotation.from_identifier`` and extended
            into a ``TrimmedTwiceAnnotation`` carrying the retrimmed flanks;
            if false, a new ``PayloadAnnotation`` is created whose
            ``original_name`` is ``read.name``.

    Yields:
        ``fastq.Read`` objects built from ``annotation.identifier`` and the
        ``[start:end]`` slices of the input seq and qual.
    """
    # Use ``is None`` rather than ``== None``: equality would be delegated
    # to a user-supplied callable's __eq__, identity cannot be.
    if find_start is None:
        find_start = lambda seq: 0
    if find_end is None:
        find_end = len

    for read in reads:
        start = find_start(read.seq)
        end = find_end(read.seq)

        # The removed left and right flanks are preserved in the annotation;
        # their qualities go through fastq.sanitize_qual beforehand.
        left_seq = read.seq[:start]
        left_qual = fastq.sanitize_qual(read.qual[:start])
        right_seq = read.seq[end:]
        right_qual = fastq.sanitize_qual(read.qual[end:])

        if second_time:
            # Re-use the fields already stored in the read's identifier.
            payload_annotation = PayloadAnnotation.from_identifier(read.name)
            annotation = TrimmedTwiceAnnotation(
                retrimmed_left_seq=left_seq,
                retrimmed_left_qual=left_qual,
                retrimmed_right_seq=right_seq,
                retrimmed_right_qual=right_qual,
                **payload_annotation)
        else:
            annotation = PayloadAnnotation(
                original_name=read.name,
                left_seq=left_seq,
                left_qual=left_qual,
                right_seq=right_seq,
                right_qual=right_qual,
            )

        trimmed_read = fastq.Read(
            annotation.identifier,
            read.seq[start:end],
            read.qual[start:end],
        )
        yield trimmed_read
def find_boundary_sequences(R1, R2, counters):
    """Build the `five_payload` and `three_payload` reads for a read pair.

    The read in which common_right_reverse is located becomes the
    "reverse" read; the other one is the "forward" read. The five payload
    is taken from the part of the reverse read before
    common_right_reverse, the three payload from the part of the forward
    read before the polyA stretch reported by find_polyA_cython.find_polyA.
    Control ids read after common_right_reverse are recorded in the
    annotations of both output reads as '<left_id>-<right_id>'.

    Args:
        R1, R2: the two reads of a pair; `seq` and `qual` are read from
            both, `name` from R1 only.
        counters: mapping updated in place under the keys 'right_ids',
            'left_ids', 'positions', 'joint_lengths', 'polyA_lengths' and
            'control_ids'.

    Returns:
        (five_payload_read, three_payload_read), or (None, None) when
        - more than one contained occurrence of common_right_reverse is
          reported across R1 and R2,
        - no contained occurrence exists and the best partial overlap has
          length <= 5, or
        - polyA_length < 13. This last check happens after the counters
          have been updated, so such pairs are still counted.

    Raises:
        ValueError: if R1.name contains no ':' (the rsplit result cannot
            be unpacked into two values).
    """
    # Find which read in the read pair is from the reverse strand by looking for
    # common_right_reverse.
    # First try to find a unique position entirely contained within R1 or R2
    # that is close to common_right_reverse.
    # Failing this, find the longest of (the longest suffix of R1 or R2 that
    # matches a prefix of common_right_reverse) or (the longest prefix of R1 or
    # R2 that matches a suffix of common_right_reverse).
    R1_contained, R1_prefix, R1_suffix = all_adapter_possibilites(
        R1.seq, common_right_reverse)
    R2_contained, R2_prefix, R2_suffix = all_adapter_possibilites(
        R2.seq, common_right_reverse)

    if len(R1_contained) + len(R2_contained) > 1:
        # Only one occurrence of common_right_reverse should exist between R1
        # and R2.
        return None, None
    elif len(R1_contained) + len(R2_contained) == 0:
        # No contained hit: rank the four partial overlaps. Tuples compare
        # by length first; ties are broken by the kind string.
        # NOTE(review): how `length` maps onto a start offset below depends
        # on what all_adapter_possibilites returns for prefix/suffix, which
        # is not visible here - confirm.
        possiblities = [
            (len(common_right_reverse) - R1_prefix, 'R1_prefix'),
            (len(common_right_reverse) - R2_prefix, 'R2_prefix'),
            (len(common_right_reverse) - R1_suffix, 'R1_suffix'),
            (len(common_right_reverse) - R2_suffix, 'R2_suffix'),
        ]
        length, kind = max(possiblities)
        if length > 5:
            if 'R1' in kind:
                reverse_read = R1
                forward_read = R2
                polyA_read = 'R2_forward'
                polyT_read = 'R1_reverse'
            elif 'R2' in kind:
                reverse_read = R2
                forward_read = R1
                polyA_read = 'R1_forward'
                polyT_read = 'R2_reverse'
            if 'prefix' in kind:
                common_right_reverse_start = len(reverse_read.seq) - length
            elif 'suffix' in kind:
                # Negative start; clamped with max(0, ...) wherever it is
                # used as a slice bound or counter key below.
                common_right_reverse_start = -length
        else:
            return None, None
    elif len(R1_contained) == 1:
        reverse_read = R1
        forward_read = R2
        polyA_read = 'R2_forward'
        polyT_read = 'R1_reverse'
        common_right_reverse_start = R1_contained.pop()
    elif len(R2_contained) == 1:
        reverse_read = R2
        forward_read = R1
        polyA_read = 'R1_forward'
        polyT_read = 'R2_reverse'
        common_right_reverse_start = R2_contained.pop()

    # '*' means that there was no opportunity to see this id.
    # 'X' means that there was an opportunity and it was neither A nor B.
    right_id = '*'
    left_id = '*'

    # Everything before common_right_reverse in the reverse read; the
    # sequence goes through utilities.reverse_complement and the qualities
    # are reversed so that both stay in the same order.
    five_payload_slice = slice(None, max(0, common_right_reverse_start))
    five_payload_seq = utilities.reverse_complement(
        reverse_read.seq[five_payload_slice])
    five_payload_qual = reverse_read.qual[five_payload_slice][::-1]

    # current_p walks along the reverse read past common_right_reverse.
    # NOTE(review): the nesting of the id checks below was reconstructed
    # from a whitespace-flattened copy of this function - confirm against
    # the original layout.
    current_p = common_right_reverse_start + len(common_right_reverse)
    if current_p < len(reverse_read.seq) - after_right_length:
        right_id_seq = reverse_read.seq[current_p:current_p + after_right_length]
        for key, prefix in after_right_prefix.items():
            if right_id_seq == prefix:
                right_id = key
        if right_id == '*':
            right_id = 'X'
        counters['right_ids'][right_id_seq] += 1
        if right_id != 'X':
            # Skip the full after_right sequence of the recognised id
            # before looking for the left id.
            current_p += len(after_right[right_id])
            if current_p < len(reverse_read.seq) - 4:
                left_id_seq = reverse_read.seq[current_p:current_p + 4]
                for key, sequence in after_left.items():
                    if left_id_seq == sequence:
                        left_id = key
                if left_id == '*':
                    left_id = 'X'
                counters['left_ids'][left_id_seq] += 1

    # The meaning of the second argument (15) is defined by find_polyA.
    polyA_start, polyA_length = find_polyA_cython.find_polyA(
        forward_read.seq, 15)
    polyA_slice = slice(polyA_start, polyA_start + polyA_length)
    polyA_seq = forward_read.seq[polyA_slice]
    polyA_qual = fastq.sanitize_qual(forward_read.qual[polyA_slice])

    three_payload_slice = slice(None, polyA_start)
    three_payload_seq = forward_read.seq[three_payload_slice]
    three_payload_qual = forward_read.qual[three_payload_slice]

    # Drop whatever follows the last ':' of R1's name to get a name that is
    # used for both output reads.
    common_name, _ = R1.name.rsplit(':', 1)
    control_ids_string = '{0}-{1}'.format(left_id, right_id)

    five_annotation = trim.PayloadAnnotation(
        original_name=common_name,
        left_seq=control_ids_string,
        left_qual='',
        right_seq='',
        right_qual='',
    )
    three_annotation = trim.PayloadAnnotation(
        original_name=common_name,
        left_seq=control_ids_string,
        left_qual='',
        right_seq=polyA_seq,
        right_qual=polyA_qual,
    )
    five_payload_read = fastq.Read(five_annotation.identifier,
                                   five_payload_seq,
                                   five_payload_qual)
    three_payload_read = fastq.Read(three_annotation.identifier,
                                    three_payload_seq,
                                    three_payload_qual)

    # Statistics are recorded for every pair that reaches this point,
    # including those rejected just below for a short polyA.
    counters['positions'][polyT_read][max(0,
                                          common_right_reverse_start)] += 1
    counters['positions'][polyA_read][polyA_start] += 1
    counters['joint_lengths'][max(0, common_right_reverse_start), polyA_start] += 1
    counters['polyA_lengths'][polyA_length] += 1
    counters['control_ids'][control_ids_string] += 1

    if polyA_length < 13:
        return None, None

    return five_payload_read, three_payload_read
def find_boundary_sequences(R1, R2, counters):
    """Derive the `five_payload` and `three_payload` reads from a read pair.

    Whichever of R1/R2 carries common_right_reverse is treated as the
    "reverse" read and the other as the "forward" read. The five payload
    comes from the reverse read upstream of common_right_reverse; the
    three payload comes from the forward read upstream of the polyA
    stretch returned by find_polyA_cython.find_polyA. The control ids
    found after common_right_reverse are written into both annotations
    as '<left_id>-<right_id>'.

    Args:
        R1, R2: the two reads of a pair; `seq` and `qual` are used from
            both, `name` from R1 only.
        counters: mapping mutated in place; the keys touched are
            'right_ids', 'left_ids', 'positions', 'joint_lengths',
            'polyA_lengths' and 'control_ids'.

    Returns:
        (five_payload_read, three_payload_read), or (None, None) if
        - R1 and R2 together report more than one contained occurrence
          of common_right_reverse,
        - there is no contained occurrence and the longest partial
          overlap is not longer than 5, or
        - polyA_length < 13 (checked only after the counters were
          updated, so these pairs still contribute to the statistics).

    Raises:
        ValueError: if R1.name has no ':' so that rsplit yields a single
            element.
    """
    # Find which read in the read pair is from the reverse strand by looking for
    # common_right_reverse.
    # First try to find a unique position entirely contained within R1 or R2
    # that is close to common_right_reverse.
    # Failing this, find the longest of (the longest suffix of R1 or R2 that
    # matches a prefix of common_right_reverse) or (the longest prefix of R1 or
    # R2 that matches a suffix of common_right_reverse).
    R1_contained, R1_prefix, R1_suffix = all_adapter_possibilites(R1.seq, common_right_reverse)
    R2_contained, R2_prefix, R2_suffix = all_adapter_possibilites(R2.seq, common_right_reverse)

    if len(R1_contained) + len(R2_contained) > 1:
        # Only one occurrence of common_right_reverse should exist between R1
        # and R2.
        return None, None
    elif len(R1_contained) + len(R2_contained) == 0:
        # Fall back to partial overlaps. max() on the tuples picks the
        # greatest length, breaking ties on the kind string.
        # NOTE(review): the relation between `length` and the start offsets
        # computed below rests on the return values of
        # all_adapter_possibilites, which are not visible here - confirm.
        possiblities = [(len(common_right_reverse) - R1_prefix, 'R1_prefix'),
                        (len(common_right_reverse) - R2_prefix, 'R2_prefix'),
                        (len(common_right_reverse) - R1_suffix, 'R1_suffix'),
                        (len(common_right_reverse) - R2_suffix, 'R2_suffix'),
                        ]
        length, kind = max(possiblities)
        if length > 5:
            if 'R1' in kind:
                reverse_read = R1
                forward_read = R2
                polyA_read = 'R2_forward'
                polyT_read = 'R1_reverse'
            elif 'R2' in kind:
                reverse_read = R2
                forward_read = R1
                polyA_read = 'R1_forward'
                polyT_read = 'R2_reverse'
            if 'prefix' in kind:
                common_right_reverse_start = len(reverse_read.seq) - length
            elif 'suffix' in kind:
                # May be negative; later uses clamp it with max(0, ...).
                common_right_reverse_start = -length
        else:
            return None, None
    elif len(R1_contained) == 1:
        reverse_read = R1
        forward_read = R2
        polyA_read = 'R2_forward'
        polyT_read = 'R1_reverse'
        common_right_reverse_start = R1_contained.pop()
    elif len(R2_contained) == 1:
        reverse_read = R2
        forward_read = R1
        polyA_read = 'R1_forward'
        polyT_read = 'R2_reverse'
        common_right_reverse_start = R2_contained.pop()

    # '*' means that there was no opportunity to see this id.
    # 'X' means that there was an opportunity and it was neither A nor B.
    right_id = '*'
    left_id = '*'

    # Portion of the reverse read that precedes common_right_reverse. The
    # sequence is passed to utilities.reverse_complement and the qualities
    # are reversed alongside it.
    five_payload_slice = slice(None, max(0, common_right_reverse_start))
    five_payload_seq = utilities.reverse_complement(reverse_read.seq[five_payload_slice])
    five_payload_qual = reverse_read.qual[five_payload_slice][::-1]

    # current_p is the cursor in the reverse read, starting right after
    # common_right_reverse.
    # NOTE(review): the nesting of the two id look-ups was reconstructed
    # from a whitespace-flattened copy of this function - confirm against
    # the original layout.
    current_p = common_right_reverse_start + len(common_right_reverse)
    if current_p < len(reverse_read.seq) - after_right_length:
        right_id_seq = reverse_read.seq[current_p:current_p + after_right_length]
        for key, prefix in after_right_prefix.items():
            if right_id_seq == prefix:
                right_id = key
        if right_id == '*':
            right_id = 'X'
        counters['right_ids'][right_id_seq] += 1
        if right_id != 'X':
            # Advance over the whole after_right sequence for this id so
            # the left id is read from the position that follows it.
            current_p += len(after_right[right_id])
            if current_p < len(reverse_read.seq) - 4:
                left_id_seq = reverse_read.seq[current_p:current_p + 4]
                for key, sequence in after_left.items():
                    if left_id_seq == sequence:
                        left_id = key
                if left_id == '*':
                    left_id = 'X'
                counters['left_ids'][left_id_seq] += 1

    # The second argument (15) is interpreted by find_polyA itself.
    polyA_start, polyA_length = find_polyA_cython.find_polyA(forward_read.seq, 15)
    polyA_slice = slice(polyA_start, polyA_start + polyA_length)
    polyA_seq = forward_read.seq[polyA_slice]
    polyA_qual = fastq.sanitize_qual(forward_read.qual[polyA_slice])

    three_payload_slice = slice(None, polyA_start)
    three_payload_seq = forward_read.seq[three_payload_slice]
    three_payload_qual = forward_read.qual[three_payload_slice]

    # Strip the final ':'-separated field of R1's name; the remainder
    # names both output reads.
    common_name, _ = R1.name.rsplit(':', 1)
    control_ids_string = '{0}-{1}'.format(left_id, right_id)

    five_annotation = trim.PayloadAnnotation(original_name=common_name,
                                             left_seq=control_ids_string,
                                             left_qual='',
                                             right_seq='',
                                             right_qual='',
                                             )
    three_annotation = trim.PayloadAnnotation(original_name=common_name,
                                              left_seq=control_ids_string,
                                              left_qual='',
                                              right_seq=polyA_seq,
                                              right_qual=polyA_qual,
                                              )
    five_payload_read = fastq.Read(five_annotation.identifier,
                                   five_payload_seq,
                                   five_payload_qual)
    three_payload_read = fastq.Read(three_annotation.identifier,
                                    three_payload_seq,
                                    three_payload_qual)

    # Counters are updated before the polyA length filter, so pairs that
    # are rejected below are still reflected in the statistics.
    counters['positions'][polyT_read][max(0,
                                          common_right_reverse_start)] += 1
    counters['positions'][polyA_read][polyA_start] += 1
    counters['joint_lengths'][max(0, common_right_reverse_start), polyA_start] += 1
    counters['polyA_lengths'][polyA_length] += 1
    counters['control_ids'][control_ids_string] += 1

    if polyA_length < 13:
        return None, None

    return five_payload_read, three_payload_read