def make_artificial_reads(transcript,
                          fragment_length,
                          read_length,
                          adapter_sequence,
                          region_fetcher,
                          common_buffer,
                         ):
    transcript_sequence = transcript.retrieve_sequence(
        region_fetcher,
        left_buffer=common_buffer,
        right_buffer=common_buffer + fragment_length,
    )
    # Needs to include one non-Solexa value for automatic encoding recognition.
    high_quals = fastq.encode_sanger([25] + [30] * (read_length - 1))

    for i, transcript_position in enumerate(range(-common_buffer, transcript.CDS_length + common_buffer)):
        annotation = artifical_annotation(
            transcript_name=transcript.name,
            position=transcript_position,
        )
        fragment_sequence = transcript_sequence[i:i + fragment_length]
        if '-' in fragment_sequence:
            # Skip fragments that run off the edge of a reference sequence.
            continue

        full_sequence = fragment_sequence + adapter_sequence
        read = fastq.Read(annotation.identifier,
                          full_sequence[:read_length],
                          high_quals,
                         )
        yield read
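# Hedged usage sketch for make_artificial_reads: write_artificial_fastq and
# all parameter values below are illustrative assumptions, not part of the
# original code.
def write_artificial_fastq(transcript, region_fetcher, fastq_fn):
    # 28-mer fragments with a hypothetical adapter, 50 bp reads, 10 nt buffer.
    reads = make_artificial_reads(transcript,
                                  fragment_length=28,
                                  read_length=50,
                                  adapter_sequence='CTGTAGGCACCATCAAT',
                                  region_fetcher=region_fetcher,
                                  common_buffer=10,
                                 )
    with open(fastq_fn, 'w') as fh:
        for read in reads:
            # str() on a fastq.Read is assumed to render a full record, as
            # the writing code elsewhere in this module relies on.
            fh.write(str(read))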
def get_reads():
    # Nested helper: `self` is closed over from the enclosing method's scope.
    for i, (seq, count) in enumerate(self.read_file('common_unmapped')['non_long_polyA'].most_common()):
        read = fastq.Read('{0}_{1}'.format(i, count),
                          seq,
                          fastq.encode_sanger([40] * len(seq)),
                         )
        yield read
def collapse_fastq(reads, outfile):
    ''' Collapse duplicate reads, keeping the maximum quality seen at each
    position and appending the number of occurrences to each read's name.
    '''
    counter = 0
    # Dict of unique reads; collapsed_reads[key] = [read, count]
    collapsed_reads = {}

    for r in reads:
        if counter % 1000000 == 0:
            print('{0} reads processed ...'.format(counter))

        n = r.name.split('_')
        key = n[1] + '_' + n[2] + '_' + r.seq
        if key in collapsed_reads:
            # Collapsible sequence: take the higher quality at each position.
            old_read, count = collapsed_reads[key]
            qual = ''.join(max(old_q, new_q)
                           for old_q, new_q in zip(old_read.qual, r.qual))
            collapsed_reads[key] = [fastq.Read(old_read.name, r.seq, qual), count + 1]
        else:
            collapsed_reads[key] = [fastq.Read(r.name, r.seq, r.qual), 1]
        counter += 1

    with open(outfile, 'w') as fh:
        for key in collapsed_reads:
            r, count = collapsed_reads[key]
            fh.write(str(fastq.Read('{0}_{1}'.format(r.name, count), r.seq, r.qual)))
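# Example driver for collapse_fastq, assuming the fastq module exposes a
# reads(file_name) iterator analogous to fasta.reads used below; both file
# names are hypothetical.
def collapse_file(input_fn, collapsed_fn):
    collapse_fastq(fastq.reads(input_fn), collapsed_fn)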
def trim_reads(self, read_pairs):
    total_reads = 0
    long_enough_reads = 0
    trimmed_lengths = Counter()
    barcodes = Counter()

    for R1, R2 in read_pairs:
        total_reads += 1
        barcodes[R2.seq[:len(self.barcode)]] += 1

        # R2 isn't expected to have adapter sequence because it will
        # have to get through the A tail first.
        position = adapters.find_adapter(self.adapter_in_R1, 3, R1.seq)
        trimmed_lengths[position] += 1
        if position < 12:
            continue
        long_enough_reads += 1

        R1_slice = slice(None, position)
        # position points to where the barcode starts in R1. The length
        # of the trimmed R2 read should be equal to position.
        R2_slice = slice(len(self.barcode), len(self.barcode) + position)
        processed_R1 = fastq.Read(R1.name, R1.seq[R1_slice], R1.qual[R1_slice])
        processed_R2 = fastq.Read(R2.name, R2.seq[R2_slice], R2.qual[R2_slice])
        yield processed_R1, processed_R2

    trimmed_lengths = utilities.counts_to_array(trimmed_lengths)
    self.write_file('trimmed_lengths', trimmed_lengths)
    self.write_file('barcodes', barcodes)
    self.summary.extend([
        ('Total read pairs', total_reads),
        ('Long enough', long_enough_reads),
    ])
def trim(reads, find_start=None, find_end=None, second_time=False):
    ''' Wrapper that handles the logistics of trimming reads given functions
    find_start and find_end that take a sequence and return the positions
    that trimming should occur at.
    '''
    if find_start is None:
        find_start = lambda seq: 0
    if find_end is None:
        find_end = len

    for read in reads:
        start = find_start(read.seq)
        end = find_end(read.seq)

        left_seq = read.seq[:start]
        left_qual = fastq.sanitize_qual(read.qual[:start])
        right_seq = read.seq[end:]
        right_qual = fastq.sanitize_qual(read.qual[end:])
        if second_time:
            payload_annotation = PayloadAnnotation.from_identifier(read.name)
            annotation = TrimmedTwiceAnnotation(retrimmed_left_seq=left_seq,
                                                retrimmed_left_qual=left_qual,
                                                retrimmed_right_seq=right_seq,
                                                retrimmed_right_qual=right_qual,
                                                **payload_annotation)
        else:
            annotation = PayloadAnnotation(original_name=read.name,
                                           left_seq=left_seq,
                                           left_qual=left_qual,
                                           right_seq=right_seq,
                                           right_qual=right_qual,
                                          )
        trimmed_read = fastq.Read(annotation.identifier,
                                  read.seq[start:end],
                                  read.qual[start:end],
                                 )
        yield trimmed_read
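# A sketch of driving trim: find_start and find_end each take a sequence and
# return a trimming position. The fixed barcode length and adapter sequence
# here are assumptions for illustration only.
def trim_barcode_and_adapter(reads, barcode_length=5, adapter='CTGTAGGCACCATCAAT'):
    def find_start(seq):
        # Always remove a fixed-length barcode from the left end.
        return barcode_length

    def find_end(seq):
        # Trim from the adapter onwards if it is present.
        position = seq.find(adapter)
        return position if position != -1 else len(seq)

    return trim(reads, find_start=find_start, find_end=find_end)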
def untrim_reads(trimmed_reads, second_time=False):
    if second_time:
        Annotation = TrimmedTwiceAnnotation
        left_seq_key = 'retrimmed_left_seq'
        left_qual_key = 'retrimmed_left_qual'
        right_seq_key = 'retrimmed_right_seq'
        right_qual_key = 'retrimmed_right_qual'
    else:
        Annotation = PayloadAnnotation
        left_seq_key = 'left_seq'
        left_qual_key = 'left_qual'
        right_seq_key = 'right_seq'
        right_qual_key = 'right_qual'

    for trimmed_read in trimmed_reads:
        annotation = Annotation.from_identifier(trimmed_read.name)
        name = trimmed_read.name
        seq = annotation[left_seq_key] + trimmed_read.seq + annotation[right_seq_key]
        qual = annotation[left_qual_key] + trimmed_read.qual + annotation[right_qual_key]
        read = fastq.Read(name, seq, qual)
        yield read
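# Because trim stores everything it removes in each read's annotation,
# untrim_reads should recover the original sequences. A minimal sanity check,
# assuming reads is a list of fastq.Read objects (check_round_trip is an
# illustrative name, not part of the original code):
def check_round_trip(reads):
    trimmed = trim(iter(reads), find_start=lambda seq: 2)
    for original, untrimmed in zip(reads, untrim_reads(trimmed)):
        # Quals may differ where sanitize_qual rewrote them; seq must match.
        assert untrimmed.seq == original.seq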
def align_reads(target_fasta_fn,
                reads,
                bam_fn,
                min_path_length=15,
                error_fn='/dev/null',
                alignment_type='overlap',
               ):
    ''' Aligns reads to targets in target_fasta_fn by Smith-Waterman, storing
    alignments in bam_fn and yielding unaligned reads.
    '''
    targets = {r.name: r.seq for r in fasta.reads(target_fasta_fn)}

    target_names = sorted(targets)
    target_lengths = [len(targets[n]) for n in target_names]
    alignment_sorter = sam.AlignmentSorter(target_names,
                                           target_lengths,
                                           bam_fn,
                                          )
    statistics = Counter()

    with alignment_sorter:
        for original_read in reads:
            statistics['input'] += 1

            alignments = []

            rc_read = fastq.Read(original_read.name,
                                 utilities.reverse_complement(original_read.seq),
                                 original_read.qual[::-1],
                                )
            for read, is_reverse in ([original_read, False], [rc_read, True]):
                qual = fastq.decode_sanger(read.qual)
                for target_name, target_seq in targets.items():
                    alignment = generate_alignments(read.seq, target_seq, alignment_type)[0]
                    path = alignment['path']
                    if len(path) >= min_path_length and alignment['score'] / (2. * len(path)) > 0.8:
                        aligned_segment = pysam.AlignedSegment()
                        aligned_segment.seq = read.seq
                        aligned_segment.query_qualities = qual
                        aligned_segment.is_reverse = is_reverse

                        char_pairs = make_char_pairs(path, read.seq, target_seq)

                        cigar = sam.aligned_pairs_to_cigar(char_pairs)
                        clip_from_start = first_query_index(path)
                        if clip_from_start > 0:
                            cigar = [(sam.BAM_CSOFT_CLIP, clip_from_start)] + cigar
                        clip_from_end = len(read.seq) - 1 - last_query_index(path)
                        if clip_from_end > 0:
                            cigar = cigar + [(sam.BAM_CSOFT_CLIP, clip_from_end)]
                        aligned_segment.cigar = cigar

                        read_aligned, ref_aligned = zip(*char_pairs)
                        md = sam.alignment_to_MD_string(ref_aligned, read_aligned)
                        aligned_segment.set_tag('MD', md)

                        aligned_segment.set_tag('AS', alignment['score'])
                        aligned_segment.tid = alignment_sorter.get_tid(target_name)
                        aligned_segment.query_name = read.name
                        aligned_segment.next_reference_id = -1
                        aligned_segment.reference_start = first_target_index(path)

                        alignments.append(aligned_segment)

            if alignments:
                statistics['aligned'] += 1

                sorted_alignments = sorted(alignments,
                                           key=lambda m: m.get_tag('AS'),
                                           reverse=True,
                                          )
                grouped = utilities.group_by(sorted_alignments,
                                             key=lambda m: m.get_tag('AS'),
                                            )
                _, highest_group = next(grouped)
                primary_already_assigned = False
                for alignment in highest_group:
                    if len(highest_group) == 1:
                        alignment.mapping_quality = 2
                    else:
                        alignment.mapping_quality = 1

                    if not primary_already_assigned:
                        primary_already_assigned = True
                    else:
                        alignment.is_secondary = True

                    alignment_sorter.write(alignment)
            else:
                statistics['unaligned'] += 1
                yield original_read

    with open(error_fn, 'w') as error_fh:
        for key in ['input', 'aligned', 'unaligned']:
            error_fh.write('{0}: {1:,}\n'.format(key, statistics[key]))
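# align_reads is a generator, so nothing is aligned or written to bam_fn until
# it is consumed. A hedged sketch of running it to completion while keeping
# the unaligned remainder; align_and_collect is an illustrative name.
def align_and_collect(target_fasta_fn, reads, bam_fn):
    unaligned = list(align_reads(target_fasta_fn, reads, bam_fn))
    return unaligned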
overlap_R2_seq = R2_rc_seq[overlap_R2_slice]
overlap_R2_qual = R2_rc_qual[overlap_R2_slice]

just_R2_slice = slice(len(overlap_R1_seq), None)
just_R2_seq = R2_rc_seq[just_R2_slice]
just_R2_qual = R2_rc_qual[just_R2_slice]

# At each overlapping position, keep the base call with the higher quality.
overlap_seq = []
overlap_qual = []
for R1_s, R1_q, R2_s, R2_q in zip(overlap_R1_seq,
                                  overlap_R1_qual,
                                  overlap_R2_seq,
                                  overlap_R2_qual,
                                 ):
    if R1_q > R2_q:
        s, q = R1_s, R1_q
    else:
        s, q = R2_s, R2_q
    overlap_seq.append(s)
    overlap_qual.append(q)

overlap_seq = ''.join(overlap_seq)
overlap_qual = ''.join(overlap_qual)

seq = just_R1_seq + overlap_seq + just_R2_seq
qual = just_R1_qual + overlap_qual + just_R2_qual

output_fh.write(str(fastq.Read(R1.name, seq, qual)))
def get_R2_rc_reads():
    read_pairs = islice(get_read_pairs(), 100)
    return (fastq.Read(R2.name,
                       utilities.reverse_complement(R2.seq),
                       R2.qual[::-1],
                      )
            for R1, R2 in read_pairs)
def trim_reads(self, read_pairs):
    total_reads = 0
    long_enough_reads = 0
    trimmed_lengths = Counter()
    barcodes = Counter()

    truncated_in_R1 = self.adapter_in_R1[1:]
    truncated_in_R2 = self.adapter_in_R2[1:]

    for R1, R2 in read_pairs:
        total_reads += 1
        barcodes[R2.seq[:len(self.barcode)]] += 1

        # Check for the weird case in which the expected overhang base
        # doesn't exist in primer dimers.
        R1_dimer_distance = adapters.adapter_hamming_distance(R1.seq,
                                                              truncated_in_R1,
                                                              len(R1.seq),
                                                              len(truncated_in_R1),
                                                              len(self.barcode),
                                                             )
        R2_dimer_distance = adapters.adapter_hamming_distance(R2.seq,
                                                              truncated_in_R2,
                                                              len(R2.seq),
                                                              len(truncated_in_R2),
                                                              len(self.barcode),
                                                             )
        if R1_dimer_distance <= 3 and R2_dimer_distance <= 3:
            position = len(self.barcode)
        else:
            position = adapters.consistent_paired_position(R1.seq,
                                                           R2.seq,
                                                           self.adapter_in_R1,
                                                           self.adapter_in_R2,
                                                           19,
                                                           3,
                                                          )
        if position is not None:
            trimmed_lengths[position] += 1
            if position - len(self.barcode) < 12:
                continue
        else:
            position = len(R1.seq)

        long_enough_reads += 1

        payload_slice = slice(len(self.barcode), position)
        processed_R1 = fastq.Read(R1.name, R1.seq[payload_slice], R1.qual[payload_slice])
        processed_R2 = fastq.Read(R2.name, R2.seq[payload_slice], R2.qual[payload_slice])
        yield processed_R1, processed_R2

    trimmed_lengths = utilities.counts_to_array(trimmed_lengths)
    self.write_file('trimmed_lengths', trimmed_lengths)
    self.write_file('barcodes', barcodes)
    self.summary.extend([
        ('Total read pairs', total_reads),
        ('Long enough', long_enough_reads),
    ])
def find_boundary_sequences(R1, R2, counters):
    # Find which read in the read pair is from the reverse strand by looking
    # for common_right_reverse.
    # First, try to find a unique position entirely contained within R1 or R2
    # that is close to common_right_reverse.
    # Failing this, find the longest of (the longest suffix of R1 or R2 that
    # matches a prefix of common_right_reverse) or (the longest prefix of R1
    # or R2 that matches a suffix of common_right_reverse).
    R1_contained, R1_prefix, R1_suffix = all_adapter_possibilites(R1.seq, common_right_reverse)
    R2_contained, R2_prefix, R2_suffix = all_adapter_possibilites(R2.seq, common_right_reverse)

    if len(R1_contained) + len(R2_contained) > 1:
        # Only one occurrence of common_right_reverse should exist between R1
        # and R2.
        return None, None
    elif len(R1_contained) + len(R2_contained) == 0:
        possibilities = [
            (len(common_right_reverse) - R1_prefix, 'R1_prefix'),
            (len(common_right_reverse) - R2_prefix, 'R2_prefix'),
            (len(common_right_reverse) - R1_suffix, 'R1_suffix'),
            (len(common_right_reverse) - R2_suffix, 'R2_suffix'),
        ]
        length, kind = max(possibilities)
        if length > 5:
            if 'R1' in kind:
                reverse_read = R1
                forward_read = R2
                polyA_read = 'R2_forward'
                polyT_read = 'R1_reverse'
            elif 'R2' in kind:
                reverse_read = R2
                forward_read = R1
                polyA_read = 'R1_forward'
                polyT_read = 'R2_reverse'
            if 'prefix' in kind:
                common_right_reverse_start = len(reverse_read.seq) - length
            elif 'suffix' in kind:
                common_right_reverse_start = -length
        else:
            return None, None
    elif len(R1_contained) == 1:
        reverse_read = R1
        forward_read = R2
        polyA_read = 'R2_forward'
        polyT_read = 'R1_reverse'
        common_right_reverse_start = R1_contained.pop()
    elif len(R2_contained) == 1:
        reverse_read = R2
        forward_read = R1
        polyA_read = 'R1_forward'
        polyT_read = 'R2_reverse'
        common_right_reverse_start = R2_contained.pop()

    # '*' means that there was no opportunity to see this id.
    # 'X' means that there was an opportunity and it was neither A nor B.
    right_id = '*'
    left_id = '*'

    five_payload_slice = slice(None, max(0, common_right_reverse_start))
    five_payload_seq = utilities.reverse_complement(reverse_read.seq[five_payload_slice])
    five_payload_qual = reverse_read.qual[five_payload_slice][::-1]

    current_p = common_right_reverse_start + len(common_right_reverse)
    if current_p < len(reverse_read.seq) - after_right_length:
        right_id_seq = reverse_read.seq[current_p:current_p + after_right_length]
        for key, prefix in after_right_prefix.items():
            if right_id_seq == prefix:
                right_id = key
        if right_id == '*':
            right_id = 'X'
        counters['right_ids'][right_id_seq] += 1

    if right_id != 'X':
        current_p += len(after_right[right_id])
        if current_p < len(reverse_read.seq) - 4:
            left_id_seq = reverse_read.seq[current_p:current_p + 4]
            for key, sequence in after_left.items():
                if left_id_seq == sequence:
                    left_id = key
            if left_id == '*':
                left_id = 'X'
            counters['left_ids'][left_id_seq] += 1

    polyA_start, polyA_length = find_polyA_cython.find_polyA(forward_read.seq, 15)
    polyA_slice = slice(polyA_start, polyA_start + polyA_length)
    polyA_seq = forward_read.seq[polyA_slice]
    polyA_qual = fastq.sanitize_qual(forward_read.qual[polyA_slice])

    three_payload_slice = slice(None, polyA_start)
    three_payload_seq = forward_read.seq[three_payload_slice]
    three_payload_qual = forward_read.qual[three_payload_slice]

    common_name, _ = R1.name.rsplit(':', 1)
    control_ids_string = '{0}-{1}'.format(left_id, right_id)
    five_annotation = trim.PayloadAnnotation(original_name=common_name,
                                             left_seq=control_ids_string,
                                             left_qual='',
                                             right_seq='',
                                             right_qual='',
                                            )
    three_annotation = trim.PayloadAnnotation(original_name=common_name,
                                              left_seq=control_ids_string,
                                              left_qual='',
                                              right_seq=polyA_seq,
                                              right_qual=polyA_qual,
                                             )
    five_payload_read = fastq.Read(five_annotation.identifier, five_payload_seq, five_payload_qual)
    three_payload_read = fastq.Read(three_annotation.identifier, three_payload_seq, three_payload_qual)

    counters['positions'][polyT_read][max(0, common_right_reverse_start)] += 1
    counters['positions'][polyA_read][polyA_start] += 1
    counters['joint_lengths'][max(0, common_right_reverse_start), polyA_start] += 1
    counters['polyA_lengths'][polyA_length] += 1
    counters['control_ids'][control_ids_string] += 1

    if polyA_length < 13:
        return None, None

    return five_payload_read, three_payload_read
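# find_boundary_sequences expects `counters` to provide two-level counting for
# 'positions' and flat counting elsewhere. A sketch of a compatible structure,
# inferred from the usage above (the construction in the original code may
# differ; make_counters is an illustrative name):
from collections import Counter, defaultdict

def make_counters():
    return {
        'right_ids': Counter(),
        'left_ids': Counter(),
        'positions': defaultdict(Counter),
        'joint_lengths': Counter(),
        'polyA_lengths': Counter(),
        'control_ids': Counter(),
    }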
def trim_read(read):
    # num_to_trim is closed over from the enclosing scope.
    trimmed = fastq.Read(read.name,
                         read.seq[num_to_trim:],
                         read.qual[num_to_trim:],
                        )
    return trimmed
def extract_reads_from_combined(combined_mapping):
    R1_seq, R1_qual, R2_seq, R2_qual = extract_seqs_from_combined(combined_mapping,
                                                                  remove_soft_clipped=False,
                                                                 )
    R1 = fastq.Read(combined_mapping.qname, R1_seq, R1_qual)
    R2 = fastq.Read(combined_mapping.qname, R2_seq, R2_qual)
    return R1, R2