def __init__(self, **kwargs):
    ''' Run the parent initialiser, then derive the adapter sequence expected
    in R2 from the experiment's barcode and the paired-end R1 primer.
    '''
    super(WilkeningRNASeqExperiment, self).__init__(**kwargs)

    barcode_rc = utilities.reverse_complement(self.barcode)
    R1_primer_rc = utilities.reverse_complement(adapters.primers['PE']['R1'])

    # Only the first 19 bases of the concatenated sequence are kept.
    self.adapter_in_R2 = (barcode_rc + R1_primer_rc)[:19]
def unambiguously_trimmed(bam_fn, unambiguous_bam_fn, genome_dir):
    ''' Reads that have had poly-As trimmed may have had some real RPF A's
    trimmed as well. Retains only mapped reads for which the last aligned base
    and the following base in the reference are both non-A.

    bam_fn: path of the BAM file to filter.
    unambiguous_bam_fn: path of the BAM file to write; it is indexed once
        writing has finished.
    genome_dir: passed to genomes.load_entire_genome to obtain the reference
        sequences, looked up by reference name.
    '''
    genome = genomes.load_entire_genome(genome_dir)

    # Both handles are context-managed so that the input file is closed too.
    with pysam.Samfile(bam_fn) as bamfile:
        lengths = bamfile.lengths
        with pysam.Samfile(unambiguous_bam_fn, 'wb', header=bamfile.header) as unambiguous_bam_fh:
            for read in bamfile:
                if read.is_unmapped:
                    # Only mapped reads can be retained, and an unmapped read
                    # has no reference name or aligned positions to inspect.
                    continue

                rname = bamfile.getrname(read.tid)
                aligned_positions = read.positions

                if not read.is_reverse:
                    last_position = aligned_positions[-1]
                    if last_position == lengths[read.tid] - 1:
                        # There is no next base to get
                        continue
                    last_base, next_base = genome[rname][last_position:last_position + 2]
                else:
                    last_position = aligned_positions[0]
                    if last_position == 0:
                        # There is no next base to get
                        continue
                    # On the reverse strand the "following" base lies one
                    # position to the left, so fetch that pair and flip it.
                    last_base, next_base = utilities.reverse_complement(
                        genome[rname][last_position - 1:last_position + 1])

                if last_base.upper() != 'A' and next_base.upper() != 'A':
                    unambiguous_bam_fh.write(read)

    # Index only after the output file has been closed.
    pysam.index(unambiguous_bam_fn)
def unambiguously_trimmed(bam_fn, unambiguous_bam_fn, genome_dir):
    ''' Reads that have had poly-As trimmed may have had some real RPF A's
    trimmed as well. Retains only mapped reads for which the last aligned base
    and the following base in the reference are both non-A.

    bam_fn: path of the BAM file to filter.
    unambiguous_bam_fn: path of the BAM file to write; it is indexed once
        writing has finished.
    genome_dir: passed to genomes.load_entire_genome to obtain the reference
        sequences, looked up by reference name.
    '''
    genome = genomes.load_entire_genome(genome_dir)

    # Both handles are context-managed so that the input file is closed too.
    with pysam.Samfile(bam_fn) as bamfile:
        lengths = bamfile.lengths
        with pysam.Samfile(unambiguous_bam_fn, 'wb', header=bamfile.header) as unambiguous_bam_fh:
            for read in bamfile:
                if read.is_unmapped:
                    # Only mapped reads can be retained, and an unmapped read
                    # has no reference name or aligned positions to inspect.
                    continue

                rname = bamfile.getrname(read.tid)
                aligned_positions = read.positions

                if not read.is_reverse:
                    last_position = aligned_positions[-1]
                    if last_position == lengths[read.tid] - 1:
                        # There is no next base to get
                        continue
                    last_base, next_base = genome[rname][last_position:last_position + 2]
                else:
                    last_position = aligned_positions[0]
                    if last_position == 0:
                        # There is no next base to get
                        continue
                    # On the reverse strand the "following" base lies one
                    # position to the left, so fetch that pair and flip it.
                    last_base, next_base = utilities.reverse_complement(
                        genome[rname][last_position - 1:last_position + 1])

                if last_base.upper() != 'A' and next_base.upper() != 'A':
                    unambiguous_bam_fh.write(read)

    # Index only after the output file has been closed.
    pysam.index(unambiguous_bam_fn)
def build_adapter_ranges(index_sequence, primer_type='tru_seq'):
    ''' Lay out the adapter segments expected in each read and pair every
    segment name with its (start, end) bounds.

    Bounds are cumulative sums of the segment lengths, so each segment starts
    where the previous one ends. Returns (R1_ranges, R2_ranges).
    '''
    def make_ranges(construct, names):
        ends = list(np.cumsum([len(piece) for piece in construct]))
        starts = [0] + ends
        # zip stops at the shorter argument, dropping the surplus final start.
        return zip(names, zip(starts, ends))

    # Each read runs into the reverse complement of the other read's primer.
    primer_in_R1 = utilities.reverse_complement(primers[primer_type]['R2'])
    primer_in_R2 = utilities.reverse_complement(primers[primer_type]['R1'])

    R1_names = ['R2 primer', 'I7', 'P7', 'A tail']
    R1_construct = [primer_in_R1, index_sequence, P7_rc, A_tail]

    chemistry_only_cycles = 7
    I5_length = 8
    tail_length = I5_length + chemistry_only_cycles

    # NOTE(review): the fourth R2 segment is P5_rc but is labelled 'P7' -
    # confirm whether the label is intentional before relying on it.
    R2_names = ['R1 primer', 'I5', 'Chemistry', 'P7', 'A tail']
    R2_construct = [
        primer_in_R2[:-tail_length],
        primer_in_R2[-tail_length:-chemistry_only_cycles],
        primer_in_R2[-chemistry_only_cycles:],
        P5_rc,
        A_tail,
    ]

    return make_ranges(R1_construct, R1_names), make_ranges(R2_construct, R2_names)
def build_before_adapters(I7_sequence='', primer_type='tru_seq', just_primers=False):
    ''' Build the sequences that sit in front of R1 and R2.

    With just_primers set, these are the bare primers for primer_type.
    Otherwise the flow cell sequence is prepended to each primer, with the
    reverse complement of I7_sequence placed between P7 and the R2 primer.
    Returns (before_R1, before_R2).
    '''
    R1_primer = primers[primer_type]['R1']
    R2_primer = primers[primer_type]['R2']

    if just_primers:
        return R1_primer, R2_primer

    before_R1 = flow_cell['P5'] + R1_primer
    index_rc = utilities.reverse_complement(I7_sequence)
    before_R2 = flow_cell['P7'] + index_rc + R2_primer
    return before_R1, before_R2
def get_seq_info_pairs(clean_bam_fn):
    ''' Yield (seq, perfect_and_unique) for every mapped, non-secondary read
    in clean_bam_fn.

    seq is given in the orientation of the original read: reads flagged as
    reverse are reverse complemented. perfect_and_unique is True when the
    read's NM tag is 0 and its mapping quality is exactly 50.

    Raises KeyError if a retained read has no NM tag.
    '''
    clean_bam_file = pysam.Samfile(clean_bam_fn)
    try:
        for aligned_read in clean_bam_file:
            if aligned_read.is_unmapped or aligned_read.is_secondary:
                continue

            nm = dict(aligned_read.tags)['NM']
            perfect_and_unique = nm == 0 and aligned_read.mapq == 50

            if aligned_read.is_reverse:
                seq = utilities.reverse_complement(aligned_read.seq)
            else:
                seq = aligned_read.seq

            yield seq, perfect_and_unique
    finally:
        # Runs when the generator is exhausted, closed or garbage collected,
        # so the BAM handle is never leaked.
        clean_bam_file.close()
def build_adapter_ranges(index_sequence, primer_type='tru_seq'):
    ''' Lay out the adapter segments expected in each read and pair every
    segment name with its (start, end) bounds.

    Bounds are cumulative sums of the segment lengths, so each segment starts
    where the previous one ends. Returns (R1_ranges, R2_ranges).
    '''
    def make_ranges(construct, names):
        ends = list(np.cumsum([len(piece) for piece in construct]))
        starts = [0] + ends
        # zip stops at the shorter argument, dropping the surplus final start.
        return zip(names, zip(starts, ends))

    # Each read runs into the reverse complement of the other read's primer.
    primer_in_R1 = utilities.reverse_complement(primers[primer_type]['R2'])
    primer_in_R2 = utilities.reverse_complement(primers[primer_type]['R1'])

    R1_names = ['R2 primer', 'I7', 'P7', 'A tail']
    R1_construct = [primer_in_R1, index_sequence, P7_rc, A_tail]

    chemistry_only_cycles = 7
    I5_length = 8
    tail_length = I5_length + chemistry_only_cycles

    # NOTE(review): the fourth R2 segment is P5_rc but is labelled 'P7' -
    # confirm whether the label is intentional before relying on it.
    R2_names = ['R1 primer', 'I5', 'Chemistry', 'P7', 'A tail']
    R2_construct = [
        primer_in_R2[:-tail_length],
        primer_in_R2[-tail_length:-chemistry_only_cycles],
        primer_in_R2[-chemistry_only_cycles:],
        P5_rc,
        A_tail,
    ]

    return make_ranges(R1_construct, R1_names), make_ranges(R2_construct, R2_names)
def get_seq_info_pairs(clean_bam_fn):
    ''' Yield (seq, perfect_and_unique) for every mapped, non-secondary read
    in clean_bam_fn.

    seq is given in the orientation of the original read: reads flagged as
    reverse are reverse complemented. perfect_and_unique is True when the
    read's NM tag is 0 and its mapping quality is exactly 50.

    Raises KeyError if a retained read has no NM tag.
    '''
    clean_bam_file = pysam.Samfile(clean_bam_fn)
    try:
        for aligned_read in clean_bam_file:
            if aligned_read.is_unmapped or aligned_read.is_secondary:
                continue

            nm = dict(aligned_read.tags)['NM']
            perfect_and_unique = nm == 0 and aligned_read.mapq == 50

            if aligned_read.is_reverse:
                seq = utilities.reverse_complement(aligned_read.seq)
            else:
                seq = aligned_read.seq

            yield seq, perfect_and_unique
    finally:
        # Runs when the generator is exhausted, closed or garbage collected,
        # so the BAM handle is never leaked.
        clean_bam_file.close()
def print_diagnostic(R1, R2, before_R1, before_R2, alignment, fh=sys.stdout):
    ''' Write a human-readable summary of an R1/R2 alignment to fh.

    Output, in order: the R1 name, both quality strings, a tab-separated line
    of (score, 2 * path length, score - 2 * path length), the path, the
    rendered local alignment, the insertions, the deletions, the sorted
    mismatches and finally the mismatching base pair at each mismatch.
    Adapter bases are lower-cased to set them apart from read bases.
    '''
    path = alignment['path']
    score = alignment['score']
    mismatches = sorted(alignment['mismatches'])

    extended_R1 = before_R1.lower() + R1.seq
    extended_R2 = utilities.reverse_complement(before_R2.lower() + R2.seq)

    for header_line in (R1.name, R1.qual, R2.qual):
        fh.write(header_line + '\n')

    doubled_length = len(path) * 2
    fh.write('{0}\t{1}\t{2}\n'.format(score, doubled_length, score - doubled_length))
    fh.write(str(path) + '\n')

    print_local_alignment(extended_R1, extended_R2, path, fh=fh)

    fh.write(str(alignment['insertions']) + '\n')
    fh.write(str(alignment['deletions']) + '\n')
    fh.write(str(mismatches) + '\n')
    for R1_index, R2_index in mismatches:
        fh.write('\t{0}\t{1}\n'.format(extended_R1[R1_index], extended_R2[R2_index]))
def print_diagnostic(R1, R2, before_R1, before_R2, alignment, fh=sys.stdout):
    ''' Write a human-readable summary of an R1/R2 alignment to fh.

    Output, in order: the R1 name, both quality strings, a tab-separated line
    of (score, .2 * path length, score - 2 * path length), the path, the
    rendered local alignment, the insertions, the deletions, the sorted
    mismatches and finally the mismatching base pair at each mismatch.
    Adapter bases are lower-cased to set them apart from read bases.
    '''
    path = alignment['path']
    score = alignment['score']
    mismatches = sorted(alignment['mismatches'])

    extended_R1 = before_R1.lower() + R1.seq
    extended_R2 = utilities.reverse_complement(before_R2.lower() + R2.seq)

    for header_line in (R1.name, R1.qual, R2.qual):
        fh.write(header_line + '\n')

    path_length = len(path)
    summary = '{0}\t{1}\t{2}\n'.format(score, path_length * .2, score - path_length * 2)
    fh.write(summary)
    fh.write(str(path) + '\n')

    print_local_alignment(extended_R1, extended_R2, path, fh=fh)

    fh.write(str(alignment['insertions']) + '\n')
    fh.write(str(alignment['deletions']) + '\n')
    fh.write(str(mismatches) + '\n')
    for R1_index, R2_index in mismatches:
        fh.write('\t{0}\t{1}\n'.format(extended_R1[R1_index], extended_R2[R2_index]))
def get_edge_alignments(read, targets):
    ''' Align read, in both orientations, against every target using the
    'unpaired_adapter' alignment type with a minimum score of 12.

    Keeps only alignments whose score is at least twice their path length,
    annotating each with the query used, the target name and the orientation.
    Returns the kept alignments as a list.
    '''
    min_score = 12
    orientations = [
        (read.seq, False),
        (utilities.reverse_complement(read.seq), True),
    ]

    kept = []
    for target in targets:
        for query, is_reverse in orientations:
            candidates = sw.generate_alignments(query,
                                                target.seq,
                                                'unpaired_adapter',
                                                min_score=min_score,
                                               )
            for candidate in candidates:
                if candidate['score'] < 2 * len(candidate['path']):
                    continue
                candidate['query'] = query
                candidate['rname'] = target.name
                candidate['is_reverse'] = is_reverse
                kept.append(candidate)

    return kept
def get_local_alignments(read, targets):
    ''' Align read, in both orientations, against every target using the
    'local' alignment type, requesting at most 3 alignments per query.

    The minimum score for a target is 20, or twice the target length if that
    is smaller. Keeps only alignments whose score is at least
    0.5 * 2 * (path length), annotating each with the query used, the target
    name and the orientation. Returns the kept alignments as a list.
    '''
    orientations = [
        (read.seq, False),
        (utilities.reverse_complement(read.seq), True),
    ]

    kept = []
    for target in targets:
        min_score = min(20, 2 * len(target.seq))
        for query, is_reverse in orientations:
            candidates = sw.generate_alignments(query,
                                                target.seq,
                                                'local',
                                                min_score=min_score,
                                                max_alignments=3,
                                               )
            for candidate in candidates:
                if candidate['score'] >= 0.5 * 2 * len(candidate['path']):
                    candidate['query'] = query
                    candidate['rname'] = target.name
                    candidate['is_reverse'] = is_reverse
                    kept.append(candidate)

    return kept
def get_extent_sequence(self, left_buffer=0, right_buffer=0):
    ''' Get the sequence of the extent. Useful for looking at gene with
    annotated frameshifts.

    Fetches the region spanning the smallest to the largest key of
    self.genomic_to_extent (inclusive), reverse complements it for '-' strand
    features, and wraps the characters in a positions.PositionCounts with
    'start' at 0 and 'end' at self.extent_length.
    '''
    first_position = min(self.genomic_to_extent)
    last_position = max(self.genomic_to_extent)

    # The fetcher is given last_position + 1 so the last position is included.
    sequence = self.region_fetcher(self.seqname, first_position, last_position + 1)
    if self.strand == '-':
        sequence = utilities.reverse_complement(sequence)

    characters = np.asarray(sequence, dtype='c')
    landmarks = {'start': 0, 'end': self.extent_length}

    return positions.PositionCounts(landmarks,
                                    left_buffer,
                                    right_buffer,
                                    data=characters,
                                   )
def get_edge_alignments(read, targets):
    ''' Align read, in both orientations, against every target using the
    'unpaired_adapter' alignment type with a minimum score of 10.

    Keeps only alignments whose score is at least twice their path length,
    annotating each with the query used, the target name and the orientation.
    Returns the kept alignments as a list.
    '''
    min_score = 10
    orientations = [
        (read.seq, False),
        (utilities.reverse_complement(read.seq), True),
    ]

    kept = []
    for target in targets:
        for query, is_reverse in orientations:
            candidates = sw.generate_alignments(query,
                                                target.seq,
                                                'unpaired_adapter',
                                                min_score=min_score,
                                               )
            for candidate in candidates:
                if candidate['score'] < 2 * len(candidate['path']):
                    continue
                candidate['query'] = query
                candidate['rname'] = target.name
                candidate['is_reverse'] = is_reverse
                kept.append(candidate)

    return kept
def get_amino_acid_locations(gene, genome):
    ''' Translate the coding sequence of gene, read from genome.

    Returns None for genes on 'MT' and for genes whose sequence fails to
    translate as a complete CDS; in the latter case the error and the gene's
    details are printed.

    NOTE(review): in the portion visible here, translation and
    amino_acid_locations are assigned but never used or returned - the rest of
    this function may be missing; confirm against the full file.
    '''
    amino_acid_locations = defaultdict(list)
    if gene.seqname == 'MT':
        # Ignore these for now - different genetic code and tRNAs presumably
        # means different translation dynamics
        return None
    try:
        if gene.strand == '+':
            # gene.end is the last base before the stop codon
            # (so the slice up to gene.end + 4 takes in the stop codon)
            seq = genome[gene.seqname][gene.start:gene.end + 4]
            translation = Bio.Seq.translate(seq, cds=True)
        elif gene.strand == '-':
            # gene.start is the first base after the (rc of the) stop codon
            # gene.end is the last base of the (rc of the) start codon
            rc_seq = genome[gene.seqname][gene.start - 3:gene.end + 1]
            seq = utilities.reverse_complement(rc_seq)
            translation = Bio.Seq.translate(seq, cds=True)
    except Bio.Seq.CodonTable.TranslationError, err:
        # Report which gene failed to translate, then give up on it.
        print err
        print gene.source, gene.feature, gene.seqname
        print gene.attribute
        return None
def get_local_alignments(read, targets):
    ''' Align read, in both orientations, against every target using the
    'local' alignment type, requesting at most 3 alignments per query.

    The minimum score for a target is 20, or twice the target length if that
    is smaller. Keeps only alignments whose score is at least
    0.7 * 2 * (path length), annotating each with the query used, the target
    name and the orientation. Returns the kept alignments as a list.
    '''
    orientations = [
        (read.seq, False),
        (utilities.reverse_complement(read.seq), True),
    ]

    kept = []
    for target in targets:
        min_score = min(20, 2 * len(target.seq))
        for query, is_reverse in orientations:
            candidates = sw.generate_alignments(query,
                                                target.seq,
                                                'local',
                                                min_score=min_score,
                                                max_alignments=3,
                                               )
            for candidate in candidates:
                if candidate['score'] >= 0.7 * 2 * len(candidate['path']):
                    candidate['query'] = query
                    candidate['rname'] = target.name
                    candidate['is_reverse'] = is_reverse
                    kept.append(candidate)

    return kept
def get_extent_sequence(self, left_buffer=0, right_buffer=0):
    ''' Get the sequence of the extent. Useful for looking at gene with
    annotated frameshifts.

    Fetches the region spanning the smallest to the largest key of
    self.genomic_to_extent (inclusive), reverse complements it for '-' strand
    features, and wraps the characters in a positions.PositionCounts with
    'start' at 0 and 'end' at self.extent_length.
    '''
    first_position = min(self.genomic_to_extent)
    last_position = max(self.genomic_to_extent)

    # The fetcher is given last_position + 1 so the last position is included.
    sequence = self.region_fetcher(self.seqname, first_position, last_position + 1)
    if self.strand == '-':
        sequence = utilities.reverse_complement(sequence)

    characters = np.asarray(sequence, dtype='c')
    landmarks = {'start': 0, 'end': self.extent_length}

    return positions.PositionCounts(landmarks,
                                    left_buffer,
                                    right_buffer,
                                    data=characters,
                                   )
def infer_insert_length(R1, R2, before_R1, before_R2):
    ''' Infer the length of the insert represented by R1 and R2 by performing
    a semi-local alignment of R1 and the reverse complement of R2 with the
    expected adapter sequences prepended to each read.

    Returns (status, insert_length, alignment). status is 'good' or 'bad'
    depending on how far the alignment score falls below its maximum, or
    'illegal' for non-physical alignments, in which case insert_length is 500
    and alignment is -1.

    Raises ValueError if a 'good' alignment implies different insert lengths
    when measured from R1 and from R2.
    '''
    extended_R1 = before_R1 + R1.seq
    extended_R2 = utilities.reverse_complement(before_R2 + R2.seq)
    alignment, = generate_alignments(extended_R1,
                                     extended_R2,
                                     'overlap',
                                     2,
                                     -1,
                                     -5,
                                     1,
                                     0,
                                    )

    path = alignment['path']
    insertions = alignment['insertions']
    deletions = alignment['deletions']

    R1_start = len(before_R1)
    R2_start = len(R2.seq) - 1
    R1_start_in_R2 = alignment['query_mappings'][R1_start]
    R2_start_in_R1 = alignment['target_mappings'][R2_start]

    # Since R1 is the query and R2 is the target, bases in R1 that aren't in
    # R2 are called insertions, and bases in R2 that aren't in R1 are called
    # deletions.
    # An indel in the insert is non-physical.
    if R2_start_in_R1 != SOFT_CLIPPED:
        illegal_insertion = any(R1_start <= i <= R2_start_in_R1 for i in insertions)
    else:
        illegal_insertion = any(R1_start <= i for i in insertions)

    if R1_start_in_R2 != SOFT_CLIPPED:
        illegal_deletion = any(R1_start_in_R2 <= d <= R2_start for d in deletions)
    else:
        illegal_deletion = any(d <= R2_start for d in deletions)

    if illegal_insertion or illegal_deletion:
        return 'illegal', 500, -1

    if len(path) == 0:
        # An empty path has no first or last aligned pair to measure from;
        # without this guard the branch below fails with an IndexError.
        return 'illegal', 500, -1

    if R1_start_in_R2 != SOFT_CLIPPED and R2_start_in_R1 != SOFT_CLIPPED:
        length_from_R1 = R2_start_in_R1 - R1_start + 1
        length_from_R2 = R2_start - R1_start_in_R2 + 1
    else:
        # overlap alignment forces the alignment to start with either the
        # beginning of R1 or R2 and end with either the end of R1 or R2.
        # Making it to this else branch means that either the first base of R1
        # or the first base of R2 or both wasn't aligned. This either means
        # that the insert is longer than the read length or a pathological
        # alignment has been produced in which only adapter bases are involved
        # in the alignment. Flag the second case as illegal.
        first_R1_index, first_R2_index = path[0]
        length_from_R1 = (first_R1_index - R1_start + 1) + (len(R2.seq) - 1)

        last_R1_index, last_R2_index = path[-1]
        length_from_R2 = (R2_start - last_R2_index + 1) + (len(R1.seq) - 1)

        if first_R1_index == 0 or last_R2_index == 0:
            return 'illegal', 500, -1

    if length_from_R1 < -1 or length_from_R2 < -1:
        # Negative insert lengths are non-physical. Even though I don't
        # understand it, -1 is relatively common so is tolerated.
        return 'illegal', 500, -1

    insert_length = length_from_R1

    # 2 * len(path) is the score of a path made up entirely of matches.
    if 2 * len(path) - alignment['score'] > .2 * len(path):
        status = 'bad'
    else:
        status = 'good'

    if status == 'good' and (length_from_R1 != length_from_R2):
        # Single-argument print calls produce the same output whether print is
        # a statement or a function.
        print('length from R1 {0}'.format(length_from_R1))
        print('length from R2 {0}'.format(length_from_R2))
        print_diagnostic(R1, R2, before_R1, before_R2, alignment)
        # This shouldn't be possible without an illegal indel.
        raise ValueError('insert lengths inferred from R1 and R2 disagree')

    return status, insert_length, alignment
def extract_seqs_from_combined(combined_mapping,
                               include_overlap=True,
                               remove_soft_clipped=True,
                               flip_if_reverse=True,
                              ):
    ''' Separates out the R1 and R2 seq and quals that went into a
    combined_mapping.

    The combined read is cut at the skip operation located by
    find_skip_index_in_combined into a left and a right part. Optionally,
    soft-clipped bases are removed from the outer ends, the overlap stored in
    the Xs/Xq tags is restored, and the part coming from the reverse strand is
    flipped back to read orientation.
    Returns (R1_seq, R1_qual, R2_seq, R2_qual).
    '''
    strand = sam.get_strand(combined_mapping)

    tags = dict(combined_mapping.tags)
    if 'Xs' not in tags:
        # No stored overlap: behave as though an empty one came from the left.
        tags.update({'Xs': '', 'Xq': '', 'Xw': 'left'})

    skip_index = find_skip_index_in_combined(combined_mapping)
    full_cigar = combined_mapping.cigar
    left_cigar = full_cigar[:skip_index]
    right_cigar = full_cigar[skip_index + 1:]

    left_length = sam.total_read_nucs(left_cigar)
    left_seq = combined_mapping.seq[:left_length]
    left_qual = combined_mapping.qual[:left_length]
    right_seq = combined_mapping.seq[left_length:]
    right_qual = combined_mapping.qual[left_length:]

    if remove_soft_clipped:
        leading_op, leading_length = left_cigar[0]
        if leading_op == sam.BAM_CSOFT_CLIP:
            left_seq = left_seq[leading_length:]
            left_qual = left_qual[leading_length:]

        trailing_op, trailing_length = right_cigar[-1]
        if trailing_op == sam.BAM_CSOFT_CLIP:
            right_seq = right_seq[:-trailing_length]
            right_qual = right_qual[:-trailing_length]

    if include_overlap:
        overlap_source = tags['Xw']
        if overlap_source == 'left':
            # Overlapping sequence in the combined read reflects that from the
            # left mapping, so the overlap from the right was stored in the Xs
            # and Xq tags.
            right_seq = tags['Xs'] + right_seq
            right_qual = tags['Xq'] + right_qual
        elif overlap_source == 'right':
            # Overlapping sequence in the combined read reflects that from the
            # right mapping, so the overlap from the left was stored in the Xs
            # and Xq tags.
            left_seq = left_seq + tags['Xs']
            left_qual = left_qual + tags['Xq']

    if strand == '+':
        R1_seq, R1_qual = left_seq, left_qual
        R2_seq, R2_qual = right_seq, right_qual
        if flip_if_reverse:
            R2_seq = utilities.reverse_complement(R2_seq)
            R2_qual = R2_qual[::-1]
    elif strand == '-':
        R1_seq, R1_qual = right_seq, right_qual
        R2_seq, R2_qual = left_seq, left_qual
        if flip_if_reverse:
            R1_seq = utilities.reverse_complement(R1_seq)
            R1_qual = R1_qual[::-1]

    return R1_seq, R1_qual, R2_seq, R2_qual
def find_boundary_sequences(R1, R2, counters):
    ''' Split a read pair into a five-prime payload read and a three-prime
    payload read.

    The read containing common_right_reverse is treated as the reverse read;
    what precedes that sequence, reverse complemented, becomes the five-prime
    payload. The other read is treated as the forward read; what precedes its
    poly-A becomes the three-prime payload.

    counters is updated in place with the observed id sequences, positions,
    poly-A lengths and control id strings.

    Returns (five_payload_read, three_payload_read), or (None, None) if the
    reverse read cannot be identified or the poly-A found is shorter than 13.

    NOTE(review): the nesting of the id checks below was reconstructed from
    flattened source - confirm against the original file.
    '''
    # Find which read in the read pair is from the reverse strand by looking for
    # common_right_reverse.
    # First try to find a unique position entirely contained within R1 or R2
    # that is close to common_right_reverse.
    # Failing this, find the longest of (the longest suffix of R1 or R2 that
    # matches a prefix of common_right_reverse) or (the longest prefix of R1 or
    # R2 that matches a suffix of common_right_reverse).
    R1_contained, R1_prefix, R1_suffix = all_adapter_possibilites(R1.seq, common_right_reverse)
    R2_contained, R2_prefix, R2_suffix = all_adapter_possibilites(R2.seq, common_right_reverse)

    if len(R1_contained) + len(R2_contained) > 1:
        # Only one occurrence of common_right_reverse should exist between R1
        # and R2.
        return None, None
    elif len(R1_contained) + len(R2_contained) == 0:
        # No full occurrence: fall back to the best partial match at a read
        # edge. max picks the largest first element, ties broken by the label.
        possiblities = [(len(common_right_reverse) - R1_prefix, 'R1_prefix'),
                        (len(common_right_reverse) - R2_prefix, 'R2_prefix'),
                        (len(common_right_reverse) - R1_suffix, 'R1_suffix'),
                        (len(common_right_reverse) - R2_suffix, 'R2_suffix'),
                       ]
        length, kind = max(possiblities)
        if length > 5:
            if 'R1' in kind:
                reverse_read = R1
                forward_read = R2
                polyA_read = 'R2_forward'
                polyT_read = 'R1_reverse'
            elif 'R2' in kind:
                reverse_read = R2
                forward_read = R1
                polyA_read = 'R1_forward'
                polyT_read = 'R2_reverse'

            if 'prefix' in kind:
                common_right_reverse_start = len(reverse_read.seq) - length
            elif 'suffix' in kind:
                # A negative start is clamped to 0 wherever it is used as a
                # slice bound or a counter key below.
                common_right_reverse_start = -length
        else:
            return None, None
    elif len(R1_contained) == 1:
        reverse_read = R1
        forward_read = R2
        polyA_read = 'R2_forward'
        polyT_read = 'R1_reverse'
        common_right_reverse_start = R1_contained.pop()
    elif len(R2_contained) == 1:
        reverse_read = R2
        forward_read = R1
        polyA_read = 'R1_forward'
        polyT_read = 'R2_reverse'
        common_right_reverse_start = R2_contained.pop()

    # '*' means that there was no opportunity to see this id.
    # 'X' means that there was an opportunity and it was neither A nor B.
    right_id = '*'
    left_id = '*'

    # Everything before common_right_reverse in the reverse read is payload;
    # it is flipped so that it reads in the forward orientation.
    five_payload_slice = slice(None, max(0, common_right_reverse_start))
    five_payload_seq = utilities.reverse_complement(reverse_read.seq[five_payload_slice])
    five_payload_qual = reverse_read.qual[five_payload_slice][::-1]

    # current_p walks along the reverse read past common_right_reverse.
    current_p = common_right_reverse_start + len(common_right_reverse)
    if current_p < len(reverse_read.seq) - after_right_length:
        right_id_seq = reverse_read.seq[current_p:current_p + after_right_length]
        for key, prefix in after_right_prefix.items():
            if right_id_seq == prefix:
                right_id = key
        if right_id == '*':
            right_id = 'X'
        counters['right_ids'][right_id_seq] += 1
        if right_id != 'X':
            # The left id can only be located once the right id is known,
            # since it sits after the full after_right sequence for that id.
            current_p += len(after_right[right_id])
            if current_p < len(reverse_read.seq) - 4:
                left_id_seq = reverse_read.seq[current_p:current_p + 4]
                for key, sequence in after_left.items():
                    if left_id_seq == sequence:
                        left_id = key
                if left_id == '*':
                    left_id = 'X'
                counters['left_ids'][left_id_seq] += 1

    polyA_start, polyA_length = find_polyA_cython.find_polyA(forward_read.seq, 15)
    polyA_slice = slice(polyA_start, polyA_start + polyA_length)
    polyA_seq = forward_read.seq[polyA_slice]
    polyA_qual = fastq.sanitize_qual(forward_read.qual[polyA_slice])

    # Everything before the poly-A in the forward read is payload.
    three_payload_slice = slice(None, polyA_start)
    three_payload_seq = forward_read.seq[three_payload_slice]
    three_payload_qual = forward_read.qual[three_payload_slice]

    # Drop whatever follows the last ':' of the R1 name to get a name common
    # to both payload reads.
    common_name, _ = R1.name.rsplit(':', 1)
    control_ids_string = '{0}-{1}'.format(left_id, right_id)

    five_annotation = trim.PayloadAnnotation(original_name=common_name,
                                             left_seq=control_ids_string,
                                             left_qual='',
                                             right_seq='',
                                             right_qual='',
                                            )
    # Only the three-prime annotation carries the trimmed poly-A.
    three_annotation = trim.PayloadAnnotation(original_name=common_name,
                                              left_seq=control_ids_string,
                                              left_qual='',
                                              right_seq=polyA_seq,
                                              right_qual=polyA_qual,
                                             )

    five_payload_read = fastq.Read(five_annotation.identifier, five_payload_seq, five_payload_qual)
    three_payload_read = fastq.Read(three_annotation.identifier, three_payload_seq, three_payload_qual)

    counters['positions'][polyT_read][max(0, common_right_reverse_start)] += 1
    counters['positions'][polyA_read][polyA_start] += 1
    counters['joint_lengths'][max(0, common_right_reverse_start), polyA_start] += 1
    counters['polyA_lengths'][polyA_length] += 1
    counters['control_ids'][control_ids_string] += 1

    # Pairs with a short poly-A are rejected, but only after they have been
    # included in the counters above.
    if polyA_length < 13:
        return None, None

    return five_payload_read, three_payload_read
def build_adapters(index_sequence='', max_length=None, primer_type='tru_seq'):
    ''' Build the adapter sequences expected in R1 and in R2.

    Each adapter is the reverse complement of the sequence that precedes the
    other read, followed by A_tail, truncated to max_length bases (untruncated
    when max_length is None). Returns (adapter_in_R1, adapter_in_R2).
    '''
    before_R1, before_R2 = build_before_adapters(index_sequence, primer_type)

    # R1 runs into what precedes R2, and vice versa.
    full_adapters = [
        utilities.reverse_complement(before) + A_tail
        for before in (before_R2, before_R1)
    ]
    adapter_in_R1, adapter_in_R2 = [adapter[:max_length] for adapter in full_adapters]

    return adapter_in_R1, adapter_in_R2
def __init__(self, **kwargs):
    ''' Run the parent initialiser, record the barcode given in kwargs, and
    derive the adapter sequence expected in R1 from the barcode and the
    paired-end R2 primer.
    '''
    super(ThreeTFillExperiment, self).__init__(**kwargs)

    self.barcode = kwargs['barcode']

    barcode_rc = utilities.reverse_complement(self.barcode)
    R2_primer_rc = utilities.reverse_complement(adapters.primers['PE']['R2'])

    # Only the first 19 bases of the concatenated sequence are kept.
    self.adapter_in_R1 = (barcode_rc + R2_primer_rc)[:19]
def align_reads(target_fasta_fn,
                reads,
                bam_fn,
                min_path_length=15,
                error_fn='/dev/null',
                alignment_type='overlap',
               ):
    ''' Aligns reads to targets in target_fasta_fn by Smith-Waterman, storing
    alignments in bam_fn and yielding unaligned reads.

    Each read is aligned in both orientations against every target. An
    alignment is accepted if its path has at least min_path_length pairs and
    its score exceeds 0.8 of the maximum for that path length. For an aligned
    read, only the alignments sharing the highest score are written: the first
    as primary, the others as secondary, with mapping quality 2 if there is a
    single best alignment and 1 otherwise.

    Unaligned reads are yielded unchanged, in their original orientation.

    This is a generator: nothing is written until it is consumed, and the
    input/aligned/unaligned counts are written to error_fn only once it has
    been exhausted.
    '''
    targets = {r.name: r.seq for r in fasta.reads(target_fasta_fn)}
    target_names = sorted(targets)
    target_lengths = [len(targets[n]) for n in target_names]
    alignment_sorter = sam.AlignmentSorter(target_names,
                                           target_lengths,
                                           bam_fn,
                                          )
    statistics = Counter()

    with alignment_sorter:
        for original_read in reads:
            statistics['input'] += 1

            rc_read = fastq.Read(original_read.name,
                                 utilities.reverse_complement(original_read.seq),
                                 original_read.qual[::-1],
                                )

            segments = []
            for read, is_reverse in ((original_read, False), (rc_read, True)):
                qual = fastq.decode_sanger(read.qual)

                for target_name, target_seq in targets.items():
                    alignment = generate_alignments(read.seq, target_seq, alignment_type)[0]
                    path = alignment['path']

                    if len(path) >= min_path_length and alignment['score'] / (2. * len(path)) > 0.8:
                        aligned_segment = pysam.AlignedSegment()
                        aligned_segment.seq = read.seq
                        aligned_segment.query_qualities = qual
                        aligned_segment.is_reverse = is_reverse

                        char_pairs = make_char_pairs(path, read.seq, target_seq)
                        cigar = sam.aligned_pairs_to_cigar(char_pairs)

                        # Query bases outside the path become soft clips.
                        clip_from_start = first_query_index(path)
                        if clip_from_start > 0:
                            cigar = [(sam.BAM_CSOFT_CLIP, clip_from_start)] + cigar

                        clip_from_end = len(read.seq) - 1 - last_query_index(path)
                        if clip_from_end > 0:
                            cigar = cigar + [(sam.BAM_CSOFT_CLIP, clip_from_end)]

                        aligned_segment.cigar = cigar

                        read_aligned, ref_aligned = zip(*char_pairs)
                        md = sam.alignment_to_MD_string(ref_aligned, read_aligned)
                        aligned_segment.set_tag('MD', md)
                        aligned_segment.set_tag('AS', alignment['score'])

                        aligned_segment.tid = alignment_sorter.get_tid(target_name)
                        aligned_segment.query_name = read.name
                        aligned_segment.next_reference_id = -1
                        aligned_segment.reference_start = first_target_index(path)

                        segments.append(aligned_segment)

            if segments:
                statistics['aligned'] += 1

                sorted_segments = sorted(segments,
                                         key=lambda m: m.get_tag('AS'),
                                         reverse=True,
                                        )
                grouped = utilities.group_by(sorted_segments,
                                             key=lambda m: m.get_tag('AS'),
                                            )
                # Sorted by descending score, so the first group is the best.
                _, highest_group = next(grouped)

                mapping_quality = 2 if len(highest_group) == 1 else 1
                for rank, segment in enumerate(highest_group):
                    segment.mapping_quality = mapping_quality
                    if rank > 0:
                        # Only the first of the tied best alignments is primary.
                        segment.is_secondary = True
                    alignment_sorter.write(segment)
            else:
                statistics['unaligned'] += 1
                # Yield the read as it was supplied; the inner loop variable
                # ends up bound to the reverse-complemented copy.
                yield original_read

    with open(error_fn, 'w') as error_fh:
        for key in ['input', 'aligned', 'unaligned']:
            error_fh.write('{0}: {1:,}\n'.format(key, statistics[key]))
open(args.bad_R2_fn, 'w') as bad_R2_fn: for R1, R2 in itertools.islice(read_pairs, 10000): if len(R1.seq) != len(R2.seq): bad_R1_fn.write(str(R1)) bad_R2_fn.write(str(R2)) continue status, insert_length, alignment = infer_insert_length( R1, R2, '', '') if status == 'bad': bad_R1_fn.write(str(R1)) bad_R2_fn.write(str(R2)) continue else: R2_rc_seq = utilities.reverse_complement(R2.seq) R2_rc_qual = R2.qual[::-1] just_R1_slice = slice(None, insert_length - len(R1.seq)) just_R1_seq = R1.seq[just_R1_slice] just_R1_qual = R1.qual[just_R1_slice] overlap_R1_slice = slice(insert_length - len(R1.seq), None) overlap_R1_seq = R1.seq[overlap_R1_slice] overlap_R1_qual = R1.seq[overlap_R1_slice] overlap_R2_slice = slice(None, len(overlap_R1_seq)) overlap_R2_seq = R2_rc_seq[overlap_R2_slice] overlap_R2_qual = R2_rc_qual[overlap_R2_slice] just_R2_slice = slice(len(overlap_R1_seq), None)
def infer_insert_length(R1, R2, before_R1, before_R2, solid=False):
    ''' Infer the length of the insert represented by R1 and R2 by performing
    a semi-local alignment of R1 and the reverse complement of R2 with the
    expected adapter sequences prepended to each read.

    Returns (status, insert_length, alignment). status is 'good' or 'bad'
    depending on how far the alignment score falls below its maximum, or
    'illegal' for non-physical alignments, in which case insert_length is 500
    and alignment is -1.

    When solid is true, a 'good' alignment whose insert lengths measured from
    R1 and from R2 disagree is still accepted provided it contains no indels;
    otherwise such a disagreement is reported as 'illegal'.
    '''
    extended_R1 = before_R1 + R1.seq
    extended_R2 = utilities.reverse_complement(before_R2 + R2.seq)
    alignment, = generate_alignments(extended_R1,
                                     extended_R2,
                                     'overlap',
                                     2,
                                     -1,
                                     -5,
                                     1,
                                     0,
                                    )

    path = alignment['path']
    insertions = alignment['insertions']
    deletions = alignment['deletions']

    R1_start = len(before_R1)
    R2_start = len(R2.seq) - 1
    R1_start_in_R2 = alignment['query_mappings'][R1_start]
    R2_start_in_R1 = alignment['target_mappings'][R2_start]

    # Since R1 is the query and R2 is the target, bases in R1 that aren't in
    # R2 are called insertions, and bases in R2 that aren't in R1 are called
    # deletions.
    # An indel in the insert is non-physical.
    if R2_start_in_R1 != SOFT_CLIPPED:
        illegal_insertion = any(R1_start <= i <= R2_start_in_R1 for i in insertions)
    else:
        illegal_insertion = any(R1_start <= i for i in insertions)

    if R1_start_in_R2 != SOFT_CLIPPED:
        illegal_deletion = any(R1_start_in_R2 <= d <= R2_start for d in deletions)
    else:
        illegal_deletion = any(d <= R2_start for d in deletions)

    if illegal_insertion or illegal_deletion:
        return 'illegal', 500, -1

    if len(path) == 0:
        # Nothing aligned, so there is no first or last pair to measure from.
        return 'illegal', 500, -1

    if R1_start_in_R2 != SOFT_CLIPPED and R2_start_in_R1 != SOFT_CLIPPED:
        length_from_R1 = R2_start_in_R1 - R1_start + 1
        length_from_R2 = R2_start - R1_start_in_R2 + 1
    else:
        # overlap alignment forces the alignment to start with either the
        # beginning of R1 or R2 and end with either the end of R1 or R2.
        # Making it to this else branch means that either the first base of R1
        # or the first base of R2 or both wasn't aligned. This either means
        # that the insert is longer than the read length or a pathological
        # alignment has been produced in which only adapter bases are involved
        # in the alignment. Flag the second case as illegal.
        # The path is known to be non-empty here, so indexing it is safe.
        first_R1_index, first_R2_index = path[0]
        length_from_R1 = (first_R1_index - R1_start + 1) + (len(R2.seq) - 1)

        last_R1_index, last_R2_index = path[-1]
        length_from_R2 = (R2_start - last_R2_index + 1) + (len(R1.seq) - 1)

        if first_R1_index == 0 or last_R2_index == 0:
            return 'illegal', 500, -1

    if length_from_R1 < -1 or length_from_R2 < -1:
        # Negative insert lengths are non-physical. Even though I don't
        # understand it, -1 is relatively common so is tolerated.
        return 'illegal', 500, -1

    insert_length = length_from_R1

    # 2 * len(path) is the score of a path made up entirely of matches.
    if 2 * len(path) - alignment['score'] > .2 * len(path):
        status = 'bad'
    else:
        status = 'good'

    if status == 'good' and (length_from_R1 != length_from_R2):
        has_indel = insertions or deletions
        if not (solid and not has_indel):
            # This shouldn't be possible without an illegal indel.
            return 'illegal', 500, -1

    return status, insert_length, alignment
def get_R2_rc_reads():
    ''' Return a lazy iterable over the first 100 read pairs from
    get_read_pairs, giving for each pair a fastq.Read that carries R2's name,
    the reverse complement of its sequence and its quality string reversed.
    '''
    first_pairs = islice(get_read_pairs(), 100)

    flipped_R2s = (
        fastq.Read(mate.name,
                   utilities.reverse_complement(mate.seq),
                   mate.qual[::-1],
                  )
        for _, mate in first_pairs
    )
    return flipped_R2s
def find_boundary_sequences(R1, R2, counters):
    ''' Split a read pair into a five-prime payload read and a three-prime
    payload read.

    The read containing common_right_reverse is treated as the reverse read;
    what precedes that sequence, reverse complemented, becomes the five-prime
    payload. The other read is treated as the forward read; what precedes its
    poly-A becomes the three-prime payload.

    counters is updated in place with the observed id sequences, positions,
    poly-A lengths and control id strings.

    Returns (five_payload_read, three_payload_read), or (None, None) if the
    reverse read cannot be identified or the poly-A found is shorter than 13.

    NOTE(review): the nesting of the id checks below was reconstructed from
    flattened source - confirm against the original file.
    '''
    # Find which read in the read pair is from the reverse strand by looking for
    # common_right_reverse.
    # First try to find a unique position entirely contained within R1 or R2
    # that is close to common_right_reverse.
    # Failing this, find the longest of (the longest suffix of R1 or R2 that
    # matches a prefix of common_right_reverse) or (the longest prefix of R1 or
    # R2 that matches a suffix of common_right_reverse).
    R1_contained, R1_prefix, R1_suffix = all_adapter_possibilites(
        R1.seq, common_right_reverse)
    R2_contained, R2_prefix, R2_suffix = all_adapter_possibilites(
        R2.seq, common_right_reverse)

    if len(R1_contained) + len(R2_contained) > 1:
        # Only one occurrence of common_right_reverse should exist between R1
        # and R2.
        return None, None
    elif len(R1_contained) + len(R2_contained) == 0:
        # No full occurrence: fall back to the best partial match at a read
        # edge. max picks the largest first element, ties broken by the label.
        possiblities = [
            (len(common_right_reverse) - R1_prefix, 'R1_prefix'),
            (len(common_right_reverse) - R2_prefix, 'R2_prefix'),
            (len(common_right_reverse) - R1_suffix, 'R1_suffix'),
            (len(common_right_reverse) - R2_suffix, 'R2_suffix'),
        ]
        length, kind = max(possiblities)
        if length > 5:
            if 'R1' in kind:
                reverse_read = R1
                forward_read = R2
                polyA_read = 'R2_forward'
                polyT_read = 'R1_reverse'
            elif 'R2' in kind:
                reverse_read = R2
                forward_read = R1
                polyA_read = 'R1_forward'
                polyT_read = 'R2_reverse'

            if 'prefix' in kind:
                common_right_reverse_start = len(reverse_read.seq) - length
            elif 'suffix' in kind:
                # A negative start is clamped to 0 wherever it is used as a
                # slice bound or a counter key below.
                common_right_reverse_start = -length
        else:
            return None, None
    elif len(R1_contained) == 1:
        reverse_read = R1
        forward_read = R2
        polyA_read = 'R2_forward'
        polyT_read = 'R1_reverse'
        common_right_reverse_start = R1_contained.pop()
    elif len(R2_contained) == 1:
        reverse_read = R2
        forward_read = R1
        polyA_read = 'R1_forward'
        polyT_read = 'R2_reverse'
        common_right_reverse_start = R2_contained.pop()

    # '*' means that there was no opportunity to see this id.
    # 'X' means that there was an opportunity and it was neither A nor B.
    right_id = '*'
    left_id = '*'

    # Everything before common_right_reverse in the reverse read is payload;
    # it is flipped so that it reads in the forward orientation.
    five_payload_slice = slice(None, max(0, common_right_reverse_start))
    five_payload_seq = utilities.reverse_complement(
        reverse_read.seq[five_payload_slice])
    five_payload_qual = reverse_read.qual[five_payload_slice][::-1]

    # current_p walks along the reverse read past common_right_reverse.
    current_p = common_right_reverse_start + len(common_right_reverse)
    if current_p < len(reverse_read.seq) - after_right_length:
        right_id_seq = reverse_read.seq[current_p:current_p +
                                        after_right_length]
        for key, prefix in after_right_prefix.items():
            if right_id_seq == prefix:
                right_id = key
        if right_id == '*':
            right_id = 'X'
        counters['right_ids'][right_id_seq] += 1
        if right_id != 'X':
            # The left id can only be located once the right id is known,
            # since it sits after the full after_right sequence for that id.
            current_p += len(after_right[right_id])
            if current_p < len(reverse_read.seq) - 4:
                left_id_seq = reverse_read.seq[current_p:current_p + 4]
                for key, sequence in after_left.items():
                    if left_id_seq == sequence:
                        left_id = key
                if left_id == '*':
                    left_id = 'X'
                counters['left_ids'][left_id_seq] += 1

    polyA_start, polyA_length = find_polyA_cython.find_polyA(
        forward_read.seq, 15)
    polyA_slice = slice(polyA_start, polyA_start + polyA_length)
    polyA_seq = forward_read.seq[polyA_slice]
    polyA_qual = fastq.sanitize_qual(forward_read.qual[polyA_slice])

    # Everything before the poly-A in the forward read is payload.
    three_payload_slice = slice(None, polyA_start)
    three_payload_seq = forward_read.seq[three_payload_slice]
    three_payload_qual = forward_read.qual[three_payload_slice]

    # Drop whatever follows the last ':' of the R1 name to get a name common
    # to both payload reads.
    common_name, _ = R1.name.rsplit(':', 1)
    control_ids_string = '{0}-{1}'.format(left_id, right_id)

    five_annotation = trim.PayloadAnnotation(
        original_name=common_name,
        left_seq=control_ids_string,
        left_qual='',
        right_seq='',
        right_qual='',
    )
    # Only the three-prime annotation carries the trimmed poly-A.
    three_annotation = trim.PayloadAnnotation(
        original_name=common_name,
        left_seq=control_ids_string,
        left_qual='',
        right_seq=polyA_seq,
        right_qual=polyA_qual,
    )

    five_payload_read = fastq.Read(five_annotation.identifier,
                                   five_payload_seq, five_payload_qual)
    three_payload_read = fastq.Read(three_annotation.identifier,
                                    three_payload_seq, three_payload_qual)

    counters['positions'][polyT_read][max(0,
                                          common_right_reverse_start)] += 1
    counters['positions'][polyA_read][polyA_start] += 1
    counters['joint_lengths'][max(0, common_right_reverse_start),
                              polyA_start] += 1
    counters['polyA_lengths'][polyA_length] += 1
    counters['control_ids'][control_ids_string] += 1

    # Pairs with a short poly-A are rejected, but only after they have been
    # included in the counters above.
    if polyA_length < 13:
        return None, None

    return five_payload_read, three_payload_read
def __init__(self, **kwargs):
    ''' Initializes the parent experiment with kwargs, then records the
        adapter sequence expected in R2: the first 19 bases of the reverse
        complement of the barcode followed by the reverse complement of
        the paired-end R1 primer.
    '''
    super(WilkeningRNASeqExperiment, self).__init__(**kwargs)

    barcode_rc = utilities.reverse_complement(self.barcode)
    primer_rc = utilities.reverse_complement(adapters.primers['PE']['R1'])

    # Only a 19 base prefix of the full adapter is kept.
    self.adapter_in_R2 = (barcode_rc + primer_rc)[:19]
def extend_polyA_end(mapping, region_fetcher, trimmed_twice=False):
    ''' Re-attaches the poly-A sequence that was trimmed from a read (and
        stored in its name) to the 3' end of its alignment.

        Trimmed bases that match A's in the reference immediately past the
        end of the alignment are added to the alignment itself (extending
        the final match block of the CIGAR); the remaining trimmed bases
        are appended as a soft-clipped block. The count of these remaining
        bases is recorded via set_nongenomic_length.

        mapping: aligned read (presumably a pysam AlignedRead - confirm
            against callers) whose qname encodes the trimmed sequence.
            Modified in place.
        region_fetcher: callable taking (tid, start, end) and returning
            the reference sequence for that interval.
        trimmed_twice: if True, qname is parsed as a
            TrimmedTwiceAnnotation and the re-trimmed right sequence is
            restored; otherwise qname is parsed as a PayloadAnnotation.

        Returns mapping. Unmapped reads are returned untouched.

        Raises ValueError if the CIGAR block at the end being extended is
        not an alignment match (op 0).
    '''
    if mapping.is_unmapped:
        return mapping

    if trimmed_twice:
        # Trailing poly-As were removed by the second trimming step.
        annotation = TrimmedTwiceAnnotation.from_identifier(mapping.qname)
        polyA_seq = annotation['retrimmed_right_seq']
        polyA_qual = annotation['retrimmed_right_qual']
        new_qname = PayloadAnnotation.from_prefix_identifier(mapping.qname).identifier
    else:
        annotation = PayloadAnnotation.from_identifier(mapping.qname)
        polyA_seq = annotation['right_seq']
        polyA_qual = annotation['right_qual']
        new_qname = '{0}_{1}_{2}'.format(annotation['original_name'],
                                         annotation['left_seq'],
                                         annotation['left_qual'],
                                        )

    num_trimmed = len(polyA_seq)

    # Fetch the reference bases that follow the 3' end of the read, in the
    # orientation of the read.
    if mapping.is_reverse:
        # Clamp at the start of the reference: a negative start would
        # either be rejected by the fetcher or be interpreted as an offset
        # from the end of the sequence.
        fetch_start = max(0, mapping.pos - num_trimmed)
        after = region_fetcher(mapping.tid, fetch_start, mapping.pos)
        after = utilities.reverse_complement(after)
    else:
        # Note: 'aend points to one past the last aligned residue'
        after = region_fetcher(mapping.tid, mapping.aend, mapping.aend + num_trimmed)

    # Count the run of A's at the start of the downstream sequence.
    # NOTE(review): the comparison is case-sensitive, so lowercase
    # reference bases never count as genomic A's - confirm this is intended.
    extra_genomic_As = 0
    for b in after:
        if b != 'A':
            break
        extra_genomic_As += 1

    nongenomic_length = num_trimmed - extra_genomic_As
    soft_clipped_length = len(polyA_seq[extra_genomic_As:])

    # Note: writing to mapping.seq destroys mapping.qual, so
    # mapping.qual needs to be retrieved before mapping.seq is assigned.
    if mapping.is_reverse:
        # The stored sequence is relative to the reference strand, so the
        # restored bases go on the left, reverse complemented.
        final_cigar_block_index = 0
        extended_seq = utilities.reverse_complement(polyA_seq) + mapping.seq
        extended_qual = polyA_qual[::-1] + mapping.qual
        mapping.pos = mapping.pos - extra_genomic_As
    else:
        final_cigar_block_index = -1
        extended_seq = mapping.seq + polyA_seq
        extended_qual = mapping.qual + polyA_qual

    mapping.seq = extended_seq
    mapping.qual = extended_qual

    updated_cigar = list(mapping.cigar)
    op, length = updated_cigar[final_cigar_block_index]
    if op != 0:
        # Only an alignment match block (op 0) can absorb extra aligned bases.
        raise ValueError('cannot extend CIGAR block with op {0} for read {1}'.format(op, mapping.qname))
    updated_cigar[final_cigar_block_index] = (op, length + extra_genomic_As)

    if soft_clipped_length > 0:
        soft_clipped_block = [(sam.BAM_CSOFT_CLIP, soft_clipped_length)]
        if final_cigar_block_index == 0:
            updated_cigar = soft_clipped_block + updated_cigar
        else:
            updated_cigar = updated_cigar + soft_clipped_block

    mapping.cigar = updated_cigar

    tags = mapping.tags
    if tags:
        # Clear the MD tag since the possible addition of bases to the
        # alignment may have made it inaccurate. A list (rather than a
        # filter object) is assigned so that this also works where filter
        # returns a lazy iterator.
        mapping.tags = [t for t in tags if t[0] != 'MD']

    set_nongenomic_length(mapping, nongenomic_length)
    mapping.qname = new_qname

    return mapping
def build_before_adapters(index_sequence='', primer_type='tru_seq'):
    ''' Builds the sequences that precede each read of a pair.

        index_sequence: index sequence; its reverse complement is placed
            between the P7 flow cell sequence and the R2 primer.
        primer_type: key into primers selecting which R1/R2 primer pair
            to use.

        Returns a tuple (before_R1, before_R2).
    '''
    selected_primers = primers[primer_type]

    upstream_of_R1 = flow_cell['P5'] + selected_primers['R1']

    index_rc = utilities.reverse_complement(index_sequence)
    upstream_of_R2 = flow_cell['P7'] + index_rc + selected_primers['R2']

    return upstream_of_R1, upstream_of_R2
def extend_polyA_end(mapping, region_fetcher, trimmed_twice=False):
    ''' Re-attaches the poly-A sequence that was trimmed from a read (and
        stored in its name) to the 3' end of its alignment.

        Trimmed bases that match A's in the reference immediately past the
        end of the alignment are added to the alignment itself (extending
        the final match block of the CIGAR); the remaining trimmed bases
        are appended as a soft-clipped block. The count of these remaining
        bases is recorded via set_nongenomic_length.

        mapping: aligned read (presumably a pysam AlignedRead - confirm
            against callers) whose qname encodes the trimmed sequence.
            Modified in place.
        region_fetcher: callable taking (tid, start, end) and returning
            the reference sequence for that interval.
        trimmed_twice: if True, qname is parsed as a
            TrimmedTwiceAnnotation and the re-trimmed right sequence is
            restored; otherwise qname is parsed as a PayloadAnnotation.

        Returns mapping. Unmapped reads are returned untouched.

        Raises ValueError if the CIGAR block at the end being extended is
        not an alignment match (op 0).
    '''
    if mapping.is_unmapped:
        return mapping

    if trimmed_twice:
        # Trailing poly-As were removed by the second trimming step.
        annotation = TrimmedTwiceAnnotation.from_identifier(mapping.qname)
        polyA_seq = annotation['retrimmed_right_seq']
        polyA_qual = annotation['retrimmed_right_qual']
        new_qname = PayloadAnnotation.from_prefix_identifier(
            mapping.qname).identifier
    else:
        annotation = PayloadAnnotation.from_identifier(mapping.qname)
        polyA_seq = annotation['right_seq']
        polyA_qual = annotation['right_qual']
        new_qname = '{0}_{1}_{2}'.format(
            annotation['original_name'],
            annotation['left_seq'],
            annotation['left_qual'],
        )

    num_trimmed = len(polyA_seq)

    # Fetch the reference bases that follow the 3' end of the read, in the
    # orientation of the read.
    if mapping.is_reverse:
        # Clamp at the start of the reference: a negative start would
        # either be rejected by the fetcher or be interpreted as an offset
        # from the end of the sequence.
        fetch_start = max(0, mapping.pos - num_trimmed)
        after = region_fetcher(mapping.tid, fetch_start, mapping.pos)
        after = utilities.reverse_complement(after)
    else:
        # Note: 'aend points to one past the last aligned residue'
        after = region_fetcher(mapping.tid,
                               mapping.aend,
                               mapping.aend + num_trimmed)

    # Count the run of A's at the start of the downstream sequence.
    # NOTE(review): the comparison is case-sensitive, so lowercase
    # reference bases never count as genomic A's - confirm this is intended.
    extra_genomic_As = 0
    for b in after:
        if b != 'A':
            break
        extra_genomic_As += 1

    nongenomic_length = num_trimmed - extra_genomic_As
    soft_clipped_length = len(polyA_seq[extra_genomic_As:])

    # Note: writing to mapping.seq destroys mapping.qual, so
    # mapping.qual needs to be retrieved before mapping.seq is assigned.
    if mapping.is_reverse:
        # The stored sequence is relative to the reference strand, so the
        # restored bases go on the left, reverse complemented.
        final_cigar_block_index = 0
        extended_seq = utilities.reverse_complement(polyA_seq) + mapping.seq
        extended_qual = polyA_qual[::-1] + mapping.qual
        mapping.pos = mapping.pos - extra_genomic_As
    else:
        final_cigar_block_index = -1
        extended_seq = mapping.seq + polyA_seq
        extended_qual = mapping.qual + polyA_qual

    mapping.seq = extended_seq
    mapping.qual = extended_qual

    updated_cigar = list(mapping.cigar)
    op, length = updated_cigar[final_cigar_block_index]
    if op != 0:
        # Only an alignment match block (op 0) can absorb extra aligned bases.
        raise ValueError(
            'cannot extend CIGAR block with op {0} for read {1}'.format(
                op, mapping.qname))
    updated_cigar[final_cigar_block_index] = (op, length + extra_genomic_As)

    if soft_clipped_length > 0:
        soft_clipped_block = [(sam.BAM_CSOFT_CLIP, soft_clipped_length)]
        if final_cigar_block_index == 0:
            updated_cigar = soft_clipped_block + updated_cigar
        else:
            updated_cigar = updated_cigar + soft_clipped_block

    mapping.cigar = updated_cigar

    tags = mapping.tags
    if tags:
        # Clear the MD tag since the possible addition of bases to the
        # alignment may have made it inaccurate. A list (rather than a
        # filter object) is assigned so that this also works where filter
        # returns a lazy iterator.
        mapping.tags = [t for t in tags if t[0] != 'MD']

    set_nongenomic_length(mapping, nongenomic_length)
    mapping.qname = new_qname

    return mapping