def produce_bowtie2_alignments(reads,
                               index_prefix,
                               genome_dir,
                               score_min,
                              ):
    ''' Map reads with bowtie2 in local mode and yield (qname, alignments)
    pairs, one per read, where alignments is a list built from the mapped
    hits for that read.

    NOTE(review): a later definition in this file reuses this exact function
    name (the region_fetcher variant), so at import time this definition is
    shadowed — confirm which one is intended to survive.
    '''
    # Bowtie2 parameters for permissive local alignment; up to 10 hits are
    # reported per read.
    bowtie2_options = {'local': True,
                       'report_up_to': 10,
                       'seed_mismatches': 1,
                       'seed_interval_function': 'C,1,0',
                       'seed_length': 10,
                      }
    # yield_mappings=True makes map_bowtie2 return the SAM header object plus
    # an iterator of mappings rather than only writing a file.
    sam_file, mappings = mapping_tools.map_bowtie2(index_prefix,
                                                   reads=reads,
                                                   custom_binary=True,
                                                   score_min=score_min,
                                                   yield_mappings=True,
                                                   **bowtie2_options)
    # Lookup for reference bases, used when converting mappings to alignments.
    base_lookup = genomes.build_base_lookup(genome_dir, sam_file)

    # bowtie2 reports all hits for one read consecutively, so grouping by
    # qname collects each read's hits.
    mapping_groups = utilities.group_by(mappings, lambda m: m.qname)
    for qname, group in mapping_groups:
        # Sort hits by (reference id, position) for deterministic output.
        group = sorted(group, key=lambda m: (m.tid, m.pos))
        alignments = [mapping_to_alignment(mapping, sam_file, base_lookup)
                      for mapping in group
                      if not mapping.is_unmapped]
        yield qname, alignments
def produce_bowtie2_alignments(reads, index_prefix, genome_dir, score_min):
    ''' Map reads with bowtie2 in local mode and yield (qname, alignments)
    pairs, one per read, where alignments is built from that read's mapped
    hits (unmapped records are dropped).
    '''
    # Permissive local-alignment settings; report up to 10 hits per read.
    options = {
        'local': True,
        'report_up_to': 10,
        'seed_mismatches': 1,
        'seed_interval_function': 'C,1,0',
        'seed_length': 10,
    }
    # yield_mappings=True returns the SAM header object plus an iterator of
    # mappings instead of only writing output to disk.
    sam_file, mappings = mapping_tools.map_bowtie2(
        index_prefix,
        reads=reads,
        custom_binary=True,
        score_min=score_min,
        yield_mappings=True,
        **options
    )
    region_fetcher = genomes.build_region_fetcher(genome_dir, load_references=True)

    # All hits for one read are consecutive, so grouping by qname collects them.
    for qname, hits in utilities.group_by(mappings, lambda mapping: mapping.qname):
        # Deterministic order: by reference id, then position.
        ordered = sorted(hits, key=lambda mapping: (mapping.tid, mapping.pos))
        alignments = []
        for mapping in ordered:
            if mapping.is_unmapped:
                continue
            alignments.append(mapping_to_alignment(mapping, sam_file, region_fetcher))
        yield qname, alignments
def produce_bowtie2_alignments_old(reads,
                                   sam_fn,
                                   index_prefix,
                                   genome_dir,
                                   score_min,
                                  ):
    ''' Legacy variant: map reads with bowtie2 in local mode, writing output
    to sam_fn on disk, then read it back and yield (qname, alignments) pairs.

    Unlike the newer variant, this one does not sort each group of hits
    before converting them.
    '''
    bowtie2_options = {'local': True,
                       #'report_all': True,
                       'report_up_to': 10,
                       'seed_mismatches': 1,
                       'seed_interval_function': 'C,1,0',
                       'seed_length': 10,
                       #'threads': 12,
                      }
    # Writes alignments to sam_fn; nothing is yielded by this call.
    mapping_tools.map_bowtie2(index_prefix,
                              None,
                              None,
                              sam_fn,
                              unpaired_Reads=reads,
                              custom_binary=True,
                              score_min=score_min,
                              **bowtie2_options)
    # Re-open the just-written file to iterate over the mappings.
    sam_file = pysam.Samfile(sam_fn)
    region_fetcher = genomes.build_region_fetcher(genome_dir, load_references=True)

    mapping_groups = utilities.group_by(sam_file, lambda m: m.qname)
    for qname, group in mapping_groups:
        alignments = [mapping_to_alignment(mapping, sam_file, region_fetcher)
                      for mapping in group
                      if not mapping.is_unmapped]
        yield qname, alignments
def filter_mappings(self):
    ''' Extend poly-A ends of every mapping, record the distribution of
    minimum non-genomic lengths per read, and write two sorted outputs:
    all extended mappings ('extended'), and only those reads that have a
    nonzero non-genomic length and map uniquely ('extended_filtered').
    Summary counts are appended to self.summary.
    '''
    # NOTE(review): num_unmapped is initialized but never incremented or
    # reported below — confirm whether an unmapped tally was intended here.
    num_unmapped = 0
    num_entirely_genomic = 0
    num_nonunique = 0
    num_unique = 0
    nongenomic_lengths = Counter()

    sam_file = pysam.Samfile(self.file_names['accepted_hits'])
    region_fetcher = genomes.build_region_fetcher(self.file_names['genome'],
                                                  load_references=True,
                                                  sam_file=sam_file,
                                                 )
    # Two sorters sharing the input file's reference layout.
    extended_sorter = sam.AlignmentSorter(sam_file.references,
                                          sam_file.lengths,
                                          self.file_names['extended'],
                                         )
    filtered_sorter = sam.AlignmentSorter(sam_file.references,
                                          sam_file.lengths,
                                          self.file_names['extended_filtered'],
                                         )
    # Lazily extend each mapping's poly-A end before grouping by read name.
    extended_mappings = (trim.extend_polyA_end(mapping, region_fetcher)
                         for mapping in sam_file)
    mapping_groups = utilities.group_by(extended_mappings, lambda m: m.qname)

    with extended_sorter, filtered_sorter:
        for qname, group in mapping_groups:
            # Every extended mapping goes to the 'extended' output.
            for m in group:
                extended_sorter.write(m)

            # A read counts as entirely genomic if any of its hits has
            # non-genomic length 0.
            min_nongenomic_length = min(trim.get_nongenomic_length(m) for m in group)
            nongenomic_lengths[min_nongenomic_length] += 1
            if min_nongenomic_length == 0:
                num_entirely_genomic += 1
                continue

            # Multiple hits or any mapq below 40 disqualifies a read as unique.
            nonunique = len(group) > 1 or any(m.mapq < 40 for m in group)
            if nonunique:
                num_nonunique += 1
                continue

            num_unique += 1
            for m in group:
                filtered_sorter.write(m)

    self.summary.extend([('Mapped with no non-genomic A\'s', num_entirely_genomic),
                         ('Nonunique', num_nonunique),
                         ('Unique', num_unique),
                        ],
                       )

    nongenomic_lengths = utilities.counts_to_array(nongenomic_lengths)
    self.write_file('nongenomic_lengths', nongenomic_lengths)
def _consolidate_counts(positions_list):
    ''' Merge entries that share the same (ref_seq_name, ref_pos, ref_char,
    read_char) prefix, summing their counts (element 4), and return the
    merged 5-tuples sorted by that prefix.
    '''
    merged = []
    grouped = utilities.group_by(sorted(positions_list), key=lambda entry: entry[:4])
    for group_key, entries in grouped:
        name, pos, ref_char, read_char = group_key
        total = sum(entry[4] for entry in entries)
        merged.append((name, pos, ref_char, read_char, total))
    return merged
def collapse_fragments(self):
    ''' Collapse PCR duplicates: group sorted SAM lines first by mapping
    position, then by fragment identity within each position, and write one
    representative line per fragment (annotated with its duplicate count) to
    the collapsed SAM file. Positions with more than 100 lines are logged to
    the collisions file, and a position_count/fragment_count histogram is
    written out as 'amplification_counts'.
    '''
    # Convertors that project a full mapping annotation down to its
    # position-level / fragment-level identity.
    get_position = annotation.make_convertor(self.MappingAnnotation,
                                             self.PositionAnnotation,
                                            )
    get_fragment = annotation.make_convertor(self.MappingAnnotation,
                                             self.FragmentAnnotation,
                                            )
    amplification_counts = Counter()

    sq_lines = sam.get_sq_lines(self.merged_file_names['sorted_clean_sam'])
    sam_lines = self.get_sorted_sam_lines()

    with open(self.file_names['collapsed_sam'], 'w') as collapsed_fh, \
         open(self.file_names['collisions'], 'w') as collisions_fh:
        # Reproduce the @SQ header lines in the collapsed output.
        for sq_line in sq_lines:
            collapsed_fh.write(sq_line)

        position_groups = utilities.group_by(sam_lines, get_position)
        for position_annotation, position_lines in position_groups:
            fragment_counts = Counter()
            position_count = len(position_lines)

            fragment_groups = utilities.group_by(position_lines, get_fragment)
            for fragment_annotation, fragment_lines in fragment_groups:
                fragment_count = len(fragment_lines)
                fragment_counts[fragment_count] += 1
                # Histogram keyed by 'position_count,fragment_count' strings.
                amplification_counts['{},{}'.format(position_count, fragment_count)] += 1
                # One representative line per fragment, renamed to carry the
                # duplicate count in its identifier.
                collapsed_annotation = self.CollapsedAnnotation(count=fragment_count, **fragment_annotation)
                new_line = sam.splice_in_name(fragment_lines[0], collapsed_annotation.identifier)
                collapsed_fh.write(new_line)

            fragment_counts = utilities.counts_to_array(fragment_counts)
            # Positions with unusually many lines are recorded for inspection.
            if position_count > 100:
                collisions_fh.write(position_annotation.identifier + '\n')
                collisions_fh.write(','.join(map(str, fragment_counts)) + '\n')

    sam.make_sorted_bam(self.file_names['collapsed_sam'],
                        self.file_names['collapsed_bam'],
                       )
    self.write_file('amplification_counts', amplification_counts)
def shade_background(start, sequence, ax=None, save_as=None):
    ''' Lightly shade the background of ax according to the expected sequence.

    For each position, consecutive duplicate bases are collapsed and the
    vertical extent of the column is split evenly among the distinct bases,
    each shaded in its IGV color.

    NOTE(review): despite the default, ax is dereferenced unconditionally, so
    a real axes object must be supplied; save_as is currently unused.
    '''
    for offset, bases_here in enumerate(sequence):
        # Collapse consecutive repeats, keeping first-seen order.
        unique_bases = [base for base, _ in utilities.group_by(bases_here)]
        band_height = 1. / len(unique_bases)
        left = start + offset - 0.5
        right = start + offset + 0.5
        for rank, base in enumerate(unique_bases):
            ax.axvspan(left,
                       right,
                       ymax=1 - rank * band_height,
                       ymin=1 - (rank + 1) * band_height,
                       facecolor=igv_colors.normalized_rgbs[base],
                       alpha=0.3,
                       linewidth=0.7,
                      )
def combine_mappings(self):
    ''' Merge mapped and unmapped records by query name, classify each read
    group as unmapped / nonunique / unique, and write only uniquely-mapped
    groups to a sorted BAM file. Tallies are appended to self.summary.
    '''
    num_unmapped = 0
    num_nonunique = 0
    num_unique = 0

    mappings = pysam.Samfile(self.file_names['accepted_hits'])
    unmapped = pysam.Samfile(self.file_names['unmapped_bam'])
    # Interleave mapped and unmapped records so grouping by qname sees all
    # records for a read together.
    merged = sam.merge_by_name(mappings, unmapped)
    grouped = utilities.group_by(merged, lambda m: m.qname)

    alignment_sorter = sam.AlignmentSorter(mappings.references,
                                           mappings.lengths,
                                           self.file_names['bam'],
                                          )
    with alignment_sorter:
        for qname, group in grouped:
            # Any unmapped record in the group marks the whole read unmapped.
            unmapped = any(m.is_unmapped for m in group)
            if unmapped:
                num_unmapped += 1
                continue

            # Multiple hits or mapq below 40 -> not uniquely mapped.
            nonunique = len(group) > 1 or any(m.mapq < 40 for m in group)
            if nonunique:
                num_nonunique += 1
            else:
                num_unique += 1
                for mapping in group:
                    alignment_sorter.write(mapping)

    self.summary.extend([('Unmapped', num_unmapped),
                         ('Nonunique', num_nonunique),
                         ('Unique', num_unique),
                        ],
                       )
def combine_mappings(self):
    ''' Merge mapped and unmapped records by query name, classify each read
    group as unmapped / nonunique / unique, and write only uniquely-mapped
    groups to a sorted BAM file. Tallies are appended to self.summary.
    '''
    counts = Counter()

    mapped_file = pysam.Samfile(self.file_names['accepted_hits'])
    unmapped_file = pysam.Samfile(self.file_names['unmapped_bam'])
    # Interleave the two inputs so every record for a read is adjacent.
    by_name = sam.merge_by_name(mapped_file, unmapped_file)
    name_groups = utilities.group_by(by_name, lambda mapping: mapping.qname)

    writer = sam.AlignmentSorter(
        mapped_file.references,
        mapped_file.lengths,
        self.file_names['bam'],
    )
    with writer:
        for name, group in name_groups:
            if any(mapping.is_unmapped for mapping in group):
                counts['unmapped'] += 1
            elif len(group) > 1 or any(mapping.mapq < 40 for mapping in group):
                # Multiple hits or low mapq -> not uniquely mapped.
                counts['nonunique'] += 1
            else:
                counts['unique'] += 1
                for mapping in group:
                    writer.write(mapping)

    self.summary.extend([
        ('Unmapped', counts['unmapped']),
        ('Nonunique', counts['nonunique']),
        ('Unique', counts['unique']),
    ],
    )
def filter_mappings(self):
    ''' Extend poly-A ends of every mapping, record the distribution of
    minimum non-genomic lengths per read, and write two sorted outputs:
    all extended mappings ('extended'), and only those reads that have a
    nonzero non-genomic length and map uniquely ('extended_filtered').
    Summary counts are appended to self.summary.
    '''
    # NOTE(review): num_unmapped is initialized but never incremented or
    # reported below — confirm whether an unmapped tally was intended here.
    num_unmapped = 0
    num_entirely_genomic = 0
    num_nonunique = 0
    num_unique = 0
    nongenomic_lengths = Counter()

    sam_file = pysam.Samfile(self.file_names['accepted_hits'])
    region_fetcher = genomes.build_region_fetcher(
        self.file_names['genome'],
        load_references=True,
        sam_file=sam_file,
    )
    # Two sorters sharing the input file's reference layout.
    extended_sorter = sam.AlignmentSorter(
        sam_file.references,
        sam_file.lengths,
        self.file_names['extended'],
    )
    filtered_sorter = sam.AlignmentSorter(
        sam_file.references,
        sam_file.lengths,
        self.file_names['extended_filtered'],
    )
    # Lazily extend each mapping's poly-A end before grouping by read name.
    extended_mappings = (trim.extend_polyA_end(mapping, region_fetcher)
                         for mapping in sam_file)
    mapping_groups = utilities.group_by(extended_mappings, lambda m: m.qname)

    with extended_sorter, filtered_sorter:
        for qname, group in mapping_groups:
            # Every extended mapping goes to the 'extended' output.
            for m in group:
                extended_sorter.write(m)

            # A read counts as entirely genomic if any of its hits has
            # non-genomic length 0.
            min_nongenomic_length = min(
                trim.get_nongenomic_length(m) for m in group)
            nongenomic_lengths[min_nongenomic_length] += 1
            if min_nongenomic_length == 0:
                num_entirely_genomic += 1
                continue

            # Multiple hits or any mapq below 40 disqualifies a read.
            nonunique = len(group) > 1 or any(m.mapq < 40 for m in group)
            if nonunique:
                num_nonunique += 1
                continue

            num_unique += 1
            for m in group:
                filtered_sorter.write(m)

    self.summary.extend([
        ('Mapped with no non-genomic A\'s', num_entirely_genomic),
        ('Nonunique', num_nonunique),
        ('Unique', num_unique),
    ],
    )

    nongenomic_lengths = utilities.counts_to_array(nongenomic_lengths)
    self.write_file('nongenomic_lengths', nongenomic_lengths)
def align_reads(
        target_fasta_fn,
        reads,
        bam_fn,
        min_path_length=15,
        error_fn='/dev/null',
        alignment_type='overlap',
):
    ''' Aligns reads to targets in target_fasta_fn by Smith-Waterman, storing
    alignments in bam_fn and yielding unaligned reads.

    Both orientations of each read are tried against every target; candidate
    alignments must have a path of at least min_path_length and a score
    exceeding 0.8 * (2 * path length). Among a read's accepted alignments,
    only the top-scoring group is written: mapping quality 2 if that group
    has a single member, 1 otherwise, with all but the first member flagged
    secondary. Per-read statistics are written to error_fn.

    NOTE(review): this body uses Python 2 constructs (dict.iteritems,
    generator .next()).
    '''
    targets = {r.name: r.seq for r in fasta.reads(target_fasta_fn)}
    target_names = sorted(targets)
    target_lengths = [len(targets[n]) for n in target_names]
    alignment_sorter = sam.AlignmentSorter(
        target_names,
        target_lengths,
        bam_fn,
    )
    statistics = Counter()

    with alignment_sorter:
        for original_read in reads:
            statistics['input'] += 1

            alignments = []

            # Reverse-complement copy so both orientations can be tried.
            rc_read = fastq.Read(
                original_read.name,
                utilities.reverse_complement(original_read.seq),
                original_read.qual[::-1],
            )

            for read, is_reverse in ([original_read, False], [rc_read, True]):
                qual = fastq.decode_sanger(read.qual)
                for target_name, target_seq in targets.iteritems():
                    alignment = generate_alignments(read.seq, target_seq, alignment_type)[0]
                    path = alignment['path']
                    # Acceptance: long enough path AND normalized score > 0.8
                    # (score is divided by twice the path length).
                    if len(path) >= min_path_length and alignment['score'] / (2. * len(path)) > 0.8:
                        aligned_segment = pysam.AlignedSegment()
                        aligned_segment.seq = read.seq
                        aligned_segment.query_qualities = qual
                        aligned_segment.is_reverse = is_reverse

                        char_pairs = make_char_pairs(path, read.seq, target_seq)

                        cigar = sam.aligned_pairs_to_cigar(char_pairs)
                        # Soft-clip any unaligned prefix/suffix of the read.
                        clip_from_start = first_query_index(path)
                        if clip_from_start > 0:
                            cigar = [(sam.BAM_CSOFT_CLIP, clip_from_start)] + cigar
                        clip_from_end = len(read.seq) - 1 - last_query_index(path)
                        if clip_from_end > 0:
                            cigar = cigar + [(sam.BAM_CSOFT_CLIP, clip_from_end)]
                        aligned_segment.cigar = cigar

                        read_aligned, ref_aligned = zip(*char_pairs)
                        md = sam.alignment_to_MD_string(ref_aligned, read_aligned)
                        aligned_segment.set_tag('MD', md)

                        aligned_segment.set_tag('AS', alignment['score'])
                        aligned_segment.tid = alignment_sorter.get_tid(target_name)
                        aligned_segment.query_name = read.name
                        aligned_segment.next_reference_id = -1
                        aligned_segment.reference_start = first_target_index(path)

                        alignments.append(aligned_segment)

            if alignments:
                statistics['aligned'] += 1

                # Keep only the group of alignments tied for the best AS tag.
                sorted_alignments = sorted(alignments, key=lambda m: m.get_tag('AS'), reverse=True)
                grouped = utilities.group_by(sorted_alignments, key=lambda m: m.get_tag('AS'))
                _, highest_group = grouped.next()
                primary_already_assigned = False
                for alignment in highest_group:
                    # mapq 2 = sole best hit, mapq 1 = tied best hits.
                    if len(highest_group) == 1:
                        alignment.mapping_quality = 2
                    else:
                        alignment.mapping_quality = 1

                    # Exactly one alignment per read stays primary.
                    if not primary_already_assigned:
                        primary_already_assigned = True
                    else:
                        alignment.is_secondary = True

                    alignment_sorter.write(alignment)
            else:
                statistics['unaligned'] += 1
                # NOTE(review): 'read' here is the last value of the inner
                # loop variable, i.e. the reverse-complement copy — probably
                # intended to be original_read; confirm before relying on the
                # yielded sequence/qual.
                yield read

    with open(error_fn, 'w') as error_fh:
        for key in ['input', 'aligned', 'unaligned']:
            error_fh.write('{0}: {1:,}\n'.format(key, statistics[key]))
def combine_mappings(self):
    ''' Pair up 5'-side and 3'-side mappings of the same original read,
    classify each pair (unmapped / nonunique / discordant / concordant),
    splice concordant pairs into a single read with an N-gap CIGAR, extend
    its poly-A end, and write it to a sorted output. Tallies go to
    self.summary.

    NOTE(review): uses Python 2 constructs (izip, print statement).
    '''
    num_unmapped = 0
    num_five_unmapped = 0
    num_three_unmapped = 0
    num_nonunique = 0
    num_discordant = 0
    num_concordant = 0

    five_prime_mappings = pysam.Samfile(self.file_names['five_prime_accepted_hits'])
    five_prime_unmapped = pysam.Samfile(self.file_names['five_prime_unmapped'])
    all_five_prime = sam.merge_by_name(five_prime_mappings, five_prime_unmapped)
    five_prime_grouped = utilities.group_by(all_five_prime, lambda m: m.qname)

    three_prime_mappings = pysam.Samfile(self.file_names['three_prime_accepted_hits'])
    three_prime_unmapped = pysam.Samfile(self.file_names['three_prime_unmapped'])
    all_three_prime = sam.merge_by_name(three_prime_mappings, three_prime_unmapped)
    three_prime_grouped = utilities.group_by(all_three_prime, lambda m: m.qname)

    # The two grouped streams are iterated in lockstep; the identifier check
    # below guards against them drifting out of sync.
    group_pairs = izip(five_prime_grouped, three_prime_grouped)

    alignment_sorter = sam.AlignmentSorter(five_prime_mappings.references,
                                           five_prime_mappings.lengths,
                                           self.file_names['combined_extended'],
                                          )
    region_fetcher = genomes.build_region_fetcher(self.file_names['genome'],
                                                  load_references=True,
                                                  sam_file=five_prime_mappings,
                                                 )

    with alignment_sorter:
        for (five_qname, five_group), (three_qname, three_group) in group_pairs:
            five_annotation = trim.PayloadAnnotation.from_identifier(five_qname)
            three_annotation = trim.PayloadAnnotation.from_identifier(three_qname)
            if five_annotation['original_name'] != three_annotation['original_name']:
                # Ensure that the iteration through pairs is in sync.
                print five_qname, three_qname
                raise ValueError

            five_unmapped = any(m.is_unmapped for m in five_group)
            three_unmapped = any(m.is_unmapped for m in three_group)
            if five_unmapped:
                num_five_unmapped += 1
            if three_unmapped:
                num_three_unmapped += 1
            if five_unmapped or three_unmapped:
                num_unmapped += 1
                continue

            # Multiple hits or mapq below 40 on either side -> nonunique.
            five_nonunique = len(five_group) > 1 or any(m.mapq < 40 for m in five_group)
            three_nonunique = len(three_group) > 1 or any(m.mapq < 40 for m in three_group)
            if five_nonunique or three_nonunique:
                num_nonunique += 1
                continue

            five_m = five_group.pop()
            three_m = three_group.pop()
            five_strand = '-' if five_m.is_reverse else '+'
            three_strand = '-' if three_m.is_reverse else '+'

            # Discordant: different references, different strands, or an
            # implied fragment longer than 10 kb.
            tlen = max(five_m.aend, three_m.aend) - min(five_m.pos, three_m.pos)
            discordant = (five_m.tid != three_m.tid) or (five_strand) != (three_strand) or (tlen > 10000)
            if discordant:
                num_discordant += 1
                continue

            # Order the halves by genomic coordinate according to strand.
            if five_strand == '+':
                first_read = five_m
                second_read = three_m
            elif five_strand == '-':
                first_read = three_m
                second_read = five_m

            # Overlapping halves are also treated as discordant.
            gap = second_read.pos - first_read.aend
            if gap < 0:
                num_discordant += 1
                continue

            combined_read = pysam.AlignedRead()
            # qname needs to come from three_m to include trimmed As
            combined_read.qname = three_m.qname
            combined_read.tid = five_m.tid
            combined_read.seq = first_read.seq + second_read.seq
            combined_read.qual = first_read.qual + second_read.qual
            # CIGAR op 3 (N) bridges the genomic gap between the halves.
            combined_read.cigar = first_read.cigar + [(3, gap)] + second_read.cigar
            combined_read.pos = first_read.pos
            combined_read.is_reverse = first_read.is_reverse
            combined_read.mapq = min(first_read.mapq, second_read.mapq)
            combined_read.rnext = -1
            combined_read.pnext = -1

            num_concordant += 1

            extended_mapping = trim.extend_polyA_end(combined_read,
                                                     region_fetcher,
                                                    )
            alignment_sorter.write(extended_mapping)

    self.summary.extend([('Unmapped', num_unmapped),
                         ('Five prime unmapped', num_five_unmapped),
                         ('Three prime unmapped', num_three_unmapped),
                         ('Nonunique', num_nonunique),
                         ('Discordant', num_discordant),
                         ('Concordant', num_concordant),
                        ],
                       )
def group_mapping_pairs(mappings):
    ''' Group mappings by query name and split each group into its read-1
    and read-2 members, yielding (query_name, (R1_list, R2_list)).
    '''
    for name, members in utilities.group_by(mappings, lambda mapping: mapping.query_name):
        first_reads = []
        second_reads = []
        for mapping in members:
            if mapping.is_read1:
                first_reads.append(mapping)
            if mapping.is_read2:
                second_reads.append(mapping)
        yield name, (first_reads, second_reads)
def filter_mappings(
        mappings,
        minimum_mapq=42,
        max_insert_length=1000,
        counts_dict=None,
        verbose=False,
        unmapped_fns=None,
):
    ''' Filters out unmapped, nonuniquely mapped, or discordantly mapped reads.

    Consumes name-grouped paired-end mappings (exactly two records per qname),
    classifies each pair, and yields only pairs that are mapped, concordant,
    correctly oriented, and have both mapqs >= minimum_mapq.

    Parameters:
        mappings: iterable of paired-end records; records for one qname must
            be adjacent and come in exactly twos (ValueError otherwise).
        minimum_mapq: pairs with either mapq below this are counted as
            nonunique and dropped.
        max_insert_length: passed through to is_discordant.
        counts_dict: if not None, updated with the tallies on normal exit.
        verbose: log per-pair reasons for dropping via logging.info.
        unmapped_fns: optional (R1_fn, R2_fn) pair; unmapped pairs are written
            there as FASTQ-style Read records.

    Fixes relative to the previous version: `counts_dict != None` replaced
    with the idiomatic `is not None`, and the unmapped output files are now
    closed in a finally block (they previously leaked).
    '''
    pair_counts = {'total': 0,
                   'unmapped': 0,
                   'indel': 0,
                   'nonunique': 0,
                   'discordant': 0,
                   'disoriented': 0,
                   'unique': Counter(),
                   'mapqs': Counter(),
                   'fragment_lengths': Counter(),
                   'tids': Counter(),
                  }

    R1_unmapped_fh = None
    R2_unmapped_fh = None
    if unmapped_fns:
        R1_unmapped_fn, R2_unmapped_fn = unmapped_fns
        R1_unmapped_fh = open(R1_unmapped_fn, 'w')
        R2_unmapped_fh = open(R2_unmapped_fn, 'w')

    try:
        for _, aligned_pair in utilities.group_by(mappings, key=lambda m: m.qname):
            if len(aligned_pair) != 2:
                raise ValueError(len(aligned_pair))
            pair_counts['total'] += 1

            R1_aligned, R2_aligned = aligned_pair
            # If R2 is mapped but R1 isn't, R2 gets reported first.
            if not R1_aligned.is_read1:
                R1_aligned, R2_aligned = R2_aligned, R1_aligned
            if (not R1_aligned.is_read1) or (not R2_aligned.is_read2):
                raise ValueError(R1_aligned, R2_aligned)

            pair_counts['mapqs'][R1_aligned.mapq] += 1
            pair_counts['mapqs'][R2_aligned.mapq] += 1

            if R1_aligned.is_unmapped or R2_aligned.is_unmapped:
                pair_counts['unmapped'] += 1
                if verbose:
                    logging.info('{0} was unmapped'.format(R1_aligned.qname))
                if unmapped_fns:
                    R1_read = sam.mapping_to_Read(R1_aligned)
                    R2_read = sam.mapping_to_Read(R2_aligned)
                    R1_unmapped_fh.write(str(R1_read))
                    R2_unmapped_fh.write(str(R2_read))
            elif is_discordant(R1_aligned, R2_aligned, max_insert_length):
                pair_counts['discordant'] += 1
            else:
                pair_counts['tids'][R1_aligned.tid] += 1
                if is_disoriented(R1_aligned, R2_aligned):
                    pair_counts['disoriented'] += 1
                elif R1_aligned.mapq < minimum_mapq or R2_aligned.mapq < minimum_mapq:
                    pair_counts['nonunique'] += 1
                    if verbose:
                        logging.info('{0} was nonunique, {1}, {2}'.format(R1_aligned.qname,
                                                                          R1_aligned.mapq,
                                                                          R2_aligned.mapq))
                else:
                    pair_counts['unique'][R1_aligned.tid] += 1

                    fragment_length = abs(R1_aligned.tlen)
                    pair_counts['fragment_lengths'][fragment_length] += 1

                    if sam.contains_indel_pysam(R1_aligned) or sam.contains_indel_pysam(R2_aligned):
                        pair_counts['indel'] += 1

                    yield R1_aligned, R2_aligned
    finally:
        # Bug fix: these handles were previously opened and never closed.
        # Closing in finally also covers early generator abandonment.
        if R1_unmapped_fh is not None:
            R1_unmapped_fh.close()
        if R2_unmapped_fh is not None:
            R2_unmapped_fh.close()

    if counts_dict is not None:
        counts_dict.update(pair_counts)
def combine_mappings(self):
    ''' Pair up R1 and R2 mappings of the same read pair, classify each pair
    (unmapped / nonunique / discordant / disoriented / concordant), combine
    concordant pairs into a single read, flip it to the sense strand, and
    write it to a sorted output. Tallies go to self.summary and the tlen
    histogram is written out as 'tlens'.

    NOTE(review): uses Python 2 constructs (izip, print statement).
    '''
    num_unmapped = 0
    num_R1_unmapped = 0
    num_R2_unmapped = 0
    num_nonunique = 0
    num_discordant = 0
    num_disoriented = 0
    num_concordant = 0
    tlens = Counter()

    R1_mappings = pysam.Samfile(self.file_names['R1_accepted_hits'])
    R1_unmapped = pysam.Samfile(self.file_names['R1_unmapped'])
    all_R1 = sam.merge_by_name(R1_mappings, R1_unmapped)
    R1_grouped = utilities.group_by(all_R1, lambda m: m.qname)

    R2_mappings = pysam.Samfile(self.file_names['R2_accepted_hits'])
    R2_unmapped = pysam.Samfile(self.file_names['R2_unmapped'])
    all_R2 = sam.merge_by_name(R2_mappings, R2_unmapped)
    R2_grouped = utilities.group_by(all_R2, lambda m: m.qname)

    # The two grouped streams are iterated in lockstep; the pair-name check
    # below guards against them drifting out of sync.
    group_pairs = izip(R1_grouped, R2_grouped)

    alignment_sorter = sam.AlignmentSorter(R1_mappings.references,
                                           R1_mappings.lengths,
                                           self.file_names['combined'],
                                          )

    with alignment_sorter:
        for (R1_qname, R1_group), (R2_qname, R2_group) in group_pairs:
            #print R1_qname, R2_qname
            if fastq.get_pair_name(R1_qname) != fastq.get_pair_name(R2_qname):
                # Ensure that the iteration through pairs is in sync.
                print R1_qname, R2_qname
                raise ValueError

            R1_unmapped = any(m.is_unmapped for m in R1_group)
            R2_unmapped = any(m.is_unmapped for m in R2_group)
            if R1_unmapped:
                num_R1_unmapped += 1
            if R2_unmapped:
                num_R2_unmapped += 1
            if R1_unmapped or R2_unmapped:
                num_unmapped += 1
                continue

            # Multiple hits or mapq below 40 on either side -> nonunique.
            R1_nonunique = len(R1_group) > 1 or any(m.mapq < 40 for m in R1_group)
            R2_nonunique = len(R2_group) > 1 or any(m.mapq < 40 for m in R2_group)
            if R1_nonunique or R2_nonunique:
                num_nonunique += 1
                continue

            R1_m = R1_group.pop()
            R2_m = R2_group.pop()

            R1_strand = sam.get_strand(R1_m)
            R2_strand = sam.get_strand(R2_m)

            # Discordant: different references, SAME strand (proper pairs map
            # to opposite strands), or implied fragment longer than 10 kb.
            tlen = max(R1_m.aend, R2_m.aend) - min(R1_m.pos, R2_m.pos)
            discordant = (R1_m.tid != R2_m.tid) or (R1_strand) == (R2_strand) or (tlen > 10000)
            if discordant:
                num_discordant += 1
                continue

            # Reminder: the protocol produces anti-sense reads.
            if R1_strand == '-':
                if R1_m.pos < R2_m.pos:
                    num_disoriented += 1
                    continue
            elif R1_strand == '+':
                if R2_m.pos < R1_m.pos:
                    num_disoriented += 1
                    continue

            combined_read = paired_end.combine_paired_mappings(R1_m, R2_m)

            tlens[tlen] += 1

            # combine_paired_mappings may return a falsy value; only truthy
            # results are written and counted as concordant.
            if combined_read:
                # Flip combined_read back to the sense strand.
                if combined_read.is_reverse:
                    combined_read.is_reverse = False
                else:
                    combined_read.is_reverse = True

                trim.set_nongenomic_length(combined_read, 0)
                alignment_sorter.write(combined_read)
                num_concordant += 1

    self.summary.extend([('Unmapped', num_unmapped),
                         ('R1 unmapped', num_R1_unmapped),
                         ('R2 unmapped', num_R2_unmapped),
                         ('Nonunique', num_nonunique),
                         ('Discordant', num_discordant),
                         ('Unexpected orientation', num_disoriented),
                         ('Concordant', num_concordant),
                        ],
                       )

    tlens = utilities.counts_to_array(tlens)
    self.write_file('tlens', tlens)