def filter_mappings(self):
    """Extend poly(A) ends of mappings and keep only unique, non-genomic ones.

    Reads mappings from 'accepted_hits', extends each through any genomic A's,
    and writes every extended mapping to 'extended'.  Groups of mappings that
    share a qname are then classified:

    - entirely genomic (minimum non-genomic length == 0): counted, discarded
    - nonunique (more than one alignment, or any MAPQ < 40): counted, discarded
    - unique: written to 'extended_filtered'

    Appends category counts to self.summary and writes the distribution of
    minimum non-genomic lengths to 'nongenomic_lengths'.
    """
    # NOTE(review): the original also initialized num_unmapped, but it was
    # never incremented or reported, so it has been removed.
    num_entirely_genomic = 0
    num_nonunique = 0
    num_unique = 0
    nongenomic_lengths = Counter()

    sam_file = pysam.Samfile(self.file_names['accepted_hits'])
    region_fetcher = genomes.build_region_fetcher(self.file_names['genome'],
                                                  load_references=True,
                                                  sam_file=sam_file,
                                                 )
    extended_sorter = sam.AlignmentSorter(sam_file.references,
                                          sam_file.lengths,
                                          self.file_names['extended'],
                                         )
    filtered_sorter = sam.AlignmentSorter(sam_file.references,
                                          sam_file.lengths,
                                          self.file_names['extended_filtered'],
                                         )

    extended_mappings = (trim.extend_polyA_end(mapping, region_fetcher)
                         for mapping in sam_file)
    mapping_groups = utilities.group_by(extended_mappings, lambda m: m.qname)

    with extended_sorter, filtered_sorter:
        for qname, group in mapping_groups:
            for m in group:
                extended_sorter.write(m)

            min_nongenomic_length = min(trim.get_nongenomic_length(m)
                                        for m in group)
            nongenomic_lengths[min_nongenomic_length] += 1
            # A minimum of zero means at least one alignment needed no
            # non-genomic A's at all.
            if min_nongenomic_length == 0:
                num_entirely_genomic += 1
                continue

            # Nonunique: multiple alignments, or any low-confidence one.
            nonunique = len(group) > 1 or any(m.mapq < 40 for m in group)
            if nonunique:
                num_nonunique += 1
                continue

            num_unique += 1
            for m in group:
                filtered_sorter.write(m)

    self.summary.extend(
        [('Mapped with no non-genomic A\'s', num_entirely_genomic),
         ('Nonunique', num_nonunique),
         ('Unique', num_unique),
        ],
    )

    nongenomic_lengths = utilities.counts_to_array(nongenomic_lengths)
    self.write_file('nongenomic_lengths', nongenomic_lengths)
def process_remapped(self):
    """Yield remapped reads with start mismatches trimmed and poly(A) re-extended.

    Generator over 'remapped_accepted_hits'.  Each mapping has mismatches
    trimmed from its start (tallied into a type-count array) and is then
    re-extended through its poly(A) tail.  Query lengths of primary mapped
    reads are counted and written to 'lengths' once the stream is exhausted.
    """
    remapped_bam = pysam.Samfile(self.file_names['remapped_accepted_hits'])

    # Dimensions: position after trimming, position in read, quality value,
    # and a 6x6 matrix indexed by base codes — TODO confirm exact semantics
    # against trim.trim_mismatches_from_start.
    counts_shape = (self.max_read_length + 1,
                    self.max_read_length,
                    fastq.MAX_EXPECTED_QUAL + 1,
                    6,
                    6,
                   )
    mismatch_type_counts = np.zeros(counts_shape, int)
    length_counts = Counter()

    fetch_region = genomes.build_region_fetcher(self.file_names['genome'],
                                                load_references=True,
                                                sam_file=remapped_bam,
                                               )

    for raw_mapping in remapped_bam:
        start_trimmed = trim.trim_mismatches_from_start(raw_mapping,
                                                        fetch_region,
                                                        mismatch_type_counts,
                                                       )
        # Add back any genomic A's that were trimmed as part of mappings and
        # any remaining A's from the first non-genomic onward as soft clipped
        # bases for visualization in IGV.
        re_extended = trim.extend_polyA_end(start_trimmed,
                                            fetch_region,
                                            trimmed_twice=True,
                                           )

        if not re_extended.is_unmapped and not re_extended.is_secondary:
            length_counts[re_extended.qlen] += 1

        yield re_extended

    remapped_lengths = self.zero_padded_array(length_counts)
    self.write_file('lengths', {'remapped': remapped_lengths})
def filter_mappings(self):
    """Extend poly(A) ends of mappings and keep only unique, non-genomic ones.

    Reads mappings from 'accepted_hits', extends each through any genomic A's,
    and writes every extended mapping to 'extended'.  Groups sharing a qname
    whose minimum non-genomic length is zero (entirely genomic), or that are
    nonunique (multiple alignments or any MAPQ < 40), are counted and dropped;
    the remaining unique groups are written to 'extended_filtered'.  Category
    counts go to self.summary and the non-genomic length distribution to
    'nongenomic_lengths'.
    """
    # NOTE(review): removed the unused num_unmapped counter — it was never
    # incremented or reported.
    num_entirely_genomic = 0
    num_nonunique = 0
    num_unique = 0
    nongenomic_lengths = Counter()

    sam_file = pysam.Samfile(self.file_names['accepted_hits'])
    region_fetcher = genomes.build_region_fetcher(
        self.file_names['genome'],
        load_references=True,
        sam_file=sam_file,
    )
    extended_sorter = sam.AlignmentSorter(
        sam_file.references,
        sam_file.lengths,
        self.file_names['extended'],
    )
    filtered_sorter = sam.AlignmentSorter(
        sam_file.references,
        sam_file.lengths,
        self.file_names['extended_filtered'],
    )

    extended_mappings = (trim.extend_polyA_end(mapping, region_fetcher)
                         for mapping in sam_file)
    mapping_groups = utilities.group_by(extended_mappings, lambda m: m.qname)

    with extended_sorter, filtered_sorter:
        for qname, group in mapping_groups:
            for m in group:
                extended_sorter.write(m)

            min_nongenomic_length = min(
                trim.get_nongenomic_length(m) for m in group)
            nongenomic_lengths[min_nongenomic_length] += 1
            # Zero means at least one alignment required no non-genomic A's.
            if min_nongenomic_length == 0:
                num_entirely_genomic += 1
                continue

            # Nonunique: multiple alignments, or any low-confidence one.
            nonunique = len(group) > 1 or any(m.mapq < 40 for m in group)
            if nonunique:
                num_nonunique += 1
                continue

            num_unique += 1
            for m in group:
                filtered_sorter.write(m)

    self.summary.extend([
        ('Mapped with no non-genomic A\'s', num_entirely_genomic),
        ('Nonunique', num_nonunique),
        ('Unique', num_unique),
        ],
    )

    nongenomic_lengths = utilities.counts_to_array(nongenomic_lengths)
    self.write_file('nongenomic_lengths', nongenomic_lengths)
def combine_mappings(self):
    """Stitch independently-mapped 5' and 3' halves of each read back together.

    Iterates the 5' and 3' mapping streams in lockstep (merged mapped +
    unmapped records, grouped by qname).  Pairs are discarded and counted when
    either half is unmapped, either half is nonunique (multiple alignments or
    any MAPQ < 40), or the pair is discordant (different reference, different
    strand, template length > 10 kb, or overlapping halves).  Concordant pairs
    are fused into one AlignedRead with an N (skip) CIGAR op spanning the gap,
    poly(A) extended, and written to 'combined_extended'.  Category counts are
    appended to self.summary.

    Raises:
        ValueError: if the two streams fall out of sync (qnames disagree).
    """
    num_unmapped = 0
    num_five_unmapped = 0
    num_three_unmapped = 0
    num_nonunique = 0
    num_discordant = 0
    num_concordant = 0

    five_prime_mappings = pysam.Samfile(self.file_names['five_prime_accepted_hits'])
    five_prime_unmapped = pysam.Samfile(self.file_names['five_prime_unmapped'])
    all_five_prime = sam.merge_by_name(five_prime_mappings, five_prime_unmapped)
    five_prime_grouped = utilities.group_by(all_five_prime, lambda m: m.qname)

    three_prime_mappings = pysam.Samfile(self.file_names['three_prime_accepted_hits'])
    three_prime_unmapped = pysam.Samfile(self.file_names['three_prime_unmapped'])
    all_three_prime = sam.merge_by_name(three_prime_mappings, three_prime_unmapped)
    three_prime_grouped = utilities.group_by(all_three_prime, lambda m: m.qname)

    group_pairs = izip(five_prime_grouped, three_prime_grouped)

    alignment_sorter = sam.AlignmentSorter(five_prime_mappings.references,
                                           five_prime_mappings.lengths,
                                           self.file_names['combined_extended'],
                                          )
    region_fetcher = genomes.build_region_fetcher(self.file_names['genome'],
                                                  load_references=True,
                                                  sam_file=five_prime_mappings,
                                                 )

    with alignment_sorter:
        for (five_qname, five_group), (three_qname, three_group) in group_pairs:
            five_annotation = trim.PayloadAnnotation.from_identifier(five_qname)
            three_annotation = trim.PayloadAnnotation.from_identifier(three_qname)
            if five_annotation['original_name'] != three_annotation['original_name']:
                # Ensure that the iteration through pairs is in sync.
                # (Was a bare raise after a print; carry the qnames in the
                # exception so the diagnostic isn't lost.)
                raise ValueError('pair iteration out of sync: {0} vs {1}'.format(
                    five_qname, three_qname))

            five_unmapped = any(m.is_unmapped for m in five_group)
            three_unmapped = any(m.is_unmapped for m in three_group)
            if five_unmapped:
                num_five_unmapped += 1
            if three_unmapped:
                num_three_unmapped += 1
            if five_unmapped or three_unmapped:
                num_unmapped += 1
                continue

            five_nonunique = len(five_group) > 1 or any(m.mapq < 40 for m in five_group)
            three_nonunique = len(three_group) > 1 or any(m.mapq < 40 for m in three_group)
            if five_nonunique or three_nonunique:
                num_nonunique += 1
                continue

            five_m = five_group.pop()
            three_m = three_group.pop()

            five_strand = '-' if five_m.is_reverse else '+'
            three_strand = '-' if three_m.is_reverse else '+'

            tlen = max(five_m.aend, three_m.aend) - min(five_m.pos, three_m.pos)
            discordant = (five_m.tid != three_m.tid or
                          five_strand != three_strand or
                          tlen > 10000)
            if discordant:
                num_discordant += 1
                continue

            # On the forward strand the 5' half comes first in genomic
            # coordinates; on the reverse strand the 3' half does.  (else,
            # not elif '-': strand is always '+' or '-', and elif left
            # first_read potentially unbound.)
            if five_strand == '+':
                first_read = five_m
                second_read = three_m
            else:
                first_read = three_m
                second_read = five_m

            gap = second_read.pos - first_read.aend
            if gap < 0:
                # Overlapping halves can't be stitched with a skip op.
                num_discordant += 1
                continue

            combined_read = pysam.AlignedRead()
            # qname needs to come from three_m to include trimmed As
            combined_read.qname = three_m.qname
            combined_read.tid = five_m.tid
            combined_read.seq = first_read.seq + second_read.seq
            combined_read.qual = first_read.qual + second_read.qual
            # CIGAR op 3 is N (skipped region) bridging the two halves.
            combined_read.cigar = first_read.cigar + [(3, gap)] + second_read.cigar
            combined_read.pos = first_read.pos
            combined_read.is_reverse = first_read.is_reverse
            combined_read.mapq = min(first_read.mapq, second_read.mapq)
            combined_read.rnext = -1
            combined_read.pnext = -1

            num_concordant += 1

            extended_mapping = trim.extend_polyA_end(combined_read,
                                                     region_fetcher,
                                                    )
            alignment_sorter.write(extended_mapping)

    self.summary.extend(
        [('Unmapped', num_unmapped),
         ('Five prime unmapped', num_five_unmapped),
         ('Three prime unmapped', num_three_unmapped),
         ('Nonunique', num_nonunique),
         ('Discordant', num_discordant),
         ('Concordant', num_concordant),
        ],
    )