def filter_mappings(self):
    """Extend mappings, then keep only uniquely mapped reads with non-genomic A's.

    Reads every mapping from the 'accepted_hits' file, passes each through
    trim.extend_polyA_end, and groups the results by query name.  Every
    extended mapping is written to the 'extended' file.  A read's mappings
    are additionally written to the 'extended_filtered' file only if the
    read has a nonzero minimum non-genomic length, exactly one mapping,
    and a mapq of at least 40.

    Side effects: appends three (description, count) pairs to self.summary
    and writes the per-read minimum non-genomic length counts through
    self.write_file('nongenomic_lengths', ...).
    """
    num_entirely_genomic = 0
    num_nonunique = 0
    num_unique = 0

    # Maps a read's minimum non-genomic length to the number of reads.
    nongenomic_lengths = Counter()

    sam_file = pysam.Samfile(self.file_names['accepted_hits'])
    try:
        region_fetcher = genomes.build_region_fetcher(
            self.file_names['genome'],
            load_references=True,
            sam_file=sam_file,
        )

        extended_sorter = sam.AlignmentSorter(
            sam_file.references,
            sam_file.lengths,
            self.file_names['extended'],
        )
        filtered_sorter = sam.AlignmentSorter(
            sam_file.references,
            sam_file.lengths,
            self.file_names['extended_filtered'],
        )

        # Lazy: mappings are extended one at a time as the groups are consumed.
        extended_mappings = (trim.extend_polyA_end(mapping, region_fetcher)
                             for mapping in sam_file)
        # NOTE(review): presumably group_by collects *consecutive* mappings
        # sharing a qname into a list - confirm against utilities.group_by.
        # The loop below needs each group to be sized and re-iterable.
        mapping_groups = utilities.group_by(extended_mappings,
                                            lambda m: m.qname)

        with extended_sorter, filtered_sorter:
            for qname, group in mapping_groups:
                # Every extended mapping is kept in the unfiltered output.
                for m in group:
                    extended_sorter.write(m)

                min_nongenomic_length = min(trim.get_nongenomic_length(m)
                                            for m in group)
                nongenomic_lengths[min_nongenomic_length] += 1

                if min_nongenomic_length == 0:
                    num_entirely_genomic += 1
                    continue

                # More than one mapping, or any mapping with mapq below 40,
                # disqualifies the read from the filtered output.
                nonunique = len(group) > 1 or any(m.mapq < 40 for m in group)
                if nonunique:
                    num_nonunique += 1
                    continue

                num_unique += 1
                for m in group:
                    filtered_sorter.write(m)
    finally:
        # The input handle is only needed while mappings are being read.
        sam_file.close()

    self.summary.extend(
        [('Mapped with no non-genomic A\'s', num_entirely_genomic),
         ('Nonunique', num_nonunique),
         ('Unique', num_unique),
        ],
    )

    nongenomic_lengths = utilities.counts_to_array(nongenomic_lengths)
    self.write_file('nongenomic_lengths', nongenomic_lengths)
def filter_mappings(self):
    """Extend mappings, then keep only uniquely mapped reads with non-genomic A's.

    Reads every mapping from the 'accepted_hits' file, passes each through
    trim.extend_polyA_end, and groups the results by query name.  Every
    extended mapping is written to the 'extended' file.  A read's mappings
    are additionally written to the 'extended_filtered' file only if the
    read has a nonzero minimum non-genomic length, exactly one mapping,
    and a mapq of at least 40.

    Side effects: appends three (description, count) pairs to self.summary
    and writes the per-read minimum non-genomic length counts through
    self.write_file('nongenomic_lengths', ...).
    """
    num_entirely_genomic = 0
    num_nonunique = 0
    num_unique = 0

    # Maps a read's minimum non-genomic length to the number of reads.
    nongenomic_lengths = Counter()

    sam_file = pysam.Samfile(self.file_names['accepted_hits'])
    try:
        region_fetcher = genomes.build_region_fetcher(
            self.file_names['genome'],
            load_references=True,
            sam_file=sam_file,
        )

        extended_sorter = sam.AlignmentSorter(
            sam_file.references,
            sam_file.lengths,
            self.file_names['extended'],
        )
        filtered_sorter = sam.AlignmentSorter(
            sam_file.references,
            sam_file.lengths,
            self.file_names['extended_filtered'],
        )

        # Lazy: mappings are extended one at a time as the groups are consumed.
        extended_mappings = (trim.extend_polyA_end(mapping, region_fetcher)
                             for mapping in sam_file)
        # NOTE(review): presumably group_by collects *consecutive* mappings
        # sharing a qname into a list - confirm against utilities.group_by.
        # The loop below needs each group to be sized and re-iterable.
        mapping_groups = utilities.group_by(extended_mappings,
                                            lambda m: m.qname)

        with extended_sorter, filtered_sorter:
            for qname, group in mapping_groups:
                # Every extended mapping is kept in the unfiltered output.
                for m in group:
                    extended_sorter.write(m)

                min_nongenomic_length = min(trim.get_nongenomic_length(m)
                                            for m in group)
                nongenomic_lengths[min_nongenomic_length] += 1

                if min_nongenomic_length == 0:
                    num_entirely_genomic += 1
                    continue

                # More than one mapping, or any mapping with mapq below 40,
                # disqualifies the read from the filtered output.
                nonunique = len(group) > 1 or any(m.mapq < 40 for m in group)
                if nonunique:
                    num_nonunique += 1
                    continue

                num_unique += 1
                for m in group:
                    filtered_sorter.write(m)
    finally:
        # The input handle is only needed while mappings are being read.
        sam_file.close()

    self.summary.extend(
        [('Mapped with no non-genomic A\'s', num_entirely_genomic),
         ('Nonunique', num_nonunique),
         ('Unique', num_unique),
        ],
    )

    nongenomic_lengths = utilities.counts_to_array(nongenomic_lengths)
    self.write_file('nongenomic_lengths', nongenomic_lengths)
def get_Transcript_position_counts(clean_bam_fn,
                                   transcripts,
                                   relevant_lengths,
                                   left_buffer=left_buffer,
                                   right_buffer=right_buffer,
                                  ):
    """Tally read 5' and 3' end positions for each transcript.

    Args:
        clean_bam_fn: file name of the BAM file to fetch reads from.
        transcripts: iterable of transcript objects to process.
        relevant_lengths: read lengths (read.qlen values) that get their own
            5' PositionCounts in addition to 'all' and 'all_nonunique'.
        left_buffer, right_buffer: passed to the coordinate-map and sequence
            builders and to PositionCounts; also used to widen the region
            fetched around each transcript.

    Returns:
        dict mapping transcript.name to a dict with keys 'CDS_length',
        'five_prime_positions', 'three_prime_positions', 'nonunique',
        'alternatively_spliced' and 'sequence'.
    """
    max_nongenomic_length = 5

    # Keys of the per-transcript count tables.  list(...) keeps the
    # concatenation valid when range() is not a list (Python 3) and when
    # relevant_lengths is some other sequence type such as a tuple.
    five_prime_keys = list(relevant_lengths) + ['all', 'all_nonunique']
    three_prime_keys = (list(range(max_nongenomic_length + 1)) +
                        ['all', 'all_nonunique'])

    gene_infos = {}
    bam_file = pysam.Samfile(clean_bam_fn)
    try:
        for transcript in transcripts:
            transcript.build_coordinate_maps(left_buffer, right_buffer)

            nonunique = 0
            alternatively_spliced = 0

            landmarks = {'start': 0,
                         'start_codon': transcript.transcript_start_codon,
                         'stop_codon': transcript.transcript_stop_codon,
                         'end': transcript.transcript_length,
                        }

            five_prime_positions = {
                key: PositionCounts(landmarks, left_buffer, right_buffer)
                for key in five_prime_keys
            }
            three_prime_positions = {
                key: PositionCounts(landmarks, left_buffer, right_buffer)
                for key in three_prime_keys
            }

            transcript_sequence = transcript.get_transcript_sequence(
                left_buffer, right_buffer)

            # fetch raises a ValueError if given a negative start, but it
            # doesn't care if the end is valid.
            left_edge = max(0, transcript.start - left_buffer)
            right_edge = transcript.end + right_buffer
            overlapping_reads = bam_file.fetch(transcript.seqname,
                                               left_edge,
                                               right_edge)

            for read in overlapping_reads:
                # Reads covering any position this transcript splices out are
                # counted and skipped.
                if any(transcript.is_spliced_out(position)
                       for position in read.positions):
                    alternatively_spliced += 1
                    continue

                # NOTE(review): presumably 50 is the aligner's MAPQ for a
                # uniquely mapped read - confirm.  The nonunique tally happens
                # before the strand check, so it includes opposite-strand reads.
                is_unique = (read.mapq == 50)
                if not is_unique:
                    nonunique += 1

                read_strand = '-' if read.is_reverse else '+'
                if read_strand != transcript.strand:
                    continue

                # Distinct names so the fetch window above is not shadowed.
                read_left = read.pos
                read_right = read.aend - 1  # aend - 1: last aligned base
                if read_strand == '+':
                    five_prime_position = read_left
                    three_prime_position = read_right
                else:
                    five_prime_position = read_right
                    three_prime_position = read_left

                if five_prime_position in transcript.genomic_to_transcript:
                    transcript_coord = transcript.genomic_to_transcript[
                        five_prime_position]
                    if is_unique:
                        five_prime_positions['all']['start', transcript_coord] += 1
                        if read.qlen in relevant_lengths:
                            five_prime_positions[read.qlen]['start', transcript_coord] += 1
                    elif not read.is_secondary:
                        five_prime_positions['all_nonunique']['start', transcript_coord] += 1

                if three_prime_position in transcript.genomic_to_transcript:
                    transcript_coord = transcript.genomic_to_transcript[
                        three_prime_position]
                    if is_unique:
                        three_prime_positions['all']['start', transcript_coord] += 1
                        nongenomic_length = trim.get_nongenomic_length(read)
                        if nongenomic_length <= max_nongenomic_length:
                            three_prime_positions[nongenomic_length]['start', transcript_coord] += 1
                    elif not read.is_secondary:
                        three_prime_positions['all_nonunique']['start', transcript_coord] += 1

            gene_infos[transcript.name] = {
                'CDS_length': transcript.CDS_length,
                'five_prime_positions': five_prime_positions,
                'three_prime_positions': three_prime_positions,
                'nonunique': nonunique,
                'alternatively_spliced': alternatively_spliced,
                'sequence': transcript_sequence,
            }

            transcript.delete_coordinate_maps()
    finally:
        bam_file.close()

    return gene_infos
def get_Transcript_position_counts(
        clean_bam_fn,
        transcripts,
        relevant_lengths,
        left_buffer=left_buffer,
        right_buffer=right_buffer,
):
    """Tally read 5' and 3' end positions for each transcript.

    Args:
        clean_bam_fn: file name of the BAM file to fetch reads from.
        transcripts: iterable of transcript objects to process.
        relevant_lengths: read lengths (read.qlen values) that get their own
            5' PositionCounts in addition to 'all' and 'all_nonunique'.
        left_buffer, right_buffer: passed to the coordinate-map and sequence
            builders and to PositionCounts; also used to widen the region
            fetched around each transcript.

    Returns:
        dict mapping transcript.name to a dict with keys 'CDS_length',
        'five_prime_positions', 'three_prime_positions', 'nonunique',
        'alternatively_spliced' and 'sequence'.
    """
    max_nongenomic_length = 5

    # Keys of the per-transcript count tables.  list(...) keeps the
    # concatenation valid when range() is not a list (Python 3) and when
    # relevant_lengths is some other sequence type such as a tuple.
    five_prime_keys = list(relevant_lengths) + ['all', 'all_nonunique']
    three_prime_keys = (list(range(max_nongenomic_length + 1)) +
                        ['all', 'all_nonunique'])

    gene_infos = {}
    bam_file = pysam.Samfile(clean_bam_fn)
    try:
        for transcript in transcripts:
            transcript.build_coordinate_maps(left_buffer, right_buffer)

            nonunique = 0
            alternatively_spliced = 0

            landmarks = {
                'start': 0,
                'start_codon': transcript.transcript_start_codon,
                'stop_codon': transcript.transcript_stop_codon,
                'end': transcript.transcript_length,
            }

            five_prime_positions = {
                key: PositionCounts(landmarks, left_buffer, right_buffer)
                for key in five_prime_keys
            }
            three_prime_positions = {
                key: PositionCounts(landmarks, left_buffer, right_buffer)
                for key in three_prime_keys
            }

            transcript_sequence = transcript.get_transcript_sequence(
                left_buffer, right_buffer)

            # fetch raises a ValueError if given a negative start, but it
            # doesn't care if the end is valid.
            left_edge = max(0, transcript.start - left_buffer)
            right_edge = transcript.end + right_buffer
            overlapping_reads = bam_file.fetch(transcript.seqname, left_edge,
                                               right_edge)

            for read in overlapping_reads:
                # Reads covering any position this transcript splices out are
                # counted and skipped.
                if any(
                        transcript.is_spliced_out(position)
                        for position in read.positions):
                    alternatively_spliced += 1
                    continue

                # NOTE(review): presumably 50 is the aligner's MAPQ for a
                # uniquely mapped read - confirm.  The nonunique tally happens
                # before the strand check, so it includes opposite-strand reads.
                is_unique = (read.mapq == 50)
                if not is_unique:
                    nonunique += 1

                read_strand = '-' if read.is_reverse else '+'
                if read_strand != transcript.strand:
                    continue

                # Distinct names so the fetch window above is not shadowed.
                read_left = read.pos
                read_right = read.aend - 1  # aend - 1: last aligned base
                if read_strand == '+':
                    five_prime_position = read_left
                    three_prime_position = read_right
                else:
                    five_prime_position = read_right
                    three_prime_position = read_left

                if five_prime_position in transcript.genomic_to_transcript:
                    transcript_coord = transcript.genomic_to_transcript[
                        five_prime_position]
                    if is_unique:
                        five_prime_positions['all']['start', transcript_coord] += 1
                        if read.qlen in relevant_lengths:
                            five_prime_positions[read.qlen][
                                'start', transcript_coord] += 1
                    elif not read.is_secondary:
                        five_prime_positions['all_nonunique'][
                            'start', transcript_coord] += 1

                if three_prime_position in transcript.genomic_to_transcript:
                    transcript_coord = transcript.genomic_to_transcript[
                        three_prime_position]
                    if is_unique:
                        three_prime_positions['all']['start', transcript_coord] += 1
                        nongenomic_length = trim.get_nongenomic_length(read)
                        if nongenomic_length <= max_nongenomic_length:
                            three_prime_positions[nongenomic_length][
                                'start', transcript_coord] += 1
                    elif not read.is_secondary:
                        three_prime_positions['all_nonunique'][
                            'start', transcript_coord] += 1

            gene_infos[transcript.name] = {
                'CDS_length': transcript.CDS_length,
                'five_prime_positions': five_prime_positions,
                'three_prime_positions': three_prime_positions,
                'nonunique': nonunique,
                'alternatively_spliced': alternatively_spliced,
                'sequence': transcript_sequence,
            }

            transcript.delete_coordinate_maps()
    finally:
        bam_file.close()

    return gene_infos