def filter_mappings(self):
        num_unmapped = 0
        num_entirely_genomic = 0
        num_nonunique = 0
        num_unique = 0

        nongenomic_lengths = Counter()

        sam_file = pysam.Samfile(self.file_names['accepted_hits'])
    
        region_fetcher = genomes.build_region_fetcher(self.file_names['genome'],
                                                      load_references=True,
                                                      sam_file=sam_file,
                                                     )

        extended_sorter = sam.AlignmentSorter(sam_file.references,
                                              sam_file.lengths,
                                              self.file_names['extended'],
                                             )
        filtered_sorter = sam.AlignmentSorter(sam_file.references,
                                              sam_file.lengths,
                                              self.file_names['extended_filtered'],
                                             )

        extended_mappings = (trim.extend_polyA_end(mapping, region_fetcher) for mapping in sam_file)
        mapping_groups = utilities.group_by(extended_mappings, lambda m: m.qname)

        with extended_sorter, filtered_sorter:
            for qname, group in mapping_groups:
                for m in group:
                    extended_sorter.write(m)

                min_nongenomic_length = min(trim.get_nongenomic_length(m) for m in group)
                nongenomic_lengths[min_nongenomic_length] += 1
                if min_nongenomic_length == 0:
                    num_entirely_genomic += 1
                    continue
                
                # A mapping is non-unique if the read has more than one alignment
                # or any alignment with MAPQ below 40.
                nonunique = len(group) > 1 or any(m.mapq < 40 for m in group)
                if nonunique:
                    num_nonunique += 1
                    continue
                
                num_unique += 1
                
                for m in group:
                    filtered_sorter.write(m)

        self.summary.extend(
            [('Mapped with no non-genomic A\'s', num_entirely_genomic),
             ('Nonunique', num_nonunique),
             ('Unique', num_unique),
            ],
        )

        nongenomic_lengths = utilities.counts_to_array(nongenomic_lengths)
        self.write_file('nongenomic_lengths', nongenomic_lengths)
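
The grouping above relies on `utilities.group_by`, which is not shown here. From its use it appears to take an iterator of name-sorted alignments plus a key function and to yield `(key, group)` pairs in which each group is a materialized list (each group is both written out and length-checked). A minimal sketch under those assumptions, built on `itertools.groupby`:

from itertools import groupby

def group_by(iterable, key_func):
    # Yield (key, items) pairs, materializing each group into a list so it
    # can be iterated more than once, as filter_mappings does above.
    for key, items in groupby(iterable, key_func):
        yield key, list(items)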
Example #2
    def process_remapped(self):
        clean_bam = pysam.Samfile(self.file_names['remapped_accepted_hits'])
        
        type_shape = (self.max_read_length + 1,
                      self.max_read_length,
                      fastq.MAX_EXPECTED_QUAL + 1,
                      6,
                      6,
                     )
        type_counts = np.zeros(type_shape, int)
        remapped_length_counts = Counter()
    
        region_fetcher = genomes.build_region_fetcher(self.file_names['genome'],
                                                      load_references=True,
                                                      sam_file=clean_bam,
                                                     )

        for mapping in clean_bam:
            trimmed_from_start = trim.trim_mismatches_from_start(mapping,
                                                                 region_fetcher,
                                                                 type_counts,
                                                                )
            # Add back any genomic A's that were trimmed as part of the mapping,
            # and append any remaining A's from the first non-genomic position
            # onward as soft-clipped bases for visualization in IGV.
            extended = trim.extend_polyA_end(trimmed_from_start,
                                             region_fetcher,
                                             trimmed_twice=True,
                                            )
            if not extended.is_unmapped and not extended.is_secondary:
                remapped_length_counts[extended.qlen] += 1

            yield extended

        remapped_lengths = self.zero_padded_array(remapped_length_counts)
        self.write_file('lengths', {'remapped': remapped_lengths})
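
Neither `self.zero_padded_array` nor `utilities.counts_to_array` is shown in these snippets. Judging from how they are used, both turn a `Counter` keyed by non-negative integers into a dense count array indexed by that integer. A rough sketch under that assumption, where the `max_length` padding parameter is a guess at what `self.max_read_length` supplies:

import numpy as np

def zero_padded_array(counts, max_length):
    # Dense array of counts indexed by read length, zero-padded out to
    # max_length so that arrays from different samples have the same shape.
    array = np.zeros(max_length + 1, int)
    for length, count in counts.items():
        array[length] = count
    return array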
Example #4
    def combine_mappings(self):
        num_unmapped = 0
        num_five_unmapped = 0
        num_three_unmapped = 0
        num_nonunique = 0
        num_discordant = 0
        num_concordant = 0

        five_prime_mappings = pysam.Samfile(self.file_names['five_prime_accepted_hits'])
        five_prime_unmapped = pysam.Samfile(self.file_names['five_prime_unmapped'])
        all_five_prime = sam.merge_by_name(five_prime_mappings, five_prime_unmapped)
        five_prime_grouped = utilities.group_by(all_five_prime, lambda m: m.qname)

        three_prime_mappings = pysam.Samfile(self.file_names['three_prime_accepted_hits'])
        three_prime_unmapped = pysam.Samfile(self.file_names['three_prime_unmapped'])
        all_three_prime = sam.merge_by_name(three_prime_mappings, three_prime_unmapped)
        three_prime_grouped = utilities.group_by(all_three_prime, lambda m: m.qname)

        group_pairs = izip(five_prime_grouped, three_prime_grouped)

        alignment_sorter = sam.AlignmentSorter(five_prime_mappings.references,
                                               five_prime_mappings.lengths,
                                               self.file_names['combined_extended'],
                                              )
        region_fetcher = genomes.build_region_fetcher(self.file_names['genome'],
                                                      load_references=True,
                                                      sam_file=five_prime_mappings,
                                                     )

        with alignment_sorter:
            for (five_qname, five_group), (three_qname, three_group) in group_pairs:
                five_annotation = trim.PayloadAnnotation.from_identifier(five_qname)
                three_annotation = trim.PayloadAnnotation.from_identifier(three_qname)
                if five_annotation['original_name'] != three_annotation['original_name']:
                    # Ensure that the iteration through the two groupings stays in sync.
                    raise ValueError('qname mismatch: {0} {1}'.format(five_qname, three_qname))

                five_unmapped = any(m.is_unmapped for m in five_group)
                three_unmapped = any(m.is_unmapped for m in three_group)
                if five_unmapped:
                    num_five_unmapped += 1
                if three_unmapped:
                    num_three_unmapped += 1
                if five_unmapped or three_unmapped:
                    num_unmapped += 1
                    continue

                five_nonunique = len(five_group) > 1 or any(m.mapq < 40 for m in five_group)
                three_nonunique = len(three_group) > 1 or any(m.mapq < 40 for m in three_group)
                if five_nonunique or three_nonunique:
                    num_nonunique += 1
                    continue
                
                five_m = five_group.pop()
                three_m = three_group.pop()

                five_strand = '-' if five_m.is_reverse else '+'
                three_strand = '-' if three_m.is_reverse else '+'

                tlen = max(five_m.aend, three_m.aend) - min(five_m.pos, three_m.pos)
                discordant = (five_m.tid != three_m.tid or
                              five_strand != three_strand or
                              tlen > 10000)
                if discordant:
                    num_discordant += 1
                    continue
                
                if five_strand == '+':
                    first_read = five_m
                    second_read = three_m
                elif five_strand == '-':
                    first_read = three_m
                    second_read = five_m
                
                gap = second_read.pos - first_read.aend
                if gap < 0:
                    num_discordant += 1
                    continue
                
                combined_read = pysam.AlignedRead()
                # The qname needs to come from three_m so that it records the trimmed A's.
                combined_read.qname = three_m.qname
                combined_read.tid = five_m.tid
                combined_read.seq = first_read.seq + second_read.seq
                combined_read.qual = first_read.qual + second_read.qual
                # CIGAR op 3 ('N', skipped region) bridges the genomic gap between the halves.
                combined_read.cigar = first_read.cigar + [(3, gap)] + second_read.cigar
                combined_read.pos = first_read.pos
                combined_read.is_reverse = first_read.is_reverse
                combined_read.mapq = min(first_read.mapq, second_read.mapq)
                combined_read.rnext = -1
                combined_read.pnext = -1
                
                num_concordant += 1

                extended_mapping = trim.extend_polyA_end(combined_read,
                                                         region_fetcher,
                                                        )

                alignment_sorter.write(extended_mapping)

        self.summary.extend(
            [('Unmapped', num_unmapped),
             ('Five prime unmapped', num_five_unmapped),
             ('Three prime unmapped', num_three_unmapped),
             ('Nonunique', num_nonunique),
             ('Discordant', num_discordant),
             ('Concordant', num_concordant),
            ],
        )
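
The CIGAR concatenation in `combine_mappings` joins the two halves of each read with a BAM 'N' operation (op code 3, skipped reference region) spanning the genomic gap between them. The following self-contained sketch illustrates the same construction with current pysam class names (the method above uses the older `Samfile`/`AlignedRead` aliases); the reference name, coordinates, and sequences are made up for illustration:

import pysam

header = pysam.AlignmentHeader.from_dict({
    'HD': {'VN': '1.6'},
    'SQ': [{'SN': 'chrI', 'LN': 100000}],
})

# Two 10 bp halves of the same read, mapped 100 bp apart on the reference.
first = pysam.AlignedSegment(header)
first.query_name = 'example_read'
first.query_sequence = 'ACGTACGTAC'
first.reference_id = 0
first.reference_start = 1000
first.cigartuples = [(0, 10)]                         # 10M

second = pysam.AlignedSegment(header)
second.query_name = 'example_read'
second.query_sequence = 'GTACGTACGT'
second.reference_id = 0
second.reference_start = 1110
second.cigartuples = [(0, 10)]                        # 10M

gap = second.reference_start - first.reference_end   # 100

combined = pysam.AlignedSegment(header)
combined.query_name = first.query_name
combined.reference_id = first.reference_id
combined.reference_start = first.reference_start
combined.query_sequence = first.query_sequence + second.query_sequence
combined.cigartuples = (first.cigartuples +
                        [(3, gap)] +                  # N over the gap
                        second.cigartuples)
combined.mapping_quality = min(first.mapping_quality, second.mapping_quality)

print(combined.cigarstring)                           # 10M100N10M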