Beispiel #1
0
    def process_full_length_mappings(self):
        """Yield a trimmed version of every mapping in the clean BAM file.

        This is a generator. Each mapping read from
        ``self.file_names['clean_bam']`` is passed through
        ``trim.trim_mismatches_from_start`` and then
        ``trim.trim_nongenomic_polyA_from_end``; the result of the second
        call is yielded.

        After the last mapping has been yielded, two summaries are written
        with ``self.write_file``:

        * ``'mismatches'``: the counts array filled in by
          ``trim_mismatches_from_start`` for non-secondary mappings.
        * ``'lengths'``: under the key ``'clean_trimmed'``, the distribution
          of ``qlen`` over trimmed mappings that are neither unmapped nor
          secondary.

        The summaries are only written if the generator is consumed to
        exhaustion. The BAM file handle is closed in every case (exhaustion,
        early ``close()`` of the generator, or an exception).
        """
        clean_bam = pysam.Samfile(self.file_names['clean_bam'])

        try:
            # NOTE(review): the meaning of each axis is defined by
            # trim.trim_mismatches_from_start, which is not visible here;
            # presumably (read length, position, quality, base, base) -
            # confirm against that helper.
            type_shape = (self.max_read_length + 1,
                          self.max_read_length,
                          fastq.MAX_EXPECTED_QUAL + 1,
                          6,
                          6,
                         )
            type_counts = np.zeros(type_shape, int)

            # To avoid counting mismatches in non-unique mappings multiple
            # times, a dummy secondary_type_counts array is passed to
            # trim_mismatches_from_start for secondary mappings. It is never
            # written out.
            secondary_type_counts = np.zeros(type_shape, int)

            clean_trimmed_length_counts = Counter()

            region_fetcher = genomes.build_region_fetcher(self.file_names['genome'],
                                                          load_references=True,
                                                          sam_file=clean_bam,
                                                         )

            for mapping in clean_bam:
                if mapping.is_secondary:
                    counts_array = secondary_type_counts
                else:
                    counts_array = type_counts

                trimmed_from_start = trim.trim_mismatches_from_start(mapping,
                                                                     region_fetcher,
                                                                     counts_array,
                                                                    )
                trimmed_from_end = trim.trim_nongenomic_polyA_from_end(trimmed_from_start,
                                                                       region_fetcher,
                                                                      )
                # Only primary mappings that are still mapped after trimming
                # contribute to the length distribution.
                if not trimmed_from_end.is_unmapped and not trimmed_from_end.is_secondary:
                    clean_trimmed_length_counts[trimmed_from_end.qlen] += 1

                yield trimmed_from_end

            self.write_file('mismatches', type_counts)

            clean_trimmed_lengths = self.zero_padded_array(clean_trimmed_length_counts)
            self.write_file('lengths', {'clean_trimmed': clean_trimmed_lengths})
        finally:
            # Release the file handle even if the consumer stops iterating
            # early or one of the helpers raises.
            clean_bam.close()
Beispiel #2
0
    def process_remapped(self):
        """Yield a trimmed and poly-A-extended version of every remapped read.

        This is a generator. Each mapping read from
        ``self.file_names['remapped_accepted_hits']`` is passed through
        ``trim.trim_mismatches_from_start`` and then
        ``trim.extend_polyA_end`` (called with ``trimmed_twice=True``); the
        result of the second call is yielded.

        After the last mapping has been yielded, the distribution of ``qlen``
        over extended mappings that are neither unmapped nor secondary is
        written with ``self.write_file`` to ``'lengths'`` under the key
        ``'remapped'``.

        The summary is only written if the generator is consumed to
        exhaustion. The BAM file handle is closed in every case (exhaustion,
        early ``close()`` of the generator, or an exception).
        """
        clean_bam = pysam.Samfile(self.file_names['remapped_accepted_hits'])

        try:
            # NOTE(review): the meaning of each axis is defined by
            # trim.trim_mismatches_from_start, which is not visible here -
            # confirm against that helper.
            type_shape = (self.max_read_length + 1,
                          self.max_read_length,
                          fastq.MAX_EXPECTED_QUAL + 1,
                          6,
                          6,
                         )
            # Required as an accumulator by trim_mismatches_from_start; this
            # method never writes it out.
            type_counts = np.zeros(type_shape, int)
            remapped_length_counts = Counter()

            region_fetcher = genomes.build_region_fetcher(self.file_names['genome'],
                                                          load_references=True,
                                                          sam_file=clean_bam,
                                                         )

            for mapping in clean_bam:
                trimmed_from_start = trim.trim_mismatches_from_start(mapping,
                                                                     region_fetcher,
                                                                     type_counts,
                                                                    )
                # Add back any genomic A's that were trimmed as part of
                # mappings and any remaining A's from the first non-genomic
                # onward as soft clipped bases for visualization in IGV.
                extended = trim.extend_polyA_end(trimmed_from_start,
                                                 region_fetcher,
                                                 trimmed_twice=True,
                                                )
                # Only primary mappings that are still mapped contribute to
                # the length distribution.
                if not extended.is_unmapped and not extended.is_secondary:
                    remapped_length_counts[extended.qlen] += 1

                yield extended

            remapped_lengths = self.zero_padded_array(remapped_length_counts)
            self.write_file('lengths', {'remapped': remapped_lengths})
        finally:
            # Release the file handle even if the consumer stops iterating
            # early or one of the helpers raises.
            clean_bam.close()