def process_remapped_unmapped(self):
    """Summarize reads that still failed to map after retrimming and remapping.

    Streams the remapped-unmapped BAM through untrimming and synthetic-sequence
    filtering, tallies read lengths and sequence frequencies split into
    long-poly(A) reads vs. everything else, and writes three outputs via
    self.write_file: per-length composition, length histograms, and the 100
    most common sequences in each category.
    """
    # Per-length histograms (index = read length) and per-sequence tallies,
    # kept separately for long-poly(A) reads and all other unmapped reads.
    polyA_length_hist = np.zeros(self.max_read_length + 1, int)
    polyA_seqs = Counter()
    other_length_hist = np.zeros(self.max_read_length + 1, int)
    other_seqs = Counter()

    # Lazy pipeline: BAM -> fastq reads -> untrimmed -> synthetic-filtered.
    reads = sam.bam_to_fastq(self.file_names['remapped_unmapped_bam'])
    reads = trim.untrim_reads(reads, second_time=True)
    reads = self.filter_synthetic_sequences(reads)

    def tally(read_stream):
        # Pass each read through unchanged while recording its length and
        # sequence in the appropriate (poly(A) or other) accumulators.
        for read in read_stream:
            if predominantly_A(read.seq):
                polyA_length_hist[len(read.seq)] += 1
                polyA_seqs[read.seq] += 1
            else:
                other_length_hist[len(read.seq)] += 1
                other_seqs[read.seq] += 1
            yield read

    # Composition analysis consumes the stream, which drives the tallying.
    seq_info_pairs = ((read.seq, False) for read in tally(reads))
    all_array, _ = composition.length_stratified_composition(
        seq_info_pairs, self.max_read_length,
    )

    self.write_file('unmapped_composition', all_array)
    self.write_file('lengths',
                    {'unmapped': other_length_hist,
                     'long_polyA': polyA_length_hist,
                    },
                   )

    # Retain only the 100 most frequent sequences from each category.
    common_unmapped = {
        'non_long_polyA': Counter(dict(other_seqs.most_common(100))),
        'long_polyA': Counter(dict(polyA_seqs.most_common(100))),
    }
    self.write_file('common_unmapped', common_unmapped)
def process_initially_unmapped(self):
    """Remap first-pass unmapped reads after removing phiX contamination.

    Converts the unmapped BAM to fastq records, filters out phiX reads,
    and hands the remainder to the poly(A)-trimmed remapping step.
    """
    initially_unmapped = sam.bam_to_fastq(self.file_names['unmapped_bam'])
    without_phiX = self.filter_phiX(initially_unmapped)
    self.remap_polyA_trimmed(without_phiX)