def calculate_distance_distribution(interleave_fhand, index_fpath,
                                    max_clipping, max_distance=None,
                                    tempdir=None, threads=None):
    '''It maps the interleaved reads and counts the distances between mates.

    The reads in interleave_fhand are mapped with bwa mem against
    index_fpath and the result is sorted by query name into a temporary
    bam. For every read name, each combination of the alignments that
    _get_totally_mapped_alignments returns for both mates (given
    max_clipping) is considered; only combinations on the same reference
    are counted.

    max_distance: when given, only distances lower than it are counted.
    tempdir: handed to the sorting step.
    threads: handed to the mapper.

    It returns a dict with the keys 'outies', 'innies' and 'others', each
    one holding an IntCounter indexed by distance.
    '''
    bam_fhand = NamedTemporaryFile(suffix='.bam')
    extra_params = ['-a', '-M']
    # threads was accepted by this function but never reached the mapper
    bwa = map_with_bwamem(index_fpath, interleave_fpath=interleave_fhand.name,
                          extra_params=extra_params, threads=threads)
    map_process_to_sortedbam(bwa, bam_fhand.name, key='queryname',
                             tempdir=tempdir)
    bamfile = Samfile(bam_fhand.name)
    stats = {'outies': IntCounter(), 'innies': IntCounter(),
             'others': IntCounter()}
    for grouped_mates in _group_alignments_reads_by_qname(bamfile):
        mates = _split_mates(grouped_mates)
        for aligned_read1 in _get_totally_mapped_alignments(mates[0],
                                                            max_clipping):
            for aligned_read2 in _get_totally_mapped_alignments(mates[1],
                                                                max_clipping):
                # mates on different references have no distance to count
                if aligned_read1.rname == aligned_read2.rname:
                    aligned_reads = [aligned_read1, aligned_read2]
                    distance = _find_distance(aligned_reads)
                    if _mates_are_outies(aligned_reads):
                        kind = 'outies'
                    elif _mates_are_innies(aligned_reads):
                        kind = 'innies'
                    else:
                        kind = 'others'
                    if max_distance is None or max_distance > distance:
                        stats[kind][distance] += 1
    return stats
def test_add_rg_to_bam(self):
    # Maps the arabidopsis reads as unpaired with a read group and checks
    # that the output of samtools view -h contains the @RG header line
    # and an expected read sequence.
    reference_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_genes')
    reads_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_reads.fastq')
    directory = TemporaryDir()
    # the finally guarantees that the directory is closed even when the
    # mapping or one of the assertions fails
    try:
        index_fpath = get_or_create_bwa_index(reference_fpath, directory.name)
        bam_fhand = NamedTemporaryFile(suffix='.bam')
        lib_name = 'aa'
        log_fhand = NamedTemporaryFile()
        readgroup = {
            'ID': lib_name,
            'PL': 'illumina',
            'LB': lib_name,
            'SM': '{0}_illumina_pe'.format(lib_name),
            'PU': '0'
        }
        bwa = map_with_bwamem(index_fpath, unpaired_fpath=reads_fpath,
                              readgroup=readgroup, log_fpath=log_fhand.name)
        map_process_to_bam(bwa, bam_fhand.name)
        # -h makes samtools print the header, where the read group lives
        out = subprocess.check_output(
            [get_binary_path('samtools'), 'view', '-h', bam_fhand.name],
            stderr=log_fhand)
        assert '@RG\tID:aa' in out
        assert 'TTCTGATTCAATCTACTTCAAAGTTGGCTTTATCAATAAG' in out
    finally:
        directory.close()
def test_classify_paired_reads(self):
    # Maps three interleaved pairs and checks that every classified pair
    # carries the read name expected for the kind it was given.
    index_fpath = os.path.join(TEST_DATA_DIR, 'ref_example.fasta')

    # Non chimeric
    seq1_fwd = '>seq1 1:N:0:GATCAG\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT\n'
    seq1_rev = '>seq1 2:N:0:GATCAG\nAGGAGGGATCGGGCACCCACGGCGCGGTAGACTGAGGCCTTCTCGAACT\n'
    # Chimeric
    seq2_fwd = '>seq2 1:N:0:GATCAG\nAAGTTCAATGTAGCAAGGGTACATGCTGACGAGATGGGTCTGCGATCCC\n'
    seq2_rev = '>seq2 2:N:0:GATCAG\nACGTGGATGCGGCGACGGCCCTACGGCACATACTGTTATTAGGGTCACT\n'
    # unknown
    seq3_fwd = '>seq3 1:N:0:GATCAG\nAGTGACCCTAATAACAGTATGTGCCGTAGGGCCGTCGCCGCATCCACGT\n'
    seq3_rev = '>seq3 2:N:0:GATCAG\nGTCGTGCGCAGCCATTGAGACCTTCCTAGGGTTTTCCCCATGGAATCGG\n'

    # the pairs are written in the order seq1, seq3, seq2
    interleaved = ''.join([seq1_fwd, seq1_rev, seq3_fwd, seq3_rev,
                           seq2_fwd, seq2_rev])
    reads_fhand = NamedTemporaryFile()
    reads_fhand.write(interleaved)
    reads_fhand.flush()

    sorted_bam = NamedTemporaryFile(suffix='.bam')
    mapper = map_with_bwamem(index_fpath, interleave_fpath=reads_fhand.name,
                             extra_params=['-a', '-M'])
    map_process_to_sortedbam(mapper, sorted_bam.name, key='queryname')

    expectations = ((NON_CHIMERIC, 'seq1'), (UNKNOWN, 'seq3'),
                    (CHIMERA, 'seq2'))
    for pair, kind in classify_mapped_reads(sorted_bam, mate_distance=2000):
        for expected_kind, seq_name in expectations:
            if kind == expected_kind:
                assert seq_name in get_name(pair[0])
                break
        else:
            # a kind that is none of the three known ones
            self.fail()
def classify_chimeras(in_fhand, index_fpath, mate_distance, out_fhand,
                      chimeras_fhand=None, unknown_fhand=None, tempdir=None,
                      threads=None, settings=get_setting('CHIMERAS_SETTINGS')):
    '''It maps the reads, classifies every pair and writes it by kind.

    The interleaved reads in in_fhand are mapped with bwa mem against
    index_fpath and sorted by query name into a temporary bam. The pairs
    returned by classify_mapped_reads are then written:
        - NON_CHIMERIC pairs to out_fhand,
        - CHIMERA pairs to chimeras_fhand, when one is given,
        - UNKNOWN pairs to unknown_fhand, when one is given.
    Pairs whose optional output file was not given are dropped.

    mate_distance and settings are handed to classify_mapped_reads,
    tempdir to the sorting step and threads to the mapper.
    '''
    bam_fhand = NamedTemporaryFile(suffix='.bam')
    extra_params = ['-a', '-M']
    # threads was accepted by this function but never reached the mapper
    bwa = map_with_bwamem(index_fpath, interleave_fpath=in_fhand.name,
                          extra_params=extra_params, threads=threads)
    map_process_to_sortedbam(bwa, bam_fhand.name, key='queryname',
                             tempdir=tempdir)
    for pair, kind in classify_mapped_reads(bam_fhand, settings=settings,
                                            mate_distance=mate_distance):
        # kinds are compared by value, an identity check would depend on
        # how the constants happen to be stored
        if kind == NON_CHIMERIC:
            write_seqs(pair, out_fhand)
        elif kind == CHIMERA and chimeras_fhand is not None:
            write_seqs(pair, chimeras_fhand)
        elif kind == UNKNOWN and unknown_fhand is not None:
            write_seqs(pair, unknown_fhand)
def test_classify_paired_reads(self):
    # Three interleaved pairs are mapped and classified; the name of the
    # first read of each pair has to match the kind assigned to the pair.
    index_fpath = os.path.join(TEST_DATA_DIR, 'ref_example.fasta')

    # Non chimeric
    non_chimeric_1 = '>seq1 1:N:0:GATCAG\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT\n'
    non_chimeric_2 = '>seq1 2:N:0:GATCAG\nAGGAGGGATCGGGCACCCACGGCGCGGTAGACTGAGGCCTTCTCGAACT\n'
    # Chimeric
    chimeric_1 = '>seq2 1:N:0:GATCAG\nAAGTTCAATGTAGCAAGGGTACATGCTGACGAGATGGGTCTGCGATCCC\n'
    chimeric_2 = '>seq2 2:N:0:GATCAG\nACGTGGATGCGGCGACGGCCCTACGGCACATACTGTTATTAGGGTCACT\n'
    # unknown
    unknown_1 = '>seq3 1:N:0:GATCAG\nAGTGACCCTAATAACAGTATGTGCCGTAGGGCCGTCGCCGCATCCACGT\n'
    unknown_2 = '>seq3 2:N:0:GATCAG\nGTCGTGCGCAGCCATTGAGACCTTCCTAGGGTTTTCCCCATGGAATCGG\n'

    reads_fhand = NamedTemporaryFile()
    # the chimeric pair goes last in the input
    reads_fhand.write(non_chimeric_1 + non_chimeric_2 +
                      unknown_1 + unknown_2 +
                      chimeric_1 + chimeric_2)
    reads_fhand.flush()

    sorted_bam = NamedTemporaryFile(suffix='.bam')
    bwa_params = ['-a', '-M']
    mapper = map_with_bwamem(index_fpath, extra_params=bwa_params,
                             interleave_fpath=reads_fhand.name)
    map_process_to_sortedbam(mapper, sorted_bam.name, key='queryname')

    classified = classify_mapped_reads(sorted_bam, mate_distance=2000)
    for pair, kind in classified:
        if kind == NON_CHIMERIC:
            expected_name = 'seq1'
        elif kind == UNKNOWN:
            expected_name = 'seq3'
        elif kind == CHIMERA:
            expected_name = 'seq2'
        else:
            # a kind that is none of the three known ones
            self.fail()
        assert expected_name in get_name(pair[0])
def test_map_with_bwa(self):
    # Maps the arabidopsis reads as unpaired and checks that an expected
    # read sequence shows up in the output of samtools view.
    reference_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_genes')
    reads_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_reads.fastq')
    directory = TemporaryDir()
    # the finally guarantees that the directory is closed even when the
    # mapping or the assertion fails
    try:
        index_fpath = get_or_create_bwa_index(reference_fpath, directory.name)
        bam_fhand = NamedTemporaryFile(suffix='.bam')
        bwa = map_with_bwamem(index_fpath, unpaired_fpath=reads_fpath)
        map_process_to_bam(bwa, bam_fhand.name)
        out = subprocess.check_output([get_binary_path('samtools'), 'view',
                                       bam_fhand.name])
        assert 'TTCTGATTCAATCTACTTCAAAGTTGGCTTTATCAATAAG' in out
    finally:
        directory.close()
def test_map_with_bwa(self):
    # Builds (or reuses) a bwa index in a temporary directory, maps the
    # unpaired arabidopsis reads and looks for an expected read sequence
    # in the output of samtools view.
    reference_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_genes')
    reads_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_reads.fastq')
    directory = TemporaryDir()
    # without the finally a failure in the mapping or in the assertion
    # would skip directory.close()
    try:
        index_fpath = get_or_create_bwa_index(reference_fpath, directory.name)
        bam_fhand = NamedTemporaryFile(suffix='.bam')
        bwa = map_with_bwamem(index_fpath, unpaired_fpath=reads_fpath)
        map_process_to_bam(bwa, bam_fhand.name)
        out = subprocess.check_output(
            [get_binary_path('samtools'), 'view', bam_fhand.name])
        assert 'TTCTGATTCAATCTACTTCAAAGTTGGCTTTATCAATAAG' in out
    finally:
        directory.close()
def _pre_trim(self, trim_packet):
    '''It maps the passed reads and keeps the sorted bam in self._bam_fhand.

    The groups of sequences found in trim_packet[SEQS_PASSED] are
    flattened and written to a temporary file inside self._tempdir. That
    file is mapped as interleaved reads with bwa mem against
    self._index_fpath and the result is sorted by query name into a second
    temporary file, which is stored in self._bam_fhand.
    '''
    # a distinct loop name avoids shadowing the list that is being built
    seqs = [seq for seq_group in trim_packet[SEQS_PASSED]
            for seq in seq_group]
    reads_fhand = NamedTemporaryFile(dir=self._tempdir, suffix='.trimming')
    # the reads file is only needed while mapping, it is closed even when
    # the writing or the mapping fails
    try:
        write_seqs(seqs, reads_fhand)
        reads_fhand.flush()
        bwa = map_with_bwamem(self._index_fpath,
                              interleave_fpath=reads_fhand.name)
        bam_fhand = NamedTemporaryFile(dir=self._tempdir)
        map_process_to_sortedbam(bwa, bam_fhand.name, key='queryname',
                                 tempdir=self._tempdir)
    finally:
        reads_fhand.close()
    self._bam_fhand = bam_fhand
def test_add_rg_to_bam(self):
    # The read group handed to the mapper has to appear as an @RG line in
    # the header printed by samtools view -h, next to an expected read
    # sequence.
    reference_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_genes')
    reads_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_reads.fastq')
    directory = TemporaryDir()
    # without the finally a failure in the mapping or in an assertion
    # would skip directory.close()
    try:
        index_fpath = get_or_create_bwa_index(reference_fpath, directory.name)
        bam_fhand = NamedTemporaryFile(suffix='.bam')
        lib_name = 'aa'
        log_fhand = NamedTemporaryFile()
        readgroup = {'ID': lib_name, 'PL': 'illumina', 'LB': lib_name,
                     'SM': '{0}_illumina_pe'.format(lib_name), 'PU': '0'}
        bwa = map_with_bwamem(index_fpath, unpaired_fpath=reads_fpath,
                              readgroup=readgroup, log_fpath=log_fhand.name)
        map_process_to_bam(bwa, bam_fhand.name)
        # -h makes samtools print the header, where the read group lives
        out = subprocess.check_output([get_binary_path('samtools'), 'view',
                                       '-h', bam_fhand.name],
                                      stderr=log_fhand)
        assert '@RG\tID:aa' in out
        assert 'TTCTGATTCAATCTACTTCAAAGTTGGCTTTATCAATAAG' in out
    finally:
        directory.close()
def _sorted_mapped_reads(ref_fpath, paired_fpaths=None, unpaired_fpaths=None,
                         directory=None, file_format=None, min_seed_len=None):
    '''It maps the reads with bwa mem and returns a bam sorted by query name.

    ref_fpath: reference from which the bwa index is obtained or created.
    paired_fpaths, unpaired_fpaths: read files handed to the mapper; at
        least one of them is required.
    directory: handed to get_or_create_bwa_index and also used as the
        location of the temporary bam (the default temporary directory is
        used when it is None).
    file_format: when given it is set on the first read file, otherwise
        the format of that file is asked for with get_format.
    min_seed_len: when given it is passed to bwa mem as -k.

    It returns a pysam.Samfile opened on the sorted temporary bam.
    It raises a ValueError when no read file is given.
    '''
    if not paired_fpaths and not unpaired_fpaths:
        msg = 'paired_fpaths or unpaired_fpaths are required'
        raise ValueError(msg)
    first_fpath = paired_fpaths[0] if paired_fpaths else unpaired_fpaths[0]

    # the file is only opened to set or to look up its format, so it is
    # closed right away instead of being left to the garbage collector
    fhand = open(first_fpath)
    try:
        if file_format is not None:
            set_format(fhand, file_format)
        else:
            # NOTE(review): the detected format is not used afterwards;
            # the call is kept in case it validates the file - confirm
            get_format(fhand)
    finally:
        fhand.close()

    index_fpath = get_or_create_bwa_index(ref_fpath, directory)
    extra_params = ['-a', '-M']
    if min_seed_len is not None:
        # command line parameters have to be strings
        extra_params.extend(['-k', str(min_seed_len)])
    bwa = map_with_bwamem(index_fpath, paired_fpaths=paired_fpaths,
                          unpaired_fpath=unpaired_fpaths,
                          extra_params=extra_params)
    # the temporary bam used to be created in a hard-coded home directory
    # that only exists in one development machine
    bam_fhand = NamedTemporaryFile(dir=directory)
    sort_mapped_reads(bwa, bam_fhand.name, key='queryname')
    bamfile = pysam.Samfile(bam_fhand.name)
    return bamfile
def calculate_distance_distribution(interleave_fhand, index_fpath,
                                    max_clipping, max_distance=None,
                                    tempdir=None, threads=None):
    '''It maps the interleaved reads and returns their distance distribution.

    The reads in interleave_fhand are mapped with bwa mem against
    index_fpath (threads is handed to the mapper) and sorted by query name
    into a temporary bam (tempdir is handed to the sorting step). The
    result is whatever calculate_distance_distribution_in_bam returns for
    that bam with the given max_clipping and max_distance.
    '''
    sorted_bam = NamedTemporaryFile(suffix='.bam')
    mapper = map_with_bwamem(index_fpath, threads=threads,
                             extra_params=['-a', '-M'],
                             interleave_fpath=interleave_fhand.name)
    map_process_to_sortedbam(mapper, sorted_bam.name, tempdir=tempdir,
                             key='queryname')
    distribution = calculate_distance_distribution_in_bam(
        sorted_bam, max_distance=max_distance, max_clipping=max_clipping)
    return distribution
def test_rev_compl_fragmented_reads(self):
    # Maps three read pairs, given as separate forward and reverse files,
    # against an index built from GENOME and opens the resulting bam.

    def _to_temp_file(content):
        # writes the content to a new temporary file and returns it open
        fhand = NamedTemporaryFile()
        fhand.write(content)
        fhand.flush()
        return fhand

    # with paired reads.
    # seq10: f is reversed, r is direct
    seq10_f = ('>seq10 f\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT' +
               '\n')
    seq10_r = ('>seq10 r\nATGTAATACGGGCTAGCCGGGGATGCCGACGATTAAACACGCTGTCATA' +
               'GTAGCGTCTTCCTAGGGTTTTCCCCATGGAATCGGTTATCGTGATACGTTAAATTT\n')
    # seq11: f is direct, r is reversed
    seq11_f = ('>seq11 f\nAAGTTCAATGTAGCAAGGGTACATGCTGACGAGATGGGTCTGCGATCCC' +
               '\n')
    seq11_r = ('>seq11 r\nAAATTTAACGTATCACGATAACCGATTCCATGGGGAAAACCCTAGGAAG' +
               'ACGCTACTATGACAGCGTGTTTAATCGTCGGCATCCCCGGCTAGCCCGTATTACAT\n')
    # seq4: f is fragmented in two reference sequences, r maps completely
    seq4_f = ('>seq4 f\nCAAATCATCACCAGACCATGTCCGATCCCGGGAGTCTTTTCCAAGGTGTGC' +
              'TCTTTATCCGGCCCTTGCTCAAGGGTATGTTAAAACGGCAAGAGCTGCCTGAGCGCG\n')
    seq4_r = ('>seq4 r\nTGTTCTGCAATCGATACAACGATCGAATTTAATCTGAGTAACTGCCAATTC' +
              'TGAGTAATATTATAGAAAGT\n')

    forward_fhand = _to_temp_file(seq10_f + seq11_f + seq4_f)
    reverse_fhand = _to_temp_file(seq10_r + seq11_r + seq4_r)
    genome_fhand = _to_temp_file(GENOME)

    index_fpath = get_or_create_bwa_index(genome_fhand.name)
    mapped_fhand = NamedTemporaryFile(suffix='.bam')
    mapper = map_with_bwamem(index_fpath,
                             paired_fpaths=[forward_fhand.name,
                                            reverse_fhand.name])
    map_process_to_bam(mapper, mapped_fhand.name)
    samfile = pysam.Samfile(mapped_fhand.name)
def calculate_distance_distribution(interleave_fhand, index_fpath,
                                    max_clipping, max_distance=None,
                                    tempdir=None, threads=None):
    '''It maps the interleaved reads and counts the distances between mates.

    The reads in interleave_fhand are mapped with bwa mem against
    index_fpath (threads is handed to the mapper) and sorted by query name
    into a temporary bam (tempdir is handed to the sorting step). For
    every read name, each combination of the alignments that
    _get_totally_mapped_alignments returns for both mates is looked at and
    the ones that lie on the same reference are counted by distance, as
    long as max_distance is None or greater than the distance.

    It returns a dict with the keys 'outies', 'innies' and 'others', each
    one holding an IntCounter indexed by distance.
    '''
    sorted_bam = NamedTemporaryFile(suffix='.bam')
    mapper = map_with_bwamem(index_fpath,
                             interleave_fpath=interleave_fhand.name,
                             extra_params=['-a', '-M'], threads=threads)
    map_process_to_sortedbam(mapper, sorted_bam.name, key='queryname',
                             tempdir=tempdir)
    alignments = AlignmentFile(sorted_bam.name)

    counters = {}
    for orientation in ('outies', 'innies', 'others'):
        counters[orientation] = IntCounter()

    for qname_group in _group_alignments_reads_by_qname(alignments):
        mates = _split_mates(qname_group)
        for first in _get_totally_mapped_alignments(mates[0], max_clipping):
            for second in _get_totally_mapped_alignments(mates[1],
                                                         max_clipping):
                # mates on different references are not counted
                if not first.rname == second.rname:
                    continue
                mate_pair = [first, second]
                distance = _find_distance(mate_pair)
                kind = ('outies' if _mates_are_outies(mate_pair) else
                        'innies' if _mates_are_innies(mate_pair) else
                        'others')
                if max_distance is not None and not max_distance > distance:
                    continue
                counters[kind][distance] += 1
    return counters
def test_rev_compl_fragmented_reads(self):
    # Maps three read pairs, given as separate forward and reverse files,
    # against the ref_example index and opens the resulting bam.
    index_fpath = os.path.join(TEST_DATA_DIR, 'ref_example.fasta')

    def _to_temp_file(content):
        # writes the content to a new temporary file and returns it open
        fhand = NamedTemporaryFile()
        fhand.write(content)
        fhand.flush()
        return fhand

    # with paired reads.
    # seq10: f is reversed, r is direct
    seq10_f = ('>seq10 f\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT' +
               '\n')
    seq10_r = ('>seq10 r\nATGTAATACGGGCTAGCCGGGGATGCCGACGATTAAACACGCTGTCATA' +
               'GTAGCGTCTTCCTAGGGTTTTCCCCATGGAATCGGTTATCGTGATACGTTAAATTT\n')
    # seq11: f is direct, r is reversed
    seq11_f = ('>seq11 f\nAAGTTCAATGTAGCAAGGGTACATGCTGACGAGATGGGTCTGCGATCCC' +
               '\n')
    seq11_r = ('>seq11 r\nAAATTTAACGTATCACGATAACCGATTCCATGGGGAAAACCCTAGGAAG' +
               'ACGCTACTATGACAGCGTGTTTAATCGTCGGCATCCCCGGCTAGCCCGTATTACAT\n')
    # seq4: f is fragmented in two reference sequences, r maps completely
    seq4_f = ('>seq4 f\nCAAATCATCACCAGACCATGTCCGATCCCGGGAGTCTTTTCCAAGGTGTGC' +
              'TCTTTATCCGGCCCTTGCTCAAGGGTATGTTAAAACGGCAAGAGCTGCCTGAGCGCG\n')
    seq4_r = ('>seq4 r\nTGTTCTGCAATCGATACAACGATCGAATTTAATCTGAGTAACTGCCAATTC' +
              'TGAGTAATATTATAGAAAGT\n')

    forward_fhand = _to_temp_file(seq10_f + seq11_f + seq4_f)
    reverse_fhand = _to_temp_file(seq10_r + seq11_r + seq4_r)

    mapped_fhand = NamedTemporaryFile(suffix='.bam')
    mapper = map_with_bwamem(index_fpath,
                             paired_fpaths=(forward_fhand.name,
                                            reverse_fhand.name))
    map_process_to_bam(mapper, mapped_fhand.name)
    samfile = pysam.Samfile(mapped_fhand.name)