def test_sam_record_count(self):
    '''test sam_record_count

    Writes a small fake SAM file (two header lines starting with '@'
    plus 42 record lines) and checks that only the records are counted.
    '''
    tmpfile = 'tmp.utils.sam_record_count'
    try:
        with open(tmpfile, 'w') as f:
            print('@foo bar', file=f)
            print('@PG baz', file=f)
            # 42 fake record lines; the loop index is unused
            for _ in range(42):
                print('zaphod', file=f)
        self.assertEqual(42, utils.sam_record_count(tmpfile))
    finally:
        # Bug fix: remove the temp file even when the assertion fails,
        # so a failing run does not leave tmp.utils.sam_record_count behind.
        os.unlink(tmpfile)
def test_sam_record_count(self):
    """test sam_record_count"""
    # Build a small fake SAM file: two header lines followed by 42 records.
    tmpfile = "tmp.utils.sam_record_count"
    lines = ["@foo bar", "@PG baz"] + ["zaphod"] * 42
    with open(tmpfile, "w") as f:
        for line in lines:
            print(line, file=f)
    # Only the 42 record lines should be counted, not the '@' headers.
    self.assertEqual(42, utils.sam_record_count(tmpfile))
    os.unlink(tmpfile)
def test_map_reads_secondary_hits_removed(self):
    """test map_reads secondary hits get removed"""
    fwd_reads = os.path.join(data_dir, "secondary_hits_removed.reads_1.fq")
    rev_reads = os.path.join(data_dir, "secondary_hits_removed.reads_2.fq")
    reference = os.path.join(data_dir, "secondary_hits_removed.ref.fa")
    out_sam = "tmp.test_map_reads.sam"
    # Make sure a stale output file from a previous run does not interfere.
    if os.path.exists(out_sam):
        os.unlink(out_sam)
    read_map.map_reads(reference, fwd_reads, rev_reads, out_sam)
    # bwa mem reports one secondary alignment, so the 1 read pair makes
    # 3 SAM records. So should have 2 records after removing secondary match.
    self.assertEqual(2, utils.sam_record_count(out_sam))
    os.unlink(out_sam)
def map_reads(
    ref_fasta, reads1, reads2, outfile, rmdup=False, markdup=False, read_group=None, threads=1
):
    """Maps reads with BWA MEM, discarding secondary alignments.

    By default, outputs SAM file in input read order.

    Args:
        ref_fasta: reference FASTA (must already be bwa-indexed).
        reads1: forwards reads file.
        reads2: reverse reads file.
        outfile: output file name (SAM, or sorted BAM when rmdup/markdup).
        rmdup: remove duplicates using samtools rmdup. Final output is sorted
            bam. Incompatible with markdup=True.
        markdup: mark duplicates using picard MarkDuplicates. Final output is
            sorted bam. Incompatible with rmdup=True.
        read_group: tuple (group_id, group_name). If given, these will be put
            into the BAM's @RG header line.
        threads: number of mapping threads passed to ``bwa mem -t``.

    Raises:
        Error: if rmdup and markdup are both True, if read counting fails,
            if any external command fails, or if the number of SAM records
            does not match the number of input reads.
    """
    if rmdup and markdup:
        # (removed redundant implicit concatenation with an empty string)
        raise Error("Cannot have rmdup and markdup both True.")

    try:
        expected_read_count = 2 * fqtools.count([reads1, reads2])
    except Exception as e:
        # Bug fix: bare `except:` also caught SystemExit/KeyboardInterrupt and
        # discarded the original traceback; catch Exception and chain it.
        raise Error("Error counting reads in input files " + reads1 + " " + reads2) from e

    if rmdup or markdup:
        tmpdir = tempfile.mkdtemp(
            prefix=outfile + ".tmp.map_reads.", dir=os.path.dirname(outfile)
        )
        sam_file = os.path.join(tmpdir, "tmp.sam")
    else:
        sam_file = outfile

    # "LB:LIB" is needed, otherwise samtools rmdup segfaults when map_reads_set() is used
    R_option = (
        ""
        if read_group is None
        else r"""-R '@RG\tLB:LIB\tID:"""
        + read_group[0]
        + r"""\tSM:"""
        + read_group[1]
        + "'"
    )

    cmd = " ".join(
        [
            "bwa mem -M",
            f"-t {threads}",
            R_option,
            ref_fasta,
            reads1,
            reads2,
            r""" | awk '/^@/ || !and($2,256)' """,  # remove secondary alignments (but keep header)
            ">",
            sam_file,
        ]
    )

    try:
        utils.syscall(cmd)
    except Exception as e:
        if rmdup or markdup:
            shutil.rmtree(tmpdir)
        raise Error("Error running BWA MEM: " + cmd) from e

    # Every input read should produce exactly one (primary) SAM record.
    number_in_sam = utils.sam_record_count(sam_file)
    if expected_read_count != number_in_sam:
        if rmdup or markdup:
            shutil.rmtree(tmpdir)
        raise Error(
            "Error! Mismatch in read counts. Expected "
            + str(expected_read_count)
            + " but got "
            + str(number_in_sam)
        )

    if rmdup or markdup:
        sorted_bam = os.path.join(tmpdir, "tmp.sorted.bam")
        cmd = " ".join(["samtools sort", "-o", sorted_bam, sam_file])
        try:
            utils.syscall(cmd)
        except Exception as e:
            shutil.rmtree(tmpdir)
            raise Error("Error running samtools sort: " + cmd) from e

        if rmdup:
            cmd = "samtools rmdup " + sorted_bam + " " + outfile
            try:
                utils.syscall(cmd)
            except Exception as e:
                shutil.rmtree(tmpdir)
                raise Error("Error running samtools rmdup: " + cmd) from e
        else:
            try:
                picard.mark_duplicates(sorted_bam, outfile)
            except Exception as e:
                shutil.rmtree(tmpdir)
                # Bug fix: the old message interpolated the stale `cmd` from the
                # samtools sort step; report the actual picard inputs instead.
                raise Error(
                    "Error running picard mark_duplicates " + sorted_bam + " " + outfile
                ) from e

        shutil.rmtree(tmpdir)
def map_reads(ref_fasta, reads1, reads2, outfile, rmdup=False, markdup=False, read_group=None):
    '''Maps reads with BWA MEM, discarding secondary alignments.

    By default, outputs SAM file in input read order.

    ref_fasta: reference FASTA (must already be bwa-indexed).
    reads1, reads2: forwards and reverse reads files.
    outfile: output file name (SAM, or sorted BAM when rmdup/markdup).
    rmdup=True => remove duplicates using samtools rmdup. Final output is
        sorted bam. Incompatible with markdup=True.
    markdup=True => mark duplicates using picard MarkDuplicate. Final output
        is sorted bam. Incompatible with rmdup=True.
    read_group should be a tuple (group_id, group_name). If given, these will
        be put into the BAM.

    Raises Error if rmdup and markdup are both True, if read counting fails,
    if any external command fails, or if the number of SAM records does not
    match the number of input reads.'''
    if rmdup and markdup:
        # (removed redundant implicit concatenation with an empty string)
        raise Error('Cannot have rmdup and markdup both True.')

    try:
        expected_read_count = 2 * fqtools.count([reads1, reads2])
    except Exception as e:
        # Bug fix: bare `except:` also caught SystemExit/KeyboardInterrupt and
        # discarded the original traceback; catch Exception and chain it.
        raise Error('Error counting reads in input files ' + reads1 + ' ' + reads2) from e

    if rmdup or markdup:
        tmpdir = tempfile.mkdtemp(prefix=outfile + '.tmp.map_reads.', dir=os.path.dirname(outfile))
        sam_file = os.path.join(tmpdir, 'tmp.sam')
    else:
        sam_file = outfile

    # "LB:LIB" is needed, otherwise samtools rmdup segfaults when map_reads_set() is used
    R_option = '' if read_group is None else r'''-R '@RG\tLB:LIB\tID:''' + read_group[
        0] + r'''\tSM:''' + read_group[1] + "'"

    cmd = ' '.join([
        'bwa mem -M',
        R_option,
        ref_fasta,
        reads1,
        reads2,
        r''' | awk '/^@/ || !and($2,256)' ''',  # remove secondary alignments (but keep header)
        '>',
        sam_file
    ])

    try:
        utils.syscall(cmd)
    except Exception as e:
        if rmdup or markdup:
            shutil.rmtree(tmpdir)
        raise Error('Error running BWA MEM: ' + cmd) from e

    # Every input read should produce exactly one (primary) SAM record.
    number_in_sam = utils.sam_record_count(sam_file)
    if expected_read_count != number_in_sam:
        if rmdup or markdup:
            shutil.rmtree(tmpdir)
        raise Error(
            'Error! Mismatch in read counts. Expected '
            + str(expected_read_count)
            + ' but got '
            + str(number_in_sam)
        )

    if rmdup or markdup:
        sorted_bam = os.path.join(tmpdir, 'tmp.sorted.bam')
        cmd = ' '.join(['samtools sort', '-o', sorted_bam, sam_file])
        try:
            utils.syscall(cmd)
        except Exception as e:
            shutil.rmtree(tmpdir)
            raise Error('Error running samtools sort: ' + cmd) from e

        if rmdup:
            cmd = 'samtools rmdup ' + sorted_bam + ' ' + outfile
            try:
                utils.syscall(cmd)
            except Exception as e:
                shutil.rmtree(tmpdir)
                raise Error('Error running samtools rmdup: ' + cmd) from e
        else:
            try:
                picard.mark_duplicates(sorted_bam, outfile)
            except Exception as e:
                shutil.rmtree(tmpdir)
                # Bug fix: the old message interpolated the stale `cmd` from the
                # samtools sort step; report the actual picard inputs instead.
                raise Error(
                    'Error running picard mark_duplicates ' + sorted_bam + ' ' + outfile
                ) from e

        shutil.rmtree(tmpdir)