def _write_counts_tsv(cls, reads1, reads2, outfile):
    '''Writes a 3-column TSV (Name, Is_contam, Reads) reporting that contam
    removal was not run, using the total read count from both FASTQ files.'''
    n_pairs = fqtools.count([reads1, reads2])
    n_reads = 2 * n_pairs
    with open(outfile, 'w') as f:
        f.write('Name\tIs_contam\tReads\n')
        f.write('Remove_contam_not_run\t0\t' + str(n_reads) + '\n')
        f.write('Reads_kept_after_remove_contam\t0\t' + str(n_reads) + '\n')
def _write_counts_tsv(cls, reads1, reads2, outfile):
    """Write a 3-column TSV (Name, Is_contam, Reads) recording that the
    remove-contam stage was skipped; read count is taken from both FASTQs."""
    total_reads = 2 * fqtools.count([reads1, reads2])
    rows = [
        ("Name", "Is_contam", "Reads"),
        ("Remove_contam_not_run", "0", str(total_reads)),
        ("Reads_kept_after_remove_contam", "0", str(total_reads)),
    ]
    with open(outfile, "w") as fh:
        for row in rows:
            print("\t".join(row), file=fh)
def test_count(self):
    """test count"""
    # Two paired FASTQ fixtures; fqtools.count should report 3 read pairs.
    fastq_paths = [
        os.path.join(data_dir, name) for name in ("count.1.fq", "count.2.fq")
    ]
    self.assertEqual(3, fqtools.count(fastq_paths))
def map_reads(
    ref_fasta, reads1, reads2, outfile, rmdup=False, markdup=False, read_group=None, threads=1
):
    """Maps reads with BWA MEM. By default, outputs SAM file in input read order.
    rmdup=True => remove duplicates using samtools rmdup. Final output is sorted bam
                  Incompatible with markdup=True
    markdup=True => mark duplicates using picard MarkDuplicate. Final output is sorted bam.
                  Incompatible with rmdup=True
    read_group should be a tuple (group_id, group_name). If given, these
    will be put into the BAM.

    threads is passed to bwa mem -t.

    Raises Error on incompatible options, on any failing external command,
    and when the SAM record count does not match the FASTQ read count."""
    if rmdup and markdup:
        raise Error("Cannot have rmdup and markdup both True.")

    try:
        # Each FASTQ pair entry counts as two reads in the output SAM.
        expected_read_count = 2 * fqtools.count([reads1, reads2])
    except Exception:
        raise Error("Error counting reads in input files " + reads1 + " " + reads2)

    if rmdup or markdup:
        # Intermediate SAM/BAM live in a temp dir next to the final output.
        tmpdir = tempfile.mkdtemp(
            prefix=outfile + ".tmp.map_reads.", dir=os.path.dirname(outfile)
        )
        sam_file = os.path.join(tmpdir, "tmp.sam")
    else:
        sam_file = outfile

    # "LB:LIB" is needed, otherwise samtools rmdup segfaults when map_reads_set() is used
    R_option = (
        ""
        if read_group is None
        else r"""-R '@RG\tLB:LIB\tID:"""
        + read_group[0]
        + r"""\tSM:"""
        + read_group[1]
        + "'"
    )

    cmd = " ".join(
        [
            "bwa mem -M",
            f"-t {threads}",
            R_option,
            ref_fasta,
            reads1,
            reads2,
            r""" | awk '/^@/ || !and($2,256)' """,  # remove secondary alignments (but keep header)
            ">",
            sam_file,
        ]
    )

    try:
        utils.syscall(cmd)
    except Exception:
        # Don't leave the temp dir behind on failure. Bare "except:" avoided
        # so KeyboardInterrupt/SystemExit propagate.
        if rmdup or markdup:
            shutil.rmtree(tmpdir)
        raise Error("Error running BWA MEM: " + cmd)

    number_in_sam = utils.sam_record_count(sam_file)
    if expected_read_count != number_in_sam:
        if rmdup or markdup:
            shutil.rmtree(tmpdir)
        # Message was previously a broken multi-line literal; keep it on one line.
        raise Error(
            "Error! Mismatch in read counts. Expected "
            + str(expected_read_count)
            + " but got "
            + str(number_in_sam)
        )

    if rmdup or markdup:
        sorted_bam = os.path.join(tmpdir, "tmp.sorted.bam")
        cmd = " ".join(["samtools sort", "-o", sorted_bam, sam_file])
        try:
            utils.syscall(cmd)
        except Exception:
            shutil.rmtree(tmpdir)
            raise Error("Error running samtools sort: " + cmd)

        if rmdup:
            cmd = "samtools rmdup " + sorted_bam + " " + outfile
            try:
                utils.syscall(cmd)
            except Exception:
                shutil.rmtree(tmpdir)
                raise Error("Error running samtools rmdup: " + cmd)
        else:
            try:
                picard.mark_duplicates(sorted_bam, outfile)
            except Exception:
                shutil.rmtree(tmpdir)
                # Was reporting the stale samtools-sort cmd; name the real inputs.
                raise Error(
                    "Error running picard mark_duplicates on " + sorted_bam
                )

        shutil.rmtree(tmpdir)
def map_reads(ref_fasta, reads1, reads2, outfile, rmdup=False, markdup=False, read_group=None):
    '''Maps reads with BWA MEM. By default, outputs SAM file in input read order.
    rmdup=True => remove duplicates using samtools rmdup. Final output is sorted bam
                  Incompatible with markdup=True
    markdup=True => mark duplicates using picard MarkDuplicate. Final output is sorted bam.
                  Incompatible with rmdup=True
    read_group should be a tuple (group_id, group_name). If given, these
    will be put into the BAM.

    Raises Error on incompatible options, on any failing external command,
    and when the SAM record count does not match the FASTQ read count.'''
    if rmdup and markdup:
        raise Error('Cannot have rmdup and markdup both True.')

    try:
        # Each FASTQ pair entry counts as two reads in the output SAM.
        expected_read_count = 2 * fqtools.count([reads1, reads2])
    except Exception:
        raise Error('Error counting reads in input files ' + reads1 + ' ' + reads2)

    if rmdup or markdup:
        # Intermediate SAM/BAM live in a temp dir next to the final output.
        tmpdir = tempfile.mkdtemp(prefix=outfile + '.tmp.map_reads.', dir=os.path.dirname(outfile))
        sam_file = os.path.join(tmpdir, 'tmp.sam')
    else:
        sam_file = outfile

    # "LB:LIB" is needed, otherwise samtools rmdup segfaults when map_reads_set() is used
    R_option = '' if read_group is None else r'''-R '@RG\tLB:LIB\tID:''' + read_group[
        0] + r'''\tSM:''' + read_group[1] + "'"

    cmd = ' '.join([
        'bwa mem -M',
        R_option,
        ref_fasta,
        reads1,
        reads2,
        r''' | awk '/^@/ || !and($2,256)' ''',  # remove secondary alignments (but keep header)
        '>',
        sam_file
    ])

    try:
        utils.syscall(cmd)
    except Exception:
        # Don't leave the temp dir behind on failure. Bare "except:" avoided
        # so KeyboardInterrupt/SystemExit propagate.
        if rmdup or markdup:
            shutil.rmtree(tmpdir)
        raise Error('Error running BWA MEM: ' + cmd)

    number_in_sam = utils.sam_record_count(sam_file)
    if expected_read_count != number_in_sam:
        if rmdup or markdup:
            shutil.rmtree(tmpdir)
        # Message was previously a broken multi-line literal; keep it on one line.
        raise Error('Error! Mismatch in read counts. Expected ' + str(expected_read_count)
                    + ' but got ' + str(number_in_sam))

    if rmdup or markdup:
        sorted_bam = os.path.join(tmpdir, 'tmp.sorted.bam')
        cmd = ' '.join(['samtools sort', '-o', sorted_bam, sam_file])
        try:
            utils.syscall(cmd)
        except Exception:
            shutil.rmtree(tmpdir)
            raise Error('Error running samtools sort: ' + cmd)

        if rmdup:
            cmd = 'samtools rmdup ' + sorted_bam + ' ' + outfile
            try:
                utils.syscall(cmd)
            except Exception:
                shutil.rmtree(tmpdir)
                raise Error('Error running samtools rmdup: ' + cmd)
        else:
            try:
                picard.mark_duplicates(sorted_bam, outfile)
            except Exception:
                shutil.rmtree(tmpdir)
                # Was reporting the stale samtools-sort cmd; name the real inputs.
                raise Error('Error running picard mark_duplicates on ' + sorted_bam)

        shutil.rmtree(tmpdir)
def test_count(self):
    '''test count'''
    # fqtools.count sums pairs across both fixture files; expect 3.
    fq1 = os.path.join(data_dir, 'count.1.fq')
    fq2 = os.path.join(data_dir, 'count.2.fq')
    result = fqtools.count([fq1, fq2])
    self.assertEqual(3, result)