def classify(self, inBam, db, outReads, numThreads=None):
    """Classify input reads (bam) with kraken in paired mode.

    The BAM is converted to two temporary FASTQ files with Picard SamToFastq
    and kraken is then run on both files with ``--paired``.

    Args:
        inBam: unaligned reads
        db: Kraken built database directory.
        outReads: Output file of command.
        numThreads: requested number of threads; the value passed to kraken
            is capped at util.misc.available_cpu_count(). None requests the
            cap itself.
    """
    if tools.samtools.SamtoolsTool().isEmpty(inBam):
        # kraken cannot deal with empty input: produce an empty output file
        # instead. Write mode is required -- read mode ('rt') never creates
        # the file and raises when it does not exist yet.
        with open(outReads, 'wt'):
            pass
        return

    tmp_fastq1 = util.file.mkstempfname('.1.fastq')
    tmp_fastq2 = util.file.mkstempfname('.2.fastq')
    try:
        # do not convert this to samtools bam2fq unless we can figure out how to replicate
        # the clipping functionality of Picard SamToFastq
        picard = tools.picard.SamToFastqTool()
        picard_opts = {
            'CLIPPING_ATTRIBUTE': tools.picard.SamToFastqTool.illumina_clipping_attribute,
            'CLIPPING_ACTION': 'X'
        }
        picard.execute(inBam, tmp_fastq1, tmp_fastq2,
                       picardOptions=tools.picard.PicardTools.dict_to_picard_opts(picard_opts),
                       JVMmemory=picard.jvmMemDefault)

        if numThreads is None:
            # placeholder for "no limit"; min() below reduces it to the CPU count
            numThreads = 10000000
        opts = {
            '--paired': None,
            '--threads': min(int(numThreads), util.misc.available_cpu_count()),
        }
        self.execute('kraken', db, outReads, args=[tmp_fastq1, tmp_fastq2], options=opts)
    finally:
        # remove the intermediate FASTQ files even when picard or kraken fails
        for tmp_fastq in (tmp_fastq1, tmp_fastq2):
            if os.path.exists(tmp_fastq):
                os.unlink(tmp_fastq)
def diamond(inBam, db, taxDb, outReport, outM8=None, outLca=None, numThreads=1):
    ''' Classify reads by the taxon of the Lowest Common Ancestor (LCA).

        Reads are extracted from inBam with Picard SamToFastq, aligned with
        diamond blastx, reduced to one LCA per read by blast_lca and
        summarised into outReport. Gzip-compressed copies of the tabular
        alignments / LCA table are written when outM8 / outLca are given.
    '''

    def gzip_copy(src_path, dest_path):
        # store a gzip-compressed copy of src_path at dest_path
        with open(src_path, 'rb') as f_in, gzip.open(dest_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

    fastq_r1 = util.file.mkstempfname('.fastq')
    fastq_r2 = util.file.mkstempfname('.fastq')

    # do not convert this to samtools bam2fq unless we can figure out how to replicate
    # the clipping functionality of Picard SamToFastq
    sam_to_fastq = tools.picard.SamToFastqTool()
    clipping_opts = {
        'CLIPPING_ATTRIBUTE': tools.picard.SamToFastqTool.illumina_clipping_attribute,
        'CLIPPING_ACTION': 'X'
    }
    sam_to_fastq.execute(
        inBam, fastq_r1, fastq_r2,
        picardOptions=tools.picard.PicardTools.dict_to_picard_opts(clipping_opts),
        JVMmemory=sam_to_fastq.jvmMemDefault)

    aligner = tools.diamond.Diamond()
    aligner.install()
    daa_path = util.file.mkstempfname('.daa')
    m8_path = util.file.mkstempfname('.diamond.m8')
    aligner.blastx(db, [fastq_r1, fastq_r2], daa_path, options={'--threads': numThreads})
    aligner.view(daa_path, m8_path, options={'--threads': numThreads})

    if outM8:
        gzip_copy(m8_path, outM8)

    tax_db = TaxonomyDb(tax_dir=taxDb, load_names=True, load_nodes=True, load_gis=True)
    lca_path = util.file.mkstempfname('.tsv')
    with open(m8_path) as m8_in, open(lca_path, 'w') as lca_out:
        blast_lca(tax_db, m8_in, lca_out, paired=True, min_bit_score=50)

    if outLca:
        gzip_copy(lca_path, outLca)

    with open(lca_path) as lca_in:
        hits = taxa_hits_from_tsv(lca_in)

    with open(outReport, 'w') as report:
        for report_line in kraken_dfs_report(tax_db, hits):
            print(report_line, file=report)
def pipeline(self, inBam, db, outReport=None, outReads=None, filterThreshold=None, numThreads=None):
    """Stream a BAM through kraken (and optionally kraken-filter / kraken-report).

    Picard SamToFastq writes the two mates into FIFOs in the background while
    a bash pipeline runs kraken on those FIFOs.

    Args:
        inBam: input BAM file.
        db: kraken database, exported as KRAKEN_DEFAULT_DB for the pipeline.
        outReport: if given, kraken-report output is written here.
        outReads: if given, a gzip-compressed copy of the kraken read
            classifications is written here.
        filterThreshold: if given, kraken-filter is run with this --threshold.
        numThreads: requested number of threads; capped at
            util.misc.available_cpu_count(). None requests the cap itself.

    Raises:
        subprocess.CalledProcessError: if the shell pipeline exits non-zero.
    """
    with util.file.fifo(2) as (fastq1_pipe, fastq2_pipe):
        # do not convert this to samtools bam2fq unless we can figure out how to replicate
        # the clipping functionality of Picard SamToFastq
        picard = tools.picard.SamToFastqTool()
        picard_opts = {
            'CLIPPING_ATTRIBUTE': tools.picard.SamToFastqTool.illumina_clipping_attribute,
            'CLIPPING_ACTION': 'X'
        }
        picard.execute(
            inBam, fastq1_pipe, fastq2_pipe,
            picardOptions=tools.picard.PicardTools.dict_to_picard_opts(picard_opts),
            JVMmemory=picard.jvmMemDefault,
            background=True)

        if numThreads is None:
            # placeholder for "no limit"; min() below reduces it to the CPU count
            numThreads = 10000000
        # The capped value is what must reach the command line; previously it
        # was computed but the raw numThreads was passed to kraken.
        threads = min(int(numThreads), util.misc.available_cpu_count())

        kraken_bin = os.path.join(self.libexec, 'kraken')
        # The fd juggling routes kraken's stderr through sed (dropping the
        # "Processed N sequences" progress lines) while its stdout continues
        # down the pipeline.
        cmd = (
            "export KRAKEN_DEFAULT_DB={kraken_db}; "
            "{{ {kraken} --paired --fastq-input --threads {threads} {fastq1} {fastq2} "
            "2>&1 1>&3 3>&- | sed '/Processed [0-9]* sequences/d'; }} "
            "3>&1 1>&2"
        ).format(
            kraken_db=db,
            kraken=kraken_bin,
            threads=threads,
            fastq1=fastq1_pipe,
            fastq2=fastq2_pipe)

        if outReads is not None:
            cmd += '| tee >(gzip > {kraken_reads})'.format(kraken_reads=outReads)

        if filterThreshold is not None:
            kraken_filter_bin = os.path.join(self.libexec, 'kraken-filter')
            cmd += '| {kraken_filter} --threshold {filterThreshold}'.format(
                kraken_filter=kraken_filter_bin,
                filterThreshold=filterThreshold)

        if outReport is not None:
            kraken_report_bin = os.path.join(self.libexec, 'kraken-report')
            cmd += '| {kraken_report} > {outReport}'.format(
                kraken_report=kraken_report_bin,
                outReport=outReport)

        # bash is required for the >( ) process substitution used above
        subprocess.check_call(cmd, shell=True, executable='/bin/bash')
def kraken(inBam, db, outReport=None, outReads=None, filterThreshold=None, numThreads=1):
    ''' Classify the reads of a BAM file with kraken.

        The reads are converted to paired FASTQ files with Picard SamToFastq,
        classified, optionally passed through kraken-filter (when
        filterThreshold is set), then saved gzip-compressed to outReads
        and/or summarised by kraken-report into outReport.
    '''
    assert outReads or outReport, ('Either --outReads or --outReport must be specified.')

    fastq_r1 = util.file.mkstempfname('.1.fastq')
    fastq_r2 = util.file.mkstempfname('.2.fastq')

    sam_to_fastq = tools.picard.SamToFastqTool()
    clipping_opts = {
        'CLIPPING_ATTRIBUTE': tools.picard.SamToFastqTool.illumina_clipping_attribute,
        'CLIPPING_ACTION': 'X'
    }
    sam_to_fastq.execute(
        inBam, fastq_r1, fastq_r2,
        picardOptions=tools.picard.PicardTools.dict_to_picard_opts(clipping_opts),
        JVMmemory=sam_to_fastq.jvmMemDefault)

    kraken_tool = tools.kraken.Kraken()
    raw_reads = util.file.mkstempfname('.kraken')
    thread_count = min(int(numThreads), util.misc.available_cpu_count())
    classify_opts = {'--paired': None, '--threads': thread_count}
    # Could be optimized in 3.5 piping directly to kraken-filter.
    kraken_tool.classify(db, [fastq_r1, fastq_r2], raw_reads, options=classify_opts)

    # Without a threshold the unfiltered classifications are used downstream.
    final_reads = raw_reads
    if filterThreshold:
        filter_opts = {'--threshold': filterThreshold}
        final_reads = util.file.mkstempfname('.filtered-kraken')
        kraken_tool.execute('kraken-filter', db, final_reads, args=[raw_reads], options=filter_opts)

    if outReads:
        with open(final_reads, 'rb') as plain_in, gzip.open(outReads, 'wb') as gz_out:
            shutil.copyfileobj(plain_in, gz_out)

    if outReport:
        kraken_tool.execute('kraken-report', db, outReport, args=[final_reads])
def classify(self, inBam, db, outReads, numThreads=None):
    """Classify input reads (bam) with kraken.

    The BAM is converted to gzip-named temporary FASTQ files with Picard
    SamToFastq. Kraken runs in paired mode unless the second FASTQ file is
    smaller than 50 bytes, in which case only the first file is classified.

    Args:
        inBam: unaligned reads
        db: Kraken built database directory.
        outReads: Output file of command.
        numThreads: requested number of threads; the value passed to kraken
            is capped at util.misc.available_cpu_count(). None requests the
            cap itself.
    """
    if tools.samtools.SamtoolsTool().isEmpty(inBam):
        # kraken cannot deal with empty input: produce an empty output file
        # instead. Write mode is required -- read mode ('rt') never creates
        # the file and raises when it does not exist yet.
        with open(outReads, 'wt'):
            pass
        return

    tmp_fastq1 = util.file.mkstempfname('.1.fastq.gz')
    tmp_fastq2 = util.file.mkstempfname('.2.fastq.gz')
    try:
        # do not convert this to samtools bam2fq unless we can figure out how to replicate
        # the clipping functionality of Picard SamToFastq
        picard = tools.picard.SamToFastqTool()
        picard_opts = {
            'CLIPPING_ATTRIBUTE': tools.picard.SamToFastqTool.illumina_clipping_attribute,
            'CLIPPING_ACTION': 'X'
        }
        picard.execute(
            inBam, tmp_fastq1, tmp_fastq2,
            picardOptions=tools.picard.PicardTools.dict_to_picard_opts(picard_opts),
            JVMmemory=picard.jvmMemDefault)

        if numThreads is None:
            # placeholder for "no limit"; min() below reduces it to the CPU count
            numThreads = 10000000
        opts = {
            '--threads': min(int(numThreads), util.misc.available_cpu_count()),
            '--fastq-input': None,
            '--gzip-compressed': None,
        }
        if os.path.getsize(tmp_fastq2) < 50:
            # second mate file is (nearly) empty: classify as single-end
            self.execute('kraken', db, outReads, args=[tmp_fastq1], options=opts)
        else:
            opts['--paired'] = None
            self.execute('kraken', db, outReads, args=[tmp_fastq1, tmp_fastq2], options=opts)
    finally:
        # remove the intermediate FASTQ files even when picard or kraken fails
        for tmp_fastq in (tmp_fastq1, tmp_fastq2):
            if os.path.exists(tmp_fastq):
                os.unlink(tmp_fastq)
def diamond(inBam, db, taxDb, outReport, outM8=None, outLca=None, numThreads=1):
    ''' Align the reads of a BAM file with diamond blastx and write a taxonomic
        report of the per-read lowest common ancestors to outReport.

        outM8 / outLca, when given, receive gzip-compressed copies of the
        tabular alignments and of the LCA table respectively.
    '''
    reads_fq1 = util.file.mkstempfname('.fastq')
    reads_fq2 = util.file.mkstempfname('.fastq')

    # Picard SamToFastq is configured with the Illumina clipping attribute
    # and clipping action 'X'.
    bam_converter = tools.picard.SamToFastqTool()
    converter_opts = {
        'CLIPPING_ATTRIBUTE': tools.picard.SamToFastqTool.illumina_clipping_attribute,
        'CLIPPING_ACTION': 'X'
    }
    bam_converter.execute(
        inBam,
        reads_fq1,
        reads_fq2,
        picardOptions=tools.picard.PicardTools.dict_to_picard_opts(converter_opts),
        JVMmemory=bam_converter.jvmMemDefault)

    diamond_tool = tools.diamond.Diamond()
    diamond_tool.install()

    alignment_daa = util.file.mkstempfname('.daa')
    alignment_m8 = util.file.mkstempfname('.diamond.m8')
    diamond_tool.blastx(db, [reads_fq1, reads_fq2], alignment_daa, options={'--threads': numThreads})
    diamond_tool.view(alignment_daa, alignment_m8, options={'--threads': numThreads})

    if outM8:
        with open(alignment_m8, 'rb') as raw, gzip.open(outM8, 'wb') as packed:
            shutil.copyfileobj(raw, packed)

    tax_db = TaxonomyDb(tax_dir=taxDb)
    lca_tsv = util.file.mkstempfname('.tsv')
    with open(alignment_m8) as m8_handle, open(lca_tsv, 'w') as lca_handle:
        blast_lca(tax_db, m8_handle, lca_handle, paired=True, min_bit_score=50)

    if outLca:
        with open(lca_tsv, 'rb') as raw, gzip.open(outLca, 'wb') as packed:
            shutil.copyfileobj(raw, packed)

    with open(lca_tsv) as lca_handle:
        hits = taxa_hits_from_tsv(lca_handle)

    with open(outReport, 'w') as report_handle:
        for entry in kraken_dfs_report(tax_db, hits):
            print(entry, file=report_handle)
def diamond(inBam, db, taxDb, outReport, outM8=None, outLca=None, numThreads=1):
    """
    Classify reads by the taxon of the Lowest Common Ancestor (LCA).

    Pipeline: Picard SamToFastq -> diamond blastx/view -> blast_lca ->
    kraken_dfs_report. Optional gzip-compressed copies of the m8 alignments
    (outM8) and of the LCA table (outLca) are written along the way.
    """

    def save_gzipped(plain_path, gz_path):
        # copy plain_path into a gzip-compressed file at gz_path
        with open(plain_path, "rb") as plain, gzip.open(gz_path, "wb") as gz:
            shutil.copyfileobj(plain, gz)

    mate1_fastq = util.file.mkstempfname(".fastq")
    mate2_fastq = util.file.mkstempfname(".fastq")

    # do not convert this to samtools bam2fq unless we can figure out how to replicate
    # the clipping functionality of Picard SamToFastq
    picard = tools.picard.SamToFastqTool()
    picard_opts = {
        "CLIPPING_ATTRIBUTE": tools.picard.SamToFastqTool.illumina_clipping_attribute,
        "CLIPPING_ACTION": "X",
    }
    picard_args = tools.picard.PicardTools.dict_to_picard_opts(picard_opts)
    picard.execute(
        inBam,
        mate1_fastq,
        mate2_fastq,
        picardOptions=picard_args,
        JVMmemory=picard.jvmMemDefault,
    )

    diamond_tool = tools.diamond.Diamond()
    diamond_tool.install()
    daa_file = util.file.mkstempfname(".daa")
    m8_file = util.file.mkstempfname(".diamond.m8")
    diamond_tool.blastx(
        db, [mate1_fastq, mate2_fastq], daa_file, options={"--threads": numThreads}
    )
    diamond_tool.view(daa_file, m8_file, options={"--threads": numThreads})

    if outM8:
        save_gzipped(m8_file, outM8)

    tax_db = TaxonomyDb(tax_dir=taxDb, load_names=True, load_nodes=True, load_gis=True)
    lca_file = util.file.mkstempfname(".tsv")
    with open(m8_file) as m8_stream, open(lca_file, "w") as lca_stream:
        blast_lca(tax_db, m8_stream, lca_stream, paired=True, min_bit_score=50)

    if outLca:
        save_gzipped(lca_file, outLca)

    with open(lca_file) as lca_stream:
        hits = taxa_hits_from_tsv(lca_stream)

    with open(outReport, "w") as report_stream:
        for row in kraken_dfs_report(tax_db, hits):
            print(row, file=report_stream)
def split_bam(inBam, outBams):
    '''Split BAM file equally into several output BAM files.

    inBam must have a queryname sort order declared in its header. Every
    output receives the full header plus up to maxReads records; the last
    output additionally receives any records still left over.

    Args:
        inBam: input BAM file.
        outBams: list of output BAM file paths.

    Raises:
        Exception: if the header does not declare queryname sort order.
    '''
    samtools = tools.samtools.SamtoolsTool()
    picard = tools.picard.PicardTools()

    # get totalReadCount and maxReads
    #   maxReads = totalReadCount / num files, but round up to the nearest
    #   even number in order to keep read pairs together (assuming the input
    #   is sorted in query order and has no unmated reads, which can be
    #   accomplished by Picard RevertSam with SANITIZE=true)
    totalReadCount = samtools.count(inBam)
    maxReads = int(math.ceil(float(totalReadCount) / len(outBams) / 2) * 2)
    log.info("splitting %d reads into %d files of %d reads each", totalReadCount, len(outBams), maxReads)

    # load BAM header into memory
    header = samtools.getHeader(inBam)
    if 'SO:queryname' not in header[0]:
        raise Exception('Input BAM file must be sorted in queryname order')

    # dump to bigsam
    bigsam = mkstempfname('.sam')
    try:
        samtools.view([], inBam, bigsam)

        # split bigsam into little ones
        with util.file.open_or_gzopen(bigsam, 'rt') as inf:
            last_index = len(outBams) - 1
            for index, outBam in enumerate(outBams):
                log.info("preparing file %s", outBam)
                tmp_sam_reads = mkstempfname('.sam')
                try:
                    with open(tmp_sam_reads, 'wt') as outf:
                        for row in header:
                            outf.write('\t'.join(row) + '\n')
                        for _ in range(maxReads):
                            line = inf.readline()
                            if not line:
                                break
                            outf.write(line)
                        # Decide by position, not by name, so a repeated
                        # path in outBams cannot swallow the remainder early.
                        if index == last_index:
                            for line in inf:
                                outf.write(line)
                    picard.execute(
                        "SamFormatConverter",
                        ['INPUT=' + tmp_sam_reads, 'OUTPUT=' + outBam, 'VERBOSITY=WARNING'],
                        JVMmemory='512m')
                finally:
                    # drop the per-output SAM even if the conversion failed
                    if os.path.exists(tmp_sam_reads):
                        os.unlink(tmp_sam_reads)
    finally:
        if os.path.exists(bigsam):
            os.unlink(bigsam)
def diamond(inBam, db, taxDb, outReport, outM8=None, outLca=None, numThreads=1):
    ''' Classify reads by the taxon of the Lowest Common Ancestor (LCA).

        The report written to outReport is produced by kraken_dfs_report
        with prepend_column=True. outM8 / outLca optionally receive
        gzip-compressed copies of the alignments / LCA table.
    '''
    fq_first = util.file.mkstempfname('.fastq')
    fq_second = util.file.mkstempfname('.fastq')

    # do not convert this to samtools bam2fq unless we can figure out how to replicate
    # the clipping functionality of Picard SamToFastq
    extractor = tools.picard.SamToFastqTool()
    extractor_opts = tools.picard.PicardTools.dict_to_picard_opts({
        'CLIPPING_ATTRIBUTE': tools.picard.SamToFastqTool.illumina_clipping_attribute,
        'CLIPPING_ACTION': 'X'
    })
    extractor.execute(inBam, fq_first, fq_second,
                      picardOptions=extractor_opts,
                      JVMmemory=extractor.jvmMemDefault)

    dmnd = tools.diamond.Diamond()
    dmnd.install()
    daa_tmp = util.file.mkstempfname('.daa')
    m8_tmp = util.file.mkstempfname('.diamond.m8')
    dmnd.blastx(db, [fq_first, fq_second], daa_tmp, options={'--threads': numThreads})
    dmnd.view(daa_tmp, m8_tmp, options={'--threads': numThreads})

    if outM8:
        with open(m8_tmp, 'rb') as source, gzip.open(outM8, 'wb') as sink:
            shutil.copyfileobj(source, sink)

    tax_db = TaxonomyDb(tax_dir=taxDb)
    lca_tmp = util.file.mkstempfname('.tsv')
    with open(m8_tmp) as alignments, open(lca_tmp, 'w') as lca_table:
        blast_lca(tax_db, alignments, lca_table, paired=True, min_bit_score=50)

    if outLca:
        with open(lca_tmp, 'rb') as source, gzip.open(outLca, 'wb') as sink:
            shutil.copyfileobj(source, sink)

    with open(lca_tmp) as lca_table:
        hits = taxa_hits_from_tsv(lca_table)

    with open(outReport, 'w') as report_out:
        for record in kraken_dfs_report(tax_db, hits, prepend_column=True):
            print(record, file=report_out)
def split_bam(inBam, outBams):
    '''Split BAM file equally into several output BAM files.

    The input header must declare queryname sort order. Each output gets a
    copy of the header followed by at most maxReads records, and the final
    output also absorbs whatever records remain.

    Args:
        inBam: input BAM file.
        outBams: list of output BAM file paths.

    Raises:
        Exception: if the header does not declare queryname sort order.
    '''
    samtools = tools.samtools.SamtoolsTool()
    picard = tools.picard.PicardTools()

    # get totalReadCount and maxReads
    #   maxReads = totalReadCount / num files, but round up to the nearest
    #   even number in order to keep read pairs together (assuming the input
    #   is sorted in query order and has no unmated reads, which can be
    #   accomplished by Picard RevertSam with SANITIZE=true)
    totalReadCount = samtools.count(inBam)
    maxReads = int(math.ceil(float(totalReadCount) / len(outBams) / 2) * 2)
    log.info("splitting %d reads into %d files of %d reads each", totalReadCount, len(outBams), maxReads)

    # load BAM header into memory
    header = samtools.getHeader(inBam)
    if 'SO:queryname' not in header[0]:
        raise Exception('Input BAM file must be sorted in queryname order')

    # dump to bigsam
    bigsam = mkstempfname('.sam')
    try:
        samtools.view([], inBam, bigsam)

        # split bigsam into little ones
        with util.file.open_or_gzopen(bigsam, 'rt') as inf:
            final_position = len(outBams) - 1
            for position, outBam in enumerate(outBams):
                log.info("preparing file %s", outBam)
                tmp_sam_reads = mkstempfname('.sam')
                try:
                    with open(tmp_sam_reads, 'wt') as outf:
                        for row in header:
                            outf.write('\t'.join(row) + '\n')
                        for _ in range(maxReads):
                            line = inf.readline()
                            if not line:
                                break
                            outf.write(line)
                        # Positional check: comparing names would misfire if
                        # the same path appeared more than once in outBams.
                        if position == final_position:
                            for line in inf:
                                outf.write(line)
                    picard.execute(
                        "SamFormatConverter",
                        ['INPUT=' + tmp_sam_reads, 'OUTPUT=' + outBam, 'VERBOSITY=WARNING'],
                        JVMmemory='512m')
                finally:
                    # the intermediate SAM is removed on success and on failure
                    if os.path.exists(tmp_sam_reads):
                        os.unlink(tmp_sam_reads)
    finally:
        if os.path.exists(bigsam):
            os.unlink(bigsam)
def diamond(inBam, db, taxDb, outReport, outReads=None, numThreads=1):
    ''' Classify reads by the taxon of the Lowest Common Ancestor (LCA).

        Streaming variant: Picard SamToFastq emits interleaved FASTQ on its
        stdout, the mates are joined into FASTA records and piped into a
        `diamond blastx --outfmt 102` shell command, and diamond's stdout is
        parsed into taxon hits for the report written to outReport. When
        outReads is given, diamond's output is also saved gzip-compressed.
    '''
    # do not convert this to samtools bam2fq unless we can figure out how to replicate
    # the clipping functionality of Picard SamToFastq
    picard = tools.picard.SamToFastqTool()
    s2fq = picard.execute(
        inBam, '/dev/stdout',
        interleave=True,
        illuminaClipping=True,
        JVMmemory=picard.jvmMemDefault,
        background=True,
        stdout=subprocess.PIPE,
    )

    diamond_tool = tools.diamond.Diamond()
    diamond_tool.install()
    taxonmap = join(taxDb, 'accession2taxid', 'prot.accession2taxid.gz')
    taxonnodes = join(taxDb, 'nodes.dmp')

    # Assemble the shell command piece by piece.
    cmd_parts = ['{} blastx --outfmt 102 --sallseqid'.format(diamond_tool.install_and_get_path())]
    if numThreads is not None:
        cmd_parts.append(' --threads {threads}'.format(threads=numThreads))
    cmd_parts.append(' --db {db} --taxonmap {taxonmap} --taxonnodes {taxonnodes}'.format(
        db=db, taxonmap=taxonmap, taxonnodes=taxonnodes))
    if outReads is not None:
        # Interstitial save of stdout to output file
        cmd_parts.append(' | tee >(gzip > {out})'.format(out=outReads))
    cmd = ''.join(cmd_parts)

    diamond_ps = subprocess.Popen(
        cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, executable='/bin/bash')

    def fastq_to_fasta(input_pipe, output_pipe):
        # join each interleaved FASTQ pair into one FASTA record and write it
        # ascii-encoded to diamond's stdin
        ascii_out = codecs.getwriter('ascii')(output_pipe)
        joined = util.file.join_interleaved_fastq(input_pipe, output_format='fasta', num_n=16)
        SeqIO.write(joined, ascii_out, 'fasta')

    util.misc.bind_pipes(s2fq.stdout, diamond_ps.stdin, fastq_to_fasta)

    tax_db = TaxonomyDb(tax_dir=taxDb, load_names=True, load_nodes=True)

    lca_stream = codecs.getreader('ascii')(diamond_ps.stdout)
    hits = taxa_hits_from_tsv(lca_stream)
    with open(outReport, 'w') as report:
        for report_line in kraken_dfs_report(tax_db, hits):
            print(report_line, file=report)

    s2fq.wait()
    diamond_ps.wait()
def classify(self, in_bam, db, out_reads=None, out_report=None, num_threads=None):
    """Classify input reads (bam)

    The BAM is converted to two temporary FASTQ files with Picard SamToFastq;
    the classifier runs in paired mode unless the second FASTQ file is
    smaller than 50 bytes. If the resulting report holds no non-comment,
    non-blank lines, a header row and a single 'unclassified' row are
    written into it.

    Args:
      in_bam: unaligned reads
      db: Kraken built database directory.
      out_reads: Output file of command.
      out_report: Optional report file, passed as --report-file.
      num_threads: Thread count, passed through util.misc.sanitize_thread_count.
    """
    fastq_r1 = util.file.mkstempfname('.1.fastq.gz')
    fastq_r2 = util.file.mkstempfname('.2.fastq.gz')

    # Do not convert this to samtools bam2fq unless we can figure out how to replicate
    # the clipping functionality of Picard SamToFastq
    sam_to_fastq = tools.picard.SamToFastqTool()
    clipping_opts = {
        'CLIPPING_ATTRIBUTE': tools.picard.SamToFastqTool.illumina_clipping_attribute,
        'CLIPPING_ACTION': 'X'
    }
    sam_to_fastq.execute(
        in_bam, fastq_r1, fastq_r2,
        picardOptions=tools.picard.PicardTools.dict_to_picard_opts(clipping_opts),
        JVMmemory=sam_to_fastq.jvmMemDefault)

    opts = {
        '--threads': util.misc.sanitize_thread_count(num_threads),
        '--fastq-input': None,
        '--gzip-compressed': None,
        '--preload': None
    }
    if out_report:
        opts['--report-file'] = out_report

    # Detect if input bam was paired by checking fastq 2
    fastq_args = [fastq_r1]
    if os.path.getsize(fastq_r2) >= 50:
        opts['--paired'] = None
        fastq_args.append(fastq_r2)
    self.execute(self.BINS['classify'], db, out_reads, args=fastq_args, options=opts)

    os.unlink(fastq_r1)
    os.unlink(fastq_r2)

    if not out_report:
        return

    with open(out_report, 'rt+') as report:
        has_content = any(
            text.strip() for text in report.readlines() if not text.startswith('#'))
        if has_content:
            return
        # Step back one position from the end of the file before appending
        # the placeholder rows.
        report.seek(report.tell() - 1, os.SEEK_SET)
        header_row = ['%', 'reads', 'taxReads', 'kmers', 'dup', 'cov', 'taxID', 'rank', 'taxName']
        empty_row = ['100.00', '0', '0', '0', '0', 'NA', '0', 'no rank', 'unclassified']
        print('\t'.join(header_row), file=report)
        print('\t'.join(empty_row), file=report)
def classify(self, inBam, db, outReads, numThreads=None):
    """Classify input reads (bam) with kraken.

    The BAM is converted to temporary FASTQ files with Picard SamToFastq.
    Kraken runs in paired mode unless the second FASTQ file is smaller than
    50 bytes, in which case only the first file is classified.

    Args:
        inBam: unaligned reads
        db: Kraken built database directory.
        outReads: Output file of command.
        numThreads: thread count, passed through
            util.misc.sanitize_thread_count.
    """
    if tools.samtools.SamtoolsTool().isEmpty(inBam):
        # kraken cannot deal with empty input: produce an empty output file
        # instead. Write mode is required -- read mode ('rt') never creates
        # the file and raises when it does not exist yet.
        with open(outReads, 'wt'):
            pass
        return

    tmp_fastq1 = util.file.mkstempfname('.1.fastq.gz')
    tmp_fastq2 = util.file.mkstempfname('.2.fastq.gz')
    try:
        # do not convert this to samtools bam2fq unless we can figure out how to replicate
        # the clipping functionality of Picard SamToFastq
        picard = tools.picard.SamToFastqTool()
        picard_opts = {
            'CLIPPING_ATTRIBUTE': tools.picard.SamToFastqTool.illumina_clipping_attribute,
            'CLIPPING_ACTION': 'X'
        }
        picard.execute(inBam, tmp_fastq1, tmp_fastq2,
                       picardOptions=tools.picard.PicardTools.dict_to_picard_opts(picard_opts),
                       JVMmemory=picard.jvmMemDefault)

        opts = {
            '--threads': util.misc.sanitize_thread_count(numThreads),
            '--fastq-input': None,
            '--gzip-compressed': None,
        }
        # Detect if input bam was paired by checking fastq 2
        if os.path.getsize(tmp_fastq2) < 50:
            self.execute('kraken', db, outReads, args=[tmp_fastq1], options=opts)
        else:
            opts['--paired'] = None
            self.execute('kraken', db, outReads, args=[tmp_fastq1, tmp_fastq2], options=opts)
    finally:
        # remove the intermediate FASTQ files even when picard or kraken fails
        for tmp_fastq in (tmp_fastq1, tmp_fastq2):
            if os.path.exists(tmp_fastq):
                os.unlink(tmp_fastq)
def classify(self, in_bam, db, out_reads=None, out_report=None, confidence=None,
             min_base_qual=None, minimum_hit_groups=None, num_threads=None):
    """Classify input reads (bam) with kraken2.

    The BAM is split by Picard SamToFastq into mate 1, mate 2 and a third
    FASTQ file (passed as outFastq0). If the mate-2 file is smaller than
    that third file the run is single-end on the third file; otherwise
    kraken2 runs with --paired on the two mate files.

    Args:
        in_bam: unaligned reads
        db: Kraken built database directory.
        out_reads: Output file of command. When falsy, '-' is passed instead.
        out_report: optional report file (--report).
        confidence: optional value for --confidence.
        min_base_qual: optional value for --minimum-base-quality.
        minimum_hit_groups: optional value for --minimum-hit-groups.
        num_threads: thread count, passed through
            util.misc.sanitize_thread_count.
    """
    if tools.samtools.SamtoolsTool().isEmpty(in_bam):
        # kraken cannot deal with empty input: create empty outputs instead
        for empty_out in (out_reads, out_report):
            if empty_out:
                with open(empty_out, 'wt'):
                    pass
        return

    opts = {'--threads': util.misc.sanitize_thread_count(num_threads)}
    if out_report:
        opts['--report'] = out_report
    if not out_reads:
        out_reads = '-'  # in kraken2, this suppresses normal output
    if min_base_qual:
        opts['--minimum-base-quality'] = min_base_qual
    if confidence:
        opts['--confidence'] = confidence
    if minimum_hit_groups:
        opts['--minimum-hit-groups'] = minimum_hit_groups

    tmp_fastq1 = util.file.mkstempfname('.1.fastq')
    tmp_fastq2 = util.file.mkstempfname('.2.fastq')
    tmp_fastq3 = util.file.mkstempfname('.s.fastq')
    try:
        # Do not convert this to samtools bam2fq unless we can figure out how to replicate
        # the clipping functionality of Picard SamToFastq
        picard = tools.picard.SamToFastqTool()
        picard_opts = {
            'CLIPPING_ATTRIBUTE': tools.picard.SamToFastqTool.illumina_clipping_attribute,
            'CLIPPING_ACTION': 'X'
        }
        picard.execute(
            in_bam, tmp_fastq1, tmp_fastq2, outFastq0=tmp_fastq3,
            picardOptions=tools.picard.PicardTools.dict_to_picard_opts(picard_opts),
            JVMmemory=picard.jvmMemDefault)

        # Detect if input bam was paired by checking fastq 2
        if os.path.getsize(tmp_fastq2) < os.path.getsize(tmp_fastq3):
            # Logger.warn is deprecated in favour of Logger.warning
            log.warning("running in single-end read mode!")
            self.execute('kraken2', db, out_reads, args=[tmp_fastq3], options=opts)
        else:
            opts['--paired'] = None
            self.execute('kraken2', db, out_reads, args=[tmp_fastq1, tmp_fastq2], options=opts)
    finally:
        # remove the intermediate FASTQ files even when picard or kraken2 fails
        for tmp_fastq in (tmp_fastq1, tmp_fastq2, tmp_fastq3):
            if os.path.exists(tmp_fastq):
                os.unlink(tmp_fastq)
def pipeline(self, db, inBams, outReports=None, outReads=None, lockMemory=None,
             filterThreshold=None, numThreads=None):
    """Classify several BAM files with one kraken invocation, streaming via FIFOs.

    One kraken process is started that reads two FASTQ FIFOs per BAM and
    writes to one output FIFO per BAM. Then, for each BAM in turn, Picard
    SamToFastq is started in the background to feed its two FIFOs while a
    bash pipeline reads the matching kraken output FIFO and produces the
    compressed reads (outReads[i]) and/or the report (outReports[i]).

    Raises:
        Exception: if outReports / outReads do not have one entry per input BAM.
        subprocess.CalledProcessError: if a per-BAM pipeline fails, or if the
            picard process has exited with a non-zero code by the time that
            pipeline finishes.
    """
    assert outReads is not None or outReports is not None

    n_bams = len(inBams)
    if outReports and len(outReports) != n_bams:
        raise Exception(
            "--outReports specified with {} output files, which does not match the number of input bams ({})".format(
                len(outReports), n_bams))
    if outReads and len(outReads) != n_bams:
        raise Exception(
            "--outReads specified with {} output files, which does not match the number of input bams ({})".format(
                len(outReads), n_bams))

    threads = util.misc.sanitize_thread_count(numThreads)

    # 2n for paired fastq, 1n for kraken output
    with util.file.fifo(n_bams * 3) as pipes:
        n_fastq = n_bams * 2
        fastq_pipes = pipes[:n_fastq]
        kraken_output_pipes = pipes[n_fastq:]

        lock_opt = ' --lock-memory' if lockMemory else ''
        db_opts, env, tax_filter_opts, tax_report_opts = self._db_opts(db, threads)
        kraken_opts = lock_opt + db_opts

        output_args = ' '.join('--output {}'.format(pipe) for pipe in kraken_output_pipes)
        kraken_cmd = '''set -ex -o pipefail; {kraken}{opts} --paired --fastq-input --threads {threads} {outputs} {fastqs}'''.format(
            kraken='kraken',
            opts=kraken_opts,
            threads=threads,
            outputs=output_args,
            fastqs=' '.join(fastq_pipes))
        log.debug('Calling kraken command line: %s', kraken_cmd)
        subprocess.Popen(kraken_cmd, shell=True, executable='/bin/bash', env=env)

        for idx, in_bam in enumerate(inBams):
            reader_cmd = 'cat {kraken_output}'.format(kraken_output=kraken_output_pipes[idx])

            if outReads:
                # tee is only needed when the stream must also reach the report
                if outReports:
                    reader_cmd += ' | tee >(pigz --best > {kraken_reads})'
                else:
                    reader_cmd += ' | pigz --best > {kraken_reads}'
                reader_cmd = reader_cmd.format(kraken_reads=outReads[idx])

            if outReports:
                if filterThreshold is not None:
                    reader_cmd += ' | {kraken_filter}{tax_opts} --threshold {filterThreshold}'.format(
                        kraken_filter='kraken-filter',
                        tax_opts=tax_filter_opts,
                        filterThreshold=filterThreshold)
                reader_cmd += ' | {kraken_report}{tax_opts} > {outReport}'.format(
                    kraken_report='kraken-report',
                    tax_opts=tax_report_opts,
                    outReport=outReports[idx])

            # do not convert this to samtools bam2fq unless we can figure out how to replicate
            # the clipping functionality of Picard SamToFastq
            sam_to_fastq = tools.picard.SamToFastqTool()
            clipping_opts = {
                'CLIPPING_ATTRIBUTE': tools.picard.SamToFastqTool.illumina_clipping_attribute,
                'CLIPPING_ACTION': 'X'
            }
            bam2fq_ps = sam_to_fastq.execute(
                in_bam, fastq_pipes[2 * idx], fastq_pipes[2 * idx + 1],
                picardOptions=tools.picard.PicardTools.dict_to_picard_opts(clipping_opts),
                JVMmemory=sam_to_fastq.jvmMemDefault,
                background=True)

            log.debug('Calling kraken output command line: %s', reader_cmd)
            subprocess.check_call(reader_cmd, shell=True, executable='/bin/bash', env=env)

            # poll() is truthy only for a finished process with non-zero exit
            if bam2fq_ps.poll():
                raise subprocess.CalledProcessError(
                    bam2fq_ps.returncode, "SamToFastqTool().execute({})".format(in_bam))
def miseq_fastq_to_bam(outBam, sampleSheet, inFastq1, inFastq2=None, runInfo=None,
                       sequencing_center=None,
                       JVMmemory=tools.picard.FastqToSamTool.jvmMemDefault):
    ''' Convert fastq read files to a single bam file. Fastq file names must conform
        to patterns emitted by Miseq machines. Sample metadata must be provided
        in a SampleSheet.csv that corresponds to the fastq filename. Specifically,
        the _S##_ index in the fastq file name will be used to find the corresponding
        row in the SampleSheet

        Returns 0 after Picard FastqToSam has been run. Invalid file names,
        a missing sample, or a read count that disagrees with RunInfo.xml
        trip an assertion.
    '''

    # match miseq based on fastq filenames. The dots are escaped so that only
    # a literal ".fastq" / ".fastq.gz" suffix is accepted.
    miseq_name = re.compile(r"^\S+_S(\d+)_L001_R(\d)_001\.fastq(?:\.gz)?$")

    mo = miseq_name.match(inFastq1)
    assert mo, "fastq filename %s does not match the patterns used by an Illumina Miseq machine" % inFastq1
    assert mo.group(2) == '1', "fastq1 must correspond to read 1, not read %s" % mo.group(2)
    sample_num = mo.group(1)
    if inFastq2:
        mo = miseq_name.match(inFastq2)
        assert mo, "fastq filename %s does not match the patterns used by an Illumina Miseq machine" % inFastq2
        assert mo.group(2) == '2', "fastq2 must correspond to read 2, not read %s" % mo.group(2)
        assert mo.group(1) == sample_num, "fastq1 (%s) and fastq2 (%s) must have the same sample number" % (
            sample_num, mo.group(1))

    # load metadata
    samples = SampleSheet(sampleSheet, allow_non_unique=True)
    sample_info = samples.fetch_by_index(sample_num)
    assert sample_info, "sample %s not found in %s" % (sample_num, sampleSheet)
    sampleName = sample_info['sample']
    log.info("Using sample name: %s", sampleName)
    if sample_info.get('barcode_2'):
        barcode = '-'.join((sample_info['barcode_1'], sample_info['barcode_2']))
    else:
        barcode = sample_info['barcode_1']
    picardOpts = {
        'LIBRARY_NAME': sample_info['library'],
        'PLATFORM': 'illumina',
        'VERBOSITY': 'WARNING',
        'QUIET': 'TRUE',
    }
    if runInfo:
        runInfo = RunInfo(runInfo)
        flowcell = runInfo.get_flowcell()
        picardOpts['RUN_DATE'] = runInfo.get_rundate_iso()
        if inFastq2:
            assert runInfo.num_reads() == 2, "paired fastqs given for a single-end RunInfo.xml"
        else:
            assert runInfo.num_reads() == 1, "second fastq missing for a paired-end RunInfo.xml"
    else:
        # placeholder flowcell id when no RunInfo.xml is available
        flowcell = 'A'
    if sequencing_center is None and runInfo:
        sequencing_center = runInfo.get_machine()
    if sequencing_center:
        picardOpts['SEQUENCING_CENTER'] = sequencing_center
    # PLATFORM_UNIT carries the full flowcell id; READ_GROUP_NAME only its
    # first five characters.
    picardOpts['PLATFORM_UNIT'] = '.'.join((flowcell, '1', barcode))
    if len(flowcell) > 5:
        flowcell = flowcell[:5]
    picardOpts['READ_GROUP_NAME'] = flowcell

    # run Picard
    picard = tools.picard.FastqToSamTool()
    picard.execute(inFastq1, inFastq2, sampleName, outBam,
                   picardOptions=picard.dict_to_picard_opts(picardOpts),
                   JVMmemory=JVMmemory)
    return 0
def miseq_fastq_to_bam(outBam, sampleSheet, inFastq1, inFastq2=None, runInfo=None,
                       sequencing_center=None,
                       JVMmemory=tools.picard.FastqToSamTool.jvmMemDefault):
    ''' Convert fastq read files to a single bam file. Fastq file names must conform
        to patterns emitted by Miseq machines. Sample metadata must be provided
        in a SampleSheet.csv that corresponds to the fastq filename. Specifically,
        the _S##_ index in the fastq file name will be used to find the corresponding
        row in the SampleSheet

        The function returns 0 once Picard FastqToSam has run. File names
        that do not match, an unknown sample index, or a mate count that
        contradicts RunInfo.xml cause an AssertionError.
    '''

    # match miseq based on fastq filenames; "." is escaped so it only matches
    # a literal dot in the ".fastq" / ".fastq.gz" suffix
    fastq_name_re = re.compile(r"^\S+_S(\d+)_L001_R(\d)_001\.fastq(?:\.gz)?$")

    mo = fastq_name_re.match(inFastq1)
    assert mo, "fastq filename %s does not match the patterns used by an Illumina Miseq machine" % inFastq1
    assert mo.group(2) == '1', "fastq1 must correspond to read 1, not read %s" % mo.group(2)
    sample_num = mo.group(1)
    if inFastq2:
        mo = fastq_name_re.match(inFastq2)
        assert mo, "fastq filename %s does not match the patterns used by an Illumina Miseq machine" % inFastq2
        assert mo.group(2) == '2', "fastq2 must correspond to read 2, not read %s" % mo.group(2)
        assert mo.group(1) == sample_num, "fastq1 (%s) and fastq2 (%s) must have the same sample number" % (
            sample_num, mo.group(1))

    # load metadata
    samples = SampleSheet(sampleSheet, allow_non_unique=True)
    sample_info = samples.fetch_by_index(sample_num)
    assert sample_info, "sample %s not found in %s" % (sample_num, sampleSheet)
    sampleName = sample_info['sample']
    log.info("Using sample name: %s", sampleName)
    if sample_info.get('barcode_2'):
        barcode = '-'.join((sample_info['barcode_1'], sample_info['barcode_2']))
    else:
        barcode = sample_info['barcode_1']
    picardOpts = {
        'LIBRARY_NAME': sample_info['library'],
        'PLATFORM': 'illumina',
        'VERBOSITY': 'WARNING',
        'QUIET': 'TRUE',
    }
    if runInfo:
        runInfo = RunInfo(runInfo)
        flowcell = runInfo.get_flowcell()
        picardOpts['RUN_DATE'] = runInfo.get_rundate_iso()
        if inFastq2:
            assert runInfo.num_reads() == 2, "paired fastqs given for a single-end RunInfo.xml"
        else:
            assert runInfo.num_reads() == 1, "second fastq missing for a paired-end RunInfo.xml"
    else:
        # no RunInfo.xml given: fall back to a placeholder flowcell id
        flowcell = 'A'
    if sequencing_center is None and runInfo:
        sequencing_center = runInfo.get_machine()
    if sequencing_center:
        picardOpts['SEQUENCING_CENTER'] = sequencing_center
    # the untruncated flowcell id goes into PLATFORM_UNIT, while
    # READ_GROUP_NAME is limited to its first five characters
    picardOpts['PLATFORM_UNIT'] = '.'.join((flowcell, '1', barcode))
    if len(flowcell) > 5:
        flowcell = flowcell[:5]
    picardOpts['READ_GROUP_NAME'] = flowcell

    # run Picard
    picard = tools.picard.FastqToSamTool()
    picard.execute(inFastq1, inFastq2, sampleName, outBam,
                   picardOptions=picard.dict_to_picard_opts(picardOpts),
                   JVMmemory=JVMmemory)
    return 0
def pipeline(self, db, inBams, outReports=None, outReads=None, lockMemory=None,
             filterThreshold=None, numThreads=None):
    """Run kraken once over several BAM files, connecting the stages with FIFOs.

    A single kraken process (binary taken from self.libexec) is launched on
    two FASTQ FIFOs per BAM and one output FIFO per BAM. For every BAM, a
    background Picard SamToFastq fills its FASTQ FIFOs while a bash pipeline
    drains the corresponding kraken output FIFO into outReads[i] and/or
    outReports[i].

    Raises:
        Exception: if outReports / outReads do not have one entry per input BAM.
        subprocess.CalledProcessError: if a per-BAM pipeline fails, or if the
            picard process has exited with a non-zero code by the time that
            pipeline finishes.
    """
    assert outReads is not None or outReports is not None

    bam_count = len(inBams)
    if outReports and len(outReports) != bam_count:
        raise Exception(
            "--outReports specified with {} output files, which does not match the number of input bams ({})"
            .format(len(outReports), bam_count))
    if outReads and len(outReads) != bam_count:
        raise Exception(
            "--outReads specified with {} output files, which does not match the number of input bams ({})"
            .format(len(outReads), bam_count))

    threads = util.misc.sanitize_thread_count(numThreads)

    # 2n for paired fastq, 1n for kraken output
    with util.file.fifo(bam_count * 3) as pipes:
        boundary = bam_count * 2
        fastq_pipes, kraken_output_pipes = pipes[:boundary], pipes[boundary:]

        kraken_bin = os.path.join(self.libexec, 'kraken')
        lock_opt = ' --lock-memory' if lockMemory else ''
        db_opts, env, tax_filter_opts, tax_report_opts = self._db_opts(db, threads)

        classify_cmd = '''set -ex -o pipefail; {kraken}{opts} --paired --fastq-input --threads {threads} {outputs} {fastqs}'''.format(
            kraken=kraken_bin,
            opts=lock_opt + db_opts,
            threads=threads,
            outputs=' '.join('--output {}'.format(out_pipe) for out_pipe in kraken_output_pipes),
            fastqs=' '.join(fastq_pipes))
        log.debug('Calling kraken command line: %s', classify_cmd)
        subprocess.Popen(classify_cmd, shell=True, executable='/bin/bash', env=env)

        for n, in_bam in enumerate(inBams):
            drain_cmd = 'cat {kraken_output}'.format(kraken_output=kraken_output_pipes[n])

            if outReads:
                # keep the stream flowing with tee only if a report follows
                reads_stage = (' | tee >(pigz --best > {kraken_reads})' if outReports
                               else ' | pigz --best > {kraken_reads}')
                drain_cmd = (drain_cmd + reads_stage).format(kraken_reads=outReads[n])

            if outReports:
                if filterThreshold is not None:
                    kraken_filter_bin = os.path.join(self.libexec, 'kraken-filter')
                    drain_cmd += ' | {kraken_filter}{tax_opts} --threshold {filterThreshold}'.format(
                        kraken_filter=kraken_filter_bin,
                        tax_opts=tax_filter_opts,
                        filterThreshold=filterThreshold)
                kraken_report_bin = os.path.join(self.libexec, 'kraken-report')
                drain_cmd += ' | {kraken_report}{tax_opts} > {outReport}'.format(
                    kraken_report=kraken_report_bin,
                    tax_opts=tax_report_opts,
                    outReport=outReports[n])

            # do not convert this to samtools bam2fq unless we can figure out how to replicate
            # the clipping functionality of Picard SamToFastq
            converter = tools.picard.SamToFastqTool()
            converter_opts = {
                'CLIPPING_ATTRIBUTE': tools.picard.SamToFastqTool.illumina_clipping_attribute,
                'CLIPPING_ACTION': 'X'
            }
            bam2fq_ps = converter.execute(
                in_bam,
                fastq_pipes[n * 2],
                fastq_pipes[n * 2 + 1],
                picardOptions=tools.picard.PicardTools.dict_to_picard_opts(converter_opts),
                JVMmemory=converter.jvmMemDefault,
                background=True)

            log.debug('Calling kraken output command line: %s', drain_cmd)
            subprocess.check_call(drain_cmd, shell=True, executable='/bin/bash', env=env)

            # poll() is truthy only for a finished process with non-zero exit
            if bam2fq_ps.poll():
                raise subprocess.CalledProcessError(
                    bam2fq_ps.returncode,
                    "SamToFastqTool().execute({})".format(in_bam))