def deplete_blastn(inFastq, outFastq, refDbs):
    'Use blastn to remove reads that match at least one of the databases.'

    ## Get tools
    noBlastHits_v3Path = os.path.join(util.file.get_scripts_path(), 'noBlastHits_v3.py')

    ## Convert to fasta
    inFasta = mkstempfname('.fasta')
    read_utils.fastq_to_fasta(inFastq, inFasta)

    ## Run blastn using each of the databases in turn
    blastOutFiles = []
    for db in refDbs:
        log.info("running blastn on %s against %s", inFastq, db)
        blastOutFiles += blastn_chunked_fasta(inFasta, db)

    ## Combine results from different databases
    blastOutCombined = mkstempfname('.txt')
    catCmd = ['cat'] + blastOutFiles
    log.debug(' '.join(catCmd) + ' > ' + blastOutCombined)
    with open(blastOutCombined, 'wt') as outf:
        subprocess.check_call(catCmd, stdout=outf)

    ## run noBlastHits_v3.py to extract reads with no blast hits
    # TODO: slurp the small amount of code in this script into here
    noBlastHitsCmd = ['python', noBlastHits_v3Path, '-b', blastOutCombined, '-r', inFastq, '-m', 'nohit']
    log.debug(' '.join(noBlastHitsCmd) + ' > ' + outFastq)
    with util.file.open_or_gzopen(outFastq, 'wt') as outf:
        subprocess.check_call(noBlastHitsCmd, stdout=outf)

def _merge_fastqs_and_mvicuna(lb, files):
    readList = mkstempfname('.keep_reads.txt')
    log.info("executing M-Vicuna DupRm on library " + lb)

    # create merged FASTQs per library
    infastqs = (mkstempfname('.1.fastq'), mkstempfname('.2.fastq'))
    for d in range(2):
        with open(infastqs[d], 'wt') as outf:
            for fprefix in files:
                fn = '%s_%d.fastq' % (fprefix, d + 1)
                if os.path.isfile(fn):
                    with open(fn, 'rt') as inf:
                        for line in inf:
                            outf.write(line)
                    os.unlink(fn)
                else:
                    log.warn(
                        "no reads found in %s, assuming that's because there are no reads in that read group", fn
                    )

    # M-Vicuna DupRm to see what we should keep (append IDs to running file)
    if os.path.getsize(infastqs[0]) > 0 or os.path.getsize(infastqs[1]) > 0:
        mvicuna_fastqs_to_readlist(infastqs[0], infastqs[1], readList)
    for fn in infastqs:
        os.unlink(fn)

    return readList

def trimmomatic(inFastq1, inFastq2, pairedOutFastq1, pairedOutFastq2, clipFasta):
    '''Trim read sequences with Trimmomatic.'''
    trimmomaticPath = tools.trimmomatic.TrimmomaticTool().install_and_get_path()
    tmpUnpaired1 = mkstempfname()
    tmpUnpaired2 = mkstempfname()

    javaCmd = []

    # the conda version wraps the jar file with a shell script
    if trimmomaticPath.endswith(".jar"):
        # This java program has a lot of arguments...
        javaCmd.extend([
            'java', '-Xmx2g', '-Djava.io.tmpdir=' + tempfile.tempdir, '-classpath', trimmomaticPath,
            'org.usadellab.trimmomatic.TrimmomaticPE'
        ])
    else:
        javaCmd.extend([trimmomaticPath, "PE"])

    javaCmd.extend([
        inFastq1, inFastq2, pairedOutFastq1, tmpUnpaired1, pairedOutFastq2, tmpUnpaired2, 'LEADING:20',
        'TRAILING:20', 'SLIDINGWINDOW:4:25', 'MINLEN:30', 'ILLUMINACLIP:{}:2:30:12'.format(clipFasta)
    ])

    log.debug(' '.join(javaCmd))
    util.misc.run_and_print(javaCmd, check=True)
    os.unlink(tmpUnpaired1)
    os.unlink(tmpUnpaired2)

def filter_lastal_bam(inBam, db, outBam, JVMmemory=None):
    ''' Restrict input reads to those that align to the given
        reference database using LASTAL.
    '''
    # convert BAM to paired FASTQ
    inReads1 = mkstempfname('.1.fastq')
    inReads2 = mkstempfname('.2.fastq')
    tools.picard.SamToFastqTool().execute(inBam, inReads1, inReads2)

    # look for hits in inReads1 and inReads2
    hitList1 = mkstempfname('.1.hits')
    hitList2 = mkstempfname('.2.hits')
    lastal_get_hits(inReads1, db, hitList1)
    os.unlink(inReads1)
    lastal_get_hits(inReads2, db, hitList2)
    os.unlink(inReads2)

    # merge hits
    hitList = mkstempfname('.hits')
    with open(hitList, 'wt') as outf:
        subprocess.check_call(['sort', '-u', hitList1, hitList2], stdout=outf)
    os.unlink(hitList1)
    os.unlink(hitList2)

    # filter original BAM file against keep list
    tools.picard.FilterSamReadsTool().execute(inBam, False, hitList, outBam, JVMmemory=JVMmemory)
    os.unlink(hitList)

def blastn_chunked_fasta(fasta, db, chunkSize=1000000):
    """
    Helper function: blastn a fasta file, overcoming apparent memory leaks on
    an input with many query sequences, by splitting it into multiple chunks
    and running a new blastn process on each chunk. Return a list of output
    filenames containing hits.
    """
    blastnPath = tools.blast.BlastnTool().install_and_get_path()

    hits_files = []
    with open(fasta, "rt") as fastaFile:
        record_iter = SeqIO.parse(fastaFile, "fasta")
        for batch in batch_iterator(record_iter, chunkSize):
            chunk_fasta = mkstempfname('.fasta')
            with open(chunk_fasta, "wt") as handle:
                SeqIO.write(batch, handle, "fasta")
            batch = None

            chunk_hits = mkstempfname('.hits.txt')
            blastnCmd = [
                blastnPath, '-db', db, '-word_size', '16', '-evalue', '1e-6', '-outfmt', '6',
                '-max_target_seqs', '2', '-query', chunk_fasta, '-out', chunk_hits
            ]
            log.debug(' '.join(blastnCmd))
            subprocess.check_call(blastnCmd)

            os.unlink(chunk_fasta)
            hits_files.append(chunk_hits)

    return hits_files

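# `batch_iterator` above is a helper defined elsewhere in this module (adapted
# from the Biopython tutorial). A minimal sketch consistent with how
# blastn_chunked_fasta uses it -- yield successive lists of at most
# `batch_size` records from any iterator (the name `_batch_iterator_sketch`
# is ours, to avoid colliding with the real helper):
def _batch_iterator_sketch(iterator, batch_size):
    batch = []
    for record in iterator:
        batch.append(record)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:
        yield batch
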
def multi_db_deplete_bam(inBam, refDbs, deplete_method, outBam, **kwargs):

    tmpDb = None
    if len(refDbs) > 1 and not any(
            not os.path.exists(db)  # indexed db prefix
            or os.path.isdir(db)  # indexed db in directory
            or (os.path.isfile(db) and ('.tar' in db or '.tgz' in db or '.zip' in db))  # packaged indexed db
            for db in refDbs):
        # this is a scenario where all refDbs are unbuilt fasta
        # files. we can simplify and speed up execution by
        # concatenating them all and running deplete_method
        # just once
        tmpDb = mkstempfname('.fasta')
        merge_compressed_files(refDbs, tmpDb, sep='\n')
        refDbs = [tmpDb]

    samtools = tools.samtools.SamtoolsTool()
    tmpBamIn = inBam
    for db in refDbs:
        if not samtools.isEmpty(tmpBamIn):
            tmpBamOut = mkstempfname('.bam')
            deplete_method(tmpBamIn, db, tmpBamOut, **kwargs)
            if tmpBamIn != inBam:
                os.unlink(tmpBamIn)
            tmpBamIn = tmpBamOut
    shutil.copyfile(tmpBamIn, outBam)

    if tmpDb:
        os.unlink(tmpDb)

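# `merge_compressed_files` above is a util.file helper. A minimal sketch of
# the behavior multi_db_deplete_bam relies on -- concatenate possibly-gzipped
# inputs into one plain-text file, writing `sep` between them (the original
# helper's exact signature may differ):
def _merge_compressed_files_sketch(in_files, out_file, sep=''):
    with open(out_file, 'wt') as outf:
        for i, fn in enumerate(in_files):
            if i > 0 and sep:
                outf.write(sep)
            with util.file.open_or_gzopen(fn, 'rt') as inf:
                shutil.copyfileobj(inf, outf)
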
def align_and_fix(inBam, refFasta, outBamAll=None, outBamFiltered=None, novoalign_options='', JVMmemory=None):
    ''' Take reads, align to reference with Novoalign, mark duplicates
        with Picard, realign indels with GATK, and optionally filter
        final file to mapped/non-dupe reads.
    '''
    if not (outBamAll or outBamFiltered):
        log.warn("are you sure you meant to do nothing?")
        return

    bam_aligned = mkstempfname('.aligned.bam')
    tools.novoalign.NovoalignTool().execute(
        inBam, refFasta, bam_aligned, options=novoalign_options.split(), JVMmemory=JVMmemory)

    bam_mkdup = mkstempfname('.mkdup.bam')
    tools.picard.MarkDuplicatesTool().execute(
        [bam_aligned], bam_mkdup, picardOptions=['CREATE_INDEX=true'], JVMmemory=JVMmemory)
    os.unlink(bam_aligned)

    bam_realigned = mkstempfname('.realigned.bam')
    tools.gatk.GATKTool().local_realign(bam_mkdup, refFasta, bam_realigned, JVMmemory=JVMmemory)
    os.unlink(bam_mkdup)

    if outBamAll:
        shutil.copyfile(bam_realigned, outBamAll)
        tools.picard.BuildBamIndexTool().execute(outBamAll)
    if outBamFiltered:
        tools.samtools.SamtoolsTool().view(['-b', '-q', '1', '-F', '1028'], bam_realigned, outBamFiltered)
        tools.picard.BuildBamIndexTool().execute(outBamFiltered)
    os.unlink(bam_realigned)

def trimmomatic(inFastq1, inFastq2, pairedOutFastq1, pairedOutFastq2, clipFasta):
    trimmomaticPath = tools.trimmomatic.TrimmomaticTool().install_and_get_path()
    tmpUnpaired1 = mkstempfname()
    tmpUnpaired2 = mkstempfname()

    # This java program has a lot of arguments...
    javaCmd = [
        'java', '-Xmx2g', '-Djava.io.tmpdir=' + tempfile.tempdir, '-classpath', trimmomaticPath,
        'org.usadellab.trimmomatic.TrimmomaticPE', inFastq1, inFastq2, pairedOutFastq1, tmpUnpaired1,
        pairedOutFastq2, tmpUnpaired2, 'LEADING:20', 'TRAILING:20', 'SLIDINGWINDOW:4:25', 'MINLEN:30',
        'ILLUMINACLIP:{}:2:30:12'.format(clipFasta)
    ]

    log.debug(' '.join(javaCmd))
    subprocess.check_call(javaCmd)
    os.unlink(tmpUnpaired1)
    os.unlink(tmpUnpaired2)

def rmdup_cdhit_bam(inBam, outBam, max_mismatches=None, jvm_memory=None):
    ''' Remove duplicate reads from BAM file using cd-hit-dup.
    '''
    max_mismatches = max_mismatches or 4
    tmp_dir = tempfile.mkdtemp()

    tools.picard.SplitSamByLibraryTool().execute(inBam, tmp_dir)

    s2fq_tool = tools.picard.SamToFastqTool()
    cdhit = tools.cdhit.CdHit()
    out_bams = []
    for f in os.listdir(tmp_dir):
        out_bam = mkstempfname('.bam')
        out_bams.append(out_bam)
        library_sam = os.path.join(tmp_dir, f)

        in_fastqs = mkstempfname('.1.fastq'), mkstempfname('.2.fastq')
        s2fq_tool.execute(library_sam, in_fastqs[0], in_fastqs[1])
        if not os.path.getsize(in_fastqs[0]) > 0 and not os.path.getsize(in_fastqs[1]) > 0:
            continue

        out_fastqs = mkstempfname('.1.fastq'), mkstempfname('.2.fastq')
        options = {'-e': max_mismatches}
        if in_fastqs[1] is not None and os.path.getsize(in_fastqs[1]) > 10:
            options['-i2'] = in_fastqs[1]
            options['-o2'] = out_fastqs[1]

        log.info("executing cd-hit-dup on library " + library_sam)
        # cd-hit-dup cannot operate on piped fastq input because it reads twice
        cdhit.execute('cd-hit-dup', in_fastqs[0], out_fastqs[0], options=options, background=True)

        tools.picard.FastqToSamTool().execute(out_fastqs[0], out_fastqs[1], f, out_bam, JVMmemory=jvm_memory)
        for fn in in_fastqs:
            os.unlink(fn)

    with util.file.fifo(name='merged.sam') as merged_bam:
        merge_opts = ['SORT_ORDER=queryname']
        tools.picard.MergeSamFilesTool().execute(
            out_bams, merged_bam, picardOptions=merge_opts, JVMmemory=jvm_memory, background=True)
        tools.picard.ReplaceSamHeaderTool().execute(merged_bam, inBam, outBam, JVMmemory=jvm_memory)

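# `util.file.fifo` above is a context manager for a named pipe, which lets
# MergeSamFiles stream directly into ReplaceSamHeader without writing a full
# intermediate file. A minimal sketch of such a helper (the original's exact
# signature may differ):
import contextlib

@contextlib.contextmanager
def _fifo_sketch(name='fifo'):
    d = tempfile.mkdtemp()
    path = os.path.join(d, name)
    os.mkfifo(path)  # POSIX named pipe
    try:
        yield path
    finally:
        os.unlink(path)
        os.rmdir(d)
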
def trimmomatic(
    inFastq1,
    inFastq2,
    pairedOutFastq1,
    pairedOutFastq2,
    clipFasta,
    unpairedOutFastq1=None,
    unpairedOutFastq2=None,
    leading_q_cutoff=15,
    trailing_q_cutoff=15,
    minlength_to_keep=30,
    sliding_window_size=4,
    sliding_window_q_cutoff=25
):
    '''Trim read sequences with Trimmomatic.'''
    trimmomaticPath = tools.trimmomatic.TrimmomaticTool().install_and_get_path()
    unpairedFastq1 = unpairedOutFastq1 or mkstempfname()
    unpairedFastq2 = unpairedOutFastq2 or mkstempfname()

    javaCmd = []

    # the conda version wraps the jar file with a shell script
    if trimmomaticPath.endswith(".jar"):
        # This java program has a lot of arguments...
        javaCmd.extend([
            'java', '-Xmx2g', '-Djava.io.tmpdir=' + tempfile.tempdir, '-classpath', trimmomaticPath,
            'org.usadellab.trimmomatic.TrimmomaticPE'
        ])
    else:
        javaCmd.extend([trimmomaticPath, "PE"])

    # Explicitly use Phred-33 quality scores
    javaCmd.extend(['-phred33'])

    javaCmd.extend([
        inFastq1, inFastq2, pairedOutFastq1, unpairedFastq1, pairedOutFastq2, unpairedFastq2,
        'LEADING:{leading_q_cutoff}'.format(leading_q_cutoff=leading_q_cutoff),
        'TRAILING:{trailing_q_cutoff}'.format(trailing_q_cutoff=trailing_q_cutoff),
        'SLIDINGWINDOW:{sliding_window_size}:{sliding_window_q_cutoff}'.format(
            sliding_window_size=sliding_window_size,
            sliding_window_q_cutoff=sliding_window_q_cutoff,
        ),
        'MINLEN:{minlength_to_keep}'.format(minlength_to_keep=minlength_to_keep),
        'ILLUMINACLIP:{clipFasta}:2:30:12'.format(clipFasta=clipFasta)
    ])

    log.debug(' '.join(javaCmd))
    util.misc.run_and_print(javaCmd, check=True)

    if not unpairedOutFastq1:
        os.unlink(unpairedFastq1)
    if not unpairedOutFastq2:
        os.unlink(unpairedFastq2)

def deplete_bmtagger_bam(inBam, db, outBam, threads=None, srprism_memory=7168, JVMmemory=None):
    """
    Use bmtagger to partition the input reads into ones that match at least one
    of the databases and ones that don't match any of the databases.

    inBam: paired-end input reads in BAM format.
    db: bmtagger expects files
        db.bitmask created by bmtool, and
        db.srprism.idx, db.srprism.map, etc. created by srprism mkindex
    outBam: the output BAM file to hold the unmatched reads.
    srprism_memory: srprism memory in megabytes.
    """
    bmtaggerPath = tools.bmtagger.BmtaggerShTool().install_and_get_path()

    # bmtagger calls several executables in the same directory, and blastn;
    # make sure they are accessible through $PATH
    blastnPath = tools.blast.BlastnTool().install_and_get_path()
    path = os.environ['PATH'].split(os.pathsep)
    for t in (bmtaggerPath, blastnPath):
        d = os.path.dirname(t)
        if d not in path:
            path = [d] + path
    path = os.pathsep.join(path)
    os.environ['PATH'] = path

    inReads1 = mkstempfname('.1.fastq')
    tools.samtools.SamtoolsTool().bam2fq(inBam, inReads1)

    bmtaggerConf = mkstempfname('.bmtagger.conf')
    with open(bmtaggerConf, 'w') as f:
        # Default srprismopts: "-b 100000000 -n 5 -R 0 -r 1 -M 7168"
        print(
            'srprismopts="-b 100000000 -n 5 -R 0 -r 1 -M {srprism_memory} --paired false"'.format(
                srprism_memory=srprism_memory),
            file=f)

    tempDir = tempfile.mkdtemp()
    matchesFile = mkstempfname('.txt')
    cmdline = [
        bmtaggerPath, '-b', db + '.bitmask', '-C', bmtaggerConf, '-x', db + '.srprism', '-T', tempDir,
        '-q1', '-1', inReads1, '-o', matchesFile
    ]
    log.debug(' '.join(cmdline))
    util.misc.run_and_print(cmdline, check=True)

    os.unlink(inReads1)
    os.unlink(bmtaggerConf)
    tools.picard.FilterSamReadsTool().execute(inBam, True, matchesFile, outBam, JVMmemory=JVMmemory)

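# For a database prefix like '/dbs/hg19' (hypothetical path), the docstring
# above implies these sibling files must already exist, built roughly as the
# bmtagger documentation describes:
#   /dbs/hg19.bitmask        # bmtool -d hg19.fa -o hg19.bitmask -w 18
#   /dbs/hg19.srprism.*      # srprism mkindex -i hg19.fa -o hg19.srprism
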
def rmdup_mvicuna_bam(inBam, outBam, JVMmemory=None):
    ''' Remove duplicate reads from BAM file using M-Vicuna. The
        primary advantage to this approach over Picard's MarkDuplicates tool
        is that Picard requires that input reads are aligned to a reference,
        and M-Vicuna can operate on unaligned reads.
    '''
    # Convert BAM -> FASTQ pairs per read group and load all read groups
    tempDir = tempfile.mkdtemp()
    tools.picard.SamToFastqTool().per_read_group(inBam, tempDir, picardOptions=['VALIDATION_STRINGENCY=LENIENT'])
    read_groups = [x[1:] for x in tools.samtools.SamtoolsTool().getHeader(inBam) if x[0] == '@RG']
    read_groups = [dict(pair.split(':', 1) for pair in rg) for rg in read_groups]

    # Collect FASTQ pairs for each library
    lb_to_files = {}
    for rg in read_groups:
        lb_to_files.setdefault(rg.get('LB', 'none'), set())
        fname = rg['ID']
        if 'PU' in rg:
            fname = rg['PU']
        lb_to_files[rg.get('LB', 'none')].add(os.path.join(tempDir, fname))
    log.info("found %d distinct libraries and %d read groups", len(lb_to_files), len(read_groups))

    # For each library, merge FASTQs and run rmdup for entire library
    readList = mkstempfname('.keep_reads.txt')
    for lb, files in lb_to_files.items():
        log.info("executing M-Vicuna DupRm on library " + lb)

        # create merged FASTQs per library
        infastqs = (mkstempfname('.1.fastq'), mkstempfname('.2.fastq'))
        for d in range(2):
            with open(infastqs[d], 'wt') as outf:
                for fprefix in files:
                    fn = '%s_%d.fastq' % (fprefix, d + 1)
                    if os.path.isfile(fn):
                        with open(fn, 'rt') as inf:
                            for line in inf:
                                outf.write(line)
                        os.unlink(fn)
                    else:
                        log.warn(
                            "no reads found in %s, assuming that's because there are no reads in that read group", fn
                        )

        # M-Vicuna DupRm to see what we should keep (append IDs to running file)
        if os.path.getsize(infastqs[0]) > 0 or os.path.getsize(infastqs[1]) > 0:
            mvicuna_fastqs_to_readlist(infastqs[0], infastqs[1], readList)
        for fn in infastqs:
            os.unlink(fn)

    # Filter original input BAM against keep-list
    tools.picard.FilterSamReadsTool().execute(inBam, False, readList, outBam, JVMmemory=JVMmemory)
    return 0

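# Example of the header parsing above (hypothetical values): an @RG row
# tokenized by samtools as
#   ['@RG', 'ID:flowcell.1', 'PU:flowcell.1.ACGT', 'LB:lib1', 'SM:s1']
# becomes {'ID': 'flowcell.1', 'PU': 'flowcell.1.ACGT', 'LB': 'lib1', 'SM': 's1'},
# so that read group's FASTQs land in lb_to_files['lib1'] under the PU
# (or, failing that, the ID) basename.
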
def filter_lastal(inFastq, refDb, outFastq):
    ''' Restrict input reads to those that align to the given
        reference database using LASTAL. Also, remove duplicates with prinseq.
    '''
    assert outFastq.endswith('.fastq')
    tempFilePath = mkstempfname('.hits')
    lastalPath = tools.last.Lastal().install_and_get_path()
    mafSortPath = tools.last.MafSort().install_and_get_path()
    mafConvertPath = tools.last.MafConvert().install_and_get_path()
    prinseqPath = tools.prinseq.PrinseqTool().install_and_get_path()
    noBlastLikeHitsPath = os.path.join(util.file.get_scripts_path(), 'noBlastLikeHits.py')

    # each pipe-separated cmd gets its own line
    # unfortunately, it doesn't seem to work to do .format(**locals()) on the
    # final string as opposed to the individual parts.
    lastalCmd = ' '.join([
        '{lastalPath} -Q1 {refDb} {inFastq}'.format(**locals()),
        '| {mafSortPath} -n2'.format(**locals()),
        '| {mafConvertPath} tab /dev/stdin > {tempFilePath}'.format(**locals()),
    ])
    log.debug(lastalCmd)
    assert not os.system(lastalCmd)

    # filter inFastq against lastal hits
    filteredFastq = mkstempfname('.filtered.fastq')
    with open(filteredFastq, 'wt') as outf:
        noBlastLikeHitsCmd = [noBlastLikeHitsPath, '-b', tempFilePath, '-r', inFastq, '-m', 'hit']
        log.debug(' '.join(noBlastLikeHitsCmd) + ' > ' + filteredFastq)
        subprocess.check_call(noBlastLikeHitsCmd, stdout=outf)

    # remove duplicate reads and reads with multiple Ns
    if os.path.getsize(filteredFastq) == 0:
        # prinseq-lite fails on empty file input (which can happen in real life
        # if no reads match the refDb), so handle this scenario specially
        log.info("output is empty: no reads in input match refDb")
        shutil.copyfile(filteredFastq, outFastq)
    else:
        prinseqCmd = [
            'perl', prinseqPath, '-ns_max_n', '1', '-derep', '1', '-fastq', filteredFastq, '-out_bad', 'null',
            '-line_width', '0', '-out_good', outFastq[:-6]
        ]
        log.debug(' '.join(prinseqCmd))
        subprocess.check_call(prinseqCmd)
    os.unlink(filteredFastq)

def rmdup_mvicuna_bam(inBam, outBam, JVMmemory=None):
    ''' Remove duplicate reads from BAM file using M-Vicuna. The
        primary advantage to this approach over Picard's MarkDuplicates tool
        is that Picard requires that input reads are aligned to a reference,
        and M-Vicuna can operate on unaligned reads.
    '''
    # Convert BAM -> FASTQ pairs per read group and load all read groups
    tempDir = tempfile.mkdtemp()
    tools.picard.SamToFastqTool().per_read_group(inBam, tempDir, picardOptions=['VALIDATION_STRINGENCY=LENIENT'])
    read_groups = [x[1:] for x in tools.samtools.SamtoolsTool().getHeader(inBam) if x[0] == '@RG']
    read_groups = [dict(pair.split(':', 1) for pair in rg) for rg in read_groups]

    # Collect FASTQ pairs for each library
    lb_to_files = {}
    for rg in read_groups:
        lb_to_files.setdefault(rg.get('LB', 'none'), set())
        fname = rg['ID']
        if 'PU' in rg:
            fname = rg['PU']
        lb_to_files[rg.get('LB', 'none')].add(os.path.join(tempDir, fname))
    log.info("found %d distinct libraries and %d read groups", len(lb_to_files), len(read_groups))

    # For each library, merge FASTQs and run rmdup for entire library
    readList = mkstempfname('.keep_reads.txt')
    for lb, files in lb_to_files.items():
        log.info("executing M-Vicuna DupRm on library " + lb)

        # create merged FASTQs per library
        infastqs = (mkstempfname('.1.fastq'), mkstempfname('.2.fastq'))
        for d in range(2):
            with open(infastqs[d], 'wt') as outf:
                for fprefix in files:
                    fn = '%s_%d.fastq' % (fprefix, d + 1)
                    if os.path.isfile(fn):
                        with open(fn, 'rt') as inf:
                            for line in inf:
                                outf.write(line)
                        os.unlink(fn)
                    else:
                        log.warn(
                            "no reads found in %s, assuming that's because there are no reads in that read group", fn
                        )

        # M-Vicuna DupRm to see what we should keep (append IDs to running file)
        mvicuna_fastqs_to_readlist(infastqs[0], infastqs[1], readList)
        for fn in infastqs:
            os.unlink(fn)

    # Filter original input BAM against keep-list
    tools.picard.FilterSamReadsTool().execute(inBam, False, readList, outBam, JVMmemory=JVMmemory)
    return 0

def deplete_blastn_paired(infq1, infq2, outfq1, outfq2, refDbs):
    tmpfq1_a = mkstempfname('.fastq')
    tmpfq1_b = mkstempfname('.fastq')
    tmpfq2_b = mkstempfname('.fastq')
    tmpfq2_c = mkstempfname('.fastq')

    # deplete fq1
    deplete_blastn(infq1, tmpfq1_a, refDbs)

    # purge fq2 of read pairs lost in fq1
    # (this should significantly speed up the second run of deplete_blastn)
    read_utils.purge_unmated(tmpfq1_a, infq2, tmpfq1_b, tmpfq2_b)

    # deplete fq2
    deplete_blastn(tmpfq2_b, tmpfq2_c, refDbs)

    # purge fq1 of read pairs lost in fq2
    read_utils.purge_unmated(tmpfq1_b, tmpfq2_c, outfq1, outfq2)

def deplete_blastn_paired(infq1, infq2, outfq1, outfq2, refDbs, threads):
    'Use blastn to remove reads that match at least one of the databases.'
    tmpfq1_a = mkstempfname('.fastq')
    tmpfq1_b = mkstempfname('.fastq')
    tmpfq2_b = mkstempfname('.fastq')
    tmpfq2_c = mkstempfname('.fastq')

    # deplete fq1 (pass threads here too; the original omitted it, which
    # looks like an oversight)
    deplete_blastn(infq1, tmpfq1_a, refDbs, threads)

    # purge fq2 of read pairs lost in fq1
    # (this should significantly speed up the second run of deplete_blastn)
    read_utils.purge_unmated(tmpfq1_a, infq2, tmpfq1_b, tmpfq2_b)

    # deplete fq2
    deplete_blastn(tmpfq2_b, tmpfq2_c, refDbs, threads)

    # purge fq1 of read pairs lost in fq2
    read_utils.purge_unmated(tmpfq1_b, tmpfq2_c, outfq1, outfq2)

def fastq_to_bam(inFastq1, inFastq2, outBam, sampleName=None, header=None,
                 JVMmemory=tools.picard.FastqToSamTool.jvmMemDefault, picardOptions=None):
    ''' Convert a pair of fastq paired-end read files and optional text header
        to a single bam file.
    '''
    picardOptions = picardOptions or []

    if header:
        fastqToSamOut = mkstempfname('.bam')
    else:
        fastqToSamOut = outBam
    if sampleName is None:
        sampleName = 'Dummy'  # Will get overwritten by rehead command
    if header:
        # With the header option, rehead will be called after FastqToSam.
        # This will invalidate any md5 file, which would be slow to construct
        # on our own, so just disallow and let the caller run md5sum if desired.
        if any(opt.lower() == 'CREATE_MD5_FILE=True'.lower() for opt in picardOptions):
            raise Exception("""CREATE_MD5_FILE is not allowed with '--header.'""")
    tools.picard.FastqToSamTool().execute(
        inFastq1, inFastq2, sampleName, fastqToSamOut, picardOptions=picardOptions, JVMmemory=JVMmemory)

    if header:
        tools.samtools.SamtoolsTool().reheader(fastqToSamOut, header, outBam)

    return 0

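# Example of a minimal replacement header for the --header path above
# (hypothetical values; fields are tab-separated, per the SAM spec):
#   @HD  VN:1.4  SO:unsorted
#   @RG  ID:flowcell.lane  SM:sample1  LB:lib1  PL:illumina
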
def main_reheader_bams(args):
    ''' Copy BAM files while renaming elements of the BAM header.
        The mapping file specifies (key, old value, new value) mappings. For example:
            LB  lib1  lib_one
            SM  sample1  Sample_1
            SM  sample2  Sample_2
            SM  sample3  Sample_3
            CN  broad  BI
            FN  in1.bam  out1.bam
            FN  in2.bam  out2.bam
    '''
    # read mapping file
    mapper = dict((a + ':' + b, a + ':' + c) for a, b, c in util.file.read_tabfile(args.rgMap) if a != 'FN')
    files = list((b, c) for a, b, c in util.file.read_tabfile(args.rgMap) if a == 'FN')
    header_file = mkstempfname('.sam')

    # read and convert bam headers
    for inBam, outBam in files:
        if os.path.isfile(inBam):
            with open(header_file, 'wt') as outf:
                for row in tools.samtools.SamtoolsTool().getHeader(inBam):
                    if row[0] == '@RG':
                        row = [mapper.get(x, x) for x in row]
                    outf.write('\t'.join(row) + '\n')
            # write new bam with new header
            tools.samtools.SamtoolsTool().reheader(inBam, header_file, outBam)
    os.unlink(header_file)
    return 0

def bwamem_idxstats(inBam, refFasta, outBam=None, outStats=None, min_score_to_filter=None, aligner_options=None):
    ''' Take reads, align to reference with BWA-MEM and perform samtools idxstats.
    '''
    assert outBam or outStats, "Either outBam or outStats must be specified"

    if outBam is None:
        bam_aligned = mkstempfname('.aligned.bam')
    else:
        bam_aligned = outBam

    samtools = tools.samtools.SamtoolsTool()
    bwa = tools.bwa.Bwa()

    # index a temporary copy of the reference so index files are never written
    # next to the input
    ref_indexed = util.file.mkstempfname('.reference.fasta')
    shutil.copyfile(refFasta, ref_indexed)
    bwa.index(ref_indexed)

    bwa_opts = [] if aligner_options is None else aligner_options.split()
    # align against the indexed copy (the original passed refFasta here,
    # which would leave the index built above unused)
    bwa.mem(inBam, ref_indexed, bam_aligned, options=bwa_opts, min_score_to_filter=min_score_to_filter)

    if outStats is not None:
        samtools.idxstats(bam_aligned, outStats)

    if outBam is None:
        os.unlink(bam_aligned)

def lastal_get_hits(inFastq, db, outList,
                    max_gapless_alignments_per_position=1,
                    min_length_for_initial_matches=5,
                    max_length_for_initial_matches=50,
                    max_initial_matches_per_position=100):

    filteredFastq = mkstempfname('.filtered.fastq')
    lastal_chunked_fastq(
        inFastq,
        db,
        filteredFastq,
        max_gapless_alignments_per_position=max_gapless_alignments_per_position,
        min_length_for_initial_matches=min_length_for_initial_matches,
        max_length_for_initial_matches=max_length_for_initial_matches,
        max_initial_matches_per_position=max_initial_matches_per_position)

    with open(outList, 'wt') as outf:
        with open(filteredFastq, 'rt') as inf:
            line_num = 0
            for line in inf:
                if (line_num % 4) == 0:
                    seq_id = line.rstrip('\n\r')[1:]
                    if seq_id.endswith('/1') or seq_id.endswith('/2'):
                        seq_id = seq_id[:-2]
                    outf.write(seq_id + '\n')
                line_num += 1
    os.unlink(filteredFastq)

def align_and_count_hits(inBam, refFasta, outCounts, includeZeros=False, JVMmemory=None):
    ''' Take reads, align to reference with Novoalign and return aligned
        read counts for each reference sequence.
    '''
    bam_aligned = mkstempfname('.aligned.bam')
    tools.novoalign.NovoalignTool().execute(inBam, refFasta, bam_aligned, options=['-r', 'Random'], JVMmemory=JVMmemory)

    samtools = tools.samtools.SamtoolsTool()
    seqs = list(dict(x.split(':', 1) for x in row[1:])['SN']
                for row in samtools.getHeader(bam_aligned)
                if row[0] == '@SQ')

    with util.file.open_or_gzopen(outCounts, 'w') as outf:
        for seq in seqs:
            n = samtools.count(bam_aligned, regions=[seq])
            if n > 0 or includeZeros:
                outf.write("{}\t{}\n".format(seq, n))

    os.unlink(bam_aligned)

def align_and_count_hits(inBam, refFasta, outCounts, includeZeros=False, JVMmemory=None, threads=1):
    ''' Take reads, align to reference with Novoalign and return aligned
        read counts for each reference sequence.
    '''
    bam_aligned = mkstempfname('.aligned.bam')
    tools.novoalign.NovoalignTool().execute(
        inBam, refFasta, bam_aligned, options=['-r', 'Random'], JVMmemory=JVMmemory)

    samtools = tools.samtools.SamtoolsTool()
    seqs = list(dict(x.split(':', 1) for x in row[1:])['SN']
                for row in samtools.getHeader(bam_aligned)
                if row[0] == '@SQ')

    with util.file.open_or_gzopen(outCounts, 'w') as outf:
        for seq in seqs:
            n = samtools.count(bam_aligned, regions=[seq])
            if n > 0 or includeZeros:
                outf.write("{}\t{}\n".format(seq, n))

    os.unlink(bam_aligned)

def split_bam(inBam, outBams):
    '''Split BAM file equally into several output BAM files.
    '''
    samtools = tools.samtools.SamtoolsTool()
    picard = tools.picard.PicardTools()

    # get totalReadCount and maxReads
    # maxReads = totalReadCount / num files, but round up to the nearest
    # even number in order to keep read pairs together (assuming the input
    # is sorted in query order and has no unmated reads, which can be
    # accomplished by Picard RevertSam with SANITIZE=true)
    totalReadCount = samtools.count(inBam)
    maxReads = int(math.ceil(float(totalReadCount) / len(outBams) / 2) * 2)
    log.info("splitting %d reads into %d files of %d reads each", totalReadCount, len(outBams), maxReads)

    # load BAM header into memory
    header = samtools.getHeader(inBam)
    if 'SO:queryname' not in header[0]:
        raise Exception('Input BAM file must be sorted in queryname order')

    # dump to bigsam
    bigsam = mkstempfname('.sam')
    samtools.view([], inBam, bigsam)

    # split bigsam into little ones
    with util.file.open_or_gzopen(bigsam, 'rt') as inf:
        for outBam in outBams:
            log.info("preparing file " + outBam)
            tmp_sam_reads = mkstempfname('.sam')
            with open(tmp_sam_reads, 'wt') as outf:
                for row in header:
                    outf.write('\t'.join(row) + '\n')
                for _ in range(maxReads):
                    line = inf.readline()
                    if not line:
                        break
                    outf.write(line)
                if outBam == outBams[-1]:
                    for line in inf:
                        outf.write(line)
            picard.execute(
                "SamFormatConverter", ['INPUT=' + tmp_sam_reads, 'OUTPUT=' + outBam, 'VERBOSITY=WARNING'],
                JVMmemory='512m')
            os.unlink(tmp_sam_reads)
    os.unlink(bigsam)

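# Worked example of the rounding above: splitting 10 reads into 3 files gives
# maxReads = ceil(10 / 3 / 2) * 2 = ceil(1.67) * 2 = 4, so every chunk holds
# an even read count and mate pairs are never split across files.
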
def lastal_get_hits(inFastq, db, outList):
    lastalPath = tools.last.Lastal().install_and_get_path()
    mafSortPath = tools.last.MafSort().install_and_get_path()
    mafConvertPath = tools.last.MafConvert().install_and_get_path()
    noBlastLikeHitsPath = os.path.join(util.file.get_scripts_path(), 'noBlastLikeHits.py')

    lastalOut = mkstempfname('.lastal')
    with open(lastalOut, 'wt') as outf:
        cmd = [lastalPath, '-Q1', db, inFastq]
        log.debug(' '.join(cmd) + ' > ' + lastalOut)
        subprocess.check_call(cmd, stdout=outf)
    # everything below this point in this method should be replaced with
    # our own code that just reads lastal output and makes a list of read names

    mafSortOut = mkstempfname('.mafsort')
    with open(mafSortOut, 'wt') as outf:
        with open(lastalOut, 'rt') as inf:
            cmd = [mafSortPath, '-n2']
            log.debug('cat ' + lastalOut + ' | ' + ' '.join(cmd) + ' > ' + mafSortOut)
            subprocess.check_call(cmd, stdin=inf, stdout=outf)
    os.unlink(lastalOut)

    mafConvertOut = mkstempfname('.mafconvert')
    with open(mafConvertOut, 'wt') as outf:
        cmd = [mafConvertPath, 'tab', mafSortOut]
        log.debug(' '.join(cmd) + ' > ' + mafConvertOut)
        subprocess.check_call(cmd, stdout=outf)
    os.unlink(mafSortOut)

    filteredFastq = mkstempfname('.filtered.fastq')
    with open(filteredFastq, 'wt') as outf:
        cmd = [noBlastLikeHitsPath, '-b', mafConvertOut, '-r', inFastq, '-m', 'hit']
        log.debug(' '.join(cmd) + ' > ' + filteredFastq)
        subprocess.check_call(cmd, stdout=outf)
    os.unlink(mafConvertOut)

    with open(outList, 'wt') as outf:
        with open(filteredFastq, 'rt') as inf:
            line_num = 0
            for line in inf:
                if (line_num % 4) == 0:
                    seq_id = line.rstrip('\n\r')[1:]  # renamed from `id` to avoid shadowing the builtin
                    if seq_id.endswith('/1') or seq_id.endswith('/2'):
                        seq_id = seq_id[:-2]
                    outf.write(seq_id + '\n')
                line_num += 1

def align_and_fix(inBam, refFasta, outBamAll=None, outBamFiltered=None, novoalign_options='', JVMmemory=None, threads=1):
    ''' Take reads, align to reference with Novoalign, mark duplicates
        with Picard, realign indels with GATK, and optionally filter
        final file to mapped/non-dupe reads.
    '''
    if not (outBamAll or outBamFiltered):
        log.warn("are you sure you meant to do nothing?")
        return

    bam_aligned = mkstempfname('.aligned.bam')
    tools.novoalign.NovoalignTool().execute(
        inBam, refFasta, bam_aligned, options=novoalign_options.split(), JVMmemory=JVMmemory)

    bam_mkdup = mkstempfname('.mkdup.bam')
    tools.picard.MarkDuplicatesTool().execute(
        [bam_aligned], bam_mkdup, picardOptions=['CREATE_INDEX=true'], JVMmemory=JVMmemory)
    os.unlink(bam_aligned)

    bam_realigned = mkstempfname('.realigned.bam')
    tools.gatk.GATKTool().local_realign(bam_mkdup, refFasta, bam_realigned, JVMmemory=JVMmemory, threads=threads)
    os.unlink(bam_mkdup)

    if outBamAll:
        shutil.copyfile(bam_realigned, outBamAll)
        tools.picard.BuildBamIndexTool().execute(outBamAll)
    if outBamFiltered:
        tools.samtools.SamtoolsTool().view(['-b', '-q', '1', '-F', '1028'], bam_realigned, outBamFiltered)
        tools.picard.BuildBamIndexTool().execute(outBamFiltered)
    os.unlink(bam_realigned)

def multi_db_deplete_bam(inBam, refDbs, deplete_method, outBam):
    tmpBamIn = inBam
    for db in refDbs:
        tmpBamOut = mkstempfname('.bam')
        deplete_method(tmpBamIn, db, tmpBamOut)
        if tmpBamIn != inBam:
            os.unlink(tmpBamIn)
        tmpBamIn = tmpBamOut
    shutil.copyfile(tmpBamIn, outBam)

def main_deplete_human(args):
    ''' Run the entire depletion pipeline: bmtagger, mvicuna, blastn.
        Optionally, use lastal to select a specific taxon of interest.'''

    # Only run RevertSam if inBam is already aligned. Most of the time the
    # input will be unaligned, so we can save time by skipping RevertSam in
    # the unaligned case.
    #
    # Per the SAM/BAM spec, if the file is aligned, an @SQ line should be
    # present in the header. Using pysam, we can check len(header['SQ']) > 0.
    # https://samtools.github.io/hts-specs/SAMv1.pdf

    # if the user has requested a revertBam
    revertBamOut = args.revertBam if args.revertBam else mkstempfname('.bam')

    bamToDeplete = args.inBam
    with pysam.AlignmentFile(args.inBam, 'rb', check_sq=False) as bam:
        # if it looks like the bam is aligned, revert it
        if 'SQ' in bam.header and len(bam.header['SQ']) > 0:
            tools.picard.RevertSamTool().execute(
                args.inBam, revertBamOut, picardOptions=['SORT_ORDER=queryname', 'SANITIZE=true'])
            bamToDeplete = revertBamOut
        else:
            # if we don't need to produce a revertBam file,
            # but the user has specified one anyway,
            # simply touch the output
            if args.revertBam:
                log.warning("An output was specified for 'revertBam', but the input is unaligned, so RevertSam was not needed. Touching the output.")
                util.file.touch(revertBamOut)
                # TODO: error out? run RevertSam anyway?

    multi_db_deplete_bam(
        bamToDeplete,
        args.bmtaggerDbs,
        deplete_bmtagger_bam,
        args.bmtaggerBam,
        threads=args.threads,
        JVMmemory=args.JVMmemory)

    # if the user has not specified saving a revertBam, we used a temp file and can remove it
    if not args.revertBam:
        os.unlink(revertBamOut)

    read_utils.rmdup_mvicuna_bam(args.bmtaggerBam, args.rmdupBam, JVMmemory=args.JVMmemory)

    multi_db_deplete_bam(
        args.rmdupBam,
        args.blastDbs,
        deplete_blastn_bam,
        args.blastnBam,
        threads=args.threads,
        JVMmemory=args.JVMmemory)

    if args.taxfiltBam and args.lastDb:
        filter_lastal_bam(args.blastnBam, args.lastDb, args.taxfiltBam, JVMmemory=args.JVMmemory)
    return 0

def multi_db_deplete_bam(inBam, refDbs, deplete_method, outBam, JVMmemory=None):
    tmpBamIn = inBam
    for db in refDbs:
        tmpBamOut = mkstempfname('.bam')
        deplete_method(tmpBamIn, db, tmpBamOut, JVMmemory=JVMmemory)
        if tmpBamIn != inBam:
            os.unlink(tmpBamIn)
        tmpBamIn = tmpBamOut
    shutil.copyfile(tmpBamIn, outBam)

def mvicuna_fastqs_to_readlist(inFastq1, inFastq2, readList):
    # Run M-Vicuna on FASTQ files
    outFastq1 = mkstempfname('.1.fastq')
    outFastq2 = mkstempfname('.2.fastq')
    tools.mvicuna.MvicunaTool().rmdup((inFastq1, inFastq2), (outFastq1, outFastq2), None)

    # Make a list of reads to keep
    with open(readList, 'at') as outf:
        for fq in (outFastq1, outFastq2):
            with util.file.open_or_gzopen(fq, 'rt') as inf:
                line_num = 0
                for line in inf:
                    if (line_num % 4) == 0:
                        idVal = line.rstrip('\n')[1:]
                        if idVal.endswith('/1'):
                            outf.write(idVal[:-2] + '\n')
                    line_num += 1
    os.unlink(outFastq1)
    os.unlink(outFastq2)

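# The keep-list written above holds one bare read ID per line (the '/1' mate
# suffix stripped), which is the format Picard FilterSamReads expects for its
# READ_LIST_FILE argument when rmdup_mvicuna_bam filters the original BAM.
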
def deplete_blastn_bam(inBam, db, outBam, threads=None, chunkSize=1000000, JVMmemory=None):
    'Use blastn to remove reads that match at least one of the databases.'

    blast_hits = mkstempfname('.blast_hits.txt')

    if threads is None:
        threads = util.misc.available_cpu_count()
    else:
        threads = max(min(util.misc.available_cpu_count(), threads), 1)

    if chunkSize:
        ## chunk up input and perform blastn in several parallel threads

        # Initial BAM -> FASTA sequences
        fasta = mkstempfname('.fasta')
        tools.samtools.SamtoolsTool().bam2fa(inBam, fasta)

        # Find BLAST hits
        log.info("running blastn on %s against %s", inBam, db)
        blastOutFiles = blastn_chunked_fasta(fasta, db, chunkSize, threads)
        util.file.cat(blast_hits, blastOutFiles)
        os.unlink(fasta)
    else:
        ## pipe tools together and run blastn multithreaded
        with open(blast_hits, 'wt') as outf:
            for read_id in tools.blast.BlastnTool().get_hits(inBam, db, threads=threads):
                outf.write(read_id + '\n')

    # Deplete BAM of hits
    tools.picard.FilterSamReadsTool().execute(inBam, True, blast_hits, outBam, JVMmemory=JVMmemory)
    os.unlink(blast_hits)

def deplete_bmtagger_bam(inBam, db, outBam, threads=None, JVMmemory=None):
    """
    Use bmtagger to partition the input reads into ones that match at least one
    of the databases and ones that don't match any of the databases.

    inBam: paired-end input reads in BAM format.
    db: bmtagger expects files
        db.bitmask created by bmtool, and
        db.srprism.idx, db.srprism.map, etc. created by srprism mkindex
    outBam: the output BAM file to hold the unmatched reads.
    """
    bmtaggerPath = tools.bmtagger.BmtaggerShTool().install_and_get_path()

    # bmtagger calls several executables in the same directory, and blastn;
    # make sure they are accessible through $PATH
    blastnPath = tools.blast.BlastnTool().install_and_get_path()
    path = os.environ['PATH'].split(os.pathsep)
    for t in (bmtaggerPath, blastnPath):
        d = os.path.dirname(t)
        if d not in path:
            path = [d] + path
    path = os.pathsep.join(path)
    os.environ['PATH'] = path

    inReads1 = mkstempfname('.1.fastq')
    inReads2 = mkstempfname('.2.fastq')
    tools.samtools.SamtoolsTool().bam2fq(inBam, inReads1, inReads2)

    bmtaggerConf = mkstempfname('.bmtagger.conf')
    with open(bmtaggerConf, 'w') as f:
        # Default srprismopts: "-b 100000000 -n 5 -R 0 -r 1 -M 7168"
        print('srprismopts="-b 100000000 -n 5 -R 0 -r 1 -M 7168 --paired false"', file=f)

    tempDir = tempfile.mkdtemp()
    matchesFile = mkstempfname('.txt')
    cmdline = [
        bmtaggerPath, '-b', db + '.bitmask', '-C', bmtaggerConf, '-x', db + '.srprism', '-T', tempDir,
        '-q1', '-1', inReads1, '-2', inReads2, '-o', matchesFile
    ]
    log.debug(' '.join(cmdline))
    util.misc.run_and_print(cmdline, check=True)

    tools.picard.FilterSamReadsTool().execute(inBam, True, matchesFile, outBam, JVMmemory=JVMmemory)
    os.unlink(matchesFile)

def multi_db_deplete_bam(inBam, refDbs, deplete_method, outBam, threads=1, JVMmemory=None):
    samtools = tools.samtools.SamtoolsTool()
    tmpBamIn = inBam
    for db in refDbs:
        if not samtools.isEmpty(tmpBamIn):
            tmpBamOut = mkstempfname('.bam')
            deplete_method(tmpBamIn, db, tmpBamOut, threads=threads, JVMmemory=JVMmemory)
            if tmpBamIn != inBam:
                os.unlink(tmpBamIn)
            tmpBamIn = tmpBamOut
    shutil.copyfile(tmpBamIn, outBam)

def multi_db_deplete_bam(inBam, refDbs, deplete_method, outBam, **kwargs):
    samtools = tools.samtools.SamtoolsTool()
    tmpBamIn = inBam
    for db in refDbs:
        if not samtools.isEmpty(tmpBamIn):
            tmpBamOut = mkstempfname('.bam')
            deplete_method(tmpBamIn, db, tmpBamOut, **kwargs)
            if tmpBamIn != inBam:
                os.unlink(tmpBamIn)
            tmpBamIn = tmpBamOut
    shutil.copyfile(tmpBamIn, outBam)

def filter_lastal(inFastq, refDb, outFastq):
    ''' Restrict input reads to those that align to the given
        reference database using LASTAL. Also, remove duplicates with prinseq.
    '''
    assert outFastq.endswith('.fastq')
    tempFilePath = mkstempfname('.hits')
    lastalPath = tools.last.Lastal().install_and_get_path()
    mafSortPath = tools.last.MafSort().install_and_get_path()
    mafConvertPath = tools.last.MafConvert().install_and_get_path()
    prinseqPath = tools.prinseq.PrinseqTool().install_and_get_path()
    noBlastLikeHitsPath = os.path.join(util.file.get_scripts_path(), 'noBlastLikeHits.py')

    lastalCmd = ' '.join([
        '{lastalPath} -Q1 {refDb} {inFastq}'.format(lastalPath=lastalPath, refDb=refDb, inFastq=inFastq),
        '| {mafSortPath} -n2'.format(mafSortPath=mafSortPath),
        '| {mafConvertPath} tab /dev/stdin > {tempFilePath}'.format(
            mafConvertPath=mafConvertPath, tempFilePath=tempFilePath),
    ])
    log.debug(lastalCmd)
    assert not os.system(lastalCmd)

    # filter inFastq against lastal hits
    filteredFastq = mkstempfname('.filtered.fastq')
    with open(filteredFastq, 'wt') as outf:
        noBlastLikeHitsCmd = [noBlastLikeHitsPath, '-b', tempFilePath, '-r', inFastq, '-m', 'hit']
        log.debug(' '.join(noBlastLikeHitsCmd) + ' > ' + filteredFastq)
        subprocess.check_call(noBlastLikeHitsCmd, stdout=outf)

    # remove duplicate reads and reads with multiple Ns
    if os.path.getsize(filteredFastq) == 0:
        # prinseq-lite fails on empty file input (which can happen in real life
        # if no reads match the refDb), so handle this scenario specially
        log.info("output is empty: no reads in input match refDb")
        shutil.copyfile(filteredFastq, outFastq)
    else:
        prinseqCmd = [
            'perl', prinseqPath, '-ns_max_n', '1', '-derep', '1', '-fastq', filteredFastq, '-out_bad', 'null',
            '-line_width', '0', '-out_good', outFastq[:-6]
        ]
        log.debug(' '.join(prinseqCmd))
        subprocess.check_call(prinseqCmd)
    os.unlink(filteredFastq)

def run_blastn(blastn_path, db, input_fasta, blast_threads=1):
    """ run blastn on the input fasta file. this is intended to be run in parallel """
    chunk_hits = mkstempfname('.hits.txt')
    blastnCmd = [
        blastn_path, '-db', db, '-word_size', '16', '-num_threads', str(blast_threads), '-evalue', '1e-6',
        '-outfmt', '6', '-max_target_seqs', '2', '-query', input_fasta, '-out', chunk_hits
    ]
    log.debug(' '.join(blastnCmd))
    util.misc.run_and_print(blastnCmd, check=True)

    os.unlink(input_fasta)
    return chunk_hits

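# How run_blastn is likely fanned out by the chunked caller (a sketch, not the
# original dispatch code): one task per chunk FASTA on a thread pool, which
# works well here because each task mostly blocks on a blastn subprocess.
import concurrent.futures

def _blastn_parallel_sketch(blastn_path, db, chunk_fastas, threads):
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as pool:
        futures = [pool.submit(run_blastn, blastn_path, db, chunk) for chunk in chunk_fastas]
        # collect per-chunk hit files in submission order
        return [f.result() for f in futures]
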
def deplete_blastn(inFastq, outFastq, refDbs, threads=1, chunkSize=1000000):
    'Use blastn to remove reads that match at least one of the databases.'

    # Convert to fasta
    inFasta = mkstempfname('.fasta')
    read_utils.fastq_to_fasta(inFastq, inFasta)

    # Run blastn using each of the databases in turn
    blastOutFiles = []
    for db in refDbs:
        log.info("running blastn on %s against %s", inFastq, db)
        blastOutFiles += blastn_chunked_fasta(inFasta, db, chunkSize, threads)

    # Combine results from different databases
    blastOutCombined = mkstempfname('.txt')
    catCmd = ['cat'] + blastOutFiles
    log.debug(' '.join(catCmd) + ' > ' + blastOutCombined)
    with open(blastOutCombined, 'wt') as outf:
        subprocess.check_call(catCmd, stdout=outf)

    # extract reads with no blast hits
    no_blast_hits(blastOutCombined, inFastq, outFastq)

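# `no_blast_hits` above replaced the external noBlastHits_v3.py script and is
# defined elsewhere in this module. A minimal sketch of the 'nohit' behavior:
# collect every query ID seen in the tabular (-outfmt 6) blast output, then
# keep only FASTQ records whose IDs never appear.
def _no_blast_hits_sketch(blastOutCombined, inFastq, outFastq):
    hits = set()
    with open(blastOutCombined, 'rt') as inf:
        for line in inf:
            hits.add(line.split('\t')[0].strip())
    with util.file.open_or_gzopen(inFastq, 'rt') as inf, \
            util.file.open_or_gzopen(outFastq, 'wt') as outf:
        record = []
        for i, line in enumerate(inf):
            record.append(line)
            if i % 4 == 3:  # a FASTQ record is 4 lines
                read_id = record[0].rstrip('\n')[1:].split()[0]
                if read_id not in hits:
                    outf.write(''.join(record))
                record = []
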
def deplete_blastn_bam(inBam, db, outBam, threads, chunkSize=1000000, JVMmemory=None):
    'Use blastn to remove reads that match at least one of the databases.'
    fastq1 = mkstempfname('.1.fastq')
    fasta = mkstempfname('.1.fasta')
    blast_hits = mkstempfname('.blast_hits.txt')

    # Initial BAM -> FASTQ
    tools.samtools.SamtoolsTool().bam2fq(inBam, fastq1)

    # Find BLAST hits
    read_utils.fastq_to_fasta(fastq1, fasta)
    os.unlink(fastq1)
    log.info("running blastn on %s against %s", inBam, db)
    blastOutFiles = blastn_chunked_fasta(fasta, db, chunkSize, threads)
    with open(blast_hits, 'wt') as outf:
        for blastOutFile in blastOutFiles:
            with open(blastOutFile, 'rt') as inf:
                for line in inf:
                    idVal = line.split('\t')[0].strip()
                    if idVal.endswith('/1') or idVal.endswith('/2'):
                        idVal = idVal[:-2]
                    outf.write(idVal + '\n')
            os.unlink(blastOutFile)
    os.unlink(fasta)

    # Deplete BAM of hits
    tools.picard.FilterSamReadsTool().execute(inBam, True, blast_hits, outBam, JVMmemory=JVMmemory)
    os.unlink(blast_hits)

def deplete_blastn_bam(inBam, db, outBam, chunkSize=1000000, JVMmemory=None):
    'Use blastn to remove reads that match at least one of the databases.'
    fastq1 = mkstempfname('.1.fastq')
    fastq2 = mkstempfname('.2.fastq')
    fasta = mkstempfname('.1.fasta')
    blast_hits = mkstempfname('.blast_hits.txt')
    halfBam = mkstempfname('.half.bam')

    # Initial BAM -> FASTQ pair
    tools.picard.SamToFastqTool().execute(inBam, fastq1, fastq2)

    # Find BLAST hits against FASTQ1
    read_utils.fastq_to_fasta(fastq1, fasta)
    os.unlink(fastq1)
    os.unlink(fastq2)
    log.info("running blastn on %s pair 1 against %s", inBam, db)
    blastOutFiles = blastn_chunked_fasta(fasta, db, chunkSize)
    with open(blast_hits, 'wt') as outf:
        for blastOutFile in blastOutFiles:
            with open(blastOutFile, 'rt') as inf:
                for line in inf:
                    idVal = line.split('\t')[0].strip()
                    if idVal.endswith('/1') or idVal.endswith('/2'):
                        idVal = idVal[:-2]
                    outf.write(idVal + '\n')
            os.unlink(blastOutFile)

    # Deplete BAM of hits in FASTQ1
    tools.picard.FilterSamReadsTool().execute(inBam, True, blast_hits, halfBam, JVMmemory=JVMmemory)

    # Depleted BAM -> FASTQ pair
    tools.picard.SamToFastqTool().execute(halfBam, fastq1, fastq2)

    # Find BLAST hits against FASTQ2 (which is already smaller than before)
    read_utils.fastq_to_fasta(fastq2, fasta)
    os.unlink(fastq1)
    os.unlink(fastq2)
    log.info("running blastn on %s pair 2 against %s", inBam, db)
    blastOutFiles = blastn_chunked_fasta(fasta, db, chunkSize)
    with open(blast_hits, 'wt') as outf:
        for blastOutFile in blastOutFiles:
            with open(blastOutFile, 'rt') as inf:
                for line in inf:
                    idVal = line.split('\t')[0].strip()
                    if idVal.endswith('/1') or idVal.endswith('/2'):
                        idVal = idVal[:-2]
                    outf.write(idVal + '\n')
            os.unlink(blastOutFile)

    # Deplete BAM of hits against FASTQ2
    tools.picard.FilterSamReadsTool().execute(halfBam, True, blast_hits, outBam, JVMmemory=JVMmemory)

    # Clean up
    for fn in (fasta, blast_hits, halfBam):
        os.unlink(fn)

def deplete_blastn_bam(inBam, db, outBam, threads, chunkSize=1000000, JVMmemory=None):
    'Use blastn to remove reads that match at least one of the databases.'
    fastq1 = mkstempfname('.1.fastq')
    fastq2 = mkstempfname('.2.fastq')
    fasta = mkstempfname('.1.fasta')
    blast_hits = mkstempfname('.blast_hits.txt')
    halfBam = mkstempfname('.half.bam')

    # Initial BAM -> FASTQ pair
    tools.samtools.SamtoolsTool().bam2fq(inBam, fastq1, fastq2)

    # Find BLAST hits against FASTQ1
    read_utils.fastq_to_fasta(fastq1, fasta)
    os.unlink(fastq1)
    os.unlink(fastq2)
    log.info("running blastn on %s pair 1 against %s", inBam, db)
    blastOutFiles = blastn_chunked_fasta(fasta, db, chunkSize, threads)
    with open(blast_hits, 'wt') as outf:
        for blastOutFile in blastOutFiles:
            with open(blastOutFile, 'rt') as inf:
                for line in inf:
                    idVal = line.split('\t')[0].strip()
                    if idVal.endswith('/1') or idVal.endswith('/2'):
                        idVal = idVal[:-2]
                    outf.write(idVal + '\n')
            os.unlink(blastOutFile)

    # Deplete BAM of hits in FASTQ1
    tools.picard.FilterSamReadsTool().execute(inBam, True, blast_hits, halfBam, JVMmemory=JVMmemory)

    # Depleted BAM -> FASTQ pair
    tools.samtools.SamtoolsTool().bam2fq(halfBam, fastq1, fastq2)

    # Find BLAST hits against FASTQ2 (which is already smaller than before)
    read_utils.fastq_to_fasta(fastq2, fasta)
    os.unlink(fastq1)
    os.unlink(fastq2)
    log.info("running blastn on %s pair 2 against %s", inBam, db)
    blastOutFiles = blastn_chunked_fasta(fasta, db, chunkSize, threads)
    with open(blast_hits, 'wt') as outf:
        for blastOutFile in blastOutFiles:
            with open(blastOutFile, 'rt') as inf:
                for line in inf:
                    idVal = line.split('\t')[0].strip()
                    if idVal.endswith('/1') or idVal.endswith('/2'):
                        idVal = idVal[:-2]
                    outf.write(idVal + '\n')
            os.unlink(blastOutFile)

    # Deplete BAM of hits against FASTQ2
    tools.picard.FilterSamReadsTool().execute(halfBam, True, blast_hits, outBam, JVMmemory=JVMmemory)

    # Clean up
    for fn in (fasta, blast_hits, halfBam):
        os.unlink(fn)

def deplete_bmtagger_bam(inBam, db, outBam):
    """
    Use bmtagger to partition the input reads into ones that match at least one
    of the databases and ones that don't match any of the databases.

    inBam: paired-end input reads in BAM format.
    db: bmtagger expects files
        db.bitmask created by bmtool, and
        db.srprism.idx, db.srprism.map, etc. created by srprism mkindex
    outBam: the output BAM file to hold the unmatched reads.
    """
    bmtaggerPath = tools.bmtagger.BmtaggerShTool().install_and_get_path()

    # bmtagger calls several executables in the same directory, and blastn;
    # make sure they are accessible through $PATH
    blastnPath = tools.blast.BlastnTool().install_and_get_path()
    path = os.environ['PATH'].split(os.pathsep)
    for t in (bmtaggerPath, blastnPath):
        d = os.path.dirname(t)
        if d not in path:
            path = [d] + path
    path = os.pathsep.join(path)
    os.environ['PATH'] = path

    inReads1 = mkstempfname('.1.fastq')
    inReads2 = mkstempfname('.2.fastq')
    tools.picard.SamToFastqTool().execute(inBam, inReads1, inReads2)

    tempDir = tempfile.mkdtemp()
    matchesFile = mkstempfname('.txt')
    cmdline = [
        bmtaggerPath, '-b', db + '.bitmask', '-x', db + '.srprism', '-T', tempDir,
        '-q1', '-1', inReads1, '-2', inReads2, '-o', matchesFile
    ]
    log.debug(' '.join(cmdline))
    subprocess.check_call(cmdline)

    tools.picard.FilterSamReadsTool().execute(inBam, True, matchesFile, outBam)
    os.unlink(matchesFile)

def purge_unmated(inFastq1, inFastq2, outFastq1, outFastq2, regex=r'^@(\S+)/[1|2]$'):
    '''Use mergeShuffledFastqSeqs to purge unmated reads, and
       put corresponding reads in the same order.
       Corresponding sequences must have sequence identifiers
       of the form SEQID/1 and SEQID/2.
    '''
    tempOutput = mkstempfname()
    mergeShuffledFastqSeqsPath = os.path.join(util.file.get_scripts_path(), 'mergeShuffledFastqSeqs.pl')
    cmdline = [mergeShuffledFastqSeqsPath, '-t', '-r', regex, '-f1', inFastq1, '-f2', inFastq2, '-o', tempOutput]
    log.debug(' '.join(cmdline))
    subprocess.check_call(cmdline)

    shutil.move(tempOutput + '.1.fastq', outFastq1)
    shutil.move(tempOutput + '.2.fastq', outFastq2)
    return 0

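# Example: the default regex captures 'M01234:55:007' (a hypothetical read
# name) from '@M01234:55:007/1', so the /1 and /2 mates pair up by SEQID.
# Note the character class [1|2] matches '1', '|', or '2' -- not just '1' or
# '2' -- so [12] would be a stricter equivalent.
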
def deplete_blastn(inFastq, outFastq, refDbs):
    'Use blastn to remove reads that match at least one of the databases.'

    ## Get tools
    blastnPath = tools.blast.BlastnTool().install_and_get_path()
    noBlastHits_v3Path = os.path.join(util.file.get_scripts_path(), 'noBlastHits_v3.py')

    ## Convert to fasta
    inFasta = mkstempfname('.fasta')
    read_utils.fastq_to_fasta(inFastq, inFasta)

    ## Run blastn using each of the databases in turn
    blastOutFiles = [mkstempfname() for db in refDbs]
    for db, blastOutFile in zip(refDbs, blastOutFiles):
        log.info("running blastn on {} against {}".format(inFastq, db))
        blastnCmd = [
            blastnPath, '-db', db, '-word_size', '16', '-evalue', '1e-6', '-outfmt', '6',
            '-num_descriptions', '2', '-num_alignments', '2', '-query', inFasta, '-out', blastOutFile
        ]
        log.debug(' '.join(blastnCmd))
        subprocess.check_call(blastnCmd)

    ## Combine results from different databases
    blastOutCombined = mkstempfname('.txt')
    catCmd = ['cat'] + blastOutFiles
    log.debug(' '.join(catCmd) + ' > ' + blastOutCombined)
    with open(blastOutCombined, 'wt') as outf:
        subprocess.check_call(catCmd, stdout=outf)

    ## run noBlastHits_v3.py to extract reads with no blast hits
    # TODO: slurp the small amount of code in this script into here
    noBlastHitsCmd = ['python', noBlastHits_v3Path, '-b', blastOutCombined, '-r', inFastq, '-m', 'nohit']
    log.debug(' '.join(noBlastHitsCmd) + ' > ' + outFastq)
    with util.file.open_or_gzopen(outFastq, 'wt') as outf:
        subprocess.check_call(noBlastHitsCmd, stdout=outf)

def deplete_bmtagger_bam(inBam, db, outBam, JVMmemory=None):
    """
    Use bmtagger to partition the input reads into ones that match at least one
    of the databases and ones that don't match any of the databases.

    inBam: paired-end input reads in BAM format.
    db: bmtagger expects files
        db.bitmask created by bmtool, and
        db.srprism.idx, db.srprism.map, etc. created by srprism mkindex
    outBam: the output BAM file to hold the unmatched reads.
    """
    bmtaggerPath = tools.bmtagger.BmtaggerShTool().install_and_get_path()

    # bmtagger calls several executables in the same directory, and blastn;
    # make sure they are accessible through $PATH
    blastnPath = tools.blast.BlastnTool().install_and_get_path()
    path = os.environ['PATH'].split(os.pathsep)
    for t in (bmtaggerPath, blastnPath):
        d = os.path.dirname(t)
        if d not in path:
            path = [d] + path
    path = os.pathsep.join(path)
    os.environ['PATH'] = path

    inReads1 = mkstempfname('.1.fastq')
    inReads2 = mkstempfname('.2.fastq')
    tools.picard.SamToFastqTool().execute(inBam, inReads1, inReads2)

    tempDir = tempfile.mkdtemp()
    matchesFile = mkstempfname('.txt')
    cmdline = [
        bmtaggerPath, '-b', db + '.bitmask', '-x', db + '.srprism', '-T', tempDir,
        '-q1', '-1', inReads1, '-2', inReads2, '-o', matchesFile
    ]
    log.debug(' '.join(cmdline))
    subprocess.check_call(cmdline)

    tools.picard.FilterSamReadsTool().execute(inBam, True, matchesFile, outBam, JVMmemory=JVMmemory)
    os.unlink(matchesFile)

def filter_lastal_bam(inBam, db, outBam,
                      max_gapless_alignments_per_position=1,
                      min_length_for_initial_matches=5,
                      max_length_for_initial_matches=50,
                      max_initial_matches_per_position=100,
                      JVMmemory=None):
    ''' Restrict input reads to those that align to the given
        reference database using LASTAL.
    '''
    # convert BAM to FASTQ
    inReads = mkstempfname('.all.fastq')
    tools.samtools.SamtoolsTool().bam2fq(inBam, inReads)

    # look for hits in FASTQ
    hitList1 = mkstempfname('.hits')
    lastal_get_hits(inReads, db, hitList1, max_gapless_alignments_per_position, min_length_for_initial_matches,
                    max_length_for_initial_matches, max_initial_matches_per_position)
    os.unlink(inReads)

    # merge & uniquify hits
    hitList = mkstempfname('.hits')
    with open(hitList, 'wt') as outf:
        subprocess.check_call(['sort', '-u', hitList1], stdout=outf)
    os.unlink(hitList1)

    # filter original BAM file against keep list
    tools.picard.FilterSamReadsTool().execute(inBam, False, hitList, outBam, JVMmemory=JVMmemory)
    os.unlink(hitList)

def trimmomatic(inFastq1, inFastq2, pairedOutFastq1, pairedOutFastq2, clipFasta):
    '''Trim read sequences with Trimmomatic.'''
    trimmomaticPath = tools.trimmomatic.TrimmomaticTool().install_and_get_path()
    tmpUnpaired1 = mkstempfname()
    tmpUnpaired2 = mkstempfname()

    javaCmd = []

    # the conda version wraps the jar file with a shell script
    if trimmomaticPath.endswith(".jar"):
        # This java program has a lot of arguments...
        javaCmd.extend([
            'java', '-Xmx2g', '-Djava.io.tmpdir=' + tempfile.tempdir, '-classpath', trimmomaticPath,
            'org.usadellab.trimmomatic.TrimmomaticPE'
        ])
    else:
        javaCmd.extend([trimmomaticPath, "PE"])

    javaCmd.extend([
        inFastq1, inFastq2, pairedOutFastq1, tmpUnpaired1, pairedOutFastq2, tmpUnpaired2, 'LEADING:20',
        'TRAILING:20', 'SLIDINGWINDOW:4:25', 'MINLEN:30', 'ILLUMINACLIP:{}:2:30:12'.format(clipFasta)
    ])

    log.debug(' '.join(javaCmd))
    subprocess.check_call(javaCmd)
    os.unlink(tmpUnpaired1)
    os.unlink(tmpUnpaired2)

def deplete_bmtagger(inFastq1, inFastq2, databases, outFastq1, outFastq2):
    """
    Use bmtagger to partition the input reads into ones that match at least one
    of the databases and ones that don't match any of the databases.

    inFastq1, inFastq2: paired-end input reads in fastq format.
        The names of the reads must be in one-to-one correspondence.
    databases: for each db in databases, bmtagger expects files
        db.bitmask created by bmtool, and
        db.srprism.idx, db.srprism.map, etc. created by srprism mkindex
    outFastq1, outFastq2: pair of output fastq files depleted of reads present
        in the databases

    This version is optimized for the case of only requiring depletion, which
    allows us to avoid time-intensive lookups.
    """
    bmtaggerPath = tools.bmtagger.BmtaggerShTool().install_and_get_path()
    blastnPath = tools.blast.BlastnTool().install_and_get_path()

    # bmtagger calls several executables in the same directory, and blastn;
    # make sure they are accessible through $PATH
    path = os.environ['PATH'].split(os.pathsep)
    for t in (bmtaggerPath, blastnPath):
        d = os.path.dirname(t)
        if d not in path:
            path = [d] + path
    path = os.pathsep.join(path)
    os.environ['PATH'] = path

    tempDir = tempfile.mkdtemp()
    curReads1, curReads2 = inFastq1, inFastq2
    tempfiles = []
    for db in databases:
        outprefix = mkstempfname()
        cmdline = [
            bmtaggerPath, '-X', '-b', db + '.bitmask', '-x', db + '.srprism', '-T', tempDir,
            '-q1', '-1', curReads1, '-2', curReads2, '-o', outprefix
        ]
        log.debug(' '.join(cmdline))
        util.misc.run_and_print(cmdline, check=True)
        curReads1, curReads2 = [outprefix + suffix for suffix in ('_1.fastq', '_2.fastq')]
        tempfiles += [curReads1, curReads2]

    shutil.copyfile(curReads1, outFastq1)
    shutil.copyfile(curReads2, outFastq2)
    for fn in tempfiles:
        os.unlink(fn)
    log.debug("deplete_bmtagger complete")

def filter_lastal(inFastq, refDbs, outFastq):
    """ TODO: make this operate on BAM files """
    assert outFastq.endswith('.fastq')
    outFastq = outFastq[:-6]
    tempFilePath = mkstempfname()
    lastalPath = tools.last.Lastal().install_and_get_path()
    mafSortPath = tools.last.MafSort().install_and_get_path()
    mafConvertPath = tools.last.MafConvert().install_and_get_path()
    prinseqPath = tools.prinseq.PrinseqTool().install_and_get_path()
    noBlastLikeHitsPath = os.path.join(util.file.get_scripts_path(), 'noBlastLikeHits.py')

    # each pipe-separated cmd gets its own line
    # unfortunately, it doesn't seem to work to do .format(**locals()) on the
    # final string as opposed to the individual parts.
    lastalCmd = ' '.join([
        '{lastalPath} -Q1 {refDbs} {inFastq}'.format(**locals()),
        '| {mafSortPath} -n2'.format(**locals()),
        '| {mafConvertPath} tab /dev/stdin > {tempFilePath}'.format(**locals()),
    ])
    log.debug(lastalCmd)
    assert not os.system(lastalCmd)

    # each option/flag on its own line
    noBlastLikeHitsCmd = ' '.join([
        'python', noBlastLikeHitsPath, '-b', tempFilePath, '-r', inFastq, '-m hit'
    ])

    prinseqCmd = ' '.join([
        'perl', prinseqPath, '-ns_max_n 1', '-derep 1', '-fastq stdin', '-out_bad null', '-line_width 0',
        '-out_good', outFastq
    ])

    fullCmd = "{noBlastLikeHitsCmd} | {prinseqCmd}".format(**locals())
    log.debug(fullCmd)
    assert not os.system(fullCmd)
    log.debug("done")
