def deplete_blastn(inFastq, outFastq, refDbs):
    """Use blastn to remove reads that match at least one of the databases.

    Args:
        inFastq: path of the FASTQ file to deplete.
        outFastq: path the reads without blast hits are written to (opened
            through util.file.open_or_gzopen).
        refDbs: iterable of blastn databases; each one is searched in turn.

    Raises:
        subprocess.CalledProcessError: if noBlastHits_v3.py exits non-zero.
    """
    import shutil

    # Get tools
    noBlastHits_v3Path = os.path.join(util.file.get_scripts_path(), 'noBlastHits_v3.py')

    # Convert to fasta
    inFasta = mkstempfname('.fasta')
    read_utils.fastq_to_fasta(inFastq, inFasta)

    # Run blastn using each of the databases in turn
    blastOutFiles = []
    for db in refDbs:
        log.info("running blastn on %s against %s", inFastq, db)
        blastOutFiles += blastn_chunked_fasta(inFasta, db)
    os.unlink(inFasta)

    # Combine results from different databases.  This is done in-process
    # rather than by spawning `cat`: with an empty refDbs a bare `cat` would
    # read from stdin and block.
    blastOutCombined = mkstempfname('.txt')
    log.debug("concatenating %d blast output file(s) into %s", len(blastOutFiles), blastOutCombined)
    with open(blastOutCombined, 'wb') as outf:
        for blastOutFile in blastOutFiles:
            with open(blastOutFile, 'rb') as inf:
                shutil.copyfileobj(inf, outf)
            # the per-database output is no longer needed once merged
            os.unlink(blastOutFile)

    # Run noBlastHits_v3.py to extract reads with no blast hits
    # TODO: slurp the small amount of code in this script into here
    noBlastHitsCmd = ['python', noBlastHits_v3Path, '-b', blastOutCombined, '-r', inFastq, '-m', 'nohit']
    log.debug(' '.join(noBlastHitsCmd) + '> ' + outFastq)
    with util.file.open_or_gzopen(outFastq, 'wt') as outf:
        subprocess.check_call(noBlastHitsCmd, stdout=outf)
    os.unlink(blastOutCombined)
def deplete_blastn_bam(inBam, db, outBam, threads, chunkSize=1000000, JVMmemory=None):
    """Use blastn to remove reads that match at least one of the databases.

    The BAM is depleted in two passes: first the reads whose mate-1 sequence
    hits ``db`` are filtered out, then the same is done for mate 2 on the
    already-reduced BAM.

    Args:
        inBam: path of the input BAM file.
        db: blastn database to search against.
        outBam: path of the depleted BAM file to write.
        threads: passed through to blastn_chunked_fasta.
        chunkSize: passed through to blastn_chunked_fasta.
        JVMmemory: passed through to the Picard FilterSamReads invocation.
    """
    fastq1 = mkstempfname('.1.fastq')
    fastq2 = mkstempfname('.2.fastq')
    fasta = mkstempfname('.1.fasta')
    blast_hits = mkstempfname('.blast_hits.txt')
    halfBam = mkstempfname('.half.bam')
    # NOTE: the original also reserved an extra '.hits.txt' temp name that was
    # immediately shadowed by the loop variable and never removed; it is gone.

    def _write_hit_ids(queryFastq, mate):
        """Blast queryFastq against db and write the hit read IDs to blast_hits."""
        read_utils.fastq_to_fasta(queryFastq, fasta)
        # both FASTQ files are disposable once the FASTA has been written
        os.unlink(fastq1)
        os.unlink(fastq2)
        log.info("running blastn on %s pair %s against %s", inBam, mate, db)
        blastOutFiles = blastn_chunked_fasta(fasta, db, chunkSize, threads)
        with open(blast_hits, 'wt') as outf:
            for blastOutFile in blastOutFiles:
                with open(blastOutFile, 'rt') as inf:
                    for line in inf:
                        # first tab-separated column is the query (read) ID
                        idVal = line.split('\t')[0].strip()
                        # drop a trailing mate suffix so the ID names the pair
                        if idVal.endswith(('/1', '/2')):
                            idVal = idVal[:-2]
                        outf.write(idVal + '\n')
                os.unlink(blastOutFile)

    # Initial BAM -> FASTQ pair
    tools.samtools.SamtoolsTool().bam2fq(inBam, fastq1, fastq2)

    # Find BLAST hits against FASTQ1
    _write_hit_ids(fastq1, '1')

    # Deplete BAM of hits in FASTQ1
    tools.picard.FilterSamReadsTool().execute(inBam, True, blast_hits, halfBam, JVMmemory=JVMmemory)

    # Depleted BAM -> FASTQ pair
    tools.samtools.SamtoolsTool().bam2fq(halfBam, fastq1, fastq2)

    # Find BLAST hits against FASTQ2 (which is already smaller than before)
    _write_hit_ids(fastq2, '2')

    # Deplete BAM of hits against FASTQ2
    tools.picard.FilterSamReadsTool().execute(halfBam, True, blast_hits, outBam, JVMmemory=JVMmemory)

    # Clean up
    for fn in (fasta, blast_hits, halfBam):
        os.unlink(fn)
def deplete_blastn_bam(inBam, db, outBam, chunkSize=1000000, JVMmemory=None):
    """Use blastn to remove reads that match at least one of the databases.

    The BAM is depleted in two passes: first the reads whose mate-1 sequence
    hits ``db`` are filtered out, then the same is done for mate 2 on the
    already-reduced BAM.

    Args:
        inBam: path of the input BAM file.
        db: blastn database to search against.
        outBam: path of the depleted BAM file to write.
        chunkSize: passed through to blastn_chunked_fasta.
        JVMmemory: passed through to the Picard FilterSamReads invocation.
    """
    fastq1 = mkstempfname('.1.fastq')
    fastq2 = mkstempfname('.2.fastq')
    fasta = mkstempfname('.1.fasta')
    blast_hits = mkstempfname('.blast_hits.txt')
    halfBam = mkstempfname('.half.bam')
    # NOTE: the original also reserved an extra '.hits.txt' temp name that was
    # immediately shadowed by the loop variable and never removed; it is gone.

    def _write_hit_ids(queryFastq, mate):
        """Blast queryFastq against db and write the hit read IDs to blast_hits."""
        read_utils.fastq_to_fasta(queryFastq, fasta)
        # both FASTQ files are disposable once the FASTA has been written
        os.unlink(fastq1)
        os.unlink(fastq2)
        log.info("running blastn on %s pair %s against %s", inBam, mate, db)
        blastOutFiles = blastn_chunked_fasta(fasta, db, chunkSize)
        with open(blast_hits, 'wt') as outf:
            for blastOutFile in blastOutFiles:
                with open(blastOutFile, 'rt') as inf:
                    for line in inf:
                        # first tab-separated column is the query (read) ID
                        idVal = line.split('\t')[0].strip()
                        # drop a trailing mate suffix so the ID names the pair
                        if idVal.endswith(('/1', '/2')):
                            idVal = idVal[:-2]
                        outf.write(idVal + '\n')
                os.unlink(blastOutFile)

    # Initial BAM -> FASTQ pair
    tools.picard.SamToFastqTool().execute(inBam, fastq1, fastq2)

    # Find BLAST hits against FASTQ1
    _write_hit_ids(fastq1, '1')

    # Deplete BAM of hits in FASTQ1
    tools.picard.FilterSamReadsTool().execute(inBam, True, blast_hits, halfBam, JVMmemory=JVMmemory)

    # Depleted BAM -> FASTQ pair
    tools.picard.SamToFastqTool().execute(halfBam, fastq1, fastq2)

    # Find BLAST hits against FASTQ2 (which is already smaller than before)
    _write_hit_ids(fastq2, '2')

    # Deplete BAM of hits against FASTQ2
    tools.picard.FilterSamReadsTool().execute(halfBam, True, blast_hits, outBam, JVMmemory=JVMmemory)

    # Clean up
    for fn in (fasta, blast_hits, halfBam):
        os.unlink(fn)
def deplete_blastn(inFastq, outFastq, refDbs, threads=1, chunkSize=1000000):
    """Use blastn to remove reads that match at least one of the databases.

    Args:
        inFastq: path of the FASTQ file to deplete.
        outFastq: output path handed to no_blast_hits.
        refDbs: iterable of blastn databases; each one is searched in turn.
        threads: passed through to blastn_chunked_fasta.
        chunkSize: passed through to blastn_chunked_fasta.
    """
    import shutil

    # Convert to fasta
    inFasta = mkstempfname('.fasta')
    read_utils.fastq_to_fasta(inFastq, inFasta)

    # Run blastn using each of the databases in turn
    blastOutFiles = []
    for db in refDbs:
        log.info("running blastn on %s against %s", inFastq, db)
        blastOutFiles += blastn_chunked_fasta(inFasta, db, chunkSize, threads)
    os.unlink(inFasta)

    # Combine results from different databases.  This is done in-process
    # rather than by spawning `cat`: with an empty refDbs a bare `cat` would
    # read from stdin and block.
    blastOutCombined = mkstempfname('.txt')
    log.debug("concatenating %d blast output file(s) into %s", len(blastOutFiles), blastOutCombined)
    with open(blastOutCombined, 'wb') as outf:
        for blastOutFile in blastOutFiles:
            with open(blastOutFile, 'rb') as inf:
                shutil.copyfileobj(inf, outf)
            # the per-database output is no longer needed once merged
            os.unlink(blastOutFile)

    # Extract reads with no blast hits
    no_blast_hits(blastOutCombined, inFastq, outFastq)
    os.unlink(blastOutCombined)
def deplete_blastn_bam(inBam, db, outBam, threads, chunkSize=1000000, JVMmemory=None):
    """Use blastn to remove reads that match at least one of the databases.

    Args:
        inBam: path of the input BAM file.
        db: blastn database to search against.
        outBam: path of the depleted BAM file to write.
        threads: passed through to blastn_chunked_fasta.
        chunkSize: passed through to blastn_chunked_fasta.
        JVMmemory: passed through to the Picard FilterSamReads invocation.
    """
    fastq1 = mkstempfname('.1.fastq')
    fasta = mkstempfname('.1.fasta')
    blast_hits = mkstempfname('.blast_hits.txt')
    # NOTE: the original also reserved an extra '.hits.txt' temp name that was
    # immediately shadowed by the loop variable and never removed; it is gone.

    # Initial BAM -> FASTQ
    tools.samtools.SamtoolsTool().bam2fq(inBam, fastq1)

    # Find BLAST hits
    read_utils.fastq_to_fasta(fastq1, fasta)
    os.unlink(fastq1)
    log.info("running blastn on %s against %s", inBam, db)
    blastOutFiles = blastn_chunked_fasta(fasta, db, chunkSize, threads)
    with open(blast_hits, 'wt') as outf:
        for blastOutFile in blastOutFiles:
            with open(blastOutFile, 'rt') as inf:
                for line in inf:
                    # first tab-separated column is the query (read) ID
                    idVal = line.split('\t')[0].strip()
                    # drop a trailing mate suffix so the ID names the pair
                    if idVal.endswith(('/1', '/2')):
                        idVal = idVal[:-2]
                    outf.write(idVal + '\n')
            os.unlink(blastOutFile)
    os.unlink(fasta)

    # Deplete BAM of hits
    tools.picard.FilterSamReadsTool().execute(inBam, True, blast_hits, outBam, JVMmemory=JVMmemory)
    os.unlink(blast_hits)
def deplete_blastn(inFastq, outFastq, refDbs):
    """Use blastn to remove reads that match at least one of the databases.

    Args:
        inFastq: path of the FASTQ file to deplete.
        outFastq: path the reads without blast hits are written to (opened
            through util.file.open_or_gzopen).
        refDbs: sequence of blastn databases; each one is searched in turn.

    Raises:
        subprocess.CalledProcessError: if blastn or noBlastHits_v3.py exits
            non-zero.
    """
    import shutil

    # Get tools
    blastnPath = tools.blast.BlastnTool().install_and_get_path()
    noBlastHits_v3Path = os.path.join(util.file.get_scripts_path(), 'noBlastHits_v3.py')

    # Convert to fasta
    inFasta = mkstempfname('.fasta')
    read_utils.fastq_to_fasta(inFastq, inFasta)

    # Run blastn using each of the databases in turn
    blastOutFiles = [mkstempfname() for db in refDbs]
    for db, blastOutFile in zip(refDbs, blastOutFiles):
        log.info("running blastn on %s against %s", inFastq, db)
        blastnCmd = [blastnPath, '-db', db, '-word_size', '16', '-evalue', '1e-6',
                     '-outfmt', '6', '-num_descriptions', '2', '-num_alignments', '2',
                     '-query', inFasta, '-out', blastOutFile]
        log.debug(' '.join(blastnCmd))
        subprocess.check_call(blastnCmd)
    os.unlink(inFasta)

    # Combine results from different databases.  This is done in-process
    # rather than by spawning `cat`: with an empty refDbs a bare `cat` would
    # read from stdin and block.
    blastOutCombined = mkstempfname('.txt')
    log.debug("concatenating %d blast output file(s) into %s", len(blastOutFiles), blastOutCombined)
    with open(blastOutCombined, 'wb') as outf:
        for blastOutFile in blastOutFiles:
            with open(blastOutFile, 'rb') as inf:
                shutil.copyfileobj(inf, outf)
            # the per-database output is no longer needed once merged
            os.unlink(blastOutFile)

    # Run noBlastHits_v3.py to extract reads with no blast hits
    # TODO: slurp the small amount of code in this script into here
    noBlastHitsCmd = ['python', noBlastHits_v3Path, '-b', blastOutCombined, '-r', inFastq, '-m', 'nohit']
    log.debug(' '.join(noBlastHitsCmd) + '> ' + outFastq)
    with util.file.open_or_gzopen(outFastq, 'wt') as outf:
        subprocess.check_call(noBlastHitsCmd, stdout=outf)
    os.unlink(blastOutCombined)
def deplete_blastn_bam(inBam, db, outBam):
    """Use blastn to remove reads that match at least one of the databases.

    The BAM is depleted in two passes: first the reads whose mate-1 sequence
    hits ``db`` are filtered out, then the same is done for mate 2 on the
    already-reduced BAM.

    Args:
        inBam: path of the input BAM file.
        db: blastn database to search against.
        outBam: path of the depleted BAM file to write.

    Raises:
        subprocess.CalledProcessError: if blastn exits non-zero.
    """
    blastnPath = tools.blast.BlastnTool().install_and_get_path()
    fastq1 = mkstempfname('.1.fastq')
    fastq2 = mkstempfname('.2.fastq')
    fasta = mkstempfname('.1.fasta')
    blast_hits = mkstempfname('.blast_hits.txt')
    halfBam = mkstempfname('.half.bam')
    blastOutFile = mkstempfname('.hits.txt')

    def _write_hit_ids(queryFastq, mate):
        """Blast queryFastq against db and write the hit read IDs to blast_hits."""
        read_utils.fastq_to_fasta(queryFastq, fasta)
        # both FASTQ files are disposable once the FASTA has been written
        os.unlink(fastq1)
        os.unlink(fastq2)
        log.info("running blastn on %s pair %s against %s", inBam, mate, db)
        blastnCmd = [blastnPath, '-db', db, '-word_size', '16', '-evalue', '1e-6',
                     '-outfmt', '6', '-num_descriptions', '2', '-num_alignments', '2',
                     '-query', fasta, '-out', blastOutFile]
        log.debug(' '.join(blastnCmd))
        subprocess.check_call(blastnCmd)
        with open(blast_hits, 'wt') as outf:
            with open(blastOutFile, 'rt') as inf:
                for line in inf:
                    # first tab-separated column is the query (read) ID
                    idVal = line.split('\t')[0].strip()
                    # drop a trailing mate suffix so the ID names the pair
                    if idVal.endswith(('/1', '/2')):
                        idVal = idVal[:-2]
                    outf.write(idVal + '\n')
        os.unlink(blastOutFile)

    # Initial BAM -> FASTQ pair
    tools.picard.SamToFastqTool().execute(inBam, fastq1, fastq2)

    # Find BLAST hits against FASTQ1
    _write_hit_ids(fastq1, '1')

    # Deplete BAM of hits in FASTQ1
    tools.picard.FilterSamReadsTool().execute(inBam, True, blast_hits, halfBam)

    # Depleted BAM -> FASTQ pair
    tools.picard.SamToFastqTool().execute(halfBam, fastq1, fastq2)

    # Find BLAST hits against FASTQ2 (which is already smaller than before)
    _write_hit_ids(fastq2, '2')

    # Deplete BAM of hits against FASTQ2
    tools.picard.FilterSamReadsTool().execute(halfBam, True, blast_hits, outBam)

    # Clean up.  An explicit loop is required: on Python 3 a bare
    # map(os.unlink, ...) is lazy and would never delete anything.
    for fn in (fasta, blast_hits, halfBam):
        os.unlink(fn)