def deplete_blastn(inFastq, outFastq, refDbs) :
    """Use blastn to remove reads that match at least one of the databases.

    inFastq is converted to FASTA and searched against every entry of
    refDbs via blastn_chunked_fasta.  The resulting hit tables are
    concatenated and handed to the noBlastHits_v3.py script (mode 'nohit'),
    whose standard output is written to outFastq.

    Args:
        inFastq: FASTQ file to deplete.
        outFastq: destination of the script output; opened with
            util.file.open_or_gzopen.
        refDbs: iterable of databases, each passed in turn to
            blastn_chunked_fasta.  May be empty, in which case the combined
            hit table handed to the script is empty.

    Raises:
        subprocess.CalledProcessError: if noBlastHits_v3.py exits with a
            non-zero status.
    """
    import shutil

    ## Get tools
    noBlastHits_v3Path = os.path.join(util.file.get_scripts_path(),
                                      'noBlastHits_v3.py')

    inFasta = mkstempfname('.fasta')
    blastOutCombined = mkstempfname('.txt')
    blastOutFiles = []
    try:
        ## Convert to fasta
        read_utils.fastq_to_fasta(inFastq, inFasta)

        ## Run blastn using each of the databases in turn
        for db in refDbs :
            log.info("running blastn on %s against %s", inFastq, db)
            blastOutFiles += blastn_chunked_fasta(inFasta, db)

        ## Combine results from different databases
        # Done in-process rather than through `cat`: with an empty file list
        # `cat` would sit waiting on stdin, and it is not available on every
        # platform.
        log.debug("concatenating %d blastn output file(s) into %s",
                  len(blastOutFiles), blastOutCombined)
        with open(blastOutCombined, 'wb') as outf:
            for blastOutFile in blastOutFiles:
                with open(blastOutFile, 'rb') as inf:
                    shutil.copyfileobj(inf, outf)

        ## run noBlastHits_v3.py to extract reads with no blast hits
        # TODO: slurp the small amount of code in this script into here
        noBlastHitsCmd = ['python', noBlastHits_v3Path, '-b', blastOutCombined,
                         '-r', inFastq, '-m', 'nohit']
        log.debug(' '.join(noBlastHitsCmd) + '> ' + outFastq)
        with util.file.open_or_gzopen(outFastq, 'wt') as outf :
            subprocess.check_call(noBlastHitsCmd, stdout = outf)
    finally:
        # The intermediate files are private to this call; remove whichever
        # of them exist, whether or not the run succeeded.
        for fn in [inFasta, blastOutCombined] + blastOutFiles:
            if os.path.exists(fn):
                os.unlink(fn)
Beispiel #2
0
def _write_blast_hit_ids(blastOutFiles, blast_hits):
    """Write the read id of every blastn hit line to blast_hits.

    The id is the first tab-separated field of each line; a trailing '/1'
    or '/2' mate suffix is stripped.  Each file in blastOutFiles is deleted
    once it has been read.
    """
    with open(blast_hits, 'wt') as outf:
        for blastOutFile in blastOutFiles:
            with open(blastOutFile, 'rt') as inf:
                for line in inf:
                    idVal = line.split('\t')[0].strip()
                    if idVal.endswith(('/1', '/2')):
                        idVal = idVal[:-2]
                    outf.write(idVal + '\n')
            os.unlink(blastOutFile)


def deplete_blastn_bam(inBam, db, outBam, threads, chunkSize=1000000, JVMmemory=None):
    """Use blastn to remove reads that match at least one of the databases.

    Works in two passes so that the second blastn run sees fewer reads:
    mate 1 of inBam is searched first and its hits are filtered out into an
    intermediate BAM, then mate 2 of that intermediate BAM is searched and
    its hits are filtered out into outBam.

    Args:
        inBam: input BAM file.
        db: database passed to blastn_chunked_fasta.
        outBam: output BAM file.
        threads: forwarded to blastn_chunked_fasta.
        chunkSize: forwarded to blastn_chunked_fasta.
        JVMmemory: forwarded to FilterSamReadsTool.execute.
    """
    fastq1 = mkstempfname('.1.fastq')
    fastq2 = mkstempfname('.2.fastq')
    fasta = mkstempfname('.1.fasta')
    blast_hits = mkstempfname('.blast_hits.txt')
    halfBam = mkstempfname('.half.bam')

    # Initial BAM -> FASTQ pair
    tools.samtools.SamtoolsTool().bam2fq(inBam, fastq1, fastq2)

    # Find BLAST hits against FASTQ1
    read_utils.fastq_to_fasta(fastq1, fasta)
    os.unlink(fastq1)
    os.unlink(fastq2)
    log.info("running blastn on %s pair 1 against %s", inBam, db)
    blastOutFiles = blastn_chunked_fasta(fasta, db, chunkSize, threads)
    _write_blast_hit_ids(blastOutFiles, blast_hits)

    # Deplete BAM of hits in FASTQ1
    # NOTE(review): the positional True presumably selects "exclude listed
    # reads" -- confirm against FilterSamReadsTool.execute.
    tools.picard.FilterSamReadsTool().execute(inBam, True, blast_hits, halfBam, JVMmemory=JVMmemory)

    # Depleted BAM -> FASTQ pair
    tools.samtools.SamtoolsTool().bam2fq(halfBam, fastq1, fastq2)

    # Find BLAST hits against FASTQ2 (which is already smaller than before)
    read_utils.fastq_to_fasta(fastq2, fasta)
    os.unlink(fastq1)
    os.unlink(fastq2)
    log.info("running blastn on %s pair 2 against %s", inBam, db)
    blastOutFiles = blastn_chunked_fasta(fasta, db, chunkSize, threads)
    _write_blast_hit_ids(blastOutFiles, blast_hits)

    # Deplete BAM of hits against FASTQ2
    tools.picard.FilterSamReadsTool().execute(halfBam, True, blast_hits, outBam, JVMmemory=JVMmemory)

    # Clean up
    for fn in (fasta, blast_hits, halfBam):
        os.unlink(fn)
Beispiel #3
0
def _write_blast_hit_ids(blastOutFiles, blast_hits):
    """Write the read id of every blastn hit line to blast_hits.

    The id is the first tab-separated field of each line; a trailing '/1'
    or '/2' mate suffix is stripped.  Each file in blastOutFiles is deleted
    once it has been read.
    """
    with open(blast_hits, 'wt') as outf:
        for blastOutFile in blastOutFiles:
            with open(blastOutFile, 'rt') as inf:
                for line in inf:
                    idVal = line.split('\t')[0].strip()
                    if idVal.endswith(('/1', '/2')):
                        idVal = idVal[:-2]
                    outf.write(idVal + '\n')
            os.unlink(blastOutFile)


def deplete_blastn_bam(inBam, db, outBam, chunkSize=1000000, JVMmemory=None):
    """Use blastn to remove reads that match at least one of the databases.

    Works in two passes so that the second blastn run sees fewer reads:
    mate 1 of inBam is searched first and its hits are filtered out into an
    intermediate BAM, then mate 2 of that intermediate BAM is searched and
    its hits are filtered out into outBam.

    Args:
        inBam: input BAM file.
        db: database passed to blastn_chunked_fasta.
        outBam: output BAM file.
        chunkSize: forwarded to blastn_chunked_fasta.
        JVMmemory: forwarded to FilterSamReadsTool.execute.
    """
    fastq1 = mkstempfname('.1.fastq')
    fastq2 = mkstempfname('.2.fastq')
    fasta = mkstempfname('.1.fasta')
    blast_hits = mkstempfname('.blast_hits.txt')
    halfBam = mkstempfname('.half.bam')

    # Initial BAM -> FASTQ pair
    tools.picard.SamToFastqTool().execute(inBam, fastq1, fastq2)

    # Find BLAST hits against FASTQ1
    read_utils.fastq_to_fasta(fastq1, fasta)
    os.unlink(fastq1)
    os.unlink(fastq2)
    log.info("running blastn on %s pair 1 against %s", inBam, db)
    blastOutFiles = blastn_chunked_fasta(fasta, db, chunkSize)
    _write_blast_hit_ids(blastOutFiles, blast_hits)

    # Deplete BAM of hits in FASTQ1
    # NOTE(review): the positional True presumably selects "exclude listed
    # reads" -- confirm against FilterSamReadsTool.execute.
    tools.picard.FilterSamReadsTool().execute(inBam, True, blast_hits, halfBam, JVMmemory=JVMmemory)

    # Depleted BAM -> FASTQ pair
    tools.picard.SamToFastqTool().execute(halfBam, fastq1, fastq2)

    # Find BLAST hits against FASTQ2 (which is already smaller than before)
    read_utils.fastq_to_fasta(fastq2, fasta)
    os.unlink(fastq1)
    os.unlink(fastq2)
    log.info("running blastn on %s pair 2 against %s", inBam, db)
    blastOutFiles = blastn_chunked_fasta(fasta, db, chunkSize)
    _write_blast_hit_ids(blastOutFiles, blast_hits)

    # Deplete BAM of hits against FASTQ2
    tools.picard.FilterSamReadsTool().execute(halfBam, True, blast_hits, outBam, JVMmemory=JVMmemory)

    # Clean up
    for fn in (fasta, blast_hits, halfBam):
        os.unlink(fn)
Beispiel #4
0
def deplete_blastn(inFastq, outFastq, refDbs, threads=1, chunkSize=1000000):
    """Use blastn to remove reads that match at least one of the databases.

    inFastq is converted to FASTA and searched against every entry of
    refDbs via blastn_chunked_fasta.  The hit tables are concatenated and
    passed, together with inFastq and outFastq, to no_blast_hits.

    Args:
        inFastq: FASTQ file to deplete.
        outFastq: output path handed to no_blast_hits.
        refDbs: iterable of databases, each passed in turn to
            blastn_chunked_fasta.  May be empty, in which case the combined
            hit table is empty.
        threads: forwarded to blastn_chunked_fasta.
        chunkSize: forwarded to blastn_chunked_fasta.
    """
    import shutil

    inFasta = mkstempfname('.fasta')
    blastOutCombined = mkstempfname('.txt')
    blastOutFiles = []
    try:
        # Convert to fasta
        read_utils.fastq_to_fasta(inFastq, inFasta)

        # Run blastn using each of the databases in turn
        for db in refDbs:
            log.info("running blastn on %s against %s", inFastq, db)
            blastOutFiles += blastn_chunked_fasta(inFasta, db, chunkSize, threads)

        # Combine results from different databases.  Done in-process rather
        # than through `cat`: with an empty file list `cat` would sit
        # waiting on stdin, and it is not available on every platform.
        log.debug("concatenating %d blastn output file(s) into %s",
                  len(blastOutFiles), blastOutCombined)
        with open(blastOutCombined, 'wb') as outf:
            for blastOutFile in blastOutFiles:
                with open(blastOutFile, 'rb') as inf:
                    shutil.copyfileobj(inf, outf)

        # extract reads with no blast hits
        no_blast_hits(blastOutCombined, inFastq, outFastq)
    finally:
        # The intermediate files are private to this call; remove whichever
        # of them exist, whether or not the run succeeded.
        for fn in [inFasta, blastOutCombined] + blastOutFiles:
            if os.path.exists(fn):
                os.unlink(fn)
Beispiel #5
0
def deplete_blastn_bam(inBam,
                       db,
                       outBam,
                       threads,
                       chunkSize=1000000,
                       JVMmemory=None):
    """Use blastn to remove reads that match at least one of the databases.

    inBam is dumped to a single FASTQ file, converted to FASTA and searched
    with blastn_chunked_fasta.  The ids of all reads with a hit are
    collected into a list file which FilterSamReadsTool then uses to
    produce outBam.

    Args:
        inBam: input BAM file.
        db: database passed to blastn_chunked_fasta.
        outBam: output BAM file.
        threads: forwarded to blastn_chunked_fasta.
        chunkSize: forwarded to blastn_chunked_fasta.
        JVMmemory: forwarded to FilterSamReadsTool.execute.
    """
    fastq1 = mkstempfname('.1.fastq')
    fasta = mkstempfname('.1.fasta')
    blast_hits = mkstempfname('.blast_hits.txt')

    # Initial BAM -> FASTQ
    tools.samtools.SamtoolsTool().bam2fq(inBam, fastq1)

    # Find BLAST hits
    read_utils.fastq_to_fasta(fastq1, fasta)
    os.unlink(fastq1)
    log.info("running blastn on %s against %s", inBam, db)
    blastOutFiles = blastn_chunked_fasta(fasta, db, chunkSize, threads)
    with open(blast_hits, 'wt') as outf:
        for blastOutFile in blastOutFiles:
            with open(blastOutFile, 'rt') as inf:
                for line in inf:
                    # The read id is the first tab-separated field; drop a
                    # trailing mate suffix so both mates share one id.
                    idVal = line.split('\t')[0].strip()
                    if idVal.endswith(('/1', '/2')):
                        idVal = idVal[:-2]
                    outf.write(idVal + '\n')
            os.unlink(blastOutFile)
    os.unlink(fasta)

    # Deplete BAM of hits
    # NOTE(review): the positional True presumably selects "exclude listed
    # reads" -- confirm against FilterSamReadsTool.execute.
    tools.picard.FilterSamReadsTool().execute(inBam,
                                              True,
                                              blast_hits,
                                              outBam,
                                              JVMmemory=JVMmemory)
    os.unlink(blast_hits)
Beispiel #6
0
def deplete_blastn(inFastq, outFastq, refDbs, threads=1, chunkSize=1000000):
    """Use blastn to remove reads that match at least one of the databases.

    inFastq is converted to FASTA and searched against every entry of
    refDbs via blastn_chunked_fasta.  The hit tables are concatenated and
    passed, together with inFastq and outFastq, to no_blast_hits.

    Args:
        inFastq: FASTQ file to deplete.
        outFastq: output path handed to no_blast_hits.
        refDbs: iterable of databases, each passed in turn to
            blastn_chunked_fasta.  May be empty, in which case the combined
            hit table is empty.
        threads: forwarded to blastn_chunked_fasta.
        chunkSize: forwarded to blastn_chunked_fasta.
    """
    import shutil

    inFasta = mkstempfname('.fasta')
    blastOutCombined = mkstempfname('.txt')
    blastOutFiles = []
    try:
        # Convert to fasta
        read_utils.fastq_to_fasta(inFastq, inFasta)

        # Run blastn using each of the databases in turn
        for db in refDbs:
            log.info("running blastn on %s against %s", inFastq, db)
            blastOutFiles += blastn_chunked_fasta(inFasta, db, chunkSize, threads)

        # Combine results from different databases.  Done in-process rather
        # than through `cat`: with an empty file list `cat` would sit
        # waiting on stdin, and it is not available on every platform.
        log.debug("concatenating %d blastn output file(s) into %s",
                  len(blastOutFiles), blastOutCombined)
        with open(blastOutCombined, 'wb') as outf:
            for blastOutFile in blastOutFiles:
                with open(blastOutFile, 'rb') as inf:
                    shutil.copyfileobj(inf, outf)

        # extract reads with no blast hits
        no_blast_hits(blastOutCombined, inFastq, outFastq)
    finally:
        # The intermediate files are private to this call; remove whichever
        # of them exist, whether or not the run succeeded.
        for fn in [inFasta, blastOutCombined] + blastOutFiles:
            if os.path.exists(fn):
                os.unlink(fn)
Beispiel #7
0
def deplete_blastn(inFastq, outFastq, refDbs) :
    """Use blastn to remove reads that match at least one of the databases.

    inFastq is converted to FASTA and blastn is run once per entry of
    refDbs.  The tabular hit files are concatenated and handed to the
    noBlastHits_v3.py script (mode 'nohit'), whose standard output is
    written to outFastq.

    Args:
        inFastq: FASTQ file to deplete.
        outFastq: destination of the script output; opened with
            util.file.open_or_gzopen.
        refDbs: sequence of blastn database names, each passed as '-db'.
            May be empty, in which case the combined hit table handed to
            the script is empty.

    Raises:
        subprocess.CalledProcessError: if blastn or noBlastHits_v3.py exits
            with a non-zero status.
    """
    import shutil

    ## Get tools
    blastnPath = tools.blast.BlastnTool().install_and_get_path()
    noBlastHits_v3Path = os.path.join(util.file.get_scripts_path(),
                                      'noBlastHits_v3.py')

    inFasta = mkstempfname('.fasta')
    blastOutCombined = mkstempfname('.txt')
    blastOutFiles = [mkstempfname() for _ in refDbs]
    try:
        ## Convert to fasta
        read_utils.fastq_to_fasta(inFastq, inFasta)

        ## Run blastn using each of the databases in turn
        for db, blastOutFile in zip(refDbs, blastOutFiles) :
            log.info("running blastn on %s against %s", inFastq, db)
            blastnCmd = [blastnPath, '-db', db,
                        '-word_size', '16', '-evalue', '1e-6', '-outfmt', '6',
                        '-num_descriptions', '2', '-num_alignments', '2',
                        '-query', inFasta, '-out', blastOutFile]
            log.debug(' '.join(blastnCmd))
            subprocess.check_call(blastnCmd)

        ## Combine results from different databases
        # Done in-process rather than through `cat`: with an empty file list
        # `cat` would sit waiting on stdin, and it is not available on every
        # platform.
        log.debug("concatenating %d blastn output file(s) into %s",
                  len(blastOutFiles), blastOutCombined)
        with open(blastOutCombined, 'wb') as outf:
            for blastOutFile in blastOutFiles:
                with open(blastOutFile, 'rb') as inf:
                    shutil.copyfileobj(inf, outf)

        ## run noBlastHits_v3.py to extract reads with no blast hits
        # TODO: slurp the small amount of code in this script into here
        noBlastHitsCmd = ['python', noBlastHits_v3Path, '-b', blastOutCombined,
                         '-r', inFastq, '-m', 'nohit']
        log.debug(' '.join(noBlastHitsCmd) + '> ' + outFastq)
        with util.file.open_or_gzopen(outFastq, 'wt') as outf :
            subprocess.check_call(noBlastHitsCmd, stdout = outf)
    finally:
        # The intermediate files are private to this call; remove whichever
        # of them exist, whether or not the run succeeded.
        for fn in [inFasta, blastOutCombined] + blastOutFiles:
            if os.path.exists(fn):
                os.unlink(fn)
Beispiel #8
0
def _blastn_read_ids(blastnPath, db, fasta, blastOutFile, blast_hits):
    """Run blastn on fasta against db and write the hit read ids to blast_hits.

    blastn writes tabular output ('-outfmt 6') to blastOutFile; the first
    tab-separated field of every line is taken as the read id, a trailing
    '/1' or '/2' mate suffix is stripped, and the id is written on its own
    line.  blastOutFile is deleted afterwards.

    Raises:
        subprocess.CalledProcessError: if blastn exits with a non-zero status.
    """
    blastnCmd = [blastnPath, '-db', db,
                '-word_size', '16', '-evalue', '1e-6', '-outfmt', '6',
                '-num_descriptions', '2', '-num_alignments', '2',
                '-query', fasta, '-out', blastOutFile]
    log.debug(' '.join(blastnCmd))
    subprocess.check_call(blastnCmd)
    with open(blast_hits, 'wt') as outf:
        with open(blastOutFile, 'rt') as inf:
            for line in inf:
                idVal = line.split('\t')[0].strip()
                if idVal.endswith(('/1', '/2')):
                    idVal = idVal[:-2]
                outf.write(idVal + '\n')
    os.unlink(blastOutFile)


def deplete_blastn_bam(inBam, db, outBam):
    """Use blastn to remove reads that match at least one of the databases.

    Works in two passes so that the second blastn run sees fewer reads:
    mate 1 of inBam is searched first and its hits are filtered out into an
    intermediate BAM, then mate 2 of that intermediate BAM is searched and
    its hits are filtered out into outBam.

    Args:
        inBam: input BAM file.
        db: blastn database name, passed as '-db'.
        outBam: output BAM file.

    Raises:
        subprocess.CalledProcessError: if blastn exits with a non-zero status.
    """
    blastnPath = tools.blast.BlastnTool().install_and_get_path()
    fastq1 = mkstempfname('.1.fastq')
    fastq2 = mkstempfname('.2.fastq')
    fasta = mkstempfname('.1.fasta')
    blast_hits = mkstempfname('.blast_hits.txt')
    halfBam = mkstempfname('.half.bam')
    blastOutFile = mkstempfname('.hits.txt')

    # Initial BAM -> FASTQ pair
    tools.picard.SamToFastqTool().execute(inBam, fastq1, fastq2)

    # Find BLAST hits against FASTQ1
    read_utils.fastq_to_fasta(fastq1, fasta)
    os.unlink(fastq1)
    os.unlink(fastq2)
    log.info("running blastn on %s pair 1 against %s", inBam, db)
    _blastn_read_ids(blastnPath, db, fasta, blastOutFile, blast_hits)

    # Deplete BAM of hits in FASTQ1
    # NOTE(review): the positional True presumably selects "exclude listed
    # reads" -- confirm against FilterSamReadsTool.execute.
    tools.picard.FilterSamReadsTool().execute(inBam, True, blast_hits, halfBam)

    # Depleted BAM -> FASTQ pair
    tools.picard.SamToFastqTool().execute(halfBam, fastq1, fastq2)

    # Find BLAST hits against FASTQ2 (which is already smaller than before)
    read_utils.fastq_to_fasta(fastq2, fasta)
    os.unlink(fastq1)
    os.unlink(fastq2)
    log.info("running blastn on %s pair 2 against %s", inBam, db)
    _blastn_read_ids(blastnPath, db, fasta, blastOutFile, blast_hits)

    # Deplete BAM of hits against FASTQ2
    tools.picard.FilterSamReadsTool().execute(halfBam, True, blast_hits, outBam)

    # Clean up.  An explicit loop is required: on Python 3 map() is lazy, so
    # `map(os.unlink, ...)` without consuming the result deletes nothing.
    for fn in (fasta, blast_hits, halfBam):
        os.unlink(fn)