Exemple #1
0
    def classify(self, inBam, db, outReads, numThreads=None):
        """Classify input reads (bam)

        The BAM is first converted to two FASTQ files with Picard SamToFastq
        and those are passed to ``kraken`` with ``--paired``.

        Args:
          inBam: unaligned reads
          db: Kraken built database directory.
          outReads: Output file of command.
          numThreads: Requested thread count. The value is capped at the
            number of available CPUs; None requests all of them.
        """
        if tools.samtools.SamtoolsTool().isEmpty(inBam):
            # kraken cannot deal with empty input, so produce an empty output
            # file instead. It must be opened for writing: a read-mode open
            # does not create the file and fails when it does not exist yet.
            with open(outReads, 'wt'):
                pass
            return

        tmp_fastq1 = util.file.mkstempfname('.1.fastq')
        tmp_fastq2 = util.file.mkstempfname('.2.fastq')
        try:
            # do not convert this to samtools bam2fq unless we can figure out how to replicate
            # the clipping functionality of Picard SamToFastq
            picard = tools.picard.SamToFastqTool()
            picard_opts = {
                'CLIPPING_ATTRIBUTE': tools.picard.SamToFastqTool.illumina_clipping_attribute,
                'CLIPPING_ACTION': 'X'
            }
            picard.execute(inBam, tmp_fastq1, tmp_fastq2,
                           picardOptions=tools.picard.PicardTools.dict_to_picard_opts(picard_opts),
                           JVMmemory=picard.jvmMemDefault)

            # never ask kraken for more threads than there are CPUs
            cpu_count = util.misc.available_cpu_count()
            if numThreads is None:
                threads = cpu_count
            else:
                threads = min(int(numThreads), cpu_count)
            opts = {
                '--paired': None,
                '--threads': threads,
            }
            self.execute('kraken', db, outReads, args=[tmp_fastq1, tmp_fastq2], options=opts)
        finally:
            # remove the intermediate FASTQ files even when a tool failed
            for tmp_path in (tmp_fastq1, tmp_fastq2):
                if os.path.exists(tmp_path):
                    os.unlink(tmp_path)
Exemple #2
0
def diamond(inBam, db, taxDb, outReport, outM8=None, outLca=None, numThreads=1):
    '''
        Classify reads by the taxon of the Lowest Common Ancestor (LCA).

        The reads of inBam are written to two FASTQ files with Picard
        SamToFastq, aligned with DIAMOND blastx against db and converted to
        a tabular (m8) file. blast_lca then turns the m8 hits into an LCA
        table (paired mode, minimum bit score 50) using the taxonomy loaded
        from taxDb, and the lines yielded by kraken_dfs_report are written
        to outReport.

        outM8 and outLca are optional; when given they receive
        gzip-compressed copies of the m8 file and of the LCA table.
    '''
    fastq_r1 = util.file.mkstempfname('.fastq')
    fastq_r2 = util.file.mkstempfname('.fastq')

    # do not convert this to samtools bam2fq unless we can figure out how to replicate
    # the clipping functionality of Picard SamToFastq
    sam_to_fastq = tools.picard.SamToFastqTool()
    clipping = {
        'CLIPPING_ATTRIBUTE': tools.picard.SamToFastqTool.illumina_clipping_attribute,
        'CLIPPING_ACTION': 'X',
    }
    sam_to_fastq.execute(
        inBam, fastq_r1, fastq_r2,
        picardOptions=tools.picard.PicardTools.dict_to_picard_opts(clipping),
        JVMmemory=sam_to_fastq.jvmMemDefault)

    aligner = tools.diamond.Diamond()
    aligner.install()
    daa_path = util.file.mkstempfname('.daa')
    m8_path = util.file.mkstempfname('.diamond.m8')
    aligner.blastx(db, [fastq_r1, fastq_r2], daa_path,
                   options={'--threads': numThreads})
    aligner.view(daa_path, m8_path, options={'--threads': numThreads})

    # optional gzip-compressed copy of the raw tabular alignments
    if outM8:
        with open(m8_path, 'rb') as raw, gzip.open(outM8, 'wb') as packed:
            shutil.copyfileobj(raw, packed)

    taxonomy = TaxonomyDb(tax_dir=taxDb, load_names=True, load_nodes=True, load_gis=True)
    lca_path = util.file.mkstempfname('.tsv')
    with open(m8_path) as hits_in:
        with open(lca_path, 'w') as lca_out:
            blast_lca(taxonomy, hits_in, lca_out, paired=True, min_bit_score=50)

    # optional gzip-compressed copy of the LCA table
    if outLca:
        with open(lca_path, 'rb') as raw, gzip.open(outLca, 'wb') as packed:
            shutil.copyfileobj(raw, packed)

    with open(lca_path) as lca_in:
        taxon_hits = taxa_hits_from_tsv(lca_in)
    with open(outReport, 'w') as report:
        for report_line in kraken_dfs_report(taxonomy, taxon_hits):
            print(report_line, file=report)
Exemple #3
0
    def pipeline(self,
                 inBam,
                 db,
                 outReport=None,
                 outReads=None,
                 filterThreshold=None,
                 numThreads=None):
        """Stream a BAM through kraken and its post-processing tools.

        Picard SamToFastq is started in the background and writes the two
        mates to named pipes; a single bash pipeline reads those pipes with
        ``kraken --paired`` and optionally chains ``tee``/``gzip``,
        ``kraken-filter`` and ``kraken-report``.

        Args:
          inBam: unaligned reads
          db: Kraken database, exported to the shell as KRAKEN_DEFAULT_DB.
          outReport: if given, kraken-report output is redirected to this file.
          outReads: if given, the kraken output is additionally written,
            gzip-compressed, to this file.
          filterThreshold: if given, kraken-filter is run with this
            ``--threshold`` value.
          numThreads: Requested thread count. The value is capped at the
            number of available CPUs; None requests all of them.

        Raises:
          subprocess.CalledProcessError: if the shell pipeline exits non-zero.
        """

        with util.file.fifo(2) as (fastq1_pipe, fastq2_pipe):
            # do not convert this to samtools bam2fq unless we can figure out how to replicate
            # the clipping functionality of Picard SamToFastq
            picard = tools.picard.SamToFastqTool()
            picard_opts = {
                'CLIPPING_ATTRIBUTE':
                tools.picard.SamToFastqTool.illumina_clipping_attribute,
                'CLIPPING_ACTION': 'X'
            }
            picard.execute(
                inBam,
                fastq1_pipe,
                fastq2_pipe,
                picardOptions=tools.picard.PicardTools.dict_to_picard_opts(
                    picard_opts),
                JVMmemory=picard.jvmMemDefault,
                background=True)

            # Cap the thread count at the number of CPUs. The capped value
            # (not the raw argument) must be the one placed on the command
            # line, otherwise numThreads=None would request an absurd count.
            cpu_count = util.misc.available_cpu_count()
            if numThreads is None:
                threads = cpu_count
            else:
                threads = min(int(numThreads), cpu_count)

            kraken_bin = os.path.join(self.libexec, 'kraken')
            # The fd juggling swaps stdout and stderr around sed so that the
            # "Processed N sequences" progress lines are dropped from stderr
            # while the classification output stays on stdout.
            cmd = '''export KRAKEN_DEFAULT_DB={kraken_db}; {{ {kraken} --paired --fastq-input --threads {threads} {fastq1} {fastq2} 2>&1 1>&3 3>&- | sed '/Processed [0-9]* sequences/d'; }} \
            3>&1 1>&2'''.format(kraken_db=db,
                                kraken=kraken_bin,
                                threads=threads,
                                fastq1=fastq1_pipe,
                                fastq2=fastq2_pipe)

            if outReads is not None:
                # process substitution: requires bash, see executable below
                cmd += '| tee >(gzip > {kraken_reads})'.format(
                    kraken_reads=outReads)

            if filterThreshold is not None:

                kraken_filter_bin = os.path.join(self.libexec, 'kraken-filter')
                cmd += '| {kraken_filter} --threshold {filterThreshold}'.format(
                    kraken_filter=kraken_filter_bin,
                    filterThreshold=filterThreshold)

            if outReport is not None:
                kraken_report_bin = os.path.join(self.libexec, 'kraken-report')
                cmd += '| {kraken_report} > {outReport}'.format(
                    kraken_report=kraken_report_bin, outReport=outReport)
            # NOTE(review): paths are interpolated unquoted into a shell
            # string; callers must not pass untrusted or space-containing paths.
            subprocess.check_call(cmd, shell=True, executable='/bin/bash')
Exemple #4
0
def kraken(inBam, db, outReport=None, outReads=None, filterThreshold=None, numThreads=1):
    '''
        Classify the reads of a BAM file with kraken.

        The BAM is converted to two FASTQ files with Picard SamToFastq and
        classified in paired mode. When filterThreshold is set the result is
        passed through kraken-filter. The (possibly filtered) reads are
        gzip-compressed to outReads and/or summarised by kraken-report into
        outReport; at least one of the two must be requested.
    '''
    assert outReads or outReport, (
        'Either --outReads or --outReport must be specified.')

    fastq_r1 = util.file.mkstempfname('.1.fastq')
    fastq_r2 = util.file.mkstempfname('.2.fastq')
    sam_to_fastq = tools.picard.SamToFastqTool()
    clipping = {
        'CLIPPING_ATTRIBUTE': tools.picard.SamToFastqTool.illumina_clipping_attribute,
        'CLIPPING_ACTION': 'X',
    }
    sam_to_fastq.execute(
        inBam, fastq_r1, fastq_r2,
        picardOptions=tools.picard.PicardTools.dict_to_picard_opts(clipping),
        JVMmemory=sam_to_fastq.jvmMemDefault)

    classifier = tools.kraken.Kraken()
    raw_reads = util.file.mkstempfname('.kraken')
    classify_opts = {
        '--paired': None,
        '--threads': min(int(numThreads), util.misc.available_cpu_count()),
    }
    # Could be optimized in 3.5 piping directly to kraken-filter.
    classifier.classify(db, [fastq_r1, fastq_r2], raw_reads, options=classify_opts)

    # without a threshold the unfiltered classification is used downstream
    final_reads = raw_reads
    if filterThreshold:
        filter_opts = {
            '--threshold': filterThreshold,
        }
        final_reads = util.file.mkstempfname('.filtered-kraken')
        classifier.execute('kraken-filter', db, final_reads,
                           args=[raw_reads], options=filter_opts)

    if outReads:
        with open(final_reads, 'rb') as plain, gzip.open(outReads, 'wb') as packed:
            shutil.copyfileobj(plain, packed)

    if outReport:
        classifier.execute('kraken-report', db, outReport, args=[final_reads])
Exemple #5
0
    def classify(self, inBam, db, outReads, numThreads=None):
        """Classify input reads (bam)

        The BAM is first converted to two gzip-compressed FASTQ files with
        Picard SamToFastq. kraken is run in paired mode unless the second
        FASTQ file is (nearly) empty.

        Args:
          inBam: unaligned reads
          db: Kraken built database directory.
          outReads: Output file of command.
          numThreads: Requested thread count. The value is capped at the
            number of available CPUs; None requests all of them.
        """
        if tools.samtools.SamtoolsTool().isEmpty(inBam):
            # kraken cannot deal with empty input, so produce an empty output
            # file instead. It must be opened for writing: a read-mode open
            # does not create the file and fails when it does not exist yet.
            with open(outReads, 'wt'):
                pass
            return

        tmp_fastq1 = util.file.mkstempfname('.1.fastq.gz')
        tmp_fastq2 = util.file.mkstempfname('.2.fastq.gz')
        try:
            # do not convert this to samtools bam2fq unless we can figure out how to replicate
            # the clipping functionality of Picard SamToFastq
            picard = tools.picard.SamToFastqTool()
            picard_opts = {
                'CLIPPING_ATTRIBUTE':
                tools.picard.SamToFastqTool.illumina_clipping_attribute,
                'CLIPPING_ACTION': 'X'
            }
            picard.execute(
                inBam,
                tmp_fastq1,
                tmp_fastq2,
                picardOptions=tools.picard.PicardTools.dict_to_picard_opts(
                    picard_opts),
                JVMmemory=picard.jvmMemDefault)

            # never ask kraken for more threads than there are CPUs
            cpu_count = util.misc.available_cpu_count()
            if numThreads is None:
                threads = cpu_count
            else:
                threads = min(int(numThreads), cpu_count)
            opts = {
                '--threads': threads,
                '--fastq-input': None,
                '--gzip-compressed': None,
            }

            # A second FASTQ below 50 bytes is treated as "no mate reads"
            # (presumably just an empty gzip container), i.e. single-end input.
            fastqs = [tmp_fastq1]
            if os.path.getsize(tmp_fastq2) >= 50:
                opts['--paired'] = None
                fastqs.append(tmp_fastq2)
            self.execute('kraken', db, outReads, args=fastqs, options=opts)
        finally:
            # remove the intermediate FASTQ files even when a tool failed
            for tmp_path in (tmp_fastq1, tmp_fastq2):
                if os.path.exists(tmp_path):
                    os.unlink(tmp_path)
Exemple #6
0
def diamond(inBam, db, taxDb, outReport, outM8=None, outLca=None, numThreads=1):
    '''
        Align the reads of inBam with DIAMOND blastx and report taxon hits.

        Steps: Picard SamToFastq writes two FASTQ files, DIAMOND blastx/view
        produce a tabular (m8) file, blast_lca converts it to an LCA table
        (paired mode, minimum bit score 50) using the taxonomy in taxDb, and
        the lines yielded by kraken_dfs_report are written to outReport.
        outM8 / outLca, when given, receive gzip-compressed copies of the m8
        file and of the LCA table.
    '''

    def _gzip_copy(src_path, dest_path):
        # write a gzip-compressed copy of src_path to dest_path
        with open(src_path, 'rb') as src:
            with gzip.open(dest_path, 'wb') as dest:
                shutil.copyfileobj(src, dest)

    mate1_fastq = util.file.mkstempfname('.fastq')
    mate2_fastq = util.file.mkstempfname('.fastq')
    converter = tools.picard.SamToFastqTool()
    converter_opts = {
        'CLIPPING_ATTRIBUTE': tools.picard.SamToFastqTool.illumina_clipping_attribute,
        'CLIPPING_ACTION': 'X',
    }
    converter.execute(
        inBam,
        mate1_fastq,
        mate2_fastq,
        picardOptions=tools.picard.PicardTools.dict_to_picard_opts(converter_opts),
        JVMmemory=converter.jvmMemDefault)

    dmnd = tools.diamond.Diamond()
    dmnd.install()
    alignment_daa = util.file.mkstempfname('.daa')
    alignment_m8 = util.file.mkstempfname('.diamond.m8')
    dmnd.blastx(db, [mate1_fastq, mate2_fastq], alignment_daa,
                options={'--threads': numThreads})
    dmnd.view(alignment_daa, alignment_m8, options={'--threads': numThreads})

    if outM8:
        _gzip_copy(alignment_m8, outM8)

    taxonomy = TaxonomyDb(tax_dir=taxDb)
    lca_tsv = util.file.mkstempfname('.tsv')
    with open(alignment_m8) as m8_in, open(lca_tsv, 'w') as lca_out:
        blast_lca(taxonomy, m8_in, lca_out, paired=True, min_bit_score=50)

    if outLca:
        _gzip_copy(lca_tsv, outLca)

    with open(lca_tsv) as lca_in:
        taxon_hits = taxa_hits_from_tsv(lca_in)
    with open(outReport, 'w') as report_out:
        for entry in kraken_dfs_report(taxonomy, taxon_hits):
            print(entry, file=report_out)
Exemple #7
0
def diamond(inBam, db, taxDb, outReport, outM8=None, outLca=None, numThreads=1):
    """
        Classify reads by the taxon of the Lowest Common Ancestor (LCA).

        inBam is converted to two FASTQ files (Picard SamToFastq), aligned
        with DIAMOND blastx against db and dumped as a tabular m8 file.
        blast_lca (paired mode, minimum bit score 50) derives the LCA table
        from it with the taxonomy loaded from taxDb, and the lines yielded
        by kraken_dfs_report go to outReport. Optional outM8 / outLca get
        gzip-compressed copies of the m8 file and the LCA table.
    """
    first_fastq = util.file.mkstempfname('.fastq')
    second_fastq = util.file.mkstempfname('.fastq')

    # do not convert this to samtools bam2fq unless we can figure out how to replicate
    # the clipping functionality of Picard SamToFastq
    bam_to_fastq = tools.picard.SamToFastqTool()
    clip_settings = {
        'CLIPPING_ATTRIBUTE': tools.picard.SamToFastqTool.illumina_clipping_attribute,
        'CLIPPING_ACTION': 'X',
    }
    bam_to_fastq.execute(inBam, first_fastq, second_fastq,
                         picardOptions=tools.picard.PicardTools.dict_to_picard_opts(clip_settings),
                         JVMmemory=bam_to_fastq.jvmMemDefault)

    diamond_runner = tools.diamond.Diamond()
    diamond_runner.install()
    daa_file = util.file.mkstempfname('.daa')
    m8_file = util.file.mkstempfname('.diamond.m8')
    diamond_runner.blastx(db, [first_fastq, second_fastq], daa_file,
                          options={'--threads': numThreads})
    diamond_runner.view(daa_file, m8_file, options={'--threads': numThreads})

    # keep a compressed copy of the tabular alignments if requested
    if outM8:
        with open(m8_file, 'rb') as plain, gzip.open(outM8, 'wb') as zipped:
            shutil.copyfileobj(plain, zipped)

    taxonomy_db = TaxonomyDb(tax_dir=taxDb,
                             load_names=True,
                             load_nodes=True,
                             load_gis=True)
    lca_file = util.file.mkstempfname('.tsv')
    with open(m8_file) as m8_handle:
        with open(lca_file, 'w') as lca_handle:
            blast_lca(taxonomy_db, m8_handle, lca_handle,
                      paired=True, min_bit_score=50)

    # keep a compressed copy of the LCA table if requested
    if outLca:
        with open(lca_file, 'rb') as plain, gzip.open(outLca, 'wb') as zipped:
            shutil.copyfileobj(plain, zipped)

    with open(lca_file) as lca_handle:
        hit_counts = taxa_hits_from_tsv(lca_handle)
    with open(outReport, 'w') as report_handle:
        for row in kraken_dfs_report(taxonomy_db, hit_counts):
            print(row, file=report_handle)
Exemple #8
0
def split_bam(inBam, outBams):
    '''Split BAM file equally into several output BAM files.

    The input is dumped to one SAM text file, which is then cut into
    consecutive chunks of maxReads lines; each chunk gets the full header
    and is converted back to BAM with Picard SamFormatConverter. Any reads
    left over after the last chunk are appended to the last output file.

    Args:
      inBam: input BAM; its first header row must contain 'SO:queryname'.
      outBams: list of output BAM file names, one per chunk.

    Raises:
      Exception: if the input header does not declare queryname sort order.
    '''
    samtools = tools.samtools.SamtoolsTool()
    picard = tools.picard.PicardTools()

    # get totalReadCount and maxReads
    # maxReads = totalReadCount / num files, but round up to the nearest
    # even number in order to keep read pairs together (assuming the input
    # is sorted in query order and has no unmated reads, which can be
    # accomplished by Picard RevertSam with SANITIZE=true)
    totalReadCount = samtools.count(inBam)
    maxReads = int(math.ceil(float(totalReadCount) / len(outBams) / 2) * 2)
    log.info("splitting %d reads into %d files of %d reads each", totalReadCount, len(outBams), maxReads)

    # load BAM header into memory
    header = samtools.getHeader(inBam)
    if 'SO:queryname' not in header[0]:
        raise Exception('Input BAM file must be sorted in queryname order')

    # dump to bigsam
    bigsam = mkstempfname('.sam')
    try:
        samtools.view([], inBam, bigsam)

        # split bigsam into little ones
        last_index = len(outBams) - 1
        with util.file.open_or_gzopen(bigsam, 'rt') as inf:
            for index, outBam in enumerate(outBams):
                log.info("preparing file %s", outBam)
                tmp_sam_reads = mkstempfname('.sam')
                try:
                    with open(tmp_sam_reads, 'wt') as outf:
                        for row in header:
                            outf.write('\t'.join(row) + '\n')
                        for _ in range(maxReads):
                            line = inf.readline()
                            if not line:
                                break
                            outf.write(line)
                        # the last file (by position, not by name) also
                        # receives whatever is left in the input
                        if index == last_index:
                            for line in inf:
                                outf.write(line)
                    picard.execute(
                        "SamFormatConverter", [
                            'INPUT=' + tmp_sam_reads, 'OUTPUT=' + outBam, 'VERBOSITY=WARNING'
                        ],
                        JVMmemory='512m'
                    )
                finally:
                    # drop the per-chunk SAM even when the conversion failed
                    if os.path.exists(tmp_sam_reads):
                        os.unlink(tmp_sam_reads)
    finally:
        # drop the full SAM dump even when splitting failed
        if os.path.exists(bigsam):
            os.unlink(bigsam)
Exemple #9
0
def diamond(inBam, db, taxDb, outReport, outM8=None, outLca=None, numThreads=1):
    '''
        Classify reads by the taxon of the Lowest Common Ancestor (LCA).

        Pipeline: Picard SamToFastq (two FASTQ files) -> DIAMOND blastx ->
        DIAMOND view (tabular m8) -> blast_lca (paired mode, minimum bit
        score 50, taxonomy from taxDb) -> kraken_dfs_report with
        prepend_column=True, whose lines are written to outReport.
        outM8 / outLca, when given, receive gzip-compressed copies of the
        m8 file and of the LCA table.
    '''
    reads_1 = util.file.mkstempfname('.fastq')
    reads_2 = util.file.mkstempfname('.fastq')

    # do not convert this to samtools bam2fq unless we can figure out how to replicate
    # the clipping functionality of Picard SamToFastq
    sam2fq = tools.picard.SamToFastqTool()
    sam2fq_opts = {
        'CLIPPING_ATTRIBUTE':
        tools.picard.SamToFastqTool.illumina_clipping_attribute,
        'CLIPPING_ACTION': 'X',
    }
    sam2fq.execute(
        inBam,
        reads_1,
        reads_2,
        picardOptions=tools.picard.PicardTools.dict_to_picard_opts(sam2fq_opts),
        JVMmemory=sam2fq.jvmMemDefault,
    )

    blaster = tools.diamond.Diamond()
    blaster.install()
    daa_out = util.file.mkstempfname('.daa')
    m8_out = util.file.mkstempfname('.diamond.m8')
    blaster.blastx(db, [reads_1, reads_2], daa_out, options={'--threads': numThreads})
    blaster.view(daa_out, m8_out, options={'--threads': numThreads})

    if outM8:
        with open(m8_out, 'rb') as source, gzip.open(outM8, 'wb') as target:
            shutil.copyfileobj(source, target)

    taxa = TaxonomyDb(tax_dir=taxDb)
    lca_out = util.file.mkstempfname('.tsv')
    with open(m8_out) as m8_stream:
        with open(lca_out, 'w') as lca_stream:
            blast_lca(taxa, m8_stream, lca_stream, paired=True, min_bit_score=50)

    if outLca:
        with open(lca_out, 'rb') as source, gzip.open(outLca, 'wb') as target:
            shutil.copyfileobj(source, target)

    with open(lca_out) as lca_stream:
        taxon_hits = taxa_hits_from_tsv(lca_stream)
    with open(outReport, 'w') as report_stream:
        for report_row in kraken_dfs_report(taxa, taxon_hits, prepend_column=True):
            print(report_row, file=report_stream)
Exemple #10
0
def split_bam(inBam, outBams):
    '''Split BAM file equally into several output BAM files.

    The input is dumped to one SAM text file, which is then cut into
    consecutive chunks of maxReads lines; each chunk gets the full header
    and is converted back to BAM with Picard SamFormatConverter. Any reads
    left over after the last chunk are appended to the last output file.

    Args:
      inBam: input BAM; its first header row must contain 'SO:queryname'.
      outBams: list of output BAM file names, one per chunk.

    Raises:
      Exception: if the input header does not declare queryname sort order.
    '''
    samtools = tools.samtools.SamtoolsTool()
    picard = tools.picard.PicardTools()

    # get totalReadCount and maxReads
    # maxReads = totalReadCount / num files, but round up to the nearest
    # even number in order to keep read pairs together (assuming the input
    # is sorted in query order and has no unmated reads, which can be
    # accomplished by Picard RevertSam with SANITIZE=true)
    totalReadCount = samtools.count(inBam)
    maxReads = int(math.ceil(float(totalReadCount) / len(outBams) / 2) * 2)
    log.info("splitting %d reads into %d files of %d reads each",
             totalReadCount, len(outBams), maxReads)

    # load BAM header into memory
    header = samtools.getHeader(inBam)
    if 'SO:queryname' not in header[0]:
        raise Exception('Input BAM file must be sorted in queryname order')

    # dump to bigsam
    bigsam = mkstempfname('.sam')
    try:
        samtools.view([], inBam, bigsam)

        # split bigsam into little ones
        last_index = len(outBams) - 1
        with util.file.open_or_gzopen(bigsam, 'rt') as inf:
            for index, outBam in enumerate(outBams):
                log.info("preparing file %s", outBam)
                tmp_sam_reads = mkstempfname('.sam')
                try:
                    with open(tmp_sam_reads, 'wt') as outf:
                        for row in header:
                            outf.write('\t'.join(row) + '\n')
                        for _ in range(maxReads):
                            line = inf.readline()
                            if not line:
                                break
                            outf.write(line)
                        # the last file (by position, not by name) also
                        # receives whatever is left in the input
                        if index == last_index:
                            for line in inf:
                                outf.write(line)
                    picard.execute("SamFormatConverter", [
                        'INPUT=' + tmp_sam_reads, 'OUTPUT=' + outBam,
                        'VERBOSITY=WARNING'
                    ],
                                   JVMmemory='512m')
                finally:
                    # drop the per-chunk SAM even when the conversion failed
                    if os.path.exists(tmp_sam_reads):
                        os.unlink(tmp_sam_reads)
    finally:
        # drop the full SAM dump even when splitting failed
        if os.path.exists(bigsam):
            os.unlink(bigsam)
Exemple #11
0
def diamond(inBam, db, taxDb, outReport, outReads=None, numThreads=1):
    '''
        Classify reads by the taxon of the Lowest Common Ancestor (LCA)

        Streaming variant: Picard SamToFastq writes interleaved FASTQ to its
        stdout, the mates are joined into FASTA records and piped into a
        ``diamond blastx`` shell command, and diamond's stdout is parsed
        into taxon hits from which the report is written.

        Args:
          inBam: input reads (bam).
          db: value passed to diamond's ``--db`` option.
          taxDb: directory containing ``nodes.dmp`` and
            ``accession2taxid/prot.accession2taxid.gz``; also used to load
            the TaxonomyDb.
          outReport: file the lines from kraken_dfs_report are written to.
          outReads: if given, diamond's output is additionally saved,
            gzip-compressed, to this file.
          numThreads: value for diamond's ``--threads``; the option is
            omitted when this is None.
    '''
    # do not convert this to samtools bam2fq unless we can figure out how to replicate
    # the clipping functionality of Picard SamToFastq
    picard = tools.picard.SamToFastqTool()
    # started in the background; its stdout is consumed below via s2fq.stdout
    s2fq = picard.execute(
        inBam,
        '/dev/stdout',
        interleave=True,
        illuminaClipping=True,
        JVMmemory=picard.jvmMemDefault,
        background=True,
        stdout=subprocess.PIPE,
    )

    diamond_tool = tools.diamond.Diamond()
    diamond_tool.install()
    taxonmap = join(taxDb, 'accession2taxid', 'prot.accession2taxid.gz')
    taxonnodes = join(taxDb, 'nodes.dmp')

    # no query/input option is given: diamond is fed through stdin (see Popen below)
    cmd = '{} blastx --outfmt 102 --sallseqid'.format(
        diamond_tool.install_and_get_path())
    if numThreads is not None:
        cmd += ' --threads {threads}'.format(threads=numThreads)
    # NOTE: the threads= keyword below is not referenced by this template
    cmd += ' --db {db} --taxonmap {taxonmap} --taxonnodes {taxonnodes}'.format(
        threads=numThreads, db=db, taxonmap=taxonmap, taxonnodes=taxonnodes)

    if outReads is not None:
        # Interstitial save of stdout to output file
        # (process substitution, hence bash as the shell executable below)
        cmd += ' | tee >(gzip > {out})'.format(out=outReads)

    diamond_ps = subprocess.Popen(cmd,
                                  shell=True,
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  executable='/bin/bash')

    def f(input_pipe, output_pipe):
        # Converts the interleaved FASTQ stream into FASTA records written
        # as ASCII text to output_pipe.
        # NOTE(review): num_n=16 presumably is the number of N's inserted
        # between the two mates when joining them - confirm in util.file.
        output_pipe = codecs.getwriter('ascii')(output_pipe)
        SeqIO.write(
            util.file.join_interleaved_fastq(input_pipe,
                                             output_format='fasta',
                                             num_n=16), output_pipe, 'fasta')

    # connect SamToFastq's stdout to diamond's stdin through f
    # NOTE(review): presumably runs f concurrently so that reading
    # diamond's stdout below cannot deadlock - confirm in util.misc.
    util.misc.bind_pipes(s2fq.stdout, diamond_ps.stdin, f)

    tax_db = TaxonomyDb(tax_dir=taxDb, load_names=True, load_nodes=True)

    # diamond emits bytes; decode them as ASCII text for the TSV parser
    lca_p = codecs.getreader('ascii')(diamond_ps.stdout)

    hits = taxa_hits_from_tsv(lca_p)
    with open(outReport, 'w') as f:
        for line in kraken_dfs_report(tax_db, hits):
            print(line, file=f)

    # NOTE(review): the exit codes of both child processes are not checked here
    s2fq.wait()
    diamond_ps.wait()
    def classify(self, in_bam, db, out_reads=None, out_report=None, num_threads=None):
        """Classify input reads (bam)

        The BAM is converted to two gzip-compressed FASTQ files with Picard
        SamToFastq and passed to the classifier binary, in paired mode
        unless the second FASTQ file is (nearly) empty. If a report was
        requested but contains no data rows, a header and a single
        "unclassified" row are written to it.

        Args:
          in_bam: unaligned reads
          db: Kraken built database directory.
          out_reads: Output file of command.
          out_report: Optional report file (passed as ``--report-file``).
          num_threads: Requested thread count, normalised by
            util.misc.sanitize_thread_count.
        """
        tmp_fastq1 = util.file.mkstempfname('.1.fastq.gz')
        tmp_fastq2 = util.file.mkstempfname('.2.fastq.gz')
        try:
            # Do not convert this to samtools bam2fq unless we can figure out how to replicate
            # the clipping functionality of Picard SamToFastq
            picard = tools.picard.SamToFastqTool()
            picard_opts = {
                'CLIPPING_ATTRIBUTE': tools.picard.SamToFastqTool.illumina_clipping_attribute,
                'CLIPPING_ACTION': 'X'
            }
            picard.execute(in_bam, tmp_fastq1, tmp_fastq2,
                           picardOptions=tools.picard.PicardTools.dict_to_picard_opts(picard_opts),
                           JVMmemory=picard.jvmMemDefault)

            opts = {
                '--threads': util.misc.sanitize_thread_count(num_threads),
                '--fastq-input': None,
                '--gzip-compressed': None,
                '--preload': None
            }
            if out_report:
                opts['--report-file'] = out_report
            # Detect if input bam was paired by checking fastq 2
            fastqs = [tmp_fastq1]
            if os.path.getsize(tmp_fastq2) >= 50:
                opts['--paired'] = None
                fastqs.append(tmp_fastq2)
            self.execute(self.BINS['classify'], db, out_reads, args=fastqs, options=opts)
        finally:
            # remove the intermediate FASTQ files even when a tool failed
            for tmp_path in (tmp_fastq1, tmp_fastq2):
                if os.path.exists(tmp_path):
                    os.unlink(tmp_path)

        if out_report:
            with open(out_report, 'rt+') as f:
                lines = [line.strip() for line in f.readlines() if not line.startswith('#')]
                lines = [line for line in lines if line]
                if not lines:
                    # Step back over the last character before appending.
                    # NOTE(review): presumably that character is a trailing
                    # blank line - confirm against real report output.
                    # Clamp at 0 so a zero-length report does not trigger a
                    # negative seek.
                    f.seek(max(f.tell() - 1, 0), os.SEEK_SET)
                    print('\t'.join(['%', 'reads', 'taxReads', 'kmers', 'dup', 'cov', 'taxID', 'rank', 'taxName']), file=f)
                    print('\t'.join(['100.00', '0', '0', '0', '0', 'NA', '0', 'no rank', 'unclassified']), file=f)
Exemple #13
0
    def classify(self, in_bam, db, out_reads=None, out_report=None, num_threads=None):
        """Classify input reads (bam)

        The BAM is converted to two gzip-compressed FASTQ files with Picard
        SamToFastq and passed to the classifier binary, in paired mode
        unless the second FASTQ file is (nearly) empty. If a report was
        requested but contains no data rows, a header and a single
        "unclassified" row are written to it.

        Args:
          in_bam: unaligned reads
          db: Kraken built database directory.
          out_reads: Output file of command.
          out_report: Optional report file (passed as ``--report-file``).
          num_threads: Requested thread count, normalised by
            util.misc.sanitize_thread_count.
        """
        tmp_fastq1 = util.file.mkstempfname('.1.fastq.gz')
        tmp_fastq2 = util.file.mkstempfname('.2.fastq.gz')
        try:
            # Do not convert this to samtools bam2fq unless we can figure out how to replicate
            # the clipping functionality of Picard SamToFastq
            picard = tools.picard.SamToFastqTool()
            picard_opts = {
                'CLIPPING_ATTRIBUTE': tools.picard.SamToFastqTool.illumina_clipping_attribute,
                'CLIPPING_ACTION': 'X'
            }
            picard.execute(in_bam, tmp_fastq1, tmp_fastq2,
                           picardOptions=tools.picard.PicardTools.dict_to_picard_opts(picard_opts),
                           JVMmemory=picard.jvmMemDefault)

            opts = {
                '--threads': util.misc.sanitize_thread_count(num_threads),
                '--fastq-input': None,
                '--gzip-compressed': None,
                '--preload': None
            }
            if out_report:
                opts['--report-file'] = out_report
            # Detect if input bam was paired by checking fastq 2
            paired = os.path.getsize(tmp_fastq2) >= 50
            if paired:
                opts['--paired'] = None
            inputs = [tmp_fastq1, tmp_fastq2] if paired else [tmp_fastq1]
            self.execute(self.BINS['classify'], db, out_reads, args=inputs, options=opts)
        finally:
            # remove the intermediate FASTQ files even when a tool failed
            for tmp_path in (tmp_fastq1, tmp_fastq2):
                if os.path.exists(tmp_path):
                    os.unlink(tmp_path)

        if out_report:
            with open(out_report, 'rt+') as f:
                lines = [line.strip() for line in f.readlines() if not line.startswith('#')]
                lines = [line for line in lines if line]
                if not lines:
                    # Step back over the last character before appending.
                    # NOTE(review): presumably that character is a trailing
                    # blank line - confirm against real report output.
                    # Clamp at 0 so a zero-length report does not trigger a
                    # negative seek.
                    f.seek(max(f.tell() - 1, 0), os.SEEK_SET)
                    print('\t'.join(['%', 'reads', 'taxReads', 'kmers', 'dup', 'cov', 'taxID', 'rank', 'taxName']), file=f)
                    print('\t'.join(['100.00', '0', '0', '0', '0', 'NA', '0', 'no rank', 'unclassified']), file=f)
Exemple #14
0
    def classify(self, inBam, db, outReads, numThreads=None):
        """Classify input reads (bam)

        The BAM is first converted to two gzip-compressed FASTQ files with
        Picard SamToFastq. kraken is run in paired mode unless the second
        FASTQ file is (nearly) empty.

        Args:
          inBam: unaligned reads
          db: Kraken built database directory.
          outReads: Output file of command.
          numThreads: Requested thread count, normalised by
            util.misc.sanitize_thread_count.
        """
        if tools.samtools.SamtoolsTool().isEmpty(inBam):
            # kraken cannot deal with empty input, so produce an empty output
            # file instead. It must be opened for writing: a read-mode open
            # does not create the file and fails when it does not exist yet.
            with open(outReads, 'wt'):
                pass
            return

        tmp_fastq1 = util.file.mkstempfname('.1.fastq.gz')
        tmp_fastq2 = util.file.mkstempfname('.2.fastq.gz')
        try:
            # do not convert this to samtools bam2fq unless we can figure out how to replicate
            # the clipping functionality of Picard SamToFastq
            picard = tools.picard.SamToFastqTool()
            picard_opts = {
                'CLIPPING_ATTRIBUTE': tools.picard.SamToFastqTool.illumina_clipping_attribute,
                'CLIPPING_ACTION': 'X'
            }
            picard.execute(inBam, tmp_fastq1, tmp_fastq2,
                           picardOptions=tools.picard.PicardTools.dict_to_picard_opts(picard_opts),
                           JVMmemory=picard.jvmMemDefault)

            opts = {
                '--threads': util.misc.sanitize_thread_count(numThreads),
                '--fastq-input': None,
                '--gzip-compressed': None,
            }
            # Detect if input bam was paired by checking fastq 2
            fastqs = [tmp_fastq1]
            if os.path.getsize(tmp_fastq2) >= 50:
                opts['--paired'] = None
                fastqs.append(tmp_fastq2)
            self.execute('kraken', db, outReads, args=fastqs, options=opts)
        finally:
            # remove the intermediate FASTQ files even when a tool failed
            for tmp_path in (tmp_fastq1, tmp_fastq2):
                if os.path.exists(tmp_path):
                    os.unlink(tmp_path)
    def classify(self,
                 in_bam,
                 db,
                 out_reads=None,
                 out_report=None,
                 confidence=None,
                 min_base_qual=None,
                 minimum_hit_groups=None,
                 num_threads=None):
        """Classify input reads (bam)

        The BAM is converted with Picard SamToFastq into two mate FASTQ
        files plus a third file passed as ``outFastq0``. kraken2 is run in
        paired mode on the mate files, or in single-end mode on the third
        file when that one is larger than the second mate file.

        Args:
          in_bam: unaligned reads
          db: Kraken built database directory.
          out_reads: Output file of command.
          out_report: Optional report file (passed as ``--report``).
          confidence: Optional value for ``--confidence``.
          min_base_qual: Optional value for ``--minimum-base-quality``.
          minimum_hit_groups: Optional value for ``--minimum-hit-groups``.
          num_threads: Requested thread count, normalised by
            util.misc.sanitize_thread_count.
        """

        if tools.samtools.SamtoolsTool().isEmpty(in_bam):
            # kraken cannot deal with empty input: create empty outputs instead
            for empty_path in (out_reads, out_report):
                if empty_path:
                    with open(empty_path, 'wt'):
                        pass
            return

        opts = {'--threads': util.misc.sanitize_thread_count(num_threads)}
        if out_report:
            opts['--report'] = out_report
        if not out_reads:
            out_reads = '-'  # in kraken2, this suppresses normal output
        if min_base_qual:
            opts['--minimum-base-quality'] = min_base_qual
        if confidence:
            opts['--confidence'] = confidence
        if minimum_hit_groups:
            opts['--minimum-hit-groups'] = minimum_hit_groups

        tmp_fastq1 = util.file.mkstempfname('.1.fastq')
        tmp_fastq2 = util.file.mkstempfname('.2.fastq')
        tmp_fastq3 = util.file.mkstempfname('.s.fastq')
        try:
            # Do not convert this to samtools bam2fq unless we can figure out how to replicate
            # the clipping functionality of Picard SamToFastq
            picard = tools.picard.SamToFastqTool()
            picard_opts = {
                'CLIPPING_ATTRIBUTE':
                tools.picard.SamToFastqTool.illumina_clipping_attribute,
                'CLIPPING_ACTION': 'X'
            }
            picard.execute(
                in_bam,
                tmp_fastq1,
                tmp_fastq2,
                outFastq0=tmp_fastq3,
                picardOptions=tools.picard.PicardTools.dict_to_picard_opts(
                    picard_opts),
                JVMmemory=picard.jvmMemDefault)

            # Detect if input bam was paired by checking fastq 2
            if os.path.getsize(tmp_fastq2) < os.path.getsize(tmp_fastq3):
                log.warning("running in single-end read mode!")
                fastqs = [tmp_fastq3]
            else:
                opts['--paired'] = None
                fastqs = [tmp_fastq1, tmp_fastq2]
            self.execute('kraken2', db, out_reads, args=fastqs, options=opts)
        finally:
            # remove the intermediate FASTQ files even when a tool failed
            for tmp_path in (tmp_fastq1, tmp_fastq2, tmp_fastq3):
                if os.path.exists(tmp_path):
                    os.unlink(tmp_path)
Exemple #16
0
    def pipeline(self, db, inBams, outReports=None, outReads=None,
                 lockMemory=None, filterThreshold=None, numThreads=None):
        """Classify several BAM files with a single kraken invocation.

        One kraken process is started for all inputs, reading paired FASTQ
        from named pipes and writing one ``--output`` named pipe per BAM.
        For each BAM, Picard SamToFastq is started in the background to feed
        the FASTQ pipes, while a shell pipeline drains the matching output
        pipe into pigz (reads) and/or kraken-filter / kraken-report (report).

        Args:
          db: kraken database; interpreted by self._db_opts.
          inBams: list of input BAM files.
          outReports: optional list of report files, one per input BAM.
          outReads: optional list of compressed read-output files, one per
            input BAM.
          lockMemory: if truthy, ``--lock-memory`` is passed to kraken.
          filterThreshold: if not None (and reports are requested),
            kraken-filter is run with this ``--threshold``.
          numThreads: requested thread count, normalised by
            util.misc.sanitize_thread_count.

        Raises:
          Exception: if outReports/outReads do not have one entry per BAM.
          subprocess.CalledProcessError: if a per-BAM output pipeline fails,
            or SamToFastq is found to have exited with a non-zero status.
        """
        assert outReads is not None or outReports is not None

        n_bams = len(inBams)
        # 2n for paired fastq, 1n for kraken output
        n_pipes = n_bams * 3
        if outReports and len(outReports) != n_bams:
            raise Exception("--outReports specified with {} output files, which does not match the number of input bams ({})".format(len(outReports), n_bams))
        if outReads and len(outReads) != n_bams:
            raise Exception("--outReads specified with {} output files, which does not match the number of input bams ({})".format(len(outReads), n_bams))
        threads = util.misc.sanitize_thread_count(numThreads)

        with util.file.fifo(n_pipes) as pipes:
            # first 2n pipes carry the FASTQ mates, the last n the kraken output
            fastq_pipes = pipes[:n_bams * 2]
            kraken_output_pipes = pipes[n_bams * 2:]

            kraken_bin = 'kraken'
            opts = ''
            if lockMemory:
                opts += ' --lock-memory'

            # NOTE(review): _db_opts is defined outside this view; it supplies
            # extra kraken options, the subprocess environment and extra
            # options for kraken-filter / kraken-report.
            db_opts, env, tax_filter_opts, tax_report_opts = self._db_opts(db, threads)
            opts += db_opts

            cmd = '''set -ex -o pipefail; {kraken}{opts} --paired --fastq-input --threads {threads} {outputs} {fastqs}'''.format(
                kraken=kraken_bin,
                opts=opts,
                threads=threads,
                outputs=' '.join('--output {}'.format(x) for x in kraken_output_pipes),
                fastqs=' '.join(fastq_pipes))
            log.debug('Calling kraken command line: %s', cmd)
            # kraken runs asynchronously for the whole loop below.
            # NOTE(review): the Popen handle is discarded, so kraken's exit
            # status is never checked here.
            subprocess.Popen(cmd, shell=True, executable='/bin/bash', env=env)

            for i, in_bam in enumerate(inBams):
                # build the consumer pipeline for this BAM's kraken output pipe
                cmd = 'cat {kraken_output}'.format(kraken_output=kraken_output_pipes[i])

                if outReads:
                    if outReports:
                        # tee: reads are saved AND passed on to the report stage
                        cmd += ' | tee >(pigz --best > {kraken_reads})'
                    else:
                        cmd += ' | pigz --best > {kraken_reads}'

                    cmd = cmd.format(kraken_reads=outReads[i])

                if outReports:
                    if filterThreshold is not None:

                        kraken_filter_bin = 'kraken-filter'
                        cmd += ' | {kraken_filter}{tax_opts} --threshold {filterThreshold}'.format(
                            kraken_filter=kraken_filter_bin,
                            tax_opts=tax_filter_opts,
                            filterThreshold=filterThreshold)

                    kraken_report_bin = 'kraken-report'
                    cmd += ' | {kraken_report}{tax_opts} > {outReport}'.format(
                        kraken_report=kraken_report_bin,
                        tax_opts=tax_report_opts,
                        outReport=outReports[i])

                # do not convert this to samtools bam2fq unless we can figure out how to replicate
                # the clipping functionality of Picard SamToFastq
                picard = tools.picard.SamToFastqTool()
                picard_opts = {
                    'CLIPPING_ATTRIBUTE': tools.picard.SamToFastqTool.illumina_clipping_attribute,
                    'CLIPPING_ACTION': 'X'
                }
                # producer: feeds this BAM's two FASTQ pipes in the background
                bam2fq_ps = picard.execute(in_bam, fastq_pipes[i*2], fastq_pipes[i*2 + 1],
                    picardOptions=tools.picard.PicardTools.dict_to_picard_opts(picard_opts),
                    JVMmemory=picard.jvmMemDefault, background=True)

                log.debug('Calling kraken output command line: %s', cmd)
                # blocks until the consumer pipeline for this BAM has finished
                subprocess.check_call(cmd, shell=True, executable='/bin/bash', env=env)

                # NOTE(review): assuming a Popen-like object, poll() is truthy
                # only for a non-zero exit status; a converter that is still
                # running (None) or succeeded (0) passes this check.
                if bam2fq_ps.poll():
                    raise subprocess.CalledProcessError(bam2fq_ps.returncode, "SamToFastqTool().execute({})".format(in_bam))
Exemple #17
0
def miseq_fastq_to_bam(outBam,
                       sampleSheet,
                       inFastq1,
                       inFastq2=None,
                       runInfo=None,
                       sequencing_center=None,
                       JVMmemory=tools.picard.FastqToSamTool.jvmMemDefault):
    ''' Convert fastq read files to a single bam file. Fastq file names must conform
        to patterns emitted by Miseq machines. Sample metadata must be provided
        in a SampleSheet.csv that corresponds to the fastq filename. Specifically,
        the _S##_ index in the fastq file name will be used to find the corresponding
        row in the SampleSheet

        Args:
          outBam: output bam, handed to Picard FastqToSam.
          sampleSheet: sample sheet, loaded with SampleSheet(allow_non_unique=True).
          inFastq1: read 1 fastq; the whole string must match
            <prefix>_S<num>_L001_R1_001.fastq[.gz] and contain no whitespace.
          inFastq2: optional read 2 fastq; must carry the same _S<num>_ index.
          runInfo: optional run description, loaded with RunInfo(); supplies the
            flowcell, the run date and (when sequencing_center is not given)
            the machine name. Without it the flowcell defaults to 'A'.
          sequencing_center: optional value for Picard's SEQUENCING_CENTER.
          JVMmemory: forwarded unchanged to the Picard tool.

        Returns:
          0 once Picard has been run.

        Raises:
          AssertionError: on a file name, sample sheet or RunInfo mismatch.
          NOTE(review): these checks are assert statements, so they are skipped
          when Python runs with -O.
    '''

    # match miseq based on fastq filenames
    # The dots in ".fastq" / ".gz" are escaped so that only the literal
    # extensions are accepted (an unescaped "." matches any character).
    miseq_name_re = re.compile(r"^\S+_S(\d+)_L001_R(\d)_001\.fastq(?:\.gz)?$")

    mo = miseq_name_re.match(inFastq1)
    assert mo, "fastq filename %s does not match the patterns used by an Illumina Miseq machine" % inFastq1
    assert mo.group(
        2
    ) == '1', "fastq1 must correspond to read 1, not read %s" % mo.group(2)
    sample_num = mo.group(1)
    if inFastq2:
        mo = miseq_name_re.match(inFastq2)
        assert mo, "fastq filename %s does not match the patterns used by an Illumina Miseq machine" % inFastq2
        assert mo.group(
            2
        ) == '2', "fastq2 must correspond to read 2, not read %s" % mo.group(2)
        assert mo.group(
            1
        ) == sample_num, "fastq1 (%s) and fastq2 (%s) must have the same sample number" % (
            sample_num, mo.group(1))

    # load metadata
    samples = SampleSheet(sampleSheet, allow_non_unique=True)
    sample_info = samples.fetch_by_index(sample_num)
    assert sample_info, "sample %s not found in %s" % (sample_num, sampleSheet)
    sampleName = sample_info['sample']
    log.info("Using sample name: %s", sampleName)
    # dual-indexed samples get "barcode_1-barcode_2"
    if sample_info.get('barcode_2'):
        barcode = '-'.join(
            (sample_info['barcode_1'], sample_info['barcode_2']))
    else:
        barcode = sample_info['barcode_1']
    picardOpts = {
        'LIBRARY_NAME': sample_info['library'],
        'PLATFORM': 'illumina',
        'VERBOSITY': 'WARNING',
        'QUIET': 'TRUE',
    }
    if runInfo:
        runInfo = RunInfo(runInfo)
        flowcell = runInfo.get_flowcell()
        picardOpts['RUN_DATE'] = runInfo.get_rundate_iso()
        # the number of fastqs supplied must agree with the run description
        if inFastq2:
            assert runInfo.num_reads(
            ) == 2, "paired fastqs given for a single-end RunInfo.xml"
        else:
            assert runInfo.num_reads(
            ) == 1, "second fastq missing for a paired-end RunInfo.xml"
    else:
        flowcell = 'A'
    if sequencing_center is None and runInfo:
        sequencing_center = runInfo.get_machine()
    if sequencing_center:
        picardOpts['SEQUENCING_CENTER'] = sequencing_center
    # PLATFORM_UNIT uses the full flowcell id; READ_GROUP_NAME only its
    # first five characters
    picardOpts['PLATFORM_UNIT'] = '.'.join((flowcell, '1', barcode))
    if len(flowcell) > 5:
        flowcell = flowcell[:5]
    picardOpts['READ_GROUP_NAME'] = flowcell

    # run Picard
    picard = tools.picard.FastqToSamTool()
    picard.execute(inFastq1,
                   inFastq2,
                   sampleName,
                   outBam,
                   picardOptions=picard.dict_to_picard_opts(picardOpts),
                   JVMmemory=JVMmemory)
    return 0
Exemple #18
0
def miseq_fastq_to_bam(outBam, sampleSheet, inFastq1, inFastq2=None, runInfo=None,
                       sequencing_center=None,
                       JVMmemory=tools.picard.FastqToSamTool.jvmMemDefault):
    ''' Convert fastq read files to a single bam file. Fastq file names must conform
        to patterns emitted by Miseq machines. Sample metadata must be provided
        in a SampleSheet.csv that corresponds to the fastq filename. Specifically,
        the _S##_ index in the fastq file name will be used to find the corresponding
        row in the SampleSheet

        Args:
          outBam: output bam written by Picard FastqToSam.
          sampleSheet: sample sheet, read through SampleSheet(allow_non_unique=True).
          inFastq1: fastq for read 1; must look like
            <prefix>_S<num>_L001_R1_001.fastq or the same with a .gz suffix,
            with no whitespace anywhere in the string.
          inFastq2: optional fastq for read 2 of the same sample number.
          runInfo: optional run description read through RunInfo(); provides
            flowcell, run date and a fallback for sequencing_center. When
            omitted the flowcell is 'A'.
          sequencing_center: optional SEQUENCING_CENTER value for Picard.
          JVMmemory: passed straight to the Picard tool.

        Returns:
          0 after Picard has been invoked.

        Raises:
          AssertionError: if a file name, the sample sheet lookup or the
            RunInfo read count does not fit. NOTE(review): implemented with
            assert, hence inactive under python -O.
    '''

    # match miseq based on fastq filenames
    # "\." keeps the extension literal; a bare "." would accept any character
    # in place of the dot.
    fastq_name_pattern = re.compile(r"^\S+_S(\d+)_L001_R(\d)_001\.fastq(?:\.gz)?$")

    mo = fastq_name_pattern.match(inFastq1)
    assert mo, "fastq filename %s does not match the patterns used by an Illumina Miseq machine" % inFastq1
    assert mo.group(2) == '1', "fastq1 must correspond to read 1, not read %s" % mo.group(2)
    sample_num = mo.group(1)
    if inFastq2:
        mo = fastq_name_pattern.match(inFastq2)
        assert mo, "fastq filename %s does not match the patterns used by an Illumina Miseq machine" % inFastq2
        assert mo.group(2) == '2', "fastq2 must correspond to read 2, not read %s" % mo.group(2)
        assert mo.group(1) == sample_num, "fastq1 (%s) and fastq2 (%s) must have the same sample number" % (
            sample_num, mo.group(1))

    # load metadata
    samples = SampleSheet(sampleSheet, allow_non_unique=True)
    sample_info = samples.fetch_by_index(sample_num)
    assert sample_info, "sample %s not found in %s" % (sample_num, sampleSheet)
    sampleName = sample_info['sample']
    log.info("Using sample name: %s", sampleName)
    # join both indexes with "-" when a second barcode is present
    if sample_info.get('barcode_2'):
        barcode = '-'.join((sample_info['barcode_1'], sample_info['barcode_2']))
    else:
        barcode = sample_info['barcode_1']
    picardOpts = {
        'LIBRARY_NAME': sample_info['library'],
        'PLATFORM': 'illumina',
        'VERBOSITY': 'WARNING',
        'QUIET': 'TRUE',
    }
    if runInfo:
        runInfo = RunInfo(runInfo)
        flowcell = runInfo.get_flowcell()
        picardOpts['RUN_DATE'] = runInfo.get_rundate_iso()
        # single- vs paired-end input has to agree with the run description
        if inFastq2:
            assert runInfo.num_reads() == 2, "paired fastqs given for a single-end RunInfo.xml"
        else:
            assert runInfo.num_reads() == 1, "second fastq missing for a paired-end RunInfo.xml"
    else:
        flowcell = 'A'
    if sequencing_center is None and runInfo:
        sequencing_center = runInfo.get_machine()
    if sequencing_center:
        picardOpts['SEQUENCING_CENTER'] = sequencing_center
    # the platform unit keeps the full flowcell id, the read group name is
    # cut to its first five characters
    picardOpts['PLATFORM_UNIT'] = '.'.join((flowcell, '1', barcode))
    if len(flowcell) > 5:
        flowcell = flowcell[:5]
    picardOpts['READ_GROUP_NAME'] = flowcell

    # run Picard
    picard = tools.picard.FastqToSamTool()
    picard.execute(inFastq1,
                   inFastq2,
                   sampleName,
                   outBam,
                   picardOptions=picard.dict_to_picard_opts(picardOpts),
                   JVMmemory=JVMmemory)
    return 0
Exemple #19
0
    def pipeline(self,
                 db,
                 inBams,
                 outReports=None,
                 outReads=None,
                 lockMemory=None,
                 filterThreshold=None,
                 numThreads=None):
        """Classify several bams with a single kraken process, streaming through pipes.

        One kraken process is started in the background. It reads paired fastqs
        from, and writes one output stream per sample to, the pipes handed out
        by util.file.fifo. For every input bam, Picard SamToFastq is started in
        the background to feed that sample's two fastq pipes while a foreground
        shell pipeline drains the sample's kraken output into the requested
        files.

        Args:
          db: kraken database, passed to self._db_opts.
          inBams: list of input bam paths.
          outReports: optional list with one report path per input bam; the
            kraken output is piped through kraken-report into it.
          outReads: optional list with one path per input bam; receives the
            kraken output compressed with ``pigz --best``.
          lockMemory: when true, ``--lock-memory`` is added to the kraken call.
          filterThreshold: when not None (and outReports is given), the output
            is passed through ``kraken-filter --threshold`` before the report.
          numThreads: passed to util.misc.sanitize_thread_count.

        Raises:
          AssertionError: if neither outReads nor outReports is given.
          Exception: if outReports/outReads differ in length from inBams.
          subprocess.CalledProcessError: if an output pipeline, SamToFastq or
            kraken itself exits with a non-zero status.
        """
        assert outReads is not None or outReports is not None

        n_bams = len(inBams)
        # 2n for paired fastq, 1n for kraken output
        n_pipes = n_bams * 3
        if outReports and len(outReports) != n_bams:
            raise Exception(
                "--outReports specified with {} output files, which does not match the number of input bams ({})"
                .format(len(outReports), n_bams))
        if outReads and len(outReads) != n_bams:
            raise Exception(
                "--outReads specified with {} output files, which does not match the number of input bams ({})"
                .format(len(outReads), n_bams))
        threads = util.misc.sanitize_thread_count(numThreads)

        with util.file.fifo(n_pipes) as pipes:
            fastq_pipes = pipes[:n_bams * 2]
            kraken_output_pipes = pipes[n_bams * 2:]

            kraken_bin = os.path.join(self.libexec, 'kraken')
            opts = ''
            if lockMemory:
                opts += ' --lock-memory'

            db_opts, env, tax_filter_opts, tax_report_opts = self._db_opts(
                db, threads)
            opts += db_opts

            kraken_cmd = '''set -ex -o pipefail; {kraken}{opts} --paired --fastq-input --threads {threads} {outputs} {fastqs}'''.format(
                kraken=kraken_bin,
                opts=opts,
                threads=threads,
                outputs=' '.join('--output {}'.format(x)
                                 for x in kraken_output_pipes),
                fastqs=' '.join(fastq_pipes))
            log.debug('Calling kraken command line: %s', kraken_cmd)
            # Keep the handle: the process has to be reaped and its exit status
            # checked once every sample has been drained.
            kraken_ps = subprocess.Popen(kraken_cmd,
                                         shell=True,
                                         executable='/bin/bash',
                                         env=env)

            for i, in_bam in enumerate(inBams):
                cmd = 'cat {kraken_output}'.format(
                    kraken_output=kraken_output_pipes[i])

                if outReads:
                    # Only the appended fragment is formatted, so braces
                    # elsewhere in the command cannot confuse str.format.
                    if outReports:
                        # tee keeps the stream flowing on to the report stage
                        cmd += ' | tee >(pigz --best > {kraken_reads})'.format(
                            kraken_reads=outReads[i])
                    else:
                        cmd += ' | pigz --best > {kraken_reads}'.format(
                            kraken_reads=outReads[i])

                if outReports:
                    if filterThreshold is not None:

                        kraken_filter_bin = os.path.join(
                            self.libexec, 'kraken-filter')
                        cmd += ' | {kraken_filter}{tax_opts} --threshold {filterThreshold}'.format(
                            kraken_filter=kraken_filter_bin,
                            tax_opts=tax_filter_opts,
                            filterThreshold=filterThreshold)

                    kraken_report_bin = os.path.join(self.libexec,
                                                     'kraken-report')
                    cmd += ' | {kraken_report}{tax_opts} > {outReport}'.format(
                        kraken_report=kraken_report_bin,
                        tax_opts=tax_report_opts,
                        outReport=outReports[i])

                # do not convert this to samtools bam2fq unless we can figure out how to replicate
                # the clipping functionality of Picard SamToFastq
                picard = tools.picard.SamToFastqTool()
                picard_opts = {
                    'CLIPPING_ATTRIBUTE':
                    tools.picard.SamToFastqTool.illumina_clipping_attribute,
                    'CLIPPING_ACTION': 'X'
                }
                bam2fq_ps = picard.execute(
                    in_bam,
                    fastq_pipes[i * 2],
                    fastq_pipes[i * 2 + 1],
                    picardOptions=tools.picard.PicardTools.dict_to_picard_opts(
                        picard_opts),
                    JVMmemory=picard.jvmMemDefault,
                    background=True)

                log.debug('Calling kraken output command line: %s', cmd)
                subprocess.check_call(cmd,
                                      shell=True,
                                      executable='/bin/bash',
                                      env=env)

                # wait() rather than poll(): poll() returns None while the
                # converter is still shutting down, which would let a failure
                # slip through unnoticed.
                # NOTE(review): assumes picard.execute(background=True) returns
                # a subprocess.Popen-like object - confirm.
                if bam2fq_ps.wait():
                    raise subprocess.CalledProcessError(
                        bam2fq_ps.returncode,
                        "SamToFastqTool().execute({})".format(in_bam))

            # All per-sample outputs have been consumed; reap kraken and
            # surface a failing exit status instead of silently ignoring it.
            if kraken_ps.wait():
                raise subprocess.CalledProcessError(kraken_ps.returncode,
                                                    kraken_cmd)