Code Example #1
File: cnvkit.py Project: sanzick/bcbio-nextgen
def _cnvkit_segment(cnr_file,
                    cov_interval,
                    data,
                    items,
                    out_file=None,
                    detailed=False):
    """Perform segmentation and copy number calling on normalized inputs
    """
    if not out_file:
        out_file = "%s.cns" % os.path.splitext(cnr_file)[0]
    if not utils.file_uptodate(out_file, cnr_file):
        with file_transaction(data, out_file) as tx_out_file:
            if not _cna_has_values(cnr_file):
                with open(tx_out_file, "w") as out_handle:
                    out_handle.write(
                        "chromosome\tstart\tend\tgene\tlog2\tprobes\tCN1\tCN2\tbaf\tweight\n"
                    )
            else:
                # Scale cores to avoid memory issues with segmentation
                # https://github.com/etal/cnvkit/issues/346
                if cov_interval == "genome":
                    cores = max(1, dd.get_cores(data) // 2)
                else:
                    cores = dd.get_cores(data)
                cmd = [
                    _get_cmd(), "segment", "-p",
                    str(cores), "-o", tx_out_file, cnr_file
                ]
                small_vrn_files = _compatible_small_variants(data, items)
                if len(small_vrn_files) > 0 and _cna_has_values(
                        cnr_file) and cov_interval != "genome":
                    cmd += [
                        "--vcf", small_vrn_files[0].name, "--sample-id",
                        small_vrn_files[0].sample
                    ]
                    if small_vrn_files[0].normal:
                        cmd += ["--normal-id", small_vrn_files[0].normal]
                resources = config_utils.get_resources("cnvkit_segment",
                                                       data["config"])
                user_options = resources.get("options", [])
                cmd += [str(x) for x in user_options]
                if cov_interval == "genome" and "--threshold" not in user_options:
                    cmd += ["--threshold", "0.00001"]
                # For tumors, remove very low normalized regions, avoiding upcaptured noise
                # https://github.com/bcbio/bcbio-nextgen/issues/2171#issuecomment-348333650
                # unless we want detailed segmentation for downstream tools
                paired = vcfutils.get_paired(items)
                if paired:
                    #if detailed:
                    #    cmd += ["-m", "hmm-tumor"]
                    if "--drop-low-coverage" not in user_options:
                        cmd += ["--drop-low-coverage"]
                # preferentially use conda installed Rscript
                export_cmd = (
                    "%s && export TMPDIR=%s && " %
                    (utils.get_R_exports(), os.path.dirname(tx_out_file)))
                do.run(export_cmd + " ".join(cmd), "CNVkit segment")
    return out_file
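The snippet above lets users extend the segmentation command through a `cnvkit_segment` resources entry. As a hedged illustration (the nesting of the resources section under `data["config"]` is an assumption, not shown in the snippet), a configuration shaped like the dictionary below would append its own flags, and because it supplies `--threshold` the genome-interval default of `--threshold 0.00001` would be skipped:

# Hedged sketch: assumed layout of data["config"] for the
# config_utils.get_resources("cnvkit_segment", ...) lookup above.
example_config = {
    "resources": {
        "cnvkit_segment": {
            # appended verbatim to the `cnvkit.py segment` command; since
            # "--threshold" appears here, the default genome-wide
            # "--threshold 0.00001" is not added
            "options": ["--threshold", "1e-6"],
        }
    }
}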
Code Example #2
File: cleanbam.py Project: chapmanb/bcbio-nextgen
def remove_extracontigs(in_bam, data):
    """Remove extra contigs (non chr1-22,X,Y) from an input BAM.

    These extra contigs can often be arranged in different ways, causing
    incompatibility issues with GATK and other tools. This also fixes the
    read group header as in fixrg.

    This does not yet handle mapping over 1 -> chr1 issues since this requires
    a ton of search/replace which slows down conversion.
    """
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "bamclean", dd.get_sample_name(data)))
    out_file = os.path.join(work_dir, "%s-noextras.bam" % utils.splitext_plus(os.path.basename(in_bam))[0])
    if not utils.file_exists(out_file):
        out_file = os.path.join(work_dir, "%s-noextras.bam" % dd.get_sample_name(data))
    if not utils.file_uptodate(out_file, in_bam):
        with file_transaction(data, out_file) as tx_out_file:
            target_chroms = _target_chroms_and_header(in_bam, data)
            str_chroms = " ".join(target_chroms)
            rg_info = novoalign.get_rg_info(data["rgnames"])
            bcbio_py = sys.executable
            ref_file = dd.get_ref_file(data)
            local_bam = os.path.join(os.path.dirname(tx_out_file), os.path.basename(in_bam))
            cores = dd.get_cores(data)
            utils.symlink_plus(in_bam, local_bam)
            bam.index(local_bam, data["config"])
            cmd = ("samtools view -@ {cores} -h {local_bam} {str_chroms} | "
                   """{bcbio_py} -c 'from bcbio.pipeline import cleanbam; """
                   """cleanbam.fix_header("{ref_file}")' | """
                   "samtools view -@ {cores} -u - | "
                   "samtools addreplacerg -@ {cores} -r '{rg_info}' -m overwrite_all -O bam -o {tx_out_file} - ")
            do.run(cmd.format(**locals()), "bamprep, remove extra contigs: %s" % dd.get_sample_name(data))
    return out_file
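Most of these examples lean on the same idempotency idiom: skip work when the output is newer than the input, and write into a transactional path that only becomes the final file on success. A minimal sketch of that idea, as an illustration of the concept rather than bcbio's actual `file_uptodate`/`file_transaction` implementation:

import contextlib
import os
import shutil
import tempfile

def file_uptodate(fname, cmp_fname):
    # Output counts as up to date when it exists and is at least as new as the input.
    return (os.path.exists(fname) and os.path.exists(cmp_fname)
            and os.path.getmtime(fname) >= os.path.getmtime(cmp_fname))

@contextlib.contextmanager
def file_transaction(out_file):
    # Hand back a temporary path; move it into place only if the body succeeds.
    tmp_dir = tempfile.mkdtemp(dir=os.path.dirname(out_file) or ".")
    tx_out_file = os.path.join(tmp_dir, os.path.basename(out_file))
    try:
        yield tx_out_file
        if os.path.exists(tx_out_file):
            shutil.move(tx_out_file, out_file)
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)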
Code Example #3
def _run_genotype_gvcfs(data, region, vrn_files, ref_file, out_file):
    """Performs genotyping of gVCFs into final VCF files.
    """
    if not utils.file_exists(out_file):
        broad_runner = broad.runner_from_config(data["config"])
        with file_transaction(data, out_file) as tx_out_file:
            assoc_files = tz.get_in(("genome_resources", "variation"), data, {})
            if not assoc_files: assoc_files = {}
            params = ["-T", "GenotypeGVCFs",
                      "-R", ref_file, "-o", tx_out_file,
                      "-L", bamprep.region_to_gatk(region),
                      "--max_alternate_alleles", "4"]
            for vrn_file in vrn_files:
                params += ["--variant", vrn_file]
            if assoc_files.get("dbsnp"):
                params += ["--dbsnp", assoc_files["dbsnp"]]
            broad_runner.new_resources("gatk-haplotype")
            cores = dd.get_cores(data)
            if cores > 1:
                # GATK performs poorly with memory usage when parallelizing
                # with a large number of cores but makes use of extra memory,
                # so we cap at 6 cores.
                # See issue #1565 for discussion
                params += ["-nt", str(min(6, cores))]
                memscale = {"magnitude": 0.9 * cores, "direction": "increase"}
            else:
                memscale = None
            broad_runner.run_gatk(params, memscale=memscale)
    return vcfutils.bgzip_and_index(out_file, data["config"])
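The `memscale` dictionary built above asks the GATK runner to grow its base JVM allocation by roughly 0.9x per available core. How such a specification might translate into an `-Xmx` value, shown illustratively only; the real adjustment happens inside bcbio's broad/config_utils helpers:

def scale_xmx(base_mb, memscale):
    # Apply a memscale spec such as {"magnitude": 0.9 * cores, "direction": "increase"}
    # to a base per-core allocation in megabytes.
    if not memscale:
        return "-Xmx%dm" % base_mb
    factor = memscale["magnitude"]
    if memscale["direction"] == "decrease":
        factor = 1.0 / factor
    return "-Xmx%dm" % int(base_mb * factor)

print(scale_xmx(3000, {"magnitude": 0.9 * 8, "direction": "increase"}))  # -Xmx21600m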
Code Example #4
def _call_hla(hla_fq, out_dir, data):
    """Run OptiType HLA calling for a specific fastq input.
    """
    bin_dir = os.path.dirname(os.path.realpath(sys.executable))
    out_dir = utils.safe_makedir(out_dir)
    with tx_tmpdir(data, os.path.dirname(out_dir)) as tx_out_dir:
        config_file = os.path.join(tx_out_dir, "config.ini")
        with open(config_file, "w") as out_handle:
            razers3 = os.path.join(bin_dir, "razers3")
            if not os.path.exists(razers3):
                raise ValueError("Could not find razers3 executable at %s" %
                                 (razers3))
            out_handle.write(
                CONFIG_TMPL.format(razers3=razers3, cores=dd.get_cores(data)))
        resources = config_utils.get_resources("optitype", data["config"])
        if resources.get("options"):
            opts = " ".join([str(x) for x in resources["options"]])
        else:
            opts = ""
        cmd = ("OptiTypePipeline.py -v --dna {opts} -o {tx_out_dir} "
               "-i {hla_fq} -c {config_file}")
        do.run(cmd.format(**locals()), "HLA typing with OptiType")
        for outf in os.listdir(tx_out_dir):
            shutil.move(os.path.join(tx_out_dir, outf),
                        os.path.join(out_dir, outf))
    out_file = glob.glob(os.path.join(out_dir, "*", "*_result.tsv"))
    assert len(
        out_file
    ) == 1, "Expected one result file for OptiType, found %s" % out_file
    return out_file[0]
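`CONFIG_TMPL` is referenced but not shown. A plausible shape, guessing at OptiType's standard `config.ini` sections with placeholders for the razers3 path and thread count (the section and key names here are assumptions, not the project's verbatim template):

# Assumed shape only -- the real template ships with bcbio/OptiType.
CONFIG_TMPL = """
[mapping]
razers3={razers3}
threads={cores}

[ilp]
solver=glpk
threads=1

[behavior]
deletebam=true
unpaired_weight=0
use_discordant=false
"""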
Code Example #5
File: cnvkit.py Project: chris-pepin/bcbio-nextgen
def _cnvkit_segment(cnr_file, cov_interval, data):
    """Perform segmentation and copy number calling on normalized inputs
    """
    out_file = "%s.cns" % os.path.splitext(cnr_file)[0]
    if not utils.file_uptodate(out_file, cnr_file):
        with file_transaction(data, out_file) as tx_out_file:
            if not _cna_has_values(cnr_file):
                with open(tx_out_file, "w") as out_handle:
                    out_handle.write(
                        "chromosome\tstart\tend\tgene\tlog2\tprobes\tCN1\tCN2\tbaf\tweight\n"
                    )
            else:
                cmd = [
                    _get_cmd(), "segment", "-p",
                    str(dd.get_cores(data)), "-o", tx_out_file, cnr_file
                ]
                small_vrn_files = _compatible_small_variants(data)
                if len(small_vrn_files) > 0 and _cna_has_values(
                        cnr_file) and cov_interval != "genome":
                    cmd += ["-v", small_vrn_files[0]]
                if cov_interval == "genome":
                    cmd += ["--threshold", "0.00001"]
                # preferentially use conda installed Rscript
                export_cmd = (
                    "%s && export TMPDIR=%s && " %
                    (utils.get_R_exports(), os.path.dirname(tx_out_file)))
                do.run(export_cmd + " ".join(cmd), "CNVkit segment")
    return out_file
Code Example #6
File: optitype.py Project: chapmanb/bcbio-nextgen
def _call_hla(hla_fq, out_dir, data):
    """Run OptiType HLA calling for a specific fastq input.
    """
    bin_dir = os.path.dirname(os.path.realpath(sys.executable))
    out_dir = utils.safe_makedir(out_dir)
    with tx_tmpdir(data, os.path.dirname(out_dir)) as tx_out_dir:
        config_file = os.path.join(tx_out_dir, "config.ini")
        with open(config_file, "w") as out_handle:
            razers3 = os.path.join(bin_dir, "razers3")
            if not os.path.exists(razers3):
                raise ValueError("Could not find razers3 executable at %s" % (razers3))
            out_handle.write(CONFIG_TMPL.format(razers3=razers3, cores=dd.get_cores(data)))
        resources = config_utils.get_resources("optitype", data["config"])
        if resources.get("options"):
            opts = " ".join([str(x) for x in resources["options"]])
        else:
            opts = ""
        cmd = ("OptiTypePipeline.py -v --dna {opts} -o {tx_out_dir} "
                "-i {hla_fq} -c {config_file}")
        do.run(cmd.format(**locals()), "HLA typing with OptiType")
        for outf in os.listdir(tx_out_dir):
            shutil.move(os.path.join(tx_out_dir, outf), os.path.join(out_dir, outf))
    out_file = glob.glob(os.path.join(out_dir, "*", "*_result.tsv"))
    assert len(out_file) == 1, "Expected one result file for OptiType, found %s" % out_file
    return out_file[0]
Code Example #7
File: gatkjoint.py Project: chapmanb/bcbio-nextgen
def _run_genomicsdb_import(vrn_files, region, out_file, data):
    """Create a GenomicsDB reference for all the variation files: GATK4.

    Not yet tested at scale, need to explore --batchSize to reduce memory
    usage if needed.

    Does not support transactional directories yet, since
    GenomicsDB databases cannot be moved to new locations. We try to
    identify half-finished databases and restart:
https://gatkforums.broadinstitute.org/gatk/discussion/10061/using-genomicsdbimport-to-prepare-gvcfs-for-input-to-genotypegvcfs-in-gatk4

    Known issue -- Genomics DB workspace path core dumps on longer paths:
    (std::string::compare(char const*))
    """
    out_dir = "%s_genomicsdb" % utils.splitext_plus(out_file)[0]
    if not os.path.exists(out_dir) or _incomplete_genomicsdb(out_dir):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        with utils.chdir(os.path.dirname(out_file)):
            with file_transaction(data, out_dir) as tx_out_dir:
                broad_runner = broad.runner_from_config(data["config"])
                cores = dd.get_cores(data)
                params = ["-T", "GenomicsDBImport",
                          "--reader-threads", str(cores),
                          "--genomicsdb-workspace-path", os.path.relpath(out_dir, os.getcwd()),
                          "-L", bamprep.region_to_gatk(region)]
                for vrn_file in vrn_files:
                    vcfutils.bgzip_and_index(vrn_file, data["config"])
                    params += ["--variant", vrn_file]
                memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
                broad_runner.run_gatk(params, memscale=memscale)
    return out_dir
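`_incomplete_genomicsdb` is what enables the restart-on-half-finished-database behaviour described in the docstring. A hypothetical heuristic with the same intent (the sentinel file name is an assumption; bcbio's actual check may differ):

import os

def _incomplete_genomicsdb(out_dir, sentinel="callset.json"):
    # Treat the workspace as unfinished when the callset metadata written
    # near the end of GenomicsDBImport is missing anywhere under out_dir.
    for _root, _dirs, files in os.walk(out_dir):
        if sentinel in files:
            return False
    return True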
Code Example #8
def _run_genomicsdb_import(vrn_files, region, out_file, data):
    """Create a GenomicsDB reference for all the variation files: GATK4.

    Not yet tested at scale, need to explore --batchSize to reduce memory
    usage if needed.

    Does not support transactional directories yet, since
    GenomicsDB databases cannot be moved to new locations. We try to
    identify half-finished databases and restart:
https://gatkforums.broadinstitute.org/gatk/discussion/10061/using-genomicsdbimport-to-prepare-gvcfs-for-input-to-genotypegvcfs-in-gatk4

    Known issue -- Genomics DB workspace path core dumps on longer paths:
    (std::string::compare(char const*))
    """
    out_dir = "%s_genomicsdb" % utils.splitext_plus(out_file)[0]
    if not os.path.exists(out_dir) or _incomplete_genomicsdb(out_dir):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        with utils.chdir(os.path.dirname(out_file)):
            with file_transaction(data, out_dir) as tx_out_dir:
                broad_runner = broad.runner_from_config(data["config"])
                cores = dd.get_cores(data)
                params = ["-T", "GenomicsDBImport",
                          "--reader-threads", str(cores),
                          "--genomicsdb-workspace-path", os.path.relpath(out_dir, os.getcwd()),
                          "-L", bamprep.region_to_gatk(region)]
                for vrn_file in vrn_files:
                    vcfutils.bgzip_and_index(vrn_file, data["config"])
                    params += ["--variant", vrn_file]
                memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
                broad_runner.run_gatk(params, memscale=memscale)
    return out_dir
Code Example #9
def _get_snpeff_cmd(cmd_name, datadir, data, out_file):
    """Retrieve snpEff base command line.
    """
    resources = config_utils.get_resources("snpeff", data["config"])
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx3g"])
    # scale by cores, defaulting to 2x base usage to ensure we have enough memory
    # for single core runs to use with human genomes.
    # Sets a maximum amount of memory to avoid core dumps exceeding 32Gb
    # We shouldn't need that much memory for snpEff, so avoid issues
    # https://www.elastic.co/guide/en/elasticsearch/guide/current/heap-sizing.html#compressed_oops
    jvm_opts = config_utils.adjust_opts(
        jvm_opts, {
            "algorithm": {
                "memory_adjust": {
                    "direction": "increase",
                    "maximum": "30000M",
                    "magnitude": max(2, dd.get_cores(data))
                }
            }
        })
    memory = " ".join(jvm_opts)
    snpeff = config_utils.get_program("snpEff", data["config"])
    java_args = "-Djava.io.tmpdir=%s" % utils.safe_makedir(
        os.path.join(os.path.dirname(out_file), "tmp"))
    export = "unset JAVA_HOME && export PATH=%s:$PATH && " % (
        utils.get_java_binpath())
    cmd = "{export} {snpeff} {memory} {java_args} {cmd_name} -dataDir {datadir}"
    return cmd.format(**locals())
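The `memory_adjust` block asks `config_utils.adjust_opts` to multiply the `-Xmx` setting by `magnitude` while clamping it at `maximum`. A simplified stand-in showing that arithmetic (not the bcbio implementation itself):

def adjust_xmx(jvm_opts, magnitude, maximum_mb):
    # Scale any -Xmx option by `magnitude`, clamped at `maximum_mb` megabytes.
    adjusted = []
    for opt in jvm_opts:
        if opt.startswith("-Xmx"):
            value = opt[4:].lower()
            base_mb = int(value[:-1]) * 1024 if value.endswith("g") else int(value.rstrip("m"))
            opt = "-Xmx%dm" % min(int(base_mb * magnitude), maximum_mb)
        adjusted.append(opt)
    return adjusted

print(adjust_xmx(["-Xms750m", "-Xmx3g"], magnitude=4, maximum_mb=30000))
# ['-Xms750m', '-Xmx12288m']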
Code Example #10
File: gatkjoint.py Project: wshands/bcbio-nextgen
def _run_genotype_gvcfs(data, region, vrn_files, ref_file, out_file):
    """Performs genotyping of gVCFs into final VCF files.
    """
    if not utils.file_exists(out_file):
        broad_runner = broad.runner_from_config(data["config"])
        with file_transaction(data, out_file) as tx_out_file:
            assoc_files = tz.get_in(("genome_resources", "variation"), data,
                                    {})
            if not assoc_files: assoc_files = {}
            params = [
                "-T", "GenotypeGVCFs", "-R", ref_file, "-o", tx_out_file, "-L",
                bamprep.region_to_gatk(region), "--max_alternate_alleles", "4"
            ]
            for vrn_file in vrn_files:
                params += ["--variant", vrn_file]
            if assoc_files.get("dbsnp"):
                params += ["--dbsnp", assoc_files["dbsnp"]]
            broad_runner.new_resources("gatk-haplotype")
            cores = dd.get_cores(data)
            if cores > 1:
                # GATK performs poorly with memory usage when parallelizing
                # with a large number of cores but makes use of extra memory,
                # so we cap at 6 cores.
                # See issue #1565 for discussion
                params += ["-nt", str(min(6, cores))]
                memscale = {"magnitude": 0.9 * cores, "direction": "increase"}
            else:
                memscale = None
            broad_runner.run_gatk(params, memscale=memscale)
    return vcfutils.bgzip_and_index(out_file, data["config"])
Code Example #11
def _cnvkit_coverage(data, bed_file, input_type):
    """Calculate coverage in a BED file for CNVkit.
    """
    bam_file = dd.get_align_bam(data)
    work_dir = utils.safe_makedir(os.path.join(_sv_workdir(data), "raw"))
    exts = {".target.bed": ("target", "targetcoverage.cnn"),
            ".antitarget.bed": ("antitarget", "antitargetcoverage.cnn")}
    cnntype = None
    for orig, (cur_cnntype, ext) in exts.items():
        if bed_file.endswith(orig):
            cnntype = cur_cnntype
            break
    if cnntype is None:
        assert bed_file.endswith(".bed"), "Unexpected BED file extension for coverage %s" % bed_file
        cnntype = ""
    base, base_old = _bam_to_outbase(bam_file, work_dir, data)
    out_file = "%s.%s" % (base, ext)
    out_file_old = "%s.%s" % (base_old, ext)
    # back compatible with previous runs to avoid re-calculating
    if utils.file_exists(out_file_old):
        out_file = out_file_old
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            cmd = [_get_cmd(), "coverage", "-p", str(dd.get_cores(data)), bam_file, bed_file, "-o", tx_out_file]
            do.run(_prep_cmd(cmd, tx_out_file), "CNVkit coverage")
    return {"itype": input_type, "file": out_file, "bam": bam_file, "cnntype": cnntype,
            "sample": dd.get_sample_name(data)}
Code Example #12
File: gatkjoint.py Project: fishinwind/bcbio-nextgen
def _run_genotype_gvcfs_gatk3(data, region, vrn_files, ref_file, out_file):
    """Performs genotyping of gVCFs into final VCF files.
    """
    if not utils.file_exists(out_file):
        broad_runner = broad.runner_from_config(data["config"])
        with file_transaction(data, out_file) as tx_out_file:
            assoc_files = tz.get_in(("genome_resources", "variation"), data, {})
            if not assoc_files: assoc_files = {}
            params = ["-T", "GenotypeGVCFs",
                      "-R", ref_file, "-o", tx_out_file,
                      "-L", bamprep.region_to_gatk(region),
                      "--max_alternate_alleles", "4"]
            for vrn_file in vrn_files:
                params += ["--variant", vrn_file]
            if assoc_files.get("dbsnp"):
                params += ["--dbsnp", assoc_files["dbsnp"]]
            broad_runner.new_resources("gatk-haplotype")
            cores = dd.get_cores(data)
            if cores > 1:
                # GATK performs poorly with memory usage when parallelizing
                # with a large number of cores but makes use of extra memory,
                # so we cap at 6 cores.
                # See issue #1565 for discussion
                # Recent GATK 3.x versions also have race conditions with multiple
                # threads, so limit to 1 and keep memory available
                # https://gatkforums.broadinstitute.org/wdl/discussion/8718/concurrentmodificationexception-in-gatk-3-7-genotypegvcfs
                # params += ["-nt", str(min(6, cores))]
                memscale = {"magnitude": 0.9 * cores, "direction": "increase"}
            else:
                memscale = None
            broad_runner.run_gatk(params, memscale=memscale)
    return vcfutils.bgzip_and_index(out_file, data["config"])
Code Example #13
File: gatkjoint.py Project: fishinwind/bcbio-nextgen
def _run_genomicsdb_import(vrn_files, region, out_file, data):
    """Create a GenomicsDB reference for all the variation files: GATK4.

    Not yet tested at scale, need to explore --batchSize to reduce memory
    usage if needed.

    XXX Does not support transactional directories yet, since
    GenomicsDB databases cannot be moved to new locations. We need to
    explore options to identify half-finished databases and restart:
https://gatkforums.broadinstitute.org/gatk/discussion/10061/using-genomicsdbimport-to-prepare-gvcfs-for-input-to-genotypegvcfs-in-gatk4
    """
    out_dir = "%s_genomicsdb" % utils.splitext_plus(out_file)[0]
    if not os.path.exists(out_dir):
        with file_transaction(data, out_dir) as tx_out_dir:
            broad_runner = broad.runner_from_config(data["config"])
            cores = dd.get_cores(data)
            params = ["-T", "GenomicsDBImport",
                      "--readerThreads", str(cores),
                      "--genomicsDBWorkspace", out_dir,
                      "-L", bamprep.region_to_gatk(region)]
            for vrn_file in vrn_files:
                params += ["--variant", vrn_file]
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            broad_runner.run_gatk(params, memscale=memscale)
    return out_dir
Code Example #14
File: cnvkit.py Project: druvus/bcbio-nextgen
def _run_cnvkit_shared(items,
                       test_bams,
                       background_bams,
                       work_dir,
                       background_name=None):
    """Shared functionality to run CNVkit, parallelizing over multiple BAM files.
    """
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))

    background_cnn = os.path.join(
        raw_work_dir,
        "%s_background.cnn" % (background_name if background_name else "flat"))
    ckouts = []
    for test_bam in test_bams:
        out_base = _bam_to_outbase(test_bam, raw_work_dir)
        ckouts.append({
            "cnr": "%s.cns" % out_base,
            "cns": "%s.cns" % out_base,
            "back_cnn": background_cnn
        })
    if not utils.file_exists(ckouts[0]["cnr"]):
        data = items[0]
        cov_interval = dd.get_coverage_interval(data)
        raw_target_bed, access_bed = _get_target_access_files(
            cov_interval, data, work_dir)
        # bail out if we ended up with no regions
        if not utils.file_exists(raw_target_bed):
            return {}
        raw_target_bed = annotate.add_genes(raw_target_bed, data)
        parallel = {
            "type": "local",
            "cores": dd.get_cores(data),
            "progs": ["cnvkit"]
        }
        target_bed, antitarget_bed = _cnvkit_targets(raw_target_bed,
                                                     access_bed, cov_interval,
                                                     raw_work_dir, data)

        def _bam_to_itype(bam):
            return "background" if bam in background_bams else "evaluate"

        split_cnns = run_multicore(
            _cnvkit_coverage,
            [(bam, bed, _bam_to_itype(bam), raw_work_dir, data)
             for bam in test_bams + background_bams
             for bed in _split_bed(target_bed, data) +
             _split_bed(antitarget_bed, data)], data["config"], parallel)
        coverage_cnns = _merge_coverage(split_cnns, data)
        background_cnn = _cnvkit_background(
            [x["file"] for x in coverage_cnns if x["itype"] == "background"],
            background_cnn, target_bed, antitarget_bed, data)
        fixed_cnrs = run_multicore(
            _cnvkit_fix, [(cnns, background_cnn, data) for cnns in tz.groupby(
                "bam", [x for x in coverage_cnns
                        if x["itype"] == "evaluate"]).values()],
            data["config"], parallel)
        called_segs = run_multicore(_cnvkit_segment, [(cnr, cov_interval, data)
                                                      for cnr in fixed_cnrs],
                                    data["config"], parallel)
    return ckouts
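Several of these examples fan work out through `run_multicore`, passing a list of argument tuples plus a `parallel` spec. A conceptual sketch of that contract (the real helper goes through bcbio's multiprocessing/IPython machinery and may also flatten per-call results; `config` is unused in this sketch):

from multiprocessing import Pool

def run_multicore(fn, arg_tuples, config, parallel):
    # Each tuple becomes one call to fn, run across the requested cores.
    cores = parallel.get("cores", 1)
    if cores > 1:
        with Pool(cores) as pool:
            return pool.starmap(fn, arg_tuples)
    return [fn(*args) for args in arg_tuples]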
Code Example #15
def _safe_run_theta(input_file, out_dir, output_ext, args, data):
    """Run THetA, catching and continuing on any errors.
    """
    out_file = os.path.join(out_dir, _split_theta_ext(input_file) + output_ext)
    skip_file = out_file + ".skipped"
    if utils.file_exists(skip_file):
        return None
    if not utils.file_exists(out_file):
        with file_transaction(data, out_dir) as tx_out_dir:
            utils.safe_makedir(tx_out_dir)
            cmd = _get_cmd("RunTHetA.py") + args + \
                  [input_file, "--NUM_PROCESSES", dd.get_cores(data),
                   "--FORCE", "-d", tx_out_dir]
            try:
                do.run(cmd, "Run THetA to calculate purity", log_error=False)
            except subprocess.CalledProcessError as msg:
                if ("Number of intervals must be greater than 1" in str(msg) or
                        "This sample isn't a good candidate for THetA analysis"
                        in str(msg)):
                    with open(
                            os.path.join(tx_out_dir,
                                         os.path.basename(skip_file)),
                            "w") as out_handle:
                        out_handle.write("Expected TheTA failure, skipping")
                    return None
                else:
                    raise
Code Example #16
File: gatkfilter.py Project: boratonAJ/bcbio-nextgen
def _run_vqsr(in_file, ref_file, vrn_files, sensitivity_cutoff, filter_type, data):
    """Run variant quality score recalibration.
    """
    cutoffs = ["100.0", "99.99", "99.98", "99.97", "99.96", "99.95", "99.94", "99.93", "99.92", "99.91",
               "99.9", "99.8", "99.7", "99.6", "99.5", "99.0", "98.0", "90.0"]
    if sensitivity_cutoff not in cutoffs:
        cutoffs.append(sensitivity_cutoff)
        cutoffs.sort()
    broad_runner = broad.runner_from_config(data["config"])
    base = utils.splitext_plus(in_file)[0]
    recal_file = "%s.recal" % base
    tranches_file = "%s.tranches" % base
    if not utils.file_exists(recal_file):
        with file_transaction(data, recal_file, tranches_file) as (tx_recal, tx_tranches):
            params = ["-T", "VariantRecalibrator",
                      "-R", ref_file,
                      "--input", in_file,
                      "--mode", filter_type,
                      "--recal_file", tx_recal,
                      "--tranches_file", tx_tranches]
            for cutoff in cutoffs:
                params += ["-tranche", str(cutoff)]
            params += _get_vqsr_training(filter_type, vrn_files)
            for a in _get_vqsr_annotations(filter_type):
                params += ["-an", a]
            cores = dd.get_cores(data)
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            try:
                broad_runner.new_resources("gatk-vqsr")
                broad_runner.run_gatk(params, log_error=False, memscale=memscale)
            except:  # Can fail to run if not enough values are present to train.
                return None, None
    return recal_file, tranches_file
Code Example #17
def _cnvkit_segment(cnr_file, cov_interval, data, items, out_file=None):
    """Perform segmentation and copy number calling on normalized inputs
    """
    if not out_file:
        out_file = "%s.cns" % os.path.splitext(cnr_file)[0]
    if not utils.file_uptodate(out_file, cnr_file):
        with file_transaction(data, out_file) as tx_out_file:
            if not _cna_has_values(cnr_file):
                with open(tx_out_file, "w") as out_handle:
                    out_handle.write("chromosome\tstart\tend\tgene\tlog2\tprobes\tCN1\tCN2\tbaf\tweight\n")
            else:
                cmd = [_get_cmd(), "segment", "-p", str(dd.get_cores(data)),
                       "-o", tx_out_file, cnr_file]
                small_vrn_files = _compatible_small_variants(data, items)
                if len(small_vrn_files) > 0 and _cna_has_values(cnr_file) and cov_interval != "genome":
                    cmd += ["--vcf", small_vrn_files[0].name, "--sample-id", small_vrn_files[0].sample]
                    if small_vrn_files[0].normal:
                        cmd += ["--normal-id", small_vrn_files[0].normal]
                if cov_interval == "genome":
                    cmd += ["--threshold", "0.00001"]
                # For tumors, remove very low normalized regions, avoiding upcaptured noise
                # https://github.com/chapmanb/bcbio-nextgen/issues/2171#issuecomment-348333650
                paired = vcfutils.get_paired(items)
                if paired:
                    cmd += ["--drop-low-coverage"]
                # preferentially use conda installed Rscript
                export_cmd = ("%s && export TMPDIR=%s && "
                              % (utils.get_R_exports(), os.path.dirname(tx_out_file)))
                do.run(export_cmd + " ".join(cmd), "CNVkit segment")
    return out_file
Code Example #18
File: optitype.py Project: naumenko-sa/bcbio-nextgen
def _call_hla(hla_fq, out_dir, data):
    """Run OptiType HLA calling for a specific fastq input."""
    out_dir = utils.safe_makedir(out_dir)
    with tx_tmpdir(data, os.path.dirname(out_dir)) as tx_out_dir:
        config_file = os.path.join(tx_out_dir, "config.ini")
        with open(config_file, "w") as out_handle:
            razers3 = os.path.realpath(utils.which("razers3"))
            if not os.path.exists(razers3):
                raise ValueError(
                    f"Could not find razers3 executable at {razers3}")
            out_handle.write(
                CONFIG_TMPL.format(razers3=razers3, cores=dd.get_cores(data)))
        resources = config_utils.get_resources("optitype", data["config"])
        if resources.get("options"):
            opts = " ".join([str(x) for x in resources["options"]])
        else:
            opts = ""
        # OptiType looks for its reference data in ./data, which lives under env/python3.6 rather than tools/bin
        optitype = os.path.realpath(utils.which("OptiTypePipeline.py"))
        # technically, OptiType is not a python package, so conda cannot set up its shebang properly
        python_bin = os.path.join(os.path.dirname(optitype), "python")
        cmd = f"{python_bin} {optitype} -v --dna {opts} -o {tx_out_dir} --enumerate 10 "\
              f" -i {hla_fq} -c {config_file}"
        do.run(cmd, "HLA typing with OptiType")
        for outf in os.listdir(tx_out_dir):
            shutil.move(os.path.join(tx_out_dir, outf),
                        os.path.join(out_dir, outf))
    out_file = glob.glob(os.path.join(out_dir, "*", "*_result.tsv"))
    assert len(
        out_file
    ) == 1, "Expected one result file for OptiType, found %s" % out_file
    return out_file[0]
Code Example #19
def _run_genotype_gvcfs_genomicsdb(genomics_db, region, out_file, data):
    """GenotypeGVCFs from a merged GenomicsDB input: GATK4.

    No core scaling -- not yet supported in GATK4.
    """
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            broad_runner = broad.runner_from_config(data["config"])
            # see issue https://github.com/bcbio/bcbio-nextgen/issues/3263
            # for why --genomicsdb-use-vcf-codec is necessary
            params = [
                "-T", "GenotypeGVCFs", "--variant",
                "gendb://%s" % genomics_db, "-R",
                dd.get_ref_file(data), "--genomicsdb-use-vcf-codec",
                "--output", tx_out_file, "-L",
                bamprep.region_to_gatk(region)
            ]
            params += ["-ploidy", str(ploidy.get_ploidy([data], region))]
            # Avoid slow genotyping runtimes with improved quality score calculation in GATK4
            # https://gatkforums.broadinstitute.org/gatk/discussion/11471/performance-troubleshooting-tips-for-genotypegvcfs/p1
            resources = config_utils.get_resources("gatk", data["config"])
            params += [str(x) for x in resources.get("options", [])]
            cores = dd.get_cores(data)
            memscale = {
                "magnitude": 0.9 * cores,
                "direction": "increase"
            } if cores > 1 else None
            broad_runner.run_gatk(params, memscale=memscale)
    return vcfutils.bgzip_and_index(out_file, data["config"])
Code Example #20
File: gridss.py Project: chapmanb/bcbio-nextgen
def _run_gridss(inputs, background, work_dir):
    out_file = os.path.join(work_dir, "%s-gridss.sv.vcf" % (dd.get_batch(inputs[0]) or
                                                            dd.get_sample_name(inputs[0])))
    if not utils.file_exists(out_file) and not utils.file_exists(out_file + ".gz"):
        with file_transaction(inputs[0], out_file) as tx_out_file:
            htsjdk_opts = ["-Dsamjdk.create_index=true", "-Dsamjdk.use_async_io_read_samtools=true",
                           "-Dsamjdk.use_async_io_write_samtools=true", "-Dsamjdk.use_async_io_write_tribble=true"]
            cores = dd.get_cores(inputs[0])
            resources = config_utils.get_resources("gridss", inputs[0]["config"])
            jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx4g"])
            jvm_opts = config_utils.adjust_opts(jvm_opts, {"algorithm": {"memory_adjust":
                                                                         {"direction": "increase",
                                                                          "magnitude": cores}}})
            jvm_opts = _finalize_memory(jvm_opts)
            tx_ref_file = _setup_reference_files(inputs[0], os.path.dirname(tx_out_file))
            blacklist_bed = sshared.prepare_exclude_file(inputs + background, out_file)
            cmd = ["gridss"] + jvm_opts + htsjdk_opts + ["gridss.CallVariants"] + \
                  ["THREADS=%s" % cores,
                   "TMP_DIR=%s" % os.path.dirname(tx_out_file), "WORKING_DIR=%s" % os.path.dirname(tx_out_file),
                   "OUTPUT=%s" % tx_out_file,
                   "ASSEMBLY=%s" % tx_out_file.replace(".sv.vcf", ".gridss.assembly.bam"),
                   "REFERENCE_SEQUENCE=%s" % tx_ref_file, "BLACKLIST=%s" % blacklist_bed]
            for data in inputs + background:
                cmd += ["INPUT=%s" % dd.get_align_bam(data), "INPUT_LABEL=%s" % dd.get_sample_name(data)]
            exports = utils.local_path_export()
            cmd = exports + " ".join(cmd)
            do.run(cmd, "GRIDSS SV analysis")
    return vcfutils.bgzip_and_index(out_file, inputs[0]["config"])
Code Example #21
File: gatkjoint.py Project: chapmanb/bcbio-nextgen
def _run_genotype_gvcfs_gatk3(data, region, vrn_files, ref_file, out_file):
    """Performs genotyping of gVCFs into final VCF files.
    """
    if not utils.file_exists(out_file):
        broad_runner = broad.runner_from_config(data["config"])
        with file_transaction(data, out_file) as tx_out_file:
            assoc_files = tz.get_in(("genome_resources", "variation"), data, {})
            if not assoc_files: assoc_files = {}
            params = ["-T", "GenotypeGVCFs",
                      "-R", ref_file, "-o", tx_out_file,
                      "-L", bamprep.region_to_gatk(region),
                      "--max_alternate_alleles", "4"]
            for vrn_file in vrn_files:
                params += ["--variant", vrn_file]
            if assoc_files.get("dbsnp"):
                params += ["--dbsnp", assoc_files["dbsnp"]]
            broad_runner.new_resources("gatk-haplotype")
            cores = dd.get_cores(data)
            if cores > 1:
                # GATK performs poorly with memory usage when parallelizing
                # with a large number of cores but makes use of extra memory,
                # so we cap at 6 cores.
                # See issue #1565 for discussion
                # Recent GATK 3.x versions also have race conditions with multiple
                # threads, so limit to 1 and keep memory available
                # https://gatkforums.broadinstitute.org/wdl/discussion/8718/concurrentmodificationexception-in-gatk-3-7-genotypegvcfs
                # params += ["-nt", str(min(6, cores))]
                memscale = {"magnitude": 0.9 * cores, "direction": "increase"}
            else:
                memscale = None
            broad_runner.run_gatk(params, memscale=memscale, parallel_gc=True)
    return vcfutils.bgzip_and_index(out_file, data["config"])
Code Example #22
def fixrg(in_bam, names, ref_file, dirs, data):
    """Fix read group in a file, using samtools addreplacerg.

    addreplacerg does not remove the old read group, causing confusion when
    checking. We use reheader to work around this.
    """
    work_dir = utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), "bamclean",
                     dd.get_sample_name(data)))
    out_file = os.path.join(
        work_dir,
        "%s-fixrg.bam" % utils.splitext_plus(os.path.basename(in_bam))[0])
    if not utils.file_exists(out_file):
        out_file = os.path.join(work_dir,
                                "%s-fixrg.bam" % dd.get_sample_name(data))
    if not utils.file_uptodate(out_file, in_bam):
        with file_transaction(data, out_file) as tx_out_file:
            rg_info = novoalign.get_rg_info(names)
            new_header = "%s-header.txt" % os.path.splitext(out_file)[0]
            cores = dd.get_cores(data)
            do.run(
                "samtools view -H {in_bam} | grep -v ^@RG > {new_header}".
                format(**locals()),
                "Create empty RG header: %s" % dd.get_sample_name(data))
            cmd = (
                "samtools reheader {new_header} {in_bam} | "
                "samtools addreplacerg -@ {cores} -r '{rg_info}' -m overwrite_all -O bam -o {tx_out_file} -"
            )
            do.run(cmd.format(**locals()),
                   "Fix read groups: %s" % dd.get_sample_name(data))
    return out_file
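The `rg_info` string interpolated into `samtools addreplacerg -r '...'` is a single SAM `@RG` header line. A hedged example of what `novoalign.get_rg_info` is expected to produce (the exact fields included are an assumption):

names = {"rg": "sample1", "sample": "sample1", "pl": "illumina",
         "pu": "lane1", "lb": "lib1"}  # placeholder rgnames-style values
rg_info = r"@RG\tID:{rg}\tPL:{pl}\tPU:{pu}\tSM:{sample}\tLB:{lb}".format(**names)
# -> @RG\tID:sample1\tPL:illumina\tPU:lane1\tSM:sample1\tLB:lib1
# samtools addreplacerg -r '<rg_info>' -m overwrite_all then stamps this read
# group onto every read in the output BAM.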
Code Example #23
File: gatkfilter.py Project: gberriz/bcbio-nextgen
def _run_vqsr(in_file, ref_file, vrn_files, sensitivity_cutoff, filter_type,
              data):
    """Run variant quality score recalibration.
    """
    cutoffs = [
        "100.0", "99.99", "99.98", "99.97", "99.96", "99.95", "99.94", "99.93",
        "99.92", "99.91", "99.9", "99.8", "99.7", "99.6", "99.5", "99.0",
        "98.0", "90.0"
    ]
    if sensitivity_cutoff not in cutoffs:
        cutoffs.append(sensitivity_cutoff)
        cutoffs.sort()
    broad_runner = broad.runner_from_config(data["config"])
    gatk_type = broad_runner.gatk_type()
    base = utils.splitext_plus(in_file)[0]
    recal_file = ("%s-vqsrrecal.vcf.gz" %
                  base) if gatk_type == "gatk4" else ("%s.recal" % base)
    tranches_file = "%s.tranches" % base
    plot_file = "%s-plots.R" % base
    if not utils.file_exists(recal_file):
        with file_transaction(data, recal_file, tranches_file,
                              plot_file) as (tx_recal, tx_tranches,
                                             tx_plot_file):
            params = [
                "-T", "VariantRecalibrator", "-R", ref_file, "--mode",
                filter_type, "--tranches_file", tx_tranches, "--rscript_file",
                tx_plot_file
            ]
            if gatk_type == "gatk4":
                params += ["--variant", in_file, "--output", tx_recal]
            else:
                params += ["--input", in_file, "--recal_file", tx_recal]
            params += _get_vqsr_training(filter_type, vrn_files, gatk_type)
            resources = config_utils.get_resources("gatk_variant_recalibrator",
                                                   data["config"])
            opts = resources.get("options", [])
            if not opts:
                for cutoff in cutoffs:
                    opts += ["-tranche", str(cutoff)]
                for a in _get_vqsr_annotations(filter_type):
                    opts += ["-an", a]
            params += opts
            cores = dd.get_cores(data)
            memscale = {
                "magnitude": 0.9 * cores,
                "direction": "increase"
            } if cores > 1 else None
            try:
                broad_runner.new_resources("gatk-vqsr")
                broad_runner.run_gatk(params,
                                      log_error=False,
                                      memscale=memscale,
                                      parallel_gc=True)
            except:  # Can fail to run if not enough values are present to train.
                return None, None
    if gatk_type == "gatk4":
        vcfutils.bgzip_and_index(recal_file, data["config"])
    return recal_file, tranches_file
Code Example #24
File: cnvkit.py Project: elkingtonmcb/bcbio-nextgen
def _run_cnvkit_shared(items, test_bams, background_bams, work_dir, background_name=None):
    """Shared functionality to run CNVkit, parallelizing over multiple BAM files.
    """
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))

    background_cnn = os.path.join(raw_work_dir, "%s_background.cnn" % (background_name if background_name else "flat"))
    ckouts = []
    for test_bam in test_bams:
        out_base = _bam_to_outbase(test_bam, raw_work_dir)
        ckouts.append({"cnr": "%s.cns" % out_base, "cns": "%s.cns" % out_base, "back_cnn": background_cnn})
    if not utils.file_exists(ckouts[0]["cnr"]):
        data = items[0]
        cov_interval = dd.get_coverage_interval(data)
        raw_target_bed, access_bed = _get_target_access_files(cov_interval, data, work_dir)
        # bail out if we ended up with no regions
        if not utils.file_exists(raw_target_bed):
            return {}
        raw_target_bed = annotate.add_genes(raw_target_bed, data)
        parallel = {"type": "local", "cores": dd.get_cores(data), "progs": ["cnvkit"]}
        target_bed, antitarget_bed = _cnvkit_targets(raw_target_bed, access_bed, cov_interval, raw_work_dir, data)

        def _bam_to_itype(bam):
            return "background" if bam in background_bams else "evaluate"

        split_cnns = run_multicore(
            _cnvkit_coverage,
            [
                (bam, bed, _bam_to_itype(bam), raw_work_dir, data)
                for bam in test_bams + background_bams
                for bed in _split_bed(target_bed, data) + _split_bed(antitarget_bed, data)
            ],
            data["config"],
            parallel,
        )
        coverage_cnns = _merge_coverage(split_cnns, data)
        background_cnn = _cnvkit_background(
            [x["file"] for x in coverage_cnns if x["itype"] == "background"],
            background_cnn,
            target_bed,
            antitarget_bed,
            data,
        )
        fixed_cnrs = run_multicore(
            _cnvkit_fix,
            [
                (cnns, background_cnn, data)
                for cnns in tz.groupby("bam", [x for x in coverage_cnns if x["itype"] == "evaluate"]).values()
            ],
            data["config"],
            parallel,
        )
        called_segs = run_multicore(
            _cnvkit_segment, [(cnr, cov_interval, data) for cnr in fixed_cnrs], data["config"], parallel
        )
    return ckouts
Code Example #25
File: cnvkit.py Project: chapmanb/bcbio-nextgen
def _cnvkit_segment(cnr_file, cov_interval, data, items, out_file=None, detailed=False):
    """Perform segmentation and copy number calling on normalized inputs
    """
    if not out_file:
        out_file = "%s.cns" % os.path.splitext(cnr_file)[0]
    if not utils.file_uptodate(out_file, cnr_file):
        with file_transaction(data, out_file) as tx_out_file:
            if not _cna_has_values(cnr_file):
                with open(tx_out_file, "w") as out_handle:
                    out_handle.write("chromosome\tstart\tend\tgene\tlog2\tprobes\tCN1\tCN2\tbaf\tweight\n")
            else:
                # Scale cores to avoid memory issues with segmentation
                # https://github.com/etal/cnvkit/issues/346
                if cov_interval == "genome":
                    cores = max(1, dd.get_cores(data) // 2)
                else:
                    cores = dd.get_cores(data)
                cmd = [_get_cmd(), "segment", "-p", str(cores), "-o", tx_out_file, cnr_file]
                small_vrn_files = _compatible_small_variants(data, items)
                if len(small_vrn_files) > 0 and _cna_has_values(cnr_file) and cov_interval != "genome":
                    cmd += ["--vcf", small_vrn_files[0].name, "--sample-id", small_vrn_files[0].sample]
                    if small_vrn_files[0].normal:
                        cmd += ["--normal-id", small_vrn_files[0].normal]
                resources = config_utils.get_resources("cnvkit_segment", data["config"])
                user_options = resources.get("options", [])
                cmd += [str(x) for x in user_options]
                if cov_interval == "genome" and "--threshold" not in user_options:
                    cmd += ["--threshold", "0.00001"]
                # For tumors, remove very low normalized regions, avoiding upcaptured noise
                # https://github.com/bcbio/bcbio-nextgen/issues/2171#issuecomment-348333650
                # unless we want detailed segmentation for downstream tools
                paired = vcfutils.get_paired(items)
                if paired:
                    #if detailed:
                    #    cmd += ["-m", "hmm-tumor"]
                    if "--drop-low-coverage" not in user_options:
                        cmd += ["--drop-low-coverage"]
                # preferentially use conda installed Rscript
                export_cmd = ("%s && export TMPDIR=%s && "
                              % (utils.get_R_exports(), os.path.dirname(tx_out_file)))
                do.run(export_cmd + " ".join(cmd), "CNVkit segment")
    return out_file
Code Example #26
File: purple.py Project: chapmanb/bcbio-nextgen
def _get_jvm_opts(out_file, data):
    """Retrieve Java options, adjusting memory for available cores.
    """
    resources = config_utils.get_resources("purple", data["config"])
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx3500m"])
    jvm_opts = config_utils.adjust_opts(jvm_opts, {"algorithm": {"memory_adjust":
                                                                 {"direction": "increase",
                                                                  "maximum": "30000M",
                                                                  "magnitude": dd.get_cores(data)}}})
    jvm_opts += broad.get_default_jvm_opts(os.path.dirname(out_file))
    return jvm_opts
Code Example #27
File: sambamba.py Project: pansapiens/bcbio-nextgen
def make_command(data, cmd, bam_file, bed_file=None,
                 depth_thresholds=None, max_cov=None, query=None):
    sambamba = config_utils.get_program("sambamba", data["config"], default="sambamba")
    num_cores = dd.get_cores(data)
    target = (" -L " + bed_file) if bed_file else ""
    thresholds = "".join([" -T" + str(d) for d in (depth_thresholds or [])])
    maxcov = (" -C " + str(max_cov)) if max_cov else ""
    if query is None:
        query = "not failed_quality_control and not duplicate and not unmapped"
    return ("{sambamba} {cmd} -t {num_cores} {bam_file} "
            "{target} {thresholds} {maxcov} -F \"{query}\"").format(**locals())
Code Example #28
File: sambamba.py Project: sdwfrost/bcbio-nextgen
def make_command(data, cmd, bam_file, bed_file=None,
                 depth_thresholds=None, max_cov=None, query=None, multicore=True):
    sambamba = config_utils.get_program("sambamba", data["config"], default="sambamba")
    num_cores = dd.get_cores(data) if multicore else 1
    target = (" -L " + bed_file) if bed_file else ""
    thresholds = "".join([" -T" + str(d) for d in (depth_thresholds or [])])
    maxcov = (" -C " + str(max_cov)) if max_cov else ""
    if query is None:
        query = mapped_filter_query + " and not duplicate"
    return ("{sambamba} {cmd} -t {num_cores} {bam_file} "
            "{target} {thresholds} {maxcov} -F \"{query}\"").format(**locals())
Code Example #29
File: cnvkit.py Project: gberriz/bcbio-nextgen
def _run_cnvkit_shared(inputs, backgrounds):
    """Shared functionality to run CNVkit, parallelizing over multiple BAM files.
    """
    work_dir = _sv_workdir(inputs[0])
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
    background_name = dd.get_sample_name(backgrounds[0]) if backgrounds else "flat"
    background_cnn = os.path.join(raw_work_dir, "%s_background.cnn" % (background_name))
    ckouts = []
    for cur_input in inputs:
        cur_raw_work_dir = utils.safe_makedir(os.path.join(_sv_workdir(cur_input), "raw"))
        out_base, out_base_old = _bam_to_outbase(dd.get_align_bam(cur_input), cur_raw_work_dir, cur_input)
        if utils.file_exists(out_base_old + ".cns"):
            out_base = out_base_old
        ckouts.append({"cnr": "%s.cnr" % out_base,
                       "cns": "%s.cns" % out_base,
                       "back_cnn": background_cnn})
    if not utils.file_exists(ckouts[0]["cns"]):
        cov_interval = dd.get_coverage_interval(inputs[0])
        samples_to_run = zip(["background"] * len(backgrounds), backgrounds) + \
                        zip(["evaluate"] * len(inputs), inputs)
        # New style shared SV bins
        if tz.get_in(["depth", "bins", "target"], inputs[0]):
            target_bed = tz.get_in(["depth", "bins", "target"], inputs[0])
            antitarget_bed = tz.get_in(["depth", "bins", "antitarget"], inputs[0])
            raw_coverage_cnns = reduce(operator.add,
                                    [_get_general_coverage(cdata, itype) for itype, cdata in samples_to_run])
        # Back compatible with pre-existing runs
        else:
            target_bed, antitarget_bed = _get_original_targets(inputs[0])
            raw_coverage_cnns = reduce(operator.add,
                                    [_get_original_coverage(cdata, itype) for itype, cdata in samples_to_run])
        # Currently metrics not calculated due to speed and needing re-evaluation
        # We could re-enable with larger truth sets to evaluate background noise
        # But want to reimplement in a more general fashion as part of normalization
        if False:
            coverage_cnns = reduce(operator.add,
                                   [_cnvkit_metrics(cnns, target_bed, antitarget_bed, cov_interval,
                                                    inputs + backgrounds)
                                    for cnns in tz.groupby("bam", raw_coverage_cnns).values()])
            background_cnn = _cnvkit_background(_select_background_cnns(coverage_cnns),
                                                background_cnn, target_bed, antitarget_bed, inputs[0])
        else:
            coverage_cnns = raw_coverage_cnns
            background_cnn = _cnvkit_background([x["file"] for x in coverage_cnns if x["itype"] == "background"],
                                                background_cnn, target_bed, antitarget_bed, inputs[0])
        parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]}
        fixed_cnrs = run_multicore(_cnvkit_fix,
                                   [(cnns, background_cnn, inputs, ckouts) for cnns in
                                    tz.groupby("bam", [x for x in coverage_cnns
                                                       if x["itype"] == "evaluate"]).values()],
                                   inputs[0]["config"], parallel)
        [_cnvkit_segment(cnr, cov_interval, data) for cnr, data in fixed_cnrs]
    return ckouts
Code Example #30
File: regions.py Project: bogdang989/bcbio-nextgen
def _normalize_sv_coverage_cnvkit(group_id, inputs, backgrounds, work_dir,
                                  back_files, out_files):
    """Normalize CNV coverage depths by GC, repeats and background using CNVkit

    - reference: calculates reference backgrounds from normals and pools
      including GC and repeat information
    - fix: Uses background to normalize coverage estimations
    http://cnvkit.readthedocs.io/en/stable/pipeline.html#fix
    """
    from bcbio.structural import cnvkit
    cnns = reduce(operator.add, [[
        tz.get_in(["depth", "bins", "target"], x),
        tz.get_in(["depth", "bins", "antitarget"], x)
    ] for x in backgrounds], [])
    for d in inputs:
        if tz.get_in(["depth", "bins", "target"], d):
            target_bed = tz.get_in(["depth", "bins", "target"], d)
            antitarget_bed = tz.get_in(["depth", "bins", "antitarget"], d)
    input_backs = set(
        filter(lambda x: x is not None,
               [dd.get_background_cnv_reference(d, "cnvkit") for d in inputs]))
    if input_backs:
        assert len(
            input_backs
        ) == 1, "Multiple backgrounds in group: %s" % list(input_backs)
        back_file = list(input_backs)[0]
    else:
        back_file = cnvkit.cnvkit_background(
            cnns,
            os.path.join(work_dir, "background-%s-cnvkit.cnn" % (group_id)),
            backgrounds or inputs, target_bed, antitarget_bed)
    fix_cmd_inputs = []
    for data in inputs:
        work_dir = utils.safe_makedir(
            os.path.join(dd.get_work_dir(data), "structural",
                         dd.get_sample_name(data), "bins"))
        if tz.get_in(["depth", "bins", "target"], data):
            fix_file = os.path.join(
                work_dir, "%s-normalized.cnr" % (dd.get_sample_name(data)))
            fix_cmd_inputs.append((tz.get_in(["depth", "bins", "target"],
                                             data),
                                   tz.get_in(["depth", "bins", "antitarget"],
                                             data), back_file, fix_file, data))
            out_files[dd.get_sample_name(data)] = fix_file
            back_files[dd.get_sample_name(data)] = back_file
    parallel = {
        "type": "local",
        "cores": dd.get_cores(inputs[0]),
        "progs": ["cnvkit"]
    }
    run_multicore(cnvkit.run_fix_parallel, fix_cmd_inputs, inputs[0]["config"],
                  parallel)
    return back_files, out_files
Code Example #31
def _run_qsignature_generator(bam_file, data, out_dir):
    """ Run SignatureGenerator to create normalize vcf that later will be input of qsignature_summary

    :param bam_file: (str) path of the bam_file
    :param data: (list) list containing the all the dictionary
                     for this sample
    :param out_dir: (str) path of the output

    :returns: (dict) dict with the normalize vcf file
    """
    position = dd.get_qsig_file(data)
    mixup_check = dd.get_mixup_check(data)
    if mixup_check and mixup_check.startswith("qsignature"):
        if not position:
            logger.info("There is no qsignature for this species: %s" %
                        tz.get_in(['genome_build'], data))
            return {}
        jvm_opts = "-Xms750m -Xmx2g"
        limit_reads = 20000000
        if mixup_check == "qsignature_full":
            slice_bam = bam_file
            jvm_opts = "-Xms750m -Xmx8g"
            limit_reads = 100000000
        else:
            slice_bam = _slice_chr22(bam_file, data)
        qsig = config_utils.get_program("qsignature", data["config"])
        if not qsig:
            return {}
        utils.safe_makedir(out_dir)
        out_name = os.path.basename(slice_bam).replace("bam", "qsig.vcf")
        out_file = os.path.join(out_dir, out_name)
        log_file = os.path.join(out_dir, "qsig.log")
        cores = dd.get_cores(data)
        base_cmd = ("{qsig} {jvm_opts} "
                    "org.qcmg.sig.SignatureGenerator "
                    "--noOfThreads {cores} "
                    "-log {log_file} -i {position} "
                    "-i {down_file} ")
        if not os.path.exists(out_file):
            down_file = bam.downsample(slice_bam, data, limit_reads)
            if not down_file:
                down_file = slice_bam
            file_qsign_out = "{0}.qsig.vcf".format(down_file)
            do.run(base_cmd.format(**locals()),
                   "qsignature vcf generation: %s" % data["name"][-1])
            if os.path.exists(file_qsign_out):
                with file_transaction(data, out_file) as file_txt_out:
                    shutil.move(file_qsign_out, file_txt_out)
            else:
                raise IOError("File doesn't exist %s" % file_qsign_out)
        return {'qsig_vcf': out_file}
    return {}
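For illustration only, the command template above expands into a single SignatureGenerator invocation; every value below is a hypothetical placeholder rather than anything resolved by the pipeline:

base_cmd = ("{qsig} {jvm_opts} "
            "org.qcmg.sig.SignatureGenerator "
            "--noOfThreads {cores} "
            "-log {log_file} -i {position} "
            "-i {down_file} ")
# placeholder values standing in for what bcbio resolves at runtime
print(base_cmd.format(qsig="qsignature", jvm_opts="-Xms750m -Xmx2g", cores=4,
                      log_file="qc/qsig.log", position="qsignature_positions.vcf",
                      down_file="sample-chr22-downsampled.bam"))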
Code example #32
File: cnvkit.py Project: chapmanb/bcbio-nextgen
def _run_cnvkit_shared_orig(inputs, backgrounds):
    """Original CNVkit implementation with full normalization and segmentation.
    """
    work_dir = _sv_workdir(inputs[0])
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
    background_name = dd.get_sample_name(backgrounds[0]) if backgrounds else "flat"
    background_cnn = os.path.join(raw_work_dir, "%s_background.cnn" % (background_name))
    ckouts = []
    for cur_input in inputs:
        cur_raw_work_dir = utils.safe_makedir(os.path.join(_sv_workdir(cur_input), "raw"))
        out_base, out_base_old = _bam_to_outbase(dd.get_align_bam(cur_input), cur_raw_work_dir, cur_input)
        if utils.file_exists(out_base_old + ".cns"):
            out_base = out_base_old
        ckouts.append({"cnr": "%s.cnr" % out_base,
                       "cns": "%s.cns" % out_base})
    if not utils.file_exists(ckouts[0]["cns"]):
        cov_interval = dd.get_coverage_interval(inputs[0])
        samples_to_run = list(zip(["background"] * len(backgrounds), backgrounds)) + \
                         list(zip(["evaluate"] * len(inputs), inputs))
        # New style shared SV bins
        if tz.get_in(["depth", "bins", "target"], inputs[0]):
            target_bed = tz.get_in(["depth", "bins", "target"], inputs[0])
            antitarget_bed = tz.get_in(["depth", "bins", "antitarget"], inputs[0])
            raw_coverage_cnns = reduce(operator.add,
                                       [_get_general_coverage(cdata, itype) for itype, cdata in samples_to_run])
        # Back compatible with pre-existing runs
        else:
            target_bed, antitarget_bed = _get_original_targets(inputs[0])
            raw_coverage_cnns = reduce(operator.add,
                                       [_get_original_coverage(cdata, itype) for itype, cdata in samples_to_run])
        # Currently metrics not calculated due to speed and needing re-evaluation
        # We could re-enable with larger truth sets to evaluate background noise
        # But want to reimplement in a more general fashion as part of normalization
        if False:
            coverage_cnns = reduce(operator.add,
                                [_cnvkit_metrics(cnns, target_bed, antitarget_bed, cov_interval,
                                                    inputs + backgrounds)
                                    for cnns in tz.groupby("bam", raw_coverage_cnns).values()])
            background_cnn = cnvkit_background(_select_background_cnns(coverage_cnns),
                                                background_cnn, inputs, target_bed, antitarget_bed)
        else:
            coverage_cnns = raw_coverage_cnns
            background_cnn = cnvkit_background([x["file"] for x in coverage_cnns if x["itype"] == "background"],
                                                background_cnn, inputs, target_bed, antitarget_bed)
        parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]}
        fixed_cnrs = run_multicore(_cnvkit_fix,
                                   [(cnns, background_cnn, inputs, ckouts) for cnns in
                                    tz.groupby("bam", [x for x in coverage_cnns
                                                       if x["itype"] == "evaluate"]).values()],
                                   inputs[0]["config"], parallel)
        [_cnvkit_segment(cnr, cov_interval, data, inputs + backgrounds) for cnr, data in fixed_cnrs]
    return ckouts
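The background/evaluate labelling and the reduce-based flattening of per-sample coverage records used above can be shown standalone; the sample names below are invented:

import operator
from functools import reduce

backgrounds = [{"name": "normal1"}]
inputs = [{"name": "tumor1"}, {"name": "tumor2"}]
samples_to_run = list(zip(["background"] * len(backgrounds), backgrounds)) + \
                 list(zip(["evaluate"] * len(inputs), inputs))
# each sample contributes a list of coverage records; reduce concatenates them
coverage_cnns = reduce(operator.add,
                       [[{"itype": itype, "sample": d["name"]}] for itype, d in samples_to_run])
print([c["sample"] for c in coverage_cnns if c["itype"] == "evaluate"])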
Code example #33
File: wham.py Project: elkingtonmcb/bcbio-nextgen
def _run_wham_genotype(in_file, all_bams, coords, data):
    """Run genotyping on a prepped, merged VCF file.
    """
    out_file = "%s-wgts%s" % utils.splitext_plus(in_file)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            cores = dd.get_cores(data)
            ref_file = dd.get_ref_file(data)
            coord_str = bamprep.region_to_gatk(coords)
            cmd = ("WHAM-GRAPHENING -b {in_file} -x {cores} -a {ref_file} -f {all_bams} -r {coord_str} "
                   "> {tx_out_file}")
            do.run(cmd.format(**locals()), "Genotype WHAM: %s" % region.to_safestr(coords))
    return out_file
Code example #34
File: wham.py Project: vhuarui/bcbio-nextgen
def _run_wham_genotype(in_file, all_bams, coords, data):
    """Run genotyping on a prepped, merged VCF file.
    """
    out_file = "%s-wgts%s" % utils.splitext_plus(in_file)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            cores = dd.get_cores(data)
            ref_file = dd.get_ref_file(data)
            coord_str = bamprep.region_to_gatk(coords)
            cmd = ("WHAM-GRAPHENING -b {in_file} -x {cores} -a {ref_file} -f {all_bams} -r {coord_str} "
                   "> {tx_out_file}")
            do.run(cmd.format(**locals()), "Genotype WHAM: %s" % region.to_safestr(coords))
    return out_file
Code example #35
def _run_combine_gvcfs(vrn_files, region, ref_file, out_file, data):
    if not utils.file_exists(out_file):
        broad_runner = broad.runner_from_config(data["config"])
        with file_transaction(data, out_file) as tx_out_file:
            params = ["-T", "CombineGVCFs", "-R", ref_file, "-o", tx_out_file,
                      "-L", bamprep.region_to_gatk(region)]
            for vrn_file in vrn_files:
                params += ["--variant", vrn_file]
            cores = dd.get_cores(data)
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            broad_runner.new_resources("gatk-haplotype")
            broad_runner.run_gatk(params, memscale=memscale)
    return out_file
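The memscale dictionary asks the GATK runner to scale JVM memory by roughly 0.9x per core when more than one core is available. A simplified stand-in for that adjustment (the real scaling lives in bcbio's broad runner and may differ in detail):

def scaled_xmx_mb(base_mb, cores, magnitude_per_core=0.9):
    """Return a core-scaled -Xmx value in MB; unchanged when only one core is used."""
    if cores <= 1:
        return base_mb
    return int(base_mb * magnitude_per_core * cores)

print(scaled_xmx_mb(2500, 8))  # 2500 MB base -> 18000 MB across 8 cores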
Code example #36
def _run_qsignature_generator(bam_file, data, out_dir):
    """ Run SignatureGenerator to create normalize vcf that later will be input of qsignature_summary

    :param bam_file: (str) path of the bam_file
    :param data: (list) list containing the all the dictionary
                     for this sample
    :param out_dir: (str) path of the output

    :returns: (dict) dict with the normalize vcf file
    """
    position = dd.get_qsig_file(data)
    mixup_check = dd.get_mixup_check(data)
    if mixup_check and mixup_check.startswith("qsignature"):
        if not position:
            logger.info("There is no qsignature for this species: %s"
                        % tz.get_in(['genome_build'], data))
            return {}
        jvm_opts = "-Xms750m -Xmx2g"
        limit_reads = 20000000
        if mixup_check == "qsignature_full":
            slice_bam = bam_file
            jvm_opts = "-Xms750m -Xmx8g"
            limit_reads = 100000000
        else:
            slice_bam = _slice_chr22(bam_file, data)
        qsig = config_utils.get_program("qsignature", data["config"])
        if not qsig:
            return {}
        utils.safe_makedir(out_dir)
        out_name = os.path.basename(slice_bam).replace("bam", "qsig.vcf")
        out_file = os.path.join(out_dir, out_name)
        log_file = os.path.join(out_dir, "qsig.log")
        cores = dd.get_cores(data)
        base_cmd = ("{qsig} {jvm_opts} "
                    "org.qcmg.sig.SignatureGenerator "
                    "--noOfThreads {cores} "
                    "-log {log_file} -i {position} "
                    "-i {down_file} ")
        if not os.path.exists(out_file):
            down_file = bam.downsample(slice_bam, data, limit_reads)
            if not down_file:
                down_file = slice_bam
            file_qsign_out = "{0}.qsig.vcf".format(down_file)
            do.run(base_cmd.format(**locals()), "qsignature vcf generation: %s" % data["name"][-1])
            if os.path.exists(file_qsign_out):
                with file_transaction(data, out_file) as file_txt_out:
                    shutil.move(file_qsign_out, file_txt_out)
            else:
                raise IOError("File doesn't exist %s" % file_qsign_out)
        return {'qsig_vcf': out_file}
    return {}
Code example #37
def run(bam_file, data, out_dir):
    """ Run SignatureGenerator to create normalize vcf that later will be input of qsignature_summary

    :param bam_file: (str) path of the bam_file
    :param data: (list) list containing the all the dictionary
                     for this sample
    :param out_dir: (str) path of the output

    :returns: (string) output normalized vcf file
    """
    qsig = config_utils.get_program("qsignature", data["config"])
    res_qsig = config_utils.get_resources("qsignature", data["config"])
    jvm_opts = " ".join(res_qsig.get("jvm_opts", ["-Xms750m", "-Xmx8g"]))
    if not qsig:
        logger.info("There is no qsignature tool. Skipping...")
        return None

    position = dd.get_qsig_file(data)
    mixup_check = dd.get_mixup_check(data)
    if mixup_check and mixup_check.startswith("qsignature"):
        utils.safe_makedir(out_dir)
        if not position:
            logger.info("There is no qsignature for this species: %s" %
                        tz.get_in(['genome_build'], data))
            return None
        if mixup_check == "qsignature_full":
            down_bam = bam_file
        else:
            down_bam = _slice_bam_chr21(bam_file, data)
            position = _slice_vcf_chr21(position, out_dir)

        out_name = os.path.basename(down_bam).replace("bam", "qsig.vcf")
        out_file = os.path.join(out_dir, out_name)
        log_file = os.path.join(out_dir, "qsig.log")
        cores = dd.get_cores(data)
        base_cmd = ("{qsig} {jvm_opts} "
                    "org.qcmg.sig.SignatureGenerator "
                    "--noOfThreads {cores} "
                    "-log {log_file} -i {position} "
                    "-i {down_bam} ")
        if not os.path.exists(out_file):
            file_qsign_out = "{0}.qsig.vcf".format(down_bam)
            do.run(base_cmd.format(**locals()),
                   "qsignature vcf generation: %s" % dd.get_sample_name(data))
            if os.path.exists(file_qsign_out):
                with file_transaction(data, out_file) as file_txt_out:
                    shutil.move(file_qsign_out, file_txt_out)
            else:
                raise IOError("File doesn't exist %s" % file_qsign_out)
        return out_file
    return None
Code example #38
def run_mosdepth(data, target_name, bed_file, per_base=False, quantize=None):
    """Run mosdepth generating distribution, region depth and per-base depth.
    """
    MosdepthCov = collections.namedtuple(
        "MosdepthCov", ("dist", "per_base", "regions", "quantize"))
    bam_file = dd.get_align_bam(data) or dd.get_work_bam(data)
    work_dir = utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), "coverage",
                     dd.get_sample_name(data)))
    prefix = os.path.join(work_dir,
                          "%s-%s" % (dd.get_sample_name(data), target_name))
    out = MosdepthCov("%s.mosdepth.dist.txt" % prefix,
                      ("%s.per-base.bed.gz" % prefix) if per_base else None,
                      ("%s.regions.bed.gz" % prefix) if bed_file else None,
                      ("%s.quantized.bed.gz" % prefix) if quantize else None)
    if not utils.file_uptodate(out.dist, bam_file):
        with file_transaction(data, out.dist) as tx_out_file:
            tx_prefix = os.path.join(os.path.dirname(tx_out_file),
                                     os.path.basename(prefix))
            num_cores = dd.get_cores(data)
            bed_arg = ("--by %s" % bed_file) if bed_file else ""
            perbase_arg = "" if per_base else "--no-per-base"
            mapq_arg = "-Q 1" if (per_base or quantize) else ""
            if quantize:
                quant_arg = "--quantize %s" % quantize[0]
                quant_export = " && ".join([
                    "export MOSDEPTH_Q%s=%s" % (i, x)
                    for (i, x) in enumerate(quantize[1])
                ])
                quant_export += " && "
            else:
                quant_arg, quant_export = "", ""
            cmd = (
                "{quant_export}mosdepth -t {num_cores} -F 1804 {mapq_arg} {perbase_arg} {bed_arg} {quant_arg} "
                "{tx_prefix} {bam_file}")
            message = "Calculating coverage: %s %s" % (
                dd.get_sample_name(data), target_name)
            do.run(cmd.format(**locals()), message)
            if out.per_base:
                shutil.move(
                    os.path.join(os.path.dirname(tx_out_file),
                                 os.path.basename(out.per_base)), out.per_base)
            if out.regions:
                shutil.move(
                    os.path.join(os.path.dirname(tx_out_file),
                                 os.path.basename(out.regions)), out.regions)
            if out.quantize:
                shutil.move(
                    os.path.join(os.path.dirname(tx_out_file),
                                 os.path.basename(out.quantize)), out.quantize)
    return out
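For clarity, this is how the quantize handling above is assumed to expand: quantize is a (bin-spec, labels) tuple, the labels become MOSDEPTH_Q<i> environment variables that mosdepth uses to name each bin, and the exports are prepended to the command. The values below are illustrative placeholders, not pipeline defaults:

quantize = ("0:1:4:100:", ["NO_COVERAGE", "LOW_COVERAGE", "CALLABLE", "HIGH_COVERAGE"])
quant_arg = "--quantize %s" % quantize[0]
quant_export = " && ".join("export MOSDEPTH_Q%s=%s" % (i, x)
                           for i, x in enumerate(quantize[1])) + " && "
cmd = ("{quant_export}mosdepth -t {num_cores} -F 1804 -Q 1 --no-per-base {quant_arg} "
       "{prefix} {bam_file}")
print(cmd.format(quant_export=quant_export, num_cores=4, quant_arg=quant_arg,
                 prefix="sample1-genome", bam_file="sample1.bam"))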
Code example #39
def run(bam_file, data, out_dir):
    """ Run SignatureGenerator to create normalize vcf that later will be input of qsignature_summary

    :param bam_file: (str) path of the bam_file
    :param data: (list) list containing the all the dictionary
                     for this sample
    :param out_dir: (str) path of the output

    :returns: (string) output normalized vcf file
    """
    qsig = config_utils.get_program("qsignature", data["config"])
    res_qsig = config_utils.get_resources("qsignature", data["config"])
    jvm_opts = " ".join(res_qsig.get("jvm_opts", ["-Xms750m", "-Xmx8g"]))
    if not qsig:
        logger.info("There is no qsignature tool. Skipping...")
        return None

    position = dd.get_qsig_file(data)
    mixup_check = dd.get_mixup_check(data)
    if mixup_check and mixup_check.startswith("qsignature"):
        utils.safe_makedir(out_dir)
        if not position:
            logger.info("There is no qsignature for this species: %s"
                        % tz.get_in(['genome_build'], data))
            return None
        if mixup_check == "qsignature_full":
            down_bam = bam_file
        else:
            down_bam = _slice_bam_chr21(bam_file, data)
            position = _slice_vcf_chr21(position, out_dir)

        out_name = os.path.basename(down_bam).replace("bam", "qsig.vcf")
        out_file = os.path.join(out_dir, out_name)
        log_file = os.path.join(out_dir, "qsig.log")
        cores = dd.get_cores(data)
        base_cmd = ("{qsig} {jvm_opts} "
                    "org.qcmg.sig.SignatureGenerator "
                    "--noOfThreads {cores} "
                    "-log {log_file} -i {position} "
                    "-i {down_bam} ")
        if not os.path.exists(out_file):
            file_qsign_out = "{0}.qsig.vcf".format(down_bam)
            do.run(base_cmd.format(**locals()), "qsignature vcf generation: %s" % dd.get_sample_name(data))
            if os.path.exists(file_qsign_out):
                with file_transaction(data, out_file) as file_txt_out:
                    shutil.move(file_qsign_out, file_txt_out)
            else:
                raise IOError("File doesn't exist %s" % file_qsign_out)
        return out_file
    return None
Code example #40
File: gatkjoint.py Project: fishinwind/bcbio-nextgen
def run_combine_gvcfs(vrn_files, region, ref_file, out_file, data):
    if not utils.file_exists(out_file):
        broad_runner = broad.runner_from_config(data["config"])
        with file_transaction(data, out_file) as tx_out_file:
            params = ["-T", "CombineGVCFs", "-R", ref_file, "-o", tx_out_file]
            if region:
                params += ["-L", bamprep.region_to_gatk(region)]
            for vrn_file in vrn_files:
                params += ["--variant", vrn_file]
            cores = dd.get_cores(data)
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            broad_runner.new_resources("gatk-haplotype")
            broad_runner.run_gatk(params, memscale=memscale)
    return vcfutils.bgzip_and_index(out_file, data["config"])
Code example #41
File: wham.py Project: biocyberman/bcbio-nextgen
def _run_wham(inputs, background_bams):
    """Run WHAM on a defined set of inputs and targets.
    """
    out_file = os.path.join(_sv_workdir(inputs[0]), "%s-wham.vcf.gz" % dd.get_sample_name(inputs[0]))
    if not utils.file_exists(out_file):
        with file_transaction(inputs[0], out_file) as tx_out_file:
            cores = dd.get_cores(inputs[0])
            ref_file = dd.get_ref_file(inputs[0])
            include_chroms = ",".join([c.name for c in ref.file_contigs(ref_file)
                                       if chromhacks.is_autosomal_or_x(c.name)])
            all_bams = ",".join([x["align_bam"] for x in inputs] + background_bams)
            cmd = ("whamg -x {cores} -a {ref_file} -f {all_bams} -c {include_chroms} "
                   "| bgzip -c > {tx_out_file}")
            do.run(cmd.format(**locals()), "WHAM SV caller: %s" % ", ".join(dd.get_sample_name(d) for d in inputs))
    return vcfutils.bgzip_and_index(out_file, inputs[0]["config"])
Code example #42
File: wham.py Project: tfmorris/bcbio-nextgen
def _run_wham(inputs, background_bams):
    """Run WHAM on a defined set of inputs and targets.
    """
    out_file = os.path.join(_sv_workdir(inputs[0]), "%s-wham.vcf" % dd.get_sample_name(inputs[0]))
    input_bams = [x["align_bam"] for x in inputs]
    if not utils.file_exists(out_file):
        with file_transaction(inputs[0], out_file) as tx_out_file:
            cores = dd.get_cores(inputs[0])
            background = "-b %s" % ",".join(background_bams) if background_bams else ""
            target_bams = ",".join(x["align_bam"] for x in inputs)
            target_bed = tz.get_in(["config", "algorithm", "variant_regions"], inputs[0])
            target_str = "-e %s" % target_bed if target_bed else ""
            cmd = ("WHAM-BAM -x {cores} -t {target_bams} {background} {target_str} > {tx_out_file}")
            do.run(cmd.format(**locals()), "Run WHAM")
    return out_file
Code example #43
File: cnvkit.py Project: pansapiens/bcbio-nextgen
def _run_cnvkit_shared(inputs, backgrounds):
    """Shared functionality to run CNVkit, parallelizing over multiple BAM files.
    """
    work_dir = _sv_workdir(inputs[0])
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
    background_name = dd.get_sample_name(backgrounds[0]) if backgrounds else "flat"
    background_cnn = os.path.join(raw_work_dir, "%s_background.cnn" % (background_name))
    ckouts = []
    for cur_input in inputs:
        cur_raw_work_dir = utils.safe_makedir(os.path.join(_sv_workdir(cur_input), "raw"))
        out_base = _bam_to_outbase(dd.get_align_bam(cur_input), cur_raw_work_dir)
        ckouts.append({"cnr": "%s.cnr" % out_base,
                       "cns": "%s.cns" % out_base,
                       "back_cnn": background_cnn})
    if not utils.file_exists(ckouts[0]["cnr"]):
        cov_interval = dd.get_coverage_interval(inputs[0])
        raw_target_bed, access_bed = _get_target_access_files(cov_interval, inputs[0], work_dir)
        # bail out if we ended up with no regions
        if not utils.file_exists(raw_target_bed):
            return {}
        raw_target_bed = annotate.add_genes(raw_target_bed, inputs[0])
        parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]}
        pct_coverage = (pybedtools.BedTool(raw_target_bed).total_coverage() /
                        float(pybedtools.BedTool(access_bed).total_coverage())) * 100.0
        target_bed, antitarget_bed = _cnvkit_targets(raw_target_bed, access_bed, cov_interval,
                                                     pct_coverage, raw_work_dir, inputs[0])
        split_beds = _split_bed(target_bed, inputs[0]) + _split_bed(antitarget_bed, inputs[0])
        samples_to_run = list(zip(["background"] * len(backgrounds), backgrounds)) + \
                         list(zip(["evaluate"] * len(inputs), inputs))
        split_cnns = run_multicore(_cnvkit_coverage,
                                   [(cdata, bed, itype) for itype, cdata in samples_to_run for bed in split_beds],
                                   inputs[0]["config"], parallel)
        raw_coverage_cnns = _merge_coverage(split_cnns, inputs[0])
        coverage_cnns = run_multicore(_cnvkit_metrics,
                                      [(cnns, target_bed, antitarget_bed, cov_interval, inputs + backgrounds)
                                       for cnns in tz.groupby("bam", raw_coverage_cnns).values()],
                                      inputs[0]["config"], parallel)
        background_cnn = _cnvkit_background(_select_background_cnns(coverage_cnns),
                                            background_cnn, target_bed, antitarget_bed, inputs[0])
        fixed_cnrs = run_multicore(_cnvkit_fix,
                                   [(cnns, background_cnn, inputs + backgrounds) for cnns in
                                    tz.groupby("bam", [x for x in coverage_cnns
                                                       if x["itype"] == "evaluate"]).values()],
                                      inputs[0]["config"], parallel)
        run_multicore(_cnvkit_segment,
                      [(cnr, cov_interval, data) for cnr, data in fixed_cnrs],
                      inputs[0]["config"], parallel)
    return ckouts
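The pct_coverage value above is simply the targeted fraction of the accessible genome expressed as a percentage; it feeds into how CNVkit sizes target and antitarget bins. With made-up basepair totals standing in for the pybedtools total_coverage() calls:

target_bp = 2_500_000          # total bases in the (annotated) target BED
access_bp = 2_800_000_000      # total accessible bases in the genome
pct_coverage = (target_bp / float(access_bp)) * 100.0
print("%.4f%% of the accessible genome is targeted" % pct_coverage)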
Code example #44
def _run_genotype_gvcfs(data, vrn_files, ref_file, out_file):
    if not file_exists(out_file):
        broad_runner = broad.runner_from_config(data["config"])
        with file_transaction(data, out_file) as tx_out_file:
            params = ["-T", "GenotypeGVCFs", "-R", ref_file, "-o", tx_out_file]
            for vrn_file in vrn_files:
                params += ["--variant", vrn_file]
            broad_runner.new_resources("gatk-haplotype")
            cores = dd.get_cores(data)
            if cores > 1:
                params += ["-nt", str(cores)]
                memscale = {"magnitude": 0.9 * cores, "direction": "increase"}
            else:
                memscale = None
            broad_runner.run_gatk(params, memscale=memscale)
    return out_file
Code example #45
File: wham.py Project: vhuarui/bcbio-nextgen
def _run_wham(inputs, background_bams):
    """Run WHAM on a defined set of inputs and targets.
    """
    out_file = os.path.join(_sv_workdir(inputs[0]), "%s-wham.vcf.gz" % dd.get_sample_name(inputs[0]))
    if not utils.file_exists(out_file):
        with file_transaction(inputs[0], out_file) as tx_out_file:
            coords = chromhacks.autosomal_or_x_coords(dd.get_ref_file(inputs[0]))
            parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": []}
            rs = run_multicore(_run_wham_coords,
                                [(inputs, background_bams, coord, out_file)
                                 for coord in coords],
                                inputs[0]["config"], parallel)
            rs = {coord: fname for (coord, fname) in rs}
            vcfutils.concat_variant_files([rs[c] for c in coords], tx_out_file, coords,
                                          dd.get_ref_file(inputs[0]), inputs[0]["config"])
    return out_file
Code example #46
File: wham.py Project: elkingtonmcb/bcbio-nextgen
def _run_wham(inputs, background_bams):
    """Run WHAM on a defined set of inputs and targets.
    """
    out_file = os.path.join(_sv_workdir(inputs[0]), "%s-wham.vcf.gz" % dd.get_sample_name(inputs[0]))
    if not utils.file_exists(out_file):
        with file_transaction(inputs[0], out_file) as tx_out_file:
            coords = chromhacks.autosomal_or_x_coords(dd.get_ref_file(inputs[0]))
            parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": []}
            rs = run_multicore(_run_wham_coords,
                                [(inputs, background_bams, coord, out_file)
                                 for coord in coords],
                                inputs[0]["config"], parallel)
            rs = {coord: fname for (coord, fname) in rs}
            vcfutils.concat_variant_files([rs[c] for c in coords], tx_out_file, coords,
                                          dd.get_ref_file(inputs[0]), inputs[0]["config"])
    return out_file
Code example #47
File: wham.py Project: fw1121/bcbio-nextgen
def _run_wham_coords(inputs, background_bams, coords, final_file):
    """Run WHAM on a specific set of chromosome, start, end coordinates.
    """
    base, ext = os.path.splitext(final_file)
    out_file = "%s-%s%s" % (base, region.to_safestr(coords), ext)
    if not utils.file_exists(out_file):
        with file_transaction(inputs[0], out_file) as tx_out_file:
            cores = dd.get_cores(inputs[0])
            ref_file = dd.get_ref_file(inputs[0])
            all_bams = ",".join([x["align_bam"] for x in inputs] + background_bams)
            coord_str = bamprep.region_to_gatk(coords)
            opts = "-k -m 30"
            cmd = ("WHAM-GRAPHENING {opts} -x {cores} -a {ref_file} -f {all_bams} -r {coord_str} "
                   "> {tx_out_file}")
            do.run(cmd.format(**locals()), "Run WHAM: %s" % region.to_safestr(coords))
    return [[coords, out_file]]
Code example #48
File: gatkjoint.py Project: fishinwind/bcbio-nextgen
def _run_genotype_gvcfs_genomicsdb(genomics_db, region, out_file, data):
    """GenotypeGVCFs from a merged GenomicsDB input: GATK4.

    No core scaling -- not yet supported in GATK4.
    """
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            broad_runner = broad.runner_from_config(data["config"])
            params = ["-T", "GenotypeGVCFs",
                      "--variant", "gendb://%s" % genomics_db,
                      "-R", dd.get_ref_file(data),
                      "--output", tx_out_file,
                      "-L", bamprep.region_to_gatk(region)]
            cores = dd.get_cores(data)
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            broad_runner.run_gatk(params, memscale=memscale)
    return vcfutils.bgzip_and_index(out_file, data["config"])
Code example #49
def _run_genotype_gvcfs_genomicsdb(genomics_db, region, out_file, data):
    """GenotypeGVCFs from a merged GenomicsDB input: GATK4.

    No core scaling -- not yet supported in GATK4.
    """
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            broad_runner = broad.runner_from_config(data["config"])
            params = ["-T", "GenotypeGVCFs",
                      "--variant", "gendb://%s" % genomics_db,
                      "-R", dd.get_ref_file(data),
                      "--output", tx_out_file,
                      "-L", bamprep.region_to_gatk(region)]
            cores = dd.get_cores(data)
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            broad_runner.run_gatk(params, memscale=memscale)
    return vcfutils.bgzip_and_index(out_file, data["config"])
Code example #50
def _get_jvm_opts(out_file, data):
    """Retrieve Java options, adjusting memory for available cores.
    """
    resources = config_utils.get_resources("purple", data["config"])
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx3500m"])
    jvm_opts = config_utils.adjust_opts(
        jvm_opts, {
            "algorithm": {
                "memory_adjust": {
                    "direction": "increase",
                    "maximum": "30000M",
                    "magnitude": dd.get_cores(data)
                }
            }
        })
    jvm_opts += broad.get_default_jvm_opts(os.path.dirname(out_file))
    return jvm_opts
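A simplified, standalone approximation of the memory_adjust request above: increase the base -Xmx by the core count but never beyond the stated maximum. The real behaviour is implemented by config_utils.adjust_opts and may differ; this only illustrates the intent of the options:

def adjust_xmx(jvm_opts, cores, maximum_mb):
    """Scale any -Xmx<N>m option by the core count, capped at maximum_mb."""
    out = []
    for opt in jvm_opts:
        if opt.startswith("-Xmx") and opt.endswith("m"):
            base_mb = int(opt[len("-Xmx"):-1])
            out.append("-Xmx%dm" % min(base_mb * cores, maximum_mb))
        else:
            out.append(opt)
    return out

print(adjust_xmx(["-Xms750m", "-Xmx3500m"], cores=16, maximum_mb=30000))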
Code example #51
def _run_gridss(inputs, background, work_dir):
    out_file = os.path.join(
        work_dir, "%s-gridss.sv.vcf" %
        (dd.get_batch(inputs[0]) or dd.get_sample_name(inputs[0])))
    if not utils.file_exists(out_file) and not utils.file_exists(out_file +
                                                                 ".gz"):
        with file_transaction(inputs[0], out_file) as tx_out_file:
            htsjdk_opts = [
                "-Dsamjdk.create_index=true",
                "-Dsamjdk.use_async_io_read_samtools=true",
                "-Dsamjdk.use_async_io_write_samtools=true",
                "-Dsamjdk.use_async_io_write_tribble=true"
            ]
            cores = dd.get_cores(inputs[0])
            resources = config_utils.get_resources("gridss",
                                                   inputs[0]["config"])
            jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx4g"])
            jvm_opts = config_utils.adjust_opts(
                jvm_opts, {
                    "algorithm": {
                        "memory_adjust": {
                            "direction": "increase",
                            "magnitude": cores
                        }
                    }
                })
            jvm_opts = _finalize_memory(jvm_opts)
            tx_ref_file = _setup_reference_files(inputs[0],
                                                 os.path.dirname(tx_out_file))
            blacklist_bed = sshared.prepare_exclude_file(
                inputs + background, out_file)
            cmd = ["gridss"] + jvm_opts + htsjdk_opts + ["gridss.CallVariants"] + \
                  ["THREADS=%s" % cores,
                   "TMP_DIR=%s" % os.path.dirname(tx_out_file), "WORKING_DIR=%s" % os.path.dirname(tx_out_file),
                   "OUTPUT=%s" % tx_out_file,
                   "ASSEMBLY=%s" % tx_out_file.replace(".sv.vcf", ".gridss.assembly.bam"),
                   "REFERENCE_SEQUENCE=%s" % tx_ref_file, "BLACKLIST=%s" % blacklist_bed]
            for data in inputs + background:
                cmd += [
                    "INPUT=%s" % dd.get_align_bam(data),
                    "INPUT_LABEL=%s" % dd.get_sample_name(data)
                ]
            exports = utils.local_path_export()
            cmd = exports + " ".join(cmd)
            do.run(cmd, "GRIDSS SV analysis")
    return vcfutils.bgzip_and_index(out_file, inputs[0]["config"])
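The INPUT/INPUT_LABEL pairing above keeps each BAM aligned with its sample name in the final GRIDSS picard-style argument list. A stripped-down sketch with invented file names (JVM and htsjdk options omitted):

samples = [("tumor1.bam", "tumor1"), ("normal1.bam", "normal1")]
cmd = ["gridss", "gridss.CallVariants", "THREADS=4",
       "OUTPUT=batch1-gridss.sv.vcf", "REFERENCE_SEQUENCE=genome.fa"]
for bam, name in samples:
    cmd += ["INPUT=%s" % bam, "INPUT_LABEL=%s" % name]
print(" ".join(cmd))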
Code example #52
File: variation.py Project: cbrueffer/bcbio-nextgen
def _run_genotype_gvcfs(data, vrn_files, ref_file, out_file):
    if not file_exists(out_file):
        broad_runner = broad.runner_from_config(data["config"])
        with file_transaction(data, out_file) as tx_out_file:
            params = ["-T", "GenotypeGVCFs",
                      "-R", ref_file, "-o", tx_out_file]
            for vrn_file in vrn_files:
                params += ["--variant", vrn_file]
            broad_runner.new_resources("gatk-haplotype")
            cores = dd.get_cores(data)
            if cores > 1:
                params += ["-nt", str(cores)]
                memscale = {"magnitude": 0.9 * cores, "direction": "increase"}
            else:
                memscale = None
            broad_runner.run_gatk(params, memscale=memscale)
    return out_file
Code example #53
File: optitype.py Project: lpantano/bcbio-nextgen
def _call_hla(hla_fq, out_dir, data):
    """Run OptiType HLA calling for a specific
    """
    bin_dir = os.path.dirname(os.path.realpath(sys.executable))
    with tx_tmpdir(data, os.path.dirname(out_dir)) as tx_out_dir:
        config_file = os.path.join(tx_out_dir, "config.ini")
        with open(config_file, "w") as out_handle:
            razers3 = os.path.join(bin_dir, "razers3")
            if not os.path.exists(razers3):
                raise ValueError("Could not find razers3 executable at %s" % (razers3))
            out_handle.write(CONFIG_TMPL.format(razers3=razers3, cores=dd.get_cores(data)))
        cmd = ("OptiTypePipeline.py -v --dna -o {tx_out_dir} "
                "-i {hla_fq} -c {config_file}")
        do.run(cmd.format(**locals()), "HLA typing with OptiType")
        shutil.move(tx_out_dir, out_dir)
    out_file = glob.glob(os.path.join(out_dir, "*", "*_result.tsv"))
    assert len(out_file) == 1, "Expected one result file for OptiType, found %s" % out_file
    return out_file[0]
Code example #54
def index(ref_file, out_dir, data):
    """Create a STAR index in the defined reference directory.
    """
    (ref_dir, local_file) = os.path.split(ref_file)
    gtf_file = dd.get_gtf_file(data)
    if not utils.file_exists(gtf_file):
        raise ValueError("%s not found, could not create a star index." % (gtf_file))
    if not utils.file_exists(out_dir):
        with tx_tmpdir(data, os.path.dirname(out_dir)) as tx_out_dir:
            num_cores = dd.get_cores(data)
            cmd = ("STAR --genomeDir {tx_out_dir} --genomeFastaFiles {ref_file} "
                   "--runThreadN {num_cores} "
                   "--runMode genomeGenerate --sjdbOverhang 99 --sjdbGTFfile {gtf_file}")
            do.run(cmd.format(**locals()), "Index STAR")
            if os.path.exists(out_dir):
                shutil.rmtree(out_dir)
            shutil.move(tx_out_dir, out_dir)
    return out_dir
Code example #55
File: seq2c.py Project: bennyyu686/bcbio-nextgen
def _calculate_coverage(data, work_dir, bed_file, bam_file, sample_name):
    sambamba_depth_file = os.path.join(work_dir, sample_name + '-sambamba_depth.tsv')
    sambamba = config_utils.get_program("sambamba", data["config"])
    num_cores = dd.get_cores(data)
    if not utils.file_exists(sambamba_depth_file):
        with file_transaction(data, sambamba_depth_file) as tx_out_file:
            cmd = ("{sambamba} depth region -t {num_cores} "
                   "-F \"\" -L {bed_file} {bam_file} -o {tx_out_file}")
            do.run(cmd.format(**locals()), "Calling sambamba region depth")
    logger.debug("Saved to " + sambamba_depth_file)

    out_file = os.path.join(work_dir, sample_name + '-coverage.tsv')
    if not utils.file_exists(out_file):
        logger.debug('Converting sambamba depth output to cov2lr.pl input in ' + dd.get_sample_name(data))
        with file_transaction(data, out_file) as tx_out_file:
            _sambabma_depth_to_seq2cov(sambamba_depth_file, tx_out_file, sample_name)
    logger.debug("Saved to " + out_file)
    return out_file
Code example #56
File: wham.py Project: fw1121/bcbio-nextgen
def _run_wham(inputs, background_bams):
    """Run WHAM on a defined set of inputs and targets.
    """
    out_file = os.path.join(_sv_workdir(inputs[0]), "%s-wham.bedpe" % dd.get_sample_name(inputs[0]))
    if not utils.file_exists(out_file):
        with file_transaction(inputs[0], out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                coords = chromhacks.autosomal_or_x_coords(dd.get_ref_file(inputs[0]))
                parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["wham"]}
                rs = run_multicore(_run_wham_coords,
                                   [(inputs, background_bams, coord, out_file)
                                    for coord in coords],
                                   inputs[0]["config"], parallel)
                rs = {coord: fname for (coord, fname) in rs}
                for coord in coords:
                    with open(rs[coord]) as in_handle:
                        shutil.copyfileobj(in_handle, out_handle)
    return out_file
Code example #57
File: gatkfilter.py Project: matthdsm/bcbio-nextgen
def _run_vqsr(in_file, ref_file, vrn_files, sensitivity_cutoff, filter_type, data):
    """Run variant quality score recalibration.
    """
    cutoffs = ["100.0", "99.99", "99.98", "99.97", "99.96", "99.95", "99.94", "99.93", "99.92", "99.91",
               "99.9", "99.8", "99.7", "99.6", "99.5", "99.0", "98.0", "90.0"]
    if sensitivity_cutoff not in cutoffs:
        cutoffs.append(sensitivity_cutoff)
        cutoffs.sort()
    broad_runner = broad.runner_from_config(data["config"])
    gatk_type = broad_runner.gatk_type()
    base = utils.splitext_plus(in_file)[0]
    recal_file = ("%s-vqsrrecal.vcf.gz" % base) if gatk_type == "gatk4" else ("%s.recal" % base)
    tranches_file = "%s.tranches" % base
    plot_file = "%s-plots.R" % base
    if not utils.file_exists(recal_file):
        with file_transaction(data, recal_file, tranches_file, plot_file) as (tx_recal, tx_tranches, tx_plot_file):
            params = ["-T", "VariantRecalibrator",
                      "-R", ref_file,
                      "--mode", filter_type]
            if gatk_type == "gatk4":
                params += ["--variant", in_file, "--output", tx_recal,
                           "--tranches-file", tx_tranches, "--rscript-file", tx_plot_file]
            else:
                params += ["--input", in_file, "--recal_file", tx_recal,
                           "--tranches_file", tx_tranches, "--rscript_file", tx_plot_file]
            params += _get_vqsr_training(filter_type, vrn_files, gatk_type)
            resources = config_utils.get_resources("gatk_variant_recalibrator", data["config"])
            opts = resources.get("options", [])
            if not opts:
                for cutoff in cutoffs:
                    opts += ["-tranche", str(cutoff)]
                for a in _get_vqsr_annotations(filter_type, data):
                    opts += ["-an", a]
            params += opts
            cores = dd.get_cores(data)
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            try:
                broad_runner.new_resources("gatk-vqsr")
                broad_runner.run_gatk(params, log_error=False, memscale=memscale, parallel_gc=True)
            except:  # Can fail to run if not enough values are present to train.
                return None, None
    if gatk_type == "gatk4":
        vcfutils.bgzip_and_index(recal_file, data["config"])
    return recal_file, tranches_file
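When no user options are supplied, the loop above emits one -tranche flag per sensitivity cutoff and one -an flag per annotation. A standalone illustration; the annotation names here are typical VQSR annotations, not values read from this pipeline:

cutoffs = ["100.0", "99.99", "99.9", "99.0", "98.0", "90.0"]
annotations = ["QD", "FS", "MQ", "MQRankSum", "ReadPosRankSum"]
opts = []
for cutoff in cutoffs:
    opts += ["-tranche", cutoff]
for a in annotations:
    opts += ["-an", a]
print(" ".join(opts))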
Code example #58
File: wham.py Project: elkingtonmcb/bcbio-nextgen
def _run_wham_coords(inputs, background_bams, coords, final_file):
    """Run WHAM on a specific set of chromosome, start, end coordinates.
    """
    base, ext = utils.splitext_plus(final_file)
    raw_file = "%s-%s.vcf" % (base, region.to_safestr(coords))
    all_bams = ",".join([x["align_bam"] for x in inputs] + background_bams)
    if not utils.file_exists(raw_file):
        with file_transaction(inputs[0], raw_file) as tx_raw_file:
            cores = dd.get_cores(inputs[0])
            ref_file = dd.get_ref_file(inputs[0])
            coord_str = bamprep.region_to_gatk(coords)
            opts = "-k -m 30"
            cmd = ("WHAM-GRAPHENING {opts} -x {cores} -a {ref_file} -f {all_bams} -r {coord_str} "
                   "> {tx_raw_file}")
            do.run(cmd.format(**locals()), "Run WHAM: %s" % region.to_safestr(coords))
    merge_vcf = _run_wham_merge(raw_file, inputs[0])
    gt_vcf = _run_wham_genotype(merge_vcf, all_bams, coords, inputs[0])
    prep_vcf = vcfutils.sort_by_ref(gt_vcf, inputs[0])
    return [[coords, prep_vcf]]
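Each per-region WHAM run writes to a coordinate-stamped file derived from the final output name; the raw calls are then merged, genotyped and reference-sorted before being returned keyed by their coordinates. A rough sketch of just the naming step, using simple stand-ins for region.to_safestr and utils.splitext_plus and an invented file name:

def to_safestr(coords):
    chrom, start, end = coords
    return "%s_%s_%s" % (chrom, start, end)

final_file = "batch1-wham.vcf.gz"
base = final_file.split(".", 1)[0]   # splitext_plus would also return the ".vcf.gz" extension
raw_file = "%s-%s.vcf" % (base, to_safestr(("chr1", 1, 2000000)))
print(raw_file)  # batch1-wham-chr1_1_2000000.vcf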