Example #1
0
def _call_variants_samtools(align_bams, ref_file, items, target_regions, out_file):
    """Call variants with samtools in target_regions.

    Works around a GATK VCF compatibility issue in samtools 0.20 by removing extra
    Version information from VCF header lines.

    The mpileup output is piped through bcftools and ``vcfutils.pl varFilter``,
    ``,Version=3`` is stripped from header lines with sed and the result is
    written to ``out_file`` (through ``bgzip -c`` when ``out_file`` ends with
    ``gz``). When none of ``align_bams`` has aligned reads in
    ``target_regions`` an empty VCF is written instead of running the pipeline.

    Raises ValueError when bcftools is newer than 0.1.19 but samtools is not.
    """
    config = items[0]["config"]

    max_read_depth = "1000"
    mpileup = prep_mpileup(align_bams, ref_file, max_read_depth, config,
                           target_regions=target_regions)
    bcftools = config_utils.get_program("bcftools", config)
    bcftools_version = programs.get_version("bcftools", config=config)
    samtools_version = programs.get_version("samtools", config=config)
    if LooseVersion(bcftools_version) > LooseVersion("0.1.19"):
        if LooseVersion(samtools_version) <= LooseVersion("0.1.19"):
            raise ValueError("samtools calling not supported with 0.1.19 samtools and 0.20 bcftools")
        bcftools_opts = "call -v -c"
    else:
        bcftools_opts = "view -v -c -g"
    compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
    # Keep the script path under its own name: binding it to ``vcfutils`` makes
    # that name a local string for the whole function, which breaks the
    # ``vcfutils.write_empty_vcf`` call below with an AttributeError.
    vcfutils_pl = config_utils.get_program("vcfutils.pl", config)
    # XXX Check if we need this when supporting samtools 0.2.0 calling.
    # 0.1.9 fails on regions without reads.
    if not any(realign.has_aligned_reads(x, target_regions) for x in align_bams):
        vcfutils.write_empty_vcf(out_file, config)
    else:
        cmd = ("{mpileup} "
               "| {bcftools} {bcftools_opts} - "
               "| {vcfutils} varFilter -D {max_read_depth} "
               "| sed 's/,Version=3>/>/'"
               "{compress_cmd} > {out_file}")
        # Explicit keywords instead of locals() so the template placeholder
        # ``{vcfutils}`` still receives the vcfutils.pl path.
        cmd = cmd.format(mpileup=mpileup, bcftools=bcftools,
                         bcftools_opts=bcftools_opts, vcfutils=vcfutils_pl,
                         max_read_depth=max_read_depth,
                         compress_cmd=compress_cmd, out_file=out_file)
        logger.info(cmd)
        do.run(cmd, "Variant calling with samtools", {})
Example #2
0
def _call_variants_samtools(align_bams, ref_file, items, target_regions, tx_out_file):
    """Run samtools/bcftools variant calling restricted to target_regions.

    The bcftools calls are normalized with vt and passed through a series of
    sed substitutions that rewrite VCF 4.2 specific header content produced by
    samtools 1.0, working around a GATK compatibility issue. Output goes to
    ``tx_out_file``, through ``bgzip -c`` when that name ends with ``.gz``.

    Raises ValueError when the samtools version is 0.1.19 or older.
    """
    config = items[0]["config"]
    mpileup = prep_mpileup(align_bams, ref_file, config,
                           want_bcf=True, target_regions=target_regions)
    bcftools = config_utils.get_program("bcftools", config)
    # The bcftools version is looked up but not consulted below.
    bcftools_version = programs.get_version("bcftools", config=config)
    samtools_version = programs.get_version("samtools", config=config)
    pre_1_0 = LooseVersion(samtools_version) <= LooseVersion("0.1.19")
    if pre_1_0:
        raise ValueError("samtools calling not supported with pre-1.0 samtools")
    if tx_out_file.endswith(".gz"):
        compress_cmd = "| bgzip -c"
    else:
        compress_cmd = ""
    pipeline = (
        "{mpileup} ",
        "| {bcftools} {bcftools_opts} - ",
        "| vt normalize -n -q -r {ref_file} - ",
        "| sed 's/VCFv4.2/VCFv4.1/' ",
        "| sed 's/,Version=3>/>/' ",
        "| sed 's/,Version=\"3\">/>/' ",
        "| sed 's/Number=R/Number=./' ",
        "{compress_cmd} > {tx_out_file}",
    )
    cmd = "".join(pipeline).format(
        mpileup=mpileup, bcftools=bcftools, bcftools_opts="call -v -m",
        ref_file=ref_file, compress_cmd=compress_cmd, tx_out_file=tx_out_file)
    do.run(cmd, "Variant calling with samtools", items[0])
Example #3
0
def _call_variants_samtools(align_bams, ref_file, items, target_regions, tx_out_file):
    """Call variants in target_regions with a samtools mpileup / bcftools pipeline.

    After ``bcftools call`` the stream is normalized with vt, then sed rewrites
    the VCF 4.2-only header content emitted by samtools 1.0 so that GATK can
    read the file. The result is written to ``tx_out_file`` and compressed
    with ``bgzip -c`` when the name ends with ``.gz``.

    Raises ValueError for samtools versions up to and including 0.1.19.
    """
    config = items[0]["config"]
    mpileup = prep_mpileup(align_bams, ref_file, config,
                           target_regions=target_regions, want_bcf=True)
    bcftools = config_utils.get_program("bcftools", config)
    # Looked up for parity with the original code; the value is unused here.
    bcftools_version = programs.get_version("bcftools", config=config)
    samtools_version = programs.get_version("samtools", config=config)
    oldest_unsupported = LooseVersion("0.1.19")
    if LooseVersion(samtools_version) <= oldest_unsupported:
        raise ValueError("samtools calling not supported with pre-1.0 samtools")
    bcftools_opts = "call -v -m"
    compress_cmd = ""
    if tx_out_file.endswith(".gz"):
        compress_cmd = "| bgzip -c"
    template = ("{mpileup} "
                "| {bcftools} {bcftools_opts} - "
                "| vt normalize -n -q -r {ref_file} - "
                "| sed 's/VCFv4.2/VCFv4.1/' "
                "| sed 's/,Version=3>/>/' "
                "| sed 's/,Version=\"3\">/>/' "
                "| sed 's/Number=R/Number=./' "
                "{compress_cmd} > {tx_out_file}")
    shell_cmd = template.format(mpileup=mpileup,
                                bcftools=bcftools,
                                bcftools_opts=bcftools_opts,
                                ref_file=ref_file,
                                compress_cmd=compress_cmd,
                                tx_out_file=tx_out_file)
    do.run(shell_cmd, "Variant calling with samtools", items[0])
Example #4
0
def _call_variants_samtools(align_bams, ref_file, items, target_regions, out_file):
    """Run samtools variant calling over target_regions, writing to out_file.

    mpileup output is piped through bcftools and ``vcfutils.pl varFilter``;
    sed then strips ``,Version=3`` from header lines, which works around a
    GATK VCF compatibility issue in samtools 0.20.

    Raises ValueError when bcftools is newer than 0.1.19 but samtools is not.
    """
    config = items[0]["config"]
    max_read_depth = "1000"
    mpileup = prep_mpileup(align_bams, ref_file, max_read_depth, config,
                           target_regions=target_regions)
    bcftools = config_utils.get_program("bcftools", config)
    bcftools_version = programs.get_version("bcftools", config=config)
    samtools_version = programs.get_version("samtools", config=config)

    legacy = LooseVersion("0.1.19")
    newer_bcftools = LooseVersion(bcftools_version) > legacy
    if newer_bcftools and LooseVersion(samtools_version) <= legacy:
        raise ValueError("samtools calling not supported with 0.1.19 samtools and 0.20 bcftools")
    # Newer bcftools uses the "call" subcommand, older ones "view".
    bcftools_opts = "call -v -c" if newer_bcftools else "view -v -c -g"

    varfilter_script = config_utils.get_program("vcfutils.pl", config)
    template = ("{mpileup} "
                "| {bcftools} {bcftools_opts} - "
                "| {vcfutils} varFilter -D {max_read_depth} "
                "| sed 's/,Version=3>/>/'"
                "> {out_file}")
    shell_cmd = template.format(mpileup=mpileup, bcftools=bcftools,
                                bcftools_opts=bcftools_opts,
                                vcfutils=varfilter_script,
                                max_read_depth=max_read_depth,
                                out_file=out_file)
    logger.info(shell_cmd)
    do.run(shell_cmd, "Variant calling with samtools", {})
Example #5
0
def _freebayes_custom(in_file, ref_file, data):
    """Filter FreeBayes calls with bcbio.variation's ``variant-filter freebayes``.

    Tuned to human NA12878 results. Experimental: for testing new methods.

    Returns None when ``vcfutils.get_paired_phenotype(data)`` is truthy or the
    bcbio_variation version is older than 0.1.1. Otherwise returns the
    ``<base>-filter<ext>`` path derived from ``in_file``, running the filter
    only when that file does not exist yet.
    """
    if vcfutils.get_paired_phenotype(data):
        return None
    config = data["config"]
    installed = LooseVersion(programs.get_version("bcbio_variation", config=config))
    if installed < LooseVersion("0.1.1"):
        return None
    base, ext = os.path.splitext(in_file)
    out_file = "%s-filter%s" % (base, ext)
    if utils.file_exists(out_file):
        return out_file
    tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(in_file), "tmp"))
    jar_dir = config_utils.get_program("bcbio_variation", config, "dir")
    bv_jar = config_utils.get_jar("bcbio.variation", jar_dir)
    jvm_opts = config_utils.get_resources("bcbio_variation", config).get(
        "jvm_opts", ["-Xms750m", "-Xmx2g"])
    filter_args = ["-jar", bv_jar, "variant-filter", "freebayes", in_file, ref_file]
    cmd = ["java"] + jvm_opts + ["-Djava.io.tmpdir=%s" % tmp_dir] + filter_args
    do.run(cmd, "Custom FreeBayes filtering using bcbio.variation")
    return out_file
Example #6
0
def prep_gemini_db(fnames, call_id, samples, data):
    """Build a gemini database from VCF inputs prepared with snpEff.

    With more than one input file the VCFs are first combined through
    ``get_multisample_vcf``; a single input is loaded directly. The database
    is only created when ``_do_db_build(samples)`` is truthy, at least one
    input has variants and the database file does not exist yet.

    Returns a one-element list holding ``[call_id, info]`` where ``info`` maps
    "db" to the database path (None when gemini is not used) and "vcf" to the
    combined VCF (None for single-file input).
    """
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    gemini_db = os.path.join(work_dir, "%s.db" % "-".join(call_id))
    use_gemini = _do_db_build(samples) and any(
        vcfutils.vcf_has_variants(fname) for fname in fnames)
    is_population = len(fnames) > 1
    if not is_population:
        gemini_vcf = fnames[0]
    else:
        name, caller = call_id
        gemini_vcf = get_multisample_vcf(fnames, name, caller, data)
    if use_gemini and not utils.file_exists(gemini_db):
        with file_transaction(gemini_db) as tx_gemini_db:
            gemini = config_utils.get_program("gemini", data["config"])
            gemini_ver = programs.get_version("gemini", config=data["config"])
            # Recent versions of gemini allow loading only passing variants
            passonly_ok = LooseVersion(gemini_ver) > LooseVersion("0.6.2.1")
            load_opts = "--passonly" if passonly_ok else ""
            num_cores = data["config"]["algorithm"].get("num_cores", 1)
            template = "{gemini} load {load_opts} -v {gemini_vcf} -t snpEff --cores {num_cores} {tx_gemini_db}"
            load_cmd = template.format(gemini=gemini, load_opts=load_opts,
                                       gemini_vcf=gemini_vcf,
                                       num_cores=num_cores,
                                       tx_gemini_db=tx_gemini_db)
            do.run(load_cmd, "Create gemini database for %s" % str(call_id), data)
    info = {
        "db": gemini_db if use_gemini else None,
        "vcf": gemini_vcf if is_population else None,
    }
    return [[call_id, info]]
def prep_gemini_db(fnames, call_info, samples):
    """Build a gemini database from VCF inputs prepared with snpEff.

    ``call_info`` unpacks to ``(name, caller, is_batch)``. The inputs are
    combined with ``get_multisample_vcf`` and loaded into
    ``<work>/gemini/<name>-<caller>.db`` when that file is missing, the
    ``do_db_build`` checks pass and at least one input has variants.

    Returns ``[[(name, caller), info]]`` where ``info["db"]`` is the database
    path if the file exists after this call (else None) and ``info["vcf"]``
    is the combined VCF when ``is_batch`` is truthy (else None).
    """
    data = samples[0]
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    name, caller, is_batch = call_info
    gemini_db = os.path.join(work_dir, "%s-%s.db" % (name, caller))
    gemini_vcf = get_multisample_vcf(fnames, name, caller, data)
    use_gemini_quick = (do_db_build(samples, check_gemini=False) and
                        any(vcfutils.vcf_has_variants(f) for f in fnames))
    # The quick check comes first; the full do_db_build check only runs when
    # the database is missing and the quick check passed.
    if (not utils.file_exists(gemini_db) and use_gemini_quick
            and do_db_build(samples)
            and any(vcfutils.vcf_has_variants(f) for f in fnames)):
        with file_transaction(gemini_db) as tx_gemini_db:
            gemini = config_utils.get_program("gemini", data["config"])
            gemini_ver = None
            if "program_versions" in data["config"].get("resources", {}):
                gemini_ver = programs.get_version("gemini", config=data["config"])
            version = LooseVersion(gemini_ver) if gemini_ver else None
            flags = []
            # Recent versions of gemini allow loading only passing variants
            if version is None or version > LooseVersion("0.6.2.1"):
                flags.append("--passonly")
            # For small test files, skip gene table loading which takes a long time
            if version is not None and version > LooseVersion("0.6.4"):
                if _is_small_vcf(gemini_vcf):
                    flags.append("--skip-gene-tables")
                if "/test_automated_output/" in gemini_vcf:
                    flags.append("--test-mode")
            # Each flag keeps its leading space, as in the assembled option string.
            load_opts = "".join(" %s" % flag for flag in flags)
            num_cores = data["config"]["algorithm"].get("num_cores", 1)
            template = "{gemini} load {load_opts} -v {gemini_vcf} -t snpEff --cores {num_cores} {tx_gemini_db}"
            load_cmd = template.format(gemini=gemini, load_opts=load_opts,
                                       gemini_vcf=gemini_vcf,
                                       num_cores=num_cores,
                                       tx_gemini_db=tx_gemini_db)
            do.run(load_cmd, "Create gemini database for %s %s" % (name, caller), data)
    info = {"db": gemini_db if utils.file_exists(gemini_db) else None,
            "vcf": gemini_vcf if is_batch else None}
    return [[(name, caller), info]]
Example #8
0
def prep_gemini_db(fnames, call_id, samples, data):
    """Create a gemini database for VCF inputs prepared with snpEff.

    Multiple inputs are merged with ``get_multisample_vcf`` (``call_id``
    unpacks to name and caller); a single input is used as is. Loading runs
    only if ``_do_db_build(samples)`` is truthy, some input has variants and
    the ``<work>/gemini/<call_id joined by "-">.db`` file is missing.

    Returns ``[[call_id, {"db": ..., "vcf": ...}]]``: "db" is the database
    path or None when gemini is not used, "vcf" is the merged VCF or None when
    only one input was given.
    """
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    db_basename = "-".join(call_id) + ".db"
    gemini_db = os.path.join(out_dir, db_basename)
    use_gemini = _do_db_build(samples) and any(vcfutils.vcf_has_variants(vcf) for vcf in fnames)
    is_population = len(fnames) > 1
    if is_population:
        name, caller = call_id
        gemini_vcf = get_multisample_vcf(fnames, name, caller, data)
    else:
        gemini_vcf = fnames[0]
    if use_gemini and not utils.file_exists(gemini_db):
        with file_transaction(gemini_db) as tx_gemini_db:
            gemini = config_utils.get_program("gemini", data["config"])
            gemini_ver = programs.get_version("gemini", config=data["config"])
            # Recent versions of gemini allow loading only passing variants
            load_opts = ""
            if LooseVersion(gemini_ver) > LooseVersion("0.6.2.1"):
                load_opts = "--passonly"
            num_cores = data["config"]["algorithm"].get("num_cores", 1)
            cmd = "{gemini} load {load_opts} -v {gemini_vcf} -t snpEff --cores {num_cores} {tx_gemini_db}"
            cmd = cmd.format(gemini=gemini, load_opts=load_opts, gemini_vcf=gemini_vcf,
                             num_cores=num_cores, tx_gemini_db=tx_gemini_db)
            do.run(cmd, "Create gemini database for %s" % str(call_id), data)
    db_out = gemini_db if use_gemini else None
    vcf_out = gemini_vcf if is_population else None
    return [[call_id, {"db": db_out, "vcf": vcf_out}]]
Example #9
0
def prep_gemini_db(fnames, call_info, samples):
    """Build a gemini database from annotated VCF inputs.

    ``call_info`` unpacks to ``(name, caller, is_batch)``. Inputs are combined
    with ``get_multisample_vcf`` and loaded into
    ``<work>/gemini/<name>-<caller>.db`` when that file is missing, the
    ``do_db_build`` checks pass and at least one input has variants. The
    annotation type passed to ``-t`` is snpEff when the config's
    ``algorithm -> effects`` value is "snpeff" (the default), otherwise VEP.

    Returns ``[[(name, caller), info]]`` where ``info["db"]`` is the database
    path if the file exists after this call (else None) and ``info["vcf"]``
    is the combined VCF when ``is_batch`` is truthy (else None).
    """
    data = samples[0]
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    name, caller, is_batch = call_info
    gemini_db = os.path.join(work_dir, "%s-%s.db" % (name, caller))
    gemini_vcf = get_multisample_vcf(fnames, name, caller, data)
    use_gemini_quick = (do_db_build(samples, check_gemini=False)
                        and any(vcfutils.vcf_has_variants(f) for f in fnames))
    if (not utils.file_exists(gemini_db) and use_gemini_quick
            and do_db_build(samples)
            and any(vcfutils.vcf_has_variants(f) for f in fnames)):
        with file_transaction(data, gemini_db) as tx_gemini_db:
            gemini = config_utils.get_program("gemini", data["config"])
            gemini_ver = None
            if "program_versions" in data["config"].get("resources", {}):
                gemini_ver = programs.get_version("gemini", config=data["config"])
            version = LooseVersion(gemini_ver) if gemini_ver else None
            flags = []
            # Recent versions of gemini allow loading only passing variants
            if version is None or version > LooseVersion("0.6.2.1"):
                flags.append("--passonly")
            # For small test files, skip gene table loading which takes a long time
            if version is not None and version > LooseVersion("0.6.4"):
                if _is_small_vcf(gemini_vcf):
                    flags.append("--skip-gene-tables")
                if "/test_automated_output/" in gemini_vcf:
                    flags.append("--test-mode")
            # Skip CADD when its data file is not present in the gemini directory
            if version is not None and version >= LooseVersion("0.7.0"):
                gemini_dir = install.get_gemini_dir()
                cadd_file = os.path.join(gemini_dir, "whole_genome_SNVs.tsv.compressed.gz")
                if not os.path.exists(cadd_file):
                    flags.append("--skip-cadd")
            # skip gerp-bp which slows down loading
            load_opts = "".join(" %s" % flag for flag in flags) + " --skip-gerp-bp "
            num_cores = data["config"]["algorithm"].get("num_cores", 1)
            effects = tz.get_in(("config", "algorithm", "effects"), data, "snpeff")
            eanns = "snpEff" if effects == "snpeff" else "VEP"
            template = "{gemini} load {load_opts} -v {gemini_vcf} -t {eanns} --cores {num_cores} {tx_gemini_db}"
            load_cmd = template.format(gemini=gemini, load_opts=load_opts,
                                       gemini_vcf=gemini_vcf, eanns=eanns,
                                       num_cores=num_cores,
                                       tx_gemini_db=tx_gemini_db)
            do.run(load_cmd, "Create gemini database for %s %s" % (name, caller), data)
    info = {
        "db": gemini_db if utils.file_exists(gemini_db) else None,
        "vcf": gemini_vcf if is_batch else None,
    }
    return [[(name, caller), info]]
Example #10
0
def _call_variants_samtools(align_bams, ref_file, items, target_regions,
                            out_file):
    """Call variants with samtools in target_regions.

    Works around a GATK VCF compatibility issue in samtools 0.20 by removing extra
    Version information from VCF header lines.

    The mpileup output is piped through bcftools and ``vcfutils.pl varFilter``,
    ``,Version=3`` is stripped from header lines with sed and the result is
    written to ``out_file`` (through ``bgzip -c`` when ``out_file`` ends with
    ``gz``). When none of ``align_bams`` has aligned reads in
    ``target_regions`` an empty VCF is written instead of running the pipeline.

    Raises ValueError when bcftools is newer than 0.1.19 but samtools is not.
    """
    config = items[0]["config"]

    max_read_depth = "1000"
    mpileup = prep_mpileup(align_bams,
                           ref_file,
                           max_read_depth,
                           config,
                           target_regions=target_regions)
    bcftools = config_utils.get_program("bcftools", config)
    bcftools_version = programs.get_version("bcftools", config=config)
    samtools_version = programs.get_version("samtools", config=config)
    if LooseVersion(bcftools_version) > LooseVersion("0.1.19"):
        if LooseVersion(samtools_version) <= LooseVersion("0.1.19"):
            raise ValueError(
                "samtools calling not supported with 0.1.19 samtools and 0.20 bcftools"
            )
        bcftools_opts = "call -v -c"
    else:
        bcftools_opts = "view -v -c -g"
    compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
    # The script path gets its own name: assigning it to ``vcfutils`` turns
    # that name into a local string throughout the function, so the
    # ``vcfutils.write_empty_vcf`` call below would raise AttributeError.
    vcfutils_pl = config_utils.get_program("vcfutils.pl", config)
    # XXX Check if we need this when supporting samtools 0.2.0 calling.
    # 0.1.9 fails on regions without reads.
    if not any(
            realign.has_aligned_reads(x, target_regions) for x in align_bams):
        vcfutils.write_empty_vcf(out_file, config)
    else:
        cmd = ("{mpileup} "
               "| {bcftools} {bcftools_opts} - "
               "| {vcfutils} varFilter -D {max_read_depth} "
               "| sed 's/,Version=3>/>/'"
               "{compress_cmd} > {out_file}")
        # Explicit keywords instead of locals() so the ``{vcfutils}``
        # placeholder still receives the vcfutils.pl path.
        cmd = cmd.format(mpileup=mpileup,
                         bcftools=bcftools,
                         bcftools_opts=bcftools_opts,
                         vcfutils=vcfutils_pl,
                         max_read_depth=max_read_depth,
                         compress_cmd=compress_cmd,
                         out_file=out_file)
        logger.info(cmd)
        do.run(cmd, "Variant calling with samtools", {})
Example #11
0
 def _set_default_versions(self, config):
     """Cache version information that is expensive to retrieve.

     Starting up GATK takes a lot of resources, so the lookup happens once at
     the start of analysis. The gatk and picard versions are stored on the
     instance; a program whose lookup raises KeyError is recorded as None.
     """
     def _version_or_none(name):
         try:
             return programs.get_version(name, config=config)
         except KeyError:
             return None

     # Both lookups finish before either attribute is assigned.
     self._gatk_version, self._picard_version = [
         _version_or_none(name) for name in ("gatk", "picard")]
Example #12
0
 def _set_default_versions(self, config):
     """Cache version information that is expensive to retrieve.

     Starting up GATK takes a lot of resources, so the lookup happens once at
     the start of analysis. The gatk, picard and mutect versions are stored on
     the instance; a program whose lookup raises KeyError is recorded as None.
     """
     def _version_or_none(name):
         try:
             return programs.get_version(name, config=config)
         except KeyError:
             return None

     # All lookups finish before any attribute is assigned.
     versions = [_version_or_none(name) for name in ("gatk", "picard", "mutect")]
     self._gatk_version, self._picard_version, self._mutect_version = versions
Example #13
0
def create_gemini_db(gemini_vcf, data, gemini_db=None, ped_file=None):
    """Load gemini_vcf into a gemini database and return the database path.

    ``gemini_db`` defaults to the VCF name with its extension (as split by
    ``utils.splitext_plus``) replaced by ``.db``. An existing database is
    returned untouched. Returns None when the database is missing and the VCF
    has no variants. When ``ped_file`` is given it is added to the new
    database with ``gemini amend --sample``.
    """
    gemini_db = gemini_db or "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if utils.file_exists(gemini_db):
        return gemini_db
    if not vcfutils.vcf_has_variants(gemini_vcf):
        return None
    with file_transaction(data, gemini_db) as tx_gemini_db:
        gemini = config_utils.get_program("gemini", data["config"])
        gemini_ver = None
        if "program_versions" in data["config"].get("resources", {}):
            gemini_ver = programs.get_version("gemini", config=data["config"])
        version = LooseVersion(gemini_ver) if gemini_ver else None
        flags = []
        # Recent versions of gemini allow loading only passing variants
        if version is None or version > LooseVersion("0.6.2.1"):
            flags.append("--passonly")
        # For small test files, skip gene table loading which takes a long time
        if version is not None and version > LooseVersion("0.6.4"):
            if _is_small_vcf(gemini_vcf):
                flags.append("--skip-gene-tables")
            if "/test_automated_output/" in gemini_vcf:
                flags.append("--test-mode")
        # Skip CADD when its data file is not present in the gemini directory
        if version is not None and version >= LooseVersion("0.7.0"):
            gemini_dir = install.get_gemini_dir(data)
            cadd_file = os.path.join(gemini_dir, "whole_genome_SNVs.tsv.compressed.gz")
            if not os.path.exists(cadd_file):
                flags.append("--skip-cadd")
        # skip gerp-bp which slows down loading
        load_opts = "".join(" %s" % flag for flag in flags) + " --skip-gerp-bp "
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        tmpdir = os.path.dirname(tx_gemini_db)
        eanns = _get_effects_flag(data)
        # Apply custom resource specifications, allowing use of alternative annotation_dir
        resources = config_utils.get_resources("gemini", data["config"])
        gemini_opts = ""
        if resources.get("options"):
            gemini_opts = " ".join(str(opt) for opt in resources["options"])
        template = (
            "{gemini} {gemini_opts} load {load_opts} -v {gemini_vcf} {eanns} --cores {num_cores} "
            "--tempdir {tmpdir} {tx_gemini_db}")
        load_cmd = template.format(gemini=gemini, gemini_opts=gemini_opts,
                                   load_opts=load_opts, gemini_vcf=gemini_vcf,
                                   eanns=eanns, num_cores=num_cores,
                                   tmpdir=tmpdir, tx_gemini_db=tx_gemini_db)
        do.run(load_cmd, "Create gemini database for %s" % gemini_vcf, data)
        if ped_file:
            amend_cmd = [gemini, "amend", "--sample", ped_file, tx_gemini_db]
            do.run(amend_cmd, "Add PED file to gemini database", data)
    return gemini_db
Example #14
0
def prep_gemini_db(fnames, call_info, samples):
    """Build a gemini database from annotated VCF inputs.

    ``call_info`` unpacks to ``(name, caller, is_batch)``. Inputs are combined
    with ``get_multisample_vcf`` and loaded into
    ``<work>/gemini/<name>-<caller>.db`` when that file is missing, the
    ``do_db_build`` checks pass and at least one input has variants. The
    annotation type passed to ``-t`` is snpEff when the config's
    ``algorithm -> effects`` value is "snpeff" (the default), otherwise VEP.

    Returns ``[[(name, caller), info]]`` where ``info["db"]`` is the database
    path if the file exists after this call (else None) and ``info["vcf"]``
    is the combined VCF when ``is_batch`` is truthy (else None).
    """
    data = samples[0]
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    name, caller, is_batch = call_info
    gemini_db = os.path.join(work_dir, "%s-%s.db" % (name, caller))
    gemini_vcf = get_multisample_vcf(fnames, name, caller, data)
    use_gemini_quick = (do_db_build(samples, check_gemini=False)
                        and any(vcfutils.vcf_has_variants(f) for f in fnames))
    if (not utils.file_exists(gemini_db) and use_gemini_quick
            and do_db_build(samples)
            and any(vcfutils.vcf_has_variants(f) for f in fnames)):
        with file_transaction(data, gemini_db) as tx_gemini_db:
            gemini = config_utils.get_program("gemini", data["config"])
            gemini_ver = None
            if "program_versions" in data["config"].get("resources", {}):
                gemini_ver = programs.get_version("gemini", config=data["config"])
            version = LooseVersion(gemini_ver) if gemini_ver else None
            flags = []
            # Recent versions of gemini allow loading only passing variants
            if version is None or version > LooseVersion("0.6.2.1"):
                flags.append("--passonly")
            # For small test files, skip gene table loading which takes a long time
            if version is not None and version > LooseVersion("0.6.4"):
                if _is_small_vcf(gemini_vcf):
                    flags.append("--skip-gene-tables")
                if "/test_automated_output/" in gemini_vcf:
                    flags.append("--test-mode")
            # Skip CADD or gerp-bp when their data files are not present
            if version is not None and version >= LooseVersion("0.7.0"):
                gemini_dir = install.get_gemini_dir()
                optional_data = (
                    ("--skip-cadd", "whole_genome_SNVs.tsv.compressed.gz"),
                    ("--skip-gerp-bp", "hg19.gerp.bw"),
                )
                flags.extend(
                    skip_flag for skip_flag, check_file in optional_data
                    if not os.path.exists(os.path.join(gemini_dir, check_file)))
            load_opts = "".join(" %s" % flag for flag in flags)
            num_cores = data["config"]["algorithm"].get("num_cores", 1)
            effects = tz.get_in(("config", "algorithm", "effects"), data, "snpeff")
            eanns = "snpEff" if effects == "snpeff" else "VEP"
            template = "{gemini} load {load_opts} -v {gemini_vcf} -t {eanns} --cores {num_cores} {tx_gemini_db}"
            load_cmd = template.format(gemini=gemini, load_opts=load_opts,
                                       gemini_vcf=gemini_vcf, eanns=eanns,
                                       num_cores=num_cores,
                                       tx_gemini_db=tx_gemini_db)
            do.run(load_cmd, "Create gemini database for %s %s" % (name, caller), data)
    info = {"db": gemini_db if utils.file_exists(gemini_db) else None,
            "vcf": gemini_vcf if is_batch else None}
    return [[(name, caller), info]]
Example #15
0
def _freebayes_custom(in_file, ref_file, config):
    """Filter FreeBayes calls with bcbio.variation's ``variant-filter freebayes``.

    Tuned to human NA12878 results.

    Returns None when the reported bcbio.variation version is older than
    0.1.1. Otherwise returns the ``<base>-filter<ext>`` path derived from
    ``in_file``, running the filter only when that file does not exist yet.
    """
    installed = LooseVersion(programs.get_version("bcbio.variation", config=config))
    if installed < LooseVersion("0.1.1"):
        return None
    base, ext = os.path.splitext(in_file)
    out_file = "%s-filter%s" % (base, ext)
    if utils.file_exists(out_file):
        return out_file
    tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(in_file), "tmp"))
    jar_dir = config_utils.get_program("bcbio_variation", config, "dir")
    bv_jar = config_utils.get_jar("bcbio.variation", jar_dir)
    jvm_opts = config_utils.get_resources("bcbio_variation", config).get(
        "jvm_opts", ["-Xms750m", "-Xmx2g"])
    filter_args = ["-jar", bv_jar, "variant-filter", "freebayes", in_file, ref_file]
    cmd = ["java"] + jvm_opts + ["-Djava.io.tmpdir=%s" % tmp_dir] + filter_args
    do.run(cmd, "Custom FreeBayes filtering using bcbio.variation")
    return out_file
Example #16
0
def create_gemini_db(gemini_vcf, data, gemini_db=None):
    """Load gemini_vcf into a gemini database and return the database path.

    ``gemini_db`` defaults to the VCF name with its extension (as split by
    ``utils.splitext_plus``) replaced by ``.db``. An existing database is
    returned untouched. Returns None when the database is missing and the VCF
    has no variants. The annotation type passed to ``-t`` is snpEff when the
    config's ``algorithm -> effects`` value is "snpeff" (the default),
    otherwise VEP.
    """
    gemini_db = gemini_db or "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if utils.file_exists(gemini_db):
        return gemini_db
    if not vcfutils.vcf_has_variants(gemini_vcf):
        return None
    with file_transaction(data, gemini_db) as tx_gemini_db:
        gemini = config_utils.get_program("gemini", data["config"])
        gemini_ver = None
        if "program_versions" in data["config"].get("resources", {}):
            gemini_ver = programs.get_version("gemini", config=data["config"])
        version = LooseVersion(gemini_ver) if gemini_ver else None
        flags = []
        # Recent versions of gemini allow loading only passing variants
        if version is None or version > LooseVersion("0.6.2.1"):
            flags.append("--passonly")
        # For small test files, skip gene table loading which takes a long time
        if version is not None and version > LooseVersion("0.6.4"):
            if _is_small_vcf(gemini_vcf):
                flags.append("--skip-gene-tables")
            if "/test_automated_output/" in gemini_vcf:
                flags.append("--test-mode")
        # Skip CADD when its data file is not present in the gemini directory
        if version is not None and version >= LooseVersion("0.7.0"):
            gemini_dir = install.get_gemini_dir()
            cadd_file = os.path.join(gemini_dir, "whole_genome_SNVs.tsv.compressed.gz")
            if not os.path.exists(cadd_file):
                flags.append("--skip-cadd")
        # skip gerp-bp which slows down loading
        load_opts = "".join(" %s" % flag for flag in flags) + " --skip-gerp-bp "
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        effects = tz.get_in(("config", "algorithm", "effects"), data, "snpeff")
        eanns = "snpEff" if effects == "snpeff" else "VEP"
        template = "{gemini} load {load_opts} -v {gemini_vcf} -t {eanns} --cores {num_cores} {tx_gemini_db}"
        load_cmd = template.format(gemini=gemini, load_opts=load_opts,
                                   gemini_vcf=gemini_vcf, eanns=eanns,
                                   num_cores=num_cores,
                                   tx_gemini_db=tx_gemini_db)
        do.run(load_cmd, "Create gemini database for %s" % gemini_vcf, data)
    return gemini_db
Example #17
0
def create_gemini_db(gemini_vcf, data, gemini_db=None, ped_file=None):
    """Create a gemini database from gemini_vcf, returning the database path.

    Without an explicit ``gemini_db`` the path is the VCF name with its
    extension (as split by ``utils.splitext_plus``) replaced by ``.db``.
    Nothing is run when the database already exists. Returns None when the
    database is missing and the VCF has no variants. A truthy ``ped_file`` is
    added to the freshly loaded database with ``gemini amend --sample``.
    """
    if not gemini_db:
        vcf_base = utils.splitext_plus(gemini_vcf)[0]
        gemini_db = "%s.db" % vcf_base
    if utils.file_exists(gemini_db):
        return gemini_db
    if not vcfutils.vcf_has_variants(gemini_vcf):
        return None
    with file_transaction(data, gemini_db) as tx_gemini_db:
        gemini = config_utils.get_program("gemini", data["config"])
        has_versions = "program_versions" in data["config"].get("resources", {})
        gemini_ver = programs.get_version("gemini", config=data["config"]) if has_versions else None
        version = LooseVersion(gemini_ver) if gemini_ver else None
        known = version is not None
        flags = []
        # Recent versions of gemini allow loading only passing variants
        if not known or version > LooseVersion("0.6.2.1"):
            flags.append("--passonly")
        # For small test files, skip gene table loading which takes a long time
        if known and version > LooseVersion("0.6.4"):
            if _is_small_vcf(gemini_vcf):
                flags.append("--skip-gene-tables")
            if "/test_automated_output/" in gemini_vcf:
                flags.append("--test-mode")
        # Skip CADD when its data file is not present in the gemini directory
        if known and version >= LooseVersion("0.7.0"):
            gemini_dir = install.get_gemini_dir(data)
            if not os.path.exists(os.path.join(gemini_dir, "whole_genome_SNVs.tsv.compressed.gz")):
                flags.append("--skip-cadd")
        # skip gerp-bp which slows down loading
        load_opts = "".join(" " + flag for flag in flags) + " --skip-gerp-bp "
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        tmpdir = os.path.dirname(tx_gemini_db)
        eanns = _get_effects_flag(data)
        # Apply custom resource specifications, allowing use of alternative annotation_dir
        resources = config_utils.get_resources("gemini", data["config"])
        if resources.get("options"):
            gemini_opts = " ".join(str(opt) for opt in resources["options"])
        else:
            gemini_opts = ""
        template = ("{gemini} {gemini_opts} load {load_opts} -v {gemini_vcf} {eanns} --cores {num_cores} "
                    "--tempdir {tmpdir} {tx_gemini_db}")
        load_cmd = template.format(gemini=gemini, gemini_opts=gemini_opts, load_opts=load_opts,
                                   gemini_vcf=gemini_vcf, eanns=eanns, num_cores=num_cores,
                                   tmpdir=tmpdir, tx_gemini_db=tx_gemini_db)
        do.run(load_cmd, "Create gemini database for %s" % gemini_vcf, data)
        if ped_file:
            do.run([gemini, "amend", "--sample", ped_file, tx_gemini_db],
                   "Add PED file to gemini database", data)
    return gemini_db
Example #18
0
def prep_gemini_db(fnames, call_info, samples):
    """Create a gemini database for VCF inputs prepared with snpEff.

    ``call_info`` unpacks to ``(name, caller, is_batch)``. The inputs are
    merged by ``get_multisample_vcf`` and loaded into
    ``<work>/gemini/<name>-<caller>.db`` if that file is missing, both
    ``do_db_build`` checks pass and some input has variants.

    Returns ``[[(name, caller), info]]``: ``info["db"]`` is the database path
    when the file exists after this call, else None; ``info["vcf"]`` is the
    merged VCF when ``is_batch`` is truthy, else None.
    """
    data = samples[0]
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    name, caller, is_batch = call_info
    db_basename = "%s-%s.db" % (name, caller)
    gemini_db = os.path.join(out_dir, db_basename)
    gemini_vcf = get_multisample_vcf(fnames, name, caller, data)
    use_gemini_quick = (do_db_build(samples, check_gemini=False)
                        and any(vcfutils.vcf_has_variants(vcf) for vcf in fnames))
    # The full do_db_build check only runs once the quick check passed and
    # the database is known to be missing.
    if (not utils.file_exists(gemini_db) and use_gemini_quick
            and do_db_build(samples)
            and any(vcfutils.vcf_has_variants(vcf) for vcf in fnames)):
        with file_transaction(gemini_db) as tx_gemini_db:
            gemini = config_utils.get_program("gemini", data["config"])
            has_versions = "program_versions" in data["config"].get("resources", {})
            gemini_ver = (programs.get_version("gemini", config=data["config"])
                          if has_versions else None)
            version = LooseVersion(gemini_ver) if gemini_ver else None
            flags = []
            # Recent versions of gemini allow loading only passing variants
            if version is None or version > LooseVersion("0.6.2.1"):
                flags.append("--passonly")
            # For small test files, skip gene table loading which takes a long time
            if version is not None and version > LooseVersion("0.6.4"):
                if _is_small_vcf(gemini_vcf):
                    flags.append("--skip-gene-tables")
                if "/test_automated_output/" in gemini_vcf:
                    flags.append("--test-mode")
            load_opts = "".join(" " + flag for flag in flags)
            num_cores = data["config"]["algorithm"].get("num_cores", 1)
            cmd = "{gemini} load {load_opts} -v {gemini_vcf} -t snpEff --cores {num_cores} {tx_gemini_db}"
            cmd = cmd.format(gemini=gemini, load_opts=load_opts,
                             gemini_vcf=gemini_vcf, num_cores=num_cores,
                             tx_gemini_db=tx_gemini_db)
            do.run(cmd, "Create gemini database for %s %s" % (name, caller), data)
    db_out = gemini_db if utils.file_exists(gemini_db) else None
    vcf_out = gemini_vcf if is_batch else None
    return [[(name, caller), {"db": db_out, "vcf": vcf_out}]]
Example #19
0
def _freebayes_custom(in_file, ref_file, data):
    """Filter FreeBayes calls through the ``bcbio-variation`` command line.

    Tuned to human NA12878 results. Experimental: for testing new methods.

    Returns None when ``vcfutils.get_paired_phenotype(data)`` is truthy or the
    bcbio_variation version is older than 0.1.1. Otherwise returns the
    ``<base>-filter<ext>`` path derived from ``in_file``, running
    ``variant-filter freebayes`` only when that file does not exist yet.
    """
    if vcfutils.get_paired_phenotype(data):
        return None
    config = data["config"]
    installed = LooseVersion(programs.get_version("bcbio_variation", config=config))
    if installed < LooseVersion("0.1.1"):
        return None
    base, ext = os.path.splitext(in_file)
    out_file = "%s-filter%s" % (base, ext)
    if utils.file_exists(out_file):
        return out_file
    tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(in_file), "tmp"))
    jvm_opts = config_utils.get_resources("bcbio_variation", config).get(
        "jvm_opts", ["-Xms750m", "-Xmx2g"])
    filter_args = ["variant-filter", "freebayes", in_file, ref_file]
    cmd = ["bcbio-variation"] + jvm_opts + ["-Djava.io.tmpdir=%s" % tmp_dir] + filter_args
    do.run(cmd, "Custom FreeBayes filtering using bcbio.variation")
    return out_file
Example #20
0
def create_gemini_db(gemini_vcf, data, gemini_db=None):
    """Create a gemini database from gemini_vcf, returning the database path.

    Without an explicit ``gemini_db`` the path is the VCF name with its
    extension (as split by ``utils.splitext_plus``) replaced by ``.db``.
    Nothing is run when the database already exists. Returns None when the
    database is missing and the VCF has no variants. ``-t`` receives snpEff
    when the config's ``algorithm -> effects`` value is "snpeff" (the
    default), otherwise VEP.
    """
    if not gemini_db:
        vcf_base = utils.splitext_plus(gemini_vcf)[0]
        gemini_db = "%s.db" % vcf_base
    if utils.file_exists(gemini_db):
        return gemini_db
    if not vcfutils.vcf_has_variants(gemini_vcf):
        return None
    with file_transaction(data, gemini_db) as tx_gemini_db:
        gemini = config_utils.get_program("gemini", data["config"])
        has_versions = "program_versions" in data["config"].get("resources", {})
        gemini_ver = programs.get_version("gemini", config=data["config"]) if has_versions else None
        version = LooseVersion(gemini_ver) if gemini_ver else None
        known = version is not None
        flags = []
        # Recent versions of gemini allow loading only passing variants
        if not known or version > LooseVersion("0.6.2.1"):
            flags.append("--passonly")
        # For small test files, skip gene table loading which takes a long time
        if known and version > LooseVersion("0.6.4"):
            if _is_small_vcf(gemini_vcf):
                flags.append("--skip-gene-tables")
            if "/test_automated_output/" in gemini_vcf:
                flags.append("--test-mode")
        # Skip CADD when its data file is not present in the gemini directory
        if known and version >= LooseVersion("0.7.0"):
            gemini_dir = install.get_gemini_dir()
            if not os.path.exists(os.path.join(gemini_dir, "whole_genome_SNVs.tsv.compressed.gz")):
                flags.append("--skip-cadd")
        # skip gerp-bp which slows down loading
        load_opts = "".join(" " + flag for flag in flags) + " --skip-gerp-bp "
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        if tz.get_in(("config", "algorithm", "effects"), data, "snpeff") == "snpeff":
            eanns = "snpEff"
        else:
            eanns = "VEP"
        cmd = "{gemini} load {load_opts} -v {gemini_vcf} -t {eanns} --cores {num_cores} {tx_gemini_db}"
        cmd = cmd.format(gemini=gemini, load_opts=load_opts, gemini_vcf=gemini_vcf,
                         eanns=eanns, num_cores=num_cores, tx_gemini_db=tx_gemini_db)
        do.run(cmd, "Create gemini database for %s" % gemini_vcf, data)
    return gemini_db