Example #1
def run(items):
    """Run MetaSV if we have enough supported callers, adding output to the set of calls.
    """
    assert len(items) == 1, "Expect one input to MetaSV ensemble calling"
    data = items[0]
    work_dir = _sv_workdir(data)
    out_file = os.path.join(work_dir, "variants.vcf.gz")
    cmd = _get_cmd() + ["--sample", dd.get_sample_name(data), "--reference", dd.get_ref_file(data),
                        "--bam", dd.get_align_bam(data), "--outdir", work_dir]
    methods = []
    for call in data.get("sv", []):
        if call["variantcaller"] in SUPPORTED and call["variantcaller"] not in methods:
            methods.append(call["variantcaller"])
            cmd += ["--%s_vcf" % call["variantcaller"], call.get("vcf_file", call["vrn_file"])]
    if len(methods) >= MIN_CALLERS:
        if not utils.file_exists(out_file):
            tx_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
            ins_stats = shared.calc_paired_insert_stats_save(dd.get_align_bam(data),
                                                             os.path.join(tx_work_dir, "insert-stats.yaml"))
            cmd += ["--workdir", tx_work_dir, "--num_threads", str(dd.get_num_cores(data))]
            cmd += ["--spades", utils.which("spades.py"), "--age", utils.which("age_align")]
            cmd += ["--assembly_max_tools=1", "--assembly_pad=500"]
            cmd += ["--boost_sc", "--isize_mean", ins_stats["mean"], "--isize_sd", ins_stats["std"]]
            do.run(cmd, "Combine variant calls with MetaSV")
        filters = ("(NUM_SVTOOLS = 1 && ABS(SVLEN)>50000) || "
                   "(NUM_SVTOOLS = 1 && ABS(SVLEN)<4000 && BA_FLANK_PERCENT>80) || "
                   "(NUM_SVTOOLS = 1 && ABS(SVLEN)<4000 && BA_NUM_GOOD_REC=0) || "
                   "(ABS(SVLEN)<4000 && BA_NUM_GOOD_REC>2)")
        filter_file = vfilter.hard_w_expression(out_file, filters,
                                                data, name="ReassemblyStats", limit_regions=None)
        effects_vcf, _ = effects.add_to_vcf(filter_file, data, "snpeff")
        data["sv"].append({"variantcaller": "metasv",
                           "vrn_file": effects_vcf or filter_file})
    return [data]
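
The "ReassemblyStats" string above is a filter expression handed to vfilter.hard_w_expression. As a reading aid, here is a plain-Python sketch of the same predicate; the rec dict is a hypothetical stand-in for a parsed VCF record's INFO annotations and is not part of the original code.

# Illustrative mirror of the ReassemblyStats expression; `rec` is a
# hypothetical dict of INFO fields, not a real bcbio structure.
def fails_reassembly_stats(rec):
    single_caller = rec["NUM_SVTOOLS"] == 1
    svlen = abs(rec["SVLEN"])
    return ((single_caller and svlen > 50000) or
            (single_caller and svlen < 4000 and rec["BA_FLANK_PERCENT"] > 80) or
            (single_caller and svlen < 4000 and rec["BA_NUM_GOOD_REC"] == 0) or
            (svlen < 4000 and rec["BA_NUM_GOOD_REC"] > 2))

# A 60kb deletion supported by a single caller fails the first clause:
assert fails_reassembly_stats({"NUM_SVTOOLS": 1, "SVLEN": -60000,
                               "BA_FLANK_PERCENT": 0, "BA_NUM_GOOD_REC": 1})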
Example #2
def run(calls, data):
    """Run MetaSV if we have enough supported callers, adding output to the set of calls.
    """
    work_dir = _sv_workdir(data)
    out_file = os.path.join(work_dir, "variants.vcf.gz")
    cmd = _get_cmd() + ["--sample", dd.get_sample_name(data), "--reference", dd.get_ref_file(data),
                        "--bam", dd.get_align_bam(data), "--outdir", work_dir]
    available_callers = 0
    for call in calls:
        if call["variantcaller"] in SUPPORTED:
            available_callers += 1
            cmd += ["--%s_vcf" % call["variantcaller"], call.get("vcf_file", call["vrn_file"])]
    if available_callers >= MIN_CALLERS:
        if not utils.file_exists(out_file):
            tx_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
            ins_stats = shared.calc_paired_insert_stats_save(dd.get_align_bam(data),
                                                             os.path.join(tx_work_dir, "insert-stats.yaml"))
            cmd += ["--workdir", tx_work_dir, "--num_threads", str(dd.get_num_cores(data))]
            cmd += ["--spades", utils.which("spades.py"), "--age", utils.which("age_align")]
            cmd += ["--assembly_max_tools=1", "--assembly_pad=500"]
            cmd += ["--boost_ins", "--isize_mean", ins_stats["mean"], "--isize_sd", ins_stats["std"]]
            do.run(cmd, "Combine variant calls with MetaSV")
        filters = ("(NUM_SVTOOLS = 1 && ABS(SVLEN)>10000) || "
                   "(NUM_SVTOOLS = 1 && ABS(SVLEN)<4000 && BA_FLANK_PERCENT>20) || "
                   "(NUM_SVTOOLS = 1 && ABS(SVLEN)<4000 && BA_NUM_GOOD_REC=0) || "
                   "(ABS(SVLEN)<4000 && BA_NUM_GOOD_REC>1)")
        filter_file = vfilter.hard_w_expression(out_file, filters,
                                                data, name="ReassemblyStats", limit_regions=None)
        calls.append({"variantcaller": "metasv",
                      "vrn_file": filter_file})
    return calls
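
For context on the input contract: each entry in calls is a dict carrying a "variantcaller" name plus a VCF path, looked up from "vcf_file" with "vrn_file" as the fallback (matching the call.get above), and the function appends its own "metasv" entry on success. A minimal illustration with made-up caller names and paths; whether a caller is counted depends on the module-level SUPPORTED set, which this snippet does not show.

# Hypothetical call entries showing the keys this function reads and writes.
calls = [
    {"variantcaller": "lumpy", "vrn_file": "/work/sv/sample-lumpy.vcf.gz"},
    {"variantcaller": "manta", "vcf_file": "/work/sv/sample-manta.vcf.gz",
     "vrn_file": "/work/sv/sample-manta-ann.vcf.gz"},
]
for call in calls:
    # "vcf_file" wins when present, mirroring call.get("vcf_file", call["vrn_file"])
    print(call["variantcaller"], call.get("vcf_file", call["vrn_file"]))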
Example #3
def _variant_filtration_indel(snp_file, ref_file, vrn_files, data):
    """Filter indel variant calls using GATK best practice recommendations.
    """
    config = data["config"]
    broad_runner = broad.runner_from_config(config)
    filter_type = "INDEL"
    variantcaller = config["algorithm"].get("variantcaller", "gatk")
    if not config_utils.use_vqsr([config["algorithm"]]):
        filterexp = " || ".join(["QD < 2.0", "ReadPosRankSum < -20.0", "FS > 200.0"])
        return vfilter.hard_w_expression(snp_file, filterexp, data, filter_type)
    else:
        # also check if we've failed recal and needed to do strict filtering
        filter_file = "{base}-filter{ext}.vcf".format(base=os.path.splitext(snp_file)[0], ext=filter_type)
        if file_exists(filter_file):
            config["algorithm"]["coverage_interval"] = "regional"
            return _variant_filtration_indel(snp_file, ref_file, vrn_files, data)
        assert "train_indels" in vrn_files, "Need indel training file specified"
        params, recal_file, tranches_file = _shared_variant_filtration(
            filter_type, snp_file, ref_file, vrn_files, variantcaller)
        if not file_exists(recal_file):
            with file_transaction(recal_file, tranches_file) as (tx_recal, tx_tranches):
                params.extend(["--recal_file", tx_recal,
                               "--tranches_file", tx_tranches])
                if LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("2.7"):
                    params.extend(["--numBadVariants", "3000"])
                try:
                    broad_runner.new_resources("gatk-vqsr")
                    broad_runner.run_gatk(params, log_error=False)
                except Exception:
                    logger.info("VQSR failed due to lack of training data. Using hard filtering.")
                    config["algorithm"]["coverage_interval"] = "regional"
                    return _variant_filtration_indel(snp_file, ref_file, vrn_files, data)
        return _apply_variant_recal(broad_runner, snp_file, ref_file, recal_file,
                                    tranches_file, filter_type)
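
A note on the retry flow above: when VQSR training fails, the function flips coverage_interval to "regional" and calls itself again; the recursion terminates presumably because config_utils.use_vqsr no longer opts in once that setting is "regional", so the retry lands in the hard-filtering branch (the filter_file existence check plays the same role across pipeline restarts). A stripped-down sketch of that pattern, with the bcbio-specific decision replaced by a hypothetical stub:

# Sketch of the fall-back-to-hard-filtering pattern; use_vqsr_stub is a
# hypothetical stand-in for config_utils.use_vqsr.
def use_vqsr_stub(config):
    return config["algorithm"].get("coverage_interval") != "regional"

def filter_variants(config, run_vqsr, hard_filter):
    if not use_vqsr_stub(config):
        return hard_filter()
    try:
        return run_vqsr()
    except Exception:
        # Not enough training data: downgrade and retry via hard filters.
        config["algorithm"]["coverage_interval"] = "regional"
        return filter_variants(config, run_vqsr, hard_filter)

def boom():
    raise RuntimeError("VQSR failed")

print(filter_variants({"algorithm": {}}, boom, lambda: "hard-filtered.vcf"))
# -> hard-filtered.vcf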
Example #4
def run(items):
    """Run MetaSV if we have enough supported callers, adding output to the set of calls.
    """
    assert len(items) == 1, "Expect one input to MetaSV ensemble calling"
    data = items[0]
    work_dir = _sv_workdir(data)
    out_file = os.path.join(work_dir, "variants.vcf.gz")
    cmd = _get_cmd() + ["--sample", dd.get_sample_name(data),
                        "--reference", dd.get_ref_file(data),
                        "--bam", dd.get_align_bam(data),
                        "--outdir", work_dir]
    methods = []
    for call in data.get("sv", []):
        if call["variantcaller"] in SUPPORTED and call[
                "variantcaller"] not in methods:
            methods.append(call["variantcaller"])
            cmd += [
                "--%s_vcf" % call["variantcaller"],
                call.get("vcf_file", call["vrn_file"])
            ]
    if len(methods) >= MIN_CALLERS:
        if not utils.file_exists(out_file):
            tx_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
            ins_stats = shared.calc_paired_insert_stats_save(
                dd.get_align_bam(data),
                os.path.join(tx_work_dir, "insert-stats.yaml"))
            cmd += ["--workdir", tx_work_dir,
                    "--num_threads", str(dd.get_num_cores(data))]
            cmd += ["--spades", utils.which("spades.py"),
                    "--age", utils.which("age_align")]
            cmd += ["--assembly_max_tools=1", "--assembly_pad=500"]
            cmd += ["--boost_sc", "--isize_mean", ins_stats["mean"],
                    "--isize_sd", ins_stats["std"]]
            do.run(cmd, "Combine variant calls with MetaSV")
        filters = (
            "(NUM_SVTOOLS = 1 && ABS(SVLEN)>50000) || "
            "(NUM_SVTOOLS = 1 && ABS(SVLEN)<4000 && BA_FLANK_PERCENT>80) || "
            "(NUM_SVTOOLS = 1 && ABS(SVLEN)<4000 && BA_NUM_GOOD_REC=0) || "
            "(ABS(SVLEN)<4000 && BA_NUM_GOOD_REC>2)")
        filter_file = vfilter.hard_w_expression(out_file,
                                                filters,
                                                data,
                                                name="ReassemblyStats",
                                                limit_regions=None)
        effects_vcf, _ = effects.add_to_vcf(filter_file, data, "snpeff")
        data["sv"].append({
            "variantcaller": "metasv",
            "vrn_file": effects_vcf or filter_file
        })
    return [data]
Example #5
def _filter_by_support(in_file, data):
    """Filter call file based on supporting evidence, adding FILTER annotations to VCF.

    Filters based on the following criteria:
      - Minimum read support for the call (SU = total support)
      - Large calls need split read evidence.
    """
    rc_filter = ("FORMAT/SU < 4 || "
                 "(FORMAT/SR == 0 && ABS(SVLEN)>20000)")
    return vfilter.hard_w_expression(in_file, rc_filter, data, name="ReadCountSupport",
                                     limit_regions=None)
Example #6
def _filter_by_support(in_file, data):
    """Filter call file based on supporting evidence, adding FILTER annotations to VCF.

    Filters based on the following criteria:
      - Minimum read support for the call.
    Other filters not currently applied due to being too restrictive:
      - Multiple forms of evidence in any sample (split and paired end)
    """
    rc_filter = "FORMAT/SU < 4"
    # approach_filter = "FORMAT/SR == 0 || FORMAT/PE == 0"
    return vfilter.hard_w_expression(in_file, rc_filter, data, name="ReadCountSupport")
Example #7
def run(items):
    """Perform detection of structural variations with delly.

    Performs post-call filtering with a custom filter tuned based
    on NA12878 Moleculo and PacBio data, using calls prepared by
    @ryanlayer and @cc2qe

    Filters using the high quality variant pairs (DV) compared with
    high quality reference pairs (DR).
    """
    work_dir = utils.safe_makedir(
        os.path.join(items[0]["dirs"]["work"], "structural",
                     items[0]["name"][-1], "delly"))
    work_bams = [data["align_bam"] for data in items]
    ref_file = utils.get_in(items[0], ("reference", "fasta", "base"))
    # Add core request for delly
    config = copy.deepcopy(items[0]["config"])
    delly_config = utils.get_in(config, ("resources", "delly"), {})
    delly_config["cores"] = len(items)
    config["resources"]["delly"] = delly_config
    parallel = {
        "type": "local",
        "cores": config["algorithm"].get("num_cores", 1),
        "progs": ["delly"]
    }
    # "TRA" has invalid VCF END specifications that GATK doesn't like
    sv_types = ["DEL", "DUP", "INV"]
    with closing(pysam.Samfile(work_bams[0], "rb")) as pysam_work_bam:
        bytype_vcfs = run_multicore(_run_delly,
                                    [(work_bams, chrom, sv_type, ref_file, work_dir, items)
                                     for (chrom, sv_type)
                                     in itertools.product(pysam_work_bam.references, sv_types)],
                                    config, parallel)
    out_file = "%s.vcf.gz" % os.path.commonprefix(bytype_vcfs)
    combo_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file, ref_file,
                                               items[0]["config"])
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        base, ext = utils.splitext_plus(combo_vcf)
        sample = tz.get_in(["rgnames", "sample"], data)
        delly_sample_vcf = vcfutils.select_sample(
            combo_vcf, sample, "%s-%s%s" % (base, sample, ext), data["config"])
        delly_vcf = vfilter.hard_w_expression(
            delly_sample_vcf,
            "FMT/DV < 4 || (FMT/DV / (FMT/DV + FMT/DR)) < 0.2",
            data,
            name="DVSupport")
        data["sv"].append({"variantcaller": "delly", "vrn_file": delly_vcf})
        out.append(data)
    return out
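
The DVSupport expression combines an absolute threshold on delly's high-quality variant pairs (DV) with a relative one against high-quality reference pairs (DR), per the docstring. A plain-Python reading with made-up counts; the real filtering is applied through vfilter.hard_w_expression.

# Illustrative only: mirrors "FMT/DV < 4 || (FMT/DV / (FMT/DV + FMT/DR)) < 0.2".
def fails_dv_support(dv, dr):
    return dv < 4 or float(dv) / (dv + dr) < 0.2

assert fails_dv_support(dv=3, dr=0)        # too few variant pairs outright
assert fails_dv_support(dv=5, dr=40)       # ~11% variant fraction, below 0.2
assert not fails_dv_support(dv=10, dr=20)  # 33% variant fraction passes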
Example #8
def _filter_by_support(in_file, data):
    """Filter call file based on supporting evidence, adding FILTER annotations to VCF.

    Filters based on the following criteria:
      - Minimum read support for the call.
    Other filters not currently applied due to being too restrictive:
      - Multiple forms of evidence in any sample (split and paired end)
    """
    rc_filter = "FORMAT/SU < 4"
    # approach_filter = "FORMAT/SR == 0 || FORMAT/PE == 0"
    return vfilter.hard_w_expression(in_file, rc_filter, data, name="ReadCountSupport",
                                     limit_regions=None)
Example #9
def _filter_by_support(in_file, data):
    """Filter call file based on supporting evidence, adding FILTER annotations to VCF.

    Filters based on the following criteria:
      - Minimum read support for the call (SU = total support)
      - Large calls need split read evidence.
    """
    rc_filter = ("FORMAT/SU < 4 || "
                 "(FORMAT/SR == 0 && FORMAT/SU < 15 && ABS(SVLEN)>50000) || "
                 "(FORMAT/SR == 0 && FORMAT/SU < 5 && ABS(SVLEN)<2000) || "
                 "(FORMAT/SR == 0 && FORMAT/SU < 15 && ABS(SVLEN)<300)")
    return vfilter.hard_w_expression(in_file, rc_filter, data, name="ReadCountSupport",
                                     limit_regions=None)
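
This version of the filter scales the split-read requirement with call size: calls lacking split-read evidence (SR) are rejected at different total-support (SU) thresholds depending on whether they are very large, mid-size, or very small. A plain-Python mirror with hypothetical record values, to make the clauses easier to scan:

# Illustrative mirror of the ReadCountSupport expression above.
def fails_read_count_support(su, sr, svlen):
    size = abs(svlen)
    return (su < 4 or
            (sr == 0 and su < 15 and size > 50000) or
            (sr == 0 and su < 5 and size < 2000) or
            (sr == 0 and su < 15 and size < 300))

assert fails_read_count_support(su=10, sr=0, svlen=80000)      # big call, no split reads
assert not fails_read_count_support(su=10, sr=2, svlen=80000)  # split reads rescue it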
Example #10
def _variant_filtration_snp(snp_file, ref_file, vrn_files, data):
    """Filter SNP variant calls using GATK best practice recommendations.
    """
    config = data["config"]
    broad_runner = broad.runner_from_config(config)
    filter_type = "SNP"
    variantcaller = config["algorithm"].get("variantcaller", "gatk")
    filters = [
        "QD < 2.0", "MQ < 40.0", "FS > 60.0", "MQRankSum < -12.5",
        "ReadPosRankSum < -8.0"
    ]
    # GATK Haplotype caller (v2.2) appears to have much larger HaplotypeScores
    # resulting in excessive filtering, so avoid this metric
    if variantcaller not in ["gatk-haplotype"]:
        filters.append("HaplotypeScore > 13.0")
    if not config_utils.use_vqsr([config["algorithm"]]):
        return vfilter.hard_w_expression(snp_file, " || ".join(filters), data,
                                         filter_type)
    else:
        # also check if we've failed recal and needed to do strict filtering
        filter_file = "{base}-filter{ext}.vcf".format(
            base=os.path.splitext(snp_file)[0], ext=filter_type)
        if file_exists(filter_file):
            config["algorithm"]["coverage_interval"] = "regional"
            return _variant_filtration_snp(snp_file, ref_file, vrn_files, data)
        assert "train_hapmap" in vrn_files and "train_1000g_omni" in vrn_files, \
            "Need HapMap and 1000 genomes training files"
        params, recal_file, tranches_file = _shared_variant_filtration(
            filter_type, snp_file, ref_file, vrn_files, variantcaller)
        if not file_exists(recal_file):
            with file_transaction(recal_file,
                                  tranches_file) as (tx_recal, tx_tranches):
                params.extend(
                    ["--recal_file", tx_recal, "--tranches_file", tx_tranches])
                try:
                    broad_runner.new_resources("gatk-vqsr")
                    broad_runner.run_gatk(params, log_error=False)
                # Can fail to run if not enough values are present to train. Rerun with regional
                # filtration approach instead
                except Exception:
                    logger.info("VQSR failed due to lack of training data. Using hard filtering.")
                    config["algorithm"]["coverage_interval"] = "regional"
                    return _variant_filtration_snp(snp_file, ref_file,
                                                   vrn_files, data)
        return _apply_variant_recal(broad_runner, snp_file, ref_file,
                                    recal_file, tranches_file, filter_type)
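
In the non-VQSR branch, the filters list is collapsed into a single expression string before being handed to vfilter.hard_w_expression. Shown standalone for clarity; for non-HaplotypeCaller input the combined expression is:

filters = ["QD < 2.0", "MQ < 40.0", "FS > 60.0",
           "MQRankSum < -12.5", "ReadPosRankSum < -8.0",
           "HaplotypeScore > 13.0"]  # HaplotypeScore omitted for gatk-haplotype
print(" || ".join(filters))
# QD < 2.0 || MQ < 40.0 || FS > 60.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0 || HaplotypeScore > 13.0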
Example #11
def run(items):
    """Perform detection of structural variations with delly.

    Performs post-call filtering with a custom filter tuned based
    on NA12878 Moleculo and PacBio data, using calls prepared by
    @ryanlayer and @cc2qe

    Filters using the high quality variant pairs (DV) compared with
    high quality reference pairs (DR).
    """
    work_dir = utils.safe_makedir(os.path.join(items[0]["dirs"]["work"], "structural",
                                               items[0]["name"][-1], "delly"))
    work_bams = [data["align_bam"] for data in items]
    ref_file = utils.get_in(items[0], ("reference", "fasta", "base"))
    # Add core request for delly
    config = copy.deepcopy(items[0]["config"])
    delly_config = utils.get_in(config, ("resources", "delly"), {})
    delly_config["cores"] = len(items)
    config["resources"]["delly"] = delly_config
    parallel = {"type": "local", "cores": config["algorithm"].get("num_cores", 1),
                "progs": ["delly"]}
    sv_types = ["DEL", "DUP", "INV"]  # "TRA" has invalid VCF END specifications that GATK doesn't like
    exclude_file = _get_full_exclude_file(items, work_dir)
    bytype_vcfs = run_multicore(_run_delly, [(work_bams, chrom, sv_type, ref_file, work_dir, items)
                                             for (chrom, sv_type)
                                             in itertools.product(get_sv_chroms(items, exclude_file), sv_types)],
                                config, parallel)
    out_file = "%s.vcf.gz" % os.path.commonprefix(bytype_vcfs)
    combo_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file, ref_file, items[0]["config"])
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        base, ext = utils.splitext_plus(combo_vcf)
        sample = tz.get_in(["rgnames", "sample"], data)
        delly_sample_vcf = vcfutils.select_sample(combo_vcf, sample,
                                                  "%s-%s%s" % (base, sample, ext), data["config"])
        delly_vcf = vfilter.hard_w_expression(delly_sample_vcf,
                                              "FMT/DV < 4 || (FMT/DV / (FMT/DV + FMT/DR)) < 0.2", data,
                                              name="DVSupport")
        data["sv"].append({"variantcaller": "delly", "vrn_file": delly_vcf,
                           "exclude": exclude_file})
        out.append(data)
    return out
Example #12
def _variant_filtration_indel(snp_file, ref_file, vrn_files, data):
    """Filter indel variant calls using GATK best practice recommendations.
    """
    config = data["config"]
    broad_runner = broad.runner_from_config(config)
    filter_type = "INDEL"
    variantcaller = config["algorithm"].get("variantcaller", "gatk")
    if not config_utils.use_vqsr([config["algorithm"]]):
        filterexp = " || ".join(
            ["QD < 2.0", "ReadPosRankSum < -20.0", "FS > 200.0"])
        return vfilter.hard_w_expression(snp_file, filterexp, data,
                                         filter_type)
    else:
        # also check if we've failed recal and needed to do strict filtering
        filter_file = "{base}-filter{ext}.vcf".format(
            base=os.path.splitext(snp_file)[0], ext=filter_type)
        if file_exists(filter_file):
            config["algorithm"]["coverage_interval"] = "regional"
            return _variant_filtration_indel(snp_file, ref_file, vrn_files,
                                             data)
        assert "train_indels" in vrn_files, "Need indel training file specified"
        params, recal_file, tranches_file = _shared_variant_filtration(
            filter_type, snp_file, ref_file, vrn_files, variantcaller)
        if not file_exists(recal_file):
            with file_transaction(recal_file,
                                  tranches_file) as (tx_recal, tx_tranches):
                params.extend(
                    ["--recal_file", tx_recal, "--tranches_file", tx_tranches])
                if LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("2.7"):
                    params.extend(["--numBadVariants", "3000"])
                try:
                    broad_runner.new_resources("gatk-vqsr")
                    broad_runner.run_gatk(params, log_error=False)
                except Exception:
                    logger.info("VQSR failed due to lack of training data. Using hard filtering.")
                    config["algorithm"]["coverage_interval"] = "regional"
                    return _variant_filtration_indel(snp_file, ref_file,
                                                     vrn_files, data)
        return _apply_variant_recal(broad_runner, snp_file, ref_file,
                                    recal_file, tranches_file, filter_type)
Example #13
def _variant_filtration_snp(snp_file, ref_file, vrn_files, data):
    """Filter SNP variant calls using GATK best practice recommendations.
    """
    config = data["config"]
    broad_runner = broad.runner_from_config(config)
    filter_type = "SNP"
    variantcaller = config["algorithm"].get("variantcaller", "gatk")
    filters = ["QD < 2.0", "MQ < 40.0", "FS > 60.0",
               "MQRankSum < -12.5", "ReadPosRankSum < -8.0"]
    # GATK Haplotype caller (v2.2) appears to have much larger HaplotypeScores
    # resulting in excessive filtering, so avoid this metric
    if variantcaller not in ["gatk-haplotype"]:
        filters.append("HaplotypeScore > 13.0")
    if not config_utils.use_vqsr([config["algorithm"]]):
        return vfilter.hard_w_expression(snp_file, " || ".join(filters), data, filter_type)
    else:
        # also check if we've failed recal and needed to do strict filtering
        filter_file = "{base}-filter{ext}.vcf".format(base=os.path.splitext(snp_file)[0], ext=filter_type)
        if file_exists(filter_file):
            config["algorithm"]["coverage_interval"] = "regional"
            return _variant_filtration_snp(snp_file, ref_file, vrn_files, data)
        assert "train_hapmap" in vrn_files and "train_1000g_omni" in vrn_files, \
            "Need HapMap and 1000 genomes training files"
        params, recal_file, tranches_file = _shared_variant_filtration(
            filter_type, snp_file, ref_file, vrn_files, variantcaller)
        if not file_exists(recal_file):
            with file_transaction(recal_file, tranches_file) as (tx_recal, tx_tranches):
                params.extend(["--recal_file", tx_recal,
                               "--tranches_file", tx_tranches])
                try:
                    broad_runner.new_resources("gatk-vqsr")
                    broad_runner.run_gatk(params, log_error=False)
                # Can fail to run if not enough values are present to train. Rerun with regional
                # filtration approach instead
                except Exception:
                    logger.info("VQSR failed due to lack of training data. Using hard filtering.")
                    config["algorithm"]["coverage_interval"] = "regional"
                    return _variant_filtration_snp(snp_file, ref_file, vrn_files, data)
        return _apply_variant_recal(broad_runner, snp_file, ref_file, recal_file,
                                    tranches_file, filter_type)