Example #1
def split_somatic(items):
    """Split somatic batches, adding a germline target.

    Enables separate germline calling of samples using shared alignments.
    """
    somatic_groups, somatic, non_somatic = vcfutils.somatic_batches(items)
    # extract germline samples to run from normals in tumor/normal pairs
    germline_added = set([])
    germline = []
    for somatic_group in somatic_groups:
        paired = vcfutils.get_paired(somatic_group)
        if paired and paired.normal_data:
            cur = utils.deepish_copy(paired.normal_data)
            vc = dd.get_variantcaller(cur)
            if isinstance(vc, dict) and "germline" in vc:
                cur["description"] = "%s-germline" % cur["description"]
                if cur["description"] not in germline_added:
                    germline_added.add(cur["description"])
                    cur["rgnames"]["sample"] = cur["description"]
                    del cur["metadata"]["batch"]
                    cur["metadata"]["phenotype"] = "germline"
                    cur = remove_align_qc_tools(cur)
                    cur["config"]["algorithm"]["variantcaller"] = vc[
                        "germline"]
                    germline.append(cur)
    # Fix variantcalling specification for only somatic targets
    somatic_out = []
    for data in somatic:
        vc = dd.get_variantcaller(data)
        if isinstance(vc, dict) and "somatic" in vc:
            data["config"]["algorithm"]["variantcaller"] = vc["somatic"]
        somatic_out.append(data)
    return non_somatic + somatic_out + germline
Example #2
def split_somatic(items):
    """Split somatic batches, adding a germline target.

    Enables separate germline calling of samples using shared alignments.
    """
    items = [_clean_flat_variantcaller(x) for x in items]
    somatic_groups, somatic, non_somatic = vcfutils.somatic_batches(items)
    # extract germline samples to run from normals in tumor/normal pairs
    germline_added = set([])
    germline = []
    for somatic_group in somatic_groups:
        paired = vcfutils.get_paired(somatic_group)
        if paired and paired.normal_data:
            cur = utils.deepish_copy(paired.normal_data)
            vc = dd.get_variantcaller(cur)
            if isinstance(vc, dict) and "germline" in vc:
                if cur["description"] not in germline_added:
                    germline_added.add(cur["description"])
                    cur["rgnames"]["sample"] = cur["description"]
                    cur["metadata"]["batch"] = "%s-germline" % cur["description"]
                    cur["metadata"]["phenotype"] = "germline"
                    cur = remove_align_qc_tools(cur)
                    cur["config"]["algorithm"]["variantcaller"] = vc["germline"]
                    germline.append(cur)
    # Fix variantcalling specification for only somatic targets
    somatic_out = []
    for data in somatic:
        vc = dd.get_variantcaller(data)
        if isinstance(vc, dict) and "somatic" in vc:
            data["config"]["algorithm"]["variantcaller"] = vc["somatic"]
        somatic_out.append(data)
    return non_somatic + somatic_out + germline
Example #3
def run_jointvc(items):
    items = [utils.to_single_data(x) for x in items]
    data = items[0]
    if not dd.get_jointcaller(data):
        data["config"]["algorithm"][
            "jointcaller"] = "%s-joint" % dd.get_variantcaller(data)
    # GenomicsDBImport uses 1-based coordinates. That's unexpected, convert over to these.
    chrom, coords = data["region"].split(":")
    start, end = coords.split("-")
    ready_region = "%s:%s-%s" % (chrom, int(start) + 1, end)
    str_region = ready_region.replace(":", "_")
    batches = dd.get_batches(data) or dd.get_sample_name(data)
    if not isinstance(batches, (list, tuple)):
        batches = [batches]
    out_file = os.path.join(
        utils.safe_makedir(
            os.path.join(dd.get_work_dir(data), "joint",
                         dd.get_variantcaller(data),
                         str_region)), "%s-%s-%s.vcf.gz" %
        (batches[0], dd.get_variantcaller(data), str_region))
    joint_out = square_batch_region(data, ready_region, [],
                                    [d["vrn_file"] for d in items],
                                    out_file)[0]
    data["vrn_file_region"] = joint_out["vrn_file"]
    return data
Example #4
def summarize_vc(items):
    """CWL target: summarize variant calls and validation for multiple samples.
    """
    items = [utils.to_single_data(x) for x in utils.flatten(items)]
    items = [_normalize_vc_input(x) for x in items]
    items = validate.summarize_grading(items)
    items = [utils.to_single_data(x) for x in items]
    out = {
        "validate": validate.combine_validations(items),
        "variants": {
            "calls": [],
            "gvcf": [],
            "samples": []
        }
    }
    added = set([])
    variants_by_sample = collections.defaultdict(list)
    sample_order = []
    for data in items:
        batch_samples = data.get("batch_samples", [dd.get_sample_name(data)])
        for s in batch_samples:
            if s not in sample_order:
                sample_order.append(s)
        if data.get("vrn_file"):
            # Only get batches if we're actually doing variantcalling in bcbio
            # otherwise we'll be using the original files
            names = dd.get_batches(data) if dd.get_variantcaller(
                data) else None
            if not names:
                names = [dd.get_sample_name(data)]
            batch_name = names[0]
            if data.get("vrn_file_joint") is not None:
                to_add = [("vrn_file", "gvcf", dd.get_sample_name(data)),
                          ("vrn_file_joint", "calls", batch_name)]
            else:
                to_add = [("vrn_file", "calls", batch_name)]
            for vrn_key, out_key, name in to_add:
                cur_name = "%s-%s" % (name, dd.get_variantcaller(data))
                out_file = os.path.join(
                    utils.safe_makedir(
                        os.path.join(dd.get_work_dir(data), "variants",
                                     out_key)), "%s.vcf.gz" % cur_name)
                for s in batch_samples:
                    variants_by_sample[s].append(out_file)
                if cur_name not in added:
                    added.add(cur_name)
                    # Ideally could symlink here but doesn't appear to work with
                    # Docker container runs on Toil where PATHs don't get remapped
                    utils.copy_plus(os.path.realpath(data[vrn_key]), out_file)
                    vcfutils.bgzip_and_index(out_file, data["config"])
                    out["variants"][out_key].append(out_file)
    for sample in sample_order:
        out["variants"]["samples"].append(variants_by_sample[sample])
    return [out]
Example #5
def run_jointvc(items):
    items = [utils.to_single_data(x) for x in items]
    data = items[0]
    if not dd.get_jointcaller(data):
        data["config"]["algorithm"]["jointcaller"] = "%s-joint" % dd.get_variantcaller(data)
    # GenomicsDBImport uses 1-based coordinates. That's unexpected, convert over to these.
    chrom, coords = data["region"].split(":")
    start, end = coords.split("-")
    ready_region = "%s:%s-%s" % (chrom, int(start) + 1, end)
    str_region = ready_region.replace(":", "_")
    out_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data), "joint",
                                                            dd.get_variantcaller(data), str_region)),
                            "%s-%s-%s.vcf.gz" % (dd.get_batches(data)[0], dd.get_variantcaller(data), str_region))
    joint_out = square_batch_region(data, ready_region, [], [d["vrn_file"] for d in items], out_file)[0]
    data["vrn_file_region"] = joint_out["vrn_file"]
    return data
Example #6
def batch(samples):
    """CWL: batch together per sample, joint and germline calls for ensemble combination.

    Sets up groups of same sample/batch variant calls for ensemble calling, as
    long as we have more than one caller per group.
    """
    samples = [utils.to_single_data(x) for x in samples]
    sample_order = [dd.get_sample_name(x) for x in samples]
    batch_groups = collections.defaultdict(list)
    for data in samples:
        batch_samples = tuple(data.get("batch_samples", [dd.get_sample_name(data)]))
        batch_groups[(batch_samples, dd.get_phenotype(data))].append(data)

    out = []
    for (batch_samples, phenotype), gsamples in batch_groups.items():
        if len(gsamples) > 1:
            batches = set([])
            for d in gsamples:
                batches |= set(dd.get_batches(d))
            cur = copy.deepcopy(gsamples[0])
            cur.update({"batch_id": sorted(list(batches))[0] if batches else "_".join(batch_samples),
                        "batch_samples": batch_samples,
                        "variants": {"variantcallers": [dd.get_variantcaller(d) for d in gsamples],
                                     "calls": [d.get("vrn_file") for d in gsamples]}})
            out.append(cur)

    def by_original_order(d):
        return min([sample_order.index(s) for s in d["batch_samples"] if s in sample_order])
    return sorted(out, key=by_original_order)
Example #7
def _variant_checkpoints(samples):
    """Check sample configuration to identify required steps in analysis.
    """
    checkpoints = {}
    checkpoints["vc"] = any([dd.get_variantcaller(d) for d in samples])
    checkpoints["hla"] = any([dd.get_hlacaller(d) for d in samples])
    return checkpoints
Example #8
def _check_for_problem_somatic_batches(items, config):
    """Identify problem batch setups for somatic calling.

    We do not support multiple tumors in a single batch and VarDict(Java) does not
    handle pooled calling, only tumor/normal.
    """
    to_check = []
    for data in items:
        data = copy.deepcopy(data)
        data["config"] = config_utils.update_w_custom(config, data)
        to_check.append(data)
    data_by_batches = collections.defaultdict(list)
    for data in to_check:
        batches = dd.get_batches(data)
        if batches:
            for batch in batches:
                data_by_batches[batch].append(data)
    for batch, items in data_by_batches.items():
        if vcfutils.get_paired(items):
            vcfutils.check_paired_problems(items)
        elif len(items) > 1:
            vcs = list(set(tz.concat([dd.get_variantcaller(data) or [] for data in items])))
            if any(x.lower().startswith("vardict") for x in vcs):
                raise ValueError("VarDict does not support pooled non-tumor/normal calling, in batch %s: %s"
                                 % (batch, [dd.get_sample_name(data) for data in items]))
            elif any(x.lower() == "mutect" for x in vcs):
                raise ValueError("Mutect requires a 'phenotype: tumor' sample for calling, in batch %s: %s"
                                 % (batch, [dd.get_sample_name(data) for data in items]))
Example #9
def gatk_rnaseq_calling(data):
    """Use GATK to perform gVCF variant calling on RNA-seq data
    """
    from bcbio.bam import callable
    data = utils.deepish_copy(data)
    tools_on = dd.get_tools_on(data)
    if not tools_on:
        tools_on = []
    tools_on.append("gvcf")
    data = dd.set_tools_on(data, tools_on)
    data = dd.set_jointcaller(data, ["%s-joint" % v for v in dd.get_variantcaller(data)])
    out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                              "variation", "rnaseq", "gatk-haplotype"))
    data = _setup_variant_regions(data, out_dir)
    out_file = os.path.join(out_dir, "%s-gatk-haplotype.vcf.gz" % dd.get_sample_name(data))
    if not utils.file_exists(out_file):
        region_files = []
        regions = []
        for cur_region in callable.get_split_regions(dd.get_variant_regions(data), data):
            str_region = "_".join([str(x) for x in cur_region])
            region_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                                                    "variation", "rnaseq", "gatk-haplotype",
                                                                    "regions")),
                                    "%s-%s-gatk-haplotype.vcf.gz" % (dd.get_sample_name(data), str_region))
            region_file = gatk.haplotype_caller([dd.get_split_bam(data)], [data], dd.get_ref_file(data), {},
                                                region=cur_region, out_file=region_file)
            region_files.append(region_file)
            regions.append(cur_region)
        out_file = vcfutils.concat_variant_files(region_files, out_file, regions,
                                                 dd.get_ref_file(data), data["config"])
    return dd.set_vrn_file(data, out_file)
Example #10
def _needs_java(data):
    """Check if a caller needs external java for MuTect.

    No longer check for older GATK (<3.6) versions because of time cost; this
    won't be relevant to most runs so we skip the sanity check.
    """
    vc = dd.get_variantcaller(data)
    if isinstance(vc, dict):
        out = {}
        for k, v in vc.items():
            if not isinstance(v, (list, tuple)):
                v = [v]
            out[k] = v
        vc = out
    elif not isinstance(vc, (list, tuple)):
        vc = [vc]
    if "mutect" in vc or ("somatic" in vc and "mutect" in vc["somatic"]):
        return True
    if "gatk" in vc or "gatk-haplotype" in vc or (
            "germline" in vc and "gatk-haplotype" in vc["germline"]):
        pass
        # runner = broad.runner_from_config(data["config"])
        # version = runner.get_gatk_version()
        # if LooseVersion(version) < LooseVersion("3.6"):
        #     return True
    return False
Example #11
def _needs_java(data):
    """Check if a caller needs external java for MuTect.

    No longer check for older GATK (<3.6) versions because of time cost; this
    won't be relevant to most runs so we skip the sanity check.
    """
    vc = dd.get_variantcaller(data)
    if isinstance(vc, dict):
        out = {}
        for k, v in vc.items():
            if not isinstance(v, (list, tuple)):
                v = [v]
            out[k] = v
        vc = out
    elif not isinstance(vc, (list, tuple)):
        vc = [vc]
    if "mutect" in vc or ("somatic" in vc and "mutect" in vc["somatic"]):
        return True
    if "gatk" in vc or "gatk-haplotype" in vc or ("germline" in vc and "gatk-haplotype" in vc["germline"]):
        pass
        # runner = broad.runner_from_config(data["config"])
        # version = runner.get_gatk_version()
        # if LooseVersion(version) < LooseVersion("3.6"):
        #     return True
    return False
Example #12
def rnaseq_variant_calling(samples, run_parallel):
    """
    run RNA-seq variant calling using GATK
    """
    samples = run_parallel("run_rnaseq_variant_calling", samples)
    variantcaller = dd.get_variantcaller(to_single_data(samples[0]))
    if variantcaller and ("gatk-haplotype" in variantcaller):
        out = []
        for d in joint.square_off(samples, run_parallel):
            out.extend(
                [[to_single_data(xs)]
                 for xs in multi.split_variants_by_sample(to_single_data(d))])
        samples = out
    if variantcaller:
        samples = run_parallel("run_rnaseq_ann_filter", samples)
    if variantcaller and ("gatk-haplotype" in variantcaller):
        out = []
        for data in (to_single_data(xs) for xs in samples):
            if "variants" not in data:
                data["variants"] = []
            data["variants"].append({
                "variantcaller": "gatk-haplotype",
                "vcf": data["vrn_file_orig"],
                "population": {
                    "vcf": data["vrn_file"]
                }
            })
            data["vrn_file"] = data.pop("vrn_file_orig")
            out.append([data])
        samples = out
    return samples
Example #13
def _rnaseq_checkpoints(samples):
    """Check sample configuration to identify required steps in analysis.
    """
    checkpoints = {}
    checkpoints["rnaseq"] = True
    checkpoints["vc"] = any([dd.get_variantcaller(d) for d in samples])
    return checkpoints
Example #14
def summarize_vc(items):
    """CWL target: summarize variant calls and validation for multiple samples.
    """
    items = [
        utils.to_single_data(x) for x in validate.summarize_grading(items)
    ]
    out = {"validate": items[0]["validate"], "variants": {"calls": []}}
    added = set([])
    for data in items:
        if data.get("vrn_file"):
            names = dd.get_batches(data)
            if not names:
                names = [dd.get_sample_name(data)]
            cur_name = "%s-%s" % (names[0], dd.get_variantcaller(data))
            if cur_name not in added:
                out_file = os.path.join(
                    utils.safe_makedir(
                        os.path.join(dd.get_work_dir(data), "variants",
                                     "calls")), "%s.vcf.gz" % cur_name)
                added.add(cur_name)
                # Ideally could symlink here but doesn't appear to work with
                # Docker container runs on Toil where PATHs don't get remapped
                utils.copy_plus(os.path.realpath(data["vrn_file"]), out_file)
                vcfutils.bgzip_and_index(out_file, data["config"])
                out["variants"]["calls"].append(out_file)
    return [out]
Example #16
def run_rnaseq_variant_calling(data):
    """
    run RNA-seq variant calling, variation file is stored in `vrn_file`
    in the datadict
    """
    variantcaller = dd.get_variantcaller(data)
    if isinstance(variantcaller, list) and len(variantcaller) > 1:
        logger.error("Only one variantcaller can be run for RNA-seq at "
                     "this time. Post an issue here "
                     "(https://github.com/bcbio/bcbio-nextgen/issues) "
                     "if this is something you need to do.")
        sys.exit(1)

    if variantcaller:
        if "gatk-haplotype" in variantcaller:
            data = variation.rnaseq_gatk_variant_calling(data)
        if vardict.get_vardict_command(data):
            data = variation.rnaseq_vardict_variant_calling(data)
    if dd.get_vrn_file(data):
        ann_file = vcfanno.run_vcfanno(dd.get_vrn_file(data), ["rnaedit"],
                                       data)
        if ann_file:
            data = dd.set_vrn_file(data, ann_file)
        ann_file = population.run_vcfanno(dd.get_vrn_file(data), data,
                                          population.do_db_build([data]))
        if ann_file:
            data = dd.set_vrn_file(data, ann_file)
    return [[data]]
Example #17
def summarize_vc(items):
    """CWL target: summarize variant calls and validation for multiple samples.
    """
    items = [utils.to_single_data(x) for x in validate.summarize_grading(items)]
    out = {"validate": items[0]["validate"],
           "variants": {"calls": [], "gvcf": []}}
    added = set([])
    for data in items:
        if data.get("vrn_file"):
            names = dd.get_batches(data)
            if not names:
                names = [dd.get_sample_name(data)]
            batch_name = names[0]
            if data.get("vrn_file_joint") is not None:
                to_add = [("vrn_file", "gvcf", dd.get_sample_name(data)),
                          ("vrn_file_joint", "calls", batch_name)]
            else:
                to_add = [("vrn_file", "calls", batch_name)]
            for vrn_key, out_key, name in to_add:
                cur_name = "%s-%s" % (name, dd.get_variantcaller(data))
                if cur_name not in added:
                    out_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                                                            "variants", out_key)),
                                            "%s.vcf.gz" % cur_name)
                    added.add(cur_name)
                    # Ideally could symlink here but doesn't appear to work with
                    # Docker container runs on Toil where PATHs don't get remapped
                    utils.copy_plus(os.path.realpath(data[vrn_key]), out_file)
                    vcfutils.bgzip_and_index(out_file, data["config"])
                    out["variants"][out_key].append(out_file)
    return [out]
Example #19
def summarize_vc(items):
    """CWL target: summarize variant calls and validation for multiple samples.
    """
    items = [utils.to_single_data(x) for x in utils.flatten(items)]
    items = [_normalize_vc_input(x) for x in items]
    items = validate.summarize_grading(items)
    items = [utils.to_single_data(x) for x in items]
    out = {"validate": validate.combine_validations(items),
           "variants": {"calls": [], "gvcf": [], "samples": []}}
    added = set([])
    variants_by_sample = collections.defaultdict(list)
    sample_order = []
    for data in items:
        batch_samples = data.get("batch_samples", [dd.get_sample_name(data)])
        for s in batch_samples:
            if s not in sample_order:
                sample_order.append(s)
        if data.get("vrn_file"):
            # Only get batches if we're actually doing variantcalling in bcbio
            # otherwise we'll be using the original files
            names = dd.get_batches(data) if dd.get_variantcaller(data) else None
            if not names:
                names = [dd.get_sample_name(data)]
            batch_name = names[0]
            if data.get("vrn_file_joint") is not None:
                to_add = [("vrn_file", "gvcf", dd.get_sample_name(data)),
                          ("vrn_file_joint", "calls", batch_name)]
            else:
                to_add = [("vrn_file", "calls", batch_name)]
            for vrn_key, out_key, name in to_add:
                cur_name = "%s-%s" % (name, dd.get_variantcaller(data))
                out_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                                                        "variants", out_key)),
                                        "%s.vcf.gz" % cur_name)
                for s in batch_samples:
                    variants_by_sample[s].append(out_file)
                if cur_name not in added:
                    added.add(cur_name)
                    # Ideally could symlink here but doesn't appear to work with
                    # Docker container runs on Toil where PATHs don't get remapped
                    utils.copy_plus(os.path.realpath(data[vrn_key]), out_file)
                    vcfutils.bgzip_and_index(out_file, data["config"])
                    out["variants"][out_key].append(out_file)
    for sample in sample_order:
        out["variants"]["samples"].append(variants_by_sample[sample])
    return [out]
Example #20
def _variant_checkpoints(samples):
    """Check sample configuration to identify required steps in analysis.
    """
    checkpoints = {}
    checkpoints["vc"] = any([dd.get_variantcaller(d) for d in samples])
    checkpoints["sv"] = any([dd.get_svcaller(d) for d in samples])
    checkpoints["jointvc"] = any([dd.get_jointcaller(d) or ("gvcf" in dd.get_tools_on(d)) for d in samples])
    checkpoints["hla"] = any([dd.get_hlacaller(d) for d in samples])
    return checkpoints
Example #22
def rnaseq_variant_calling(samples, run_parallel):
    """
    run RNA-seq variant calling using GATK
    """
    samples = run_parallel("run_rnaseq_variant_calling", samples)
    variantcaller = dd.get_variantcaller(to_single_data(samples[0]))
    if variantcaller and ("gatk-haplotype" in variantcaller):
        samples = joint.square_off(samples, run_parallel)
        samples = run_parallel("run_rnaseq_ann_filter", samples)
    return samples
Example #23
def _get_jvm_opts(data, out_file):
    """Retrieve JVM options when running the Java version of VarDict.
    """
    if dd.get_variantcaller(data).endswith("-java"):
        resources = config_utils.get_resources("vardict", data["config"])
        jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx4g"])
        jvm_opts += broad.get_default_jvm_opts(os.path.dirname(out_file))
        return "export VAR_DICT_OPTS='%s' && " % " ".join(jvm_opts)
    else:
        return ""
Example #24
def _default_conf_files(data, retriever):
    conf_files = []
    if dd.get_variantcaller(data) or dd.get_vrn_file(data):
        if annotate_gemini(data, retriever):
            conf_files.append("gemini")
        if _annotate_somatic(data, retriever):
            conf_files.append("somatic")
        if dd.get_analysis(data).lower().find("rna-seq") >= 0:
            conf_files.append("rnaedit")
    return conf_files
Example #25
def _get_jvm_opts(data, out_file):
    """Retrieve JVM options when running the Java version of VarDict.
    """
    if not dd.get_variantcaller(data).endswith("-perl"):
        resources = config_utils.get_resources("vardict", data["config"])
        jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx4g"])
        jvm_opts += broad.get_default_jvm_opts(os.path.dirname(out_file))
        return "export VAR_DICT_OPTS='%s' && " % " ".join(jvm_opts)
    else:
        return ""
Example #27
def get_somatic_variantcallers(items):
    """Retrieve all variant callers for somatic calling, handling somatic/germline.
    """
    out = []
    for data in items:
        vcs = dd.get_variantcaller(data)
        if isinstance(vcs, dict) and "somatic" in vcs:
            vcs = vcs["somatic"]
        if not isinstance(vcs, (list, tuple)):
            vcs = [vcs]
        out += vcs
    return set(out)
Example #29
def get_type(data):
    """Retrieve the type of effects calculation to do.
    """
    if data["analysis"].lower().startswith("var") or dd.get_variantcaller(data):
        etype = tz.get_in(("config", "algorithm", "effects"), data, "snpeff")
        if isinstance(etype, (list, tuple)):
            if len(etype) == 1:
                return etype[0]
            else:
                raise ValueError("Unexpected variant effect type for %s: %s" % (dd.get_sample_name(data), etype))
        else:
            return etype
Example #30
def run_rnaseq_ann_filter(data):
    """Run RNA-seq annotation and filtering.
    """
    data = to_single_data(data)
    if dd.get_vrn_file(data):
        ann_file = population.run_vcfanno(dd.get_vrn_file(data), data)
        if ann_file:
            data = dd.set_vrn_file(data, ann_file)
    variantcaller = dd.get_variantcaller(data)
    if variantcaller and ("gatk-haplotype" in variantcaller):
        filter_file = variation.gatk_filter_rnaseq(dd.get_vrn_file(data), data)
        data = dd.set_vrn_file(data, filter_file)
    return [[data]]
Example #31
def rnaseq_variant_calling(samples, run_parallel):
    """
    run RNA-seq variant calling using GATK
    """
    samples = run_parallel("run_rnaseq_variant_calling", samples)
    variantcaller = dd.get_variantcaller(to_single_data(samples[0]))
    if variantcaller and ("gatk-haplotype" in variantcaller):
        out = []
        for d in joint.square_off(samples, run_parallel):
            out.extend([[to_single_data(xs)] for xs in multi.split_variants_by_sample(to_single_data(d))])
        samples = out
        samples = run_parallel("run_rnaseq_ann_filter", samples)
    return samples
Example #32
def _run_vardict_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect SNPs and indels with VarDict.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, config)
            num_bams = len(align_bams)
            sample_vcf_names = []  # for individual sample names, given batch calling may be required
            for bamfile, item in itertools.izip(align_bams, items):
                # prepare commands
                vardict = dd.get_variantcaller(items[0])
                strandbias = "teststrandbias.R"
                var2vcf = "var2vcf_valid.pl"
                opts = " ".join(_vardict_options_from_config(items, config, out_file, region))
                vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
                coverage_interval = utils.get_in(config, ("algorithm", "coverage_interval"), "exome")
                # for deep targeted panels, require 50 worth of coverage
                var2vcf_opts = " -v 50 " if highdepth.get_median_coverage(items[0]) > 5000 else ""
                fix_ambig = vcfutils.fix_ambiguous_cl()
                sample = item["name"][1]
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                cmd = ("{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                       "-N {sample} -b {bamfile} {opts} "
                       "| {strandbias}"
                       "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
                       "| {fix_ambig} | {vcfstreamsort} {compress_cmd}")
                if num_bams > 1:
                    temp_file_prefix = out_file.replace(".gz", "").replace(".vcf", "") + item["name"][1]
                    tmp_out = temp_file_prefix + ".temp.vcf"
                    tmp_out += ".gz" if out_file.endswith("gz") else ""
                    sample_vcf_names.append(tmp_out)
                    with file_transaction(item, tmp_out) as tx_tmp_file:
                        cmd += " > {tx_tmp_file}"
                        do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
                else:
                    cmd += " > {tx_out_file}"
                    do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
            if num_bams > 1:
                # N.B. merge_variant_files wants region in 1-based end-inclusive
                # coordinates. Thus use bamprep.region_to_gatk
                vcfutils.merge_variant_files(orig_files=sample_vcf_names,
                                             out_file=tx_out_file, ref_file=ref_file,
                                             config=config, region=bamprep.region_to_gatk(region))
    return out_file
Example #33
def run_rnaseq_variant_calling(data):
    variantcaller = dd.get_variantcaller(data)
    if isinstance(variantcaller, list) and len(variantcaller) > 1:
        logger.error("Only one variantcaller can be run for RNA-seq at "
                     "this time. Post an issue here "
                     "(https://github.com/chapmanb/bcbio-nextgen/issues) "
                     "if this is something you need to do.")
        sys.exit(1)

    if variantcaller and "gatk" in variantcaller:
        data = variation.rnaseq_gatk_variant_calling(data)
    if vardict.get_vardict_command(data):
        data = variation.rnaseq_vardict_variant_calling(data)
    return [[data]]
Example #34
def get_type(data):
    """Retrieve the type of effects calculation to do.
    """
    if data["analysis"].lower().startswith("var") or dd.get_variantcaller(
            data):
        etype = tz.get_in(("config", "algorithm", "effects"), data, "snpeff")
        if isinstance(etype, (list, tuple)):
            if len(etype) == 1:
                return etype[0]
            else:
                raise ValueError("Unexpected variant effect type for %s: %s" %
                                 (dd.get_sample_name(data), etype))
        else:
            return etype
Example #35
def _needs_java(data):
    """Check if a caller needs external java for MuTect or older GATK 3.6.
    """
    vc = dd.get_variantcaller(data)
    if not isinstance(vc, (list, tuple)):
        vc = [vc]
    if "mutect" in vc:
        return True
    if "gatk" in vc or "gatk-haplotype" in vc:
        runner = broad.runner_from_config(data["config"])
        version = runner.get_gatk_version()
        if LooseVersion(version) < LooseVersion("3.6"):
            return True
    return False
Example #37
def run_rnaseq_joint_genotyping(*samples):
    data = samples[0][0]
    variantcaller = dd.get_variantcaller(data)
    ref_file = dd.get_ref_file(data)
    out_file = os.path.join(dd.get_work_dir(data, "."), "variation", "combined.vcf")
    if variantcaller and "gatk" in variantcaller:
        vrn_files = [dd.get_vrn_file(d) for d in dd.sample_data_iterator(samples)]
        out_file = variation.gatk_joint_calling(data, vrn_files, ref_file, out_file)
        updated_samples = []
        for data in dd.sample_data_iterator(samples):
            data = dd.set_square_vcf(data, out_file)
            updated_samples.append([data])
        return updated_samples
    return samples
Example #38
def _variant_checkpoints(samples):
    """Check sample configuration to identify required steps in analysis.
    """
    checkpoints = {}
    checkpoints["vc"] = any([dd.get_variantcaller(d) for d in samples])
    checkpoints["sv"] = any([dd.get_svcaller(d) for d in samples])
    checkpoints["jointvc"] = any([(dd.get_jointcaller(d) or ("gvcf" in dd.get_tools_on(d))) and dd.get_batch(d)
                                  for d in samples])
    checkpoints["hla"] = any([dd.get_hlacaller(d) for d in samples])
    checkpoints["align"] = any([(dd.get_aligner(d) or dd.get_bam_clean(d)) for d in samples])
    checkpoints["align_split"] = not all([(dd.get_align_split_size(d) is False or
                                           not dd.get_aligner(d))
                                          for d in samples])
    return checkpoints
Example #41
def batch_for_jointvc(items):
    batch_groups = collections.defaultdict(list)
    for data in [utils.to_single_data(x) for x in items]:
        vc = dd.get_variantcaller(data)
        if genotype.is_joint(data):
            batches = dd.get_batches(data) or dd.get_sample_name(data)
            if not isinstance(batches, (list, tuple)):
                batches = [batches]
        else:
            batches = [dd.get_sample_name(data)]
        for b in batches:
            data = utils.deepish_copy(data)
            data["vrn_file_gvcf"] = data["vrn_file"]
            batch_groups[(b, vc)].append(data)
    return batch_groups.values()
Example #42
def gatk_rnaseq_calling(data):
    """Use GATK to perform gVCF variant calling on RNA-seq data
    """
    data = utils.deepish_copy(data)
    tools_on = dd.get_tools_on(data)
    if not tools_on:
        tools_on = []
    tools_on.append("gvcf")
    data = dd.set_tools_on(data, tools_on)
    data = dd.set_jointcaller(data, ["%s-joint" % v for v in dd.get_variantcaller(data)])
    out_file = os.path.join(utils.safe_makedir(os.path.join("variation", "rnaseq", "gatk-haplotype")),
                            "%s-gatk-haplotype.vcf.gz" % dd.get_sample_name(data))
    out_file = gatk.haplotype_caller([dd.get_split_bam(data)], [data], dd.get_ref_file(data), {},
                                     out_file=out_file)
    return dd.set_vrn_file(data, out_file)
Example #43
def batch_for_jointvc(items):
    batch_groups = collections.defaultdict(list)
    for data in [utils.to_single_data(x) for x in items]:
        vc = dd.get_variantcaller(data)
        if genotype.is_joint(data):
            batches = dd.get_batches(data) or dd.get_sample_name(data)
            if not isinstance(batches, (list, tuple)):
                batches = [batches]
        else:
            batches = [dd.get_sample_name(data)]
        for b in batches:
            data = utils.deepish_copy(data)
            data["vrn_file_gvcf"] = data["vrn_file"]
            batch_groups[(b, vc)].append(data)
    return list(batch_groups.values())
Example #44
def _run_vardict_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect variants with Vardict.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = vcfutils.get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                ann_file = _run_vardict_caller(align_bams, items, ref_file,
                                               assoc_files, region, out_file)
                return ann_file
            vcffilter = config_utils.get_program("vcffilter", config)
            vardict = dd.get_variantcaller(items[0])
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            strandbias = "testsomatic.R"
            var2vcf = "var2vcf_paired.pl"
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
            # merge bed file regions as amplicon VarDict is only supported in single sample mode
            opts = " ".join(_vardict_options_from_config(items, config, out_file, region, do_merge=True))
            coverage_interval = utils.get_in(config, ("algorithm", "coverage_interval"), "exome")
            # for deep targeted panels, require 50 worth of coverage
            var2vcf_opts = " -v 50 " if highdepth.get_median_coverage(items[0]) > 5000 else ""
            fix_ambig = vcfutils.fix_ambiguous_cl()
            if any("vardict_somatic_filter" in tz.get_in(("config", "algorithm", "tools_off"), data, [])
                   for data in items):
                somatic_filter = ""
            else:
                somatic_filter = ("| %s -x 'bcbio.variation.freebayes.call_somatic(x)'" %
                                  os.path.join(os.path.dirname(sys.executable), "py"))
            jvm_opts = _get_jvm_opts(items[0], tx_out_file)
            cmd = ("{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                   "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
                   "| {strandbias} "
                   "| {var2vcf} -N \"{paired.tumor_name}|{paired.normal_name}\" -f {freq} {var2vcf_opts} "
                   "| bcftools filter -m '+' -s 'REJECT' -e 'STATUS !~ \".*Somatic\"' 2> /dev/null "
                   "| sed 's/\\\\.*Somatic\\\\/Somatic/' "
                   "| sed 's/REJECT,Description=\".*\">/REJECT,Description=\"Not Somatic via VarDict\">/' "
                   "{somatic_filter} | {fix_ambig} | {vcfstreamsort} {compress_cmd} > {tx_out_file}")
            bam.index(paired.tumor_bam, config)
            bam.index(paired.normal_bam, config)
            do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
    return out_file
Example #45
def get_active_vcinfo(data):
    """Use first caller if ensemble is not active
    """
    callers = dd.get_variantcaller(data)
    if not callers:
        return None
    if isinstance(callers, basestring):
        callers = [callers]
    active_vs = []
    if "variants" in data:
        for v in data["variants"]:
            if v.get("variantcaller") == "ensemble":
                return v
            if v.get("vrn_file"):
                active_vs.append(v)
        if len(active_vs) > 0:
            return active_vs[0]
Example #46
def get_vardict_command(data):
    """
    convert variantcaller specification to proper vardict command, handling
    string or list specification
    """
    vcaller = dd.get_variantcaller(data)
    if isinstance(vcaller, list):
        vardict = [x for x in vcaller if "vardict" in x]
        if not vardict:
            return None
        vardict = vardict[0]
    elif not vcaller:
        return None
    else:
        vardict = vcaller
    vardict = "vardict-java" if not vardict.endswith("-perl") else "vardict"
    return vardict
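The mapping in Example #46 is compact enough to check by hand. Below is a minimal standalone sketch of the same logic (hypothetical helper name, plain string/list specs instead of bcbio's data dictionary), with a few illustrative calls:

def vardict_command_for(vcaller):
    # Mirrors Example #46: pick the vardict entry from a list spec, then map a
    # "-perl" suffix to the Perl binary and everything else to vardict-java.
    if isinstance(vcaller, list):
        vardict = [x for x in vcaller if "vardict" in x]
        if not vardict:
            return None
        vardict = vardict[0]
    elif not vcaller:
        return None
    else:
        vardict = vcaller
    return "vardict" if vardict.endswith("-perl") else "vardict-java"

assert vardict_command_for("vardict") == "vardict-java"
assert vardict_command_for("vardict-perl") == "vardict"
assert vardict_command_for(["gatk-haplotype", "vardict-java"]) == "vardict-java"
assert vardict_command_for(["gatk-haplotype"]) is None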
Example #47
def _callable_from_gvcf(data, vrn_file, out_dir):
    """Retrieve callable regions based on ref call regions in gVCF.

    Uses https://github.com/lijiayong/gvcf_regions
    """
    methods = {"freebayes": "freebayes", "platypus": "platypus",
               "gatk-haplotype": "gatk"}
    gvcf_type = methods.get(dd.get_variantcaller(data))
    if gvcf_type:
        out_file = os.path.join(out_dir, "%s-gcvf-coverage.bed" %
                                utils.splitext_plus(os.path.basename(vrn_file))[0])
        if not utils.file_uptodate(out_file, vrn_file):
            with file_transaction(data, out_file) as tx_out_file:
                cmd = ("gvcf_regions.py --gvcf_type {gvcf_type} {vrn_file} "
                       "| bedtools merge > {tx_out_file}")
                do.run(cmd.format(**locals()), "Convert gVCF to BED file of callable regions")
        return out_file
Example #48
def _variant_checkpoints(samples):
    """Check sample configuration to identify required steps in analysis.
    """
    checkpoints = {}
    checkpoints["vc"] = any([dd.get_variantcaller(d) or d.get("vrn_file") for d in samples])
    checkpoints["sv"] = any([dd.get_svcaller(d) for d in samples])
    checkpoints["jointvc"] = any([(dd.get_jointcaller(d) or "gvcf" in dd.get_tools_on(d))
                                  for d in samples])
    checkpoints["hla"] = any([dd.get_hlacaller(d) for d in samples])
    checkpoints["align"] = any([(dd.get_aligner(d) or dd.get_bam_clean(d)) for d in samples])
    checkpoints["align_split"] = not all([(dd.get_align_split_size(d) is False or
                                           not dd.get_aligner(d))
                                          for d in samples])
    checkpoints["umi"] = any([dd.get_umi_consensus(d) for d in samples])
    checkpoints["ensemble"] = any([dd.get_ensemble(d) for d in samples])
    checkpoints["cancer"] = any(dd.get_phenotype(d) in ["tumor"] for d in samples)
    return checkpoints
Example #49
def _clean_flat_variantcaller(data):
    """Convert flattened dictionary from CWL representation into dictionary.

    CWL flattens somatic/germline tags into a set of strings, which we
    reconstitute as a dictionary.
    """
    vc = dd.get_variantcaller(data)
    if isinstance(vc, (list, tuple)) and all([x.count(":") == 1 for x in vc]):
        out = {}
        for v in vc:
            k, v = v.split(":")
            if k in out:
                out[k].append(v)
            else:
                out[k] = [v]
        data = dd.set_variantcaller(data, out)
    return data
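To make the CWL round-trip in Example #49 concrete, here is a minimal self-contained sketch of the same reconstitution on a hypothetical flattened list (plain values stand in for bcbio's data dictionary accessors):

def clean_flat_variantcaller(vc):
    # Rebuild a somatic/germline dict from CWL's flattened "key:value" strings;
    # anything not shaped as a uniform "key:value" list passes through unchanged.
    if isinstance(vc, (list, tuple)) and all(x.count(":") == 1 for x in vc):
        out = {}
        for item in vc:
            k, v = item.split(":")
            out.setdefault(k, []).append(v)
        return out
    return vc

flat = ["somatic:vardict", "somatic:mutect2", "germline:gatk-haplotype"]
print(clean_flat_variantcaller(flat))
# {'somatic': ['vardict', 'mutect2'], 'germline': ['gatk-haplotype']}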
Example #52
def _get_active_vcinfo(data):
    """Use first caller if ensemble is not active
    """
    callers = dd.get_variantcaller(data)
    if not callers:
        return None
    if isinstance(callers, basestring):
        callers = [callers]
    active_vs = []
    if "variants" in data:
        for v in data["variants"]:
            if v.get("variantcaller") == "ensemble":
                return v
            if v.get("vrn_file"):
                active_vs.append(v)
        if len(active_vs) > 0:
            return active_vs[0]
Example #54
def run_rnaseq_joint_genotyping(*samples):
    data = samples[0][0]
    variantcaller = dd.get_variantcaller(data)
    if not variantcaller:
        return samples
    if "gatk" not in variantcaller:
        return samples
    ref_file = dd.get_ref_file(data)
    if variantcaller and "gatk" in variantcaller:
        vrn_files = [dd.get_vrn_file(d) for d in dd.sample_data_iterator(samples)]
        out_file = variation.gatk_joint_calling(data, vrn_files, ref_file)
        vrn_file = vcfanno.run_vcfanno(out_file, ["rnaedit"], data)
        updated_samples = []
        for data in dd.sample_data_iterator(samples):
            data = dd.set_square_vcf(data, vrn_file)
            updated_samples.append([data])
        return updated_samples
    return samples
Example #55
def _variant_checkpoints(samples):
    """Check sample configuration to identify required steps in analysis.
    """
    checkpoints = {}
    checkpoints["vc"] = any([dd.get_variantcaller(d) or d.get("vrn_file") for d in samples])
    checkpoints["sv"] = any([dd.get_svcaller(d) for d in samples])
    checkpoints["jointvc"] = any([(dd.get_jointcaller(d) or "gvcf" in dd.get_tools_on(d))
                                  for d in samples])
    checkpoints["hla"] = any([dd.get_hlacaller(d) for d in samples])
    checkpoints["align"] = any([(dd.get_aligner(d) or dd.get_bam_clean(d)) for d in samples])
    checkpoints["align_split"] = not all([(dd.get_align_split_size(d) is False or
                                           not dd.get_aligner(d))
                                          for d in samples])
    checkpoints["archive"] = any([dd.get_archive(d) for d in samples])
    checkpoints["umi"] = any([dd.get_umi_consensus(d) for d in samples])
    checkpoints["ensemble"] = any([dd.get_ensemble(d) for d in samples])
    checkpoints["cancer"] = any(dd.get_phenotype(d) in ["tumor"] for d in samples)
    return checkpoints
Example #56
def parallel_prep_region(samples, run_parallel):
    """Perform full pre-variant calling BAM prep work on regions.
    """
    file_key = "work_bam"
    split_fn = _split_by_regions("bamprep", "-prep.bam", file_key)
    # identify samples that do not need preparation -- no recalibration or realignment
    extras = []
    torun = []
    for data in [x[0] for x in samples]:
        if data.get("work_bam"):
            data["align_bam"] = data["work_bam"]
        if (not dd.get_recalibrate(data) and not dd.get_realign(data) and not dd.get_variantcaller(data)):
            extras.append([data])
        elif not data.get(file_key):
            extras.append([data])
        else:
            torun.append([data])
    return extras + parallel_split_combine(torun, split_fn, run_parallel,
                                           "piped_bamprep", _add_combine_info, file_key, ["config"])
Example #57
def _get_variant_callers(data):
    """Use first caller if ensemble is not active"""
    callers = dd.get_variantcaller(data)
    if not callers:
        return None
    if isinstance(callers, basestring):
        callers = [callers]
    active_callers = [c.get("variantcaller") for c in data.get("variants", [{}])]
    active_vcf = [c.get("vrn_file") for c in data.get("variants", [{}])]
    active_germline = [c.get("germline") for c in data.get("variants", [{}])]
    vcf = dict(zip(active_callers, active_vcf))
    germline = dict(zip(active_callers, active_germline))
    if "ensemble" in active_callers:
        vcf_fn = vcf["ensemble"]
    else:
        vcf_fn = vcf[callers[0]]
    if not vcf_fn:
        vcf_fn = germline[callers[0]]
    return vcf_fn
Example #58
def run_rnaseq_variant_calling(data):
    """
    run RNA-seq variant calling, variation file is stored in `vrn_file`
    in the datadict
    """
    variantcaller = dd.get_variantcaller(data)
    if isinstance(variantcaller, list) and len(variantcaller) > 1:
        logger.error("Only one variantcaller can be run for RNA-seq at "
                     "this time. Post an issue here "
                     "(https://github.com/bcbio/bcbio-nextgen/issues) "
                     "if this is something you need to do.")
        sys.exit(1)

    if variantcaller:
        if "gatk-haplotype" in variantcaller:
            data = variation.rnaseq_gatk_variant_calling(data)
        if vardict.get_vardict_command(data):
            data = variation.rnaseq_vardict_variant_calling(data)
        vrn_file = dd.get_vrn_file(data)
    return [[data]]
Example #59
def run_rnaseq_ann_filter(data):
    """Run RNA-seq annotation and filtering.
    """
    data = to_single_data(data)
    if dd.get_vrn_file(data):
        eff_file = effects.add_to_vcf(dd.get_vrn_file(data), data)[0]
        if eff_file:
            data = dd.set_vrn_file(data, eff_file)
        ann_file = population.run_vcfanno(dd.get_vrn_file(data), data)
        if ann_file:
            data = dd.set_vrn_file(data, ann_file)
    variantcaller = dd.get_variantcaller(data)
    if variantcaller and ("gatk-haplotype" in variantcaller):
        filter_file = variation.gatk_filter_rnaseq(dd.get_vrn_file(data), data)
        data = dd.set_vrn_file(data, filter_file)
    # remove variants close to splice junctions
    vrn_file = dd.get_vrn_file(data)
    vrn_file = variation.filter_junction_variants(vrn_file, data)
    data = dd.set_vrn_file(data, vrn_file)
    return [[data]]