コード例 #1
0
ファイル: genotype.py プロジェクト: biocyberman/bcbio-nextgen
def batch_for_variantcall(samples):
    """Group samples into per-batch, per-caller sets for parallel variant calling.

    CWL input target. Every sample is placed into one group for each
    (batch name, variant caller) pair it belongs to; a sample without batches
    is keyed by its sample name instead.

    When any member of a group is flagged by `is_joint` (joint calling, for
    example with `tools_on: [gvcf]`), the group is split so each sample is
    processed individually rather than as a combined batch.

    Args:
        samples: input samples, handed to `_dup_samples_by_variantcaller`.

    Returns:
        A list of sample groups (each a list of sample records), followed by
        the `extras` returned from `_dup_samples_by_variantcaller`.
    """
    to_process, extras = _dup_samples_by_variantcaller(samples, require_bam=False)
    singles = [utils.to_single_data(x) for x in to_process]
    grouped = collections.defaultdict(list)
    for rec in cwlutils.samples_to_records(singles):
        caller = get_variantcaller(rec, require_bam=False)
        names = dd.get_batches(rec) or dd.get_sample_name(rec)
        if not isinstance(names, (list, tuple)):
            names = [names]
        for name in names:
            # Each group gets its own copy so later edits stay independent
            grouped[(name, caller)].append(utils.deepish_copy(rec))
    out = []
    for group in grouped.values():
        if any([is_joint(d) for d in group]):
            # Joint calling: one single-sample group per individual
            out.extend([d] for d in group)
        else:
            out.append(group)
    return out + extras
コード例 #2
0
def summarize_vc(items):
    """CWL target: summarize variant calls and validation for multiple samples.

    Copies each sample's variant file into `<work_dir>/variants/calls` (or
    `variants/gvcf` for per-sample files when a joint call is also present),
    named after the batch (or sample) and variant caller, then bgzips and
    indexes the copy. Each output name is written only once.

    Args:
        items: sample inputs; passed through `validate.summarize_grading`.

    Returns:
        A single-item list holding a dictionary with the first sample's
        "validate" entry and the collected "variants" output files.
    """
    items = [utils.to_single_data(x) for x in validate.summarize_grading(items)]
    out = {"validate": items[0]["validate"],
           "variants": {"calls": [], "gvcf": []}}
    seen = set()
    for data in items:
        if not data.get("vrn_file"):
            continue
        names = dd.get_batches(data) or [dd.get_sample_name(data)]
        batch_name = names[0]
        if data.get("vrn_file_joint") is None:
            targets = [("vrn_file", "calls", batch_name)]
        else:
            targets = [("vrn_file", "gvcf", dd.get_sample_name(data)),
                       ("vrn_file_joint", "calls", batch_name)]
        for vrn_key, out_key, name in targets:
            cur_name = "%s-%s" % (name, dd.get_variantcaller(data))
            if cur_name in seen:
                continue
            out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "variants", out_key))
            out_file = os.path.join(out_dir, "%s.vcf.gz" % cur_name)
            seen.add(cur_name)
            # Ideally could symlink here but doesn't appear to work with
            # Docker container runs on Toil where PATHs don't get remapped
            utils.copy_plus(os.path.realpath(data[vrn_key]), out_file)
            vcfutils.bgzip_and_index(out_file, data["config"])
            out["variants"][out_key].append(out_file)
    return [out]
コード例 #3
0
def _check_for_problem_somatic_batches(items, config):
    """Identify problem batch setups for somatic calling.

    We do not support multiple tumors in a single batch and VarDict(Java) does not
    handle pooled calling, only tumor/normal.
    """
    to_check = []
    for data in items:
        data = copy.deepcopy(data)
        data["config"] = config_utils.update_w_custom(config, data)
        to_check.append(data)
    data_by_batches = collections.defaultdict(list)
    for data in to_check:
        batches = dd.get_batches(data)
        if batches:
            for batch in batches:
                data_by_batches[batch].append(data)
    for batch, items in data_by_batches.items():
        if vcfutils.get_paired(items):
            vcfutils.check_paired_problems(items)
        elif len(items) > 1:
            vcs = list(
                set(
                    tz.concat(
                        [dd.get_variantcaller(data) or [] for data in items])))
            if any(x.lower().startswith("vardict") for x in vcs):
                raise ValueError(
                    "VarDict does not support pooled non-tumor/normal calling, in batch %s: %s"
                    % (batch, [dd.get_sample_name(data) for data in items]))
コード例 #4
0
ファイル: run_info.py プロジェクト: screx/bcbio-nextgen
def _check_for_problem_somatic_batches(items, config):
    """Identify problem batch setups for somatic calling.

    We do not support multiple tumors in a single batch and VarDict(Java) does not
    handle pooled calling, only tumor/normal.
    """
    to_check = []
    for data in items:
        data = copy.deepcopy(data)
        data["config"] = config_utils.update_w_custom(config, data)
        to_check.append(data)
    data_by_batches = collections.defaultdict(list)
    for data in to_check:
        batches = dd.get_batches(data)
        if batches:
            for batch in batches:
                data_by_batches[batch].append(data)
    for batch, items in data_by_batches.items():
        if vcfutils.get_paired(items):
            vcfutils.check_paired_problems(items)
        elif len(items) > 1:
            vcs = vcfutils.get_somatic_variantcallers(items)
            if "vardict" in vcs:
                raise ValueError(
                    "VarDict does not support pooled non-tumor/normal calling, in batch %s: %s"
                    % (batch, [dd.get_sample_name(data) for data in items]))
            elif "mutect" in vcs or "mutect2" in vcs:
                raise ValueError(
                    "MuTect and MuTect2 require a 'phenotype: tumor' sample for calling, "
                    "in batch %s: %s" %
                    (batch, [dd.get_sample_name(data) for data in items]))
コード例 #5
0
ファイル: run_info.py プロジェクト: simexin/bcbio-nextgen
def _check_for_problem_somatic_batches(items, config):
    """Identify problem batch setups for somatic calling.

    We do not support multiple tumors in a single batch and VarDict(Java) does not
    handle pooled calling, only tumor/normal.
    """
    to_check = []
    for data in items:
        data = copy.deepcopy(data)
        data["config"] = config_utils.update_w_custom(config, data)
        to_check.append(data)
    data_by_batches = collections.defaultdict(list)
    for data in to_check:
        batches = dd.get_batches(data)
        if batches:
            for batch in batches:
                data_by_batches[batch].append(data)
    for batch, items in data_by_batches.items():
        if vcfutils.get_paired(items):
            vcfutils.check_paired_problems(items)
        elif len(items) > 1:
            vcs = list(set(tz.concat([dd.get_variantcaller(data) or [] for data in items])))
            if any(x.lower().startswith("vardict") for x in vcs):
                raise ValueError("VarDict does not support pooled non-tumor/normal calling, in batch %s: %s"
                                 % (batch, [dd.get_sample_name(data) for data in items]))
            elif any(x.lower() == "mutect" for x in vcs):
                raise ValueError("Mutect requires a 'phenotype: tumor' sample for calling, in batch %s: %s"
                                 % (batch, [dd.get_sample_name(data) for data in items]))
コード例 #6
0
ファイル: genotype.py プロジェクト: mutual-ai/bcbio-nextgen
def batch_for_variantcall(samples):
    """Group samples into per-batch, per-caller sets for parallel variant calling.

    CWL input target. Before grouping, every sample is given the union of the
    `cwl_keys` seen across all samples: keys missing from a sample are set to
    None and recorded in its `cwl_keys`. The `tools_on` and `tools_off`
    algorithm settings are always coerced to lists.

    Args:
        samples: input samples, handed to `_dup_samples_by_variantcaller`.

    Returns:
        A list of sample groups, one per (batch or sample name, variant
        caller) pair, followed by the `extras` returned from
        `_dup_samples_by_variantcaller`.
    """
    list_keys = {"config__algorithm__tools_on", "config__algorithm__tools_off"}
    to_process, extras = _dup_samples_by_variantcaller(samples, require_bam=False)
    singles = [utils.to_single_data(x) for x in to_process]
    known_keys = set()
    for rec in singles:
        known_keys.update(set(rec["cwl_keys"]))
    ordered_keys = sorted(known_keys)
    grouped = collections.defaultdict(list)
    for rec in singles:
        for raw_key in ordered_keys:
            # CWL keys are flattened with "__"; split back into a nested path
            path = raw_key.split("__")
            if tz.get_in(path, rec) is None:
                rec = tz.update_in(rec, path, lambda _: None)
                rec["cwl_keys"].append(raw_key)
            if raw_key in list_keys:
                current = tz.get_in(path, rec)
                if not current:
                    current = []
                elif not isinstance(current, (list, tuple)):
                    current = [current]
                rec = tz.update_in(rec, path, lambda _: current)
        caller = get_variantcaller(rec, require_bam=False)
        names = dd.get_batches(rec) or dd.get_sample_name(rec)
        if not isinstance(names, (list, tuple)):
            names = [names]
        for name in names:
            grouped[(name, caller)].append(utils.deepish_copy(rec))
    return list(grouped.values()) + extras
コード例 #7
0
ファイル: cwlutils.py プロジェクト: chapmanb/bcbio-nextgen
def _get_vcf_samples(calls, items):
    """Identify the samples in `items` that the supplied call files belong to.

    A file is attributed to a sample when its basename starts with the sample
    name, or one of the sample's batch names, followed by "-" or ".". For
    samples with a "normal" phenotype the tested name carries a "-germline"
    suffix. Once any file has matched a sample name directly, later
    batch-name matches are ignored.

    Args:
        calls: possibly nested collection of call file paths.
        items: sample records to match against the file names.

    Returns:
        List of matching sample names, in set (unspecified) order.
    """
    # Reading sample names from the VCF itself is switched off; only file
    # names are inspected.
    have_full_file = False
    all_samples = set([])
    sample_matches = False
    for f in utils.flatten(calls):
        if have_full_file:
            cur = set(vcfutils.get_samples(f))
            if cur:
                if not all_samples:
                    all_samples = cur
                else:
                    all_samples &= set(cur)
        else:
            for data in items:
                # Samples without batches contribute only their own name
                test_names = [dd.get_sample_name(data)] + list(dd.get_batches(data) or [])
                for i, test_name in enumerate(test_names):
                    # For tumor/normal batches, want to attach germline VCFs to normals
                    # Standard somatics go to tumors
                    if dd.get_phenotype(data) == "normal":
                        test_name += "-germline"
                    if os.path.basename(f).startswith(("%s-" % test_name,
                                                       "%s." % test_name)):
                        # Prefer matches to single samples (gVCF) over joint batches
                        if i == 0:
                            sample_matches = True
                        if sample_matches and i > 0:
                            continue
                        else:
                            all_samples.add(dd.get_sample_name(data))
    return list(all_samples)
コード例 #8
0
ファイル: __init__.py プロジェクト: chapmanb/bcbio-nextgen
def get_variants(data, include_germline=False):
    """Retrieve set of variant calls to use for heterogeneity analysis.

    Works on a copy of `data`. CWL-style input (a "variants" dictionary with a
    "samples" list of files) is first converted into a list of
    {"vrn_file", "variantcaller"} entries, deriving the caller from the file
    name by removing the extension and the sample/batch name prefixes.

    Args:
        data: sample record.
        include_germline: when True, also accept "gatk-haplotype" calls.

    Returns:
        Variant entries from supported callers that have a "vrn_file",
        ordered by caller priority (position in the supported list).
    """
    data = utils.deepish_copy(data)
    supported = ["precalled", "vardict", "vardict-java", "vardict-perl",
                 "freebayes", "octopus", "strelka2"]
    # Right now mutect2 and mutect do not provide heterozygous germline calls
    # to be useful https://github.com/bcbio/bcbio-nextgen/issues/2464
    # supported += ["mutect2", "mutect"]
    if include_germline:
        supported.insert(1, "gatk-haplotype")
    out = []
    # CWL based input
    if isinstance(data.get("variants"), dict) and "samples" in data["variants"]:
        cur_vs = []
        # Unpack single sample list of files
        if (isinstance(data["variants"]["samples"], (list, tuple)) and
              len(data["variants"]["samples"]) == 1 and isinstance(data["variants"]["samples"][0], (list, tuple))):
            data["variants"]["samples"] = data["variants"]["samples"][0]
        for fname in data["variants"]["samples"]:
            variantcaller = utils.splitext_plus(os.path.basename(fname))[0]
            variantcaller = variantcaller.replace(dd.get_sample_name(data) + "-", "")
            # Samples without batches have no batch prefixes to strip
            for batch in dd.get_batches(data) or []:
                variantcaller = variantcaller.replace(batch + "-", "")
            cur_vs.append({"vrn_file": fname, "variantcaller": variantcaller})
        data["variants"] = cur_vs
    for v in data.get("variants", []):
        if v["variantcaller"] in supported and v.get("vrn_file"):
            out.append((supported.index(v["variantcaller"]), v))
    # Sort on priority only. Comparing whole tuples falls through to comparing
    # the dictionaries when two calls share a caller, which fails on Python 3.
    out.sort(key=lambda x: x[0])
    return [xs[1] for xs in out]
コード例 #9
0
def batch(samples):
    """CWL: batch together per sample, joint and germline calls for ensemble combination.

    Sets up groups of same sample/batch variant calls for ensemble calling, as
    long as we have more than one caller per group.

    Args:
        samples: sample inputs, each converted with `utils.to_single_data`.

    Returns:
        One record per (batch samples, phenotype) group with more than one
        entry, copied from the first member and extended with "batch_id",
        "batch_samples" and the group's "variants" callers and call files.
        Records are ordered by the earliest position of their samples in the
        input.
    """
    samples = [utils.to_single_data(x) for x in samples]
    sample_order = [dd.get_sample_name(x) for x in samples]
    batch_groups = collections.defaultdict(list)
    for data in samples:
        batch_samples = tuple(data.get("batch_samples", [dd.get_sample_name(data)]))
        batch_groups[(batch_samples, dd.get_phenotype(data))].append(data)

    out = []
    for (batch_samples, phenotype), gsamples in batch_groups.items():
        if len(gsamples) > 1:
            batches = set([])
            for d in gsamples:
                # Unbatched samples add nothing; batch_id then falls back to
                # the joined sample names below
                batches |= set(dd.get_batches(d) or [])
            cur = copy.deepcopy(gsamples[0])
            cur.update({"batch_id": sorted(batches)[0] if batches else "_".join(batch_samples),
                        "batch_samples": batch_samples,
                        "variants": {"variantcallers": [dd.get_variantcaller(d) for d in gsamples],
                                     "calls": [d.get("vrn_file") for d in gsamples]}})
            out.append(cur)

    def by_original_order(d):
        return min([sample_order.index(s) for s in d["batch_samples"] if s in sample_order])
    return sorted(out, key=by_original_order)
コード例 #10
0
ファイル: __init__.py プロジェクト: chapmanb/bcbio-nextgen
def _useful_basename(data):
    """Build an output file basename from the batch (or sample) and SV caller.

    Uses the first batch name when the sample has batches, otherwise the
    sample name, joined to `data["sv"]["variantcaller"]` with a dash.
    """
    batches = dd.get_batches(data)
    prefix = batches[0] if batches else dd.get_sample_name(data)
    return "%s-%s" % (prefix, data["sv"]["variantcaller"])
コード例 #11
0
ファイル: __init__.py プロジェクト: vallurumk/bcbio-nextgen
def _useful_basename(data):
    """Return "<batch-or-sample>-<caller>" for naming structural variant outputs.

    The first batch name is preferred; samples without batches fall back to
    their sample name. The caller comes from `data["sv"]["variantcaller"]`.
    """
    candidates = dd.get_batches(data) or [dd.get_sample_name(data)]
    caller = data["sv"]["variantcaller"]
    return "%s-%s" % (candidates[0], caller)
コード例 #12
0
ファイル: __init__.py プロジェクト: chapmanb/bcbio-nextgen
def _clean_name(fname, data):
    """Remove standard prefixes from a filename before renaming with useful names.

    Strips, in order, each batch name, the sample name and the structural
    variant caller name from the start of `fname`, together with a following
    "-" or "_" separator when present.

    Args:
        fname: file name to clean.
        data: sample record supplying the batch, sample and caller names.

    Returns:
        The file name with the matching leading prefixes removed.
    """
    # Samples without batches only have the sample and caller prefixes
    prefixes = list(dd.get_batches(data) or []) + [dd.get_sample_name(data), data["sv"]["variantcaller"]]
    for to_remove in prefixes:
        for ext in ("-", "_"):
            if fname.startswith("%s%s" % (to_remove, ext)):
                fname = fname[len(to_remove) + len(ext):]
        if fname.startswith(to_remove):
            fname = fname[len(to_remove):]
    return fname
コード例 #13
0
ファイル: __init__.py プロジェクト: zhangyupisa/bcbio-nextgen
def _clean_name(fname, data):
    """Remove standard prefixes from a filename before renaming with useful names.

    Strips, in order, each batch name, the sample name and the structural
    variant caller name from the start of `fname`, together with a following
    "-" or "_" separator when present.

    Args:
        fname: file name to clean.
        data: sample record supplying the batch, sample and caller names.

    Returns:
        The file name with the matching leading prefixes removed.
    """
    # Samples without batches only have the sample and caller prefixes
    prefixes = list(dd.get_batches(data) or []) + [dd.get_sample_name(data), data["sv"]["variantcaller"]]
    for to_remove in prefixes:
        for ext in ("-", "_"):
            if fname.startswith("%s%s" % (to_remove, ext)):
                fname = fname[len(to_remove) + len(ext):]
        if fname.startswith(to_remove):
            fname = fname[len(to_remove):]
    return fname
コード例 #14
0
ファイル: peaks.py プロジェクト: chapmanb/bcbio-nextgen
def _check(sample, data):
    """Attach the matching input BAM to a ChIP sample.

    ATAC samples are returned unchanged and samples with an "input" phenotype
    yield None. For anything else, the first entry in `data` that shares the
    sample's batch and has an "input" phenotype provides `work_bam_input`.

    Returns:
        `[sample]`, or None when the sample is itself an input.
    """
    if dd.get_chip_method(sample).lower() == "atac":
        return [sample]
    if dd.get_phenotype(sample) == "input":
        return None
    for origin in data:
        control = origin[0]
        same_batch = dd.get_batch(sample) in (dd.get_batches(control) or [])
        if same_batch and dd.get_phenotype(control) == "input":
            sample["work_bam_input"] = control.get("work_bam")
            break
    return [sample]
コード例 #15
0
ファイル: genotype.py プロジェクト: mutual-ai/bcbio-nextgen
def _get_batch_name(items):
    """Retrieve the shared batch name for a group of items.
    """
    batch_names = collections.defaultdict(int)
    for data in items:
        batches = dd.get_batches(data) or dd.get_sample_name(data)
        if not isinstance(batches, (list, tuple)):
            batches = [batches]
        for b in batches:
            batch_names[b] += 1
    return sorted(batch_names.items(), key=lambda x: x[-1], reverse=True)[0][0]
コード例 #16
0
ファイル: variation.py プロジェクト: NAMPHER/bcbio-nextgen
def summarize_vc(items):
    """CWL target: summarize variant calls and validation for multiple samples.

    Copies each sample's variant file into `<work_dir>/variants/calls` (or
    `variants/gvcf` for per-sample files when a joint call is also present),
    named after the batch (or sample) and variant caller, then bgzips and
    indexes the copy. Each output name is written only once, but is recorded
    for every sample in the record's batch.

    Args:
        items: possibly nested sample inputs.

    Returns:
        A single-item list holding a dictionary with the combined "validate"
        results and the "variants" outputs: "calls", "gvcf" and "samples"
        (one list of files per sample, in first-seen sample order).
    """
    flat = [utils.to_single_data(x) for x in utils.flatten(items)]
    normalized = [_normalize_vc_input(x) for x in flat]
    items = [utils.to_single_data(x) for x in validate.summarize_grading(normalized)]
    variants = {"calls": [], "gvcf": [], "samples": []}
    out = {"validate": validate.combine_validations(items), "variants": variants}
    seen = set()
    by_sample = collections.defaultdict(list)
    sample_order = []
    for data in items:
        batch_samples = data.get("batch_samples", [dd.get_sample_name(data)])
        for s in batch_samples:
            if s not in sample_order:
                sample_order.append(s)
        if not data.get("vrn_file"):
            continue
        # Only get batches if we're actually doing variantcalling in bcbio
        # otherwise we'll be using the original files
        names = None
        if dd.get_variantcaller(data):
            names = dd.get_batches(data)
        if not names:
            names = [dd.get_sample_name(data)]
        batch_name = names[0]
        if data.get("vrn_file_joint") is None:
            to_add = [("vrn_file", "calls", batch_name)]
        else:
            to_add = [("vrn_file", "gvcf", dd.get_sample_name(data)),
                      ("vrn_file_joint", "calls", batch_name)]
        for vrn_key, out_key, name in to_add:
            cur_name = "%s-%s" % (name, dd.get_variantcaller(data))
            out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "variants", out_key))
            out_file = os.path.join(out_dir, "%s.vcf.gz" % cur_name)
            for s in batch_samples:
                by_sample[s].append(out_file)
            if cur_name in seen:
                continue
            seen.add(cur_name)
            # Ideally could symlink here but doesn't appear to work with
            # Docker container runs on Toil where PATHs don't get remapped
            utils.copy_plus(os.path.realpath(data[vrn_key]), out_file)
            vcfutils.bgzip_and_index(out_file, data["config"])
            variants[out_key].append(out_file)
    for sample in sample_order:
        variants["samples"].append(by_sample[sample])
    return [out]
コード例 #17
0
def _get_batch_name(items):
    """Retrieve the shared batch name for a group of items.
    """
    batch_names = collections.defaultdict(int)
    for data in items:
        batches = dd.get_batches(data) or dd.get_sample_name(data)
        if not isinstance(batches, (list, tuple)):
            batches = [batches]
        for b in batches:
            batch_names[b] += 1
    return sorted(batch_names.items(), key=lambda x: x[-1], reverse=True)[0][0]
コード例 #18
0
ファイル: peaks.py プロジェクト: yangzixu/bcbio-nextgen
def _check(sample, data):
    """Attach the matching input BAM to a ChIP sample.

    ATAC samples are returned unchanged and samples with an "input" phenotype
    yield None. For anything else, the first entry in `data` that shares the
    sample's batch and has an "input" phenotype provides `work_bam_input`.

    Returns:
        `[sample]`, or None when the sample is itself an input.
    """
    if dd.get_chip_method(sample).lower() == "atac":
        return [sample]
    if dd.get_phenotype(sample) == "input":
        return None
    for origin in data:
        candidate = origin[0]
        shares_batch = dd.get_batch(sample) in (dd.get_batches(candidate) or [])
        if shares_batch and dd.get_phenotype(candidate) == "input":
            sample["work_bam_input"] = candidate.get("work_bam")
            break
    return [sample]
コード例 #19
0
ファイル: __init__.py プロジェクト: DoaneAS/bcbio-nextgen
def _add_batch(x, sample):
    """Tag an upload file with the batch its file name refers to.

    Batch names are tried longest first; the first one whose name plus "-"
    prefixes the file's basename is stored in `x["batch"]`. When none
    matches, the sample name is stored instead.

    Returns:
        The same `x` dictionary, updated in place.
    """
    for batch in sorted(dd.get_batches(sample) or [], key=len, reverse=True):
        if batch and os.path.basename(x["path"]).startswith("%s-" % batch):
            x["batch"] = batch
            break
    else:
        # No batch prefix found on the file name
        x["batch"] = dd.get_sample_name(sample)
    return x
コード例 #20
0
ファイル: __init__.py プロジェクト: gberriz/bcbio-nextgen
def _add_batch(x, sample):
    """Tag an upload file with the batch its file name refers to.

    Batch names are tried longest first; the first one whose name plus "-"
    prefixes the file's basename is stored in `x["batch"]`. When none
    matches, the sample name is stored instead.

    Returns:
        The same `x` dictionary, updated in place.
    """
    longest_first = sorted(dd.get_batches(sample) or [], key=len, reverse=True)
    for batch in longest_first:
        if batch and os.path.basename(x["path"]).startswith("%s-" % batch):
            x["batch"] = batch
            break
    else:
        # No batch prefix found on the file name
        x["batch"] = dd.get_sample_name(sample)
    return x
コード例 #21
0
ファイル: germline.py プロジェクト: aiminy/bcbio-nextgen
def extract(data, items):
    """Add a germline call file for samples with a paired phenotype.

    Batched samples processed with more than one item go through
    `_extract_germline`; all others go through `_remove_prioritization`. The
    result is bgzipped, indexed and stored under
    `data["vrn_file_plus"]["germline"]`.

    Returns:
        `data`, unchanged when `vcfutils.get_paired_phenotype` is falsy.
    """
    if not vcfutils.get_paired_phenotype(data):
        return data
    if dd.get_batches(data) and len(items) > 1:
        germline_vcf = _extract_germline(data["vrn_file"], data)
    else:
        germline_vcf = _remove_prioritization(data["vrn_file"], data)
    indexed_vcf = vcfutils.bgzip_and_index(germline_vcf, data["config"])
    data["vrn_file_plus"] = {"germline": indexed_vcf}
    return data
コード例 #22
0
ファイル: germline.py プロジェクト: druvus/bcbio-nextgen
def extract(data, items):
    """Add a germline call file for samples with a paired phenotype.

    Batched samples processed with more than one item go through
    `_extract_germline`; all others go through `_remove_prioritization`. The
    result is bgzipped, indexed and stored under
    `data["vrn_file_plus"]["germline"]`.

    Returns:
        `data`, unchanged when `vcfutils.get_paired_phenotype` is falsy.
    """
    if not vcfutils.get_paired_phenotype(data):
        return data
    paired_batch = dd.get_batches(data) and len(items) > 1
    if paired_batch:
        raw_vcf = _extract_germline(data["vrn_file"], data)
    else:
        raw_vcf = _remove_prioritization(data["vrn_file"], data)
    data["vrn_file_plus"] = {"germline": vcfutils.bgzip_and_index(raw_vcf, data["config"])}
    return data
コード例 #23
0
def summarize_sv(items):
    """CWL target: summarize structural variants for multiple samples.

    VCF outputs from each caller are copied into `<work_dir>/sv/calls`, named
    after the batch (or sample) and caller, then bgzipped and indexed; each
    name is written once. Prioritization is then run per sample group and its
    "sv-prioritize" outputs collected.

    XXX Need to support non-VCF output as tabix indexed output

    Args:
        items: sample inputs; passed through `vcvalidate.summarize_grading`.

    Returns:
        A single-item list holding a dictionary with the "sv" calls and
        prioritization files plus the combined "svvalidate" results.
    """
    graded = vcvalidate.summarize_grading(items, "svvalidate")
    items = [utils.to_single_data(x) for x in graded]
    sv_out = {"calls": [], "prioritize": {"tsv": [], "raw": []}}
    out = {"sv": sv_out,
           "svvalidate": vcvalidate.combine_validations(items, "svvalidate")}
    seen = set()
    # Standard callers
    for data in items:
        if not data.get("sv"):
            continue
        names = dd.get_batches(data) or [dd.get_sample_name(data)]
        cur_name = "%s-%s" % (names[0], data["sv"]["variantcaller"])
        if not data["sv"].get("vrn_file"):
            continue
        ext = utils.splitext_plus(data["sv"]["vrn_file"])[-1]
        if cur_name in seen or not ext.startswith(".vcf"):
            continue
        seen.add(cur_name)
        calls_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "sv", "calls"))
        out_file = os.path.join(calls_dir, "%s%s" % (cur_name, ext))
        utils.copy_plus(data["sv"]["vrn_file"], out_file)
        out_file = vcfutils.bgzip_and_index(out_file, data["config"])
        sv_out["calls"].append(out_file)
    # prioritization
    for pdata in _group_by_sample(items):
        prioritized = prioritize.run([utils.deepish_copy(pdata)])[0]
        prioritysv = [x for x in prioritized.get("sv", [])
                      if x["variantcaller"] == "sv-prioritize"]
        if prioritysv:
            first = prioritysv[0]
            sv_out["prioritize"]["tsv"].append(first["vrn_file"])
            sv_out["prioritize"]["raw"].extend(first["raw_files"].values())
    return [out]
コード例 #24
0
ファイル: genotype.py プロジェクト: biocyberman/bcbio-nextgen
def _get_batch_name(items, skip_jointcheck=False):
    """Retrieve the shared batch name for a group of items.
    """
    batch_names = collections.defaultdict(int)
    has_joint = any([is_joint(d) for d in items])
    for data in items:
        if has_joint and not skip_jointcheck:
            batches = dd.get_sample_name(data)
        else:
            batches = dd.get_batches(data) or dd.get_sample_name(data)
        if not isinstance(batches, (list, tuple)):
            batches = [batches]
        for b in batches:
            batch_names[b] += 1
    return sorted(batch_names.items(), key=lambda x: x[-1], reverse=True)[0][0]
コード例 #25
0
ファイル: joint.py プロジェクト: thekingofall/bcbio-nextgen
def batch_for_jointvc(items):
    """Group per-sample calls by batch and variant caller for joint calling.

    Samples flagged by `genotype.is_joint` are grouped under each of their
    batch names (or the sample name when unbatched); all other samples form
    their own group under the sample name. Every grouped record is a copy
    with `vrn_file_gvcf` set to its `vrn_file`.

    Returns:
        A list of sample groups, one per (name, variant caller) pair.
    """
    grouped = collections.defaultdict(list)
    for rec in [utils.to_single_data(x) for x in items]:
        caller = dd.get_variantcaller(rec)
        if genotype.is_joint(rec):
            names = dd.get_batches(rec) or dd.get_sample_name(rec)
            if not isinstance(names, (list, tuple)):
                names = [names]
        else:
            names = [dd.get_sample_name(rec)]
        for name in names:
            rec = utils.deepish_copy(rec)
            rec["vrn_file_gvcf"] = rec["vrn_file"]
            grouped[(name, caller)].append(rec)
    return list(grouped.values())
コード例 #26
0
ファイル: joint.py プロジェクト: biocyberman/bcbio-nextgen
def batch_for_jointvc(items):
    """Group per-sample calls by batch and variant caller for joint calling.

    Samples flagged by `genotype.is_joint` are grouped under each of their
    batch names (or the sample name when unbatched); all other samples form
    their own group under the sample name. Every grouped record is a copy
    with `vrn_file_gvcf` set to its `vrn_file`.

    Args:
        items: sample inputs, each converted with `utils.to_single_data`.

    Returns:
        A list of sample groups, one per (name, variant caller) pair.
    """
    batch_groups = collections.defaultdict(list)
    for data in [utils.to_single_data(x) for x in items]:
        vc = dd.get_variantcaller(data)
        if genotype.is_joint(data):
            batches = dd.get_batches(data) or dd.get_sample_name(data)
            if not isinstance(batches, (list, tuple)):
                batches = [batches]
        else:
            batches = [dd.get_sample_name(data)]
        for b in batches:
            data = utils.deepish_copy(data)
            data["vrn_file_gvcf"] = data["vrn_file"]
            batch_groups[(b, vc)].append(data)
    # Materialize the groups: on Python 3 dict.values() is a view, which
    # cannot be indexed or concatenated like the list callers expect.
    return list(batch_groups.values())
コード例 #27
0
ファイル: genotype.py プロジェクト: biterbilen/bcbio-nextgen
def _get_batch_name(items, skip_jointcheck=False):
    """Retrieve the shared batch name for a group of items.
    """
    batch_names = collections.defaultdict(int)
    has_joint = any([is_joint(d) for d in items])
    for data in items:
        if has_joint and not skip_jointcheck:
            batches = dd.get_sample_name(data)
        else:
            batches = dd.get_batches(data) or dd.get_sample_name(data)
        if not isinstance(batches, (list, tuple)):
            batches = [batches]
        for b in batches:
            batch_names[b] += 1
    return sorted(batch_names.items(), key=lambda x: x[-1], reverse=True)[0][0]
コード例 #28
0
ファイル: genotype.py プロジェクト: dbolser-ebi/bcbio-nextgen
def batch_for_variantcall(samples):
    """Group samples into per-batch, per-caller sets for parallel variant calling.

    CWL input target. Every sample is placed into one group for each
    (batch name, variant caller) pair it belongs to; a sample without batches
    is keyed by its sample name instead.

    Args:
        samples: input samples, handed to `_dup_samples_by_variantcaller`.

    Returns:
        A list of sample groups followed by the `extras` returned from
        `_dup_samples_by_variantcaller`.
    """
    to_process, extras = _dup_samples_by_variantcaller(samples, require_bam=False)
    singles = [utils.to_single_data(x) for x in to_process]
    grouped = collections.defaultdict(list)
    for rec in singles:
        caller = get_variantcaller(rec, require_bam=False)
        names = dd.get_batches(rec) or dd.get_sample_name(rec)
        if not isinstance(names, (list, tuple)):
            names = [names]
        for name in names:
            grouped[(name, caller)].append(utils.deepish_copy(rec))
    return list(grouped.values()) + extras
コード例 #29
0
def _get_vcf_samples(calls, data):
    """Report whether the supplied call files belong to this sample.

    A file matches when its basename starts with the sample name, or one of
    the sample's batch names, followed by "-".

    Args:
        calls: possibly nested collection of call file paths.
        data: sample record to match against the file names.

    Returns:
        A list holding the sample name when any file matches, otherwise an
        empty list.
    """
    # Reading sample names from the VCF itself is switched off; only file
    # names are inspected.
    have_full_file = False
    all_samples = set([])
    for f in utils.flatten(calls):
        if have_full_file:
            cur = set(vcfutils.get_samples(f))
            if cur:
                if not all_samples:
                    all_samples = cur
                else:
                    all_samples &= set(cur)
        else:
            # Samples without batches contribute only their own name
            test_names = [dd.get_sample_name(data)] + list(dd.get_batches(data) or [])
            for test_name in test_names:
                if os.path.basename(f).startswith("%s-" % test_name):
                    all_samples.add(dd.get_sample_name(data))
    return list(all_samples)
コード例 #30
0
ファイル: joint.py プロジェクト: biocyberman/bcbio-nextgen
def run_jointvc(items):
    """Run joint variant calling for one region across a group of samples.

    The first item acts as the representative record: it supplies the region,
    work directory and caller, and receives the result. When no joint caller
    is configured, "<variantcaller>-joint" is set on it. The output file is
    named after the first batch, or the sample name for unbatched samples.

    Args:
        items: sample inputs, each converted with `utils.to_single_data`.

    Returns:
        The first sample record with `vrn_file_region` set to the "vrn_file"
        of the first result from `square_batch_region`.
    """
    items = [utils.to_single_data(x) for x in items]
    data = items[0]
    if not dd.get_jointcaller(data):
        data["config"]["algorithm"]["jointcaller"] = "%s-joint" % dd.get_variantcaller(data)
    # GenomicsDBImport uses 1-based coordinates. That's unexpected, convert over to these.
    # Split on the last ":" so contig names that contain ":" stay intact.
    chrom, coords = data["region"].rsplit(":", 1)
    start, end = coords.split("-")
    ready_region = "%s:%s-%s" % (chrom, int(start) + 1, end)
    str_region = ready_region.replace(":", "_")
    # Unbatched samples are named by sample, matching batch_for_jointvc
    batches = dd.get_batches(data) or dd.get_sample_name(data)
    if not isinstance(batches, (list, tuple)):
        batches = [batches]
    out_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data), "joint",
                                                            dd.get_variantcaller(data), str_region)),
                            "%s-%s-%s.vcf.gz" % (batches[0], dd.get_variantcaller(data), str_region))
    joint_out = square_batch_region(data, ready_region, [], [d["vrn_file"] for d in items], out_file)[0]
    data["vrn_file_region"] = joint_out["vrn_file"]
    return data
コード例 #31
0
def extract(data, items):
    """Prepare germline calls for tumor-only samples and fix germline naming.

    For a sample with a paired phenotype that has batches and is the only
    item, the variant file is passed through `_remove_prioritization`,
    bgzipped, indexed and stored under `data["vrn_file_plus"]["germline"]`.

    For a "germline" phenotype sample named "<name>-germline" whose VCF holds
    the single sample "<name>", the VCF is replaced by the output of
    `_fix_germline_samplename`.

    Returns:
        `data`, updated in place where one of the cases applies.
    """
    if vcfutils.get_paired_phenotype(data):
        if dd.get_batches(data) and len(items) == 1:
            cleaned_vcf = _remove_prioritization(data["vrn_file"], data)
            cleaned_vcf = vcfutils.bgzip_and_index(cleaned_vcf, data["config"])
            data["vrn_file_plus"] = {"germline": cleaned_vcf}
        return data
    if dd.get_phenotype(data) != "germline":
        return data
    sample_name = dd.get_sample_name(data)
    vcf_samples = vcfutils.get_samples(data["vrn_file"])
    needs_rename = (sample_name.endswith("-germline") and len(vcf_samples) == 1
                    and sample_name.replace("-germline", "") == vcf_samples[0])
    if needs_rename:
        data["vrn_file"] = _fix_germline_samplename(data["vrn_file"], sample_name, data)
    return data
コード例 #32
0
ファイル: germline.py プロジェクト: zhangj5/bcbio-nextgen
def extract(data, items):
    """Prepare germline calls for tumor-only samples and fix germline naming.

    For a sample with a paired phenotype that has batches and is the only
    item, the variant file is passed through `_remove_prioritization`,
    bgzipped, indexed and stored under `data["vrn_file_plus"]["germline"]`.

    For a "germline" phenotype sample named "<name>-germline" whose VCF holds
    the single sample "<name>", the VCF is replaced by the output of
    `fix_germline_samplename`.

    Returns:
        `data`, updated in place where one of the cases applies.
    """
    if vcfutils.get_paired_phenotype(data):
        if dd.get_batches(data) and len(items) == 1:
            cleaned_vcf = _remove_prioritization(data["vrn_file"], data)
            cleaned_vcf = vcfutils.bgzip_and_index(cleaned_vcf, data["config"])
            data["vrn_file_plus"] = {"germline": cleaned_vcf}
        return data
    if dd.get_phenotype(data) != "germline":
        return data
    sample_name = dd.get_sample_name(data)
    vcf_samples = vcfutils.get_samples(data["vrn_file"])
    needs_rename = (sample_name.endswith("-germline") and len(vcf_samples) == 1
                    and sample_name.replace("-germline", "") == vcf_samples[0])
    if needs_rename:
        data["vrn_file"] = fix_germline_samplename(data["vrn_file"], sample_name, data)
    return data
コード例 #33
0
ファイル: variation.py プロジェクト: chapmanb/bcbio-nextgen
def summarize_vc(items):
    """CWL target: summarize variant calls and validation for multiple samples.

    Flattens and normalizes the inputs, runs validation grading over them and
    copies each distinct call file into ``<work_dir>/variants/<calls|gvcf>``.

    Returns a single-element list holding a dictionary with the combined
    validation under "validate" and, under "variants", the copied "calls" and
    "gvcf" files plus a "samples" list with the output files per sample, in
    order of first appearance.
    """
    singles = [utils.to_single_data(raw) for raw in utils.flatten(items)]
    normalized = [_normalize_vc_input(single) for single in singles]
    graded = [utils.to_single_data(g) for g in validate.summarize_grading(normalized)]
    summary = {"validate": validate.combine_validations(graded),
               "variants": {"calls": [], "gvcf": [], "samples": []}}
    seen_names = set()
    files_by_sample = collections.defaultdict(list)
    ordered_samples = []
    for data in graded:
        members = data.get("batch_samples", [dd.get_sample_name(data)])
        for member in members:
            if member not in ordered_samples:
                ordered_samples.append(member)
        if not data.get("vrn_file"):
            continue
        # Batch names only apply when bcbio did the variant calling itself;
        # otherwise the original files are used and named by sample.
        batch_names = dd.get_batches(data) if dd.get_variantcaller(data) else None
        if not batch_names:
            batch_names = [dd.get_sample_name(data)]
        batch_name = batch_names[0]
        if data.get("vrn_file_joint") is None:
            targets = [("vrn_file", "calls", batch_name)]
        else:
            targets = [("vrn_file", "gvcf", dd.get_sample_name(data)),
                       ("vrn_file_joint", "calls", batch_name)]
        for source_key, kind, label in targets:
            tagged = "%s-%s" % (label, dd.get_variantcaller(data))
            kind_dir = utils.safe_makedir(
                os.path.join(dd.get_work_dir(data), "variants", kind))
            vcf_path = os.path.join(kind_dir, "%s.vcf.gz" % tagged)
            for member in members:
                files_by_sample[member].append(vcf_path)
            if tagged in seen_names:
                continue
            seen_names.add(tagged)
            # A copy rather than a symlink: symlinks do not appear to work with
            # Docker container runs on Toil where PATHs don't get remapped.
            utils.copy_plus(os.path.realpath(data[source_key]), vcf_path)
            vcfutils.bgzip_and_index(vcf_path, data["config"])
            summary["variants"][kind].append(vcf_path)
    for sample in ordered_samples:
        summary["variants"]["samples"].append(files_by_sample[sample])
    return [summary]
コード例 #34
0
def batch_for_variantcall(samples):
    """Group samples into per-batch, per-caller sets for parallel variant calling.

    CWL input target. Every record is added, as a copy, to one group for each
    (batch, variant caller) pair it belongs to; a record without batches is
    grouped under its own sample name.

    Returns the list of groups followed by the extra entries returned by
    ``_dup_samples_by_variantcaller``.
    """
    callable_samples, extras = _dup_samples_by_variantcaller(samples, require_bam=False)
    singles = [utils.to_single_data(sample) for sample in callable_samples]
    grouped = collections.defaultdict(list)
    for record in cwlutils.samples_to_records(singles):
        caller = get_variantcaller(record, require_bam=False)
        batch_names = dd.get_batches(record)
        if not batch_names:
            batch_names = dd.get_sample_name(record)
        # The sample-name fallback is a single value, so wrap it in a list
        if not isinstance(batch_names, (list, tuple)):
            batch_names = [batch_names]
        for batch_name in batch_names:
            grouped[(batch_name, caller)].append(utils.deepish_copy(record))
    return list(grouped.values()) + extras
コード例 #35
0
def run_jointvc(items):
    """Run joint calling over one region for a set of per-sample variant files.

    The first item supplies the region, configuration and output naming. When
    it has no joint caller configured, one is set in place to
    "<variantcaller>-joint". The ``vrn_file`` of every item is passed to
    ``square_batch_region`` and the resulting file is stored as
    ``vrn_file_region`` on the first item, which is returned.
    """
    items = [utils.to_single_data(x) for x in items]
    data = items[0]
    if not dd.get_jointcaller(data):
        data["config"]["algorithm"]["jointcaller"] = "%s-joint" % dd.get_variantcaller(data)
    # GenomicsDBImport uses 1-based coordinates. That's unexpected, convert over to these.
    # Split on the last colon only, so contig names that themselves contain ":"
    # do not break the unpacking.
    chrom, coords = data["region"].rsplit(":", 1)
    start, end = coords.split("-")
    ready_region = "%s:%s-%s" % (chrom, int(start) + 1, end)
    str_region = ready_region.replace(":", "_")
    batches = dd.get_batches(data) or dd.get_sample_name(data)
    if not isinstance(batches, (list, tuple)):
        batches = [batches]
    out_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data), "joint",
                                                            dd.get_variantcaller(data), str_region)),
                            "%s-%s-%s.vcf.gz" % (batches[0], dd.get_variantcaller(data), str_region))
    joint_out = square_batch_region(data, ready_region, [], [d["vrn_file"] for d in items], out_file)[0]
    data["vrn_file_region"] = joint_out["vrn_file"]
    return data
コード例 #36
0
def batch(samples):
    """CWL: batch together per sample, joint and germline calls for ensemble combination.

    Sets up groups of same sample/batch variant calls for ensemble calling, as
    long as we have more than one caller per group.

    Inputs are grouped by their ``batch_samples`` (falling back to the sample's
    own name) together with their phenotype. Each group with more than one
    entry yields one record: a deep copy of the entry with the lowest variant
    caller order, updated with ``batch_id``, ``batch_samples`` and a
    ``variants`` dictionary listing the callers and call files of the group.

    Returns the records ordered by the earliest input position of any of their
    batch samples.
    """
    samples = [utils.to_single_data(x) for x in samples]
    sample_order = [dd.get_sample_name(x) for x in samples]
    batch_groups = collections.defaultdict(list)
    for data in samples:
        batch_samples = tuple(
            data.get("batch_samples", [dd.get_sample_name(data)]))
        batch_groups[(batch_samples, dd.get_phenotype(data))].append(data)

    out = []
    for (batch_samples, _phenotype), gsamples in batch_groups.items():
        if len(gsamples) <= 1:
            continue
        batches = set()
        for d in gsamples:
            # A sample without batches gives a falsy value here; treat it as
            # contributing no batch names instead of failing in set().
            batches.update(dd.get_batches(d) or [])
        gsamples.sort(key=dd.get_variantcaller_order)
        cur = copy.deepcopy(gsamples[0])
        cur.update({
            "batch_id": min(batches) if batches else "_".join(batch_samples),
            "batch_samples": batch_samples,
            "variants": {
                "variantcallers": [dd.get_variantcaller(d) for d in gsamples],
                "calls": [d.get("vrn_file") for d in gsamples],
            },
        })
        out.append(cur)

    def by_original_order(d):
        return min([
            sample_order.index(s) for s in d["batch_samples"]
            if s in sample_order
        ])

    return sorted(out, key=by_original_order)
コード例 #37
0
ファイル: vcfutils.py プロジェクト: lbeltrame/bcbio-nextgen
def somatic_batches(items):
    """Group items into somatic calling batches (tumor-only or tumor/normal).

    Items without a paired phenotype are collected as non-somatic. All other
    items are collected as somatic and also added to the group of every batch
    they belong to; a somatic item without batches is in no group.

    Returns a tuple ``(batches, somatic, non_somatic)`` where ``batches`` is a
    list of per-batch item lists, and ``somatic`` and ``non_somatic`` together
    hold the original items.
    """
    non_somatic = []
    somatic = []
    data_by_batches = defaultdict(list)
    for data in items:
        if not get_paired_phenotype(data):
            non_somatic.append(data)
        else:
            somatic.append(data)
            for batch in dd.get_batches(data) or []:
                data_by_batches[batch].append(data)
    # Materialize the groups: on Python 3 dict.values() is a view that cannot
    # be indexed or concatenated like a list.
    return list(data_by_batches.values()), somatic, non_somatic
コード例 #38
0
ファイル: vcfutils.py プロジェクト: zhangj5/bcbio-nextgen
def somatic_batches(items):
    """Group items into somatic calling batches (tumor-only or tumor/normal).

    Items without a paired phenotype are collected as non-somatic. All other
    items are collected as somatic and also added to the group of every batch
    they belong to; a somatic item without batches is in no group.

    Returns a tuple ``(batches, somatic, non_somatic)`` where ``batches`` is a
    list of per-batch item lists, and ``somatic`` and ``non_somatic`` together
    hold the original items.
    """
    non_somatic = []
    somatic = []
    data_by_batches = defaultdict(list)
    for data in items:
        if not get_paired_phenotype(data):
            non_somatic.append(data)
        else:
            somatic.append(data)
            for batch in dd.get_batches(data) or []:
                data_by_batches[batch].append(data)
    # Materialize the groups: on Python 3 dict.values() is a view that cannot
    # be indexed or concatenated like a list.
    return list(data_by_batches.values()), somatic, non_somatic
コード例 #39
0
def batch_for_variantcall(samples):
    """Group samples into per-batch, per-caller sets for parallel variant calling.

    CWL input target. Every record is added, as a copy, to one group for each
    (batch, variant caller) pair it belongs to; a record without batches is
    grouped under its own sample name.

    If doing joint calling, with `tools_on: [gvcf]`, a group is split into one
    single-sample group per member instead of being kept as a batch.

    Returns the groups, together with the extra entries returned by
    ``_dup_samples_by_variantcaller``, sorted by the earliest input position
    of any member sample and then by the lowest variant caller order.
    """
    original_names = [
        dd.get_sample_name(utils.to_single_data(sample)) for sample in samples
    ]
    callable_samples, extras = _dup_samples_by_variantcaller(samples,
                                                             require_bam=False)
    grouped = collections.defaultdict(list)
    singles = [utils.to_single_data(sample) for sample in callable_samples]
    for record in cwlutils.samples_to_records(singles):
        caller = get_variantcaller(record, require_bam=False)
        batch_names = dd.get_batches(record)
        if not batch_names:
            batch_names = dd.get_sample_name(record)
        # The sample-name fallback is a single value, so wrap it in a list
        if not isinstance(batch_names, (list, tuple)):
            batch_names = [batch_names]
        for batch_name in batch_names:
            grouped[(batch_name, caller)].append(utils.deepish_copy(record))

    call_groups = []
    for members in grouped.values():
        joint_flags = [is_joint(member) for member in members]
        if any(joint_flags):
            call_groups.extend([member] for member in members)
        else:
            call_groups.append(members)

    def _input_position(group):
        first_sample = min(
            [original_names.index(dd.get_sample_name(x)) for x in group])
        first_caller = min([dd.get_variantcaller_order(x) for x in group])
        return (first_sample, first_caller)

    return sorted(call_groups + extras, key=_input_position)
コード例 #40
0
def get_variants(data, include_germline=False):
    """Retrieve set of variant calls to use for heterogeneity analysis.

    Works on a copy of ``data``. CWL based input, where ``data["variants"]`` is
    a dictionary with a "samples" list of files, is first converted into a list
    of ``{"vrn_file": ..., "variantcaller": ...}`` entries; the caller name is
    taken from the file base name after removing the sample and batch name
    prefixes.

    Args:
        data: sample dictionary holding the variant calls under "variants".
        include_germline: when true, "gatk-haplotype" is also accepted, ranked
            second in the list of supported callers.

    Returns:
        The entries that have a ``vrn_file`` and a supported caller, ordered
        by the caller's position in the list of supported callers. Entries
        for the same caller keep their input order.
    """
    data = utils.deepish_copy(data)
    supported = [
        "precalled", "vardict", "vardict-java", "vardict-perl", "freebayes",
        "octopus", "strelka2"
    ]
    # Right now mutect2 and mutect do not provide heterozygous germline calls
    # to be useful https://github.com/bcbio/bcbio-nextgen/issues/2464
    # supported += ["mutect2", "mutect"]
    if include_germline:
        supported.insert(1, "gatk-haplotype")
    out = []
    # CWL based input
    if isinstance(data.get("variants"),
                  dict) and "samples" in data["variants"]:
        cur_vs = []
        # Unpack single sample list of files
        if (isinstance(data["variants"]["samples"],
                       (list, tuple)) and len(data["variants"]["samples"]) == 1
                and isinstance(data["variants"]["samples"][0], (list, tuple))):
            data["variants"]["samples"] = data["variants"]["samples"][0]
        # A sample without batches gives a falsy value; iterate nothing then
        batches = dd.get_batches(data) or []
        for fname in data["variants"]["samples"]:
            variantcaller = utils.splitext_plus(os.path.basename(fname))[0]
            variantcaller = variantcaller.replace(
                dd.get_sample_name(data) + "-", "")
            for batch in batches:
                variantcaller = variantcaller.replace(batch + "-", "")
            cur_vs.append({"vrn_file": fname, "variantcaller": variantcaller})
        data["variants"] = cur_vs
    for v in data.get("variants", []):
        if v["variantcaller"] in supported and v.get("vrn_file"):
            out.append((supported.index(v["variantcaller"]), v))
    # Sort on the rank only: comparing whole tuples falls through to comparing
    # the dictionaries on a tie, which raises TypeError on Python 3.
    out.sort(key=lambda ranked: ranked[0])
    return [v for _, v in out]
コード例 #41
0
def summarize_sv(items):
    """CWL target: summarize structural variants for multiple samples.

    Runs "svvalidate" grading over the inputs and copies each distinct VCF
    call file, named by batch (or sample) and caller, into
    ``<work_dir>/sv/calls``.

    Returns a single-element list holding a dictionary with the copied files
    under "sv" -> "calls" and the combined validation under "svvalidate".

    XXX Need to support non-VCF output as tabix indexed output
    """
    graded = [
        utils.to_single_data(item)
        for item in vcvalidate.summarize_grading(items, "svvalidate")
    ]
    summary = {
        "sv": {
            "calls": []
        },
        "svvalidate": vcvalidate.combine_validations(graded, "svvalidate")
    }
    seen_names = set()
    for data in graded:
        if not data.get("sv"):
            continue
        batch_names = dd.get_batches(data)
        if not batch_names:
            batch_names = [dd.get_sample_name(data)]
        label = "%s-%s" % (batch_names[0], data["sv"]["variantcaller"])
        if not data["sv"].get("vrn_file"):
            continue
        ext = utils.splitext_plus(data["sv"]["vrn_file"])[-1]
        # Only the first VCF seen for each batch/caller name is copied
        if label in seen_names or not ext.startswith(".vcf"):
            continue
        seen_names.add(label)
        calls_dir = utils.safe_makedir(
            os.path.join(dd.get_work_dir(data), "sv", "calls"))
        copied = os.path.join(calls_dir, "%s%s" % (label, ext))
        utils.copy_plus(data["sv"]["vrn_file"], copied)
        copied = vcfutils.bgzip_and_index(copied, data["config"])
        summary["sv"]["calls"].append(copied)
    return [summary]
コード例 #42
0
ファイル: genotype.py プロジェクト: lijiayong/bcbio-nextgen
def batch_for_variantcall(samples):
    """Group samples into per-batch, per-caller sets for parallel variant calling.

    CWL input target. Before grouping, every sample is given the union of all
    samples' CWL keys plus a fixed set of default keys: a key whose value is
    missing is set to None and recorded in the sample's "cwl_keys", and the
    "tools_on"/"tools_off" values are converted to lists. Metadata defaults
    are then filled in.

    Returns the list of groups followed by the extra entries returned by
    ``_dup_samples_by_variantcaller``.
    """
    from bcbio.pipeline import run_info
    list_valued_keys = set(["config__algorithm__tools_on", "config__algorithm__tools_off"])
    always_present_keys = set(["metadata__batch", "config__algorithm__validate",
                               "config__algorithm__validate_regions"])
    callable_samples, extras = _dup_samples_by_variantcaller(samples, require_bam=False)
    grouped = collections.defaultdict(list)
    singles = [utils.to_single_data(sample) for sample in callable_samples]
    shared_keys = set()
    for single in singles:
        shared_keys.update(single["cwl_keys"])
    shared_keys.update(always_present_keys)
    ordered_keys = sorted(shared_keys)
    for data in singles:
        for flat_key in ordered_keys:
            path = flat_key.split("__")
            if tz.get_in(path, data) is None:
                data = tz.update_in(data, path, lambda _: None)
                data["cwl_keys"].append(flat_key)
            if flat_key in list_valued_keys:
                current = tz.get_in(path, data)
                if not current:
                    listed = []
                elif isinstance(current, (list, tuple)):
                    listed = current
                else:
                    listed = [current]
                data = tz.update_in(data, path, lambda _: listed)
        caller = get_variantcaller(data, require_bam=False)
        data["metadata"] = run_info.add_metadata_defaults(data.get("metadata", {}))
        batch_names = dd.get_batches(data)
        if not batch_names:
            batch_names = dd.get_sample_name(data)
        # The sample-name fallback is a single value, so wrap it in a list
        if not isinstance(batch_names, (list, tuple)):
            batch_names = [batch_names]
        for batch_name in batch_names:
            grouped[(batch_name, caller)].append(utils.deepish_copy(data))
    return list(grouped.values()) + extras