def calculate_sv_bins(*items):
    """Determine bin sizes and regions to use for samples.

    Unified approach to prepare regional bins for coverage calculations across
    multiple CNV callers. Splits into target and antitarget regions allowing
    callers to take advantage of both. Provides consistent target/anti-target
    bin sizes across batches.

    Uses callable_regions as the access BED file and mosdepth regions in
    variant_regions to estimate depth for bin sizes.

    Returns a list of single-item lists, one per input sample. Samples using
    general SV bins get ``data["regions"]["bins"]`` filled with the target,
    antitarget and GC annotated files plus the index of their CNV group.

    Raises AssertionError when the number of output samples differs from the
    number of input samples.
    """
    calcfns = {"cnvkit": _calculate_sv_bins_cnvkit, "gatk-cnv": _calculate_sv_bins_gatk}
    from bcbio.structural import cnvkit
    items = [utils.to_single_data(x) for x in cwlutils.handle_combined_input(items)]
    if all(not cnvkit.use_general_sv_bins(x) for x in items):
        return [[d] for d in items]
    out = []
    for i, cnv_group in enumerate(_group_by_cnv_method(multi.group_by_batch(items, False))):
        size_calc_fn = MemoizedSizes(cnv_group.region_file, cnv_group.items).get_target_antitarget_bin_sizes
        for data in cnv_group.items:
            if cnvkit.use_general_sv_bins(data):
                # NOTE(review): the call was truncated after `cnv_group,`; the final
                # argument is restored as size_calc_fn, the only unused local here.
                calc_fn = calcfns[cnvkit.bin_approach(data)]
                target_bed, anti_bed, gcannotated_tsv = calc_fn(data, cnv_group, size_calc_fn)
                if not data.get("regions"):
                    data["regions"] = {}
                data["regions"]["bins"] = {"target": target_bed, "antitarget": anti_bed, "group": str(i),
                                           "gcannotated": gcannotated_tsv}
            # Every sample is passed through, with or without bins, so the
            # consistency check below can compare inputs and outputs.
            out.append([data])
    if len(out) != len(items):
        raise AssertionError("Inconsistent samples in and out of SV bin calculation:\nout: %s\nin : %s" %
                             (sorted([dd.get_sample_name(utils.to_single_data(x)) for x in out]),
                              sorted([dd.get_sample_name(x) for x in items])))
    return out
Exemple #2
def quantitate(data):
    """CWL target for quantitation.

    XXX Needs to be split and parallelized by expression caller, with merging
    of multiple calls.

    Runs each configured expression caller in turn and records the resulting
    files under ``data["quant"]`` (``tsv``, ``hdf5`` and ``fusion``). Later
    callers overwrite the ``tsv``/``hdf5`` entries written by earlier ones.

    Returns the sample wrapped as ``[[data]]``.
    """
    data = to_single_data(to_single_data(data))
    data = generate_transcript_counts(data)[0][0]
    data["quant"] = {}
    if "sailfish" in dd.get_expression_caller(data):
        data = to_single_data(sailfish.run_sailfish(data)[0])
        data["quant"]["tsv"] = data["sailfish"]
        data["quant"]["hdf5"] = os.path.join(os.path.dirname(data["sailfish"]), "abundance.h5")
    if ("kallisto" in dd.get_expression_caller(data) or "pizzly" in dd.get_fusion_caller(data, [])):
        data = to_single_data(kallisto.run_kallisto_rnaseq(data)[0])
        data["quant"]["tsv"] = os.path.join(data["kallisto_quant"], "abundance.tsv")
        data["quant"]["hdf5"] = os.path.join(data["kallisto_quant"], "abundance.h5")
    # Only look for fusion output when kallisto produced a directory; a missing
    # "kallisto_quant" key means no fusion file rather than a KeyError.
    kallisto_dir = data.get("kallisto_quant")
    fusion_file = os.path.join(kallisto_dir, "fusion.txt") if kallisto_dir else None
    if fusion_file and os.path.exists(fusion_file):
        data["quant"]["fusion"] = fusion_file
    else:
        data["quant"]["fusion"] = None
    if "salmon" in dd.get_expression_caller(data):
        data = to_single_data(salmon.run_salmon_reads(data)[0])
        data["quant"]["tsv"] = data["salmon"]
        data["quant"]["hdf5"] = os.path.join(os.path.dirname(data["salmon"]), "abundance.h5")
    return [[data]]
Exemple #3
def _write_out_argfile(argfile, out, fnargs, parallel, out_keys, input_files, work_dir):
    """Write output argfile, preparing a CWL ready JSON or YAML representation of the world.

    For ``.json`` argfiles the output is converted to CWL records (when
    ``out_keys`` describe a record) or to plain CWL JSON, with the layout
    chosen from the ``parallel`` mode. Any other argfile gets a YAML dump of
    ``out``. ``work_dir`` is accepted for interface compatibility and unused.
    """
    json_args = {"sort_keys": True, "indent": 4, "separators": (', ', ': ')}
    with open(argfile, "w") as out_handle:
        if argfile.endswith(".json"):
            record_name, record_attrs = _get_record_attrs(out_keys)
            if record_name:
                if parallel in ["multi-batch"]:
                    recs = _nested_cwl_record(out, record_attrs, input_files)
                # NOTE(review): this list was truncated after "batch-single";
                # "single-single" is restored as the last entry -- confirm upstream.
                elif parallel in ["single-split", "multi-combined", "multi-parallel", "batch-single",
                                  "single-single"]:
                    recs = [_collapse_to_cwl_record_single(utils.to_single_data(xs), record_attrs, input_files)
                            for xs in out]
                else:
                    samples = [utils.to_single_data(xs) for xs in out]
                    recs = [_collapse_to_cwl_record(samples, record_attrs, input_files)]
                json.dump(_combine_cwl_records(recs, record_name, parallel), out_handle, **json_args)
            elif parallel in ["single-split", "multi-combined", "batch-split"]:
                json.dump(_convert_to_cwl_json([utils.to_single_data(xs) for xs in out], fnargs, input_files),
                          out_handle, **json_args)
            else:
                json.dump(_convert_to_cwl_json(utils.to_single_data(utils.to_single_data(out)), fnargs, input_files),
                          out_handle, **json_args)
        else:
            yaml.safe_dump(out, out_handle, default_flow_style=False, allow_unicode=False)
Exemple #4
def precall(items):
    """Perform initial pre-calling steps -- coverage calculation by sample.

    Use sambamba to call average region coverage in regions, and convert into a correct format.

    Expects exactly one sample in ``items``. Appends a ``seq2c`` entry with the
    coverage file to ``data["sv"]`` and returns ``[data]``.
    """
    items = [utils.to_single_data(x) for x in items]
    assert len(items) == 1, "Expect one item to Seq2C coverage calculation"
    data = utils.to_single_data(items)
    # sv_bed could specify a smaller region than variant coverage, so avoid
    # this sanity check
    # assert dd.get_coverage_interval(data) != "genome", "Seq2C only for amplicon and exome sequencing"

    assert "seq2c_bed_ready" in data["config"]["algorithm"], "Error: svregions or variant_regions BED file required for Seq2C"

    bed_file = data["config"]["algorithm"]["seq2c_bed_ready"]
    bam_file = dd.get_align_bam(data)
    sample_name = dd.get_sample_name(data)

    work_dir = _sv_workdir(data)
    cov_file = _calculate_coverage(data, work_dir, bed_file, bam_file, sample_name)

    if "sv" not in data:
        data["sv"] = []
    data["sv"].append({"variantcaller": "seq2c",
                       "coverage": cov_file})
    return [data]
Exemple #5
def calculate_sv_bins(*items):
    """Determine bin sizes and regions to use for samples.

    Unified approach to prepare regional bins for coverage calculations across
    multiple CNV callers. Splits into target and antitarget regions allowing
    callers to take advantage of both. Provides consistent target/anti-target
    bin sizes across batches.

    Uses callable_regions as the access BED file and mosdepth regions in
    variant_regions to estimate depth for bin sizes.

    Returns the inputs unchanged when no sample uses general SV bins, otherwise
    a list of single-item lists with ``data["regions"]["bins"]`` filled in.

    Raises AssertionError when the number of output samples differs from the
    number of input samples.
    """
    from bcbio.structural import cnvkit
    if all(not cnvkit.use_general_sv_bins(utils.to_single_data(x)) for x in items):
        return items
    items = [utils.to_single_data(x) for x in items]
    out = []
    for cnv_group in _group_by_cnv_method(multi.group_by_batch(items, False)):
        size_calc_fn = MemoizedSizes(cnv_group.region_file, cnv_group.items).get_target_antitarget_bin_sizes
        for data in cnv_group.items:
            target_bed, anti_bed = cnvkit.targets_w_bins(cnv_group.region_file, cnv_group.access_file, size_calc_fn,
                                                         cnv_group.work_dir, data)
            if not data.get("regions"):
                data["regions"] = {}
            data["regions"]["bins"] = {"target": target_bed, "antitarget": anti_bed}
            # Collect each processed sample so the consistency check below holds.
            out.append([data])
    if len(out) != len(items):
        raise AssertionError("Inconsistent samples in and out of SV bin calculation:\nout: %s\nin : %s" %
                             (sorted([dd.get_sample_name(utils.to_single_data(x)) for x in out]),
                              sorted([dd.get_sample_name(x) for x in items])))
    return out
Exemple #6
def call_hla(data):
    """Run the configured HLA caller on a sample, when HLA inputs are present.

    The caller named in ``config -> algorithm -> hlacaller`` is looked up in
    ``_CALLERS`` and applied only if the sample has ``hla -> fastq`` inputs.
    Samples without an ``hla`` entry get an empty one. Returns ``[[data]]``.
    """
    sample = utils.to_single_data(utils.to_single_data(data))
    caller_name = tz.get_in(["config", "algorithm", "hlacaller"], sample)
    if "hla" not in sample:
        # Nothing to call on; still provide the key downstream steps read.
        sample["hla"] = {}
    elif caller_name and tz.get_in(["hla", "fastq"], sample):
        sample = _CALLERS[caller_name](sample)
    return [[sample]]
def qc_to_rec(samples):
    """CWL: Convert a set of input samples into records for parallelization.

    Returns one single-item list per record, covering both the samples selected
    for QC and the extras that are passed along.
    """
    samples = [utils.to_single_data(x) for x in samples]
    samples = cwlutils.assign_complex_to_samples(samples)
    to_analyze, extras = _split_samples_by_qc(samples)
    recs = cwlutils.samples_to_records([utils.to_single_data(x) for x in to_analyze + extras])
    return [[x] for x in recs]
Exemple #8
def _read_from_cwlinput(in_file, work_dir, runtime, parallel, input_order, output_cwl_keys):
    """Read data records from a JSON dump of inputs. Avoids command line flattening of records.

    Keys starting with ``sentinel`` or ``ignore`` are skipped and a
    ``_toolinput`` suffix is stripped. Inputs marked as ``record`` in
    ``input_order`` are read as CWL records; all others are stored under their
    ``__``-split key path. Returns the finalized items, preserving any nesting
    produced by the merge step.
    """
    with open(in_file) as in_handle:
        inputs = json.load(in_handle)
    items_by_key = {}
    passed_keys = set([])
    for key, input_val in ((k, v) for (k, v) in inputs.items() if not k.startswith(("sentinel", "ignore"))):
        if key.endswith("_toolinput"):
            key = key.replace("_toolinput", "")
        if input_order[key] == "record":
            cur_keys, items = _read_cwl_record(input_val)
            passed_keys |= cur_keys
            items_by_key[key] = items
        else:
            items_by_key[tuple(key.split("__"))] = _cwlvar_to_wdl(input_val)
    prepped = _merge_cwlinputs(items_by_key, input_order, parallel)
    out = []
    for data in prepped:
        if isinstance(data, (list, tuple)):
            out.append([_finalize_cwl_in(utils.to_single_data(x), work_dir, list(passed_keys),
                                         output_cwl_keys, runtime) for x in data])
        else:
            out.append(_finalize_cwl_in(data, work_dir, list(passed_keys), output_cwl_keys, runtime))
    return out
Exemple #9
def run(items):
    """Normalization and log2 ratio calculation plus CNV calling for full cohort.

    - Combine coverage of each region for each sample
    - Prepare read counts for each sample
    - Normalize coverages in cohort by gene and sample, and calculate log2 ratios
    - Call amplifications and deletions

    Returns the input samples after splitting the cohort-level calls per sample.
    """
    items = [utils.to_single_data(x) for x in items]
    work_dir = _sv_workdir(items[0])

    input_backs = list(set(filter(lambda x: x is not None,
                                  [dd.get_background_cnv_reference(d, "seq2c") for d in items])))
    coverage_file = _combine_coverages(items, work_dir, input_backs)
    read_mapping_file = _calculate_mapping_reads(items, work_dir, input_backs)
    normal_names = []
    if input_backs:
        with open(input_backs[0]) as in_handle:
            for line in in_handle:
                parts = line.split()
                if len(parts) == 2:
                    # NOTE(review): the body of this branch was missing; taking the
                    # first column as the sample name is an assumption -- confirm
                    # against the background file format.
                    normal_names.append(parts[0])
    normal_names += [dd.get_sample_name(x) for x in items if population.get_affected_status(x) == 1]
    seq2c_calls_file = _call_cnv(items, work_dir, read_mapping_file, coverage_file, normal_names)
    items = _split_cnv(items, seq2c_calls_file, read_mapping_file, coverage_file)
    return items
Exemple #10
def _group_batches_shared(xs, caller_batch_fn, prep_data_fn):
    """Shared functionality for grouping by batches for variant calling and joint calling.

    ``caller_batch_fn(data)`` returns a ``(caller, batch)`` pair. Samples with a
    batch are grouped by ``(batch, region, caller)``; samples without one are
    prepared on their own. ``prep_data_fn(data, items)`` prepares the lead item
    of each group. Returns the unbatched samples followed by one item per batch.
    """
    singles = []
    batch_groups = collections.defaultdict(list)
    for args in xs:
        data = utils.to_single_data(args)
        caller, batch = caller_batch_fn(data)
        region = _list_to_tuple(data["region"]) if "region" in data else ()
        if batch is not None:
            batches = batch if isinstance(batch, (list, tuple)) else [batch]
            for b in batches:
                batch_groups[(b, region, caller)].append(utils.deepish_copy(data))
        else:
            data = prep_data_fn(data, [data])
            singles.append(data)
    batches = []
    for batch, items in batch_groups.items():
        batch_data = utils.deepish_copy(_pick_lead_item(items))
        # For nested primary batches, split permanently by batch
        if tz.get_in(["metadata", "batch"], batch_data):
            batch_name = batch[0]
            batch_data["metadata"]["batch"] = batch_name
        batch_data = prep_data_fn(batch_data, items)
        batch_data["group_orig"] = _collapse_subitems(batch_data, items)
        batch_data["group"] = batch
        batches.append(batch_data)
    return singles + batches
Exemple #11
def batch_for_variantcall(samples):
    """Prepare a set of samples for parallel variant calling.

    CWL input target that groups samples into batches and variant callers
    for parallel processing.

    If doing joint calling, with `tools_on: [gvcf]`, split the sample into
    individuals instead of combining into a batch.

    Returns the list of batches followed by the extra samples that are not
    processed.
    """
    to_process, extras = _dup_samples_by_variantcaller(samples, require_bam=False)
    batch_groups = collections.defaultdict(list)
    to_process = [utils.to_single_data(x) for x in to_process]
    for data in cwlutils.samples_to_records(to_process):
        vc = get_variantcaller(data, require_bam=False)
        batches = dd.get_batches(data) or dd.get_sample_name(data)
        if not isinstance(batches, (list, tuple)):
            batches = [batches]
        for b in batches:
            batch_groups[(b, vc)].append(utils.deepish_copy(data))
    batches = []
    for cur_group in batch_groups.values():
        joint_calling = any([is_joint(d) for d in cur_group])
        if joint_calling:
            # Joint calling handles each sample individually.
            for d in cur_group:
                batches.append([d])
        else:
            batches.append(cur_group)
    return batches + extras
Exemple #12
def vc_output_record(samples):
    """Prepare output record from variant calling to feed into downstream analysis.

    Prep work handles reformatting so we return generated dictionaries.

    For any shared keys that are calculated only once for a batch, like variant calls
    for the batch, we assign to every sample.

    Returns one single-item list per sample. Raises AssertionError when a
    shared key has more than one distinct value across the batch.
    """
    shared_keys = [["vrn_file"], ["validate", "summary"],
                   ["validate", "tp"], ["validate", "fp"], ["validate", "fn"]]
    raw = cwlutils.samples_to_records([utils.to_single_data(x) for x in samples])
    shared = {}
    for key in shared_keys:
        cur = list(set([x for x in [tz.get_in(key, d) for d in raw] if x]))
        if len(cur) > 0:
            assert len(cur) == 1, (key, cur)
            shared[tuple(key)] = cur[0]
        else:
            shared[tuple(key)] = None
    out = []
    for d in raw:
        for key, val in shared.items():
            d = tz.update_in(d, key, lambda x: val)
        out.append([d])
    return out
Exemple #13
def calculate_sv_coverage(data):
    """Calculate coverage within bins for downstream CNV calling.

    Creates corrected cnr files with log2 ratios and depths.

    Stores the target, antitarget and Seq2C coverage outputs under
    ``data["depth"]["bins"]``, using None for any that were not produced, and
    returns ``[[data]]``.
    """
    calcfns = {"cnvkit": _calculate_sv_coverage_cnvkit, "gatk-cnv": _calculate_sv_coverage_gatk}
    from bcbio.structural import cnvkit
    data = utils.to_single_data(data)
    if not cnvkit.use_general_sv_bins(data):
        out_target_file, out_anti_file = (None, None)
    else:
        work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural",
                                                   dd.get_sample_name(data), "bins"))
        out_target_file, out_anti_file = calcfns[cnvkit.bin_approach(data)](data, work_dir)
        if not os.path.exists(out_target_file):
            out_target_file, out_anti_file = (None, None)
    if "seq2c" in dd.get_svcaller(data):
        from bcbio.structural import seq2c
        seq2c_target = seq2c.precall(data)
    else:
        seq2c_target = None

    if not tz.get_in(["depth", "bins"], data):
        data = tz.update_in(data, ["depth", "bins"], lambda x: {})
    data["depth"]["bins"] = {"target": out_target_file, "antitarget": out_anti_file, "seq2c": seq2c_target}
    return [[data]]
Exemple #14
def _combine_qc_samples(samples):
    """Combine split QC analyses into single samples based on BAM files.

    Samples are grouped by ``(aligned BAM, batch)``. The first sample of each
    group receives the combined QC information and is returned as a
    single-item list.
    """
    by_bam = collections.defaultdict(list)
    for data in [utils.to_single_data(x) for x in samples]:
        batch = dd.get_batch(data) or dd.get_sample_name(data)
        if not isinstance(batch, (list, tuple)):
            batch = [batch]
        batch = tuple(batch)
        by_bam[(dd.get_align_bam(data), batch)].append(data)
    out = []
    for data_group in by_bam.values():
        data = data_group[0]
        alg_qc = []
        qc = {}
        metrics = {}
        for d in data_group:
            # NOTE(review): this loop body was missing. The accessor names below
            # are an assumption based on the keys written afterwards -- confirm
            # they exist on `dd` before relying on this.
            qc.update(dd.get_summary_qc(d))
            metrics.update(dd.get_summary_metrics(d))
            alg_qc.extend(dd.get_algorithm_qc(d))
        data["config"]["algorithm"]["qc"] = alg_qc
        data["summary"]["qc"] = qc
        data["summary"]["metrics"] = metrics
        out.append([data])
    return out
Exemple #15
def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.
    Prepares list of callable genome regions allowing subsequent parallelization.

    Only runs when the sample has a ``.bam`` file that needs processing;
    otherwise the sample is returned untouched. Returns ``[[data]]``.
    """
    data = utils.to_single_data(data)
    bam_file = data.get("align_bam") or data.get("work_bam")
    if vmulti.bam_needs_processing(data) and bam_file and bam_file.endswith(".bam"):
        ref_file = dd.get_ref_file(data)
        # NOTE(review): the call was truncated after "align"; the per-sample
        # sub-directory is restored here -- confirm against upstream.
        out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                                  dd.get_sample_name(data)))
        bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
        if not utils.file_exists(bam_file_ready):
            utils.symlink_plus(bam_file, bam_file_ready)
        bam.index(bam_file_ready, data["config"])
        callable_region_bed, nblock_bed, callable_bed = \
            callable.block_regions(bam_file_ready, ref_file, data)
        sample_callable = callable.sample_callable_bed(bam_file_ready, ref_file, data)
        offtarget_stats = callable.calculate_offtarget(bam_file_ready, ref_file, data)
        data["regions"] = {"nblock": nblock_bed, "callable": callable_bed,
                           "sample_callable": sample_callable,
                           "offtarget_stats": offtarget_stats}
        data = coverage.assign_interval(data)
        highdepth_bed = highdepth.identify(data)
        data["regions"]["highdepth"] = highdepth_bed
        # Fall back to the callable regions when no variant regions were configured.
        if (os.path.exists(callable_region_bed) and
                not data["config"]["algorithm"].get("variant_regions")):
            data["config"]["algorithm"]["variant_regions"] = callable_region_bed
            data = bedutils.clean_inputs(data)
        data = _recal_no_markduplicates(data)
    return [[data]]
Exemple #16
def normalize_sv_coverage(*items):
    """Normalize CNV coverage, providing flexible point for multiple methods.

    Samples are processed per bin group (``regions -> bins -> group``). Samples
    with normalized output get ``background`` and ``normalized`` entries added
    to ``data["depth"]["bins"]``. Returns one single-item list per sample.
    """
    calcfns = {"cnvkit": _normalize_sv_coverage_cnvkit, "gatk-cnv": _normalize_sv_coverage_gatk}
    from bcbio.structural import cnvkit
    from bcbio.structural import shared as sshared
    items = [utils.to_single_data(x) for x in cwlutils.handle_combined_input(items)]
    if all(not cnvkit.use_general_sv_bins(x) for x in items):
        return [[d] for d in items]
    out_files = {}
    back_files = {}
    # NOTE(review): itertools.groupby only merges adjacent items, so this
    # presumably relies on items arriving ordered by group -- confirm.
    for group_id, gitems in itertools.groupby(items, lambda x: tz.get_in(["regions", "bins", "group"], x)):
        # No CNVkit calling for this particular set of samples
        if group_id is None:
            continue
        inputs, backgrounds = sshared.find_case_control(list(gitems))
        assert inputs, "Did not find inputs for sample batch: %s" % (" ".join(dd.get_sample_name(x) for x in items))
        work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(inputs[0]), "structural",
                                                    dd.get_sample_name(inputs[0]), "bins"))
        back_files, out_files = calcfns[cnvkit.bin_approach(inputs[0])](group_id, inputs, backgrounds, work_dir,
                                                                        back_files, out_files)
    out = []
    for data in items:
        sample_name = dd.get_sample_name(data)
        if sample_name in out_files:
            data["depth"]["bins"]["background"] = back_files[sample_name]
            data["depth"]["bins"]["normalized"] = out_files[sample_name]
        out.append([data])
    return out
Exemple #17
def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.
    Prepares list of callable genome regions allowing subsequent parallelization.

    Only runs when the sample has a ``.bam`` file that needs processing;
    otherwise the normalized sample is returned as is. Returns ``[[data]]``.
    """
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    bam_file = data.get("align_bam") or data.get("work_bam")
    if vmulti.bam_needs_processing(data) and bam_file and bam_file.endswith(".bam"):
        ref_file = dd.get_ref_file(data)
        # NOTE(review): the call was truncated after "align"; the per-sample
        # sub-directory is restored here -- confirm against upstream.
        out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                                  dd.get_sample_name(data)))
        bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
        if not utils.file_exists(bam_file_ready):
            utils.symlink_plus(bam_file, bam_file_ready)
        bam.index(bam_file_ready, data["config"])
        covinfo = callable.sample_callable_bed(bam_file_ready, ref_file, data)
        callable_region_bed, nblock_bed, callable_bed = \
            callable.block_regions(covinfo.raw_callable, bam_file_ready, ref_file, data)
        data["regions"] = {"nblock": nblock_bed,
                           "callable": callable_bed,
                           "sample_callable": covinfo.callable,
                           "mapped_stats": readstats.get_cache_file(data)}
        data["depth"] = covinfo.depth_files
        data = coverage.assign_interval(data)
        # Fall back to the callable regions when no variant regions were configured.
        if (os.path.exists(callable_region_bed) and
                not data["config"]["algorithm"].get("variant_regions")):
            data["config"]["algorithm"]["variant_regions"] = callable_region_bed
            data = clean_inputs(data)
        data = recalibrate.prep_recal(data)
        data = recalibrate.apply_recal(data)
    return [[data]]
def _normalize_cwl_inputs(items):
    """Extract variation and validation data from CWL input list of batched samples.

    Returns a single lead item carrying ``batch_samples``. When a validation
    input is present the lead item is the one with validation and its
    ``vrn_file`` is set to the single shared variant file.

    Raises AssertionError when more than one distinct validation input or
    variant file is found alongside validation.
    """
    with_validate = {}
    vrn_files = []
    ready_items = []
    batch_samples = []
    for data in (cwlutils.normalize_missing(utils.to_single_data(d)) for d in items):
        # NOTE(review): the accumulator updates in this loop were missing; they
        # are restored from how the lists are consumed below -- confirm upstream.
        batch_samples.append(dd.get_sample_name(data))
        if tz.get_in(["config", "algorithm", "validate"], data):
            with_validate[_checksum(tz.get_in(["config", "algorithm", "validate"], data))] = data
        if data.get("vrn_file"):
            vrn_files.append(data["vrn_file"])
            ready_items.append(data)
    if len(with_validate) == 0:
        data = _pick_lead_item(ready_items)
        data["batch_samples"] = batch_samples
        return data
    else:
        assert len(with_validate) == 1, len(with_validate)
        assert len(set(vrn_files)) == 1, set(vrn_files)
        data = _pick_lead_item(with_validate.values())
        data["batch_samples"] = batch_samples
        data["vrn_file"] = vrn_files[0]
        return data
def summarize_vc(items):
    """CWL target: summarize variant calls and validation for multiple samples.

    Copies each distinct variant file into ``<work_dir>/variants/<calls|gvcf>``,
    bgzips and indexes it, and returns ``[out]`` where ``out`` holds the
    validation of the first item and the lists of copied files.
    """
    items = [utils.to_single_data(x) for x in validate.summarize_grading(items)]
    out = {"validate": items[0]["validate"],
           "variants": {"calls": [], "gvcf": []}}
    added = set([])
    for data in items:
        if data.get("vrn_file"):
            names = dd.get_batches(data)
            if not names:
                names = [dd.get_sample_name(data)]
            batch_name = names[0]
            if data.get("vrn_file_joint") is not None:
                to_add = [("vrn_file", "gvcf", dd.get_sample_name(data)),
                          ("vrn_file_joint", "calls", batch_name)]
            else:
                to_add = [("vrn_file", "calls", batch_name)]
            for vrn_key, out_key, name in to_add:
                cur_name = "%s-%s" % (name, dd.get_variantcaller(data))
                if cur_name not in added:
                    out_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                                                            "variants", out_key)),
                                            "%s.vcf.gz" % cur_name)
                    # NOTE(review): bookkeeping of `added` and the output list was
                    # missing and is restored here -- confirm against upstream.
                    added.add(cur_name)
                    # Ideally could symlink here but doesn't appear to work with
                    # Docker container runs on Toil where PATHs don't get remapped
                    utils.copy_plus(os.path.realpath(data[vrn_key]), out_file)
                    vcfutils.bgzip_and_index(out_file, data["config"])
                    out["variants"][out_key].append(out_file)
    return [out]
def batch(samples):
    """CWL: batch together per sample, joint and germline calls for ensemble combination.

    Sets up groups of same sample/batch variant calls for ensemble calling, as
    long as we have more than one caller per group.

    Returns the combined items ordered by the first appearance of their batch
    samples in the input.
    """
    samples = [utils.to_single_data(x) for x in samples]
    sample_order = [dd.get_sample_name(x) for x in samples]
    batch_groups = collections.defaultdict(list)
    for data in samples:
        batch_samples = tuple(data.get("batch_samples", [dd.get_sample_name(data)]))
        batch_groups[(batch_samples, dd.get_phenotype(data))].append(data)

    out = []
    for (batch_samples, phenotype), gsamples in batch_groups.items():
        if len(gsamples) > 1:
            batches = set([])
            for d in gsamples:
                # Samples without batches must not break the set union.
                batches |= set(dd.get_batches(d) or [])
            cur = copy.deepcopy(gsamples[0])
            cur.update({"batch_id": sorted(list(batches))[0] if batches else "_".join(batch_samples),
                        "batch_samples": batch_samples,
                        "variants": {"variantcallers": [dd.get_variantcaller(d) for d in gsamples],
                                     "calls": [d.get("vrn_file") for d in gsamples]}})
            out.append(cur)

    def by_original_order(d):
        return min([sample_order.index(s) for s in d["batch_samples"] if s in sample_order])
    return sorted(out, key=by_original_order)
Exemple #21
def calculate_sv_coverage(data):
    """Calculate coverage within bins for downstream CNV calling.

    Creates corrected cnr files with log2 ratios and depths.

    Samples not using general SV bins are returned unchanged. Otherwise the
    target and antitarget coverage files are created if missing and recorded
    under ``data["depth"]["bins"]``. Returns ``[[data]]``.
    """
    from bcbio.variation import coverage
    from bcbio.structural import annotate, cnvkit
    data = utils.to_single_data(data)
    if not cnvkit.use_general_sv_bins(data):
        return [[data]]
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural",
                                               dd.get_sample_name(data), "bins"))
    out_target_file = os.path.join(work_dir, "%s-target-coverage.cnn" % dd.get_sample_name(data))
    out_anti_file = os.path.join(work_dir, "%s-antitarget-coverage.cnn" % dd.get_sample_name(data))
    if ((not utils.file_exists(out_target_file) or not utils.file_exists(out_anti_file))
          and (dd.get_align_bam(data) or dd.get_work_bam(data))):
        # mosdepth
        target_cov = coverage.run_mosdepth(data, "target", tz.get_in(["regions", "bins", "target"], data))
        anti_cov = coverage.run_mosdepth(data, "antitarget", tz.get_in(["regions", "bins", "antitarget"], data))
        target_cov_genes = annotate.add_genes(target_cov.regions, data, max_distance=0)
        anti_cov_genes = annotate.add_genes(anti_cov.regions, data, max_distance=0)
        out_target_file = _add_log2_depth(target_cov_genes, out_target_file, data)
        out_anti_file = _add_log2_depth(anti_cov_genes, out_anti_file, data)
        # TODO: Correct for GC bias
    if os.path.exists(out_target_file):
        data["depth"]["bins"] = {"target": out_target_file, "antitarget": out_anti_file}
    return [[data]]
Exemple #22
def merge_split_alignments(data):
    """Merge split BAM inputs generated by common workflow language runs.

    Merges the alignment BAMs and then the HLA fastq inputs of the sample.
    Returns ``[[data]]``.
    """
    data = utils.to_single_data(data)
    data = _merge_align_bams(data)
    data = _merge_hla_fastq_inputs(data)
    return [[data]]
Exemple #23
def _batch_split_by_sv(samples, stage):
    """Group samples by structural variant caller and batch for a calling stage.

    Returns ``(to_process, extras, background)`` where ``to_process`` is an
    ordered mapping of ``(svcaller, batch)`` to the samples to call together.
    """
    to_process = collections.OrderedDict()
    extras = []
    background = []
    for data in (utils.to_single_data(x) for x in samples):
        ready_data = _handle_multiple_svcallers(data, stage)
        if len(ready_data) > 0:
            # NOTE(review): the lines filling `background` and `extras` were
            # missing; this reconstruction is an assumption -- confirm upstream.
            background.append(data)
            for x in ready_data:
                svcaller = tz.get_in(["config", "algorithm", "svcaller"], x)
                batch = dd.get_batch(x) or dd.get_sample_name(x)
                if stage in ["ensemble"]:  # no batching for ensemble methods
                    if isinstance(batch, six.string_types) and batch != dd.get_sample_name(x):
                        batch += "_%s" % dd.get_sample_name(x)
                    else:
                        batch = dd.get_sample_name(x)
                    if dd.get_phenotype(x) == "germline":
                        batch += "_germline"
                elif svcaller in _GLOBAL_BATCHING:  # All samples batched together for analyses
                    batch = "all"
                batches = batch if isinstance(batch, (list, tuple)) else [batch]
                for b in batches:
                    try:
                        to_process[(svcaller, b)].append(x)
                    except KeyError:
                        to_process[(svcaller, b)] = [x]
        else:
            extras.append([data])
    return to_process, extras, background
Exemple #24
def batch_for_variantcall(samples):
    """Prepare a set of samples for parallel variant calling.

    CWL input target that groups samples into batches and variant callers
    for parallel processing.

    Every sample is given the same set of keys (missing ones set to None) and
    the ``tools_on``/``tools_off`` settings are coerced to lists before
    grouping. Returns the batches followed by the extra samples.
    """
    convert_to_list = set(["config__algorithm__tools_on", "config__algorithm__tools_off"])
    to_process, extras = _dup_samples_by_variantcaller(samples, require_bam=False)
    batch_groups = collections.defaultdict(list)
    to_process = [utils.to_single_data(x) for x in to_process]
    all_keys = set([])
    for data in to_process:
        # NOTE(review): this loop body was missing; collecting the per-sample
        # "cwl_keys" is an assumption based on the "__"-joined keys used below
        # -- confirm against upstream.
        all_keys.update(set(data["cwl_keys"]))
    for data in to_process:
        for raw_key in sorted(list(all_keys)):
            key = raw_key.split("__")
            if tz.get_in(key, data) is None:
                data = tz.update_in(data, key, lambda x: None)
            if raw_key in convert_to_list:
                val = tz.get_in(key, data)
                if not val:
                    val = []
                elif not isinstance(val, (list, tuple)):
                    val = [val]
                data = tz.update_in(data, key, lambda x: val)
        vc = get_variantcaller(data, require_bam=False)
        batches = dd.get_batches(data) or dd.get_sample_name(data)
        if not isinstance(batches, (list, tuple)):
            batches = [batches]
        for b in batches:
            batch_groups[(b, vc)].append(utils.deepish_copy(data))
    return list(batch_groups.values()) + extras
Exemple #25
def summarize_vc(items):
    """CWL target: summarize variant calls and validation for multiple samples.

    Copies each distinct variant file into ``<work_dir>/variants/<calls|gvcf>``,
    bgzips and indexes it, and returns ``[out]`` with the combined validation,
    the copied files, and the per-sample file lists in first-seen sample order.
    """
    items = [utils.to_single_data(x) for x in utils.flatten(items)]
    items = [_normalize_vc_input(x) for x in items]
    items = validate.summarize_grading(items)
    items = [utils.to_single_data(x) for x in items]
    out = {"validate": validate.combine_validations(items),
           "variants": {"calls": [], "gvcf": [], "samples": []}}
    added = set([])
    variants_by_sample = collections.defaultdict(list)
    sample_order = []
    # NOTE(review): several single-line loop and branch bodies were missing in
    # this function; they are restored from how the accumulators are used --
    # confirm against upstream.
    for data in items:
        batch_samples = data.get("batch_samples", [dd.get_sample_name(data)])
        for s in batch_samples:
            if s not in sample_order:
                sample_order.append(s)
        if data.get("vrn_file"):
            # Only get batches if we're actually doing variantcalling in bcbio
            # otherwise we'll be using the original files
            names = dd.get_batches(data) if dd.get_variantcaller(data) else None
            if not names:
                names = [dd.get_sample_name(data)]
            batch_name = names[0]
            if data.get("vrn_file_joint") is not None:
                to_add = [("vrn_file", "gvcf", dd.get_sample_name(data)),
                          ("vrn_file_joint", "calls", batch_name)]
            else:
                to_add = [("vrn_file", "calls", batch_name)]
            for vrn_key, out_key, name in to_add:
                cur_name = "%s-%s" % (name, dd.get_variantcaller(data))
                out_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                                                        "variants", out_key)),
                                        "%s.vcf.gz" % cur_name)
                for s in batch_samples:
                    variants_by_sample[s].append(out_file)
                if cur_name not in added:
                    added.add(cur_name)
                    # Ideally could symlink here but doesn't appear to work with
                    # Docker container runs on Toil where PATHs don't get remapped
                    utils.copy_plus(os.path.realpath(data[vrn_key]), out_file)
                    vcfutils.bgzip_and_index(out_file, data["config"])
                    out["variants"][out_key].append(out_file)
    for sample in sample_order:
        out["variants"]["samples"].append(variants_by_sample[sample])
    return [out]
Exemple #26
def to_cram(data):
    """Convert BAM archive files into indexed CRAM.

    Compresses the work BAM (or the aligned BAM when there is no work BAM) and
    stores the result under ``archive_bam`` for CWL runs, ``work_bam``
    otherwise. Returns ``[[data]]``.
    """
    data = utils.to_single_data(data)
    cram_file = cram.compress(dd.get_work_bam(data) or dd.get_align_bam(data), data)
    out_key = "archive_bam" if cwlutils.is_cwl_run(data) else "work_bam"
    data[out_key] = cram_file
    return [[data]]
Exemple #27
def _group_by_batches(items):
    """Group samples by batch, keeping batches in first-seen order.

    Each item is unwrapped with ``utils.to_single_data`` and added to every
    batch reported for it by ``_get_batches``.

    Returns an ``OrderedDict`` mapping batch to the list of its samples.
    """
    out = collections.OrderedDict()
    for data in (utils.to_single_data(xs) for xs in items):
        for b in _get_batches(data):
            # Append to the existing batch, creating it on first sight.
            out.setdefault(b, []).append(data)
    return out
Exemple #28
def get_in_samples(samples, fn):
    """Return the value of a global option for a list of samples.

    Samples are checked in order; for the first one where ``fn(sample, None)``
    is truthy the result of ``fn(sample)`` is returned. Returns ``None`` when
    no sample has the option set.
    """
    for sample in samples:
        sample = to_single_data(sample)
        if fn(sample, None):
            return fn(sample)
    return None
Exemple #29
def to_rec_single(samples, default_keys=None):
    """Convert output into a list of single CWL records.

    Each sample is unwrapped, normalized with ``normalize_missing`` and
    converted on its own through ``samples_to_records``, which must yield
    exactly one record per sample.

    Returns the list of records, one per input sample, in input order.
    """
    out = []
    for data in samples:
        recs = samples_to_records([normalize_missing(utils.to_single_data(data))], default_keys)
        assert len(recs) == 1
        out.append(recs[0])
    return out
def _save_fastq_space(items):
    """Potentially save fastq space prior to merging, since alignments done.

    Only input files whose real path lies under the sample's work directory are
    handed to ``utils.save_diskspace``; files stored elsewhere are left alone.
    """
    to_cleanup = {}
    for data in (utils.to_single_data(x) for x in items):
        for fname in data.get("files", []):
            if os.path.realpath(fname).startswith(dd.get_work_dir(data)):
                # Keyed by file name so a file shared by several samples is cleaned once.
                to_cleanup[fname] = data["config"]
    for fname, config in to_cleanup.items():
        utils.save_diskspace(fname, "Cleanup prep files after alignment finished", config)
Exemple #31
def counts_spikein(data):
    """Quantify spike-in sequences for a sample with salmon.

    Returns ``data`` unchanged when no spike-in FASTA is configured. Otherwise
    indexes the spike-in FASTA, quantifies the input reads against it and
    records the output via ``dd.set_spikein_counts``.

    Note that the sample dictionary itself is returned, not a nested list.
    """
    data = utils.to_single_data(data)
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "spikein", samplename)
    fasta_file = dd.get_spikein_fasta(data)
    if not fasta_file:
        return data
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        # Anything other than two inputs is handled as single-end.
        fq1, fq2 = files[0], None
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    # A k-mer of 15 is used for smallRNA-seq analyses, 31 for everything else.
    kmer = 31 if not dd.get_analysis(data).lower() == "smallrna-seq" else 15
    fasta_index = _index_spikein(fasta_file, salmon_dir, data, kmer)
    out_file = _salmon_quant_reads(fq1, fq2, salmon_dir, fasta_index, data)
    data = dd.set_spikein_counts(data, out_file)
    return data
def _batch_split_by_sv(samples, stage):
    """Split samples into groups keyed by (svcaller, batch) for SV calling.

    Returns a tuple of:
    - to_process = svcaller-batch => [svcaller-sample1, svcaller-sample2...] odict
    - extras = samples without sv calling (should there be any?)
    - background - all samples

    NOTE(review): several statements are missing from this copy of the
    function (see inline notes); it does not parse as-is.
    """
    to_process = collections.OrderedDict()
    extras = []
    background = []
    for data in (utils.to_single_data(x) for x in samples):
        # data = sample
        ready_data = _handle_multiple_svcallers(data, stage)
        if len(ready_data) > 0:
            # why appending every sample to background?
            # NOTE(review): no statement follows the comment above; presumably
            # an append of `data` to `background` was lost -- confirm.
            # x is sample - sv caller pair
            for x in ready_data:
                svcaller = tz.get_in(["config", "algorithm", "svcaller"], x)
                batch = dd.get_batch(x) or dd.get_sample_name(x)
                if stage in ["ensemble"]:  # no batching for ensemble methods
                    if isinstance(batch, six.string_types
                                  ) and batch != dd.get_sample_name(x):
                        batch += "_%s" % dd.get_sample_name(x)
                        # NOTE(review): an `else:` appears to be missing here;
                        # as written the suffixed name is overwritten at once.
                        batch = dd.get_sample_name(x)
                    if dd.get_phenotype(x) == "germline":
                        batch += "_germline"
                elif svcaller in _GLOBAL_BATCHING:  # All samples batched together for analyses
                    batch = "all"
                # just creating PON - no calling
                # NOTE(review): the body of this `if` and a following branch
                # look lost; the indentation below suggests the batching code
                # belonged to an `else:` -- confirm against the original.
                if stage in ["standard"] and batch in ["pon_build"]:
                    batches = batch if isinstance(batch,
                                                  (list, tuple)) else [batch]
                    for b in batches:
                        # NOTE(review): the `try:` line pairing with the
                        # `except KeyError` below is missing.
                            to_process[(svcaller, b)].append(x)
                        except KeyError:
                            to_process[(svcaller, b)] = [x]
    # NOTE(review): `extras` is never filled in the visible code.
    return to_process, extras, background
Exemple #33
def clean_sample_data(samples):
    """Clean unnecessary information from sample data, reducing size for message passing.

    Keeps only the ``work``, ``galaxy`` and ``fastq`` directories and the
    ``algorithm`` and ``resources`` configuration sections, and drops the
    top-level ``config_file`` and ``algorithm`` keys. Samples are modified in
    place.

    Returns a list with one entry per cleaned sample.
    """
    out = []
    for data in (utils.to_single_data(x) for x in samples):
        if "dirs" in data:
            data["dirs"] = {
                "work": data["dirs"]["work"],
                "galaxy": data["dirs"]["galaxy"],
                "fastq": data["dirs"].get("fastq"),
            }
        data["config"] = {
            "algorithm": data["config"]["algorithm"],
            "resources": data["config"]["resources"],
        }
        for remove_attr in ["config_file", "algorithm"]:
            data.pop(remove_attr, None)
        # NOTE(review): each sample is wrapped in its own list, matching the
        # nested [[data]] convention used by the other targets -- confirm.
        out.append([data])
    return out
Exemple #34
def run(items):
    """Normalization and log2 ratio calculation plus CNV calling for full cohort.

    - Combine coverage of each region for each sample
    - Prepare read counts for each sample
    - Normalize coverages in cohort by gene and sample, and calculate log2 ratios
    - Call amplifications and deletions

    Samples whose paired phenotype is "normal" are passed to the caller by
    name. Returns the unwrapped input items after the calls have been split
    back onto them by ``_split_cnv``.
    """
    items = [utils.to_single_data(x) for x in items]
    work_dir = _sv_workdir(items[0])

    coverage_file = _combine_coverages(items, work_dir)
    read_mapping_file = _calculate_mapping_reads(items, work_dir)

    normal_names = [dd.get_sample_name(x) for x in items if get_paired_phenotype(x) == "normal"]
    seq2c_calls_file = _call_cnv(items, work_dir, read_mapping_file, coverage_file, normal_names)
    _split_cnv(items, seq2c_calls_file)

    return items
Exemple #35
def organize_noalign(data):
    """CWL target to skip alignment and organize input data.

    Places the first input file in the sample's ``align`` work directory as
    ``<sample>-input.bam``: CRAM input is converted to BAM, BAM input is copied,
    and the result is indexed. Samples without input files get ``align_bam``
    set to ``None``.

    Returns the updated sample dictionary (not a nested list).
    """
    data = utils.to_single_data(data[0])
    work_dir = utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data)))
    work_bam = os.path.join(work_dir,
                            "%s-input.bam" % dd.get_sample_name(data))
    if data.get("files"):
        if data["files"][0].endswith(".cram"):
            work_bam = cram.to_bam(data["files"][0], work_bam, data)
        else:
            # Only BAM and CRAM inputs are supported when skipping alignment.
            assert data["files"][0].endswith(".bam"), data["files"][0]
            utils.copy_plus(data["files"][0], work_bam)
        bam.index(work_bam, data["config"])
    else:
        work_bam = None
    data["align_bam"] = work_bam
    return data
def create_inputs(data):
    """Index input reads and prepare groups of reads to process concurrently.

    Allows parallelization of alignment beyond processors available on a single
    machine. Prepares a rtg SDF format file with build in indexes for retrieving
    sections of files.

    Retains back compatibility with bgzip/grabix approach.

    NOTE(review): several lines are missing from this copy (see inline notes);
    the function does not parse as-is.
    """
    data = cwlutils.normalize_missing(data)
    aligner = tz.get_in(("config", "algorithm", "aligner"), data)
    # CRAM files must be converted to bgzipped fastq, unless not aligning.
    # Also need to prep and download remote files.
    # NOTE(review): the condition below is cut off after `or`; the rest of the
    # expression and its closing parentheses are not visible here.
    if not ("files" in data and data["files"] and aligner and (_is_cram_input(data["files"]) or
        # skip indexing on samples without input files or not doing alignment
        if ("files" not in data or not data["files"] or data["files"][0] is None or not aligner):
            return [[data]]
    approach = "grabix" if _has_grabix_indices(data) else dd.get_align_prep_method(data)
    data["files_orig"] = data["files"]
    if approach == "rtg":
        data["files"] = [rtg.to_sdf(data["files"], data)]
        # NOTE(review): an `else:` appears to be missing before the grabix
        # assignment; as written it always overwrites the rtg result.
        data["files"] = _prep_grabix_indexes(data["files"], data["dirs"], data)
    # preparation converts illumina into sanger format
    data["config"]["algorithm"]["quality_format"] = "standard"
    data = _set_align_split_size(data)
    out = []
    if tz.get_in(["config", "algorithm", "align_split_size"], data):
        if approach == "rtg":
            splits = rtg.calculate_splits(data["files"][0], data["config"]["algorithm"]["align_split_size"])
            # NOTE(review): an `else:` appears to be missing here as well.
            splits = _find_read_splits(data["files"][0], data["config"]["algorithm"]["align_split_size"])
        for split in splits:
            cur_data = copy.deepcopy(data)
            cur_data["align_split"] = split
            # NOTE(review): nothing adds `cur_data` (or, without a split size,
            # `data`) to `out` in the visible code -- confirm the lost appends.
    if "output_cwl_keys" in data:
        out = cwlutils.samples_to_records([utils.to_single_data(x) for x in out],
                                          ["files", "align_split", "config__algorithm__quality_format"])
    return out
Exemple #37
def pipeline_summary(data):
    """Provide summary information on processing sample.

    Handles standard and CWL (single QC output) cases.

    Returns the sample wrapped as ``[[data]]``.
    """
    data = utils.to_single_data(data)
    work_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    if data["analysis"].lower().startswith("smallrna-seq"):
        # smallRNA-seq runs QC on the cleaned fastq rather than a BAM.
        work_bam = data["clean_fastq"]
    elif not work_bam or not work_bam.endswith(".bam"):
        work_bam = None
    if dd.get_ref_file(data):
        if work_bam or (tz.get_in(["config", "algorithm", "kraken"], data)):  # kraken doesn't need bam
            # NOTE(review): the start of the next line (the call receiving this
            # "QC: ..." message, presumably a logging call) is missing.
  "QC: %s %s" % (dd.get_sample_name(data), ", ".join(dd.get_algorithm_qc(data))))
            work_data = cwlutils.unpack_tarballs(utils.deepish_copy(data), data)
            data["summary"] = _run_qc_tools(work_bam, work_data)
            # With a single QC tool in a CWL run, expose just that tool's output.
            if (len(dd.get_algorithm_qc(data)) == 1 and "output_cwl_keys" in data):
                data["summary"]["qc"] = data["summary"]["qc"].get(dd.get_algorithm_qc(data)[0])
    return [[data]]
def run_jointvc(items):
    """Run joint calling over a batch of samples within a single region.

    The first item acts as the representative sample. When it has no joint
    caller configured, one is derived as ``<variantcaller>-joint``. The
    region's start is shifted by one before calling ``square_batch_region``
    on the ``vrn_file`` of every item.

    Returns the representative sample dictionary with ``vrn_file_region`` set
    to the joint output (a plain dict, not a nested list).
    """
    items = [utils.to_single_data(x) for x in items]
    data = items[0]
    if not dd.get_jointcaller(data):
        data["config"]["algorithm"]["jointcaller"] = "%s-joint" % dd.get_variantcaller(data)
    # GenomicsDBImport uses 1-based coordinates. That's unexpected, convert over to these.
    # Split on the last colon only, so contig names that themselves contain
    # colons (for example HLA alleles) still unpack into two parts.
    chrom, coords = data["region"].rsplit(":", 1)
    start, end = coords.split("-")
    ready_region = "%s:%s-%s" % (chrom, int(start) + 1, end)
    str_region = ready_region.replace(":", "_")
    batches = dd.get_batches(data) or dd.get_sample_name(data)
    if not isinstance(batches, (list, tuple)):
        batches = [batches]
    out_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data), "joint",
                                                            dd.get_variantcaller(data), str_region)),
                            "%s-%s-%s.vcf.gz" % (batches[0], dd.get_variantcaller(data), str_region))
    joint_out = square_batch_region(data, ready_region, [], [d["vrn_file"] for d in items], out_file)[0]
    data["vrn_file_region"] = joint_out["vrn_file"]
    return data
Exemple #39
def postprocess_variants(items):
    """Provide post-processing of variant calls: filtering and effects annotation.

    Accepts either a single sample dictionary or a list of batched items; for
    lists carrying ``vrn_file_joint`` that key is processed instead of
    ``vrn_file``. Returns the representative sample wrapped as ``[[data]]``.

    NOTE(review): in this copy the prefix of every progress-message call has
    been stripped, leaving the message fused onto the end of the preceding
    statement (see inline notes); the function does not parse as-is.
    """
    vrn_key = "vrn_file"
    if not isinstance(items, dict):
        items = [utils.to_single_data(x) for x in items]
        if "vrn_file_joint" in items[0]:
            vrn_key = "vrn_file_joint"
    data, items = _get_batch_representative(items, vrn_key)
    items = cwlutils.unpack_tarballs(items, data)
    data = cwlutils.unpack_tarballs(data, data)
    # NOTE(review): two statements are fused on the next line; the second was
    # presumably a logging call taking the "Finalizing ..." message.
    cur_name = "%s, %s" % (dd.get_sample_name(data), get_variantcaller(data))"Finalizing variant calls: %s" % cur_name)
    orig_vrn_file = data.get(vrn_key)
    data = _symlink_to_workdir(data, [vrn_key])
    data = _symlink_to_workdir(data, ["config", "algorithm", "variant_regions"])
    # NOTE(review): same fusion on the `if` line and on the statements below.
    if data.get(vrn_key):"Calculating variation effects for %s" % cur_name)
        ann_vrn_file, vrn_stats = effects.add_to_vcf(data[vrn_key], data)
        if ann_vrn_file:
            data[vrn_key] = ann_vrn_file
        if vrn_stats:
            data["vrn_stats"] = vrn_stats
        orig_items = _get_orig_items(items)"Annotate VCF file: %s" % cur_name)
        data[vrn_key] = annotation.finalize_vcf(data[vrn_key], get_variantcaller(data), orig_items)"Filtering for %s" % cur_name)
        data[vrn_key] = variant_filtration(data[vrn_key], dd.get_ref_file(data),
                                              tz.get_in(("genome_resources", "variation"), data, {}),
                                              data, orig_items)"Prioritization for %s" % cur_name)
        prio_vrn_file = prioritize.handle_vcf_calls(data[vrn_key], data, orig_items)
        if prio_vrn_file != data[vrn_key]:
            data[vrn_key] = prio_vrn_file
            # NOTE(review): call prefix missing on the next line too.
  "Germline extraction for %s" % cur_name)
            data = germline.extract(data, orig_items)

        if dd.get_align_bam(data):
            data = damage.run_filter(data[vrn_key], dd.get_align_bam(data), dd.get_ref_file(data),
                                     data, orig_items)
    # Hand back the original path when processing ended on the very same file.
    if orig_vrn_file and os.path.samefile(data[vrn_key], orig_vrn_file):
        data[vrn_key] = orig_vrn_file
    return [[data]]
Exemple #40
def concat_batch_variantcalls(items, region_block=True, skip_jointcheck=False):
    """CWL entry point: combine variant calls from regions into single VCF.

    Returns a dictionary with the combined file under ``vrn_file``.

    NOTE(review): lines are missing from this copy (see inline notes); the
    function does not parse as-is.
    """
    items = [utils.to_single_data(x) for x in items]
    batch_name = _get_batch_name(items, skip_jointcheck)
    variantcaller = _get_batch_variantcaller(items)
    out_file = os.path.join(dd.get_work_dir(items[0]), variantcaller,
                            "%s.vcf.gz" % (batch_name))
    if region_block:
        regions = [_region_to_coords(rs[0]) for rs in items[0]["region_block"]]
        # NOTE(review): an `else:` appears to be missing; as written the
        # region_block result is always overwritten by the line below.
        regions = [_region_to_coords(r) for r in items[0]["region"]]
    vrn_file_regions = items[0]["vrn_file_region"]
    # NOTE(review): the call below is cut off after `regions,`; its remaining
    # arguments and closing parenthesis are not visible here.
    out_file = vcfutils.concat_variant_files(vrn_file_regions,
                                             out_file, regions,
    return {"vrn_file": out_file}
Exemple #41
def batch(samples):
    """CWL: batch together per sample, joint and germline calls for ensemble combination.

    Sets up groups of same sample/batch variant calls for ensemble calling, as
    long as we have more than one caller per group.

    NOTE(review): the statements that build each output entry are only partly
    present in this copy (see inline notes); the function does not parse as-is.
    """
    samples = [utils.to_single_data(x) for x in samples]
    sample_order = [dd.get_sample_name(x) for x in samples]
    batch_groups = collections.defaultdict(list)
    # Group by the tuple of batch samples plus phenotype.
    for data in samples:
        batch_samples = tuple(
            data.get("batch_samples", [dd.get_sample_name(data)]))
        batch_groups[(batch_samples, dd.get_phenotype(data))].append(data)

    out = []
    for (batch_samples, phenotype), gsamples in batch_groups.items():
        # Only groups with more than one entry are combined.
        if len(gsamples) > 1:
            batches = set([])
            for d in gsamples:
                batches |= set(dd.get_batches(d))
            cur = copy.deepcopy(gsamples[0])
            # NOTE(review): the lines below are fragments of a larger literal;
            # the opening of the structure, some keys and every closing bracket
            # are missing, as is whatever adds the result to `out`.
                if batches else "_".join(batch_samples),
                "variants": {
                    [dd.get_variantcaller(d) for d in gsamples],
                    "calls": [d.get("vrn_file") for d in gsamples]

    # Sort key: earliest position of any of the entry's batch samples in the input.
    def by_original_order(d):
        # NOTE(review): the closing `])` of this expression is missing.
        return min([
            sample_order.index(s) for s in d["batch_samples"]
            if s in sample_order

    return sorted(out, key=by_original_order)
Exemple #42
def run_salmon_reads(data):
    # Quantify a sample's input reads with salmon and record the output file
    # and directory on the sample (stored through the sailfish setters).
    # Returns the sample wrapped as [[data]].
    # NOTE(review): lines are missing from this copy (see notes below).
    data = utils.to_single_data(data)
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "salmon", samplename)
    gtf_file = dd.get_gtf_file(data)
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
        # NOTE(review): an `else:` appears to be missing; as written the
        # single-end assignment below always overwrites the paired one.
        fq1, fq2 = files[0], None
    assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
    fasta_file = dd.get_ref_file(data)
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    # NOTE(review): the call below is cut off after `fasta_file,`; its
    # remaining arguments and closing parenthesis are not visible here.
    out_file = salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, fasta_file,
    data = dd.set_sailfish(data, out_file)
    data = dd.set_sailfish_dir(data, salmon_dir)
    return [[data]]
def square_off(samples, run_parallel):
    """Perform joint calling at all variants within a batch.

    Samples that belong to a batch and have a compatible joint caller are
    processed per callable region through ``grouped_parallel_split_combine``;
    all other samples are passed through untouched.

    Returns the joint-called samples followed by the untouched extras.
    """
    to_process = []
    extras = []
    for data in [utils.to_single_data(x) for x in samples]:
        added = False
        if tz.get_in(("metadata", "batch"), data):
            for add in genotype.handle_multiple_callers(data, "jointcaller", require_bam=False):
                if _is_jointcaller_compatible(add):
                    added = True
                    # NOTE(review): wrapped as [add] / [data] to match the
                    # nested-list sample convention -- confirm against callers.
                    to_process.append([add])
        if not added:
            extras.append([data])
    processed = grouped_parallel_split_combine(to_process, _split_by_callable_region,
                                               multi.group_batches_joint, run_parallel,
                                               "square_batch_region", "concat_variant_files",
                                               "vrn_file", ["region", "sam_ref", "config"])
    return _combine_to_jointcaller(processed) + extras
Exemple #44
def _get_batch_representative(items, key):
    """Retrieve a representative data item from a batch.

    Handles standard bcbio cases (a single data item) and CWL cases with
    batches that have a consistent variant file.

    Returns a ``(representative, items)`` tuple. For a single dictionary both
    elements are that dictionary. For a list, the representative is the first
    item containing ``key`` and ``items`` is the unwrapped list.

    Raises ``ValueError`` when the items carrying ``key`` do not share exactly
    one value for it.
    """
    if isinstance(items, dict):
        return items, items
    vals = set()
    out = []
    items = [utils.to_single_data(x) for x in items]
    for data in items:
        if key in data:
            vals.add(data[key])
            out.append(data)
    if len(vals) != 1:
        raise ValueError("Incorrect values for %s: %s" % (key, list(vals)))
    return out[0], items
Exemple #45
def calculate_sv_coverage(data):
    """Calculate coverage within bins for downstream CNV calling.

    Creates corrected cnr files with log2 ratios and depths.

    Returns the sample wrapped as ``[[data]]``; samples that do not use the
    general SV bins are returned unchanged.

    NOTE(review): several calls are cut off in this copy (see inline notes);
    the function does not parse as-is.
    """
    from bcbio.variation import coverage
    from bcbio.structural import annotate, cnvkit
    data = utils.to_single_data(data)
    if not cnvkit.use_general_sv_bins(data):
        return [[data]]
    work_dir = utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), "structural",
                     dd.get_sample_name(data), "bins"))
    out_target_file = os.path.join(
        work_dir, "%s-target-coverage.cnn" % dd.get_sample_name(data))
    out_anti_file = os.path.join(
        work_dir, "%s-antitarget-coverage.cnn" % dd.get_sample_name(data))
    # Only recompute when an output is missing and a BAM file is available.
    if ((not utils.file_exists(out_target_file)
         or not utils.file_exists(out_anti_file))
            and (dd.get_align_bam(data) or dd.get_work_bam(data))):
        # mosdepth
        target_cov = coverage.run_mosdepth(
            data, "target", tz.get_in(["regions", "bins", "target"], data))
        anti_cov = coverage.run_mosdepth(
            data, "antitarget",
            tz.get_in(["regions", "bins", "antitarget"], data))
        # NOTE(review): the next three calls are cut off after their first
        # line; remaining arguments and closing parentheses are not visible.
        target_cov_genes = annotate.add_genes(target_cov.regions,
        anti_cov_genes = annotate.add_genes(anti_cov.regions,
        out_target_file = _add_log2_depth(target_cov_genes, out_target_file,
        out_anti_file = _add_log2_depth(anti_cov_genes, out_anti_file, data)
        # TODO: Correct for GC bias
    if os.path.exists(out_target_file):
        # NOTE(review): the closing brace of this dictionary is missing.
        data["depth"]["bins"] = {
            "target": out_target_file,
            "antitarget": out_anti_file
    return [[data]]
Exemple #46
def run_rnaseq_ann_filter(data):
    """Run RNA-seq annotation and filtering.

    When the sample has a variant file, adds effects and vcfanno annotations,
    keeping each result only if a file was produced. Calls from a
    ``gatk-haplotype`` caller are passed through the RNA-seq filter, and
    finally variants close to splice junctions are removed.

    Returns the sample wrapped as ``[[data]]``.
    """
    data = to_single_data(data)
    if dd.get_vrn_file(data):
        eff_file = effects.add_to_vcf(dd.get_vrn_file(data), data)[0]
        if eff_file:
            data = dd.set_vrn_file(data, eff_file)
        ann_file = population.run_vcfanno(dd.get_vrn_file(data), data)
        if ann_file:
            data = dd.set_vrn_file(data, ann_file)
    variantcaller = dd.get_variantcaller(data)
    if variantcaller and ("gatk-haplotype" in variantcaller):
        filter_file = variation.gatk_filter_rnaseq(dd.get_vrn_file(data), data)
        data = dd.set_vrn_file(data, filter_file)
    # remove variants close to splice junctions
    vrn_file = dd.get_vrn_file(data)
    vrn_file = variation.filter_junction_variants(vrn_file, data)
    data = dd.set_vrn_file(data, vrn_file)
    return [[data]]
Exemple #47
def determine_indexes_to_make(samples):
    """Return the subset of samples that need distinct indexes.

    Samples are keyed by the combined FASTA path derived from their work
    directory, genome build and any disambiguation genomes; only the first
    sample seen for each path is kept, so each index is made once.
    """
    samples = [to_single_data(x) for x in samples]
    indexes = set()
    tomake = []
    for data in samples:
        out_dir = os.path.join(dd.get_work_dir(data), "inputs", "transcriptome")
        out_stem = os.path.join(out_dir, dd.get_genome_build(data))
        # The disambiguation suffix is added once; the path only serves as a
        # de-duplication key inside this function.
        if dd.get_disambiguate(data):
            out_stem = "-".join([out_stem] + (dd.get_disambiguate(data) or []))
        combined_file = out_stem + ".fa"
        if combined_file not in indexes:
            tomake.append(data)
            indexes.add(combined_file)
    return tomake
Exemple #48
def _normalize_cwl_inputs(items):
    """Extract variation and validation data from CWL input list of batched samples.

    Returns the first sample when no item has validation configured. Otherwise
    exactly one distinct validation target and one distinct variant file are
    expected, and the sample carrying the validation is returned with
    ``vrn_file`` set to that file.
    """
    with_validate = {}
    vrn_files = []
    ready_items = []
    for data in (cwlutils.normalize_missing(utils.to_single_data(d)) for d in items):
        validate_target = tz.get_in(["config", "algorithm", "validate"], data)
        if validate_target:
            # Keyed by checksum so identical validation targets collapse.
            with_validate[_checksum(validate_target)] = data
        if data.get("vrn_file"):
            vrn_files.append(data["vrn_file"])
        ready_items.append(data)
    if len(with_validate) == 0:
        return ready_items[0]
    assert len(with_validate) == 1, len(with_validate)
    assert len(set(vrn_files)) == 1
    # dict.values() is a view on Python 3 and cannot be indexed.
    data = next(iter(with_validate.values()))
    data["vrn_file"] = vrn_files[0]
    return data
Exemple #49
def run_salmon_reads(data):
    # Quantify a sample's input reads with salmon, converting BAM input to
    # fastq first, and record the output file and directory on the sample.
    # Returns the sample wrapped as [[data]].
    # NOTE(review): lines are missing from this copy (see notes below).
    data = utils.to_single_data(data)
    files = dd.get_input_sequence_files(data)
    if bam.is_bam(files[0]):
        files = fastq.convert_bam_to_fastq(files[0], data["dirs"]["work"],
                                           data, data["dirs"], data["config"])
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "salmon", samplename)
    gtf_file = dd.get_gtf_file(data)
    if len(files) == 2:
        fq1, fq2 = files
        # NOTE(review): an `else:` appears to be missing; as written the
        # single-end assignment below always overwrites the paired one.
        fq1, fq2 = files[0], None
    fasta_file = dd.get_ref_file(data)
    # NOTE(review): the call below is cut off after `fasta_file,`; its
    # remaining arguments and closing parenthesis are not visible here.
    out_file = salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, fasta_file,
    data = dd.set_salmon(data, out_file)
    data = dd.set_salmon_dir(data, salmon_dir)
    return [[data]]
Exemple #50
def variantcall_batch_region(items):
    """CWL entry point: variant call a batch of samples in a region.

    All items that carry a ``region`` must agree on it. The batch's variant
    caller is run on the alignment BAMs of every item for that region.

    Returns a dictionary with the call file under ``vrn_file_region`` and the
    region as a ``chrom:start-end`` string under ``region``.
    """
    items = [utils.to_single_data(x) for x in items]
    align_bams = [dd.get_align_bam(x) for x in items]
    variantcaller = _get_batch_variantcaller(items)
    region = list(set([x.get("region") for x in items if "region" in x]))
    assert len(region) == 1, region
    region = region[0]
    caller_fn = get_variantcallers()[variantcaller]
    assoc_files = tz.get_in(("genome_resources", "variation"), items[0], {})
    region = _region_to_coords(region)
    chrom, start, end = region
    region_str = "_".join(str(x) for x in region)
    batch_name = _get_batch_name(items)
    out_file = os.path.join(dd.get_work_dir(items[0]), variantcaller, chrom,
                            "%s-%s.vcf.gz" % (batch_name, region_str))
    call_file = caller_fn(align_bams, items, dd.get_ref_file(items[0]), assoc_files, region, out_file)
    return {"vrn_file_region": call_file, "region": "%s:%s-%s" % (chrom, start, end)}
Exemple #51
def summarize_vc(items):
    """CWL target: summarize variant calls and validation for multiple samples.

    Copies each distinct call file into ``<work_dir>/variants/<calls|gvcf>``
    named ``<name>-<variantcaller>.vcf.gz`` and bgzips and indexes it. Items
    with a joint call contribute their per-sample file as ``gvcf`` and the
    joint file as ``calls``.

    Returns a single-element list holding a dictionary with the combined
    validations under ``validate`` and, under ``variants``, the ``calls`` and
    ``gvcf`` file lists plus ``samples``: one list of files per sample, in
    first-seen sample order.
    """
    items = [utils.to_single_data(x) for x in validate.summarize_grading(items)]
    out = {"validate": _combine_validations(items),
           "variants": {"calls": [], "gvcf": [], "samples": []}}
    added = set()
    variants_by_sample = collections.defaultdict(list)
    sample_order = []
    for data in items:
        batch_samples = data.get("batch_samples", [dd.get_sample_name(data)])
        for s in batch_samples:
            if s not in sample_order:
                sample_order.append(s)
        if data.get("vrn_file"):
            names = dd.get_batches(data)
            if not names:
                names = [dd.get_sample_name(data)]
            batch_name = names[0]
            if data.get("vrn_file_joint") is not None:
                to_add = [("vrn_file", "gvcf", dd.get_sample_name(data)),
                          ("vrn_file_joint", "calls", batch_name)]
            else:
                to_add = [("vrn_file", "calls", batch_name)]
            for vrn_key, out_key, name in to_add:
                cur_name = "%s-%s" % (name, dd.get_variantcaller(data))
                out_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                                                        "variants", out_key)),
                                        "%s.vcf.gz" % cur_name)
                for s in batch_samples:
                    variants_by_sample[s].append(out_file)
                # Copy each named output once, even when several items share it.
                if cur_name not in added:
                    added.add(cur_name)
                    # Ideally could symlink here but doesn't appear to work with
                    # Docker container runs on Toil where PATHs don't get remapped
                    utils.copy_plus(os.path.realpath(data[vrn_key]), out_file)
                    vcfutils.bgzip_and_index(out_file, data["config"])
                    out["variants"][out_key].append(out_file)
    for sample in sample_order:
        out["variants"]["samples"].append(variants_by_sample[sample])
    return [out]
Exemple #52
def generate_parallel(samples, run_parallel):
    """Provide parallel preparation of summary information for alignment and variant calling.

    Runs per-sample QC and the qsignature summary through ``run_parallel``,
    writes the project summary and merged metadata, and attaches both file
    paths (plus the mixup-check directory when available) to each sample's
    ``summary`` before running MultiQC.

    Returns the MultiQC results, each unwrapped and re-wrapped as ``[data]``.
    """
    to_analyze, extras = _split_samples_by_qc(samples)
    qced = run_parallel("pipeline_summary", to_analyze)
    samples = _combine_qc_samples(qced) + extras
    qsign_info = run_parallel("qsignature_summary", [samples])
    metadata_file = _merge_metadata([samples])
    summary_file = write_project_summary(samples, qsign_info)
    out = []
    for data in samples:
        if "summary" not in data[0]:
            data[0]["summary"] = {}
        data[0]["summary"]["project"] = summary_file
        data[0]["summary"]["metadata"] = metadata_file
        if qsign_info:
            data[0]["summary"]["mixup_check"] = qsign_info[0]["out_dir"]
        out.append(data)
    out = _add_researcher_summary(out, summary_file)
    # MultiQC must be run after all file outputs are set:
    return [[utils.to_single_data(d)] for d in run_parallel("multiqc_summary", [out])]
def _dup_samples_by_variantcaller(samples, require_bam=True):
    """Prepare samples by variant callers, duplicating any with multiple callers.

    Returns a ``(to_process, extras)`` tuple.

    NOTE(review): lines are missing from this copy (see inline notes); the
    function does not parse as-is.
    """
    samples = [utils.to_single_data(x) for x in samples]
    samples = germline.split_somatic(samples)
    to_process = []
    extras = []
    for data in samples:
        added = False
        # NOTE(review): the iterable passed to `enumerate(` is cut off; it
        # presumably yields one copy of the sample per variant caller.
        for i, add in enumerate(
            added = True
            add = dd.set_variantcaller_order(add, i)
            # NOTE(review): nothing adds `add` to `to_process` in the visible code.
        if not added:
            data = _handle_precalled(data)
            data = dd.set_variantcaller_order(data, 0)
            # NOTE(review): nothing adds `data` to either list in the visible code.
    return to_process, extras
Exemple #54
def run_salmon_decoy(data):
    """Quantify a sample's reads with salmon against a decoy index.

    BAM input is first converted to fastq. The index is built by
    ``salmon_decoy_index`` in the parent of the sample's salmon directory. The
    quantification output, salmon directory and fragment length file are
    recorded on the sample, and the fragment length file is registered as the
    base of the ``salmon`` QC summary.

    Returns the sample wrapped as ``[[data]]``.
    """
    data = utils.to_single_data(data)
    files = dd.get_input_sequence_files(data)
    if bam.is_bam(files[0]):
        files = fastq.convert_bam_to_fastq(files[0], data["dirs"]["work"],
                                           data, data["dirs"], data["config"])
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "salmon", samplename)
    gtf_file = dd.get_gtf_file(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        # Anything other than two inputs is handled as single-end.
        fq1, fq2 = files[0], None
    index = salmon_decoy_index(gtf_file, data, os.path.dirname(salmon_dir))
    out_file = salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, data, index)
    data = dd.set_salmon(data, out_file)
    data = dd.set_salmon_dir(data, salmon_dir)
    data = dd.set_salmon_fraglen_file(data, _get_fraglen_file(salmon_dir))
    data = dd.update_summary_qc(data, "salmon", base=dd.get_salmon_fraglen_file(data))
    return [[data]]
Exemple #55
def _combine_qc_samples(samples):
    """Combine split QC analyses into single samples based on BAM files.

    NOTE(review): the bodies of both grouping loops and the statement that
    fills ``out`` are missing from this copy; the function does not parse
    as-is.
    """
    by_bam = collections.defaultdict(list)
    # NOTE(review): loop body missing; presumably it files each sample in
    # `by_bam` under a key derived from its BAM file(s) -- confirm.
    for data in [utils.to_single_data(x) for x in samples]:
    out = []
    for data_group in by_bam.values():
        # The first sample of each group receives the merged results.
        data = data_group[0]
        alg_qc = []
        qc = {}
        metrics = {}
        # NOTE(review): loop body missing; presumably it merges each split
        # sample's QC settings, outputs and metrics into the three
        # accumulators above -- confirm.
        for d in data_group:
        data["config"]["algorithm"]["qc"] = alg_qc
        data["summary"]["qc"] = qc
        data["summary"]["metrics"] = metrics
        # NOTE(review): nothing adds `data` to `out` in the visible code.
    return out
Exemple #56
def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.
    Prepares list of callable genome regions allowing subsequent parallelization.

    Returns the sample wrapped as ``[[data]]``.

    NOTE(review): several calls and dictionary literals are cut off in this
    copy (see inline notes); the function does not parse as-is.
    """
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    bam_file = data.get("align_bam") or data.get("work_bam")
    ref_file = dd.get_ref_file(data)
    # NOTE(review): the `endswith(` argument and the end of this condition are
    # missing.
    if vmulti.bam_needs_processing(data) and bam_file and bam_file.endswith(
        # NOTE(review): the rest of this path join is missing.
        out_dir = utils.safe_makedir(
            os.path.join(dd.get_work_dir(data), "align",
        bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
        # Link the BAM into the align directory so the index sits next to it.
        if not utils.file_exists(bam_file_ready):
            utils.symlink_plus(bam_file, bam_file_ready)
        bam.index(bam_file_ready, data["config"])
        covinfo = callable.sample_callable_bed(bam_file_ready, ref_file, data)
        callable_region_bed, nblock_bed = \
            callable.block_regions(covinfo.raw_callable, bam_file_ready, ref_file, data)
        # NOTE(review): the closing brace of this dictionary is missing.
        data["regions"] = {
            "nblock": nblock_bed,
            "callable": covinfo.raw_callable,
            "sample_callable": covinfo.callable,
            "mapped_stats": readstats.get_cache_file(data)
        data["depth"] = covinfo.depth_files
        data = coverage.assign_interval(data)
        data = samtools.run_and_save(data)
        data = recalibrate.prep_recal(data)
        data = recalibrate.apply_recal(data)
    elif dd.get_variant_regions(data):
        # Without a BAM to process, the configured variant regions stand in
        # for both callable region sets.
        callable_region_bed, nblock_bed = \
            callable.block_regions(dd.get_variant_regions(data), bam_file, ref_file, data)
        # NOTE(review): the closing brace of this dictionary is missing.
        data["regions"] = {
            "nblock": nblock_bed,
            "callable": dd.get_variant_regions(data),
            "sample_callable": dd.get_variant_regions(data)
    return [[data]]
Exemple #57
def pipeline_summary(data):
    """Provide summary information on processing sample.

    Handles standard and CWL (single QC output) cases.

    Returns the sample wrapped as ``[[data]]``.

    NOTE(review): lines are missing from this copy (see inline notes); the
    function does not parse as-is.
    """
    data = utils.to_single_data(data)
    work_bam = data.get("align_bam")
    # smallRNA-seq and ChIP-seq run QC on a different input than align_bam.
    if data["analysis"].lower().startswith("smallrna-seq"):
        work_bam = data["clean_fastq"]
    elif data["analysis"].lower().startswith("chip-seq"):
        work_bam = data["raw_bam"]
    # NOTE(review): `work_bam` may be None here (plain `.get`), in which case
    # `.endswith` raises AttributeError -- confirm callers always set align_bam.
    elif not work_bam.endswith(".bam"):
        work_bam = None
    if dd.get_ref_file(data) is not None and work_bam:
        # NOTE(review): the call receiving the "QC: ..." message below
        # (presumably a logging call) is missing its first line.
            "QC: %s %s" %
            (dd.get_sample_name(data), ", ".join(dd.get_algorithm_qc(data))))
        data["summary"] = _run_qc_tools(work_bam, data)
        if (len(dd.get_algorithm_qc(data)) == 1 and "output_cwl_keys" in data):
            # NOTE(review): the `.get(` argument and closing parenthesis are
            # missing on the next statement.
            data["summary"]["qc"] = data["summary"]["qc"].get(
    return [[data]]
Exemple #58
def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.
    Prepares list of callable genome regions allowing subsequent parallelization.

    Returns the sample wrapped as ``[[data]]``.

    NOTE(review): several calls and literals are cut off in this copy (see
    inline notes); the function does not parse as-is.
    """
    data = utils.to_single_data(data)
    bam_file = data.get("align_bam") or data.get("work_bam")
    # NOTE(review): the `endswith(` argument and the end of this condition are
    # missing.
    if vmulti.bam_needs_processing(data) and bam_file and bam_file.endswith(
        ref_file = dd.get_ref_file(data)
        # NOTE(review): the rest of this path join is missing.
        out_dir = utils.safe_makedir(
            os.path.join(dd.get_work_dir(data), "align",
        bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
        # Link the BAM into the align directory so the index sits next to it.
        if not utils.file_exists(bam_file_ready):
            utils.symlink_plus(bam_file, bam_file_ready)
        bam.index(bam_file_ready, data["config"])
        covinfo = callable.sample_callable_bed(bam_file_ready, ref_file, data)
        callable_region_bed, nblock_bed, callable_bed = \
            callable.block_regions(covinfo.callable, bam_file_ready, ref_file, data)
        vrs_file = dd.get_variant_regions_merged(data)
        offtarget_stats = callable.calculate_offtarget_stats(
            bam_file_ready, data, vrs_file, "variant_regions")
        # NOTE(review): the closing brace of this dictionary is missing.
        data["regions"] = {
            "nblock": nblock_bed,
            "callable": callable_bed,
            "highdepth": covinfo.highdepth,
            "sample_callable": covinfo.callable,
            "coverage_bed": covinfo.coverage,
            "avg_coverage": covinfo.avg_coverage,
            "offtarget_stats": offtarget_stats
        data = coverage.assign_interval(data)
        # Fall back to the computed callable regions when no variant_regions
        # are configured.
        if (os.path.exists(callable_region_bed)
                and not data["config"]["algorithm"].get("variant_regions")):
            # NOTE(review): the start of this assignment (the subscript target
            # ending in ["variant_regions"]) is missing.
                "variant_regions"] = callable_region_bed
            data = bedutils.clean_inputs(data)
        data = _recal_no_markduplicates(data)
    return [[data]]
Exemple #59
def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.

    Prepares list of callable genome regions allowing subsequent parallelization.

    When the sample has a BAM file that needs processing, the BAM is linked
    into an ``align`` work directory and indexed, callable regions are
    calculated and stored under ``data["regions"]``, and recalibration is
    prepared and applied. If no ``variant_regions`` are configured and the
    callable region BED exists, that BED is used as ``variant_regions``.

    Args:
        data: Sample information; unwrapped with ``utils.to_single_data`` and
            passed through ``cwlutils.normalize_missing`` and
            ``cwlutils.unpack_tarballs``.

    Returns:
        ``[[data]]``: the (possibly updated) sample wrapped in two lists.
    """
    # NOTE(review): ``params`` is not referenced anywhere in the visible body;
    # kept because lines of this function were lost -- confirm whether it
    # should be passed to a coverage/callable helper or removed.
    params = {
        "min_coverage_for_downsampling": 10,
        "max_downsample_multiplier": 200,
    }
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    bam_file = data.get("align_bam") or data.get("work_bam")
    # NOTE(review): the ".bam" suffix was lost in a truncated line and is
    # reconstructed here -- confirm against the upstream implementation.
    if vmulti.bam_needs_processing(data) and bam_file and bam_file.endswith(".bam"):
        ref_file = dd.get_ref_file(data)
        # NOTE(review): the final path component was truncated; a per-sample
        # directory is assumed -- confirm.
        out_dir = utils.safe_makedir(
            os.path.join(dd.get_work_dir(data), "align",
                         dd.get_sample_name(data)))
        # Work on a link inside the work directory rather than the input BAM.
        bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
        if not utils.file_exists(bam_file_ready):
            utils.symlink_plus(bam_file, bam_file_ready)
        bam.index(bam_file_ready, data["config"])
        covinfo = callable.sample_callable_bed(bam_file_ready, ref_file, data)
        callable_region_bed, nblock_bed, callable_bed = \
            callable.block_regions(covinfo.raw_callable, bam_file_ready, ref_file, data)
        data["regions"] = {
            "nblock": nblock_bed,
            "callable": callable_bed,
            "sample_callable": covinfo.callable,
            "coverage_depth_bed": covinfo.depth,
        }
        data = coverage.assign_interval(data)
        # Fall back to the callable regions when the user did not configure
        # explicit variant regions.
        if (os.path.exists(callable_region_bed)
                and not data["config"]["algorithm"].get("variant_regions")):
            data["config"]["algorithm"]["variant_regions"] = callable_region_bed
            data = clean_inputs(data)
        data = recalibrate.prep_recal(data)
        data = recalibrate.apply_recal(data)
    return [[data]]
Exemple #60
def normalize_sv_coverage(*items):
    """Normalize CNV coverage, providing flexible point for multiple methods.

    Samples are grouped by the bin group stored at
    ``data["regions"]["bins"]["group"]``. Each group is split into inputs and
    backgrounds and handed to the normalization function selected by
    ``cnvkit.bin_approach``. The resulting background and normalized files
    are attached to each sample under ``data["depth"]["bins"]``.

    Args:
        *items: Samples, possibly in combined form; expanded with
            ``cwlutils.handle_combined_input`` and ``utils.to_single_data``.

    Returns:
        A list with one single-element list per sample. When no sample uses
        general SV bins the samples are returned unchanged.

    Raises:
        AssertionError: If a bin group contains no input samples.
    """
    calcfns = {
        "cnvkit": _normalize_sv_coverage_cnvkit,
        "gatk-cnv": _normalize_sv_coverage_gatk,
    }
    from bcbio.structural import cnvkit
    from bcbio.structural import shared as sshared
    items = [
        utils.to_single_data(x) for x in cwlutils.handle_combined_input(items)
    ]
    if all(not cnvkit.use_general_sv_bins(x) for x in items):
        return [[d] for d in items]
    out_files = {}
    back_files = {}
    # NOTE(review): itertools.groupby only merges adjacent items, so this
    # relies on samples of the same bin group arriving consecutively.
    for group_id, gitems in itertools.groupby(
            items, lambda x: tz.get_in(["regions", "bins", "group"], x)):
        # No CNVkit calling for this particular set of samples
        if group_id is None:
            continue
        inputs, backgrounds = sshared.find_case_control(list(gitems))
        # Explicit raise (not a bare assert) so the check survives ``python -O``.
        if not inputs:
            raise AssertionError(
                "Did not find inputs for sample batch: %s" % (" ".join(
                    dd.get_sample_name(x) for x in items)))
        work_dir = utils.safe_makedir(
            os.path.join(dd.get_work_dir(inputs[0]), "structural",
                         dd.get_sample_name(inputs[0]), "bins"))
        # The accumulated mappings are threaded through each call and replaced
        # by the returned versions.
        back_files, out_files = calcfns[cnvkit.bin_approach(inputs[0])](
            group_id, inputs, backgrounds, work_dir, back_files, out_files)
    out = []
    for data in items:
        sample_name = dd.get_sample_name(data)
        if sample_name in out_files:
            data["depth"]["bins"]["background"] = back_files[sample_name]
            data["depth"]["bins"]["normalized"] = out_files[sample_name]
        # NOTE(review): the append was lost in extraction; every sample is
        # assumed to be returned wrapped like the early-return path -- confirm.
        out.append([data])
    return out