Example #1
def _group_batches_shared(xs, caller_batch_fn, prep_data_fn):
    """Shared functionality for grouping by batches for variant calling and joint calling.
    """
    singles = []
    batch_groups = collections.defaultdict(list)
    for args in xs:
        assert len(args) == 1
        data = args[0]
        caller, batch = caller_batch_fn(data)
        region = _list_to_tuple(data["region"]) if "region" in data else ()
        if batch is not None:
            batches = batch if isinstance(batch, (list, tuple)) else [batch]
            for b in batches:
                batch_groups[(b, region, caller)].append(utils.deepish_copy(data))
        else:
            data = prep_data_fn(data, [data])
            singles.append(data)
    batches = []
    for batch, items in batch_groups.items():
        batch_data = utils.deepish_copy(_pick_lead_item(items))
        batch_data = prep_data_fn(batch_data, items)
        batch_data["group_orig"] = _collapse_subitems(batch_data, items)
        batch_data["group"] = batch
        batches.append(batch_data)
    return singles + batches
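All of the snippets collected on this page pass sample `data` dictionaries through `utils.deepish_copy` before mutating them, so items placed into several batch groups do not share state. The real helper lives in `bcbio.utils`; the following is only a minimal sketch of what such a fast, dictionary-oriented copy could look like, not the project's exact implementation:

def deepish_copy(org):
    """Sketch of a quick 'deep enough' copy for dicts of simple values."""
    out = {}
    for k, v in org.items():
        if isinstance(v, dict):
            out[k] = deepish_copy(v)      # recurse into nested dicts
        elif isinstance(v, (list, tuple)):
            out[k] = list(v)              # shallow-copy sequences
        else:
            out[k] = v                    # share immutable leaves as-is
    return out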
Example #2
def _group_batches_shared(xs, caller_batch_fn, prep_data_fn):
    """Shared functionality for grouping by batches for variant calling and joint calling.
    """
    singles = []
    batch_groups = collections.defaultdict(list)
    for args in xs:
        data = utils.to_single_data(args)
        caller, batch = caller_batch_fn(data)
        region = _list_to_tuple(data["region"]) if "region" in data else ()
        if batch is not None:
            batches = batch if isinstance(batch, (list, tuple)) else [batch]
            for b in batches:
                batch_groups[(b, region, caller)].append(utils.deepish_copy(data))
        else:
            data = prep_data_fn(data, [data])
            singles.append(data)
    batches = []
    for batch, items in batch_groups.items():
        batch_data = utils.deepish_copy(_pick_lead_item(items))
        # For nested primary batches, split permanently by batch
        if tz.get_in(["metadata", "batch"], batch_data):
            batch_name = batch[0]
            batch_data["metadata"]["batch"] = batch_name
        batch_data = prep_data_fn(batch_data, items)
        batch_data["group_orig"] = _collapse_subitems(batch_data, items)
        batch_data["group"] = batch
        batches.append(batch_data)
    return singles + batches
Example #3
def _group_batches_shared(xs, caller_batch_fn, prep_data_fn):
    """Shared functionality for grouping by batches for variant calling and joint calling.
    """
    singles = []
    batch_groups = collections.defaultdict(list)
    for args in xs:
        data = utils.to_single_data(args)
        caller, batch = caller_batch_fn(data)
        region = _list_to_tuple(data["region"]) if "region" in data else ()
        if batch is not None:
            batches = batch if isinstance(batch, (list, tuple)) else [batch]
            for b in batches:
                batch_groups[(b, region,
                              caller)].append(utils.deepish_copy(data))
        else:
            data = prep_data_fn(data, [data])
            singles.append(data)
    batches = []
    for batch, items in batch_groups.items():
        batch_data = utils.deepish_copy(_pick_lead_item(items))
        # For nested primary batches, split permanently by batch
        if tz.get_in(["metadata", "batch"], batch_data):
            batch_name = batch[0]
            batch_data["metadata"]["batch"] = batch_name
        batch_data = prep_data_fn(batch_data, items)
        batch_data["group_orig"] = _collapse_subitems(batch_data, items)
        batch_data["group"] = batch
        batches.append(batch_data)
    return singles + batches
Example #4
def _group_batches_shared(xs, caller_batch_fn, prep_data_fn):
    """Shared functionality for grouping by batches for variant calling and joint calling.
    """
    singles = []
    batch_groups = collections.defaultdict(list)
    for args in xs:
        assert len(args) == 1
        data = args[0]
        caller, batch = caller_batch_fn(data)
        region = _list_to_tuple(data["region"]) if "region" in data else ()
        if batch is not None:
            batches = batch if isinstance(batch, (list, tuple)) else [batch]
            for b in batches:
                batch_groups[(b, region,
                              caller)].append(utils.deepish_copy(data))
        else:
            data = prep_data_fn(data, [data])
            singles.append(data)
    batches = []
    for batch, items in batch_groups.items():
        batch_data = utils.deepish_copy(_pick_lead_item(items))
        batch_data = prep_data_fn(batch_data, items)
        batch_data["group_orig"] = _collapse_subitems(batch_data, items)
        batch_data["group"] = batch
        batches.append(batch_data)
    return singles + batches
Example #5
def _get_validate(data):
    """Retrieve items to validate, from single samples or from combined joint calls.
    """
    if data.get("vrn_file") and tz.get_in(["config", "algorithm", "validate"], data):
        return utils.deepish_copy(data)
    elif "group_orig" in data:
        for sub in multi.get_orig_items(data):
            if "validate" in sub["config"]["algorithm"]:
                sub_val = utils.deepish_copy(sub)
                sub_val["vrn_file"] = data["vrn_file"]
                return sub_val
    return None
Example #6
def _get_validate(data):
    """Retrieve items to validate, from single samples or from combined joint calls.
    """
    if data.get("vrn_file") and tz.get_in(["config", "algorithm", "validate"], data):
        return utils.deepish_copy(data)
    elif "group_orig" in data:
        for sub in multi.get_orig_items(data):
            if "validate" in sub["config"]["algorithm"]:
                sub_val = utils.deepish_copy(sub)
                sub_val["vrn_file"] = data["vrn_file"]
                return sub_val
    return None
Example #7
def get_analysis_intervals(data, vrn_file, base_dir):
    """Retrieve analysis regions for the current variant calling pipeline.
    """
    from bcbio.bam import callable
    if vrn_file and vcfutils.is_gvcf_file(vrn_file):
        callable_bed = _callable_from_gvcf(data, vrn_file, base_dir)
        if callable_bed:
            return callable_bed

    if data.get("ensemble_bed"):
        return data["ensemble_bed"]
    elif dd.get_sample_callable(data):
        return dd.get_sample_callable(data)
    elif data.get("align_bam"):
        return callable.sample_callable_bed(data["align_bam"],
                                            dd.get_ref_file(data), data)[0]
    elif data.get("work_bam"):
        return callable.sample_callable_bed(data["work_bam"],
                                            dd.get_ref_file(data), data)[0]
    elif data.get("work_bam_callable"):
        data = utils.deepish_copy(data)
        data["work_bam"] = data.pop("work_bam_callable")
        return callable.sample_callable_bed(data["work_bam"],
                                            dd.get_ref_file(data), data)[0]
    elif tz.get_in(["config", "algorithm", "callable_regions"], data):
        return tz.get_in(["config", "algorithm", "callable_regions"], data)
    elif tz.get_in(["config", "algorithm", "variant_regions"], data):
        return tz.get_in(["config", "algorithm", "variant_regions"], data)
Example #8
def _get_split_tasks(args, split_fn, file_key, outfile_i=-1):
    """Split up input files and arguments, returning arguments for parallel processing.

    outfile_i specifies the location of the output file in the arguments to
    the processing function. Defaults to the last item in the list.
    """
    split_args = []
    combine_map = {}
    finished_map = collections.OrderedDict()
    extras = []
    for data in args:
        out_final, out_parts = split_fn(data)
        for parts in out_parts:
            split_args.append([utils.deepish_copy(data)] + list(parts))
        for part_file in [x[outfile_i] for x in out_parts]:
            combine_map[part_file] = out_final
        if len(out_parts) == 0:
            if out_final is not None:
                if out_final not in finished_map:
                    data[file_key] = out_final
                    finished_map[out_final] = [data]
                else:
                    extras.append([data])
            else:
                extras.append([data])
    return split_args, combine_map, finished_map.values(), extras
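To make the bookkeeping above concrete, here is a small illustration of the contract `_get_split_tasks` expects from `split_fn`. The function and file names (`toy_split_by_region`, the VCF paths) are invented for this example and are not part of bcbio:

def toy_split_by_region(data):
    # A split_fn returns (final_output, parts); the last element of each
    # part is that part's output file, matching the default outfile_i=-1.
    final = "sample-combined.vcf"
    parts = [("chr1", "sample-chr1.vcf"),
             ("chr2", "sample-chr2.vcf")]
    return final, parts

# For a single input sample, _get_split_tasks would then build:
#   split_args  -> [[data_copy, "chr1", "sample-chr1.vcf"],
#                   [data_copy, "chr2", "sample-chr2.vcf"]]
#   combine_map -> {"sample-chr1.vcf": "sample-combined.vcf",
#                   "sample-chr2.vcf": "sample-combined.vcf"}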
Example #9
def get_analysis_intervals(data, vrn_file, base_dir):
    """Retrieve analysis regions for the current variant calling pipeline.
    """
    from bcbio.bam import callable
    if vrn_file and vcfutils.is_gvcf_file(vrn_file):
        callable_bed = _callable_from_gvcf(data, vrn_file, base_dir)
        if callable_bed:
            return callable_bed

    if data.get("ensemble_bed"):
        return data["ensemble_bed"]
    elif dd.get_sample_callable(data):
        return dd.get_sample_callable(data)
    elif data.get("align_bam"):
        return callable.sample_callable_bed(data["align_bam"], dd.get_ref_file(data), data)[0]
    elif data.get("work_bam"):
        return callable.sample_callable_bed(data["work_bam"], dd.get_ref_file(data), data)[0]
    elif data.get("work_bam_callable"):
        data = utils.deepish_copy(data)
        data["work_bam"] = data.pop("work_bam_callable")
        return callable.sample_callable_bed(data["work_bam"], dd.get_ref_file(data), data)[0]
    elif tz.get_in(["config", "algorithm", "callable_regions"], data):
        return tz.get_in(["config", "algorithm", "callable_regions"], data)
    elif tz.get_in(["config", "algorithm", "variant_regions"], data):
        return tz.get_in(["config", "algorithm", "variant_regions"], data)
Example #10
def _run_ensemble_intersection(batch_id, vrn_files, base_dir, edata):
    """Run intersection n out of x based ensemble method using bcbio.variation.recall.
    """
    out_vcf_file = os.path.join(base_dir,
                                "{0}-ensemble.vcf.gz".format(batch_id))
    if not utils.file_exists(out_vcf_file):
        num_pass = _get_num_pass(edata, len(vrn_files))
        cmd = [
            config_utils.get_program("bcbio-variation-recall",
                                     edata["config"]), "ensemble",
            "--cores=%s" % edata["config"]["algorithm"].get("num_cores", 1),
            "--numpass",
            str(num_pass)
        ]
        # Remove filtered calls if we're dealing with tumor/normal calls
        if vcfutils.get_paired_phenotype(edata):
            cmd += ["--nofiltered"]
        cmd += [out_vcf_file, dd.get_ref_file(edata)] + vrn_files
        do.run(cmd, "Ensemble intersection calling: %s" % (batch_id))
    in_data = utils.deepish_copy(edata)
    in_data["vrn_file"] = out_vcf_file
    return {
        "variantcaller": "ensemble",
        "vrn_file": out_vcf_file,
        "bed_file": None
    }
Example #11
def gatk_rnaseq_calling(data):
    """Use GATK to perform gVCF variant calling on RNA-seq data
    """
    from bcbio.bam import callable
    data = utils.deepish_copy(data)
    tools_on = dd.get_tools_on(data)
    if not tools_on:
        tools_on = []
    tools_on.append("gvcf")
    data = dd.set_tools_on(data, tools_on)
    data = dd.set_jointcaller(data, ["%s-joint" % v for v in dd.get_variantcaller(data)])
    out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                              "variation", "rnaseq", "gatk-haplotype"))
    data = _setup_variant_regions(data, out_dir)
    out_file = os.path.join(out_dir, "%s-gatk-haplotype.vcf.gz" % dd.get_sample_name(data))
    if not utils.file_exists(out_file):
        region_files = []
        regions = []
        for cur_region in callable.get_split_regions(dd.get_variant_regions(data), data):
            str_region = "_".join([str(x) for x in cur_region])
            region_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                                                    "variation", "rnaseq", "gatk-haplotype",
                                                                    "regions")),
                                    "%s-%s-gatk-haplotype.vcf.gz" % (dd.get_sample_name(data), str_region))
            region_file = gatk.haplotype_caller([dd.get_split_bam(data)], [data], dd.get_ref_file(data), {},
                                                region=cur_region, out_file=region_file)
            region_files.append(region_file)
            regions.append(cur_region)
        out_file = vcfutils.concat_variant_files(region_files, out_file, regions,
                                                 dd.get_ref_file(data), data["config"])
    return dd.set_vrn_file(data, out_file)
Example #12
def gatk_rnaseq_calling(data):
    """Use GATK to perform gVCF variant calling on RNA-seq data
    """
    from bcbio.bam import callable
    data = utils.deepish_copy(data)
    tools_on = dd.get_tools_on(data)
    if not tools_on:
        tools_on = []
    tools_on.append("gvcf")
    data = dd.set_tools_on(data, tools_on)
    data = dd.set_jointcaller(data, ["%s-joint" % v for v in dd.get_variantcaller(data)])
    out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                              "variation", "rnaseq", "gatk-haplotype"))
    data = _setup_variant_regions(data, out_dir)
    out_file = os.path.join(out_dir, "%s-gatk-haplotype.vcf.gz" % dd.get_sample_name(data))
    if not utils.file_exists(out_file):
        region_files = []
        regions = []
        for cur_region in callable.get_split_regions(dd.get_variant_regions(data), data):
            str_region = "_".join([str(x) for x in cur_region])
            region_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                                                    "variation", "rnaseq", "gatk-haplotype",
                                                                    "regions")),
                                    "%s-%s-gatk-haplotype.vcf.gz" % (dd.get_sample_name(data), str_region))
            region_file = gatk.haplotype_caller([dd.get_split_bam(data)], [data], dd.get_ref_file(data), {},
                                                region=cur_region, out_file=region_file)
            region_files.append(region_file)
            regions.append(cur_region)
        out_file = vcfutils.concat_variant_files(region_files, out_file, regions,
                                                 dd.get_ref_file(data), data["config"])
    return dd.set_vrn_file(data, out_file)
Example #13
def batch_for_variantcall(samples):
    """Prepare a set of samples for parallel variant calling.

    CWL input target that groups samples into batches and variant callers
    for parallel processing.

    If doing joint calling, with `tools_on: [gvcf]`, split the sample into
    individuals instead of combining into a batch.
    """
    to_process, extras = _dup_samples_by_variantcaller(samples, require_bam=False)
    batch_groups = collections.defaultdict(list)
    to_process = [utils.to_single_data(x) for x in to_process]
    for data in cwlutils.samples_to_records(to_process):
        vc = get_variantcaller(data, require_bam=False)
        batches = dd.get_batches(data) or dd.get_sample_name(data)
        if not isinstance(batches, (list, tuple)):
            batches = [batches]
        for b in batches:
            batch_groups[(b, vc)].append(utils.deepish_copy(data))
    batches = []
    for cur_group in batch_groups.values():
        joint_calling = any([is_joint(d) for d in cur_group])
        if joint_calling:
            for d in cur_group:
                batches.append([d])
        else:
            batches.append(cur_group)
    return batches + extras
Example #14
def _run_qc_tools(bam_file, data):
    """Run a set of third party quality control tools, returning QC directory and metrics.

        :param bam_file: alignments in bam format
        :param data: dict with all configuration information

        :returns: dict with output of different tools
    """
    from bcbio.qc import (atropos, coverage, damage, fastqc, kraken, qsignature, qualimap,
                          samtools, picard, srna, umi, variant, viral, preseq)
    tools = {"fastqc": fastqc.run,
             "atropos": atropos.run,
             "small-rna": srna.run,
             "samtools": samtools.run,
             "qualimap": qualimap.run,
             "qualimap_rnaseq": qualimap.run_rnaseq,
             "qsignature": qsignature.run,
             "coverage": coverage.run,
             "damage": damage.run,
             "variants": variant.run,
             "peddy": peddy.run_qc,
             "kraken": kraken.run,
             "picard": picard.run,
             "umi": umi.run,
             "viral": viral.run,
             "preseq": preseq.run,
             }
    qc_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "qc", data["description"]))
    metrics = {}
    qc_out = utils.deepish_copy(dd.get_summary_qc(data))
    for program_name in dd.get_algorithm_qc(data):
        if not bam_file and program_name != "kraken":  # kraken doesn't need bam
            continue
        if dd.get_phenotype(data) == "germline" and program_name != "variants":
            continue
        qc_fn = tools[program_name]
        cur_qc_dir = os.path.join(qc_dir, program_name)
        out = qc_fn(bam_file, data, cur_qc_dir)
        qc_files = None
        if out and isinstance(out, dict):
            # Check for metrics output, two cases:
            # 1. output with {"metrics"} and files ("base")
            if "metrics" in out:
                metrics.update(out.pop("metrics"))
            # 2. a dictionary of metrics
            elif "base" not in out:
                metrics.update(out)
            # Check for files only output
            if "base" in out:
                qc_files = out
        elif out and isinstance(out, str) and os.path.exists(out):
            qc_files = {"base": out, "secondary": []}
        if not qc_files:
            qc_files = _organize_qc_files(program_name, cur_qc_dir)
        if qc_files:
            qc_out[program_name] = qc_files

    metrics["Name"] = dd.get_sample_name(data)
    metrics["Quality format"] = dd.get_quality_format(data).lower()
    return {"qc": qc_out, "metrics": metrics}
Example #15
def gatk_filter_rnaseq(vrn_file, data):
    """
    This incorporates the filters listed here, dropping clusters of variants
    within a 35 nucleotide window, high Fisher strand (FS) values and low
    quality by depth (QD):
    https://software.broadinstitute.org/gatk/guide/article?id=3891
    java -jar GenomeAnalysisTK.jar -T VariantFiltration -R hg_19.fasta -V
    input.vcf -window 35 -cluster 3 -filterName FS -filter "FS > 30.0"
    -filterName QD -filter "QD < 2.0" -o output.vcf
    """
    out_file = "%s-filter%s" % utils.splitext_plus(vrn_file)
    if not file_exists(out_file):
        ref_file = dd.get_ref_file(data)
        with file_transaction(data, out_file) as tx_out_file:
            params = [
                "VariantFiltration", "-R", ref_file, "-V", vrn_file,
                "--cluster-window-size", "35", "--cluster-size", "3",
                "--filter-expression", "'FS > 30.0'", "--filter-name", "FS",
                "--filter-expression", "'QD < 2.0'", "--filter-name", "QD",
                "--output", tx_out_file
            ]
            # Use GATK4 for filtering, tools_off is for variant calling
            config = utils.deepish_copy(dd.get_config(data))
            if "gatk4" in dd.get_tools_off({"config": config}):
                config["algorithm"]["tools_off"].remove("gatk4")
            jvm_opts = broad.get_gatk_opts(config,
                                           os.path.dirname(tx_out_file))
            do.run(broad.gatk_cmd("gatk", jvm_opts, params, config),
                   "Filter RNA-seq variants.")
    return out_file
Example #16
def write_project_summary(samples, qsign_info=None):
    """Write project summary information on the provided samples.
    Writes out the run date, upload directory and per-sample details such as dirs and genome resources.

    """
    work_dir = samples[0][0]["dirs"]["work"]
    out_file = os.path.join(work_dir, "project-summary.yaml")
    upload_dir = (os.path.join(work_dir, samples[0][0]["upload"]["dir"])
                  if "dir" in samples[0][0]["upload"] else "")
    date = str(datetime.now())
    prev_samples = _other_pipeline_samples(out_file, samples)
    with open(out_file, "w") as out_handle:
        yaml.safe_dump({"date": date}, out_handle,
                       default_flow_style=False, allow_unicode=False)
        if qsign_info:
            qsign_out = utils.deepish_copy(qsign_info[0])
            qsign_out.pop("out_dir", None)
            yaml.safe_dump({"qsignature": qsign_out}, out_handle, default_flow_style=False,
                           allow_unicode=False)
        yaml.safe_dump({"upload": upload_dir}, out_handle,
                       default_flow_style=False, allow_unicode=False)
        yaml.safe_dump({"bcbio_system": samples[0][0]["config"].get("bcbio_system", "")}, out_handle,
                       default_flow_style=False, allow_unicode=False)
        yaml.safe_dump({"samples": prev_samples + [_save_fields(sample[0]) for sample in samples]}, out_handle,
                       default_flow_style=False, allow_unicode=False)
    return out_file
Example #17
def cl_gatk(self, params, tmp_dir, memscale=None):
    support_nt = set()
    support_nct = set(["BaseRecalibrator"])
    gatk_jar = self._get_jar("GenomeAnalysisTK", ["GenomeAnalysisTKLite"])
    cores = self._config["algorithm"].get("num_cores", 1)
    config = self._config
    if cores and int(cores) > 1:
        atype_index = params.index("-T") if params.count("-T") > 0 \
                      else params.index("--analysis_type")
        prog = params[atype_index + 1]
        if prog in support_nt:
            params.extend(["-nt", str(cores)])
        elif prog in support_nct:
            params.extend(["-nct", str(cores)])
            if config["algorithm"].get("memory_adjust") is None:
                config = utils.deepish_copy(config)
                config["algorithm"]["memory_adjust"] = {"direction": "increase",
                                                        "magnitude": int(cores) // 2}
    if LooseVersion(self.gatk_major_version()) > LooseVersion("1.9"):
        if len([x for x in params if x.startswith(("-U", "--unsafe"))]) == 0:
            params.extend(["-U", "LENIENT_VCF_PROCESSING"])
        params.extend(["--read_filter", "BadCigar", "--read_filter", "NotPrimaryAlignment"])
    if memscale:
        jvm_opts = get_gatk_opts(config, tmp_dir=tmp_dir, memscale=memscale, include_gatk=False)
    else:
        # Decrease memory slightly from configuration to avoid memory allocation errors
        jvm_opts = config_utils.adjust_opts(self._jvm_opts,
                                            {"algorithm": {"memory_adjust":
                                                           {"magnitude": 1.1, "direction": "decrease"}}})
        jvm_opts += get_default_jvm_opts(tmp_dir)
    if "keyfile" in self._gatk_resources:
        params = ["-et", "NO_ET", "-K", self._gatk_resources["keyfile"]] + params
    return ["java"] + jvm_opts + ["-jar", gatk_jar] + [str(x) for x in params]
Example #18
def cl_gatk(self, params, tmp_dir, memscale=None):
    support_nt = set()
    support_nct = set(["BaseRecalibrator"])
    gatk_jar = self._get_jar("GenomeAnalysisTK", ["GenomeAnalysisTKLite"])
    cores = self._config["algorithm"].get("num_cores", 1)
    config = self._config
    if cores and int(cores) > 1:
        atype_index = params.index("-T") if params.count("-T") > 0 \
                      else params.index("--analysis_type")
        prog = params[atype_index + 1]
        if prog in support_nt:
            params.extend(["-nt", str(cores)])
        elif prog in support_nct:
            params.extend(["-nct", str(cores)])
            if config["algorithm"].get("memory_adjust") is None:
                config = utils.deepish_copy(config)
                config["algorithm"]["memory_adjust"] = {"direction": "increase",
                                                        "magnitude": int(cores) // 2}
    if LooseVersion(self.gatk_major_version()) > LooseVersion("1.9"):
        if len([x for x in params if x.startswith(("-U", "--unsafe"))]) == 0:
            params.extend(["-U", "LENIENT_VCF_PROCESSING"])
        params.extend(["--read_filter", "BadCigar", "--read_filter", "NotPrimaryAlignment"])
    if memscale:
        jvm_opts = get_gatk_opts(config, tmp_dir=tmp_dir, memscale=memscale, include_gatk=False)
    else:
        # Decrease memory slightly from configuration to avoid memory allocation errors
        jvm_opts = config_utils.adjust_opts(self._jvm_opts,
                                            {"algorithm": {"memory_adjust":
                                                           {"magnitude": 1.1, "direction": "decrease"}}})
        jvm_opts += get_default_jvm_opts(tmp_dir)
    if "keyfile" in self._gatk_resources:
        params = ["-et", "NO_ET", "-K", self._gatk_resources["keyfile"]] + params
    return ["java"] + jvm_opts + ["-jar", gatk_jar] + [str(x) for x in params]
Example #19
def _run_ensemble_intersection(batch_id, vrn_files, callers, base_dir, edata):
    """Run intersection n out of x based ensemble method using bcbio.variation.recall.
    """
    out_vcf_file = os.path.join(base_dir,
                                "{0}-ensemble.vcf.gz".format(batch_id))
    if not utils.file_exists(out_vcf_file):
        num_pass = _get_num_pass(edata, len(vrn_files))
        cmd = [
            config_utils.get_program("bcbio-variation-recall",
                                     edata["config"]), "ensemble",
            "--cores=%s" % edata["config"]["algorithm"].get("num_cores", 1),
            "--numpass",
            str(num_pass), "--names", ",".join(callers)
        ]
        # Remove filtered calls, do not try to rescue, unless configured
        if not tz.get_in(["config", "algorithm", "ensemble", "use_filtered"],
                         edata):
            cmd += ["--nofiltered"]

        with file_transaction(edata, out_vcf_file) as tx_out_file:
            cmd += [tx_out_file, dd.get_ref_file(edata)] + vrn_files
            cmd = "%s && %s" % (utils.get_java_clprep(), " ".join(
                str(x) for x in cmd))
            do.run(cmd, "Ensemble intersection calling: %s" % (batch_id))
    in_data = utils.deepish_copy(edata)
    in_data["vrn_file"] = out_vcf_file
    return {
        "variantcaller": "ensemble",
        "vrn_file": out_vcf_file,
        "bed_file": None
    }
Example #20
def _get_split_tasks(args, split_fn, file_key, outfile_i=-1):
    """Split up input files and arguments, returning arguments for parallel processing.

    outfile_i specifies the location of the output file in the arguments to
    the processing function. Defaults to the last item in the list.
    """
    split_args = []
    combine_map = {}
    finished_map = collections.OrderedDict()
    extras = []
    for data in args:
        out_final, out_parts = split_fn(data)
        for parts in out_parts:
            split_args.append([utils.deepish_copy(data)] + list(parts))
        for part_file in [x[outfile_i] for x in out_parts]:
            combine_map[part_file] = out_final
        if len(out_parts) == 0:
            if out_final is not None:
                if out_final not in finished_map:
                    data[file_key] = out_final
                    finished_map[out_final] = [data]
                else:
                    extras.append([data])
            else:
                extras.append([data])
    return split_args, combine_map, finished_map.values(), extras
Example #21
def gatk_filter_rnaseq(vrn_file, data):
    """
    This incorporates the filters listed here, dropping clusters of variants
    within a 35 nucleotide window, high Fisher strand (FS) values and low
    quality by depth (QD):
    https://software.broadinstitute.org/gatk/guide/article?id=3891
    java -jar GenomeAnalysisTK.jar -T VariantFiltration -R hg_19.fasta -V
    input.vcf -window 35 -cluster 3 -filterName FS -filter "FS > 30.0"
    -filterName QD -filter "QD < 2.0" -o output.vcf
    """
    out_file = "%s-filter%s" % utils.splitext_plus(vrn_file)
    if not file_exists(out_file):
        ref_file = dd.get_ref_file(data)
        with file_transaction(data, out_file) as tx_out_file:
            params = ["VariantFiltration",
                      "-R", ref_file,
                      "-V", vrn_file,
                      "--cluster-window-size", "35",
                      "--cluster-size", "3",
                      "--filter-expression", "'FS > 30.0'",
                      "--filter-name", "FS",
                      "--filter-expression", "'QD < 2.0'",
                      "--filter-name", "QD",
                      "--output", tx_out_file]
            # Use GATK4 for filtering, tools_off is for variant calling
            config = utils.deepish_copy(dd.get_config(data))
            if "gatk4" in dd.get_tools_off({"config": config}):
                config["algorithm"]["tools_off"].remove("gatk4")
            jvm_opts = broad.get_gatk_opts(config, os.path.dirname(tx_out_file))
            do.run(broad.gatk_cmd("gatk", jvm_opts, params, config), "Filter RNA-seq variants.")
    return out_file
Example #22
def pipeline_summary(data):
    """Provide summary information on processing sample.

    Handles standard and CWL (single QC output) cases.
    """
    data = utils.to_single_data(data)
    if data["analysis"].startswith("wgbs-seq"):
        bismark_bam = dd.get_align_bam(data)
        sorted_bam = bam.sort(bismark_bam, data["config"])
        data = dd.set_align_bam(data, sorted_bam)
        data = dd.set_work_bam(data, bismark_bam)
    work_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    if not work_bam or not work_bam.endswith(".bam"):
        work_bam = None
    if dd.get_ref_file(data):
        if work_bam or (tz.get_in(["config", "algorithm", "kraken"],
                                  data)):  # kraken doesn't need bam
            logger.info("QC: %s %s" % (dd.get_sample_name(data), ", ".join(
                dd.get_algorithm_qc(data))))
            work_data = cwlutils.unpack_tarballs(utils.deepish_copy(data),
                                                 data)
            data["summary"] = _run_qc_tools(work_bam, work_data)
            if (len(dd.get_algorithm_qc(data)) == 1
                    and "output_cwl_keys" in data):
                data["summary"]["qc"] = data["summary"]["qc"].get(
                    dd.get_algorithm_qc(data)[0])
    return [[data]]
Example #23
def split_somatic(items):
    """Split somatic batches, adding a germline target.

    Enables separate germline calling of samples using shared alignments.
    """
    items = [_clean_flat_variantcaller(x) for x in items]
    somatic_groups, somatic, non_somatic = vcfutils.somatic_batches(items)
    # extract germline samples to run from normals in tumor/normal pairs
    germline_added = set([])
    germline = []
    for somatic_group in somatic_groups:
        paired = vcfutils.get_paired(somatic_group)
        if paired and paired.normal_data:
            cur = utils.deepish_copy(paired.normal_data)
            vc = dd.get_variantcaller(cur)
            if isinstance(vc, dict) and "germline" in vc:
                if cur["description"] not in germline_added:
                    germline_added.add(cur["description"])
                    cur["rgnames"]["sample"] = cur["description"]
                    cur["metadata"]["batch"] = "%s-germline" % cur["description"]
                    cur["metadata"]["phenotype"] = "germline"
                    cur = remove_align_qc_tools(cur)
                    cur["config"]["algorithm"]["variantcaller"] = vc["germline"]
                    germline.append(cur)
    # Fix variantcalling specification for only somatic targets
    somatic_out = []
    for data in somatic:
        vc = dd.get_variantcaller(data)
        if isinstance(vc, dict) and "somatic" in vc:
            data["config"]["algorithm"]["variantcaller"] = vc["somatic"]
        somatic_out.append(data)
    return non_somatic + somatic_out + germline
Example #24
def split_somatic(items):
    """Split somatic batches, adding a germline target.

    Enables separate germline calling of samples using shared alignments.
    """
    somatic_groups, somatic, non_somatic = vcfutils.somatic_batches(items)
    # extract germline samples to run from normals in tumor/normal pairs
    germline_added = set([])
    germline = []
    for somatic_group in somatic_groups:
        paired = vcfutils.get_paired(somatic_group)
        if paired and paired.normal_data:
            cur = utils.deepish_copy(paired.normal_data)
            vc = dd.get_variantcaller(cur)
            if isinstance(vc, dict) and "germline" in vc:
                cur["description"] = "%s-germline" % cur["description"]
                if cur["description"] not in germline_added:
                    germline_added.add(cur["description"])
                    cur["rgnames"]["sample"] = cur["description"]
                    del cur["metadata"]["batch"]
                    cur["metadata"]["phenotype"] = "germline"
                    cur = remove_align_qc_tools(cur)
                    cur["config"]["algorithm"]["variantcaller"] = vc[
                        "germline"]
                    germline.append(cur)
    # Fix variantcalling specification for only somatic targets
    somatic_out = []
    for data in somatic:
        vc = dd.get_variantcaller(data)
        if isinstance(vc, dict) and "somatic" in vc:
            data["config"]["algorithm"]["variantcaller"] = vc["somatic"]
        somatic_out.append(data)
    return non_somatic + somatic_out + germline
Example #25
def batch_for_variantcall(samples):
    """Prepare a set of samples for parallel variant calling.

    CWL input target that groups samples into batches and variant callers
    for parallel processing.

    If doing joint calling, with `tools_on: [gvcf]`, split the sample into
    individuals instead of combining into a batch.
    """
    to_process, extras = _dup_samples_by_variantcaller(samples,
                                                       require_bam=False)
    batch_groups = collections.defaultdict(list)
    to_process = [utils.to_single_data(x) for x in to_process]
    for data in cwlutils.samples_to_records(to_process):
        vc = get_variantcaller(data, require_bam=False)
        batches = dd.get_batches(data) or dd.get_sample_name(data)
        if not isinstance(batches, (list, tuple)):
            batches = [batches]
        for b in batches:
            batch_groups[(b, vc)].append(utils.deepish_copy(data))
    batches = []
    for cur_group in batch_groups.values():
        joint_calling = any([is_joint(d) for d in cur_group])
        if joint_calling:
            for d in cur_group:
                batches.append([d])
        else:
            batches.append(cur_group)
    return batches + extras
Example #26
def get_variants(data, include_germline=False):
    """Retrieve set of variant calls to use for heterogeneity analysis.
    """
    data = utils.deepish_copy(data)
    supported = ["precalled", "vardict", "vardict-java", "vardict-perl",
                 "freebayes", "octopus", "strelka2"]
    # Right now mutect2 and mutect do not provide heterozygous germline calls
    # to be useful https://github.com/bcbio/bcbio-nextgen/issues/2464
    # supported += ["mutect2", "mutect"]
    if include_germline:
        supported.insert(1, "gatk-haplotype")
    out = []
    # CWL based input
    if isinstance(data.get("variants"), dict) and "samples" in data["variants"]:
        cur_vs = []
        # Unpack single sample list of files
        if (isinstance(data["variants"]["samples"], (list, tuple)) and
              len(data["variants"]["samples"]) == 1 and isinstance(data["variants"]["samples"][0], (list, tuple))):
            data["variants"]["samples"] = data["variants"]["samples"][0]
        for fname in data["variants"]["samples"]:
            variantcaller = utils.splitext_plus(os.path.basename(fname))[0]
            variantcaller = variantcaller.replace(dd.get_sample_name(data) + "-", "")
            for batch in dd.get_batches(data):
                variantcaller = variantcaller.replace(batch + "-", "")
            cur_vs.append({"vrn_file": fname, "variantcaller": variantcaller})
        data["variants"] = cur_vs
    for v in data.get("variants", []):
        if v["variantcaller"] in supported and v.get("vrn_file"):
            out.append((supported.index(v["variantcaller"]), v))
    out.sort()
    return [xs[1] for xs in out]
Example #27
def batch_for_variantcall(samples):
    """Prepare a set of samples for parallel variant calling.

    CWL input target that groups samples into batches and variant callers
    for parallel processing.
    """
    convert_to_list = set(["config__algorithm__tools_on", "config__algorithm__tools_off"])
    to_process, extras = _dup_samples_by_variantcaller(samples, require_bam=False)
    batch_groups = collections.defaultdict(list)
    to_process = [utils.to_single_data(x) for x in to_process]
    all_keys = set([])
    for data in to_process:
        all_keys.update(set(data["cwl_keys"]))
    for data in to_process:
        for raw_key in sorted(list(all_keys)):
            key = raw_key.split("__")
            if tz.get_in(key, data) is None:
                data = tz.update_in(data, key, lambda x: None)
                data["cwl_keys"].append(raw_key)
            if raw_key in convert_to_list:
                val = tz.get_in(key, data)
                if not val: val = []
                elif not isinstance(val, (list, tuple)): val = [val]
                data = tz.update_in(data, key, lambda x: val)
        vc = get_variantcaller(data, require_bam=False)
        batches = dd.get_batches(data) or dd.get_sample_name(data)
        if not isinstance(batches, (list, tuple)):
            batches = [batches]
        for b in batches:
            batch_groups[(b, vc)].append(utils.deepish_copy(data))
    return list(batch_groups.values()) + extras
Example #28
def _run_ensemble_intersection(batch_id, vrn_files, callers, base_dir, edata):
    """Run intersection n out of x based ensemble method using bcbio.variation.recall.
    """
    out_vcf_file = os.path.join(base_dir, "{0}-ensemble.vcf.gz".format(batch_id))
    if not utils.file_exists(out_vcf_file):
        num_pass = _get_num_pass(edata, len(vrn_files))
        cmd = [
            config_utils.get_program(
                "bcbio-variation-recall", edata["config"]),
            "ensemble",
            "--cores=%s" % edata["config"]["algorithm"].get("num_cores", 1),
            "--numpass", str(num_pass),
            "--names", ",".join(callers)
        ]
        # Remove filtered calls, do not try to rescue, unless configured
        if not tz.get_in(["config", "algorithm", "ensemble", "use_filtered"], edata):
            cmd += ["--nofiltered"]

        with file_transaction(edata, out_vcf_file) as tx_out_file:
            cmd += [tx_out_file, dd.get_ref_file(edata)] + vrn_files
            cmd = "%s %s" % (utils.local_path_export(), " ".join(str(x) for x in cmd))
            do.run(cmd, "Ensemble intersection calling: %s" % (batch_id))
    in_data = utils.deepish_copy(edata)
    in_data["vrn_file"] = out_vcf_file
    return {"variantcaller": "ensemble",
            "vrn_file": out_vcf_file,
            "bed_file": None}
Example #29
def write_project_summary(samples, qsign_info=None):
    """Write project summary information on the provided samples.
    Writes out the run date, upload directory and per-sample details such as dirs and genome resources.

    """
    work_dir = samples[0][0]["dirs"]["work"]
    out_file = os.path.join(work_dir, "project-summary.yaml")
    upload_dir = (os.path.join(work_dir, samples[0][0]["upload"]["dir"])
                  if "dir" in samples[0][0]["upload"] else "")
    date = str(datetime.now())
    prev_samples = _other_pipeline_samples(out_file, samples)
    with open(out_file, "w") as out_handle:
        yaml.safe_dump({"date": date}, out_handle,
                       default_flow_style=False, allow_unicode=False)
        if qsign_info:
            qsign_out = utils.deepish_copy(qsign_info[0])
            qsign_out.pop("out_dir", None)
            yaml.safe_dump({"qsignature": qsign_out}, out_handle, default_flow_style=False,
                           allow_unicode=False)
        yaml.safe_dump({"upload": upload_dir}, out_handle,
                       default_flow_style=False, allow_unicode=False)
        yaml.safe_dump({"bcbio_system": samples[0][0]["config"].get("bcbio_system", "")}, out_handle,
                       default_flow_style=False, allow_unicode=False)
        yaml.safe_dump({"samples": prev_samples + [_save_fields(sample[0]) for sample in samples]}, out_handle,
                       default_flow_style=False, allow_unicode=False)
    return out_file
Example #30
def _run_with_memory_scaling(params, tx_out_file, data, ld_preload=False):
    num_cores = dd.get_num_cores(data)
    memscale = {"magnitude": 0.9 * num_cores, "direction": "increase"} if num_cores > 1 else None
    # Ignore tools_off: [gatk4], since it doesn't apply to GATK CNV calling
    config = utils.deepish_copy(data["config"])
    if "gatk4" in dd.get_tools_off({"config": config}):
        config["algorithm"]["tools_off"].remove("gatk4")
    broad_runner = broad.runner_from_config(config)
    broad_runner.run_gatk(params, os.path.dirname(tx_out_file), memscale=memscale, ld_preload=ld_preload)
Example #31
def get_orig_items(base):
    """Retrieve original items from a diffed set of nested samples.
    """
    assert "group_orig" in base
    out = []
    for data_diff in base["group_orig"]:
        new = utils.deepish_copy(base)
        new.pop("group_orig")
        out.append(_patch_dict(data_diff, new))
    return out
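`get_orig_items` reverses the `_collapse_subitems` step used in the batching examples above: each entry in `group_orig` stores only what differs from the lead item, and `_patch_dict` layers that diff back onto a copy of the batch data. A minimal sketch of the idea, using hypothetical `make_diff`/`apply_diff` helpers rather than bcbio's own (nested-aware) functions:

def make_diff(item, base):
    """Keep only the keys where item differs from the shared base."""
    return {k: v for k, v in item.items() if base.get(k) != v}

def apply_diff(diff, base):
    """Rebuild an item by overlaying its diff on a copy of the base."""
    out = dict(base)
    out.update(diff)
    return out

# Round trip: collapsing then expanding recovers the original items.
base = {"caller": "gatk", "region": "chr1", "sample": "lead"}
items = [{"caller": "gatk", "region": "chr1", "sample": "s1"},
         {"caller": "gatk", "region": "chr1", "sample": "s2"}]
diffs = [make_diff(x, base) for x in items]   # [{"sample": "s1"}, {"sample": "s2"}]
assert [apply_diff(d, base) for d in diffs] == items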
Example #32
def get_orig_items(base):
    """Retrieve original items from a diffed set of nested samples.
    """
    assert "group_orig" in base
    out = []
    for data_diff in base["group_orig"]:
        new = utils.deepish_copy(base)
        new.pop("group_orig")
        out.append(_patch_dict(data_diff, new))
    return out
Example #33
def summary(*samples):
    """Summarize all quality metrics together"""
    samples = list(utils.flatten(samples))
    work_dir = dd.get_work_dir(samples[0])
    multiqc = config_utils.get_program("multiqc", samples[0]["config"])
    if not multiqc:
        logger.debug("multiqc not found. Update bcbio_nextgen.py tools to fix this issue.")
    out_dir = utils.safe_makedir(os.path.join(work_dir, "qc", "multiqc"))
    out_data = os.path.join(out_dir, "multiqc_data")
    out_file = os.path.join(out_dir, "multiqc_report.html")
    file_list = os.path.join(out_dir, "list_files.txt")
    work_samples = [cwlutils.unpack_tarballs(utils.deepish_copy(x), x) for x in samples]
    work_samples = _report_summary(work_samples, os.path.join(out_dir, "report"))
    if not utils.file_exists(out_file):
        with tx_tmpdir(samples[0], work_dir) as tx_out:
            in_files = _get_input_files(work_samples, out_dir, tx_out)
            in_files += _merge_metrics(work_samples, out_dir)
            if _one_exists(in_files):
                with utils.chdir(out_dir):
                    _create_config_file(out_dir, work_samples)
                    input_list_file = _create_list_file(in_files, file_list)
                    if dd.get_tmp_dir(samples[0]):
                        export_tmp = "export TMPDIR=%s &&" % dd.get_tmp_dir(samples[0])
                    else:
                        export_tmp = ""
                    path_export = utils.local_path_export()
                    other_opts = config_utils.get_resources("multiqc", samples[0]["config"]).get("options", [])
                    other_opts = " ".join([str(x) for x in other_opts])
                    cmd = "{path_export}{export_tmp} {multiqc} -f -l {input_list_file} {other_opts} -o {tx_out}"
                    do.run(cmd.format(**locals()), "Run multiqc")
                    if utils.file_exists(os.path.join(tx_out, "multiqc_report.html")):
                        shutil.move(os.path.join(tx_out, "multiqc_report.html"), out_file)
                        shutil.move(os.path.join(tx_out, "multiqc_data"), out_data)
    samples = _group_by_sample_and_batch(samples)
    if utils.file_exists(out_file) and samples:
        data_files = set()
        for i, data in enumerate(samples):
            data_files.add(os.path.join(out_dir, "report", "metrics", dd.get_sample_name(data) + "_bcbio.txt"))
        data_files.add(os.path.join(out_dir, "report", "metrics", "target_info.yaml"))
        data_files.add(os.path.join(out_dir, "multiqc_config.yaml"))
        data_files = [f for f in data_files if f and utils.file_exists(f)]
        if "summary" not in samples[0]:
            samples[0]["summary"] = {}
        samples[0]["summary"]["multiqc"] = {"base": out_file, "secondary": data_files}

        data_json = os.path.join(out_dir, "multiqc_data", "multiqc_data.json")
        data_json_final = _save_uploaded_data_json(samples, data_json, os.path.join(out_dir, "multiqc_data"))
        if data_json_final:
            samples[0]["summary"]["multiqc"]["secondary"].append(data_json_final)

        file_list_final = _save_uploaded_file_list(samples, file_list, out_dir)
        if file_list_final:
            samples[0]["summary"]["multiqc"]["secondary"].append(file_list_final)

    return [[data] for data in samples]
Example #34
def summary(*samples):
    """Summarize all quality metrics together"""
    samples = list(utils.flatten(samples))
    work_dir = dd.get_work_dir(samples[0])
    multiqc = config_utils.get_program("multiqc", samples[0]["config"])
    if not multiqc:
        logger.debug("multiqc not found. Update bcbio_nextgen.py tools to fix this issue.")
    out_dir = utils.safe_makedir(os.path.join(work_dir, "qc", "multiqc"))
    out_data = os.path.join(out_dir, "multiqc_data")
    out_file = os.path.join(out_dir, "multiqc_report.html")
    file_list = os.path.join(out_dir, "list_files.txt")
    work_samples = [cwlutils.unpack_tarballs(utils.deepish_copy(x), x) for x in samples]
    work_samples = _report_summary(work_samples, os.path.join(out_dir, "report"))
    if not utils.file_exists(out_file):
        with tx_tmpdir(samples[0], work_dir) as tx_out:
            in_files = _get_input_files(work_samples, out_dir, tx_out)
            in_files += _merge_metrics(work_samples, out_dir)
            if _one_exists(in_files):
                with utils.chdir(out_dir):
                    _create_config_file(out_dir, work_samples)
                    input_list_file = _create_list_file(in_files, file_list)
                    if dd.get_tmp_dir(samples[0]):
                        export_tmp = "export TMPDIR=%s &&" % dd.get_tmp_dir(samples[0])
                    else:
                        export_tmp = ""
                    path_export = utils.local_path_export()
                    other_opts = config_utils.get_resources("multiqc", samples[0]["config"]).get("options", [])
                    other_opts = " ".join([str(x) for x in other_opts])
                    cmd = "{path_export}{export_tmp} {multiqc} -f -l {input_list_file} {other_opts} -o {tx_out}"
                    do.run(cmd.format(**locals()), "Run multiqc")
                    if utils.file_exists(os.path.join(tx_out, "multiqc_report.html")):
                        shutil.move(os.path.join(tx_out, "multiqc_report.html"), out_file)
                        shutil.move(os.path.join(tx_out, "multiqc_data"), out_data)
    out = []
    for i, data in enumerate(_group_by_samplename(samples)):
        if i == 0:
            if utils.file_exists(out_file):
                data_files = glob.glob(os.path.join(out_dir, "multiqc_data", "*.txt"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.bed"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.txt"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.tsv"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.yaml"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*.R*"))
                data_files += glob.glob(os.path.join(out_dir, "multiqc_config.yaml"))
                data_files.append(file_list)
                if "summary" not in data:
                    data["summary"] = {}
                data["summary"]["multiqc"] = {"base": out_file, "secondary": data_files}
                file_list_final = _save_uploaded_file_list(samples, file_list, out_dir)
                if file_list_final:
                    data["summary"]["multiqc"]["secondary"].append(file_list_final)
        out.append([data])
    return out
Example #35
def _concat_records(items_by_key, input_order):
    """Concatenate records into a single key to avoid merging.

    Handles heterogeneous records that will then be sorted out in
    the processing function.
    """
    all_records = []
    for (k, t) in input_order.items():
        if t == "record":
            all_records.append(k)
    out_items_by_key = utils.deepish_copy(items_by_key)
    out_input_order = utils.deepish_copy(input_order)
    if len(all_records) > 1:
        final_k = all_records[0]
        final_v = items_by_key[final_k]
        for k in all_records[1:]:
            final_v += items_by_key[k]
            del out_items_by_key[k]
            del out_input_order[k]
        out_items_by_key[final_k] = final_v
    return out_items_by_key, out_input_order
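A small worked illustration of what `_concat_records` does to its inputs; the keys and values below are invented for the example and are not real CWL inputs:

# Two record-typed keys get concatenated under the first one, so downstream
# merging only ever sees a single heterogeneous record list.
items_by_key = {"batch_rec": [{"id": 1}], "extra_rec": [{"id": 2}], ("region",): ["chr1"]}
input_order = {"batch_rec": "record", "extra_rec": "record", "region": "var"}
# After _concat_records(items_by_key, input_order):
#   out_items_by_key == {"batch_rec": [{"id": 1}, {"id": 2}], ("region",): ["chr1"]}
#   out_input_order  == {"batch_rec": "record", "region": "var"}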
Example #36
def _check_for_single_nested(target, items_by_key, input_order):
    """Check for single nested inputs that match our target count and unnest.

    Handles complex var inputs where some have an extra layer of nesting.
    """
    out = utils.deepish_copy(items_by_key)
    for (k, t) in input_order.items():
        if t == "var":
            v = items_by_key[tuple(k.split("__"))]
            if _is_nested_single(v, target):
                out[tuple(k.split("__"))] = v[0]
    return out
Example #37
def _concat_records(items_by_key, input_order):
    """Concatenate records into a single key to avoid merging.

    Handles heterogeneous records that will then be sorted out in
    the processing function.
    """
    all_records = []
    for (k, t) in input_order.items():
        if t == "record":
            all_records.append(k)
    out_items_by_key = utils.deepish_copy(items_by_key)
    out_input_order = utils.deepish_copy(input_order)
    if len(all_records) > 1:
        final_k = all_records[0]
        final_v = items_by_key[final_k]
        for k in all_records[1:]:
            final_v += items_by_key[k]
            del out_items_by_key[k]
            del out_input_order[k]
        out_items_by_key[final_k] = final_v
    return out_items_by_key, out_input_order
Example #38
def _check_for_single_nested(target, items_by_key, input_order):
    """Check for single nested inputs that match our target count and unnest.

    Handles complex var inputs where some have an extra layer of nesting.
    """
    out = utils.deepish_copy(items_by_key)
    for (k, t) in input_order.items():
        if t == "var":
            v = items_by_key[tuple(k.split("__"))]
            if _is_nested_single(v, target):
                out[tuple(k.split("__"))] = v[0]
    return out
Example #39
def add_highdepth_genome_exclusion(items):
    """Add exclusions to input items to avoid slow runtimes on whole genomes.
    """
    out = []
    for d in items:
        d = utils.deepish_copy(d)
        if dd.get_coverage_interval(d) == "genome":
            e = dd.get_exclude_regions(d)
            if "highdepth" not in e:
                e.append("highdepth")
                d = dd.set_exclude_regions(d, e)
        out.append(d)
    return out
Example #40
def _run_concat_variant_files_gatk4(input_file_list, out_file, config):
    """Use GATK4 GatherVcfs for concatenation of scattered VCFs.
    """
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            params = ["-T", "GatherVcfs", "-I", input_file_list, "-O", tx_out_file]
            # Use GATK4 for merging, tools_off: [gatk4] applies to variant calling
            config = utils.deepish_copy(config)
            if "gatk4" in dd.get_tools_off({"config": config}):
                config["algorithm"]["tools_off"].remove("gatk4")
            broad_runner = broad.runner_from_config(config)
            broad_runner.run_gatk(params)
    return out_file
Example #41
def group_by_batch(items, require_bam=True):
    """Group a set of sample items by batch (or singleton) name.

    Items in multiple batches cause two batches to be merged together.
    """
    out = collections.defaultdict(list)
    batch_groups = _get_representative_batch(_merge_batches(_find_all_groups(items, require_bam)))
    for data in items:
        batches = _get_batches(data, require_bam)
        # take first batch as representative
        batch = batch_groups[batches[0]]
        out[batch].append(utils.deepish_copy(data))
    return dict(out)
Example #42
def _run_concat_variant_files_gatk4(input_file_list, out_file, config):
    """Use GATK4 GatherVcfs for concatenation of scattered VCFs.
    """
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            params = ["-T", "GatherVcfs", "-I", input_file_list, "-O", tx_out_file]
            # Use GATK4 for merging, tools_off: [gatk4] applies to variant calling
            config = utils.deepish_copy(config)
            if "gatk4" in dd.get_tools_off({"config": config}):
                config["algorithm"]["tools_off"].remove("gatk4")
            broad_runner = broad.runner_from_config(config)
            broad_runner.run_gatk(params)
    return out_file
Example #43
def group_by_batch(items, require_bam=True):
    """Group a set of sample items by batch (or singleton) name.

    Items in multiple batches cause two batches to be merged together.
    """
    out = collections.defaultdict(list)
    batch_groups = _get_representative_batch(_merge_batches(_find_all_groups(items, require_bam)))
    for data in items:
        batches = _get_batches(data, require_bam)
        # take first batch as representative
        batch = batch_groups[batches[0]]
        out[batch].append(utils.deepish_copy(data))
    return dict(out)
Example #44
def summarize_sv(items):
    """CWL target: summarize structural variants for multiple samples.

    XXX Need to support non-VCF output as tabix indexed output
    """
    items = [
        utils.to_single_data(x)
        for x in vcvalidate.summarize_grading(items, "svvalidate")
    ]
    out = {
        "sv": {
            "calls": [],
            "prioritize": {
                "tsv": [],
                "raw": []
            }
        },
        "svvalidate": vcvalidate.combine_validations(items, "svvalidate")
    }
    added = set([])
    # Standard callers
    for data in items:
        if data.get("sv"):
            names = dd.get_batches(data)
            if not names:
                names = [dd.get_sample_name(data)]
            batch_name = names[0]
            cur_name = "%s-%s" % (batch_name, data["sv"]["variantcaller"])
            if data["sv"].get("vrn_file"):
                ext = utils.splitext_plus(data["sv"]["vrn_file"])[-1]
                if cur_name not in added and ext.startswith(".vcf"):
                    added.add(cur_name)
                    out_file = os.path.join(
                        utils.safe_makedir(
                            os.path.join(dd.get_work_dir(data), "sv",
                                         "calls")), "%s%s" % (cur_name, ext))
                    utils.copy_plus(data["sv"]["vrn_file"], out_file)
                    out_file = vcfutils.bgzip_and_index(
                        out_file, data["config"])
                    out["sv"]["calls"].append(out_file)
    # prioritization
    for pdata in _group_by_sample(items):
        prioritysv = [
            x for x in prioritize.run([utils.deepish_copy(pdata)])[0].get(
                "sv", []) if x["variantcaller"] == "sv-prioritize"
        ]
        if prioritysv:
            out["sv"]["prioritize"]["tsv"].append(prioritysv[0]["vrn_file"])
            out["sv"]["prioritize"]["raw"].extend(
                prioritysv[0]["raw_files"].values())
    return [out]
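
The added set above is what keeps a single call file per batch/caller pair and restricts copies to VCF output. A small stand-alone sketch of that deduplication, with made-up (batch, caller, path) tuples instead of bcbio sample dicts:

def dedup_call_files(calls):
    # Keep one VCF per (batch, caller) pair, mirroring the "added" set above.
    added = set()
    kept = []
    for batch, caller, path in calls:
        key = "%s-%s" % (batch, caller)
        if key not in added and path.endswith((".vcf", ".vcf.gz")):
            added.add(key)
            kept.append(path)
    return kept

calls = [("b1", "manta", "b1-manta.vcf.gz"),
         ("b1", "manta", "b1-manta-2.vcf.gz"),
         ("b1", "cnvkit", "b1-cnvkit.bed")]
print(dedup_call_files(calls))  # ['b1-manta.vcf.gz']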
Ejemplo n.º 45
0
def extract_germline_vcinfo(data, out_dir):
    """Extract germline VCFs from existing tumor inputs.
    """
    supported_germline = set(["vardict", "octopus", "freebayes"])
    if dd.get_phenotype(data) in ["tumor"]:
        for v in _get_variants(data):
            if v.get("variantcaller") in supported_germline:
                if v.get("germline"):
                    return v
                else:
                    d = utils.deepish_copy(data)
                    d["vrn_file"] = v["vrn_file"]
                    gd = germline.extract(d, [d], out_dir)
                    v["germline"] = gd["vrn_file_plus"]["germline"]
                    return v
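
The core of this helper is a first-match scan over per-caller variant entries, returning the first one produced by a caller that supports germline extraction. A toy version with plain dicts (the caller names are taken from the function; everything else is hypothetical):

SUPPORTED_GERMLINE = {"vardict", "octopus", "freebayes"}

def first_supported_variant(variants):
    # Return the first entry from a supported caller, or None if none match.
    for v in variants:
        if v.get("variantcaller") in SUPPORTED_GERMLINE:
            return v
    return None

variants = [{"variantcaller": "mutect2", "vrn_file": "t.vcf.gz"},
            {"variantcaller": "vardict", "vrn_file": "t-vardict.vcf.gz"}]
print(first_supported_variant(variants)["variantcaller"])  # vardict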
Ejemplo n.º 46
0
def batch_for_jointvc(items):
    batch_groups = collections.defaultdict(list)
    for data in [utils.to_single_data(x) for x in items]:
        vc = dd.get_variantcaller(data)
        if genotype.is_joint(data):
            batches = dd.get_batches(data) or dd.get_sample_name(data)
            if not isinstance(batches, (list, tuple)):
                batches = [batches]
        else:
            batches = [dd.get_sample_name(data)]
        for b in batches:
            data = utils.deepish_copy(data)
            data["vrn_file_gvcf"] = data["vrn_file"]
            batch_groups[(b, vc)].append(data)
    return list(batch_groups.values())
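
Batching here hinges on using a (batch, caller) tuple as the dict key and recording the gVCF path on a per-batch copy. A self-contained sketch with simplified sample dicts (the flat field names are assumptions for illustration):

import collections
import copy

def batch_by_name_and_caller(samples):
    # Key groups on a (batch, caller) tuple so the same batch can be
    # processed independently per variant caller.
    groups = collections.defaultdict(list)
    for data in samples:
        for b in data.get("batches") or [data["name"]]:
            d = copy.deepcopy(data)
            d["vrn_file_gvcf"] = d["vrn_file"]  # keep the gVCF path around
            groups[(b, data["caller"])].append(d)
    return list(groups.values())

samples = [{"name": "s1", "batches": ["b1"], "caller": "gatk-haplotype", "vrn_file": "s1.g.vcf.gz"},
           {"name": "s2", "batches": ["b1"], "caller": "gatk-haplotype", "vrn_file": "s2.g.vcf.gz"}]
print(len(batch_by_name_and_caller(samples)))  # 1 group containing both samples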
Ejemplo n.º 47
0
def gatk_rnaseq_calling(data):
    """Use GATK to perform gVCF variant calling on RNA-seq data
    """
    data = utils.deepish_copy(data)
    tools_on = dd.get_tools_on(data)
    if not tools_on:
        tools_on = []
    tools_on.append("gvcf")
    data = dd.set_tools_on(data, tools_on)
    data = dd.set_jointcaller(data, ["%s-joint" % v for v in dd.get_variantcaller(data)])
    out_file = os.path.join(utils.safe_makedir(os.path.join("variation", "rnaseq", "gatk-haplotype")),
                            "%s-gatk-haplotype.vcf.gz" % dd.get_sample_name(data))
    out_file = gatk.haplotype_caller([dd.get_split_bam(data)], [data], dd.get_ref_file(data), {},
                                     out_file=out_file)
    return dd.set_vrn_file(data, out_file)
Ejemplo n.º 49
0
def batch_for_jointvc(items):
    batch_groups = collections.defaultdict(list)
    for data in [utils.to_single_data(x) for x in items]:
        vc = dd.get_variantcaller(data)
        if genotype.is_joint(data):
            batches = dd.get_batches(data) or dd.get_sample_name(data)
            if not isinstance(batches, (list, tuple)):
                batches = [batches]
        else:
            batches = [dd.get_sample_name(data)]
        for b in batches:
            data = utils.deepish_copy(data)
            data["vrn_file_gvcf"] = data["vrn_file"]
            batch_groups[(b, vc)].append(data)
    return list(batch_groups.values())
Ejemplo n.º 50
0
def _group_by_sample(items):
    """Group a set of items by sample names + multiple callers for prioritization
    """
    by_sample = collections.defaultdict(list)
    for d in items:
        by_sample[dd.get_sample_name(d)].append(d)
    out = []
    for sample_group in by_sample.values():
        cur = utils.deepish_copy(sample_group[0])
        svs = []
        for d in sample_group:
            svs.append(d["sv"])
        cur["sv"] = svs
        out.append(cur)
    return out
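
The collapse step takes the first item of each sample group as a template and replaces its "sv" entry with the list of per-caller entries. A stand-alone sketch of the same idea, using copy.deepcopy and plain dicts in place of bcbio sample objects:

import collections
import copy

def collapse_by_sample(items):
    # One output item per sample; its "sv" field collects the per-caller
    # entries from every input item for that sample.
    by_sample = collections.defaultdict(list)
    for d in items:
        by_sample[d["name"]].append(d)
    out = []
    for group in by_sample.values():
        cur = copy.deepcopy(group[0])
        cur["sv"] = [d["sv"] for d in group]
        out.append(cur)
    return out

items = [{"name": "s1", "sv": {"variantcaller": "manta"}},
         {"name": "s1", "sv": {"variantcaller": "lumpy"}}]
print(collapse_by_sample(items)[0]["sv"])  # both caller entries under one item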
Ejemplo n.º 52
0
def batch_for_variantcall(samples):
    """Prepare a set of samples for parallel variant calling.

    CWL input target that groups samples into batches and variant callers
    for parallel processing.
    """
    to_process, extras = _dup_samples_by_variantcaller(samples, require_bam=False)
    batch_groups = collections.defaultdict(list)
    for data in [utils.to_single_data(x) for x in to_process]:
        vc = get_variantcaller(data, require_bam=False)
        batches = dd.get_batches(data) or dd.get_sample_name(data)
        if not isinstance(batches, (list, tuple)):
            batches = [batches]
        for b in batches:
            batch_groups[(b, vc)].append(utils.deepish_copy(data))
    return list(batch_groups.values()) + extras
Ejemplo n.º 53
0
def pipeline_summary(data):
    """Provide summary information on processing sample.

    Handles standard and CWL (single QC output) cases.
    """
    data = utils.to_single_data(data)
    work_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    if not work_bam or not work_bam.endswith(".bam"):
        work_bam = None
    if dd.get_ref_file(data):
        if work_bam or (tz.get_in(["config", "algorithm", "kraken"], data)):  # kraken doesn't need bam
            logger.info("QC: %s %s" % (dd.get_sample_name(data), ", ".join(dd.get_algorithm_qc(data))))
            work_data = cwlutils.unpack_tarballs(utils.deepish_copy(data), data)
            data["summary"] = _run_qc_tools(work_bam, work_data)
            if (len(dd.get_algorithm_qc(data)) == 1 and "output_cwl_keys" in data):
                data["summary"]["qc"] = data["summary"]["qc"].get(dd.get_algorithm_qc(data)[0])
    return [[data]]
Ejemplo n.º 54
0
def _clean_fields(d):
    if isinstance(d, dict):
        if "fields" in d:
            out = []
            for f in d["fields"]:
                f = utils.deepish_copy(f)
                f.pop("secondaryFiles", None)
                out.append(f)
            d["fields"] = out
            return d
        else:
            out = {}
            for k, v in d.items():
                out[k] = _clean_fields(v)
            return out
    else:
        return d
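
The recursion only rewrites dicts that carry a "fields" list and otherwise descends key by key. A self-contained equivalent to show the effect, with copy.deepcopy standing in for utils.deepish_copy and an invented schema fragment:

import copy

def drop_secondary_files(d):
    # Recursively strip "secondaryFiles" from any record's field list,
    # leaving everything else in the schema untouched.
    if isinstance(d, dict):
        if "fields" in d:
            d = dict(d)
            d["fields"] = [{k: v for k, v in copy.deepcopy(f).items()
                            if k != "secondaryFiles"} for f in d["fields"]]
            return d
        return {k: drop_secondary_files(v) for k, v in d.items()}
    return d

schema = {"outputs": {"type": "record",
                      "fields": [{"name": "bam", "secondaryFiles": [".bai"]}]}}
print(drop_secondary_files(schema))
# {'outputs': {'type': 'record', 'fields': [{'name': 'bam'}]}}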
Ejemplo n.º 56
0
def _get_callers(items, stage, special_cases=False):
    """Retrieve available callers for the provided stage.

    Handles special cases like CNVkit, which can run in the initial or standard
    stage depending on whether its output feeds into Lumpy analysis.
    """
    callers = utils.deepish_copy(_CALLERS[stage])
    if special_cases and "cnvkit" in callers:
        has_lumpy = any("lumpy" in get_svcallers(d) or "lumpy" in d["config"]["algorithm"].get("svcaller_orig", [])
                        for d in items)
        if has_lumpy and any("lumpy_usecnv" in dd.get_tools_on(d) for d in items):
            if stage != "initial":
                del callers["cnvkit"]
        else:
            if stage != "standard":
                del callers["cnvkit"]
    return callers
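
Copying _CALLERS before deleting entries is what keeps the module-level registry intact across runs. A hedged sketch of the stage logic with an invented registry (_CALLERS_SKETCH, the caller values, and the boolean flag are placeholders, not bcbio's real tables):

import copy

_CALLERS_SKETCH = {"initial": {"cnvkit": "run_cnvkit"},
                   "standard": {"cnvkit": "run_cnvkit", "manta": "run_manta"}}

def callers_for_stage(stage, cnvkit_feeds_lumpy):
    # Copy the registry so per-run deletions never change the shared table.
    callers = copy.deepcopy(_CALLERS_SKETCH[stage])
    if "cnvkit" in callers:
        # If CNVkit output feeds Lumpy it should only run in the initial
        # stage; otherwise it belongs to the standard stage only.
        wanted_stage = "initial" if cnvkit_feeds_lumpy else "standard"
        if stage != wanted_stage:
            del callers["cnvkit"]
    return callers

print(callers_for_stage("standard", cnvkit_feeds_lumpy=True))   # manta only
print(callers_for_stage("standard", cnvkit_feeds_lumpy=False))  # cnvkit and manta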
Ejemplo n.º 57
0
def update_summary_qc(data, key, base=None, secondary=None):
    """
    Update summary_qc with a new section, keyed by key.
    Stick files into summary_qc if you want them propagated forward
    and available for MultiQC.
    """
    summary = deepish_copy(get_summary_qc(data, {}))
    if key in summary:
        return data
    if base and secondary:
        summary[key] = {"base": base, "secondary": secondary}
    elif base:
        summary[key] = {"base": base}
    elif secondary:
        summary[key] = {"secondary": secondary}
    data = set_summary_qc(data, summary)
    return data
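
The update is idempotent: an existing key is left alone, and otherwise only the supplied paths are recorded. A rough stand-alone sketch of that behavior, with summary as a plain dict rather than bcbio's data structure:

def add_qc_section(summary, key, base=None, secondary=None):
    # No-op if the section already exists; otherwise add whichever
    # base/secondary paths were provided.
    if key in summary or not (base or secondary):
        return summary
    summary = dict(summary)
    summary[key] = {k: v for k, v in
                    (("base", base), ("secondary", secondary)) if v}
    return summary

s = add_qc_section({}, "samtools", base="stats.txt", secondary=["idxstats.txt"])
print(s)  # {'samtools': {'base': 'stats.txt', 'secondary': ['idxstats.txt']}}
print(add_qc_section(s, "samtools", base="other.txt") is s)  # True: unchanged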
Ejemplo n.º 58
0
def _run_concat_variant_files_gatk4(input_file_list, out_file, config):
    """Use GATK4 GatherVcfs for concatenation of scattered VCFs.
    """
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            params = ["-T", "GatherVcfs", "-I", input_file_list, "-O", tx_out_file]
            # Use GATK4 for merging, tools_off: [gatk4] applies to variant calling
            config = utils.deepish_copy(config)
            if "gatk4" in dd.get_tools_off({"config": config}):
                config["algorithm"]["tools_off"].remove("gatk4")
            # Allow specification of verbosity in the unique style this tool uses
            resources = config_utils.get_resources("gatk", config)
            opts = [str(x) for x in resources.get("options", [])]
            if "--verbosity" in opts:
                params += ["--VERBOSITY:%s" % opts[opts.index("--verbosity") + 1]]
            broad_runner = broad.runner_from_config(config)
            broad_runner.run_gatk(params)
    return out_file
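
The only twist relative to Ejemplo n.º 42 is translating a conventional "--verbosity LEVEL" resource option into the "--VERBOSITY:LEVEL" form the snippet builds for this tool. That translation in isolation, as a small sketch:

def translate_verbosity(opts):
    # Convert a "--verbosity LEVEL" pair into a single "--VERBOSITY:LEVEL"
    # argument; return an empty list when no verbosity was requested.
    if "--verbosity" in opts:
        level = opts[opts.index("--verbosity") + 1]
        return ["--VERBOSITY:%s" % level]
    return []

print(translate_verbosity(["--verbosity", "ERROR"]))  # ['--VERBOSITY:ERROR']
print(translate_verbosity([]))                        # []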
Ejemplo n.º 59
0
def _run_ensemble_intersection(batch_id, vrn_files, base_dir, edata):
    """Run intersection n out of x based ensemble method using bcbio.variation.recall.
    """
    out_vcf_file = os.path.join(base_dir, "{0}-ensemble.vcf.gz".format(batch_id))
    if not utils.file_exists(out_vcf_file):
        num_pass = _get_num_pass(edata, len(vrn_files))
        cmd = [config_utils.get_program("bcbio-variation-recall", edata["config"]),
               "ensemble", "--cores=%s" % edata["config"]["algorithm"].get("num_cores", 1),
               "--numpass", str(num_pass)]
        # Remove filtered calls if we're dealing with tumor/normal calls
        if vcfutils.get_paired_phenotype(edata):
            cmd += ["--nofiltered"]
        cmd += [out_vcf_file, dd.get_ref_file(edata)] + vrn_files
        do.run(cmd, "Ensemble intersection calling: %s" % (batch_id))
    in_data = utils.deepish_copy(edata)
    in_data["vrn_file"] = out_vcf_file
    return {"variantcaller": "ensemble",
            "vrn_file": out_vcf_file,
            "bed_file": None}
Ejemplo n.º 60
0
def summarize_samples(samples, run_parallel):
    """Back compatibility for existing pipelines. Should be replaced with summary when ready.
    """
    extras = []
    to_run = collections.defaultdict(list)
    multi_batches = set([])
    for data in [x[0] for x in samples]:
        if tz.get_in(["config", "algorithm", "coverage"], data):
            batches = tz.get_in(("metadata", "batch"), data, [dd.get_sample_name(data)])
            if not isinstance(batches, (tuple, list)):
                batches = [batches]
            else:
                multi_batches.add(dd.get_sample_name(data))
            for batch in batches:
                to_run[batch].append(utils.deepish_copy(data))
        else:
            extras.append([data])
    out = run_parallel("coverage_summary", [[xs] for xs in to_run.values()]) if len(to_run) > 0 else []
    out = _handle_multi_batches(out, multi_batches)
    assert len(out + extras) == len(samples), (len(out + extras), len(samples))
    return out + extras
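
The routing logic splits samples by whether coverage is configured and remembers which ones belong to multiple batches so their results can be reconciled afterwards. A simplified, self-contained sketch (flat keys replace the nested config/metadata lookups, and the multi-batch test here only fires when there is more than one batch):

import collections
import copy

def split_for_coverage(samples):
    # Samples with a coverage target go into per-batch work units;
    # everything else passes straight through as an "extra".
    to_run = collections.defaultdict(list)
    extras, multi_batch = [], set()
    for data in samples:
        if data.get("coverage"):
            batches = data.get("batch") or [data["name"]]
            if not isinstance(batches, (list, tuple)):
                batches = [batches]
            elif len(batches) > 1:
                multi_batch.add(data["name"])
            for b in batches:
                to_run[b].append(copy.deepcopy(data))
        else:
            extras.append([data])
    return dict(to_run), extras, multi_batch

samples = [{"name": "s1", "batch": ["b1", "b2"], "coverage": "panel.bed"},
           {"name": "s2", "coverage": None}]
to_run, extras, multi = split_for_coverage(samples)
print(sorted(to_run), len(extras), multi)  # ['b1', 'b2'] 1 {'s1'}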