Example #1
def summary(samples, run_parallel):
    cmd_name = 'chanjo'
    for data in samples:
        # input
        bam = tz.get_in(["work_bam"], data[0], None)
        sample_name = tz.get_in(['rgnames', 'sample'], data[0], None)
        bed_file = tz.get_in(["config", "algorithm", "coverage"], data[0], None)

        output_dir = os.path.abspath(tz.get_in(['upload', 'dir'], data[0]))
        if not os.path.exists(output_dir):
            safe_makedir(output_dir)

        output = os.path.join(output_dir, sample_name, '{0}-coverage.bed'.format(sample_name))
        if not utils.file_exists(output):
            with file_transaction(data, output) as tx_out_file:
                with codecs.open(bed_file, encoding='utf-8') as bed_stream:
                    with codecs.open(output, "w", encoding='utf-8') as coverage_stream:
                        for line in chanjo.annotate_bed_stream(bed_stream, bam):
                            coverage_stream.write(chanjo.serialize_interval(line))
                            coverage_stream.write('\n')

    out = []
    for data in samples:
        x = data[0]
        sample_name = tz.get_in(['rgnames', 'sample'], x, None)
        output_dir = os.path.abspath(tz.get_in(['upload', 'dir'], x))
        output = os.path.join(output_dir, sample_name, '{0}-coverage.bed'.format(sample_name))
        x["coverage"] = {"summary": output}
        out.append([x])
    return out
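
Every example in this collection leans on toolz.get_in to walk nested configuration dictionaries. A minimal, self-contained sketch of that behavior (illustrative only, not part of the original code; it assumes nothing beyond having toolz installed):

import toolz as tz

data = {"rgnames": {"sample": "NA12878"},
        "config": {"algorithm": {"coverage": "capture.bed"}}}
# A present path returns the nested value.
assert tz.get_in(["rgnames", "sample"], data) == "NA12878"
# A missing path returns the supplied default instead of raising.
assert tz.get_in(["config", "algorithm", "missing_key"], data, None) is None
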
Example #2
def split_variants_by_sample(data):
    """Split a multi-sample call file into inputs for individual samples.

    For tumor/normal paired analyses, do not split the final file and attach
    it to the tumor input.
    """
    # not split, do nothing
    if "group_orig" not in data:
        return [[data]]
    # cancer tumor/normal
    elif (vcfutils.get_paired_phenotype(data)
            and "tumor" in [vcfutils.get_paired_phenotype(d) for d in get_orig_items(data)]):
        out = []
        for i, sub_data in enumerate(get_orig_items(data)):
            if vcfutils.get_paired_phenotype(sub_data) == "tumor":
                cur_batch = tz.get_in(["metadata", "batch"], data)
                if cur_batch:
                    sub_data["metadata"]["batch"] = cur_batch
                sub_data["vrn_file"] = data["vrn_file"]
            else:
                sub_data.pop("vrn_file", None)
            out.append([sub_data])
        return out
    # joint calling or population runs, do not split back up and keep in batches
    else:
        out = []
        for sub_data in get_orig_items(data):
            cur_batch = tz.get_in(["metadata", "batch"], data)
            if cur_batch:
                sub_data["metadata"]["batch"] = cur_batch
            sub_data["vrn_file_batch"] = data["vrn_file"]
            sub_data["vrn_file"] = data["vrn_file"]
            out.append([sub_data])
        return out
Example #3
def create_inputs(data):
    """Index input reads and prepare groups of reads to process concurrently.

    Allows parallelization of alignment beyond processors available on a single
    machine. Uses bgzip and grabix to prepare an indexed fastq file.
    """
    aligner = tz.get_in(("config", "algorithm", "aligner"), data)
    # CRAM files must be converted to bgzipped fastq, unless not aligning.
    # Also need to prep and download remote files.
    if not ("files" in data and aligner and (_is_cram_input(data["files"]) or
                                             objectstore.is_remote(data["files"][0]))):
        # skip indexing on samples without input files or not doing alignment
        # skip if we're not BAM and not doing alignment splitting
        if ("files" not in data or data["files"][0] is None or not aligner
              or _no_index_needed(data)):
            return [[data]]
    ready_files = _prep_grabix_indexes(data["files"], data["dirs"], data)
    data["files"] = ready_files
    # bgzip preparation takes care of converting illumina into sanger format
    data["config"]["algorithm"]["quality_format"] = "standard"
    if tz.get_in(["config", "algorithm", "align_split_size"], data):
        splits = _find_read_splits(ready_files[0], data["config"]["algorithm"]["align_split_size"])
    else:
        splits = [None]
    if len(splits) == 1:
        return [[data]]
    else:
        out = []
        for split in splits:
            cur_data = copy.deepcopy(data)
            cur_data["align_split"] = list(split)
            out.append([cur_data])
        return out
Example #4
def _normalize_cwl_inputs(items):
    """Extract variation and validation data from CWL input list of batched samples.
    """
    with_validate = {}
    vrn_files = []
    ready_items = []
    batch_samples = []
    for data in (cwlutils.normalize_missing(utils.to_single_data(d)) for d in items):
        batch_samples.append(dd.get_sample_name(data))
        if tz.get_in(["config", "algorithm", "validate"], data):
            with_validate[_checksum(tz.get_in(["config", "algorithm", "validate"], data))] = data
        if data.get("vrn_file"):
            vrn_files.append(data["vrn_file"])
        ready_items.append(data)
    if len(with_validate) == 0:
        data = _pick_lead_item(ready_items)
        data["batch_samples"] = batch_samples
        return data
    else:
        assert len(with_validate) == 1, len(with_validate)
        assert len(set(vrn_files)) == 1, set(vrn_files)
        data = _pick_lead_item(with_validate.values())
        data["batch_samples"] = batch_samples
        data["vrn_file"] = vrn_files[0]
        return data
Example #5
def _ready_gzip_fastq(in_files, data):
    """Check if we have gzipped fastq and don't need format conversion or splitting.
    """
    all_gzipped = all([not x or x.endswith(".gz") for x in in_files])
    needs_convert = tz.get_in(["config", "algorithm", "quality_format"], data, "").lower() == "illumina"
    do_splitting = tz.get_in(["config", "algorithm", "align_split_size"], data) is not False
    return all_gzipped and not needs_convert and not do_splitting and not objectstore.is_remote(in_files[0])
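
Note the subtlety in do_splitting above: when align_split_size is absent, tz.get_in returns None, and "None is not False" is True, so splitting is treated as enabled unless the option is explicitly set to False. A small sketch of that behavior (illustrative only, not from the original code):

import toolz as tz

data = {"config": {"algorithm": {}}}
# Missing option -> None -> treated as "splitting on".
assert tz.get_in(["config", "algorithm", "align_split_size"], data) is not False
# Explicitly disabled -> False -> treated as "splitting off".
data["config"]["algorithm"]["align_split_size"] = False
assert not (tz.get_in(["config", "algorithm", "align_split_size"], data) is not False)
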
Example #6
def detect_sv(items, all_items, config):
    """Top level parallel target for examining structural variation.
    """
    svcaller = config["algorithm"].get("svcaller_active")
    out = []
    if svcaller:
        if svcaller in _CALLERS:
            assert len(items) == 1
            data = items[0]
            data["sv"] = _CALLERS[svcaller](data)
            out.append([data])
        elif svcaller in _BATCH_CALLERS:
            if (svcaller in _NEEDS_BACKGROUND and
                  not vcfutils.is_paired_analysis([x.get("align_bam") for x in items], items)):
                names = set([tz.get_in(["rgnames", "sample"], x) for x in items])
                background = [x for x in all_items if tz.get_in(["rgnames", "sample"], x) not in names]
                for svdata in _BATCH_CALLERS[svcaller](items, background):
                    out.append([svdata])
            else:
                for svdata in _BATCH_CALLERS[svcaller](items):
                    out.append([svdata])
        else:
            raise ValueError("Unexpected structural variant caller: %s" % svcaller)
    else:
        out.append(items)
    return out
Example #7
def remove_highdepth_regions(in_file, items):
    """Remove high depth regions from a BED file for analyzing a set of calls.

    Tries to avoid spurious errors and slow run times in collapsed repeat regions.

    Also adds ENCODE blacklist regions which capture additional collapsed repeats
    around centromeres.
    """
    from bcbio.variation import bedutils
    # Use a concrete list (not a lazy filter object) so the ENCODE BED can be appended below.
    highdepth_beds = [b for b in set(tz.get_in(["config", "algorithm", "highdepth_regions"], d)
                                     for d in items) if b is not None]
    encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], items[0])
    if encode_bed and os.path.exists(encode_bed):
        highdepth_beds.append(encode_bed)
    out_file = "%s-glimit%s" % utils.splitext_plus(in_file)
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            with bedtools_tmpdir(items[0]):
                all_file = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0]
                if len(highdepth_beds) > 0:
                    with open(all_file, "w") as out_handle:
                        for line in fileinput.input(highdepth_beds):
                            parts = line.split("\t")
                            out_handle.write("\t".join(parts[:4]).rstrip() + "\n")
                if utils.file_exists(all_file):
                    to_remove = bedutils.sort_merge(all_file, items[0])
                    cmd = "bedtools subtract -nonamecheck -a {in_file} -b {to_remove} > {tx_out_file}"
                    do.run(cmd.format(**locals()), "Remove high depth regions")
                else:
                    utils.symlink_plus(in_file, out_file)
    return out_file
Example #8
def _get_variant_regions(items):
    """Retrieve variant regions defined in any of the input items.
    """
    # Return a concrete list so callers can take len() of the result.
    return [vr for vr in (tz.get_in(("config", "algorithm", "variant_regions"), data)
                          for data in items
                          if tz.get_in(["config", "algorithm", "coverage_interval"], data) != "genome")
            if vr is not None]
Example #9
def prepare_exclude_file(items, base_file, chrom=None):
    """Prepare a BED file for exclusion, incorporating variant regions and chromosome.

    Excludes locally repetitive regions (if `remove_lcr` is set) and
    centromere regions, both of which contribute to long run times and
    false positive structural variant calls.
    """
    out_file = "%s-exclude.bed" % utils.splitext_plus(base_file)[0]
    all_vrs = _get_variant_regions(items)
    ready_region = (shared.subset_variant_regions(tz.first(all_vrs), chrom, base_file, items)
                    if len(all_vrs) > 0 else chrom)
    with shared.bedtools_tmpdir(items[0]):
        # Get a bedtool for the full region if no variant regions
        if ready_region == chrom:
            want_bedtool = callable.get_ref_bedtool(tz.get_in(["reference", "fasta", "base"], items[0]),
                                                    items[0]["config"], chrom)
            lcr_bed = shared.get_lcr_bed(items)
            if lcr_bed:
                want_bedtool = want_bedtool.subtract(pybedtools.BedTool(lcr_bed))
        else:
            want_bedtool = pybedtools.BedTool(ready_region).saveas()
        sv_exclude_bed = _get_sv_exclude_file(items)
        if sv_exclude_bed and len(want_bedtool) > 0:
            want_bedtool = want_bedtool.subtract(sv_exclude_bed).saveas()
        if not utils.file_exists(out_file) and not utils.file_exists(out_file + ".gz"):
            with file_transaction(out_file) as tx_out_file:
                full_bedtool = callable.get_ref_bedtool(tz.get_in(["reference", "fasta", "base"], items[0]),
                                                        items[0]["config"])
                if len(want_bedtool) > 0:
                    full_bedtool.subtract(want_bedtool).saveas(tx_out_file)
                else:
                    full_bedtool.saveas(tx_out_file)
    return out_file
Example #10
def prep_vep_cache(dbkey, ref_file, tooldir=None, config=None):
    """Ensure correct installation of VEP cache file.
    """
    if config is None: config = {}
    resource_file = os.path.join(os.path.dirname(ref_file), "%s-resources.yaml" % dbkey)
    if tooldir:
        os.environ["PERL5LIB"] = "{t}/lib/perl5:{t}/lib/perl5/site_perl:{l}".format(
            t=tooldir, l=os.environ.get("PERL5LIB", ""))
    vepv = vep_version(config)
    if os.path.exists(resource_file) and vepv:
        with open(resource_file) as in_handle:
            resources = yaml.safe_load(in_handle)
        ensembl_name = tz.get_in(["aliases", "ensembl"], resources)
        ensembl_version = tz.get_in(["aliases", "ensembl_version"], resources)
        symlink_dir = _special_dbkey_maps(dbkey, ref_file)
        if symlink_dir:
            return symlink_dir, ensembl_name
        elif ensembl_name:
            vep_dir = utils.safe_makedir(os.path.normpath(os.path.join(
                os.path.dirname(os.path.dirname(ref_file)), "vep")))
            out_dir = os.path.join(vep_dir, ensembl_name, vepv)
            if not os.path.exists(out_dir):
                cmd = ["vep_install.pl", "-a", "c", "-s", ensembl_name,
                       "-c", vep_dir]
                if ensembl_version:
                    cmd += ["-v", ensembl_version]
                do.run(cmd, "Prepare VEP directory for %s" % ensembl_name)
                cmd = ["vep_convert_cache.pl", "-species", ensembl_name, "-version", vepv,
                       "-d", vep_dir]
                do.run(cmd, "Convert VEP cache to tabix %s" % ensembl_name)
            tmp_dir = os.path.join(vep_dir, "tmp")
            if os.path.exists(tmp_dir):
                shutil.rmtree(tmp_dir)
            return vep_dir, ensembl_name
    return None, None
Example #11
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, region, out_file):
    """Shared preparation work for GATK variant calling.
    """
    data = items[0]
    config = data["config"]
    broad_runner = broad.runner_from_path("picard", config)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, config)
    params = ["-R", ref_file]
    coverage_depth_min = tz.get_in(["algorithm", "coverage_depth_min"], config)
    if coverage_depth_min and coverage_depth_min < 4:
        confidence = "4.0"
        params += ["--standard_min_confidence_threshold_for_calling", confidence,
                   "--standard_min_confidence_threshold_for_emitting", confidence]
    for a in annotation.get_gatk_annotations(config):
        params += ["--annotation", a]
    for x in align_bams:
        params += ["-I", x]
    if dbsnp:
        params += ["--dbsnp", dbsnp]
    variant_regions = tz.get_in(["algorithm", "variant_regions"], config)
    region = subset_variant_regions(variant_regions, region, out_file, items)
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    broad_runner = broad.runner_from_config(config)
    return broad_runner, params
Example #12
def _get_caller(data):
    callers = [
        tz.get_in(["config", "algorithm", "jointcaller"], data),
        tz.get_in(["config", "algorithm", "variantcaller"], data),
        "precalled",
    ]
    return [c for c in callers if c][0]
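
The list-then-filter idiom above returns the first configured caller and falls back to "precalled". A hedged sketch with invented inputs (the helper is re-defined locally purely for illustration):

import toolz as tz

def _get_caller(data):
    callers = [tz.get_in(["config", "algorithm", "jointcaller"], data),
               tz.get_in(["config", "algorithm", "variantcaller"], data),
               "precalled"]
    return [c for c in callers if c][0]

assert _get_caller({"config": {"algorithm": {"variantcaller": "gatk-haplotype"}}}) == "gatk-haplotype"
assert _get_caller({"config": {"algorithm": {}}}) == "precalled"
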
Example #13
def _extra_vars(args, cluster_config):
    # nfs_server and nfs_clients are defined in the enclosing scope in the original code.
    return {"encrypted_mount": "/encrypted",
            "nfs_server": nfs_server,
            "nfs_clients": ",".join(nfs_clients),
            "login_user": tz.get_in(["nodes", "frontend", "login"], cluster_config),
            "encrypted_device": tz.get_in(["nodes", "frontend", "encrypted_volume_device"],
                                          cluster_config, "/dev/xvdf")}
Example #14
def prepare_exclude_file(items, base_file, chrom=None):
    """Prepare a BED file for exclusion.

    Excludes high depth and centromere regions which contribute to long run times and
    false positive structural variant calls.
    """
    out_file = "%s-exclude%s.bed" % (utils.splitext_plus(base_file)[0], "-%s" % chrom if chrom else "")
    if not utils.file_exists(out_file) and not utils.file_exists(out_file + ".gz"):
        with shared.bedtools_tmpdir(items[0]):
            # Get a bedtool for the full region if no variant regions
            want_bedtool = callable.get_ref_bedtool(tz.get_in(["reference", "fasta", "base"], items[0]),
                                                    items[0]["config"], chrom)
            if chrom:
                want_bedtool = pybedtools.BedTool(shared.subset_bed_by_chrom(want_bedtool.saveas().fn,
                                                                             chrom, items[0]))
            sv_exclude_bed = _get_sv_exclude_file(items)
            if sv_exclude_bed and len(want_bedtool) > 0:
                want_bedtool = want_bedtool.subtract(sv_exclude_bed, nonamecheck=True).saveas()
            if any(dd.get_coverage_interval(d) == "genome" for d in items):
                want_bedtool = pybedtools.BedTool(shared.remove_highdepth_regions(want_bedtool.saveas().fn, items))
            with file_transaction(items[0], out_file) as tx_out_file:
                full_bedtool = callable.get_ref_bedtool(tz.get_in(["reference", "fasta", "base"], items[0]),
                                                        items[0]["config"])
                if len(want_bedtool) > 0:
                    full_bedtool.subtract(want_bedtool, nonamecheck=True).saveas(tx_out_file)
                else:
                    full_bedtool.saveas(tx_out_file)
    return out_file
Example #15
def batch_for_variantcall(samples):
    """Prepare a set of samples for parallel variant calling.

    CWL input target that groups samples into batches and variant callers
    for parallel processing.
    """
    convert_to_list = set(["config__algorithm__tools_on", "config__algorithm__tools_off"])
    to_process, extras = _dup_samples_by_variantcaller(samples, require_bam=False)
    batch_groups = collections.defaultdict(list)
    to_process = [utils.to_single_data(x) for x in to_process]
    all_keys = set([])
    for data in to_process:
        all_keys.update(set(data["cwl_keys"]))
    for data in to_process:
        for raw_key in sorted(list(all_keys)):
            key = raw_key.split("__")
            if tz.get_in(key, data) is None:
                data = tz.update_in(data, key, lambda x: None)
                data["cwl_keys"].append(raw_key)
            if raw_key in convert_to_list:
                val = tz.get_in(key, data)
                if not val: val = []
                elif not isinstance(val, (list, tuple)): val = [val]
                data = tz.update_in(data, key, lambda x: val)
        vc = get_variantcaller(data, require_bam=False)
        batches = dd.get_batches(data) or dd.get_sample_name(data)
        if not isinstance(batches, (list, tuple)):
            batches = [batches]
        for b in batches:
            batch_groups[(b, vc)].append(utils.deepish_copy(data))
    return list(batch_groups.values()) + extras
Example #16
def _square_batch_bcbio_variation(data, region, bam_files, vrn_files, out_file,
                                  todo="square"):
    """Run squaring or merging analysis using bcbio.variation.recall.
    """
    ref_file = tz.get_in(("reference", "fasta", "base"), data)
    cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
    resources = config_utils.get_resources("bcbio-variation-recall", data["config"])
    # adjust memory by cores but leave room for run program memory
    memcores = int(math.ceil(float(cores) / 5.0))
    jvm_opts = config_utils.adjust_opts(resources.get("jvm_opts", ["-Xms250m", "-Xmx2g"]),
                                        {"algorithm": {"memory_adjust": {"direction": "increase",
                                                                         "magnitude": memcores}}})
    # Write unique VCFs and BAMs to input file
    input_file = "%s-inputs.txt" % os.path.splitext(out_file)[0]
    with open(input_file, "w") as out_handle:
        out_handle.write("\n".join(sorted(list(set(vrn_files)))) + "\n")
        if todo == "square":
            out_handle.write("\n".join(sorted(list(set(bam_files)))) + "\n")
    variantcaller = tz.get_in(("config", "algorithm", "jointcaller"), data).replace("-joint", "")
    cmd = ["bcbio-variation-recall", todo] + jvm_opts + broad.get_default_jvm_opts() + \
          ["-c", cores, "-r", bamprep.region_to_gatk(region)]
    if todo == "square":
        cmd += ["--caller", variantcaller]
    cmd += [out_file, ref_file, input_file]
    do.run(cmd, "%s in region: %s" % (cmd, bamprep.region_to_gatk(region)))
    return out_file
Example #17
def calculate_sv_coverage(data):
    """Calculate coverage within bins for downstream CNV calling.

    Creates corrected cnr files with log2 ratios and depths.
    """
    from bcbio.variation import coverage
    from bcbio.structural import annotate, cnvkit
    data = utils.to_single_data(data)
    if not cnvkit.use_general_sv_bins(data):
        return [[data]]
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural",
                                               dd.get_sample_name(data), "bins"))
    out_target_file = os.path.join(work_dir, "%s-target-coverage.cnn" % dd.get_sample_name(data))
    out_anti_file = os.path.join(work_dir, "%s-antitarget-coverage.cnn" % dd.get_sample_name(data))
    if ((not utils.file_exists(out_target_file) or not utils.file_exists(out_anti_file))
          and (dd.get_align_bam(data) or dd.get_work_bam(data))):
        # mosdepth
        target_cov = coverage.run_mosdepth(data, "target", tz.get_in(["regions", "bins", "target"], data))
        anti_cov = coverage.run_mosdepth(data, "antitarget", tz.get_in(["regions", "bins", "antitarget"], data))
        target_cov_genes = annotate.add_genes(target_cov.regions, data, max_distance=0)
        anti_cov_genes = annotate.add_genes(anti_cov.regions, data, max_distance=0)
        out_target_file = _add_log2_depth(target_cov_genes, out_target_file, data)
        out_anti_file = _add_log2_depth(anti_cov_genes, out_anti_file, data)
        # TODO: Correct for GC bias
    if os.path.exists(out_target_file):
        data["depth"]["bins"] = {"target": out_target_file, "antitarget": out_anti_file}
    return [[data]]
Example #18
def _run_cnvkit_shared(data, test_bams, background_bams, access_file, work_dir,
                       background_name=None):
    """Shared functionality to run CNVkit.
    """
    ref_file = dd.get_ref_file(data)
    raw_work_dir = os.path.join(work_dir, "raw")
    out_base = os.path.splitext(os.path.basename(test_bams[0]))[0]
    background_cnn = "%s_background.cnn" % (background_name if background_name else "flat")
    if not utils.file_exists(os.path.join(raw_work_dir, "%s.cnr" % out_base)):
        if os.path.exists(raw_work_dir):
            shutil.rmtree(raw_work_dir)
        with tx_tmpdir(data, work_dir) as tx_work_dir:
            target_bed = tz.get_in(["config", "algorithm", "variant_regions"], data)
            cores = min(tz.get_in(["config", "algorithm", "num_cores"], data, 1),
                        len(test_bams) + len(background_bams))
            cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "batch"] + \
                  test_bams + ["-n"] + background_bams + ["-f", ref_file] + \
                  ["--targets", target_bed, "--access", access_file,
                   "-d", tx_work_dir, "--split", "-p", str(cores),
                   "--output-reference", os.path.join(tx_work_dir, background_cnn)]
            at_avg, at_min, t_avg = _get_antitarget_size(access_file, target_bed)
            if at_avg:
                cmd += ["--antitarget-avg-size", str(at_avg), "--antitarget-min-size", str(at_min),
                        "--target-avg-size", str(t_avg)]
            local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                         "lib", "R", "site-library")
            cmd += ["--rlibpath", local_sitelib]
            do.run(cmd, "CNVkit batch")
            shutil.move(tx_work_dir, raw_work_dir)
    return {"cnr": os.path.join(raw_work_dir, "%s.cnr" % out_base),
            "cns": os.path.join(raw_work_dir, "%s.cns" % out_base),
            "back_cnn": os.path.join(raw_work_dir, background_cnn)}
Example #19
def run(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.
    """
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    report_file = os.path.join(out_dir, "qualimapReport.html")
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(report_file) and not utils.file_exists(os.path.join(out_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
            ds_bam = bam_file
        else:
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            bam_file = ds_bam if ds_bam else bam_file
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)
        utils.safe_makedir(out_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                             num_cores)
        cmd = ("unset DISPLAY && {qualimap} bamqc -bam {bam_file} -outdir {out_dir} "
               "-nt {num_cores} --java-mem-size={max_mem} {options}")
        species = tz.get_in(("genome_resources", "aliases", "ensembl"), data, "")
        if species in ["HUMAN", "MOUSE"]:
            cmd += " -gd {species}"
        regions = bedutils.merge_overlaps(dd.get_variant_regions(data), data)
        if regions:
            bed6_regions = _bed_to_bed6(regions, out_dir)
            cmd += " -gff {bed6_regions}"
        do.run(cmd.format(**locals()), "Qualimap: %s" % dd.get_sample_name(data))

    return _parse_qualimap_metrics(report_file)
Example #20
def samples_to_records(samples, default_keys=None):
    """Convert samples into output CWL records.
    """
    from bcbio.pipeline import run_info
    RECORD_CONVERT_TO_LIST = set(["config__algorithm__tools_on", "config__algorithm__tools_off",
                                  "reference__genome_context"])
    all_keys = _get_all_cwlkeys(samples, default_keys)
    out = []
    for data in samples:
        for raw_key in sorted(list(all_keys)):
            key = raw_key.split("__")
            if tz.get_in(key, data) is None:
                data = tz.update_in(data, key, lambda x: None)
            if raw_key not in data["cwl_keys"]:
                data["cwl_keys"].append(raw_key)
            if raw_key in RECORD_CONVERT_TO_LIST:
                val = tz.get_in(key, data)
                if not val: val = []
                elif not isinstance(val, (list, tuple)): val = [val]
                data = tz.update_in(data, key, lambda x: val)
            # Booleans are problematic for CWL serialization, convert into string representation
            if isinstance(tz.get_in(key, data), bool):
                data = tz.update_in(data, key, lambda x: str(tz.get_in(key, data)))
        data["metadata"] = run_info.add_metadata_defaults(data.get("metadata", {}))
        out.append(data)
    return out
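
samples_to_records relies on tz.update_in, which returns a new nested structure rather than mutating in place, which is why its result is re-assigned to data on every call. A minimal sketch (illustrative only, not from the original code):

import toolz as tz

data = {"config": {"algorithm": {"tools_on": None}}}
updated = tz.update_in(data, ["config", "algorithm", "tools_on"], lambda x: x or [])
# update_in copies along the key path; the original dictionary is untouched.
assert data["config"]["algorithm"]["tools_on"] is None
assert updated["config"]["algorithm"]["tools_on"] == []
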
Example #21
def assign_complex_to_samples(items):
    """Assign complex inputs like variants and align outputs to samples.

    Handles list inputs to record conversion where we have inputs from multiple
    locations and need to ensure they are properly assigned to samples in many
    environments.

    The unpleasant approach here is to use standard file naming to match
    with samples so this can work in environments where we don't download/stream
    the input files (for space/time savings).
    """
    extract_fns = {("variants", "samples"): _get_vcf_samples,
                   ("align_bam",): _get_bam_samples}
    complex = {k: {} for k in extract_fns.keys()}
    for data in items:
        for k in complex:
            v = tz.get_in(k, data)
            if v is not None:
                for s in extract_fns[k](v, items):
                    if s:
                        complex[k][s] = v
    out = []
    for data in items:
        for k in complex:
            newv = tz.get_in([k, dd.get_sample_name(data)], complex)
            if newv:
                data = tz.update_in(data, k, lambda x: newv)
        out.append(data)
    return out
Example #22
def run_vep(data):
    """Annotate input VCF file with Ensembl variant effect predictor.
    """
    out_file = utils.append_stem(data["vrn_file"], "-vepeffects")
    assert data["vrn_file"].endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(data["genome_build"],
                                                   tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = ["--fork", str(cores)] if cores > 1 else []
                vep = config_utils.get_program("variant_effect_predictor.pl", data["config"])
                dbnsfp_args, dbnsfp_fields = _get_dbnsfp(data)
                loftee_args, loftee_fields = _get_loftee(data)
                std_fields = ["Consequence", "Codons", "Amino_acids", "Gene", "SYMBOL", "Feature",
                              "EXON", "PolyPhen", "SIFT", "Protein_position", "BIOTYPE", "CANONICAL", "CCDS"]
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                cmd = [vep, "--vcf", "-o", "stdout"] + fork_args + extra_args + \
                      ["--species", ensembl_name,
                       "--no_stats",
                       "--cache", "--offline", "--dir", vep_dir,
                       "--sift", "b", "--polyphen", "b", "--symbol", "--numbers", "--biotype", "--total_length",
                       "--canonical", "--ccds",
                       "--fields", ",".join(std_fields + dbnsfp_fields + loftee_fields)] + dbnsfp_args + loftee_args
                cmd = "gunzip -c %s | %s | bgzip -c > %s" % (data["vrn_file"], " ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file
Example #23
def _cram_to_fastq_regions(regions, cram_file, dirs, data):
    """Convert CRAM files to fastq, potentially within sub regions.

    Returns multiple fastq files that can be merged back together.
    """
    base_name = utils.splitext_plus(os.path.basename(cram_file))[0]
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep",
                                               "%s-parts" % base_name))
    ref_file = tz.get_in(["reference", "fasta", "base"], data)
    resources = config_utils.get_resources("bamtofastq", data["config"])
    cores = tz.get_in(["config", "algorithm", "num_cores"], data, 1)
    max_mem = int(resources.get("memory", "1073741824")) * cores  # 1Gb/core default
    fnames = []
    is_paired = False
    for region in regions:
        rext = "-%s" % region.replace(":", "_").replace("-", "_") if region else "full"
        out_s, out_p1, out_p2 = [os.path.join(work_dir, "%s%s-%s.fq.gz" %
                                              (base_name, rext, fext))
                                 for fext in ["s1", "p1", "p2"]]
        if not utils.file_exists(out_p1):
            with file_transaction(out_s, out_p1, out_p2) as (tx_out_s, tx_out_p1, tx_out_p2):
                sortprefix = "%s-sort" % utils.splitext_plus(tx_out_s)[0]
                cmd = ("bamtofastq filename={cram_file} inputformat=cram T={sortprefix} "
                       "gz=1 collate=1 colsbs={max_mem} "
                       "F={tx_out_p1} F2={tx_out_p2} S={tx_out_s} O=/dev/null O2=/dev/null "
                       "reference={ref_file}")
                if region:
                    cmd += " ranges='{region}'"
                do.run(cmd.format(**locals()), "CRAM to fastq %s" % region if region else "")
        if is_paired or not _is_gzip_empty(out_p1):
            fnames.append((out_p1, out_p2))
            is_paired = True
        else:
            fnames.append((out_s,))
    return fnames
Example #24
def get_recipes(path=None):
    """Get all the available conda recipes.

    Returns a namedtuple which contains the following keys:
        :name:      the name of the recipe
        :path:      the path for the package
        :version:   the version of the recipe
        :build:     the number of builds for the current version
    """
    path = path or CONFIG["abspath"]
    recipes = []

    for recipe in RECIPE_ORDER:
        recipe_path = os.path.join(path, recipe, "meta.yaml")

        if not os.path.exists(recipe_path):
            print("[x] Missing meta.yaml for {recipe}.".format(recipe=recipe))
            continue

        output_path, _ = execute(["conda", "build", recipe, "--output", "--numpy", CONFIG["numpy"]], cwd=path)

        with open(recipe_path, "r") as recipe_handle:
            config = yaml.safe_load(recipe_handle)
            recipes.append(
                RECIPE(
                    name=recipe,
                    path=output_path.strip(),
                    version=toolz.get_in(["package", "version"], config),
                    build=toolz.get_in(["build", "number"], config, 0),
                )
            )
    return recipes
Example #25
def _bgzip_from_cram(cram_file, dirs, data):
    """Create bgzipped fastq files from an input CRAM file in regions of interest.

    Returns a list with a single file, for single end CRAM files, or two
    files for paired end input.
    """
    region_file = (tz.get_in(["config", "algorithm", "variant_regions"], data)
                   if tz.get_in(["config", "algorithm", "coverage_interval"], data) in ["regional", "exome"]
                   else None)
    if region_file:
        regions = ["%s:%s-%s" % tuple(r) for r in pybedtools.BedTool(region_file)]
    else:
        regions = [None]
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_s, out_p1, out_p2 = [os.path.join(work_dir, "%s-%s.fq.gz" %
                                          (utils.splitext_plus(os.path.basename(cram_file))[0], fext))
                             for fext in ["s1", "p1", "p2"]]
    if not utils.file_exists(out_s) and not utils.file_exists(out_p1):
        cram.index(cram_file)
        fastqs = _cram_to_fastq_regions(regions, cram_file, dirs, data)
        if len(fastqs[0]) == 1:
            with file_transaction(out_s) as tx_out_file:
                _merge_and_bgzip([xs[0] for xs in fastqs], tx_out_file, out_s)
        else:
            for i, out_file in enumerate([out_p1, out_p2]):
                ext = "/%s" % (i + 1)
                with file_transaction(out_file) as tx_out_file:
                    _merge_and_bgzip([xs[i] for xs in fastqs], tx_out_file, out_file, ext)
    if utils.file_exists(out_p1):
        return [out_p1, out_p2]
    else:
        assert utils.file_exists(out_s)
        return [out_s]
Example #26
def _meta_to_version(in_file):
    """Extract version information from meta description file.
    """
    with open(in_file) as in_handle:
        config = yaml.safe_load(in_handle)
    return (tz.get_in(["package", "version"], config),
            tz.get_in(["build", "number"], config, 0))
Example #27
def _run_cnvkit_shared(data, test_bams, background_bams, access_file, work_dir,
                       background_name=None):
    """Shared functionality to run CNVkit.
    """
    ref_file = dd.get_ref_file(data)
    raw_work_dir = os.path.join(work_dir, "raw")
    out_base = os.path.splitext(os.path.basename(test_bams[0]))[0]
    background_cnn = "%s_background.cnn" % (background_name if background_name else "flat")
    if not utils.file_exists(os.path.join(raw_work_dir, "%s.cnr" % out_base)):
        with tx_tmpdir(data, work_dir) as tx_work_dir:
            target_bed = tz.get_in(["config", "algorithm", "variant_regions"], data)
            cmd = ["batch"] + test_bams + ["-n"] + background_bams + ["-f", ref_file] + \
                  ["--targets", target_bed, "--access", access_file,
                   "-d", raw_work_dir, "--split",
                   "-p", str(tz.get_in(["config", "algorithm", "num_cores"], data, 1)),
                   "--output-reference", os.path.join(raw_work_dir, background_cnn)]
            at_avg, at_min, t_avg = _get_antitarget_size(access_file, target_bed)
            if at_avg:
                cmd += ["--antitarget-avg-size", str(at_avg), "--antitarget-min-size", str(at_min),
                        "--target-avg-size", str(t_avg)]
            args = cnvlib_cmd.parse_args(cmd)
            args.func(args)
            shutil.move(tx_work_dir, raw_work_dir)
    return {"cnr": os.path.join(raw_work_dir, "%s.cnr" % out_base),
            "cns": os.path.join(raw_work_dir, "%s.cns" % out_base),
            "back_cnn": os.path.join(raw_work_dir, background_cnn)}
Example #28
def align_to_sort_bam(fastq1, fastq2, aligner, data):
    """Align to the named genome build, returning a sorted BAM file.
    """
    names = data["rgnames"]
    align_dir_parts = [data["dirs"]["work"], "align", names["sample"]]
    if data.get("disambiguate"):
        align_dir_parts.append(data["disambiguate"]["genome_build"])
    align_dir = utils.safe_makedir(os.path.join(*align_dir_parts))
    aligner_indexes = os.path.commonprefix(tz.get_in(("reference", aligner, "indexes"), data))
    if aligner_indexes.endswith("."):
        aligner_indexes = aligner_indexes[:-1]
    ref_file = tz.get_in(("reference", "fasta", "base"), data)
    if fastq1.endswith(".bam"):
        data = _align_from_bam(fastq1, aligner, aligner_indexes, ref_file,
                               names, align_dir, data)
    else:
        data = _align_from_fastq(fastq1, fastq2, aligner, aligner_indexes, ref_file,
                                 names, align_dir, data)
    if data["work_bam"] and utils.file_exists(data["work_bam"]):
        bam.index(data["work_bam"], data["config"])
        for extra in ["-sr", "-disc"]:
            extra_bam = utils.append_stem(data['work_bam'], extra)
            if utils.file_exists(extra_bam):
                bam.index(extra_bam, data["config"])
    return data
Example #29
def get_analysis_intervals(data, vrn_file, base_dir):
    """Retrieve analysis regions for the current variant calling pipeline.
    """
    from bcbio.bam import callable
    if vrn_file and vcfutils.is_gvcf_file(vrn_file):
        callable_bed = _callable_from_gvcf(data, vrn_file, base_dir)
        if callable_bed:
            return callable_bed

    if data.get("ensemble_bed"):
        return data["ensemble_bed"]
    elif dd.get_sample_callable(data):
        return dd.get_sample_callable(data)
    elif data.get("align_bam"):
        return callable.sample_callable_bed(data["align_bam"], dd.get_ref_file(data), data)[0]
    elif data.get("work_bam"):
        return callable.sample_callable_bed(data["work_bam"], dd.get_ref_file(data), data)[0]
    elif data.get("work_bam_callable"):
        data = utils.deepish_copy(data)
        data["work_bam"] = data.pop("work_bam_callable")
        return callable.sample_callable_bed(data["work_bam"], dd.get_ref_file(data), data)[0]
    elif tz.get_in(["config", "algorithm", "callable_regions"], data):
        return tz.get_in(["config", "algorithm", "callable_regions"], data)
    elif tz.get_in(["config", "algorithm", "variant_regions"], data):
        return tz.get_in(["config", "algorithm", "variant_regions"], data)
Example #30
def test_add_ingest_and_coordinator_nodes_does_not_restart_master_or_data_nodes() -> None:
    initial_master_task_ids = sdk_tasks.get_task_ids(service_name, "master")
    initial_data_task_ids = sdk_tasks.get_task_ids(service_name, "data")

    # Get service configuration.
    _, svc_config, _ = sdk_cmd.svc_cli(package_name, service_name, "describe", parse_json=True)

    ingest_nodes_count = get_in(["ingest_nodes", "count"], svc_config)
    coordinator_nodes_count = get_in(["coordinator_nodes", "count"], svc_config)

    global current_expected_task_count

    sdk_service.update_configuration(
        package_name,
        service_name,
        {
            "ingest_nodes": {"count": ingest_nodes_count + 1},
            "coordinator_nodes": {"count": coordinator_nodes_count + 1},
        },
        current_expected_task_count,
        # As of 2018-12-14, sdk_upgrade's `wait_for_deployment` has different behavior than
        # sdk_install's (which is what we wanted here), so don't use it. Check manually afterwards
        # with `sdk_tasks.check_running`.
        wait_for_deployment=False,
    )

    # Should be running 2 tasks more.
    current_expected_task_count += 2
    sdk_tasks.check_running(service_name, current_expected_task_count)
    # Master nodes should not restart.
    sdk_tasks.check_tasks_not_updated(service_name, "master", initial_master_task_ids)
    # Data nodes should not restart.
    sdk_tasks.check_tasks_not_updated(service_name, "data", initial_data_task_ids)
Example #31
def get_background_cnv_reference(data, caller):
    out = tz.get_in(["config", "algorithm", "background", "cnv_reference"],
                    data)
    if out:
        return out.get(caller) if isinstance(out, dict) else out
Example #32
def _sv_workdir(data):
    return utils.safe_makedir(os.path.join(data["dirs"]["work"], "structural",
                                           tz.get_in(["rgnames", "sample"], data),
                                           "cnvkit"))
Example #33
def get_resources(name, config):
    """Retrieve resources for a program, pulling from multiple config sources.
    """
    return tz.get_in(["resources", name], config,
                     tz.get_in(["resources", "default"], config, {}))
Example #34
def get_keys(lookup):
    """
    return the keys used to look up a function in the datadict
    """
    return tz.get_in((lookup, "keys"), LOOKUPS, None)
Example #35
def present(config):
    # keys is defined in the enclosing scope in the original code.
    try:
        value = tz.get_in(keys, config, no_default=True)
    except (KeyError, IndexError, TypeError):
        value = False
    return bool(value)
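
With no_default=True, tz.get_in raises for a missing path instead of returning None; the try/except above converts that into a boolean. A minimal sketch of that behavior (illustrative only):

import toolz as tz

config = {"a": {"b": 1}}
assert tz.get_in(["a", "b"], config, no_default=True) == 1
raised = False
try:
    tz.get_in(["a", "missing"], config, no_default=True)
except KeyError:
    raised = True
assert raised
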
Example #36
def _find_shared_batch(samples):
    for data in samples:
        batch = tz.get_in(["metadata", "batch"], data,
                          dd.get_sample_name(data))
        if not isinstance(batch, (list, tuple)):
            return batch
Example #37
def test_dedup():
    ds0 = SimpleDocNav(gen_dataset_test_dag(1, force_tree=True))

    # make sure ds0 has duplicate C nodes with equivalent data
    assert ds0.sources['ab'].sources['bc'].doc is not ds0.sources['ac'].doc
    assert ds0.sources['ab'].sources['bc'].doc == ds0.sources['ac'].doc

    ds = SimpleDocNav(dedup_lineage(ds0))
    assert ds.sources['ab'].sources['bc'].doc is ds.sources['ac'].doc
    assert ds.sources['ab'].sources['bc'].sources['cd'].doc is ds.sources[
        'ac'].sources['cd'].doc

    # again but with raw doc
    ds = SimpleDocNav(dedup_lineage(ds0.doc))
    assert ds.sources['ab'].sources['bc'].doc is ds.sources['ac'].doc
    assert ds.sources['ab'].sources['bc'].sources['cd'].doc is ds.sources[
        'ac'].sources['cd'].doc

    # Test that we detect inconsistent metadata for duplicate entries (test 1)
    # test: different values in the same spot
    ds0 = SimpleDocNav(gen_dataset_test_dag(3, force_tree=True))
    ds0.sources['ac'].doc['label'] = 'Modified'
    ds0 = SimpleDocNav(ds0.doc)
    assert ds0.sources['ab'].sources['bc'].doc != ds0.sources['ac'].doc

    with pytest.raises(InvalidDocException, match=r'Inconsistent metadata .*'):
        dedup_lineage(ds0)

    # Test that we detect inconsistent metadata for duplicate entries (test 2)
    # test: different sources structure
    ds0 = SimpleDocNav(gen_dataset_test_dag(3, force_tree=True))
    ds0.sources['ac'].doc['lineage']['source_datasets']['extra'] = ds0.sources[
        'ae'].doc.copy()
    assert ds0.sources['ab'].sources['bc'].doc != ds0.sources['ac'].doc

    ds0 = SimpleDocNav(ds0.doc)

    with pytest.raises(InvalidDocException, match=r'Inconsistent lineage .*'):
        dedup_lineage(ds0)

    # Test that we detect inconsistent lineage subtrees for duplicate entries

    # Subtest 1: different set of keys
    ds0 = SimpleDocNav(gen_dataset_test_dag(7, force_tree=True))
    srcs = toolz.get_in(ds0.sources_path, ds0.sources['ac'].doc)

    assert 'cd' in srcs
    srcs['cd'] = {}
    ds0 = SimpleDocNav(ds0.doc)

    with pytest.raises(InvalidDocException, match=r'Inconsistent lineage .*'):
        dedup_lineage(ds0)

    # Subtest 2: different values for "child" nodes
    ds0 = SimpleDocNav(gen_dataset_test_dag(7, force_tree=True))
    srcs = toolz.get_in(ds0.sources_path, ds0.sources['ac'].doc)

    assert 'cd' in srcs
    srcs['cd']['id'] = '7fe57724-ed44-4beb-a3ab-c275339049be'
    ds0 = SimpleDocNav(ds0.doc)

    with pytest.raises(InvalidDocException, match=r'Inconsistent lineage .*'):
        dedup_lineage(ds0)

    # Subtest 3: different name for child
    ds0 = SimpleDocNav(gen_dataset_test_dag(7, force_tree=True))
    srcs = toolz.get_in(ds0.sources_path, ds0.sources['ac'].doc)

    assert 'cd' in srcs
    srcs['CD'] = srcs['cd']
    del srcs['cd']
    ds0 = SimpleDocNav(ds0.doc)

    with pytest.raises(InvalidDocException, match=r'Inconsistent lineage .*'):
        dedup_lineage(ds0)
Example #38
def _create_config_file(out_dir, samples):
    """Provide configuration file hiding duplicate columns.

    Future entry point for providing top level configuration of output reports.
    """
    out_file = os.path.join(out_dir, "multiqc_config.yaml")
    out = {"table_columns_visible": dict()}

    # Avoid duplicated bcbio columns with qualimap
    if any(("qualimap" in dd.get_tools_on(d) or "qualimap_full" in dd.get_tools_on(d)) for d in samples):
        # Hiding metrics duplicated by Qualimap
        out["table_columns_visible"]["bcbio"] = {"Average_insert_size": False}
        out["table_columns_visible"]["FastQC"] = {"percent_gc": False}

        # Setting up thresholds for Qualimap depth cutoff calculations, based on sample avg depths
        avg_depths = [tz.get_in(["summary", "metrics", "Avg_coverage"], s) for s in samples]
        # Picking all thresholds up to the highest sample average depth
        thresholds = [t for t in coverage.DEPTH_THRESHOLDS if t <= max(avg_depths)]
        # ...plus one more
        if len(thresholds) < len(coverage.DEPTH_THRESHOLDS):
            thresholds.append(coverage.DEPTH_THRESHOLDS[len(thresholds)])

        # Showing only thresholds surrounding any of average depths
        thresholds_hidden = []
        for i, t in enumerate(thresholds):
            if t > 20:  # Not hiding anything below 20x
                if any(thresholds[i-1] <= c < thresholds[i] for c in avg_depths if c and i-1 >= 0) or \
                   any(thresholds[i] <= c < thresholds[i+1] for c in avg_depths if c and i+1 < len(thresholds)):
                    pass
                else:
                    thresholds_hidden.append(t)

        # Hide coverage unless running full qualimap, downsampled inputs are confusing
        if not any(("qualimap_full" in dd.get_tools_on(d)) for d in samples):
            thresholds_hidden = thresholds + thresholds_hidden
            thresholds_hidden.sort()
            thresholds = []
        out['qualimap_config'] = {
            'general_stats_coverage': [str(t) for t in thresholds],
            'general_stats_coverage_hidden': [str(t) for t in thresholds_hidden]}

    # Avoid confusing peddy outputs, sticking to ancestry and sex prediction
    out["table_columns_visible"]["Peddy"] = {"family_id": False, "sex_het_ratio": False,
                                             "error_sex_check": False}

    # Setting the module order
    module_order = []
    module_order.extend([
        "bcbio",
        "samtools",
        "goleft_indexcov",
        "peddy"
    ])
    out['bcftools'] = {'write_separate_table': True}
    # if germline calling was performed:
    if any("germline" in (get_active_vcinfo(s) or {})  # tumor-only somatic with germline extraction
           or dd.get_phenotype(s) == "germline"        # or paired somatic with germline calling for normal
           for s in samples):
        # Split somatic and germline variant stats into separate multiqc submodules,
        # with somatic going into General Stats, and germline going into a separate table:
        module_order.extend([{
            'bcftools': {
                'name': 'Bcftools (somatic)',
                'info': 'Bcftools stats for somatic variant calls only.',
                'path_filters': ['*_bcftools_stats.txt'],
                'write_general_stats': True,
            }},
            {'bcftools': {
                'name': 'Bcftools (germline)',
                'info': 'Bcftools stats for germline variant calls only.',
                'path_filters': ['*_bcftools_stats_germline.txt'],
                'write_general_stats': False
            }},
        ])
    else:
        module_order.append("bcftools")
    module_order.extend([
        "picard",
        "qualimap",
        "snpeff",
        "fastqc",
        "preseq",
    ])
    out["module_order"] = module_order

    preseq_samples = [s for s in samples if tz.get_in(["config", "algorithm", "preseq"], s)]
    if preseq_samples:
        out["preseq"] = _make_preseq_multiqc_config(preseq_samples)

    with open(out_file, "w") as out_handle:
        yaml.safe_dump(out, out_handle, default_flow_style=False, allow_unicode=False)
    return out_file
Example #39
def run_vep(in_file, data):
    """Annotate input VCF file with Ensembl variant effect predictor.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(
                data["genome_build"],
                tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data,
                                  1)
                fork_args = ["--fork", str(cores)] if cores > 1 else []
                vep = config_utils.get_program("vep", data["config"])
                is_human = tz.get_in(["genome_resources", "aliases", "human"],
                                     data, False)
                # HGVS requires a bgzip compressed, faidx indexed input file or is unusable slow
                if dd.get_ref_file_compressed(data):
                    hgvs_compatible = True
                    config_args = ["--fasta", dd.get_ref_file_compressed(data)]
                else:
                    hgvs_compatible = False
                    config_args = ["--fasta", dd.get_ref_file(data)]
                if is_human:
                    plugin_fns = {
                        "loftee": _get_loftee,
                        "maxentscan": _get_maxentscan,
                        "genesplicer": _get_genesplicer,
                        "spliceregion": _get_spliceregion
                    }
                    plugins = ["loftee"]
                    if "vep_splicesite_annotations" in dd.get_tools_on(data):
                        # "genesplicer" too unstable so currently removed
                        plugins += ["maxentscan", "spliceregion"]
                    for plugin in plugins:
                        plugin_args = plugin_fns[plugin](data)
                        config_args += plugin_args
                    config_args += ["--sift", "b", "--polyphen", "b"]
                    if hgvs_compatible:
                        config_args += ["--hgvs", "--shift_hgvs", "1"]
                if (dd.get_effects_transcripts(data).startswith("canonical")
                        or tz.get_in(
                            ("config", "algorithm", "clinical_reporting"),
                            data)):
                    config_args += ["--pick"]
                if ensembl_name.endswith("_merged"):
                    config_args += ["--merged"]
                    ensembl_name = ensembl_name.replace("_merged", "")
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file] + fork_args + extra_args + \
                      ["--species", ensembl_name,
                       "--no_stats", "--cache",
                        "--offline", "--dir", vep_dir,
                       "--symbol", "--numbers", "--biotype", "--total_length", "--canonical",
                       "--gene_phenotype", "--ccds", "--uniprot", "--domains", "--regulatory",
                       "--protein", "--tsl", "--appris", "--af", "--max_af", "--af_1kg", "--af_esp", "--af_exac",
                       "--pubmed", "--variant_class"] + config_args
                perl_exports = utils.get_perl_exports()
                # Remove empty fields (';;') which can cause parsing errors downstream
                cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (
                    perl_exports, " ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file
Example #40
def get_NF_bam(data):
    """
    get the nucleosome free BAM file for ATAC-seq if it exists
    """
    return tz.get_in(("atac", "align", "NF"), data, None)
Example #41
def _get_caller(data):
    callers = [
        tz.get_in(["config", "algorithm", "jointcaller"], data),
        tz.get_in(["config", "algorithm", "variantcaller"], data), "precalled"
    ]
    return [c for c in callers if c][0]
Example #42
def get_type(data):
    """Retrieve the type of effects calculation to do.
    """
    if data["analysis"].lower().startswith("var"):
        return tz.get_in(("config", "algorithm", "effects"), data, "snpeff")
Example #43
def _get_sample_and_caller(data):
    return [
        tz.get_in(["metadata", "validate_sample"], data)
        or dd.get_sample_name(data),
        _get_caller_supplement(_get_caller(data), data)
    ]
Example #44
def _run_vardict_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect variants with Vardict.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            target = shared.subset_variant_regions(dd.get_variant_regions(items[0]), region,
                                                   out_file, do_merge=True)
            paired = vcfutils.get_paired_bams(align_bams, items)
            if not _is_bed_file(target):
                vcfutils.write_empty_vcf(tx_out_file, config,
                                         samples=[x for x in [paired.tumor_name, paired.normal_name] if x])
            else:
                if not paired.normal_bam:
                    ann_file = _run_vardict_caller(align_bams, items, ref_file,
                                                   assoc_files, region, out_file)
                    return ann_file
                vardict = get_vardict_command(items[0])
                vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
                strandbias = "testsomatic.R"
                var2vcf = "var2vcf_paired.pl"
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
                # merge bed file regions as amplicon VarDict is only supported in single sample mode
                opts, var2vcf_opts = _vardict_options_from_config(items, config, out_file, target)
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                if any("vardict_somatic_filter" in tz.get_in(("config", "algorithm", "tools_off"), data, [])
                       for data in items):
                    somatic_filter = ""
                    freq_filter = ""
                else:
                    var2vcf_opts += " -M "  # this makes VarDict soft filter non-differential variants
                    somatic_filter = ("| sed 's/\\\\.*Somatic\\\\/Somatic/' "
                                      "| sed 's/REJECT,Description=\".*\">/REJECT,Description=\"Not Somatic via VarDict\">/' "
                                      """| %s -c 'from bcbio.variation import freebayes; """
                                      """freebayes.call_somatic("%s", "%s")' """
                                      % (sys.executable, paired.tumor_name, paired.normal_name))
                    freq_filter = ("| bcftools filter -m '+' -s 'REJECT' -e 'STATUS !~ \".*Somatic\"' 2> /dev/null "
                                   "| %s -x 'bcbio.variation.vardict.depth_freq_filter(x, %s, \"%s\")'" %
                                   (os.path.join(os.path.dirname(sys.executable), "py"),
                                     0, dd.get_aligner(paired.tumor_data)))
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                py_cl = os.path.join(utils.get_bcbio_bin(), "py")
                setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports())
                cmd = ("{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                       "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
                       "| {strandbias} "
                       "| {var2vcf} -P 0.9 -m 4.25 -f {freq} {var2vcf_opts} "
                       "-N \"{paired.tumor_name}|{paired.normal_name}\" "
                       """| {py_cl} -x 'bcbio.variation.vcfutils.add_contig_to_header(x, "{ref_file}")' """
                       "{freq_filter} "
                       "| bcftools filter -i 'QUAL >= 0' "
                       "{somatic_filter} | {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} "
                       "{compress_cmd} > {tx_out_file}")
                do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
    return out_file
Exemple #45
0
def _get_base_tmpdir(data, fallback_base_dir):
    config_tmpdir = tz.get_in(("config", "resources", "tmp", "dir"), data)
    if not config_tmpdir:
        config_tmpdir = tz.get_in(("resources", "tmp", "dir"), data)
    return config_tmpdir or os.path.join(fallback_base_dir, DEFAULT_TMP)
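A minimal sketch of the same fallback chain, with an assumed value standing in for the module-level DEFAULT_TMP constant:

import os
import toolz as tz

DEFAULT_TMP = "bcbiotx"  # assumed value, for illustration only

def base_tmpdir(data, fallback_base_dir):
    config_tmpdir = (tz.get_in(("config", "resources", "tmp", "dir"), data)
                     or tz.get_in(("resources", "tmp", "dir"), data))
    return config_tmpdir or os.path.join(fallback_base_dir, DEFAULT_TMP)

print(base_tmpdir({"resources": {"tmp": {"dir": "/scratch/tmp"}}}, "/work"))  # /scratch/tmp
print(base_tmpdir({}, "/work"))  # /work/bcbiotx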
Exemple #46
0
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Run evaluation of a caller against the truth set using rtg vcfeval.
    """
    out_dir = os.path.join(base_dir, "rtg")
    if not utils.file_exists(os.path.join(out_dir, "done")):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        vrn_file, rm_file, interval_bed = _prepare_inputs(
            vrn_file, rm_file, rm_interval_file, base_dir, data)

        rtg_ref = tz.get_in(["reference", "rtg"], data)
        assert rtg_ref and os.path.exists(rtg_ref), (
            "Did not find rtg indexed reference file for validation:\n%s\n"
            "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        # handle CWL where we have a reference to a single file in the RTG directory
        if os.path.isfile(rtg_ref):
            rtg_ref = os.path.dirname(rtg_ref)

        # get core and memory usage from standard configuration
        threads = min(dd.get_num_cores(data), 6)
        resources = config_utils.get_resources("rtg", data["config"])
        memory = config_utils.adjust_opts(
            resources.get("jvm_opts", ["-Xms500m", "-Xmx1500m"]), {
                "algorithm": {
                    "memory_adjust": {
                        "magnitude": threads,
                        "direction": "increase"
                    }
                }
            })
        jvm_stack = [x for x in memory if x.startswith("-Xms")]
        jvm_mem = [x for x in memory if x.startswith("-Xmx")]
        jvm_stack = jvm_stack[0] if len(jvm_stack) > 0 else "-Xms500m"
        jvm_mem = jvm_mem[0].replace("-Xmx", "") if len(jvm_mem) > 0 else "3g"
        cmd = [
            "rtg", "vcfeval", "--threads",
            str(threads), "-b", rm_file, "--bed-regions", interval_bed, "-c",
            vrn_file, "-t", rtg_ref, "-o", out_dir
        ]
        rm_samples = vcfutils.get_samples(rm_file)
        if len(rm_samples) > 1 and dd.get_sample_name(data) in rm_samples:
            cmd += ["--sample=%s" % dd.get_sample_name(data)]
        cmd += [
            "--vcf-score-field='%s'" % (_pick_best_quality_score(vrn_file))
        ]
        mem_export = "%s export RTG_JAVA_OPTS='%s' && export RTG_MEM=%s" % (
            utils.local_path_export(), jvm_stack, jvm_mem)
        cmd = mem_export + " && " + " ".join(cmd)
        do.run(cmd, "Validate calls using rtg vcfeval", data)
    out = {
        "fp": os.path.join(out_dir, "fp.vcf.gz"),
        "fn": os.path.join(out_dir, "fn.vcf.gz")
    }
    tp_calls = os.path.join(out_dir, "tp.vcf.gz")
    tp_baseline = os.path.join(out_dir, "tp-baseline.vcf.gz")
    if os.path.exists(tp_baseline):
        out["tp"] = tp_baseline
        out["tp-calls"] = tp_calls
    else:
        out["tp"] = tp_calls
    return out
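The JVM option handling above can be exercised on its own; the memory list below is a made-up stand-in for what config_utils.adjust_opts might return, and the sketch uses next() rather than the original list comprehensions:

memory = ["-Xms750m", "-Xmx4g"]  # hypothetical adjusted jvm_opts
jvm_stack = next((x for x in memory if x.startswith("-Xms")), "-Xms500m")
jvm_mem = next((x for x in memory if x.startswith("-Xmx")), "-Xmx3g").replace("-Xmx", "")
print(jvm_stack, jvm_mem)  # -Xms750m 4g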
Exemple #47
0
def combine_multiple_callers(samples):
    """Collapse together variant calls from multiple approaches into single data item with `variants`.
    """
    by_bam = collections.OrderedDict()
    for data in (x[0] for x in samples):
        work_bam = tz.get_in(("combine", "work_bam", "out"), data,
                             data.get("align_bam"))
        jointcaller = tz.get_in(("config", "algorithm", "jointcaller"), data)
        variantcaller = get_variantcaller(data)
        key = (multi.get_batch_for_key(data), work_bam)
        if key not in by_bam:
            by_bam[key] = []
        by_bam[key].append((variantcaller, jointcaller, data))
    out = []
    for callgroup in by_bam.values():
        ready_calls = []
        for variantcaller, jointcaller, data in callgroup:
            if variantcaller:
                cur = data.get("vrn_file_plus", {})
                cur.update({
                    "variantcaller": variantcaller,
                    "vrn_file": data.get("vrn_file_orig") if jointcaller else data.get("vrn_file"),
                    "vrn_file_batch": data.get("vrn_file_batch") if not jointcaller else None,
                    "vrn_stats": data.get("vrn_stats"),
                    "validate": data.get("validate") if not jointcaller else None
                })
                if jointcaller:
                    cur["population"] = False
                ready_calls.append(cur)
            if jointcaller:
                ready_calls.append({
                    "variantcaller": jointcaller,
                    "vrn_file": data.get("vrn_file"),
                    "vrn_file_batch": data.get("vrn_file_batch"),
                    "validate": data.get("validate"),
                    "do_upload": False
                })
            if not jointcaller and not variantcaller:
                ready_calls.append({
                    "variantcaller": "precalled",
                    "vrn_file": data.get("vrn_file"),
                    "validate": data.get("validate"),
                    "do_upload": False
                })
        final = callgroup[0][-1]

        def orig_variantcaller_order(x):
            try:
                return final["config"]["algorithm"]["orig_variantcaller"].index(x["variantcaller"])
            except ValueError:
                return final["config"]["algorithm"]["orig_jointcaller"].index(x["variantcaller"])

        if len(ready_calls) > 1 and "orig_variantcaller" in final["config"]["algorithm"]:
            final["variants"] = sorted(ready_calls, key=orig_variantcaller_order)
            final["config"]["algorithm"]["variantcaller"] = \
                final["config"]["algorithm"].pop("orig_variantcaller")
            if "orig_jointcaller" in final["config"]["algorithm"]:
                final["config"]["algorithm"]["jointcaller"] = \
                    final["config"]["algorithm"].pop("orig_jointcaller")
        else:
            final["variants"] = ready_calls
        final.pop("vrn_file_batch", None)
        final.pop("vrn_file_orig", None)
        final.pop("vrn_file_plus", None)
        final.pop("vrn_stats", None)
        out.append([final])
    return out
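The batching at the top of this function can be previewed with toy inputs; the sample dicts are invented, and the batch is read straight from metadata rather than going through multi.get_batch_for_key:

import collections
import toolz as tz

samples = [[{"align_bam": "a.bam", "metadata": {"batch": "b1"},
             "config": {"algorithm": {"variantcaller": "gatk-haplotype"}}}],
           [{"align_bam": "a.bam", "metadata": {"batch": "b1"},
             "config": {"algorithm": {"variantcaller": "vardict"}}}]]
by_bam = collections.OrderedDict()
for data in (x[0] for x in samples):
    key = (tz.get_in(["metadata", "batch"], data), data.get("align_bam"))
    by_bam.setdefault(key, []).append(tz.get_in(["config", "algorithm", "variantcaller"], data))
print(dict(by_bam))  # {('b1', 'a.bam'): ['gatk-haplotype', 'vardict']}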
Exemple #48
0
def compare_to_rm(data):
    """Compare final variant calls against reference materials of known calls.
    """
    if isinstance(data, (list, tuple)):
        data = _normalize_cwl_inputs(data)
    toval_data = _get_validate(data)
    toval_data = cwlutils.unpack_tarballs(toval_data, toval_data)
    if toval_data:
        caller = _get_caller(toval_data)
        sample = dd.get_sample_name(toval_data)
        base_dir = utils.safe_makedir(
            os.path.join(toval_data["dirs"]["work"], "validate", sample,
                         caller))

        if isinstance(toval_data["vrn_file"], (list, tuple)):
            raise NotImplementedError(
                "Multiple input files for validation: %s" %
                toval_data["vrn_file"])
        else:
            vrn_file = os.path.abspath(toval_data["vrn_file"])
        rm_file = normalize_input_path(
            toval_data["config"]["algorithm"]["validate"], toval_data)
        rm_interval_file = _gunzip(
            normalize_input_path(
                toval_data["config"]["algorithm"].get("validate_regions"),
                toval_data), toval_data)
        rm_interval_file = bedutils.clean_file(
            rm_interval_file,
            toval_data,
            prefix="validateregions-",
            bedprep_dir=utils.safe_makedir(os.path.join(base_dir, "bedprep")))
        rm_file = naming.handle_synonyms(rm_file, dd.get_ref_file(toval_data),
                                         data.get("genome_build"), base_dir,
                                         data)
        rm_interval_file = (naming.handle_synonyms(
            rm_interval_file, dd.get_ref_file(toval_data),
            data.get("genome_build"), base_dir, data)
                            if rm_interval_file else None)
        vmethod = tz.get_in(["config", "algorithm", "validate_method"], data,
                            "rtg")
        if not vcfutils.vcf_has_variants(vrn_file):
            # RTG can fail on totally empty files. Skip these since we have nothing.
            pass
        # empty validation file, every call is a false positive
        elif not vcfutils.vcf_has_variants(rm_file):
            eval_files = _setup_call_fps(vrn_file, rm_interval_file, base_dir,
                                         toval_data)
            data["validate"] = _rtg_add_summary_file(eval_files, base_dir,
                                                     toval_data)
        elif vmethod == "rtg":
            eval_files = _run_rtg_eval(vrn_file, rm_file, rm_interval_file,
                                       base_dir, toval_data)
            eval_files = _annotate_validations(eval_files, toval_data)
            data["validate"] = _rtg_add_summary_file(eval_files, base_dir,
                                                     toval_data)
        elif vmethod == "hap.py":
            data["validate"] = _run_happy_eval(vrn_file, rm_file,
                                               rm_interval_file, base_dir,
                                               toval_data)
        elif vmethod == "bcbio.variation":
            data["validate"] = _run_bcbio_variation(vrn_file, rm_file,
                                                    rm_interval_file, base_dir,
                                                    sample, caller, toval_data)
    return [[data]]
Exemple #49
0
def is_human(data):
    return (tz.get_in(["genome_resources", "aliases", "human"], data, False)
            or dd.get_genome_build(data) in ["hg19", "GRCh37", "hg38"])
Exemple #50
0
def want_gvcf(items):
    jointcaller = tz.get_in(("config", "algorithm", "jointcaller"), items[0])
    want_gvcf = any("gvcf" in dd.get_tools_on(d) for d in items)
    return jointcaller or want_gvcf
Exemple #51
0
def stac_transform(input_stac: Document, relative: bool = True) -> Document:
    """Takes in a raw STAC 1.0 dictionary and returns an ODC dictionary"""

    product_label, product_name, region_code, default_grid = _stac_product_lookup(
        input_stac)

    # Generate a UUID for products that do not already have one: if the provided id
    # is not a valid UUID, create a deterministic UUID with odc_uuid, based on
    # product_name and product_label.
    # TODO: Verify that this approach to creating the UUID is valid.
    if _check_valid_uuid(input_stac["id"]):
        deterministic_uuid = input_stac["id"]
    else:
        if product_name in ["s2_l2a"]:
            deterministic_uuid = str(
                odc_uuid("sentinel-2_stac_process", "1.0.0", [product_label]))
        else:
            deterministic_uuid = str(
                odc_uuid(f"{product_name}_stac_process", "1.0.0",
                         [product_label]))

    # Check for projection extension properties that are not in the asset fields.
    # Specifically, proj:shape and proj:transform, as these are otherwise
    # fetched in _get_stac_bands.
    properties = input_stac["properties"]
    proj_shape = properties.get("proj:shape")
    proj_transform = properties.get("proj:transform")
    # TODO: handle old STAC that doesn't have grid information here...
    bands, grids = _get_stac_bands(
        input_stac,
        default_grid,
        relative=relative,
        proj_shape=proj_shape,
        proj_transform=proj_transform,
    )

    stac_properties, lineage = _get_stac_properties_lineage(input_stac)

    epsg = properties["proj:epsg"]
    native_crs = f"epsg:{epsg}"

    # Transform geometry to the native CRS at an appropriate precision
    geometry = Geometry(input_stac["geometry"], "epsg:4326")
    if native_crs != "epsg:4326":
        # Arbitrary precisions, but should be fine
        pixel_size = get_in(["default", "transform", 0], grids)
        precision = 0
        if pixel_size < 0:
            precision = 6

        geometry = _geographic_to_projected(geometry, native_crs, precision)

    stac_odc = {
        "$schema": "https://schemas.opendatacube.org/dataset",
        "id": deterministic_uuid,
        "crs": native_crs,
        "grids": grids,
        "product": {
            "name": product_name.lower()
        },
        "label": product_label,
        "properties": stac_properties,
        "measurements": bands,
        "lineage": {},
    }

    if region_code:
        stac_odc["properties"]["odc:region_code"] = region_code

    if geometry:
        stac_odc["geometry"] = geometry.json

    if lineage:
        stac_odc["lineage"] = lineage

    return stac_odc
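The pixel-size check above relies on get_in also accepting list indices; a made-up Sentinel-2-style grids dict makes that concrete:

from toolz import get_in

grids = {"default": {"shape": [10980, 10980],
                     "transform": [10.0, 0.0, 600000.0, 0.0, -10.0, 6100000.0, 0, 0, 1]}}
pixel_size = get_in(["default", "transform", 0], grids)  # 10.0 -- dict keys, then a list index
precision = 6 if pixel_size < 0 else 0
print(pixel_size, precision)  # 10.0 0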
Exemple #52
0
def get_variantcaller(data,
                      key="variantcaller",
                      default=None,
                      require_bam=True):
    if not require_bam or data.get("align_bam"):
        return tz.get_in(["config", "algorithm", key], data, default)
Exemple #53
0
def merge_split_alignments(samples, run_parallel):
    """Manage merging split alignments back into a final working BAM file.

    Perform de-duplication on the final merged file.
    """
    ready = []
    file_key = "work_bam"
    to_merge = collections.defaultdict(list)
    for data in (xs[0] for xs in samples):
        if data.get("combine"):
            out_key = tz.get_in(["combine", file_key, "out"], data)
            if not out_key:
                out_key = data["rgnames"]["lane"]
            to_merge[out_key].append(data)
        else:
            ready.append([data])
    ready_merge = []
    hla_merges = []
    for mgroup in to_merge.values():
        cur_data = mgroup[0]
        del cur_data["align_split"]
        for x in mgroup[1:]:
            cur_data["combine"][file_key]["extras"].append(x[file_key])
        ready_merge.append([cur_data])
        cur_hla = None
        for d in mgroup:
            hla_files = tz.get_in(["hla", "fastq"], d)
            if hla_files:
                if not cur_hla:
                    cur_hla = {
                        "rgnames": {
                            "sample": dd.get_sample_name(cur_data)
                        },
                        "config": cur_data["config"],
                        "dirs": cur_data["dirs"],
                        "hla": {
                            "fastq": []
                        }
                    }
                cur_hla["hla"]["fastq"].append(hla_files)
        if cur_hla:
            hla_merges.append([cur_hla])
    if not tz.get_in(["config", "algorithm", "kraken"], data):
        # kraken requires fasta filenames from data['files'] as input.
        # We don't want to remove those files if kraken qc is required.
        _save_fastq_space(samples)
    merged = run_parallel("delayed_bam_merge", ready_merge)
    hla_merge_raw = run_parallel("merge_split_alignments", hla_merges)
    hla_merges = {}
    for hla_merge in [x[0] for x in hla_merge_raw]:
        hla_merges[dd.get_sample_name(hla_merge)] = tz.get_in(["hla", "fastq"],
                                                              hla_merge)

    # Add stable 'align_bam' target to use for retrieving raw alignment
    out = []
    for data in [x[0] for x in merged + ready]:
        if data.get("work_bam"):
            data["align_bam"] = data["work_bam"]
        if dd.get_sample_name(data) in hla_merges:
            data["hla"]["fastq"] = hla_merges[dd.get_sample_name(data)]
        else:
            hla_files = glob.glob(
                os.path.join(dd.get_work_dir(data), "align",
                             dd.get_sample_name(data), "hla", "*.fq"))
            if hla_files:
                data["hla"]["fastq"] = hla_files
        out.append([data])
    return out
Exemple #54
0
def eo3_grid_spatial(doc: Dict[str, Any],
                     resolution: Optional[float] = None) -> Dict[str, Any]:
    """Using doc[grids|crs|geometry] compute EO3 style grid spatial:

    Note that `geo_ref_points` are set to the 4 corners of the default grid
    only, while lon/lat bounds are computed from all the grids, unless a
    tighter valid region is defined via the `geometry` key, in which case that
    is used to determine the lon/lat bounds instead.

    inputs:
    ```
    crs: "<:str>"
    geometry: <:GeoJSON object>  # optional
    grids:
       default:
          shape: [ny: int, nx: int]
          transform: [a0, a1, a2, a3, a4, a5, 0, 0, 1]
       <...> # optionally more grids
    ```

    Where transform is a linear mapping matrix from pixel space to projected
    space encoded in row-major order:

       [X]   [a0, a1, a2] [ Pixel]
       [Y] = [a3, a4, a5] [ Line ]
       [1]   [ 0,  0,  1] [  1   ]

    outputs:
    ```
      extent:
        lat: {begin=<>, end=<>}
        lon: {begin=<>, end=<>}

      grid_spatial:
        projection:
          spatial_reference: "<crs>"
          geo_ref_points: {ll: {x:<>, y:<>}, ...}
          valid_data: {...}
    ```

    """
    grid = toolz.get_in(['grids', 'default'], doc, None)
    crs = doc.get('crs', None)
    if crs is None or grid is None:
        raise ValueError("Input must have crs and grids.default")

    geometry = doc.get('geometry')

    if geometry is not None:
        valid_data = dict(valid_data=geometry)
    else:
        valid_data = {}

    oo = dict(grid_spatial=dict(
        projection={
            'spatial_reference': crs,
            'geo_ref_points': grid2ref_points(grid),
            **valid_data,
        }))

    x1, y1, x2, y2 = eo3_lonlat_bbox(doc, resolution=resolution)
    oo['extent'] = dict(lon=dict(begin=x1, end=x2), lat=dict(begin=y1, end=y2))
    return oo
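A small worked example of the pixel-to-projected mapping described in the docstring: with shape [ny, nx] and the affine row [a0..a5], the four grid corners come out as below. The helper is hypothetical, not the real grid2ref_points, and the grid values are invented.

def corners_from_grid(grid):
    ny, nx = grid["shape"]
    a0, a1, a2, a3, a4, a5 = grid["transform"][:6]

    def pix2proj(pixel, line):
        # [X, Y] = [[a0, a1, a2], [a3, a4, a5]] . [pixel, line, 1]
        return {"x": a0 * pixel + a1 * line + a2,
                "y": a3 * pixel + a4 * line + a5}

    return {"ul": pix2proj(0, 0), "ur": pix2proj(nx, 0),
            "ll": pix2proj(0, ny), "lr": pix2proj(nx, ny)}

grid = {"shape": [10980, 10980],
        "transform": [10.0, 0.0, 600000.0, 0.0, -10.0, 6100000.0, 0, 0, 1]}
print(corners_from_grid(grid)["ul"])  # {'x': 600000.0, 'y': 6100000.0}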
Exemple #55
0
def normalize_sv_coverage(*items):
    """Normalize CNV coverage depths by GC, repeats and background.

    Provides normalized output based on CNVkit approaches, provides a
    point for providing additional methods in the future:

    - reference: calculates reference backgrounds from normals and pools
      including GC and repeat information
    - fix: Uses background to normalize coverage estimations
    http://cnvkit.readthedocs.io/en/stable/pipeline.html#fix
    """
    from bcbio.structural import cnvkit
    from bcbio.structural import shared as sshared
    items = [
        utils.to_single_data(x) for x in cwlutils.handle_combined_input(items)
    ]
    if all(not cnvkit.use_general_sv_bins(x) for x in items):
        return [[d] for d in items]
    out_files = {}
    back_files = {}
    for group_id, gitems in itertools.groupby(
            items, lambda x: tz.get_in(["regions", "bins", "group"], x)):
        # No CNVkit calling for this particular set of samples
        if group_id is None:
            continue
        inputs, backgrounds = sshared.find_case_control(list(gitems))
        cnns = reduce(operator.add, [[
            tz.get_in(["depth", "bins", "target"], x),
            tz.get_in(["depth", "bins", "antitarget"], x)
        ] for x in backgrounds], [])
        assert inputs, "Did not find inputs for sample batch: %s" % (" ".join(
            dd.get_sample_name(x) for x in items))
        for d in inputs:
            if tz.get_in(["depth", "bins", "target"], d):
                target_bed = tz.get_in(["depth", "bins", "target"], d)
                antitarget_bed = tz.get_in(["depth", "bins", "antitarget"], d)
        work_dir = utils.safe_makedir(
            os.path.join(dd.get_work_dir(inputs[0]), "structural",
                         dd.get_sample_name(inputs[0]), "bins"))
        input_backs = set(filter(lambda x: x is not None,
                                 [dd.get_background_cnv_reference(d) for d in inputs]))
        if input_backs:
            assert len(input_backs) == 1, "Multiple backgrounds in group: %s" % list(input_backs)
            back_file = list(input_backs)[0]
        else:
            back_file = cnvkit.cnvkit_background(
                cnns,
                os.path.join(work_dir,
                             "background-%s-cnvkit.cnn" % (group_id)),
                backgrounds or inputs, target_bed, antitarget_bed)
        fix_cmd_inputs = []
        for data in inputs:
            work_dir = utils.safe_makedir(
                os.path.join(dd.get_work_dir(data), "structural",
                             dd.get_sample_name(data), "bins"))
            if tz.get_in(["depth", "bins", "target"], data):
                fix_file = os.path.join(
                    work_dir, "%s-normalized.cnr" % (dd.get_sample_name(data)))
                fix_cmd_inputs.append(
                    (tz.get_in(["depth", "bins", "target"], data),
                     tz.get_in(["depth", "bins", "antitarget"],
                               data), back_file, fix_file, data))
                out_files[dd.get_sample_name(data)] = fix_file
                back_files[dd.get_sample_name(data)] = back_file
        parallel = {
            "type": "local",
            "cores": dd.get_cores(inputs[0]),
            "progs": ["cnvkit"]
        }
        run_multicore(cnvkit.run_fix_parallel, fix_cmd_inputs,
                      inputs[0]["config"], parallel)

    out = []
    for data in items:
        if dd.get_sample_name(data) in out_files:
            data["depth"]["bins"]["background"] = back_files[
                dd.get_sample_name(data)]
            data["depth"]["bins"]["normalized"] = out_files[dd.get_sample_name(
                data)]
        out.append([data])
    return out
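To see how samples are batched for normalization, here is a toy version of the grouping step; the items are invented, and since itertools.groupby only merges consecutive entries the inputs are assumed to arrive already ordered by bin group:

import itertools
import toolz as tz

items = [{"description": "tumor1", "regions": {"bins": {"group": "batch1"}}},
         {"description": "normal1", "regions": {"bins": {"group": "batch1"}}},
         {"description": "wgs1", "regions": {"bins": {"group": None}}}]
for group_id, gitems in itertools.groupby(items, lambda x: tz.get_in(["regions", "bins", "group"], x)):
    if group_id is None:
        continue  # no shared CNV bins requested for these samples
    print(group_id, [d["description"] for d in gitems])
# batch1 ['tumor1', 'normal1']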
Exemple #56
0
def combine_calls(*args):
    """Combine multiple callsets into a final set of merged calls.
    """
    if len(args) == 3:
        is_cwl = False
        batch_id, samples, data = args
        caller_names, vrn_files = _organize_variants(samples, batch_id)
    else:
        is_cwl = True
        samples = [utils.to_single_data(x) for x in args]
        samples = [cwlutils.unpack_tarballs(x, x) for x in samples]
        data = samples[0]
        batch_id = data["batch_id"]
        caller_names = data["variants"]["variantcallers"]
        vrn_files = data["variants"]["calls"]
    logger.info("Ensemble consensus calls for {0}: {1}".format(
        batch_id, ",".join(caller_names)))
    edata = copy.deepcopy(data)
    base_dir = utils.safe_makedir(
        os.path.join(edata["dirs"]["work"], "ensemble", batch_id))
    if any([vcfutils.vcf_has_variants(f) for f in vrn_files]):
        # Decompose multiallelic variants and normalize
        passonly = not tz.get_in(
            ["config", "algorithm", "ensemble", "use_filtered"], edata, False)
        vrn_files = [
            normalize.normalize(f,
                                data,
                                passonly=passonly,
                                rerun_effects=False,
                                remove_oldeffects=True,
                                work_dir=utils.safe_makedir(
                                    os.path.join(base_dir, c)))
            for c, f in zip(caller_names, vrn_files)
        ]
        if "classifiers" not in (dd.get_ensemble(edata) or {}):
            callinfo = _run_ensemble_intersection(batch_id, vrn_files,
                                                  caller_names, base_dir,
                                                  edata)
        else:
            config_file = _write_config_file(batch_id, caller_names, base_dir,
                                             edata)
            callinfo = _run_ensemble(batch_id, vrn_files, config_file,
                                     base_dir, dd.get_ref_file(edata), edata)
            callinfo["vrn_file"] = vcfutils.bgzip_and_index(
                callinfo["vrn_file"], data["config"])
        # After decomposing multiallelic variants and normalizing, re-evaluate effects
        ann_ma_file, _ = effects.add_to_vcf(callinfo["vrn_file"], data)
        if ann_ma_file:
            callinfo["vrn_file"] = ann_ma_file

        edata["config"]["algorithm"]["variantcaller"] = "ensemble"
        edata["vrn_file"] = callinfo["vrn_file"]
        edata["ensemble_bed"] = callinfo["bed_file"]
        callinfo["validate"] = validate.compare_to_rm(edata)[0][0].get(
            "validate")
    else:
        out_vcf_file = os.path.join(base_dir,
                                    "{0}-ensemble.vcf".format(batch_id))
        vcfutils.write_empty_vcf(
            out_vcf_file, samples=[dd.get_sample_name(d) for d in samples])
        callinfo = {
            "variantcaller": "ensemble",
            "vrn_file": vcfutils.bgzip_and_index(out_vcf_file, data["config"]),
            "bed_file": None
        }
    if is_cwl:
        callinfo["batch_samples"] = data["batch_samples"]
        callinfo["batch_id"] = batch_id
        return [{"ensemble": callinfo}]
    else:
        return [[batch_id, callinfo]]
Exemple #57
0
def _run_cnvkit_shared(inputs, backgrounds):
    """Shared functionality to run CNVkit, parallelizing over multiple BAM files.
    """
    work_dir = _sv_workdir(inputs[0])
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
    background_name = dd.get_sample_name(
        backgrounds[0]) if backgrounds else "flat"
    background_cnn = os.path.join(raw_work_dir,
                                  "%s_background.cnn" % (background_name))
    ckouts = []
    for cur_input in inputs:
        cur_raw_work_dir = utils.safe_makedir(
            os.path.join(_sv_workdir(cur_input), "raw"))
        out_base, out_base_old = _bam_to_outbase(dd.get_align_bam(cur_input),
                                                 cur_raw_work_dir, cur_input)
        if utils.file_exists(out_base_old + ".cns"):
            out_base = out_base_old
        ckouts.append({
            "cnr": "%s.cnr" % out_base,
            "cns": "%s.cns" % out_base,
            "back_cnn": background_cnn
        })
    if not utils.file_exists(ckouts[0]["cns"]):
        cov_interval = dd.get_coverage_interval(inputs[0])
        samples_to_run = zip(["background"] * len(backgrounds), backgrounds) + \
                        zip(["evaluate"] * len(inputs), inputs)
        # New style shared SV bins
        if tz.get_in(["depth", "bins", "target"], inputs[0]):
            target_bed = tz.get_in(["depth", "bins", "target"], inputs[0])
            antitarget_bed = tz.get_in(["depth", "bins", "antitarget"],
                                       inputs[0])
            raw_coverage_cnns = reduce(operator.add, [
                _get_general_coverage(cdata, itype)
                for itype, cdata in samples_to_run
            ])
        # Backwards compatible with pre-existing runs
        else:
            target_bed, antitarget_bed = _get_original_targets(inputs[0])
            raw_coverage_cnns = reduce(operator.add, [
                _get_original_coverage(cdata, itype)
                for itype, cdata in samples_to_run
            ])
        # Currently metrics not calculated due to speed and needing re-evaluation
        # We could re-enable with larger truth sets to evaluate background noise
        # But want to reimplement in a more general fashion as part of normalization
        if False:
            coverage_cnns = reduce(operator.add, [
                _cnvkit_metrics(cnns, target_bed, antitarget_bed, cov_interval,
                                inputs + backgrounds)
                for cnns in tz.groupby("bam", raw_coverage_cnns).values()
            ])
            background_cnn = _cnvkit_background(
                _select_background_cnns(coverage_cnns), background_cnn,
                target_bed, antitarget_bed, inputs[0])
        else:
            coverage_cnns = raw_coverage_cnns
            background_cnn = _cnvkit_background([
                x["file"] for x in coverage_cnns if x["itype"] == "background"
            ], background_cnn, target_bed, antitarget_bed, inputs[0])
        parallel = {
            "type": "local",
            "cores": dd.get_cores(inputs[0]),
            "progs": ["cnvkit"]
        }
        fixed_cnrs = run_multicore(
            _cnvkit_fix,
            [(cnns, background_cnn, inputs, ckouts) for cnns in tz.groupby(
                "bam", [x for x in coverage_cnns
                        if x["itype"] == "evaluate"]).values()],
            inputs[0]["config"], parallel)
        [
            _cnvkit_segment(cnr, cov_interval, data, inputs + backgrounds)
            for cnr, data in fixed_cnrs
        ]
    return ckouts
Exemple #58
0
def _run_info_from_yaml(dirs,
                        run_info_yaml,
                        config,
                        sample_names=None,
                        integrations=None):
    """Read run information from a passed YAML file.
    """
    validate_yaml(run_info_yaml, run_info_yaml)
    with open(run_info_yaml) as in_handle:
        loaded = yaml.safe_load(in_handle)
    fc_name, fc_date = None, None
    if dirs.get("flowcell"):
        try:
            fc_name, fc_date = flowcell.parse_dirname(dirs.get("flowcell"))
        except ValueError:
            pass
    global_config = {}
    global_vars = {}
    resources = {}
    integration_config = {}
    if isinstance(loaded, dict):
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        if "fc_name" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
        if "fc_date" in loaded:
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        global_vars = global_config.pop("globals", {})
        resources = global_config.pop("resources", {})
        for iname in ["arvados"]:
            integration_config[iname] = global_config.pop(iname, {})
        loaded = loaded["details"]
    if sample_names:
        loaded = [x for x in loaded if x["description"] in sample_names]

    if integrations:
        for iname, retriever in integrations.items():
            if iname in config:
                config[iname] = retriever.set_cache(config[iname])
                loaded = retriever.add_remotes(loaded, config[iname])

    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, dirs.get("flowcell"))
        if "lane" not in item:
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            if _item_is_bam(item):
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError(
                    "No `description` sample name provided for input #%s" %
                    (i + 1))
        item["description"] = _clean_characters(str(item["description"]))
        if "upload" not in item:
            upload = global_config.get("upload", {})
            # Handle specifying a local directory directly in upload
            if isinstance(upload, str):
                upload = {"dir": upload}
            if fc_name:
                upload["fc_name"] = fc_name
            if fc_date:
                upload["fc_date"] = fc_date
            upload["run_id"] = ""
            if upload.get("dir"):
                upload["dir"] = _file_to_abs(upload["dir"], [dirs.get("work")],
                                             makedir=True)
            item["upload"] = upload
        item["algorithm"] = _replace_global_vars(item["algorithm"],
                                                 global_vars)
        item["algorithm"] = genome.abs_file_paths(
            item["algorithm"],
            ignore_keys=ALGORITHM_NOPATH_KEYS,
            fileonly_keys=ALGORITHM_FILEONLY_KEYS,
            do_download=all(not x for x in integrations.values()))
        item["genome_build"] = str(item.get("genome_build", ""))
        item["algorithm"] = _add_algorithm_defaults(item["algorithm"])
        item["metadata"] = add_metadata_defaults(item.get("metadata", {}))
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        if item.get("files"):
            item["files"] = [
                genome.abs_file_paths(
                    f, do_download=all(not x for x in integrations.values()))
                for f in item["files"]
            ]
        elif "files" in item:
            del item["files"]
        if item.get("vrn_file") and isinstance(item["vrn_file"], basestring):
            inputs_dir = utils.safe_makedir(
                os.path.join(dirs.get("work", os.getcwd()), "inputs",
                             item["description"]))
            item["vrn_file"] = genome.abs_file_paths(
                item["vrn_file"],
                do_download=all(not x for x in integrations.values()))
            if os.path.isfile(item["vrn_file"]):
                item["vrn_file"] = vcfutils.bgzip_and_index(item["vrn_file"],
                                                            config,
                                                            remove_orig=False,
                                                            out_dir=inputs_dir)
            if not tz.get_in(("metadata", "batch"), item):
                raise ValueError(
                    "%s: Please specify a metadata batch for variant file (vrn_file) input.\n"
                    % (item["description"]) +
                    "Batching with a standard sample provides callable regions for validation."
                )
        item = _clean_metadata(item)
        item = _clean_algorithm(item)
        # Add any global resource specifications
        if "resources" not in item:
            item["resources"] = {}
        for prog, pkvs in resources.items():
            if prog not in item["resources"]:
                item["resources"][prog] = {}
            if pkvs is not None:
                for key, val in pkvs.items():
                    item["resources"][prog][key] = val
        for iname, ivals in integration_config.items():
            if ivals:
                if iname not in item:
                    item[iname] = {}
                for k, v in ivals.items():
                    item[iname][k] = v

        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml, config)
    return run_details
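The dict-versus-list handling of the loaded YAML can be followed with a hand-built example; the keys mirror the ones this function reads, while the values are invented:

import copy

loaded = {"fc_name": "test run", "fc_date": "2020-01-01",
          "upload": {"dir": "../final"},
          "globals": {"my_regions": "/data/regions.bed"},
          "resources": {"gatk": {"jvm_opts": ["-Xms500m", "-Xmx3500m"]}},
          "details": [{"description": "Sample1",
                       "algorithm": {"variant_regions": "my_regions"}}]}

global_config = copy.deepcopy(loaded)
del global_config["details"]
fc_name = loaded["fc_name"].replace(" ", "_")
global_vars = global_config.pop("globals", {})
resources = global_config.pop("resources", {})
items = loaded["details"]
print(fc_name, sorted(global_vars), sorted(resources), [d["description"] for d in items])
# test_run ['my_regions'] ['gatk'] ['Sample1']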
Exemple #59
0
def _get_files_project(sample, upload_config):
    """Retrieve output files associated with an entire analysis project.
    """
    out = [{"path": sample["provenance"]["programs"]}]
    for fname in ["bcbio-nextgen.log", "bcbio-nextgen-commands.log"]:
        if os.path.exists(os.path.join(log.get_log_dir(sample["config"]), fname)):
            out.append({"path": os.path.join(log.get_log_dir(sample["config"]), fname),
                        "type": "external_command_log",
                        "ext": ""})

    if "summary" in sample and sample["summary"].get("project"):
        out.append({"path": sample["summary"]["project"]})
    mixup_check = tz.get_in(["summary", "mixup_check"], sample)
    if mixup_check:
        out.append({
            "path": sample["summary"]["mixup_check"],
            "type": "directory",
            "ext": "mixup_check"
        })

    report = os.path.join(dd.get_work_dir(sample), "report")
    if utils.file_exists(report):
        out.append({"path": report, "type": "directory", "ext": "report"})

    if sample.get("seqcluster", None):
        out.append({
            "path": sample["seqcluster"],
            "type": "directory",
            "ext": "seqcluster"
        })

    for x in sample.get("variants", []):
        if "pop_db" in x:
            out.append({
                "path": x["pop_db"],
                "type": "sqlite",
                "variantcaller": x["variantcaller"]
            })
    for x in sample.get("variants", []):
        if "population" in x:
            pop_db = tz.get_in(["population", "db"], x)
            if pop_db:
                out.append({
                    "path": pop_db,
                    "type": "sqlite",
                    "variantcaller": x["variantcaller"]
                })
            out.extend(_get_variant_file(x, ("population", "vcf")))
    for x in sample.get("variants", []):
        if x.get("validate") and x["validate"].get("grading_summary"):
            out.append({"path": x["validate"]["grading_summary"]})
            break
    if "coverage" in sample:
        cov_db = tz.get_in(["coverage", "summary"], sample)
        if cov_db:
            out.append({"path": cov_db, "type": "sqlite", "ext": "coverage"})
        all_coverage = tz.get_in(["coverage", "all"], sample)
        if all_coverage:
            out.append({
                "path": all_coverage,
                "type": "bed",
                "ext": "coverage"
            })

    if dd.get_mirna_counts(sample):
        out.append({"path": dd.get_mirna_counts(sample)})
    if dd.get_isomir_counts(sample):
        out.append({"path": dd.get_isomir_counts(sample)})
    if dd.get_combined_counts(sample):
        out.append({"path": dd.get_combined_counts(sample)})
    if dd.get_annotated_combined_counts(sample):
        out.append({"path": dd.get_annotated_combined_counts(sample)})
    if dd.get_combined_fpkm(sample):
        out.append({"path": dd.get_combined_fpkm(sample)})
    if dd.get_combined_fpkm_isoform(sample):
        out.append({"path": dd.get_combined_fpkm_isoform(sample)})
    if dd.get_assembled_gtf(sample):
        out.append({"path": dd.get_assembled_gtf(sample)})
    if dd.get_dexseq_counts(sample):
        out.append({"path": dd.get_dexseq_counts(sample)})
    if dd.get_express_counts(sample):
        out.append({"path": dd.get_express_counts(sample)})
    if dd.get_express_fpkm(sample):
        out.append({"path": dd.get_express_fpkm(sample)})
    if dd.get_express_tpm(sample):
        out.append({"path": dd.get_express_tpm(sample)})
    if dd.get_isoform_to_gene(sample):
        out.append({"path": dd.get_isoform_to_gene(sample)})
    if dd.get_square_vcf(sample):
        out.append({"path": dd.get_square_vcf(sample)})
    if dd.get_sailfish_tidy(sample):
        out.append({"path": dd.get_sailfish_tidy(sample)})
    if dd.get_sailfish_transcript_tpm(sample):
        out.append({"path": dd.get_sailfish_transcript_tpm(sample)})
    if dd.get_sailfish_gene_tpm(sample):
        out.append({"path": dd.get_sailfish_gene_tpm(sample)})
    return _add_meta(out, config=upload_config)
Exemple #60
0
def convert_sdmx_element(element, dataset_json, dataset_context, dsd_infos,
                         series_jsonl_file):
    global timings

    # Because iterparse is called with event="end", we receive the <Obs> elements first, then their enclosing <Series> element.

    if element.tag.endswith("Series"):

        # Ignore the XML element attributes that correspond to series-level SDMX attributes,
        # because series-level SDMX attributes do not exist in DBnomics.
        series_element_attributes = OrderedDict([
            (attribute_key, attribute_value)
            for attribute_key, attribute_value in element.attrib.items()
            if attribute_key not in {"TIME_FORMAT"}  # Redundant with FREQ.
        ])

        dimensions_codes_order = list(series_element_attributes.keys())
        if dataset_json["dimensions_codes_order"] is None:
            dataset_json["dimensions_codes_order"] = dimensions_codes_order
        else:
            # dimensions_codes_order must not change between series.
            assert dataset_json["dimensions_codes_order"] == dimensions_codes_order, \
                (dataset_json["dimensions_codes_order"], dimensions_codes_order)

        # Fill series dimensions labels in dataset.json.

        t0 = time.time()

        for dimension_code, dimension_value_code in series_element_attributes.items():
            if dimension_code not in dataset_json["dimensions_labels"]:
                dimension_label = dsd_infos["concepts"].get(dimension_code)
                if dimension_label:
                    # Some dimension labels are an empty string: e.g. bs_bs12_04.sdmx.xml
                    dataset_json["dimensions_labels"][dimension_code] = dimension_label
            if dimension_code in dataset_json["dimensions_values_labels"] and \
                    dimension_value_code in dataset_json["dimensions_values_labels"][dimension_code]:
                continue
            codelist_code = dsd_infos["codelist_by_concept"][dimension_code]
            dimension_value_label = get_in([codelist_code, dimension_value_code], dsd_infos["codelists"])
            if dimension_value_label:
                dataset_json["dimensions_values_labels"].setdefault(
                    dimension_code, {})[dimension_value_code] = dimension_value_label

        timings["series_labels"] += time.time() - t0

        # The series code is not defined by the provider: build it from the dimension value codes.
        series_code = ".".join(series_element_attributes[dimension_code]
                               for dimension_code in dimensions_codes_order)

        # Write series JSON to file.

        t0 = time.time()

        observations_header = [["PERIOD", "VALUE"] + dsd_infos["attributes"]]
        series_json = {
            "code": series_code,
            # Every dimension MUST be defined for each series.
            "dimensions": [series_element_attributes[dimension_code]
                           for dimension_code in dimensions_codes_order],
            "observations": observations_header + dataset_context["current_series_observations"],
        }

        dataset_context["observations_offsets"][
            series_code] = series_jsonl_file.tell()

        json.dump(series_json,
                  series_jsonl_file,
                  ensure_ascii=False,
                  sort_keys=True)
        series_jsonl_file.write("\n")

        timings["series_file"] += time.time() - t0

        # Reset context for next series.

        dataset_context["current_series_observations"] = []

    elif element.tag.endswith("Obs"):

        # Fill observations attributes labels in dataset.json.

        t0 = time.time()

        for attribute_code, attribute_value_code in element.attrib.items():
            # Ignore the period and value observation XML attributes, because they don't need labels.
            if attribute_code in ["TIME_PERIOD", "OBS_VALUE"]:
                continue
            attribute_label = dsd_infos["concepts"].get(attribute_code)
            if attribute_label and attribute_code not in dataset_json["attributes_labels"]:
                dataset_json["attributes_labels"][attribute_code] = attribute_label
            # Some attribute value codes are multi-valued and concatenated into a single string.
            attribute_value_codes = list(attribute_value_code) \
                if attribute_code == "OBS_STATUS" \
                else [attribute_value_code]
            for attribute_value_code in attribute_value_codes:
                if attribute_code in dataset_json["attributes_values_labels"] and \
                        attribute_value_code in dataset_json["attributes_values_labels"][attribute_code]:
                    continue
                codelist_code = dsd_infos["codelist_by_concept"][attribute_code]
                attribute_value_label = get_in([codelist_code, attribute_value_code], dsd_infos["codelists"])
                if attribute_value_label:
                    dataset_json["attributes_values_labels"].setdefault(
                        attribute_code, {})[attribute_value_code] = attribute_value_label

        timings["observations_labels"] += time.time() - t0

        obs_value = element.attrib.get("OBS_VALUE")
        if obs_value is not None:
            obs_value = observations.value_to_float(obs_value)
        dataset_context["current_series_observations"].append([
            element.attrib["TIME_PERIOD"],  # SDMX periods are already normalized.
            obs_value,
        ] + [
            element.attrib.get(attribute_name, "")
            for attribute_name in dsd_infos["attributes"]
        ])
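The label lookups above boil down to a two-level get_in into the DSD codelists; a toy fragment (codes and labels invented) makes the behaviour concrete:

from toolz import get_in

codelists = {"CL_FREQ": {"A": "Annual", "Q": "Quarterly"},
             "CL_OBS_STATUS": {"p": "provisional"}}
print(get_in(["CL_FREQ", "Q"], codelists))  # Quarterly
print(get_in(["CL_FREQ", "X"], codelists))  # None -> no label is written for this code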