Example 1
def calculate_sv_coverage(data):
    """Calculate coverage within bins for downstream CNV calling.

    Creates corrected cnr files with log2 ratios and depths.
    """
    from bcbio.variation import coverage
    from bcbio.structural import annotate, cnvkit
    data = utils.to_single_data(data)
    if not cnvkit.use_general_sv_bins(data):
        return [[data]]
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural",
                                               dd.get_sample_name(data), "bins"))
    out_target_file = os.path.join(work_dir, "%s-target-coverage.cnn" % dd.get_sample_name(data))
    out_anti_file = os.path.join(work_dir, "%s-antitarget-coverage.cnn" % dd.get_sample_name(data))
    if ((not utils.file_exists(out_target_file) or not utils.file_exists(out_anti_file))
          and (dd.get_align_bam(data) or dd.get_work_bam(data))):
        # mosdepth
        target_cov = coverage.run_mosdepth(data, "target", tz.get_in(["regions", "bins", "target"], data))
        anti_cov = coverage.run_mosdepth(data, "antitarget", tz.get_in(["regions", "bins", "antitarget"], data))
        target_cov_genes = annotate.add_genes(target_cov.regions, data, max_distance=0)
        anti_cov_genes = annotate.add_genes(anti_cov.regions, data, max_distance=0)
        out_target_file = _add_log2_depth(target_cov_genes, out_target_file, data)
        out_anti_file = _add_log2_depth(anti_cov_genes, out_anti_file, data)
        # TODO: Correct for GC bias
    if os.path.exists(out_target_file):
        data["depth"]["bins"] = {"target": out_target_file, "antitarget": out_anti_file}
    return [[data]]
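The `_add_log2_depth` helper used above is not included in this listing. As a rough, hypothetical sketch only (not the bcbio implementation), the conversion might look like the following, assuming a tab-separated input with chromosome, start, end, gene and depth columns, written out as a CNVkit-style .cnn table:

import math

def add_log2_depth_sketch(in_bed, out_cnn, min_depth=0.01):
    """Hypothetical sketch: turn per-bin depths into a CNVkit-style .cnn
    table with chromosome, start, end, gene, log2 and depth columns."""
    with open(in_bed) as in_handle, open(out_cnn, "w") as out_handle:
        out_handle.write("chromosome\tstart\tend\tgene\tlog2\tdepth\n")
        for line in in_handle:
            chrom, start, end, gene, depth = line.rstrip("\n").split("\t")[:5]
            depth = float(depth)
            # floor zero-coverage bins so the log2 transform stays defined (assumption)
            log2_depth = math.log(max(depth, min_depth), 2)
            out_handle.write("%s\t%s\t%s\t%s\t%.3f\t%.1f\n" %
                             (chrom, start, end, gene, log2_depth, depth))
    return out_cnn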
Example 2
def _add_bed_to_output(out, data):
    """Call ploidy and convert into BED representation.
    """
    call_file = "%s-call%s" % os.path.splitext(out["cns"])
    gender = dd.get_gender(data)
    if not utils.file_exists(call_file):
        with file_transaction(data, call_file) as tx_call_file:
            cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "call",
                   "--ploidy", str(dd.get_ploidy(data)),
                   "-o", tx_call_file, out["cns"]]
            if gender:
                cmd += ["--gender", gender]
                if gender.lower() == "male":
                    cmd += ["--male-reference"]
            do.run(cmd, "CNVkit call ploidy")
    out_file = "%s.bed" % os.path.splitext(call_file)[0]
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "export",
                   "bed", "--sample-id", dd.get_sample_name(data),
                   "--ploidy", str(dd.get_ploidy(data)),
                   "-o", tx_out_file, call_file]
            if gender and gender.lower() == "male":
                cmd += ["--male-reference"]
            do.run(cmd, "CNVkit export BED")
    out["call_file"] = call_file
    out["vrn_file"] = annotate.add_genes(out_file, data)
    return out
Example 3
def combine_bed_by_size(input_beds, sample, work_dir, data, delim=","):
    """Combine a set of BED files, breaking into individual size chunks.
    """
    out_file = os.path.join(work_dir, "%s-ensemble.bed" % sample)
    if len(input_beds) > 0:
        size_beds = []
        for e_start, e_end in validate.EVENT_SIZES:
            base, ext = os.path.splitext(out_file)
            size_out_file = "%s-%s_%s%s" % (base, e_start, e_end, ext)
            if not utils.file_exists(size_out_file):
                with file_transaction(data, size_out_file) as tx_out_file:
                    with shared.bedtools_tmpdir(data):
                        all_file = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0]
                        has_regions = False
                        with open(all_file, "w") as out_handle:
                            for line in fileinput.input(input_beds):
                                chrom, start, end, event_str = line.split()[:4]
                                event = event_str.split("_", 1)[0]
                                size = int(end) - int(start)
                                if (e_start <= size < e_end) or event == "BND":
                                    out_handle.write(line)
                                    has_regions = True
                        if has_regions:
                            pybedtools.BedTool(all_file).sort(stream=True)\
                              .merge(c=4, o="distinct", delim=delim).saveas(tx_out_file)
            if utils.file_exists(size_out_file):
                ann_size_out_file = annotate.add_genes(size_out_file, data)
                size_beds.append(ann_size_out_file)
        if len(size_beds) > 0:
            out_file = bedutils.combine(size_beds, out_file, data)
    return out_file
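`validate.EVENT_SIZES` is defined elsewhere in bcbio and is not shown in this listing. To make the `(e_start, e_end)` filtering above concrete, here is a small self-contained sketch with made-up size tiers; the real tiers differ:

# Hypothetical size tiers (start, end) in bp; the real values live in
# bcbio.structural.validate.EVENT_SIZES.
EXAMPLE_EVENT_SIZES = [(1, 1000), (1000, 10000), (10000, 1000000)]

def bin_bed_lines_by_size(bed_lines, event_sizes=EXAMPLE_EVENT_SIZES):
    """Group BED lines into event-size tiers; BND (breakend) calls are kept
    in every tier since their end - start span is not meaningful."""
    binned = {size_range: [] for size_range in event_sizes}
    for line in bed_lines:
        chrom, start, end, event_str = line.split()[:4]
        event = event_str.split("_", 1)[0]
        size = int(end) - int(start)
        for e_start, e_end in event_sizes:
            if (e_start <= size < e_end) or event == "BND":
                binned[(e_start, e_end)].append(line)
    return binned

# Example: bin_bed_lines_by_size(["1\t100\t600\tDEL_lumpy", "1\t50\t50\tBND_manta"])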
Example 4
def _prep_bed(data, work_dir):
    """Selecting the bed file, cleaning, and properly annotating for Seq2C
    """
    bed_file = regions.get_sv_bed(data)
    if bed_file:
        bed_file = clean_file(bed_file, data, prefix="svregions-")
    else:
        bed_file = clean_file(dd.get_variant_regions(data), data)

    col_num = bt.BedTool(bed_file).field_count()
    if col_num < 4:
        annotated_file = annotate.add_genes(bed_file, data, max_distance=0)
        if annotated_file == bed_file:
            raise ValueError("BED file for Seq2C must be annotated with gene names, "
                             "however the input BED is 3-columns and we have no transcript "
                             "data to annotate with " + bed_file)
        annotated_file = annotate.gene_one_per_line(annotated_file, data)
    else:
        annotated_file = bed_file

    ready_file = "%s-seq2cclean.bed" % (utils.splitext_plus(annotated_file)[0])
    if not utils.file_uptodate(ready_file, annotated_file):
        bed = bt.BedTool(annotated_file)
        if col_num > 4 and col_num != 8:
            bed = bed.cut(range(4))
        bed = bed.filter(lambda x: x.name not in ["", ".", "-"])
        with file_transaction(data, ready_file) as tx_out_file:
            bed.saveas(tx_out_file)
        logger.debug("Saved Seq2C clean annotated ready input BED into " + ready_file)

    return ready_file
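`annotate.gene_one_per_line` is not reproduced in this listing, but Example 20 below performs the same expansion inline. As a sketch of the expected behavior, assuming a 4-column BED whose name field may hold comma-separated gene lists:

def gene_one_per_line_sketch(in_bed, out_bed):
    """Sketch: expand comma-separated gene names so each output row carries
    a single gene (mirrors the inline loop in Example 20)."""
    with open(in_bed) as in_handle, open(out_bed, "w") as out_handle:
        for line in in_handle:
            chrom, start, end, names = line.rstrip("\n").split("\t")[:4]
            for gene in names.split(","):
                out_handle.write("\t".join([chrom, start, end, gene]) + "\n")
    return out_bed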
Example 5
def _annotate_bed(bed_fpath, data, work_dir):
    annotate_bed = annotate.add_genes(bed_fpath, data, work_dir=work_dir)
    if annotate_bed == bed_fpath:
        raise ValueError("BED file for Seq2C must be annotated with gene names, "
                         "however the input BED is 3-columns and we have no transcript "
                         "data to annotate with" + bed_fpath)
    return annotate_bed
Example 6
def _gids_to_genes(gids, ssm_locs, cnv_ssms, data):
    """Convert support ids for SNPs and SSMs into associated genes.
    """
    locs = collections.defaultdict(set)
    for gid in gids:
        cur_locs = []
        try:
            cur_locs.append(ssm_locs[gid])
        except KeyError:
            for ssm_loc in cnv_ssms.get(gid, []):
                cur_locs.append(ssm_locs[ssm_loc])
        for chrom, pos in cur_locs:
            locs[chrom].add(pos)
    genes = set([])
    with tx_tmpdir(data) as tmpdir:
        chrom_prefix = "chr" if next(ref.file_contigs(dd.get_ref_file(data))).name.startswith("chr") else ""
        loc_file = os.path.join(tmpdir, "battenberg_find_genes.bed")
        with open(loc_file, "w") as out_handle:
            for chrom in sorted(locs.keys()):
                for loc in sorted(list(locs[chrom])):
                    out_handle.write("%s%s\t%s\t%s\n" % (chrom_prefix, chrom, loc - 1, loc))
        ann_file = annotate.add_genes(loc_file, data, max_distance=10000)
        for r in pybedtools.BedTool(ann_file):
            for gene in r.name.split(","):
                if gene != ".":
                    genes.add(gene)
    return sorted(list(genes))
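The `loc - 1, loc` above converts a 1-based variant position into a 0-based, half-open BED interval of length one before annotation. A minimal standalone check of that conversion:

def position_to_bed_interval(chrom, pos, chrom_prefix=""):
    """Convert a 1-based position into a 0-based, half-open BED line."""
    return "%s%s\t%s\t%s" % (chrom_prefix, chrom, pos - 1, pos)

assert position_to_bed_interval("1", 100, "chr") == "chr1\t99\t100"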
Example 7
def _add_variantcalls_to_output(out, data):
    """Call ploidy and convert into VCF and BED representations.
    """
    call_file = "%s-call%s" % os.path.splitext(out["cns"])
    gender = dd.get_gender(data)
    if not utils.file_exists(call_file):
        with file_transaction(data, call_file) as tx_call_file:
            cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "call",
                   "--ploidy", str(dd.get_ploidy(data)),
                   "-o", tx_call_file, out["cns"]]
            if gender:
                cmd += ["--gender", gender]
                if gender.lower() == "male":
                    cmd += ["--male-reference"]
            do.run(cmd, "CNVkit call ploidy")
    calls = {}
    for outformat in ["bed", "vcf"]:
        out_file = "%s.%s" % (os.path.splitext(call_file)[0], outformat)
        calls[outformat] = out_file
        if not utils.file_exists(out_file):
            with file_transaction(data, out_file) as tx_out_file:
                cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "export",
                       outformat, "--sample-id", dd.get_sample_name(data),
                       "--ploidy", str(dd.get_ploidy(data)),
                       "-o", tx_out_file, call_file]
                if gender and gender.lower() == "male":
                    cmd += ["--male-reference"]
                do.run(cmd, "CNVkit export %s" % outformat)
    out["call_file"] = call_file
    out["vrn_bed"] = annotate.add_genes(calls["bed"], data)
    effects_vcf, _ = effects.add_to_vcf(calls["vcf"], data, "snpeff")
    out["vrn_file"] = effects_vcf or calls["vcf"]
    return out
Example 8
def _run_cnvkit_shared(data, test_bams, background_bams, access_file, work_dir,
                       background_name=None):
    """Shared functionality to run CNVkit.
    """
    ref_file = dd.get_ref_file(data)
    raw_work_dir = os.path.join(work_dir, "raw")
    out_base = os.path.splitext(os.path.basename(test_bams[0]))[0].split(".")[0]
    background_cnn = "%s_background.cnn" % (background_name if background_name else "flat")
    files = {"cnr": os.path.join(raw_work_dir, "%s.cnr" % out_base),
             "cns": os.path.join(raw_work_dir, "%s.cns" % out_base),
             "back_cnn": os.path.join(raw_work_dir, background_cnn)}
    if not utils.file_exists(files["cnr"]):
        if os.path.exists(raw_work_dir):
            shutil.rmtree(raw_work_dir)
        with tx_tmpdir(data, work_dir) as tx_work_dir:
            # pick targets, anti-targets and access files based on analysis type
            # http://cnvkit.readthedocs.org/en/latest/nonhybrid.html
            cov_interval = dd.get_coverage_interval(data)
            base_regions = dd.get_variant_regions(data)
            # For genome calls, subset to regions within 10kb of genes
            if cov_interval == "genome":
                base_regions = annotate.subset_by_genes(base_regions, data,
                                                        work_dir, pad=1e4)

            raw_target_bed = bedutils.merge_overlaps(base_regions, data,
                                                     out_dir=work_dir)
            target_bed = annotate.add_genes(raw_target_bed, data)

            # bail out if we ended up with no regions
            if not utils.file_exists(target_bed):
                return {}

            if cov_interval == "amplicon":
                target_opts = ["--targets", target_bed, "--access", target_bed]
            elif cov_interval == "genome":
                target_opts = ["--targets", target_bed, "--access", dd.get_variant_regions(data)]
            else:
                target_opts = ["--targets", target_bed, "--access", access_file]

            cores = min(tz.get_in(["config", "algorithm", "num_cores"], data, 1),
                        len(test_bams) + len(background_bams))
            cmd = [_get_cmd(), "batch"] + \
                  test_bams + ["-n"] + background_bams + ["-f", ref_file] + \
                  target_opts + \
                  ["-d", tx_work_dir, "--split", "-p", str(cores),
                   "--output-reference", os.path.join(tx_work_dir, background_cnn)]
            at_avg, at_min, t_avg = _get_antitarget_size(access_file, target_bed)
            if at_avg:
                cmd += ["--antitarget-avg-size", str(at_avg), "--antitarget-min-size", str(at_min),
                        "--target-avg-size", str(t_avg)]
            local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                         "lib", "R", "site-library")
            cmd += ["--rlibpath", local_sitelib]
            do.run(cmd, "CNVkit batch")
            shutil.move(tx_work_dir, raw_work_dir)
    for ftype in ["cnr", "cns"]:
        if not os.path.exists(files[ftype]):
            raise IOError("Missing CNVkit %s file: %s" % (ftype, files[ftype]))
    return files
Example 9
def _run_cnvkit_shared(items, test_bams, background_bams, work_dir, background_name=None):
    """Shared functionality to run CNVkit, parallelizing over multiple BAM files.
    """
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))

    background_cnn = os.path.join(raw_work_dir, "%s_background.cnn" % (background_name if background_name else "flat"))
    ckouts = []
    for test_bam in test_bams:
        out_base = _bam_to_outbase(test_bam, raw_work_dir)
        ckouts.append({"cnr": "%s.cns" % out_base, "cns": "%s.cns" % out_base, "back_cnn": background_cnn})
    if not utils.file_exists(ckouts[0]["cnr"]):
        data = items[0]
        cov_interval = dd.get_coverage_interval(data)
        raw_target_bed, access_bed = _get_target_access_files(cov_interval, data, work_dir)
        # bail out if we ended up with no regions
        if not utils.file_exists(raw_target_bed):
            return {}
        raw_target_bed = annotate.add_genes(raw_target_bed, data)
        parallel = {"type": "local", "cores": dd.get_cores(data), "progs": ["cnvkit"]}
        target_bed, antitarget_bed = _cnvkit_targets(raw_target_bed, access_bed, cov_interval, raw_work_dir, data)

        def _bam_to_itype(bam):
            return "background" if bam in background_bams else "evaluate"

        split_cnns = run_multicore(
            _cnvkit_coverage,
            [
                (bam, bed, _bam_to_itype(bam), raw_work_dir, data)
                for bam in test_bams + background_bams
                for bed in _split_bed(target_bed, data) + _split_bed(antitarget_bed, data)
            ],
            data["config"],
            parallel,
        )
        coverage_cnns = _merge_coverage(split_cnns, data)
        background_cnn = _cnvkit_background(
            [x["file"] for x in coverage_cnns if x["itype"] == "background"],
            background_cnn,
            target_bed,
            antitarget_bed,
            data,
        )
        fixed_cnrs = run_multicore(
            _cnvkit_fix,
            [
                (cnns, background_cnn, data)
                for cnns in tz.groupby("bam", [x for x in coverage_cnns if x["itype"] == "evaluate"]).values()
            ],
            data["config"],
            parallel,
        )
        called_segs = run_multicore(
            _cnvkit_segment, [(cnr, cov_interval, data) for cnr in fixed_cnrs], data["config"], parallel
        )
    return ckouts
Example 10
def _calculate_sv_coverage_cnvkit(data, work_dir):
    """Calculate coverage in an CNVkit ready format using mosdepth.
    """
    from bcbio.variation import coverage
    from bcbio.structural import annotate
    out_target_file = os.path.join(work_dir, "%s-target-coverage.cnn" % dd.get_sample_name(data))
    out_anti_file = os.path.join(work_dir, "%s-antitarget-coverage.cnn" % dd.get_sample_name(data))
    if ((not utils.file_exists(out_target_file) or not utils.file_exists(out_anti_file)) and
          (dd.get_align_bam(data) or dd.get_work_bam(data))):
        target_cov = coverage.run_mosdepth(data, "target", tz.get_in(["regions", "bins", "target"], data))
        anti_cov = coverage.run_mosdepth(data, "antitarget", tz.get_in(["regions", "bins", "antitarget"], data))
        target_cov_genes = annotate.add_genes(target_cov.regions, data, max_distance=0)
        out_target_file = _add_log2_depth(target_cov_genes, out_target_file, data)
        out_anti_file = _add_log2_depth(anti_cov.regions, out_anti_file, data)
    return out_target_file, out_anti_file
Example 11
def _run_cnvkit_shared(inputs, backgrounds):
    """Shared functionality to run CNVkit, parallelizing over multiple BAM files.
    """
    work_dir = _sv_workdir(inputs[0])
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
    background_name = dd.get_sample_name(backgrounds[0]) if backgrounds else "flat"
    background_cnn = os.path.join(raw_work_dir, "%s_background.cnn" % (background_name))
    ckouts = []
    for cur_input in inputs:
        cur_raw_work_dir = utils.safe_makedir(os.path.join(_sv_workdir(cur_input), "raw"))
        out_base = _bam_to_outbase(dd.get_align_bam(cur_input), cur_raw_work_dir)
        ckouts.append({"cnr": "%s.cnr" % out_base,
                       "cns": "%s.cns" % out_base,
                       "back_cnn": background_cnn})
    if not utils.file_exists(ckouts[0]["cnr"]):
        cov_interval = dd.get_coverage_interval(inputs[0])
        raw_target_bed, access_bed = _get_target_access_files(cov_interval, inputs[0], work_dir)
        # bail out if we ended up with no regions
        if not utils.file_exists(raw_target_bed):
            return {}
        raw_target_bed = annotate.add_genes(raw_target_bed, inputs[0])
        parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]}
        pct_coverage = (pybedtools.BedTool(raw_target_bed).total_coverage() /
                        float(pybedtools.BedTool(access_bed).total_coverage())) * 100.0
        target_bed, antitarget_bed = _cnvkit_targets(raw_target_bed, access_bed, cov_interval,
                                                     pct_coverage, raw_work_dir, inputs[0])
        split_beds = _split_bed(target_bed, inputs[0]) + _split_bed(antitarget_bed, inputs[0])
        samples_to_run = zip(["background"] * len(backgrounds), backgrounds) + \
                         zip(["evaluate"] * len(inputs), inputs)
        split_cnns = run_multicore(_cnvkit_coverage,
                                   [(cdata, bed, itype) for itype, cdata in samples_to_run for bed in split_beds],
                                   inputs[0]["config"], parallel)
        raw_coverage_cnns = _merge_coverage(split_cnns, inputs[0])
        coverage_cnns = run_multicore(_cnvkit_metrics,
                                      [(cnns, target_bed, antitarget_bed, cov_interval, inputs + backgrounds)
                                       for cnns in tz.groupby("bam", raw_coverage_cnns).values()],
                                      inputs[0]["config"], parallel)
        background_cnn = _cnvkit_background(_select_background_cnns(coverage_cnns),
                                            background_cnn, target_bed, antitarget_bed, inputs[0])
        fixed_cnrs = run_multicore(_cnvkit_fix,
                                   [(cnns, background_cnn, inputs + backgrounds) for cnns in
                                    tz.groupby("bam", [x for x in coverage_cnns
                                                       if x["itype"] == "evaluate"]).values()],
                                      inputs[0]["config"], parallel)
        run_multicore(_cnvkit_segment,
                      [(cnr, cov_interval, data) for cnr, data in fixed_cnrs],
                      inputs[0]["config"], parallel)
    return ckouts
Example 12
def _run_cnvkit_shared(data, test_bams, background_bams, work_dir, background_name=None):
    """Shared functionality to run CNVkit.
    """
    ref_file = dd.get_ref_file(data)
    raw_work_dir = os.path.join(work_dir, "raw")
    out_base = os.path.splitext(os.path.basename(test_bams[0]))[0].split(".")[0]

    background_cnn = "%s_background.cnn" % (background_name if background_name else "flat")
    files = {"cnr": os.path.join(raw_work_dir, "%s.cnr" % out_base),
             "cns": os.path.join(raw_work_dir, "%s.cns" % out_base),
             "back_cnn": os.path.join(raw_work_dir, background_cnn)}
    if not utils.file_exists(files["cnr"]):
        if os.path.exists(raw_work_dir):
            shutil.rmtree(raw_work_dir)
        with tx_tmpdir(data, work_dir) as tx_work_dir:
            cov_interval = dd.get_coverage_interval(data)
            raw_target_bed, access_bed = _get_target_access_files(cov_interval, data, work_dir)
            # bail out if we ended up with no regions
            if not utils.file_exists(raw_target_bed):
                return {}
            target_bed = annotate.add_genes(raw_target_bed, data)

            # Do not parallelize cnvkit due to current issues with multi-processing
            cores = 1
            # cores = min(tz.get_in(["config", "algorithm", "num_cores"], data, 1),
            #             len(test_bams) + len(background_bams))
            cmd = [_get_cmd(), "batch"] + \
                  test_bams + ["-n"] + background_bams + ["-f", ref_file] + \
                  ["--targets", target_bed, "--access", access_bed] + \
                  ["-d", tx_work_dir, "--split", "-p", str(cores),
                   "--output-reference", os.path.join(tx_work_dir, background_cnn)]
            if cov_interval not in ["amplicon", "genome"]:
                at_avg, at_min, t_avg = _get_antitarget_size(access_bed, target_bed)
                if at_avg:
                    cmd += ["--antitarget-avg-size", str(at_avg), "--antitarget-min-size", str(at_min),
                            "--target-avg-size", str(t_avg)]
            local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                         "lib", "R", "site-library")
            cmd += ["--rlibpath", local_sitelib]
            do.run(cmd, "CNVkit batch")
            shutil.move(tx_work_dir, raw_work_dir)
    for ftype in ["cnr", "cns"]:
        if not os.path.exists(files[ftype]):
            raise IOError("Missing CNVkit %s file: %s" % (ftype, files[ftype]))
    return files
Example 13
def _add_bed_to_output(out, data):
    """Add FreeBayes cnvmap BED-like representation to the output.
    """
    out_file = "%s.bed" % os.path.splitext(out["cns"])[0]
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "export",
                   "freebayes", "--sample-id", dd.get_sample_name(data),
                   "--ploidy", str(dd.get_ploidy(data)),
                   "-o", tx_out_file, out["cns"]]
            gender = dd.get_gender(data)
            if gender:
                cmd += ["--gender", gender]
                if gender.lower() == "male":
                    cmd += ["--male-reference"]
            do.run(cmd, "CNVkit export FreeBayes BED cnvmap")
    out["vrn_file"] = annotate.add_genes(out_file, data)
    return out
Example 14
def _calculate_sv_coverage_gatk(data, work_dir):
    """Calculate coverage in defined regions using GATK tools

    TODO: This performs duplicate calculations to get GATK4-compatible HDF read counts
    and then depth and gene annotations. Both are needed for creating heterogeneity inputs.
    Ideally replace these with a single mosdepth coverage calculation, and create the GATK4 TSV format:

    CONTIG  START   END     COUNT
    chrM    1       1000    13268
    """
    from bcbio.variation import coverage
    from bcbio.structural import annotate
    # GATK compatible
    target_file = gatkcnv.collect_read_counts(data, work_dir)
    # heterogeneity compatible
    target_in = bedutils.clean_file(tz.get_in(["regions", "bins", "target"], data), data, bedprep_dir=work_dir)
    target_cov = coverage.run_mosdepth(data, "target-gatk", target_in)
    target_cov_genes = annotate.add_genes(target_cov.regions, data, max_distance=0)
    return target_file, target_cov_genes
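The TODO above suggests deriving the GATK4-style table directly from mosdepth output. As a hedged sketch of that conversion (not the bcbio implementation), assuming a mosdepth regions BED with chromosome, start and end columns plus a per-region coverage value in the last column; rounding that value into an integer COUNT is purely illustrative:

def mosdepth_regions_to_gatk_tsv(regions_bed, out_tsv):
    """Sketch: rewrite mosdepth per-region output as the CONTIG/START/END/COUNT
    table described in the docstring above."""
    with open(regions_bed) as in_handle, open(out_tsv, "w") as out_handle:
        out_handle.write("CONTIG\tSTART\tEND\tCOUNT\n")
        for line in in_handle:
            parts = line.rstrip("\n").split("\t")
            chrom, start, end = parts[:3]
            value = float(parts[-1])  # per-region coverage from mosdepth (assumption)
            out_handle.write("%s\t%s\t%s\t%s\n" % (chrom, start, end, int(round(value))))
    return out_tsv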
Example 15
def _add_variantcalls_to_output(out, data, is_somatic=False):
    """Call ploidy and convert into VCF and BED representations.
    """
    call_file = "%s-call%s" % os.path.splitext(out["cns"])
    gender = population.get_gender(data)
    if not utils.file_exists(call_file):
        with file_transaction(data, call_file) as tx_call_file:
            filters = ["--filter", "cn"]
            cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "call"] + \
                  filters + \
                   ["--ploidy", str(ploidy.get_ploidy([data])),
                    "-o", tx_call_file, out["cns"]]
            small_vrn_files = _compatible_small_variants(data)
            if len(small_vrn_files) > 0 and _cna_has_values(out["cns"]):
                cmd += ["-v", small_vrn_files[0]]
                if not is_somatic:
                    cmd += ["-m", "clonal"]
            if gender and gender.lower() != "unknown":
                cmd += ["--gender", gender]
                if gender.lower() == "male":
                    cmd += ["--male-reference"]
            do.run(cmd, "CNVkit call ploidy")
    calls = {}
    for outformat in ["bed", "vcf"]:
        out_file = "%s.%s" % (os.path.splitext(call_file)[0], outformat)
        calls[outformat] = out_file
        if not os.path.exists(out_file):
            with file_transaction(data, out_file) as tx_out_file:
                cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "export",
                       outformat, "--sample-id", dd.get_sample_name(data),
                       "--ploidy", str(ploidy.get_ploidy([data])),
                       "-o", tx_out_file, call_file]
                if gender and gender.lower() == "male":
                    cmd += ["--male-reference"]
                do.run(cmd, "CNVkit export %s" % outformat)
    out["call_file"] = call_file
    out["vrn_bed"] = annotate.add_genes(calls["bed"], data)
    effects_vcf, _ = effects.add_to_vcf(calls["vcf"], data, "snpeff")
    out["vrn_file"] = effects_vcf or calls["vcf"]
    return out
Example 16
def _add_variantcalls_to_output(out, data, items, is_somatic=False):
    """Call ploidy and convert into VCF and BED representations.
    """
    call_file = "%s-call%s" % os.path.splitext(out["cns"])
    if not utils.file_exists(call_file):
        with file_transaction(data, call_file) as tx_call_file:
            filters = ["--filter", "cn"]
            cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "call"] + \
                  filters + \
                   ["--ploidy", str(ploidy.get_ploidy([data])),
                    "-o", tx_call_file, out["cns"]]
            small_vrn_files = _compatible_small_variants(data, items)
            if len(small_vrn_files) > 0 and _cna_has_values(out["cns"]):
                cmd += ["--vcf", small_vrn_files[0].name, "--sample-id", small_vrn_files[0].sample]
                if small_vrn_files[0].normal:
                    cmd += ["--normal-id", small_vrn_files[0].normal]
                if not is_somatic:
                    cmd += ["-m", "clonal"]
            gender = _get_batch_gender(items)
            if gender:
                cmd += ["--sample-sex", gender]
            do.run(cmd, "CNVkit call ploidy")
    calls = {}
    for outformat in ["bed", "vcf"]:
        out_file = "%s.%s" % (os.path.splitext(call_file)[0], outformat)
        calls[outformat] = out_file
        if not os.path.exists(out_file):
            with file_transaction(data, out_file) as tx_out_file:
                cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "export",
                       outformat, "--sample-id", dd.get_sample_name(data),
                       "--ploidy", str(ploidy.get_ploidy([data])),
                       "-o", tx_out_file, call_file]
                do.run(cmd, "CNVkit export %s" % outformat)
    out["call_file"] = call_file
    out["vrn_bed"] = annotate.add_genes(calls["bed"], data)
    effects_vcf, _ = effects.add_to_vcf(calls["vcf"], data, "snpeff")
    out["vrn_file"] = effects_vcf or calls["vcf"]
    out["vrn_file"] = shared.annotate_with_depth(out["vrn_file"], items)
    return out
Example 17
def prep_seq2c_bed(data):
    """Selecting the bed file, cleaning, and properly annotating for Seq2C
    """
    if dd.get_background_cnv_reference(data, "seq2c"):
        bed_file = _background_to_bed(
            dd.get_background_cnv_reference(data, "seq2c"), data)
    else:
        bed_file = regions.get_sv_bed(data)
    if bed_file:
        bed_file = bedutils.clean_file(bed_file, data, prefix="svregions-")
    else:
        bed_file = bedutils.clean_file(dd.get_variant_regions(data), data)
    if not bed_file:
        return None

    col_num = bt.BedTool(bed_file).field_count()
    if col_num < 4:
        annotated_file = annotate.add_genes(bed_file, data, max_distance=0)
        if annotated_file == bed_file:
            raise ValueError(
                "BED file for Seq2C must be annotated with gene names, "
                "however the input BED is 3-columns and we have no transcript "
                "data to annotate with " + bed_file)
        annotated_file = annotate.gene_one_per_line(annotated_file, data)
    else:
        annotated_file = bed_file

    ready_file = "%s-seq2cclean.bed" % (utils.splitext_plus(annotated_file)[0])
    if not utils.file_uptodate(ready_file, annotated_file):
        bed = bt.BedTool(annotated_file)
        if col_num > 4 and col_num != 8:
            bed = bed.cut(range(4))
        bed = bed.filter(lambda x: x.name not in ["", ".", "-"])
        with file_transaction(data, ready_file) as tx_out_file:
            bed.saveas(tx_out_file)
        logger.debug("Saved Seq2C clean annotated ready input BED into " +
                     ready_file)

    return ready_file
Example 18
def _add_variantcalls_to_output(out, data, items, is_somatic=False):
    """Call ploidy and convert into VCF and BED representations.
    """
    call_file = "%s-call%s" % os.path.splitext(out["cns"])
    if not utils.file_exists(call_file):
        with file_transaction(data, call_file) as tx_call_file:
            filters = ["--filter", "cn"]
            cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "call"] + \
                  filters + \
                   ["--ploidy", str(ploidy.get_ploidy([data])),
                    "-o", tx_call_file, out["cns"]]
            small_vrn_files = _compatible_small_variants(data, items)
            if len(small_vrn_files) > 0 and _cna_has_values(out["cns"]):
                cmd += ["--vcf", small_vrn_files[0].name, "--sample-id", small_vrn_files[0].sample]
                if small_vrn_files[0].normal:
                    cmd += ["--normal-id", small_vrn_files[0].normal]
                if not is_somatic:
                    cmd += ["-m", "clonal"]
            gender = _get_batch_gender(items)
            if gender:
                cmd += ["--sample-sex", gender]
            do.run(cmd, "CNVkit call ploidy")
    calls = {}
    for outformat in ["bed", "vcf"]:
        out_file = "%s.%s" % (os.path.splitext(call_file)[0], outformat)
        calls[outformat] = out_file
        if not os.path.exists(out_file):
            with file_transaction(data, out_file) as tx_out_file:
                cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "export",
                       outformat, "--sample-id", dd.get_sample_name(data),
                       "--ploidy", str(ploidy.get_ploidy([data])),
                       "-o", tx_out_file, call_file]
                do.run(cmd, "CNVkit export %s" % outformat)
    out["call_file"] = call_file
    out["vrn_bed"] = annotate.add_genes(calls["bed"], data)
    effects_vcf, _ = effects.add_to_vcf(calls["vcf"], data, "snpeff")
    out["vrn_file"] = effects_vcf or calls["vcf"]
    out["vrn_file"] = shared.annotate_with_depth(out["vrn_file"], items)
    return out
Example 19
def _add_variantcalls_to_output(out, data, is_somatic=False):
    """Call ploidy and convert into VCF and BED representations.
    """
    call_file = "%s-call%s" % os.path.splitext(out["cns"])
    gender = population.get_gender(data)
    if not utils.file_exists(call_file):
        with file_transaction(data, call_file) as tx_call_file:
            cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "call",
                   "--ploidy", str(ploidy.get_ploidy([data])),
                   "-o", tx_call_file, out["cns"]]
            small_vrn_files = _compatible_small_variants(data)
            if len(small_vrn_files) > 0 and _cna_has_values(out["cns"]):
                cmd += ["-v", small_vrn_files[0]]
                if not is_somatic:
                    cmd += ["-m", "clonal"]
            if gender and gender.lower() != "unknown":
                cmd += ["--gender", gender]
                if gender.lower() == "male":
                    cmd += ["--male-reference"]
            do.run(cmd, "CNVkit call ploidy")
    calls = {}
    for outformat in ["bed", "vcf"]:
        out_file = "%s.%s" % (os.path.splitext(call_file)[0], outformat)
        calls[outformat] = out_file
        if not os.path.exists(out_file):
            with file_transaction(data, out_file) as tx_out_file:
                cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "export",
                       outformat, "--sample-id", dd.get_sample_name(data),
                       "--ploidy", str(ploidy.get_ploidy([data])),
                       "-o", tx_out_file, call_file]
                if gender and gender.lower() == "male":
                    cmd += ["--male-reference"]
                do.run(cmd, "CNVkit export %s" % outformat)
    out["call_file"] = call_file
    out["vrn_bed"] = annotate.add_genes(calls["bed"], data)
    effects_vcf, _ = effects.add_to_vcf(calls["vcf"], data, "snpeff")
    out["vrn_file"] = effects_vcf or calls["vcf"]
    return out
Example 20
def _prep_bed(data, bed_file, work_dir):
    clean_file = os.path.join(work_dir, "%s-clean.bed" % (utils.splitext_plus(os.path.basename(bed_file))[0]))
    bed = bt.BedTool(bed_file)
    col_num = bed.field_count()

    if not utils.file_uptodate(clean_file, bed_file):
        bed = bed.filter(lambda x: x.chrom and
                         not any(x.chrom.startswith(e) for e in ['#', ' ', 'track', 'browser']))
        bed = bed.remove_invalid()
        with file_transaction(data, clean_file) as tx_out_file:
            bed.saveas(tx_out_file)
        logger.debug("Saved Seq2C clean BED file into " + clean_file)

    if col_num < 4:
        annotated_file = annotate.add_genes(clean_file, data, max_distance=0, work_dir=work_dir)
        if annotated_file == clean_file:
            raise ValueError("BED file for Seq2C must be annotated with gene names, "
                             "however the input BED is 3-columns and we have no transcript "
                             "data to annotate with " + bed_file)
    else:
        annotated_file = clean_file

    ready_file = os.path.join(work_dir, "%s-clean.bed" % (utils.splitext_plus(os.path.basename(annotated_file))[0]))
    if not utils.file_uptodate(ready_file, annotated_file):
        bed = bt.BedTool(annotated_file)
        if col_num > 4 and col_num != 8:
            bed = bed.cut(range(4))
        bed = bed.filter(lambda x: x.name not in ["", ".", "-"])

        # Report all duplicated annotations one-per-line
        with file_transaction(data, ready_file) as tx_out_file:
            with open(tx_out_file, 'w') as out:
                for r in bed:
                    for g in r.name.split(','):
                        out.write('\t'.join(map(str, [r.chrom, r.start, r.end, g])) + '\n')
        logger.debug("Saved Seq2C clean annotated ready input BED into " + ready_file)

    return ready_file
Example 21
def _calculate_sv_coverage_cnvkit(data, work_dir):
    """Calculate coverage in an CNVkit ready format using mosdepth.
    """
    from bcbio.variation import coverage
    from bcbio.structural import annotate
    out_target_file = os.path.join(
        work_dir, "%s-target-coverage.cnn" % dd.get_sample_name(data))
    out_anti_file = os.path.join(
        work_dir, "%s-antitarget-coverage.cnn" % dd.get_sample_name(data))
    if ((not utils.file_exists(out_target_file)
         or not utils.file_exists(out_anti_file))
            and (dd.get_align_bam(data) or dd.get_work_bam(data))):
        target_cov = coverage.run_mosdepth(
            data, "target", tz.get_in(["regions", "bins", "target"], data))
        anti_cov = coverage.run_mosdepth(
            data, "antitarget",
            tz.get_in(["regions", "bins", "antitarget"], data))
        target_cov_genes = annotate.add_genes(target_cov.regions,
                                              data,
                                              max_distance=0)
        out_target_file = _add_log2_depth(target_cov_genes, out_target_file,
                                          data)
        out_anti_file = _add_log2_depth(anti_cov.regions, out_anti_file, data)
    return out_target_file, out_anti_file
Example 22
def _calculate_sv_coverage_gatk(data, work_dir):
    """Calculate coverage in defined regions using GATK tools

    TODO: This performs duplicate calculations to get GATK4-compatible HDF read counts
    and then depth and gene annotations. Both are needed for creating heterogeneity inputs.
    Ideally replace these with a single mosdepth coverage calculation, and create the GATK4 TSV format:

    CONTIG  START   END     COUNT
    chrM    1       1000    13268
    """
    from bcbio.variation import coverage
    from bcbio.structural import annotate
    # GATK compatible
    target_file = gatkcnv.collect_read_counts(data, work_dir)
    # heterogeneity compatible
    target_in = bedutils.clean_file(tz.get_in(["regions", "bins", "target"],
                                              data),
                                    data,
                                    bedprep_dir=work_dir)
    target_cov = coverage.run_mosdepth(data, "target-gatk", target_in)
    target_cov_genes = annotate.add_genes(target_cov.regions,
                                          data,
                                          max_distance=0)
    return target_file, target_cov_genes
Example 23
def _add_variantcalls_to_output(out, data):
    """Call ploidy and convert into VCF and BED representations.
    """
    call_file = "%s-call%s" % os.path.splitext(out["cns"])
    gender = dd.get_gender(data)
    if not utils.file_exists(call_file):
        with file_transaction(data, call_file) as tx_call_file:
            cmd = [
                os.path.join(os.path.dirname(sys.executable), "cnvkit.py"),
                "call", "--ploidy",
                str(dd.get_ploidy(data)), "-o", tx_call_file, out["cns"]
            ]
            if gender:
                cmd += ["--gender", gender]
                if gender.lower() == "male":
                    cmd += ["--male-reference"]
            do.run(cmd, "CNVkit call ploidy")
    calls = {}
    for outformat in ["bed", "vcf"]:
        out_file = "%s.%s" % (os.path.splitext(call_file)[0], outformat)
        calls[outformat] = out_file
        if not utils.file_exists(out_file):
            with file_transaction(data, out_file) as tx_out_file:
                cmd = [
                    os.path.join(os.path.dirname(sys.executable), "cnvkit.py"),
                    "export", outformat, "--sample-id",
                    dd.get_sample_name(data), "--ploidy",
                    str(dd.get_ploidy(data)), "-o", tx_out_file, call_file
                ]
                if gender and gender.lower() == "male":
                    cmd += ["--male-reference"]
                do.run(cmd, "CNVkit export %s" % outformat)
    out["call_file"] = call_file
    out["vcf_file"] = calls["vcf"]
    out["vrn_file"] = annotate.add_genes(calls["bed"], data)
    return out
Example 24
def _merge_target_information(samples):
    out_file = os.path.join("metrics", "target_info.yaml")
    if utils.file_exists(out_file):
        return samples

    genomes = set(dd.get_genome_build(data) for data in samples)
    coverage_beds = set(dd.get_coverage(data) for data in samples)
    variant_regions = set(dd.get_variant_regions(data) for data in samples)

    data = samples[0]
    info = {}

    # Reporting in MultiQC only if the genome is the same across samples
    if len(genomes) == 1:
        info["genome_info"] = {
            "name": dd.get_genome_build(data),
            "size": sum([c.size for c in ref.file_contigs(dd.get_ref_file(data),
                                                          data["config"])]),
        }

    # Reporting in MultiQC only if the target is the same across samples
    vcr = None
    if len(variant_regions) == 1:
        vcr = dd.get_variant_regions_orig(data)
        vcr_merged = dd.get_variant_regions_merged(data)
        vcr_ann = annotate.add_genes(vcr, data)
        info["variants_regions_info"] = {
            "bed": vcr,
            "size": sum(len(x) for x in pybedtools.BedTool(vcr_merged)),
            "regions": pybedtools.BedTool(vcr).count(),
            "genes": len(list(set(r.name for r in pybedtools.BedTool(vcr_ann)
                                  if r.name and r.name != "."))),
        }
    elif len(variant_regions) == 0:
        info["variants_regions_info"] = {"bed": None}

    # Reporting in MultiQC only if the target is the same across samples
    if len(coverage_beds) == 1:
        bed = dd.get_coverage(data)
        if vcr and vcr == bed:
            info["coverage_bed_info"] = info["variants_regions_info"]
        elif bed:
            ann_bed = annotate.add_genes(bed, data)
            info["coverage_bed_info"] = {
                "bed": bed,
                "size": pybedtools.BedTool(bed).total_coverage(),
                "regions": pybedtools.BedTool(bed).count(),
                "genes": len(list(set(r.name for r in pybedtools.BedTool(ann_bed)
                                      if r.name and r.name != "."))),
            }

    if info:
        with open(out_file, "w") as out_handle:
            yaml.safe_dump(info, out_handle)

    return samples
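For orientation, a small sketch of the kind of target_info.yaml layout the function above produces, built from a representative dictionary with the same keys; the values here are invented for illustration only:

import yaml

# Invented, illustrative values; the real numbers come from the reference
# contigs and the annotated BED files above.
info_example = {
    "genome_info": {"name": "GRCh38", "size": 3200000000},
    "variants_regions_info": {"bed": "/path/to/regions.bed", "size": 60000000,
                              "regions": 214000, "genes": 19000},
}
print(yaml.safe_dump(info_example, default_flow_style=False))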
Example 25
def _run_cnvkit_shared(inputs, backgrounds):
    """Shared functionality to run CNVkit, parallelizing over multiple BAM files.
    """
    work_dir = _sv_workdir(inputs[0])
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
    background_name = dd.get_sample_name(
        backgrounds[0]) if backgrounds else "flat"
    background_cnn = os.path.join(raw_work_dir,
                                  "%s_background.cnn" % (background_name))
    ckouts = []
    for cur_input in inputs:
        cur_raw_work_dir = utils.safe_makedir(
            os.path.join(_sv_workdir(cur_input), "raw"))
        out_base = _bam_to_outbase(dd.get_align_bam(cur_input),
                                   cur_raw_work_dir)
        ckouts.append({
            "cnr": "%s.cnr" % out_base,
            "cns": "%s.cns" % out_base,
            "back_cnn": background_cnn
        })
    if not utils.file_exists(ckouts[0]["cns"]):
        cov_interval = dd.get_coverage_interval(inputs[0])
        raw_target_bed, access_bed = _get_target_access_files(
            cov_interval, inputs[0], work_dir)
        # bail out if we ended up with no regions
        if not utils.file_exists(raw_target_bed):
            return {}
        raw_target_bed = annotate.add_genes(raw_target_bed, inputs[0])
        parallel = {
            "type": "local",
            "cores": dd.get_cores(inputs[0]),
            "progs": ["cnvkit"]
        }
        pct_coverage = (
            pybedtools.BedTool(raw_target_bed).total_coverage() /
            float(pybedtools.BedTool(access_bed).total_coverage())) * 100.0
        target_bed, antitarget_bed = _cnvkit_targets(raw_target_bed,
                                                     access_bed, cov_interval,
                                                     pct_coverage,
                                                     raw_work_dir, inputs[0])
        samples_to_run = zip(["background"] * len(backgrounds), backgrounds) + \
                         zip(["evaluate"] * len(inputs), inputs)
        raw_coverage_cnns = [
            _cnvkit_coverage(cdata, bed, itype)
            for itype, cdata in samples_to_run
            for bed in [target_bed, antitarget_bed]
        ]
        coverage_cnns = reduce(operator.add, [
            _cnvkit_metrics(cnns, target_bed, antitarget_bed, cov_interval,
                            inputs + backgrounds)
            for cnns in tz.groupby("bam", raw_coverage_cnns).values()
        ])
        background_cnn = _cnvkit_background(
            _select_background_cnns(coverage_cnns), background_cnn, target_bed,
            antitarget_bed, inputs[0])
        fixed_cnrs = run_multicore(
            _cnvkit_fix,
            [(cnns, background_cnn, inputs + backgrounds)
             for cnns in tz.groupby(
                 "bam", [x for x in coverage_cnns
                         if x["itype"] == "evaluate"]).values()],
            inputs[0]["config"], parallel)
        [_cnvkit_segment(cnr, cov_interval, data) for cnr, data in fixed_cnrs]
    return ckouts
Example 26
def _run_cnvkit_shared(items,
                       test_bams,
                       background_bams,
                       work_dir,
                       background_name=None):
    """Shared functionality to run CNVkit, parallelizing over multiple BAM files.
    """
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))

    background_cnn = os.path.join(
        raw_work_dir,
        "%s_background.cnn" % (background_name if background_name else "flat"))
    ckouts = []
    for test_bam in test_bams:
        out_base = os.path.splitext(
            os.path.basename(test_bam))[0].split(".")[0]
        ckouts.append({
            "cnr": os.path.join(raw_work_dir, "%s.cnr" % out_base),
            "cns": os.path.join(raw_work_dir, "%s.cns" % out_base),
            "back_cnn": background_cnn
        })
    if not utils.file_exists(ckouts[0]["cnr"]):
        data = items[0]
        cov_interval = dd.get_coverage_interval(data)
        raw_target_bed, access_bed = _get_target_access_files(
            cov_interval, data, work_dir)
        # bail out if we ended up with no regions
        if not utils.file_exists(raw_target_bed):
            return {}
        raw_target_bed = annotate.add_genes(raw_target_bed, data)
        parallel = {
            "type": "local",
            "cores": dd.get_cores(data),
            "progs": ["cnvkit"]
        }
        target_bed, antitarget_bed = _cnvkit_targets(raw_target_bed,
                                                     access_bed, cov_interval,
                                                     raw_work_dir, data)

        def _bam_to_itype(bam):
            return "background" if bam in background_bams else "evaluate"

        coverage_cnns = run_multicore(
            _cnvkit_coverage,
            [(bam, bed, _bam_to_itype(bam), raw_work_dir, data)
             for bam in test_bams + background_bams
             for bed in [target_bed, antitarget_bed]], data["config"],
            parallel)
        background_cnn = _cnvkit_background(
            [x["file"] for x in coverage_cnns if x["itype"] == "background"],
            background_cnn, target_bed, antitarget_bed, data)
        fixed_cnrs = run_multicore(_cnvkit_fix, [
            (cnns, background_cnn, data)
            for cnns in tz.groupby(lambda x: x[
                "bam"], [x for x in coverage_cnns
                         if x["itype"] == "evaluate"]).values()
        ], data["config"], parallel)
        called_segs = run_multicore(_cnvkit_segment, [(cnr, cov_interval, data)
                                                      for cnr in fixed_cnrs],
                                    data["config"], parallel)
    return ckouts