Ejemplo n.º 1
def priority_coverage(data, out_dir):
    """Calculate per-base depth over the svprioritize BED regions with sambamba.

    Pipes ``sambamba depth base`` (restricted to the cleaned BED, filtered with
    "not unmapped") through awk/sed and writes ``<sample>_priority_depth.bed``
    inside ``out_dir``. An existing output file is reused without re-running.

    Returns the output file path. When no svprioritize file is configured, it
    fails the ``file_exists`` check, or ``prioritize.is_gene_list`` flags it,
    the input ``data`` is returned unchanged instead.
    """
    from bcbio.structural import prioritize
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file) or prioritize.is_gene_list(bed_file):
        return data

    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_depth.bed")
    if file_exists(out_file):
        return out_file
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    # The temporary directory itself is not referenced; the context manager is
    # kept for whatever setup/cleanup it performs around the run.
    with tx_tmpdir(data, work_dir):
        cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)
        with file_transaction(out_file) as tx_out_file:
            # awk emits columns 1, 2, 2, 3 and 10 tab-separated; sed drops the
            # first output line.
            parse_cmd = "awk '{print $1\"\t\"$2\"\t\"$2\"\t\"$3\"\t\"$10}' | sed '1d'"
            cmd = ("{sambamba} depth base -t {nthreads} -L {cleaned_bed} "
                   "-F \"not unmapped\" "
                   "{in_bam} | {parse_cmd} > {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(sambamba=sambamba, nthreads=nthreads,
                              cleaned_bed=cleaned_bed, in_bam=in_bam,
                              parse_cmd=parse_cmd, tx_out_file=tx_out_file),
                   message.format(bed_file=bed_file, in_bam=in_bam))
    return out_file
Ejemplo n.º 2
def priority_total_coverage(data):
    """Calculate coverage at 10 depth intervals in the priority regions.

    Runs ``sambamba depth region`` over the decommented svprioritize BED with
    thresholds 10, 20, ... 100 and stores the absolute path of the result in
    ``data['priority_total_coverage']``. An existing output file is reused.

    Returns ``data``; it is returned untouched when no svprioritize BED is
    configured or the configured file fails the ``file_exists`` check.
    """
    bed_file = dd.get_svprioritize(data)
    # Either condition alone must skip the calculation: with `and`, a
    # configured path that does not exist would be passed on to sambamba.
    if not bed_file or not file_exists(bed_file):
        return data
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if file_exists(out_file):
        data['priority_total_coverage'] = os.path.abspath(out_file)
        return data
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with tx_tmpdir(data, work_dir) as tmp_dir:
        # Write the decommented copy of the BED into the temporary directory.
        cleaned_bed = os.path.join(tmp_dir, os.path.basename(bed_file))
        cleaned_bed = bed.decomment(bed_file, cleaned_bed)
        with file_transaction(out_file) as tx_out_file:
            cmd = (
                "{sambamba} depth region -t {nthreads} -L {cleaned_bed} "
                "-F \"not unmapped\" "
                "-T 10 -T 20 -T 30 -T 40 -T 50 -T 60 -T 70 -T 80 -T 90 -T 100 "
                "{in_bam} -o {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(sambamba=sambamba, nthreads=nthreads,
                              cleaned_bed=cleaned_bed, in_bam=in_bam,
                              tx_out_file=tx_out_file),
                   message.format(bed_file=bed_file, in_bam=in_bam))
    data['priority_total_coverage'] = os.path.abspath(out_file)
    return data
Ejemplo n.º 3
# Fan one sample out into a deep copy per structural variant caller.
#
# For every caller from get_svcallers(data) that is also reported by
# _get_callers([data], stage), the visible code deep-copies `data`, replaces
# its "sv" list with `final_svs`, sets config["algorithm"]["svcaller"] to that
# caller and stores the full caller list under "svcaller_orig".
#
# NOTE(review): this copy of the function is truncated and does not parse:
#   - the docstring is never closed;
#   - the `if stage == "ensemble" and dd.get_svprioritize(data):` special case
#     has no body;
#   - the inner `if (...)` has no body and nothing is ever added to
#     `final_svs` or to `out`.
# The missing statements are not recoverable from this file alone; restore
# them from the upstream source before relying on this function.
def _handle_multiple_svcallers(data, stage):
    """Retrieve configured structural variation caller, handling multiple.
       data is one sample
    svs = get_svcallers(data)
    # special cases -- prioritization
    if stage == "ensemble" and dd.get_svprioritize(data):
    out = []
    for svcaller in svs:
        if svcaller in _get_callers([data], stage):
            base = copy.deepcopy(data)
            # clean SV callers present in multiple rounds and not this caller
            final_svs = []
            for sv in data.get("sv", []):
                if (stage == "ensemble" or sv["variantcaller"] == svcaller
                        or sv["variantcaller"] not in svs
                        or svcaller not in _get_callers(
                            [data], stage, special_cases=True)):
            base["sv"] = final_svs
            base["config"]["algorithm"]["svcaller"] = svcaller
            base["config"]["algorithm"]["svcaller_orig"] = svs
    return out
Ejemplo n.º 4
Ejemplo n.º 5
def priority_total_coverage(data, out_dir):
    """Calculate coverage at 10 depth intervals in the priority regions.

    Runs ``sambamba depth region`` over the cleaned svprioritize BED with
    thresholds 10, 20, ... 100 and writes
    ``<sample>_priority_total_coverage.bed`` inside ``out_dir``. An existing
    output file is reused without re-running.

    Returns the output file path, or an empty dict when no svprioritize file
    is configured, it fails the ``file_exists`` check, or
    ``prioritize.is_gene_list`` flags it.
    """
    from bcbio.structural import prioritize
    bed_file = dd.get_svprioritize(data)
    # Each condition alone must skip the calculation: with `and` between the
    # first two, a configured path that does not exist reached is_gene_list
    # and sambamba.
    if not bed_file or not file_exists(bed_file) or prioritize.is_gene_list(bed_file):
        return {}
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if file_exists(out_file):
        return out_file
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    # The temporary directory itself is not referenced; the context manager is
    # kept for whatever setup/cleanup it performs around the run.
    with tx_tmpdir(data, work_dir):
        cleaned_bed = clean_file(bed_file, data)
        with file_transaction(out_file) as tx_out_file:
            cmd = ("{sambamba} depth region -t {nthreads} -L {cleaned_bed} "
                   "-F \"not unmapped\" "
                   "-T 10 -T 20 -T 30 -T 40 -T 50 -T 60 -T 70 -T 80 -T 90 -T 100 "
                   "{in_bam} -o {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(sambamba=sambamba, nthreads=nthreads,
                              cleaned_bed=cleaned_bed, in_bam=in_bam,
                              tx_out_file=tx_out_file),
                   message.format(bed_file=bed_file, in_bam=in_bam))
    return out_file
Ejemplo n.º 6
def priority_total_coverage(data, out_dir):
    """Calculate coverage at 10 depth intervals in the priority regions.

    Builds a ``sambamba depth region`` command through
    ``sambamba.make_command`` with depth thresholds 10, 20, ... 100 and writes
    ``<sample>_priority_total_coverage.bed`` inside ``out_dir``. The run is
    skipped when ``utils.file_uptodate`` accepts the existing output against
    both the cleaned BED and the input BAM.

    Returns the output file path, or an empty dict when no svprioritize file
    is configured, it fails the ``file_exists`` check, or
    ``prioritize.is_gene_list`` flags it.
    """
    from bcbio.structural import prioritize
    bed_file = dd.get_svprioritize(data)
    # Each condition alone must skip the calculation: with `and` between the
    # first two, a configured path that does not exist reached is_gene_list.
    if not bed_file or not file_exists(bed_file) or prioritize.is_gene_list(bed_file):
        return {}
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    cleaned_bed = clean_file(bed_file, data, prefix="svprioritize-")
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if utils.file_uptodate(out_file, cleaned_bed) and utils.file_uptodate(
            out_file, in_bam):
        return out_file
    cmdl = sambamba.make_command(
        data,
        "depth region",
        in_bam,
        cleaned_bed,
        depth_thresholds=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
    with file_transaction(out_file) as tx_out_file:
        message = "Calculating region coverage of {bed_file} in {in_bam}"
        do.run(cmdl + " -o " + tx_out_file,
               message.format(bed_file=bed_file, in_bam=in_bam))
    logger.debug("Saved svprioritize coverage into " + out_file)
    return out_file
Ejemplo n.º 7
def priority_total_coverage(data, out_dir):
    """Calculate coverage at 10 depth intervals in the priority regions.

    Runs ``sambamba depth region`` over the cleaned svprioritize BED with
    thresholds 10, 20, ... 100 and writes
    ``<sample>_priority_total_coverage.bed`` inside ``out_dir``. An existing
    output file is reused without re-running.

    Returns the output file path, or an empty dict when no svprioritize file
    is configured, it fails the ``file_exists`` check, or
    ``prioritize.is_gene_list`` flags it.
    """
    from bcbio.structural import prioritize
    bed_file = dd.get_svprioritize(data)
    # Each condition alone must skip the calculation: with `and` between the
    # first two, a configured path that does not exist reached is_gene_list
    # and sambamba.
    if (not bed_file or not file_exists(bed_file)
            or prioritize.is_gene_list(bed_file)):
        return {}
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if file_exists(out_file):
        return out_file
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    # The temporary directory itself is not referenced; the context manager is
    # kept for whatever setup/cleanup it performs around the run.
    with tx_tmpdir(data, work_dir):
        cleaned_bed = clean_file(bed_file, data)
        with file_transaction(out_file) as tx_out_file:
            cmd = (
                "{sambamba} depth region -t {nthreads} -L {cleaned_bed} "
                "-F \"not unmapped\" "
                "-T 10 -T 20 -T 30 -T 40 -T 50 -T 60 -T 70 -T 80 -T 90 -T 100 "
                "{in_bam} -o {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(sambamba=sambamba, nthreads=nthreads,
                              cleaned_bed=cleaned_bed, in_bam=in_bam,
                              tx_out_file=tx_out_file),
                   message.format(bed_file=bed_file, in_bam=in_bam))
    return out_file
Ejemplo n.º 8
# Structural variant detection entry point built around _run_smoove.
#
# Visible flow: validate the configured aligner for every item, determine
# tumor/normal pairing with vcfutils.get_paired, collect per-sample deletion
# and duplication evidence files from _bedpes_from_cnv_caller, call
# _run_smoove, filter the result per sample with _filter_by_support (and with
# _filter_by_background for tumor/normal pairs), then annotate with snpEff via
# effects.add_to_vcf when svprioritize is configured.
#
# NOTE(review): this copy of the function is truncated and does not parse:
#   - the docstring is never closed;
#   - the `raise ValueError(` call and the `_filter_by_background(` call are
#     missing their closing arguments/parentheses;
#   - `full_bams`, `sr_bams` and `disc_bams` are never filled in the loop
#     (`sr_bam` / `disc_bam` are computed but unused);
#   - `batches` is assigned twice inside the same `if`, and `effects_vcf` is
#     set to None right after being computed -- presumably `else:` lines were
#     lost in both places;
#   - the dict entries "variantcaller"/"vrn_file"/"exclude_file" have lost the
#     statement that used them, and nothing is ever added to `out`.
# The missing statements are not recoverable from this file alone; restore
# them from the upstream source before relying on this function.
def run(items):
    """Perform detection of structural variations with lumpy.
    if not all(
            utils.get_in(data, ("config", "algorithm", "aligner")) in
        ["bwa", "sentieon-bwa", "minimap2", False, None] for data in items):
        raise ValueError(
            "Require bwa or minimap2 alignment input for lumpy structural variation detection"
    paired = vcfutils.get_paired(items)
    work_dir = _sv_workdir(
        paired.tumor_data if paired and paired.tumor_data else items[0])
    previous_evidence = {}
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        sr_bam, disc_bam = sshared.find_existing_split_discordants(data)
        cur_dels, cur_dups = _bedpes_from_cnv_caller(data, work_dir)
        previous_evidence[dd.get_sample_name(data)] = {}
        if cur_dels and utils.file_exists(cur_dels):
            previous_evidence[dd.get_sample_name(data)]["dels"] = cur_dels
        if cur_dups and utils.file_exists(cur_dups):
            previous_evidence[dd.get_sample_name(data)]["dups"] = cur_dups
    lumpy_vcf, exclude_file = _run_smoove(full_bams, sr_bams, disc_bams,
                                          work_dir, items)
    gt_vcfs = {}
    # Retain paired samples with tumor/normal genotyped in one file
    if paired and paired.normal_name:
        batches = [[paired.tumor_data, paired.normal_data]]
        batches = [[x] for x in items]

    for batch_items in batches:
        for data in batch_items:
            gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(
                lumpy_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background(paired.tumor_name,
                                        [paired.normal_name], gt_vcfs,
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        vcf_file = gt_vcfs.get(dd.get_sample_name(data))
        if vcf_file:
            if dd.get_svprioritize(data):
                effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
                effects_vcf = None
                "variantcaller": "lumpy",
                "vrn_file": effects_vcf or vcf_file,
                "exclude_file": exclude_file
    return out
Ejemplo n.º 9
def get_coords(data):
    """Yield ``(category, coords)`` pairs of gene regions used for prioritization.

    For the "LOH" and "amplification" categories, ``coords`` starts from the
    ``_COORDS`` entry for the sample's genome build (empty if there is none).
    When the svprioritize file name contains "civic", each
    ``(chrom, start, end, gene)`` returned by ``_civic_regions`` for the
    category's variant types and the sample's disease is added as
    ``coords[gene] = (chrom, start, end)``.
    """
    # Loop-invariant: the prioritization file does not depend on the category.
    priority_file = dd.get_svprioritize(data)
    use_civic = bool(priority_file) and "civic" in os.path.basename(priority_file)
    for category, vtypes in [("LOH", {"LOSS", "HETEROZYGOSITY"}),
                             ("amplification", {"AMPLIFICATION"})]:
        # get_in hands back the dict stored inside _COORDS itself; copy it so
        # sample-specific genes never accumulate in the shared module table.
        out = dict(tz.get_in([category, dd.get_genome_build(data)], _COORDS, {}))
        if use_civic:
            for chrom, start, end, gene in _civic_regions(priority_file, vtypes, dd.get_disease(data)):
                out[gene] = (chrom, start, end)
        yield category, out
Ejemplo n.º 10
# Structural variant detection entry point built around _run_lumpy.
#
# Visible flow: validate the configured aligner for every item, determine
# tumor/normal pairing with vcfutils.get_paired_bams, collect per-sample
# deletion and duplication evidence files from _bedpes_from_cnv_caller, call
# _run_lumpy, split the calls per sample with vcfutils.select_sample, genotype
# with _run_svtyper depending on the "bnd-genotype" / "lumpy-genotype" tool
# switches, filter with _filter_by_support (and _filter_by_background for
# tumor/normal pairs), then annotate with snpEff via effects.add_to_vcf when
# svprioritize is configured and record the result under data["sv"].
#
# NOTE(review): this copy of the function is truncated and does not parse:
#   - the docstring is never closed;
#   - the `vcfutils.select_sample(` and
#     `vcfutils.concat_variant_files_bcftools(` calls are missing their
#     closing arguments/parentheses;
#   - `full_bams`, `sr_bams` and `disc_bams` are never filled in the loop
#     (`sr_bam` / `disc_bam` are computed but unused);
#   - the breakend-splitting statements sit directly under the
#     `elif "lumpy-genotype" ...` branch and `effects_vcf` is set to None
#     right after being computed -- presumably `else:` lines were lost in
#     both places;
#   - nothing is ever added to `out`.
# The missing statements are not recoverable from this file alone; restore
# them from the upstream source before relying on this function.
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.
    if not all(utils.get_in(data, ("config", "algorithm", "aligner"))
               in ["bwa", "sentieon-bwa", False, None] for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    work_dir = _sv_workdir(paired.tumor_data if paired and paired.tumor_data else items[0])
    previous_evidence = {}
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
        cur_dels, cur_dups = _bedpes_from_cnv_caller(data, work_dir)
        previous_evidence[dd.get_sample_name(data)] = {}
        if cur_dels and utils.file_exists(cur_dels):
            previous_evidence[dd.get_sample_name(data)]["dels"] = cur_dels
        if cur_dups and utils.file_exists(cur_dups):
            previous_evidence[dd.get_sample_name(data)]["dups"] = cur_dups
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, previous_evidence,
                                         work_dir, items)
    gt_vcfs = {}
    for data in items:
        sample = dd.get_sample_name(data)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, sample,
                                            utils.append_stem(lumpy_vcf, "-%s" % sample),
        if "bnd-genotype" in dd.get_tools_on(data):
            gt_vcf = _run_svtyper(sample_vcf, dd.get_align_bam(data), exclude_file, data)
        elif "lumpy-genotype" in dd.get_tools_off(data):
            gt_vcf = sample_vcf
            std_vcf, bnd_vcf = _split_breakends(sample_vcf, data)
            std_gt_vcf = _run_svtyper(std_vcf, dd.get_align_bam(data), exclude_file, data)
            gt_vcf = vcfutils.concat_variant_files_bcftools(
                orig_files=[std_gt_vcf, bnd_vcf],
                out_file="%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0],
        gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(gt_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background([paired.tumor_name], [paired.normal_name], gt_vcfs, paired.tumor_data)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        vcf_file = gt_vcfs[dd.get_sample_name(data)]
        if dd.get_svprioritize(data):
            effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
            effects_vcf = None
        data["sv"].append({"variantcaller": "lumpy",
                           "vrn_file": effects_vcf or vcf_file,
                           "exclude_file": exclude_file})
    return out
Ejemplo n.º 11
Ejemplo n.º 12
Ejemplo n.º 13
def _add_scatter_plot(out, data):
    """Produce the CNVkit scatter PDF limited to the prioritized regions.

    The PDF is named after ``out["cnr"]`` with a ``-scatter.pdf`` suffix.
    Returns its path (reusing an existing file), or None when the sample has
    no svprioritize input configured.
    """
    scatter_pdf = "%s-scatter.pdf" % os.path.splitext(out["cnr"])[0]
    svprioritize_bed = dd.get_svprioritize(data)
    if not svprioritize_bed:
        return None
    # Regions to plot are prepared before the existence check, next to the PDF.
    plot_regions = plot._prioritize_plot_regions(
        pybedtools.BedTool(svprioritize_bed), data, os.path.dirname(scatter_pdf))
    if not utils.file_exists(scatter_pdf):
        cleaned_cnr = _remove_haplotype_chroms(out["cnr"], data)
        cleaned_cns = _remove_haplotype_chroms(out["cns"], data)
        with file_transaction(data, scatter_pdf) as tx_out_file:
            scatter_cmd = [_get_cmd(), "scatter",
                           "-s", cleaned_cns,
                           "-o", tx_out_file,
                           "-l", plot_regions,
                           cleaned_cnr]
            do.run(scatter_cmd, "CNVkit scatter plot")
    return scatter_pdf
Ejemplo n.º 14
Ejemplo n.º 15
# Per-base coverage of the svprioritize regions, processed in batches.
#
# Visible flow: skip (returning `data`) when no svprioritize BED is set or it
# fails file_exists; reuse an existing <sample>_priority_depth.bed under
# <work_dir>/report/coverage; otherwise walk the BED in chunks sized from
# max_command_length() / AVERAGE_REGION_STRING_LENGTH and, per chunk, prepare
# a `samtools view | bedtools coverage -sorted -d | awk` pipeline that appends
# to the transactional output. The output path is stored in
# data['priority_coverage'].
#
# NOTE(review): this copy of the function is truncated and does not parse:
#   - `coords` is computed but never added to `coord_batch`, so the batch
#     list stays empty as written;
#   - `if not coord_batch:` has no body;
#   - the `pybedtools.BedTool(line_batch,` call is missing its remaining
#     arguments and closing parenthesis;
#   - `cmd` is built but never formatted or executed.
# NOTE(review): `/` produces a float batch size under Python 3 -- confirm
# robust_partition_all accepts that.
# The missing statements are not recoverable from this file alone; restore
# them from the upstream source before relying on this function.
def priority_coverage(data):
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file):
        return data

    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    batch_size = max_command_length() / AVERAGE_REGION_STRING_LENGTH

    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_depth.bed")
    if file_exists(out_file):
        data['priority_coverage'] = os.path.abspath(out_file)
        return data
    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        logger.debug("Calculating priority coverage for %s" % sample)
        region_bed = pybedtools.BedTool(bed_file)
        with file_transaction(out_file) as tx_out_file:
            lcount = 0
            for chunk in robust_partition_all(batch_size, region_bed):
                coord_batch = []
                line_batch = ""
                for line in chunk:
                    lcount += 1
                    chrom = line.chrom
                    start = max(line.start, 0)
                    end = line.end
                    coords = "%s:%s-%s" % (chrom, start, end)
                    line_batch += "%s\t%s\t%s\n" % (chrom, start, end)
                if not coord_batch:
                region_file = pybedtools.BedTool(line_batch,
                coord_string = " ".join(coord_batch)
                awk_string = r"""'BEGIN {OFS="\t"} {print $1,$2+$5,$2+$5,$4,$6"\t%s"}'""" % sample
                samtools = config_utils.get_program("samtools", data["config"])
                bedtools = config_utils.get_program("bedtools", data["config"])
                cmd = (
                    "{samtools} view -b {in_bam} {coord_string} | "
                    "{bedtools} coverage -sorted -d -a {region_file} -b - | "
                    "awk {awk_string} >> {tx_out_file}")
        data['priority_coverage'] = os.path.abspath(out_file)
    return data
Ejemplo n.º 16
Ejemplo n.º 17
def priority_coverage(data, out_dir):
    """Compute base-level sambamba depth across the svprioritize regions.

    Writes ``<sample>_priority_depth.bed`` into ``out_dir`` and returns its
    path; the run is skipped when ``utils.file_uptodate`` accepts the existing
    output against both the cleaned BED and the BAM. ``data`` is returned
    instead when there is no svprioritize file, it fails ``file_exists``, or
    ``prioritize.is_gene_list`` flags it.
    """
    from bcbio.structural import prioritize
    bed_file = dd.get_svprioritize(data)
    if not (bed_file and file_exists(bed_file)) or prioritize.is_gene_list(bed_file):
        return data

    # Local names are significant below: the command and message templates are
    # filled in from locals().
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    out_file = os.path.join(work_dir, sample + "_priority_depth.bed")
    if all(utils.file_uptodate(out_file, dep) for dep in (cleaned_bed, in_bam)):
        return out_file

    # awk emits columns 1, 2, 2, 3 and 10 tab-separated; sed drops the first line.
    parse_cmd = "awk '{print $1\"\t\"$2\"\t\"$2\"\t\"$3\"\t\"$10}' | sed '1d'"
    message = "Calculating base coverage of {bed_file} in {in_bam}"
    with file_transaction(out_file) as tx_out_file:
        cmdl = (sambamba.make_command(data, "depth base", in_bam, cleaned_bed)
                + " | {parse_cmd} > {tx_out_file}")
        do.run(cmdl.format(**locals()), message.format(**locals()))
    return out_file
Ejemplo n.º 18
Ejemplo n.º 19
def get_coords(data):
    """Retrieve coordinates of genes of interest for prioritization.

    Can read from CIViC input data or a supplied BED file of chrom, start, end
    and gene information.

    Yields ``(category, coords)`` for the "LOH" and "amplification"
    categories, where ``coords`` maps gene name to ``(chrom, start, end)``. It
    starts from the ``_COORDS`` entry for the sample's genome build and is
    extended from the svprioritize file: via ``_civic_regions`` when the file
    name contains "civic", otherwise from tab-separated lines with at least
    four fields when the name contains ".bed".
    """
    for category, vtypes in [("LOH", {"LOSS", "HETEROZYGOSITY"}),
                             ("amplification", {"AMPLIFICATION"})]:
        # get_in hands back the dict stored inside _COORDS itself; copy it so
        # sample-specific genes never accumulate in the shared module table.
        out = dict(tz.get_in([category, dd.get_genome_build(data)], _COORDS, {}))
        priority_file = dd.get_svprioritize(data)
        if priority_file:
            priority_name = os.path.basename(priority_file)
            if "civic" in priority_name:
                for chrom, start, end, gene in _civic_regions(priority_file, vtypes, dd.get_disease(data)):
                    out[gene] = (chrom, start, end)
            elif ".bed" in priority_name:
                # Close the handle once read instead of leaving it to the
                # garbage collector.
                # NOTE(review): assumes open_gzipsafe returns a file object
                # usable as a context manager -- confirm.
                with utils.open_gzipsafe(priority_file) as in_handle:
                    for line in in_handle:
                        parts = line.strip().split("\t")
                        if len(parts) >= 4:
                            chrom, start, end, gene = parts[:4]
                            out[gene] = (chrom, int(start), int(end))
        yield category, out
Ejemplo n.º 20
# Fan one sample out into a deep copy per structural variant caller.
#
# For every caller from get_svcallers(data) that is also reported by
# _get_callers([data], stage), the visible code deep-copies `data`, replaces
# its "sv" list with `final_svs`, sets config["algorithm"]["svcaller"] to that
# caller and stores the full caller list under "svcaller_orig".
#
# NOTE(review): this copy of the function is truncated and does not parse:
#   - the docstring is never closed;
#   - the `if stage == "ensemble" and dd.get_svprioritize(data):` special case
#     has no body;
#   - the inner `if ...:` has no body and nothing is ever added to
#     `final_svs` or to `out`.
# The missing statements are not recoverable from this file alone; restore
# them from the upstream source before relying on this function.
def _handle_multiple_svcallers(data, stage):
    """Retrieve configured structural variation caller, handling multiple.
    svs = get_svcallers(data)
    # special cases -- prioritization
    if stage == "ensemble" and dd.get_svprioritize(data):
    out = []
    for svcaller in svs:
        if svcaller in _get_callers([data], stage):
            base = copy.deepcopy(data)
            # clean SV callers present in multiple rounds and not this caller
            final_svs = []
            for sv in data.get("sv", []):
                if stage == "ensemble" or sv["variantcaller"] == svcaller or sv["variantcaller"] not in svs:
            base["sv"] = final_svs
            base["config"]["algorithm"]["svcaller"] = svcaller
            base["config"]["algorithm"]["svcaller_orig"] = svs
    return out
Ejemplo n.º 21
def priority_total_coverage(data, out_dir):
    """Calculate coverage at 10 depth intervals in the priority regions.

    Builds a ``sambamba depth region`` command through
    ``sambamba.make_command`` with depth thresholds 10, 20, ... 100 and writes
    ``<sample>_priority_total_coverage.bed`` inside ``out_dir``. The run is
    skipped when ``utils.file_uptodate`` accepts the existing output against
    both the cleaned BED and the input BAM.

    Returns the output file path, or an empty dict when no svprioritize file
    is configured, it fails the ``file_exists`` check, or
    ``prioritize.is_gene_list`` flags it.
    """
    from bcbio.structural import prioritize
    bed_file = dd.get_svprioritize(data)
    # Each condition alone must skip the calculation: with `and` between the
    # first two, a configured path that does not exist reached is_gene_list.
    if not bed_file or not file_exists(bed_file) or prioritize.is_gene_list(bed_file):
        return {}
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    cleaned_bed = clean_file(bed_file, data, prefix="svprioritize-")
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if utils.file_uptodate(out_file, cleaned_bed) and utils.file_uptodate(out_file, in_bam):
        return out_file
    cmdl = sambamba.make_command(data, "depth region", in_bam, cleaned_bed,
                                 depth_thresholds=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
    with file_transaction(out_file) as tx_out_file:
        message = "Calculating region coverage of {bed_file} in {in_bam}"
        do.run(cmdl + " -o " + tx_out_file,
               message.format(bed_file=bed_file, in_bam=in_bam))
    logger.debug("Saved svprioritize coverage into " + out_file)
    return out_file
Ejemplo n.º 22
def get_coords(data):
    """Retrieve coordinates of genes of interest for prioritization.

    Can read from CIViC input data or a supplied BED file of chrom, start, end
    and gene information.

    Yields ``(category, coords)`` for the "LOH" and "amplification"
    categories, where ``coords`` maps gene name to ``(chrom, start, end)``. It
    starts from the ``_COORDS`` entry for the sample's genome build and is
    extended from the svprioritize file: via ``_civic_regions`` when the file
    name contains "civic", otherwise from tab-separated lines with at least
    four fields when the name contains ".bed".
    """
    for category, vtypes in [("LOH", {"LOSS", "HETEROZYGOSITY"}),
                             ("amplification", {"AMPLIFICATION"})]:
        # get_in hands back the dict stored inside _COORDS itself; copy it so
        # sample-specific genes never accumulate in the shared module table.
        out = dict(
            tz.get_in([category, dd.get_genome_build(data)], _COORDS, {}))
        priority_file = dd.get_svprioritize(data)
        if priority_file:
            priority_name = os.path.basename(priority_file)
            if "civic" in priority_name:
                for chrom, start, end, gene in _civic_regions(
                        priority_file, vtypes, dd.get_disease(data)):
                    out[gene] = (chrom, start, end)
            elif ".bed" in priority_name:
                # Close the handle once read instead of leaving it to the
                # garbage collector.
                # NOTE(review): assumes open_gzipsafe returns a file object
                # usable as a context manager -- confirm.
                with utils.open_gzipsafe(priority_file) as in_handle:
                    for line in in_handle:
                        parts = line.strip().split("\t")
                        if len(parts) >= 4:
                            chrom, start, end, gene = parts[:4]
                            out[gene] = (chrom, int(start), int(end))
        yield category, out
Ejemplo n.º 23
def priority_coverage(data, out_dir):
    """Write per-base sambamba depth for the svprioritize regions of a sample.

    Returns the path of ``<sample>_priority_depth.bed`` under ``out_dir``; an
    output that already passes ``file_exists`` is returned without re-running.
    When no svprioritize BED is configured or it fails ``file_exists``,
    ``data`` is handed back as is.
    """
    bed_file = dd.get_svprioritize(data)
    if not (bed_file and file_exists(bed_file)):
        return data

    work_dir = safe_makedir(out_dir)
    depth_bed = os.path.join(work_dir,
                             dd.get_sample_name(data) + "_priority_depth.bed")
    if not file_exists(depth_bed):
        cores = dd.get_num_cores(data)
        bam_path = dd.get_align_bam(data) or dd.get_work_bam(data)
        sambamba_exe = config_utils.get_program("sambamba", data, default="sambamba")
        # awk emits columns 1, 2, 2, 3 and 10 tab-separated; sed drops the first line.
        awk_sed = "awk '{print $1\"\t\"$2\"\t\"$2\"\t\"$3\"\t\"$10}' | sed '1d'"
        cmd_template = ("{sambamba} depth base -t {nthreads} -L {cleaned_bed} "
                        "-F \"not unmapped\" "
                        "{in_bam} | {parse_cmd} > {tx_out_file}")
        msg_template = "Calculating coverage of {bed_file} regions in {in_bam}"
        with tx_tmpdir(data, work_dir):
            regions_bed = clean_file(bed_file, data)
            with file_transaction(depth_bed) as tx_path:
                shell_cmd = cmd_template.format(
                    sambamba=sambamba_exe, nthreads=cores,
                    cleaned_bed=regions_bed, in_bam=bam_path,
                    parse_cmd=awk_sed, tx_out_file=tx_path)
                do.run(shell_cmd,
                       msg_template.format(bed_file=bed_file, in_bam=bam_path))
    return depth_bed