Example 1
def _check_for_problem_somatic_batches(items, config):
    """Identify problem batch setups for somatic calling.

    We do not support multiple tumors in a single batch and VarDict(Java) does not
    handle pooled calling, only tumor/normal.
    """
    to_check = []
    for data in items:
        data = copy.deepcopy(data)
        data["config"] = config_utils.update_w_custom(config, data)
        to_check.append(data)
    data_by_batches = collections.defaultdict(list)
    for data in to_check:
        batches = dd.get_batches(data)
        if batches:
            for batch in batches:
                data_by_batches[batch].append(data)
    for batch, items in data_by_batches.items():
        if vcfutils.get_paired(items):
            vcfutils.check_paired_problems(items)
        elif len(items) > 1:
            vcs = list(set(tz.concat([dd.get_variantcaller(data) or [] for data in items])))
            if any(x.lower().startswith("vardict") for x in vcs):
                raise ValueError("VarDict does not support pooled non-tumor/normal calling, in batch %s: %s"
                                 % (batch, [dd.get_sample_name(data) for data in items]))
            elif any(x.lower() == "mutect" for x in vcs):
                raise ValueError("Mutect requires a 'phenotype: tumor' sample for calling, in batch %s: %s"
                                 % (batch, [dd.get_sample_name(data) for data in items]))
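Note: every example in this collection keys off vcfutils.get_paired(items) to decide whether a batch is tumor/normal. As a rough mental model only -- a simplified sketch, not the bcbio implementation, and the reduced field names here are assumptions -- the pairing helper behaves roughly like this:

import collections

# Hypothetical, reduced PairedData; the real bcbio namedtuple carries more fields
# (aligned BAMs, configs, panel of normals).
PairedData = collections.namedtuple(
    "PairedData", ["tumor_data", "tumor_name", "normal_data", "normal_name"])

def get_paired_sketch(items):
    """Return a tumor/normal pairing for a batch, or None when no tumor sample is present."""
    tumors = [d for d in items if d.get("metadata", {}).get("phenotype") == "tumor"]
    normals = [d for d in items if d.get("metadata", {}).get("phenotype") == "normal"]
    if not tumors:
        return None
    tumor = tumors[0]
    normal = normals[0] if normals else None
    return PairedData(tumor, tumor.get("description"),
                      normal, normal.get("description") if normal else None)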
Example 2
def _check_for_problem_somatic_batches(items, config):
    """Identify problem batch setups for somatic calling.

    We do not support multiple tumors in a single batch and VarDict(Java) does not
    handle pooled calling, only tumor/normal.
    """
    to_check = []
    for data in items:
        data = copy.deepcopy(data)
        data["config"] = config_utils.update_w_custom(config, data)
        to_check.append(data)
    data_by_batches = collections.defaultdict(list)
    for data in to_check:
        batches = dd.get_batches(data)
        if batches:
            for batch in batches:
                data_by_batches[batch].append(data)
    for batch, items in data_by_batches.items():
        if vcfutils.get_paired(items):
            vcfutils.check_paired_problems(items)
        elif len(items) > 1:
            vcs = list(
                set(
                    tz.concat(
                        [dd.get_variantcaller(data) or [] for data in items])))
            if any(x.lower().startswith("vardict") for x in vcs):
                raise ValueError(
                    "VarDict does not support pooled non-tumor/normal calling, in batch %s: %s"
                    % (batch, [dd.get_sample_name(data) for data in items]))
Example 3
def run(items):
    """Perform detection of structural variations with Manta.
    """
    paired = vcfutils.get_paired(items)
    data = paired.tumor_data if paired else items[0]
    work_dir = _sv_workdir(data)
    variant_file = _get_out_file(work_dir, paired)
    if not utils.file_exists(variant_file):
        with file_transaction(data, work_dir) as tx_work_dir:
            utils.safe_makedir(tx_work_dir)
            tx_workflow_file = _prep_config(items, paired, tx_work_dir)
            _run_workflow(items, paired, tx_workflow_file, tx_work_dir)
    assert utils.file_exists(
        variant_file), "Manta finished without output file %s" % variant_file
    out = []
    for data in items:
        sample_file = _select_sample(data, variant_file, work_dir)
        if "sv" not in data:
            data["sv"] = []
        effects_vcf, _ = effects.add_to_vcf(sample_file, data, "snpeff")
        data["sv"].append({
            "variantcaller": "manta",
            "vrn_file": effects_vcf or sample_file
        })
        out.append(data)
    return out
Example 4
def get_qc_tools(data):
    """Retrieve a list of QC tools to use based on configuration and analysis type.

    Uses the configured QC list if already set; otherwise builds defaults from the analysis type.
    """
    if dd.get_algorithm_qc(data):
        return dd.get_algorithm_qc(data)
    analysis = data["analysis"].lower()
    to_run = []
    if "fastqc" not in dd.get_tools_off(data):
        to_run.append("fastqc")
    if any([tool in dd.get_tools_on(data)
            for tool in ["qualimap", "qualimap_full"]]):
        to_run.append("qualimap")
    if analysis.startswith("rna-seq"):
        if "qualimap" not in dd.get_tools_off(data):
            if gtf.is_qualimap_compatible(dd.get_gtf_file(data)):
                to_run.append("qualimap_rnaseq")
            else:
                logger.debug("GTF not compatible with Qualimap, skipping.")
    if analysis.startswith("smallrna-seq"):
        to_run.append("small-rna")
    if not analysis.startswith("smallrna-seq"):
        to_run.append("samtools")
        if tz.get_in(["config", "algorithm", "kraken"], data):
            to_run.append("kraken")
    if analysis.startswith(("standard", "variant", "variant2")):
        to_run += ["qsignature", "coverage", "variants", "picard"]
        if vcfutils.get_paired([data]):
            to_run += ["viral"]
        if damage.should_filter([data]):
            to_run += ["damage"]
    if dd.get_umi_consensus(data):
        to_run += ["umi"]
    return to_run
Example 5
def run(items):
    """Perform detection of structural variations with Manta.
    """
    paired = vcfutils.get_paired(items)
    data = paired.tumor_data if paired else items[0]
    work_dir = _sv_workdir(data)
    variant_file = _get_out_file(work_dir, paired)
    if not utils.file_exists(variant_file):
        with file_transaction(data, work_dir) as tx_work_dir:
            utils.safe_makedir(tx_work_dir)
            tx_workflow_file = _prep_config(items, paired, tx_work_dir)
            _run_workflow(items, paired, tx_workflow_file, tx_work_dir)
    assert utils.file_exists(variant_file), "Manta finished without output file %s" % variant_file
    variant_file = shared.annotate_with_depth(variant_file, items)
    out = []
    upload_counts = collections.defaultdict(int)
    for data in items:
        if "break-point-inspector" in dd.get_tools_on(data):
            if paired and paired.normal_bam and paired.tumor_name == dd.get_sample_name(data):
                variant_file = _run_break_point_inspector(data, variant_file, paired, work_dir)
        if "sv" not in data:
            data["sv"] = []
        final_vcf = shared.finalize_sv(variant_file, data, items)
        vc = {"variantcaller": "manta",
              "do_upload": upload_counts[final_vcf] == 0,  # only upload a single file per batch
              "vrn_file": final_vcf}
        evidence_bam = _get_evidence_bam(work_dir, data)
        if evidence_bam:
            vc["read_evidence"] = evidence_bam
        data["sv"].append(vc)
        upload_counts[final_vcf] += 1
        out.append(data)
    return out
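The do_upload bookkeeping above (and in the later lumpy/Manta examples) is a simple counter pattern: the first sample pointing at a shared per-batch VCF is flagged for upload, subsequent samples are not. A minimal, standalone illustration (not bcbio code; names are placeholders):

import collections

def mark_first_upload(sample_names, shared_vcf):
    """Flag only the first sample referencing a shared file for upload."""
    upload_counts = collections.defaultdict(int)
    out = []
    for name in sample_names:
        out.append({"sample": name,
                    "vrn_file": shared_vcf,
                    "do_upload": upload_counts[shared_vcf] == 0})
        upload_counts[shared_vcf] += 1
    return out

mark_first_upload(["tumor", "normal"], "batch1-manta.vcf.gz")
# -> do_upload is True for "tumor" only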
Example 6
def split_somatic(items):
    """Split somatic batches, adding a germline target.

    Enables separate germline calling of samples using shared alignments.
    """
    items = [_clean_flat_variantcaller(x) for x in items]
    somatic_groups, somatic, non_somatic = vcfutils.somatic_batches(items)
    # extract germline samples to run from normals in tumor/normal pairs
    germline_added = set([])
    germline = []
    for somatic_group in somatic_groups:
        paired = vcfutils.get_paired(somatic_group)
        if paired and paired.normal_data:
            cur = utils.deepish_copy(paired.normal_data)
            vc = dd.get_variantcaller(cur)
            if isinstance(vc, dict) and "germline" in vc:
                if cur["description"] not in germline_added:
                    germline_added.add(cur["description"])
                    cur["rgnames"]["sample"] = cur["description"]
                    cur["metadata"]["batch"] = "%s-germline" % cur["description"]
                    cur["metadata"]["phenotype"] = "germline"
                    cur = remove_align_qc_tools(cur)
                    cur["config"]["algorithm"]["variantcaller"] = vc["germline"]
                    germline.append(cur)
    # Fix variantcalling specification for only somatic targets
    somatic_out = []
    for data in somatic:
        vc = dd.get_variantcaller(data)
        if isinstance(vc, dict) and "somatic" in vc:
            data["config"]["algorithm"]["variantcaller"] = vc["somatic"]
        somatic_out.append(data)
    return non_somatic + somatic_out + germline
Example 7
def run(items):
    paired = vcfutils.get_paired(items)
    if not paired:
        logger.info("Skipping PureCN; no somatic tumor calls in batch: %s" %
                    " ".join([dd.get_sample_name(d) for d in items]))
        return items
    work_dir = _sv_workdir(paired.tumor_data)
    purecn_out = _run_purecn(paired, work_dir)
    # XXX Currently finding edge case failures with Dx calling, needs additional testing
    # purecn_out = _run_purecn_dx(purecn_out, paired)
    out = []
    if paired.normal_data:
        out.append(paired.normal_data)
    if purecn_out:
        purecn_out["variantcaller"] = "purecn"
        if "loh" in purecn_out:
            from bcbio.structural import titancna
            purecn_out["vrn_file"] = titancna.to_vcf(purecn_out["loh"],
                                                     "PureCN",
                                                     _get_header,
                                                     _loh_to_vcf,
                                                     paired.tumor_data,
                                                     sep=",")
            purecn_out["lohsummary"] = loh.summary_status(
                purecn_out, paired.tumor_data)
        if "sv" not in paired.tumor_data:
            paired.tumor_data["sv"] = []
        paired.tumor_data["sv"].append(purecn_out)
    out.append(paired.tumor_data)
    return out
Example 8
def run(items):
    """Perform detection of structural variations with Manta.
    """
    paired = vcfutils.get_paired(items)
    data = paired.tumor_data if paired else items[0]
    work_dir = _sv_workdir(data)
    variant_file = _get_out_file(work_dir, paired)
    if not utils.file_exists(variant_file):
        with file_transaction(data, work_dir) as tx_work_dir:
            utils.safe_makedir(tx_work_dir)
            tx_workflow_file = _prep_config(items, paired, tx_work_dir)
            _run_workflow(items, paired, tx_workflow_file, tx_work_dir)
    assert utils.file_exists(
        variant_file), "Manta finished without output file %s" % variant_file
    out = []
    for data in items:
        if paired and paired.normal_bam and "break-point-inspector" in dd.get_tools_on(
                data):
            variant_file = _run_break_point_inspector(data, variant_file,
                                                      paired)
        if "sv" not in data:
            data["sv"] = []
        final_vcf = shared.finalize_sv(variant_file, data, items)
        data["sv"].append({"variantcaller": "manta", "vrn_file": final_vcf})
        out.append(data)
    return out
Example 9
def population_variant_regions(items, merged=False):
    """Retrieve the variant region BED file from a population of items.

    If tumor/normal, return the tumor BED file. If a population, return
    the BED file covering the most bases.
    """
    def _get_variant_regions(data):
        out = dd.get_variant_regions(data) or dd.get_sample_callable(data)
        # Only need to merge for variant region inputs, not callable BED regions which don't overlap
        if merged and dd.get_variant_regions(data):
            merged_out = dd.get_variant_regions_merged(data)
            if merged_out:
                out = merged_out
            else:
                out = merge_overlaps(out, data)
        return out
    import pybedtools
    if len(items) == 1:
        return _get_variant_regions(items[0])
    else:
        paired = vcfutils.get_paired(items)
        if paired:
            return _get_variant_regions(paired.tumor_data)
        else:
            vrs = []
            for data in items:
                vr_bed = _get_variant_regions(data)
                if vr_bed:
                    vrs.append((pybedtools.BedTool(vr_bed).total_coverage(), vr_bed))
            vrs.sort(reverse=True)
            if vrs:
                return vrs[0][1]
Example 10
def annotate_with_depth(in_file, items):
    """Annotate called VCF file with depth using duphold (https://github.com/brentp/duphold)

    Currently annotates single sample and tumor samples in somatic analysis.
    """
    bam_file = None
    if len(items) == 1:
        bam_file = dd.get_align_bam(items[0])
    else:
        paired = vcfutils.get_paired(items)
        if paired:
            bam_file = paired.tumor_bam
    if bam_file:
        out_file = "%s-duphold.vcf.gz" % utils.splitext_plus(in_file)[0]
        if not utils.file_exists(out_file):
            with file_transaction(items[0], out_file) as tx_out_file:
                if not in_file.endswith(".gz"):
                    in_file = vcfutils.bgzip_and_index(
                        in_file,
                        remove_orig=False,
                        out_dir=os.path.dirname(tx_out_file))
                ref_file = dd.get_ref_file(items[0])
                # cores for BAM reader thread, so max out at 4 based on recommendations
                cores = min([dd.get_num_cores(items[0]), 4])
                cmd = (
                    "duphold --threads {cores} --vcf {in_file} --bam {bam_file} --fasta {ref_file} "
                    "-o {tx_out_file}")
                do.run(cmd.format(**locals()),
                       "Annotate SV depth with duphold")
        vcfutils.bgzip_and_index(out_file)
        return out_file
    else:
        return in_file
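For reference, the duphold invocation assembled above expands to a single command line of the following shape; the values below are placeholders standing in for the real pipeline paths:

# Placeholder inputs for illustration only
cores, in_file, bam_file = 4, "sv-calls.vcf.gz", "tumor-ready.bam"
ref_file, tx_out_file = "GRCh38.fa", "sv-calls-duphold.vcf.gz"
cmd = ("duphold --threads {cores} --vcf {in_file} --bam {bam_file} --fasta {ref_file} "
       "-o {tx_out_file}")
print(cmd.format(**locals()))
# duphold --threads 4 --vcf sv-calls.vcf.gz --bam tumor-ready.bam --fasta GRCh38.fa -o sv-calls-duphold.vcf.gz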
Example 11
def run(items):
    from bcbio import heterogeneity
    paired = vcfutils.get_paired(items)
    if not paired:
        logger.info("Skipping TitanCNA; no somatic tumor calls in batch: %s" %
                    " ".join([dd.get_sample_name(d) for d in items]))
        return items
    work_dir = _sv_workdir(paired.tumor_data)
    cn_file = _titan_cn_file(dd.get_normalized_depth(paired.tumor_data),
                             work_dir, paired.tumor_data)
    het_file = _titan_het_file(heterogeneity.get_variants(paired.tumor_data),
                               work_dir, paired)
    if _should_run(het_file):
        ploidy_outdirs = []
        for ploidy in [2, 3, 4]:
            for num_clusters in [1, 2, 3]:
                out_dir = _run_titancna(cn_file, het_file, ploidy,
                                        num_clusters, work_dir,
                                        paired.tumor_data)
            ploidy_outdirs.append((ploidy, out_dir))
        solution_file = _run_select_solution(ploidy_outdirs, work_dir,
                                             paired.tumor_data)
    else:
        logger.info("Skipping TitanCNA; not enough input data: %s" %
                    " ".join([dd.get_sample_name(d) for d in items]))
        return items
    out = []
    if paired.normal_data:
        out.append(paired.normal_data)
    if "sv" not in paired.tumor_data:
        paired.tumor_data["sv"] = []
    paired.tumor_data["sv"].append(
        _finalize_sv(solution_file, paired.tumor_data))
    out.append(paired.tumor_data)
    return out
Example 12
def _check_for_problem_somatic_batches(items, config):
    """Identify problem batch setups for somatic calling.

    We do not support multiple tumors in a single batch and VarDict(Java) does not
    handle pooled calling, only tumor/normal.
    """
    to_check = []
    for data in items:
        data = copy.deepcopy(data)
        data["config"] = config_utils.update_w_custom(config, data)
        to_check.append(data)
    data_by_batches = collections.defaultdict(list)
    for data in to_check:
        batches = dd.get_batches(data)
        if batches:
            for batch in batches:
                data_by_batches[batch].append(data)
    for batch, items in data_by_batches.items():
        if vcfutils.get_paired(items):
            vcfutils.check_paired_problems(items)
        elif len(items) > 1:
            vcs = vcfutils.get_somatic_variantcallers(items)
            if "vardict" in vcs:
                raise ValueError(
                    "VarDict does not support pooled non-tumor/normal calling, in batch %s: %s"
                    % (batch, [dd.get_sample_name(data) for data in items]))
            elif "mutect" in vcs or "mutect2" in vcs:
                raise ValueError(
                    "MuTect and MuTect2 require a 'phenotype: tumor' sample for calling, "
                    "in batch %s: %s" %
                    (batch, [dd.get_sample_name(data) for data in items]))
Example 13
def run(items):
    paired = vcfutils.get_paired(items)
    if not paired:
        logger.info("Skipping PureCN; no somatic tumor calls in batch: %s" %
                    " ".join([dd.get_sample_name(d) for d in items]))
        return items
    work_dir = _sv_workdir(paired.tumor_data)
    purecn_out = _run_purecn(paired, work_dir)
    # XXX Currently finding edge case failures with Dx calling, needs additional testing
    # purecn_out = _run_purecn_dx(purecn_out, paired)
    out = []
    if paired.normal_data:
        out.append(paired.normal_data)
    if purecn_out:
        purecn_out["variantcaller"] = "purecn"
        if "loh" in purecn_out:
            from bcbio.structural import titancna
            purecn_out["vrn_file"] = titancna.to_vcf(purecn_out["loh"], "PureCN", _get_header, _loh_to_vcf,
                                                     paired.tumor_data, sep=",")
            purecn_out["lohsummary"] = loh.summary_status(purecn_out, paired.tumor_data)
        if "sv" not in paired.tumor_data:
            paired.tumor_data["sv"] = []
        paired.tumor_data["sv"].append(purecn_out)
    out.append(paired.tumor_data)
    return out
Example 14
def _cnvkit_segment(cnr_file, cov_interval, data, items, out_file=None):
    """Perform segmentation and copy number calling on normalized inputs
    """
    if not out_file:
        out_file = "%s.cns" % os.path.splitext(cnr_file)[0]
    if not utils.file_uptodate(out_file, cnr_file):
        with file_transaction(data, out_file) as tx_out_file:
            if not _cna_has_values(cnr_file):
                with open(tx_out_file, "w") as out_handle:
                    out_handle.write("chromosome\tstart\tend\tgene\tlog2\tprobes\tCN1\tCN2\tbaf\tweight\n")
            else:
                cmd = [_get_cmd(), "segment", "-p", str(dd.get_cores(data)),
                       "-o", tx_out_file, cnr_file]
                small_vrn_files = _compatible_small_variants(data, items)
                if len(small_vrn_files) > 0 and _cna_has_values(cnr_file) and cov_interval != "genome":
                    cmd += ["--vcf", small_vrn_files[0].name, "--sample-id", small_vrn_files[0].sample]
                    if small_vrn_files[0].normal:
                        cmd += ["--normal-id", small_vrn_files[0].normal]
                if cov_interval == "genome":
                    cmd += ["--threshold", "0.00001"]
                # For tumors, remove very low normalized regions, avoiding upcaptured noise
                # https://github.com/chapmanb/bcbio-nextgen/issues/2171#issuecomment-348333650
                paired = vcfutils.get_paired(items)
                if paired:
                    cmd += ["--drop-low-coverage"]
                # preferentially use conda installed Rscript
                export_cmd = ("%s && export TMPDIR=%s && "
                              % (utils.get_R_exports(), os.path.dirname(tx_out_file)))
                do.run(export_cmd + " ".join(cmd), "CNVkit segment")
    return out_file
Example 15
def population_variant_regions(items, merged=False):
    """Retrieve the variant region BED file from a population of items.

    If tumor/normal, return the tumor BED file. If a population, return
    the BED file covering the most bases.
    """
    def _get_variant_regions(data):
        out = dd.get_variant_regions(data) or dd.get_sample_callable(data)
        if merged:
            merged_out = dd.get_variant_regions_merged(data)
            if merged_out:
                out = merged_out
            else:
                out = merge_overlaps(out, data)
        return out

    import pybedtools
    if len(items) == 1:
        return _get_variant_regions(items[0])
    else:
        paired = vcfutils.get_paired(items)
        if paired:
            return _get_variant_regions(paired.tumor_data)
        else:
            vrs = []
            for data in items:
                vr_bed = _get_variant_regions(data)
                if vr_bed:
                    vrs.append(
                        (pybedtools.BedTool(vr_bed).total_coverage(), vr_bed))
            vrs.sort(reverse=True)
            if vrs:
                return vrs[0][1]
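The fallback branch in both versions of population_variant_regions picks the BED file covering the most bases via pybedtools. A hedged, standalone sketch of just that selection step (assumes pybedtools is installed; file names are placeholders):

import pybedtools

def widest_bed(bed_files):
    """Return the BED file with the largest total covered length, or None."""
    scored = [(pybedtools.BedTool(f).total_coverage(), f) for f in bed_files]
    scored.sort(reverse=True)
    return scored[0][1] if scored else None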
Example 16
def split_somatic(items):
    """Split somatic batches, adding a germline target.

    Enables separate germline calling of samples using shared alignments.
    """
    somatic_groups, somatic, non_somatic = vcfutils.somatic_batches(items)
    # extract germline samples to run from normals in tumor/normal pairs
    germline_added = set([])
    germline = []
    for somatic_group in somatic_groups:
        paired = vcfutils.get_paired(somatic_group)
        if paired and paired.normal_data:
            cur = utils.deepish_copy(paired.normal_data)
            vc = dd.get_variantcaller(cur)
            if isinstance(vc, dict) and "germline" in vc:
                cur["description"] = "%s-germline" % cur["description"]
                if cur["description"] not in germline_added:
                    germline_added.add(cur["description"])
                    cur["rgnames"]["sample"] = cur["description"]
                    del cur["metadata"]["batch"]
                    cur["metadata"]["phenotype"] = "germline"
                    cur = remove_align_qc_tools(cur)
                    cur["config"]["algorithm"]["variantcaller"] = vc[
                        "germline"]
                    germline.append(cur)
    # Fix variantcalling specification for only somatic targets
    somatic_out = []
    for data in somatic:
        vc = dd.get_variantcaller(data)
        if isinstance(vc, dict) and "somatic" in vc:
            data["config"]["algorithm"]["variantcaller"] = vc["somatic"]
        somatic_out.append(data)
    return non_somatic + somatic_out + germline
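Both split_somatic versions rely on a dictionary-style variantcaller setting that names separate somatic and germline callers. The configuration shape assumed here is illustrative only (the caller names are examples, not a recommendation):

# Illustrative algorithm configuration with split somatic/germline callers
algorithm = {"variantcaller": {"somatic": ["vardict", "mutect2"],
                               "germline": ["gatk-haplotype"]}}

vc = algorithm["variantcaller"]
if isinstance(vc, dict) and "germline" in vc:
    germline_callers = vc["germline"]    # used for the added germline batch
    somatic_callers = vc.get("somatic")  # retained for the tumor/normal batch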
Example 17
def run(items):
    paired = vcfutils.get_paired(items)
    # paired is PairedInfo of one T/N pair (or just T) - named tuple, paired.tumor_config
    if not paired:
        logger.info("Skipping PureCN; no somatic tumor calls in batch: %s" %
                    " ".join([dd.get_sample_name(d) for d in items]))
        return items
    work_dir = _sv_workdir(paired.tumor_data)
    normaldb = tz.get_in(["algorithm", "background", "cnv_reference", "purecn_normaldb"], paired.tumor_config)
    # the right way of running purecn is with normaldb
    if normaldb:
        purecn_out = _run_purecn_normaldb(paired, work_dir)
        purecn_out = _run_purecn_dx(purecn_out, paired)
    else:
        purecn_out = _run_purecn(paired, work_dir)
    out = []
    if paired.normal_data:
        out.append(paired.normal_data)
    if purecn_out:
        purecn_out["variantcaller"] = "purecn"
        if "loh" in purecn_out:
            from bcbio.structural import titancna
            purecn_out["vrn_file"] = titancna.to_vcf(purecn_out["loh"], "PureCN", _get_header, _loh_to_vcf,
                                                     paired.tumor_data, sep=",")
            purecn_out["lohsummary"] = loh.summary_status(purecn_out, paired.tumor_data)
        if "sv" not in paired.tumor_data:
            paired.tumor_data["sv"] = []
        paired.tumor_data["sv"].append(purecn_out)
    out.append(paired.tumor_data)
    return out
Example 18
def annotate_with_depth(in_file, items):
    """Annotate called VCF file with depth using duphold (https://github.com/brentp/duphold)

    Currently annotates single sample and tumor samples in somatic analysis.
    """
    bam_file = None
    if len(items) == 1:
        bam_file = dd.get_align_bam(items[0])
    else:
        paired = vcfutils.get_paired(items)
        if paired:
            bam_file = paired.tumor_bam
    if bam_file:
        out_file = "%s-duphold.vcf.gz" % utils.splitext_plus(in_file)[0]
        if not utils.file_exists(out_file):
            with file_transaction(items[0], out_file) as tx_out_file:
                if not in_file.endswith(".gz"):
                    in_file = vcfutils.bgzip_and_index(in_file, remove_orig=False,
                                                       out_dir=os.path.dirname(tx_out_file))
                ref_file = dd.get_ref_file(items[0])
                # cores for BAM reader thread, so max out at 4 based on recommendations
                cores = min([dd.get_num_cores(items[0]), 4])
                cmd = ("duphold --threads {cores} --vcf {in_file} --bam {bam_file} --fasta {ref_file} "
                       "-o {tx_out_file}")
                do.run(cmd.format(**locals()), "Annotate SV depth with duphold")
        vcfutils.bgzip_and_index(out_file)
        return out_file
    else:
        return in_file
Example 19
def run(items):
    """Perform detection of structural variations with lumpy.
    """
    if not all(utils.get_in(data, ("config", "algorithm", "aligner"))
               in ["bwa", "sentieon-bwa", "minimap2", False, None] for data in items):
        raise ValueError("Require bwa or minimap2 alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired(items)
    work_dir = _sv_workdir(
        paired.tumor_data if paired and paired.tumor_data else items[0])
    previous_evidence = {}
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        full_bams.append(dd.get_align_bam(data))
        sr_bam, disc_bam = sshared.find_existing_split_discordants(data)
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
        cur_dels, cur_dups = _bedpes_from_cnv_caller(data, work_dir)
        previous_evidence[dd.get_sample_name(data)] = {}
        if cur_dels and utils.file_exists(cur_dels):
            previous_evidence[dd.get_sample_name(data)]["dels"] = cur_dels
        if cur_dups and utils.file_exists(cur_dups):
            previous_evidence[dd.get_sample_name(data)]["dups"] = cur_dups
    lumpy_vcf, exclude_file = _run_smoove(full_bams, sr_bams, disc_bams,
                                          work_dir, items)
    gt_vcfs = {}
    # Retain paired samples with tumor/normal genotyped in one file
    if paired and paired.normal_name:
        batches = [[paired.tumor_data, paired.normal_data]]
    else:
        batches = [[x] for x in items]

    for batch_items in batches:
        for data in batch_items:
            gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(
                lumpy_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background(paired.tumor_name,
                                        [paired.normal_name], gt_vcfs,
                                        paired.tumor_data)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        vcf_file = gt_vcfs.get(dd.get_sample_name(data))
        if vcf_file:
            if dd.get_svprioritize(data):
                effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
            else:
                effects_vcf = None
            data["sv"].append({
                "variantcaller": "lumpy",
                "vrn_file": effects_vcf or vcf_file,
                "exclude_file": exclude_file
            })
        out.append(data)
    return out
Example 20
def _cnvkit_segment(cnr_file,
                    cov_interval,
                    data,
                    items,
                    out_file=None,
                    detailed=False):
    """Perform segmentation and copy number calling on normalized inputs
    """
    if not out_file:
        out_file = "%s.cns" % os.path.splitext(cnr_file)[0]
    if not utils.file_uptodate(out_file, cnr_file):
        with file_transaction(data, out_file) as tx_out_file:
            if not _cna_has_values(cnr_file):
                with open(tx_out_file, "w") as out_handle:
                    out_handle.write(
                        "chromosome\tstart\tend\tgene\tlog2\tprobes\tCN1\tCN2\tbaf\tweight\n"
                    )
            else:
                # Scale cores to avoid memory issues with segmentation
                # https://github.com/etal/cnvkit/issues/346
                if cov_interval == "genome":
                    cores = max(1, dd.get_cores(data) // 2)
                else:
                    cores = dd.get_cores(data)
                cmd = [
                    _get_cmd(), "segment", "-p",
                    str(cores), "-o", tx_out_file, cnr_file
                ]
                small_vrn_files = _compatible_small_variants(data, items)
                if len(small_vrn_files) > 0 and _cna_has_values(
                        cnr_file) and cov_interval != "genome":
                    cmd += [
                        "--vcf", small_vrn_files[0].name, "--sample-id",
                        small_vrn_files[0].sample
                    ]
                    if small_vrn_files[0].normal:
                        cmd += ["--normal-id", small_vrn_files[0].normal]
                resources = config_utils.get_resources("cnvkit_segment",
                                                       data["config"])
                user_options = resources.get("options", [])
                cmd += [str(x) for x in user_options]
                if cov_interval == "genome" and "--threshold" not in user_options:
                    cmd += ["--threshold", "0.00001"]
                # For tumors, remove very low normalized regions, avoiding upcaptured noise
                # https://github.com/bcbio/bcbio-nextgen/issues/2171#issuecomment-348333650
                # unless we want detailed segmentation for downstream tools
                paired = vcfutils.get_paired(items)
                if paired:
                    #if detailed:
                    #    cmd += ["-m", "hmm-tumor"]
                    if "--drop-low-coverage" not in user_options:
                        cmd += ["--drop-low-coverage"]
                # preferentially use conda installed Rscript
                export_cmd = (
                    "%s && export TMPDIR=%s && " %
                    (utils.get_R_exports(), os.path.dirname(tx_out_file)))
                do.run(export_cmd + " ".join(cmd), "CNVkit segment")
    return out_file
Example 21
def _pick_lead_item(items):
    """Choose lead item for a set of samples.

    Picks tumors for tumor/normal pairs and first sample for batch groups.
    """
    paired = vcfutils.get_paired(items)
    if paired:
        return paired.tumor_data
    else:
        return list(items)[0]
Example 22
def _annotate_somatic(data):
    """Annotate somatic calls if we have cosmic data installed.
    """
    if is_human(data):
        paired = vcfutils.get_paired([data])
        if paired:
            r = dd.get_variation_resources(data)
            if r.get("cosmic") and os.path.exists(r["cosmic"]):
                return True
    return False
Example 23
def _annotate_somatic(data, retriever=None):
    """Annotate somatic calls if we have cosmic data installed.
    """
    if is_human(data):
        paired = vcfutils.get_paired([data])
        if paired:
            r = dd.get_variation_resources(data)
            if r.get("cosmic") and objectstore.file_exists_or_remote(r["cosmic"]):
                return True
    return False
Example 24
def _annotate_somatic(data, retriever=None):
    """Annotate somatic calls if we have cosmic data installed.
    """
    if is_human(data):
        paired = vcfutils.get_paired([data])
        if paired:
            r = dd.get_variation_resources(data)
            if r.get("cosmic") and objectstore.file_exists_or_remote(r["cosmic"]):
                return True
    return False
Example 25
def run(items):
    """Perform detection of structural variations with lumpy.
    """
    paired = vcfutils.get_paired(items)
    work_dir = _sv_workdir(
        paired.tumor_data if paired and paired.tumor_data else items[0])
    previous_evidence = {}
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        full_bams.append(dd.get_align_bam(data))
        sr_bam, disc_bam = sshared.find_existing_split_discordants(data)
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
        cur_dels, cur_dups = _bedpes_from_cnv_caller(data, work_dir)
        previous_evidence[dd.get_sample_name(data)] = {}
        if cur_dels and utils.file_exists(cur_dels):
            previous_evidence[dd.get_sample_name(data)]["dels"] = cur_dels
        if cur_dups and utils.file_exists(cur_dups):
            previous_evidence[dd.get_sample_name(data)]["dups"] = cur_dups
    lumpy_vcf, exclude_file = _run_smoove(full_bams, sr_bams, disc_bams,
                                          work_dir, items)
    lumpy_vcf = sshared.annotate_with_depth(lumpy_vcf, items)
    gt_vcfs = {}
    # Retain paired samples with tumor/normal genotyped in one file
    if paired and paired.normal_name:
        batches = [[paired.tumor_data, paired.normal_data]]
    else:
        batches = [[x] for x in items]

    for batch_items in batches:
        for data in batch_items:
            gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(
                lumpy_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background(paired.tumor_name,
                                        [paired.normal_name], gt_vcfs,
                                        paired.tumor_data)
    out = []
    upload_counts = collections.defaultdict(int)
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        vcf_file = gt_vcfs.get(dd.get_sample_name(data))
        if vcf_file:
            effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
            data["sv"].append({
                "variantcaller": "lumpy",
                "vrn_file": effects_vcf or vcf_file,
                "do_upload": upload_counts[vcf_file] ==
                0,  # only upload a single file per batch
                "exclude_file": exclude_file
            })
            upload_counts[vcf_file] += 1
        out.append(data)
    return out
Example 26
def run(items, background=None):
    """Detect copy number variations from batched set of samples using GATK4 CNV calling.

    TODO: implement germline calling with DetermineGermlineContigPloidy and GermlineCNVCaller
    """
    if not background: background = []
    paired = vcfutils.get_paired(items + background)
    if paired:
        out = _run_paired(paired)
    else:
        out = items
        logger.warn("GATK4 CNV calling currently only available for somatic samples: %s" %
                    ", ".join([dd.get_sample_name(d) for d in items + background]))
    return out
Example 27
def run(items):
    paired = vcfutils.get_paired(items)
    if not paired or not paired.normal_name:
        logger.info(
            "Skipping PURPLE; need tumor/normal somatic calls in batch: %s" %
            " ".join([dd.get_sample_name(d) for d in items]))
        return items
    work_dir = _sv_workdir(paired.tumor_data)
    from bcbio import heterogeneity
    het_file = _amber_het_file(heterogeneity.get_variants(paired.tumor_data),
                               work_dir, paired)
    depth_file = _run_cobalt(paired, work_dir)
    print(het_file, depth_file)
    return items
Example 28
def run(items, background=None):
    """Detect copy number variations from batched set of samples using GATK4 CNV calling.

    TODO: implement germline calling with DetermineGermlineContigPloidy and GermlineCNVCaller
    """
    if not background: background = []
    paired = vcfutils.get_paired(items + background)
    if paired:
        out = _run_paired(paired)
    else:
        out = items
        logger.warn("GATK4 CNV calling currently only available for somatic samples: %s" %
                    ", ".join([dd.get_sample_name(d) for d in items + background]))
    return out
Example 29
def run(items):
    """Perform detection of structural variations with Manta.
    """
    paired = vcfutils.get_paired(items)
    work_dir = _sv_workdir(paired.tumor_data if paired else items[0])
    workflow_file = _prep_config(items, paired, work_dir)
    variant_file = _run_workflow(items, paired, workflow_file, work_dir)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        data["sv"].append({"variantcaller": "manta",
                           "vrn_file": variant_file})
        out.append(data)
    return out
Example 30
def finalize_sv(orig_vcf, data, items):
    """Finalize structural variants, adding effects and splitting if needed.
    """
    paired = vcfutils.get_paired(items)
    # For paired/somatic, attach combined calls to tumor sample
    if paired:
        sample_vcf = orig_vcf if paired.tumor_name == dd.get_sample_name(data) else None
    else:
        sample_vcf = "%s-%s.vcf.gz" % (utils.splitext_plus(orig_vcf)[0], dd.get_sample_name(data))
        sample_vcf = vcfutils.select_sample(orig_vcf, dd.get_sample_name(data), sample_vcf, data["config"])
    if sample_vcf:
        effects_vcf, _ = effects.add_to_vcf(sample_vcf, data, "snpeff")
    else:
        effects_vcf = None
    return effects_vcf or sample_vcf
Example 31
def _compatible_small_variants(data, items):
    """Retrieve small variant (SNP, indel) VCFs compatible with CNVkit.
    """
    from bcbio import heterogeneity
    VarFile = collections.namedtuple("VarFile", ["name", "sample", "normal"])
    out = []
    paired = vcfutils.get_paired(items)
    for v in heterogeneity.get_variants(data, include_germline=not paired):
        vrn_file = v["vrn_file"]
        base, ext = utils.splitext_plus(os.path.basename(vrn_file))
        if paired:
            out.append(VarFile(vrn_file, paired.tumor_name, paired.normal_name))
        else:
            out.append(VarFile(vrn_file, dd.get_sample_name(data), None))
    return out
Example 32
def run(items):
    """Perform detection of structural variations with Manta.
    """
    paired = vcfutils.get_paired(items)
    work_dir = _sv_workdir(paired.tumor_data if paired else items[0])
    workflow_file = _prep_config(items, paired, work_dir)
    variant_file = _run_workflow(items, paired, workflow_file, work_dir)
    sample_file = _select_sample(items, paired, variant_file, work_dir)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        data["sv"].append({"variantcaller": "manta", "vrn_file": sample_file})
        out.append(data)
    return out
Example 33
def _compatible_small_variants(data, items):
    """Retrieve small variant (SNP, indel) VCFs compatible with CNVkit.
    """
    from bcbio import heterogeneity
    VarFile = collections.namedtuple("VarFile", ["name", "sample", "normal"])
    out = []
    paired = vcfutils.get_paired(items)
    for v in heterogeneity.get_variants(data, include_germline=not paired):
        vrn_file = v["vrn_file"]
        base, ext = utils.splitext_plus(os.path.basename(vrn_file))
        if paired:
            out.append(VarFile(vrn_file, paired.tumor_name, paired.normal_name))
        else:
            out.append(VarFile(vrn_file, dd.get_sample_name(data), None))
    return out
Example 34
def get_qc_tools(data):
    """Retrieve a list of QC tools to use based on configuration and analysis type.

    Uses the configured QC list if already set; otherwise builds defaults from the analysis type.
    """
    if dd.get_algorithm_qc(data):
        return dd.get_algorithm_qc(data)
    analysis = data["analysis"].lower()
    to_run = []
    if tz.get_in(["config", "algorithm", "kraken"], data):
        to_run.append("kraken")
    if "fastqc" not in dd.get_tools_off(data):
        to_run.append("fastqc")
    if any([
            tool in dd.get_tools_on(data)
            for tool in ["qualimap", "qualimap_full"]
    ]):
        to_run.append("qualimap")
    if analysis.startswith("rna-seq") or analysis == "smallrna-seq":
        if "qualimap" not in dd.get_tools_off(data):
            if gtf.is_qualimap_compatible(dd.get_gtf_file(data)):
                to_run.append("qualimap_rnaseq")
            else:
                logger.debug("GTF not compatible with Qualimap, skipping.")
    if analysis.startswith("chip-seq"):
        to_run.append("chipqc")
    if analysis.startswith("smallrna-seq"):
        to_run.append("small-rna")
        to_run.append("atropos")
    if "coverage_qc" not in dd.get_tools_off(data):
        to_run.append("samtools")
    if analysis.startswith(("standard", "variant", "variant2")):
        if "coverage_qc" not in dd.get_tools_off(data):
            to_run += ["coverage", "picard"]
        to_run += ["qsignature", "variants"]
        if vcfanno.is_human(data):
            to_run += ["contamination", "peddy"]
        if vcfutils.get_paired([data]):
            to_run += ["viral"]
        if damage.should_filter([data]):
            to_run += ["damage"]
    if dd.get_umi_consensus(data):
        to_run += ["umi"]
    if tz.get_in(["config", "algorithm", "preseq"], data):
        to_run.append("preseq")
    to_run = [tool for tool in to_run if tool not in dd.get_tools_off(data)]
    to_run.sort()
    return to_run
Example 35
def _add_vcf_header_sample_cl(in_file, items, base_file):
    """Add phenotype information to a VCF header.

    Encode tumor/normal relationships in VCF header.
    Could also eventually handle more complicated pedigree information if useful.
    """
    paired = vcfutils.get_paired(items)
    if paired:
        toadd = ["##SAMPLE=<ID=%s,Genomes=Tumor>" % paired.tumor_name]
        if paired.normal_name:
            toadd.append("##SAMPLE=<ID=%s,Genomes=Germline>" % paired.normal_name)
            toadd.append("##PEDIGREE=<Derived=%s,Original=%s>" % (paired.tumor_name, paired.normal_name))
        new_header = _update_header(in_file, base_file, toadd, _fix_generic_tn_names(paired))
        if vcfutils.vcf_has_variants(in_file):
            cmd = "bcftools reheader -h {new_header} | bcftools view "
            return cmd.format(**locals())
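For a concrete picture of the header additions built above, here is what they evaluate to for a hypothetical pair (the sample names are placeholders):

# Placeholder tumor/normal names for illustration
tumor_name, normal_name = "sampleT", "sampleN"
toadd = ["##SAMPLE=<ID=%s,Genomes=Tumor>" % tumor_name,
         "##SAMPLE=<ID=%s,Genomes=Germline>" % normal_name,
         "##PEDIGREE=<Derived=%s,Original=%s>" % (tumor_name, normal_name)]
print("\n".join(toadd))
# ##SAMPLE=<ID=sampleT,Genomes=Tumor>
# ##SAMPLE=<ID=sampleN,Genomes=Germline>
# ##PEDIGREE=<Derived=sampleT,Original=sampleN>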
Example 36
def _compatible_small_variants(data, items):
    """Retrieve small variant (SNP, indel) VCFs compatible with CNVkit.
    """
    VarFile = collections.namedtuple("VarFile", ["name", "sample", "normal"])
    supported = set(["vardict", "freebayes", "gatk-haplotype", "mutect2"])
    out = []
    for v in data.get("variants", []):
        vrn_file = v.get("vrn_file")
        if vrn_file and v.get("variantcaller") in supported:
            base, ext = utils.splitext_plus(os.path.basename(vrn_file))
            paired = vcfutils.get_paired(items)
            if paired:
                out.append(VarFile(vrn_file, paired.tumor_name, paired.normal_name))
            else:
                out.append(VarFile(vrn_file, dd.get_sample_name(data), None))
    return out
Example 37
def _add_vcf_header_sample_cl(in_file, items, base_file):
    """Add phenotype information to a VCF header.

    Encode tumor/normal relationships in VCF header.
    Could also eventually handle more complicated pedigree information if useful.
    """
    paired = vcfutils.get_paired(items)
    if paired:
        toadd = ["##SAMPLE=<ID=%s,Genomes=Tumor>" % paired.tumor_name]
        if paired.normal_name:
            toadd.append("##SAMPLE=<ID=%s,Genomes=Germline>" % paired.normal_name)
            toadd.append("##PEDIGREE=<Derived=%s,Original=%s>" % (paired.tumor_name, paired.normal_name))
        new_header = _update_header(in_file, base_file, toadd, _fix_generic_tn_names(paired))
        if vcfutils.vcf_has_variants(in_file):
            cmd = "bcftools reheader -h {new_header} | bcftools view "
            return cmd.format(**locals())
Example 38
def run(items):
    """Perform detection of structural variations with lumpy.
    """
    paired = vcfutils.get_paired(items)
    work_dir = _sv_workdir(paired.tumor_data if paired and paired.tumor_data else items[0])
    previous_evidence = {}
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        full_bams.append(dd.get_align_bam(data))
        sr_bam, disc_bam = sshared.find_existing_split_discordants(data)
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
        cur_dels, cur_dups = _bedpes_from_cnv_caller(data, work_dir)
        previous_evidence[dd.get_sample_name(data)] = {}
        if cur_dels and utils.file_exists(cur_dels):
            previous_evidence[dd.get_sample_name(data)]["dels"] = cur_dels
        if cur_dups and utils.file_exists(cur_dups):
            previous_evidence[dd.get_sample_name(data)]["dups"] = cur_dups
    lumpy_vcf, exclude_file = _run_smoove(full_bams, sr_bams, disc_bams, work_dir, items)
    lumpy_vcf = sshared.annotate_with_depth(lumpy_vcf, items)
    gt_vcfs = {}
    # Retain paired samples with tumor/normal genotyped in one file
    if paired and paired.normal_name:
        batches = [[paired.tumor_data, paired.normal_data]]
    else:
        batches = [[x] for x in items]

    for batch_items in batches:
        for data in batch_items:
            gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(lumpy_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background(paired.tumor_name, [paired.normal_name], gt_vcfs, paired.tumor_data)
    out = []
    upload_counts = collections.defaultdict(int)
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        vcf_file = gt_vcfs.get(dd.get_sample_name(data))
        if vcf_file:
            effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
            data["sv"].append({"variantcaller": "lumpy",
                               "vrn_file": effects_vcf or vcf_file,
                               "do_upload": upload_counts[vcf_file] == 0,  # only upload a single file per batch
                               "exclude_file": exclude_file})
            upload_counts[vcf_file] += 1
        out.append(data)
    return out
Example 39
def run(items):
    paired = vcfutils.get_paired(items)
    if not paired:
        logger.info("Skipping PureCN; no somatic tumor calls in batch: %s" %
                    " ".join([dd.get_sample_name(d) for d in items]))
        return items
    work_dir = _sv_workdir(paired.tumor_data)
    purecn_out = _run_purecn(paired, work_dir)
    purecn_out = _run_purecn_dx(purecn_out, paired)
    purecn_out["variantcaller"] = "purecn"
    out = []
    if paired.normal_data:
        out.append(paired.normal_data)
    if "sv" not in paired.tumor_data:
        paired.tumor_data["sv"] = []
    paired.tumor_data["sv"].append(purecn_out)
    out.append(paired.tumor_data)
    return out
Example 40
def run(items):
    """Perform detection of structural variations with Manta.
    """
    paired = vcfutils.get_paired(items)
    work_dir = _sv_workdir(paired.tumor_data if paired else items[0])
    workflow_file = _prep_config(items, paired, work_dir)
    variant_file = _run_workflow(items, paired, workflow_file, work_dir)
    out = []
    for data in items:
        sample_file = _select_sample(data, variant_file, work_dir)
        if "sv" not in data:
            data["sv"] = []
        effects_vcf, _ = effects.add_to_vcf(sample_file, data, "snpeff")
        data["sv"].append({"variantcaller": "manta",
                           "vrn_file": effects_vcf or sample_file})
        out.append(data)
    return out
Example 41
def finalize_sv(orig_vcf, data, items):
    """Finalize structural variants, adding effects and splitting if needed.
    """
    paired = vcfutils.get_paired(items)
    # For paired/somatic, attach combined calls to tumor sample
    if paired:
        sample_vcf = orig_vcf if paired.tumor_name == dd.get_sample_name(
            data) else None
    else:
        sample_vcf = "%s-%s.vcf.gz" % (utils.splitext_plus(orig_vcf)[0],
                                       dd.get_sample_name(data))
        sample_vcf = vcfutils.select_sample(orig_vcf, dd.get_sample_name(data),
                                            sample_vcf, data["config"])
    if sample_vcf:
        effects_vcf, _ = effects.add_to_vcf(sample_vcf, data, "snpeff")
    else:
        effects_vcf = None
    return effects_vcf or sample_vcf
Example 42
def _cnvkit_segment(cnr_file, cov_interval, data, items, out_file=None, detailed=False):
    """Perform segmentation and copy number calling on normalized inputs
    """
    if not out_file:
        out_file = "%s.cns" % os.path.splitext(cnr_file)[0]
    if not utils.file_uptodate(out_file, cnr_file):
        with file_transaction(data, out_file) as tx_out_file:
            if not _cna_has_values(cnr_file):
                with open(tx_out_file, "w") as out_handle:
                    out_handle.write("chromosome\tstart\tend\tgene\tlog2\tprobes\tCN1\tCN2\tbaf\tweight\n")
            else:
                # Scale cores to avoid memory issues with segmentation
                # https://github.com/etal/cnvkit/issues/346
                if cov_interval == "genome":
                    cores = max(1, dd.get_cores(data) // 2)
                else:
                    cores = dd.get_cores(data)
                cmd = [_get_cmd(), "segment", "-p", str(cores), "-o", tx_out_file, cnr_file]
                small_vrn_files = _compatible_small_variants(data, items)
                if len(small_vrn_files) > 0 and _cna_has_values(cnr_file) and cov_interval != "genome":
                    cmd += ["--vcf", small_vrn_files[0].name, "--sample-id", small_vrn_files[0].sample]
                    if small_vrn_files[0].normal:
                        cmd += ["--normal-id", small_vrn_files[0].normal]
                resources = config_utils.get_resources("cnvkit_segment", data["config"])
                user_options = resources.get("options", [])
                cmd += [str(x) for x in user_options]
                if cov_interval == "genome" and "--threshold" not in user_options:
                    cmd += ["--threshold", "0.00001"]
                # For tumors, remove very low normalized regions, avoiding upcaptured noise
                # https://github.com/bcbio/bcbio-nextgen/issues/2171#issuecomment-348333650
                # unless we want detailed segmentation for downstream tools
                paired = vcfutils.get_paired(items)
                if paired:
                    #if detailed:
                    #    cmd += ["-m", "hmm-tumor"]
                    if "--drop-low-coverage" not in user_options:
                        cmd += ["--drop-low-coverage"]
                # preferentially use conda installed Rscript
                export_cmd = ("%s && export TMPDIR=%s && "
                              % (utils.get_R_exports(), os.path.dirname(tx_out_file)))
                do.run(export_cmd + " ".join(cmd), "CNVkit segment")
    return out_file
Example 43
def run(items):
    paired = vcfutils.get_paired(items)
    if not paired:
        logger.info("Skipping PureCN; no somatic tumor calls in batch: %s" %
                    " ".join([dd.get_sample_name(d) for d in items]))
        return items
    work_dir = _sv_workdir(paired.tumor_data)
    purecn_out = _run_purecn(paired, work_dir)
    # XXX Currently finding edge case failures with Dx calling, needs additional testing
    # purecn_out = _run_purecn_dx(purecn_out, paired)
    out = []
    if paired.normal_data:
        out.append(paired.normal_data)
    if purecn_out:
        purecn_out["variantcaller"] = "purecn"
        if "sv" not in paired.tumor_data:
            paired.tumor_data["sv"] = []
        paired.tumor_data["sv"].append(purecn_out)
    out.append(paired.tumor_data)
    return out
Example 44
def run(items):
    paired = vcfutils.get_paired(items)
    if not paired or not paired.normal_name:
        logger.info("Skipping PURPLE; need tumor/normal somatic calls in batch: %s" %
                    " ".join([dd.get_sample_name(d) for d in items]))
        return items
    work_dir = _sv_workdir(paired.tumor_data)
    from bcbio import heterogeneity
    vrn_files = heterogeneity.get_variants(paired.tumor_data, include_germline=False)
    het_file = _amber_het_file("pon", vrn_files, work_dir, paired)
    depth_file = _run_cobalt(paired, work_dir)
    purple_out = _run_purple(paired, het_file, depth_file, vrn_files, work_dir)
    out = []
    if paired.normal_data:
        out.append(paired.normal_data)
    if "sv" not in paired.tumor_data:
        paired.tumor_data["sv"] = []
    paired.tumor_data["sv"].append(purple_out)
    out.append(paired.tumor_data)
    return out
Example 45
def run(items):
    """Perform detection of structural variations with Manta.
    """
    paired = vcfutils.get_paired(items)
    data = paired.tumor_data if paired else items[0]
    work_dir = _sv_workdir(data)
    variant_file = _get_out_file(work_dir, paired)
    if not utils.file_exists(variant_file):
        with file_transaction(data, work_dir) as tx_work_dir:
            utils.safe_makedir(tx_work_dir)
            tx_workflow_file = _prep_config(items, paired, tx_work_dir)
            _run_workflow(items, paired, tx_workflow_file, tx_work_dir)
    assert utils.file_exists(variant_file), "Manta finished without output file %s" % variant_file
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        final_vcf = shared.finalize_sv(variant_file, data, items)
        data["sv"].append({"variantcaller": "manta", "vrn_file": final_vcf})
        out.append(data)
    return out
Example 46
def run(items):
    """Perform detection of structural variations with Manta.
    """
    paired = vcfutils.get_paired(items)
    data = paired.tumor_data if paired else items[0]
    work_dir = _sv_workdir(data)
    variant_file = _get_out_file(work_dir, paired)
    if not utils.file_exists(variant_file):
        with file_transaction(data, work_dir) as tx_work_dir:
            utils.safe_makedir(tx_work_dir)
            tx_workflow_file = _prep_config(items, paired, tx_work_dir)
            _run_workflow(items, paired, tx_workflow_file, tx_work_dir)
    assert utils.file_exists(
        variant_file), "Manta finished without output file %s" % variant_file
    variant_file = shared.annotate_with_depth(variant_file, items)
    out = []
    upload_counts = collections.defaultdict(int)
    for data in items:
        if "break-point-inspector" in dd.get_tools_on(data):
            if paired and paired.normal_bam and paired.tumor_name == dd.get_sample_name(
                    data):
                variant_file = _run_break_point_inspector(
                    data, variant_file, paired, work_dir)
        if "sv" not in data:
            data["sv"] = []
        final_vcf = shared.finalize_sv(variant_file, data, items)
        vc = {
            "variantcaller": "manta",
            "do_upload": upload_counts[final_vcf] == 0,  # only upload a single file per batch
            "vrn_file": final_vcf
        }
        evidence_bam = _get_evidence_bam(work_dir, data)
        if evidence_bam:
            vc["read_evidence"] = evidence_bam
        data["sv"].append(vc)
        upload_counts[final_vcf] += 1
        out.append(data)
    return out
Example 47
def run(items):
    paired = vcfutils.get_paired(items)
    if not paired or not paired.normal_name:
        logger.info(
            "Skipping PURPLE; need tumor/normal somatic calls in batch: %s" %
            " ".join([dd.get_sample_name(d) for d in items]))
        return items
    work_dir = _sv_workdir(paired.tumor_data)
    from bcbio import heterogeneity
    vrn_files = heterogeneity.get_variants(paired.tumor_data,
                                           include_germline=False)
    het_file = _amber_het_file("pon", vrn_files, work_dir, paired)
    depth_file = _run_cobalt(paired, work_dir)
    purple_out = _run_purple(paired, het_file, depth_file, vrn_files, work_dir)
    out = []
    if paired.normal_data:
        out.append(paired.normal_data)
    if "sv" not in paired.tumor_data:
        paired.tumor_data["sv"] = []
    paired.tumor_data["sv"].append(purple_out)
    out.append(paired.tumor_data)
    return out
Example 48
def population_variant_regions(items):
    """Retrieve the variant region BED file from a population of items.

    If tumor/normal, return the tumor BED file. If a population, return
    the BED file covering the most bases.
    """
    import pybedtools
    if len(items) == 1:
        return dd.get_variant_regions(items[0])
    else:
        paired = vcfutils.get_paired(items)
        if paired:
            return dd.get_variant_regions(paired.tumor_data)
        else:
            vrs = []
            for data in items:
                vr_bed = dd.get_variant_regions(data)
                if vr_bed:
                    vrs.append((pybedtools.BedTool(vr_bed).total_coverage(), vr_bed))
            vrs.sort(reverse=True)
            if vrs:
                return vrs[0][1]
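The pooled branch ranks candidate BED files by total covered bases and keeps the largest. The following is a pybedtools-free sketch of the same ranking that merges overlapping intervals before summing; the in-memory BED lines are a simplification of reading real files.

def total_covered_bases(bed_lines):
    """Sum covered bases after merging overlapping intervals per chromosome."""
    by_chrom = {}
    for line in bed_lines:
        if not line.strip() or line.startswith(("#", "track")):
            continue
        chrom, start, end = line.split("\t")[:3]
        by_chrom.setdefault(chrom, []).append((int(start), int(end)))
    total = 0
    for intervals in by_chrom.values():
        intervals.sort()
        cur_start, cur_end = intervals[0]
        for start, end in intervals[1:]:
            if start > cur_end:
                total += cur_end - cur_start
                cur_start, cur_end = start, end
            else:
                cur_end = max(cur_end, end)
        total += cur_end - cur_start
    return total

def widest_bed(beds):
    """Return the BED name covering the most bases, mirroring the sort above."""
    scored = sorted(((total_covered_bases(lines), name) for name, lines in beds), reverse=True)
    return scored[0][1] if scored else None

if __name__ == "__main__":
    beds = [("a.bed", ["chr1\t0\t100", "chr1\t50\t200"]),   # merges to 200 covered bases
            ("b.bed", ["chr1\t0\t50", "chr2\t0\t100"])]     # 150 covered bases
    print(widest_bed(beds))  # a.bed
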
Example no. 50
def run(items, background=None):
    """Perform detection of structural variations with Manta.
    """
    paired = vcfutils.get_paired(items)
    if paired:
        inputs = [paired.tumor_data]
        background = [paired.normal_data] if paired.normal_bam else []
    else:
        assert not background
        inputs, background = sshared.find_case_control(items)
    work_dir = _sv_workdir(inputs[0])
    variant_file = _run_gridss(inputs, background, work_dir)
    out = []
    for data in items:
        sample_file = variant_file
        if "sv" not in data:
            data["sv"] = []
        effects_vcf, _ = effects.add_to_vcf(sample_file, data, "snpeff")
        data["sv"].append({"variantcaller": "gridss",
                           "vrn_file": effects_vcf or sample_file})
        out.append(data)
    return out
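Without an explicit tumor/normal pairing, the GRIDSS wrapper falls back on a case/control split of the batch. The sketch below shows one way such a split could look; it keys on a hypothetical phenotype field rather than bcbio's sshared.find_case_control.

def find_case_control(samples):
    """Split samples into cases and controls; unlabeled samples are treated as cases."""
    cases, controls = [], []
    for data in samples:
        if data.get("phenotype") == "control":
            controls.append(data)
        else:
            cases.append(data)
    return cases, controls

if __name__ == "__main__":
    batch = [{"name": "affected1"},
             {"name": "affected2"},
             {"name": "parent1", "phenotype": "control"}]
    inputs, background = find_case_control(batch)
    print([d["name"] for d in inputs], [d["name"] for d in background])
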
Example no. 51
def should_filter(items):
    """Check if we should do damage filtering on somatic calling with low frequency events.
    """
    return (vcfutils.get_paired(items) is not None and
            any("damage_filter" in dd.get_tools_on(d) for d in items))
Example no. 52
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None,
                  out_file=None):
    """Run the MuTect paired analysis algorithm.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        base_config = items[0]["config"]
        broad_runner = broad.runner_from_config(base_config, "mutect")
        out_file_mutect = (out_file.replace(".vcf", "-mutect.vcf")
                           if "vcf" in out_file else out_file + "-mutect.vcf")
        broad_runner, params = _mutect_call_prep(
            align_bams, items, ref_file, assoc_files, region, out_file_mutect)
        if (not isinstance(region, (list, tuple)) and
              not all(has_aligned_reads(x, region) for x in align_bams)):
            paired = vcfutils.get_paired(items)
            vcfutils.write_empty_vcf(out_file, samples=[x for x in (paired.tumor_name, paired.normal_name) if x])
            return
        out_file_orig = "%s-orig%s" % utils.splitext_plus(out_file_mutect)
        if not file_exists(out_file_orig):
            with file_transaction(config, out_file_orig) as tx_out_file:
                # Rationale: MuTect writes another table to stdout, which we don't need
                params += ["--vcf", tx_out_file, "-o", os.devnull]
                broad_runner.run_mutect(params)
        is_paired = "-I:normal" in params
        if not utils.file_uptodate(out_file_mutect, out_file_orig):
            out_file_mutect = _fix_mutect_output(out_file_orig, config, out_file_mutect, is_paired)
        indelcaller = vcfutils.get_indelcaller(base_config)
        if ("scalpel" in indelcaller.lower() and region and isinstance(region, (tuple, list))
              and chromhacks.is_autosomal_or_sex(region[0])):
            # Scalpel InDels
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file + "-somaticIndels.vcf")
            if scalpel.is_installed(items[0]["config"]):
                if not is_paired:
                    vcfutils.check_paired_problems(items)
                    scalpel._run_scalpel_caller(align_bams, items, ref_file, assoc_files,
                                                region=region, out_file=out_file_indels)
                else:
                    scalpel._run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                                                region=region, out_file=out_file_indels)
                out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                          out_file=out_file,
                                                          ref_file=items[0]["sam_ref"],
                                                          config=items[0]["config"],
                                                          region=region)
            else:
                utils.symlink_plus(out_file_mutect, out_file)
        elif "pindel" in indelcaller.lower():
            from bcbio.structural import pindel
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file + "-somaticIndels.vcf")
            if pindel.is_installed(items[0]["config"]):
                pindel._run_tumor_pindel_caller(
                    align_bams, items, ref_file, assoc_files,
                    region=region, out_file=out_file_indels)
                out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                          out_file=out_file,
                                                          ref_file=ref_file,
                                                          config=items[0]["config"],
                                                          region=region)
            else:
                utils.symlink_plus(out_file_mutect, out_file)
        elif (("somaticindeldetector" in indelcaller.lower() or "sid" in indelcaller.lower())
              and "appistry" in broad_runner.get_mutect_version()):
            # SomaticIndelDetector InDels
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file + "-somaticIndels.vcf")
            params_indels = _SID_call_prep(align_bams, items, ref_file, assoc_files,
                                           region, out_file_indels)
            with file_transaction(config, out_file_indels) as tx_out_file:
                params_indels += ["-o", tx_out_file]
                broad_runner.run_mutect(params_indels)
            out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                      out_file=out_file,
                                                      ref_file=items[0]["sam_ref"],
                                                      config=items[0]["config"],
                                                      region=region)
        else:
            utils.symlink_plus(out_file_mutect, out_file)
    return out_file
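
Two patterns recur in mutect_caller: deriving a companion indel file name from the SNV output name, and either combining both call sets or falling back to the MuTect-only calls when the indel caller is unavailable. The sketch below mirrors that flow in plain Python; the combine step is a stub (bcbio would go through vcfutils.combine_variant_files) and the fallback copies rather than symlinks.

import os
import shutil

def companion_name(out_file, suffix):
    """Derive e.g. sample-somaticIndels.vcf from sample.vcf, mirroring the logic above."""
    return (out_file.replace(".vcf", "-%s.vcf" % suffix)
            if "vcf" in out_file else "%s-%s.vcf" % (out_file, suffix))

def finalize(out_file, snv_file, indel_file=None):
    """Combine SNV and indel calls when both exist, otherwise keep the SNV calls."""
    if indel_file and os.path.exists(indel_file):
        # Stub for a real VCF merge; a proper pipeline would sort and merge records.
        with open(out_file, "w") as out_handle:
            for fname in (snv_file, indel_file):
                with open(fname) as in_handle:
                    out_handle.write(in_handle.read())
    else:
        # The source uses utils.symlink_plus; a plain copy keeps this sketch portable.
        shutil.copy(snv_file, out_file)
    return out_file

if __name__ == "__main__":
    print(companion_name("batch1-paired-variants.vcf.gz", "somaticIndels"))
    # batch1-paired-variants-somaticIndels.vcf.gz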