def split_variants_by_sample(data):
    """Expand a batched variant-call record into one entry per original sample.

    Returns a list of single-item lists.

    - Without a ``group_orig`` key, ``data`` is passed through as ``[[data]]``.
    - For a paired analysis whose original items include a tumor, tumor
      samples receive the shared ``vrn_file`` (and the batch name when one is
      set); every other sample has its ``vrn_file`` removed.
    - Otherwise each original item keeps the un-split batch file as both
      ``vrn_file_batch`` and ``vrn_file``.
    """
    if "group_orig" not in data:
        # Never grouped, so there is nothing to split back up.
        return [[data]]

    paired_with_tumor = (
        vcfutils.get_paired_phenotype(data)
        and any(vcfutils.get_paired_phenotype(item) == "tumor"
                for item in get_orig_items(data)))

    results = []
    if paired_with_tumor:
        # Cancer tumor/normal: only the tumor carries the combined calls.
        for sample in get_orig_items(data):
            if vcfutils.get_paired_phenotype(sample) != "tumor":
                sample.pop("vrn_file", None)
            else:
                batch_name = tz.get_in(["metadata", "batch"], data)
                if batch_name:
                    sample["metadata"]["batch"] = batch_name
                sample["vrn_file"] = data["vrn_file"]
            results.append([sample])
        return results

    # Joint calling or population runs: keep the batch file on every sample.
    for sample in get_orig_items(data):
        batch_name = tz.get_in(["metadata", "batch"], data)
        if batch_name:
            sample["metadata"]["batch"] = batch_name
        sample["vrn_file_batch"] = sample["vrn_file"] = data["vrn_file"]
        results.append([sample])
    return results
def split_variants_by_sample(data):
    """Turn a multi-sample call record into per-sample inputs.

    Returns a list of single-item lists. Records without ``group_orig`` are
    returned unchanged as ``[[data]]``. In paired analyses the combined
    ``vrn_file`` is attached to tumor samples only and no file is split.
    Otherwise a per-sample file name is derived from the batch file name and
    is either extracted from the multi-sample file or symlinked to it.
    """
    if "group_orig" not in data:
        return [[data]]

    originals = data["group_orig"]

    # Cancer tumor/normal: hand the combined file to the tumor.
    if vcfutils.get_paired_phenotype(data):
        for sample in originals:
            if vcfutils.get_paired_phenotype(sample) == "tumor":
                sample["vrn_file"] = data["vrn_file"]
        return [[sample] for sample in originals]

    # Population or single sample: one file per sample.
    results = []
    for sample in originals:
        batch_vrn = data["vrn_file"]
        group_prefix = str(data["group"][0]) + "-"
        sample_name = str(sample["name"][-1])
        sample_vrn = batch_vrn.replace(group_prefix, sample_name + "-")
        if len(vcfutils.get_samples(data["vrn_file"])) > 1:
            # Multi-sample file: pull out just this sample's calls.
            vcfutils.select_sample(data["vrn_file"], sample_name, sample_vrn, data["config"])
        elif not os.path.exists(sample_vrn):
            utils.symlink_plus(data["vrn_file"], sample_vrn)
        sample["vrn_file"] = sample_vrn
        results.append([sample])
    return results
Exemple #3
0
def split_variants_by_sample(data):
    """Turn a multi-sample call record into per-sample inputs.

    Returns a list of single-item lists:

    - no ``group_orig`` key: ``[[data]]`` unchanged;
    - paired analysis: tumor samples get the combined ``vrn_file``, other
      samples have ``vrn_file`` removed;
    - ``jointcaller`` configured: ``[[data]]`` unchanged, not split;
    - otherwise: each sample gets its own file (extracted or symlinked) in
      ``vrn_file`` and the batch file in ``vrn_file_batch``.
    """
    if "group_orig" not in data:
        return [[data]]

    if vcfutils.get_paired_phenotype(data):
        # Cancer tumor/normal.
        results = []
        for sample in get_orig_items(data):
            if vcfutils.get_paired_phenotype(sample) != "tumor":
                sample.pop("vrn_file", None)
            else:
                sample["vrn_file"] = data["vrn_file"]
            results.append([sample])
        return results

    if tz.get_in(("config", "algorithm", "jointcaller"), data):
        # Joint calling: potentially large sample sizes, keep as a batch.
        return [[data]]

    # Population or single sample.
    results = []
    for sample in get_orig_items(data):
        batch_vrn = data["vrn_file"]
        group_prefix = str(data["group"][0]) + "-"
        sample_name = str(sample["name"][-1])
        sample_vrn = batch_vrn.replace(group_prefix, sample_name + "-")
        if len(vcfutils.get_samples(data["vrn_file"])) > 1:
            vcfutils.select_sample(data["vrn_file"], sample_name, sample_vrn, data["config"])
        elif not os.path.exists(sample_vrn):
            utils.symlink_plus(data["vrn_file"], sample_vrn)
        sample["vrn_file_batch"] = data["vrn_file"]
        sample["vrn_file"] = sample_vrn
        results.append([sample])
    return results
Exemple #4
0
def split_variants_by_sample(data):
    """Fan a batched call record back out to its original samples.

    Returns a list of single-item lists. Ungrouped input (no ``group_orig``)
    comes back as ``[[data]]``. When the record is paired and one of the
    original items is a tumor, the combined ``vrn_file`` goes to tumor samples
    and is removed from the rest. In every other case the batch file is kept
    un-split and stored under both ``vrn_file_batch`` and ``vrn_file``.
    """
    if "group_orig" not in data:
        return [[data]]

    def _copy_batch_name(target):
        # Carry the batch identifier of the combined record onto the sample.
        batch_name = tz.get_in(["metadata", "batch"], data)
        if batch_name:
            target["metadata"]["batch"] = batch_name

    is_paired = vcfutils.get_paired_phenotype(data)
    if is_paired and "tumor" in [vcfutils.get_paired_phenotype(x)
                                 for x in get_orig_items(data)]:
        # Cancer tumor/normal.
        per_sample = []
        for cur in get_orig_items(data):
            if vcfutils.get_paired_phenotype(cur) == "tumor":
                _copy_batch_name(cur)
                cur["vrn_file"] = data["vrn_file"]
            else:
                cur.pop("vrn_file", None)
            per_sample.append([cur])
        return per_sample

    # Joint calling or population runs stay in batches.
    per_sample = []
    for cur in get_orig_items(data):
        _copy_batch_name(cur)
        cur["vrn_file_batch"] = data["vrn_file"]
        cur["vrn_file"] = data["vrn_file"]
        per_sample.append([cur])
    return per_sample
Exemple #5
0
def split_variants_by_sample(data):
    """Fan a multi-sample call record back out to individual samples.

    Returns a list of single-item lists. Handles four cases in order:
    ungrouped input (returned as-is), paired tumor/normal (combined file on
    the tumor, removed elsewhere), joint calling (returned as-is without
    splitting) and population/single-sample runs (a per-sample file is
    extracted or symlinked, and the batch file is kept in ``vrn_file_batch``).
    """
    # Not split, do nothing.
    if "group_orig" not in data:
        return [[data]]

    per_sample = []

    # Cancer tumor/normal.
    if vcfutils.get_paired_phenotype(data):
        for cur in get_orig_items(data):
            is_tumor = vcfutils.get_paired_phenotype(cur) == "tumor"
            if is_tumor:
                cur["vrn_file"] = data["vrn_file"]
            else:
                cur.pop("vrn_file", None)
            per_sample.append([cur])
        return per_sample

    # Joint calling: do not split back up due to potentially large sample sizes.
    if tz.get_in(("config", "algorithm", "jointcaller"), data):
        return [[data]]

    # Population or single sample.
    for cur in get_orig_items(data):
        old_prefix = str(data["group"][0]) + "-"
        new_prefix = str(cur["name"][-1]) + "-"
        cur_vrn_file = data["vrn_file"].replace(old_prefix, new_prefix)
        n_samples = len(vcfutils.get_samples(data["vrn_file"]))
        if n_samples > 1:
            vcfutils.select_sample(data["vrn_file"], str(cur["name"][-1]),
                                   cur_vrn_file, data["config"])
        elif not os.path.exists(cur_vrn_file):
            utils.symlink_plus(data["vrn_file"], cur_vrn_file)
        cur["vrn_file_batch"] = data["vrn_file"]
        cur["vrn_file"] = cur_vrn_file
        per_sample.append([cur])
    return per_sample
Exemple #6
0
def split_variants_by_sample(data):
    """Turn a multi-sample call record into per-sample inputs.

    Returns a list of single-item lists. Records without ``group_orig`` are
    returned as ``[[data]]``. In paired analyses the tumor sample receives the
    combined ``vrn_file``; otherwise each sample gets its own extracted or
    symlinked file. A ``combine`` entry on ``data``, when present, is copied
    to tumor samples (paired case) or to every sample (population case).
    """
    if "group_orig" not in data:
        return [[data]]

    originals = data["group_orig"]
    has_combine = "combine" in data

    # Cancer tumor/normal.
    if vcfutils.get_paired_phenotype(data):
        for sample in originals:
            if vcfutils.get_paired_phenotype(sample) != "tumor":
                continue
            if has_combine:
                sample["combine"] = data["combine"]
            sample["vrn_file"] = data["vrn_file"]
        return [[sample] for sample in originals]

    # Population or single sample.
    results = []
    for sample in originals:
        batch_vrn = data["vrn_file"]
        group_prefix = data["group"][0] + "-"
        sample_vrn = batch_vrn.replace(group_prefix, sample["name"][-1] + "-")
        if len(vcfutils.get_samples(data["vrn_file"])) > 1:
            vcfutils.select_sample(data["vrn_file"], sample["name"][-1], sample_vrn, data["config"])
        elif not os.path.exists(sample_vrn):
            utils.symlink_plus(data["vrn_file"], sample_vrn)
        if has_combine:
            sample["combine"] = data["combine"]
        sample["vrn_file"] = sample_vrn
        results.append([sample])
    return results
def _do_prioritize(items):
    """Decide whether prioritization should run for this group of samples.

    Returns True when the first item has a paired phenotype and the group
    contains a tumor but no normal (tumor-only input), False when it is
    paired with a normal or has no tumor, and None when the first item has
    no paired phenotype.
    """
    if not vcfutils.get_paired_phenotype(items[0]):
        return None
    phenotypes = [vcfutils.get_paired_phenotype(item) for item in items]
    return "tumor" in phenotypes and "normal" not in phenotypes
Exemple #8
0
def _do_prioritize(items):
    """Report whether the samples qualify for prioritization.

    Prioritization applies to tumor-only inputs: a tumor phenotype is present
    and no normal is. Returns None when the first item carries no paired
    phenotype at all.
    """
    if not vcfutils.get_paired_phenotype(items[0]):
        return None
    found_tumor = any(vcfutils.get_paired_phenotype(cur) == "tumor" for cur in items)
    found_normal = any(vcfutils.get_paired_phenotype(cur) == "normal" for cur in items)
    return found_tumor and not found_normal
Exemple #9
0
def _do_prioritize(data):
    """Decide whether prioritization should run for this sample.

    Applies to tumor-only inputs. When ``data`` has a batch in its metadata,
    the original batch items are inspected; otherwise only ``data`` itself.
    Returns True for a tumor without a normal, False otherwise, and None when
    ``data`` has no paired phenotype.
    """
    if not vcfutils.get_paired_phenotype(data):
        return None
    if tz.get_in(["metadata", "batch"], data):
        candidates = vmulti.get_orig_items(data)
    else:
        candidates = [data]
    phenotypes = [vcfutils.get_paired_phenotype(item) for item in candidates]
    return "tumor" in phenotypes and "normal" not in phenotypes
Exemple #10
0
def run(bam_file, data, out_dir):
    """Run viral QC analysis on the unmapped reads of a paired-phenotype sample.

    Unmapped reads from ``bam_file`` are re-aligned against the ``gdc-viral``
    reference and the per-contig aligned counts are written to a
    ``-counts.txt`` file in ``out_dir``.

    Returns a dict with the counts file under ``"base"``, or an empty dict
    when the sample has no paired phenotype or the viral reference file is
    not available.
    """
    viral_target = "gdc-viral"
    out = {}
    if not vcfutils.get_paired_phenotype(data):
        return out

    viral_refs = [ref for ref in dd.get_viral_files(data)
                  if os.path.basename(ref) == "%s.fa" % viral_target]
    if not viral_refs or not utils.file_exists(viral_refs[0]):
        return out

    viral_ref = viral_refs[0]
    work_dir = utils.safe_makedir(out_dir)
    sample_name = dd.get_sample_name(data)
    ref_base = utils.splitext_plus(os.path.basename(viral_ref))[0]
    viral_bam = os.path.join(work_dir, "%s-%s.bam" % (sample_name, ref_base))
    out_file = "%s-counts.txt" % utils.splitext_plus(viral_bam)[0]

    if not utils.file_uptodate(out_file, bam_file):
        if not utils.file_uptodate(viral_bam, bam_file):
            # Align unmapped reads (SAM flag 4) to the viral reference.
            with file_transaction(data, viral_bam) as tx_bam:
                cores = dd.get_num_cores(data)
                tmpfile = "%s-tmp" % utils.splitext_plus(tx_bam)[0]
                cmd = ("samtools view -u -f 4 {bam_file} | "
                       "bamtofastq collate=0 | "
                       "bwa mem -t {cores} {viral_ref} - | "
                       "bamsort tmpfile={tmpfile} inputthreads={cores} outputthreads={cores} "
                       "inputformat=sam index=1 indexfilename={tx_out_file}.bai O={tx_out_file}")
                do.run(cmd.format(bam_file=bam_file, cores=cores, viral_ref=viral_ref,
                                  tmpfile=tmpfile, tx_out_file=tx_bam),
                       "Compare unmapped reads to viral genome")
        # Summarize contigs with at least one aligned read.
        with file_transaction(data, out_file) as tx_counts:
            with open(tx_counts, "w") as out_handle:
                out_handle.write("# sample\t%s\n" % dd.get_sample_name(data))
                for info in bam.idxstats(viral_bam, data):
                    if info.aligned > 0:
                        out_handle.write("%s\t%s\n" % (info.contig, info.aligned))
    out["base"] = out_file
    return out
Exemple #11
0
def _do_prioritize(data):
    """Report whether this sample qualifies for prioritization.

    Only tumor-only inputs qualify. Batched samples (those with a batch in
    their metadata) are checked against all original batch items; unbatched
    samples are checked on their own. Returns None when ``data`` has no
    paired phenotype.
    """
    if not vcfutils.get_paired_phenotype(data):
        return None
    in_batch = tz.get_in(["metadata", "batch"], data)
    to_check = vmulti.get_orig_items(data) if in_batch else [data]
    found_tumor = any(vcfutils.get_paired_phenotype(cur) == "tumor" for cur in to_check)
    found_normal = any(vcfutils.get_paired_phenotype(cur) == "normal" for cur in to_check)
    return found_tumor and not found_normal
Exemple #12
0
def _do_high_depth_filter(data):
    """Check if we should do high depth filtering -- only on germline non-regional calls.

    Returns True when the configured ``coverage_interval`` is ``genome``
    (case-insensitive) and the sample has no paired tumor/normal phenotype;
    False otherwise.
    """
    # A stray unconditional ``return True`` previously short-circuited this
    # check, leaving the documented logic below unreachable.
    is_genome = tz.get_in(["config", "algorithm", "coverage_interval"], data, "").lower() == "genome"
    is_paired = vcfutils.get_paired_phenotype(data)
    return is_genome and not is_paired
Exemple #13
0
def _do_prioritize(items):
    """Decide whether prioritization should run for this group of samples.

    Prioritization is done on tumor-only inputs and feeds PureCN, which needs
    the germline annotations. It is skipped (None is returned) when any item
    lists ``tumoronly-prioritization`` in its ``tools_off`` or when the first
    item has no paired phenotype.
    """
    disabled = any("tumoronly-prioritization" in dd.get_tools_off(item) for item in items)
    if disabled:
        return None
    if not vcfutils.get_paired_phenotype(items[0]):
        return None
    phenotypes = [vcfutils.get_paired_phenotype(item) for item in items]
    return "tumor" in phenotypes and "normal" not in phenotypes
Exemple #14
0
def _run_ensemble_intersection(batch_id, vrn_files, base_dir, edata):
    """Run intersection n out of x based ensemble method using bcbio.variation.recall.

    Writes ``<batch_id>-ensemble.vcf.gz`` into ``base_dir`` unless it already
    exists, and returns a dict describing the ensemble output with the keys
    ``variantcaller``, ``vrn_file`` and ``bed_file`` (always None).
    """
    out_vcf_file = os.path.join(base_dir,
                                "{0}-ensemble.vcf.gz".format(batch_id))
    if not utils.file_exists(out_vcf_file):
        num_pass = _get_num_pass(edata, len(vrn_files))
        cmd = [
            config_utils.get_program("bcbio-variation-recall",
                                     edata["config"]), "ensemble",
            "--cores=%s" % edata["config"]["algorithm"].get("num_cores", 1),
            "--numpass",
            str(num_pass)
        ]
        # Remove filtered calls if we're dealing with tumor/normal calls
        if vcfutils.get_paired_phenotype(edata):
            cmd += ["--nofiltered"]
        cmd += [out_vcf_file, dd.get_ref_file(edata)] + vrn_files
        do.run(cmd, "Ensemble intersection calling: %s" % (batch_id))
    # The previous deep copy of ``edata`` was built and then never used, so
    # it has been dropped.
    return {
        "variantcaller": "ensemble",
        "vrn_file": out_vcf_file,
        "bed_file": None
    }
Exemple #15
0
def _do_prioritize(items):
    """Report whether the samples qualify for prioritization.

    Tumor-only inputs qualify; the result feeds PureCN, which needs the
    germline annotations. Returns None when prioritization is turned off via
    ``tumoronly-prioritization`` in any item's ``tools_off`` or when the
    first item has no paired phenotype.
    """
    for cur in items:
        if "tumoronly-prioritization" in dd.get_tools_off(cur):
            return None
    if not vcfutils.get_paired_phenotype(items[0]):
        return None
    found_tumor = any(vcfutils.get_paired_phenotype(cur) == "tumor" for cur in items)
    found_normal = any(vcfutils.get_paired_phenotype(cur) == "normal" for cur in items)
    return found_tumor and not found_normal
Exemple #16
0
def _freebayes_custom(in_file, ref_file, data):
    """Apply bcbio.variation's custom FreeBayes filter, tuned to human NA12878 results.

    Experimental: for testing new methods.

    Returns the path of the filtered file (``<base>-filter<ext>``), or None
    when the sample has a paired phenotype or the installed bcbio_variation
    version is older than 0.1.1.
    """
    if vcfutils.get_paired_phenotype(data):
        return None
    config = data["config"]
    installed = programs.get_version("bcbio_variation", config=config)
    if LooseVersion(installed) < LooseVersion("0.1.1"):
        return None

    base, ext = os.path.splitext(in_file)
    out_file = "%s-filter%s" % (base, ext)
    if utils.file_exists(out_file):
        return out_file

    tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(in_file), "tmp"))
    jar_dir = config_utils.get_program("bcbio_variation", config, "dir")
    bv_jar = config_utils.get_jar("bcbio.variation", jar_dir)
    resources = config_utils.get_resources("bcbio_variation", config)
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
    cmd = ["java"] + jvm_opts + [
        "-Djava.io.tmpdir=%s" % tmp_dir,
        "-jar", bv_jar, "variant-filter", "freebayes", in_file, ref_file,
    ]
    do.run(cmd, "Custom FreeBayes filtering using bcbio.variation")
    return out_file
Exemple #17
0
def _do_high_depth_filter(data):
    """Report whether high depth filtering applies: germline, non-regional calls only.

    True when the configured ``coverage_interval`` is ``genome``
    (case-insensitive) and the sample has no paired phenotype.
    """
    interval = tz.get_in(["config", "algorithm", "coverage_interval"], data, "")
    whole_genome = interval.lower() == "genome"
    paired = vcfutils.get_paired_phenotype(data)
    if not whole_genome:
        return False
    return not paired
Exemple #18
0
def _snpeff_args_from_config(data):
    """Build the snpEff command line arguments implied by the input configuration.

    Always starts with ``-hgvs``, then adds user-supplied resource options,
    ``-cancer`` for samples with a paired phenotype, and a canonical
    transcript selection (``-canonList <file>`` or ``-canon``) depending on
    ``effects_transcripts`` and ``clinical_reporting``.

    Raises ValueError when the ``canonical_cancer`` transcript list file
    cannot be found.
    """
    config = data["config"]
    args = ["-hgvs"]

    # General supplied arguments
    resources = config_utils.get_resources("snpeff", config)
    user_options = resources.get("options")
    if user_options:
        args.extend(str(opt) for opt in user_options)

    # Cancer specific calling arguments
    if vcfutils.get_paired_phenotype(data):
        args.append("-cancer")

    transcripts = dd.get_effects_transcripts(data)
    if transcripts in {"canonical_cancer"}:
        _, snpeff_base_dir = get_db(data)
        canon_list_file = os.path.join(snpeff_base_dir, "transcripts", "%s.txt" % transcripts)
        if not utils.file_exists(canon_list_file):
            raise ValueError("Cannot find expected file for effects_transcripts: %s" % canon_list_file)
        args.extend(["-canonList", canon_list_file])
    elif (transcripts == "canonical"
          or tz.get_in(("config", "algorithm", "clinical_reporting"), data)):
        args.append("-canon")
    return args
Exemple #19
0
def _split_cnv(items, calls_fpath, read_mapping_file, coverage_file):
    """Attach per-sample seq2c results to each item's ``sv`` list.

    For every item that is not a paired normal, the rows of ``calls_fpath``
    whose first tab-separated column equals the sample name are written (with
    the header line) to ``<sample>-calls.tsv`` in the item's SV work
    directory, and the call, VCF and shared file paths are recorded. Normal
    samples only get the ``variantcaller`` and ``coverage`` entries.

    Returns the same items, each with one seq2c entry appended to ``sv``.
    """
    out_items = []
    for item in items:
        cur_sv = {"variantcaller": "seq2c"}
        cur_sv["coverage"] = tz.get_in(["depth", "bins", "seq2c"], item)
        if get_paired_phenotype(item) != "normal":
            sample_name = dd.get_sample_name(item)
            work_dir = _sv_workdir(item)
            out_fname = os.path.join(work_dir, sample_name + '-calls.tsv')
            if not utils.file_exists(out_fname):
                with file_transaction(item, out_fname) as tx:
                    with open(tx, "w") as out_handle:
                        with open(calls_fpath) as in_handle:
                            # Header first, then only this sample's rows.
                            out_handle.write(next(in_handle))
                            for row in in_handle:
                                if row.split("\t")[0] == sample_name:
                                    out_handle.write(row)
            cur_sv["calls"] = out_fname
            cur_sv["vrn_file"] = to_vcf(out_fname, item)
            cur_sv["read_mapping"] = read_mapping_file
            cur_sv["calls_all"] = calls_fpath
            cur_sv["coverage_all"] = coverage_file
        item.setdefault("sv", [])
        existing_callers = [sv["variantcaller"] for sv in item["sv"]]
        assert "seq2c" not in existing_callers, \
            "Do not expect existing seq2c variant output: %s" % (dd.get_sample_name(item))
        item["sv"].append(cur_sv)
        out_items.append(item)
    return out_items
Exemple #20
0
def _split_cnv(items, calls_fpath, read_mapping_file, coverage_file):
    """Record seq2c output on each item, splitting combined calls per sample.

    Items that are not paired normals get a ``<sample>-calls.tsv`` file
    holding the header of ``calls_fpath`` plus the rows whose first column is
    the sample name, along with the derived VCF and the shared
    mapping/calls/coverage files. Each item gains one seq2c entry in ``sv``.
    """
    def _write_sample_calls(item, sample_name, out_fname):
        # Copy the header line and this sample's rows into its own file.
        with file_transaction(item, out_fname) as tx:
            with open(tx, "w") as out, open(calls_fpath) as inp:
                out.write(next(inp))
                out.writelines(line for line in inp
                               if line.split("\t")[0] == sample_name)

    out_items = []
    for item in items:
        seq2c_info = {"variantcaller": "seq2c",
                      "coverage": tz.get_in(["depth", "bins", "seq2c"], item)}
        is_normal = get_paired_phenotype(item) == "normal"
        if not is_normal:
            sample_name = dd.get_sample_name(item)
            work_dir = _sv_workdir(item)
            out_fname = os.path.join(work_dir, sample_name + '-calls.tsv')
            if not utils.file_exists(out_fname):
                _write_sample_calls(item, sample_name, out_fname)
            seq2c_info.update({"calls": out_fname,
                               "vrn_file": to_vcf(out_fname, item),
                               "read_mapping": read_mapping_file,
                               "calls_all": calls_fpath,
                               "coverage_all": coverage_file})
        if "sv" not in item:
            item["sv"] = []
        prior_callers = [prev["variantcaller"] for prev in item["sv"]]
        assert "seq2c" not in prior_callers, \
            "Do not expect existing seq2c variant output: %s" % (dd.get_sample_name(item))
        item["sv"].append(seq2c_info)
        out_items.append(item)
    return out_items
Exemple #21
0
def filter_vcf_by_sex(vcf_file, data):
    """Post-filter a single sample VCF, adjusting ploidy on a per-line basis.

    Handles sex chromosomes and mitochondrial. Does not try to resolve called
    hets into potential homozygotes when converting diploid to haploid.

    Cancer samples (those with a paired phenotype) are skipped and the input
    file is returned unchanged; since these will be pooled they need special
    functionality.

    Returns the ``-ploidyfix`` output file. The output is written as plain
    text and then bgzipped and indexed when the target name ends in ``.gz``.
    """
    if vcfutils.get_paired_phenotype(data):
        return vcf_file
    _, sexes = _configured_ploidy_sex([data])
    sex = sexes.pop()
    final_file = "%s-ploidyfix%s" % utils.splitext_plus(vcf_file)
    if utils.file_exists(final_file):
        return final_file

    # Write uncompressed first; compress afterwards if needed.
    plain_file = final_file.replace(".vcf.gz", ".vcf")
    with file_transaction(plain_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            with utils.open_gzipsafe(vcf_file) as in_handle:
                for line in in_handle:
                    if not line.startswith("#"):
                        # Non-header lines may be dropped by the ploidy fix.
                        line = _fix_line_ploidy(line, sex)
                        if not line:
                            continue
                    out_handle.write(line)
    if final_file.endswith(".gz"):
        return vcfutils.bgzip_and_index(plain_file, data["config"])
    return plain_file
Exemple #22
0
def filter_vcf_by_sex(vcf_file, data):
    """Post-filter a single sample VCF, adjusting ploidy on a per-line basis.

    Handles sex chromosomes and mitochondrial. Does not try to resolve called
    hets into potential homozygotes when converting diploid to haploid.

    Cancer samples (those with a paired phenotype) are skipped and the input
    file is returned unchanged; since these will be pooled they need special
    functionality.

    Returns the path of the ``-ploidyfix`` output file.
    """
    if vcfutils.get_paired_phenotype(data):
        return vcf_file
    _, sexes = _configured_ploidy_sex([data])
    sex = sexes.pop()
    base, ext = os.path.splitext(vcf_file)
    out_file = "%s-ploidyfix%s" % (base, ext)
    if utils.file_exists(out_file):
        return out_file
    with file_transaction(out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle, open(vcf_file) as in_handle:
            for line in in_handle:
                if not line.startswith("#"):
                    # Non-header lines may be dropped by the ploidy fix.
                    line = _fix_line_ploidy(line, sex)
                    if not line:
                        continue
                out_handle.write(line)
    return out_file
Exemple #23
0
def _prep_load_script(work_bams, names, chrom, items):
    """Build the load script for the given BAMs, choosing paired or population mode.

    The paired loader is used when there are exactly two items and the first
    has a paired phenotype; otherwise the population loader is used. The
    ``pairmode`` passed on is ``"paired"`` or ``"unpaired"`` depending on
    whether the first BAM contains paired reads.
    """
    # A leftover debugging ``print`` statement (Python 2-only syntax) was
    # removed here.
    pairmode = "paired" if bam.is_paired(work_bams[0]) else "unpaired"
    if len(items) == 2 and vcfutils.get_paired_phenotype(items[0]):
        load_script = _paired_load_script
    else:
        load_script = _population_load_script
    return load_script(work_bams, names, chrom, pairmode, items)
Exemple #24
0
def _prep_load_script(work_bams, names, chrom, items):
    """Build the load script for the given BAMs, choosing paired or population mode.

    A falsy ``chrom`` is normalized to an empty string. The paired loader is
    used for exactly two items whose first item has a paired phenotype; the
    population loader otherwise.
    """
    chrom = chrom or ""
    if bam.is_paired(work_bams[0]):
        pairmode = "paired"
    else:
        pairmode = "unpaired"
    tumor_normal = len(items) == 2 and vcfutils.get_paired_phenotype(items[0])
    load_script = _paired_load_script if tumor_normal else _population_load_script
    return load_script(work_bams, names, chrom, pairmode, items)
Exemple #25
0
def _prep_load_script(work_bams, names, chrom, items):
    """Dispatch to the paired or population load script builder.

    ``chrom`` defaults to an empty string when falsy. Read pairing of the
    first BAM decides the ``pairmode`` argument; two items with a paired
    phenotype on the first select the paired loader.
    """
    if not chrom:
        chrom = ""
    reads_paired = bam.is_paired(work_bams[0])
    pairmode = "paired" if reads_paired else "unpaired"
    if len(items) != 2 or not vcfutils.get_paired_phenotype(items[0]):
        return _population_load_script(work_bams, names, chrom, pairmode, items)
    return _paired_load_script(work_bams, names, chrom, pairmode, items)
Exemple #26
0
def _prep_load_script(work_bams, names, chrom, items):
    """Dispatch to the paired or population load script builder.

    Read pairing of the first BAM decides the ``pairmode`` argument
    (``"paired"`` or ``"unpaired"``). Exactly two items with a paired
    phenotype on the first select the paired loader; anything else uses the
    population loader.
    """
    # The debugging ``print`` statement that used to sit here was dropped: it
    # polluted stdout and is a syntax error under Python 3.
    pairmode = "paired" if bam.is_paired(work_bams[0]) else "unpaired"
    if len(items) == 2 and vcfutils.get_paired_phenotype(items[0]):
        load_script = _paired_load_script
    else:
        load_script = _population_load_script
    return load_script(work_bams, names, chrom, pairmode, items)
def extract(data, items, out_dir=None):
    """Extract germline calls for the given sample, if tumor only.

    When ``data`` has a paired phenotype and ``items`` holds a single sample,
    the de-prioritized VCF is bgzipped, indexed and stored under
    ``data["vrn_file_plus"]["germline"]``. ``data`` is returned in all cases.
    """
    tumor_only = vcfutils.get_paired_phenotype(data) and len(items) == 1
    if not tumor_only:
        return data
    unprioritized = _remove_prioritization(data["vrn_file"], data, out_dir)
    data["vrn_file_plus"] = {
        "germline": vcfutils.bgzip_and_index(unprioritized, data["config"])}
    return data
Exemple #28
0
def _cnvkit_by_type(items, background):
    """Pick the CNVkit runner matching the kind of input.

    A single sample overall uses the single-sample runner, a paired
    phenotype on the first item uses the cancer runner, and everything else
    uses the population runner.
    """
    all_samples = items + background
    if len(all_samples) == 1:
        return _run_cnvkit_single(items[0])
    if vcfutils.get_paired_phenotype(items[0]):
        return _run_cnvkit_cancer(items, background)
    return _run_cnvkit_population(items, background)
Exemple #29
0
def extract(data, items, out_dir=None):
    """Extract germline calls for the given sample, if tumor only.

    Applies only to a sample with a paired phenotype that is alone in
    ``items``; the resulting bgzipped, indexed VCF is attached as
    ``data["vrn_file_plus"]["germline"]``. Always returns ``data``.
    """
    if not vcfutils.get_paired_phenotype(data):
        return data
    if len(items) != 1:
        return data
    germline_vcf = vcfutils.bgzip_and_index(
        _remove_prioritization(data["vrn_file"], data, out_dir),
        data["config"])
    data["vrn_file_plus"] = {"germline": germline_vcf}
    return data
Exemple #30
0
def _cnvkit_by_type(items, background):
    """Route to the CNVkit implementation that fits the input type.

    One sample in total goes to the single-sample run; a paired phenotype on
    the first item goes to the cancer run; otherwise the population run.
    """
    is_single = len(items + background) == 1
    if is_single:
        return _run_cnvkit_single(items[0])
    is_cancer = vcfutils.get_paired_phenotype(items[0])
    runner = _run_cnvkit_cancer if is_cancer else _run_cnvkit_population
    return runner(items, background)
Exemple #31
0
def split_variants_by_sample(data):
    """Turn a multi-sample call file into inputs for the individual samples.

    ``data["group_orig"]`` holds ``(sample, sample_vrn_file)`` pairs.

    For tumor/normal paired analyses the combined file is linked to the tumor
    sample instead of being split; non-tumor samples lose their ``vrn_file``
    and record their file name under ``vrn_file-shared``. For population or
    single-sample runs each sample's calls are extracted from a multi-sample
    file or symlinked to a single-sample one.

    Returns a flat list of sample dicts.
    Raises ValueError when a paired group contains no tumor sample.
    """
    config = data["config"]
    vrn_file = data["vrn_file"]
    out = []

    if vcfutils.get_paired_phenotype(data):
        grouped = data["group_orig"]
        # Trailing normals do not need any processing.
        if len(grouped) == 1 and vcfutils.get_paired_phenotype(grouped[0][0]) == "normal":
            sample, sample_vrn = grouped[0]
            sample.pop("vrn_file", None)
            sample["vrn_file-shared"] = sample_vrn
            out.append(sample)
            return out

        found_tumor = False
        for sample, sample_vrn in grouped:
            if vcfutils.get_paired_phenotype(sample) == "tumor":
                found_tumor = True
                if not os.path.exists(sample_vrn):
                    utils.symlink_plus(vrn_file, sample_vrn)
                sample["vrn_file"] = sample_vrn
            else:
                sample.pop("vrn_file", None)
                sample["vrn_file-shared"] = sample_vrn
            out.append(sample)
        if not found_tumor:
            raise ValueError("Did not find tumor sample in paired analysis")
        return out

    # Population or single sample.
    for sample, sample_vrn in data["group_orig"]:
        if len(vcfutils.get_samples(vrn_file)) > 1:
            vcfutils.select_sample(vrn_file, sample["name"][-1], sample_vrn, config)
        elif not os.path.exists(sample_vrn):
            utils.symlink_plus(vrn_file, sample_vrn)
        if not sample_vrn:
            continue
        sample["vrn_file"] = sample_vrn
        out.append(sample)
    return out
Exemple #32
0
def split_variants_by_sample(data):
    """Turn a multi-sample call file into inputs for the individual samples.

    ``data["group_orig"]`` holds ``(sample, sample_vrn_file)`` pairs.

    For tumor/normal paired analyses the combined file is linked to the tumor
    sample instead of being split, and ``vrn_file`` is removed from the other
    samples. For population or single-sample runs each sample's calls are
    selected out of a multi-sample file or symlinked to a single-sample one.

    Returns a flat list of sample dicts.
    Raises ValueError when a paired group contains no tumor sample.
    """
    config = data["config"]
    vrn_file = data["vrn_file"]
    out = []

    if vcfutils.get_paired_phenotype(data):
        grouped = data["group_orig"]
        # Trailing normals do not need any processing.
        if len(grouped) == 1 and vcfutils.get_paired_phenotype(grouped[0][0]) == "normal":
            lone_normal = grouped[0][0]
            lone_normal.pop("vrn_file", None)
            out.append(lone_normal)
            return out

        found_tumor = False
        for sample, sample_vrn in grouped:
            if vcfutils.get_paired_phenotype(sample) == "tumor":
                found_tumor = True
                if not os.path.exists(sample_vrn):
                    utils.symlink_plus(vrn_file, sample_vrn)
                sample["vrn_file"] = sample_vrn
            else:
                sample.pop("vrn_file", None)
            out.append(sample)
        if not found_tumor:
            raise ValueError("Did not find tumor sample in paired analysis")
        return out

    # Population or single sample.
    for sample, sample_vrn in data["group_orig"]:
        if is_multisample(vrn_file):
            select_sample_from_vcf(vrn_file, sample["name"][-1], sample_vrn,
                                   data["sam_ref"], config)
        elif not os.path.exists(sample_vrn):
            utils.symlink_plus(vrn_file, sample_vrn)
        if not sample_vrn:
            continue
        sample["vrn_file"] = sample_vrn
        out.append(sample)
    return out
Exemple #33
0
def _cnvkit_by_type(items, background):
    """Pick the CNVkit runner matching the kind of input.

    An access file is created up front from the first item's reference and
    SV work directory and handed to whichever runner is selected: single
    sample, cancer (paired phenotype on the first item) or population.
    """
    first = items[0]
    access_file = _create_access_file(dd.get_ref_file(first), _sv_workdir(first), first)
    all_samples = items + background
    if len(all_samples) == 1:
        return _run_cnvkit_single(items[0], access_file)
    if vcfutils.get_paired_phenotype(items[0]):
        return _run_cnvkit_cancer(items, background, access_file)
    return _run_cnvkit_population(items, background, access_file)
Exemple #34
0
def _cnvkit_by_type(items, background):
    """Route the inputs to the matching CNVkit runner.

    The access file is prepared from the first item. One sample in total
    (items plus background) means a single-sample run; a truthy paired
    phenotype on the first item means a cancer run; otherwise the samples
    are handled as a population.
    """
    first = items[0]
    ref_file = dd.get_ref_file(first)
    access_file = _create_access_file(ref_file, _sv_workdir(first), first)
    n_samples = len(items + background)
    if n_samples == 1:
        return _run_cnvkit_single(first, access_file)
    if vcfutils.get_paired_phenotype(first):
        return _run_cnvkit_cancer(items, background, access_file)
    return _run_cnvkit_population(items, background, access_file)
Exemple #35
0
def run(bam_file, data, out_dir):
    """Run viral QC analysis on the unmapped reads of a sample.

    Only runs when the sample has a paired phenotype and the "gdc-viral"
    reference is among the configured viral files and exists:
       1. Extract the unmapped reads
       2. BWA-MEM to the viral sequences from GDC database https://gdc.cancer.gov/about-data/data-harmonization-and-generation/gdc-reference-files
       3. Write a per-virus table (size, depth, fraction at 1x/5x/25x),
          sorted in reverse numeric order on the 5x column

    Returns a dictionary with the report as "base" and an empty "secondary"
    list, or an empty dictionary when the analysis does not apply.
    """
    source_link = 'https://gdc.cancer.gov/about-data/data-harmonization-and-generation/gdc-reference-files'
    viral_target = "gdc-viral"
    out = {}
    if not vcfutils.get_paired_phenotype(data):
        return out
    viral_refs = [x for x in dd.get_viral_files(data)
                  if os.path.basename(x) == "%s.fa" % viral_target]
    if not viral_refs or not utils.file_exists(viral_refs[0]):
        return out

    viral_ref = viral_refs[0]
    work_dir = utils.safe_makedir(out_dir)
    sample_name = dd.get_sample_name(data)
    ref_base = utils.splitext_plus(os.path.basename(viral_ref))[0]
    viral_bam = os.path.join(work_dir, "%s-%s.bam" % (sample_name, ref_base))
    out_file = "%s-completeness.txt" % utils.splitext_plus(viral_bam)[0]
    cores = dd.get_num_cores(data)

    if not utils.file_uptodate(out_file, bam_file):
        # Step 1 + 2: realign unmapped reads unless the viral BAM is current
        if not utils.file_uptodate(viral_bam, bam_file):
            with file_transaction(data, viral_bam) as tx_out_file:
                tmpfile = "%s-tmp" % utils.splitext_plus(tx_out_file)[0]
                align_cmd = (
                    "samtools view -u -f 4 {bam_file} | "
                    "bamtofastq collate=0 | "
                    "bwa mem -t {cores} {viral_ref} - | "
                    "bamsort tmpfile={tmpfile} inputthreads={cores} outputthreads={cores} "
                    "inputformat=sam index=1 indexfilename={tx_out_file}.bai O={tx_out_file}"
                )
                do.run(align_cmd.format(bam_file=bam_file, cores=cores, viral_ref=viral_ref,
                                        tmpfile=tmpfile, tx_out_file=tx_out_file),
                       "Align unmapped reads to viral genome")
        # Step 3: coverage summary per viral sequence
        with file_transaction(data, out_file) as tx_out_file:
            mosdepth_prefix = os.path.splitext(viral_bam)[0]
            coverage_cmd = (
                "mosdepth -t {cores} {mosdepth_prefix} {viral_bam} -n --thresholds 1,5,25 --by "
                "<(awk 'BEGIN {{FS=\"\\t\"}}; {{print $1 FS \"0\" FS $2}}' {viral_ref}.fai) && "
                "echo '## Viral sequences (from {source_link}) found in unmapped reads' > {tx_out_file} &&"
                "echo '## Sample: {sample_name}' >> {tx_out_file} && "
                "echo '#virus\tsize\tdepth\t1x\t5x\t25x' >> {tx_out_file} && "
                "paste <(zcat {mosdepth_prefix}.regions.bed.gz) <(zgrep -v ^# {mosdepth_prefix}.thresholds.bed.gz) | "
                "awk 'BEGIN {{FS=\"\\t\"}} {{ print $1 FS $3 FS $4 FS $10/$3 FS $11/$3 FS $12/$3}}' | "
                "sort -n -r -k 5,5 >> {tx_out_file}")
            do.run(coverage_cmd.format(cores=cores, mosdepth_prefix=mosdepth_prefix,
                                       viral_bam=viral_bam, viral_ref=viral_ref,
                                       source_link=source_link, tx_out_file=tx_out_file,
                                       sample_name=sample_name),
                   "Analyse coverage of viral genomes")
    out["base"] = out_file
    out["secondary"] = []
    return out
Exemple #36
0
def get_qc_tools(data):
    """Build the list of QC tools to run for a sample.

    A QC list already set in the algorithm configuration is returned as is.
    Otherwise tools are selected from the analysis type and the
    tools_on/tools_off settings; anything named in tools_off is removed and
    the remaining names are returned sorted.
    """
    preset = dd.get_algorithm_qc(data)
    if preset:
        return preset
    analysis = data["analysis"].lower()
    tools_off = dd.get_tools_off(data)
    tools_on = dd.get_tools_on(data)
    coverage_enabled = "coverage_qc" not in tools_off

    to_run = []
    if tz.get_in(["config", "algorithm", "kraken"], data):
        to_run.append("kraken")
    if "fastqc" not in tools_off:
        to_run.append("fastqc")
    if "qualimap" in tools_on or "qualimap_full" in tools_on:
        to_run.append("qualimap")
    is_rna = analysis.startswith("rna-seq") or analysis == "smallrna-seq"
    if is_rna and "qualimap" not in tools_off:
        if gtf.is_qualimap_compatible(dd.get_gtf_file(data)):
            to_run.append("qualimap_rnaseq")
        else:
            logger.debug("GTF not compatible with Qualimap, skipping.")
    if analysis.startswith("chip-seq"):
        to_run.append("chipqc")
        if dd.get_chip_method(data) == "atac":
            to_run.append("ataqv")
    if analysis.startswith("smallrna-seq"):
        to_run.extend(["small-rna", "atropos"])
    if coverage_enabled:
        to_run.append("samtools")
    if dd.has_variantcalls(data):
        if coverage_enabled:
            to_run.extend(["coverage", "picard"])
        to_run.extend(["qsignature", "variants"])
        if vcfanno.is_human(data):
            to_run.append("peddy")
            if "contamination" not in tools_off:
                to_run.append("contamination")
        if vcfutils.get_paired_phenotype(data) and "viral" not in tools_off:
            to_run.append("viral")
        if damage.should_filter([data]):
            to_run.append("damage")
    if dd.get_umi_consensus(data):
        to_run.append("umi")
    if tz.get_in(["config", "algorithm", "preseq"], data):
        to_run.append("preseq")
    # Final pass: tools_off always wins over anything added above
    return sorted(tool for tool in to_run if tool not in tools_off)
Exemple #37
0
def _pick_lead_item(items):
    """Choose the one sample in a batch that calls get attached to.

    Paired analyses use the item whose paired phenotype is "tumor"; all
    other batches use the first item.

    Raises ValueError when a paired analysis contains no tumor item.
    """
    align_bams = [x["align_bam"] for x in items]
    if not vcfutils.is_paired_analysis(align_bams, items):
        return items[0]
    for candidate in items:
        if vcfutils.get_paired_phenotype(candidate) == "tumor":
            return candidate
    raise ValueError("Did not find tumor sample in paired tumor/normal calling")
Exemple #38
0
def extract(data, items):
    """Attach germline calls to a sample that has a paired phenotype.

    With batches set and more than one item the germline calls are extracted
    from the variant file; otherwise prioritization is removed from it. The
    bgzipped and indexed result is stored as ``data["vrn_file_plus"]`` under
    "germline". Samples without a paired phenotype are returned untouched.
    """
    if not vcfutils.get_paired_phenotype(data):
        return data
    batched_pair = dd.get_batches(data) and len(items) > 1
    prepare = _extract_germline if batched_pair else _remove_prioritization
    raw_vcf = prepare(data["vrn_file"], data)
    data["vrn_file_plus"] = {"germline": vcfutils.bgzip_and_index(raw_vcf, data["config"])}
    return data
Exemple #39
0
def _pick_lead_item(items):
    """Return the representative sample of a batch for attaching calls.

    In a paired analysis this is the item with a "tumor" paired phenotype;
    in every other case it is simply the first item.

    Raises ValueError if a paired analysis has no tumor item.
    """
    bam_files = [x["align_bam"] for x in items]
    if not vcfutils.is_paired_analysis(bam_files, items):
        return items[0]
    for sample in items:
        if vcfutils.get_paired_phenotype(sample) == "tumor":
            return sample
    raise ValueError("Did not find tumor sample in paired tumor/normal calling")
Exemple #40
0
def extract(data, items):
    """Add germline calls for a sample with a paired phenotype.

    Batched samples with more than one item get germline calls extracted
    from the variant file; any other paired-phenotype sample has the
    prioritization removed instead. The output is bgzipped, indexed and
    stored in ``data["vrn_file_plus"]["germline"]``. Other samples pass
    through unchanged.
    """
    if not vcfutils.get_paired_phenotype(data):
        return data
    if dd.get_batches(data) and len(items) > 1:
        plain_vcf = _extract_germline(data["vrn_file"], data)
    else:
        plain_vcf = _remove_prioritization(data["vrn_file"], data)
    packed_vcf = vcfutils.bgzip_and_index(plain_vcf, data["config"])
    data["vrn_file_plus"] = {"germline": packed_vcf}
    return data
Exemple #41
0
def _get_ensemble_bed_files(items):
    """
    get all ensemble structural BED file calls, skipping any normal samples from
    tumor/normal calls
    """
    bed_files = []
    for data in items:
        for sv in data.get("sv", []):
            if sv["variantcaller"] == "sv-ensemble":
                if ("vrn_file" in sv and not vcfutils.get_paired_phenotype(data) == "normal"
                      and file_exists(sv["vrn_file"])):
                    bed_files.append(sv["vrn_file"])
    return bed_files
Exemple #42
0
def _split_cnv(items, calls_fpath):
    for item in items:
        if get_paired_phenotype(item) == "normal":
            continue

        sample_name = dd.get_sample_name(item)
        work_dir = _sv_workdir(item)
        out_fname = os.path.join(work_dir, sample_name + '-calls.tsv')
        if not utils.file_exists(out_fname):
            with file_transaction(item, out_fname) as tx:
                with open(tx, "w") as out, open(calls_fpath) as inp:
                    out.write(next(inp))
                    for l in inp:
                        if l.split("\t")[0] == sample_name:
                            out.write(l)
        item["sv"][0]["calls"] = out_fname
Exemple #43
0
def _snpeff_args_from_config(data):
    """Assemble snpEff command line arguments from the sample configuration.

    Combines, in order: user supplied "options" from the snpeff resources,
    "-cancer" for samples with a paired phenotype, and "-canon"/"-hgvs" when
    clinical_reporting is enabled in the algorithm section.
    """
    config = data["config"]
    user_options = config_utils.get_resources("snpeff", config).get("options")
    args = [str(opt) for opt in user_options] if user_options else []
    if vcfutils.get_paired_phenotype(data):
        args.append("-cancer")
    if config["algorithm"].get("clinical_reporting"):
        args.extend(["-canon", "-hgvs"])
    return args
def get_qc_tools(data):
    """Build the list of QC tools to run for a sample.

    A QC list already set in the algorithm configuration is returned as is.
    Otherwise tools are selected from the analysis type and the
    tools_on/tools_off settings; anything named in tools_off is removed and
    the remaining names are returned sorted.
    """
    preset = dd.get_algorithm_qc(data)
    if preset:
        return preset
    analysis = data["analysis"].lower()
    tools_off = dd.get_tools_off(data)
    tools_on = dd.get_tools_on(data)
    coverage_enabled = "coverage_qc" not in tools_off

    to_run = []
    if tz.get_in(["config", "algorithm", "kraken"], data):
        to_run.append("kraken")
    if "fastqc" not in tools_off:
        to_run.append("fastqc")
    if "qualimap" in tools_on or "qualimap_full" in tools_on:
        to_run.append("qualimap")
    is_rna = analysis.startswith("rna-seq") or analysis == "smallrna-seq"
    if is_rna and "qualimap" not in tools_off:
        if gtf.is_qualimap_compatible(dd.get_gtf_file(data)):
            to_run.append("qualimap_rnaseq")
        else:
            logger.debug("GTF not compatible with Qualimap, skipping.")
    if analysis.startswith("chip-seq"):
        to_run.append("chipqc")
    if analysis.startswith("smallrna-seq"):
        to_run.extend(["small-rna", "atropos"])
    if coverage_enabled:
        to_run.append("samtools")
    if analysis.startswith(("standard", "variant", "variant2")):
        if coverage_enabled:
            to_run.extend(["coverage", "picard"])
        to_run.extend(["qsignature", "variants"])
        if vcfanno.is_human(data):
            to_run.extend(["contamination", "peddy"])
        if vcfutils.get_paired_phenotype(data):
            to_run.append("viral")
        if damage.should_filter([data]):
            to_run.append("damage")
    if dd.get_umi_consensus(data):
        to_run.append("umi")
    if tz.get_in(["config", "algorithm", "preseq"], data):
        to_run.append("preseq")
    # Final pass: tools_off always wins over anything added above
    return sorted(tool for tool in to_run if tool not in tools_off)
Exemple #45
0
def extract(data, items):
    """Prepare germline outputs for a sample.

    For a sample with a paired phenotype that has batches and is the only
    item, prioritization is removed from the variant file and the bgzipped,
    indexed result is stored in ``data["vrn_file_plus"]["germline"]``.

    For a sample whose phenotype is "germline", the VCF sample name is fixed
    when the sample name ends in "-germline" and the VCF holds exactly one
    sample matching the name with "-germline" removed.
    """
    if vcfutils.get_paired_phenotype(data):
        if dd.get_batches(data) and len(items) == 1:
            plain_vcf = _remove_prioritization(data["vrn_file"], data)
            packed_vcf = vcfutils.bgzip_and_index(plain_vcf, data["config"])
            data["vrn_file_plus"] = {"germline": packed_vcf}
        return data
    if dd.get_phenotype(data) != "germline":
        return data
    sample_name = dd.get_sample_name(data)
    vcf_samples = vcfutils.get_samples(data["vrn_file"])
    needs_rename = (sample_name.endswith("-germline")
                    and len(vcf_samples) == 1
                    and sample_name.replace("-germline", "") == vcf_samples[0])
    if needs_rename:
        data["vrn_file"] = fix_germline_samplename(data["vrn_file"], sample_name, data)
    return data
Exemple #46
0
def extract(data, items):
    """Prepare germline outputs for a sample.

    For a sample with a paired phenotype that has batches and is the only
    item, prioritization is removed from the variant file and the bgzipped,
    indexed result is stored in ``data["vrn_file_plus"]["germline"]``.

    For a sample whose phenotype is "germline", the VCF sample name is fixed
    when the sample name ends in "-germline" and the VCF holds exactly one
    sample matching the name with "-germline" removed.
    """
    if vcfutils.get_paired_phenotype(data):
        if dd.get_batches(data) and len(items) == 1:
            plain_vcf = _remove_prioritization(data["vrn_file"], data)
            packed_vcf = vcfutils.bgzip_and_index(plain_vcf, data["config"])
            data["vrn_file_plus"] = {"germline": packed_vcf}
        return data
    if dd.get_phenotype(data) != "germline":
        return data
    sample_name = dd.get_sample_name(data)
    vcf_samples = vcfutils.get_samples(data["vrn_file"])
    needs_rename = (sample_name.endswith("-germline")
                    and len(vcf_samples) == 1
                    and sample_name.replace("-germline", "") == vcf_samples[0])
    if needs_rename:
        data["vrn_file"] = _fix_germline_samplename(data["vrn_file"], sample_name, data)
    return data
Exemple #47
0
def _compatible_small_variants(data):
    """Retrieve small variant (SNP, indel) VCFs compatible with CNVkit.
    """
    supported = set(["vardict", "freebayes", "gatk-haplotype", "mutect2", "vardict"])
    out = []
    for v in data.get("variants", []):
        vrn_file = v.get("vrn_file")
        if vrn_file and v.get("variantcaller") in supported:
            base, ext = utils.splitext_plus(os.path.basename(vrn_file))
            if vcfutils.get_paired_phenotype(data):
                out.append(vrn_file)
            else:
                sample_vrn_file = os.path.join(dd.get_work_dir(data), v["variantcaller"],
                                               "%s-%s%s" % (base, dd.get_sample_name(data), ext))
                sample_vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data), sample_vrn_file,
                                                         data["config"])
                out.append(sample_vrn_file)
    return out
Exemple #48
0
def run(bam_file, data, out_dir):
    """Run viral QC analysis on the unmapped reads of a sample.

    Only runs when the sample has a paired phenotype and the "gdc-viral"
    reference is among the configured viral files and exists:
       1. Extract the unmapped reads
       2. BWA-MEM to the viral sequences from GDC database https://gdc.cancer.gov/about-data/data-harmonization-and-generation/gdc-reference-files
       3. Write a per-virus table (size, depth, fraction at 1x/5x/25x),
          sorted in reverse numeric order on the 5x column

    Returns a dictionary with the report as "base" and an empty "secondary"
    list, or an empty dictionary when the analysis does not apply.
    """
    source_link = 'https://gdc.cancer.gov/about-data/data-harmonization-and-generation/gdc-reference-files'
    viral_target = "gdc-viral"
    out = {}
    if not vcfutils.get_paired_phenotype(data):
        return out
    viral_refs = [x for x in dd.get_viral_files(data) if os.path.basename(x) == "%s.fa" % viral_target]
    if not (viral_refs and utils.file_exists(viral_refs[0])):
        return out

    viral_ref = viral_refs[0]
    target_dir = utils.safe_makedir(out_dir)
    sample_name = dd.get_sample_name(data)
    viral_base = utils.splitext_plus(os.path.basename(viral_ref))[0]
    viral_bam = os.path.join(target_dir, "%s-%s.bam" % (sample_name, viral_base))
    out_file = "%s-completeness.txt" % utils.splitext_plus(viral_bam)[0]
    cores = dd.get_num_cores(data)
    out["base"] = out_file
    out["secondary"] = []
    if utils.file_uptodate(out_file, bam_file):
        return out

    # Realign unmapped reads only when the viral BAM is older than the input
    if not utils.file_uptodate(viral_bam, bam_file):
        with file_transaction(data, viral_bam) as tx_out_file:
            tmpfile = "%s-tmp" % utils.splitext_plus(tx_out_file)[0]
            align_cmd = ("samtools view -u -f 4 {bam_file} | "
                         "bamtofastq collate=0 | "
                         "bwa mem -t {cores} {viral_ref} - | "
                         "bamsort tmpfile={tmpfile} inputthreads={cores} outputthreads={cores} "
                         "inputformat=sam index=1 indexfilename={tx_out_file}.bai O={tx_out_file}")
            do.run(align_cmd.format(bam_file=bam_file, cores=cores, viral_ref=viral_ref,
                                    tmpfile=tmpfile, tx_out_file=tx_out_file),
                   "Align unmapped reads to viral genome")
    # Summarise coverage of each viral sequence into the report
    with file_transaction(data, out_file) as tx_out_file:
        mosdepth_prefix = os.path.splitext(viral_bam)[0]
        coverage_cmd = ("mosdepth -t {cores} {mosdepth_prefix} {viral_bam} -n --thresholds 1,5,25 --by "
                        "<(awk 'BEGIN {{FS=\"\\t\"}}; {{print $1 FS \"0\" FS $2}}' {viral_ref}.fai) && "
                        "echo '## Viral sequences (from {source_link}) found in unmapped reads' > {tx_out_file} &&"
                        "echo '## Sample: {sample_name}' >> {tx_out_file} && "
                        "echo '#virus\tsize\tdepth\t1x\t5x\t25x' >> {tx_out_file} && "
                        "paste <(zcat {mosdepth_prefix}.regions.bed.gz) <(zgrep -v ^# {mosdepth_prefix}.thresholds.bed.gz) | "
                        "awk 'BEGIN {{FS=\"\\t\"}} {{ print $1 FS $3 FS $4 FS $10/$3 FS $11/$3 FS $12/$3}}' | "
                        "sort -n -r -k 5,5 >> {tx_out_file}")
        do.run(coverage_cmd.format(cores=cores, mosdepth_prefix=mosdepth_prefix,
                                   viral_bam=viral_bam, viral_ref=viral_ref,
                                   source_link=source_link, tx_out_file=tx_out_file,
                                   sample_name=sample_name),
               "Analyse coverage of viral genomes")
    return out
Exemple #49
0
def run(items):
    """Run cohort-wide CNV calling and split the calls back per sample.

    Steps, all using the SV work directory of the first sample:
    - Combine coverage of each region for each sample
    - Prepare read counts for each sample
    - Call CNVs for the cohort, passing the names of samples whose paired
      phenotype is "normal"
    - Split the cohort calls into per-sample files

    Returns the items, each converted to a single data dictionary.
    """
    items = [utils.to_single_data(x) for x in items]
    work_dir = _sv_workdir(items[0])

    coverage_file = _combine_coverages(items, work_dir)
    read_mapping_file = _calculate_mapping_reads(items, work_dir)

    normal_names = []
    for sample in items:
        if get_paired_phenotype(sample) == "normal":
            normal_names.append(dd.get_sample_name(sample))
    calls_file = _call_cnv(items, work_dir, read_mapping_file, coverage_file, normal_names)
    _split_cnv(items, calls_file)

    return items
Exemple #50
0
def _cnvkit_by_type(items, background, work_dir):
    """Run the CNVkit workflow matching the inputs and attach its output.

    One sample in total (items plus background) runs in single-sample mode,
    a truthy paired phenotype on the first item selects the cancer workflow,
    and anything else runs as a population. The same output dictionary,
    tagged with variantcaller "cnvkit", is appended to the "sv" list of every
    item.
    """
    lead = items[0]
    access_file = _create_access_file(dd.get_ref_file(lead), work_dir, lead)
    if len(items + background) == 1:
        ckout = _run_cnvkit_single(lead, access_file, work_dir)
    elif vcfutils.get_paired_phenotype(lead):
        ckout = _run_cnvkit_cancer(items, background, access_file, work_dir)
    else:
        ckout = _run_cnvkit_population(items, background, access_file, work_dir)
    ckout = _add_seg_to_output(ckout, items)
    ckout["variantcaller"] = "cnvkit"
    for data in items:
        data.setdefault("sv", []).append(ckout)
    return list(items)
Exemple #51
0
def _run_ensemble_intersection(batch_id, vrn_files, base_dir, edata):
    """Run intersection n out of x based ensemble method using bcbio.variation.recall.

    Writes ``<batch_id>-ensemble.vcf.gz`` into ``base_dir`` unless that file
    already exists. Filtered calls are excluded (``--nofiltered``) when the
    sample has a paired phenotype.

    Returns a dictionary with "variantcaller" set to "ensemble", the output
    VCF as "vrn_file" and "bed_file" set to None.
    """
    out_vcf_file = os.path.join(base_dir, "{0}-ensemble.vcf.gz".format(batch_id))
    if not utils.file_exists(out_vcf_file):
        num_pass = _get_num_pass(edata, len(vrn_files))
        cmd = [config_utils.get_program("bcbio-variation-recall", edata["config"]),
               "ensemble", "--cores=%s" % edata["config"]["algorithm"].get("num_cores", 1),
               "--numpass", str(num_pass)]
        # Remove filtered calls if we're dealing with tumor/normal calls
        if vcfutils.get_paired_phenotype(edata):
            cmd += ["--nofiltered"]
        cmd += [out_vcf_file, dd.get_ref_file(edata)] + vrn_files
        do.run(cmd, "Ensemble intersection calling: %s" % (batch_id))
    return {"variantcaller": "ensemble",
            "vrn_file": out_vcf_file,
            "bed_file": None}
Exemple #52
0
def run(bam_file, data, out_dir):
    """Run viral QC analysis, counting unmapped reads that align to viruses.

    Only runs when the sample has a paired phenotype and the "gdc-viral"
    reference is among the configured viral files and exists. Unmapped reads
    are aligned to that reference, then every contig with at least one
    aligned read is written with its count to ``<sample>-gdc-viral-counts.txt``
    style output next to the viral BAM.

    Returns a dictionary with the counts file as "base", or an empty
    dictionary when the analysis does not apply.
    """
    viral_target = "gdc-viral"
    out = {}
    if not vcfutils.get_paired_phenotype(data):
        return out
    viral_refs = [x for x in dd.get_viral_files(data)
                  if os.path.basename(x) == "%s.fa" % viral_target]
    if not viral_refs or not utils.file_exists(viral_refs[0]):
        return out

    viral_ref = viral_refs[0]
    target_dir = utils.safe_makedir(out_dir)
    bam_name = "%s-%s.bam" % (dd.get_sample_name(data),
                              utils.splitext_plus(os.path.basename(viral_ref))[0])
    viral_bam = os.path.join(target_dir, bam_name)
    out_file = "%s-counts.txt" % utils.splitext_plus(viral_bam)[0]
    out["base"] = out_file
    if utils.file_uptodate(out_file, bam_file):
        return out

    # Realign unmapped reads only when the viral BAM is older than the input
    if not utils.file_uptodate(viral_bam, bam_file):
        with file_transaction(data, viral_bam) as tx_out_file:
            cores = dd.get_num_cores(data)
            tmpfile = "%s-tmp" % utils.splitext_plus(tx_out_file)[0]
            align_cmd = (
                "samtools view -u -f 4 {bam_file} | "
                "bamtofastq collate=0 | "
                "bwa mem -t {cores} {viral_ref} - | "
                "bamsort tmpfile={tmpfile} inputthreads={cores} outputthreads={cores} "
                "inputformat=sam index=1 indexfilename={tx_out_file}.bai O={tx_out_file}"
            )
            do.run(align_cmd.format(bam_file=bam_file, cores=cores, viral_ref=viral_ref,
                                    tmpfile=tmpfile, tx_out_file=tx_out_file),
                   "Compare unmapped reads to viral genome")
    with file_transaction(data, out_file) as tx_counts_file:
        with open(tx_counts_file, "w") as out_handle:
            out_handle.write("# sample\t%s\n" % dd.get_sample_name(data))
            for info in bam.idxstats(viral_bam, data):
                if info.aligned > 0:
                    out_handle.write("%s\t%s\n" % (info.contig, info.aligned))
    return out
Exemple #53
0
def _check_variantcaller(item):
    """Ensure specified variantcaller is a valid choice.

    The algorithm "variantcaller" setting may be a single value, a list or
    tuple of values, or a dictionary of such values. Every value must be a
    known variant caller, None or False. When the setting is a dictionary
    with a "germline" or "somatic" key, the sample must have a paired
    phenotype.

    Raises:
        ValueError: on an unknown caller, or on germline/somatic calling
            without a paired phenotype.
    """
    # Build the set from the keys directly: on Python 3 dict.keys() is a view
    # and cannot be concatenated with a list.
    allowed = set(genotype.get_variantcallers().keys()) | {None, False}
    vcs = item["algorithm"].get("variantcaller")
    if not isinstance(vcs, dict):
        vcs = {"variantcaller": vcs}
    for vc_set in vcs.values():
        if not isinstance(vc_set, (tuple, list)):
            vc_set = [vc_set]
        problem = [x for x in vc_set if x not in allowed]
        if problem:
            # key=str: None, False and caller names are not mutually
            # orderable on Python 3, so sort on their string form.
            raise ValueError("Unexpected algorithm 'variantcaller' parameter: %s\n"
                             "Supported options: %s\n" % (problem, sorted(allowed, key=str)))
    # Ensure germline somatic calling only specified with tumor/normal samples
    if "germline" in vcs or "somatic" in vcs:
        paired = vcfutils.get_paired_phenotype(item)
        if not paired:
            raise ValueError("%s: somatic/germline calling in 'variantcaller' "
                             "but tumor/normal metadata phenotype not specified" % dd.get_sample_name(item))
Exemple #54
0
def _snpeff_args_from_config(data):
    """Assemble snpEff command line arguments from the sample configuration.

    Combines, in order: "-formatEff"/"-classic" for snpEff 4.1 or later
    (older EFF formatting instead of the combined ANN formatting, kept until
    downstream tools catch up), user supplied "options" from the snpeff
    resources, "-cancer" for samples with a paired phenotype, and
    "-canon"/"-hgvs" when clinical_reporting is enabled.
    """
    config = data["config"]
    args = []
    if not LooseVersion(snpeff_version()) < LooseVersion("4.1"):
        args.extend(["-formatEff", "-classic"])
    user_options = config_utils.get_resources("snpeff", config).get("options")
    if user_options:
        args.extend(str(opt) for opt in user_options)
    if vcfutils.get_paired_phenotype(data):
        args.append("-cancer")
    if config["algorithm"].get("clinical_reporting"):
        args.extend(["-canon", "-hgvs"])
    return args
Exemple #55
0
def _freebayes_custom(in_file, ref_file, data):
    """Filter FreeBayes calls with the bcbio.variation "variant-filter" command.

    Experimental: for testing new methods.

    Returns None for samples with a paired phenotype or when the installed
    bcbio_variation is older than 0.1.1. Otherwise returns the path of the
    filtered file, ``<in_file base>-filter<ext>``, running the filter only
    when that file does not exist yet.
    """
    if vcfutils.get_paired_phenotype(data):
        return None
    config = data["config"]
    installed = programs.get_version("bcbio_variation", config=config)
    if LooseVersion(installed) < LooseVersion("0.1.1"):
        return None
    base, ext = os.path.splitext(in_file)
    out_file = "%s-filter%s" % (base, ext)
    if utils.file_exists(out_file):
        return out_file
    tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(in_file), "tmp"))
    resources = config_utils.get_resources("bcbio_variation", config)
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
    cmd = ["bcbio-variation"] + jvm_opts + ["-Djava.io.tmpdir=%s" % tmp_dir]
    cmd = cmd + ["variant-filter", "freebayes", in_file, ref_file]
    do.run(cmd, "Custom FreeBayes filtering using bcbio.variation")
    return out_file