def split_variants_by_sample(data):
    """Split a multi-sample call file into inputs for individual samples.

    For tumor/normal paired analyses, do not split the final file and attach
    it to the tumor input.

    Returns a list of one-element lists. Input without a "group_orig" key is
    handed back untouched as ``[[data]]``; otherwise every item produced by
    ``get_orig_items(data)`` is updated in place and wrapped in its own list.
    """
    # not split, do nothing
    if "group_orig" not in data:
        return [[data]]
    out = []
    # cancer tumor/normal
    if (vcfutils.get_paired_phenotype(data)
            and "tumor" in [vcfutils.get_paired_phenotype(d) for d in get_orig_items(data)]):
        for sub_data in get_orig_items(data):
            if vcfutils.get_paired_phenotype(sub_data) == "tumor":
                cur_batch = tz.get_in(["metadata", "batch"], data)
                if cur_batch:
                    sub_data["metadata"]["batch"] = cur_batch
                # the unsplit call file travels with the tumor sample
                sub_data["vrn_file"] = data["vrn_file"]
            else:
                sub_data.pop("vrn_file", None)
            out.append([sub_data])
    # joint calling or population runs, do not split back up and keep in batches
    else:
        for sub_data in get_orig_items(data):
            cur_batch = tz.get_in(["metadata", "batch"], data)
            if cur_batch:
                sub_data["metadata"]["batch"] = cur_batch
            # NOTE(review): assumed to be set regardless of the batch check
            # above -- confirm the intended nesting.
            sub_data["vrn_file_batch"] = data["vrn_file"]
            sub_data["vrn_file"] = data["vrn_file"]
            out.append([sub_data])
    return out
def split_variants_by_sample(data):
    """Split a multi-sample call file into inputs for individual samples.

    For tumor/normal paired analyses, do not split the final file and attach
    it to the tumor input.

    Returns a list of one-element lists. Input without a "group_orig" key is
    handed back untouched as ``[[data]]``; otherwise each entry of
    ``data["group_orig"]`` is updated in place and wrapped in its own list.
    """
    # not split, do nothing
    if "group_orig" not in data:
        return [[data]]
    out = []
    # cancer tumor/normal
    if vcfutils.get_paired_phenotype(data):
        for sub_data in data["group_orig"]:
            if vcfutils.get_paired_phenotype(sub_data) == "tumor":
                sub_data["vrn_file"] = data["vrn_file"]
            out.append([sub_data])
    # population or single sample
    else:
        for sub_data in data["group_orig"]:
            sample_name = str(sub_data["name"][-1])
            # per-sample file name: swap the "<group>-" prefix for "<sample>-"
            sub_vrn_file = data["vrn_file"].replace(str(data["group"][0]) + "-", sample_name + "-")
            if len(vcfutils.get_samples(data["vrn_file"])) > 1:
                vcfutils.select_sample(data["vrn_file"], sample_name, sub_vrn_file, data["config"])
            elif not os.path.exists(sub_vrn_file):
                # single-sample call file: link instead of extracting
                utils.symlink_plus(data["vrn_file"], sub_vrn_file)
            sub_data["vrn_file"] = sub_vrn_file
            out.append([sub_data])
    return out
def split_variants_by_sample(data):
    """Split a multi-sample call file into inputs for individual samples.

    For tumor/normal paired analyses, do not split the final file and attach
    it to the tumor input.

    Returns a list of one-element lists. Input without a "group_orig" key, or
    with a configured jointcaller, is handed back untouched as ``[[data]]``;
    otherwise every item produced by ``get_orig_items(data)`` is updated in
    place and wrapped in its own list.
    """
    # not split, do nothing
    if "group_orig" not in data:
        return [[data]]
    # cancer tumor/normal
    if vcfutils.get_paired_phenotype(data):
        out = []
        for sub_data in get_orig_items(data):
            if vcfutils.get_paired_phenotype(sub_data) == "tumor":
                sub_data["vrn_file"] = data["vrn_file"]
            else:
                sub_data.pop("vrn_file", None)
            out.append([sub_data])
        return out
    # joint calling, do not split back up due to potentially large sample sizes
    if tz.get_in(("config", "algorithm", "jointcaller"), data):
        return [[data]]
    # population or single sample
    out = []
    for sub_data in get_orig_items(data):
        sample_name = str(sub_data["name"][-1])
        # per-sample file name: swap the "<group>-" prefix for "<sample>-"
        sub_vrn_file = data["vrn_file"].replace(str(data["group"][0]) + "-", sample_name + "-")
        if len(vcfutils.get_samples(data["vrn_file"])) > 1:
            vcfutils.select_sample(data["vrn_file"], sample_name, sub_vrn_file, data["config"])
        elif not os.path.exists(sub_vrn_file):
            utils.symlink_plus(data["vrn_file"], sub_vrn_file)
        # keep a handle on the unsplit batch file next to the per-sample one
        sub_data["vrn_file_batch"] = data["vrn_file"]
        sub_data["vrn_file"] = sub_vrn_file
        out.append([sub_data])
    return out
def split_variants_by_sample(data):
    """Split a multi-sample call file into inputs for individual samples.

    For tumor/normal paired analyses, do not split the final file and attach
    it to the tumor input.

    Returns a list of one-element lists. Input without a "group_orig" key is
    handed back untouched as ``[[data]]``; otherwise each entry of
    ``data["group_orig"]`` is updated in place (including a copy of
    ``data["combine"]`` when present) and wrapped in its own list.
    """
    # not split, do nothing
    if "group_orig" not in data:
        return [[data]]
    out = []
    # cancer tumor/normal
    if vcfutils.get_paired_phenotype(data):
        for sub_data in data["group_orig"]:
            if vcfutils.get_paired_phenotype(sub_data) == "tumor":
                if "combine" in data:
                    sub_data["combine"] = data["combine"]
                sub_data["vrn_file"] = data["vrn_file"]
            out.append([sub_data])
    # population or single sample
    else:
        for sub_data in data["group_orig"]:
            sample_name = sub_data["name"][-1]
            # per-sample file name: swap the "<group>-" prefix for "<sample>-"
            sub_vrn_file = data["vrn_file"].replace(data["group"][0] + "-", sample_name + "-")
            if len(vcfutils.get_samples(data["vrn_file"])) > 1:
                vcfutils.select_sample(data["vrn_file"], sample_name, sub_vrn_file, data["config"])
            elif not os.path.exists(sub_vrn_file):
                utils.symlink_plus(data["vrn_file"], sub_vrn_file)
            if "combine" in data:
                sub_data["combine"] = data["combine"]
            sub_data["vrn_file"] = sub_vrn_file
            out.append([sub_data])
    return out
def _do_prioritize(items):
    """Decide whether prioritization should run for this set of samples.

    Only tumor-only inputs qualify: the first item must carry a paired
    phenotype, at least one item must be "tumor" and none may be "normal".
    Returns None when the first item has no paired phenotype.
    """
    if not vcfutils.get_paired_phenotype(items[0]):
        return None
    phenotypes = [vcfutils.get_paired_phenotype(x) for x in items]
    return "tumor" in phenotypes and "normal" not in phenotypes
def _do_prioritize(data):
    """Decide whether prioritization should run for this sample.

    Only tumor-only inputs qualify. When the sample has a batch in its
    metadata the original batch items are inspected, otherwise just the
    sample itself. Returns None when the sample has no paired phenotype.
    """
    if not vcfutils.get_paired_phenotype(data):
        return None
    if tz.get_in(["metadata", "batch"], data):
        candidates = vmulti.get_orig_items(data)
    else:
        candidates = [data]
    phenotypes = [vcfutils.get_paired_phenotype(x) for x in candidates]
    return "tumor" in phenotypes and "normal" not in phenotypes
def run(bam_file, data, out_dir):
    """Run viral QC analysis.

    For samples with a paired phenotype, aligns unmapped reads against the
    "gdc-viral" reference and writes a per-contig table of aligned read
    counts. Returns a dict with the counts file under "base", or an empty
    dict when the sample is skipped or no viral reference is available.
    """
    viral_target = "gdc-viral"
    out = {}
    if not vcfutils.get_paired_phenotype(data):
        return out
    viral_refs = [x for x in dd.get_viral_files(data)
                  if os.path.basename(x) == "%s.fa" % viral_target]
    if not (viral_refs and utils.file_exists(viral_refs[0])):
        return out
    viral_ref = viral_refs[0]
    bam_dir = utils.safe_makedir(out_dir)
    bam_name = "%s-%s.bam" % (dd.get_sample_name(data),
                              utils.splitext_plus(os.path.basename(viral_ref))[0])
    viral_bam = os.path.join(bam_dir, bam_name)
    out_file = "%s-counts.txt" % utils.splitext_plus(viral_bam)[0]
    if not utils.file_uptodate(out_file, bam_file):
        if not utils.file_uptodate(viral_bam, bam_file):
            with file_transaction(data, viral_bam) as tx_out_file:
                # names below are consumed by cmd.format(**locals())
                cores = dd.get_num_cores(data)
                tmpfile = "%s-tmp" % utils.splitext_plus(tx_out_file)[0]
                cmd = ("samtools view -u -f 4 {bam_file} | "
                       "bamtofastq collate=0 | "
                       "bwa mem -t {cores} {viral_ref} - | "
                       "bamsort tmpfile={tmpfile} inputthreads={cores} outputthreads={cores} "
                       "inputformat=sam index=1 indexfilename={tx_out_file}.bai O={tx_out_file}")
                do.run(cmd.format(**locals()), "Compare unmapped reads to viral genome")
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                out_handle.write("# sample\t%s\n" % dd.get_sample_name(data))
                # only contigs that actually attracted reads are reported
                out_handle.writelines("%s\t%s\n" % (info.contig, info.aligned)
                                      for info in bam.idxstats(viral_bam, data)
                                      if info.aligned > 0)
    out["base"] = out_file
    return out
def _do_prioritize(data):
    """Decide whether prioritization should run for this sample.

    Prioritization is limited to tumor-only inputs: a tumor must be present
    and no normal. Batched samples are judged on their original batch items.
    Returns None when the sample has no paired phenotype.
    """
    if not vcfutils.get_paired_phenotype(data):
        return None
    batched = tz.get_in(["metadata", "batch"], data)
    members = vmulti.get_orig_items(data) if batched else [data]
    seen = [vcfutils.get_paired_phenotype(member) for member in members]
    return "tumor" in seen and "normal" not in seen
def _do_high_depth_filter(data):
    """Check if we should do high depth filtering -- only on germline non-regional calls.

    Returns True when the configured coverage_interval is "genome"
    (case-insensitive) and the sample has no paired phenotype.
    """
    # A stray unconditional `return True` used to sit here, which made the
    # checks below unreachable and contradicted the documented contract.
    is_genome = tz.get_in(["config", "algorithm", "coverage_interval"], data, "").lower() == "genome"
    is_paired = vcfutils.get_paired_phenotype(data)
    return is_genome and not is_paired
def _do_prioritize(items):
    """Decide whether prioritization should run for this set of samples.

    Applies to tumor-only inputs, including those feeding into PureCN which
    needs the germline annotations. Skipped (returns None) when any sample
    lists "tumoronly-prioritization" in tools_off or when the first sample
    has no paired phenotype.
    """
    if any("tumoronly-prioritization" in dd.get_tools_off(d) for d in items):
        return None
    if not vcfutils.get_paired_phenotype(items[0]):
        return None
    phenotypes = [vcfutils.get_paired_phenotype(x) for x in items]
    return "tumor" in phenotypes and "normal" not in phenotypes
def _run_ensemble_intersection(batch_id, vrn_files, base_dir, edata):
    """Run intersection n out of x based ensemble method using bcbio.variation.recall.

    Writes ``<batch_id>-ensemble.vcf.gz`` under ``base_dir`` unless it already
    exists, and returns a dict describing the ensemble call set
    ("variantcaller", "vrn_file", "bed_file").
    """
    out_vcf_file = os.path.join(base_dir, "{0}-ensemble.vcf.gz".format(batch_id))
    if not utils.file_exists(out_vcf_file):
        num_pass = _get_num_pass(edata, len(vrn_files))
        cmd = [config_utils.get_program("bcbio-variation-recall", edata["config"]),
               "ensemble",
               "--cores=%s" % edata["config"]["algorithm"].get("num_cores", 1),
               "--numpass", str(num_pass)]
        # Remove filtered calls if we're dealing with tumor/normal calls
        if vcfutils.get_paired_phenotype(edata):
            cmd += ["--nofiltered"]
        cmd += [out_vcf_file, dd.get_ref_file(edata)] + vrn_files
        do.run(cmd, "Ensemble intersection calling: %s" % (batch_id))
    # The previous implementation also built a copy of `edata` here that was
    # never read or returned; it has been dropped.
    return {"variantcaller": "ensemble",
            "vrn_file": out_vcf_file,
            "bed_file": None}
def _freebayes_custom(in_file, ref_file, data):
    """Apply the experimental bcbio.variation FreeBayes filter.

    Tuned to human NA12878 results. Returns None for samples with a paired
    phenotype or when the installed bcbio_variation is older than 0.1.1;
    otherwise returns the expected filtered file name, running the filter
    first if that file does not exist yet.
    """
    if vcfutils.get_paired_phenotype(data):
        return None
    config = data["config"]
    installed = programs.get_version("bcbio_variation", config=config)
    if LooseVersion(installed) < LooseVersion("0.1.1"):
        return None
    out_file = "%s-filter%s" % os.path.splitext(in_file)
    if utils.file_exists(out_file):
        return out_file
    tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(in_file), "tmp"))
    jar_dir = config_utils.get_program("bcbio_variation", config, "dir")
    bv_jar = config_utils.get_jar("bcbio.variation", jar_dir)
    resources = config_utils.get_resources("bcbio_variation", config)
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
    cmd = ["java"] + jvm_opts + ["-Djava.io.tmpdir=%s" % tmp_dir]
    cmd += ["-jar", bv_jar, "variant-filter", "freebayes", in_file, ref_file]
    do.run(cmd, "Custom FreeBayes filtering using bcbio.variation")
    return out_file
def _do_high_depth_filter(data):
    """Decide whether high depth filtering applies.

    Only germline, non-regional calls qualify: the configured
    coverage_interval must be "genome" (case-insensitive) and the sample must
    not have a paired phenotype.
    """
    interval = tz.get_in(["config", "algorithm", "coverage_interval"], data, "")
    whole_genome = interval.lower() == "genome"
    paired = vcfutils.get_paired_phenotype(data)
    if not whole_genome:
        return False
    return not paired
def _snpeff_args_from_config(data):
    """Build the snpEff command line arguments from the input configuration.

    Always starts with "-hgvs", then appends user supplied resource options,
    "-cancer" for samples with a paired phenotype, and the canonical
    transcript selection. Raises ValueError when the canonical_cancer
    transcript list file is missing.
    """
    config = data["config"]
    args = ["-hgvs"]
    # General supplied arguments
    user_opts = config_utils.get_resources("snpeff", config).get("options")
    if user_opts:
        args.extend(str(x) for x in user_opts)
    # cancer specific calling arguments
    if vcfutils.get_paired_phenotype(data):
        args.append("-cancer")
    effects_transcripts = dd.get_effects_transcripts(data)
    if effects_transcripts in {"canonical_cancer"}:
        _, snpeff_base_dir = get_db(data)
        canon_list_file = os.path.join(snpeff_base_dir, "transcripts",
                                       "%s.txt" % effects_transcripts)
        if not utils.file_exists(canon_list_file):
            raise ValueError("Cannot find expected file for effects_transcripts: %s" % canon_list_file)
        args.extend(["-canonList", canon_list_file])
    elif effects_transcripts == "canonical" or tz.get_in(("config", "algorithm", "clinical_reporting"), data):
        args.append("-canon")
    return args
def _split_cnv(items, calls_fpath, read_mapping_file, coverage_file): out_items = [] for item in items: cur_sv = { "variantcaller": "seq2c", "coverage": tz.get_in(["depth", "bins", "seq2c"], item) } if not get_paired_phenotype(item) == "normal": sample_name = dd.get_sample_name(item) work_dir = _sv_workdir(item) out_fname = os.path.join(work_dir, sample_name + '-calls.tsv') if not utils.file_exists(out_fname): with file_transaction(item, out_fname) as tx: with open(tx, "w") as out, open(calls_fpath) as inp: out.write(next(inp)) for l in inp: if l.split("\t")[0] == sample_name: out.write(l) cur_sv.update({ "calls": out_fname, "vrn_file": to_vcf(out_fname, item), "read_mapping": read_mapping_file, "calls_all": calls_fpath, "coverage_all": coverage_file }) if "sv" not in item: item["sv"] = [] assert "seq2c" not in [x["variantcaller"] for x in item["sv"]], \ "Do not expect existing seq2c variant output: %s" % (dd.get_sample_name(item)) item["sv"].append(cur_sv) out_items.append(item) return out_items
def _split_cnv(items, calls_fpath, read_mapping_file, coverage_file): out_items = [] for item in items: cur_sv = {"variantcaller": "seq2c", "coverage": tz.get_in(["depth", "bins", "seq2c"], item)} if not get_paired_phenotype(item) == "normal": sample_name = dd.get_sample_name(item) work_dir = _sv_workdir(item) out_fname = os.path.join(work_dir, sample_name + '-calls.tsv') if not utils.file_exists(out_fname): with file_transaction(item, out_fname) as tx: with open(tx, "w") as out, open(calls_fpath) as inp: out.write(next(inp)) for l in inp: if l.split("\t")[0] == sample_name: out.write(l) cur_sv.update({"calls": out_fname, "vrn_file": to_vcf(out_fname, item), "read_mapping": read_mapping_file, "calls_all": calls_fpath, "coverage_all": coverage_file}) if "sv" not in item: item["sv"] = [] assert "seq2c" not in [x["variantcaller"] for x in item["sv"]], \ "Do not expect existing seq2c variant output: %s" % (dd.get_sample_name(item)) item["sv"].append(cur_sv) out_items.append(item) return out_items
def filter_vcf_by_sex(vcf_file, data):
    """Post-filter a single sample VCF, handling sex chromosomes.

    Handles sex chromosomes and mitochondrial. Does not try to resolve called
    hets into potential homozygotes when converting diploid to haploid.

    Skips filtering on cancer samples. Since these will be pooled, need
    special functionality to handle them.

    Returns the input unchanged for samples with a paired phenotype, else the
    path of the ``-ploidyfix`` file (bgzipped and indexed when the target
    name ends in ".gz").
    """
    if vcfutils.get_paired_phenotype(data):
        return vcf_file
    _, sexes = _configured_ploidy_sex([data])
    sex = sexes.pop()
    out_file = "%s-ploidyfix%s" % utils.splitext_plus(vcf_file)
    if utils.file_exists(out_file):
        return out_file
    final_name = out_file
    # write plain text first; compression happens after the transaction
    out_file = final_name.replace(".vcf.gz", ".vcf")
    with file_transaction(out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle, utils.open_gzipsafe(vcf_file) as in_handle:
            for line in in_handle:
                if not line.startswith("#"):
                    line = _fix_line_ploidy(line, sex)
                if line:
                    out_handle.write(line)
    if final_name.endswith(".gz"):
        out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
def filter_vcf_by_sex(vcf_file, data):
    """Post-filter a single sample VCF, handling sex chromosomes.

    Handles sex chromosomes and mitochondrial. Does not try to resolve called
    hets into potential homozygotes when converting diploid to haploid.

    Skips filtering on cancer samples. Since these will be pooled, need
    special functionality to handle them.

    Returns the input unchanged for samples with a paired phenotype, else the
    path of the ``-ploidyfix`` file, written if it does not exist yet.
    """
    if vcfutils.get_paired_phenotype(data):
        return vcf_file
    _, sexes = _configured_ploidy_sex([data])
    sex = sexes.pop()
    out_file = "%s-ploidyfix%s" % os.path.splitext(vcf_file)
    if utils.file_exists(out_file):
        return out_file
    with file_transaction(out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle, open(vcf_file) as in_handle:
            for line in in_handle:
                # header lines pass through untouched
                if not line.startswith("#"):
                    line = _fix_line_ploidy(line, sex)
                if line:
                    out_handle.write(line)
    return out_file
def _prep_load_script(work_bams, names, chrom, items):
    """Build the load script for a batch using the matching script builder.

    Uses the paired builder when there are exactly two items and the first
    has a paired phenotype, otherwise the population builder. The pair mode
    ("paired"/"unpaired") is taken from the first BAM file.
    """
    # A leftover Python 2 debug `print` of the item count and metadata was
    # removed here; it is a syntax error on Python 3 and polluted stdout.
    pairmode = "paired" if bam.is_paired(work_bams[0]) else "unpaired"
    if len(items) == 2 and vcfutils.get_paired_phenotype(items[0]):
        load_script = _paired_load_script
    else:
        load_script = _population_load_script
    return load_script(work_bams, names, chrom, pairmode, items)
def _prep_load_script(work_bams, names, chrom, items):
    """Build the load script for a batch using the matching script builder.

    A falsy `chrom` is normalized to the empty string. The paired builder is
    used when there are exactly two items and the first has a paired
    phenotype, otherwise the population builder.
    """
    chrom = chrom or ""
    if bam.is_paired(work_bams[0]):
        pairmode = "paired"
    else:
        pairmode = "unpaired"
    tumor_normal = len(items) == 2 and vcfutils.get_paired_phenotype(items[0])
    builder = _paired_load_script if tumor_normal else _population_load_script
    return builder(work_bams, names, chrom, pairmode, items)
def extract(data, items, out_dir=None):
    """Add germline calls to a tumor-only sample.

    When the sample has a paired phenotype and is the only item, the
    prioritization is removed from its variant file and the bgzipped, indexed
    result is stored under ``data["vrn_file_plus"]["germline"]``. The
    (possibly updated) sample is always returned.
    """
    tumor_only = vcfutils.get_paired_phenotype(data) and len(items) == 1
    if not tumor_only:
        return data
    unprioritized = _remove_prioritization(data["vrn_file"], data, out_dir)
    germline_vcf = vcfutils.bgzip_and_index(unprioritized, data["config"])
    data["vrn_file_plus"] = {"germline": germline_vcf}
    return data
def _cnvkit_by_type(items, background):
    """Route to the CNVkit runner matching the input.

    A lone sample (items plus background totalling one) uses the single
    runner; otherwise a paired phenotype on the first item selects the cancer
    runner and anything else the population runner.
    """
    if len(items + background) == 1:
        return _run_cnvkit_single(items[0])
    if vcfutils.get_paired_phenotype(items[0]):
        runner = _run_cnvkit_cancer
    else:
        runner = _run_cnvkit_population
    return runner(items, background)
def split_variants_by_sample(data):
    """Split a multi-sample call file into inputs for individual samples.

    For tumor/normal paired analyses, assign the combined file to the tumor
    sample instead of splitting, and remove variant files from the normal
    (its target file name is kept under "vrn_file-shared").

    Raises ValueError when a paired analysis contains no tumor sample, except
    for the single trailing normal case.
    """
    config = data["config"]
    vrn_file = data["vrn_file"]
    out = []
    if not vcfutils.get_paired_phenotype(data):
        # population or single sample
        for sample, sample_vcf in data["group_orig"]:
            if len(vcfutils.get_samples(vrn_file)) > 1:
                vcfutils.select_sample(vrn_file, sample["name"][-1], sample_vcf, config)
            elif not os.path.exists(sample_vcf):
                utils.symlink_plus(vrn_file, sample_vcf)
            # NOTE(review): the append is assumed to sit under this check
            # together with the assignment -- confirm the intended nesting.
            if sample_vcf:
                sample["vrn_file"] = sample_vcf
                out.append(sample)
        return out
    # cancer tumor/normal
    groups = data["group_orig"]
    # handle trailing normals, which we don't need to process
    if len(groups) == 1 and vcfutils.get_paired_phenotype(groups[0][0]) == "normal":
        sample, sample_vcf = groups[0]
        sample.pop("vrn_file", None)
        sample["vrn_file-shared"] = sample_vcf
        out.append(sample)
        return out
    found_tumor = False
    for sample, sample_vcf in groups:
        if vcfutils.get_paired_phenotype(sample) == "tumor":
            found_tumor = True
            if not os.path.exists(sample_vcf):
                utils.symlink_plus(vrn_file, sample_vcf)
            sample["vrn_file"] = sample_vcf
        else:
            sample.pop("vrn_file", None)
            sample["vrn_file-shared"] = sample_vcf
        out.append(sample)
    if not found_tumor:
        raise ValueError("Did not find tumor sample in paired analysis")
    return out
def split_variants_by_sample(data):
    """Split a multi-sample call file into inputs for individual samples.

    For tumor/normal paired analyses, assign the combined file to the tumor
    sample instead of splitting, and remove variant files from the normal.

    Raises ValueError when a paired analysis contains no tumor sample, except
    for the single trailing normal case.
    """
    config = data["config"]
    vrn_file = data["vrn_file"]
    out = []
    if not vcfutils.get_paired_phenotype(data):
        # population or single sample
        for sample, sample_vcf in data["group_orig"]:
            if is_multisample(vrn_file):
                select_sample_from_vcf(vrn_file, sample["name"][-1], sample_vcf,
                                       data["sam_ref"], config)
            elif not os.path.exists(sample_vcf):
                utils.symlink_plus(vrn_file, sample_vcf)
            # NOTE(review): the append is assumed to sit under this check
            # together with the assignment -- confirm the intended nesting.
            if sample_vcf:
                sample["vrn_file"] = sample_vcf
                out.append(sample)
        return out
    # cancer tumor/normal
    groups = data["group_orig"]
    # handle trailing normals, which we don't need to process
    if len(groups) == 1 and vcfutils.get_paired_phenotype(groups[0][0]) == "normal":
        sample = groups[0][0]
        sample.pop("vrn_file", None)
        out.append(sample)
        return out
    found_tumor = False
    for sample, sample_vcf in groups:
        if vcfutils.get_paired_phenotype(sample) == "tumor":
            found_tumor = True
            if not os.path.exists(sample_vcf):
                utils.symlink_plus(vrn_file, sample_vcf)
            sample["vrn_file"] = sample_vcf
        else:
            sample.pop("vrn_file", None)
        out.append(sample)
    if not found_tumor:
        raise ValueError("Did not find tumor sample in paired analysis")
    return out
def _cnvkit_by_type(items, background):
    """Route to the CNVkit runner matching the input.

    An access file is created from the first item's reference and SV work
    directory and passed to whichever runner is chosen: single for a lone
    sample, cancer when the first item has a paired phenotype, population
    otherwise.
    """
    lead = items[0]
    access_file = _create_access_file(dd.get_ref_file(lead), _sv_workdir(lead), lead)
    if len(items + background) == 1:
        return _run_cnvkit_single(lead, access_file)
    if vcfutils.get_paired_phenotype(lead):
        runner = _run_cnvkit_cancer
    else:
        runner = _run_cnvkit_population
    return runner(items, background, access_file)
def run(bam_file, data, out_dir): """Run viral QC analysis for samples with a paired phenotype: 1. Extract the unmapped reads (samtools view -f 4) 2. BWA-MEM to the viral sequences from GDC database https://gdc.cancer.gov/about-data/data-harmonization-and-generation/gdc-reference-files 3. Summarize coverage per viral sequence with mosdepth (thresholds 1,5,25) into a "-completeness.txt" table, sorted in reverse numeric order on its fifth column. Returns a dict: empty when the sample is skipped or no "gdc-viral.fa" reference exists, otherwise "base" holds the completeness file and "secondary" an empty list. NOTE(review): earlier text promised reporting only viruses more than 50% covered by at least 5x; no such filter is visible in this function -- confirm where it is applied. """ source_link = 'https://gdc.cancer.gov/about-data/data-harmonization-and-generation/gdc-reference-files' viral_target = "gdc-viral" out = {} if vcfutils.get_paired_phenotype(data): viral_refs = [ x for x in dd.get_viral_files(data) if os.path.basename(x) == "%s.fa" % viral_target ] if viral_refs and utils.file_exists(viral_refs[0]): viral_ref = viral_refs[0] viral_bam = os.path.join( utils.safe_makedir(out_dir), "%s-%s.bam" % (dd.get_sample_name(data), utils.splitext_plus(os.path.basename(viral_ref))[0])) out_file = "%s-completeness.txt" % utils.splitext_plus( viral_bam)[0] cores = dd.get_num_cores(data) if not utils.file_uptodate(out_file, bam_file): if not utils.file_uptodate(viral_bam, bam_file): with file_transaction(data, viral_bam) as tx_out_file: tmpfile = "%s-tmp" % utils.splitext_plus( tx_out_file)[0] cmd = ( "samtools view -u -f 4 {bam_file} | " "bamtofastq collate=0 | " "bwa mem -t {cores} {viral_ref} - | " "bamsort tmpfile={tmpfile} inputthreads={cores} outputthreads={cores} " "inputformat=sam index=1 indexfilename={tx_out_file}.bai O={tx_out_file}" ) do.run(cmd.format(**locals()), "Align unmapped reads to viral genome") with file_transaction(data, out_file) as tx_out_file: sample_name = dd.get_sample_name(data) mosdepth_prefix = os.path.splitext(viral_bam)[0] cmd = ( "mosdepth -t {cores} {mosdepth_prefix} {viral_bam} -n --thresholds 1,5,25 --by " "<(awk 'BEGIN {{FS=\"\\t\"}}; {{print $1 FS \"0\" FS $2}}' {viral_ref}.fai) && " "echo '## Viral sequences (from {source_link}) found in unmapped reads' > {tx_out_file} &&" "echo '## Sample: {sample_name}' >> {tx_out_file} && " "echo '#virus\tsize\tdepth\t1x\t5x\t25x' >> {tx_out_file} && " "paste <(zcat 
{mosdepth_prefix}.regions.bed.gz) <(zgrep -v ^# {mosdepth_prefix}.thresholds.bed.gz) | " "awk 'BEGIN {{FS=\"\\t\"}} {{ print $1 FS $3 FS $4 FS $10/$3 FS $11/$3 FS $12/$3}}' | " "sort -n -r -k 5,5 >> {tx_out_file}") do.run(cmd.format(**locals()), "Analyse coverage of viral genomes") out["base"] = out_file out["secondary"] = [] return out
def get_qc_tools(data): """Retrieve a list of QC tools to use based on configuration and analysis type. Uses defaults if previously set: a truthy dd.get_algorithm_qc(data) value is returned as-is. Otherwise tool names are collected from the lowercased data["analysis"] string, the algorithm configuration and the tools_on/tools_off settings; anything listed in dd.get_tools_off(data) is then dropped and the result is returned sorted. """ if dd.get_algorithm_qc(data): return dd.get_algorithm_qc(data) analysis = data["analysis"].lower() to_run = [] if tz.get_in(["config", "algorithm", "kraken"], data): to_run.append("kraken") if "fastqc" not in dd.get_tools_off(data): to_run.append("fastqc") if any([ tool in dd.get_tools_on(data) for tool in ["qualimap", "qualimap_full"] ]): to_run.append("qualimap") if analysis.startswith("rna-seq") or analysis == "smallrna-seq": if "qualimap" not in dd.get_tools_off(data): if gtf.is_qualimap_compatible(dd.get_gtf_file(data)): to_run.append("qualimap_rnaseq") else: logger.debug("GTF not compatible with Qualimap, skipping.") if analysis.startswith("chip-seq"): to_run.append("chipqc") if dd.get_chip_method(data) == "atac": to_run.append("ataqv") if analysis.startswith("smallrna-seq"): to_run.append("small-rna") to_run.append("atropos") if "coverage_qc" not in dd.get_tools_off(data): to_run.append("samtools") if dd.has_variantcalls(data): if "coverage_qc" not in dd.get_tools_off(data): to_run += ["coverage", "picard"] to_run += ["qsignature", "variants"] if vcfanno.is_human(data): to_run += ["peddy"] if "contamination" not in dd.get_tools_off(data): to_run += ["contamination"] if vcfutils.get_paired_phenotype(data): if "viral" not in dd.get_tools_off(data): to_run += ["viral"] if damage.should_filter([data]): to_run += ["damage"] if dd.get_umi_consensus(data): to_run += ["umi"] if tz.get_in(["config", "algorithm", "preseq"], data): to_run.append("preseq") to_run = [tool for tool in to_run if tool not in dd.get_tools_off(data)] to_run.sort() return to_run
def _pick_lead_item(items):
    """Choose the representative sample that batch calls get attached to.

    Paired analyses return the tumor sample and raise ValueError when there
    is none; everything else returns the first item.
    """
    bam_files = [x["align_bam"] for x in items]
    if not vcfutils.is_paired_analysis(bam_files, items):
        return items[0]
    for candidate in items:
        if vcfutils.get_paired_phenotype(candidate) == "tumor":
            return candidate
    raise ValueError("Did not find tumor sample in paired tumor/normal calling")
def extract(data, items):
    """Add germline calls to a sample that is tumor/normal or prioritized.

    Samples without a paired phenotype are returned untouched. Batched
    samples with more than one item get germline calls extracted from the
    variant file; all others have the prioritization removed. The bgzipped,
    indexed result is stored under ``data["vrn_file_plus"]["germline"]``.
    """
    if not vcfutils.get_paired_phenotype(data):
        return data
    if dd.get_batches(data) and len(items) > 1:
        to_germline = _extract_germline
    else:
        to_germline = _remove_prioritization
    germline_vcf = to_germline(data["vrn_file"], data)
    data["vrn_file_plus"] = {"germline": vcfutils.bgzip_and_index(germline_vcf, data["config"])}
    return data
def _get_ensemble_bed_files(items): """ get all ensemble structural BED file calls, skipping any normal samples from tumor/normal calls """ bed_files = [] for data in items: for sv in data.get("sv", []): if sv["variantcaller"] == "sv-ensemble": if ("vrn_file" in sv and not vcfutils.get_paired_phenotype(data) == "normal" and file_exists(sv["vrn_file"])): bed_files.append(sv["vrn_file"]) return bed_files
def _split_cnv(items, calls_fpath): for item in items: if get_paired_phenotype(item) == "normal": continue sample_name = dd.get_sample_name(item) work_dir = _sv_workdir(item) out_fname = os.path.join(work_dir, sample_name + '-calls.tsv') if not utils.file_exists(out_fname): with file_transaction(item, out_fname) as tx: with open(tx, "w") as out, open(calls_fpath) as inp: out.write(next(inp)) for l in inp: if l.split("\t")[0] == sample_name: out.write(l) item["sv"][0]["calls"] = out_fname
def _snpeff_args_from_config(data):
    """Build the snpEff command line arguments from the input configuration.

    Combines user supplied resource options, "-cancer" for samples with a
    paired phenotype, and "-canon -hgvs" when clinical_reporting is enabled.
    """
    config = data["config"]
    args = []
    # General supplied arguments
    user_opts = config_utils.get_resources("snpeff", config).get("options")
    if user_opts:
        args.extend(str(x) for x in user_opts)
    # cancer specific calling arguments
    if vcfutils.get_paired_phenotype(data):
        args.append("-cancer")
    # Provide options tuned to reporting variants in clinical environments
    if config["algorithm"].get("clinical_reporting"):
        args.extend(["-canon", "-hgvs"])
    return args
def get_qc_tools(data): """Retrieve a list of QC tools to use based on configuration and analysis type. Uses defaults if previously set: a truthy dd.get_algorithm_qc(data) value is returned as-is. Otherwise tool names are collected from the lowercased data["analysis"] string, the algorithm configuration and the tools_on/tools_off settings; anything listed in dd.get_tools_off(data) is then dropped and the result is returned sorted. """ if dd.get_algorithm_qc(data): return dd.get_algorithm_qc(data) analysis = data["analysis"].lower() to_run = [] if tz.get_in(["config", "algorithm", "kraken"], data): to_run.append("kraken") if "fastqc" not in dd.get_tools_off(data): to_run.append("fastqc") if any([tool in dd.get_tools_on(data) for tool in ["qualimap", "qualimap_full"]]): to_run.append("qualimap") if analysis.startswith("rna-seq") or analysis == "smallrna-seq": if "qualimap" not in dd.get_tools_off(data): if gtf.is_qualimap_compatible(dd.get_gtf_file(data)): to_run.append("qualimap_rnaseq") else: logger.debug("GTF not compatible with Qualimap, skipping.") if analysis.startswith("chip-seq"): to_run.append("chipqc") if analysis.startswith("smallrna-seq"): to_run.append("small-rna") to_run.append("atropos") if "coverage_qc" not in dd.get_tools_off(data): to_run.append("samtools") if analysis.startswith(("standard", "variant", "variant2")): if "coverage_qc" not in dd.get_tools_off(data): to_run += ["coverage", "picard"] to_run += ["qsignature", "variants"] if vcfanno.is_human(data): to_run += ["contamination", "peddy"] if vcfutils.get_paired_phenotype(data): to_run += ["viral"] if damage.should_filter([data]): to_run += ["damage"] if dd.get_umi_consensus(data): to_run += ["umi"] if tz.get_in(["config", "algorithm", "preseq"], data): to_run.append("preseq") to_run = [tool for tool in to_run if tool not in dd.get_tools_off(data)] to_run.sort() return to_run
def extract(data, items):
    """Add germline calls for tumor-only samples, or fix germline sample naming.

    Samples with a paired phenotype that are batched and alone in `items`
    get a prioritization-free, bgzipped and indexed VCF stored under
    ``data["vrn_file_plus"]["germline"]``. Samples whose phenotype is
    "germline" have their VCF sample name fixed when the VCF holds a single
    sample named like this one without its "-germline" suffix.
    """
    if vcfutils.get_paired_phenotype(data):
        if dd.get_batches(data) and len(items) == 1:
            unprioritized = _remove_prioritization(data["vrn_file"], data)
            germline_vcf = vcfutils.bgzip_and_index(unprioritized, data["config"])
            data["vrn_file_plus"] = {"germline": germline_vcf}
        return data
    if dd.get_phenotype(data) != "germline":
        return data
    sample_name = dd.get_sample_name(data)
    vcf_samples = vcfutils.get_samples(data["vrn_file"])
    needs_rename = (sample_name.endswith("-germline")
                    and len(vcf_samples) == 1
                    and sample_name.replace("-germline", "") == vcf_samples[0])
    if needs_rename:
        data["vrn_file"] = fix_germline_samplename(data["vrn_file"], sample_name, data)
    return data
def extract(data, items):
    """Add germline calls for tumor-only samples, or fix germline sample naming.

    Samples with a paired phenotype that are batched and alone in `items`
    get a prioritization-free, bgzipped and indexed VCF stored under
    ``data["vrn_file_plus"]["germline"]``. Samples whose phenotype is
    "germline" have their VCF sample name fixed when the VCF holds a single
    sample named like this one without its "-germline" suffix.
    """
    if vcfutils.get_paired_phenotype(data):
        if dd.get_batches(data) and len(items) == 1:
            unprioritized = _remove_prioritization(data["vrn_file"], data)
            germline_vcf = vcfutils.bgzip_and_index(unprioritized, data["config"])
            data["vrn_file_plus"] = {"germline": germline_vcf}
        return data
    if dd.get_phenotype(data) != "germline":
        return data
    sample_name = dd.get_sample_name(data)
    vcf_samples = vcfutils.get_samples(data["vrn_file"])
    needs_rename = (sample_name.endswith("-germline")
                    and len(vcf_samples) == 1
                    and sample_name.replace("-germline", "") == vcf_samples[0])
    if needs_rename:
        data["vrn_file"] = _fix_germline_samplename(data["vrn_file"], sample_name, data)
    return data
def _compatible_small_variants(data): """Retrieve small variant (SNP, indel) VCFs compatible with CNVkit. """ supported = set(["vardict", "freebayes", "gatk-haplotype", "mutect2", "vardict"]) out = [] for v in data.get("variants", []): vrn_file = v.get("vrn_file") if vrn_file and v.get("variantcaller") in supported: base, ext = utils.splitext_plus(os.path.basename(vrn_file)) if vcfutils.get_paired_phenotype(data): out.append(vrn_file) else: sample_vrn_file = os.path.join(dd.get_work_dir(data), v["variantcaller"], "%s-%s%s" % (base, dd.get_sample_name(data), ext)) sample_vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data), sample_vrn_file, data["config"]) out.append(sample_vrn_file) return out
def run(bam_file, data, out_dir):
    """Run viral QC analysis on the unmapped reads of a sample.

    Only runs for samples with a paired (tumor/normal) phenotype that have a
    ``gdc-viral.fa`` file among their viral reference files:

       1. Extract the unmapped reads
       2. BWA-MEM to the viral sequences from GDC database
          https://gdc.cancer.gov/about-data/data-harmonization-and-generation/gdc-reference-files
       3. Run mosdepth with 1x/5x/25x thresholds over the viral contigs and
          write a per-contig table, sorted in descending order on its 5th
          column, to a ``-completeness.txt`` file next to the viral BAM.

    NOTE(review): this step was previously documented as "report viruses that
    are more than 50% covered by at least 5x"; no such cutoff is applied in
    this function -- confirm whether the filtering happens downstream.

    Returns a dict with ``base`` (the completeness file) and ``secondary``
    (an empty list), or an empty dict when the analysis does not apply.
    """
    source_link = 'https://gdc.cancer.gov/about-data/data-harmonization-and-generation/gdc-reference-files'
    viral_target = "gdc-viral"
    out = {}
    if vcfutils.get_paired_phenotype(data):
        # Keep only the reference whose file name is exactly gdc-viral.fa.
        viral_refs = [x for x in dd.get_viral_files(data) if os.path.basename(x) == "%s.fa" % viral_target]
        if viral_refs and utils.file_exists(viral_refs[0]):
            viral_ref = viral_refs[0]
            # Output BAM is named <sample name>-<viral reference base name>.bam
            viral_bam = os.path.join(utils.safe_makedir(out_dir),
                                     "%s-%s.bam" % (dd.get_sample_name(data),
                                                    utils.splitext_plus(os.path.basename(viral_ref))[0]))
            out_file = "%s-completeness.txt" % utils.splitext_plus(viral_bam)[0]
            cores = dd.get_num_cores(data)
            # presumably file_uptodate is true when the first file exists and is
            # not older than the second -- TODO confirm against utils
            if not utils.file_uptodate(out_file, bam_file):
                if not utils.file_uptodate(viral_bam, bam_file):
                    with file_transaction(data, viral_bam) as tx_out_file:
                        tmpfile = "%s-tmp" % utils.splitext_plus(tx_out_file)[0]
                        # Placeholders are filled from local variable names via
                        # locals(); renaming a local silently breaks the command.
                        cmd = ("samtools view -u -f 4 {bam_file} | "
                               "bamtofastq collate=0 | "
                               "bwa mem -t {cores} {viral_ref} - | "
                               "bamsort tmpfile={tmpfile} inputthreads={cores} outputthreads={cores} "
                               "inputformat=sam index=1 indexfilename={tx_out_file}.bai O={tx_out_file}")
                        do.run(cmd.format(**locals()), "Align unmapped reads to viral genome")
                with file_transaction(data, out_file) as tx_out_file:
                    sample_name = dd.get_sample_name(data)
                    mosdepth_prefix = os.path.splitext(viral_bam)[0]
                    # Doubled braces are literal braces for str.format. The <( )
                    # process substitutions assume do.run executes the command
                    # through bash -- TODO confirm.
                    # The first awk prints column 1, "0" and column 2 of the .fai
                    # index as the --by regions for mosdepth.
                    # The final awk prints fields 1, 3, 4 and fields 10-12 divided
                    # by field 3 of the pasted rows, under the header
                    # virus/size/depth/1x/5x/25x; the field positions depend on
                    # mosdepth's output layout -- TODO confirm.
                    cmd = ("mosdepth -t {cores} {mosdepth_prefix} {viral_bam} -n --thresholds 1,5,25 --by "
                           "<(awk 'BEGIN {{FS=\"\\t\"}}; {{print $1 FS \"0\" FS $2}}' {viral_ref}.fai) && "
                           "echo '## Viral sequences (from {source_link}) found in unmapped reads' > {tx_out_file} &&"
                           "echo '## Sample: {sample_name}' >> {tx_out_file} && "
                           "echo '#virus\tsize\tdepth\t1x\t5x\t25x' >> {tx_out_file} && "
                           "paste <(zcat {mosdepth_prefix}.regions.bed.gz) <(zgrep -v ^# {mosdepth_prefix}.thresholds.bed.gz) | "
                           "awk 'BEGIN {{FS=\"\\t\"}} {{ print $1 FS $3 FS $4 FS $10/$3 FS $11/$3 FS $12/$3}}' | "
                           "sort -n -r -k 5,5 >> {tx_out_file}")
                    do.run(cmd.format(**locals()), "Analyse coverage of viral genomes")
            # Reported whether the file was just created or was already up to date.
            out["base"] = out_file
            out["secondary"] = []
    return out
def run(items):
    """Run cohort-wide normalization, log2 ratio calculation and CNV calling.

    - Combine coverage of each region for each sample
    - Prepare read counts for each sample
    - Normalize coverages in cohort by gene and sample, and calculate log2 ratios
    - Call amplifications and deletions

    Samples whose paired phenotype is ``normal`` are passed to the caller by
    name. Returns the input samples after unwrapping each of them with
    ``utils.to_single_data``.
    """
    samples = [utils.to_single_data(entry) for entry in items]
    work_dir = _sv_workdir(samples[0])
    coverage_file = _combine_coverages(samples, work_dir)
    read_mapping_file = _calculate_mapping_reads(samples, work_dir)

    normal_names = []
    for sample in samples:
        if get_paired_phenotype(sample) == "normal":
            normal_names.append(dd.get_sample_name(sample))

    seq2c_calls_file = _call_cnv(samples, work_dir, read_mapping_file,
                                 coverage_file, normal_names)
    _split_cnv(samples, seq2c_calls_file)
    return samples
def _cnvkit_by_type(items, background, work_dir):
    """Run the CNVkit workflow matching the inputs and attach its output.

    Dispatch rules:
      - exactly one sample in total (items plus background): single-sample run
      - first item has a paired (tumor/normal) phenotype: cancer run
      - anything else: population run

    The resulting output dictionary is tagged with ``variantcaller`` set to
    ``cnvkit`` and the same dictionary object is appended to the ``sv`` list
    of every item. Returns the items as a new list.
    """
    first = items[0]
    access_file = _create_access_file(dd.get_ref_file(first), work_dir, first)

    if len(items + background) == 1:
        ckout = _run_cnvkit_single(first, access_file, work_dir)
    elif vcfutils.get_paired_phenotype(first):
        ckout = _run_cnvkit_cancer(items, background, access_file, work_dir)
    else:
        ckout = _run_cnvkit_population(items, background, access_file, work_dir)

    ckout = _add_seg_to_output(ckout, items)
    ckout["variantcaller"] = "cnvkit"

    for sample in items:
        # Create the structural variant list on first use.
        sample.setdefault("sv", []).append(ckout)
    return list(items)
def _run_ensemble_intersection(batch_id, vrn_files, base_dir, edata):
    """Run intersection n out of x based ensemble method using bcbio.variation.recall.

    Args:
        batch_id: identifier used to name the output,
            ``<base_dir>/<batch_id>-ensemble.vcf.gz``.
        vrn_files: sequence of input VCF paths to intersect.
        base_dir: directory receiving the ensemble VCF.
        edata: sample dictionary supplying ``config`` (program lookup and
            ``algorithm.num_cores``), the reference file and the phenotype.

    Returns:
        dict with ``variantcaller`` set to ``ensemble``, ``vrn_file`` pointing
        to the ensemble VCF and ``bed_file`` set to None.

    The external command only runs when the output does not already exist.
    """
    out_vcf_file = os.path.join(base_dir, "{0}-ensemble.vcf.gz".format(batch_id))
    if not utils.file_exists(out_vcf_file):
        num_pass = _get_num_pass(edata, len(vrn_files))
        cmd = [config_utils.get_program("bcbio-variation-recall", edata["config"]),
               "ensemble",
               "--cores=%s" % edata["config"]["algorithm"].get("num_cores", 1),
               "--numpass", str(num_pass)]
        # Remove filtered calls if we're dealing with tumor/normal calls
        if vcfutils.get_paired_phenotype(edata):
            cmd += ["--nofiltered"]
        # list() lets callers pass any sequence of inputs, not just a list.
        cmd += [out_vcf_file, dd.get_ref_file(edata)] + list(vrn_files)
        do.run(cmd, "Ensemble intersection calling: %s" % (batch_id))
    # The former copy of edata built here was never read or returned, so it
    # was dropped rather than paying for a copy on every call.
    return {"variantcaller": "ensemble",
            "vrn_file": out_vcf_file,
            "bed_file": None}
def run(bam_file, data, out_dir):
    """Run viral QC analysis by counting unmapped reads hitting viral contigs.

    Applies only to samples with a paired (tumor/normal) phenotype that have
    a ``gdc-viral.fa`` file among their viral reference files. Unmapped reads
    from ``bam_file`` are aligned to that reference, and every contig with at
    least one aligned read is written, with its aligned count, to a
    ``-counts.txt`` file next to the viral BAM.

    Returns ``{"base": <counts file>}`` or an empty dict when the analysis
    does not apply.
    """
    viral_target = "gdc-viral"
    out = {}
    if not vcfutils.get_paired_phenotype(data):
        return out

    wanted_name = "%s.fa" % viral_target
    viral_refs = [ref for ref in dd.get_viral_files(data)
                  if os.path.basename(ref) == wanted_name]
    if not (viral_refs and utils.file_exists(viral_refs[0])):
        return out

    viral_ref = viral_refs[0]
    bam_dir = utils.safe_makedir(out_dir)
    bam_name = "%s-%s.bam" % (dd.get_sample_name(data),
                              utils.splitext_plus(os.path.basename(viral_ref))[0])
    viral_bam = os.path.join(bam_dir, bam_name)
    out_file = "%s-counts.txt" % utils.splitext_plus(viral_bam)[0]

    if not utils.file_uptodate(out_file, bam_file):
        if not utils.file_uptodate(viral_bam, bam_file):
            with file_transaction(data, viral_bam) as tx_bam:
                cores = dd.get_num_cores(data)
                tmpfile = "%s-tmp" % utils.splitext_plus(tx_bam)[0]
                template = (
                    "samtools view -u -f 4 {bam_file} | "
                    "bamtofastq collate=0 | "
                    "bwa mem -t {cores} {viral_ref} - | "
                    "bamsort tmpfile={tmpfile} inputthreads={cores} outputthreads={cores} "
                    "inputformat=sam index=1 indexfilename={tx_out_file}.bai O={tx_out_file}"
                )
                align_cmd = template.format(bam_file=bam_file, cores=cores,
                                            viral_ref=viral_ref, tmpfile=tmpfile,
                                            tx_out_file=tx_bam)
                do.run(align_cmd, "Compare unmapped reads to viral genome")
        with file_transaction(data, out_file) as tx_counts:
            with open(tx_counts, "w") as out_handle:
                out_handle.write("# sample\t%s\n" % dd.get_sample_name(data))
                for info in bam.idxstats(viral_bam, data):
                    # Skip contigs without any aligned reads.
                    if not info.aligned > 0:
                        continue
                    out_handle.write("%s\t%s\n" % (info.contig, info.aligned))

    out["base"] = out_file
    return out
def _check_variantcaller(item):
    """Ensure specified variantcaller is a valid choice.

    ``item["algorithm"]["variantcaller"]`` may be a single caller, a list or
    tuple of callers, or a dictionary mapping a calling type to either of
    those. ``None`` and ``False`` are accepted in addition to the callers
    reported by ``genotype.get_variantcallers()``.

    Raises:
        ValueError: when an unknown caller is requested, or when a
            ``germline``/``somatic`` split is configured for a sample without
            a tumor/normal phenotype.
    """
    # list() is required on Python 3, where dict.keys() returns a view that
    # cannot be concatenated with a list.
    allowed = set(list(genotype.get_variantcallers().keys()) + [None, False])
    vcs = item["algorithm"].get("variantcaller")
    if not isinstance(vcs, dict):
        vcs = {"variantcaller": vcs}
    for vc_set in vcs.values():
        if not isinstance(vc_set, (tuple, list)):
            vc_set = [vc_set]
        problem = [x for x in vc_set if x not in allowed]
        if problem:
            # None, False and caller names are not mutually orderable on
            # Python 3; sort with an explicit key (None, False, then names) so
            # building the message cannot itself fail with a TypeError.
            supported = sorted(allowed,
                               key=lambda x: (x is not None, x is not False, str(x)))
            raise ValueError("Unexpected algorithm 'variantcaller' parameter: %s\n"
                             "Supported options: %s\n" % (problem, supported))
    # Ensure germline somatic calling only specified with tumor/normal samples
    if "germline" in vcs or "somatic" in vcs:
        paired = vcfutils.get_paired_phenotype(item)
        if not paired:
            raise ValueError("%s: somatic/germline calling in 'variantcaller' "
                             "but tumor/normal metadata phenotype not specified"
                             % dd.get_sample_name(item))
def _snpeff_args_from_config(data):
    """Build the snpEff command line arguments implied by the configuration.

    Arguments are added in this order:
      - ``-formatEff -classic`` for snpEff 4.1 and later
      - user supplied ``options`` from the ``snpeff`` resources, as strings
      - ``-cancer`` for samples with a paired (tumor/normal) phenotype
      - ``-canon -hgvs`` when ``clinical_reporting`` is enabled

    Returns the list of arguments.
    """
    config = data["config"]
    args = []

    # Use older EFF formatting instead of new combined ANN formatting until
    # downstream tools catch up, then remove this.
    if LooseVersion(snpeff_version()) >= LooseVersion("4.1"):
        args.extend(["-formatEff", "-classic"])

    # General supplied arguments
    resources = config_utils.get_resources("snpeff", config)
    if resources.get("options"):
        args.extend(str(opt) for opt in resources.get("options", []))

    # Cancer specific calling arguments
    if vcfutils.get_paired_phenotype(data):
        args.append("-cancer")

    # Options tuned to reporting variants in clinical environments
    if config["algorithm"].get("clinical_reporting"):
        args.extend(["-canon", "-hgvs"])
    return args
def _freebayes_custom(in_file, ref_file, data):
    """Apply custom FreeBayes filtering with bcbio.variation (experimental).

    Tuned to human NA12878 results; for testing new methods.

    Returns None without filtering for samples with a paired (tumor/normal)
    phenotype, or when the installed bcbio_variation is older than 0.1.1.
    Otherwise returns ``<in_file base>-filter<in_file extension>``, running
    the filter command only when that file does not exist yet.
    """
    if vcfutils.get_paired_phenotype(data):
        return None
    config = data["config"]
    installed = LooseVersion(programs.get_version("bcbio_variation", config=config))
    if installed < LooseVersion("0.1.1"):
        return None

    stem, extension = os.path.splitext(in_file)
    out_file = "%s-filter%s" % (stem, extension)
    if utils.file_exists(out_file):
        return out_file

    # Temporary files for the JVM go into a "tmp" directory next to the input.
    tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(in_file), "tmp"))
    resources = config_utils.get_resources("bcbio_variation", config)
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
    cmd = ["bcbio-variation"]
    cmd = cmd + jvm_opts + ["-Djava.io.tmpdir=%s" % tmp_dir]
    cmd = cmd + ["variant-filter", "freebayes", in_file, ref_file]
    do.run(cmd, "Custom FreeBayes filtering using bcbio.variation")
    return out_file