def postprocess_variants(items): """Provide post-processing of variant calls: filtering and effects annotation. """ vrn_key = "vrn_file" if not isinstance(items, dict): items = [utils.to_single_data(x) for x in items] if "vrn_file_joint" in items[0]: vrn_key = "vrn_file_joint" data, items = _get_batch_representative(items, vrn_key) items = cwlutils.unpack_tarballs(items, data) data = cwlutils.unpack_tarballs(data, data) cur_name = "%s, %s" % (dd.get_sample_name(data), get_variantcaller(data)) logger.info("Finalizing variant calls: %s" % cur_name) orig_vrn_file = data.get(vrn_key) data = _symlink_to_workdir(data, [vrn_key]) data = _symlink_to_workdir(data, ["config", "algorithm", "variant_regions"]) if data.get(vrn_key): logger.info("Calculating variation effects for %s" % cur_name) ann_vrn_file, vrn_stats = effects.add_to_vcf(data[vrn_key], data) if ann_vrn_file: data[vrn_key] = ann_vrn_file if vrn_stats: data["vrn_stats"] = vrn_stats orig_items = _get_orig_items(items) logger.info("Annotate VCF file: %s" % cur_name) data[vrn_key] = annotation.finalize_vcf(data[vrn_key], get_variantcaller(data), orig_items) if dd.get_analysis(data).lower().find("rna-seq") >= 0: logger.info("Annotate RNA editing sites") ann_file = vcfanno.run_vcfanno(dd.get_vrn_file(data), ["rnaedit"], data) if ann_file: data[vrn_key] = ann_file if cwlutils.is_cwl_run(data): logger.info("Annotate with population level variation data") ann_file = population.run_vcfanno(dd.get_vrn_file(data), data, population.do_db_build([data])) if ann_file: data[vrn_key] = ann_file logger.info("Filtering for %s" % cur_name) data[vrn_key] = variant_filtration( data[vrn_key], dd.get_ref_file(data), tz.get_in(("genome_resources", "variation"), data, {}), data, orig_items) logger.info("Prioritization for %s" % cur_name) prio_vrn_file = prioritize.handle_vcf_calls(data[vrn_key], data, orig_items) if prio_vrn_file != data[vrn_key]: data[vrn_key] = prio_vrn_file logger.info("Germline extraction for %s" % cur_name) data = germline.extract(data, orig_items) if dd.get_align_bam(data): data = damage.run_filter(data[vrn_key], dd.get_align_bam(data), dd.get_ref_file(data), data, orig_items) if orig_vrn_file and os.path.samefile(data[vrn_key], orig_vrn_file): data[vrn_key] = orig_vrn_file return [[data]]
def run_rnaseq_variant_calling(data): """ run RNA-seq variant calling, variation file is stored in `vrn_file` in the datadict """ variantcaller = dd.get_variantcaller(data) if isinstance(variantcaller, list) and len(variantcaller) > 1: logger.error("Only one variantcaller can be run for RNA-seq at " "this time. Post an issue here " "(https://github.com/bcbio/bcbio-nextgen/issues) " "if this is something you need to do.") sys.exit(1) if variantcaller: if "gatk-haplotype" in variantcaller: data = variation.rnaseq_gatk_variant_calling(data) if vardict.get_vardict_command(data): data = variation.rnaseq_vardict_variant_calling(data) if dd.get_vrn_file(data): ann_file = vcfanno.run_vcfanno(dd.get_vrn_file(data), ["rnaedit"], data) if ann_file: data = dd.set_vrn_file(data, ann_file) ann_file = population.run_vcfanno(dd.get_vrn_file(data), data, population.do_db_build([data])) if ann_file: data = dd.set_vrn_file(data, ann_file) return [[data]]
def run_rnaseq_ann_filter(data): """Run RNA-seq annotation and filtering. """ ann_file = vcfanno.run_vcfanno(dd.get_vrn_file(data), ["rnaedit"], data) if ann_file: dd.set_vrn_file(data, ann_file) filter_file = variation.gatk_filter_rnaseq(dd.get_vrn_file(data), data) dd.set_vrn_file(data, filter_file) return [[data]]
def run_rnaseq_ann_filter(data): """Run RNA-seq annotation and filtering. """ data = to_single_data(data) if dd.get_vrn_file(data): ann_file = population.run_vcfanno(dd.get_vrn_file(data), data) if ann_file: data = dd.set_vrn_file(data, ann_file) variantcaller = dd.get_variantcaller(data) if variantcaller and ("gatk-haplotype" in variantcaller): filter_file = variation.gatk_filter_rnaseq(dd.get_vrn_file(data), data) data = dd.set_vrn_file(data, filter_file) return [[data]]
def _run_gemini_stats(bam_file, data, out_dir): """Retrieve high level variant statistics from Gemini. """ out = {} gemini_dbs = [d for d in [tz.get_in(["population", "db"], x) for x in data.get("variants", [])] if d] if len(gemini_dbs) > 0: gemini_db = gemini_dbs[0] gemini_stat_file = "%s-stats.yaml" % os.path.splitext(gemini_db)[0] if not utils.file_uptodate(gemini_stat_file, gemini_db): gemini = config_utils.get_program("gemini", data["config"]) tstv = subprocess.check_output([gemini, "stats", "--tstv", gemini_db]) gt_counts = subprocess.check_output([gemini, "stats", "--gts-by-sample", gemini_db]) dbsnp_count = subprocess.check_output([gemini, "query", gemini_db, "-q", "SELECT count(*) FROM variants WHERE in_dbsnp==1"]) out["Transition/Transversion"] = tstv.split("\n")[1].split()[-1] for line in gt_counts.split("\n"): parts = line.rstrip().split() if len(parts) > 0 and parts[0] != "sample": name, hom_ref, het, hom_var, _, total = parts out[name] = {} out[name]["Variations (heterozygous)"] = int(het) out[name]["Variations (homozygous)"] = int(hom_var) # same total variations for all samples, keep that top level as well. out["Variations (total)"] = int(total) out["Variations (in dbSNP)"] = int(dbsnp_count.strip()) if out.get("Variations (total)") > 0: out["Variations (in dbSNP) pct"] = "%.1f%%" % (out["Variations (in dbSNP)"] / float(out["Variations (total)"]) * 100.0) with open(gemini_stat_file, "w") as out_handle: yaml.safe_dump(out, out_handle, default_flow_style=False, allow_unicode=False) else: with open(gemini_stat_file) as in_handle: out = yaml.safe_load(in_handle) else: vcf_file = dd.get_vrn_file(data) if isinstance(vcf_file, list): vcf_file = vcf_file[0] if vcf_file: out_file = "%s-bcfstats.tsv" % utils.splitext_plus(vcf_file)[0] bcftools = config_utils.get_program("bcftools", data["config"]) if not utils.file_exists(out_file): cmd = ("{bcftools} stats -f PASS {vcf_file} > {out_file}") do.run(cmd.format(**locals()), "basic vcf stats %s" % data["name"][-1]) with open(out_file) as in_handle: for line in in_handle: if line.startswith("SN") and line.find("records") > -1: cols = line.split() print line out["Variations (total)"] = cols[-1] res = {} for k, v in out.iteritems(): if not isinstance(v, dict): res.update({k: v}) if k == data["name"][-1]: res.update(v) return res
def _default_conf_files(data, retriever): conf_files = [] if dd.get_variantcaller(data) or dd.get_vrn_file(data): if annotate_gemini(data, retriever): conf_files.append("gemini") if _annotate_somatic(data, retriever): conf_files.append("somatic") if dd.get_analysis(data).lower().find("rna-seq") >= 0: conf_files.append("rnaedit") return conf_files
def _default_conf_files(data, retriever): conf_files = [] if dd.get_variantcaller(data) or dd.get_vrn_file(data): if annotate_gemini(data, retriever): conf_files.append("gemini") if _annotate_somatic(data, retriever): conf_files.append("somatic") if dd.get_analysis(data).lower().find("rna-seq") >= 0: conf_files.append("rnaedit") return conf_files
def run_rnaseq_ann_filter(data): """Run RNA-seq annotation and filtering. """ data = to_single_data(data) if dd.get_vrn_file(data): eff_file = effects.add_to_vcf(dd.get_vrn_file(data), data)[0] if eff_file: data = dd.set_vrn_file(data, eff_file) ann_file = population.run_vcfanno(dd.get_vrn_file(data), data) if ann_file: data = dd.set_vrn_file(data, ann_file) variantcaller = dd.get_variantcaller(data) if variantcaller and ("gatk-haplotype" in variantcaller): filter_file = variation.gatk_filter_rnaseq(dd.get_vrn_file(data), data) data = dd.set_vrn_file(data, filter_file) # remove variants close to splice junctions vrn_file = dd.get_vrn_file(data) vrn_file = variation.filter_junction_variants(vrn_file, data) data = dd.set_vrn_file(data, vrn_file) return [[data]]
def run_rnaseq_ann_filter(data): """Run RNA-seq annotation and filtering. """ data = to_single_data(data) if dd.get_vrn_file(data): eff_file = effects.add_to_vcf(dd.get_vrn_file(data), data)[0] if eff_file: data = dd.set_vrn_file(data, eff_file) ann_file = population.run_vcfanno(dd.get_vrn_file(data), data) if ann_file: data = dd.set_vrn_file(data, ann_file) variantcaller = dd.get_variantcaller(data) if variantcaller and ("gatk-haplotype" in variantcaller): filter_file = variation.gatk_filter_rnaseq(dd.get_vrn_file(data), data) data = dd.set_vrn_file(data, filter_file) # remove variants close to splice junctions vrn_file = dd.get_vrn_file(data) vrn_file = variation.filter_junction_variants(vrn_file, data) data = dd.set_vrn_file(data, vrn_file) return [[data]]
def run_rnaseq_variant_calling(data): """ run RNA-seq variant calling, variation file is stored in `vrn_file` in the datadict """ variantcaller = dd.get_variantcaller(data) if isinstance(variantcaller, list) and len(variantcaller) > 1: logger.error("Only one variantcaller can be run for RNA-seq at " "this time. Post an issue here " "(https://github.com/chapmanb/bcbio-nextgen/issues) " "if this is something you need to do.") sys.exit(1) if variantcaller and "gatk" in variantcaller: data = variation.rnaseq_gatk_variant_calling(data) if vardict.get_vardict_command(data): data = variation.rnaseq_vardict_variant_calling(data) if dd.get_vrn_file(data): vrn_file = vcfanno.run_vcfanno(dd.get_vrn_file(data), ["rnaedit"], data) data = dd.set_vrn_file(data, vrn_file) return [[data]]
def run_rnaseq_variant_calling(data): """ run RNA-seq variant calling, variation file is stored in `vrn_file` in the datadict """ variantcaller = dd.get_variantcaller(data) if isinstance(variantcaller, list) and len(variantcaller) > 1: logger.error("Only one variantcaller can be run for RNA-seq at " "this time. Post an issue here " "(https://github.com/chapmanb/bcbio-nextgen/issues) " "if this is something you need to do.") sys.exit(1) if variantcaller and "gatk" in variantcaller: data = variation.rnaseq_gatk_variant_calling(data) if vardict.get_vardict_command(data): data = variation.rnaseq_vardict_variant_calling(data) # annotate RNA-editing events with vcfanno if dd.get_vrn_file(data): vrn_file = vcfanno.run_vcfanno(dd.get_vrn_file(data), "rnaedit", data) data = dd.set_vrn_file(data, vrn_file) return [[data]]
def run_rnaseq_joint_genotyping(*samples): data = samples[0][0] variantcaller = dd.get_variantcaller(data) ref_file = dd.get_ref_file(data) out_file = os.path.join(dd.get_work_dir(data, "."), "variation", "combined.vcf") if variantcaller and "gatk" in variantcaller: vrn_files = [dd.get_vrn_file(d) for d in dd.sample_data_iterator(samples)] out_file = variation.gatk_joint_calling(data, vrn_files, ref_file, out_file) updated_samples = [] for data in dd.sample_data_iterator(samples): data = dd.set_square_vcf(data, out_file) updated_samples.append([data]) return updated_samples return samples
def run_rnaseq_joint_genotyping(*samples): data = samples[0][0] variantcaller = dd.get_variantcaller(data) ref_file = dd.get_ref_file(data) out_file = os.path.join(dd.get_work_dir(data, "."), "variation", "combined.vcf") if variantcaller and "gatk" in variantcaller: vrn_files = [dd.get_vrn_file(d) for d in dd.sample_data_iterator(samples)] out_file = variation.gatk_joint_calling(data, vrn_files, ref_file, out_file) updated_samples = [] for data in dd.sample_data_iterator(samples): data = dd.set_square_vcf(data, out_file) updated_samples.append([data]) return updated_samples return samples
def run_rnaseq_joint_genotyping(*samples): data = samples[0][0] variantcaller = dd.get_variantcaller(data) if not variantcaller: return samples if "gatk" not in variantcaller: return samples ref_file = dd.get_ref_file(data) if variantcaller and "gatk" in variantcaller: vrn_files = [dd.get_vrn_file(d) for d in dd.sample_data_iterator(samples)] out_file = variation.gatk_joint_calling(data, vrn_files, ref_file) vrn_file = vcfanno.run_vcfanno(out_file, ["rnaedit"], data) updated_samples = [] for data in dd.sample_data_iterator(samples): data = dd.set_square_vcf(data, vrn_file) updated_samples.append([data]) return updated_samples return samples
def run_rnaseq_joint_genotyping(*samples): data = samples[0][0] variantcaller = dd.get_variantcaller(data) if not variantcaller: return samples if "gatk" not in variantcaller: return samples ref_file = dd.get_ref_file(data) if variantcaller and "gatk" in variantcaller: vrn_files = [dd.get_vrn_file(d) for d in dd.sample_data_iterator(samples)] out_file = variation.gatk_joint_calling(data, vrn_files, ref_file) vrn_file = vcfanno.run_vcfanno(out_file, "rnaedit", data) updated_samples = [] for data in dd.sample_data_iterator(samples): data = dd.set_square_vcf(data, vrn_file) updated_samples.append([data]) return updated_samples return samples
def run(bam_file, data, out_dir): """Retrieve high level variant statistics from Gemini. """ out = {} gemini_dbs = [ d for d in [tz.get_in(["population", "db"], x) for x in data.get("variants", [])] if d ] if len(gemini_dbs) > 0: gemini_db = gemini_dbs[0] gemini_stat_file = "%s-stats.yaml" % os.path.splitext(gemini_db)[0] if not utils.file_uptodate(gemini_stat_file, gemini_db): gemini = config_utils.get_program("gemini", data["config"]) tstv = subprocess.check_output( [gemini, "stats", "--tstv", gemini_db]) gt_counts = subprocess.check_output( [gemini, "stats", "--gts-by-sample", gemini_db]) dbsnp_count = subprocess.check_output([ gemini, "query", gemini_db, "-q", "SELECT count(*) FROM variants WHERE in_dbsnp==1" ]) out["Transition/Transversion"] = tstv.split("\n")[1].split()[-1] for line in gt_counts.split("\n"): parts = line.rstrip().split() if len(parts) > 0 and parts[0] != "sample": name, hom_ref, het, hom_var, _, total = parts out[name] = {} out[name]["Variations (heterozygous)"] = int(het) out[name]["Variations (homozygous)"] = int(hom_var) # same total variations for all samples, keep that top level as well. out["Variations (total)"] = int(total) out["Variations (in dbSNP)"] = int(dbsnp_count.strip()) if out.get("Variations (total)") > 0: out["Variations (in dbSNP) pct"] = "%.1f%%" % ( out["Variations (in dbSNP)"] / float(out["Variations (total)"]) * 100.0) with open(gemini_stat_file, "w") as out_handle: yaml.safe_dump(out, out_handle, default_flow_style=False, allow_unicode=False) else: with open(gemini_stat_file) as in_handle: out = yaml.safe_load(in_handle) else: vcf_file = dd.get_vrn_file(data) if isinstance(vcf_file, list): vcf_file = vcf_file[0] if vcf_file: out_file = "%s-bcfstats.tsv" % utils.splitext_plus(vcf_file)[0] bcftools = config_utils.get_program("bcftools", data["config"]) if not utils.file_exists(out_file): cmd = ("{bcftools} stats -f PASS {vcf_file} > {out_file}") do.run(cmd.format(**locals()), "basic vcf stats %s" % dd.get_sample_name(data)) with open(out_file) as in_handle: for line in in_handle: if line.startswith("SN") and line.find("records") > -1: cols = line.split() out["Variations (total)"] = cols[-1] res = {} for k, v in out.iteritems(): if not isinstance(v, dict): res.update({k: v}) if k == dd.get_sample_name(data): res.update(v) return res