def create_gemini_db(gemini_vcf, data, gemini_db=None, ped_file=None):
    """Load variants into a GEMINI database using vcfanno followed by vcf2db.

    gemini_vcf -- input VCF file to annotate and load.
    data -- sample data dictionary used to look up programs and configuration.
    gemini_db -- output database path; defaults to the VCF base name plus `.db`.
    ped_file -- pedigree file handed to vcf2db.py as its second positional argument.

    Returns None when the input VCF has no variants, otherwise the database
    path. Annotation and loading are skipped when the database already exists.
    """
    db_path = gemini_db or "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if not vcfutils.vcf_has_variants(gemini_vcf):
        return None
    if utils.file_exists(db_path):
        return db_path

    if support_gemini_orig(data):
        data_basepath = install.get_gemini_dir(data)
    else:
        data_basepath = None
    # Fall back to the stock gemini configuration when none is supplied.
    conf_files = dd.get_vcfanno(data) or ["gemini"]
    ann_file = vcfanno.run_vcfanno(gemini_vcf, conf_files, data, data_basepath)

    with file_transaction(data, db_path) as tx_gemini_db:
        vcf2db = config_utils.get_program("vcf2db.py", data)
        expand_args = []
        if "vcf2db_expand" in dd.get_tools_on(data):
            for field in ("gt_types", "gt_ref_depths", "gt_alt_depths"):
                expand_args += ["--expand", field]
        do.run([vcf2db, ann_file, ped_file, tx_gemini_db] + expand_args,
               "GEMINI: create database with vcf2db")
    return db_path
def run_vcfanno(vcf_file, data, decomposed=False):
    """Annotate a VCF with vcfanno using external database configurations.

    Configurations from `dd.get_vcfanno` are grouped by the base path they
    need: entries containing "gemini" on human build 37 use the GEMINI data
    directory, all others use no base path. Without configured entries, the
    groups come from `_default_conf_files`. Each group is run in turn on the
    output of the previous successful run.

    Returns the last annotated file produced, or None if nothing was annotated.
    """
    configured = dd.get_vcfanno(data)
    if configured:
        if not isinstance(configured, (list, tuple)):
            configured = [configured]
        by_basepath = collections.defaultdict(list)
        for conf in configured:
            wants_gemini_dir = conf.find("gemini") >= 0 and is_human(data, builds=["37"])
            basepath = install.get_gemini_dir(data) if wants_gemini_dir else None
            by_basepath[basepath].append(conf)
        conf_groups = by_basepath.items()
    else:
        conf_groups = _default_conf_files(data)

    out_file = None
    if conf_groups:
        for data_basepath, group in conf_groups:
            ann_file = vcfanno.run_vcfanno(vcf_file, group, data,
                                           data_basepath=data_basepath,
                                           decomposed=decomposed)
            if ann_file:
                # Chain runs: the next group annotates this output.
                out_file = ann_file
                vcf_file = ann_file
    return out_file
def postprocess_variants(items):
    """Provide post-processing of variant calls: filtering and effects annotation.

    Picks a representative `data` item for the batch, then runs, in order:
    effects annotation, VCF finalization, optional RNA editing and population
    annotation, filtering, prioritization, germline extraction and damage
    filtering. The working variant file is stored under `vrn_file`, or under
    `vrn_file_joint` when the first input item has that key.

    Returns the updated representative item wrapped as `[[data]]`.
    """
    vrn_key = "vrn_file"
    if not isinstance(items, dict):
        items = [utils.to_single_data(x) for x in items]
        # Switch to the joint key when the first item carries it.
        if "vrn_file_joint" in items[0]:
            vrn_key = "vrn_file_joint"
    data, items = _get_batch_representative(items, vrn_key)
    items = cwlutils.unpack_tarballs(items, data)
    data = cwlutils.unpack_tarballs(data, data)
    cur_name = "%s, %s" % (dd.get_sample_name(data), get_variantcaller(data))
    logger.info("Finalizing variant calls: %s" % cur_name)
    # Remember the incoming path so it can be restored if nothing changes the file.
    orig_vrn_file = data.get(vrn_key)
    data = _symlink_to_workdir(data, [vrn_key])
    data = _symlink_to_workdir(data, ["config", "algorithm", "variant_regions"])
    if data.get(vrn_key):
        logger.info("Calculating variation effects for %s" % cur_name)
        ann_vrn_file, vrn_stats = effects.add_to_vcf(data[vrn_key], data)
        if ann_vrn_file:
            data[vrn_key] = ann_vrn_file
        if vrn_stats:
            data["vrn_stats"] = vrn_stats
        orig_items = _get_orig_items(items)
        logger.info("Annotate VCF file: %s" % cur_name)
        data[vrn_key] = annotation.finalize_vcf(data[vrn_key], get_variantcaller(data), orig_items)
        # RNA-seq analyses additionally get RNA editing annotations.
        if dd.get_analysis(data).lower().find("rna-seq") >= 0:
            logger.info("Annotate RNA editing sites")
            ann_file = vcfanno.run_vcfanno(dd.get_vrn_file(data), ["rnaedit"], data)
            if ann_file:
                data[vrn_key] = ann_file
        # Population annotation happens here only for CWL runs.
        if cwlutils.is_cwl_run(data):
            logger.info("Annotate with population level variation data")
            ann_file = population.run_vcfanno(dd.get_vrn_file(data), data,
                                              population.do_db_build([data]))
            if ann_file:
                data[vrn_key] = ann_file
        logger.info("Filtering for %s" % cur_name)
        data[vrn_key] = variant_filtration(
            data[vrn_key], dd.get_ref_file(data),
            tz.get_in(("genome_resources", "variation"), data, {}),
            data, orig_items)
        logger.info("Prioritization for %s" % cur_name)
        prio_vrn_file = prioritize.handle_vcf_calls(data[vrn_key], data, orig_items)
        # Germline extraction only runs when prioritization produced a different file.
        if prio_vrn_file != data[vrn_key]:
            data[vrn_key] = prio_vrn_file
            logger.info("Germline extraction for %s" % cur_name)
            data = germline.extract(data, orig_items)
        if dd.get_align_bam(data):
            data = damage.run_filter(data[vrn_key], dd.get_align_bam(data),
                                     dd.get_ref_file(data), data, orig_items)
    # If the final file is the same file on disk as the input, report the original path.
    if orig_vrn_file and os.path.samefile(data[vrn_key], orig_vrn_file):
        data[vrn_key] = orig_vrn_file
    return [[data]]
def run_rnaseq_variant_calling(data):
    """Call variants on an RNA-seq sample and annotate the resulting calls.

    The variation file is stored in `vrn_file` in the datadict. Exits the
    process with status 1 when more than one variant caller is configured.
    After calling, the file is annotated with RNA editing sites and then with
    population data; each annotation replaces `vrn_file` only if it produced
    a file.

    Returns the updated sample wrapped as `[[data]]`.
    """
    variantcaller = dd.get_variantcaller(data)
    too_many_callers = isinstance(variantcaller, list) and len(variantcaller) > 1
    if too_many_callers:
        logger.error("Only one variantcaller can be run for RNA-seq at "
                     "this time. Post an issue here "
                     "(https://github.com/bcbio/bcbio-nextgen/issues) "
                     "if this is something you need to do.")
        sys.exit(1)

    if variantcaller:
        if "gatk-haplotype" in variantcaller:
            data = variation.rnaseq_gatk_variant_calling(data)
        if vardict.get_vardict_command(data):
            data = variation.rnaseq_vardict_variant_calling(data)

    if not dd.get_vrn_file(data):
        return [[data]]

    # RNA editing annotation first, then population level annotation.
    rnaedit_vcf = vcfanno.run_vcfanno(dd.get_vrn_file(data), ["rnaedit"], data)
    if rnaedit_vcf:
        data = dd.set_vrn_file(data, rnaedit_vcf)
    population_vcf = population.run_vcfanno(dd.get_vrn_file(data), data,
                                            population.do_db_build([data]))
    if population_vcf:
        data = dd.set_vrn_file(data, population_vcf)
    return [[data]]
def _run_vcfanno(gemini_vcf, data, use_gemini=False):
    """Annotate a VCF with the configured vcfanno files, if there are any.

    With no configured files, the "gemini" configuration is used when
    `use_gemini` is set; otherwise the input VCF is returned unchanged.
    """
    if support_gemini_orig(data):
        data_basepath = install.get_gemini_dir(data)
    else:
        data_basepath = None

    conf_files = dd.get_vcfanno(data)
    if use_gemini and not conf_files:
        conf_files = ["gemini"]

    if not conf_files:
        return gemini_vcf
    return vcfanno.run_vcfanno(gemini_vcf, conf_files, data, data_basepath)
def run_rnaseq_ann_filter(data):
    """Run RNA-seq annotation and filtering.

    Annotates the sample's variant file with RNA editing sites, then applies
    the GATK RNA-seq filter to whichever file is current. Each result is
    stored back as the sample's `vrn_file`.

    Returns the updated sample wrapped as `[[data]]`.
    """
    ann_file = vcfanno.run_vcfanno(dd.get_vrn_file(data), ["rnaedit"], data)
    if ann_file:
        # Rebind: the setter's return value carries the update, so discarding
        # it would leave `data` pointing at the un-annotated file.
        data = dd.set_vrn_file(data, ann_file)
    filter_file = variation.gatk_filter_rnaseq(dd.get_vrn_file(data), data)
    data = dd.set_vrn_file(data, filter_file)
    return [[data]]
def handle_vcf_calls(vcf_file, data, orig_items):
    """Prioritize VCF calls based on external annotations supplied through GEMINI.

    Returns the input file untouched when prioritization is not requested,
    when no GEMINI data is available, or when annotation yields no file.
    Otherwise returns the result of applying the priority filter to `vcf_file`.
    """
    if not _do_prioritize(orig_items):
        return vcf_file
    if not population.has_gemini_data(data):
        # No GEMINI database for filtering, return original file
        return vcf_file

    use_gemini_dir = population.support_gemini_orig(data)
    data_basepath = install.get_gemini_dir(data) if use_gemini_dir else None
    ann_vcf = vcfanno.run_vcfanno(vcf_file, ["gemini"], data, data_basepath)
    if not ann_vcf:
        return vcf_file
    priority_file = _prep_priority_filter_vcfanno(ann_vcf, data)
    return _apply_priority_filter(vcf_file, priority_file, data)
def create_gemini_db(gemini_vcf, data, gemini_db=None, ped_file=None):
    """Load variants into a GEMINI database using vcfanno followed by vcf2db.

    gemini_db defaults to the VCF base name plus `.db`. ped_file is handed to
    vcf2db.py as its second positional argument.

    Returns None when the input VCF has no variants, otherwise the database
    path. Annotation and loading are skipped when the database already exists.
    """
    db_path = gemini_db if gemini_db else "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if not vcfutils.vcf_has_variants(gemini_vcf):
        return None
    if utils.file_exists(db_path):
        return db_path

    data_basepath = None
    if support_gemini_orig(data):
        data_basepath = install.get_gemini_dir(data)
    ann_file = vcfanno.run_vcfanno(gemini_vcf, "gemini", data, data_basepath)
    with file_transaction(data, db_path) as tx_gemini_db:
        vcf2db = config_utils.get_program("vcf2db.py", data)
        do.run([vcf2db, ann_file, ped_file, tx_gemini_db],
               "GEMINI: create database with vcf2db")
    return db_path
def run_rnaseq_ann_filter(data):
    """Annotate and filter RNA-seq variant calls for a single sample.

    Runs RNA editing annotation, then population annotation, replacing the
    sample's `vrn_file` after each step that produced a file. The GATK
    RNA-seq filter is applied only when "gatk-haplotype" is among the
    configured variant callers.

    Returns the updated sample wrapped as `[[data]]`.
    """
    data = to_single_data(data)

    # Annotation steps run in order; each sees the output of the previous one.
    annotators = (
        lambda d: vcfanno.run_vcfanno(dd.get_vrn_file(d), ["rnaedit"], d),
        lambda d: population.run_vcfanno(dd.get_vrn_file(d), d),
    )
    for annotate in annotators:
        annotated = annotate(data)
        if annotated:
            data = dd.set_vrn_file(data, annotated)

    callers = dd.get_variantcaller(data)
    if callers and "gatk-haplotype" in callers:
        filtered = variation.gatk_filter_rnaseq(dd.get_vrn_file(data), data)
        data = dd.set_vrn_file(data, filtered)
    return [[data]]
def handle_vcf_calls(vcf_file, data, orig_items):
    """Prioritize VCF calls using GEMINI-based external annotations.

    The priority filter is applied only when prioritization is requested,
    GEMINI data is available and annotation produces a file. In every other
    case the original `vcf_file` is returned.
    """
    if _do_prioritize(orig_items) and population.has_gemini_data(data):
        if population.support_gemini_orig(data):
            data_basepath = install.get_gemini_dir(data)
        else:
            data_basepath = None
        ann_vcf = vcfanno.run_vcfanno(vcf_file, ["gemini"], data, data_basepath)
        if ann_vcf:
            priority_file = _prep_priority_filter_vcfanno(ann_vcf, data)
            return _apply_priority_filter(vcf_file, priority_file, data)
    # No GEMINI database for filtering, return original file
    return vcf_file
def run_rnaseq_joint_genotyping(*samples):
    """Jointly genotype RNA-seq samples with GATK and annotate RNA editing sites.

    The first sample's configuration decides whether joint calling runs: when
    no variant caller is set, or "gatk" is not among the callers, `samples` is
    returned unchanged.

    Otherwise returns a list of `[data]` entries, each with its square VCF set
    to the annotated joint calls.
    """
    data = samples[0][0]
    variantcaller = dd.get_variantcaller(data)
    if not variantcaller or "gatk" not in variantcaller:
        return samples

    ref_file = dd.get_ref_file(data)
    vrn_files = [dd.get_vrn_file(d) for d in dd.sample_data_iterator(samples)]
    out_file = variation.gatk_joint_calling(data, vrn_files, ref_file)
    # Keep the un-annotated joint calls if annotation yields no file, rather
    # than recording an empty square VCF on every sample.
    vrn_file = vcfanno.run_vcfanno(out_file, ["rnaedit"], data) or out_file
    return [[dd.set_square_vcf(sample, vrn_file)]
            for sample in dd.sample_data_iterator(samples)]
def run_rnaseq_joint_genotyping(*samples):
    """Jointly genotype RNA-seq samples with GATK and annotate RNA editing sites.

    The first sample's configuration decides whether joint calling runs: when
    no variant caller is set, or "gatk" is not among the callers, `samples` is
    returned unchanged.

    Otherwise returns a list of `[data]` entries, each with its square VCF set
    to the annotated joint calls.
    """
    data = samples[0][0]
    variantcaller = dd.get_variantcaller(data)
    if not variantcaller or "gatk" not in variantcaller:
        return samples

    ref_file = dd.get_ref_file(data)
    vrn_files = [dd.get_vrn_file(d) for d in dd.sample_data_iterator(samples)]
    out_file = variation.gatk_joint_calling(data, vrn_files, ref_file)
    # Keep the un-annotated joint calls if annotation yields no file, rather
    # than recording an empty square VCF on every sample.
    vrn_file = vcfanno.run_vcfanno(out_file, "rnaedit", data) or out_file
    updated_samples = []
    for sample in dd.sample_data_iterator(samples):
        updated_samples.append([dd.set_square_vcf(sample, vrn_file)])
    return updated_samples
def run_vcfanno(vcf_file, data, decomposed=False):
    """Annotate a VCF with vcfanno using external database configurations.

    Configured vcfanno files are run as a single group with no base path;
    without them the groups come from `_default_conf_files`. Each group
    annotates the output of the previous successful run.

    Returns the last annotated file produced, or None if nothing was annotated.
    """
    user_confs = dd.get_vcfanno(data)
    conf_groups = [(None, user_confs)] if user_confs else _default_conf_files(data)

    out_file = None
    for data_basepath, group in (conf_groups or ()):
        ann_file = vcfanno.run_vcfanno(vcf_file, group, data,
                                       data_basepath=data_basepath,
                                       decomposed=decomposed)
        if ann_file:
            # Chain runs: the next group annotates this output.
            out_file = ann_file
            vcf_file = ann_file
    return out_file
def run_rnaseq_variant_calling(data):
    """Run RNA-seq variant calling and annotate RNA editing sites.

    The variation file is stored in `vrn_file` in the datadict. Exits the
    process with status 1 when more than one variant caller is configured.

    Returns the updated sample wrapped as `[[data]]`.
    """
    variantcaller = dd.get_variantcaller(data)
    if isinstance(variantcaller, list) and len(variantcaller) > 1:
        logger.error("Only one variantcaller can be run for RNA-seq at "
                     "this time. Post an issue here "
                     "(https://github.com/chapmanb/bcbio-nextgen/issues) "
                     "if this is something you need to do.")
        sys.exit(1)

    if variantcaller and "gatk" in variantcaller:
        data = variation.rnaseq_gatk_variant_calling(data)
    if vardict.get_vardict_command(data):
        data = variation.rnaseq_vardict_variant_calling(data)

    if dd.get_vrn_file(data):
        ann_file = vcfanno.run_vcfanno(dd.get_vrn_file(data), ["rnaedit"], data)
        # Only replace the calls when annotation produced a file, so a
        # skipped annotation cannot wipe out the existing `vrn_file`.
        if ann_file:
            data = dd.set_vrn_file(data, ann_file)
    return [[data]]
def run_rnaseq_variant_calling(data):
    """Run RNA-seq variant calling and annotate RNA editing sites.

    The variation file is stored in `vrn_file` in the datadict. Exits the
    process with status 1 when more than one variant caller is configured.

    Returns the updated sample wrapped as `[[data]]`.
    """
    variantcaller = dd.get_variantcaller(data)
    if isinstance(variantcaller, list) and len(variantcaller) > 1:
        logger.error("Only one variantcaller can be run for RNA-seq at "
                     "this time. Post an issue here "
                     "(https://github.com/chapmanb/bcbio-nextgen/issues) "
                     "if this is something you need to do.")
        sys.exit(1)

    if variantcaller and "gatk" in variantcaller:
        data = variation.rnaseq_gatk_variant_calling(data)
    if vardict.get_vardict_command(data):
        data = variation.rnaseq_vardict_variant_calling(data)

    # annotate RNA-editing events with vcfanno
    if dd.get_vrn_file(data):
        ann_file = vcfanno.run_vcfanno(dd.get_vrn_file(data), "rnaedit", data)
        # Only replace the calls when annotation produced a file, so a
        # skipped annotation cannot wipe out the existing `vrn_file`.
        if ann_file:
            data = dd.set_vrn_file(data, ann_file)
    return [[data]]