def add_exac_from_vds(hail_context, vds, genome_version, root="va.exac", top_level_fields=TOP_LEVEL_FIELDS, info_fields=INFO_FIELDS, verbose=True):
    """Annotate the vds with fields copied from the ExAC r1 sites VDS.

    Args:
        hail_context: object whose read() method loads the ExAC VDS.
        vds: dataset to annotate.
        genome_version (str): "37" or "38"; selects which ExAC VDS path is read.
        root (str): annotation root passed to the expression builder.
        top_level_fields (str): schema string of fields taken from the "vds" root.
        info_fields (str): schema string of fields taken from the "vds.info" root.
        verbose (bool): if True, print each generated annotation expression.

    Returns:
        The result of annotating vds with the top-level fields and then with
        the info fields (two chained annotate_variants_vds calls).

    Raises:
        ValueError: if genome_version is neither "37" nor "38".
    """
    if genome_version not in ("37", "38"):
        raise ValueError("Invalid genome_version: " + str(genome_version))

    exac_paths = {
        "37": 'gs://seqr-reference-data/GRCh37/gnomad/ExAC.r1.sites.vds',
        "38": 'gs://seqr-reference-data/GRCh38/gnomad/ExAC.r1.sites.liftover.b38.vds',
    }
    exac_vds = hail_context.read(exac_paths[genome_version]).split_multi()

    # Build (and optionally print) both expressions before annotating anything.
    exprs = []
    for source_fields, source_root in ((top_level_fields, "vds"), (info_fields, "vds.info")):
        annotation_expr = convert_vds_schema_string_to_annotate_variants_expr(
            root=root,
            other_source_fields=source_fields,
            other_source_root=source_root,
        )
        if verbose:
            print(annotation_expr)
        exprs.append(annotation_expr)

    top_fields_expr, info_fields_expr = exprs
    annotated_vds = vds.annotate_variants_vds(exac_vds, expr=top_fields_expr)
    return annotated_vds.annotate_variants_vds(exac_vds, expr=info_fields_expr)
def add_mpc_from_vds(hail_context, vds, genome_version, root="va.mpc", info_fields=MPC_INFO_FIELDS, verbose=True):
    """Add MPC annotations [Samocha 2017] to the vds.

    Args:
        hail_context: object whose read() method loads the MPC VDS.
        vds: dataset to annotate.
        genome_version (str): "37" or "38"; selects which MPC VDS path is read.
        root (str): annotation root passed to the expression builder.
        info_fields (str): schema string of fields taken from the "vds.info" root.
        verbose (bool): if True, print the generated annotation expression.

    Returns:
        The result of vds.annotate_variants_vds() with the MPC VDS.

    Raises:
        ValueError: if genome_version is neither "37" nor "38".
    """
    if genome_version not in ("37", "38"):
        raise ValueError("Invalid genome_version: " + str(genome_version))

    mpc_paths = {
        "37": 'gs://seqr-reference-data/GRCh37/MPC/fordist_constraint_official_mpc_values.vds',
        "38": 'gs://seqr-reference-data/GRCh38/MPC/fordist_constraint_official_mpc_values.liftover.GRCh38.vds',
    }
    source_vds = hail_context.read(mpc_paths[genome_version])
    source_vds = source_vds.split_multi()

    annotation_expr = convert_vds_schema_string_to_annotate_variants_expr(
        root=root,
        other_source_fields=info_fields,
        other_source_root="vds.info",
    )
    if verbose:
        print(annotation_expr)

    return vds.annotate_variants_vds(source_vds, expr=annotation_expr)
def add_clinvar_from_vds(hail_context, vds, genome_version, root="va.clinvar", info_fields=CLINVAR_FIELDS, verbose=True):
    """Add clinvar annotations to the vds.

    The clinvar data is imported from two VCFs (the "single" and "multi"
    allele files) in one import_vcf() call.

    Args:
        hail_context: object whose import_vcf() method loads the clinvar VCFs.
        vds: dataset to annotate.
        genome_version (str): "37" or "38"; selects which pair of VCFs is imported.
        root (str): annotation root passed to the expression builder.
        info_fields (str): schema string of fields taken from the "vds.info" root.
        verbose (bool): if True, print the generated annotation expression.

    Returns:
        The result of vds.annotate_variants_vds() with the imported clinvar data.

    Raises:
        ValueError: if genome_version is neither "37" nor "38".
    """
    if genome_version not in ("37", "38"):
        raise ValueError("Invalid genome_version: " + str(genome_version))

    # (single-allele VCF, multi-allele VCF) for each supported genome build
    vcf_paths_by_build = {
        "37": (
            'gs://seqr-reference-data/GRCh37/clinvar/clinvar_alleles.single.b37.vcf.gz',
            'gs://seqr-reference-data/GRCh37/clinvar/clinvar_alleles.multi.b37.vcf.gz',
        ),
        "38": (
            'gs://seqr-reference-data/GRCh38/clinvar/clinvar_alleles.single.b38.vcf.gz',
            'gs://seqr-reference-data/GRCh38/clinvar/clinvar_alleles.multi.b38.vcf.gz',
        ),
    }
    clinvar_single_vcf, clinvar_multi_vcf = vcf_paths_by_build[genome_version]

    source_vds = hail_context.import_vcf(
        [clinvar_single_vcf, clinvar_multi_vcf],
        force_bgz=True,
        min_partitions=1000,
    )

    annotation_expr = convert_vds_schema_string_to_annotate_variants_expr(
        root=root,
        other_source_fields=info_fields,
        other_source_root="vds.info",
    )
    if verbose:
        print(annotation_expr)

    return vds.annotate_variants_vds(source_vds, expr=annotation_expr)
def add_cadd_to_vds(hail_context, vds, genome_version, root="va.cadd", info_fields=CADD_FIELDS, subset=None, verbose=True):
    """Add CADD scores to the vds.

    Args:
        hail_context: object whose read() method loads the CADD VDS.
        vds: dataset to annotate.
        genome_version (str): "37" or "38"; interpolated into the CADD VDS path.
        root (str): annotation root passed to the expression builder.
        info_fields (str): schema string of fields taken from the "vds.info" root.
        subset (str): optional interval string; when truthy, the CADD VDS is
            filtered to hail.Interval.parse(subset) before annotating.
        verbose (bool): if True, print the generated annotation expression.

    Returns:
        The result of vds.annotate_variants_vds() with the CADD VDS.

    Raises:
        ValueError: if genome_version is neither "37" nor "38".
    """
    if genome_version not in ("37", "38"):
        raise ValueError("Invalid genome_version: " + str(genome_version))

    annotation_expr = convert_vds_schema_string_to_annotate_variants_expr(
        root=root,
        other_source_fields=info_fields,
        other_source_root="vds.info",
    )
    if verbose:
        print(annotation_expr)

    path_params = {"genome_version": genome_version}
    cadd_vds_path = "gs://seqr-reference-data/GRCh%(genome_version)s/CADD/CADD_snvs_and_indels.vds" % path_params
    logger.info("==> Reading in CADD: %s" % cadd_vds_path)
    source_vds = hail_context.read(cadd_vds_path)

    if subset:
        # hail is imported only when interval filtering is requested
        import hail
        interval = hail.Interval.parse(subset)
        source_vds = source_vds.filter_intervals(interval)

    return vds.annotate_variants_vds(source_vds, expr=annotation_expr)
def add_cadd_from_vds(hail_context, vds, genome_version, root="va.cadd", info_fields=CADD_FIELDS, verbose=True):
    """Add CADD scores to the vds.

    The SNV and indel CADD VDS paths are passed together to a single
    hail_context.read() call.

    Args:
        hail_context: object whose read() method loads the CADD VDS files.
        vds: dataset to annotate.
        genome_version (str): "37" or "38"; selects which pair of paths is read.
        root (str): annotation root passed to the expression builder.
        info_fields (str): schema string of fields taken from the "vds.info" root.
        verbose (bool): if True, print the generated annotation expression.

    Returns:
        The result of vds.annotate_variants_vds() with the CADD data.

    Raises:
        ValueError: if genome_version is neither "37" nor "38".
    """
    if genome_version not in ("37", "38"):
        raise ValueError("Invalid genome_version: " + str(genome_version))

    if genome_version == "37":
        cadd_paths = [
            'gs://seqr-reference-data/GRCh37/CADD/whole_genome_SNVs.vds',
            'gs://seqr-reference-data/GRCh37/CADD/InDels.vds',
        ]
    else:
        cadd_paths = [
            'gs://seqr-reference-data/GRCh38/CADD/whole_genome_SNVs.liftover.GRCh38.vds',
            'gs://seqr-reference-data/GRCh38/CADD/InDels.liftover.GRCh38.vds',
        ]

    source_vds = hail_context.read(cadd_paths).split_multi()

    annotation_expr = convert_vds_schema_string_to_annotate_variants_expr(
        root=root,
        other_source_fields=info_fields,
        other_source_root="vds.info",
    )
    if verbose:
        print(annotation_expr)

    return vds.annotate_variants_vds(source_vds, expr=annotation_expr)
def add_topmed_to_vds(hail_context, vds, genome_version, root="va.topmed", fields=TOPMED_FIELDS, subset=None, verbose=True):
    """Add TOPMed annotations to the vds.

    Only genome version "38" has a TOPMed VDS path; "37" is rejected as not
    yet available.

    Args:
        hail_context: object whose read() method loads the TOPMed VDS.
        vds: dataset to annotate.
        genome_version (str): must be "38".
        root (str): annotation root passed to the expression builder.
        fields (str): schema string of fields taken from the "vds.info" root.
        subset (str): optional interval string; when truthy, the TOPMed VDS is
            filtered to hail.Interval.parse(subset) before annotating.
        verbose (bool): if True, print the generated annotation expression.

    Returns:
        The result of vds.annotate_variants_vds() with the TOPMed VDS.

    Raises:
        ValueError: if genome_version is "37" ("Not yet available") or any
            other value besides "38".
    """
    if genome_version == "37":
        raise ValueError("Not yet available")
    if genome_version != "38":
        raise ValueError("Invalid genome_version: " + str(genome_version))

    topmed_vds_path = 'gs://seqr-reference-data/GRCh38/TopMed/ALL.TOPMed_freeze5_hg38_dbSNP.vds'
    source_vds = hail_context.read(topmed_vds_path).split_multi()

    if subset:
        # hail is imported only when interval filtering is requested
        import hail
        interval = hail.Interval.parse(subset)
        source_vds = source_vds.filter_intervals(interval)

    annotation_expr = convert_vds_schema_string_to_annotate_variants_expr(
        root=root,
        other_source_fields=fields,
        other_source_root="vds.info",
    )
    if verbose:
        print(annotation_expr)

    return vds.annotate_variants_vds(source_vds, expr=annotation_expr)
def add_1kg_phase3_from_vds(hail_context, vds, genome_version, root="va.g1k", fields=G1K_FIELDS, verbose=True):
    """Add 1000 genome AC and AF annotations to the vds.

    Args:
        hail_context: object whose read() method loads the 1kg phase3 sites VDS.
        vds: dataset to annotate.
        genome_version (str): "37" or "38"; selects which 1kg VDS path is read.
        root (str): annotation root passed to the expression builder.
        fields (str): schema string of fields taken from the "vds.g1k" root.
        verbose (bool): if True, print the generated annotation expression.

    Returns:
        The result of vds.annotate_variants_vds() with the 1kg VDS.

    Raises:
        ValueError: if genome_version is neither "37" nor "38".
    """
    if genome_version not in ("37", "38"):
        raise ValueError("Invalid genome_version: " + str(genome_version))

    g1k_paths = {
        "37": 'gs://seqr-reference-data/GRCh37/1kg/1kg.wgs.phase3.20130502.GRCh37_sites.vds',
        "38": 'gs://seqr-reference-data/GRCh38/1kg/1kg.wgs.phase3.20170504.GRCh38_sites.vds',
    }
    source_vds = hail_context.read(g1k_paths[genome_version]).split_multi()

    # NOTE(review): every sibling annotator reads from "vds.info" (or "vds");
    # this one reads from "vds.g1k". Confirm the 1kg VDS really exposes its
    # fields under a "g1k" struct rather than "info".
    annotation_expr = convert_vds_schema_string_to_annotate_variants_expr(
        root=root,
        other_source_fields=fields,
        other_source_root="vds.g1k",
    )
    if verbose:
        print(annotation_expr)

    return vds.annotate_variants_vds(source_vds, expr=annotation_expr)
def add_dbnsfp_to_vds(hail_context, vds, genome_version, root="va.dbnsfp", subset=None, verbose=True):
    """Add dbNSFP fields to the VDS.

    Genome version "37" uses the dbNSFP v2.9.3 VDS with DBNSFP_SCHEMA_37;
    "38" uses the v3.5a VDS with DBNSFP_SCHEMA_38.

    Args:
        hail_context: object whose read() method loads the dbNSFP VDS.
        vds: dataset to annotate.
        genome_version (str): "37" or "38".
        root (str): annotation root passed to the expression builder.
        subset (str): optional interval string; when truthy, the dbNSFP VDS is
            filtered to hail.Interval.parse(subset) before annotating.
        verbose (bool): if True, print the generated annotation expression.

    Returns:
        The result of vds.annotate_variants_vds() with the dbNSFP VDS.

    Raises:
        ValueError: if genome_version is neither "37" nor "38".
    """
    if genome_version not in ("37", "38"):
        raise ValueError("Invalid genome_version: " + str(genome_version))

    if genome_version == "37":
        dbnsfp_path = "gs://seqr-reference-data/GRCh37/dbNSFP/v2.9.3/dbNSFP2.9.3_variant.vds"
        dbnsfp_schema = DBNSFP_SCHEMA_37
    else:
        dbnsfp_path = "gs://seqr-reference-data/GRCh38/dbNSFP/v3.5/dbNSFP3.5a_variant.vds"
        dbnsfp_schema = DBNSFP_SCHEMA_38

    source_vds = hail_context.read(dbnsfp_path)

    if subset:
        # hail is imported only when interval filtering is requested
        import hail
        interval = hail.Interval.parse(subset)
        source_vds = source_vds.filter_intervals(interval)

    # dbNSFP fields are read from the top level of the source ("vds"), not "vds.info"
    annotation_expr = convert_vds_schema_string_to_annotate_variants_expr(
        root=root,
        other_source_fields=dbnsfp_schema,
        other_source_root="vds",
    )
    if verbose:
        print(annotation_expr)

    return vds.annotate_variants_vds(source_vds, expr=annotation_expr)
def add_gnomad_to_vds(hail_context, vds, genome_version, exomes_or_genomes, root=None, top_level_fields=TOP_LEVEL_FIELDS, info_fields=INFO_FIELDS, subset=None, verbose=True):
    """Annotate the vds with fields from the gnomAD exomes or genomes VDS.

    Args:
        hail_context: object whose read() method loads the gnomAD VDS.
        vds: dataset to annotate.
        genome_version (str): "37" or "38".
        exomes_or_genomes (str): "exomes" or "genomes". Together with
            genome_version this forms the GNOMAD_VDS_PATHS key
            ("<exomes_or_genomes>_<genome_version>").
        root (str): annotation root passed to the expression builder. Defaults
            to "va.gnomad_<exomes_or_genomes>" when None.
        top_level_fields (str): schema string of fields taken from the "vds" root.
        info_fields (str): newline-separated schema string of fields taken from
            the "vds.info" root.
        subset (str): optional interval string; when truthy, the gnomAD VDS is
            filtered to hail.Interval.parse(subset) before annotating.
        verbose (bool): if True, print each generated annotation expression.

    Returns:
        The result of a single vds.annotate_variants_vds() call that applies
        both the top-level and the info expressions.

    Raises:
        ValueError: if genome_version or exomes_or_genomes has an unsupported value.
    """
    if genome_version not in ("37", "38"):
        raise ValueError("Invalid genome_version: %s. Must be '37' or '38'" % str(genome_version))

    if exomes_or_genomes not in ("exomes", "genomes"):
        # Name and echo the argument that actually failed validation
        # (previously this reported genome_version by mistake).
        raise ValueError("Invalid exomes_or_genomes: %s. Must be 'exomes' or 'genomes'" % str(exomes_or_genomes))

    if root is None:
        root = "va.gnomad_%s" % exomes_or_genomes

    gnomad_vds_path = GNOMAD_VDS_PATHS["%s_%s" % (exomes_or_genomes, genome_version)]
    gnomad_vds = hail_context.read(gnomad_vds_path).split_multi()

    if subset:
        # hail is imported only when interval filtering is requested
        import hail
        gnomad_vds = gnomad_vds.filter_intervals(hail.Interval.parse(subset))

    if exomes_or_genomes == "genomes":
        # remove any *SAS* fields from genomes since South Asian population only defined for exomes
        info_fields = "\n".join(field for field in info_fields.split("\n") if "SAS" not in field)

    top_fields_expr = convert_vds_schema_string_to_annotate_variants_expr(
        root=root,
        other_source_fields=top_level_fields,
        other_source_root="vds",
    )
    if verbose:
        print(top_fields_expr)

    info_fields_expr = convert_vds_schema_string_to_annotate_variants_expr(
        root=root,
        other_source_fields=info_fields,
        other_source_root="vds.info",
    )
    if verbose:
        print(info_fields_expr)

    # Apply both expressions in one annotate call against the gnomAD VDS.
    combined_expr = ", ".join([top_fields_expr, info_fields_expr])
    return vds.annotate_variants_vds(gnomad_vds, expr=combined_expr)
def compute_minimal_schema(vds, analysis_type):
    """Reduce the variant annotations of vds to a minimal set of fields.

    First adds a computed va.docId annotation, then builds a "va.clean" struct
    containing only the fields listed for the given analysis_type and finally
    replaces va with va.clean.

    Args:
        vds: dataset to transform.
        analysis_type (str): "GATK_VARIANTS", "MANTA_SVS" or "JULIA_SVS".

    Returns:
        The vds whose va has been replaced by the minimal va.clean struct.

    Raises:
        ValueError: if analysis_type is not one of the supported values. Note
            that va.docId has already been added to vds at that point.
    """
    # add computed annotations
    # NOTE(review): 512 is presumably a length limit for the id - confirm
    # against get_expr_for_variant_id.
    doc_id_expr = "va.docId = %s" % get_expr_for_variant_id(512)
    vds = vds.annotate_variants_expr([doc_id_expr])

    # choose which top-level fields survive for this kind of dataset
    if analysis_type == "GATK_VARIANTS":
        top_level_fields = """
            docId: String,
            wasSplit: Boolean,
            aIndex: Int,
        """
    elif analysis_type in ["MANTA_SVS", "JULIA_SVS"]:
        top_level_fields = """
            docId: String,
        """
    else:
        raise ValueError("Unexpected analysis_type: %s" % analysis_type)

    clean_expr = convert_vds_schema_string_to_annotate_variants_expr(
        root="va.clean",
        top_level_fields=top_level_fields,
        info_fields="",
    )
    vds = vds.annotate_variants_expr(expr=clean_expr)

    # drop everything except the fields copied into va.clean
    return vds.annotate_variants_expr("va = va.clean")
AN: Int, --- BaseQRankSum: Double, --- ClippingRankSum: Double, DP: Int, FS: Double, InbreedingCoeff: Double, MQ: Double, --- MQRankSum: Double, QD: Double, --- ReadPosRankSum: Double, VQSLOD: Double, culprit: String, """ } expr = convert_vds_schema_string_to_annotate_variants_expr(root="va.clean", **INPUT_SCHEMA) vds = vds.annotate_variants_expr(expr=expr) vds = vds.annotate_variants_expr("va = va.clean") # add reference data CLINVAR_INFO_FIELDS = """ MEASURESET_TYPE: String, MEASURESET_ID: String, RCV: String, ALLELE_ID: String, CLINICAL_SIGNIFICANCE: String, PATHOGENIC: String, BENIGN: String, CONFLICTED: String,
def add_exac_to_vds(hail_context, vds, genome_version, root="va.exac", top_level_fields=TOP_LEVEL_FIELDS, info_fields=INFO_FIELDS, subset=None, verbose=True):
    """Annotate the vds with fields from the ExAC r1 sites VDS.

    Per-population allele frequencies (va.info.AF_*) are computed on the ExAC
    VDS from its AC_*/AN_* fields before the requested fields are copied over.

    Args:
        hail_context: object whose read() method loads the ExAC VDS.
        vds: dataset to annotate.
        genome_version (str): "37" or "38"; selects which ExAC VDS path is read.
        root (str): annotation root passed to the expression builder.
        top_level_fields (str): schema string of fields taken from the "vds" root.
        info_fields (str): schema string of fields taken from the "vds.info" root.
        subset (str): optional interval string; when truthy, the ExAC VDS is
            filtered to hail.Interval.parse(subset) before annotating.
        verbose (bool): if True, print the generated annotation expressions and
            the variant schema of the annotated vds.

    Returns:
        The result of a single vds.annotate_variants_vds() call that applies
        both the top-level and the info expressions.

    Raises:
        ValueError: if genome_version is neither "37" nor "38".
    """
    if genome_version == "37":
        exac_vds_path = 'gs://seqr-reference-data/GRCh37/gnomad/ExAC.r1.sites.vds'
    elif genome_version == "38":
        exac_vds_path = 'gs://seqr-reference-data/GRCh38/gnomad/ExAC.r1.sites.liftover.b38.vds'
    else:
        raise ValueError("Invalid genome_version: " + str(genome_version))

    exac_vds = hail_context.read(exac_vds_path).split_multi()

    if subset:
        # hail is imported only when interval filtering is requested
        import hail
        exac_vds = exac_vds.filter_intervals(hail.Interval.parse(subset))

    # ExAC VCF doesn't contain AF fields, so compute them here.
    # AC_* are indexed by allele (va.aIndex-1); the POPMAX fields are strings
    # (compared against "NA" and converted with toInt()).
    exac_vds = exac_vds.annotate_variants_expr("""
        va.info.AF_AFR = if(va.info.AN_AFR > 0) va.info.AC_AFR[va.aIndex-1] / va.info.AN_AFR else NA:Float,
        va.info.AF_AMR = if(va.info.AN_AMR > 0) va.info.AC_AMR[va.aIndex-1] / va.info.AN_AMR else NA:Float,
        va.info.AF_EAS = if(va.info.AN_EAS > 0) va.info.AC_EAS[va.aIndex-1] / va.info.AN_EAS else NA:Float,
        va.info.AF_FIN = if(va.info.AN_FIN > 0) va.info.AC_FIN[va.aIndex-1] / va.info.AN_FIN else NA:Float,
        va.info.AF_NFE = if(va.info.AN_NFE > 0) va.info.AC_NFE[va.aIndex-1] / va.info.AN_NFE else NA:Float,
        va.info.AF_OTH = if(va.info.AN_OTH > 0) va.info.AC_OTH[va.aIndex-1] / va.info.AN_OTH else NA:Float,
        va.info.AF_SAS = if(va.info.AN_SAS > 0) va.info.AC_SAS[va.aIndex-1] / va.info.AN_SAS else NA:Float,
        va.info.AF_POPMAX = if(va.info.AN_POPMAX[va.aIndex-1] != "NA" && va.info.AN_POPMAX[va.aIndex-1].toInt() > 0) va.info.AC_POPMAX[va.aIndex-1].toInt() / va.info.AN_POPMAX[va.aIndex-1].toInt() else NA:Float
    """)

    top_fields_expr = convert_vds_schema_string_to_annotate_variants_expr(
        root=root,
        other_source_fields=top_level_fields,
        other_source_root="vds",
    )
    if verbose:
        print(top_fields_expr)

    info_fields_expr = convert_vds_schema_string_to_annotate_variants_expr(
        root=root,
        other_source_fields=info_fields,
        other_source_root="vds.info",
    )
    if verbose:
        print(info_fields_expr)

    vds = vds.annotate_variants_vds(exac_vds, expr=", ".join([top_fields_expr, info_fields_expr]))

    if verbose:
        # The schema dump is diagnostic output, so it honours the verbose flag
        # like the expression prints above (it used to print unconditionally).
        from pprint import pprint
        pprint(vds.variant_schema)

    return vds