def run_vep(
        mt: hl.MatrixTable,
        genome_version: str,
        name: str = 'vep',
        block_size: int = 1000,
        vep_config_json_path=None) -> hl.MatrixTable:
    """Annotate a MatrixTable with VEP.

    :param MatrixTable mt: MT to annotate with VEP
    :param str genome_version: "37" or "38" (only validated when no explicit config path is given)
    :param str name: Name for resulting row field
    :param int block_size: Number of rows to process per VEP invocation.
    :param vep_config_json_path: Optional VEP config JSON path. When given it is passed to
        ``hl.vep`` unchanged and the global ``gencodeVersion`` is set to "unknown".
    :return: annotated MT
    :rtype: MatrixTable
    :raises ValueError: if no config path is given and ``genome_version`` is not "37" or "38"
    """
    if vep_config_json_path is None:
        # Default config: the file expected at /vep_data on the cluster nodes.
        if genome_version not in ("37", "38"):
            raise ValueError(f"Invalid genome version: {genome_version}")
        vep_config = "file:///vep_data/vep-gcloud.json"
    else:
        # Caller-supplied config: the gencode version cannot be derived from it here.
        vep_config = vep_config_json_path
        mt = mt.annotate_globals(gencodeVersion="unknown")

    annotated_mt = hl.vep(mt, config=vep_config, name=name, block_size=block_size)

    logger.info("==> Done with VEP")
    return annotated_mt
def format_clinvar_variants(ds):
    """Filter, VEP-annotate and reshape ClinVar variants.

    Rows with exactly two alleles are kept, VEP is run, the raw ``vep`` field is replaced by
    ``sorted_transcript_consequences`` and the ClinVar / identifier fields are selected.

    :param ds: ClinVar variants exposing ``alleles``, ``locus``, ``rsid``, ``info.CLNSIG``
        and ``info.CLNREVSTAT``
    :return: the reshaped dataset
    """

    def _sorted_underscore_last(values):
        # Sort as if a leading underscore were a "z", then strip that leading underscore.
        ordered = hl.sorted(values, key=lambda s: s.replace("^_", "z"))
        return ordered.map(lambda s: s.replace("^_", ""))

    # There are some variants with only one entry in alleles, ignore them for now.
    # TODO: These could be displayed in the ClinVar track even though they will never match a gnomAD variant.
    ds = ds.filter(hl.len(ds.alleles) == 2)

    # When a cluster is started with hailctl dataproc start cluster_name --vep, the init script for the
    # selected version of VEP links the appropriate configuration file to /vep_data/vep-gcloud.json
    ds = hl.vep(ds, "file:///vep_data/vep-gcloud.json", name="vep", block_size=1000)

    # Keep only the sorted consequences; the full VEP struct is dropped.
    ds = ds.annotate(sorted_transcript_consequences=sorted_transcript_consequences_v3(ds.vep))
    ds = ds.drop("vep")

    ds = ds.select(
        clinical_significance=_sorted_underscore_last(ds.info.CLNSIG),
        clinvar_variation_id=ds.rsid,
        gold_stars=get_gold_stars(ds.info.CLNREVSTAT),
        review_status=_sorted_underscore_last(ds.info.CLNREVSTAT),
        sorted_transcript_consequences=ds.sorted_transcript_consequences,
    )

    locus = ds.locus
    return ds.annotate(
        chrom=normalized_contig(locus),
        variant_id=variant_id(locus, ds.alleles),
        xpos=x_position(locus),
    )
def make_clinvar_hail2(clinvar_vcf_path, clinvar_variants_table, clinvar_mt_out_path):
    """
    Import ClinVar vcf file, and turn it into a usable Hail2 mt

    The VCF rows are annotated (as ``va``) from the variants table, multi-allelics are split,
    the data is repartitioned to 100 partitions, VEP'd with the module-level ``vep_config``
    and written to ``clinvar_mt_out_path`` (overwriting any existing output).

    :param str clinvar_vcf_path: Example : "gs://gnomad-berylc/tx-annotation/hail2/clinvar_alleles_single.b37.vcf.bgz"
    :param str clinvar_variants_table: Example : "gs://gnomad-berylc/tx-annotation/hail2/clinvar_alleles_single.b37.variants_table.tsv"
    :param str clinvar_mt_out_path: "gs://gnomad-resources/clinvar/hail-0.2/clinvar_alleles.single.b37.hail2.vepped.mt"
    :return: split and VEP'd MT, read back from ``clinvar_mt_out_path``
    :rtype: MatrixTable
    """
    clinvar_mt = hl.import_vcf(clinvar_vcf_path)

    # Key the variants table by (locus, alleles) so it can be joined onto the MT rows.
    variants_table = hl.import_table(clinvar_variants_table, impute=True)
    variants_table = variants_table.annotate(
        v=hl.parse_variant(variants_table.v))
    variants_table = (variants_table.annotate(
        locus=variants_table.v.locus,
        alleles=variants_table.v.alleles).key_by('locus', 'alleles'))

    clinvar_mt = clinvar_mt.annotate_rows(
        va=variants_table[clinvar_mt.locus, clinvar_mt.alleles])

    clinvar_mt = split_multi_dynamic(clinvar_mt, left_aligned=False)
    clinvar_mt = clinvar_mt.repartition(100)
    clinvar_vep = hl.vep(clinvar_mt, vep_config)
    clinvar_vep.write(clinvar_mt_out_path, overwrite=True)

    # Read the written MT back, show its rows as a sanity check, and hand it to the caller
    # as the docstring promises (previously nothing was returned).
    t = hl.read_matrix_table(clinvar_mt_out_path)
    t.rows().show()
    return t
def prepare_clinvar_variants(vcf_path, reference_genome):
    """Import a ClinVar VCF, VEP it and select the fields used downstream.

    :param vcf_path: path handed to ``import_clinvar_vcf``
    :param reference_genome: reference genome handed to ``import_clinvar_vcf``
    :return: dataset with ClinVar fields, the ``vep`` annotation and ``chrom`` / ``variant_id`` / ``xpos``
    """

    def _sorted_underscore_last(values):
        # Sort as if a leading underscore were a "z", then strip that leading underscore.
        ordered = hl.sorted(values, key=lambda s: s.replace("^_", "z"))
        return ordered.map(lambda s: s.replace("^_", ""))

    ds = import_clinvar_vcf(vcf_path, reference_genome)

    # There are some variants with only one entry in alleles, ignore them for now.
    # These could be displayed in the ClinVar track even though they will never match a gnomAD variant.
    ds = ds.filter(hl.len(ds.alleles) == 2)

    ds = hl.vep(ds)

    info = ds.info
    ds = ds.select(
        clinical_significance=_sorted_underscore_last(info.CLNSIG),
        clinvar_variation_id=ds.rsid,
        gold_stars=get_gold_stars(info.CLNREVSTAT),
        review_status=_sorted_underscore_last(info.CLNREVSTAT),
        vep=ds.vep,
    )

    locus = ds.locus
    return ds.annotate(
        chrom=normalized_contig(locus.contig),
        variant_id=variant_id(locus, ds.alleles),
        xpos=x_position(locus),
    )
def run_vep(
        mt: hl.MatrixTable,
        genome_version: str,
        name: str = 'vep',
        block_size: int = 1000) -> hl.MatrixTable:
    """Annotate a MatrixTable with VEP, choosing the config from the genome build.

    The global ``gencodeVersion`` is set to "19" for build "37" and "25" for build "38".

    :param MatrixTable mt: MT to annotate with VEP
    :param str genome_version: "37" or "38"
    :param str name: Name for resulting row field
    :param int block_size: Number of rows to process per VEP invocation.
    :return: annotated MT
    :rtype: MatrixTable
    :raises ValueError: if ``genome_version`` is neither "37" nor "38"
    """
    if genome_version not in ("37", "38"):
        raise ValueError(f"Invalid genome version: {genome_version}")

    is_grch37 = genome_version == "37"

    mt = mt.annotate_globals(gencodeVersion="19" if is_grch37 else "25")

    # see gs://hail-common/vep/vep/homo_sapiens/85_GRCh38/info.txt
    config = (
        "hdfs:///user/hdfs/vep85-loftee-gcloud.json"
        if is_grch37
        else "gs://hail-common/vep/vep/vep95-GRCh38-loftee-gcloud.json"
    )

    annotated_mt = hl.vep(mt, config=config, name=name, block_size=block_size)

    logger.info("==> Done with VEP")
    return annotated_mt
def main(mt: str):
    """
    Run vep using main.py wrapper

    Reads the MatrixTable at path ``mt``, keeps rows with exactly two alleles whose
    alternate allele is not '*', annotates them with VEP and writes the result to
    ``output_path('vep105_GRCh38.mt')``.

    :param str mt: path of the MatrixTable to annotate
    """
    hl.init(default_reference='GRCh38')

    dataset = hl.read_matrix_table(mt)

    # filter to biallelic loci only
    dataset = dataset.filter_rows(hl.len(dataset.alleles) == 2)
    # drop rows whose alternate allele is the '*' placeholder
    dataset = dataset.filter_rows(dataset.alleles[1] != '*')

    annotated = hl.vep(dataset, config='file:///vep_data/vep-gcloud.json')
    annotated.write(output_path('vep105_GRCh38.mt'))
def make_exac_release_hail2(vcf_path, mt_out):
    """
    Import the ExAC release VCF, split multi-allelics, VEP it and write it out.

    Provenance note kept from the original author: this came from Konrad, who had already
    done this; the code was not actually run by the author of this module.

    :param str vcf_path: Example "gs://gnomad/raw/source/ExAC.r1.sites.vep.vcf.gz"
    :param str mt_out: Example: "gs://gnomad/raw/hail-0.2/vds/exac/exac.r1.sites.vep.vds" (should be .mt)
    :return: Writes out VEP'd Hail0.2 MatrixTable
    :rtype: None
    """
    imported = hl.import_vcf(vcf_path, force_bgz=True, min_partitions=1000)
    split_mt = split_multi_dynamic(imported)
    # vep_config is a name defined outside this function.
    vepped = hl.vep(split_mt, vep_config)
    vepped.write(mt_out)
def make_gnomad_release_hail2(vcf_path, mt_out):
    """
    Import, filter and VEP the existing "bootleg" gnomAD VCF (01.26.2018), writing a Hail 0.2 MatrixTable.

    After splitting multi-allelics, a row is kept only when its allele-specific filter
    status is "PASS" and the site has no entries in ``filters``.

    :param str vcf_path: Example: "gs://gnomad-public/release/2.0.2/vcf/exomes/gnomad.exomes.r2.0.2.sites.vcf.bgz"
    :param str mt_out: Example: "gs://gnomad-berylc/tx-annotation/hail2/gnomad.exomes.r2.0.2.sites.split.vep.030818.mt"
    :return: Writes out VEP'd Hail0.2 MatrixTable
    :rtype: None
    """
    imported = hl.import_vcf(vcf_path, min_partitions=8000)
    split_mt = split_multi_dynamic(imported)

    # a_index is 1-based, AS_FilterStatus is indexed from 0.
    allele_passes = split_mt.info.AS_FilterStatus[split_mt.a_index - 1] == "PASS"
    site_unfiltered = split_mt.filters.length() == 0
    split_mt = split_mt.annotate_rows(as_pass=allele_passes & site_unfiltered)

    passing_mt = split_mt.filter_rows(split_mt.as_pass)

    # vep_config is a name defined outside this function.
    vepped = hl.vep(passing_mt, vep_config)
    vepped.write(mt_out)
def vep_or_lookup_vep(ht, reference_vep_ht=None, reference=None, vep_config=None):
    """
    VEP a table, reusing annotations from a reference database where available.

    Variants found in the reference table take their ``vep`` annotation from it; only the
    remaining variants are run through ``hl.vep``. Both parts are unioned for the result.

    :param ht: Input Table
    :param reference_vep_ht: A reference database with VEP annotations (must be in top-level `vep`)
    :param reference: If reference_vep_ht is not specified, find a suitable one in reference
        (if None, grabs from hl.default_reference)
    :param vep_config: vep_config to pass to hl.vep (if None, a suitable one for `reference` is chosen)
    :return: VEPped Table
    :raises ValueError: if no reference table is given and `reference` is not GRCh37 / GRCh38
    """
    reference = hl.default_reference().name if reference is None else reference

    if reference_vep_ht is None:
        supported_refs = ("GRCh37", "GRCh38")
        if reference not in supported_refs:
            raise ValueError(
                f'vep_or_lookup_vep got {reference}. Expected one of {", ".join(supported_refs)}'
            )
        reference_vep_ht = hl.read_table(vep_context_ht_path(reference))

    # Look every variant up in the reference table; misses come back as missing `vep`.
    ht = ht.annotate(vep=reference_vep_ht[ht.key].vep)
    found_ht = ht.filter(hl.is_defined(ht.vep))
    missing_ht = ht.filter(hl.is_missing(ht.vep))

    if vep_config is None:
        vep_config = vep_config_path(reference)

    # Only the variants absent from the reference table are actually VEPed.
    return found_ht.union(hl.vep(missing_ht, vep_config))
import hail as hl

# Exemplar MatrixTables holding previously computed VEP output (without / with CSQ strings).
GOLD_STD = 'gs://hail-common/vep/vep/vep_examplars/vep_no_csq_35d9e30.mt/'
GOLD_STD_CSQ = 'gs://hail-common/vep/vep/vep_examplars/vep_csq_23673e70.mt/'

# Re-run VEP on the bare rows of each exemplar and require an identical result.
for path, csq in [(GOLD_STD, False), (GOLD_STD_CSQ, True)]:
    print(f"Checking 'hl.vep' replicates on '{path}'")
    expected = hl.read_matrix_table(path)
    actual = hl.vep(
        expected.select_rows(),
        'gs://hail-common/vep/vep/vep85-loftee-gcloud.json',
        csq=csq,
    )
    vep_result_agrees = actual._same(expected)
    # Report first, then fail hard on a mismatch.
    print('TEST PASSED' if vep_result_agrees else 'TEST FAILED')
    assert vep_result_agrees
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # perform sample and variant qc on remaining variants #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ print("sample QC...") vds = hl.sample_qc(vds) print("variant QC...") vds = hl.variant_qc(vds) #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # add VEP #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ vds = hl.vep(vds, "gs://ccdg-qc-multi/data/vep85-GRCh38-gcloud.json") #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # write output VDS #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ print("writing VDS...") vds.write(qced_vds_excl_file, overwrite=True) print("writing sample QC...") vds.cols().flatten().export(sample_qc_info_postqc_file)
# NOTE(review): the call these first arguments belong to starts above this excerpt.
    import_clinvar_xml,
    "/clinvar/clinvar.ht",
    {"clinvar_xml_path": pipeline.get_task("download_clinvar_xml")},
)

# Build the GRCh38 base table from the imported ClinVar XML.
pipeline.add_task(
    "prepare_clinvar_grch38_variants",
    prepare_clinvar_variants,
    "/clinvar/clinvar_grch38_base.ht",
    {"clinvar_path": pipeline.get_task("import_clinvar_xml")},
    {"reference_genome": "GRCh38"},
)

# Run VEP on the prepared table and drop the `vep_proc_id` field it adds.
pipeline.add_task(
    "vep_clinvar_grch38_variants",
    lambda path: hl.vep(hl.read_table(path)).drop("vep_proc_id"),
    "/clinvar/clinvar_grch38_vepped.ht",
    {"path": pipeline.get_task("prepare_clinvar_grch38_variants")},
)

# Annotate transcript consequences; transcripts and MANE Select transcripts come from genes_pipeline.
pipeline.add_task(
    "annotate_clinvar_grch38_transcript_consequences",
    annotate_transcript_consequences,
    "/clinvar/clinvar_grch38_annotated_1.ht",
    {
        "variants_path": pipeline.get_task("vep_clinvar_grch38_variants"),
        "transcripts_path": genes_pipeline.get_task("extract_grch38_transcripts"),
        "mane_transcripts_path": genes_pipeline.get_task("import_mane_select_transcripts"),
    },
)
def populate_clinvar():
    """Import 'clinvar.vcf.gz', VEP it and write the reshaped rows to 'clinvar.ht'.

    The ClinVar release date is stored in the ``version`` global. Rows are annotated with
    sorted transcript consequences, the worst ("main") transcript and gene ids, reduced to
    the export fields, ordered by ``variant_id`` and written without the ``locus`` /
    ``alleles`` fields.
    """
    clinvar_release_date = _parse_clinvar_release_date('clinvar.vcf.gz')
    mt = import_vcf('clinvar.vcf.gz',
                    "38",
                    drop_samples=True,
                    min_partitions=2000,
                    skip_invalid_loci=True)
    mt = mt.annotate_globals(version=clinvar_release_date)

    print("\n=== Running VEP ===")
    mt = hl.vep(mt, 'vep85-loftee-ruddle-b38.json', name="vep")

    print("\n=== Processing ===")
    mt = mt.annotate_rows(
        sortedTranscriptConsequences=
        get_expr_for_vep_sorted_transcript_consequences_array(vep_root=mt.vep))
    mt = mt.annotate_rows(
        main_transcript=
        get_expr_for_worst_transcript_consequence_annotations_struct(
            vep_sorted_transcript_consequences_root=mt.sortedTranscriptConsequences))
    mt = mt.annotate_rows(gene_ids=get_expr_for_vep_gene_ids_set(
        vep_transcript_consequences_root=mt.sortedTranscriptConsequences), )

    # De-duplicated review statuses, sorted with a leading "_" treated as "z", joined into one
    # string; used both as an output field and as the gold-star lookup key.
    review_status_str = hl.delimit(
        hl.sorted(hl.array(hl.set(mt.info.CLNREVSTAT)),
                  key=lambda s: s.replace("^_", "z")))

    mt = mt.select_rows(
        allele_id=mt.info.ALLELEID,
        alt=get_expr_for_alt_allele(mt),
        chrom=get_expr_for_contig(mt.locus),
        clinical_significance=hl.delimit(
            hl.sorted(hl.array(hl.set(mt.info.CLNSIG)),
                      key=lambda s: s.replace("^_", "z"))),
        domains=get_expr_for_vep_protein_domains_set(
            vep_transcript_consequences_root=mt.vep.transcript_consequences),
        gene_ids=mt.gene_ids,
        gene_id_to_consequence_json=get_expr_for_vep_gene_id_to_consequence_map(
            vep_sorted_transcript_consequences_root=mt.sortedTranscriptConsequences,
            gene_ids=mt.gene_ids),
        gold_stars=CLINVAR_GOLD_STARS_LOOKUP[review_status_str],
        # Flatten every field of main_transcript into a top-level main_transcript_<field> column.
        **{
            f"main_transcript_{field}": mt.main_transcript[field]
            for field in mt.main_transcript.dtype.fields
        },
        pos=get_expr_for_start_pos(mt),
        ref=get_expr_for_ref_allele(mt),
        review_status=review_status_str,
        transcript_consequence_terms=get_expr_for_vep_consequence_terms_set(
            vep_transcript_consequences_root=mt.sortedTranscriptConsequences),
        transcript_ids=get_expr_for_vep_transcript_ids_set(
            vep_transcript_consequences_root=mt.sortedTranscriptConsequences),
        transcript_id_to_consequence_json=
        get_expr_for_vep_transcript_id_to_consequence_map(
            vep_transcript_consequences_root=mt.sortedTranscriptConsequences),
        variant_id=get_expr_for_variant_id(mt),
        xpos=get_expr_for_xpos(mt.locus),
    )

    print("\n=== Summary ===")
    hl.summarize_variants(mt)

    # Drop key columns for export
    rows = mt.rows()
    rows = rows.order_by(rows.variant_id).drop("locus", "alleles")
    rows.write('clinvar.ht', overwrite=True)
'''
# NOTE(review): the argument-definition call these keyword arguments belong to starts above this excerpt.
    help="Elasticsearch block size to use when exporting",
    default=200,
    type=int)

args = p.parse_args()

# Use the explicit index name (lower-cased) when given, otherwise derive it from the genome version.
if args.index_name:
    index_name = args.index_name.lower()
else:
    index_name = "clinvar_grch{}".format(args.genome_version)

print("\n=== Downloading VCF ===")
mt = download_and_import_latest_clinvar_vcf(args.genome_version)
# Print the globals of the imported MatrixTable.
print(dict(mt.globals.value))

print("\n=== Running VEP ===")
mt = hl.vep(mt, "file:///vep/vep85-gcloud.json", name="vep", block_size=1000)

print("\n=== Processing ===")
# Each step below builds on the row field added by the previous one.
mt = mt.annotate_rows(sortedTranscriptConsequences=
                      get_expr_for_vep_sorted_transcript_consequences_array(
                          vep_root=mt.vep))
mt = mt.annotate_rows(
    main_transcript=
    get_expr_for_worst_transcript_consequence_annotations_struct(
        vep_sorted_transcript_consequences_root=mt.sortedTranscriptConsequences
    ))
mt = mt.annotate_rows(gene_ids=get_expr_for_vep_gene_ids_set(
    vep_transcript_consequences_root=mt.sortedTranscriptConsequences), )
# NOTE(review): the call these first arguments belong to starts above this excerpt.
    "/clinvar/clinvar.ht",
    {"clinvar_xml_path": pipeline.get_task("download_clinvar_xml")},
)

# Build the GRCh37 base table from the imported ClinVar XML.
pipeline.add_task(
    "prepare_clinvar_grch37_variants",
    prepare_clinvar_variants,
    "/clinvar/clinvar_grch37_base.ht",
    {"clinvar_path": pipeline.get_task("import_clinvar_xml")},
    {"reference_genome": "GRCh37"},
)

# Run VEP on the prepared table and drop the `vep_proc_id` field it adds.
pipeline.add_task(
    "vep_clinvar_grch37_variants",
    # tolerate_parse_error to ignore not a number error from "NaN" gene symbol
    lambda path: hl.vep(hl.read_table(path), tolerate_parse_error=True).drop("vep_proc_id"),
    "/clinvar/clinvar_grch37_vepped.ht",
    {"path": pipeline.get_task("prepare_clinvar_grch37_variants")},
)

# Annotate transcript consequences using the GRCh37 transcripts from genes_pipeline.
pipeline.add_task(
    "annotate_clinvar_grch37_transcript_consequences",
    annotate_transcript_consequences,
    "/clinvar/clinvar_grch37_annotated_1.ht",
    {
        "variants_path": pipeline.get_task("vep_clinvar_grch37_variants"),
        "transcripts_path": genes_pipeline.get_task("extract_grch37_transcripts"),
    },
)

# NOTE(review): the next task registration continues below this excerpt.
pipeline.add_task(
(snp2_con == "stop_retained_variant")):  # NOTE(review): tail of a condition inside a function that starts above this excerpt; nesting below is reconstructed
            return "gained_stop_loss"
        else:
            return ("Unchanged")
    elif mnv_con == "stop_retained_variant":  #this case, by definition one of the variant is stop_lost, and the other is stop_retained
        return ("Rescued stop loss")
    else:
        return ("Noncoding_or_else")


#read MNV
mnv = hl.read_table(sys.argv[1])

#annotate snv effects
# First pass: keyed by the second SNP's locus/alleles, VEP result stored as `snp2_vep`.
mnv = mnv.key_by("locus", "alleles")
mnv = hl.vep(mnv, vep_config, name="snp2_vep")
mnv = mnv.key_by()  #unkey first
mnv = mnv.rename({
    'locus': 'snp2_locus',
    'alleles': 'snp2_alleles',
    "prev_locus": "locus",
    "prev_alleles": "alleles"
})  #rename the snp1 locus and alleles as locus, allele
mnv = mnv.key_by('locus', 'alleles')  #and re-key
# Second pass: now keyed by the first SNP, VEP result stored as `snp1_vep`.
mnv = hl.vep(mnv, vep_config, name="snp1_vep")  #and vep

#annotate MNV effects, specified by the distance
# NOTE(review): sys.argv entries are strings, so comparing with the int 1 is never true;
# this branch looks unreachable as written -- confirm whether int(sys.argv[2]) was intended.
if sys.argv[2] == 1:
    t = mnv.filter(mnv.dist == 1)
    t = t.annotate(refs=t.alleles[0] + t.snp2_alleles[0],
                   alts=t.alleles[1] +
import hail as hl

# Exemplar MatrixTable whose rows are fed to VEP.
GNOMAD_CHR22_FIRST_1000 = "gs://hail-us-vep/vep_examplars/gnomad3_chr22_first_1000.mt"

for path, csq in [(GNOMAD_CHR22_FIRST_1000, False)]:
    print(f"Checking 'hl.vep' replicates on '{path}'")
    expected = hl.read_matrix_table(path)
    # Only the keys of the rows table are passed on; every other row field is dropped.
    bare_rows = expected.rows().select()
    actual = hl.vep(
        bare_rows,
        'gs://hail-us-vep/vep95-GRCh38-loftee-gcloud.json',
        csq=csq,
    )
    # Force evaluation so VEP actually runs; the comparison against `expected` is disabled.
    actual._force_count()
    # vep_result_agrees = actual._same(expected)
    # if vep_result_agrees:
    #     print('TEST PASSED')
    # else:
    #     print('TEST FAILED')
    # assert vep_result_agrees
import hail as hl

# Read in the MatrixTable
mt = hl.read_matrix_table('table.mt')

# Apply VEP
vep = hl.vep(mt, "vep85-loftee-ruddle-b38.json")

# Write the MatrixTable (you don't want to re-apply VEP every time, it takes ~1 hour)
vep.write('vep_matrixtable.mt')

# Continue from the table on disk: Hail evaluates lazily, so querying the unwritten `vep`
# expression again would re-run VEP.
vep = hl.read_matrix_table('vep_matrixtable.mt')

# Get SNVs. The `vep` row field only exists on the annotated table, not on the input `mt`,
# so the filter has to be applied to `vep`.
snvs = vep.filter_rows(vep.vep.variant_class == "SNV")

# Filter for specific consequence_terms
consequence_terms = snvs.vep.transcript_consequences.consequence_terms
snvs.filter_rows(
    consequence_terms.contains(["missense_variant"])
    | consequence_terms.contains(["splice_acceptor_variant"])
    | consequence_terms.contains(["splice_donor_variant"])
    | consequence_terms.contains(["splice_region_variant"])
    | consequence_terms.contains(["start_lost"])
    | consequence_terms.contains(["stop_gained"])
    | consequence_terms.contains(["stop_lost"])).show()

# Filter for rows that do NOT have those consequence_terms
# NOTE(review): these first statements index with `i`, so they presumably sit inside a loop whose
# header is above this excerpt; the original indentation is not recoverable here.
mt1 = matrix_tables[i]
next_mt = matrix_tables[i+1]

if i == 0:
    # if first, combine the first two MTs
    mt = mt1.union_cols(next_mt)
else:
    # Else combine the combined MT with the next MT
    mt = mt.union_cols(next_mt)

# Counting forces evaluation of the joined dataset.
logging.info('Joined count: ' + str(mt.count()))

logging.info('Splitting multiallelic variants')
mt_split = hl.split_multi_hts(mt)
logging.info('Split count: ' + str(mt_split.count()))

logging.info('VEP annotating dataset.')
mt_vep = hl.vep(mt_split, args.vep_config)

# Output name: explicit --out_file if given, otherwise the first input VCF name.
if args.out_file is None:
    out_name = vcf_files[0]
else:
    out_name = args.out_file

# Test runs get a "_test" suffix on the output name.
if args.test:
    out_name = out_name + "_test"

logging.info('Writing matrix table to bucket.')
mt_vep.write(os.path.join(args.data_dir, out_name))

logging.info('Successfully completed import and VEP annotation. Copying logs to bucket and shutting down in 10 min.')
h.copy_logs_output(log_dir, timestr=timestr, log_file=log_file, plot_dir=args.data_dir)
# NOTE(review): the call these keyword arguments belong to starts above this excerpt.
    AN=mt_split.info.AN,
    # Per-allele INFO arrays are indexed with a_index - 1 (a_index is 1-based after splitting).
    EAS_AF=mt_split.info.EAS_AF[mt_split.a_index - 1],
    EUR_AF=mt_split.info.EUR_AF[mt_split.a_index - 1],
    AFR_AF=mt_split.info.AFR_AF[mt_split.a_index - 1],
    AMR_AF=mt_split.info.AMR_AF[mt_split.a_index - 1],
    SAS_AF=mt_split.info.SAS_AF[mt_split.a_index - 1],
    # Variant type: single-base ref and alt -> 'SNP'; either allele matching '<CN*>' -> 'SV'; else 'INDEL'.
    VT=(hl.case().when((mt_split.alleles[0].length() == 1) &
                       (mt_split.alleles[1].length() == 1),
                       'SNP').when(
                           mt_split.alleles[0].matches('<CN*>') |
                           mt_split.alleles[1].matches('<CN*>'),
                           'SV').default('INDEL')),
    EX_TARGET=mt_split.info.EX_TARGET,
    MULTI_ALLELIC=mt_split.info.MULTI_ALLELIC,
    DP=mt_split.info.DP))

mt_split.describe()

# Drop the bookkeeping fields left over from splitting.
mt_split = mt_split.drop('old_locus', 'old_alleles', 'a_index')

# Attach per-sample metadata looked up by sample id.
mt_split = mt_split.annotate_cols(
    sex=ht_samples[mt_split.s].gender,
    super_population=ht_samples[mt_split.s].super_pop,
    population=ht_samples[mt_split.s].pop)

mt_split = hl.sample_qc(mt_split)
mt_split = hl.variant_qc(mt_split)
mt_split = hl.vep(mt_split, 'gs://hail-common/vep/vep/vep85-gcloud.json')

mt_split.describe()
mt_split.write(
    'gs://hail-datasets/hail-data/1000_genomes_phase3_chrY.GRCh37.mt',
    overwrite=True)
import hail as hl
hl.init()

from hail.plot import show
from pprint import pprint
hl.plot.output_notebook()

# Input MatrixTable; only its rows table is annotated.
MT = 'gs://raw_data_bipolar_dalio_w1_w2_hail_02/bipolar_wes_dalio_W1_W2/filterGT_GRCh38_6_multi.mt'

ht = hl.read_matrix_table(MT).rows()

# Annotate the rows with VEP and persist them, replacing any earlier output.
ht_vep = hl.vep(
    ht,
    "gs://hail-common/vep/vep/vep95-GRCh38-loftee-gcloud.json",
)
ht_vep.write(
    "gs://dalio_bipolar_w1_w2_hail_02/data/annotations/vep_annotate.ht",
    overwrite=True,
)
def vep_or_lookup_vep(ht, reference_vep_ht=None, reference=None, vep_config_path=None, vep_version=None):
    """
    VEP a table, or lookup variants in a reference database

    Variants present in the reference table take their ``vep`` annotation from it; the rest
    are run through ``hl.vep``. If no context Table exists for the requested VEP version,
    the whole input is VEPed instead.

    .. warning::
        If `reference_vep_ht` is supplied, no check is performed to confirm `reference_vep_ht` was
        generated with the same version of VEP / VEP configuration as the VEP referenced in `vep_config_path`.

    :param ht: Input Table
    :param reference_vep_ht: A reference database with VEP annotations (must be in top-level `vep`)
    :param reference: If reference_vep_ht is not specified, find a suitable one in reference (if None, grabs from hl.default_reference)
    :param vep_config_path: vep_config to pass to hl.vep (if None, a suitable one for `reference` is chosen)
    :param vep_version: Version of VEPed context Table to use (if None, the default `vep_context` resource will be used)
    :return: VEPed Table
    :raises ValueError: if no reference table is given and `reference` is not in POSSIBLE_REFS
    """
    reference = hl.default_reference().name if reference is None else reference
    vep_config_path = VEP_CONFIG_PATH if vep_config_path is None else vep_config_path

    # Help text and raw config contents of the VEP that would be run locally.
    vep_help = get_vep_help(vep_config_path)
    with hl.hadoop_open(vep_config_path) as config_handle:
        vep_config = config_handle.read()

    if reference_vep_ht is None:
        if reference not in POSSIBLE_REFS:
            raise ValueError(
                f'vep_or_lookup_vep got {reference}. Expected one of {", ".join(POSSIBLE_REFS)}'
            )

        vep_context = get_vep_context(reference)
        if vep_version is None:
            vep_version = vep_context.default_version

        if vep_version not in vep_context.versions:
            # Nothing to look up against: VEP every variant.
            logger.warning(
                f"No VEPed context Table available for genome build {reference} and VEP version {vep_version}, "
                f"all variants will be VEPed using the following VEP:\n{vep_help}"
            )
            return hl.vep(ht, vep_config_path)

        logger.info(
            f"Using VEPed context Table from genome build {reference} and VEP version {vep_version}"
        )
        reference_vep_ht = vep_context.versions[vep_version].ht()

        # The context Table records the VEP help text and config it was built with;
        # both must match the VEP that will annotate the remaining variants.
        context_help = hl.eval(reference_vep_ht.vep_help)
        context_config = hl.eval(reference_vep_ht.vep_config)

        assert vep_help == context_help, (
            f"The VEP context HT version does not match the version referenced in the VEP config file."
            f"\nVEP context:\n{context_help}\n\n VEP config:\n{vep_help}")

        assert vep_config == context_config, (
            f"The VEP context HT configuration does not match the configuration in {vep_config_path}."
            f"\nVEP context:\n{context_config}\n\n Current config:\n{vep_config}"
        )

    # Look every variant up in the reference table; misses come back with a missing `vep`.
    ht = ht.annotate(vep=reference_vep_ht[ht.key].vep)
    found_ht = ht.filter(hl.is_defined(ht.vep))
    missing_ht = ht.filter(hl.is_missing(ht.vep))

    # Only the variants absent from the reference table are actually VEPed.
    return found_ht.union(hl.vep(missing_ht, vep_config_path))
import hail as hl

# Exemplar MatrixTables holding previously computed VEP output (without / with CSQ strings).
GOLD_STD = 'gs://hail-common/vep/vep/vep_examplars/vep_no_csq_35d9e30.mt/'
GOLD_STD_CSQ = 'gs://hail-common/vep/vep/vep_examplars/vep_csq_23673e70.mt/'

# Re-run VEP on the bare rows of each exemplar and require an identical result.
for path, csq in [(GOLD_STD, False), (GOLD_STD_CSQ, True)]:
    print(f"Checking 'hl.vep' replicates on '{path}'")
    expected = hl.read_matrix_table(path)
    actual = hl.vep(
        expected.select_rows(),
        'gs://hail-common/vep/vep/vep85-loftee-gcloud.json',
        csq=csq,
    )
    vep_result_agrees = actual._same(expected)
    # Report first, then fail hard on a mismatch.
    print('TEST PASSED' if vep_result_agrees else 'TEST FAILED')
    assert vep_result_agrees
import hail as hl

# Exemplar MatrixTables (without / with CSQ strings) whose rows are fed to VEP.
GOLD_STD = 'gs://hail-us-vep/vep_examplars/vep_no_csq_4dc19bc1b.mt/'
GOLD_STD_CSQ = 'gs://hail-us-vep/vep_examplars/vep_csq_4dc19bc1b.mt/'

for path, csq in [(GOLD_STD, False), (GOLD_STD_CSQ, True)]:
    print(f"Checking 'hl.vep' replicates on '{path}'")
    expected = hl.read_matrix_table(path)
    # Only the keys of the rows table are passed on; every other row field is dropped.
    bare_rows = expected.rows().select()
    actual = hl.vep(
        bare_rows,
        'gs://hail-us-vep/vep85-loftee-gcloud-testing.json',
        csq=csq,
    )
    # Force evaluation so VEP actually runs; the comparison against `expected` is disabled.
    actual._force_count()
    # vep_result_agrees = actual._same(expected)
    # if vep_result_agrees:
    #     print('TEST PASSED')
    # else:
    #     print('TEST FAILED')
    # assert vep_result_agrees
import hail.expr.aggregators as agg from bokeh.plotting import figure, output_file import numpy as np hl.init(default_reference='GRCh38',min_block_size=6) #Annotations: gsutil -m cp /medpop/esp2/mzekavat/CHIP/CHUD/data/variant_annot/somVariants.txt.bgz gs://maryam_lipids/UKBB_CHIP/somVariants.txt.bgz kt = hl.import_table('gs://maryam_lipids/UKBB_CHIP/somVariants.txt.bgz', impute = True,min_partitions=2000,no_header = True) kt2 = kt.key_by(**hl.parse_variant(kt.f0) kt2.describe() kt2.write('gs://maryam_lipids/UKBB_CHIP/all_somatic_var_list.ht') kt2=hl.read_table('gs://maryam_lipids/UKBB_CHIP/all_somatic_var_list.ht').repartition(1000) kt2 = hl.vep(kt2, 'gs://hail-us-vep/vep95-GRCh38-loftee-gcloud.json') consequence_in_severity_order = [ "transcript_ablation" , "splice_acceptor_variant" , "splice_donor_variant" , "stop_gained" , "frameshift_variant" , "stop_lost" , "start_lost" , "transcript_amplification" , "inframe_insertion" , "inframe_deletion" , "missense_variant" , "protein_altering_variant" , "splice_region_variant"
& (r_de_novo_mt.father.DP >= args.min_parent_dp)  # NOTE(review): tail of a filter expression that starts above this excerpt; the `if` matching the `else:` below is also outside the view
                & (r_de_novo_mt.AF[0] > args.min_aaf)
                & (r_de_novo_mt.AF[0] < 1 - args.min_aaf))

    # One row per (variant, sample) entry.
    dnm_ht = dnm.key_cols_by().entries()
    # Keep only entries whose ref and alt alleles are both a single base.
    dnm_ht = dnm_ht.filter((dnm_ht.alleles[0].length() == 1) & (dnm_ht.alleles[1].length() == 1))
    # Limit the number of alt-supporting reads (AD[1]) seen in either parent.
    dnm_ht = dnm_ht.filter((dnm_ht.father.AD[1] <= args.max_parent_alleles) &
                           (dnm_ht.mother.AD[1] <= args.max_parent_alleles))

    if args.dnm_checkpoint is not None:
        dnm_ht = dnm_ht.checkpoint(args.dnm_checkpoint)
else:
    dnm_ht = hl.read_table(args.dnm_checkpoint)

logger.info("Annotating data using vep...")
# Run VEP unless a checkpoint path was given and already exists; otherwise reuse the checkpoint.
# NOTE(review): os.path.exists only inspects the local filesystem -- confirm the checkpoint is a local path.
if args.vep_checkpoint is None or not os.path.exists(args.vep_checkpoint):
    vep_dnm_ht = hl.vep(dnm_ht, config=args.vep_config, name="vep", csq=False)
    if args.vep_checkpoint is not None:
        vep_dnm_ht = vep_dnm_ht.checkpoint(args.vep_checkpoint, overwrite=True)
else:
    vep_dnm_ht = hl.read_table(args.vep_checkpoint)


def max_if_defined(expr):
    """Return hl.nanmax of `expr`, substituting the array [.0] when `expr` is missing."""
    return hl.nanmax(hl.if_else(
        hl.is_defined(expr),
        expr,
        [.0],
    ))


# NOTE(review): this annotate call continues below this excerpt.
vep_dnm_ht = vep_dnm_ht.annotate(
    gene_symbol=hl.set(vep_dnm_ht.vep.transcript_consequences.gene_symbol),
# NOTE(review): this excerpt starts inside an `if` branch whose header is above the view; the nesting below is reconstructed.
    denovos = hl.read_table(file_path)
else:
    # Compute and checkpoint the de novo calls unless the file is already there.
    if not file_exists(file_path):
        print(f"{data_label}: Generating {file_path}")
        denovos = compute_samocha_denovos(mt, pedigree)
        denovos = denovos.checkpoint(file_path, overwrite=True, _read_if_exists=not force)
    else:
        print(f"Reading table {file_path}")
        denovos = hl.read_table(file_path)

# VEP step: annotate and checkpoint, or reuse an existing VEP'd table.
file_path = os.path.join(BASE_DIR, f"{data_label}.{algo_label}_denovos.vep.ht")
if not file_exists(file_path):
    print(f"{data_label}: Generating {file_path}")
    denovos = denovos.key_by('locus', 'alleles')
    denovos = hl.vep(denovos, "file:///vep_data/vep-gcloud.json", name="vep", block_size=100)
    denovos = denovos.checkpoint(file_path, overwrite=True, _read_if_exists=not force)
else:
    print(f"Reading table {file_path}")
    denovos = hl.read_table(file_path)

file_path = os.path.join(BASE_DIR, f"{data_label}.{algo_label}_de_novos_table.tsv")
print(f"{data_label}: annotate and export {file_path}")

# Derive the consequence terms/categories, then take the first of each (or "other" when empty).
# NOTE(review): hl.cond is the older spelling of hl.if_else -- confirm against the pinned Hail version.
denovos = denovos.annotate(sorted_transcript_consequences = get_expr_for_vep_sorted_transcript_consequences_array(denovos.vep))
denovos = denovos.annotate(transcript_consequence_terms = get_expr_for_vep_consequence_terms(denovos.sorted_transcript_consequences))
denovos = denovos.annotate(transcript_consequence_categories = denovos.sorted_transcript_consequences.map(lambda c: c.category))
denovos = denovos.annotate(transcript_consequence = hl.cond(hl.len(denovos.transcript_consequence_terms) > 0, denovos.transcript_consequence_terms[0], "other"))
denovos = denovos.annotate(transcript_consequence_category = hl.cond(hl.len(denovos.transcript_consequence_categories) > 0, denovos.transcript_consequence_categories[0], "other"))
import hail as hl

# Load the genotype MatrixTable and annotate it with VEP.
an_gen = hl.read_matrix_table(
    'gs://phenotype_31063/hail/ukb31063.genotype.mt'
)
an_gen = hl.vep(
    an_gen,
    'gs://hail-common/vep/vep/vep85-loftee-gcloud.json',
)

# Print the schema of the annotated table.
an_gen.describe()

#write to a bucket
an_gen.write(
    'gs://rec_project/genotype_annotations.mt',
    overwrite=True,
)
# Now we need --requester-pays-allow-all --vep GRCh37 when starting the cluster
# hailctl dataproc start dp --vep GRCh37 --requester-pays-allow-all --region us-central1

# Ensure that the variant list is moved to the cloud
# gsutil cp ~/Repositories/BipEx/BSC_variant_data/BSC_BipEx_gene_variants.v2.txt gs://dalio_bipolar_w1_w2_hail_02/data/BSC_variants/
# gsutil cp ~/Repositories/BipEx/BSC_data/variant_data/BSC_BipEx_gene_variants.v3.20201018.txt gs://dalio_bipolar_w1_w2_hail_02/data/BSC_variants/
#   <- New file with updated definition based on SCHEMA thresholds, MAC <= 5 not in gnomAD top 10.

# Read in the file, and annotate using VEP...making sure that the reference file is build 37.
ht = hl.import_table(
    #'gs://dalio_bipolar_w1_w2_hail_02/data/BSC_variants/BSC_BipEx_gene_variants.v2.txt',
    'gs://dalio_bipolar_w1_w2_hail_02/data/BSC_variants/BSC_BipEx_gene_variants.v3.20201018.txt',
    impute=True,
    no_header=True,
)

# The file has no header: name the imported columns (f1 is handled separately below).
ht = ht.rename({'f0': 'gene', 'f2': 'position', 'f3': 'ref', 'f4': 'alt'})
# Remove the 'chr' text from the contig column.
ht = ht.annotate(chr=ht.f1.replace('chr', ''))

# Now create locus and alleles
ht = ht.annotate(
    locus=hl.locus(ht.chr, ht.position, reference_genome='GRCh37'),
    alleles=[ht.ref, ht.alt],
)

# Create a compound row key to allow us to annotate.
ht = ht.key_by(ht.locus, ht.alleles)

# Note that the location of the loftee file changed
# (check https://hail.is/docs/0.2/methods/genetics.html?highlight=vep#hail.methods.vep for most up to date location).
ht_vep = hl.vep(ht, "gs://hail-us-vep/vep85-loftee-gcloud.json")
ht_vep.write(
    "gs://dalio_bipolar_w1_w2_hail_02/data/annotations/bsc_variants_vep_annotate.ht",
    overwrite=True,
)