def populate_gtex():
    """Compute mean transcript TPM per GTEx tissue and export one document
    per transcript to the 'gtex_tissue_tpms_by_transcript' ES index.

    NOTE(review): this function appears to sit inside a triple-quoted
    (disabled) block — the closing ''' follows it and the opening one is
    above the visible chunk. Confirm before re-enabling.
    """
    # Sample attributes, keyed by sample ID; SMTSD holds the tissue name.
    meta_ht = hl.import_table(
        '/home/ml2529/gtex_data/GTEx_v7_Annotations_SampleAttributesDS.txt',
        delimiter='\t',
        key='SAMPID')
    # Transcript-by-sample TPM matrix for a single gene (small test input).
    mt = hl.import_matrix_table(
        '/home/ml2529/gtex_data/ENSG00000177732.tsv',
        row_key='transcript_id',
        row_fields={'transcript_id': hl.tstr, 'gene_id': hl.tstr},
        entry_type=hl.tfloat32)
    # Full GTEx v7 transcript TPM matrix (disabled; swap in for a full run):
    #mt = hl.import_matrix_table('/home/ml2529/gtex_data/GTEx_Analysis_2016-01-15_v7_RSEMv1.2.22_transcript_tpm.txt.bgz', row_key='transcript_id', row_fields={'transcript_id': hl.tstr, 'gene_id': hl.tstr},entry_type=hl.tfloat32)
    mt = mt.rename({'transcript_id': 'transcriptId', 'gene_id': 'geneId'})
    #pprint.pprint(meta_ht.describe())
    #pprint.pprint(gtex_mt.describe())
    # Attach each sample's tissue label by joining the metadata on col_id.
    mt = mt.annotate_cols(tissue=meta_ht[mt.col_id].SMTSD)
    #pprint.pprint(mt.describe())
    #pprint.pprint(mt.show(include_row_fields=True))
    # Count samples per tissue, ignoring samples with no tissue label.
    cut_dict = {
        'tissue': hl.agg.filter(hl.is_defined(mt.tissue),
                                hl.agg.counter(mt.tissue))
    }
    #pprint.pprint(cut_dict)
    cut_data = mt.aggregate_cols(hl.struct(**cut_dict))
    #pprint.pprint(cut_data.tissue)
    #call_stats = hl.agg.filter(mt.tissue == 'Lung', hl.agg.mean(mt.x))
    #pprint.pprint(call_stats)
    #mt = mt.annotate_rows(Lung=call_stats)
    #pprint.pprint(mt.show(include_row_fields=True))
    # One row field per tissue: the mean TPM over that tissue's samples.
    # NOTE(review): tissue_abbr (tissue name -> ES field name) must be
    # defined elsewhere in this module; it is not visible in this chunk.
    for x in sorted(cut_data['tissue'].keys()):
        #pprint.pprint(x)
        call_stats = hl.agg.filter(mt.tissue == x, hl.agg.mean(mt.x))
        mt = mt.transmute_rows(**{f"{tissue_abbr[x]}": call_stats})
    #pprint.pprint(mt.show(include_row_fields=True))
    ht = mt.rows()
    #ht.write('gtex_expression.ht',overwrite=True)
    export_ht_to_es(ht, index_name='gtex_tissue_tpms_by_transcript',
                    index_type='tissue_tpms')
'''
def populate_dnms():
    """Import both de novo mutation TSVs and export each one to the
    'autism_dnms' Elasticsearch index under the 'variant' type."""
    #ht = hl.import_table('/home/ubuntu/data/merged_dnms_cohortFreq12_perFamily_variant_id_042320.tsv',delimiter='\t',impute=True,key='variant_id')
    source_paths = (
        '/home/ubuntu/data/SSC_denovo_wgs_cshl_variant_id_gene_id_conf.tsv',
        '/home/ubuntu/data/merged_dnms_cohortFreq12_perFamily_variant_id_042320.tsv',
    )
    # Both files land in the same index, so process them identically in turn.
    for path in source_paths:
        dnm_ht = hl.import_table(path, delimiter='\t', impute=True)
        #pprint.pprint(dnm_ht.describe())
        #pprint.pprint(dnm_ht.show())
        export_ht_to_es(dnm_ht, index_name='autism_dnms', index_type='variant')
def populate_gtex():
    """Load precomputed GTEx v7 median transcript TPMs by tissue and export
    them to the 'gtex_tissue_tpms_by_transcript' Elasticsearch index.

    NOTE(review): a second populate_gtex is defined earlier in this file; if
    both are live, this later definition shadows the earlier one.
    """
    medians_path = (
        '/home/ml2529/gtex_data/'
        'GTEx_Analysis_2016-01-15_v7_RSEMv1.2.22_transcript_tpm_medians_by_tissue_wo_versions.tsv.gz'
    )
    # The file is block-gzipped despite the .gz suffix, hence force_bgz.
    ht = hl.import_table(
        medians_path,
        delimiter='\t',
        key='transcript_id',
        force_bgz=True,
        impute=True)
    # Per-sample matrix-table variants of this loader (disabled):
    #mt = hl.import_matrix_table('/home/ml2529/gtex_data/ENSG00000177732.tsv', row_key='transcript_id', row_fields={'transcript_id': hl.tstr, 'gene_id': hl.tstr},entry_type=hl.tfloat32)
    #mt = hl.import_matrix_table('/home/ml2529/gtex_data/GTEx_Analysis_2016-01-15_v7_RSEMv1.2.22_transcript_tpm.txt.bgz', row_key='transcript_id', row_fields={'transcript_id': hl.tstr, 'gene_id': hl.tstr},entry_type=hl.tfloat32)
    # Rename ID columns to the camelCase names the ES consumers expect.
    column_renames = {'transcript_id': 'transcriptId', 'gene_id': 'geneId'}
    ht = ht.rename(column_renames)
    #pprint.pprint(ht.describe())
    #pprint.pprint(ht.show())
    #ht.write('gtex_expression.ht',overwrite=True)
    export_ht_to_es(ht,
                    index_name='gtex_tissue_tpms_by_transcript',
                    index_type='tissue_tpms')
def populate_clinvar():
    """Export a previously written ClinVar Hail table ('clinvar.ht') to the
    'clinvar_grch37' Elasticsearch index.

    The VCF import / VEP annotation / row-shaping pipeline that produces
    'clinvar.ht' is present below but disabled (commented lines plus a
    triple-quoted block), so this function assumes 'clinvar.ht' already
    exists on disk from an earlier run.
    """
    # Disabled: re-import ClinVar VCF and stamp the release date as a global.
    #clinvar_release_date = _parse_clinvar_release_date('clinvar.vcf.gz')
    #mt = import_vcf('clinvar.vcf.gz', "37", drop_samples=True, min_partitions=2000, skip_invalid_loci=True)
    #mt = mt.annotate_globals(version=clinvar_release_date)
    # Disabled VEP-annotation pipeline; depends on the commented-out `mt`
    # import above and on project helpers (get_expr_for_* etc.).
    '''
    print("\n=== Running VEP ===")
    mt = hl.vep(mt, 'vep85-loftee-local.json', name="vep")

    print("\n=== Processing ===")
    mt = mt.annotate_rows(
        sortedTranscriptConsequences=get_expr_for_vep_sorted_transcript_consequences_array(vep_root=mt.vep)
    )
    mt = mt.annotate_rows(
        main_transcript=get_expr_for_worst_transcript_consequence_annotations_struct(
            vep_sorted_transcript_consequences_root=mt.sortedTranscriptConsequences
        )
    )
    mt = mt.annotate_rows(
        gene_ids=get_expr_for_vep_gene_ids_set(
            vep_transcript_consequences_root=mt.sortedTranscriptConsequences
        ),
    )
    review_status_str = hl.delimit(hl.sorted(hl.array(hl.set(mt.info.CLNREVSTAT)), key=lambda s: s.replace("^_", "z")))
    mt = mt.select_rows(
        allele_id=mt.info.ALLELEID,
        alt=get_expr_for_alt_allele(mt),
        chrom=get_expr_for_contig(mt.locus),
        clinical_significance=hl.delimit(hl.sorted(hl.array(hl.set(mt.info.CLNSIG)), key=lambda s: s.replace("^_", "z"))),
        domains=get_expr_for_vep_protein_domains_set(vep_transcript_consequences_root=mt.vep.transcript_consequences),
        gene_ids=mt.gene_ids,
        gene_id_to_consequence_json=get_expr_for_vep_gene_id_to_consequence_map(
            vep_sorted_transcript_consequences_root=mt.sortedTranscriptConsequences,
            gene_ids=mt.gene_ids
        ),
        gold_stars=CLINVAR_GOLD_STARS_LOOKUP[review_status_str],
        **{f"main_transcript_{field}": mt.main_transcript[field] for field in mt.main_transcript.dtype.fields},
        pos=get_expr_for_start_pos(mt),
        ref=get_expr_for_ref_allele(mt),
        review_status=review_status_str,
        transcript_consequence_terms=get_expr_for_vep_consequence_terms_set(
            vep_transcript_consequences_root=mt.sortedTranscriptConsequences
        ),
        transcript_ids=get_expr_for_vep_transcript_ids_set(
            vep_transcript_consequences_root=mt.sortedTranscriptConsequences
        ),
        transcript_id_to_consequence_json=get_expr_for_vep_transcript_id_to_consequence_map(
            vep_transcript_consequences_root=mt.sortedTranscriptConsequences
        ),
        variant_id=get_expr_for_variant_id(mt),
        xpos=get_expr_for_xpos(mt.locus),
    )

    #print("\n=== Summary ===")
    #hl.summarize_variants(mt)

    # Drop key columns for export
    rows = mt.rows()
    rows = rows.order_by(rows.variant_id).drop("locus", "alleles")
    rows.write('clinvar.ht',overwrite=True)
    '''
    # Live portion: read the precomputed table and push it to Elasticsearch.
    print("\n=== Exporting to Elasticsearch ===")
    rows = hl.read_table('clinvar.ht')
    export_ht_to_es(rows, index_name = 'clinvar_grch37',index_type = 'variant')
def run_pipeline(args):
    """Initialize Hail, read the table at args.input, and export it to the
    'ssc_genomes' Elasticsearch index."""
    log_path = './hail_annotation_pipeline.log'
    hl.init(log=log_path)
    genomes_ht = hl.read_table(args.input)
    export_ht_to_es(genomes_ht, index_name='ssc_genomes')
def run_pipeline(args):
    """Read the Hail table named by args.input and push it to the
    'gnomad_structural_variants' Elasticsearch index.

    NOTE(review): shadows an earlier run_pipeline definition in this file if
    both are live.
    """
    hl.init(log='./hail_annotation_pipeline.log')
    export_ht_to_es(
        hl.read_table(args.input),
        index_name='gnomad_structural_variants',
    )
def populate_constraint():
    """Import gnomAD 2.1.1 per-transcript constraint metrics and export them
    to the 'gnomad_constraint_2_1_1' Elasticsearch index.

    Reads the cleaned TSV, drops table globals (they break ES serialization),
    renames gene/transcript columns, and exports. Several alternative inputs
    and field-shaping steps are kept below, disabled.
    """
    #ds = hl.read_table('gnomad.v2.1.1.lof_metrics.by_transcript.ht')
    #ds = hl.import_table('constraint_final_standard.txt.bgz',delimiter='\t',key='transcript',impute=True)
    ds = hl.import_table('constraint_final_cleaned.txt.bgz',
                         delimiter='\t', key='transcript', impute=True)
    #ds = hl.import_table('missing_small.txt',delimiter='\t',key='transcript',impute=True)

    # The globals in the Hail table cause a serialization error during Elasticsearch export
    ds = ds.select_globals()

    # Fix: describe()/show() print to stdout and return None, so wrapping
    # them in pprint.pprint() only added a spurious "None" line.
    ds.describe()
    '''
    population_dict_fields = [
        "pop_no_lofs",
        "pop_obs_het_lof",
        "pop_obs_hom_lof",
        "pop_defined",
        "pop_p",
    ]
    populations = ["afr", "amr", "asj", "eas", "fin", "nfe", "oth", "sas"]
    # Convert dicts to structs for Elasticsearch export
    ds = ds.annotate(
        **{
            f: hl.struct(**{pop: ds[f][pop] for pop in populations})
            for f in population_dict_fields
        }
    )
    '''
    # Convert interval to struct for Elasticsearch export
    '''
    ds = ds.annotate(
        interval=hl.struct(
            chrom=ds.interval.start.contig,
            start=ds.interval.start.position,
            end=ds.interval.end.position,
        )
    )
    ds = ds.key_by()
    '''
    # Rename to the field names the ES consumers expect.
    ds = ds.transmute(gene_name=ds.gene, transcript_id=ds.transcript)
    #ds.write(args.output_url)
    '''
    ds = ds.select('exp_lof','exp_mis','exp_syn','obs_lof','obs_mis','obs_syn',
        'oe_lof','oe_lof_lower','oe_mis','oe_mis_lower','oe_mis_upper',
        'oe_syn','oe_syn_lower','oe_syn_upper',
        'lof_z','mis_z','syn_z',
        'pLI','pNull','pRec')
    '''
    '''
    ds = ds.select('exp_lof','exp_mis','exp_syn','obs_lof','obs_mis','obs_syn',
        'lof_z','mis_z','syn_z',
        'pLI','pNull','pRec')
    '''
    ds.describe()
    ds.show()
    export_ht_to_es(ds, index_name='gnomad_constraint_2_1_1',
                    index_type='constraint')