def read_gwas_table_with_varlist(gwas_table_tsv, varlist, type_dic, checkpoint_path, gwas_ht=None, no_return=False):
    """Load GWAS summary statistics and restrict them to a variant list.

    ``varlist`` may be a single file or a list of files (passed straight to
    ``hl.import_table``). ``type_dic`` supplies column types for
    ``gwas_table_tsv`` and is only used when reading from the TSV.

    When ``gwas_ht`` is given, the beta/p-value for the phenotype named by
    ``gwas_table_tsv`` is pulled out of its nested arrays instead.

    Returns the checkpointed table unless ``no_return`` is set, in which case
    the filtered table is only written to ``checkpoint_path``.
    """
    reading_from_tsv = gwas_ht is None

    if reading_from_tsv:
        ht = hl.import_table(gwas_table_tsv, key=['rsid'], types=type_dic)
    else:
        # Locate the phenotype inside the nested `phenotypes` list, then pull
        # the matching beta / p-value entries out of the parallel arrays.
        phenotypes = gwas_ht['phenotypes'].collect()[0]
        i, j = get_index_in_nested_list(phenotypes, gwas_table_tsv)
        ht = gwas_ht.annotate(beta=gwas_ht['beta'][i][j], pval=gwas_ht['p_value'][i][j])
        ht = ht.select('beta', 'pval', 'locus', 'alleles')

    # Keep only variants present in the clump/variant list (keyed on f0 = rsid).
    keep_variants = hl.import_table(varlist, key=['f0'], no_header=True)
    ht = ht.filter(hl.is_defined(keep_variants[ht.rsid]))

    if reading_from_tsv:
        # The TSV path still needs locus/alleles parsed from the `variant` column.
        ht = ht.annotate(**hl.parse_variant(ht.variant))
        ht = ht.key_by(ht.locus, ht.alleles)

    if no_return is False:
        return ht.checkpoint(checkpoint_path, overwrite=True)
    ht.write(checkpoint_path, overwrite=True)
def make_clinvar_hail2(clinvar_vcf_path, clinvar_variants_table, clinvar_mt_out_path):
    """
    Import ClinVar vcf file, and turn it into a usable Hail2 mt

    :param str clinvar_vcf_path: Example : "gs://gnomad-berylc/tx-annotation/hail2/clinvar_alleles_single.b37.vcf.bgz"
    :param str clinvar_variants_table: Example : "gs://gnomad-berylc/tx-annotation/hail2/clinvar_alleles_single.b37.variants_table.tsv"
    :param str clinvar_mt_out_path: "gs://gnomad-resources/clinvar/hail-0.2/clinvar_alleles.single.b37.hail2.vepped.mt"
    :return: split and VEP'd MT
    :rtype: MatrixTable
    """
    clinvar_mt = hl.import_vcf(clinvar_vcf_path)
    # The variants table carries per-variant ClinVar annotations keyed by a
    # "chr:pos:ref:alt"-style string in column `v`; parse it into locus/alleles
    # so it can be joined onto the VCF rows.
    variants_table = hl.import_table(clinvar_variants_table, impute=True)
    variants_table = variants_table.annotate(
        v=hl.parse_variant(variants_table.v))
    variants_table = (variants_table.annotate(
        locus=variants_table.v.locus,
        alleles=variants_table.v.alleles).key_by('locus', 'alleles'))
    # Attach the ClinVar annotations as row field `va`.
    clinvar_mt = clinvar_mt.annotate_rows(
        va=variants_table[clinvar_mt.locus, clinvar_mt.alleles])
    # left_aligned=False: ClinVar alleles are not guaranteed to be left-aligned.
    clinvar_mt = split_multi_dynamic(clinvar_mt, left_aligned=False)
    # Repartition before VEP so annotation work is spread evenly.
    clinvar_mt = clinvar_mt.repartition(100)
    clinvar_vep = hl.vep(clinvar_mt, vep_config)
    clinvar_vep.write(clinvar_mt_out_path, overwrite=True)
    # Read back and show a few rows as a quick sanity check of the write.
    t = hl.read_matrix_table(clinvar_mt_out_path)
    t.rows().show()
def create_rf_2_0_2_rank(data_type: str, beta: bool) -> None:
    """
    Creates a rank file for 2.0.2 RF and writes it to its correct location.

    :param str data_type: One of 'exomes' or 'genomes'
    :param bool beta: If set, then creates the table for the "beta" 2.0.2 RF with QD / max(p(AB))
    :return: Nothing
    :rtype: None
    """
    logger.info(
        f"Creating rank file for {data_type} RF 2.0.2{'beta' if beta else ''}")

    # Intermediate table path; the expensive import/annotate step is skipped
    # when it already exists.
    tmp_ht_path = f'gs://gnomad-tmp/gnomad_rf_2_0_2_{data_type}_{str(beta)}_tmp.ht'

    if not hl.hadoop_exists(tmp_ht_path):
        ht = hl.import_table(
            get_2_0_2_rf_path(data_type, beta),
            types={'chrom': hl.tstr},
            impute=True,
            min_partitions=1000)

        # Two input layouts exist: explicit chrom/pos/ref/alt columns, or a
        # single variant string column `v`.
        if 'chrom' in ht.row:
            ht = ht.transmute(
                locus=hl.locus(ht.chrom, ht.pos),
                alleles=[ht.ref, ht.alt])
        else:
            ht = ht.transmute(
                v=hl.parse_variant(ht.v),
                rfprob=ht.rf_rpob_tp  # Yes, this is awful
            )
            ht = ht.transmute(locus=ht.v.locus, alleles=ht.v.alleles)
        ht = ht.key_by('locus', 'alleles')

        gnomad_ht = get_gnomad_annotations(data_type)
        ht = ht.annotate(**gnomad_ht[ht.key], score=ht.rfprob)
        ht.write(tmp_ht_path)

    ht = hl.read_table(tmp_ht_path)

    # Rank by 1 - score (higher RF probability == better rank), with subranks
    # over the usual gnomAD strata.
    ht = add_rank(
        ht,
        score_expr=1 - ht.score,
        subrank_expr={
            'singleton_rank': ht.singleton,
            'biallelic_rank': ~ht.was_split,
            'biallelic_singleton_rank': ~ht.was_split & ht.singleton,
            'adj_rank': ht.ac > 0,
            'adj_biallelic_rank': ~ht.was_split & (ht.ac > 0),
            'adj_singleton_rank': ht.singleton & (ht.ac > 0),
            'adj_biallelic_singleton_rank': ~ht.was_split & ht.singleton & (ht.ac > 0)
        })
    ht.write(
        score_ranking_path(data_type, 'rf_2.0.2{}'.format('_beta' if beta else '')),
        overwrite=True)
def test_variant_qc(self):
    # Fixture: two variants across four samples — a biallelic SNP (1:1 A>T,
    # one missing genotype) and a multi-allelic site (1:2 A>T,C, fully called).
    data = [
        {'v': '1:1:A:T', 's': '1', 'GT': hl.Call([0, 0]), 'GQ': 10, 'DP': 0},
        {'v': '1:1:A:T', 's': '2', 'GT': hl.Call([1, 1]), 'GQ': 10, 'DP': 5},
        {'v': '1:1:A:T', 's': '3', 'GT': hl.Call([0, 1]), 'GQ': 11, 'DP': 100},
        {'v': '1:1:A:T', 's': '4', 'GT': None, 'GQ': None, 'DP': 100},
        {'v': '1:2:A:T,C', 's': '1', 'GT': hl.Call([1, 2]), 'GQ': 10, 'DP': 5},
        {'v': '1:2:A:T,C', 's': '2', 'GT': hl.Call([2, 2]), 'GQ': 10, 'DP': 5},
        {'v': '1:2:A:T,C', 's': '3', 'GT': hl.Call([0, 1]), 'GQ': 10, 'DP': 5},
        {'v': '1:2:A:T,C', 's': '4', 'GT': hl.Call([1, 1]), 'GQ': 10, 'DP': 5},
    ]

    # Build a (locus, alleles) x sample MatrixTable and run variant_qc,
    # storing results in row field `vqc`.
    ht = hl.Table.parallelize(data, hl.dtype('struct{v: str, s: str, GT: call, GQ: int, DP: int}'))
    ht = ht.transmute(**hl.parse_variant(ht.v))
    mt = ht.to_matrix_table(['locus', 'alleles'], ['s'])
    mt = hl.variant_qc(mt, 'vqc')
    r = mt.rows().collect()

    # --- variant 1 (1:1 A>T): allele counts / call statistics ---
    self.assertEqual(r[0].vqc.AF, [0.5, 0.5])
    self.assertEqual(r[0].vqc.AC, [3, 3])
    self.assertEqual(r[0].vqc.AN, 6)
    self.assertEqual(r[0].vqc.homozygote_count, [1, 1])
    self.assertEqual(r[0].vqc.n_called, 3)
    self.assertEqual(r[0].vqc.n_not_called, 1)
    self.assertEqual(r[0].vqc.call_rate, 0.75)
    self.assertEqual(r[0].vqc.n_het, 1)
    self.assertEqual(r[0].vqc.n_non_ref, 2)
    # Hardy-Weinberg statistics for the biallelic site.
    self.assertEqual(r[0].vqc.het_freq_hwe, 0.6)
    self.assertEqual(r[0].vqc.p_value_hwe, 0.7)
    # DP/GQ summary statistics over the three called genotypes (DP includes
    # the uncalled sample's entry as well: 0, 5, 100, 100).
    self.assertEqual(r[0].vqc.dp_stats.min, 0)
    self.assertEqual(r[0].vqc.dp_stats.max, 100)
    self.assertEqual(r[0].vqc.dp_stats.mean, 51.25)
    self.assertAlmostEqual(r[0].vqc.dp_stats.stdev, 48.782040752719645)
    self.assertEqual(r[0].vqc.gq_stats.min, 10)
    self.assertEqual(r[0].vqc.gq_stats.max, 11)
    self.assertAlmostEqual(r[0].vqc.gq_stats.mean, 10.333333333333334)
    self.assertAlmostEqual(r[0].vqc.gq_stats.stdev, 0.47140452079103168)

    # --- variant 2 (1:2 A>T,C): multi-allelic, all four samples called ---
    self.assertEqual(r[1].vqc.AF, [0.125, 0.5, 0.375])
    self.assertEqual(r[1].vqc.AC, [1, 4, 3])
    self.assertEqual(r[1].vqc.AN, 8)
    self.assertEqual(r[1].vqc.homozygote_count, [0, 1, 1])
    self.assertEqual(r[1].vqc.n_called, 4)
    self.assertEqual(r[1].vqc.n_not_called, 0)
    self.assertEqual(r[1].vqc.call_rate, 1.0)
    self.assertEqual(r[1].vqc.n_het, 2)
    self.assertEqual(r[1].vqc.n_non_ref, 4)
    # HWE statistics are missing for multi-allelic variants.
    self.assertEqual(r[1].vqc.p_value_hwe, None)
    self.assertEqual(r[1].vqc.het_freq_hwe, None)
    # All entries have DP=5, GQ=10, so the stats are constant.
    self.assertEqual(r[1].vqc.dp_stats.min, 5)
    self.assertEqual(r[1].vqc.dp_stats.max, 5)
    self.assertEqual(r[1].vqc.dp_stats.mean, 5)
    self.assertEqual(r[1].vqc.dp_stats.stdev, 0.0)
    self.assertEqual(r[1].vqc.gq_stats.min, 10)
    self.assertEqual(r[1].vqc.gq_stats.max, 10)
    self.assertEqual(r[1].vqc.gq_stats.mean, 10)
    self.assertEqual(r[1].vqc.gq_stats.stdev, 0)
def _write_tmp_mt():
    """Subset UKB imputed dosages to the HapMap3 variant list and persist the result.

    Part 1 of the pipeline: about an 1 hr with 30 workers (possibly starting
    with 10 then increasing to 30 if progress stalls).
    """
    # HapMap3 variant list; force=True because the file is gzipped (not bgz).
    # Download here: https://github.com/nikbaya/split/blob/master/hapmap3_variants.tsv.gz
    variants = hl.import_table(
        'gs://nbaya/hapmap3_variants.tsv.gz',
        force=True
    )
    variants = variants.key_by(**hl.parse_variant(variants.v))
    # 'all' = autosomes only.
    mt = get_ukb_imputed_data(
        'all', variant_list=variants, entry_fields=('dosage', ))
    # print(mt.count())  # (1089172, 487409)
    # Fix: the original called mt.write(tmp_mt_path, _read_if_exists=True),
    # but `_read_if_exists` is a checkpoint() keyword, not a write() keyword —
    # write() would raise TypeError. checkpoint() preserves the evident
    # intent: write unless the output already exists.
    mt.checkpoint(tmp_mt_path, _read_if_exists=True)
def get_ldsim_dataset(n_variants=16, n_samples=4, n_contigs=2, seed=None):
    """Build a small random genotype MatrixTable for LD simulation tests.

    Rows are keyed by (locus, alleles) parsed from synthetic
    '<contig>:<pos>:A:C' variant strings; columns are zero-padded sample ids.
    Every row carries a constant 0.1 cM position. Genotypes are random
    diploid calls drawn from the given seed.
    """
    rng = np.random.RandomState(seed)
    # Nesting order (variant, sample, contig) matters: it fixes the sequence
    # of RNG draws and therefore the generated genotypes for a given seed.
    records = [
        {
            'v': f'{contig + 1}:{variant + 1}:A:C',
            's': f's{sample + 1:09d}',
            'cm': .1,
            'GT': hl.Call([rng.randint(0, 2), rng.randint(0, 2)])
        }
        for variant in range(n_variants)
        for sample in range(n_samples)
        for contig in range(n_contigs)
    ]
    ht = hl.Table.parallelize(
        records, hl.dtype('struct{v: str, s: str, cm: float64, GT: call}'))
    ht = ht.transmute(**hl.parse_variant(ht.v))
    mt = ht.to_matrix_table(row_key=['locus', 'alleles'],
                            col_key=['s'],
                            row_fields=['cm'])
    return add_default_plink_fields(mt)
def test_sample_qc(self):
    # Fixture: one sample across six variants covering the QC categories —
    # hom-ref, haploid alt call, hom-var multi-allelic, het SNP,
    # hom-var insertion, and a missing genotype.
    data = [
        {'v': '1:1:A:T', 's': '1', 'GT': hl.Call([0, 0]), 'GQ': 10, 'DP': 0},
        {'v': '1:2:A:T,C', 's': '1', 'GT': hl.Call([1]), 'GQ': 15, 'DP': 5},
        {'v': '1:3:A:G,C', 's': '1', 'GT': hl.Call([2, 2]), 'GQ': 10, 'DP': 4},
        {'v': '1:4:G:A', 's': '1', 'GT': hl.Call([0, 1]), 'GQ': None, 'DP': 5},
        {'v': '1:5:C:CG', 's': '1', 'GT': hl.Call([1, 1]), 'GQ': 20, 'DP': 3},
        {'v': '1:6:C:A', 's': '1', 'GT': None, 'GQ': 0, 'DP': None},
    ]

    # Build a (locus, alleles) x sample MatrixTable and run sample_qc,
    # storing results in column field `sqc`.
    ht = hl.Table.parallelize(data, hl.dtype('struct{v: str, s: str, GT: call, GQ: int, DP: int}'))
    ht = ht.transmute(**hl.parse_variant(ht.v))
    mt = ht.to_matrix_table(['locus', 'alleles'], ['s'])
    mt = hl.sample_qc(mt, 'sqc')
    r = mt.cols().select('sqc').collect()

    # GQ/DP summary statistics across all entries.
    self.assertAlmostEqual(r[0].sqc.gq_stats.mean, 11)
    self.assertAlmostEqual(r[0].sqc.gq_stats.stdev, 6.6332495807)
    self.assertAlmostEqual(r[0].sqc.gq_stats.min, 0)
    self.assertAlmostEqual(r[0].sqc.gq_stats.max, 20)
    self.assertAlmostEqual(r[0].sqc.dp_stats.mean, 3.399999999)
    self.assertAlmostEqual(r[0].sqc.dp_stats.stdev, 1.8547236990)
    self.assertAlmostEqual(r[0].sqc.dp_stats.min, 0)
    self.assertAlmostEqual(r[0].sqc.dp_stats.max, 5)
    # Call counts: 5 of 6 genotypes are present.
    self.assertAlmostEqual(r[0].sqc.call_rate, 0.8333333333)
    self.assertEqual(r[0].sqc.n_called, 5)
    self.assertEqual(r[0].sqc.n_not_called, 1)
    self.assertEqual(r[0].sqc.n_hom_ref, 1)
    self.assertEqual(r[0].sqc.n_het, 1)
    self.assertEqual(r[0].sqc.n_hom_var, 3)
    # Allele-class counts (insertions/deletions/ti/tv/star).
    self.assertEqual(r[0].sqc.n_insertion, 2)
    self.assertEqual(r[0].sqc.n_deletion, 0)
    self.assertEqual(r[0].sqc.n_singleton, 3)
    self.assertEqual(r[0].sqc.n_transition, 1)
    self.assertEqual(r[0].sqc.n_transversion, 3)
    self.assertEqual(r[0].sqc.n_star, 0)
    self.assertEqual(r[0].sqc.n_non_ref, 4)
    # Derived ratios; r_insertion_deletion is missing (no deletions observed).
    self.assertAlmostEqual(r[0].sqc.r_ti_tv, 0.333333333)
    self.assertAlmostEqual(r[0].sqc.r_het_hom_var, 0.3333333333)
    self.assertAlmostEqual(r[0].sqc.r_insertion_deletion, None)
def get_plink_sim_dataset(n_variants=16, n_samples=4, n_contigs=2, seed=0):
    """Build a small random genotype MatrixTable for PLINK round-trip tests.

    Variants are distributed over contigs by `dividx`; rows are keyed by
    (locus, alleles) parsed from '<contig>:<pos>:A:C' strings, columns by
    zero-padded sample ids, and every row carries a constant 0.1 cM position.
    """
    rng = np.random.RandomState(seed)

    # Map each variant index to its contig; dividx must return one contig
    # per variant.
    contig_index = dividx(n_variants, n_contigs)
    assert contig_index.ndim == 1
    assert contig_index.size == n_variants

    records = []
    # Loop order (variant outer, sample inner) fixes the RNG draw sequence,
    # so genotypes are reproducible for a given seed.
    for v, c in enumerate(contig_index):
        for s in range(n_samples):
            records.append({
                "v": f"{c+1}:{v+1}:A:C",
                "s": f"S{s+1:07d}",
                "cm": 0.1,
                "GT": hl.Call([rng.randint(0, 2), rng.randint(0, 2)]),
            })

    ht = hl.Table.parallelize(
        records, hl.dtype("struct{v: str, s: str, cm: float64, GT: call}"))
    ht = ht.transmute(**hl.parse_variant(ht.v))
    mt = ht.to_matrix_table(row_key=["locus", "alleles"],
                            col_key=["s"],
                            row_fields=["cm"])
    return add_default_plink_fields(mt)
def preprocess1(variant_set):
    """Create and write the variants table for the given variant set.

    :param str variant_set: 'hm3' (HapMap3) or 'qc_pos' (UKBB GWAS QC variants)
    :raises ValueError: if variant_set is not one of the supported names
    :return: Nothing; writes gs://nbaya/split/<variant_set>_variants.ht
    """
    print('\n##################')
    print('Starting Pre-processing 1: Creating variants table (variant_set: ' +
          variant_set + ')')
    print('Time: {:%H:%M:%S (%Y-%b-%d)}'.format(datetime.datetime.now()))
    print('\n##################')

    if variant_set == 'hm3':
        variants = hl.import_table('gs://nbaya/split/hapmap3_variants.tsv')
    elif variant_set == 'qc_pos':
        variants = hl.import_table(
            'gs://ukb31063-mega-gwas/qc/ukb31063.gwas_variants.autosomes.tsv')
    else:
        # Fix: previously an unknown variant_set fell through and caused a
        # confusing NameError on `variants` below; fail fast instead.
        raise ValueError(
            f"variant_set must be 'hm3' or 'qc_pos', got {variant_set!r}")

    # Parse the `v` variant-string column into locus/alleles and key on them.
    variants = variants.annotate(**hl.parse_variant(variants.v))
    variants = variants.key_by('locus', 'alleles')
    variants.write('gs://nbaya/split/' + variant_set + '_variants.ht')

    print('\n##################')
    print(
        'Finished Pre-processing 1: Creating variants table using variant_set: '
        + variant_set)
    print('Time: {:%H:%M:%S (%Y-%b-%d)}'.format(datetime.datetime.now()))
    print('\n##################')
hl.init(log='/hail.log', min_block_size=2048) #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # define files #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # input vds_file = 'gs://ccdg-qc-multi/vds/raw/hail2_allchr.vds' onep_file = 'gs://ccdg-qc-multi/out/onep_variants_table.tsv' # output vds_onep_file = 'gs://ccdg-qc-multi/vds/raw/hail2_onep.vds' #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # create subset #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ vds = hl.read_matrix_table(vds_file) onep = hl.import_table(onep_file, no_header=True).key_by('f0') onep2 = onep.transmute(**hl.parse_variant( onep.f0, reference_genome=hl.genetics.GenomeReference.GRCh38())).key_by( 'locus', 'alleles') vds = vds.filter_rows(hl.is_defined(onep2[vds.locus, vds.alleles]), keep=True) vds.write(vds_onep_file, overwrite=True) # print runtime stop = timeit.default_timer() print("runtime: " + str(stop - start) + " seconds")
import hail as hl from pprint import pprint from bokeh.io import output_notebook,show,save from bokeh.layouts import gridplot from bokeh.models import Span import hail.expr.aggregators as agg from bokeh.plotting import figure, output_file import numpy as np hl.init(default_reference='GRCh38',min_block_size=6) #Annotations: gsutil -m cp /medpop/esp2/mzekavat/CHIP/CHUD/data/variant_annot/somVariants.txt.bgz gs://maryam_lipids/UKBB_CHIP/somVariants.txt.bgz kt = hl.import_table('gs://maryam_lipids/UKBB_CHIP/somVariants.txt.bgz', impute = True,min_partitions=2000,no_header = True) kt2 = kt.key_by(**hl.parse_variant(kt.f0) kt2.describe() kt2.write('gs://maryam_lipids/UKBB_CHIP/all_somatic_var_list.ht') kt2=hl.read_table('gs://maryam_lipids/UKBB_CHIP/all_somatic_var_list.ht').repartition(1000) kt2 = hl.vep(kt2, 'gs://hail-us-vep/vep95-GRCh38-loftee-gcloud.json') consequence_in_severity_order = [ "transcript_ablation" , "splice_acceptor_variant" , "splice_donor_variant" , "stop_gained" , "frameshift_variant" , "stop_lost" , "start_lost" , "transcript_amplification"
def main(args):
    # Simulated phenotype + GWAS pipeline for the BBJ/UKB spike-slab
    # experiment on chr22. Three stages, each gated by a CLI flag.
    betas = ['beta_01', 'beta_1', 'beta_10', 'beta_100']

    # Per-variant simulated causal effects, keyed by locus/alleles parsed
    # from the `v` variant-string column.
    spike_slab = hl.import_table(
        'gs://armartin/mama/spike_slab/BBJ_UKB_hm3.chr22.cm.beta.txt',
        impute=True)
    spike_slab = spike_slab.key_by(**hl.parse_variant(spike_slab.v))

    if args.compute_true_phenotypes:
        # get the white british subset
        eur = hl.import_table(
            'gs://phenotype_31063/ukb31063.gwas_covariates.both_sexes.tsv'
        ).key_by('s')

        # read in imputed data, subset to chr22
        mt = hl.read_matrix_table(
            'gs://phenotype_31063/hail/imputed/ukb31063.dosage.autosomes.mt')
        mt = hl.filter_intervals(mt, [hl.parse_locus_interval('22')])

        # annotate and filter imputed data to all sites with causal effects
        mt = mt.annotate_rows(ss=spike_slab[mt.row_key])
        mt = mt.filter_rows(hl.is_defined(mt.ss))

        # compute true PRS (i.e. phenotypes): sum of beta * dosage per sample,
        # one phenotype per effect-size scale in `betas`
        annot_expr = {i: hl.agg.sum(mt.ss[i] * mt.dosage) for i in betas}

        # write out phenos for white British unrelated subset
        mt = mt.annotate_cols(**annot_expr)
        mt = mt.filter_cols(hl.is_defined(eur[mt.s]))
        mt.cols().write(
            'gs://armartin/mama/spike_slab/BBJ_UKB_hm3.chr22.cm.beta.true_PRS.ht',
            stage_locally=True, overwrite=True)

    if args.run_gwas:
        # read back in PRS (now true phenotypes)
        phenos = hl.read_table(
            'gs://armartin/mama/spike_slab/BBJ_UKB_hm3.chr22.cm.beta.true_PRS.ht'
        ).key_by('s')
        phenos.show()
        covariates = hl.import_table(
            'gs://phenotype_31063/ukb31063.gwas_covariates.both_sexes.tsv',
            impute=True, types={
                's': hl.tstr
            }).key_by('s')

        full_mt = hl.read_matrix_table(
            'gs://phenotype_31063/hail/imputed/ukb31063.dosage.autosomes.mt')
        full_mt = full_mt.annotate_cols(**covariates[full_mt.s])
        full_mt = hl.filter_intervals(full_mt, [hl.parse_locus_interval('22')])

        # annotate and filter imputed data to all sites with causal effects
        full_mt = full_mt.annotate_rows(ss=spike_slab[full_mt.row_key])
        full_mt = full_mt.filter_rows(hl.is_defined(full_mt.ss))

        # subset to white British subset, get 10 sets of 10k and run a gwas
        # for each of these w/ PCs as covs.
        # NOTE(review): hl.rand_unif is unseeded here, so the 10k subsets are
        # not reproducible across runs.
        for i in range(10):
            subset_pheno = phenos.annotate(r=hl.rand_unif(0, 1))
            subset_pheno = subset_pheno.order_by(
                subset_pheno.r).add_index('global_idx').key_by('s')
            subset_pheno = subset_pheno.filter(subset_pheno.global_idx < 10000)
            mt = full_mt.annotate_cols(**subset_pheno[full_mt.s])
            mt = mt.annotate_rows(maf=hl.agg.mean(mt.dosage) / 2)
            result_ht = hl.linear_regression_rows(
                y=[mt[i] for i in betas],
                x=mt.dosage,
                covariates=[1] + [mt['PC' + str(i)] for i in range(1, 21)],
                pass_through=['rsid', 'maf'])
            subset_pheno.export(
                'gs://armartin/mama/spike_slab/UKB_hm3.chr22.cm.beta.true_PRS.gwas_inds_'
                + str(i) + '.tsv.gz')
            result_ht.write(
                'gs://armartin/mama/spike_slab/UKB_hm3.chr22.cm.beta.true_PRS.gwas_sumstat_'
                + str(i) + '.ht', overwrite=True)

    if args.write_gwas:
        # Flatten each per-subset sumstat table (one array entry per beta
        # scale) into wide columns and export as TSV.
        for i in range(10):
            result_ht = hl.read_table(
                'gs://armartin/mama/spike_slab/UKB_hm3.chr22.cm.beta.true_PRS.gwas_sumstat_'
                + str(i) + '.ht')
            result_ht = result_ht.key_by()
            get_expr = {
                field + '_' + x: result_ht[field][i]
                for i, x in enumerate(betas)
                for field in ['beta', 'standard_error', 'p_value']
            }
            result_ht.select(chr=result_ht.locus.contig,
                             pos=result_ht.locus.position,
                             rsid=result_ht.rsid,
                             ref=result_ht.alleles[0],
                             alt=result_ht.alleles[1],
                             maf=result_ht.maf,
                             n=result_ht.n,
                             **get_expr)\
                .export('gs://armartin/mama/spike_slab/UKB_hm3.chr22.cm.beta.true_PRS.gwas_sumstat_' + str(i) + '.tsv.gz')
# Sex-imputation step: restrict hardcalls to initial-QC samples and pruned
# chrX variants, then run hl.impute_sex.
IMPUTESEX_TABLE = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/05_imputesex.ht'
IMPUTESEX_FILE = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/05_imputesex.tsv'
Y_NCALLED = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/05_ycalled.tsv'

INITIAL_SAMPLES = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/03_initial_qc.keep.sample_list'
PRUNED_CHRX_VARIANTS = 'gs://dalio_bipolar_w1_w2_hail_02/data/variants/04_chrX.prune.in'
PHENOTYPES_TABLE = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/phenotypes.ht'

# Sample keep-list (headerless; f0 = sample id) and LD-pruned chrX variants.
ht_initial_samples = hl.import_table(INITIAL_SAMPLES, no_header=True, key='f0')
ht_pruned_chrx_variants = hl.import_table(PRUNED_CHRX_VARIANTS, no_header=True)
sample_annotations = hl.read_table(PHENOTYPES_TABLE)

# Parse the variant strings (column f0) into locus/alleles and key on them
# so they can be joined against MatrixTable row keys.
ht_pruned_chrx_variants = ht_pruned_chrx_variants.annotate(
    **hl.parse_variant(ht_pruned_chrx_variants.f0, reference_genome='GRCh38'))
ht_pruned_chrx_variants = ht_pruned_chrx_variants.key_by(
    ht_pruned_chrx_variants.locus, ht_pruned_chrx_variants.alleles)

# NOTE(review): MT_HARDCALLS is defined earlier in the file (outside this
# section).
mt = hl.read_matrix_table(MT_HARDCALLS)
mt = mt.filter_cols(hl.is_defined(ht_initial_samples[mt.col_key]))
mt = mt.filter_rows(hl.is_defined(ht_pruned_chrx_variants[mt.row_key]))

n = mt.count()

print('n samples:')
print(n[1])
print('n variants:')
print(n[0])

# Overlapping F-statistic thresholds (0.6/0.6) mean samples between the
# defaults are forced to a call rather than left undetermined.
imputed_sex = hl.impute_sex(mt.GT, female_threshold=0.6, male_threshold=0.6)
# Filter the full MatrixTable down to the final sample and variant lists and
# attach phenotype / imputed-sex annotations.
# NOTE(review): FINAL_SAMPLE_LIST, FINAL_VARIANT_LIST, FINAL_PRUNED_VARIANTS,
# PHENOTYPES_TABLE, IMPUTESEX_TABLE, ANNOTATION_TABLE and MT are defined
# earlier in the file (outside this section).

# Final keep-lists: samples (headerless; f0 = sample id) and variants
# (typed locus/alleles columns).
ht_final_samples = hl.import_table(FINAL_SAMPLE_LIST, no_header=True, key='f0')
ht_final_variants = hl.import_table(FINAL_VARIANT_LIST,
                                    types={
                                        'locus': hl.tlocus(reference_genome='GRCh38'),
                                        'alleles': hl.tarray(hl.tstr)
                                    })
ht_final_variants = ht_final_variants.key_by(ht_final_variants.locus,
                                             ht_final_variants.alleles)

# LD-pruned variants come as raw strings (f0); parse into locus/alleles.
ht_final_pruned_variants = hl.import_table(FINAL_PRUNED_VARIANTS, no_header=True)
ht_final_pruned_variants = ht_final_pruned_variants.annotate(
    **hl.parse_variant(ht_final_pruned_variants.f0, reference_genome='GRCh38'))
ht_final_pruned_variants = ht_final_pruned_variants.key_by(
    ht_final_pruned_variants.locus, ht_final_pruned_variants.alleles)

sample_annotations = hl.read_table(PHENOTYPES_TABLE)
impute_sex_annotations = hl.read_table(IMPUTESEX_TABLE)
annotation_annotations = hl.read_table(ANNOTATION_TABLE)

mt = hl.read_matrix_table(MT)
# Drop split-multi bookkeeping fields that are no longer needed.
mt = mt.drop('a_index', 'qual', 'info', 'filters', 'was_split')

mt = mt.filter_cols(hl.is_defined(ht_final_samples[mt.col_key]))
mt = mt.filter_rows(hl.is_defined(ht_final_variants[mt.row_key]))

mt = mt.annotate_cols(phenotype=sample_annotations[mt.col_key])
mt = mt.annotate_cols(imputesex=impute_sex_annotations[mt.col_key])
def read_gwas_table(gwas_table_tsv, type_dic):
    """Load a GWAS summary-statistics TSV and key it by locus/alleles.

    The `variant` column is parsed into a temporary struct whose locus and
    alleles become the table key; the helper struct itself is not kept.
    """
    ht = hl.import_table(gwas_table_tsv, types=type_dic)
    ht = ht.annotate(v=hl.parse_variant(ht.variant))
    ht = ht.key_by(ht.v.locus, ht.v.alleles)
    return ht.drop('v')
# PCA preparation: restrict hardcalls to initial-QC samples (minus IBD
# removals) and LD-pruned variants, then attach phenotypes.
MT_HARDCALLS = 'gs://raw_data_bipolar_dalio_w1_w2_hail_02/bipolar_wes_dalio_W1_W2/filterGT_GRCh38_6_multi.hardcalls.mt'
PCA_SCORES = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/09_pca_scores.tsv'
PHENOTYPES_TABLE = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/phenotypes.ht'

INITIAL_SAMPLES = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/03_initial_qc.keep.sample_list'
PRUNED_VARIANTS = 'gs://dalio_bipolar_w1_w2_hail_02/data/variants/04_prune.keep.variant_list'
IBD_SAMPLES = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/06_ibd.remove.sample_list'

mt = hl.read_matrix_table(MT_HARDCALLS)
sample_annotations = hl.read_table(PHENOTYPES_TABLE)
# Keep-list and remove-list files are headerless; f0 holds the id / variant.
ht_initial_samples = hl.import_table(INITIAL_SAMPLES, no_header=True, key='f0')
ht_pruned_variants = hl.import_table(PRUNED_VARIANTS, no_header=True)
ht_ibd_samples = hl.import_table(IBD_SAMPLES, no_header=True, key='f0')

# Parse pruned-variant strings into locus/alleles for the row-filter join.
ht_pruned_variants = ht_pruned_variants.annotate(
    **hl.parse_variant(ht_pruned_variants.f0, reference_genome='GRCh38'))
ht_pruned_variants = ht_pruned_variants.key_by(ht_pruned_variants.locus,
                                               ht_pruned_variants.alleles)

# Keep initial-QC samples, drop IBD-flagged samples, keep pruned variants.
mt = mt.filter_cols(hl.is_defined(ht_initial_samples[mt.col_key]))
mt = mt.filter_cols(~hl.is_defined(ht_ibd_samples[mt.col_key]))
mt = mt.filter_rows(hl.is_defined(ht_pruned_variants[mt.row_key]))
# Persist after the filters since the result is scanned more than once.
mt = mt.annotate_cols(phenotype = sample_annotations[mt.s]).repartition(128).persist()

n = mt.count()

print('n samples:')
print(n[1])
print('n variants:')
print(n[0])
def main(args):
    """Annotate fine-mapped SNPs with VEP consequences and gnomAD LD r2.

    Reads the SNP table from args.snp, attaches the most severe VEP
    consequence from gnomAD v3, computes r2 to the lead (highest-prob)
    variant per population from the gnomAD v2.1.1 LD matrices, and writes
    the result as a TSV to args.out.
    """
    # Build variant strings from chrom/pos/alleles, parse into locus/alleles,
    # and index each SNP so rows can be matched back to pandas positions.
    ht_snp = hl.import_table(args.snp, impute=True)
    ht_snp = ht_snp.annotate(variant=hl.delimit([
        ht_snp.chromosome,
        hl.str(ht_snp.position), ht_snp.allele1, ht_snp.allele2
    ], delimiter=':'))
    ht_snp = ht_snp.annotate(
        **hl.parse_variant(ht_snp.variant, reference_genome='GRCh38'))
    ht_snp = ht_snp.key_by('locus', 'alleles')
    ht_snp = ht_snp.add_index('idx_snp')
    ht_snp = ht_snp.checkpoint(new_temp_file())

    # annotate vep
    gnomad = hl.read_table(
        'gs://gnomad-public-requester-pays/release/3.0/ht/genomes/gnomad.genomes.r3.0.sites.ht'
    )
    ht_snp = ht_snp.join(gnomad.select('vep'), how='left')
    ht_snp = process_consequences(ht_snp)

    # extract most severe consequence, preferring the canonical transcript
    ht_snp = ht_snp.annotate(vep=(hl.case().when(
        hl.is_defined(ht_snp.vep.worst_csq_for_variant_canonical),
        ht_snp.vep.worst_csq_for_variant_canonical).when(
            hl.is_defined(ht_snp.vep.worst_csq_for_variant),
            ht_snp.vep.worst_csq_for_variant).or_missing()),
                             is_canonical_vep=hl.is_defined(
                                 ht_snp.vep.worst_csq_for_variant_canonical))
    ht_snp = ht_snp.annotate(most_severe=hl.if_else(
        hl.is_defined(ht_snp.vep), ht_snp.vep.most_severe_consequence,
        'intergenic_variant'),
                             gene_most_severe=ht_snp.vep.gene_symbol)
    ht_snp = ht_snp.select_globals()
    ht_snp = ht_snp.drop('vep')
    ht_snp = ht_snp.annotate(
        **annotate_consequence_category(ht_snp.most_severe))
    ht_snp = ht_snp.checkpoint(new_temp_file())
    df = ht_snp.key_by().drop('locus', 'alleles', 'variant',
                              'idx_snp').to_pandas()

    # annotate LD: per population, r2 of every SNP to the lead variant
    for pop in POPS:
        ht = hl.read_table(
            f'gs://gnomad-public-requester-pays/release/2.1.1/ld/gnomad.genomes.r2.1.1.{pop}.common.adj.ld.variant_indices.ht'
        )
        # The LD index table is GRCh37; lift to GRCh38 to join with ht_snp.
        ht = ht.annotate(locus_hg38=hl.liftover(ht.locus, 'GRCh38'))
        ht = ht.filter(hl.is_defined(ht.locus_hg38))
        ht = ht.key_by('locus_hg38', 'alleles').drop('locus')
        ht = ht_snp.join(ht, 'inner')
        ht = ht.checkpoint(new_temp_file())
        # Lead variant = highest posterior probability.
        lead_idx = ht.order_by(hl.desc(ht.prob)).head(1).idx.collect()
        idx = ht.idx.collect()
        bm = BlockMatrix.read(
            f'gs://gnomad-public-requester-pays/release/2.1.1/ld/gnomad.genomes.r2.1.1.{pop}.common.ld.bm'
        )
        bm = bm.filter(idx, idx)
        # re-densify triangular matrix
        bm = bm + bm.T - get_diag_mat(bm.diagonal())
        # Keep only the lead variant's row; square correlations to get r2.
        bm = bm.filter_rows(
            np.where(np.array(idx) == lead_idx[0])[0].tolist())**2

        idx_snp = ht.idx_snp.collect()
        r2 = bm.to_numpy()[0]
        col = f'gnomad_lead_r2_{pop}'
        df[col] = np.nan
        # Fix: write positionally in a single .iloc call. The original
        # `df[col].iloc[idx_snp] = r2` is chained assignment through an
        # intermediate Series and can silently fail to modify df
        # (pandas SettingWithCopy).
        df.iloc[idx_snp, df.columns.get_loc(col)] = r2

    if args.out.startswith('gs://'):
        fopen = hl.hadoop_open
    else:
        fopen = open
    with fopen(args.out, 'w') as f:
        df.to_csv(f, sep='\t', na_rep='NA', index=False)
'Cells_Transformed_fibroblasts', 'Colon_Sigmoid', 'Colon_Transverse', 'Esophagus_Gastroesophageal_Junction', 'Esophagus_Mucosa', 'Esophagus_Muscularis', 'Heart_Atrial_Appendage', 'Heart_Left_Ventricle', 'Liver', 'Lung', 'Minor_Salivary_Gland', 'Muscle_Skeletal', 'Nerve_Tibial', 'Ovary', 'Pancreas', 'Pituitary', 'Prostate', 'Skin_Not_Sun_Exposed_Suprapubic', 'Skin_Sun_Exposed_Lower_leg', 'Small_Intestine_Terminal_Ileum', 'Spleen', 'Stomach', 'Testis', 'Thyroid', 'Uterus', 'V****a', 'Whole_Blood' ] hts = [(hl.import_table( 'gs://hail-datasets/raw-data/gtex/v7/single-tissue-eqtl/processed/{}.allpairs.tsv.bgz' .format(x)).annotate(tissue='{}'.format(x))) for x in tissues] ht_union = hl.Table.union(*hts) ht_union = ht_union.annotate(**hl.parse_variant( ht_union.variant_id.replace('_b37$', '').replace('_', ':'))) ht_union = ht_union.drop('variant_id') ht_union = ht_union.annotate(tss_distance=hl.int(ht_union['tss_distance']), maf=hl.float(ht_union['maf']), ma_samples=hl.int(ht_union['ma_samples']), ma_count=hl.int(ht_union['ma_count']), pval_nominal=hl.float(ht_union['pval_nominal']), slope=hl.float(ht_union['slope']), slope_se=hl.float( hl.or_missing(ht_union['slope_se'] != '-nan', ht_union['slope_se']))) mt = ht_union.to_matrix_table(row_key=['locus', 'alleles', 'gene_id'], col_key=['tissue'], row_fields=['tss_distance', 'maf']) mt = mt.partition_rows_by(['locus'], 'locus', 'alleles', 'gene_id')
# ukb_snps = ukb_snps.annotate(sort_al=hl.sorted(ukb_snps.alleles)) if verbose: print("\nCount 1: " + str(ukb_snps.count()) + '\n') ukb_snps = ukb_snps.filter( hl.is_snp(ukb_snps.alleles[0], ukb_snps.alleles[1]) & (~(ukb_snps.locus.contig == 'X')) & (~((ukb_snps.locus.contig == '6') & (ukb_snps.locus.position > 25000000) & (ukb_snps.locus.position < 34000000)))) if verbose: print("\nCount 2: " + str(ukb_snps.count()) + '\n') # merge in, filter on MAF from the UKBB GWAS sample ukb_qc = hl.import_table(GWAS_qc) ukb_qc = ukb_qc.annotate(vstruct=hl.parse_variant(ukb_qc.variant)) ukb_qc = ukb_qc.annotate(locus=ukb_qc.vstruct.locus, alleles=ukb_qc.vstruct.alleles).key_by( 'locus', 'alleles') ukb_qc2 = ukb_snps.join(ukb_qc.select(ukb_qc.minor_AF)) if verbose: print("\nCount 3: " + str(ukb_qc2.count()) + '\n') ukb_qc2 = ukb_qc2.filter((hl.float(ukb_qc2.minor_AF) > 0.01) & (hl.float(ukb_qc2.minor_AF) < 0.99)) if verbose: print("\nCount 4: " + str(ukb_qc2.count()) + '\n') # merge in rsid, info (from full UKB sample) # and filter to info > 0.9 ukb_mfi = hl.read_table(GWAS_mfi).key_by('locus', 'alleles').repartition(