def man_qq_plts(mt):
    """Render side-by-side Manhattan and QQ plots as an inline HTML image tag.

    Regresses ``mt.is_case`` on the alternate-allele count of ``mt.GT`` with an
    intercept-only covariate list, converts the per-variant p-values to pandas,
    and draws both plots with ``qqman``.

    Returns:
        An ``<img>`` tag whose ``src`` is a base64-encoded PNG data URI.
    """
    regression = hl.linear_regression_rows(y=mt.is_case,
                                           x=mt.GT.n_alt_alleles(),
                                           covariates=[1.0])
    frame = regression.select(regression.p_value).to_pandas()

    # Keep only the columns the plots need, under the CHR/BP/P names.
    plot_df = frame[['locus.contig', 'locus.position', 'p_value']]
    plot_df.columns = ['CHR', 'BP', 'P']
    # Drop rows with missing values, then recode the X/Y/MT contig labels
    # as 23/24/25.
    plot_df = plot_df.dropna().replace(to_replace=["X", "Y", "MT"],
                                       value=[23, 24, 25])

    figure, (manhattan_ax, qq_ax) = plt.subplots(nrows=1,
                                                 ncols=2,
                                                 figsize=(25, 10))
    qqman.manhattan(plot_df,
                    ax=manhattan_ax,
                    xrotation=90.0,
                    title="Manhattan plot")
    qqman.qqplot(plot_df, ax=qq_ax, title="QQ plot")
    figure.tight_layout()

    # Render into memory, then release the pyplot figure state.
    png_bytes = io.BytesIO()
    plt.savefig(png_bytes, format='PNG')
    plt.clf()
    plt.close()

    encoded = base64.b64encode(png_bytes.getvalue()).decode('ascii')
    return '<img src="data:image/png;base64,{}">'.format(encoded)
def run_gwas(vcf_file, phenotypes_file, output_file):
    """Run a PCA-adjusted linear-regression GWAS and export the results.

    Args:
        vcf_file: Path of the VCF to import.
        phenotypes_file: Delimited phenotype table with a ``Sample`` column
            (used as the join key against ``mt.s``) and a
            ``CaffeineConsumption`` column (the regression response).
        output_file: Output prefix. P-values go to ``<output_file>.assoc``;
            the genotypes are exported in PLINK format under the same prefix.

    Side effects:
        Writes a scratch MatrixTable to ``tmp.mt`` in the working directory.
    """
    table = hl.import_table(phenotypes_file, impute=True).key_by('Sample')

    # overwrite=True so that a leftover 'tmp.mt' from a previous run does not
    # make the write fail.
    hl.import_vcf(vcf_file).write('tmp.mt', overwrite=True)
    mt = hl.read_matrix_table('tmp.mt')
    mt = mt.annotate_cols(pheno=table[mt.s])

    # PCA is computed on a fixed-seed 1% sample of the rows; only the
    # per-sample scores are used below.
    downsampled = mt.sample_rows(0.01, seed=11223344)
    _, pcs, _ = hl.hwe_normalized_pca(downsampled.GT)
    mt = mt.annotate_cols(scores=pcs[mt.s].scores)

    # Covariates: intercept plus the first three PC scores.
    gwas = hl.linear_regression_rows(
        y=mt.pheno.CaffeineConsumption,
        x=mt.GT.n_alt_alleles(),
        covariates=[1.0, mt.scores[0], mt.scores[1], mt.scores[2]])

    # Reduce the result to a (SNP, P) table keyed by the variant string.
    gwas = gwas.select(SNP=hl.variant_str(gwas.locus, gwas.alleles),
                       P=gwas.p_value)
    gwas = gwas.key_by(gwas.SNP)
    gwas = gwas.select(gwas.P)
    gwas.export(f'{output_file}.assoc', header=True)

    hl.export_plink(mt, output_file, fam_id=mt.s, ind_id=mt.s)
def gwas(mt,
         x,
         y,
         cov_list=[],
         with_intercept=True,
         pass_through=[],
         path_to_save=None,
         normalize_x=False,
         is_std_cov_list=False):
    '''Runs GWAS in Hail and returns the results joined onto a sumstats template.

    Args:
        mt: MatrixTable to run the regression on.
        x: Entry expression used as the regressor.
        y: Column expression used as the response.
        cov_list: Covariates; string items are looked up as fields of ``mt``.
            Ignored when ``is_std_cov_list`` is true.
        with_intercept: Prepend the constant ``1`` to the covariates.
        pass_through: Extra row fields to carry into the result
            (``'rsid'`` is always included).
        path_to_save: If not None, the final table is exported to this path.
        normalize_x: Standardise ``x`` per row (subtract mean, divide by stdev).
        is_std_cov_list: Replace ``cov_list`` with the fixed list of
            sex/age terms plus ``PC1``..``PC20``.

    Returns:
        The template table read from GCS, keyed by ``SNP``, annotated with
        ``Z`` (t-statistic) and ``N`` from the regression.
    '''
    mt = mt._annotate_all(col_exprs={'__y': y}, entry_exprs={'__x': x})

    if normalize_x:
        mt = mt.annotate_rows(__gt_stats=hl.agg.stats(mt['__x']))
        row_stats = mt['__gt_stats']
        mt = mt.annotate_entries(__x=(mt['__x'] - row_stats.mean) /
                                 row_stats.stdev)
        mt = mt.drop('__gt_stats')

    if is_std_cov_list:
        demographic_terms = [
            'isFemale', 'age', 'age_squared', 'age_isFemale',
            'age_squared_isFemale'
        ]
        pc_terms = ['PC{:}'.format(i) for i in range(1, 21)]
        cov_list = demographic_terms + pc_terms

    # Resolve covariates given by field name into expressions on mt.
    if any(type(cov) is str for cov in cov_list):
        cov_list = [mt[cov] if type(cov) is str else cov for cov in cov_list]

    intercept = [1] if with_intercept else []
    cov_list = intercept + cov_list

    # 'rsid' is always passed through; the set removes duplicates.
    pass_through = list(set(['rsid'] + pass_through))
    print(f'variables to pass through: {pass_through}')

    gwas_ht = hl.linear_regression_rows(y=mt['__y'],
                                        x=mt['__x'],
                                        covariates=cov_list,
                                        pass_through=pass_through)
    gwas_ht = gwas_ht.annotate_globals(with_intercept=with_intercept)
    gwas_ht = gwas_ht.rename({'rsid': 'SNP'}).key_by('SNP')
    gwas_ht = gwas_ht.select(Z=gwas_ht.t_stat, N=gwas_ht.n)

    # Sumstats template stored as a Hail table.
    ss_template = hl.read_table(
        'gs://nbaya/rg_sex/hm3.sumstats_template.ht').key_by('SNP')
    ss = ss_template.annotate(Z=gwas_ht[ss_template.SNP].Z,
                              N=gwas_ht[ss_template.SNP].N)

    if path_to_save is not None:
        ss.export(path_to_save)
    return ss
def run_grouped_regressions(mt, ss_output, pheno, pheno_name):
    """Regress each phenotype in ``pheno`` on dosage and write the result table.

    Each phenotype is passed as its own single-element group in ``y``.
    The covariates are the constant ``1`` followed by every field of
    ``mt['covariates']``.

    Args:
        mt: MatrixTable with ``phenotypes`` and ``covariates`` column structs
            and a ``dosage`` entry field.
        ss_output: Output path prefix.
        pheno: Names of the fields of ``mt['phenotypes']`` to regress.
        pheno_name: Label appended to ``ss_output`` to form the ``.ht`` path.

    Note:
        Reads ``args.overwrite`` from the enclosing module scope.
    """
    covariate_struct = mt['covariates']
    covariate_exprs = [
        covariate_struct[name] for name in list(covariate_struct.keys())
    ]
    phenotype_groups = [[mt['phenotypes'][name]] for name in pheno]

    results = hl.linear_regression_rows(y=phenotype_groups,
                                        x=mt.dosage,
                                        covariates=[1] + covariate_exprs,
                                        pass_through=['varid', 'rsid'])
    # Record the phenotype names as a global (the original author flagged
    # this area with "check this").
    results = results.annotate_globals(phenotypes=pheno)
    results.write(ss_output + pheno_name + '.ht', overwrite=args.overwrite)
def gwas(mt,
         x,
         y,
         cov_list=[],
         with_intercept=True,
         pass_through=[],
         path_to_save=None,
         normalize_x=True,
         is_std_cov_list=False):
    '''Runs GWAS and returns the regression table keyed by SNP.

    Args:
        mt: MatrixTable to run the regression on.
        x: Entry expression used as the regressor.
        y: Column expression used as the response.
        cov_list: Covariates; string items are looked up as fields of ``mt``.
            Ignored when ``is_std_cov_list`` is true.
        with_intercept: Prepend the constant ``1`` to the covariates.
        pass_through: Extra row fields to carry through, after ``'rsid'``.
        path_to_save: If not None, the template-joined sumstats table is
            exported to this path.
        normalize_x: Replace ``x`` with the ``__norm_gt`` field produced by
            ``normalize_genotypes``.
        is_std_cov_list: Replace ``cov_list`` with the fixed list of
            sex/age terms plus ``PC1``..``PC20``.

    Returns:
        The regression table with fields ``Z`` and ``N``, keyed by ``SNP``.
        The template-joined table is only exported, never returned.
    '''
    mt = mt._annotate_all(col_exprs={'y': y}, entry_exprs={'x': x})

    if normalize_x:
        mt = normalize_genotypes(mt, mt.x)
        mt = mt.annotate_entries(x=mt['__norm_gt'])
        mt = mt.drop('__norm_gt')

    if is_std_cov_list:
        demographic_terms = [
            'isFemale', 'age', 'age_squared', 'age_isFemale',
            'age_squared_isFemale'
        ]
        pc_terms = ['PC{:}'.format(i) for i in range(1, 21)]
        cov_list = demographic_terms + pc_terms

    # Resolve covariates given by field name into expressions on mt.
    if any(type(cov) is str for cov in cov_list):
        cov_list = [mt[cov] if type(cov) is str else cov for cov in cov_list]

    intercept = [1] if with_intercept else []
    cov_list = intercept + cov_list

    print(pass_through)

    gwas_ht = hl.linear_regression_rows(y=mt.y,
                                        x=mt.x,
                                        covariates=cov_list,
                                        pass_through=['rsid'] + pass_through)
    gwas_ht = gwas_ht.annotate_globals(with_intercept=with_intercept)
    gwas_ht = gwas_ht.rename({'rsid': 'SNP'}).key_by('SNP')
    gwas_ht = gwas_ht.select(Z=gwas_ht.t_stat, N=gwas_ht.n)

    template = hl.import_table('gs://nbaya/rg_sex/50_snps_alleles_N.tsv.gz',
                               types={'N': hl.tint64}).key_by('SNP')
    sumstats = template.annotate(Z=gwas_ht[template.SNP].Z,
                                 N=gwas_ht[template.SNP].N)

    if path_to_save is not None:
        sumstats.export(path_to_save)
    return gwas_ht
def linear_regression_rows(mt_path):
    """Run a linear regression with 100 random phenotypes and 20 random covariates.

    Reads the MatrixTable at ``mt_path``, annotates its columns with uniform
    random values in [0, 1) for every phenotype and covariate, regresses all
    phenotypes on the entry field ``x``, and forces the result to be computed.
    """
    mt = hl.read_matrix_table(mt_path)

    n_phenotypes, n_covariates = 100, 20
    random_phenos = {
        f"pheno_{idx}": hl.rand_unif(0, 1) for idx in range(n_phenotypes)
    }
    random_covs = {
        f"cov_{idx}": hl.rand_unif(0, 1) for idx in range(n_covariates)
    }

    mt = mt.annotate_cols(**random_phenos)
    mt = mt.annotate_cols(**random_covs)

    result = hl.linear_regression_rows(
        y=[mt[name] for name in random_phenos],
        x=mt.x,
        covariates=[mt[name] for name in random_covs])
    # Only the execution matters here; the count itself is discarded.
    result._force_count()
def run_gwas(mt,
             phen: str,
             sim_name: str,
             subset_idx: int,
             param_suffix: str,
             wd: str,
             is_logreg=True):
    """Run (or reload) a single-phenotype GWAS for one sample subset.

    Args:
        mt: MatrixTable with a ``dosage`` or ``GT`` entry field, a
            ``subset_idx`` column field and a column field named ``phen``.
        phen: Name of the phenotype column field.
        sim_name: Simulation label used in the output file name.
        subset_idx: Only columns whose ``subset_idx`` equals this are kept.
        param_suffix: Extra label used in the output file name.
        wd: Directory prefix of the output file.
        is_logreg: Use Wald logistic regression when true, linear regression
            otherwise.

    Returns:
        When the results file does not exist yet: the regression table, after
        its ``EAF``/``beta``/``standard_error``/``p_value`` columns have been
        exported. Otherwise: the previously exported file re-imported and
        keyed by ``locus``, ``alleles``.

    Raises:
        ValueError: If ``mt`` has neither a ``dosage`` nor a ``GT`` entry field.
    """
    # The original guard compared a set against `{}` (a dict), which is never
    # equal, so it could not fire. Test membership explicitly instead.
    if 'dosage' not in mt.entry and 'GT' not in mt.entry:
        raise ValueError(
            "mt does not have an entry field named 'dosage' or 'GT' corresponding to genotype data"
        )

    mt = mt.filter_cols(mt.subset_idx == subset_idx)
    mt = mt.filter_cols(hl.is_defined(mt[phen]))
    print(
        f'\n\ngwas sample count (subset {subset_idx}): {mt.count_cols()}\n\n')

    # Effect-allele frequency = mean alt-allele dosage / 2; dosage is
    # preferred over GT when both exist.
    if 'dosage' in mt.entry:
        mt = mt.annotate_rows(EAF=hl.agg.mean(mt.dosage) / 2)
    elif 'GT' in mt.entry:
        mt = mt.annotate_rows(EAF=hl.agg.mean(mt.GT.n_alt_alleles()) / 2)

    gwas_path = f'{wd}/gwas.{"logreg" if is_logreg else "linreg"}.{sim_name}.subset_{subset_idx}.{param_suffix}.tsv.gz'

    if not hl.hadoop_is_file(gwas_path):
        gt_field = mt.dosage if 'dosage' in mt.entry else mt.GT.n_alt_alleles()
        if is_logreg:
            gwas_ht = hl.logistic_regression_rows(test='wald',
                                                  y=mt[phen],
                                                  x=gt_field,
                                                  covariates=[1],
                                                  pass_through=['EAF'])
        else:
            gwas_ht = hl.linear_regression_rows(y=mt[phen],
                                                x=gt_field,
                                                covariates=[1],
                                                pass_through=['EAF'])
        gwas_ht.select('EAF', 'beta', 'standard_error',
                       'p_value').export(gwas_path)
    else:
        print(f'GWAS already run! ({gwas_path})')
        gwas_ht = hl.import_table(gwas_path, impute=True, force=True)
        # The alleles column was exported as text like ["A","T"]: strip the
        # opening and closing bracket-quote pairs, then split on the inner
        # separator. Raw strings hold the same regex text as the original
        # literals without relying on invalid escape sequences.
        gwas_ht = gwas_ht.annotate(locus=hl.parse_locus(gwas_ht.locus),
                                   alleles=gwas_ht.alleles.replace(
                                       r'\["', '').replace(r'"\]',
                                                           '').split('","'))
        gwas_ht = gwas_ht.key_by('locus', 'alleles')
    return gwas_ht
def run_gwas(vcf_file, phenotypes_file, output_file):
    """Run a QC-filtered, covariate-adjusted linear-regression GWAS and export it.

    Args:
        vcf_file: Path of the VCF to import.
        phenotypes_file: Delimited phenotype table with a ``Sample`` column
            (join key against ``mt.s``) plus ``CaffeineConsumption`` and
            ``isFemale`` columns.
        output_file: Output prefix. P-values go to ``<output_file>.assoc``;
            the filtered genotypes are exported in PLINK format under the
            same prefix.

    Side effects:
        Writes a scratch MatrixTable to ``tmp.mt`` in the working directory.
    """
    table = hl.import_table(phenotypes_file, impute=True).key_by('Sample')

    # overwrite=True so that a leftover 'tmp.mt' from a previous run does not
    # make the write fail.
    hl.import_vcf(vcf_file).write('tmp.mt', overwrite=True)
    mt = hl.read_matrix_table('tmp.mt')
    mt = mt.annotate_cols(pheno=table[mt.s])

    # Sample QC: keep samples with mean depth >= 4 and call rate >= 0.97.
    mt = hl.sample_qc(mt)
    mt = mt.filter_cols((mt.sample_qc.dp_stats.mean >= 4) &
                        (mt.sample_qc.call_rate >= 0.97))

    # Genotype QC: keep calls whose alt-allele read fraction matches the call.
    ab = mt.AD[1] / hl.sum(mt.AD)
    filter_condition_ab = ((mt.GT.is_hom_ref() & (ab <= 0.1)) |
                           (mt.GT.is_het() & (ab >= 0.25) & (ab <= 0.75)) |
                           (mt.GT.is_hom_var() & (ab >= 0.9)))
    mt = mt.filter_entries(filter_condition_ab)

    # Variant QC: keep variants whose second AF entry exceeds 1%.
    mt = hl.variant_qc(mt)
    mt = mt.filter_rows(mt.variant_qc.AF[1] > 0.01)

    # Only the per-sample PCA scores are used below.
    _, pcs, _ = hl.hwe_normalized_pca(mt.GT)
    mt = mt.annotate_cols(scores=pcs[mt.s].scores)

    # Covariates: intercept, sex, and the first three PC scores.
    gwas = hl.linear_regression_rows(y=mt.pheno.CaffeineConsumption,
                                     x=mt.GT.n_alt_alleles(),
                                     covariates=[
                                         1.0, mt.pheno.isFemale, mt.scores[0],
                                         mt.scores[1], mt.scores[2]
                                     ])

    # Reduce the result to a (SNP, P) table keyed by the variant string.
    gwas = gwas.select(SNP=hl.variant_str(gwas.locus, gwas.alleles),
                       P=gwas.p_value)
    gwas = gwas.key_by(gwas.SNP)
    gwas = gwas.select(gwas.P)
    gwas.export(f'{output_file}.assoc', header=True)

    hl.export_plink(mt, output_file, fam_id=mt.s, ind_id=mt.s)
if n_chunks == 2: #traditional split of population into even halves (no meta-analysis) mt_A = mt.filter_cols(mt.group_id == 0) mt_B = mt.filter_cols(mt.group_id == 1) cov_list_A = [ mt_A['isFemale'], mt_A['age'], mt_A['age_squared'], mt_A['age_isFemale'], mt_A['age_squared_isFemale'] ] + [mt_A['PC{:}'.format(i)] for i in range(1, 21)] cov_list_B = [ mt_B['isFemale'], mt_B['age'], mt_B['age_squared'], mt_B['age_isFemale'], mt_B['age_squared_isFemale'] ] + [mt_B['PC{:}'.format(i)] for i in range(1, 21)] ht_A = hl.linear_regression_rows(y=mt_A.y, x=mt_A.x, covariates=[1] + cov_list_A, pass_through=['rsid']) ht_B = hl.linear_regression_rows(y=mt_B.y, x=mt_B.x, covariates=[1] + cov_list_B, pass_through=['rsid']) ht_A = ht_A.rename({'rsid': 'SNP'}).key_by('SNP') ht_B = ht_B.rename({'rsid': 'SNP'}).key_by('SNP') ht_A = ht_A.select(Z=ht_A.beta / ht_A.standard_error) ht_B = ht_B.select(Z=ht_B.beta / ht_B.standard_error) sumstats_template = hl.import_table( 'gs://nbaya/rg_sex/50_snps_alleles_N.tsv.gz',
test="firth", #controls false positives y=hl.float(mt.AffectionBool), x=mt.GT.n_alt_alleles(), covariates=[ 1, hl.float(mt.PackYear), mt.scores[0], mt.scores[1], mt.scores[2], mt.scores[3], mt.scores[4], mt.scores[5], mt.scores[6], mt.scores[7], mt.scores[8], mt.scores[9] ]) ######## 6.2 Linear regression ~ FEV1% predicted (lung function variable) gwas = hl.linear_regression_rows(y=hl.float(mt.FEV1), x=mt.GT.n_alt_alleles(), covariates=[ 1, hl.float(mt.PackYear), mt.scores[0], mt.scores[1], mt.scores[2], mt.scores[3], mt.scores[4], mt.scores[5], mt.scores[6], mt.scores[7], mt.scores[8], mt.scores[9] ]) ######## 6.3 Q-Q plot qqplot = hl.plot.qq(gwas.p_value) show(qqplot) ######## 6.4 Manhattan-like plots #GWAS significance level = 5.0 10e-8, suggestive: 5.0 10e-8 < P < 5.0 * 10e-6. # Calculate Bonferroni based cut off lines Bonferroni_line = -np.log10(0.05 / mt.count_rows()) Suggestive_line = -np.log10(1 / mt.count_rows())
# Benchmark script: import a VCF, apply sample/genotype/variant QC, then run an
# intercept-only linear regression and build a Manhattan plot.
args = parser.parse_args()
# NOTE(review): workdir is concatenated directly with 'hail.mt', so it
# presumably needs a trailing path separator -- confirm against the CLI docs.
hl.import_vcf(args.vcf_path).write(args.workdir + 'hail.mt', overwrite=True)
mt = hl.read_matrix_table(args.workdir + 'hail.mt')
# Sample annotations, keyed by 'Sample' so they can be joined on mt.s below.
table = (hl.import_table(args.annotation_path, impute=True)
         .key_by('Sample'))

# Start benchmarking after I/O
t0 = time.time()
mt = mt.annotate_cols(pheno = table[mt.s])
# Sample QC: keep samples with mean depth >= 4 and call rate >= 0.97.
mt = hl.sample_qc(mt)
mt = mt.filter_cols((mt.sample_qc.dp_stats.mean >= 4) & (mt.sample_qc.call_rate >= 0.97))
# Genotype QC: the alt-allele read fraction must be consistent with the call.
ab = mt.AD[1] / hl.sum(mt.AD)
filter_condition_ab = ((mt.GT.is_hom_ref() & (ab <= 0.1)) |
                       (mt.GT.is_het() & (ab >= 0.25) & (ab <= 0.75)) |
                       (mt.GT.is_hom_var() & (ab >= 0.9)))
# Fraction of entries failing the filter; not used again in the visible code.
fraction_filtered = mt.aggregate_entries(hl.agg.fraction(~filter_condition_ab))
mt = mt.filter_entries(filter_condition_ab)
# Variant QC: second AF entry above 1% and HWE p-value above 1e-6.
mt = hl.variant_qc(mt)
mt = mt.filter_rows(mt.variant_qc.AF[1] > 0.01)
mt = mt.filter_rows(mt.variant_qc.p_value_hwe > 1e-6)
# Intercept-only regression of caffeine consumption on alt-allele count.
gwas = hl.linear_regression_rows(y=mt.pheno.CaffeineConsumption,
                                 x=mt.GT.n_alt_alleles(),
                                 covariates=[1.0])
p = hl.plot.manhattan(gwas.p_value)
print('Time Elapsed: {}'.format(time.time()- t0))
def gwas(y, x, cov):
    """Run a Hail linear regression of ``y`` on ``x``, carrying ``rsid`` through.

    Args:
        y: Response expression(s).
        x: Regressor entry expression.
        cov: List of covariate expressions.

    Returns:
        The table produced by ``hl.linear_regression_rows``.
    """
    return hl.linear_regression_rows(y=y,
                                     x=x,
                                     covariates=cov,
                                     pass_through=['rsid'])
# COMMAND ---------- print('Samples: %d Variants: %d' % (common_mt.count_cols(), common_mt.count_rows())) # COMMAND ---------- # MAGIC %md These filters removed about 15% of sites (we started with a bit over 10,000). This is _NOT_ representative of most sequencing datasets! We have already downsampled the full thousand genomes dataset to include more common variants than we'd expect by chance. # MAGIC # MAGIC In Hail, the association tests accept column fields for the sample phenotype and covariates. Since we've already got our phenotype of interest (caffeine consumption) in the dataset, we are good to go: # COMMAND ---------- gwas = hl.linear_regression_rows(y=common_mt.CaffeineConsumption, x=common_mt.GT.n_alt_alleles(), covariates=[1.0]) gwas.row.describe() # COMMAND ---------- # MAGIC %md Looking at the bottom of the above printout, you can see the linear regression adds new row fields for the beta, standard error, t-statistic, and p-value. # MAGIC # MAGIC Hail makes it easy to make a [Q-Q (quantile-quantile) plot](https://en.wikipedia.org/wiki/Q–Q_plot). # COMMAND ---------- p = hl.plot.qq(gwas.p_value) displayBokeh(p) # COMMAND ----------
def gwas(mt,
         x,
         y,
         cov_list=[],
         with_intercept=True,
         pass_through=[],
         path_to_save=None,
         normalize_x=False,
         is_std_cov_list=False):
    r'''Runs GWAS and returns per-SNP summary statistics on a sumstats template.

    Args:
        mt: MatrixTable with ``rsid`` row field and ``dosage`` entry field
            (``dosage`` is always used for the allele frequency, independent
            of ``x``).
        x: Entry expression used as the regressor.
        y: Column expression used as the response.
        cov_list: Covariates; string items are looked up as fields of ``mt``.
            Ignored when ``is_std_cov_list`` is true. Never mutated here.
        with_intercept: Prepend the constant ``1`` to the covariates.
        pass_through: Extra row fields to carry through, after ``'rsid'``.
            Never mutated here.
        path_to_save: If not None, the final table is exported to this path.
        normalize_x: Standardise ``x`` per row (subtract mean, divide by stdev).
        is_std_cov_list: Replace ``cov_list`` with the fixed list of
            sex/age terms plus ``PC1``..``PC20``.

    Returns:
        The template table with columns renamed to ``snpid``/``a1``/``a2`` and
        annotated with ``chr``, ``bpos``, ``freq``, ``beta``, ``z``, ``pval``
        and ``n``; the template's own ``N`` column is dropped.
    '''
    mt = mt._annotate_all(col_exprs={'__y': y}, entry_exprs={'__x': x})

    print('\n... Calculating allele frequency ...')
    # Frequency of the alternate allele, taken from dosage.
    mt_freq_rows = mt.annotate_rows(freq=hl.agg.mean(mt.dosage) / 2).rows()
    mt_freq_rows = mt_freq_rows.key_by('rsid')

    if normalize_x:
        mt = mt.annotate_rows(__gt_stats=hl.agg.stats(mt.__x))
        mt = mt.annotate_entries(__x=(mt.__x - mt.__gt_stats.mean) /
                                 mt.__gt_stats.stdev)
        mt = mt.drop('__gt_stats')

    if is_std_cov_list:
        cov_list = [
            'isFemale', 'age', 'age_squared', 'age_isFemale',
            'age_squared_isFemale'
        ] + ['PC{:}'.format(i) for i in range(1, 21)]

    # Resolve covariates given by field name into expressions on mt.
    if any(isinstance(cov, str) for cov in cov_list):
        cov_list = [
            mt[cov] if isinstance(cov, str) else cov for cov in cov_list
        ]
    cov_list = ([1] if with_intercept else []) + cov_list

    print(f'pass through: {pass_through}')

    gwas_ht = hl.linear_regression_rows(y=mt.__y,
                                        x=mt.__x,
                                        covariates=cov_list,
                                        pass_through=['rsid'] + pass_through)
    gwas_ht = gwas_ht.annotate_globals(with_intercept=with_intercept)
    gwas_ht = gwas_ht.key_by('rsid')

    # Sumstats template stored as a Hail table.
    ss_template = hl.read_table('gs://nbaya/rg_sex/hm3.sumstats_template.ht')
    ss_template = ss_template.key_by('SNP')

    # Build the lookup into the regression results once and reuse it for
    # every annotated column.
    hit = gwas_ht[ss_template.SNP]
    ss = ss_template.annotate(chr=hit.locus.contig,
                              bpos=hit.locus.position,
                              freq=mt_freq_rows[ss_template.SNP].freq,
                              beta=hit.beta,
                              z=hit.t_stat,
                              pval=hit.p_value,
                              n=hit.n)
    ss = ss.drop('N')
    ss = ss.rename({'SNP': 'snpid', 'A1': 'a1', 'A2': 'a2'})

    # describe() prints the schema itself; wrapping it in print() also
    # printed its None return value.
    ss.describe()

    if path_to_save is not None:
        ss.export(path_to_save)
    return ss
# Regress every normalised full-blood-count trait on the alt-allele count with
# an intercept-only covariate list. The trait order below is the order of the
# original hand-written list; each name maps to the field
# mt.sample_qc_and_phenotype.fbc.<trait>_gwas_normalised.
gwas = hl.linear_regression_rows(
    y=[
        mt.sample_qc_and_phenotype.fbc[f'{trait}_gwas_normalised']
        for trait in (
            'neut_p',
            'eo_p',
            'mono_p',
            'lymph_p',
            'baso_p',
            'ret_p',
            'hlr_p',
            'hct',
            'pct',
            'hgb',
            'rbc',
            'wbc',
            'mpv',
            'plt',
            'rdw',
            'pdw',
            'mcv',
            'mch',
            'mchc',
            'ret',
            'hlr',
            'neut',
            'mono',
            'baso',
            'eo',
            'lymph',
            'irf',
            'myeloid_wbc',
            'gran',
            'eo_baso_sum',
            'neut_eo_sum',
            'baso_neut_sum',
            'gran_p_myeloid_wbc',
            'eo_p_gran',
            'neut_p_gran',
            'baso_p_gran',
        )
    ],
    x=mt.GT.n_alt_alleles(),
    covariates=[1.0],
    pass_through=[mt.rsid])
u = 8 rgs = [-0.3, -0.1, 0, 0.1, 0.3] # Import matrix and annotate cols with phenotypes mt = hl.read_matrix_table(input_matrix) mt = mt.annotate_cols(U=mt.y[u]) mt = mt.annotate_cols(X=mt.y[x]) for i in range(5): mt = mt.annotate_cols(Y=mt.y[i + 2]) # GWAS of X, Y in all result_ht = hl.linear_regression_rows(y=[mt.X, mt.Y], x=mt.GT.n_alt_alleles(), covariates=[1], pass_through=['rsid']) result_ht = result_ht.annotate(A1=result_ht.alleles[0], A2=result_ht.alleles[1]).key_by() file_names = [ output_bucket + 'gwas/gwas_X_' + str(rgs[i]) + '.tsv', output_bucket + 'gwas/gwas_Y_' + str(rgs[i]) + '.tsv' ] for j, file_name in enumerate(file_names): result_ht.select(result_ht.locus, result_ht.A1, result_ht.A2, result_ht.rsid,
# Load the dosage entries of the imputed BGEN for this contig, restricted to
# the variants in ht_variants.
mt = hl.import_bgen(
    path=
    f'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_{contig_expr}_v3.bgen',
    sample_file=f'gs://ukb31063/ukb31063.{contig}.sample',
    entry_fields=['dosage'],
    variants=ht_variants)

# Attach the phenotype and covariate rows for each sample as column structs.
mt = mt.annotate_cols(phenotypes=ht_phenotypes[mt.s],
                      covariates=ht_covariates[mt.s])

phenotypes = list(mt['phenotypes'].keys())

# Each phenotype is passed as its own single-element group in y; covariates
# are the constant 1 followed by every field of the covariates struct.
ht = hl.linear_regression_rows(
    y=[[mt['phenotypes'][y]] for y in phenotypes],
    x=mt.dosage,
    covariates=[
        1, *[mt['covariates'][x] for x in list(mt['covariates'].keys())]
    ],
    pass_through=['varid', 'rsid'])

# Store the phenotype names as a global alongside the results.
ht = ht.annotate_globals(phenotypes=phenotypes)

# The two output paths differ only in the '.dilution_factor' suffix.
if dilution:
    ht.write(
        f'gs://ukb31063-mega-gwas/biomarkers/results/ukb31063.biomarker_gwas_results.{sex}.{contig}.pipeline_{pipeline}.dilution_factor.ht',
        overwrite=True)
else:
    ht.write(
        f'gs://ukb31063-mega-gwas/biomarkers/results/ukb31063.biomarker_gwas_results.{sex}.{contig}.pipeline_{pipeline}.ht',
        overwrite=True)
def main(args):
    """Simulate phenotypes from spike-and-slab effects, run GWAS, export results.

    Three independent stages are selected by flags on ``args``:

    * ``compute_true_phenotypes``: sum effect-size x dosage over chr22 sites
      with an effect and write the per-sample totals as a table.
    * ``run_gwas``: for each of 10 repetitions, keep the first 10000 samples
      of a random ordering, regress every simulated phenotype on dosage with
      an intercept and ``PC1``..``PC20``, and write the sample list and the
      results.
    * ``write_gwas``: flatten each stored result table into a TSV.
    """
    effect_columns = ['beta_01', 'beta_1', 'beta_10', 'beta_100']
    covariates_tsv = 'gs://phenotype_31063/ukb31063.gwas_covariates.both_sexes.tsv'
    imputed_mt_path = 'gs://phenotype_31063/hail/imputed/ukb31063.dosage.autosomes.mt'
    true_prs_path = 'gs://armartin/mama/spike_slab/BBJ_UKB_hm3.chr22.cm.beta.true_PRS.ht'
    gwas_inds_prefix = 'gs://armartin/mama/spike_slab/UKB_hm3.chr22.cm.beta.true_PRS.gwas_inds_'
    sumstat_prefix = 'gs://armartin/mama/spike_slab/UKB_hm3.chr22.cm.beta.true_PRS.gwas_sumstat_'
    n_replicates = 10

    spike_slab = hl.import_table(
        'gs://armartin/mama/spike_slab/BBJ_UKB_hm3.chr22.cm.beta.txt',
        impute=True)
    spike_slab = spike_slab.key_by(**hl.parse_variant(spike_slab.v))

    if args.compute_true_phenotypes:
        # Sample list used to restrict the output (the original author
        # describes it as the white British subset).
        eur = hl.import_table(covariates_tsv).key_by('s')

        # Imputed data, restricted to chr22.
        mt = hl.read_matrix_table(imputed_mt_path)
        mt = hl.filter_intervals(mt, [hl.parse_locus_interval('22')])

        # Keep only sites that have an entry in the effect table.
        mt = mt.annotate_rows(ss=spike_slab[mt.row_key])
        mt = mt.filter_rows(hl.is_defined(mt.ss))

        # True PRS (i.e. phenotype) per effect column.
        prs_exprs = {
            col: hl.agg.sum(mt.ss[col] * mt.dosage) for col in effect_columns
        }
        mt = mt.annotate_cols(**prs_exprs)
        mt = mt.filter_cols(hl.is_defined(eur[mt.s]))
        mt.cols().write(true_prs_path, stage_locally=True, overwrite=True)

    if args.run_gwas:
        # Read the PRS back in; they now serve as the true phenotypes.
        phenos = hl.read_table(true_prs_path).key_by('s')
        phenos.show()

        covariates = hl.import_table(covariates_tsv,
                                     impute=True,
                                     types={
                                         's': hl.tstr
                                     }).key_by('s')

        full_mt = hl.read_matrix_table(imputed_mt_path)
        full_mt = full_mt.annotate_cols(**covariates[full_mt.s])
        full_mt = hl.filter_intervals(full_mt, [hl.parse_locus_interval('22')])

        # Keep only sites that have an entry in the effect table.
        full_mt = full_mt.annotate_rows(ss=spike_slab[full_mt.row_key])
        full_mt = full_mt.filter_rows(hl.is_defined(full_mt.ss))

        for rep in range(n_replicates):
            # Random subset: shuffle by a uniform draw, keep the first 10000.
            # NOTE(review): rand_unif is unseeded and the table is used by two
            # separate actions below (regression write and export); confirm
            # both see the same sample subset.
            shuffled = phenos.annotate(r=hl.rand_unif(0, 1))
            shuffled = shuffled.order_by(shuffled.r)
            shuffled = shuffled.add_index('global_idx').key_by('s')
            subset_pheno = shuffled.filter(shuffled.global_idx < 10000)

            mt = full_mt.annotate_cols(**subset_pheno[full_mt.s])
            mt = mt.annotate_rows(maf=hl.agg.mean(mt.dosage) / 2)

            pc_covariates = [mt['PC' + str(pc)] for pc in range(1, 21)]
            result_ht = hl.linear_regression_rows(
                y=[mt[col] for col in effect_columns],
                x=mt.dosage,
                covariates=[1] + pc_covariates,
                pass_through=['rsid', 'maf'])

            subset_pheno.export(gwas_inds_prefix + str(rep) + '.tsv.gz')
            result_ht.write(sumstat_prefix + str(rep) + '.ht', overwrite=True)

    if args.write_gwas:
        stat_fields = ['beta', 'standard_error', 'p_value']
        for rep in range(n_replicates):
            result_ht = hl.read_table(sumstat_prefix + str(rep) + '.ht')
            result_ht = result_ht.key_by()

            # One flat column per (statistic, effect column) pair; the
            # statistic arrays are indexed in the order of effect_columns.
            flattened = {
                stat + '_' + col: result_ht[stat][pos]
                for pos, col in enumerate(effect_columns)
                for stat in stat_fields
            }
            flat_ht = result_ht.select(chr=result_ht.locus.contig,
                                       pos=result_ht.locus.position,
                                       rsid=result_ht.rsid,
                                       ref=result_ht.alleles[0],
                                       alt=result_ht.alleles[1],
                                       maf=result_ht.maf,
                                       n=result_ht.n,
                                       **flattened)
            flat_ht.export(sumstat_prefix + str(rep) + '.tsv.gz')
# Filter MatrixTable and get sample samples_to_keep = set(df.loc[(df['sel'] == 1), 's']) set_to_keep = hl.literal(samples_to_keep) mt_sampled = mt.filter_cols(set_to_keep.contains(mt['s']), keep=True) i = '_' + str(OR_x) + '_' + str(or_sex) # Export phenotypes mt_sampled.cols().select( 's', 'sex', 'y0').key_by().export(out_bucket + 'phenotypes/pheno' + i + '.tsv') # Unadjusted GWASs result_ht = hl.linear_regression_rows(y=[mt_sampled.sex, mt_sampled.y0], x=mt_sampled.GT.n_alt_alleles(), covariates=[1], pass_through=['rsid']) result_ht = result_ht.annotate(A1=result_ht.alleles[0], A2=result_ht.alleles[1]).key_by() file_names = [ out_bucket + 'gwas/gwas_sex' + i + '.tsv', out_bucket + 'gwas/gwas_y0' + i + '.tsv' ] for j, file_name in enumerate(file_names): result_ht.select(result_ht.locus, result_ht.A1, result_ht.A2, result_ht.rsid,
tend = time.time()
logging.info('--> Annotate with {} FINISHED! {} seconds elapsed'.format('covariates', tend - tstart))

# prepare phenotypes and covariates into list of lists and list
logging.info('Start preparing `y` and `covariates` for `linear_regression_rows`')
# One inner list per key of subset_ht_dic: every field of the matching struct
# on mt, plus a parallel list of '<key>_x_<field>' labels in the same order.
pheno_list_of_lists = [
    [ mt[i][j] for j in mt[i] ] for i in list(subset_ht_dic.keys())
]
pheno_list_of_names = [
    [ f'{i}_x_{j}' for j in mt[i] ] for i in list(subset_ht_dic.keys())
]
covar_list = [ mt.covariates[i] for i in list(mt.covariates.keys()) ]
logging.info('Prepare `y` and `covariates` for `linear_regression_rows` FINISHED!')

# run GWAS
logging.info('Start running GWAS')
tstart = time.time()
# NOTE(review): this times only the call itself; if Hail defers the work until
# the table is written, the regression cost is not included -- confirm.
gwas_out = hl.linear_regression_rows(
    y = pheno_list_of_lists,
    x = mt.dosage,
    covariates = [1] + covar_list,
    pass_through = ['varid', 'rsid']
)
gwas_out = gwas_out.annotate_globals(phenotypes = pheno_list_of_names)
tend = time.time()
logging.info('Running GWAS FINISHED! {} seconds elapsed'.format(tend - tstart))

# write GWAS results onto disk
logging.info('Start writing GWAS result to disk')
tstart = time.time()
## if target folder does not exist, create it
target_folder = os.path.dirname(args.output_filename)
# A bare filename yields '' here, which must not be passed to makedirs.
# Compare by value: `is not ''` tested object identity, not string content.
if target_folder and not os.path.exists(target_folder):
    os.makedirs(target_folder)
## check if extension of output file is .ht, if not add it
filename, file_extension = os.path.splitext(args.output_filename)
def filter(self, mt):
    """Run a case/control linear regression and count significant variants.

    Regresses ``mt.is_case`` on the alternate-allele count of ``mt.GT`` with an
    intercept-only covariate list.

    Returns:
        A tuple ``(results, n_significant)``: the regression table and the
        number of its rows with ``p_value`` below 5E-8.
    """
    results = hl.linear_regression_rows(y=mt.is_case,
                                        x=mt.GT.n_alt_alleles(),
                                        covariates=[1.0])
    significant = results.filter(results.p_value < 5E-8)
    return results, significant.count()
# Summary statistics of the noise-free simulated phenotype; the result is not
# assigned (presumably inspected interactively).
sim_mt.aggregate_cols(hl.agg.stats(sim_mt.y_no_noise))

y = sim_mt.select_cols(sim_mt.y_no_noise).make_table()

# Covariates come from a local TSV, keyed by sample id with 's' forced to string.
cov = hl.import_table('/Users/nbaya/Documents/lab/ldscsim/ukb31063.gwas_covariates.both_sexes.tsv',impute=True, types={'s': hl.tstr}).key_by('s')

# Attach every covariate column to the samples and expose the normalised
# genotype field under the name 'x'.
mt0 = sim_mt.annotate_cols(**cov[sim_mt.s])
mt0 = mt0.rename({'__norm_gt__': 'x'})

mt = mt0

# Sex/age terms plus PC1..PC20.
cov_list = [ mt['isFemale'], mt['age'], mt['age_squared'], mt['age_isFemale'],
                mt['age_squared_isFemale'] ]+ [mt['PC{:}'.format(i)] for i in range(1, 21)]

ht = hl.linear_regression_rows(
        y=mt.y,
        x=mt.x,
        covariates=[1]+cov_list,
        pass_through = ['rsid'])

# Key the results by SNP id and keep only the Z score (beta / standard error).
ht = ht.rename({'rsid':'SNP'}).key_by('SNP')

ht = ht.select(Z = ht.beta/ht.standard_error)

sumstats_template = hl.import_table('gs://nbaya/rg_sex/50_snps_alleles_N.tsv.gz',types={'N': hl.tint64})
sumstats_template = sumstats_template.key_by('SNP')
# n_samples is defined outside this excerpt; it replaces the template's N.
sumstats_template = sumstats_template.annotate(N = n_samples)
#    sumstats_template.show()

# Join the Z scores onto the template by SNP id.
sumstats = sumstats_template.annotate(Z = ht[sumstats_template.SNP]['Z'])