def make_betas(mt, h2=None, pi=1, is_annot_inf=False, annot_coef_dict=None,
               annot_regex=None, h2_normalize=True):
    '''Simulate betas. Options: Infinitesimal model, spike & slab, annotation-informed'''
    check_beta_args(h2=h2, pi=pi, is_annot_inf=is_annot_inf,
                    annot_coef_dict=annot_coef_dict, annot_regex=annot_regex,
                    h2_normalize=h2_normalize)
    M = mt.count_rows()
    if is_annot_inf:
        print('\rSimulating {} annotation-informed betas {}'.format(
            'h2-normalized' if h2_normalize else '',
            '(default coef: 1)' if annot_coef_dict is None else 'using annot_coef_dict'))
        mt1 = agg_fields(mt=mt, coef_dict=annot_coef_dict, regex=annot_regex)
        annot_sum = mt1.aggregate_rows(hl.agg.sum(mt1.__agg_annot))
        # If h2_normalize: scale the variance of betas so it sums to h2; else keep the unscaled variance.
        return mt1.annotate_rows(
            __beta=hl.rand_norm(
                0, hl.sqrt(mt1.__agg_annot * (h2 / annot_sum if h2_normalize else 1))))
    else:
        print('Simulating betas using {} model w/ h2 = {}'.format(
            'infinitesimal' if pi == 1 else 'spike & slab', h2))
        mt1 = mt.annotate_globals(__h2=none_to_null(h2), __pi=pi)
        return mt1.annotate_rows(__beta=hl.rand_bool(pi) * hl.rand_norm(0, hl.sqrt(h2 / (M * pi))))
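# In the spike & slab branch above, each variant is causal with probability pi and each
# causal effect is drawn from N(0, h2 / (M * pi)), so the total per-variant variance sums
# to h2 in expectation. A quick NumPy check of that bookkeeping (illustration only,
# independent of Hail):
import numpy as np

rng = np.random.default_rng(0)
M, h2, pi = 100_000, 0.5, 0.01
causal = rng.random(M) < pi
beta = causal * rng.normal(0.0, np.sqrt(h2 / (M * pi)), size=M)
# Summed per-variant variance is ~h2 (genotypes are standardized downstream).
print(beta.var() * M)   # close to 0.5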
def calculate_phenotypes(mt, genotype, beta, h2, popstrat=None, popstrat_var=None): """Calculates phenotypes by multiplying genotypes and betas. Parameters ---------- mt : :class:`.MatrixTable` :class:`.MatrixTable` with all relevant fields passed as parameters. genotype : :class:`.Expression` Entry field of genotypes. beta : :class:`.Expression` Row field of SNP effects. h2 : :obj:`float` or :obj:`int` or :obj:`list` SNP-based heritability (:math:`h^2`) of simulated trait. Can only be ``None`` if running annotation-informed model. popstrat : :class:`.Expression`, optional Column field containing population stratification term. popstrat_var : :obj:`float` or :obj:`int` Variance of population stratification term. Returns ------- :class:`.MatrixTable` :class:`.MatrixTable` with simulated phenotype as column field. """ assert popstrat_var is None or (popstrat_var >= 0), 'popstrat_var must be non-negative' tid = ''.join( random.choices(string.ascii_uppercase + string.ascii_lowercase, k=5) ) # "temporary id" -- random string to identify temporary intermediate fields generated by this method mt = annotate_all( mt=mt, row_exprs={'beta_' + tid: beta}, col_exprs={} if popstrat is None else {'popstrat_' + tid: popstrat}, entry_exprs={'gt_' + tid: genotype}) mt = normalize_genotypes(mt['gt_' + tid]) if mt['beta_' + tid].dtype == dtype('array<float64>'): #if >1 traits h2 = h2 if type(h2) is list else [h2] mt = mt.annotate_cols(y_no_noise=hl.agg.array_agg( lambda beta: hl.agg.sum(beta * mt['norm_gt']), mt['beta_' + tid])) mt = mt.annotate_cols( y=mt.y_no_noise + hl.literal(h2).map(lambda x: hl.rand_norm(0, hl.sqrt(1 - x)))) else: mt = mt.annotate_cols(y_no_noise=hl.agg.sum(mt['beta_' + tid] * mt['norm_gt'])) mt = mt.annotate_cols(y=mt.y_no_noise + hl.rand_norm(0, hl.sqrt(1 - h2))) if popstrat is not None: var_factor = 1 if popstrat_var is None else (popstrat_var**( 1 / 2)) / mt.aggregate_cols(hl.agg.stats(mt['popstrat_' + tid])).stdev mt = mt.annotate_cols(y_w_popstrat=mt.y + mt['popstrat_' + tid] * var_factor) mt = _clean_fields(mt, tid) return mt
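# A minimal usage sketch chaining a beta simulator with calculate_phenotypes. It assumes
# the functions in this module (and helpers such as annotate_all / normalize_genotypes)
# are importable together; hl.balding_nichols_model is used only to fabricate a toy
# genotype matrix, and the beta row field name ('__beta' here) depends on which
# make_betas variant produced it.
import hail as hl

mt = hl.balding_nichols_model(n_populations=1, n_samples=500, n_variants=10_000)
sim = make_betas(mt, h2=0.3, pi=1)  # infinitesimal betas in row field '__beta'
sim = calculate_phenotypes(sim, genotype=sim.GT.n_alt_alleles(),
                           beta=sim['__beta'], h2=0.3)
# The simulated phenotype y has genetic variance ~h2 plus 1 - h2 environmental noise,
# so its total variance is close to 1.
print(sim.aggregate_cols(hl.agg.stats(sim.y)))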
def sim_phenotypes(mt, h2, popstrat=None, popstrat_c=None): mt1 = mt.annotate_cols(__y_no_noise = hl.agg.sum(mt.__beta * mt.__norm_gt)) mt2 = mt1.annotate_cols(__y = mt1.__y_no_noise + hl.rand_norm(0,hl.sqrt(1-h2))) if popstrat is not None: return add_pop_strat(mt2, popstrat, popstrat_c) else: return mt2
def calculate_phenotypes(mt, genotype, h2, beta, is_popstrat=False, cov_coef_dict=None, cov_regex=None): '''Calculates phenotypes given betas and genotypes. Adding population stratification is optional''' check_mt_sources(mt,genotype,beta) check_popstrat_args(is_popstrat=is_popstrat,cov_coef_dict=cov_coef_dict,cov_regex=cov_regex) mt1 = mt._annotate_all(row_exprs={'__beta':beta}, entry_exprs={'__gt':genotype}, global_exprs={'__is_popstrat':is_popstrat, '__cov_coef_dict':none_to_null(cov_coef_dict), '__cov_regex':none_to_null(cov_regex)}) mt2 = normalize_genotypes(mt1.__gt) print('\rCalculating phenotypes{}...'.format(' w/ population stratification' if is_popstrat else '').ljust(81)) mt3 = mt2.annotate_cols(__y_no_noise = hl.agg.sum(mt2.__beta * mt2.__norm_gt)) if h2 is None: h2 = mt3.aggregate_cols(hl.agg.stats(mt3.__y_no_noise)).stdev**2 if h2 > 1: print(f'WARNING: Total SNP-based h2 = {h2} (>1)') print('Not adding environmental noise') h2=1 mt4 = mt3.annotate_cols(__y = mt3.__y_no_noise + hl.rand_norm(0,hl.sqrt(1-h2))) if is_popstrat: return add_popstrat(mt4, y=mt4.__y, cov_coef_dict=cov_coef_dict, cov_regex=cov_regex) else: return mt4
def pc_project( mt: hl.MatrixTable, loadings_ht: hl.Table, loading_location: str = "loadings", af_location: str = "pca_af", ) -> hl.Table: """ Project samples in `mt` on pre-computed PCs. :param mt: MT containing the samples to project :param loadings_ht: HT containing the PCA loadings and allele frequencies used for the PCA :param loading_location: Location of expression for loadings in `loadings_ht` :param af_location: Location of expression for allele frequency in `loadings_ht` :return: Table with scores calculated from loadings in column `scores` """ n_variants = loadings_ht.count() mt = mt.annotate_rows( pca_loadings=loadings_ht[mt.row_key][loading_location], pca_af=loadings_ht[mt.row_key][af_location], ) mt = mt.filter_rows( hl.is_defined(mt.pca_loadings) & hl.is_defined(mt.pca_af) & (mt.pca_af > 0) & (mt.pca_af < 1)) gt_norm = (mt.GT.n_alt_alleles() - 2 * mt.pca_af) / hl.sqrt( n_variants * 2 * mt.pca_af * (1 - mt.pca_af)) mt = mt.annotate_cols(scores=hl.agg.array_sum(mt.pca_loadings * gt_norm)) return mt.cols().select("scores")
def pc_hwe_gt( mt: hl.MatrixTable, loadings_ht: hl.Table, loading_location: str = "loadings", af_location: str = "pca_af", ) -> hl.MatrixTable: n_variants = loadings_ht.count() mt = mt.annotate_rows( pca_loadings=loadings_ht[mt.row_key][loading_location], pca_af=loadings_ht[mt.row_key][af_location], ) mt = mt.filter_rows( hl.is_defined(mt.pca_loadings) & hl.is_defined(mt.pca_af) & (mt.pca_af > 0) & (mt.pca_af < 1) ) # Attach normalized entries to be used in projection mt = mt.annotate_entries( GTN=(mt.GT.n_alt_alleles() - 2 * mt.pca_af) / hl.sqrt(n_variants * 2 * mt.pca_af * (1 - mt.pca_af)) ) return mt
def sim_phenotypes(mt, genotype, h2, beta, popstrat=None, popstrat_s2=1): '''Simulate phenotypes given betas and genotypes. Adding population stratification is optional''' print('\rCalculating phenotypes{}...'.format( '' if popstrat is None else ' w/ population stratification').ljust(81)) if popstrat is None: mt1 = mt._annotate_all(row_exprs={'__beta': beta}, entry_exprs={'__gt': genotype}) else: mt1 = mt._annotate_all(row_exprs={'__beta': beta}, col_exprs={'__popstrat': popstrat}, entry_exprs={'__gt': genotype}, global_exprs={'__popstrat_s2': popstrat_s2}) mt2 = normalize_genotypes(mt1, mt1.__gt) mt3 = mt2.annotate_cols(__y_no_noise=hl.agg.sum(mt2.__beta * mt2.__norm_gt)) mt4 = mt3.annotate_cols(__y=mt3.__y_no_noise + hl.rand_norm(0, hl.sqrt(1 - h2))) if popstrat is None: return mt4 else: return add_pop_strat(mt4, y=mt4.__y, popstrat=mt4.__popstrat, popstrat_s2=hl.eval(mt4.__popstrat_s2))
def make_betas(mt, h2, pi=1, annot=None):
    '''Simulate betas. Options: Infinitesimal model, spike & slab, annotation-informed'''
    M = mt.count_rows()
    if annot is not None:
        print('\rSimulating annotation-informed betas w/ h2 = {}'.format(h2))
        mt1 = mt._annotate_all(row_exprs={'__annot': annot},
                               global_exprs={'__h2': h2})
        annot_sum = mt1.aggregate_rows(hl.agg.sum(mt1.__annot))
        return mt1.annotate_rows(
            __beta=hl.rand_norm(0, hl.sqrt(mt1.__annot / annot_sum * h2)))
    else:
        print('Simulating betas using {} model w/ h2 = {}'.format(
            'infinitesimal' if pi == 1 else 'spike & slab', h2))
        mt1 = mt.annotate_globals(__h2=h2, __pi=pi)
        return mt1.annotate_rows(__beta=hl.rand_bool(pi) * hl.rand_norm(0, hl.sqrt(h2 / (M * pi))))
def hwe_normalize(call_expr): mt = matrix_table_source('hwe_normalize/call_expr', call_expr) mt = mt.select_entries(__gt=call_expr.n_alt_alleles()) mt = mt.annotate_rows(__AC=agg.sum(mt.__gt), __n_called=agg.count_where(hl.is_defined(mt.__gt))) mt = mt.filter_rows((mt.__AC > 0) & (mt.__AC < 2 * mt.__n_called)) n_variants = mt.count_rows() if n_variants == 0: raise FatalError( "hwe_normalize: found 0 variants after filtering out monomorphic sites." ) info( f"hwe_normalize: found {n_variants} variants after filtering out monomorphic sites." ) mt = mt.annotate_rows(__mean_gt=mt.__AC / mt.__n_called) mt = mt.annotate_rows(__hwe_scaled_std_dev=hl.sqrt(mt.__mean_gt * (2 - mt.__mean_gt) * n_variants / 2)) mt = mt.unfilter_entries() normalized_gt = hl.or_else( (mt.__gt - mt.__mean_gt) / mt.__hwe_scaled_std_dev, 0.0) return normalized_gt
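# hwe_normalize maps each genotype g at a variant with mean genotype 2p to
# (g - 2p) / sqrt(2p(1 - p) * M), since __mean_gt * (2 - __mean_gt) / 2 = 2p(1 - p).
# Under that scaling each sample's normalized genotype vector has squared norm ~1.
# A small NumPy check of the identity (illustration only, independent of Hail):
import numpy as np

rng = np.random.default_rng(0)
n_samples = 100
p = rng.uniform(0.05, 0.95, size=2000)                   # allele frequencies
G = rng.binomial(2, p[:, None], size=(p.size, n_samples)).astype(float)

# Drop monomorphic variants, as hwe_normalize does.
ac = G.sum(axis=1)
G = G[(ac > 0) & (ac < 2 * n_samples)]
M = G.shape[0]

mean_gt = G.mean(axis=1)                                 # ~2p per variant
denom = np.sqrt(mean_gt * (2 - mean_gt) * M / 2)         # sqrt(2p(1-p)M)
X = (G - mean_gt[:, None]) / denom[:, None]

# Each sample (column) has squared norm close to 1.
print((X ** 2).sum(axis=0).mean())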
def make_betas(mt, h2, pi=1, annot=None): M = mt.count_rows() if annot is not None: annot_stats = mt.aggregate_rows(hl.agg.stats(mt.__annot), _localize=True) return mt.annotate_rows(__beta = hl.rand_norm(0, (mt.__annot - annot_stats.mean) / annot_stats.stdev * hl.sqrt(h2 / M))) else: return mt.annotate_rows(__beta = hl.rand_bool(pi)*hl.rand_norm(0,hl.sqrt(h2/(M*pi))))
def blockmatrix_irs(self): scalar_ir = ir.F64(2) vector_ir = ir.MakeArray([ir.F64(3), ir.F64(2)], hl.tarray(hl.tfloat64)) read = ir.BlockMatrixRead( ir.BlockMatrixNativeReader(resource('blockmatrix_example/0'))) add_two_bms = ir.BlockMatrixMap2( read, read, 'l', 'r', ir.ApplyBinaryPrimOp('+', ir.Ref('l'), ir.Ref('r')), "Union") negate_bm = ir.BlockMatrixMap( read, 'element', ir.ApplyUnaryPrimOp('-', ir.Ref('element')), False) sqrt_bm = ir.BlockMatrixMap( read, 'element', hl.sqrt(construct_expr(ir.Ref('element'), hl.tfloat64))._ir, False) persisted = ir.BlockMatrixRead(ir.BlockMatrixPersistReader('x', read)) scalar_to_bm = ir.ValueToBlockMatrix(scalar_ir, [1, 1], 1) col_vector_to_bm = ir.ValueToBlockMatrix(vector_ir, [2, 1], 1) row_vector_to_bm = ir.ValueToBlockMatrix(vector_ir, [1, 2], 1) broadcast_scalar = ir.BlockMatrixBroadcast(scalar_to_bm, [], [2, 2], 256) broadcast_col = ir.BlockMatrixBroadcast(col_vector_to_bm, [0], [2, 2], 256) broadcast_row = ir.BlockMatrixBroadcast(row_vector_to_bm, [1], [2, 2], 256) transpose = ir.BlockMatrixBroadcast(broadcast_scalar, [1, 0], [2, 2], 256) matmul = ir.BlockMatrixDot(broadcast_scalar, transpose) rectangle = ir.Literal(hl.tarray(hl.tint64), [0, 1, 5, 6]) band = ir.Literal(hl.ttuple(hl.tint64, hl.tint64), (-1, 1)) intervals = ir.Literal( hl.ttuple(hl.tarray(hl.tint64), hl.tarray(hl.tint64)), ([0, 1, 5, 6], [5, 6, 8, 9])) sparsify1 = ir.BlockMatrixSparsify(read, rectangle, ir.RectangleSparsifier) sparsify2 = ir.BlockMatrixSparsify(read, band, ir.BandSparsifier(True)) sparsify3 = ir.BlockMatrixSparsify(read, intervals, ir.RowIntervalSparsifier(True)) densify = ir.BlockMatrixDensify(read) pow_ir = (construct_expr(ir.Ref('l'), hl.tfloat64)**construct_expr( ir.Ref('r'), hl.tfloat64))._ir squared_bm = ir.BlockMatrixMap2(scalar_to_bm, scalar_to_bm, 'l', 'r', pow_ir, "NeedsDense") slice_bm = ir.BlockMatrixSlice( matmul, [slice(0, 2, 1), slice(0, 1, 1)]) return [ read, persisted, add_two_bms, negate_bm, sqrt_bm, scalar_to_bm, col_vector_to_bm, row_vector_to_bm, broadcast_scalar, broadcast_col, broadcast_row, squared_bm, transpose, sparsify1, sparsify2, sparsify3, densify, matmul, slice_bm ]
def pc_project(call_expr, loadings_expr, af_expr): """Projects genotypes onto pre-computed PCs. Requires loadings and allele-frequency from a reference dataset (see example). Note that `loadings_expr` must have no missing data and reflect the rows from the original PCA run for this method to be accurate. Example ------- >>> # Compute loadings and allele frequency for reference dataset >>> _, _, loadings_ht = hl.hwe_normalized_pca(mt.GT, k=10, compute_loadings=True) # doctest: +SKIP >>> mt = mt.annotate_rows(af=hl.agg.mean(mt.GT.n_alt_alleles()) / 2) # doctest: +SKIP >>> loadings_ht = loadings_ht.annotate(af=mt.rows()[loadings_ht.key].af) # doctest: +SKIP >>> # Project new genotypes onto loadings >>> ht = pc_project(mt_to_project.GT, loadings_ht.loadings, loadings_ht.af) # doctest: +SKIP Parameters ---------- call_expr : :class:`.CallExpression` Entry-indexed call expression for genotypes to project onto loadings. loadings_expr : :class:`.ArrayNumericExpression` Location of expression for loadings af_expr : :class:`.Float64Expression` Location of expression for allele frequency Returns ------- :class:`.Table` Table with scores calculated from loadings in column `scores` """ check_entry_indexed('pc_project', call_expr) check_row_indexed('pc_project', loadings_expr) check_row_indexed('pc_project', af_expr) gt_source = call_expr._indices.source loadings_source = loadings_expr._indices.source af_source = af_expr._indices.source loadings_expr = _get_expr_or_join(loadings_expr, loadings_source, gt_source, '_loadings') af_expr = _get_expr_or_join(af_expr, af_source, gt_source, '_af') mt = gt_source._annotate_all(row_exprs={'_loadings': loadings_expr, '_af': af_expr}, entry_exprs={'_call': call_expr}) if isinstance(loadings_source, hl.MatrixTable): n_variants = loadings_source.count_rows() else: n_variants = loadings_source.count() mt = mt.filter_rows(hl.is_defined(mt._loadings) & hl.is_defined(mt._af) & (mt._af > 0) & (mt._af < 1)) gt_norm = (mt._call.n_alt_alleles() - 2 * mt._af) / hl.sqrt(n_variants * 2 * mt._af * (1 - mt._af)) return mt.select_cols(scores=hl.agg.array_sum(mt._loadings * gt_norm)).cols()
def make_random_function(self, mt): from functools import reduce #check that row key of annotations matches row key of mt mt = mt.add_row_index() rows = [rf for rf in self.a_ht.row] self.a_ht = self.a_ht.annotate(__a__=reduce( self.f, map(lambda x: self.a_ht[rows[x]], range(len(rows))))) std = self.a_ht.aggregate(hl.agg.stats(self.a_ht.__a__)).stdev self.a_ht = self.a_ht.annotate(__a__=self.a_ht.__a__ * hl.sqrt(self.h2 / std)) return mt.annotate_rows(beta=hl.literal( self.a_ht.__a__.take(mt.count_rows()))[hl.int32(mt.row_idx)])
def spectral_moments(self, num_moments, R): eigval_powers = hl.nd.vstack([ self.S.map(lambda x: x**(2 * i)) for i in range(1, num_moments + 1) ]) moments = eigval_powers @ ( self.V1t[:, :self.k] @ R).map(lambda x: x**2) means = moments.sum(1) / self.k variances = (moments - means.reshape( -1, 1)).map(lambda x: x**2).sum(1) / (self.k - 1) stdevs = variances.map(lambda x: hl.sqrt(x)) return hl.struct(moments=means, stdevs=stdevs)
def test(self): schema = hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tint32, d=hl.tint32, e=hl.tstr, f=hl.tarray(hl.tint32), g=hl.tarray( hl.tstruct(x=hl.tint32, y=hl.tint32, z=hl.tstr)), h=hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tstr), i=hl.tbool, j=hl.tstruct(x=hl.tint32, y=hl.tint32, z=hl.tstr)) rows = [{'a': 4, 'b': 1, 'c': 3, 'd': 5, 'e': "hello", 'f': [1, 2, 3], 'g': [hl.Struct(x=1, y=5, z='banana')], 'h': hl.Struct(a=5, b=3, c='winter'), 'i': True, 'j': hl.Struct(x=3, y=2, z='summer')}] kt = hl.Table.parallelize(rows, schema) result = convert_struct_to_dict(kt.annotate( chisq=hl.chisq(kt.a, kt.b, kt.c, kt.d), ctt=hl.ctt(kt.a, kt.b, kt.c, kt.d, 5), dict=hl.dict(hl.zip([kt.a, kt.b], [kt.c, kt.d])), dpois=hl.dpois(4, kt.a), drop=kt.h.drop('b', 'c'), exp=hl.exp(kt.c), fet=hl.fisher_exact_test(kt.a, kt.b, kt.c, kt.d), hwe=hl.hardy_weinberg_p(1, 2, 1), index=hl.index(kt.g, 'z'), is_defined=hl.is_defined(kt.i), is_missing=hl.is_missing(kt.i), is_nan=hl.is_nan(hl.float64(kt.a)), json=hl.json(kt.g), log=hl.log(kt.a, kt.b), log10=hl.log10(kt.c), or_else=hl.or_else(kt.a, 5), or_missing=hl.or_missing(kt.i, kt.j), pchisqtail=hl.pchisqtail(kt.a, kt.b), pcoin=hl.rand_bool(0.5), pnorm=hl.pnorm(0.2), pow=2.0 ** kt.b, ppois=hl.ppois(kt.a, kt.b), qchisqtail=hl.qchisqtail(kt.a, kt.b), range=hl.range(0, 5, kt.b), rnorm=hl.rand_norm(0.0, kt.b), rpois=hl.rand_pois(kt.a), runif=hl.rand_unif(kt.b, kt.a), select=kt.h.select('c', 'b'), sqrt=hl.sqrt(kt.a), to_str=[hl.str(5), hl.str(kt.a), hl.str(kt.g)], where=hl.cond(kt.i, 5, 10) ).take(1)[0])
def pc_project( # reference: https://github.com/macarthur-lab/gnomad_hail/blob/master/utils/generic.py#L131 mt: hl.MatrixTable, loadings_ht: hl.Table, loading_location: str = "loadings", af_location: str = "pca_af") -> hl.Table: n_variants = loadings_ht.count() mt = mt.annotate_rows( pca_loadings=loadings_ht[mt.row_key][loading_location], pca_af=loadings_ht[mt.row_key][af_location]) mt = mt.filter_rows( hl.is_defined(mt.pca_loadings) & hl.is_defined(mt.pca_af) & (mt.pca_af > 0) & (mt.pca_af < 1)) gt_norm = (mt.GT.n_alt_alleles() - 2 * mt.pca_af) / hl.sqrt( n_variants * 2 * mt.pca_af * (1 - mt.pca_af)) mt = mt.annotate_cols(scores=hl.agg.array_sum(mt.pca_loadings * gt_norm)) return mt.cols().select('scores')
def sim_corr_phenotypes(mt, cov_array): h2_ls = np.diag(cov_array) n_phens = len(h2_ls) for i in range(n_phens): mt = mt._annotate_all( col_exprs={ f'__y_no_noise_{i}': hl.agg.sum(mt[f'__beta_{i}'] * mt.__norm_gt) }) for i in range(n_phens): mt = mt._annotate_all( col_exprs={ f'__y_{i}': mt[f'__y_no_noise_{i}'] + hl.rand_norm(0, hl.sqrt(1 - h2_ls[i])) }) return mt
def metaanalyze_gwas(subsets, gwas_ht_list, sim_name, param_suffix, wd): if len(gwas_ht_list) == 1: # if list is single GWAS, don't meta-analyze return gwas_ht_list[0] sample_ct_dict = {} for subset_idx, tmp_gwas_ht in enumerate(gwas_ht_list, 1): sample_ct = subsets.filter(subsets.subset_idx == subset_idx).count() sample_ct_dict[subset_idx] = sample_ct print( f'\n\nmeta-analysis sample count subset {subset_idx}: {sample_ct}\n\n' ) comb_gwas_ht = gwas_ht_list[0].annotate(subset_idx=1, n=sample_ct_dict[1]) union_args = [ ht.annotate(subset_idx=subset_idx, n=sample_ct_dict[subset_idx]) for subset_idx, ht in enumerate(gwas_ht_list[1:], 2) ] # list of gwas_ht's to join comb_gwas_ht = comb_gwas_ht.union(*union_args) comb_gwas_ht = comb_gwas_ht.annotate(w=1 / (comb_gwas_ht['standard_error']**2)) agg_expr = { 'meta_se': hl.sqrt(1 / (hl.agg.sum(comb_gwas_ht.w))), 'meta_beta': hl.agg.sum(comb_gwas_ht['beta'] * comb_gwas_ht.w) / hl.agg.sum(comb_gwas_ht.w), 'meta_EAF': hl.agg.sum(comb_gwas_ht['EAF'] * comb_gwas_ht['n']) / hl.agg.sum(comb_gwas_ht['n']) } comb_gwas_ht = comb_gwas_ht.group_by('locus', 'alleles').aggregate(**agg_expr) comb_gwas_ht = comb_gwas_ht.annotate( meta_pval=2 * hl.pnorm(-hl.abs(comb_gwas_ht.meta_beta / comb_gwas_ht.meta_se))) meta_gwas_path = f'{wd}/gwas.logreg.{sim_name}.{param_suffix}.tsv.gz' comb_gwas_ht.export(meta_gwas_path)
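# The meta-analysis above is standard inverse-variance weighting: w_i = 1/se_i^2,
# beta_meta = sum(w_i * beta_i) / sum(w_i), se_meta = sqrt(1 / sum(w_i)), with a
# two-sided normal p-value 2 * pnorm(-|beta_meta / se_meta|). A small NumPy
# illustration of the same arithmetic on hypothetical per-subset estimates:
import math
import numpy as np

beta = np.array([0.12, 0.08, 0.15])   # hypothetical per-subset effect estimates
se = np.array([0.05, 0.04, 0.06])     # hypothetical per-subset standard errors

w = 1.0 / se ** 2
meta_beta = np.sum(w * beta) / np.sum(w)
meta_se = math.sqrt(1.0 / np.sum(w))
z = meta_beta / meta_se
meta_p = math.erfc(abs(z) / math.sqrt(2))   # equals 2 * pnorm(-|z|)
print(meta_beta, meta_se, meta_p)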
def calculate_phenotypes(mt, genotype, h2, beta, is_popstrat=False, cov_coef_dict=None, cov_regex=None, normalize_gt=True): '''Simulate phenotypes given betas and genotypes. Adding population stratification is optional''' check_mt_sources(mt, genotype, beta) check_popstrat_args(is_popstrat=is_popstrat, cov_coef_dict=cov_coef_dict, cov_regex=cov_regex) mt1 = mt._annotate_all(row_exprs={'__beta': beta}, entry_exprs={'__gt': genotype}, global_exprs={ '__is_popstrat': is_popstrat, '__cov_coef_dict': none_to_null(cov_coef_dict), '__cov_regex': none_to_null(cov_regex) }) if normalize_gt: mt2 = normalize_genotypes(mt1.__gt) else: mt2 = mt1.annotate_entries(__norm_gt=mt1.__gt) print('\rCalculating phenotypes{}...'.format( ' w/ population stratification' if is_popstrat else '').ljust(81)) mt3 = mt2.annotate_cols(__y_no_noise=hl.agg.sum(mt2.__beta * mt2.__norm_gt)) if h2 is None: h2 = mt3.aggregate_cols(hl.agg.stats(mt3.__y_no_noise)).stdev**2 if h2 > 1: print(f'WARNING: Total SNP-based h2 = {h2} (>1)') print('Not adding environmental noise') h2 = 1 mt4 = mt3.annotate_cols(__y=mt3.__y_no_noise + hl.rand_norm(0, hl.sqrt(1 - h2))) if is_popstrat: return add_popstrat(mt4, y=mt4.__y, cov_coef_dict=cov_coef_dict, cov_regex=cov_regex) else: return mt4
def blockmatrix_irs(self): scalar_ir = ir.F64(2) vector_ir = ir.MakeArray([ir.F64(3), ir.F64(2)], hl.tarray(hl.tfloat64)) read = ir.BlockMatrixRead( ir.BlockMatrixNativeReader(resource('blockmatrix_example/0'))) add_two_bms = ir.BlockMatrixMap2( read, read, 'l', 'r', ir.ApplyBinaryPrimOp('+', ir.Ref('l'), ir.Ref('r'))) negate_bm = ir.BlockMatrixMap( read, 'element', ir.ApplyUnaryPrimOp('-', ir.Ref('element'))) sqrt_bm = ir.BlockMatrixMap( read, 'element', hl.sqrt(construct_expr(ir.Ref('element'), hl.tfloat64))._ir) scalar_to_bm = ir.ValueToBlockMatrix(scalar_ir, [1, 1], 1) col_vector_to_bm = ir.ValueToBlockMatrix(vector_ir, [2, 1], 1) row_vector_to_bm = ir.ValueToBlockMatrix(vector_ir, [1, 2], 1) broadcast_scalar = ir.BlockMatrixBroadcast(scalar_to_bm, [], [2, 2], 256) broadcast_col = ir.BlockMatrixBroadcast(col_vector_to_bm, [0], [2, 2], 256) broadcast_row = ir.BlockMatrixBroadcast(row_vector_to_bm, [1], [2, 2], 256) transpose = ir.BlockMatrixBroadcast(broadcast_scalar, [1, 0], [2, 2], 256) matmul = ir.BlockMatrixDot(broadcast_scalar, transpose) pow_ir = (construct_expr(ir.Ref('l'), hl.tfloat64)**construct_expr( ir.Ref('r'), hl.tfloat64))._ir squared_bm = ir.BlockMatrixMap2(scalar_to_bm, scalar_to_bm, 'l', 'r', pow_ir) slice_bm = ir.BlockMatrixSlice( matmul, [slice(0, 2, 1), slice(0, 1, 1)]) return [ read, add_two_bms, negate_bm, sqrt_bm, scalar_to_bm, col_vector_to_bm, row_vector_to_bm, broadcast_scalar, broadcast_col, broadcast_row, squared_bm, transpose, matmul, slice_bm ]
def _make_tsm_from_call(call_expr, block_size, mean_center=False, hwe_normalize=False): mt = matrix_table_source('_make_tsm/entry_expr', call_expr) mt = mt.select_entries(__gt=call_expr.n_alt_alleles()) if mean_center or hwe_normalize: mt = mt.annotate_rows(__AC=agg.sum(mt.__gt), __n_called=agg.count_where(hl.is_defined( mt.__gt))) mt = mt.filter_rows((mt.__AC > 0) & (mt.__AC < 2 * mt.__n_called)) n_variants = mt.count_rows() if n_variants == 0: raise FatalError( "_make_tsm: found 0 variants after filtering out monomorphic sites." ) info( f"_make_tsm: found {n_variants} variants after filtering out monomorphic sites." ) mt = mt.annotate_rows(__mean_gt=mt.__AC / mt.__n_called) mt = mt.unfilter_entries() mt = mt.select_entries(__x=hl.or_else(mt.__gt - mt.__mean_gt, 0.0)) if hwe_normalize: mt = mt.annotate_rows( __hwe_scaled_std_dev=hl.sqrt(mt.__mean_gt * (2 - mt.__mean_gt) * n_variants / 2)) mt = mt.select_entries(__x=mt.__x / mt.__hwe_scaled_std_dev) else: mt = mt.select_entries(__x=mt.__gt) A, ht = mt_to_table_of_ndarray(mt.__x, block_size, return_checkpointed_table_also=True) A = A.persist() return TallSkinnyMatrix(A, A.ndarray, ht, list(mt.col_key))
def pc_project(mt: hl.MatrixTable,
               pc_loadings: hl.Table,
               loading_location: str = "loadings",
               af_location: str = "pca_af") -> hl.MatrixTable:
    """
    Projects samples in `mt` on pre-computed PCs.

    :param MatrixTable mt: MT containing the samples to project
    :param Table pc_loadings: Table containing the PC loadings for the variants
    :param str loading_location: Location of expression for loadings in `pc_loadings`
    :param str af_location: Location of expression for allele frequency in `pc_loadings`
    :return: MT with scores calculated from loadings
    """
    n_variants = mt.count_rows()
    mt = mt.annotate_rows(**pc_loadings[mt.locus, mt.alleles])
    mt = mt.filter_rows(
        hl.is_defined(mt[loading_location])
        & hl.is_defined(mt[af_location])
        & (mt[af_location] > 0)
        & (mt[af_location] < 1))
    gt_norm = (mt.GT.n_alt_alleles() - 2 * mt[af_location]) / hl.sqrt(
        n_variants * 2 * mt[af_location] * (1 - mt[af_location]))
    return mt.annotate_cols(pca_scores=hl.agg.array_sum(mt[loading_location] * gt_norm))
def blockmatrix_irs(self): scalar_ir = ir.F64(2) vector_ir = ir.MakeArray([ir.F64(3), ir.F64(2)], hl.tarray(hl.tfloat64)) read = ir.BlockMatrixRead(ir.BlockMatrixNativeReader(resource('blockmatrix_example/0'))) add_two_bms = ir.BlockMatrixMap2(read, read, ir.ApplyBinaryPrimOp('+', ir.Ref('l'), ir.Ref('r'))) negate_bm = ir.BlockMatrixMap(read, ir.ApplyUnaryPrimOp('-', ir.Ref('element'))) sqrt_bm = ir.BlockMatrixMap(read, hl.sqrt(construct_expr(ir.Ref('element'), hl.tfloat64))._ir) scalar_to_bm = ir.ValueToBlockMatrix(scalar_ir, [1, 1], 1) col_vector_to_bm = ir.ValueToBlockMatrix(vector_ir, [2, 1], 1) row_vector_to_bm = ir.ValueToBlockMatrix(vector_ir, [1, 2], 1) broadcast_scalar = ir.BlockMatrixBroadcast(scalar_to_bm, [], [2, 2], 256) broadcast_col = ir.BlockMatrixBroadcast(col_vector_to_bm, [0], [2, 2], 256) broadcast_row = ir.BlockMatrixBroadcast(row_vector_to_bm, [1], [2, 2], 256) transpose = ir.BlockMatrixBroadcast(broadcast_scalar, [1, 0], [2, 2], 256) matmul = ir.BlockMatrixDot(broadcast_scalar, transpose) pow_ir = (construct_expr(ir.Ref('l'), hl.tfloat64) ** construct_expr(ir.Ref('r'), hl.tfloat64))._ir squared_bm = ir.BlockMatrixMap2(scalar_to_bm, scalar_to_bm, pow_ir) return [ read, add_two_bms, negate_bm, sqrt_bm, scalar_to_bm, col_vector_to_bm, row_vector_to_bm, broadcast_scalar, broadcast_col, broadcast_row, squared_bm, transpose, matmul ]
def pc_project(mt, loadings_ht, loading_location="loadings", af_location="pca_af"): """ Projects samples in `mt` on pre-computed PCs. :param MatrixTable mt: MT containing the samples to project into previously calculated PCs :param Table loadings_ht: HT containing the PCA loadings and allele frequencies used for the PCA :param str loading_location: Location of expression for loadings in `loadings_ht` :param str af_location: Location of expression for allele frequency in `loadings_ht` :return: Hail Table with scores calculated from loadings in column `scores` :rtype: Table From Konrad Karczewski """ n_variants = loadings_ht.count() # Annotate matrix table with pca loadings and af from other dataset which pcs were calculated from mt = mt.annotate_rows( pca_loadings=loadings_ht[mt.row_key][loading_location], pca_af=loadings_ht[mt.row_key][af_location]) # Filter to rows where pca_loadings and af are defined, and af > 0 and < 1 mt = mt.filter_rows( hl.is_defined(mt.pca_loadings) & hl.is_defined(mt.pca_af) & (mt.pca_af > 0) & (mt.pca_af < 1)) # Calculate genotype normalization constant # Basically, mean centers and normalizes the genotypes under the binomial distribution so that they can be # multiplied by the PC loadings to get the projected principal components gt_norm = (mt.GT.n_alt_alleles() - 2 * mt.pca_af) / hl.sqrt( n_variants * 2 * mt.pca_af * (1 - mt.pca_af)) mt = mt.annotate_cols(scores=hl.agg.array_sum(mt.pca_loadings * gt_norm)) return mt.cols().select('scores')
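# The projection step is a matrix product between the normalized genotypes and the
# reference-panel loadings: score_k(sample) = sum_j loading_{j,k} * (g_j - 2p_j) /
# sqrt(M * 2p_j(1 - p_j)). A compact NumPy version of the same computation
# (illustration only, with made-up loadings and frequencies):
import numpy as np

rng = np.random.default_rng(2)
M, N, K = 1000, 20, 10
p = rng.uniform(0.05, 0.95, size=M)            # reference-panel allele frequencies
loadings = rng.normal(size=(M, K))             # PCA loadings from the reference panel
G = rng.binomial(2, p[:, None], size=(M, N)).astype(float)

# Same normalization as gt_norm above, then sum over variants per sample.
X = (G - 2 * p[:, None]) / np.sqrt(M * 2 * p * (1 - p))[:, None]
scores = X.T @ loadings                        # samples x PCs
print(scores.shape)                            # (20, 10)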
def merge_stats_counters_expr(
    stats: hl.expr.ArrayExpression,
) -> hl.expr.StructExpression:
    """
    Merges multiple stats counters, assuming that they were computed on non-overlapping data.

    Examples:

    - Merge stats computed on indel and snv separately
    - Merge stats computed on bi-allelic and multi-allelic variants separately
    - Merge stats computed on autosomes and sex chromosomes separately

    :param stats: An array of stats counters to merge
    :return: Merged stats Struct
    """

    def add_stats(
        i: hl.expr.StructExpression, j: hl.expr.StructExpression
    ) -> hl.expr.StructExpression:
        """
        This merges two stats counters together. It assumes that all stats counter fields are present in the struct.

        :param i: accumulator: struct with mean, n and variance
        :param j: new element: stats_struct -- needs to contain mean, n and variance
        :return: Accumulation over all elements: struct with mean, n and variance
        """
        delta = j.mean - i.mean
        n_tot = i.n + j.n
        return hl.struct(
            min=hl.min(i.min, j.min),
            max=hl.max(i.max, j.max),
            mean=(i.mean * i.n + j.mean * j.n) / n_tot,
            variance=i.variance + j.variance + (delta * delta * i.n * j.n) / n_tot,
            n=n_tot,
            sum=i.sum + j.sum,
        )

    # Gather all metrics present in all stats counters
    metrics = set(stats[0])
    dropped_metrics = set()
    for stat_expr in stats[1:]:
        stat_expr_metrics = set(stat_expr)
        dropped_metrics = dropped_metrics.union(stat_expr_metrics.difference(metrics))
        metrics = metrics.intersection(stat_expr_metrics)
    if dropped_metrics:
        logger.warning(
            f"The following metrics will be dropped during stats counter merging as they do not appear in all counters: {', '.join(dropped_metrics)}"
        )

    # Because merging standard deviation requires having the mean and n,
    # check that they are also present if `stdev` is. Otherwise remove stdev
    if "stdev" in metrics:
        missing_fields = [x for x in ["n", "mean"] if x not in metrics]
        if missing_fields:
            logger.warning(
                f'Cannot merge `stdev` from given stats counters since they are missing the following fields: {",".join(missing_fields)}'
            )
            metrics.remove("stdev")

    # Create a struct with all possible stats for merging.
    # This step helps when folding because we can rely on the struct schema
    # Note that for intermediate merging, we compute the variance rather than the stdev
    all_stats = hl.array(stats).map(
        lambda x: hl.struct(
            min=x.min if "min" in metrics else hl.null(hl.tfloat64),
            max=x.max if "max" in metrics else hl.null(hl.tfloat64),
            mean=x.mean if "mean" in metrics else hl.null(hl.tfloat64),
            variance=x.stdev * x.stdev if "stdev" in metrics else hl.null(hl.tfloat64),
            n=x.n if "n" in metrics else hl.null(hl.tfloat64),
            sum=x.sum if "sum" in metrics else hl.null(hl.tfloat64),
        )
    )

    # Merge the stats
    agg_stats = all_stats[1:].fold(add_stats, all_stats[0])

    # Return only the metrics that were present in all independent stats counters
    # If `stdev` is present, then compute it from the variance
    return agg_stats.select(
        **{
            metric: agg_stats[metric]
            if metric != "stdev"
            else hl.sqrt(agg_stats.variance)
            for metric in metrics
        }
    )
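# The pairwise update in add_stats follows the textbook merge of two disjoint groups
# (Chan et al.): the combined mean is the n-weighted mean, and the cross term
# delta^2 * n_i * n_j / n_tot is exact when the accumulated quantity is the sum of
# squared deviations (M2). A small plain-Python check of that identity
# (illustration only, independent of Hail):
import numpy as np

a = np.array([1.0, 2.0, 4.0, 7.0])
b = np.array([3.0, 3.5, 8.0])

def m2(x):
    return ((x - x.mean()) ** 2).sum()   # sum of squared deviations

n_tot = a.size + b.size
delta = b.mean() - a.mean()
m2_merged = m2(a) + m2(b) + delta * delta * a.size * b.size / n_tot
mean_merged = (a.mean() * a.size + b.mean() * b.size) / n_tot

assert np.isclose(m2_merged, m2(np.concatenate([a, b])))
assert np.isclose(mean_merged, np.concatenate([a, b]).mean())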
def linreg(y, x, nested_dim=1, weight=None) -> StructExpression: """Compute multivariate linear regression statistics. Examples -------- Regress HT against an intercept (1), SEX, and C1: >>> table1.aggregate(agg.linreg(table1.HT, [1, table1.SEX == 'F', table1.C1])) Struct(beta=[88.50000000000014, 81.50000000000057, -10.000000000000068], standard_error=[14.430869689661844, 59.70552738231206, 7.000000000000016], t_stat=[6.132686518775844, 1.365032746099571, -1.428571428571435], p_value=[0.10290201427537926, 0.40250974549499974, 0.3888002244284281], multiple_standard_error=4.949747468305833, multiple_r_squared=0.7175792507204611, adjusted_r_squared=0.1527377521613834, f_stat=1.2704081632653061, multiple_p_value=0.5314327326007864, n=4) Regress blood pressure against an intercept (1), genotype, age, and the interaction of genotype and age: >>> ds_ann = ds.annotate_rows(linreg = ... hl.agg.linreg(ds.pheno.blood_pressure, ... [1, ... ds.GT.n_alt_alleles(), ... ds.pheno.age, ... ds.GT.n_alt_alleles() * ds.pheno.age])) Warning ------- As in the example, the intercept covariate ``1`` must be included **explicitly** if desired. Notes ----- In relation to `lm.summary <https://stat.ethz.ch/R-manual/R-devel/library/stats/html/summary.lm.html>`__ in R, ``linreg(y, x = [1, mt.x1, mt.x2])`` computes ``summary(lm(y ~ x1 + x2))`` and ``linreg(y, x = [mt.x1, mt.x2], nested_dim=0)`` computes ``summary(lm(y ~ x1 + x2 - 1))``. More generally, `nested_dim` defines the number of effects to fit in the nested (null) model, with the effects on the remaining covariates fixed to zero. The returned struct has ten fields: - `beta` (:class:`.tarray` of :py:data:`.tfloat64`): Estimated regression coefficient for each covariate. - `standard_error` (:class:`.tarray` of :py:data:`.tfloat64`): Estimated standard error for each covariate. - `t_stat` (:class:`.tarray` of :py:data:`.tfloat64`): t-statistic for each covariate. - `p_value` (:class:`.tarray` of :py:data:`.tfloat64`): p-value for each covariate. - `multiple_standard_error` (:py:data:`.tfloat64`): Estimated standard deviation of the random error. - `multiple_r_squared` (:py:data:`.tfloat64`): Coefficient of determination for nested models. - `adjusted_r_squared` (:py:data:`.tfloat64`): Adjusted `multiple_r_squared` taking into account degrees of freedom. - `f_stat` (:py:data:`.tfloat64`): F-statistic for nested models. - `multiple_p_value` (:py:data:`.tfloat64`): p-value for the `F-test <https://en.wikipedia.org/wiki/F-test#Regression_problems>`__ of nested models. - `n` (:py:data:`.tint64`): Number of samples included in the regression. A sample is included if and only if `y`, all elements of `x`, and `weight` (if set) are non-missing. All but the last field are missing if `n` is less than or equal to the number of covariates or if the covariates are linearly dependent. If set, the `weight` parameter generalizes the model to `weighted least squares <https://en.wikipedia.org/wiki/Weighted_least_squares>`__, useful for heteroscedastic (diagonal but non-constant) variance. Warning ------- If any weight is negative, the resulting statistics will be ``nan``. Parameters ---------- y : :class:`.Float64Expression` Response (dependent variable). x : :class:`.Float64Expression` or :obj:`list` of :class:`.Float64Expression` Covariates (independent variables). nested_dim : :obj:`int` The null model includes the first `nested_dim` covariates. Must be between 0 and `k` (the length of `x`). 
weight : :class:`.Float64Expression`, optional Non-negative weight for weighted least squares. Returns ------- :class:`.StructExpression` Struct of regression results. """ x = wrap_to_list(x) if len(x) == 0: raise ValueError("linreg: must have at least one covariate in `x`") hl.methods.statgen._warn_if_no_intercept('linreg', x) if weight is None: return _linreg(y, x, nested_dim) else: return _linreg(hl.sqrt(weight) * y, [hl.sqrt(weight) * xi for xi in x], nested_dim)
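# The weighted branch relies on the usual reduction of weighted least squares to
# ordinary least squares: rescaling the response and every covariate by sqrt(w) yields
# OLS estimates that minimize sum(w_i * (y_i - x_i . beta)^2). A short NumPy check of
# that equivalence (illustration only, independent of Hail):
import numpy as np

rng = np.random.default_rng(1)
n = 50
X = np.column_stack([np.ones(n), rng.normal(size=n)])
y = X @ np.array([1.0, 2.0]) + rng.normal(size=n)
w = rng.uniform(0.5, 2.0, size=n)

# OLS on sqrt(w)-rescaled data ...
sw = np.sqrt(w)
beta_rescaled, *_ = np.linalg.lstsq(X * sw[:, None], y * sw, rcond=None)

# ... equals the closed-form WLS solution (X'WX)^{-1} X'Wy.
beta_wls = np.linalg.solve(X.T @ (w[:, None] * X), X.T @ (w * y))
assert np.allclose(beta_rescaled, beta_wls)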
def _get_info_agg_expr(
    mt: hl.MatrixTable,
    sum_agg_fields: Union[
        List[str], Dict[str, hl.expr.NumericExpression]
    ] = INFO_SUM_AGG_FIELDS,
    int32_sum_agg_fields: Union[
        List[str], Dict[str, hl.expr.NumericExpression]
    ] = INFO_INT32_SUM_AGG_FIELDS,
    median_agg_fields: Union[
        List[str], Dict[str, hl.expr.NumericExpression]
    ] = INFO_MEDIAN_AGG_FIELDS,
    array_sum_agg_fields: Union[
        List[str], Dict[str, hl.expr.ArrayNumericExpression]
    ] = INFO_ARRAY_SUM_AGG_FIELDS,
    prefix: str = "",
) -> Dict[str, hl.expr.Aggregation]:
    """
    Helper function containing code to create Aggregators for both site or AS info expression aggregations.

    Notes:

    1. If `SB` is specified in array_sum_agg_fields, it will be aggregated as `AS_SB_TABLE`, according to GATK standard nomenclature.
    2. If `RAW_MQandDP` is specified in array_sum_agg_fields, it will be used for the `MQ` calculation and then dropped according to GATK recommendation.
    3. If `RAW_MQ` and `MQ_DP` are given, they will be used for the `MQ` calculation and then dropped according to GATK recommendation.
    4. If the fields to be aggregated (`sum_agg_fields`, `int32_sum_agg_fields`, `median_agg_fields`) are passed as a list of str, then they should correspond to entry fields in `mt` or in `mt.gvcf_info`. Priority is given to entry fields in `mt` over those in `mt.gvcf_info` in case of a name clash.

    :param mt: Input MT
    :param sum_agg_fields: Fields to aggregate using sum.
    :param int32_sum_agg_fields: Fields to aggregate using sum using int32.
    :param median_agg_fields: Fields to aggregate using (approximate) median.
    :param array_sum_agg_fields: Fields to aggregate using element-wise summing over an array.
    :param prefix: Optional prefix for the fields. Used for adding 'AS_' in the AS case.
    :return: Dictionary of expression names and their corresponding aggregation Expression
    """

    def _agg_list_to_dict(
        mt: hl.MatrixTable, fields: List[str]
    ) -> Dict[str, hl.expr.NumericExpression]:
        out_fields = {}
        if "gvcf_info" in mt.entry:
            out_fields = {f: mt.gvcf_info[f] for f in fields if f in mt.gvcf_info}
        out_fields.update({f: mt[f] for f in fields if f in mt.entry})

        # Check that all fields were found
        missing_fields = [f for f in fields if f not in out_fields]
        if missing_fields:
            raise ValueError(
                "Could not find the following field(s) in the MT entry schema (or nested under mt.gvcf_info): {}".format(
                    ",".join(missing_fields)
                )
            )

        return out_fields

    # Map str to expressions where needed
    if isinstance(sum_agg_fields, list):
        sum_agg_fields = _agg_list_to_dict(mt, sum_agg_fields)

    if isinstance(int32_sum_agg_fields, list):
        int32_sum_agg_fields = _agg_list_to_dict(mt, int32_sum_agg_fields)

    if isinstance(median_agg_fields, list):
        median_agg_fields = _agg_list_to_dict(mt, median_agg_fields)

    if isinstance(array_sum_agg_fields, list):
        array_sum_agg_fields = _agg_list_to_dict(mt, array_sum_agg_fields)

    # Create aggregators
    agg_expr = {}

    agg_expr.update(
        {
            f"{prefix}{k}": hl.agg.approx_quantiles(expr, 0.5)
            for k, expr in median_agg_fields.items()
        }
    )
    agg_expr.update(
        {f"{prefix}{k}": hl.agg.sum(expr) for k, expr in sum_agg_fields.items()}
    )
    agg_expr.update(
        {
            f"{prefix}{k}": hl.int32(hl.agg.sum(expr))
            for k, expr in int32_sum_agg_fields.items()
        }
    )
    agg_expr.update(
        {
            f"{prefix}{k}": hl.agg.array_agg(lambda x: hl.agg.sum(x), expr)
            for k, expr in array_sum_agg_fields.items()
        }
    )

    # Handle annotations combinations and casting for specific annotations
    # If RAW_MQandDP is in agg_expr or if both MQ_DP and RAW_MQ are, compute MQ instead
    mq_tuple = None
    if f"{prefix}RAW_MQandDP" in agg_expr:
        logger.info(
            f"Computing {prefix}MQ as sqrt({prefix}RAW_MQandDP[0]/{prefix}RAW_MQandDP[1]). "
            f"Note that {prefix}MQ will be set to 0 if {prefix}RAW_MQandDP[1] == 0."
        )
        mq_tuple = agg_expr.pop(f"{prefix}RAW_MQandDP")
    elif f"{prefix}RAW_MQ" in agg_expr and f"{prefix}MQ_DP" in agg_expr:
        logger.info(
            f"Computing {prefix}MQ as sqrt({prefix}RAW_MQ/{prefix}MQ_DP). "
            f"Note that MQ will be set to 0 if {prefix}MQ_DP == 0."
        )
        mq_tuple = (agg_expr.pop(f"{prefix}RAW_MQ"), agg_expr.pop(f"{prefix}MQ_DP"))

    if mq_tuple is not None:
        agg_expr[f"{prefix}MQ"] = hl.cond(
            mq_tuple[1] > 0, hl.sqrt(mq_tuple[0] / mq_tuple[1]), 0
        )

    # If both VarDP and QUALapprox are present, also compute QD.
    if f"{prefix}VarDP" in agg_expr and f"{prefix}QUALapprox" in agg_expr:
        logger.info(
            f"Computing {prefix}QD as {prefix}QUALapprox/{prefix}VarDP. "
            f"Note that {prefix}QD will be set to 0 if {prefix}VarDP == 0."
        )
        var_dp = hl.int32(hl.agg.sum(int32_sum_agg_fields["VarDP"]))
        agg_expr[f"{prefix}QD"] = hl.cond(
            var_dp > 0, agg_expr[f"{prefix}QUALapprox"] / var_dp, 0
        )

    # SB needs to be cast to int32 for FS down the line
    if f"{prefix}SB" in agg_expr:
        agg_expr[f"{prefix}SB"] = agg_expr[f"{prefix}SB"].map(lambda x: hl.int32(x))

    return agg_expr
def ld_score_regression(weight_expr, ld_score_expr, chi_sq_exprs, n_samples_exprs, n_blocks=200, two_step_threshold=30, n_reference_panel_variants=None) -> Table: r"""Estimate SNP-heritability and level of confounding biases from GWAS summary statistics. Given a set or multiple sets of genome-wide association study (GWAS) summary statistics, :func:`.ld_score_regression` estimates the heritability of a trait or set of traits and the level of confounding biases present in the underlying studies by regressing chi-squared statistics on LD scores, leveraging the model: .. math:: \mathrm{E}[\chi_j^2] = 1 + Na + \frac{Nh_g^2}{M}l_j * :math:`\mathrm{E}[\chi_j^2]` is the expected chi-squared statistic for variant :math:`j` resulting from a test of association between variant :math:`j` and a trait. * :math:`l_j = \sum_{k} r_{jk}^2` is the LD score of variant :math:`j`, calculated as the sum of squared correlation coefficients between variant :math:`j` and nearby variants. See :func:`ld_score` for further details. * :math:`a` captures the contribution of confounding biases, such as cryptic relatedness and uncontrolled population structure, to the association test statistic. * :math:`h_g^2` is the SNP-heritability, or the proportion of variation in the trait explained by the effects of variants included in the regression model above. * :math:`M` is the number of variants used to estimate :math:`h_g^2`. * :math:`N` is the number of samples in the underlying association study. For more details on the method implemented in this function, see: * `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__ Examples -------- Run the method on a matrix table of summary statistics, where the rows are variants and the columns are different phenotypes: >>> mt_gwas = hl.read_matrix_table('data/ld_score_regression.sumstats.mt') >>> ht_results = hl.experimental.ld_score_regression( ... weight_expr=mt_gwas['ld_score'], ... ld_score_expr=mt_gwas['ld_score'], ... chi_sq_exprs=mt_gwas['chi_squared'], ... n_samples_exprs=mt_gwas['n']) Run the method on a table with summary statistics for a single phenotype: >>> ht_gwas = hl.read_table('data/ld_score_regression.sumstats.ht') >>> ht_results = hl.experimental.ld_score_regression( ... weight_expr=ht_gwas['ld_score'], ... ld_score_expr=ht_gwas['ld_score'], ... chi_sq_exprs=ht_gwas['chi_squared_50_irnt'], ... n_samples_exprs=ht_gwas['n_50_irnt']) Run the method on a table with summary statistics for multiple phenotypes: >>> ht_gwas = hl.read_table('data/ld_score_regression.sumstats.ht') >>> ht_results = hl.experimental.ld_score_regression( ... weight_expr=ht_gwas['ld_score'], ... ld_score_expr=ht_gwas['ld_score'], ... chi_sq_exprs=[ht_gwas['chi_squared_50_irnt'], ... ht_gwas['chi_squared_20160']], ... n_samples_exprs=[ht_gwas['n_50_irnt'], ... ht_gwas['n_20160']]) Notes ----- The ``exprs`` provided as arguments to :func:`.ld_score_regression` must all be from the same object, either a :class:`Table` or a :class:`MatrixTable`. **If the arguments originate from a table:** * The table must be keyed by fields ``locus`` of type :class:`.tlocus` and ``alleles``, a :py:data:`.tarray` of :py:data:`.tstr` elements. * ``weight_expr``, ``ld_score_expr``, ``chi_sq_exprs``, and ``n_samples_exprs`` are must be row-indexed fields. * The number of expressions passed to ``n_samples_exprs`` must be equal to one or the number of expressions passed to ``chi_sq_exprs``. 
If just one expression is passed to ``n_samples_exprs``, that sample size expression is assumed to apply to all sets of statistics passed to ``chi_sq_exprs``. Otherwise, the expressions passed to ``chi_sq_exprs`` and ``n_samples_exprs`` are matched by index. * The ``phenotype`` field that keys the table returned by :func:`.ld_score_regression` will have generic :obj:`int` values ``0``, ``1``, etc. corresponding to the ``0th``, ``1st``, etc. expressions passed to the ``chi_sq_exprs`` argument. **If the arguments originate from a matrix table:** * The dimensions of the matrix table must be variants (rows) by phenotypes (columns). * The rows of the matrix table must be keyed by fields ``locus`` of type :class:`.tlocus` and ``alleles``, a :py:data:`.tarray` of :py:data:`.tstr` elements. * The columns of the matrix table must be keyed by a field of type :py:data:`.tstr` that uniquely identifies phenotypes represented in the matrix table. The column key must be a single expression; compound keys are not accepted. * ``weight_expr`` and ``ld_score_expr`` must be row-indexed fields. * ``chi_sq_exprs`` must be a single entry-indexed field (not a list of fields). * ``n_samples_exprs`` must be a single entry-indexed field (not a list of fields). * The ``phenotype`` field that keys the table returned by :func:`.ld_score_regression` will have values corresponding to the column keys of the input matrix table. This function returns a :class:`Table` with one row per set of summary statistics passed to the ``chi_sq_exprs`` argument. The following row-indexed fields are included in the table: * **phenotype** (:py:data:`.tstr`) -- The name of the phenotype. The returned table is keyed by this field. See the notes below for details on the possible values of this field. * **mean_chi_sq** (:py:data:`.tfloat64`) -- The mean chi-squared test statistic for the given phenotype. * **intercept** (`Struct`) -- Contains fields: - **estimate** (:py:data:`.tfloat64`) -- A point estimate of the intercept :math:`1 + Na`. - **standard_error** (:py:data:`.tfloat64`) -- An estimate of the standard error of this point estimate. * **snp_heritability** (`Struct`) -- Contains fields: - **estimate** (:py:data:`.tfloat64`) -- A point estimate of the SNP-heritability :math:`h_g^2`. - **standard_error** (:py:data:`.tfloat64`) -- An estimate of the standard error of this point estimate. Warning ------- :func:`.ld_score_regression` considers only the rows for which both row fields ``weight_expr`` and ``ld_score_expr`` are defined. Rows with missing values in either field are removed prior to fitting the LD score regression model. Parameters ---------- weight_expr : :class:`.Float64Expression` Row-indexed expression for the LD scores used to derive variant weights in the model. ld_score_expr : :class:`.Float64Expression` Row-indexed expression for the LD scores used as covariates in the model. chi_sq_exprs : :class:`.Float64Expression` or :obj:`list` of :class:`.Float64Expression` One or more row-indexed (if table) or entry-indexed (if matrix table) expressions for chi-squared statistics resulting from genome-wide association studies. n_samples_exprs: :class:`.NumericExpression` or :obj:`list` of :class:`.NumericExpression` One or more row-indexed (if table) or entry-indexed (if matrix table) expressions indicating the number of samples used in the studies that generated the test statistics supplied to ``chi_sq_exprs``. n_blocks : :obj:`int` The number of blocks used in the jackknife approach to estimating standard errors. 
    two_step_threshold : :obj:`int`
        Variants with chi-squared statistics greater than this value are
        excluded in the first step of the two-step procedure used to fit the model.
    n_reference_panel_variants : :obj:`int`, optional
        Number of variants used to estimate the SNP-heritability :math:`h_g^2`.

    Returns
    -------
    :class:`.Table`
        Table keyed by ``phenotype`` with intercept and heritability estimates
        for each phenotype passed to the function."""

    chi_sq_exprs = wrap_to_list(chi_sq_exprs)
    n_samples_exprs = wrap_to_list(n_samples_exprs)

    assert ((len(chi_sq_exprs) == len(n_samples_exprs)) or
            (len(n_samples_exprs) == 1))
    __k = 2  # number of covariates, including intercept

    ds = chi_sq_exprs[0]._indices.source

    analyze('ld_score_regression/weight_expr', weight_expr, ds._row_indices)
    analyze('ld_score_regression/ld_score_expr', ld_score_expr, ds._row_indices)

    # format input dataset
    if isinstance(ds, MatrixTable):
        if len(chi_sq_exprs) != 1:
            raise ValueError("""Only one chi_sq_expr allowed if originating
                from a matrix table.""")
        if len(n_samples_exprs) != 1:
            raise ValueError("""Only one n_samples_expr allowed if originating
                from a matrix table.""")

        col_key = list(ds.col_key)
        if len(col_key) != 1:
            raise ValueError("""Matrix table must be keyed by a single
                phenotype field.""")

        analyze('ld_score_regression/chi_squared_expr',
                chi_sq_exprs[0], ds._entry_indices)
        analyze('ld_score_regression/n_samples_expr',
                n_samples_exprs[0], ds._entry_indices)

        ds = ds._select_all(row_exprs={'__locus': ds.locus,
                                       '__alleles': ds.alleles,
                                       '__w_initial': weight_expr,
                                       '__w_initial_floor': hl.max(weight_expr, 1.0),
                                       '__x': ld_score_expr,
                                       '__x_floor': hl.max(ld_score_expr, 1.0)},
                            row_key=['__locus', '__alleles'],
                            col_exprs={'__y_name': ds[col_key[0]]},
                            col_key=['__y_name'],
                            entry_exprs={'__y': chi_sq_exprs[0],
                                         '__n': n_samples_exprs[0]})
        ds = ds.annotate_entries(**{'__w': ds.__w_initial})

        ds = ds.filter_rows(hl.is_defined(ds.__locus) &
                            hl.is_defined(ds.__alleles) &
                            hl.is_defined(ds.__w_initial) &
                            hl.is_defined(ds.__x))

    else:
        assert isinstance(ds, Table)
        for y in chi_sq_exprs:
            analyze('ld_score_regression/chi_squared_expr', y, ds._row_indices)
        for n in n_samples_exprs:
            analyze('ld_score_regression/n_samples_expr', n, ds._row_indices)

        ys = ['__y{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)]
        ws = ['__w{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)]
        ns = ['__n{:}'.format(i) for i, _ in enumerate(n_samples_exprs)]

        ds = ds.select(**dict(**{'__locus': ds.locus,
                                 '__alleles': ds.alleles,
                                 '__w_initial': weight_expr,
                                 '__x': ld_score_expr},
                              **{y: chi_sq_exprs[i] for i, y in enumerate(ys)},
                              **{w: weight_expr for w in ws},
                              **{n: n_samples_exprs[i] for i, n in enumerate(ns)}))
        ds = ds.key_by(ds.__locus, ds.__alleles)

        table_tmp_file = new_temp_file()
        ds.write(table_tmp_file)
        ds = hl.read_table(table_tmp_file)

        hts = [ds.select(**{'__w_initial': ds.__w_initial,
                            '__w_initial_floor': hl.max(ds.__w_initial, 1.0),
                            '__x': ds.__x,
                            '__x_floor': hl.max(ds.__x, 1.0),
                            '__y_name': i,
                            '__y': ds[ys[i]],
                            '__w': ds[ws[i]],
                            '__n': hl.int(ds[ns[i]])})
               for i, y in enumerate(ys)]

        mts = [ht.to_matrix_table(row_key=['__locus', '__alleles'],
                                  col_key=['__y_name'],
                                  row_fields=['__w_initial',
                                              '__w_initial_floor',
                                              '__x', '__x_floor'])
               for ht in hts]

        ds = mts[0]
        for i in range(1, len(ys)):
            ds = ds.union_cols(mts[i])

        ds = ds.filter_rows(hl.is_defined(ds.__locus) &
                            hl.is_defined(ds.__alleles) &
                            hl.is_defined(ds.__w_initial) &
                            hl.is_defined(ds.__x))

    mt_tmp_file1 = new_temp_file()
    ds.write(mt_tmp_file1)
    mt = hl.read_matrix_table(mt_tmp_file1)

    if not n_reference_panel_variants:
        M = mt.count_rows()
    else:
        M = n_reference_panel_variants

    # block variants for each phenotype
    n_phenotypes = mt.count_cols()

    mt = mt.annotate_entries(__in_step1=(hl.is_defined(mt.__y) &
                                         (mt.__y < two_step_threshold)),
                             __in_step2=hl.is_defined(mt.__y))

    mt = mt.annotate_cols(__col_idx=hl.int(hl.scan.count()),
                          __m_step1=hl.agg.count_where(mt.__in_step1),
                          __m_step2=hl.agg.count_where(mt.__in_step2))

    col_keys = list(mt.col_key)

    ht = mt.localize_entries(entries_array_field_name='__entries',
                             columns_array_field_name='__cols')

    ht = ht.annotate(__entries=hl.rbind(
        hl.scan.array_agg(
            lambda entry: hl.scan.count_where(entry.__in_step1),
            ht.__entries),
        lambda step1_indices: hl.map(
            lambda i: hl.rbind(
                hl.int(hl.or_else(step1_indices[i], 0)),
                ht.__cols[i].__m_step1,
                ht.__entries[i],
                lambda step1_idx, m_step1, entry: hl.rbind(
                    hl.map(
                        lambda j: hl.int(hl.floor(j * (m_step1 / n_blocks))),
                        hl.range(0, n_blocks + 1)),
                    lambda step1_separators: hl.rbind(
                        hl.set(step1_separators).contains(step1_idx),
                        hl.sum(
                            hl.map(
                                lambda s1: step1_idx >= s1,
                                step1_separators)) - 1,
                        lambda is_separator, step1_block: entry.annotate(
                            __step1_block=step1_block,
                            __step2_block=hl.cond(~entry.__in_step1 & is_separator,
                                                  step1_block - 1,
                                                  step1_block))))),
            hl.range(0, hl.len(ht.__entries)))))

    mt = ht._unlocalize_entries('__entries', '__cols', col_keys)

    mt_tmp_file2 = new_temp_file()
    mt.write(mt_tmp_file2)
    mt = hl.read_matrix_table(mt_tmp_file2)

    # initial coefficient estimates
    mt = mt.annotate_cols(__initial_betas=[
        1.0, (hl.agg.mean(mt.__y) - 1.0) / hl.agg.mean(mt.__x)])
    mt = mt.annotate_cols(__step1_betas=mt.__initial_betas,
                          __step2_betas=mt.__initial_betas)

    # step 1 iteratively reweighted least squares
    for i in range(3):
        mt = mt.annotate_entries(__w=hl.cond(
            mt.__in_step1,
            1.0 / (mt.__w_initial_floor * 2.0 * (mt.__step1_betas[0] +
                                                 mt.__step1_betas[1] *
                                                 mt.__x_floor)**2),
            0.0))
        mt = mt.annotate_cols(__step1_betas=hl.agg.filter(
            mt.__in_step1,
            hl.agg.linreg(y=mt.__y,
                          x=[1.0, mt.__x],
                          weight=mt.__w).beta))
        mt = mt.annotate_cols(__step1_h2=hl.max(hl.min(
            mt.__step1_betas[1] * M / hl.agg.mean(mt.__n), 1.0), 0.0))
        mt = mt.annotate_cols(__step1_betas=[
            mt.__step1_betas[0],
            mt.__step1_h2 * hl.agg.mean(mt.__n) / M])

    # step 1 block jackknife
    mt = mt.annotate_cols(__step1_block_betas=[
        hl.agg.filter((mt.__step1_block != i) & mt.__in_step1,
                      hl.agg.linreg(y=mt.__y,
                                    x=[1.0, mt.__x],
                                    weight=mt.__w).beta)
        for i in range(n_blocks)])

    mt = mt.annotate_cols(__step1_block_betas_bias_corrected=hl.map(
        lambda x: n_blocks * mt.__step1_betas - (n_blocks - 1) * x,
        mt.__step1_block_betas))

    mt = mt.annotate_cols(
        __step1_jackknife_mean=hl.map(
            lambda i: hl.mean(
                hl.map(lambda x: x[i],
                       mt.__step1_block_betas_bias_corrected)),
            hl.range(0, __k)),
        __step1_jackknife_variance=hl.map(
            lambda i: (hl.sum(
                hl.map(lambda x: x[i]**2,
                       mt.__step1_block_betas_bias_corrected)) -
                hl.sum(
                hl.map(lambda x: x[i],
                       mt.__step1_block_betas_bias_corrected))**2 / n_blocks) /
            (n_blocks - 1) / n_blocks,
            hl.range(0, __k)))

    # step 2 iteratively reweighted least squares
    for i in range(3):
        mt = mt.annotate_entries(__w=hl.cond(
            mt.__in_step2,
            1.0 / (mt.__w_initial_floor * 2.0 * (mt.__step2_betas[0] +
                                                 mt.__step2_betas[1] *
                                                 mt.__x_floor)**2),
            0.0))
        mt = mt.annotate_cols(__step2_betas=[
            mt.__step1_betas[0],
            hl.agg.filter(mt.__in_step2,
                          hl.agg.linreg(y=mt.__y - mt.__step1_betas[0],
                                        x=[mt.__x],
                                        weight=mt.__w).beta[0])])
        mt = mt.annotate_cols(__step2_h2=hl.max(hl.min(
            mt.__step2_betas[1] * M / hl.agg.mean(mt.__n), 1.0), 0.0))
        mt = mt.annotate_cols(__step2_betas=[
            mt.__step1_betas[0],
            mt.__step2_h2 * hl.agg.mean(mt.__n) / M])

    # step 2 block jackknife
    mt = mt.annotate_cols(__step2_block_betas=[
        hl.agg.filter((mt.__step2_block != i) & mt.__in_step2,
                      hl.agg.linreg(y=mt.__y - mt.__step1_betas[0],
                                    x=[mt.__x],
                                    weight=mt.__w).beta[0])
        for i in range(n_blocks)])

    mt = mt.annotate_cols(__step2_block_betas_bias_corrected=hl.map(
        lambda x: n_blocks * mt.__step2_betas[1] - (n_blocks - 1) * x,
        mt.__step2_block_betas))

    mt = mt.annotate_cols(
        __step2_jackknife_mean=hl.mean(
            mt.__step2_block_betas_bias_corrected),
        __step2_jackknife_variance=(
            hl.sum(mt.__step2_block_betas_bias_corrected**2) -
            hl.sum(mt.__step2_block_betas_bias_corrected)**2 /
            n_blocks) / (n_blocks - 1) / n_blocks)

    # combine step 1 and step 2 block jackknifes
    mt = mt.annotate_entries(
        __step2_initial_w=1.0 / (mt.__w_initial_floor * 2.0 *
                                 (mt.__initial_betas[0] +
                                  mt.__initial_betas[1] *
                                  mt.__x_floor)**2))

    mt = mt.annotate_cols(
        __final_betas=[
            mt.__step1_betas[0],
            mt.__step2_betas[1]],
        __c=(hl.agg.sum(mt.__step2_initial_w * mt.__x) /
             hl.agg.sum(mt.__step2_initial_w * mt.__x**2)))

    mt = mt.annotate_cols(__final_block_betas=hl.map(
        lambda i: (mt.__step2_block_betas[i] - mt.__c *
                   (mt.__step1_block_betas[i][0] - mt.__final_betas[0])),
        hl.range(0, n_blocks)))

    mt = mt.annotate_cols(
        __final_block_betas_bias_corrected=(n_blocks * mt.__final_betas[1] -
                                            (n_blocks - 1) *
                                            mt.__final_block_betas))

    mt = mt.annotate_cols(
        __final_jackknife_mean=[
            mt.__step1_jackknife_mean[0],
            hl.mean(mt.__final_block_betas_bias_corrected)],
        __final_jackknife_variance=[
            mt.__step1_jackknife_variance[0],
            (hl.sum(mt.__final_block_betas_bias_corrected**2) -
             hl.sum(mt.__final_block_betas_bias_corrected)**2 /
             n_blocks) / (n_blocks - 1) / n_blocks])

    # convert coefficient to heritability estimate
    mt = mt.annotate_cols(
        phenotype=mt.__y_name,
        mean_chi_sq=hl.agg.mean(mt.__y),
        intercept=hl.struct(
            estimate=mt.__final_betas[0],
            standard_error=hl.sqrt(mt.__final_jackknife_variance[0])),
        snp_heritability=hl.struct(
            estimate=(M / hl.agg.mean(mt.__n)) * mt.__final_betas[1],
            standard_error=hl.sqrt((M / hl.agg.mean(mt.__n))**2 *
                                   mt.__final_jackknife_variance[1])))

    # format and return results
    ht = mt.cols()
    ht = ht.key_by(ht.phenotype)
    ht = ht.select(ht.mean_chi_sq,
                   ht.intercept,
                   ht.snp_heritability)

    ht_tmp_file = new_temp_file()
    ht.write(ht_tmp_file)
    ht = hl.read_table(ht_tmp_file)

    return ht
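# Usage sketch for ld_score_regression: estimate the intercept and SNP-heritability
# from a summary-statistics Table. The table path and the row-field names
# `chi_squared`, `n`, and `ld_score` are hypothetical placeholders; using the LD
# score itself as the regression weight is a common choice, not a requirement.
def _example_ld_score_regression():
    gwas_ht = hl.read_table('gs://my-bucket/gwas_sumstats.ht')  # hypothetical path
    h2_ht = ld_score_regression(weight_expr=gwas_ht.ld_score,
                                ld_score_expr=gwas_ht.ld_score,
                                chi_sq_exprs=gwas_ht.chi_squared,
                                n_samples_exprs=gwas_ht.n,
                                n_blocks=200,
                                two_step_threshold=30)
    return h2_ht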
def calculate_phenotypes(mt, genotype, beta, h2, popstrat=None, popstrat_var=None,
                         exact_h2=False):
    r"""Calculates phenotypes by multiplying genotypes and betas.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        :class:`.MatrixTable` with all relevant fields passed as parameters.
    genotype : :class:`.Expression` or :class:`.CallExpression`
        Entry field of genotypes.
    beta : :class:`.Expression`
        Row field of SNP effects.
    h2 : :obj:`float` or :obj:`int` or :obj:`list` or :class:`numpy.ndarray`
        SNP-based heritability (:math:`h^2`) of simulated trait. Can only be
        ``None`` if running annotation-informed model.
    popstrat : :class:`.Expression`, optional
        Column field containing population stratification term.
    popstrat_var : :obj:`float` or :obj:`int`
        Variance of population stratification term.
    exact_h2 : :obj:`bool`
        Whether to exactly simulate ratio of variance of genetic component of
        phenotype to variance of phenotype to be h2. If `False`, ratio will be
        h2 in expectation. Observed h2 in the simulation will be close to
        expected h2 for large-scale simulations.

    Returns
    -------
    :class:`.MatrixTable`
        :class:`.MatrixTable` with simulated phenotype as column field.
    """
    print('calculating phenotype')
    h2 = h2.tolist() if type(h2) is np.ndarray else (
        [h2] if type(h2) is not list else h2)
    assert popstrat_var is None or (popstrat_var >= 0), 'popstrat_var must be non-negative'
    uid = Env.get_uid(base=100)
    mt = annotate_all(
        mt=mt,
        row_exprs={'beta_' + uid: beta},
        col_exprs={} if popstrat is None else {'popstrat_' + uid: popstrat},
        entry_exprs={
            'gt_' + uid: genotype.n_alt_alleles()
            if genotype.dtype is hl.dtype('call') else genotype
        })
    mt = mt.filter_rows(hl.agg.stats(mt['gt_' + uid]).stdev > 0)
    mt = normalize_genotypes(mt['gt_' + uid])
    if mt['beta_' + uid].dtype == hl.dtype('array<float64>'):  # if >1 traits
        if exact_h2:
            raise ValueError('exact_h2=True not supported for multitrait simulations')
        else:
            mt = mt.annotate_cols(y_no_noise=hl.agg.array_agg(
                lambda beta: hl.agg.sum(beta * mt['norm_gt']), mt['beta_' + uid]))
            mt = mt.annotate_cols(
                y=mt.y_no_noise +
                hl.literal(h2).map(lambda x: hl.rand_norm(0, hl.sqrt(1 - x))))
    else:
        if exact_h2 and min([h2[0], 1 - h2[0]]) != 0:
            print('exact h2')
            mt = mt.annotate_cols(**{
                'y_no_noise_' + uid: hl.agg.sum(mt['beta_' + uid] * mt['norm_gt'])
            })
            y_no_noise_stdev = mt.aggregate_cols(
                hl.agg.stats(mt['y_no_noise_' + uid]).stdev)
            mt = mt.annotate_cols(
                y_no_noise=hl.sqrt(h2[0]) * mt['y_no_noise_' + uid] / y_no_noise_stdev
            )  # normalize genetic component of phenotype to have variance of exactly h2
            mt = mt.annotate_cols(
                **{'noise_' + uid: hl.rand_norm(0, hl.sqrt(1 - h2[0]))})
            noise_stdev = mt.aggregate_cols(
                hl.agg.stats(mt['noise_' + uid]).stdev)
            mt = mt.annotate_cols(
                noise=hl.sqrt(1 - h2[0]) * mt['noise_' + uid] / noise_stdev)
            mt = mt.annotate_cols(
                y=mt.y_no_noise + hl.sqrt(1 - h2[0]) * mt['noise_' + uid] / noise_stdev)
        else:
            mt = mt.annotate_cols(y_no_noise=hl.agg.sum(mt['beta_' + uid] * mt['norm_gt']))
            mt = mt.annotate_cols(y=mt.y_no_noise + hl.rand_norm(0, hl.sqrt(1 - h2[0])))
    if popstrat is not None:
        var_factor = 1 if popstrat_var is None else (popstrat_var**(1 / 2)) / mt.aggregate_cols(
            hl.agg.stats(mt['popstrat_' + uid])).stdev
        mt = mt.rename({'y': 'y_no_popstrat'})
        mt = mt.annotate_cols(y=mt.y_no_popstrat + mt['popstrat_' + uid] * var_factor)
    mt = _clean_fields(mt, uid)
    return mt
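# Usage sketch for calculate_phenotypes: simulate one quantitative trait with
# h2 = 0.3 on Balding-Nichols genotypes. Effect sizes are drawn directly here
# for illustration; in practice they would typically come from make_betas below.
def _example_calculate_phenotypes():
    mt = hl.balding_nichols_model(n_populations=1, n_samples=200, n_variants=5000)
    mt = mt.annotate_rows(beta=hl.rand_norm(0, hl.sqrt(0.3 / 5000)))  # infinitesimal effects
    sim = calculate_phenotypes(mt, genotype=mt.GT, beta=mt.beta, h2=0.3, exact_h2=True)
    return sim.cols()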
def make_betas(mt, h2, pi=None, annot=None, rg=None):
    r"""Generates betas under different models.

    Simulates betas (SNP effects) under the infinitesimal, spike & slab, or
    annotation-informed models, depending on parameters passed.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        MatrixTable containing genotypes to be used. Also should contain variant
        annotations as row fields if running the annotation-informed model or
        covariates as column fields if adding population stratification.
    h2 : :obj:`float` or :obj:`int` or :obj:`list` or :class:`numpy.ndarray`
        SNP-based heritability of simulated trait(s).
    pi : :obj:`float` or :obj:`int` or :obj:`list` or :class:`numpy.ndarray`, optional
        Probability of SNP being causal when simulating under the spike & slab
        model. If doing two-trait spike & slab, `pi` is a list of probabilities
        for overlapping causal SNPs (see docstring of :func:`.multitrait_ss`).
    annot : :class:`.Expression`, optional
        Row field of aggregated annotations for annotation-informed model.
    rg : :obj:`float` or :obj:`int` or :obj:`list` or :class:`numpy.ndarray`, optional
        Genetic correlation between traits.

    Returns
    -------
    mt : :class:`.MatrixTable`
        :class:`.MatrixTable` with betas as a row field, simulated according to
        specified model.
    pi : :obj:`list`
        Probability of a SNP being causal for different traits, possibly altered
        from input `pi` if covariance matrix for multitrait simulation was not
        positive semi-definite.
    rg : :obj:`list`
        Genetic correlation between traits, possibly altered from input `rg` if
        covariance matrix for multitrait simulation was not positive semi-definite.
    """
    h2 = h2.tolist() if type(h2) is np.ndarray else (
        [h2] if type(h2) is not list else h2)
    pi = pi.tolist() if type(pi) is np.ndarray else (
        [pi] if type(pi) is not list else pi)
    rg = rg.tolist() if type(rg) is np.ndarray else (
        [rg] if type(rg) is not list else rg)
    assert all(x >= 0 and x <= 1 for x in h2), 'h2 values must be between 0 and 1'
    assert (pi == [None]) or all(
        x >= 0 and x <= 1 for x in pi), 'pi values for spike & slab must be between 0 and 1'
    assert (rg == [None] or all(x >= -1 and x <= 1 for x in rg)), \
        'rg values must be between -1 and 1 or None'
    M = mt.count_rows()
    if annot is not None:  # multi-trait annotation-informed
        assert rg == [None], 'Correlated traits not supported for annotation-informed model'
        h2 = h2 if type(h2) is list else [h2]
        annot_sum = mt.aggregate_rows(hl.agg.sum(annot))
        mt = mt.annotate_rows(beta=hl.literal(h2).map(
            lambda x: hl.rand_norm(0, hl.sqrt(annot * x / (annot_sum * M)))))
    elif len(h2) > 1 and (pi == [None] or pi == [1]):  # multi-trait correlated infinitesimal
        mt, rg = multitrait_inf(mt=mt, h2=h2, rg=rg)
    elif len(h2) == 2 and len(pi) > 1 and len(rg) == 1:  # two-trait correlated spike & slab
        print('multitrait ss')
        mt, pi, rg = multitrait_ss(mt=mt,
                                   h2=h2,
                                   rg=0 if rg == [None] else rg[0],
                                   pi=pi)
    elif len(h2) == 1 and len(pi) == 1:  # single-trait infinitesimal/spike & slab
        pi_temp = 1 if pi == [None] else pi[0]
        mt = mt.annotate_rows(beta=hl.rand_bool(pi_temp) *
                              hl.rand_norm(0, hl.sqrt(h2[0] / (M * pi_temp))))
    else:
        raise ValueError('Parameters passed do not match any models.')
    return mt, pi, rg
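# Usage sketch for make_betas: spike & slab effects with ~1% causal variants,
# followed by phenotype simulation at the same h2. Genotypes come from a small
# Balding-Nichols simulation purely for illustration.
def _example_make_betas():
    mt = hl.balding_nichols_model(n_populations=1, n_samples=200, n_variants=5000)
    mt, pi, rg = make_betas(mt, h2=0.3, pi=0.01)  # single-trait spike & slab
    sim = calculate_phenotypes(mt, genotype=mt.GT, beta=mt.beta, h2=0.3)
    return sim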
def default_generate_gene_lof_summary(
    mt: hl.MatrixTable,
    collapse_indels: bool = False,
    tx: bool = False,
    lof_csq_set: Set[str] = LOF_CSQ_SET,
    meta_root: str = "meta",
    pop_field: str = "pop",
    filter_loftee: bool = False,
) -> hl.Table:
    """
    Generate summary counts for loss-of-function (LoF), missense, and synonymous variants.

    Also calculates p, proportion of haplotypes carrying a putative LoF (pLoF) variant,
    and observed/expected (OE) ratio of samples with homozygous pLoF variant calls.

    Summary counts are (all per gene):
        - Number of samples with no pLoF variants.
        - Number of samples with heterozygous pLoF variants.
        - Number of samples with homozygous pLoF variants.
        - Total number of sites with genotype calls.
        - All of the above stats grouped by population.

    Assumes MT was created using `default_generate_gene_lof_matrix`.

    .. note::
        Assumes LoF variants in MT were filtered (LOFTEE pass and no LoF flag only).
        If LoF variants have not been filtered and `filter_loftee` is True,
        expects MT has the row annotation `vep`.

    :param mt: Input MatrixTable.
    :param collapse_indels: Whether to collapse indels. Default is False.
    :param tx: Whether input MT has transcript expression data. Default is False.
    :param lof_csq_set: Set containing LoF transcript consequence strings. Default is LOF_CSQ_SET.
    :param meta_root: String indicating top level name for sample metadata. Default is 'meta'.
    :param pop_field: String indicating field with sample population assignment information. Default is 'pop'.
    :param filter_loftee: Filters to LOFTEE pass variants (and no LoF flags) only. Default is False.
    :return: Table with het/hom summary counts.
    """
    if collapse_indels:
        grouping = ["gene_id", "gene", "most_severe_consequence"]
        if tx:
            grouping.append("expressed")
        else:
            grouping.extend(["transcript_id", "canonical"])
        mt = (
            mt.group_rows_by(*grouping)
            .aggregate_rows(
                n_sites=hl.agg.sum(mt.n_sites),
                n_sites_array=hl.agg.array_sum(mt.n_sites_array),
                classic_caf=hl.agg.sum(mt.classic_caf),
                max_af=hl.agg.max(mt.max_af),
                classic_caf_array=hl.agg.array_sum(mt.classic_caf_array),
            )
            .aggregate_entries(
                num_homs=hl.agg.sum(mt.num_homs),
                num_hets=hl.agg.sum(mt.num_hets),
                defined_sites=hl.agg.sum(mt.defined_sites),
            )
            .result()
        )

    if filter_loftee:
        lof_ht = get_most_severe_consequence_for_summary(mt.rows())
        mt = mt.filter_rows(
            hl.is_defined(lof_ht[mt.row_key].lof)
            & (lof_ht[mt.row_key].lof == "HC")
            & (lof_ht[mt.row_key].no_lof_flags)
        )

    ht = mt.annotate_rows(
        lof=hl.struct(
            **get_het_hom_summary_dict(
                csq_set=lof_csq_set,
                most_severe_csq_expr=mt.most_severe_consequence,
                defined_sites_expr=mt.defined_sites,
                num_homs_expr=mt.num_homs,
                num_hets_expr=mt.num_hets,
                pop_expr=mt[meta_root][pop_field],
            ),
        ),
        missense=hl.struct(
            **get_het_hom_summary_dict(
                csq_set={"missense_variant"},
                most_severe_csq_expr=mt.most_severe_consequence,
                defined_sites_expr=mt.defined_sites,
                num_homs_expr=mt.num_homs,
                num_hets_expr=mt.num_hets,
                pop_expr=mt[meta_root][pop_field],
            ),
        ),
        synonymous=hl.struct(
            **get_het_hom_summary_dict(
                csq_set={"synonymous_variant"},
                most_severe_csq_expr=mt.most_severe_consequence,
                defined_sites_expr=mt.defined_sites,
                num_homs_expr=mt.num_homs,
                num_hets_expr=mt.num_hets,
                pop_expr=mt[meta_root][pop_field],
            ),
        ),
    ).rows()
    ht = ht.annotate(
        p=(1 - hl.sqrt(hl.float64(ht.lof.no_alt_calls) / ht.lof.defined)),
        pop_p=hl.dict(
            hl.array(ht.lof.pop_defined).map(
                lambda x: (
                    x[0],
                    1 - hl.sqrt(hl.float64(ht.lof.pop_no_alt_calls.get(x[0])) / x[1]),
                )
            )
        ),
    )
    ht = ht.annotate(exp_hom_lof=ht.lof.defined * ht.p * ht.p)
    return ht.annotate(oe=ht.lof.obs_hom / ht.exp_hom_lof)
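# Usage sketch for default_generate_gene_lof_summary: summarize per-gene pLoF
# carrier counts from a gene-by-sample MatrixTable produced by
# default_generate_gene_lof_matrix. The input path is a hypothetical placeholder.
def _example_gene_lof_summary():
    gene_mt = hl.read_matrix_table('gs://my-bucket/gene_lof_matrix.mt')  # hypothetical path
    summary_ht = default_generate_gene_lof_summary(gene_mt, collapse_indels=True)
    return summary_ht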
def main(args):
    ########################################################################
    ### initialize
    phenos = [
        'height', 'bmi', 'sbp', 'dbp', 'wbc', 'monocyte', 'neutrophil',
        'eosinophil', 'basophil', 'lymphocyte', 'rbc', 'mch', 'mcv', 'mchc',
        'hb', 'ht', 'plt'
    ]
    phenos.sort()
    phenotype = 'ALL17'

    if args.clump_basename is None:
        clumps = args.dirname + args.basename + '_ALL17.clumped'
        prs_loci_table_location = args.dirname + 'keytables/ukb-' + phenotype + '_' + args.basename + '-pt-sumstats-locus-allele-keyed.kt'
        contig_row_dict_location = args.dirname + 'contig_row_dict-' + phenotype + '_' + args.basename
    else:
        clumps = args.dirname + args.clump_basename + '_ALL17.clumped'
        prs_loci_table_location = args.dirname + 'keytables/ukb-' + phenotype + '_' + args.basename_out + '-pt-sumstats-locus-allele-keyed.kt'
        contig_row_dict_location = args.dirname + 'contig_row_dict-' + phenotype + '_' + args.basename_out

    # clumps = args.dirname + end_dir + 'UKB_hm3.chr22.cm.beta.true_PRS.gwas_sumstat_' + args.iter + '_beta' + args.which_beta + '.clumped'
    # ss_filename = args.dirname + end_dir + 'UKB_hm3.chr22.cm.beta.true_PRS.gwas_sumstat_' + args.iter + '.tsv.gz'
    # out_base = args.dirname + end_dir + 'UKB_hm3.chr22.cm.beta.true_PRS.gwas_sumstat_' + args.iter + '_beta' + args.which_beta + '_gwas_PRS'

    clump_table_location = clumps.replace('.clumped', '.kt')

    contigs = {'0{}'.format(x): str(x) for x in range(1, 10)}

    bgen_files = 'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_chr{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22}_v3.bgen'

    start = time.time()
    # large block size because we read very little data (due to filtering & ignoring genotypes)
    hl.init(branching_factor=10, min_block_size=2000)  # set min_block_size only in import_bgen

    ################################################################################
    ### set up the sumstats table (chr, bp for union SNPs)
    # if (args.generate_prs_loci_table):
    #     t = hl.import_table(sumstats_text_file,
    #                         delimiter='\s+',
    #                         impute=True)
    #     t = t.select(locus=hl.locus(hl.str(t.CHR), t.BP))
    #     t = t.key_by('locus')
    #     t.write(prs_loci_table_location, overwrite=True)
    #
    # ss = hl.read_table(prs_loci_table_location)

    if args.read_clumps:
        clump_file = hl.import_table(clumps, delimiter='\s+', impute=True)
        clump_file = clump_file.select(
            locus=hl.locus(hl.str(clump_file.CHR), clump_file.BP))
        clump_file = clump_file.key_by('locus')
        clump_file.write(clump_table_location, overwrite=True)

    clump_file = hl.read_table(clump_table_location)

    # ################################################################################
    # ### determine the indices of the prs variants in bgen
    # if (args.generate_contig_row_dict):
    #     mt = hl.methods.import_bgen(bgen_files,
    #                                 [],
    #                                 contig_recoding=contigs,
    #                                 _row_fields=['file_row_idx'])
    #     prs_rows = mt.filter_rows(hl.is_defined(ss[mt.locus])).rows()
    #     print('about to collect')
    #     # remove all unnecessary data, dropping keys and other irrelevant fields
    #     prs_rows = prs_rows.key_by()
    #     prs_rows = prs_rows.select(contig=prs_rows.locus.contig,
    #                                file_row_idx=prs_rows.file_row_idx)
    #     contig_row_list = prs_rows.collect()
    #     print('finished collecting')
    #     contig_reformed = [(x['contig'], x['file_row_idx']) for x in contig_row_list]
    #     print('reformed')
    #     from collections import defaultdict
    #     contig_row_dict = defaultdict(list)
    #     for k, v in contig_reformed:
    #         contig_row_dict[k].append(v)
    #     print('dictionary created')
    #
    #     with hl.hadoop_open(contig_row_dict_location, 'wb') as f:
    #         pickle.dump(contig_row_dict, f)
    # else:
    #     with hl.hadoop_open(contig_row_dict_location, 'rb') as f:
    #         contig_row_dict = pickle.load(f)

    ################################################################################
    ### Get true phenotypes from UKBB
    if args.pheno_table:
        # phenotypes = hl.import_table('gs://phenotype_31063/ukb31063.phesant_phenotypes.both_sexes.tsv.bgz',
        #                              key='userId', quote='"', impute=True, types={'userId': hl.tstr}, missing='')
        phenotypes = hl.import_table(
            'gs://armartin/disparities/ukbb/UKB_phenos_ALL17.txt.bgz',
            key='eid',
            impute=True,
            types={'eid': hl.tstr})

        covariates = hl.import_table(
            'gs://phenotype_31063/ukb31063.gwas_covariates.both_sexes.tsv',
            key='s',
            impute=True,
            types={'s': hl.tstr})

        samples = covariates.annotate(**phenotypes[covariates.s])

        # Write pheno/covar/sample info table
        for pheno in phenos:
            # sampleids = hl.import_table('gs://ukb31063-mega-gwas/hail-0.1/qc/ukb31063.gwas_samples.txt', delimiter='\s+').key_by('s')
            gwas_holdout = hl.import_table(
                'gs://armartin/mama/ukb31063.gwas_samples.gwas_vs_holdout.txt',
                delimiter='\s+').key_by('s')

            samples = samples.annotate(**{
                pheno + '_holdout': gwas_holdout[samples.s].in_gwas == 'FALSE'
            })

        samples.write(args.dirname + args.basename + '_holdout_gwas_phenos.ht', True)

    if args.ss_tables:
        # Write ss info
        for pheno in phenos:
            print(pheno)
            # change sumstats to bgz
            # ss = hl.import_table('gs://armartin/disparities/pheno_31063_holdout_gwas_' + pheno + '.txt.gz',
            ss = hl.import_table(args.dirname + pheno + '_' + args.basename + '.*.bgz',
                                 delimiter='\s+',
                                 impute=True,
                                 types={
                                     'MAMA_BETA': hl.tfloat,
                                     'MAMA_PVAL': hl.tfloat,
                                     'BP': hl.tint
                                 })  # , 'N': hl.tint}
            ss = ss.key_by(
                locus=hl.locus(hl.str(ss.CHR), hl.int(ss.BP))).repartition(200)
            ss.write(args.dirname + pheno + '_' + args.basename + '.ht', True)

    ################################################################################
    ### Run the PRS using phenotype-specific clump variants
    if args.write_bgen:
        mt_all = hl.import_bgen(
            bgen_files,
            ['dosage'],
            sample_file='gs://phenotype_31063/ukb31063.autosomes.sample',
            variants=clump_file.locus)
        # contig_row_dict2 = {'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_chr{contig}_v3.bgen'.format(contig=k): v for k, v in contig_row_dict.items()}
        # mt_all = hl.methods.import_bgen(bgen_files,
        #                                 ['dosage'],
        #                                 sample_file='gs://phenotype_31063/ukb31063.autosomes.sample',
        #                                 contig_recoding=contigs,
        #                                 _variants_per_file=contig_row_dict2,
        #                                 _row_fields=[])

        # samples.write(args.dirname + args.basename + '_holdout_gwas_phenos.ht', True)
        samples = hl.read_table(args.dirname + args.basename + '_holdout_gwas_phenos.ht')
        mt_all = mt_all.annotate_cols(**samples[mt_all.s])  # ok that phenos keyed on userId not s?

        if args.clump_basename is None:
            mt_all.repartition(5000, shuffle=False).write(
                args.dirname + args.basename + '_ALL17.mt', True)
        else:
            mt_all.repartition(5000, shuffle=False).write(
                args.dirname + args.basename_out + '_ALL17.mt', True)

    mt_all = hl.read_matrix_table(args.dirname + args.basename + '_ALL17.mt')

    for pheno in phenos:  # [6:len(phenos)]:
        print(pheno)
        ss = hl.read_table(args.dirname + pheno + '_' + args.basename + '.ht')
        """
        To add:
        - Filter only to samples in holdout GWAS
        - Filter to rows in phenotype-specific clump file
        - Build PRS for 10 p-value thresholds
        - Also fix nt1/nt2 to A1 and A2 (check) from sumstats.
        """
        # filter to only samples held out from GWAS
        mt = mt_all.filter_cols(mt_all[pheno + '_holdout'])

        mt = mt.annotate_rows(ss=ss[mt.locus])
        mt = annotate_beta(mt, mt.ss)

        p_max = {
            's1': 5e-8,
            's2': 1e-6,
            's3': 1e-4,
            's4': 1e-3,
            's5': 1e-2,
            's6': .05,
            's7': .1,
            's8': .2,
            's9': .5,
            's10': 1
        }

        if args.clump_basename is None:
            pheno_clump = specific_clumps(args.dirname + pheno + '_' + args.basename + '.clumped')
        else:
            pheno_clump = specific_clumps(args.dirname + pheno + '_' + args.clump_basename + '.clumped')

        mt = mt.filter_rows(pheno_clump.get(mt.locus, False))
        print(mt.count())

        # divide by sd's of frequencies, i.e. sqrt(2pq), to get standardized betas
        # back to allelic scale for MAMA betas (only, not METAL)
        if args.betas_are_standardized:
            annot_expr = {
                k: hl.agg.sum(mt.beta /
                              hl.sqrt(2 * hl.float(mt.ss.FRQ) * (1 - hl.float(mt.ss.FRQ))) *
                              mt.dosage * hl.int(mt.ss.MAMA_PVAL < v))
                for k, v in p_max.items()
            }
        else:
            annot_expr = {
                k: hl.agg.sum(mt.beta * mt.dosage * hl.int(mt.ss.MAMA_PVAL < v))
                for k, v in p_max.items()
            }

        mt = mt.annotate_cols(**annot_expr)

        if args.clump_basename is None:
            mt.cols().write(args.dirname + 'UKB_' + pheno + '_' + args.basename + '_PRS.ht',
                            stage_locally=True,
                            overwrite=True)
            ht = hl.read_table(args.dirname + 'UKB_' + pheno + '_' + args.basename + '_PRS.ht')
        else:
            mt.cols().write(args.dirname + 'UKB_' + pheno + '_' + args.basename_out + '_PRS.ht',
                            stage_locally=True,
                            overwrite=True)
            ht = hl.read_table(args.dirname + 'UKB_' + pheno + '_' + args.basename_out + '_PRS.ht')

        ht_out = ht.drop(*[x for x in list(ht.row) if 'holdout' in x],
                         *[x for x in phenos if pheno not in x])

        if args.clump_basename is None:
            output_location = args.dirname + 'UKB_' + pheno + '_' + args.basename + '_PRS.txt.bgz'
        else:
            output_location = args.dirname + 'UKB_' + pheno + '_' + args.basename_out + '_PRS.txt.bgz'
        ht_out.export(output_location)

    end = time.time()
    print("Success! Job was completed in %s" %
          time.strftime("%H:%M:%S", time.gmtime(end - start)))
def run_meta_split(i):
    print('####################')
    print('Starting split ' + str(i))
    print('####################')
    starttime = datetime.datetime.now()

    pi = ['A'] * int(n_chunks / 2) + ['B'] * int(n_chunks / 2)

    seed_id = int(batch + str(i).zfill(4))  # create a seed_id unique to every split
    randstate = np.random.RandomState(seed_id)  # seed with seed_id
    randstate.shuffle(pi)

    gmt_shuf = gmt.annotate_cols(label=hl.literal(pi)[hl.int32(gmt.col_idx)])

    mt = gmt_shuf.group_cols_by(gmt_shuf.label).aggregate(
        unnorm_meta_beta=hl.agg.sum(gmt_shuf.beta / gmt_shuf.standard_error**2),
        inv_se2=hl.agg.sum(1 / gmt_shuf.standard_error**2)).key_rows_by('SNP')

    ht = mt.make_table()
    ht = ht.annotate(A_Z=ht['A.unnorm_meta_beta'] / hl.sqrt(ht['A.inv_se2']),
                     B_Z=ht['B.unnorm_meta_beta'] / hl.sqrt(ht['B.inv_se2']))
    ht = ht.drop('A.unnorm_meta_beta', 'B.unnorm_meta_beta', 'A.inv_se2', 'B.inv_se2')

    variants = hl.import_table('gs://nbaya/rg_sex/50_snps_alleles_N.tsv.gz',
                               types={'N': hl.tint64})
    variants = variants.key_by('SNP')
    # mt_all = hl.read_matrix_table('gs://nbaya/split/ukb31063.hm3_variants.gwas_samples_'+phen+'_grouped'+str(n_chunks)+'_batch_'+batch+'.mt')  # matrix table containing individual samples. OUTDATED
    ht_all = hl.read_table(
        'gs://nbaya/split/ukb31063.hm3_variants.gwas_samples_' + phen +
        '_grouped' + str(n_chunks) + '_batch_' + batch + '.ht')  # hail table containing individual samples

    variants = variants.annotate(N=hl.int32(ht_all.count() / 2))
    variants.show()

    metaA = variants.annotate(Z=ht[variants.SNP].A_Z)
    metaB = variants.annotate(Z=ht[variants.SNP].B_Z)

    # metaA_path = 'gs://nbaya/rg_sex/'+phen+'_meta_A_n'+str(n_chunks)+'_batch_'+batch+'_s'+str(i)+'.tsv.bgz'
    # metaB_path = 'gs://nbaya/rg_sex/'+phen+'_meta_B_n'+str(n_chunks)+'_batch_'+batch+'_s'+str(i)+'.tsv.bgz'
    metaA_path = ('gs://nbaya/rg_sex/' + variant_set + '_' + phen + '_meta_A_n' +
                  str(n_chunks) + '_batch_' + batch + '_s' + str(i) +
                  '.tsv.bgz')  # only used by qc_pos variant set and later hm3 phens
    metaB_path = ('gs://nbaya/rg_sex/' + variant_set + '_' + phen + '_meta_B_n' +
                  str(n_chunks) + '_batch_' + batch + '_s' + str(i) +
                  '.tsv.bgz')  # only used by qc_pos variant set and later hm3 phens

    metaA.export(metaA_path)
    metaB.export(metaB_path)

    endtime = datetime.datetime.now()
    elapsed = endtime - starttime
    print('####################')
    print('Completed iteration ' + str(i))
    print('Files written to:')
    print(metaA_path + '\t' + metaB_path)
    print('Time: {:%H:%M:%S (%Y-%b-%d)}'.format(datetime.datetime.now()))
    print('Iteration time: ' + str(round(elapsed.seconds / 60, 2)) + ' minutes')
    print('####################')
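# Hypothetical driver for run_meta_split: run a few splits sequentially. Assumes the
# module-level globals referenced above (phen, batch, n_chunks, variant_set, gmt)
# are already defined.
def _example_run_meta_splits(n_splits=3):
    for split in range(n_splits):
        run_meta_split(split)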