def test_make_table_row_equivalence(self):
    mt = hl.utils.range_matrix_table(3, 3)
    mt = mt.annotate_rows(r1=hl.rand_norm(), r2=hl.rand_norm())
    mt = mt.annotate_entries(e1=hl.rand_norm(), e2=hl.rand_norm())
    mt = mt.key_cols_by(col_idx=hl.str(mt.col_idx))
    assert mt.make_table().select(*mt.row_value)._same(mt.rows())

def calculate_phenotypes(mt, genotype, beta, h2, popstrat=None, popstrat_var=None):
    """Calculates phenotypes by multiplying genotypes and betas.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        :class:`.MatrixTable` with all relevant fields passed as parameters.
    genotype : :class:`.Expression`
        Entry field of genotypes.
    beta : :class:`.Expression`
        Row field of SNP effects.
    h2 : :obj:`float` or :obj:`int` or :obj:`list`
        SNP-based heritability (:math:`h^2`) of simulated trait. Can only be
        ``None`` if running the annotation-informed model.
    popstrat : :class:`.Expression`, optional
        Column field containing population stratification term.
    popstrat_var : :obj:`float` or :obj:`int`
        Variance of population stratification term.

    Returns
    -------
    :class:`.MatrixTable`
        :class:`.MatrixTable` with simulated phenotype as column field.
    """
    assert popstrat_var is None or (popstrat_var >= 0), 'popstrat_var must be non-negative'
    # "temporary id" -- random string identifying temporary intermediate fields
    # generated by this method (assumes `random` and `string` are imported at module level)
    tid = ''.join(random.choices(string.ascii_uppercase + string.ascii_lowercase, k=5))
    mt = annotate_all(mt=mt,
                      row_exprs={'beta_' + tid: beta},
                      col_exprs={} if popstrat is None else {'popstrat_' + tid: popstrat},
                      entry_exprs={'gt_' + tid: genotype})
    mt = normalize_genotypes(mt['gt_' + tid])
    if mt['beta_' + tid].dtype == hl.dtype('array<float64>'):  # if >1 traits
        h2 = h2 if type(h2) is list else [h2]
        mt = mt.annotate_cols(y_no_noise=hl.agg.array_agg(
            lambda beta: hl.agg.sum(beta * mt['norm_gt']), mt['beta_' + tid]))
        mt = mt.annotate_cols(
            y=mt.y_no_noise + hl.literal(h2).map(lambda x: hl.rand_norm(0, hl.sqrt(1 - x))))
    else:
        mt = mt.annotate_cols(y_no_noise=hl.agg.sum(mt['beta_' + tid] * mt['norm_gt']))
        mt = mt.annotate_cols(y=mt.y_no_noise + hl.rand_norm(0, hl.sqrt(1 - h2)))
    if popstrat is not None:
        var_factor = 1 if popstrat_var is None else (
            popstrat_var ** (1 / 2)) / mt.aggregate_cols(hl.agg.stats(mt['popstrat_' + tid])).stdev
        mt = mt.annotate_cols(y_w_popstrat=mt.y + mt['popstrat_' + tid] * var_factor)
    mt = _clean_fields(mt, tid)
    return mt

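# A minimal usage sketch of the function above, assuming a MatrixTable `mt`
# that already carries a row field `beta` of SNP effects and a call entry
# field `GT`; both field names are illustrative, not part of the contract.
sim = calculate_phenotypes(mt, genotype=mt.GT.n_alt_alleles(), beta=mt.beta, h2=0.5)
sim.cols().select('y').show()
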
def test_maximal_independent_set_types(self):
    ht = hl.utils.range_table(10)
    ht = ht.annotate(i=hl.struct(a='1', b=hl.rand_norm(0, 1)),
                     j=hl.struct(a='2', b=hl.rand_norm(0, 1)))
    ht = ht.annotate(ii=hl.struct(id=ht.i, rank=hl.rand_norm(0, 1)),
                     jj=hl.struct(id=ht.j, rank=hl.rand_norm(0, 1)))
    hl.maximal_independent_set(ht.ii, ht.jj).count()

def make_betas(mt, h2=None, pi=1, is_annot_inf=False, annot_coef_dict=None,
               annot_regex=None, h2_normalize=True):
    '''Simulate betas. Options: infinitesimal model, spike & slab, annotation-informed.'''
    check_beta_args(h2=h2, pi=pi, is_annot_inf=is_annot_inf,
                    annot_coef_dict=annot_coef_dict, annot_regex=annot_regex,
                    h2_normalize=h2_normalize)
    M = mt.count_rows()
    if is_annot_inf:
        print('\rSimulating {} annotation-informed betas {}'.format(
            'h2-normalized' if h2_normalize else '',
            '(default coef: 1)' if annot_coef_dict is None else 'using annot_coef_dict'))
        mt1 = agg_fields(mt=mt, coef_dict=annot_coef_dict, regex=annot_regex)
        annot_sum = mt1.aggregate_rows(hl.agg.sum(mt1.__agg_annot))
        # if h2_normalize: scale variance of betas to be h2; else keep unscaled variance
        return mt1.annotate_rows(__beta=hl.rand_norm(
            0, hl.sqrt(mt1.__agg_annot * (h2 / annot_sum if h2_normalize else 1))))
    else:
        print('Simulating betas using {} model w/ h2 = {}'.format(
            'infinitesimal' if pi == 1 else 'spike & slab', h2))
        mt1 = mt.annotate_globals(__h2=none_to_null(h2), __pi=pi)
        return mt1.annotate_rows(__beta=hl.rand_bool(pi) * hl.rand_norm(0, hl.sqrt(h2 / (M * pi))))

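# A hedged sketch of the three modes this version of make_betas supports.
# `mt` is any MatrixTable; the annotation-informed call additionally assumes
# annotation row fields that agg_fields can match, and 'annot_.*' is a
# made-up regex for illustration.
mt_inf = make_betas(mt, h2=0.3)                # infinitesimal
mt_ss = make_betas(mt, h2=0.3, pi=0.01)        # spike & slab, 1% causal
mt_ann = make_betas(mt, h2=0.3, is_annot_inf=True, annot_regex='annot_.*')
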
def make_betas(mt, h2, pi=1, annot=None):
    M = mt.count_rows()
    if annot is not None:
        annot_stats = mt.aggregate_rows(hl.agg.stats(mt.__annot), _localize=True)
        return mt.annotate_rows(__beta=hl.rand_norm(
            0, (mt.__annot - annot_stats.mean) / annot_stats.stdev * hl.sqrt(h2 / M)))
    else:
        return mt.annotate_rows(__beta=hl.rand_bool(pi) * hl.rand_norm(0, hl.sqrt(h2 / (M * pi))))

def test_plot_roc_curve(self):
    x = hl.utils.range_table(100).annotate(score1=hl.rand_norm(), score2=hl.rand_norm())
    x = x.annotate(tp=hl.cond(x.score1 > 0, hl.rand_bool(0.7), False),
                   score3=x.score1 + hl.rand_norm())
    ht = x.annotate(fp=hl.cond(~x.tp, hl.rand_bool(0.2), False))
    _, aucs = hl.experimental.plot_roc_curve(ht, ['score1', 'score2', 'score3'])

def make_betas(mt, h2, pi=1, annot=None):
    '''Simulate betas. Options: infinitesimal model, spike & slab, annotation-informed.'''
    M = mt.count_rows()
    if annot is not None:
        print('\rSimulating annotation-informed betas w/ h2 = {}'.format(h2))
        mt1 = mt._annotate_all(row_exprs={'__annot': annot},
                               global_exprs={'__h2': h2})
        annot_sum = mt1.aggregate_rows(hl.agg.sum(mt1.__annot))
        return mt1.annotate_rows(__beta=hl.rand_norm(0, hl.sqrt(mt1.__annot / annot_sum * h2)))
    else:
        print('Simulating betas using {} model w/ h2 = {}'.format(
            'infinitesimal' if pi == 1 else 'spike & slab', h2))
        mt1 = mt.annotate_globals(__h2=h2, __pi=pi)
        return mt1.annotate_rows(__beta=hl.rand_bool(pi) * hl.rand_norm(0, hl.sqrt(h2 / (M * pi))))

def get_subset(mt_pop, pop_dict: dict, pops: list, n_max: int):
    r'''
    Get Hail table sample of max size = `n_max` for list of populations `pops`.
    '''
    pop_prop_dict, n_total = get_pop_prop_dict(pop_dict=pop_dict, pops=pops)
    limiting_pop = min(pop_prop_dict, key=pop_prop_dict.get)
    n_sample = int(min(pop_dict[limiting_pop] / pop_prop_dict[limiting_pop], n_max))
    if n_sample != n_max:
        print(f'Using sample size of {n_sample} instead of {n_max} '
              f'due to limiting population size in {limiting_pop}')
    print({k: v * n_sample for k, v in pop_prop_dict.items()})
    cols = mt_pop.cols()
    # if sampling a single population `pop` and n_sample equals that population's size
    if len(pops) == 1 and n_sample == pop_dict[pops[0]]:
        ht_sample = cols
    else:
        cols = cols.annotate(tmp_rand=hl.rand_norm())
        cols = cols.order_by('tmp_rand')
        cols = cols.add_index(name='rand_idx')
        ht_sample = cols.filter(cols.rand_idx < n_sample)
        ht_sample = ht_sample.drop('tmp_rand', 'rand_idx')
    ht_sample = ht_sample.key_by('s')
    ht_sample = ht_sample.select('pop')  # keyed by 's', so the two remaining fields are 'pop' and 's'
    return ht_sample

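# Hypothetical call: draw at most 10,000 samples across two populations while
# preserving the proportions implied by `pop_dict`. The population labels and
# counts below are made up for illustration.
ht_sub = get_subset(mt_pop,
                    pop_dict={'EUR': 50000, 'AFR': 20000},
                    pops=['EUR', 'AFR'],
                    n_max=10000)
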
def calculate_phenotypes(mt, genotype, h2, beta, is_popstrat=False,
                         cov_coef_dict=None, cov_regex=None):
    '''Calculates phenotypes given betas and genotypes. Adding population stratification is optional.'''
    check_mt_sources(mt, genotype, beta)
    check_popstrat_args(is_popstrat=is_popstrat, cov_coef_dict=cov_coef_dict, cov_regex=cov_regex)
    mt1 = mt._annotate_all(row_exprs={'__beta': beta},
                           entry_exprs={'__gt': genotype},
                           global_exprs={'__is_popstrat': is_popstrat,
                                         '__cov_coef_dict': none_to_null(cov_coef_dict),
                                         '__cov_regex': none_to_null(cov_regex)})
    mt2 = normalize_genotypes(mt1.__gt)
    print('\rCalculating phenotypes{}...'.format(
        ' w/ population stratification' if is_popstrat else '').ljust(81))
    mt3 = mt2.annotate_cols(__y_no_noise=hl.agg.sum(mt2.__beta * mt2.__norm_gt))
    if h2 is None:
        h2 = mt3.aggregate_cols(hl.agg.stats(mt3.__y_no_noise)).stdev ** 2
        if h2 > 1:
            print(f'WARNING: Total SNP-based h2 = {h2} (>1)')
            print('Not adding environmental noise')
            h2 = 1
    mt4 = mt3.annotate_cols(__y=mt3.__y_no_noise + hl.rand_norm(0, hl.sqrt(1 - h2)))
    if is_popstrat:
        return add_popstrat(mt4, y=mt4.__y, cov_coef_dict=cov_coef_dict, cov_regex=cov_regex)
    else:
        return mt4

def sim_phenotypes(mt, h2, popstrat=None, popstrat_c=None):
    mt1 = mt.annotate_cols(__y_no_noise=hl.agg.sum(mt.__beta * mt.__norm_gt))
    mt2 = mt1.annotate_cols(__y=mt1.__y_no_noise + hl.rand_norm(0, hl.sqrt(1 - h2)))
    if popstrat is not None:
        return add_pop_strat(mt2, popstrat, popstrat_c)
    else:
        return mt2

def sim_phenotypes(mt, genotype, h2, beta, popstrat=None, popstrat_s2=1):
    '''Simulate phenotypes given betas and genotypes. Adding population stratification is optional.'''
    print('\rCalculating phenotypes{}...'.format(
        '' if popstrat is None else ' w/ population stratification').ljust(81))
    if popstrat is None:
        mt1 = mt._annotate_all(row_exprs={'__beta': beta},
                               entry_exprs={'__gt': genotype})
    else:
        mt1 = mt._annotate_all(row_exprs={'__beta': beta},
                               col_exprs={'__popstrat': popstrat},
                               entry_exprs={'__gt': genotype},
                               global_exprs={'__popstrat_s2': popstrat_s2})
    mt2 = normalize_genotypes(mt1, mt1.__gt)
    mt3 = mt2.annotate_cols(__y_no_noise=hl.agg.sum(mt2.__beta * mt2.__norm_gt))
    mt4 = mt3.annotate_cols(__y=mt3.__y_no_noise + hl.rand_norm(0, hl.sqrt(1 - h2)))
    if popstrat is None:
        return mt4
    else:
        return add_pop_strat(mt4, y=mt4.__y, popstrat=mt4.__popstrat,
                             popstrat_s2=hl.eval(mt4.__popstrat_s2))

def get_shuffled_ht(ht, phen: str, is_cas: bool, seed=None):
    r'''
    Returns a shuffled Table of cases if `is_cas`=True, controls if `is_cas`=False.
    Case status is determined by the binary field `phen`.
    '''
    ht = ht.filter(ht[phen] == is_cas)
    ht = ht.annotate(tmp_rand=hl.rand_norm(seed=seed))
    ht = ht.order_by('tmp_rand')
    ht = ht.add_index('tmp_idx')
    return ht

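# Usage sketch, assuming a Table `ht` with a boolean phenotype field named
# 'phen_x' (the field name is illustrative). Passing the same seed gives a
# reproducible shuffle for both strata.
cases = get_shuffled_ht(ht, phen='phen_x', is_cas=True, seed=42)
controls = get_shuffled_ht(ht, phen='phen_x', is_cas=False, seed=42)
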
def test(self):
    schema = hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tint32, d=hl.tint32, e=hl.tstr,
                        f=hl.tarray(hl.tint32),
                        g=hl.tarray(hl.tstruct(x=hl.tint32, y=hl.tint32, z=hl.tstr)),
                        h=hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tstr),
                        i=hl.tbool,
                        j=hl.tstruct(x=hl.tint32, y=hl.tint32, z=hl.tstr))
    rows = [{'a': 4, 'b': 1, 'c': 3, 'd': 5, 'e': "hello", 'f': [1, 2, 3],
             'g': [hl.Struct(x=1, y=5, z='banana')],
             'h': hl.Struct(a=5, b=3, c='winter'),
             'i': True,
             'j': hl.Struct(x=3, y=2, z='summer')}]
    kt = hl.Table.parallelize(rows, schema)

    result = convert_struct_to_dict(kt.annotate(
        chisq=hl.chisq(kt.a, kt.b, kt.c, kt.d),
        ctt=hl.ctt(kt.a, kt.b, kt.c, kt.d, 5),
        dict=hl.dict(hl.zip([kt.a, kt.b], [kt.c, kt.d])),
        dpois=hl.dpois(4, kt.a),
        drop=kt.h.drop('b', 'c'),
        exp=hl.exp(kt.c),
        fet=hl.fisher_exact_test(kt.a, kt.b, kt.c, kt.d),
        hwe=hl.hardy_weinberg_p(1, 2, 1),
        index=hl.index(kt.g, 'z'),
        is_defined=hl.is_defined(kt.i),
        is_missing=hl.is_missing(kt.i),
        is_nan=hl.is_nan(hl.float64(kt.a)),
        json=hl.json(kt.g),
        log=hl.log(kt.a, kt.b),
        log10=hl.log10(kt.c),
        or_else=hl.or_else(kt.a, 5),
        or_missing=hl.or_missing(kt.i, kt.j),
        pchisqtail=hl.pchisqtail(kt.a, kt.b),
        pcoin=hl.rand_bool(0.5),
        pnorm=hl.pnorm(0.2),
        pow=2.0 ** kt.b,
        ppois=hl.ppois(kt.a, kt.b),
        qchisqtail=hl.qchisqtail(kt.a, kt.b),
        range=hl.range(0, 5, kt.b),
        rnorm=hl.rand_norm(0.0, kt.b),
        rpois=hl.rand_pois(kt.a),
        runif=hl.rand_unif(kt.b, kt.a),
        select=kt.h.select('c', 'b'),
        sqrt=hl.sqrt(kt.a),
        to_str=[hl.str(5), hl.str(kt.a), hl.str(kt.g)],
        where=hl.cond(kt.i, 5, 10)
    ).take(1)[0])

def sim_corr_phenotypes(mt, cov_array):
    h2_ls = np.diag(cov_array)
    n_phens = len(h2_ls)
    for i in range(n_phens):
        mt = mt._annotate_all(col_exprs={
            f'__y_no_noise_{i}': hl.agg.sum(mt[f'__beta_{i}'] * mt.__norm_gt)})
    for i in range(n_phens):
        mt = mt._annotate_all(col_exprs={
            f'__y_{i}': mt[f'__y_no_noise_{i}'] + hl.rand_norm(0, hl.sqrt(1 - h2_ls[i]))})
    return mt

def calculate_phenotypes(mt, genotype, h2, beta, is_popstrat=False,
                         cov_coef_dict=None, cov_regex=None, normalize_gt=True):
    '''Simulate phenotypes given betas and genotypes. Adding population stratification is optional.'''
    check_mt_sources(mt, genotype, beta)
    check_popstrat_args(is_popstrat=is_popstrat, cov_coef_dict=cov_coef_dict, cov_regex=cov_regex)
    mt1 = mt._annotate_all(row_exprs={'__beta': beta},
                           entry_exprs={'__gt': genotype},
                           global_exprs={'__is_popstrat': is_popstrat,
                                         '__cov_coef_dict': none_to_null(cov_coef_dict),
                                         '__cov_regex': none_to_null(cov_regex)})
    if normalize_gt:
        mt2 = normalize_genotypes(mt1.__gt)
    else:
        mt2 = mt1.annotate_entries(__norm_gt=mt1.__gt)
    print('\rCalculating phenotypes{}...'.format(
        ' w/ population stratification' if is_popstrat else '').ljust(81))
    mt3 = mt2.annotate_cols(__y_no_noise=hl.agg.sum(mt2.__beta * mt2.__norm_gt))
    if h2 is None:
        h2 = mt3.aggregate_cols(hl.agg.stats(mt3.__y_no_noise)).stdev ** 2
        if h2 > 1:
            print(f'WARNING: Total SNP-based h2 = {h2} (>1)')
            print('Not adding environmental noise')
            h2 = 1
    mt4 = mt3.annotate_cols(__y=mt3.__y_no_noise + hl.rand_norm(0, hl.sqrt(1 - h2)))
    if is_popstrat:
        return add_popstrat(mt4, y=mt4.__y, cov_coef_dict=cov_coef_dict, cov_regex=cov_regex)
    else:
        return mt4

def _reduced_svd(A: TallSkinnyMatrix, k=10, compute_U=False, iterations=2, iteration_size=None):
    # Set parameters
    q = iterations
    if iteration_size is None:
        L = k + 2
    else:
        L = iteration_size
    assert (q + 1) * L >= k
    n = A.ncols

    # Generate random matrix G
    G = hl.nd.zeros((n, L)).map(lambda n: hl.rand_norm(0, 1))
    G = hl.nd.qr(G)[0]._persist()

    fact = _krylov_factorization(A, G, q, compute_U)
    info("_reduced_svd: Computing local SVD")
    return fact.reduced_svd(k)

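# The random-projection step above in isolation: hl.rand_norm fills a test
# matrix entrywise, and QR orthonormalizes its columns. A small standalone
# sanity check (the shapes here are illustrative, not the function's defaults):
G = hl.nd.zeros((6, 3)).map(lambda _: hl.rand_norm(0, 1))
Q = hl.nd.qr(G)[0]
print(hl.eval(Q.T @ Q))  # approximately the 3x3 identity
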
def test_linear_mixed_regression_pass_through(self):
    x_table = hl.import_table(resource('fastlmmCov.txt'),
                              no_header=True, impute=True).key_by('f1')
    y_table = hl.import_table(resource('fastlmmPheno.txt'),
                              no_header=True, impute=True, delimiter=' ').key_by('f1')
    mt = hl.import_plink(bed=resource('fastlmmTest.bed'),
                         bim=resource('fastlmmTest.bim'),
                         fam=resource('fastlmmTest.fam'),
                         reference_genome=None)
    mt = mt.annotate_cols(x=x_table[mt.col_key].f2)
    mt = mt.annotate_cols(y=y_table[mt.col_key].f2).cache()
    p_path = utils.new_temp_file()

    mt_chr1 = mt.filter_rows((mt.locus.contig == '1') & (mt.locus.position < 200))
    model, _ = hl.linear_mixed_model(y=mt_chr1.y, x=[1, mt_chr1.x],
                                     z_t=mt_chr1.GT.n_alt_alleles(), p_path=p_path)
    model.fit(log_gamma=0)

    mt_chr3 = mt.filter_rows((mt.locus.contig == '3') & (mt.locus.position < 2005))
    mt_chr3 = mt_chr3.annotate_rows(stats=hl.agg.stats(mt_chr3.GT.n_alt_alleles()),
                                    foo=hl.struct(bar=hl.rand_norm(0, 1)))
    ht = hl.linear_mixed_regression_rows(
        (mt_chr3.GT.n_alt_alleles() - mt_chr3.stats.mean) / mt_chr3.stats.stdev,
        model,
        pass_through=['stats', mt_chr3.foo.bar, mt_chr3.cm_position])
    assert mt_chr3.aggregate_rows(hl.agg.all(mt_chr3.foo.bar == ht[mt_chr3.row_key].bar))

def make_random_function(self, mt):
    M = mt.count_rows()  # number of variants
    return hl.rand_norm(0, hl.sqrt(self.h2 / M))  # SQUARE ROOT?

ds = hl.import_vcf('data/sample.vcf.bgz')
ds = ds.sample_rows(0.03)
ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5),
                      panel_maf=0.1,
                      anno1=5,
                      anno2=0,
                      consequence="LOF",
                      gene="A",
                      score=5.0)
ds = ds.annotate_rows(a_index=1)
ds = hl.sample_qc(hl.variant_qc(ds))
ds = ds.annotate_cols(is_case=True,
                      pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                      is_female=hl.rand_bool(0.5),
                                      age=hl.rand_norm(65, 10),
                                      height=hl.rand_norm(70, 10),
                                      blood_pressure=hl.rand_norm(120, 20),
                                      cohort_name="cohort1"),
                      cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                      cov1=hl.rand_norm(0, 1),
                      cov2=hl.rand_norm(0, 1),
                      cohort="SIGMA")
ds = ds.annotate_globals(global_field_1=5,
                         global_field_2=10,
                         pli={'SCN1A': 0.999, 'SONIC': 0.014},
                         populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])

def _blanczos_pca(entry_expr, k=10, compute_loadings=False, q_iterations=2,
                  oversampling_param=2, block_size=128):
    r"""Run randomized principal component analysis approximation (PCA)
    on numeric columns derived from a matrix table.

    Implements the Blanczos algorithm due to Rokhlin, Szlam, and Tygert.

    Examples
    --------
    For a matrix table with variant rows, sample columns, and genotype entries,
    compute the top 2 PC sample scores and eigenvalues of the matrix of 0s and
    1s encoding missingness of genotype calls.

    >>> eigenvalues, scores, _ = hl._blanczos_pca(hl.int(hl.is_defined(dataset.GT)),
    ...                                           k=2)

    Warning
    -------
    This method does **not** automatically mean-center or normalize each column.
    If desired, such transformations should be incorporated in `entry_expr`.

    Hail will return an error if `entry_expr` evaluates to missing, nan, or
    infinity on any entry.

    Notes
    -----
    PCA is run on the columns of the numeric matrix obtained by evaluating
    `entry_expr` on each entry of the matrix table, or equivalently on the rows
    of the **transposed** numeric matrix :math:`M` referenced below.

    PCA computes the SVD

    .. math::

      M = USV^T

    where columns of :math:`U` are left singular vectors (orthonormal in
    :math:`\mathbb{R}^n`), columns of :math:`V` are right singular vectors
    (orthonormal in :math:`\mathbb{R}^m`), and
    :math:`S=\mathrm{diag}(s_1, s_2, \ldots)` with ordered singular values
    :math:`s_1 \ge s_2 \ge \cdots \ge 0`.

    Typically one computes only the first :math:`k` singular vectors and values,
    yielding the best rank :math:`k` approximation :math:`U_k S_k V_k^T` of
    :math:`M`; the truncations :math:`U_k`, :math:`S_k` and :math:`V_k` are
    :math:`n \times k`, :math:`k \times k` and :math:`m \times k` respectively.

    From the perspective of the rows of :math:`M` as samples (data points),
    :math:`V_k` contains the loadings for the first :math:`k` PCs while
    :math:`MV_k = U_k S_k` contains the first :math:`k` PC scores of each
    sample. The loadings represent a new basis of features while the scores
    represent the projected data on those features. The eigenvalues of the
    Gramian :math:`MM^T` are the squares of the singular values
    :math:`s_1^2, s_2^2, \ldots`, which represent the variances carried by the
    respective PCs.

    By default, Hail only computes the loadings if the ``compute_loadings``
    parameter is specified.

    Scores are stored in a :class:`.Table` with the column key of the matrix
    table as key and a field `scores` of type ``array<float64>`` containing the
    principal component scores.

    Loadings are stored in a :class:`.Table` with the row key of the matrix
    table as key and a field `loadings` of type ``array<float64>`` containing
    the principal component loadings.

    The eigenvalues are returned in descending order, with scores and loadings
    given the corresponding array order.

    Parameters
    ----------
    entry_expr : :class:`.Expression`
        Numeric expression for matrix entries.
    k : :obj:`int`
        Number of principal components.
    compute_loadings : :obj:`bool`
        If ``True``, compute row loadings.
    q_iterations : :obj:`int`
        Number of rounds of power iteration to amplify singular values.
    oversampling_param : :obj:`int`
        Amount of oversampling to use when approximating the singular values.
        Usually a value satisfying `0 <= oversampling_param <= k`.

    Returns
    -------
    (:obj:`list` of :obj:`float`, :class:`.Table`, :class:`.Table`)
        List of eigenvalues, table with column scores, table with row loadings.
    """
    check_entry_indexed('mt_to_table_of_ndarray/entry_expr', entry_expr)
    mt = matrix_table_source('pca/entry_expr', entry_expr)

    A, ht = mt_to_table_of_ndarray(entry_expr, block_size, return_checkpointed_table_also=True)
    A = A.persist()

    # Set parameters
    q = q_iterations
    L = k + oversampling_param
    n = A.take(1)[0].ndarray.shape[1]

    # Generate random matrix G
    G = hl.nd.zeros((n, L)).map(lambda n: hl.rand_norm(0, 1))

    def hailBlanczos(A, G, k, q):
        h_list = []
        G_i = hl.nd.qr(G)[0]

        for j in range(0, q):
            info(f"blanczos_pca: Beginning iteration {j + 1}/{q + 1}")
            temp = A.annotate(H_i=A.ndarray @ G_i)
            temp = temp.annotate(G_i_intermediate=temp.ndarray.T @ temp.H_i)
            result = temp.aggregate(hl.struct(
                Hi_chunks=hl.agg.collect(temp.H_i),
                G_i=hl.agg.ndarray_sum(temp.G_i_intermediate)),
                _localize=False)._persist()
            localized_H_i = hl.nd.vstack(result.Hi_chunks)
            h_list.append(localized_H_i)
            G_i = hl.nd.qr(result.G_i)[0]

        info(f"blanczos_pca: Beginning iteration {q + 1}/{q + 1}")
        temp = A.annotate(H_i=A.ndarray @ G_i)
        result = temp.aggregate(hl.agg.collect(temp.H_i), _localize=False)._persist()
        info("blanczos_pca: Iterations complete. Computing local QR")
        localized_H_i = hl.nd.vstack(result)
        h_list.append(localized_H_i)
        H = hl.nd.hstack(h_list)
        Q = hl.nd.qr(H)[0]._persist()
        A = A.annotate(part_size=A.ndarray.shape[0])
        A = A.annotate(rows_preceeding=hl.int32(hl.scan.sum(A.part_size)))
        A = A.annotate_globals(Qt=Q.T)
        T = A.annotate(ndarray=A.Qt[:, A.rows_preceeding:A.rows_preceeding + A.part_size] @ A.ndarray)
        arr_T = T.aggregate(hl.agg.ndarray_sum(T.ndarray), _localize=False)

        info("blanczos_pca: QR Complete. Computing local SVD")
        U, S, W = hl.nd.svd(arr_T, full_matrices=False)._persist()

        V = Q @ U
        truncV = V[:, :k]
        truncS = S[:k]
        truncW = W[:k, :]
        return truncV, truncS, truncW

    U, S, V = hailBlanczos(A, G, k, q)
    scores = V.transpose() * S
    eigens = hl.eval(S * S)
    info("blanczos_pca: SVD Complete. Computing conversion to PCs.")

    hail_array_scores = scores._data_array()
    cols_and_scores = hl.zip(A.index_globals().cols, hail_array_scores).map(
        lambda tup: tup[0].annotate(scores=tup[1]))
    st = hl.Table.parallelize(cols_and_scores, key=list(mt.col_key))

    lt = ht.select()
    lt = lt.annotate_globals(U=U)
    idx_name = '_tmp_pca_loading_index'
    lt = lt.add_index(idx_name)
    lt = lt.annotate(loadings=lt.U[lt[idx_name], :]._data_array()).select_globals()
    lt = lt.drop(lt[idx_name])

    if compute_loadings:
        return eigens, st, lt
    else:
        return eigens, st, None

def generate_datasets(doctest_namespace):
    doctest_namespace['hl'] = hl
    doctest_namespace['np'] = np

    ds = hl.import_vcf('data/sample.vcf.bgz')
    ds = ds.sample_rows(0.03)
    ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5),
                          panel_maf=0.1,
                          anno1=5,
                          anno2=0,
                          consequence="LOF",
                          gene="A",
                          score=5.0)
    ds = ds.annotate_rows(a_index=1)
    ds = hl.sample_qc(hl.variant_qc(ds))
    ds = ds.annotate_cols(is_case=True,
                          pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                          is_female=hl.rand_bool(0.5),
                                          age=hl.rand_norm(65, 10),
                                          height=hl.rand_norm(70, 10),
                                          blood_pressure=hl.rand_norm(120, 20),
                                          cohort_name="cohort1"),
                          cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                          cov1=hl.rand_norm(0, 1),
                          cov2=hl.rand_norm(0, 1),
                          cohort="SIGMA")
    ds = ds.annotate_globals(global_field_1=5,
                             global_field_2=10,
                             pli={'SCN1A': 0.999, 'SONIC': 0.014},
                             populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
    ds = ds.annotate_rows(gene=['TTN'])
    ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
    ds = ds.checkpoint('output/example.mt', overwrite=True)
    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata
    doctest_namespace['cols_to_keep'] = s_metadata
    doctest_namespace['cols_to_remove'] = s_metadata
    doctest_namespace['rows_to_keep'] = v_metadata
    doctest_namespace['rows_to_remove'] = v_metadata

    small_mt = hl.balding_nichols_model(3, 4, 4)
    doctest_namespace['small_mt'] = small_mt.checkpoint('output/small.mt', overwrite=True)

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    table4 = hl.import_table('data/kt_example4.tsv', impute=True,
                             types={'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                    'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                    'E': hl.tstruct(A=hl.tint32, B=hl.tint32)})
    doctest_namespace['table4'] = table4

    people_table = hl.import_table('data/explode_example.tsv',
                                   delimiter='\\s+',
                                   types={'Age': hl.tint32, 'Children': hl.tarray(hl.tstr)},
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.null(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({'Alice': 43, 'Bob': 33, 'Charles': 44})
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval("1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    doctest_namespace['interval2'] = hl.Interval(3, 6)
    doctest_namespace['nd'] = hl._nd.array([[1, 2], [3, 4]])

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv", impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen', entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    burden_ds = hl.import_vcf('data/example_burden.vcf')
    burden_kt = hl.import_table('data/example_burden.tsv', key='Sample', impute=True)
    burden_ds = burden_ds.annotate_cols(burden=burden_kt[burden_ds.s])
    burden_ds = burden_ds.annotate_rows(weight=hl.float64(burden_ds.locus.position))
    burden_ds = hl.variant_qc(burden_ds)
    genekt = hl.import_locus_intervals('data/gene.interval_list')
    burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
    burden_ds = burden_ds.checkpoint('output/example_burden.vds', overwrite=True)
    doctest_namespace['burden_ds'] = burden_ds

    ld_score_one_pheno_sumstats = hl.import_table(
        'data/ld_score_regression.one_pheno.sumstats.tsv',
        types={'locus': hl.tlocus('GRCh37'),
               'alleles': hl.tarray(hl.tstr),
               'chi_squared': hl.tfloat64,
               'n': hl.tint32,
               'ld_score': hl.tfloat64,
               'phenotype': hl.tstr,
               'chi_squared_50_irnt': hl.tfloat64,
               'n_50_irnt': hl.tint32,
               'chi_squared_20160': hl.tfloat64,
               'n_20160': hl.tint32},
        key=['locus', 'alleles'])
    doctest_namespace['ld_score_one_pheno_sumstats'] = ld_score_one_pheno_sumstats

    mt = hl.import_matrix_table(
        'data/ld_score_regression.all_phenos.sumstats.tsv',
        row_fields={'locus': hl.tstr, 'alleles': hl.tstr, 'ld_score': hl.tfloat64},
        entry_type=hl.tstr)
    mt = mt.key_cols_by(phenotype=mt.col_id)
    mt = mt.key_rows_by(locus=hl.parse_locus(mt.locus), alleles=mt.alleles.split(','))
    mt = mt.drop('row_id', 'col_id')
    mt = mt.annotate_entries(x=mt.x.split(","))
    mt = mt.transmute_entries(chi_squared=hl.float64(mt.x[0]), n=hl.int32(mt.x[1]))
    mt = mt.annotate_rows(ld_score=hl.float64(mt.ld_score))
    doctest_namespace['ld_score_all_phenos_sumstats'] = mt

    print("finished setting up doctest...")

ds = hl.import_vcf('data/sample.vcf.bgz')
ds = ds.sample_rows(0.03)
ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5),
                      panel_maf=0.1,
                      anno1=5,
                      anno2=0,
                      consequence="LOF",
                      gene="A",
                      score=5.0)
ds = ds.annotate_rows(a_index=1)
ds = hl.sample_qc(hl.variant_qc(ds))
ds = ds.annotate_cols(is_case=True,
                      pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                      is_female=hl.rand_bool(0.5),
                                      age=hl.rand_norm(65, 10),
                                      height=hl.rand_norm(70, 10),
                                      blood_pressure=hl.rand_norm(120, 20),
                                      cohort_name="cohort1"),
                      cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                      cov1=hl.rand_norm(0, 1),
                      cov2=hl.rand_norm(0, 1),
                      cohort="SIGMA")
ds = ds.annotate_globals(global_field_1=5,
                         global_field_2=10,
                         pli={'SCN1A': 0.999, 'SONIC': 0.014},
                         populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
ds = ds.annotate_rows(gene=['TTN'])
ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
ds.write('data/example.vds', overwrite=True)

def make_betas(mt, h2, pi=1, annot=None, rg=None):
    """Generates betas under different models.

    Simulates betas (SNP effects) under the infinitesimal, spike & slab, or
    annotation-informed models, depending on parameters passed.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        MatrixTable containing genotypes to be used. Also should contain
        variant annotations as row fields if running the annotation-informed
        model or covariates as column fields if adding population stratification.
    h2 : :obj:`float` or :obj:`int` or :obj:`list`
        SNP-based heritability of simulated trait(s).
    pi : :obj:`float` or :obj:`int` or :obj:`list`
        Probability of SNP being causal when simulating under the spike & slab
        model. If doing two-trait spike & slab `pi` is a list of probabilities for
        overlapping causal SNPs (see docstring of :func:`.multitrait_ss`)
    annot : :class:`.Expression`
        Row field of aggregated annotations for annotation-informed model.
    rg : :obj:`float` or :obj:`int` or :obj:`list`, optional
        Genetic correlation between traits.

    Returns
    -------
    :class:`.MatrixTable`
        :class:`.MatrixTable` with betas as a row field, simulated according
        to specified model.
    """
    h2 = [h2] if type(h2) is not list else h2
    pi = [pi] if type(pi) is not list else pi
    rg = [rg] if type(rg) is not list else rg
    assert all(x >= 0 and x <= 1 for x in h2), 'h2 values must be between 0 and 1'
    assert all(x >= 0 and x <= 1 for x in pi), \
        'pi values for spike & slab must be between 0 and 1'
    assert rg == [None] or all(x >= 0 and x <= 1 for x in rg), \
        'rg values must be between 0 and 1 or None'
    if annot is not None:  # multi-trait annotation-informed
        assert rg == [None], 'Correlated traits not supported for annotation-informed model'
        h2 = h2 if type(h2) is list else [h2]
        M = mt.count_rows()
        annot_var = mt.aggregate_rows(hl.agg.stats(annot)).stdev ** 2
        # scale variance of betas so that total SNP-based heritability is h2
        mt = mt.annotate_rows(beta=hl.literal(h2).map(
            lambda x: hl.rand_norm(0, hl.sqrt(annot * x / (annot_var * M)))))
        return mt
    elif len(h2) > 1 and pi == [1]:  # multi-trait correlated infinitesimal
        return multitrait_inf(mt=mt, h2=h2, rg=rg)
    elif len(h2) == 2 and len(pi) > 1:  # two-trait correlated spike & slab
        return multitrait_ss(mt=mt, h2=h2, rg=0 if rg == [None] else rg[0], pi=pi)
    elif len(h2) == 1 and len(pi) == 1:  # single-trait infinitesimal/spike & slab
        M = mt.count_rows()
        return mt.annotate_rows(beta=hl.rand_bool(pi[0])
                                * hl.rand_norm(0, hl.sqrt(h2[0] / (M * pi[0]))))
    else:
        raise ValueError('Insufficient parameters')

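# Hypothetical single-trait spike & slab call against the function above:
# 1% of SNPs causal, SNP-based heritability of 0.4.
mt_beta = make_betas(mt, h2=0.4, pi=0.01)
mt_beta.rows().select('beta').show(5)
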
def make_random_function(self, mt):  # pi is slab prob
    M = mt.count_rows()  # number of variants
    # return hl.cond(hl.rand_unif(0, 1) < self.pi, hl.rand_norm(0, self.h2 / (M * self.pi)), 0)
    # return hl.cond(hl.rand_bool(self.pi), hl.rand_norm(0, hl.sqrt(self.h2 / (M * self.pi))), 0)
    return hl.rand_bool(self.pi) * hl.rand_norm(0, self.h2 / (M * self.pi))

def run_rf_test(
    mt: hl.MatrixTable, output: str = "/tmp"
) -> Tuple[pyspark.ml.PipelineModel, hl.MatrixTable]:
    """
    Runs a dummy test RF on a given MT.

    1. Creates row annotations and labels to run model on
    2. Trains a RF pipeline model (including median imputation of missing values in created annotations)
    3. Saves the RF pipeline model
    4. Applies the model to the MT and prints feature importance

    :param mt: Input MT
    :param output: Output files prefix to save the RF model
    :return: RF model and MatrixTable after applying RF model
    """
    mt = mt.annotate_rows(
        feature1=hl.rand_bool(0.1),
        feature2=hl.rand_norm(0.0, 1.0),
        feature3=hl.or_missing(hl.rand_bool(0.5), hl.rand_norm(0.0, 1.0)),
    )
    mt = mt.annotate_rows(label=hl.cond(mt["feature1"] & (mt["feature2"] > 0), "TP", "FP"))
    ht = mt.rows()

    def f3stats(ht):
        return ht.aggregate(
            hl.struct(
                n=hl.agg.count_where(hl.is_defined(ht["feature3"])),
                med=hl.median(hl.agg.collect(ht["feature3"])),
            )
        )

    f3_before_imputation = f3stats(ht)
    logger.info("Feature3 defined values before imputation: {}".format(f3_before_imputation.n))
    logger.info("Feature3 median: {}".format(f3_before_imputation.med))

    features_to_impute = ["feature3"]
    quantiles = get_columns_quantiles(ht, features_to_impute, [0.5])
    quantiles = {k: v[0] for k, v in quantiles.items()}

    # join the per-feature medians into a single string for logging
    logger.info("Features median:\n{}".format(
        "".join(f"{k}: {v}\n" for k, v in quantiles.items())))
    ht = ht.annotate(**{f: hl.or_else(ht[f], quantiles[f]) for f in features_to_impute})
    ht = ht.annotate_globals(medians=quantiles)

    f3_after_imputation = f3stats(ht)
    logger.info("Feature3 defined values after imputation: {}".format(f3_after_imputation.n))
    logger.info("Feature3 median: {}".format(f3_after_imputation.med))

    ht = ht.select("label", "feature1", "feature2", "feature3")

    label = "label"
    features = ["feature1", "feature2", "feature3"]

    rf_model = train_rf(ht, features, label)
    save_model(rf_model, out_path=output + "/rf.model", overwrite=True)
    rf_model = load_model(output + "/rf.model")

    return rf_model, apply_rf_model(ht, rf_model, features, label)

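# A usage sketch (the output prefix is illustrative). Note that, per the body
# above, the second return value is the annotated row Table produced by
# apply_rf_model, despite the MatrixTable in the type hint.
rf_model, rf_ht = run_rf_test(mt, output="/tmp/rf_test")
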
mt = hl.read_matrix_table('gs://mattia/mattia-simulations/simEUR350_2.mt')

# Select phenotype columns and output bucket
# mt.y:
#   y0:y3 h2 = 0.1
#   y4:y7 h2 = 0.3

# h2: 0.1
# mt = mt.annotate_cols(y0=mt.y[0])
# mt = mt.annotate_cols(y1=mt.y[1])
# out_bucket = 'gs://.../simulations_0.1/'

# h2: 0.1
mt = mt.annotate_cols(y0=mt.y[0] + mt.sex * 1 + hl.rand_norm(0, 0.1))
out_bucket = 'gs://mattia/mattia-simulations/simulation_sex_on_X_0.1_1/'

# Export phenotypes
mt.cols().select('s', 'sex', 'y0').key_by().export(out_bucket + 'phenotypes/pheno_0.tsv')

# Sampling parameters
OR = [1.2, 1.5, 2, 3, 5]
k = 1  # no sex diff effect

df = mt.cols().select('s', 'y0', 'sex').key_by().to_pandas()

for o in OR:
    # Participation bias (assumes `log` and `exp` are imported from the math module)
    df['z'] = df['y0'] * log(o)
    df['prob'] = [1 / (1 + exp(-z)) for z in df['z']]

def _pca_and_moments(A, k=10, num_moments=5, compute_loadings=False, q_iterations=2,
                     oversampling_param=2, block_size=128, moment_samples=100):
    if not isinstance(A, TallSkinnyMatrix):
        check_entry_indexed('_spectral_moments/entry_expr', A)
        A = _make_tsm_from_call(A, block_size)

    # Set parameters
    q = q_iterations
    L = k + oversampling_param
    n = A.ncols

    # Generate random matrix G
    G = hl.nd.zeros((n, L)).map(lambda n: hl.rand_norm(0, 1))
    G = hl.nd.qr(G)[0]._persist()

    fact = _krylov_factorization(A, G, q, compute_loadings)
    info("_reduced_svd: Computing local SVD")
    U, S, V = fact.reduced_svd(k)

    p = min(num_moments // 2, 10)

    # Generate random matrix G2 for moment estimation
    G2 = hl.nd.zeros((n, moment_samples)).map(
        lambda n: hl.if_else(hl.rand_bool(0.5), -1, 1))
    # Project out components in subspace fact.V, which we can compute exactly
    G2 = G2 - fact.V @ (fact.V.T @ G2)
    Q1, R1 = hl.nd.qr(G2)._persist()
    fact2 = _krylov_factorization(A, Q1, p, compute_U=False)
    moments_and_stdevs = fact2.spectral_moments(num_moments, R1)
    # Add back exact moments
    moments = moments_and_stdevs.moments + hl.nd.array(
        [fact.S.map(lambda x: x ** (2 * i)).sum() for i in range(1, num_moments + 1)])
    moments_and_stdevs = hl.eval(hl.struct(moments=moments, stdevs=moments_and_stdevs.stdevs))
    moments = moments_and_stdevs.moments
    stdevs = moments_and_stdevs.stdevs

    scores = V * S
    eigens = hl.eval(S * S)
    info("blanczos_pca: SVD Complete. Computing conversion to PCs.")

    hail_array_scores = scores._data_array()
    cols_and_scores = hl.zip(A.source_table.index_globals().cols, hail_array_scores).map(
        lambda tup: tup[0].annotate(scores=tup[1]))
    st = hl.Table.parallelize(cols_and_scores, key=A.col_key)

    if compute_loadings:
        lt = A.source_table.select()
        lt = lt.annotate_globals(U=U)
        idx_name = '_tmp_pca_loading_index'
        lt = lt.add_index(idx_name)
        lt = lt.annotate(loadings=lt.U[lt[idx_name], :]._data_array()).select_globals()
        lt = lt.drop(lt[idx_name])
    else:
        lt = None

    return eigens, st, lt, moments, stdevs

def calculate_phenotypes(mt, genotype, beta, h2, popstrat=None, popstrat_var=None,
                         exact_h2=False):
    r"""Calculates phenotypes by multiplying genotypes and betas.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        :class:`.MatrixTable` with all relevant fields passed as parameters.
    genotype : :class:`.Expression` or :class:`.CallExpression`
        Entry field of genotypes.
    beta : :class:`.Expression`
        Row field of SNP effects.
    h2 : :obj:`float` or :obj:`int` or :obj:`list` or :class:`numpy.ndarray`
        SNP-based heritability (:math:`h^2`) of simulated trait. Can only be
        ``None`` if running the annotation-informed model.
    popstrat : :class:`.Expression`, optional
        Column field containing population stratification term.
    popstrat_var : :obj:`float` or :obj:`int`
        Variance of population stratification term.
    exact_h2 : :obj:`bool`
        Whether to simulate the ratio of the variance of the genetic component
        of the phenotype to the variance of the phenotype to be exactly h2. If
        ``False``, the ratio is h2 in expectation; observed h2 in the
        simulation will be close to expected h2 for large-scale simulations.

    Returns
    -------
    :class:`.MatrixTable`
        :class:`.MatrixTable` with simulated phenotype as column field.
    """
    print('calculating phenotype')
    h2 = h2.tolist() if type(h2) is np.ndarray else ([h2] if type(h2) is not list else h2)
    assert popstrat_var is None or (popstrat_var >= 0), 'popstrat_var must be non-negative'
    uid = Env.get_uid(base=100)
    mt = annotate_all(mt=mt,
                      row_exprs={'beta_' + uid: beta},
                      col_exprs={} if popstrat is None else {'popstrat_' + uid: popstrat},
                      entry_exprs={'gt_' + uid: genotype.n_alt_alleles()
                                   if genotype.dtype is hl.dtype('call') else genotype})
    mt = mt.filter_rows(hl.agg.stats(mt['gt_' + uid]).stdev > 0)
    mt = normalize_genotypes(mt['gt_' + uid])
    if mt['beta_' + uid].dtype == hl.dtype('array<float64>'):  # if >1 traits
        if exact_h2:
            raise ValueError('exact_h2=True not supported for multitrait simulations')
        else:
            mt = mt.annotate_cols(y_no_noise=hl.agg.array_agg(
                lambda beta: hl.agg.sum(beta * mt['norm_gt']), mt['beta_' + uid]))
            mt = mt.annotate_cols(y=mt.y_no_noise + hl.literal(h2).map(
                lambda x: hl.rand_norm(0, hl.sqrt(1 - x))))
    else:
        if exact_h2 and min([h2[0], 1 - h2[0]]) != 0:
            print('exact h2')
            mt = mt.annotate_cols(**{'y_no_noise_' + uid:
                                     hl.agg.sum(mt['beta_' + uid] * mt['norm_gt'])})
            y_no_noise_stdev = mt.aggregate_cols(hl.agg.stats(mt['y_no_noise_' + uid]).stdev)
            # normalize genetic component of phenotype to have variance of exactly h2
            mt = mt.annotate_cols(
                y_no_noise=hl.sqrt(h2[0]) * mt['y_no_noise_' + uid] / y_no_noise_stdev)
            mt = mt.annotate_cols(**{'noise_' + uid: hl.rand_norm(0, hl.sqrt(1 - h2[0]))})
            noise_stdev = mt.aggregate_cols(hl.agg.stats(mt['noise_' + uid]).stdev)
            mt = mt.annotate_cols(noise=hl.sqrt(1 - h2[0]) * mt['noise_' + uid] / noise_stdev)
            mt = mt.annotate_cols(
                y=mt.y_no_noise + hl.sqrt(1 - h2[0]) * mt['noise_' + uid] / noise_stdev)
        else:
            mt = mt.annotate_cols(y_no_noise=hl.agg.sum(mt['beta_' + uid] * mt['norm_gt']))
            mt = mt.annotate_cols(y=mt.y_no_noise + hl.rand_norm(0, hl.sqrt(1 - h2[0])))
    if popstrat is not None:
        var_factor = 1 if popstrat_var is None else (
            popstrat_var ** (1 / 2)) / mt.aggregate_cols(hl.agg.stats(mt['popstrat_' + uid])).stdev
        mt = mt.rename({'y': 'y_no_popstrat'})
        mt = mt.annotate_cols(y=mt.y_no_popstrat + mt['popstrat_' + uid] * var_factor)
    mt = _clean_fields(mt, uid)
    return mt

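# Sketch: simulate a phenotype whose genetic component carries exactly h2 of
# the variance (exact_h2=True). Assumes `mt` has a call entry field GT and a
# row field `beta` of SNP effects; both names are illustrative.
sim = calculate_phenotypes(mt, genotype=mt.GT, beta=mt.beta, h2=0.5, exact_h2=True)
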
def table_aggregate_downsample_sparse():
    ht = hl.utils.range_table(250_000_000, 8)
    ht.aggregate(hl.agg.downsample(hl.rand_norm() ** 5, hl.rand_norm() ** 5))

def make_betas(mt, h2, pi=None, annot=None, rg=None):
    r"""Generates betas under different models.

    Simulates betas (SNP effects) under the infinitesimal, spike & slab, or
    annotation-informed models, depending on parameters passed.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        MatrixTable containing genotypes to be used. Also should contain
        variant annotations as row fields if running the annotation-informed
        model or covariates as column fields if adding population stratification.
    h2 : :obj:`float` or :obj:`int` or :obj:`list` or :class:`numpy.ndarray`
        SNP-based heritability of simulated trait(s).
    pi : :obj:`float` or :obj:`int` or :obj:`list` or :class:`numpy.ndarray`, optional
        Probability of SNP being causal when simulating under the spike & slab
        model. If doing two-trait spike & slab `pi` is a list of probabilities for
        overlapping causal SNPs (see docstring of :func:`.multitrait_ss`)
    annot : :class:`.Expression`, optional
        Row field of aggregated annotations for annotation-informed model.
    rg : :obj:`float` or :obj:`int` or :obj:`list` or :class:`numpy.ndarray`, optional
        Genetic correlation between traits.

    Returns
    -------
    mt : :class:`.MatrixTable`
        :class:`.MatrixTable` with betas as a row field, simulated according
        to specified model.
    pi : :obj:`list`
        Probability of a SNP being causal for different traits, possibly altered
        from input `pi` if covariance matrix for multitrait simulation was not
        positive semi-definite.
    rg : :obj:`list`
        Genetic correlation between traits, possibly altered from input `rg` if
        covariance matrix for multitrait simulation was not positive semi-definite.
    """
    h2 = h2.tolist() if type(h2) is np.ndarray else ([h2] if type(h2) is not list else h2)
    pi = pi.tolist() if type(pi) is np.ndarray else ([pi] if type(pi) is not list else pi)
    rg = rg.tolist() if type(rg) is np.ndarray else ([rg] if type(rg) is not list else rg)
    assert all(x >= 0 and x <= 1 for x in h2), 'h2 values must be between 0 and 1'
    assert pi == [None] or all(x >= 0 and x <= 1 for x in pi), \
        'pi values for spike & slab must be between 0 and 1'
    assert rg == [None] or all(x >= -1 and x <= 1 for x in rg), \
        'rg values must be between -1 and 1 or None'
    if annot is not None:  # multi-trait annotation-informed
        assert rg == [None], 'Correlated traits not supported for annotation-informed model'
        h2 = h2 if type(h2) is list else [h2]
        M = mt.count_rows()  # number of variants
        annot_sum = mt.aggregate_rows(hl.agg.sum(annot))
        mt = mt.annotate_rows(beta=hl.literal(h2).map(
            lambda x: hl.rand_norm(0, hl.sqrt(annot * x / (annot_sum * M)))))
    elif len(h2) > 1 and (pi == [None] or pi == [1]):  # multi-trait correlated infinitesimal
        mt, rg = multitrait_inf(mt=mt, h2=h2, rg=rg)
    elif len(h2) == 2 and len(pi) > 1 and len(rg) == 1:  # two-trait correlated spike & slab
        print('multitrait ss')
        mt, pi, rg = multitrait_ss(mt=mt, h2=h2, rg=0 if rg == [None] else rg[0], pi=pi)
    elif len(h2) == 1 and len(pi) == 1:  # single-trait infinitesimal/spike & slab
        M = mt.count_rows()
        pi_temp = 1 if pi == [None] else pi[0]
        mt = mt.annotate_rows(beta=hl.rand_bool(pi_temp)
                              * hl.rand_norm(0, hl.sqrt(h2[0] / (M * pi_temp))))
    else:
        raise ValueError('Parameters passed do not match any models.')
    return mt, pi, rg

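# Unlike the earlier versions above, this make_betas returns a (mt, pi, rg)
# tuple. A hedged two-trait correlated infinitesimal example:
mt_beta, pi_out, rg_out = make_betas(mt, h2=[0.3, 0.5], rg=[0.6])
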
def test_suite(mt, genotype, popstrat):
    '''Testing suite for simulation framework'''
    mt = mt._annotate_all(row_exprs={'a1': hl.rand_norm(),
                                     'a2': hl.rand_bool(0.1)},
                          col_exprs={'popstrat': popstrat},
                          entry_exprs={'gt': genotype})
    mt = mt.annotate_rows(annot=mt.a1 + mt.a2)

    n_sim = 7  # number of simulations
    sim_h2_ls = np.round(np.random.uniform(low=0, high=1, size=n_sim), 4)
    obs_h2_ls = []
    sim_mt_ls = []

    # Infinitesimal
    sim_mt_ls.append(simulate(mt=mt, h2=sim_h2_ls[0], genotype=mt.gt))
    # Spike & slab
    sim_mt_ls.append(simulate(mt=mt, h2=sim_h2_ls[1], pi=0.1, genotype=mt.gt))
    # Annotation-informed (same h2 as the previous spike & slab, to check whether sims match)
    sim_mt_ls.append(simulate(mt=mt, h2=sim_h2_ls[1], genotype=mt.gt, annot=mt.annot))
    # Infinitesimal + population stratification, popstrat_s2 = 0.5
    sim_mt_ls.append(simulate(mt=mt, h2=sim_h2_ls[3], genotype=mt.gt,
                              popstrat=mt.popstrat, popstrat_s2=0.5))
    # Infinitesimal + population stratification, popstrat_s2 = 0.25
    sim_mt_ls.append(simulate(mt=mt, h2=sim_h2_ls[3], genotype=mt.gt,
                              popstrat=mt.popstrat, popstrat_s2=0.25))
    # Spike & slab + population stratification
    sim_mt_ls.append(simulate(mt=mt, h2=sim_h2_ls[5], pi=0.1, genotype=mt.gt,
                              popstrat=mt.popstrat))
    # Annotation-informed + population stratification
    sim_mt_ls.append(simulate(mt=mt, h2=sim_h2_ls[6], genotype=mt.gt,
                              annot=mt.annot, popstrat=mt.popstrat))

    for sim_mt in sim_mt_ls:
        print(sim_mt.describe())

    for sim_i, sim_mt in enumerate(sim_mt_ls):
        obs_h2_ls.append(np.round(
            sim_mt.aggregate_cols(hl.agg.stats(sim_mt['__y_no_noise']).stdev ** 2), 4))
    print('\nExpected h2s: {} \nObserved h2s: {}'.format(sim_h2_ls, obs_h2_ls))

def generate_datasets(doctest_namespace, output_dir):
    doctest_namespace['hl'] = hl

    files = ["sample.vds", "sample.qc.vds", "sample.filtered.vds"]
    for f in files:
        if os.path.isdir(f):
            shutil.rmtree(f)

    ds = hl.import_vcf('data/sample.vcf.bgz')
    ds = ds.sample_rows(0.03)
    ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5),
                          panel_maf=0.1,
                          anno1=5,
                          anno2=0,
                          consequence="LOF",
                          gene="A",
                          score=5.0)
    ds = ds.annotate_rows(a_index=1)
    ds = hl.sample_qc(hl.variant_qc(ds))
    ds = ds.annotate_cols(is_case=True,
                          pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                          is_female=hl.rand_bool(0.5),
                                          age=hl.rand_norm(65, 10),
                                          height=hl.rand_norm(70, 10),
                                          blood_pressure=hl.rand_norm(120, 20),
                                          cohort_name="cohort1"),
                          cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                          cov1=hl.rand_norm(0, 1),
                          cov2=hl.rand_norm(0, 1),
                          cohort="SIGMA")
    ds = ds.annotate_globals(global_field_1=5,
                             global_field_2=10,
                             pli={'SCN1A': 0.999, 'SONIC': 0.014},
                             populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
    ds = ds.annotate_rows(gene=['TTN'])
    ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
    ds = ds.checkpoint(f'{output_dir.name}/example.vds', overwrite=True)
    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata
    doctest_namespace['cols_to_keep'] = s_metadata
    doctest_namespace['cols_to_remove'] = s_metadata
    doctest_namespace['rows_to_keep'] = v_metadata
    doctest_namespace['rows_to_remove'] = v_metadata

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    table4 = hl.import_table('data/kt_example4.tsv', impute=True,
                             types={'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                    'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                    'E': hl.tstruct(A=hl.tint32, B=hl.tint32)})
    doctest_namespace['table4'] = table4

    people_table = hl.import_table('data/explode_example.tsv',
                                   delimiter='\\s+',
                                   types={'Age': hl.tint32, 'Children': hl.tarray(hl.tstr)},
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.null(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({'Alice': 43, 'Bob': 33, 'Charles': 44})
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval("1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    doctest_namespace['interval2'] = hl.Interval(3, 6)
    doctest_namespace['nd'] = hl._ndarray([[1, 2], [3, 4]])

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv", impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen', entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    burden_ds = hl.import_vcf('data/example_burden.vcf')
    burden_kt = hl.import_table('data/example_burden.tsv', key='Sample', impute=True)
    burden_ds = burden_ds.annotate_cols(burden=burden_kt[burden_ds.s])
    burden_ds = burden_ds.annotate_rows(weight=hl.float64(burden_ds.locus.position))
    burden_ds = hl.variant_qc(burden_ds)
    genekt = hl.import_locus_intervals('data/gene.interval_list')
    burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
    burden_ds = burden_ds.checkpoint(f'{output_dir.name}/example_burden.vds', overwrite=True)
    doctest_namespace['burden_ds'] = burden_ds

    print("finished setting up doctest...")