def test_pcrelate_issue_5263():
    """Check pc_relate on ``GT2`` matches pc_relate on the original ``GT``.

    The reference result is computed from ``GT``. The genotypes are then
    copied to ``GT2`` while ``GT`` is overwritten with random calls, and
    pc_relate on ``GT2`` is compared against the reference.
    """
    pc_relate_options = dict(k=2, statistics='all')

    mt = hl.balding_nichols_model(3, 50, 100)
    expected = hl.pc_relate(mt.GT, 0.10, **pc_relate_options)

    # Keep the real genotypes under a new name and scramble the old field.
    mt = mt.select_entries(
        GT2=mt.GT,
        GT=hl.call(hl.rand_bool(0.5), hl.rand_bool(0.5)),
    )
    actual = hl.pc_relate(mt.GT2, 0.10, **pc_relate_options)

    assert expected._same(actual, tolerance=1e-4)
def test_plot_roc_curve(self):
    """Smoke test: run plot_roc_curve on three scores with random tp/fp labels."""
    scores = hl.utils.range_table(100)
    scores = scores.annotate(score1=hl.rand_norm(), score2=hl.rand_norm())
    scores = scores.annotate(
        tp=hl.cond(scores.score1 > 0, hl.rand_bool(0.7), False),
        score3=scores.score1 + hl.rand_norm(),
    )
    # Only rows that are not true positives can be flagged as false positives.
    labelled = scores.annotate(
        fp=hl.cond(~scores.tp, hl.rand_bool(0.2), False))

    score_fields = ['score1', 'score2', 'score3']
    _, aucs = hl.experimental.plot_roc_curve(labelled, score_fields)
def kyle_sex_specific_qc(mt_path):
    """Compute sex-aware call rate, AC and AN per row and force evaluation.

    Each column is assigned a random sex. Per-row het / hom-var / called
    counts are aggregated separately for males and females, then combined
    with different formulas for Y non-PAR loci, X non-PAR loci and all
    other loci.

    :param mt_path: Path of the MatrixTable to read.
    """
    mt = hl.read_matrix_table(mt_path)
    mt = mt.annotate_cols(sex=hl.cond(hl.rand_bool(0.5), 'Male', 'Female'))

    sex_counters = (hl.agg.count_where(mt.sex == 'Male'),
                    hl.agg.count_where(mt.sex == 'Female'))
    num_males, num_females = mt.aggregate_cols(sex_counters)

    # Predicates shared by the six per-row counters below.
    is_male = mt.sex == 'Male'
    is_female = mt.sex == 'Female'
    mt = mt.annotate_rows(
        male_hets=hl.agg.count_where(mt.GT.is_het() & is_male),
        male_homvars=hl.agg.count_where(mt.GT.is_hom_var() & is_male),
        male_calls=hl.agg.count_where(hl.is_defined(mt.GT) & is_male),
        female_hets=hl.agg.count_where(mt.GT.is_het() & is_female),
        female_homvars=hl.agg.count_where(mt.GT.is_hom_var() & is_female),
        female_calls=hl.agg.count_where(hl.is_defined(mt.GT) & is_female),
    )

    on_y = mt.locus.in_y_nonpar()
    on_x = mt.locus.in_x_nonpar()

    # Y non-PAR uses male data only; X non-PAR weights females twice.
    call_rate = (
        hl.case()
        .when(on_y, (mt.male_calls / num_males))
        .when(on_x,
              (mt.male_calls + 2 * mt.female_calls) / (num_males + 2 * num_females))
        .default((mt.male_calls + mt.female_calls) / (num_males + num_females))
    )
    allele_count = (
        hl.case()
        .when(on_y, mt.male_homvars)
        .when(on_x, mt.male_homvars + mt.female_hets + 2 * mt.female_homvars)
        .default(mt.male_hets + 2 * mt.male_homvars
                 + mt.female_hets + 2 * mt.female_homvars)
    )
    allele_number = (
        hl.case()
        .when(on_y, mt.male_calls)
        .when(on_x, mt.male_calls + 2 * mt.female_calls)
        .default(2 * mt.male_calls + 2 * mt.female_calls)
    )
    mt = mt.annotate_rows(call_rate=call_rate, AC=allele_count, AN=allele_number)

    mt.rows()._force_count()
def make_betas(mt, h2=None, pi=1, is_annot_inf=False, annot_coef_dict=None,
               annot_regex=None, h2_normalize=True):
    '''Simulate betas. Options: Infinitesimal model, spike & slab, annotation-informed

    Arguments are first validated by ``check_beta_args``.

    :param mt: MatrixTable whose rows receive a ``__beta`` field.
    :param h2: Heritability used to scale the variance of the betas.
    :param pi: Probability (passed to ``hl.rand_bool``) that a row gets a
        non-zero beta. ``pi == 1`` is reported as the infinitesimal model,
        anything else as spike & slab.
    :param is_annot_inf: If true, simulate annotation-informed betas from the
        ``__agg_annot`` row field of the table returned by ``agg_fields``.
    :param annot_coef_dict: Forwarded to ``agg_fields`` as ``coef_dict``.
    :param annot_regex: Forwarded to ``agg_fields`` as ``regex``.
    :param h2_normalize: Annotation-informed model only. If true, the variance
        of each beta is scaled by ``h2 / sum(__agg_annot)``; otherwise the
        variance is left unscaled.
    :return: MatrixTable with the simulated ``__beta`` row field.
    '''
    check_beta_args(h2=h2, pi=pi, is_annot_inf=is_annot_inf,
                    annot_coef_dict=annot_coef_dict, annot_regex=annot_regex,
                    h2_normalize=h2_normalize)
    if is_annot_inf:
        print('\rSimulating {} annotation-informed betas {}'.format(
            'h2-normalized' if h2_normalize else '',
            '(default coef: 1)' if annot_coef_dict is None else 'using annot_coef_dict'))
        mt1 = agg_fields(mt=mt, coef_dict=annot_coef_dict, regex=annot_regex)
        annot_sum = mt1.aggregate_rows(hl.agg.sum(mt1.__agg_annot))
        # If h2_normalize: scale variance of betas to be h2, else keep the
        # unscaled variance.
        variance_scale = h2 / annot_sum if h2_normalize else 1
        return mt1.annotate_rows(
            __beta=hl.rand_norm(0, hl.sqrt(mt1.__agg_annot * variance_scale)))

    # Compare by value: `pi is 1` is an identity test that is False for
    # pi=1.0 and raises a SyntaxWarning on modern Python.
    model_name = 'infinitesimal' if pi == 1 else 'spike & slab'
    print('Simulating betas using {} model w/ h2 = {}'.format(model_name, h2))
    # The row count is only needed by this branch.
    M = mt.count_rows()
    mt1 = mt.annotate_globals(__h2=none_to_null(h2), __pi=pi)
    return mt1.annotate_rows(
        __beta=hl.rand_bool(pi) * hl.rand_norm(0, hl.sqrt(h2 / (M * pi))))
def make_betas(mt, h2, pi=1, annot=None):
    """Annotate rows of `mt` with a simulated ``__beta`` field.

    Without `annot`, each beta is ``rand_bool(pi) * N(0, sqrt(h2 / (M * pi)))``
    where ``M`` is the number of rows. With `annot`, the standard deviation is
    the standardized ``__annot`` row field times ``sqrt(h2 / M)``.

    :param mt: MatrixTable to annotate (must carry ``__annot`` when `annot`
        is given).
    :param h2: Heritability.
    :param pi: Probability that a row has a non-zero beta.
    :param annot: Only tested against ``None`` to select the model.
    :return: MatrixTable with the ``__beta`` row field.
    """
    M = mt.count_rows()

    if annot is None:
        causal = hl.rand_bool(pi)
        effect = hl.rand_norm(0, hl.sqrt(h2 / (M * pi)))
        return mt.annotate_rows(__beta=causal * effect)

    annot_stats = mt.aggregate_rows(hl.agg.stats(mt.__annot), _localize=True)
    standardized_annot = (mt.__annot - annot_stats.mean) / annot_stats.stdev
    return mt.annotate_rows(
        __beta=hl.rand_norm(0, standardized_annot * hl.sqrt(h2 / M)))
def run_mendel_errors() -> hl.Table:
    """Run ``hl.mendel_errors`` on chr20 for the real pedigree plus fake trios.

    Fake trios are built by ``create_fake_pedigree`` from a random ~1% sample
    of the samples that have no QC-metric filters and no hard filters. The
    sparse MT is restricted to pedigree samples and chr20, split, densified
    and reduced to biallelic rows before the Mendel error computation.

    :return: The first table returned by ``hl.mendel_errors``.
    """
    meta_ht = meta.ht()
    ped = pedigree.versions[f"{CURRENT_RELEASE}_raw"].pedigree()
    logger.info(f"Running Mendel errors for {len(ped.trios)} trios.")

    # Randomly pick unfiltered samples to build the fake trios from.
    keep_sample = hl.rand_bool(0.01) & (
        (hl.len(meta_ht.qc_metrics_filters) == 0)
        & hl.or_else(hl.len(meta_ht.hard_filters) == 0, False)
    )
    candidate_samples = meta_ht.aggregate(
        hl.agg.filter(keep_sample, hl.agg.collect_as_set(meta_ht.s)))
    fake_ped = create_fake_pedigree(
        n=100,
        sample_list=list(candidate_samples),
        real_pedigree=ped,
    )

    merged_ped = hl.Pedigree(trios=ped.trios + fake_ped.trios)
    trio_members = {
        sample_id
        for trio in merged_ped.trios
        for sample_id in (trio.s, trio.pat_id, trio.mat_id)
    }
    ped_samples = hl.literal(trio_members)

    mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
    mt = mt.filter_cols(ped_samples.contains(mt.s))
    chr20 = hl.parse_locus_interval("chr20", reference_genome='GRCh38')
    mt = hl.filter_intervals(mt, [chr20])
    mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)
    mt = mt.select_entries("GT", "END")
    mt = hl.experimental.densify(mt)
    mt = mt.filter_rows(hl.len(mt.alleles) == 2)

    mendel_errors, _, _, _ = hl.mendel_errors(mt["GT"], merged_ped)
    return mendel_errors
def test_king_filtered_entries_no_error():
    """Check that ``hl.king`` runs on a dataset with randomly filtered entries."""
    prefix = resource('balding-nichols-1024-variants-4-samples-3-populations')
    plink_files = {ext: f'{prefix}.{ext}' for ext in ('bed', 'bim', 'fam')}
    mt = hl.import_plink(**plink_files)

    # Drop roughly half of the entries, then force the KING computation.
    mt = mt.filter_entries(hl.rand_bool(0.5))
    kinship = hl.king(mt.GT)
    kinship._force_count_rows()
def test_aggregate(self):
    """Exercise row, column and entry aggregations on the test dataset."""
    vds = (self.get_vds()
           .annotate_globals(foo=5)
           .annotate_rows(x1=agg.count())
           .annotate_cols(y1=agg.count()))
    vds = vds.annotate_entries(z1=vds.DP)

    n_rows = vds.aggregate_rows(agg.count())
    n_cols = vds.aggregate_cols(agg.count())
    n_entries = vds.aggregate_entries(agg.count())

    self.assertIsNotNone(vds.aggregate_entries(hl.agg.take(vds.s, 1)[0]))
    self.assertEqual(n_rows, 346)
    self.assertEqual(n_cols, 100)
    self.assertEqual(n_entries, n_rows * n_cols)

    # The struct aggregations below are only run, their results are not checked.
    row_struct = hl.Struct(x=agg.collect(vds.locus.contig),
                           y=agg.collect(vds.x1))
    qvs = vds.aggregate_rows(row_struct)

    col_struct = hl.Struct(x=agg.collect(vds.s),
                           y=agg.collect(vds.y1))
    qss = vds.aggregate_cols(col_struct)

    entry_struct = hl.Struct(
        x=agg.collect(agg.filter(False, vds.y1)),
        y=agg.collect(agg.filter(hl.rand_bool(0.1), vds.GT)))
    qgs = vds.aggregate_entries(entry_struct)
def _spectral_moments(A, num_moments, p=None, moment_samples=500, block_size=128):
    """Estimate spectral moments of `A` and their standard deviations.

    `A` is converted with ``_make_tsm_from_call`` unless it already is a
    ``TallSkinnyMatrix``. A random +/-1 matrix with `moment_samples` columns
    is QR-factorized and used to start a Krylov factorization of `A`, from
    which the moments are evaluated.

    :param A: ``TallSkinnyMatrix`` or an entry-indexed expression.
    :param num_moments: Number of moments to compute.
    :param p: Passed to ``_krylov_factorization``; defaults to
        ``min(num_moments // 2, 10)``.
    :param moment_samples: Number of random sign vectors; must be smaller
        than the number of columns of `A`.
    :param block_size: Block size used when converting `A`.
    :return: Tuple ``(moments, stdevs)``.
    """
    if not isinstance(A, TallSkinnyMatrix):
        check_entry_indexed('_spectral_moments/entry_expr', A)
        A = _make_tsm_from_call(A, block_size)

    n = A.ncols
    p = min(num_moments // 2, 10) if p is None else p

    # TODO: When moment_samples > n, we should just do a TSQR on A, and compute
    # the spectrum of R.
    assert moment_samples < n, '_spectral_moments: moment_samples must be smaller than num cols of A'

    # Fill an (n, moment_samples) array with independent random signs.
    zeros = hl.nd.zeros((n, moment_samples))
    G = zeros.map(lambda _entry: hl.if_else(hl.rand_bool(0.5), -1, 1))
    Q1, R1 = hl.nd.qr(G)._persist()

    fact = _krylov_factorization(A, Q1, p, compute_U=False)
    result = hl.eval(fact.spectral_moments(num_moments, R1))
    return result.moments, result.stdevs
def test_aggregate(self):
    """Exercise row, column and entry aggregations, with filter wrapping collect."""
    vds = (self.get_vds()
           .annotate_globals(foo=5)
           .annotate_rows(x1=agg.count())
           .annotate_cols(y1=agg.count()))
    vds = vds.annotate_entries(z1=vds.DP)

    n_rows = vds.aggregate_rows(agg.count())
    n_cols = vds.aggregate_cols(agg.count())
    n_entries = vds.aggregate_entries(agg.count())

    self.assertIsNotNone(vds.aggregate_entries(hl.agg.take(vds.s, 1)[0]))
    self.assertEqual(n_rows, 346)
    self.assertEqual(n_cols, 100)
    self.assertEqual(n_entries, n_rows * n_cols)

    # The struct aggregations below are only run, their results are not checked.
    row_struct = hl.Struct(x=agg.collect(vds.locus.contig),
                           y=agg.collect(vds.x1))
    qvs = vds.aggregate_rows(row_struct)

    col_struct = hl.Struct(x=agg.collect(vds.s),
                           y=agg.collect(vds.y1))
    qss = vds.aggregate_cols(col_struct)

    entry_struct = hl.Struct(
        x=agg.filter(False, agg.collect(vds.y1)),
        y=agg.filter(hl.rand_bool(0.1), agg.collect(vds.GT)))
    qgs = vds.aggregate_entries(entry_struct)
def test_to_matrix_table(self):
    """Round-trip a MatrixTable through ``entries().to_matrix_table``."""
    N, M = 50, 50
    mt = hl.utils.range_matrix_table(N, M)
    mt = mt.key_cols_by(s='Col' + hl.str(M - mt.col_idx))
    mt = mt.annotate_cols(c1=hl.rand_bool(0.5))
    mt = mt.annotate_rows(r1=hl.rand_bool(0.5))
    mt = mt.annotate_entries(e1=hl.rand_bool(0.5))

    entries = mt.entries()
    re_mt = entries.to_matrix_table(['row_idx'], ['s'], ['r1'], ['col_idx', 'c1'])

    # For each original col_idx (in ascending order), find its position in
    # the rebuilt table so the columns can be put back in the original order.
    new_col_order = re_mt.col_idx.collect()
    old_to_new = sorted((old, new) for new, old in enumerate(new_col_order))
    mapping = [new for _, new in old_to_new]

    reordered = re_mt.choose_cols(mapping).drop('col_idx')
    assert reordered._same(mt.drop('col_idx'))
def test(self):
    """Annotate a one-row table with a broad set of expression functions.

    The single annotated row is taken and passed to
    ``convert_struct_to_dict``.
    """
    xyz_type = hl.tstruct(x=hl.tint32, y=hl.tint32, z=hl.tstr)
    schema = hl.tstruct(
        a=hl.tint32,
        b=hl.tint32,
        c=hl.tint32,
        d=hl.tint32,
        e=hl.tstr,
        f=hl.tarray(hl.tint32),
        g=hl.tarray(xyz_type),
        h=hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tstr),
        i=hl.tbool,
        j=xyz_type,
    )

    row = {
        'a': 4,
        'b': 1,
        'c': 3,
        'd': 5,
        'e': "hello",
        'f': [1, 2, 3],
        'g': [hl.Struct(x=1, y=5, z='banana')],
        'h': hl.Struct(a=5, b=3, c='winter'),
        'i': True,
        'j': hl.Struct(x=3, y=2, z='summer'),
    }
    kt = hl.Table.parallelize([row], schema)

    # Keyword order is preserved when the mapping is unpacked below.
    annotations = {
        'chisq': hl.chisq(kt.a, kt.b, kt.c, kt.d),
        'ctt': hl.ctt(kt.a, kt.b, kt.c, kt.d, 5),
        'dict': hl.dict(hl.zip([kt.a, kt.b], [kt.c, kt.d])),
        'dpois': hl.dpois(4, kt.a),
        'drop': kt.h.drop('b', 'c'),
        'exp': hl.exp(kt.c),
        'fet': hl.fisher_exact_test(kt.a, kt.b, kt.c, kt.d),
        'hwe': hl.hardy_weinberg_p(1, 2, 1),
        'index': hl.index(kt.g, 'z'),
        'is_defined': hl.is_defined(kt.i),
        'is_missing': hl.is_missing(kt.i),
        'is_nan': hl.is_nan(hl.float64(kt.a)),
        'json': hl.json(kt.g),
        'log': hl.log(kt.a, kt.b),
        'log10': hl.log10(kt.c),
        'or_else': hl.or_else(kt.a, 5),
        'or_missing': hl.or_missing(kt.i, kt.j),
        'pchisqtail': hl.pchisqtail(kt.a, kt.b),
        'pcoin': hl.rand_bool(0.5),
        'pnorm': hl.pnorm(0.2),
        'pow': 2.0 ** kt.b,
        'ppois': hl.ppois(kt.a, kt.b),
        'qchisqtail': hl.qchisqtail(kt.a, kt.b),
        'range': hl.range(0, 5, kt.b),
        'rnorm': hl.rand_norm(0.0, kt.b),
        'rpois': hl.rand_pois(kt.a),
        'runif': hl.rand_unif(kt.b, kt.a),
        'select': kt.h.select('c', 'b'),
        'sqrt': hl.sqrt(kt.a),
        'to_str': [hl.str(5), hl.str(kt.a), hl.str(kt.g)],
        'where': hl.cond(kt.i, 5, 10),
    }
    first_row = kt.annotate(**annotations).take(1)[0]
    result = convert_struct_to_dict(first_row)
def generate_random_gen():
    """Export a small random dataset in GEN format under the prefix 'random'.

    Builds a 30 x 10 table keyed by locus/alleles and sample id, draws
    genotype probabilities as multiples of 1/255, keeps each entry with
    probability 0.8 and writes the result with ``hl.export_gen``.
    """
    mt = hl.utils.range_matrix_table(30, 10)

    mt = mt.annotate_rows(locus=hl.locus('20', mt.row_idx + 1),
                          alleles=['A', 'G'])
    mt = mt.key_rows_by('locus', 'alleles')

    mt = mt.annotate_cols(s=hl.str(mt.col_idx))
    mt = mt.key_cols_by('s')

    # using totally random values leads rounding differences where
    # identical GEN values get rounded differently, leading to
    # differences in the GT call between import_{gen, bgen}
    mt = mt.annotate_entries(a=hl.int32(hl.rand_unif(0.0, 255.0)))
    mt = mt.annotate_entries(b=hl.int32(hl.rand_unif(0.0, 255.0 - mt.a)))
    third_prob = 255.0 - mt.a - mt.b
    mt = mt.transmute_entries(GP=hl.array([mt.a, mt.b, third_prob]) / 255.0)

    # 20% missing
    keep_entry = hl.rand_bool(0.8)
    mt = mt.filter_entries(keep_entry)

    hl.export_gen(mt, 'random', precision=4)
def make_betas(mt, h2=None, pi=1, is_annot_inf=False, annot_coef_dict=None,
               annot_regex=None, h2_normalize=True):
    '''Simulate betas. Options: Infinitesimal model, spike & slab, annotation-informed

    Arguments are first validated by ``check_beta_args``.

    :param mt: MatrixTable whose rows receive a ``__beta`` field.
    :param h2: Heritability used to scale the variance of the betas.
    :param pi: Probability (passed to ``hl.rand_bool``) that a row gets a
        non-zero beta. ``pi == 1`` is reported as the infinitesimal model,
        anything else as spike & slab.
    :param is_annot_inf: If true, simulate annotation-informed betas from the
        ``__agg_annot`` row field of the table returned by ``agg_fields``.
    :param annot_coef_dict: Forwarded to ``agg_fields`` as ``coef_dict``.
    :param annot_regex: Forwarded to ``agg_fields`` as ``regex``.
    :param h2_normalize: Annotation-informed model only. If true, the variance
        of each beta is scaled by ``h2 / sum(__agg_annot)``; otherwise the
        variance is left unscaled.
    :return: MatrixTable with the simulated ``__beta`` row field.
    '''
    check_beta_args(h2=h2, pi=pi, is_annot_inf=is_annot_inf,
                    annot_coef_dict=annot_coef_dict, annot_regex=annot_regex,
                    h2_normalize=h2_normalize)
    if is_annot_inf:
        print('\rSimulating {} annotation-informed betas {}'.format(
            'h2-normalized' if h2_normalize else '',
            '(default coef: 1)' if annot_coef_dict is None else 'using annot_coef_dict'))
        mt1 = agg_fields(mt=mt, coef_dict=annot_coef_dict, regex=annot_regex)
        annot_sum = mt1.aggregate_rows(hl.agg.sum(mt1.__agg_annot))
        # If h2_normalize: scale variance of betas to be h2, else keep the
        # unscaled variance.
        variance_scale = h2 / annot_sum if h2_normalize else 1
        return mt1.annotate_rows(
            __beta=hl.rand_norm(0, hl.sqrt(mt1.__agg_annot * variance_scale)))

    # Compare by value: `pi is 1` is an identity test that is False for
    # pi=1.0 and raises a SyntaxWarning on modern Python.
    model_name = 'infinitesimal' if pi == 1 else 'spike & slab'
    print('Simulating betas using {} model w/ h2 = {}'.format(model_name, h2))
    # The row count is only needed by this branch.
    M = mt.count_rows()
    mt1 = mt.annotate_globals(__h2=none_to_null(h2), __pi=pi)
    return mt1.annotate_rows(
        __beta=hl.rand_bool(pi) * hl.rand_norm(0, hl.sqrt(h2 / (M * pi))))
def generate_random_gen():
    """Write a random 30-variant, 10-sample dataset to GEN files named 'random'.

    Genotype probabilities are built from two random integers so that every
    probability is a multiple of 1/255; each entry is kept with
    probability 0.8.
    """
    mt = hl.utils.range_matrix_table(30, 10)

    row_fields = dict(locus=hl.locus('20', mt.row_idx + 1), alleles=['A', 'G'])
    mt = mt.annotate_rows(**row_fields).key_rows_by('locus', 'alleles')

    sample_id = hl.str(mt.col_idx)
    mt = mt.annotate_cols(s=sample_id).key_cols_by('s')

    # using totally random values leads rounding differences where
    # identical GEN values get rounded differently, leading to
    # differences in the GT call between import_{gen, bgen}
    mt = mt.annotate_entries(a=hl.int32(hl.rand_unif(0.0, 255.0)))
    mt = mt.annotate_entries(b=hl.int32(hl.rand_unif(0.0, 255.0 - mt.a)))
    counts = hl.array([mt.a, mt.b, 255.0 - mt.a - mt.b])
    mt = mt.transmute_entries(GP=counts / 255.0)

    # 20% missing
    mt = mt.filter_entries(hl.rand_bool(0.8))

    hl.export_gen(mt, 'random', precision=4)
def make_betas(mt, h2, pi=1, annot=None):
    '''Simulate betas. Options: Infinitesimal model, spike & slab, annotation-informed

    :param mt: MatrixTable whose rows receive a ``__beta`` field.
    :param h2: Heritability; also stored as the ``__h2`` global.
    :param pi: Probability (passed to ``hl.rand_bool``) that a row gets a
        non-zero beta; stored as the ``__pi`` global. ``pi == 1`` is reported
        as the infinitesimal model, anything else as spike & slab. Ignored
        when `annot` is given.
    :param annot: Row expression stored as ``__annot``; when given, each
        beta is drawn with variance ``__annot / sum(__annot) * h2``.
    :return: MatrixTable with the simulated ``__beta`` row field.
    '''
    if annot is not None:
        print('\rSimulating annotation-informed betas w/ h2 = {}'.format(h2))
        mt1 = mt._annotate_all(row_exprs={'__annot': annot},
                               global_exprs={'__h2': h2})
        annot_sum = mt1.aggregate_rows(hl.agg.sum(mt1.__annot))
        return mt1.annotate_rows(
            __beta=hl.rand_norm(0, hl.sqrt(mt1.__annot / annot_sum * h2)))

    # Compare by value: `pi is 1` is an identity test that is False for
    # pi=1.0 and raises a SyntaxWarning on modern Python.
    model_name = 'infinitesimal' if pi == 1 else 'spike & slab'
    print('Simulating betas using {} model w/ h2 = {}'.format(model_name, h2))
    # The row count is only needed by this branch.
    M = mt.count_rows()
    mt1 = mt.annotate_globals(__h2=h2, __pi=pi)
    return mt1.annotate_rows(
        __beta=hl.rand_bool(pi) * hl.rand_norm(0, hl.sqrt(h2 / (M * pi))))
def logistic_regression_rows_wald_nd(mt_path):
    """Run the ndarray-based Wald logistic regression on random data.

    Uses the first 2000 rows of the MatrixTable at `mt_path`, five seeded
    random boolean phenotypes and two seeded uniform covariates, then forces
    evaluation of the result.

    :param mt_path: Path of the MatrixTable to read.
    """
    mt = hl.read_matrix_table(mt_path).head(2000)

    num_phenos = 5
    num_covs = 2
    phenotypes = {}
    for i in range(num_phenos):
        phenotypes[f"pheno_{i}"] = hl.rand_bool(.5, seed=i)
    covariates = {}
    for i in range(num_covs):
        covariates[f"cov_{i}"] = hl.rand_unif(0, 1, seed=i)

    mt = mt.annotate_cols(**phenotypes)
    mt = mt.annotate_cols(**covariates)

    res = hl._logistic_regression_rows_nd(
        test='wald',
        y=[mt[name] for name in phenotypes],
        x=mt.x,
        covariates=[mt[name] for name in covariates])
    res._force_count()
def make_random_function(self, mt):
    """Build the spike & slab beta expression for `mt`.

    Each beta is non-zero with probability ``self.pi`` (the slab
    probability) and, when non-zero, is drawn from a normal distribution
    with mean 0 and variance ``self.h2 / (M * self.pi)``, where ``M`` is the
    number of variants (rows) in `mt`.

    :param mt: MatrixTable whose row count sets the per-variant variance.
    :return: Expression ``rand_bool(pi) * rand_norm(0, sd)``.
    """
    M = mt.count_rows()  # number of variants
    # hl.rand_norm takes a standard deviation, so the per-variant variance
    # must be square-rooted before it is passed in.
    sd = hl.sqrt(self.h2 / (M * self.pi))
    return hl.rand_bool(self.pi) * hl.rand_norm(0, sd)
def sample_training_examples(
    ht: hl.Table,
    tp_expr: hl.BooleanExpression,
    fp_expr: hl.BooleanExpression,
    fp_to_tp: float = 1.0,
    test_expr: Optional[hl.expr.BooleanExpression] = None,
) -> hl.Table:
    """
    Label the TP / FP examples in `ht` and flag those to use for training.

    Only rows that are TP or FP are returned. The returned Table carries:

        - train: whether the row should be used for training. It is False
          when the row matches `test_expr`, when it is both TP and FP, or
          when it is sampled out to reach the requested `fp_to_tp` ratio.
        - label: 'TP' or 'FP'; rows matching `test_expr` are labelled too.
          Rows that are both TP and FP get a missing label.

    .. note::

        - Multi-allelic variants are not supported.
        - Counts of the provided TPs/FPs (and Ti, Tv, indels when an
          `alleles` field is present) are logged.

    :param ht: Input Table.
    :param tp_expr: Expression marking TP examples.
    :param fp_expr: Expression marking FP examples.
    :param fp_to_tp: Desired FP to TP ratio. If <= 0, all training examples are used.
    :param test_expr: Optional expression marking rows to hold out of training; they still get a label.
    :return: Table of TP and FP examples with `train` and `label` annotations.
    """
    if test_expr is None:
        exclude_expr = False
    else:
        exclude_expr = test_expr

    ht = ht.select(
        _tp=hl.or_else(tp_expr, False),
        _fp=hl.or_else(fp_expr, False),
        _exclude=exclude_expr,
    )
    ht = ht.filter(ht._tp | ht._fp).persist()

    def _get_train_counts(ht: hl.Table) -> Tuple[int, int]:
        """
        Count TP-only and FP-only rows and log per-contig stats.

        :param ht: Input Table
        :return: Counts of TP and FP variants in the table
        """
        train_stats = hl.struct(n=hl.agg.count())

        has_string_alleles = (
            "alleles" in ht.row and ht.row.alleles.dtype == hl.tarray(hl.tstr)
        )
        if has_string_alleles:
            ref, alt = ht.alleles[0], ht.alleles[1]
            train_stats = train_stats.annotate(
                ti=hl.agg.count_where(hl.expr.is_transition(ref, alt)),
                tv=hl.agg.count_where(hl.expr.is_transversion(ref, alt)),
                indel=hl.agg.count_where(hl.expr.is_indel(ref, alt)),
            )

        # Aggregate the stats per contig and TP/FP status
        grouped = ht.group_by(contig=ht.locus.contig, tp=ht._tp, fp=ht._fp)
        pd_stats = grouped.aggregate(**train_stats).to_pandas()

        logger.info(pformat(pd_stats))
        pd_stats = pd_stats.fillna(False)

        # Rows that are both TP and FP are not counted on either side
        tp_only = pd_stats["tp"] & ~pd_stats["fp"]
        fp_only = ~pd_stats["tp"] & pd_stats["fp"]
        return pd_stats[tp_only]["n"].sum(), pd_stats[fp_only]["n"].sum()

    # Held-out rows do not take part in the counts
    n_tp, n_fp = _get_train_counts(ht.filter(~ht._exclude))

    is_tp_only = ht._tp & hl.or_else(~ht._fp, True)
    is_fp_only = ht._fp & hl.or_else(~ht._tp, True)

    prob_tp = prob_fp = 1.0
    if fp_to_tp > 0:
        desired_fp = fp_to_tp * n_tp
        # Downsample whichever class is over-represented for the ratio
        if desired_fp < n_fp:
            prob_fp = desired_fp / n_fp
        else:
            prob_tp = n_fp / desired_fp

        logger.info(
            f"Training examples sampling: tp={prob_tp}*{n_tp}, fp={prob_fp}*{n_fp}"
        )

        train_expr = (
            hl.case(missing_false=True)
            .when(is_fp_only, hl.rand_bool(prob_fp))
            .when(is_tp_only, hl.rand_bool(prob_tp))
            .default(False)
        )
    else:
        train_expr = ~(ht._tp & ht._fp)
        logger.info(f"Using all {n_tp} TP and {n_fp} FP training examples.")

    label_expr = (
        hl.case(missing_false=True)
        .when(is_tp_only, "TP")
        .when(is_fp_only, "FP")
        .default(hl.null(hl.tstr))
    )

    return ht.select(train=train_expr & ~ht._exclude, label=label_expr)
import hail as hl

# Script: export a small Balding-Nichols dataset (1024 variants, 4 samples,
# 3 populations) with ~1% missing genotypes to PLINK files.
hl.set_global_seed(0)

mt = hl.balding_nichols_model(
    n_populations=3,
    n_variants=(1 << 10),
    n_samples=4,
)
sample_name = 's' + hl.str(mt.sample_idx)
mt = mt.key_cols_by(s=sample_name)

# Keep each genotype with probability 0.99, otherwise set it to missing.
mt = mt.annotate_entries(GT=hl.or_missing(hl.rand_bool(0.99), mt.GT))

hl.export_plink(
    mt,
    'balding-nichols-1024-variants-4-samples-3-populations',
    fam_id='f' + mt.s,
)
def make_betas(mt, h2, pi=1, annot=None, rg=None):
    """Generates betas under different models.

    Simulates betas (SNP effects) under the infinitesimal, spike & slab, or
    annotation-informed models, depending on parameters passed.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        MatrixTable containing genotypes to be used. Also should contain
        variant annotations as row fields if running the annotation-informed
        model or covariates as column fields if adding population stratification.
    h2 : :obj:`float` or :obj:`int` or :obj:`list`
        SNP-based heritability of simulated trait(s).
    pi : :obj:`float` or :obj:`int` or :obj:`list`
        Probability of SNP being causal when simulating under the spike & slab
        model. If doing two-trait spike & slab `pi` is a list of probabilities for
        overlapping causal SNPs (see docstring of :func:`.multitrait_ss`)
    annot : :class:`.Expression`
        Row field of aggregated annotations for annotation-informed model.
    rg : :obj:`float` or :obj:`int` or :obj:`list`, optional
        Genetic correlation between traits. When omitted, the two-trait
        spike & slab model is run with a correlation of 0.

    Returns
    -------
    :class:`.MatrixTable`
        :class:`.MatrixTable` with betas as a row field, simulated according to specified model.

    Raises
    ------
    AssertionError
        If an `h2`, `pi` or `rg` value is outside [0, 1], or if `rg` is given
        together with `annot`.
    ValueError
        If the parameters do not match any supported model.
    """
    # Normalise scalar arguments to one-element lists.
    h2 = h2 if isinstance(h2, list) else [h2]
    pi = pi if isinstance(pi, list) else [pi]
    rg = rg if isinstance(rg, list) else [rg]

    assert all(0 <= x <= 1 for x in h2), 'h2 values must be between 0 and 1'
    assert all(0 <= x <= 1 for x in pi), \
        'pi values for spike & slab must be between 0 and 1'
    assert rg == [None] or all(0 <= x <= 1 for x in rg), \
        'rg values must be between 0 and 1 or None'

    if annot is not None:  # multi-trait annotation-informed
        assert rg == [None], \
            'Correlated traits not supported for annotation-informed model'
        M = mt.count_rows()
        annot_var = mt.aggregate_rows(hl.agg.stats(annot)).stdev**2
        # One beta per requested h2, with variance annot * h2 / (annot_var * M).
        return mt.annotate_rows(
            beta=hl.literal(h2).map(lambda x: hl.rand_norm(
                0, hl.sqrt(annot * x / (annot_var * M)))))

    if len(h2) > 1 and pi == [1]:  # multi-trait correlated infinitesimal
        return multitrait_inf(mt=mt, h2=h2, rg=rg)

    if len(h2) == 2 and len(pi) > 1:  # two trait correlated spike & slab
        # Compare by value: `rg is [None]` is never true, which forwarded
        # None instead of the intended default of 0.
        return multitrait_ss(mt=mt, h2=h2,
                             rg=0 if rg == [None] else rg[0],
                             pi=pi)

    if len(h2) == 1 and len(pi) == 1:  # single trait infinitesimal/spike & slab
        M = mt.count_rows()
        return mt.annotate_rows(
            beta=hl.rand_bool(pi[0])
            * hl.rand_norm(0, hl.sqrt(h2[0] / (M * pi[0]))))

    raise ValueError('Insufficient parameters')
def test_suite(mt, genotype, popstrat):
    '''Testing suite for simulation framework

    Runs ``simulate`` under seven parameter combinations and prints the
    simulated h2 values next to the observed variance of ``__y_no_noise``.
    '''
    mt = mt._annotate_all(
        row_exprs={'a1': hl.rand_norm(), 'a2': hl.rand_bool(0.1)},
        col_exprs={'popstrat': popstrat},
        entry_exprs={'gt': genotype})
    mt = mt.annotate_rows(annot=mt.a1 + mt.a2)

    n_sim = 7  # number of simulations
    sim_h2_ls = np.round(np.random.uniform(low=0, high=1, size=n_sim), 4)

    scenarios = [
        # Infinitesimal
        dict(h2=sim_h2_ls[0]),
        # Spike & slab
        dict(h2=sim_h2_ls[1], pi=0.1),
        # Annotation-informed; has same h2 as previous spike and slab to
        # check if sims match
        dict(h2=sim_h2_ls[1], annot=mt.annot),
        # Infinitesimal + population stratification, popstrat_s2 = 0.5
        dict(h2=sim_h2_ls[3], popstrat=mt.popstrat, popstrat_s2=0.5),
        # Infinitesimal + population stratification, popstrat_s2 = 0.25
        dict(h2=sim_h2_ls[3], popstrat=mt.popstrat, popstrat_s2=0.25),
        # Spike & slab + population stratification
        dict(h2=sim_h2_ls[5], pi=0.1, popstrat=mt.popstrat),
        # Annotation-informed + population stratification
        dict(h2=sim_h2_ls[6], annot=mt.annot, popstrat=mt.popstrat),
    ]
    sim_mt_ls = [simulate(mt=mt, genotype=mt.gt, **scenario)
                 for scenario in scenarios]

    for sim_mt in sim_mt_ls:
        print(sim_mt.describe())

    # Observed h2 is the variance of the noiseless phenotype.
    obs_h2_ls = [
        np.round(
            sim_mt.aggregate_cols(
                hl.agg.stats(sim_mt['__y_no_noise']).stdev**2), 4)
        for sim_mt in sim_mt_ls
    ]
    print('\nExpected h2s: {} \nObserved h2s: {}'.format(sim_h2_ls, obs_h2_ls))
#! /usr/bin/python

import sys

import hail as hl

# Usage: <script> N_SAMPLES N_VARIANTS OUTPUT_PREFIX
# Simulates a single-population dataset, replaces every genotype with a
# random call and writes it as both VCF and PLINK.
n_samples = int(sys.argv[1])
n_variants = int(sys.argv[2])
path = sys.argv[3]

mt = hl.balding_nichols_model(1, n_samples, n_variants)
mt = mt.key_cols_by(s=hl.str(mt.sample_idx))

# Genotype index is 0 or 2 with equal probability.
random_gt_index = hl.rand_bool(0.5) * 2
mt = mt.annotate_entries(GT=hl.unphased_diploid_gt_index_call(random_gt_index))

hl.export_vcf(mt, path + ".vcf")
hl.export_plink(mt, path)
import os, shutil import hail as hl if not os.path.isdir("output/"): os.mkdir("output/") files = ["sample.vds", "sample.qc.vds", "sample.filtered.vds"] for f in files: if os.path.isdir(f): shutil.rmtree(f) ds = hl.import_vcf('data/sample.vcf.bgz') ds = ds.sample_rows(0.03) ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5), panel_maf=0.1, anno1=5, anno2=0, consequence="LOF", gene="A", score=5.0) ds = ds.annotate_rows(a_index=1) ds = hl.sample_qc(hl.variant_qc(ds)) ds = ds.annotate_cols(is_case=True, pheno=hl.struct(is_case=hl.rand_bool(0.5), is_female=hl.rand_bool(0.5), age=hl.rand_norm(65, 10), height=hl.rand_norm(70, 10), blood_pressure=hl.rand_norm(120, 20), cohort_name="cohort1"), cov=hl.struct(PC1=hl.rand_norm(0, 1)), cov1=hl.rand_norm(0, 1),
#! /usr/bin/python

import sys

import hail as hl

# Usage: <script> N_SAMPLES N_VARIANTS OUTPUT_PREFIX
# Writes a random dataset as VCF and PLINK, plus a "chimera" dataset whose
# single sample "0" is stitched together from samples "0" and "1".
n_samples = int(sys.argv[1])
n_variants = int(sys.argv[2])
path = sys.argv[3]

mt = hl.balding_nichols_model(1, n_samples, n_variants)
mt = mt.key_cols_by(s=hl.str(mt.sample_idx))

# Genotype index is 0 or 2 with equal probability.
random_gt_index = hl.rand_bool(0.5) * 2
mt = mt.annotate_entries(GT=hl.unphased_diploid_gt_index_call(random_gt_index))

hl.export_vcf(mt, path + ".vcf")
hl.export_plink(mt, path)

# Positions below the midpoint come from sample "0" ...
chimera0 = mt.filter_rows(mt.locus.position < n_variants / 2)
chimera0 = chimera0.filter_cols(chimera0.s == "0")

# ... and the remaining positions from sample "1", re-keyed as "0".
chimera1 = mt.filter_rows(mt.locus.position >= n_variants / 2)
chimera1 = chimera1.filter_cols(chimera1.s == "1")
chimera1 = chimera1.key_cols_by(s="0")

mt2 = chimera0.union_rows(chimera1)

chimera_path = path + "-chimera"
hl.export_vcf(mt2, path + "-chimera.vcf")
hl.export_plink(mt2, chimera_path)
def generate_datasets(doctest_namespace, output_dir):
    """Populate `doctest_namespace` with the datasets and values used by doctests.

    :param doctest_namespace: Mapping that receives every name the doctests
        refer to (``hl``, ``ds``, ``table1``, ...).
    :param output_dir: Object whose ``name`` attribute is the directory the
        checkpointed datasets are written to (presumably a
        ``tempfile.TemporaryDirectory`` -- confirm against the caller).
    """
    doctest_namespace['hl'] = hl

    # Remove dataset directories if they exist in the working directory.
    files = ["sample.vds", "sample.qc.vds", "sample.filtered.vds"]
    for f in files:
        if os.path.isdir(f):
            shutil.rmtree(f)

    # Main example dataset: a 3% row sample of the example VCF, decorated
    # with constant and random annotations.
    ds = hl.import_vcf('data/sample.vcf.bgz')
    ds = ds.sample_rows(0.03)
    ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5),
                          panel_maf=0.1,
                          anno1=5,
                          anno2=0,
                          consequence="LOF",
                          gene="A",
                          score=5.0)
    ds = ds.annotate_rows(a_index=1)
    ds = hl.sample_qc(hl.variant_qc(ds))
    ds = ds.annotate_cols(is_case=True,
                          pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                          is_female=hl.rand_bool(0.5),
                                          age=hl.rand_norm(65, 10),
                                          height=hl.rand_norm(70, 10),
                                          blood_pressure=hl.rand_norm(120, 20),
                                          cohort_name="cohort1"),
                          cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                          cov1=hl.rand_norm(0, 1),
                          cov2=hl.rand_norm(0, 1),
                          cohort="SIGMA")
    ds = ds.annotate_globals(
        global_field_1=5,
        global_field_2=10,
        pli={
            'SCN1A': 0.999,
            'SONIC': 0.014
        },
        populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
    # Note: this replaces the string `gene` row field set above with an array.
    ds = ds.annotate_rows(gene=['TTN'])
    ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
    # Write the dataset to disk; later names refer to the checkpointed copy.
    ds = ds.checkpoint(f'{output_dir.name}/example.vds', overwrite=True)

    # Several doctest names share the same underlying dataset.
    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    # Row (variant) and column (sample) metadata tables derived from `ds`.
    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(
        consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata
    doctest_namespace['cols_to_keep'] = s_metadata
    doctest_namespace['cols_to_remove'] = s_metadata
    doctest_namespace['rows_to_keep'] = v_metadata
    doctest_namespace['rows_to_remove'] = v_metadata

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    table4 = hl.import_table('data/kt_example4.tsv',
                             impute=True,
                             types={
                                 'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                 'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                 'E': hl.tstruct(A=hl.tint32, B=hl.tint32)
                             })
    doctest_namespace['table4'] = table4

    people_table = hl.import_table('data/explode_example.tsv',
                                   delimiter='\\s+',
                                   types={
                                       'Age': hl.tint32,
                                       'Children': hl.tarray(hl.tstr)
                                   },
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.null(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({
        'Alice': 43,
        'Bob': 33,
        'Charles': 44
    })
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval(
        "1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    doctest_namespace['interval2'] = hl.Interval(3, 6)
    doctest_namespace['nd'] = hl._ndarray([[1, 2], [3, 4]])

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv", impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen',
                          entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    # Burden-test dataset: VCF joined with per-sample data, a position-based
    # row weight, variant QC and gene intervals, then checkpointed.
    burden_ds = hl.import_vcf('data/example_burden.vcf')
    burden_kt = hl.import_table('data/example_burden.tsv',
                                key='Sample',
                                impute=True)
    burden_ds = burden_ds.annotate_cols(burden=burden_kt[burden_ds.s])
    burden_ds = burden_ds.annotate_rows(
        weight=hl.float64(burden_ds.locus.position))
    burden_ds = hl.variant_qc(burden_ds)
    genekt = hl.import_locus_intervals('data/gene.interval_list')
    burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
    burden_ds = burden_ds.checkpoint(f'{output_dir.name}/example_burden.vds',
                                     overwrite=True)
    doctest_namespace['burden_ds'] = burden_ds

    print("finished setting up doctest...")
def make_betas(mt, h2, pi=None, annot=None, rg=None):
    r"""Generates betas under different models.

    Simulates betas (SNP effects) under the infinitesimal, spike & slab, or
    annotation-informed models, depending on parameters passed.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        MatrixTable containing genotypes to be used. Also should contain
        variant annotations as row fields if running the annotation-informed
        model or covariates as column fields if adding population stratification.
    h2 : :obj:`float` or :obj:`int` or :obj:`list` or :class:`numpy.ndarray`
        SNP-based heritability of simulated trait(s).
    pi : :obj:`float` or :obj:`int` or :obj:`list` or :class:`numpy.ndarray`, optional
        Probability of SNP being causal when simulating under the spike & slab
        model. If doing two-trait spike & slab `pi` is a list of probabilities for
        overlapping causal SNPs (see docstring of :func:`.multitrait_ss`)
    annot : :class:`.Expression`, optional
        Row field of aggregated annotations for annotation-informed model.
    rg : :obj:`float` or :obj:`int` or :obj:`list` or :class:`numpy.ndarray`, optional
        Genetic correlation between traits.

    Returns
    -------
    mt : :class:`.MatrixTable`
        :class:`.MatrixTable` with betas as a row field, simulated according to specified model.
    pi : :obj:`list`
        Probability of a SNP being causal for different traits, possibly altered
        from input `pi` if covariance matrix for multitrait simulation was not
        positive semi-definite.
    rg : :obj:`list`
        Genetic correlation between traits, possibly altered from input `rg` if
        covariance matrix for multitrait simulation was not positive semi-definite.

    Raises
    ------
    AssertionError
        If an `h2` or `pi` value is outside [0, 1], an `rg` value is outside
        [-1, 1], or `rg` is given together with `annot`.
    ValueError
        If the parameters do not match any supported model.
    """
    def _as_list(value):
        # ndarray -> list, list -> unchanged, anything else -> [value].
        if type(value) is np.ndarray:
            return value.tolist()
        return value if type(value) is list else [value]

    h2 = _as_list(h2)
    pi = _as_list(pi)
    rg = _as_list(rg)

    assert all(0 <= x <= 1 for x in h2), 'h2 values must be between 0 and 1'
    # Compare by value: `pi is not [None]` was always true, so `pi` was
    # never actually validated.
    assert pi == [None] or all(0 <= x <= 1 for x in pi), \
        'pi values for spike & slab must be between 0 and 1'
    assert rg == [None] or all(-1 <= x <= 1 for x in rg), \
        'rg values must be between -1 and 1 or None'

    if annot is not None:  # multi-trait annotation-informed
        assert rg == [None], \
            'Correlated traits not supported for annotation-informed model'
        # M was previously undefined in this branch (NameError).
        M = mt.count_rows()
        annot_sum = mt.aggregate_rows(hl.agg.sum(annot))
        mt = mt.annotate_rows(beta=hl.literal(h2).map(
            lambda x: hl.rand_norm(0, hl.sqrt(annot * x / (annot_sum * M)))))
    elif len(h2) > 1 and (pi == [None] or pi == [1]):
        # multi-trait correlated infinitesimal
        mt, rg = multitrait_inf(mt=mt, h2=h2, rg=rg)
    elif len(h2) == 2 and len(pi) > 1 and len(rg) == 1:
        # two trait correlated spike & slab
        print('multitrait ss')
        # Compare by value: `rg is [None]` is never true, which forwarded
        # None instead of the intended default of 0.
        mt, pi, rg = multitrait_ss(mt=mt, h2=h2,
                                   rg=0 if rg == [None] else rg[0],
                                   pi=pi)
    elif len(h2) == 1 and len(pi) == 1:
        # single trait infinitesimal/spike & slab
        M = mt.count_rows()
        pi_temp = 1 if pi == [None] else pi[0]
        mt = mt.annotate_rows(
            beta=hl.rand_bool(pi_temp)
            * hl.rand_norm(0, hl.sqrt(h2[0] / (M * pi_temp))))
    else:
        raise ValueError('Parameters passed do not match any models.')
    return mt, pi, rg
def run_rf_test(
        mt: hl.MatrixTable, output: str = "/tmp"
) -> Tuple[pyspark.ml.PipelineModel, hl.MatrixTable]:
    """
    Runs a dummy test RF on a given MT.

    1. Creates row annotations and labels to run model on
    2. Trains a RF pipeline model (including median imputation of missing values in created annotations)
    3. Saves the RF pipeline model
    4. Applies the model to the rows of the MT

    :param mt: Input MT
    :param output: Output files prefix to save the RF model
    :return: RF model and the result of applying the RF model to the annotated rows
    """
    mt = mt.annotate_rows(
        feature1=hl.rand_bool(0.1),
        feature2=hl.rand_norm(0.0, 1.0),
        # feature3 is missing for roughly half of the rows so that the
        # imputation step below has something to fill in.
        feature3=hl.or_missing(hl.rand_bool(0.5), hl.rand_norm(0.0, 1.0)),
    )
    mt = mt.annotate_rows(
        label=hl.cond(mt["feature1"] & (mt["feature2"] > 0), "TP", "FP"))
    ht = mt.rows()

    def f3stats(ht):
        # Number of defined feature3 values and their median.
        return ht.aggregate(
            hl.struct(
                n=hl.agg.count_where(hl.is_defined(ht["feature3"])),
                med=hl.median(hl.agg.collect(ht["feature3"])),
            ))

    f3_before_imputation = f3stats(ht)
    logger.info("Feature3 defined values before imputation: {}".format(
        f3_before_imputation.n))
    logger.info("Feature3 median: {}".format(f3_before_imputation.med))

    features_to_impute = ["feature3"]
    quantiles = get_columns_quantiles(ht, features_to_impute, [0.5])
    quantiles = {k: v[0] for k, v in quantiles.items()}

    # Join the per-feature lines; formatting the bare generator expression
    # logged "<generator object ...>" instead of the medians.
    logger.info("Features median:\n{}".format("".join(
        f"{k}: {v}\n" for k, v in quantiles.items())))
    ht = ht.annotate(
        **{f: hl.or_else(ht[f], quantiles[f]) for f in features_to_impute})
    ht = ht.annotate_globals(medians=quantiles)

    f3_after_imputation = f3stats(ht)
    logger.info("Feature3 defined values after imputation: {}".format(
        f3_after_imputation.n))
    logger.info("Feature3 median: {}".format(f3_after_imputation.med))

    ht = ht.select("label", "feature1", "feature2", "feature3")

    label = "label"
    features = ["feature1", "feature2", "feature3"]

    rf_model = train_rf(ht, features, label)

    # Round-trip the model through storage so that saving and loading are
    # exercised as well.
    model_path = output + "/rf.model"
    save_model(rf_model, out_path=model_path, overwrite=True)
    rf_model = load_model(model_path)

    return rf_model, apply_rf_model(ht, rf_model, features, label)
reference_genome='GRCh37') for chrom in range(2, 23): mtT = hl.import_plink(bed='gs://.../ukb_imp_chr%s_v3.bed' % chrom, bim='gs://.../ukb_imp_chr%s_v3.bim' % chrom, fam='gs://.../ukb_imp_chr%s_v3.fam' % chrom, reference_genome='GRCh37') mt = mt.union_rows(mtT) # Keep only unrelated (~361k samples) tb2 = hl.import_table('gs://.../unrelated_samples.txt', impute=True) tb2 = tb2.annotate(s_str=hl.str(tb2.s)).key_by('s_str') mt = mt.semi_join_cols(tb2).add_col_index() mt = mt.annotate_cols(s_index=mt.col_idx).key_cols_by('s_index') # Extract 350k samples + basic QC random.seed(123) indices = random.sample(range(mt.count_cols()), 350000) mt = mt.choose_cols(list(indices)) mt = hl.variant_qc(mt) mt = mt.filter_rows((mt.variant_qc.AF[1] >= 0.05) & (mt.variant_qc.AF[1] <= 0.95)) # Randomly assign sex (1 F, 0 M) mt = mt.annotate_cols(sex=hl.cond(hl.rand_bool(0.5, seed=123), 1, 0)) # Simulate phenotypes and save MatrixTable sim = hl.experimental.ldscsim.simulate_phenotypes(mt, mt.GT, h2=h2, rg=rg) print(sim.describe()) sim.write(out, overwrite=True)
def generate_datasets(doctest_namespace):
    """Fill ``doctest_namespace`` with the example objects used by doctests.

    Reads the example inputs under ``data/``, checkpoints three matrix tables
    under ``output/`` and stores every resulting dataset, table and
    expression in ``doctest_namespace`` under a fixed name. Prints a message
    when done.

    :param doctest_namespace: mapping that receives the example objects.
    """
    ns = doctest_namespace
    ns['hl'] = hl
    ns['np'] = np

    # Main example matrix table
    example_mt = hl.import_vcf('data/sample.vcf.bgz').sample_rows(0.03)
    example_mt = example_mt.annotate_rows(
        use_as_marker=hl.rand_bool(0.5),
        panel_maf=0.1,
        anno1=5,
        anno2=0,
        consequence="LOF",
        gene="A",
        score=5.0,
    )
    example_mt = example_mt.annotate_rows(a_index=1)
    example_mt = hl.sample_qc(hl.variant_qc(example_mt))

    pheno = hl.struct(
        is_case=hl.rand_bool(0.5),
        is_female=hl.rand_bool(0.5),
        age=hl.rand_norm(65, 10),
        height=hl.rand_norm(70, 10),
        blood_pressure=hl.rand_norm(120, 20),
        cohort_name="cohort1",
    )
    cov = hl.struct(PC1=hl.rand_norm(0, 1))
    example_mt = example_mt.annotate_cols(
        is_case=True,
        pheno=pheno,
        cov=cov,
        cov1=hl.rand_norm(0, 1),
        cov2=hl.rand_norm(0, 1),
        cohort="SIGMA",
    )
    example_mt = example_mt.annotate_globals(
        global_field_1=5,
        global_field_2=10,
        pli={'SCN1A': 0.999, 'SONIC': 0.014},
        populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'],
    )
    example_mt = example_mt.annotate_rows(gene=['TTN'])
    example_mt = example_mt.annotate_cols(cohorts=['1kg'], pop='EAS')
    example_mt = example_mt.checkpoint('output/example.mt', overwrite=True)

    ns['ds'] = ns['dataset'] = example_mt
    ns['dataset2'] = example_mt.annotate_globals(global_field=5)
    ns['dataset_to_union_1'] = ns['dataset_to_union_2'] = example_mt

    row_meta = example_mt.rows().annotate_globals(global_field=5)
    row_meta = row_meta.annotate(consequence='SYN')
    ns['v_metadata'] = row_meta

    col_meta = example_mt.cols().annotate(pop='AMR', is_case=False, sex='F')
    ns['s_metadata'] = ns['cols_to_keep'] = ns['cols_to_remove'] = col_meta
    ns['rows_to_keep'] = ns['rows_to_remove'] = row_meta

    simulated = hl.balding_nichols_model(3, 4, 4)
    ns['small_mt'] = simulated.checkpoint('output/small.mt', overwrite=True)

    # Table
    first_table = hl.import_table('data/kt_example1.tsv',
                                  impute=True,
                                  key='ID')
    first_table = first_table.annotate_globals(global_field_1=5,
                                               global_field_2=10)
    ns['table1'] = ns['other_table'] = first_table

    ns['table2'] = hl.import_table('data/kt_example2.tsv',
                                   impute=True,
                                   key='ID')

    table4_types = {
        'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
        'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
        'E': hl.tstruct(A=hl.tint32, B=hl.tint32),
    }
    ns['table4'] = hl.import_table('data/kt_example4.tsv',
                                   impute=True,
                                   types=table4_types)

    people_types = {'Age': hl.tint32, 'Children': hl.tarray(hl.tstr)}
    ns['people_table'] = hl.import_table('data/explode_example.tsv',
                                         delimiter='\\s+',
                                         types=people_types,
                                         key='Name')

    # TDT
    ns['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    qc_mt = hl.variant_qc(example_mt)
    ns['ds2'] = qc_mt.select_rows(AF=qc_mt.variant_qc.AF)

    # Expressions
    expression_examples = {
        'names': hl.literal(['Alice', 'Bob', 'Charlie']),
        'a1': hl.literal([0, 1, 2, 3, 4, 5]),
        'a2': hl.literal([1, -1, 1, -1, 1, -1]),
        't': hl.literal(True),
        'f': hl.literal(False),
        'na': hl.null(hl.tbool),
        'call': hl.call(0, 1, phased=False),
        'a': hl.literal([1, 2, 3, 4, 5]),
        'd': hl.literal({'Alice': 43, 'Bob': 33, 'Charles': 44}),
        'interval': hl.interval(3, 11),
        'locus_interval': hl.parse_locus_interval("1:53242-90543"),
        'locus': hl.locus('1', 1034245),
        'x': hl.literal(3),
        'y': hl.literal(4.5),
        's1': hl.literal({1, 2, 3}),
        's2': hl.literal({1, 3, 5}),
        's3': hl.literal({'Alice', 'Bob', 'Charlie'}),
        'struct': hl.struct(a=5, b='Foo'),
        'tup': hl.literal(("a", 1, [1, 2, 3])),
        's': hl.literal('The quick brown fox'),
        'interval2': hl.Interval(3, 6),
        'nd': hl._nd.array([[1, 2], [3, 4]]),
    }
    for example_name, example_value in expression_examples.items():
        ns[example_name] = example_value

    # Overview
    ns['ht'] = hl.import_table("data/kt_example1.tsv", impute=True)
    ns['mt'] = example_mt

    example_rows = example_mt.rows()
    ns['gnomad_data'] = example_rows.select(example_rows.info.AF)

    # BGEN
    bgen_mt = hl.import_bgen('data/example.8bits.bgen',
                             entry_fields=['GT', 'GP', 'dosage'])
    ns['variants_table'] = bgen_mt.rows()

    # Burden dataset
    burden_mt = hl.import_vcf('data/example_burden.vcf')
    burden_samples = hl.import_table('data/example_burden.tsv',
                                     key='Sample',
                                     impute=True)
    burden_mt = burden_mt.annotate_cols(burden=burden_samples[burden_mt.s])
    burden_mt = burden_mt.annotate_rows(
        weight=hl.float64(burden_mt.locus.position))
    burden_mt = hl.variant_qc(burden_mt)
    gene_intervals = hl.import_locus_intervals('data/gene.interval_list')
    burden_mt = burden_mt.annotate_rows(gene=gene_intervals[burden_mt.locus])
    burden_mt = burden_mt.checkpoint('output/example_burden.vds',
                                     overwrite=True)
    ns['burden_ds'] = burden_mt

    # LD score regression summary statistics
    one_pheno_types = {
        'locus': hl.tlocus('GRCh37'),
        'alleles': hl.tarray(hl.tstr),
        'chi_squared': hl.tfloat64,
        'n': hl.tint32,
        'ld_score': hl.tfloat64,
        'phenotype': hl.tstr,
        'chi_squared_50_irnt': hl.tfloat64,
        'n_50_irnt': hl.tint32,
        'chi_squared_20160': hl.tfloat64,
        'n_20160': hl.tint32,
    }
    ns['ld_score_one_pheno_sumstats'] = hl.import_table(
        'data/ld_score_regression.one_pheno.sumstats.tsv',
        types=one_pheno_types,
        key=['locus', 'alleles'])

    sumstats_mt = hl.import_matrix_table(
        'data/ld_score_regression.all_phenos.sumstats.tsv',
        row_fields={
            'locus': hl.tstr,
            'alleles': hl.tstr,
            'ld_score': hl.tfloat64,
        },
        entry_type=hl.tstr)
    sumstats_mt = sumstats_mt.key_cols_by(phenotype=sumstats_mt.col_id)
    sumstats_mt = sumstats_mt.key_rows_by(
        locus=hl.parse_locus(sumstats_mt.locus),
        alleles=sumstats_mt.alleles.split(','))
    sumstats_mt = sumstats_mt.drop('row_id', 'col_id')
    # Each entry is a "chi_squared,n" string; split it and convert the parts.
    sumstats_mt = sumstats_mt.annotate_entries(x=sumstats_mt.x.split(","))
    sumstats_mt = sumstats_mt.transmute_entries(
        chi_squared=hl.float64(sumstats_mt.x[0]),
        n=hl.int32(sumstats_mt.x[1]))
    sumstats_mt = sumstats_mt.annotate_rows(
        ld_score=hl.float64(sumstats_mt.ld_score))
    ns['ld_score_all_phenos_sumstats'] = sumstats_mt

    print("finished setting up doctest...")
"info_DP").write("gs://gnomad-qingbowang/gen_chr20_all_siteDP.ht", overwrite=True) print(et.count()) #sample size #agg = et.group_by('agrees_PID').aggregate(n=hl.agg.stats(et.info_DP)) #agg.write("gs://gnomad-qingbowang/phased_has_higher_cov_ex_chr20_all.ht") #agg.write("gs://gnomad-qingbowang/phased_has_higher_cov_gen_chr20_all_siteDP.ht") exomes = get_gnomad_data("exomes", release_samples=True, adj=True, release_annotations=True) #実際はgenomeだけど. #ex20 = hl.filter_intervals(exomes.select_rows("allele_info").select_cols(), [hl.parse_locus_interval("20:start-2M")]) #first 2Mb #ex20 = hl.filter_intervals(exomes.select_rows("info_DP").select_cols(), [hl.parse_locus_interval("20")]) ex20 = exomes.select_rows("info_DP").select_cols() ex20 = ex20.filter_cols( hl.rand_bool(0.01)) #~=1500 samples for small test, exome #ex20 = ex20.filter_cols(hl.rand_bool(0.1)) #~=1500 samples for small test, genome #全sampleに関して行うとしよう. ex20 = hl.filter_alleles(ex20, lambda allele, i: hl.is_snp( ex20.alleles[0], allele)) # currently take only SNP ex20 = ex20.filter_entries( ex20.GT.is_het()) # throw away unwanted entries (non alt) ex20_pair = hl.window_by_locus(ex20, 2) #just look at nearby pairs for now ex20_pair = ex20_pair.filter_entries( (hl.is_defined(ex20_pair.GT) & (ex20_pair.prev_entries.length() > 0))) ex20_pair = ex20_pair.filter_entries( ex20_pair.prev_entries.filter(lambda x: x.GT.is_het()).length() > 0) et = ex20_pair.entries() et = et.annotate(indices=hl.range(0, hl.len(et.prev_rows))) et = et.explode('indices') et = et.transmute(prev_row=et.prev_rows[et.indices],
import os, shutil import hail as hl if not os.path.isdir("output/"): os.mkdir("output/") files = ["sample.vds", "sample.qc.vds", "sample.filtered.vds"] for f in files: if os.path.isdir(f): shutil.rmtree(f) ds = hl.import_vcf('data/sample.vcf.bgz') ds = ds.sample_rows(0.03) ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5), panel_maf=0.1, anno1=5, anno2=0, consequence="LOF", gene="A", score=5.0) ds = ds.annotate_rows(a_index=1) ds = hl.sample_qc(hl.variant_qc(ds)) ds = ds.annotate_cols(is_case=True, pheno=hl.struct(is_case=hl.rand_bool(0.5), is_female=hl.rand_bool(0.5), age=hl.rand_norm(65, 10), height=hl.rand_norm(70, 10), blood_pressure=hl.rand_norm(120, 20), cohort_name="cohort1"), cov=hl.struct(PC1=hl.rand_norm(0, 1)), cov1=hl.rand_norm(0, 1),
def _pca_and_moments(A,
                     k=10,
                     num_moments=5,
                     compute_loadings=False,
                     q_iterations=2,
                     oversampling_param=2,
                     block_size=128,
                     moment_samples=100):
    """Compute a rank-``k`` reduced SVD of ``A`` plus spectral moment estimates.

    ``A`` is used directly when it is a ``TallSkinnyMatrix``; otherwise it is
    checked as an entry-indexed expression and converted with
    ``_make_tsm_from_call`` using ``block_size``.

    Returns the tuple ``(eigens, st, lt, moments, stdevs)``:

    * ``eigens`` -- evaluated ``S * S`` from the reduced SVD.
    * ``st`` -- table keyed by ``A.col_key`` with a ``scores`` field.
    * ``lt`` -- table with a ``loadings`` field, or ``None`` when
      ``compute_loadings`` is false.
    * ``moments`` / ``stdevs`` -- evaluated moment estimates (sampled part
      plus the exact part from the first factorization) and their standard
      deviations.
    """
    if not isinstance(A, TallSkinnyMatrix):
        check_entry_indexed('_spectral_moments/entry_expr', A)
        A = _make_tsm_from_call(A, block_size)

    # Dimensions of the random starting block
    sketch_width = k + oversampling_param
    n_cols = A.ncols

    # Gaussian starting block, orthonormalized via QR
    start_block = hl.nd.zeros(
        (n_cols, sketch_width)).map(lambda _: hl.rand_norm(0, 1))
    start_block = hl.nd.qr(start_block)[0]._persist()

    fact = _krylov_factorization(A, start_block, q_iterations,
                                 compute_loadings)

    info("_reduced_svd: Computing local SVD")
    U, S, V = fact.reduced_svd(k)

    p = min(num_moments // 2, 10)

    # Random +/-1 block used for moment estimation
    sign_block = hl.nd.zeros((n_cols, moment_samples)).map(
        lambda _: hl.if_else(hl.rand_bool(0.5), -1, 1))
    # Remove the components lying in the subspace fact.V; their contribution
    # is added back exactly below.
    sign_block = sign_block - fact.V @ (fact.V.T @ sign_block)
    Q1, R1 = hl.nd.qr(sign_block)._persist()
    fact2 = _krylov_factorization(A, Q1, p, compute_U=False)
    sampled = fact2.spectral_moments(num_moments, R1)

    # Exact contribution of the already-computed singular values
    exact_terms = []
    for power in range(1, num_moments + 1):
        exact_terms.append(fact.S.map(lambda sv: sv**(2 * power)).sum())
    combined_moments = sampled.moments + hl.nd.array(exact_terms)

    evaluated = hl.eval(
        hl.struct(moments=combined_moments, stdevs=sampled.stdevs))
    moments = evaluated.moments
    stdevs = evaluated.stdevs

    scores = V * S
    eigens = hl.eval(S * S)
    info("blanczos_pca: SVD Complete. Computing conversion to PCs.")

    score_rows = scores._data_array()
    cols_with_scores = hl.zip(
        A.source_table.index_globals().cols,
        score_rows).map(lambda pair: pair[0].annotate(scores=pair[1]))
    st = hl.Table.parallelize(cols_with_scores, key=A.col_key)

    lt = None
    if compute_loadings:
        idx_name = '_tmp_pca_loading_index'
        lt = A.source_table.select()
        lt = lt.annotate_globals(U=U)
        lt = lt.add_index(idx_name)
        lt = lt.annotate(
            loadings=lt.U[lt[idx_name], :]._data_array()).select_globals()
        lt = lt.drop(lt[idx_name])

    return eigens, st, lt, moments, stdevs
import hail as hl import hail.expr.aggregators as agg from hail.stats import * from hail.utils.java import warn if not os.path.isdir("output/"): os.mkdir("output/") files = ["sample.vds", "sample.qc.vds", "sample.filtered.vds"] for f in files: if os.path.isdir(f): shutil.rmtree(f) ds = hl.import_vcf('data/sample.vcf.bgz') ds = ds.sample_rows(0.03) ds = ds.annotate_rows(use_in_kinship=hl.rand_bool(0.9), panel_maf=0.1, anno1=5, anno2=0, consequence="LOF", gene="A", score=5.0) ds = ds.annotate_rows(a_index=1) ds = hl.sample_qc(hl.variant_qc(ds)) ds = ds.annotate_cols(is_case=True, pheno=hl.struct(is_case=hl.rand_bool(0.5), is_female=hl.rand_bool(0.5), age=hl.rand_norm(65, 10), height=hl.rand_norm(70, 10), blood_pressure=hl.rand_norm(120, 20), cohort_name="cohort1"),
def test_plot_roc_curve(self):
    """Smoke-test ``hl.experimental.plot_roc_curve`` on random scores."""
    scored = hl.utils.range_table(100).annotate(score1=hl.rand_norm(),
                                                score2=hl.rand_norm())

    # True positives are only drawn among rows with a positive score1.
    is_tp = hl.cond(scored.score1 > 0, hl.rand_bool(0.7), False)
    scored = scored.annotate(tp=is_tp, score3=scored.score1 + hl.rand_norm())

    # False positives are only drawn among rows that are not true positives.
    is_fp = hl.cond(~scored.tp, hl.rand_bool(0.2), False)
    ht = scored.annotate(fp=is_fp)

    score_fields = ['score1', 'score2', 'score3']
    _, aucs = hl.experimental.plot_roc_curve(ht, score_fields)