def test_dndarray_sum():
    """Check that adding two distributed arrays matches NumPy addition."""
    n_variants = 10
    n_samples = 10
    block_size = 3
    # presumably 4 x 4 blocks for a 10 x 10 array cut into blocks of 3
    n_blocks = 16

    def simulate_dosages():
        # Simulate genotypes and keep only a float alt-allele count per entry.
        simulated = hl.balding_nichols_model(n_populations=2,
                                             n_variants=n_variants,
                                             n_samples=n_samples)
        return simulated.select_entries(
            dosage=hl.float(simulated.GT.n_alt_alleles()))

    left_mt = simulate_dosages()
    right_mt = simulate_dosages()

    left_da = hl.experimental.dnd.array(left_mt, 'dosage', block_size=block_size)
    right_da = hl.experimental.dnd.array(right_mt, 'dosage', block_size=block_size)
    summed = (left_da + right_da).checkpoint(new_temp_file())
    assert summed._force_count_blocks() == n_blocks
    observed = summed.collect()

    shape = (n_variants, n_samples)
    left_np = np.array(left_mt.dosage.collect()).reshape(shape)
    right_np = np.array(right_mt.dosage.collect()).reshape(shape)
    expected = left_np + right_np

    assert np.array_equal(observed, expected)
def test_pc_project(self):
    """Project a second simulated dataset onto loadings computed from the first."""
    reference = hl.balding_nichols_model(3, 100, 50)
    _eigenvalues, _scores, loadings_ht = hl.hwe_normalized_pca(
        reference.GT, k=10, compute_loadings=True)

    # Attach the per-row alternate allele frequency to the loadings table.
    alt_allele_freq = hl.agg.mean(reference.GT.n_alt_alleles()) / 2
    reference = reference.annotate_rows(af=alt_allele_freq)
    row_af = reference.rows()[loadings_ht.key].af
    loadings_ht = loadings_ht.annotate(af=row_af)

    target = hl.balding_nichols_model(3, 100, 50)
    projected = hl.experimental.pc_project(
        target.GT, loadings_ht.loadings, loadings_ht.af)
    assert projected._force_count() == 100
def test_pcrelate(self):
    """Smoke-test pc_relate: it must return a Table that can be counted."""
    dataset = hl.balding_nichols_model(3, 100, 100)
    dataset = dataset.annotate_cols(sample_idx=hl.str(dataset.sample_idx))
    t = hl.pc_relate(dataset, 2, 0.05, block_size=64, statistics="phi")

    # assertIsInstance reports the actual type on failure, unlike
    # assertTrue(isinstance(...)), which only says "False is not true".
    self.assertIsInstance(t, hl.Table)
    # Force evaluation of the result.
    t.count()
def test_self_kinship():
    """Check that include_self_kinship adds only the (i, i) pairs to pc_relate output."""
    mt = hl.balding_nichols_model(3, 10, 50)
    shared_kwargs = dict(k=2, statistics='kin20', block_size=16)

    with_self = hl.pc_relate(mt.GT, 0.10, include_self_kinship=True, **shared_kwargs)
    without_self = hl.pc_relate(mt.GT, 0.10, **shared_kwargs)

    # Expected counts: 55 pairs with self-pairs, 45 without.
    assert with_self.count() == 55
    assert without_self.count() == 45

    self_pairs = with_self.filter(
        with_self.i.sample_idx == with_self.j.sample_idx)
    assert self_pairs.count() == 10, self_pairs.collect()

    cross_pairs = with_self.filter(
        with_self.i.sample_idx != with_self.j.sample_idx)
    assert cross_pairs.count() == 45, cross_pairs.collect()
    assert cross_pairs._same(without_self)

    stray_self_pairs = without_self.filter(
        without_self.i.sample_idx == without_self.j.sample_idx)
    assert stray_self_pairs.count() == 0, stray_self_pairs.collect()
def test_pcrelate_paths():
    """Exercise pc_relate with several statistics, thresholds and score sources."""
    mt = hl.balding_nichols_model(3, 50, 100)
    _, scores3, _ = hl.hwe_normalized_pca(mt.GT, k=3, compute_loadings=False)

    kin1 = hl.pc_relate(mt.GT, 0.10, k=2, statistics='kin', block_size=64)
    kin2 = hl.pc_relate(mt.GT, 0.05, k=2, min_kinship=0.01,
                        statistics='kin2', block_size=128).cache()
    kin3 = hl.pc_relate(mt.GT, 0.02, k=3, min_kinship=0.1,
                        statistics='kin20', block_size=64).cache()

    # Same analysis as kin1, but with the first two precomputed scores passed in.
    first_two_scores = scores3[mt.col_key].scores[:2]
    kin_s1 = hl.pc_relate(mt.GT, 0.10, scores_expr=first_two_scores,
                          statistics='kin', block_size=32)

    assert kin1._same(kin_s1, tolerance=1e-4)
    assert kin1.count() == 50 * 49 / 2

    # With min_kinship set, some rows must remain and none may fall below it.
    for thresholded, min_kinship in ((kin2, 0.01), (kin3, 0.1)):
        assert thresholded.count() > 0
        assert thresholded.filter(thresholded.kin < min_kinship).count() == 0
def test_pcrelate_issue_5263():
    """pc_relate must use the entry expression it is given, not a field named GT."""
    mt = hl.balding_nichols_model(3, 50, 100)
    expected = hl.pc_relate(mt.GT, 0.10, k=2, statistics='all')

    # Move the real genotypes to GT2 and put random calls in GT.
    random_call = hl.call(hl.rand_bool(0.5), hl.rand_bool(0.5))
    mt = mt.select_entries(GT2=mt.GT, GT=random_call)

    actual = hl.pc_relate(mt.GT2, 0.10, k=2, statistics='all')
    assert expected._same(actual, tolerance=1e-4)
def test_manhattan_plot():
    """A locus-vs-p-value plot should label the x axis with contigs 1-22, X and Y."""
    rows = hl.balding_nichols_model(3, 10, 100).rows()
    rows = rows.annotate(pval=.02)

    points = ggplot(rows, aes(x=rows.locus, y=-hl.log10(rows.pval))) + geom_point()
    fig = points + geom_hline(yintercept=-math.log10(5e-8))
    pfig = fig.to_plotly()

    # ('1', '2', ..., '22', 'X', 'Y')
    expected_ticks = tuple(str(contig) for contig in range(1, 23)) + ('X', 'Y')
    assert pfig.layout.xaxis.ticktext == expected_ticks
def pc_relate_big():
    """Run pc_relate on a large simulated dataset with random two-component scores."""
    dimension = 2 * 4096
    checkpoint_path = hl.utils.new_temp_file(extension='mt')
    mt = hl.balding_nichols_model(3, dimension, dimension).checkpoint(checkpoint_path)

    # Two uniform random "scores" per column stand in for principal components.
    random_scores = hl.range(2).map(lambda x: hl.rand_unif(0, 1))
    mt = mt.annotate_cols(scores=random_scores)

    rel = hl.pc_relate(mt.GT, 0.05, scores_expr=mt.scores,
                       statistics='kin', min_kinship=0.05)
    rel._force_count()
def test_hwe_normalized_pca():
    """Check the shapes and types returned by hl.hwe_normalized_pca.

    With compute_loadings=True a loadings Table is returned; with
    compute_loadings=False the loadings slot is None.
    """
    mt = hl.balding_nichols_model(3, 100, 50)
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        mt.GT, k=2, compute_loadings=True)

    assert len(eigenvalues) == 2
    assert isinstance(scores, hl.Table)
    # This used to be a bare `scores.count() == 100` expression whose result
    # was discarded, so the row count was never actually checked.
    assert scores.count() == 100
    assert isinstance(loadings, hl.Table)

    _, _, loadings = hl.hwe_normalized_pca(mt.GT, k=2, compute_loadings=False)
    assert loadings is None
def test_rrm(self):
    """Compare hl.realized_relationship_matrix against a direct NumPy computation."""
    seed = 0
    n1 = 100
    m1 = 200
    k = 3
    fst = .9

    dataset = hl.balding_nichols_model(k, n1, m1, fst=(k * [fst]), seed=seed, n_partitions=4)
    dataset = dataset.annotate_cols(s=hl.str(dataset.sample_idx)).key_cols_by('s')

    def direct_calculation(ds):
        genotypes = BlockMatrix.from_entry_expr(ds['GT'].n_alt_alleles()).to_numpy()

        def is_constant(row):
            # True when every genotype lies within 0.01 of the same value in {0, 1, 2}.
            return any(all((gt < c + .01) and (gt > c - .01) for gt in row)
                       for c in range(3))

        # filter out constant rows
        variable_rows = np.array([row for row in genotypes if not is_constant(row)])
        nvariants, nsamples = variable_rows.shape

        # Negative entries are left out of the sums but still count in nsamples.
        means = [sum([g for g in row if g >= 0]) / nsamples
                 for row in variable_rows]
        stddevs = [sqrt(sum([g ** 2 for g in row if g >= 0]) / nsamples - mu ** 2)
                   for row, mu in zip(variable_rows, means)]

        standardized = np.array([[(g - mu) / sd for g in row]
                                 for row, mu, sd in zip(variable_rows, means, stddevs)])
        return (standardized.T @ standardized) / nvariants

    def hail_calculation(ds):
        matrix = hl.realized_relationship_matrix(ds['GT'])
        tsv_path = utils.new_temp_file(suffix='.tsv')
        matrix.export_tsv(tsv_path)
        with open(utils.uri_path(tsv_path)) as f:
            f.readline()  # discard the first line (presumably a header row)
            parsed = [list(map(float, line.strip().split())) for line in f]
        return np.array(parsed)

    manual = direct_calculation(dataset)
    from_hail = hail_calculation(dataset)

    self.assertTrue(np.allclose(manual, from_hail))
def test_pca(self):
    """Check the shapes and types returned by hl.pca with and without loadings."""
    dataset = hl.balding_nichols_model(3, 100, 100)
    eigenvalues, scores, loadings = hl.pca(dataset.GT.n_alt_alleles(),
                                           k=2,
                                           compute_loadings=True)

    self.assertEqual(len(eigenvalues), 2)
    # The dedicated assertions report the offending value or type on failure.
    self.assertIsInstance(scores, hl.Table)
    self.assertEqual(scores.count(), 100)
    self.assertIsInstance(loadings, hl.Table)
    self.assertEqual(loadings.count(), 100)

    _, _, loadings = hl.pca(dataset.GT.n_alt_alleles(), k=2, compute_loadings=False)
    # Identity check: do not rely on how the returned object implements `==`.
    self.assertIsNone(loadings)
def test_medium_collect():
    """Collecting a distributed array must reproduce the entries in matrix order."""
    n_variants = 100
    n_samples = 100
    block_size = 32

    simulated = hl.balding_nichols_model(n_populations=2,
                                         n_variants=n_variants,
                                         n_samples=n_samples)
    simulated = simulated.select_entries(
        dosage=hl.float(simulated.GT.n_alt_alleles()))

    distributed = hl.experimental.dnd.array(simulated, 'dosage', block_size=block_size)
    expected = np.array(simulated.dosage.collect()).reshape((n_variants, n_samples))

    assert np.array_equal(distributed.collect(), expected)
def test(self):
    """Smoke-test .show() on a variety of matrix-table and table expressions."""
    mt = hl.balding_nichols_model(3, 10, 10)

    # Plain field references on the matrix table.
    mt.GT.show()
    mt.locus.show()
    mt.af.show()
    mt.pop.show()
    mt.sample_idx.show()
    mt.bn.show()
    mt.bn.fst.show()

    # Derived expressions.
    n_alt = mt.GT.n_alt_alleles()
    n_alt.show()
    (n_alt * n_alt).show()
    (mt.af * n_alt).show()

    # Fields and derived expressions on the rows table.
    rows = mt.rows()
    rows.af.show()
    (rows.af * 3).show()
def test_DB(self):
    """Annotate simulated rows with DANN from the annotation DB and check the scores."""
    mt = hl.balding_nichols_model(n_populations=3, n_samples=50, n_variants=10010)
    db = hl.experimental.DB()
    mt = db.annotate_rows_db(mt, "DANN")
    actual = mt.filter_rows(hl.is_defined(mt.DANN)).DANN.collect()
    expected = [
        hl.Struct(score=0.3618202027281013),
        hl.Struct(score=0.36516159615040267),
        hl.Struct(score=0.3678246364006052),
        hl.Struct(score=0.3697632743148331),
    ]

    # The original loop iterated over the undefined name `array1` (NameError).
    # Check the lengths first so missing or extra annotations are not hidden.
    self.assertEqual(len(actual), len(expected))
    for observed, wanted in zip(actual, expected):
        # Compare the float field: assertAlmostEqual needs to subtract its
        # arguments, which is not meaningful for whole Struct objects.
        self.assertAlmostEqual(observed.score, wanted.score)
def test_king_homo_estimator():
    """Check sums of squared genotype-score differences via inner_product."""
    hl.set_global_seed(1)
    mt = hl.balding_nichols_model(2, 5, 5)
    mt = mt.select_entries(genotype_score=hl.float(mt.GT.n_alt_alleles()))
    da = hl.experimental.dnd.array(mt, 'genotype_score', block_size=3)

    def square(value):
        return value * value

    def squared_difference(l, r):
        return square(l - r)

    def add(l, r):
        return l + r

    score_difference = da.T.inner_product(
        da, squared_difference, add, hl.float(0), hl.agg.sum
    ).checkpoint(new_temp_file())

    expected = np.array([[0., 6., 4., 2., 4.],
                         [6., 0., 6., 4., 6.],
                         [4., 6., 0., 6., 0.],
                         [2., 4., 6., 0., 6.],
                         [4., 6., 0., 6., 0.]])
    assert np.array_equal(score_difference.collect(), expected)
def test_write_from_entry_expr_overwrite(self):
    """write_from_entry_expr must refuse to clobber a path unless overwrite=True."""
    mt = hl.balding_nichols_model(1, 1, 1)
    mt = mt.select_entries(x=mt.GT.n_alt_alleles())
    reference = BlockMatrix.from_entry_expr(mt.x)

    with hl.TemporaryDirectory(ensure_exists=False) as path:
        BlockMatrix.write_from_entry_expr(mt.x, path)
        with self.assertRaises(FatalError):
            BlockMatrix.write_from_entry_expr(mt.x, path)

        BlockMatrix.write_from_entry_expr(mt.x, path, overwrite=True)
        self._assert_eq(BlockMatrix.read(path), reference)

    with hl.TemporaryDirectory(ensure_exists=False) as path:
        # non-field expressions currently take a separate code path
        BlockMatrix.write_from_entry_expr(mt.x + 1, path)
        with self.assertRaises(FatalError):
            BlockMatrix.write_from_entry_expr(mt.x + 1, path)

        BlockMatrix.write_from_entry_expr(mt.x + 2, path, overwrite=True)
        self._assert_eq(BlockMatrix.read(path), reference + 2)
def test_medium_matmul():
    """Check a distributed array times its transpose against NumPy matmul."""
    n_variants = 100
    n_samples = 100
    block_size = 32
    # presumably 4 x 4 blocks for a 100 x 100 product cut into blocks of 32
    n_blocks = 16

    simulated = hl.balding_nichols_model(n_populations=2,
                                         n_variants=n_variants,
                                         n_samples=n_samples)
    simulated = simulated.select_entries(
        dosage=hl.float(simulated.GT.n_alt_alleles()))

    dosages = hl.experimental.dnd.array(simulated, 'dosage', block_size=block_size)
    gram = (dosages @ dosages.T).checkpoint(new_temp_file())
    assert gram._force_count_blocks() == n_blocks
    observed = gram.collect().reshape((n_variants, n_variants))

    local = np.array(simulated.dosage.collect()).reshape((n_variants, n_samples))
    expected = local @ local.T

    assert np.array_equal(observed, expected)
def test_write_from_entry_expr_overwrite(self):
    """write_from_entry_expr must refuse to clobber a path unless overwrite=True."""
    mt = hl.balding_nichols_model(1, 1, 1)
    mt = mt.select_entries(x=mt.GT.n_alt_alleles())
    reference = BlockMatrix.from_entry_expr(mt.x)

    field_path = new_temp_file()
    BlockMatrix.write_from_entry_expr(mt.x, field_path)
    with self.assertRaises(FatalError):
        BlockMatrix.write_from_entry_expr(mt.x, field_path)

    BlockMatrix.write_from_entry_expr(mt.x, field_path, overwrite=True)
    self._assert_eq(BlockMatrix.read(field_path), reference)

    # non-field expressions currently take a separate code path
    expr_path = new_temp_file()
    BlockMatrix.write_from_entry_expr(mt.x + 1, expr_path)
    with self.assertRaises(FatalError):
        BlockMatrix.write_from_entry_expr(mt.x + 1, expr_path)

    BlockMatrix.write_from_entry_expr(mt.x + 2, expr_path, overwrite=True)
    self._assert_eq(BlockMatrix.read(expr_path), reference + 2)
def test_balding_nichols_model(self):
    """Check dimensions, partitioning and recorded globals of a simulated dataset."""
    from hail.stats import TruncatedBetaDist

    ds = hl.balding_nichols_model(
        2, 20, 25, 3,
        pop_dist=[1.0, 2.0],
        fst=[.02, .06],
        af_dist=TruncatedBetaDist(a=0.01, b=2.0, min=0.05, max=0.95),
        seed=1)

    self.assertEqual(ds.count_cols(), 20)
    self.assertEqual(ds.count_rows(), 25)
    self.assertEqual(ds.n_partitions(), 3)

    # The simulation parameters are expected to be recorded as globals.
    glob = ds.globals
    expected_globals = (
        ('n_populations', 2),
        ('n_samples', 20),
        ('n_variants', 25),
        ('pop_dist', [1, 2]),
        ('fst', [.02, .06]),
        ('seed', 1),
        ('ancestral_af_dist',
         hl.Struct(type='TruncatedBetaDist', a=0.01, b=2.0, min=0.05, max=0.95)),
    )
    for field_name, expected_value in expected_globals:
        self.assertEqual(getattr(glob, field_name).value, expected_value)
import hail as hl

# Arguments passed positionally to balding_nichols_model below.
S = 500
V = 2000

mt = hl.balding_nichols_model(1, S, V, 500)

# Per-column count of entries with a defined genotype.
is_called = hl.is_defined(mt.GT)
mt = mt.annotate_cols(n_called=hl.agg.filter(is_called, hl.agg.count()))

# NOTE: `mt` is rebound to the result of count() here, not to a matrix table.
mt = mt.filter_cols(mt.n_called > 0).count()
import json

import aiohttp_jinja2
import plotly
import plotly.express as px
from aiohttp import web

app = web.Application()
routes = web.RouteTableDef()

matrix_table_exists = hl.hadoop_exists('bn.mt')
if not matrix_table_exists:
    # Generate data for demonstration purposes; this should already exist.
    mt = hl.balding_nichols_model(
        5, 100, 10000,
        pop_dist=[0.1, 0.2, 0.3, 0.2, 0.2],
        fst=[.02, .06, .04, .12, .08],
        af_dist=hl.rand_beta(a=0.01, b=2.0, lower=0.05, upper=1.0),
        mixture=True)
    mt = hl.variant_qc(mt)
    mt.write('bn.mt', overwrite=True)

mt = hl.read_matrix_table('bn.mt')

scores_table_exists = hl.hadoop_exists('scores.t')
if not scores_table_exists:
    # Generate data for demonstration purposes; this should already exist.
    scores = hl.hwe_normalized_pca(mt.GT, k=5)[1]
    # Copy the column fields of `mt` onto the matching score rows.
    column_fields = mt.cols()[scores.sample_idx]
    scores = scores.annotate(**column_fields)
    scores.write('scores.t')
def king():
    """Compute KING on a simulated dataset and write the result to a temp file."""
    output_path = hl.utils.new_temp_file(extension='mt')
    simulated = hl.balding_nichols_model(6, n_variants=10000, n_samples=4096)
    kinship = hl.king(simulated.GT)
    kinship.write(output_path, overwrite=True)
#! /usr/bin/python
# Usage: <script> N_SAMPLES N_VARIANTS OUTPUT_PREFIX
# Simulates a dataset, replaces the genotypes with random calls, and exports
# the result as both VCF and PLINK under OUTPUT_PREFIX.

import sys

import hail as hl

n_samples = int(sys.argv[1])
n_variants = int(sys.argv[2])
path = sys.argv[3]

mt = hl.balding_nichols_model(1, n_samples, n_variants)
mt = mt.key_cols_by(s=hl.str(mt.sample_idx))

# Genotype index is 0 or 2, chosen with probability 0.5 each.
random_gt_index = hl.rand_bool(0.5) * 2
mt = mt.annotate_entries(GT=hl.unphased_diploid_gt_index_call(random_gt_index))

hl.export_vcf(mt, path + ".vcf")
hl.export_plink(mt, path)
n_sim = int(320e3) # over simulate to be able to have ascertainment else: raise ValueError(f'sim_name="{sim_name}" does not match any models') hl.set_global_seed(seed) gt_sim_suffix = f'bn.npops_{n_pops}.nvars_{n_vars}.nsim_{n_sim}' if sim_name[: 3] == 'bn_' else '' # suffix for genotype simulation (empty string if using ukb data) param_suffix = f'{gt_sim_suffix}.h2_{h2}.pi_{pi}.K_{K}.seed_{seed}' betas_path = f'{smiles_wd}/betas.{param_suffix}.tsv.gz' phens_path = f'{smiles_wd}/phens.{param_suffix}.tsv.gz' if sim_name[:3] == 'bn_': mt = hl.balding_nichols_model(n_populations=n_pops, n_samples=n_sim, n_variants=n_vars, fst=fst) mt = mt.filter_rows( (hl.abs(hl.agg.mean(mt.GT.n_alt_alleles()) / 2 - 0.5) < 0.5)) # remove invariant SNPs mt = mt.annotate_cols(s=hl.str(mt.sample_idx)) if hl.hadoop_is_file(betas_path) and hl.hadoop_is_file(phens_path): # betas = hl.import_table(betas_path, impute=True, force=True) # betas = betas.annotate(locus = hl.parse_locus(betas.locus), # alleles = betas.alleles.replace('\[\"','').replace('\"\]','').split('\",\"')) # betas = betas.key_by('locus','alleles') phens = hl.import_table(phens_path, key=['s'],
def _test_linear_mixed_model_low_rank(self):
    """Fit a low-rank LinearMixedModel on simulated data and cross-check it.

    The test simulates genotypes, builds a kinship matrix from the last
    ``n_orig_markers`` columns, draws a phenotype from a multivariate normal,
    and then checks that (1) the low-rank and full-rank eigendecompositions
    agree, (2) the fitted effect sizes of the first ``n_culprits``
    alternatives average close to 1, and (3) the Hail and NumPy
    per-alternative fits give nearly identical p-values.

    NOTE(review): the leading underscore presumably keeps this test from
    being collected by the test runner -- confirm whether that is intended.
    """
    seed = 0
    n_populations = 8
    fst = n_populations * [.9]
    n_samples = 500
    n_variants = 200
    n_orig_markers = 100
    n_culprits = 10
    n_covariates = 3
    sigma_sq = 1
    tau_sq = 1

    from numpy.random import RandomState
    prng = RandomState(seed)

    # Covariates: an intercept column of ones plus random normal columns.
    x = np.hstack((np.ones(shape=(n_samples, 1)),
                   prng.normal(size=(n_samples, n_covariates - 1))))

    mt = hl.balding_nichols_model(n_populations=n_populations,
                                  n_samples=n_samples,
                                  n_variants=n_variants,
                                  fst=fst,
                                  af_dist=hl.rand_unif(0.1, 0.9, seed=seed),
                                  seed=seed)

    pa_t_path = utils.new_temp_file(suffix='bm')
    a_t_path = utils.new_temp_file(suffix='bm')

    BlockMatrix.write_from_entry_expr(mt.GT.n_alt_alleles(), a_t_path)

    # Transpose of the written matrix (presumably samples x variants).
    a = BlockMatrix.read(a_t_path).T.to_numpy()
    # Only the last n_orig_markers columns are used to build the kinship.
    g = a[:, -n_orig_markers:]
    g_std = self._filter_and_standardize_cols(g)

    # Number of columns kept by _filter_and_standardize_cols.
    n_markers = g_std.shape[1]

    k = (g_std @ g_std.T) * n_samples / n_markers

    beta = np.arange(n_covariates)
    beta_stars = np.array([1] * n_culprits)

    # Phenotype: the mean combines the first n_culprits columns of `a`
    # (effect 1 each) with the covariates (effects `beta`); the covariance
    # is sigma_sq * k + tau_sq * I.
    y = prng.multivariate_normal(
        np.hstack((a[:, 0:n_culprits], x)) @ np.hstack((beta_stars, beta)),
        sigma_sq * k + tau_sq * np.eye(n_samples))

    # low rank computation of S, P
    l = g_std.T @ g_std
    sl, v = np.linalg.eigh(l)
    # Keep only eigenpairs whose eigenvalue exceeds 1e-10.
    n_eigenvectors = int(np.sum(sl > 1e-10))
    sl = sl[-n_eigenvectors:]
    v = v[:, -n_eigenvectors:]
    s = sl * (n_samples / n_markers)
    p = (g_std @ (v / np.sqrt(sl))).T

    # compare with full rank S, P
    sk0, uk = np.linalg.eigh(k)
    sk = sk0[-n_eigenvectors:]
    pk = uk[:, -n_eigenvectors:].T
    assert np.allclose(sk, s)
    # Eigenvectors are only defined up to sign, hence the abs().
    assert np.allclose(np.abs(pk), np.abs(p))

    # build and fit model
    py = p @ y
    px = p @ x
    pa = p @ a

    model = LinearMixedModel(py, px, s, y, x)
    assert model.n == n_samples
    assert model.f == n_covariates
    assert model.r == n_eigenvectors
    assert model.low_rank

    model.fit()

    # check effect sizes tend to be near 1 for the first n_culprits alternative models
    BlockMatrix.from_numpy(pa).T.write(pa_t_path, force_row_major=True)
    df_lmm = model.fit_alternatives(pa_t_path, a_t_path).to_pandas()
    assert 0.9 < np.mean(df_lmm['beta'][:n_culprits]) < 1.1

    # compare NumPy and Hail LMM per alternative
    df_numpy = model.fit_alternatives_numpy(pa, a).to_pandas()
    assert np.min(df_numpy['chi_sq']) > 0

    # Rows containing any NaN in either result.
    na_numpy = df_numpy.isna().any(axis=1)
    na_lmm = df_lmm.isna().any(axis=1)

    assert na_numpy.sum() <= 10
    assert na_lmm.sum() <= 10
    # The two implementations may disagree on at most 5 NaN rows.
    assert np.logical_xor(na_numpy, na_lmm).sum() <= 5

    # Compare p-values only where both results are NaN-free.
    mask = ~(na_numpy | na_lmm)

    lmm_vs_numpy_p_value = np.sort(np.abs(df_lmm['p_value'][mask] - df_numpy['p_value'][mask]))

    assert lmm_vs_numpy_p_value[10] < 1e-12  # the 11 smallest p-value differences (indices 0..10)
    assert lmm_vs_numpy_p_value[-1] < 1e-8  # all p-values
def test_locus_windows(self):
    """Check hl.linalg.utils.locus_windows results and its error reporting.

    Covers windows by position and by a custom coordinate expression on both
    a matrix table and a table, followed by the error cases: unsorted input,
    mismatched or missing sources, non-row-indexed expressions, and missing
    locus or coordinate values.
    """
    def assert_eq(a, b):
        self.assertTrue(np.array_equal(a, np.array(b)))

    def assert_raises_with(exc_type, fragment, call):
        # Run `call`, require `exc_type`, and require `fragment` in its message.
        # assertIn shows the actual message on failure, which
        # assertTrue(fragment in message) does not.
        with self.assertRaises(exc_type) as caught:
            call()
        self.assertIn(fragment, str(caught.exception))

    centimorgans = hl.literal([0.1, 1.0, 1.0, 1.5, 1.9])

    mt = hl.balding_nichols_model(1, 5, 5).add_row_index()
    mt = mt.annotate_rows(cm=centimorgans[hl.int32(mt.row_idx)]).cache()

    starts, stops = hl.linalg.utils.locus_windows(mt.locus, 2)
    assert_eq(starts, [0, 0, 0, 1, 2])
    assert_eq(stops, [3, 4, 5, 5, 5])

    starts, stops = hl.linalg.utils.locus_windows(mt.locus, 0.5, coord_expr=mt.cm)
    assert_eq(starts, [0, 1, 1, 1, 3])
    assert_eq(stops, [1, 4, 4, 5, 5])

    # Doubling both the coordinates and the radius must give the same windows.
    starts, stops = hl.linalg.utils.locus_windows(
        mt.locus, 1.0, coord_expr=2 * centimorgans[hl.int32(mt.row_idx)])
    assert_eq(starts, [0, 1, 1, 1, 3])
    assert_eq(stops, [1, 4, 4, 5, 5])

    rows = [{'locus': hl.Locus('1', 1), 'cm': 1.0},
            {'locus': hl.Locus('1', 2), 'cm': 3.0},
            {'locus': hl.Locus('1', 4), 'cm': 4.0},
            {'locus': hl.Locus('2', 1), 'cm': 2.0},
            {'locus': hl.Locus('2', 1), 'cm': 2.0},
            {'locus': hl.Locus('3', 3), 'cm': 5.0}]

    ht = hl.Table.parallelize(rows,
                              hl.tstruct(locus=hl.tlocus('GRCh37'), cm=hl.tfloat64),
                              key=['locus'])

    starts, stops = hl.linalg.utils.locus_windows(ht.locus, 1)
    assert_eq(starts, [0, 0, 2, 3, 3, 5])
    assert_eq(stops, [2, 2, 3, 5, 5, 6])

    starts, stops = hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=ht.cm)
    assert_eq(starts, [0, 1, 1, 3, 3, 5])
    assert_eq(stops, [1, 3, 3, 5, 5, 6])

    assert_raises_with(
        ValueError, 'ascending order',
        lambda: hl.linalg.utils.locus_windows(ht.order_by(ht.cm).locus, 1.0))

    assert_raises_with(
        ExpressionException, 'different source',
        lambda: hl.linalg.utils.locus_windows(
            ht.locus, 1.0, coord_expr=hl.utils.range_table(1).idx))

    assert_raises_with(
        ExpressionException, "no source",
        lambda: hl.linalg.utils.locus_windows(hl.locus('1', 1), 1.0))

    assert_raises_with(
        ExpressionException, "no source",
        lambda: hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=0.0))

    ht = ht.annotate_globals(x=hl.locus('1', 1), y=1.0)

    assert_raises_with(
        ExpressionException, "row-indexed",
        lambda: hl.linalg.utils.locus_windows(ht.x, 1.0))

    assert_raises_with(
        ExpressionException, "row-indexed",
        lambda: hl.linalg.utils.locus_windows(ht.locus, 1.0, ht.y))

    ht = hl.Table.parallelize([{'locus': hl.null(hl.tlocus()), 'cm': 1.0}],
                              hl.tstruct(locus=hl.tlocus('GRCh37'), cm=hl.tfloat64),
                              key=['locus'])

    assert_raises_with(
        ValueError, "missing value for 'locus_expr'",
        lambda: hl.linalg.utils.locus_windows(ht.locus, 1.0))

    assert_raises_with(
        ValueError, "missing value for 'locus_expr'",
        lambda: hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=ht.cm))

    ht = hl.Table.parallelize([{'locus': hl.Locus('1', 1), 'cm': hl.null(hl.tfloat64)}],
                              hl.tstruct(locus=hl.tlocus('GRCh37'), cm=hl.tfloat64),
                              key=['locus'])

    assert_raises_with(
        ValueError, "missing value for 'coord_expr'",
        lambda: hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=ht.cm))
def generate_datasets(doctest_namespace):
    """Populate ``doctest_namespace`` with the objects used by doctests.

    Imports example files from ``data/``, annotates them with fixed and
    random fields, checkpoints some results under ``output/``, and stores
    the resulting tables, matrix tables and expressions in
    ``doctest_namespace`` under the names the doctests refer to.

    Parameters
    ----------
    doctest_namespace
        Mapping that is filled in place (only item assignment is used).
    """
    doctest_namespace['hl'] = hl
    doctest_namespace['np'] = np

    # Main example dataset: a 3% row sample of the VCF with added annotations.
    ds = hl.import_vcf('data/sample.vcf.bgz')
    ds = ds.sample_rows(0.03)
    ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5),
                          panel_maf=0.1,
                          anno1=5,
                          anno2=0,
                          consequence="LOF",
                          gene="A",
                          score=5.0)
    ds = ds.annotate_rows(a_index=1)
    ds = hl.sample_qc(hl.variant_qc(ds))
    ds = ds.annotate_cols(is_case=True,
                          pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                          is_female=hl.rand_bool(0.5),
                                          age=hl.rand_norm(65, 10),
                                          height=hl.rand_norm(70, 10),
                                          blood_pressure=hl.rand_norm(120, 20),
                                          cohort_name="cohort1"),
                          cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                          cov1=hl.rand_norm(0, 1),
                          cov2=hl.rand_norm(0, 1),
                          cohort="SIGMA")
    ds = ds.annotate_globals(
        global_field_1=5,
        global_field_2=10,
        pli={
            'SCN1A': 0.999,
            'SONIC': 0.014
        },
        populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
    # Same row field name as the earlier gene="A", now with an array value.
    ds = ds.annotate_rows(gene=['TTN'])
    ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
    # NOTE(review): the f-prefix is unnecessary here (no placeholders).
    ds = ds.checkpoint(f'output/example.mt', overwrite=True)

    # The same dataset is exposed under several names.
    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(
        consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata
    doctest_namespace['cols_to_keep'] = s_metadata
    doctest_namespace['cols_to_remove'] = s_metadata
    doctest_namespace['rows_to_keep'] = v_metadata
    doctest_namespace['rows_to_remove'] = v_metadata

    small_mt = hl.balding_nichols_model(3, 4, 4)
    doctest_namespace['small_mt'] = small_mt.checkpoint('output/small.mt',
                                                        overwrite=True)

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    table4 = hl.import_table('data/kt_example4.tsv',
                             impute=True,
                             types={
                                 'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                 'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                 'E': hl.tstruct(A=hl.tint32, B=hl.tint32)
                             })
    doctest_namespace['table4'] = table4

    people_table = hl.import_table('data/explode_example.tsv',
                                   delimiter='\\s+',
                                   types={
                                       'Age': hl.tint32,
                                       'Children': hl.tarray(hl.tstr)
                                   },
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.null(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({
        'Alice': 43,
        'Bob': 33,
        'Charles': 44
    })
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval(
        "1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    doctest_namespace['interval2'] = hl.Interval(3, 6)
    doctest_namespace['nd'] = hl._nd.array([[1, 2], [3, 4]])

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv",
                                              impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen',
                          entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    # Burden dataset: VCF joined with per-sample values and gene intervals.
    burden_ds = hl.import_vcf('data/example_burden.vcf')
    burden_kt = hl.import_table('data/example_burden.tsv',
                                key='Sample',
                                impute=True)
    burden_ds = burden_ds.annotate_cols(burden=burden_kt[burden_ds.s])
    burden_ds = burden_ds.annotate_rows(
        weight=hl.float64(burden_ds.locus.position))
    burden_ds = hl.variant_qc(burden_ds)

    genekt = hl.import_locus_intervals('data/gene.interval_list')
    burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
    # NOTE(review): the f-prefix is unnecessary here (no placeholders).
    burden_ds = burden_ds.checkpoint(f'output/example_burden.vds',
                                     overwrite=True)
    doctest_namespace['burden_ds'] = burden_ds

    # LD score regression summary statistics, one phenotype per table.
    ld_score_one_pheno_sumstats = hl.import_table(
        'data/ld_score_regression.one_pheno.sumstats.tsv',
        types={
            'locus': hl.tlocus('GRCh37'),
            'alleles': hl.tarray(hl.tstr),
            'chi_squared': hl.tfloat64,
            'n': hl.tint32,
            'ld_score': hl.tfloat64,
            'phenotype': hl.tstr,
            'chi_squared_50_irnt': hl.tfloat64,
            'n_50_irnt': hl.tint32,
            'chi_squared_20160': hl.tfloat64,
            'n_20160': hl.tint32
        },
        key=['locus', 'alleles'])
    doctest_namespace[
        'ld_score_one_pheno_sumstats'] = ld_score_one_pheno_sumstats

    # All phenotypes as a matrix table; everything is read as strings first.
    mt = hl.import_matrix_table(
        'data/ld_score_regression.all_phenos.sumstats.tsv',
        row_fields={
            'locus': hl.tstr,
            'alleles': hl.tstr,
            'ld_score': hl.tfloat64
        },
        entry_type=hl.tstr)
    mt = mt.key_cols_by(phenotype=mt.col_id)
    mt = mt.key_rows_by(locus=hl.parse_locus(mt.locus),
                        alleles=mt.alleles.split(','))
    mt = mt.drop('row_id', 'col_id')
    # Each entry string is split on ',' into chi_squared (float) and n (int).
    mt = mt.annotate_entries(x=mt.x.split(","))
    mt = mt.transmute_entries(chi_squared=hl.float64(mt.x[0]),
                              n=hl.int32(mt.x[1]))
    mt = mt.annotate_rows(ld_score=hl.float64(mt.ld_score))
    doctest_namespace['ld_score_all_phenos_sumstats'] = mt

    print("finished setting up doctest...")
def test_linear_mixed_model_full_rank(self):
    """Fit a full-rank LinearMixedModel on simulated data and cross-check it.

    The test simulates genotypes, builds a kinship matrix from the last
    ``n_orig_markers`` columns, draws a phenotype from a multivariate normal,
    and then checks that (1) the fitted effect sizes of the first
    ``n_culprits`` alternatives average close to 1, and (2) the Hail and
    NumPy per-alternative fits give nearly identical p-values.
    """
    seed = 0
    n_populations = 8
    fst = n_populations * [.9]
    n_samples = 200
    n_variants = 500
    n_orig_markers = 500
    n_culprits = 20
    n_covariates = 3
    sigma_sq = 1
    tau_sq = 1

    from numpy.random import RandomState
    prng = RandomState(seed)

    # Covariates: an intercept column of ones plus random normal columns.
    x = np.hstack((np.ones(shape=(n_samples, 1)),
                   prng.normal(size=(n_samples, n_covariates - 1))))

    mt = hl.balding_nichols_model(n_populations=n_populations,
                                  n_samples=n_samples,
                                  n_variants=n_variants,
                                  fst=fst,
                                  seed=seed)

    pa_t_path = utils.new_temp_file(suffix='bm')

    # Transpose of the entry matrix (presumably samples x variants).
    a = BlockMatrix.from_entry_expr(mt.GT.n_alt_alleles()).T.to_numpy()
    # Last n_orig_markers columns (here equal to n_variants).
    g = a[:, -n_orig_markers:]
    g_std = self._filter_and_standardize_cols(g)

    # Number of columns kept by _filter_and_standardize_cols.
    n_markers = g_std.shape[1]

    k = (g_std @ g_std.T) * n_samples / n_markers

    beta = np.arange(n_covariates)
    beta_stars = np.array([1] * n_culprits)

    # Phenotype: the mean combines the first n_culprits columns of `a`
    # (effect 1 each) with the covariates (effects `beta`); the covariance
    # is sigma_sq * k + tau_sq * I.
    y = prng.multivariate_normal(
        np.hstack((a[:, 0:n_culprits], x)) @ np.hstack((beta_stars, beta)),
        sigma_sq * k + tau_sq * np.eye(n_samples))

    # Full eigendecomposition of the kinship; p holds the eigenvectors as rows.
    s, u = np.linalg.eigh(k)
    p = u.T

    # build and fit model
    py = p @ y
    px = p @ x
    pa = p @ a

    model = LinearMixedModel(py, px, s)
    assert model.n == n_samples
    assert model.f == n_covariates
    assert model.r == n_samples
    assert (not model.low_rank)

    model.fit()

    # check effect sizes tend to be near 1 for the first n_culprits alternative models
    BlockMatrix.from_numpy(pa).T.write(pa_t_path, force_row_major=True)
    df_lmm = model.fit_alternatives(pa_t_path).to_pandas()
    assert 0.9 < np.mean(df_lmm['beta'][:n_culprits]) < 1.1

    # compare NumPy and Hail LMM per alternative
    df_numpy = model.fit_alternatives_numpy(pa, a).to_pandas()

    # Rows containing any NaN in either result.
    na_numpy = df_numpy.isna().any(axis=1)
    na_lmm = df_lmm.isna().any(axis=1)

    assert na_numpy.sum() <= 20
    assert na_lmm.sum() <= 20
    # The two implementations may disagree on at most 10 NaN rows.
    assert np.logical_xor(na_numpy, na_lmm).sum() <= 10

    # Compare p-values only where both results are NaN-free.
    mask = ~(na_numpy | na_lmm)

    lmm_vs_numpy_p_value = np.sort(
        np.abs(df_lmm['p_value'][mask] - df_numpy['p_value'][mask]))

    assert lmm_vs_numpy_p_value[10] < 1e-12  # the 11 smallest p-value differences (indices 0..10)
    assert lmm_vs_numpy_p_value[-1] < 1e-8  # all p-values
def test_locus_windows(self):
    """Check hl.linalg.utils.locus_windows results and its error reporting.

    Covers windows by position and by a custom coordinate expression on both
    a matrix table and a table, followed by the error cases: unsorted input,
    mismatched or missing sources, non-row-indexed expressions, and missing
    locus or coordinate values.
    """
    def assert_eq(a, b):
        self.assertTrue(np.array_equal(a, np.array(b)))

    def assert_raises_with(exc_type, fragment, call):
        # Run `call`, require `exc_type`, and require `fragment` in its message.
        # assertIn shows the actual message on failure, which
        # assertTrue(fragment in message) does not.
        with self.assertRaises(exc_type) as caught:
            call()
        self.assertIn(fragment, str(caught.exception))

    centimorgans = hl.literal([0.1, 1.0, 1.0, 1.5, 1.9])

    mt = hl.balding_nichols_model(1, 5, 5).add_row_index()
    mt = mt.annotate_rows(cm=centimorgans[hl.int32(mt.row_idx)]).cache()

    starts, stops = hl.linalg.utils.locus_windows(mt.locus, 2)
    assert_eq(starts, [0, 0, 0, 1, 2])
    assert_eq(stops, [3, 4, 5, 5, 5])

    starts, stops = hl.linalg.utils.locus_windows(mt.locus,
                                                  0.5,
                                                  coord_expr=mt.cm)
    assert_eq(starts, [0, 1, 1, 1, 3])
    assert_eq(stops, [1, 4, 4, 5, 5])

    # Doubling both the coordinates and the radius must give the same windows.
    starts, stops = hl.linalg.utils.locus_windows(
        mt.locus, 1.0, coord_expr=2 * centimorgans[hl.int32(mt.row_idx)])
    assert_eq(starts, [0, 1, 1, 1, 3])
    assert_eq(stops, [1, 4, 4, 5, 5])

    rows = [{
        'locus': hl.Locus('1', 1),
        'cm': 1.0
    }, {
        'locus': hl.Locus('1', 2),
        'cm': 3.0
    }, {
        'locus': hl.Locus('1', 4),
        'cm': 4.0
    }, {
        'locus': hl.Locus('2', 1),
        'cm': 2.0
    }, {
        'locus': hl.Locus('2', 1),
        'cm': 2.0
    }, {
        'locus': hl.Locus('3', 3),
        'cm': 5.0
    }]

    ht = hl.Table.parallelize(rows,
                              hl.tstruct(locus=hl.tlocus('GRCh37'),
                                         cm=hl.tfloat64),
                              key=['locus'])

    starts, stops = hl.linalg.utils.locus_windows(ht.locus, 1)
    assert_eq(starts, [0, 0, 2, 3, 3, 5])
    assert_eq(stops, [2, 2, 3, 5, 5, 6])

    starts, stops = hl.linalg.utils.locus_windows(ht.locus,
                                                  1.0,
                                                  coord_expr=ht.cm)
    assert_eq(starts, [0, 1, 1, 3, 3, 5])
    assert_eq(stops, [1, 3, 3, 5, 5, 6])

    assert_raises_with(
        ValueError, 'ascending order',
        lambda: hl.linalg.utils.locus_windows(ht.order_by(ht.cm).locus, 1.0))

    assert_raises_with(
        ExpressionException, 'different source',
        lambda: hl.linalg.utils.locus_windows(
            ht.locus, 1.0, coord_expr=hl.utils.range_table(1).idx))

    assert_raises_with(
        ExpressionException, "no source",
        lambda: hl.linalg.utils.locus_windows(hl.locus('1', 1), 1.0))

    assert_raises_with(
        ExpressionException, "no source",
        lambda: hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=0.0))

    ht = ht.annotate_globals(x=hl.locus('1', 1), y=1.0)

    assert_raises_with(
        ExpressionException, "row-indexed",
        lambda: hl.linalg.utils.locus_windows(ht.x, 1.0))

    assert_raises_with(
        ExpressionException, "row-indexed",
        lambda: hl.linalg.utils.locus_windows(ht.locus, 1.0, ht.y))

    ht = hl.Table.parallelize([{
        'locus': hl.null(hl.tlocus()),
        'cm': 1.0
    }],
                              hl.tstruct(locus=hl.tlocus('GRCh37'),
                                         cm=hl.tfloat64),
                              key=['locus'])

    assert_raises_with(
        ValueError, "missing value for 'locus_expr'",
        lambda: hl.linalg.utils.locus_windows(ht.locus, 1.0))

    assert_raises_with(
        ValueError, "missing value for 'locus_expr'",
        lambda: hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=ht.cm))

    ht = hl.Table.parallelize([{
        'locus': hl.Locus('1', 1),
        'cm': hl.null(hl.tfloat64)
    }],
                              hl.tstruct(locus=hl.tlocus('GRCh37'),
                                         cm=hl.tfloat64),
                              key=['locus'])

    assert_raises_with(
        ValueError, "missing value for 'coord_expr'",
        lambda: hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=ht.cm))
import hail as hl

# Fix the global seed before any random expression is built.
hl.set_global_seed(0)

mt = hl.balding_nichols_model(n_populations=3,
                              n_variants=(1 << 10),
                              n_samples=4)

# String sample IDs of the form 's<sample_idx>'.
sample_id = 's' + hl.str(mt.sample_idx)
mt = mt.key_cols_by(s=sample_id)

# Keep each genotype with probability 0.99; otherwise set it to missing.
mt = mt.annotate_entries(GT=hl.or_missing(hl.rand_bool(0.99), mt.GT))

hl.export_plink(mt,
                'balding-nichols-1024-variants-4-samples-3-populations',
                fam_id='f' + mt.s)
import hail as hl

mt = hl.balding_nichols_model(3, 100, 100)

# Run SKAT on the simulated data with a constant covariate, then print it.
alt_allele_count = mt.GT.n_alt_alleles()
t = hl.skat(mt.locus, mt.ancestral_af, mt.pop, alt_allele_count, covariates=[1])
t.show()
import hail as hl

mt = hl.balding_nichols_model(3, 100, 100)

# Per row: the mean alt-allele count and the full list of counts.
annotated = mt.annotate_rows(
    mean=hl.agg.mean(hl.float(mt.GT.n_alt_alleles())),
    genotypes=hl.agg.collect(hl.float(mt.GT.n_alt_alleles())))
gts_as_rows = annotated.rows()

# Group rows into blocks of 10 positions and collect their data per block.
grouped = gts_as_rows.group_by(ld_block=gts_as_rows.locus.position // 10)
groups = grouped.aggregate(
    genotypes=hl.agg.collect(gts_as_rows.genotypes),
    ys=hl.agg.collect(gts_as_rows.mean))

df = groups.to_spark()

from pyspark.sql.functions import udf


def get_intercept(X, y):
    """Fit a Lasso model (alpha=0.1) to X and y and return its intercept as a float."""
    # Imported here so the import happens wherever the function actually runs.
    from sklearn import linear_model

    model = linear_model.Lasso(alpha=0.1)
    model.fit(X, y)
    return float(model.intercept_)


get_intercept_udf = udf(get_intercept)
intercepts = get_intercept_udf("genotypes", "ys").alias("intercept")
df.select(intercepts).show()