Ejemplo n.º 1
0
def test_dndarray_sum():
    """Verify that adding two distributed arrays matches the NumPy sum.

    Two separately simulated datasets are converted into block-distributed
    dosage arrays, added, checkpointed and collected. The collected result is
    compared with the sum of the same dosages computed locally with NumPy.
    """
    n_variants = 10
    n_samples = 10
    block_size = 3
    n_blocks = 16  # number of blocks the checkpointed sum is expected to hold

    def simulate_dosages():
        # Simulate genotypes and keep a single float entry field, `dosage`,
        # holding the number of alternate alleles.
        simulated = hl.balding_nichols_model(n_populations=2,
                                             n_variants=n_variants,
                                             n_samples=n_samples)
        return simulated.select_entries(
            dosage=hl.float(simulated.GT.n_alt_alleles()))

    first = simulate_dosages()
    second = simulate_dosages()

    lhs = hl.experimental.dnd.array(first, 'dosage', block_size=block_size)
    rhs = hl.experimental.dnd.array(second, 'dosage', block_size=block_size)
    summed = (lhs + rhs).checkpoint(new_temp_file())
    assert summed._force_count_blocks() == n_blocks
    observed = summed.collect()

    # Local reference: reshape the flat collected dosages and add them.
    shape = (n_variants, n_samples)
    expected = (np.array(first.dosage.collect()).reshape(shape)
                + np.array(second.dosage.collect()).reshape(shape))

    assert np.array_equal(observed, expected)
Ejemplo n.º 2
0
 def test_pc_project(self):
     """Project a second simulated dataset onto PCA loadings and check the result size.

     Loadings are computed on one simulated dataset and annotated with the
     per-row allele frequency of that dataset. A second simulated dataset is
     then projected with ``hl.experimental.pc_project``.
     """
     reference_mt = hl.balding_nichols_model(3, 100, 50)
     _eigenvalues, _scores, loadings = hl.hwe_normalized_pca(
         reference_mt.GT, k=10, compute_loadings=True)

     # Allele frequency per row: mean alt-allele count divided by two.
     reference_mt = reference_mt.annotate_rows(
         af=hl.agg.mean(reference_mt.GT.n_alt_alleles()) / 2)
     row_af = reference_mt.rows()[loadings.key].af
     loadings = loadings.annotate(af=row_af)

     target_mt = hl.balding_nichols_model(3, 100, 50)
     projected = hl.experimental.pc_project(
         target_mt.GT, loadings.loadings, loadings.af)
     assert projected._force_count() == 100
Ejemplo n.º 3
0
    def test_pcrelate(self):
        """Smoke-test ``hl.pc_relate`` on a simulated dataset.

        The sample index is converted to a string column, ``pc_relate`` is run
        with the ``"phi"`` statistics setting, and the result is checked to be
        a ``hl.Table`` that can be counted.
        """
        dataset = hl.balding_nichols_model(3, 100, 100)
        dataset = dataset.annotate_cols(sample_idx=hl.str(dataset.sample_idx))
        t = hl.pc_relate(dataset, 2, 0.05, block_size=64, statistics="phi")

        # assertIsInstance reports the actual type on failure, unlike
        # assertTrue(isinstance(...)) which only says "False is not true".
        self.assertIsInstance(t, hl.Table)
        # The count itself is not asserted; calling it only checks that the
        # table can be evaluated.
        t.count()
Ejemplo n.º 4
0
def test_self_kinship():
    """Check the effect of ``include_self_kinship`` on ``hl.pc_relate`` output.

    With self-kinship the result is expected to hold 55 rows (45 distinct
    pairs plus 10 self pairs); without it, only the 45 distinct pairs. The
    non-self rows of both results must agree.
    """
    mt = hl.balding_nichols_model(3, 10, 50)

    # Arguments shared by both pc_relate invocations.
    shared_kwargs = dict(k=2, statistics='kin20', block_size=16)
    with_self = hl.pc_relate(mt.GT, 0.10, include_self_kinship=True, **shared_kwargs)
    without_self = hl.pc_relate(mt.GT, 0.10, **shared_kwargs)

    assert with_self.count() == 55
    assert without_self.count() == 45

    # Rows pairing a sample with itself.
    self_pairs = with_self.filter(
        with_self.i.sample_idx == with_self.j.sample_idx)
    assert self_pairs.count() == 10, self_pairs.collect()

    # Rows pairing two different samples must match the run without self-kinship.
    distinct_pairs = with_self.filter(
        with_self.i.sample_idx != with_self.j.sample_idx)
    assert distinct_pairs.count() == 45, distinct_pairs.collect()
    assert distinct_pairs._same(without_self)

    # The run without self-kinship must not contain any self pair.
    unexpected_self_pairs = without_self.filter(
        without_self.i.sample_idx == without_self.j.sample_idx)
    assert unexpected_self_pairs.count() == 0, unexpected_self_pairs.collect()
Ejemplo n.º 5
0
def test_pcrelate_paths():
    """Exercise several ``hl.pc_relate`` argument combinations on one dataset.

    Covers computing scores internally via ``k`` versus passing them through
    ``scores_expr``, and the ``min_kinship`` filter with different
    ``statistics`` and ``block_size`` settings.
    """
    n_samples = 50
    mt = hl.balding_nichols_model(3, n_samples, 100)
    _, scores3, _ = hl.hwe_normalized_pca(mt.GT, k=3, compute_loadings=False)

    kin_from_k = hl.pc_relate(mt.GT, 0.10, k=2, statistics='kin', block_size=64)

    kin_min_001 = hl.pc_relate(
        mt.GT, 0.05, k=2, min_kinship=0.01, statistics='kin2', block_size=128)
    kin_min_001 = kin_min_001.cache()

    kin_min_01 = hl.pc_relate(
        mt.GT, 0.02, k=3, min_kinship=0.1, statistics='kin20', block_size=64)
    kin_min_01 = kin_min_01.cache()

    # Pass the first two of the three precomputed scores explicitly.
    first_two_scores = scores3[mt.col_key].scores[:2]
    kin_from_scores = hl.pc_relate(
        mt.GT, 0.10, scores_expr=first_two_scores, statistics='kin', block_size=32)

    assert kin_from_k._same(kin_from_scores, tolerance=1e-4)

    # One row per unordered pair of distinct samples.
    assert kin_from_k.count() == n_samples * (n_samples - 1) / 2

    assert kin_min_001.count() > 0
    assert kin_min_001.filter(kin_min_001.kin < 0.01).count() == 0

    assert kin_min_01.count() > 0
    assert kin_min_01.filter(kin_min_01.kin < 0.1).count() == 0
Ejemplo n.º 6
0
def test_pcrelate_issue_5263():
    """Check that ``hl.pc_relate`` uses the entry expression it is given.

    The original genotypes are moved to ``GT2`` and ``GT`` is replaced by
    random calls; running on ``GT2`` must reproduce the result obtained from
    the original ``GT``.
    """
    mt = hl.balding_nichols_model(3, 50, 100)
    expected = hl.pc_relate(mt.GT, 0.10, k=2, statistics='all')

    random_call = hl.call(hl.rand_bool(0.5), hl.rand_bool(0.5))
    mt = mt.select_entries(GT2=mt.GT, GT=random_call)

    actual = hl.pc_relate(mt.GT2, 0.10, k=2, statistics='all')
    assert expected._same(actual, tolerance=1e-4)
Ejemplo n.º 7
0
def test_manhattan_plot():
    """Check the x-axis tick labels of a plot with locus on the x axis."""
    mt = hl.balding_nichols_model(3, 10, 100)
    ht = mt.rows().annotate(pval=.02)

    # Build the figure layer by layer: points, then a horizontal line at -log10(5e-8).
    fig = ggplot(ht, aes(x=ht.locus, y=-hl.log10(ht.pval)))
    fig = fig + geom_point()
    fig = fig + geom_hline(yintercept=-math.log10(5e-8))
    pfig = fig.to_plotly()

    # Tick labels '1' through '22' followed by 'X' and 'Y'.
    expected_ticks = tuple(str(contig) for contig in range(1, 23)) + ('X', 'Y')
    assert pfig.layout.xaxis.ticktext == expected_ticks
Ejemplo n.º 8
0
def pc_relate_big():
    """Run ``hl.pc_relate`` on a large simulated dataset with random scores.

    The simulated data is checkpointed to a temporary file, each column gets
    two uniform random scores, and the resulting table is force-counted.
    """
    side = 2 * 4096
    mt = hl.balding_nichols_model(3, side, side)
    mt = mt.checkpoint(hl.utils.new_temp_file(extension='mt'))

    random_scores = hl.range(2).map(lambda _: hl.rand_unif(0, 1))
    mt = mt.annotate_cols(scores=random_scores)

    rel = hl.pc_relate(mt.GT,
                       0.05,
                       scores_expr=mt.scores,
                       statistics='kin',
                       min_kinship=0.05)
    rel._force_count()
Ejemplo n.º 9
0
def test_hwe_normalized_pca():
    """Check the values returned by ``hl.hwe_normalized_pca``.

    With ``compute_loadings=True`` the call must return ``k`` eigenvalues, a
    scores table and a loadings table; with ``compute_loadings=False`` the
    loadings must be ``None``.
    """
    mt = hl.balding_nichols_model(3, 100, 50)
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(mt.GT, k=2, compute_loadings=True)

    assert len(eigenvalues) == 2
    assert isinstance(scores, hl.Table)
    # The comparison must be asserted; as a bare expression it checks nothing.
    assert scores.count() == 100
    assert isinstance(loadings, hl.Table)

    _, _, loadings = hl.hwe_normalized_pca(mt.GT, k=2, compute_loadings=False)
    assert loadings is None
Ejemplo n.º 10
0
    def test_rrm(self):
        """Compare ``hl.realized_relationship_matrix`` with a direct NumPy computation.

        The reference computation drops constant rows, standardizes each
        remaining row by its mean and standard deviation, and forms
        ``mat.T @ mat / nvariants``. The Hail result is exported to TSV and
        read back before being compared with ``np.allclose``.
        """
        seed = 0
        n1 = 100
        m1 = 200
        k = 3
        fst = .9

        dataset = hl.balding_nichols_model(
            k, n1, m1, fst=(k * [fst]), seed=seed, n_partitions=4)
        dataset = dataset.annotate_cols(s=hl.str(dataset.sample_idx)).key_cols_by('s')

        def direct_calculation(ds):
            ds = BlockMatrix.from_entry_expr(ds['GT'].n_alt_alleles()).to_numpy()

            def is_constant(row):
                # True when every value in the row lies within 0.01 of the
                # same candidate value c, for some c in {0, 1, 2}.
                return any(
                    all((gt < c + .01) and (gt > c - .01) for gt in row)
                    for c in range(3))

            def sum_nonneg(row):
                return sum(value for value in row if value >= 0)

            def sum_sq_nonneg(row):
                return sum(value ** 2 for value in row if value >= 0)

            # filter out constant rows
            ds = np.array([row for row in ds if not is_constant(row)])

            nvariants, nsamples = ds.shape

            mean = [sum_nonneg(row) / nsamples for row in ds]
            stddev = [sqrt(sum_sq_nonneg(row) / nsamples - mu ** 2)
                      for row, mu in zip(ds, mean)]

            # Standardize every row by its own mean and standard deviation.
            mat = np.array([[(g - mu) / sd for g in row]
                            for row, mu, sd in zip(ds, mean, stddev)])

            return (mat.T @ mat) / nvariants

        def hail_calculation(ds):
            rrm = hl.realized_relationship_matrix(ds['GT'])
            fn = utils.new_temp_file(suffix='.tsv')
            rrm.export_tsv(fn)

            with open(utils.uri_path(fn)) as f:
                # The first line of the export is read and discarded.
                f.readline()
                data = [[float(field) for field in line.strip().split()]
                        for line in f]

            return np.array(data)

        manual = direct_calculation(dataset)
        rrm = hail_calculation(dataset)

        self.assertTrue(np.allclose(manual, rrm))
Ejemplo n.º 11
0
    def test_pca(self):
        """Check the values returned by ``hl.pca`` with and without loadings.

        With ``compute_loadings=True`` the call must return ``k`` eigenvalues
        plus scores and loadings tables of 100 rows each; with
        ``compute_loadings=False`` the loadings must be ``None``.
        """
        dataset = hl.balding_nichols_model(3, 100, 100)
        eigenvalues, scores, loadings = hl.pca(dataset.GT.n_alt_alleles(), k=2, compute_loadings=True)

        self.assertEqual(len(eigenvalues), 2)
        # The dedicated assertions name the offending value when they fail.
        self.assertIsInstance(scores, hl.Table)
        self.assertEqual(scores.count(), 100)
        self.assertIsInstance(loadings, hl.Table)
        self.assertEqual(loadings.count(), 100)

        _, _, loadings = hl.pca(dataset.GT.n_alt_alleles(), k=2, compute_loadings=False)
        # Identity check against None, which does not depend on the object's __eq__.
        self.assertIsNone(loadings)
Ejemplo n.º 12
0
def test_medium_collect():
    """Check that collecting a distributed dosage array matches the local NumPy array."""
    n_variants = 100
    n_samples = 100
    block_size = 32

    simulated = hl.balding_nichols_model(n_populations=2,
                                         n_variants=n_variants,
                                         n_samples=n_samples)
    simulated = simulated.select_entries(
        dosage=hl.float(simulated.GT.n_alt_alleles()))

    distributed = hl.experimental.dnd.array(simulated, 'dosage', block_size=block_size)

    # Local reference built from the flat list of collected dosages.
    flat_dosages = simulated.dosage.collect()
    local = np.array(flat_dosages).reshape((n_variants, n_samples))

    assert np.array_equal(distributed.collect(), local)
Ejemplo n.º 13
0
 def test(self):
     """Call ``show()`` on a range of matrix-table and table expressions.

     Nothing is asserted; the test only checks that each call completes.
     """
     mt = hl.balding_nichols_model(3, 10, 10)
     rows_ht = mt.rows()

     # Fields accessed directly on the matrix table.
     mt.GT.show()
     mt.locus.show()
     mt.af.show()
     mt.pop.show()
     mt.sample_idx.show()
     mt.bn.show()
     mt.bn.fst.show()

     # Expressions derived from matrix-table fields.
     mt.GT.n_alt_alleles().show()
     (mt.GT.n_alt_alleles() * mt.GT.n_alt_alleles()).show()
     (mt.af * mt.GT.n_alt_alleles()).show()

     # The same kind of access on the rows table.
     rows_ht.af.show()
     (rows_ht.af * 3).show()
Ejemplo n.º 14
0
    def test_DB(self):
        """Annotate simulated rows with the "DANN" dataset and check the defined scores.

        Rows whose ``DANN`` annotation is defined are collected and compared,
        in order, with a fixed list of expected scores.
        """
        mt = hl.balding_nichols_model(n_populations=3,
                                      n_samples=50,
                                      n_variants=10010)

        db = hl.experimental.DB()
        mt = db.annotate_rows_db(mt, "DANN")

        actual = mt.filter_rows(hl.is_defined(mt.DANN)).DANN.collect()
        expected = [
            hl.Struct(score=0.3618202027281013),
            hl.Struct(score=0.36516159615040267),
            hl.Struct(score=0.3678246364006052),
            hl.Struct(score=0.3697632743148331)
        ]
        # Compare lengths first so that zip() cannot silently ignore
        # missing or extra rows.
        self.assertEqual(len(actual), len(expected))
        for actual_row, expected_row in zip(actual, expected):
            # Compare the float scores: assertAlmostEqual subtracts its
            # operands when they are not exactly equal, which a struct
            # does not support.
            self.assertAlmostEqual(actual_row.score, expected_row.score)
Ejemplo n.º 15
0
def test_king_homo_estimator():
    """Check a sum-of-squared-differences inner product on a small distributed array.

    With the global seed fixed, the column-by-column squared genotype-score
    differences must equal a known 5x5 matrix.
    """
    hl.set_global_seed(1)
    mt = hl.balding_nichols_model(2, 5, 5)
    mt = mt.select_entries(genotype_score=hl.float(mt.GT.n_alt_alleles()))
    da = hl.experimental.dnd.array(mt, 'genotype_score', block_size=3)

    def squared_difference(l, r):
        # Element-wise (l - r)^2.
        delta = l - r
        return delta * delta

    def add(l, r):
        return l + r

    score_difference = da.T.inner_product(
        da, squared_difference, add, hl.float(0), hl.agg.sum)
    score_difference = score_difference.checkpoint(new_temp_file())

    observed = score_difference.collect()
    expected = np.array([
        [0., 6., 4., 2., 4.],
        [6., 0., 6., 4., 6.],
        [4., 6., 0., 6., 0.],
        [2., 4., 6., 0., 6.],
        [4., 6., 0., 6., 0.],
    ])
    assert np.array_equal(observed, expected)
Ejemplo n.º 16
0
    def test_write_from_entry_expr_overwrite(self):
        """Check ``overwrite`` handling of ``BlockMatrix.write_from_entry_expr``.

        For a plain entry field and for a derived expression, writing twice to
        the same path must raise ``FatalError`` unless ``overwrite=True`` is
        passed, after which the newly written matrix must be readable.
        """
        mt = hl.balding_nichols_model(1, 1, 1)
        mt = mt.select_entries(x=mt.GT.n_alt_alleles())
        reference_bm = BlockMatrix.from_entry_expr(mt.x)

        with hl.TemporaryDirectory(ensure_exists=False) as field_path:
            BlockMatrix.write_from_entry_expr(mt.x, field_path)
            with self.assertRaises(FatalError):
                BlockMatrix.write_from_entry_expr(mt.x, field_path)

            BlockMatrix.write_from_entry_expr(mt.x, field_path, overwrite=True)
            self._assert_eq(BlockMatrix.read(field_path), reference_bm)

        with hl.TemporaryDirectory(ensure_exists=False) as expr_path:
            # non-field expressions currently take a separate code path
            BlockMatrix.write_from_entry_expr(mt.x + 1, expr_path)
            with self.assertRaises(FatalError):
                BlockMatrix.write_from_entry_expr(mt.x + 1, expr_path)

            BlockMatrix.write_from_entry_expr(mt.x + 2, expr_path, overwrite=True)
            self._assert_eq(BlockMatrix.read(expr_path), reference_bm + 2)
Ejemplo n.º 17
0
def test_medium_matmul():
    """Check that a distributed ``A @ A.T`` matches the same product in NumPy."""
    n_variants = 100
    n_samples = 100
    block_size = 32
    n_blocks = 16  # number of blocks the checkpointed product is expected to hold

    simulated = hl.balding_nichols_model(n_populations=2,
                                         n_variants=n_variants,
                                         n_samples=n_samples)
    simulated = simulated.select_entries(
        dosage=hl.float(simulated.GT.n_alt_alleles()))

    dosages = hl.experimental.dnd.array(simulated, 'dosage', block_size=block_size)
    product = (dosages @ dosages.T).checkpoint(new_temp_file())
    assert product._force_count_blocks() == n_blocks
    observed = product.collect().reshape((n_variants, n_variants))

    # Local reference product from the collected dosages.
    local = np.array(simulated.dosage.collect()).reshape((n_variants, n_samples))
    expected = local @ local.T

    assert np.array_equal(observed, expected)
Ejemplo n.º 18
0
    def test_write_from_entry_expr_overwrite(self):
        """Check ``overwrite`` handling of ``BlockMatrix.write_from_entry_expr``.

        For a plain entry field and for a derived expression, writing twice to
        the same temporary path must raise ``FatalError`` unless
        ``overwrite=True`` is passed, after which the newly written matrix must
        be readable.
        """
        mt = hl.balding_nichols_model(1, 1, 1)
        mt = mt.select_entries(x=mt.GT.n_alt_alleles())
        reference_bm = BlockMatrix.from_entry_expr(mt.x)

        field_path = new_temp_file()
        BlockMatrix.write_from_entry_expr(mt.x, field_path)
        with self.assertRaises(FatalError):
            BlockMatrix.write_from_entry_expr(mt.x, field_path)

        BlockMatrix.write_from_entry_expr(mt.x, field_path, overwrite=True)
        self._assert_eq(BlockMatrix.read(field_path), reference_bm)

        # non-field expressions currently take a separate code path
        expr_path = new_temp_file()
        BlockMatrix.write_from_entry_expr(mt.x + 1, expr_path)
        with self.assertRaises(FatalError):
            BlockMatrix.write_from_entry_expr(mt.x + 1, expr_path)

        BlockMatrix.write_from_entry_expr(mt.x + 2, expr_path, overwrite=True)
        self._assert_eq(BlockMatrix.read(expr_path), reference_bm + 2)
Ejemplo n.º 19
0
    def test_balding_nichols_model(self):
        """Check the dimensions, partitioning and globals of a simulated dataset."""
        from hail.stats import TruncatedBetaDist

        af_dist = TruncatedBetaDist(a=0.01, b=2.0, min=0.05, max=0.95)
        ds = hl.balding_nichols_model(2, 20, 25, 3,
                                      pop_dist=[1.0, 2.0],
                                      fst=[.02, .06],
                                      af_dist=af_dist,
                                      seed=1)

        self.assertEqual(ds.count_cols(), 20)
        self.assertEqual(ds.count_rows(), 25)
        self.assertEqual(ds.n_partitions(), 3)

        # Each global field must echo the parameter the model was built with.
        glob = ds.globals
        expected_globals = [
            ('n_populations', 2),
            ('n_samples', 20),
            ('n_variants', 25),
            ('pop_dist', [1, 2]),
            ('fst', [.02, .06]),
            ('seed', 1),
            ('ancestral_af_dist',
             hl.Struct(type='TruncatedBetaDist', a=0.01, b=2.0, min=0.05, max=0.95)),
        ]
        for field_name, expected_value in expected_globals:
            self.assertEqual(getattr(glob, field_name).value, expected_value)
Ejemplo n.º 20
0
import hail as hl
# Sizes passed to the simulation below.
S = 500
V = 2000
# S and V are passed as the second and third positional arguments; the
# trailing 500 is presumably the partition count — TODO confirm against the
# balding_nichols_model signature.
mt = hl.balding_nichols_model(1, S, V, 500)
# Per column, count the entries whose GT is defined.
mt = mt.annotate_cols(
    n_called=hl.agg.filter(hl.is_defined(mt.GT), hl.agg.count()))
# NOTE(review): `mt` is rebound to the result of count() here, so after this
# line the name no longer refers to the filtered dataset.
mt = mt.filter_cols(mt.n_called > 0).count()
Ejemplo n.º 21
0
import plotly
import plotly.express as px
import json
from aiohttp import web
import aiohttp_jinja2

# aiohttp application object and a route table definition.
# NOTE(review): `hl` is used below but no hail import is visible in this
# snippet — confirm it is imported elsewhere.
app = web.Application()
routes = web.RouteTableDef()

if not hl.hadoop_exists('bn.mt'):
    # Generate data for demonstration purposes, this should already exist
    mt = hl.balding_nichols_model(5,
                                  100,
                                  10000,
                                  pop_dist=[0.1, 0.2, 0.3, 0.2, 0.2],
                                  fst=[.02, .06, .04, .12, .08],
                                  af_dist=hl.rand_beta(a=0.01,
                                                       b=2.0,
                                                       lower=0.05,
                                                       upper=1.0),
                                  mixture=True)
    mt = hl.variant_qc(mt)
    mt.write('bn.mt', overwrite=True)

# Always work from the copy stored at 'bn.mt', whether or not it was just written.
mt = hl.read_matrix_table('bn.mt')

if not hl.hadoop_exists('scores.t'):
    # Generate data for demonstration purposes, this should already exist
    # Element [1] of the PCA result is used as the scores table.
    scores = hl.hwe_normalized_pca(mt.GT, k=5)[1]
    # Join the column fields of `mt` onto the scores, looked up by sample_idx.
    scores = scores.annotate(**mt.cols()[scores.sample_idx])
    scores.write('scores.t')
Ejemplo n.º 22
0
def king():
    """Run ``hl.king`` on simulated genotypes and write the result to a temporary path."""
    simulated = hl.balding_nichols_model(6, n_variants=10000, n_samples=4096)
    out_path = hl.utils.new_temp_file(extension='mt')
    kinship = hl.king(simulated.GT)
    kinship.write(out_path, overwrite=True)
Ejemplo n.º 23
0
#! /usr/bin/python

import sys
import hail as hl

# Command-line arguments: sample count, variant count and output path prefix.
n_samples = int(sys.argv[1])
n_variants = int(sys.argv[2])
path = sys.argv[3]

# Simulate a single-population dataset of the requested size.
mt = hl.balding_nichols_model(1, n_samples, n_variants)
# Key columns by a string version of the sample index, named `s`.
mt = mt.key_cols_by(s = hl.str(mt.sample_idx))
# Replace GT with a call built from a random genotype index (0 or 2).
# NOTE(review): this relies on a boolean expression being multipliable by an
# integer — confirm.
mt = mt.annotate_entries(GT = hl.unphased_diploid_gt_index_call(hl.rand_bool(0.5) * 2))
# Export the same data as VCF (with a ".vcf" suffix) and as PLINK files.
hl.export_vcf(mt, path + ".vcf")
hl.export_plink(mt, path)
Ejemplo n.º 24
0
        n_sim = int(320e3)  # over simulate to be able to have ascertainment
    else:
        raise ValueError(f'sim_name="{sim_name}" does not match any models')

    hl.set_global_seed(seed)

    gt_sim_suffix = f'bn.npops_{n_pops}.nvars_{n_vars}.nsim_{n_sim}' if sim_name[:
                                                                                 3] == 'bn_' else ''  # suffix for genotype simulation (empty string if using ukb data)
    param_suffix = f'{gt_sim_suffix}.h2_{h2}.pi_{pi}.K_{K}.seed_{seed}'
    betas_path = f'{smiles_wd}/betas.{param_suffix}.tsv.gz'
    phens_path = f'{smiles_wd}/phens.{param_suffix}.tsv.gz'

    if sim_name[:3] == 'bn_':
        mt = hl.balding_nichols_model(n_populations=n_pops,
                                      n_samples=n_sim,
                                      n_variants=n_vars,
                                      fst=fst)

        mt = mt.filter_rows(
            (hl.abs(hl.agg.mean(mt.GT.n_alt_alleles()) / 2 - 0.5) <
             0.5))  # remove invariant SNPs
        mt = mt.annotate_cols(s=hl.str(mt.sample_idx))

        if hl.hadoop_is_file(betas_path) and hl.hadoop_is_file(phens_path):
            #            betas = hl.import_table(betas_path, impute=True, force=True)
            #            betas = betas.annotate(locus = hl.parse_locus(betas.locus),
            #                                   alleles = betas.alleles.replace('\[\"','').replace('\"\]','').split('\",\"'))
            #            betas = betas.key_by('locus','alleles')

            phens = hl.import_table(phens_path,
                                    key=['s'],
Ejemplo n.º 25
0
    def _test_linear_mixed_model_low_rank(self):
        """Fit a low-rank ``LinearMixedModel`` on simulated data and cross-check it.

        Steps:

        1. Simulate genotypes and a covariate matrix with an intercept column.
        2. Build ``k`` from the standardized last ``n_orig_markers`` genotype
           columns and draw a phenotype ``y`` whose mean depends on the first
           ``n_culprits`` genotype columns and the covariates.
        3. Derive ``s`` and ``p`` from the eigendecomposition of
           ``g_std.T @ g_std`` and compare them with the eigendecomposition
           of ``k``.
        4. Fit the model, then compare the alternative fits computed from
           block-matrix files with those computed by ``fit_alternatives_numpy``.

        NOTE(review): the leading underscore presumably keeps this method out
        of test collection — confirm whether that is intentional.
        """
        seed = 0
        n_populations = 8
        fst = n_populations * [.9]
        n_samples = 500
        n_variants = 200
        n_orig_markers = 100
        n_culprits = 10
        n_covariates = 3
        sigma_sq = 1
        tau_sq = 1

        from numpy.random import RandomState
        prng = RandomState(seed)

        # Covariates: a column of ones followed by n_covariates - 1 normal draws.
        x = np.hstack((np.ones(shape=(n_samples, 1)),
                       prng.normal(size=(n_samples, n_covariates - 1))))

        mt = hl.balding_nichols_model(n_populations=n_populations,
                                      n_samples=n_samples,
                                      n_variants=n_variants,
                                      fst=fst,
                                      af_dist=hl.rand_unif(0.1, 0.9, seed=seed),
                                      seed=seed)

        pa_t_path = utils.new_temp_file(suffix='bm')
        a_t_path = utils.new_temp_file(suffix='bm')

        BlockMatrix.write_from_entry_expr(mt.GT.n_alt_alleles(), a_t_path)

        # `a` is the transpose of the matrix written above; `g` is its last
        # n_orig_markers columns.
        a = BlockMatrix.read(a_t_path).T.to_numpy()
        g = a[:, -n_orig_markers:]
        g_std = self._filter_and_standardize_cols(g)

        # The helper may return fewer columns than it was given, so the
        # marker count is re-read from the result.
        n_markers = g_std.shape[1]

        k = (g_std @ g_std.T) * n_samples / n_markers

        beta = np.arange(n_covariates)
        beta_stars = np.array([1] * n_culprits)

        # Phenotype: multivariate normal whose mean combines the first
        # n_culprits columns of `a` (coefficient 1 each) with the covariates,
        # and whose covariance is sigma_sq * k + tau_sq * I.
        y = prng.multivariate_normal(
            np.hstack((a[:, 0:n_culprits], x)) @ np.hstack((beta_stars, beta)),
            sigma_sq * k + tau_sq * np.eye(n_samples))

        # low rank computation of S, P
        l = g_std.T @ g_std
        sl, v = np.linalg.eigh(l)
        # Keep only eigenpairs whose eigenvalue exceeds 1e-10.
        n_eigenvectors = int(np.sum(sl > 1e-10))
        sl = sl[-n_eigenvectors:]
        v = v[:, -n_eigenvectors:]
        s = sl * (n_samples / n_markers)
        p = (g_std @ (v / np.sqrt(sl))).T

        # compare with full rank S, P
        sk0, uk = np.linalg.eigh(k)
        sk = sk0[-n_eigenvectors:]
        pk = uk[:, -n_eigenvectors:].T
        assert np.allclose(sk, s)
        # Absolute values are compared because eigenvector signs are arbitrary.
        assert np.allclose(np.abs(pk), np.abs(p))

        # build and fit model
        py = p @ y
        px = p @ x
        pa = p @ a

        model = LinearMixedModel(py, px, s, y, x)
        assert model.n == n_samples
        assert model.f == n_covariates
        assert model.r == n_eigenvectors
        assert model.low_rank

        model.fit()

        # check effect sizes tend to be near 1 for first n_marker alternative models
        BlockMatrix.from_numpy(pa).T.write(pa_t_path, force_row_major=True)
        df_lmm = model.fit_alternatives(pa_t_path, a_t_path).to_pandas()

        assert 0.9 < np.mean(df_lmm['beta'][:n_culprits]) < 1.1

        # compare NumPy and Hail LMM per alternative
        df_numpy = model.fit_alternatives_numpy(pa, a).to_pandas()
        assert np.min(df_numpy['chi_sq']) > 0

        # Rows containing any NaN in either result.
        na_numpy = df_numpy.isna().any(axis=1)
        na_lmm = df_lmm.isna().any(axis=1)

        assert na_numpy.sum() <= 10
        assert na_lmm.sum() <= 10
        # At most 5 rows may be NaN in one result but not in the other.
        assert np.logical_xor(na_numpy, na_lmm).sum() <= 5

        # Compare p-values only on rows that are NaN-free in both results.
        mask = ~(na_numpy | na_lmm)

        lmm_vs_numpy_p_value = np.sort(np.abs(df_lmm['p_value'][mask] - df_numpy['p_value'][mask]))

        assert lmm_vs_numpy_p_value[10] < 1e-12  # 10 least p-values differences
        assert lmm_vs_numpy_p_value[-1] < 1e-8   # all p-values
Ejemplo n.º 26
0
    def test_locus_windows(self):
        """Check ``hl.linalg.utils.locus_windows`` results and error handling.

        The first half compares the returned ``starts``/``stops`` arrays with
        expected values, for position-based windows and for windows based on
        ``coord_expr``. The second half checks the exception type and message
        for unsorted input, mismatched or missing sources, expressions that
        are not row-indexed, and missing locus or coordinate values.
        """
        def assert_eq(a, b):
            self.assertTrue(np.array_equal(a, np.array(b)))

        centimorgans = hl.literal([0.1, 1.0, 1.0, 1.5, 1.9])

        mt = hl.balding_nichols_model(1, 5, 5).add_row_index()
        mt = mt.annotate_rows(cm=centimorgans[hl.int32(mt.row_idx)]).cache()

        starts, stops = hl.linalg.utils.locus_windows(mt.locus, 2)
        assert_eq(starts, [0, 0, 0, 1, 2])
        assert_eq(stops, [3, 4, 5, 5, 5])

        starts, stops = hl.linalg.utils.locus_windows(mt.locus, 0.5, coord_expr=mt.cm)
        assert_eq(starts, [0, 1, 1, 1, 3])
        assert_eq(stops, [1, 4, 4, 5, 5])

        # Doubling both the coordinates and the radius must give the same windows.
        starts, stops = hl.linalg.utils.locus_windows(mt.locus, 1.0, coord_expr=2 * centimorgans[hl.int32(mt.row_idx)])
        assert_eq(starts, [0, 1, 1, 1, 3])
        assert_eq(stops, [1, 4, 4, 5, 5])

        rows = [{'locus': hl.Locus('1', 1), 'cm': 1.0},
                {'locus': hl.Locus('1', 2), 'cm': 3.0},
                {'locus': hl.Locus('1', 4), 'cm': 4.0},
                {'locus': hl.Locus('2', 1), 'cm': 2.0},
                {'locus': hl.Locus('2', 1), 'cm': 2.0},
                {'locus': hl.Locus('3', 3), 'cm': 5.0}]

        ht = hl.Table.parallelize(rows,
                                  hl.tstruct(locus=hl.tlocus('GRCh37'), cm=hl.tfloat64),
                                  key=['locus'])

        starts, stops = hl.linalg.utils.locus_windows(ht.locus, 1)
        assert_eq(starts, [0, 0, 2, 3, 3, 5])
        assert_eq(stops, [2, 2, 3, 5, 5, 6])

        starts, stops = hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=ht.cm)
        assert_eq(starts, [0, 1, 1, 3, 3, 5])
        assert_eq(stops, [1, 3, 3, 5, 5, 6])

        # Error cases. assertIn shows the actual exception text on failure,
        # unlike assertTrue(x in y).
        with self.assertRaises(ValueError) as cm:
            hl.linalg.utils.locus_windows(ht.order_by(ht.cm).locus, 1.0)
        self.assertIn('ascending order', str(cm.exception))

        with self.assertRaises(ExpressionException) as cm:
            hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=hl.utils.range_table(1).idx)
        self.assertIn('different source', str(cm.exception))

        with self.assertRaises(ExpressionException) as cm:
            hl.linalg.utils.locus_windows(hl.locus('1', 1), 1.0)
        self.assertIn("no source", str(cm.exception))

        with self.assertRaises(ExpressionException) as cm:
            hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=0.0)
        self.assertIn("no source", str(cm.exception))

        # Global fields belong to the table but are not row-indexed.
        ht = ht.annotate_globals(x = hl.locus('1', 1), y = 1.0)
        with self.assertRaises(ExpressionException) as cm:
            hl.linalg.utils.locus_windows(ht.x, 1.0)
        self.assertIn("row-indexed", str(cm.exception))
        with self.assertRaises(ExpressionException) as cm:
            hl.linalg.utils.locus_windows(ht.locus, 1.0, ht.y)
        self.assertIn("row-indexed", str(cm.exception))

        # A missing locus must be rejected with or without coord_expr.
        ht = hl.Table.parallelize([{'locus': hl.null(hl.tlocus()), 'cm': 1.0}],
                                  hl.tstruct(locus=hl.tlocus('GRCh37'), cm=hl.tfloat64), key=['locus'])
        with self.assertRaises(ValueError) as cm:
            hl.linalg.utils.locus_windows(ht.locus, 1.0)
        self.assertIn("missing value for 'locus_expr'", str(cm.exception))
        with self.assertRaises(ValueError) as cm:
            hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=ht.cm)
        self.assertIn("missing value for 'locus_expr'", str(cm.exception))

        # A missing coordinate must be rejected as well.
        ht = hl.Table.parallelize([{'locus': hl.Locus('1', 1), 'cm': hl.null(hl.tfloat64)}],
                                  hl.tstruct(locus=hl.tlocus('GRCh37'), cm=hl.tfloat64), key=['locus'])
        with self.assertRaises(ValueError) as cm:
            hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=ht.cm)
        self.assertIn("missing value for 'coord_expr'", str(cm.exception))
Ejemplo n.º 27
0
def generate_datasets(doctest_namespace):
    """Populate ``doctest_namespace`` with the Hail objects used by doctests.

    Builds example matrix tables, tables and expressions from the files under
    ``data/`` and stores them in ``doctest_namespace`` under the names the
    documentation examples refer to (``ds``, ``table1``, ``burden_ds``, ...).
    Two datasets are checkpointed under ``output/`` along the way.

    Parameters
    ----------
    doctest_namespace
        Mutable mapping that is filled in place via item assignment
        (presumably pytest's ``doctest_namespace`` fixture -- confirm against
        the caller).

    Returns
    -------
    None
    """
    doctest_namespace['hl'] = hl
    doctest_namespace['np'] = np

    # Main example matrix table: a ~3% row sample of the example VCF,
    # decorated with the row/column/global fields the examples expect.
    ds = hl.import_vcf('data/sample.vcf.bgz')
    ds = ds.sample_rows(0.03)
    ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5),
                          panel_maf=0.1,
                          anno1=5,
                          anno2=0,
                          consequence="LOF",
                          gene="A",
                          score=5.0)
    ds = ds.annotate_rows(a_index=1)
    ds = hl.sample_qc(hl.variant_qc(ds))
    ds = ds.annotate_cols(is_case=True,
                          pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                          is_female=hl.rand_bool(0.5),
                                          age=hl.rand_norm(65, 10),
                                          height=hl.rand_norm(70, 10),
                                          blood_pressure=hl.rand_norm(120, 20),
                                          cohort_name="cohort1"),
                          cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                          cov1=hl.rand_norm(0, 1),
                          cov2=hl.rand_norm(0, 1),
                          cohort="SIGMA")
    ds = ds.annotate_globals(
        global_field_1=5,
        global_field_2=10,
        pli={
            'SCN1A': 0.999,
            'SONIC': 0.014
        },
        populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
    # NOTE: `gene` is annotated a second time here, now as an array.
    ds = ds.annotate_rows(gene=['TTN'])
    ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
    # Checkpoint so the aliases registered below all share one written copy.
    ds = ds.checkpoint('output/example.mt', overwrite=True)

    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    # Row (variant) and column (sample) metadata tables derived from `ds`.
    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(
        consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata
    doctest_namespace['cols_to_keep'] = s_metadata
    doctest_namespace['cols_to_remove'] = s_metadata
    doctest_namespace['rows_to_keep'] = v_metadata
    doctest_namespace['rows_to_remove'] = v_metadata

    small_mt = hl.balding_nichols_model(3, 4, 4)
    doctest_namespace['small_mt'] = small_mt.checkpoint('output/small.mt',
                                                        overwrite=True)

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    table4 = hl.import_table('data/kt_example4.tsv',
                             impute=True,
                             types={
                                 'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                 'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                 'E': hl.tstruct(A=hl.tint32, B=hl.tint32)
                             })
    doctest_namespace['table4'] = table4

    people_table = hl.import_table('data/explode_example.tsv',
                                   delimiter='\\s+',
                                   types={
                                       'Age': hl.tint32,
                                       'Children': hl.tarray(hl.tstr)
                                   },
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.null(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({
        'Alice': 43,
        'Bob': 33,
        'Charles': 44
    })
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval(
        "1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    doctest_namespace['interval2'] = hl.Interval(3, 6)
    doctest_namespace['nd'] = hl._nd.array([[1, 2], [3, 4]])

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv",
                                              impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen',
                          entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    # Burden-test dataset: VCF joined with per-sample phenotypes, a
    # position-derived row weight, and gene intervals.
    burden_ds = hl.import_vcf('data/example_burden.vcf')
    burden_kt = hl.import_table('data/example_burden.tsv',
                                key='Sample',
                                impute=True)
    burden_ds = burden_ds.annotate_cols(burden=burden_kt[burden_ds.s])
    burden_ds = burden_ds.annotate_rows(
        weight=hl.float64(burden_ds.locus.position))
    burden_ds = hl.variant_qc(burden_ds)
    genekt = hl.import_locus_intervals('data/gene.interval_list')
    burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
    burden_ds = burden_ds.checkpoint('output/example_burden.vds',
                                     overwrite=True)
    doctest_namespace['burden_ds'] = burden_ds

    # LD score regression: single-phenotype summary statistics table.
    ld_score_one_pheno_sumstats = hl.import_table(
        'data/ld_score_regression.one_pheno.sumstats.tsv',
        types={
            'locus': hl.tlocus('GRCh37'),
            'alleles': hl.tarray(hl.tstr),
            'chi_squared': hl.tfloat64,
            'n': hl.tint32,
            'ld_score': hl.tfloat64,
            'phenotype': hl.tstr,
            'chi_squared_50_irnt': hl.tfloat64,
            'n_50_irnt': hl.tint32,
            'chi_squared_20160': hl.tfloat64,
            'n_20160': hl.tint32
        },
        key=['locus', 'alleles'])
    doctest_namespace[
        'ld_score_one_pheno_sumstats'] = ld_score_one_pheno_sumstats

    # LD score regression: all phenotypes as a matrix table. Each entry is
    # imported as a comma-separated string and split into chi_squared and n.
    mt = hl.import_matrix_table(
        'data/ld_score_regression.all_phenos.sumstats.tsv',
        row_fields={
            'locus': hl.tstr,
            'alleles': hl.tstr,
            'ld_score': hl.tfloat64
        },
        entry_type=hl.tstr)
    mt = mt.key_cols_by(phenotype=mt.col_id)
    mt = mt.key_rows_by(locus=hl.parse_locus(mt.locus),
                        alleles=mt.alleles.split(','))
    mt = mt.drop('row_id', 'col_id')
    mt = mt.annotate_entries(x=mt.x.split(","))
    mt = mt.transmute_entries(chi_squared=hl.float64(mt.x[0]),
                              n=hl.int32(mt.x[1]))
    mt = mt.annotate_rows(ld_score=hl.float64(mt.ld_score))
    doctest_namespace['ld_score_all_phenos_sumstats'] = mt

    print("finished setting up doctest...")
Ejemplo n.º 28
0
    def test_linear_mixed_model_full_rank(self):
        """Fit a full-rank ``LinearMixedModel`` on simulated data and check it.

        Simulates genotypes with ``hl.balding_nichols_model``, draws a
        phenotype whose mean depends on the first ``n_culprits`` variants, fits
        the null model, and then checks that (a) the fitted effect sizes of the
        culprit variants average close to 1 and (b) the p-values from
        ``fit_alternatives`` and ``fit_alternatives_numpy`` agree.
        """
        seed = 0
        n_populations = 8
        fst = n_populations * [.9]
        n_samples = 200
        n_variants = 500
        n_orig_markers = 500
        n_culprits = 20
        n_covariates = 3
        sigma_sq = 1
        tau_sq = 1

        from numpy.random import RandomState
        prng = RandomState(seed)

        # Covariates: an intercept column of ones followed by
        # n_covariates - 1 columns of standard-normal draws.
        x = np.hstack((np.ones(shape=(n_samples, 1)),
                       prng.normal(size=(n_samples, n_covariates - 1))))

        mt = hl.balding_nichols_model(n_populations=n_populations,
                                      n_samples=n_samples,
                                      n_variants=n_variants,
                                      fst=fst,
                                      seed=seed)

        pa_t_path = utils.new_temp_file(suffix='bm')

        # Alt-allele counts, transposed so that rows line up with the rows of
        # `x` (they are hstacked together below).
        a = BlockMatrix.from_entry_expr(mt.GT.n_alt_alleles()).T.to_numpy()
        # Last n_orig_markers columns; with the values above that is every column.
        g = a[:, -n_orig_markers:]
        # NOTE(review): helper is defined outside this block; going by its name
        # it may drop columns, so the marker count is re-read from the result.
        g_std = self._filter_and_standardize_cols(g)

        n_markers = g_std.shape[1]

        k = (g_std @ g_std.T) * n_samples / n_markers

        beta = np.arange(n_covariates)
        beta_stars = np.array([1] * n_culprits)

        # Phenotype: mean is the culprit genotypes and covariates times their
        # effect sizes; covariance is sigma_sq * k + tau_sq * I.
        y = prng.multivariate_normal(
            np.hstack((a[:, 0:n_culprits], x)) @ np.hstack((beta_stars, beta)),
            sigma_sq * k + tau_sq * np.eye(n_samples))

        # Eigendecompose k and rotate y, x and a by the transposed eigenvectors.
        s, u = np.linalg.eigh(k)
        p = u.T

        # build and fit model
        py = p @ y
        px = p @ x
        pa = p @ a

        model = LinearMixedModel(py, px, s)
        assert model.n == n_samples
        assert model.f == n_covariates
        assert model.r == n_samples
        assert (not model.low_rank)

        model.fit()

        # check effect sizes tend to be near 1 for the first n_culprits alternative models
        BlockMatrix.from_numpy(pa).T.write(pa_t_path, force_row_major=True)
        df_lmm = model.fit_alternatives(pa_t_path).to_pandas()

        assert 0.9 < np.mean(df_lmm['beta'][:n_culprits]) < 1.1

        # compare NumPy and Hail LMM per alternative
        df_numpy = model.fit_alternatives_numpy(pa, a).to_pandas()

        # Rows containing any NaN in either result; a few are tolerated, but
        # the two implementations must mostly agree on which rows they are.
        na_numpy = df_numpy.isna().any(axis=1)
        na_lmm = df_lmm.isna().any(axis=1)

        assert na_numpy.sum() <= 20
        assert na_lmm.sum() <= 20
        assert np.logical_xor(na_numpy, na_lmm).sum() <= 10

        # Compare p-values only on rows that are NaN-free in both results.
        mask = ~(na_numpy | na_lmm)

        # Absolute p-value differences, sorted ascending.
        lmm_vs_numpy_p_value = np.sort(
            np.abs(df_lmm['p_value'][mask] - df_numpy['p_value'][mask]))

        assert lmm_vs_numpy_p_value[10] < 1e-12  # the 11 smallest p-value differences (index 10 of the sorted array)
        assert lmm_vs_numpy_p_value[-1] < 1e-8  # all p-values
Ejemplo n.º 29
0
    def test_locus_windows(self):
        """Check ``hl.linalg.utils.locus_windows`` results and error handling.

        Covers window bounds computed from locus position and from an explicit
        ``coord_expr`` on both a matrix table and a table, then the error
        messages raised for unsorted input, mismatched or missing sources,
        non-row-indexed expressions, and missing locus/coordinate values.
        """
        def assert_eq(a, b):
            self.assertTrue(np.array_equal(a, np.array(b)))

        def assert_fails(exc_type, fragment, call):
            # `call` is a zero-argument callable so that the expression is
            # built inside the assertRaises context, exactly as an inline
            # `with` block would do. assertIn reports both the fragment and
            # the actual message when the check fails.
            with self.assertRaises(exc_type) as caught:
                call()
            self.assertIn(fragment, str(caught.exception))

        locus_windows = hl.linalg.utils.locus_windows

        centimorgans = hl.literal([0.1, 1.0, 1.0, 1.5, 1.9])

        mt = hl.balding_nichols_model(1, 5, 5).add_row_index()
        mt = mt.annotate_rows(cm=centimorgans[hl.int32(mt.row_idx)]).cache()

        # Windows by base-pair position.
        starts, stops = locus_windows(mt.locus, 2)
        assert_eq(starts, [0, 0, 0, 1, 2])
        assert_eq(stops, [3, 4, 5, 5, 5])

        # Windows by an explicit coordinate field.
        starts, stops = locus_windows(mt.locus, 0.5, coord_expr=mt.cm)
        assert_eq(starts, [0, 1, 1, 1, 3])
        assert_eq(stops, [1, 4, 4, 5, 5])

        # Doubling both the coordinates and the radius gives the same windows.
        starts, stops = locus_windows(
            mt.locus, 1.0, coord_expr=2 * centimorgans[hl.int32(mt.row_idx)])
        assert_eq(starts, [0, 1, 1, 1, 3])
        assert_eq(stops, [1, 4, 4, 5, 5])

        rows = [{
            'locus': hl.Locus('1', 1),
            'cm': 1.0
        }, {
            'locus': hl.Locus('1', 2),
            'cm': 3.0
        }, {
            'locus': hl.Locus('1', 4),
            'cm': 4.0
        }, {
            'locus': hl.Locus('2', 1),
            'cm': 2.0
        }, {
            'locus': hl.Locus('2', 1),
            'cm': 2.0
        }, {
            'locus': hl.Locus('3', 3),
            'cm': 5.0
        }]

        ht = hl.Table.parallelize(rows,
                                  hl.tstruct(locus=hl.tlocus('GRCh37'),
                                             cm=hl.tfloat64),
                                  key=['locus'])

        starts, stops = locus_windows(ht.locus, 1)
        assert_eq(starts, [0, 0, 2, 3, 3, 5])
        assert_eq(stops, [2, 2, 3, 5, 5, 6])

        starts, stops = locus_windows(ht.locus, 1.0, coord_expr=ht.cm)
        assert_eq(starts, [0, 1, 1, 3, 3, 5])
        assert_eq(stops, [1, 3, 3, 5, 5, 6])

        # Rows re-ordered by `cm` are no longer sorted by locus.
        assert_fails(ValueError, 'ascending order',
                     lambda: locus_windows(ht.order_by(ht.cm).locus, 1.0))

        # coord_expr taken from an unrelated table.
        assert_fails(ExpressionException, 'different source',
                     lambda: locus_windows(
                         ht.locus, 1.0,
                         coord_expr=hl.utils.range_table(1).idx))

        # Expressions that are not attached to any table.
        assert_fails(ExpressionException, "no source",
                     lambda: locus_windows(hl.locus('1', 1), 1.0))
        assert_fails(ExpressionException, "no source",
                     lambda: locus_windows(ht.locus, 1.0, coord_expr=0.0))

        # Global fields are rejected with a "row-indexed" message.
        ht = ht.annotate_globals(x=hl.locus('1', 1), y=1.0)
        assert_fails(ExpressionException, "row-indexed",
                     lambda: locus_windows(ht.x, 1.0))
        assert_fails(ExpressionException, "row-indexed",
                     lambda: locus_windows(ht.locus, 1.0, ht.y))

        # Missing locus value.
        ht = hl.Table.parallelize([{
            'locus': hl.null(hl.tlocus()),
            'cm': 1.0
        }],
                                  hl.tstruct(locus=hl.tlocus('GRCh37'),
                                             cm=hl.tfloat64),
                                  key=['locus'])
        assert_fails(ValueError, "missing value for 'locus_expr'",
                     lambda: locus_windows(ht.locus, 1.0))
        assert_fails(ValueError, "missing value for 'locus_expr'",
                     lambda: locus_windows(ht.locus, 1.0, coord_expr=ht.cm))

        # Missing coordinate value.
        ht = hl.Table.parallelize([{
            'locus': hl.Locus('1', 1),
            'cm': hl.null(hl.tfloat64)
        }],
                                  hl.tstruct(locus=hl.tlocus('GRCh37'),
                                             cm=hl.tfloat64),
                                  key=['locus'])
        assert_fails(ValueError, "missing value for 'coord_expr'",
                     lambda: locus_windows(ht.locus, 1.0, coord_expr=ht.cm))
Ejemplo n.º 30
0
import hail as hl
# Fix the global seed before simulating and masking genotypes.
hl.set_global_seed(0)

# Simulate 1024 variants for 4 samples drawn from 3 populations.
mt = hl.balding_nichols_model(n_populations=3, n_variants=1024, n_samples=4)

# Key columns by a string sample id: 's' followed by the sample index.
sample_id = 's' + hl.str(mt.sample_idx)
mt = mt.key_cols_by(s=sample_id)

# Keep each genotype call with probability 0.99; otherwise set it to missing.
keep_call = hl.rand_bool(0.99)
mt = mt.annotate_entries(GT=hl.or_missing(keep_call, mt.GT))

family_id = 'f' + mt.s
hl.export_plink(mt, 'balding-nichols-1024-variants-4-samples-3-populations', fam_id=family_id)
Ejemplo n.º 31
0
import hail as hl
# Simulated dataset: 3 populations, 100 samples, 100 variants.
mt = hl.balding_nichols_model(3, 100, 100)

# Run SKAT with loci as keys, ancestral allele frequency as the weight,
# population as the response and alt-allele count as the input variable.
alt_allele_count = mt.GT.n_alt_alleles()
intercept_only = [1]
t = hl.skat(mt.locus, mt.ancestral_af, mt.pop, alt_allele_count, covariates=intercept_only)
t.show()
import hail as hl
mt = hl.balding_nichols_model(3, 100, 100)

# Per-variant summary: the mean alt-allele count plus the collected
# per-sample alt-allele counts, both as floats.
dosage = hl.float(mt.GT.n_alt_alleles())
mt_with_stats = mt.annotate_rows(mean=hl.agg.mean(dosage),
                                 genotypes=hl.agg.collect(dosage))
gts_as_rows = mt_with_stats.rows()

# Group variants by position // 10 and gather each group's genotype vectors
# and per-variant means.
by_block = gts_as_rows.group_by(ld_block=gts_as_rows.locus.position // 10)
groups = by_block.aggregate(genotypes=hl.agg.collect(gts_as_rows.genotypes),
                            ys=hl.agg.collect(gts_as_rows.mean))

# Hand the grouped table over to Spark as a DataFrame.
df = groups.to_spark()

from pyspark.sql.functions import udf


def get_intercept(X, y):
    """Fit a Lasso regression (alpha=0.1) of ``y`` on ``X``; return its intercept as a float."""
    # Imported inside the function, as in the original.
    from sklearn.linear_model import Lasso

    model = Lasso(alpha=0.1)
    model.fit(X, y)
    intercept = model.intercept_
    return float(intercept)


# Wrap the Python function as a Spark UDF and apply it to every row of `df`.
get_intercept_udf = udf(get_intercept)
intercept_column = get_intercept_udf("genotypes", "ys").alias("intercept")
df.select(intercept_column).show()