Example #1
    def test_linear_mixed_model_function(self):
        n, f, m = 4, 2, 3
        y = np.array([0.0, 1.0, 8.0, 9.0])
        x = np.array([[1.0, 0.0],
                      [1.0, 2.0],
                      [1.0, 1.0],
                      [1.0, 4.0]])
        z = np.array([[0.0, 0.0, 1.0],
                      [0.0, 1.0, 2.0],
                      [1.0, 2.0, 0.0],
                      [2.0, 0.0, 1.0]])
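        # z holds genotype dosages (0/1/2) per sample (rows) by variant
        # (columns); make_call below maps each dosage back to a diploid call.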

        p_path = utils.new_temp_file()

        def make_call(gt):
            if gt == 0.0:
                return hl.Call([0, 0])
            if gt == 1.0:
                return hl.Call([0, 1])
            if gt == 2.0:
                return hl.Call([1, 1])

        data = [{'v': j, 's': i, 'y': y[i], 'x1': x[i, 1], 'zt': make_call(z[i, j])}
                for i in range(n) for j in range(m)]
        ht = hl.Table.parallelize(data, hl.dtype('struct{v: int32, s: int32, y: float64, x1: float64, zt: tcall}'))
        mt = ht.to_matrix_table(row_key=['v'], col_key=['s'], col_fields=['x1', 'y'])
        colsort = np.argsort(mt.key_cols_by().s.collect()).tolist()
        mt = mt.choose_cols(colsort)

        rrm = hl.realized_relationship_matrix(mt.zt).to_numpy()
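        # rrm is the n x n realized relationship matrix of the standardized
        # genotypes; with only m = 3 variants it has rank at most m, which the
        # low-rank checks below rely on.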

        # kinship path agrees with from_kinship
        model, p = hl.linear_mixed_model(mt.y, [1, mt.x1], k=rrm, p_path=p_path, overwrite=True)
        model0, p0 = LinearMixedModel.from_kinship(y, x, rrm, p_path, overwrite=True)
        assert model0._same(model)
        assert np.allclose(p0, p)

        # random effects path with standardize=True agrees with low-rank rrm
        s0, u0 = np.linalg.eigh(rrm)
        s0 = np.flip(s0, axis=0)[:m]
        p0 = np.fliplr(u0).T[:m, :]
        model, p = hl.linear_mixed_model(mt.y, [1, mt.x1], z_t=mt.zt.n_alt_alleles(), p_path=p_path, overwrite=True)
        model0 = LinearMixedModel(p0 @ y, p0 @ x, s0, y, x, p_path=p_path)
        assert model0._same(model)

        # random effects path with standardize=False agrees with from_random_effects
        model0, p0 = LinearMixedModel.from_random_effects(y, x, z, p_path, overwrite=True)
        model, p = hl.linear_mixed_model(mt.y, [1, mt.x1], z_t=mt.zt.n_alt_alleles(), p_path=p_path, overwrite=True, standardize=False)
        assert model0._same(model)
        assert np.allclose(p0, p.to_numpy())
Example #2
    def test_linear_mixed_model_math(self):
        gamma = 2.0  # testing at fixed value of gamma
        n, f, m = 4, 2, 3
        y = np.array([0.0, 1.0, 8.0, 9.0])
        x = np.array([[1.0, 0.0],
                      [1.0, 2.0],
                      [1.0, 1.0],
                      [1.0, 4.0]])
        z = np.array([[0.0, 0.0, 1.0],
                      [0.0, 1.0, 2.0],
                      [1.0, 2.0, 4.0],
                      [2.0, 4.0, 8.0]])
        k = z @ z.T
        v = k + np.eye(4) / gamma
        v_inv = np.linalg.inv(v)

        beta = np.linalg.solve(x.T @ v_inv @ x, x.T @ v_inv @ y)
        residual = y - x @ beta
        sigma_sq = 1 / (n - f) * (residual @ v_inv @ residual)
        sv = sigma_sq * v
        neg_log_lkhd = 0.5 * (np.linalg.slogdet(sv)[1] + np.linalg.slogdet(x.T @ np.linalg.inv(sv) @ x)[1])  # plus C
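        # Sanity check (a sketch, not part of the original assertions): beta
        # solves the generalized least squares normal equations, so the
        # V^{-1}-weighted residual is orthogonal to the columns of x.
        assert np.allclose(x.T @ v_inv @ residual, 0.0)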

        x_star = np.array([1.0, 0.0, 1.0, 0.0])
        a = x_star.reshape(n, 1)
        x1 = np.hstack([a, x])
        beta1 = np.linalg.solve(x1.T @ v_inv @ x1, x1.T @ v_inv @ y)
        residual1 = y - x1 @ beta1
        chi_sq = n * np.log((residual @ v_inv @ residual) / (residual1 @ v_inv @ residual1))
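        # chi_sq is the likelihood-ratio statistic n * log(RSS_0 / RSS_1),
        # where each RSS is a residual sum of squares in the V^{-1} inner
        # product, for the null (x) and alternative (x1) fixed effects.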

        # test full-rank fit
        model, p = LinearMixedModel.from_kinship(y, x, k)
        model.fit(np.log(gamma))
        self.assertTrue(np.allclose(model.beta, beta))
        self.assertAlmostEqual(model.sigma_sq, sigma_sq)
        self.assertAlmostEqual(model.compute_neg_log_reml(np.log(gamma)), neg_log_lkhd)

        # test full-rank alternative
        pa = p @ a
        stats = model.fit_alternatives_numpy(pa).collect()[0]
        self.assertAlmostEqual(stats.beta, beta1[0])
        self.assertAlmostEqual(stats.chi_sq, chi_sq)

        pa_t_path = utils.new_temp_file()
        BlockMatrix.from_numpy(pa.T).write(pa_t_path, force_row_major=True)
        stats = model.fit_alternatives(pa_t_path).collect()[0]
        self.assertAlmostEqual(stats.beta, beta1[0])
        self.assertAlmostEqual(stats.chi_sq, chi_sq)

        # test low-rank fit
        model, p = LinearMixedModel.from_random_effects(y, x, z)
        model.fit(np.log(gamma))
        self.assertTrue(np.allclose(model.beta, beta))
        self.assertAlmostEqual(model.sigma_sq, sigma_sq)
        self.assertAlmostEqual(model.compute_neg_log_reml(np.log(gamma)), neg_log_lkhd)

        # test low-rank alternative
        pa = p @ a
        stats = model.fit_alternatives_numpy(pa, a).collect()[0]
        self.assertAlmostEqual(stats.beta, beta1[0])
        self.assertAlmostEqual(stats.chi_sq, chi_sq)

        a_t_path = utils.new_temp_file()
        BlockMatrix.from_numpy(a.T).write(a_t_path, force_row_major=True)
        pa_t_path = utils.new_temp_file()
        BlockMatrix.from_numpy(pa.T).write(pa_t_path, force_row_major=True)
        stats = model.fit_alternatives(pa_t_path, a_t_path).collect()[0]
        self.assertAlmostEqual(stats.beta, beta1[0])
        self.assertAlmostEqual(stats.chi_sq, chi_sq)
Example #3
    def _test_linear_mixed_model_low_rank(self):
        seed = 0
        n_populations = 8
        fst = n_populations * [.9]
        n_samples = 500
        n_variants = 200
        n_orig_markers = 100
        n_culprits = 10
        n_covariates = 3
        sigma_sq = 1
        tau_sq = 1

        from numpy.random import RandomState
        prng = RandomState(seed)

        x = np.hstack((np.ones(shape=(n_samples, 1)),
                       prng.normal(size=(n_samples, n_covariates - 1))))

        mt = hl.balding_nichols_model(n_populations=n_populations,
                                      n_samples=n_samples,
                                      n_variants=n_variants,
                                      fst=fst,
                                      af_dist=hl.rand_unif(0.1, 0.9, seed=seed),
                                      seed=seed)

        pa_t_path = utils.new_temp_file(suffix='bm')
        a_t_path = utils.new_temp_file(suffix='bm')

        BlockMatrix.write_from_entry_expr(mt.GT.n_alt_alleles(), a_t_path)

        a = BlockMatrix.read(a_t_path).T.to_numpy()
        g = a[:, -n_orig_markers:]
        g_std = self._filter_and_standardize_cols(g)

        n_markers = g_std.shape[1]

        k = (g_std @ g_std.T) * n_samples / n_markers

        beta = np.arange(n_covariates)
        beta_stars = np.array([1] * n_culprits)

        y = prng.multivariate_normal(
            np.hstack((a[:, 0:n_culprits], x)) @ np.hstack((beta_stars, beta)),
            sigma_sq * k + tau_sq * np.eye(n_samples))
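        # y is simulated as fixed effects (the covariates x plus n_culprits
        # causal markers, each with effect size 1) plus correlated noise: a
        # polygenic component through k and iid noise through the identity.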

        # low rank computation of S, P
        ld = g_std.T @ g_std
        sl, v = np.linalg.eigh(ld)
        n_eigenvectors = int(np.sum(sl > 1e-10))
        sl = sl[-n_eigenvectors:]
        v = v[:, -n_eigenvectors:]
        s = sl * (n_samples / n_markers)
        p = (g_std @ (v / np.sqrt(sl))).T
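        # Sanity check (a sketch): the rows of p are orthonormal eigenvectors
        # of g_std @ g_std.T, so projecting k with p should recover diag(s)
        # up to floating-point error.
        assert np.allclose(p @ k @ p.T, np.diag(s), atol=1e-6)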

        # compare with full rank S, P
        sk0, uk = np.linalg.eigh(k)
        sk = sk0[-n_eigenvectors:]
        pk = uk[:, -n_eigenvectors:].T
        assert np.allclose(sk, s)
        assert np.allclose(np.abs(pk), np.abs(p))

        # build and fit model
        py = p @ y
        px = p @ x
        pa = p @ a

        model = LinearMixedModel(py, px, s, y, x)
        assert model.n == n_samples
        assert model.f == n_covariates
        assert model.r == n_eigenvectors
        assert model.low_rank

        model.fit()

        # check that effect sizes tend to be near 1 for the first n_culprits alternative models
        BlockMatrix.from_numpy(pa).T.write(pa_t_path, force_row_major=True)
        df_lmm = model.fit_alternatives(pa_t_path, a_t_path).to_pandas()

        assert 0.9 < np.mean(df_lmm['beta'][:n_culprits]) < 1.1

        # compare NumPy and Hail LMM per alternative
        df_numpy = model.fit_alternatives_numpy(pa, a).to_pandas()
        assert np.min(df_numpy['chi_sq']) > 0

        na_numpy = df_numpy.isna().any(axis=1)
        na_lmm = df_lmm.isna().any(axis=1)

        assert na_numpy.sum() <= 10
        assert na_lmm.sum() <= 10
        assert np.logical_xor(na_numpy, na_lmm).sum() <= 5

        mask = ~(na_numpy | na_lmm)

        lmm_vs_numpy_p_value = np.sort(np.abs(df_lmm['p_value'][mask] - df_numpy['p_value'][mask]))

        assert lmm_vs_numpy_p_value[10] < 1e-12  # the smallest differences are essentially zero
        assert lmm_vs_numpy_p_value[-1] < 1e-8   # all differences are small
Example #4
    def test_linear_mixed_model_fastlmm(self):
        # FastLMM test data is from all.bed, all.bim, all.fam, cov.txt, pheno_10_causals.txt:
        #   https://github.com/MicrosoftGenomics/FaST-LMM/tree/master/tests/datasets/synth
        #
        # Data is filtered to chromosomes 1 and 3 and samples 0-124 and 375-499
        # (2000 variants and 250 samples).
        #
        # Results are computed with single_snp (with LOCO) as in:
        #   https://github.com/MicrosoftGenomics/FaST-LMM/blob/master/doc/ipynb/FaST-LMM.ipynb

        n, m = 250, 1000  # per chromosome

        x_table = hl.import_table(resource('fastlmmCov.txt'), no_header=True, impute=True).key_by('f1')
        y_table = hl.import_table(resource('fastlmmPheno.txt'), no_header=True, impute=True, delimiter=' ').key_by('f1')

        mt = hl.import_plink(bed=resource('fastlmmTest.bed'),
                             bim=resource('fastlmmTest.bim'),
                             fam=resource('fastlmmTest.fam'),
                             reference_genome=None)
        mt = mt.annotate_cols(x=x_table[mt.col_key].f2)
        mt = mt.annotate_cols(y=y_table[mt.col_key].f2).cache()

        x = np.array([np.ones(n), mt.x.collect()]).T
        y = np.array(mt.y.collect())

        mt_chr1 = mt.filter_rows(mt.locus.contig == '1')
        mt_chr3 = mt.filter_rows(mt.locus.contig == '3')

        # testing chrom 1 for h2, betas, p-values
        h2_fastlmm = 0.14276125
        beta_fastlmm = [0.012202061, 0.037718282, -0.033572693, 0.29171541, -0.045644170]

        # FastLMM p-values do not agree to high precision because FastLMM regresses
        # x out of each SNP first and does an F(1, dof) test on (beta / se)^2
        # (equivalently a t-test), whereas Hail does a likelihood ratio test.
        # We verify below that Hail's p-values remain fixed going forward.
        # fastlmm = [0.84650294, 0.57865098, 0.59050998, 1.6649473e-06, 0.46892059]
        pval_hail = [0.84543084, 0.57596760, 0.58788517, 1.4057279e-06, 0.46578204]

        gamma_fastlmm = h2_fastlmm / (1 - h2_fastlmm)
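        # with h2 = sigma_g^2 / (sigma_g^2 + sigma_e^2) and
        # gamma = sigma_g^2 / sigma_e^2, it follows that gamma = h2 / (1 - h2)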

        g = BlockMatrix.from_entry_expr(mt_chr1.GT.n_alt_alleles()).to_numpy().T
        g_std = self._filter_and_standardize_cols(g)

        # full rank
        k = (g_std @ g_std.T) * (n / m)
        s, u = np.linalg.eigh(k)
        p = u.T
        model = LinearMixedModel(p @ y, p @ x, s)
        model.fit()

        assert np.isclose(model.h_sq, h2_fastlmm)

        h2_std_error = 0.13770773  # hard-coded value, checked against the likelihood plot
        assert np.isclose(model.h_sq_standard_error, h2_std_error)

        h_sq_norm_lkhd = model.h_sq_normalized_lkhd()[1:-1]
        argmax = int(100 * h2_fastlmm)
        assert argmax <= np.argmax(h_sq_norm_lkhd) + 1 <= argmax + 1
        assert np.isclose(np.sum(h_sq_norm_lkhd), 1.0)
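        # h_sq_normalized_lkhd() appears to return the likelihood of h_sq on
        # the grid 0.00, 0.01, ..., 1.00, normalized to sum to 1; dropping the
        # endpoints, its argmax must land within one grid point of FastLMM's h2.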

        mt3_chr3_5var = mt_chr3.filter_rows(mt_chr3.locus.position < 2005)  # first 5
        a = BlockMatrix.from_entry_expr(mt3_chr3_5var.GT.n_alt_alleles()).to_numpy().T

        # FastLMM standardizes each variant to have mean 0 and variance 1.
        a = self._filter_and_standardize_cols(a) * np.sqrt(n)
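        # (assumption: _filter_and_standardize_cols scales columns to unit
        # norm, so multiplying by sqrt(n) recovers unit variance per variant)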
        pa = p @ a

        model.fit(log_gamma=np.log(gamma_fastlmm))

        res = model.fit_alternatives_numpy(pa).to_pandas()

        assert np.allclose(res['beta'], beta_fastlmm)
        assert np.allclose(res['p_value'], pval_hail)

        pa_t_path = utils.new_temp_file(suffix='bm')
        BlockMatrix.from_numpy(pa.T).write(pa_t_path, force_row_major=True)

        res = model.fit_alternatives(pa_t_path).to_pandas()

        assert np.allclose(res['beta'], beta_fastlmm)
        assert np.allclose(res['p_value'], pval_hail)

        # low rank
        ld = g_std.T @ g_std
        sl, v = np.linalg.eigh(ld)
        n_eigenvectors = int(np.sum(sl > 1e-10))
        assert n_eigenvectors < n
        sl = sl[-n_eigenvectors:]
        v = v[:, -n_eigenvectors:]
        s = sl * (n / m)
        p = (g_std @ (v / np.sqrt(sl))).T
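        # low-rank path: eigendecompose the Gram matrix g_std.T @ g_std, keep
        # the numerically nonzero eigenvalues, and map the eigenvectors
        # through g_std so that the rows of p are orthonormal eigenvectors of k.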
        model = LinearMixedModel(p @ y, p @ x, s, y, x)
        model.fit()

        assert np.isclose(model.h_sq, h2_fastlmm)
        assert np.isclose(model.h_sq_standard_error, h2_std_error)

        model.fit(log_gamma=np.log(gamma_fastlmm))

        pa = p @ a
        res = model.fit_alternatives_numpy(pa, a).to_pandas()

        assert np.allclose(res['beta'], beta_fastlmm)
        assert np.allclose(res['p_value'], pval_hail)

        a_t_path = utils.new_temp_file(suffix='bm')
        BlockMatrix.from_numpy(a.T).write(a_t_path, force_row_major=True)

        pa_t_path = utils.new_temp_file(suffix='bm')
        BlockMatrix.from_numpy(pa.T).write(pa_t_path, force_row_major=True)

        res = model.fit_alternatives(pa_t_path, a_t_path).to_pandas()

        assert np.allclose(res['beta'], beta_fastlmm)
        assert np.allclose(res['p_value'], pval_hail)

        # testing chrom 3 for h2
        h2_fastlmm = 0.36733240

        g = BlockMatrix.from_entry_expr(mt_chr3.GT.n_alt_alleles()).to_numpy().T
        g_std = self._filter_and_standardize_cols(g)

        # full rank
        k = (g_std @ g_std.T) * (n / m)
        s, u = np.linalg.eigh(k)
        p = u.T
        model = LinearMixedModel(p @ y, p @ x, s)
        model.fit()

        assert np.isclose(model.h_sq, h2_fastlmm)

        h2_std_error = 0.17409641  # hard-coded value, checked against the likelihood plot
        assert np.isclose(model.h_sq_standard_error, h2_std_error)

        h_sq_norm_lkhd = model.h_sq_normalized_lkhd()[1:-1]
        argmax = int(100 * h2_fastlmm)
        assert argmax <= np.argmax(h_sq_norm_lkhd) + 1 <= argmax + 1
        assert np.isclose(np.sum(h_sq_norm_lkhd), 1.0)

        # low rank
        ld = g_std.T @ g_std
        sl, v = np.linalg.eigh(ld)
        n_eigenvectors = int(np.sum(sl > 1e-10))
        assert n_eigenvectors < n
        sl = sl[-n_eigenvectors:]
        v = v[:, -n_eigenvectors:]
        s = sl * (n / m)
        p = (g_std @ (v / np.sqrt(sl))).T
        model = LinearMixedModel(p @ y, p @ x, s, y, x)
        model.fit()

        assert np.isclose(model.h_sq, h2_fastlmm)
        assert np.isclose(model.h_sq_standard_error, h2_std_error)
Example #5
    def test_linear_mixed_model_full_rank(self):
        seed = 0
        n_populations = 8
        fst = n_populations * [.9]
        n_samples = 200
        n_variants = 500
        n_orig_markers = 500
        n_culprits = 20
        n_covariates = 3
        sigma_sq = 1
        tau_sq = 1

        from numpy.random import RandomState
        prng = RandomState(seed)

        x = np.hstack((np.ones(shape=(n_samples, 1)),
                       prng.normal(size=(n_samples, n_covariates - 1))))

        mt = hl.balding_nichols_model(n_populations=n_populations,
                                      n_samples=n_samples,
                                      n_variants=n_variants,
                                      fst=fst,
                                      seed=seed)

        pa_t_path = utils.new_temp_file(suffix='bm')

        a = BlockMatrix.from_entry_expr(mt.GT.n_alt_alleles()).T.to_numpy()
        g = a[:, -n_orig_markers:]
        g_std = self._filter_and_standardize_cols(g)

        n_markers = g_std.shape[1]

        k = (g_std @ g_std.T) * n_samples / n_markers

        beta = np.arange(n_covariates)
        beta_stars = np.array([1] * n_culprits)

        y = prng.multivariate_normal(
            np.hstack((a[:, 0:n_culprits], x)) @ np.hstack((beta_stars, beta)),
            sigma_sq * k + tau_sq * np.eye(n_samples))

        s, u = np.linalg.eigh(k)
        p = u.T
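        # full-rank path: u is a complete orthonormal eigenbasis of k, so p is
        # square and the model rank r equals n_samples (asserted below).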

        # build and fit model
        py = p @ y
        px = p @ x
        pa = p @ a

        model = LinearMixedModel(py, px, s)
        assert model.n == n_samples
        assert model.f == n_covariates
        assert model.r == n_samples
        assert not model.low_rank

        model.fit()

        # check that effect sizes tend to be near 1 for the first n_culprits alternative models
        BlockMatrix.from_numpy(pa).T.write(pa_t_path, force_row_major=True)
        df_lmm = model.fit_alternatives(pa_t_path).to_pandas()

        assert 0.9 < np.mean(df_lmm['beta'][:n_culprits]) < 1.1

        # compare NumPy and Hail LMM per alternative
        df_numpy = model.fit_alternatives_numpy(pa, a).to_pandas()

        na_numpy = df_numpy.isna().any(axis=1)
        na_lmm = df_lmm.isna().any(axis=1)

        assert na_numpy.sum() <= 20
        assert na_lmm.sum() <= 20
        assert np.logical_xor(na_numpy, na_lmm).sum() <= 10

        mask = ~(na_numpy | na_lmm)

        lmm_vs_numpy_p_value = np.sort(
            np.abs(df_lmm['p_value'][mask] - df_numpy['p_value'][mask]))

        assert lmm_vs_numpy_p_value[10] < 1e-12  # the smallest differences are essentially zero
        assert lmm_vs_numpy_p_value[-1] < 1e-8   # all differences are small
Example #6
    def test_linear_mixed_model_fastlmm(self):
        # FastLMM test data is from all.bed, all.bim, all.fam, cov.txt, pheno_10_causals.txt:
        #   https://github.com/MicrosoftGenomics/FaST-LMM/tree/master/tests/datasets/synth
        #
        # Data is filtered to chromosomes 1 and 3 and samples 0-124 and 375-499
        # (2000 variants and 250 samples).
        #
        # Results are computed with single_snp (with LOCO) as in:
        #   https://github.com/MicrosoftGenomics/FaST-LMM/blob/master/doc/ipynb/FaST-LMM.ipynb

        n, m = 250, 1000  # per chromosome

        x_table = hl.import_table(resource('fastlmmCov.txt'), no_header=True, impute=True).key_by('f1')
        y_table = hl.import_table(resource('fastlmmPheno.txt'), no_header=True, impute=True, delimiter=' ').key_by('f1')

        mt = hl.import_plink(bed=resource('fastlmmTest.bed'),
                             bim=resource('fastlmmTest.bim'),
                             fam=resource('fastlmmTest.fam'),
                             reference_genome=None)
        mt = mt.annotate_cols(x=x_table[mt.col_key].f2)
        mt = mt.annotate_cols(y=y_table[mt.col_key].f2).cache()

        x = np.array([np.ones(n), mt.key_cols_by()['x'].collect()]).T
        y = np.array(mt.key_cols_by()['y'].collect())

        mt_chr1 = mt.filter_rows(mt.locus.contig == '1')
        mt_chr3 = mt.filter_rows(mt.locus.contig == '3')

        # testing chrom 1 for h2, betas, p-values
        h2_fastlmm = 0.14276125
        beta_fastlmm = [0.012202061, 0.037718282, -0.033572693, 0.29171541, -0.045644170]

        # FastLMM p-values do not agree to high precision because FastLMM regresses
        # x out of each SNP first and does an F(1, dof) test on (beta / se)^2
        # (equivalently a t-test), whereas Hail does a likelihood ratio test.
        # We verify below that Hail's p-values remain fixed going forward.
        # fastlmm = [0.84650294, 0.57865098, 0.59050998, 1.6649473e-06, 0.46892059]
        pval_hail = [0.84543084, 0.57596760, 0.58788517, 1.4057279e-06, 0.46578204]

        gamma_fastlmm = h2_fastlmm / (1 - h2_fastlmm)

        g = BlockMatrix.from_entry_expr(mt_chr1.GT.n_alt_alleles()).to_numpy().T
        g_std = self._filter_and_standardize_cols(g)

        # full rank
        k = (g_std @ g_std.T) * (n / m)
        s, u = np.linalg.eigh(k)
        p = u.T
        model = LinearMixedModel(p @ y, p @ x, s)
        model.fit()

        assert np.isclose(model.h_sq, h2_fastlmm)

        h2_std_error = 0.13770773  # hard-coded value, checked against the likelihood plot
        assert np.isclose(model.h_sq_standard_error, h2_std_error)

        h_sq_norm_lkhd = model.h_sq_normalized_lkhd()[1:-1]
        argmax = int(100 * h2_fastlmm)
        assert argmax <= np.argmax(h_sq_norm_lkhd) + 1 <= argmax + 1
        assert np.isclose(np.sum(h_sq_norm_lkhd), 1.0)

        mt3_chr3_5var = mt_chr3.filter_rows(mt_chr3.locus.position < 2005)  # first 5
        a = BlockMatrix.from_entry_expr(mt3_chr3_5var.GT.n_alt_alleles()).to_numpy().T

        # FastLMM standardizes each variant to have mean 0 and variance 1.
        a = self._filter_and_standardize_cols(a) * np.sqrt(n)
        pa = p @ a

        model.fit(log_gamma=np.log(gamma_fastlmm))

        res = model.fit_alternatives_numpy(pa, return_pandas=True)

        assert np.allclose(res['beta'], beta_fastlmm)
        assert np.allclose(res['p_value'], pval_hail)

        pa_t_path = utils.new_temp_file(suffix='bm')
        BlockMatrix.from_numpy(pa.T).write(pa_t_path, force_row_major=True)

        res = model.fit_alternatives(pa_t_path).to_pandas()

        assert np.allclose(res['beta'], beta_fastlmm)
        assert np.allclose(res['p_value'], pval_hail)

        # low rank
        ld = g_std.T @ g_std
        sl, v = np.linalg.eigh(ld)
        n_eigenvectors = int(np.sum(sl > 1e-10))
        assert n_eigenvectors < n
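        # standardization presumably mean-centers the columns of g_std, so the
        # Gram matrix has rank at most n - 1, satisfying the low-rank premise.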
        sl = sl[-n_eigenvectors:]
        v = v[:, -n_eigenvectors:]
        s = sl * (n / m)
        p = (g_std @ (v / np.sqrt(sl))).T
        model = LinearMixedModel(p @ y, p @ x, s, y, x)
        model.fit()

        assert np.isclose(model.h_sq, h2_fastlmm)
        assert np.isclose(model.h_sq_standard_error, h2_std_error)

        model.fit(log_gamma=np.log(gamma_fastlmm))

        pa = p @ a
        res = model.fit_alternatives_numpy(pa, a, return_pandas=True)

        assert np.allclose(res['beta'], beta_fastlmm)
        assert np.allclose(res['p_value'], pval_hail)

        a_t_path = utils.new_temp_file(suffix='bm')
        BlockMatrix.from_numpy(a.T).write(a_t_path, force_row_major=True)

        pa_t_path = utils.new_temp_file(suffix='bm')
        BlockMatrix.from_numpy(pa.T).write(pa_t_path, force_row_major=True)

        res = model.fit_alternatives(pa_t_path, a_t_path).to_pandas()

        assert np.allclose(res['beta'], beta_fastlmm)
        assert np.allclose(res['p_value'], pval_hail)

        # testing chrom 3 for h2
        h2_fastlmm = 0.36733240

        g = BlockMatrix.from_entry_expr(mt_chr3.GT.n_alt_alleles()).to_numpy().T
        g_std = self._filter_and_standardize_cols(g)

        # full rank
        k = (g_std @ g_std.T) * (n / m)
        s, u = np.linalg.eigh(k)
        p = u.T
        model = LinearMixedModel(p @ y, p @ x, s)
        model.fit()

        assert np.isclose(model.h_sq, h2_fastlmm)

        h2_std_error = 0.17409641  # hard-coded value, checked against the likelihood plot
        assert np.isclose(model.h_sq_standard_error, h2_std_error)

        h_sq_norm_lkhd = model.h_sq_normalized_lkhd()[1:-1]
        argmax = int(100 * h2_fastlmm)
        assert argmax <= np.argmax(h_sq_norm_lkhd) + 1 <= argmax + 1
        assert np.isclose(np.sum(h_sq_norm_lkhd), 1.0)

        # low rank
        ld = g_std.T @ g_std
        sl, v = np.linalg.eigh(ld)
        n_eigenvectors = int(np.sum(sl > 1e-10))
        assert n_eigenvectors < n
        sl = sl[-n_eigenvectors:]
        v = v[:, -n_eigenvectors:]
        s = sl * (n / m)
        p = (g_std @ (v / np.sqrt(sl))).T
        model = LinearMixedModel(p @ y, p @ x, s, y, x)
        model.fit()

        assert np.isclose(model.h_sq, h2_fastlmm)
        assert np.isclose(model.h_sq_standard_error, h2_std_error)
Example #7
    def test_linear_mixed_model_math(self):
        gamma = 2.0  # testing at fixed value of gamma
        n, f, m = 4, 2, 3
        y = np.array([0.0, 1.0, 8.0, 9.0])
        x = np.array([[1.0, 0.0],
                      [1.0, 2.0],
                      [1.0, 1.0],
                      [1.0, 4.0]])
        z = np.array([[0.0, 0.0, 1.0],
                      [0.0, 1.0, 2.0],
                      [1.0, 2.0, 4.0],
                      [2.0, 4.0, 8.0]])
        k = z @ z.T
        v = k + np.eye(4) / gamma
        v_inv = np.linalg.inv(v)

        beta = np.linalg.solve(x.T @ v_inv @ x, x.T @ v_inv @ y)
        residual = y - x @ beta
        sigma_sq = 1 / (n - f) * (residual @ v_inv @ residual)
        sv = sigma_sq * v
        neg_log_lkhd = 0.5 * (np.linalg.slogdet(sv)[1] + np.linalg.slogdet(x.T @ np.linalg.inv(sv) @ x)[1])  # plus C

        x_star = np.array([1.0, 0.0, 1.0, 0.0])
        a = x_star.reshape(n, 1)
        x1 = np.hstack([a, x])
        beta1 = np.linalg.solve(x1.T @ v_inv @ x1, x1.T @ v_inv @ y)
        residual1 = y - x1 @ beta1
        chi_sq = n * np.log((residual @ v_inv @ residual) / (residual1 @ v_inv @ residual1))

        # test from_kinship, full-rank fit
        model, p = LinearMixedModel.from_kinship(y, x, k)
        s0, u0 = np.linalg.eigh(k)
        s0 = np.flip(s0, axis=0)
        p0 = np.fliplr(u0).T
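        # np.linalg.eigh returns eigenvalues in ascending order; flipping s0
        # and u0 matches the descending-order convention of from_kinship.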
        self.assertTrue(model._same(LinearMixedModel(p0 @ y, p0 @ x, s0)))

        model.fit(np.log(gamma))
        self.assertTrue(np.allclose(model.beta, beta))
        self.assertAlmostEqual(model.sigma_sq, sigma_sq)
        self.assertAlmostEqual(model.compute_neg_log_reml(np.log(gamma)), neg_log_lkhd)

        # test full-rank alternative
        pa = p @ a
        stats = model.fit_alternatives_numpy(pa).collect()[0]
        self.assertAlmostEqual(stats.beta, beta1[0])
        self.assertAlmostEqual(stats.chi_sq, chi_sq)

        pa_t_path = utils.new_temp_file()
        BlockMatrix.from_numpy(pa.T).write(pa_t_path, force_row_major=True)
        stats = model.fit_alternatives(pa_t_path).collect()[0]
        self.assertAlmostEqual(stats.beta, beta1[0])
        self.assertAlmostEqual(stats.chi_sq, chi_sq)

        # test from_random_effects, low-rank fit
        s0, p0 = s0[:m], p0[:m, :]
        # test BlockMatrix path
        temp_path = utils.new_temp_file()
        model, _ = LinearMixedModel.from_random_effects(y, x, 
                                                        BlockMatrix.from_numpy(z),
                                                        p_path=temp_path,
                                                        complexity_bound=0)
        lmm = LinearMixedModel(p0 @ y, p0 @ x, s0, y, x, p_path=temp_path)
        self.assertTrue(model._same(lmm))
        # test ndarray path
        model, p = LinearMixedModel.from_random_effects(y, x, z)
        lmm = LinearMixedModel(p0 @ y, p0 @ x, s0, y, x)
        self.assertTrue(model._same(lmm))

        model.fit(np.log(gamma))
        self.assertTrue(np.allclose(model.beta, beta))
        self.assertAlmostEqual(model.sigma_sq, sigma_sq)
        self.assertAlmostEqual(model.compute_neg_log_reml(np.log(gamma)), neg_log_lkhd)

        # test low-rank alternative
        pa = p @ a
        stats = model.fit_alternatives_numpy(pa, a).collect()[0]
        self.assertAlmostEqual(stats.beta, beta1[0])
        self.assertAlmostEqual(stats.chi_sq, chi_sq)

        a_t_path = utils.new_temp_file()
        BlockMatrix.from_numpy(a.T).write(a_t_path, force_row_major=True)
        pa_t_path = utils.new_temp_file()
        BlockMatrix.from_numpy(pa.T).write(pa_t_path, force_row_major=True)
        stats = model.fit_alternatives(pa_t_path, a_t_path).collect()[0]
        self.assertAlmostEqual(stats.beta, beta1[0])
        self.assertAlmostEqual(stats.chi_sq, chi_sq)
Example #8
    def test_linear_mixed_model_function(self):
        n, f, m = 4, 2, 3
        y = np.array([0.0, 1.0, 8.0, 9.0])
        x = np.array([[1.0, 0.0], [1.0, 2.0], [1.0, 1.0], [1.0, 4.0]])
        z = np.array([[0.0, 0.0, 1.0], [0.0, 1.0, 2.0], [1.0, 2.0, 0.0],
                      [2.0, 0.0, 1.0]])

        p_path = utils.new_temp_file()

        def make_call(gt):
            if gt == 0.0:
                return hl.Call([0, 0])
            if gt == 1.0:
                return hl.Call([0, 1])
            if gt == 2.0:
                return hl.Call([1, 1])

        data = [{
            'v': j,
            's': i,
            'y': y[i],
            'x1': x[i, 1],
            'zt': make_call(z[i, j])
        } for i in range(n) for j in range(m)]
        ht = hl.Table.parallelize(
            data,
            hl.dtype(
                'struct{v: int32, s: int32, y: float64, x1: float64, zt: tcall}'
            ))
        mt = ht.to_matrix_table(row_key=['v'],
                                col_key=['s'],
                                col_fields=['x1', 'y'])
        colsort = np.argsort(mt.key_cols_by().s.collect()).tolist()
        mt = mt.choose_cols(colsort)

        rrm = hl.realized_relationship_matrix(mt.zt).to_numpy()

        # kinship path agrees with from_kinship
        model, p = hl.linear_mixed_model(mt.y, [1, mt.x1],
                                         k=rrm,
                                         p_path=p_path,
                                         overwrite=True)
        model0, p0 = LinearMixedModel.from_kinship(y,
                                                   x,
                                                   rrm,
                                                   p_path,
                                                   overwrite=True)
        assert model0._same(model)
        assert np.allclose(p0, p)

        # random effects path with standardize=True agrees with low-rank rrm
        s0, u0 = np.linalg.eigh(rrm)
        s0 = np.flip(s0, axis=0)[:m]
        p0 = np.fliplr(u0).T[:m, :]
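        # keep only the top m eigenpairs: z has m columns, so the rrm has rank
        # at most m and the remaining eigenvalues are numerically zero.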
        model, p = hl.linear_mixed_model(mt.y, [1, mt.x1],
                                         z_t=mt.zt.n_alt_alleles(),
                                         p_path=p_path,
                                         overwrite=True)
        model0 = LinearMixedModel(p0 @ y, p0 @ x, s0, y, x, p_path=p_path)
        assert model0._same(model)

        # random effects path with standardize=False agrees with from_random_effects
        model0, p0 = LinearMixedModel.from_random_effects(y,
                                                          x,
                                                          z,
                                                          p_path,
                                                          overwrite=True)
        model, p = hl.linear_mixed_model(mt.y, [1, mt.x1],
                                         z_t=mt.zt.n_alt_alleles(),
                                         p_path=p_path,
                                         overwrite=True,
                                         standardize=False)
        assert model0._same(model)
        assert np.allclose(p0, p.to_numpy())
Example #9
    def test_linear_mixed_model_math(self):
        gamma = 2.0  # testing at fixed value of gamma
        n, f, m = 4, 2, 3
        y = np.array([0.0, 1.0, 8.0, 9.0])
        x = np.array([[1.0, 0.0], [1.0, 2.0], [1.0, 1.0], [1.0, 4.0]])
        z = np.array([[0.0, 0.0, 1.0], [0.0, 1.0, 2.0], [1.0, 2.0, 4.0],
                      [2.0, 4.0, 8.0]])
        k = z @ z.T
        v = k + np.eye(4) / gamma
        v_inv = np.linalg.inv(v)

        beta = np.linalg.solve(x.T @ v_inv @ x, x.T @ v_inv @ y)
        residual = y - x @ beta
        sigma_sq = 1 / (n - f) * (residual @ v_inv @ residual)
        sv = sigma_sq * v
        neg_log_lkhd = 0.5 * (np.linalg.slogdet(sv)[1] + np.linalg.slogdet(
            x.T @ np.linalg.inv(sv) @ x)[1])  # plus C

        x_star = np.array([1.0, 0.0, 1.0, 0.0])
        a = x_star.reshape(n, 1)
        x1 = np.hstack([a, x])
        beta1 = np.linalg.solve(x1.T @ v_inv @ x1, x1.T @ v_inv @ y)
        residual1 = y - x1 @ beta1
        chi_sq = n * np.log(
            (residual @ v_inv @ residual) / (residual1 @ v_inv @ residual1))

        # test from_kinship, full-rank fit
        model, p = LinearMixedModel.from_kinship(y, x, k)
        s0, u0 = np.linalg.eigh(k)
        s0 = np.flip(s0, axis=0)
        p0 = np.fliplr(u0).T
        self.assertTrue(model._same(LinearMixedModel(p0 @ y, p0 @ x, s0)))

        model.fit(np.log(gamma))
        self.assertTrue(np.allclose(model.beta, beta))
        self.assertAlmostEqual(model.sigma_sq, sigma_sq)
        self.assertAlmostEqual(model.compute_neg_log_reml(np.log(gamma)),
                               neg_log_lkhd)

        # test full-rank alternative
        pa = p @ a
        stats = model.fit_alternatives_numpy(pa).collect()[0]
        self.assertAlmostEqual(stats.beta, beta1[0])
        self.assertAlmostEqual(stats.chi_sq, chi_sq)

        pa_t_path = utils.new_temp_file()
        BlockMatrix.from_numpy(pa.T).write(pa_t_path, force_row_major=True)
        stats = model.fit_alternatives(pa_t_path).collect()[0]
        self.assertAlmostEqual(stats.beta, beta1[0])
        self.assertAlmostEqual(stats.chi_sq, chi_sq)

        # test from_random_effects, low-rank fit
        s0, p0 = s0[:m], p0[:m, :]
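        # truncate to the top m eigenpairs: k = z @ z.T has rank at most m, so
        # the low-rank model needs only the leading m components.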
        # test BlockMatrix path
        temp_path = utils.new_temp_file()
        model, _ = LinearMixedModel.from_random_effects(
            y,
            x,
            BlockMatrix.from_numpy(z),
            p_path=temp_path,
            complexity_bound=0)
        lmm = LinearMixedModel(p0 @ y, p0 @ x, s0, y, x, p_path=temp_path)
        self.assertTrue(model._same(lmm))
        # test ndarray path
        model, p = LinearMixedModel.from_random_effects(y, x, z)
        lmm = LinearMixedModel(p0 @ y, p0 @ x, s0, y, x)
        self.assertTrue(model._same(lmm))

        model.fit(np.log(gamma))
        self.assertTrue(np.allclose(model.beta, beta))
        self.assertAlmostEqual(model.sigma_sq, sigma_sq)
        self.assertAlmostEqual(model.compute_neg_log_reml(np.log(gamma)),
                               neg_log_lkhd)

        # test low-rank alternative
        pa = p @ a
        stats = model.fit_alternatives_numpy(pa, a).collect()[0]
        self.assertAlmostEqual(stats.beta, beta1[0])
        self.assertAlmostEqual(stats.chi_sq, chi_sq)

        a_t_path = utils.new_temp_file()
        BlockMatrix.from_numpy(a.T).write(a_t_path, force_row_major=True)
        pa_t_path = utils.new_temp_file()
        BlockMatrix.from_numpy(pa.T).write(pa_t_path, force_row_major=True)
        stats = model.fit_alternatives(pa_t_path, a_t_path).collect()[0]
        self.assertAlmostEqual(stats.beta, beta1[0])
        self.assertAlmostEqual(stats.chi_sq, chi_sq)