def define_gp(Y, Xr, F, type, Rr):
    from limix_core.covar import LowRankCov
    from limix_core.covar import FixedCov
    from limix_core.covar import FreeFormCov
    from limix_core.gp import GP2KronSumLR
    from limix_core.gp import GP2KronSum

    P = Y.shape[1]
    _A = sp.eye(P)
    # define the region trait covariance for the requested model
    if type in ['null', 'rank1']:
        _Cr = LowRankCov(P, 1)
    elif type == 'block':
        _Cr = FixedCov(sp.ones((P, P)))
    elif type == 'full':
        _Cr = FreeFormCov(P)
    else:
        raise ValueError('type %s is not supported' % type)
    _Cn = FreeFormCov(P)
    if type == 'null':
        # null model: fix the region covariance to (almost) zero and deactivate it
        _gp = GP2KronSumLR(Y=Y, G=sp.ones((Y.shape[0], 1)), F=F, A=_A, Cr=_Cr, Cn=_Cn)
        _Cr.setParams(1e-9 * sp.ones(P))
        _gp.covar.act_Cr = False
    else:
        if Xr.shape[1] < Xr.shape[0]:
            # fewer variants than samples: use the low-rank formulation
            _gp = GP2KronSumLR(Y=Y, G=Xr, F=F, A=_A, Cr=_Cr, Cn=_Cn)
        else:
            _gp = GP2KronSum(Y=Y, F=F, A=_A, R=Rr, Cg=_Cr, Cn=_Cn)
    return _gp
def setUp(self):
    np.random.seed(1)

    # define phenotype
    N = 200
    P = 2
    self.Y = sp.randn(N, P)

    # define fixed effects
    self.F = []
    self.A = []
    self.F.append(1. * (sp.rand(N, 2) < 0.5))
    self.A.append(sp.eye(P))

    # define row covariance
    f = 10
    X = 1. * (sp.rand(N, f) < 0.2)
    self.R = covar_rescale(sp.dot(X, X.T))
    self.R += 1e-4 * sp.eye(N)

    # define col covariances
    self.Cg = FreeFormCov(P)
    self.Cn = FreeFormCov(P)
    self.Cg.setCovariance(0.5 * sp.cov(self.Y.T))
    self.Cn.setCovariance(0.5 * sp.cov(self.Y.T))

    # define gp
    self.gp = GP2KronSum(Y=self.Y, F=self.F, A=self.A, Cg=self.Cg, Cn=self.Cn, R=self.R)
def _initGP(self):
    """ Internal method for initialization of the GP inference object """
    from limix.util.util_functions import vec
    from limix_core.mean import MeanKronSum
    from limix_core.gp import GP2KronSum
    from limix_core.covar import SumCov

    if self._inference == 'GP2KronSum':
        # position of the (single) signal term, i.e. the non-noise random effect
        signalPos = sp.where(sp.arange(self.n_randEffs) != self.noisPos)[0][0]
        gp = GP2KronSum(Y=self.Y,
                        F=self.sample_designs,
                        A=self.trait_designs,
                        Cg=self.trait_covars[signalPos],
                        Cn=self.trait_covars[self.noisPos],
                        R=self.sample_covars[signalPos])
    else:
        mean = MeanKronSum(self.Y, self.sample_designs, self.trait_designs)
        Iok = vec(~sp.isnan(mean.Y))[:, 0]
        if Iok.all():
            Iok = None
        covar = SumCov(*[
            KronCov(self.trait_covars[i], self.sample_covars[i], Iok=Iok)
            for i in range(self.n_randEffs)
        ])
        gp = GP(covar=covar, mean=mean)
    self.gp = gp
def define_gp(Y, Xr, Sg, Ug, type):
    from limix_core.covar import LowRankCov
    from limix_core.covar import FixedCov
    from limix_core.covar import FreeFormCov
    from limix_core.gp import GP3KronSumLR
    from limix_core.gp import GP2KronSum

    P = Y.shape[1]
    _A = sp.eye(P)
    # define the region trait covariance for the requested model
    if type == 'rank1':
        _Cr = LowRankCov(P, 1)
    elif type == 'block':
        _Cr = FixedCov(sp.ones((P, P)))
    elif type == 'full':
        _Cr = FreeFormCov(P)
    elif type == 'null':
        # no region term is needed in the null model
        pass
    else:
        raise ValueError('type %s is not supported' % type)
    _Cn = FreeFormCov(P)
    _Cg = FreeFormCov(P)
    if type == 'null':
        _gp = GP2KronSum(Y=Y, Cg=_Cg, Cn=_Cn, S_R=Sg, U_R=Ug)
    else:
        _gp = GP3KronSumLR(Y=Y, G=Xr, Cr=_Cr, Cg=_Cg, Cn=_Cn, S_R=Sg, U_R=Ug)
    return _gp
def test_mtmm_scan_pv_beta():
    import scipy as sp
    import scipy.linalg as la
    from limix_core.gp import GP2KronSum
    from limix_core.covar import FreeFormCov

    N = 200
    P = 4
    M = 2
    K = 2
    S = 10
    Y, F, G, B0, Cg0, Cn0 = _generate_data(N, P, K, S)
    A = sp.eye(P)
    Asnp = sp.rand(P, M)

    # compute eigenvalue decomp of RRM
    R = sp.dot(G, G.T)
    R /= R.diagonal().mean()
    R += 1e-4 * sp.eye(R.shape[0])
    Sr, Ur = la.eigh(R)

    # fit null model
    Cg = FreeFormCov(Y.shape[1])
    Cn = FreeFormCov(Y.shape[1])
    gp = GP2KronSum(Y=Y, S_R=Sr, U_R=Ur, Cg=Cg, Cn=Cn, F=F, A=sp.eye(P))
    gp.covar.Cg.setCovariance(0.5 * sp.cov(Y.T))
    gp.covar.Cn.setCovariance(0.5 * sp.cov(Y.T))
    gp.optimize(factr=10)

    # run MTLMM
    from limix_lmm import MTLMM
    mtlmm = MTLMM(Y, F=F, A=A, Asnp=Asnp, covar=gp.covar)
    pv, B = mtlmm.process(G)

    # run standard LMMCore
    from limix_lmm.lmm_core import LMMCore
    y = sp.reshape(Y, [Y.size, 1], order="F")
    covs = sp.kron(A, F)
    Aext = sp.kron(Asnp, sp.ones((G.shape[0], 1)))
    Gext = sp.kron(sp.ones((Asnp.shape[0], 1)), G)
    Wext = sp.einsum("ip,in->inp", Aext, Gext).reshape(Aext.shape[0], -1)
    stlmm = LMMCore(y, covs, Ki_dot=gp.covar.solve)
    stlmm.process(Wext, step=Asnp.shape[1])
    pv0 = stlmm.getPv()
    B0 = stlmm.getBetaSNP()

    assert_allclose(pv0, pv, rtol=1e-06, atol=1e-06)
    assert_allclose(B0, B, rtol=1e-06, atol=1e-06)
def setUp(self):
    np.random.seed(1)

    # define phenotype
    N = 10
    P = 3
    Y = sp.randn(N, P)

    # pheno with missing data
    Ym = Y.copy()
    Im = sp.rand(N, P) < 0.2
    Ym[Im] = sp.nan

    # define fixed effects
    F = []
    A = []
    F.append(1. * (sp.rand(N, 2) < 0.5))
    A.append(sp.eye(P))
    mean = MeanKronSum(Y, F=F, A=A)
    mean_m = MeanKronSum(Ym, F=F, A=A)

    # define row covariance
    f = 10
    X = 1. * (sp.rand(N, f) < 0.2)
    R = covar_rescale(sp.dot(X, X.T))
    R += 1e-4 * sp.eye(N)

    # define col covariances
    Cg = FreeFormCov(P)
    Cn = FreeFormCov(P)
    Cg.setRandomParams()
    Cn.setRandomParams()

    # define covariance matrices
    covar1 = KronCov(Cg, R)
    covar2 = KronCov(Cn, sp.eye(N))
    covar = SumCov(covar1, covar2)

    # define covariance matrices with missing data
    Iok = (~Im).reshape(N * P, order='F')
    covar1_m = KronCov(copy.copy(Cg), R, Iok=Iok)
    covar2_m = KronCov(copy.copy(Cn), sp.eye(N), Iok=Iok)
    covar_m = SumCov(covar1_m, covar2_m)

    # define gp
    self._gp = GP(covar=covar, mean=mean)
    self._gpm = GP(covar=covar_m, mean=mean_m)
    self._gp2ks = GP2KronSum(Y=Y, F=F, A=A, Cg=Cg, Cn=Cn, R=R)
N = 1000
P = 4
K = 2
S = 500
Y, F, G, B0, Cg0, Cn0 = generate_data(N, P, K, S)

# compute eigenvalue decomp of RRM
R = sp.dot(G, G.T)
R /= R.diagonal().mean()
R += 1e-4 * sp.eye(R.shape[0])
Sr, Ur = la.eigh(R)

# fit null model
Cg = FreeFormCov(Y.shape[1])
Cn = FreeFormCov(Y.shape[1])
gp = GP2KronSum(Y=Y, S_R=Sr, U_R=Ur, Cg=Cg, Cn=Cn, F=F, A=sp.eye(P))
gp.covar.Cg.setCovariance(0.5 * sp.cov(Y.T))
gp.covar.Cn.setCovariance(0.5 * sp.cov(Y.T))
gp.optimize(factr=10)

import pdb
pdb.set_trace()

# run MTLMM
from limix_lmm.lmm_core import MTLMM
mtlmm = MTLMM(Y, F=F, A=sp.eye(P), Asnp=sp.eye(P), covar=gp.covar)
pv, B = mtlmm.process(G)
def fitNull(
    self,
    cache=False,
    out_dir="./cache",
    fname=None,
    rewrite=False,
    seed=None,
    factr=1e3,
    n_times=10,
    init_method=None,
    verbose=False,
):
    r"""
    Fit null model

    Args:
        cache (bool, optional):
            If False (default), the null model is fitted and the results
            are not cached. If True, the cache is activated.
            The cache file dir and name can be specified using ``out_dir``
            and ``fname``.
            When ``cache=True``, we distinguish the following cases:

            - if the specified file does not exist, the output of the
              null model fitting is cached in the file.
            - if the specified file exists and ``rewrite=True``,
              the cache file is overwritten.
            - if the specified file exists and ``rewrite=False``,
              the results from the cache file are imported
              (the null model is not re-fitted).

        out_dir (str, optional):
            output dir of the cache file.
            The default value is "./cache".
        fname (str, optional):
            Name of the cache hdf5 file.
            It must be specified if ``cache=True``.
        rewrite (bool, optional):
            It has effect only if ``cache=True``.
            In this case, if ``True``, the cache file is overwritten
            in case it exists.
            The default value is ``False``.
        factr (float, optional):
            optimization parameter that determines the accuracy of the
            solution. By default it is 1000.
            (see scipy.optimize.fmin_l_bfgs_b for more details).
        verbose (bool, optional):
            verbose flag.

    Returns:
        (dict): dictionary containing:

        - **B** (*ndarray*): estimated effect sizes (null);
        - **Cg** (*ndarray*): estimated relatedness trait covariance (null);
        - **Cn** (*ndarray*): estimated genetic noise covariance (null);
        - **conv** (*bool*): convergence indicator;
        - **NLL0** (*ndarray*): negative loglikelihood (NLL) of the null model;
        - **LMLgrad** (*ndarray*): norm of the gradient of the NLL;
        - **time** (*time*): elapsed time (in seconds).
""" from limix_core.gp import GP2KronSum from limix_core.gp import GP2KronSumLR from limix_core.gp import GP3KronSumLR from limix_core.covar import FreeFormCov if seed is not None: sp.random.seed(seed) read_from_file = False if cache: assert fname is not None, "MultiTraitSetTest:: specify fname" if not os.path.exists(out_dir): os.makedirs(out_dir) out_file = os.path.join(out_dir, fname) read_from_file = os.path.exists(out_file) and not rewrite RV = {} if read_from_file: f = h5py.File(out_file, "r") for key in list(f.keys()): RV[key] = f[key][:] f.close() self.setNull(RV) else: start = TIME.time() if self.bgRE: self._gpNull = GP2KronSum( Y=self.Y, F=None, A=None, Cg=self.Cg, Cn=self.Cn, R=None, S_R=self.S_R, U_R=self.U_R, ) else: self._gpNull = GP2KronSumLR(self.Y, self.Cn, G=sp.ones((self.N, 1)), F=self.F, A=self.A) # freezes Cg to 0 n_params = self._gpNull.covar.Cr.getNumberParams() self._gpNull.covar.Cr.setParams(1e-9 * sp.ones(n_params)) self._gpNull.covar.act_Cr = False for i in range(n_times): params0 = self._initParams(init_method=init_method) self._gpNull.setParams(params0) conv, info = self._gpNull.optimize(verbose=verbose, factr=factr) if conv: break if not conv: warnings.warn("not converged") LMLgrad = (self._gpNull.LML_grad()["covar"]**2).mean() LML = self._gpNull.LML() if self._gpNull.mean.n_terms == 1: RV["B"] = self._gpNull.mean.B[0] elif self._gpNull.mean.n_terms > 1: warning.warn("generalize to more than 1 fixed effect term") if self.bgRE: RV["params0_g"] = self.Cg.getParams() else: RV["params0_g"] = sp.zeros_like(self.Cn.getParams()) RV["params0_n"] = self.Cn.getParams() if self.bgRE: RV["Cg"] = self.Cg.K() else: RV["Cg"] = sp.zeros_like(self.Cn.K()) RV["Cn"] = self.Cn.K() RV["conv"] = sp.array([conv]) RV["time"] = sp.array([TIME.time() - start]) RV["NLL0"] = sp.array([LML]) RV["LMLgrad"] = sp.array([LMLgrad]) RV["nit"] = sp.array([info["nit"]]) RV["funcalls"] = sp.array([info["funcalls"]]) self.null = RV from limix.util.util_functions import smartDumpDictHdf5 if cache: f = h5py.File(out_file, "w") smartDumpDictHdf5(RV, f) f.close() return RV
def run_lmm(reader,
            pheno,
            R=None,
            S_R=None,
            U_R=None,
            W=None,
            covs=None,
            batch_size=1000,
            unique_variants=False):
    """
    Utility function to run an LMM analysis.

    Parameters
    ----------
    reader : :class:`limix.data.BedReader`
        limix bed reader instance.
    pheno : (`N`, 1) ndarray
        phenotype vector
    R : (`N`, `N`) ndarray
        covariance of the random effect.
        Typically this is the genetic relatedness matrix.
        If set, ``W``, ``S_R`` and ``U_R`` are ignored.
    S_R : (`N`, ) ndarray
        eigenvalues of ``R``. If available together with the eigenvectors
        ``U_R``, they can be provided instead of ``R`` to avoid repeated
        computations. Only used when ``R`` is not set.
        If set, ``U_R`` should also be specified.
    U_R : (`N`, `N`) ndarray
        eigenvectors of ``R``. If available together with the eigenvalues
        ``S_R``, they can be provided instead of ``R`` to avoid repeated
        computations. Only used when ``R`` is not set.
        If set, ``S_R`` should also be specified.
    W : (`N`, `K`) ndarray
        this defines the covariance of a low-rank random effect.
        Setting ``W`` is equivalent to setting ``R = dot(W, W.T)``
        but ``R`` is never computed, to minimize memory usage.
        Only used when ``R``, ``U_R`` and ``S_R`` are not set.
    covs : (`N`, `L`) ndarray
        fixed effect design for `N` samples and `L` covariates.
        If None (default value), an intercept only is considered.
    batch_size : int
        number of variants loaded per batch (i.e. loaded into memory at the
        same time); the analysis is run in batches to minimize memory usage.
    unique_variants : bool
        if True, only non-repeated genotypes are considered.
        The default value is False.

    Returns
    -------
    res : *:class:`pandas.DataFrame`*
        contains pv, effect size, standard error on effect size, and test
        statistics, as well as variant info.
    """
    if covs is None:
        covs = sp.ones((pheno.shape[0], 1))

    # calc S_R, U_R if R is specified
    if R is not None:
        S_R, U_R = la.eigh(R)

    # assert that S_R and U_R are both specified
    S_is = S_R is not None
    U_is = U_R is not None
    if S_is or U_is:
        assert S_is and U_is, 'Both U_R and S_R should be specified'

    # assert semidefinite positiveness
    if S_R is not None:
        if S_R.min() < 1e-4:
            offset = S_R.min() + 1e-4
            S_R += offset
            warnings.warn("Added %.2e jitter to make R a SDP cov" % offset)

    # fit null model
    if R is not None:
        from limix_core.gp import GP2KronSum
        from limix_core.covar import FreeFormCov
        Cg = FreeFormCov(1)
        Cn = FreeFormCov(1)
        gp = GP2KronSum(
            Y=pheno, Cg=Cg, Cn=Cn, F=covs, A=sp.eye(1), S_R=S_R, U_R=U_R)
        Cg.setCovariance(0.5 * sp.ones((1, 1)))
        Cn.setCovariance(0.5 * sp.ones((1, 1)))
        info_opt = gp.optimize(verbose=False)
        covar = gp.covar
    elif W is not None:
        from limix_core.gp import GP2KronSumLR
        from limix_core.covar import FreeFormCov
        gp = GP2KronSumLR(Y=pheno, Cn=FreeFormCov(1), G=W, F=covs, A=sp.eye(1))
        gp.covar.Cr.setCovariance(0.5 * sp.ones((1, 1)))
        gp.covar.Cn.setCovariance(0.5 * sp.ones((1, 1)))
        info_opt = gp.optimize(verbose=False)
        covar = gp.covar
    else:
        covar = None

    # define lmm
    lmm = LMM(pheno, covs, covar)

    n_batches = reader.getSnpInfo().shape[0] / batch_size

    t0 = time.time()

    res = []
    for i, gr in enumerate(GIter(reader, batch_size=batch_size)):
        print('.. batch %d/%d' % (i, n_batches))

        X, _res = gr.getGenotypes(standardize=True, return_snpinfo=True)

        if unique_variants:
            X, idxs = f_univar(X, return_idxs=True)
            Isnp = sp.in1d(sp.arange(_res.shape[0]), idxs)
            _res = _res[Isnp]

        # run lmm
        lmm.process(X)
        pv = lmm.getPv()
        beta = lmm.getBetaSNP()
        beta_ste = lmm.getBetaSNPste()
        lrt = lmm.getLRT()

        # add pvalues, beta, etc to res
        _res = _res.assign(pv=pd.Series(pv, index=_res.index))
        _res = _res.assign(beta=pd.Series(beta, index=_res.index))
        _res = _res.assign(beta_ste=pd.Series(beta_ste, index=_res.index))
        _res = _res.assign(lrt=pd.Series(lrt, index=_res.index))
        res.append(_res)

    res = pd.concat(res)
    res.reset_index(inplace=True, drop=True)

    t = time.time() - t0
    print('%.2f s elapsed' % t)

    return res
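# Hedged usage sketch (not part of the original source): file names are
# placeholders and the BedReader import follows the docstring above, so treat
# both as assumptions.
#
#   from limix.data import BedReader
#   reader = BedReader("genotypes")            # plink bed/bim/fam prefix (placeholder)
#   pheno = sp.loadtxt("pheno.txt")[:, None]   # (N, 1) phenotype vector (placeholder)
#   W = sp.randn(pheno.shape[0], 10)           # low-rank random-effect design
#   res = run_lmm(reader, pheno, W=W, batch_size=500, unique_variants=True)
#   print(res[["pv", "beta", "beta_ste", "lrt"]].head())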
def mt_scan(G, Y, M=None, K=None, Ac=None, Asnps=None, Asnps0=None, verbose=True):
    """
    Wrapper function for multi-trait single-variant association testing
    using variants of the multi-trait linear mixed model.

    Parameters
    ----------
    Y : (`N`, `P`) ndarray
        phenotype data
    Asnps : (`P`, `K`) ndarray
        trait design of snp covariance.
        By default, ``Asnps`` is eye(`P`).
    K : (`N`, `N`) ndarray
        LMM-covariance/genetic relatedness matrix.
        A full-rank multi-trait linear mixed model is fitted on its
        eigenvalue decomposition; the plain multi-trait linear model
        (no ``K``) is currently not supported.
    M : (`N`, `D`) ndarray
        covariate design matrix.
        By default, ``M`` is a (`N`, `1`) array of ones.
    Ac : (`P`, `L`) ndarray
        trait design matrices of the different fixed effect terms.
        By default, ``Ac`` is eye(`P`).
    Asnps0 : (`P`, `K0`) ndarray
        trait design of snp covariance in the null model.
        By default, Asnps0 is not considered
        (i.e., no SNP effect in the null model).
        If specified, then three tests are considered:
        (i) Asnps!=0, (ii) Asnps0!=0, (iii) Asnps!=Asnps0.
    verbose : (bool, optional):
        if True, details such as runtime are displayed.
    """
    from pandas import DataFrame
    from scipy.stats import chi2
    from numpy import eye, cov, asarray
    from scipy.linalg import eigh
    from limix_core.gp import GP2KronSum
    from limix_core.covar import FreeFormCov
    from limix_lmm.mtlmm import MTLMM

    if Ac is None:
        Ac = eye(Y.shape[1])

    with session_block("multi-trait association test", disable=not verbose):

        with session_line("Normalising input... ", disable=not verbose):
            data = conform_dataset(Y, M, G=G, K=K)

        Y = asarray(data["y"])
        M = asarray(data["M"])
        G = asarray(data["G"])
        K = asarray(data["K"])

        # case 1: multi-trait linear model
        if K is None:
            raise ValueError("multi-trait linear model not supported")

        eigh_R = eigh(K)

        # case 2: full-rank multi-trait linear mixed model
        S_R, U_R = eigh_R
        S_R = add_jitter(S_R)
        gp = GP2KronSum(
            Y=Y,
            Cg=FreeFormCov(Y.shape[1]),
            Cn=FreeFormCov(Y.shape[1]),
            S_R=S_R,
            U_R=U_R,
            F=M,
            A=Ac,
        )
        gp.covar.Cg.setCovariance(0.5 * cov(Y.T))
        gp.covar.Cn.setCovariance(0.5 * cov(Y.T))
        gp.optimize(verbose=verbose)

        lmm = MTLMM(Y, F=M, A=Ac, Asnp=Asnps, covar=gp.covar)
        if Asnps0 is not None:
            lmm0 = MTLMM(Y, F=M, A=Ac, Asnp=Asnps0, covar=gp.covar)

        if Asnps0 is None:
            lmm.process(G)
            RV = OrderedDict()
            RV["pv"] = lmm.getPv()
            RV["lrt"] = lmm.getLRT()
        else:
            lmm.process(G)
            lmm0.process(G)

            # compute pv from the difference of the two likelihood-ratio statistics
            lrt1 = lmm.getLRT()
            lrt0 = lmm0.getLRT()
            lrt = lrt1 - lrt0
            pv = chi2(Asnps.shape[1] - Asnps0.shape[1]).sf(lrt)

            RV = OrderedDict()
            RV["pv1"] = lmm.getPv()
            RV["pv0"] = lmm0.getPv()
            RV["pv"] = pv
            RV["lrt1"] = lrt1
            RV["lrt0"] = lrt0
            RV["lrt"] = lrt

    return DataFrame(RV)
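# Hedged usage sketch (not part of the original source): a small demo of
# mt_scan on synthetic data. The helper name _demo_mt_scan is made up, and it
# assumes mt_scan's module-level dependencies (conform_dataset, session_block,
# session_line, add_jitter, OrderedDict) are available as in the original package.
def _demo_mt_scan():
    import numpy as np

    np.random.seed(0)
    N, P, S = 200, 3, 100
    G = 1. * (np.random.rand(N, S) < 0.3)        # synthetic genotypes
    Y = np.random.randn(N, P)                    # synthetic phenotypes
    K = np.dot(G, G.T) / S + 1e-4 * np.eye(N)    # relatedness built from the variants
    # joint test of each variant on all traits (identity trait design)
    return mt_scan(G, Y, K=K, Asnps=np.eye(P), verbose=False)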
def st_iscan(G, y, K=None, M=None, E0=None, E1=None, W_R=None, verbose=True):
    r"""
    Single-variant association interaction testing.

    Parameters
    ----------
    G : (`N`, `S`) ndarray
        genotype/variant data.
    y : (`N`, 1) ndarray
        phenotype data
    K : (`N`, `N`) ndarray
        LMM-covariance/genetic relatedness matrix.
        If not provided, then standard linear regression is considered.
        If the LMM-covariance is low-rank, ``W_R`` can be provided instead.
    M : (`N`, `D`) ndarray
        covariate design matrix.
        By default, ``M`` is a (`N`, `1`) array of ones.
    W_R : (`N`, `R`) ndarray
        If the LMM-covariance is low-rank, one can provide ``W_R`` such that
        ``K`` = dot(``W_R``, transpose(``W_R``)).
    E1 : (`N`, `K`) ndarray
        interaction variables interacting with the snp.
        If specified, then the following tests are considered:
        (i) (E1&E0)-by-g vs no-genotype-effect;
        (ii) E0-by-g vs no-genotype-effect;
        (iii) (E1&E0)-by-g vs E0-by-g.
    E0 : (`N`, `K0`) ndarray
        interaction variables to be included in the alt and null model.
        By default, if E1 is not specified, E0 is ignored.
        By default, if E1 is specified, E0=ones so that E0-by-g=g,
        i.e. an additive genetic effect is considered.
    verbose : (bool, optional):
        if True, details such as runtime are displayed.
    """
    from limix_lmm.lmm import LMM
    from limix_lmm.lmm_core import LMMCore
    from limix_core.gp import GP2KronSum, GP2KronSumLR
    from limix_core.covar import FreeFormCov
    from scipy.linalg import eigh
    from numpy import ones, var, concatenate, asarray

    lmm0 = None
    with session_block("single-trait association test", disable=not verbose):

        # if covs is None:
        #     covs = ones([pheno.shape[0], 1])

        with session_line("Normalising input... ", disable=not verbose):
            data = conform_dataset(y, M, G=G, K=K)

        y = data["y"]
        M = data["M"]
        G = data["G"]
        K = data["K"]

        # case 1: linear model
        # if W_R is None and eigh_R is None and R is None:
        if K is None:
            if verbose:
                print("Model: lm")
            gp = None
            Kiy_fun = None

        # case 2: low-rank linear model
        elif W_R is not None:
            if verbose:
                print("Model: low-rank lmm")
            gp = GP2KronSumLR(Y=y, Cn=FreeFormCov(1), G=W_R, F=M, A=ones((1, 1)))
            gp.covar.Cr.setCovariance(var(y) * ones((1, 1)))
            gp.covar.Cn.setCovariance(var(y) * ones((1, 1)))
            gp.optimize(verbose=verbose)
            Kiy_fun = gp.covar.solve

        # case 3: full-rank linear model
        else:
            if verbose:
                print("Model: lmm")
            # if eigh_R is None:
            eigh_R = eigh(K)
            S_R, U_R = eigh_R
            S_R = add_jitter(S_R)
            gp = GP2KronSum(
                Y=y,
                Cg=FreeFormCov(1),
                Cn=FreeFormCov(1),
                S_R=S_R,
                U_R=U_R,
                F=M,
                A=ones((1, 1)),
            )
            gp.covar.Cg.setCovariance(0.5 * var(y) * ones((1, 1)))
            gp.covar.Cn.setCovariance(0.5 * var(y) * ones((1, 1)))
            gp.optimize(verbose=verbose)
            Kiy_fun = gp.covar.solve

        if E1 is None:
            lmm = LMM(y, M, Kiy_fun)
            E1 = None
            E0 = None
        else:
            lmm = LMMCore(y, M, Kiy_fun)
            if E0 is None:
                E0 = ones([y.shape[0], 1])
            if (E0 == 1).sum():
                lmm0 = LMM(y, M, Kiy_fun)
            else:
                lmm0 = LMMCore(y, M, Kiy_fun)
            E1 = concatenate([E0, E1], 1)

    return _process(lmm, lmm0, asarray(G), E0, E1)
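# Hedged usage sketch (not part of the original source): a small demo of
# st_iscan testing a GxE interaction on synthetic data. The helper name
# _demo_st_iscan is made up, and it assumes st_iscan's module-level
# dependencies (conform_dataset, session_block, session_line, add_jitter,
# _process) are available as in the original package.
def _demo_st_iscan():
    import numpy as np

    np.random.seed(0)
    N, S = 300, 50
    G = 1. * (np.random.rand(N, S) < 0.3)        # synthetic genotypes
    y = np.random.randn(N, 1)                    # synthetic phenotype
    K = np.dot(G, G.T) / S + 1e-4 * np.eye(N)    # relatedness built from the variants
    E1 = 1. * (np.random.rand(N, 1) < 0.5)       # environment interacting with the variants
    # tests (E1&E0)-by-g, E0-by-g, and their comparison (see docstring above)
    return st_iscan(G, y, K=K, E1=E1, verbose=False)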