def define_gp(Y, Xr, F, type, Rr):
    """Build the limix-core GP matching the requested region-covariance model.

    Parameters
    ----------
    Y : ndarray
        Phenotype matrix. ``Y.shape[0]`` is used as the number of samples and
        ``Y.shape[1]`` as the number of traits ``P``.
    Xr : ndarray
        Region design matrix, passed as ``G`` to the low-rank GP when it has
        fewer columns than rows. Not read when ``type == 'null'``.
    F : ndarray
        Fixed-effect design, forwarded unchanged to the GP.
    type : str
        Trait-covariance model of the region term (the name shadows the
        builtin but is part of the public signature):

        - ``'null'``  : rank-1 covariance frozen at ``1e-9`` (term disabled),
          with a column of ones as region design;
        - ``'rank1'`` : ``LowRankCov(P, 1)``;
        - ``'block'`` : ``FixedCov`` of an all-ones ``P x P`` matrix;
        - ``'full'``  : ``FreeFormCov(P)``.
    Rr : ndarray
        Region covariance, passed as ``R`` to ``GP2KronSum``. Only used when
        ``Xr`` has at least as many columns as rows.

    Returns
    -------
    GP2KronSumLR or GP2KronSum
        The configured (not yet optimized) GP object.

    Raises
    ------
    ValueError
        If ``type`` is not one of the supported model names.
    """
    # Fail fast and explicitly: previously an unknown type only printed a
    # placeholder message and then crashed later on the unbound `_Cr`.
    valid_types = ('null', 'rank1', 'block', 'full')
    if type not in valid_types:
        raise ValueError(
            "unknown type %r; expected one of %s" % (type, ", ".join(valid_types))
        )

    from limix_core.covar import LowRankCov
    from limix_core.covar import FixedCov
    from limix_core.covar import FreeFormCov
    from limix_core.gp import GP2KronSumLR
    from limix_core.gp import GP2KronSum

    P = Y.shape[1]
    _A = sp.eye(P)

    if type in ('null', 'rank1'):
        _Cr = LowRankCov(P, 1)
    elif type == 'block':
        _Cr = FixedCov(sp.ones((P, P)))
    else:  # type == 'full'
        _Cr = FreeFormCov(P)
    _Cn = FreeFormCov(P)

    if type == 'null':
        _gp = GP2KronSumLR(Y=Y, G=sp.ones((Y.shape[0], 1)), F=F, A=_A, Cr=_Cr, Cn=_Cn)
        # Pin the region term to (numerically) zero and exclude it from
        # the optimization.
        _Cr.setParams(1e-9 * sp.ones(P))
        _gp.covar.act_Cr = False
    elif Xr.shape[1] < Xr.shape[0]:
        # Fewer region columns than samples: use the low-rank GP.
        _gp = GP2KronSumLR(Y=Y, G=Xr, F=F, A=_A, Cr=_Cr, Cn=_Cn)
    else:
        _gp = GP2KronSum(Y=Y, F=F, A=_A, R=Rr, Cg=_Cr, Cn=_Cn)
    return _gp
def fit_null(self, F=None, verbose=True):
    """
    Fit the null model and cache its variance components on the instance.

    Parameters
    ----------
    F : (`N`, L) ndarray
        fixed effect design for covariates. It must not contain all-zero
        columns; the analytic branch (no ``K`` and no ``W``) fails if it is
        None.

    Returns
    -------
    RV
        Whatever ``gp.optimize`` returns when ``self.K`` or ``self.W`` is
        set; otherwise ``self.covarparam0`` (which is 0 in that case).
    """
    self.F = F
    self.qweliumod = CompQuadFormLiuMod()
    self.qwedavies = CompQuadFormDavies()
    self.qwedaviesskat = CompQuadFormDaviesSkat()

    if self.K is not None:
        # Replace K by a low-rank factor W built from the eigenpairs whose
        # eigenvalue exceeds 1e-9 (eigh returns them in ascending order, so
        # these are the trailing columns).
        eigvals, eigvecs = la.eigh(self.K)
        kept = eigvals[eigvals > 1e-9]
        # In most cases W = E, but it is kept as a separate attribute for
        # flexibility.
        self.W = eigvecs[:, -len(kept):] * kept**0.5

    if self.W is not None:
        # Low-rank random effect: fit both variance components numerically.
        unit = sp.ones((1, 1))
        self.gp = GP2KronSumLR(
            Y=self.y, F=self.F, A=sp.eye(1), Cn=FreeFormCov(1), G=self.W)
        self.gp.covar.Cr.setCovariance(0.5 * unit)
        self.gp.covar.Cn.setCovariance(0.5 * unit)
        RV = self.gp.optimize(verbose=verbose)
        # Optimal kernel parameters
        self.covarparam0 = self.gp.covar.Cr.K()[0, 0]
        self.covarparam1 = self.gp.covar.Cn.K()[0, 0]
        self.Kiy = self.gp.Kiy()
        return RV

    # No kernel at all: closed-form least-squares solution.
    gram_inv = la.inv(sp.dot(self.F.T, self.F))
    self.alpha_hat = sp.dot(sp.dot(gram_inv, self.F.T), self.y)
    residual = self.y - sp.dot(self.F, self.alpha_hat)
    self.covarparam1 = (residual**2).sum() / residual.shape[0]
    self.covarparam0 = 0
    self.Kiy = (1 / float(self.covarparam1)) * self.y
    self.W = sp.zeros(self.y.shape)
    RV = self.covarparam0
    return RV
def __init__(self, Y=None, R=None, S_R=None, U_R=None, traitID=None, F=None, rank=1):
    """Set up the trait IDs, covariance terms and GP of the set test.

    A background random effect is modelled when ``R`` is given, or when
    both ``S_R`` and ``U_R`` are given; in that case covariates ``F`` are
    rejected. ``rank`` is forwarded to ``GP3KronSumLR``.
    """
    from limix_core.gp import GP2KronSum
    from limix_core.gp import GP2KronSumLR
    from limix_core.gp import GP3KronSumLR
    from limix_core.covar import FreeFormCov

    # data
    has_eigen_pair = S_R is not None and U_R is not None
    self.bgRE = R is not None or has_eigen_pair

    # fixed effect
    msg = (
        "The current implementation of the full rank mtSet"
        " does not support covariates."
        " We reccommend to regress out covariates and"
        " subsequently quantile normalize the phenotypes"
        " to a normal distribution prior to use mtSet."
        " This can be done within the LIMIX framework using"
        " the methods limix.util.preprocess.regressOut and"
        " limix.util.preprocess.gaussianize"
    )
    assert F is None or not self.bgRE, msg

    from limix.util.preprocess import remove_dependent_cols

    A = None
    if F is not None:
        F = remove_dependent_cols(F)
        A = sp.eye(Y.shape[1])

    # traitID
    if traitID is None:
        traitID = sp.array(["trait %d" % p for p in range(Y.shape[1])])
    self.setTraitID(traitID)

    # init covariance matrices and gp
    Cg = FreeFormCov(Y.shape[1])
    Cn = FreeFormCov(Y.shape[1])
    # Random 0/1 column (each entry is 1 with probability 0.2) used as the
    # initial low-rank design G.
    G = 1. * (sp.rand(Y.shape[0], 1) < 0.2)
    if not self.bgRE:
        self._gp = GP2KronSumLR(Y=Y, Cn=Cn, G=G, F=F, A=A)
    else:
        self._gp = GP3KronSumLR(Y=Y, Cg=Cg, Cn=Cn, R=R, S_R=S_R, U_R=U_R, G=G, rank=rank)

    # null model params
    self.null = None
    # calls itself for column-by-column trait analysis
    self.stSet = None
    self.nullST = None
    self.infoOpt = None
    self.infoOptST = None
def train_model(self):
    """Fit the GP on the training data and store the estimates on ``self``.

    The fixed-effect design is ``[self.F, self.W, self.x]``; the random
    effect design is the standardized ``self.x`` multiplied by
    ``self.TrainingEnv``.

    Returns
    -------
    The last entry of the fitted fixed-effect vector ``gp.b()`` (the one
    associated with ``self.x``), also stored as ``self.persistent_effect``.
    """
    import scipy as sp
    from limix_core.covar import FreeFormCov
    from limix_core.gp import GP2KronSumLR

    design = sp.concatenate([self.F, self.W, self.x], 1)

    # Standardize the variant column by column, keeping mean and std.
    self.snp_mean = self.x.mean(0)
    self.x_std = self.x - self.snp_mean
    self.snp_std = self.x_std.std(0)
    self.x_std /= self.snp_std
    self.xoE = self.x_std * self.TrainingEnv

    unit = sp.ones((1, 1))
    model = GP2KronSumLR(
        Y=self.y,
        F=design,
        A=sp.eye(1),
        Cn=FreeFormCov(1),
        G=self.xoE,
    )
    model.covar.Cr.setCovariance(1e-4 * unit)
    model.covar.Cn.setCovariance(0.02 * unit)
    model.optimize(verbose=False)

    self.alpha = model.b()
    self.sigma_1 = model.covar.Cr.K()[0, 0]
    self.sigma_2 = model.covar.Cn.K()[0, 0]
    # Phenotype with the fitted fixed effects removed.
    self.y_adjust = self.y - sp.dot(design, self.alpha)
    self.persistent_effect = model.b()[-1]
    return self.persistent_effect
def calc_opt_rho(self):
    """Return rho = v_het / (v_comm + v_het) from a fitted GP.

    ``v_het`` is the fitted ``Cr`` variance scaled by the sample variance
    of ``self.x * self.Env``; ``v_comm`` is the variance of ``self.x``
    times its fitted fixed effect (last entry of ``gp.b()``).
    """
    from limix_core.covar import FreeFormCov
    from limix_core.gp import GP2KronSumLR

    design = sp.concatenate([self.F, self.W, self.x], 1)
    xoE = self.x * self.Env

    unit = sp.ones((1, 1))
    model = GP2KronSumLR(
        Y=self.y, F=design, A=sp.eye(1), Cn=FreeFormCov(1), G=xoE
    )
    model.covar.Cr.setCovariance(1e-4 * unit)
    model.covar.Cn.setCovariance(0.02 * unit)
    model.optimize(verbose=False)

    # var_xEEx = tr(xEEx P)/(n-1) = tr(PW (PW)^T)/(n-1) = (PW**2).sum()/(n-1)
    # with W = xE
    centered = xoE - xoE.mean(0)
    var_xEEx = (centered**2).sum() / float(self.y.shape[0] - 1)

    # heterogeneity variance
    v_het = model.covar.Cr.K()[0, 0] * var_xEEx
    # persistent variance
    v_comm = sp.var(model.b()[-1] * self.x)

    return v_het / (v_comm + v_het)
def calc_full_model(self):
    """Fit the GP using every column of ``self.Env`` and return ``-gp.LML()``.

    Fixed effects are ``[self.F, self.W, self.x]``; the random-effect
    design is ``self.x * self.Env``.
    """
    design = sp.concatenate([self.F, self.W, self.x], 1)
    unit = sp.ones((1, 1))

    model = GP2KronSumLR(
        Y=self.y,
        F=design,
        A=sp.eye(1),
        Cn=FreeFormCov(1),
        G=self.x * self.Env,
    )
    model.covar.Cr.setCovariance(1e-4 * unit)
    model.covar.Cn.setCovariance(0.02 * unit)
    model.optimize(verbose=False)
    return -model.LML()
def calc_marginal_model(self, env_remove=0):
    """Fit the GP with some columns of ``self.Env`` removed; return ``-gp.LML()``.

    Parameters
    ----------
    env_remove
        Column index (or indices) dropped from ``self.Env`` through
        ``sp.delete(..., axis=1)`` before building the random-effect design.
    """
    design = sp.concatenate([self.F, self.W, self.x], 1)
    reduced_env = sp.delete(self.Env, env_remove, axis=1)
    unit = sp.ones((1, 1))

    model = GP2KronSumLR(
        Y=self.y,
        F=design,
        A=sp.eye(1),
        Cn=FreeFormCov(1),
        G=self.x * reduced_env,
    )
    model.covar.Cr.setCovariance(1e-4 * unit)
    model.covar.Cn.setCovariance(0.02 * unit)
    model.optimize(verbose=False)
    return -model.LML()
def calc_lml(self, Env):
    """Fit the GP for the given environment matrix and return ``-gp.LML()``.

    Parameters
    ----------
    Env : ndarray
        Environment matrix multiplied with ``self.x`` to form the
        random-effect design. When it has zero columns, an all-ones array
        with the shape of ``self.x`` is used instead.
    """
    from limix_core.covar import FreeFormCov
    from limix_core.gp import GP2KronSumLR

    design = sp.concatenate([self.F, self.W, self.x], 1)
    has_envs = Env.shape[1] != 0
    xoE = self.x * Env if has_envs else sp.ones(self.x.shape)

    unit = sp.ones((1, 1))
    model = GP2KronSumLR(
        Y=self.y, F=design, A=sp.eye(1), Cn=FreeFormCov(1), G=xoE
    )
    model.covar.Cr.setCovariance(1e-4 * unit)
    model.covar.Cn.setCovariance(0.02 * unit)
    model.optimize(verbose=False)
    return -model.LML()
def fitNull(
    self,
    cache=False,
    out_dir="./cache",
    fname=None,
    rewrite=False,
    seed=None,
    factr=1e3,
    n_times=10,
    init_method=None,
    verbose=False,
):
    r"""
    Fit null model

    Args:
        cache (bool, optional):
            If False (default), the null model is fitted and the results
            are not cached. If True, the cache is activated; the cache file
            directory and name are given by ``out_dir`` and ``fname``.
            When ``cache=True``, we distinguish the following cases:

            - if the specified file does not exist, the output of the
              null model fitting is cached in the file.
            - if the specified file exists and ``rewrite=True``, the
              cache file is overwritten.
            - if the specified file exists and ``rewrite=False``, the
              results are imported from the cache file (the null model
              is not re-fitted).

        out_dir (str, optional):
            Output directory of the cache file; created if missing.
            The default value is "./cache".
        fname (str, optional):
            Name of the cache hdf5 file.
            It must be specified if ``cache=True``.
        rewrite (bool, optional):
            Only has an effect if ``cache=True``. In that case, if
            ``True``, an existing cache file is overwritten.
            The default value is ``False``.
        seed (int, optional):
            If not None, seeds ``sp.random`` before anything else.
        factr (float, optional):
            Optimization parameter that determines the accuracy of the
            solution. By default it is 1000.
            (see scipy.optimize.fmin_l_bfgs_b for more details).
        n_times (int, optional):
            Maximum number of optimization attempts; each attempt starts
            from fresh parameters produced by ``self._initParams`` and the
            loop stops at the first converged attempt. Must be >= 1.
            The default value is 10.
        init_method (optional):
            Forwarded to ``self._initParams``.
        verbose (bool, optional):
            Verbose flag, forwarded to the optimizer.

    Returns:
        (dict): dictionary containing:
            - **B** (*ndarray*): estimated effect sizes (null); only
              present when the mean has exactly one fixed-effect term;
            - **params0_g**, **params0_n** (*ndarray*): parameters of the
              relatedness and noise trait covariances (``params0_g`` is
              all zeros when there is no background random effect);
            - **Cg** (*ndarray*): estimated relatedness trait covariance
              (null); all zeros when there is no background random effect;
            - **Cn** (*ndarray*): estimated noise trait covariance (null);
            - **conv** (*ndarray*): convergence indicator;
            - **NLL0** (*ndarray*): value of ``LML()`` of the null GP,
              stored as the null negative log-likelihood;
            - **LMLgrad** (*ndarray*): mean squared entry of the gradient
              with respect to the covariance parameters;
            - **nit**, **funcalls** (*ndarray*): optimizer iteration and
              function-call counts;
            - **time** (*ndarray*): elapsed time (in seconds).

    Raises:
        ValueError: if the model has to be fitted and ``n_times < 1``.
    """
    from limix_core.gp import GP2KronSum
    from limix_core.gp import GP2KronSumLR

    if seed is not None:
        sp.random.seed(seed)

    read_from_file = False
    if cache:
        assert fname is not None, "MultiTraitSetTest:: specify fname"
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        out_file = os.path.join(out_dir, fname)
        read_from_file = os.path.exists(out_file) and not rewrite

    RV = {}
    if read_from_file:
        # `with` guarantees the file is closed even if a dataset read fails.
        with h5py.File(out_file, "r") as f:
            for key in list(f.keys()):
                RV[key] = f[key][:]
        self.setNull(RV)
        return RV

    if n_times < 1:
        # Without at least one attempt `conv`/`info` would be unbound below.
        raise ValueError("n_times must be at least 1")

    start = TIME.time()
    if self.bgRE:
        self._gpNull = GP2KronSum(
            Y=self.Y,
            F=None,
            A=None,
            Cg=self.Cg,
            Cn=self.Cn,
            R=None,
            S_R=self.S_R,
            U_R=self.U_R,
        )
    else:
        self._gpNull = GP2KronSumLR(
            self.Y, self.Cn, G=sp.ones((self.N, 1)), F=self.F, A=self.A
        )
        # freezes Cg to 0
        n_params = self._gpNull.covar.Cr.getNumberParams()
        self._gpNull.covar.Cr.setParams(1e-9 * sp.ones(n_params))
        self._gpNull.covar.act_Cr = False

    # Restart from new initial parameters until the optimizer converges.
    for _ in range(n_times):
        params0 = self._initParams(init_method=init_method)
        self._gpNull.setParams(params0)
        conv, info = self._gpNull.optimize(verbose=verbose, factr=factr)
        if conv:
            break
    if not conv:
        warnings.warn("not converged")

    LMLgrad = (self._gpNull.LML_grad()["covar"] ** 2).mean()
    LML = self._gpNull.LML()

    n_terms = self._gpNull.mean.n_terms
    if n_terms == 1:
        RV["B"] = self._gpNull.mean.B[0]
    elif n_terms > 1:
        # Fixed: this used the undefined name `warning`.
        warnings.warn("generalize to more than 1 fixed effect term")

    if self.bgRE:
        RV["params0_g"] = self.Cg.getParams()
    else:
        RV["params0_g"] = sp.zeros_like(self.Cn.getParams())
    RV["params0_n"] = self.Cn.getParams()

    if self.bgRE:
        RV["Cg"] = self.Cg.K()
    else:
        RV["Cg"] = sp.zeros_like(self.Cn.K())
    RV["Cn"] = self.Cn.K()

    RV["conv"] = sp.array([conv])
    RV["time"] = sp.array([TIME.time() - start])
    RV["NLL0"] = sp.array([LML])
    RV["LMLgrad"] = sp.array([LMLgrad])
    RV["nit"] = sp.array([info["nit"]])
    RV["funcalls"] = sp.array([info["funcalls"]])
    self.null = RV

    if cache:
        from limix.util.util_functions import smartDumpDictHdf5

        with h5py.File(out_file, "w") as f:
            smartDumpDictHdf5(RV, f)

    return RV
# mean as fixed effect covs = sp.ones((pheno.shape[0], 1)) # fit null model wfile = "data_structlmm/env.txt" W = sp.loadtxt(wfile) W = W[:, W.std(0) > 0] W -= W.mean(0) W /= W.std(0) W /= sp.sqrt(W.shape[1]) # larn a covariance on the null model gp = GP2KronSumLR(Y=pheno, Cn=FreeFormCov(1), G=W, F=covs, A=sp.ones((1, 1))) gp.covar.Cr.setCovariance(0.5 * sp.ones((1, 1))) gp.covar.Cn.setCovariance(0.5 * sp.ones((1, 1))) info_opt = gp.optimize(verbose=False) # define lmm lmm = LMM(pheno, covs, gp.covar.solve) # define geno preprocessing function imputer = SimpleImputer(missing_values=np.nan, strategy="mean") preprocess = prep.compose([ prep.filter_by_missing(max_miss=0.10), prep.impute(imputer), prep.filter_by_maf(min_maf=0.10),
def run_lmm(reader, pheno, R=None, S_R=None, U_R=None, W=None, covs=None, batch_size=1000, unique_variants=False):
    """
    Utility function to run a linear mixed model (LMM) association scan
    over the variants served by ``reader``.

    Parameters
    ----------
    reader : :class:`limix.data.BedReader`
        limix bed reader instance.
    pheno : (`N`, 1) ndarray
        phenotype vector
    R : (`N`, `N`) ndarray
        covariance of the random effect.
        Typically this is the genetic relatedness matrix.
        If set, ``W``, ``S_R`` and ``U_R`` are ignored.
    S_R : (`N`, ) ndarray
        eigenvalues of ``R``. If available together with the eigenvectors
        ``U_R``, they can be provided instead of ``R`` to avoid
        repeated computations.
        Only used when ``R`` is not set.
        If set, ``U_R`` should also be specified.
        The array passed by the caller is not modified.
    U_R : (`N`, `N`) ndarray
        eigenvectors of ``R``. If available together with the eigenvalues
        ``S_R``, they can be provided instead of ``R`` to avoid
        repeated computations.
        Only used when ``R`` is not set.
        If set, ``S_R`` should also be specified.
    W : (`N`, `K`) ndarray
        this defines the covariance of a low-rank random effect.
        Setting ``W`` is equivalent to setting ``R = dot(W, W.T)``
        but ``R`` is never computed to minimize memory usage.
        Only used when ``R``, ``U_R`` and ``S_R`` are not set.
    covs : (`N`, L) ndarray
        fixed effect design for covariates, `N` samples and `L` covariates.
        If None (default value), an intercept only is considered.
    batch_size : int
        to minimize memory usage the analysis is run in batches;
        this is the number of variants loaded into memory at the same time.
    unique_variants : bool
        if True, only non-repeated genotypes are considered.
        The default value is False.

    Returns
    -------
    res : :class:`pandas.DataFrame`
        variant info plus the columns ``pv``, ``beta`` (effect size),
        ``beta_ste`` (standard error of the effect size) and ``lrt``
        (test statistic).
    """
    if covs is None:
        covs = sp.ones((pheno.shape[0], 1))

    # calc S_R, U_R if R is specified
    if R is not None:
        S_R, U_R = la.eigh(R)

    # assert that S_R and U_R are both specified
    S_is = S_R is not None
    U_is = U_R is not None
    if S_is or U_is:
        assert S_is and U_is, 'Both U_R and S_R should be specified'

    # Lift the spectrum so that its smallest eigenvalue is 1e-4. The offset
    # is `1e-4 - min` (the previous `min + 1e-4` made negative spectra even
    # more negative), and a new array is created so that a caller-supplied
    # S_R is not modified in place.
    if S_R is not None and S_R.min() < 1e-4:
        offset = 1e-4 - S_R.min()
        S_R = S_R + offset
        warnings.warn("Added %.2e jitter to make R a SDP cov" % offset)

    # fit null
    if S_R is not None:
        # Full-rank random effect, given either as R or as its
        # eigendecomposition (previously the latter was silently ignored).
        from limix_core.gp import GP2KronSum
        from limix_core.covar import FreeFormCov
        Cg = FreeFormCov(1)
        Cn = FreeFormCov(1)
        gp = GP2KronSum(
            Y=pheno, Cg=Cg, Cn=Cn, F=covs, A=sp.eye(1), S_R=S_R, U_R=U_R)
        # The shape must be a tuple: `sp.ones(1, 1)` passes 1 as the dtype.
        Cg.setCovariance(0.5 * sp.ones((1, 1)))
        Cn.setCovariance(0.5 * sp.ones((1, 1)))
        gp.optimize(verbose=False)
        covar = gp.covar
    elif W is not None:
        from limix_core.gp import GP2KronSumLR
        from limix_core.covar import FreeFormCov
        gp = GP2KronSumLR(Y=pheno, Cn=FreeFormCov(1), G=W, F=covs, A=sp.eye(1))
        gp.covar.Cr.setCovariance(0.5 * sp.ones((1, 1)))
        gp.covar.Cn.setCovariance(0.5 * sp.ones((1, 1)))
        gp.optimize(verbose=False)
        covar = gp.covar
    else:
        covar = None

    # define lmm
    lmm = LMM(pheno, covs, covar)

    # Number of batches, rounded up (integer arithmetic).
    n_snps = reader.getSnpInfo().shape[0]
    n_batches = -(-n_snps // batch_size)

    t0 = time.time()

    res = []
    for i, gr in enumerate(GIter(reader, batch_size=batch_size)):
        print('.. batch %d/%d' % (i + 1, n_batches))

        X, _res = gr.getGenotypes(standardize=True, return_snpinfo=True)

        if unique_variants:
            # Keep only the rows of the variant info that correspond to the
            # retained (non-repeated) genotype columns.
            X, idxs = f_univar(X, return_idxs=True)
            Isnp = sp.in1d(sp.arange(_res.shape[0]), idxs)
            _res = _res[Isnp]

        # run lmm
        lmm.process(X)
        pv = lmm.getPv()
        beta = lmm.getBetaSNP()
        beta_ste = lmm.getBetaSNPste()
        lrt = lmm.getLRT()

        # add pvalues, beta, etc to res
        _res = _res.assign(pv=pd.Series(pv, index=_res.index))
        _res = _res.assign(beta=pd.Series(beta, index=_res.index))
        _res = _res.assign(beta_ste=pd.Series(beta_ste, index=_res.index))
        _res = _res.assign(lrt=pd.Series(lrt, index=_res.index))
        res.append(_res)

    res = pd.concat(res)
    res.reset_index(inplace=True, drop=True)

    t = time.time() - t0
    print('%.2f s elapsed' % t)

    return res
def st_iscan(G, y, K=None, M=None, E0=None, E1=None, W_R=None, verbose=True):
    r"""
    Single-variant association and interaction testing.

    Parameters
    ----------
    G : array-like
        Genotypes to test; normalised by ``conform_dataset`` and passed to
        the scan as ``asarray(G)``.
    y : array-like
        Phenotype data; ``y.shape[0]`` (after normalisation) is the number
        of samples `N`.
    K : (`N`, `N`) array-like, optional
        LMM-covariance/genetic relatedness matrix. If it is None after
        normalisation, a standard linear regression is run and ``W_R`` is
        not used.
    M : (`N`, `D`) array-like, optional
        Covariate design matrix, normalised by ``conform_dataset``
        (the handling of ``None`` is left to that helper — TODO confirm
        that it defaults to a column of ones).
    E0 : (`N`, `K0`) ndarray, optional
        Interaction variables included in both the null and the alternative
        model. Ignored (reset to None) when ``E1`` is None. When ``E1`` is
        given and ``E0`` is None, ``E0`` defaults to a column of ones, i.e.
        an additive genetic effect.
    E1 : (`N`, `K`) ndarray, optional
        Interaction variables interacting with the variant. When given, the
        alternative model uses ``concatenate([E0, E1], 1)``.
    W_R : (`N`, `R`) ndarray, optional
        Low-rank factor such that the LMM-covariance equals
        ``dot(W_R, transpose(W_R))``. When given (and ``K`` is not None),
        the low-rank model is fitted instead of the full-rank one.
    verbose : bool, optional
        If True, details such as the selected model and runtime are
        displayed.

    Returns
    -------
    The value returned by ``_process(lmm, lmm0, G, E0, E1)`` (scan results;
    the exact type is defined by that helper).
    """
    from limix_lmm.lmm import LMM
    from limix_lmm.lmm_core import LMMCore
    from limix_core.gp import GP2KronSum, GP2KronSumLR
    from limix_core.covar import FreeFormCov
    from scipy.linalg import eigh
    from numpy import ones, var, concatenate, asarray

    lmm0 = None

    with session_block("single-trait association test", disable=not verbose):

        with session_line("Normalising input... ", disable=not verbose):
            data = conform_dataset(y, M, G=G, K=K)

        y = data["y"]
        M = data["M"]
        G = data["G"]
        K = data["K"]

        # case 1: linear model
        # NOTE(review): W_R is ignored whenever K is None — confirm that a
        # low-rank model without K is not meant to be supported.
        if K is None:
            if verbose:
                print("Model: lm")
            gp = None
            Kiy_fun = None

        # case 2: low-rank linear model
        elif W_R is not None:
            if verbose:
                print("Model: low-rank lmm")
            gp = GP2KronSumLR(Y=y, Cn=FreeFormCov(1), G=W_R, F=M, A=ones((1, 1)))
            gp.covar.Cr.setCovariance(var(y) * ones((1, 1)))
            gp.covar.Cn.setCovariance(var(y) * ones((1, 1)))
            gp.optimize(verbose=verbose)
            Kiy_fun = gp.covar.solve

        # case 3: full-rank linear model
        else:
            if verbose:
                print("Model: lmm")
            S_R, U_R = eigh(K)
            add_jitter(S_R)
            # Keep handles on the covariance terms and initialise them
            # directly: the full-rank GP is built from `Cg`/`Cn`, so the
            # initial values are set on those objects rather than through
            # a `covar.Cr` attribute that belongs to the low-rank model.
            Cg = FreeFormCov(1)
            Cn = FreeFormCov(1)
            gp = GP2KronSum(
                Y=y,
                Cg=Cg,
                Cn=Cn,
                S_R=S_R,
                U_R=U_R,
                F=M,
                A=ones((1, 1)),
            )
            Cg.setCovariance(0.5 * var(y) * ones((1, 1)))
            Cn.setCovariance(0.5 * var(y) * ones((1, 1)))
            gp.optimize(verbose=verbose)
            Kiy_fun = gp.covar.solve

        if E1 is None:
            # Plain association scan: no interaction terms at all.
            lmm = LMM(y, M, Kiy_fun)
            E0 = None
        else:
            lmm = LMMCore(y, M, Kiy_fun)
            if E0 is None:
                E0 = ones([y.shape[0], 1])
            # NOTE(review): this is true as soon as ANY entry of E0 equals
            # 1; if the intent is "E0 is the all-ones column", `.all()`
            # would be the matching test — confirm before changing.
            if (E0 == 1).sum():
                lmm0 = LMM(y, M, Kiy_fun)
            else:
                lmm0 = LMMCore(y, M, Kiy_fun)
            E1 = concatenate([E0, E1], 1)

        return _process(lmm, lmm0, asarray(G), E0, E1)