def test_fast_scanner_set_scale_multicovariates():
    """Fast-scan candidates that duplicate the three fitted covariates.

    The markers handed to the scanner are an exact copy of the covariate
    matrix used to fit the LMM; the assertions pin the log-marginal
    likelihoods, effect sizes and scales reported for that case.
    """
    rng = RandomState(9458)
    nsamples = 10
    X = _covariates_sample(rng, nsamples, nsamples + 1)
    y = _outcome_sample(rng, 1.0, X)
    QS = economic_qs_linear(X)
    covariates = rng.randn(nsamples, 3)

    lmm = LMM(y, covariates, QS)
    lmm.fit(verbose=False)

    markers = covariates.copy()
    scanner = lmm.get_fast_scanner()
    result = scanner.fast_scan(markers, verbose=False)

    expected_lml = [-19.318845] * 3
    assert_allclose(result["lml"], expected_lml, rtol=1e-6, atol=1e-6)

    expected_effsizes0 = [
        -0.6923007382350215,
        2.3550810825973034,
        -0.38157769653894497,
    ]
    assert_allclose(result["effsizes0"][2], expected_effsizes0, rtol=1e-5)

    expected_effsizes1 = [-0.34615, 1.177541, -0.381578]
    assert_allclose(result["effsizes1"], expected_effsizes1, rtol=1e-6, atol=1e-6)

    assert_allclose(result["scale"], [1.0, 1.0, 1.0])
def test_lmm_scan_fast_scan():
    """Compare ``fast_scan`` on the first candidate with a direct optimisation.

    The reference is obtained by minimising the negative multivariate-normal
    log-density over three effect sizes and a log-scale, with the covariance
    fixed to the one fitted by the null LMM.
    """
    rng = RandomState(9458)
    nsamples = 30
    X = _covariates_sample(rng, nsamples, nsamples + 1)
    y = _outcome_sample(rng, 1.0, X)
    QS = economic_qs_linear(X)
    M0 = rng.randn(nsamples, 2)
    M1 = rng.randn(nsamples, 2)

    lmm = LMM(y, M0, QS)
    lmm.fit(verbose=False)

    K = lmm.v0 * X @ X.T + lmm.v1 * eye(nsamples)
    # Covariates plus the first candidate column only.
    design = concatenate((M0, M1[:, [0]]), axis=1)

    def neg_logpdf(params):
        effsizes = params[:3]
        scale = exp(params[3])
        return -st.multivariate_normal(design @ effsizes, scale * K).logpdf(y)

    optimum = minimize(neg_logpdf, [0, 0, 0, 0])

    scanner = lmm.get_fast_scanner()
    result = scanner.fast_scan(M1, verbose=False)

    assert_allclose(result["lml"][0], -optimum.fun)
    assert_allclose(result["effsizes0"][0], optimum.x[:2], rtol=1e-5)
    assert_allclose(result["effsizes1"][0], optimum.x[2:3], rtol=1e-5)
    assert_allclose(result["scale"][0], exp(optimum.x[3]), rtol=1e-5)
def test_fast_scanner_set_scale_1covariate():
    """Fast-scan a single noisy copy of the only covariate.

    Checks the fitted null parameters first, then the scan output, and finally
    that scanning an all-zero candidate gives back the null covariate effect.
    """
    rng = RandomState(9458)
    nsamples = 10
    X = _covariates_sample(rng, nsamples, nsamples + 1)
    y = _outcome_sample(rng, 1.0, X)
    QS = economic_qs_linear(X)
    covariate = rng.randn(nsamples, 1)

    lmm = LMM(y, covariate, QS)
    lmm.fit(verbose=False)

    # Null-model estimates.
    assert_allclose(lmm.scale, 5.282731934070453)
    assert_allclose(lmm.delta, 0.7029974630034005)
    assert_allclose(lmm.beta, [0.0599712498212])

    # Candidate = covariate plus fresh Gaussian noise.
    markers = covariate.copy() + rng.randn(nsamples, 1)
    scanner = lmm.get_fast_scanner()
    result = scanner.fast_scan(markers, verbose=False)

    assert_allclose(result["lml"], [-21.509721], rtol=1e-6)
    assert_allclose(result["effsizes0"], [[-1.43206379971882]])
    assert_allclose(result["effsizes1"], [1.412239], rtol=1e-6)
    assert_allclose(result["scale"], [0.8440354018505616], rtol=1e-6)

    null_beta = lmm.beta
    zero_scan = scanner.fast_scan(zeros((10, 1)), verbose=False)
    assert_allclose(zero_scan["effsizes0"][0], null_beta)
def estimate(y, lik, K, M=None, verbose=True):
    """Fit a (generalised) linear mixed model and return three variances.

    Parameters
    ----------
    y
        Outcome; passed through ``conform_dataset`` before fitting.
    lik
        Likelihood specification; normalised with ``normalize_likelihood`` and
        its first item is used as the likelihood name.
    K
        Covariance matrix, or ``None`` to fit without ``QS``.
    M
        Covariates; forwarded to ``conform_dataset``.
    verbose : bool
        ``False`` disables the session output and the fitting progress.

    Returns
    -------
    tuple
        ``(g, v, e)`` with ``g = scale * (1 - delta)``, ``v`` the variance of
        the fitted mean, and ``e = scale * delta`` (plus ``pi**2 / 3`` for the
        ``"bernoulli"`` likelihood).
    """
    from numpy_sugar.linalg import economic_qs
    from numpy import pi, var, diag
    from glimix_core.glmm import GLMMExpFam
    from glimix_core.lmm import LMM
    from limix._data._assert import assert_likelihood
    from limix._data import normalize_likelihood, conform_dataset
    from limix.qtl._assert import assert_finite
    from limix._display import session_block, session_line

    lik = normalize_likelihood(lik)
    lik_name = lik[0]
    quiet = not verbose

    with session_block("Heritability analysis", disable=quiet):

        with session_line("Normalising input...", disable=quiet):
            data = conform_dataset(y, M=M, K=K)

        y, M, K = data["y"], data["M"], data["K"]
        assert_finite(y, M, K)

        # NOTE: normalisation of K is currently disabled:
        #   K = K / diag(K).mean()
        QS = None if K is None else economic_qs(K)

        if lik_name == "normal":
            method = LMM(y.values, M.values, QS, restricted=True)
            method.fit(verbose=verbose)
        else:
            method = GLMMExpFam(y, lik, M.values, QS, n_int=500)
            method.fit(verbose=verbose, factr=1e6, pgtol=1e-3)

        scale = method.scale
        delta = method.delta
        g = scale * (1 - delta)
        e = scale * delta
        if lik_name == "bernoulli":
            e += pi * pi / 3

        v = var(method.mean())

        return g, v, e
def estimate(y_phe, lik, kin, marker_mat=None, verbose=True):
    """Estimate variance components.

    The inputs are conformed and checked for finiteness, an ``LMM`` (for the
    ``"normal"`` likelihood) or a ``GLMMExpFam`` (otherwise) is fitted, and the
    tuple ``(v_g, v_v, v_e)`` is returned, where ``v_g = scale * (1 - delta)``,
    ``v_v`` is the variance of the fitted mean and ``v_e = scale * delta``
    (increased by ``pi**2 / 3`` for the ``"bernoulli"`` likelihood).
    """
    lik = normalize_likelihood(lik)
    lik_name = lik[0]
    quiet = not verbose

    with session_block("Heritability analysis", disable=quiet):

        with session_line("Normalising input...", disable=quiet):
            data = conform_dataset(y_phe, M=marker_mat, K=kin)

        y_phe, marker_mat, kin = data["y"], data["M"], data["K"]
        assert_finite(y_phe, marker_mat, kin)

        # NOTE: normalisation of the kinship is currently disabled:
        #   K = K / diag(K).mean()
        q_s = None if kin is None else economic_qs(kin)

        if lik_name == "normal":
            method = LMM(y_phe.values, marker_mat.values, q_s, restricted=True)
            method.fit(verbose=verbose)
        else:
            method = GLMMExpFam(y_phe, lik, marker_mat.values, q_s, n_int=500)
            method.fit(verbose=verbose, factr=1e6, pgtol=1e-3)

        scale = method.scale
        delta = method.delta
        v_g = scale * (1 - delta)
        v_e = scale * delta
        if lik_name == "bernoulli":
            v_e += pi * pi / 3

        v_v = var(method.mean())

        return v_g, v_v, v_e
def _perform_lmm(y, M, QS, G, verbose):
    """Fit the null LMM, fast-scan the candidates in ``G`` and wrap the result.

    ``M`` must expose ``.values`` and a ``"covariate"`` coordinate; ``G`` must
    expose ``.coords`` and either ``.data`` or ``.values``.  Returns a
    ``QTLModel`` built from the null log-marginal likelihood, the per-candidate
    alternative likelihoods and effect sizes, and the covariate effect sizes.
    """
    from glimix_core.lmm import LMM
    from pandas import Series
    from xarray import DataArray

    lmm = LMM(y, M.values, QS)
    lmm.fit(verbose=verbose)
    sys.stdout.flush()

    null_lml = lmm.lml()
    covariate_names = list(M.coords["covariate"].values)
    ncov_effsizes = Series(lmm.beta, covariate_names)

    flmm = lmm.get_fast_scanner()
    # Prefer the raw ``data`` attribute when the container has one.
    values = G.data if hasattr(G, "data") else G.values
    alt_lmls, effsizes = flmm.fast_scan(values, verbose=verbose)

    # Keep only the coordinates indexed along the "candidate" dimension.
    coords = {}
    for key in G.coords.keys():
        coord = G.coords[key]
        if coord.dims[0] == "candidate":
            coords[key] = ("candidate", coord.values)

    alt_lmls = DataArray(alt_lmls, dims=["candidate"], coords=coords)
    effsizes = DataArray(effsizes, dims=["candidate"], coords=coords)

    return QTLModel(null_lml, alt_lmls, effsizes, ncov_effsizes)
def test_fast_scanner_set_scale_1covariate_redundant():
    """Fast-scan a candidate that is an exact copy of the single covariate."""
    rng = RandomState(9458)
    nsamples = 10
    X = _covariates_sample(rng, nsamples, nsamples + 1)
    y = _outcome_sample(rng, 1.0, X)
    QS = economic_qs_linear(X)
    covariate = rng.randn(nsamples, 1)

    lmm = LMM(y, covariate, QS)
    lmm.fit(verbose=False)

    markers = covariate.copy()
    scanner = lmm.get_fast_scanner()
    result = scanner.fast_scan(markers, verbose=False)

    assert_allclose(result["lml"][0], -22.357525517597185, rtol=1e-6)
    assert_allclose(result["effsizes0"], [[0.029985622694805182]])
    assert_allclose(
        result["effsizes1"][0], 0.02998562491058301, rtol=1e-6, atol=1e-6
    )
    assert_allclose(result["scale"], [1.0], rtol=1e-6)
def test_lmm_interface():
    """Pin the public attributes and methods of a small, non-restricted LMM."""
    rng = RandomState(1)
    nsamples = 3
    G = rng.randn(nsamples, nsamples + 1)
    X = rng.randn(nsamples, 2)

    # Outcome: fixed effects + random effects + noise, then standardised.
    y = X @ rng.randn(2) + G @ rng.randn(G.shape[1]) + rng.randn(nsamples)
    y -= y.mean(0)
    y /= y.std(0)

    QS = economic_qs_linear(G)
    lmm = LMM(y, X, QS, restricted=False)
    lmm.name = "lmm"
    lmm.fit(verbose=False)

    expected_covariance = [
        [0.436311031439718, 2.6243891396439837e-16, 2.0432156171727483e-16],
        [2.6243891396439837e-16, 0.4363110314397185, 4.814313140426306e-16],
        [2.0432156171727483e-16, 4.814313140426305e-16, 0.43631103143971817],
    ]
    assert_allclose(lmm.covariance(), expected_covariance, atol=1e-7)

    expected_mean = [0.6398184791042468, -0.8738254794097052, 0.7198112606871158]
    assert_allclose(lmm.mean(), expected_mean, atol=1e-7)

    expected_lml = -3.012715726960625
    assert_allclose(lmm.lml(), expected_lml, atol=1e-7)
    assert_allclose(lmm.value(), lmm.lml(), atol=1e-7)
    # Checked a second time, after value() has been called.
    assert_allclose(lmm.lml(), expected_lml, atol=1e-7)

    expected_X = [
        [-0.3224172040135075, -0.38405435466841564],
        [1.1337694423354374, -1.0998912673140309],
        [-0.17242820755043575, -0.8778584179213718],
    ]
    assert_allclose(lmm.X, expected_X, atol=1e-7)

    assert_allclose(
        lmm.beta, [-1.3155159120000266, -0.5615702941530938], atol=1e-7
    )

    expected_beta_covariance = [
        [0.44737305797088345, 0.20431961864892412],
        [0.20431961864892412, 0.29835835133251526],
    ]
    assert_allclose(lmm.beta_covariance, expected_beta_covariance, atol=1e-7)

    assert_allclose(lmm.delta, 0.9999999999999998, atol=1e-7)
    assert_equal(lmm.ncovariates, 2)
    assert_equal(lmm.nsamples, 3)
    assert_allclose(lmm.scale, 0.43631103143971767, atol=1e-7)
    assert_allclose(lmm.v0, 9.688051060046502e-17, atol=1e-7)
    assert_allclose(lmm.v1, 0.43631103143971756, atol=1e-7)
    assert_equal(lmm.name, "lmm")

    with pytest.raises(NotImplementedError):
        lmm.gradient()
def test_lm():
    """Fit an ``LMM`` constructed without a ``QS`` argument and pin v0, v1, beta."""
    rng = RandomState(0)
    y, X, _ = _full_rank(rng)

    model = LMM(y, X)
    model.fit(verbose=False)

    assert_allclose(model.v0, 2.0129061033356781e-16, atol=1e-7)
    assert_allclose(model.v1, 0.9065323176914355)
    assert_allclose(model.beta, [0.24026567104188318, -0.17873180599015123])
def test_lmm_scan():
    """Compare ``scanner.scan`` on two joint candidates with a direct optimisation.

    Also checks the reported standard errors against the closed-form
    ``sqrt(diag(inv(M' K^-1 M)))`` and pins the null-model quantities exposed
    by the scanner.
    """
    rng = RandomState(9458)
    nsamples = 30
    X = _covariates_sample(rng, nsamples, nsamples + 1)
    y = _outcome_sample(rng, 1.0, X)
    QS = economic_qs_linear(X)
    M0 = rng.randn(nsamples, 2)
    M1 = rng.randn(nsamples, 2)

    lmm = LMM(y, M0, QS)
    lmm.fit(verbose=False)

    null_cov = lmm.v0 * X @ X.T + lmm.v1 * eye(nsamples)
    design = concatenate((M0, M1), axis=1)

    def neg_logpdf(params):
        effsizes = params[:4]
        scale = exp(params[4])
        return -st.multivariate_normal(design @ effsizes, scale * null_cov).logpdf(y)

    optimum = minimize(neg_logpdf, [0, 0, 0, 0, 0])

    scanner = lmm.get_fast_scanner()
    result = scanner.scan(M1)

    assert_allclose(result["lml"], -optimum.fun)
    assert_allclose(result["effsizes0"], optimum.x[:2], rtol=1e-5)
    assert_allclose(result["effsizes1"], optimum.x[2:4], rtol=1e-5)
    assert_allclose(result["scale"], exp(optimum.x[4]), rtol=1e-5)

    # Standard errors from the rescaled covariance of the fitted model.
    alt_cov = result["scale"] * lmm.covariance()
    full_design = concatenate((M0, M1), axis=1)
    effsizes_se = sqrt(inv(full_design.T @ solve(alt_cov, full_design)).diagonal())
    reported_se = concatenate((result["effsizes0_se"], result["effsizes1_se"]))
    assert_allclose(effsizes_se, reported_se)

    assert_allclose(scanner.null_lml(), -53.805721275578456, rtol=1e-5)
    assert_allclose(
        scanner.null_beta, [0.26521964226797085, 0.4334778669761928], rtol=1e-5
    )
    expected_null_beta_cov = [
        [0.06302553593799207, 0.00429640179038484],
        [0.004296401790384839, 0.05591392416235412],
    ]
    assert_allclose(scanner.null_beta_covariance, expected_null_beta_cov, rtol=1e-5)
    assert_allclose(scanner.null_scale, 1.0)

    # The scanner's null quantities must agree with the fitted LMM.
    assert_allclose(scanner.null_beta, lmm.beta, rtol=1e-5)
    assert_allclose(scanner.null_beta_covariance, lmm.beta_covariance, rtol=1e-5)
def _fit_lmm_simple_model(self, verbose):
    """Fit an LMM for the simple model and store it on the instance.

    The covariance comes from ``self._get_matrix_simple_model()``; when it is
    ``None`` the LMM is built with ``QS=None``.  The fitted ``v0``/``v1`` are
    forwarded to ``self._set_simple_model_variances`` and the model is kept in
    ``self._glmm``.
    """
    from numpy_sugar.linalg import economic_qs
    from glimix_core.lmm import LMM
    from numpy import asarray

    K = self._get_matrix_simple_model()
    outcome = asarray(self._y, float).ravel()
    QS = economic_qs(K) if K is not None else None

    model = LMM(outcome, self._M, QS)
    model.fit(verbose=verbose)

    self._set_simple_model_variances(model.v0, model.v1)
    self._glmm = model
def test_fast_scanner_statsmodel_gls():
    """Check the scanner's null standard errors against statsmodels on Longley.

    First the standard errors are compared with a least-squares evaluation of
    ``inv(X' Sigma^-1 X)``; then, after standardising the data, they are
    compared with the ``bse`` reported by ``statsmodels`` GLS.
    """
    import statsmodels.api as sm
    from numpy.linalg import lstsq

    def _lstsq_solution(A, B):
        return lstsq(A, B, rcond=None)[0]

    data = sm.datasets.longley.load()
    data.exog = sm.add_constant(data.exog)

    # AR(1)-style covariance built from the lag-1 regression of OLS residuals.
    ols_resid = sm.OLS(data.endog, data.exog).fit().resid
    lagged = sm.add_constant(ols_resid[:-1])
    resid_fit = sm.OLS(ols_resid[1:], lagged).fit()
    rho = resid_fit.params[1]
    order = toeplitz(range(len(ols_resid)))
    sigma = rho ** order
    QS = economic_qs(sigma)

    lmm = LMM(data.endog, data.exog, QS)
    lmm.fit(verbose=False)
    sigma = lmm.covariance()
    scanner = lmm.get_fast_scanner()

    precision_design = _lstsq_solution(lmm.covariance(), data.exog)
    best_beta_se = _lstsq_solution(data.exog.T @ precision_design, eye(7))
    best_beta_se = sqrt(best_beta_se.diagonal())
    assert_allclose(scanner.null_beta_se, best_beta_se, atol=1e-5)

    # Standardise outcome and regressors; the constant column is restored.
    endog = data.endog.copy()
    endog -= endog.mean(0)
    endog /= endog.std(0)

    exog = data.exog.copy()
    exog -= exog.mean(0)
    with errstate(invalid="ignore", divide="ignore"):
        exog /= exog.std(0)
    exog[:, 0] = 1

    lmm = LMM(endog, exog, QS)
    lmm.fit(verbose=False)
    sigma = lmm.covariance()
    scanner = lmm.get_fast_scanner()

    gls_results = sm.GLS(endog, exog, sigma=sigma).fit()
    beta_se = gls_results.bse
    our_beta_se = sqrt(scanner.null_beta_covariance.diagonal())

    # statsmodels scales the covariance matrix we pass, that is why
    # we need to account for it here.
    rescaled_se = beta_se / sqrt(gls_results.scale)
    assert_allclose(our_beta_se, rescaled_se)
    assert_allclose(scanner.null_beta_se, beta_se / sqrt(gls_results.scale))
def calc_lml(self, Env):
    """Return the log-marginal likelihood of a restricted LMM for ``Env``.

    The covariates are ``self.F``, ``self.W`` and ``self.x`` stacked
    column-wise.  The random-effect design is ``self.x * Env``, or an all-ones
    array shaped like ``self.x`` when ``Env`` has no columns.
    """
    from numpy import ones, concatenate
    from glimix_core.lmm import LMM
    from numpy_sugar.linalg import economic_qs_linear

    covariates = concatenate([self.F, self.W, self.x], 1)

    has_env = Env.shape[1] != 0
    xoE = self.x * Env if has_env else ones(self.x.shape)

    model = LMM(self.y, covariates, economic_qs_linear(xoE), restricted=True)
    model.fit(verbose=False)
    return model.lml()
def _lmm(y, M, QS, verbose):
    """Fit a non-restricted LMM and return ``(scanner, v0, v1)``.

    ``scanner`` is the model's fast scanner wrapped in ``ScannerWrapper``.
    ``v0`` is ``None`` when no ``QS`` was supplied.
    """
    from glimix_core.lmm import LMM

    model = LMM(y, M, QS, restricted=False)
    model.fit(verbose=verbose)
    sys.stdout.flush()

    v0 = model.v0 if QS is not None else None
    v1 = model.v1

    return ScannerWrapper(model.get_fast_scanner()), v0, v1
def _fit_cis_herit(y, K_cis, X=None, compute_lrt=True):
    """Fit an LMM for the cis component and optionally run a likelihood-ratio test.

    Parameters
    ----------
    y
        Outcome vector.
    K_cis
        Matrix handed to ``economic_qs_linear``.
        NOTE(review): presumably the cis-genotype matrix rather than a
        pre-computed kinship -- confirm against the caller.
    X
        Fixed-effect design.  Defaults to a single column of ones.
    compute_lrt : bool
        When true, compare the LMM against the fixed-effects-only regression
        and report a p-value via ``_lrt_pvalue``; otherwise ``pval`` is ``None``.

    Returns
    -------
    tuple
        ``(fe_scale, cis_scale, noise_scale, logl_1, fixed_betas, pval)``.

    Raises
    ------
    ImportError
        Re-raised (after logging) when glimix-core or numpy-sugar is missing.
    """
    log = logging.getLogger(pyfocus.LOG)

    try:
        from glimix_core.lmm import LMM
        from numpy_sugar.linalg import economic_qs_linear
    except ImportError:
        log.error(
            "Training submodule requires glimix-core>=2.0.0 and numpy-sugar to be installed."
        )
        raise

    from scipy.stats import norm
    from scipy.linalg import lstsq

    if X is None:
        X = np.ones((len(y), 1))

    K_cis = economic_qs_linear(K_cis)
    lmm = LMM(y, X, K_cis)
    lmm.fit(verbose=False)

    fixed_betas = lmm.beta
    logl_1 = lmm.lml()
    cis_scale = lmm.v0
    noise_scale = lmm.v1
    fe_scale = lmm.fixed_effects_variance
    h2g = cis_scale / (cis_scale + noise_scale + fe_scale)

    if compute_lrt:
        # Reduced model is just OLS regression for the fixed effects.
        fixed_betas_0 = lstsq(X, y)[0]
        fitted = np.dot(X, fixed_betas_0)

        # The ``residues`` returned by lstsq is an empty array whenever X is
        # rank-deficient or not over-determined, which would break the
        # broadcast below; compute the residual sum of squares explicitly.
        resid = np.asarray(y) - fitted
        # LMM also uses MLE estimation, so don't adjust for bias right now.
        s2e = np.sum(resid ** 2, axis=0) / len(y)

        logl_0 = np.sum(norm.logpdf(y, loc=fitted, scale=np.sqrt(s2e)))
        pval = _lrt_pvalue(logl_0, logl_1)
        log.debug("Estimated cis-h2g = {} (P = {})".format(h2g, pval))
    else:
        pval = None
        log.debug("Estimated cis-h2g = {}".format(h2g))

    return fe_scale, cis_scale, noise_scale, logl_1, fixed_betas, pval
def _st_lmm(Y, M, QS, verbose): from numpy import nan from glimix_core.lmm import LMM lmm = LMM(Y, M, QS, restricted=False) lmm.fit(verbose=verbose) sys.stdout.flush() if QS is None: v0 = nan else: v0 = lmm.v0 v1 = lmm.v1 return lmm.get_fast_scanner(), v0, v1
def test_lmm_scan_lmm_iid_prior():
    """Fast-scan two markers with an LMM built with ``QS=None`` and an intercept."""
    rng = RandomState(9458)
    nsamples = 30
    X = _covariates_sample(rng, nsamples, nsamples + 1)
    markers = rng.randn(nsamples, 2)
    y = _outcome_sample(rng, 1.0, X)

    intercept = ones((nsamples, 1))
    lmm = LMM(y, intercept, None)
    lmm.fit(verbose=False)

    result = lmm.get_fast_scanner().fast_scan(markers, verbose=False)
    lmls = result["lml"]

    assert_allclose(lmls[:2], [-63.16019973550036, -62.489358539276715])
def test_fast_scanner_redundant_candidates():
    """Smoke test: scanning all-ones candidates against all-ones covariates.

    There are no assertions; the test only requires ``fast_scan`` to finish
    without raising.
    """
    rng = RandomState(9458)
    nsamples = 10
    X = _covariates_sample(rng, nsamples, nsamples + 1)
    y = _outcome_sample(rng, 1.0, X)
    QS = economic_qs_linear(X)

    covariates = ones((nsamples, 5))
    lmm = LMM(y, covariates, QS, restricted=False)
    lmm.fit(verbose=False)

    markers = covariates.copy()
    lmm.get_fast_scanner().fast_scan(markers, verbose=False)
def calc_opt_rho(self):
    """Return ``rho = v_het / (v_comm + v_het)`` from a restricted LMM fit.

    The covariates are ``self.F``, ``self.W`` and ``self.x`` stacked
    column-wise and the random-effect design is ``self.x * self.Env``.
    ``v_het`` is the fitted ``v0`` times the summed squared deviation of that
    design from its column means, divided by ``n - 1`` with
    ``n = self.y.shape[0]``; ``v_comm`` is the variance of the last fixed
    effect times ``self.x``.
    """
    # NumPy functions are imported from NumPy itself: the ``scipy.concatenate``
    # and ``scipy.var`` aliases were deprecated in SciPy 1.4 and are no longer
    # available in current releases.
    from numpy import concatenate, var
    from glimix_core.lmm import LMM
    from numpy_sugar.linalg import economic_qs_linear

    _covs = concatenate([self.F, self.W, self.x], 1)
    xoE = self.x * self.Env
    QS = economic_qs_linear(xoE)
    gp = LMM(self.y, _covs, QS, restricted=True)
    gp.fit(verbose=False)

    # variance heterogeneity
    var_xEEx = ((xoE - xoE.mean(0)) ** 2).sum()
    var_xEEx /= float(self.y.shape[0] - 1)
    v_het = gp.v0 * var_xEEx

    # variance persistent
    v_comm = var(gp.beta[-1] * self.x)

    rho = v_het / (v_comm + v_het)
    return rho
def test_lmm_predict():
    """Pin the correlation between the outcome and ``LMMPredict``'s predictive mean.

    The outcome is sampled with ``GGPSampler`` from an offset mean and the sum
    of a linear and an identity covariance, both with scale 1.5.
    """
    rng = RandomState(9458)
    nsamples = 30

    # Column-standardised design, rescaled by sqrt(number of columns).
    X = rng.randn(nsamples, nsamples + 1)
    X -= X.mean(0)
    X /= X.std(0)
    X /= sqrt(X.shape[1])

    mean = OffsetMean(nsamples)
    mean.offset = 1.0

    linear_cov = LinearCov(X)
    linear_cov.scale = 1.5
    iid_cov = EyeCov(nsamples)
    iid_cov.scale = 1.5
    cov = SumCov([linear_cov, iid_cov])

    lik = DeltaProdLik()
    y = GGPSampler(lik, mean, cov).sample(rng)

    QS = economic_qs_linear(X)
    lmm = LMM(y, ones((nsamples, 1)), QS)
    lmm.fit(verbose=False)

    predictor = LMMPredict(
        y, lmm.beta, lmm.v0, lmm.v1, lmm.mean(), lmm.covariance()
    )

    K = dot(X, X.T)
    predicted = predictor.predictive_mean(ones((nsamples, 1)), K, K.diagonal())
    assert_allclose(corrcoef(y, predicted)[0, 1], 0.8358820971891354)
def test_lmm_beta_covariance():
    """Pin ``LMM.beta_covariance`` for three datasets.

    The datasets are, in order: ``_full_rank``, ``_low_rank`` restricted to its
    first two covariates, and ``_low_rank`` with all covariates.
    """

    def _fitted_beta_covariance(y, X, G):
        QS = economic_qs_linear(G)
        model = LMM(y, X, QS)
        model.fit(verbose=False)
        return model.beta_covariance

    rng = RandomState(0)

    y, X, G = _full_rank(rng)
    expected = [
        [0.015685784760937037, 0.006509918649859495],
        [0.006509918649859495, 0.007975242272006645],
    ]
    assert_allclose(_fitted_beta_covariance(y, X, G), expected)

    y, X, G = _low_rank(rng)
    expected = [
        [0.002763268929325623, 0.0006651810010328699],
        [0.0006651810010328708, 0.0016910004907565248],
    ]
    assert_allclose(_fitted_beta_covariance(y, X[:, :2], G), expected)

    y, X, G = _low_rank(rng)
    expected = [
        [
            0.003892850639339253,
            0.0012112513279299796,
            0.003892850639339256,
            0.0012112513279299794,
        ],
        [
            0.0012112513279299794,
            0.009340423857663259,
            0.0012112513279299833,
            0.009340423857663257,
        ],
        [
            0.0038928506393392562,
            0.0012112513279299835,
            0.003892850639339259,
            0.0012112513279299833,
        ],
        [
            0.0012112513279299794,
            0.009340423857663257,
            0.0012112513279299833,
            0.009340423857663257,
        ],
    ]
    assert_allclose(_fitted_beta_covariance(y, X, G), expected)
def estimate(y, lik, K, M=None, verbose=True):
    r"""Estimate the so-called narrow-sense heritability.

    It supports Normal, Bernoulli, Probit, Binomial, and Poisson phenotypes.
    Let :math:`N` be the sample size and :math:`S` the number of covariates.

    Parameters
    ----------
    y : array_like
        Either a tuple of two arrays of `N` individuals each (Binomial
        phenotypes) or an array of `N` individuals (Normal, Poisson, or
        Bernoulli phenotypes). If a continuous phenotype is provided (i.e., a
        Normal one), make sure they have been normalised in such a way that its
        values are not extremely large; it might cause numerical errors
        otherwise. For example, by using :func:`limix.qc.mean_standardize` or
        :func:`limix.qc.quantile_gaussianize`.
    lik : "normal", "bernoulli", "probit", "binomial", "poisson"
        Sample likelihood describing the residual distribution. A tuple or
        list whose first item is the likelihood name is also accepted; the
        name is matched case-insensitively.
    K : array_like
        :math:`N`-by-:math:`N` covariance matrix. It might be, for example, the
        estimated kinship relationship between the individuals. The provided
        matrix will be normalised via the function
        :func:`limix.qc.normalise_covariance`.
    M : array_like, optional
        :math:`N` individuals by :math:`S` covariates.
        It will create a :math:`N`-by-:math:`1` matrix ``M`` of ones
        representing the offset covariate if ``None`` is passed. If an array is
        passed, it will be used as is. Defaults to ``None``.
    verbose : bool, optional
        ``True`` to display progress and summary; ``False`` otherwise.

    Returns
    -------
    float
        Estimated heritability.

    Examples
    --------
    .. doctest::

        >>> from numpy import dot, exp, sqrt
        >>> from numpy.random import RandomState
        >>> from limix.her import estimate
        >>>
        >>> random = RandomState(0)
        >>>
        >>> G = random.randn(150, 200) / sqrt(200)
        >>> K = dot(G, G.T)
        >>> z = dot(G, random.randn(200)) + random.randn(150)
        >>> y = random.poisson(exp(z))
        >>>
        >>> print('%.3f' % estimate(y, 'poisson', K, verbose=False))  # doctest: +FLOAT_CMP
        0.183

    Notes
    -----
    It will raise a ``ValueError`` exception if non-finite values are passed.
    Please, refer to the :func:`limix.qc.mean_impute` function for missing
    value imputation.
    """
    from numpy_sugar import is_all_finite
    from numpy_sugar.linalg import economic_qs
    from numpy import ones, pi, var
    from glimix_core.glmm import GLMMExpFam
    from glimix_core.lmm import LMM

    # Accept a bare likelihood name as well as a (name, ...) sequence.
    if not isinstance(lik, (tuple, list)):
        lik = (lik,)

    lik_name = lik[0].lower()
    check_likelihood_name(lik_name)

    with session_block("heritability analysis", disable=not verbose):

        if M is None:
            M = ones((len(y), 1))

        with session_line("Normalising input...", disable=not verbose):
            data = conform_dataset(y, M=M, K=K)

        y = data["y"]
        M = data["M"]
        K = data["K"]

        if not is_all_finite(y):
            raise ValueError("Outcome must have finite values only.")

        if not is_all_finite(M):
            raise ValueError("Covariates must have finite values only.")

        if K is not None:
            if not is_all_finite(K):
                # K is the covariance matrix, not the covariates.
                raise ValueError("Covariance matrix must have finite values only.")

            K = normalise_covariance(K)

        y = normalise_extreme_values(y, lik)

        if K is not None:
            QS = economic_qs(K)
        else:
            QS = None

        if lik_name == "normal":
            method = LMM(y.values, M.values, QS)
            method.fit(verbose=verbose)
        else:
            method = GLMMExpFam(y, lik, M.values, QS, n_int=500)
            method.fit(verbose=verbose, factr=1e6, pgtol=1e-3)

        g = method.scale * (1 - method.delta)
        e = method.scale * method.delta
        if lik_name == "bernoulli":
            e += pi * pi / 3

        # Variance attributed to the fixed effects.
        if lik_name == "normal":
            v = method.fixed_effects_variance
        else:
            v = var(method.mean())

        return g / (v + g + e)
def run_QTL_analysis(pheno_filename, anno_filename, geno_prefix, plinkGenotype, output_dir, window_size=250000, min_maf=0.05, min_hwe_P=0.001, min_call_rate=None, blocksize=1000, cis_mode=True, skipAutosomeFiltering = False, gaussianize_method=None, minimum_test_samples= 10, seed=np.random.randint(40000), n_perm=0, write_permutations = False, write_feature_top_permutations = False, relatedness_score=0.95, feature_variant_covariate_filename = None, snps_filename=None, feature_filename=None, snp_feature_filename=None, genetic_range='all', covariates_filename=None, randomeff_filename=None, sample_mapping_filename=None, extended_anno_filename=None, regressCovariatesUpfront = False, debugger=False): #Manual flag to set pearson (True), spearman (False). TODO add rank as an option to gaussnorm. pearson=True if regressCovariatesUpfront is not None: #This implementation can only handle regression before the association test (correlation). regressCovariatesUpfront= True tot_time = 0 idx = 0 print(relatedness_score) fill_NaN = Imputer(missing_values=np.nan, strategy='mean', axis=0, copy=False) print('Running QTL analysis.') lik = 'normal' minimumProbabilityStep=0.1 '''Core function to take input and run QTL tests on a given chromosome.''' # Check if relatedness_score is present as a measure of genotype similarity and hence, of sample similarity. 
if relatedness_score is not None: relatedness_score = float(relatedness_score) # Intersect files together to list the amount of samples with enough files if debugger: fun_start = time.time() [phenotype_df, kinship_df, randomeff_df, covariate_df, sample2individual_df,complete_annotation_df, annotation_df, snp_filter_df, snp_feature_filter_df, geneticaly_unique_individuals, minimum_test_samples, feature_list, bim, fam, bed, bgen, chromosome, selectionStart, selectionEnd, feature_variant_covariate_df]=\ utils.run_QTL_analysis_load_intersect_phenotype_covariates_kinship_sample_mapping(pheno_filename=pheno_filename, anno_filename=anno_filename, geno_prefix=geno_prefix, plinkGenotype=plinkGenotype, cis_mode=cis_mode, skipAutosomeFiltering = skipAutosomeFiltering, minimum_test_samples= minimum_test_samples, relatedness_score=relatedness_score, snps_filename=snps_filename, feature_filename=feature_filename, snp_feature_filename=snp_feature_filename, selection=genetic_range,covariates_filename=covariates_filename, randomeff_filename=randomeff_filename, sample_mapping_filename=sample_mapping_filename, extended_anno_filename=extended_anno_filename, feature_variant_covariate_filename=feature_variant_covariate_filename) if debugger: fun_end = time.time() print(" Intersecting files took {}".format(fun_end-fun_start)) # Check if kinship matrix is present. The matrix of pairwise genotype similarity. 
If they are not present took genetically unique individuals based on IDs mixed = kinship_df is not None if (kinship_df is None) or (relatedness_score is None) : geneticaly_unique_individuals = sample2individual_df['iid'].values QS = None # Check if feature list is empty (genes) if(feature_list==None or len(feature_list)==0): print ('No features to be tested.') sys.exit() #Open output files if debugger: fun_start = time.time() qtl_loader_utils.ensure_dir(output_dir) if not selectionStart is None : output_writer = qtl_output.hdf5_writer(output_dir+'/qtl_results_{}_{}_{}.h5'.format(chromosome,selectionStart,selectionEnd)) else : output_writer = qtl_output.hdf5_writer(output_dir+'/qtl_results_{}.h5'.format(chromosome)) if(write_permutations): if not selectionStart is None : permutation_writer = qtl_output.hdf5_permutations_writer(output_dir+'/perm_results_{}_{}_{}.h5'.format(chromosome,selectionStart,selectionEnd),n_perm) else : permutation_writer = qtl_output.hdf5_permutations_writer(output_dir+'/perm_results_{}.h5'.format(chromosome),n_perm) if debugger: fun_end = time.time() print(" Opening writing files took {}".format(fun_end-fun_start)) #Arrays to store indices of snps tested and pass and fail QC SNPs for features without missingness. 
tested_snp_ids = [] pass_qc_snps_all = [] fail_qc_snps_all = [] fail_qc_features = [] alpha_params = [] beta_params = [] n_samples = [] n_e_samples = [] random_eff_param = [] na_containing_features=0 currentFeatureNumber=0 snpQcInfoMain = None random_eff_param = [] log = {} rho1 = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1] Sigma = {} Sigma_qs = {} randomeff_mix = False ############################################################################################################################################################# # Per feature LMM fitting ############################################################################################################################################################# # start per feature computations for feature_id in feature_list: gc.collect() start_time = time.time() # log file production for rho values storing and computation time log[(feature_id)] = [] # feature specific parameters for QS mixing mixingParameters = {} feature_best_rho = -1 snpQcInfo = None # counter currentFeatureNumber+= 1 ######################################################################################################################################################## # check if enough phenotype samples to test this gene if (len(phenotype_df.loc[feature_id,:]))<minimum_test_samples: print("Feature: "+feature_id+" not tested not enough samples do QTL test (n="+str(len(phenotype_df.loc[feature_id,:]))+").") fail_qc_features.append(feature_id) geneticaly_unique_individuals = tmp_unique_individuals continue data_written = False contains_missing_samples = False ######################################################################################################################################################## ######################################################################################################################################################## # SNP selection based on gene location and window size if debugger: fun_start = 
time.time() snpQuery = utils.do_snp_selection(feature_id, complete_annotation_df, bim, cis_mode, window_size, skipAutosomeFiltering) if debugger: fun_end = time.time() print("SNP querying took {}".format(fun_end-fun_start)) snp_cov_df = None if debugger: fun_start = time.time() ######################################################################################################################################################## ######################################################################################################################################################### # Check if a matrix of variant covariance is present. Like for example PCs covariates -- Understand better covariates to SNP if(feature_variant_covariate_df is not None): if(feature_id in feature_variant_covariate_df['feature_id'].values): # array of covariates per SNP and feature covariateSnp = feature_variant_covariate_df['snp_id'].values[feature_variant_covariate_df['feature_id']==feature_id] if(any(i in bim['snp'].values for i in covariateSnp)): snpQuery_cov = bim.loc[bim['snp'].map(lambda x: x in list(covariateSnp)),:] if(plinkGenotype): snp_cov_df = pd.DataFrame(data=bed[snpQuery_cov['i'].values,:].compute().transpose(),index=fam.index,columns=snpQuery_cov['snp'],) else: ##Here we make some assumptions on the SNPs. They are expected to be ploidy 2! ##Also we don't use a minimal quality to assure a value is present for all samples. 
print('Warning, during the regression of SNPs we assume ploidy 2.') snp_cov_df_t = pd.DataFrame(columns=fam.index) rowNumber = 0 for snpId in snpQuery_cov['i'] : geno = bgen["genotype"][snpId].compute() if(geno["phased"]): snp_df_dosage_t = geno["probs"][:,[0,2]].sum(1).astype(float) snp_df_dosage_t[(np.amax(geno["probs"][:,:2],1)+np.amax(geno["probs"][:,2:4],1))<(1+minimumProbabilityStep)] = float('NaN') else : snp_df_dosage_t = (geno["probs"][:,0]* 2)+geno["probs"][:,1] snp_df_dosage_t[np.amax(geno["probs"][:,:3],1)<((1/3)+minimumProbabilityStep)] = float('NaN') snp_df_dosage_t = pd.Series(snp_df_dosage_t, index= fam.index) snp_df_dosage_t.name = snpId snp_cov_df_t = snp_cov_df_t.append(snp_df_dosage_t) rowNumber = rowNumber +1 snp_cov_df = snp_cov_df_t.transpose() snp_cov_df_t = None if debugger: fun_end = time.time() print(" Selecting feature variant covariate took {}".format(fun_end-fun_start)) ######################################################################################################################################################## ######################################################################################################################################################## # Check the number of SNP to be tested and look if there is some SNP or feature filtering requirement if (len(snpQuery) != 0) and (snp_filter_df is not None): toSelect = set(snp_filter_df.index).intersection(set(snpQuery['snp'])) snpQuery = snpQuery.loc[snpQuery['snp'].isin(toSelect)] if (len(snpQuery) != 0) and (snp_feature_filter_df is not None): toSelect = set(np.unique(snp_feature_filter_df['snp_id'].loc[snp_feature_filter_df['feature_id']==feature_id])).intersection(set(snpQuery['snp'])) snpQuery = snpQuery.loc[snpQuery['snp'].isin(toSelect)] if len(snpQuery) == 0: print("Feature: "+feature_id+" not tested. 
No SNPS passed QC for phenotype.") fail_qc_features.append(feature_id) continue ######################################################################################################################################################## else: # selecting phenotype array phenotype_ds = phenotype_df.loc[feature_id] contains_missing_samples = any(~np.isfinite(phenotype_ds)) ##################################################################################################################################################### # check for missing samples according to specific feature otherwise use previous SNP QC information if(contains_missing_samples): print ('Feature: ' + feature_id + ' contains missing data.') phenotype_ds.dropna(inplace=True) na_containing_features = na_containing_features+1 '''select indices for relevant individuals in genotype matrix These are not unique. NOT to be used to access phenotype/covariates data ''' individual_ids = sample2individual_df.loc[phenotype_ds.index,'iid'].values sample2individual_feature= sample2individual_df.loc[phenotype_ds.index] if(contains_missing_samples): tmp_unique_individuals = geneticaly_unique_individuals if (kinship_df is not None) and (relatedness_score is not None): geneticaly_unique_individuals = utils.get_unique_genetic_samples(kinship_df.loc[individual_ids,individual_ids], relatedness_score); else: geneticaly_unique_individuals = individual_ids ##################################################################################################################################################### else: #If no missing samples we can use the previous SNP QC information before actually loading data. 
#This allows for more efficient blocking and retrieving of data snpQuery = snpQuery.loc[snpQuery['snp'].map(lambda x: x not in list(map(str, fail_qc_snps_all)))] ##################################################################################################################################################### # check for enough samples for QTL test if phenotype_ds.empty or len(geneticaly_unique_individuals)<minimum_test_samples : print("Feature: "+feature_id+" not tested not enough samples do QTL test.") fail_qc_features.append(feature_id) if contains_missing_samples: geneticaly_unique_individuals = tmp_unique_individuals continue elif np.var(phenotype_ds.values) == 0: print("Feature: "+feature_id+" has no variance in selected individuals.") fail_qc_features.append(feature_id) if contains_missing_samples: geneticaly_unique_individuals = tmp_unique_individuals continue ##################################################################################################################################################### print ('For feature: ' +str(currentFeatureNumber)+ '/'+str(len(feature_list))+ ' (' + feature_id + '): ' + str(snpQuery.shape[0]) + ' SNPs need to be tested.\n Please stand by.') ########################################################################################################################################################## # SNP TESTING ########################################################################################################################################################## if(n_perm!=0): bestPermutationPval = np.ones((n_perm), dtype=np.float) #Here we need to start preparing the LMM, can use the fam for sample IDS in SNP matrix. 
#test if the covariates, kinship, snp and phenotype are in the same order if ((all(kinship_df.loc[sample2individual_df['iid'],sample2individual_df['iid']].index==sample2individual_feature.loc[phenotype_ds.index]['iid']) if kinship_df is not None else True) &\ (all(phenotype_ds.index==randomeff_df.loc[sample2individual_feature['sample'],sample2individual_feature['sample']].index) if randomeff_df is not None else True) &\ (all(phenotype_ds.index==covariate_df.loc[sample2individual_feature['sample'],:].index) if covariate_df is not None else True)): ''' if all lines are in order put in arrays the correct genotype and phenotype x=a if cond1 else b <---> equivalent to if cond1: x=a else x=b; better readability of the code ''' ####################################################################################################################################################### # look if kinship or other random effect dataframe are present and QS computation ! if debugger: fun_start = time.time() if kinship_df is not None and randomeff_df is None: kinship_mat = kinship_df.loc[individual_ids,individual_ids].values kinship_mat = kinship_mat.astype(float) ##GOWER normalization of Kinship matrix. kinship_mat *= (kinship_mat.shape[0] - 1) / (kinship_mat.trace() - kinship_mat.mean(0).sum()) ## This needs to go with the subselection stuff. if(QS is None and not contains_missing_samples): QS = economic_qs(kinship_mat) elif (contains_missing_samples): QS = economic_qs(kinship_mat) # combining the two matrices if kinship_df is not None and randomeff_df is not None: #Here we need to match names and make sure that the order is the same and the right samples get mixed. 
randomeff_mix = True if(not Sigma_qs and not contains_missing_samples): kinship_mat = kinship_df.loc[individual_ids,individual_ids].values kinship_mat = kinship_mat.astype(float) randomeff_mat = randomeff_df.loc[sample2individual_feature['sample'],sample2individual_feature['sample']].values randomeff_mat = randomeff_mat.astype(float) for rho in rho1: Sigma[rho] = rho * kinship_mat + (1 - rho) * randomeff_mat Sigma[rho] *= (Sigma[rho].shape[0] - 1) / (Sigma[rho].trace() - Sigma[rho].mean(0).sum()) Sigma_qs[rho] = economic_qs(Sigma[rho]) elif (contains_missing_samples): #To fix: this needs to be reset after running with missing samples. Now Missing!! kinship_mat = kinship_df.loc[individual_ids,individual_ids].values kinship_mat = kinship_mat.astype(float) randomeff_mat = randomeff_df.loc[sample2individual_feature['sample'],sample2individual_feature['sample']].values randomeff_mat = randomeff_mat.astype(float) for rho in rho1: Sigma[rho] = rho * kinship_mat + (1 - rho) * randomeff_mat ##GOWER normalization of Kinship matrix. Sigma[rho] *= (Sigma[rho].shape[0] - 1) / (Sigma[rho].trace() - Sigma[rho].mean(0).sum()) Sigma_qs[rho] = economic_qs(Sigma[rho]) # if kinship_df is None and randomeff_df is not None: # randomeff_mat = randomeff_df.loc[individual_ids,individual_ids].values # randomeff_mat = randomeff_mat.astype(float) # ##GOWER normalization of Kinship matrix. # randomeff_mat *= (randomeff_mat.shape[0] - 1) / (randomeff_mat.trace() - randomeff_mat.mean(0).sum()) # ## This needs to go with the subselection stuff. 
# if(QS is None and not contains_missing_samples): # QS = economic_qs(randomeff_mat) # elif (contains_missing_samples): # QS = economic_qs(randomeff_mat) # creating a fake QS if none random effect is present or use the read depth one if kinship_df is None: if randomeff_df is None: K = np.eye(len(phenotype_ds.index)) if(QS is None and not contains_missing_samples): QS = economic_qs(K) elif (contains_missing_samples): QS = economic_qs(K) else: if(QS is None and not contains_missing_samples): QS = economic_qs(randomeff_df) elif (contains_missing_samples): QS = economic_qs(randomeff_df) if debugger: fun_end = time.time() print(" Computing QS took {}".format(fun_end-fun_start)) ####################################################################################################################################################### ####################################################################################################################################################### # covariance matrix setting cov_matrix = covariate_df.loc[sample2individual_feature['sample'],:].values if covariate_df is not None else None if covariate_df is None: cov_matrix = np.ones((len(individual_ids), 1)) if snp_cov_df is not None: snp_cov_df_tmp = snp_cov_df.loc[individual_ids,:] snp_cov_df_tmp.index=sample2individual_feature['sample'] snp_cov_df = pd.DataFrame(fill_NaN.fit_transform(snp_cov_df_tmp)) snp_cov_df.index=snp_cov_df_tmp.index snp_cov_df.columns=snp_cov_df_tmp.columns cov_matrix = np.concatenate((cov_matrix,snp_cov_df.values),1) snp_cov_df_tmp = None snp_cov_df = None cov_matrix = cov_matrix.astype(float) ####################################################################################################################################################### else: print ('There is an issue in mapping phenotypes vs covariates and/or kinship') sys.exit() 
########################################################################################################################################################### # force normal distribution of expression values phenotype_ds = pd.Series(data= utils.force_normal_distribution(phenotype_ds.values,method=gaussianize_method) if gaussianize_method is not None else phenotype_ds.values,index=phenotype_ds.index,name=phenotype_ds.name) ########################################################################################################################################################### ########################################################################################################################################################## # Regressing up covariates if debugger: fun_start = time.time() if regressCovariatesUpfront: phenotype = phenotype_ds.values ##Using LMM/LM to regress covariates up front. # Computing Null Model if debugger: fun_start = time.time() if randomeff_mix: mixingParameters = utils.rhoTest(None, phenotype,cov_matrix,Sigma_qs,mixed, None) lmm = mixingParameters["lmm"] log[(feature_id)].append(mixingParameters["rho"]) feature_best_rho = mixingParameters["rho"] if mixingParameters["rho"]!=0: print("Random effect has influence, mixing parameter: "+str(mixingParameters["rho"])) else : print("Only kinship has effect.") else: lmm = LMM(phenotype, cov_matrix, QS) if not mixed: lmm.delta = 1 lmm.fix('delta') lmm.fit(verbose=False) if debugger: fun_end = time.time() print(" Computing Null model took {}".format(fun_end-fun_start)) #Replace phenotype with corrected phenotype: phenotype_ds = pd.Series(data= (phenotype-cov_matrix[:,1:].dot(lmm.beta[1:])),index=phenotype_ds.index,name=phenotype_ds.name) if debugger: fun_end = time.time() print(" Regressing Covariates took {}".format(fun_end-fun_start)) ########################################################################################################################################################## if debugger: 
fun_start = time.time() ########################################################################################################################################################## # Fast scanning - iterate according to a blocksize countChunker = 0 for snpGroup in utils.chunker(snpQuery, blocksize): countChunker=countChunker+1 #print(countChunker) #Fix seed at the start of the first chunker so all permutations are based on the same random first split. np.random.seed(seed) #print(snpGroup) snp_idxs = snpGroup['i'].values snp_names = snpGroup['snp'].values tested_snp_ids.extend(snp_names) #subset genotype matrix, we cannot subselect at the same time, do in two steps. if debugger: fun_start = time.time() ########################################################################################################################################################## # SNP dataframe creation if(plinkGenotype): snp_df = pd.DataFrame(data=bed[snp_idxs,:].compute().transpose(),index=fam.index,columns=snp_names) else : snp_df_dosage = pd.DataFrame(np.nan,index=fam.index, columns = snp_names) snp_df = pd.DataFrame(np.nan,index=fam.index, columns = snp_names) rowNumber = 0 for snpId in snp_idxs : geno = bgen["genotype"][snpId].compute() if (geno["ploidy"].min()>1 & geno["ploidy"].max()<3) : if(geno["phased"]): snp_df_dosage_t = geno["probs"][:,[0,2]].sum(1).astype(float) snp_df_t = (np.abs(np.argmax(geno["probs"][:,:2], axis=1)-1)+np.abs(np.argmax(geno["probs"][:,2:4], axis=1)-1)).astype(float) naId = (np.amax(geno["probs"][:,:2],1)+np.amax(geno["probs"][:,2:4],1))<(1+minimumProbabilityStep) snp_df_dosage_t[naId] = float('NaN') snp_df_t[naId] = float('NaN') else : snp_df_dosage_t = ((geno["probs"][:,0]* 2)+geno["probs"][:,1]).astype(float) snp_df_t = (np.abs(np.argmax(geno["probs"][:,:3], axis=1)-2)).astype(float) naId = np.amax(geno["probs"][:,:3],1)<((1/3)+minimumProbabilityStep) snp_df_dosage_t[naId] = float('NaN') snp_df_t[naId] = float('NaN') 
snp_df_dosage.loc[:,snp_names[rowNumber]] = snp_df_dosage_t snp_df.loc[:,snp_names[rowNumber]] = snp_df_t rowNumber = rowNumber +1 snp_df_dosage = snp_df_dosage.loc[individual_ids,:] snp_df = snp_df.loc[individual_ids,:] snp_df = snp_df.loc[:,np.unique(snp_df.columns)[np.unique(snp_df.columns,return_counts=1)[1]==1]] if debugger: fun_end = time.time() print(" Subsetting genotype matrix took {}".format(fun_end-fun_start)) ########################################################################################################################################################## #SNP QC. if debugger: fun_start = time.time() if not contains_missing_samples: #remove SNPs from snp_df if they have previously failed QC snp_df = snp_df.loc[:,snp_df.columns[~snp_df.columns.isin(fail_qc_snps_all)]] if snp_df.shape[1] == 0: continue snps_to_test_df = snp_df.loc[:,snp_df.columns[~snp_df.columns.isin(pass_qc_snps_all)]] if snps_to_test_df.shape[1] > 0: #Only do QC on relevant SNPs. join pre-QCed list and new QCed list. if kinship_df is not None: passed_snp_names,failed_snp_names,call_rate,maf,hweP = do_snp_qc(snps_to_test_df.iloc[np.unique(snps_to_test_df.index,return_index=1)[1]].loc[geneticaly_unique_individuals,:], min_call_rate, min_maf, min_hwe_P) else: passed_snp_names,failed_snp_names,call_rate,maf,hweP = do_snp_qc(snps_to_test_df, min_call_rate, min_maf, min_hwe_P) snps_to_test_df = None #append snp_names and failed_snp_names pass_qc_snps_all.extend(passed_snp_names) fail_qc_snps_all.extend(failed_snp_names) snp_df = snp_df.loc[:,snp_df.columns[snp_df.columns.isin(pass_qc_snps_all)]] else: #Do snp QC for relevant section. 
#Get relevant slice from: phenotype_ds if kinship_df is not None: passed_snp_names,failed_snp_names,call_rate,maf,hweP = do_snp_qc(snp_df.iloc[np.unique(snp_df.index,return_index=1)[1]].loc[geneticaly_unique_individuals,:], min_call_rate, min_maf, min_hwe_P) else: passed_snp_names,failed_snp_names,call_rate,maf,hweP = do_snp_qc(snp_df, min_call_rate, min_maf, min_hwe_P) snp_df = snp_df.loc[:,snp_df.columns[snp_df.columns.isin(passed_snp_names)]] snpQcInfo_t = None if call_rate is not None: snpQcInfo_t = call_rate if maf is not None: snpQcInfo_t = pd.concat([snpQcInfo_t,maf.reindex(snpQcInfo_t.index)],axis=1) if hweP is not None: snpQcInfo_t = pd.concat([snpQcInfo_t,hweP.reindex(snpQcInfo_t.index)],axis=1) if debugger: fun_end = time.time() print(" SNP quality control took {}".format(fun_end-fun_start)) ########################################################################################################################################################## call_rate = None maf = None hweP = None if snpQcInfo is None and snpQcInfo_t is not None: snpQcInfo = snpQcInfo_t elif snpQcInfo_t is not None: snpQcInfo = pd.concat([snpQcInfo, snpQcInfo_t], axis=0, sort = False) ##First process SNPQc than check if si can continue. if len(snp_df.columns) == 0: continue ##If we use bgen we replace the genotypes here to only have the dosage matrix in mem. Trying to save some memory. if (not plinkGenotype): snp_df= snp_df_dosage.loc[:,np.unique(snp_df.columns)] snp_df_dosage = None #We could make use of relatedness when imputing. And impute only based on genetically unique individuals. 
snp_df = pd.DataFrame(fill_NaN.fit_transform(snp_df),index=snp_df.index,columns=snp_df.columns) ##No more snp_matrix_DF > snp_df # test if the covariates, kinship, snp and phenotype are in the same order if (len(snp_df.index) != len(sample2individual_feature.loc[phenotype_ds.index]['iid']) or not all(snp_df.index==sample2individual_feature.loc[phenotype_ds.index]['iid'])): print ('There is an issue in mapping phenotypes and genotypes') sys.exit() ########################################################################################################################################################## # SCANNING if debugger: fun_start = time.time() rho = [None] * snp_df.shape[1] pVal = [None] * snp_df.shape[1] if pearson: for snpPos in range(0, snp_df.shape[1]) : rho[snpPos], pVal[snpPos] = sp.stats.pearsonr(snp_df.values[:,snpPos], phenotype_ds.values) else: for snpPos in range(0, snp_df.shape[1]) : rho[snpPos], pVal[snpPos] = sp.stats.spearmanr(snp_df.values[:,snpPos], phenotype_ds.values) if debugger: fun_end = time.time() print(" Actual scanning took {}".format(fun_end-fun_start)) ######################################################################################################################################################### #add these results to qtl_results temp_df = pd.DataFrame(index = range(len(snp_df.columns)),columns=['feature_id','snp_id','p_value','beta','beta_se','empirical_feature_p_value']) temp_df['snp_id'] = snp_df.columns temp_df['feature_id'] = feature_id.replace("/","-") temp_df['beta'] = np.asarray(rho) temp_df['p_value'] = np.asarray(pVal) #insert default dummy value temp_df['beta_se'] = None temp_df['empirical_feature_p_value'] = -1.0 ########################################################################################################################################################## # SCANNING if debugger: fun_start = time.time() if(n_perm!=0): pValueBuffer = [] totalSnpsToBeTested = (snp_df.shape[1]*n_perm) permutationStepSize = 
np.floor(n_perm/(totalSnpsToBeTested/blocksize)) if(permutationStepSize>n_perm): permutationStepSize=n_perm elif(permutationStepSize<1): permutationStepSize=1 if(write_permutations): print("Not supported.") #perm_df = pd.DataFrame(index = range(len(snp_df.columns)),columns=['snp_id'] + ['permutation_'+str(x) for x in range(n_perm)]) #perm_df['snp_id'] = snp_df.columns for currentNperm in utils.chunker(list(range(1, n_perm+1)), permutationStepSize): if (kinship_df is not None) and (relatedness_score is not None): temp = utils.get_shuffeld_genotypes_preserving_kinship(geneticaly_unique_individuals, relatedness_score, snp_df,kinship_df.loc[individual_ids,individual_ids], len(currentNperm)) else: temp = utils.get_shuffeld_genotypes(snp_df, len(currentNperm)) temp = temp.astype(float) var_pvalues_p = [None] * temp.shape[1] if pearson: for snpPos in range(0, temp.shape[1]) : rhoP, var_pvalues_p[snpPos] = sp.stats.pearsonr(temp[:,snpPos], phenotype_ds.values) else : for snpPos in range(0, temp.shape[1]) : rhoP, var_pvalues_p[snpPos] = sp.stats.spearmanr(temp[:,snpPos], phenotype_ds.values) pValueBuffer.extend(np.asarray(var_pvalues_p)) if(not(len(pValueBuffer)==totalSnpsToBeTested)): print(len(pValueBuffer)) print(pValueBuffer) print(totalSnpsToBeTested) print('Error in blocking logic for permutations.') sys.exit() perm = 0 for relevantOutput in utils.chunker(pValueBuffer,snp_df.shape[1]) : #if(write_permutations): # perm_df['permutation_'+str(perm)] = relevantOutput if(bestPermutationPval[perm] > min(relevantOutput)): bestPermutationPval[perm] = min(relevantOutput) perm = perm+1 #print(relevantOutput) #print('permutation_'+str(perm)) if not temp_df.empty : data_written = True output_writer.add_result_df(temp_df) #if(write_permutations): # permutation_writer.add_permutation_results_df(perm_df,feature_id) if debugger: fun_end = time.time() print(" Permutations took {}".format(fun_end-fun_start)) #This we need to change in the written file. 
if debugger: fun_start = time.time() if not data_written : fail_qc_features.append(feature_id) else: n_samples.append(phenotype_ds.size) n_e_samples.append(len(geneticaly_unique_individuals)) if n_perm>1 : #updated_permuted_p_in_hdf5(bestPermutationPval, feature_id); alpha_para, beta_para = output_writer.apply_pval_correction(feature_id.replace("/","-"),bestPermutationPval, cis_mode) if write_feature_top_permutations: np.savetxt(output_dir+"/Permutation.pValues."+feature_id.replace("/","-")+".txt",bestPermutationPval) alpha_params.append(alpha_para) beta_params.append(beta_para) if randomeff_mix : random_eff_param.append(feature_best_rho) if contains_missing_samples: QS = None Sigma_qs = None geneticaly_unique_individuals = tmp_unique_individuals del tmp_unique_individuals if snpQcInfo is not None: snpQcInfo.index.name = "snp_id" snpQcInfo.to_csv(output_dir+'/snp_qc_metrics_naContaining_feature_{}.txt'.format(feature_id.replace("/","-")),sep='\t') else: if (snpQcInfo is not None and snpQcInfoMain is not None): snpQcInfoMain = pd.concat([snpQcInfoMain, snpQcInfo], axis=0, sort=False) elif snpQcInfo is not None : snpQcInfoMain = snpQcInfo.copy(deep=True) if debugger: fun_end = time.time() print(" Writing took {}".format(fun_end-fun_start)) #if snpQcInfo is not None: #snpQcInfo2 = snpQcInfo.copy().transpose() #snpQcInfo2.to_csv(output_dir+'/snp_qc_metrics_feature_{}.txt'.format(feature_id),sep='\t') #print('step 5') print("Time: --- %s seconds ---" % (time.time() - start_time)) tot_time += time.time() - start_time idx += 1 print("Mean: --- %s seconds ---" % (tot_time/idx)) log[(feature_id)].append((time.time() - start_time)) log[(feature_id)].append(tot_time/idx) output_writer.close() if(write_permutations): permutation_writer.close() fail_qc_features = np.unique(fail_qc_features) if((len(feature_list)-len(fail_qc_features))==0): time.sleep(15) #Safety timer to make sure the file is unlocked. print("Trying to remove the h5 file. 
Nothing has been tested.") print(output_dir+'qtl_results_{}_{}_{}.h5'.format(chromosome,selectionStart,selectionEnd)) if not selectionStart is None : os.remove(output_dir+'qtl_results_{}_{}_{}.h5'.format(chromosome,selectionStart,selectionEnd)) else : os.remove(output_dir+'qtl_results_{}.h5'.format(chromosome)) sys.exit() #gather unique indexes of tested SNPs tested_snp_ids = list(set(tested_snp_ids)) #write annotation and snp data to file snp_df = pd.DataFrame() snp_df['snp_id'] = bim['snp'] snp_df['chromosome'] = bim['chrom'] snp_df['position'] = bim['pos'] snp_df['assessed_allele'] = bim['a1'] snp_df.index = snp_df['snp_id'] snp_df = snp_df.drop_duplicates() snp_df = snp_df.reindex(tested_snp_ids) snp_df = snp_df.drop_duplicates() if snpQcInfoMain is not None : snpQcInfoMain['index']=snpQcInfoMain.index snpQcInfoMain = snpQcInfoMain.drop_duplicates() del snpQcInfoMain['index'] snp_df = pd.concat([snp_df, snpQcInfoMain.reindex(snp_df.index)], axis=1) if(snp_df.shape[1]==5): snp_df.columns = ['snp_id','chromosome','position','assessed_allele','call_rate'] elif(snp_df.shape[1]==6): snp_df.columns = ['snp_id','chromosome','position','assessed_allele','call_rate','maf'] else : snp_df.columns = ['snp_id','chromosome','position','assessed_allele','call_rate','maf','hwe_p'] feature_list = list(set(feature_list) - set(fail_qc_features)) annotation_df = annotation_df.reindex(feature_list) annotation_df['n_samples'] = n_samples annotation_df['n_e_samples'] = n_e_samples if(n_perm>1): annotation_df['alpha_param'] = alpha_params annotation_df['beta_param'] = beta_params if randomeff_mix: annotation_df['rho'] = random_eff_param if not selectionStart is None : snp_df.to_csv(output_dir+'/snp_metadata_{}_{}_{}.txt'.format(chromosome,selectionStart,selectionEnd),sep='\t',index=False) annotation_df.to_csv(output_dir+'/feature_metadata_{}_{}_{}.txt'.format(chromosome,selectionStart,selectionEnd),sep='\t') else : 
snp_df.to_csv(output_dir+'/snp_metadata_{}.txt'.format(chromosome),sep='\t',index=False) annotation_df.to_csv(output_dir+'/feature_metadata_{}.txt'.format(chromosome),sep='\t') if not selectionStart is None : print("saving log!") print(log) #pd.DataFrame.from_dict(log, orient="index", columns=["rho","start","mean"]).to_csv(output_dir + "/" + str(chromosome) + "_" + str(selectionStart) + "_" + str(selectionEnd) + "_log_rho.txt", sep="\t") else: print("saving log!")
def run_QTL_analysis(pheno_filename, anno_filename, geno_prefix, plinkGenotype, output_dir, window_size=250000, min_maf=0.05, min_hwe_P=0.001, min_call_rate=0.95, blocksize=1000, cis_mode=True, skipAutosomeFiltering=False, gaussianize_method=None, minimum_test_samples=10, seed=np.random.randint(40000), n_perm=0, write_permutations=False, relatedness_score=0.95, feature_variant_covariate_filename=None, snps_filename=None, feature_filename=None, snp_feature_filename=None, genetic_range='all', covariates_filename=None, kinship_filename=None, sample_mapping_filename=None, extended_anno_filename=None, regressCovariatesUpfront=False): fill_NaN = Imputer(missing_values=np.nan, strategy='mean', axis=0, copy=False) print('Running QTL analysis.') lik = 'normal' minimumProbabilityStep = 0.1 '''Core function to take input and run QTL tests on a given chromosome.''' if relatedness_score is not None: relatedness_score = float(relatedness_score) [phenotype_df, kinship_df, covariate_df, sample2individual_df,complete_annotation_df, annotation_df, snp_filter_df, snp_feature_filter_df, geneticaly_unique_individuals, minimum_test_samples, feature_list, bim, fam, bed, bgen, chromosome, selectionStart, selectionEnd, feature_variant_covariate_df]=\ utils.run_QTL_analysis_load_intersect_phenotype_covariates_kinship_sample_mapping(pheno_filename=pheno_filename, anno_filename=anno_filename, geno_prefix=geno_prefix, plinkGenotype=plinkGenotype, cis_mode=cis_mode, skipAutosomeFiltering = skipAutosomeFiltering, minimum_test_samples= minimum_test_samples, relatedness_score=relatedness_score, snps_filename=snps_filename, feature_filename=feature_filename, snp_feature_filename=snp_feature_filename, selection=genetic_range, covariates_filename=covariates_filename, kinship_filename=kinship_filename, sample_mapping_filename=sample_mapping_filename, extended_anno_filename=extended_anno_filename, feature_variant_covariate_filename=feature_variant_covariate_filename) mixed = kinship_df is not None if 
(kinship_df is None) or (relatedness_score is None): geneticaly_unique_individuals = sample2individual_df['iid'].values QS = None if (feature_list == None or len(feature_list) == 0): print('No features to be tested.') sys.exit() #Open output files qtl_loader_utils.ensure_dir(output_dir) if not selectionStart is None: output_writer = qtl_output.hdf5_writer( output_dir + '/qtl_results_{}_{}_{}.h5'.format( chromosome, selectionStart, selectionEnd)) else: output_writer = qtl_output.hdf5_writer( output_dir + '/qtl_results_{}.h5'.format(chromosome)) if (write_permutations): if not selectionStart is None: permutation_writer = qtl_output.hdf5_permutations_writer( output_dir + '/perm_results_{}_{}_{}.h5'.format( chromosome, selectionStart, selectionEnd), n_perm) else: permutation_writer = qtl_output.hdf5_permutations_writer( output_dir + '/perm_results_{}.h5'.format(chromosome), n_perm) #Arrays to store indices of snps tested and pass and fail QC SNPs for features without missingness. tested_snp_ids = [] pass_qc_snps_all = [] fail_qc_snps_all = [] fail_qc_features = [] alpha_params = [] beta_params = [] n_samples = [] n_e_samples = [] na_containing_features = 0 currentFeatureNumber = 0 snpQcInfoMain = None for feature_id in feature_list: snpQcInfo = None currentFeatureNumber += 1 if (len(phenotype_df.loc[feature_id, :])) < minimum_test_samples: print("Feature: " + feature_id + " not tested not enough samples do QTL test.") fail_qc_features.append(feature_id) geneticaly_unique_individuals = tmp_unique_individuals continue data_written = False contains_missing_samples = False snpQuery = utils.do_snp_selection(feature_id, complete_annotation_df, bim, cis_mode, window_size, skipAutosomeFiltering) snp_cov_df = None if (feature_variant_covariate_df is not None): if (feature_id in feature_variant_covariate_df['feature'].values): covariateSnp = feature_variant_covariate_df['snp_id'].values[ feature_variant_covariate_df['feature'] == feature_id] if (any(i in bim['snp'].values for i 
in covariateSnp)): snpQuery_cov = bim.loc[ bim['snp'].map(lambda x: x in list(covariateSnp)), :] if (plinkGenotype): snp_cov_df = pd.DataFrame( data=bed[snpQuery_cov['i'].values, :].compute(). transpose(), index=fam.index, columns=snpQuery_cov['snp'], ) else: ##Here we make some assumptions on the SNPs. They are expected to be ploidy 2! ##Also we don't use a minimal quality to assure a value is present for all samples. print( 'Warning, during the regression of SNPs we assume ploidy 2.' ) snp_cov_df_t = pd.DataFrame(columns=fam.index) rowNumber = 0 for snpId in snpQuery_cov['i']: geno = bgen["genotype"][snpId].compute() if (geno["phased"]): snp_df_dosage_t = geno["probs"][:, [0, 2]].sum( 1).astype(float) snp_df_dosage_t[( np.amax(geno["probs"][:, :2], 1) + np.amax(geno["probs"][:, 2:4], 1)) < ( 1 + minimumProbabilityStep)] = float('NaN') else: snp_df_dosage_t = (geno["probs"][:, 0] * 2) + geno["probs"][:, 1] snp_df_dosage_t[ np.amax(geno["probs"][:, :3], 1) < ( (1 / 3) + minimumProbabilityStep)] = float('NaN') snp_df_dosage_t = pd.Series(snp_df_dosage_t, index=fam.index) snp_df_dosage_t.name = snpId snp_cov_df_t = snp_cov_df_t.append(snp_df_dosage_t) rowNumber = rowNumber + 1 snp_cov_df_t = snp_cov_df_t.transpose() if (len(snpQuery) != 0) and (snp_filter_df is not None): toSelect = set(snp_filter_df.index).intersection( set(snpQuery['snp'])) snpQuery = snpQuery.loc[snpQuery['snp'].isin(toSelect)] if (len(snpQuery) != 0) and (snp_feature_filter_df is not None): toSelect = set( np.unique(snp_feature_filter_df['snp_id'].loc[ snp_feature_filter_df['feature'] == feature_id])).intersection(set(snpQuery['snp'])) snpQuery = snpQuery.loc[snpQuery['snp'].isin(toSelect)] if len(snpQuery) == 0: print("Feature: " + feature_id + " not tested. 
No SNPS passed QC for phenotype.") fail_qc_features.append(feature_id) continue else: phenotype_ds = phenotype_df.loc[feature_id] contains_missing_samples = any(~np.isfinite(phenotype_ds)) if (contains_missing_samples): print('Feature: ' + feature_id + ' contains missing data.') phenotype_ds.dropna(inplace=True) na_containing_features = na_containing_features + 1 '''select indices for relevant individuals in genotype matrix These are not unique. NOT to be used to access phenotype/covariates data ''' individual_ids = sample2individual_df.loc[phenotype_ds.index, 'iid'].values sample2individual_feature = sample2individual_df.loc[ phenotype_ds.index] if (contains_missing_samples): tmp_unique_individuals = geneticaly_unique_individuals if (kinship_df is not None) and (relatedness_score is not None): geneticaly_unique_individuals = utils.get_unique_genetic_samples( kinship_df.loc[individual_ids, individual_ids], relatedness_score) else: geneticaly_unique_individuals = individual_ids else: #If no missing samples we can use the previous SNP Qc information before actually loading data. 
#This allows for more efficient blocking and retrieving of data snpQuery = snpQuery.loc[snpQuery['snp'].map( lambda x: x not in list(map(str, fail_qc_snps_all)))] if phenotype_ds.empty or len( geneticaly_unique_individuals) < minimum_test_samples: print("Feature: " + feature_id + " not tested not enough samples do QTL test.") fail_qc_features.append(feature_id) if contains_missing_samples: geneticaly_unique_individuals = tmp_unique_individuals continue elif np.var(phenotype_ds.values) == 0: print("Feature: " + feature_id + " has no variance in selected individuals.") fail_qc_features.append(feature_id) if contains_missing_samples: geneticaly_unique_individuals = tmp_unique_individuals continue print('For feature: ' + str(currentFeatureNumber) + '/' + str(len(feature_list)) + ' (' + feature_id + '): ' + str(snpQuery.shape[0]) + ' SNPs need to be tested.\n Please stand by.') if (n_perm != 0): bestPermutationPval = np.ones((n_perm), dtype=np.float) #Here we need to start preparing the LMM, can use the fam for sample IDS in SNP matrix. #test if the covariates, kinship, snp and phenotype are in the same order if ((all(kinship_df.loc[individual_ids,individual_ids].index==sample2individual_feature.loc[phenotype_ds.index]['iid']) if kinship_df is not None else True) &\ (all(phenotype_ds.index==covariate_df.loc[sample2individual_feature['sample'],:].index)if covariate_df is not None else True)): ''' if all lines are in order put in arrays the correct genotype and phenotype x=a if cond1 else b <---> equivalent to if cond1: x=a else x=b; better readability of the code ''' if kinship_df is not None: kinship_mat = kinship_df.loc[individual_ids, individual_ids].values kinship_mat = kinship_mat.astype(float) ##GOWER normalization of Kinship matrix. kinship_mat *= (kinship_mat.shape[0] - 1) / ( kinship_mat.trace() - kinship_mat.mean(0).sum()) ## This needs to go with the subselection stuff. 
if (QS is None and not contains_missing_samples): QS = economic_qs(kinship_mat) elif (contains_missing_samples): QS_tmp = QS QS = economic_qs(kinship_mat) if kinship_df is None: K = np.eye(len(phenotype_ds.index)) if (QS is None and not contains_missing_samples): QS = economic_qs(K) elif (contains_missing_samples): QS_tmp = QS QS = economic_qs(K) cov_matrix = covariate_df.loc[sample2individual_feature[ 'sample'], :].values if covariate_df is not None else None if covariate_df is None: cov_matrix = np.ones((len(individual_ids), 1)) if snp_cov_df is not None: snp_cov_df_tmp = snp_cov_df.loc[individual_ids, :] snp_cov_df_tmp.index = sample2individual_feature['sample'] snp_cov_df = pd.DataFrame( fill_NaN.fit_transform(snp_cov_df_tmp)) snp_cov_df.index = snp_cov_df_tmp.index snp_cov_df.columns = snp_cov_df_tmp.columns cov_matrix = np.concatenate( (cov_matrix, snp_cov_df.values), 1) snp_cov_df_tmp = None snp_cov_df = None cov_matrix = cov_matrix.astype(float) else: print( 'There is an issue in mapping phenotypes vs covariates and/or kinship' ) sys.exit() phenotype = utils.force_normal_distribution( phenotype_ds.values, method=gaussianize_method ) if gaussianize_method is not None else phenotype_ds.values #Prepare LMM phenotype = phenotype.astype(float) ##Mixed and test. ##This is a future change so we don't need to decompose the COVs every time. ##Like QS this needs to happen when genetic unique individuals is the same. #svd_cov = economic_svd(cov_matrix) #lmm = LMM(phenotype, cov_matrix, QS, SVD=svd_cov) #These steps need to happen only once per phenotype. #print(QS) lmm = LMM(phenotype, cov_matrix, QS) if not mixed: lmm.delta = 1 lmm.fix('delta') #Prepare null model. 
lmm.fit(verbose=False) if regressCovariatesUpfront: phenotype_corrected = phenotype - cov_matrix[:, 1:].dot( lmm.beta[1:]) cov_matrix_corrected = cov_matrix[:, 0] lmm = LMM(phenotype_corrected, cov_matrix_corrected, QS) lmm.fit(verbose=False) null_lml = lmm.lml() flmm = lmm.get_fast_scanner() countChunker = 0 for snpGroup in utils.chunker(snpQuery, blocksize): countChunker = countChunker + 1 #print(countChunker) #Fix seed at the start of the first chunker so all permutations are based on the same random first split. np.random.seed(seed) #print(snpGroup) snp_idxs = snpGroup['i'].values snp_names = snpGroup['snp'].values tested_snp_ids.extend(snp_names) #subset genotype matrix, we cannot subselect at the same time, do in two steps. if (plinkGenotype): snp_df = pd.DataFrame( data=bed[snp_idxs, :].compute().transpose(), index=fam.index, columns=snp_names) else: snp_df_dosage = pd.DataFrame(np.nan, index=fam.index, columns=snp_names) snp_df = pd.DataFrame(np.nan, index=fam.index, columns=snp_names) rowNumber = 0 for snpId in snp_idxs: geno = bgen["genotype"][snpId].compute() if (geno["ploidy"].min() > 1 & geno["ploidy"].max() < 3): if (geno["phased"]): snp_df_dosage_t = geno["probs"][:, [0, 2]].sum( 1).astype(float) snp_df_t = (np.abs( np.argmax(geno["probs"][:, :2], axis=1) - 1 ) + np.abs( np.argmax(geno["probs"][:, 2:4], axis=1) - 1)).astype(float) naId = (np.amax(geno["probs"][:, :2], 1) + np.amax(geno["probs"][:, 2:4], 1)) < ( 1 + minimumProbabilityStep) snp_df_dosage_t[naId] = float('NaN') snp_df_t[naId] = float('NaN') else: snp_df_dosage_t = ( (geno["probs"][:, 0] * 2) + geno["probs"][:, 1]).astype(float) snp_df_t = (np.abs( np.argmax(geno["probs"][:, :3], axis=1) - 2)).astype(float) naId = np.amax(geno["probs"][:, :3], 1) < ( (1 / 3) + minimumProbabilityStep) snp_df_dosage_t[naId] = float('NaN') snp_df_t[naId] = float('NaN') snp_df_dosage.loc[:, snp_names[ rowNumber]] = snp_df_dosage_t snp_df.loc[:, snp_names[rowNumber]] = snp_df_t rowNumber = rowNumber + 1 
snp_df_dosage = snp_df_dosage.loc[individual_ids, :] snp_df = snp_df.loc[individual_ids, :] snp_df = snp_df.loc[:, np.unique(snp_df.columns)[ np.unique(snp_df.columns, return_counts=1)[1] == 1]] #SNP QC. if not contains_missing_samples: #remove SNPs from snp_df if they have previously failed QC snp_df = snp_df.loc[:, snp_df.columns[~snp_df.columns. isin(fail_qc_snps_all)]] if snp_df.shape[1] == 0: continue snps_to_test_df = snp_df.loc[:, snp_df.columns[ ~snp_df.columns.isin(pass_qc_snps_all)]] if snps_to_test_df.shape[1] > 0: #Only do QC on relevant SNPs. join pre-QCed list and new QCed list. if kinship_df is not None: passed_snp_names, failed_snp_names, call_rate, maf, hweP = do_snp_qc( snps_to_test_df.iloc[np.unique( snps_to_test_df.index, return_index=1)[1]].loc[ geneticaly_unique_individuals, :], min_call_rate, min_maf, min_hwe_P) else: passed_snp_names, failed_snp_names, call_rate, maf, hweP = do_snp_qc( snps_to_test_df, min_call_rate, min_maf, min_hwe_P) snps_to_test_df = None #append snp_names and failed_snp_names pass_qc_snps_all.extend(passed_snp_names) fail_qc_snps_all.extend(failed_snp_names) snp_df = snp_df.loc[:, snp_df.columns[snp_df.columns. isin(pass_qc_snps_all)]] else: #Do snp QC for relevant section. #Get relevant slice from: phenotype_ds if kinship_df is not None: passed_snp_names, failed_snp_names, call_rate, maf, hweP = do_snp_qc( snp_df.iloc[np.unique( snp_df.index, return_index=1)[1]].loc[ geneticaly_unique_individuals, :], min_call_rate, min_maf, min_hwe_P) else: passed_snp_names, failed_snp_names, call_rate, maf, hweP = do_snp_qc( snp_df, min_call_rate, min_maf, min_hwe_P) snp_df = snp_df.loc[:, snp_df.columns[snp_df.columns. 
isin(passed_snp_names)]] snpQcInfo_t = None if call_rate is not None: snpQcInfo_t = call_rate if maf is not None: snpQcInfo_t = pd.concat( [snpQcInfo_t, maf.reindex(snpQcInfo_t.index)], axis=1) if hweP is not None: snpQcInfo_t = pd.concat( [snpQcInfo_t, hweP.reindex(snpQcInfo_t.index)], axis=1) call_rate = None maf = None hweP = None if snpQcInfo is None and snpQcInfo_t is not None: snpQcInfo = snpQcInfo_t elif snpQcInfo_t is not None: snpQcInfo = pd.concat([snpQcInfo, snpQcInfo_t], axis=0, sort=False) ##First process SNPQc than check if we can continue. if len(snp_df.columns) == 0: continue elif (not plinkGenotype): snp_df_dosage = snp_df_dosage.loc[:, np.unique(snp_df.columns )] #We could make use of relatedness when imputing. And impute only based on genetically unique individuals. snp_df = pd.DataFrame(fill_NaN.fit_transform(snp_df), index=snp_df.index, columns=snp_df.columns) if (not plinkGenotype): snp_df_dosage = pd.DataFrame( fill_NaN.fit_transform(snp_df_dosage), index=snp_df_dosage.index, columns=snp_df_dosage.columns) ##No more snp_matrix_DF > snp_df # test if the covariates, kinship, snp and phenotype are in the same order if (len(snp_df.index) != len(sample2individual_feature.loc[ phenotype_ds.index]['iid']) or not all(snp_df.index == sample2individual_feature. 
loc[phenotype_ds.index]['iid'])): print( 'There is an issue in mapping phenotypes and genotypes' ) sys.exit() G = snp_df.values if (not plinkGenotype): G = snp_df_dosage.values G = G.astype(float) G_index = snp_df.columns alt_lmls, effsizes = flmm.fast_scan(G, verbose=False) var_pvalues = lrt_pvalues(null_lml, alt_lmls) var_effsizes_se = effsizes_se(effsizes, var_pvalues) #add these results to qtl_results temp_df = pd.DataFrame(index=range(len(G_index)), columns=[ 'feature_id', 'snp_id', 'p_value', 'beta', 'beta_se', 'empirical_feature_p_value' ]) temp_df['snp_id'] = G_index temp_df['feature_id'] = feature_id temp_df['beta'] = np.asarray(effsizes) temp_df['p_value'] = np.asarray(var_pvalues) temp_df['beta_se'] = np.asarray(var_effsizes_se) #insert default dummy value temp_df['empirical_feature_p_value'] = -1.0 if (n_perm != 0): pValueBuffer = [] totalSnpsToBeTested = (G.shape[1] * n_perm) permutationStepSize = np.floor( n_perm / (totalSnpsToBeTested / blocksize)) if (permutationStepSize > n_perm): permutationStepSize = n_perm elif (permutationStepSize < 1): permutationStepSize = 1 if (write_permutations): perm_df = pd.DataFrame( index=range(len(G_index)), columns=['snp_id'] + ['permutation_' + str(x) for x in range(n_perm)]) perm_df['snp_id'] = G_index for currentNperm in utils.chunker( list(range(1, n_perm + 1)), permutationStepSize): if (kinship_df is not None) and (relatedness_score is not None): if (plinkGenotype): temp = utils.get_shuffeld_genotypes_preserving_kinship( geneticaly_unique_individuals, relatedness_score, snp_df, kinship_df.loc[individual_ids, individual_ids], len(currentNperm)) else: temp = utils.get_shuffeld_genotypes_preserving_kinship( geneticaly_unique_individuals, relatedness_score, snp_df_dosage, kinship_df.loc[individual_ids, individual_ids], len(currentNperm)) else: if (plinkGenotype): temp = utils.get_shuffeld_genotypes( snp_df, len(currentNperm)) else: temp = utils.get_shuffeld_genotypes( snp_df_dosage, len(currentNperm)) temp = 
temp.astype(float) alt_lmls_p, effsizes_p = flmm.fast_scan(temp, verbose=False) var_pvalues_p = lrt_pvalues(null_lml, alt_lmls_p) pValueBuffer.extend(np.asarray(var_pvalues_p)) if (not (len(pValueBuffer) == totalSnpsToBeTested)): #print(len(pValueBuffer)) #print(pValueBuffer) #print(totalSnpsToBeTested) print('Error in blocking logic for permutations.') sys.exit() perm = 0 for relevantOutput in utils.chunker( pValueBuffer, G.shape[1]): if (write_permutations): perm_df['permutation_' + str(perm)] = relevantOutput if (bestPermutationPval[perm] > min(relevantOutput)): bestPermutationPval[perm] = min(relevantOutput) perm = perm + 1 #print(relevantOutput) #print('permutation_'+str(perm)) if not temp_df.empty: data_written = True output_writer.add_result_df(temp_df) if (write_permutations): permutation_writer.add_permutation_results_df( perm_df, feature_id) #This we need to change in the written file. if (n_perm > 1 and data_written): #updated_permuted_p_in_hdf5(bestPermutationPval, feature_id); alpha_para, beta_para = output_writer.apply_pval_correction( feature_id, bestPermutationPval, cis_mode) #np.savetxt(output_dir+"/Permutation.pValues."+feature_id+".txt",bestPermutationPval) alpha_params.append(alpha_para) beta_params.append(beta_para) if not data_written: fail_qc_features.append(feature_id) else: n_samples.append(phenotype_ds.size) n_e_samples.append(len(geneticaly_unique_individuals)) if contains_missing_samples: QS = QS_tmp geneticaly_unique_individuals = tmp_unique_individuals del QS_tmp del tmp_unique_individuals if snpQcInfo is not None: snpQcInfo.index.name = "snp_id" snpQcInfo.to_csv( output_dir + '/snp_qc_metrics_naContaining_feature_{}.txt'.format( feature_id), sep='\t') else: if (snpQcInfo is not None and snpQcInfoMain is not None): snpQcInfoMain = pd.concat([snpQcInfoMain, snpQcInfo], axis=0, sort=False) elif snpQcInfo is not None: snpQcInfoMain = snpQcInfo.copy(deep=True) #if snpQcInfo is not None: #snpQcInfo2 = snpQcInfo.copy().transpose() 
#snpQcInfo2.to_csv(output_dir+'/snp_qc_metrics_feature_{}.txt'.format(feature_id),sep='\t') #print('step 5') output_writer.close() if (write_permutations): permutation_writer.close() fail_qc_features = np.unique(fail_qc_features) if ((len(feature_list) - len(fail_qc_features)) == 0): time.sleep(15) #Safety timer to make sure the file is unlocked. print("Trying to remove the h5 file. Nothing has been tested.") print(output_dir + 'qtl_results_{}_{}_{}.h5'.format( chromosome, selectionStart, selectionEnd)) if not selectionStart is None: os.remove(output_dir + 'qtl_results_{}_{}_{}.h5'.format( chromosome, selectionStart, selectionEnd)) else: os.remove(output_dir + 'qtl_results_{}.h5'.format(chromosome)) sys.exit() #gather unique indexes of tested SNPs tested_snp_ids = list(set(tested_snp_ids)) #write annotation and snp data to file snp_df = pd.DataFrame() snp_df['snp_id'] = bim['snp'] snp_df['chromosome'] = bim['chrom'] snp_df['position'] = bim['pos'] snp_df['assessed_allele'] = bim['a1'] snp_df.index = snp_df['snp_id'] snp_df = snp_df.drop_duplicates() snp_df = snp_df.reindex(tested_snp_ids) snp_df = snp_df.drop_duplicates() if snpQcInfoMain is not None: snpQcInfoMain['index'] = snpQcInfoMain.index snpQcInfoMain = snpQcInfoMain.drop_duplicates() del snpQcInfoMain['index'] snp_df = pd.concat( [snp_df, snpQcInfoMain.reindex(snp_df.index)], axis=1) if (snp_df.shape[1] == 5): snp_df.columns = [ 'snp_id', 'chromosome', 'position', 'assessed_allele', 'call_rate' ] elif (snp_df.shape[1] == 6): snp_df.columns = [ 'snp_id', 'chromosome', 'position', 'assessed_allele', 'call_rate', 'maf' ] else: snp_df.columns = [ 'snp_id', 'chromosome', 'position', 'assessed_allele', 'call_rate', 'maf', 'hwe_p' ] feature_list = list(set(feature_list) - set(fail_qc_features)) annotation_df = annotation_df.reindex(feature_list) annotation_df['n_samples'] = n_samples annotation_df['n_e_samples'] = n_e_samples if (n_perm > 1): annotation_df['alpha_param'] = alpha_params annotation_df['beta_param'] 
= beta_params if not selectionStart is None: snp_df.to_csv(output_dir + '/snp_metadata_{}_{}_{}.txt'.format( chromosome, selectionStart, selectionEnd), sep='\t', index=False) annotation_df.to_csv(output_dir + '/feature_metadata_{}_{}_{}.txt'.format( chromosome, selectionStart, selectionEnd), sep='\t') else: snp_df.to_csv(output_dir + '/snp_metadata_{}.txt'.format(chromosome), sep='\t', index=False) annotation_df.to_csv(output_dir + '/feature_metadata_{}.txt'.format(chromosome), sep='\t')
def run_plots(plinkGenotype, geno_prefix, annotation_filename,
              phenotype_filename, covariate_filename,
              top_qtl_results_filename, output_directory,
              sample_mapping_filename, randomeff_filename):
    """Fit a null LMM per top-QTL feature and pass the result to ``qtl_plots``.

    For every row returned by ``qtl_loader_utils.get_top_qtl_results`` the
    phenotype of that row's ``feature_id`` is normalised with
    ``utils.force_normal_distribution(..., method="gaussnorm")``, a null
    ``LMM`` is fitted, the covariate contribution (all covariate columns
    except the first) is subtracted, and both the raw and the corrected
    phenotype series are forwarded to ``qtl_plots``.

    Parameters
    ----------
    plinkGenotype :
        Forwarded to the data loader and to ``qtl_plots``.
    geno_prefix, annotation_filename, phenotype_filename,
    covariate_filename, sample_mapping_filename, randomeff_filename :
        Forwarded to
        ``utils.run_QTL_analysis_load_intersect_phenotype_covariates_kinship_sample_mapping``.
    top_qtl_results_filename :
        Forwarded to ``qtl_loader_utils.get_top_qtl_results``.
    output_directory :
        Accepted for interface compatibility; not used in this function.

    Returns
    -------
    None
    """
    [phenotype_df, kinship_df, readdepth_df, covariate_df,
     sample2individual_df, complete_annotation_df, annotation_df,
     snp_filter_df, snp_feature_filter_df, geneticaly_unique_individuals,
     minimum_test_samples, feature_list, bim, fam, bed, bgen, chromosome,
     selectionStart, selectionEnd, feature_variant_covariate_df] = \
        utils.run_QTL_analysis_load_intersect_phenotype_covariates_kinship_sample_mapping(
            pheno_filename=phenotype_filename,
            anno_filename=annotation_filename,
            geno_prefix=geno_prefix,
            plinkGenotype=plinkGenotype,
            cis_mode=True,
            skipAutosomeFiltering=False,
            minimum_test_samples=10,
            relatedness_score=None,
            snps_filename=None,
            feature_filename=None,
            snp_feature_filename=None,
            selection='all',
            covariates_filename=covariate_filename,
            randomeff_filename=randomeff_filename,
            sample_mapping_filename=sample_mapping_filename,
            extended_anno_filename=None,
            feature_variant_covariate_filename=None)

    # results
    top_qtl_results_df = qtl_loader_utils.get_top_qtl_results(
        top_qtl_results_filename)

    # Mixing weights tried when both random-effect matrices are present.
    rho1 = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
    randomeff_mix = (kinship_df is not None) and (readdepth_df is not None)
    # NOTE(review): the sibling analysis functions define
    # ``mixed = kinship_df is not None``. Here a read-depth matrix also counts
    # as a random effect, otherwise its decomposition would be built and then
    # ignored (delta fixed to 1) -- confirm this is the intended meaning.
    mixed = (kinship_df is not None) or (readdepth_df is not None)
    # Decomposition shared by every feature that has no missing samples.
    shared_QS = None
    # Best mixing weight per feature. Only lives for the duration of this call
    # because the function's interface (returns None) is kept unchanged.
    rho_log = {}

    def fit_null_model(phenotype, cov_matrix, decomposition):
        """Build and fit the null LMM; fix delta to 1 when no random effect."""
        model = LMM(phenotype, cov_matrix, decomposition)
        if not mixed:
            model.delta = 1
            model.fix('delta')
        model.fit(verbose=False)
        return model

    for row in top_qtl_results_df.iterrows():
        # ``row`` is the (index, Series) pair from iterrows(); qtl_plots
        # receives the pair unchanged.
        feature_id = row[1]["feature_id"]

        phenotype_ds = phenotype_df.loc[feature_id]
        # Same missing-data rule as the main analysis loop: drop samples
        # without a phenotype and do not reuse the shared decomposition.
        contains_missing_samples = any(~np.isfinite(phenotype_ds))
        if contains_missing_samples:
            print('Feature: ' + feature_id + ' contains missing data.')
            # Non in-place so phenotype_df itself is left untouched.
            phenotype_ds = phenotype_ds.dropna()

        individual_ids = sample2individual_df.loc[phenotype_ds.index,
                                                  'iid'].values
        sample2individual_feature = sample2individual_df.loc[
            phenotype_ds.index]

        QS = None
        Sigma_qs = {}
        if randomeff_mix:
            # combining the two matrices
            # NOTE(review): the mix is built from the full data frames, not
            # from a per-feature, Gower-normalised subset (the original
            # computed such a kinship subset here but never used it) --
            # confirm the frames line up with the phenotype samples.
            for rho in rho1:
                Sigma_qs[rho] = economic_qs(
                    rho * kinship_df + (1 - rho) * readdepth_df)
        else:
            if kinship_df is not None:
                kinship_mat = kinship_df.loc[individual_ids,
                                             individual_ids].values
                kinship_mat = kinship_mat.astype(float)
                ##GOWER normalization of Kinship matrix.
                kinship_mat *= (kinship_mat.shape[0] - 1) / (
                    kinship_mat.trace() - kinship_mat.mean(0).sum())
                random_effect = kinship_mat
            elif readdepth_df is not None:
                # NOTE(review): used as loaded, without sub-selecting the
                # samples of this feature -- verify against the loader.
                random_effect = readdepth_df
            else:
                # No random effect at all: identity stands in as a fake one.
                random_effect = np.eye(len(phenotype_ds.index))

            if contains_missing_samples:
                QS = economic_qs(random_effect)
            else:
                if shared_QS is None:
                    shared_QS = economic_qs(random_effect)
                QS = shared_QS

        # covariate matrix (a single column of ones when none was supplied)
        if covariate_df is not None:
            cov_matrix = covariate_df.loc[
                sample2individual_feature['sample'], :].values
        else:
            cov_matrix = np.ones((len(individual_ids), 1))
        cov_matrix = cov_matrix.astype(float)

        phenotype = utils.force_normal_distribution(phenotype_ds.values,
                                                    method="gaussnorm")
        # Prepare LMM
        phenotype = phenotype.astype(float)

        if randomeff_mix:
            # Keep the mixing weight with the highest log marginal likelihood.
            best_lml = float("-inf")
            best_lmm = None
            best_rho = None
            for rho, rho_QS in Sigma_qs.items():
                candidate = fit_null_model(phenotype, cov_matrix, rho_QS)
                lml = candidate.lml()
                if lml > best_lml:
                    best_lml = lml
                    best_lmm = candidate
                    best_rho = rho
            lmm = best_lmm
            print(best_rho)
            # NOTE(review): rho weights the kinship matrix, so rho == 1 means
            # read depth has zero weight; the condition below may have been
            # meant as ``!= 1``. Left as in the original -- confirm.
            if best_rho != 0:
                rho_log[feature_id] = best_rho
                print("Read depth has actually an effect!")
        else:
            lmm = fit_null_model(phenotype, cov_matrix, QS)

        # create phenotype_corrected_ds for plotting; the first covariate
        # column is left in (presumably the intercept -- TODO confirm).
        phenotype_corrected = phenotype - cov_matrix[:, 1:].dot(lmm.beta[1:])
        phenotype_corrected_ds = pd.Series(data=phenotype_corrected,
                                           index=phenotype_ds.index,
                                           name="exp")
        qtl_plots(row, phenotype_ds, phenotype_corrected_ds, plinkGenotype,
                  bim, fam, bed, annotation_df, sample2individual_df)
def _test_lmm(random, y, X, G, mvn, restricted):
    """Check ``LMM`` against the reference log-likelihood ``mvn``.

    The log marginal likelihood reported by the model is compared with
    ``mvn(beta, v0, v1, y, X, G @ G.T)`` for the initial parameters and after
    setting ``beta``, ``delta`` and ``scale`` by hand.  Afterwards the fitted
    optimum is compared with ``scipy``'s ``minimize`` on the reference
    function in four settings: nothing fixed, ``beta`` fixed, ``delta`` fixed
    and ``scale`` fixed.

    ``random`` supplies ``randn``/``rand``; the draws happen in a fixed order.
    """
    ncov = X.shape[1]
    QS = economic_qs_linear(G)

    def new_model():
        return LMM(y, X, QS, restricted=restricted)

    def draw_unit():
        # One uniform sample as a plain Python float.
        return random.rand(1).item()

    def logistic(t):
        return 1 / (1 + exp(-t))

    def close(actual, desired, rtol):
        assert_allclose(actual, desired, rtol=rtol, atol=1e-6)

    model = new_model()
    K0 = G @ G.T

    def reference_lml(beta, v0, v1):
        return mvn(beta, v0, v1, y, X, K0)

    # --- log-likelihood agrees for hand-set parameters (default tolerances)
    beta, v0, v1 = model.beta, model.v0, model.v1
    assert_allclose(model.lml(), reference_lml(beta, v0, v1))

    beta = random.randn(ncov)
    model.beta = beta
    assert_allclose(model.lml(), reference_lml(beta, v0, v1))

    model.delta = draw_unit()
    v0, v1 = model.v0, model.v1
    assert_allclose(model.lml(), reference_lml(beta, v0, v1))

    model.scale = draw_unit()
    v0, v1 = model.v0, model.v1
    assert_allclose(model.lml(), reference_lml(beta, v0, v1))

    # --- nothing fixed: optimise beta and both variances
    def objective_free(x):
        return -reference_lml(x[:ncov], exp(x[ncov]), exp(x[ncov + 1]))

    opt = minimize(objective_free, [0] * ncov + [0, 0])
    model.fit(verbose=False)
    close(model.lml(), -opt.fun, 1e-3)
    close(model.beta, opt.x[:ncov], 1e-3)
    close(model.v0, exp(opt.x[ncov]), 1e-3)
    close(model.v1, exp(opt.x[ncov + 1]), 1e-3)

    # --- beta fixed: optimise the two variances only
    model = new_model()
    fixed_beta = random.randn(ncov)
    model.beta = fixed_beta
    model.delta = draw_unit()
    model.scale = draw_unit()
    model.fix("beta")

    def objective_fixed_beta(x):
        return -reference_lml(fixed_beta, exp(x[0]), exp(x[1]))

    opt = minimize(objective_fixed_beta, [0, 0])
    model.fit(verbose=False)
    close(model.lml(), -opt.fun, 1e-3)
    close(model.v0, exp(opt.x[0]), 1e-3)
    close(model.v1, exp(opt.x[1]), 1e-3)

    # --- delta fixed: optimise beta and scale
    model = new_model()
    model.beta = random.randn(ncov)
    fixed_delta = draw_unit()
    model.delta = fixed_delta
    model.scale = draw_unit()
    model.fix("delta")

    def objective_fixed_delta(x):
        s = exp(x[ncov])
        return -reference_lml(x[:ncov], s * (1 - fixed_delta),
                              s * fixed_delta)

    opt = minimize(objective_fixed_delta, [0] * ncov + [0])
    model.fit(verbose=False)
    close(model.lml(), -opt.fun, 1e-5)
    close(model.beta, opt.x[:ncov], 1e-5)
    close(model.scale, exp(opt.x[ncov]), 1e-5)

    # --- scale fixed: optimise beta and delta (delta via a logistic map)
    model = new_model()
    model.beta = random.randn(ncov)
    model.delta = draw_unit()
    fixed_scale = draw_unit()
    model.scale = fixed_scale
    model.fix("scale")

    def objective_fixed_scale(x):
        d = logistic(x[ncov])
        return -reference_lml(x[:ncov], fixed_scale * (1 - d),
                              fixed_scale * d)

    opt = minimize(objective_fixed_scale, [0] * ncov + [0])
    model.fit(verbose=False)
    close(model.lml(), -opt.fun, 1e-5)
    close(model.beta, opt.x[:ncov], 1e-3)
    close(model.delta, logistic(opt.x[ncov]), 1e-3)
def run_PrsQtl_analysis(pheno_filename, anno_filename, prsFile, output_dir, min_call_rate=0.95, blocksize=1000, skipAutosomeFiltering=False, gaussianize_method=None, minimum_test_samples=10, seed=np.random.randint(40000), n_perm=0, write_permutations=False, relatedness_score=None, feature_variant_covariate_filename=None, snps_filename=None, feature_filename=None, snp_feature_filename=None, genetic_range='all', covariates_filename=None, kinship_filename=None, sample_mapping_filename=None, regressCovariatesUpfront=False): fill_NaN = Imputer(missing_values=np.nan, strategy='mean', axis=0) print('Running GRS QT analysis.') lik = 'normal' '''Core function to take input and run QTL tests on a given chromosome.''' if relatedness_score is not None: relatedness_score = float(relatedness_score) [phenotype_df, kinship_df, covariate_df, sample2individual_df, annotation_df, snp_filter_df, snp_feature_filter_df, geneticaly_unique_individuals, minimum_test_samples, feature_list, risk_df, chromosome, selectionStart, selectionEnd, feature_variant_covariate_df]=\ utils.run_PrsQtl_analysis_load_intersect_phenotype_covariates_kinship_sample_mapping(pheno_filename=pheno_filename, anno_filename=anno_filename, prsFile=prsFile, skipAutosomeFiltering = skipAutosomeFiltering, minimum_test_samples= minimum_test_samples, relatedness_score=relatedness_score, snps_filename=snps_filename, feature_filename=feature_filename, snp_feature_filename=snp_feature_filename, selection=genetic_range, covariates_filename=covariates_filename, kinship_filename=kinship_filename, sample_mapping_filename=sample_mapping_filename, feature_variant_covariate_filename=feature_variant_covariate_filename) mixed = kinship_df is not None if (kinship_df is None) or (relatedness_score is None): geneticaly_unique_individuals = sample2individual_df['iid'].values QS = None if (feature_list == None or len(feature_list) == 0): print('No features to be tested.') sys.exit() #Open output files 
qtl_loader_utils.ensure_dir(output_dir) if not selectionStart is None: output_writer = qtl_output.hdf5_writer( output_dir + '/qtl_results_{}_{}_{}.h5'.format( chromosome, selectionStart, selectionEnd)) else: output_writer = qtl_output.hdf5_writer( output_dir + '/qtl_results_{}.h5'.format(chromosome)) if (write_permutations): if not selectionStart is None: permutation_writer = qtl_output.hdf5_permutations_writer( output_dir + '/perm_results_{}_{}_{}.h5'.format( chromosome, selectionStart, selectionEnd), n_perm) else: permutation_writer = qtl_output.hdf5_permutations_writer( output_dir + '/perm_results_{}.h5'.format(chromosome), n_perm) #Arrays to store indices of snps tested and pass and fail QC SNPs for features without missingness. tested_snp_names = [] fail_qc_features = [] alpha_params = [] beta_params = [] n_samples = [] n_e_samples = [] na_containing_features = 0 currentFeatureNumber = 0 snpQcInfoMain = None for feature_id in feature_list: snpQcInfo = None currentFeatureNumber += 1 if (len(phenotype_df.loc[feature_id, :])) < minimum_test_samples: print("Feature: " + feature_id + " not tested not enough samples do QTL test.") fail_qc_features.append(feature_id) geneticaly_unique_individuals = tmp_unique_individuals continue data_written = False contains_missing_samples = False snpQuery = risk_df.index.values snp_cov_df = None if (feature_variant_covariate_df is not None): if (feature_id in feature_variant_covariate_df['feature'].values): covariateSnp = feature_variant_covariate_df['snp_id'].values[ feature_variant_covariate_df['feature'] == feature_id] if (any(i in risk_df.index.values for i in covariateSnp)): snp_cov_df = risk_df.loc[risk_df.index.map( lambda x: x in list(covariateSnp)), :].transpose() if (len(snpQuery) != 0) and (snp_filter_df is not None): snpQuery = list( set(snp_filter_df.index).intersection(set(snpQuery))) if (len(snpQuery) != 0) and (snp_feature_filter_df is not None): snpQuery = list( set( np.unique(snp_feature_filter_df['snp_id'].loc[ 
snp_feature_filter_df['feature'] == feature_id])).intersection(set(snpQuery))) if len(snpQuery) == 0: print("Feature: " + feature_id + " not tested. No SNPS passed QC for phenotype.") fail_qc_features.append(feature_id) continue else: phenotype_ds = phenotype_df.loc[feature_id] contains_missing_samples = any(~np.isfinite(phenotype_ds)) if (contains_missing_samples): #import pdb; pdb.set_trace() print('Feature: ' + feature_id + ' contains missing data.') phenotype_ds.dropna(inplace=True) na_containing_features = na_containing_features + 1 '''select indices for relevant individuals in genotype matrix These are not unique. NOT to be used to access phenotype/covariates data ''' individual_ids = sample2individual_df.loc[phenotype_ds.index, 'iid'].values sample2individual_feature = sample2individual_df.loc[ phenotype_ds.index] if contains_missing_samples: tmp_unique_individuals = geneticaly_unique_individuals if (kinship_df is not None) and (relatedness_score is not None): geneticaly_unique_individuals = utils.get_unique_genetic_samples( kinship_df.loc[individual_ids, individual_ids], relatedness_score) else: geneticaly_unique_individuals = individual_ids if phenotype_ds.empty or len( geneticaly_unique_individuals) < minimum_test_samples: print("Feature: " + feature_id + " not tested not enough samples do QTL test.") fail_qc_features.append(feature_id) if contains_missing_samples: geneticaly_unique_individuals = tmp_unique_individuals continue elif np.var(phenotype_ds.values) == 0: print("Feature: " + feature_id + " has no variance in selected individuals.") fail_qc_features.append(feature_id) if contains_missing_samples: geneticaly_unique_individuals = tmp_unique_individuals continue print('For feature: ' + str(currentFeatureNumber) + '/' + str(len(feature_list)) + ' (' + feature_id + '): ' + str(len(snpQuery)) + ' risk scores will be tested.\n Please stand by.') if (n_perm != 0): bestPermutationPval = np.ones((n_perm), dtype=np.float) #Here we need to start preparing 
the LMM, can use the fam for sample IDS in SNP matrix. # test if the covariates, kinship, snp and phenotype are in the same order if ((all(kinship_df.loc[individual_ids,individual_ids].index==sample2individual_feature.loc[phenotype_ds.index]['iid']) if kinship_df is not None else True) &\ (all(phenotype_ds.index==covariate_df.loc[sample2individual_feature['sample'],:].index)if covariate_df is not None else True)): ''' if all lines are in order put in arrays the correct genotype and phenotype x=a if cond1 else b <---> equivalent to if cond1: x=a else x=b; better readability of the code ''' if kinship_df is not None: kinship_mat = kinship_df.loc[individual_ids, individual_ids].values kinship_mat = kinship_mat.astype(float) ##GOWER normalization of Kinship matrix. kinship_mat *= (kinship_mat.shape[0] - 1) / ( kinship_mat.trace() - kinship_mat.mean(0).sum()) ## This needs to go with the subselection stuff. if (QS is None and not contains_missing_samples): QS = economic_qs(kinship_mat) elif (contains_missing_samples): QS_tmp = QS QS = economic_qs(kinship_mat) if kinship_df is None: K = np.eye(len(phenotype_ds.index)) if (QS is None and not contains_missing_samples): QS = economic_qs(K) elif (contains_missing_samples): QS_tmp = QS QS = economic_qs(K) cov_matrix = covariate_df.loc[sample2individual_feature[ 'sample'], :].values if covariate_df is not None else None if covariate_df is None: cov_matrix = np.ones((len(individual_ids), 1)) #pdb.set_trace() if snp_cov_df is not None: snp_cov_df_tmp = snp_cov_df.loc[individual_ids, :] snp_cov_df = pd.DataFrame( fill_NaN.fit_transform(snp_cov_df_tmp)) snp_cov_df.index = sample2individual_feature['sample'] snp_cov_df.columns = snp_cov_df_tmp.columns cov_matrix = np.concatenate( (cov_matrix, snp_cov_df.values), 1) snp_cov_df_tmp = None snp_cov_df = None cov_matrix = cov_matrix.astype(float) else: print( 'There is an issue in mapping phenotypes vs covariates and/or kinship' ) sys.exit() phenotype = utils.force_normal_distribution( 
phenotype_ds.values, method=gaussianize_method ) if gaussianize_method is not None else phenotype_ds.values #Prepare LMM phenotype = phenotype.astype(float) ##Mixed and test. ##This is a future change so we don't need to decompose the COVs every time. ##Like QS this needs to happen when genetic unique individuals is the same. #svd_cov = economic_svd(cov_matrix) #lmm = LMM(phenotype, cov_matrix, QS, SVD=svd_cov) #These steps need to happen only once per phenotype. #print(QS) lmm = LMM(phenotype, cov_matrix, QS) if not mixed: lmm.delta = 1 lmm.fix('delta') #Prepare null model. lmm.fit(verbose=False) if regressCovariatesUpfront: phenotype_corrected = phenotype - cov_matrix[:, 1:].dot( lmm.beta[1:]) cov_matrix_corrected = cov_matrix[:, 0] lmm = LMM(phenotype_corrected, cov_matrix_corrected, QS) lmm.fit(verbose=False) null_lml = lmm.lml() flmm = lmm.get_fast_scanner() #pdb.set_trace(); for snpGroup in utils.chunker(snpQuery, blocksize): #Fix seed at the start of the first chunker so all permutations are based on the same random first split. np.random.seed(seed) snp_names = snpGroup tested_snp_names.extend(snp_names) snp_matrix_DF = risk_df.loc[snp_names, individual_ids].transpose() ##GRS var QC snp_matrix_DF = snp_matrix_DF.loc[:, snp_matrix_DF.isna().sum( axis=0) != snp_matrix_DF. 
shape[0], ] snp_matrix_DF = snp_matrix_DF.loc[:, ( np.nanstd(snp_matrix_DF, axis=0) > 0)] # test if the covariates, kinship, snp and phenotype are in the same order if (len(snp_matrix_DF.index) != len( sample2individual_feature.loc[phenotype_ds.index] ['iid']) or not all( snp_matrix_DF.index == sample2individual_feature.loc[ phenotype_ds.index]['iid'])): print( 'There is an issue in mapping phenotypes and genotypes' ) sys.exit() #Impute missingness #pdb.set_trace() call_rate = 1 - snp_matrix_DF.isnull().sum() / len( snp_matrix_DF.index) if snpQcInfo is None and call_rate is not None: snpQcInfo = call_rate elif call_rate is not None: snpQcInfo = pd.concat([snpQcInfo, call_rate], axis=0) selection = call_rate > min_call_rate snp_matrix_DF = snp_matrix_DF.loc[:, list(snp_matrix_DF. columns[selection])] if snp_matrix_DF.shape[1] == 0: continue snp_matrix_DF = pd.DataFrame( fill_NaN.fit_transform(snp_matrix_DF), index=snp_matrix_DF.index, columns=snp_matrix_DF.columns) # G = snp_matrix_DF.values G = G.astype(float) G_index = snp_matrix_DF.columns alt_lmls, effsizes = flmm.fast_scan(G, verbose=False) var_pvalues = lrt_pvalues(null_lml, alt_lmls) var_effsizes_se = effsizes_se(effsizes, var_pvalues) #add these results to qtl_results temp_df = pd.DataFrame(index=range(len(G_index)), columns=[ 'feature_id', 'snp_id', 'p_value', 'beta', 'beta_se', 'empirical_feature_p_value' ]) temp_df['snp_id'] = G_index temp_df['feature_id'] = feature_id temp_df['beta'] = np.asarray(effsizes) temp_df['p_value'] = np.asarray(var_pvalues) temp_df['beta_se'] = np.asarray(var_effsizes_se) #insert default dummy value temp_df['empirical_feature_p_value'] = -1.0 if (n_perm != 0): pValueBuffer = [] totalSnpsToBeTested = (G.shape[1] * n_perm) permutationStepSize = np.floor( n_perm / (totalSnpsToBeTested / blocksize)) if (permutationStepSize > n_perm): permutationStepSize = n_perm elif (permutationStepSize < 1): permutationStepSize = 1 if (write_permutations): perm_df = pd.DataFrame( 
index=range(len(G_index)), columns=['snp_id'] + ['permutation_' + str(x) for x in range(n_perm)]) perm_df['snp_id'] = G_index for currentNperm in utils.chunker( list(range(1, n_perm + 1)), permutationStepSize): if (kinship_df is not None) and (relatedness_score is not None): temp = utils.get_shuffeld_genotypes_preserving_kinship( geneticaly_unique_individuals, relatedness_score, snp_matrix_DF, kinship_df.loc[individual_ids, individual_ids], len(currentNperm)) else: temp = utils.get_shuffeld_genotypes( snp_matrix_DF, len(currentNperm)) temp = temp.astype(float) alt_lmls_p, effsizes_p = flmm.fast_scan(temp, verbose=False) var_pvalues_p = lrt_pvalues(null_lml, alt_lmls_p) pValueBuffer.extend(np.asarray(var_pvalues_p)) if (not (len(pValueBuffer) == totalSnpsToBeTested)): #print(len(pValueBuffer)) #print(pValueBuffer) #print(totalSnpsToBeTested) print('Error in blocking logic for permutations.') sys.exit() perm = 0 for relevantOutput in utils.chunker( pValueBuffer, G.shape[1]): if (write_permutations): perm_df['permutation_' + str(perm)] = relevantOutput if (bestPermutationPval[perm] > min(relevantOutput)): bestPermutationPval[perm] = min(relevantOutput) perm = perm + 1 #print(relevantOutput) #print('permutation_'+str(perm)) if not temp_df.empty: data_written = True output_writer.add_result_df(temp_df) if (write_permutations): permutation_writer.add_permutation_results_df( perm_df, feature_id) #This we need to change in the written file. 
if (n_perm > 1 and data_written): #updated_permuted_p_in_hdf5(bestPermutationPval, feature_id); alpha_para, beta_para = output_writer.apply_pval_correction( feature_id, bestPermutationPval, False) alpha_params.append(alpha_para) beta_params.append(beta_para) #pdb.set_trace(); if not data_written: fail_qc_features.append(feature_id) else: n_samples.append(phenotype_ds.size) n_e_samples.append(len(geneticaly_unique_individuals)) if contains_missing_samples: QS = QS_tmp geneticaly_unique_individuals = tmp_unique_individuals snpQcInfo = snpQcInfo.to_frame(name="call_rate") snpQcInfo.index.name = "snp_id" snpQcInfo.to_csv( output_dir + '/snp_qc_metrics_naContaining_feature_{}.txt'.format( feature_id), sep='\t') del QS_tmp del tmp_unique_individuals else: if (snpQcInfo is not None and snpQcInfoMain is not None): snpQcInfoMain = pd.concat([snpQcInfoMain, snpQcInfo], axis=0) elif snpQcInfo is not None: snpQcInfoMain = snpQcInfo.copy(deep=True) #print('step 5') output_writer.close() if (write_permutations): permutation_writer.close() fail_qc_features = np.unique(fail_qc_features) if ((len(feature_list) - len(fail_qc_features)) == 0): time.sleep(15) #Safety timer to make sure the file is unlocked. print("Trying to remove the h5 file. 
Nothing has been tested.") print(output_dir + 'qtl_results_{}_{}_{}.h5'.format( chromosome, selectionStart, selectionEnd)) if not selectionStart is None: os.remove(output_dir + 'qtl_results_{}_{}_{}.h5'.format( chromosome, selectionStart, selectionEnd)) else: os.remove(output_dir + 'qtl_results_{}.h5'.format(chromosome)) sys.exit() #gather unique indexes of tested snps #write annotation and snp data to file snp_df = pd.DataFrame() snp_df['snp_id'] = np.unique(tested_snp_names) snp_df.index = np.unique(tested_snp_names) snp_df['chromosome'] = "NA" snp_df['position'] = "NA" if (snpQcInfoMain is not None): snpQcInfoMain = snpQcInfoMain.to_frame(name="call_rate") snpQcInfoMain['index'] = snpQcInfoMain.index snpQcInfoMain = snpQcInfoMain.drop_duplicates() del snpQcInfoMain['index'] snp_df = pd.concat( [snp_df, snpQcInfoMain.reindex(snp_df.index)], axis=1) feature_list = list(set(feature_list) - set(fail_qc_features)) annotation_df = annotation_df.reindex(feature_list) annotation_df['n_samples'] = n_samples annotation_df['n_e_samples'] = n_e_samples if (n_perm > 1): annotation_df['alpha_param'] = alpha_params annotation_df['beta_param'] = beta_params if not selectionStart is None: snp_df.to_csv(output_dir + '/snp_metadata_{}_{}_{}.txt'.format( chromosome, selectionStart, selectionEnd), sep='\t', index=False) annotation_df.to_csv(output_dir + '/feature_metadata_{}_{}_{}.txt'.format( chromosome, selectionStart, selectionEnd), sep='\t') else: snp_df.to_csv(output_dir + '/snp_metadata_{}.txt'.format(chromosome), sep='\t', index=False) annotation_df.to_csv(output_dir + '/feature_metadata_{}.txt'.format(chromosome), sep='\t')
def test_fast_scanner_statsmodel_gls():
    """Validate the fast scanner's null-model effect-size standard errors.

    Two checks are run on the 16-sample, 7-covariate data embedded below
    (the statsmodels snippets kept in the comments load it as the Longley
    data set):

    1. On the raw data, ``scanner.null_beta_se`` must match the square root
       of the diagonal of the least-squares inverse of ``X' K^-1 X``, where
       ``K`` is the covariance reported by the fitted LMM.
    2. On standardised data, both ``scanner.null_beta_covariance`` and
       ``scanner.null_beta_se`` must match hard-coded reference standard
       errors after dividing them by ``sqrt(scale)``.
    """
    from numpy.linalg import lstsq

    n_samples, n_covariates = 16, 7

    # The AR(1) coefficient is hard-coded; it was originally obtained with:
    #   data = sm.datasets.longley.load()
    #   data.exog = sm.add_constant(data.exog)
    #   ols_resid = sm.OLS(data.endog, data.exog).fit().resid
    #   resid_fit = sm.OLS(ols_resid[1:], sm.add_constant(ols_resid[:-1])).fit()
    #   rho = resid_fit.params[1]
    rho = -0.3634294908774683
    # Equivalent to toeplitz(range(len(ols_resid))) in the snippet above.
    lag_order = toeplitz(range(n_samples))
    ar1_covariance = rho**lag_order
    QS = economic_qs(ar1_covariance)

    endog = array([
        60323.0, 61122.0, 60171.0, 61187.0,
        63221.0, 63639.0, 64989.0, 63761.0,
        66019.0, 67857.0, 68169.0, 66513.0,
        68655.0, 69564.0, 69331.0, 70551.0,
    ])
    # One row per sample; the first column is the intercept.
    exog = array([
        [1.0, 83.0, 234289.0, 2356.0, 1590.0, 107608.0, 1947.0],
        [1.0, 88.5, 259426.0, 2325.0, 1456.0, 108632.0, 1948.0],
        [1.0, 88.2, 258054.0, 3682.0, 1616.0, 109773.0, 1949.0],
        [1.0, 89.5, 284599.0, 3351.0, 1650.0, 110929.0, 1950.0],
        [1.0, 96.2, 328975.0, 2099.0, 3099.0, 112075.0, 1951.0],
        [1.0, 98.1, 346999.0, 1932.0, 3594.0, 113270.0, 1952.0],
        [1.0, 99.0, 365385.0, 1870.0, 3547.0, 115094.0, 1953.0],
        [1.0, 100.0, 363112.0, 3578.0, 3350.0, 116219.0, 1954.0],
        [1.0, 101.2, 397469.0, 2904.0, 3048.0, 117388.0, 1955.0],
        [1.0, 104.6, 419180.0, 2822.0, 2857.0, 118734.0, 1956.0],
        [1.0, 108.4, 442769.0, 2936.0, 2798.0, 120445.0, 1957.0],
        [1.0, 110.8, 444546.0, 4681.0, 2637.0, 121950.0, 1958.0],
        [1.0, 112.6, 482704.0, 3813.0, 2552.0, 123366.0, 1959.0],
        [1.0, 114.2, 502601.0, 3931.0, 2514.0, 125368.0, 1960.0],
        [1.0, 115.7, 518173.0, 4806.0, 2572.0, 127852.0, 1961.0],
        [1.0, 116.9, 554894.0, 4007.0, 2827.0, 130081.0, 1962.0],
    ])

    # --- Check 1: raw data against a direct least-squares computation. ---
    raw_lmm = LMM(endog, exog, QS)
    raw_lmm.fit(verbose=False)

    sigma = raw_lmm.covariance()
    raw_scanner = raw_lmm.get_fast_scanner()

    # Solve K z = X, then invert X' z via a least-squares solve against I.
    kinv_exog = lstsq(raw_lmm.covariance(), exog, rcond=None)[0]
    reference_cov = lstsq(
        exog.T @ kinv_exog, eye(n_covariates), rcond=None
    )[0]
    reference_se = sqrt(reference_cov.diagonal())
    assert_allclose(raw_scanner.null_beta_se, reference_se, atol=1e-4)

    # --- Check 2: standardised data against stored reference values. ---
    endog_std = endog - endog.mean(0)
    endog_std = endog_std / endog_std.std(0)

    exog_std = exog - exog.mean(0)
    # The centred intercept column has zero spread, so the division yields
    # invalid values there; silence the warnings and restore the column.
    with errstate(invalid="ignore", divide="ignore"):
        exog_std = exog_std / exog_std.std(0)
    exog_std[:, 0] = 1

    std_lmm = LMM(endog_std, exog_std, QS)
    std_lmm.fit(verbose=False)

    # This is the `sigma` handed to statsmodels in the snippet below.
    sigma = std_lmm.covariance()
    std_scanner = std_lmm.get_fast_scanner()

    # Reference values were produced with:
    #   gls_model = sm.GLS(endog, exog, sigma=sigma)
    #   gls_results = gls_model.fit()
    #   scale = gls_results.scale
    #   beta_se = gls_results.bse
    scale = 1.7777777777782937
    beta_se = array([
        0.014636888951505144,
        0.21334653097414055,
        0.7428559936739378,
        0.10174713767252333,
        0.032745906589939845,
        0.3494488802468581,
        0.4644879873404213,
    ])

    se_from_covariance = sqrt(std_scanner.null_beta_covariance.diagonal())
    # statsmodels rescales the covariance matrix it is given, so its
    # standard errors have to be divided by sqrt(scale) before comparing.
    assert_allclose(se_from_covariance, beta_se / sqrt(scale), rtol=1e-6)
    assert_allclose(
        std_scanner.null_beta_se, beta_se / sqrt(scale), rtol=1e-6
    )