# objective for the 1-D mixing search; G and resmin come from the enclosing _find_mixing scope
def f(mixing, G0_standardized_val=G0_standardized_val, G1_standardized_val=G1_standardized_val, covar=covar, y=y, **kwargs):
    _mix(G, G0_standardized_val, G1_standardized_val, mixing)
    lmm = fastLMM(X=covar, Y=y, G=G, K=None, inplace=True)
    result = lmm.findH2()
    if (resmin[0] is None) or (result['nLL'] < resmin[0]['nLL']):
        resmin[0] = result
    return result['nLL']
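# A minimal sketch of how an objective like the one above is presumably driven:
# a bounded 1-D search over the mixing weight on [0, 1]. The toy neg_log_lik
# below is a hypothetical stand-in for the closure `f`; only the optimizer
# pattern is the point, not the actual likelihood.
import scipy.optimize as opt

def neg_log_lik(mixing):  # placeholder for the closure `f` above
    return (mixing - 0.3) ** 2 + 1.0

res = opt.minimize_scalar(neg_log_lik, bounds=(0.0, 1.0), method='bounded')
best_mixing = res.x  # ~0.3 for this toy objective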
def run_gwas(self):
    """ invoke all steps in the right order """
    with patch.dict('os.environ', {'ARRAY_MODULE': 'numpy'}) as _:
        from fastlmm.inference.lmm_cov import LMM as fastLMM

        if self.train_pcs is None and self.train_snps is not None:
            assert self.mixing == 0.0
            G = self.train_snps
        elif self.train_pcs is not None and self.train_snps is None:
            assert self.mixing == 0.0
            G = self.train_pcs
        else:
            logging.info("concat pcs, mixing {0}".format(self.mixing))
            G = np.concatenate((np.sqrt(1.0 - self.mixing) * self.train_snps,
                                np.sqrt(self.mixing) * self.train_pcs), 1)

        #TODO: make sure low-rank case is handled correctly
        lmm = fastLMM(X=self.cov, Y=self.phen, G=G, K=None)

        if self.findh2:
            opt = lmm.findH2(nGridH2=100)
            h2 = opt['h2']
            assert self.delta is None, "either findh2 or set delta"
        else:
            h2 = 0.0
            assert self.delta is not None
            logging.info("using externally provided delta")

        res = lmm.nLLeval(h2=h2, delta=self.delta, dof=None, scale=1.0,
                          penalty=0.0, snps=self.test_snps)

        chi2stats = res['beta'] * res['beta'] / res['variance_beta']

        self.p_values = stats.chi2.sf(chi2stats, 1)[:, 0]
        self.p_values_F = stats.f.sf(chi2stats, 1, G.shape[0] - (lmm.linreg.D + 1))[:, 0]  # note that G.shape[0] is the number of individuals

        self.p_idx = np.argsort(self.p_values)
        self.sorted_p_values = self.p_values[self.p_idx]
        self.p_idx_F = np.argsort(self.p_values_F)
        self.sorted_p_values_F = self.p_values_F[self.p_idx_F]
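# Hedged arithmetic check (not part of run_gwas): under the parameterization
# used here, h2 = sigma_g^2 / (sigma_g^2 + sigma_e^2) and delta = sigma_e^2 / sigma_g^2,
# so delta = (1 - h2) / h2 and h2 = 1 / (1 + delta). The Wald statistic
# beta^2 / var(beta) is referred either to a chi2(1) or to an F(1, N - (D+1)),
# exactly as the two p-value lines above do.
import numpy as np
from scipy import stats

h2 = 0.4
delta = (1.0 - h2) / h2
assert np.isclose(h2, 1.0 / (1.0 + delta))

chi2stat = 9.0                          # example Wald statistic
p_chi2 = stats.chi2.sf(chi2stat, 1)     # large-sample chi2(1) p-value
p_F = stats.f.sf(chi2stat, 1, 500 - 2)  # finite-sample F with N=500 individuals, D=1 covariate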
def _internal_single(G0_standardized, test_snps, pheno, covar,
                     G1_standardized, mixing,  #!!test mixing and G1
                     h2, log_delta, cache_file):
    assert h2 is None or log_delta is None, "if h2 is specified, log_delta may not be specified"
    if log_delta is not None:
        h2 = 1.0 / (np.exp(log_delta) + 1)

    covar = np.hstack((covar['vals'], np.ones((test_snps.iid_count, 1))))  # We always add 1's to the end.
    y = pheno['vals']

    from pysnptools.standardizer import DiagKtoN
    assert mixing is None or 0.0 <= mixing <= 1.0

    if cache_file is not None and os.path.exists(cache_file):
        lmm = fastLMM(X=covar, Y=y, G=None, K=None)
        with np.load(cache_file) as data:  #!! similar code in epistasis
            lmm.U = data['arr_0']
            lmm.S = data['arr_1']
    else:
        # combine two kernels (normalize kernels to diag(K)=N)
        G0_standardized_val = DiagKtoN(G0_standardized.val.shape[0]).standardize(G0_standardized.val)
        G1_standardized_val = DiagKtoN(G1_standardized.val.shape[0]).standardize(G1_standardized.val)

        if mixing == 0.0 or G1_standardized.sid_count == 0:
            G = G0_standardized.val
        elif mixing == 1.0 or G0_standardized.sid_count == 0:
            G = G1_standardized.val
        else:
            G = np.empty((G0_standardized.iid_count,
                          G0_standardized.sid_count + G1_standardized.sid_count))
            if mixing is None:
                mixing, h2 = _find_mixing(G, covar, G0_standardized_val, G1_standardized_val, h2, y)
            _mix(G, G0_standardized_val, G1_standardized_val, mixing)

        #TODO: make sure low-rank case is handled correctly
        lmm = fastLMM(X=covar, Y=y, G=G, K=None, inplace=True)

    if h2 is None:
        result = lmm.findH2()
        h2 = result['h2']
    logging.info("h2={0}".format(h2))

    snps_read = test_snps.read().standardize()
    res = lmm.nLLeval(h2=h2, dof=None, scale=1.0, penalty=0.0, snps=snps_read.val)

    if cache_file is not None and not os.path.exists(cache_file):
        pstutil.create_directory_if_necessary(cache_file)
        np.savez(cache_file, lmm.U, lmm.S)  # using np.savez instead of pickle because it seems to be faster to read and write

    beta = res['beta']
    chi2stats = beta * beta / res['variance_beta']
    #p_values = stats.chi2.sf(chi2stats,1)[:,0]
    if G0_standardized is not None:
        assert G.shape[0] == lmm.U.shape[0]
    p_values = stats.f.sf(chi2stats, 1, lmm.U.shape[0] - 3)[:, 0]  # note that lmm.U.shape[0] is the number of individuals and 3 is the number of fixed effects (covariates+SNP)

    items = [('SNP', snps_read.sid),
             ('Chr', snps_read.pos[:, 0]),
             ('GenDist', snps_read.pos[:, 1]),
             ('ChrPos', snps_read.pos[:, 2]),
             ('PValue', p_values),
             ('SnpWeight', beta[:, 0]),
             ('SnpWeightSE', np.sqrt(res['variance_beta'][:, 0])),
             ('SnpFractVarExpl', np.sqrt(res['fraction_variance_explained_beta'][:, 0])),
             ('Nullh2', np.zeros((snps_read.sid_count)) + h2)]
    frame = pd.DataFrame.from_items(items)
    return frame
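# Minimal numpy sketch of what the DiagKtoN standardization above appears to
# do, under the assumption (consistent with the inline formulas used elsewhere
# in this file) that it rescales G so the kernel K = G G^T has trace N:
import numpy as np

rng = np.random.RandomState(0)
G = rng.randn(100, 50)  # 100 individuals, 50 SNPs
factor = 1.0 / np.sqrt((G ** 2).sum() / float(G.shape[0]))
G_std = factor * G
np.testing.assert_almost_equal(np.trace(G_std.dot(G_std.T)), G_std.shape[0])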
def _internal_single(G0_standardized, test_snps, pheno, covar,
                     G1_standardized, mixing,  #!!test mixing and G1
                     external_log_delta, min_log_delta, max_log_delta):
    covar = np.hstack((covar['vals'], np.ones((test_snps.iid_count, 1))))  # We always add 1's to the end.
    y = pheno['vals']

    assert 0.0 <= mixing <= 1.0

    # combine two kernels (normalize kernels to diag(K)=N)
    if mixing == 0.0:
        G0_standardized_val = 1. / np.sqrt((G0_standardized.val**2).sum() / float(G0_standardized.val.shape[0])) * G0_standardized.val
        G = G0_standardized_val
    elif mixing == 1.0:
        G1_standardized_val = 1. / np.sqrt((G1_standardized.val**2).sum() / float(G1_standardized.val.shape[0])) * G1_standardized.val
        G = G1_standardized_val
    else:
        assert G1_standardized.sid_count > 0, "If a nonzero mixing weight is given, G1 is required"
        logging.info("concat G1, mixing {0}".format(mixing))
        #TODO: make this efficient (write C-code to perform this operation in-place)!!
        G0_standardized_val = 1. / np.sqrt((G0_standardized.val**2).sum() / float(G0_standardized.val.shape[0])) * G0_standardized.val
        G1_standardized_val = 1. / np.sqrt((G1_standardized.val**2).sum() / float(G1_standardized.val.shape[0])) * G1_standardized.val
        #G = np.concatenate((np.sqrt(1.0-mixing) * G0_norm, np.sqrt(mixing) * G1_norm),1)
        G = np.concatenate((np.sqrt(1.0 - mixing) * G0_standardized_val,
                            np.sqrt(mixing) * G1_standardized_val), 1)

    #TODO: make sure low-rank case is handled correctly
    from fastlmm.inference.lmm_cov import LMM as fastLMM
    lmm = fastLMM(X=covar, Y=y, G=G, K=None)

    if external_log_delta is None:
        result = lmm.find_log_delta(sid_count=1, min_log_delta=min_log_delta, max_log_delta=max_log_delta)
        external_log_delta = result['log_delta']
    internal_delta = np.exp(external_log_delta)
    logging.info("internal_delta={0}".format(internal_delta))
    logging.info("external_log_delta={0}".format(external_log_delta))

    snps_read = test_snps.read().standardize()
    res = lmm.nLLeval(delta=internal_delta, dof=None, scale=1.0, penalty=0.0, snps=snps_read.val)

    beta = res['beta']
    chi2stats = beta * beta / res['variance_beta']
    #p_values = stats.chi2.sf(chi2stats,1)[:,0]
    p_values = stats.f.sf(chi2stats, 1, G.shape[0] - 3)[:, 0]  # note that G.shape[0] is the number of individuals and 3 is the number of fixed effects (covariates+SNP)

    items = [('SNP', snps_read.sid),
             ('Chr', snps_read.pos[:, 0]),
             ('GenDist', snps_read.pos[:, 1]),
             ('ChrPos', snps_read.pos[:, 2]),
             ('PValue', p_values),
             ('SnpWeight', beta[:, 0]),
             ('SnpWeightSE', np.sqrt(res['variance_beta'][:, 0])),
             ('NullLogDelta', np.zeros((snps_read.sid_count)) + external_log_delta)]
    frame = pd.DataFrame.from_items(items)
    return frame
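# Sketch verifying the identity the concatenation above relies on: stacking
# sqrt(1-m)*G0 and sqrt(m)*G1 column-wise yields a kernel that is the convex
# combination of the two individual kernels, K = (1-m) K0 + m K1.
import numpy as np

rng = np.random.RandomState(1)
G0, G1, m = rng.randn(20, 5), rng.randn(20, 8), 0.3
G = np.concatenate((np.sqrt(1.0 - m) * G0, np.sqrt(m) * G1), 1)
np.testing.assert_allclose(G.dot(G.T),
                           (1.0 - m) * G0.dot(G0.T) + m * G1.dot(G1.T))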
#### load GRM, covariates, and pheno
G0 = np.array(ro.r('G0<-npyLoad("/home/mccuem/norto253/WP_2Million/Elaine_v3/R_scripts/G0_mat.npy")'))
covar = np.array(ro.r('covar<-npyLoad("/home/mccuem/norto253/WP_2Million/Elaine_v3/R_scripts/covar_mat.npy")'))
y = np.array(ro.r('y<-npyLoad("/home/mccuem/norto253/WP_2Million/Elaine_v3/R_scripts/y_mat.npy")'))

#norm_factor = 1./np.sqrt((G0**2).sum() / float(G0.shape[0]))
#G0_standardized_val = norm_factor * G0
from pysnptools.standardizer import DiagKtoN
#G0_standardized_val = DiagKtoN(G0.shape[0]).standardize(G0)
G0_standardized_val = G0

from fastlmm.inference.lmm_cov import LMM as fastLMM
lmm = fastLMM(X=covar, Y=y, G=G0_standardized_val)
result = lmm.findH2()
#dir(lmm)  # lists attributes

if result['h2'] > -1:
    h2 = result['h2']
else:
    h2 = result['h2'][0]

residual_var = 1 - h2
delta = residual_var / h2
m1_df = sum(lmm.S / (lmm.S + delta))
m1 = result['nLL'][0] * -1

#nr, nc = G0_standardized_val.shape
#G0_standardized_val_vec = ro.FloatVector(G0_standardized_val.transpose().reshape((G0_standardized_val.size)))
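# Hedged check of the arithmetic in the script above: delta = (1-h2)/h2, and
# the effective degrees of freedom sum(S / (S + delta)), where S holds the
# eigenvalues from the kernel decomposition. Toy eigenvalues stand in for lmm.S.
import numpy as np

h2 = 0.5
delta = (1.0 - h2) / h2               # equals 1.0 here
S = np.array([4.0, 2.0, 1.0, 0.5])    # hypothetical kernel eigenvalues
m1_df = np.sum(S / (S + delta))       # each eigendirection contributes S/(S+delta)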
def run_select(self, G0, G_bg, y, cov=None):
    """set up two kernel feature selection

    Parameters
    ----------
    G0 : numpy array of shape (num_ind, num_snps)
        Data matrix from which foreground snps will be selected

    G_bg : numpy array of shape (num_ind, num_snps)
        Data matrix containing background snps on which will be conditioned

    y : numpy vector of shape (num_ind, )
        Vector of phenotypes

    cov : numpy array of shape (num_ind, num_covariates) or None
        Covariates to be used as fixed effects

    Returns
    -------
    best_k, feat_idx, best_mix, best_delta: tuple(int, np.array(int), float, float)
        best_k is the best number of SNPs selected,
        feat_idx is a np.array of integers denoting the indices of these snps,
        best_mix is the best mixing coefficient between foreground and background kernel,
        best_delta is the best regularization coefficient
    """
    num_ind = len(y)

    if cov is None:
        cov = np.ones((num_ind, 1))
    else:
        logging.info("normalizing covariates")
        cov = cov.copy()
        cov = 1. / np.sqrt((cov**2).sum() / float(cov.shape[0])) * cov
    cov.flags.writeable = False

    # normalize to diag(K) = N
    norm_factor = 1. / np.sqrt((G_bg**2).sum() / float(G_bg.shape[0]))
    # we copy in case G and G_bg are pointing to the same object
    G_bg = norm_factor * G_bg

    K_bg_full = G_bg.dot(G_bg.T)
    K_bg_full.flags.writeable = False

    # some asserts
    np.testing.assert_almost_equal(sum(np.diag(K_bg_full)), G_bg.shape[0])
    if self.debug:
        norm_factor_check = 1. / np.sqrt(G_bg.shape[1])
        np.testing.assert_array_almost_equal(norm_factor, norm_factor_check, decimal=1)

    for kfold_idx, (train_idx, test_idx) in enumerate(KFold(num_ind, n_folds=self.n_folds, random_state=self.random_state, shuffle=True)):
        t0 = time.time()
        logging.info("running fold: %i" % kfold_idx)

        y_train = y.take(train_idx, axis=0)
        y_test = y.take(test_idx, axis=0)
        G0_train = G0.take(train_idx, axis=0)
        G0_test = G0.take(test_idx, axis=0)
        G_bg_train = G_bg.take(train_idx, axis=0)
        G_bg_test = G_bg.take(test_idx, axis=0)
        cov_train = cov.take(train_idx, axis=0)
        cov_test = cov.take(test_idx, axis=0)

        # write protect data
        y_train.flags.writeable = False
        y_test.flags.writeable = False
        G0_train.flags.writeable = False
        G0_test.flags.writeable = False
        G_bg_train.flags.writeable = False
        G_bg_test.flags.writeable = False
        cov_train.flags.writeable = False
        cov_test.flags.writeable = False

        # precompute background kernel
        K_bg_train = K_bg_full.take(train_idx, axis=0).take(train_idx, axis=1)
        K_bg_train.flags.writeable = False

        if self.measure != "mse":
            K_bg_test = K_bg_full.take(test_idx, axis=0).take(test_idx, axis=1)
            K_bg_test.flags.writeable = False

        # rank features
        if self.order_by_lmm:
            logging.info("using linear mixed model to rank features")
            t0 = time.time()
            gwas = FastGwas(G_bg_train, G0_train, y_train, delta=None, train_pcs=None, mixing=0.0, cov=cov_train)
            gwas.run_gwas()
            _pval = gwas.p_values
            logging.info("time taken: %s" % (str(time.time() - t0)))
        else:
            logging.info("using linear regression to rank features")
            _F, _pval = lin_reg.f_regression_block(lin_reg.f_regression_cov_alt, G0_train, y_train, blocksize=10000, C=cov_train)

        feat_idx = np.argsort(_pval)

        for k_idx, max_k in enumerate(self.grid_k):
            feat_idx_subset = feat_idx[0:max_k]
            G_fs_train = G0_train.take(feat_idx_subset, axis=1)
            G_fs_test = G0_test.take(feat_idx_subset, axis=1)

            # normalize to sum(diag)=N
            norm_factor = 1. / np.sqrt((G_fs_train**2).sum() / float(G_fs_train.shape[0]))
            G_fs_train *= norm_factor
            G_fs_test *= norm_factor

            G_fs_train.flags.writeable = False
            G_fs_test.flags.writeable = False

            # asserts
            if self.debug:
                norm_factor_check = 1.0 / np.sqrt(max_k)
                np.testing.assert_array_almost_equal(norm_factor, norm_factor_check, decimal=1)
                np.testing.assert_almost_equal(sum(np.diag(G_fs_train.dot(G_fs_train.T))), G_fs_train.shape[0])

            logging.info("k: %i" % (max_k))

            # use LMM
            from fastlmm.inference.lmm_cov import LMM as fastLMM
            if G_bg_train.shape[1] <= G_bg_train.shape[0]:
                lmm = fastLMM(X=cov_train, Y=y_train[:, np.newaxis], G=G_bg_train)
            else:
                lmm = fastLMM(X=cov_train, Y=y_train[:, np.newaxis], K=K_bg_train)

            W = G_fs_train.copy()
            UGup, UUGup = lmm.rotate(W)
            i_up = np.zeros((G_fs_train.shape[1]), dtype=np.bool)
            i_G1 = np.ones((G_fs_train.shape[1]), dtype=np.bool)

            t0 = time.time()
            res = lmm.findH2_2K(nGridH2=10, minH2=0.0, maxH2=0.99999, i_up=i_up, i_G1=i_G1, UW=UGup, UUW=UUGup)
            logging.info("time taken for k=%i: %s" % (max_k, str(time.time() - t0)))

            # recover a2 from alternate parameterization
            a2 = res["h2_1"] / float(res["h2"] + res["h2_1"])
            h2 = res["h2"] + res["h2_1"]
            delta = (1 - h2) / h2
            #res_cov = res

            # do final prediction using lmm.py
            from fastlmm.inference import LMM
            lmm = LMM(forcefullrank=False)
            lmm.setG(G0=G_bg_train, G1=G_fs_train, a2=a2)
            lmm.setX(cov_train)
            lmm.sety(y_train)

            # we take an additional step to estimate betas on covariates (not given from new model)
            res = lmm.nLLeval(delta=delta, REML=True)

            # predict on test set
            lmm.setTestData(Xstar=cov_test, G0star=G_bg_test, G1star=G_fs_test)
            out = lmm.predictMean(beta=res["beta"], delta=delta)

            mse = mean_squared_error(y_test, out)
            logging.info("mse: %f" % (mse))

            self.mse[kfold_idx, k_idx] = mse
            self.mixes[kfold_idx, k_idx] = a2
            self.deltas[kfold_idx, k_idx] = delta

            if self.measure != "mse":
                K_test_test = a2 * G_fs_test.dot(G_fs_test.T) + (1.0 - a2) * K_bg_test
                ll = lmm.nLLeval_test(y_test, res["beta"], sigma2=res["sigma2"], delta=delta, Kstar_star=K_test_test, robust=True)

                if self.debug:
                    ll2 = lmm.nLLeval_test(y_test, res["beta"], sigma2=res["sigma2"], delta=delta, Kstar_star=None, robust=True)
                    np.testing.assert_almost_equal(ll, ll2, decimal=4)

                logging.info("ll: %f" % (ll))
                self.ll[kfold_idx, k_idx] = ll

        logging.info("time taken for fold: %s" % str(time.time() - t0))

    best_k, best_mix, best_delta = self.select_best_k()
    logging.info("best_k: %i, best_mix: %f, best_delta: %f" % (best_k, best_mix, best_delta))

    # final scan
    if self.order_by_lmm:
        logging.info("final scan using LMM")
        gwas = FastGwas(G_bg, G0, y, delta=None, train_pcs=None, mixing=0.0, cov=cov)
        gwas.run_gwas()
        _pval = gwas.p_values
        feat_idx = np.argsort(_pval)[0:best_k]
    else:
        logging.info("final scan using LR")
        _F, _pval = lin_reg.f_regression_block(lin_reg.f_regression_cov_alt, G0, y, C=cov, blocksize=10000)

    logging.info("number of snps selected: %i" % (best_k))

    return best_k, feat_idx, best_mix, best_delta
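# Sketch of the reparameterization used in run_select: findH2_2K returns
# per-kernel variance fractions h2 (background) and h2_1 (foreground), from
# which the code recovers the mixing weight a2 and the noise ratio delta.
# The values below are hypothetical, only the arithmetic is the point.
h2_bg, h2_fg = 0.3, 0.2            # hypothetical findH2_2K output
a2 = h2_fg / float(h2_bg + h2_fg)  # share of genetic variance from the foreground kernel
h2_total = h2_bg + h2_fg
delta = (1.0 - h2_total) / h2_total
assert abs(a2 - 0.4) < 1e-12 and abs(delta - 1.0) < 1e-12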
def _getScalesPairwise(self, verbose=False, initDiagonal=False):
    """
    Internal function for parameter initialization
    Uses a single trait model for initializing variances
    and a pairwise model to initialize correlations
    """
    var = sp.zeros((self.P, 2))

    if initDiagonal:
        #1. fit single trait model
        if verbose:
            print('.. fit single-trait model for initialization')
        vc = VarianceDecomposition(self.Y[:, 0:1])
        for term_i in range(self.n_randEffs):
            if term_i == self.noisPos:
                vc.addRandomEffect(is_noise=True)
            else:
                K = self.vd.getTerm(term_i).getK()
                vc.addRandomEffect(K=K)
        scales0 = sp.sqrt(0.5) * sp.ones(2)

        for p in range(self.P):
            if verbose:
                print(' .. trait %d' % p)
            vc.setY(self.Y[:, p:p + 1])
            conv = vc.optimize(scales0=scales0)
            if not conv:
                print('warning initialization not converged')
            var[p, :] = vc.getVarianceComps()[0, :]

    elif fastlmm_present:
        if verbose:
            print('.. fit single-trait model for initialization (using fastlmm)')
        for p in range(self.P):
            if verbose:
                print(' .. trait %d' % p)
            covariates = None
            for term_i in range(self.n_randEffs):
                if term_i == self.noisPos:
                    pass
                else:
                    K = self.vd.getTerm(term_i).getK()
            varY = sp.var(self.Y[:, p:p + 1])
            lmm = fastLMM(X=covariates, Y=self.Y[:, p:p + 1], G=None, K=K)
            opt = lmm.findH2(nGridH2=100)
            h2 = opt['h2']
            var[p, :] = h2 * varY
            var[p, self.noisPos] = (1.0 - h2) * varY
        #import ipdb; ipdb.set_trace()
    else:
        if verbose:
            print('.. random initialization of diagonal')
        var = sp.random.randn(var.shape[0], var.shape[1])
        var = var * var + 0.001

    #2. fit pairwise model
    if verbose:
        print('.. fit pairwise model for initialization')
    vc = VarianceDecomposition(self.Y[:, 0:2])
    for term_i in range(self.n_randEffs):
        if term_i == self.noisPos:
            vc.addRandomEffect(is_noise=True, trait_covar_type='freeform')
        else:
            K = self.vd.getTerm(term_i).getK()
            vc.addRandomEffect(K=K, trait_covar_type='freeform')

    rho_g = sp.ones((self.P, self.P))
    rho_n = sp.ones((self.P, self.P))
    for p1 in range(self.P):
        for p2 in range(p1):
            if verbose:
                print(' .. fit pair (%d,%d)' % (p1, p2))
            vc.setY(self.Y[:, [p1, p2]])
            scales0 = sp.sqrt(sp.array([var[p1, 0], 1e-4, var[p2, 0], 1e-4,
                                        var[p1, 1], 1e-4, var[p2, 1], 1e-4]))
            conv = vc.optimize(scales0=scales0)
            if not conv:
                print('warning initialization not converged')
            Cg = vc.getTraitCovar(0)
            Cn = vc.getTraitCovar(1)
            rho_g[p1, p2] = Cg[0, 1] / sp.sqrt(Cg.diagonal().prod())
            rho_n[p1, p2] = Cn[0, 1] / sp.sqrt(Cn.diagonal().prod())
            rho_g[p2, p1] = rho_g[p1, p2]
            rho_n[p2, p1] = rho_n[p1, p2]

    #3. init
    Cg0 = rho_g * sp.dot(sp.sqrt(var[:, 0:1]), sp.sqrt(var[:, 0:1].T))
    Cn0 = rho_n * sp.dot(sp.sqrt(var[:, 1:2]), sp.sqrt(var[:, 1:2].T))
    offset_g = abs(sp.minimum(sp.linalg.eigh(Cg0)[0].min(), 0)) + 1e-4
    offset_n = abs(sp.minimum(sp.linalg.eigh(Cn0)[0].min(), 0)) + 1e-4
    Cg0 += offset_g * sp.eye(self.P)
    Cn0 += offset_n * sp.eye(self.P)
    Lg = sp.linalg.cholesky(Cg0)
    Ln = sp.linalg.cholesky(Cn0)
    Cg_params0 = sp.concatenate([Lg[:, p][:p + 1] for p in range(self.P)])
    Cn_params0 = sp.concatenate([Ln[:, p][:p + 1] for p in range(self.P)])
    scales0 = sp.concatenate([Cg_params0, 1e-2 * sp.ones(1),
                              Cn_params0, 1e-2 * sp.ones(1)])
    return scales0
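# Minimal numpy sketch of the PSD repair in step 3 above: shift a possibly
# indefinite covariance by |min(eigenvalue, 0)| + 1e-4 so that the Cholesky
# factorization is guaranteed to succeed. The matrix here is illustrative.
import numpy as np

C = np.array([[1.0, 1.2],
              [1.2, 1.0]])                        # indefinite: eigenvalues 2.2 and -0.2
offset = abs(min(np.linalg.eigh(C)[0].min(), 0.0)) + 1e-4
C_psd = C + offset * np.eye(C.shape[0])
L = np.linalg.cholesky(C_psd)                     # succeeds after the shift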
def _internal_single(G0_standardized, test_snps, pheno, covar,
                     G1_standardized, mixing,  #!!test mixing and G1
                     h2, log_delta, cache_file, interact_with_snp=None):
    assert h2 is None or log_delta is None, "if h2 is specified, log_delta may not be specified"
    if log_delta is not None:
        h2 = 1.0 / (np.exp(log_delta) + 1)

    covar = np.hstack((covar['vals'], np.ones((test_snps.iid_count, 1))))  # We always add 1's to the end.
    y = pheno['vals']

    from pysnptools.standardizer import DiagKtoN
    assert mixing is None or 0.0 <= mixing <= 1.0

    if cache_file is not None and os.path.exists(cache_file):
        lmm = fastLMM(X=covar, Y=y, G=None, K=None)
        with np.load(cache_file) as data:  #!! similar code in epistasis
            lmm.U = data['arr_0']
            lmm.S = data['arr_1']
    else:
        # combine two kernels (normalize kernels to diag(K)=N)
        G0_standardized_val = DiagKtoN(G0_standardized.val.shape[0]).standardize(G0_standardized.val)
        G1_standardized_val = DiagKtoN(G1_standardized.val.shape[0]).standardize(G1_standardized.val)

        if mixing == 0.0 or G1_standardized.sid_count == 0:
            G = G0_standardized.val
        elif mixing == 1.0 or G0_standardized.sid_count == 0:
            G = G1_standardized.val
        else:
            G = np.empty((G0_standardized.iid_count,
                          G0_standardized.sid_count + G1_standardized.sid_count))
            if mixing is None:
                mixing, h2 = _find_mixing(G, covar, G0_standardized_val, G1_standardized_val, h2, y)
            _mix(G, G0_standardized_val, G1_standardized_val, mixing)

        #TODO: make sure low-rank case is handled correctly
        lmm = fastLMM(X=covar, Y=y, G=G, K=None, inplace=True)

    if h2 is None:
        result = lmm.findH2()
        h2 = result['h2']
    logging.info("h2={0}".format(h2))

    snps_read = test_snps.read().standardize()

    if interact_with_snp is not None:
        print("interaction with %i" % interact_with_snp)
        interact = covar[:, interact_with_snp]
        interact -= interact.mean()
        interact /= interact.std()
        variables_to_test = snps_read.val * interact[:, np.newaxis]
    else:
        variables_to_test = snps_read.val

    res = lmm.nLLeval(h2=h2, dof=None, scale=1.0, penalty=0.0, snps=variables_to_test)

    if cache_file is not None and not os.path.exists(cache_file):
        pstutil.create_directory_if_necessary(cache_file)
        np.savez(cache_file, lmm.U, lmm.S)  # using np.savez instead of pickle because it seems to be faster to read and write

    beta = res['beta']
    chi2stats = beta * beta / res['variance_beta']
    #p_values = stats.chi2.sf(chi2stats,1)[:,0]
    if G0_standardized is not None:
        assert G.shape[0] == lmm.U.shape[0]
    p_values = stats.f.sf(chi2stats, 1, lmm.U.shape[0] - 3)[:, 0]  # note that lmm.U.shape[0] is the number of individuals and 3 is the number of fixed effects (covariates+SNP)

    items = [('SNP', snps_read.sid),
             ('Chr', snps_read.pos[:, 0]),
             ('GenDist', snps_read.pos[:, 1]),
             ('ChrPos', snps_read.pos[:, 2]),
             ('PValue', p_values),
             ('SnpWeight', beta[:, 0]),
             ('SnpWeightSE', np.sqrt(res['variance_beta'][:, 0])),
             ('SnpFractVarExpl', np.sqrt(res['fraction_variance_explained_beta'][:, 0])),
             ('Nullh2', np.zeros((snps_read.sid_count)) + h2)]
    frame = pd.DataFrame.from_items(items)
    return frame
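# Sketch of the interaction construction above in isolation: z-score the chosen
# covariate column and multiply it elementwise into each test SNP, so that each
# tested "SNP" becomes a SNP-by-covariate interaction term.
import numpy as np

rng = np.random.RandomState(2)
snps = rng.randn(100, 10)   # 100 individuals, 10 test SNPs
interact = rng.randn(100)   # covariate column to interact with
interact = (interact - interact.mean()) / interact.std()
variables_to_test = snps * interact[:, np.newaxis]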