def test_two_kernels_lowrank_REML(self):
    """Two kernels with a fixed delta=0.5 and REML=True.

    Evaluates the restricted log-likelihood twice -- once on a model
    built directly from the reduced SNP matrix and once on a model
    built from all SNPs with the extra indices excluded -- and checks
    that both evaluation paths agree on every returned statistic.
    """
    delta = 0.5
    # low rank because the individual count exceeds the SNP count
    n_individuals = 100   # number of individuals
    n_fixed = 1           # number of fixed effects
    n_snps = 40           # SNPs used to build the similarity matrix
    X, y, G0, G1, G0_small, exclude_idx = generate_random_data(
        n_individuals, n_fixed, n_snps)

    # reference model: built directly on the reduced SNP set
    reference = getLMM()
    reference.setG(G0_small, G1)
    reference.setX(X)
    reference.sety(y)
    ret_nocut = reference.nLLeval(REML=True, delta=delta)

    # masked model: full SNP set with indices excluded afterwards
    masked = getLMM()
    masked.setG(G0, G1)
    masked.setX(X)
    masked.sety(y)
    masked.set_exclude_idx(exclude_idx)
    ret_cut = masked.nLLeval(REML=True, delta=delta)

    # both paths must produce identical statistics, key by key
    for key in ret_nocut.keys():
        self.assertAlmostEqual(ret_cut[key], ret_nocut[key])
def test_two_kernels_lowrank_REML(self):
    """REML-likelihood agreement test for the two-kernel, low-rank case.

    With delta fixed at 0.5, the model fitted on the reduced SNP matrix
    must return the same statistics as the model fitted on all SNPs
    with the surplus SNP indices excluded.
    """
    delta = 0.5
    # N > s_c, so the genetic similarity matrix is low rank
    N, d, s_c = 100, 1, 40
    X, y, G0, G1, G0_small, exclude_idx = generate_random_data(N, d, s_c)

    def evaluate(G_first, excluded=None):
        # helper: assemble an LMM and evaluate its restricted nLL
        lmm = getLMM()
        lmm.setG(G_first, G1)
        lmm.setX(X)
        lmm.sety(y)
        if excluded is not None:
            lmm.set_exclude_idx(excluded)
        return lmm.nLLeval(REML=True, delta=delta)

    ret_nocut = evaluate(G0_small)
    ret_cut = evaluate(G0, excluded=exclude_idx)

    # make sure results are the same
    for key in ret_nocut.keys():
        self.assertAlmostEqual(ret_cut[key], ret_nocut[key])
def _nullModelMixedEffectLinear(self, G0=None, K0=None):
    """Fit the null mixed-effect linear model and store it in self.model0."""
    null_lmm = inference.getLMM(forcefullrank=self.forcefullrank)
    if G0 is not None:
        null_lmm.setG(G0=G0, K0=K0)
    null_lmm.setX(self.X)
    null_lmm.sety(self.Y)
    # the null model has a single kernel, so only h2 needs to be found
    self.model0 = null_lmm.findH2()
def test_nLLeval_2(self):
    """Regression check of nLLeval with delta=1 and REML enabled."""
    model = getLMM()
    model.setG(G0=self._G0, G1=self._G1, a2=self._a2)
    model.setX(self._X)
    model.sety(self._y)
    result = model.nLLeval(REML=True, delta=1.0)

    expected = {
        'scale': 1.0,
        'h2': 0.0,
        'beta': NP.array([0.05863443]),
        'a2': 0.4,
        'REML': True,
        'nLL': 90.940636012858121,
        'sigma2': 0.96761436076968987,
    }
    # every returned statistic must match the recorded reference value
    for key, value in result.items():
        self.assertAlmostEqual(value, expected[key])
def test_nLLeval_1(self):
    """Regression check of nLLeval with delta=1 and REML disabled."""
    model = getLMM()
    model.setG(G0=self._G0, G1=self._G1, a2=self._a2)
    model.setX(self._X)
    model.sety(self._y)
    result = model.nLLeval(REML=False, delta=1.0)

    expected = {
        'scale': 1.0,
        'h2': 0.0,
        'beta': NP.array([0.05863443]),
        'a2': 0.4,
        'REML': False,
        'nLL': 91.92983775736522,
        'sigma2': 0.94826207355429604,
    }
    # every returned statistic must match the recorded reference value
    for key, value in result.items():
        self.assertAlmostEqual(value, expected[key])
def test_nLLeval_3(self):
    """Regression check of nLLeval with delta=None, h2=0.5, REML enabled."""
    model = getLMM()
    model.setG(G0=self._G0, G1=self._G1, a2=self._a2)
    model.setX(self._X)
    model.sety(self._y)
    result = model.nLLeval(REML=True, delta=None, h2=0.5)

    expected = {
        'scale': 1.0,
        'h2': 0.5,
        'beta': NP.array([0.05863443]),
        'a2': 0.4,
        'REML': True,
        'nLL': 90.940636012858121,
        'sigma2': 1.9352287215393797,
    }
    # array-aware comparison: 'beta' is a numpy array
    for key in result.keys():
        NP.testing.assert_array_almost_equal(result[key], expected[key])
def test_one_kernel_fullrank(self):
    """Single-kernel check at delta=1.0 with REML disabled.

    Compares the ML fit and the mean predictions of a model built on
    the reduced SNP matrix against a full-matrix model with the extra
    SNP indices excluded.
    """
    delta = 1.0
    # N <= s_c, so the similarity matrix has full rank
    N, d, s_c = 10, 1, 40
    X, y, G0, G1, G0_small, exclude_idx = generate_random_data(N, d, s_c)

    # model built directly on the reduced SNP matrix
    reduced = getLMM()
    reduced.setG(G0_small)
    reduced.setX(X)
    reduced.sety(y)
    ret_nocut = reduced.nLLeval(REML=False, delta=delta)
    reduced.setTestData(Xstar=X[:3], K0star=None, K1star=None,
                        G0star=G0_small[:3], G1star=None)
    ypred_nocut = reduced.predictMean(beta=ret_nocut['beta'], delta=delta)

    # model built on all SNPs with indices excluded afterwards
    excluded = getLMM()
    excluded.setG(G0)
    excluded.setX(X)
    excluded.sety(y)
    excluded.set_exclude_idx(exclude_idx)
    ret_cut = excluded.nLLeval(REML=False, delta=delta)
    excluded.setTestData(Xstar=X[:3], G0star=G0[:3])
    ypred_cut = excluded.predictMean(beta=ret_cut['beta'], delta=delta)

    # likelihood statistics must agree key by key
    for key in ret_nocut.keys():
        self.assertAlmostEqual(ret_cut[key], ret_nocut[key])

    # compare predictions through a random scalar projection
    wproj = SP.random.randn(ypred_nocut.shape[0])
    self.assertAlmostEqual((wproj * ypred_nocut).sum(),
                           (wproj * ypred_cut).sum())
def core_run(snpreader, pheno_fn, k, delta):
    """
    Extracted core functionality, to avoid shuffle of data and not correct delta.

    Runs 10-fold cross-validation: within each fold, ranks SNPs by
    univariate regression p-value on the training split, keeps the top k,
    fits an LMM with the given delta, and records the held-out
    log-likelihood.

    Returns a numpy array of length 10 with one nLLeval_test value per fold.
    """
    G, X, y = load_snp_data(snpreader, pheno_fn, standardizer=Unit())
    # Modern scikit-learn KFold API: the constructor no longer takes the
    # sample count or `indices=`; fold indices come from .split() on an
    # index array (the old KFold(len(y), n_folds=10, indices=False) form
    # was removed from sklearn, and the sibling core_run in this file
    # already uses the new API).
    kf = KFold(n_splits=10, shuffle=False).split(list(range(len(y))))

    ll = np.zeros(10)

    fold_idx = 0
    fold_data = {}  # keys overwritten each fold; only current fold is live
    for split_idx, (train_idx, test_idx) in enumerate(kf):
        fold_idx += 1

        fold_data["train_idx"] = train_idx
        fold_data["test_idx"] = test_idx

        # set up data
        ##############################
        fold_data["G_train"] = G[train_idx, :].read()
        fold_data["G_test"] = G[test_idx, :]

        fold_data["X_train"] = X[train_idx]
        fold_data["X_test"] = X[test_idx]

        fold_data["y_train"] = y[train_idx]
        fold_data["y_test"] = y[test_idx]

        # feature selection: rank SNPs by p-value on the training split only
        ##############################
        _F, _pval = lin_reg.f_regression_block(
            lin_reg.f_regression_cov_alt,
            fold_data["G_train"].val,
            fold_data["y_train"],
            blocksize=1E4,
            C=fold_data["X_train"])
        feat_idx = np.argsort(_pval)
        fold_data["feat_idx"] = feat_idx

        # re-order SNPs (and cut to max num)
        ##############################
        fold_data["G_train"] = fold_data["G_train"][:, feat_idx[0:k]].read()
        fold_data["G_test"] = fold_data["G_test"][:, feat_idx[0:k]].read()

        model = getLMM()
        model.setG(fold_data["G_train"].val)
        model.sety(fold_data["y_train"])
        model.setX(fold_data["X_train"])
        REML = False

        # fit with fixed delta, then score the held-out fold
        res = model.nLLeval(delta=delta, REML=REML)
        model.setTestData(Xstar=fold_data["X_test"],
                          G0star=fold_data["G_test"].val)
        model.predictMean(beta=res["beta"], delta=delta)
        ll[split_idx] = model.nLLeval_test(fold_data["y_test"], res["beta"],
                                           sigma2=res["sigma2"], delta=delta)

    return ll
def core_run(snpreader, pheno_fn, k, delta):
    """
    extracted core functionality, to avoid shuffle of data and not correct delta
    """
    G, X, y = load_snp_data(snpreader, pheno_fn, standardizer=Unit())
    splitter = KFold(n_splits=10, shuffle=False).split(list(range(len(y))))

    ll = np.zeros(10)
    fold_idx = 0
    fold_data = {}

    for split_idx, (train_idx, test_idx) in enumerate(splitter):
        fold_idx += 1
        fold_data["train_idx"] = train_idx
        fold_data["test_idx"] = test_idx

        # partition genotypes, covariates and phenotype for this fold
        fold_data["G_train"] = G[train_idx, :].read()
        fold_data["G_test"] = G[test_idx, :]
        fold_data["X_train"] = X[train_idx]
        fold_data["X_test"] = X[test_idx]
        fold_data["y_train"] = y[train_idx]
        fold_data["y_test"] = y[test_idx]

        # rank SNPs by univariate p-value computed on the training split
        _F, _pval = lin_reg.f_regression_block(
            lin_reg.f_regression_cov_alt,
            fold_data["G_train"].val,
            fold_data["y_train"],
            blocksize=1E4,
            C=fold_data["X_train"])
        feat_idx = np.argsort(_pval)
        fold_data["feat_idx"] = feat_idx

        # keep only the k best-ranked SNPs, re-ordered by rank
        fold_data["G_train"] = fold_data["G_train"][:, feat_idx[0:k]].read()
        fold_data["G_test"] = fold_data["G_test"][:, feat_idx[0:k]].read()

        model = getLMM()
        model.setG(fold_data["G_train"].val)
        model.sety(fold_data["y_train"])
        model.setX(fold_data["X_train"])
        REML = False

        # fit on the training fold, then score the held-out fold
        res = model.nLLeval(delta=delta, REML=REML)
        model.setTestData(Xstar=fold_data["X_test"],
                          G0star=fold_data["G_test"].val)
        model.predictMean(beta=res["beta"], delta=delta)
        ll[split_idx] = model.nLLeval_test(fold_data["y_test"], res["beta"],
                                           sigma2=res["sigma2"], delta=delta)

    return ll
def test_one_kernel_fullrank(self):
    """One-kernel test (delta=1.0, REML=False): excluded-SNP fit and
    predictions must match the fit on the reduced SNP matrix."""
    delta = 1.0
    # full rank as N <= s_c
    N = 10    # individuals
    d = 1     # fixed effects
    s_c = 40  # SNPs in the similarity matrix
    X, y, G0, G1, G0_small, exclude_idx = generate_random_data(N, d, s_c)

    def fit_and_predict(G_snps, excluded=None):
        # helper: fit an LMM and predict means for the first 3 samples
        lmm = getLMM()
        lmm.setG(G_snps)
        lmm.setX(X)
        lmm.sety(y)
        if excluded is not None:
            lmm.set_exclude_idx(excluded)
        ret = lmm.nLLeval(REML=False, delta=delta)
        lmm.setTestData(Xstar=X[:3], G0star=G_snps[:3])
        ypred = lmm.predictMean(beta=ret['beta'], delta=delta)
        return ret, ypred

    ret_nocut, ypred_nocut = fit_and_predict(G0_small)
    ret_cut, ypred_cut = fit_and_predict(G0, excluded=exclude_idx)

    # make sure results are the same
    for key in ret_nocut.keys():
        self.assertAlmostEqual(ret_cut[key], ret_nocut[key])

    # predictions compared through a random projection onto a scalar
    wproj = SP.random.randn(ypred_nocut.shape[0])
    self.assertAlmostEqual((wproj * ypred_nocut).sum(),
                           (wproj * ypred_cut).sum())
def test_predictions(self):
    """predictMean must equal the explicit fixed-effect term plus the
    posterior-weighted genetic contribution, in logdelta space."""
    model = getLMM()
    model.setG(G0=self._G0, G1=self._G1, a2=self._a2)
    model.setX(self._X)
    model.sety(self._y)

    # prediction from the model, logdelta space
    model.setTestData(Xstar=self._Xstar, G0star=self._G0star,
                      G1star=self._G1star)
    ystar = model.predictMean(self._beta, logdelta=self._logdelta)

    # rebuild the same prediction by hand from the posterior weights
    Gstar = SP.concatenate(
        (SP.sqrt(1.0 - self._a2) * self._G0star,
         SP.sqrt(self._a2) * self._G1star), 1)
    weights = model.getPosteriorWeights(beta=self._beta,
                                        logdelta=self._logdelta)
    expected = SP.dot(self._Xstar, self._beta) + SP.dot(Gstar, weights)

    self.assertAlmostEqual(ystar[0], expected[0])
    self.assertAlmostEqual(ystar[1], expected[1])
def _altModelMixedEffectLinear(self, G1, tol=0.0):
    """Fit the alternative mixed-effect model and return
    (likelihood dict, LRT statistic, alt-equals-null flag)."""
    alt_lmm = inference.getLMM(forcefullrank=self.forcefullrank)
    if self.G0 is None:
        # one-kernel alternative: only h2 needs to be optimized
        alt_lmm.setG(G1)
        alt_lmm.setX(self.X)
        alt_lmm.sety(self.Y)
        lik1 = alt_lmm.findH2()
        alteqnull = lik1['h2'] <= (0.0 + tol)
    else:
        # two-kernel alternative: both a2 and h2 are optimized
        alt_lmm.setG(self.G0, G1)
        alt_lmm.setX(self.X)
        alt_lmm.sety(self.Y)
        lik1 = alt_lmm.findA2()
        alteqnull = lik1['a2'] <= (0.0 + tol)
    # likelihood-ratio statistic against the stored null model
    stat = 2.0 * (self.model0['nLL'] - lik1['nLL'])
    self.model1 = alt_lmm
    return (lik1, stat, alteqnull)
def test_nLLeval_2(self):
    """Small regression test of the negative log-likelihood:
    delta=1, REML=True."""
    model = getLMM()
    model.setG(G0=self._G0, G1=self._G1, a2=self._a2)
    model.setX(self._X)
    model.sety(self._y)

    result = model.nLLeval(REML=True, delta=1.0)
    reference = {
        'scale': 1.0,
        'h2': 0.0,
        'beta': NP.array([0.05863443]),
        'a2': 0.4,
        'REML': True,
        'nLL': 90.940636012858121,
        'sigma2': 0.96761436076968987,
    }
    # compare each returned statistic against the recorded reference
    for key in result.keys():
        self.assertAlmostEqual(result[key], reference[key])
def test_nLLeval_1(self):
    """Small regression test of the negative log-likelihood:
    delta=1, REML=False."""
    model = getLMM()
    model.setG(G0=self._G0, G1=self._G1, a2=self._a2)
    model.setX(self._X)
    model.sety(self._y)

    result = model.nLLeval(REML=False, delta=1.0)
    reference = {
        'scale': 1.0,
        'h2': 0.0,
        'beta': NP.array([0.05863443]),
        'a2': 0.4,
        'REML': False,
        'nLL': 91.92983775736522,
        'sigma2': 0.94826207355429604,
    }
    # compare each returned statistic against the recorded reference
    for key in result.keys():
        self.assertAlmostEqual(result[key], reference[key])
def dowork(self, fold_idx):
    # Run cross-validation for one fold: for each candidate SNP count k,
    # fit an LMM on the training split and score the held-out split.
    # Returns (fold_idx, per-k MSE, per-k test log-likelihood, per-k best delta).
    self.feature_selection_strategy.run_once()
    # clamp every k to the number of SNPs actually available
    for i_k,k in enumerate(self.k_values):
        self.k_values[i_k]=min(self.k_values[i_k],self.feature_selection_strategy.snpreader.sid_count)
    # largest k that still needs per-fold feature selection
    # (the all-SNP setting is served by the precomputed kernel below)
    max_k = max([1]+[k for k in self.k_values if k != self.feature_selection_strategy.snpreader.sid_count])
    split_iterator = self.feature_selection_strategy.setup_linear_regression(max_k, start=fold_idx, stop=None)
    fold_data = next(split_iterator)
    tt0 = time.time()
    # result containers differ by strategy: full CV keeps a (k x delta) grid,
    # in-sample CV keeps one entry per k plus the chosen delta per k
    if self.strategy == "lmm_full_cv":
        mse_cv1 = np.zeros((len(self.k_values), len(self.delta_values)))
        ll_cv1 = np.zeros((len(self.k_values), len(self.delta_values)))
        best_delta_for_k_1 = None
    elif self.strategy=="insample_cv":
        mse_cv1 = np.zeros((len(self.k_values)))
        ll_cv1 = np.zeros((len(self.k_values)))
        best_delta_for_k_1 = np.zeros((len(self.k_values)))
    else:
        raise NotImplementedError("not implemented")
    logging.info("reporter:counter:PerformSelectionDistributable,foldcount,1")
    for k_idx, k in enumerate(self.k_values):
        logging.info("processing fold={0}, k={1}".format(fold_idx,k))
        logging.info("reporter:status:processing fold={0}, k={1}".format(fold_idx,k))
        logging.info("reporter:counter:PerformSelectionDistributable,k,1")
        model = fastlmm.getLMM()
        # compute kernel externally when all SNPs are used or k does not
        # fit in memory; otherwise use the cached per-fold feature matrix
        if k == self.feature_selection_strategy.snpreader.sid_count or k >= self.feature_selection_strategy.num_snps_in_memory:
            if k == self.feature_selection_strategy.snpreader.sid_count:
                # use precomputed kernel on all SNPs
                logging.info("using precomputed kernel on all snps")
                K = self.feature_selection_strategy.K
            else:
                # build kernel in blocks from snpreader (from file)
                logging.info("building kernel in blocks")
                top_k_feat_idx = fold_data["feat_idx"][0:int(k)]
                subset = self.feature_selection_strategy.snpreader[:,top_k_feat_idx]
                K = subset.kernel(self.feature_selection_strategy.standardizer,blocksize=self.feature_selection_strategy.blocksize)
            # slice the full kernel into train/train, train/test and
            # test/test sub-blocks for fitting and prediction
            train_idx = fold_data["train_idx"]
            test_idx = fold_data["test_idx"]
            K_train_lhs = K[train_idx]
            K_train = K_train_lhs[:,train_idx]
            K_train_test = K_train_lhs[:,test_idx].T
            K_test_test = K[test_idx][:,test_idx]
            model.setK(K_train)
            model.setTestData(Xstar=fold_data["X_test"], K0star=K_train_test)
            #np.testing.assert_array_almost_equal(model.K, K_train, decimal=4)
            #np.testing.assert_array_almost_equal(model.Kstar, K_train_test, decimal=4)
        # use precomputed features as before
        else:
            logging.info("using cached data to build kernel")
            outer_G_train = fold_data["G_train"][:,0:k]
            outer_G_test = fold_data["G_test"][:,0:k]
            model.setG(outer_G_train.val)
            model.setTestData(Xstar=fold_data["X_test"], G0star=outer_G_test.val)
            K_test_test = None
        model.sety(fold_data["y_train"])
        model.setX(fold_data["X_train"])
        if self.strategy == "lmm_full_cv":
            # evaluate the whole delta grid for this k
            for delta_idx, delta_act in enumerate(self.delta_values):
                # rescale delta by k (guard against k == 0)
                if k:
                    delta = delta_act * k
                else:
                    delta = delta_act
                REML = True  # NOTE(review): original TODO questioned this choice; REML=True is what runs
                # fit, then predict and score on the held-out split
                res = model.nLLeval(delta=delta, REML=REML,penalty=self.penalty)
                out = model.predictMean(beta=res["beta"], delta=delta)
                mse_cv1[k_idx, delta_idx] = mean_squared_error(fold_data["y_test"], out)
                ll_cv1[k_idx, delta_idx] = model.nLLeval_test(fold_data["y_test"], res["beta"], sigma2=res["sigma2"], delta=delta, Kstar_star=K_test_test)
        elif self.strategy == "insample_cv":
            best_res = None       # NOTE(review): dead except in the disabled
            best_delta = None     # `if 0:` branch below
            best_nLL = float("inf")
            REML = True
            # Note: with brent=True there will always be many unique delta
            # values, as these deviate from the evaluation grid.
            brent = True

            # evaluate the negative log-likelihood over the delta grid,
            # tracking the best result (and its k-rescaled delta) in resmin
            import fastlmm.util.mingrid as mingrid
            resmin = [None]
            def f(x):
                # rescale the candidate delta by k (guard against k == 0)
                if k:
                    delta_corr = x * k
                else:
                    delta_corr = x
                myres = model.nLLeval(delta = delta_corr, REML = REML,penalty=self.penalty)
                if (resmin[0] is None) or (myres['nLL']<resmin[0]['nLL']):
                    resmin[0]=myres
                    resmin[0]["delta_corr"] = delta_corr
                    resmin[0]["delta"] = x
                return myres["nLL"]
            res = mingrid.minimize1D(f,evalgrid = self.delta_values,brent = brent)

            if 0:  # old grid-only code without brent search, kept disabled
                for delta_idx, delta_act in enumerate(self.delta_values):
                    delta = delta_act * k  # rescale delta for X val.
                    res = model.nLLeval(delta=delta,REML=REML,penalty=self.penalty)
                    # TODO: check if we need scale
                    if res["nLL"] < best_nLL:
                        best_res = res
                        best_delta_act = delta_act
                        best_delta = delta
                        best_nLL = res["nLL"]

            # score the held-out split with the best delta found above
            out = model.predictMean(beta=resmin[0]["beta"], delta=resmin[0]["delta_corr"])
            mse_cv1[k_idx] = mean_squared_error(fold_data["y_test"], out)
            ll_cv1[k_idx] = model.nLLeval_test(fold_data["y_test"], resmin[0]["beta"], sigma2=resmin[0]["sigma2"], delta=resmin[0]["delta_corr"], Kstar_star=K_test_test)
            best_delta_for_k_1[k_idx] = resmin[0]["delta"]

    logging.info("crossval time %.2f s" % (float(time.time() - tt0)))
    return fold_idx, mse_cv1, ll_cv1, best_delta_for_k_1