def slingshot(adata, start, n_pcs=5, cl=None):
    """Run the R ``slingshot`` package on the PCA embedding of ``adata``.

    :param adata: object exposing ``obsm['X_pca']``, ``obs``, ``obs_names``
        and ``uns`` (presumably an AnnData instance -- confirm with callers)
    :param start: cluster label passed to R as ``start.clus``
    :param n_pcs: number of leading ``X_pca`` columns handed to R
    :param cl: name of the ``adata.obs`` column holding the cluster labels;
        NOTE(review): the ``None`` default is looked up in ``adata.obs`` as-is,
        so callers are expected to pass a real column name
    :returns: ``adata``, with one ``'<cl>_lineage_<i>'`` pseudotime column per
        lineage added to ``obs`` and the lineages stored under
        ``uns['slingshot']['lineages']``
    """
    import numpy as np
    import pandas as pd
    import rpy2.robjects as ro
    from rpy2.robjects import numpy2ri, pandas2ri
    from rpy2.robjects.packages import importr

    importr('slingshot')
    numpy2ri.activate()
    pandas2ri.activate()
    try:
        ro.r.assign('pca', adata.obsm['X_pca'][:, :n_pcs])
        ro.r.assign('cl', adata.obs[cl])
        ro.reval('sds <- newSlingshotDataSet(pca, cl)')
        ro.reval(f'sce <- slingshot(sds, cl, start.clus="{start}")')

        pt = pd.DataFrame(np.asarray(ro.reval('slingPseudotime(sce)')),
                          index=adata.obs_names)
        pt.columns = [f'{cl}_lineage_{c}' for c in pt.columns]

        # Replace pseudotime columns left over from a previous run, if any.
        try:
            adata.obs = adata.obs.drop(pt.columns, axis=1)
        except KeyError:
            print('PT keys not dropped in obs dataframe: Not found.')
        adata.obs = pd.concat([adata.obs, pt], axis=1)

        adata.uns['slingshot'] = {}
        adata.uns['slingshot']['lineages'] = {}
        lineages = np.asarray(ro.reval('sce@lineages'))
        for i, lineage in enumerate(lineages):
            adata.uns['slingshot']['lineages'][i] = list(np.asarray(lineage))
    finally:
        # The converters are process-wide switches: always turn them back off,
        # even when one of the R calls above raises.
        numpy2ri.deactivate()
        pandas2ri.deactivate()
    return adata
def fit_curve(data, circle=False, iterations=500, stretch=None, threshold=0.00001):
    """
    Fit a principal curve with the R ``princurve`` package.

    :param data: numpy array, shape (n_samples, n_features), to be denoised
    :param circle: True if fitting starts with a circle, useful for denoising closed curves
    :param iterations: maximum number of iterations
    :param stretch: parameter that affects curve extrapolation; when None it
        defaults to 0 for ``circle=True`` and to 2 otherwise
    :param threshold: convergence threshold on shortest distances to the curve
    :returns: denoised data in numpy array with shape (n_samples, n_features)
    """
    # For more information see:
    # https://cran.r-project.org/web/packages/princurve/princurve.pdf
    if circle:
        smoother, default_stretch = 'periodic.lowess', 0
    else:
        smoother, default_stretch = 'smooth.spline', 2
    if stretch is None:
        stretch = default_stretch

    numpy2ri.activate()
    try:
        pc = princurve.principal_curve(data, maxit=iterations, stretch=stretch,
                                       smoother=smoother, thresh=threshold)
    finally:
        # Do not leave the global numpy conversion on if the R call fails.
        numpy2ri.deactivate()
    return np.array(pc[0])
def activate():
    """Install a converter combining the numpy rules with the rules in scope.

    The converter in place before the call is snapshotted into the global
    ``original_converter``; a non-None snapshot means the module is already
    active and the call is a no-op.
    """
    global original_converter

    if original_converter is not None:
        # Already activated: nothing to do.
        return

    snapshot_name = 'snapshot before pandas conversion'
    original_converter = conversion.make_converter(snapshot_name,
                                                   template=conversion.converter)

    # Briefly switch numpy conversion on so the new converter is built from a
    # template that already contains the numpy rules.
    numpy2ri.activate()
    new_converter = conversion.make_converter(snapshot_name,
                                              template=conversion.converter)
    numpy2ri.deactivate()

    rule_sets = (
        (py2ri, new_converter.py2ri),
        (ri2ro, new_converter.ri2ro),
        (py2ro, new_converter.py2ro),
        (ri2py, new_converter.ri2py),
    )
    for source, target in rule_sets:
        for cls, func in source.registry.items():
            # The catch-all rule registered for `object` is not copied.
            if cls is not object:
                target.register(cls, func)

    conversion.converter = new_converter
    (_name,
     conversion.ri2ro,
     conversion.py2ri,
     conversion.py2ro,
     conversion.ri2py,
     _lineage) = new_converter
def gaussian_setup(X, Y, run_CV=True):
    """
    Some calculations that can be reused by methods:
    lambda.min, lambda.1se, lambda.theory and Reid et al. estimate of noise

    :param X: 2-d numpy design matrix (its shape is unpacked as ``n, p``)
    :param Y: response vector
    :param run_CV: when False, skip the R ``cv.glmnet`` fit entirely
    :returns: ``(lambda.min * sqrt(n), lambda.1se * sqrt(n), l_theory,
        sigma_reid)``; with ``run_CV=False`` the R-derived entries are None,
        i.e. ``(None, None, l_theory, None)``
    """
    n, p = X.shape

    # Columns scaled to unit Euclidean norm before simulating the noise term.
    Xn = X / np.sqrt((X**2).sum(0))[None, :]
    l_theory = np.fabs(Xn.T.dot(np.random.standard_normal(
        (n, 500)))).max(1).mean() * np.ones(p) * np.std(Y)

    if not run_CV:
        return None, None, l_theory, None

    numpy2ri.activate()
    try:
        rpy.r.assign('X', X)
        rpy.r.assign('Y', Y)
        rpy.r('X=as.matrix(X)')
        rpy.r('Y=as.numeric(Y)')
        rpy.r('G = cv.glmnet(X, Y, intercept=FALSE, standardize=FALSE)')
        rpy.r(
            'sigma_reid = selectiveInference:::estimate_sigma(X, Y, coef(G, s="lambda.min")[-1]) # sigma via Reid et al.'
        )
        rpy.r("L = G[['lambda.min']]")
        rpy.r("L1 = G[['lambda.1se']]")
        L = rpy.r('L')
        L1 = rpy.r('L1')
        sigma_reid = rpy.r('sigma_reid')[0]
    finally:
        # Always undo the global numpy conversion, even if an R call raised.
        numpy2ri.deactivate()

    root_n = np.sqrt(n)
    return L * root_n, L1 * root_n, l_theory, sigma_reid
def hugeR(X, lambda_threshold):
    """
    This function computes the covariance matrix and the corresponding
    sparse inverse covariance matrix of the numpy input matrix X, using
    the huge R package by Liu et al.

    It transforms the variables in X into the nonparanormal family, which
    then allows us to use the glasso algorithm to estimate the sparse
    inverse covariance matrix.

    The lossy pre-screening isn't used here to speed up the calculations
    because we prefilter X so n ~ p.

    We test 30 lambda values for regularisation. The best model is selected
    using the 'stars' stability approach, with ``lambda_threshold`` passed as
    ``stars_thresh``.

    For more details check docs and vignette:
    https://cran.r-project.org/web/packages/huge/huge.pdf
    http://r.meteo.uni.wroc.pl/web/packages/huge/vignettes/vignette.pdf

    :returns: ``(cov, prec)`` numpy arrays built from the ``opt.cov`` and
        ``opt.icov`` entries of the selected model
    """
    base = importr('base')
    huge = importr('huge')

    # this allows us to send numpy to R directly, neat
    numpy2ri.activate()
    try:
        X_npn = huge.huge_npn(X, npn_func="shrinkage")
        model = huge.huge(X_npn, nlambda=30, method='glasso', scr=False,
                          cov_output=True)
        model_stars = huge.huge_select(model, criterion="stars",
                                       stars_thresh=lambda_threshold)
        cov = np.array(base.as_matrix(model_stars.rx('opt.cov')[0]))
        prec = np.array(base.as_matrix(model_stars.rx('opt.icov')[0]))
    finally:
        # we need to turn this off once we're done, also when R fails
        numpy2ri.deactivate()
    return cov, prec
def test_ROSI_gaussian_JM():
    """Compare Python ROSI p-values (JM approximate inverse) with the R ones.

    Instances are redrawn until the lasso selects more than 4 variables; the
    two sets of p-values must then correlate above 0.999.
    """
    n, p, s = 100, 30, 15

    while True:
        X, y, _, _, sigma, _ = gaussian_instance(n=n, p=p, s=s,
                                                 equicorrelated=False,
                                                 signal=4)
        lam = 7. * np.sqrt(n)
        X *= np.sqrt(n)
        L = ROSI.gaussian(X, y, lam, approximate_inverse='JM')
        L.sparse_inverse = True
        L.fit()
        print('here', len(L.active))

        if len(L.active) <= 4:
            continue

        S = L.summary(compute_intervals=False, dispersion=sigma**2)

        numpy2ri.activate()
        try:
            rpy.r.assign("X", X)
            rpy.r.assign("y", y)
            rpy.r.assign("sigma_est", sigma)
            rpy.r.assign("lam", lam)
            rpy.r("""
            y = as.numeric(y)
            n = nrow(X)
            p = ncol(X)
            penalty_factor = rep(1, p);
            soln = selectiveInference:::solve_problem_glmnet(X, y, lam/n, penalty_factor=penalty_factor, family="gaussian")
            PVS = ROSI(X, y, soln, lambda=lam, penalty_factor=penalty_factor, dispersion=sigma_est^2, family="gaussian", solver="QP", construct_ci=FALSE, use_debiased=TRUE)
            active_vars=PVS$active_vars - 1 # for 0-based
            pvalues = PVS$pvalues
            """)
            pvalues = np.asarray(rpy.r('pvalues'))
            pvalues = pvalues[~np.isnan(pvalues)]

            print(pvalues)
            print(np.asarray(S['pvalue']))

            nt.assert_true(np.corrcoef(pvalues, S['pvalue'])[0, 1] > 0.999)
        finally:
            # A failing assertion must not leave numpy conversion switched on
            # for the tests that run afterwards.
            numpy2ri.deactivate()
        break
def select(self):
    """Run R ``knockoff.filter`` with knockoffs drawn from ``self.knockoff_chol``.

    Reads ``self.X``, ``self.Y`` and ``self.q`` (passed to R as ``fdr``).

    :returns: the zero-based selected indices as two separate integer arrays
        (the same selection twice), or ``([], [])`` if the filter step fails.
        Errors while defining the R ``knockoffs`` function still propagate.
    """
    numpy2ri.activate()
    try:
        rpy.r.assign('chol_k', self.knockoff_chol)
        rpy.r('''
        knockoffs = function(X) {
            mu = rep(0, ncol(X))
            mu_k = X # sweep(X, 2, mu, "-") %*% SigmaInv_s
            X_k = mu_k + matrix(rnorm(ncol(X) * nrow(X)), nrow(X)) %*% chol_k
            return(X_k)
        }
        ''')

        try:
            rpy.r.assign('X', self.X)
            rpy.r.assign('Y', self.Y)
            rpy.r.assign('q', self.q)
            rpy.r(
                'V=knockoff.filter(X, Y, fdr=q, knockoffs=knockoffs)$selected')
            rpy.r('if (length(V) > 0) {V = V-1}')
            V = rpy.r('V')
            # `np.int` was removed from NumPy; with the old spelling the
            # resulting AttributeError was swallowed below and the method
            # silently reported an empty selection.
            return np.asarray(V, int), np.asarray(V, int)
        except Exception:
            # Best effort: a failing filter is reported as "nothing selected".
            return [], []
    finally:
        numpy2ri.deactivate()
def dlsa(Sig_inv_, beta_, sample_size, fit_intercept=False):
    '''Distributed Least Squares Approximation

    Calls ``lars_lsa`` on ``Sig_inv_`` and ``beta_`` (both converted with
    ``np.asarray``) and picks the coefficient rows with the smallest AIC and
    the smallest BIC.

    :param fit_intercept: forwarded as ``intercept``; when true, ``beta_``
        must provide ``to_numpy()`` and its first entry is added to the
        fitted ``beta0`` before it is prepended to the coefficients
    :returns: DataFrame with columns ``beta_byAIC`` and ``beta_byBIC``
    '''
    numpy2ri.activate()
    try:
        dfitted = lars_lsa(np.asarray(Sig_inv_), np.asarray(beta_),
                           intercept=fit_intercept, n=sample_size)
    finally:
        # Keep the global numpy conversion from leaking when lars_lsa raises.
        numpy2ri.deactivate()

    AIC = robjects.FloatVector(dfitted.rx2("AIC"))
    AIC_minIdx = np.argmin(AIC)
    BIC = robjects.FloatVector(dfitted.rx2("BIC"))
    BIC_minIdx = np.argmin(BIC)
    beta = np.array(robjects.FloatVector(dfitted.rx2("beta")))

    if fit_intercept:
        beta_byOLS = beta_.to_numpy()
        beta0 = np.array(robjects.FloatVector(dfitted.rx2("beta0"))) + beta_byOLS[0]

        beta_byAIC = np.hstack([beta0[AIC_minIdx], beta[AIC_minIdx, :]])
        beta_byBIC = np.hstack([beta0[BIC_minIdx], beta[BIC_minIdx, :]])
    else:
        beta_byAIC = beta[AIC_minIdx, :]
        beta_byBIC = beta[BIC_minIdx, :]

    return pd.DataFrame({"beta_byAIC": beta_byAIC, "beta_byBIC": beta_byBIC})
def Rpval(X, Y, W, noise_scale=None):
    """Run the R ``selectiveInference`` randomized lasso and its inference.

    :param X: design matrix, assigned to R as ``X``
    :param Y: response, assigned to R as ``Y``
    :param W: assigned to R as ``lam`` and passed to ``randomizedLasso``
    :param noise_scale: when given, forwarded to ``randomizedLasso`` together
        with tightened ``kkt_tol`` / ``parameter_tol`` (1e-8)
    :returns: ``(pval, vars, rand, active, soln, ridge, cond_cov, cond_mean)``
        where ``vars`` and ``active`` are shifted to zero-based indexing
    """
    numpy2ri.activate()
    try:
        rpy.r.assign('X', X)
        rpy.r.assign('Y', Y)
        rpy.r.assign('lam', W)
        if noise_scale is not None:
            rpy.r.assign('noise_scale', noise_scale)
            rpy.r(
                'soln = selectiveInference:::randomizedLasso(X, Y, lam, noise_scale=noise_scale, kkt_tol=1.e-8, parameter_tol=1.e-8)'
            )
        else:
            rpy.r('soln = selectiveInference:::randomizedLasso(X, Y, lam)')
        rpy.r('targets=selectiveInference:::compute_target(soln, type="full")')
        rpy.r(
            'rand_inf = selectiveInference:::randomizedLassoInf(soln, sampler="adaptMCMC", targets=targets, nsample=5000, burnin=2000)'
        )

        pval = np.asarray(rpy.r('rand_inf$pvalues'))
        # R indices are 1-based; shift to Python's 0-based convention.
        active_vars = np.asarray(rpy.r('soln$active_set')) - 1
        cond_cov = np.asarray(rpy.r('soln$law$cond_cov'))
        cond_mean = np.asarray(rpy.r('soln$law$cond_mean'))
        rand = np.asarray(rpy.r('soln$perturb'))
        active = np.asarray(rpy.r('soln$active')) - 1
        soln = np.asarray(rpy.r('soln$soln'))
        ridge = rpy.r('soln$ridge_term')
    finally:
        # Restore the default conversion even when an R call fails.
        numpy2ri.deactivate()

    return pval, active_vars, rand, active, soln, ridge, cond_cov, cond_mean
def setup(cls, feature_cov, data_generating_mechanism):
    """Attach the problem description and a knockoff factorization to ``cls``.

    Sets ``cls.feature_cov``, ``cls.data_generating_mechanism`` and
    ``cls.noise``, then looks in ``.knockoff_factorizations/`` for an ``npz``
    file whose ``feature_cov`` and ``method`` match ``feature_cov`` and
    ``cls.factor_method``. A match provides ``cls.knockoff_chol`` (the last
    match wins); otherwise ``factor_knockoffs`` is called.
    """
    cls.feature_cov = feature_cov
    cls.data_generating_mechanism = data_generating_mechanism
    cls.noise = data_generating_mechanism.noise

    numpy2ri.activate()
    try:
        # see if we've factored this before
        have_factorization = False
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs('.knockoff_factorizations', exist_ok=True)

        for factor_file in glob.glob('.knockoff_factorizations/*npz'):
            # NpzFile keeps the archive open until closed; `with` releases it.
            with np.load(factor_file) as factor:
                feature_cov_f = factor['feature_cov']
                if ((feature_cov_f.shape == feature_cov.shape)
                        and (factor['method'] == cls.factor_method)
                        and np.allclose(feature_cov_f, feature_cov)):
                    have_factorization = True
                    print('found factorization: %s' % factor_file)
                    cls.knockoff_chol = factor['knockoff_chol']

        if not have_factorization:
            print('doing factorization')
            cls.knockoff_chol = factor_knockoffs(feature_cov,
                                                 cls.factor_method)
    finally:
        # Restore the default conversion even if loading or factoring fails.
        numpy2ri.deactivate()
def setup(cls, feature_cov):
    """Attach ``feature_cov`` and a knockoff factorization to ``cls``.

    Looks in ``.knockoff_factorizations/`` for an ``npz`` file whose
    ``feature_cov`` and ``method`` match ``feature_cov`` and
    ``cls.factor_method``. A match provides ``cls.knockoff_chol`` (the last
    match wins); otherwise ``factor_knockoffs`` is called.
    """
    cls.feature_cov = feature_cov

    numpy2ri.activate()
    try:
        # see if we've factored this before
        have_factorization = False
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs('.knockoff_factorizations', exist_ok=True)

        for factor_file in glob.glob('.knockoff_factorizations/*npz'):
            # NpzFile keeps the archive open until closed; `with` releases it.
            with np.load(factor_file) as factor:
                feature_cov_f = factor['feature_cov']
                if ((feature_cov_f.shape == feature_cov.shape)
                        and (factor['method'] == cls.factor_method)
                        and np.allclose(feature_cov_f, feature_cov)):
                    have_factorization = True
                    cls.knockoff_chol = factor['knockoff_chol']

        if not have_factorization:
            cls.knockoff_chol = factor_knockoffs(feature_cov,
                                                 cls.factor_method)
    finally:
        # Restore the default conversion even if loading or factoring fails.
        numpy2ri.deactivate()
def setup(cls, feature_cov, data_generating_mechanism, max_model_size=6, level=0.90):
    """Attach the problem description and the POSI constant to ``cls``.

    Sets ``cls.feature_cov``, ``cls.data_generating_mechanism`` and
    ``cls.noise``, then looks in ``.POSI_data/`` for an ``npz`` file whose
    ``feature_cov``, ``max_model_size`` and ``level`` match the arguments.
    A match provides ``cls.POSI_K`` (the last match wins); otherwise the
    constant comes from ``POSI_instance`` with ``n = 10 * feature_cov.shape[0]``.
    """
    cls.feature_cov = feature_cov
    cls.data_generating_mechanism = data_generating_mechanism
    cls.noise = data_generating_mechanism.noise

    numpy2ri.activate()
    try:
        # see if we've computed this constant before
        have_POSI_K = False
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs('.POSI_data', exist_ok=True)

        for posi_file in glob.glob('.POSI_data/*npz'):
            # NpzFile keeps the archive open until closed; `with` releases it.
            with np.load(posi_file) as posi:
                posi_f = posi['feature_cov']
                if ((posi_f.shape == feature_cov.shape)
                        and np.allclose(posi_f, feature_cov)
                        and (posi['max_model_size'] == max_model_size)
                        and (posi['level'] == level)):
                    have_POSI_K = True
                    # Report the file name, not the NpzFile object.
                    print('found POSI instance: %s' % posi_file)
                    cls.POSI_K = float(posi['K'])

        if not have_POSI_K:
            print('simulating POSI constant')
            cls.POSI_K = float(
                POSI_instance(feature_cov, max_model_size,
                              n=10 * feature_cov.shape[0]))
    finally:
        # Restore the default conversion even if loading or simulating fails.
        numpy2ri.deactivate()
def activate():
    """Install a converter combining the numpy rules with the rules in scope.

    The converter in place before the call is snapshotted into the global
    ``original_converter``; a non-None snapshot means the module is already
    active and the call is a no-op. The new converter is installed through
    ``conversion.set_conversion``.
    """
    global original_converter

    if original_converter is not None:
        # Already activated: nothing to do.
        return

    snapshot_name = 'snapshot before pandas conversion'
    original_converter = conversion.Converter(snapshot_name,
                                              template=conversion.converter)

    # Briefly switch numpy conversion on so the new converter is built from a
    # template that already contains the numpy rules.
    numpy2ri.activate()
    new_converter = conversion.Converter(snapshot_name,
                                         template=conversion.converter)
    numpy2ri.deactivate()

    rule_sets = (
        (py2ri, new_converter.py2ri),
        (ri2ro, new_converter.ri2ro),
        (py2ro, new_converter.py2ro),
        (ri2py, new_converter.ri2py),
    )
    for source, target in rule_sets:
        for cls, func in source.registry.items():
            # The catch-all rule registered for `object` is not copied.
            if cls is not object:
                target.register(cls, func)

    conversion.set_conversion(new_converter)
def activate():
    """Deprecated global activation of this module's conversion rules.

    Always emits a ``DeprecationWarning``. If the global
    ``original_converter`` is unset, snapshots the current converter into it
    and installs a new converter made of the numpy rules plus the ``py2rpy``
    and ``rpy2py`` rules in scope.
    """
    global original_converter

    warnings.warn('The global conversion available with activate() '
                  'is deprecated and will be removed in the next '
                  'major release. Use a local converter.',
                  category=DeprecationWarning)

    if original_converter is not None:
        # Already activated: nothing to do.
        return

    snapshot_name = 'snapshot before pandas conversion'
    original_converter = conversion.Converter(snapshot_name,
                                              template=conversion.converter)

    # Briefly switch numpy conversion on so the new converter is built from a
    # template that already contains the numpy rules.
    numpy2ri.activate()
    new_converter = conversion.Converter(snapshot_name,
                                         template=conversion.converter)
    numpy2ri.deactivate()

    rule_sets = (
        (py2rpy, new_converter.py2rpy),
        (rpy2py, new_converter.rpy2py),
    )
    for source, target in rule_sets:
        for cls, func in source.registry.items():
            # The catch-all rule registered for `object` is not copied.
            if cls is not object:
                target.register(cls, func)

    conversion.set_conversion(new_converter)
def fit(self, X, y, **kwargs):
    """Fit the final R ``flam`` model and store its plot dataframe.

    The lambda sequence and the index of the chosen lambda come from
    ``self._select_lam_by_val``; the model is then refit on all of ``X``,
    ``y`` and the result for that index is stored in
    ``self.GAM_plot_dataframe``. ``**kwargs`` is accepted but unused.
    """
    import rpy2.robjects as ro
    import rpy2.robjects.numpy2ri as n2r

    # Make everything as numpy
    if isinstance(X, pd.DataFrame):
        X = X.values
        y = y.values

    min_idx, all_lambdas = self._select_lam_by_val(X, y)

    # Create final model by rerunning the whole dataset
    with Timer('Fitting the final model'):
        n2r.activate()
        try:
            r_result = ro.r['flam'](X, y, family=self.family, alpha=1.,
                                    **{'lambda.seq': all_lambdas})
        finally:
            # Do not leave the global numpy conversion on if flam fails.
            n2r.deactivate()

    scores = np.asanyarray(r_result.rx2('theta.hat.list')[min_idx])
    intercept = r_result.rx2('beta0.hat.vec')[min_idx]

    self.GAM_plot_dataframe = self._create_df_from_r_result(
        X, scores, intercept)

    ro.r('rm(list = ls())')  # Remove vars
def naive_intervals(self, active_set):
    """
    selected model

    Compute unadjusted confidence intervals with R's ``confint(lm(Y ~ X - 1))``
    at level ``self.confidence``.

    With ``self.model_target == 'selected'`` the regression uses only the
    ``active_set`` columns of ``self.X``. Otherwise the full design is used
    and the rows for ``active_set`` are picked afterwards; that requires
    ``n > p``, and NaN bounds are returned when it does not hold.

    :returns: ``(active_set, lower, upper)``
    """
    full_model = self.model_target != 'selected'

    if full_model:
        n, p = self.X.shape
        if not n > p:
            # Decide this before touching R, so the global numpy conversion
            # is never left switched on by the early exit.
            return (active_set,
                    np.ones(len(active_set)) * np.nan,
                    np.ones(len(active_set)) * np.nan)

    numpy2ri.activate()
    try:
        if full_model:
            rpy.r.assign("X", self.X)
        else:
            rpy.r.assign("X", self.X[:, active_set])
        rpy.r.assign("Y", self.Y)
        rpy.r.assign("level", self.confidence)
        rpy.r('CI = confint(lm(Y ~ X - 1), level=level)')
        CI = np.asarray(rpy.r('CI'))
    finally:
        numpy2ri.deactivate()

    if full_model:
        CI = CI[active_set]
    return active_set, CI[:, 0], CI[:, 1]
def generate_pvalues(self):
    """Compute lasso p-values with R ``glmnet`` and ``fixedLassoInf``.

    Reads ``self.X``, ``self.Y``, ``self.sigma_reid`` and
    ``self.lagrange[0]``.

    :returns: ``(active_set, pvalues)`` with zero-based ``active_set``, or
        ``([], [])`` when nothing is selected
    """
    numpy2ri.activate()
    try:
        rpy.r.assign('x', self.X)
        rpy.r.assign('y', self.Y)
        rpy.r('y = as.numeric(y)')
        rpy.r.assign('sigma_reid', self.sigma_reid)
        rpy.r.assign('lam', self.lagrange[0])
        rpy.r('''
        sigma_est=sigma_reid
        n = nrow(x);
        gfit = glmnet(x, y, standardize=FALSE, intercept=FALSE)
        lam = lam / sqrt(n); # lambdas are passed a sqrt(n) free from python code
        if (lam < max(abs(t(x) %*% y) / n)) {
            beta = coef(gfit, x=x, y=y, s=lam, exact=TRUE)[-1]
            out = fixedLassoInf(x, y, beta, lam*n, sigma=sigma_est, type='full', intercept=FALSE)
            active_vars=out$vars - 1 # for 0-based
            pvalues = out$pv
        } else {
            pvalues = NULL
            active_vars = numeric(0)
        }
        ''')

        pvalues = np.asarray(rpy.r('pvalues'))
        active_set = np.asarray(rpy.r('active_vars'))
    finally:
        # Restore the default conversion even when an R call fails.
        numpy2ri.deactivate()

    if len(active_set) > 0:
        return active_set, pvalues
    return [], []
def test_liu_gaussian():
    """Compare ``lasso_full.gaussian`` p-values with R ROSI (no debiasing).

    Instances are redrawn until the lasso selects more than 4 variables; the
    two sets of p-values must then correlate above 0.999.
    """
    n, p, s = 200, 100, 20

    while True:
        X, y, _, _, sigma, _ = gaussian_instance(n=n, p=p, s=s,
                                                 equicorrelated=False,
                                                 signal=10, sigma=1.)
        lam = 4. * np.sqrt(n)
        X *= np.sqrt(n)
        L = lasso_full.gaussian(X, y, lam)
        L.fit()

        if len(L.active) <= 4:
            continue

        S = L.summary(compute_intervals=False, dispersion=sigma**2)

        numpy2ri.activate()
        try:
            rpy.r.assign('sigma_est', sigma)
            rpy.r.assign("X", X)
            rpy.r.assign("y", y)
            rpy.r.assign("lam", lam)
            rpy.r("""
            y = as.numeric(y)
            n = nrow(X)
            p = ncol(X)
            #sigma_est = sigma(lm(y ~ X - 1))
            penalty_factor = rep(1, p);
            soln = selectiveInference:::solve_problem_glmnet(X, y, lam/n, penalty_factor=penalty_factor, family="gaussian")
            PVS = ROSI(X, y, soln, lambda=lam, penalty_factor=penalty_factor, dispersion=sigma_est^2, family="gaussian", solver="QP", construct_ci=FALSE, use_debiased=FALSE)
            active_vars=PVS$active_vars - 1 # for 0-based
            pvalues = PVS$pvalues
            """)
            pvalues = rpy.r('pvalues')
            pvalues = pvalues[~np.isnan(pvalues)]

            print(pvalues)
            print(S['pval'])

            nt.assert_true(np.corrcoef(pvalues, S['pval'])[0, 1] > 0.999)
        finally:
            # A failing assertion must not leave numpy conversion switched on
            # for the tests that run afterwards.
            numpy2ri.deactivate()
        break
def test_ROSI_logistic_BN():
    """Compare Python logistic ROSI p-values (BN inverse) with the R ones.

    Instances are redrawn until the lasso selects more than 4 variables; the
    two sets of p-values must then correlate above 0.999.
    """
    n, p, s = 100, 120, 15

    while True:
        X, y = logistic_instance(n=n, p=p, s=s, equicorrelated=False,
                                 signal=10)[:2]
        lam = 1. * np.sqrt(n)
        X *= np.sqrt(n)
        L = ROSI.logistic(X, y, lam, approximate_inverse='BN')
        L.fit()

        if len(L.active) <= 4:
            continue

        S = L.summary(compute_intervals=False, dispersion=1.)

        numpy2ri.activate()
        try:
            rpy.r.assign("X", X)
            rpy.r.assign("y", y)
            rpy.r.assign("lam", lam)
            rpy.r("""
            y = as.numeric(y)
            n = nrow(X)
            p = ncol(X)
            penalty_factor = rep(1, p);
            soln = selectiveInference:::solve_problem_glmnet(X, y, lam/n, penalty_factor=penalty_factor, family="binomial")
            PVS = ROSI(X, y, soln, lambda=lam, penalty_factor=penalty_factor, dispersion=1., family="binomial", debiasing_method="BN", solver="QP", construct_ci=FALSE, use_debiased=TRUE)
            active_vars=PVS$active_vars - 1 # for 0-based
            pvalues = PVS$pvalues
            """)
            pvalues = rpy.r('pvalues')
            pvalues = pvalues[~np.isnan(pvalues)]

            print(pvalues)
            print(np.asarray(S['pval']))

            nt.assert_true(np.corrcoef(pvalues, S['pval'])[0, 1] > 0.999)
        finally:
            # A failing assertion must not leave numpy conversion switched on
            # for the tests that run afterwards.
            numpy2ri.deactivate()
        break
def testActivateTwice(self):
    """Activating or deactivating a second time must not change the state."""
    # setUp method has already activated numpy converter, hence the first
    # step checks the state without performing any action.
    steps = (
        (None, self.assertEqual),
        (rpyn.activate, self.assertEqual),
        (rpyn.deactivate, self.assertNotEqual),
        (rpyn.deactivate, self.assertNotEqual),
    )
    for action, check in steps:
        if action is not None:
            action()
        check(rpyn.numpy2ri, robjects.conversion.py2ri)
def BHfilter(pval, q=0.2):
    """Select p-values with R's Benjamini-Hochberg adjustment.

    :param pval: p-values, passed to R ``p.adjust(pval, method="BH")``
    :param q: threshold; entries whose adjusted p-value is below ``q`` are kept
    :returns: integer numpy array of zero-based selected positions
    """
    numpy2ri.activate()
    try:
        rpy.r.assign('pval', pval)
        rpy.r.assign('q', q)
        rpy.r('Pval = p.adjust(pval, method="BH")')
        # `which` is 1-based; subtract 1 for Python indexing.
        rpy.r('S = which((Pval < q)) - 1')
        S = rpy.r('S')
    finally:
        # Restore the default conversion even when an R call fails.
        numpy2ri.deactivate()
    # The builtin `int` replaces the `np.int` alias removed from NumPy.
    return np.asarray(S, int)
def nlshrink_covariance(X, centered=False):
    """Estimate a covariance with R ``nlshrink_cov`` (output silenced).

    :param X: data matrix handed to R
    :param centered: when False, ``X.mean()`` is subtracted first.
        NOTE(review): for a plain ndarray ``X.mean()`` is the grand mean, not
        the per-column mean -- confirm this is the intended centering
    :returns: the estimated covariance as a numpy array
    """
    LOGGER.info("computing Ledoit-Wolf non-linear shrinkage covariance")
    if not centered:
        X = X - X.mean()
    f = nostdout(r_nlshrink.nlshrink_cov)
    numpy2ri.activate()
    try:
        return np.asarray(f(X))
    finally:
        # Restore the default conversion even when the R call fails.
        numpy2ri.deactivate()
def testActivate(self):
    """activate() must add py2ri rules and deactivate() must remove them."""
    def registry():
        # Looked up afresh on every use rather than cached across the
        # activate()/deactivate() calls below.
        return conversion.py2ri.registry

    rpyn.deactivate()
    #FIXME: is the following still making sense ?
    self.assertNotEqual(rpyn.py2ri, conversion.py2ri)

    size_before = len(registry())
    keys_before = set(registry().keys())

    rpyn.activate()
    self.assertTrue(len(registry()) > size_before)

    rpyn.deactivate()
    self.assertEqual(size_before, len(registry()))
    self.assertEqual(keys_before, set(registry().keys()))
def spMatrixToR(x):
    """Convert a sparse matrix to an R ``Matrix::sparseMatrix``.

    :param x: object providing ``tocoo()`` whose result exposes ``row``,
        ``col``, ``data`` and ``shape`` (e.g. a scipy sparse matrix)
    :returns: the R sparse matrix, built with zero-based indices
        (``index1=False``)
    """
    matrix_pkg = rpackages.importr('Matrix')
    coo_matrix = x.tocoo()

    numpy2ri.activate()
    try:
        return matrix_pkg.sparseMatrix(i=IntVector(coo_matrix.row),
                                       j=IntVector(coo_matrix.col),
                                       x=FloatVector(coo_matrix.data),
                                       dims=IntVector(coo_matrix.shape),
                                       index1=False)
    finally:
        # Restore the default conversion even when the R call fails.
        numpy2ri.deactivate()
def computeMultivariatePoissonProbability(pdn, dataset):
    """Return the log-probabilities of ``dataset`` under ``pdn``.

    ``dataset`` is turned into an R data frame, expected values are computed
    with ``pdnmodule.computeExpectedValues`` and passed to
    ``pdnmodule.computeXlogProb``. Both intermediate results are printed.
    """
    numpy2ri.activate()
    try:
        df = robjects.r["as.data.frame"](dataset)
        ev = pdnmodule.computeExpectedValues(pdn, df)
        print(ev)
        logprobs = pdnmodule.computeXlogProb(df, ev)
        print(logprobs)
    finally:
        # Restore the default conversion even when an R call fails.
        numpy2ri.deactivate()
    return logprobs
def test_activate(self):
    """activate() must add py2rpy rules and deactivate() must remove them."""
    def registry():
        # Looked up afresh on every use rather than cached across the
        # activate()/deactivate() calls below.
        return conversion.py2rpy.registry

    rpyn.deactivate()
    #FIXME: is the following still making sense ?
    assert rpyn.py2rpy != conversion.py2rpy

    size_before = len(registry())
    keys_before = set(registry().keys())

    rpyn.activate()
    assert len(registry()) > size_before

    rpyn.deactivate()
    assert len(registry()) == size_before
    assert set(registry().keys()) == keys_before
def ipcw_weights(self, event, time):
    """Compute weights with R ``mboost::IPCweights`` on ``Surv(time, event)``.

    :param event: event indicator, second argument of ``survival::Surv``
    :param time: observed times, first argument of ``survival::Surv``
    :returns: the weights as a numpy array
    """
    from rpy2 import robjects
    from rpy2.robjects import numpy2ri

    _mboost = robjects.packages.importr("mboost")
    _survival = robjects.packages.importr("survival")

    numpy2ri.activate()
    try:
        iw = _mboost.IPCweights(_survival.Surv(time, event))
    finally:
        # Restore the default conversion even when the R call fails.
        numpy2ri.deactivate()
    return numpy.asarray(iw)
def generate_pvalues(self, compute_intervals=False):
    """Compute randomized-lasso p-values with R ``selectiveInference``.

    Reads ``self.X``, ``self.Y``, ``self.q``, ``self.lagrange[0]`` and
    ``self.randomizer_scale``; sets ``self._fit`` to True.

    :param compute_intervals: forwarded to R as the ``construct_ci`` flag
    :returns: ``(active_set, pvalues)`` with zero-based ``active_set``.
        NOTE(review): when R reports an empty active set a 3-tuple
        ``([], [], [])`` is returned, and the fetched intervals are never
        returned -- arity is kept as found; confirm what callers expect.
    """
    self._fit = True
    numpy2ri.activate()
    try:
        rpy.r.assign('X', self.X)
        rpy.r.assign('y', self.Y)
        rpy.r('y = as.numeric(y)')
        rpy.r.assign('q', self.q)
        rpy.r.assign('lam', self.lagrange[0])
        rpy.r.assign("randomizer_scale", self.randomizer_scale)
        rpy.r.assign("compute_intervals", compute_intervals)
        rpy.r('''
        n = nrow(X)
        p = ncol(X)
        lam = lam * sqrt(n)
        mean_diag = mean(apply(X^2, 2, sum))
        ridge_term = sqrt(mean_diag) * sd(y) / sqrt(n)
        result = randomizedLasso(X, y, lam, ridge_term=ridge_term, noise_scale = randomizer_scale * sd(y) * sqrt(n), family='gaussian')
        active_set = result$active_set
        if (length(active_set)==0){
            active_set = -1
        } else{
            sigma_est = sigma(lm(y ~ X[,active_set] - 1))
            cat("sigma est for R", sigma_est,"\n")
            targets = selectiveInference:::compute_target(result, 'partial', sigma_est = sigma_est, construct_pvalues=rep(TRUE, length(active_set)), construct_ci=rep(compute_intervals, length(active_set)))
            out = randomizedLassoInf(result, targets=targets, sampler = "norejection", level=0.9, burnin=1000, nsample=10000)
            active_set=active_set-1
            pvalues = out$pvalues
            intervals = out$ci
        }
        ''')

        # The builtin `int` replaces the `np.int` alias removed from NumPy.
        active_set = np.asarray(rpy.r('active_set'), int)
        print(active_set)

        # The R code encodes "nothing selected" as the single value -1.
        if active_set[0] == -1:
            return [], [], []

        pvalues = np.asarray(rpy.r('pvalues'))
        intervals = np.asarray(rpy.r('intervals'))
    finally:
        # Runs on every exit path, including the early return above.
        numpy2ri.deactivate()

    if len(active_set) > 0:
        return active_set, pvalues
    return [], []
def deactivate():
    """Restore the ``py2ri`` / ``ri2ro`` rules saved at activation time.

    Does nothing when no saved ``original_py2ri`` is present (never activated
    or already deactivated). Otherwise the saved rules are put back on
    ``conversion``, the saved references are cleared, and the numpy
    conversion is deactivated as well.
    """
    global original_py2ri, original_ri2ro

    if original_py2ri:
        conversion.py2ri, conversion.ri2ro = original_py2ri, original_ri2ro
        original_py2ri = original_ri2ro = None
        numpy2ri.deactivate()
def select(self):
    """Run R ``knockoff.filter`` on ``self.X`` / ``self.Y`` at FDR ``self.q``.

    :returns: the zero-based selected indices as two separate integer arrays
        (the same selection twice), or ``([], [])`` if anything fails
    """
    try:
        numpy2ri.activate()
        try:
            rpy.r.assign('X', self.X)
            rpy.r.assign('Y', self.Y)
            rpy.r.assign('q', self.q)
            rpy.r('V=knockoff.filter(X, Y, fdr=q)$selected')
            rpy.r('if (length(V) > 0) {V = V-1}')
            V = rpy.r('V')
        finally:
            # Previously skipped whenever an R call raised, which left the
            # global numpy conversion switched on.
            numpy2ri.deactivate()
        # `np.int` was removed from NumPy; with the old spelling the resulting
        # AttributeError was swallowed below and the method silently reported
        # an empty selection.
        return np.asarray(V, int), np.asarray(V, int)
    except Exception:
        # Best effort: a failing filter is reported as "nothing selected".
        return [], []
def get_R_theta(pi, c, Gamma, A, b, Sigma):
    """Return a R compatible list from numpy arrays

    ``c`` and ``b`` are transposed; ``Gamma``, ``A`` and ``Sigma`` have their
    first axis moved last (``transpose((1, 2, 0))``); ``pi`` is passed as is.
    The result is an rpy2 ``ListVector`` keyed by the parameter names.
    """
    numpy2ri.activate()
    try:
        return ListVector(dict(
            pi=pi,
            c=c.T,
            Gamma=Gamma.transpose((1, 2, 0)),
            A=A.transpose((1, 2, 0)),
            b=b.T,
            Sigma=Sigma.transpose((1, 2, 0)),
        ))
    finally:
        # Restore the default conversion even when the conversion fails.
        numpy2ri.deactivate()
def _record_selection(full_results, label, selected, active):
    """Append size, false-selection count and screening flag of one selector."""
    n_selected = selected.shape[0]
    n_false = n_selected - len(set(active).intersection(selected))
    screened = set(selected).issuperset(active)
    full_results.setdefault('%s_R' % label, []).append(n_selected)
    full_results.setdefault('%s_V' % label, []).append(n_false)
    full_results.setdefault('%s_screen' % label, []).append(screened)


def compute_results(y, X, sigma, active,
                    full_results=None,
                    do_knockoff=False,
                    do_AIC=True,
                    do_BIC=True,
                    do_glmnet=True,
                    alpha=0.05,
                    maxstep=np.inf,
                    compute_maxT_identify=True,
                    burnin=2000,
                    ndraw=8000):
    """Run one simulation instance and append its summaries to ``full_results``.

    P-values come from ``compute_pvalues``. Optional R-based selectors
    (knockoff, forward-stepwise AIC/BIC, cross-validated glmnet) are compared
    against the true ``active`` set, and every p-value / stopping-rule
    combination is summarised with ``summary``.

    :param y: response vector
    :param X: design matrix; its second dimension gives the number of features
    :param sigma: forwarded to ``compute_pvalues``
    :param active: true active variables
    :param full_results: dict of lists to append to. A fresh dict is created
        when omitted; the previous ``{}`` default was one shared object, so
        results of unrelated calls accumulated in it
    :param do_knockoff, do_AIC, do_BIC, do_glmnet: enable the R-based selectors
    :param alpha: level used by the knockoff filter and by ``summary``
    :param maxstep, compute_maxT_identify, burnin, ndraw: forwarded to
        ``compute_pvalues``
    :returns: ``(full_results, FS)`` where ``FS`` is the second value returned
        by ``compute_pvalues``
    """
    if full_results is None:
        full_results = {}

    _, p = X.shape

    results, FS = compute_pvalues(y, X, active, sigma, maxstep=maxstep,
                                  compute_maxT_identify=compute_maxT_identify,
                                  burnin=burnin,
                                  ndraw=ndraw)
    completion_idx = completion_index(results['variable_selected'], active)
    full_results.setdefault('completion_idx', []).append(completion_idx)

    for column in results.columns:
        for i in range(results.shape[0]):
            full_results.setdefault('%s_%d' % (str(column), i+1), []).append(results[column][i])

    for i in range(len(active)):
        full_results.setdefault('active_%d' % (i+1,), []).append(active[i])

    full_results.setdefault('alpha', []).append(alpha)

    if do_knockoff:

        # this will probably not work on miller
        import rpy2.robjects as rpy
        from rpy2.robjects import numpy2ri
        rpy.conversion.py2ri = numpy2ri.numpy2ri
        numpy2ri.activate()
        try:
            rpy.r.assign('X', X)
            rpy.r.assign('y', y)
            rpy.r.assign('alpha', alpha)

            # knockoff; R indices are 1-based, hence the "- 1"
            knockoff = np.array(rpy.r("""
            library(knockoff)
            knockoff.filter(X = X, y = y, fdr=alpha, knockoffs=create.fixed, offset=0)$selected
            """)) - 1
            knockoff_plus = np.array(rpy.r("""
            knockoff.filter(X = X, y = y, fdr=alpha, knockoffs=create.fixed, offset=1)$selected
            """)) - 1
        finally:
            # Restore the default conversion even when an R call fails.
            numpy2ri.deactivate()

        _record_selection(full_results, 'knockoff', knockoff, active)
        _record_selection(full_results, 'knockoff_plus', knockoff_plus, active)

    if do_AIC:

        # this will probably not work on miller
        import rpy2.robjects as rpy
        from rpy2.robjects import numpy2ri
        rpy.conversion.py2ri = numpy2ri.numpy2ri
        numpy2ri.activate()
        try:
            rpy.r.assign('X', X)
            rpy.r.assign('y', y)

            rpy.r('''M = step(lm(y ~ 1, data=data.frame(X, y)), scope=list(upper="~ %s"), direction="forward", trace=FALSE)''' % ' + '.join(['X%d' % i for i in range(1, p+1)]))
            # Variable names look like "X12"; strip the prefix and
            # subtract 1 for 0-based indexing
            AIC = np.asarray([int(v[1:]) for v in rpy.r("all.vars(M$call$formula[[3]])")]) - 1
        finally:
            numpy2ri.deactivate()

        _record_selection(full_results, 'AIC', AIC, active)

    if do_BIC:

        import rpy2.robjects as rpy
        from rpy2.robjects import numpy2ri
        rpy.conversion.py2ri = numpy2ri.numpy2ri
        numpy2ri.activate()
        try:
            rpy.r.assign('X', X)
            rpy.r.assign('y', y)

            rpy.r('''M = step(lm(y ~ 1, data=data.frame(X, y)), scope=list(upper="~ %s"), direction="forward", k=log(nrow(X)), trace=FALSE)''' % ' + '.join(['X%d' % i for i in range(1, p+1)]))
            # subtract 1 for 0-based indexing
            BIC = np.asarray([int(v[1:]) for v in rpy.r("all.vars(M$call$formula[[3]])")]) - 1
        finally:
            numpy2ri.deactivate()

        _record_selection(full_results, 'BIC', BIC, active)

    if do_glmnet:

        import rpy2.robjects as rpy
        from rpy2.robjects import numpy2ri
        rpy.conversion.py2ri = numpy2ri.numpy2ri
        numpy2ri.activate()
        try:
            rpy.r.assign('X', X)
            rpy.r.assign('y', y)

            rpy.r('''library(glmnet);
            y = as.matrix(y);
            X = as.matrix(X);
            CVG = cv.glmnet(X, y);
            G = glmnet(X, y);
            B = coef(G, s=CVG$lambda.min, exact=TRUE);
            selected = which(B[2:length(B)] != 0);
            B2 = coef(G, s=CVG$lambda.1se, exact=TRUE);
            selected2 = which(B2[2:length(B2)] != 0);
            ''')
            # subtract 1 for 0-based indexing
            GLMnet = np.asarray(rpy.r("selected")) - 1
            GLMnet1se = np.asarray(rpy.r("selected2")) - 1
        finally:
            numpy2ri.deactivate()

        _record_selection(full_results, 'GLMnet', GLMnet, active)
        _record_selection(full_results, 'GLMnet1se', GLMnet1se, active)

    for pval, rule_ in product(['maxT_identify_pvalue',
                                'maxT_identify_unknown_pvalue',
                                'maxT_unknown_pvalue',
                                'saturated_pvalue',
                                'nominal_pvalue',
                                'nominalT_pvalue',
                                'maxT_pvalue'],
                               zip([simple_stop,
                                    strong_stop,
                                    forward_stop],
                                   ['simple',
                                    'strong',
                                    'forward'])):
        rule, rule_name = rule_
        (R, V_var, V_model, screen, FWER_model, FDP_model, FDP_var, S_var) = \
            summary(np.asarray(results['variable_selected']),
                    results[pval],
                    active,
                    rule, alpha)

        # Drop the trailing "_pvalue" to get the column-name prefix.
        pval_name = '_'.join(pval.split('_')[:-1])
        for (name, value) in zip(['R', 'V_var', 'V_model', 'FDP_model', 'FDP_var', 'S_var', 'FWER_model', 'screen'],
                                 [R, V_var, V_model, FDP_model, FDP_var, S_var, FWER_model, screen]):
            full_results.setdefault('%s_%s_%s' % (pval_name, rule_name, name), []).append(value)

    return full_results, FS
print 'maxT unknown:', forward_stop_U print 'nominal:', forward_stop_N print 'saturated:', forward_stop_S pvals = pvals[:20] # R pvalues Rpval = [] model_str = '' for i in range(pvals.shape[0]): model_str = '+'.join([' X[,%d] ' % v for v in pvals['Column number'][:(i+1)]]) Rstr = 'summary(lm(Y ~ %s))$coef[,4]' % model_str Rpval.append(np.array(rpy.r(Rstr))[-1]) print 'checking whether nominal agrees with R:', np.linalg.norm(np.array(Rpval) - pvals['Nominal pvalue']) / np.linalg.norm(pvals['Nominal pvalue']) # save the HTML table file('../../tables/diabetes.html', 'w').write(pvals.to_html(float_format = lambda v : '%0.2f' % v, index=False)) pvals = pvals.reindex_axis(['Step', 'Variable', 'Nominal pvalue', 'Saturated pvalue', 'MaxT pvalue'], axis=1) print pvals # save the LaTeX table file('../../tables/diabetes.tex', 'w').write(pvals.to_latex(float_format = lambda v : '%0.2f' % v, index=False)) numpy2ri.deactivate()
def tearDown(self):
    """Test teardown hook: switch the ``rpyn`` conversion back off."""
    rpyn.deactivate()