def test_antisymmetric_fns(self):

    # Create data generating process
    n = 100
    p = 20
    np.random.seed(110)
    dgprocess = dgp.DGP()
    X, y, beta, _, corr_matrix = dgprocess.sample_data(
        n=n, p=p, y_dist="gaussian", coeff_size=100, sign_prob=1
    )
    groups = np.arange(1, p + 1, 1)

    # These are not real knockoffs, just syntactically convenient
    fake_knockoffs = np.zeros((n, p))

    # Check the W statistics for antisym="cd" (coefficient difference)
    np.random.seed(110)
    lasso_stat = kstats.LassoStatistic()
    lasso_stat.fit(X=X, Xk=fake_knockoffs, y=y, y_dist=None, antisym="cd")
    W_cd = lasso_stat.W
    Z_cd = lasso_stat.Z
    W_cd[np.abs(W_cd) < 10] = 0
    Z_cd[np.abs(Z_cd) < 10] = 0
    np.testing.assert_array_almost_equal(
        W_cd, -1 * Z_cd[0:p], err_msg="antisym CD returns weird W stats"
    )

    # Check the W statistics for antisym="sm" (signed max)
    np.random.seed(110)
    lasso_stat = kstats.LassoStatistic()
    lasso_stat.fit(X=X, Xk=fake_knockoffs, y=y, y_dist=None, antisym="sm")
    Z_sm = lasso_stat.Z
    W_sm = lasso_stat.W
    np.testing.assert_array_almost_equal(
        W_sm,
        np.abs(Z_sm[0:p]),
        decimal=3,
        err_msg="antisym SM returns weird W stats",
    )

    # Check the W statistics for antisym="scd" (simple coefficient difference)
    np.random.seed(110)
    lasso_stat = kstats.LassoStatistic()
    lasso_stat.fit(X=X, Xk=fake_knockoffs, y=y, y_dist=None, antisym="scd")
    W_scd = lasso_stat.W
    Z_scd = lasso_stat.Z
    W_scd[np.abs(W_scd) < 10] = 0
    Z_scd[np.abs(Z_scd) < 10] = 0
    np.testing.assert_array_almost_equal(
        W_scd, Z_scd[0:p], err_msg="antisym SCD returns weird W stats"
    )
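# For reference, a minimal sketch of the three antisymmetric functions the
# test above exercises, assuming Z stacks the 2p feature/knockoff importances
# as [Z_1, ..., Z_p, Ztilde_1, ..., Ztilde_p]. This helper is illustrative
# only and not part of knockpy's API.
def antisym_sketch(Z, p, antisym="cd"):
    feats, knocks = Z[0:p], Z[p:2 * p]
    if antisym == "cd":
        # Absolute coefficient difference
        return np.abs(feats) - np.abs(knocks)
    if antisym == "sm":
        # Signed maximum: magnitude of the larger coefficient,
        # positive iff the original feature wins
        return np.maximum(np.abs(feats), np.abs(knocks)) * np.sign(
            np.abs(feats) - np.abs(knocks)
        )
    if antisym == "scd":
        # Simple (signed) coefficient difference
        return feats - knocks
    raise ValueError(f"Unrecognized antisym {antisym}")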
def test_lasso_fit(self):

    # Lasso fit for Gaussian data
    self.check_kstat_fit(
        fstat=kstats.LassoStatistic(),
        fstat_name="Sklearn lasso",
        n=200,
        p=100,
        rho=0.7,
        coeff_size=5,
        sparsity=0.5,
        seed=110,
        min_power=0.9,
        group_features=False,
        max_l2norm=np.inf,
    )

    # Repeat for grouped features
    self.check_kstat_fit(
        fstat=kstats.LassoStatistic(),
        fstat_name="Sklearn lasso",
        n=200,
        p=100,
        rho=0.7,
        coeff_size=5,
        sparsity=0.5,
        seed=110,
        min_power=0.4,
        group_features=True,
        max_l2norm=np.inf,
    )

    # Repeat for a logistic (binomial) response
    self.check_kstat_fit(
        fstat=kstats.LassoStatistic(),
        fstat_name="Sklearn lasso",
        n=350,
        p=100,
        rho=0.7,
        coeff_size=5,
        sparsity=0.5,
        seed=110,
        min_power=0.8,
        group_features=True,
        y_dist="binomial",
        max_l2norm=np.inf,
    )
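# For context, a minimal sketch of what a lasso-based knockoff statistic
# computes, following the classic Barber-Candes recipe: fit a lasso on the
# augmented design [X, Xk] and take absolute coefficient differences.
# Illustrative only; knockpy's LassoStatistic handles cross-validation,
# groups, and the antisymmetric functions internally.
def lasso_W_sketch(X, Xk, y):
    from sklearn.linear_model import LassoCV
    features = np.concatenate([X, Xk], axis=1)
    p = X.shape[1]
    # Z_j is the fitted lasso coefficient for column j of [X, Xk]
    Z = LassoCV(cv=5).fit(features, y).coef_
    return np.abs(Z[0:p]) - np.abs(Z[p:2 * p])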
def test_pyglm_group_lasso_fit(self):

    pyglm_kwargs = {
        "use_pyglm": True,
        "max_iter": 20,
        "tol": 5e-2,
        "learning_rate": 3,
        "group_lasso": True,
    }
    self.check_kstat_fit(
        fstat=kstats.LassoStatistic(),
        fstat_name="Pyglm solver",
        fstat_kwargs=pyglm_kwargs,
        n=500,
        p=200,
        rho=0.2,
        coeff_size=5,
        sparsity=0.5,
        seed=110,
        min_power=0.5,
        group_features=True,
        max_l2norm=np.inf,
    )

    # Repeat for the logistic case
    self.check_kstat_fit(
        fstat=kstats.LassoStatistic(),
        fstat_name="Pyglm solver",
        fstat_kwargs=pyglm_kwargs,
        n=500,
        p=100,
        rho=0.2,
        coeff_size=5,
        sparsity=0.5,
        seed=110,
        min_power=0.5,
        group_features=True,
        y_dist="binomial",
        max_l2norm=np.inf,
    )
def test_vanilla_group_lasso_fit(self):

    glasso_kwargs = {
        "use_pyglm": False,
        "group_lasso": True,
    }
    self.check_kstat_fit(
        fstat=kstats.LassoStatistic(),
        fstat_name="Vanilla group lasso solver",
        fstat_kwargs=glasso_kwargs,
        n=500,
        p=200,
        rho=0.2,
        coeff_size=5,
        sparsity=0.5,
        seed=110,
        min_power=0,
        group_features=True,
        max_l2norm=np.inf,
    )

    # Repeat for the logistic case
    self.check_kstat_fit(
        fstat=kstats.LassoStatistic(),
        fstat_name="Vanilla group lasso solver",
        fstat_kwargs=glasso_kwargs,
        n=500,
        p=100,
        rho=0.2,
        coeff_size=5,
        sparsity=0.5,
        seed=110,
        min_power=0,
        group_features=True,
        y_dist="binomial",
        max_l2norm=np.inf,
    )
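# The two group-lasso tests above aggregate importances at the group level.
# A minimal sketch of one common aggregation, assuming Z holds the 2p
# individual coefficients and `groups` labels the p original features;
# illustrative only, since knockpy's group handling is more general.
def group_W_sketch(Z, groups):
    p = groups.shape[0]
    group_ids = np.unique(groups)
    W = np.zeros(group_ids.shape[0])
    for j, g in enumerate(group_ids):
        mask = groups == g
        # L2 norm of each group's coefficients, feature minus knockoff
        W[j] = np.linalg.norm(Z[0:p][mask]) - np.linalg.norm(Z[p:2 * p][mask])
    return W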
def test_lars_solver_fit(self):
    """Tests power of the LARS lasso solver."""
    self.check_kstat_fit(
        fstat=kstats.LassoStatistic(),
        fstat_name="LARS solver",
        fstat_kwargs={"use_lars": True},
        n=150,
        p=100,
        rho=0.7,
        sign_prob=0,
        coeff_size=5,
        coeff_dist="uniform",
        sparsity=0.5,
        seed=1,
    )
def test_lars_path_fit(self):
    """Tests power of the LARS path statistic."""
    self.check_kstat_fit(
        fstat=kstats.LassoStatistic(),
        fstat_name="LARS path statistic",
        fstat_kwargs={
            "zstat": "lars_path",
            "antisym": "sm",
        },
        n=300,
        p=100,
        rho=0.7,
        sign_prob=0.5,
        coeff_size=5,
        coeff_dist="uniform",
        sparsity=0.5,
        seed=110,
        min_power=0.8,
        max_l2norm=np.inf,
    )
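# A minimal sketch of the lars-path Z statistic tested above: Z_j is the
# largest regularization value at which feature j enters the lasso path,
# computed here with sklearn's lars_path. Illustrative only; knockpy's
# zstat="lars_path" option wraps its own implementation.
def lars_path_Z_sketch(X, Xk, y):
    from sklearn.linear_model import lars_path
    features = np.concatenate([X, Xk], axis=1)
    alphas, _, coefs = lars_path(features, y, method="lasso")
    # First alpha at which each coefficient is nonzero (0 if never active)
    nonzero = np.abs(coefs) > 0
    return np.array([
        alphas[nonzero[j]].max() if nonzero[j].any() else 0
        for j in range(features.shape[1])
    ])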
def test_debiased_lasso(self):

    # Create data generating process
    n = 200
    p = 20
    rho = 0.3
    np.random.seed(110)
    dgprocess = dgp.DGP()
    X, y, beta, _, corr_matrix = dgprocess.sample_data(
        n=n,
        p=p,
        y_dist="gaussian",
        coeff_size=100,
        sign_prob=0.5,
        method="blockequi",
        rho=rho,
    )
    groups = np.arange(1, p + 1, 1)

    # Create knockoffs
    S = (1 - rho) * np.eye(p)
    ksampler = knockpy.knockoffs.GaussianSampler(
        X=X, groups=groups, Sigma=corr_matrix, verbose=False, S=S
    )
    knockoffs = ksampler.sample_knockoffs()
    G = np.concatenate(
        [
            np.concatenate([corr_matrix, corr_matrix - S]),
            np.concatenate([corr_matrix - S, corr_matrix]),
        ],
        axis=1,
    )
    Ginv = utilities.chol2inv(G)

    # Debiased lasso - test accuracy
    dlasso_stat = kstats.LassoStatistic()
    dlasso_stat.fit(
        X, knockoffs, y, use_lars=False, cv_score=False, debias=True, Ginv=Ginv
    )
    W = dlasso_stat.W
    l2norm = np.power(W - beta, 2).mean()
    self.assertTrue(
        l2norm < 1,
        msg=f"Debiased lasso fits Gaussian data very poorly (l2norm = {l2norm} between real/fitted coeffs)",
    )

    # Test that this throws the correct errors:
    # first, when Ginv is missing
    def debiased_lasso_sans_Ginv():
        dlasso_stat.fit(
            X, knockoffs, y, use_lars=False, cv_score=False, debias=True, Ginv=None
        )

    self.assertRaisesRegex(
        ValueError, "Ginv must be provided", debiased_lasso_sans_Ginv
    )

    # Second, for logistic data
    y = np.random.binomial(1, 0.5, n)

    def binomial_debiased_lasso():
        dlasso_stat.fit(
            X,
            knockoffs,
            y,
            use_lars=False,
            cv_score=False,
            debias=True,
            Ginv=Ginv,
        )

    self.assertRaisesRegex(
        ValueError,
        "Debiased lasso is not implemented for binomial data",
        binomial_debiased_lasso,
    )
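# For reference, a minimal sketch of the debiasing step exercised above,
# assuming the standard one-step correction of Javanmard-Montanari type:
# b_debiased = b + Ginv @ A.T @ (y - A @ b) / n with A = [X, Xk].
# Illustrative only; knockpy's debias=True option applies its own
# internal correction.
def debias_sketch(X, Xk, y, b, Ginv):
    A = np.concatenate([X, Xk], axis=1)
    n = A.shape[0]
    # One-step bias correction of the lasso coefficients b,
    # using Ginv as the (approximate) inverse Gram matrix
    return b + Ginv @ A.T @ (y - A @ b) / n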
def test_cv_scoring(self):

    # Create data generating process
    n = 100
    p = 20
    np.random.seed(110)
    dgprocess = dgp.DGP()
    X, y, beta, _, corr_matrix = dgprocess.sample_data(
        n=n, p=p, y_dist="gaussian", coeff_size=100, sign_prob=1
    )
    groups = np.arange(1, p + 1, 1)

    # These are not real knockoffs, just syntactically convenient
    knockoffs = np.zeros((n, p))

    # 1. Test lars cv scoring
    lars_stat = kstats.LassoStatistic()
    lars_stat.fit(
        X,
        knockoffs,
        y,
        use_lars=True,
        cv_score=True,
    )
    self.assertTrue(
        lars_stat.score_type == "mse_cv",
        msg=f"cv_score=True fails to create cross-validated scoring for lars (score_type={lars_stat.score_type})",
    )

    # 2. Test OLS cv scoring
    ols_stat = kstats.OLSStatistic()
    ols_stat.fit(
        X,
        knockoffs,
        y,
        cv_score=True,
    )
    self.assertTrue(
        ols_stat.score_type == "mse_cv",
        msg=f"cv_score=True fails to create cross-validated scoring for ols (score_type={ols_stat.score_type})",
    )
    self.assertTrue(
        ols_stat.score < 2,
        msg=f"cv scoring fails for ols_stat as cv_score={ols_stat.score} >= 2",
    )

    # 3. Test that the correct error is thrown for a non-sklearn backend
    def non_sklearn_backend_cvscore():
        dgprocess = dgp.DGP()
        X, y, beta, _, corr_matrix = dgprocess.sample_data(
            n=n, p=p, y_dist="binomial", coeff_size=100, sign_prob=1
        )
        groups = np.random.randint(1, p + 1, size=(p,))
        groups = utilities.preprocess_groups(groups)
        pyglm_logit = kstats.LassoStatistic()
        pyglm_logit.fit(
            X,
            knockoffs,
            y,
            use_pyglm=True,
            group_lasso=True,
            groups=groups,
            cv_score=True,
        )

    self.assertRaisesRegex(
        ValueError, "must be sklearn estimator", non_sklearn_backend_cvscore
    )
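# The "mse_cv" score type checked above corresponds to cross-validated mean
# squared error. A minimal sketch of the equivalent sklearn computation;
# illustrative only, since knockpy computes its score internally when
# cv_score=True.
def cv_mse_sketch(estimator, features, y):
    from sklearn.model_selection import cross_val_score
    # cross_val_score returns negative MSE under this scoring string,
    # so flip the sign to recover the cross-validated MSE
    return -1 * cross_val_score(
        estimator, features, y, scoring="neg_mean_squared_error", cv=5
    ).mean()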