Example #1
    def test_partialcorr_sample(self):

        p = 50
        rho = 0.99
        dgprocess = dgp.DGP()
        _, _, _, _, V = dgprocess.sample_data(p=p, method="partialcorr", rho=rho)
        diag_diff = np.mean(np.abs(np.diag(V) - 1))
        self.assertTrue(
            diag_diff < 1e-4,
            f"Partial corr Sigma={V} for rho={rho} is not a correlation matrix",
        )
        pairwise_corr = V[0, 1]
        expected = -1 / (p - 1)
        self.assertTrue(
            np.abs(pairwise_corr - expected) < 1e-4,
            f"Partial corr pairwise_corr {pairwise_corr} deviates from expectation {expected} for rho={rho}",
        )
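These snippets all unpack the same five-tuple from sample_data. A minimal sketch of the calling convention, with the names X, y, beta, Q, Sigma inferred from the usage in these examples rather than taken from the documented API:

    # Inferred convention: sample_data returns the design X, the response y,
    # the true coefficients beta, the precision matrix Q, and the covariance Sigma.
    dgprocess = dgp.DGP()
    X, y, beta, Q, Sigma = dgprocess.sample_data(n=100, p=20, method="AR1")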
Example #2
    def test_complex_group_solns(self):
        """
        Check the solutions of the PSGD solver
        for group knockoffs.
        """

        if not TORCH_AVAILABLE:
            return None
        from knockpy import kpytorch

        # Construct graph + groups
        np.random.seed(110)
        p = 50
        groups = knockpy.utilities.preprocess_groups(
            np.random.randint(1, p + 1, p))
        for method in ["ar1", "ver"]:
            dgprocess = dgp.DGP()
            _, _, _, _, Sigma = dgprocess.sample_data(
                method=method,
                p=p,
            )

            # Use SDP as baseline
            init_S = knockpy.mac.solve_group_SDP(Sigma, groups)
            init_loss = mrc.mvr_loss(Sigma, init_S)

            # Apply gradient solver
            opt_S = kpytorch.mrcgrad.solve_mrc_psgd(
                Sigma=Sigma,
                groups=groups,
                init_S=init_S,
                tol=1e-5,
                max_epochs=100,
                line_search_iter=10,
            )
            psgd_loss = mrc.mvr_loss(Sigma, opt_S)

            # Check S matrix
            self.check_S_properties(Sigma, opt_S, groups)
            # Check new loss < init_loss
            self.assertTrue(
                psgd_loss <= init_loss,
                msg=
                f"For {method}, PSGD solver has higher loss {psgd_loss} v. sdp {init_loss}",
            )
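The TORCH_AVAILABLE flag is defined outside this snippet; a plausible definition (an assumption, not shown in the source) is a guarded import:

    try:
        import torch  # noqa: F401
        TORCH_AVAILABLE = True
    except ImportError:
        TORCH_AVAILABLE = False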
Example #3
    def test_nested_AR1(self):

        # Check that a, b parameters work
        np.random.seed(110)
        a = 100
        b = 40
        dgprocess = dgp.DGP()
        _, _, _, _, Sigma = dgprocess.sample_data(
            p=500, method="nestedar1", a=a, b=b, nest_size=2, num_nests=1
        )
        mean_rho = np.diag(Sigma, k=1).mean()
        expected = a / (2 * (a + b)) + (a / (a + b)) ** 2 / 2
        np.testing.assert_almost_equal(
            mean_rho,
            expected,
            decimal=2,
            err_msg=f"random nested AR1 gen has unexpected avg rho {mean_rho}, should be ~ {expected} ",
        )
Example #4
    def test_gmliu2019_sample(self):

        n = 300
        p = 1000
        rho = 0.8
        np.random.seed(110)
        dgprocess = dgp.DGP()
        _, _, beta, _, _ = dgprocess.sample_data(
            rho=rho,
            gamma=1,
            p=p,
            n=n,
            sparsity=0.06,
            method="blockequi",
            coeff_dist="gmliu2019",
        )
        self.assertTrue(
            (beta != 0).sum() == 60,  # sparsity * p = 0.06 * 1000 = 60
            f"Sparsity constraint for gmliu2019 violated: expected 60 non-nulls, got {(beta != 0).sum()}",
        )
Example #5
    def test_smoothing(self):
        """
        Smoothing is not required for this, but this is a nice check anyway.
        """
        p = 50
        smoothing = 0.1
        dgprocess = dgp.DGP()
        _, _, _, _, V = dgprocess.sample_data(
            method="partialcorr",
            p=p,
            rho=0.1,
        )
        S_MVR = mrc.solve_mvr(Sigma=V, smoothing=smoothing)
        # Smoothing is not implemented for SDP, so solve without it
        S_SDP = mac.solve_SDP(Sigma=V, tol=1e-5)
        mvr_mean = np.diag(S_MVR).mean()
        sdp_mean = np.diag(S_SDP).mean()
        self.assertTrue(
            sdp_mean - mvr_mean < 1e-3,
            f"Highly smoothed S_MVR ({S_MVR}) too far from S_SDP ({S_SDP}) for equi partial corr",
        )
Example #6
    def test_beta_corr_signals(self):

        # Test signals are grouped together
        p = 4
        sparsity = 0.5
        expected_nn = int(sparsity * p)
        for _ in range(10):
            dgprocess = dgp.DGP()
            _, _, beta, _, _ = dgprocess.sample_data(
                p=p, sparsity=sparsity, corr_signals=True
            )
            nn_flags = beta != 0
            self.assertTrue(
                nn_flags.sum() == expected_nn,
                f"Corr_signals breaks sparsity (beta = {beta}, should have {expected_nn} non-nulls)",
            )
            first_nonzero = np.where(nn_flags)[0].min()
            self.assertTrue(
                nn_flags[first_nonzero + 1],
                f"Corr_signals does not produce correlated signals (beta = {beta})",
            )
Example #7
    def test_large_ising_samples(self):

        # Test that sampling does not throw an error
        np.random.seed(110)
        n = 100
        p = 625
        mu = np.zeros(p)
        dgprocess = dgp.DGP()
        X, _, _, _, _ = dgprocess.sample_data(
            n=n,
            p=p,
            method="ising",
            x_dist="gibbs",
        )
        gibbs_graph = dgprocess.gibbs_graph
        np.fill_diagonal(gibbs_graph, 1)

        # We load custom cov/q matrices for this
        file_directory = os.path.dirname(os.path.abspath(__file__))
        V = np.loadtxt(f"{file_directory}/test_covs/vout{p}.txt")
        Q = np.loadtxt(f"{file_directory}/test_covs/qout{p}.txt")
        max_nonedge = np.max(np.abs(Q[gibbs_graph == 0]))
        self.assertTrue(
            max_nonedge < 1e-5,
            f"Estimated precision for ising{p} has max_nonedge {max_nonedge} >= 1e-5",
        )

        # Initialize sampler
        metro_sampler = metro.GibbsGridSampler(
            X=X,
            gibbs_graph=gibbs_graph,
            mu=mu,
            Sigma=V,
            Q=Q,
            max_width=5,
            method="equicorrelated",
        )

        # Sample and hope for no errors
        Xk = metro_sampler.sample_knockoffs()
Example #8
    def test_maxent(self):
        """ Both maxent/mmi work properly """
        # Sample data
        dgprocess = dgp.DGP()
        dgprocess.sample_data(p=50, method='ar1', a=3)

        # Check compute_smatrix with maxent/mmi
        np.random.seed(110)
        S_ME = smatrix.compute_smatrix(dgprocess.Sigma, method='maxent')
        np.random.seed(110)
        S_MMI = smatrix.compute_smatrix(dgprocess.Sigma, method='mmi')
        np.testing.assert_array_almost_equal(
            S_ME,
            S_MMI,
            decimal=3,
            err_msg="compute_smatrix yields different answers for mmi and maxent")

        # Check solve_maxent/solve_mmi
        np.random.seed(110)
        S_ME = mrc.solve_maxent(dgprocess.Sigma)
        np.random.seed(110)
        S_MMI = mrc.solve_mmi(dgprocess.Sigma)
        np.testing.assert_array_almost_equal(
            S_ME,
            S_MMI,
            decimal=3,
            err_msg=f"solve_maxent and solve_mmi yield different answers")

        # Check maxent_loss/mmi_loss
        L_ME = mrc.maxent_loss(dgprocess.Sigma, S_ME)
        L_MMI = mrc.mmi_loss(dgprocess.Sigma, S_MMI)
        np.testing.assert_almost_equal(
            L_ME,
            L_MMI,
            decimal=3,
            err_msg=f"maxent_loss and mmi_loss yield different answers")
Example #9
    def test_consistency_of_inferring_sigma(self):
        """ Checks that the same knockoffs are produced
        whether you infer the covariance matrix first and
        pass it to the gaussian_knockoffs generator, or
        you let the generator do the work for you
        """

        n = 25
        p = 300
        rho = 0.5
        dgprocess = dgp.DGP()
        X, _, _, _, _ = dgprocess.sample_data(n=n, p=p, rho=rho, method="AR1")

        # Method 1: infer cov first
        V, _ = utilities.estimate_covariance(X, tol=1e-2)
        np.random.seed(110)
        Xk1 = knockoffs.GaussianSampler(X=X, Sigma=V,
                                        method="sdp").sample_knockoffs()

        # Method 2: Infer during
        np.random.seed(110)
        Xk2 = knockoffs.GaussianSampler(X=X, method="sdp").sample_knockoffs()
        np.testing.assert_array_almost_equal(
            Xk1, Xk2, 5, err_msg="Knockoff gen is inconsistent")
Example #10
    def test_misaligned_covariance_estimation(self):

        # Inputs
        seed = 110
        sample_kwargs = {
            "n": 640,
            "p": 300,
            "method": "blockequi",
            "gamma": 1,
            "rho": 0.8,
        }

        # Extract a couple of constants
        n = sample_kwargs["n"]
        p = sample_kwargs["p"]

        # Create data generating process
        np.random.seed(seed)
        dgprocess = dgp.DGP()
        X, y, beta, _, V = dgprocess.sample_data(**sample_kwargs)

        # Make sure this does not raise an error, even though the
        # empirical covariance is ill-conditioned and the graphical
        # lasso can struggle in this regime
        utilities.estimate_covariance(X, shrinkage="graphicallasso")
Example #11
    def test_small_ising_samples(self):

        # Test samples to make sure the
        # knockoff properties hold
        np.random.seed(110)
        n = 100000
        p = 9
        mu = np.zeros(p)
        dgprocess = dgp.DGP()
        X, _, _, _, _ = dgprocess.sample_data(
            n=n,
            p=p,
            method="ising",
            x_dist="gibbs",
        )
        gibbs_graph = dgprocess.gibbs_graph
        np.fill_diagonal(gibbs_graph, 1)

        # We load custom cov/q matrices for this
        file_directory = os.path.dirname(os.path.abspath(__file__))
        V = np.loadtxt(f"{file_directory}/test_covs/vout{p}.txt")
        Q = np.loadtxt(f"{file_directory}/test_covs/qout{p}.txt")
        max_nonedge = np.max(np.abs(Q[gibbs_graph == 0]))
        self.assertTrue(
            max_nonedge < 1e-5,
            f"Estimated precision for ising{p} has max_nonedge {max_nonedge} >= 1e-5",
        )

        # Initialize sampler
        metro_sampler = metro.GibbsGridSampler(
            X=X,
            gibbs_graph=gibbs_graph,
            mu=mu,
            Sigma=V,
            Q=Q,
            max_width=2,
        )

        # Sample
        Xk = metro_sampler.sample_knockoffs()

        # Check empirical means
        mu_hat = X.mean(axis=0)
        muk_hat = np.mean(Xk, axis=0)
        np.testing.assert_almost_equal(
            muk_hat,
            mu_hat,
            decimal=2,
            err_msg="For Ising sampler, empirical mean of Xk does not match mean of X",
        )

        # Check empirical covariance matrix
        V_hat = np.cov(X.T)
        Vk_hat = np.cov(Xk.T)
        np.testing.assert_almost_equal(
            V_hat / 2,
            Vk_hat / 2,
            decimal=1,
            err_msg="For Ising sampler, empirical covariance of Xk does not match cov of X",
        )

        # Check that marginal fourth moments match
        X4th = np.mean(np.power(X, 4), axis=0)
        Xk4th = np.mean(np.power(Xk, 4), axis=0)
        np.testing.assert_almost_equal(
            X4th / 10,
            Xk4th / 10,
            decimal=1,
            err_msg="For Ising sampler, fourth moment of Xk does not match fourth moment of X",
        )

        # Run a ton of KS tests
        metro_sampler.check_xk_validity(
            X,
            Xk,
            testname="SMALL_ISING",
        )
Example #12
    def ARsample():
        # rho = 1.5 is not a valid AR1 correlation, so this should raise
        dgprocess = dgp.DGP()
        dgprocess.sample_data(method="AR1", rho=1.5)
Example #13
    def test_divconquer_likelihoods(self):

        # Test to make sure the way we split up
        # cliques does not change the likelihood
        np.random.seed(110)
        n = 10
        p = 625
        mu = np.zeros(p)
        dgprocess = dgp.DGP()
        X, _, _, _, _ = dgprocess.sample_data(
            n=n,
            p=p,
            method="ising",
            x_dist="gibbs",
        )
        gibbs_graph = dgprocess.gibbs_graph
        np.fill_diagonal(gibbs_graph, 1)

        # Read V
        file_directory = os.path.dirname(os.path.abspath(__file__))
        V = np.loadtxt(f"{file_directory}/test_covs/vout{p}.txt")

        # Initialize sampler
        metro_sampler = metro.GibbsGridSampler(
            X=X,
            gibbs_graph=gibbs_graph,
            mu=mu,
            Sigma=V,
            max_width=2,
        )

        # Non-divided likelihood
        nondiv_like = 0
        for clique, lp in zip(metro_sampler.cliques,
                              metro_sampler.log_potentials):
            nondiv_like += lp(X[:, np.array(clique)])

        # Divided likelihood for the many keys
        many_div_like = np.zeros(n)
        for dc_key in metro_sampler.dc_keys:
            # Initialize likelihood for these data points
            div_like = 0
            # Helpful constants
            seps = metro_sampler.separators[dc_key]
            n_inds = metro_sampler.X_ninds[dc_key]
            # Add separator-to-separator cliques manually
            for clique, lp in zip(metro_sampler.cliques,
                                  metro_sampler.log_potentials):
                if clique[0] not in seps or clique[1] not in seps:
                    continue
                sepX = X[n_inds]
                div_like += lp(sepX[:, np.array(clique)])

            # Now loop through other blocks
            div_dict_list = metro_sampler.divconq_info[dc_key]
            for block_dict in div_dict_list:
                blockX = X[n_inds][:, block_dict["inds"]]
                for clique, lp in zip(block_dict["cliques"],
                                      block_dict["lps"]):
                    div_like += lp(blockX[:, clique])
            many_div_like[n_inds] = np.array(div_like)

        # Test to make sure these likelihoods agree
        np.testing.assert_almost_equal(
            nondiv_like,
            many_div_like,
            decimal=5,
            err_msg=f"Non-divided clique potentials {nondiv_like} do not agree with divided cliques {many_div_like}",
        )
Example #14
    def test_blockt_samples(self):

        # Test to make sure low df --> heavy tails
        # and therefore acceptances < 1
        np.random.seed(110)
        n = 2000000
        p = 6
        df_t = 5
        dgprocess = dgp.DGP()
        X, _, _, Q, V = dgprocess.sample_data(
            n=n,
            p=p,
            method="blockequi",
            rho=0.4,
            gamma=0,
            block_size=3,
            x_dist="blockt",
            df_t=df_t,
        )
        for S in [np.eye(p), None]:

            # Sample t
            tsampler = metro.BlockTSampler(X=X,
                                           Sigma=V,
                                           df_t=df_t,
                                           S=S,
                                           metro_verbose=True)

            # Sample
            Xk = tsampler.sample_knockoffs()

            # Check empirical means
            muk_hat = np.mean(Xk, axis=0)
            np.testing.assert_almost_equal(
                muk_hat,
                np.zeros(p),
                decimal=2,
                err_msg="For block T sampler, empirical mean of Xk is not approximately zero",
            )

            # Check empirical covariance matrix
            Vk_hat = np.cov(Xk.T)
            np.testing.assert_almost_equal(
                V,
                Vk_hat,
                decimal=2,
                err_msg="For block T sampler, empirical covariance of Xk does not match cov of X",
            )

            # Check that marginal fourth moments match
            X4th = np.mean(np.power(X, 4), axis=0)
            Xk4th = np.mean(np.power(Xk, 4), axis=0)
            np.testing.assert_almost_equal(
                X4th / 10,
                Xk4th / 10,
                decimal=1,
                err_msg="For block T sampler, fourth moment of Xk does not match fourth moment of X",
            )

            # Run a ton of KS tests
            tsampler.check_xk_validity(X, Xk, testname="BLOCKT")
Example #15
    def test_tmarkov_samples(self):

        # Test to make sure low df --> heavy tails
        # and therefore acceptances < 1
        np.random.seed(110)
        n = 1000000
        p = 5
        df_t = 3
        dgprocess = dgp.DGP()
        X, _, _, Q, V = dgprocess.sample_data(n=n,
                                              p=p,
                                              method="AR1",
                                              rho=0.3,
                                              x_dist="ar1t",
                                              df_t=df_t)
        for S in [None, np.eye(p)]:

            # Sample t
            tsampler = metro.ARTKSampler(X=X,
                                         Sigma=V,
                                         df_t=df_t,
                                         S=S,
                                         metro_verbose=True)

            # Correct junction tree
            self.assertTrue(
                tsampler.width == 1,
                f"tsampler should have width 1, not {tsampler.width}")

            # Sample
            Xk = tsampler.sample_knockoffs()

            # Check empirical means
            muk_hat = np.mean(Xk, axis=0)
            np.testing.assert_almost_equal(
                muk_hat,
                np.zeros(p),
                decimal=2,
                err_msg="For ARTK sampler, empirical mean of Xk is not approximately zero",
            )

            # Check empirical correlation matrix
            Vk_hat = np.corrcoef(Xk.T)
            np.testing.assert_almost_equal(
                V,
                Vk_hat,
                decimal=2,
                err_msg="For ARTK sampler, empirical correlation of Xk does not match correlation of X",
            )

            # Check that marginal fourth moments match
            X4th = np.mean(np.power(X, 4), axis=0)
            Xk4th = np.mean(np.power(Xk, 4), axis=0)
            np.testing.assert_almost_equal(
                X4th / 10,
                Xk4th / 10,
                decimal=1,
                err_msg="For ARTK sampler, fourth moment of Xk does not match fourth moment of X",
            )

            # Run a ton of KS tests
            tsampler.check_xk_validity(X, Xk, testname="ARTK")
Example #16
    def test_tmarkov_likelihood(self):

        # Data
        np.random.seed(110)
        n = 15
        p = 10
        df_t = 5
        X1 = np.random.randn(n, p)
        X2 = np.random.randn(n, p)
        V = np.eye(p)
        Q = np.eye(p)

        # Scipy likelihood ratio for X, scale matrix
        inv_scale = np.sqrt(df_t / (df_t - 2))
        sp_like1 = stats.t.logpdf(inv_scale * X1, df=df_t).sum(axis=1)
        sp_like2 = stats.t.logpdf(inv_scale * X2, df=df_t).sum(axis=1)
        sp_ratio = sp_like1 - sp_like2

        # General likelihood
        rhos = np.zeros(p - 1)
        ar1_like1 = metro.t_markov_loglike(X1, rhos, df_t=df_t)
        ar1_like2 = metro.t_markov_loglike(X2, rhos, df_t=df_t)
        ar1_ratio = ar1_like1 - ar1_like2

        self.assertTrue(
            np.abs(ar1_ratio - sp_ratio).sum() < 0.01,
            f"AR1 ratio {ar1_ratio} and scipy ratio {sp_ratio} disagree for independent t vars",
        )

        # Test again with df_t --> infinity, so it should be approx gaussian
        dgprocess = dgp.DGP()
        X1, _, _, Q, V = dgprocess.sample_data(n=n,
                                               p=p,
                                               method="AR1",
                                               a=3,
                                               b=1)
        X2 = np.random.randn(n, p)

        # Ratio using normals
        df_t = 100000
        mu = np.zeros(p)
        norm_like1 = stats.multivariate_normal(mean=mu, cov=V).logpdf(X1)
        norm_like2 = stats.multivariate_normal(mean=mu, cov=V).logpdf(X2)
        norm_ratio = norm_like1 - norm_like2

        # Ratios using T
        rhos = np.diag(V, 1)
        ar1_like1 = metro.t_markov_loglike(X1, rhos, df_t=df_t)
        ar1_like2 = metro.t_markov_loglike(X2, rhos, df_t=df_t)
        ar1_ratio = ar1_like1 - ar1_like2

        self.assertTrue(
            np.abs(ar1_ratio - norm_ratio).mean() < 0.01,
            f"AR1 ratio {ar1_ratio} and gaussian ratio {norm_ratio} disagree for corr. t vars, df={df_t}",
        )

        # Check consistency of tsampler class
        tsampler = metro.ARTKSampler(
            X=X1,
            Sigma=V,
            df_t=df_t,
        )
        new_ar1_like1 = tsampler.lf(tsampler.X)
        self.assertTrue(
            np.abs(ar1_like1 - new_ar1_like1).sum() < 0.01,
            f"AR1 loglike inconsistent between class ({new_ar1_like1}) and function ({ar1_like1})",
        )
Example #17
    def test_dense_sample(self):

        # Fake data
        np.random.seed(110)
        n = 10000
        p = 4
        dgprocess = dgp.DGP()
        X, _, _, Q, V = dgprocess.sample_data(method="blockequi",
                                              rho=0.6,
                                              n=n,
                                              p=p,
                                              gamma=1,
                                              block_size=p)
        ksampler = knockpy.knockoffs.GaussianSampler(X=X,
                                                     Sigma=V,
                                                     method="mvr")
        S = ksampler.fetch_S()

        # Network graph
        Q_graph = np.abs(Q) > 1e-5
        Q_graph = Q_graph - np.eye(p)
        undir_graph = nx.Graph(Q_graph)
        width, T = treewidth.treewidth_decomp(undir_graph)
        order, active_frontier = metro.get_ordering(T)

        # Metro sampler and likelihood
        mvn = stats.multivariate_normal(mean=np.zeros(p), cov=V)

        def mvn_likelihood(X):
            return mvn.logpdf(X)

        gamma = 0.99999
        metro_sampler = metro.MetropolizedKnockoffSampler(
            lf=mvn_likelihood,
            X=X,
            mu=np.zeros(p),
            Sigma=V,
            order=order,
            active_frontier=active_frontier,
            gamma=gamma,
            S=S,
            metro_verbose=True,
        )

        # Output knockoffs
        Xk = metro_sampler.sample_knockoffs()

        # Acceptance rate should be essentially one, since gamma is nearly 1
        acc_rate = metro_sampler.final_acc_probs.mean()
        self.assertTrue(
            acc_rate - gamma > -1e-3,
            msg=
            f"For equi gaussian design, metro has acc_rate={acc_rate} < gamma={gamma}",
        )

        # Check covariance matrix
        features = np.concatenate([X, Xk], axis=1)
        emp_corr_matrix = np.corrcoef(features.T)
        G = np.concatenate([
            np.concatenate([V, V - S]),
            np.concatenate([V - S, V]),
        ],
                           axis=1)

        np.testing.assert_almost_equal(
            emp_corr_matrix,
            G,
            decimal=2,
            err_msg="For equi gaussian design, metro does not match theoretical matrix",
        )
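The nested np.concatenate calls above build the joint feature-knockoff covariance [[V, V - S], [V - S, V]]; an equivalent and arguably clearer construction uses np.block:

    # Same 2p x 2p matrix as the concatenate-based construction
    G = np.block([
        [V, V - S],
        [V - S, V],
    ])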
Example #18
    def non_ar1_t():
        # x_dist="ar1t" requires method="AR1"; n and p come from the
        # enclosing test's scope. This call should raise an error.
        dgprocess = dgp.DGP()
        dgprocess.sample_data(n=n, p=p, method="ver", x_dist="ar1t")
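As with ARsample above, this helper is presumably wrapped in an exception check inside the enclosing test; a sketch, assuming a ValueError:

    self.assertRaises(ValueError, non_ar1_t)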
Example #19
    def test_complex_solns(self):
        """
        Check the solution of the various solvers
        for non-grouped knockoffs.
        """

        # Check availability
        if not TORCH_AVAILABLE:
            return None
        from knockpy import kpytorch

        np.random.seed(110)
        p = 100
        methods = ["ar1", "ver"]
        groups = np.arange(1, p + 1, 1)
        for method in methods:
            dgprocess = dgp.DGP()
            _, _, _, _, Sigma = dgprocess.sample_data(method=method, p=p)

            # Use SDP as baseline
            init_S = knockpy.mac.solve_group_SDP(Sigma, groups)
            sdp_mvr_loss = mrc.mvr_loss(Sigma, init_S)

            # Apply gradient solver
            opt_S = kpytorch.mrcgrad.solve_mrc_psgd(
                Sigma=Sigma,
                groups=groups,
                init_S=init_S,
                tol=1e-5,
                max_epochs=100,
                line_search_iter=10,
            )
            psgd_mvr_loss = mrc.mvr_loss(Sigma, opt_S)

            # Check S matrix
            self.check_S_properties(Sigma, opt_S, groups)
            # Check new loss < init_loss
            self.assertTrue(
                psgd_mvr_loss <= sdp_mvr_loss,
                msg=
                f"For {method}, PSGD solver has higher loss {psgd_mvr_loss} v. sdp {sdp_mvr_loss}",
            )

            # MVR solver outperforms PSGD
            opt_S_mvr = mrc.solve_mvr(Sigma=Sigma)
            self.check_S_properties(Sigma, opt_S_mvr, groups)
            cd_mvr_loss = mrc.mvr_loss(Sigma, opt_S_mvr)
            self.assertTrue(
                cd_mvr_loss <= psgd_mvr_loss,
                msg=
                f"For {method}, coord descent MVR solver has higher loss {cd_mvr_loss} v. PSGD {psgd_mvr_loss}",
            )

            # MMI solver outperforms PSGD
            opt_S_mmi = mrc.solve_mmi(Sigma=Sigma)
            self.check_S_properties(Sigma, opt_S_mmi, groups)
            cd_mmi_loss = mrc.mmi_loss(Sigma, opt_S_mmi)
            psgd_mmi_loss = mrc.mmi_loss(Sigma, opt_S)
            self.assertTrue(
                cd_mmi_loss <= psgd_mmi_loss,
                msg=
                f"For {method}, coord descent mmi solver has higher loss {cd_mmi_loss} v. PSGD {psgd_mmi_loss}",
            )
Example #20
    def bad_xdist():
        # "t_dist" is not a recognized x_dist, so this should raise
        dgprocess = dgp.DGP()
        dgprocess.sample_data(method="ver", x_dist="t_dist")
Example #21
    def test_ar1_sample(self):

        # Fake data
        np.random.seed(110)
        n = 30000
        p = 8
        dgprocess = dgp.DGP()
        X, _, _, Q, V = dgprocess.sample_data(method="AR1", n=n, p=p)
        ksampler = knockpy.knockoffs.GaussianSampler(X=X,
                                                     Sigma=V,
                                                     method="mvr")
        S = ksampler.fetch_S()

        # Graph structure; the sampler builds the junction tree from undir_graph
        Q_graph = np.abs(Q) > 1e-5
        Q_graph = Q_graph - np.eye(p)

        # Metro sampler + likelihood
        mvn = stats.multivariate_normal(mean=np.zeros(p), cov=V)

        def mvn_likelihood(X):
            return mvn.logpdf(X)

        gamma = 0.9999
        metro_sampler = metro.MetropolizedKnockoffSampler(
            lf=mvn_likelihood,
            X=X,
            mu=np.zeros(p),
            Sigma=V,
            undir_graph=Q_graph,
            S=S,
            gamma=gamma,
        )

        # Output knockoffs
        Xk = metro_sampler.sample_knockoffs()

        # Acceptance rate should be essentially one, since gamma is nearly 1
        acc_rate = metro_sampler.final_acc_probs.mean()
        self.assertTrue(
            acc_rate - gamma > -1e-3,
            msg=
            f"For AR1 gaussian design, metro has acc_rate={acc_rate} < gamma={gamma}",
        )

        # Check covariance matrix
        features = np.concatenate([X, Xk], axis=1)
        emp_corr_matrix = np.corrcoef(features.T)
        G = np.concatenate([
            np.concatenate([V, V - S]),
            np.concatenate([V - S, V]),
        ],
                           axis=1)

        np.testing.assert_almost_equal(
            emp_corr_matrix,
            G,
            decimal=2,
            err_msg="For AR1 gaussian design, metro does not match theoretical matrix",
        )
Example #22
    def test_debiased_lasso(self):

        # Create data generating process
        n = 200
        p = 20
        rho = 0.3
        np.random.seed(110)
        dgprocess = dgp.DGP()
        X, y, beta, _, corr_matrix = dgprocess.sample_data(
            n=n,
            p=p,
            y_dist="gaussian",
            coeff_size=100,
            sign_prob=0.5,
            method="blockequi",
            rho=rho,
        )
        groups = np.arange(1, p + 1, 1)

        # Create knockoffs
        S = (1 - rho) * np.eye(p)
        ksampler = knockpy.knockoffs.GaussianSampler(X=X,
                                                     groups=groups,
                                                     Sigma=corr_matrix,
                                                     verbose=False,
                                                     S=S)
        knockoffs = ksampler.sample_knockoffs()
        G = np.concatenate(
            [
                np.concatenate([corr_matrix, corr_matrix - S]),
                np.concatenate([corr_matrix - S, corr_matrix]),
            ],
            axis=1,
        )
        Ginv = utilities.chol2inv(G)

        # Debiased lasso - test accuracy
        dlasso_stat = kstats.LassoStatistic()
        dlasso_stat.fit(X,
                        knockoffs,
                        y,
                        use_lars=False,
                        cv_score=False,
                        debias=True,
                        Ginv=Ginv)
        W = dlasso_stat.W
        l2norm = np.power(W - beta, 2).mean()
        self.assertTrue(
            l2norm < 1,
            msg=
            f"Debiased lasso fits gaussian data very poorly (l2norm = {l2norm} btwn real/fitted coeffs)",
        )

        # Test that this throws the correct errors
        # first for Ginv
        def debiased_lasso_sans_Ginv():
            dlasso_stat.fit(X,
                            knockoffs,
                            y,
                            use_lars=False,
                            cv_score=False,
                            debias=True,
                            Ginv=None)

        self.assertRaisesRegex(ValueError, "Ginv must be provided",
                               debiased_lasso_sans_Ginv)

        # Second for logistic data
        y = np.random.binomial(1, 0.5, n)

        def binomial_debiased_lasso():
            dlasso_stat.fit(
                X,
                knockoffs,
                y,
                use_lars=False,
                cv_score=False,
                debias=True,
                Ginv=Ginv,
            )

        self.assertRaisesRegex(
            ValueError,
            "Debiased lasso is not implemented for binomial data",
            binomial_debiased_lasso,
        )
Example #23
    def test_cv_scoring(self):

        # Create data generating process
        n = 100
        p = 20
        np.random.seed(110)
        dgprocess = dgp.DGP()
        X, y, beta, _, corr_matrix = dgprocess.sample_data(n=n,
                                                           p=p,
                                                           y_dist="gaussian",
                                                           coeff_size=100,
                                                           sign_prob=1)
        groups = np.arange(1, p + 1, 1)

        # These are not real, just helpful syntactically
        knockoffs = np.zeros((n, p))

        # 1. Test lars cv scoring
        lars_stat = kstats.LassoStatistic()
        lars_stat.fit(
            X,
            knockoffs,
            y,
            use_lars=True,
            cv_score=True,
        )
        self.assertTrue(
            lars_stat.score_type == "mse_cv",
            msg=
            f"cv_score=True fails to create cross-validated scoring for lars (score_type={lars_stat.score_type})",
        )

        # 2. Test OLS cv scoring
        ols_stat = kstats.OLSStatistic()
        ols_stat.fit(
            X,
            knockoffs,
            y,
            cv_score=True,
        )
        self.assertTrue(
            ols_stat.score_type == "mse_cv",
            msg=
            f"cv_score=True fails to create cross-validated scoring for ols (score_type={ols_stat.score_type})",
        )
        self.assertTrue(
            ols_stat.score < 2,
            msg=
            f"cv scoring fails for ols_stat as cv_score={ols_stat.score} >= 2",
        )

        # 3. Test that throws correct error for non-sklearn backend
        def non_sklearn_backend_cvscore():
            dgprocess = dgp.DGP()
            X, y, beta, _, corr_matrix = dgprocess.sample_data(
                n=n, p=p, y_dist="binomial", coeff_size=100, sign_prob=1)
            groups = np.random.randint(1, p + 1, size=(p, ))
            groups = utilities.preprocess_groups(groups)
            pyglm_logit = kstats.LassoStatistic()
            pyglm_logit.fit(
                X,
                knockoffs,
                y,
                use_pyglm=True,
                group_lasso=True,
                groups=groups,
                cv_score=True,
            )

        self.assertRaisesRegex(ValueError, "must be sklearn estimator",
                               non_sklearn_backend_cvscore)
Example #24
    def check_kstat_fit(
        self,
        fstat,
        fstat_name,
        fstat_kwargs={},
        min_power=0.8,
        max_l2norm=9,
        seed=110,
        group_features=False,
        **sample_kwargs,
    ):
        """ fstat should be a class instance inheriting from FeatureStatistic """

        # Add defaults to sample kwargs
        if "method" not in sample_kwargs:
            sample_kwargs["method"] = "blockequi"
        if "gamma" not in sample_kwargs:
            sample_kwargs["gamma"] = 1
        if "n" not in sample_kwargs:
            sample_kwargs["n"] = 200
        if "p" not in sample_kwargs:
            sample_kwargs["p"] = 50
        if "rho" not in sample_kwargs:
            sample_kwargs["rho"] = 0.5
        if "y_dist" not in sample_kwargs:
            sample_kwargs["y_dist"] = "gaussian"
        n = sample_kwargs["n"]
        p = sample_kwargs["p"]
        rho = sample_kwargs["rho"]
        y_dist = sample_kwargs["y_dist"]

        # Create data generating process
        np.random.seed(seed)
        dgprocess = dgp.DGP()
        X, y, beta, _, corr_matrix = dgprocess.sample_data(**sample_kwargs)

        # Create groups
        if group_features:
            groups = np.random.randint(1, p + 1, size=(p, ))
            groups = utilities.preprocess_groups(groups)
        else:
            groups = np.arange(1, p + 1, 1)

        # Create knockoffs
        ksampler = knockpy.knockoffs.GaussianSampler(
            X=X,
            groups=groups,
            Sigma=corr_matrix,
            verbose=False,
            S=(1 - rho) * np.eye(p),
        )
        Xk = ksampler.sample_knockoffs()
        S = ksampler.fetch_S()

        # Fit and extract coeffs/T
        fstat.fit(
            X,
            Xk,
            y,
            groups=groups,
            **fstat_kwargs,
        )
        W = fstat.W
        T = data_dependent_threshhold(W, fdr=0.2)

        # Test L2 norm
        m = np.unique(groups).shape[0]
        if m == p:
            pair_W = W
        else:
            pair_W = kstats.combine_Z_stats(fstat.Z, antisym="cd")
        l2norm = np.power(pair_W - np.abs(beta), 2)
        l2norm = l2norm.mean()
        self.assertTrue(
            l2norm < max_l2norm,
            msg=
            f"{fstat_name} fits {y_dist} data very poorly (l2norm = {l2norm} btwn real {beta} / fitted {pair_W} coeffs)",
        )

        # Test power for non-grouped setting.
        # (For group setting, power will be much lower.)
        selections = (W >= T).astype("float32")
        group_nnulls = utilities.fetch_group_nonnulls(beta, groups)
        power = (
            (group_nnulls != 0) * selections).sum() / np.sum(group_nnulls != 0)
        fdp = ((group_nnulls == 0) * selections).sum() / max(
            np.sum(selections), 1)
        self.assertTrue(
            power >= min_power,
            msg=
            f"Power {power} for {fstat_name} in equicor case (n={n},p={p},rho={rho}, y_dist {y_dist}, grouped={group_features}) should be > {min_power}. W stats are {W}, beta is {beta}",
        )
Example #25
    def check_fdr_control(
        self,
        reps=NUM_REPS,
        q=0.2,
        alpha=0.05,
        filter_kwargs={},
        S=None,
        infer_sigma=False,
        test_grouped=True,
        S_method="mvr",
        **kwargs,
    ):

        np.random.seed(110)
        filter_kwargs = filter_kwargs.copy()
        kwargs = kwargs.copy()
        fixedX = False
        if "ksampler" in filter_kwargs:
            if filter_kwargs["ksampler"] == "fx":
                fixedX = True

        # Create and name DGP
        mu = kwargs.pop("mu", None)
        Sigma = kwargs.pop("Sigma", None)
        invSigma = kwargs.pop("invSigma", None)
        beta = kwargs.pop("beta", None)
        dgprocess = dgp.DGP(mu=mu, Sigma=Sigma, invSigma=invSigma, beta=beta)
        X0, _, beta, _, Sigma = dgprocess.sample_data(**kwargs)

        basename = ""
        for key in kwargs:
            basename += f"{key}={kwargs[key]} "

        # Two settings: one grouped, one not
        p = Sigma.shape[0]
        groups1 = np.arange(1, p + 1, 1)
        name1 = basename + " (ungrouped)"
        groups2 = np.random.randint(1, p + 1, size=(p, ))
        groups2 = utilities.preprocess_groups(groups2)
        name2 = basename + " (grouped)"

        # Split filter_kwargs
        init_filter_kwargs = {}
        init_filter_kwargs["ksampler"] = filter_kwargs.pop(
            "ksampler", "gaussian")
        init_filter_kwargs["fstat"] = filter_kwargs.pop("fstat", "lasso")
        knockoff_kwargs = filter_kwargs.pop('knockoff_kwargs', {})

        for name, groups in zip([name1, name2], [groups1, groups2]):

            if not test_grouped and np.all(groups == groups2):
                continue

            # Solve for S matrix
            if S is None and not fixedX and not infer_sigma:
                ksampler = knockpy.knockoffs.GaussianSampler(
                    X=X0,
                    Sigma=Sigma,
                    groups=groups,
                    method=S_method,
                )
                S = ksampler.fetch_S()  # otherwise the sampler above is never used
            if not fixedX:
                invSigma = utilities.chol2inv(Sigma)
            group_nonnulls = utilities.fetch_group_nonnulls(beta, groups)

            # Container for fdps
            fdps = []

            # Sample data reps times
            for j in range(reps):
                np.random.seed(j)
                dgprocess = dgp.DGP(Sigma=Sigma, beta=beta)
                X, y, _, Q, _ = dgprocess.sample_data(**kwargs)
                gibbs_graph = dgprocess.gibbs_graph

                # Infer y_dist
                if "y_dist" in kwargs:
                    y_dist = kwargs["y_dist"]
                else:
                    y_dist = "gaussian"

                # Run (MX) knockoff filter
                if fixedX or infer_sigma:
                    mu_arg = None
                    Sigma_arg = None
                    invSigma_arg = None
                else:
                    mu_arg = np.zeros(p)
                    Sigma_arg = Sigma
                    invSigma_arg = invSigma

                # Initialize filter
                knockoff_filter = KnockoffFilter(**init_filter_kwargs)

                # Knockoff kwargs
                knockoff_kwargs['S'] = S
                knockoff_kwargs['invSigma'] = invSigma_arg
                knockoff_kwargs['verbose'] = False

                if "df_t" in kwargs:
                    knockoff_kwargs["df_t"] = kwargs["df_t"]
                if "x_dist" in kwargs:
                    if kwargs["x_dist"] == "gibbs":
                        knockoff_kwargs["gibbs_graph"] = gibbs_graph
                    knockoff_kwargs.pop("S", None)

                selections = knockoff_filter.forward(
                    X=X,
                    y=y,
                    mu=mu_arg,
                    Sigma=Sigma_arg,
                    groups=groups,
                    knockoff_kwargs=knockoff_kwargs,
                    fdr=q,
                    **filter_kwargs,
                )

                # Check null W-statistics are symmetric
                pos_prop = (knockoff_filter.W[group_nonnulls == 0] > 0).mean()
                pos_prop_se = np.sqrt(pos_prop * (1 - pos_prop) /
                                      (1 - group_nonnulls).sum())
                Zstat = (pos_prop - 0.5) / pos_prop_se
                pval = 1 - stats.norm.cdf(Zstat)
                self.assertTrue(
                    pval >= 0.001,
                    msg=
                    f"MX filter null W-stats have pos_prob {pos_prop} with p={p} and pval={pval}",
                )

                # Calculate fdp
                fdp = np.sum(selections * (1 - group_nonnulls)) / max(
                    1, np.sum(selections))
                fdps.append(fdp)

                del knockoff_filter

            fdps = np.array(fdps)
            fdr = fdps.mean()
            fdr_se = fdps.std() / np.sqrt(reps)
            norm_quant = stats.norm.ppf(1 - alpha)
            self.assertTrue(
                fdr - norm_quant * fdr_se <= q,
                msg=
                f"MX filter FDR {fdr} (SE {fdr_se}) exceeds target level {q} for {name}",
            )
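The closing assertion above reconstructs the truncated ending of this snippet; the exact form in the source may differ. A hypothetical invocation of the helper from a test method (the kwargs shown are illustrative):

    # n, p, method, rho are forwarded to dgp.DGP().sample_data via **kwargs
    self.check_fdr_control(n=200, p=50, method="blockequi", rho=0.6, q=0.2)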
Example #26
    def sample_bad_dist():
        # "bad_dist_arg" is not a valid coeff_dist, so this should raise
        dgprocess = dgp.DGP()
        dgprocess.sample_data(p=100, coeff_dist="bad_dist_arg")
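And the presumed enclosing assertion for this helper, again assuming a ValueError:

    self.assertRaises(ValueError, sample_bad_dist)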