# Imports needed by the functions below. The module paths are assumptions
# based on the selective-inference package layout; adjust them to match
# your installation.
import numpy as np
from scipy.stats import norm as ndist

from selection.algorithms.forward_step import forward_stepwise
from selection.covtest import covtest
from selection.affine import constraints, gibbs_test

def centering(n):
    # Assumed helper, inferred from its use below: the n x n centering
    # matrix I - 11^T / n, the covariance (up to sigma^2) of a
    # mean-centered Gaussian vector.
    return np.identity(n) - np.ones((n, n)) / n

def simulate_null():
    n, p = 100, 40
    X = (np.random.standard_normal((n, p)) +
         0.4 * np.random.standard_normal(n)[:, None])
    X /= (X.std(0)[None, :] * np.sqrt(n))
    Y = np.random.standard_normal(n) * 0.5
    FS = forward_stepwise(X, Y, sigma=0.5)
    for _ in range(5):
        FS.next()
    # keep only the pivot (last entry) of each result
    return [pval[-1] for pval in FS.model_pivots(3)]
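# A minimal usage sketch, not part of the original module: repeat
# simulate_null and check that the selected-model pivots look uniform
# under the global null. The helper name, the number of repetitions and
# the KS test are illustrative choices.
def check_null_pivots(nsim=50):
    from scipy.stats import kstest
    pivots = []
    for _ in range(nsim):
        pivots.extend(simulate_null())
    # under valid selective inference the pivots should be roughly U(0,1)
    return kstest(pivots, 'uniform')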
def test_FS():
    n, p = 100, 40
    X = (np.random.standard_normal((n, p)) +
         0.4 * np.random.standard_normal(n)[:, None])
    X /= (X.std(0)[None, :] * np.sqrt(n))
    Y = np.random.standard_normal(n) * 0.5
    FS = forward_stepwise(X, Y, sigma=0.5)

    for _ in range(30):
        FS.next()
        if not FS.check_constraints():
            raise ValueError('constraints not satisfied')

    print('first 30 variables selected', FS.variables)
    print(r'M^{\pm} for the 3rd selected model, knowing that we '
          r'performed 30 steps of forward stepwise')
    FS.model_pivots(3)
    FS.model_quadratic(3)
def sample_split(X, Y, sigma=None, nstep=10,
                 burnin=1000, ndraw=5000,
                 reduced=True):
    n, p = X.shape
    half_n = int(n / 2)

    # first half of the data, centered -- used to select the model
    X1, Y1 = X[:half_n, :] * 1., Y[:half_n] * 1.
    X1 -= X1.mean(0)[None, :]
    Y1 -= Y1.mean()

    # second half, centered -- held out for the sample-splitting test
    X2, Y2 = X[half_n:], Y[half_n:]
    X2 -= X2.mean(0)[None, :]
    Y2 -= Y2.mean()

    FS_half = forward_stepwise(X1, Y1)              # sample splitting model
    FS_full = forward_stepwise(X.copy(), Y.copy())  # full data model

    spacings_P = []
    split_P = []
    reduced_Pknown = []
    reduced_Punknown = []
    covtest_P = []

    for i in range(nstep):

        FS_half.next()

        if FS_half.P[i] is not None:
            RX = FS_half.X - FS_half.P[i](FS_half.X)
            RY = FS_half.Y - FS_half.P[i](FS_half.Y)
            covariance = centering(FS_half.Y.shape[0]) - np.dot(FS_half.P[i].U,
                                                                FS_half.P[i].U.T)
        else:
            RX = FS_half.X
            RY = FS_half.Y
            covariance = centering(FS_half.Y.shape[0])

        RX -= RX.mean(0)[None, :]
        RX /= (RX.std(0)[None, :] * np.sqrt(RX.shape[0]))

        # covtest on half -- not saved
        con, pval, idx, sign = covtest(RX, RY, sigma=sigma,
                                       covariance=covariance,
                                       exact=True)

        # spacings on half -- not saved
        eta1 = RX[:, idx] * sign
        Acon = constraints(FS_half.A,
                           np.zeros(FS_half.A.shape[0]),
                           covariance=centering(FS_half.Y.shape[0]))
        Acon.covariance *= sigma**2
        Acon.pivot(eta1, FS_half.Y)

        # sample split
        eta2 = np.linalg.pinv(X2[:, FS_half.variables])[-1]
        eta_sigma = np.linalg.norm(eta2) * sigma
        split_P.append(2 * ndist.sf(np.fabs((eta2 * Y2).sum() / eta_sigma)))

        # inference on full mu using split model, this is \beta^+_s
        zero_block = np.zeros((Acon.linear_part.shape[0], n - half_n))
        linear_part = np.hstack([Acon.linear_part, zero_block])
        Fcon = constraints(linear_part,
                           Acon.offset,
                           covariance=centering(n))
        Fcon.covariance *= sigma**2

        if i > 0:
            U = np.linalg.pinv(X[:, FS_half.variables[:-1]])
            Uy = np.dot(U, Y)
            Fcon = Fcon.conditional(U, Uy)

        eta_full = np.linalg.pinv(X[:, FS_half.variables])[-1]

        if reduced:
            reduced_pval = gibbs_test(Fcon, Y, eta_full,
                                      ndraw=ndraw,
                                      burnin=burnin,
                                      sigma_known=sigma is not None,
                                      alternative='twosided')[0]
            reduced_Pknown.append(reduced_pval)

            reduced_pval = gibbs_test(Fcon, Y, eta_full,
                                      ndraw=ndraw,
                                      burnin=burnin,
                                      sigma_known=False,
                                      alternative='twosided')[0]
            reduced_Punknown.append(reduced_pval)

        # now use all the data
        FS_full.next()

        if FS_full.P[i] is not None:
            RX = X - FS_full.P[i](X)
            RY = Y - FS_full.P[i](Y)
            covariance = centering(RY.shape[0]) - np.dot(FS_full.P[i].U,
                                                         FS_full.P[i].U.T)
        else:
            RX = X
            RY = Y.copy()
            covariance = centering(RY.shape[0])

        RX -= RX.mean(0)[None, :]
        RX /= RX.std(0)[None, :]

        con, pval, idx, sign = covtest(RX, RY, sigma=sigma,
                                       covariance=covariance,
                                       exact=False)
        covtest_P.append(pval)

        # spacings on full data
        eta1 = RX[:, idx] * sign
        Acon = constraints(FS_full.A,
                           np.zeros(FS_full.A.shape[0]),
                           covariance=centering(RY.shape[0]))
        Acon.covariance *= sigma**2
        spacings_P.append(Acon.pivot(eta1, Y))

    return (split_P, reduced_Pknown, reduced_Punknown,
            spacings_P, covtest_P, FS_half.variables)
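# A minimal usage sketch, not part of the original module, with an assumed
# data-generating process for illustration only: run sample_split on a
# design with one strong signal variable and compare the p-value sequences
# it returns.
def demo_sample_split(n=100, p=40, sigma=1., signal=4., nstep=5):
    X = np.random.standard_normal((n, p))
    X /= X.std(0)[None, :]
    Y = signal * X[:, 0] + np.random.standard_normal(n) * sigma
    (split_P, reduced_Pknown, reduced_Punknown,
     spacings_P, covtest_P, variables) = sample_split(X, Y, sigma=sigma,
                                                      nstep=nstep)
    print('variables selected on first half:', variables)
    print('sample-split p-values:', split_P)
    print('reduced (sigma known):', reduced_Pknown)
    print('spacings (full data):', spacings_P)
    print('covtest (full data):', covtest_P)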
def forward_step(X, Y, sigma=None,
                 nstep=5,
                 exact=False,
                 burnin=1000,
                 ndraw=5000):
    """
    A simple implementation of forward stepwise that uses the
    reduced covtest iteratively after adjusting fully for each
    selected variable.

    This implementation is not efficient, in that it
    computes more SVDs than it really has to.

    Parameters
    ----------

    X : np.float((n,p))

    Y : np.float(n)

    sigma : float (optional)
        Noise level (not needed for the reduced tests).

    nstep : int
        How many steps of forward stepwise?

    exact : bool
        Which version of covtest should we use?

    burnin : int
        How many iterations until we start recording samples?

    ndraw : int
        How many samples should we return?

    """
    n, p = X.shape
    FS = forward_stepwise(X, Y)

    spacings_P = []
    covtest_P = []
    reduced_Pknown = []
    reduced_Punknown = []

    for i in range(nstep):

        FS.next()

        # covtest
        if FS.P[i] is not None:
            RX = X - FS.P[i](X)
            RY = Y - FS.P[i](Y)
            covariance = np.identity(n) - np.dot(FS.P[i].U, FS.P[i].U.T)
        else:
            RX = X
            RY = Y
            covariance = None

        RX -= RX.mean(0)[None, :]
        RX /= RX.std(0)[None, :]

        con, pval, idx, sign = covtest(RX, RY, sigma=sigma,
                                       covariance=covariance,
                                       exact=exact)
        covtest_P.append(pval)

        # reduced
        eta = RX[:, idx] * sign
        Acon = constraints(FS.A, np.zeros(FS.A.shape[0]))
        Acon.covariance *= sigma**2

        if i > 0:
            U = FS.P[-2].U.T
            Uy = np.dot(U, Y)
            Bcon = Acon.conditional(U, Uy)
        else:
            Bcon = Acon

        spacings_P.append(Acon.pivot(eta, Y))

        reduced_pval, _, _ = gibbs_test(Bcon, Y, eta,
                                        ndraw=ndraw,
                                        burnin=burnin,
                                        sigma_known=sigma is not None,
                                        alternative='greater')
        reduced_Pknown.append(reduced_pval)

        reduced_pval, _, _ = gibbs_test(Bcon, Y, eta,
                                        ndraw=ndraw,
                                        burnin=burnin,
                                        sigma_known=False,
                                        alternative='greater')
        reduced_Punknown.append(reduced_pval)

    return covtest_P, reduced_Pknown, reduced_Punknown, spacings_P, FS.variables
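# A minimal usage sketch, not part of the original module, with the same
# assumed data-generating process as the sketch above: compare covtest,
# reduced and spacings p-values along the forward stepwise path on the
# full data.
def demo_forward_step(n=100, p=40, sigma=1., signal=4., nstep=5):
    X = np.random.standard_normal((n, p))
    X /= X.std(0)[None, :]
    Y = signal * X[:, 0] + np.random.standard_normal(n) * sigma
    (covtest_P, reduced_Pknown, reduced_Punknown,
     spacings_P, variables) = forward_step(X, Y, sigma=sigma, nstep=nstep)
    print('variables selected:', variables)
    print('covtest p-values:', covtest_P)
    print('reduced (sigma known):', reduced_Pknown)
    print('spacings:', spacings_P)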