def test_stack():
    """
    Build two random affine constraints on a 30-dimensional space and
    stack them.

    Returns
    -------
    The object returned by `AC.stack` for a 4-row and a 5-row constraint.
    """
    A1, b1 = np.random.standard_normal((4, 30)), np.random.standard_normal(4)
    con1 = AC.constraints(A1, b1)

    A2, b2 = np.random.standard_normal((5, 30)), np.random.standard_normal(5)
    con2 = AC.constraints(A2, b2)

    return AC.stack(con1, con2)
def test_conditional():
    """
    Sample from a random constraint conditioned on the equality C x = d
    and check that the draws satisfy the equality and (almost always)
    the conditional inequalities.
    """
    p = 200
    k1, k2 = 5, 3

    offset = np.random.standard_normal((k1,))
    linear = np.random.standard_normal((k1, p))
    con = AC.constraints(linear, offset)
    con.mean = np.random.standard_normal(p)

    C = np.random.standard_normal((k2, p))
    d = np.random.standard_normal(k2)
    new_con = con.conditional(C, d)

    # Rejection-sample a starting point: project a Gaussian draw onto
    # {x : C x = d} and keep it once both constraints accept it.
    C_pinv = np.linalg.pinv(C)
    accepted = False
    while not accepted:
        W = np.random.standard_normal(p)
        W -= np.dot(C_pinv, np.dot(C, W) - d)
        accepted = new_con(W) and con(W)

    Z = AC.sample_from_constraints(new_con, W, ndraw=5000)

    tol = 0

    # every draw must stay on the equality set
    equality_gap = np.linalg.norm(np.dot(Z, C.T) - d[None, :])
    nt.assert_true(equality_gap < 1.e-7)

    # worst slack per draw for the conditional and the original constraint
    V = (np.dot(Z, new_con.linear_part.T) - new_con.offset[None, :]).max(1)
    V2 = (np.dot(Z, con.linear_part.T) - con.offset[None, :]).max(1)
    print('failing:', (V > tol).sum(), (V2 > tol).sum(), np.linalg.norm(np.dot(C, W) - d))

    nt.assert_true(np.sum(V > tol) < 0.001 * V.shape[0])
def test_conditional_simple():
    """
    Two-dimensional sanity check of `conditional`: compare the first
    coordinate of draws from the conditioned constraint with standard
    normal draws kept only when they fall below -1.
    """
    con = AC.constraints(np.ones((1, 2)), np.array([1]))  # X1+X2 <= 1

    C = np.array([[0, 1]])
    d = np.array([2])  # X2 = 2
    new_con = con.conditional(C, d)

    # starting point: project a Gaussian draw onto X2 = 2, retry until
    # the original constraint accepts it
    feasible = False
    while not feasible:
        W = np.random.standard_normal(2)
        W -= np.dot(np.linalg.pinv(C), np.dot(C, W) - d)
        feasible = con(W)

    Z1 = AC.sample_from_constraints(new_con, W, ndraw=10000)

    # reference sample from the conditional distribution by rejection
    new_sample = []
    while len(new_sample) < 10000:
        draw = np.random.standard_normal()
        if draw < -1:
            new_sample.append(draw)

    a1 = Z1[:, 0]
    a2 = np.array(new_sample)

    # standardized difference of the two sample means
    test = np.fabs((a1.mean() - a2.mean()) / (np.std(a1) * np.sqrt(2)) * np.sqrt(10000))
    nt.assert_true(test < 5)
def power(n, snr, pos, rho=0.25, muval=np.linspace(0, 5, 51)):
    """
    Plot a one-sided power curve computed from a large selective sample.

    A sample is drawn from the constraints returned by `constraints(X, pos)`
    (4,000,000 draws after 100,000 burn-in), mapped through `X.T`, and
    column `pos` is used with unit weights to build a `discrete_family`.
    Power at an effect size is one minus the family's probability of its
    one-sided acceptance region.

    Parameters
    ----------
    n, rho, pos :
        Forwarded to `parameters`; `pos` also selects the column of
        the mapped sample and is forwarded to `constraints`.
    snr : float
        Effect size marked with a dashed vertical line; its power is printed.
    rho : float
    muval : sequence of float
        Effect sizes at which the power curve is evaluated.

    Returns
    -------
    power_fig : matplotlib figure
    powers : dict
        `{'full': full_power}` with the curve evaluated on `muval`.
    """
    X, _mu, _beta = parameters(n, rho, pos)

    # form the correct constraints
    con, initial = constraints(X, pos)
    Z_selection = sample_from_constraints(con, initial,
                                          ndraw=4000000, burnin=100000)
    S0 = np.dot(X.T, Z_selection.T).T
    W0 = np.ones(S0.shape[0])
    dfam0 = discrete_family(S0[:, pos], W0)

    one_sided_acceptance_region = dfam0.one_sided_acceptance(0)

    def one_sided_power(effect):
        # rejection probability = 1 - P(L < statistic <= U) under `effect`
        L, U = one_sided_acceptance_region
        return 1 - (dfam0.cdf(effect, U) - dfam0.cdf(effect, L))

    power_fig = plt.figure(figsize=(8, 8))
    power_ax = power_fig.gca()
    power_ax.set_ylabel('Power', fontsize=20)
    # raw string: '\m' is not a valid escape sequence
    power_ax.set_xlabel(r'Effect size $\mu$', fontsize=20)

    full_power = np.array([one_sided_power(m) for m in muval])
    print(full_power)

    power_ax.plot(muval, full_power, label='Reduced model UMPU',
                  linewidth=7, alpha=0.5)
    # the legend is created only once, after the labelled curve exists
    power_ax.legend(loc='lower right')
    power_ax.set_xlim([0, 5])
    power_ax.plot([snr, snr], [0, 1], 'k--')
    print(one_sided_power(snr))

    return power_fig, {'full': full_power}
def simulation(n, snr, pos, rho=0.25, ndraw=5000, burnin=1000):
    """
    Simulate the combined statistic: coordinate `pos` of `X.T` applied to
    a constrained sample, plus independent Gaussian noise shifted by
    `snr / sqrt(2)`, all divided by `sqrt(2)`.

    Returns one value per sampled row.
    """
    root2 = np.sqrt(2)

    X, mu, beta = parameters(n, rho, pos)
    con, initial = constraints(X, pos)
    con.mean = snr * mu / root2

    Z_selection = sample_from_constraints(con, initial,
                                          ndraw=ndraw, burnin=burnin)

    # independent "inference" half, one draw per selection sample
    n_rows = Z_selection.shape[0]
    Z_inference_pos = np.random.standard_normal(n_rows) + snr / root2

    selection_stat = np.dot(X.T, Z_selection.T)[pos]
    return (selection_stat + Z_inference_pos) / root2
def power(mu, ndraw=100000, keep_every=100):
    """
    Estimate the rejection rate of the one-sided test at mean `mu`.

    Samples from an affine constraint with linear part `[[-1, 0]]` and
    offset `[-cutoff]` whose mean is `(mu, mu)`, keeps every
    `keep_every`-th draw, sums the two coordinates and tests each sum
    with `null_dbn.one_sided_test(0, s, alternative='greater')`.

    Parameters
    ----------
    mu : float
        Common mean of both coordinates.
    ndraw : int
        Number of draws requested from the sampler.
    keep_every : int
        Thinning stride applied to the returned draws.

    Returns
    -------
    float
        Mean of the test decisions.
    """
    constraint = affine.constraints(np.array([[-1, 0.]]), np.array([-cutoff]))
    constraint.mean = np.array([mu, mu])

    sample = affine.sample_from_constraints(constraint, np.array([4, 2.]),
                                            ndraw=ndraw)[::keep_every]
    print(sample.mean(0))

    totals = sample.sum(1)
    decisions = [null_dbn.one_sided_test(0, s, alternative='greater')
                 for s in totals]

    rejection_rate = np.mean(decisions)
    print(rejection_rate)
    return rejection_rate
def simulation(n, snr, pos, rho=0.25, ndraw=5000, burnin=1000):
    """
    Simulate the combined statistic: coordinate `pos` of `X.T` applied to
    a constrained sample, plus independent Gaussian noise shifted by
    `snr / sqrt(2)`, all divided by `sqrt(2)`.

    Returns one value per sampled row.
    """
    root2 = np.sqrt(2)

    X, mu, beta = parameters(n, rho, pos)
    con, initial = constraints(X, pos)
    con.mean = snr * mu / root2

    Z_selection = sample_from_constraints(con, initial,
                                          ndraw=ndraw, burnin=burnin)

    # independent "inference" half, one draw per selection sample
    n_rows = Z_selection.shape[0]
    Z_inference_pos = np.random.standard_normal(n_rows) + snr / root2

    selection_stat = np.dot(X.T, Z_selection.T)[pos]
    return (selection_stat + Z_inference_pos) / root2
def power(mu, ndraw=100000, keep_every=100):
    """
    Estimate the rejection rate of the one-sided test at mean `mu`.

    Samples from an affine constraint with linear part `[[-1, 0]]` and
    offset `[-cutoff]` whose mean is `(mu, mu)`, keeps every
    `keep_every`-th draw, sums the two coordinates and tests each sum
    with `null_dbn.one_sided_test(0, s, alternative='greater')`.

    Parameters
    ----------
    mu : float
        Common mean of both coordinates.
    ndraw : int
        Number of draws requested from the sampler.
    keep_every : int
        Thinning stride applied to the returned draws.

    Returns
    -------
    float
        Mean of the test decisions.
    """
    constraint = affine.constraints(np.array([[-1, 0.]]), np.array([-cutoff]))
    constraint.mean = np.array([mu, mu])

    sample = affine.sample_from_constraints(constraint, np.array([4, 2.]),
                                            ndraw=ndraw)[::keep_every]
    print(sample.mean(0))

    totals = sample.sum(1)
    decisions = [null_dbn.one_sided_test(0, s, alternative='greater')
                 for s in totals]

    rejection_rate = np.mean(decisions)
    print(rejection_rate)
    return rejection_rate
def cone_with_slice(angles, ai, hull, which, fill_args=None, ax=None, label=None, suffix='', Y=None):
    """
    Draw a cone via `cone_rays`, mark a point `Y` in it and draw the
    slice of the cone through `Y` along the direction `eta`.

    Two figures are written to the working directory:
    'fig_onesparse1.png' (filled cone) and 'fig_onesparse2.png'
    (cone, point and slice).

    Parameters
    ----------
    angles, ai, hull, which :
        Forwarded unchanged to `cone_rays`; `hull.points` is also used
        to choose `eta`.
    fill_args : dict, optional
        Keyword arguments for the `ax.fill` calls (defaults to no extras).
    ax : matplotlib axes, optional
        Forwarded to `cone_rays`.
    label : str, optional
        Legend label of the scattered point.
    suffix : str
        Accepted for interface compatibility; not used by this function.
    Y : array-like, optional
        Point to display. When omitted it is obtained from
        `sample_from_constraints(representation)`.

    Returns
    -------
    ax, poly, constraint, rays
        Exactly what `cone_rays` returned.
    """
    if fill_args is None:
        fill_args = {}

    ax, poly, constraint, rays = cone_rays(angles, ai, hull, which,
                                           ax=ax, fill_args=fill_args)

    representation = constraints(-constraint.T, np.zeros(2))

    # Resolve the default *before* Y is used to pick eta; previously the
    # None default reached np.dot and failed.
    # NOTE(review): assumes the sampler returns a single 2-vector here — confirm.
    if Y is None:
        Y = sample_from_constraints(representation)

    # eta: the hull point best aligned with Y, scaled by 40
    eta_idx = np.argmax(np.dot(hull.points, Y))
    eta = 40 * hull.points[eta_idx]

    ax.fill(poly[:, 0], poly[:, 1], label=r'$A_{(M,H_0)}$', **fill_args)
    # `symmetric` is a module-level name that is not defined in this function
    if symmetric:
        ax.fill(-poly[:, 0], -poly[:, 1], **fill_args)

    legend_args = {'scatterpoints': 1, 'fontsize': 30, 'loc': 'lower left'}
    ax.legend(**legend_args)
    ax.figure.savefig('fig_onesparse1.png', dpi=300)

    ax.scatter(Y[0], Y[1], c='k', s=150, label=label)

    Vp, _, Vm = representation.bounds(eta, Y)[:3]
    eta_norm_sq = np.linalg.norm(eta)**2

    # component of Y orthogonal to eta
    Yperp = Y - (np.dot(eta, Y) / eta_norm_sq * eta)

    # an unbounded end cannot be drawn; use a large finite stand-in
    if Vm == np.inf:
        Vm = 10000

    width_points = np.array([(Yperp + Vp * eta / eta_norm_sq),
                             (Yperp + Vm * eta / eta_norm_sq)])
    ax.plot(width_points[:, 0], width_points[:, 1], '-', c='k', linewidth=4)

    ax.legend(**legend_args)
    ax.figure.savefig('fig_onesparse2.png', dpi=300)

    return ax, poly, constraint, rays
def test_sampling():
    """
    Check that sample means and covariances are approximately correct
    when every offset is infinite.
    """
    C = AC.constraints(np.identity(3), np.inf * np.ones(3))
    C.mean = np.array([3, 4, 5.2])

    W = np.random.standard_normal((5, 3))
    S = np.dot(W.T, W) / 30.
    C.covariance = S

    V = AC.sample_from_constraints(C, np.zeros(3), ndraw=500000)

    # first moment
    nt.assert_true(np.linalg.norm(V.mean(0) - C.mean) < 0.01)

    # second moment: E[v v^T] - E[v] E[v]^T should match S
    second_moment = np.einsum('ij,ik->ijk', V, V).mean(0)
    sample_cov = second_moment - np.outer(V.mean(0), V.mean(0))
    nt.assert_true(np.linalg.norm(sample_cov - S) < 0.01)
def full_sim(L, b, p):
    """
    Run `quadratic_test` three times on fresh draws simulated from the
    constraint built from `(L, b)`.

    Returns a 3-tuple of test results, one per matrix A1, A2, A3.
    """
    _nrow, q = L.shape

    A1 = np.random.standard_normal((p, q))
    A2 = L[:p]
    # rows are arange(q) raised to the powers 1/2, 1 and 3/2
    A3 = np.array([np.arange(q)**half_power for half_power in (0.5, 1.0, 1.5)])

    con = AC.constraints((L, b), None)

    def sim(A):
        # NOTE(review): `A` is not used below and `C` is not defined in
        # this function — confirm both against the full module.
        y = C.simulate_from_constraints(con)
        return quadratic_test(y, np.identity(con.dim), con)

    return tuple(sim(matrix) for matrix in (A1, A2, A3))
def full_sim(L, b, p):
    """
    Run `quadratic_test` three times on fresh draws simulated from the
    constraint built from `(L, b)`.

    Returns a 3-tuple of test results, one per matrix A1, A2, A3.
    """
    _nrow, q = L.shape

    A1 = np.random.standard_normal((p, q))
    A2 = L[:p]
    # rows are arange(q) raised to the powers 1/2, 1 and 3/2
    A3 = np.array([np.arange(q)**half_power for half_power in (0.5, 1.0, 1.5)])

    con = AC.constraints((L, b), None)

    def sim(A):
        # NOTE(review): `A` is not used below and `C` is not defined in
        # this function — confirm both against the full module.
        y = C.simulate_from_constraints(con)
        return quadratic_test(y, np.identity(con.dim), con)

    return tuple(sim(matrix) for matrix in (A1, A2, A3))
def test_chisq_noncentral():
    """
    Simulate pivots built from a noncentral chi-square CDF and plot
    their empirical CDF against the diagonal.

    The CDF with 3 degrees of freedom and noncentrality
    `||mu[:3]||^2` is evaluated by R's `pchisq` through `ro.r`.

    Raises
    ------
    ValueError
        If a simulated draw yields a negative upper bound `U`.
    """
    mu = np.arange(6)
    ncp = np.linalg.norm(mu[:3])**2

    A, b = np.random.standard_normal((4, 6)), np.zeros(4)
    con = AC.constraints(A, b, mean=mu)

    ro.r('fncp=%f' % ncp)
    ro.r('f = function(x) {pchisq(x,3,ncp=fncp)}')

    def F(x):
        # the CDF at +inf is 1; skip the R call for that case
        if x != np.inf:
            return np.array(ro.r('f(%f)' % x))
        return np.array([1.])

    nsim = 2000
    P = []
    for i in range(nsim):
        Z = AC.simulate_from_constraints(con, mu=mu)
        print(i)

        # unit direction supported on the first three coordinates
        u = 0 * Z
        u[:3] = Z[:3] / np.linalg.norm(Z[:3])

        L, V, U = con.pivots(u, Z)[:3]

        # explicit failure instead of the former undefined-name crash
        if U < 0:
            raise ValueError('negative upper bound U=%r at simulation %d' % (U, i))

        # a non-positive lower bound is clipped to 0 on the squared scale
        Ln = L**2 if L > 0 else 0
        Un = U**2
        Vn = V**2

        P.append(np.array((F(Un) - F(Vn)) / (F(Un) - F(Ln))))

    P = np.array(P).reshape(-1)
    # keep only pivots strictly inside (0, 1)
    P = P[P > 0]
    P = P[P < 1]

    ecdf = sm.distributions.ECDF(P)

    plt.clf()
    x = np.linspace(0, 1, 101)
    plt.plot(x, ecdf(x), c='red')
    plt.plot([0, 1], [0, 1], c='blue', linewidth=2)
def test_simulate_nonwhitened():
    """
    Sample under a non-identity covariance and check that every draw
    satisfies W z < 1 in all three rows.
    """
    n, p = 50, 200

    X = np.random.standard_normal((n, p))
    cov = np.dot(X.T, X)

    W = np.random.standard_normal((3, p))
    con = AC.constraints(W, np.ones(3), covariance=cov)

    # redraw until the starting point satisfies W z <= 1
    z = np.random.standard_normal(p)
    while np.dot(W, z).max() > 1:
        z = np.random.standard_normal(p)

    Z = AC.sample_from_constraints(con, z)
    worst_slack = (np.dot(Z, W.T) - 1).max()
    nt.assert_true(worst_slack < 0)
def test_chisq_noncentral():
    """
    Simulate pivots built from a noncentral chi-square CDF and plot
    their empirical CDF against the diagonal.

    The CDF with 3 degrees of freedom and noncentrality
    `||mu[:3]||^2` is evaluated by R's `pchisq` through `ro.r`.

    Raises
    ------
    ValueError
        If a simulated draw yields a negative upper bound `U`.
    """
    mu = np.arange(6)
    ncp = np.linalg.norm(mu[:3])**2

    A, b = np.random.standard_normal((4, 6)), np.zeros(4)
    con = AC.constraints(A, b, mean=mu)

    ro.r('fncp=%f' % ncp)
    ro.r('f = function(x) {pchisq(x,3,ncp=fncp)}')

    def F(x):
        # the CDF at +inf is 1; skip the R call for that case
        if x != np.inf:
            return np.array(ro.r('f(%f)' % x))
        return np.array([1.])

    nsim = 2000
    P = []
    for i in range(nsim):
        Z = AC.simulate_from_constraints(con, mu=mu)
        print(i)

        # unit direction supported on the first three coordinates
        u = 0 * Z
        u[:3] = Z[:3] / np.linalg.norm(Z[:3])

        L, V, U = con.pivots(u, Z)[:3]

        # explicit failure instead of the former undefined-name crash
        if U < 0:
            raise ValueError('negative upper bound U=%r at simulation %d' % (U, i))

        # a non-positive lower bound is clipped to 0 on the squared scale
        Ln = L**2 if L > 0 else 0
        Un = U**2
        Vn = V**2

        P.append(np.array((F(Un) - F(Vn)) / (F(Un) - F(Ln))))

    P = np.array(P).reshape(-1)
    # keep only pivots strictly inside (0, 1)
    P = P[P > 0]
    P = P[P < 1]

    ecdf = sm.distributions.ECDF(P)

    plt.clf()
    x = np.linspace(0, 1, 101)
    plt.plot(x, ecdf(x), c='red')
    plt.plot([0, 1], [0, 1], c='blue', linewidth=2)
def draw_sample(mu, cutoff, nsample=10000):
    """
    Draw values of X1 + X2 where X1, X2 have mean `mu` and X1 is
    restricted to exceed `cutoff`.

    When `mu >= cutoff - 4`, X1 is obtained by plain rejection sampling
    in batches of 1,000,000 and an independent N(mu, 1) draw is added.
    Otherwise the project sampler `affine.sample_from_constraints` is
    run for 2,000,000 draws and thinned.

    Parameters
    ----------
    mu : float
        Common mean of the two coordinates.
    cutoff : float
        Lower truncation point for the first coordinate.
    nsample : int
        Target sample size. The rejection branch returns strictly more
        than `nsample` values (whole batches are kept); the sampler
        branch returns roughly `nsample` values after thinning.

    Returns
    -------
    np.ndarray
        One-dimensional array of sampled sums.
    """
    if mu >= cutoff - 4:
        sample = []
        while True:
            candidate = np.random.standard_normal(1000000) + mu
            candidate = candidate[candidate > cutoff]
            sample.extend(candidate)
            if len(sample) > nsample:
                break
        sample = np.array(sample)
        # add the unconstrained second coordinate
        sample += np.random.standard_normal(sample.shape) + mu
    else:
        ndraw = 2000000
        constraint = affine.constraints(np.array([[-1, 0.]]), np.array([-cutoff]))
        constraint.mean = np.array([mu, mu])
        sample = affine.sample_from_constraints(constraint,
                                                np.array([cutoff + 0.1, 0]),
                                                ndraw=ndraw,
                                                direction_of_interest=np.array([1, 1.]))
        # Floor division keeps the slice step an integer under true
        # division; clamp to 1 so nsample > ndraw does not give a zero step.
        stride = max(1, ndraw // nsample)
        sample = sample.sum(1)[::stride]
    return sample
def cone_with_slice(angles, ai, hull, which, fill_args=None, ax=None, label=None, suffix='', Y=None):
    """
    Draw a cone via `cone_rays`, mark a point `Y` in it and draw the
    slice of the cone through `Y` along the direction `eta`.

    Two figures are written to the working directory:
    'fig_onesparse1.png' (filled cone) and 'fig_onesparse2.png'
    (cone, point and slice).

    Parameters
    ----------
    angles, ai, hull, which :
        Forwarded unchanged to `cone_rays`; `hull.points` is also used
        to choose `eta`.
    fill_args : dict, optional
        Keyword arguments for the `ax.fill` calls (defaults to no extras).
    ax : matplotlib axes, optional
        Forwarded to `cone_rays`.
    label : str, optional
        Legend label of the scattered point.
    suffix : str
        Accepted for interface compatibility; not used by this function.
    Y : array-like, optional
        Point to display. When omitted it is obtained from
        `sample_from_constraints(representation)`.

    Returns
    -------
    ax, poly, constraint, rays
        Exactly what `cone_rays` returned.
    """
    if fill_args is None:
        fill_args = {}

    ax, poly, constraint, rays = cone_rays(angles, ai, hull, which,
                                           ax=ax, fill_args=fill_args)

    representation = constraints(-constraint.T, np.zeros(2))

    # Resolve the default *before* Y is used to pick eta; previously the
    # None default reached np.dot and failed.
    # NOTE(review): assumes the sampler returns a single 2-vector here — confirm.
    if Y is None:
        Y = sample_from_constraints(representation)

    # eta: the hull point best aligned with Y, scaled by 40
    eta_idx = np.argmax(np.dot(hull.points, Y))
    eta = 40 * hull.points[eta_idx]

    ax.fill(poly[:, 0], poly[:, 1], label=r'$A_{(M,H_0)}$', **fill_args)
    # `symmetric` is a module-level name that is not defined in this function
    if symmetric:
        ax.fill(-poly[:, 0], -poly[:, 1], **fill_args)

    legend_args = {'scatterpoints': 1, 'fontsize': 30, 'loc': 'lower left'}
    ax.legend(**legend_args)
    ax.figure.savefig('fig_onesparse1.png', dpi=300)

    ax.scatter(Y[0], Y[1], c='k', s=150, label=label)

    Vp, _, Vm = representation.bounds(eta, Y)[:3]
    eta_norm_sq = np.linalg.norm(eta)**2

    # component of Y orthogonal to eta
    Yperp = Y - (np.dot(eta, Y) / eta_norm_sq * eta)

    # an unbounded end cannot be drawn; use a large finite stand-in
    if Vm == np.inf:
        Vm = 10000

    width_points = np.array([(Yperp + Vp * eta / eta_norm_sq),
                             (Yperp + Vm * eta / eta_norm_sq)])
    ax.plot(width_points[:, 0], width_points[:, 1], '-', c='k', linewidth=4)

    ax.legend(**legend_args)
    ax.figure.savefig('fig_onesparse2.png', dpi=300)

    return ax, poly, constraint, rays
def test_pivots_intervals():
    """
    Smoke test: `pivot` and `interval` run for each supported option on
    a random constraint and a sampled point.
    """
    A, b = np.random.standard_normal((4, 30)), np.random.standard_normal(4)
    con = AC.constraints(A, b)

    # rejection-sample a feasible starting point
    feasible = False
    while not feasible:
        w = np.random.standard_normal(30)
        feasible = con(w)

    # use the last draw of the chain as the observation
    Z = AC.sample_from_constraints(con, w)[-1]

    # direction of interest: coordinate 4
    u = np.zeros(con.dim)
    u[4] = 1

    # call pivot
    con.pivot(u, Z)
    for alt in ('less', 'greater'):
        con.pivot(u, Z, alternative=alt)

    for umau in (True, False):
        con.interval(u, Z, UMAU=umau)
def power(n, snr, pos, rho=0.25, muval=np.linspace(0, 5, 51)):
    """
    Plot a one-sided power curve computed from a large selective sample.

    A sample is drawn from the constraints returned by `constraints(X, pos)`
    (4,000,000 draws after 100,000 burn-in), mapped through `X.T`, and
    column `pos` is used with unit weights to build a `discrete_family`.
    Power at an effect size is one minus the family's probability of its
    one-sided acceptance region.

    Parameters
    ----------
    n, rho, pos :
        Forwarded to `parameters`; `pos` also selects the column of
        the mapped sample and is forwarded to `constraints`.
    snr : float
        Effect size marked with a dashed vertical line; its power is printed.
    rho : float
    muval : sequence of float
        Effect sizes at which the power curve is evaluated.

    Returns
    -------
    power_fig : matplotlib figure
    powers : dict
        `{'full': full_power}` with the curve evaluated on `muval`.
    """
    X, _mu, _beta = parameters(n, rho, pos)

    # form the correct constraints
    con, initial = constraints(X, pos)
    Z_selection = sample_from_constraints(con, initial,
                                          ndraw=4000000, burnin=100000)
    S0 = np.dot(X.T, Z_selection.T).T
    W0 = np.ones(S0.shape[0])
    dfam0 = discrete_family(S0[:, pos], W0)

    one_sided_acceptance_region = dfam0.one_sided_acceptance(0)

    def one_sided_power(effect):
        # rejection probability = 1 - P(L < statistic <= U) under `effect`
        L, U = one_sided_acceptance_region
        return 1 - (dfam0.cdf(effect, U) - dfam0.cdf(effect, L))

    power_fig = plt.figure(figsize=(8, 8))
    power_ax = power_fig.gca()
    power_ax.set_ylabel('Power', fontsize=20)
    # raw string: '\m' is not a valid escape sequence
    power_ax.set_xlabel(r'Effect size $\mu$', fontsize=20)

    full_power = np.array([one_sided_power(m) for m in muval])
    print(full_power)

    power_ax.plot(muval, full_power, label='Reduced model UMPU',
                  linewidth=7, alpha=0.5)
    # the legend is created only once, after the labelled curve exists
    power_ax.legend(loc='lower right')
    power_ax.set_xlim([0, 5])
    power_ax.plot([snr, snr], [0, 1], 'k--')
    print(one_sided_power(snr))

    return power_fig, {'full': full_power}
def test_chisq_central():
    """
    Check that `chisq.quadratic_test` p-values computed on thinned
    constrained samples have the mean (1/2) and standard deviation
    (1/sqrt(12)) of a uniform distribution, within 0.03, and plot
    their empirical CDF.
    """
    n, p = 4, 10
    A, b = np.random.standard_normal((n, p)), np.zeros(n)
    con = AC.constraints(A, b)

    # rejection-sample a feasible starting point
    while True:
        z = np.random.standard_normal(p)
        if con(z):
            break

    # selector for the first three coordinates
    S = np.identity(p)[:3]
    Z = AC.sample_from_constraints(con, z, ndraw=10000)

    # Test every 10th draw. Floor division keeps the loop bound an
    # integer under true division.
    n_tests = Z.shape[0] // 10
    P = [chisq.quadratic_test(Z[10 * i], S, con) for i in range(n_tests)]

    ecdf = sm.distributions.ECDF(P)

    plt.clf()
    x = np.linspace(0, 1, 101)
    plt.plot(x, ecdf(x), c='red')
    plt.plot([0, 1], [0, 1], c='blue', linewidth=2)

    nt.assert_true(np.fabs(np.mean(P) - 0.5) < 0.03)
    nt.assert_true(np.fabs(np.std(P) - 1 / np.sqrt(12)) < 0.03)
def test_chisq_central():
    """
    Check that `chisq.quadratic_test` p-values computed on thinned
    constrained samples have the mean (1/2) and standard deviation
    (1/sqrt(12)) of a uniform distribution, within 0.03, and plot
    their empirical CDF.
    """
    n, p = 4, 10
    A, b = np.random.standard_normal((n, p)), np.zeros(n)
    con = AC.constraints(A, b)

    # rejection-sample a feasible starting point
    while True:
        z = np.random.standard_normal(p)
        if con(z):
            break

    # selector for the first three coordinates
    S = np.identity(p)[:3]
    Z = AC.sample_from_constraints(con, z, ndraw=10000)

    # Test every 10th draw. Floor division keeps the loop bound an
    # integer under true division.
    n_tests = Z.shape[0] // 10
    P = [chisq.quadratic_test(Z[10 * i], S, con) for i in range(n_tests)]

    ecdf = sm.distributions.ECDF(P)

    plt.clf()
    x = np.linspace(0, 1, 101)
    plt.plot(x, ecdf(x), c='red')
    plt.plot([0, 1], [0, 1], c='blue', linewidth=2)

    nt.assert_true(np.fabs(np.mean(P) - 0.5) < 0.03)
    nt.assert_true(np.fabs(np.std(P) - 1 / np.sqrt(12)) < 0.03)
def forward_step(X, Y, sigma=None, nstep=5, exact=False, burnin=1000, ndraw=5000):
    """
    A simple implementation of forward stepwise that uses the
    `reduced_covtest` iteratively after adjusting fully for the
    selected variable.

    This implementation is not efficient, in that it computes more
    SVDs than it really has to.

    Parameters
    ----------

    X : np.float((n,p))

    Y : np.float(n)

    sigma : float (optional)
        Noise level (not needed for reduced).
        NOTE(review): the body evaluates `sigma**2` unconditionally,
        which fails for the default `None` -- confirm callers always
        pass a number.

    nstep : int
        How many steps of forward stepwise?

    exact : bool
        Which version of covtest should we use?

    burnin : int
        How many iterations until we start recording samples?

    ndraw : int
        How many samples should we return?

    Returns
    -------

    covtest_P : list
        The `pval` returned by `covtest` at each step.

    reduced_Pknown : list
        First value returned by `gibbs_test` at each step, called with
        `sigma_known=sigma is not None`.

    reduced_Punknown : list
        First value returned by `gibbs_test` at each step, called with
        `sigma_known=False`.

    spacings_P : list
        `Acon.pivot(eta, Y)` at each step.

    variables :
        `FS.variables` after the final step.
    """
    n, p = X.shape
    FS = forward_stepwise(X, Y)

    # one entry is appended to each list per step
    spacings_P = []
    covtest_P = []
    reduced_Pknown = []
    reduced_Punknown = []

    for i in range(nstep):
        FS.next()

        # covtest
        if FS.P[i] is not None:
            # residualize X and Y with the i-th projection
            RX = X - FS.P[i](X)
            RY = Y - FS.P[i](Y)
            covariance = np.identity(n) - np.dot(FS.P[i].U, FS.P[i].U.T)
        else:
            # NOTE(review): RX is X itself here, so the in-place
            # centering/scaling below writes into the caller's X when X
            # is an ndarray -- confirm this is intended.
            RX = X
            RY = Y
            covariance = None
        # center and scale the columns in place
        RX -= RX.mean(0)[None, :]
        RX /= RX.std(0)[None, :]

        con, pval, idx, sign = covtest(RX, RY, sigma=sigma, covariance=covariance, exact=exact)
        covtest_P.append(pval)

        # reduced

        # signed column chosen by covtest
        eta = RX[:, idx] * sign
        Acon = constraints(FS.A, np.zeros(FS.A.shape[0]))
        Acon.covariance *= sigma**2
        if i > 0:
            # condition on U Y, with U taken from the second-to-last
            # entry of FS.P
            U = FS.P[-2].U.T
            Uy = np.dot(U, Y)
            Bcon = Acon.conditional(U, Uy)
        else:
            Bcon = Acon

        # the spacings pivot uses the unconditioned constraint
        spacings_P.append(Acon.pivot(eta, Y))

        reduced_pval, _, _ = gibbs_test(Bcon, Y, eta, ndraw=ndraw, burnin=burnin, sigma_known=sigma is not None, alternative='greater')
        reduced_Pknown.append(reduced_pval)

        reduced_pval, _, _ = gibbs_test(Bcon, Y, eta, ndraw=ndraw, burnin=burnin, sigma_known=False, alternative='greater')
        reduced_Punknown.append(reduced_pval)

    return covtest_P, reduced_Pknown, reduced_Punknown, spacings_P, FS.variables
import os
import numpy as np
import matplotlib.pyplot as plt
from selection import affine
from selection.discrete_family import discrete_family
from scipy.stats import norm as ndist

# One-sided 5% normal cutoff used for the constraint offset.
cutoff = ndist.ppf(0.95)

# Reference ("null") distribution: sum of the two coordinates sampled
# under the constraint with the sampler's default mean, stored as an
# equally weighted discrete family.
null_constraint = affine.constraints(np.array([[-1, 0.]]), np.array([-cutoff]))
null_sample = affine.sample_from_constraints(null_constraint, np.array([4, 2.]),
                                             ndraw=100000).sum(1)
null_dbn = discrete_family(null_sample, np.ones_like(null_sample))


def power(mu, ndraw=100000, keep_every=100):
    """
    Estimate the rejection rate of the one-sided test at mean `mu`.

    Samples from an affine constraint with linear part `[[-1, 0]]` and
    offset `[-cutoff]` whose mean is `(mu, mu)`, keeps every
    `keep_every`-th draw, sums the two coordinates and tests each sum
    with `null_dbn.one_sided_test(0, s, alternative='greater')`.

    Parameters
    ----------
    mu : float
        Common mean of both coordinates.
    ndraw : int
        Number of draws requested from the sampler.
    keep_every : int
        Thinning stride applied to the returned draws.

    Returns
    -------
    float
        Mean of the test decisions.
    """
    constraint = affine.constraints(np.array([[-1, 0.]]), np.array([-cutoff]))
    constraint.mean = np.array([mu, mu])

    sample = affine.sample_from_constraints(constraint, np.array([4, 2.]),
                                            ndraw=ndraw)[::keep_every]
    print(sample.mean(0))

    totals = sample.sum(1)
    decisions = [null_dbn.one_sided_test(0, s, alternative='greater')
                 for s in totals]

    rejection_rate = np.mean(decisions)
    print(rejection_rate)
    return rejection_rate
def forward_step(X, Y, sigma=None, nstep=5, exact=False, burnin=1000, ndraw=5000):
    """
    A simple implementation of forward stepwise that uses the
    `reduced_covtest` iteratively after adjusting fully for the
    selected variable.

    This implementation is not efficient, in that it computes more
    SVDs than it really has to.

    Parameters
    ----------

    X : np.float((n,p))

    Y : np.float(n)

    sigma : float (optional)
        Noise level (not needed for reduced).
        NOTE(review): the body evaluates `sigma**2` unconditionally,
        which fails for the default `None` -- confirm callers always
        pass a number.

    nstep : int
        How many steps of forward stepwise?

    exact : bool
        Which version of covtest should we use?

    burnin : int
        How many iterations until we start recording samples?

    ndraw : int
        How many samples should we return?

    Returns
    -------

    covtest_P : list
        The `pval` returned by `covtest` at each step.

    reduced_Pknown : list
        First value returned by `gibbs_test` at each step, called with
        `sigma_known=sigma is not None`.

    reduced_Punknown : list
        First value returned by `gibbs_test` at each step, called with
        `sigma_known=False`.

    spacings_P : list
        `Acon.pivot(eta, Y)` at each step.

    variables :
        `FS.variables` after the final step.
    """
    n, p = X.shape
    FS = forward_stepwise(X, Y)

    # one entry is appended to each list per step
    spacings_P = []
    covtest_P = []
    reduced_Pknown = []
    reduced_Punknown = []

    for i in range(nstep):
        FS.next()

        # covtest
        if FS.P[i] is not None:
            # residualize X and Y with the i-th projection
            RX = X - FS.P[i](X)
            RY = Y - FS.P[i](Y)
            covariance = np.identity(n) - np.dot(FS.P[i].U, FS.P[i].U.T)
        else:
            # NOTE(review): RX is X itself here, so the in-place
            # centering/scaling below writes into the caller's X when X
            # is an ndarray -- confirm this is intended.
            RX = X
            RY = Y
            covariance = None
        # center and scale the columns in place
        RX -= RX.mean(0)[None,:]
        RX /= RX.std(0)[None,:]

        con, pval, idx, sign = covtest(RX, RY, sigma=sigma, covariance=covariance, exact=exact)
        covtest_P.append(pval)

        # reduced

        # signed column chosen by covtest
        eta = RX[:,idx] * sign
        Acon = constraints(FS.A, np.zeros(FS.A.shape[0]))
        Acon.covariance *= sigma**2
        if i > 0:
            # condition on U Y, with U taken from the second-to-last
            # entry of FS.P
            U = FS.P[-2].U.T
            Uy = np.dot(U, Y)
            Bcon = Acon.conditional(U, Uy)
        else:
            Bcon = Acon

        # the spacings pivot uses the unconditioned constraint
        spacings_P.append(Acon.pivot(eta, Y))

        reduced_pval, _, _ = gibbs_test(Bcon, Y, eta, ndraw=ndraw, burnin=burnin, sigma_known=sigma is not None, alternative='greater')
        reduced_Pknown.append(reduced_pval)

        reduced_pval, _, _ = gibbs_test(Bcon, Y, eta, ndraw=ndraw, burnin=burnin, sigma_known=False, alternative='greater')
        reduced_Punknown.append(reduced_pval)

    return covtest_P, reduced_Pknown, reduced_Punknown, spacings_P, FS.variables
def sample_split(X, Y, sigma=None, nstep=10, burnin=1000, ndraw=5000, reduced=True):
    """
    Compare sample-splitting inference with selective inference along a
    forward stepwise path.

    Forward stepwise is run on the first half of the rows (the "split"
    model) and on all rows (the "full" model). At each step p-values
    are collected from: a z-test on the held-out half, `gibbs_test` on
    the full data under the half-data selection constraints (only when
    `reduced` is true), and `covtest` and the spacings pivot on the
    full data.

    Parameters
    ----------

    X : np.float((n,p))

    Y : np.float(n)

    sigma : float
        Noise level.
        NOTE(review): `sigma**2` and `* sigma` are evaluated
        unconditionally, which fails for the default `None` -- confirm
        callers always pass a number.

    nstep : int
        How many steps of forward stepwise?

    burnin : int
        Forwarded to `gibbs_test`.

    ndraw : int
        Forwarded to `gibbs_test`.

    reduced : bool
        Whether to run the two `gibbs_test` calls at each step.

    Returns
    -------

    split_P, reduced_Pknown, reduced_Punknown, spacings_P, covtest_P : list
        Per-step p-values; the two `reduced_*` lists stay empty when
        `reduced` is false.

    variables :
        `FS_half.variables` after the final step.
    """
    n, p = X.shape
    half_n = int(n/2)

    # first half: `* 1.` yields new arrays for ndarray inputs, so the
    # in-place centering does not touch X and Y
    X1, Y1 = X[:half_n,:]*1., Y[:half_n]*1.
    X1 -= X1.mean(0)[None,:]
    Y1 -= Y1.mean()

    # NOTE(review): for ndarray inputs these slices are views, so the
    # in-place centering below also modifies the caller's X and Y (and
    # therefore the copies handed to FS_full) -- confirm intended.
    X2, Y2 = X[half_n:], Y[half_n:]
    X2 -= X2.mean(0)[None,:]
    Y2 -= Y2.mean()

    FS_half = forward_stepwise(X1, Y1) # sample splitting model
    FS_full = forward_stepwise(X.copy(), Y.copy()) # full data model

    spacings_P = []
    split_P = []
    reduced_Pknown = []
    reduced_Punknown = []
    covtest_P = []

    for i in range(nstep):

        FS_half.next()

        if FS_half.P[i] is not None:
            # residualize the half-sample data with the i-th projection
            RX = FS_half.X - FS_half.P[i](FS_half.X)
            RY = FS_half.Y - FS_half.P[i](FS_half.Y)
            covariance = centering(FS_half.Y.shape[0]) - np.dot(FS_half.P[i].U, FS_half.P[i].U.T)
        else:
            # NOTE(review): RX aliases FS_half.X here, so the in-place
            # scaling below modifies it -- confirm intended.
            RX = FS_half.X
            RY = FS_half.Y
            covariance = centering(FS_half.Y.shape[0])
        # center the columns, then divide by std * sqrt(number of rows)
        RX -= RX.mean(0)[None,:]
        RX /= (RX.std(0)[None,:] * np.sqrt(RX.shape[0]))

        # covtest on half -- not saved

        con, pval, idx, sign = covtest(RX, RY, sigma=sigma, covariance=covariance, exact=True)

        # spacings on half -- not saved

        eta1 = RX[:,idx] * sign
        Acon = constraints(FS_half.A, np.zeros(FS_half.A.shape[0]), covariance=centering(FS_half.Y.shape[0]))
        Acon.covariance *= sigma**2
        Acon.pivot(eta1, FS_half.Y)

        # sample split

        # last row of the pseudo-inverse of the selected held-out columns
        eta2 = np.linalg.pinv(X2[:,FS_half.variables])[-1]
        eta_sigma = np.linalg.norm(eta2) * sigma
        # two-sided normal p-value for eta2'Y2
        split_P.append(2*ndist.sf(np.fabs((eta2*Y2).sum() / eta_sigma)))

        # inference on full mu using split model, this \beta^+_s.

        # pad the half-sample constraint with zeros for the held-out
        # rows so that it applies to a vector of length n
        zero_block = np.zeros((Acon.linear_part.shape[0], (n-half_n)))
        linear_part = np.hstack([Acon.linear_part, zero_block])
        Fcon = constraints(linear_part, Acon.offset, covariance=centering(n))
        Fcon.covariance *= sigma**2

        if i > 0:
            # condition on the pseudo-inverse of the previously selected
            # columns applied to Y
            U = np.linalg.pinv(X[:,FS_half.variables[:-1]])
            Uy = np.dot(U, Y)
            Fcon = Fcon.conditional(U, Uy)
        else:
            Fcon = Fcon

        eta_full = np.linalg.pinv(X[:,FS_half.variables])[-1]

        if reduced:
            reduced_pval = gibbs_test(Fcon, Y, eta_full, ndraw=ndraw, burnin=burnin, sigma_known=sigma is not None, alternative='twosided')[0]
            reduced_Pknown.append(reduced_pval)

            reduced_pval = gibbs_test(Fcon, Y, eta_full, ndraw=ndraw, burnin=burnin, sigma_known=False, alternative='twosided')[0]
            reduced_Punknown.append(reduced_pval)

        # now use all the data

        FS_full.next()
        if FS_full.P[i] is not None:
            RX = X - FS_full.P[i](X)
            RY = Y - FS_full.P[i](Y)
            covariance = centering(RY.shape[0]) - np.dot(FS_full.P[i].U, FS_full.P[i].U.T)
        else:
            # NOTE(review): RX is X itself here, so the in-place
            # operations below modify the caller's X -- confirm intended.
            RX = X
            RY = Y.copy()
            covariance = centering(RY.shape[0])
        RX -= RX.mean(0)[None,:]
        RX /= RX.std(0)[None,:]

        con, pval, idx, sign = covtest(RX, RY, sigma=sigma, covariance=covariance, exact=False)
        covtest_P.append(pval)

        # spacings on full data

        eta1 = RX[:,idx] * sign
        # NOTE(review): the centering matrix is passed positionally
        # here, while the half-sample call above passes it as
        # `covariance=` -- verify the third positional parameter of
        # `constraints` is the covariance.
        Acon = constraints(FS_full.A, np.zeros(FS_full.A.shape[0]), centering(RY.shape[0]))
        Acon.covariance *= sigma**2
        spacings_P.append(Acon.pivot(eta1, Y))

    return split_P, reduced_Pknown, reduced_Punknown, spacings_P, covtest_P, FS_half.variables
def sample_split(X, Y, sigma=None, nstep=10, burnin=1000, ndraw=5000, reduced=True):
    """
    Compare sample-splitting inference with selective inference along a
    forward stepwise path.

    Forward stepwise is run on the first half of the rows (the "split"
    model) and on all rows (the "full" model). At each step p-values
    are collected from: a z-test on the held-out half, `gibbs_test` on
    the full data under the half-data selection constraints (only when
    `reduced` is true), and `covtest` and the spacings pivot on the
    full data.

    Parameters
    ----------

    X : np.float((n,p))

    Y : np.float(n)

    sigma : float
        Noise level.
        NOTE(review): `sigma**2` and `* sigma` are evaluated
        unconditionally, which fails for the default `None` -- confirm
        callers always pass a number.

    nstep : int
        How many steps of forward stepwise?

    burnin : int
        Forwarded to `gibbs_test`.

    ndraw : int
        Forwarded to `gibbs_test`.

    reduced : bool
        Whether to run the two `gibbs_test` calls at each step.

    Returns
    -------

    split_P, reduced_Pknown, reduced_Punknown, spacings_P, covtest_P : list
        Per-step p-values; the two `reduced_*` lists stay empty when
        `reduced` is false.

    variables :
        `FS_half.variables` after the final step.
    """
    n, p = X.shape
    half_n = int(n / 2)

    # first half: `* 1.` yields new arrays for ndarray inputs, so the
    # in-place centering does not touch X and Y
    X1, Y1 = X[:half_n, :] * 1., Y[:half_n] * 1.
    X1 -= X1.mean(0)[None, :]
    Y1 -= Y1.mean()

    # NOTE(review): for ndarray inputs these slices are views, so the
    # in-place centering below also modifies the caller's X and Y (and
    # therefore the copies handed to FS_full) -- confirm intended.
    X2, Y2 = X[half_n:], Y[half_n:]
    X2 -= X2.mean(0)[None, :]
    Y2 -= Y2.mean()

    FS_half = forward_stepwise(X1, Y1) # sample splitting model
    FS_full = forward_stepwise(X.copy(), Y.copy()) # full data model

    spacings_P = []
    split_P = []
    reduced_Pknown = []
    reduced_Punknown = []
    covtest_P = []

    for i in range(nstep):

        FS_half.next()

        if FS_half.P[i] is not None:
            # residualize the half-sample data with the i-th projection
            RX = FS_half.X - FS_half.P[i](FS_half.X)
            RY = FS_half.Y - FS_half.P[i](FS_half.Y)
            covariance = centering(FS_half.Y.shape[0]) - np.dot( FS_half.P[i].U, FS_half.P[i].U.T)
        else:
            # NOTE(review): RX aliases FS_half.X here, so the in-place
            # scaling below modifies it -- confirm intended.
            RX = FS_half.X
            RY = FS_half.Y
            covariance = centering(FS_half.Y.shape[0])
        # center the columns, then divide by std * sqrt(number of rows)
        RX -= RX.mean(0)[None, :]
        RX /= (RX.std(0)[None, :] * np.sqrt(RX.shape[0]))

        # covtest on half -- not saved

        con, pval, idx, sign = covtest(RX, RY, sigma=sigma, covariance=covariance, exact=True)

        # spacings on half -- not saved

        eta1 = RX[:, idx] * sign
        Acon = constraints(FS_half.A, np.zeros(FS_half.A.shape[0]), covariance=centering(FS_half.Y.shape[0]))
        Acon.covariance *= sigma**2
        Acon.pivot(eta1, FS_half.Y)

        # sample split

        # last row of the pseudo-inverse of the selected held-out columns
        eta2 = np.linalg.pinv(X2[:, FS_half.variables])[-1]
        eta_sigma = np.linalg.norm(eta2) * sigma
        # two-sided normal p-value for eta2'Y2
        split_P.append(2 * ndist.sf(np.fabs((eta2 * Y2).sum() / eta_sigma)))

        # inference on full mu using split model, this \beta^+_s.

        # pad the half-sample constraint with zeros for the held-out
        # rows so that it applies to a vector of length n
        zero_block = np.zeros((Acon.linear_part.shape[0], (n - half_n)))
        linear_part = np.hstack([Acon.linear_part, zero_block])
        Fcon = constraints(linear_part, Acon.offset, covariance=centering(n))
        Fcon.covariance *= sigma**2

        if i > 0:
            # condition on the pseudo-inverse of the previously selected
            # columns applied to Y
            U = np.linalg.pinv(X[:, FS_half.variables[:-1]])
            Uy = np.dot(U, Y)
            Fcon = Fcon.conditional(U, Uy)
        else:
            Fcon = Fcon

        eta_full = np.linalg.pinv(X[:, FS_half.variables])[-1]

        if reduced:
            reduced_pval = gibbs_test(Fcon, Y, eta_full, ndraw=ndraw, burnin=burnin, sigma_known=sigma is not None, alternative='twosided')[0]
            reduced_Pknown.append(reduced_pval)

            reduced_pval = gibbs_test(Fcon, Y, eta_full, ndraw=ndraw, burnin=burnin, sigma_known=False, alternative='twosided')[0]
            reduced_Punknown.append(reduced_pval)

        # now use all the data

        FS_full.next()
        if FS_full.P[i] is not None:
            RX = X - FS_full.P[i](X)
            RY = Y - FS_full.P[i](Y)
            covariance = centering(RY.shape[0]) - np.dot( FS_full.P[i].U, FS_full.P[i].U.T)
        else:
            # NOTE(review): RX is X itself here, so the in-place
            # operations below modify the caller's X -- confirm intended.
            RX = X
            RY = Y.copy()
            covariance = centering(RY.shape[0])
        RX -= RX.mean(0)[None, :]
        RX /= RX.std(0)[None, :]

        con, pval, idx, sign = covtest(RX, RY, sigma=sigma, covariance=covariance, exact=False)
        covtest_P.append(pval)

        # spacings on full data

        eta1 = RX[:, idx] * sign
        # NOTE(review): the centering matrix is passed positionally
        # here, while the half-sample call above passes it as
        # `covariance=` -- verify the third positional parameter of
        # `constraints` is the covariance.
        Acon = constraints(FS_full.A, np.zeros(FS_full.A.shape[0]), centering(RY.shape[0]))
        Acon.covariance *= sigma**2
        spacings_P.append(Acon.pivot(eta1, Y))

    return split_P, reduced_Pknown, reduced_Punknown, spacings_P, covtest_P, FS_half.variables
import os from glob import glob import numpy as np import matplotlib.pyplot as plt from selection import affine from selection.discrete_family import discrete_family from scipy.stats import norm as ndist from sklearn.isotonic import IsotonicRegression cutoff = 3. null_constraint = affine.constraints(np.array([[-1,0.]]), np.array([-cutoff])) null_sample = affine.sample_from_constraints(null_constraint, np.array([4,2.]), ndraw=100000).sum(1) null_dbn = discrete_family(null_sample, np.ones_like(null_sample)) def draw_sample(mu, cutoff, nsample=10000): if mu >= cutoff - 4: sample = [] while True: candidate = np.random.standard_normal(1000000) + mu candidate = candidate[candidate > cutoff] sample.extend(candidate) if len(sample) > nsample: break sample = np.array(sample) sample += np.random.standard_normal(sample.shape) + mu else: constraint = affine.constraints(np.array([[-1,0.]]), np.array([-cutoff])) constraint.mean = np.array([mu,mu]) sample = affine.sample_from_constraints(constraint, np.array([cutoff + 0.1,0]), ndraw=2000000,