def test_giga_input_validation():
    """GIGA must reject non-numeric input by raising ValueError.

    Attempts to construct ``bc.GIGA`` from a plain string and from an
    array of strings.  Each attempt must raise ``ValueError``: any other
    exception type is an unrecognized error, and raising nothing at all
    means invalid input was silently accepted.
    """
    for bad_input in ('fdas', np.array(['fdsa', 'asdf'])):
        try:
            bc.GIGA(bad_input)
        except ValueError:
            # expected: invalid input rejected with the right exception type
            continue
        except Exception:
            # original used a bare `except:`, which would also swallow
            # SystemExit/KeyboardInterrupt; narrowed to Exception
            assert False, "Unrecognized error type"
        # no exception raised at all
        assert False, "GIGA failed: did not catch invalid input"
# NOTE(review): this chunk was collapsed onto one physical line, and the
# enclosing loop headers (the per-trial loop that binds `tr` and the MCMC
# retry loop that binds `mcmc_attempt`/`accept_rate`) lie outside this view,
# so the original indentation depth cannot be reconstructed safely; the code
# is left byte-identical.  What it does, reading the statements in order:
#   - records full-MCMC wall-clock time for trial `tr` in `cputs_full` and
#     reports the acceptance rate for this attempt (target window (.15, .7));
#   - freezes `th_samples` into an array; `Fs_full[tr]` is explicitly 0
#     (comment in code: only to simplify later code);
#   - then, for each algorithm name in `anms`, builds the matching coreset
#     object over `vecs` (GIGA / FrankWolfe / RandomSubsampling — presumably
#     from the `bc` coresets package; confirm against the file's imports),
#     timing setup (`t_setup`) separately from construction (`t_alg`);
#   - the inner loop calls `alg.run(Ms[m])` incrementally: each call continues
#     from the previous coreset size rather than restarting, then reads the
#     weights and the indices of the selected datapoints (`wts > 0`).
cputs_full[tr] = time.time() - t0 print('attempt ' + str(mcmc_attempt) + ': accept rate = ' + str(accept_rate) + ', passes if in (.15, .7) ') mcmc_attempt += 1 th_samples = np.array(th_samples) Fs_full[tr] = 0. #always 0, just doing this to make later code simpler full_samples = np.array(th_samples) print('Running coreset construction / MCMC') for aidx, anm in enumerate(anms): print(anm + ':') t0 = time.time() alg = None if 'GIGA' in anm: alg = bc.GIGA(vecs) elif anm == 'FW': alg = bc.FrankWolfe(vecs) else: alg = bc.RandomSubsampling(vecs) t_setup = time.time() - t0 t_alg = 0. for m in range(Ms.shape[0]): print('M = ' + str(Ms[m]) + ': coreset construction') #this runs alg up to a level of M; on the next iteration, it will continue from where it left off t0 = time.time() alg.run(Ms[m]) t_alg += time.time() - t0 wts = alg.weights() idcs = wts > 0
projection_dim = 500 #random projection dimension, K #build the discretization of all the log-likelihoods based on random projection proj = bc.ProjectionF(Z, grad_log_likelihood, projection_dim, post_approx) #construct the N x K discretized log-likelihood matrix; each row represents the discretized LL func for one datapoint vecs = proj.get() ############################ ############################ ## Step 4: Build the Coreset ############################ ############################ #build the coreset M = 100 # use 100 datapoints giga = bc.GIGA( vecs ) #do coreset construction using the discretized log-likelihood functions giga.run(M) #build the coreset wts = giga.weights() #get the output weights idcs = wts > 0 #pull out the indices of datapoints that were included in the coreset ######################## ######################## ## Step 5: Run Inference ######################## ######################## #example: #from inference import hmc #mcmc_steps = 5000 #total number of MH steps #mcmc_burn = 1000
########################################## N = 1000000 D = 50 err = np.zeros((len(anms), n_trials, Ms.shape[0])) csize = np.zeros((len(anms), n_trials, Ms.shape[0])) cput = np.zeros((len(anms), n_trials, Ms.shape[0])) for tr in range(n_trials): X = np.random.randn(N, D) XS = X.sum(axis=0) for aidx, anm in enumerate(anms): print('data: gauss, trial ' + str(tr + 1) + '/' + str(n_trials) + ', alg: ' + anm) alg = None if anm == 'GIGA': alg = bc.GIGA(X) elif anm == 'FW': alg = bc.FrankWolfe(X) else: alg = bc.RandomSubsampling(X) for m, M in enumerate(Ms): t0 = time.time() alg.run(M) tf = time.time() cput[aidx, tr, m] = tf - t0 + cput[aidx, tr, m - 1] if m > 0 else tf - t0 wts = alg.weights() err[aidx, tr, m] = np.sqrt( (((wts[:, np.newaxis] * X).sum(axis=0) - XS)**2).sum()) csize[aidx, tr, m] = (wts > 0).sum()
def giga_single(N, D, dist="gauss"):
    """Run the full GIGA invariant suite on one generated dataset.

    Generates an (N, D) dataset of type `dist` via `gendata` (defined
    elsewhere in this file) and checks, for every incremental coreset
    size m = 1..N: coreset size bound, monotone error decrease, agreement
    between the two error estimators, and distribution-specific
    optimality properties ('colinear' and 'axis' datasets).  Then checks
    that reset() restores the initial state and that one run(N) matches
    the incremental result.  `tol` is a module-level tolerance.
    """
    x = gendata(N, D, dist)
    xs = x.sum(axis=0)  # exact sum the weighted coreset should reproduce
    giga = bc.GIGA(x)

    #TODO uncomment once giga bds implemented
    ##bound tests
    #prev_sqrt_bd = np.inf
    #prev_exp_bd = np.inf
    #for m in range(1, N+1):
    #  sqrt_bd = giga.sqrt_bound(m)
    #  exp_bd = giga.exp_bound(m)
    #  assert sqrt_bd >= 0., "GIGA failed: sqrt bound < 0"
    #  assert sqrt_bd - prev_sqrt_bd < tol, "GIGA failed: sqrt bound is not decreasing"
    #  assert exp_bd >= 0., "GIGA failed: exp bound < 0"
    #  assert exp_bd - prev_exp_bd < tol, "GIGA failed: exp bound is not decreasing"
    #  prev_sqrt_bd = sqrt_bd
    #  prev_exp_bd = exp_bd
    #assert giga.sqrt_bound(1e100) < tol, "GIGA failed: sqrt bound doesn't approach 0"
    #assert giga.exp_bound(1e100) < tol, "GIGA failed: exp bound doesn't approach 0"

    #incremental M tests
    prev_err = np.inf
    for m in range(1, N + 1):
        # incremental: each run(m) continues from the previous size
        giga.run(m)
        if x.shape[0] == 1:
            # single datapoint: weight must be exactly 1 (or 0 if the point is 0)
            assert np.fabs(giga.weights() - np.array([1])) < tol or (
                np.fabs(giga.weights() - np.array([0])) < tol and (x**2).sum() == 0.
            ), "GIGA failed: coreset not immediately optimal with N = 1"
        assert (giga.weights() > 0.).sum() <= m, "GIGA failed: coreset size > m"
        # xw: sum approximated by the current weighted coreset
        xw = (giga.weights()[:, np.newaxis] * x).sum(axis=0)
        # error must be monotone non-increasing in m (up to tol)
        assert np.sqrt(
            ((xw - xs)**2).sum()
        ) - prev_err < tol, "GIGA failed: error is not monotone decreasing, err = " + str(
            np.sqrt(
                ((xw - xs)**
                 2).sum())) + " prev_err = " + str(prev_err) + " M = " + str(
                     giga.M)
        # the 'accurate' error estimate must match the directly computed error
        assert np.fabs(
            giga.error('accurate') - np.sqrt(((xw - xs)**2).sum())
        ) < tol, "GIGA failed: x(w) est is not close to true x(w): est err = " + str(
            giga.error('accurate')) + ' true err = ' + str(
                np.sqrt(((xw - xs)**2).sum()))
        # fast and accurate estimators agree only loosely, hence the looser tol
        assert np.fabs(
            giga.error('accurate') - giga.error()
        ) < tol * 1000, "GIGA failed: giga.error(accurate/fast) do not return similar results: fast err = " + str(
            giga.error()) + ' acc err = ' + str(giga.error('accurate'))
        #TODO uncomment once giga bound implemented
        #assert giga.sqrt_bound() - np.sqrt(((xw-xs)**2).sum()) >= -tol, "GIGA failed: sqrt bound invalid"
        #assert giga.exp_bound() - np.sqrt(((xw-xs)**2).sum()) >= -tol, "GIGA failed: exp bound invalid"
        if 'colinear' in dist and m >= 1:
            # colinear data: a single direction suffices, so error must vanish
            if not np.sqrt(((xw - xs)**2).sum()) < tol:
                assert False, "colinear m>= 1 problem nrm = " + str(
                    np.sqrt(
                        ((xw - xs)**
                         2).sum())) + " tol = " + str(tol) + " m = " + str(m)
            assert np.sqrt(
                ((xw - xs)**2).sum()
            ) < tol, "GIGA failed: for M>=2, coreset with colinear data not optimal"
        if 'axis' in dist:
            # axis-aligned data: selected weights are exactly 1 and the
            # relative error follows the closed form sqrt(1 - m/N)
            assert np.all(
                np.fabs(giga.weights()[giga.weights() > 0.] - 1.) < tol
            ), "GIGA failed: on axis-aligned data, weights are not 1"
            assert np.fabs(
                np.sqrt(((xw - xs)**2).sum()) / np.sqrt((xs**2).sum()) -
                np.sqrt(1. - float(m) / float(N))
            ) < tol, "GIGA failed: on axis-aligned data, error is not sqrt(1 - M/N)"
        prev_err = np.sqrt(((xw - xs)**2).sum())
    #save incremental M result
    w_inc = giga.weights()
    xw_inc = (giga.weights()[:, np.newaxis] * x).sum(axis=0)

    #check reset
    giga.reset()
    assert giga.M == 0 and np.all(np.fabs(giga.weights()) < tol) and np.fabs(
        giga.error() - np.sqrt((xs**2).sum())
    ) < tol and not giga.reached_numeric_limit, "GIGA failed: giga.reset() did not properly reset"

    #check run up to N all at once vs incremental
    #do this test for all except bin, where symmetries can cause instabilities in the choice of vector (and then different weights if the original vector norms were different)
    if dist != 'bin':
        giga.run(N)
        xw = (giga.weights()[:, np.newaxis] * x).sum(axis=0)
        assert np.sqrt(
            ((xw - xw_inc)**2).sum()
        ) < tol, "GIGA failed: incremental run up to N doesn't produce same result as one run at N : \n xw = " + str(
            xw) + " error = " + str(np.sqrt((
                (xw - xs)**
                2).sum())) + " \n xw_inc = " + str(xw_inc) + " error = " + str(
                    np.sqrt(((xw_inc - xs)**2).sum())) + " \n xs = " + str(xs)