Esempio n. 1
0
def test_giga_input_validation():
    """GIGA must reject non-numeric input by raising ValueError.

    Two invalid inputs are tried: a plain string, and a numpy array of
    strings.  For each one, GIGA is expected to raise ValueError; any
    other exception type is reported as an unrecognized error, and a
    missing exception is reported as a validation failure.
    """
    all_raised = []
    for bad_input in ('fdas', np.array(['fdsa', 'asdf'])):
        try:
            bc.GIGA(bad_input)
            all_raised.append(False)  # no exception: validation missed it
        except ValueError:
            all_raised.append(True)  # expected rejection
        except Exception:
            # narrower than the original bare `except:` — no longer
            # swallows KeyboardInterrupt / SystemExit
            assert False, "Unrecognized error type"

    assert all(all_raised), "GIGA failed: did not catch invalid input"
Esempio n. 2
0
            # NOTE(review): fragment of a larger per-trial loop — the enclosing
            # function/loop headers are outside this view.  tr, t0, mcmc_attempt,
            # accept_rate, th_samples, cputs_full, Fs_full, anms, Ms, vecs, and
            # bc are all defined upstream; confirm their meaning there.
            cputs_full[tr] = time.time() - t0
            print('attempt ' + str(mcmc_attempt) + ': accept rate = ' +
                  str(accept_rate) + ', passes if in (.15, .7) ')
            mcmc_attempt += 1
        th_samples = np.array(th_samples)
        Fs_full[tr] = 0.  #always 0, just doing this to make later code simpler
        full_samples = np.array(th_samples)

        print('Running coreset construction / MCMC')
        # Run each coreset-construction algorithm named in anms on this trial's
        # discretized log-likelihood vectors.
        for aidx, anm in enumerate(anms):
            print(anm + ':')

            t0 = time.time()
            alg = None
            # Dispatch on the algorithm name; anything that is neither GIGA
            # nor FW falls back to random subsampling.
            if 'GIGA' in anm:
                alg = bc.GIGA(vecs)
            elif anm == 'FW':
                alg = bc.FrankWolfe(vecs)
            else:
                alg = bc.RandomSubsampling(vecs)
            t_setup = time.time() - t0

            t_alg = 0.  # cumulative construction time across all sizes in Ms
            for m in range(Ms.shape[0]):
                print('M = ' + str(Ms[m]) + ': coreset construction')
                #this runs alg up to a level of M; on the next iteration, it will continue from where it left off
                t0 = time.time()
                alg.run(Ms[m])
                t_alg += time.time() - t0
                wts = alg.weights()
                idcs = wts > 0  # boolean mask of datapoints kept in the coreset
Esempio n. 3
0
# NOTE(review): tutorial-script fragment — Z, grad_log_likelihood, post_approx,
# and bc are defined earlier in the full script (not visible here).
projection_dim = 500  #random projection dimension, K
#build the discretization of all the log-likelihoods based on random projection
proj = bc.ProjectionF(Z, grad_log_likelihood, projection_dim, post_approx)
#construct the N x K discretized log-likelihood matrix; each row represents the discretized LL func for one datapoint
vecs = proj.get()

############################
############################
## Step 4: Build the Coreset
############################
############################

#build the coreset
M = 100  # use 100 datapoints
giga = bc.GIGA(
    vecs
)  #do coreset construction using the discretized log-likelihood functions
giga.run(M)  #build the coreset
wts = giga.weights()  #get the output weights
idcs = wts > 0  #pull out the indices of datapoints that were included in the coreset

########################
########################
## Step 5: Run Inference
########################
########################

#example (left commented out in the original tutorial):
#from inference import hmc
#mcmc_steps = 5000 #total number of MH steps
#mcmc_burn = 1000
Esempio n. 4
0
##########################################
# Synthetic benchmark: compare coreset algorithms (GIGA / FrankWolfe /
# RandomSubsampling) on standard-normal data, recording approximation error,
# coreset size, and cumulative CPU time per (algorithm, trial, coreset size).
# NOTE(review): anms, n_trials, Ms, bc, np, and time come from earlier in the
# full script (not visible here).
N = 1000000
D = 50

# result tensors: one cell per (algorithm, trial, coreset size M)
err = np.zeros((len(anms), n_trials, Ms.shape[0]))
csize = np.zeros((len(anms), n_trials, Ms.shape[0]))
cput = np.zeros((len(anms), n_trials, Ms.shape[0]))
for tr in range(n_trials):
    X = np.random.randn(N, D)
    XS = X.sum(axis=0)  # target quantity: exact sum of all datapoints
    for aidx, anm in enumerate(anms):
        print('data: gauss, trial ' + str(tr + 1) + '/' + str(n_trials) +
              ', alg: ' + anm)
        alg = None
        if anm == 'GIGA':
            alg = bc.GIGA(X)
        elif anm == 'FW':
            alg = bc.FrankWolfe(X)
        else:
            alg = bc.RandomSubsampling(X)

        for m, M in enumerate(Ms):
            t0 = time.time()
            alg.run(M)
            tf = time.time()
            # cumulative time: this increment plus the previous size's total
            # (the conditional expression binds loosest, so the whole sum is
            # the `if m > 0` branch)
            cput[aidx, tr,
                 m] = tf - t0 + cput[aidx, tr, m - 1] if m > 0 else tf - t0
            wts = alg.weights()
            # l2 distance between the weighted coreset sum and the true sum XS
            err[aidx, tr, m] = np.sqrt(
                (((wts[:, np.newaxis] * X).sum(axis=0) - XS)**2).sum())
            csize[aidx, tr, m] = (wts > 0).sum()
Esempio n. 5
0
def giga_single(N, D, dist="gauss"):
    """Property test of bc.GIGA on one generated dataset.

    Generates an N x D dataset via the module-level `gendata(N, D, dist)`,
    then for each incremental coreset size m = 1..N checks: coreset size
    <= m, monotonically decreasing approximation error, agreement between
    giga.error('accurate') and both the directly computed error and
    giga.error(), plus distribution-specific optimality properties
    ('colinear' and 'axis' variants of `dist`).  Finally verifies that
    reset() clears state and that one run(N) reproduces the incremental
    result (skipped for dist == 'bin').

    Relies on module-level `gendata`, `bc`, `np`, and tolerance `tol`.
    """
    x = gendata(N, D, dist)
    xs = x.sum(axis=0)  # true sum; the quantity the coreset approximates
    giga = bc.GIGA(x)

    #TODO uncomment once giga bds implemented
    ##bound tests
    #prev_sqrt_bd = np.inf
    #prev_exp_bd = np.inf
    #for m in range(1, N+1):
    #  sqrt_bd = giga.sqrt_bound(m)
    #  exp_bd = giga.exp_bound(m)
    #  assert sqrt_bd >= 0., "GIGA failed: sqrt bound < 0"
    #  assert sqrt_bd - prev_sqrt_bd < tol, "GIGA failed: sqrt bound is not decreasing"
    #  assert exp_bd >= 0., "GIGA failed: exp bound < 0"
    #  assert exp_bd - prev_exp_bd < tol, "GIGA failed: exp bound is not decreasing"
    #  prev_sqrt_bd = sqrt_bd
    #  prev_exp_bd = exp_bd
    #assert giga.sqrt_bound(1e100) < tol, "GIGA failed: sqrt bound doesn't approach 0"
    #assert giga.exp_bound(1e100) < tol, "GIGA failed: exp bound doesn't approach 0"

    #incremental M tests: grow the coreset one size at a time
    prev_err = np.inf
    for m in range(1, N + 1):
        giga.run(m)
        # degenerate case: a single datapoint must be matched immediately
        # (weight 1, or weight 0 when the datapoint itself is all-zero)
        if x.shape[0] == 1:
            assert np.fabs(giga.weights() - np.array([1])) < tol or (
                np.fabs(giga.weights() - np.array([0])) < tol and
                (x**2).sum() == 0.
            ), "GIGA failed: coreset not immediately optimal with N = 1"
        assert (giga.weights() >
                0.).sum() <= m, "GIGA failed: coreset size > m"
        # xw = weighted coreset approximation of the sum xs
        xw = (giga.weights()[:, np.newaxis] * x).sum(axis=0)
        assert np.sqrt(
            ((xw - xs)**2).sum()
        ) - prev_err < tol, "GIGA failed: error is not monotone decreasing, err = " + str(
            np.sqrt(
                ((xw - xs)**
                 2).sum())) + " prev_err = " + str(prev_err) + " M = " + str(
                     giga.M)
        # the 'accurate' error estimate must match the directly computed error
        assert np.fabs(
            giga.error('accurate') - np.sqrt(((xw - xs)**2).sum())
        ) < tol, "GIGA failed: x(w) est is not close to true x(w): est err = " + str(
            giga.error('accurate')) + ' true err = ' + str(
                np.sqrt(((xw - xs)**2).sum()))
        # fast vs accurate error estimates agree to a looser tolerance
        assert np.fabs(
            giga.error('accurate') - giga.error()
        ) < tol * 1000, "GIGA failed: giga.error(accurate/fast) do not return similar results: fast err = " + str(
            giga.error()) + ' acc err = ' + str(giga.error('accurate'))
        #TODO uncomment once giga bound implemented
        #assert giga.sqrt_bound() - np.sqrt(((xw-xs)**2).sum()) >= -tol, "GIGA failed: sqrt bound invalid"
        #assert giga.exp_bound() - np.sqrt(((xw-xs)**2).sum()) >= -tol, "GIGA failed: exp bound invalid"
        # colinear data: one coreset point already spans the sum exactly
        if 'colinear' in dist and m >= 1:
            if not np.sqrt(((xw - xs)**2).sum()) < tol:
                assert False, "colinear m>= 1 problem nrm = " + str(
                    np.sqrt(
                        ((xw - xs)**
                         2).sum())) + " tol = " + str(tol) + " m = " + str(m)
            assert np.sqrt(
                ((xw - xs)**2).sum()
            ) < tol, "GIGA failed: for M>=2, coreset with colinear data not optimal"
        # axis-aligned data: unit weights and a closed-form error profile
        if 'axis' in dist:
            assert np.all(
                np.fabs(giga.weights()[giga.weights() > 0.] - 1.) < tol
            ), "GIGA failed: on axis-aligned data, weights are not 1"
            assert np.fabs(
                np.sqrt(((xw - xs)**2).sum()) / np.sqrt((xs**2).sum()) -
                np.sqrt(1. - float(m) / float(N))
            ) < tol, "GIGA failed: on axis-aligned data, error is not sqrt(1 - M/N)"
        prev_err = np.sqrt(((xw - xs)**2).sum())
    #save incremental M result
    w_inc = giga.weights()
    xw_inc = (giga.weights()[:, np.newaxis] * x).sum(axis=0)

    #check reset
    giga.reset()
    assert giga.M == 0 and np.all(np.fabs(giga.weights()) < tol) and np.fabs(
        giga.error() - np.sqrt((xs**2).sum())
    ) < tol and not giga.reached_numeric_limit, "GIGA failed: giga.reset() did not properly reset"
    #check run up to N all at once vs incremental
    #do this test for all except bin, where symmetries can cause instabilities in the choice of vector (and then different weights if the original vector norms were different)
    if dist != 'bin':
        giga.run(N)
        xw = (giga.weights()[:, np.newaxis] * x).sum(axis=0)
        assert np.sqrt(
            ((xw - xw_inc)**2).sum()
        ) < tol, "GIGA failed: incremental run up to N doesn't produce same result as one run at N : \n xw = " + str(
            xw) + " error = " + str(np.sqrt((
                (xw - xs)**
                2).sum())) + " \n xw_inc = " + str(xw_inc) + " error = " + str(
                    np.sqrt(((xw_inc - xs)**2).sum())) + " \n xs = " + str(xs)