def adam_minimax(grad_both, init_params_max, init_params_min, callback=None,
                 num_iters=100, step_size_max=0.001, step_size_min=0.001,
                 b1=0.9, b2=0.999, eps=10**-8):
    """Adam modified to do minimax optimization, for instance to help with
    training generative adversarial networks."""

    x_max, unflatten_max = flatten(init_params_max)
    x_min, unflatten_min = flatten(init_params_min)

    m_max = np.zeros(len(x_max))
    v_max = np.zeros(len(x_max))
    m_min = np.zeros(len(x_min))
    v_min = np.zeros(len(x_min))

    for i in range(num_iters):
        g_max_uf, g_min_uf = grad_both(unflatten_max(x_max),
                                       unflatten_min(x_min), i)
        g_max, _ = flatten(g_max_uf)
        g_min, _ = flatten(g_min_uf)

        if callback:
            callback(unflatten_max(x_max), unflatten_min(x_min), i,
                     unflatten_max(g_max), unflatten_min(g_min))

        m_max = (1 - b1) * g_max + b1 * m_max  # First moment estimate.
        v_max = (1 - b2) * (g_max**2) + b2 * v_max  # Second moment estimate.
        mhat_max = m_max / (1 - b1**(i + 1))  # Bias correction.
        vhat_max = v_max / (1 - b2**(i + 1))
        x_max = x_max + step_size_max * mhat_max / (np.sqrt(vhat_max) + eps)

        m_min = (1 - b1) * g_min + b1 * m_min  # First moment estimate.
        v_min = (1 - b2) * (g_min**2) + b2 * v_min  # Second moment estimate.
        mhat_min = m_min / (1 - b1**(i + 1))  # Bias correction.
        vhat_min = v_min / (1 - b2**(i + 1))
        x_min = x_min - step_size_min * mhat_min / (np.sqrt(vhat_min) + eps)

    return unflatten_max(x_max), unflatten_min(x_min)
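# A minimal usage sketch for adam_minimax (illustrative, not from the original
# source). The names `objective`, `init_gen`, and `init_dsc` are hypothetical;
# it assumes a recent autograd where flatten lives in autograd.misc (older
# versions expose autograd.util.flatten, as imported elsewhere in this file).
import autograd.numpy as np
from autograd import grad
from autograd.misc import flatten

def objective(gen_params, dsc_params, t):
    # Toy saddle-point objective: ascended in gen_params, descended in dsc_params.
    return np.sum(gen_params['w'] * dsc_params['w'])

grad_both = lambda g, d, i: (grad(objective, 0)(g, d, i),
                             grad(objective, 1)(g, d, i))

init_gen = {'w': np.ones(3)}
init_dsc = {'w': np.full(3, 0.5)}
gen_params, dsc_params = adam_minimax(grad_both, init_gen, init_dsc, num_iters=10)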
def adam_minmin(grad_both, init_params_nn, init_params_nn2, callback=None,
                num_iters=100, step_size=0.001, b1=0.9, b2=0.999, eps=10**-8):
    x_nn, unflatten_nn = flatten(init_params_nn)
    x_nn2, unflatten_nn2 = flatten(init_params_nn2)

    m_nn, v_nn = np.zeros(len(x_nn)), np.zeros(len(x_nn))
    m_nn2, v_nn2 = np.zeros(len(x_nn2)), np.zeros(len(x_nn2))

    for i in range(num_iters):
        g_nn_uf, g_nn2_uf = grad_both(unflatten_nn(x_nn),
                                      unflatten_nn2(x_nn2), i)
        g_nn, _ = flatten(g_nn_uf)
        g_nn2, _ = flatten(g_nn2_uf)

        if callback:
            callback(unflatten_nn(x_nn), unflatten_nn2(x_nn2), i)

        # exponential_decay is assumed to be defined elsewhere in this module
        # (compare the adam_minimax variants, which define it inline).
        step_size = exponential_decay(step_size)

        # Update parameters
        m_nn = (1 - b1) * g_nn + b1 * m_nn  # First moment estimate.
        v_nn = (1 - b2) * (g_nn**2) + b2 * v_nn  # Second moment estimate.
        mhat_nn = m_nn / (1 - b1**(i + 1))  # Bias correction.
        vhat_nn = v_nn / (1 - b2**(i + 1))
        x_nn = x_nn - step_size * mhat_nn / (np.sqrt(vhat_nn) + eps)

        # Update parameters
        m_nn2 = (1 - b1) * g_nn2 + b1 * m_nn2  # First moment estimate.
        v_nn2 = (1 - b2) * (g_nn2**2) + b2 * v_nn2  # Second moment estimate.
        mhat_nn2 = m_nn2 / (1 - b1**(i + 1))  # Bias correction.
        vhat_nn2 = v_nn2 / (1 - b2**(i + 1))
        x_nn2 = x_nn2 - step_size * mhat_nn2 / (np.sqrt(vhat_nn2) + eps)

    return unflatten_nn(x_nn), unflatten_nn2(x_nn2)
def test_flatten():
    val = (npr.randn(4), [npr.randn(3, 4), 2.5], (), (2.0, [1.0, npr.randn(2)]))
    vect, unflatten = flatten(val)
    val_recovered = unflatten(vect)
    vect_2, _ = flatten(val_recovered)
    assert np.all(vect == vect_2)
def flatmap(f, container):
    flatten = lambda lst: [item for sublst in lst for item in sublst]
    mappers = {
        np.ndarray: lambda f, arr: f(arr),
        list: lambda f, lst: flatten(map(f, lst)),
        dict: lambda f, dct: flatten(map(f, dct.values()))
    }
    return mappers[type(container)](f, container)
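# A brief usage sketch for flatmap (illustrative, not from the original source;
# assumes numpy imported as np). f should return a list so the list/dict
# mappers can concatenate the per-element results.
double_all = lambda arr: [arr * 2]
flatmap(double_all, np.ones(2))                 # ndarray: f applied directly -> [array([2., 2.])]
flatmap(double_all, [np.ones(2), np.zeros(3)])  # list: mapped, then concatenated
flatmap(double_all, {'a': np.ones(2)})          # dict: mapped over the values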
def adam_minimax(grad_both, init_params_max, init_params_min, callback=None,
                 num_iters=100, step_size_max=0.001, step_size_min=0.001,
                 b1=0.9, b2=0.999, eps=10**-8):
    """Adam modified to do minimax optimization, for instance to help with
    training generative adversarial networks."""

    x_max, unflatten_max = flatten(init_params_max)
    x_min, unflatten_min = flatten(init_params_min)

    m_max = np.zeros(len(x_max))
    v_max = np.zeros(len(x_max))
    m_min = np.zeros(len(x_min))
    v_min = np.zeros(len(x_min))

    ability = 0
    HANDICAP = 100

    for i in range(num_iters):
        g_max_uf, g_min_uf = grad_both(unflatten_max(x_max),
                                       unflatten_min(x_min), i)
        g_max, _ = flatten(g_max_uf)
        g_min, _ = flatten(g_min_uf)

        if callback:
            callback(unflatten_max(x_max), unflatten_min(x_min), i,
                     unflatten_max(g_max), unflatten_min(g_min))

        # objective is assumed to be the minimax objective defined elsewhere in
        # this module; it is re-evaluated every 10 iterations to decide which
        # player gets updated.
        if i % 10 == 0:
            ability = objective(unflatten_max(x_max), unflatten_min(x_min), i)

        if ability < HANDICAP:
            m_max = (1 - b1) * g_max + b1 * m_max  # First moment estimate.
            v_max = (1 - b2) * (g_max**2) + b2 * v_max  # Second moment estimate.
            mhat_max = m_max / (1 - b1**(i + 1))  # Bias correction.
            vhat_max = v_max / (1 - b2**(i + 1))
            x_max = x_max + step_size_max * mhat_max / (np.sqrt(vhat_max) + eps)
        else:
            print('Skipping generator update because objective is too high')

        if ability > -HANDICAP:
            m_min = (1 - b1) * g_min + b1 * m_min  # First moment estimate.
            v_min = (1 - b2) * (g_min**2) + b2 * v_min  # Second moment estimate.
            mhat_min = m_min / (1 - b1**(i + 1))  # Bias correction.
            vhat_min = v_min / (1 - b2**(i + 1))
            x_min = x_min - step_size_min * mhat_min / (np.sqrt(vhat_min) + eps)
        else:
            print('Skipping discriminator update because objective is too low')

    return unflatten_max(x_max), unflatten_min(x_min)
def test_flatten_dict():
    val = {'k': npr.random((4, 4)),
           'k2': npr.random((3, 3)),
           'k3': 3.0,
           'k4': [1.0, 4.0, 7.0, 9.0]}
    vect, unflatten = flatten(val)
    val_recovered = unflatten(vect)
    vect_2, _ = flatten(val_recovered)
    assert np.all(vect == vect_2)
def time_flatten():
    val = {'k': npr.random((4, 4)),
           'k2': npr.random((3, 3)),
           'k3': 3.0,
           'k4': [1.0, 4.0, 7.0, 9.0],
           'k5': np.array([4., 5., 6.]),
           'k6': np.array([[7., 8.], [9., 10.]])}
    vect, unflatten = flatten(val)
    val_recovered = unflatten(vect)
    vect_2, _ = flatten(val_recovered)
def make_gradfun(run_inference, pgm_prior, data, batch_size, num_samples,
                 natgrad_scale=1., callback=callback):
    _, unflat = flatten(pgm_prior)
    num_datapoints = get_num_datapoints(data)
    data_batches, num_batches = split_into_batches(data, batch_size)
    get_batch = lambda i: data_batches[i % num_batches]
    saved = lambda: None

    def mc_elbo(pgm_params, i):
        # Here nn_potentials are just the sufficient stats of the data.
        x = get_batch(i)
        xxT = np.einsum('ij,ik->ijk', x, x)
        n = np.ones(x.shape[0]) if x.ndim == 2 else 1.
        nn_potentials = pack_dense(xxT, x, n, n)
        saved.stats, global_kl, local_kl = run_inference(
            pgm_prior, pgm_params, nn_potentials)
        return (-global_kl - num_batches * local_kl) / num_datapoints  # CHECK

    def gradfun(params, i):
        pgm_params = params
        val = -mc_elbo(pgm_params, i)
        # flat is assumed to be the helper lambda struct: flatten(struct)[0]
        # defined alongside these functions.
        pgm_natgrad = -natgrad_scale / num_datapoints * \
            (flat(pgm_prior) + num_batches*flat(saved.stats) - flat(pgm_params))
        # print(flat(pgm_prior), num_batches*flat(saved.stats), -flat(pgm_params))
        grad = unflat(pgm_natgrad)
        if callback: callback(i, val, params, grad)
        return grad

    return gradfun
def question4b3(m, train_x, train_y_integers):
    # Number of hidden units
    dims_hid = m

    x_train, x_test, y_train_integers, y_test_integers = train_test_split(
        train_x, train_y_integers, test_size=0.2, train_size=0.8)

    y_train = np.zeros((x_train.shape[0], 4))
    y_train[np.arange(x_train.shape[0]), y_train_integers] = 1
    y_test = np.zeros((x_test.shape[0], 4))
    y_test[np.arange(x_test.shape[0]), y_test_integers] = 1

    W = np.random.randn(x_train.shape[1], dims_hid)
    b = np.random.randn(dims_hid)
    V = np.random.randn(dims_hid, 4)
    c = np.random.randn(4)

    # Compress all weights into one weight vector using autograd's flatten
    all_weights = (W, b, V, c)
    weights, unflatten = flatten(all_weights)

    smooth_grad = 0
    for i in range(1000):
        weight_gradients, returned_values = grad_fun(weights, x_train, y_train,
                                                     unflatten)
        smooth_grad = (1 - momentum) * smooth_grad + momentum * weight_gradients
        weights = weights - epsilon * smooth_grad

    return mean_zero_one_loss(weights, x_test, y_test_integers, unflatten)
def make_gradfun(run_inference, recognize, loglike, pgm_prior, data,
                 batch_size, num_samples, natgrad_scale=1., callback=callback):
    _, unflat = flatten(pgm_prior)
    num_datapoints = get_num_datapoints(data)
    data_batches, num_batches = split_into_batches(data, batch_size)
    get_batch = lambda i: data_batches[i % num_batches]
    saved = lambda: None

    def mc_elbo(pgm_params, loglike_params, recogn_params, i):
        nn_potentials = recognize(recogn_params, get_batch(i))
        samples, saved.stats, global_kl, local_kl = \
            run_inference(pgm_prior, pgm_params, nn_potentials, num_samples)
        return (num_batches * loglike(loglike_params, samples, get_batch(i))
                - global_kl - num_batches * local_kl) / num_datapoints

    def gradfun(params, i):
        pgm_params, loglike_params, recogn_params = params
        objective = lambda (loglike_params, recogn_params): \
            -mc_elbo(pgm_params, loglike_params, recogn_params, i)
        val, (loglike_grad, recogn_grad) = vgrad(objective)((loglike_params, recogn_params))
        # this expression for pgm_natgrad drops a term that can be computed using
        # the function autograd.misc.fixed_points.fixed_point
        pgm_natgrad = -natgrad_scale / num_datapoints * \
            (flat(pgm_prior) + num_batches*flat(saved.stats) - flat(pgm_params))
        grad = unflat(pgm_natgrad), loglike_grad, recogn_grad
        if callback: callback(i, val, params, grad)
        return grad

    return gradfun
def __init__(self, params, predict, inputs, targets):
    """Construct a Model object given a prediction function."""
    self.__params = params
    self.__params_flat, self.unflatten_params = flatten(self.params)
    self.predict = predict
    self.inputs = inputs
    self.targets = targets
    self.gradient = autograd.grad(self.loss)
    self.hessian = autograd.hessian(self.loss)
    self.hess_dot_vec = autograd.hessian_vector_product(self.loss)
    self.grad_rayleigh = autograd.grad(self.rayleigh_quotient)
def unflatten_tracing():
    val = [npr.randn(4), [npr.randn(3, 4), 2.5], (), (2.0, [1.0, npr.randn(2)])]
    vect, unflatten = flatten(val)
    def f(vect): return unflatten(vect)
    flatten2, _ = make_vjp(f)(vect)
    assert np.all(vect == flatten2(val))
def time_grad_flatten():
    val = {'k': npr.random((4, 4)),
           'k2': npr.random((3, 3)),
           'k3': 3.0,
           'k4': [1.0, 4.0, 7.0, 9.0],
           'k5': np.array([4., 5., 6.]),
           'k6': np.array([[7., 8.], [9., 10.]])}
    vect, unflatten = flatten(val)
    def fun(vec):
        v = unflatten(vec)
        return np.sum(v['k5']) + np.sum(v['k6'])
    grad(fun)(vect)
def init_params(scale, rs=npr.RandomState(0)):
    w = range(4)
    # LeNet: 20-50-500
    # 10-20-(320)-128-10
    w[0] = (scale * rs.randn(1, 10, 5, 5).astype(dtype),
            scale * rs.randn(1, 10, 1, 1).astype(dtype))
    w[1] = (scale * rs.randn(10, 20, 5, 5).astype(dtype),
            scale * rs.randn(1, 20, 1, 1).astype(dtype))
    w[2] = (scale * rs.randn(320, 128).astype(dtype),
            scale * rs.randn(128).astype(dtype))
    w[3] = (scale * rs.randn(128, 10).astype(dtype),
            scale * rs.randn(10).astype(dtype))
    t1, _ = flatten(w)
    print '[size]: ', t1.shape
    return w
def nnOneLayerTrainEntry():
    data = read_image_data()
    train_x = data[0]
    train_y_integers = data[1]
    test_x = data[2]

    # Make inputs approximately zero mean (improves optimization of the
    # backprop algorithm in the NN)
    train_x -= .5
    test_x -= .5

    # Number of output dimensions
    dims_out = 4
    # Number of hidden units
    dims_hid_list = [5, 40, 70]
    # Learning rate
    epsilon = 0.0001
    # Momentum of gradients update
    momentum = 0.1
    # Number of epochs
    nEpochs = 1000
    # Number of train examples
    nTrainSamples = train_x.shape[0]
    # Number of input dimensions
    dims_in = train_x.shape[1]

    # Convert integer labels to one-hot vectors
    # i.e. convert label 2 to 0, 0, 1, 0
    train_y = np.zeros((nTrainSamples, dims_out))
    train_y[np.arange(nTrainSamples), train_y_integers] = 1
    print("trainy shape: ", train_y.shape)

    assert momentum <= 1
    assert epsilon <= 1

    xnEpochsLst = range(1, nEpochs + 1, 1)
    yLossLst = []
    for dims_hid in dims_hid_list:
        trainStart = time.time() * 1000
        # Initializing weights
        W = np.random.randn(dims_in, dims_hid)
        b = np.random.randn(dims_hid)
        V = np.random.randn(dims_hid, dims_out)
        c = np.random.randn(dims_out)
        smooth_grad = 0

        # Compress all weights into one weight vector using autograd's flatten
        all_weights = (W, b, V, c)
        weights, unflatten = flatten(all_weights)

        yLossInns = []
        for epo in xnEpochsLst:
            smooth_grad, weights, meanLogisticloss, meanZeroOneLoss = trainNN(
                epsilon, momentum, train_x, train_y, train_y_integers, weights,
                unflatten, smooth_grad)
            yLossInns.append(meanLogisticloss)
        yLossLst.append(yLossInns)
        print("NN time for different M: ", dims_hid, time.time() * 1000 - trainStart)

    labels = ["M = " + str(dims_hid) for dims_hid in dims_hid_list]
    plotNN(xnEpochsLst, yLossLst, labels)
def stratifyDataTrainTestNN():
    data = read_image_data()
    train_x = data[0]
    train_y_integers = data[1]
    test_x = data[2]

    # Make inputs approximately zero mean (improves optimization of the
    # backprop algorithm in the NN)
    train_x -= .5
    test_x -= .5

    dims_out = 4
    xsplitTrain, xsplitTest, ysplitTrain_integer, ysplitTest_integer = train_test_split(
        train_x, train_y_integers, test_size=0.2, random_state=0,
        stratify=train_y_integers)

    dims_in = xsplitTrain.shape[1]
    nTrainSamples = xsplitTrain.shape[0]
    ysplitTrain = np.zeros((nTrainSamples, dims_out))
    ysplitTrain[np.arange(nTrainSamples), ysplitTrain_integer] = 1

    # Learning rate
    epsilon = 0.0001
    # Momentum of gradients update
    momentum = 0.1
    dims_hid_list = [5, 40, 70]
    nEpochs = 1000
    xnEpochsLst = range(1, nEpochs + 1, 1)

    smallestValidationError = 2**32
    bestParas = []
    best_dims_hid = 0
    for dims_hid in dims_hid_list:
        # Initializing weights
        W = np.random.randn(dims_in, dims_hid)
        b = np.random.randn(dims_hid)
        V = np.random.randn(dims_hid, dims_out)
        c = np.random.randn(dims_out)
        smooth_grad = 0

        # Compress all weights into one weight vector using autograd's flatten
        all_weights = (W, b, V, c)
        weights, unflatten = flatten(all_weights)

        meanZeroOneLoss = 0
        for epo in xnEpochsLst:
            smooth_grad, weights, meanLogisticloss, meanZeroOneLoss = trainNN(
                epsilon, momentum, xsplitTrain, ysplitTrain, ysplitTrain_integer,
                weights, unflatten, smooth_grad)

        # Get the validation-set zero-one loss
        zeroOnelossEach = mean_zero_one_loss(weights, xsplitTest,
                                             ysplitTest_integer, unflatten)
        print("zeroOnelossEach: ", zeroOnelossEach)
        if zeroOnelossEach < smallestValidationError:
            smallestValidationError = zeroOnelossEach
            bestParas = [weights, unflatten, smooth_grad]
            best_dims_hid = dims_hid

    print("smallestValidationError: ", smallestValidationError, "M = ", best_dims_hid)

    # Train on the whole data set
    nTrainSamples = train_x.shape[0]
    train_y = np.zeros((nTrainSamples, dims_out))
    train_y[np.arange(nTrainSamples), train_y_integers] = 1
    weights = bestParas[0]
    unflatten = bestParas[1]
    smooth_grad = bestParas[2]
    smooth_grad, weights, meanLogisticloss, meanZeroOneLoss = trainNN(
        epsilon, momentum, train_x, train_y, train_y_integers, weights,
        unflatten, smooth_grad)

    fileTestOutputNN = "../Predictions/best_NN2.csv"
    testDataOutputFile(weights, test_x, unflatten, fileTestOutputNN)
def log_gaussian(params, scale):
    flat_params, _ = flatten(params)
    return np.sum(norm.logpdf(flat_params, 0, scale))
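# A small usage sketch for log_gaussian (illustrative only; assumes
# autograd.numpy as np, a scipy-style norm with logpdf, and the flatten helper
# used above). The params structure below is hypothetical.
params = {'w': np.zeros((2, 3)), 'b': [0.0, 1.0]}
log_prior = log_gaussian(params, scale=10.0)  # sum of N(0, 10) log-densities over all 8 entries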
for dims_hid in dims_hids:
    print("unit: ", dims_hid)
    start = time.time()
    mean_loss = []

    # Initializing weights
    W = np.random.randn(dims_in, dims_hid)
    b = np.random.randn(dims_hid)
    V = np.random.randn(dims_hid, dims_out)
    c = np.random.randn(dims_out)
    smooth_grad = 0

    # Compress all weights into one weight vector using autograd's flatten
    all_weights = (W, b, V, c)
    weights, unflatten = flatten(all_weights)

    for i in range(nEpochs):
        # Compute gradients (partial derivatives) using autograd toolbox
        weight_gradients, returned_values = grad_fun(weights, X_train, train_y,
                                                     unflatten)
        mean = returned_values[0] / nTrainSamples
        mean_loss.append(mean)

        # Update weight vector
        smooth_grad = (1 - momentum) * smooth_grad + momentum * weight_gradients
        weights = weights - epsilon * smooth_grad
def test_flatten_complex():
    val = 1 + 1j
    flat, unflatten = flatten(val)
    assert np.all(val == unflatten(flat))
def adam_minimax(grad_both, init_params_max, init_params_min,
                 neighbors_function, callback=None, num_iters=100,
                 step_size_max=0.001, step_size_min=0.001,
                 b1=0.9, b2=0.999, eps=10**-8):
    """Adam modified to do minimax optimization, for instance to help with
    training generative adversarial networks."""

    def exponential_decay(step_size_max):
        if step_size_max > 0.001:
            step_size_max *= 0.999
        return step_size_max

    x_max, unflatten_max = flatten(init_params_max)
    x_min, unflatten_min = flatten(init_params_min)

    m_max = np.zeros(len(x_max))
    v_max = np.zeros(len(x_max))
    m_min = np.zeros(len(x_min))
    v_min = np.zeros(len(x_min))

    # gp_fold = '/cluster/mshen/prj/gans/out/2017-06-19/c_gan/ajc/gen_params/'
    # iter_nm = 'akb'
    # genZ_params = import_ganZ_gen_params(gp_fold, iter_nm)
    # x_max, unflatten_max = flatten(genZ_params)

    # i = 0
    # g_max_uf, g_min_uf = grad_both(unflatten_max(x_max), unflatten_min(x_min), i, neighbors_function)
    # g_max, _ = flatten(g_max_uf)
    # g_min, _ = flatten(g_min_uf)
    # dnow = datetime.datetime.now(); g_max_uf, g_min_uf = grad_both(unflatten_max(x_max), unflatten_min(x_min), i, neighbors_function); print(datetime.datetime.now() - dnow)
    # import code; code.interact(local=dict(globals(), **locals()))

    for i in range(num_iters):
        print(i, datetime.datetime.now(), alphabetize(i))

        # if i % 5 == 0 and i % 10 != 1:
        #     K = 10
        # if i % 5 == 0:
        #     K = 10
        # else:
        #     K = 1
        K = 3

        if i == 10:
            # Once entropy is done learning, reduce step size
            step_size_max = 0.01

        # import code; code.interact(local=dict(globals(), **locals()))
        g_max_uf, g_min_uf = grad_both(unflatten_max(x_max),
                                       unflatten_min(x_min), i,
                                       neighbors_function)
        g_max, _ = flatten(g_max_uf)
        g_min, _ = flatten(g_min_uf)

        if callback:
            callback(unflatten_max(x_max), unflatten_min(x_min), i,
                     unflatten_max(g_max), unflatten_min(g_min))

        step_size_max = exponential_decay(step_size_max)

        # Update generator (maximizer)
        m_max = (1 - b1) * g_max + b1 * m_max  # First moment estimate.
        v_max = (1 - b2) * (g_max**2) + b2 * v_max  # Second moment estimate.
        mhat_max = m_max / (1 - b1**(i + 1))  # Bias correction.
        vhat_max = v_max / (1 - b2**(i + 1))
        x_max = x_max + step_size_max * mhat_max / (np.sqrt(vhat_max) + eps)

        # Update discriminator (minimizer)
        m_min = (1 - b1) * g_min + b1 * m_min  # First moment estimate.
        v_min = (1 - b2) * (g_min**2) + b2 * v_min  # Second moment estimate.
        mhat_min = m_min / (1 - b1**(i + 1))  # Bias correction.
        vhat_min = v_min / (1 - b2**(i + 1))
        x_min = x_min - step_size_min * mhat_min / (np.sqrt(vhat_min) + eps)

        # Take K - 1 extra discriminator steps with a decaying temporary step size
        for k in range(K - 1):
            if k <= 0:
                step_size_min_temp = step_size_min
            if k > 0:
                step_size_min_temp = step_size_min_temp * 0.50

            g_max_uf, g_min_uf = grad_both(unflatten_max(x_max),
                                           unflatten_min(x_min), i,
                                           neighbors_function)
            g_min, _ = flatten(g_min_uf)

            # Update discriminator (minimizer)
            m_min = (1 - b1) * g_min + b1 * m_min  # First moment estimate.
            v_min = (1 - b2) * (g_min**2) + b2 * v_min  # Second moment estimate.
            mhat_min = m_min / (1 - b1**(i + 1))  # Bias correction.
            vhat_min = v_min / (1 - b2**(i + 1))
            x_min = x_min - step_size_min_temp * mhat_min / (np.sqrt(vhat_min) + eps)

    return unflatten_max(x_max), unflatten_min(x_min)
from __future__ import division, print_function
from toolz import curry
from autograd import value_and_grad as vgrad
from autograd.util import flatten
from util import split_into_batches, get_num_datapoints

callback = lambda i, val, params, grad: print('{}: {}'.format(i, val))
flat = lambda struct: flatten(struct)[0]


@curry
def make_gradfun(run_inference, recognize, loglike, pgm_prior, data,
                 batch_size, num_samples, natgrad_scale=1., callback=callback):
    _, unflat = flatten(pgm_prior)
    num_datapoints = get_num_datapoints(data)
    data_batches, num_batches = split_into_batches(data, batch_size)
    get_batch = lambda i: data_batches[i % num_batches]
    saved = lambda: None

    def mc_elbo(pgm_params, loglike_params, recogn_params, i):
        nn_potentials = recognize(recogn_params, get_batch(i))
        samples, saved.stats, global_kl, local_kl = \
            run_inference(pgm_prior, pgm_params, nn_potentials, num_samples)
        return (num_batches * loglike(loglike_params, samples, get_batch(i))
                - global_kl - num_batches * local_kl) / num_datapoints

    def gradfun(params, i):
        pgm_params, loglike_params, recogn_params = params
        # NOTE: the tuple-parameter unpacking in this lambda is Python 2-only syntax.
        objective = lambda (loglike_params, recogn_params): \
            -mc_elbo(pgm_params, loglike_params, recogn_params, i)
        val, (loglike_grad, recogn_grad) = vgrad(objective)((loglike_params, recogn_params))
        # this expression for pgm_natgrad drops a term that can be computed using
def adam_minimax(grad_both, init_params_max, init_params_min,
                 neighbors_function, callback=None, num_iters=100,
                 step_size_max=0.001, step_size_min=0.001,
                 b1=0.9, b2=0.999, eps=10**-8):
    """Adam modified to do minimax optimization, for instance to help with
    training generative adversarial networks."""

    def exponential_decay(step_size_min, step_size_max):
        if step_size_min > 0.0001:
            step_size_min *= 0.99
        if step_size_max > 0.001:
            step_size_max *= 0.99
        return step_size_min, step_size_max

    x_max, unflatten_max = flatten(init_params_max)
    x_min, unflatten_min = flatten(init_params_min)

    m_max = np.zeros(len(x_max))
    v_max = np.zeros(len(x_max))
    m_min = np.zeros(len(x_min))
    v_min = np.zeros(len(x_min))

    K = 1
    for i in range(num_iters):
        g_max_uf, g_min_uf = grad_both(unflatten_max(x_max),
                                       unflatten_min(x_min), i,
                                       neighbors_function)
        g_max, _ = flatten(g_max_uf)
        g_min, _ = flatten(g_min_uf)

        if callback:
            callback(unflatten_max(x_max), unflatten_min(x_min), i,
                     unflatten_max(g_max), unflatten_min(g_min))

        step_size_min, step_size_max = exponential_decay(step_size_min,
                                                         step_size_max)

        # Update generator (maximizer)
        m_max = (1 - b1) * g_max + b1 * m_max  # First moment estimate.
        v_max = (1 - b2) * (g_max**2) + b2 * v_max  # Second moment estimate.
        mhat_max = m_max / (1 - b1**(i + 1))  # Bias correction.
        vhat_max = v_max / (1 - b2**(i + 1))
        x_max = x_max + step_size_max * mhat_max / (np.sqrt(vhat_max) + eps)

        # Update discriminator (minimizer)
        m_min = (1 - b1) * g_min + b1 * m_min  # First moment estimate.
        v_min = (1 - b2) * (g_min**2) + b2 * v_min  # Second moment estimate.
        mhat_min = m_min / (1 - b1**(i + 1))  # Bias correction.
        vhat_min = v_min / (1 - b2**(i + 1))
        x_min = x_min - step_size_min * mhat_min / (np.sqrt(vhat_min) + eps)

        # Take K - 1 extra discriminator steps
        for k in range(K - 1):
            g_max_uf, g_min_uf = grad_both(unflatten_max(x_max),
                                           unflatten_min(x_min), i,
                                           neighbors_function)
            g_min, _ = flatten(g_min_uf)

            # Update discriminator (minimizer)
            m_min = (1 - b1) * g_min + b1 * m_min  # First moment estimate.
            v_min = (1 - b2) * (g_min**2) + b2 * v_min  # Second moment estimate.
            mhat_min = m_min / (1 - b1**(i + 1))  # Bias correction.
            vhat_min = v_min / (1 - b2**(i + 1))
            x_min = x_min - step_size_min * mhat_min / (np.sqrt(vhat_min) + eps)

    return unflatten_max(x_max), unflatten_min(x_min)
def l1_norm(params):
    if isinstance(params, dict):
        return np.sum(np.absolute(flatten(params)[0]))
    return np.sum(np.absolute(flatten(params.value)[0]))
def params(self, params):
    self.__params = params
    self.__params_flat, self.unflatten_params = flatten(self.__params)
def run_variational_inference_gumbel(Ys, A, W_true, Ps_true, Cs, etasq,
                                     stepsize=0.1, init_with_true=True,
                                     num_iters=250, temp_prior=0.1,
                                     num_sinkhorn=20, num_mcmc_samples=500,
                                     temp=1):
    def sample_q(params, unpack_W, unpack_Ps, Cs, num_sinkhorn, temp):
        # Sample W
        mu_W, log_sigmasq_W, log_mu_Ps = params
        W_flat = mu_W + np.sqrt(np.exp(log_sigmasq_W)) * npr.randn(*mu_W.shape)
        W = unpack_W(W_flat)
        # W = W_true

        # Sample Ps: run sinkhorn to move mu close to Birkhoff
        Ps = []
        for log_mu_P, unpack_P, C in zip(log_mu_Ps, unpack_Ps, Cs):
            # Unpack the mean, run sinkhorn, then pack it again
            log_mu_P = unpack_P(log_mu_P)
            a = log_mu_P.shape
            log_mu_P = (log_mu_P +
                        -np.log(-np.log(np.random.uniform(0, 1, (a[0], a[1]))))) / temp
            log_mu_P = sinkhorn_logspace(log_mu_P - 1e8 * (1 - C), num_sinkhorn)
            log_mu_P = log_mu_P[C]
            ## Notice how we limit the variance
            P = np.exp(log_mu_P)
            P = unpack_P(P)
            Ps.append(P)
        Ps = np.array(Ps)
        return W, Ps

    def elbo(params, unpack_W, unpack_Ps, Ys, A, Cs, etasq, num_sinkhorn,
             num_mcmc_samples, temp_prior, temp):
        """
        Provides a stochastic estimate of the variational lower bound.
        sigma_Lim: limits for the variance of the re-parameterization of the
        permutation
        """
        def gumbel_distance(log_mu_Ps, temp_prior, temperature, Cs):
            arr = 0
            for n in range(len(log_mu_Ps)):
                log_mu_P = unpack_Ps[n](log_mu_Ps[n])
                C = Cs[n]
                log_mu_P = log_mu_P[C]
                log_mu_P = log_mu_P[:]
                arr += np.sum(
                    np.log(temp_prior)
                    - 0.5772156649 * temp_prior / temperature
                    - log_mu_P * temp_prior / temperature
                    - np.exp(gammaln(1 + temp_prior / temperature)
                             - log_mu_P * temp_prior / temperature)
                    - (np.log(temperature) - 1 - 0.5772156649))
            return arr

        M, T, N = Ys.shape
        assert A.shape == (N, N)
        assert len(unpack_Ps) == M
        mu_W, log_sigmasq_W, log_mu_Ps = params

        L = 0
        for smpl in range(num_mcmc_samples):
            W, Ps = sample_q(params, unpack_W, unpack_Ps, Cs, num_sinkhorn, temp)
            # Compute the ELBO
            L += log_likelihood(Ys, A, W, Ps, etasq) / num_mcmc_samples

        L += gumbel_distance(log_mu_Ps, temp_prior, temp, Cs)

        # Add the entropy terms
        L += gaussian_entropy(log_sigmasq_W)

        fac = 1000
        ## This term adds the KL divergence between the W prior and posterior,
        ## with entries of W having a prior variance sigma = 1/fac; for details
        ## see the appendix of the VAE paper.
        L += - 0.5 * log_sigmasq_W.size * (np.log(2 * np.pi)) - \
             0.5 * fac * np.sum(np.exp(log_sigmasq_W)) - \
             0.5 * fac * np.sum(np.power(mu_W, 2))

        # Normalize objective
        L /= (T * M * N)
        return L

    M, T, N = Ys.shape

    # Initialize variational parameters
    if init_with_true:
        mu_W, log_sigmasq_W, unpack_W, log_mu_Ps, unpack_Ps = \
            initialize_params_gumbel(A, Cs, map_W=W_true)
    else:
        mu_W, log_sigmasq_W, unpack_W, log_mu_Ps, unpack_Ps = \
            initialize_params_gumbel(A, Cs)

    # Make a function to convert an array of params into a set of parameters
    # mu_W, sigmasq_W, [mu_P1, sigmasq_P1, ...]
    flat_params, unflatten = \
        flatten((mu_W, log_sigmasq_W, log_mu_Ps))

    objective = \
        lambda flat_params, t: \
            -1 * elbo(unflatten(flat_params), unpack_W, unpack_Ps, Ys, A, Cs,
                      etasq, num_sinkhorn, num_mcmc_samples, temp_prior, temp)

    # Define a callback to monitor optimization progress
    elbos = []
    lls = []
    mses = []
    num_corrects = []
    times = []
    W_samples = []
    Ps_samples = []

    def collect_stats(params, t):
        if t % 10 == 0:
            W_samples.append([])
            Ps_samples.append([])
            for i in range(100):
                W, Ps = sample_q(unflatten(params), unpack_W, unpack_Ps, Cs,
                                 num_sinkhorn, temp)
                W_samples[-1].append(W)
                Ps_samples[-1].append(Ps)

        times.append(time.time())
        elbos.append(-1 * objective(params, 0))

        # Sample the variational posterior and compute num correct matches
        mu_W, log_sigmasq_W, log_mu_Ps = unflatten(params)
        W, Ps = sample_q(unflatten(params), unpack_W, unpack_Ps, Cs, 10, 1.0)
        list = []
        for i in range(A.shape[0]):
            list.extend(np.where(Ps[0, i, :] + Ps_true[0, i, :] == 1)[0])

        mses.append(np.mean((W * A - W_true * A)**2))

        # Round doubly stochastic matrix P to the nearest permutation matrix
        num_correct = np.zeros(M)
        Ps2 = np.zeros((Ps.shape[0], A.shape[0], A.shape[0]))
        for m, P in enumerate(Ps):
            row, col = linear_sum_assignment(-P + 1e8 * (1 - Cs[m]))
            Ps2[m] = perm_to_P(col)
            num_correct[m] = n_correct(perm_to_P(col), Ps_true[m])
        num_corrects.append(num_correct)
        lls.append(log_likelihood(Ys, A, W, Ps2, etasq) / (M * T * N))

    def callback(params, t, g):
        collect_stats(params, t)
        print("Iteration {}. ELBO: {:.4f} LL: {:.4f} MSE(W): {:.4f}, Num Correct: {}"
              .format(t, elbos[-1], lls[-1], mses[-1], num_corrects[-1]))

    # Run optimizer
    callback(flat_params, -1, None)
    variational_params = adam(grad(objective), flat_params,
                              step_size=stepsize, num_iters=num_iters,
                              callback=callback)

    times = np.array(times)
    times -= times[0]
    return times, np.array(elbos), np.array(lls), np.array(mses), \
        np.array(num_corrects), Ps_samples, W_samples, A, W_true
def l2_norm(params):
    flattened, _ = flatten(params)
    return np.dot(flattened, flattened)
def l2_norm(params):
    """Computes l2 norm of params by flattening them into a vector."""
    flattened, _ = flatten(params)
    return np.dot(flattened, flattened)
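# A tiny usage sketch for l2_norm as a weight-decay penalty (illustrative only;
# the params dict and reg_strength below are hypothetical, and autograd.numpy
# is assumed as np).
reg_strength = 1e-3
params = {'W': np.ones((2, 2)), 'b': np.zeros(2)}
penalty = reg_strength * l2_norm(params)  # squared l2 norm of all parameter entries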
log1pexp = primitive(lambda x: np.log1p(np.exp(x)))
log1pexp.defgrad(lambda ans, x: lambda g: g / (1 + np.exp(-x)))

normalize = lambda x: x / np.sum(x, axis=-1, keepdims=True)
softmax = lambda x: normalize(np.exp(x - np.max(x, axis=-1, keepdims=True)))


### misc

def rle(stateseq):
    pos, = np.where(np.diff(stateseq) != 0)
    pos = np.concatenate(([0], pos + 1, [len(stateseq)]))
    return stateseq[pos[:-1]], np.diff(pos)

isarray = lambda x: hasattr(x, 'ndim')

flat = lambda x: flatten(x)[0]

partial_flat = lambda a, axes: np.reshape(a, a.shape[:-axes] + (-1, ))
tensordot = lambda a, b, axes=2: np.dot(partial_flat(a, axes),
                                        partial_flat(b, axes).T)
outer = lambda x, y: x[..., :, None] * y[..., None, :]


### functions and monads

def compose(funcs):
    def composition(x):
        for f in funcs:
            x = f(x)
        return x
    return composition
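# A quick illustration of rle (run-length encoding of a discrete state
# sequence); the example sequence is made up, and numpy is assumed as np.
states, durations = rle(np.array([1, 1, 2, 2, 2, 0]))
# states    -> array([1, 2, 0])
# durations -> array([2, 3, 1])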
def f(x, y):
    xy, _ = flatten([x, y])
    return np.sum(xy)
def scalar_args_fun(*new_args):
    full_args = list(args)
    for i, argnum in enumerate(argnums):
        wrt_flat, unflatten = flatten(wrt_args[i])
        full_args[argnum] = unflatten(wrt_flat + new_args[i] * rand_vecs[i])
    return to_scalar(fun(*full_args, **kwargs))
def l2_norm(params):
    flattened, _ = flatten(params)
    return auto_np.dot(flattened, flattened)