def make_gradfun(run_inference, recognize, loglike, pgm_prior, data,
                 batch_size, num_samples, natgrad_scale=1., callback=None):
    _, unflat = flatten(pgm_prior)
    num_datapoints = get_num_datapoints(data)
    data_batches, num_batches = split_into_batches(data, batch_size)
    get_batch = lambda i: data_batches[i % num_batches]
    saved = lambda: None  # dummy namespace for stashing sufficient statistics

    def mc_elbo(pgm_params, loglike_params, recogn_params, i):
        nn_potentials = recognize(recogn_params, get_batch(i))
        samples, saved.stats, global_kl, local_kl = \
            run_inference(pgm_prior, pgm_params, nn_potentials, num_samples)
        return (num_batches * loglike(loglike_params, samples, get_batch(i))
                - global_kl - num_batches * local_kl) / num_datapoints

    def gradfun(params, i):
        pgm_params, loglike_params, recogn_params = params
        objective = lambda (loglike_params, recogn_params): \
            -mc_elbo(pgm_params, loglike_params, recogn_params, i)
        val, (loglike_grad, recogn_grad) = \
            vgrad(objective)((loglike_params, recogn_params))
        # this expression for pgm_natgrad drops a term that can be computed using
        # the function autograd.misc.fixed_points.fixed_point
        pgm_natgrad = -natgrad_scale / num_datapoints * \
            (flat(pgm_prior) + num_batches*flat(saved.stats) - flat(pgm_params))
        grad = unflat(pgm_natgrad), loglike_grad, recogn_grad
        if callback: callback(i, val, params, grad)
        return grad

    return gradfun
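# The make_gradfun listings in this section call both autograd's flatten (which returns
# a flattened vector together with an unflatten function) and a flat helper that returns
# only the vector. A minimal sketch of how flat could be defined, assuming a recent
# autograd where flatten lives in autograd.misc; this is an illustrative assumption,
# not the original utility code.
from autograd.misc import flatten

def flat(struct):
    # flatten an arbitrarily nested container of arrays into a single 1-D vector
    vec, _unflatten = flatten(struct)
    return vec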
def adadelta(allparams, nat_stepsize, num_epochs, seq_len, num_seqs=None,
             rho=0.95, epsilon=1e-6, num_samples=1, permute=True):
    natparams, params = allparams[:1], allparams[1:]
    sum_gsq = zeros_like(params)  # accumulated sq. grads
    sum_usq = zeros_like(params)  # accumulated sq. updates
    accumulate = lambda a, b: add(scale(rho, a), scale(1 - rho, b))
    for epoch in xrange(num_epochs):
        vals = []
        batches, num_batches = split_into_batches(data, seq_len, num_seqs)
        for y in batches:
            val, grad = scale(1. / num_datapoints,
                              val_and_grad(y, num_batches, num_samples, *allparams))
            natgrad, grad = grad[:1], grad[1:]
            sum_gsq = accumulate(sum_gsq, square(grad))
            diag_scaling = div(sqrt(add_scalar(epsilon, sum_usq)),
                               sqrt(add_scalar(epsilon, sum_gsq)))
            update = mul(diag_scaling, grad)
            sum_usq = accumulate(sum_usq, square(update))
            natparams = add(natparams, scale(nat_stepsize, natgrad))
            params = add(params, update)
            allparams = concat(natparams, params)
            vals.append(val)
        if callback: callback(epoch, vals, natgrad, allparams)
    return allparams
def adam(allparams, nat_stepsize, stepsize, num_epochs, seq_len, num_seqs=None,
         b1=0.9, b2=0.999, eps=1e-8, num_samples=1):
    natparams, params = allparams[:1], allparams[1:]
    m = zeros_like(params)
    v = zeros_like(params)
    i = 0
    accumulate = lambda rho, a, b: add(scale(1 - rho, a), scale(rho, b))
    for epoch in xrange(num_epochs):
        vals = []
        batches, num_batches = split_into_batches(data, seq_len, num_seqs)
        for y in batches:
            val, grad = scale(1. / num_datapoints,
                              val_and_grad(y, num_batches, num_samples, *allparams))
            natgrad, grad = grad[:1], grad[1:]
            m = accumulate(b1, grad, m)              # first moment estimate
            v = accumulate(b2, square(grad), v)      # second moment estimate
            mhat = scale(1. / (1 - b1**(i + 1)), m)  # bias correction
            vhat = scale(1. / (1 - b2**(i + 1)), v)
            update = scale(stepsize, div(mhat, add_scalar(eps, sqrt(vhat))))
            natparams = add(natparams, scale(nat_stepsize, natgrad))
            params = add(params, update)
            allparams = concat(natparams, params)
            vals.append(val)
            i += 1
        if callback: callback(epoch, vals, natgrad, allparams)
    return allparams
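# The adadelta and adam listings above treat parameters as nested tuples of arrays and
# manipulate them with small container helpers (zeros_like, add, scale, mul, square,
# sqrt, div, add_scalar, concat); data, num_datapoints, val_and_grad, and callback are
# assumed to be available in the enclosing scope. A minimal sketch of how such helpers
# could be written over nested tuples of NumPy arrays; the names match the listings but
# the implementations are assumptions, not the original utilities.
import numpy as np

def _map1(f, a):
    # apply f leaf-wise, recursing into tuple structure
    return tuple(_map1(f, x) for x in a) if isinstance(a, tuple) else f(a)

def _map2(f, a, b):
    # apply f leaf-wise over two containers with matching structure
    return tuple(_map2(f, x, y) for x, y in zip(a, b)) if isinstance(a, tuple) else f(a, b)

zeros_like = lambda a: _map1(np.zeros_like, a)
square     = lambda a: _map1(np.square, a)
sqrt       = lambda a: _map1(np.sqrt, a)
add        = lambda a, b: _map2(np.add, a, b)
mul        = lambda a, b: _map2(np.multiply, a, b)
div        = lambda a, b: _map2(np.divide, a, b)
scale      = lambda c, a: _map1(lambda x: c * x, a)
add_scalar = lambda c, a: _map1(lambda x: c + x, a)
concat     = lambda a, b: a + b  # concatenate two parameter tuples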
def make_gradfun(run_inference, recognize, loglike, pgm_prior, pgm_expectedstats,
                 data, batch_size, num_samples, natgrad_scale=1., callback=None):
    _, unflat = flatten(pgm_prior)
    num_datapoints = get_num_datapoints(data)
    data_batches, num_batches = split_into_batches(data, batch_size)
    get_batch = lambda i: data_batches[i % num_batches]
    saved = lambda: None  # dummy namespace for stashing sufficient statistics

    def mc_elbo(pgm_params, pgm_stats, loglike_params, recogn_params, i):
        nn_potentials = recognize(recogn_params, get_batch(i))
        samples, saved.stats, global_kl, local_kl = \
            run_inference(pgm_prior, pgm_params, pgm_stats, nn_potentials, num_samples)
        return (num_batches * loglike(loglike_params, samples, get_batch(i))
                - global_kl - num_batches * local_kl) / num_datapoints

    def gradfun(params, i):
        pgm_params, loglike_params, recogn_params = params
        objective = lambda (pgm_stats, loglike_params, recogn_params): \
            -mc_elbo(pgm_params, pgm_stats, loglike_params, recogn_params, i)
        pgm_stats = pgm_expectedstats(pgm_params)
        val, (pgm_stats_grad, loglike_grad, recogn_grad) = \
            vgrad(objective)((pgm_stats, loglike_params, recogn_params))
        pgm_natgrad = -natgrad_scale / num_datapoints * \
            (flat(pgm_prior) + num_batches*(flat(saved.stats) + flat(pgm_stats_grad))
             - flat(pgm_params))
        grad = unflat(pgm_natgrad), loglike_grad, recogn_grad
        if callback: callback(i, val, params, grad)
        return grad

    return gradfun
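# A sketch of how the gradient function returned by make_gradfun might be driven by a
# plain stochastic gradient loop: a natural-gradient step on the PGM natural parameters
# and ordinary gradient steps on the network parameters. The fit function, its step
# sizes, and the _tree_sub helper are illustrative assumptions, not the original
# training code.
import numpy as np

def _tree_sub(params, grad, stepsize):
    # elementwise params - stepsize * grad over matching nested tuples/lists of arrays
    if isinstance(params, (tuple, list)):
        return type(params)(_tree_sub(p, g, stepsize) for p, g in zip(params, grad))
    return params - stepsize * np.asarray(grad)

def fit(gradfun, init_params, num_iters, pgm_stepsize=1e-2, nn_stepsize=1e-3):
    pgm_params, loglike_params, recogn_params = init_params
    for i in range(num_iters):
        # gradfun returns descent directions for the negative Monte Carlo ELBO
        pgm_natgrad, loglike_grad, recogn_grad = gradfun(
            (pgm_params, loglike_params, recogn_params), i)
        pgm_params     = _tree_sub(pgm_params,     pgm_natgrad,  pgm_stepsize)
        loglike_params = _tree_sub(loglike_params, loglike_grad, nn_stepsize)
        recogn_params  = _tree_sub(recogn_params,  recogn_grad,  nn_stepsize)
    return pgm_params, loglike_params, recogn_params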
def train(self, X, Y, sig2, rff_dim=1200, batch_size=16, epochs=16):
    model_graph = tf.Graph()
    model_sess = tf.Session(graph=model_graph)

    with model_graph.as_default():
        X_tr = tf.placeholder(dtype=tf.float64, shape=[None, self.dim_in])
        Y_true = tf.placeholder(dtype=tf.float64, shape=[None, 1])
        H_inv = tf.placeholder(dtype=tf.float64, shape=[rff_dim, rff_dim])
        Phi_y = tf.placeholder(dtype=tf.float64, shape=[rff_dim, 1])

        rff_layer = kernel_layers.RandomFourierFeatures(output_dim=rff_dim,
                                                        kernel_initializer='gaussian',
                                                        trainable=True)

        ## define model
        rff_output = tf.cast(rff_layer(X_tr) * np.sqrt(2. / rff_dim),
                             dtype=tf.float64)
        weight_cov = util.minibatch_woodbury_update(rff_output, H_inv)
        covl_xy = util.minibatch_interaction_update(Phi_y, rff_output, Y_true)

        random_feature_weight = rff_layer.kernel
        random_feature_bias = rff_layer.bias

    ### Training and Evaluation ###
    X_batches = util.split_into_batches(X, batch_size) * epochs
    Y_batches = util.split_into_batches(Y, batch_size) * epochs

    num_steps = len(X_batches)
    num_batch = int(num_steps / epochs)

    with model_sess as sess:
        sess.run(tf.global_variables_initializer())

        rff_1 = sess.run(rff_output, feed_dict={X_tr: X_batches[0]})
        weight_cov_val = util.compute_inverse(rff_1, sig_sq=sig2**2)
        covl_xy_val = np.matmul(rff_1.T, Y_batches[0])

        rff_weight, rff_bias = sess.run(
            [random_feature_weight, random_feature_bias])

        for batch_id in range(1, num_batch):
            X_batch = X_batches[batch_id]
            Y_batch = Y_batches[batch_id]

            ## update posterior mean/covariance
            try:
                weight_cov_val, covl_xy_val = sess.run(
                    [weight_cov, covl_xy],
                    feed_dict={X_tr: X_batch, Y_true: Y_batch,
                               H_inv: weight_cov_val, Phi_y: covl_xy_val})
            except:
                print("\n================================\n"
                      "Problem occurred at Step {}\n"
                      "================================".format(batch_id))

    self.beta = np.matmul(weight_cov_val, covl_xy_val)[:, 0]
    self.Sigma_beta = weight_cov_val * sig2**2
    self.RFF_weight = rff_weight  # (d, D)
    self.RFF_bias = rff_bias      # (D, )
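# A sketch of how the posterior computed by train() might be used at test time. It
# assumes the feature map phi(x) = sqrt(2/D) * cos(x W + b) implied by the scaling
# applied to rff_layer above, with W = self.RFF_weight and b = self.RFF_bias; the
# predict method and its return convention are illustrative assumptions.
import numpy as np

def predict(self, X_new):
    rff_dim = self.RFF_weight.shape[1]
    # fixed random Fourier feature map, matching the scaling used during training
    phi = np.sqrt(2. / rff_dim) * np.cos(
        np.matmul(X_new, self.RFF_weight) + self.RFF_bias)
    mean = np.matmul(phi, self.beta)                             # posterior predictive mean
    var = np.sum(np.matmul(phi, self.Sigma_beta) * phi, axis=1)  # per-point predictive variance
    return mean, var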