def main(_):
    ed.set_seed(FLAGS.seed)

    # set up the output directory
    outdir = FLAGS.outdir
    if '~' in outdir:
        outdir = os.path.expanduser(outdir)
    os.makedirs(outdir, exist_ok=True)

    is_vector = FLAGS.base_dist in ['mvnormal', 'mvlaplace']

    ((Xtrain, ytrain), (Xtest, ytest)) = blr_utils.get_data()
    N, D = Xtrain.shape
    N_test, D_test = Xtest.shape
    assert D_test == D, 'Test dimension %d different than train %d' % (D_test,
                                                                       D)
    logger.info('D = %d, Ntrain = %d, Ntest = %d' % (D, N, N_test))

    # Solution components
    weights, q_params = [], []
    # L-continuous gradient estimate
    lipschitz_estimate = None

    # Metrics to log
    times_filename = os.path.join(outdir, 'times.csv')
    open(times_filename, 'w').close()

    # (mean, +- std)
    elbos_filename = os.path.join(outdir, 'elbos.csv')
    logger.info('saving elbos to, %s' % elbos_filename)
    open(elbos_filename, 'w').close()

    rocs_filename = os.path.join(outdir, 'roc.csv')
    logger.info('saving rocs to, %s' % rocs_filename)
    open(rocs_filename, 'w').close()

    gap_filename = os.path.join(outdir, 'gap.csv')
    open(gap_filename, 'w').close()

    step_filename = os.path.join(outdir, 'steps.csv')
    open(step_filename, 'w').close()

    # (mean, std)
    ll_train_filename = os.path.join(outdir, 'll_train.csv')
    open(ll_train_filename, 'w').close()

    ll_test_filename = os.path.join(outdir, 'll_test.csv')
    open(ll_test_filename, 'w').close()

    # (bin_ac_train, bin_ac_test)
    bin_ac_filename = os.path.join(outdir, 'bin_ac.csv')
    open(bin_ac_filename, 'w').close()

    # 'adafw', 'ada_afw', 'ada_pfw'
    if FLAGS.fw_variant.startswith('ada'):
        lipschitz_filename = os.path.join(outdir, 'lipschitz.csv')
        open(lipschitz_filename, 'w').close()

        iter_info_filename = os.path.join(outdir, 'iter_info.txt')
        open(iter_info_filename, 'w').close()

    for t in range(FLAGS.n_fw_iter):
        g = tf.Graph()
        with g.as_default():
            sess = tf.InteractiveSession()
            with sess.as_default():
                tf.set_random_seed(FLAGS.seed)

                # Build model
                w = Normal(loc=tf.zeros(D, tf.float32),
                           scale=tf.ones(D, tf.float32))
                X = tf.placeholder(tf.float32, [None, D])
                y = Bernoulli(logits=ed.dot(X, w))

                p_joint = blr_utils.Joint(Xtrain, ytrain, sess,
                                          FLAGS.n_monte_carlo_samples,
                                          logger)

                # Vectorized model evaluations
                n_test_samples = 100
                W = tf.placeholder(tf.float32, [n_test_samples, D])
                y_data = tf.placeholder(tf.float32, [None])
                # N -> (N, n_test)
                y_data_matrix = tf.tile(tf.expand_dims(y_data, 1),
                                        (1, n_test_samples))
                pred_logits = tf.matmul(X, tf.transpose(W))  # (N, n_test)
                ypred = tf.sigmoid(tf.reduce_mean(pred_logits, axis=1))
                pY = Bernoulli(logits=pred_logits)  # (N, n_test)
                log_likelihoods = pY.log_prob(y_data_matrix)  # (N, n_test)
                log_likelihood_expectation = tf.reduce_mean(
                    log_likelihoods, axis=1)  # (N, )
                ll_mean, ll_std = tf.nn.moments(log_likelihood_expectation,
                                                axes=[0])

                if t == 0:
                    fw_iterates = {}
                else:
                    # Current solution
                    prev_components = [
                        coreutils.base_loc_scale(FLAGS.base_dist,
                                                 c['loc'],
                                                 c['scale'],
                                                 multivariate=is_vector)
                        for c in q_params
                    ]
                    qtw_prev = coreutils.get_mixture(weights,
                                                     prev_components)
                    fw_iterates = {w: qtw_prev}

                # s is the solution to the LMO, randomly initialized
                s = coreutils.construct_base(FLAGS.base_dist, [D], t, 's',
                                             multivariate=is_vector)

                sess.run(tf.global_variables_initializer())

                total_time = 0.
                inference_time_start = time.time()
                # Run relbo to solve the LMO problem.
                # Selecting the first atom by running the LMO would be
                # equivalent to running VI against a uniform prior; since
                # the uniform is not in our variational family, use a
                # random element (without LMO inference) as the initial
                # iterate instead.
                if FLAGS.iter0 == 'vi' or t > 0:
                    inference = relbo.KLqp({w: s},
                                           fw_iterates=fw_iterates,
                                           data={X: Xtrain, y: ytrain},
                                           fw_iter=t)
                    inference.run(n_iter=FLAGS.LMO_iter)
                inference_time_end = time.time()
                # count only step size selection time
                #total_time += float(inference_time_end - inference_time_start)

                loc_s = s.mean().eval()
                scale_s = s.stddev().eval()

                # Evaluate the next step
                step_result = {}
                if t == 0:
                    # Initialization, q_0
                    q_params.append({'loc': loc_s, 'scale': scale_s})
                    weights.append(1.)
                    if FLAGS.fw_variant.startswith('ada'):
                        lipschitz_estimate = opt.adafw_linit(s, p_joint)
                    step_type = 'init'
                elif FLAGS.fw_variant == 'fixed':
                    start_step_time = time.time()
                    step_result = opt.fixed(weights, q_params, qtw_prev,
                                            loc_s, scale_s, s, p_joint, t)
                    end_step_time = time.time()
                    total_time += float(end_step_time - start_step_time)
                elif FLAGS.fw_variant == 'adafw':
                    start_step_time = time.time()
                    step_result = opt.adaptive_fw(weights, q_params,
                                                  qtw_prev, loc_s, scale_s,
                                                  s, p_joint, t,
                                                  lipschitz_estimate)
                    end_step_time = time.time()
                    total_time += float(end_step_time - start_step_time)
                    step_type = step_result['step_type']
                    if step_type == 'adaptive':
                        lipschitz_estimate = step_result['l_estimate']
                elif FLAGS.fw_variant == 'ada_pfw':
                    start_step_time = time.time()
                    step_result = opt.adaptive_pfw(weights, q_params,
                                                   qtw_prev, loc_s, scale_s,
                                                   s, p_joint, t,
                                                   lipschitz_estimate)
                    end_step_time = time.time()
                    total_time += float(end_step_time - start_step_time)
                    step_type = step_result['step_type']
                    if step_type in ['adaptive', 'drop']:
                        lipschitz_estimate = step_result['l_estimate']
                elif FLAGS.fw_variant == 'ada_afw':
                    start_step_time = time.time()
                    step_result = opt.adaptive_afw(weights, q_params,
                                                   qtw_prev, loc_s, scale_s,
                                                   s, p_joint, t,
                                                   lipschitz_estimate)
                    end_step_time = time.time()
                    total_time += float(end_step_time - start_step_time)
                    step_type = step_result['step_type']
                    if step_type in ['adaptive', 'away', 'drop']:
                        lipschitz_estimate = step_result['l_estimate']
                elif FLAGS.fw_variant == 'line_search':
                    start_step_time = time.time()
                    step_result = opt.line_search_dkl(weights, q_params,
                                                      qtw_prev, loc_s,
                                                      scale_s, s, p_joint, t)
                    end_step_time = time.time()
                    total_time += float(end_step_time - start_step_time)
                    step_type = step_result['step_type']
                else:
                    raise NotImplementedError(
                        'Step size variant %s not implemented' %
                        FLAGS.fw_variant)

                if t == 0:
                    gamma = 1.
                    new_components = [s]
                else:
                    q_params = step_result['params']
                    weights = step_result['weights']
                    gamma = step_result['gamma']
                    new_components = [
                        coreutils.base_loc_scale(FLAGS.base_dist,
                                                 c['loc'],
                                                 c['scale'],
                                                 multivariate=is_vector)
                        for c in q_params
                    ]
                qtw_new = coreutils.get_mixture(weights, new_components)

                # Log metrics for the current iteration
                logger.info('total time %f' % total_time)
                append_to_file(times_filename, total_time)

                elbo_t = elbo(qtw_new, p_joint, return_std=False)
                # test the ELBO directly from KLqp
                elbo_loss = elboModel.KLqp({w: qtw_new},
                                           data={X: Xtrain, y: ytrain})
                res_update = elbo_loss.run()
                logger.info("iter, %d, elbo, %.2f loss %.2f" %
                            (t, elbo_t, res_update['loss']))
                append_to_file(elbos_filename,
                               "%f,%f" % (elbo_t, res_update['loss']))

                logger.info('iter %d, gamma %.4f' % (t, gamma))
                append_to_file(step_filename, gamma)

                if t > 0:
                    gap_t = step_result['gap']
                    logger.info('iter %d, gap %.4f' % (t, gap_t))
                    append_to_file(gap_filename, gap_t)

                if FLAGS.fw_variant.startswith('ada'):
                    append_to_file(lipschitz_filename, lipschitz_estimate)
                    append_to_file(iter_info_filename, step_type)
                    logger.info('lt = %.5f, iter_type = %s' %
                                (lipschitz_estimate, step_type))

                # Get weight samples to evaluate expectations
                w_samples = qtw_new.sample([n_test_samples]).eval()
                ll_train_mean, ll_train_std = sess.run(
                    [ll_mean, ll_std],
                    feed_dict={W: w_samples, X: Xtrain, y_data: ytrain})
                logger.info("iter, %d, train ll, %.2f +/- %.2f" %
                            (t, ll_train_mean, ll_train_std))
                append_to_file(ll_train_filename,
                               "%f,%f" % (ll_train_mean, ll_train_std))

                ll_test_mean, ll_test_std, y_test_pred = sess.run(
                    [ll_mean, ll_std, ypred],
                    feed_dict={W: w_samples, X: Xtest, y_data: ytest})
                logger.info("iter, %d, test ll, %.2f +/- %.2f" %
                            (t, ll_test_mean, ll_test_std))
                append_to_file(ll_test_filename,
                               "%f,%f" % (ll_test_mean, ll_test_std))

                roc_score = roc_auc_score(ytest, y_test_pred)
                logger.info("iter %d, roc %.4f" % (t, roc_score))
                append_to_file(rocs_filename, roc_score)

                y_post = ed.copy(y, {w: qtw_new})
                # equivalent to y = Bernoulli(logits=ed.dot(X, qtw_new))

                ed_train_ll = ed.evaluate('log_likelihood',
                                          data={X: Xtrain, y_post: ytrain})
                ed_test_ll = ed.evaluate('log_likelihood',
                                         data={X: Xtest, y_post: ytest})
                logger.info("edward train ll %.2f test ll %.2f" %
                            (ed_train_ll, ed_test_ll))

                bin_ac_train = ed.evaluate('binary_accuracy',
                                           data={X: Xtrain, y_post: ytrain})
                bin_ac_test = ed.evaluate('binary_accuracy',
                                          data={X: Xtest, y_post: ytest})
                append_to_file(bin_ac_filename,
                               "%f,%f" % (bin_ac_train, bin_ac_test))
                logger.info("edward binary accuracy train %.2f test %.2f" %
                            (bin_ac_train, bin_ac_test))

                mse_test = ed.evaluate('mean_squared_error',
                                       data={X: Xtest, y_post: ytest})
                logger.info("edward mse test %.2f" % mse_test)

            sess.close()
        tf.reset_default_graph()
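# --- Illustration (not part of the original module) -------------------------
# The vectorized evaluation block in main() scores all N data points against
# S posterior samples at once: logits = X W^T has shape (N, S), and the
# per-point predictive log-likelihood averages over the sample axis. Below is
# a minimal NumPy sketch of that shape logic; the helper name
# `mc_predictive_ll` is hypothetical, with X: (N, D), y: (N,) in {0, 1},
# w_samples: (S, D).

import numpy as np


def mc_predictive_ll(X, y, w_samples):
    """Monte Carlo estimate of the per-point predictive log-likelihood."""
    logits = X @ w_samples.T                              # (N, S)
    # Bernoulli log-likelihood in a numerically stable form:
    # log p(y | logit) = y * logit - log(1 + exp(logit))
    ll = y[:, None] * logits - np.logaddexp(0.0, logits)  # (N, S)
    per_point = ll.mean(axis=1)                           # (N,)
    return per_point.mean(), per_point.std()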
def main(argv):
    del argv

    outdir = FLAGS.outdir
    if '~' in outdir:
        outdir = os.path.expanduser(outdir)
    os.makedirs(outdir, exist_ok=True)

    # Files to log metrics
    times_filename = os.path.join(outdir, 'times.csv')
    elbos_filename = os.path.join(outdir, 'elbos.csv')
    objective_filename = os.path.join(outdir, 'kl.csv')
    reference_filename = os.path.join(outdir, 'ref_kl.csv')
    step_filename = os.path.join(outdir, 'steps.csv')
    # 'adafw', 'ada_afw', 'ada_pfw'
    if FLAGS.fw_variant.startswith('ada'):
        curvature_filename = os.path.join(outdir, 'curvature.csv')
        gap_filename = os.path.join(outdir, 'gap.csv')
        iter_info_filename = os.path.join(outdir, 'iter_info.txt')
    elif FLAGS.fw_variant == 'line_search':
        goutdir = os.path.join(outdir, 'gradients')

    # Empty any files already present in the folder
    open(times_filename, 'w').close()
    open(elbos_filename, 'w').close()
    open(objective_filename, 'w').close()
    open(reference_filename, 'w').close()
    open(step_filename, 'w').close()
    # 'adafw', 'ada_afw', 'ada_pfw'
    if FLAGS.fw_variant.startswith('ada'):
        open(curvature_filename, 'w').close()
        append_to_file(curvature_filename, "c_local,c_global")
        open(gap_filename, 'w').close()
        open(iter_info_filename, 'w').close()
    elif FLAGS.fw_variant == 'line_search':
        os.makedirs(goutdir, exist_ok=True)

    for i in range(FLAGS.n_fw_iter):
        # NOTE: the first iteration (t = 0) is initialization
        g = tf.Graph()
        with g.as_default():
            tf.set_random_seed(FLAGS.seed)
            sess = tf.InteractiveSession()
            with sess.as_default():
                p, mus, stds = create_target_dist()

                # current iterate (solution so far)
                if FLAGS.init == 'random':
                    muq = np.random.randn(D).astype(np.float32)
                    stdq = softplus(np.random.randn(D).astype(np.float32))
                    raise ValueError
                else:
                    muq = mus[0]
                    stdq = stds[0]

                # 1 correct LMO
                t = 1
                comps = [{'loc': muq, 'scale_diag': stdq}]
                weights = [1.0]
                curvature_estimate = opt.adafw_linit()

                qtx = MultivariateNormalDiag(
                    loc=tf.convert_to_tensor(muq, dtype=tf.float32),
                    scale_diag=tf.convert_to_tensor(stdq, dtype=tf.float32))
                fw_iterates = {p: qtx}

                # calculate the KL divergence with 1 component
                objective_old = kl_divergence(qtx, p).eval()
                logger.info("kl with init %.4f" % objective_old)
                append_to_file(reference_filename, objective_old)

                # s is the solution to the LMO. It is initialized randomly:
                # mu ~ N(0, 1), std ~ softplus(N(0, 1))
                s = coreutils.construct_multivariatenormaldiag([D], t, 's')

                sess.run(tf.global_variables_initializer())

                total_time = 0
                start_inference_time = time.time()
                if FLAGS.LMO == 'vi':
                    # we have to iterate over parameter space
                    raise ValueError
                inference = relbo.KLqp({p: s}, fw_iterates=fw_iterates,
                                       fw_iter=t)
                inference.run(n_iter=FLAGS.LMO_iter)
                # s now contains the solution to the LMO
                end_inference_time = time.time()

                mu_s = s.mean().eval()
                cov_s = s.stddev().eval()

                # NOTE: keep only step size time
                #total_time += end_inference_time - start_inference_time

                # compute the step size to update the next iterate
                step_result = {}
                if FLAGS.fw_variant == 'fixed':
                    gamma = 2. / (t + 2.)
                elif FLAGS.fw_variant == 'line_search':
                    start_line_search_time = time.time()
                    step_result = opt.line_search_dkl(
                        weights, [c['loc'] for c in comps],
                        [c['scale_diag'] for c in comps], qtx, mu_s, cov_s,
                        s, p, t)
                    end_line_search_time = time.time()
                    total_time += (end_line_search_time -
                                   start_line_search_time)
                    gamma = step_result['gamma']
                elif FLAGS.fw_variant == 'adafw':
                    start_adafw_time = time.time()
                    step_result = opt.adaptive_fw(
                        weights, [c['loc'] for c in comps],
                        [c['scale_diag'] for c in comps], qtx, mu_s, cov_s,
                        s, p, t, curvature_estimate)
                    end_adafw_time = time.time()
                    total_time += end_adafw_time - start_adafw_time
                    gamma = step_result['gamma']
                else:
                    raise NotImplementedError

                comps.append({'loc': mu_s, 'scale_diag': cov_s})
                weights = [(1. - gamma), gamma]

                c_global = estimate_global_curvature(comps, qtx)

                q_latest = Mixture(
                    cat=Categorical(probs=tf.convert_to_tensor(weights)),
                    components=[MultivariateNormalDiag(**c) for c in comps])

                # Log metrics for the current iteration
                time_t = float(total_time)
                logger.info('total time %f' % time_t)
                append_to_file(times_filename, time_t)

                elbo_t = elbo(q_latest, p, n_samples=1000)
                logger.info("iter, %d, elbo, %.2f +/- %.2f" %
                            (t, elbo_t[0], elbo_t[1]))
                append_to_file(elbos_filename,
                               "%f,%f" % (elbo_t[0], elbo_t[1]))

                logger.info('iter %d, gamma %.4f' % (t, gamma))
                append_to_file(step_filename, gamma)

                objective_t = kl_divergence(q_latest, p).eval()
                logger.info("run %d, kl %.4f" % (i, objective_t))
                append_to_file(objective_filename, objective_t)

                if FLAGS.fw_variant.startswith('ada'):
                    curvature_estimate = step_result['c_estimate']
                    append_to_file(gap_filename, step_result['gap'])
                    append_to_file(iter_info_filename,
                                   step_result['step_type'])
                    logger.info('gap = %.3f, ct = %.5f, iter_type = %s' %
                                (step_result['gap'],
                                 step_result['c_estimate'],
                                 step_result['step_type']))
                    append_to_file(curvature_filename,
                                   '%f,%f' % (curvature_estimate, c_global))
                elif FLAGS.fw_variant == 'line_search':
                    n_line_search_samples = step_result['n_samples']
                    grad_t = step_result['grad_gamma']
                    g_outfile = os.path.join(
                        goutdir, 'line_search_samples_%d.npy.%d' %
                        (n_line_search_samples, t))
                    logger.info('saving line search data to, %s' % g_outfile)
                    np.save(open(g_outfile, 'wb'), grad_t)

            sess.close()
        tf.reset_default_graph()
def adaptive_fw(weights, params, q_t, mu_s, cov_s, s_t, p, k, l_prev,
                gap=None):
    """Adaptive Frank-Wolfe algorithm.

    Sets the step size as suggested in Algorithm 1 of
    https://arxiv.org/pdf/1806.05123.pdf

    Args:
        weights: [k], weights of the mixture components of q_t
        params: list of dictionaries of mixture params ('mu', 'scale')
        q_t: current mixture iterate q_t
        mu_s: [dim], mean of the LMO solution s
        cov_s: [dim], covariance matrix of the LMO solution s
        s_t: current atom, the LMO solution s
        p: edward.model, target distribution p
        k: iteration number of Frank-Wolfe
        l_prev: previous Lipschitz estimate
        gap: duality gap (if already computed)
    Returns:
        a dictionary containing gamma, new weights, new parameters,
        the Lipschitz estimate, the duality gap of the current iterate
        and step information
    """
    # FIXME
    is_vector = FLAGS.base_dist in ['mvnormal', 'mvlaplace']

    d_t_norm = divergence(s_t, q_t, metric=FLAGS.distance_metric).eval()
    logger.info('\ndistance norm is %.3e' % d_t_norm)

    N_samples = FLAGS.n_monte_carlo_samples
    if gap is None:
        # create and sample from s_t, q_t
        sample_q = q_t.sample([N_samples])
        sample_s = s_t.sample([N_samples])
        step_s = tf.reduce_mean(grad_elbo(q_t, p, sample_s)).eval()
        step_q = tf.reduce_mean(grad_elbo(q_t, p, sample_q)).eval()
        gap = step_q - step_s
    logger.info('duality gap %.3e' % gap)
    if gap < 0:
        # return the fixed-step estimate if the gap is negative
        logger.warning("Duality gap is negative, returning fixed step")
        return fixed(weights, params, q_t, mu_s, cov_s, s_t, p, k, gap)

    gamma = 2. / (k + 2.)
    tau = FLAGS.exp_adafw
    eta = FLAGS.damping_adafw
    # NOTE: this is from v1 of the paper; the new version
    # replaces the multiplicative eta with a divisor eta
    pow_tau = 1.0
    i, l_t = 0, l_prev
    # The objective in this case is -ELBO
    f_t = -elbo(q_t, p, N_samples, return_std=False)
    debug('f(q_t) = %.3e' % f_t)
    while gamma >= MIN_GAMMA and i < FLAGS.adafw_MAXITER:
        # compute $L_t$ and $\gamma_t$
        l_t = pow_tau * eta * l_prev
        gamma = min(gap / (l_t * d_t_norm), 1.0)

        d_1 = -gamma * gap
        d_2 = gamma * gamma * l_t * d_t_norm / 2.
        debug('linear d1 = %.3e, quad d2 = %.3e' % (d_1, d_2))
        quad_bound_rhs = f_t + d_1 + d_2

        # $w_{t + 1} = [(1 - \gamma)w_t, \gamma]$
        # Handle the case gamma = 1.0 separately: weights might not
        # become exactly 0 because of precision issues, and 0-weight
        # components should be removed.
        if gamma != 1.0:
            new_weights = copy.copy(weights)
            new_weights = [(1. - gamma) * w for w in new_weights]
            new_weights.append(gamma)
            new_params = copy.copy(params)
            new_params.append({'loc': mu_s, 'scale': cov_s})
            new_components = [
                coreutils.base_loc_scale(FLAGS.base_dist,
                                         c['loc'],
                                         c['scale'],
                                         multivariate=is_vector)
                for c in new_params
            ]
        else:
            new_weights = [1.]
            new_params = [{'loc': mu_s, 'scale': cov_s}]
            new_components = [s_t]

        qt_new = coreutils.get_mixture(new_weights, new_components)
        quad_bound_lhs = -elbo(qt_new, p, N_samples, return_std=False)
        logger.info('lt = %.3e, gamma = %.3f, f_(qt_new) = %.3e, '
                    'linear extrapolated = %.3e' %
                    (l_t, gamma, quad_bound_lhs, quad_bound_rhs))
        if quad_bound_lhs <= quad_bound_rhs:
            # Adaptive loop succeeded
            return {
                'gamma': gamma,
                'l_estimate': l_t,
                'weights': new_weights,
                'params': new_params,
                'gap': gap,
                'step_type': 'adaptive'
            }
        pow_tau *= tau
        i += 1

    # gamma fell below MIN_GAMMA
    logger.warning("gamma below threshold value, returning fixed step")
    return fixed(weights, params, q_t, mu_s, cov_s, s_t, p, k, gap)
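# --- Illustration (not part of the original module) -------------------------
# adaptive_fw above searches for a step size satisfying the quadratic upper
# bound of Algorithm 1 in https://arxiv.org/pdf/1806.05123.pdf: it tries
# L_t = eta * tau^i * L_{t-1} and gamma = min(gap / (L_t * d), 1) until
# f(q_new) <= f(q_t) - gamma * gap + gamma^2 * L_t * d / 2 holds. Below is a
# minimal standalone sketch of that backtracking loop on a toy deterministic
# objective; the name `toy_adafw_step` is hypothetical, and the real code
# evaluates f via Monte Carlo ELBO estimates.

def toy_adafw_step(f, x, direction, gap, d_norm, l_prev,
                   eta=0.99, tau=2.0, max_iter=32, min_gamma=1e-6):
    """Return (gamma, l_t) meeting the sufficient-decrease bound, or None."""
    f_x = f(x)
    pow_tau = 1.0
    for _ in range(max_iter):
        l_t = pow_tau * eta * l_prev
        gamma = min(gap / (l_t * d_norm), 1.0)
        if gamma < min_gamma:
            break
        rhs = f_x - gamma * gap + 0.5 * gamma * gamma * l_t * d_norm
        if f(x + gamma * direction) <= rhs:
            return gamma, l_t  # curvature estimate accepted
        pow_tau *= tau         # inflate the estimate and retry
    return None  # caller falls back to the fixed 2 / (k + 2) step

# e.g. toy_adafw_step(lambda z: 0.5 * z * z, 3.0, -3.0, 9.0, 9.0, 0.5)
# returns roughly (0.505, 1.98) after two rejected trials.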
def adaptive_afw(weights, params, q_t, mu_s, cov_s, s_t, p, k, l_prev):
    """Adaptive Away-Steps algorithm.

    Args:
        weights: [k], weights of the mixture components of q_t
        params: list of dictionaries of mixture params ('mu', 'scale')
        q_t: current mixture iterate q_t
        mu_s: [dim], mean of the LMO solution s
        cov_s: [dim], covariance matrix of the LMO solution s
        s_t: current atom, the LMO solution s
        p: edward.model, target distribution p
        k: iteration number of Frank-Wolfe
        l_prev: previous Lipschitz estimate
    Returns:
        a dictionary containing gamma, new weights, new parameters,
        the Lipschitz estimate, the duality gap of the current iterate
        and step information
    """
    # FIXME
    is_vector = FLAGS.base_dist in ['mvnormal', 'mvlaplace']

    d_t_norm = divergence(s_t, q_t, metric=FLAGS.distance_metric).eval()
    logger.info('\ndistance norm is %.3e' % d_t_norm)

    # Find v_t
    qcomps = q_t.components
    index_v_t, step_v_t = argmax_grad_dotp(p, q_t, qcomps,
                                           FLAGS.n_monte_carlo_samples)
    v_t = qcomps[index_v_t]

    # Frank-Wolfe gap
    N_samples = FLAGS.n_monte_carlo_samples
    sample_q = q_t.sample([N_samples])
    sample_s = s_t.sample([N_samples])
    step_s = tf.reduce_mean(grad_elbo(q_t, p, sample_s)).eval()
    step_q = tf.reduce_mean(grad_elbo(q_t, p, sample_q)).eval()
    gap_fw = step_q - step_s
    if gap_fw < 0:
        logger.warning("Frank-Wolfe duality gap is negative")
    # Away gap
    gap_a = step_v_t - step_q
    if gap_a < 0:
        eprint('Away gap < 0!!!')
    logger.info('fw gap %.3e, away gap %.3e' % (gap_fw, gap_a))

    if (gap_fw >= gap_a) or (len(params) == 1):
        # FW direction; proceed exactly as in adafw
        logger.info('Proceeding in FW direction ')
        return adaptive_fw(weights, params, q_t, mu_s, cov_s, s_t, p, k,
                           l_prev, gap_fw)

    # Away direction
    logger.info('Proceeding in Away direction ')
    adaptive_step_type = 'away'
    gap = gap_a
    if weights[index_v_t] < 1.0:
        MAX_GAMMA = weights[index_v_t] / (1.0 - weights[index_v_t])
    else:
        MAX_GAMMA = 100.  # a large value when t = 1

    gamma = 2. / (k + 2.)
    tau = FLAGS.exp_adafw
    eta = FLAGS.damping_adafw
    pow_tau = 1.0
    i, l_t = 0, l_prev
    f_t = -elbo(q_t, p, N_samples, return_std=False)
    debug('f(q_t) = %.5f' % f_t)
    is_drop_step = False
    while gamma >= MIN_GAMMA and i < FLAGS.adafw_MAXITER:
        # compute $L_t$ and $\gamma_t$
        l_t = pow_tau * eta * l_prev
        # NOTE: handle extreme values of gamma carefully
        gamma = min(gap / (l_t * d_t_norm), MAX_GAMMA)

        d_1 = -gamma * gap
        d_2 = gamma * gamma * l_t * d_t_norm / 2.
        debug('linear d1 = %.5f, quad d2 = %.5f' % (d_1, d_2))
        quad_bound_rhs = f_t + d_1 + d_2

        # construct $q_{t + 1}$
        new_weights = copy.copy(weights)
        new_params = copy.copy(params)
        if gamma == MAX_GAMMA:
            # drop v_t
            is_drop_step = True
            del new_weights[index_v_t]
            new_weights = [(1. + gamma) * w for w in new_weights]
            del new_params[index_v_t]
        else:
            is_drop_step = False
            new_weights = [(1. + gamma) * w for w in new_weights]
            new_weights[index_v_t] -= gamma

        new_components = [
            coreutils.base_loc_scale(FLAGS.base_dist,
                                     c['loc'],
                                     c['scale'],
                                     multivariate=is_vector)
            for c in new_params
        ]
        qt_new = coreutils.get_mixture(new_weights, new_components)
        quad_bound_lhs = -elbo(qt_new, p, N_samples, return_std=False)
        logger.info('lt = %.3e, gamma = %.3f, f_(qt_new) = %.3e, '
                    'linear extrapolated = %.3e' %
                    (l_t, gamma, quad_bound_lhs, quad_bound_rhs))
        if quad_bound_lhs <= quad_bound_rhs:
            return {
                'gamma': gamma,
                'l_estimate': l_t,
                'weights': new_weights,
                'params': new_params,
                'gap': gap,
                'step_type': "drop" if is_drop_step else "away"
            }
        pow_tau *= tau
        i += 1

    # gamma fell below MIN_GAMMA
    logger.warning("gamma below threshold value, returning fixed step")
    return fixed(weights, params, q_t, mu_s, cov_s, s_t, p, k, gap)
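# --- Illustration (not part of the original module) -------------------------
# The away-step weight update above rescales every weight by (1 + gamma) and
# subtracts gamma from the away atom v_t, which keeps the weights normalized:
# sum((1 + gamma) * w) - gamma = 1. At the cap gamma_max = w_v / (1 - w_v) the
# weight of v_t hits exactly zero and the atom is dropped. A tiny sketch of
# that arithmetic (hypothetical helper `away_step_weights`, for illustration
# only):

def away_step_weights(weights, index_v, gamma, atol=1e-12):
    """Return the updated, still-normalized weight list."""
    gamma_max = weights[index_v] / (1.0 - weights[index_v])
    assert gamma <= gamma_max + atol, 'gamma exceeds the away-step cap'
    new_w = [(1.0 + gamma) * w for w in weights]
    new_w[index_v] -= gamma
    if abs(new_w[index_v]) < atol:  # drop step
        del new_w[index_v]
    return new_w

# e.g. away_step_weights([0.5, 0.3, 0.2], 1, 0.1) -> [0.55, 0.23, 0.22],
# which still sums to 1.0.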
def adaptive_pfw(weights, params, q_t, mu_s, cov_s, s_t, p, k, l_prev):
    """Adaptive pairwise variant.

    Args:
        weights: [k], weights of the mixture components of q_t
        params: list of dictionaries of mixture params ('mu', 'scale')
        q_t: current mixture iterate q_t
        mu_s: [dim], mean of the LMO solution s
        cov_s: [dim], covariance matrix of the LMO solution s
        s_t: current atom, the LMO solution s
        p: edward.model, target distribution p
        k: iteration number of Frank-Wolfe
        l_prev: previous Lipschitz estimate
    Returns:
        a dictionary containing gamma, new weights, new parameters,
        the Lipschitz estimate, the duality gap of the current iterate
        and step information
    """
    # FIXME
    is_vector = FLAGS.base_dist in ['mvnormal', 'mvlaplace']

    d_t_norm = divergence(s_t, q_t, metric=FLAGS.distance_metric).eval()
    logger.info('\ndistance norm is %.3e' % d_t_norm)

    # Find v_t
    qcomps = q_t.components
    index_v_t, step_v_t = argmax_grad_dotp(p, q_t, qcomps,
                                           FLAGS.n_monte_carlo_samples)
    v_t = qcomps[index_v_t]

    # Pairwise gap
    N_samples = FLAGS.n_monte_carlo_samples
    sample_s = s_t.sample([N_samples])
    step_s = tf.reduce_mean(grad_elbo(q_t, p, sample_s)).eval()
    gap_pw = step_v_t - step_s
    logger.info('Pairwise gap %.3e' % gap_pw)
    if gap_pw <= 0:
        logger.warning('Pairwise gap <= 0, returning fixed step')
        return fixed(weights, params, q_t, mu_s, cov_s, s_t, p, k, gap_pw)

    gap = gap_pw
    MAX_GAMMA = weights[index_v_t]

    gamma = 2. / (k + 2.)
    tau = FLAGS.exp_adafw
    eta = FLAGS.damping_adafw
    pow_tau = 1.0
    i, l_t = 0, l_prev
    f_t = -elbo(q_t, p, N_samples, return_std=False)
    debug('f(q_t) = %.3e' % f_t)
    is_drop_step = False
    while gamma >= MIN_GAMMA and i < FLAGS.adafw_MAXITER:
        # compute L_t and gamma_t
        l_t = pow_tau * eta * l_prev
        gamma = min(gap / (l_t * d_t_norm), MAX_GAMMA)

        d_1 = -gamma * gap
        d_2 = gamma * gamma * l_t * d_t_norm / 2.
        debug('linear d1 = %.5f, quad d2 = %.5f' % (d_1, d_2))
        quad_bound_rhs = f_t + d_1 + d_2

        # construct q_{t + 1};
        # handle the case gamma = MAX_GAMMA separately
        new_weights = copy.copy(weights)
        new_weights.append(gamma)
        new_params = copy.copy(params)
        new_params.append({'loc': mu_s, 'scale': cov_s})
        if gamma != MAX_GAMMA:
            new_weights[index_v_t] -= gamma
            is_drop_step = False
        else:
            # hardcoding to 0
            del new_weights[index_v_t]
            del new_params[index_v_t]
            is_drop_step = True

        new_components = [
            coreutils.base_loc_scale(FLAGS.base_dist,
                                     c['loc'],
                                     c['scale'],
                                     multivariate=is_vector)
            for c in new_params
        ]
        qt_new = coreutils.get_mixture(new_weights, new_components)
        quad_bound_lhs = -elbo(qt_new, p, N_samples, return_std=False)
        logger.info('lt = %.3e, gamma = %.3f, f_(qt_new) = %.3e, '
                    'linear extrapolated = %.3e' %
                    (l_t, gamma, quad_bound_lhs, quad_bound_rhs))
        if quad_bound_lhs <= quad_bound_rhs:
            # Adaptive loop succeeded
            return {
                'gamma': gamma,
                'l_estimate': l_t,
                'weights': new_weights,
                'params': new_params,
                'gap': gap,
                'step_type': 'drop' if is_drop_step else 'adaptive'
            }
        pow_tau *= tau
        i += 1

    # gamma fell below MIN_GAMMA
    logger.warning("gamma below threshold value, returning fixed step")
    return fixed(weights, params, q_t, mu_s, cov_s, s_t, p, k, gap)
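# --- Illustration (not part of the original module) -------------------------
# The pairwise step above moves mass gamma directly from the away atom v_t
# onto the new atom s, capped at MAX_GAMMA = w_v; at the cap, all of v_t's
# mass is transferred and v_t leaves the active set (the drop case). A tiny
# sketch of that weight transfer (hypothetical helper
# `pairwise_step_weights`, for illustration only):

def pairwise_step_weights(weights, index_v, gamma, atol=1e-12):
    """Transfer gamma from atom index_v to a newly appended atom."""
    assert gamma <= weights[index_v] + atol, 'gamma exceeds w_v'
    new_w = list(weights) + [gamma]
    new_w[index_v] -= gamma
    if abs(new_w[index_v]) < atol:  # drop step: v_t leaves the active set
        del new_w[index_v]
    return new_w

# e.g. pairwise_step_weights([0.6, 0.4], 1, 0.4) -> [0.6, 0.4]
# (all of v_t's mass moved to the new atom, v_t dropped)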
def run(self, outdir, pi, mus, stds, n_features):
    """Run Boosted BBVI.

    Args:
        outdir: output directory
        pi: weights of target mixture
        mus: means of target mixture
        stds: scale of target mixture
        n_features: dimensionality
    Returns:
        runs FLAGS.n_fw_iter iterations of Frank-Wolfe and logs
        relevant metrics
    """
    # comps: component atoms of boosting (each a dict of params)
    # weights: weight given to each atom in comps
    # Together S = {weights, comps} make the active set
    weights, comps = [], []
    # L-continuous gradient estimate
    lipschitz_estimate = None

    #debug('target', mus, stds)

    start = 0
    if FLAGS.restore:
        # 1 correct LMO
        start = 1
        comps.append({'loc': mus[0], 'scale_diag': stds[0]})
        weights.append(1.0)
        lipschitz_estimate = opt.adafw_linit(None, None)

    # Metrics to log
    times_filename = os.path.join(outdir, 'times.csv')
    open(times_filename, 'w').close()  # truncate the file if it exists

    elbos_filename = os.path.join(outdir, 'elbos.csv')
    logger.info("saving elbos to, %s" % elbos_filename)
    open(elbos_filename, 'w').close()

    relbos_filename = os.path.join(outdir, 'relbos.csv')
    logger.info('saving relbos to, %s' % relbos_filename)
    open(relbos_filename, 'w').close()

    objective_filename = os.path.join(outdir, 'kl.csv')
    logger.info("saving kl divergence to, %s" % objective_filename)
    if not FLAGS.restore:
        open(objective_filename, 'w').close()

    step_filename = os.path.join(outdir, 'steps.csv')
    logger.info("saving gamma values to, %s" % step_filename)
    if not FLAGS.restore:
        open(step_filename, 'w').close()

    # 'adafw', 'ada_afw', 'ada_pfw'
    if FLAGS.fw_variant.startswith('ada'):
        lipschitz_filename = os.path.join(outdir, 'lipschitz.csv')
        open(lipschitz_filename, 'w').close()

        gap_filename = os.path.join(outdir, 'gap.csv')
        open(gap_filename, 'w').close()

        iter_info_filename = os.path.join(outdir, 'iter_info.txt')
        open(iter_info_filename, 'w').close()
    elif FLAGS.fw_variant == 'line_search':
        goutdir = os.path.join(outdir, 'gradients')
        os.makedirs(goutdir, exist_ok=True)

    for t in range(start, start + FLAGS.n_fw_iter):
        # NOTE: the first iteration (t = 0) is initialization
        g = tf.Graph()
        with g.as_default():
            tf.set_random_seed(FLAGS.seed)
            sess = tf.InteractiveSession()
            with sess.as_default():
                # build target distribution
                p = self.target_dist(pi=pi, mus=mus, stds=stds)

                if t == 0:
                    fw_iterates = {}
                else:
                    # current iterate (solution so far)
                    qtx = Mixture(
                        cat=Categorical(
                            probs=tf.convert_to_tensor(weights)),
                        components=[
                            MultivariateNormalDiag(**c) for c in comps
                        ])
                    fw_iterates = {p: qtx}

                # s is the solution to the LMO. It is initialized randomly
                #s = coreutils.construct_normal([n_features], t, 's')
                s = coreutils.construct_multivariatenormaldiag(
                    [n_features], t, 's')

                sess.run(tf.global_variables_initializer())

                total_time = 0
                start_inference_time = time.time()
                # Run inference on relbo to solve the LMO problem.
                # If the initialization of the mixture is random, the
                # first component is a random distribution, in which
                # case no inference is needed.
                # NOTE: KLqp has a side effect: it modifies s
                #if FLAGS.iter0 == 'vi' or t > 0:
                if FLAGS.iter0 == 'vi':
                    inference = relbo.KLqp({p: s},
                                           fw_iterates=fw_iterates,
                                           fw_iter=t)
                    inference.run(n_iter=FLAGS.LMO_iter)
                # s now contains the solution to the LMO
                end_inference_time = time.time()

                mu_s = s.mean().eval()
                cov_s = s.stddev().eval()
                #debug('LMO', mu_s, cov_s)

                # NOTE: keep only step size time
                #total_time += end_inference_time - start_inference_time

                # compute the step size to update the next iterate
                step_result = {}
                if t == 0:
                    gamma = 1.
                    if FLAGS.fw_variant.startswith('ada'):
                        lipschitz_estimate = opt.adafw_linit(s, p)
                elif FLAGS.fw_variant == 'fixed':
                    gamma = 2. / (t + 2.)
                elif FLAGS.fw_variant == 'line_search':
                    start_line_search_time = time.time()
                    step_result = opt.line_search_dkl(
                        weights, [c['loc'] for c in comps],
                        [c['scale_diag'] for c in comps], qtx, mu_s, cov_s,
                        s, p, t)
                    end_line_search_time = time.time()
                    total_time += (end_line_search_time -
                                   start_line_search_time)
                    gamma = step_result['gamma']
                elif FLAGS.fw_variant == 'fc':
                    # Add a fixed component; correct the weights later
                    gamma = 2. / (t + 2.)
                elif FLAGS.fw_variant == 'adafw':
                    start_adafw_time = time.time()
                    step_result = opt.adaptive_fw(
                        weights, [c['loc'] for c in comps],
                        [c['scale_diag'] for c in comps], qtx, mu_s, cov_s,
                        s, p, t, lipschitz_estimate)
                    end_adafw_time = time.time()
                    total_time += end_adafw_time - start_adafw_time
                    gamma = step_result['gamma']
                elif FLAGS.fw_variant == 'ada_afw':
                    start_adaafw_time = time.time()
                    step_result = opt.adaptive_afw(
                        weights, comps, [c['loc'] for c in comps],
                        [c['scale_diag'] for c in comps], qtx, mu_s, cov_s,
                        s, p, t, lipschitz_estimate)
                    end_adaafw_time = time.time()
                    total_time += end_adaafw_time - start_adaafw_time
                    gamma = step_result['gamma']  # just for logging
                elif FLAGS.fw_variant == 'ada_pfw':
                    start_adapfw_time = time.time()
                    step_result = opt.adaptive_pfw(
                        weights, comps, [c['loc'] for c in comps],
                        [c['scale_diag'] for c in comps], qtx, mu_s, cov_s,
                        s, p, t, lipschitz_estimate)
                    end_adapfw_time = time.time()
                    total_time += end_adapfw_time - start_adapfw_time
                    gamma = step_result['gamma']  # just for logging

                if ((FLAGS.fw_variant == 'ada_afw'
                     or FLAGS.fw_variant == 'ada_pfw') and t > 0):
                    comps = step_result['comps']
                    weights = step_result['weights']
                else:
                    comps.append({'loc': mu_s, 'scale_diag': cov_s})
                    weights = coreutils.update_weights(weights, gamma, t)

                # TODO: move this to fw_step_size.py
                if FLAGS.fw_variant == "fc":
                    q_latest = Mixture(
                        cat=Categorical(
                            probs=tf.convert_to_tensor(weights)),
                        components=[
                            MultivariateNormalDiag(**c) for c in comps
                        ])
                    # Correction
                    start_fc_time = time.time()
                    weights = opt.fully_corrective(q_latest, p)
                    weights = list(weights)
                    for i in reversed(range(len(weights))):
                        # Remove components whose weight is 0
                        w = weights[i]
                        if w == 0:
                            del weights[i]
                            del comps[i]
                    weights = np.array(weights)
                    end_fc_time = time.time()
                    total_time += end_fc_time - start_fc_time

                q_latest = Mixture(
                    cat=Categorical(probs=tf.convert_to_tensor(weights)),
                    components=[
                        MultivariateNormalDiag(**c) for c in comps
                    ])

                # Log metrics for the current iteration
                time_t = float(total_time)
                logger.info('total time %f' % time_t)
                append_to_file(times_filename, time_t)

                elbo_t = elbo(q_latest, p, n_samples=10)
                logger.info("iter, %d, elbo, %.2f +/- %.2f" %
                            (t, elbo_t[0], elbo_t[1]))
                append_to_file(elbos_filename,
                               "%f,%f" % (elbo_t[0], elbo_t[1]))

                logger.info('iter %d, gamma %.4f' % (t, gamma))
                append_to_file(step_filename, gamma)

                if t > 0:
                    relbo_t = -coreutils.compute_relbo(
                        s, fw_iterates[p], p, np.log(t + 1))
                    append_to_file(relbos_filename, relbo_t)

                objective_t = kl_divergence(q_latest, p).eval()
                logger.info("iter, %d, kl, %.2f" % (t, objective_t))
                append_to_file(objective_filename, objective_t)

                if FLAGS.fw_variant.startswith('ada'):
                    if t > 0:
                        lipschitz_estimate = step_result['l_estimate']
                        append_to_file(gap_filename, step_result['gap'])
                        append_to_file(iter_info_filename,
                                       step_result['step_type'])
                        logger.info(
                            'gap = %.3f, lt = %.5f, iter_type = %s' %
                            (step_result['gap'], step_result['l_estimate'],
                             step_result['step_type']))
                    # l_estimate for iter 0 is the initial value
                    append_to_file(lipschitz_filename, lipschitz_estimate)
                elif FLAGS.fw_variant == 'line_search' and t > 0:
                    n_line_search_samples = step_result['n_samples']
                    grad_t = step_result['grad_gamma']
                    g_outfile = os.path.join(
                        goutdir, 'line_search_samples_%d.npy.%d' %
                        (n_line_search_samples, t))
                    logger.info('saving line search data to, %s' %
                                g_outfile)
                    np.save(open(g_outfile, 'wb'), grad_t)

                for_serialization = {
                    'locs': np.array([c['loc'] for c in comps]),
                    'scale_diags':
                        np.array([c['scale_diag'] for c in comps])
                }
                qt_outfile = os.path.join(outdir, 'qt_iter%d.npz' % t)
                np.savez(qt_outfile, weights=weights, **for_serialization)
                np.savez(os.path.join(outdir, 'qt_latest.npz'),
                         weights=weights, **for_serialization)
                logger.info("saving qt to, %s" % qt_outfile)

            sess.close()
        tf.reset_default_graph()
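# --- Illustration (not part of the original module) -------------------------
# run() serializes the iterate to qt_iter%d.npz / qt_latest.npz with keys
# 'weights', 'locs' and 'scale_diags'. A minimal sketch of rebuilding the
# mixture from such a checkpoint (hypothetical helper `load_qt`; assumes the
# same Edward/TF graph and session setup as this module):

def load_qt(npz_path):
    """Rebuild the mixture iterate q_t from a saved .npz checkpoint."""
    data = np.load(npz_path)
    components = [
        MultivariateNormalDiag(loc=loc, scale_diag=scale)
        for loc, scale in zip(data['locs'], data['scale_diags'])
    ]
    return Mixture(
        cat=Categorical(probs=tf.convert_to_tensor(
            data['weights'], dtype=tf.float32)),
        components=components)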