def test_multivariate_normal_diag(self):
    with self.test_session() as sess:
        N, D, w_true, X_train, y_train, X, w, b, y = self._setup()

        # INFERENCE. Initialize sigmas at the identity to verify whether we
        # learned an approximately zero determinant.
        qw = MultivariateNormalDiag(
            mu=tf.Variable(tf.random_normal([D])),
            diag_stdev=tf.Variable(tf.ones(D)))
        qb = MultivariateNormalDiag(
            mu=tf.Variable(tf.random_normal([1])),
            diag_stdev=tf.Variable(tf.ones(1)))

        inference = ed.Laplace({w: qw, b: qb},
                               data={X: X_train, y: y_train})
        inference.run(n_iter=100)

        self._test(sess, qw, qb, w_true)
        self.assertAllClose(qw.sigma.eval(),
                            tf.diag(tf.diag_part(qw.sigma)).eval())
        self.assertAllClose(qb.sigma.eval(),
                            tf.diag(tf.diag_part(qb.sigma)).eval())
def test_mvn_same_as_edward_mvn():
    loc = np.zeros(5)
    scale = np.ones(5)
    A = mvn.mvn(loc=loc, scale=scale)
    B = MultivariateNormalDiag(loc=loc, scale_diag=scale)
    M = np.random.rand(5, 5)
    tf.InteractiveSession()
    # Compare the absolute difference so the check also fails when the
    # Edward log-probability exceeds the scipy one.
    assert abs(tf.reduce_sum(A.log_prob(M)).eval() -
               tf.reduce_sum(B.log_prob(M)).eval()) < 1e-6
def test_mvn_same_as_edward_log_prob():
    loc = np.zeros(5)
    scale = np.ones(5)
    A = mvn.mvn(loc=loc, scale=scale)
    B = MultivariateNormalDiag(loc=loc, scale_diag=scale)
    samples = np.random.rand(5, 5)
    tf.InteractiveSession()
    print('Log probability of Multivariate Normal Scipy vs Edward')
    print_err(tf.reduce_sum(A.log_prob(samples)).eval(),
              tf.reduce_sum(B.log_prob(samples)).eval())
def create_target_dist():
    """Create and return the target distribution."""
    if FLAGS.dist != 'normal':
        raise NotImplementedError

    pi = np.random.dirichlet([1.] * K)
    # pi = pi[np.newaxis, :].astype(np.float32)
    # mus = 2. * np.random.rand(K, D).astype(np.float32) - 1.
    # stds = np.random.rand(K, D).astype(np.float32)
    mus = np.random.randn(K, D).astype(np.float32)
    stds = softplus(np.random.randn(K, D).astype(np.float32))
    pcomps = [
        MultivariateNormalDiag(
            loc=tf.convert_to_tensor(mus[i], dtype=tf.float32),
            scale_diag=tf.convert_to_tensor(stds[i], dtype=tf.float32))
        for i in range(K)
    ]
    p = Mixture(
        cat=Categorical(probs=tf.convert_to_tensor(pi, dtype=tf.float32)),
        components=pcomps)
    # q = VectorLaplaceDiag(loc=mus[0], scale_diag=stds[0])
    return p, mus, stds
def get_tf_mixture(locs, diags, weights):
    q_comps = [
        MultivariateNormalDiag(loc=loc, scale_diag=scale_diag)
        for loc, scale_diag in zip(locs, diags)
    ]
    cat = Categorical(probs=tf.convert_to_tensor(weights))
    return Mixture(cat=cat, components=q_comps)
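# Hypothetical usage sketch for get_tf_mixture (not from the source):
# builds a two-component diagonal-Gaussian mixture. Shapes and weights are
# illustrative, with the Edward/TF1 imports used throughout these snippets.
locs_demo = [tf.zeros(2), tf.ones(2)]        # component means
diags_demo = [tf.ones(2), 0.5 * tf.ones(2)]  # component std deviations
weights_demo = [0.3, 0.7]                    # mixing weights, sum to 1
q_demo = get_tf_mixture(locs_demo, diags_demo, weights_demo)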
def main():
    # build model
    xcomps = [
        Normal(loc=tf.convert_to_tensor(mixture_model_relbo.mus[i]),
               scale=tf.convert_to_tensor(mixture_model_relbo.stds[i]))
        for i in range(len(mixture_model_relbo.mus))
    ]
    x = Mixture(
        cat=Categorical(probs=tf.convert_to_tensor(mixture_model_relbo.pi)),
        components=xcomps,
        sample_shape=mixture_model_relbo.N)
    x_mvns = [
        MultivariateNormalDiag(
            loc=tf.convert_to_tensor(mixture_model_relbo.mus[i]),
            scale_diag=tf.convert_to_tensor(mixture_model_relbo.stds[i]))
        for i in range(len(mixture_model_relbo.mus))
    ]
    x_train, components = mixture_model_relbo.build_toy_dataset(
        mixture_model_relbo.N)
    n_examples, n_features = x_train.shape
    qxs = [
        MultivariateNormalDiag(loc=[scipy.stats.norm.rvs(1)],
                               scale_diag=[scipy.stats.norm.rvs(1)])
        for i in range(10)
    ]
    truth = [
        MultivariateNormalDiag(loc=mixture_model_relbo.mus[i],
                               scale_diag=mixture_model_relbo.stds[i])
        for i in range(len(mixture_model_relbo.mus))
    ]
    qxs.extend(truth)
    mix = Mixture(cat=Categorical(probs=[1. / len(qxs)] * len(qxs)),
                  components=qxs)
    sess = tf.InteractiveSession()
    with sess.as_default():
        mixture_model_relbo.fully_corrective(mix, x)
def construct_multivariatenormaldiag(dims, iter, name='', sample_shape=N):
    # loc = tf.get_variable(name + "_loc%d" % iter, dims)
    loc = tf.get_variable(name + "_loc%d" % iter,
                          initializer=tf.random_normal(dims))
    # scale = tf.nn.softplus(tf.get_variable(name + "_scale%d" % iter, dims))
    scale = tf.nn.softplus(
        tf.get_variable(name + "_scale%d" % iter,
                        initializer=tf.random_normal(dims)))
    mvn = MultivariateNormalDiag(loc=loc,
                                 scale_diag=scale,
                                 sample_shape=sample_shape)
    return mvn
def deserialize_target_from_file(filename):
    qt_deserialized = np.load(filename)
    mus = qt_deserialized['mus'].astype(np.float32)
    stds = qt_deserialized['stds'].astype(np.float32)
    pi = qt_deserialized['pi'].astype(np.float32)

    cat = Categorical(probs=tf.convert_to_tensor(pi[0]))
    target_comps = [
        MultivariateNormalDiag(loc=tf.convert_to_tensor(mus[i]),
                               scale_diag=tf.convert_to_tensor(stds[i]))
        for i in range(len(mus))
    ]
    return Mixture(cat=cat, components=target_comps)
def deserialize_mixture_from_file(filename):
    qt_deserialized = np.load(filename)
    locs = qt_deserialized['locs'].astype(np.float32)
    scale_diags = qt_deserialized['scale_diags'].astype(np.float32)
    weights = qt_deserialized['weights'].astype(np.float32)

    q_comps = [
        MultivariateNormalDiag(loc=loc, scale_diag=scale_diag)
        for loc, scale_diag in zip(locs, scale_diags)
    ]
    cat = Categorical(probs=tf.convert_to_tensor(weights))
    q_latest = Mixture(cat=cat, components=q_comps)
    return q_latest
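# A minimal serialization sketch (an assumption, not from the source)
# matching the npz layout deserialize_mixture_from_file expects: arrays
# named 'locs', 'scale_diags' and 'weights'.
def serialize_mixture_to_file(filename, locs, scale_diags, weights):
    np.savez(filename,
             locs=np.asarray(locs, dtype=np.float32),
             scale_diags=np.asarray(scale_diags, dtype=np.float32),
             weights=np.asarray(weights, dtype=np.float32))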
def target_dist(*args, **kwargs):
    """Build the target distribution."""
    stds = kwargs['stds']
    mus = kwargs['mus']
    pi = kwargs['pi']
    pcomps = [
        MultivariateNormalDiag(
            loc=tf.convert_to_tensor(mus[i], dtype=tf.float32),
            scale_diag=tf.convert_to_tensor(stds[i], dtype=tf.float32))
        for i in range(len(mus))
    ]
    p = Mixture(cat=Categorical(probs=tf.convert_to_tensor(pi[0])),
                components=pcomps)
    # q = VectorLaplaceDiag(loc=mus[0], scale_diag=stds[0])
    return p
def test_lipschitz_init(pi, mus, stds):
    g = tf.Graph()
    with g.as_default():
        tf.set_random_seed(FLAGS.seed)
        sess = tf.InteractiveSession()
        with sess.as_default():
            s = construct_normal([1], 0, 's')
            sess.run(tf.global_variables_initializer())
            logger.info('mean of s = %.3f, std = %.3f' %
                        (s.mean().eval(), s.stddev().eval()))
            # build target distribution
            pcomps = [
                MultivariateNormalDiag(
                    loc=tf.convert_to_tensor(mus[i], dtype=tf.float32),
                    scale_diag=tf.convert_to_tensor(stds[i],
                                                    dtype=tf.float32))
                for i in range(len(mus))
            ]
            p = Mixture(cat=Categorical(probs=tf.convert_to_tensor(pi)),
                        components=pcomps)
            lipschitz_init_estimate = opt.adafw_linit(s, p)
            logger.info('L estimate is %.5f' % lipschitz_init_estimate)
def test_multivariate_normal_diag(self):
    with self.test_session() as sess:
        N, D, w_true, X_train, y_train, X, w, b, y = self._setup()

        # INFERENCE. Initialize scales at the identity to verify whether we
        # learned an approximately zero determinant.
        qw = MultivariateNormalDiag(
            loc=tf.Variable(tf.random_normal([D])),
            scale_diag=tf.Variable(tf.ones(D)))
        qb = MultivariateNormalDiag(
            loc=tf.Variable(tf.random_normal([1])),
            scale_diag=tf.Variable(tf.ones(1)))

        inference = ed.Laplace({w: qw, b: qb},
                               data={X: X_train, y: y_train})
        inference.run(n_iter=100)

        self._test(sess, qw, qb, w_true)
        self.assertAllClose(qw.covariance().eval(),
                            tf.diag(tf.diag_part(qw.covariance())).eval())
        self.assertAllClose(qb.covariance().eval(),
                            tf.diag(tf.diag_part(qb.covariance())).eval())
def adaptive_fw(weights, locs, diags, q_t, mu_s, cov_s, s_t, p, k, l_prev,
                return_gamma=False):
    """Adaptive Frank-Wolfe algorithm.

    Sets the step size as suggested in Algorithm 1 of
    https://arxiv.org/pdf/1806.05123.pdf

    Args:
        weights: [k], weights of the mixture components of q_t
        locs: [k x dim], means of mixture components of q_t
        diags: [k x dim], std deviations of mixture components of q_t
        q_t: current mixture iterate q_t
        mu_s: [dim], mean for LMO solution s
        cov_s: [dim], cov matrix for LMO solution s
        s_t: current atom & LMO solution s
        p: edward.model, target distribution p
        k: iteration number of Frank-Wolfe
        l_prev: previous Lipschitz estimate
        return_gamma: only return the value of gamma
    Returns:
        If return_gamma is True, only the computed value of gamma is
        returned. Else returns a dictionary containing gamma, the Lipschitz
        estimate, the duality gap and step information.
    """
    # Set $q_{t+1}$'s params
    new_locs = copy.copy(locs)
    new_diags = copy.copy(diags)
    new_locs.append(mu_s)
    new_diags.append(cov_s)

    d_t_norm = divergence(s_t, q_t, metric=FLAGS.distance_metric).eval()
    logger.info('distance norm is %.5f' % d_t_norm)

    N_samples = FLAGS.n_monte_carlo_samples
    # create and sample from $s_t, q_t$
    sample_q = q_t.sample([N_samples])
    sample_s = s_t.sample([N_samples])
    step_s = tf.reduce_mean(grad_kl(q_t, p, sample_s)).eval()
    step_q = tf.reduce_mean(grad_kl(q_t, p, sample_q)).eval()
    gap = step_q - step_s
    logger.info('duality gap %.5f' % gap)
    if gap < 0:
        logger.warning("Duality gap is negative, returning 0 step")

    # gamma = 2. / (k + 2.)
    gamma = 0.
    tau = FLAGS.exp_adafw
    eta = FLAGS.damping_adafw
    # did the adaptive loop succeed or not
    step_type = "fixed"
    # NOTE: this is from v1 of the paper; the new version replaces the
    # multiplicative tau with a divisor eta
    pow_tau = 1.0
    i, l_t = 0, l_prev
    f_t = kl_divergence(q_t, p, allow_nan_stats=False).eval()
    debug('f(q_t) = %.5f' % f_t)
    # the initial estimate is returned if the gap is negative
    while gap >= 0:
        # compute $L_t$ and $\gamma_t$
        l_t = pow_tau * eta * l_prev
        gamma = min(gap / (l_t * d_t_norm), 1.0)
        d_1 = -gamma * gap
        d_2 = gamma * gamma * l_t * d_t_norm / 2.
        debug('linear d1 = %.5f, quad d2 = %.5f' % (d_1, d_2))
        quad_bound_rhs = f_t + d_1 + d_2

        # $w_{t + 1} = [(1 - \gamma)w_t, \gamma]$
        new_weights = copy.copy(weights)
        new_weights = [(1. - gamma) * w for w in new_weights]
        new_weights.append(gamma)
        qt_new = Mixture(
            cat=Categorical(probs=tf.convert_to_tensor(new_weights)),
            components=[
                MultivariateNormalDiag(loc=loc, scale_diag=diag)
                for loc, diag in zip(new_locs, new_diags)
            ])
        quad_bound_lhs = kl_divergence(qt_new, p,
                                       allow_nan_stats=False).eval()
        logger.info('lt = %.5f, gamma = %.3f, f_(qt_new) = %.5f, '
                    'linear extrapolated = %.5f' %
                    (l_t, gamma, quad_bound_lhs, quad_bound_rhs))
        if quad_bound_lhs <= quad_bound_rhs:
            step_type = "adaptive"
            break
        pow_tau *= tau
        i += 1
        # if i > FLAGS.adafw_MAXITER or gamma < MIN_GAMMA:
        if i > FLAGS.adafw_MAXITER:
            # estimate not good
            # gamma = 2. / (k + 2.)
            gamma = 0.
            l_t = l_prev
            step_type = "fixed_adaptive_MAXITER"
            break

    if return_gamma:
        return gamma
    return {
        'gamma': gamma,
        'l_estimate': l_t,
        'gap': gap,
        'step_type': step_type
    }
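# A standalone numeric sketch (not from the source) of the backtracking rule
# adaptive_fw implements: grow the Lipschitz estimate l_t by tau until the
# quadratic upper bound f(q_t) - gamma * gap + gamma^2 * l_t * d / 2 holds
# at gamma = min(gap / (l_t * d), 1). f_new is a hypothetical callable
# returning f(q_{t+1}) for a candidate gamma.
def adaptive_gamma_sketch(gap, d_t_norm, f_t, f_new, l_prev,
                          eta=0.99, tau=2.0, max_iter=10):
    pow_tau, l_t = 1.0, l_prev
    for _ in range(max_iter):
        l_t = pow_tau * eta * l_prev
        gamma = min(gap / (l_t * d_t_norm), 1.0)
        rhs = f_t - gamma * gap + gamma * gamma * l_t * d_t_norm / 2.
        if f_new(gamma) <= rhs:
            return gamma, l_t  # quadratic bound satisfied
        pow_tau *= tau
    return 0., l_prev  # fall back to a null step, as adaptive_fw does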
def main(argv):
    del argv
    x_train, components = build_toy_dataset(N)
    n_examples, n_features = x_train.shape

    # save the target
    outdir = setup_outdir()
    np.savez(os.path.join(outdir, 'target_dist.npz'), pi=pi, mus=mus,
             stds=stds)

    weights, comps = [], []
    elbos = []
    relbo_vals = []
    times = []
    for iter in range(FLAGS.n_fw_iter):
        g = tf.Graph()
        with g.as_default():
            tf.set_random_seed(FLAGS.seed)
            sess = tf.InteractiveSession()
            with sess.as_default():
                # build model
                xcomps = [
                    Normal(loc=tf.convert_to_tensor(mus[i]),
                           scale=tf.convert_to_tensor(stds[i]))
                    for i in range(len(mus))
                ]
                x = Mixture(cat=Categorical(probs=tf.convert_to_tensor(pi)),
                            components=xcomps,
                            sample_shape=N)

                qx = construct_normal([n_features], iter, 'qx')

                if iter > 0:
                    qtx = Mixture(
                        cat=Categorical(probs=tf.convert_to_tensor(weights)),
                        components=[
                            Normal(loc=c['loc'][0],
                                   # scale_diag=tf.nn.softplus(c['scale_diag'])
                                   scale=c['scale_diag'][0]) for c in comps
                        ],
                        sample_shape=N)
                    fw_iterates = {x: qtx}
                else:
                    fw_iterates = {}

                sess.run(tf.global_variables_initializer())

                total_time = 0
                start_inference_time = time.time()
                inference = relbo.KLqp({x: qx},
                                       fw_iterates=fw_iterates,
                                       fw_iter=iter)
                inference.run(n_iter=FLAGS.LMO_iter)
                end_inference_time = time.time()
                total_time += end_inference_time - start_inference_time

                if iter > 0:
                    relbo_vals.append(-utils.compute_relbo(
                        qx, fw_iterates[x], x, np.log(iter + 1)))

                if iter == 0:
                    gamma = 1.
                elif iter > 0 and FLAGS.fw_variant == 'fixed':
                    gamma = 2. / (iter + 2.)
                elif iter > 0 and FLAGS.fw_variant == 'line_search':
                    start_line_search_time = time.time()
                    gamma = line_search_dkl(
                        weights, [c['loc'] for c in comps],
                        [c['scale_diag'] for c in comps], qx.loc.eval(),
                        qx.stddev().eval(), x, iter)
                    end_line_search_time = time.time()
                    total_time += (end_line_search_time -
                                   start_line_search_time)
                elif iter > 0 and FLAGS.fw_variant == 'fc':
                    gamma = 2. / (iter + 2.)

                comps.append({
                    'loc': qx.mean().eval(),
                    'scale_diag': qx.stddev().eval()
                })
                weights = utils.update_weights(weights, gamma, iter)
                print("weights", weights)
                print("comps", [c['loc'] for c in comps])
                print("scale_diags", [c['scale_diag'] for c in comps])

                q_latest = Mixture(
                    cat=Categorical(probs=tf.convert_to_tensor(weights)),
                    components=[MultivariateNormalDiag(**c) for c in comps],
                    sample_shape=N)

                if FLAGS.fw_variant == "fc":
                    start_fc_time = time.time()
                    weights = fully_corrective(q_latest, x)
                    weights = list(weights)
                    # drop components whose weight went to zero
                    for i in reversed(range(len(weights))):
                        w = weights[i]
                        if w == 0:
                            del weights[i]
                            del comps[i]
                    weights = np.array(weights)
                    end_fc_time = time.time()
                    total_time += end_fc_time - start_fc_time
                    q_latest = Mixture(
                        cat=Categorical(probs=tf.convert_to_tensor(weights)),
                        components=[
                            MultivariateNormalDiag(**c) for c in comps
                        ],
                        sample_shape=N)

                elbos.append(elbo(q_latest, x))

                outdir = setup_outdir()
                print("total time", total_time)
                times.append(float(total_time))
                utils.save_times(os.path.join(outdir, 'times.csv'), times)

                elbos_filename = os.path.join(outdir, 'elbos.csv')
                logger.info("iter, %d, elbo, %.2f +/- %.2f" %
                            (iter, *elbos[-1]))
                np.savetxt(elbos_filename, elbos, delimiter=',')
                logger.info("saving elbos to, %s" % elbos_filename)

                relbos_filename = os.path.join(outdir, 'relbos.csv')
                np.savetxt(relbos_filename, relbo_vals, delimiter=',')
                logger.info("saving relbo values to, %s" % relbos_filename)

                for_serialization = {
                    'locs': np.array([c['loc'] for c in comps]),
                    'scale_diags': np.array([c['scale_diag'] for c in comps])
                }
                qt_outfile = os.path.join(outdir, 'qt_iter%d.npz' % iter)
                np.savez(qt_outfile, weights=weights, **for_serialization)
                np.savez(os.path.join(outdir, 'qt_latest.npz'),
                         weights=weights, **for_serialization)
                logger.info("saving qt to, %s" % qt_outfile)

        tf.reset_default_graph()
def run_gap(pi, mus, stds):
    weights, comps = [], []
    elbos = []
    relbo_vals = []
    for t in range(FLAGS.n_fw_iter):
        logger.info('Frank Wolfe Iteration %d' % t)
        g = tf.Graph()
        with g.as_default():
            tf.set_random_seed(FLAGS.seed)
            sess = tf.InteractiveSession()
            with sess.as_default():
                # target distribution components
                pcomps = [
                    MultivariateNormalDiag(
                        loc=tf.convert_to_tensor(mus[i], dtype=tf.float32),
                        scale_diag=tf.convert_to_tensor(stds[i],
                                                        dtype=tf.float32))
                    for i in range(len(mus))
                ]
                # target distribution
                p = Mixture(cat=Categorical(probs=tf.convert_to_tensor(pi)),
                            components=pcomps)
                # LMO approximation
                s = construct_normal([1], t, 's')
                fw_iterates = {}
                if t > 0:
                    qtx = Mixture(
                        cat=Categorical(probs=tf.convert_to_tensor(weights)),
                        components=[
                            MultivariateNormalDiag(**c) for c in comps
                        ])
                    fw_iterates = {p: qtx}
                sess.run(tf.global_variables_initializer())

                # Run inference on relbo to solve the LMO problem.
                # NOTE: KLqp has a side effect, it modifies s
                inference = relbo.KLqp({p: s},
                                       fw_iterates=fw_iterates,
                                       fw_iter=t)
                inference.run(n_iter=FLAGS.LMO_iter)
                # s now contains the solution to the LMO

                if t > 0:
                    sample_s = s.sample([FLAGS.n_monte_carlo_samples])
                    sample_q = qtx.sample([FLAGS.n_monte_carlo_samples])
                    step_s = tf.reduce_mean(grad_kl(qtx, p,
                                                    sample_s)).eval()
                    step_q = tf.reduce_mean(grad_kl(qtx, p,
                                                    sample_q)).eval()
                    gap = step_q - step_s
                    logger.info('Frank-Wolfe gap at iter %d is %.5f' %
                                (t, gap))
                    if gap < 0:
                        eprint('Frank-Wolfe gap becoming negative!')
                    # f(q*) = f(p) = 0
                    logger.info('Objective value (actual gap) is %.5f' %
                                kl_divergence(qtx, p).eval())

                gamma = 2. / (t + 2.)
                comps.append({
                    'loc': s.mean().eval(),
                    'scale_diag': s.stddev().eval()
                })
                weights = coreutils.update_weights(weights, gamma, t)

        tf.reset_default_graph()
Halos_Pos.append(hal[3:3 + nb_components * 2].reshape(nb_components, 2))

print("Galaxy (X, Y):", len(Galaxy_Pos), Galaxy_Pos[0].shape)
print("Galaxy (E1, E2):", len(Galaxy_E), Galaxy_E[0].shape)
print("Halos (X, Y):", len(Halos_Pos), Halos_Pos[0].shape)

# ===========================================================================
# Create the model
# ===========================================================================
# latent variable z
mu = Normal(mu=tf.zeros([nb_components, nb_features]),
            sigma=tf.ones([nb_components, nb_features]))
sigma = InverseGamma(alpha=tf.ones([nb_components, nb_features]),
                     beta=tf.ones([nb_components, nb_features]))
cat = Categorical(logits=tf.zeros([nb_datapoints, nb_components]))
components = [
    MultivariateNormalDiag(mu=tf.ones([nb_datapoints, 1]) * mu[k],
                           diag_stdev=tf.ones([nb_datapoints, 1]) * sigma[k])
    for k in range(nb_components)
]
x = Mixture(cat=cat, components=components)

# ====== inference ====== #
qmu = Normal(mu=tf.Variable(tf.random_normal([nb_components, nb_features])),
             sigma=tf.nn.softplus(
                 tf.Variable(tf.zeros([nb_components, nb_features]))))
qsigma = InverseGamma(
    alpha=tf.nn.softplus(
        tf.Variable(tf.random_normal([nb_components, nb_features]))),
    beta=tf.nn.softplus(
        tf.Variable(tf.random_normal([nb_components, nb_features]))))
# fitting data
class SimpleGaussianLDA(object):

    def __init__(self, K, D, N, nu, use_param=False):
        self.K = K  # number of topics
        self.D = D  # number of documents
        self.N = N  # number of words of each document
        self.nu = nu
        self.alpha = alpha = tf.zeros([K]) + 0.1
        self.sigmasq = InverseGamma(tf.ones(nu), tf.ones(nu),
                                    sample_shape=K)
        self.sigma = sigma = tf.sqrt(self.sigmasq)
        self.mu = mu = Normal(tf.zeros(nu), tf.ones(nu), sample_shape=K)
        self.theta = theta = [None] * D
        self.z = z = [None] * D
        self.w = w = [None] * D
        for d in range(D):
            theta[d] = Dirichlet(alpha)
            if use_param:
                w[d] = ParamMixture(mixing_weights=theta[d],
                                    component_params={
                                        'loc': mu,
                                        'scale_diag': sigma
                                    },
                                    component_dist=MultivariateNormalDiag,
                                    sample_shape=N[d])
                z[d] = w[d].cat
            else:
                z[d] = Categorical(probs=theta[d], sample_shape=N[d])
                components = [
                    MultivariateNormalDiag(
                        loc=tf.gather(mu, k),
                        scale_diag=tf.gather(self.sigma, k),
                        sample_shape=N[d]) for k in range(K)
                ]
                w[d] = Mixture(cat=z[d], components=components,
                               sample_shape=N[d])

    def __run_inference__(self, T, S=None):
        tf.global_variables_initializer().run()
        for n in range(self.inference.n_iter):
            info_dict = self.inference.update()
            self.inference.print_progress(info_dict)
        self.inference.finalize()

    def klqp(self, docs, S, T, wordVec):
        K = self.K
        D = self.D
        nu = self.nu
        self.latent_vars = latent_vars = {}
        training_data = {}
        qmu = Normal(loc=tf.Variable(tf.random_normal([K, nu])),
                     scale=tf.nn.softplus(tf.Variable(tf.zeros([K, nu]))))
        latent_vars[self.mu] = qmu
        qsigmasq = InverseGamma(
            tf.nn.softplus(tf.Variable(tf.zeros([K, nu]))),
            tf.nn.softplus(tf.Variable(tf.zeros([K, nu]))))
        latent_vars[self.sigmasq] = qsigmasq
        for d in range(D):
            training_data[self.w[d]] = docs[d]
        self.qmu = qmu
        self.qsigma = qsigma = tf.sqrt(qsigmasq)
        self.qw = MultivariateNormalDiag(loc=qmu, scale_diag=qsigma)
        V = len(wordVec)
        logprobs = [None] * V
        for i in range(V):
            logprobs[i] = self.qw.log_prob(wordVec[i])
        self.qbeta = tf.convert_to_tensor(logprobs)
        self.inference = ed.KLqp(latent_vars, data=training_data)
        self.inference.initialize(n_iter=T, n_print=10, n_samples=S)
        self.__run_inference__(T)

    def getTopWords(self, wordVec, tokens):
        K = self.K
        V = len(wordVec)
        qbeta = self.qbeta
        qbeta_sample = qbeta.eval()
        prob = [None] * K
        for k in range(K):
            prob[k] = qbeta_sample[:, k]
        self.tokens_probs = tokens_probs = [None] * K
        self.top_words = [None] * K
        for k in range(K):
            tokens_probs[k] = dict(
                (t, p) for t, p in zip(range(V), prob[k]))
            newdict = sorted(tokens_probs[k], key=tokens_probs[k].get,
                             reverse=True)[:15]
            self.top_words[k] = newdict
            print('topic %d' % k)
            for Id in newdict:
                print(tokens[Id], tokens_probs[k][Id])

    def getPMI(self, comatrix):
        K = self.K
        self.pmis = pmis = [None] * K
        for k in range(K):
            pmis[k] = util.pmi(comatrix, self.top_words[k])
            print('topic %d pmi: %f' % (k, pmis[k]))
def _test(mu, diag_stdev, n):
    x = MultivariateNormalDiag(mu=mu, diag_stdev=diag_stdev)
    val_est = get_dims(x.sample(n))
    val_true = n + get_dims(mu)
    assert val_est == val_true
def f(gamma):
    weights = [(1 - gamma), gamma]
    q_l = Mixture(cat=Categorical(probs=tf.convert_to_tensor(weights)),
                  components=[MultivariateNormalDiag(**c) for c in comps])
    return kl_divergence(q_l, qt).eval()
def _build_model(self):
    """Implementation of the KMN."""
    with tf.variable_scope(self.name):
        # adds placeholders, data normalization and data noise if desired
        self.layer_in_x, self.layer_in_y = self._build_input_layers()
        self.X_in = L.get_output(self.layer_in_x)
        self.Y_in = L.get_output(self.layer_in_y)

        # get batch size
        self.batch_size = tf.shape(self.X_ph)[0]

        # create core multi-layer perceptron
        core_network = MLP(
            name="core_network",
            input_layer=self.layer_in_x,
            output_dim=self.n_centers * self.n_scales,
            hidden_sizes=self.hidden_sizes,
            hidden_nonlinearity=self.hidden_nonlinearity,
            output_nonlinearity=None,
        )
        self.core_output_layer = core_network.output_layer

        # weights of the mixture components
        self.logits = L.get_output(self.core_output_layer)
        self.softmax_layer_weights = L.NonlinearityLayer(
            self.core_output_layer, nonlinearity=tf.nn.softmax)
        self.weights = L.get_output(self.softmax_layer_weights)

        # locations of the kernel functions
        # (sampled locs are assigned when fitting)
        self.locs = tf.Variable(np.zeros((self.n_centers, self.ndim_y)),
                                name="locs", trainable=False,
                                dtype=tf.float32)
        self.locs_layer = L.VariableLayer(core_network.input_layer,
                                          (self.n_centers, self.ndim_y),
                                          variable=self.locs, name="locs",
                                          trainable=False)
        self.locs_array = tf.unstack(
            tf.transpose(
                tf.multiply(
                    tf.ones((self.batch_size, self.n_centers, self.ndim_y)),
                    self.locs),
                perm=[1, 0, 2]))
        assert len(self.locs_array) == self.n_centers

        # scales of the gaussian kernels
        log_scales_layer = L.VariableLayer(
            core_network.input_layer, (self.n_scales,),
            variable=tf.Variable(self.init_scales_softplus,
                                 dtype=tf.float32,
                                 trainable=self.train_scales),
            name="log_scales", trainable=self.train_scales)
        self.scales_layer = L.NonlinearityLayer(
            log_scales_layer, nonlinearity=tf.nn.softplus)
        self.scales = L.get_output(self.scales_layer)
        self.scales_array = scales_array = tf.unstack(
            tf.transpose(
                tf.multiply(
                    tf.ones((self.batch_size, self.ndim_y, self.n_scales)),
                    self.scales),
                perm=[2, 0, 1]))
        assert len(self.scales_array) == self.n_scales

        # put mixture components together
        self.y_input = L.get_output(self.layer_in_y)
        self.cat = cat = Categorical(logits=self.logits)
        self.components = components = [
            MultivariateNormalDiag(loc=loc, scale_diag=scale)
            for loc in self.locs_array for scale in scales_array
        ]
        self.mixture = mixture = Mixture(cat=cat, components=components)

        # softmax entropy penalty -> regularization
        self.softmax_entropy = tf.reduce_sum(
            -tf.multiply(tf.log(self.weights), self.weights), axis=1)
        self.entropy_reg_coef_ph = tf.placeholder_with_default(
            float(self.entropy_reg_coef), name='entropy_reg_coef',
            shape=())
        self.softmax_entrop_loss = (self.entropy_reg_coef_ph *
                                    self.softmax_entropy)
        tf.losses.add_loss(self.softmax_entrop_loss,
                           tf.GraphKeys.REGULARIZATION_LOSSES)

        # tensors to compute probabilities
        if self.data_normalization:
            self.pdf_ = mixture.prob(self.y_input) / tf.reduce_prod(
                self.std_y_sym)
            self.log_pdf_ = mixture.log_prob(self.y_input) - tf.reduce_sum(
                tf.log(self.std_y_sym))
        else:
            self.pdf_ = mixture.prob(self.y_input)
            self.log_pdf_ = mixture.log_prob(self.y_input)

        # symbolic tensors for getting the unnormalized mixture components
        if self.data_normalization:
            self.scales_unnormalized = tf.transpose(
                tf.multiply(tf.ones((self.ndim_y, self.n_scales)),
                            self.scales)) * self.std_y_sym  # (n_scales, ndim_y)
            self.locs_unnormalized = (self.locs * self.std_y_sym +
                                      self.mean_y_sym)
        else:
            self.scales_unnormalized = tf.transpose(
                tf.multiply(tf.ones((self.ndim_y, self.n_scales)),
                            self.scales))  # shape = (n_scales, ndim_y)
            self.locs_unnormalized = self.locs

        # initialize LayersPowered --> provides functions for serializing
        # tf models
        LayersPowered.__init__(self, [self.core_output_layer,
                                      self.locs_layer, self.scales_layer,
                                      self.layer_in_y])
def main(argv):
    del argv
    outdir = FLAGS.outdir
    if '~' in outdir:
        outdir = os.path.expanduser(outdir)
    os.makedirs(outdir, exist_ok=True)

    # Files to log metrics
    times_filename = os.path.join(outdir, 'times.csv')
    elbos_filename = os.path.join(outdir, 'elbos.csv')
    objective_filename = os.path.join(outdir, 'kl.csv')
    reference_filename = os.path.join(outdir, 'ref_kl.csv')
    step_filename = os.path.join(outdir, 'steps.csv')
    # 'adafw', 'ada_afw', 'ada_pfw'
    if FLAGS.fw_variant.startswith('ada'):
        curvature_filename = os.path.join(outdir, 'curvature.csv')
        gap_filename = os.path.join(outdir, 'gap.csv')
        iter_info_filename = os.path.join(outdir, 'iter_info.txt')
    elif FLAGS.fw_variant == 'line_search':
        goutdir = os.path.join(outdir, 'gradients')

    # empty the files present in the folder already
    open(times_filename, 'w').close()
    open(elbos_filename, 'w').close()
    open(objective_filename, 'w').close()
    open(reference_filename, 'w').close()
    open(step_filename, 'w').close()
    # 'adafw', 'ada_afw', 'ada_pfw'
    if FLAGS.fw_variant.startswith('ada'):
        open(curvature_filename, 'w').close()
        append_to_file(curvature_filename, "c_local,c_global")
        open(gap_filename, 'w').close()
        open(iter_info_filename, 'w').close()
    elif FLAGS.fw_variant == 'line_search':
        os.makedirs(goutdir, exist_ok=True)

    for i in range(FLAGS.n_fw_iter):
        # NOTE: First iteration (t = 0) is initialization
        g = tf.Graph()
        with g.as_default():
            tf.set_random_seed(FLAGS.seed)
            sess = tf.InteractiveSession()
            with sess.as_default():
                p, mus, stds = create_target_dist()

                # current iterate (solution until now)
                if FLAGS.init == 'random':
                    muq = np.random.randn(D).astype(np.float32)
                    stdq = softplus(np.random.randn(D).astype(np.float32))
                    raise ValueError
                else:
                    muq = mus[0]
                    stdq = stds[0]

                # 1 correct LMO
                t = 1
                comps = [{'loc': muq, 'scale_diag': stdq}]
                weights = [1.0]
                curvature_estimate = opt.adafw_linit()

                qtx = MultivariateNormalDiag(
                    loc=tf.convert_to_tensor(muq, dtype=tf.float32),
                    scale_diag=tf.convert_to_tensor(stdq,
                                                    dtype=tf.float32))
                fw_iterates = {p: qtx}

                # calculate kl-div with 1 component
                objective_old = kl_divergence(qtx, p).eval()
                logger.info("kl with init %.4f" % objective_old)
                append_to_file(reference_filename, objective_old)

                # s is the solution to the LMO. It is initialized randomly:
                # mu ~ N(0, 1), std ~ softplus(N(0, 1))
                s = coreutils.construct_multivariatenormaldiag([D], t, 's')

                sess.run(tf.global_variables_initializer())

                total_time = 0
                start_inference_time = time.time()
                if FLAGS.LMO == 'vi':
                    # we have to iterate over parameter space
                    raise ValueError
                inference = relbo.KLqp({p: s},
                                       fw_iterates=fw_iterates,
                                       fw_iter=t)
                inference.run(n_iter=FLAGS.LMO_iter)
                # s now contains the solution to the LMO
                end_inference_time = time.time()

                mu_s = s.mean().eval()
                cov_s = s.stddev().eval()

                # NOTE: keep only step size time
                # total_time += end_inference_time - start_inference_time

                # compute step size to update the next iterate
                step_result = {}
                if FLAGS.fw_variant == 'fixed':
                    gamma = 2. / (t + 2.)
                elif FLAGS.fw_variant == 'line_search':
                    start_line_search_time = time.time()
                    step_result = opt.line_search_dkl(
                        weights, [c['loc'] for c in comps],
                        [c['scale_diag'] for c in comps], qtx, mu_s,
                        cov_s, s, p, t)
                    end_line_search_time = time.time()
                    total_time += (end_line_search_time -
                                   start_line_search_time)
                    gamma = step_result['gamma']
                elif FLAGS.fw_variant == 'adafw':
                    start_adafw_time = time.time()
                    step_result = opt.adaptive_fw(
                        weights, [c['loc'] for c in comps],
                        [c['scale_diag'] for c in comps], qtx, mu_s,
                        cov_s, s, p, t, curvature_estimate)
                    end_adafw_time = time.time()
                    total_time += end_adafw_time - start_adafw_time
                    gamma = step_result['gamma']
                else:
                    raise NotImplementedError

                comps.append({'loc': mu_s, 'scale_diag': cov_s})
                weights = [(1. - gamma), gamma]

                c_global = estimate_global_curvature(comps, qtx)

                q_latest = Mixture(
                    cat=Categorical(probs=tf.convert_to_tensor(weights)),
                    components=[MultivariateNormalDiag(**c) for c in comps])

                # Log metrics for the current iteration
                time_t = float(total_time)
                logger.info('total time %f' % time_t)
                append_to_file(times_filename, time_t)

                elbo_t = elbo(q_latest, p, n_samples=1000)
                logger.info("iter, %d, elbo, %.2f +/- %.2f" %
                            (t, elbo_t[0], elbo_t[1]))
                append_to_file(elbos_filename,
                               "%f,%f" % (elbo_t[0], elbo_t[1]))

                logger.info('iter %d, gamma %.4f' % (t, gamma))
                append_to_file(step_filename, gamma)

                objective_t = kl_divergence(q_latest, p).eval()
                logger.info("run %d, kl %.4f" % (i, objective_t))
                append_to_file(objective_filename, objective_t)

                if FLAGS.fw_variant.startswith('ada'):
                    curvature_estimate = step_result['c_estimate']
                    append_to_file(gap_filename, step_result['gap'])
                    append_to_file(iter_info_filename,
                                   step_result['step_type'])
                    logger.info('gap = %.3f, ct = %.5f, iter_type = %s' %
                                (step_result['gap'],
                                 step_result['c_estimate'],
                                 step_result['step_type']))
                    append_to_file(curvature_filename,
                                   '%f,%f' % (curvature_estimate, c_global))
                elif FLAGS.fw_variant == 'line_search':
                    n_line_search_samples = step_result['n_samples']
                    grad_t = step_result['grad_gamma']
                    g_outfile = os.path.join(
                        goutdir, 'line_search_samples_%d.npy.%d' %
                        (n_line_search_samples, t))
                    logger.info('saving line search data to, %s' %
                                g_outfile)
                    np.save(open(g_outfile, 'wb'), grad_t)

            sess.close()
        tf.reset_default_graph()
def test_multivariate_real(self):
    with self.test_session():
        x = MultivariateNormalDiag(tf.zeros(2), tf.ones(2))
        y = ed.transform(x)
        sample = y.sample(10, seed=1).eval()
        self.assertSamplePosNeg(sample)
def adaptive_afw(weights, comps, locs, diags, q_t, mu_s, cov_s, s_t, p, k,
                 l_prev):
    """Away-steps variant.

    Args:
        same as fixed
    """
    d_t_norm = divergence(s_t, q_t, metric=FLAGS.distance_metric).eval()
    logger.info('distance norm is %.5f' % d_t_norm)

    # Find v_t, the away atom
    qcomps = q_t.components
    index_v_t, step_v_t = argmax_grad_dotp(p, q_t, qcomps,
                                           FLAGS.n_monte_carlo_samples)
    v_t = qcomps[index_v_t]

    # Frank-Wolfe gap
    sample_q = q_t.sample([FLAGS.n_monte_carlo_samples])
    sample_s = s_t.sample([FLAGS.n_monte_carlo_samples])
    step_s = tf.reduce_mean(grad_kl(q_t, p, sample_s)).eval()
    step_q = tf.reduce_mean(grad_kl(q_t, p, sample_q)).eval()
    gap_fw = step_q - step_s
    if gap_fw < 0:
        logger.warning("Frank-Wolfe duality gap is negative")
    # Away gap
    gap_a = step_v_t - step_q
    if gap_a < 0:
        eprint('Away gap < 0!!!')
    logger.info('fw gap %.5f, away gap %.5f' % (gap_fw, gap_a))

    # Set $q_{t+1}$'s params
    new_locs = copy.copy(locs)
    new_diags = copy.copy(diags)
    if (gap_fw >= gap_a) or (len(comps) == 1):
        # FW direction; proceeds exactly as adafw
        logger.info('Proceeding in FW direction ')
        adaptive_step_type = 'fw'
        gap = gap_fw
        new_locs.append(mu_s)
        new_diags.append(cov_s)
        gamma_max = 1.0
    else:
        # Away direction
        logger.info('Proceeding in Away direction ')
        adaptive_step_type = 'away'
        gap = gap_a
        if weights[index_v_t] < 1.0:
            gamma_max = weights[index_v_t] / (1.0 - weights[index_v_t])
        else:
            gamma_max = 100.  # Large value when t = 1

    def default_fixed_step(fail_type='fixed'):
        # adaptive loop failed, fall back to the fixed step size
        gamma = 2. / (k + 2.)
        new_comps = copy.copy(comps)
        new_comps.append({'loc': mu_s, 'scale_diag': cov_s})
        new_weights = [(1. - gamma) * w for w in weights]
        new_weights.append(gamma)
        return {
            'gamma': 2. / (k + 2.),
            'l_estimate': l_prev,
            'weights': new_weights,
            'comps': new_comps,
            'gap': gap,
            'step_type': fail_type
        }

    if gap <= 0:
        return default_fixed_step()

    tau = FLAGS.exp_adafw
    eta = FLAGS.damping_adafw
    pow_tau = 1.0
    i, l_t = 0, l_prev
    f_t = kl_divergence(q_t, p, allow_nan_stats=False).eval()
    debug('f(q_t) = %.5f' % f_t)
    gamma = 2. / (k + 2)
    is_drop_step = False
    while gamma >= MIN_GAMMA and i < FLAGS.adafw_MAXITER:
        # compute $L_t$ and $\gamma_t$
        l_t = pow_tau * eta * l_prev
        # NOTE: Handle extreme values of gamma carefully
        gamma = min(gap / (l_t * d_t_norm), gamma_max)
        d_1 = -gamma * gap
        d_2 = gamma * gamma * l_t * d_t_norm / 2.
        debug('linear d1 = %.5f, quad d2 = %.5f' % (d_1, d_2))
        quad_bound_rhs = f_t + d_1 + d_2

        # construct $q_{t + 1}$
        if adaptive_step_type == 'fw':
            if gamma == gamma_max:
                # gamma = 1.0, q_{t + 1} = s_t
                new_comps = [{'loc': mu_s, 'scale_diag': cov_s}]
                new_weights = [1.]
                qt_new = MultivariateNormalDiag(loc=mu_s,
                                                scale_diag=cov_s)
            else:
                new_comps = copy.copy(comps)
                new_comps.append({'loc': mu_s, 'scale_diag': cov_s})
                new_weights = copy.copy(weights)
                new_weights = [(1. - gamma) * w for w in new_weights]
                new_weights.append(gamma)
                qt_new = Mixture(
                    cat=Categorical(
                        probs=tf.convert_to_tensor(new_weights)),
                    components=[
                        MultivariateNormalDiag(loc=loc, scale_diag=diag)
                        for loc, diag in zip(new_locs, new_diags)
                    ])
        elif adaptive_step_type == 'away':
            new_weights = copy.copy(weights)
            new_comps = copy.copy(comps)
            if gamma == gamma_max:
                # drop v_t
                is_drop_step = True
                logger.info('...drop step')
                del new_weights[index_v_t]
                new_weights = [(1. + gamma) * w for w in new_weights]
                del new_comps[index_v_t]
                # NOTE: recompute locs and diags after dropping v_t
                drop_locs = [c['loc'] for c in new_comps]
                drop_diags = [c['scale_diag'] for c in new_comps]
                qt_new = Mixture(
                    cat=Categorical(
                        probs=tf.convert_to_tensor(new_weights)),
                    components=[
                        MultivariateNormalDiag(loc=loc, scale_diag=diag)
                        for loc, diag in zip(drop_locs, drop_diags)
                    ])
            else:
                is_drop_step = False
                new_weights = [(1. + gamma) * w for w in new_weights]
                new_weights[index_v_t] -= gamma
                qt_new = Mixture(
                    cat=Categorical(
                        probs=tf.convert_to_tensor(new_weights)),
                    components=[
                        MultivariateNormalDiag(loc=loc, scale_diag=diag)
                        for loc, diag in zip(new_locs, new_diags)
                    ])

        quad_bound_lhs = kl_divergence(qt_new, p,
                                       allow_nan_stats=False).eval()
        logger.info('lt = %.5f, gamma = %.3f, f_(qt_new) = %.5f, '
                    'linear extrapolated = %.5f' %
                    (l_t, gamma, quad_bound_lhs, quad_bound_rhs))
        if quad_bound_lhs <= quad_bound_rhs:
            step_type = "adaptive"
            if adaptive_step_type == "away":
                step_type = "away"
            if is_drop_step:
                step_type = "drop"
            return {
                'gamma': gamma,
                'l_estimate': l_t,
                'weights': new_weights,
                'comps': new_comps,
                'gap': gap,
                'step_type': step_type
            }
        pow_tau *= tau
        i += 1

    # adaptive loop failed, return the fixed step size
    logger.warning("gamma below threshold value, returning fixed step")
    return default_fixed_step()
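# A quick sanity sketch (not from the source) of the away-step reweighting
# used above: scaling every weight by (1 + gamma) and subtracting gamma from
# the away atom v_t keeps the weights summing to 1.
def away_step_weights(weights, index_v_t, gamma):
    new_weights = [(1. + gamma) * w for w in weights]
    new_weights[index_v_t] -= gamma
    return new_weights  # (1 + gamma) * sum(w) - gamma == 1 when sum(w) == 1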
def test_exact_gamma():
    pi = mixture_model_relbo.pi
    mus = mixture_model_relbo.mus
    stds = mixture_model_relbo.stds
    outfile = os.path.join(FLAGS.outdir, 'gamma.csv')
    g = tf.Graph()
    with g.as_default():
        tf.set_random_seed(FLAGS.seed)
        sess = tf.InteractiveSession()
        with sess.as_default():
            # Build p = pi[0] * N(mu[0], std[0]) + pi[1] * N(mu[1], std[1]);
            # thus gamma = pi[1] (= 0.6), q_t = N(mu[0], std[0]),
            # s = N(mu[1], std[1])
            pcomps = [
                MultivariateNormalDiag(
                    loc=tf.convert_to_tensor(mus[i], dtype=tf.float32),
                    scale_diag=tf.convert_to_tensor(stds[i],
                                                    dtype=tf.float32))
                for i in range(len(mus))
            ]
            p = Mixture(
                cat=Categorical(probs=tf.convert_to_tensor(pi[0])),
                components=pcomps)
            # build q_t
            weights = [1.]
            locs = [mus[0]]
            diags = [stds[0]]
            # Create the current iterate $q_t$
            qt = Mixture(
                cat=Categorical(probs=tf.convert_to_tensor(weights)),
                components=[
                    MultivariateNormalDiag(loc=loc, scale_diag=diag)
                    for loc, diag in zip(locs, diags)
                ])
            s = MultivariateNormalDiag(loc=mus[1], scale_diag=stds[1])
            if FLAGS.fw_variant == "line_search":
                gamma = opt.line_search_dkl(weights, locs, diags, qt,
                                            mus[1], stds[1], s, p,
                                            FLAGS.init_k,
                                            return_gamma=True)
                # seed, n_line_search_iter, n_monte_carlo_samples, b, gamma
                append_to_file(
                    outfile, "%d,%d,%d,%d,%f" %
                    (FLAGS.seed, FLAGS.n_line_search_iter,
                     FLAGS.n_monte_carlo_samples, 1, gamma))
            elif FLAGS.fw_variant == "adafw":
                gamma = opt.adaptive_fw(weights=weights,
                                        locs=locs,
                                        diags=diags,
                                        q_t=qt,
                                        mu_s=mus[1],
                                        cov_s=stds[1],
                                        s_t=s,
                                        p=p,
                                        k=FLAGS.init_k,
                                        l_prev=1.,
                                        return_gamma=True)
                # seed, n_monte_carlo_samples, eta, tau, linit, gamma
                append_to_file(
                    outfile, "%d,%d,%f,%f,%f,%f" %
                    (FLAGS.seed, FLAGS.n_monte_carlo_samples,
                     FLAGS.damping_adafw, FLAGS.exp_adafw,
                     FLAGS.linit_fixed, gamma))
            else:
                raise NotImplementedError('other variants not tested yet.')
            print_err(pi[0][1], gamma)
D = 2  # dimensionality of data
ed.set_seed(42)

# DATA
x_train = build_toy_dataset(N)
plt.scatter(x_train[:, 0], x_train[:, 1])
plt.axis([-3, 3, -3, 3])
plt.title("Simulated dataset")
plt.show()

# MODEL
mu = Normal(mu=tf.zeros([K, D]), sigma=tf.ones([K, D]))
sigma = InverseGamma(alpha=tf.ones([K, D]), beta=tf.ones([K, D]))
cat = Categorical(logits=tf.zeros([N, K]))
components = [
    MultivariateNormalDiag(mu=tf.ones([N, 1]) * mu[k],
                           diag_stdev=tf.ones([N, 1]) * sigma[k])
    for k in range(K)
]
x = Mixture(cat=cat, components=components)

# INFERENCE
qmu = Normal(mu=tf.Variable(tf.random_normal([K, D])),
             sigma=tf.nn.softplus(tf.Variable(tf.zeros([K, D]))))
qsigma = InverseGamma(
    alpha=tf.nn.softplus(tf.Variable(tf.random_normal([K, D]))),
    beta=tf.nn.softplus(tf.Variable(tf.random_normal([K, D]))))

inference = ed.KLqp({mu: qmu, sigma: qsigma}, data={x: x_train})
inference.initialize(n_samples=20, n_iter=4000)
def test_adaptive_gamma():
    pi = np.array([0.2, 0.5, 0.3]).astype(np.float32)
    mus = [[2.], [-1.], [0.]]
    stds = [[.6], [.4], [0.5]]
    outfile = os.path.join(FLAGS.outdir, 'gamma.csv')
    g = tf.Graph()
    with g.as_default():
        sess = tf.InteractiveSession()
        with sess.as_default():
            # p = pi[0] * N(mus[0], stds[0]) + ... + pi[2] * N(mus[2], stds[2])
            p = Mixture(
                cat=Categorical(probs=tf.convert_to_tensor(pi)),
                components=[
                    # Normal(loc=tf.convert_to_tensor(mus[i], dtype=tf.float32),
                    #        scale=tf.convert_to_tensor(stds[i], dtype=tf.float32)),
                    MultivariateNormalDiag(
                        loc=tf.convert_to_tensor(mus[i], dtype=tf.float32),
                        scale_diag=tf.convert_to_tensor(stds[i],
                                                        dtype=tf.float32))
                    for i in range(len(mus))
                ])
            qt = Mixture(
                cat=Categorical(probs=tf.convert_to_tensor(pi[:2])),
                components=[
                    MultivariateNormalDiag(
                        loc=tf.convert_to_tensor(mus[i], dtype=tf.float32),
                        scale_diag=tf.convert_to_tensor(stds[i],
                                                        dtype=tf.float32))
                    for i in range(len(mus[:2]))
                ])
            st = MultivariateNormalDiag(
                loc=tf.convert_to_tensor(mus[2], dtype=tf.float32),
                scale_diag=tf.convert_to_tensor(stds[2], dtype=tf.float32))
            if FLAGS.fw_variant == "line_search":
                gamma = opt.line_search_dkl(pi[:2], mus[:2], stds[:2], qt,
                                            mus[2], stds[2], st, p,
                                            FLAGS.init_k,
                                            return_gamma=True)
                # seed, n_line_search_iter, n_monte_carlo_samples, b, gamma
                append_to_file(
                    outfile, "%d,%d,%d,%d,%f" %
                    (FLAGS.seed, FLAGS.n_line_search_iter,
                     FLAGS.n_monte_carlo_samples, 1, gamma))
            elif FLAGS.fw_variant == "adafw":
                gamma = opt.adaptive_fw(weights=pi[:2],
                                        locs=mus[:2],
                                        diags=stds[:2],
                                        q_t=qt,
                                        mu_s=mus[2],
                                        cov_s=stds[2],
                                        s_t=st,
                                        p=p,
                                        k=FLAGS.init_k,
                                        l_prev=opt.adafw_linit(qt, p),
                                        return_gamma=True)
                # seed, n_monte_carlo_samples, eta, tau, linit, gamma
                append_to_file(
                    outfile, "%d,%d,%f,%f,%f,%f" %
                    (FLAGS.seed, FLAGS.n_monte_carlo_samples,
                     FLAGS.damping_adafw, FLAGS.exp_adafw,
                     FLAGS.linit_fixed, gamma))
            print_err(pi[2], gamma)
def line_search_dkl(weights, locs, diags, q_t, mu_s, cov_s, s_t, p, k,
                    return_gamma=False):
    """Performs line search for the best step size gamma.

    Uses gradient descent to find the gamma that minimizes
    KL(q_t + gamma (s - q_t) || p).

    Args:
        weights: [k], weights of mixture components of q_t
        locs: [k x dim], means of mixture components of q_t
        diags: [k x dim], deviations of mixture components of q_t
        q_t: current mixture iterate q_t
        mu_s: [dim], mean for LMO solution s
        cov_s: [dim], cov matrix for LMO solution s
        s_t: current atom & LMO solution s
        p: edward.model, target distribution p
        k: iteration number of Frank-Wolfe
        return_gamma: only return the value of gamma
    Returns:
        If return_gamma is True, only the computed value of gamma is
        returned. Else gradient data is returned along with it in a dict.
    """
    N_samples = FLAGS.n_monte_carlo_samples
    # sample from $q_t$ and s
    sample_q = q_t.sample([N_samples])
    sample_s = s_t.sample([N_samples])

    # set $q_{t+1}$'s parameters
    new_locs = copy.copy(locs)
    new_diags = copy.copy(diags)
    new_locs.append(mu_s)
    new_diags.append(cov_s)

    # initialize $\gamma$
    gamma = 2. / (k + 2.)
    n_steps = FLAGS.n_line_search_iter
    prog_bar = ed.util.Progbar(n_steps)
    # storing gradients for analysis
    grad_gamma = []
    for it in range(n_steps):
        print("line_search iter %d, %.5f" % (it, gamma))
        new_weights = copy.copy(weights)
        new_weights = [(1. - gamma) * w for w in new_weights]
        new_weights.append(gamma)
        qt_new = Mixture(
            cat=Categorical(probs=tf.convert_to_tensor(new_weights)),
            components=[
                MultivariateNormalDiag(loc=loc, scale_diag=diag)
                for loc, diag in zip(new_locs, new_diags)
            ])
        rez_s = grad_kl(qt_new, p, sample_s).eval()
        rez_q = grad_kl(qt_new, p, sample_q).eval()
        grad_gamma.append({'E_s': rez_s, 'E_q': rez_q, 'gamma': gamma})
        # gradient descent with step size decreasing as $\frac{1}{it + 1}$
        gamma_prime = gamma - 0.1 * (np.mean(rez_s) -
                                     np.mean(rez_q)) / (it + 1.)
        # project it back onto [0, 1]
        if gamma_prime >= 1 or gamma_prime <= 0:
            gamma_prime = max(min(gamma_prime, 1.), 0.)
        if np.abs(gamma - gamma_prime) < 1e-6:
            gamma = gamma_prime
            break
        gamma = gamma_prime

    if return_gamma:
        return gamma
    return {'gamma': gamma, 'n_samples': N_samples,
            'grad_gamma': grad_gamma}
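# A minimal standalone sketch (an assumption, not from the source) of the
# projected gradient step in line_search_dkl. For the mixture
# q_gamma = (1 - gamma) q_t + gamma s, the derivative of KL(q_gamma || p)
# with respect to gamma is E_s[log(q_gamma / p)] - E_{q_t}[log(q_gamma / p)],
# which the code above estimates by Monte Carlo (rez_s, rez_q).
def projected_gamma_step(gamma, rez_s, rez_q, it, lr=0.1):
    grad = np.mean(rez_s) - np.mean(rez_q)       # gradient estimate
    gamma_prime = gamma - lr * grad / (it + 1.)  # decaying step size
    return max(min(gamma_prime, 1.), 0.)         # project back onto [0, 1]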
D = 2  # dimensionality of data
ed.set_seed(42)

# DATA
x_train = build_toy_dataset(N)
plt.scatter(x_train[:, 0], x_train[:, 1])
plt.axis([-3, 3, -3, 3])
plt.title("Simulated dataset")
plt.show()

# MODEL
mu = Normal(mu=tf.zeros([K, D]), sigma=tf.ones([K, D]))
sigma = InverseGamma(alpha=tf.ones([K, D]), beta=tf.ones([K, D]))
cat = Categorical(logits=tf.zeros([N, K]))
components = [
    MultivariateNormalDiag(mu=tf.ones([N, 1]) * tf.gather(mu, k),
                           diag_stdev=tf.ones([N, 1]) * tf.gather(sigma, k))
    for k in range(K)
]
x = Mixture(cat=cat, components=components)

# INFERENCE
qmu = Normal(mu=tf.Variable(tf.random_normal([K, D])),
             sigma=tf.nn.softplus(tf.Variable(tf.zeros([K, D]))))
qsigma = InverseGamma(
    alpha=tf.nn.softplus(tf.Variable(tf.random_normal([K, D]))),
    beta=tf.nn.softplus(tf.Variable(tf.random_normal([K, D]))))

inference = ed.KLqp({mu: qmu, sigma: qsigma}, data={x: x_train})
inference.initialize(n_samples=20, n_iter=4000)
def adaptive_pfw(weights, comps, locs, diags, q_t, mu_s, cov_s, s_t, p, k,
                 l_prev):
    """Adaptive pairwise variant.

    Args:
        same as fixed
    """
    d_t_norm = divergence(s_t, q_t, metric=FLAGS.distance_metric).eval()
    logger.info('distance norm is %.5f' % d_t_norm)

    # Find v_t, the away atom
    qcomps = q_t.components
    index_v_t, step_v_t = argmax_grad_dotp(p, q_t, qcomps,
                                           FLAGS.n_monte_carlo_samples)
    v_t = qcomps[index_v_t]

    # Pairwise gap
    sample_s = s_t.sample([FLAGS.n_monte_carlo_samples])
    step_s = tf.reduce_mean(grad_kl(q_t, p, sample_s)).eval()
    gap_pw = step_v_t - step_s
    if gap_pw < 0:
        eprint("Pairwise gap is negative")

    def default_fixed_step(fail_type='fixed'):
        # adaptive loop failed, fall back to the fixed step size
        gamma = 2. / (k + 2.)
        new_comps = copy.copy(comps)
        new_comps.append({'loc': mu_s, 'scale_diag': cov_s})
        new_weights = [(1. - gamma) * w for w in weights]
        new_weights.append(gamma)
        return {
            'gamma': 2. / (k + 2.),
            'l_estimate': l_prev,
            'weights': new_weights,
            'comps': new_comps,
            'gap': gap_pw,
            'step_type': fail_type
        }

    logger.info('Pairwise gap %.5f' % gap_pw)

    # Set $q_{t+1}$'s params
    new_locs = copy.copy(locs)
    new_diags = copy.copy(diags)
    new_locs.append(mu_s)
    new_diags.append(cov_s)

    gap = gap_pw
    if gap <= 0:
        return default_fixed_step()
    gamma_max = weights[index_v_t]
    step_type = 'adaptive'

    tau = FLAGS.exp_adafw
    eta = FLAGS.damping_adafw
    pow_tau = 1.0
    i, l_t = 0, l_prev
    f_t = kl_divergence(q_t, p, allow_nan_stats=False).eval()
    drop_step = False
    debug('f(q_t) = %.5f' % f_t)
    gamma = 2. / (k + 2)
    while gamma >= MIN_GAMMA and i < FLAGS.adafw_MAXITER:
        # compute $L_t$ and $\gamma_t$
        l_t = pow_tau * eta * l_prev
        gamma = min(gap / (l_t * d_t_norm), gamma_max)

        d_1 = -gamma * gap
        d_2 = gamma * gamma * l_t * d_t_norm / 2.
        debug('linear d1 = %.5f, quad d2 = %.5f' % (d_1, d_2))
        quad_bound_rhs = f_t + d_1 + d_2

        # construct $q_{t + 1}$: transfer gamma weight from v_t to s_t
        new_weights = copy.copy(weights)
        new_weights.append(gamma)
        if gamma == gamma_max:
            # hardcoding to 0 for precision issues
            new_weights[index_v_t] = 0
            drop_step = True
        else:
            new_weights[index_v_t] -= gamma
            drop_step = False

        qt_new = Mixture(
            cat=Categorical(probs=tf.convert_to_tensor(new_weights)),
            components=[
                MultivariateNormalDiag(loc=loc, scale_diag=diag)
                for loc, diag in zip(new_locs, new_diags)
            ])
        quad_bound_lhs = kl_divergence(qt_new, p,
                                       allow_nan_stats=False).eval()
        logger.info('lt = %.5f, gamma = %.3f, f_(qt_new) = %.5f, '
                    'linear extrapolated = %.5f' %
                    (l_t, gamma, quad_bound_lhs, quad_bound_rhs))
        if quad_bound_lhs <= quad_bound_rhs:
            new_comps = copy.copy(comps)
            new_comps.append({'loc': mu_s, 'scale_diag': cov_s})
            if drop_step:
                del new_comps[index_v_t]
                del new_weights[index_v_t]
                logger.info("...drop step")
                step_type = 'drop'
            return {
                'gamma': gamma,
                'l_estimate': l_t,
                'weights': new_weights,
                'comps': new_comps,
                'gap': gap,
                'step_type': step_type
            }
        pow_tau *= tau
        i += 1

    # gamma dropped below MIN_GAMMA
    logger.warning("gamma below threshold value, returning fixed step")
    return default_fixed_step("fixed_adaptive_MAXITER")
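# A quick sanity sketch (not from the source) of the pairwise weight
# transfer in adaptive_pfw: the new atom s_t gains weight gamma and the
# away atom v_t loses the same mass, so the weights keep summing to 1.
def pairwise_step_weights(weights, index_v_t, gamma):
    new_weights = list(weights) + [gamma]  # append weight for s_t
    new_weights[index_v_t] -= gamma        # remove the same mass from v_t
    return new_weights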