class Joint:
    '''Wrapper to handle calculating the log joint probability

        log p(y, w | X) = log [ p(y | X, w) * p(w) ]

    for a given sample of w. Should be the same as the slow version
    but vectorized and therefore faster.
    '''

    def __init__(self, Xtrain, ytrain, sess):
        self.Xtrain = Xtrain
        self.ytrain = ytrain
        self.sess = sess
        self.n_samples = 1000  # TODO this is hard coded and must be matched in elbo and fc.
        N, D = Xtrain.shape
        self.w = tf.placeholder(tf.float32, [D, self.n_samples])
        self.X = tf.placeholder(tf.float32, [N, D])
        #self.y = Bernoulli(logits=ed.dot(self.X, self.w))
        self.y = Bernoulli(logits=tf.matmul(self.X, self.w))
        self.prior = Normal(loc=tf.zeros([self.n_samples, D]),
                            scale=1.0 * tf.ones([self.n_samples, D]))  # TODO hard coded

    def log_prob(self, samples):
        copied_ytrain = np.repeat(self.ytrain[:, np.newaxis], self.n_samples, axis=1)
        per_sample = self.sess.run(self.y.log_prob(copied_ytrain),
                                   feed_dict={
                                       self.X: self.Xtrain,
                                       self.w: samples.T
                                   }).astype(np.float32)
        lik = np.sum(per_sample, axis=0)
        prior = np.sum(self.prior.log_prob(samples).eval(), axis=1)
        return lik + prior
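# --- Hedged usage sketch (not from the original source) ---------------------
# Drives the vectorized Joint above end to end; the data and weight samples
# below are made up. Assumes Edward 1.x / TensorFlow 1.x with an active
# default session, matching the snippet.
import numpy as np
import tensorflow as tf

sess = tf.InteractiveSession()
N, D = 40, 3
Xtrain = np.random.randn(N, D).astype(np.float32)
ytrain = np.random.randint(0, 2, size=N).astype(np.float32)

joint = Joint(Xtrain, ytrain, sess)
# One row per Monte Carlo sample of the weights w ~ p(w).
w_samples = np.random.randn(joint.n_samples, D).astype(np.float32)
log_joint = joint.log_prob(w_samples)  # numpy array of shape (n_samples,)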
class Joint:
    '''Wrapper to handle the joint probability p(UV, R_train)

        log p(UV, R_train) = log [ p(R_train | UV) * p(UV) ]
    '''

    def __init__(self, R_true, I_train, sess, D, N, M):
        """
        Args:
            R_true: full matrix
            I_train: training mask
        """
        self.n_samples = FLAGS.n_monte_carlo_samples
        self.R = tf.constant(R_true, dtype=tf.float32)
        self.I = tf.constant(I_train, dtype=tf.float32)
        self.D = D
        self.N = N
        self.M = M
        scale_uv = tf.concat([tf.ones([D, N]), tf.ones([D, M])], axis=1)
        mean_uv = tf.concat([tf.zeros([D, N]), tf.zeros([D, M])], axis=1)
        self.prior_UV = Normal(loc=mean_uv, scale=scale_uv)  # (D, N + M)

    def log_lik(self, sample_uv):
        """
        Args:
            sample_uv: single (D, (N + M)) sample from qUV
        Returns:
            tensor scalar of log likelihood
        """
        # constructed matrix dist. R ~ N(U'V, 1)
        pR = Normal(loc=tf.matmul(tf.transpose(sample_uv[:, :self.N]),
                                  sample_uv[:, self.N:]),
                    scale=tf.ones([self.N, self.M]))  # dist (N, M)
        full_log_likelihood = pR.log_prob(self.R)  # (N, M)
        # full_log_likelihood_t = full_log_likelihood.eval()
        train_log_likelihood = full_log_likelihood * self.I  # (N, M)
        log_lik = tf.reduce_sum(train_log_likelihood)  # ()
        return log_lik

    def log_prob(self, sample_uv):
        """
        Args:
            sample_uv: single (D, (N + M)) sample from qUV
        Returns:
            tensor scalar of log_prob
        """
        prior_batch = self.prior_UV.log_prob(sample_uv)  # (D, N + M)
        prior = tf.reduce_sum(prior_batch)
        ll = self.log_lik(sample_uv)
        #print('DEBUG values', prior.eval(), ll.eval())
        p_joint = prior + ll
        #return prior
        return p_joint

    def log_prob_batch(self, samples):
        """
        samples: (n_samples, D, N + M) tensor
        """
        raise NotImplementedError('what to do here? just run in a loop?')
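# --- Hedged sanity check (not from the original source) ---------------------
# A NumPy reference for Joint.log_lik above: the masked Gaussian log
# likelihood sum over observed entries of log N(R[n,m] | (U'V)[n,m], 1).
# All names below are made up for the check.
import numpy as np
from scipy import stats

D, N, M = 2, 4, 5
rng = np.random.RandomState(0)
UV = rng.randn(D, N + M)
R = rng.randn(N, M)
I = (rng.rand(N, M) < 0.8).astype(np.float64)  # training mask

R_hat = UV[:, :N].T @ UV[:, N:]  # (N, M) means, U'V
log_lik_np = np.sum(stats.norm.logpdf(R, loc=R_hat, scale=1.0) * I)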
class Joint:
    '''Wrapper to handle calculating the joint probability of the data

        log p(y, w | X) = log [ p(y | X, w) * p(w) ]
    '''

    def __init__(self, X, y, sess, n_samples, logger=None):
        """Initialize the distribution.

        Constructs the graph for evaluating joint probabilities of the data X
        and the weights (latent variables) w.

        Args:
            X: [N x D] data
            y: [N] target variable
            sess: tensorflow session
            n_samples: number of monte carlo samples to compute expectation
        """
        self.sess = sess
        self.n_samples = n_samples
        # (N, ) -> (N, n_samples)
        # np.tile(y[:, np.newaxis], (1, self.n_samples))
        y_matrix = np.repeat(y[:, np.newaxis], self.n_samples, axis=1)
        if logger is not None:
            self.logger = logger

        # Define the model graph
        N, D = X.shape
        self.X = tf.convert_to_tensor(X, dtype=tf.float32)
        self.Y = tf.convert_to_tensor(y_matrix, dtype=tf.float32)
        self.W = tf.get_variable('samples', (self.n_samples, D), tf.float32,
                                 initializer=tf.zeros_initializer())
        # (N, n_samples)
        self.py = Bernoulli(logits=tf.matmul(self.X, tf.transpose(self.W)))
        self.w_prior = Normal(loc=tf.zeros([self.n_samples, D], tf.float32),
                              scale=tf.ones([self.n_samples, D], tf.float32))
        # The prior log probability is summed across the D features:
        # [n_samples, D] -> [n_samples]
        self.prior = tf.reduce_sum(self.w_prior.log_prob(self.W), axis=1)
        log_likelihoods = self.py.log_prob(self.Y)  # (N, n_samples)
        self.ll = tf.reduce_sum(log_likelihoods, axis=0)  # (n_samples, )
        self.joint = self.ll + self.prior

    def log_prob(self, samples):
        """Log probability of samples, with X already given.

        Whether they come from the target distribution, the base
        distributions of the approximation, or individual atoms, `samples`
        are always samples of w.

        Args:
            samples: [self.n_samples x D] tensor
        Returns:
            [self.n_samples, ] joint log probability of samples, X, y
        """
        assert samples.shape[0] == self.n_samples, 'Different number of samples'
        self.sess.run(self.W.assign(samples))
        return self.joint
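# --- Hedged usage sketch (not from the original source) ---------------------
# Unlike the placeholder-based Joint further above, this variant assigns the
# samples into a tf.Variable and returns a tensor, so the caller runs the
# graph. Data and sample values below are made up.
import numpy as np
import tensorflow as tf

sess = tf.InteractiveSession()
N, D, S = 40, 3, 100
X = np.random.randn(N, D).astype(np.float32)
y = np.random.randint(0, 2, size=N).astype(np.float32)

joint = Joint(X, y, sess, n_samples=S)
sess.run(tf.global_variables_initializer())
w_samples = np.random.randn(S, D).astype(np.float32)
log_joint = sess.run(joint.log_prob(w_samples))  # shape (S,)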
def _test(mu, sigma, n):
    rv = Normal(mu=mu, sigma=sigma)
    rv_sample = rv.sample(n)
    x = rv_sample.eval()
    x_tf = tf.constant(x, dtype=tf.float32)
    mu = mu.eval()
    sigma = sigma.eval()
    assert np.allclose(rv.log_prob(x_tf).eval(),
                       stats.norm.logpdf(x, mu, sigma))
def clustering(self, x_data):
    mu_sample = self.qmu.sample(100)
    sigmasq_sample = self.qsigmasq.sample(100)
    x_post = Normal(loc=tf.ones([self.N, 1, 1, 1]) * mu_sample,
                    scale=tf.ones([self.N, 1, 1, 1]) * tf.sqrt(sigmasq_sample))
    x_broadcasted = tf.tile(tf.reshape(x_data, [self.N, 1, 1, self.D]),
                            [1, 100, self.K, 1])
    log_liks = x_post.log_prob(x_broadcasted)
    log_liks = tf.reduce_sum(log_liks, 3)
    log_liks = tf.reduce_mean(log_liks, 1)
    self.clusters = tf.argmax(log_liks, 1).eval()
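# --- Hedged shape check (not from the original source) ----------------------
# The broadcasting trick above, redone in NumPy: (N,1,1,1) * (S,K,D) gives
# (N,S,K,D); summing over D and averaging over the S posterior samples leaves
# an (N, K) matrix of per-point, per-cluster log likelihoods.
import numpy as np

N, S, K, D = 6, 100, 3, 2
mu_sample = np.random.randn(S, K, D)
loc = np.ones((N, 1, 1, 1)) * mu_sample              # (N, S, K, D)
x = np.random.randn(N, D)
x_b = np.tile(x.reshape(N, 1, 1, D), (1, S, K, 1))   # (N, S, K, D)
log_liks = -0.5 * (x_b - loc) ** 2 - 0.5 * np.log(2 * np.pi)  # unit scale
log_liks = log_liks.sum(axis=3).mean(axis=1)         # (N, K)
clusters = log_liks.argmax(axis=1)                   # (N,)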
init.run()
for _ in range(inference.n_iter):
    info_dict = inference.update()
    inference.print_progress(info_dict)
    t = info_dict['t']
    if t % inference.n_print == 0:
        print("Inferred cluster means:")
        print(sess.run(qmu.mean()))

# Calculate likelihood for each data point and cluster assignment,
# averaged over many posterior samples. ``x_post`` has shape (N, 100, K, D).
mu_sample = qmu.sample(100)
sigma_sample = qsigma.sample(100)
x_post = Normal(mu=tf.ones([N, 1, 1, 1]) * mu_sample,
                sigma=tf.ones([N, 1, 1, 1]) * sigma_sample)
x_broadcasted = tf.tile(tf.reshape(x_train, [N, 1, 1, D]), [1, 100, K, 1])

# Sum over latent dimension, then average over posterior samples.
# ``log_liks`` ends up with shape (N, K).
log_liks = x_post.log_prob(x_broadcasted)
log_liks = tf.reduce_sum(log_liks, 3)
log_liks = tf.reduce_mean(log_liks, 1)

# Choose the cluster with the highest likelihood for each data point.
clusters = tf.argmax(log_liks, 1).eval()

plt.scatter(x_train[:, 0], x_train[:, 1], c=clusters, cmap=cm.bwr)
plt.axis([-3, 3, -3, 3])
plt.title("Predicted cluster assignments")
plt.show()
data = {x1: x_ph_bin, x2: x_ph_cont, y: y_ph, qt: t_ph, t: t_ph, qy: y_ph}

# sample posterior predictive for p(y|z,t)
y_post = ed.copy(y, {z: qz, t: t_ph}, scope='y_post')
# crude approximation of the above
y_post_mean = ed.copy(y, {z: qz.mean(), t: t_ph}, scope='y_post_mean')

# construct a deterministic version (i.e. use the mean of the approximate
# posterior) of the lower bound for early stopping according to a validation set
y_post_eval = ed.copy(y, {z: qz.mean(), qt: t_ph, qy: y_ph, t: t_ph},
                      scope='y_post_eval')
x1_post_eval = ed.copy(x1, {z: qz.mean(), qt: t_ph, qy: y_ph}, scope='x1_post_eval')
x2_post_eval = ed.copy(x2, {z: qz.mean(), qt: t_ph, qy: y_ph}, scope='x2_post_eval')
t_post_eval = ed.copy(t, {z: qz.mean(), qt: t_ph, qy: y_ph}, scope='t_post_eval')

# losses
logp_valid = tf.reduce_mean(
    tf.reduce_sum(y_post_eval.log_prob(y_ph) + t_post_eval.log_prob(t_ph), axis=1) +
    tf.reduce_sum(x1_post_eval.log_prob(x_ph_bin), axis=1) +
    tf.reduce_sum(x2_post_eval.log_prob(x_ph_cont), axis=1) +
    tf.reduce_sum(z.log_prob(qz.mean()) - qz.log_prob(qz.mean()), axis=1))

inference = ed.KLqp({z: qz}, data)
optimizer = tf.train.AdamOptimizer(learning_rate=lr)
inference.initialize(optimizer=optimizer)

# saver and initializer before experiment
saver = tf.train.Saver(tf.contrib.slim.get_variables())
tf.global_variables_initializer().run()

# Load existing model
if load_model:
    print("Load model from: {}".format(load_model + '/{}-{}'.format(task, i)))
    saver.restore(sess, load_model + '/{}-{}'.format(task, i))

n_epoch, n_iter_per_epoch, idx = epochs, max(10 * int(xtr.shape[0] / batch_size), 1), np.arange(xtr.shape[0])

# dictionaries needed for evaluation
tr0, tr1 = np.zeros((xalltr.shape[0], 1)), np.ones((xalltr.shape[0], 1))
tr0t, tr1t = np.zeros((xte.shape[0], 1)), np.ones((xte.shape[0], 1))
f1 = {x_ph_bin: xalltr[:, 0:len(binfeats)], x_ph_cont: xalltr[:, len(binfeats):], t_ph: tr1}
yi_post_eval = ed.copy(yi, {zi: qzi.mean(), qti: ti_ph, qyi: yi_ph, ti: ti_ph},
                       scope='yi_post_eval')
yj_post_eval = ed.copy(yj, {zj: qzj.mean(), qtj: tj_ph, qyj: yj_ph, tj: tj_ph},
                       scope='yj_post_eval')
xi1_post_eval = ed.copy(xi1, {zi: qzi.mean(), qti: ti_ph, qyi: yi_ph}, scope='xi1_post_eval')
xi2_post_eval = ed.copy(xi2, {zi: qzi.mean(), qti: ti_ph, qyi: yi_ph}, scope='xi2_post_eval')
xj1_post_eval = ed.copy(xj1, {zj: qzj.mean(), qtj: tj_ph, qyj: yj_ph}, scope='xj1_post_eval')
xj2_post_eval = ed.copy(xj2, {zj: qzj.mean(), qtj: tj_ph, qyj: yj_ph}, scope='xj2_post_eval')
ti_post_eval = ed.copy(ti, {zi: qzi.mean(), qti: ti_ph, qyi: yi_ph}, scope='ti_post_eval')
tj_post_eval = ed.copy(tj, {zj: qzj.mean(), qtj: tj_ph, qyj: yj_ph}, scope='tj_post_eval')

logp_valid = tf.reduce_mean(
    tf.reduce_sum(yi_post_eval.log_prob(yi_ph) + ti_post_eval.log_prob(ti_ph), axis=1) +
    tf.reduce_sum(xi1_post_eval.log_prob(xi_ph_bin), axis=1) +
    tf.reduce_sum(xi2_post_eval.log_prob(xi_ph_cont), axis=1) +
    tf.reduce_sum(zi.log_prob(qzi.mean()) - qzi.log_prob(qzi.mean()), axis=1) +
    tf.reduce_sum(yj_post_eval.log_prob(yj_ph) + tj_post_eval.log_prob(tj_ph), axis=1) +
    tf.reduce_sum(xj1_post_eval.log_prob(xj_ph_bin), axis=1) +
    tf.reduce_sum(xj2_post_eval.log_prob(xj_ph_cont), axis=1) +
    tf.reduce_sum(zj.log_prob(qzj.mean()) - qzj.log_prob(qzj.mean()), axis=1))

# TODO: negative sampling...
# inference = ed.KLqp({zi: qzi, zj: qzj, zi: qzj, zj: qzi}, data)
# NOTE: a dict literal keeps only the last binding per key, so the repeated
# `zi` key below collapses the dict to {zi: qzj, zj: qzj}.
inference = ed.KLqp({zi: qzi, zj: qzj, zi: qzj}, data)
optimizer = tf.train.AdamOptimizer(learning_rate=args.lr)
inference.initialize(optimizer=optimizer)

saver = tf.train.Saver(tf.contrib.slim.get_variables())
tf.global_variables_initializer().run()
def cevae_tf(X, T, Y, n_epochs=100, early_stop=10, d_cevae=20):
    T, Y = T.reshape((-1, 1)), Y.reshape((-1, 1))

    args = dict()
    args['earl'] = early_stop
    args['lr'] = 0.001
    args['opt'] = 'adam'
    args['epochs'] = n_epochs
    args['print_every'] = 10
    args['true_post'] = True

    M = None  # batch size during training
    d = d_cevae  # latent dimension
    lamba = 1e-4  # weight decay
    nh, h = 3, 200  # number and size of hidden layers

    contfeats = list(range(X.shape[1]))  # all continuous
    binfeats = []

    # need for early stopping
    xtr, xva, ttr, tva, ytr, yva = train_test_split(X, T, Y)

    # zero mean, unit variance for y during training
    ym, ys = np.mean(Y), np.std(Y)
    ytr, yva = (ytr - ym) / ys, (yva - ym) / ys
    best_logpvalid = -np.inf

    with tf.Graph().as_default():
        sess = tf.InteractiveSession()

        ed.set_seed(1)
        np.random.seed(1)
        tf.set_random_seed(1)

        # x_ph_bin = tf.placeholder(tf.float32, [M, len(binfeats)], name='x_bin')  # binary inputs
        x_ph_cont = tf.placeholder(tf.float32, [M, len(contfeats)], name='x_cont')  # continuous inputs
        t_ph = tf.placeholder(tf.float32, [M, 1])
        y_ph = tf.placeholder(tf.float32, [M, 1])

        # x_ph = tf.concat([x_ph_bin, x_ph_cont], 1)
        x_ph = x_ph_cont
        activation = tf.nn.elu

        # CEVAE model (decoder)
        # p(z)
        z = Normal(loc=tf.zeros([tf.shape(x_ph)[0], d]),
                   scale=tf.ones([tf.shape(x_ph)[0], d]))

        # p(x|z)
        hx = fc_net(z, (nh - 1) * [h], [], 'px_z_shared', lamba=lamba, activation=activation)
        # logits = fc_net(hx, [h], [[len(binfeats), None]], 'px_z_bin', lamba=lamba, activation=activation)
        # x1 = Bernoulli(logits=logits, dtype=tf.float32, name='bernoulli_px_z')
        mu, sigma = fc_net(hx, [h], [[len(contfeats), None], [len(contfeats), tf.nn.softplus]],
                           'px_z_cont', lamba=lamba, activation=activation)
        x2 = Normal(loc=mu, scale=sigma, name='gaussian_px_z')

        # p(t|z)
        logits = fc_net(z, [h], [[1, None]], 'pt_z', lamba=lamba, activation=activation)
        t = Bernoulli(logits=logits, dtype=tf.float32)

        # p(y|t,z)
        mu2_t0 = fc_net(z, nh * [h], [[1, None]], 'py_t0z', lamba=lamba, activation=activation)
        mu2_t1 = fc_net(z, nh * [h], [[1, None]], 'py_t1z', lamba=lamba, activation=activation)
        y = Normal(loc=t * mu2_t1 + (1. - t) * mu2_t0, scale=tf.ones_like(mu2_t0))

        # CEVAE variational approximation (encoder)
        # q(t|x)
        logits_t = fc_net(x_ph, [d], [[1, None]], 'qt', lamba=lamba, activation=activation)
        qt = Bernoulli(logits=logits_t, dtype=tf.float32)

        # q(y|x,t)
        hqy = fc_net(x_ph, (nh - 1) * [h], [], 'qy_xt_shared', lamba=lamba, activation=activation)
        mu_qy_t0 = fc_net(hqy, [h], [[1, None]], 'qy_xt0', lamba=lamba, activation=activation)
        mu_qy_t1 = fc_net(hqy, [h], [[1, None]], 'qy_xt1', lamba=lamba, activation=activation)
        qy = Normal(loc=qt * mu_qy_t1 + (1. - qt) * mu_qy_t0, scale=tf.ones_like(mu_qy_t0))

        # q(z|x,t,y)
        inpt2 = tf.concat([x_ph, qy], 1)
        hqz = fc_net(inpt2, (nh - 1) * [h], [], 'qz_xty_shared', lamba=lamba, activation=activation)
        muq_t0, sigmaq_t0 = fc_net(hqz, [h], [[d, None], [d, tf.nn.softplus]], 'qz_xt0',
                                   lamba=lamba, activation=activation)
        muq_t1, sigmaq_t1 = fc_net(hqz, [h], [[d, None], [d, tf.nn.softplus]], 'qz_xt1',
                                   lamba=lamba, activation=activation)
        qz = Normal(loc=qt * muq_t1 + (1. - qt) * muq_t0,
                    scale=qt * sigmaq_t1 + (1. - qt) * sigmaq_t0)

        # Create data dictionary for edward
        data = {x2: x_ph_cont, y: y_ph, qt: t_ph, t: t_ph, qy: y_ph}

        # sample posterior predictive for p(y|z,t)
        y_post = ed.copy(y, {z: qz, t: t_ph}, scope='y_post')
        # crude approximation of the above
        y_post_mean = ed.copy(y, {z: qz.mean(), t: t_ph}, scope='y_post_mean')

        # construct a deterministic version (i.e. use the mean of the approximate
        # posterior) of the lower bound for early stopping according to a validation set
        y_post_eval = ed.copy(y, {z: qz.mean(), qt: t_ph, qy: y_ph, t: t_ph}, scope='y_post_eval')
        # x1_post_eval = ed.copy(x1, {z: qz.mean(), qt: t_ph, qy: y_ph}, scope='x1_post_eval')
        x2_post_eval = ed.copy(x2, {z: qz.mean(), qt: t_ph, qy: y_ph}, scope='x2_post_eval')
        t_post_eval = ed.copy(t, {z: qz.mean(), qt: t_ph, qy: y_ph}, scope='t_post_eval')

        logp_valid = tf.reduce_mean(
            tf.reduce_sum(y_post_eval.log_prob(y_ph) + t_post_eval.log_prob(t_ph), axis=1) +
            tf.reduce_sum(x2_post_eval.log_prob(x_ph_cont), axis=1) +
            tf.reduce_sum(z.log_prob(qz.mean()) - qz.log_prob(qz.mean()), axis=1))

        inference = ed.KLqp({z: qz}, data)
        optimizer = tf.train.AdamOptimizer(learning_rate=args['lr'])
        inference.initialize(optimizer=optimizer)

        saver = tf.train.Saver(tf.contrib.slim.get_variables())
        tf.global_variables_initializer().run()

        n_epoch, n_iter_per_epoch, idx = args['epochs'], 10 * int(xtr.shape[0] / 100), np.arange(xtr.shape[0])

        # dictionaries needed for evaluation
        t0, t1 = np.zeros((X.shape[0], 1)), np.ones((X.shape[0], 1))
        # tr0t, tr1t = np.zeros((xte.shape[0], 1)), np.ones((xte.shape[0], 1))
        f1 = {x_ph_cont: X, t_ph: t1}
        f0 = {x_ph_cont: X, t_ph: t0}
        # f1t = {x_ph_bin: xte[:, 0:len(binfeats)], x_ph_cont: xte[:, len(binfeats):], t_ph: tr1t}
        # f0t = {x_ph_bin: xte[:, 0:len(binfeats)], x_ph_cont: xte[:, len(binfeats):], t_ph: tr0t}

        for epoch in range(n_epoch):
            avg_loss = 0.0
            widgets = ["epoch #%d|" % epoch, Percentage(), Bar(), ETA()]
            pbar = ProgressBar(n_iter_per_epoch, widgets=widgets)
            pbar.start()
            np.random.shuffle(idx)
            for j in range(n_iter_per_epoch):
                # print('j', j)
                # pbar.update(j)
                batch = np.random.choice(idx, 100)
                x_train, y_train, t_train = xtr[batch], ytr[batch], ttr[batch]
                info_dict = inference.update(feed_dict={x_ph_cont: x_train,
                                                        t_ph: t_train,
                                                        y_ph: y_train})
                avg_loss += info_dict['loss']

            avg_loss = avg_loss / n_iter_per_epoch
            avg_loss = avg_loss / 100

            if epoch % args['earl'] == 0 or epoch == (n_epoch - 1):
                logpvalid = sess.run(logp_valid, feed_dict={x_ph_cont: xva,
                                                            t_ph: tva,
                                                            y_ph: yva})
                if logpvalid >= best_logpvalid:
                    print('Improved validation bound, old: {:0.3f}, new: {:0.3f}'.format(
                        best_logpvalid, logpvalid))
                    best_logpvalid = logpvalid
                    saver.save(sess, 'data/cevae_models/dlvm')

        saver.restore(sess, 'data/cevae_models/dlvm')
        y0, y1 = get_y0_y1(sess, y_post, f0, f1, shape=Y.shape, L=100)
        y0, y1 = y0 * ys + ym, y1 * ys + ym
        sess.close()

    return y0.reshape((-1)), y1.reshape((-1))
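# --- Hedged usage sketch (not from the original source) ---------------------
# Drives cevae_tf above end to end on toy data; shapes follow its signature
# (X continuous covariates, T binary treatment, Y continuous outcome).
import numpy as np

n, p = 500, 10
X = np.random.randn(n, p).astype(np.float32)
T = np.random.binomial(1, 0.5, size=n).astype(np.float32)
Y = (X[:, 0] + T + 0.1 * np.random.randn(n)).astype(np.float32)

y0_hat, y1_hat = cevae_tf(X, T, Y, n_epochs=20)
ite_hat = y1_hat - y0_hat  # estimated individual treatment effects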
        data = {x1: x_ph_bin, x2: x_ph_cont, y: y_ph, qt: t_ph, t: t_ph, qy: y_ph}

        # sample posterior predictive for p(y|z,t)
        y_post = ed.copy(y, {z: qz, t: t_ph}, scope='y_post')
        # crude approximation of the above
        y_post_mean = ed.copy(y, {z: qz.mean(), t: t_ph}, scope='y_post_mean')

        # construct a deterministic version (i.e. use the mean of the approximate
        # posterior) of the lower bound for early stopping according to a validation set
        y_post_eval = ed.copy(y, {z: qz.mean(), qt: t_ph, qy: y_ph, t: t_ph}, scope='y_post_eval')
        x1_post_eval = ed.copy(x1, {z: qz.mean(), qt: t_ph, qy: y_ph}, scope='x1_post_eval')
        x2_post_eval = ed.copy(x2, {z: qz.mean(), qt: t_ph, qy: y_ph}, scope='x2_post_eval')
        t_post_eval = ed.copy(t, {z: qz.mean(), qt: t_ph, qy: y_ph}, scope='t_post_eval')

        logp_valid = tf.reduce_mean(
            tf.reduce_sum(y_post_eval.log_prob(y_ph) + t_post_eval.log_prob(t_ph), axis=1) +
            tf.reduce_sum(x1_post_eval.log_prob(x_ph_bin), axis=1) +
            tf.reduce_sum(x2_post_eval.log_prob(x_ph_cont), axis=1) +
            tf.reduce_sum(z.log_prob(qz.mean()) - qz.log_prob(qz.mean()), axis=1))

        z_learned = ed.copy(qz, {x1: x_ph_bin, x2: x_ph_cont})  # for matching
        inference = ed.KLqp({z: qz}, data)

    # -------------------------------------------------------------------------
    elif model_type == 'separated':
        # CEVAE model (decoder)
        n_ph = tf.shape(x_ph)[0]  # number of samples fed to placeholders
        latent_dims = (z_t_dim, z_y_dim)

        # prior over latent variables:
        # p(zx)
        # zx = Normal(loc=tf.zeros([n_ph, z_x_dim]), scale=tf.ones([n_ph, z_x_dim]))
        # p(zt)
        zt = Normal(loc=tf.zeros([n_ph, z_t_dim]), scale=tf.ones([n_ph, z_t_dim]))
def __init__(self, n, xdim, n_mixtures=5, mc_samples=500):
    # Compute the shape dynamically from placeholders
    self.x_ph = tf.placeholder(tf.float32, [None, xdim])
    self.k = k = n_mixtures
    self.batch_size = n
    self.d = d = xdim
    self.sample_size = tf.placeholder(tf.int32, ())

    # Build the priors over membership probabilities and mixture parameters
    with tf.variable_scope("priors"):
        pi = Dirichlet(tf.ones(k))
        mu = Normal(tf.zeros(d), tf.ones(d), sample_shape=k)
        sigmasq = InverseGamma(tf.ones(d), tf.ones(d), sample_shape=k)

    # Build the conditional mixture model
    with tf.variable_scope("likelihood"):
        x = ParamMixture(pi, {'loc': mu, 'scale_diag': tf.sqrt(sigmasq)},
                         MultivariateNormalDiag, sample_shape=n)
        z = x.cat

    # Build approximate posteriors as Empirical samples
    t = mc_samples
    with tf.variable_scope("posteriors_samples"):
        qpi = Empirical(tf.get_variable(
            "qpi/params", [t, k],
            initializer=tf.constant_initializer(1.0 / k)))
        qmu = Empirical(tf.get_variable(
            "qmu/params", [t, k, d],
            initializer=tf.zeros_initializer()))
        qsigmasq = Empirical(tf.get_variable(
            "qsigmasq/params", [t, k, d],
            initializer=tf.ones_initializer()))
        qz = Empirical(tf.get_variable(
            "qz/params", [t, n],
            initializer=tf.zeros_initializer(),
            dtype=tf.int32))

    # Build inference graph using Gibbs and conditionals
    with tf.variable_scope("inference"):
        self.inference = ed.Gibbs({pi: qpi, mu: qmu, sigmasq: qsigmasq, z: qz},
                                  data={x: self.x_ph})
        self.inference.initialize()

    # Build predictive posterior graph by taking samples
    n_samples = self.sample_size
    with tf.variable_scope("posterior"):
        mu_smpl = qmu.sample(n_samples)  # shape: [n_samples, k, d]
        sigmasq_smpl = qsigmasq.sample(n_samples)
        x_post = Normal(
            loc=tf.ones((n, 1, 1, 1)) * mu_smpl,
            scale=tf.ones((n, 1, 1, 1)) * tf.sqrt(sigmasq_smpl))
        # NOTE: x_ph has shape [n, d]
        x_broadcasted = tf.tile(
            tf.reshape(self.x_ph, (n, 1, 1, d)),
            (1, n_samples, k, 1))
        x_ll = x_post.log_prob(x_broadcasted)
        x_ll = tf.reduce_sum(x_ll, axis=3)
        x_ll = tf.reduce_mean(x_ll, axis=1)

    self.sample_t_ph = tf.placeholder(tf.int32, ())
    self.eval_ops = {
        'generative_post': x_post,
        'qmu': qmu,
        'qsigma': qsigmasq,
        'post_running_mu': tf.reduce_mean(
            qmu.params[:self.sample_t_ph], axis=0),
        'post_log_prob': x_ll,
    }
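# --- Hedged usage sketch (not from the original source) ---------------------
# Assumes the __init__ above belongs to a class we call `GMMGibbs` here (a
# hypothetical name) and drives one Gibbs run on made-up data.
import numpy as np
import tensorflow as tf

n, xdim = 200, 2
x_data = np.random.randn(n, xdim).astype(np.float32)

model = GMMGibbs(n, xdim, n_mixtures=3, mc_samples=500)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(model.inference.n_iter):
        info_dict = model.inference.update(feed_dict={model.x_ph: x_data})
        model.inference.print_progress(info_dict)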
# for early stopping according to a validation set
y_post_eval = ed.copy(y, {z: qz.mean(), y: y_ph, t: t_ph}, scope='y_post_eval')
t_post_eval = ed.copy(t, {z: qz.mean(), y: y_ph}, scope='t_post_eval')

log_valid = tf.reduce_mean(
    tf.reduce_sum(y_post_eval.log_prob(y_ph) + t_post_eval.log_prob(t_ph), axis=1) +
    tf.reduce_sum(z.log_prob(qz.mean()) - qz.log_prob(qz.mean()), axis=1))

tf.global_variables_initializer().run()

# Information bottleneck control parameter
BETA = 16671.79  # 257.83 # 2753.05 # 9268.75 # 4806.3 # 16671.79

# Latent Loss
info_loss = tf.reduce_sum(tf.contrib.distributions.kl_divergence(qz, z))

# Log-Likelihood
class_loss = -BETA * tf.reduce_sum(
    y_post.log_prob(y_ph) + t_post.log_prob(t_ph), axis=1)
Gibbs_inference_elapsedTime = time.time() - Gibbs_inference_startTime

posterior_mu = qmu.params.eval().mean(axis=0)

# Calculate likelihood for each data point and cluster assignment,
# averaged over many posterior samples. ``x_post`` has shape (N, M, K, D).
print("Sampling from Posterior...")
mu_sample = qmu.sample(M)
sigmasq_sample = qsigma.sample(M)
pi_sample = qpi.sample(M)
x_post = Normal(loc=tf.ones([N, 1, 1, 1]) * mu_sample,
                scale=tf.ones([N, 1, 1, 1]) * tf.sqrt(sigmasq_sample))
x_broadcasted = tf.tile(tf.reshape(train_img, [N, 1, 1, D]), [1, M, K, 1])
x_broadcasted = tf.cast(x_broadcasted, dtype=tf.float32)

# Sum over latent dimension, then average over posterior samples.
# ``log_liks`` ends up with shape (N, K).
log_liks = tf.reduce_mean(tf.reduce_sum(x_post.log_prob(x_broadcasted), 3), 1)

print("Calculating Cluster Assignment...")
clusters = tf.argmax(log_liks, 1).eval()

result_img_dirs = '../tmp/img_result/{}'.format(current_time)
os.makedirs(result_img_dirs)
plt.hist(clusters)
plt.savefig(
    '../tmp/img_result/{}/cluster_dist_img={}_K={}_T={}_Time={}.png'.format(
        current_time, img_no, K, T, current_time))

result_cluster_assign_dirs = '../tmp/log/cluster_assign_matrix'
if not os.path.isdir(result_cluster_assign_dirs):
    os.makedirs(result_cluster_assign_dirs)
np.save(
    result_cluster_assign_dirs +
info_dict = inference.update()
inference.print_progress(info_dict)
t = info_dict['t']
if t % inference.n_print == 0:
    print("Inferred cluster means:")
    print(sess.run(qmu.value()))

# Average per-cluster and per-data point likelihood over many posterior samples.
log_liks = []
for _ in range(100):
    mu_sample = qmu.sample()
    sigma_sample = qsigma.sample()
    # Take per-cluster and per-data point likelihood.
    log_lik = []
    for k in range(K):
        x_post = Normal(mu=tf.ones([N, 1]) * tf.gather(mu_sample, k),
                        sigma=tf.ones([N, 1]) * tf.gather(sigma_sample, k))
        log_lik.append(tf.reduce_sum(x_post.log_prob(x_train), 1))
    log_lik = tf.pack(log_lik)  # has shape (K, N)
    log_liks.append(log_lik)

log_liks = tf.reduce_mean(log_liks, 0)

# Choose the cluster with the highest likelihood for each data point.
clusters = tf.argmax(log_liks, 0).eval()

plt.scatter(x_train[:, 0], x_train[:, 1], c=clusters, cmap=cm.bwr)
plt.axis([-3, 3, -3, 3])
plt.title("Predicted cluster assignments")
plt.show()
def __init__(self, d, K, sig, sess, logdir):
    self.K = K
    self.sig = sig
    self.sess = sess
    self.logdir = logdir

    with tf.name_scope('model'):
        # Data Placeholder
        with tf.name_scope('input'):
            self.placeholders = tf.placeholder(tf.int32)
            self.words = self.placeholders

        # Index Masks
        with tf.name_scope('context_mask'):
            self.p_mask = tf.cast(
                tf.range(d.cs / 2, d.n_minibatch + d.cs / 2), tf.int32)
            rows = tf.cast(
                tf.tile(tf.expand_dims(tf.range(0, d.cs / 2), [0]),
                        [d.n_minibatch, 1]), tf.int32)
            columns = tf.cast(
                tf.tile(tf.expand_dims(tf.range(0, d.n_minibatch), [1]),
                        [1, d.cs / 2]), tf.int32)
            self.ctx_mask = tf.concat(
                [rows + columns, rows + columns + d.cs / 2 + 1], 1)

        with tf.name_scope('embeddings'):
            # Embedding vectors
            self.rho = tf.Variable(tf.random_normal([d.L, self.K]) / self.K,
                                   name='rho')
            # Context vectors
            self.alpha = tf.Variable(tf.random_normal([d.L, self.K]) / self.K,
                                     name='alpha')

        with tf.name_scope('priors'):
            prior = Normal(loc=0.0, scale=self.sig)
            self.log_prior = tf.reduce_sum(
                prior.log_prob(self.rho) + prior.log_prob(self.alpha))

        with tf.name_scope('natural_param'):
            # Target and Context Indices
            with tf.name_scope('target_word'):
                self.p_idx = tf.gather(self.words, self.p_mask)
                self.p_rho = tf.squeeze(tf.gather(self.rho, self.p_idx))

            # Negative samples
            with tf.name_scope('negative_samples'):
                unigram_logits = tf.tile(
                    tf.expand_dims(tf.log(tf.constant(d.unigram)), [0]),
                    [d.n_minibatch, 1])
                self.n_idx = tf.multinomial(unigram_logits, d.ns)
                self.n_rho = tf.gather(self.rho, self.n_idx)

            with tf.name_scope('context'):
                self.ctx_idx = tf.squeeze(tf.gather(self.words, self.ctx_mask))
                self.ctx_alphas = tf.gather(self.alpha, self.ctx_idx)

            # Natural parameter
            ctx_sum = tf.reduce_sum(self.ctx_alphas, [1])
            self.p_eta = tf.expand_dims(
                tf.reduce_sum(tf.multiply(self.p_rho, ctx_sum), -1), 1)
            self.n_eta = tf.reduce_sum(
                tf.multiply(
                    self.n_rho,
                    tf.tile(tf.expand_dims(ctx_sum, 1), [1, d.ns, 1])), -1)

        # Conditional likelihood
        self.y_pos = Bernoulli(logits=self.p_eta)
        self.y_neg = Bernoulli(logits=self.n_eta)

        self.ll_pos = tf.reduce_sum(self.y_pos.log_prob(1.0))
        self.ll_neg = tf.reduce_sum(self.y_neg.log_prob(0.0))

        self.log_likelihood = self.ll_pos + self.ll_neg

        scale = 1.0 * d.N / d.n_minibatch
        self.loss = -(scale * self.log_likelihood + self.log_prior)

        # Training
        optimizer = tf.train.AdamOptimizer()
        self.train = optimizer.minimize(self.loss)

    with self.sess.as_default():
        tf.global_variables_initializer().run()

    variable_summaries('rho', self.rho)
    variable_summaries('alpha', self.alpha)
    with tf.name_scope('objective'):
        tf.summary.scalar('loss', self.loss)
        tf.summary.scalar('priors', self.log_prior)
        tf.summary.scalar('ll_pos', self.ll_pos)
        tf.summary.scalar('ll_neg', self.ll_neg)
    self.summaries = tf.summary.merge_all()

    self.train_writer = tf.summary.FileWriter(self.logdir, self.sess.graph)
    self.saver = tf.train.Saver()

    config = projector.ProjectorConfig()
    alpha = config.embeddings.add()
    alpha.tensor_name = 'model/embeddings/alpha'
    alpha.metadata_path = '../vocab.tsv'
    rho = config.embeddings.add()
    rho.tensor_name = 'model/embeddings/rho'
    rho.metadata_path = '../vocab.tsv'
    projector.visualize_embeddings(self.train_writer, config)
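# --- Hedged math check (not from the original source) -----------------------
# The negative-sampling objective above uses Bernoulli(logits=eta); its log
# probabilities reduce to softplus terms, which this NumPy check verifies:
#   log p(y=1) = log sigmoid(eta)       = -softplus(-eta)
#   log p(y=0) = log(1 - sigmoid(eta))  = -softplus(eta)
import numpy as np

def softplus(x):
    # numerically stable log(1 + exp(x))
    return np.log1p(np.exp(-np.abs(x))) + np.maximum(x, 0.0)

eta = np.random.randn(5)
sig = 1.0 / (1.0 + np.exp(-eta))
assert np.allclose(np.log(sig), -softplus(-eta))
assert np.allclose(np.log(1.0 - sig), -softplus(eta))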