def m_step_op(self, prior, samples, r_nk, step_size):
    """
    Update parameters as the M-step of an EM process.

    Args:
        prior: Dirichlet+NiW prior with natural parameters (alpha, A, b, beta, v_hat)
        samples: latent samples; a 1-dim latent variable is assumed
        r_nk: responsibilities
        step_size: interpolation weight for blending the updated parameters
            into the current ones
    """
    # Bishop eq 10.51
    N_k = tf.reduce_sum(r_nk, axis=0)

    # Bishop eq 10.52 (per-sample statistics; averaged over the batch below)
    xbar_k = (tf.expand_dims(r_nk, axis=-1) * samples) / tf.expand_dims(N_k, axis=-1)

    # Bishop eq 10.53
    x_xk = tf.reshape(samples - xbar_k, samples.get_shape().as_list() + [-1])
    S_k = tf.expand_dims(tf.expand_dims(r_nk, -1), -1) * tf.matmul(x_xk, x_xk,
                                                                   transpose_b=True)

    # rename for clarity
    beta, m, C, v = niw.natural_to_standard(prior.A, prior.b, prior.beta,
                                            prior.v_hat)
    alpha_0 = prior.alpha
    m_0 = m
    beta_0 = beta
    v_0 = v
    W_0 = C

    # Bishop eq 10.58
    alpha_k = alpha_0 + N_k
    # Bishop eq 10.60
    beta_k = beta_0 + N_k
    # Bishop eq 10.61
    m_k = (tf.expand_dims(beta_0, -1) * m_0
           + tf.expand_dims(N_k, axis=-1) * xbar_k) / tf.expand_dims(beta_k, -1)
    # Bishop eq 10.62
    W_k_2nd = tf.expand_dims(tf.expand_dims(N_k, axis=-1), -1) * S_k
    xbar_m0 = tf.reshape(xbar_k - m_0, xbar_k.get_shape().as_list() + [-1])
    W_k_3rd = tf.expand_dims(
        tf.expand_dims((beta_0 * N_k) / (beta_0 + N_k), -1), -1) * tf.matmul(
            xbar_m0, xbar_m0, transpose_b=True)
    W_k = W_0 + W_k_2nd + W_k_3rd
    # Bishop eq 10.63
    v_k = v_0 + N_k

    # create update op: blend the new (batch-averaged) natural parameters into
    # the current ones with the given step size
    current_vars = (self.alpha, self.A, self.b, self.beta, self.v_hat)
    updated_params = (alpha_k,) + niw.standard_to_natural(
        beta_k, tf.reduce_mean(m_k, axis=0), tf.reduce_mean(W_k, axis=0), v_k)
    return tf.group([
        tf.assign(initial, (1 - step_size) * initial + step_size * updated)
        for initial, updated in zip(current_vars, updated_params)
    ])
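# For reference, the update above implements the variational M-step equations
# cited from Bishop, PRML, Section 10.2 (here the variables C / W_k hold the
# NiW scale matrix, i.e. Bishop's W^{-1}):
#
#   N_k     = sum_n r_nk                                                 (10.51)
#   xbar_k  = (1 / N_k) sum_n r_nk x_n                                   (10.52)
#   S_k     = (1 / N_k) sum_n r_nk (x_n - xbar_k)(x_n - xbar_k)^T        (10.53)
#   alpha_k = alpha_0 + N_k                                              (10.58)
#   beta_k  = beta_0 + N_k                                               (10.60)
#   m_k     = (beta_0 m_0 + N_k xbar_k) / beta_k                         (10.61)
#   W_k^-1  = W_0^-1 + N_k S_k
#             + (beta_0 N_k) / (beta_0 + N_k) (xbar_k - m_0)(xbar_k - m_0)^T  (10.62)
#   v_k     = v_0 + N_k                                                  (10.63)
#
# The code keeps per-sample statistics and averages m_k and W_k over the batch
# before the (1 - step_size) / step_size blending.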
def make_loc_scale_variables(theta, param_device='/gpu:0', name='copy_m_v'):
    # create location/scale variables for point estimates
    with tf.name_scope(name):
        theta_copied = niw.natural_to_standard(tf.identity(theta[1]),
                                               tf.identity(theta[2]),
                                               tf.identity(theta[3]),
                                               tf.identity(theta[4]))
        mu_k_init, sigma_k = niw.expected_values(theta_copied)
        L_k_init = tf.cholesky(sigma_k)

        mu_k = variable_on_device('mu_k', shape=None, initializer=mu_k_init,
                                  trainable=True, device=param_device)
        L_k = variable_on_device('L_k', shape=None, initializer=L_k_init,
                                 trainable=True, device=param_device)
    return mu_k, L_k
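# A minimal usage sketch (variable names and the zero initialiser are
# hypothetical): the point-estimate variables returned here are typically
# combined with trainable mixture weights into the recognition-GMM parameters,
# mirroring the phi_gmm construction in visualize_svae further below.
#
#   mu_k, L_k = make_loc_scale_variables(theta, param_device='/cpu:0')
#   log_pi_k = variable_on_device('log_pi_k', shape=None,
#                                 initializer=tf.zeros(K),  # K = nb components
#                                 trainable=True, device='/cpu:0')
#   phi_gmm = (mu_k, L_k, log_pi_k)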
def inference(x, K, seed, name='inference'):
    """
    Args:
        x: data; shape = N, D
        K: number of components
        seed: random seed
        name: tensorflow name scope

    Returns:
        EM update op, log responsibilities, global parameters theta and the
        auxiliary statistics (x_k, S_k, pi)
    """
    with tf.name_scope(name):
        N, D = x.get_shape().as_list()

        with tf.name_scope('init_responsibilities'):
            r_nk = tf.Variable(
                tf.contrib.distributions.Dirichlet(tf.ones(K)).sample(N, seed=seed),
                dtype=tf.float32, name='r_nk')

        with tf.name_scope('init_prior'):
            alpha, A, b, beta, v_hat = svae.init_mm_params(K, D,
                                                           alpha_scale=0.05 / K,
                                                           beta_scale=0.5,
                                                           m_scale=0,
                                                           C_scale=D + 0.5,
                                                           v_init=D + 0.5,
                                                           seed=seed,
                                                           name='prior',
                                                           trainable=False)
            beta_0, m_0, C_0, v_0 = niw.natural_to_standard(A, b, beta, v_hat)
            alpha_0 = dirichlet.natural_to_standard(alpha)

        with tf.name_scope('em_algorithm'):
            alpha_k, beta_k, m_k, C_k, v_k, x_k, S_k = m_step(x, r_nk, alpha_0,
                                                              beta_0, m_0, C_0, v_0)
            P_k = tf.matrix_inverse(C_k)
            r_nk_new, pi = e_step(x, alpha_k, beta_k, m_k, P_k, v_k)

            step = r_nk.assign(r_nk_new)

            theta = tf.tuple((alpha_k, beta_k, m_k, C_k, v_k), name='theta')
            log_r_nk = tf.log(r_nk_new)

        return step, log_r_nk, theta, (x_k, S_k, pi)
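# A minimal sketch of how the GMM inference graph above might be driven with
# the usual TF1-style session workflow (the toy data tensor and iteration count
# are assumptions, not part of this repo):
#
#   x = tf.constant(np.random.randn(500, 2).astype(np.float32))
#   step, log_r_nk, theta, _ = inference(x, K=5, seed=0)
#
#   with tf.Session() as sess:
#       sess.run(tf.global_variables_initializer())
#       for _ in range(100):   # coordinate-ascent VEM updates
#           sess.run(step)     # recomputes theta and assigns the new r_nk
#       alpha_k, beta_k, m_k, C_k, v_k = sess.run(theta)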
def inference(x, K, kappa_init, seed, name='inference'):
    """
    Args:
        x: data; shape = N, D
        K: number of components
        kappa_init: initial Student-t degrees of freedom
        seed: random seed
        name: tensorflow name scope

    Returns:
        EM update op, log responsibilities, global parameters theta and the
        auxiliary statistics (x_k, S_k, pi)
    """
    with tf.name_scope(name):
        N, D = x.get_shape().as_list()

        with tf.name_scope('init_local_vars'):
            r_nk = tf.Variable(
                tf.contrib.distributions.Dirichlet(tf.ones(K)).sample(N, seed=seed),
                dtype=tf.float32, name='r_nk')
            u_nk = tf.Variable(tf.ones((N, K)), dtype=tf.float32, name='u_nk')

        with tf.name_scope('init_prior'):
            # returns Dirichlet+NiW natural parameters
            alpha, A, b, beta, v_hat = svae.init_mm_params(K, D,
                                                           alpha_scale=0.05 / K,
                                                           beta_scale=0.5,
                                                           m_scale=0,
                                                           C_scale=D + 0.5,
                                                           v_init=D + 0.5,
                                                           seed=seed,
                                                           name='prior',
                                                           trainable=False)
            beta_0, m_0, C_0, v_0 = niw.natural_to_standard(A, b, beta, v_hat)
            alpha_0 = dirichlet.natural_to_standard(alpha)
            kappa_k = kappa_init * tf.ones(K)  # Student-t degrees of freedom

        with tf.name_scope('em_algorithm'):
            alpha_k, beta_k, m_k, C_k, v_k, x_k, S_k = m_step(x, r_nk, u_nk,
                                                              alpha_0, beta_0,
                                                              m_0, C_0, v_0)
            P_k = tf.matrix_inverse(C_k)
            r_nk_new, u_nk_new, pi = e_step(x, alpha_k, beta_k, m_k, P_k, v_k,
                                            kappa_k)

            # define step: update r_nk and u_nk
            step = tf.group(r_nk.assign(r_nk_new), u_nk.assign(u_nk_new))

            # group global parameters
            theta = tf.tuple((alpha_k, beta_k, m_k, C_k, v_k, kappa_k), name='theta')
            log_r_nk = tf.log(r_nk_new)

        return step, log_r_nk, theta, (x_k, S_k, pi)
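# For reference: in the Student-t mixture the per-point scale u_nk has a Gamma
# posterior, so (assuming e_step follows the standard SMM update, e.g. Peel &
# McLachlan) the value assigned to u_nk above is its expectation
#
#   E[u_nk] = (kappa_k + D) / (kappa_k + delta_nk),
#
# where delta_nk is the (expected) squared Mahalanobis distance of x_n from
# component k. Small u_nk down-weights outliers, which is what makes this SMM
# variant more robust than the GMM inference above.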
def expected_values(self):
    _, m, C, v = niw.natural_to_standard(self.A, self.b, self.beta, self.v_hat)
    exp_log_pi = dirichlet.expected_log_pi(
        dirichlet.natural_to_standard(self.alpha))

    with tf.name_scope('niw_expectation'):
        exp_m = tf.identity(m, 'expected_mean')
        C_inv = tf.matrix_inverse(C)
        C_inv_sym = tf.divide(tf.add(C_inv, tf.matrix_transpose(C_inv)), 2.,
                              name='C_inv_symmetrised')
        exp_C = tf.matrix_inverse(
            tf.multiply(C_inv_sym, tf.expand_dims(tf.expand_dims(v, 1), 2),
                        name='expected_precision'),
            name='expected_covariance')
    return exp_log_pi, exp_m, exp_C
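# Note on exp_C: with C the inverse-Wishart scale and v the degrees of freedom,
# the expected precision is E[Lambda] = v * C^{-1} (the tensor named
# 'expected_precision' above), so exp_C = (E[Lambda])^{-1} = C / v. This is the
# inverse of the expected precision rather than the expected covariance
# E[Sigma] = C / (v - D - 1); the two agree only for large v.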
def m_step(gmm_prior, x_samples, r_nk):
    """
    Args:
        gmm_prior: Dirichlet+NiW prior for the Gaussian mixture model
        x_samples: samples of shape (N, S, L)
        r_nk: responsibilities of shape (N, K)

    Returns:
        Dirichlet+NiW parameters obtained by executing Bishop's M-step of the
        VEM algorithm for GMMs
    """
    with tf.name_scope('m_step'):
        # execute GMM-EM M-step
        beta_0, m_0, C_0, v_0 = niw.natural_to_standard(*gmm_prior[1:])
        alpha_0 = dirichlet.natural_to_standard(gmm_prior[0])

        alpha_k, beta_k, m_k, C_k, v_k, x_k, S_k = gmm.m_step(x_samples, r_nk,
                                                              alpha_0, beta_0,
                                                              m_0, C_0, v_0,
                                                              name='gmm_m_step')

        A, b, beta, v_hat = niw.standard_to_natural(beta_k, m_k, C_k, v_k)
        alpha = dirichlet.standard_to_natural(alpha_k)

    return tf.tuple([alpha, A, b, beta, v_hat], name='theta_star')
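# Design note: gmm.m_step operates on the standard parameterisation
# (alpha, beta, m, C, v), while the SVAE carries the mixture prior and
# theta_star in natural-parameter form; hence the natural_to_standard /
# standard_to_natural round trip around the call above.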
loli_tr = diagonal_gaussian_logprob(y_tr_coll, y_mean_rec, out2_rec,
                                    log_z_given_y_phi)
tf.summary.scalar('mse_tr', mse_tr)
tf.summary.scalar('loli_tr', loli_tr)
if lbl_tr is not None:
    entr_tr, prty_tr = purity(tf.exp(log_z_given_y_phi), lbl_tr)
    tf.summary.scalar('entropy_tr', entr_tr)
    tf.summary.scalar('purity_tr', prty_tr)

# useful values for tensorboard and plotting
with tf.name_scope('plotting_prep'):
    if 'smm' in config['method']:
        mu, sigma = svae.unpack_smm(theta[1:3])
    else:
        beta_k, m_k, C_k, v_k = niw.natural_to_standard(theta[1], theta[2],
                                                        theta[3], theta[4])
        mu, sigma = niw.expected_values((beta_k, m_k, C_k, v_k))
    alpha_k = dirichlet.natural_to_standard(theta[0])
    expected_log_pi = dirichlet.expected_log_pi(alpha_k)
    pi_theta = tf.exp(expected_log_pi)
    theta_plot = mu, sigma, pi_theta
    q_z_given_y_phi = tf.exp(log_z_given_y_phi)

neg_normed_elbo = -tf.divide(tf.reduce_sum(tower_elbo), size_minibatch)
tf.summary.scalar('elbo/elbo_normed', neg_normed_elbo)
tf.summary.scalar('elbo/neg_rec_err',
                  tf.divide(tf.reduce_sum(tower_neg_rec_err), size_minibatch))
tf.summary.scalar(
def compute_elbo(y, reconstructions, theta, phi_tilde, x_k_samps,
                 log_z_given_y_phi, decoder_type):
    # ELBO for latent GMM
    with tf.name_scope('elbo'):
        # unpack phi_gmm and compute expected theta
        with tf.name_scope('expct_theta_to_nat'):
            beta_k, m_k, C_k, v_k = niw.natural_to_standard(*theta[1:])
            mu, sigma = niw.expected_values((beta_k, m_k, C_k, v_k))
            eta1_theta, eta2_theta = gaussian.standard_to_natural(mu, sigma)

            alpha_k = dirichlet.natural_to_standard(theta[0])
            expected_log_pi_theta = dirichlet.expected_log_pi(alpha_k)

        # do not backpropagate through the GMM parameters
        with tf.name_scope('block_backprop'):
            eta1_theta = tf.stop_gradient(eta1_theta)
            eta2_theta = tf.stop_gradient(eta2_theta)
            expected_log_pi_theta = tf.stop_gradient(expected_log_pi_theta)

        r_nk = tf.exp(log_z_given_y_phi)

        # compute negative reconstruction error; sum over minibatch (use VAE function)
        means, out_2 = reconstructions  # out_2 is either Gaussian variances or Bernoulli logits
        if decoder_type == 'standard':
            neg_reconstruction_error = vae.expected_diagonal_gaussian_loglike(
                y, means, out_2, weights=r_nk)
        elif decoder_type == 'bernoulli':
            neg_reconstruction_error = vae.expected_bernoulli_loglike(y, out_2,
                                                                      r_nk=r_nk)
        else:
            raise NotImplementedError

        # compute E[log q_phi(x, z=k | y)]
        eta1_phi_tilde, eta2_phi_tilde = phi_tilde
        N, K, L, _ = eta2_phi_tilde.get_shape().as_list()
        eta1_phi_tilde = tf.reshape(eta1_phi_tilde, (N, K, L))
        N, K, S, L = x_k_samps.get_shape().as_list()

        with tf.name_scope('compute_regularizer'):
            with tf.name_scope('log_numerator'):
                log_N_x_given_phi = gaussian.log_probability_nat_per_samp(
                    x_k_samps, eta1_phi_tilde, eta2_phi_tilde)
                log_numerator = log_N_x_given_phi + tf.expand_dims(log_z_given_y_phi,
                                                                   axis=2)

            with tf.name_scope('log_denominator'):
                log_N_x_given_theta = gaussian.log_probability_nat_per_samp(
                    x_k_samps,
                    tf.tile(tf.expand_dims(eta1_theta, axis=0), [N, 1, 1]),
                    tf.tile(tf.expand_dims(eta2_theta, axis=0), [N, 1, 1, 1]))
                log_denominator = log_N_x_given_theta + tf.expand_dims(
                    tf.expand_dims(expected_log_pi_theta, axis=0), axis=2)

        regularizer_term = tf.reduce_mean(
            tf.reduce_sum(
                tf.reduce_sum(
                    tf.multiply(tf.expand_dims(r_nk, axis=2),
                                log_numerator - log_denominator),
                    axis=1),  # weighted average over components
                axis=0)       # sum over minibatch
        )                     # mean over samples

        elbo = tf.subtract(neg_reconstruction_error, regularizer_term, name='elbo')

        with tf.name_scope('elbo_summaries'):
            details = tf.tuple(
                (neg_reconstruction_error,
                 tf.reduce_sum(tf.multiply(r_nk, tf.reduce_mean(log_numerator, -1))),
                 tf.reduce_sum(tf.multiply(r_nk, tf.reduce_mean(log_denominator, -1))),
                 regularizer_term),
                name='debug')

        return elbo, details
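# For reference, the objective assembled above is (up to the Monte-Carlo
# average over the S samples in x_k_samps)
#
#   ELBO = E_q[log p(y | x)]
#          - sum_n sum_k r_nk * E_q(x|z=k,y)[ log q(x | z=k, y) + log q(z=k | y)
#                                             - log N(x | theta_k) - E[log pi_k] ],
#
# i.e. the expected reconstruction log-likelihood minus a KL-style regulariser
# between the recognition distribution q(x, z | y) and the (gradient-blocked)
# expected mixture model under theta.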
def visualize_svae(ax, config, log_path, ratio_tr=0.7, nb_samples=20,
                   grid_density=100, window=((-20, 20), (-20, 20)),
                   param_device='/cpu:0'):
    with tf.device(param_device):
        if config['dataset'] in ['mnist', 'fashion']:
            binarise = True
            size_minibatch = 1024
            output_type = 'bernoulli'
        else:
            binarise = False
            size_minibatch = -1
            output_type = 'standard'

        # First we build the model graph so that we can load the learned
        # parameters from a checkpoint. Initialisations don't matter; they'll
        # be overwritten with saver.restore().
        data, lbl, _, _ = make_minibatch(config['dataset'],
                                         path_datadir='../datasets',
                                         ratio_tr=ratio_tr,
                                         seed_split=0,
                                         size_minibatch=size_minibatch,
                                         size_testbatch=-1,
                                         binarise=binarise)

        # define nn-architecture
        encoder_layers = [(config['U'], tf.tanh), (config['U'], tf.tanh),
                          (config['L'], 'natparam')]
        decoder_layers = [(config['U'], tf.tanh), (config['U'], tf.tanh),
                          (int(data.get_shape()[1]), output_type)]

        sample_size = 100

        if config['dataset'] in ['mnist', 'fashion']:
            data = tf.where(tf.equal(data, -1),
                            tf.zeros_like(data, dtype=tf.float32),
                            tf.ones_like(data, dtype=tf.float32))

        with tf.name_scope('model'):
            gmm_prior, theta = svae.init_mm(config['K'], config['L'],
                                            seed=config['seed'],
                                            param_device='/gpu:0')

            theta_copied = niw.natural_to_standard(tf.identity(gmm_prior[1]),
                                                   tf.identity(gmm_prior[2]),
                                                   tf.identity(gmm_prior[3]),
                                                   tf.identity(gmm_prior[4]))
            _, sigma_k = niw.expected_values(theta_copied)

            pi_k_init = tf.nn.softmax(
                tf.random_normal(shape=(config['K'],), mean=0.0, stddev=1.,
                                 seed=config['seed']))
            L_k = tf.cholesky(sigma_k)
            mu_k = tf.random_normal(shape=(config['K'], config['L']), stddev=1,
                                    seed=config['seed'])

            with tf.variable_scope('phi_gmm'):
                mu_k = variable_on_device('mu_k', shape=None, initializer=mu_k,
                                          trainable=True, device=param_device)
                L_k = variable_on_device('L_k', shape=None, initializer=L_k,
                                         trainable=True, device=param_device)
                pi_k = variable_on_device('log_pi_k', shape=None,
                                          initializer=pi_k_init, trainable=True,
                                          device=param_device)
            phi_gmm = mu_k, L_k, pi_k

            _ = vae.make_encoder(data, layerspecs=encoder_layers,
                                 stddev_init=.1, seed=config['seed'])

            with tf.name_scope('random_sampling'):
                # compute expected theta_pgm
                beta_k, m_k, C_k, v_k = niw.natural_to_standard(*theta[1:])
                alpha_k = dirichlet.natural_to_standard(theta[0])
                mean, cov = niw.expected_values((beta_k, m_k, C_k, v_k))
                expected_log_pi = dirichlet.expected_log_pi(alpha_k)
                pi = tf.exp(expected_log_pi)

                # sample from the prior: x from each component's Gaussian,
                # z from the mixture weights
                x_k_samples = tf.contrib.distributions.MultivariateNormalFullCovariance(
                    loc=mean, covariance_matrix=cov).sample(sample_size)
                z_samples = tf.multinomial(logits=tf.reshape(tf.log(pi), (1, -1)),
                                           num_samples=sample_size,
                                           name='k_samples')
                z_samples = tf.squeeze(z_samples)
                assert z_samples.get_shape() == (sample_size,)
                assert x_k_samples.get_shape() == (sample_size, config['K'],
                                                   config['L'])

                # compute reconstructions
                y_k_samples, _ = vae.make_decoder(x_k_samples,
                                                  layerspecs=decoder_layers,
                                                  stddev_init=.1,
                                                  seed=config['seed'])
                assert y_k_samples.get_shape() == (sample_size, config['K'],
                                                   data.get_shape()[1])

            with tf.name_scope('cluster_sample_data'):
                tf.get_variable_scope().reuse_variables()
                _, clustering = svae.predict(data, phi_gmm, encoder_layers,
                                             decoder_layers, seed=config['seed'])

        # load trained model
        saver = tf.train.Saver()
        model_path = log_path + '/' + generate_log_id(config)
        print(model_path)
        latest_ckpnt = tf.train.latest_checkpoint(model_path)
        sess_config = tf.ConfigProto(allow_soft_placement=True)
        sess = tf.Session(config=sess_config)
        saver.restore(sess, latest_ckpnt)

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        collected_y_samps = []
        collected_z_samps = []
        for s in range(nb_samples):
            y_samps, z_samps = sess.run((y_k_samples, z_samples))
            collected_y_samps.append(y_samps)
            collected_z_samps.append(z_samps)
        collected_y_samps = np.concatenate(collected_y_samps, axis=0)
        collected_z_samps = np.concatenate(collected_z_samps, axis=0)
        assert collected_y_samps.shape == (nb_samples * sample_size, config['K'],
                                           data.shape[1])
        assert collected_z_samps.shape == (nb_samples * sample_size,)

        # use 300 sample points from the dataset
        data, lbl, clustering = sess.run((data[:300], lbl[:300], clustering[:300]))

        # compute PCA if necessary
        samples_2d = []
        if data.shape[1] > 2:
            pca = PCA(n_components=2).fit(data)
            data2d = pca.transform(data)
            for k in range(config['K']):
                chosen = (collected_z_samps == k)
                samps_k = collected_y_samps[chosen, k, :]
                if samps_k.size > 0:
                    samples_2d.append(pca.transform(samps_k))
        else:
            data2d = data
            for k in range(config['K']):
                chosen = (collected_z_samps == k)
                samps_k = collected_y_samps[chosen, k, :]
                if samps_k.size > 0:
                    samples_2d.append(samps_k)

        # plot 2d-histogram (one histogram for each of the K components)
        from matplotlib.colors import LogNorm
        for k, samples in enumerate(samples_2d):
            ax.hist2d(samples[:, 0], samples[:, 1],
                      bins=grid_density, range=window,
                      cmap=make_colormap(dark_colors[k % len(dark_colors)]),
                      normed=True, norm=LogNorm())

        # overlay histogram with sample datapoints
        # (coloured according to their most likely cluster allocation)
        labels = np.argmax(lbl, axis=1)
        for c in np.unique(labels):
            in_class_c = (labels == c)
            color = bright_colors[int(c % len(bright_colors))]
            marker = markers[int(c % len(markers))]
            ax.scatter(data2d[in_class_c, 0], data2d[in_class_c, 1],
                       c=color, marker=marker, s=data_dot_size, linewidths=0)