def init_mm_params(nb_components, latent_dims, alpha_scale=.1, beta_scale=1e-5, v_init=10.,
                   m_scale=1., C_scale=10., seed=0, as_variables=True, trainable=False,
                   device='/gpu:0', name='gmm'):
    # Initialise the Dirichlet/NIW prior of the mixture model in standard form and
    # convert it to natural parameters (optionally wrapped in tf.Variables).
    with tf.name_scope('gmm_initialization'):
        alpha_init = alpha_scale * tf.ones((nb_components,))
        beta_init = beta_scale * tf.ones((nb_components,))
        v_init = tf.tile([float(latent_dims + v_init)], [nb_components])
        means_init = m_scale * tf.random_uniform((nb_components, latent_dims),
                                                 minval=-1, maxval=1, seed=seed)
        covariance_init = C_scale * tf.tile(tf.expand_dims(tf.eye(latent_dims), axis=0),
                                            [nb_components, 1, 1])

        # transform to natural parameters
        A, b, beta, v_hat = niw.standard_to_natural(beta_init, means_init, covariance_init, v_init)
        alpha = dirichlet.standard_to_natural(alpha_init)

        # init variables
        if as_variables:
            with tf.variable_scope(name):
                alpha = variable_on_device('alpha_k', shape=None, initializer=alpha,
                                           trainable=trainable, device=device)
                A = variable_on_device('beta_k', shape=None, initializer=A,
                                       trainable=trainable, device=device)
                b = variable_on_device('m_k', shape=None, initializer=b,
                                       trainable=trainable, device=device)
                beta = variable_on_device('C_k', shape=None, initializer=beta,
                                          trainable=trainable, device=device)
                v_hat = variable_on_device('v_k', shape=None, initializer=v_hat,
                                           trainable=trainable, device=device)

        params = alpha, A, b, beta, v_hat
        return params
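
# Illustrative usage sketch (not part of the original code): how the prior initialiser
# above might be called. The component count, latent dimensionality and device are
# assumptions made for the example only.
def _example_init_mm_params():
    # 10-component mixture over a 2-D latent space, kept fixed (not trained directly)
    theta = init_mm_params(nb_components=10, latent_dims=2, as_variables=True,
                           trainable=False, device='/cpu:0')
    alpha, A, b, beta, v_hat = theta  # Dirichlet and NIW natural parameters
    return alpha, A, b, beta, v_hat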
def make_nnet(input, layerspecs, stddev, name, param_device='/gpu:0', seed=0):
    # Build an MLP from `layerspecs` plus a ResNet-like linear shortcut from the input
    # to the output layer (as in Johnson's SVAE code).
    with tf.variable_scope(name):
        # ravel inputs: (M, K, D) -> (M*K, D)
        input_shape = input.get_shape()
        input_dim = int(input_shape[-1])
        input = tf.reshape(input, (-1, input_dim))
        prev_layer = input

        # create all layers except the output layer
        for i, (hidden_units, activation) in enumerate(layerspecs[:-1]):
            prev_layer = make_layer(prev_layer, hidden_units, stddev, activation,
                                    'layer_%d' % i, param_device, seed)

        # create output layer
        output_dim, type = layerspecs[-1]
        if type == 'bernoulli':
            out_mlp = make_bernoulli_layer(prev_layer, output_dim, stddev,
                                           param_device=param_device, seed=seed)
        else:
            out_mlp = make_gaussian_layer(prev_layer, output_dim, stddev, type,
                                          param_device=param_device, seed=seed)

        # create resnet-like shortcut (as in Johnson's SVAE code)
        with tf.variable_scope('shortcut'):
            orthonormal_cols = tf.constant(rand_partial_isometry(input_dim, output_dim, 1., seed=seed),
                                           dtype=tf.float32)
            W = variable_on_device('W', shape=None, initializer=orthonormal_cols,
                                   trainable=True, device=param_device, dtype=tf.float32)
            b1 = variable_on_device('b1', shape=None, initializer=tf.zeros(output_dim),
                                    trainable=True, device=param_device, dtype=tf.float32)
            out_res = tf.add(tf.matmul(input, W), b1, name='res_shortcut_1')

            # create shortcut for second output (in Gaussian case)
            if type != 'bernoulli':
                b2 = variable_on_device('b2', shape=None, initializer=tf.zeros(output_dim),
                                        trainable=True, device=param_device, dtype=tf.float32)
                if type == 'standard':
                    a = tf.constant(1., dtype=tf.float32)
                elif type == 'natparam':
                    a = tf.constant(-0.5, dtype=tf.float32)
                else:
                    raise NotImplementedError
                out_res = (out_res, tf.multiply(a, tf.log1p(tf.exp(b2)), name='res_shortcut_2'))

        with tf.variable_scope('resnet_out'):
            # unravel output: (M*K, D) -> (M, K, D)
            output_shape = input_shape[:-1].concatenate(output_dim)
            if type == 'bernoulli':
                outputs = tf.reshape(tf.add(out_mlp, out_res), output_shape, name='output')
            else:
                outputs = (
                    tf.reshape(tf.add(out_mlp[0], out_res[0]), output_shape, name='output_0'),
                    tf.reshape(tf.add(out_mlp[1], out_res[1]), output_shape, name='output_1')
                )

    return outputs
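
# Illustrative usage sketch (not part of the original code): an encoder MLP with two tanh
# hidden layers and a natural-parameter Gaussian output, matching the layerspec format
# consumed above. The input placeholder shape and unit counts are assumptions.
def _example_make_nnet():
    x = tf.placeholder(tf.float32, shape=(64, 784))  # fully specified shape so the output reshape is static
    encoder_layers = [(512, tf.tanh), (512, tf.tanh), (10, 'natparam')]
    return make_nnet(x, encoder_layers, stddev=0.1, name='example_encoder', param_device='/cpu:0')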
def make_loc_scale_variables(theta, param_device='/gpu:0', name='copy_m_v'):
    # create location/scale variables for point estimations
    with tf.name_scope(name):
        theta_copied = niw.natural_to_standard(tf.identity(theta[1]), tf.identity(theta[2]),
                                               tf.identity(theta[3]), tf.identity(theta[4]))
        mu_k_init, sigma_k = niw.expected_values(theta_copied)
        L_k_init = tf.cholesky(sigma_k)

        mu_k = variable_on_device('mu_k', shape=None, initializer=mu_k_init,
                                  trainable=True, device=param_device)
        L_k = variable_on_device('L_k', shape=None, initializer=L_k_init,
                                 trainable=True, device=param_device)
    return mu_k, L_k
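
# Illustrative usage sketch (not part of the original code): deriving trainable point
# estimates (means and Cholesky factors) from a freshly initialised prior. The prior call
# and the sizes mirror init_mm_params above and are assumptions for the example only.
def _example_make_loc_scale_variables():
    theta = init_mm_params(nb_components=10, latent_dims=2, as_variables=False)
    with tf.variable_scope('example_theta'):
        mu_k, L_k = make_loc_scale_variables(theta, param_device='/cpu:0')
    return mu_k, L_k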
def init_recognition_params(theta, nb_components, seed=0, param_device='/gpu:0', var_scope='phi_gmm'):
    # make parameters for PGM part of recognition network
    with tf.name_scope('init_' + var_scope):
        pi_k_init = tf.nn.softmax(tf.random_normal(shape=(nb_components,), mean=0.0, stddev=1.,
                                                   seed=seed))
        with tf.variable_scope(var_scope):
            mu_k, L_k = make_loc_scale_variables(theta, param_device)
            pi_k = variable_on_device('log_pi_k', shape=None, initializer=pi_k_init,
                                      trainable=True, device=param_device)
    return mu_k, L_k, pi_k
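
# Illustrative usage sketch (not part of the original code): initialising the recognition
# GMM parameters phi_gmm from a (constant) prior, mirroring how the training script below
# wires init_mm_params into init_recognition_params. Sizes are assumptions.
def _example_init_recognition_params():
    theta = init_mm_params(nb_components=10, latent_dims=2, as_variables=False)
    mu_k, L_k, pi_k = init_recognition_params(theta, nb_components=10, seed=0,
                                              param_device='/cpu:0')
    return mu_k, L_k, pi_k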
# init helper values for SMM (theta is a constant, we just need it to init the rec GMM below)
gmm_prior, theta = svae.init_mm(config['K'], config['L'], seed=config['seed'],
                                param_device=param_device, theta_as_variable=False)

# create tensor for Student-t parameters
with tf.variable_scope('theta'):
    mu_k, L_k = svae.make_loc_scale_variables(gmm_prior, param_device=param_device)
    DoF = config['DoF'] * tf.ones((config['K'],), dtype=tf.float32)
    DoF = variable_on_device('DoF_k', shape=None, initializer=DoF, trainable=False,
                             device=param_device)
    alpha_k = variable_on_device('alpha_k', shape=None, initializer=theta[0], trainable=False,
                                 device=param_device)

# init inference GMM parameters
phi_gmm = svae.init_recognition_params(theta, config['K'], seed=config['seed'],
                                       param_device=param_device)
def visualize_svae(ax, config, log_path, ratio_tr=0.7, nb_samples=20, grid_density=100,
                   window=((-20, 20), (-20, 20)), param_device='/cpu:0'):
    with tf.device(param_device):
        if config['dataset'] in ['mnist', 'fashion']:
            binarise = True
            size_minibatch = 1024
            output_type = 'bernoulli'
        else:
            binarise = False
            size_minibatch = -1
            output_type = 'standard'

        # First we build the model graph so that we can load the learned parameters from a
        # checkpoint. Initialisations don't matter, they'll be overwritten with saver.restore().
        data, lbl, _, _ = make_minibatch(config['dataset'], path_datadir='../datasets',
                                         ratio_tr=ratio_tr, seed_split=0,
                                         size_minibatch=size_minibatch, size_testbatch=-1,
                                         binarise=binarise)

        # define nn-architecture
        encoder_layers = [(config['U'], tf.tanh), (config['U'], tf.tanh), (config['L'], 'natparam')]
        decoder_layers = [(config['U'], tf.tanh), (config['U'], tf.tanh),
                          (int(data.get_shape()[1]), output_type)]

        sample_size = 100

        if config['dataset'] in ['mnist', 'fashion']:
            data = tf.where(tf.equal(data, -1), tf.zeros_like(data, dtype=tf.float32),
                            tf.ones_like(data, dtype=tf.float32))

        with tf.name_scope('model'):
            gmm_prior, theta = svae.init_mm(config['K'], config['L'], seed=config['seed'],
                                            param_device='/gpu:0')

            theta_copied = niw.natural_to_standard(tf.identity(gmm_prior[1]), tf.identity(gmm_prior[2]),
                                                   tf.identity(gmm_prior[3]), tf.identity(gmm_prior[4]))
            _, sigma_k = niw.expected_values(theta_copied)
            pi_k_init = tf.nn.softmax(tf.random_normal(shape=(config['K'],), mean=0.0, stddev=1.,
                                                       seed=config['seed']))
            L_k = tf.cholesky(sigma_k)
            mu_k = tf.random_normal(shape=(config['K'], config['L']), stddev=1, seed=config['seed'])

            with tf.variable_scope('phi_gmm'):
                mu_k = variable_on_device('mu_k', shape=None, initializer=mu_k, trainable=True,
                                          device=param_device)
                L_k = variable_on_device('L_k', shape=None, initializer=L_k, trainable=True,
                                         device=param_device)
                pi_k = variable_on_device('log_pi_k', shape=None, initializer=pi_k_init,
                                          trainable=True, device=param_device)
            phi_gmm = mu_k, L_k, pi_k

            _ = vae.make_encoder(data, layerspecs=encoder_layers, stddev_init=.1, seed=config['seed'])

        with tf.name_scope('random_sampling'):
            # compute expected theta_pgm
            beta_k, m_k, C_k, v_k = niw.natural_to_standard(*theta[1:])
            alpha_k = dirichlet.natural_to_standard(theta[0])
            mean, cov = niw.expected_values((beta_k, m_k, C_k, v_k))
            expected_log_pi = dirichlet.expected_log_pi(alpha_k)
            pi = tf.exp(expected_log_pi)

            # sample from the prior: per-component latent samples x, then cluster assignments z
            x_k_samples = tf.contrib.distributions.MultivariateNormalFullCovariance(
                loc=mean, covariance_matrix=cov).sample(sample_size)
            z_samples = tf.multinomial(logits=tf.reshape(tf.log(pi), (1, -1)),
                                       num_samples=sample_size, name='k_samples')
            z_samples = tf.squeeze(z_samples)
            assert z_samples.get_shape() == (sample_size,)
            assert x_k_samples.get_shape() == (sample_size, config['K'], config['L'])

            # compute reconstructions
            y_k_samples, _ = vae.make_decoder(x_k_samples, layerspecs=decoder_layers,
                                              stddev_init=.1, seed=config['seed'])
            assert y_k_samples.get_shape() == (sample_size, config['K'], data.get_shape()[1])

        with tf.name_scope('cluster_sample_data'):
            tf.get_variable_scope().reuse_variables()
            _, clustering = svae.predict(data, phi_gmm, encoder_layers, decoder_layers,
                                         seed=config['seed'])

    # load trained model
    saver = tf.train.Saver()
    model_path = log_path + '/' + generate_log_id(config)
    print(model_path)
    latest_ckpnt = tf.train.latest_checkpoint(model_path)
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess = tf.Session(config=sess_config)
    saver.restore(sess, latest_ckpnt)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    # draw nb_samples batches of sample_size prior samples and their decodings
    collected_y_samps = []
    collected_z_samps = []
    for s in range(nb_samples):
        y_samps, z_samps = sess.run((y_k_samples, z_samples))
        collected_y_samps.append(y_samps)
        collected_z_samps.append(z_samps)
    collected_y_samps = np.concatenate(collected_y_samps, axis=0)
    collected_z_samps = np.concatenate(collected_z_samps, axis=0)
    assert collected_y_samps.shape == (nb_samples * sample_size, config['K'], data.shape[1])
    assert collected_z_samps.shape == (nb_samples * sample_size,)

    # use 300 sample points from the dataset
    data, lbl, clustering = sess.run((data[:300], lbl[:300], clustering[:300]))

    # compute PCA if necessary
    samples_2d = []
    if data.shape[1] > 2:
        pca = PCA(n_components=2).fit(data)
        data2d = pca.transform(data)
        for k in range(config['K']):
            chosen = (collected_z_samps == k)
            samps_k = collected_y_samps[chosen, k, :]
            if samps_k.size > 0:
                samples_2d.append(pca.transform(samps_k))
    else:
        data2d = data
        for k in range(config['K']):
            chosen = (collected_z_samps == k)
            samps_k = collected_y_samps[chosen, k, :]
            if samps_k.size > 0:
                samples_2d.append(samps_k)

    # plot 2d-histogram (one histogram for each of the K components)
    from matplotlib.colors import LogNorm
    for k, samples in enumerate(samples_2d):
        ax.hist2d(samples[:, 0], samples[:, 1], bins=grid_density, range=window,
                  cmap=make_colormap(dark_colors[k % len(dark_colors)]),
                  normed=True, norm=LogNorm())

    # overlay histogram with sample datapoints (coloured according to their most likely
    # cluster allocation)
    labels = np.argmax(lbl, axis=1)
    for c in np.unique(labels):
        in_class_c = (labels == c)
        color = bright_colors[int(c % len(bright_colors))]
        marker = markers[int(c % len(markers))]
        ax.scatter(data2d[in_class_c, 0], data2d[in_class_c, 1], c=color, marker=marker,
                   s=data_dot_size, linewidths=0)
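
# Illustrative driver sketch (not part of the original code): calling visualize_svae for a
# trained model. The config keys mirror the ones read above; the concrete values, the log
# directory and the output filename are assumptions for the example only.
def _example_run_visualization():
    import matplotlib.pyplot as plt
    config = {'dataset': 'mnist', 'K': 10, 'L': 8, 'U': 512, 'seed': 0}
    fig, ax = plt.subplots(figsize=(6, 6))
    visualize_svae(ax, config, log_path='logs', nb_samples=20, grid_density=100)
    fig.savefig('svae_prior_samples.png', dpi=150)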