def log_prob(self, zs, xs, fs, lens):
    """Score latents, observations and failure indicators under the model.

    Args:
      zs: Latent states, indexed as [batch, time, state_size].
      xs: Observations, scored under a Normal centred at each z.
      fs: Failure indicators, scored under a Bernoulli whose logits are a
        linear function of z scaled by `self.bern_temp`.
      lens: Sequence lengths used to build the time mask.

    Returns:
      Log probabilities summed over the state dimension, with timesteps at or
      beyond `lens` multiplied by zero.
    """
    # Transition means are the previous z plus the drift. The first timestep
    # has no predecessor, so its mean comes from zero-padding the time axis.
    shifted = zs[:, :-1, :] + self.drift[tf.newaxis, tf.newaxis, :]
    transition_means = tf.pad(
        shifted, [[0, 0], [1, 0], [0, 0]], mode="CONSTANT")
    transition = tfd.Normal(loc=transition_means, scale=self.z_scale)
    log_p_z = transition.log_prob(zs)

    # Observations are Normals centred at the latent states.
    emission = tfd.Normal(loc=zs, scale=self.x_scale)
    log_p_x_given_z = emission.log_prob(xs)

    # Failure logits: project each z through W_f, add the bias, then drop the
    # trailing singleton axis so the logits are [batch, time].
    projected = tf.einsum("ijk,kl->ijl", zs, self.W_f)
    logits = projected + self.b_f[tf.newaxis, tf.newaxis, :]
    flat_shape = [tf.shape(logits)[0], tf.shape(logits)[1]]
    logits = tf.reshape(logits, flat_shape)
    logits = logits * self.bern_temp
    log_p_f_given_z = tfd.Bernoulli(logits=logits).log_prob(fs)

    # NOTE(review): the failure term is broadcast along the state axis before
    # the sum, so it is counted once per state dimension -- confirm intended.
    combined = log_p_z + log_p_x_given_z + log_p_f_given_z[:, :, tf.newaxis]
    log_p = tf.reduce_sum(combined, axis=-1)

    # Zero out timesteps past the end of each sequence.
    time_mask = tf.sequence_mask(lens, dtype=log_p.dtype)
    return log_p * time_mask
def log_prob(self, zs, xs, T, z_lens, x_lens):
    """Computes the log probability of a set of samples.

    Args:
      zs: A set of [batch_size, max_z_num_timesteps, state_dim] latent states.
      xs: A set of [batch_size, max_x_num_timesteps, state_dim] observations.
      T: A set of [batch_size] integers denoting the number of censored steps.
      z_lens: A set of [batch_size] integers denoting the length of each
        sequence of zs.
      x_lens: A set of [batch_size] integers denoting the length of each
        sequence of observations. Note that T must equal z_lens - x_lens.

    Returns:
      log_p_z: A [batch_size, max_z_num_timesteps] set of logprobs of zs.
      log_p_x_given_z: A [batch_size, max_x_num_timesteps] set of logprobs of
        xs.
      log_p_T: A [batch_size] set of logprobs of T.
    """
    batch_size = tf.shape(zs)[0]

    # Score the latents in reversed time order: each reversed z is centred at
    # its reversed predecessor plus drift, and the first one at z0_mu.
    rev_zs = tf.reverse_sequence(zs, z_lens, seq_axis=1, batch_axis=0)
    drifted = rev_zs[:, :-1, :] + self.drift[tf.newaxis, tf.newaxis, :]
    initial_mean = tf.tile(
        self.z0_mu[tf.newaxis, tf.newaxis, :], [batch_size, 1, 1])
    rev_z_locs = tf.concat([initial_mean, drifted], axis=1)

    rev_log_p_z = tfd.Normal(
        loc=rev_z_locs, scale=self.z_scale).log_prob(rev_zs)
    z_mask = tf.sequence_mask(z_lens, dtype=rev_log_p_z.dtype)
    rev_log_p_z = rev_log_p_z * z_mask[:, :, tf.newaxis]

    # Undo the reversal, then collapse the state dimension.
    log_p_z = tf.reverse_sequence(
        rev_log_p_z, z_lens, seq_axis=1, batch_axis=0)
    log_p_z = tf.reduce_sum(log_p_z, axis=-1)

    # Only the first x_len latents of each sequence emit an observation: zero
    # the rest, then crop the time axis to the longest observation sequence.
    z_side_mask = tf.sequence_mask(
        x_lens, maxlen=tf.reduce_max(z_lens), dtype=zs.dtype)
    masked_zs = zs * z_side_mask[:, :, tf.newaxis]
    masked_zs = masked_zs[:, :tf.reduce_max(x_lens), :]

    log_p_x_given_z = tfd.Normal(
        loc=masked_zs, scale=self.x_scale).log_prob(xs)
    x_mask = tf.sequence_mask(x_lens, dtype=log_p_x_given_z.dtype)
    log_p_x_given_z = log_p_x_given_z * x_mask[:, :, tf.newaxis]
    log_p_x_given_z = tf.reduce_sum(log_p_x_given_z, axis=-1)

    # The number of censored steps follows a categorical over T_logits.
    log_p_T = tfd.Categorical(logits=self.T_logits).log_prob(T)
    return log_p_z, log_p_x_given_z, log_p_T
def kl_reg(mean, variance, weight: float = 5e-4):
    """Return a weighted KL regularization term for a mean/variance pair.

    Args:
        mean: the mean output from the first layer
        variance: the variance output from the first layer
        weight: multiplier applied to the divergence

    Returns:
        `weight` times the KL divergence from a standard Normal (first
        argument) to the Normal defined by `mean` and `sqrt(variance)`
        (second argument)
    """
    standard_normal = distributions.Normal(
        K.zeros_like(mean), K.ones_like(mean))
    # Normal is parameterized by standard deviation, not variance.
    stddev = K.sqrt(variance)
    learned = distributions.Normal(mean, stddev)
    divergence = distributions.kl_divergence(standard_normal, learned)
    return weight * divergence
def __init__(self, num_classes, link_function, p, use_tau, **kwargs):
    """Store the layer configuration and initialise the base class.

    Args:
        num_classes: number of classes handled by the layer.
        link_function: name of the link function to use.
        p: link-function parameters; a copy obtained via `p.copy()` is kept.
        use_tau: flag stored for later use by the layer.
        **kwargs: forwarded unchanged to the parent constructor.
    """
    # Keep our own copy rather than the caller's object.
    self.p = p.copy()
    self.use_tau = use_tau
    self.link_function = link_function
    self.num_classes = num_classes
    # Standard Normal kept on the instance as `self.dist`.
    self.dist = distributions.Normal(loc=0., scale=1.)
    super(CLM, self).__init__(**kwargs)
def call(self, inputs, **kwargs):
    """
    Forward pass through the layer.

    Args:
        inputs: the input tensors to pass through the layer

    Returns:
        `[mean, variance]` for every layer except the last; for the last
        layer, `last_activation` applied to a sample drawn from
        Normal(mean, sqrt(variance))
    """
    # The first layer turns vectors into distributions; later layers
    # transform incoming distributions.
    transform = self._call_first if self.is_first else self._call_generic
    mean, variance = transform(inputs)

    mean = self.mean_activation(mean)
    variance = self.variance_activation(variance)

    # Dropout is applied separately to the mean and to the variance.
    if self.dropout:
        mean = K.dropout(mean, self.dropout)
        variance = K.dropout(variance, self.dropout)

    if not self.is_last:
        return [mean, variance]

    # Last layer: collapse the distribution into a single sample.
    dist = distributions.Normal(mean, K.sqrt(variance))
    return self.last_activation(dist.sample())
def robust_binary_crossentropy(y_true, y_pred, logit_var):
    """
    Calculate a binary cross-entropy loss that accounts for predictive
    variance in logits space, scaled by the magic-number correction term

    :param y_true: Ground Truth
    :type y_true: Union(tf.Tensor, tf.Variable)
    :param y_pred: Prediction in logits space
    :type y_pred: Union(tf.Tensor, tf.Variable)
    :param logit_var: Predictive variance in logits space
    :type logit_var: Union(tf.Tensor, tf.Variable)
    :return: robust binary cross-entropy
    :rtype: tf.Tensor
    :History: 2018-Mar-15 - Written - Henry Leung (University of Toronto)
    """
    # Penalty term: mean of exp(logit_var) - 1.
    variance_depressor = tf.reduce_mean(
        tf.exp(logit_var) - tf.ones_like(logit_var))
    undistorted_loss = binary_crossentropy(y_true, y_pred, from_logits=True)

    # NOTE(review): `logit_var` is passed directly as the Normal's `scale`
    # (a standard deviation) -- confirm that is intended.
    dist = distributions.Normal(loc=y_pred, scale=logit_var)

    def _distorted_term(sampled_logits):
        # Negative ELU of the gap between the clean loss and the loss on
        # noise-perturbed logits.
        distorted_loss = binary_crossentropy(
            y_true, sampled_logits, from_logits=True)
        return -tf.nn.elu(undistorted_loss - distorted_loss)

    # Monte Carlo estimate over 25 draws of the logits.
    samples = dist.sample([25])
    mc_result = tf.map_fn(_distorted_term, samples, dtype=tf.float32)
    variance_loss = tf.reduce_mean(mc_result, axis=0) * undistorted_loss

    total = variance_loss + undistorted_loss + variance_depressor
    return total * magic_correction_term(y_true)
def decoder2(z, opt, reuse=False, training=False):
    """Decoder network mapping a latent code to a unit-scale Normal.

    Two dense -> batch-norm -> leaky-ReLU -> dropout stages are followed by a
    linear layer producing the distribution's location.

    Args:
        z: latent input tensor.
        opt: options object; reads `inflate_to_size1`, `inflate_to_size2`,
            `dropout_rate` and `gex_size`.
        reuse: passed to the "decoder2" variable scope.
        training: forwarded to `tf.layers.batch_normalization` and
            `tf.layers.dropout`. Both default to inference mode, in which
            dropout is an identity op and batch norm uses its moving
            statistics. The default of False keeps the previous behaviour;
            pass True (or a boolean tensor) while training. With batch norm
            in training mode the ops in `tf.GraphKeys.UPDATE_OPS` must be run
            alongside the train op to update the moving statistics.

    Returns:
        A `ds.Normal` whose loc is the decoder output and whose scale is one.
    """
    with tf.compat.v1.variable_scope("decoder2", reuse=reuse):
        hidden = z
        stages = (
            ("decoder_dense1", opt.inflate_to_size1),
            ("decoder_dense2", opt.inflate_to_size2),
        )
        for layer_name, units in stages:
            hidden = tf.layers.dense(
                inputs=hidden,
                units=units,
                activation=None,
                name=layer_name,
                kernel_initializer=tf.contrib.layers.xavier_initializer())
            hidden = tf.layers.batch_normalization(hidden, training=training)
            hidden = tf.nn.leaky_relu(hidden)
            hidden = tf.layers.dropout(
                hidden, opt.dropout_rate, training=training)

        de_loc = tf.layers.dense(
            inputs=hidden,
            units=opt.gex_size,
            activation=None,
            name="decoder_loc")
        # Fixed unit standard deviation for every output dimension.
        de_scale = tf.ones_like(de_loc)
        return ds.Normal(de_loc, de_scale)
def while_step(t, prev_z, rev_log_q_z_ta, rev_zs_ta):
    """Run one step of the proposal loop, writing results at index t.

    Args:
      t: Scalar loop counter.
      prev_z: Latent sampled at the previous step.
      rev_log_q_z_ta: TensorArray receiving the log-density of each sample.
      rev_zs_ta: TensorArray receiving each sampled latent.

    Returns:
      The updated loop variables (t + 1, new_z, log-prob array, z array).
    """
    # [batch_size] number of steps remaining before the next observation.
    steps_till_next_x = tf.maximum(T - t, 0)

    # Index of the observation to condition on, clamped to the valid range.
    next_x_ind = tf.minimum(tf.maximum(t - T, 0), x_lens - 1)
    batch_range = tf.range(0, batch_size)
    gather_inds = tf.stack([batch_range, next_x_ind], axis=-1)
    x = tf.gather_nd(rev_xs, gather_inds)

    # The mean is linear in [x, prev_z, steps remaining].
    steps_feature = tf.to_float(steps_till_next_x)[:, tf.newaxis]
    z_loc_input = tf.concat([x, prev_z, steps_feature], axis=1)
    z_loc = tf.matmul(z_loc_input, self.W_z) + self.b_z[tf.newaxis, :]

    # The scale is looked up by steps remaining and floored at sigma_min.
    log_sigmas = tf.gather(self.log_sigma, steps_till_next_x)
    z_scale = tf.math.maximum(tf.math.softplus(log_sigmas), self.sigma_min)

    q_z = tfd.Normal(loc=z_loc, scale=z_scale)
    sampled_z = q_z.sample()
    sampled_log_q = q_z.log_prob(sampled_z)

    # Sequences that have already ended contribute zeros.
    still_running = t < z_lens
    new_z = tf.where(still_running, sampled_z, tf.zeros_like(sampled_z))
    log_q_new_z = tf.where(
        still_running, sampled_log_q, tf.zeros_like(sampled_log_q))

    new_rev_log_q_z_ta = rev_log_q_z_ta.write(t, log_q_new_z)
    new_rev_zs_ta = rev_zs_ta.write(t, new_z)
    return t + 1, new_z, new_rev_log_q_z_ta, new_rev_zs_ta
def tf_standardGaussian_prior(batch_size, dim):
    """Build a standard Gaussian with parameters of shape [batch_size, dim].

    Returns a `ds.Normal` with loc zero and scale one everywhere.
    """
    param_shape = [batch_size, dim]
    return ds.Normal(tf.zeros(param_shape), tf.ones(param_shape))
def sample(self, batch_size, z0=None, max_length=50):
    """Sample latent trajectories, observations and failure indicators.

    Each step draws a new latent around the previous one plus drift, then a
    Bernoulli failure indicator from a linear function of that latent. The
    loop stops once every batch element has failed or after `max_length`
    steps, whichever comes first.

    Args:
      batch_size: Number of trajectories to sample.
      z0: Optional initial latent. Defaults to `-drift`, so the first
        transition mean is zero.
      max_length: Maximum number of steps to simulate. This was previously
        ignored in favour of a hard-coded 50; the default keeps that limit.

    Returns:
      zs: Stacked latents; entries written after a failure are zeros.
      xs: Observations sampled from Normals centred at `zs`.
      fs: Stacked int32 failure indicators; zeros after a failure.
      lens: Per-trajectory length counter maintained by the loop.

    NOTE(review): `zs` and `fs` come straight from `TensorArray.stack()`, so
    the step index is the leading axis -- confirm callers expect that layout.
    """
    zs_ta = tf.TensorArray(
        dtype=self.dtype, size=5, dynamic_size=True, name="sample_zs")
    fs_ta = tf.TensorArray(
        dtype=tf.int32, size=5, dynamic_size=True, name="sample_fs")
    t0 = tf.constant(0)
    failed = tf.zeros([batch_size], dtype=tf.bool)
    lens = tf.ones([batch_size], dtype=tf.int32)
    if z0 is None:
        z0 = (tf.zeros([batch_size, self.state_size], dtype=self.dtype)
              - self.drift[tf.newaxis, :])

    def while_predicate(t, failed, *unused_args):
        # Continue while any trajectory is alive and the step budget remains.
        return tf.math.logical_and(
            tf.math.reduce_any(tf.math.logical_not(failed)),
            t < max_length)

    def while_step(t, failed, lens, prev_z, zs_ta, fs_ta):
        # z_loc is [batch_size, state_size]
        z_loc = prev_z + self.drift[tf.newaxis, :]
        # new_zs is [batch_size, state_size]
        new_zs = tfd.Normal(loc=z_loc, scale=self.z_scale).sample()
        # multiply [batch_size, state_size] new_zs by [state_size, 1] W_f
        # then add [:, 1] b_f.
        bern_logits = tf.matmul(new_zs, self.W_f) + self.b_f[tf.newaxis, :]
        bern_logits = tf.reshape(bern_logits, [batch_size])
        bern_logits *= self.bern_temp
        # Sample a [batch_size] set of failure indicators
        new_fs = tfd.Bernoulli(logits=bern_logits).sample()
        # Record zeros for trajectories that had already failed.
        new_zs_ta = zs_ta.write(
            t, tf.where(failed, tf.zeros_like(new_zs), new_zs))
        new_fs_ta = fs_ta.write(
            t, tf.where(failed, tf.zeros_like(new_fs), new_fs))
        # Update failure indicators
        new_failed = tf.logical_or(failed, tf.equal(new_fs, 1))
        # Update lengths (add one only if the process hasn't failed)
        new_lens = lens + (1 - tf.to_int32(new_failed))
        return t + 1, new_failed, new_lens, new_zs, new_zs_ta, new_fs_ta

    _, _, lens, _, zs_ta, fs_ta = tf.while_loop(
        while_predicate,
        while_step,
        loop_vars=(t0, failed, lens, z0, zs_ta, fs_ta),
        parallel_iterations=1)
    zs = zs_ta.stack()
    fs = fs_ta.stack()
    xs = tfd.Normal(loc=zs, scale=self.x_scale).sample()
    return zs, xs, fs, lens
def _uniform_unit_norm(dimension, shape, dtype, seed):
    """Returns a batch of points chosen uniformly from the unit hypersphere.

    Draws standard-Normal vectors of shape `shape + [dimension]` and divides
    each by its L2 norm along the last axis. `seed` is a callable; it is
    invoked once to obtain the sampling seed.
    """
    # Normalizing Gaussian draws works because the Gaussian distribution is
    # spherically symmetric.
    np_dtype = dtype.as_numpy_dtype
    standard_normal = tfd.Normal(loc=np_dtype(0.), scale=np_dtype(1.))
    sample_shape = tf.concat([shape, [dimension]], axis=0)
    raw = standard_normal.sample(sample_shape, seed=seed())
    norms = tf.norm(raw, ord=2, axis=-1)
    return raw / norms[..., tf.newaxis]
def logposterior(self, params_log):
    """Evaluate the log posterior at a pair of log-scale parameters.

    :params_log: tensor indexed as `[0, 0]` for log(sigma2) and `[1, 0]` for
        log(phi)
    :returns: the zero-mean multivariate normal log-density of `self.tf_y`
        under the assembled covariance, plus Normal log-priors on both
        log-parameters

    """
    log_sigma2 = params_log[0, 0]
    log_phi = params_log[1, 0]

    zeros_n = tf.zeros([self.n, 1])

    # Covariance = prior term + exponential-covariance GP term + identity.
    Sigma_gp = tf_cov_exp(
        self.tf_dis, tf.exp(log_sigma2), tf.exp(log_phi), 0.0)
    Sigma_marginal = self.prior_c_sigma2 + Sigma_gp
    Sigma_z = Sigma_marginal + tf.eye(self.n)

    log_likelihood = dmvnorm(self.tf_y, zeros_n, Sigma_z)
    # Normal priors on the log-parameters, centred at log(1.0) and log(0.1)
    # with scale 0.4.
    log_prior_sigma2 = distr.Normal(tf.log(1.0), 0.4).log_prob(log_sigma2)
    log_prior_phi = distr.Normal(tf.log(0.1), 0.4).log_prob(log_phi)

    return log_likelihood + log_prior_sigma2 + log_prior_phi
def make_entity_bias(entity_batch):
    """Build the Normal bias posterior for a batch of entity ids.

    Column 0 of the looked-up rows is the location. Column 1, passed through
    softplus, is the standard deviation unless `options.degenerate` is set,
    in which case the scale is 0.
    """
    bias_batch = tf.nn.embedding_lookup(bias, entity_batch)
    # Alternatives noted by the original author: 1/sqrt(prior precision)
    # (more precise with more ratings, but needs clipping) and a constant 1.
    # (too imprecise).
    std_dev = 0. if options.degenerate else tf.nn.softplus(bias_batch[:, 1])
    return tfd.Normal(
        loc=bias_batch[:, 0], scale=std_dev, name='bias_posterior')
def __init__(self, mean):
    """Gaussian with negative squared error as log probability.

    Per the class's original documentation, log_prob() computes the sum of
    the element-wise squared distances, so its value is unnormalized and does
    not depend on the standard deviation. Internally a unit-scale Normal
    centred at `mean` is kept.

    Args:
      mean: Mean of the distribution.
    """
    self._mean = mean
    self._dist = tfd.Normal(mean, 1.0)
def mh_r(x0, x1):
    """Return log g(x0 | x1) minus the energy at x1.

    The proposal g(. | x1) is a diagonal Normal. Its mean takes a step of
    size `stepsz` from x1 along the preconditioned negative energy gradient
    plus `m.invdiag_grad(x1)`; its scale is sqrt(2 * stepsz / metric_diag).
    """
    # Energy at x1 is the first column of the transformed energy.
    energy_x1 = transformed_energy(x1, energ_emb, m)[:, 0]
    inv_metric = 1. / m.metric_diag(x1)
    energy_grad = tf.gradients(energy_x1, [x1])[0]

    step = stepsz[:, None]
    drift = -inv_metric * energy_grad + m.invdiag_grad(x1)
    proposal_mean = x1 + step * drift
    proposal_scale = tf.sqrt(2 * step * inv_metric)

    proposal = tfd.Normal(loc=proposal_mean, scale=proposal_scale)
    # Sum the per-dimension log-densities over the last axis.
    log_g_x0_given_x1 = tf.reduce_sum(proposal.log_prob(x0), axis=-1)
    return log_g_x0_given_x1 - energy_x1
def prior(latent_size):
    """Build the standard-Normal prior over a single latent vector.

    Args:
        latent_size (int): The dimension of the latent space.

    Returns:
        tf.distributions.Normal: Normal with loc zero and scale one, both of
        shape [latent_size].
    """
    param_shape = [latent_size]
    return ds.Normal(tf.zeros(param_shape), tf.ones(param_shape))
def decoder(latent, img_size, units):
    """Decoder builds a decoder network on the given latent variable tensor.

    Args:
        latent (tf.Tensor): sample_size x batch_size x latent_size latent
            tensor.
        img_size (int): Number of output units for loc and for scale.
        units (int): Width of the shared hidden layer.

    Returns:
        (tf.distribution.Normal): The batch_shape = (sample x batch x img)
            normal distributions representing the sampled img likelihoods.
    """
    hidden = tf.layers.dense(latent, units)
    loc = tf.layers.dense(hidden, img_size)
    # A raw dense output can be zero or negative, which is not a valid
    # standard deviation and makes log_prob NaN. Softplus keeps it strictly
    # positive.
    scale = tf.nn.softplus(tf.layers.dense(hidden, img_size))
    return ds.Normal(loc, scale)
def encoder(img, latent_size, units):
    """Encoder builds an encoder network against the provided image tensor.

    Args:
        img (tf.Tensor): batch_size x img_size tensor of flat images.
        latent_size (int): Number of output units for loc and for scale.
        units (int): Width of the shared hidden layer.

    Returns:
        (tf.distribution.Normal): The batch_shape = (batch_size, latent_size)
            batch of posterior normal distributions.
    """
    hidden = tf.layers.dense(img, units)
    loc = tf.layers.dense(hidden, latent_size)
    # A raw dense output can be zero or negative, which is not a valid
    # standard deviation. Softplus keeps the posterior scale strictly
    # positive.
    scale = tf.nn.softplus(tf.layers.dense(hidden, latent_size))
    return ds.Normal(loc, scale)
def make_sparse_pred_reg(sigma2, x):
    """Build the Normal predictive distribution for sparse inputs `x`.

    The location is a linear term in `x` plus half of a second-order term
    (squared projection minus projection of squares), using biases and
    features obtained by summing `all_bias` and `all_feat` over axis 0.
    `sigma2` is passed as the Normal's `scale`.
    """
    # FIXME (original author): x is used unsquared here, which is only
    # equivalent to x ** 2 when its entries are 0/1.
    x2 = x

    this_bias = tf.reduce_sum(all_bias, axis=0)
    this_feat = tf.reduce_sum(all_feat, axis=0)

    # Column vector, as required by the sparse-dense matmul.
    w = tf.reshape(this_bias, (-1, 1))
    V = this_feat
    V2 = V**2

    linear_term = tf.squeeze(tf.sparse_tensor_dense_matmul(x, w))
    square_of_sum = tf.sparse_tensor_dense_matmul(x, V)**2
    sum_of_squares = tf.sparse_tensor_dense_matmul(x2, V2)
    pairwise_term = tf.reduce_sum(square_of_sum - sum_of_squares, axis=1)

    logits = linear_term + 0.5 * pairwise_term
    return tfd.Normal(logits, scale=sigma2)
def update_mala(self, Sigma_proposal, h):
    """Build the ops for one Metropolis-adjusted Langevin (MALA) update.

    :Sigma_proposal: covariance matrix of the proposal distribution; it is
        multiplied by `h` before use
    :h: step-size multiplier applied to `Sigma_proposal`
    :returns: assign ops for the parameters, their log posterior and the
        gradient of the log posterior

    """
    dims = tf.shape(self.params_log)
    Sigma_proposal = h * Sigma_proposal
    L_proposal = tf.cholesky(Sigma_proposal)
    L_inv = tf.matrix_inverse(L_proposal)

    # Langevin proposal: gradient drift plus correlated Gaussian noise.
    candidate = self.params_log
    candidate += 0.5 * tf.matmul(Sigma_proposal, self.params_logpost_grad)
    candidate += tf.matmul(L_proposal, distr.Normal(0.0, 1.0).sample(dims))

    cand_logpost = self.logposterior(candidate)
    cand_logpost_grad = tf.gradients(cand_logpost, candidate)[0]

    # Residuals of the reverse (candidate -> current) and forward
    # (current -> candidate) proposals about their respective means.
    center_current = self.params_log - candidate - \
        0.5 * tf.matmul(Sigma_proposal, cand_logpost_grad)
    center_cand = candidate - self.params_log - \
        0.5 * tf.matmul(Sigma_proposal, self.params_logpost_grad)

    # Log acceptance ratio: posterior ratio corrected by the asymmetric
    # proposal densities.
    logprob = cand_logpost - self.params_logpost
    logprob -= 0.5 * tf.reduce_sum(
        tf.square(tf.matmul(L_inv, center_current)))
    logprob += 0.5 * tf.reduce_sum(tf.square(tf.matmul(L_inv, center_cand)))

    log_unif = tf.log(distr.Uniform().sample())
    new, new_logpost, new_logpost_grad = tf.cond(
        tf.greater(logprob, log_unif),
        lambda: (candidate, cand_logpost, cand_logpost_grad),
        lambda: (self.params_log, self.params_logpost,
                 self.params_logpost_grad))

    op_param = tf.assign(self.params_log, new)
    op_logpost = tf.assign(self.params_logpost, new_logpost)
    op_logpost_grad = tf.assign(self.params_logpost_grad, new_logpost_grad)
    return op_param, op_logpost, op_logpost_grad
def calculate_probability(training_results, number_labels, latent_dim,
                          stddv_datapoints, X_test):
    """Score every test point under each label's fitted Normal model.

    For each of the first `number_labels` entries of `training_results`, the
    model at index 1 supplies `z_mean` and `w_mean`. The averaged latent is
    projected through `w_mean` to give the location of a Normal with scale
    `stddv_datapoints`, and each test point gets the mean log-probability
    under it.

    Returns:
        Array with one row per test point and one column per label.

    Note: each score is obtained with `Tensor.eval()`, so a default
    TensorFlow session must be active.
    """
    scores_per_label = []
    for i in range(number_labels):
        model = training_results[i][1]

        # Column means of the transposed z_mean, reshaped to a row vector.
        z = list(pd.DataFrame(model.z_mean.transpose()).mean())
        z_av = np.array(z, dtype="float32")
        z_av.shape = (1, latent_dim)

        x_dist = tfd.Normal(
            loc=tf.matmul(z_av, model.w_mean.transpose()),
            scale=stddv_datapoints * tf.ones([1, model.w_mean.shape[0]]),
            name="x_experiment{}".format(i))

        label_scores = [
            tf.reduce_mean(x_dist.log_prob(testpoint)).eval()
            for testpoint in X_test
        ]
        scores_per_label.append(label_scores)

    return np.array(scores_per_label).transpose()
def while_step(t, prev_z, log_q_z_ta, zs_ta):
    """Run one proposal step: sample z_t given the previous z and x_t.

    Args:
      t: Scalar loop counter.
      prev_z: Latent sampled at the previous step, [batch, state_size].
      log_q_z_ta: TensorArray receiving the log-density of each sample.
      zs_ta: TensorArray receiving each sampled latent.

    Returns:
      (t + 1, new_z, updated log-prob array, updated z array). The returned
      `new_z` is the raw sample; only the values written to the arrays are
      zeroed past the end of a sequence.
    """
    x = xs_ta.read(t)

    # Mean is linear in the concatenation [prev_z, x] along the state axis.
    q_input = tf.concat([prev_z, x], 1)
    q_loc = tf.matmul(q_input, self.W_mu) + self.b_mu[tf.newaxis, :]
    # Scale is the softplus of the parameters, floored at sigma_min.
    q_scale = tf.math.maximum(
        tf.math.softplus(self.log_sigma), self.sigma_min)

    q_z = tfd.Normal(loc=q_loc, scale=q_scale)
    new_z = q_z.sample()

    within_sequence = t < lens
    masked_z = tf.where(within_sequence, new_z, tf.zeros_like(new_z))
    new_zs_ta = zs_ta.write(t, masked_z)

    new_log_q_z = q_z.log_prob(new_z)
    masked_log_q = tf.where(
        within_sequence, new_log_q_z, tf.zeros_like(new_log_q_z))
    new_log_q_z_ta = log_q_z_ta.write(t, masked_log_q)

    return (t + 1, new_z, new_log_q_z_ta, new_zs_ta)
def MutualInformationLowerBound(c_rec, c_sample, opt):
    """Compute the mutual information lower bound for InfoGANs.

    The first `opt.code_size` columns of `c_rec` are the predicted code
    means. The standard deviation is one when `opt.InfoGAN_fix_std` is set;
    otherwise it is the softplus of the next `opt.code_size` columns. Returns
    the batch mean of the per-example summed Normal log-likelihood of the
    sampled code.
    """
    code_size = opt.code_size
    est_vec = c_rec[:, :code_size]
    c_sample_vec = c_sample[:, :code_size]

    if opt.InfoGAN_fix_std:
        std_vec = tf.ones_like(est_vec)
    else:
        raw_std = c_rec[:, code_size:2 * code_size]
        std_vec = tf.nn.softplus(raw_std)

    log_probs = ds.Normal(est_vec, std_vec).log_prob(c_sample_vec)
    per_example = tf.reduce_sum(log_probs, [1])
    return tf.reduce_mean(per_example)
def while_step(t, failed, lens, prev_z, zs_ta, fs_ta):
    """Advance the sampling loop by one step.

    Draws a new latent around `prev_z + drift`, draws a failure indicator
    from it, records both (zeros for trajectories that had already failed)
    and updates the failure flags and length counters.

    Returns:
      (t + 1, new_failed, new_lens, new_zs, updated z array, updated f array).
    """
    # Transition mean, [batch_size, state_size].
    z_loc = prev_z + self.drift[tf.newaxis, :]
    transition = tfd.Normal(loc=z_loc, scale=self.z_scale)
    new_zs = transition.sample()

    # [batch_size, state_size] x [state_size, 1] plus bias, flattened to
    # [batch_size] and scaled by the temperature.
    raw_logits = tf.matmul(new_zs, self.W_f) + self.b_f[tf.newaxis, :]
    bern_logits = tf.reshape(raw_logits, [batch_size])
    bern_logits = bern_logits * self.bern_temp

    # [batch_size] failure indicators.
    new_fs = tfd.Bernoulli(logits=bern_logits).sample()

    recorded_zs = tf.where(failed, tf.zeros_like(new_zs), new_zs)
    recorded_fs = tf.where(failed, tf.zeros_like(new_fs), new_fs)
    new_zs_ta = zs_ta.write(t, recorded_zs)
    new_fs_ta = fs_ta.write(t, recorded_fs)

    # A trajectory stays failed once it has failed.
    new_failed = tf.logical_or(failed, tf.equal(new_fs, 1))
    # Lengths grow by one only while the process has not failed.
    new_lens = lens + (1 - tf.to_int32(new_failed))
    return t + 1, new_failed, new_lens, new_zs, new_zs_ta, new_fs_ta
def update_rwmh(self, L_proposal):
    """Build the ops for one random-walk Metropolis-Hastings update.

    :L_proposal: Lower triangular Cholesky decomposition of the covariance of
        the proposal distribution
    :returns: assign ops for the parameters and their log posterior

    """
    dims = tf.shape(self.params_log)

    # Symmetric random-walk proposal: current value plus correlated noise.
    candidate = self.params_log
    candidate += tf.matmul(L_proposal, distr.Normal(0.0, 1.0).sample(dims))
    cand_logpost = self.logposterior(candidate)

    # The proposal is symmetric, so the log acceptance ratio is just the
    # difference of log posteriors.
    logprob = cand_logpost - self.params_logpost
    log_unif = tf.log(distr.Uniform().sample())
    new, new_logpost = tf.cond(
        tf.greater(logprob, log_unif),
        lambda: (candidate, cand_logpost),
        lambda: (self.params_log, self.params_logpost))

    op_param = tf.assign(self.params_log, new)
    op_logpost = tf.assign(self.params_logpost, new_logpost)
    return op_param, op_logpost
def _nnpom(self, projected, thresholds):
    """Turn projections and thresholds into per-class probabilities.

    For every sample the difference `threshold - projection` is passed
    through the configured link function to obtain cumulative probabilities.
    A final column of ones is appended, and successive differences along the
    class axis give the per-class probabilities.

    Args:
        projected: projections; flattened to one value per sample (and
            divided by `self.tau` when `self.use_tau == 1`).
        thresholds: the `num_classes - 1` thresholds.

    Returns:
        Tensor with one row per sample and one column per class.
    """
    if self.use_tau == 1:
        projected = K.reshape(projected, shape=[-1]) / self.tau
    else:
        projected = K.reshape(projected, shape=[-1])

    # projected = K.Print(projected, data=[K.reduce_min(projected), K.reduce_max(projected), K.reduce_mean(projected)], message='projected min max mean')

    # Build [m, num_classes - 1] grids of thresholds (a) and projections (b).
    m = K.shape(projected)[0]
    a = K.reshape(K.tile(thresholds, [m]), shape=[m, -1])
    b = K.transpose(
        K.reshape(K.tile(projected, [self.num_classes - 1]), shape=[-1, m]))
    z3 = a - b

    # z3 = K.cond(K.reduce_min(K.abs(z3)) < 0.01, lambda: K.Print(z3, data=[K.reduce_min(K.abs(z3))], message='z3 abs min', summarize=100), lambda: z3)

    if self.link_function == 'probit':
        a3T = self.dist.cdf(z3)
    elif self.link_function == 'cloglog':
        a3T = 1 - K.exp(-K.exp(z3))
    elif self.link_function == 'glogit':
        a3T = 1.0 / K.pow(1.0 + K.exp(-self.lmbd * (z3 - self.mu)),
                          self.alpha)
    elif self.link_function == 'cauchit':
        # Standard Cauchy CDF: atan(z) / pi + 1/2. Dividing inside the atan,
        # as before, is not a CDF and can leave the [0, 1] range.
        a3T = K.atan(z3) / math.pi + 0.5
    elif self.link_function == 'lgamma':
        a3T = K.cond(
            self.q < 0,
            lambda: igammac(K.pow(self.q, -2),
                            K.pow(self.q, -2) * K.exp(self.q * z3)),
            lambda: K.cond(
                self.q > 0,
                lambda: igamma(K.pow(self.q, -2),
                               K.pow(self.q, -2) * K.exp(self.q * z3)),
                lambda: self.dist.cdf(z3)))
    elif self.link_function == 'gauss':
        # a3T = 1.0 / 2.0 + K.sign(z3) * K.igamma(1.0 / self.alpha, K.pow(K.abs(z3) / self.r, self.alpha)) / (2 * K.exp(K.lgamma(1.0 / self.alpha)))
        # z3 = K.Print(z3, data=[K.reduce_max(K.abs(z3))], message='z3 abs max')
        # K.sigmoid(z3 - self.p['mu']) - 1)
        a3T = 1.0 / 2.0 + K.tanh(z3 - self.p['mu']) * igamma(
            1.0 / self.p['alpha'],
            K.pow(K.pow((z3 - self.p['mu']) / self.p['r'], 2),
                  self.p['alpha'])) / (
                      2 * K.exp(lgamma(1.0 / self.p['alpha'])))
    elif self.link_function == 'expgauss':
        # Exponentially modified Gaussian CDF:
        #   Phi(u; 0, v) - exp(-u + v^2 / 2 + log Phi(u; v^2, v))
        # The second Normal therefore has mean v^2 and standard deviation v
        # (these were previously swapped).
        u = self.lmbd * (z3 - self.mu)
        v = self.lmbd * self.sigma
        dist1 = distributions.Normal(loc=0., scale=v)
        dist2 = distributions.Normal(loc=K.pow(v, 2), scale=v)
        a3T = dist1.cdf(u) - K.exp(-u + K.pow(v, 2) / 2 +
                                   K.log(dist2.cdf(u)))
    elif self.link_function == 'ggamma':
        a3T = igamma(self.p['d'] / self.p['p'],
                     K.pow((z3 / self.p['a']), self.p['p'])) / K.exp(
                         lgamma(self.p['d'] / self.p['p']))
    else:
        # Default: logistic link.
        a3T = 1.0 / (1.0 + K.exp(-z3))

    # Append P(y <= last class) = 1, then difference adjacent cumulative
    # probabilities to obtain per-class probabilities.
    a3 = K.concatenate([a3T, K.ones([m, 1])], axis=1)
    a3 = K.concatenate(
        [K.reshape(a3[:, 0], shape=[-1, 1]), a3[:, 1:] - a3[:, 0:-1]],
        axis=-1)
    return a3
def make_likelihood_reg(sigma2, feat_users, feat_items, bias_users,
                        bias_items):
    """Build the Normal predictive distribution for user/item pairs.

    The location is the global bias plus the feature dot product (summed
    over axis 1) plus the user and item biases. `sigma2` is passed as the
    Normal's `scale`.
    """
    interaction = tf.reduce_sum(feat_users * feat_items, 1)
    logits = global_bias + interaction + bias_users + bias_items
    return tfd.Normal(logits, scale=sigma2, name='pred')
def make_prior():
    """Return a standard Normal prior with `embedding_size` dimensions."""
    zero_loc = [0.] * embedding_size
    unit_scale = [1.] * embedding_size
    return tfd.Normal(loc=zero_loc, scale=unit_scale)
def calc_KLdiv(self, z_prior, z_post):
    """Compute the weighted KL term between posterior and per-class priors.

    The posterior parameters are tiled along a new axis of length
    `self.num_classes_kn` so they can be compared against every prior.

    Args:
        z_prior: dict with key 'mu' (and 'sigma' when `z_dist == 'N'`).
        z_post: dict with key 'mu', plus 'sigma' for `z_dist == 'N'` or
            'log_sample' for the sample-based Bernoulli variants.

    Returns:
        The KL term summed over the latent axis and scaled by
        `self.z_kl_weight`. When `self.uk_cycling` is set, entries selected
        by `self.current_cycl_uk_mask` are replaced with the row-wise
        maximum.

    Raises:
        ValueError: for an unknown `z_dist` or `z_B_kl`.
    """
    # `~` on a Python bool is bitwise (-1 / -2, both truthy), which left
    # allow_nan_stats permanently enabled; use logical negation instead.
    allow_nan_stats = not self.debug

    post_mu = tf.tile(z_post['mu'][:, tf.newaxis, :],
                      [1, self.num_classes_kn, 1])

    if self.z_dist == 'N':
        post_sigma = tf.tile(z_post['sigma'][:, tf.newaxis, :],
                             [1, self.num_classes_kn, 1])
        dist_prior = tfd.Normal(loc=z_prior['mu'],
                                scale=z_prior['sigma'],
                                allow_nan_stats=allow_nan_stats)
        dist_post = tfd.Normal(loc=post_mu,
                               scale=post_sigma,
                               allow_nan_stats=allow_nan_stats)
        KLdiv = dist_post.kl_divergence(dist_prior)  # [B, hyp, z]
    elif self.z_dist == 'B':
        if self.z_B_kl in [20, 212]:
            # Monte carlo approximation on the logistic node (a true lower
            # bound but can exhibit higher variance)
            post_log_sample = tf.tile(
                z_post['log_sample'][:, tf.newaxis, :],
                [1, self.num_classes_kn, 1])
            dist_prior = pseudo_LogRelaxedBernoulli(
                logits=z_prior['mu'],
                temperature=self.m['VAEEncoder'].temp_prior,
                allow_nan_stats=allow_nan_stats)
            dist_post = pseudo_LogRelaxedBernoulli(
                logits=post_mu,
                temperature=self.m['VAEEncoder'].temp_post,
                allow_nan_stats=allow_nan_stats)
            KLdiv = dist_post.log_prob(
                post_log_sample) - dist_prior.log_prob(
                    post_log_sample)  # [B, hyp, z]
            if self.z_B_kl == 212:
                # slightly different relaxation from equation 21, but seemed
                # to learn quite well
                KLdiv *= dist_post.prob(post_log_sample)
        elif self.z_B_kl == 21:
            # relax computation of the discrete log mass: not a true lower
            # bound, be aware of overfitting on spurious elements in this
            # 'KL'
            def pseudo_kl(a_logits, b_logits, z_logits):
                """Bernoulli-kl with 'external' labels given by z_logits"""
                delta_probs0 = tf.nn.softplus(-b_logits) - tf.nn.softplus(
                    -a_logits)
                delta_probs1 = tf.nn.softplus(b_logits) - tf.nn.softplus(
                    a_logits)
                return (tf.nn.sigmoid(z_logits) * delta_probs0 +
                        tf.nn.sigmoid(-z_logits) * delta_probs1)

            post_log_sample = tf.tile(
                z_post['log_sample'][:, tf.newaxis, :],
                [1, self.num_classes_kn, 1])
            KLdiv = pseudo_kl(post_mu, z_prior['mu'],
                              z_logits=post_log_sample)
        elif self.z_B_kl == 22:
            # replace discrete mass with the analytic discrete KL: not a true
            # lower bound, be aware of overfitting on spurious elements in
            # this 'KL'
            dist_prior = tfd.Bernoulli(logits=z_prior['mu'],
                                       allow_nan_stats=allow_nan_stats)
            dist_post = tfd.Bernoulli(logits=post_mu,
                                      allow_nan_stats=allow_nan_stats)
            KLdiv = dist_post.kl_divergence(dist_prior)  # [B, hyp, z]
        else:
            raise ValueError('Unknown z_B_kl: {}'.format(self.z_B_kl))
    else:
        raise ValueError('Unknown z_dist: {}'.format(self.z_dist))

    # Sum over the latent axis and apply the KL weight.
    KLdiv = self.z_kl_weight * tf.reduce_sum(KLdiv, axis=2)

    if self.uk_cycling:
        # mask the prediction error of the current uk classes with the
        # highest prediction error of the observation
        KLdiv = tf.where(
            self.current_cycl_uk_mask,
            tf.tile(tf.reduce_max(KLdiv, axis=1, keep_dims=True),
                    [1, self.num_classes_kn]), KLdiv)
    return KLdiv
def __init__(self, FLAGS, env, phase):
    """Build the complete graph for the glimpse model: modules, loop, losses, training.

    In order, the constructor
      1. instantiates the sub-modules and the planner selected by ``FLAGS.planner``,
      2. allocates one ``tf.TensorArray`` per recorded quantity,
      3. unrolls ``FLAGS.num_glimpses`` steps of act -> observe -> update belief -> plan
         as a plain Python loop,
      4. stacks the records and builds the REINFORCE, baseline and VAE losses,
      5. creates the train op, summaries / streaming metrics and the saver.

    Args:
        FLAGS: Configuration object. Read here: ``planner``, ``num_classes``,
            ``size_rnn``, ``size_z`` and ``num_glimpses``.
        env: Environment. Used here: ``patch_shape_flat``, ``step``, ``y_MC`` and
            ``composed_glimpse``.
        phase: Mapping; ``phase['random_locations']`` switches between the random
            policy and the planner, and selects the pre-training optimiser branch.
            Also forwarded to ``self._create_saver``.

    Raises:
        ValueError: If ``FLAGS.planner`` is neither ``'ActInf'`` nor ``'RL'``.

    NOTE(review): ``self.B``, ``self.C``, ``self.is_training``, ``self.global_step``
    and ``self.y_MC`` are read but not assigned here; presumably they are set by
    ``super().__init__`` -- confirm.
    """
    super().__init__(FLAGS, env, phase)

    # Initial size of every TensorArray below; they are created with dynamic_size=True.
    min_glimpses = 3
    random_locations = phase['random_locations']  # tf.logical_and(self.epoch_num < FLAGS.pre_train_epochs, self.is_training)

    # Initialise modules
    n_policies = FLAGS.num_classes if FLAGS.planner == 'ActInf' else 1
    policyNet = PolicyNetwork(FLAGS, self.B, n_policies)
    glimpseEncoder = GlimpseEncoder(FLAGS)
    VAEencoder = Encoder(FLAGS, env.patch_shape_flat)
    VAEdecoder = Decoder(FLAGS, env.patch_shape_flat)
    stateTransition_AC = StateTransition_AC(FLAGS.size_rnn, 2*FLAGS.size_z)
    fc_baseline = tf.layers.Dense(1, name='fc_baseline')

    submodules = {'policyNet': policyNet, 'VAEencoder': VAEencoder, 'VAEdecoder': VAEdecoder}
    if FLAGS.planner == 'ActInf':
        planner = ActInfPlanner(FLAGS, submodules, self.B, env.patch_shape_flat, self.C, stateTransition_AC)
    elif FLAGS.planner == 'RL':
        planner = REINFORCEPlanner(FLAGS, submodules, self.B, env.patch_shape_flat)
    else:
        raise ValueError('Undefined planner.')

    self.n_policies = planner.n_policies

    # variables to remember. Probably to be implemented via TensorArray
    out_ta = []
    out_ta.append(tf.TensorArray(tf.float32, size=min_glimpses, dynamic_size=True, name='obs'))
    out_ta.append(tf.TensorArray(tf.float32, size=min_glimpses, dynamic_size=True, name='glimpse_nlls_posterior'))
    out_ta.append(tf.TensorArray(tf.float32, size=min_glimpses, dynamic_size=True, name='glimpse_reconstr'))
    out_ta.append(tf.TensorArray(tf.float32, size=min_glimpses, dynamic_size=True, name='zs_post'))
    out_ta.append(tf.TensorArray(tf.float32, size=min_glimpses, dynamic_size=True, name='G'))
    out_ta.append(tf.TensorArray(tf.float32, size=min_glimpses, dynamic_size=True, name='actions'))
    out_ta.append(tf.TensorArray(tf.float32, size=min_glimpses, dynamic_size=True, name='actions_mean'))
    out_ta.append(tf.TensorArray(tf.int32, size=min_glimpses, dynamic_size=True, name='decisions'))
    out_ta.append(tf.TensorArray(tf.float32, size=min_glimpses, dynamic_size=True, name='rewards'))
    out_ta.append(tf.TensorArray(tf.float32, size=min_glimpses, dynamic_size=True, name='baselines'))
    # 'current_cs' gets one extra slot: index 0 is written before the loop starts.
    out_ta.append(tf.TensorArray(tf.float32, size=min_glimpses+1, dynamic_size=True, name='current_cs'))
    out_ta.append(tf.TensorArray(tf.bool, size=min_glimpses, dynamic_size=True, name='done'))
    out_ta.append(tf.TensorArray(tf.float32, size=min_glimpses, dynamic_size=True, name='exp_exp_obs'))
    out_ta.append(tf.TensorArray(tf.float32, size=min_glimpses, dynamic_size=True, name='exp_obs'))
    out_ta.append(tf.TensorArray(tf.float32, size=min_glimpses, dynamic_size=True, name='H_exp_exp_obs'))
    out_ta.append(tf.TensorArray(tf.float32, size=min_glimpses, dynamic_size=True, name='exp_H'))
    out_ta.append(tf.TensorArray(tf.float32, size=min_glimpses, dynamic_size=True, name='potential_actions'))

    # Index the arrays by the last path component of their handle name with ':0'
    # stripped; presumably this equals the `name=` given above -- the lookups below
    # (ta_d['obs'], ta_d['G'], ...) rely on it.
    ta_d = {}
    for i, ta in enumerate(out_ta):
        ta_d[ta.handle.name.split('/')[-1].replace(':0', '')] = ta

    # Initial values
    last_done = tf.zeros([self.B], dtype=tf.bool)
    last_decision = tf.fill([self.B], -1)
    # in case starting calculation after initial observation (as first location should be identical for all images)
    next_action, next_action_mean = policyNet.inital_loc()
    next_decision = tf.fill([self.B], -1)
    current_state = stateTransition_AC.initial_state(self.B, next_action)

    ta_d['current_cs'] = write_zero_out(0, ta_d['current_cs'], current_state['c'], last_done)

    # out of loop to not create new tensors every step
    one_hot_label = tf.one_hot(tf.range(FLAGS.num_classes), depth=FLAGS.num_classes)
    one_hot_label_repeated = repeat_axis(one_hot_label, 0, self.B)  # [B * hyp, hyp]

    def current_belief_update(current_state, new_observation, exp_obs_prior, time):
        """Update the class belief and infer a per-class posterior for a new observation.

        The state, action and observation are replicated once per class hypothesis,
        passed through the class-conditional VAE encoder and decoder, and the
        results are reshaped back to one row per example.

        The belief itself is derived from ``calculate_gaussian_nll(exp_obs_prior,
        new_observation)``: at ``time == 0`` it is the softmax of the negated NLL;
        afterwards that softmax is weighted by ``1 / time`` and the previous belief
        ``current_state['c']`` by ``(time - 1) / time``.

        Args:
            current_state: Mapping; keys ``'s'``, ``'l'`` and ``'c'`` are read.
            new_observation: Observation passed to the encoder, the decoder and the
                NLL computation.
            exp_obs_prior: Expected observation that ``new_observation`` is scored
                against.
            time: Python int, index of the current glimpse.

        Returns:
            c: [B, num_classes] belief over classes
            zs_post: [B, num_classes, 2*size_z] posterior mu and sigma, concatenated
            zs_post_samples: [B, num_classes, size_z] posterior samples
            nll_post: [B, num_classes] decoder loss per class hypothesis
            reconstr_post: [B, num_classes, patch_shape_flat] decoder samples
        """
        with tf.name_scope('Belief_update'):
            # Infer posterior z for all hypotheses
            with tf.name_scope('poterior_inference_per_hyp'):
                class_conditional_s = tf.reshape(current_state['s'], [self.B * FLAGS.num_classes, FLAGS.size_rnn])
                new_action_repeated = repeat_axis(current_state['l'], 0, FLAGS.num_classes)
                new_observation_repeated = repeat_axis(new_observation, 0, FLAGS.num_classes)

                z_post = VAEencoder.posterior_inference(one_hot_label_repeated, class_conditional_s, tf.stop_gradient(new_action_repeated), new_observation_repeated)
                # 2 possibilities to infer state from received observations:
                # i)  judge by likelihood of the observations under each hypothesis
                # ii) train a separate model (e.g. LSTM) for inferring states
                # TODO: CAN WE DO THIS IN AN ENCODED SPACE?
                posterior = VAEdecoder.decode(one_hot_label_repeated, class_conditional_s, z_post['sample'], tf.stop_gradient(new_action_repeated), new_observation_repeated)  # ^= filtering, given that transitions are deterministic

                zs_post = tf.reshape(tf.concat([z_post['mu'], z_post['sigma']], axis=1), [self.B, FLAGS.num_classes, 2*FLAGS.size_z])
                zs_post_samples = tf.reshape(z_post['sample'], [self.B, FLAGS.num_classes, FLAGS.size_z])
                reconstr_post = tf.reshape(posterior['sample'], [self.B, FLAGS.num_classes, env.patch_shape_flat])
                nll_post = tf.reshape(posterior['loss'], [self.B, FLAGS.num_classes])

            # beliefs over the classes based on all past observations (uniformly weighted)
            with tf.name_scope('belief_update'):
                # TODO: THINK ABOUT THE SHAPE. PRIOR SHOULD BE FOR EACH HYP. USE new_observation_repeated?
                prior_nll = calculate_gaussian_nll(exp_obs_prior, new_observation)
                if time == 0:
                    c = tf.nn.softmax(-prior_nll, axis=1)
                else:
                    # NOTE(review): for time == 1 the weight on the previous belief is
                    # (1 - 1) / 1 == 0, i.e. the belief from step 0 is discarded.
                    c = (1. / time) * tf.nn.softmax(-prior_nll, axis=1) + (time - 1.) / time * current_state['c']

        return (c,  # [B, num_classes]
                zs_post,  # [B, num_classes, 2*z]
                zs_post_samples,  # [B, num_classes, z]
                nll_post,  # [B, num_classes]
                reconstr_post)  # [B, num_classes, glimpse]

    with tf.name_scope('Main_loop'):
        # The loop is unrolled in Python: every iteration adds its own ops to the graph.
        for time in range(FLAGS.num_glimpses):
            # NOTE(review): `time > 1` nested under `time == 0` can never hold, so as
            # laid out here this whole branch is unreachable. It is also the only place
            # where `next_exp_obs` is assigned (the belief update below reads it on
            # every step), and it reads `labels`, which is not assigned anywhere in this
            # method. Its planning_step call differs from the one further down
            # (4 arguments / 5 results here vs. 5 arguments / 4 results there).
            # This looks like work in progress -- confirm the intended control flow
            # before relying on this loop.
            if time == 0:
                if time > 1:
                    if random_locations:
                        next_decision, next_action, next_action_mean, pl_records = planner.random_policy()
                    else:
                        next_decision, next_action, next_action_mean, next_exp_obs, pl_records = planner.planning_step(current_state, z_samples, time, self.is_training)
                    # TODO : Could REUSE FROM PLANNING STEP
                    current_state = stateTransition_AC([last_z, labels, next_action], last_state)

            observation, corr_classification_fb, done = env.step(next_action, next_decision)
            # Once an example is done it stays done.
            done = tf.logical_or(last_done, done)
            obs_enc = glimpseEncoder.encode(observation)
            current_state['c'], zs_post, z_samples, nll_posterior, reconstr_posterior = current_belief_update(current_state, obs_enc, next_exp_obs, time)
            # baseline = fc_baseline(tf.stop_gradient(tf.concat([current_c, tf.fill([self.B, 1], tf.cast(time, tf.float32))], axis=1)))
            # The baseline sees the belief only through stop_gradient, so its loss does
            # not back-propagate into whatever produced the belief.
            baseline = tf.squeeze(fc_baseline(tf.stop_gradient(current_state['c'])), 1)

            # t=0 to T-1. ACTION RECORDING HAS TO STAY BEFORE PLANNING OR WILL BE OVERWRITTEN
            ta_d['obs'] = write_zero_out(time, ta_d['obs'], observation, done)
            ta_d['zs_post'] = write_zero_out(time, ta_d['zs_post'], zs_post, done)  # [B, n_policies, size_z]
            ta_d['glimpse_nlls_posterior'] = write_zero_out(time, ta_d['glimpse_nlls_posterior'], nll_posterior, done)  # [B, n_policies]
            ta_d['glimpse_reconstr'] = write_zero_out(time, ta_d['glimpse_reconstr'], reconstr_posterior, done)  # for visualisation only
            ta_d['actions'] = write_zero_out(time, ta_d['actions'], next_action, done)  # location actions, not including the decision actions
            ta_d['actions_mean'] = write_zero_out(time, ta_d['actions_mean'], next_action_mean, done)  # location actions, not including the decision actions
            ta_d['baselines'] = write_zero_out(time, ta_d['baselines'], baseline, done)
            ta_d['done'] = ta_d['done'].write(time, done)
            # t=0 to T
            # Rewards are masked with last_done rather than done, unlike the records above.
            ta_d['rewards'] = write_zero_out(time, ta_d['rewards'] , corr_classification_fb, last_done)

            if random_locations:
                next_decision, next_action, next_action_mean, pl_records = planner.random_policy()
            else:
                next_decision, next_action, next_action_mean, pl_records = planner.planning_step(current_state, zs_post, z_samples, time, self.is_training)

            # t=1 to T
            # Every key of pl_records must already exist in ta_d (see the arrays above).
            for k, v in pl_records.items():
                ta_d[k] = write_zero_out(time, ta_d[k], v, last_done)
            ta_d['current_cs'] = write_zero_out(time+1, ta_d['current_cs'], current_state['c'], last_done)  # ONLY ONE t=0 TO T
            ta_d['decisions'] = write_zero_out(time, ta_d['decisions'], next_decision, last_done)
            # copy forward
            classification_decision = tf.where(last_done, last_decision, next_decision)
            # pass on to next time step
            last_done = done
            last_decision = next_decision
            last_z = zs_post  # TODO: or should this be the sampled ones?
            # last_c = current_c  # TODO: could also use the one from planning (new_c) or pi
            # last_s = current_s
            last_state = current_state

            # TODO: break loop if tf.reduce_all(last_done) (requires tf.while loop)
            # No effect on the iteration: the `for` statement rebinds `time` on every pass.
            time += 1

    with tf.name_scope('Stacking'):
        self.obs = ta_d['obs'].stack()  # [T,B,glimpse]
        self.actions = ta_d['actions'].stack()  # [T,B,2]
        actions_mean = ta_d['actions_mean'].stack()  # [T,B,2]
        self.decisions = ta_d['decisions'].stack()
        rewards = ta_d['rewards'].stack()
        done = ta_d['done'].stack()
        self.glimpse_nlls_posterior = ta_d['glimpse_nlls_posterior'].stack()  # [T,B,hyp]
        zs_post = ta_d['zs_post'].stack()  # [T,B,hyp,2*z]
        self.state_believes = ta_d['current_cs'].stack()  # [T+1,B,hyp]
        self.G = ta_d['G'].stack()  # not zero'd-out so far!
        bl_loc = ta_d['baselines'].stack()
        self.glimpse_reconstr = ta_d['glimpse_reconstr'].stack()  # [T,B,hyp,glimpse]
        # further records for debugging
        self.exp_exp_obs = ta_d['exp_exp_obs'].stack()
        self.exp_obs = ta_d['exp_obs'].stack()
        self.H_exp_exp_obs = ta_d['H_exp_exp_obs'].stack()
        self.exp_H = ta_d['exp_H'].stack()
        self.potential_actions = ta_d['potential_actions'].stack()  # [T,B,n_policies,loc]

        self.num_glimpses_dyn = tf.shape(self.obs)[0]
        # Per example: number of steps whose `done` flag was still False.
        T = FLAGS.num_glimpses - tf.count_nonzero(done, 0, dtype=tf.float32)
        self.avg_T = tf.reduce_mean(T)

    with tf.name_scope('Losses'):
        with tf.name_scope('RL'):
            # Reverse cumulative sum over the time axis.
            returns = tf.cumsum(rewards, reverse=True, axis=0)
            policy_losses = policyNet.REINFORCE_losses(returns, bl_loc, self.actions, actions_mean)  # [T,B]
            policy_loss = tf.reduce_sum(tf.reduce_mean(policy_losses, 1))

            # The baseline at step t is regressed onto the return from step t+1 on;
            # the returns are treated as constants.
            baseline_mse = tf.reduce_mean(tf.square(tf.stop_gradient(returns[1:]) - bl_loc[:-1]))

        with tf.name_scope('Classification'):
            # might never make a classification decision
            # TODO: SHOULD I FORCE THE ACTION AT t=t TO BE A CLASSIFICATION?
            self.classification = classification_decision

        with tf.name_scope('VAE'):
            # mask losses of wrong hypotheses
            nll_posterior = tf.reduce_sum(self.glimpse_nlls_posterior, 0)  # sum over time
            correct_hypoths = tf.cast(tf.one_hot(env.y_MC, depth=FLAGS.num_classes), tf.bool)
            nll_posterior = tf.where(correct_hypoths, nll_posterior, tf.zeros_like(nll_posterior))  # zero-out all but true hypothesis
            nll_posterior = tf.reduce_mean(nll_posterior)  # mean over batch

            # assume N(0,1) prior model (even though atm prior never used)
            prior_mu = tf.fill([self.B, FLAGS.size_z], 0.)
            prior_sigma = tf.fill([self.B, FLAGS.size_z], 1.)

            zs_post_correct = tf.boolean_mask(zs_post, correct_hypoths, axis=1)
            # zs_post holds mu and sigma concatenated along the last axis; split them again.
            post_mu, post_sigma = tf.split(zs_post_correct, 2, axis=2)
            # KL_div = T * VAEencoder.kl_div_normal(post_mu, post_sigma, prior_mu, prior_sigma)  # NOTE: "T *" is wrong as T is [self.B]. Incorporate before reducing to a scalar
            N_post = tfd.Normal(loc=post_mu, scale=post_sigma)
            N_prior = tfd.Normal(loc=prior_mu, scale=prior_sigma)
            KL_div = N_post.kl_divergence(N_prior)
            KL_div = tf.where(tf.tile(done[:, :, tf.newaxis], [1, 1, FLAGS.size_z]), tf.zeros_like(KL_div), KL_div)  # replace those that are done
            KL_div = tf.reduce_mean(tf.reduce_sum(KL_div, 0))

        # TODO: SCALE LOSSES DIFFERENTLY? (only necessary if they flow into the same weights, might not be the case so far)
        self.loss = policy_loss + baseline_mse + nll_posterior + KL_div

    with tf.variable_scope('Optimizer'):
        if random_locations:
            # Random-location phase: optimise only the VAE terms, restricted to the
            # encoder and decoder variables.
            pretrain_vars = VAEencoder.trainable + VAEdecoder.trainable
            self.train_op, gradient_check_Pre, _ = self._create_train_op(FLAGS, nll_posterior + KL_div, self.global_step, varlist=pretrain_vars)
        else:
            self.train_op, gradient_check_F, _ = self._create_train_op(FLAGS, self.loss, self.global_step)

    with tf.name_scope('Summaries'):
        metrics_upd_coll = "streaming_updates"

        scalars = {'loss/loss': self.loss,
                   'loss/accuracy': tf.reduce_mean(tf.cast(tf.equal(classification_decision, self.y_MC), tf.float32)),
                   'loss/VAE_nll_posterior': nll_posterior,
                   'loss/VAE_KL_div': KL_div,
                   'loss/RL_loc_baseline_mse': tf.reduce_mean(baseline_mse),
                   'loss/RL_policy_loss': policy_loss,
                   'loss/RL_returns': tf.reduce_mean(returns),
                   'misc/T': self.avg_T,
                   'misc/share_no_decision': tf.count_nonzero(tf.equal(classification_decision, -1), dtype=tf.float32) / tf.cast(self.B, tf.float32)}

        # Each scalar is registered twice: as a summary and as a streaming mean whose
        # update op is collected in `metrics_upd_coll`.
        for name, scalar in scalars.items():
            tf.summary.scalar(name, scalar)
            tf.metrics.mean(scalar, name=name, updates_collections=metrics_upd_coll)

        self.metrics_update = tf.get_collection(metrics_upd_coll)
        # Strip the scope prefix and the update-op suffix from the op names;
        # presumably this recovers the keys of `scalars` -- confirm.
        self.metrics_names = [v.name.replace('_1/update_op:0', '').replace('Summaries/', '') for v in self.metrics_update]

    self.summary = tf.summary.merge_all()

    self.glimpses_composed = env.composed_glimpse(FLAGS, self.obs, self.num_glimpses_dyn)
    self.acc = tf.reduce_mean(tf.cast(tf.equal(classification_decision, self.y_MC), tf.float32))  # only to get easy direct intermediate outputs

    self.saver = self._create_saver(phase)