def build_kl(self):
    # estimate entropy surrogate
    estimator = SpectralScoreEstimator(
        eta=self.eta, n_eigen_threshold=self.n_eigen_threshold)
    entropy_sur = entropy_surrogate(estimator, self.noisy_func_x_rand)

    # compute analytic cross entropy
    kernel_matrix = self.prior_kernel.K(tf.cast(self.x_rand, tf.float64)) \
        + self.injected_noise ** 2 * tf.eye(
            tf.shape(self.x_rand)[0], dtype=tf.float64)
    prior_dist = tf.contrib.distributions.MultivariateNormalFullCovariance(
        tf.zeros([tf.shape(self.x_rand)[0]], dtype=tf.float64),
        kernel_matrix)
    cross_entropy = -tf.reduce_mean(
        prior_dist.log_prob(tf.to_double(self.noisy_func_x_rand)))
    self.kl_surrogate = -entropy_sur + tf.to_float(cross_entropy)
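# Note (explanatory sketch, not part of the original source): build_kl above
# assembles a surrogate for the functional KL divergence
#
#     KL[q || p] = -H[q] + H[q, p],    with  H[q, p] = -E_{f ~ q}[log p(f)],
#
# where q is the distribution of the noisy function values at x_rand and p is
# the GP prior restricted to those inputs. H[q] is handled by entropy_surrogate
# (the spectral Stein gradient estimator), while H[q, p] is analytic here
# because p is a multivariate Gaussian with covariance kernel_matrix, which is
# exactly what cross_entropy evaluates.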
def build_kl(self):
    # estimate entropy surrogate
    estimator = SpectralScoreEstimator(
        eta=self.eta, n_eigen_threshold=self.n_eigen_threshold)
    entropy_sur = entropy_surrogate(estimator, self.noisy_func_x_rand)

    # estimate cross entropy
    self.prior_func_x_rand = self.prior_gen(self.x_rand, self.n_particles)
    self.noisy_prior_func_x_rand = (
        self.prior_func_x_rand
        + self.injected_noise * tf.random_normal(
            tf.shape(self.prior_func_x_rand)))
    cross_entropy_gradients = estimator.compute_gradients(
        self.noisy_prior_func_x_rand, self.noisy_func_x_rand)
    cross_entropy_sur = -tf.reduce_mean(
        tf.reduce_sum(
            tf.stop_gradient(cross_entropy_gradients) * self.noisy_func_x_rand,
            -1))
    self.kl_surrogate = -entropy_sur + cross_entropy_sur
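# Note (explanatory sketch, not part of the original source): this variant
# also estimates the cross-entropy term instead of evaluating it in closed
# form. The score estimator is fit on noisy prior samples, giving an estimate
# g(f) of d log p(f) / df evaluated at the posterior samples
# f = noisy_func_x_rand. With that estimate frozen by tf.stop_gradient,
#
#     cross_entropy_sur = -E[ sum( stop_gradient(g(f)) * f ) ]
#
# is a surrogate whose gradient with respect to the network parameters matches
# the (reparameterized) gradient of the true cross entropy -E_q[log p(f)].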
def build_model(self, activation_fn=tf.nn.relu):
    """Defines the actual NN model with fully connected layers.

    The loss is computed for partial feedback settings (bandits), so only
    the observed outcome is backpropagated (see weighted loss). Selects the
    optimizer and, finally, initializes the graph.

    Args:
        activation_fn: the activation function used in the nn layers.
    """
    if self.verbose:
        print("Initializing model {}.".format(self.name))
    neg_kl_term, l_number = 0, 0

    # Build network.
    input_x = tf.tile(tf.expand_dims(self.x_adv, 0), [self.n_sample, 1, 1])
    n_in = self.n_in
    for l_number, n_nodes in enumerate(self.layers):
        if n_nodes > 0:
            h = self.build_layer(input_x, [n_in, n_nodes], l_number,
                                 self.hparams.activation)
            input_x = h
            n_in = n_nodes

    # Create last linear layer.
    h = self.build_layer(input_x, [n_in, self.n_out], l_number + 1,
                         activation_fn=lambda x: x)
    self.func_x_adv = h  # (h - tf.to_float(self.gp.input_mean)) / tf.to_float(self.gp.input_std)
    self.func_x = self.func_x_adv[:, :tf.shape(self.x)[0]]
    self.func_adv = self.func_x_adv[:, tf.shape(self.x)[0]:]
    self.y_pred = h[0, :tf.shape(self.x)[0]] * tf.to_float(
        self.gp.input_std) + tf.to_float(self.gp.input_mean)  # TODO: mean

    self.injected_noise = 0.01
    noisy_func_x_adv = self.func_x_adv + self.injected_noise * tf.random_normal(
        tf.shape(h))
    tmp = tf.boolean_mask(tf.transpose(noisy_func_x_adv, [1, 2, 0]),
                          self.weights_x_adv > 0)
    self.noisy_func_x_adv = tf.transpose(tmp)

    # alpha = tf.nn.softplus(self.gp.noise) + 1e-6
    # self.obs_sigma = tf.constant(self.hparams.noise_sigma)
    self.obs_sigma = tf.nn.softplus(
        tf.get_variable('pre_noise_sigma', initializer=-3.))
    y_normed = (self.y - tf.to_float(self.gp.input_mean)) / tf.to_float(
        self.gp.input_std)
    log_likelihood = log_gaussian(y_normed, self.func_x, self.obs_sigma,
                                  reduce_sum=False)

    # Compute functional KL divergence.
    x_adv_64, w_x_adv_64 = tf.cast(self.x_adv, tf.float64), tf.cast(
        self.weights_x_adv, tf.float64)
    eye = tf.eye(tf.shape(self.x_adv)[0], dtype=tf.float64)
    prior_cov = self.gp.cov(x_adv_64, x_adv_64) \
        * self.gp.task_cov(w_x_adv_64, w_x_adv_64) \
        + self.injected_noise ** 2 * eye
    prior_cov_root = tf.to_float(tf.linalg.cholesky(prior_cov))
    estimator = SpectralScoreEstimator(n_eigen_threshold=0.99)
    entropy_sur = entropy_surrogate(estimator, self.noisy_func_x_adv)
    prior_dist = zs.distributions.MultivariateNormalCholesky(
        mean=tf.zeros([tf.shape(self.x_adv)[0]]),
        cov_tril=prior_cov_root)
    cross_entropy = tf.reduce_mean(
        prior_dist.log_prob(self.noisy_func_x_adv))
    kl = -entropy_sur - cross_entropy

    # Only take into account observed outcomes (bandits setting).
    batch_size = tf.to_float(tf.shape(self.x)[0])
    self.weighted_log_likelihood = tf.reduce_sum(
        log_likelihood * self.weights) / batch_size

    self.global_step = tf.train.get_or_create_global_step()
    kl_coeff = 1.  # tf.minimum(tf.to_float(self.global_step % 20000) / 15000., 1.)  # TODO
    elbo = self.weighted_log_likelihood - kl / batch_size * kl_coeff
    self.loss = -elbo

    vars = list(
        set(tf.trainable_variables())
        - set(tf.trainable_variables(self.gp.name)))
    optimizer = tf.train.AdamOptimizer(self.hparams.initial_lr)
    gradients = optimizer.compute_gradients(self.loss, var_list=vars)
    clipped_grads = [(tf.clip_by_value(grad, -self.hparams.max_grad_norm,
                                       self.hparams.max_grad_norm), var)
                     for grad, var in gradients]
    self.train_op = optimizer.apply_gradients(clipped_grads)
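# Illustrative sketch (not from the original source): how the bandit-style
# weighted log-likelihood above masks unobserved outcomes. Only the arm that
# was actually played (weight > 0) contributes to the loss. The array values
# below are made up for the example, and numpy stands in for the TF ops.
import numpy as np

def weighted_log_likelihood(log_lik, weights):
    """Sum of log-likelihoods over observed entries, averaged over the batch."""
    batch_size = log_lik.shape[0]
    return np.sum(log_lik * weights) / batch_size

per_arm_log_lik = np.array([[-0.1, -2.3],    # context 1: log-lik of each arm
                            [-1.5, -0.2]])   # context 2
observed_mask = np.array([[1.0, 0.0],        # arm 0 played for context 1
                          [0.0, 1.0]])       # arm 1 played for context 2
print(weighted_log_likelihood(per_arm_log_lik, observed_mask))  # -> -0.15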