Example #1
    def build_kl(self):
        # estimate entropy surrogate
        estimator = SpectralScoreEstimator(
            eta=self.eta, n_eigen_threshold=self.n_eigen_threshold)
        entropy_sur = entropy_surrogate(estimator, self.noisy_func_x_rand)

        # compute analytic cross entropy
        kernel_matrix = self.prior_kernel.K(tf.cast(self.x_rand, tf.float64)) \
                        + self.injected_noise ** 2 * tf.eye(tf.shape(self.x_rand)[0], dtype=tf.float64)
        prior_dist = tf.contrib.distributions.MultivariateNormalFullCovariance(
            tf.zeros([tf.shape(self.x_rand)[0]], dtype=tf.float64),
            kernel_matrix)
        cross_entropy = -tf.reduce_mean(
            prior_dist.log_prob(tf.to_double(self.noisy_func_x_rand)))

        self.kl_surrogate = -entropy_sur + tf.to_float(cross_entropy)
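
In this variant the prior has a known kernel, so the KL against it splits into an entropy term (estimated with the spectral score estimator above) and a cross-entropy term that is computed analytically as the log-density of the noisy function samples under N(0, K + sigma^2 I). Below is a minimal sketch of that analytic cross-entropy term written with plain TF 1.x linear-algebra ops instead of tf.contrib.distributions; the function name and tensor shapes are illustrative, not part of the original code.

import numpy as np
import tensorflow as tf

def gaussian_prior_cross_entropy(samples, kernel_matrix, injected_noise):
    # samples: [n_particles, n_points] function draws from q at the points
    # where kernel_matrix ([n_points, n_points]) was evaluated.
    dtype = kernel_matrix.dtype
    n = tf.shape(kernel_matrix)[0]
    cov = kernel_matrix + injected_noise ** 2 * tf.eye(n, dtype=dtype)
    chol = tf.linalg.cholesky(cov)
    # Mahalanobis term via a triangular solve: L z = f^T.
    solved = tf.linalg.triangular_solve(
        chol, tf.transpose(tf.cast(samples, dtype)), lower=True)
    mahalanobis = tf.reduce_sum(tf.square(solved), axis=0)
    log_det = 2. * tf.reduce_sum(tf.log(tf.linalg.diag_part(chol)))
    log_prob = -0.5 * (mahalanobis + log_det
                       + tf.cast(n, dtype) * np.log(2. * np.pi))
    # Monte Carlo estimate of -E_q[log p(f)], matching the sign convention
    # of cross_entropy in Example #1.
    return -tf.reduce_mean(log_prob)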
Example #2
    def build_kl(self):
        # estimate entropy surrogate
        estimator = SpectralScoreEstimator(
            eta=self.eta, n_eigen_threshold=self.n_eigen_threshold)
        entropy_sur = entropy_surrogate(estimator, self.noisy_func_x_rand)

        # estimate cross entropy
        self.prior_func_x_rand = self.prior_gen(self.x_rand, self.n_particles)
        self.noisy_prior_func_x_rand = self.prior_func_x_rand + self.injected_noise * tf.random_normal(
            tf.shape(self.prior_func_x_rand))

        # Score of the prior at the q samples, estimated from prior samples only.
        cross_entropy_gradients = estimator.compute_gradients(
            self.noisy_prior_func_x_rand, self.noisy_func_x_rand)
        # Linear surrogate: its gradient w.r.t. the q samples equals the
        # estimated score, so backprop yields the cross-entropy gradient.
        cross_entropy_sur = -tf.reduce_mean(
            tf.reduce_sum(
                tf.stop_gradient(cross_entropy_gradients) *
                self.noisy_func_x_rand, -1))

        self.kl_surrogate = -entropy_sur + cross_entropy_sur
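
Here the prior is only available through samples (prior_gen), so the cross entropy is also estimated with the spectral score estimator: the estimated prior score at the q samples enters through a stop_gradient linear term whose gradient matches the cross-entropy gradient. A standalone usage sketch under TF 1.x follows; the tensor shapes are made up, and SpectralScoreEstimator / entropy_surrogate are assumed importable from the same codebase as the examples.

import tensorflow as tf
# SpectralScoreEstimator and entropy_surrogate are the SSGE helpers used in
# the examples above; the import path depends on the surrounding codebase.

# Made-up shapes: 64 function draws evaluated at 20 measurement points.
q_samples = tf.random_normal([64, 20])      # stands in for noisy_func_x_rand
prior_samples = tf.random_normal([64, 20])  # stands in for noisy_prior_func_x_rand

estimator = SpectralScoreEstimator(eta=0.1, n_eigen_threshold=0.99)

# Entropy surrogate: its gradients match those of the entropy of q.
entropy_sur = entropy_surrogate(estimator, q_samples)

# Prior score at the q samples, estimated from prior samples only.
score = estimator.compute_gradients(prior_samples, q_samples)
cross_entropy_sur = -tf.reduce_mean(
    tf.reduce_sum(tf.stop_gradient(score) * q_samples, -1))

kl_surrogate = -entropy_sur + cross_entropy_sur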
Example #3
    def build_model(self, activation_fn=tf.nn.relu):
        """Defines the actual NN model with fully connected layers.

    The loss is computed for partial feedback settings (bandits), so only
    the observed outcome is backpropagated (see weighted loss).
    Selects the optimizer and, finally, it also initializes the graph.

    Args:
      activation_fn: the activation function used in the nn layers.
    """

        if self.verbose:
            print("Initializing model {}.".format(self.name))
        neg_kl_term, l_number = 0, 0

        # Build network.
        input_x = tf.tile(tf.expand_dims(self.x_adv, 0), [self.n_sample, 1, 1])
        n_in = self.n_in

        for l_number, n_nodes in enumerate(self.layers):
            if n_nodes > 0:
                h = self.build_layer(input_x, [n_in, n_nodes], l_number,
                                     self.hparams.activation)
                input_x = h
                n_in = n_nodes

        # Create last linear layer
        h = self.build_layer(input_x, [n_in, self.n_out],
                             l_number + 1,
                             activation_fn=lambda x: x)

        self.func_x_adv = h  # (h - tf.to_float(self.gp.input_mean)) / tf.to_float(self.gp.input_std)
        self.func_x = self.func_x_adv[:, :tf.shape(self.x)[0]]
        self.func_adv = self.func_x_adv[:, tf.shape(self.x)[0]:]
        self.y_pred = h[0, :tf.shape(self.x)[0]] * tf.to_float(
            self.gp.input_std) + tf.to_float(self.gp.input_mean)
        #TODO: mean

        self.injected_noise = 0.01
        noisy_func_x_adv = self.func_x_adv + self.injected_noise * tf.random_normal(
            tf.shape(h))
        tmp = tf.boolean_mask(tf.transpose(noisy_func_x_adv, [1, 2, 0]),
                              self.weights_x_adv > 0)
        self.noisy_func_x_adv = tf.transpose(tmp)

        # alpha = tf.nn.softplus(self.gp.noise) + 1e-6
        # self.obs_sigma = tf.constant(self.hparams.noise_sigma)
        self.obs_sigma = tf.nn.softplus(
            tf.get_variable('pre_noise_sigma', initializer=-3.))
        y_normed = (self.y - tf.to_float(self.gp.input_mean)) / tf.to_float(
            self.gp.input_std)
        log_likelihood = log_gaussian(y_normed,
                                      self.func_x,
                                      self.obs_sigma,
                                      reduce_sum=False)

        # Compute functional kl divergence
        x_adv_64, w_x_adv_64 = tf.cast(self.x_adv, tf.float64), tf.cast(
            self.weights_x_adv, tf.float64)
        eye = tf.eye(tf.shape(self.x_adv)[0], dtype=tf.float64)
        prior_cov = self.gp.cov(x_adv_64, x_adv_64) * self.gp.task_cov(w_x_adv_64, w_x_adv_64) \
          + self.injected_noise ** 2 * eye
        prior_cov_root = tf.to_float(tf.linalg.cholesky(prior_cov))

        estimator = SpectralScoreEstimator(n_eigen_threshold=0.99)
        entropy_sur = entropy_surrogate(estimator, self.noisy_func_x_adv)

        prior_dist = zs.distributions.MultivariateNormalCholesky(
            mean=tf.zeros([tf.shape(self.x_adv)[0]]), cov_tril=prior_cov_root)
        cross_entropy = tf.reduce_mean(
            prior_dist.log_prob(self.noisy_func_x_adv))
        kl = -entropy_sur - cross_entropy

        # Only take into account observed outcomes (bandits setting)
        batch_size = tf.to_float(tf.shape(self.x)[0])
        self.weighted_log_likelihood = tf.reduce_sum(
            log_likelihood * self.weights) / batch_size

        self.global_step = tf.train.get_or_create_global_step()
        kl_coeff = 1.  # tf.minimum(tf.to_float(self.global_step % 20000) / 15000., 1.) #TODO
        elbo = self.weighted_log_likelihood - kl / batch_size * kl_coeff

        self.loss = -elbo
        var_list = list(
            set(tf.trainable_variables()) -
            set(tf.trainable_variables(self.gp.name)))
        optimizer = tf.train.AdamOptimizer(self.hparams.initial_lr)
        gradients = optimizer.compute_gradients(self.loss, var_list=var_list)
        clipped_grads = [(tf.clip_by_value(grad, -self.hparams.max_grad_norm,
                                           self.hparams.max_grad_norm), var)
                         for grad, var in gradients]
        self.train_op = optimizer.apply_gradients(clipped_grads)
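
The optimizer setup at the end of Example #3 excludes the GP hyperparameter variables and clips each gradient element-wise before applying it. A minimal sketch of that pattern under TF 1.x, factored into a helper; the function name is illustrative, and a None-gradient filter is added, which the original omits because every variable in its var_list receives a gradient.

import tensorflow as tf

def build_clipped_train_op(loss, var_list, learning_rate, clip_value):
    # Adam with element-wise gradient clipping, as in Example #3.
    optimizer = tf.train.AdamOptimizer(learning_rate)
    grads_and_vars = optimizer.compute_gradients(loss, var_list=var_list)
    # Skip variables that do not influence the loss (their gradient is None).
    clipped = [(tf.clip_by_value(g, -clip_value, clip_value), v)
               for g, v in grads_and_vars if g is not None]
    return optimizer.apply_gradients(clipped)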