Ejemplo n.º 1
0
    def _build_graph(self, **kwargs):
        """Build the policy graph on top of the function-approximator graph.

        We treat tfFunctionApproximator as the stochastic map of the policy
        (which inputs ph_x and outputs ts_yh) and build additional
        attributes/methods required by Policy.

        Args:
            **kwargs: forwarded unchanged to
                tfFunctionApproximator._build_graph.

        Callables created here (each wraps graph tensors via U.function):
            _pi_given_r, _pid, _pir, _logp, _logp_grad, _fvp, nabla_logp_f.
        In addition, _pi is set as an alias of _yh.
        """
        # build tf.Variables
        # add attributes ph_x, ts_nor_x, ts_y, _yh, _sh_vars,
        #                ph_y, ts_pi, ts_logp, ts_pid, ts_pir, ts_pi_given_r
        tfFunctionApproximator._build_graph(self, **kwargs)
        # r_dim: dimension of randomness in generating actions.
        # build additional graphs for Policy
        # build conditional distribution
        # Sampling from the policy reuses the approximator's prediction
        # callable.
        self._pi = self._yh
        # Actions computed from an externally supplied randomness ph_r.
        self._pi_given_r = U.function([self.ph_x, self.ph_r],
                                      self.ts_pi_given_r)
        self._pid = U.function([self.ph_x],
                               self.ts_pid)  # derandomized actions
        # actions and the randomness used in generating actions concatenated.
        self._pir = U.function([self.ph_x], self.ts_pir)
        self._logp = U.function([self.ph_x, self.ph_y], self.ts_logp)
        # Gradient of ts_logp w.r.t. every variable in ts_vars; note that
        # this one uses tf.gradients directly, unlike the U.gradients calls
        # below.
        self._logp_grad = U.function([self.ph_x, self.ph_y],
                                     tf.gradients(self.ts_logp, self.ts_vars))
        # build fvp operator (this depends only on self)
        # ph_g is the input fed to the fvp callable; ts_grads is iterated in
        # lockstep with the per-variable KL gradients below.
        # NOTE(review): presumably ph_g is one flat vector and ts_grads are
        # its per-variable slices -- confirm in _sh_vars.build_flat_ph.
        ph_g, ts_grads = self._sh_vars.build_flat_ph()
        # KL between the policy and itself.
        # NOTE(review): p1_sg=True presumably stops the gradient through the
        # first argument -- confirm in build_kl.
        ts_kl = self.build_kl(self, self, p1_sg=True)
        ts_kl_grads = U.gradients(ts_kl,
                                  self.ts_vars)  # grad to the 2nd arg of KL
        # Scalar <grad KL, v>; differentiating it once more yields the
        # Hessian-of-KL times v product.
        ts_inner_prod = tf.add_n([
            tf.reduce_sum(kg * v)
            for (kg, v) in zipsame(ts_kl_grads, ts_grads)
        ])
        ts_fvp = U.gradients(
            ts_inner_prod,
            self.ts_vars)  # Fisher (information matrix) and Vector Product
        ts_fvp = tf.concat([tf.reshape(f, [-1]) for f in ts_fvp],
                           axis=-1)  # flattened into a single 1-D vector
        self._fvp = U.function([self.ph_x, ph_g], ts_fvp)

        # build nabla logp f.
        # The loss is a sum (not a mean) of ph_f * logp over all entries.
        ts_loss = tf.reduce_sum(self.ph_f * self.ts_logp)  # sum!!
        # ts_grads is rebound here; the fvp graph above already captured the
        # previous value.
        ts_grads = U.gradients(ts_loss, self.ts_vars)
        # Variables that receive no gradient get zeros of matching shape so
        # that the output stays aligned with ts_vars.
        ts_grads = [
            g if g is not None else tf.zeros_like(v)
            for (v, g) in zipsame(self.ts_vars, ts_grads)
        ]
        # need to flatten
        compute_ts_grad = U.function([self.ph_x, self.ph_y, self.ph_f],
                                     ts_grads)
        # Public callable: evaluates the per-variable gradients and passes
        # them through the module-level flatten helper.
        self.nabla_logp_f = lambda x, y, f: flatten(compute_ts_grad(x, y, f))
Ejemplo n.º 2
0
 def flatten(self, vs):
     """Delegate to the module-level ``flatten`` helper and return its result.

     Args:
         vs: the value(s) to flatten; passed through unchanged.
     """
     flattened = flatten(vs)
     return flattened
Ejemplo n.º 3
0
 def logp_grad(self, x, y):
     """Evaluate ``self._logp_grad`` on one (x, y) pair and flatten the result.

     XXX Only a single instance is supported, because tf cannot compute a
     jacobian. An error should be raised when more than one instance is
     provided, but no such check is performed here.

     Args:
         x: a single input; a leading axis is added via ``x[None]``.
         y: a single output; a leading axis is added via ``y[None]``.

     Returns:
         The output of ``self._logp_grad`` passed through ``flatten``
         (a flat np array).
     """
     grad_fn = self._logp_grad
     # Prepend a batch axis of size one to each argument.
     x_batch = x[None]
     y_batch = y[None]
     per_var_grads = grad_fn(x_batch, y_batch)
     return flatten(per_var_grads)
Ejemplo n.º 4
0
        def _build_dist(self, ts_nor_x, ph_y):
            """Build a Gaussian action distribution and its helper callables.

            The mean comes from the function approximator applied to
            ts_nor_x; the log-std is a separate variable of shape [y_dim]
            that does not depend on the input.

            Args:
                ts_nor_x: tensor fed to the mean function approximator
                    (presumably the normalized input -- TODO confirm with
                    the caller).
                ph_y: tensor of outputs passed to self._build_logp for
                    evaluating the log-probability.

            Returns:
                Tuple (ts_pi, ts_logp, ts_pid, ts_pir, ts_pi_given_r):
                sampled output, log-probability, mean (derandomized) output,
                sampled output concatenated with the unit noise that
                generated it, and output computed from the external
                randomness self.ph_r.

            Also sets on self: ts_mean, _ts_logstd, _ts_stop_std_grad,
            ts_logstd, _std, _set_logstd, _set_stop_std_grad, _quad_exp,
            _nabla_quad_exp, _quad_exp_diag, _nabla_quad_exp_diag.

            Note: cls is not a parameter; it is taken from the enclosing
            scope, which is not visible in this block.
            """
            # mean and std
            self.ts_mean = cls._build_func_apprx(
                self,
                ts_nor_x)  # use the tfFunctionApproximator to define mean
            # Log-std variable, initialized to log(self._init_std).
            self._ts_logstd = tf.get_variable(
                'logstd',
                shape=[self.y_dim],
                initializer=tf.constant_initializer(np.log(self._init_std)))
            # Non-trainable boolean switch stored in the graph; when True,
            # gradients through the log-std are blocked (see tf.cond below).
            self._ts_stop_std_grad = tf.get_variable(
                'stop_std_grad',
                initializer=tf.constant(False),
                trainable=False)
            _ts_logstd = tf.cond(
                self._ts_stop_std_grad,  # whether to stop gradient
                true_fn=lambda: tf.stop_gradient(self._ts_logstd),
                false_fn=lambda: self._ts_logstd)
            # make sure the distribution does not degenerate
            # (the log-std is clamped from below at log(self._min_std))
            self.ts_logstd = tf.maximum(tf.to_float(np.log(self._min_std)),
                                        _ts_logstd)
            ts_std = tf.exp(self.ts_logstd)
            # Callable with no inputs returning the current (clamped) std.
            self._std = U.function([], ts_std)
            # Setters for the raw log-std variable and the stop-grad switch.
            self._set_logstd = U.build_set([self._ts_logstd])
            self._set_stop_std_grad = U.build_set([self._ts_stop_std_grad])

            # pi
            # self.ts_noise = tf.random_normal(tf.shape(ts_std), stddev=ts_std, seed=self.seed)
            # Unit normal noise with the same shape as the mean, scaled
            # elementwise by the std.
            rand = tf.random_normal(tf.shape(self.ts_mean), seed=self.seed)
            ts_noise = ts_std * rand
            ts_pi = self.ts_mean + ts_noise
            ts_pid = self.ts_mean
            # Need to broadcast noise to each row.
            # n = tf.shape(self.ts_mean)[0]
            # noise = tf.reshape(tf.tile(self.ts_noise, [n]), [n, -1])
            # ts_pir = tf.concat([ts_pi, noise], 1)
            # Sample and the *unscaled* noise that produced it, side by side
            # along axis 1.
            ts_pir = tf.concat([ts_pi, rand], axis=1)
            # Same reparameterization as ts_pi, but with the randomness
            # supplied through self.ph_r instead of sampled in-graph.
            ts_pi_given_r = self.ts_mean + ts_std * self.ph_r
            # logp
            ts_logp = self._build_logp(self.y_dim, ph_y, self.ts_mean,
                                       self.ts_logstd)

            # XXX Full matrix
            # expectation with a quadratic function 0.5 y^t A y + b^t y + c
            ph_A = tf.placeholder(tf_float,
                                  shape=(None, self.y_dim, self.y_dim),
                                  name='A')
            ph_b = tf.placeholder(tf_float,
                                  shape=(None, self.y_dim, 1),
                                  name='b')
            ph_c = tf.placeholder(tf_float, shape=(None, ), name='c')
            # Per-sample weights applied to the expectation before taking
            # gradients.
            ph_w = tf.placeholder(tf_float, shape=(None, ), name='w')

            ts_mean = tf.expand_dims(self.ts_mean, -1)  # None * dim_y * 1
            ts_var = tf.reshape(ts_std**2,
                                (1, self.y_dim, 1))  # 1 * dim_y * 1, broadcast over the batch

            # 0.5 m^t A m + b^t m + c + 0.5 * sum_i A_ii var_i; ph_A * ts_var
            # scales row i of A by var_i, so the trace picks A_ii * var_i.
            ts_quad_exp = tf.squeeze(0.5*tf.matmul(ts_mean, tf.matmul(ph_A, ts_mean), adjoint_a=True)) \
                         + tf.squeeze(tf.matmul(ph_b, ts_mean,adjoint_a=True)) \
                         + ph_c + 0.5*tf.trace(ph_A*ts_var)  # None
            self._quad_exp = U.function([self.ph_x, ph_A, ph_b, ph_c],
                                        ts_quad_exp)
            # NOTE(review): the differentiated tensor is not reduced to a
            # scalar here; presumably U.gradients sums over its entries as
            # tf.gradients does -- confirm.
            ts_quad_exp_grads = U.gradients(ts_quad_exp * ph_w,
                                            self.ts_vars)  # just a vector
            compute_ts_quad_exp_grad = U.function(
                [self.ph_x, ph_A, ph_b, ph_c, ph_w], ts_quad_exp_grads)
            self._nabla_quad_exp = lambda x, A, b, c, w: flatten(
                compute_ts_quad_exp_grad(x, A, b, c, w))

            # XXX Diagonal
            # Same expectation, with A given only by its diagonal entries
            # stored as a column of shape (None, y_dim, 1).
            ph_A_diag = tf.placeholder(tf_float,
                                       shape=(None, self.y_dim, 1),
                                       name='A_diag')
            ts_quad_exp_diag = tf.squeeze(0.5*tf.matmul(ts_mean, ph_A_diag*ts_mean, adjoint_a=True)) \
                               + tf.squeeze(tf.matmul(ph_b, ts_mean,adjoint_a=True)) \
                               + ph_c + 0.5*tf.squeeze(tf.reduce_sum(ph_A_diag*ts_var, axis=1))  # None
            self._quad_exp_diag = U.function(
                [self.ph_x, ph_A_diag, ph_b, ph_c], ts_quad_exp_diag)
            ts_quad_exp_grads_diag = U.gradients(ts_quad_exp_diag * ph_w,
                                                 self.ts_vars)  # just a vector
            compute_ts_quad_exp_grad_diag = U.function(
                [self.ph_x, ph_A_diag, ph_b, ph_c, ph_w],
                ts_quad_exp_grads_diag)
            self._nabla_quad_exp_diag = lambda x, A, b, c, w: flatten(
                compute_ts_quad_exp_grad_diag(x, A, b, c, w))

            return ts_pi, ts_logp, ts_pid, ts_pir, ts_pi_given_r
Ejemplo n.º 5
0
 def compute_grad(self):
     """Return the flattened result of ``self._compute_grad(*self._args)``.

     Raises:
         ValueError: if ``self._args`` is None, i.e. no arguments have been
             stored on the oracle yet.
     """
     stored_args = self._args
     if stored_args is None:
         raise ValueError('Oracle has not been initialized')
     return flatten(self._compute_grad(*stored_args))