def _build_graph(self, **kwargs):
    """Build the function-approximator graph, then the policy callables on top.

    The tfFunctionApproximator is treated as the stochastic map of the policy
    (input ``ph_x``, output ``ts_yh``). After the parent graph is built, this
    method wraps tensors found on ``self`` into callables:

    * ``_pi``           -- alias of ``_yh``.
    * ``_pi_given_r``   -- evaluates ``ts_pi_given_r`` from ``(ph_x, ph_r)``.
    * ``_pid``          -- evaluates ``ts_pid`` (derandomized actions).
    * ``_pir``          -- evaluates ``ts_pir`` (actions concatenated with the
                           randomness used to generate them).
    * ``_logp``         -- evaluates ``ts_logp`` from ``(ph_x, ph_y)``.
    * ``_logp_grad``    -- gradients of ``ts_logp`` w.r.t. ``ts_vars``.
    * ``_fvp``          -- Fisher-vector product from ``(ph_x, flat vector)``.
    * ``nabla_logp_f``  -- flattened gradient of ``sum(ph_f * ts_logp)``.

    Args:
        **kwargs: forwarded untouched to
            ``tfFunctionApproximator._build_graph``.
    """
    # The parent call is expected to create ph_x, ts_nor_x, ts_y, _yh,
    # _sh_vars, ph_y, ts_pi, ts_logp, ts_pid, ts_pir and ts_pi_given_r.
    tfFunctionApproximator._build_graph(self, **kwargs)

    # --- sampling / evaluation entry points -------------------------------
    self._pi = self._yh
    self._pi_given_r = U.function([self.ph_x, self.ph_r], self.ts_pi_given_r)
    self._pid = U.function([self.ph_x], self.ts_pid)
    self._pir = U.function([self.ph_x], self.ts_pir)
    self._logp = U.function([self.ph_x, self.ph_y], self.ts_logp)
    logp_grads = tf.gradients(self.ts_logp, self.ts_vars)
    self._logp_grad = U.function([self.ph_x, self.ph_y], logp_grads)

    # --- Fisher-vector product (depends only on self) ----------------------
    # build_flat_ph yields a flat placeholder plus its per-variable pieces.
    ph_vec, vec_pieces = self._sh_vars.build_flat_ph()
    kl = self.build_kl(self, self, p1_sg=True)
    kl_grads = U.gradients(kl, self.ts_vars)  # grad to the 2nd arg of KL
    pairwise_dots = [
        tf.reduce_sum(kl_g * piece)
        for kl_g, piece in zipsame(kl_grads, vec_pieces)
    ]
    grad_dot_vec = tf.add_n(pairwise_dots)
    # Differentiating <grad KL, v> once more gives the Fisher (information
    # matrix) times v, one piece per variable.
    fvp_pieces = U.gradients(grad_dot_vec, self.ts_vars)
    # Concatenate the pieces into one contiguous vector.
    fvp_flat = tf.concat([tf.reshape(p, [-1]) for p in fvp_pieces], axis=-1)
    self._fvp = U.function([self.ph_x, ph_vec], fvp_flat)

    # --- nabla logp * f -----------------------------------------------------
    # Note this is a sum over instances, not a mean.
    weighted_logp = tf.reduce_sum(self.ph_f * self.ts_logp)
    raw_grads = U.gradients(weighted_logp, self.ts_vars)
    # A variable without a gradient gets zeros of its own shape so the
    # flattened result always covers every variable.
    dense_grads = [
        tf.zeros_like(var) if grad is None else grad
        for var, grad in zipsame(self.ts_vars, raw_grads)
    ]
    weighted_grad_fn = U.function(
        [self.ph_x, self.ph_y, self.ph_f], dense_grads)

    def nabla_logp_f(x, y, f):
        # The compiled function returns per-variable gradients; flatten them.
        return flatten(weighted_grad_fn(x, y, f))

    self.nabla_logp_f = nabla_logp_f
def flatten(self, vs):
    """Return ``vs`` flattened by the ``flatten`` helper from the outer scope.

    Inside a class body the name ``flatten`` in this method resolves to the
    enclosing/module-level helper, not to this method itself.
    """
    flat = flatten(vs)
    return flat
def logp_grad(self, x, y):
    """Return the gradient of log p(y | x) for ONE instance as a flat array.

    Only a single instance is supported because the underlying graph
    differentiates ``ts_logp`` as a whole and cannot produce a per-instance
    Jacobian. A leading batch axis of size one is added to both inputs
    before they are fed to the compiled gradient function.

    Args:
        x: a single input, without a batch axis.
        y: a single output/action, without a batch axis; must be rank 1.

    Returns:
        The per-variable gradients flattened by ``flatten``.

    Raises:
        ValueError: if ``y`` is not rank 1, i.e. it looks like a batch (or a
            scalar) rather than one instance.
    """
    # The original note asked for an error when more than one instance is
    # passed; reject it explicitly instead of relying on a downstream
    # feed/shape failure.
    # NOTE(review): only y is validated, since the rank of a single x is not
    # visible from here -- confirm and extend if x is always rank 1 too.
    y_rank = np.ndim(y)
    if y_rank != 1:
        raise ValueError(
            'logp_grad only supports a single instance; expected y with '
            '1 dimension but got {}'.format(y_rank))
    return flatten(self._logp_grad(x[None], y[None]))
def _build_dist(self, ts_nor_x, ph_y):
    """Build the Gaussian action distribution and its helper callables.

    The mean comes from the function approximator applied to ``ts_nor_x``;
    the log standard deviation is a separate variable of shape
    ``[self.y_dim]``, bounded below by ``log(self._min_std)``.

    Besides the returned tensors, this sets on ``self``: ``ts_mean``,
    ``_ts_logstd``, ``_ts_stop_std_grad``, ``ts_logstd``, ``_std``,
    ``_set_logstd``, ``_set_stop_std_grad``, ``_quad_exp``,
    ``_nabla_quad_exp``, ``_quad_exp_diag`` and ``_nabla_quad_exp_diag``.

    Args:
        ts_nor_x: input tensor passed to the mean network.
        ph_y: placeholder for the outputs whose log-probability is evaluated.

    Returns:
        Tuple ``(ts_pi, ts_logp, ts_pid, ts_pir, ts_pi_given_r)``:
        sampled action, log-probability of ``ph_y``, mean action, sampled
        action concatenated with its unit noise, and the action produced
        from the externally supplied noise ``self.ph_r``.
    """
    # mean and std
    # NOTE(review): `cls` is not a parameter of this method; presumably it is
    # bound by an enclosing scope that is not visible here -- confirm.
    self.ts_mean = cls._build_func_apprx(
        self, ts_nor_x)  # use the tfFunctionApproximator to define mean
    self._ts_logstd = tf.get_variable(
        'logstd',
        shape=[self.y_dim],
        initializer=tf.constant_initializer(np.log(self._init_std)))
    # Non-trainable boolean switch; it can be changed at run time through
    # self._set_stop_std_grad built below.
    self._ts_stop_std_grad = tf.get_variable(
        'stop_std_grad', initializer=tf.constant(False), trainable=False)
    _ts_logstd = tf.cond(
        self._ts_stop_std_grad,  # whether to stop gradient
        true_fn=lambda: tf.stop_gradient(self._ts_logstd),
        false_fn=lambda: self._ts_logstd)
    # make sure the distribution does not degenerate: clip logstd from below
    self.ts_logstd = tf.maximum(tf.to_float(np.log(self._min_std)), _ts_logstd)
    ts_std = tf.exp(self.ts_logstd)
    self._std = U.function([], ts_std)
    self._set_logstd = U.build_set([self._ts_logstd])
    self._set_stop_std_grad = U.build_set([self._ts_stop_std_grad])
    # pi
    # Earlier variant, kept for reference:
    # self.ts_noise = tf.random_normal(tf.shape(ts_std), stddev=ts_std, seed=self.seed)
    # Unit-variance noise with the same shape as the mean, scaled by std.
    rand = tf.random_normal(tf.shape(self.ts_mean), seed=self.seed)
    ts_noise = ts_std * rand
    ts_pi = self.ts_mean + ts_noise
    ts_pid = self.ts_mean  # derandomized action is just the mean
    # Earlier variant that had to broadcast the noise to each row:
    # n = tf.shape(self.ts_mean)[0]
    # noise = tf.reshape(tf.tile(self.ts_noise, [n]), [n, -1])
    # ts_pir = tf.concat([ts_pi, noise], 1)
    # Action and the unit noise that generated it, side by side along axis 1.
    ts_pir = tf.concat([ts_pi, rand], axis=1)
    # Same reparameterization as ts_pi but with the noise fed through ph_r.
    ts_pi_given_r = self.ts_mean + ts_std * self.ph_r
    # logp
    ts_logp = self._build_logp(self.y_dim, ph_y, self.ts_mean, self.ts_logstd)
    # XXX Full matrix
    # expectation with a quadratic function 0.5 y^t A y + b^t y + c
    # computed as 0.5 m^t A m + b^t m + c + 0.5 * sum_i A_ii * var_i
    ph_A = tf.placeholder(tf_float, shape=(None, self.y_dim, self.y_dim), name='A')
    ph_b = tf.placeholder(tf_float, shape=(None, self.y_dim, 1), name='b')
    ph_c = tf.placeholder(tf_float, shape=(None, ), name='c')
    ph_w = tf.placeholder(tf_float, shape=(None, ), name='w')
    ts_mean = tf.expand_dims(self.ts_mean, -1)  # None * dim_y * 1
    ts_var = tf.reshape(ts_std**2, (1, self.y_dim, 1))  # 1 * dim_y * 1, broadcast over rows
    # NOTE(review): tf.squeeze is called without an axis, so a leading
    # dimension of size 1 is squeezed as well; adding ph_c (shape (None,))
    # appears to restore it via broadcasting -- confirm.
    ts_quad_exp = tf.squeeze(0.5*tf.matmul(ts_mean, tf.matmul(ph_A, ts_mean), adjoint_a=True)) \
        + tf.squeeze(tf.matmul(ph_b, ts_mean,adjoint_a=True)) \
        + ph_c + 0.5*tf.trace(ph_A*ts_var)  # one value per row
    self._quad_exp = U.function([self.ph_x, ph_A, ph_b, ph_c], ts_quad_exp)
    # Weighting by ph_w before differentiating gives one gradient per
    # variable rather than one per row.
    ts_quad_exp_grads = U.gradients(ts_quad_exp * ph_w, self.ts_vars)  # just a vector
    compute_ts_quad_exp_grad = U.function(
        [self.ph_x, ph_A, ph_b, ph_c, ph_w], ts_quad_exp_grads)
    self._nabla_quad_exp = lambda x, A, b, c, w: flatten(
        compute_ts_quad_exp_grad(x, A, b, c, w))
    # XXX Diagonal
    # Same expectation when A is diagonal and supplied as its diagonal
    # entries with shape (None, dim_y, 1).
    ph_A_diag = tf.placeholder(tf_float, shape=(None, self.y_dim, 1), name='A_diag')
    ts_quad_exp_diag = tf.squeeze(0.5*tf.matmul(ts_mean, ph_A_diag*ts_mean, adjoint_a=True)) \
        + tf.squeeze(tf.matmul(ph_b, ts_mean,adjoint_a=True)) \
        + ph_c + 0.5*tf.squeeze(tf.reduce_sum(ph_A_diag*ts_var, axis=1))  # one value per row
    self._quad_exp_diag = U.function(
        [self.ph_x, ph_A_diag, ph_b, ph_c], ts_quad_exp_diag)
    ts_quad_exp_grads_diag = U.gradients(ts_quad_exp_diag * ph_w, self.ts_vars)  # just a vector
    compute_ts_quad_exp_grad_diag = U.function(
        [self.ph_x, ph_A_diag, ph_b, ph_c, ph_w], ts_quad_exp_grads_diag)
    self._nabla_quad_exp_diag = lambda x, A, b, c, w: flatten(
        compute_ts_quad_exp_grad_diag(x, A, b, c, w))
    return ts_pi, ts_logp, ts_pid, ts_pir, ts_pi_given_r
def compute_grad(self):
    """Evaluate the gradient at the stored arguments and return it flattened.

    Returns:
        The result of ``self._compute_grad(*self._args)`` passed through
        ``flatten``.

    Raises:
        ValueError: if ``self._args`` is ``None``, i.e. no arguments have
            been stored yet.
    """
    stored_args = self._args
    if stored_args is None:
        raise ValueError('Oracle has not been initialized')
    return flatten(self._compute_grad(*stored_args))