def _build_graph(self, **kwargs):
    """ We treat tfFunctionApproximator as the stochastic map of the policy
    (which takes ph_x as input and outputs ts_yh) and build the additional
    attributes/methods required by Policy. """
    # Build tf.Variables and add the attributes
    #   ph_x, ts_nor_x, ts_y, _yh, _sh_vars,
    #   ph_y, ts_pi, ts_logp, ts_pid, ts_pir, ts_pi_given_r
    tfFunctionApproximator._build_graph(self, **kwargs)
    # r_dim: dimension of the randomness used in generating actions.

    # Build additional graphs for Policy.
    # 1) conditional distributions
    self._pi = self._yh
    self._pi_given_r = U.function([self.ph_x, self.ph_r], self.ts_pi_given_r)
    self._pid = U.function([self.ph_x], self.ts_pid)  # derandomized actions
    # actions concatenated with the randomness used to generate them
    self._pir = U.function([self.ph_x], self.ts_pir)
    self._logp = U.function([self.ph_x, self.ph_y], self.ts_logp)
    self._logp_grad = U.function([self.ph_x, self.ph_y],
                                 tf.gradients(self.ts_logp, self.ts_vars))

    # 2) fvp operator (this depends only on self)
    ph_g, ts_grads = self._sh_vars.build_flat_ph()
    ts_kl = self.build_kl(self, self, p1_sg=True)
    ts_kl_grads = U.gradients(ts_kl, self.ts_vars)  # grad wrt the 2nd arg of KL
    ts_inner_prod = tf.add_n([tf.reduce_sum(kg * v)
                              for (kg, v) in zipsame(ts_kl_grads, ts_grads)])
    ts_fvp = U.gradients(ts_inner_prod, self.ts_vars)  # Fisher(-information matrix)-vector product
    ts_fvp = tf.concat([tf.reshape(f, [-1]) for f in ts_fvp], axis=-1)  # flatten into a single contiguous vector
    self._fvp = U.function([self.ph_x, ph_g], ts_fvp)

    # 3) nabla logp f
    ts_loss = tf.reduce_sum(self.ph_f * self.ts_logp)  # sum, not mean
    ts_grads = U.gradients(ts_loss, self.ts_vars)
    # fill None with zeros; otherwise tf.run would attempt to fetch None
    ts_grads = [g if g is not None else tf.zeros_like(v)
                for (v, g) in zipsame(self.ts_vars, ts_grads)]
    # need to flatten the per-variable gradients for the caller
    compute_ts_grad = U.function([self.ph_x, self.ph_y, self.ph_f], ts_grads)
    self.nabla_logp_f = lambda x, y, f: flatten(compute_ts_grad(x, y, f))
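# Note on the fvp operator above (a sketch of the underlying identity, not part of
# the original code): with p1_sg=True the first argument of build_kl is treated as a
# constant (stop_gradient), so ts_kl corresponds to KL(pi_theta_old || pi_theta')
# evaluated at theta' = theta. Taking the gradient of the inner product of its
# gradient with a fixed vector g gives the Fisher-vector product
#
#     F g = d/dtheta [ (d/dtheta' KL(pi_theta_old || pi_theta'))^T g ],
#
# which is what self._fvp(x, g) computes via double backprop, without ever forming
# the Fisher matrix F itself. Similarly, nabla_logp_f returns the flattened
# score-function gradient sum_i f_i * d/dtheta log pi_theta(y_i | x_i), i.e. the
# usual policy-gradient estimator when f holds per-sample (advantage) weights.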
def _build_graph(self, **kwargs):
    """ We treat tfFunctionApproximator as the stochastic map of the policy
    (which takes ph_x as input and outputs ts_yh) and build the additional
    attributes/methods required by Policy. """
    # Build tf.Variables and add the attributes
    #   ph_x, ts_nor_x, ts_y, _yh, _sh_vars,
    #   ph_y, ts_pi, ts_logp, ts_pid
    tfFunctionApproximator._build_graph(self, **kwargs)

    # Build additional graphs for Policy.
    # 1) conditional distribution
    self._pi = self._yh
    self._pid = U.function([self.ph_x], self.ts_pid)
    self._logp = U.function([self.ph_x, self.ph_y], self.ts_logp)

    # 2) fvp operator (this depends only on self)
    ph_g, ts_grads = self._sh_vars.build_flat_ph()
    ts_kl = self.build_kl(self, self, p1_sg=True)
    ts_kl_grads = U.gradients(ts_kl, self.ts_vars)  # grad wrt the 2nd arg of KL
    ts_inner_prod = tf.add_n([tf.reduce_sum(kg * v)
                              for (kg, v) in zipsame(ts_kl_grads, ts_grads)])
    ts_fvp = U.gradients(ts_inner_prod, self.ts_vars)  # Fisher(-information matrix)-vector product
    ts_fvp = tf.concat([tf.reshape(f, [-1]) for f in ts_fvp], axis=-1)  # flatten into a single contiguous vector
    self._fvp = U.function([self.ph_x, ph_g], ts_fvp)
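# Hypothetical usage sketch (not part of the original code): self._fvp provides the
# matrix-vector product F g needed by a truncated natural-gradient / TRPO-style
# update, so it can be plugged into a plain conjugate-gradient solver to compute an
# approximation of F^{-1} grad without materializing F. The sketch below only
# assumes `mvp` is a callable returning F v for a flat numpy vector v.
import numpy as np

def conjugate_gradient(mvp, b, iters=10, tol=1e-10, damping=1e-3):
    """Approximately solve (F + damping*I) x = b, given the matrix-vector product `mvp`."""
    x = np.zeros_like(b)
    r = b.copy()          # residual b - A x (x starts at 0)
    p = r.copy()          # search direction
    r_dot = r.dot(r)
    for _ in range(iters):
        Ap = mvp(p) + damping * p
        alpha = r_dot / (p.dot(Ap) + 1e-12)
        x += alpha * p
        r -= alpha * Ap
        r_dot_new = r.dot(r)
        if r_dot_new < tol:
            break
        p = r + (r_dot_new / r_dot) * p
        r_dot = r_dot_new
    return x

# e.g., step_direction = conjugate_gradient(lambda v: policy._fvp(batch_x, v), flat_grad)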
def _build_graph(self, **bg_kwargs):
    ts_loss, ph_args = self._build_loss_op(**bg_kwargs)
    # define compute_loss and compute_grad wrt the loss
    self._compute_loss = U.function(ph_args, ts_loss)
    ts_grads = U.gradients(ts_loss, self._ts_vars)
    # fill None with zeros; otherwise tf.run will attempt to fetch None
    ts_grads = [g if g is not None else tf.zeros_like(v)
                for (v, g) in zipsame(self._ts_vars, ts_grads)]
    self._compute_grad = U.function(ph_args, ts_grads)
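# A minimal illustration of why the None-filling above is needed (assuming TF1-style
# graph mode, as in the surrounding code): tf.gradients returns None for variables
# the loss does not depend on, and session.run cannot fetch None, so those entries
# are replaced with zero tensors of the matching shape.
#
#     w_used   = tf.Variable([1.0, 2.0])
#     w_unused = tf.Variable([3.0])
#     loss = tf.reduce_sum(w_used ** 2)
#     grads = tf.gradients(loss, [w_used, w_unused])   # -> [<grad tensor>, None]
#     grads = [g if g is not None else tf.zeros_like(v)
#              for v, g in zip([w_used, w_unused], grads)]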
def get_valcombs_and_keys(ranges):
    keys = []
    values = []
    for r in ranges:
        keys += r[::2]  # even entries are the parameter names
    # odd entries are value lists; zip them so keys within the same range vary together
    values = [list(zipsame(*r[1::2])) for r in ranges]
    cs = itertools.product(*values)  # Cartesian product across ranges
    combs = []
    for c in cs:
        comb = []
        for x in c:
            comb += x
        print(comb)
        combs.append(comb)
    return combs, keys
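# Hypothetical usage example (the values are illustrative, not from the original
# code): each entry of `ranges` alternates a key with its list of values; keys listed
# in the same entry vary together (they are zipped), while different entries are
# combined via a Cartesian product.
#
#     ranges = [['lr',   [1e-3, 1e-4],
#                'mom',  [0.9,  0.99]],   # lr and mom vary together
#               ['seed', [0, 1]]]
#     combs, keys = get_valcombs_and_keys(ranges)
#     # keys  == ['lr', 'mom', 'seed']
#     # combs == [[1e-3, 0.9, 0], [1e-3, 0.9, 1],
#     #           [1e-4, 0.99, 0], [1e-4, 0.99, 1]]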