Example #1
    def advs(self,
             ro,
             lambd=None,
             use_is=None,
             ref_policy=None):  # advantage function
        """ Compute adv (evaluated at ro) wrt to ref_policy.

            ro: a list of Rollout isinstances

            Note `ref_policy` argument is only considered when `self.use_is`
            is True; in this case, if `ref_policy` is None, it is wrt to
            `self.ref_policy`. Otherwise, when `self.use_is`_is is False, the
            adv is biased toward the behavior policy that collected the data.
        """
        use_is = use_is or self.use_is
        vfns = self.vfns(ro)
        if use_is == 'multi':
            ws = self.weights(ro, ref_policy)  # importance weight
            advs = [
                self._pe.adv(rollout.rws, vf, rollout.done, w=w, lambd=lambd)
                for rollout, vf, w in zipsame(ro, vfns, ws)
            ]
        else:
            advs = [
                self._spe.adv(rollout.rws,
                              vf,
                              rollout.done,
                              w=1.0,
                              lambd=lambd)
                for rollout, vf in zipsame(ro, vfns)
            ]
        return advs, vfns
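
Every example on this page uses the `zipsame` helper. Its definition is not shown here; the sketch below is an assumption based on the common baselines-style utility (a zip that raises if the inputs have different lengths), not the project's actual code.

def zipsame(*seqs):
    # Like zip(), but fail loudly instead of silently truncating:
    # every input sequence must have the same length.
    length = len(seqs[0])
    assert all(len(seq) == length for seq in seqs), 'zipsame: sequences have different lengths'
    return zip(*seqs)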
Example #2
 def _build_graph(self, **bg_kwargs):
     ts_loss, ph_args = self._build_loss_op(**bg_kwargs)
     # define compute_loss and compute_grad wrt loss
     self._compute_loss = U.function(ph_args, ts_loss)
     ts_grads = U.gradients(ts_loss, self._ts_vars)
     # fill None gradients with zeros; otherwise tf.run would attempt to fetch None.
     ts_grads = [g if g is not None else tf.zeros_like(v) for (v, g) in
                 zipsame(self._ts_vars, ts_grads)]
     self._compute_grad = U.function(ph_args, ts_grads)
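
A hedged usage sketch (not from the source): assuming U.function follows the baselines-style convention of returning a callable that feeds the listed placeholders and fetches the requested tensors, the two attributes built above would be called with one array per placeholder in ph_args. The names agent and placeholder_values below are illustrative.

    loss_value = agent._compute_loss(*placeholder_values)   # scalar loss
    grad_values = agent._compute_grad(*placeholder_values)  # one array per variable in self._ts_vars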
Example #3
 def ts_fvp0(self, ts_xs, ts_ys, ts_gs):
     """ Computes F(self.pi)*g based on the expected outer product. """
     # per-sample weights of ones; differentiating ts_pd w.r.t. these below
     # recovers the per-sample inner products grad(log p_i)^T g
     dummy = tf.ones(shape=(len(ts_xs),))
     with tf.GradientTape(watch_accessed_variables=False) as gt:
         gt.watch(dummy)
         ts_sum_logp_grads = self.ts_logp_grad(ts_xs, ts_ys, dummy)
         ts_pd = tf.math.accumulate_n([tf.reduce_sum(u*v) for (u, v) in zipsame(ts_sum_logp_grads, ts_gs)])
     ts_fs = gt.gradient(ts_pd, dummy)  # shape (N,)
     N = tf.constant(len(dummy), dtype=tf_float)
     return self.ts_logp_grad(ts_xs, ts_ys, ts_fs/N)
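
For reference (standard background, not stated in the source): the example above estimates the Fisher matrix by the empirical outer product of score functions,

    F(\theta) \approx \frac{1}{N} \sum_{i=1}^{N} \nabla_\theta \log \pi_\theta(y_i \mid x_i) \, \nabla_\theta \log \pi_\theta(y_i \mid x_i)^\top,

so the product with a vector g can be formed without ever materializing F:

    F(\theta) \, g \approx \frac{1}{N} \sum_{i=1}^{N} \big( \nabla_\theta \log \pi_\theta(y_i \mid x_i)^\top g \big) \, \nabla_\theta \log \pi_\theta(y_i \mid x_i).

The dummy vector of ones appears to implement exactly this: differentiating ts_pd with respect to dummy yields the per-sample scalars grad(log p_i)^T g (ts_fs), which are then passed back to ts_logp_grad as sample weights and averaged over N.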
Example #4
 def ts_fvp(self, ts_xs, ts_gs):
     """ Computes F(self.pi)*g based on the Hessian of the entropy. """
     with tf.GradientTape(watch_accessed_variables=False) as gt:
         gt.watch(self.ts_variables)
         with tf.GradientTape(watch_accessed_variables=False) as gt2:
             gt2.watch(self.ts_variables)  #  TODO add sample weight below??
             ts_kl = self.ts_kl(self, ts_xs, p1_sg=True)
         ts_kl_grads = gt2.gradient(ts_kl, self.ts_variables)
         ts_pd = tf.math.accumulate_n([tf.reduce_sum(kg*v) for (kg, v) in zipsame(ts_kl_grads, ts_gs)])
     ts_fvp = gt.gradient(ts_pd, self.ts_variables)
     return ts_fvp
Example #5
 def ts_fvp(self, ts_xs, ts_gs):
     """ Computes F(self.pi)*g, where F is the Fisher information matrix and
     g is a np.ndarray in the same shape as self.variable """
     with tf.GradientTape() as gt:
         gt.watch(self.ts_variables)
         with tf.GradientTape() as gt2:
             gt2.watch(self.ts_variables)  #  TODO add sample weight below??
             ts_kl = self.ts_kl(self, ts_xs, p1_sg=True)
         ts_kl_grads = gt2.gradient(ts_kl, self.ts_variables)
         ts_pd = tf.add_n([
             tf.reduce_sum(kg * v)
             for (kg, v) in zipsame(ts_kl_grads, ts_gs)
         ])
     ts_fvp = gt.gradient(ts_pd, self.ts_variables)
     return ts_fvp
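
Both this variant and the previous one rely on a standard identity (background, not stated in the source): at the current parameters, the Hessian of the KL divergence equals the Fisher information matrix, so a Fisher-vector product can be obtained by double backpropagation,

    F(\theta_0) \, v \;=\; \nabla_\theta \Big[ \nabla_\theta \, \mathrm{KL}\big(\pi_{\theta_0} \,\|\, \pi_\theta\big)^\top v \Big] \Big|_{\theta = \theta_0}.

The inner tape produces the KL gradient, the inner product with ts_gs forms the bracketed scalar, and the outer tape differentiates it once more; p1_sg=True presumably stops gradients through the first argument of the KL so that it acts as the fixed reference \pi_{\theta_0}.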
Example #6
def get_combs_and_keys(ranges):
    # Each element of `ranges` is a flat list that alternates key and list of
    # values; value lists within the same element co-vary (they are zipped),
    # while different elements are fully crossed (Cartesian product).
    keys = []
    for r in ranges:
        keys += r[::2]
    values = [list(zipsame(*r[1::2])) for r in ranges]
    combs = []
    for c in itertools.product(*values):
        comb = []
        for x in c:
            comb += x  # x is a tuple of co-varying values
        combs.append(comb)
    return combs, keys
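
An illustrative input/output pair (hypothetical hyperparameter names, traced from the code above):

ranges = [
    ['lr',   [1e-2, 1e-3], 'batch_size', [32, 64]],   # these two lists co-vary
    ['seed', [0, 1, 2]],                               # crossed with the pair above
]
combs, keys = get_combs_and_keys(ranges)
# keys  -> ['lr', 'batch_size', 'seed']
# combs -> [[0.01, 32, 0], [0.01, 32, 1], [0.01, 32, 2],
#           [0.001, 64, 0], [0.001, 64, 1], [0.001, 64, 2]]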
Example #7
    def split(self, ro, policy_as_expert):
        # Split ro into two phases
        rollouts = ro.to_list()
        ro_mix = [rollouts[i] for i in self._ind_ro_mix]
        ro_pol = [rollouts[i] for i in self._ind_ro_pol]
        assert (len(ro_mix) + len(ro_pol)) == len(rollouts)
        ro_exps = [[] for _ in range(len(self.experts))]
        for r, t, s, k in zipsame(ro_mix, self._t_switch, self._scale,
                                  self._k_star):
            assert len(r) >= t  # because t >= 1
            if not policy_as_expert or k < len(self.experts) - 1:
                # we assume the last expert is the learner
                r = r[t:]
            r.weight = 1.0
            ro_exps[k].append(r)
        if policy_as_expert:
            ro_pol += ro_exps[-1]
            del ro_exps[-1]
        ro_exps = [Dataset(ro_exp) for ro_exp in ro_exps]
        ro_pol = Dataset(ro_pol)

        return ro_exps, ro_pol
Example #8
    def _build_graph(self, **kwargs):
        """ We treat tfFunctionApproximator as the stochastic map of the policy
        (which inputs ph_x and outputs ts_yh) and build additional
        attributes/methods required by Policy """
        # build tf.Variables
        # add attributes ph_x, ts_nor_x, ts_y, _yh, _sh_vars,
        #                ph_y, ts_pi, ts_logp, ts_pid
        tfFunctionApproximator._build_graph(self, **kwargs)

        # build additional graphs for Policy
        # build conditional distribution
        self._pi = self._yh
        self._pid = U.function([self.ph_x], self.ts_pid)
        self._logp = U.function([self.ph_x, self.ph_y], self.ts_logp)
        # build fvp operator (this depends only on self)
        ph_g, ts_grads = self._sh_vars.build_flat_ph()
        ts_kl = self.build_kl(self, self, p1_sg=True)
        ts_kl_grads = U.gradients(ts_kl, self.ts_vars)  # gradient w.r.t. the 2nd argument of the KL
        ts_inner_prod = tf.add_n([tf.reduce_sum(kg * v) for (kg, v) in zipsame(ts_kl_grads, ts_grads)])
        ts_fvp = U.gradients(ts_inner_prod, self.ts_vars)  # Fisher-(information-matrix-)vector product
        ts_fvp = tf.concat([tf.reshape(f, [-1]) for f in ts_fvp], axis=-1)  # flatten into one contiguous vector
        self._fvp = U.function([self.ph_x, ph_g], ts_fvp)
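
A Fisher-vector product callable like self._fvp is typically consumed by a conjugate-gradient solver to compute natural-gradient directions F^{-1} g without ever forming F explicitly. Below is a minimal, self-contained sketch of that standard solver; the way it would be wired to self._fvp (shown in the docstring) is an assumption for illustration, not something given in the source.

import numpy as np

def conjugate_gradient(fvp, g, iters=10, tol=1e-10):
    """ Approximately solve F x = g using only matrix-vector products fvp(v).

        With the policy above, something like
            fvp = lambda v: self._fvp(obs_batch, v)
        would play this role (illustrative; not taken from the source).
    """
    x = np.zeros_like(g)
    r = g.copy()                 # residual g - F x (x starts at zero)
    p = r.copy()                 # search direction
    r_dot_r = r.dot(r)
    for _ in range(iters):
        fvp_p = fvp(p)
        alpha = r_dot_r / (p.dot(fvp_p) + 1e-12)
        x += alpha * p
        r -= alpha * fvp_p
        new_r_dot_r = r.dot(r)
        if new_r_dot_r < tol:
            break
        p = r + (new_r_dot_r / r_dot_r) * p
        r_dot_r = new_r_dot_r
    return x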
Example #9
 def mean_variable(self, val):
     vals = unflatten(val, shapes=self.mean_var_shapes)
     for var, v in zipsame(self.ts_mean_variables, vals):
         var.assign(v)

 def variables(self, vals):  # vals can be a list of np.ndarray or tf.Tensor
     for var, val in zipsame(self.ts_variables, vals):
         var.assign(val)
Example #11
def var_assign(ts_var, x):
    # assign value(s) x (np.ndarray or tf.Tensor) to tf.Variable(s) ts_var
    if isinstance(ts_var, list):
        return [vv.assign(xx) for vv, xx in zipsame(ts_var, x)]
    else:
        return ts_var.assign(x)
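
A small illustrative usage (hypothetical variables, assuming TensorFlow 2.x eager-mode tf.Variable):

import numpy as np
import tensorflow as tf

v1 = tf.Variable(np.zeros(3))
v2 = tf.Variable(np.zeros((2, 2)))
var_assign(v1, np.arange(3.0))                   # single variable
var_assign([v1, v2], [np.ones(3), np.eye(2)])    # list of variables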