Esempio n. 1
0
    def _decide_action(self, s, g):
        """Query the actor for an action and its log-probability.

        Stores on ``self._exp_action`` whether this step is exploratory: the
        value is that of ``self._enable_stoch_policy()`` when it is falsy,
        otherwise the result of ``MathUtil.flip_coin`` called with
        ``self.exp_params_curr.rate``. The flag is then forwarded to
        ``self._eval_actor`` as its third argument.

        Args:
            s: State input, passed unchanged to ``_eval_actor``.
            g: Goal input, passed unchanged to ``_eval_actor``.

        Returns:
            A tuple ``(a, logp)`` holding the first element of each of the two
            values returned by ``_eval_actor`` (presumably the single entry of
            a batch of size one -- TODO confirm against ``_eval_actor``).
        """
        with self.sess.as_default(), self.graph.as_default():
            # The coin is only flipped when the stochastic policy is enabled,
            # so a disabled policy never touches the exploration rate.
            explore = self._enable_stoch_policy()
            if explore:
                explore = MathUtil.flip_coin(self.exp_params_curr.rate)
            self._exp_action = explore

            batch_a, batch_logp = self._eval_actor(s, g, self._exp_action)
            first_a = batch_a[0]
            first_logp = batch_logp[0]

        return first_a, first_logp
Esempio n. 2
0
    def _decide_action(self, s, g):
        """Pick an action from the actor, optionally perturbed by Gaussian noise.

        The first element of ``self._eval_actor(s, g)`` is taken as the action.
        When ``self._enable_stoch_policy()`` is truthy and
        ``MathUtil.flip_coin(self.exp_params_curr.rate)`` succeeds, standard
        normal noise of the action's shape is scaled by
        ``self.exp_params_curr.noise``, multiplied by ``self.a_norm.std`` and
        added to the action in place.

        Side effects:
            ``self._exp_action`` is set to ``True`` when noise was applied and
            to ``False`` otherwise.

        Args:
            s: State input, passed unchanged to ``_eval_actor``.
            g: Goal input, passed unchanged to ``_eval_actor``.

        Returns:
            A tuple ``(a, logp)``. ``logp`` is ``0`` when no noise was applied;
            otherwise it is ``self._calc_action_logp`` evaluated on the scaled
            noise before the ``a_norm.std`` multiplication.
        """
        with self.sess.as_default(), self.graph.as_default():
            # Start from the non-exploratory outcome; it is overwritten below
            # only if this step ends up exploring.
            self._exp_action = False
            logp = 0
            action = self._eval_actor(s, g)[0]

            # epsilon-greedy: the coin is flipped only when the stochastic
            # policy is enabled (short-circuit keeps the original call order).
            if self._enable_stoch_policy() and MathUtil.flip_coin(self.exp_params_curr.rate):
                unit_noise = np.random.randn(*action.shape)
                unit_noise *= self.exp_params_curr.noise

                # In-place add keeps the action's dtype and underlying buffer.
                action += unit_noise * self.a_norm.std

                logp = self._calc_action_logp(unit_noise)
                self._exp_action = True

        return action, logp