Beispiel #1
0
 def _decide_action(self, s, g):
     """Query the actor for inputs ``s`` and ``g`` and return one action.

     The exploration decision is stored on ``self._exp_action``: the coin
     flip at ``self.exp_params_curr.rate`` is only drawn when
     ``self._enable_stoch_policy()`` is truthy; otherwise the (falsy)
     result of that call is stored as-is.

     Returns:
         A 2-tuple holding the first element of each of the two values
         produced by ``self._eval_actor`` (presumably the action and its
         log-probability for a batch of size one -- confirm with callers).
     """
     with self.sess.as_default(), self.graph.as_default():
         explore = self._enable_stoch_policy()
         if explore:
             # Stochastic policy is on: decide randomly whether to explore.
             explore = MathUtil.flip_coin(self.exp_params_curr.rate)
         self._exp_action = explore

         actions, logps = self._eval_actor(s, g, self._exp_action)
     return actions[0], logps[0]
Beispiel #2
0
 def _decide_action(self, s, g):
   """Evaluate the actor on ``s`` and ``g`` and return a single action.

   ``self._exp_action`` records the exploration decision for this step. It
   is the short-circuit ``and`` of ``self._enable_stoch_policy()`` and a
   coin flip at ``self.exp_params_curr.rate``, so the coin is not flipped
   when the stochastic policy is disabled.

   Returns:
     ``(a[0], logp[0])`` where ``a`` and ``logp`` are the two values
     returned by ``self._eval_actor`` (presumably batched outputs with a
     leading dimension of one -- confirm with callers).
   """
   with self.sess.as_default(), self.graph.as_default():
     stoch_enabled = self._enable_stoch_policy()
     self._exp_action = (
         stoch_enabled and MathUtil.flip_coin(self.exp_params_curr.rate))

     actor_out = self._eval_actor(s, g, self._exp_action)
     action_batch, logp_batch = actor_out
   return action_batch[0], logp_batch[0]
Beispiel #3
0
  def _decide_action(self, s, g):
    """Return the actor's action for ``s`` and ``g``, optionally with noise.

    The first element of ``self._eval_actor(s, g)`` is used as the base
    action. When ``self._enable_stoch_policy()`` is truthy and a coin flip
    at ``self.exp_params_curr.rate`` succeeds, standard-normal noise scaled
    by ``self.exp_params_curr.noise`` and ``self.a_norm.std`` is added to
    the action in place.

    Side effects:
      Sets ``self._exp_action`` to ``True`` when noise was applied and to
      ``False`` otherwise.

    Returns:
      ``(action, logp)``. ``logp`` is ``0`` when no noise was applied;
      otherwise it is ``self._calc_action_logp`` of the noise before the
      ``a_norm.std`` scaling.
    """
    with self.sess.as_default(), self.graph.as_default():
      self._exp_action = False
      action = self._eval_actor(s, g)[0]
      log_prob = 0

      # epsilon-greedy: the coin is only flipped if the stochastic policy
      # is enabled (short-circuit evaluation).
      if (self._enable_stoch_policy()
          and MathUtil.flip_coin(self.exp_params_curr.rate)):
        noise_normed = np.random.randn(*action.shape)
        noise_normed *= self.exp_params_curr.noise
        action += noise_normed * self.a_norm.std

        log_prob = self._calc_action_logp(noise_normed)
        self._exp_action = True

    return action, log_prob
Beispiel #4
0
  def _decide_action(self, s, g):
    """Choose an action for ``s`` and ``g`` with epsilon-greedy exploration.

    Starts from the first element of ``self._eval_actor(s, g)``. If the
    stochastic policy is enabled and a coin flip at
    ``self.exp_params_curr.rate`` succeeds, the action is perturbed in
    place by standard-normal noise multiplied by
    ``self.exp_params_curr.noise`` and then by ``self.a_norm.std``.

    Side effects:
      ``self._exp_action`` is reset to ``False`` on entry and set to
      ``True`` only when the action was perturbed.

    Returns:
      A tuple ``(action, logp)``; ``logp`` stays ``0`` for an unperturbed
      action and is otherwise the value of ``self._calc_action_logp``
      applied to the noise prior to the ``a_norm.std`` scaling.
    """
    with self.sess.as_default(), self.graph.as_default():
      self._exp_action = False
      chosen = self._eval_actor(s, g)[0]
      chosen_logp = 0

      # The coin flip is skipped entirely when the stochastic policy is off.
      explore = (self._enable_stoch_policy()
                 and MathUtil.flip_coin(self.exp_params_curr.rate))
      if explore:
        base_noise = np.random.randn(*chosen.shape)
        base_noise *= self.exp_params_curr.noise
        scaled_noise = base_noise * self.a_norm.std
        chosen += scaled_noise

        chosen_logp = self._calc_action_logp(base_noise)
        self._exp_action = True

    return chosen, chosen_logp