Example #1
 def _get_action(self, s, visual_s, cell_state, options):
     with tf.device(self.device):
         feat, cell_state = self.get_feature(s,
                                             visual_s,
                                             cell_state=cell_state,
                                             record_cs=True,
                                             train=False)
         q = self.q_net(feat)  # [B, P]
         pi = self.intra_option_net(feat)  # [B, P, A]
         beta = self.termination_net(feat)  # [B, P]
         options_onehot = tf.one_hot(options,
                                     self.options_num,
                                     dtype=tf.float32)  # [B, P]
         options_onehot_expanded = tf.expand_dims(options_onehot,
                                                  axis=-1)  # [B, P, 1]
         pi = tf.reduce_sum(pi * options_onehot_expanded, axis=1)  # [B, A]
         if self.is_continuous:
             log_std = tf.gather(self.log_std, options)
             mu = tf.math.tanh(pi)
             a, _ = gaussian_clip_rsample(mu, log_std)
         else:
             pi = pi / self.boltzmann_temperature
             dist = tfp.distributions.Categorical(logits=pi)  # [B, ]
             a = dist.sample()
         max_options = tf.cast(tf.argmax(q, axis=-1),
                               dtype=tf.int32)  # [B, P] => [B, ]
         if self.use_eps_greedy:
             new_options = max_options
         else:
             beta_probs = tf.reduce_sum(beta * options_onehot,
                                        axis=1)  # [B, P] => [B,]
             beta_dist = tfp.distributions.Bernoulli(probs=beta_probs)
             new_options = tf.where(beta_dist.sample() < 1, options,
                                    max_options)
     return a, new_options, cell_state
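
A minimal, self-contained sketch of the option-switching step used above, with the networks replaced by synthetic tensors. The shapes and the Bernoulli-termination logic follow the example; the concrete values are placeholders, not part of the original code.

import tensorflow as tf
import tensorflow_probability as tfp

B, P = 4, 3                                    # batch size, number of options
options = tf.constant([0, 2, 1, 0], tf.int32)  # currently active option per env
q = tf.random.normal([B, P])                   # stand-in for self.q_net(feat)
beta = tf.random.uniform([B, P])               # stand-in for self.termination_net(feat)

options_onehot = tf.one_hot(options, P, dtype=tf.float32)           # [B, P]
beta_probs = tf.reduce_sum(beta * options_onehot, axis=1)           # [B, ]
terminate = tfp.distributions.Bernoulli(probs=beta_probs).sample()  # 0/1 per env
max_options = tf.cast(tf.argmax(q, axis=-1), tf.int32)              # greedy option
new_options = tf.where(terminate < 1, options, max_options)         # keep or switch
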
Example #2
 def _get_action(self, s, visual_s, cell_state, options):
     with tf.device(self.device):
         feat, cell_state = self.get_feature(s,
                                             visual_s,
                                             cell_state=cell_state,
                                             record_cs=True)
         q = self.q_net(feat)  # [B, P]
         pi = self.intra_option_net(feat)  # [B, P, A]
         options_onehot = tf.one_hot(options,
                                     self.options_num,
                                     dtype=tf.float32)  # [B, P]
         options_onehot_expanded = tf.expand_dims(options_onehot,
                                                  axis=-1)  # [B, P, 1]
         pi = tf.reduce_sum(pi * options_onehot_expanded, axis=1)  # [B, A]
         if self.is_continuous:
             log_std = tf.gather(self.log_std, options)
             mu = tf.math.tanh(pi)
             a, _ = gaussian_clip_rsample(mu, log_std)
         else:
             pi = pi / self.boltzmann_temperature
             dist = tfp.distributions.Categorical(logits=pi)  # [B, ]
             a = dist.sample()
         interests = self.interest_net(feat)  # [B, P]
         op_logits = interests * q  # [B, P]; tf.nn.softmax(q) could be used in place of q
         new_options = tfp.distributions.Categorical(
             logits=op_logits).sample()
     return a, new_options, cell_state
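
This example differs from Example #1 only in how the next option is picked: an interest function gates the option values and the next option is sampled from a Categorical over the gated scores. A short sketch with placeholder tensors instead of the project's networks:

import tensorflow as tf
import tensorflow_probability as tfp

B, P = 4, 3
q = tf.random.normal([B, P])           # stand-in for self.q_net(feat)
interests = tf.random.uniform([B, P])  # stand-in for self.interest_net(feat)

op_logits = interests * q              # gated option scores; softmax(q) is a possible alternative
new_options = tfp.distributions.Categorical(logits=op_logits).sample()  # [B, ]
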
Example #3
 def _get_action(self, s, visual_s, cell_state, options):
     with tf.device(self.device):
         feat, cell_state = self.get_feature(s,
                                             visual_s,
                                             cell_state=cell_state,
                                             record_cs=True,
                                             train=False)
         q, pi, beta = self.net(feat)  # [B, P], [B, P, A], [B, P]
         options_onehot = tf.one_hot(options,
                                     self.options_num,
                                     dtype=tf.float32)  # [B, P]
         options_onehot_expanded = tf.expand_dims(options_onehot,
                                                  axis=-1)  # [B, P, 1]
         pi = tf.reduce_sum(pi * options_onehot_expanded, axis=1)  # [B, A]
         if self.is_continuous:
             log_std = tf.gather(self.log_std, options)
             mu = pi
             sample_op, _ = gaussian_clip_rsample(mu, log_std)
             log_prob = gaussian_likelihood_sum(sample_op, mu, log_std)
         else:
             logits = pi
             norm_dist = tfp.distributions.Categorical(logits)
             sample_op = norm_dist.sample()
             log_prob = norm_dist.log_prob(sample_op)
         q_o = tf.reduce_sum(q * options_onehot, axis=-1)  # [B, ]
         beta_adv = q_o - ((1 - self.eps) * tf.reduce_max(q, axis=-1) +
                           self.eps * tf.reduce_mean(q, axis=-1))  # [B, ]
         max_options = tf.cast(tf.argmax(q, axis=-1),
                               dtype=tf.int32)  # [B, P] => [B, ]
         beta_probs = tf.reduce_sum(beta * options_onehot,
                                    axis=1)  # [B, P] => [B,]
         beta_dist = tfp.distributions.Bernoulli(probs=beta_probs)
         new_options = tf.where(beta_dist.sample() < 1, options,
                                max_options)  # sample < 1: keep the current option; sample == 1: switch to the greedy option
     return sample_op, q_o, log_prob, beta_adv, new_options, max_options, cell_state
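
The beta_adv term in Example #3 is the advantage of the active option over an epsilon-soft state value (the value of an epsilon-greedy policy over options), returned alongside the action presumably for use in the termination loss. A standalone sketch with synthetic values:

import tensorflow as tf

B, P, eps = 4, 3, 0.1
q = tf.random.normal([B, P])                      # stand-in for the option values from self.net(feat)
options_onehot = tf.one_hot(tf.constant([0, 2, 1, 0]), P, dtype=tf.float32)

q_o = tf.reduce_sum(q * options_onehot, axis=-1)  # value of the active option, [B, ]
v_s = (1 - eps) * tf.reduce_max(q, axis=-1) + eps * tf.reduce_mean(q, axis=-1)  # epsilon-soft state value
beta_adv = q_o - v_s                              # termination advantage, [B, ]
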
Example #4
 def _get_action(self, s, visual_s, evaluation):
     s, visual_s = self.cast(s, visual_s)
     with tf.device(self.device):
         if self.is_continuous:
             mu = self.actor_net(s, visual_s)
             sample_op, _ = gaussian_clip_rsample(mu, self.log_std)
         else:
             logits = self.actor_net(s, visual_s)
             norm_dist = tfp.distributions.Categorical(logits)
             sample_op = norm_dist.sample()
     return sample_op
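
gaussian_clip_rsample and gaussian_likelihood_sum are helpers from the RLs project whose source is not shown in these examples. A rough, hypothetical stand-in for the sampling helper, assuming it draws a reparameterized sample from N(mu, exp(log_std)) and clips the result to a bounded action range:

import tensorflow as tf

def gaussian_clip_rsample_sketch(mu, log_std, low=-1.0, high=1.0):
    """Illustrative guess only; not the project's actual implementation."""
    noise = tf.random.normal(tf.shape(mu))
    sample = mu + tf.exp(log_std) * noise      # reparameterization trick
    clipped = tf.clip_by_value(sample, low, high)
    return clipped, sample                     # second return value is a guess; the callers above discard it
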
Example #5
 def _get_action(self, s, visual_s, cell_state):
     with tf.device(self.device):
         feat, cell_state = self.get_feature(s,
                                             visual_s,
                                             cell_state=cell_state,
                                             record_cs=True,
                                             train=False)
         if self.is_continuous:
             mu = self.net(feat)
             sample_op, _ = gaussian_clip_rsample(mu, self.log_std)
         else:
             logits = self.net(feat)
             norm_dist = tfp.distributions.Categorical(logits)
             sample_op = norm_dist.sample()
     return sample_op, cell_state
Example #6
File: ac.py Project: yyht/RLs
 def _get_action(self, s, visual_s, cell_state):
     with tf.device(self.device):
         feat, cell_state = self.get_feature(s,
                                             visual_s,
                                             cell_state=cell_state,
                                             record_cs=True)
         if self.is_continuous:
             mu = self.actor_net(feat)
             sample_op, _ = gaussian_clip_rsample(mu, self.log_std)
             log_prob = gaussian_likelihood_sum(sample_op, mu, self.log_std)
         else:
             logits = self.actor_net(feat)
             norm_dist = tfp.distributions.Categorical(logits)
             sample_op = norm_dist.sample()
             log_prob = norm_dist.log_prob(sample_op)
     return sample_op, log_prob, cell_state
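
For reference, the discrete branch in Examples #4 to #6 samples from a Categorical over the actor's logits; Example #6 additionally keeps the sample's log-probability for the policy-gradient update. A minimal sketch with a placeholder logits tensor standing in for self.actor_net(feat):

import tensorflow as tf
import tensorflow_probability as tfp

logits = tf.random.normal([4, 5])         # [batch, n_actions], placeholder
norm_dist = tfp.distributions.Categorical(logits=logits)
sample_op = norm_dist.sample()            # sampled action ids, [batch, ]
log_prob = norm_dist.log_prob(sample_op)  # log pi(a|s), used by the policy gradient
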