Example #1
    def train_option(self, bootstrap_value_mix):
        rollout = np.array(self.episode_buffer_option)
        observations = rollout[:, 0]
        option_directions = rollout[:, 1]
        actions = rollout[:, 2]
        rewards = rollout[:, 3]
        eigen_rewards = rollout[:, 4]
        next_observations = rollout[:, 5]
        """Construct list of discounted returns using mixed reward signals for the entire n-step trajectory"""
        eigen_rewards_plus = np.asarray(eigen_rewards.tolist() +
                                        [bootstrap_value_mix])
        discounted_eigen_returns = reward_discount(eigen_rewards_plus,
                                                   self.config.discount)[:-1]

        feed_dict = {
            self.local_network.target_eigen_return: discounted_eigen_returns,
            self.local_network.observation: np.stack(observations, axis=0),
            self.local_network.actions_placeholder: actions,
            self.local_network.matrix_sf: [self.global_network.sf_matrix_buffer],
            # self.local_network.current_option_direction: option_directions,
        }
        """Do an update on the intra-option policies"""
        _, self.summaries_option = self.sess.run([
            self.local_network.apply_grads_option,
            self.local_network.merged_summary_option,
        ],
                                                 feed_dict=feed_dict)
        """Store the bootstrap target returns at the end of the trajectory"""
        self.eigen_R = discounted_eigen_returns[-1]
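
The `reward_discount` helper used throughout these variants is not shown in the snippets; below is a minimal sketch, assuming it computes the standard discounted cumulative return over a 1-D reward sequence (the usual `scipy.signal.lfilter` trick):

import numpy as np
import scipy.signal


def reward_discount(x, gamma):
    """Return y with y[t] = x[t] + gamma * x[t+1] + gamma**2 * x[t+2] + ...

    Applied to rewards + [bootstrap_value] and trimmed with [:-1], as in the
    methods above, this yields the n-step bootstrapped returns.
    """
    return scipy.signal.lfilter([1], [1, -gamma], np.asarray(x)[::-1], axis=0)[::-1]
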
    def train_option(self, bootstrap_value, bootstrap_value_mix):
        # Buffer columns: s, self.option, self.action, r, r_i, primitive
        rollout = np.array(self.episode_buffer_option)
        observations = rollout[:, 0]
        options = rollout[:, 1]
        actions = rollout[:, 2]
        rewards = rollout[:, 3]
        eigen_rewards = rollout[:, 4]
        primitive_actions = rollout[:, 5]

        rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
        discounted_returns = reward_discount(rewards_plus,
                                             self.config.discount)[:-1]

        options1, options2, actions1, actions2, discounted_returns1, discounted_returns2, \
        observations1, observations2 = [], [], [], [], [], [], [], []

        if self.config.eigen:
            eigen_rewards_plus = np.asarray(eigen_rewards.tolist() +
                                            [bootstrap_value_mix])
            discounted_eigen_returns = reward_discount(eigen_rewards_plus,
                                                       self.config.discount)[:-1]
            discounted_eigen_returns1, discounted_eigen_returns2 = [], []

        for i, primitive in enumerate(primitive_actions):
            if primitive:
                options1.append(options[i])
                actions1.append(actions[i])
                discounted_returns1.append(discounted_returns[i])
                if self.config.eigen:
                    discounted_eigen_returns1.append(
                        discounted_eigen_returns[i])
                observations1.append(observations[i])
            else:
                options2.append(options[i])
                actions2.append(actions[i])
                discounted_returns2.append(discounted_returns[i])
                if self.config.eigen:
                    discounted_eigen_returns2.append(
                        discounted_eigen_returns[i])
                observations2.append(observations[i])

        if len(observations1) > 0:
            feed_dict = {
                self.local_network.target_return: discounted_returns1,
                self.local_network.observation: np.stack(observations1,
                                                         axis=0),
                self.local_network.options_placeholder: options1
            }
            to_run = [self.local_network.apply_grads_primitive_option]

            _ = self.sess.run(to_run, feed_dict=feed_dict)

        if len(observations2) > 0:

            feed_dict = {
                self.local_network.target_return: discounted_returns2,
                self.local_network.observation: np.stack(observations2,
                                                         axis=0),
                self.local_network.actions_placeholder: actions2,
                self.local_network.options_placeholder: options2
            }
            to_run = [
                self.local_network.apply_grads_option,
                self.local_network.merged_summary_option,
                self.local_network.option_loss, self.local_network.policy_loss,
                self.local_network.entropy_loss,
                self.local_network.critic_loss, self.local_network.term_loss
            ]

            if self.config.eigen:
                feed_dict[self.local_network.target_eigen_return] = (
                    discounted_eigen_returns2)
                to_run.append(self.local_network.eigen_critic_loss)

            results = self.sess.run(to_run, feed_dict=feed_dict)
            results.append(discounted_returns[-1])
            if self.config.eigen:
                results.append(discounted_eigen_returns[-1])
        else:
            return None

        return results[1:]
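
The loop above partitions the rollout by the `primitive_actions` flag, so that primitive options receive only a return-based update while regular options also train their intra-option policy. A small self-contained sketch of the same split using boolean masks (all names here are illustrative, not from the original code):

import numpy as np

# Toy rollout columns, one entry per time step.
options = np.array([0, 1, 0, 2])
actions = np.array([3, 1, 2, 0])
returns = np.array([0.5, 0.2, 0.9, 0.1])
primitive = np.array([True, False, True, False])

# Primitive-option steps: only the option id and its target return are needed.
opts_prim, rets_prim = options[primitive], returns[primitive]
# Regular-option steps: the intra-option action also feeds the policy update.
opts_reg, acts_reg, rets_reg = (options[~primitive], actions[~primitive],
                                returns[~primitive])
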
  def train_option(self, bootstrap_value, bootstrap_value_mix):
    rollout = np.array(self.episode_buffer_option)
    observations = rollout[:, 0]
    options = rollout[:, 1]
    actions = rollout[:, 2]
    rewards = rollout[:, 3]
    eigen_rewards = rollout[:, 4]
    primitive_actions = rollout[:, 5]
    next_observations = rollout[:, 6]

    """Construct list of discounted returns for the entire n-step trajectory"""
    rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
    discounted_returns = reward_discount(rewards_plus, self.config.discount)[:-1]

    """Construct list of discounted returns using mixed reward signals for the entire n-step trajectory"""
    eigen_rewards_plus = np.asarray(eigen_rewards.tolist() + [bootstrap_value_mix])
    discounted_eigen_returns = reward_discount(eigen_rewards_plus, self.config.discount)[:-1]

    """Get the real directions executed in the environment, not the ones corresponding to the options of the high-level policy, since the former might not be the ones that need to be assigned credit for the return"""
    feed_dict = {
      self.local_network.observation: np.concatenate((np.stack(observations, 0), np.stack(next_observations, 0)),
                                                     axis=0)}
    fi = self.sess.run(self.local_network.fi,
                       feed_dict=feed_dict)
    fi_next = fi[len(observations):]
    fi = fi[:len(observations)]
    real_directions = fi_next - fi
    real_approx_options, directions = [], []
    for i, d in enumerate(real_directions):
      if primitive_actions[i]:
        real_approx_options.append(options[i])
        directions.append(np.zeros((self.config.sf_layers[-1])))
      else:
        directions.append(self.global_network.directions[options[i]])
        real_approx_options.append(np.argmax([self.cosine_similarity(d, self.directions[o]) for o in
                                              range(self.nb_options)]) if self.total_episodes > 0 else options[i])

    """Do an update on the option-value function critic"""
    feed_dict = {self.local_network.target_return: discounted_returns,
                 self.local_network.observation: np.stack(observations, axis=0),
                 # self.local_network.options_placeholder: real_approx_options,
                 self.local_network.options_placeholder: options,
                 # self.local_network.option_direction_placeholder: real_directions
                 self.local_network.option_direction_placeholder: directions
                 }

    _, self.summaries_critic = self.sess.run(
      [self.local_network.apply_grads_critic,
       self.local_network.merged_summary_critic],
      feed_dict=feed_dict)

    """Do an update on the option termination conditions"""
    feed_dict = {
      self.local_network.observation: np.stack(next_observations, axis=0),
      # self.local_network.options_placeholder: real_approx_options,
      self.local_network.options_placeholder: options,
      # self.local_network.option_direction_placeholder: real_directions,
      self.local_network.option_direction_placeholder: directions,
      self.local_network.primitive_actions_placeholder: primitive_actions
    }

    _, self.summaries_termination = self.sess.run(
      [self.local_network.apply_grads_term,
       self.local_network.merged_summary_term],
      feed_dict=feed_dict)

    feed_dict = {self.local_network.target_return: discounted_returns,
                 self.local_network.target_eigen_return: discounted_eigen_returns,
                 self.local_network.observation: np.stack(observations, axis=0),
                 self.local_network.actions_placeholder: actions,
                 self.local_network.options_placeholder: options,
                 self.local_network.option_direction_placeholder: directions,
                 self.local_network.primitive_actions_placeholder: primitive_actions
                 }

    """Do an update on the intra-option policies"""
    _, self.summaries_option = self.sess.run(
      [self.local_network.apply_grads_option,
       self.local_network.merged_summary_option],
      feed_dict=feed_dict)

    """Store the bootstrap target returns at the end of the trajectory"""
    self.R = discounted_returns[-1]
    self.eigen_R = discounted_eigen_returns[-1]
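
The `self.cosine_similarity` helper used above to map the executed feature-space direction (`fi_next - fi`) back to the closest option direction is not defined in these snippets; a minimal standalone sketch of such a helper (the original uses it as a method on the worker):

import numpy as np


def cosine_similarity(a, b, eps=1e-8):
    # Cosine of the angle between two direction vectors; eps guards against
    # division by zero when either vector is (near-)zero.
    a = np.asarray(a, dtype=np.float64)
    b = np.asarray(b, dtype=np.float64)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + eps))
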
    def train_option(self, bootstrap_value, bootstrap_value_mix):
        rollout = np.array(self.episode_buffer_option)
        observations = rollout[:, 0]
        options = rollout[:, 1]
        actions = rollout[:, 2]
        rewards = rollout[:, 3]
        eigen_rewards = rollout[:, 4]
        primitive_actions = rollout[:, 5]
        next_observations = rollout[:, 6]
        """Construct list of discounted returns for the entire n-step trajectory"""
        rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
        discounted_returns = reward_discount(rewards_plus,
                                             self.config.discount)[:-1]
        """Construct list of discounted returns using mixed reward signals for the entire n-step trajectory"""
        eigen_rewards_plus = np.asarray(eigen_rewards.tolist() +
                                        [bootstrap_value_mix])
        discounted_eigen_returns = reward_discount(eigen_rewards_plus,
                                                   self.config.discount)[:-1]
        """Do an update on the option-value function critic"""
        feed_dict = {
            self.local_network.target_return: discounted_returns,
            self.local_network.observation: np.stack(observations, axis=0),
            self.local_network.options_placeholder: options,
        }

        _, self.summaries_critic = self.sess.run([
            self.local_network.apply_grads_critic,
            self.local_network.merged_summary_critic,
        ],
                                                 feed_dict=feed_dict)
        """Do an update on the option termination conditions"""
        feed_dict = {
            self.local_network.observation: np.stack(next_observations,
                                                     axis=0),
            self.local_network.options_placeholder: options,
            self.local_network.primitive_actions_placeholder: primitive_actions
        }

        _, self.summaries_termination = self.sess.run([
            self.local_network.apply_grads_term,
            self.local_network.merged_summary_term,
        ],
                                                      feed_dict=feed_dict)
        """Do an update on the intra-option policies"""
        feed_dict = {
            self.local_network.target_return: discounted_returns,
            self.local_network.target_eigen_return: discounted_eigen_returns,
            self.local_network.observation: np.stack(observations, axis=0),
            self.local_network.actions_placeholder: actions,
            self.local_network.options_placeholder: options,
            self.local_network.primitive_actions_placeholder: primitive_actions
        }

        _, self.summaries_option = self.sess.run([
            self.local_network.apply_grads_option,
            self.local_network.merged_summary_option,
        ],
                                                 feed_dict=feed_dict)
        """Store the bootstrap target returns at the end of the trajectory"""
        self.R = discounted_returns[-1]
        self.eigen_R = discounted_eigen_returns[-1]
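
All variants index `self.episode_buffer_option` positionally, so the buffer must be filled in a fixed column order. A hedged sketch of an append that would produce the layout expected by the last variant (variable names and shapes are illustrative assumptions, not taken from the original code):

import numpy as np

# One dummy transition in the column order unpacked by train_option above:
# [observation, option, action, reward, eigen_reward, is_primitive, next_observation]
s, s_next = np.zeros(13), np.zeros(13)   # observations; the shape is arbitrary here
episode_buffer_option = []
episode_buffer_option.append([s, 1, 2, 0.0, 0.37, False, s_next])

# train_option then recovers the columns positionally, e.g. rollout[:, 3] -> rewards;
# dtype=object keeps array-valued and scalar columns side by side.
rollout = np.array(episode_buffer_option, dtype=object)
rewards, eigen_rewards = rollout[:, 3], rollout[:, 4]
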