def train_sf(self, bootstrap_sf):
        rollout = np.array(self.episode_buffer_sf)

        observations = rollout[:, 0]

        # Get the latent representation fi(s) for every state in the rollout.
        feed_dict = {
            self.local_network.observation: np.stack(observations, axis=0)
        }
        fi = self.sess.run(self.local_network.fi, feed_dict=feed_dict)

        # Append the bootstrap value, then build the discounted n-step
        # successor-feature targets for the whole trajectory.
        sf_plus = np.asarray(fi.tolist() + [bootstrap_sf])
        discounted_sf = discount(sf_plus, self.config.discount)[:-1]

        feed_dict = {
            self.local_network.target_sf: np.stack(discounted_sf, axis=0),
            self.local_network.observation: np.stack(observations, axis=0)
        }

        # One gradient step on the successor-feature loss.
        _, ms, sf_loss = \
          self.sess.run([self.local_network.apply_grads_sf,
                         self.local_network.merged_summary_sf,
                         self.local_network.sf_loss],
                        feed_dict=feed_dict)

        return ms, sf_loss
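
# All of these examples lean on a `discount` helper that computes a discounted
# cumulative sum over the trajectory (applied per feature dimension here).
# A minimal sketch of one common implementation, assuming the usual
# scipy.signal.lfilter trick -- the actual helper in the source repo may differ:
import numpy as np
import scipy.signal

def discount(x, gamma):
    # y[t] = x[t] + gamma * y[t + 1], evaluated right to left along axis 0.
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]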
  def train_sf(self, bootstrap_sf):
    rollout = np.array(self.episode_buffer_sf)
    observations = rollout[:, 0]
    next_observations = rollout[:, 1]
    actions = rollout[:, 2]
    rewards = rollout[:, 3]  # unpacked for completeness; unused in this variant
    fi = rollout[:, 4]

    # Construct the list of latent representations for the entire trajectory.
    sf_plus = np.asarray(fi.tolist() + [bootstrap_sf])
    # Construct the next-step successor-feature targets for the trajectory.
    discounted_sf = discount(sf_plus, self.config.discount)[:-1]

    feed_dict = {self.local_network.target_sf: np.stack(discounted_sf, axis=0),
                 self.local_network.observation: np.stack(observations, axis=0),
                 self.local_network.actions_placeholder: actions,
                 self.local_network.target_next_obs: np.stack(next_observations, axis=0)}

    # Jointly update the successor-feature head and the auxiliary head
    # (which is fed the actions and next observations).
    _, self.summaries_sf, sf_loss, _, self.summaries_aux, aux_loss = \
      self.sess.run([self.local_network.apply_grads_sf,
                     self.local_network.merged_summary_sf,
                     self.local_network.sf_loss,
                     self.local_network.apply_grads_aux,
                     self.local_network.merged_summary_aux,
                     self.local_network.aux_loss],
                    feed_dict=feed_dict)
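
# This variant indexes five columns out of `self.episode_buffer_sf`, so each
# row is assumed to be (observation, next_observation, action, reward, fi),
# appended once per environment step. A hypothetical rollout loop:
#
#     fi = self.sess.run(self.local_network.fi,
#                        {self.local_network.observation: [s]})[0]
#     self.episode_buffer_sf.append([s, s1, a, r, fi])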
  def train_sf(self, rollout, sess, bootstrap_sf, summaries=False):
    rollout = np.array(rollout)
    observations = rollout[:, 0]
    # Rollout rows also carry next_observation (index 1) and action
    # (index 2); this variant does not use them.

    # Get the latent representation fi(s) for every state in the rollout.
    feed_dict = {self.local_network.observation: np.stack(observations, axis=0)}
    fi = sess.run(self.local_network.fi, feed_dict=feed_dict)

    sf_plus = np.asarray(fi.tolist() + [bootstrap_sf])
    discounted_sf = discount(sf_plus, self.config.discount)[:-1]

    feed_dict = {self.local_network.target_sf: np.stack(discounted_sf, axis=0),
                 self.local_network.observation: np.stack(observations, axis=0)}

    _, ms, sf_loss = \
      sess.run([self.local_network.apply_grads_sf,
                self.local_network.merged_summary_sf,
                self.local_network.sf_loss],
               feed_dict=feed_dict)

    return ms, sf_loss
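
# A quick numeric check of the target construction used above (relying on the
# `discount` sketch earlier): with features fi = [f0, f1] and bootstrap value
# b, discount(sf_plus, g)[:-1] yields the n-step successor-feature targets
# satisfying psi(s_t) = fi(s_t) + g * psi(s_{t+1}).
g = 0.95
fi = np.array([[1.0, 0.0], [0.0, 1.0]])
b = np.array([0.5, 0.5])
sf_plus = np.asarray(fi.tolist() + [b])
targets = discount(sf_plus, g)[:-1]
assert np.allclose(targets[1], fi[1] + g * b)
assert np.allclose(targets[0], fi[0] + g * targets[1])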
    def train(self, rollout, bootstrap_sf):
        rollout = np.array(rollout)
        observations = rollout[:, 0]
        # In the tabular case the features are one-hot state encodings.
        fi = np.identity(self.nb_states)[observations]
        # Construct the next-step successor-representation targets for the
        # entire trajectory.
        sf_plus = np.asarray(fi.tolist() + [bootstrap_sf])
        discounted_sf = discount(sf_plus, self.config.discount)[:-1]

        feed_dict = {
            self.local_network.target_sf: np.stack(discounted_sf, axis=0),
            self.local_network.observation: fi  # reuse the one-hot encodings
        }

        _, self.summaries, loss = \
          self.sess.run([self.local_network.apply_grads,
                         self.local_network.merged_summary,
                         self.local_network.loss],
                        feed_dict=feed_dict)
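
# In this tabular setting, a converged network should approximate the
# closed-form successor representation of the behavior policy. A hedged
# sanity-check helper (P, the policy's state-transition matrix, is an
# assumption -- it is not part of the original code):
def analytic_sr(P, gamma):
    # Psi = sum_t gamma^t P^t = (I - gamma P)^{-1}:
    # expected discounted visit counts from each start state.
    return np.linalg.inv(np.eye(P.shape[0]) - gamma * P)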
  def train_sf(self, rollout, bootstrap_sf):
    rollout = np.array(rollout)
    observations = rollout[:, 0]
    # Rollout rows also carry next_observation (index 1) and action
    # (index 2); this variant does not use them.

    # Get the latent representation fi(s) for each state.
    feed_dict = {self.local_network.observation: np.stack(observations, axis=0)}
    fi = self.sess.run(self.local_network.fi, feed_dict=feed_dict)
    # Construct the list of latent representations for the entire trajectory,
    # then the next-step successor-feature targets.
    sf_plus = np.asarray(fi.tolist() + [bootstrap_sf])
    discounted_sf = discount(sf_plus, self.config.discount)[:-1]

    feed_dict = {self.local_network.target_sf: np.stack(discounted_sf, axis=0),
                 self.local_network.observation: np.stack(observations, axis=0)}
    _, self.summaries_sf, sf_loss = \
      self.sess.run([self.local_network.apply_grads_sf,
                     self.local_network.merged_summary_sf,
                     self.local_network.sf_loss],
                    feed_dict=feed_dict)
    def train(self, rollout, sess, bootstrap_sf, summaries=False):
        rollout = np.array(rollout)
        observations = rollout[:, 0]
        # Rollout rows also carry action, sf, and fi at indices 1-3; the
        # one-hot encodings are recomputed here instead.
        fi = np.identity(self.nb_states)[observations]
        sf_plus = np.asarray(fi.tolist() + [bootstrap_sf])
        discounted_sf = discount(sf_plus, self.config.discount)[:-1]

        feed_dict = {
            self.local_network.target_sf: np.stack(discounted_sf, axis=0),
            self.local_network.observation: fi  # reuse the one-hot encodings
        }

        _, ms, loss, sf_loss = \
          sess.run([self.local_network.apply_grads,
                    self.local_network.merged_summary,
                    self.local_network.loss,
                    self.local_network.sf_loss],
                   feed_dict=feed_dict)
        return ms, loss, sf_loss
    def train_option(self, bootstrap_value, bootstrap_value_mix):
        # Each buffer row: (s, option, action, r, r_i, primitive_action_flag).
        rollout = np.array(self.episode_buffer_option)
        observations = rollout[:, 0]
        options = rollout[:, 1]
        actions = rollout[:, 2]
        rewards = rollout[:, 3]
        eigen_rewards = rollout[:, 4]
        primitive_actions = rollout[:, 5]
        rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
        discounted_returns = reward_discount(rewards_plus,
                                             self.config.discount)[:-1]

        options1, options2, actions1, actions2, discounted_returns1, discounted_returns2, \
        observations1, observations2 = [], [], [], [], [], [], [], []

        if self.config.eigen:
            eigen_rewards_plus = np.asarray(eigen_rewards.tolist() +
                                            [bootstrap_value_mix])
            discounted_eigen_returns = discount(eigen_rewards_plus,
                                                self.config.discount)[:-1]
            discounted_eigen_returns1, discounted_eigen_returns2 = [], []

        # Split the trajectory into steps taken with primitive-action options
        # (suffix 1) and steps taken inside learned options (suffix 2), which
        # get separate update ops below.
        for i, primitive in enumerate(primitive_actions):
            if primitive:
                options1.append(options[i])
                actions1.append(actions[i])
                discounted_returns1.append(discounted_returns[i])
                if self.config.eigen:
                    discounted_eigen_returns1.append(
                        discounted_eigen_returns[i])
                observations1.append(observations[i])
            else:
                options2.append(options[i])
                actions2.append(actions[i])
                discounted_returns2.append(discounted_returns[i])
                if self.config.eigen:
                    discounted_eigen_returns2.append(
                        discounted_eigen_returns[i])
                observations2.append(observations[i])

        if len(observations1) > 0:
            feed_dict = {
                self.local_network.target_return: discounted_returns1,
                self.local_network.observation: np.stack(observations1,
                                                         axis=0),
                self.local_network.options_placeholder: options1
            }
            to_run = [self.local_network.apply_grads_primitive_option]

            _ = self.sess.run(to_run, feed_dict=feed_dict)

        if len(observations2) > 0:

            feed_dict = {
                self.local_network.target_return: discounted_returns2,
                self.local_network.observation: np.stack(observations2,
                                                         axis=0),
                self.local_network.actions_placeholder: actions2,
                self.local_network.options_placeholder: options2
            }
            to_run = [
                self.local_network.apply_grads_option,
                self.local_network.merged_summary_option,
                self.local_network.option_loss, self.local_network.policy_loss,
                self.local_network.entropy_loss,
                self.local_network.critic_loss, self.local_network.term_loss
            ]

            if self.config.eigen:
                feed_dict[self.local_network.
                          target_eigen_return] = discounted_eigen_returns2
                to_run.append(self.local_network.eigen_critic_loss)

            results = self.sess.run(to_run, feed_dict=feed_dict)
            results.append(discounted_returns[-1])
            if self.config.eigen:
                results.append(discounted_eigen_returns[-1])
        else:
            return None

        return results[1:]
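
# Hypothetical unpacking of the values returned above (with config.eigen on);
# results[1:] drops the output of the apply-gradients op:
#
#     (summary, option_loss, policy_loss, entropy_loss, critic_loss,
#      term_loss, eigen_critic_loss, last_return, last_eigen_return) = \
#         agent.train_option(bootstrap_value, bootstrap_value_mix)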