Example #1
    def train(self,
              rollout,
              sess,
              bootstrap_value,
              bootstrap_sf,
              summaries=False):
        rollout = np.array(rollout)
        # Unpack the rollout columns (one entry per environment transition)
        observations = rollout[:, 0]
        options = rollout[:, 1]
        actions = rollout[:, 2]
        rewards = rollout[:, 3]
        timesteps = rollout[:, 4]
        done = rollout[:, 5]
        option_term = rollout[:, 6]
        values = rollout[:, 7]
        q_values = rollout[:, 8]
        niu = rollout[:, 9]
        sf = rollout[:, 10]

        # Bootstrap the reward and successor-feature sequences with the tail
        # estimates, then compute discounted n-step targets
        rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
        sf_plus = np.asarray(sf.tolist() + [bootstrap_sf])
        discounted_rewards = discount(rewards_plus, self.config.discount)[:-1]
        discounted_sf = discount(sf_plus, self.config.discount)[:-1]

        feed_dict = {
            self.local_network.target_return: discounted_rewards,
            self.local_network.target_r: rewards,
            self.local_network.target_sf: np.stack(discounted_sf, axis=0),
            self.local_network.delib: niu,
            self.local_network.observation: np.stack(observations, axis=0),
            self.local_network.actions_placeholder: actions,
            self.local_network.options_placeholder: options
        }

        _, ms, img_summ, loss, policy_loss, entropy_loss, sf_loss, instant_r_loss, auto_loss, term_loss = \
          sess.run([self.local_network.apply_grads,
                    self.local_network.merged_summary,
                    self.local_network.image_summaries,
                    self.local_network.loss,
                    self.local_network.policy_loss,
                    self.local_network.entropy_loss,
                    self.local_network.sf_loss,
                    self.local_network.instant_r_loss,
                    self.local_network.auto_loss,
                    self.local_network.term_loss],
                   feed_dict=feed_dict)
        # sess.run(self.update_local_vars)
        return ms, img_summ, loss, policy_loss, entropy_loss, sf_loss, instant_r_loss, auto_loss, term_loss
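
All of the examples on this page rely on a discount() helper that is not shown here. A minimal sketch, assuming it is the usual discounted-cumulative-sum helper found in A3C-style codebases (the scipy.signal implementation below is an assumption, not code taken from this repository):

import numpy as np
import scipy.signal


def discount(x, gamma):
    # Discounted cumulative sum along the time (leading) axis:
    #   y[t] = x[t] + gamma * y[t + 1]
    # Works for 1-D reward sequences as well as stacked feature vectors.
    x = np.asarray(x, dtype=np.float64)
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]

With such a helper, discount(rewards_plus, gamma)[:-1] above yields bootstrapped n-step returns, and the same call on the stacked feature sequence yields the targets fed to target_sf.
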
Example #2
    def train(self, rollout, sess, bootstrap_sf, summaries=False):
        rollout = np.array(rollout)
        observations = rollout[:, 0]
        fi = rollout[:, 1]
        next_observations = rollout[:, 2]
        actions = rollout[:, 3]

        sf_plus = np.asarray(fi.tolist() + [bootstrap_sf])
        discounted_sf = discount(sf_plus, self.config.discount)[:-1]

        feed_dict = {
            self.local_network.target_sf: np.stack(discounted_sf, axis=0),
            self.local_network.observation: np.stack(observations, axis=0),
            self.local_network.target_next_obs: np.stack(next_observations,
                                                         axis=0),
            self.local_network.actions_placeholder: actions
        }

        _, ms, loss, sf_loss, aux_loss = \
          sess.run([self.local_network.apply_grads,
                    self.local_network.merged_summary,
                    self.local_network.loss,
                    self.local_network.sf_loss,
                    self.local_network.aux_loss],
                   feed_dict=feed_dict)

        return ms, loss, sf_loss, aux_loss
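
The column indexing (rollout[:, 0], rollout[:, 1], ...) in these methods assumes the worker appends one list per transition to its rollout buffer. A hypothetical illustration of the layout expected by the method above (shapes and values are made up):

import numpy as np

# Hypothetical rollout assembly matching the column layout used above:
# each entry is [observation, phi(observation), next_observation, action].
rollout = []
obs = np.zeros((8, 8, 3))
for t in range(3):
    fi = np.random.randn(16)            # made-up feature vector
    action = np.random.randint(4)
    next_obs = np.random.rand(8, 8, 3)  # made-up next observation
    rollout.append([obs, fi, next_obs, action])
    obs = next_obs

# dtype=object keeps the ragged per-column shapes; recent NumPy requires it
rollout = np.array(rollout, dtype=object)
print(rollout.shape)  # (3, 4) -> rollout[:, 0] ... rollout[:, 3] as in train()
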
Example #3
    def train(self, rollout, sess, bootstrap_value, summaries=False):
        rollout = np.array(rollout)
        observations = rollout[:, 0]
        actions = rollout[:, 1]
        rewards = rollout[:, 2]
        timesteps = rollout[:, 3]
        values = rollout[:, 5]

        # Compute bootstrapped discounted returns as the critic target
        rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
        discounted_rewards = discount(rewards_plus, self.config.discount)[:-1]

        feed_dict = {
            self.local_network.target_return: discounted_rewards,
            self.local_network.observation: np.stack(observations, axis=0),
            self.local_network.actions_placeholder: actions
        }

        _, ms, img_summ, loss, option_policy_loss, option_entropy_loss, option_critic_loss = \
          sess.run([self.local_network.apply_grads,
                    self.local_network.merged_summary,
                    self.local_network.image_summaries,
                    self.local_network.loss,
                    self.local_network.option_policy_loss,
                    self.local_network.option_entropy_loss,
                    self.local_network.option_critic_loss],
                   feed_dict=feed_dict)
        return ms, img_summ, loss, option_policy_loss, option_entropy_loss, option_critic_loss
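
For clarity, the rewards_plus / discount(...)[:-1] pattern used above is equivalent to the following explicit backward recursion (a standalone illustration, not part of the original class):

import numpy as np


def bootstrapped_returns(rewards, bootstrap_value, gamma):
    # R_t = r_t + gamma * R_{t+1}, seeded with the bootstrap value that
    # estimates the return from the state following the rollout.
    returns = np.zeros(len(rewards))
    running = bootstrap_value
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns


print(bootstrapped_returns([1.0, 0.0, 2.0], bootstrap_value=0.5, gamma=0.9))
# [2.9845 2.205  2.45  ], the same values as discount([1.0, 0.0, 2.0, 0.5], 0.9)[:-1]
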
Example #4
    def train_sf(self, bootstrap_sf):
        rollout = np.array(self.episode_buffer_sf)

        observations = rollout[:, 0]

        feed_dict = {
            self.local_network.observation: np.stack(observations, axis=0)
        }
        fi = self.sess.run(self.local_network.fi, feed_dict=feed_dict)

        sf_plus = np.asarray(fi.tolist() + [bootstrap_sf])
        discounted_sf = discount(sf_plus, self.config.discount)[:-1]

        feed_dict = {
            self.local_network.target_sf: np.stack(discounted_sf, axis=0),
            self.local_network.observation: np.stack(observations, axis=0)
        }

        _, ms, sf_loss = \
          self.sess.run([self.local_network.apply_grads_sf,
                         self.local_network.merged_summary_sf,
                         self.local_network.sf_loss],
                        feed_dict=feed_dict)

        return ms, sf_loss
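
The successor-feature target built in train_sf follows the Bellman-style recursion psi(s_t) = phi(s_t) + gamma * psi(s_{t+1}), bootstrapped with the network's estimate for the state after the rollout. A standalone numeric sketch (the feature vectors are made up for illustration):

import numpy as np

gamma = 0.99
fi = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])  # made-up phi(s_t) for 3 steps
bootstrap_sf = np.array([0.5, 0.5])                  # psi estimate for the post-rollout state

# Backward recursion: target_psi(s_t) = phi(s_t) + gamma * target_psi(s_{t+1})
target_sf = np.zeros_like(fi)
running = bootstrap_sf
for t in reversed(range(len(fi))):
    running = fi[t] + gamma * running
    target_sf[t] = running
print(target_sf)  # matches discount(fi.tolist() + [bootstrap_sf], gamma)[:-1]
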
Example #5
    def train(self, rollout, sess, bootstrap_sf, summaries=False):
        rollout = np.array(rollout)
        # Tabular state indices; cast so they can be used as array indices below
        observations = rollout[:, 0].astype(np.int32)
        # actions = rollout[:, 1]
        # sf = rollout[:, 2]
        # fi = rollout[:, 3]
        # One-hot encode the states as features phi(s_t)
        fi = np.identity(self.nb_states)[observations]
        sf_plus = np.asarray(fi.tolist() + [bootstrap_sf])
        discounted_sf = discount(sf_plus, self.config.discount)[:-1]

        feed_dict = {
            self.local_network.target_sf: np.stack(discounted_sf, axis=0),
            self.local_network.observation:
            np.identity(self.nb_states)[observations]
        }

        _, ms, loss, sf_loss = \
          sess.run([self.local_network.apply_grads,
                    self.local_network.merged_summary,
                    self.local_network.loss,
                    self.local_network.sf_loss],
                   feed_dict=feed_dict)
        return ms, loss, sf_loss
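
A quick illustration of the np.identity(self.nb_states)[observations] idiom used above, which turns tabular state indices into one-hot feature vectors (the state count and indices are made up):

import numpy as np

nb_states = 4
observations = np.array([0, 2, 1], dtype=np.int32)  # made-up state indices

one_hot = np.identity(nb_states)[observations]  # row t encodes the state visited at step t
print(one_hot)
# [[1. 0. 0. 0.]
#  [0. 0. 1. 0.]
#  [0. 1. 0. 0.]]
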
Example #6
    def train_option(self, bootstrap_value, bootstrap_value_mix):
        # Each entry: s, option, action, r, r_i (eigen reward), primitive-action flag
        rollout = np.array(self.episode_buffer_option)
        observations = rollout[:, 0]
        options = rollout[:, 1]
        actions = rollout[:, 2]
        rewards = rollout[:, 3]
        eigen_rewards = rollout[:, 4]
        primitive_actions = rollout[:, 5]

        rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
        discounted_returns = reward_discount(rewards_plus,
                                             self.config.discount)[:-1]

        options1, options2, actions1, actions2, discounted_returns1, discounted_returns2, \
        observations1, observations2 = [], [], [], [], [], [], [], []

        if self.config.eigen:
            eigen_rewards_plus = np.asarray(eigen_rewards.tolist() +
                                            [bootstrap_value_mix])
            discounted_eigen_returns = discount(eigen_rewards_plus,
                                                self.config.discount)[:-1]
            discounted_eigen_returns1, discounted_eigen_returns2 = [], []

        # Split the rollout: steps where the chosen option was itself a
        # primitive action vs. steps governed by an intra-option policy
        for i, primitive in enumerate(primitive_actions):
            if primitive:
                options1.append(options[i])
                actions1.append(actions[i])
                discounted_returns1.append(discounted_returns[i])
                if self.config.eigen:
                    discounted_eigen_returns1.append(
                        discounted_eigen_returns[i])
                observations1.append(observations[i])
            else:
                options2.append(options[i])
                actions2.append(actions[i])
                discounted_returns2.append(discounted_returns[i])
                if self.config.eigen:
                    discounted_eigen_returns2.append(
                        discounted_eigen_returns[i])
                observations2.append(observations[i])

        if len(observations1) > 0:
            feed_dict = {
                self.local_network.target_return: discounted_returns1,
                self.local_network.observation: np.stack(observations1,
                                                         axis=0),
                self.local_network.options_placeholder: options1
            }
            to_run = [self.local_network.apply_grads_primitive_option]

            _ = self.sess.run(to_run, feed_dict=feed_dict)

        if len(observations2) > 0:

            feed_dict = {
                self.local_network.target_return: discounted_returns2,
                self.local_network.observation: np.stack(observations2,
                                                         axis=0),
                self.local_network.actions_placeholder: actions2,
                self.local_network.options_placeholder: options2
            }
            to_run = [
                self.local_network.apply_grads_option,
                self.local_network.merged_summary_option,
                self.local_network.option_loss, self.local_network.policy_loss,
                self.local_network.entropy_loss,
                self.local_network.critic_loss, self.local_network.term_loss
            ]

            if self.config.eigen:
                feed_dict[self.local_network.target_eigen_return] = \
                    discounted_eigen_returns2
                to_run.append(self.local_network.eigen_critic_loss)

            results = self.sess.run(to_run, feed_dict=feed_dict)
            results.append(discounted_returns[-1])
            if self.config.eigen:
                results.append(discounted_eigen_returns[-1])
        else:
            return None

        return results[1:]
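
The split performed by the loop in train_option can also be pictured with boolean masks; a standalone illustration with made-up data (the original code uses plain Python lists instead):

import numpy as np

options = np.array([4, 1, 4, 0])
actions = np.array([0, 2, 1, 3])
primitive = np.array([True, False, True, False])  # True: the option maps to a primitive action

# Branch 1: primitive-action steps (fed without the actions placeholder above)
options1 = options[primitive]
# Branch 2: regular option steps, trained with their intra-option policy and critic
options2, actions2 = options[~primitive], actions[~primitive]

print(options1)            # [4 4]
print(options2, actions2)  # [1 0] [2 3]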