Example #1
    def sample(self):
        if self._current_observation_n is None:
            self._current_observation_n = self.env.reset()
        action_n = []
        # Query each agent's policy with its own local observation.
        for agent, current_observation in zip(self.agents, self._current_observation_n):
            action, _ = agent.policy.get_action(current_observation)
            if agent.joint_policy:
                # A joint policy returns a concatenated action vector;
                # keep only the leading `_action_dim` entries.
                action_n.append(np.array(action)[0:agent._action_dim])
            else:
                action_n.append(np.array(action))
        next_observation_n, reward_n, done_n, info = self.env.step(action_n)
        self._path_length += 1
        self._path_return += np.array(reward_n, dtype=np.float32)
        self._total_samples += 1

        for i, agent in enumerate(self.agents):
            action = deepcopy(action_n[i])
            if agent.pool.joint:
                # Joint replay pools also store the flattened actions of all other agents.
                opponent_action = deepcopy(action_n)
                del opponent_action[i]
                opponent_action = np.array(opponent_action).flatten()
                agent.pool.add_sample(observation=self._current_observation_n[i],
                                      action=action,
                                      reward=reward_n[i],
                                      terminal=done_n[i],
                                      next_observation=next_observation_n[i],
                                      opponent_action=opponent_action)
            else:
                agent.pool.add_sample(observation=self._current_observation_n[i],
                                      action=action,
                                      reward=reward_n[i],
                                      terminal=done_n[i],
                                      next_observation=next_observation_n[i])

        # End the path when every agent is done or the maximum path length is reached.
        if np.all(done_n) or self._path_length >= self._max_path_length:
            self._current_observation_n = self.env.reset()
            self._max_path_return = np.maximum(self._max_path_return, self._path_return)
            self._mean_path_return = self._path_return / self._path_length
            self._last_path_return = self._path_return

            self._path_length = 0

            self._path_return = np.array([0.] * self.agent_num, dtype=np.float32)
            self._n_episodes += 1

            self.log_diagnostics()
            logger.dump_tabular(with_prefix=False)

        else:
            self._current_observation_n = next_observation_n
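
The bookkeeping at the end of `sample()` keeps `_path_return` as a NumPy vector with one entry per agent, so the max and mean updates at episode end are element-wise. A small self-contained illustration of that pattern, using made-up reward values rather than anything from the snippet above:

import numpy as np

agent_num = 3
path_return = np.zeros(agent_num, dtype=np.float32)             # running return, one entry per agent
max_path_return = np.full(agent_num, -np.inf, dtype=np.float32)

# Accumulate a couple of fake per-agent reward vectors.
for reward_n in ([1.0, 0.5, -0.2], [0.0, 1.5, 0.3]):
    path_return += np.array(reward_n, dtype=np.float32)

# At episode end, the element-wise maximum keeps the best return seen so far per agent.
max_path_return = np.maximum(max_path_return, path_return)
print(max_path_return)  # approximately [1.0, 2.0, 0.1]
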
Example #2
    def _train(self, env, policy, initial_exploration_policy, pool):
        """Perform RL training.

        Args:
            env (`rllab.Env`): Environment used for training.
            policy (`Policy`): Policy used for training.
            initial_exploration_policy (`Policy`): Policy used for exploration.
                If None, all exploration is done using `policy`.
            pool (`PoolBase`): Sample pool to add samples to.
        """

        self._init_training(env, policy, pool)
        # Sample with the learned policy from the start if no separate exploration policy is given.
        if initial_exploration_policy is None:
            self.sampler.initialize(env, policy, pool)
            initial_exploration_done = True
        else:
            self.sampler.initialize(env, initial_exploration_policy, pool)
            initial_exploration_done = False

        with self._sess.as_default():
            gt.rename_root('RLAlgorithm')
            gt.reset()
            gt.set_def_unique(False)

            for epoch in gt.timed_for(range(self._n_epochs + 1),
                                      save_itrs=True):
                logger.push_prefix('Epoch #%d | ' % epoch)

                for t in range(self._epoch_length):
                    # TODO.code consolidation: Add control interval to sampler
                    if not initial_exploration_done:
                        # Switch from the exploration policy to the learned policy once
                        # enough initial exploration steps have been collected.
                        if self._epoch_length * epoch >= self._n_initial_exploration_steps:
                            self.sampler.set_policy(policy)
                            initial_exploration_done = True
                    self.sampler.sample()
                    if not self.sampler.batch_ready():
                        continue
                    gt.stamp('sample')

                    for i in range(self._n_train_repeat):
                        self._do_training(
                            iteration=t + epoch * self._epoch_length,
                            batch=self.sampler.random_batch())
                    gt.stamp('train')

                self._evaluate(epoch)

                params = self.get_snapshot(epoch)
                logger.save_itr_params(epoch, params)
                times_itrs = gt.get_times().stamps.itrs

                eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
                total_time = gt.get_times().total
                logger.record_tabular('time-train', times_itrs['train'][-1])
                logger.record_tabular('time-eval', eval_time)
                logger.record_tabular('time-sample', times_itrs['sample'][-1])
                logger.record_tabular('time-total', total_time)
                logger.record_tabular('epoch', epoch)

                self.sampler.log_diagnostics()

                logger.dump_tabular(with_prefix=False)
                logger.pop_prefix()

                gt.stamp('eval')

            self.sampler.terminate()
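
The epoch loop above attributes wall-clock time to the sampling, training, and evaluation phases through the `gt` timing helper (presumably the `gtimer` package): `gt.timed_for` times each epoch, `gt.stamp` marks the end of a phase, and `gt.get_times().stamps.itrs` exposes the per-iteration timings that get logged. A minimal sketch of that pattern, mirroring only the calls that appear in the snippet and using `time.sleep` as a stand-in for real work:

import time
import gtimer as gt  # assumed import; the snippet itself only shows the `gt` alias

for epoch in gt.timed_for(range(3), save_itrs=True):
    time.sleep(0.01)          # stand-in for sampling
    gt.stamp('sample')
    time.sleep(0.02)          # stand-in for gradient steps
    gt.stamp('train')

    times_itrs = gt.get_times().stamps.itrs
    print(epoch,
          times_itrs['sample'][-1],   # seconds spent sampling this epoch
          times_itrs['train'][-1])    # seconds spent training this epoch
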
Example #3
    def sample(self):
        self.step += 1
        if self._current_observation_n is None:
            self._current_observation_n = self.env.reset()
        action_n = []
        for agent, current_observation in zip(self.agents, self._current_observation_n):
            action, _ = agent.policy.get_action(current_observation)
            if agent.joint_policy:
                action_n.append(np.array(action)[0:agent._action_dim])
            else:
                action_n.append(np.array(action))

        action_n = np.asarray(action_n)

        next_observation_n, reward_n, done_n, info = self.env.step(action_n)
        if self.global_reward:
            # With a shared global reward, every agent receives the sum of all agents' rewards.
            reward_n = np.array([np.sum(reward_n)] * self.agent_num)

        self._path_length += 1
        self._path_return += np.array(reward_n, dtype=np.float32)
        self._total_samples += 1

        for i, agent in enumerate(self.agents):
            action = deepcopy(action_n[i])
            if agent.pool.joint:
                # Store the other agents' flattened actions alongside this agent's transition.
                opponent_action = deepcopy(action_n)
                opponent_action = np.delete(opponent_action, i, 0)
                opponent_action = np.array(opponent_action).flatten()
                agent.pool.add_sample(observation=self._current_observation_n[i],
                                      action=action,
                                      reward=reward_n[i],
                                      terminal=done_n[i],
                                      next_observation=next_observation_n[i],
                                      opponent_action=opponent_action)
            else:
                agent.pool.add_sample(observation=self._current_observation_n[i],
                                      action=action,
                                      reward=reward_n[i],
                                      terminal=done_n[i],
                                      next_observation=next_observation_n[i])
        # Advance the stored observation; it is overwritten by env.reset() below once the episode ends.
        self._current_observation_n = next_observation_n
        for i, rew in enumerate(reward_n):
            self.episode_rewards[-1] += rew
            self.agent_rewards[i][-1] += rew

        if self.step % (25 * 1000) == 0:
            print("steps: {}, episodes: {}, mean episode reward: {}".format(
                        self.step, len(self.episode_rewards), np.mean(self.episode_rewards[-1000:])))
        if np.all(done_n) or self._path_length >= self._max_path_length:
            self._current_observation_n = self.env.reset()
            self._max_path_return = np.maximum(self._max_path_return, self._path_return)
            self._mean_path_return = self._path_return / self._path_length
            self._last_path_return = self._path_return
            self.episode_rewards.append(0)
            for a in self.agent_rewards:
                a.append(0)
            self._path_length = 0

            self._path_return = np.array([0.] * self.agent_num, dtype=np.float32)
            self._n_episodes += 1
            self.log_diagnostics()
            logger.dump_tabular(with_prefix=False)

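
Example #1 removes the agent's own entry from the action list with `del`, while this example does the same with `np.delete` on an array; both yield the same flattened opponent-action vector that is stored in the joint replay pool. A small self-contained comparison with made-up actions:

import numpy as np
from copy import deepcopy

# Three agents with 2-dimensional actions (made-up values).
action_n = [np.array([0.1, 0.2]), np.array([0.3, 0.4]), np.array([0.5, 0.6])]
i = 1  # build the opponent actions seen by agent 1

# List-based variant (Example #1): copy, drop entry i, then flatten.
opponent_list = deepcopy(action_n)
del opponent_list[i]
opponent_a = np.array(opponent_list).flatten()

# Array-based variant (Example #3): delete row i along axis 0, then flatten.
opponent_b = np.delete(np.asarray(action_n), i, 0).flatten()

print(opponent_a)                           # [0.1 0.2 0.5 0.6]
print(np.allclose(opponent_a, opponent_b))  # True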