def sample(self):
    # Reset the environment on the very first call.
    if self._current_observation_n is None:
        self._current_observation_n = self.env.reset()

    # Query each agent's policy for an action on its own observation.
    action_n = []
    for agent, current_observation in zip(self.agents, self._current_observation_n):
        action, _ = agent.policy.get_action(current_observation)
        if agent.joint_policy:
            # A joint policy outputs all agents' actions; keep only this agent's slice.
            action_n.append(np.array(action)[0:agent._action_dim])
        else:
            action_n.append(np.array(action))

    next_observation_n, reward_n, done_n, info = self.env.step(action_n)

    self._path_length += 1
    self._path_return += np.array(reward_n, dtype=np.float32)
    self._total_samples += 1

    # Store the transition in each agent's replay pool.
    for i, agent in enumerate(self.agents):
        action = deepcopy(action_n[i])
        if agent.pool.joint:
            # Joint pools additionally store the concatenated actions of all opponents.
            opponent_action = deepcopy(action_n)
            del opponent_action[i]
            opponent_action = np.array(opponent_action).flatten()
            agent.pool.add_sample(observation=self._current_observation_n[i],
                                  action=action,
                                  reward=reward_n[i],
                                  terminal=done_n[i],
                                  next_observation=next_observation_n[i],
                                  opponent_action=opponent_action)
        else:
            agent.pool.add_sample(observation=self._current_observation_n[i],
                                  action=action,
                                  reward=reward_n[i],
                                  terminal=done_n[i],
                                  next_observation=next_observation_n[i])

    if np.all(done_n) or self._path_length >= self._max_path_length:
        # Episode finished: log episode statistics and reset the environment.
        self._current_observation_n = self.env.reset()
        self._max_path_return = np.maximum(self._max_path_return, self._path_return)
        self._mean_path_return = self._path_return / self._path_length
        self._last_path_return = self._path_return
        self._path_length = 0
        self._path_return = np.array([0.] * self.agent_num, dtype=np.float32)
        self._n_episodes += 1
        self.log_diagnostics()
        logger.dump_tabular(with_prefix=False)
    else:
        self._current_observation_n = next_observation_n
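# A minimal sketch (not the project's actual implementation) of the per-agent replay
# pool interface that sample() above relies on: `pool.joint` decides whether an
# `opponent_action` is stored with each transition, and add_sample(...) accepts the
# keyword arguments used above. The class name `SketchReplayPool` and its internals
# are illustrative assumptions only.
import numpy as np


class SketchReplayPool:
    def __init__(self, joint=False):
        self.joint = joint      # whether opponent actions are stored
        self._storage = []      # list of transition dicts

    def add_sample(self, observation, action, reward, terminal,
                   next_observation, opponent_action=None):
        # Store one transition; opponent_action is only expected when joint=True.
        self._storage.append(dict(
            observation=np.asarray(observation),
            action=np.asarray(action),
            reward=float(reward),
            terminal=bool(terminal),
            next_observation=np.asarray(next_observation),
            opponent_action=None if opponent_action is None else np.asarray(opponent_action)))

    def random_batch(self, batch_size):
        # Uniformly sample stored transitions with replacement.
        idx = np.random.randint(0, len(self._storage), size=batch_size)
        return [self._storage[i] for i in idx]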
def _train(self, env, policy, initial_exploration_policy, pool):
    """Perform RL training.

    Args:
        env (`rllab.Env`): Environment used for training.
        policy (`Policy`): Policy used for training.
        initial_exploration_policy (`Policy`): Policy used for exploration.
            If None, then all exploration is done using policy.
        pool (`PoolBase`): Sample pool to add samples to.
    """
    self._init_training(env, policy, pool)

    if initial_exploration_policy is None:
        self.sampler.initialize(env, policy, pool)
        initial_exploration_done = True
    else:
        self.sampler.initialize(env, initial_exploration_policy, pool)
        initial_exploration_done = False

    with self._sess.as_default():
        gt.rename_root('RLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)

        for epoch in gt.timed_for(range(self._n_epochs + 1), save_itrs=True):
            logger.push_prefix('Epoch #%d | ' % epoch)

            for t in range(self._epoch_length):
                # TODO.code consolidation: Add control interval to sampler
                if not initial_exploration_done:
                    # Switch from the exploration policy to the learned policy once
                    # enough initial exploration steps have been collected.
                    if self._epoch_length * epoch >= self._n_initial_exploration_steps:
                        self.sampler.set_policy(policy)
                        initial_exploration_done = True

                self.sampler.sample()
                if not self.sampler.batch_ready():
                    continue
                gt.stamp('sample')

                for i in range(self._n_train_repeat):
                    self._do_training(iteration=t + epoch * self._epoch_length,
                                      batch=self.sampler.random_batch())
                gt.stamp('train')

            self._evaluate(epoch)

            params = self.get_snapshot(epoch)
            logger.save_itr_params(epoch, params)
            times_itrs = gt.get_times().stamps.itrs

            eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
            total_time = gt.get_times().total
            logger.record_tabular('time-train', times_itrs['train'][-1])
            logger.record_tabular('time-eval', eval_time)
            logger.record_tabular('time-sample', times_itrs['sample'][-1])
            logger.record_tabular('time-total', total_time)
            logger.record_tabular('epoch', epoch)

            self.sampler.log_diagnostics()

            logger.dump_tabular(with_prefix=False)
            logger.pop_prefix()

            gt.stamp('eval')

        self.sampler.terminate()
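# A minimal standalone sketch of the gtimer pattern that _train() uses for its timing
# diagnostics: each gt.stamp(name) inside gt.timed_for records the time elapsed since
# the previous stamp, and gt.get_times().stamps.itrs[name][-1] is the most recent
# per-iteration value (the quantity logged as 'time-train', 'time-sample', etc.).
# The sleeps below are placeholders for sampling and training work.
import time
import gtimer as gt

gt.rename_root('RLAlgorithm')
gt.reset()
gt.set_def_unique(False)

for epoch in gt.timed_for(range(3), save_itrs=True):
    time.sleep(0.01)    # stand-in for environment sampling
    gt.stamp('sample')
    time.sleep(0.02)    # stand-in for gradient updates
    gt.stamp('train')

itrs = gt.get_times().stamps.itrs
print('last sample time:', itrs['sample'][-1])
print('last train time:', itrs['train'][-1])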
def sample(self):
    self.step += 1

    # Reset the environment on the very first call.
    if self._current_observation_n is None:
        self._current_observation_n = self.env.reset()

    # Query each agent's policy for an action on its own observation.
    action_n = []
    for agent, current_observation in zip(self.agents, self._current_observation_n):
        action, _ = agent.policy.get_action(current_observation)
        if agent.joint_policy:
            # A joint policy outputs all agents' actions; keep only this agent's slice.
            action_n.append(np.array(action)[0:agent._action_dim])
        else:
            action_n.append(np.array(action))
    action_n = np.asarray(action_n)

    next_observation_n, reward_n, done_n, info = self.env.step(action_n)

    if self.global_reward:
        # Cooperative setting: every agent receives the summed team reward.
        reward_n = np.array([np.sum(reward_n)] * self.agent_num)

    self._path_length += 1
    self._path_return += np.array(reward_n, dtype=np.float32)
    self._total_samples += 1

    # Store the transition in each agent's replay pool.
    for i, agent in enumerate(self.agents):
        action = deepcopy(action_n[i])
        if agent.pool.joint:
            # Joint pools additionally store the concatenated actions of all opponents.
            opponent_action = deepcopy(action_n)
            opponent_action = np.delete(opponent_action, i, 0)
            opponent_action = np.array(opponent_action).flatten()
            agent.pool.add_sample(observation=self._current_observation_n[i],
                                  action=action,
                                  reward=reward_n[i],
                                  terminal=done_n[i],
                                  next_observation=next_observation_n[i],
                                  opponent_action=opponent_action)
        else:
            agent.pool.add_sample(observation=self._current_observation_n[i],
                                  action=action,
                                  reward=reward_n[i],
                                  terminal=done_n[i],
                                  next_observation=next_observation_n[i])

    self._current_observation_n = next_observation_n

    # Accumulate per-episode reward statistics.
    for i, rew in enumerate(reward_n):
        self.episode_rewards[-1] += rew
        self.agent_rewards[i][-1] += rew

    if self.step % (25 * 1000) == 0:
        print("steps: {}, episodes: {}, mean episode reward: {}".format(
            self.step, len(self.episode_rewards), np.mean(self.episode_rewards[-1000:])))

    if np.all(done_n) or self._path_length >= self._max_path_length:
        # Episode finished: log episode statistics and reset the environment.
        self._current_observation_n = self.env.reset()
        self._max_path_return = np.maximum(self._max_path_return, self._path_return)
        self._mean_path_return = self._path_return / self._path_length
        self._last_path_return = self._path_return
        self.episode_rewards.append(0)
        for a in self.agent_rewards:
            a.append(0)
        self._path_length = 0
        self._path_return = np.array([0.] * self.agent_num, dtype=np.float32)
        self._n_episodes += 1
        self.log_diagnostics()
        logger.dump_tabular(with_prefix=False)
    else:
        self._current_observation_n = next_observation_n
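# A small worked example (illustrative only) of the opponent-action handling in
# sample() above: with action_n converted to a 2-D array, np.delete(action_n, i, 0)
# drops agent i's row and .flatten() concatenates the remaining agents' actions,
# which is what gets stored as `opponent_action` in a joint replay pool.
import numpy as np

action_n = np.asarray([[0.1, 0.2],   # agent 0
                       [0.3, 0.4],   # agent 1
                       [0.5, 0.6]])  # agent 2

i = 1                                # index of the current agent
opponent_action = np.delete(action_n, i, 0).flatten()
print(opponent_action)               # [0.1 0.2 0.5 0.6]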