def record_env_step(self, reward, done, info, traj_buffer_idx, rollout_step):
    """
    Policy inputs (obs) and policy outputs (actions, values, ...) for the current rollout step
    are already added to the trajectory buffer. The only job remaining is to add auxiliary data:
    rewards, done flags, etc.

    :param reward: last reward from the env step
    :param done: last value of done flag
    :param info: info dictionary
    :param traj_buffer_idx: index of the trajectory buffer we're currently using on this worker
    :param rollout_step: number of steps since we started the current rollout. When this reaches
        cfg.rollout we finalize the trajectory buffer and send it to the learner.
    """
    self.traj_tensors['rewards'][traj_buffer_idx, rollout_step][0] = float(reward)
    self.traj_tensors['dones'][traj_buffer_idx, rollout_step][0] = done

    # number of simulation frames this env step corresponds to (1 unless the env reports otherwise)
    env_steps = info.get('num_frames', 1)
    self.rollout_env_steps += env_steps
    self.last_episode_duration += env_steps

    if done:
        self.new_episode = True
        self.last_episode_true_reward = info.get('true_reward', self.last_episode_reward)
        self.last_episode_extra_stats = info.get('episode_extra_stats', dict())
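# The sketch below (not part of the worker) illustrates the assumed layout of
# self.traj_tensors and the indexing pattern used above. It assumes the buffers are
# torch tensors, which the [traj_buffer_idx, rollout_step][0] write suggests; the
# buffer dimensions and the concrete values are hypothetical.
#
#     import torch
#
#     num_traj_buffers, rollout = 2, 32
#     traj_tensors = {
#         'rewards': torch.zeros(num_traj_buffers, rollout, 1),
#         'dones': torch.zeros(num_traj_buffers, rollout, 1, dtype=torch.bool),
#     }
#
#     traj_buffer_idx, rollout_step = 0, 5
#     reward, done = 1.0, False
#
#     # same write pattern as record_env_step() above
#     traj_tensors['rewards'][traj_buffer_idx, rollout_step][0] = float(reward)
#     traj_tensors['dones'][traj_buffer_idx, rollout_step][0] = done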
def _on_new_policy(self, new_policy_id):
    """Called when the new policy is sampled for this actor."""
    self.curr_policy_id = new_policy_id

    # we're switching to a different policy - reset the rnn hidden state
    self._reset_rnn_state()

    if self.cfg.with_pbt and self.pbt_reward_shaping[self.curr_policy_id] is not None:
        set_reward_shaping(self.env, self.pbt_reward_shaping[self.curr_policy_id], self.agent_idx)
        set_training_info(self.env_training_info_interface, self.approx_env_steps.get(self.curr_policy_id, 0))
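# The sketch below shows when an actor would call _on_new_policy(): e.g. after a
# control message reassigns this actor to a different policy, which is why the RNN
# state and (with PBT) the reward shaping have to be refreshed. The
# _handle_policy_update() helper and the msg layout are assumptions for
# illustration only, not part of this class.
#
#     def _handle_policy_update(self, msg):
#         new_policy_id = msg['policy_id']  # hypothetical message field
#         if new_policy_id != self.curr_policy_id:
#             # recurrent state from the old policy is meaningless for the new one
#             self._on_new_policy(new_policy_id)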