    def record_env_step(self, reward, done, info, traj_buffer_idx, rollout_step):
        """
        Policy inputs (obs) and policy outputs (actions, values, ...) for the current rollout step
        are already added to the trajectory buffer.
        The only job remaining is to add auxiliary data: rewards, done flags, etc.

        :param reward: last reward from the env step
        :param done: last value of done flag
        :param info: info dictionary
        :param traj_buffer_idx: index of the trajectory buffer we're currently using on this worker
        :param rollout_step: number of steps since we started the current rollout. When this reaches cfg.rollout
        we finalize the trajectory buffer and send it to the learner.
        """

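        # write the latest reward and done flag into the trajectory buffer slot for this
        # rollout step (the tensors have a trailing dimension of 1, hence the final [0] index)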
        self.traj_tensors['rewards'][traj_buffer_idx, rollout_step][0] = float(reward)
        self.traj_tensors['dones'][traj_buffer_idx, rollout_step][0] = done

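        # the env may simulate more than one frame per action (e.g. with frameskip)
        # and can report this via info['num_frames']; otherwise we count a single step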
        env_steps = info.get('num_frames', 1)
        self.rollout_env_steps += env_steps
        self.last_episode_duration += env_steps

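        # end of episode: remember episode-level stats reported by the environment,
        # falling back to the accumulated episode reward if no 'true_reward' is provided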
        if done:
            self.new_episode = True
            self.last_episode_true_reward = info.get('true_reward', self.last_episode_reward)
            self.last_episode_extra_stats = info.get('episode_extra_stats', dict())

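            # refresh the env-side training info, i.e. the approximate number of env steps
            # collected so far for the current policy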
            set_training_info(self.env_training_info_interface, self.approx_env_steps.get(self.curr_policy_id, 0))
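
    # Illustrative sketch (hypothetical caller, not part of this class): the rollout loop
    # that drives this method might look roughly like the following, where `env`,
    # `action` and `actor_state` are placeholders for objects managed elsewhere:
    #
    #   for rollout_step in range(cfg.rollout):
    #       obs, reward, done, info = env.step(action)  # policy outputs already recorded
    #       actor_state.record_env_step(reward, done, info, traj_buffer_idx, rollout_step)
    #   # once rollout_step reaches cfg.rollout, the trajectory buffer is finalized
    #   # and sent to the learner (see the docstring above)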

    def _on_new_policy(self, new_policy_id):
        """Called when the new policy is sampled for this actor."""
        self.curr_policy_id = new_policy_id
        # we're switching to a different policy - reset the rnn hidden state
        self._reset_rnn_state()

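        # with population-based training (PBT), each policy can have its own reward shaping
        # scheme; apply the scheme of the newly sampled policy to this agent's env and
        # refresh the env-side training info for that policy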
        if self.cfg.with_pbt and self.pbt_reward_shaping[self.curr_policy_id] is not None:
            set_reward_shaping(self.env, self.pbt_reward_shaping[self.curr_policy_id], self.agent_idx)
            set_training_info(self.env_training_info_interface, self.approx_env_steps.get(self.curr_policy_id, 0))