from typing import Dict, Optional

# Typical imports for an RLlib (~1.x) DQN-family policy module; exact paths
# can differ slightly between RLlib versions.
from ray.rllib.agents.dqn.dqn_tf_policy import postprocess_nstep_and_prio
from ray.rllib.evaluation.episode import MultiAgentEpisode
from ray.rllib.policy.policy import Policy
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.utils.typing import AgentID


def postprocess_trajectory(
        policy: Policy,
        sample_batch: SampleBatch,
        other_agent_batches: Optional[Dict[AgentID, SampleBatch]] = None,
        episode: Optional[MultiAgentEpisode] = None) -> SampleBatch:
    """Postprocesses a trajectory and returns the processed trajectory.

    The trajectory contains only data from one episode and from one agent.
    - If `config.batch_mode=truncate_episodes` (default), sample_batch may
      contain a truncated (at-the-end) episode, in case the
      `config.rollout_fragment_length` was reached by the sampler.
    - If `config.batch_mode=complete_episodes`, sample_batch will contain
      exactly one episode (no matter how long).
    New columns can be added to sample_batch and existing ones may be altered.

    Args:
        policy (Policy): The Policy used to generate the trajectory
            (`sample_batch`).
        sample_batch (SampleBatch): The SampleBatch to postprocess.
        other_agent_batches (Optional[Dict[AgentID, SampleBatch]]): Optional
            dict of AgentIDs mapping to other agents' trajectory data (from
            the same episode). NOTE: The other agents use the same policy.
        episode (Optional[MultiAgentEpisode]): Optional multi-agent episode
            object in which the agents operated.

    Returns:
        SampleBatch: The postprocessed, modified SampleBatch (or a new one).
    """
    return postprocess_nstep_and_prio(policy, sample_batch)
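`postprocess_nstep_and_prio` is RLlib's standard DQN postprocessing helper: it folds n-step returns into the rewards column and then computes initial replay priorities. The sketch below is not RLlib's implementation; it only illustrates, with made-up function and argument names, what the n-step folding does to the `rewards`, `new_obs` and `dones` columns.

import numpy as np


def nstep_fold_sketch(n_step, gamma, rewards, new_obs, dones):
    # Illustrative only: fold each reward into the discounted sum of the next
    # `n_step` rewards and shift `new_obs`/`dones` to the state reached n
    # steps later, stopping at episode boundaries.
    rewards = np.array(rewards, dtype=np.float64)
    new_obs, dones = list(new_obs), list(dones)
    for t in range(len(rewards)):
        for k in range(1, n_step):
            if t + k >= len(rewards) or dones[t]:
                break
            rewards[t] += gamma**k * rewards[t + k]
            new_obs[t] = new_obs[t + k]
            dones[t] = dones[t + k]
    return rewards, new_obs, dones


r, o, d = nstep_fold_sketch(
    n_step=3, gamma=0.99,
    rewards=[1.0, 1.0, 1.0, 1.0],
    new_obs=[1, 2, 3, 4],
    dones=[False, False, False, True])
# r[0] == 1 + 0.99 + 0.99 ** 2 ≈ 2.9701; o[0] == 3; d[3] stays True.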
def postprocess_with_HER(policy,
                         sample_batch,
                         _other_agent_batches=None,
                         _episode=None):
    """Postprocess the sampled batch, injecting trajectories that replay the
    same transitions under modified (hindsight) goal conditions."""
    # Hindsight Experience Replay trajectory augmentation.
    if type(sample_batch) is SampleBatch:
        # Init the list of trajectories with the original one.
        augmented_trajs = [sample_batch]
        # Init the HER sampling strategy.
        her_sampler = SamplingStrategy(policy, sample_batch)
        # Sample `num_her_traj` new trajectories using the sampling strategy.
        for _ in range(policy.config['num_her_traj']):
            augmented_trajs.append(her_sampler.sample_trajectory())
        # Concatenate the original and the relabeled trajectories.
        sample_batch = SampleBatch.concat_samples(augmented_trajs)
    # RLlib's original DQN postprocess_fn implementation (n-step + priorities).
    sample_batch = postprocess_nstep_and_prio(policy, sample_batch,
                                              _other_agent_batches, _episode)
    return sample_batch
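`SamplingStrategy` and the `num_her_traj` config key come from the surrounding project rather than from RLlib. Below is a minimal sketch of one common strategy ("final"-goal relabeling) that such a sampler might implement; it assumes observations are flat arrays whose last `goal_dim` entries hold the desired goal, that the first `goal_dim` entries of the next observation hold the achieved goal, and that the reward is a sparse 0/-1 signal. The function name, layout, and tolerance are all illustrative assumptions.

import numpy as np
from ray.rllib.policy.sample_batch import SampleBatch


def relabel_with_final_goal(batch, goal_dim, tol=0.05):
    # Illustrative "final" HER strategy: pretend the goal actually achieved at
    # the end of the trajectory was the desired goal all along, then recompute
    # the sparse reward under that substituted goal.
    traj = batch.copy()
    obs = np.array(traj[SampleBatch.CUR_OBS], copy=True)
    new_obs = np.array(traj[SampleBatch.NEXT_OBS], copy=True)
    final_achieved = new_obs[-1, :goal_dim]
    obs[:, -goal_dim:] = final_achieved
    new_obs[:, -goal_dim:] = final_achieved
    achieved = new_obs[:, :goal_dim]
    dist = np.linalg.norm(achieved - final_achieved, axis=-1)
    traj[SampleBatch.CUR_OBS] = obs
    traj[SampleBatch.NEXT_OBS] = new_obs
    traj[SampleBatch.REWARDS] = np.where(dist < tol, 0.0, -1.0)
    return traj

A fuller SamplingStrategy would typically also support "future" or "episode" goal sampling and recompute rewards with the environment's own reward function instead of a hard-coded distance threshold.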
def postprocess_trajectory(policy,
                           sample_batch,
                           other_agent_batches=None,
                           episode=None):
    if 'infos' not in sample_batch:
        # No info dicts available: fall back to marking every timestep as
        # belonging to member 1.
        sample_batch['members'] = np.ones_like(
            sample_batch[SampleBatch.REWARDS]).astype(np.int32)
        print("infos field not in sample_batch !!!")
    else:
        # One env info dict per timestep; record which member was active.
        sample_batch['members'] = np.array(
            [info['active_member'] for info in sample_batch['infos']],
            dtype=np.int32)
    return postprocess_nstep_and_prio(policy, sample_batch)
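For reference, this is how the extraction behaves on a small hand-built batch; the data and the downstream use of the `members` column are made up for illustration.

import numpy as np
from ray.rllib.policy.sample_batch import SampleBatch

batch = SampleBatch({
    SampleBatch.REWARDS: np.array([0.0, 1.0, 0.0]),
    "infos": np.array([{"active_member": 0},
                       {"active_member": 2},
                       {"active_member": 1}]),
})
batch["members"] = np.array(
    [info["active_member"] for info in batch["infos"]], dtype=np.int32)
# batch["members"] -> array([0, 2, 1], dtype=int32); downstream code can use
# this column to mask or select per-member loss terms.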
def postprocess_trajectory(self,
                           sample_batch,
                           other_agent_batches=None,
                           episode=None):
    if self.config["parameter_noise"]:
        # adjust the sigma of parameter space noise
        states, noisy_actions = [
            list(x) for x in sample_batch.columns(
                [SampleBatch.CUR_OBS, SampleBatch.ACTIONS])
        ]
        self.sess.run(self.remove_parameter_noise_op)

        # TODO(sven): This won't work if exploration != Noise, which is
        #  probably fine as parameter_noise will soon be its own
        #  Exploration class.
        clean_actions, cur_noise_scale = self.sess.run(
            [self.output_actions,
             self.exploration.get_info()],
            feed_dict={
                self.cur_observations: states,
                self._is_exploring: False,
                self._timestep: self.global_timestep,
            })
        distance_in_action_space = np.sqrt(
            np.mean(np.square(clean_actions - noisy_actions)))
        self.pi_distance = distance_in_action_space
        if distance_in_action_space < \
                self.config["exploration_config"].get("ou_sigma", 0.2) * \
                cur_noise_scale:
            # multiplying the sampled OU noise by noise scale is
            # equivalent to multiplying the sigma of OU by noise scale
            self.parameter_noise_sigma_val *= 1.01
        else:
            self.parameter_noise_sigma_val /= 1.01
        self.parameter_noise_sigma.load(
            self.parameter_noise_sigma_val, session=self.sess)

    return postprocess_nstep_and_prio(self, sample_batch)
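The session calls above belong to the old TF DDPG policy; the adaptation rule itself is small and is isolated below with illustrative names (none of this is RLlib API): grow the parameter-noise sigma while the perturbed policy's actions stay within the target distance of the clean policy's actions, and shrink it otherwise.

import numpy as np


def adapt_param_noise_sigma(sigma, clean_actions, noisy_actions,
                            target_distance, factor=1.01):
    # Mean action-space distance between the unperturbed policy and the
    # parameter-noise-perturbed policy on the same observations.
    distance = np.sqrt(np.mean(np.square(clean_actions - noisy_actions)))
    return sigma * factor if distance < target_distance else sigma / factor


sigma = adapt_param_noise_sigma(
    sigma=0.1,
    clean_actions=np.array([[0.20, -0.10]]),
    noisy_actions=np.array([[0.25, -0.05]]),
    target_distance=0.2)
# distance ≈ 0.05 < 0.2, so sigma grows from 0.1 to ≈ 0.101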
def postprocess_trajectory(policy,
                           sample_batch,
                           other_agent_batches=None,
                           episode=None):
    return postprocess_nstep_and_prio(policy, sample_batch)
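Whichever variant is used, the function has to be registered as the policy's `postprocess_fn`. A minimal sketch, assuming the older RLlib policy-template API; the class name `MyDQNPolicy` is made up, and the exact import path varies by RLlib version.

from ray.rllib.agents.dqn.dqn_tf_policy import DQNTFPolicy

MyDQNPolicy = DQNTFPolicy.with_updates(
    name="MyDQNPolicy",
    postprocess_fn=postprocess_trajectory,
)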