def __call__(self, samples: SampleBatchType) -> SampleBatchType:
    _check_sample_batch_type(samples)
    wrapped = False

    # Temporarily wrap a single-agent SampleBatch as a MultiAgentBatch so both
    # cases can be handled uniformly below.
    if isinstance(samples, SampleBatch):
        samples = samples.as_multi_agent()
        wrapped = True

    # Standardize the requested fields in every policy's batch.
    for policy_id in samples.policy_batches:
        batch = samples.policy_batches[policy_id]
        for field in self.fields:
            if field not in batch:
                raise KeyError(
                    f"`{field}` not found in SampleBatch for policy "
                    f"`{policy_id}`! Maybe this policy fails to add "
                    f"{field} in its `postprocess_trajectory` method? Or "
                    "this policy is not meant to learn at all and you "
                    "forgot to add it to the list under `config."
                    "multiagent.policies_to_train`."
                )
            batch[field] = standardized(batch[field])

    # Unwrap back to a single-agent SampleBatch if we wrapped above.
    if wrapped:
        samples = samples.policy_batches[DEFAULT_POLICY_ID]

    return samples

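# For reference, a minimal sketch of what the `standardized()` helper used by the
# snippets in this file is assumed to do: per-batch zero-mean / unit-std
# normalization with a small epsilon guard. This is a sketch of the assumed
# behavior, not necessarily the exact library implementation.
import numpy as np

def standardized_sketch(array: np.ndarray) -> np.ndarray:
    # Shift to zero mean, then divide by the (epsilon-guarded) standard deviation.
    return (array - array.mean()) / max(1e-4, float(array.std()))
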
def inner_adaptation_steps(itr):
    buf = []
    split = []
    metrics = {}
    for samples in itr:
        # Processing Samples (Standardize Advantages)
        split_lst = []
        for sample in samples:
            sample["advantages"] = standardized(sample["advantages"])
            split_lst.append(sample.count)

        buf.extend(samples)
        split.append(split_lst)

        adapt_iter = len(split) - 1
        metrics = post_process_metrics(adapt_iter, workers, metrics)

        if len(split) > inner_steps:
            out = SampleBatch.concat_samples(buf)
            out["split"] = np.array(split)
            buf = []
            split = []

            # Reporting Adaptation Rew Diff
            ep_rew_pre = metrics["episode_reward_mean"]
            ep_rew_post = metrics["episode_reward_mean_adapt_" + str(inner_steps)]
            metrics["adaptation_delta"] = ep_rew_post - ep_rew_pre

            yield out, metrics
            metrics = {}
        else:
            inner_adaptation(workers, samples)

def __call__(self, samples: SampleBatchType) -> SampleBatchType:
    _check_sample_batch_type(samples)
    wrapped = False

    if isinstance(samples, SampleBatch):
        samples = MultiAgentBatch({DEFAULT_POLICY_ID: samples}, samples.count)
        wrapped = True

    for policy_id in samples.policy_batches:
        batch = samples.policy_batches[policy_id]
        for field in self.fields:
            batch[field] = standardized(batch[field])

    if wrapped:
        samples = samples.policy_batches[DEFAULT_POLICY_ID]

    return samples

def standardize_fields(samples: SampleBatchType, fields: List[str]) -> SampleBatchType:
    """Standardize fields of the given SampleBatch."""
    _check_sample_batch_type(samples)
    wrapped = False

    if isinstance(samples, SampleBatch):
        samples = samples.as_multi_agent()
        wrapped = True

    for policy_id in samples.policy_batches:
        batch = samples.policy_batches[policy_id]
        for field in fields:
            if field in batch:
                batch[field] = standardized(batch[field])

    if wrapped:
        samples = samples.policy_batches[DEFAULT_POLICY_ID]

    return samples

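# A small, hypothetical usage sketch for `standardize_fields`, assuming a
# single-agent SampleBatch and that `SampleBatch` is importable from
# `ray.rllib.policy.sample_batch` (variable names below are illustrative only):
import numpy as np
from ray.rllib.policy.sample_batch import SampleBatch

batch = SampleBatch({"advantages": np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32)})
batch = standardize_fields(batch, ["advantages"])
# The "advantages" column now has (approximately) zero mean and unit std.
print(batch["advantages"])
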
def post_process_samples(samples, config: AlgorithmConfigDict):
    # Instead of using NN for value function, we use regression.
    split_lst = []
    for sample in samples:
        # Split the rollout into individual episodes at the "dones" boundaries.
        indexes = np.asarray(sample["dones"]).nonzero()[0]
        indexes = indexes + 1

        reward_list = np.split(sample["rewards"], indexes)[:-1]
        observation_list = np.split(sample["obs"], indexes)[:-1]

        paths = []
        for i in range(0, len(reward_list)):
            paths.append({
                "rewards": reward_list[i],
                "observations": observation_list[i],
            })

        paths = calculate_gae_advantages(paths, config["gamma"], config["lambda"])

        advantages = np.concatenate([path["advantages"] for path in paths])
        sample["advantages"] = standardized(advantages)
        split_lst.append(sample.count)

    return samples, split_lst

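# A minimal sketch of the GAE(lambda) computation that `calculate_gae_advantages` is
# assumed to perform for each path, given per-step value predictions `values` with
# one extra bootstrap entry at the end (the baseline regression itself is omitted
# here; function and argument names are illustrative, not the library's API):
import numpy as np

def gae_advantages_sketch(rewards: np.ndarray, values: np.ndarray,
                          gamma: float, lam: float) -> np.ndarray:
    # values has len(rewards) + 1 entries; the last one is the bootstrap value.
    deltas = rewards + gamma * values[1:] - values[:-1]
    advantages = np.zeros_like(rewards, dtype=np.float32)
    running = 0.0
    # Backward accumulation of discounted TD residuals.
    for t in reversed(range(len(rewards))):
        running = deltas[t] + gamma * lam * running
        advantages[t] = running
    return advantages
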
def training_iteration(self) -> ResultDict:
    # Generate common experiences, collect batch for PPO, store every (DQN) batch
    # into replay buffer.
    ppo_batches = []
    num_env_steps = 0

    # PPO batch size fixed at 200.
    while num_env_steps < 200:
        ma_batches = synchronous_parallel_sample(
            worker_set=self.workers, concat=False
        )
        # Loop through the (parallelly collected) ma-batches.
        for ma_batch in ma_batches:
            # Update sampled counters.
            self._counters[NUM_ENV_STEPS_SAMPLED] += ma_batch.count
            self._counters[NUM_AGENT_STEPS_SAMPLED] += ma_batch.agent_steps()
            ppo_batch = ma_batch.policy_batches.pop("ppo_policy")
            # Add collected batches (only for DQN policy) to replay buffer.
            self.local_replay_buffer.add(ma_batch)

            ppo_batches.append(ppo_batch)
            num_env_steps += ppo_batch.count

    # DQN sub-flow.
    dqn_train_results = {}
    dqn_train_batch = self.local_replay_buffer.sample(num_items=64)
    if dqn_train_batch is not None:
        dqn_train_results = train_one_step(self, dqn_train_batch, ["dqn_policy"])
        self._counters["agent_steps_trained_DQN"] += dqn_train_batch.agent_steps()
        print(
            "DQN policy learning on samples from",
            "agent steps trained",
            dqn_train_batch.agent_steps(),
        )
    # Update DQN's target net every 500 train steps.
    if (
        self._counters["agent_steps_trained_DQN"]
        - self._counters[LAST_TARGET_UPDATE_TS]
        >= 500
    ):
        self.workers.local_worker().get_policy("dqn_policy").update_target()
        self._counters[NUM_TARGET_UPDATES] += 1
        self._counters[LAST_TARGET_UPDATE_TS] = self._counters[
            "agent_steps_trained_DQN"
        ]

    # PPO sub-flow.
    ppo_train_batch = SampleBatch.concat_samples(ppo_batches)
    self._counters["agent_steps_trained_PPO"] += ppo_train_batch.agent_steps()
    # Standardize advantages.
    ppo_train_batch[Postprocessing.ADVANTAGES] = standardized(
        ppo_train_batch[Postprocessing.ADVANTAGES]
    )
    print(
        "PPO policy learning on samples from",
        "agent steps trained",
        ppo_train_batch.agent_steps(),
    )
    ppo_train_batch = MultiAgentBatch(
        {"ppo_policy": ppo_train_batch}, ppo_train_batch.count
    )
    ppo_train_results = train_one_step(self, ppo_train_batch, ["ppo_policy"])

    # Combine results for PPO and DQN into one results dict.
    results = dict(ppo_train_results, **dqn_train_results)
    return results

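# A hypothetical configuration sketch for the mixed PPO/DQN workflow above, assuming
# two trained policies named "ppo_policy" and "dqn_policy" and an (illustrative)
# mapping that sends even-numbered agents to PPO and odd-numbered ones to DQN.
# The exact policy classes and config layout depend on the surrounding example and
# are not taken from the snippet above.
def policy_mapping_fn(agent_id, episode, worker, **kwargs):
    return "ppo_policy" if agent_id % 2 == 0 else "dqn_policy"

multiagent_config = {
    "multiagent": {
        # Policy specs (classes/spaces) omitted; the names must match those used
        # in `training_iteration()` above.
        "policies": {"ppo_policy", "dqn_policy"},
        "policy_mapping_fn": policy_mapping_fn,
        "policies_to_train": ["ppo_policy", "dqn_policy"],
    },
}
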
def post_process_samples(self, samples):
    """Standardize advantages in each sample and record each sample's size."""
    split_lst = []
    for sample in samples:
        sample["advantages"] = standardized(sample["advantages"])
        split_lst.append(sample.count)
    return samples, split_lst