def postprocess_trajectory(self, sample_batch, other_agent_batches=None, episode=None):
    sample_batch = super().postprocess_trajectory(sample_batch)
    return compute_gae_for_sample_batch(self, sample_batch,
                                        other_agent_batches, episode)
def postprocess_trajectory(
    policy: Policy,
    sample_batch: SampleBatch,
    other_agent_batches: Optional[Dict[AgentID, SampleBatch]] = None,
    episode: Optional[Episode] = None,
) -> SampleBatch:
    """Postprocesses a trajectory and returns the processed trajectory.

    The trajectory contains only data from one episode and from one agent.
    - If `config.batch_mode=truncate_episodes` (default), sample_batch may
      contain a truncated (at-the-end) episode, in case the
      `config.rollout_fragment_length` was reached by the sampler.
    - If `config.batch_mode=complete_episodes`, sample_batch will contain
      exactly one episode (no matter how long).

    New columns can be added to sample_batch and existing ones may be altered.

    Args:
        policy (Policy): The Policy used to generate the trajectory
            (`sample_batch`).
        sample_batch (SampleBatch): The SampleBatch to postprocess.
        other_agent_batches (Optional[Dict[AgentID, SampleBatch]]): Optional
            dict of AgentIDs mapping to other agents' trajectory data (from
            the same episode). NOTE: The other agents use the same policy.
        episode (Optional[Episode]): Optional multi-agent episode object in
            which the agents operated.

    Returns:
        SampleBatch: The postprocessed, modified SampleBatch (or a new one).
    """
    if not policy.config["vtrace"]:
        sample_batch = compute_gae_for_sample_batch(
            policy, sample_batch, other_agent_batches, episode)
    return sample_batch
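For reference, the advantage computation that this postprocessing step delegates to follows the standard generalized advantage estimation (GAE) recursion. Below is a minimal, self-contained NumPy sketch of that recursion; the function name `gae_sketch` and the `last_value` bootstrap argument are illustrative, not RLlib's API. With `lambda_=1.0` it reduces to discounted returns minus the value baseline.

import numpy as np

def gae_sketch(rewards, vf_preds, gamma=0.99, lambda_=0.95, last_value=0.0):
    # Illustrative sketch of GAE; not RLlib code.
    rewards = np.asarray(rewards, dtype=np.float32)
    vf_preds = np.asarray(vf_preds, dtype=np.float32)
    # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    values_tp1 = np.append(vf_preds[1:], last_value)
    deltas = rewards + gamma * values_tp1 - vf_preds
    # A_t = delta_t + gamma * lambda * A_{t+1}, accumulated backwards in time.
    advantages = np.zeros_like(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = deltas[t] + gamma * lambda_ * running
        advantages[t] = running
    # Value targets used for the value-function loss.
    value_targets = advantages + vf_preds
    return advantages, value_targets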
def postprocess_ppo_gae(
        policy: Policy,
        sample_batch: SampleBatch,
        other_agent_batches: Optional[Dict[AgentID, SampleBatch]] = None,
        episode: Optional[Episode] = None) -> SampleBatch:
    return compute_gae_for_sample_batch(policy, sample_batch,
                                        other_agent_batches, episode)
def add_advantages(
        policy: Policy,
        sample_batch: SampleBatch,
        other_agent_batches: Optional[Dict[PolicyID, SampleBatch]] = None,
        episode: Optional[MultiAgentEpisode] = None) -> SampleBatch:
    return compute_gae_for_sample_batch(policy, sample_batch,
                                        other_agent_batches, episode)
def postprocess_trajectory(
    self,
    sample_batch: SampleBatch,
    other_agent_batches: Optional[Dict[AgentID, SampleBatch]] = None,
    episode: Optional[Episode] = None,
):
    sample_batch = super().postprocess_trajectory(sample_batch)
    return compute_gae_for_sample_batch(self, sample_batch,
                                        other_agent_batches, episode)
def postprocess_trajectory(self, sample_batch, other_agent_batches=None, episode=None):
    # Do all post-processing always with no_grad().
    # Not using this here will introduce a memory leak
    # in torch (issue #6962).
    # TODO: no_grad still necessary?
    with torch.no_grad():
        return compute_gae_for_sample_batch(self, sample_batch,
                                            other_agent_batches, episode)
def postprocess_trajectory(
    self,
    sample_batch: SampleBatch,
    other_agent_batches: Optional[SampleBatch] = None,
    episode: Optional["Episode"] = None,
):
    if not self.config["vtrace"]:
        sample_batch = compute_gae_for_sample_batch(
            self, sample_batch, other_agent_batches, episode)
    return sample_batch
def test_ppo_free_log_std(self):
    """Tests the free log std option works."""
    config = (
        ppo.PPOConfig()
        .rollouts(
            num_rollout_workers=0,
        )
        .training(
            gamma=0.99,
            model=dict(
                fcnet_hiddens=[10],
                fcnet_activation="linear",
                free_log_std=True,
                vf_share_layers=True,
            ),
        )
    )

    for fw, sess in framework_iterator(config, session=True):
        trainer = ppo.PPO(config=config, env="CartPole-v0")
        policy = trainer.get_policy()

        # Check the free log std var is created.
        if fw == "torch":
            matching = [
                v for (n, v) in policy.model.named_parameters()
                if "log_std" in n
            ]
        else:
            matching = [
                v for v in policy.model.trainable_variables()
                if "log_std" in str(v)
            ]
        assert len(matching) == 1, matching
        log_std_var = matching[0]

        def get_value():
            if fw == "tf":
                return policy.get_session().run(log_std_var)[0]
            elif fw == "torch":
                return log_std_var.detach().cpu().numpy()[0]
            else:
                return log_std_var.numpy()[0]

        # Check the variable is initially zero.
        init_std = get_value()
        assert init_std == 0.0, init_std

        batch = compute_gae_for_sample_batch(policy, FAKE_BATCH.copy())
        if fw == "torch":
            batch = policy._lazy_tensor_dict(batch)
        policy.learn_on_batch(batch)

        # Check the variable is updated.
        post_std = get_value()
        assert post_std != 0.0, post_std
        trainer.stop()
def postprocess_advantages(policy,
                           sample_batch,
                           other_agent_batches=None,
                           episode=None):
    # Stub serving backward compatibility.
    deprecation_warning(
        old="rllib.agents.a3c.a3c_tf_policy.postprocess_advantages",
        new="rllib.evaluation.postprocessing.compute_gae_for_sample_batch",
        error=False)

    return compute_gae_for_sample_batch(policy, sample_batch,
                                        other_agent_batches, episode)
def postprocess_trajectory(self, sample_batch, other_agent_batches=None, episode=None):
    '''Calculate GAE in postprocess.'''
    with torch.no_grad():
        # Call super's postprocess_trajectory first.
        # sample_batch = super().postprocess_trajectory(
        #     sample_batch, other_agent_batches, episode)
        return compute_gae_for_sample_batch(self, sample_batch,
                                            other_agent_batches, episode)
def postprocess_ppo_gae(
        policy: Policy,
        sample_batch: SampleBatch,
        other_agent_batches: Optional[Dict[AgentID, SampleBatch]] = None,
        episode: Optional[MultiAgentEpisode] = None) -> SampleBatch:
    # Stub serving backward compatibility.
    deprecation_warning(
        old="rllib.agents.ppo.ppo_tf_policy.postprocess_ppo_gae",
        new="rllib.evaluation.postprocessing.compute_gae_for_sample_batch",
        error=False)

    return compute_gae_for_sample_batch(policy, sample_batch,
                                        other_agent_batches, episode)
def add_advantages(
        policy: Policy,
        sample_batch: SampleBatch,
        other_agent_batches: Optional[Dict[PolicyID, SampleBatch]] = None,
        episode: Optional[MultiAgentEpisode] = None) -> SampleBatch:
    # Stub serving backward compatibility.
    deprecation_warning(
        old="rllib.agents.a3c.a3c_torch_policy.add_advantages",
        new="rllib.evaluation.postprocessing.compute_gae_for_sample_batch",
        error=False)

    return compute_gae_for_sample_batch(policy, sample_batch,
                                        other_agent_batches, episode)
def test_ppo_free_log_std(self):
    """Tests the free log std option works."""
    config = copy.deepcopy(ppo.DEFAULT_CONFIG)
    config["num_workers"] = 0  # Run locally.
    config["gamma"] = 0.99
    config["model"]["fcnet_hiddens"] = [10]
    config["model"]["fcnet_activation"] = "linear"
    config["model"]["free_log_std"] = True
    config["model"]["vf_share_layers"] = True

    for fw, sess in framework_iterator(config, session=True):
        trainer = ppo.PPOTrainer(config=config, env="CartPole-v0")
        policy = trainer.get_policy()

        # Check the free log std var is created.
        if fw == "torch":
            matching = [
                v for (n, v) in policy.model.named_parameters()
                if "log_std" in n
            ]
        else:
            matching = [
                v for v in policy.model.trainable_variables()
                if "log_std" in str(v)
            ]
        assert len(matching) == 1, matching
        log_std_var = matching[0]

        def get_value():
            if fw == "tf":
                return policy.get_session().run(log_std_var)[0]
            elif fw == "torch":
                return log_std_var.detach().cpu().numpy()[0]
            else:
                return log_std_var.numpy()[0]

        # Check the variable is initially zero.
        init_std = get_value()
        assert init_std == 0.0, init_std

        batch = compute_gae_for_sample_batch(policy, FAKE_BATCH.copy())
        if fw == "torch":
            batch = policy._lazy_tensor_dict(batch)
        policy.learn_on_batch(batch)

        # Check the variable is updated.
        post_std = get_value()
        assert post_std != 0.0, post_std
        trainer.stop()
def postprocess_trajectory(
    self,
    sample_batch: SampleBatch,
    other_agent_batches: Optional[Dict[Any, SampleBatch]] = None,
    episode: Optional["Episode"] = None,
):
    # Call super's postprocess_trajectory first.
    sample_batch = super().postprocess_trajectory(
        sample_batch, other_agent_batches, episode)

    if not self.config["vtrace"]:
        # Do all post-processing always with no_grad().
        # Not using this here will introduce a memory leak
        # in torch (issue #6962).
        with torch.no_grad():
            sample_batch = compute_gae_for_sample_batch(
                self, sample_batch, other_agent_batches, episode)

    return sample_batch
def test_ppo_loss_function(self):
    """Tests the PPO loss function math."""
    config = copy.deepcopy(ppo.DEFAULT_CONFIG)
    config["num_workers"] = 0  # Run locally.
    config["gamma"] = 0.99
    config["model"]["fcnet_hiddens"] = [10]
    config["model"]["fcnet_activation"] = "linear"
    config["model"]["vf_share_layers"] = True

    for fw, sess in framework_iterator(config, session=True):
        trainer = ppo.PPOTrainer(config=config, env="CartPole-v0")
        policy = trainer.get_policy()

        # Check no free log std var by default.
        if fw == "torch":
            matching = [
                v for (n, v) in policy.model.named_parameters()
                if "log_std" in n
            ]
        else:
            matching = [
                v for v in policy.model.trainable_variables()
                if "log_std" in str(v)
            ]
        assert len(matching) == 0, matching

        # Post-process (calculate simple (non-GAE) advantages) and attach
        # to train_batch dict.
        # A = [0.99^2 * 0.5 + 0.99 * -1.0 + 1.0, 0.99 * 0.5 - 1.0, 0.5] =
        # [0.50005, -0.505, 0.5]
        train_batch = compute_gae_for_sample_batch(policy, FAKE_BATCH.copy())
        if fw == "torch":
            train_batch = policy._lazy_tensor_dict(train_batch)

        # Check Advantage values.
        check(train_batch[Postprocessing.VALUE_TARGETS],
              [0.50005, -0.505, 0.5])

        # Calculate actual PPO loss.
        if fw in ["tf2", "tfe"]:
            ppo_surrogate_loss_tf(policy, policy.model, Categorical,
                                  train_batch)
        elif fw == "torch":
            ppo_surrogate_loss_torch(policy, policy.model, TorchCategorical,
                                     train_batch)

        vars = policy.model.variables() if fw != "torch" else \
            list(policy.model.parameters())
        if fw == "tf":
            vars = policy.get_session().run(vars)
        expected_shared_out = fc(train_batch[SampleBatch.CUR_OBS],
                                 vars[0 if fw != "torch" else 2],
                                 vars[1 if fw != "torch" else 3],
                                 framework=fw)
        expected_logits = fc(expected_shared_out,
                             vars[2 if fw != "torch" else 0],
                             vars[3 if fw != "torch" else 1],
                             framework=fw)
        expected_value_outs = fc(expected_shared_out, vars[4], vars[5],
                                 framework=fw)

        kl, entropy, pg_loss, vf_loss, overall_loss = \
            self._ppo_loss_helper(
                policy,
                policy.model,
                Categorical if fw != "torch" else TorchCategorical,
                train_batch,
                expected_logits,
                expected_value_outs,
                sess=sess)
        if sess:
            policy_sess = policy.get_session()
            k, e, pl, v, tl = policy_sess.run(
                [
                    policy._mean_kl,
                    policy._mean_entropy,
                    policy._mean_policy_loss,
                    policy._mean_vf_loss,
                    policy._total_loss,
                ],
                feed_dict=policy._get_loss_inputs_dict(
                    train_batch, shuffle=False))
            check(k, kl)
            check(e, entropy)
            check(pl, np.mean(-pg_loss))
            check(v, np.mean(vf_loss), decimals=4)
            check(tl, overall_loss, decimals=4)
        else:
            check(policy._mean_kl, kl)
            check(policy._mean_entropy, entropy)
            check(policy._mean_policy_loss, np.mean(-pg_loss))
            check(policy._mean_vf_loss, np.mean(vf_loss), decimals=4)
            check(policy._total_loss, overall_loss, decimals=4)
        trainer.stop()
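The advantage arithmetic in the test's comment can be verified in isolation. The snippet below is a standalone sanity check, not part of the test suite; the reward sequence [1.0, -1.0, 0.5] is inferred from the comment's expansion (FAKE_BATCH itself is not reproduced here), and the episode is assumed terminal, so no bootstrap value is added.

import numpy as np

# Discounted returns for rewards [1.0, -1.0, 0.5] with gamma = 0.99.
gamma = 0.99
rewards = np.array([1.0, -1.0, 0.5])
returns = np.zeros_like(rewards)
running = 0.0
for t in reversed(range(len(rewards))):
    running = rewards[t] + gamma * running
    returns[t] = running
print(returns)  # approx. [0.50005, -0.505, 0.5], matching the expected VALUE_TARGETS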
def learn_on_batch(self, train_batch):
    # print(type(train_batch))
    # Turn the values into tensors:
    # train_batch_tensor = self._lazy_tensor_dict(train_batch)
    # train_batch_tensor = restore_original_dimensions()
    # print(train_batch_tensor.keys())

    # Update the skill dynamics.
    # Set the models to train mode.
    if self.model:
        self.model.train()
    if self.dynamics:
        self.dynamics.train()

    stats = defaultdict(int)

    if self.use_dynamics:
        c = 0
        for ep in range(self.dynamics_epochs):
            for mb in minibatches(train_batch, self.minibatch_size):
                c += 1
                mb["is_training"] = True
                minibatch = self._lazy_tensor_dict(mb)
                obs = _unpack_obs(minibatch['obs'],
                                  self.model.options['orig_obs_space'],
                                  torch)
                next_obs = _unpack_obs(minibatch['new_obs'],
                                       self.model.options['orig_obs_space'],
                                       torch)
                dynamics_obs = obs['dynamics_obs']
                next_dynamics_obs = next_obs['dynamics_obs'] - obs['dynamics_obs']
                z = obs['z']

                log_prob = self.dynamics.get_log_prob(
                    dynamics_obs, z, next_dynamics_obs, training=True)
                dynamics_loss = -torch.mean(log_prob)

                orth_loss = self.dynamics.orthogonal_regularization()
                l2_loss = self.dynamics.l2_regularization()
                if self.config['dynamics_orth_reg']:
                    dynamics_loss += orth_loss
                if self.config['dynamics_l2_reg'] and not self.config['dynamics_spectral_norm']:
                    dynamics_loss += l2_loss

                self.dynamics_opt.zero_grad()
                dynamics_loss.backward()
                if self.config['grad_clip']:
                    grad_norm = nn.utils.clip_grad_norm_(
                        self.dynamics.parameters(), self.config['grad_clip'])
                self.dynamics_opt.step()

                stats['dynamics_loss'] += dynamics_loss.item()
                stats['orth_loss'] += orth_loss.item()
                stats['l2_loss'] += l2_loss.item()

        stats['dynamics_loss'] /= c
        stats['orth_loss'] /= c
        stats['l2_loss'] /= c

        self.dynamics.eval()

        # Compute the intrinsic reward.
        with torch.no_grad():
            batch = self._lazy_tensor_dict(train_batch)
            obs = _unpack_obs(batch['obs'],
                              self.model.options['orig_obs_space'],
                              torch)
            next_obs = _unpack_obs(batch['new_obs'],
                                   self.model.options['orig_obs_space'],
                                   torch)
            z = obs['z']
            dynamics_obs = obs['dynamics_obs']
            next_dynamics_obs = next_obs['dynamics_obs'] - obs['dynamics_obs']

            dads_reward, info = self.dynamics.compute_reward(
                dynamics_obs, z, next_dynamics_obs)
            dads_reward = self.config['dads_reward_scale'] * dads_reward.numpy()

        # Replace the reward column in train_batch.
        # print(train_batch['rewards'].shape)
        train_batch['rewards'] = dads_reward
        stats['avg_dads_reward'] = dads_reward.mean()
        stats['num_skills_higher_prob'] = info['num_higher_prob']

    # Calculate GAE (here, for the DADS reward) per episode.
    trajs = train_batch.split_by_episode()
    processed_trajs = []
    for traj in trajs:
        processed_trajs.append(compute_gae_for_sample_batch(self, traj))
    batch = SampleBatch.concat_samples(processed_trajs)
    # train_batch = compute_gae_for_sample_batch(self, self._lazy_numpy_dict(train_batch))
    # train_batch = self._lazy_tensor_dict(train_batch)

    # Update the agent using the RL algo, splitting into minibatches.
    c = 0
    for ep in range(self.ppo_epochs):
        # batch.shuffle()
        for mb in minibatches(batch, self.minibatch_size):
            c += 1
            mb["is_training"] = True
            # minibatch = mb.copy()
            mb['advantages'] = standardize(mb['advantages'])
            minibatch = self._lazy_tensor_dict(mb)

            # Compute the loss.
            loss_out = ppo_surrogate_loss(self, self.model, self.dist_class,
                                          minibatch)

            # Compute the gradient.
            self.ppo_opt.zero_grad()
            # The learning rate is already used in ppo_surrogate_loss.
            loss_out.backward()

            # Clip the gradient norm.
            if self.config['grad_clip']:
                grad_norm = nn.utils.clip_grad_norm_(
                    self.model.parameters(), self.config['grad_clip'])
            self.ppo_opt.step()

            # Log stats.
            stats['ppo_loss'] += loss_out.item()
    stats['ppo_loss'] /= c

    # Add more info about the loss.
    stats.update(kl_and_loss_stats(self, train_batch))
    # {
    #     "loss": loss_out.item(),
    #     'test': 1,
    #     # "grad_norm": grad_norm
    #     # if isinstance(grad_norm, float) else grad_norm.item(),
    # }
    return {LEARNER_STATS_KEY: stats}