def test_ppo_free_log_std(self):
    """Tests the free log std option works."""
    config = copy.deepcopy(ppo.DEFAULT_CONFIG)
    config["num_workers"] = 0  # Run locally.
    config["gamma"] = 0.99
    config["model"]["fcnet_hiddens"] = [10]
    config["model"]["fcnet_activation"] = "linear"
    config["model"]["free_log_std"] = True
    config["vf_share_layers"] = True

    for fw, sess in framework_iterator(config, session=True):
        trainer = ppo.PPOTrainer(config=config, env="CartPole-v0")
        policy = trainer.get_policy()

        # Check the free log std var is created.
        if fw == "torch":
            matching = [
                v for (n, v) in policy.model.named_parameters()
                if "log_std" in n
            ]
        else:
            matching = [
                v for v in policy.model.trainable_variables()
                if "log_std" in str(v)
            ]
        assert len(matching) == 1, matching
        log_std_var = matching[0]

        def get_value():
            if fw == "tf":
                return policy.get_session().run(log_std_var)[0]
            elif fw == "torch":
                return log_std_var.detach().numpy()[0]
            else:
                return log_std_var.numpy()[0]

        # Check the variable is initially zero.
        init_std = get_value()
        assert init_std == 0.0, init_std

        if fw in ["tf", "tfe"]:
            batch = postprocess_ppo_gae_tf(policy, FAKE_BATCH)
        else:
            batch = postprocess_ppo_gae_torch(policy, FAKE_BATCH)
            batch = policy._lazy_tensor_dict(batch)
        policy.learn_on_batch(batch)

        # Check the variable is updated.
        post_std = get_value()
        assert post_std != 0.0, post_std
        trainer.stop()
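# Note: FAKE_BATCH is used above but is not part of this excerpt. It is
# assumed (not confirmed here) to be a module-level constant holding a short
# fake CartPole episode, analogous to the inline `train_batch` dicts in the
# loss-function tests below. A minimal sketch under that assumption:
FAKE_BATCH = {
    SampleBatch.CUR_OBS: np.array(
        [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8], [0.9, 1.0, 1.1, 1.2]],
        dtype=np.float32),
    SampleBatch.ACTIONS: np.array([0, 1, 1]),
    SampleBatch.REWARDS: np.array([1.0, -1.0, 0.5], dtype=np.float32),
    SampleBatch.DONES: np.array([False, False, True]),
    SampleBatch.VF_PREDS: np.array([0.5, 0.6, 0.7], dtype=np.float32),
    SampleBatch.ACTION_DIST_INPUTS: np.array(
        [[-2., 0.5], [-3., -0.3], [-0.1, 2.5]], dtype=np.float32),
    SampleBatch.ACTION_LOGP: np.array([-0.5, -0.1, -0.2], dtype=np.float32),
}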
def test_ppo_loss_function(self):
    """Tests the PPO loss function math."""
    config = copy.deepcopy(ppo.DEFAULT_CONFIG)
    config["num_workers"] = 0  # Run locally.
    config["gamma"] = 0.99
    config["model"]["fcnet_hiddens"] = [10]
    config["model"]["fcnet_activation"] = "linear"
    config["vf_share_layers"] = True

    for fw, sess in framework_iterator(config, session=True):
        trainer = ppo.PPOTrainer(config=config, env="CartPole-v0")
        policy = trainer.get_policy()

        # Check no free log std var by default.
        if fw == "torch":
            matching = [
                v for (n, v) in policy.model.named_parameters()
                if "log_std" in n
            ]
        else:
            matching = [
                v for v in policy.model.trainable_variables()
                if "log_std" in str(v)
            ]
        assert len(matching) == 0, matching

        # Post-process (calculate simple (non-GAE) advantages) and attach
        # to train_batch dict.
        # A = [0.99^2 * 0.5 + 0.99 * -1.0 + 1.0, 0.99 * 0.5 - 1.0, 0.5] =
        # [0.50005, -0.505, 0.5]
        if fw == "tf" or fw == "tfe":
            train_batch = postprocess_ppo_gae_tf(policy, FAKE_BATCH)
        else:
            train_batch = postprocess_ppo_gae_torch(policy, FAKE_BATCH)
            train_batch = policy._lazy_tensor_dict(train_batch)

        # Check Advantage values.
        check(train_batch[Postprocessing.VALUE_TARGETS],
              [0.50005, -0.505, 0.5])

        # Calculate actual PPO loss.
        if fw == "tfe":
            ppo_surrogate_loss_tf(policy, policy.model, Categorical,
                                  train_batch)
        elif fw == "torch":
            ppo_surrogate_loss_torch(policy, policy.model, TorchCategorical,
                                     train_batch)

        vars = policy.model.variables() if fw != "torch" else \
            list(policy.model.parameters())
        if fw == "tf":
            vars = policy.get_session().run(vars)
        expected_shared_out = fc(
            train_batch[SampleBatch.CUR_OBS],
            vars[0 if fw != "torch" else 2],
            vars[1 if fw != "torch" else 3],
            framework=fw)
        expected_logits = fc(
            expected_shared_out,
            vars[2 if fw != "torch" else 0],
            vars[3 if fw != "torch" else 1],
            framework=fw)
        expected_value_outs = fc(
            expected_shared_out, vars[4], vars[5], framework=fw)

        kl, entropy, pg_loss, vf_loss, overall_loss = \
            self._ppo_loss_helper(
                policy,
                policy.model,
                Categorical if fw != "torch" else TorchCategorical,
                train_batch,
                expected_logits,
                expected_value_outs,
                sess=sess)
        if sess:
            policy_sess = policy.get_session()
            k, e, pl, v, tl = policy_sess.run(
                [
                    policy.loss_obj.mean_kl,
                    policy.loss_obj.mean_entropy,
                    policy.loss_obj.mean_policy_loss,
                    policy.loss_obj.mean_vf_loss,
                    policy.loss_obj.loss,
                ],
                feed_dict=policy._get_loss_inputs_dict(
                    train_batch, shuffle=False))
            check(k, kl)
            check(e, entropy)
            check(pl, np.mean(-pg_loss))
            check(v, np.mean(vf_loss), decimals=4)
            check(tl, overall_loss, decimals=4)
        else:
            check(policy.loss_obj.mean_kl, kl)
            check(policy.loss_obj.mean_entropy, entropy)
            check(policy.loss_obj.mean_policy_loss, np.mean(-pg_loss))
            check(policy.loss_obj.mean_vf_loss, np.mean(vf_loss), decimals=4)
            check(policy.loss_obj.loss, overall_loss, decimals=4)
        trainer.stop()
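# Quick numeric sanity check (illustrative, not part of the original test) of
# the value targets quoted in the comment above: with gamma=0.99, rewards
# [1.0, -1.0, 0.5], and an episode that terminates on the last step, the
# discounted returns come out to [0.50005, -0.505, 0.5].
import numpy as np

gamma = 0.99
rewards = [1.0, -1.0, 0.5]
returns, running = [], 0.0
for r in reversed(rewards):
    running = r + gamma * running
    returns.insert(0, running)
# returns == [1.0 + 0.99 * (-0.505), -1.0 + 0.99 * 0.5, 0.5]
#          == [0.50005, -0.505, 0.5]
np.testing.assert_allclose(returns, [0.50005, -0.505, 0.5])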
def test_ppo_loss_function(self):
    """Tests the PPO loss function math."""
    config = ppo.DEFAULT_CONFIG.copy()
    config["num_workers"] = 0  # Run locally.
    config["eager"] = True
    config["gamma"] = 0.99
    config["model"]["fcnet_hiddens"] = [10]
    config["model"]["fcnet_activation"] = "linear"

    # Fake CartPole episode of n time steps.
    train_batch = {
        SampleBatch.CUR_OBS: np.array(
            [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8],
             [0.9, 1.0, 1.1, 1.2]],
            dtype=np.float32),
        SampleBatch.ACTIONS: np.array([0, 1, 1]),
        SampleBatch.REWARDS: np.array([1.0, -1.0, .5], dtype=np.float32),
        SampleBatch.DONES: np.array([False, False, True]),
        SampleBatch.VF_PREDS: np.array([0.5, 0.6, 0.7], dtype=np.float32),
        BEHAVIOUR_LOGITS: np.array(
            [[-2., 0.5], [-3., -0.3], [-0.1, 2.5]], dtype=np.float32),
        ACTION_LOGP: np.array([-0.5, -0.1, -0.2], dtype=np.float32),
    }

    # tf.
    trainer = ppo.PPOTrainer(config=config, env="CartPole-v0")
    policy = trainer.get_policy()

    # Post-process (calculate simple (non-GAE) advantages) and attach to
    # train_batch dict.
    # A = [0.99^2 * 0.5 + 0.99 * -1.0 + 1.0, 0.99 * 0.5 - 1.0, 0.5] =
    # [0.50005, -0.505, 0.5]
    train_batch = postprocess_ppo_gae_tf(policy, train_batch)
    # Check Advantage values.
    check(train_batch[Postprocessing.VALUE_TARGETS], [0.50005, -0.505, 0.5])

    # Calculate actual PPO loss (results are stored in policy.loss_obj)
    # for tf.
    ppo_surrogate_loss_tf(policy, policy.model, Categorical, train_batch)

    vars = policy.model.trainable_variables()
    expected_logits = fc(
        fc(train_batch[SampleBatch.CUR_OBS], vars[0].numpy(),
           vars[1].numpy()), vars[4].numpy(), vars[5].numpy())
    expected_value_outs = fc(
        fc(train_batch[SampleBatch.CUR_OBS], vars[2].numpy(),
           vars[3].numpy()), vars[6].numpy(), vars[7].numpy())

    kl, entropy, pg_loss, vf_loss, overall_loss = \
        self._ppo_loss_helper(
            policy, policy.model, Categorical, train_batch,
            expected_logits, expected_value_outs)
    check(policy.loss_obj.mean_kl, kl)
    check(policy.loss_obj.mean_entropy, entropy)
    check(policy.loss_obj.mean_policy_loss, np.mean(-pg_loss))
    check(policy.loss_obj.mean_vf_loss, np.mean(vf_loss), decimals=4)
    check(policy.loss_obj.loss, overall_loss, decimals=4)

    # Torch.
    config["use_pytorch"] = True
    trainer = ppo.PPOTrainer(config=config, env="CartPole-v0")
    policy = trainer.get_policy()
    train_batch = postprocess_ppo_gae_torch(policy, train_batch)
    train_batch = policy._lazy_tensor_dict(train_batch)

    # Check Advantage values.
    check(train_batch[Postprocessing.VALUE_TARGETS], [0.50005, -0.505, 0.5])

    # Calculate actual PPO loss (results are stored in policy.loss_obj)
    # for torch.
    ppo_surrogate_loss_torch(policy, policy.model, TorchCategorical,
                             train_batch)

    kl, entropy, pg_loss, vf_loss, overall_loss = \
        self._ppo_loss_helper(
            policy, policy.model, TorchCategorical, train_batch,
            policy.model.last_output(),
            policy.model.value_function().detach().numpy())
    check(policy.loss_obj.mean_kl, kl)
    check(policy.loss_obj.mean_entropy, entropy)
    check(policy.loss_obj.mean_policy_loss, np.mean(-pg_loss))
    check(policy.loss_obj.mean_vf_loss, np.mean(vf_loss), decimals=4)
    check(policy.loss_obj.loss, overall_loss, decimals=4)
def test_ppo_loss_function(self):
    """Tests the PPO loss function math."""
    config = ppo.DEFAULT_CONFIG.copy()
    config["num_workers"] = 0  # Run locally.
    config["gamma"] = 0.99
    config["model"]["fcnet_hiddens"] = [10]
    config["model"]["fcnet_activation"] = "linear"
    config["vf_share_layers"] = True

    # Fake CartPole episode of n time steps.
    train_batch = {
        SampleBatch.CUR_OBS: np.array(
            [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8],
             [0.9, 1.0, 1.1, 1.2]],
            dtype=np.float32),
        SampleBatch.ACTIONS: np.array([0, 1, 1]),
        SampleBatch.REWARDS: np.array([1.0, -1.0, .5], dtype=np.float32),
        SampleBatch.DONES: np.array([False, False, True]),
        SampleBatch.VF_PREDS: np.array([0.5, 0.6, 0.7], dtype=np.float32),
        SampleBatch.ACTION_DIST_INPUTS: np.array(
            [[-2., 0.5], [-3., -0.3], [-0.1, 2.5]], dtype=np.float32),
        SampleBatch.ACTION_LOGP: np.array(
            [-0.5, -0.1, -0.2], dtype=np.float32),
    }

    for fw in ["tf", "torch"]:
        print("framework={}".format(fw))
        config["use_pytorch"] = fw == "torch"
        config["eager"] = fw == "tf"

        trainer = ppo.PPOTrainer(config=config, env="CartPole-v0")
        policy = trainer.get_policy()

        # Post-process (calculate simple (non-GAE) advantages) and attach
        # to train_batch dict.
        # A = [0.99^2 * 0.5 + 0.99 * -1.0 + 1.0, 0.99 * 0.5 - 1.0, 0.5] =
        # [0.50005, -0.505, 0.5]
        if fw == "tf":
            train_batch = postprocess_ppo_gae_tf(policy, train_batch)
        else:
            train_batch = postprocess_ppo_gae_torch(policy, train_batch)
            train_batch = policy._lazy_tensor_dict(train_batch)

        # Check Advantage values.
        check(train_batch[Postprocessing.VALUE_TARGETS],
              [0.50005, -0.505, 0.5])

        # Calculate actual PPO loss (results are stored in
        # policy.loss_obj).
        if fw == "tf":
            ppo_surrogate_loss_tf(policy, policy.model, Categorical,
                                  train_batch)
        else:
            ppo_surrogate_loss_torch(policy, policy.model, TorchCategorical,
                                     train_batch)

        vars = policy.model.variables() if fw == "tf" else \
            list(policy.model.parameters())
        expected_shared_out = fc(train_batch[SampleBatch.CUR_OBS], vars[0],
                                 vars[1])
        expected_logits = fc(expected_shared_out, vars[2], vars[3])
        expected_value_outs = fc(expected_shared_out, vars[4], vars[5])

        kl, entropy, pg_loss, vf_loss, overall_loss = \
            self._ppo_loss_helper(
                policy, policy.model,
                Categorical if fw == "tf" else TorchCategorical,
                train_batch, expected_logits, expected_value_outs)
        check(policy.loss_obj.mean_kl, kl)
        check(policy.loss_obj.mean_entropy, entropy)
        check(policy.loss_obj.mean_policy_loss, np.mean(-pg_loss))
        check(policy.loss_obj.mean_vf_loss, np.mean(vf_loss), decimals=4)
        check(policy.loss_obj.loss, overall_loss, decimals=4)
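# Note on the `fc` test helper used above (a sketch, under assumptions; the
# real helper is defined outside this excerpt): it is taken to be a plain
# NumPy dense-layer forward pass, y = x @ W + b. The `framework` argument
# seen in other versions of this test presumably exists because torch's
# `model.parameters()` return weight matrices in (out_features, in_features)
# layout, which must be transposed (and converted to NumPy) first.
def fc_sketch(x, weights, biases, framework=None):
    # Hypothetical stand-in for the real helper, for illustration only.
    if framework == "torch":
        weights = weights.detach().numpy().T
        biases = biases.detach().numpy()
    return np.matmul(x, weights) + biases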
def test_ppo_loss_function(self):
    """Tests the PPO loss function math."""
    config = ppo.DEFAULT_CONFIG.copy()
    config["num_workers"] = 0  # Run locally.
    config["gamma"] = 0.99
    config["model"]["fcnet_hiddens"] = [10]
    config["model"]["fcnet_activation"] = "linear"
    config["vf_share_layers"] = True

    # Fake CartPole episode of n time steps.
    train_batch = {
        SampleBatch.CUR_OBS: np.array(
            [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8],
             [0.9, 1.0, 1.1, 1.2]],
            dtype=np.float32),
        SampleBatch.ACTIONS: np.array([0, 1, 1]),
        SampleBatch.PREV_ACTIONS: np.array([0, 1, 1]),
        SampleBatch.REWARDS: np.array([1.0, -1.0, .5], dtype=np.float32),
        SampleBatch.PREV_REWARDS: np.array(
            [1.0, -1.0, .5], dtype=np.float32),
        SampleBatch.DONES: np.array([False, False, True]),
        SampleBatch.VF_PREDS: np.array([0.5, 0.6, 0.7], dtype=np.float32),
        SampleBatch.ACTION_DIST_INPUTS: np.array(
            [[-2., 0.5], [-3., -0.3], [-0.1, 2.5]], dtype=np.float32),
        SampleBatch.ACTION_LOGP: np.array(
            [-0.5, -0.1, -0.2], dtype=np.float32),
    }

    for fw, sess in framework_iterator(config, session=True):
        trainer = ppo.PPOTrainer(config=config, env="CartPole-v0")
        policy = trainer.get_policy()

        # Post-process (calculate simple (non-GAE) advantages) and attach
        # to train_batch dict.
        # A = [0.99^2 * 0.5 + 0.99 * -1.0 + 1.0, 0.99 * 0.5 - 1.0, 0.5] =
        # [0.50005, -0.505, 0.5]
        if fw == "tf" or fw == "eager":
            train_batch = postprocess_ppo_gae_tf(policy, train_batch)
        else:
            train_batch = postprocess_ppo_gae_torch(policy, train_batch)
            train_batch = policy._lazy_tensor_dict(train_batch)

        # Check Advantage values.
        check(train_batch[Postprocessing.VALUE_TARGETS],
              [0.50005, -0.505, 0.5])

        # Calculate actual PPO loss.
        if fw == "eager":
            ppo_surrogate_loss_tf(policy, policy.model, Categorical,
                                  train_batch)
        elif fw == "torch":
            ppo_surrogate_loss_torch(policy, policy.model, TorchCategorical,
                                     train_batch)

        vars = policy.model.variables() if fw != "torch" else \
            list(policy.model.parameters())
        if fw == "tf":
            vars = policy.get_session().run(vars)
        expected_shared_out = fc(
            train_batch[SampleBatch.CUR_OBS],
            vars[0 if fw != "torch" else 2],
            vars[1 if fw != "torch" else 3],
            framework=fw)
        expected_logits = fc(
            expected_shared_out,
            vars[2 if fw != "torch" else 0],
            vars[3 if fw != "torch" else 1],
            framework=fw)
        expected_value_outs = fc(
            expected_shared_out, vars[4], vars[5], framework=fw)

        kl, entropy, pg_loss, vf_loss, overall_loss = \
            self._ppo_loss_helper(
                policy,
                policy.model,
                Categorical if fw != "torch" else TorchCategorical,
                train_batch,
                expected_logits,
                expected_value_outs,
                sess=sess)
        if sess:
            policy_sess = policy.get_session()
            k, e, pl, v, tl = policy_sess.run(
                [
                    policy.loss_obj.mean_kl,
                    policy.loss_obj.mean_entropy,
                    policy.loss_obj.mean_policy_loss,
                    policy.loss_obj.mean_vf_loss,
                    policy.loss_obj.loss,
                ],
                feed_dict=policy._get_loss_inputs_dict(
                    train_batch, shuffle=False))
            check(k, kl)
            check(e, entropy)
            check(pl, np.mean(-pg_loss))
            check(v, np.mean(vf_loss), decimals=4)
            check(tl, overall_loss, decimals=4)
        else:
            check(policy.loss_obj.mean_kl, kl)
            check(policy.loss_obj.mean_entropy, entropy)
            check(policy.loss_obj.mean_policy_loss, np.mean(-pg_loss))
            check(policy.loss_obj.mean_vf_loss, np.mean(vf_loss), decimals=4)
            check(policy.loss_obj.loss, overall_loss, decimals=4)
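# Hedged sketch (an assumption, not RLlib's actual _ppo_loss_helper, which is
# defined outside this excerpt): how the expected overall loss is assumed to
# combine the per-sample terms that the checks above compare against.
# `pg_loss` is treated as the per-sample clipped-surrogate gain
# min(ratio * A, clip(ratio, 1 - eps, 1 + eps) * A), which is why the
# policy-loss check above negates it. The coefficient names are taken from
# typical PPO configs and are assumptions here.
def expected_overall_loss(pg_loss, kl, vf_loss, entropy, config):
    return np.mean(
        -pg_loss + config["kl_coeff"] * kl +
        config["vf_loss_coeff"] * vf_loss -
        config["entropy_coeff"] * entropy)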