Example #1
    def test_ppo_loss_function(self):
        """Tests the PPO loss function math."""
        config = copy.deepcopy(ppo.DEFAULT_CONFIG)
        config["num_workers"] = 0  # Run locally.
        config["gamma"] = 0.99
        config["model"]["fcnet_hiddens"] = [10]
        config["model"]["fcnet_activation"] = "linear"
        config["model"]["vf_share_layers"] = True

        for fw, sess in framework_iterator(config, session=True):
            trainer = ppo.PPOTrainer(config=config, env="CartPole-v0")
            policy = trainer.get_policy()

            # Check no free log std var by default.
            if fw == "torch":
                matching = [
                    v for (n, v) in policy.model.named_parameters()
                    if "log_std" in n
                ]
            else:
                matching = [
                    v for v in policy.model.trainable_variables()
                    if "log_std" in str(v)
                ]
            assert len(matching) == 0, matching

            # Post-process (calculate simple (non-GAE) advantages) and attach
            # to train_batch dict.
            # A = [0.99^2 * 0.5 + 0.99 * -1.0 + 1.0, 0.99 * 0.5 - 1.0, 0.5] =
            # [0.50005, -0.505, 0.5]
            train_batch = compute_gae_for_sample_batch(policy,
                                                       FAKE_BATCH.copy())
            if fw == "torch":
                train_batch = policy._lazy_tensor_dict(train_batch)

            # Check Advantage values.
            check(train_batch[Postprocessing.VALUE_TARGETS],
                  [0.50005, -0.505, 0.5])

            # Calculate actual PPO loss.
            if fw in ["tf2", "tfe"]:
                ppo_surrogate_loss_tf(policy, policy.model, Categorical,
                                      train_batch)
            elif fw == "torch":
                ppo_surrogate_loss_torch(policy, policy.model,
                                         TorchCategorical, train_batch)

            vars = policy.model.variables() if fw != "torch" else \
                list(policy.model.parameters())
            if fw == "tf":
                vars = policy.get_session().run(vars)
            expected_shared_out = fc(train_batch[SampleBatch.CUR_OBS],
                                     vars[0 if fw != "torch" else 2],
                                     vars[1 if fw != "torch" else 3],
                                     framework=fw)
            expected_logits = fc(expected_shared_out,
                                 vars[2 if fw != "torch" else 0],
                                 vars[3 if fw != "torch" else 1],
                                 framework=fw)
            expected_value_outs = fc(expected_shared_out,
                                     vars[4],
                                     vars[5],
                                     framework=fw)

            kl, entropy, pg_loss, vf_loss, overall_loss = \
                self._ppo_loss_helper(
                    policy, policy.model,
                    Categorical if fw != "torch" else TorchCategorical,
                    train_batch,
                    expected_logits, expected_value_outs,
                    sess=sess
                )
            if sess:
                policy_sess = policy.get_session()
                k, e, pl, v, tl = policy_sess.run(
                    [
                        policy._mean_kl,
                        policy._mean_entropy,
                        policy._mean_policy_loss,
                        policy._mean_vf_loss,
                        policy._total_loss,
                    ],
                    feed_dict=policy._get_loss_inputs_dict(train_batch,
                                                           shuffle=False))
                check(k, kl)
                check(e, entropy)
                check(pl, np.mean(-pg_loss))
                check(v, np.mean(vf_loss), decimals=4)
                check(tl, overall_loss, decimals=4)
            else:
                check(policy._mean_kl, kl)
                check(policy._mean_entropy, entropy)
                check(policy._mean_policy_loss, np.mean(-pg_loss))
                check(policy._mean_vf_loss, np.mean(vf_loss), decimals=4)
                check(policy._total_loss, overall_loss, decimals=4)
            trainer.stop()
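
Example #1 relies on a module-level FAKE_BATCH constant that is defined outside the snippet. Judging from the inline train_batch dicts in Examples #2 to #4, it presumably looks roughly like the sketch below; the constant in RLlib's test module may be wrapped in a SampleBatch and carry extra bookkeeping fields (e.g. episode IDs).

import numpy as np
from ray.rllib.policy.sample_batch import SampleBatch

# Sketch of the FAKE_BATCH constant referenced in Example #1. The values
# mirror the inline train_batch dicts of the later examples; the real
# constant may differ in detail.
FAKE_BATCH = {
    SampleBatch.CUR_OBS: np.array(
        [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8], [0.9, 1.0, 1.1, 1.2]],
        dtype=np.float32),
    SampleBatch.ACTIONS: np.array([0, 1, 1]),
    SampleBatch.PREV_ACTIONS: np.array([0, 1, 1]),
    SampleBatch.REWARDS: np.array([1.0, -1.0, 0.5], dtype=np.float32),
    SampleBatch.PREV_REWARDS: np.array([1.0, -1.0, 0.5], dtype=np.float32),
    SampleBatch.DONES: np.array([False, False, True]),
    SampleBatch.VF_PREDS: np.array([0.5, 0.6, 0.7], dtype=np.float32),
    SampleBatch.ACTION_DIST_INPUTS: np.array(
        [[-2.0, 0.5], [-3.0, -0.3], [-0.1, 2.5]], dtype=np.float32),
    SampleBatch.ACTION_LOGP: np.array([-0.5, -0.1, -0.2], dtype=np.float32),
}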
Example #2
    def test_ppo_loss_function(self):
        """Tests the PPO loss function math."""
        config = ppo.DEFAULT_CONFIG.copy()
        config["num_workers"] = 0  # Run locally.
        config["eager"] = True
        config["gamma"] = 0.99
        config["model"]["fcnet_hiddens"] = [10]
        config["model"]["fcnet_activation"] = "linear"

        # Fake CartPole episode of 3 time steps.
        train_batch = {
            SampleBatch.CUR_OBS:
            np.array([[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8],
                      [0.9, 1.0, 1.1, 1.2]],
                     dtype=np.float32),
            SampleBatch.ACTIONS:
            np.array([0, 1, 1]),
            SampleBatch.REWARDS:
            np.array([1.0, -1.0, .5], dtype=np.float32),
            SampleBatch.DONES:
            np.array([False, False, True]),
            SampleBatch.VF_PREDS:
            np.array([0.5, 0.6, 0.7], dtype=np.float32),
            BEHAVIOUR_LOGITS:
            np.array([[-2., 0.5], [-3., -0.3], [-0.1, 2.5]], dtype=np.float32),
            ACTION_LOGP:
            np.array([-0.5, -0.1, -0.2], dtype=np.float32)
        }

        # tf.
        trainer = ppo.PPOTrainer(config=config, env="CartPole-v0")
        policy = trainer.get_policy()

        # Post-process (calculate simple (non-GAE) advantages) and attach to
        # train_batch dict.
        # A = [0.99^2 * 0.5 + 0.99 * -1.0 + 1.0, 0.99 * 0.5 - 1.0, 0.5] =
        # [0.50005, -0.505, 0.5]
        train_batch = postprocess_ppo_gae_tf(policy, train_batch)
        # Check Advantage values.
        check(train_batch[Postprocessing.VALUE_TARGETS],
              [0.50005, -0.505, 0.5])

        # Calculate actual PPO loss (results are stored in policy.loss_obj) for
        # tf.
        ppo_surrogate_loss_tf(policy, policy.model, Categorical, train_batch)

        vars = policy.model.trainable_variables()
        expected_logits = fc(
            fc(train_batch[SampleBatch.CUR_OBS], vars[0].numpy(),
               vars[1].numpy()), vars[4].numpy(), vars[5].numpy())
        expected_value_outs = fc(
            fc(train_batch[SampleBatch.CUR_OBS], vars[2].numpy(),
               vars[3].numpy()), vars[6].numpy(), vars[7].numpy())

        kl, entropy, pg_loss, vf_loss, overall_loss = \
            self._ppo_loss_helper(
                policy, policy.model, Categorical, train_batch,
                expected_logits, expected_value_outs
            )
        check(policy.loss_obj.mean_kl, kl)
        check(policy.loss_obj.mean_entropy, entropy)
        check(policy.loss_obj.mean_policy_loss, np.mean(-pg_loss))
        check(policy.loss_obj.mean_vf_loss, np.mean(vf_loss), decimals=4)
        check(policy.loss_obj.loss, overall_loss, decimals=4)

        # Torch.
        config["use_pytorch"] = True
        trainer = ppo.PPOTrainer(config=config, env="CartPole-v0")
        policy = trainer.get_policy()
        train_batch = postprocess_ppo_gae_torch(policy, train_batch)
        train_batch = policy._lazy_tensor_dict(train_batch)

        # Check Advantage values.
        check(train_batch[Postprocessing.VALUE_TARGETS],
              [0.50005, -0.505, 0.5])

        # Calculate actual PPO loss (results are stored in policy.loss_obj)
        # for torch.
        ppo_surrogate_loss_torch(policy, policy.model, TorchCategorical,
                                 train_batch)

        kl, entropy, pg_loss, vf_loss, overall_loss = \
            self._ppo_loss_helper(
                policy, policy.model, TorchCategorical, train_batch,
                policy.model.last_output(),
                policy.model.value_function().detach().numpy()
            )
        check(policy.loss_obj.mean_kl, kl)
        check(policy.loss_obj.mean_entropy, entropy)
        check(policy.loss_obj.mean_policy_loss, np.mean(-pg_loss))
        check(policy.loss_obj.mean_vf_loss, np.mean(vf_loss), decimals=4)
        check(policy.loss_obj.loss, overall_loss, decimals=4)
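
The advantage comment in these examples ("A = [0.99^2 * 0.5 + 0.99 * -1.0 + 1.0, ...]") is just the discounted return of the three fake rewards. A minimal stand-alone check of that arithmetic in plain NumPy, independent of RLlib:

import numpy as np

# With rewards [1.0, -1.0, 0.5], gamma = 0.99 and the episode ending on the
# last step, the discounted returns (the VALUE_TARGETS checked above) are
# [0.50005, -0.505, 0.5].
rewards = np.array([1.0, -1.0, 0.5], dtype=np.float32)
gamma = 0.99
returns = np.zeros_like(rewards)
running = 0.0
for t in reversed(range(len(rewards))):
    running = rewards[t] + gamma * running
    returns[t] = running
print(returns)  # ~[0.50005, -0.505, 0.5]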
Example #3
    def test_ppo_loss_function(self):
        """Tests the PPO loss function math."""
        config = ppo.DEFAULT_CONFIG.copy()
        config["num_workers"] = 0  # Run locally.
        config["gamma"] = 0.99
        config["model"]["fcnet_hiddens"] = [10]
        config["model"]["fcnet_activation"] = "linear"
        config["vf_share_layers"] = True

        # Fake CartPole episode of 3 time steps.
        train_batch = {
            SampleBatch.CUR_OBS: np.array(
                [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8],
                 [0.9, 1.0, 1.1, 1.2]],
                dtype=np.float32),
            SampleBatch.ACTIONS: np.array([0, 1, 1]),
            SampleBatch.REWARDS: np.array([1.0, -1.0, .5], dtype=np.float32),
            SampleBatch.DONES: np.array([False, False, True]),
            SampleBatch.VF_PREDS: np.array([0.5, 0.6, 0.7], dtype=np.float32),
            SampleBatch.ACTION_DIST_INPUTS: np.array(
                [[-2., 0.5], [-3., -0.3], [-0.1, 2.5]], dtype=np.float32),
            SampleBatch.ACTION_LOGP: np.array(
                [-0.5, -0.1, -0.2], dtype=np.float32),
        }

        for fw in ["tf", "torch"]:
            print("framework={}".format(fw))
            config["use_pytorch"] = fw == "torch"
            config["eager"] = fw == "tf"

            trainer = ppo.PPOTrainer(config=config, env="CartPole-v0")
            policy = trainer.get_policy()

            # Post-process (calculate simple (non-GAE) advantages) and attach
            # to train_batch dict.
            # A = [0.99^2 * 0.5 + 0.99 * -1.0 + 1.0, 0.99 * 0.5 - 1.0, 0.5] =
            # [0.50005, -0.505, 0.5]
            if fw == "tf":
                train_batch = postprocess_ppo_gae_tf(policy, train_batch)
            else:
                train_batch = postprocess_ppo_gae_torch(policy, train_batch)
                train_batch = policy._lazy_tensor_dict(train_batch)

            # Check Advantage values.
            check(train_batch[Postprocessing.VALUE_TARGETS],
                  [0.50005, -0.505, 0.5])

            # Calculate actual PPO loss (results are stored in
            # policy.loss_obj).
            if fw == "tf":
                ppo_surrogate_loss_tf(policy, policy.model, Categorical,
                                      train_batch)
            else:
                ppo_surrogate_loss_torch(policy, policy.model,
                                         TorchCategorical, train_batch)

            vars = policy.model.variables() if fw == "tf" else \
                list(policy.model.parameters())
            expected_shared_out = fc(train_batch[SampleBatch.CUR_OBS], vars[0],
                                     vars[1])
            expected_logits = fc(expected_shared_out, vars[2], vars[3])
            expected_value_outs = fc(expected_shared_out, vars[4], vars[5])

            kl, entropy, pg_loss, vf_loss, overall_loss = \
                self._ppo_loss_helper(
                    policy, policy.model,
                    Categorical if fw == "tf" else TorchCategorical,
                    train_batch,
                    expected_logits, expected_value_outs
                )
            check(policy.loss_obj.mean_kl, kl)
            check(policy.loss_obj.mean_entropy, entropy)
            check(policy.loss_obj.mean_policy_loss, np.mean(-pg_loss))
            check(policy.loss_obj.mean_vf_loss, np.mean(vf_loss), decimals=4)
            check(policy.loss_obj.loss, overall_loss, decimals=4)
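
The fc helper used above to rebuild the expected layer outputs is RLlib's NumPy dense-layer test utility; conceptually it applies a single fully connected layer to the batch. A rough equivalent is sketched below under the hypothetical name fc_sketch; the real utility also takes a framework argument and converts framework-specific tensors, which the sketch does not attempt.

import numpy as np

def fc_sketch(x, weights, biases):
    """Rough NumPy stand-in for the fc() test utility: one dense layer."""
    x = np.asarray(x, dtype=np.float32)
    w = np.asarray(weights, dtype=np.float32)
    # Torch stores nn.Linear weights as (out_features, in_features), so
    # transpose if the shapes do not already line up with (batch, in).
    if w.shape[0] != x.shape[-1]:
        w = w.T
    return np.matmul(x, w) + np.asarray(biases, dtype=np.float32)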
Example #4
    def test_ppo_loss_function(self):
        """Tests the PPO loss function math."""
        config = ppo.DEFAULT_CONFIG.copy()
        config["num_workers"] = 0  # Run locally.
        config["gamma"] = 0.99
        config["model"]["fcnet_hiddens"] = [10]
        config["model"]["fcnet_activation"] = "linear"
        config["vf_share_layers"] = True

        # Fake CartPole episode of 3 time steps.
        train_batch = {
            SampleBatch.CUR_OBS:
            np.array([[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8],
                      [0.9, 1.0, 1.1, 1.2]],
                     dtype=np.float32),
            SampleBatch.ACTIONS:
            np.array([0, 1, 1]),
            SampleBatch.PREV_ACTIONS:
            np.array([0, 1, 1]),
            SampleBatch.REWARDS:
            np.array([1.0, -1.0, .5], dtype=np.float32),
            SampleBatch.PREV_REWARDS:
            np.array([1.0, -1.0, .5], dtype=np.float32),
            SampleBatch.DONES:
            np.array([False, False, True]),
            SampleBatch.VF_PREDS:
            np.array([0.5, 0.6, 0.7], dtype=np.float32),
            SampleBatch.ACTION_DIST_INPUTS:
            np.array([[-2., 0.5], [-3., -0.3], [-0.1, 2.5]], dtype=np.float32),
            SampleBatch.ACTION_LOGP:
            np.array([-0.5, -0.1, -0.2], dtype=np.float32),
        }

        for fw, sess in framework_iterator(config, session=True):
            trainer = ppo.PPOTrainer(config=config, env="CartPole-v0")
            policy = trainer.get_policy()

            # Post-process (calculate simple (non-GAE) advantages) and attach
            # to train_batch dict.
            # A = [0.99^2 * 0.5 + 0.99 * -1.0 + 1.0, 0.99 * 0.5 - 1.0, 0.5] =
            # [0.50005, -0.505, 0.5]
            if fw == "tf" or fw == "eager":
                train_batch = postprocess_ppo_gae_tf(policy, train_batch)
            else:
                train_batch = postprocess_ppo_gae_torch(policy, train_batch)
                train_batch = policy._lazy_tensor_dict(train_batch)

            # Check Advantage values.
            check(train_batch[Postprocessing.VALUE_TARGETS],
                  [0.50005, -0.505, 0.5])

            # Calculate actual PPO loss.
            if fw == "eager":
                ppo_surrogate_loss_tf(policy, policy.model, Categorical,
                                      train_batch)
            elif fw == "torch":
                ppo_surrogate_loss_torch(policy, policy.model,
                                         TorchCategorical, train_batch)

            vars = policy.model.variables() if fw != "torch" else \
                list(policy.model.parameters())
            if fw == "tf":
                vars = policy.get_session().run(vars)
            expected_shared_out = fc(train_batch[SampleBatch.CUR_OBS],
                                     vars[0 if fw != "torch" else 2],
                                     vars[1 if fw != "torch" else 3],
                                     framework=fw)
            expected_logits = fc(expected_shared_out,
                                 vars[2 if fw != "torch" else 0],
                                 vars[3 if fw != "torch" else 1],
                                 framework=fw)
            expected_value_outs = fc(expected_shared_out,
                                     vars[4],
                                     vars[5],
                                     framework=fw)

            kl, entropy, pg_loss, vf_loss, overall_loss = \
                self._ppo_loss_helper(
                    policy, policy.model,
                    Categorical if fw != "torch" else TorchCategorical,
                    train_batch,
                    expected_logits, expected_value_outs,
                    sess=sess
                )
            if sess:
                policy_sess = policy.get_session()
                k, e, pl, v, tl = policy_sess.run(
                    [
                        policy.loss_obj.mean_kl, policy.loss_obj.mean_entropy,
                        policy.loss_obj.mean_policy_loss,
                        policy.loss_obj.mean_vf_loss, policy.loss_obj.loss
                    ],
                    feed_dict=policy._get_loss_inputs_dict(train_batch,
                                                           shuffle=False))
                check(k, kl)
                check(e, entropy)
                check(pl, np.mean(-pg_loss))
                check(v, np.mean(vf_loss), decimals=4)
                check(tl, overall_loss, decimals=4)
            else:
                check(policy.loss_obj.mean_kl, kl)
                check(policy.loss_obj.mean_entropy, entropy)
                check(policy.loss_obj.mean_policy_loss, np.mean(-pg_loss))
                check(policy.loss_obj.mean_vf_loss,
                      np.mean(vf_loss),
                      decimals=4)
                check(policy.loss_obj.loss, overall_loss, decimals=4)
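
All four examples compare the policy's loss terms against self._ppo_loss_helper, which re-derives the PPO math in NumPy but is not shown here. The sketch below condenses that math for the categorical case; the function name, argument layout, and the default coefficient values are assumptions (in RLlib they come from the PPO config), and the real loss additionally clips the value error via vf_clip_param.

import numpy as np

def softmax(logits):
    z = logits - logits.max(axis=-1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)

def ppo_loss_sketch(new_logits, old_logits, actions, old_logp, advantages,
                    value_targets, value_out, clip_param=0.3, kl_coeff=0.2,
                    vf_loss_coeff=1.0, entropy_coeff=0.0):
    """Hypothetical NumPy re-derivation of the PPO terms checked above."""
    p_new, p_old = softmax(new_logits), softmax(old_logits)
    new_logp = np.log(p_new[np.arange(len(actions)), actions])
    ratio = np.exp(new_logp - old_logp)
    # Clipped surrogate objective, per timestep (the tests' "pg_loss").
    surrogate = np.minimum(
        advantages * ratio,
        advantages * np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param))
    # KL(old || new) and entropy of the new categorical distribution.
    kl = np.sum(p_old * (np.log(p_old) - np.log(p_new)), axis=-1)
    entropy = -np.sum(p_new * np.log(p_new), axis=-1)
    # Squared value error (the real loss also clips this term).
    vf_loss = (value_out - value_targets) ** 2
    total = np.mean(-surrogate + kl_coeff * kl +
                    vf_loss_coeff * vf_loss - entropy_coeff * entropy)
    return np.mean(kl), np.mean(entropy), surrogate, vf_loss, total

The returned tuple mirrors how the tests use the helper's outputs: mean KL and entropy are compared directly, while the per-timestep surrogate and value errors are averaged via np.mean(-pg_loss) and np.mean(vf_loss) before the checks.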