Example #1
    def test_ddpg_compilation(self):
        """Test whether DDPG can be built with both frameworks."""
        config = ddpg.DDPGConfig()
        config.num_workers = 0
        config.num_envs_per_worker = 2
        config.replay_buffer_config["learning_starts"] = 0
        # Note: dict.update() returns None, so build the updated exploration
        # config explicitly before handing it to `config.exploration()`.
        explore = dict(config.exploration_config, random_timesteps=100)
        config.exploration(exploration_config=explore)

        num_iterations = 1

        # Test against all frameworks.
        for _ in framework_iterator(config, with_eager_tracing=True):
            algo = config.build(env="Pendulum-v1")
            for i in range(num_iterations):
                results = algo.train()
                check_train_results(results)
                print(results)
            check_compute_single_action(algo)
            # Ensure apply_gradient_fn is being called and updating global_step
            pol = algo.get_policy()
            if config.framework_str == "tf":
                a = pol.get_session().run(pol.global_step)
            else:
                a = pol.global_step
            check(a, 500)
            algo.stop()
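
The test methods on this page are shown without their module-level imports. A minimal setup sketch of what this one presumably relies on (the exact import paths are an assumption, based on RLlib's public test utilities):

# Presumed imports for the test method above (assumption, not part of the
# original snippet).
import ray.rllib.algorithms.ddpg as ddpg
from ray.rllib.utils.test_utils import (
    check,
    check_compute_single_action,
    check_train_results,
    framework_iterator,
)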
Example #2
    def test_ddpg_exploration_and_with_random_prerun(self):
        """Tests DDPG's Exploration (w/ random actions for n timesteps)."""

        core_config = ddpg.DDPGConfig().rollouts(num_rollout_workers=0)
        obs = np.array([0.0, 0.1, -0.1])

        # Test against all frameworks.
        for _ in framework_iterator(core_config):
            config = copy.deepcopy(core_config)
            # Default OUNoise setup.
            algo = config.build(env="Pendulum-v1")
            # Setting explore=False should always return the same action.
            a_ = algo.compute_single_action(obs, explore=False)
            check(algo.get_policy().global_timestep, 1)
            for i in range(50):
                a = algo.compute_single_action(obs, explore=False)
                check(algo.get_policy().global_timestep, i + 2)
                check(a, a_)
            # explore=None (default: explore) should return different actions.
            actions = []
            for i in range(50):
                actions.append(algo.compute_single_action(obs))
                check(algo.get_policy().global_timestep, i + 52)
            check(np.std(actions), 0.0, false=True)
            algo.stop()

            # Check randomness at beginning.
            config.exploration_config.update(
                {
                    # Act randomly at beginning ...
                    "random_timesteps": 50,
                    # Then act very closely to deterministic actions thereafter.
                    "ou_base_scale": 0.001,
                    "initial_scale": 0.001,
                    "final_scale": 0.001,
                }
            )

            algo = ddpg.DDPG(config=config, env="Pendulum-v1")
            # ts=0 (get a deterministic action as per explore=False).
            deterministic_action = algo.compute_single_action(obs, explore=False)
            check(algo.get_policy().global_timestep, 1)
            # ts=1-49 (in random window).
            random_a = []
            for i in range(1, 50):
                random_a.append(algo.compute_single_action(obs, explore=True))
                check(algo.get_policy().global_timestep, i + 1)
                check(random_a[-1], deterministic_action, false=True)
            self.assertTrue(np.std(random_a) > 0.5)

            # ts > 50 (a=deterministic_action + scale * N[0,1])
            for i in range(50):
                a = algo.compute_single_action(obs, explore=True)
                check(algo.get_policy().global_timestep, i + 51)
                check(a, deterministic_action, rtol=0.1)

            # ts >> 50 (BUT: explore=False -> expect deterministic action).
            for i in range(50):
                a = algo.compute_single_action(obs, explore=False)
                check(algo.get_policy().global_timestep, i + 101)
                check(a, deterministic_action)
            algo.stop()
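
For context, the exploration exercised above is Ornstein-Uhlenbeck noise layered on top of the deterministic policy action. A minimal, illustrative sketch of one OU step follows (the function and the theta/sigma values are assumptions for illustration, not RLlib's OrnsteinUhlenbeckNoise implementation):

import numpy as np

# Illustrative sketch only: one Euler step of the OU process that gets added
# (scaled) to the deterministic action during exploration.
def ou_step(state, theta=0.15, sigma=0.2, rng=None):
    rng = rng if rng is not None else np.random.default_rng()
    return state + theta * (0.0 - state) + sigma * rng.standard_normal(np.shape(state))

# After `random_timesteps` purely random actions, the explored action is
# roughly deterministic_action + scale * ou_state, with `scale` annealed from
# `initial_scale` to `final_scale` (which is why the tiny scales in the test
# above yield nearly deterministic actions).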
Example #3
    def test_ddpg_loss_function(self):
        """Tests DDPG loss function results across all frameworks."""
        config = ddpg.DDPGConfig()
        # Run locally.
        config.seed = 42
        config.num_workers = 0
        config.twin_q = True
        config.use_huber = True
        config.huber_threshold = 1.0
        config.gamma = 0.99
        # Keep this small (l2 regularization seems to introduce errors otherwise).
        config.l2_reg = 1e-10
        config.replay_buffer_config = {
            "type": "MultiAgentReplayBuffer",
            "capacity": 50000,
            "learning_starts": 0,
        }
        # Use very simple nets.
        config.actor_hiddens = [10]
        config.critic_hiddens = [10]
        # Make sure timing differences do not affect Algorithm.train().
        config.min_time_s_per_iteration = 0
        config.min_sample_timesteps_per_iteration = 100

        map_ = {
            # Normal net.
            "default_policy/actor_hidden_0/kernel": "policy_model.action_0."
            "_model.0.weight",
            "default_policy/actor_hidden_0/bias": "policy_model.action_0."
            "_model.0.bias",
            "default_policy/actor_out/kernel": "policy_model.action_out."
            "_model.0.weight",
            "default_policy/actor_out/bias": "policy_model.action_out._model.0.bias",
            "default_policy/sequential/q_hidden_0/kernel": "q_model.q_hidden_0"
            "._model.0.weight",
            "default_policy/sequential/q_hidden_0/bias": "q_model.q_hidden_0."
            "_model.0.bias",
            "default_policy/sequential/q_out/kernel": "q_model.q_out._model."
            "0.weight",
            "default_policy/sequential/q_out/bias": "q_model.q_out._model.0.bias",
            # -- twin.
            "default_policy/sequential_1/twin_q_hidden_0/kernel": "twin_"
            "q_model.twin_q_hidden_0._model.0.weight",
            "default_policy/sequential_1/twin_q_hidden_0/bias": "twin_"
            "q_model.twin_q_hidden_0._model.0.bias",
            "default_policy/sequential_1/twin_q_out/kernel": "twin_"
            "q_model.twin_q_out._model.0.weight",
            "default_policy/sequential_1/twin_q_out/bias": "twin_"
            "q_model.twin_q_out._model.0.bias",
            # Target net.
            "default_policy/actor_hidden_0_1/kernel": "policy_model.action_0."
            "_model.0.weight",
            "default_policy/actor_hidden_0_1/bias": "policy_model.action_0."
            "_model.0.bias",
            "default_policy/actor_out_1/kernel": "policy_model.action_out."
            "_model.0.weight",
            "default_policy/actor_out_1/bias": "policy_model.action_out._model"
            ".0.bias",
            "default_policy/sequential_2/q_hidden_0/kernel": "q_model."
            "q_hidden_0._model.0.weight",
            "default_policy/sequential_2/q_hidden_0/bias": "q_model."
            "q_hidden_0._model.0.bias",
            "default_policy/sequential_2/q_out/kernel": "q_model."
            "q_out._model.0.weight",
            "default_policy/sequential_2/q_out/bias": "q_model.q_out._model.0.bias",
            # -- twin.
            "default_policy/sequential_3/twin_q_hidden_0/kernel": "twin_"
            "q_model.twin_q_hidden_0._model.0.weight",
            "default_policy/sequential_3/twin_q_hidden_0/bias": "twin_"
            "q_model.twin_q_hidden_0._model.0.bias",
            "default_policy/sequential_3/twin_q_out/kernel": "twin_"
            "q_model.twin_q_out._model.0.weight",
            "default_policy/sequential_3/twin_q_out/bias": "twin_"
            "q_model.twin_q_out._model.0.bias",
        }

        env = SimpleEnv
        batch_size = 100
        obs_size = (batch_size, 1)
        actions = np.random.random(size=(batch_size, 1))

        # Batch of size=n.
        input_ = self._get_batch_helper(obs_size, actions, batch_size)

        # Simply compare loss values AND grads of all frameworks with each
        # other.
        prev_fw_loss = weights_dict = None
        expect_c, expect_a, expect_t = None, None, None
        # History of tf-updated NN-weights over n training steps.
        tf_updated_weights = []
        # History of input batches used.
        tf_inputs = []
        for fw, sess in framework_iterator(
            config, frameworks=("tf", "torch"), session=True
        ):
            # Generate Algorithm and get its default Policy object.
            algo = config.build(env=env)
            policy = algo.get_policy()
            p_sess = None
            if sess:
                p_sess = policy.get_session()

            # Set all weights (of all nets) to fixed values.
            if weights_dict is None:
                assert fw == "tf"  # Start with the tf vars-dict.
                weights_dict = policy.get_weights()
            else:
                assert fw == "torch"  # Then transfer that to torch Model.
                model_dict = self._translate_weights_to_torch(weights_dict, map_)
                policy.model.load_state_dict(model_dict)
                policy.target_model.load_state_dict(model_dict)

            if fw == "torch":
                # Actually convert to torch tensors.
                input_ = policy._lazy_tensor_dict(input_)
                input_ = {k: input_[k] for k in input_.keys()}

            # Only compute the expectation once; it should be the same anyway
            # for all frameworks.
            if expect_c is None:
                expect_c, expect_a, expect_t = self._ddpg_loss_helper(
                    input_,
                    weights_dict,
                    sorted(weights_dict.keys()),
                    fw,
                    gamma=config.gamma,
                    huber_threshold=config.huber_threshold,
                    l2_reg=config.l2_reg,
                    sess=sess,
                )

            # Get actual outs and compare to expectation AND previous
            # framework. c=critic loss, a=actor loss, t=td-error.
            if fw == "tf":
                c, a, t, tf_c_grads, tf_a_grads = p_sess.run(
                    [
                        policy.critic_loss,
                        policy.actor_loss,
                        policy.td_error,
                        policy._critic_optimizer.compute_gradients(
                            policy.critic_loss, policy.model.q_variables()
                        ),
                        policy._actor_optimizer.compute_gradients(
                            policy.actor_loss, policy.model.policy_variables()
                        ),
                    ],
                    feed_dict=policy._get_loss_inputs_dict(input_, shuffle=False),
                )
                # Check pure loss values.
                check(c, expect_c)
                check(a, expect_a)
                check(t, expect_t)

                tf_c_grads = [g for g, v in tf_c_grads]
                tf_a_grads = [g for g, v in tf_a_grads]

            elif fw == "torch":
                policy.loss(policy.model, None, input_)
                c, a, t = (
                    policy.get_tower_stats("critic_loss")[0],
                    policy.get_tower_stats("actor_loss")[0],
                    policy.get_tower_stats("td_error")[0],
                )
                # Check pure loss values.
                check(c, expect_c)
                check(a, expect_a)
                check(t, expect_t)

                # Test actor gradients.
                policy._actor_optimizer.zero_grad()
                assert all(v.grad is None for v in policy.model.q_variables())
                assert all(v.grad is None for v in policy.model.policy_variables())
                a.backward()
                # `actor_loss` depends on Q-net vars
                # (but not twin-Q-net vars!).
                assert not any(v.grad is None for v in policy.model.q_variables()[:4])
                assert all(v.grad is None for v in policy.model.q_variables()[4:])
                assert not all(
                    torch.mean(v.grad) == 0 for v in policy.model.policy_variables()
                )
                assert not all(
                    torch.min(v.grad) == 0 for v in policy.model.policy_variables()
                )
                # Compare with tf ones.
                torch_a_grads = [v.grad for v in policy.model.policy_variables()]
                for tf_g, torch_g in zip(tf_a_grads, torch_a_grads):
                    if tf_g.shape != torch_g.shape:
                        check(tf_g, np.transpose(torch_g.cpu()))
                    else:
                        check(tf_g, torch_g)

                # Test critic gradients.
                policy._critic_optimizer.zero_grad()
                assert all(
                    v.grad is None or torch.mean(v.grad) == 0.0
                    for v in policy.model.q_variables()
                )
                assert all(
                    v.grad is None or torch.min(v.grad) == 0.0
                    for v in policy.model.q_variables()
                )
                c.backward()
                assert not all(
                    torch.mean(v.grad) == 0 for v in policy.model.q_variables()
                )
                assert not all(
                    torch.min(v.grad) == 0 for v in policy.model.q_variables()
                )
                # Compare with tf ones.
                torch_c_grads = [v.grad for v in policy.model.q_variables()]
                for tf_g, torch_g in zip(tf_c_grads, torch_c_grads):
                    if tf_g.shape != torch_g.shape:
                        check(tf_g, np.transpose(torch_g.cpu()))
                    else:
                        check(tf_g, torch_g)
                # Compare the (still unchanged!) actor grads with the tf ones.
                torch_a_grads = [v.grad for v in policy.model.policy_variables()]
                for tf_g, torch_g in zip(tf_a_grads, torch_a_grads):
                    if tf_g.shape != torch_g.shape:
                        check(tf_g, np.transpose(torch_g.cpu()))
                    else:
                        check(tf_g, torch_g)

            # Store this framework's losses in prev_fw_loss to compare with
            # next framework's outputs.
            if prev_fw_loss is not None:
                check(c, prev_fw_loss[0])
                check(a, prev_fw_loss[1])
                check(t, prev_fw_loss[2])

            prev_fw_loss = (c, a, t)

            # Update weights from our batch (n times).
            for update_iteration in range(6):
                print("train iteration {}".format(update_iteration))
                if fw == "tf":
                    in_ = self._get_batch_helper(obs_size, actions, batch_size)
                    tf_inputs.append(in_)
                    # Set a fake-batch to use
                    # (instead of sampling from replay buffer).
                    buf = algo.local_replay_buffer
                    patch_buffer_with_fake_sampling_method(buf, in_)
                    algo.train()
                    updated_weights = policy.get_weights()
                    # Net must have changed.
                    if tf_updated_weights:
                        check(
                            updated_weights["default_policy/actor_hidden_0/kernel"],
                            tf_updated_weights[-1][
                                "default_policy/actor_hidden_0/kernel"
                            ],
                            false=True,
                        )
                    tf_updated_weights.append(updated_weights)

                # Compare with updated tf-weights. Must all be the same.
                else:
                    tf_weights = tf_updated_weights[update_iteration]
                    in_ = tf_inputs[update_iteration]
                    # Set a fake-batch to use
                    # (instead of sampling from replay buffer).
                    buf = algo.local_replay_buffer
                    patch_buffer_with_fake_sampling_method(buf, in_)
                    algo.train()
                    # Compare updated model and target weights.
                    for tf_key in tf_weights.keys():
                        tf_var = tf_weights[tf_key]
                        # Target model.
                        if re.search(
                            "actor_out_1|actor_hidden_0_1|sequential_[23]", tf_key
                        ):
                            torch_var = policy.target_model.state_dict()[map_[tf_key]]
                        # Model.
                        else:
                            torch_var = policy.model.state_dict()[map_[tf_key]]
                        if tf_var.shape != torch_var.shape:
                            check(tf_var, np.transpose(torch_var.cpu()), atol=0.1)
                        else:
                            check(tf_var, torch_var, atol=0.1)

            algo.stop()
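
The `_translate_weights_to_torch` helper called above is not shown in the snippet. A rough sketch of what it presumably does, given `map_` and the transpose checks in the gradient comparison (an assumption, not the test's actual helper):

import numpy as np
import torch

def translate_weights_to_torch(weights_dict, map_):
    # Assumed behavior: rename each tf variable to its torch counterpart via
    # `map_` and transpose 2-D kernels, since tf Dense kernels are stored as
    # (in, out) while torch.nn.Linear weights are (out, in).
    return {
        map_[k]: torch.as_tensor(
            np.transpose(v) if v.ndim == 2 else v, dtype=torch.float32
        )
        for k, v in weights_dict.items()
        if k in map_
    }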
Example #4
def _import_ddpg():
    import ray.rllib.algorithms.ddpg as ddpg

    return ddpg.DDPG, ddpg.DDPGConfig().to_dict()
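
A possible usage sketch (assumed, not from the original source): the helper defers the ddpg import until the algorithm is actually requested, then returns the class together with its default config as a plain dict.

# Hypothetical call site for the lazy-import helper above.
DDPG_cls, default_config = _import_ddpg()
print(DDPG_cls.__name__)      # -> "DDPG"
print(type(default_config))   # -> <class 'dict'>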