def test_shared_value_function_policy_for_discrete_container_action_space(
            self):
        # state_space (NN is a simple single fc-layer relu network (2 units), random biases, random weights).
        state_space = FloatBox(shape=(5, ), add_batch_rank=True)

        # action_space (complex nested container action space).
        action_space = dict(type="dict",
                            a=IntBox(2),
                            b=Dict(b1=IntBox(3), b2=IntBox(4)),
                            add_batch_rank=True)
        #flat_float_action_space = dict(
        #    type="dict",
        #    a=FloatBox(shape=(2,)),
        #    b=Dict(b1=FloatBox(shape=(3,)), b2=FloatBox(shape=(4,))),
        #    add_batch_rank=True
        #)

        # Policy with baseline action adapter.
        shared_value_function_policy = SharedValueFunctionPolicy(
            network_spec=config_from_path("configs/test_lrelu_nn.json"),
            action_space=action_space)
        test = ComponentTest(
            component=shared_value_function_policy,
            input_spaces=dict(
                nn_inputs=state_space,
                actions=action_space,
            ),
            action_space=action_space,
        )
        policy_params = test.read_variable_values(
            shared_value_function_policy.variable_registry)

        base_scope = "shared-value-function-policy/action-adapter-"

        # Some NN inputs (batch size=2).
        states = state_space.sample(size=2)
        # Raw NN-output.
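        # (Here relu(x, 0.1) is assumed to be the leaky-ReLU test helper whose negative slope
        # matches the lrelu config's alpha=0.1.)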
        expected_nn_output = relu(
            np.matmul(
                states, policy_params[
                    "shared-value-function-policy/test-network/hidden-layer/dense/kernel"]
            ), 0.1)
        test.test(("get_nn_outputs", states),
                  expected_outputs=expected_nn_output,
                  decimals=5)

        # Raw action layers' output.
        expected_action_layer_outputs = dict(
            a=np.matmul(
                expected_nn_output,
                policy_params[base_scope +
                              "0/action-network/action-layer/dense/kernel"]),
            b=dict(b1=np.matmul(
                expected_nn_output,
                policy_params[base_scope +
                              "1/action-network/action-layer/dense/kernel"]),
                   b2=np.matmul(
                       expected_nn_output, policy_params[
                           base_scope +
                           "2/action-network/action-layer/dense/kernel"])))
        test.test(("get_adapter_outputs", states),
                  expected_outputs=dict(
                      adapter_outputs=expected_action_layer_outputs,
                      nn_outputs=expected_nn_output),
                  decimals=5)

        # State-values.
        expected_state_value_output = np.matmul(
            expected_nn_output, policy_params[
                "shared-value-function-policy/value-function-node/dense-layer/dense/kernel"]
        )
        test.test(
            ("get_state_values", states, ["state_values"]),
            expected_outputs=dict(state_values=expected_state_value_output),
            decimals=5)

        # State-values plus adapter outputs (logits): one logit per action choice per batch item.
        test.test(("get_state_values_adapter_outputs_and_parameters", states,
                   ["state_values", "adapter_outputs"]),
                  expected_outputs=dict(
                      state_values=expected_state_value_output,
                      adapter_outputs=expected_action_layer_outputs),
                  decimals=5)

        # Softmaxed logits (probs), used for the log-likelihood checks below; the reported
        # `parameters` for IntBox (categorical) actions are the raw logits themselves.
        expected_probs_output = dict(
            a=softmax(expected_action_layer_outputs["a"], axis=-1),
            b=dict(b1=softmax(expected_action_layer_outputs["b"]["b1"],
                              axis=-1),
                   b2=softmax(expected_action_layer_outputs["b"]["b2"],
                              axis=-1)))
        test.test(("get_adapter_outputs_and_parameters", states,
                   ["adapter_outputs", "parameters"]),
                  expected_outputs=dict(
                      adapter_outputs=expected_action_layer_outputs,
                      parameters=expected_action_layer_outputs),
                  decimals=5)

        print("Probs: {}".format(expected_probs_output))

        # Action sample.
        expected_actions = dict(
            a=np.argmax(expected_action_layer_outputs["a"], axis=-1),
            b=dict(b1=np.argmax(expected_action_layer_outputs["b"]["b1"],
                                axis=-1),
                   b2=np.argmax(expected_action_layer_outputs["b"]["b2"],
                                axis=-1)))
        test.test(("get_action", states, ["action"]),
                  expected_outputs=dict(action=expected_actions))

        out = test.test(("get_action_and_log_likelihood", states))
        action = out["action"]
        llh = out["log_likelihood"]

        # Action log-likelihood.
        expected_action_llh_output = \
            np.log(np.array([expected_probs_output["a"][0][action["a"][0]],
                             expected_probs_output["a"][1][action["a"][1]]])) + \
            np.log(np.array([expected_probs_output["b"]["b1"][0][action["b"]["b1"][0]],
                             expected_probs_output["b"]["b1"][1][action["b"]["b1"][1]]])) + \
            np.log(np.array([expected_probs_output["b"]["b2"][0][action["b"]["b2"][0]],
                             expected_probs_output["b"]["b2"][1][action["b"]["b2"][1]]]))
        test.test(("get_log_likelihood", [states, action]),
                  expected_outputs=dict(
                      log_likelihood=expected_action_llh_output,
                      adapter_outputs=expected_action_layer_outputs),
                  decimals=5)
        recursive_assert_almost_equal(expected_action_llh_output,
                                      llh,
                                      decimals=5)
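        # A vectorized reference for the same joint log-likelihood (a sketch; it assumes the
        # container llh is simply the sum of the per-component categorical log-probs, exactly
        # as spelled out element-wise above).
        llh_ref = sum(
            np.log(np.take_along_axis(probs, acts[:, None], axis=-1)[:, 0])
            for probs, acts in [
                (expected_probs_output["a"], action["a"]),
                (expected_probs_output["b"]["b1"], action["b"]["b1"]),
                (expected_probs_output["b"]["b2"], action["b"]["b2"]),
            ]
        )
        recursive_assert_almost_equal(llh_ref, expected_action_llh_output, decimals=5)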

        # Stochastic sample.
        out = test.test(("get_stochastic_action", states),
                        expected_outputs=None)["action"]
        self.assertTrue(out["a"].dtype == np.int32)
        self.assertTrue(out["a"].shape == (2, ))
        self.assertTrue(out["b"]["b1"].dtype == np.int32)
        self.assertTrue(out["b"]["b1"].shape == (2, ))
        self.assertTrue(out["b"]["b2"].dtype == np.int32)
        self.assertTrue(out["b"]["b2"].shape == (2, ))

        # Deterministic sample.
        out = test.test(("get_deterministic_action", states),
                        expected_outputs=None)["action"]
        self.assertTrue(out["a"].dtype == np.int32)
        self.assertTrue(out["a"].shape == (2, ))
        self.assertTrue(out["b"]["b1"].dtype == np.int32)
        self.assertTrue(out["b"]["b1"].shape == (2, ))
        self.assertTrue(out["b"]["b2"].dtype == np.int32)
        self.assertTrue(out["b"]["b2"].shape == (2, ))

        # Distribution's entropy.
        out = test.test(("get_entropy", states),
                        expected_outputs=None)["entropy"]
        self.assertTrue(out["a"].dtype == np.float32)
        self.assertTrue(out["a"].shape == (2, ))
        self.assertTrue(out["b"]["b1"].dtype == np.float32)
        self.assertTrue(out["b"]["b1"].shape == (2, ))
        self.assertTrue(out["b"]["b2"].dtype == np.float32)
        self.assertTrue(out["b"]["b2"].shape == (2, ))
    def test_policy_for_discrete_container_action_space(self):
        # state_space.
        state_space = FloatBox(shape=(4, ), add_batch_rank=True)

        # Container action space.
        action_space = dict(type="dict",
                            a=BoolBox(),
                            b=IntBox(3),
                            add_batch_rank=True)

        policy = Policy(
            network_spec=config_from_path("configs/test_simple_nn.json"),
            action_space=action_space)
        test = ComponentTest(component=policy,
                             input_spaces=dict(
                                 nn_inputs=state_space,
                                 actions=action_space,
                             ),
                             action_space=action_space)
        policy_params = test.read_variable_values(policy.variable_registry)

        # Some NN inputs (batch size=32).
        batch_size = 32
        states = state_space.sample(batch_size)
        # Raw NN-output.
        expected_nn_output = np.matmul(
            states,
            policy_params["policy/test-network/hidden-layer/dense/kernel"])
        test.test(("get_nn_outputs", states),
                  expected_outputs=expected_nn_output,
                  decimals=6)

        # Raw action layers' output.
        expected_action_layer_outputs = dict(
            a=np.squeeze(
                np.matmul(
                    expected_nn_output, policy_params[
                        "policy/action-adapter-0/action-network/action-layer/dense/kernel"]
                )),
            b=np.matmul(
                expected_nn_output, policy_params[
                    "policy/action-adapter-1/action-network/action-layer/dense/kernel"]
            ))
        test.test(("get_adapter_outputs", states),
                  expected_outputs=dict(
                      adapter_outputs=expected_action_layer_outputs,
                      nn_outputs=expected_nn_output),
                  decimals=5)

        # Parameters: sigmoided prob for the Bernoulli ('a') component, raw logits for the
        # categorical ('b') component; log-probs are skipped (numerically unstable for small probs).
        expected_probs_output = dict(
            a=np.array(sigmoid(expected_action_layer_outputs["a"]),
                       dtype=np.float32),
            b=np.array(softmax(expected_action_layer_outputs["b"], axis=-1),
                       dtype=np.float32))
        test.test(("get_adapter_outputs_and_parameters", states,
                   ["adapter_outputs", "parameters"]),
                  expected_outputs=dict(
                      adapter_outputs=expected_action_layer_outputs,
                      parameters=dict(a=expected_probs_output["a"],
                                      b=expected_action_layer_outputs["b"])),
                  decimals=5)

        print("Probs: {}".format(expected_probs_output))

        expected_actions = dict(a=expected_probs_output["a"] > 0.5,
                                b=np.argmax(expected_action_layer_outputs["b"],
                                            axis=-1))
        test.test(("get_action", states, ["action"]),
                  expected_outputs=dict(action=expected_actions))

        out = test.test(("get_action_and_log_likelihood", states))
        action = out["action"]
        llh = out["log_likelihood"]

        # Action log-likelihood (sum of the composite llhs).
        expected_action_llh_output = \
            np.log(np.array([expected_probs_output["a"][i] if action["a"][i] else 1.0 - expected_probs_output["a"][i] for i in range(batch_size)])) + \
            np.log(np.array([expected_probs_output["b"][i][action["b"][i]] for i in range(batch_size)]))
        test.test(("get_log_likelihood", [states, action]),
                  expected_outputs=dict(
                      log_likelihood=expected_action_llh_output,
                      adapter_outputs=expected_action_layer_outputs),
                  decimals=5)
        recursive_assert_almost_equal(expected_action_llh_output,
                                      llh,
                                      decimals=5)
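        # Equivalent numpy-only reference (a sketch; it assumes Bernoulli llh = log(p) for True
        # and log(1 - p) for False, plus the categorical log-prob of the chosen 'b' action).
        llh_ref = np.where(action["a"],
                           np.log(expected_probs_output["a"]),
                           np.log(1.0 - expected_probs_output["a"])) + \
            np.log(np.take_along_axis(expected_probs_output["b"],
                                      action["b"][:, None], axis=-1)[:, 0])
        recursive_assert_almost_equal(llh_ref, expected_action_llh_output, decimals=5)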

        # Stochastic sample.
        out = test.test(
            ("get_stochastic_action", states),
            expected_outputs=None)  # dict(action=expected_actions))
        self.assertTrue(out["action"]["a"].dtype == np.bool_)
        self.assertTrue(out["action"]["a"].shape == (batch_size, ))
        self.assertTrue(out["action"]["b"].dtype == np.int32)
        self.assertTrue(out["action"]["b"].shape == (batch_size, ))

        # Deterministic sample.
        test.test(("get_deterministic_action", states),
                  expected_outputs=None)  # dict(action=expected_actions))
        self.assertTrue(out["action"]["a"].dtype == np.bool_)
        self.assertTrue(out["action"]["a"].shape == (batch_size, ))
        self.assertTrue(out["action"]["b"].dtype == np.int32)
        self.assertTrue(out["action"]["b"].shape == (batch_size, ))

        # Distribution's entropy.
        out = test.test(
            ("get_entropy", states),
            expected_outputs=None)  # dict(entropy=expected_h), decimals=3)
        self.assertTrue(out["entropy"]["a"].dtype == np.float32)
        self.assertTrue(out["entropy"]["a"].shape == (batch_size, ))
        self.assertTrue(out["entropy"]["b"].dtype == np.float32)
        self.assertTrue(out["entropy"]["b"].shape == (batch_size, ))
    def test_sac_agent_component_on_fake_env(self):
        config = config_from_path("configs/sac_component_for_fake_env_test.json")

        # Arbitrary state space, state should not be used in this example.
        state_space = FloatBox(shape=(2,))
        continuous_action_space = FloatBox(low=-1.0, high=1.0)
        terminal_space = BoolBox(add_batch_rank=True)
        policy = Policy.from_spec(config["policy"], action_space=continuous_action_space)
        policy.add_components(Synchronizable(), expose_apis="sync")
        q_function = ValueFunction.from_spec(config["value_function"])

        agent_component = SACAgentComponent(
            agent=None,
            policy=policy,
            q_function=q_function,
            preprocessor=PreprocessorStack.from_spec([]),
            memory=ReplayMemory.from_spec(config["memory"]),
            discount=config["discount"],
            initial_alpha=config["initial_alpha"],
            target_entropy=None,
            optimizer=AdamOptimizer.from_spec(config["optimizer"]),
            vf_optimizer=AdamOptimizer.from_spec(config["value_function_optimizer"], scope="vf-optimizer"),
            alpha_optimizer=None,
            q_sync_spec=SyncSpecification(sync_interval=10, sync_tau=1.0),
            num_q_functions=2
        )

        test = ComponentTest(
            component=agent_component,
            input_spaces=dict(
                states=state_space.with_batch_rank(),
                preprocessed_states=state_space.with_batch_rank(),
                actions=continuous_action_space.with_batch_rank(),
                rewards=FloatBox(add_batch_rank=True),
                next_states=state_space.with_batch_rank(),
                terminals=terminal_space,
                batch_size=int,
                preprocessed_s_prime=state_space.with_batch_rank(),
                importance_weights=FloatBox(add_batch_rank=True),
                preprocessed_next_states=state_space.with_batch_rank(),
                deterministic=bool,
                weights="variables:{}".format(policy.scope),
                # TODO: how to provide the space for multiple component variables?
                # q_weights=Dict(
                #    q_0="variables:{}".format(q_function.scope),
                #    q_1="variables:{}".format(agent_component._q_functions[1].scope),
                # )
            ),
            action_space=continuous_action_space,
            build_kwargs=dict(
                optimizer=agent_component._optimizer,
                build_options=dict(
                    vf_optimizer=agent_component.vf_optimizer,
                ),
            )
        )

        policy_loss = []
        vf_loss = []

        # This test simulates an env that always requires actions to be close to the max-pdf
        # value of a loc=0.5, scale=0.2 normal, regardless of any state inputs.
        # The component should learn to produce actions like that (close to 0.5).
        true_mean = 0.5
        target_dist = stats.norm(loc=true_mean, scale=0.2)
        batch_size = 100
        for _ in range(5000):
            action_sample = continuous_action_space.sample(batch_size)
            rewards = target_dist.pdf(action_sample)
            result = test.test(("update_from_external_batch", [
                state_space.sample(batch_size),
                action_sample,
                rewards,
                [True] * batch_size,
                state_space.sample(batch_size),
                [1.0] * batch_size  # importance
            ]))
            policy_loss.append(result["actor_loss"])
            vf_loss.append(result["critic_loss"])

        self.assertTrue(np.mean(policy_loss[:100]) > np.mean(policy_loss[-100:]))
        self.assertTrue(np.mean(vf_loss[:100]) > np.mean(vf_loss[-100:]))

        action_sample = np.linspace(-1, 1, batch_size)
        q_values = test.test(("get_q_values", [state_space.sample(batch_size), action_sample]))
        for q_val in q_values:
            q_val = q_val.flatten()
            np.testing.assert_allclose(q_val, target_dist.pdf(action_sample), atol=0.2)

        action_sample, _ = test.test(("action_from_preprocessed_state", [state_space.sample(batch_size), False]))
        action_sample = action_sample.flatten()
        np.testing.assert_allclose(np.mean(action_sample), true_mean, atol=0.1)
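        # The fake reward is the target Normal's pdf of the action, so the optimal action is
        # that distribution's mode (== its mean for a Normal). A quick sketch check of that
        # claim on a coarse action grid:
        grid = np.linspace(-1.0, 1.0, 201)
        self.assertAlmostEqual(grid[int(np.argmax(target_dist.pdf(grid)))], true_mean, places=2)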
    def test_metrics(self):
        """
        Tests metric collection for 1 and multiple environments.
        """
        agent_config = config_from_path("configs/apex_agent_cartpole.json")

        ray_spec = agent_config["execution_spec"].pop("ray_spec")
        ray_spec["worker_spec"]["worker_sample_size"] = 100
        worker_spec = ray_spec["worker_spec"]
        worker = RayValueWorker.as_remote().remote(agent_config,
                                                   ray_spec["worker_spec"],
                                                   self.env_spec,
                                                   auto_build=True)

        print("Testing statistics for 1 environment:")
        # Run for a while:
        task = worker.execute_and_get_timesteps.remote(100,
                                                       break_on_terminal=False)
        sleep(1)
        # Include a transition between calls.
        task = worker.execute_and_get_timesteps.remote(100,
                                                       break_on_terminal=False)
        sleep(1)
        # Retrieve result.
        result = ray.get(task)
        print('Task results:')
        print(result.get_metrics())

        # Get worker metrics.
        task = worker.get_workload_statistics.remote()
        result = ray.get(task)
        print("Worker statistics:")

        # In CartPole, the episode reward equals the number of timesteps -> both metrics must match.
        print("Cartpole episode rewards: {}".format(result["episode_rewards"]))
        print("Cartpole episode timesteps: {}".format(
            result["episode_timesteps"]))
        recursive_assert_almost_equal(result["episode_rewards"],
                                      result["episode_timesteps"])

        # Now repeat this but for multiple environments.
        print("Testing statistics for 4 environments:")
        worker_spec["num_worker_environments"] = 4
        worker_spec["num_background_environments"] = 2
        worker = RayValueWorker.as_remote().remote(agent_config,
                                                   ray_spec["worker_spec"],
                                                   self.env_spec,
                                                   auto_build=True)

        task = worker.execute_and_get_timesteps.remote(100,
                                                       break_on_terminal=False)
        sleep(1)
        result = ray.get(task)
        task = worker.execute_and_get_timesteps.remote(100,
                                                       break_on_terminal=False)
        sleep(1)
        result = ray.get(task)
        task = worker.get_workload_statistics.remote()
        result = ray.get(task)
        print("Multi-env statistics:")
        print("Cartpole episode rewards: {}".format(result["episode_rewards"]))
        print("Cartpole episode timesteps: {}".format(
            result["episode_timesteps"]))
        recursive_assert_almost_equal(result["episode_rewards"],
                                      result["episode_timesteps"])
    def test_shared_value_function_policy_for_discrete_action_space_with_time_rank_folding(self):
        # state_space (NN is a simple single fc-layer relu network (2 units), random biases, random weights).
        state_space = FloatBox(shape=(3,), add_batch_rank=True, add_time_rank=True)

        # action_space (4 possible actions).
        action_space = IntBox(4, add_batch_rank=True, add_time_rank=True)
        flat_float_action_space = FloatBox(shape=(4,), add_batch_rank=True, add_time_rank=True)

        # Policy with baseline action adapter AND batch-apply over the entire policy (NN + ActionAdapter + distr.).
        network_spec = config_from_path("configs/test_lrelu_nn.json")
        # Add folding to network.
        network_spec["fold_time_rank"] = True
        shared_value_function_policy = SharedValueFunctionPolicy(
            network_spec=network_spec,
            action_adapter_spec=dict(unfold_time_rank=True),
            action_space=action_space,
            value_unfold_time_rank=True
        )
        test = ComponentTest(
            component=shared_value_function_policy,
            input_spaces=dict(
                nn_input=state_space,
                actions=action_space,
                probabilities=flat_float_action_space,
                parameters=flat_float_action_space,
                logits=flat_float_action_space
            ),
            action_space=action_space,
        )
        policy_params = test.read_variable_values(shared_value_function_policy.variables)

        # Some NN inputs.
        states = state_space.sample(size=(2, 3))
        states_folded = np.reshape(states, newshape=(6, 3))
        # Raw NN-output over the time-folded input (leaky-ReLU; kernel read from the model, no bias added here).
        expected_nn_output = relu(np.matmul(
            states_folded, policy_params["shared-value-function-policy/test-network/hidden-layer/dense/kernel"]
        ), 0.1)
        test.test(("get_nn_output", states), expected_outputs=dict(output=expected_nn_output), decimals=5)

        # Raw action-layer output; unfolded below to shape=(2, 3, 4): 2=batch, 3=time steps, 4=action categories.
        expected_action_layer_output = np.matmul(
            expected_nn_output, policy_params["shared-value-function-policy/action-adapter-0/action-network/"
                                              "action-layer/dense/kernel"]
        )
        expected_action_layer_output = np.reshape(expected_action_layer_output, newshape=(2, 3, 4))
        test.test(("get_action_layer_output", states), expected_outputs=dict(output=expected_action_layer_output),
                  decimals=5)

        # State-values: one per batch item and time step (unfolded to shape (2, 3, 1)).
        expected_state_value_output = np.matmul(
            expected_nn_output,
            policy_params["shared-value-function-policy/value-function-node/dense-layer/dense/kernel"]
        )
        expected_state_value_output_unfolded = np.reshape(expected_state_value_output, newshape=(2, 3, 1))
        test.test(("get_state_values", states), expected_outputs=dict(state_values=expected_state_value_output_unfolded),
                  decimals=5)

        expected_action_layer_output_unfolded = np.reshape(expected_action_layer_output, newshape=(2, 3, 4))
        test.test(
            ("get_state_values_logits_probabilities_log_probs", states, ["state_values", "logits"]),
            expected_outputs=dict(
                state_values=expected_state_value_output_unfolded, logits=expected_action_layer_output_unfolded
            ), decimals=5
        )

        # Parameter (probabilities). Softmaxed logits.
        expected_probabilities_output = softmax(expected_action_layer_output_unfolded, axis=-1)
        test.test(("get_logits_probabilities_log_probs", states, ["logits", "probabilities"]), expected_outputs=dict(
            logits=expected_action_layer_output_unfolded,
            probabilities=expected_probabilities_output
        ), decimals=5)

        print("Probs: {}".format(expected_probabilities_output))

        expected_actions = np.argmax(expected_action_layer_output_unfolded, axis=-1)
        test.test(("get_action", states), expected_outputs=dict(action=expected_actions))

        # Action log-probs.
        expected_action_log_prob_output = np.log(np.array([[
            expected_probabilities_output[0][0][expected_actions[0][0]],
            expected_probabilities_output[0][1][expected_actions[0][1]],
            expected_probabilities_output[0][2][expected_actions[0][2]],
        ], [
            expected_probabilities_output[1][0][expected_actions[1][0]],
            expected_probabilities_output[1][1][expected_actions[1][1]],
            expected_probabilities_output[1][2][expected_actions[1][2]],
        ]]))
        test.test(("get_action_log_probs", [states, expected_actions]), expected_outputs=dict(
            action_log_probs=expected_action_log_prob_output, logits=expected_action_layer_output_unfolded
        ), decimals=5)
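        # A gather-based reference for the unfolded log-probs (a sketch that only uses
        # np.take_along_axis over the last axis of the (2, 3, 4) probability tensor).
        llh_ref = np.log(np.take_along_axis(
            expected_probabilities_output, expected_actions[..., None], axis=-1)[..., 0])
        np.testing.assert_array_almost_equal(llh_ref, expected_action_log_prob_output, decimal=5)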

        # Deterministic sample.
        out = test.test(("get_deterministic_action", states), expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.int32)
        self.assertTrue(out["action"].shape == (2, 3))  # Make sure output is unfolded.

        # Stochastic sample.
        out = test.test(("get_stochastic_action", states), expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.int32)
        self.assertTrue(out["action"].shape == (2, 3))  # Make sure output is unfolded.

        # Distribution's entropy.
        out = test.test(("get_entropy", states), expected_outputs=None)
        self.assertTrue(out["entropy"].dtype == np.float32)
        self.assertTrue(out["entropy"].shape == (2, 3))  # Make sure output is unfolded.
    def test_environment_stepper_on_2x2_grid_world(self):
        preprocessor_spec = [
            dict(type="reshape",
                 flatten=True,
                 flatten_categories=self.grid_world_2x2_action_space.
                 num_categories)
        ]
        network_spec = config_from_path("configs/test_simple_nn.json")
        # Try to find a NN that outputs greedy actions down in start state and right in state=1 (to reach goal).
        network_spec["layers"][0]["weights_spec"] = [[0.5, -0.5], [-0.1, 0.1],
                                                     [-0.2, 0.2], [-0.4, 0.2]]
        network_spec["layers"][0]["biases_spec"] = False
        exploration_spec = None
        actor_component = ActorComponent(
            preprocessor_spec,
            dict(network_spec=network_spec,
                 action_adapter_spec=dict(weights_spec=[[0.1, -0.5, 0.5, 0.1],
                                                        [0.4, 0.2, -0.2, 0.2]],
                                          biases_spec=False),
                 action_space=self.grid_world_2x2_action_space,
                 deterministic=True), exploration_spec)
        environment_stepper = EnvironmentStepper(
            environment_spec=dict(type="grid_world", world="2x2"),
            actor_component_spec=actor_component,
            state_space=self.grid_world_2x2_state_space,
            reward_space="float32",
            add_action_probs=True,
            action_probs_space=self.grid_world_2x2_action_probs_space,
            num_steps=5)

        test = ComponentTest(
            component=environment_stepper,
            action_space=self.grid_world_2x2_action_space,
        )

        # Step 5 times through the Env and collect results.
        expected = (
            np.array([False, True, False, True, False]),  # t_
            np.array([0, 1, 0, 1, 0, 1]),  # s' (raw)
            np.array([[0.21869287, 0.17905058, 0.36056358, 0.24169299],
                      [0.2547221, 0.2651175, 0.23048209, 0.24967825],
                      [0.21869287, 0.17905058, 0.36056358, 0.24169299],
                      [0.2547221, 0.2651175, 0.23048209, 0.24967825],
                      [0.21869287, 0.17905058, 0.36056358, 0.24169299]],
                     dtype=np.float32))
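        # (The two distinct probability rows presumably correspond to the one-hot encodings of
        # grid states 0 and 1 pushed through the fixed NN and action-adapter weights above.)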
        out = test.test("step", expected_outputs=expected, decimals=2)
        print(out)

        # Step again; check whether the stitching of states etc. across calls works.
        expected = (
            np.array([True, False, True, False, True]),  # t_
            np.array([1, 0, 1, 0, 1, 0]),  # s' (raw)
            np.array([[0.2547221, 0.2651175, 0.23048209, 0.24967825],
                      [0.21869287, 0.17905058, 0.36056358, 0.24169299],
                      [0.2547221, 0.2651175, 0.23048209, 0.24967825],
                      [0.21869287, 0.17905058, 0.36056358, 0.24169299],
                      [0.2547221, 0.2651175, 0.23048209, 0.24967825]],
                     dtype=np.float32))
        out = test.test("step", expected_outputs=expected)
        print(out)

        # Make sure we close the session (to shut down the Env on the server).
        test.terminate()
    def test_policy_for_discrete_action_space(self):
        # state_space (NN is a simple single fc-layer relu network (2 units), random biases, random weights).
        state_space = FloatBox(shape=(4, ), add_batch_rank=True)

        # action_space (5 possible actions).
        action_space = IntBox(5, add_batch_rank=True)

        policy = Policy(
            network_spec=config_from_path("configs/test_simple_nn.json"),
            action_space=action_space)
        test = ComponentTest(component=policy,
                             input_spaces=dict(nn_input=state_space),
                             action_space=action_space)
        policy_params = test.read_variable_values(policy.variable_registry)

        # Some NN inputs (4 input nodes, batch size=2).
        states = np.array([[-0.08, 0.4, -0.05, -0.55],
                           [13.0, -14.0, 10.0, -16.0]])
        # Raw NN-output.
        expected_nn_output = np.matmul(
            states,
            policy_params["policy/test-network/hidden-layer/dense/kernel"])
        test.test(("get_nn_output", states),
                  expected_outputs=expected_nn_output,
                  decimals=6)

        # Raw action layer output; Expected shape=(2,5): 2=batch, 5=action categories
        expected_action_layer_output = np.matmul(
            expected_nn_output,
            policy_params["policy/action-adapter/action-layer/dense/kernel"])
        expected_action_layer_output = np.reshape(expected_action_layer_output,
                                                  newshape=(2, 5))
        test.test(("get_adapter_outputs", states, ["adapter_outputs"]),
                  expected_outputs=dict(
                      adapter_outputs=expected_action_layer_output),
                  decimals=5)

        expected_actions = np.argmax(expected_action_layer_output, axis=-1)
        test.test(("get_action", states, ["action"]),
                  expected_outputs=dict(action=expected_actions))

        # Logits, parameters (probs) and log-probs (all three output slots are checked here).
        expected_probabilities_output = softmax(expected_action_layer_output,
                                                axis=-1)
        test.test(("get_adapter_outputs_and_parameters", states, [0, 1, 2]),
                  expected_outputs=dict(
                      adapter_outputs=expected_action_layer_output,
                      parameters=expected_probabilities_output,
                      log_probs=np.log(expected_probabilities_output)),
                  decimals=5)

        print("Probs: {}".format(expected_probabilities_output))

        # Deterministic sample.
        out = test.test(("get_deterministic_action", states),
                        expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.int32)
        self.assertTrue(out["action"].shape == (2, ))

        # Stochastic sample.
        out = test.test(("get_stochastic_action", states),
                        expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.int32)
        self.assertTrue(out["action"].shape == (2, ))

        # Distribution's entropy.
        out = test.test(("get_entropy", states), expected_outputs=None)
        self.assertTrue(out["entropy"].dtype == np.float32)
        self.assertTrue(out["entropy"].shape == (2, ))
    def test_dqn_functionality(self):
        """
        Creates a DQNAgent and runs it for a few steps in a GridWorld to rigorously test
        all steps of the learning process.
        """
        env = GridWorld(world="2x2", save_mode=True)  # no holes, just fire
        agent = Agent.from_spec(  # type: DQNAgent
            config_from_path("configs/dqn_agent_for_functionality_test.json"),
            double_q=True,
            dueling_q=True,
            state_space=env.state_space,
            action_space=env.action_space,
            store_last_memory_batch=True,
            store_last_q_table=True,
            discount=0.95
        )
        worker = SingleThreadedWorker(env_spec=lambda: GridWorld(world="2x2", save_mode=True), agent=agent)
        test = AgentTest(worker=worker)

        # Helper python DQNLossFunc object.
        loss_func = DQNLossFunction(backend="python", double_q=True, discount=agent.discount)
        loss_func.when_input_complete(input_spaces=dict(
            loss_per_item=[
                spaces.FloatBox(shape=(4,), add_batch_rank=True),
                spaces.IntBox(4, add_batch_rank=True),
                spaces.FloatBox(add_batch_rank=True),
                spaces.BoolBox(add_batch_rank=True),
                spaces.FloatBox(shape=(4,), add_batch_rank=True),
                spaces.FloatBox(shape=(4,), add_batch_rank=True)
            ]
        ), action_space=env.action_space)

        matrix1_qnet = np.array([[0.9] * 2] * 4)
        matrix2_qnet = np.array([[0.8] * 5] * 2)
        matrix1_target_net = np.array([[0.9] * 2] * 4)
        matrix2_target_net = np.array([[0.8] * 5] * 2)
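        # (Shape note, as assumed here: matrix1_* maps the 4-dim one-hot grid state to 2 hidden
        # units; matrix2_* maps those 2 units to 5 output nodes, presumably 1 state-value plus
        # 4 advantages for the dueling action adapter.)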

        a = self._calculate_action(0, matrix1_qnet, matrix2_qnet)

        # 1st step -> Expect insert into python-buffer.
        # action: up (0)
        test.step(1, reset=True)
        # Environment's new state.
        test.check_env("state", 0)
        # Agent's buffer.
        test.check_agent("states_buffer", [[1.0, 0.0, 0.0, 0.0]], key_or_index="env_0")  # <- prev state (preprocessed)
        test.check_agent("actions_buffer", [a],  key_or_index="env_0")
        test.check_agent("rewards_buffer", [-1.0], key_or_index="env_0")
        test.check_agent("terminals_buffer", [False], key_or_index="env_0")
        # Memory contents.
        test.check_var("replay-memory/index", 0)
        test.check_var("replay-memory/size", 0)
        test.check_var("replay-memory/memory/states", np.array([[0] * 4] * agent.memory.capacity))
        test.check_var("replay-memory/memory/actions", np.array([0] * agent.memory.capacity))
        test.check_var("replay-memory/memory/rewards", np.array([0] * agent.memory.capacity))
        test.check_var("replay-memory/memory/terminals", np.array([False] * agent.memory.capacity))
        # Check policy and target-policy weights (should be the same).
        test.check_var("policy/neural-network/hidden/dense/kernel", matrix1_qnet)
        test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_qnet)
        test.check_var("policy/dueling-action-adapter/action-layer/dense/kernel", matrix2_qnet)
        test.check_var("target-policy/dueling-action-adapter/action-layer/dense/kernel", matrix2_qnet)

        # 2nd step -> expect insert into memory (and python buffer should be empty again).
        # action: up (0)
        # Also check the policy and target policy values (Should be equal at this point).
        test.step(1)
        test.check_env("state", 0)
        test.check_agent("states_buffer", [], key_or_index="env_0")
        test.check_agent("actions_buffer", [], key_or_index="env_0")
        test.check_agent("rewards_buffer", [], key_or_index="env_0")
        test.check_agent("terminals_buffer", [], key_or_index="env_0")
        test.check_var("replay-memory/index", 2)
        test.check_var("replay-memory/size", 2)
        test.check_var("replay-memory/memory/states", np.array([[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]] +
                                                               [[0.0, 0.0, 0.0, 0.0]] * (agent.memory.capacity - 2)))
        test.check_var("replay-memory/memory/actions", np.array([0, 0] + [0] * (agent.memory.capacity - 2)))
        test.check_var("replay-memory/memory/rewards", np.array([-1.0, -1.0] + [0.0] * (agent.memory.capacity - 2)))
        test.check_var("replay-memory/memory/terminals", np.array([False, True] + [False] * (agent.memory.capacity - 2)))
        # Check policy and target-policy weights (should be the same).
        test.check_var("policy/neural-network/hidden/dense/kernel", matrix1_qnet)
        test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_qnet)
        test.check_var("policy/dueling-action-adapter/action-layer/dense/kernel", matrix2_qnet)
        test.check_var("target-policy/dueling-action-adapter/action-layer/dense/kernel", matrix2_qnet)

        # 3rd and 4th step -> expect another insert into memory (and python buffer should be empty again).
        # actions: down (2), up (0)  <- exploring is True = more random actions
        # Expect an update to the policy variables (leave target as is (no sync yet)).
        test.step(2, use_exploration=True)
        test.check_env("state", 0)
        test.check_agent("states_buffer", [], key_or_index="env_0")
        test.check_agent("actions_buffer", [], key_or_index="env_0")
        test.check_agent("rewards_buffer", [], key_or_index="env_0")
        test.check_agent("terminals_buffer", [], key_or_index="env_0")
        test.check_var("replay-memory/index", 4)
        test.check_var("replay-memory/size", 4)
        test.check_var("replay-memory/memory/states", np.array([[1.0, 0.0, 0.0, 0.0]] * 3 +
                                                               [[0.0, 1.0, 0.0, 0.0]] +
                                                               [[0.0, 0.0, 0.0, 0.0]] * (agent.memory.capacity - 4)))
        test.check_var("replay-memory/memory/actions", np.array([0, 0, 2, 0] + [0] * (agent.memory.capacity - 4)))
        test.check_var("replay-memory/memory/rewards", np.array([-1.0] * 4 +  # + [-3.0] +
                                                                [0.0] * (agent.memory.capacity - 4)))
        test.check_var("replay-memory/memory/terminals", np.array([False, True] * 2 +
                                                                  [False] * (agent.memory.capacity - 4)))
        # Get the latest memory batch.
        expected_batch = dict(
            states=np.array([[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]]),
            actions=np.array([0, 1]),
            rewards=np.array([-1.0, -3.0]),
            terminals=np.array([False, True]),
            next_states=np.array([[1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]])
        )
        test.check_agent("last_memory_batch", expected_batch)

        # Calculate the weight updates and check against actually update weights by the AgentDQN.
        mat_updated = self._helper_update_matrix(expected_batch, matrix1_qnet, matrix2_qnet, matrix1_target_net,
                                                 matrix2_target_net, agent, loss_func)
        # Check policy and target-policy weights (policy should be updated now).
        test.check_var("policy/neural-network/hidden/dense/kernel", mat_updated[0], decimals=4)
        test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_target_net)
        test.check_var("policy/dueling-action-adapter/action-layer/dense/kernel", mat_updated[1], decimals=4)
        test.check_var("target-policy/dueling-action-adapter/action-layer/dense/kernel", matrix2_target_net)

        matrix1_qnet = mat_updated[0]
        matrix2_qnet = mat_updated[1]

        # 5th step -> Another buffer update check.
        # action: down (2) (weights have been updated -> different actions)
        test.step(1)
        test.check_env("state", 3)
        test.check_agent("states_buffer", [], key_or_index="env_0")  # <- all empty b/c we reached end of episode (buffer gets force-flushed)
        test.check_agent("actions_buffer", [], key_or_index="env_0")
        test.check_agent("rewards_buffer", [], key_or_index="env_0")
        test.check_agent("terminals_buffer", [], key_or_index="env_0")
        test.check_agent("last_memory_batch", expected_batch)
        test.check_var("replay-memory/index", 5)
        test.check_var("replay-memory/size", 5)
        test.check_var("replay-memory/memory/states", np.array([[1.0, 0.0, 0.0, 0.0]] * 4 + [[0.0, 0.0, 1.0, 0.0]] +
                                                               [[0.0, 0.0, 0.0, 0.0]] * (agent.memory.capacity - 5)))
        test.check_var("replay-memory/memory/actions", np.array([0, 0, 0, 1, 2, 0]))
        test.check_var("replay-memory/memory/rewards", np.array([-1.0] * 3 + [-3.0, 1.0, 0.0]))
        test.check_var("replay-memory/memory/terminals", np.array([False, True] * 2 + [True, False]))
        test.check_var("policy/neural-network/hidden/dense/kernel", matrix1_qnet, decimals=4)
        test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_target_net)
        test.check_var("policy/dueling-action-adapter/action-layer/dense/kernel", matrix2_qnet, decimals=4)
        test.check_var("target-policy/dueling-action-adapter/action-layer/dense/kernel", matrix2_target_net)

        # 6th/7th step (with exploration enabled) -> Another buffer update check.
        # action: up, down (0, 2)
        test.step(2, use_exploration=True)
        test.check_env("state", 1)
        test.check_agent("states_buffer", [], key_or_index="env_0")  # <- all empty again; flushed after 6th step (when buffer was full).
        test.check_agent("actions_buffer", [], key_or_index="env_0")
        test.check_agent("rewards_buffer", [], key_or_index="env_0")
        test.check_agent("terminals_buffer", [], key_or_index="env_0")
        test.check_agent("last_memory_batch", expected_batch)
        test.check_var("replay-memory/index", 1)  # index has been rolled over (memory capacity is 6)
        test.check_var("replay-memory/size", 6)
        test.check_var("replay-memory/memory/states", np.array([[1.0, 0.0, 0.0, 0.0]] * 4 +
                                                               [[0.0, 0.0, 1.0, 0.0]] +
                                                               [[1.0, 0.0, 0.0, 0.0]]))
        test.check_var("replay-memory/memory/actions", np.array([2, 0, 0, 1, 2, 0]))
        test.check_var("replay-memory/memory/rewards", np.array([-1.0] * 3 + [-3.0, 1.0, -1.0]))
        test.check_var("replay-memory/memory/terminals", np.array([True, True, False, True, True, False]))

        test.check_var("policy/neural-network/hidden/dense/kernel", matrix1_qnet, decimals=4)
        test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_target_net)
        test.check_var("policy/dueling-action-adapter/action-layer/dense/kernel", matrix2_qnet, decimals=4)
        test.check_var("target-policy/dueling-action-adapter/action-layer/dense/kernel", matrix2_target_net)

        # 8th step -> Another buffer update check and weights update and sync.
        # action: down (2)
        test.step(1)
        test.check_env("state", 1)
        test.check_agent("states_buffer", [1], key_or_index="env_0")
        test.check_agent("actions_buffer", [2], key_or_index="env_0")
        test.check_agent("rewards_buffer", [-1.0], key_or_index="env_0")
        test.check_agent("terminals_buffer", [False], key_or_index="env_0")
        expected_batch = dict(
            states=np.array([[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]]),
            actions=np.array([0, 1]),
            rewards=np.array([-1.0, -3.0]),
            terminals=np.array([True, True]),
            next_states=np.array([[1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0]])  # TODO: <- This is wrong and must be fixed (next-state of first item is from a previous insert and unrelated to first item)
        )
        test.check_agent("last_memory_batch", expected_batch)
        test.check_var("replay-memory/index", 1)
        test.check_var("replay-memory/size", 6)
        test.check_var("replay-memory/memory/states", np.array([[1.0, 0.0, 0.0, 0.0]] * 4 +
                                                               [[0.0, 0.0, 1.0, 0.0]] +
                                                               [[1.0, 0.0, 0.0, 0.0]]))
        test.check_var("replay-memory/memory/actions", np.array([2, 0, 0, 1, 2, 0]))
        test.check_var("replay-memory/memory/rewards", np.array([-1.0, -1.0, -1.0, -3.0, 1.0, -1.0]))
        test.check_var("replay-memory/memory/terminals", np.array([True, True, False, True, True, False]))

        # Assume that the sync happens first (matrices are already the same when updating).
        mat_updated = self._helper_update_matrix(expected_batch, matrix1_qnet, matrix2_qnet, matrix1_qnet,
                                                 matrix2_qnet, agent, loss_func)

        # Now target-net should be again 1 step behind policy-net.
        test.check_var("policy/neural-network/hidden/dense/kernel", mat_updated[0], decimals=2)
        test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_qnet, decimals=2)  # again: old matrix
        test.check_var("policy/dueling-action-adapter/action-layer/dense/kernel", mat_updated[1], decimals=2)
        test.check_var("target-policy/dueling-action-adapter/action-layer/dense/kernel", matrix2_qnet, decimals=2)
    def test_sac_agent_component_functionality(self):
        config = config_from_path("configs/sac_component_for_fake_env_test.json")

        # Arbitrary state space, state should not be used in this example.
        state_space = FloatBox(shape=(8,))
        continuous_action_space = FloatBox(shape=(1,), low=-2.0, high=2.0)
        terminal_space = BoolBox(add_batch_rank=True)
        rewards_space = FloatBox(add_batch_rank=True)
        policy = Policy.from_spec(config["policy"], action_space=continuous_action_space)
        policy.add_components(Synchronizable(), expose_apis="sync")
        q_function = SACValueNetwork.from_spec(config["value_function"])

        class DummyAgent(object):
            def __init__(self):
                self.graph_executor = None

        dummy_agent = DummyAgent()
        agent_component = SACAgentComponent(
            agent=dummy_agent,
            policy=policy,
            q_function=q_function,
            preprocessor=PreprocessorStack.from_spec([]),
            memory=ReplayMemory.from_spec(config["memory"]),
            discount=config["discount"],
            initial_alpha=config["initial_alpha"],
            target_entropy=None,
            optimizer=AdamOptimizer.from_spec(config["optimizer"]),
            vf_optimizer=AdamOptimizer.from_spec(config["value_function_optimizer"], scope="vf-optimizer"),
            alpha_optimizer=None,
            q_sync_spec=SyncSpecification(sync_interval=10, sync_tau=1.0),
            num_q_functions=2
        )

        test = ComponentTest(
            component=agent_component,
            input_spaces=dict(
                increment=int,
                episode_reward=float,
                states=state_space.with_batch_rank(),
                preprocessed_states=state_space.with_batch_rank(),
                env_actions=continuous_action_space.with_batch_rank(),
                actions=continuous_action_space.with_batch_rank(),
                rewards=rewards_space,
                next_states=state_space.with_batch_rank(),
                terminals=terminal_space,
                batch_size=int,
                importance_weights=FloatBox(add_batch_rank=True),
                deterministic=bool,
                weights="variables:{}".format(policy.scope),
                time_percentage=float
                # TODO: how to provide the space for multiple component variables?
                #q_weights=Dict(
                #    q_0="variables:{}".format(q_function.scope),
                #    q_1="variables:{}".format(agent_component._q_functions[1].scope),
                #)
            ),
            action_space=continuous_action_space,
            build_kwargs=dict(
                optimizer=agent_component._optimizer,
                build_options=dict(
                    vf_optimizer=agent_component.vf_optimizer,
                ),
            ),
            auto_build=False
        )
        dummy_agent.graph_executor = test.graph_executor
        test.build()

        batch_size = 10
        action_sample = continuous_action_space.with_batch_rank().sample(batch_size)
        rewards = rewards_space.sample(batch_size)
        # Check, whether an update runs ok.
        result = test.test(("update_from_external_batch", [
            state_space.sample(batch_size),
            action_sample,
            rewards,
            [True] * batch_size,
            state_space.sample(batch_size),
            [1.0] * batch_size  # importance
        ]))
        self.assertTrue(result["actor_loss"].dtype == np.float32)
        self.assertTrue(result["critic_loss"].dtype == np.float32)

        action_sample = np.linspace(-1, 1, batch_size).reshape((batch_size, 1))
        q_values = test.test(("get_q_values", [state_space.sample(batch_size), action_sample]))
        for q_val in q_values:
            self.assertTrue(q_val.dtype == np.float32)
            self.assertTrue(q_val.shape == (batch_size, 1))

        action_sample, _ = test.test(("action_from_preprocessed_state", [state_space.sample(batch_size), False]))
        self.assertTrue(action_sample.dtype == np.float32)
        self.assertTrue(action_sample.shape == (batch_size, 1))
    def test_batched_backend_equivalence(self):
        """
        Tests whether the Python and the TensorFlow backend return the same output
        for a standard DQN-style preprocessing stack.
        """
        return  # Test is currently disabled; remove this early return to run it.
        env_spec = dict(type="openai",
                        gym_env="Pong-v0",
                        frameskip=4,
                        max_num_noops=30,
                        episodic_life=True)
        # Test with batching because we assume vector environments to be the normal case going forward.
        env = SequentialVectorEnv(num_environments=4,
                                  env_spec=env_spec,
                                  num_background_envs=2)
        in_space = env.state_space

        agent_config = config_from_path("configs/ray_apex_for_pong.json")
        preprocessing_spec = deepcopy(agent_config["preprocessing_spec"])

        # Set up python preprocessor.
        scopes = [preprocessor["scope"] for preprocessor in preprocessing_spec]
        # Set backend to python.
        for spec in preprocessing_spec:
            spec["backend"] = "python"
        python_processor = PreprocessorStack(*preprocessing_spec,
                                             backend="python")
        for sub_comp_scope in scopes:
            python_processor.sub_components[sub_comp_scope].create_variables(
                dict(preprocessing_inputs=in_space))
        python_processor.reset()

        # To have the use case we considered so far, use agent interface for TF backend.
        agent_config.pop("type")
        agent = ApexAgent(state_space=env.state_space,
                          action_space=env.action_space,
                          **agent_config)

        # Generate a few states from random reset points and test whether the preprocessed states are (almost) equal.
        states = np.asarray(env.reset_all())
        actions, agent_preprocessed_states = agent.get_action(
            states=states,
            use_exploration=False,
            extra_returns="preprocessed_states")
        print("TensorFlow preprocessed shape: {}".format(
            np.asarray(agent_preprocessed_states).shape))
        python_preprocessed_states = python_processor.preprocess(states)
        print("Python preprocessed shape: {}".format(
            np.asarray(python_preprocessed_states).shape))
        print("Asserting (almost) equal values:")
        for tf_state, python_state in zip(agent_preprocessed_states,
                                          python_preprocessed_states):
            flat_tf = np.ndarray.flatten(tf_state)
            flat_python = np.ndarray.flatten(python_state)
            for x, y in zip(flat_tf, flat_python):
                recursive_assert_almost_equal(x, y, decimals=3)

        states, _, _, _ = env.step(actions)
        actions, agent_preprocessed_states = agent.get_action(
            states=states,
            use_exploration=False,
            extra_returns="preprocessed_states")
        print("TensorFlow preprocessed shape: {}".format(
            np.asarray(agent_preprocessed_states).shape))
        python_preprocessed_states = python_processor.preprocess(states)
        print("Python preprocessed shape: {}".format(
            np.asarray(python_preprocessed_states).shape))
        print("Asserting (almost) equal values:")
        recursive_assert_almost_equal(agent_preprocessed_states,
                                      python_preprocessed_states,
                                      decimals=3)
    def test_policy_for_discrete_container_action_space(self):
        # state_space.
        state_space = FloatBox(shape=(4, ), add_batch_rank=True)

        # Container action space.
        action_space = dict(type="dict",
                            a=IntBox(2),
                            b=IntBox(3),
                            add_batch_rank=True)
        flat_float_action_space = dict(type="dict",
                                       a=FloatBox(shape=(2, )),
                                       b=FloatBox(shape=(3, )),
                                       add_batch_rank=True)

        policy = Policy(
            network_spec=config_from_path("configs/test_simple_nn.json"),
            action_space=action_space)
        test = ComponentTest(component=policy,
                             input_spaces=dict(
                                 nn_input=state_space,
                                 actions=action_space,
                                 probabilities=flat_float_action_space,
                                 logits=flat_float_action_space),
                             action_space=action_space)
        policy_params = test.read_variable_values(policy.variables)

        # Some NN inputs (batch size=2).
        states = state_space.sample(2)
        # Raw NN-output.
        expected_nn_output = np.matmul(
            states,
            policy_params["policy/test-network/hidden-layer/dense/kernel"])
        test.test(("get_nn_output", states),
                  expected_outputs=dict(output=expected_nn_output),
                  decimals=6)

        # Raw action layers' output.
        expected_action_layer_outputs = dict(
            a=np.matmul(
                expected_nn_output, policy_params[
                    "policy/action-adapter-0/action-network/action-layer/dense/kernel"]
            ),
            b=np.matmul(
                expected_nn_output, policy_params[
                    "policy/action-adapter-1/action-network/action-layer/dense/kernel"]
            ))
        test.test(("get_action_layer_output", states),
                  expected_outputs=dict(output=expected_action_layer_outputs),
                  decimals=5)

        # Logits, parameters (probs) and skip log-probs (numerically unstable for small probs).
        expected_probabilities_output = dict(
            a=np.array(softmax(expected_action_layer_outputs["a"], axis=-1),
                       dtype=np.float32),
            b=np.array(softmax(expected_action_layer_outputs["b"], axis=-1),
                       dtype=np.float32))
        test.test(
            ("get_logits_probabilities_log_probs", states,
             ["logits", "probabilities"]),
            expected_outputs=dict(logits=expected_action_layer_outputs,
                                  probabilities=expected_probabilities_output),
            decimals=5)

        print("Probs: {}".format(expected_probabilities_output))

        expected_actions = dict(a=np.argmax(expected_action_layer_outputs["a"],
                                            axis=-1),
                                b=np.argmax(expected_action_layer_outputs["b"],
                                            axis=-1))
        test.test(("get_action", states),
                  expected_outputs=dict(action=expected_actions))

        # Stochastic sample.
        out = test.test(
            ("get_stochastic_action", states),
            expected_outputs=None)  # dict(action=expected_actions))
        self.assertTrue(out["action"]["a"].dtype == np.int32)
        self.assertTrue(out["action"]["a"].shape == (2, ))
        self.assertTrue(out["action"]["b"].dtype == np.int32)
        self.assertTrue(out["action"]["b"].shape == (2, ))

        # Deterministic sample.
        test.test(("get_deterministic_action", states),
                  expected_outputs=None)  # dict(action=expected_actions))
        self.assertTrue(out["action"]["a"].dtype == np.int32)
        self.assertTrue(out["action"]["a"].shape == (2, ))
        self.assertTrue(out["action"]["b"].dtype == np.int32)
        self.assertTrue(out["action"]["b"].shape == (2, ))

        # Distribution's entropy.
        out = test.test(
            ("get_entropy", states),
            expected_outputs=None)  # dict(entropy=expected_h), decimals=3)
        self.assertTrue(out["entropy"]["a"].dtype == np.float32)
        self.assertTrue(out["entropy"]["a"].shape == (2, ))
        self.assertTrue(out["entropy"]["b"].dtype == np.float32)
        self.assertTrue(out["entropy"]["b"].shape == (2, ))

        # Action log-probs.
        expected_action_log_prob_output = dict(
            a=np.log(
                np.array([
                    expected_probabilities_output["a"][0][expected_actions["a"]
                                                          [0]],
                    expected_probabilities_output["a"][1][expected_actions["a"]
                                                          [1]]
                ])),
            b=np.log(
                np.array([
                    expected_probabilities_output["b"][0][expected_actions["b"]
                                                          [0]],
                    expected_probabilities_output["b"][1][expected_actions["b"]
                                                          [1]]
                ])),
        )
        test.test(("get_action_log_probs", [states, expected_actions]),
                  expected_outputs=dict(
                      action_log_probs=expected_action_log_prob_output,
                      logits=expected_action_layer_outputs),
                  decimals=5)
    def test_policy_for_bounded_continuous_action_space(self):
        """
        https://github.com/rlgraph/rlgraph/issues/43
        """
        nn_input_space = FloatBox(shape=(4,), add_batch_rank=True)
        action_space = FloatBox(low=-1.0, high=1.0, shape=(1,), add_batch_rank=True)
        # Double the shape for alpha/beta params.
        action_space_parameters = FloatBox(shape=(2,), add_batch_rank=True)

        policy = Policy(network_spec=config_from_path("configs/test_simple_nn.json"), action_space=action_space)
        test = ComponentTest(
            component=policy,
            input_spaces=dict(
                nn_input=nn_input_space,
                actions=action_space,
                logits=FloatBox(shape=(1,), add_batch_rank=True),
                probabilities=FloatBox(add_batch_rank=True),
                parameters=action_space_parameters,
            ),
            action_space=action_space
        )

        policy_params = test.read_variable_values(policy.variables)

        # Some NN inputs.
        nn_input = nn_input_space.sample(size=3)
        # Raw NN-output.
        expected_nn_output = np.matmul(nn_input, policy_params["policy/test-network/hidden-layer/dense/kernel"])
        test.test(("get_nn_output", nn_input), expected_outputs=dict(output=expected_nn_output))

        # Raw action layer output.
        expected_raw_logits = np.matmul(
            expected_nn_output, policy_params["policy/action-adapter-0/action-network/action-layer/dense/kernel"]
        )
        test.test(("get_action_layer_output", nn_input), expected_outputs=dict(output=expected_raw_logits),
                  decimals=5)

        # Parameter (alpha/betas).
        expected_parameters_output = np.log(np.exp(expected_raw_logits) + 1.0) + 1.0
        test.test(("get_logits_parameters_log_probs", nn_input, ["logits", "parameters"]), expected_outputs=dict(
            logits=expected_raw_logits, parameters=expected_parameters_output
        ), decimals=5)
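        # The expression above is softplus(logits) + 1.0, which keeps both Beta
        # concentration parameters (alpha, beta) strictly above 1, so the Beta
        # density vanishes at the action-space boundaries. A numerically
        # stabler equivalent (sketch, illustration only):
        np.testing.assert_almost_equal(
            np.logaddexp(0.0, expected_raw_logits) + 1.0,
            expected_parameters_output,
            decimal=5)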

        print("Params: {}".format(expected_parameters_output))

        actions = test.test(("get_action", nn_input))["action"]
        self.assertTrue(actions.dtype == np.float32)
        self.assertGreaterEqual(actions.min(), -1.0)
        self.assertLessEqual(actions.max(), 1.0)
        self.assertTrue(actions.shape == (3, 1))

        # Action log-probs.
        actions_scaled_back = (actions + 1.0) / 2.0
        expected_action_log_prob_output = np.log(
            beta.pdf(actions_scaled_back, expected_parameters_output[:, 1],
                     expected_parameters_output[:, 0]))
        expected_action_log_prob_output = np.array(
            [[expected_action_log_prob_output[0][0]],
             [expected_action_log_prob_output[1][1]],
             [expected_action_log_prob_output[2][2]]])
        test.test(("get_action_log_probs", [nn_input, actions]),
                  expected_outputs=dict(action_log_probs=expected_action_log_prob_output,
                                        logits=expected_raw_logits), decimals=5)
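        # Above, actions in [-1, 1] are mapped back to the Beta's native [0, 1]
        # support via (a + 1) / 2; broadcasting the (3, 1) action column
        # against the (3,) parameter vectors yields a (3, 3) matrix, of which
        # only the diagonal pairs each action with its own (alpha, beta).
        # Equivalent per-sample sketch (illustration only):
        per_sample_log_probs = np.log(
            beta.pdf(actions_scaled_back[:, 0],
                     expected_parameters_output[:, 1],
                     expected_parameters_output[:, 0]))[:, np.newaxis]
        np.testing.assert_almost_equal(per_sample_log_probs,
                                       expected_action_log_prob_output,
                                       decimal=5)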

        # Stochastic sample.
        actions = test.test(("get_stochastic_action", nn_input))["action"]
        self.assertTrue(actions.dtype == np.float32)
        self.assertGreaterEqual(actions.min(), -1.0)
        self.assertLessEqual(actions.max(), 1.0)
        self.assertTrue(actions.shape == (3, 1))

        # Deterministic sample.
        actions = test.test(("get_deterministic_action", nn_input))["action"]
        self.assertTrue(actions.dtype == np.float32)
        self.assertGreaterEqual(actions.min(), -1.0)
        self.assertLessEqual(actions.max(), 1.0)
        self.assertTrue(actions.shape == (3, 1))

        # Distribution's entropy.
        entropy = test.test(("get_entropy", nn_input))["entropy"]
        self.assertTrue(entropy.dtype == np.float32)
        self.assertTrue(entropy.shape == (3, 1))
    def test_policy_for_discrete_action_space_with_dueling_layer(self):
        np.random.seed(10)
        # state_space (NN is a simple single fc-layer relu network (2 units), random biases, random weights).
        nn_input_space = FloatBox(shape=(3,), add_batch_rank=True)

        # action_space (2 possible actions).
        action_space = IntBox(2, add_batch_rank=True)
        flat_float_action_space = FloatBox(shape=(2,), add_batch_rank=True)

        # Policy with dueling logic.
        policy = DuelingPolicy(
            network_spec=config_from_path("configs/test_lrelu_nn.json"),
            action_adapter_spec=dict(
                pre_network_spec=[
                    dict(type="dense", units=10, activation="lrelu", activation_params=[0.1])
                ]
            ),
            units_state_value_stream=10,
            action_space=action_space
        )
        test = ComponentTest(
            component=policy,
            input_spaces=dict(
                nn_input=nn_input_space,
                actions=action_space,
                probabilities=flat_float_action_space,
                parameters=flat_float_action_space,
                logits=flat_float_action_space
            ),
            action_space=action_space
        )
        policy_params = test.read_variable_values(policy.variables)

        # Some NN inputs.
        nn_input = nn_input_space.sample(size=3)
        # Raw NN-output.
        expected_nn_output = relu(np.matmul(
            nn_input, policy_params["dueling-policy/test-network/hidden-layer/dense/kernel"]), 0.1
        )
        test.test(("get_nn_output", nn_input), expected_outputs=dict(output=expected_nn_output))

        # Raw action layer output.
        expected_raw_advantages = np.matmul(relu(np.matmul(
            expected_nn_output, policy_params["dueling-policy/action-adapter-0/action-network/dense-layer/dense/kernel"]
        ), 0.1), policy_params["dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel"])
        test.test(("get_action_layer_output", nn_input), expected_outputs=dict(output=expected_raw_advantages),
                  decimals=5)

        # Single state values.
        expected_state_values = np.matmul(relu(np.matmul(
            expected_nn_output,
            policy_params["dueling-policy/dense-layer-state-value-stream/dense/kernel"]
        )), policy_params["dueling-policy/state-value-node/dense/kernel"])
        test.test(("get_state_values", nn_input), expected_outputs=dict(state_values=expected_state_values),
                  decimals=5)

        # Q-values (dueling aggregation): one per action category per item in
        # the batch.
        expected_q_values_output = expected_state_values + expected_raw_advantages - \
            np.mean(expected_raw_advantages, axis=-1, keepdims=True)
        test.test(("get_logits_probabilities_log_probs", nn_input, "logits"), expected_outputs=dict(
            logits=expected_q_values_output
        ), decimals=5)
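        # Dueling aggregation above: Q(s, a) = V(s) + A(s, a) - mean_a A(s, a).
        # Subtracting the mean advantage makes the V/A decomposition
        # identifiable: per state, the aggregated advantages average to zero.
        # Quick numpy check (sketch):
        np.testing.assert_almost_equal(
            np.mean(expected_q_values_output - expected_state_values, axis=-1),
            np.zeros(3),
            decimal=5)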

        # Parameter (probabilities). Softmaxed q_values.
        expected_probabilities_output = softmax(expected_q_values_output, axis=-1)
        test.test(("get_logits_probabilities_log_probs", nn_input, ["logits", "probabilities"]), expected_outputs=dict(
            logits=expected_q_values_output, probabilities=expected_probabilities_output
        ), decimals=5)

        print("Probs: {}".format(expected_probabilities_output))

        expected_actions = np.argmax(expected_q_values_output, axis=-1)
        test.test(("get_action", nn_input), expected_outputs=dict(action=expected_actions))

        # Action log-probs.
        expected_action_log_prob_output = np.log(np.array([
            expected_probabilities_output[0][expected_actions[0]],
            expected_probabilities_output[1][expected_actions[1]],
            expected_probabilities_output[2][expected_actions[2]],
        ]))
        test.test(("get_action_log_probs", [nn_input, expected_actions]),
                  expected_outputs=dict(action_log_probs=expected_action_log_prob_output, logits=expected_q_values_output), decimals=5)

        # Stochastic sample.
        out = test.test(("get_stochastic_action", nn_input), expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.int32)
        self.assertTrue(out["action"].shape == (3,))

        # Deterministic sample.
        out = test.test(("get_deterministic_action", nn_input), expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.int32)
        self.assertTrue(out["action"].shape == (3,))

        # Distribution's entropy.
        out = test.test(("get_entropy", nn_input), expected_outputs=None)
        self.assertTrue(out["entropy"].dtype == np.float32)
        self.assertTrue(out["entropy"].shape == (3,))
    def test_shared_value_function_policy_for_discrete_container_action_space_with_time_rank_folding(
            self):
        # state_space (NN is a simple single fc-layer relu network (3 units), random biases, random weights).
        state_space = FloatBox(shape=(6, ),
                               add_batch_rank=True,
                               add_time_rank=True)

        # Action_space.
        action_space = Tuple(IntBox(2),
                             IntBox(3),
                             Dict(a=IntBox(4), ),
                             add_batch_rank=True,
                             add_time_rank=True)
        #flat_float_action_space = Tuple(
        #    FloatBox(shape=(2,)),
        #    FloatBox(shape=(3,)),
        #    Dict(
        #        a=FloatBox(shape=(4,)),
        #    ),
        #    add_batch_rank=True,
        #    add_time_rank=True
        #)

        # Policy with baseline action adapter AND batch-apply over the entire policy (NN + ActionAdapter + distr.).
        network_spec = config_from_path("configs/test_lrelu_nn.json")
        network_spec["fold_time_rank"] = True
        network_spec["unfold_time_rank"] = True
        shared_value_function_policy = SharedValueFunctionPolicy(
            network_spec=network_spec,
            action_adapter_spec=dict(fold_time_rank=True,
                                     unfold_time_rank=True),
            action_space=action_space,
            value_fold_time_rank=True,
            value_unfold_time_rank=True)
        test = ComponentTest(
            component=shared_value_function_policy,
            input_spaces=dict(
                nn_inputs=state_space,
                actions=action_space,
            ),
            action_space=action_space,
        )
        policy_params = test.read_variable_values(
            shared_value_function_policy.variable_registry)
        base_scope = "shared-value-function-policy/action-adapter-"

        # Some NN inputs.
        states = state_space.sample(size=(2, 3))
        states_folded = np.reshape(states, newshape=(6, 6))
        # Raw NN-output (still folded).
        expected_nn_output = np.reshape(relu(
            np.matmul(
                states_folded, policy_params[
                    "shared-value-function-policy/test-network/hidden-layer/dense/kernel"]
            ), 0.1),
                                        newshape=(2, 3, 3))
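        # Time-rank folding: the (batch=2, time=3, 6) input is flattened to
        # (6, 6) for the dense layer and the result is unfolded back to
        # (2, 3, 3). The fold/unfold round trip is lossless (sketch):
        np.testing.assert_almost_equal(
            np.reshape(states_folded, newshape=(2, 3, 6)), states)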
        test.test(("get_nn_outputs", states),
                  expected_outputs=expected_nn_output,
                  decimals=5)

        # Raw action-layer outputs; expected shapes after unfolding:
        # (2, 3, 2), (2, 3, 3) and (2, 3, 4) = (batch, time, action categories).
        expected_action_layer_output = tuple([
            np.matmul(
                expected_nn_output,
                policy_params[base_scope +
                              "0/action-network/action-layer/dense/kernel"]),
            np.matmul(
                expected_nn_output,
                policy_params[base_scope +
                              "1/action-network/action-layer/dense/kernel"]),
            dict(a=np.matmul(
                expected_nn_output,
                policy_params[base_scope +
                              "2/action-network/action-layer/dense/kernel"]))
        ])
        expected_action_layer_output_unfolded = tuple([
            np.reshape(expected_action_layer_output[0], newshape=(2, 3, 2)),
            np.reshape(expected_action_layer_output[1], newshape=(2, 3, 3)),
            dict(a=np.reshape(expected_action_layer_output[2]["a"],
                              newshape=(2, 3, 4)))
        ])
        test.test(("get_adapter_outputs", states),
                  expected_outputs=dict(
                      adapter_outputs=expected_action_layer_output_unfolded,
                      nn_outputs=expected_nn_output),
                  decimals=5)

        # State-values: One for each item in the batch.
        expected_state_value_output = np.matmul(
            expected_nn_output, policy_params[
                "shared-value-function-policy/value-function-node/dense-layer/dense/kernel"]
        )
        expected_state_value_output_unfolded = np.reshape(
            expected_state_value_output, newshape=(2, 3, 1))
        test.test(("get_state_values", states, ["state_values"]),
                  expected_outputs=dict(
                      state_values=expected_state_value_output_unfolded),
                  decimals=5)

        test.test(("get_state_values_adapter_outputs_and_parameters", states,
                   ["state_values", "adapter_outputs"]),
                  expected_outputs=dict(
                      state_values=expected_state_value_output_unfolded,
                      adapter_outputs=expected_action_layer_output_unfolded),
                  decimals=5)

        # Parameters: for discrete sub-actions these are the adapter outputs
        # (logits) themselves; the softmaxed versions computed here are used
        # for the log-likelihood checks further below.
        expected_probs_output = tuple([
            softmax(expected_action_layer_output_unfolded[0], axis=-1),
            softmax(expected_action_layer_output_unfolded[1], axis=-1),
            dict(a=softmax(expected_action_layer_output_unfolded[2]["a"],
                           axis=-1))
        ])
        test.test(("get_adapter_outputs_and_parameters", states,
                   ["adapter_outputs", "parameters"]),
                  expected_outputs=dict(
                      adapter_outputs=expected_action_layer_output_unfolded,
                      parameters=expected_action_layer_output_unfolded),
                  decimals=5)

        print("Probs: {}".format(expected_probs_output))

        expected_actions = tuple([
            np.argmax(expected_action_layer_output_unfolded[0], axis=-1),
            np.argmax(expected_action_layer_output_unfolded[1], axis=-1),
            dict(a=np.argmax(expected_action_layer_output_unfolded[2]["a"],
                             axis=-1), )
        ])
        test.test(("get_action", states, ["action"]),
                  expected_outputs=dict(action=expected_actions))

        out = test.test(("get_action_and_log_likelihood", states))
        action = out["action"]
        llh = out["log_likelihood"]

        # Action log-likelihood.
        expected_action_llh_output = np.log(
            np.array([
                [
                    expected_probs_output[0][0][0][action[0][0][0]],
                    expected_probs_output[0][0][1][action[0][0][1]],
                    expected_probs_output[0][0][2][action[0][0][2]],
                ],
                [
                    expected_probs_output[0][1][0][action[0][1][0]],
                    expected_probs_output[0][1][1][action[0][1][1]],
                    expected_probs_output[0][1][2][action[0][1][2]],
                ]
            ])) + np.log(
                np.array([[
                    expected_probs_output[1][0][0][action[1][0][0]],
                    expected_probs_output[1][0][1][action[1][0][1]],
                    expected_probs_output[1][0][2][action[1][0][2]],
                ],
                          [
                              expected_probs_output[1][1][0][action[1][1][0]],
                              expected_probs_output[1][1][1][action[1][1][1]],
                              expected_probs_output[1][1][2][action[1][1][2]],
                          ]])
            ) + np.log(
                np.array([[
                    expected_probs_output[2]["a"][0][0][action[2]["a"][0][0]],
                    expected_probs_output[2]["a"][0][1][action[2]["a"][0][1]],
                    expected_probs_output[2]["a"][0][2][action[2]["a"][0][2]],
                ],
                          [
                              expected_probs_output[2]["a"][1][0][action[2]
                                                                  ["a"][1][0]],
                              expected_probs_output[2]["a"][1][1][action[2]
                                                                  ["a"][1][1]],
                              expected_probs_output[2]["a"][1][2][action[2]
                                                                  ["a"][1][2]],
                          ]]))
        test.test(("get_log_likelihood", [states, action]),
                  expected_outputs=dict(
                      log_likelihood=expected_action_llh_output,
                      adapter_outputs=expected_action_layer_output_unfolded),
                  decimals=5)
        recursive_assert_almost_equal(expected_action_llh_output,
                                      llh,
                                      decimals=5)
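        # The joint log-likelihood of a container action is the sum of the
        # per-component log-probs. The nested indexing above can also be
        # written with np.take_along_axis (sketch, illustration only):
        llh_check = sum(
            np.log(np.take_along_axis(p, a[..., np.newaxis], axis=-1))[..., 0]
            for p, a in [(expected_probs_output[0], action[0]),
                         (expected_probs_output[1], action[1]),
                         (expected_probs_output[2]["a"], action[2]["a"])])
        recursive_assert_almost_equal(llh_check, expected_action_llh_output,
                                      decimals=5)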

        # Deterministic sample.
        out = test.test(("get_deterministic_action", states),
                        expected_outputs=None)
        self.assertTrue(out["action"][0].dtype == np.int32)
        self.assertTrue(
            out["action"][0].shape == (2, 3))  # Make sure output is unfolded.
        self.assertTrue(out["action"][1].dtype == np.int32)
        self.assertTrue(
            out["action"][1].shape == (2, 3))  # Make sure output is unfolded.
        self.assertTrue(out["action"][2]["a"].dtype == np.int32)
        self.assertTrue(out["action"][2]["a"].shape == (
            2, 3))  # Make sure output is unfolded.

        # Stochastic sample.
        out = test.test(("get_stochastic_action", states),
                        expected_outputs=None)
        self.assertTrue(out["action"][0].dtype == np.int32)
        self.assertTrue(
            out["action"][0].shape == (2, 3))  # Make sure output is unfolded.
        self.assertTrue(out["action"][1].dtype == np.int32)
        self.assertTrue(
            out["action"][1].shape == (2, 3))  # Make sure output is unfolded.
        self.assertTrue(out["action"][2]["a"].dtype == np.int32)
        self.assertTrue(out["action"][2]["a"].shape == (
            2, 3))  # Make sure output is unfolded.

        # Distribution's entropy.
        out = test.test(("get_entropy", states), expected_outputs=None)
        self.assertTrue(out["entropy"][0].dtype == np.float32)
        self.assertTrue(
            out["entropy"][0].shape == (2, 3))  # Make sure output is unfolded.
        self.assertTrue(out["entropy"][1].dtype == np.float32)
        self.assertTrue(
            out["entropy"][1].shape == (2, 3))  # Make sure output is unfolded.
        self.assertTrue(out["entropy"][2]["a"].dtype == np.float32)
        self.assertTrue(out["entropy"][2]["a"].shape == (
            2, 3))  # Make sure output is unfolded.
    def test_environment_stepper_on_deterministic_env_with_returning_action_probs(
            self):
        preprocessor_spec = [dict(type="divide", divisor=2)]
        network_spec = config_from_path("configs/test_simple_nn.json")
        exploration_spec = None
        actor_component = ActorComponent(
            preprocessor_spec,
            dict(network_spec=network_spec,
                 action_space=self.deterministic_env_action_space),
            exploration_spec)
        environment_stepper = EnvironmentStepper(
            environment_spec=dict(type="deterministic_env",
                                  steps_to_terminal=6),
            actor_component_spec=actor_component,
            state_space=self.deterministic_env_state_space,
            reward_space="float32",
            add_action_probs=True,
            action_probs_space=self.deterministic_action_probs_space,
            num_steps=3)

        test = ComponentTest(
            component=environment_stepper,
            action_space=self.deterministic_env_action_space,
        )

        weights = test.read_variable_values(
            environment_stepper.actor_component.policy.variables)
        policy_scope = "environment-stepper/actor-component/policy/"
        weights_hid = weights[policy_scope +
                              "test-network/hidden-layer/dense/kernel"]
        biases_hid = weights[policy_scope +
                             "test-network/hidden-layer/dense/bias"]
        weights_action = weights[
            policy_scope +
            "action-adapter-0/action-network/action-layer/dense/kernel"]
        biases_action = weights[
            policy_scope +
            "action-adapter-0/action-network/action-layer/dense/bias"]

        # Step 3 times through the Env and collect results.
        expected = (
            # t_
            np.array([False, False, False]),
            # s' (raw)
            np.array([[0.0], [1.0], [2.0], [3.0]]),
            # action probs
            np.array([
                softmax(
                    dense_layer(
                        dense_layer(np.array([0.0]), weights_hid, biases_hid),
                        weights_action, biases_action)),
                softmax(
                    dense_layer(
                        dense_layer(np.array([0.5]), weights_hid, biases_hid),
                        weights_action, biases_action)),
                softmax(
                    dense_layer(
                        dense_layer(np.array([1.0]), weights_hid, biases_hid),
                        weights_action, biases_action))
            ]))
        test.test("step", expected_outputs=expected, decimals=3)

        # Step again, check whether stitching of states/etc.. works.
        expected = (
            np.array([False, False, True]),
            np.array([[3.0], [4.0], [5.0], [0.0]]),  # s' (raw)
            np.array([
                softmax(
                    dense_layer(
                        dense_layer(np.array([1.5]), weights_hid, biases_hid),
                        weights_action, biases_action)),
                softmax(
                    dense_layer(
                        dense_layer(np.array([2.0]), weights_hid, biases_hid),
                        weights_action, biases_action)),
                softmax(
                    dense_layer(
                        dense_layer(np.array([2.5]), weights_hid, biases_hid),
                        weights_action, biases_action))
            ]))
        test.test("step", expected_outputs=expected, decimals=3)

        # Make sure we close the session (to shut down the Env on the server).
        test.terminate()
    def test_policy_for_discrete_action_space_with_dueling_layer(self):
        # state_space (NN is a simple single fc-layer relu network (2 units), random biases, random weights).
        nn_input_space = FloatBox(shape=(5, ), add_batch_rank=True)

        # Action space.
        action_space = Dict(dict(a=Tuple(IntBox(2), IntBox(3)),
                                 b=Dict(dict(ba=IntBox(4)))),
                            add_batch_rank=True)
        #flat_float_action_space = Dict(dict(
        #    a=Tuple(FloatBox(shape=(2,)), FloatBox(shape=(3,))),
        #    b=Dict(dict(ba=FloatBox(shape=(4,))))
        #), add_batch_rank=True)

        # Policy with dueling logic.
        policy = DuelingPolicy(
            network_spec=config_from_path("configs/test_lrelu_nn.json"),
            # Make all sub action adapters the same.
            action_adapter_spec=dict(pre_network_spec=[
                dict(type="dense",
                     units=5,
                     activation="lrelu",
                     activation_params=[0.2])
            ]),
            units_state_value_stream=2,
            action_space=action_space)
        test = ComponentTest(
            component=policy,
            input_spaces=dict(
                nn_inputs=nn_input_space,
                actions=action_space,
                #logits=flat_float_action_space,
                #parameters=flat_float_action_space
            ),
            action_space=action_space)
        policy_params = test.read_variable_values(policy.variable_registry)

        # Some NN inputs.
        nn_input = nn_input_space.sample(size=3)
        # Raw NN-output.
        expected_nn_output = relu(
            np.matmul(
                nn_input, policy_params[
                    "dueling-policy/test-network/hidden-layer/dense/kernel"]),
            0.2)
        test.test(("get_nn_outputs", nn_input),
                  expected_outputs=expected_nn_output,
                  decimals=5)

        # Raw action layer output.
        expected_raw_advantages = dict(
            a=(
                np.matmul(
                    relu(
                        np.matmul(
                            expected_nn_output, policy_params[
                                "dueling-policy/action-adapter-0/action-network/dense-layer/dense/kernel"]
                        ), 0.2),
                    policy_params[
                        "dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel"]
                ),
                np.matmul(
                    relu(
                        np.matmul(
                            expected_nn_output, policy_params[
                                "dueling-policy/action-adapter-1/action-network/dense-layer/dense/kernel"]
                        ), 0.2),
                    policy_params[
                        "dueling-policy/action-adapter-1/action-network/action-layer/dense/kernel"]
                ),
            ),
            b=dict(ba=np.matmul(
                relu(
                    np.matmul(
                        expected_nn_output, policy_params[
                            "dueling-policy/action-adapter-2/action-network/dense-layer/dense/kernel"]
                    ), 0.2),
                policy_params[
                    "dueling-policy/action-adapter-2/action-network/action-layer/dense/kernel"]
            )))

        # Single state values.
        expected_state_values = np.matmul(
            relu(
                np.matmul(
                    expected_nn_output, policy_params[
                        "dueling-policy/dense-layer-state-value-stream/dense/kernel"]
                )),
            policy_params["dueling-policy/state-value-node/dense/kernel"])
        test.test(("get_state_values", nn_input, ["state_values"]),
                  expected_outputs=dict(state_values=expected_state_values),
                  decimals=5)

        # Q-values via dueling aggregation, per sub-action:
        # Q = V + A - mean(A).
        expected_q_values_output = dict(
            a=(
                expected_state_values + expected_raw_advantages["a"][0] -
                np.mean(
                    expected_raw_advantages["a"][0], axis=-1, keepdims=True),
                expected_state_values + expected_raw_advantages["a"][1] -
                np.mean(
                    expected_raw_advantages["a"][1], axis=-1, keepdims=True),
            ),
            b=dict(
                ba=expected_state_values + expected_raw_advantages["b"]["ba"] -
                np.mean(
                    expected_raw_advantages["b"]["ba"], axis=-1, keepdims=True)
            ))
        test.test(
            ("get_adapter_outputs", nn_input),
            expected_outputs=dict(adapter_outputs=expected_q_values_output,
                                  nn_outputs=expected_nn_output,
                                  advantages=expected_raw_advantages,
                                  q_values=expected_q_values_output),
            decimals=5)

        test.test(
            ("get_adapter_outputs_and_parameters", nn_input,
             ["adapter_outputs"]),
            expected_outputs=dict(adapter_outputs=expected_q_values_output),
            decimals=5)

        # Parameters are the q-values themselves; the softmaxed (and clamped)
        # q-values are used for the log-probs and log-likelihood checks below.
        expected_probs_output = dict(
            a=(softmax(expected_q_values_output["a"][0], axis=-1),
               softmax(expected_q_values_output["a"][1], axis=-1)),
            b=dict(ba=np.maximum(
                softmax(expected_q_values_output["b"]["ba"], axis=-1),
                SMALL_NUMBER)))
        expected_log_probs_output = dict(
            a=(np.log(expected_probs_output["a"][0]),
               np.log(expected_probs_output["a"][1])),
            b=dict(ba=np.log(expected_probs_output["b"]["ba"])))
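        # Clamping the softmaxed q-values at SMALL_NUMBER before taking the log
        # keeps expected_log_probs_output finite even if a probability
        # underflows to 0.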
        test.test(
            ("get_adapter_outputs_and_parameters", nn_input,
             ["adapter_outputs", "parameters", "log_probs"]),
            expected_outputs=dict(adapter_outputs=expected_q_values_output,
                                  parameters=expected_q_values_output,
                                  log_probs=expected_log_probs_output),
            decimals=5)

        print("Probs: {}".format(expected_probs_output))

        expected_actions = dict(
            a=(np.argmax(expected_q_values_output["a"][0], axis=-1),
               np.argmax(expected_q_values_output["a"][1], axis=-1)),
            b=dict(ba=np.argmax(expected_q_values_output["b"]["ba"], axis=-1)))
        test.test(("get_action", nn_input, ["action"]),
                  expected_outputs=dict(action=expected_actions))

        out = test.test(("get_action_and_log_likelihood", nn_input))
        action = out["action"]
        llh = out["log_likelihood"]

        # Action log-likelihood.
        expected_action_llh_output = np.array([
            expected_log_probs_output["a"][0][0][action["a"][0][0]],
            expected_log_probs_output["a"][0][1][action["a"][0][1]],
            expected_log_probs_output["a"][0][2][action["a"][0][2]],
        ]) + np.array([
            expected_log_probs_output["a"][1][0][action["a"][1][0]],
            expected_log_probs_output["a"][1][1][action["a"][1][1]],
            expected_log_probs_output["a"][1][2][action["a"][1][2]],
        ]) + np.array([
            expected_log_probs_output["b"]["ba"][0][action["b"]["ba"][0]],
            expected_log_probs_output["b"]["ba"][1][action["b"]["ba"][1]],
            expected_log_probs_output["b"]["ba"][2][action["b"]["ba"][2]],
        ])
        test.test(
            ("get_log_likelihood", [nn_input, action]),
            expected_outputs=dict(log_likelihood=expected_action_llh_output,
                                  adapter_outputs=expected_q_values_output),
            decimals=5)
        recursive_assert_almost_equal(expected_action_llh_output,
                                      llh,
                                      decimals=5)

        # Stochastic sample.
        out = test.test(("get_stochastic_action", nn_input),
                        expected_outputs=None)
        self.assertTrue(out["action"]["a"][0].dtype == np.int32)
        self.assertTrue(out["action"]["a"][0].shape == (3, ))
        self.assertTrue(out["action"]["a"][1].dtype == np.int32)
        self.assertTrue(out["action"]["a"][1].shape == (3, ))
        self.assertTrue(out["action"]["b"]["ba"].dtype == np.int32)
        self.assertTrue(out["action"]["b"]["ba"].shape == (3, ))

        # Deterministic sample.
        out = test.test(("get_deterministic_action", nn_input),
                        expected_outputs=None)
        self.assertTrue(out["action"]["a"][0].dtype == np.int32)
        self.assertTrue(out["action"]["a"][0].shape == (3, ))
        self.assertTrue(out["action"]["a"][1].dtype == np.int32)
        self.assertTrue(out["action"]["a"][1].shape == (3, ))
        self.assertTrue(out["action"]["b"]["ba"].dtype == np.int32)
        self.assertTrue(out["action"]["b"]["ba"].shape == (3, ))

        # Distribution's entropy.
        out = test.test(("get_entropy", nn_input), expected_outputs=None)
        self.assertTrue(out["entropy"]["a"][0].dtype == np.float32)
        self.assertTrue(out["entropy"]["a"][0].shape == (3, ))
        self.assertTrue(out["entropy"]["a"][1].dtype == np.float32)
        self.assertTrue(out["entropy"]["a"][1].shape == (3, ))
        self.assertTrue(out["entropy"]["b"]["ba"].dtype == np.float32)
        self.assertTrue(out["entropy"]["b"]["ba"].shape == (3, ))
    def test_environment_stepper_on_deterministic_env_with_action_probs_lstm(
            self):
        internal_states_space = Tuple(FloatBox(shape=(3, )),
                                      FloatBox(shape=(3, )))
        preprocessor_spec = [dict(type="multiply", factor=0.1)]
        network_spec = config_from_path("configs/test_lstm_nn.json")
        exploration_spec = None
        actor_component = ActorComponent(
            preprocessor_spec,
            dict(network_spec=network_spec,
                 action_space=self.deterministic_env_action_space),
            exploration_spec)
        environment_stepper = EnvironmentStepper(
            environment_spec=dict(type="deterministic_env",
                                  steps_to_terminal=3),
            actor_component_spec=actor_component,
            state_space=self.deterministic_env_state_space,
            reward_space="float32",
            internal_states_space=internal_states_space,
            add_action_probs=True,
            action_probs_space=self.deterministic_action_probs_space,
            num_steps=4,
        )

        test = ComponentTest(
            component=environment_stepper,
            action_space=self.deterministic_env_action_space,
        )

        weights = test.read_variable_values(
            environment_stepper.actor_component.policy.variables)
        policy_scope = "environment-stepper/actor-component/policy/"
        weights_lstm = weights[policy_scope +
                               "test-lstm-network/lstm-layer/lstm-cell/kernel"]
        biases_lstm = weights[policy_scope +
                              "test-lstm-network/lstm-layer/lstm-cell/bias"]
        weights_action = weights[
            policy_scope +
            "action-adapter-0/action-network/action-layer/dense/kernel"]
        biases_action = weights[
            policy_scope +
            "action-adapter-0/action-network/action-layer/dense/bias"]

        # Step 4 times through the Env and collect results.
        lstm_1 = lstm_layer(np.array([[[0.0]]]), weights_lstm, biases_lstm)
        lstm_2 = lstm_layer(np.array([[[0.1]]]), weights_lstm, biases_lstm,
                            lstm_1[1])
        lstm_3 = lstm_layer(np.array([[[0.2]]]), weights_lstm, biases_lstm,
                            lstm_2[1])
        lstm_4 = lstm_layer(np.array([[[0.0]]]), weights_lstm, biases_lstm,
                            lstm_3[1])
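        # The inputs 0.0, 0.1, 0.2, 0.0 are the raw env states (0, 1, 2, then 0
        # again after the terminal reset) scaled by the "multiply by 0.1"
        # preprocessor; each lstm_layer call receives the (c, h) state returned
        # by the previous step.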
        expected = (
            np.array([False, False, True, False]),
            np.array([[0.0], [1.0], [2.0], [0.0], [1.0]]),  # s' (raw)
            np.array([
                softmax(
                    dense_layer(np.squeeze(lstm_1[0]), weights_action,
                                biases_action)),
                softmax(
                    dense_layer(np.squeeze(lstm_2[0]), weights_action,
                                biases_action)),
                softmax(
                    dense_layer(np.squeeze(lstm_3[0]), weights_action,
                                biases_action)),
                softmax(
                    dense_layer(np.squeeze(lstm_4[0]), weights_action,
                                biases_action)),
            ]),  # action probs
            # internal states
            (np.squeeze(
                np.array([[[0.0, 0.0, 0.0]], lstm_1[1][0], lstm_2[1][0],
                          lstm_3[1][0], lstm_4[1][0]])),
             np.squeeze(
                 np.array([[[0.0, 0.0, 0.0]], lstm_1[1][1], lstm_2[1][1],
                           lstm_3[1][1], lstm_4[1][1]]))))
        test.test("step", expected_outputs=expected)

        # Make sure we close the session (to shut down the Env on the server).
        test.terminate()
    def test_double_dqn_on_2x2_grid_world_with_container_actions(self):
        """
        Creates a double-DQN agent and runs it via a SingleThreadedWorker on a
        simple 2x2 GridWorld using container actions.
        """
        # ftj = forward + turn + jump
        env_spec = dict(world="2x2",
                        action_type="ftj",
                        state_representation="xy+orientation")
        dummy_env = GridWorld.from_spec(env_spec)
        agent_config = config_from_path(
            "configs/dqn_agent_for_2x2_gridworld_with_container_actions.json")
        preprocessing_spec = agent_config.pop("preprocessing_spec")

        agent = DQNAgent.from_spec(agent_config,
                                   double_q=True,
                                   dueling_q=False,
                                   state_space=FloatBox(shape=(4, )),
                                   action_space=dummy_env.action_space,
                                   execution_spec=dict(seed=15),
                                   store_last_q_table=True)
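        # "store_last_q_table=True" makes the agent keep its most recent
        # q-values so they can be inspected via `agent.last_q_table` below.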

        time_steps = 10000
        worker = SingleThreadedWorker(
            env_spec=lambda: GridWorld.from_spec(env_spec),
            agent=agent,
            preprocessing_spec=preprocessing_spec,
            worker_executes_preprocessing=True,
            render=False)
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        print("LAST q-table:\n{}".format(agent.last_q_table))

        self.assertEqual(results["timesteps_executed"], time_steps)
        self.assertEqual(results["env_frames"], time_steps)
        self.assertGreaterEqual(results["mean_episode_reward"], -2.0)
        self.assertGreaterEqual(results["max_episode_reward"], -1.0)
        self.assertLessEqual(results["episodes_executed"], time_steps / 3)

        # Check q-table for correct values.
        expected_q_values_per_state = {
            (0., 0., -1., 0.): {
                "forward": (-5.0, -1.0, -1.0),
                "jump": (0.0, -1.0)
            },
            (0., 0., 1., 0.): {
                "forward": (0.0, -1.0, -1.0),
                "jump": (0.0, -1.0)
            },
            (0., 0., 0., -1.): {
                "forward": (0.0, -1.0, -1.0),
                "jump": (0.0, -1.0)
            },
            (0., 0., 0., 1.): {
                "forward": (0.0, -1.0, -1.0),
                "jump": (0.0, -1.0)
            },
            (0., 1., -1., 0.): {
                "forward": (0.0, -1.0, -1.0),
                "jump": (0.0, -1.0)
            },
            (0., 1., 1., 0.): {
                "forward": (0.0, -1.0, -1.0),
                "jump": (0.0, -1.0)
            },
            (0., 1., 0., -1.): {
                "forward": (0.0, -1.0, -1.0),
                "jump": (0.0, -1.0)
            },
            (0., 1., 0., 1.): {
                "forward": (0.0, -1.0, -1.0),
                "jump": (0.0, -1.0)
            },
        }
        for state, q_values_forward, q_values_jump in zip(
                agent.last_q_table["states"],
                agent.last_q_table["q_values"]["forward"],
                agent.last_q_table["q_values"]["jump"]):
            state, q_values_forward, q_values_jump = tuple(state), tuple(
                q_values_forward), tuple(q_values_jump)
            assert state in expected_q_values_per_state, \
                "ERROR: state '{}' not expected in q-table as it's a terminal state!".format(state)
            recursive_assert_almost_equal(
                q_values_forward,
                expected_q_values_per_state[state]["forward"],
                decimals=0)
            recursive_assert_almost_equal(
                q_values_jump,
                expected_q_values_per_state[state]["jump"],
                decimals=0)
    def test_shared_value_function_policy_for_discrete_action_space(self):
        # state_space (NN is a simple single fc-layer relu network (3 units), random biases, random weights).
        state_space = FloatBox(shape=(4,), add_batch_rank=True)

        # action_space (3 possible actions).
        action_space = IntBox(3, add_batch_rank=True)
        flat_float_action_space = FloatBox(shape=(3,), add_batch_rank=True)

        # Policy with baseline action adapter.
        shared_value_function_policy = SharedValueFunctionPolicy(
            network_spec=config_from_path("configs/test_lrelu_nn.json"),
            action_space=action_space
        )
        test = ComponentTest(
            component=shared_value_function_policy,
            input_spaces=dict(
                nn_input=state_space,
                actions=action_space,
                probabilities=flat_float_action_space,
                parameters=flat_float_action_space,
                logits=flat_float_action_space
            ),
            action_space=action_space,
        )
        policy_params = test.read_variable_values(shared_value_function_policy.variables)

        # Some NN inputs (4 input nodes, batch size=3).
        states = state_space.sample(size=3)
        # Raw NN-output (3 hidden nodes).
        expected_nn_output = relu(np.matmul(
            states, policy_params["shared-value-function-policy/test-network/hidden-layer/dense/kernel"]
        ), 0.1)
        test.test(("get_nn_output", states), expected_outputs=dict(output=expected_nn_output), decimals=5)

        # Raw action-layer output; expected shape=(3, 3): 3=batch, 3=action categories.
        expected_action_layer_output = np.matmul(
            expected_nn_output,
            policy_params["shared-value-function-policy/action-adapter-0/action-network/action-layer/dense/kernel"]
        )
        test.test(("get_action_layer_output", states), expected_outputs=dict(output=expected_action_layer_output),
                  decimals=5)

        # State-values: One for each item in the batch.
        expected_state_value_output = np.matmul(
            expected_nn_output,
            policy_params["shared-value-function-policy/value-function-node/dense-layer/dense/kernel"]
        )
        test.test(("get_state_values", states), expected_outputs=dict(state_values=expected_state_value_output),
                  decimals=5)

        # Logits-values.
        test.test(("get_state_values_logits_probabilities_log_probs", states, ["state_values", "logits"]),
                  expected_outputs=dict(state_values=expected_state_value_output, logits=expected_action_layer_output),
                  decimals=5)

        # Parameter (probabilities). Softmaxed logits.
        expected_probabilities_output = softmax(expected_action_layer_output, axis=-1)
        test.test(("get_logits_probabilities_log_probs", states, ["logits", "probabilities"]), expected_outputs=dict(
            logits=expected_action_layer_output,
            probabilities=expected_probabilities_output
        ), decimals=5)

        print("Probs: {}".format(expected_probabilities_output))

        expected_actions = np.argmax(expected_action_layer_output, axis=-1)
        test.test(("get_action", states), expected_outputs=dict(action=expected_actions))

        # Stochastic sample.
        out = test.test(("get_stochastic_action", states), expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.int32)
        self.assertTrue(out["action"].shape == (3,))

        # Deterministic sample.
        out = test.test(("get_deterministic_action", states), expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.int32)
        self.assertTrue(out["action"].shape == (3,))

        # Distribution's entropy.
        out = test.test(("get_entropy", states), expected_outputs=None)
        self.assertTrue(out["entropy"].dtype == np.float32)
        self.assertTrue(out["entropy"].shape == (3,))