Example 1
    def test_shared_value_function_policy_for_discrete_action_space_with_time_rank_folding(
            self):
        # State space. The NN (configs/test_lrelu_nn.json) is a simple single fc-layer leaky-ReLU network with randomly initialized weights and biases.
        state_space = FloatBox(shape=(3, ),
                               add_batch_rank=True,
                               add_time_rank=True)

        # action_space (4 possible actions).
        action_space = IntBox(4, add_batch_rank=True, add_time_rank=True)
        flat_float_action_space = FloatBox(shape=(4, ),
                                           add_batch_rank=True,
                                           add_time_rank=True)

        # Policy with baseline action adapter AND batch-apply over the entire policy (NN + ActionAdapter + distr.).
        network_spec = config_from_path("configs/test_lrelu_nn.json")
        # Add folding and unfolding to network.
        network_spec["fold_time_rank"] = True
        network_spec["unfold_time_rank"] = True
        shared_value_function_policy = SharedValueFunctionPolicy(
            network_spec=network_spec,
            action_adapter_spec=dict(fold_time_rank=True,
                                     unfold_time_rank=True),
            action_space=action_space,
            value_fold_time_rank=True,
            value_unfold_time_rank=True)
        test = ComponentTest(
            component=shared_value_function_policy,
            input_spaces=dict(
                nn_inputs=state_space,
                actions=action_space,
            ),
            action_space=action_space,
        )
        policy_params = test.read_variable_values(
            shared_value_function_policy.variable_registry)

        # Some NN inputs.
        states = state_space.sample(size=(2, 3))
        states_folded = np.reshape(states, newshape=(6, 3))
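        # Illustrative sketch (assumption: pure-numpy view of time-rank folding):
        # folding merges the batch (2) and time (3) ranks into one rank of 6 samples;
        # unfolding is the inverse reshape and recovers the original states exactly.
        self.assertTrue(
            np.allclose(np.reshape(states_folded, newshape=states.shape), states))
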
        # Raw NN-output: leaky ReLU (alpha=0.1) of the folded states times the hidden-layer weights (read from the variable registry), unfolded back to the states' shape.
        expected_nn_output = np.reshape(
            relu(
                np.matmul(
                    states_folded,
                    ComponentTest.read_params(
                        "shared-value-function-policy/test-network/hidden-layer",
                        policy_params)), 0.1),
            newshape=states.shape)
        test.test(("get_nn_outputs", states),
                  expected_outputs=expected_nn_output,
                  decimals=5)
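
        # Quick self-check (sketch; assumption: `relu(x, alpha)` used in these tests is
        # the leaky ReLU, i.e. elementwise max(x, alpha * x)).
        _toy = np.array([-2.0, 0.0, 3.0])
        self.assertTrue(
            np.allclose(relu(_toy, 0.1), np.where(_toy > 0.0, _toy, 0.1 * _toy)))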

        # Raw action-layer output; expected shape=(2, 3, 4): 2=batch, 3=time, 4=action categories.
        expected_action_layer_output = np.matmul(
            expected_nn_output,
            ComponentTest.read_params(
                "shared-value-function-policy/action-adapter-0/action-network/action-layer/",
                policy_params))

        expected_action_layer_output = np.reshape(expected_action_layer_output,
                                                  newshape=(2, 3, 4))
        test.test(
            ("get_adapter_outputs", states),
            expected_outputs=dict(adapter_outputs=expected_action_layer_output,
                                  nn_outputs=expected_nn_output),
            decimals=5)

        # State-values: one for each (batch, time) item.
        expected_state_value_output = np.matmul(
            expected_nn_output,
            ComponentTest.read_params(
                "shared-value-function-policy/value-function-node/dense-layer",
                policy_params))
        expected_state_value_output_unfolded = np.reshape(
            expected_state_value_output, newshape=(2, 3, 1))
        test.test(("get_state_values", states, ["state_values"]),
                  expected_outputs=dict(
                      state_values=expected_state_value_output_unfolded),
                  decimals=5)

        expected_action_layer_output_unfolded = np.reshape(
            expected_action_layer_output, newshape=(2, 3, 4))
        test.test(("get_state_values_adapter_outputs_and_parameters", states,
                   ["state_values", "adapter_outputs"]),
                  expected_outputs=dict(
                      state_values=expected_state_value_output_unfolded,
                      adapter_outputs=expected_action_layer_output_unfolded),
                  decimals=5)

        # Parameters (probabilities): softmaxed logits, bounded below by SMALL_NUMBER.
        expected_parameters_output = np.maximum(
            softmax(expected_action_layer_output_unfolded, axis=-1),
            SMALL_NUMBER)
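
        # Sketch (assumption): the library softmax matches the standard numerically
        # stable formulation (shift by the row max before exponentiating).
        _shifted = expected_action_layer_output_unfolded - np.max(
            expected_action_layer_output_unfolded, axis=-1, keepdims=True)
        _manual_softmax = np.exp(_shifted) / np.sum(
            np.exp(_shifted), axis=-1, keepdims=True)
        self.assertTrue(np.allclose(
            np.maximum(_manual_softmax, SMALL_NUMBER), expected_parameters_output))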
        test.test(("get_adapter_outputs_and_parameters", states,
                   ["adapter_outputs", "parameters", "nn_outputs"]),
                  expected_outputs=dict(
                      nn_outputs=expected_nn_output,
                      adapter_outputs=expected_action_layer_output_unfolded,
                      parameters=expected_parameters_output),
                  decimals=5)

        print("Probs: {}".format(expected_parameters_output))

        expected_actions = np.argmax(expected_action_layer_output_unfolded,
                                     axis=-1)
        test.test(("get_action", states, ["action"]),
                  expected_outputs=dict(action=expected_actions))

        out = test.test(("get_action_and_log_likelihood", states))
        action = out["action"]
        llh = out["log_likelihood"]

        # Action log-llh.
        expected_action_log_llh_output = np.log(
            np.array([[
                expected_parameters_output[0][0][action[0][0]],
                expected_parameters_output[0][1][action[0][1]],
                expected_parameters_output[0][2][action[0][2]],
            ],
                      [
                          expected_parameters_output[1][0][action[1][0]],
                          expected_parameters_output[1][1][action[1][1]],
                          expected_parameters_output[1][2][action[1][2]],
                      ]]))
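
        # Equivalent vectorized lookup (sketch): gather each chosen action's probability
        # along the last axis with np.take_along_axis instead of manual indexing.
        _gathered = np.take_along_axis(
            expected_parameters_output, action[..., None], axis=-1)[..., 0]
        self.assertTrue(np.allclose(np.log(_gathered), expected_action_log_llh_output))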
        test.test(("get_log_likelihood", [states, action]),
                  expected_outputs=dict(
                      log_likelihood=expected_action_log_llh_output,
                      adapter_outputs=expected_action_layer_output_unfolded),
                  decimals=5)
        recursive_assert_almost_equal(expected_action_log_llh_output,
                                      llh,
                                      decimals=5)

        # Deterministic sample.
        out = test.test(("get_deterministic_action", states),
                        expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.int32
                        or (out["action"].dtype == np.int64))
        self.assertTrue(
            out["action"].shape == (2, 3))  # Make sure output is unfolded.

        # Stochastic sample.
        out = test.test(("get_stochastic_action", states),
                        expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.int32
                        or (out["action"].dtype == np.int64))
        self.assertTrue(
            out["action"].shape == (2, 3))  # Make sure output is unfolded.

        # Distribution's entropy.
        out = test.test(("get_entropy", states), expected_outputs=None)
        self.assertTrue(out["entropy"].dtype == np.float32)
        self.assertTrue(
            out["entropy"].shape == (2, 3))  # Make sure output is unfolded.
Example 2
    def test_shared_value_function_policy_for_discrete_action_space(self):
        # State space. The NN (configs/test_lrelu_nn.json) is a simple single fc-layer leaky-ReLU network with randomly initialized weights and biases.
        state_space = FloatBox(shape=(4, ), add_batch_rank=True)

        # action_space (3 possible actions).
        action_space = IntBox(3, add_batch_rank=True)

        # Policy with baseline action adapter.
        shared_value_function_policy = SharedValueFunctionPolicy(
            network_spec=config_from_path("configs/test_lrelu_nn.json"),
            action_space=action_space)
        test = ComponentTest(
            component=shared_value_function_policy,
            input_spaces=dict(
                nn_inputs=state_space,
                actions=action_space,
            ),
            action_space=action_space,
        )
        policy_params = test.read_variable_values(
            shared_value_function_policy.variable_registry)

        # Some NN inputs (4 input nodes, batch size=3).
        states = state_space.sample(size=3)
        # Raw NN-output: leaky ReLU (alpha=0.1) of the states times the hidden-layer weights (read from the variable registry).
        expected_nn_output = relu(
            np.matmul(
                states,
                ComponentTest.read_params(
                    "shared-value-function-policy/test-network/hidden-layer",
                    policy_params)), 0.1)

        test.test(("get_nn_outputs", states),
                  expected_outputs=expected_nn_output,
                  decimals=5)

        # Raw action-layer output; expected shape=(3, 3): 3=batch, 3=action categories (the state value comes from a separate head).
        expected_action_layer_output = np.matmul(
            expected_nn_output,
            ComponentTest.read_params(
                "shared-value-function-policy/action-adapter-0/action-network/action-layer/",
                policy_params))
        test.test(
            ("get_adapter_outputs", states),
            expected_outputs=dict(adapter_outputs=expected_action_layer_output,
                                  nn_outputs=expected_nn_output),
            decimals=5)

        # State-values: One for each item in the batch.
        expected_state_value_output = np.matmul(
            expected_nn_output,
            ComponentTest.read_params(
                "shared-value-function-policy/value-function-node/dense-layer",
                policy_params))
        test.test(
            ("get_state_values", states, ["state_values"]),
            expected_outputs=dict(state_values=expected_state_value_output),
            decimals=5)

        # State-values and adapter outputs (logits) in one call.
        test.test(("get_state_values_adapter_outputs_and_parameters", states,
                   ["state_values", "adapter_outputs"]),
                  expected_outputs=dict(
                      state_values=expected_state_value_output,
                      adapter_outputs=expected_action_layer_output),
                  decimals=5)

        # Parameters (probabilities): softmaxed logits, bounded below by SMALL_NUMBER.
        expected_parameters_output = np.maximum(
            softmax(expected_action_layer_output, axis=-1), SMALL_NUMBER)
        test.test(
            ("get_adapter_outputs_and_parameters", states,
             ["adapter_outputs", "parameters"]),
            expected_outputs=dict(adapter_outputs=expected_action_layer_output,
                                  parameters=expected_parameters_output),
            decimals=5)

        print("Probs: {}".format(expected_parameters_output))

        expected_actions = np.argmax(expected_action_layer_output, axis=-1)
        test.test(("get_action", states, ["action"]),
                  expected_outputs=dict(action=expected_actions))
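
        # Sketch: softmax is monotonic along the last axis, so the greedy action from the
        # probabilities matches the greedy action from the raw logits (assumption: any
        # ties break the same way).
        self.assertTrue(
            np.all(np.argmax(expected_parameters_output, axis=-1) == expected_actions))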

        # Get action AND log-llh.
        out = test.test(("get_action_and_log_likelihood", states))
        action = out["action"]
        llh = out["log_likelihood"]

        # Action log-llh.
        expected_action_log_llh_output = np.log(
            np.array([
                expected_parameters_output[0][action[0]],
                expected_parameters_output[1][action[1]],
                expected_parameters_output[2][action[2]],
            ]))
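
        # Equivalent lookup via fancy indexing (sketch): one chosen-action probability
        # per batch item, selected in a single vectorized step.
        _manual_llh = np.log(expected_parameters_output[np.arange(3), action])
        self.assertTrue(np.allclose(_manual_llh, expected_action_log_llh_output))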
        test.test(("get_log_likelihood", [states, action], "log_likelihood"),
                  expected_outputs=dict(
                      log_likelihood=expected_action_log_llh_output),
                  decimals=5)
        recursive_assert_almost_equal(expected_action_log_llh_output, llh)

        # Stochastic sample.
        out = test.test(("get_stochastic_action", states),
                        expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.int32
                        or (out["action"].dtype == np.int64))
        self.assertTrue(out["action"].shape == (3, ))

        # Deterministic sample.
        out = test.test(("get_deterministic_action", states),
                        expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.int32
                        or (out["action"].dtype == np.int64))
        self.assertTrue(out["action"].shape == (3, ))

        # Distribution's entropy.
        out = test.test(("get_entropy", states), expected_outputs=None)
        self.assertTrue(out["entropy"].dtype == np.float32)
        self.assertTrue(out["entropy"].shape == (3, ))

    def test_shared_value_function_policy_for_discrete_container_action_space_with_time_rank_folding(
            self):
        # State space. The NN (configs/test_lrelu_nn.json) is a simple single fc-layer leaky-ReLU network with randomly initialized weights and biases.
        state_space = FloatBox(shape=(6, ),
                               add_batch_rank=True,
                               add_time_rank=True)

        # Action space (nested Tuple/Dict container).
        action_space = Tuple(IntBox(2),
                             IntBox(3),
                             Dict(a=IntBox(4), ),
                             add_batch_rank=True,
                             add_time_rank=True)
        flat_float_action_space = Tuple(FloatBox(shape=(2, )),
                                        FloatBox(shape=(3, )),
                                        Dict(a=FloatBox(shape=(4, )), ),
                                        add_batch_rank=True,
                                        add_time_rank=True)

        # Policy with baseline action adapter AND batch-apply over the entire policy (NN + ActionAdapter + distr.).
        network_spec = config_from_path("configs/test_lrelu_nn.json")
        network_spec["fold_time_rank"] = True
        shared_value_function_policy = SharedValueFunctionPolicy(
            network_spec=network_spec,
            action_adapter_spec=dict(unfold_time_rank=True),
            action_space=action_space,
            value_unfold_time_rank=True)
        test = ComponentTest(
            component=shared_value_function_policy,
            input_spaces=dict(nn_input=state_space,
                              actions=action_space,
                              probabilities=flat_float_action_space,
                              parameters=flat_float_action_space,
                              logits=flat_float_action_space),
            action_space=action_space,
        )
        policy_params = test.read_variable_values(
            shared_value_function_policy.variables)
        base_scope = "shared-value-function-policy/action-adapter-"

        # Some NN inputs.
        states = state_space.sample(size=(2, 3))
        states_folded = np.reshape(states, newshape=(6, 6))
        # Raw NN-output (still folded).
        expected_nn_output = relu(
            np.matmul(
                states_folded, policy_params[
                    "shared-value-function-policy/test-network/hidden-layer/dense/kernel"]
            ), 0.1)
        test.test(("get_nn_output", states),
                  expected_outputs=dict(output=expected_nn_output),
                  decimals=5)

        # Raw action-layer outputs (one per container component), still folded; unfolded below before testing.
        expected_action_layer_output = tuple([
            np.matmul(
                expected_nn_output,
                policy_params[base_scope +
                              "0/action-network/action-layer/dense/kernel"]),
            np.matmul(
                expected_nn_output,
                policy_params[base_scope +
                              "1/action-network/action-layer/dense/kernel"]),
            dict(a=np.matmul(
                expected_nn_output,
                policy_params[base_scope +
                              "2/action-network/action-layer/dense/kernel"]))
        ])
        expected_action_layer_output_unfolded = tuple([
            np.reshape(expected_action_layer_output[0], newshape=(2, 3, 2)),
            np.reshape(expected_action_layer_output[1], newshape=(2, 3, 3)),
            dict(a=np.reshape(expected_action_layer_output[2]["a"],
                              newshape=(2, 3, 4)))
        ])
        test.test(("get_action_layer_output", states),
                  expected_outputs=dict(
                      output=expected_action_layer_output_unfolded),
                  decimals=5)

        # State-values: one for each (batch, time) item (computed folded, then unfolded).
        expected_state_value_output = np.matmul(
            expected_nn_output, policy_params[
                "shared-value-function-policy/value-function-node/dense-layer/dense/kernel"]
        )
        expected_state_value_output_unfolded = np.reshape(
            expected_state_value_output, newshape=(2, 3, 1))
        test.test(("get_state_values", states),
                  expected_outputs=dict(
                      state_values=expected_state_value_output_unfolded),
                  decimals=5)

        test.test(("get_state_values_logits_probabilities_log_probs", states,
                   ["state_values", "logits"]),
                  expected_outputs=dict(
                      state_values=expected_state_value_output_unfolded,
                      logits=expected_action_layer_output_unfolded),
                  decimals=5)

        # Parameters (probabilities): softmaxed logits.
        expected_probabilities_output = tuple([
            softmax(expected_action_layer_output_unfolded[0], axis=-1),
            softmax(expected_action_layer_output_unfolded[1], axis=-1),
            dict(a=softmax(expected_action_layer_output_unfolded[2]["a"],
                           axis=-1))
        ])
        test.test(
            ("get_logits_probabilities_log_probs", states,
             ["logits", "probabilities"]),
            expected_outputs=dict(logits=expected_action_layer_output_unfolded,
                                  probabilities=expected_probabilities_output),
            decimals=5)

        print("Probs: {}".format(expected_probabilities_output))

        expected_actions = tuple([
            np.argmax(expected_action_layer_output_unfolded[0], axis=-1),
            np.argmax(expected_action_layer_output_unfolded[1], axis=-1),
            dict(a=np.argmax(expected_action_layer_output_unfolded[2]["a"],
                             axis=-1), )
        ])
        test.test(("get_action", states),
                  expected_outputs=dict(action=expected_actions))

        # Action log-probs.
        expected_action_log_prob_output = tuple([
            np.log(
                np.array([[
                    expected_probabilities_output[0][0][0][expected_actions[0]
                                                           [0][0]],
                    expected_probabilities_output[0][0][1][expected_actions[0]
                                                           [0][1]],
                    expected_probabilities_output[0][0][2][expected_actions[0]
                                                           [0][2]],
                ],
                          [
                              expected_probabilities_output[0][1][0][
                                  expected_actions[0][1][0]],
                              expected_probabilities_output[0][1][1][
                                  expected_actions[0][1][1]],
                              expected_probabilities_output[0][1][2][
                                  expected_actions[0][1][2]],
                          ]])),
            np.log(
                np.array([[
                    expected_probabilities_output[1][0][0][expected_actions[1]
                                                           [0][0]],
                    expected_probabilities_output[1][0][1][expected_actions[1]
                                                           [0][1]],
                    expected_probabilities_output[1][0][2][expected_actions[1]
                                                           [0][2]],
                ],
                          [
                              expected_probabilities_output[1][1][0][
                                  expected_actions[1][1][0]],
                              expected_probabilities_output[1][1][1][
                                  expected_actions[1][1][1]],
                              expected_probabilities_output[1][1][2][
                                  expected_actions[1][1][2]],
                          ]])),
            dict(a=np.log(
                np.array([[
                    expected_probabilities_output[2]["a"][0][0][
                        expected_actions[2]["a"][0][0]],
                    expected_probabilities_output[2]["a"][0][1][
                        expected_actions[2]["a"][0][1]],
                    expected_probabilities_output[2]["a"][0][2][
                        expected_actions[2]["a"][0][2]],
                ],
                          [
                              expected_probabilities_output[2]["a"][1][0][
                                  expected_actions[2]["a"][1][0]],
                              expected_probabilities_output[2]["a"][1][1][
                                  expected_actions[2]["a"][1][1]],
                              expected_probabilities_output[2]["a"][1][2][
                                  expected_actions[2]["a"][1][2]],
                          ]])))
        ])
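
        # Sketch (assumption): the same per-component lookup expressed with
        # np.take_along_axis, shown here for the first Tuple component only.
        _gathered_0 = np.take_along_axis(
            expected_probabilities_output[0],
            expected_actions[0][..., None], axis=-1)[..., 0]
        self.assertTrue(
            np.allclose(np.log(_gathered_0), expected_action_log_prob_output[0]))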
        test.test(("get_action_log_probs", [states, expected_actions]),
                  expected_outputs=dict(
                      action_log_probs=expected_action_log_prob_output,
                      logits=expected_action_layer_output_unfolded),
                  decimals=5)

        # Deterministic sample.
        out = test.test(("get_deterministic_action", states),
                        expected_outputs=None)
        self.assertTrue(out["action"][0].dtype == np.int32)
        self.assertTrue(
            out["action"][0].shape == (2, 3))  # Make sure output is unfolded.
        self.assertTrue(out["action"][1].dtype == np.int32)
        self.assertTrue(
            out["action"][1].shape == (2, 3))  # Make sure output is unfolded.
        self.assertTrue(out["action"][2]["a"].dtype == np.int32)
        self.assertTrue(out["action"][2]["a"].shape == (
            2, 3))  # Make sure output is unfolded.

        # Stochastic sample.
        out = test.test(("get_stochastic_action", states),
                        expected_outputs=None)
        self.assertTrue(out["action"][0].dtype == np.int32)
        self.assertTrue(
            out["action"][0].shape == (2, 3))  # Make sure output is unfolded.
        self.assertTrue(out["action"][1].dtype == np.int32)
        self.assertTrue(
            out["action"][1].shape == (2, 3))  # Make sure output is unfolded.
        self.assertTrue(out["action"][2]["a"].dtype == np.int32)
        self.assertTrue(out["action"][2]["a"].shape == (
            2, 3))  # Make sure output is unfolded.

        # Distribution's entropy.
        out = test.test(("get_entropy", states), expected_outputs=None)
        self.assertTrue(out["entropy"][0].dtype == np.float32)
        self.assertTrue(
            out["entropy"][0].shape == (2, 3))  # Make sure output is unfolded.
        self.assertTrue(out["entropy"][1].dtype == np.float32)
        self.assertTrue(
            out["entropy"][1].shape == (2, 3))  # Make sure output is unfolded.
        self.assertTrue(out["entropy"][2]["a"].dtype == np.float32)
        self.assertTrue(out["entropy"][2]["a"].shape == (
            2, 3))  # Make sure output is unfolded.
Example 4
    def test_shared_value_function_policy_for_discrete_action_space(self):
        # State space. The NN (configs/test_lrelu_nn.json) is a simple single fc-layer leaky-ReLU network with randomly initialized weights and biases.
        state_space = FloatBox(shape=(4, ), add_batch_rank=True)

        # action_space (3 possible actions).
        action_space = IntBox(3, add_batch_rank=True)
        flat_float_action_space = FloatBox(shape=(3, ), add_batch_rank=True)

        # Policy with baseline action adapter.
        shared_value_function_policy = SharedValueFunctionPolicy(
            network_spec=config_from_path("configs/test_lrelu_nn.json"),
            action_space=action_space)
        test = ComponentTest(
            component=shared_value_function_policy,
            input_spaces=dict(nn_input=state_space,
                              actions=action_space,
                              probabilities=flat_float_action_space,
                              logits=flat_float_action_space),
            action_space=action_space,
        )
        policy_params = test.read_variable_values(
            shared_value_function_policy.variables)

        # Some NN inputs (4 input nodes, batch size=3).
        states = state_space.sample(size=3)
        # Raw NN-output: leaky ReLU (alpha=0.1) of the states times the hidden-layer weights (read from the policy variables).
        expected_nn_output = relu(
            np.matmul(
                states, policy_params[
                    "shared-value-function-policy/test-network/hidden-layer/dense/kernel"]
            ), 0.1)
        test.test(("get_nn_output", states),
                  expected_outputs=dict(output=expected_nn_output),
                  decimals=5)

        # Raw action-layer output; expected shape=(3, 3): 3=batch, 3=action categories (the state value comes from a separate head).
        expected_action_layer_output = np.matmul(
            expected_nn_output, policy_params[
                "shared-value-function-policy/action-adapter-0/action-network/action-layer/dense/kernel"]
        )
        test.test(("get_action_layer_output", states),
                  expected_outputs=dict(output=expected_action_layer_output),
                  decimals=5)

        # State-values: One for each item in the batch.
        expected_state_value_output = np.matmul(
            expected_nn_output, policy_params[
                "shared-value-function-policy/value-function-node/dense-layer/dense/kernel"]
        )
        test.test(
            ("get_state_values", states),
            expected_outputs=dict(state_values=expected_state_value_output),
            decimals=5)

        # State-values and logits in one call.
        test.test(
            ("get_state_values_logits_probabilities_log_probs", states,
             ["state_values", "logits"]),
            expected_outputs=dict(state_values=expected_state_value_output,
                                  logits=expected_action_layer_output),
            decimals=5)

        # Parameters (probabilities): softmaxed logits.
        expected_probabilities_output = softmax(expected_action_layer_output,
                                                axis=-1)
        test.test(
            ("get_logits_probabilities_log_probs", states,
             ["logits", "probabilities"]),
            expected_outputs=dict(logits=expected_action_layer_output,
                                  probabilities=expected_probabilities_output),
            decimals=5)

        print("Probs: {}".format(expected_probabilities_output))

        expected_actions = np.argmax(expected_action_layer_output, axis=-1)
        test.test(("get_action", states),
                  expected_outputs=dict(action=expected_actions))

        # Stochastic sample.
        out = test.test(("get_stochastic_action", states),
                        expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.int32)
        self.assertTrue(out["action"].shape == (3, ))

        # Deterministic sample.
        out = test.test(("get_deterministic_action", states),
                        expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.int32)
        self.assertTrue(out["action"].shape == (3, ))

        # Distribution's entropy.
        out = test.test(("get_entropy", states), expected_outputs=None)
        self.assertTrue(out["entropy"].dtype == np.float32)
        self.assertTrue(out["entropy"].shape == (3, ))

    def test_shared_value_function_policy_for_discrete_container_action_space(
            self):
        # State space. The NN (configs/test_lrelu_nn.json) is a simple single fc-layer leaky-ReLU network with randomly initialized weights and biases.
        state_space = FloatBox(shape=(5, ), add_batch_rank=True)

        # action_space (complex nested container action space).
        action_space = dict(type="dict",
                            a=IntBox(2),
                            b=Dict(b1=IntBox(3), b2=IntBox(4)),
                            add_batch_rank=True)
        flat_float_action_space = dict(type="dict",
                                       a=FloatBox(shape=(2, )),
                                       b=Dict(b1=FloatBox(shape=(3, )),
                                              b2=FloatBox(shape=(4, ))),
                                       add_batch_rank=True)

        # Policy with baseline action adapter.
        shared_value_function_policy = SharedValueFunctionPolicy(
            network_spec=config_from_path("configs/test_lrelu_nn.json"),
            action_space=action_space)
        test = ComponentTest(
            component=shared_value_function_policy,
            input_spaces=dict(nn_input=state_space,
                              actions=action_space,
                              probabilities=flat_float_action_space,
                              parameters=flat_float_action_space,
                              logits=flat_float_action_space),
            action_space=action_space,
        )
        policy_params = test.read_variable_values(
            shared_value_function_policy.variables)

        base_scope = "shared-value-function-policy/action-adapter-"

        # Some NN inputs (batch size=2).
        states = state_space.sample(size=2)
        # Raw NN-output.
        expected_nn_output = relu(
            np.matmul(
                states, policy_params[
                    "shared-value-function-policy/test-network/hidden-layer/dense/kernel"]
            ), 0.1)
        test.test(("get_nn_output", states),
                  expected_outputs=dict(output=expected_nn_output),
                  decimals=5)

        # Raw action layers' output.
        expected_action_layer_outputs = dict(
            a=np.matmul(
                expected_nn_output,
                policy_params[base_scope +
                              "0/action-network/action-layer/dense/kernel"]),
            b=dict(b1=np.matmul(
                expected_nn_output,
                policy_params[base_scope +
                              "1/action-network/action-layer/dense/kernel"]),
                   b2=np.matmul(
                       expected_nn_output, policy_params[
                           base_scope +
                           "2/action-network/action-layer/dense/kernel"])))
        test.test(("get_action_layer_output", states),
                  expected_outputs=dict(output=expected_action_layer_outputs),
                  decimals=5)

        # State-values.
        expected_state_value_output = np.matmul(
            expected_nn_output, policy_params[
                "shared-value-function-policy/value-function-node/dense-layer/dense/kernel"]
        )
        test.test(
            ("get_state_values", states),
            expected_outputs=dict(state_values=expected_state_value_output),
            decimals=5)

        # Logits: one per action choice for each item in the batch, returned together with the state values.
        test.test(
            ("get_state_values_logits_probabilities_log_probs", states,
             ["state_values", "logits"]),
            expected_outputs=dict(state_values=expected_state_value_output,
                                  logits=expected_action_layer_outputs),
            decimals=5)

        # Parameters (probabilities): softmaxed logits.
        expected_probabilities_output = dict(
            a=softmax(expected_action_layer_outputs["a"], axis=-1),
            b=dict(b1=softmax(expected_action_layer_outputs["b"]["b1"],
                              axis=-1),
                   b2=softmax(expected_action_layer_outputs["b"]["b2"],
                              axis=-1)))
        test.test(
            ("get_logits_probabilities_log_probs", states,
             ["logits", "probabilities"]),
            expected_outputs=dict(logits=expected_action_layer_outputs,
                                  probabilities=expected_probabilities_output),
            decimals=5)

        print("Probs: {}".format(expected_probabilities_output))

        # Action sample.
        expected_actions = dict(
            a=np.argmax(expected_action_layer_outputs["a"], axis=-1),
            b=dict(b1=np.argmax(expected_action_layer_outputs["b"]["b1"],
                                axis=-1),
                   b2=np.argmax(expected_action_layer_outputs["b"]["b2"],
                                axis=-1)))
        test.test(("get_action", states),
                  expected_outputs=dict(action=expected_actions))

        # Stochastic sample.
        out = test.test(("get_stochastic_action", states),
                        expected_outputs=None)
        self.assertTrue(out["action"]["a"].dtype == np.int32)
        self.assertTrue(out["action"]["a"].shape == (2, ))
        self.assertTrue(out["action"]["b"]["b1"].dtype == np.int32)
        self.assertTrue(out["action"]["b"]["b1"].shape == (2, ))
        self.assertTrue(out["action"]["b"]["b2"].dtype == np.int32)
        self.assertTrue(out["action"]["b"]["b2"].shape == (2, ))

        # Deterministic sample.
        out = test.test(("get_deterministic_action", states),
                        expected_outputs=None)
        self.assertTrue(out["action"]["a"].dtype == np.int32)
        self.assertTrue(out["action"]["a"].shape == (2, ))
        self.assertTrue(out["action"]["b"]["b1"].dtype == np.int32)
        self.assertTrue(out["action"]["b"]["b1"].shape == (2, ))
        self.assertTrue(out["action"]["b"]["b2"].dtype == np.int32)
        self.assertTrue(out["action"]["b"]["b2"].shape == (2, ))

        # Distribution's entropy.
        out = test.test(("get_entropy", states), expected_outputs=None)
        self.assertTrue(out["entropy"]["a"].dtype == np.float32)
        self.assertTrue(out["entropy"]["a"].shape == (2, ))
        self.assertTrue(out["entropy"]["b"]["b1"].dtype == np.float32)
        self.assertTrue(out["entropy"]["b"]["b1"].shape == (2, ))
        self.assertTrue(out["entropy"]["b"]["b2"].dtype == np.float32)
        self.assertTrue(out["entropy"]["b"]["b2"].shape == (2, ))