Code example #1
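These snippets are test methods lifted from rlgraph's test_policies.py; the imports the identifiers below rely on are not part of the listing. A minimal header that should make them resolve might look roughly like the following (the module paths are an assumption inferred from the identifiers used, not copied from the source file):

import numpy as np

# NOTE: module paths below are assumed, not taken from the original file; adjust to your rlgraph version.
from rlgraph.components.policies import Policy, DuelingPolicy, SharedValueFunctionPolicy
from rlgraph.spaces import FloatBox, IntBox, Dict, Tuple
from rlgraph.tests import ComponentTest, recursive_assert_almost_equal
from rlgraph.tests.test_util import config_from_path
from rlgraph.utils.numpy import softmax, relu
from rlgraph.utils.util import SMALL_NUMBER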
    def test_policy_for_discrete_action_space(self):
        # state_space (NN is a simple single fc-layer relu network (2 units), random biases, random weights).
        state_space = FloatBox(shape=(4,), add_batch_rank=True)

        # action_space (5 possible actions).
        action_space = IntBox(5, add_batch_rank=True)

        policy = Policy(network_spec=config_from_path("configs/test_simple_nn.json"), action_space=action_space)
        test = ComponentTest(
            component=policy,
            input_spaces=dict(nn_input=state_space),
            action_space=action_space
        )
        policy_params = test.read_variable_values(policy.variable_registry)

        # Some NN inputs (4 input nodes, batch size=2).
        states = np.array([[-0.08, 0.4, -0.05, -0.55], [13.0, -14.0, 10.0, -16.0]])
        # Raw NN-output.
        expected_nn_output = np.matmul(states, policy_params["policy/test-network/hidden-layer/dense/kernel"])
        test.test(("get_nn_output", states), expected_outputs=dict(output=expected_nn_output), decimals=6)

        # Raw action layer output; Expected shape=(2,5): 2=batch, 5=action categories
        expected_action_layer_output = np.matmul(
            expected_nn_output, policy_params["policy/action-adapter/action-layer/dense/kernel"]
        )
        expected_action_layer_output = np.reshape(expected_action_layer_output, newshape=(2, 5))
        test.test(("get_action_layer_output", states), expected_outputs=dict(output=expected_action_layer_output),
                  decimals=5)

        expected_actions = np.argmax(expected_action_layer_output, axis=-1)
        test.test(("get_action", states), expected_outputs=dict(action=expected_actions, last_internal_states=None))

        # Logits, parameters (probs) and log-probs (log of very small probs is numerically unstable).
        expected_probabilities_output = softmax(expected_action_layer_output, axis=-1)
        test.test(("get_logits_parameters_log_probs", states, [0, 1, 2]), expected_outputs=dict(
            logits=expected_action_layer_output,
            parameters=expected_probabilities_output,
            log_probs=np.log(expected_probabilities_output)
        ), decimals=5)

        print("Probs: {}".format(expected_probabilities_output))

        # Deterministic sample.
        out = test.test(("get_deterministic_action", states), expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.int32)
        self.assertTrue(out["action"].shape == (2,))

        # Stochastic sample.
        out = test.test(("get_stochastic_action", states), expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.int32)
        self.assertTrue(out["action"].shape == (2,))

        # Distribution's entropy.
        out = test.test(("get_entropy", states), expected_outputs=None)
        self.assertTrue(out["entropy"].dtype == np.float32)
        self.assertTrue(out["entropy"].shape == (2,))
Code example #2
    def test_policy_for_discrete_action_space_with_dueling_layer(self):
        # np.random.seed(10)
        # state_space (NN is a simple single fc-layer relu network (2 units), random biases, random weights).
        nn_input_space = FloatBox(shape=(3, ), add_batch_rank=True)

        # action_space (2 possible actions).
        action_space = IntBox(2, add_batch_rank=True)
        # flat_float_action_space = FloatBox(shape=(2,), add_batch_rank=True)

        # Policy with dueling logic.
        policy = DuelingPolicy(
            network_spec=config_from_path("configs/test_lrelu_nn.json"),
            action_adapter_spec=dict(pre_network_spec=[
                dict(type="dense",
                     units=10,
                     activation="lrelu",
                     activation_params=[0.1])
            ]),
            units_state_value_stream=10,
            action_space=action_space)
        test = ComponentTest(component=policy,
                             input_spaces=dict(
                                 nn_inputs=nn_input_space,
                                 actions=action_space,
                             ),
                             action_space=action_space)
        policy_params = test.read_variable_values(policy.variable_registry)

        # Some NN inputs.
        nn_input = nn_input_space.sample(size=3)
        # Raw NN-output.
        expected_nn_output = relu(
            np.matmul(
                nn_input,
                ComponentTest.read_params(
                    "dueling-policy/test-network/hidden-layer",
                    policy_params)), 0.1)
        test.test(("get_nn_outputs", nn_input),
                  expected_outputs=expected_nn_output)

        # Single state values.
        expected_state_values = np.matmul(
            relu(
                np.matmul(
                    expected_nn_output,
                    ComponentTest.read_params(
                        "dueling-policy/dense-layer-state-value-stream",
                        policy_params))),
            ComponentTest.read_params("dueling-policy/state-value-node",
                                      policy_params))
        test.test(
            ("get_state_values", nn_input, ["state_values", "nn_outputs"]),
            expected_outputs=dict(state_values=expected_state_values,
                                  nn_outputs=expected_nn_output),
            decimals=5)

        # Raw action layer output.
        expected_raw_advantages = np.matmul(
            relu(
                np.matmul(
                    expected_nn_output,
                    ComponentTest.read_params(
                        "dueling-policy/action-adapter-0/action-network/dense-layer",
                        policy_params)), 0.1),
            ComponentTest.read_params(
                "dueling-policy/action-adapter-0/action-network/action-layer",
                policy_params))

        # Q-values: One for each item in the batch.
        expected_q_values_output = expected_state_values + expected_raw_advantages - \
            np.mean(expected_raw_advantages, axis=-1, keepdims=True)
        test.test(
            ("get_adapter_outputs", nn_input,
             ["adapter_outputs", "advantages"]),
            expected_outputs=dict(adapter_outputs=expected_q_values_output,
                                  advantages=expected_raw_advantages),
            decimals=5)

        # Parameter (probabilities). Softmaxed q_values.
        expected_parameters_output = np.maximum(
            softmax(expected_q_values_output, axis=-1), SMALL_NUMBER)
        test.test(
            ("get_adapter_outputs_and_parameters", nn_input,
             ["adapter_outputs", "parameters"]),
            expected_outputs=dict(adapter_outputs=expected_q_values_output,
                                  parameters=expected_parameters_output),
            decimals=5)

        print("Probs: {}".format(expected_parameters_output))

        expected_actions = np.argmax(expected_q_values_output, axis=-1)
        test.test(("get_action", nn_input, ["action"]),
                  expected_outputs=dict(action=expected_actions))

        out = test.test(("get_action_and_log_likelihood", nn_input))
        action = out["action"]
        llh = out["log_likelihood"]

        # Action log-probs.
        expected_action_log_llh_output = np.log(
            np.array([
                expected_parameters_output[0][action[0]],
                expected_parameters_output[1][action[1]],
                expected_parameters_output[2][action[2]],
            ]))
        test.test(("get_log_likelihood", [nn_input, action]),
                  expected_outputs=dict(
                      log_likelihood=expected_action_log_llh_output,
                      adapter_outputs=expected_q_values_output),
                  decimals=5)
        recursive_assert_almost_equal(expected_action_log_llh_output,
                                      llh,
                                      decimals=5)

        # Stochastic sample.
        out = test.test(("get_stochastic_action", nn_input),
                        expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.int32
                        or (out["action"].dtype == np.int64))
        self.assertTrue(out["action"].shape == (3, ))

        # Deterministic sample.
        out = test.test(("get_deterministic_action", nn_input),
                        expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.int32
                        or (out["action"].dtype == np.int64))
        self.assertTrue(out["action"].shape == (3, ))

        # Distribution's entropy.
        out = test.test(("get_entropy", nn_input), expected_outputs=None)
        self.assertTrue(out["entropy"].dtype == np.float32)
        self.assertTrue(out["entropy"].shape == (3, ))
Code example #3
    def test_policy_for_discrete_action_space(self):
        # state_space (NN is a simple single fc-layer relu network (2 units), random biases, random weights).
        state_space = FloatBox(shape=(4, ), add_batch_rank=True)

        # action_space (5 possible actions).
        action_space = IntBox(5, add_batch_rank=True)

        policy = Policy(
            network_spec=config_from_path("configs/test_simple_nn.json"),
            action_space=action_space)
        test = ComponentTest(component=policy,
                             input_spaces=dict(
                                 nn_inputs=state_space,
                                 actions=action_space,
                             ),
                             action_space=action_space)
        policy_params = test.read_variable_values(policy.variable_registry)

        # Some NN inputs (4 input nodes, batch size=2).
        states = np.array([[-0.08, 0.4, -0.05, -0.55],
                           [13.0, -14.0, 10.0, -16.0]])
        # Raw NN-output.
        expected_nn_output = np.matmul(
            states,
            ComponentTest.read_params("policy/test-network/hidden-layer",
                                      policy_params))

        test.test(("get_nn_outputs", states),
                  expected_outputs=expected_nn_output,
                  decimals=5)

        # Raw action layer output; Expected shape=(2,5): 2=batch, 5=action categories
        expected_action_layer_output = np.matmul(
            expected_nn_output,
            ComponentTest.read_params(
                "policy/action-adapter-0/action-network/action-layer",
                policy_params))
        test.test(
            ("get_adapter_outputs", states),
            expected_outputs=dict(adapter_outputs=expected_action_layer_output,
                                  nn_outputs=expected_nn_output),
            decimals=5)

        # Logits, parameters (probs, clamped to SMALL_NUMBER) and log-probs (raw log of tiny probs would be numerically unstable).
        expected_parameters_output = np.maximum(
            softmax(expected_action_layer_output, axis=-1), SMALL_NUMBER)
        test.test(("get_adapter_outputs_and_parameters", states,
                   ["adapter_outputs", "parameters", "log_probs"]),
                  expected_outputs=dict(
                      adapter_outputs=expected_action_layer_output,
                      parameters=np.array(expected_parameters_output,
                                          dtype=np.float32),
                      log_probs=np.log(expected_parameters_output)),
                  decimals=5)

        expected_actions = np.argmax(expected_action_layer_output, axis=-1)
        test.test(("get_action", states, ["action"]),
                  expected_outputs=dict(action=expected_actions))

        # Get action AND log-llh.
        out = test.test(("get_action_and_log_likelihood", states))
        action = out["action"]
        llh = out["log_likelihood"]

        # Action log-probs.
        expected_action_log_llh_output = np.log(
            np.array([
                expected_parameters_output[0][action[0]],
                expected_parameters_output[1][action[1]]
            ]))
        test.test(("get_log_likelihood", [states, action], "log_likelihood"),
                  expected_outputs=dict(
                      log_likelihood=expected_action_log_llh_output),
                  decimals=5)
        recursive_assert_almost_equal(expected_action_log_llh_output,
                                      llh,
                                      decimals=5)

        # Stochastic sample.
        out = test.test(("get_stochastic_action", states),
                        expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.int32
                        or (out["action"].dtype == np.int64))
        self.assertTrue(out["action"].shape == (2, ))

        # Deterministic sample.
        test.test(("get_deterministic_action", states), expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.int32
                        or (out["action"].dtype == np.int64))
        self.assertTrue(out["action"].shape == (2, ))

        # Distribution's entropy.
        out = test.test(("get_entropy", states), expected_outputs=None)
        self.assertTrue(out["entropy"].dtype == np.float32)
        self.assertTrue(out["entropy"].shape == (2, ))
Code example #4
    def test_shared_value_function_policy_for_discrete_action_space_with_time_rank_folding(
            self):
        # state_space (NN is a simple single fc-layer relu network (2 units), random biases, random weights).
        state_space = FloatBox(shape=(3, ),
                               add_batch_rank=True,
                               add_time_rank=True)

        # action_space (4 possible actions).
        action_space = IntBox(4, add_batch_rank=True, add_time_rank=True)
        flat_float_action_space = FloatBox(shape=(4, ),
                                           add_batch_rank=True,
                                           add_time_rank=True)

        # Policy with baseline action adapter AND batch-apply over the entire policy (NN + ActionAdapter + distr.).
        network_spec = config_from_path("configs/test_lrelu_nn.json")
        # Add folding and unfolding to network.
        network_spec["fold_time_rank"] = True
        network_spec["unfold_time_rank"] = True
        shared_value_function_policy = SharedValueFunctionPolicy(
            network_spec=network_spec,
            action_adapter_spec=dict(fold_time_rank=True,
                                     unfold_time_rank=True),
            action_space=action_space,
            value_fold_time_rank=True,
            value_unfold_time_rank=True)
        test = ComponentTest(
            component=shared_value_function_policy,
            input_spaces=dict(
                nn_inputs=state_space,
                actions=action_space,
            ),
            action_space=action_space,
        )
        policy_params = test.read_variable_values(
            shared_value_function_policy.variable_registry)

        # Some NN inputs.
        states = state_space.sample(size=(2, 3))
        states_folded = np.reshape(states, newshape=(6, 3))
        # Raw NN-output (3 hidden nodes). All weights=1.5, no biases.
        expected_nn_output = np.reshape(
            relu(
                np.matmul(
                    states_folded,
                    ComponentTest.read_params(
                        "shared-value-function-policy/test-network/hidden-layer",
                        policy_params)), 0.1),
            newshape=states.shape)
        test.test(("get_nn_outputs", states),
                  expected_outputs=expected_nn_output,
                  decimals=5)

        # Raw action layer output; expected shape=(2,3,4) after unfolding: 2=batch, 3=time, 4=action categories.
        expected_action_layer_output = np.matmul(
            expected_nn_output,
            ComponentTest.read_params(
                "shared-value-function-policy/action-adapter-0/action-network/action-layer/",
                policy_params))

        expected_action_layer_output = np.reshape(expected_action_layer_output,
                                                  newshape=(2, 3, 4))
        test.test(
            ("get_adapter_outputs", states),
            expected_outputs=dict(adapter_outputs=expected_action_layer_output,
                                  nn_outputs=expected_nn_output),
            decimals=5)

        # State-values: One for each item in the batch.
        expected_state_value_output = np.matmul(
            expected_nn_output,
            ComponentTest.read_params(
                "shared-value-function-policy/value-function-node/dense-layer",
                policy_params))
        expected_state_value_output_unfolded = np.reshape(
            expected_state_value_output, newshape=(2, 3, 1))
        test.test(("get_state_values", states, ["state_values"]),
                  expected_outputs=dict(
                      state_values=expected_state_value_output_unfolded),
                  decimals=5)

        expected_action_layer_output_unfolded = np.reshape(
            expected_action_layer_output, newshape=(2, 3, 4))
        test.test(("get_state_values_adapter_outputs_and_parameters", states,
                   ["state_values", "adapter_outputs"]),
                  expected_outputs=dict(
                      state_values=expected_state_value_output_unfolded,
                      adapter_outputs=expected_action_layer_output_unfolded),
                  decimals=5)

        # Parameter (probabilities). Softmaxed logits.
        expected_parameters_output = np.maximum(
            softmax(expected_action_layer_output_unfolded, axis=-1),
            SMALL_NUMBER)
        test.test(("get_adapter_outputs_and_parameters", states,
                   ["adapter_outputs", "parameters", "nn_outputs"]),
                  expected_outputs=dict(
                      nn_outputs=expected_nn_output,
                      adapter_outputs=expected_action_layer_output_unfolded,
                      parameters=expected_parameters_output),
                  decimals=5)

        print("Probs: {}".format(expected_parameters_output))

        expected_actions = np.argmax(expected_action_layer_output_unfolded,
                                     axis=-1)
        test.test(("get_action", states, ["action"]),
                  expected_outputs=dict(action=expected_actions))

        out = test.test(("get_action_and_log_likelihood", states))
        action = out["action"]
        llh = out["log_likelihood"]

        # Action log-llh.
        expected_action_log_llh_output = np.log(
            np.array([[
                expected_parameters_output[0][0][action[0][0]],
                expected_parameters_output[0][1][action[0][1]],
                expected_parameters_output[0][2][action[0][2]],
            ],
                      [
                          expected_parameters_output[1][0][action[1][0]],
                          expected_parameters_output[1][1][action[1][1]],
                          expected_parameters_output[1][2][action[1][2]],
                      ]]))
        test.test(("get_log_likelihood", [states, action]),
                  expected_outputs=dict(
                      log_likelihood=expected_action_log_llh_output,
                      adapter_outputs=expected_action_layer_output_unfolded),
                  decimals=5)
        recursive_assert_almost_equal(expected_action_log_llh_output,
                                      llh,
                                      decimals=5)

        # Deterministic sample.
        out = test.test(("get_deterministic_action", states),
                        expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.int32
                        or (out["action"].dtype == np.int64))
        self.assertTrue(
            out["action"].shape == (2, 3))  # Make sure output is unfolded.

        # Stochastic sample.
        out = test.test(("get_stochastic_action", states),
                        expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.int32
                        or (out["action"].dtype == np.int64))
        self.assertTrue(
            out["action"].shape == (2, 3))  # Make sure output is unfolded.

        # Distribution's entropy.
        out = test.test(("get_entropy", states), expected_outputs=None)
        self.assertTrue(out["entropy"].dtype == np.float32)
        self.assertTrue(
            out["entropy"].shape == (2, 3))  # Make sure output is unfolded.
Code example #5
    def test_shared_value_function_policy_for_discrete_action_space(self):
        # state_space (NN is a simple single fc-layer relu network (2 units), random biases, random weights).
        state_space = FloatBox(shape=(4, ), add_batch_rank=True)

        # action_space (3 possible actions).
        action_space = IntBox(3, add_batch_rank=True)

        # Policy with baseline action adapter.
        shared_value_function_policy = SharedValueFunctionPolicy(
            network_spec=config_from_path("configs/test_lrelu_nn.json"),
            action_space=action_space)
        test = ComponentTest(
            component=shared_value_function_policy,
            input_spaces=dict(
                nn_inputs=state_space,
                actions=action_space,
            ),
            action_space=action_space,
        )
        policy_params = test.read_variable_values(
            shared_value_function_policy.variable_registry)

        # Some NN inputs (4 input nodes, batch size=3).
        states = state_space.sample(size=3)
        # Raw NN-output (3 hidden nodes). All weights=1.5, no biases.
        expected_nn_output = relu(
            np.matmul(
                states,
                ComponentTest.read_params(
                    "shared-value-function-policy/test-network/hidden-layer",
                    policy_params)), 0.1)

        test.test(("get_nn_outputs", states),
                  expected_outputs=expected_nn_output,
                  decimals=5)

        # Raw action layer output; expected shape=(3,3): 3=batch, 3=action categories (state values come from the separate value-function node below).
        expected_action_layer_output = np.matmul(
            expected_nn_output,
            ComponentTest.read_params(
                "shared-value-function-policy/action-adapter-0/action-network/action-layer/",
                policy_params))
        test.test(
            ("get_adapter_outputs", states),
            expected_outputs=dict(adapter_outputs=expected_action_layer_output,
                                  nn_outputs=expected_nn_output),
            decimals=5)

        # State-values: One for each item in the batch.
        expected_state_value_output = np.matmul(
            expected_nn_output,
            ComponentTest.read_params(
                "shared-value-function-policy/value-function-node/dense-layer",
                policy_params))
        test.test(
            ("get_state_values", states, ["state_values"]),
            expected_outputs=dict(state_values=expected_state_value_output),
            decimals=5)

        # Logits-values.
        test.test(("get_state_values_adapter_outputs_and_parameters", states,
                   ["state_values", "adapter_outputs"]),
                  expected_outputs=dict(
                      state_values=expected_state_value_output,
                      adapter_outputs=expected_action_layer_output),
                  decimals=5)

        # Parameter (probabilities). Softmaxed logits.
        expected_parameters_output = np.maximum(
            softmax(expected_action_layer_output, axis=-1), SMALL_NUMBER)
        test.test(
            ("get_adapter_outputs_and_parameters", states,
             ["adapter_outputs", "parameters"]),
            expected_outputs=dict(adapter_outputs=expected_action_layer_output,
                                  parameters=expected_parameters_output),
            decimals=5)

        print("Probs: {}".format(expected_parameters_output))

        expected_actions = np.argmax(expected_action_layer_output, axis=-1)
        test.test(("get_action", states, ["action"]),
                  expected_outputs=dict(action=expected_actions))

        # Get action AND log-llh.
        out = test.test(("get_action_and_log_likelihood", states))
        action = out["action"]
        llh = out["log_likelihood"]

        # Action log-llh.
        expected_action_log_llh_output = np.log(
            np.array([
                expected_parameters_output[0][action[0]],
                expected_parameters_output[1][action[1]],
                expected_parameters_output[2][action[2]],
            ]))
        test.test(("get_log_likelihood", [states, action], "log_likelihood"),
                  expected_outputs=dict(
                      log_likelihood=expected_action_log_llh_output),
                  decimals=5)
        recursive_assert_almost_equal(expected_action_log_llh_output, llh)

        # Stochastic sample.
        out = test.test(("get_stochastic_action", states),
                        expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.int32
                        or (out["action"].dtype == np.int64))
        self.assertTrue(out["action"].shape == (3, ))

        # Deterministic sample.
        out = test.test(("get_deterministic_action", states),
                        expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.int32
                        or (out["action"].dtype == np.int64))
        self.assertTrue(out["action"].shape == (3, ))

        # Distribution's entropy.
        out = test.test(("get_entropy", states), expected_outputs=None)
        self.assertTrue(out["entropy"].dtype == np.float32)
        self.assertTrue(out["entropy"].shape == (3, ))
Code example #6
File: test_policies.py  Project: mugenZebra/rlgraph
    def test_policy_for_discrete_action_space_with_baseline_layer(self):
        np.random.seed(11)
        # state_space (NN is a simple single fc-layer relu network (2 units), random biases, random weights).
        state_space = FloatBox(shape=(4, ), add_batch_rank=True)

        # action_space (3 possible actions).
        action_space = IntBox(3, add_batch_rank=True)

        # Policy with baseline action adapter.
        policy = Policy(
            network_spec=config_from_path("configs/test_lrelu_nn.json"),
            action_adapter_spec=dict(type="baseline_action_adapter",
                                     action_space=action_space))
        test = ComponentTest(component=policy,
                             input_spaces=dict(nn_input=state_space),
                             action_space=action_space,
                             seed=11)
        policy_params = test.read_variable_values(policy.variables)

        # Some NN inputs (4 input nodes, batch size=3).
        states = state_space.sample(size=3)
        # Raw NN-output (3 hidden nodes). All weights=1.5, no biases.
        expected_nn_output = np.matmul(
            states,
            policy_params["policy/test-network/hidden-layer/dense/kernel"])
        expected_nn_output = relu(expected_nn_output, 0.1)
        test.test(("get_nn_output", states),
                  expected_outputs=dict(output=expected_nn_output),
                  decimals=5)

        # Raw action layer output; expected shape=(3,4): 3=batch, 3 action categories + 1 state value.
        expected_action_layer_output = np.matmul(
            expected_nn_output, policy_params[
                "policy/baseline-action-adapter/action-layer/dense/kernel"])
        expected_action_layer_output = np.reshape(expected_action_layer_output,
                                                  newshape=(3, 4))
        test.test(("get_action_layer_output", states),
                  expected_outputs=dict(output=expected_action_layer_output),
                  decimals=5)

        # State-values: One for each item in the batch (simply take first out-node of action_layer).
        expected_state_value_output = expected_action_layer_output[:, :1]
        # logits-values: One for each action-choice per item in the batch (simply take the remaining out nodes).
        expected_logits_output = expected_action_layer_output[:, 1:]
        test.test(
            ("get_state_values_logits_probabilities_log_probs", states,
             ["state_values", "logits"]),
            expected_outputs=dict(state_values=expected_state_value_output,
                                  logits=expected_logits_output),
            decimals=5)

        expected_actions = np.argmax(expected_logits_output, axis=-1)
        test.test(("get_action", states),
                  expected_outputs=dict(action=expected_actions))

        # Parameter (probabilities). Softmaxed logits.
        expected_probabilities_output = softmax(expected_logits_output,
                                                axis=-1)
        test.test(
            ("get_logits_probabilities_log_probs", states,
             ["logits", "probabilities"]),
            expected_outputs=dict(logits=expected_logits_output,
                                  probabilities=expected_probabilities_output),
            decimals=5)

        print("Probs: {}".format(expected_probabilities_output))

        # Stochastic sample.
        #expected_actions = np.array([0, 2, 2])
        out = test.test(
            ("get_stochastic_action", states),
            expected_outputs=None)  # dict(action=expected_actions))
        self.assertTrue(out["action"].dtype == np.int32)

        # Deterministic sample.
        #expected_actions = np.array([2, 2, 2])
        out = test.test(
            ("get_max_likelihood_action", states),
            expected_outputs=None)  # dict(action=expected_actions))
        self.assertTrue(out["action"].dtype == np.int32)

        # Distribution's entropy.
        #expected_h = np.array([1.08, 1.08, 1.03])
        out = test.test(
            ("get_entropy", states),
            expected_outputs=None)  # dict(entropy=expected_h), decimals=2)
        self.assertTrue(out["entropy"].dtype == np.float32)
Code example #7
File: test_policies.py  Project: mugenZebra/rlgraph
    def test_policy_for_discrete_action_space_with_dueling_layer(self):
        np.random.seed(10)
        # state_space (NN is a simple single fc-layer relu network (2 units), random biases, random weights).
        nn_input_space = FloatBox(shape=(3, ), add_batch_rank=True)

        # action_space (2 possible actions).
        action_space = IntBox(2, add_batch_rank=True)

        # Policy with additional dueling layer.
        policy = Policy(
            network_spec=config_from_path("configs/test_lrelu_nn.json"),
            action_adapter_spec=dict(type="dueling-action-adapter",
                                     action_space=action_space,
                                     units_state_value_stream=10,
                                     units_advantage_stream=10),
        )
        test = ComponentTest(component=policy,
                             input_spaces=dict(nn_input=nn_input_space),
                             action_space=action_space)
        policy_params = test.read_variable_values(policy.variables)

        # Some NN inputs (3 input nodes, batch size=3).
        nn_input = nn_input_space.sample(size=3)
        # Raw NN-output (3 hidden nodes). All weights=1.5, no biases.
        expected_nn_output = relu(
            np.matmul(
                nn_input,
                policy_params["policy/test-network/hidden-layer/dense/kernel"]
            ), 0.1)
        test.test(("get_nn_output", nn_input),
                  expected_outputs=dict(output=expected_nn_output))

        # Raw dueling-adapter outputs: state-value stream (3,1) and advantage stream (3,2); 3=batch, 2=action categories.
        expected_state_value = np.matmul(
            relu(
                np.matmul(
                    expected_nn_output, policy_params[
                        "policy/dueling-action-adapter/dense-layer-state-value-stream/dense/kernel"]
                )),
            policy_params[
                "policy/dueling-action-adapter/state-value-node/dense/kernel"])
        expected_raw_advantages = np.matmul(
            relu(
                np.matmul(
                    expected_nn_output, policy_params[
                        "policy/dueling-action-adapter/dense-layer-advantage-stream/dense/kernel"]
                )), policy_params[
                    "policy/dueling-action-adapter/action-layer/dense/kernel"])
        test.test(("get_action_layer_output", nn_input),
                  expected_outputs=dict(state_value_node=expected_state_value,
                                        output=expected_raw_advantages),
                  decimals=5)

        # State-values: One for each item in the batch (simply take first out-node of action_layer).
        #expected_state_value_output = np.squeeze(expected_action_layer_output[:, :1], axis=-1)
        # Advantage-values: One for each action-choice per item in the batch (simply take second and third out-node
        #expected_advantage_values_output = expected_action_layer_output[:, 1:]
        # Q-values: One for each action-choice per item in the batch (calculate from state-values and advantage-values
        expected_q_values_output = expected_state_value + expected_raw_advantages - \
            np.mean(expected_raw_advantages, axis=-1, keepdims=True)
        test.test(("get_logits_probabilities_log_probs", nn_input,
                   ["state_values", "logits"]),
                  expected_outputs=dict(state_values=expected_state_value,
                                        logits=expected_q_values_output),
                  decimals=5)

        expected_actions = np.argmax(expected_q_values_output, axis=-1)
        test.test(("get_action", nn_input),
                  expected_outputs=dict(action=expected_actions))

        # Parameter (probabilities). Softmaxed q_values.
        expected_probabilities_output = softmax(expected_q_values_output,
                                                axis=-1)
        test.test(
            ("get_logits_probabilities_log_probs", nn_input,
             ["logits", "probabilities"]),
            expected_outputs=dict(logits=expected_q_values_output,
                                  probabilities=expected_probabilities_output),
            decimals=5)

        print("Probs: {}".format(expected_probabilities_output))

        # Stochastic sample.
        #expected_actions = np.array([1, 1, 1])
        out = test.test(
            ("get_stochastic_action", nn_input),
            expected_outputs=None)  # dict(action=expected_actions))
        self.assertTrue(out["action"].dtype == np.int32)

        # Deterministic sample.
        #expected_actions = np.array([0, 0, 0])
        out = test.test(
            ("get_max_likelihood_action", nn_input),
            expected_outputs=None)  # dict(action=expected_actions))
        self.assertTrue(out["action"].dtype == np.int32)

        # Distribution's entropy.
        #expected_h = np.array([0.675, 0.674, 0.682])
        out = test.test(
            ("get_entropy", nn_input),
            expected_outputs=None)  # dict(entropy=expected_h), decimals=3)
        self.assertTrue(out["entropy"].dtype == np.float32)
Code example #8
    def test_shared_value_function_policy_for_discrete_action_space(self):
        # state_space (NN is a simple single fc-layer relu network (2 units), random biases, random weights).
        state_space = FloatBox(shape=(4, ), add_batch_rank=True)

        # action_space (3 possible actions).
        action_space = IntBox(3, add_batch_rank=True)
        flat_float_action_space = FloatBox(shape=(3, ), add_batch_rank=True)

        # Policy with baseline action adapter.
        shared_value_function_policy = SharedValueFunctionPolicy(
            network_spec=config_from_path("configs/test_lrelu_nn.json"),
            action_space=action_space)
        test = ComponentTest(
            component=shared_value_function_policy,
            input_spaces=dict(nn_input=state_space,
                              actions=action_space,
                              probabilities=flat_float_action_space,
                              logits=flat_float_action_space),
            action_space=action_space,
        )
        policy_params = test.read_variable_values(
            shared_value_function_policy.variables)

        # Some NN inputs (4 input nodes, batch size=3).
        states = state_space.sample(size=3)
        # Raw NN-output (3 hidden nodes). All weights=1.5, no biases.
        expected_nn_output = relu(
            np.matmul(
                states, policy_params[
                    "shared-value-function-policy/test-network/hidden-layer/dense/kernel"]
            ), 0.1)
        test.test(("get_nn_output", states),
                  expected_outputs=dict(output=expected_nn_output),
                  decimals=5)

        # Raw action layer output; expected shape=(3,3): 3=batch, 3=action categories (state values come from the separate value-function node below).
        expected_action_layer_output = np.matmul(
            expected_nn_output, policy_params[
                "shared-value-function-policy/action-adapter-0/action-network/action-layer/dense/kernel"]
        )
        test.test(("get_action_layer_output", states),
                  expected_outputs=dict(output=expected_action_layer_output),
                  decimals=5)

        # State-values: One for each item in the batch.
        expected_state_value_output = np.matmul(
            expected_nn_output, policy_params[
                "shared-value-function-policy/value-function-node/dense-layer/dense/kernel"]
        )
        test.test(
            ("get_state_values", states),
            expected_outputs=dict(state_values=expected_state_value_output),
            decimals=5)

        # Logits-values.
        test.test(
            ("get_state_values_logits_probabilities_log_probs", states,
             ["state_values", "logits"]),
            expected_outputs=dict(state_values=expected_state_value_output,
                                  logits=expected_action_layer_output),
            decimals=5)

        # Parameter (probabilities). Softmaxed logits.
        expected_probabilities_output = softmax(expected_action_layer_output,
                                                axis=-1)
        test.test(
            ("get_logits_probabilities_log_probs", states,
             ["logits", "probabilities"]),
            expected_outputs=dict(logits=expected_action_layer_output,
                                  probabilities=expected_probabilities_output),
            decimals=5)

        print("Probs: {}".format(expected_probabilities_output))

        expected_actions = np.argmax(expected_action_layer_output, axis=-1)
        test.test(("get_action", states),
                  expected_outputs=dict(action=expected_actions))

        # Stochastic sample.
        out = test.test(("get_stochastic_action", states),
                        expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.int32)
        self.assertTrue(out["action"].shape == (3, ))

        # Deterministic sample.
        out = test.test(("get_deterministic_action", states),
                        expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.int32)
        self.assertTrue(out["action"].shape == (3, ))

        # Distribution's entropy.
        out = test.test(("get_entropy", states), expected_outputs=None)
        self.assertTrue(out["entropy"].dtype == np.float32)
        self.assertTrue(out["entropy"].shape == (3, ))
Code example #9
    def test_policy_for_discrete_action_space_with_dueling_layer(self):
        # state_space (NN is a simple single fc-layer relu network (2 units), random biases, random weights).
        nn_input_space = FloatBox(shape=(5, ), add_batch_rank=True)

        # Action space.
        action_space = Dict(dict(a=Tuple(IntBox(2), IntBox(3)),
                                 b=Dict(dict(ba=IntBox(4)))),
                            add_batch_rank=True)
        flat_float_action_space = Dict(dict(
            a=Tuple(FloatBox(shape=(2, )), FloatBox(shape=(3, ))),
            b=Dict(dict(ba=FloatBox(shape=(4, ))))),
                                       add_batch_rank=True)

        # Policy with dueling logic.
        policy = DuelingPolicy(
            network_spec=config_from_path("configs/test_lrelu_nn.json"),
            # Make all sub action adapters the same.
            action_adapter_spec=dict(pre_network_spec=[
                dict(type="dense",
                     units=5,
                     activation="lrelu",
                     activation_params=[0.2])
            ]),
            units_state_value_stream=2,
            action_space=action_space)
        test = ComponentTest(component=policy,
                             input_spaces=dict(
                                 nn_input=nn_input_space,
                                 actions=action_space,
                                 logits=flat_float_action_space,
                                 probabilities=flat_float_action_space,
                                 parameters=flat_float_action_space),
                             action_space=action_space)
        policy_params = test.read_variable_values(policy.variables)

        # Some NN inputs.
        nn_input = nn_input_space.sample(size=3)
        # Raw NN-output.
        expected_nn_output = relu(
            np.matmul(
                nn_input, policy_params[
                    "dueling-policy/test-network/hidden-layer/dense/kernel"]),
            0.2)
        test.test(("get_nn_output", nn_input),
                  expected_outputs=dict(output=expected_nn_output),
                  decimals=5)

        # Raw action layer output.
        expected_raw_advantages = dict(
            a=(
                np.matmul(
                    relu(
                        np.matmul(
                            expected_nn_output, policy_params[
                                "dueling-policy/action-adapter-0/action-network/dense-layer/dense/kernel"]
                        ), 0.2),
                    policy_params[
                        "dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel"]
                ),
                np.matmul(
                    relu(
                        np.matmul(
                            expected_nn_output, policy_params[
                                "dueling-policy/action-adapter-1/action-network/dense-layer/dense/kernel"]
                        ), 0.2),
                    policy_params[
                        "dueling-policy/action-adapter-1/action-network/action-layer/dense/kernel"]
                ),
            ),
            b=dict(ba=np.matmul(
                relu(
                    np.matmul(
                        expected_nn_output, policy_params[
                            "dueling-policy/action-adapter-2/action-network/dense-layer/dense/kernel"]
                    ), 0.2),
                policy_params[
                    "dueling-policy/action-adapter-2/action-network/action-layer/dense/kernel"]
            )))
        test.test(("get_action_layer_output", nn_input),
                  expected_outputs=dict(output=expected_raw_advantages),
                  decimals=5)

        # Single state values.
        expected_state_values = np.matmul(
            relu(
                np.matmul(
                    expected_nn_output, policy_params[
                        "dueling-policy/dense-layer-state-value-stream/dense/kernel"]
                )),
            policy_params["dueling-policy/state-value-node/dense/kernel"])
        test.test(("get_state_values", nn_input),
                  expected_outputs=dict(state_values=expected_state_values),
                  decimals=5)

        # State-values: One for each item in the batch.
        expected_q_values_output = dict(
            a=(
                expected_state_values + expected_raw_advantages["a"][0] -
                np.mean(
                    expected_raw_advantages["a"][0], axis=-1, keepdims=True),
                expected_state_values + expected_raw_advantages["a"][1] -
                np.mean(
                    expected_raw_advantages["a"][1], axis=-1, keepdims=True),
            ),
            b=dict(
                ba=expected_state_values + expected_raw_advantages["b"]["ba"] -
                np.mean(
                    expected_raw_advantages["b"]["ba"], axis=-1, keepdims=True)
            ))
        test.test(("get_logits_probabilities_log_probs", nn_input, ["logits"]),
                  expected_outputs=dict(logits=expected_q_values_output),
                  decimals=5)

        # Parameter (probabilities). Softmaxed q_values.
        expected_probabilities_output = dict(
            a=(softmax(expected_q_values_output["a"][0], axis=-1),
               softmax(expected_q_values_output["a"][1], axis=-1)),
            b=dict(ba=softmax(expected_q_values_output["b"]["ba"], axis=-1)))
        expected_log_probs_output = dict(
            a=(np.log(expected_probabilities_output["a"][0]),
               np.log(expected_probabilities_output["a"][1])),
            b=dict(ba=np.log(expected_probabilities_output["b"]["ba"])))
        test.test(
            ("get_logits_probabilities_log_probs", nn_input,
             ["logits", "probabilities", "log_probs"]),
            expected_outputs=dict(logits=expected_q_values_output,
                                  probabilities=expected_probabilities_output,
                                  log_probs=expected_log_probs_output),
            decimals=5)

        print("Probs: {}".format(expected_probabilities_output))

        expected_actions = dict(
            a=(np.argmax(expected_q_values_output["a"][0], axis=-1),
               np.argmax(expected_q_values_output["a"][1], axis=-1)),
            b=dict(ba=np.argmax(expected_q_values_output["b"]["ba"], axis=-1)))
        test.test(("get_action", nn_input),
                  expected_outputs=dict(action=expected_actions))

        # Action log-probs.
        expected_action_log_prob_output = dict(
            a=(
                np.array([
                    expected_log_probs_output["a"][0][0][expected_actions["a"][0][0]],
                    expected_log_probs_output["a"][0][1][expected_actions["a"][0][1]],
                    expected_log_probs_output["a"][0][2][expected_actions["a"][0][2]],
                ]),
                np.array([
                    expected_log_probs_output["a"][1][0][expected_actions["a"][1][0]],
                    expected_log_probs_output["a"][1][1][expected_actions["a"][1][1]],
                    expected_log_probs_output["a"][1][2][expected_actions["a"][1][2]],
                ]),
            ),
            b=dict(ba=np.array([
                expected_log_probs_output["b"]["ba"][0][expected_actions["b"]["ba"][0]],
                expected_log_probs_output["b"]["ba"][1][expected_actions["b"]["ba"][1]],
                expected_log_probs_output["b"]["ba"][2][expected_actions["b"]["ba"][2]],
            ])))
        test.test(("get_action_log_probs", [nn_input, expected_actions]),
                  expected_outputs=dict(
                      action_log_probs=expected_action_log_prob_output,
                      logits=expected_q_values_output),
                  decimals=5)

        # Stochastic sample.
        out = test.test(("get_stochastic_action", nn_input),
                        expected_outputs=None)
        self.assertTrue(out["action"]["a"][0].dtype == np.int32)
        self.assertTrue(out["action"]["a"][0].shape == (3, ))
        self.assertTrue(out["action"]["a"][1].dtype == np.int32)
        self.assertTrue(out["action"]["a"][1].shape == (3, ))
        self.assertTrue(out["action"]["b"]["ba"].dtype == np.int32)
        self.assertTrue(out["action"]["b"]["ba"].shape == (3, ))

        # Deterministic sample.
        out = test.test(("get_deterministic_action", nn_input),
                        expected_outputs=None)
        self.assertTrue(out["action"]["a"][0].dtype == np.int32)
        self.assertTrue(out["action"]["a"][0].shape == (3, ))
        self.assertTrue(out["action"]["a"][1].dtype == np.int32)
        self.assertTrue(out["action"]["a"][1].shape == (3, ))
        self.assertTrue(out["action"]["b"]["ba"].dtype == np.int32)
        self.assertTrue(out["action"]["b"]["ba"].shape == (3, ))

        # Distribution's entropy.
        out = test.test(("get_entropy", nn_input), expected_outputs=None)
        self.assertTrue(out["entropy"]["a"][0].dtype == np.float32)
        self.assertTrue(out["entropy"]["a"][0].shape == (3, ))
        self.assertTrue(out["entropy"]["a"][1].dtype == np.float32)
        self.assertTrue(out["entropy"]["a"][1].shape == (3, ))
        self.assertTrue(out["entropy"]["b"]["ba"].dtype == np.float32)
        self.assertTrue(out["entropy"]["b"]["ba"].shape == (3, ))
Code example #10
    def test_policy_for_discrete_container_action_space(self):
        # state_space.
        state_space = FloatBox(shape=(4, ), add_batch_rank=True)

        # Container action space.
        action_space = dict(type="dict",
                            a=IntBox(2),
                            b=IntBox(3),
                            add_batch_rank=True)
        flat_float_action_space = dict(type="dict",
                                       a=FloatBox(shape=(2, )),
                                       b=FloatBox(shape=(3, )),
                                       add_batch_rank=True)

        policy = Policy(
            network_spec=config_from_path("configs/test_simple_nn.json"),
            action_space=action_space)
        test = ComponentTest(component=policy,
                             input_spaces=dict(
                                 nn_input=state_space,
                                 actions=action_space,
                                 probabilities=flat_float_action_space,
                                 parameters=flat_float_action_space,
                                 logits=flat_float_action_space),
                             action_space=action_space)
        policy_params = test.read_variable_values(policy.variables)

        # Some NN inputs (batch size=2).
        states = state_space.sample(2)
        # Raw NN-output.
        expected_nn_output = np.matmul(
            states,
            policy_params["policy/test-network/hidden-layer/dense/kernel"])
        test.test(("get_nn_output", states),
                  expected_outputs=dict(output=expected_nn_output),
                  decimals=6)

        # Raw action layers' output.
        expected_action_layer_outputs = dict(
            a=np.matmul(
                expected_nn_output, policy_params[
                    "policy/action-adapter-0/action-network/action-layer/dense/kernel"]
            ),
            b=np.matmul(
                expected_nn_output, policy_params[
                    "policy/action-adapter-1/action-network/action-layer/dense/kernel"]
            ))
        test.test(("get_action_layer_output", states),
                  expected_outputs=dict(output=expected_action_layer_outputs),
                  decimals=5)

        # Logits, parameters (probs) and skip log-probs (numerically unstable for small probs).
        expected_probabilities_output = dict(
            a=np.array(softmax(expected_action_layer_outputs["a"], axis=-1),
                       dtype=np.float32),
            b=np.array(softmax(expected_action_layer_outputs["b"], axis=-1),
                       dtype=np.float32))
        test.test(
            ("get_logits_probabilities_log_probs", states,
             ["logits", "probabilities"]),
            expected_outputs=dict(logits=expected_action_layer_outputs,
                                  probabilities=expected_probabilities_output),
            decimals=5)

        print("Probs: {}".format(expected_probabilities_output))

        expected_actions = dict(a=np.argmax(expected_action_layer_outputs["a"],
                                            axis=-1),
                                b=np.argmax(expected_action_layer_outputs["b"],
                                            axis=-1))
        test.test(("get_action", states),
                  expected_outputs=dict(action=expected_actions))

        # Stochastic sample.
        out = test.test(
            ("get_stochastic_action", states),
            expected_outputs=None)  # dict(action=expected_actions))
        self.assertTrue(out["action"]["a"].dtype == np.int32)
        self.assertTrue(out["action"]["a"].shape == (2, ))
        self.assertTrue(out["action"]["b"].dtype == np.int32)
        self.assertTrue(out["action"]["b"].shape == (2, ))

        # Deterministic sample.
        test.test(("get_deterministic_action", states),
                  expected_outputs=None)  # dict(action=expected_actions))
        self.assertTrue(out["action"]["a"].dtype == np.int32)
        self.assertTrue(out["action"]["a"].shape == (2, ))
        self.assertTrue(out["action"]["b"].dtype == np.int32)
        self.assertTrue(out["action"]["b"].shape == (2, ))

        # Distribution's entropy.
        out = test.test(
            ("get_entropy", states),
            expected_outputs=None)  # dict(entropy=expected_h), decimals=3)
        self.assertTrue(out["entropy"]["a"].dtype == np.float32)
        self.assertTrue(out["entropy"]["a"].shape == (2, ))
        self.assertTrue(out["entropy"]["b"].dtype == np.float32)
        self.assertTrue(out["entropy"]["b"].shape == (2, ))

        # Action log-probs.
        expected_action_log_prob_output = dict(
            a=np.log(
                np.array([
                    expected_probabilities_output["a"][0][expected_actions["a"][0]],
                    expected_probabilities_output["a"][1][expected_actions["a"][1]]
                ])),
            b=np.log(
                np.array([
                    expected_probabilities_output["b"][0][expected_actions["b"][0]],
                    expected_probabilities_output["b"][1][expected_actions["b"][1]]
                ])),
        )
        test.test(("get_action_log_probs", [states, expected_actions]),
                  expected_outputs=dict(
                      action_log_probs=expected_action_log_prob_output,
                      logits=expected_action_layer_outputs),
                  decimals=5)
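With a container (Dict) action space, each key gets its own action adapter and its own categorical head, and actions and log-probs are produced per key in the same nested structure. A small NumPy sketch of that per-key bookkeeping (illustration only, not the rlgraph internals):

import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return e / np.sum(e, axis=axis, keepdims=True)

logits = dict(a=np.random.randn(2, 2), b=np.random.randn(2, 3))  # one head per key, batch of 2
probs = {k: softmax(v) for k, v in logits.items()}
actions = {k: np.argmax(v, axis=-1) for k, v in logits.items()}
log_probs = {k: np.log(p[np.arange(p.shape[0]), actions[k]]) for k, p in probs.items()}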
Code example #11
    def test_shared_value_function_policy_for_discrete_container_action_space_with_time_rank_folding(
            self):
        # state_space (NN is a simple single fc-layer relu network (2 units), random biases, random weights).
        state_space = FloatBox(shape=(6, ),
                               add_batch_rank=True,
                               add_time_rank=True)

        # Action_space.
        action_space = Tuple(IntBox(2),
                             IntBox(3),
                             Dict(a=IntBox(4), ),
                             add_batch_rank=True,
                             add_time_rank=True)
        flat_float_action_space = Tuple(FloatBox(shape=(2, )),
                                        FloatBox(shape=(3, )),
                                        Dict(a=FloatBox(shape=(4, )), ),
                                        add_batch_rank=True,
                                        add_time_rank=True)

        # Policy with baseline action adapter AND batch-apply over the entire policy (NN + ActionAdapter + distr.).
        network_spec = config_from_path("configs/test_lrelu_nn.json")
        network_spec["fold_time_rank"] = True
        shared_value_function_policy = SharedValueFunctionPolicy(
            network_spec=network_spec,
            action_adapter_spec=dict(unfold_time_rank=True),
            action_space=action_space,
            value_unfold_time_rank=True)
        test = ComponentTest(
            component=shared_value_function_policy,
            input_spaces=dict(nn_input=state_space,
                              actions=action_space,
                              probabilities=flat_float_action_space,
                              parameters=flat_float_action_space,
                              logits=flat_float_action_space),
            action_space=action_space,
        )
        policy_params = test.read_variable_values(
            shared_value_function_policy.variables)
        base_scope = "shared-value-function-policy/action-adapter-"

        # Some NN inputs.
        states = state_space.sample(size=(2, 3))
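        # Fold the time rank into the batch rank: (batch=2, time=3, 6) -> (6, 6), matching fold_time_rank=True in the network spec.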
        states_folded = np.reshape(states, newshape=(6, 6))
        # Raw NN-output (still folded).
        expected_nn_output = relu(
            np.matmul(
                states_folded, policy_params[
                    "shared-value-function-policy/test-network/hidden-layer/dense/kernel"]
            ), 0.1)
        test.test(("get_nn_output", states),
                  expected_outputs=dict(output=expected_nn_output),
                  decimals=5)

        # Raw action-layer outputs (still time-folded): shape (6, n) per component, with n = 2, 3 and 4 action categories respectively.
        expected_action_layer_output = tuple([
            np.matmul(
                expected_nn_output,
                policy_params[base_scope +
                              "0/action-network/action-layer/dense/kernel"]),
            np.matmul(
                expected_nn_output,
                policy_params[base_scope +
                              "1/action-network/action-layer/dense/kernel"]),
            dict(a=np.matmul(
                expected_nn_output,
                policy_params[base_scope +
                              "2/action-network/action-layer/dense/kernel"]))
        ])
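        # Unfold the time rank again: (6, n) -> (batch=2, time=3, n) for each action component.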
        expected_action_layer_output_unfolded = tuple([
            np.reshape(expected_action_layer_output[0], newshape=(2, 3, 2)),
            np.reshape(expected_action_layer_output[1], newshape=(2, 3, 3)),
            dict(a=np.reshape(expected_action_layer_output[2]["a"],
                              newshape=(2, 3, 4)))
        ])
        test.test(("get_action_layer_output", states),
                  expected_outputs=dict(
                      output=expected_action_layer_output_unfolded),
                  decimals=5)

        # State-values: One for each item in the batch.
        expected_state_value_output = np.matmul(
            expected_nn_output, policy_params[
                "shared-value-function-policy/value-function-node/dense-layer/dense/kernel"]
        )
        expected_state_value_output_unfolded = np.reshape(
            expected_state_value_output, newshape=(2, 3, 1))
        test.test(("get_state_values", states),
                  expected_outputs=dict(
                      state_values=expected_state_value_output_unfolded),
                  decimals=5)

        test.test(("get_state_values_logits_probabilities_log_probs", states,
                   ["state_values", "logits"]),
                  expected_outputs=dict(
                      state_values=expected_state_value_output_unfolded,
                      logits=expected_action_layer_output_unfolded),
                  decimals=5)

        # Parameters (probabilities): softmaxed logits.
        expected_probabilities_output = tuple([
            softmax(expected_action_layer_output_unfolded[0], axis=-1),
            softmax(expected_action_layer_output_unfolded[1], axis=-1),
            dict(a=softmax(expected_action_layer_output_unfolded[2]["a"],
                           axis=-1))
        ])
        test.test(
            ("get_logits_probabilities_log_probs", states,
             ["logits", "probabilities"]),
            expected_outputs=dict(logits=expected_action_layer_output_unfolded,
                                  probabilities=expected_probabilities_output),
            decimals=5)

        print("Probs: {}".format(expected_probabilities_output))

        expected_actions = tuple([
            np.argmax(expected_action_layer_output_unfolded[0], axis=-1),
            np.argmax(expected_action_layer_output_unfolded[1], axis=-1),
            dict(a=np.argmax(expected_action_layer_output_unfolded[2]["a"],
                             axis=-1), )
        ])
        test.test(("get_action", states),
                  expected_outputs=dict(action=expected_actions))

        # Action log-probs.
        expected_action_log_prob_output = tuple([
            np.log(np.array([
                [expected_probabilities_output[0][b][t][expected_actions[0][b][t]] for t in range(3)]
                for b in range(2)
            ])),
            np.log(np.array([
                [expected_probabilities_output[1][b][t][expected_actions[1][b][t]] for t in range(3)]
                for b in range(2)
            ])),
            dict(a=np.log(np.array([
                [expected_probabilities_output[2]["a"][b][t][expected_actions[2]["a"][b][t]] for t in range(3)]
                for b in range(2)
            ])))
        ])
        test.test(("get_action_log_probs", [states, expected_actions]),
                  expected_outputs=dict(
                      action_log_probs=expected_action_log_prob_output,
                      logits=expected_action_layer_output_unfolded),
                  decimals=5)

        # Deterministic sample.
        out = test.test(("get_deterministic_action", states),
                        expected_outputs=None)
        self.assertTrue(out["action"][0].dtype == np.int32)
        self.assertTrue(
            out["action"][0].shape == (2, 3))  # Make sure output is unfolded.
        self.assertTrue(out["action"][1].dtype == np.int32)
        self.assertTrue(
            out["action"][1].shape == (2, 3))  # Make sure output is unfolded.
        self.assertTrue(out["action"][2]["a"].dtype == np.int32)
        self.assertTrue(out["action"][2]["a"].shape == (
            2, 3))  # Make sure output is unfolded.

        # Stochastic sample.
        out = test.test(("get_stochastic_action", states),
                        expected_outputs=None)
        self.assertTrue(out["action"][0].dtype == np.int32)
        self.assertTrue(
            out["action"][0].shape == (2, 3))  # Make sure output is unfolded.
        self.assertTrue(out["action"][1].dtype == np.int32)
        self.assertTrue(
            out["action"][1].shape == (2, 3))  # Make sure output is unfolded.
        self.assertTrue(out["action"][2]["a"].dtype == np.int32)
        self.assertTrue(out["action"][2]["a"].shape == (
            2, 3))  # Make sure output is unfolded.

        # Distribution's entropy.
        out = test.test(("get_entropy", states), expected_outputs=None)
        self.assertTrue(out["entropy"][0].dtype == np.float32)
        self.assertTrue(
            out["entropy"][0].shape == (2, 3))  # Make sure output is unfolded.
        self.assertTrue(out["entropy"][1].dtype == np.float32)
        self.assertTrue(
            out["entropy"][1].shape == (2, 3))  # Make sure output is unfolded.
        self.assertTrue(out["entropy"][2]["a"].dtype == np.float32)
        self.assertTrue(out["entropy"][2]["a"].shape == (
            2, 3))  # Make sure output is unfolded.
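
The fold/unfold bookkeeping exercised above can be restated in plain NumPy; shapes follow this test (batch=2, time=3), and the helper names are illustrative only:

import numpy as np


def fold_time_rank(x):
    # (batch, time, ...) -> (batch * time, ...), as done before the dense layers.
    batch, time = x.shape[0], x.shape[1]
    return np.reshape(x, (batch * time,) + x.shape[2:]), (batch, time)


def unfold_time_rank(x, batch_time):
    # (batch * time, ...) -> (batch, time, ...), as done on the adapter and value outputs.
    batch, time = batch_time
    return np.reshape(x, (batch, time) + x.shape[1:])


states = np.random.rand(2, 3, 6).astype(np.float32)
folded, bt = fold_time_rank(states)                          # shape (6, 6)
logits = folded @ np.random.rand(6, 4).astype(np.float32)    # e.g. 4 action categories -> (6, 4)
assert unfold_time_rank(logits, bt).shape == (2, 3, 4)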
Code example #12
    def test_shared_value_function_policy_for_discrete_container_action_space(
            self):
        # state_space (NN is a simple single fc-layer relu network (2 units), random biases, random weights).
        state_space = FloatBox(shape=(5, ), add_batch_rank=True)

        # action_space (complex nested container action space).
        action_space = dict(type="dict",
                            a=IntBox(2),
                            b=Dict(b1=IntBox(3), b2=IntBox(4)),
                            add_batch_rank=True)
        flat_float_action_space = dict(type="dict",
                                       a=FloatBox(shape=(2, )),
                                       b=Dict(b1=FloatBox(shape=(3, )),
                                              b2=FloatBox(shape=(4, ))),
                                       add_batch_rank=True)

        # Policy with baseline action adapter.
        shared_value_function_policy = SharedValueFunctionPolicy(
            network_spec=config_from_path("configs/test_lrelu_nn.json"),
            action_space=action_space)
        test = ComponentTest(
            component=shared_value_function_policy,
            input_spaces=dict(nn_input=state_space,
                              actions=action_space,
                              probabilities=flat_float_action_space,
                              parameters=flat_float_action_space,
                              logits=flat_float_action_space),
            action_space=action_space,
        )
        policy_params = test.read_variable_values(
            shared_value_function_policy.variables)

        base_scope = "shared-value-function-policy/action-adapter-"

        # Some NN inputs (batch size=2).
        states = state_space.sample(size=2)
        # Raw NN-output.
        expected_nn_output = relu(
            np.matmul(
                states, policy_params[
                    "shared-value-function-policy/test-network/hidden-layer/dense/kernel"]
            ), 0.1)
        test.test(("get_nn_output", states),
                  expected_outputs=dict(output=expected_nn_output),
                  decimals=5)

        # Raw action layers' output.
        expected_action_layer_outputs = dict(
            a=np.matmul(
                expected_nn_output,
                policy_params[base_scope +
                              "0/action-network/action-layer/dense/kernel"]),
            b=dict(b1=np.matmul(
                expected_nn_output,
                policy_params[base_scope +
                              "1/action-network/action-layer/dense/kernel"]),
                   b2=np.matmul(
                       expected_nn_output, policy_params[
                           base_scope +
                           "2/action-network/action-layer/dense/kernel"])))
        test.test(("get_action_layer_output", states),
                  expected_outputs=dict(output=expected_action_layer_outputs),
                  decimals=5)

        # State-values.
        expected_state_value_output = np.matmul(
            expected_nn_output, policy_params[
                "shared-value-function-policy/value-function-node/dense-layer/dense/kernel"]
        )
        test.test(
            ("get_state_values", states),
            expected_outputs=dict(state_values=expected_state_value_output),
            decimals=5)

        # Logits: one value per action choice per item in the batch.
        test.test(
            ("get_state_values_logits_probabilities_log_probs", states,
             ["state_values", "logits"]),
            expected_outputs=dict(state_values=expected_state_value_output,
                                  logits=expected_action_layer_outputs),
            decimals=5)

        # Parameters (probabilities): softmaxed logits.
        expected_probabilities_output = dict(
            a=softmax(expected_action_layer_outputs["a"], axis=-1),
            b=dict(b1=softmax(expected_action_layer_outputs["b"]["b1"],
                              axis=-1),
                   b2=softmax(expected_action_layer_outputs["b"]["b2"],
                              axis=-1)))
        test.test(
            ("get_logits_probabilities_log_probs", states,
             ["logits", "probabilities"]),
            expected_outputs=dict(logits=expected_action_layer_outputs,
                                  probabilities=expected_probabilities_output),
            decimals=5)

        print("Probs: {}".format(expected_probabilities_output))

        # Action sample.
        expected_actions = dict(
            a=np.argmax(expected_action_layer_outputs["a"], axis=-1),
            b=dict(b1=np.argmax(expected_action_layer_outputs["b"]["b1"],
                                axis=-1),
                   b2=np.argmax(expected_action_layer_outputs["b"]["b2"],
                                axis=-1)))
        test.test(("get_action", states),
                  expected_outputs=dict(action=expected_actions))

        # Stochastic sample.
        out = test.test(("get_stochastic_action", states),
                        expected_outputs=None)
        self.assertTrue(out["action"]["a"].dtype == np.int32)
        self.assertTrue(out["action"]["a"].shape == (2, ))
        self.assertTrue(out["action"]["b"]["b1"].dtype == np.int32)
        self.assertTrue(out["action"]["b"]["b1"].shape == (2, ))
        self.assertTrue(out["action"]["b"]["b2"].dtype == np.int32)
        self.assertTrue(out["action"]["b"]["b2"].shape == (2, ))

        # Deterministic sample.
        out = test.test(("get_deterministic_action", states),
                        expected_outputs=None)
        self.assertTrue(out["action"]["a"].dtype == np.int32)
        self.assertTrue(out["action"]["a"].shape == (2, ))
        self.assertTrue(out["action"]["b"]["b1"].dtype == np.int32)
        self.assertTrue(out["action"]["b"]["b1"].shape == (2, ))
        self.assertTrue(out["action"]["b"]["b2"].dtype == np.int32)
        self.assertTrue(out["action"]["b"]["b2"].shape == (2, ))

        # Distribution's entropy.
        out = test.test(("get_entropy", states), expected_outputs=None)
        self.assertTrue(out["entropy"]["a"].dtype == np.float32)
        self.assertTrue(out["entropy"]["a"].shape == (2, ))
        self.assertTrue(out["entropy"]["b"]["b1"].dtype == np.float32)
        self.assertTrue(out["entropy"]["b"]["b1"].shape == (2, ))
        self.assertTrue(out["entropy"]["b"]["b2"].dtype == np.float32)
        self.assertTrue(out["entropy"]["b"]["b2"].shape == (2, ))
Code example #13
    def test_policy_for_discrete_container_action_space(self):
        # state_space.
        state_space = FloatBox(shape=(4, ), add_batch_rank=True)

        # Container action space.
        action_space = dict(type="dict",
                            a=BoolBox(),
                            b=IntBox(3),
                            add_batch_rank=True)

        policy = Policy(
            network_spec=config_from_path("configs/test_simple_nn.json"),
            action_space=action_space)
        test = ComponentTest(component=policy,
                             input_spaces=dict(
                                 nn_inputs=state_space,
                                 actions=action_space,
                             ),
                             action_space=action_space)
        policy_params = test.read_variable_values(policy.variable_registry)

        # Some NN inputs (batch size=32).
        batch_size = 32
        states = state_space.sample(batch_size)
        # Raw NN-output.
        expected_nn_output = np.matmul(
            states,
            policy_params["policy/test-network/hidden-layer/dense/kernel"])
        test.test(("get_nn_outputs", states),
                  expected_outputs=expected_nn_output,
                  decimals=6)

        # Raw action layers' output.
        expected_action_layer_outputs = dict(
            a=np.squeeze(
                np.matmul(
                    expected_nn_output, policy_params[
                        "policy/action-adapter-0/action-network/action-layer/dense/kernel"]
                )),
            b=np.matmul(
                expected_nn_output, policy_params[
                    "policy/action-adapter-1/action-network/action-layer/dense/kernel"]
            ))
        test.test(("get_adapter_outputs", states),
                  expected_outputs=dict(
                      adapter_outputs=expected_action_layer_outputs,
                      nn_outputs=expected_nn_output),
                  decimals=5)

        # Logits, parameters (probs) and skip log-probs (numerically unstable for small probs).
        expected_probs_output = dict(
            a=np.array(sigmoid(expected_action_layer_outputs["a"]),
                       dtype=np.float32),
            b=np.array(softmax(expected_action_layer_outputs["b"], axis=-1),
                       dtype=np.float32))
        test.test(("get_adapter_outputs_and_parameters", states,
                   ["adapter_outputs", "parameters"]),
                  expected_outputs=dict(
                      adapter_outputs=expected_action_layer_outputs,
                      parameters=dict(a=expected_probs_output["a"],
                                      b=expected_action_layer_outputs["b"])),
                  decimals=5)

        print("Probs: {}".format(expected_probs_output))

        expected_actions = dict(a=expected_probs_output["a"] > 0.5,
                                b=np.argmax(expected_action_layer_outputs["b"],
                                            axis=-1))
        test.test(("get_action", states, ["action"]),
                  expected_outputs=dict(action=expected_actions))

        out = test.test(("get_action_and_log_likelihood", states))
        action = out["action"]
        llh = out["log_likelihood"]

        # Action log-likelihood (sum of the per-component log-likelihoods).
        expected_action_llh_output = np.log(np.array([
            expected_probs_output["a"][i] if action["a"][i] else 1.0 - expected_probs_output["a"][i]
            for i in range(batch_size)
        ])) + np.log(np.array([
            expected_probs_output["b"][i][action["b"][i]] for i in range(batch_size)
        ]))
        test.test(("get_log_likelihood", [states, action]),
                  expected_outputs=dict(
                      log_likelihood=expected_action_llh_output,
                      adapter_outputs=expected_action_layer_outputs),
                  decimals=5)
        recursive_assert_almost_equal(expected_action_llh_output,
                                      llh,
                                      decimals=5)

        # Stochastic sample.
        out = test.test(
            ("get_stochastic_action", states),
            expected_outputs=None)  # dict(action=expected_actions))
        self.assertTrue(out["action"]["a"].dtype == np.bool_)
        self.assertTrue(out["action"]["a"].shape == (batch_size, ))
        self.assertTrue(out["action"]["b"].dtype == np.int32)
        self.assertTrue(out["action"]["b"].shape == (batch_size, ))

        # Deterministic sample.
        test.test(("get_deterministic_action", states),
                  expected_outputs=None)  # dict(action=expected_actions))
        self.assertTrue(out["action"]["a"].dtype == np.bool_)
        self.assertTrue(out["action"]["a"].shape == (batch_size, ))
        self.assertTrue(out["action"]["b"].dtype == np.int32)
        self.assertTrue(out["action"]["b"].shape == (batch_size, ))

        # Distribution's entropy.
        out = test.test(
            ("get_entropy", states),
            expected_outputs=None)  # dict(entropy=expected_h), decimals=3)
        self.assertTrue(out["entropy"]["a"].dtype == np.float32)
        self.assertTrue(out["entropy"]["a"].shape == (batch_size, ))
        self.assertTrue(out["entropy"]["b"].dtype == np.float32)
        self.assertTrue(out["entropy"]["b"].shape == (batch_size, ))