Example #1
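These snippets come from rlgraph's Policy tests and rely on the test module's own imports (FloatBox, IntBox, BoolBox, Policy, ComponentTest, config_from_path, etc.). Below is a minimal sketch of the numeric helpers and constants the expected-value computations use, with assumed values where the repository defines them elsewhere:

import numpy as np
from scipy.stats import beta, norm            # reference pdfs for the Beta / squashed-Normal checks

def softmax(x, axis=-1):
    z = x - np.max(x, axis=axis, keepdims=True)     # shift by the max for numerical stability
    e = np.exp(z)
    return e / np.sum(e, axis=axis, keepdims=True)

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def recursive_assert_almost_equal(a, b, decimals=7):
    np.testing.assert_almost_equal(a, b, decimal=decimals)   # simplified stand-in for the rlgraph helper

SMALL_NUMBER = 1e-6                            # assumed value of the stability constant used below
MIN_LOG_STDDEV, MAX_LOG_STDDEV = -20.0, 2.0    # assumed clipping bounds for the log-stddev head (Example #4)
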
    def test_policy_for_bounded_continuous_action_space(self):
        """
        https://github.com/rlgraph/rlgraph/issues/43
        """
        nn_input_space = FloatBox(shape=(4, ), add_batch_rank=True)
        action_space = FloatBox(low=-1.0,
                                high=1.0,
                                shape=(1, ),
                                add_batch_rank=True)
        # Double the shape for alpha/beta params.
        # action_space_parameters = Tuple(FloatBox(shape=(1,)), FloatBox(shape=(1,)), add_batch_rank=True)

        policy = Policy(
            network_spec=config_from_path("configs/test_simple_nn.json"),
            action_space=action_space)
        test = ComponentTest(component=policy,
                             input_spaces=dict(
                                 nn_inputs=nn_input_space,
                                 actions=action_space,
                             ),
                             action_space=action_space)

        policy_params = test.read_variable_values(policy.variable_registry)

        # Some NN inputs.
        nn_input = nn_input_space.sample(size=3)
        # Raw NN-output.
        expected_nn_output = np.matmul(
            nn_input,
            ComponentTest.read_params("policy/test-network/hidden-layer",
                                      policy_params))
        test.test(("get_nn_outputs", nn_input),
                  expected_outputs=expected_nn_output)

        # Raw action layer output.
        expected_raw_logits = np.matmul(
            expected_nn_output,
            ComponentTest.read_params(
                "policy/action-adapter-0/action-network/action-layer",
                policy_params))
        test.test(("get_adapter_outputs", nn_input),
                  expected_outputs=dict(adapter_outputs=expected_raw_logits,
                                        nn_outputs=expected_nn_output),
                  decimals=5)

        # Parameter (alpha/betas).
        expected_alpha_parameters = np.log(
            np.exp(expected_raw_logits[:, 0:1]) + 1.0) + 1.0
        expected_beta_parameters = np.log(
            np.exp(expected_raw_logits[:, 1:]) + 1.0) + 1.0
        expected_parameters = tuple(
            [expected_alpha_parameters, expected_beta_parameters])
        test.test(("get_adapter_outputs_and_parameters", nn_input,
                   ["adapter_outputs", "parameters"]),
                  expected_outputs=dict(adapter_outputs=expected_raw_logits,
                                        parameters=expected_parameters),
                  decimals=5)

        print("Params: {}".format(expected_parameters))

        action = test.test(("get_action", nn_input))["action"]
        self.assertTrue(action.dtype == np.float32)
        self.assertGreaterEqual(action.min(), -1.0)
        self.assertLessEqual(action.max(), 1.0)
        self.assertTrue(action.shape == (3, 1))

        out = test.test(("get_action_and_log_likelihood", nn_input))
        action = out["action"]
        llh = out["log_likelihood"]

        # Action log-probs.
        actions_scaled_back = (action + 1.0) / 2.0
        expected_action_log_llh_output = np.log(
            beta.pdf(actions_scaled_back, expected_alpha_parameters,
                     expected_beta_parameters))
        # expected_action_log_prob_output = np.array([[expected_action_log_prob_output[0][0]],
        # [expected_action_log_prob_output[1][1]], [expected_action_log_prob_output[2][2]]])
        test.test(("get_log_likelihood", [nn_input, action], "log_likelihood"),
                  expected_outputs=dict(
                      log_likelihood=expected_action_log_llh_output),
                  decimals=5)
        recursive_assert_almost_equal(expected_action_log_llh_output,
                                      llh,
                                      decimals=5)

        # Stochastic sample.
        actions = test.test(("get_stochastic_action", nn_input))["action"]
        self.assertTrue(actions.dtype == np.float32)
        self.assertGreaterEqual(actions.min(), -1.0)
        self.assertLessEqual(actions.max(), 1.0)
        self.assertTrue(actions.shape == (3, 1))

        # Deterministic sample.
        actions = test.test(("get_deterministic_action", nn_input))["action"]
        self.assertTrue(actions.dtype == np.float32)
        self.assertGreaterEqual(actions.min(), -1.0)
        self.assertLessEqual(actions.max(), 1.0)
        self.assertTrue(actions.shape == (3, 1))

        # Distribution's entropy.
        entropy = test.test(("get_entropy", nn_input))["entropy"]
        self.assertTrue(entropy.dtype == np.float32)
        self.assertTrue(entropy.shape == (3, 1))
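
The Beta-head math asserted in Example #1 reduces to a softplus-plus-one parameterization over the action rescaled to [0, 1]; a standalone sketch with made-up numbers (not repository code):

import numpy as np
from scipy.stats import beta

raw_logits = np.array([[0.3, -0.7]])                      # (batch=1, 2): one column per Beta parameter
alpha = np.log(np.exp(raw_logits[:, 0:1]) + 1.0) + 1.0    # softplus(logit) + 1 keeps alpha > 1
beta_p = np.log(np.exp(raw_logits[:, 1:]) + 1.0) + 1.0    # same transform for beta

action = np.array([[0.25]])                                # action sampled in the env range [-1, 1]
x = (action + 1.0) / 2.0                                   # rescale to the Beta support [0, 1]
log_llh = np.log(beta.pdf(x, alpha, beta_p))               # what get_log_likelihood is expected to return
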
Example #2
    def test_policy_for_discrete_action_space(self):
        # state_space (NN is a simple single fc-layer relu network (2 units), random biases, random weights).
        state_space = FloatBox(shape=(4, ), add_batch_rank=True)

        # action_space (5 possible actions).
        action_space = IntBox(5, add_batch_rank=True)
        flat_float_action_space = FloatBox(shape=(5, ), add_batch_rank=True)

        policy = Policy(
            network_spec=config_from_path("configs/test_simple_nn.json"),
            action_space=action_space)
        test = ComponentTest(component=policy,
                             input_spaces=dict(
                                 nn_input=state_space,
                                 actions=action_space,
                                 logits=flat_float_action_space,
                                 probabilities=flat_float_action_space),
                             action_space=action_space)
        policy_params = test.read_variable_values(policy.variables)

        # Some NN inputs (4 input nodes, batch size=2).
        states = np.array([[-0.08, 0.4, -0.05, -0.55],
                           [13.0, -14.0, 10.0, -16.0]])
        # Raw NN-output.
        expected_nn_output = np.matmul(
            states,
            policy_params["policy/test-network/hidden-layer/dense/kernel"])
        test.test(("get_nn_output", states),
                  expected_outputs=dict(output=expected_nn_output),
                  decimals=6)

        # Raw action layer output; Expected shape=(2,5): 2=batch, 5=action categories
        expected_action_layer_output = np.matmul(
            expected_nn_output, policy_params[
                "policy/action-adapter-0/action-network/action-layer/dense/kernel"]
        )
        test.test(("get_action_layer_output", states),
                  expected_outputs=dict(output=expected_action_layer_output),
                  decimals=5)

        # Logits and parameters (probs); log-probs are skipped here (numerically unstable for small probs).
        expected_probabilities_output = softmax(expected_action_layer_output,
                                                axis=-1)
        test.test(("get_logits_probabilities_log_probs", states,
                   ["logits", "probabilities"]),
                  expected_outputs=dict(logits=expected_action_layer_output,
                                        probabilities=np.array(
                                            expected_probabilities_output,
                                            dtype=np.float32)),
                  decimals=5)

        print("Probs: {}".format(expected_probabilities_output))

        expected_actions = np.argmax(expected_action_layer_output, axis=-1)
        test.test(("get_action", states),
                  expected_outputs=dict(action=expected_actions))

        # Action log-probs.
        expected_action_log_prob_output = np.log(
            np.array([
                expected_probabilities_output[0][expected_actions[0]],
                expected_probabilities_output[1][expected_actions[1]],
            ]))
        test.test(("get_action_log_probs", [states, expected_actions]),
                  expected_outputs=dict(
                      action_log_probs=expected_action_log_prob_output,
                      logits=expected_action_layer_output),
                  decimals=5)

        # Stochastic sample.
        out = test.test(
            ("get_stochastic_action", states),
            expected_outputs=None)  # dict(action=expected_actions))
        self.assertTrue(out["action"].dtype == np.int32)
        self.assertTrue(out["action"].shape == (2, ))

        # Deterministic sample.
        test.test(("get_deterministic_action", states),
                  expected_outputs=None)  # dict(action=expected_actions))
        self.assertTrue(out["action"].dtype == np.int32)
        self.assertTrue(out["action"].shape == (2, ))

        # Distribution's entropy.
        out = test.test(
            ("get_entropy", states),
            expected_outputs=None)  # dict(entropy=expected_h), decimals=3)
        self.assertTrue(out["entropy"].dtype == np.float32)
        self.assertTrue(out["entropy"].shape == (2, ))

        # Action log-probs.
        expected_action_log_prob_output = dict(
            action_log_probs=np.log(
                np.array([
                    expected_probabilities_output[0][expected_actions[0]],
                    expected_probabilities_output[1][expected_actions[1]]
                ])),
            logits=expected_action_layer_output)
        test.test(("get_action_log_probs", [states, expected_actions]),
                  expected_outputs=expected_action_log_prob_output,
                  decimals=5)
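
Example #2's expected values follow the standard categorical recipe: softmax over the action-layer logits, argmax for the deterministic action, and the log of the chosen action's probability. A minimal sketch with toy numbers:

import numpy as np

logits = np.array([[0.1, 1.2, -0.3, 0.0, 0.5],
                   [2.0, -1.0, 0.3, 0.1, 0.0]])                # (batch=2, 5 action categories)
probs = np.exp(logits - logits.max(axis=-1, keepdims=True))
probs /= probs.sum(axis=-1, keepdims=True)                     # softmax
actions = probs.argmax(axis=-1)                                # deterministic action = argmax
log_probs = np.log(probs[np.arange(len(actions)), actions])    # per-sample action log-probability
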
Example #3
    def test_policy_for_discrete_action_space(self):
        # state_space (NN is a simple single fc-layer relu network (2 units), random biases, random weights).
        state_space = FloatBox(shape=(4, ), add_batch_rank=True)

        # action_space (5 possible actions).
        action_space = IntBox(5, add_batch_rank=True)

        policy = Policy(
            network_spec=config_from_path("configs/test_simple_nn.json"),
            action_space=action_space)
        test = ComponentTest(component=policy,
                             input_spaces=dict(
                                 nn_inputs=state_space,
                                 actions=action_space,
                             ),
                             action_space=action_space)
        policy_params = test.read_variable_values(policy.variable_registry)

        # Some NN inputs (4 input nodes, batch size=2).
        states = np.array([[-0.08, 0.4, -0.05, -0.55],
                           [13.0, -14.0, 10.0, -16.0]])
        # Raw NN-output.
        expected_nn_output = np.matmul(
            states,
            ComponentTest.read_params("policy/test-network/hidden-layer",
                                      policy_params))

        test.test(("get_nn_outputs", states),
                  expected_outputs=expected_nn_output,
                  decimals=5)

        # Raw action layer output; Expected shape=(2,5): 2=batch, 5=action categories
        expected_action_layer_output = np.matmul(
            expected_nn_output,
            ComponentTest.read_params(
                "policy/action-adapter-0/action-network/action-layer",
                policy_params))
        test.test(
            ("get_adapter_outputs", states),
            expected_outputs=dict(adapter_outputs=expected_action_layer_output,
                                  nn_outputs=expected_nn_output),
            decimals=5)

        # Logits, parameters (probs clamped at SMALL_NUMBER) and the resulting log-probs.
        expected_parameters_output = np.maximum(
            softmax(expected_action_layer_output, axis=-1), SMALL_NUMBER)
        test.test(("get_adapter_outputs_and_parameters", states,
                   ["adapter_outputs", "parameters", "log_probs"]),
                  expected_outputs=dict(
                      adapter_outputs=expected_action_layer_output,
                      parameters=np.array(expected_parameters_output,
                                          dtype=np.float32),
                      log_probs=np.log(expected_parameters_output)),
                  decimals=5)

        expected_actions = np.argmax(expected_action_layer_output, axis=-1)
        test.test(("get_action", states, ["action"]),
                  expected_outputs=dict(action=expected_actions))

        # Get action AND log-llh.
        out = test.test(("get_action_and_log_likelihood", states))
        action = out["action"]
        llh = out["log_likelihood"]

        # Action log-probs.
        expected_action_log_llh_output = np.log(
            np.array([
                expected_parameters_output[0][action[0]],
                expected_parameters_output[1][action[1]]
            ]))
        test.test(("get_log_likelihood", [states, action], "log_likelihood"),
                  expected_outputs=dict(
                      log_likelihood=expected_action_log_llh_output),
                  decimals=5)
        recursive_assert_almost_equal(expected_action_log_llh_output,
                                      llh,
                                      decimals=5)

        # Stochastic sample.
        out = test.test(("get_stochastic_action", states),
                        expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.int32
                        or (out["action"].dtype == np.int64))
        self.assertTrue(out["action"].shape == (2, ))

        # Deterministic sample.
        test.test(("get_deterministic_action", states), expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.int32
                        or (out["action"].dtype == np.int64))
        self.assertTrue(out["action"].shape == (2, ))

        # Distribution's entropy.
        out = test.test(("get_entropy", states), expected_outputs=None)
        self.assertTrue(out["entropy"].dtype == np.float32)
        self.assertTrue(out["entropy"].shape == (2, ))
Example #4
    def test_policy_for_bounded_continuous_action_space_using_squashed_normal(
            self):
        """
        Same test case, but with different bounded continuous distribution (squashed normal).
        """
        nn_input_space = FloatBox(shape=(4, ), add_batch_rank=True)
        action_space = FloatBox(low=-2.0,
                                high=1.0,
                                shape=(1, ),
                                add_batch_rank=True)

        policy = Policy(
            network_spec=config_from_path("configs/test_simple_nn.json"),
            action_space=action_space,
            distributions_spec=dict(
                bounded_distribution_type="squashed-normal"))
        test = ComponentTest(component=policy,
                             input_spaces=dict(
                                 nn_inputs=nn_input_space,
                                 actions=action_space,
                             ),
                             action_space=action_space)

        policy_params = test.read_variable_values(policy.variable_registry)

        # Some NN inputs.
        nn_input = nn_input_space.sample(size=3)
        # Raw NN-output.
        expected_nn_output = np.matmul(
            nn_input,
            ComponentTest.read_params("policy/test-network/hidden-layer",
                                      policy_params))
        test.test(("get_nn_outputs", nn_input),
                  expected_outputs=expected_nn_output)

        # Raw action layer output.
        expected_raw_logits = np.matmul(
            expected_nn_output,
            ComponentTest.read_params(
                "policy/action-adapter-0/action-network/action-layer",
                policy_params))
        test.test(("get_adapter_outputs", nn_input),
                  expected_outputs=dict(adapter_outputs=expected_raw_logits,
                                        nn_outputs=expected_nn_output),
                  decimals=5)

        # Parameter (mean/stddev).
        expected_mean_parameters = expected_raw_logits[:, 0:1]
        expected_log_stddev_parameters = np.clip(expected_raw_logits[:, 1:2],
                                                 MIN_LOG_STDDEV,
                                                 MAX_LOG_STDDEV)
        expected_parameters = tuple(
            [expected_mean_parameters,
             np.exp(expected_log_stddev_parameters)])
        test.test(("get_adapter_outputs_and_parameters", nn_input,
                   ["adapter_outputs", "parameters"]),
                  expected_outputs=dict(adapter_outputs=expected_raw_logits,
                                        parameters=expected_parameters),
                  decimals=5)

        print("Params: {}".format(expected_parameters))

        action = test.test(("get_action", nn_input))["action"]
        self.assertTrue(action.dtype == np.float32)
        self.assertGreaterEqual(action.min(), -2.0)
        self.assertLessEqual(action.max(), 1.0)
        self.assertTrue(action.shape == (3, 1))

        out = test.test(("get_action_and_log_likelihood", nn_input))
        action = out["action"]
        llh = out["log_likelihood"]

        # Action log-probs.
        actions_tanh_d = (action + 2.0) / 3.0 * 2.0 - 1.0
        actions_unsquashed = np.arctanh(actions_tanh_d)
        expected_action_log_llh_output = np.log(
            norm.pdf(actions_unsquashed,
                     loc=expected_parameters[0],
                     scale=expected_parameters[1]))
        expected_action_log_llh_output -= np.sum(np.log(1 - actions_tanh_d**2 +
                                                        SMALL_NUMBER),
                                                 axis=-1,
                                                 keepdims=True)
        # expected_action_log_prob_output = np.array([[expected_action_log_prob_output[0][0]],
        # [expected_action_log_prob_output[1][1]], [expected_action_log_prob_output[2][2]]])
        test.test(("get_log_likelihood", [nn_input, action], "log_likelihood"),
                  expected_outputs=dict(
                      log_likelihood=expected_action_log_llh_output),
                  decimals=5)
        recursive_assert_almost_equal(expected_action_log_llh_output,
                                      llh,
                                      decimals=5)

        # Stochastic sample.
        actions = test.test(("get_stochastic_action", nn_input))["action"]
        self.assertTrue(actions.dtype == np.float32)
        self.assertGreaterEqual(actions.min(), -2.0)
        self.assertLessEqual(actions.max(), 1.0)
        self.assertTrue(actions.shape == (3, 1))

        # Deterministic sample.
        actions = test.test(("get_deterministic_action", nn_input))["action"]
        self.assertTrue(actions.dtype == np.float32)
        self.assertGreaterEqual(actions.min(), -2.0)
        self.assertLessEqual(actions.max(), 1.0)
        self.assertTrue(actions.shape == (3, 1))

        # Distribution's entropy.
        entropy = test.test(("get_entropy", nn_input))["entropy"]
        self.assertTrue(entropy.dtype == np.float32)
        self.assertTrue(entropy.shape == (3, 1))
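
Example #4's expected log-likelihood is the usual change-of-variables through tanh plus an affine rescale from [low, high]; a standalone sketch with made-up numbers:

import numpy as np
from scipy.stats import norm

low, high = -2.0, 1.0
mean, stddev = np.array([[0.1]]), np.array([[0.5]])
action = np.array([[-0.3]])                                  # action sampled in [low, high]
t = (action - low) / (high - low) * 2.0 - 1.0                # map back to the tanh range (-1, 1)
u = np.arctanh(t)                                            # unsquashed Gaussian sample
log_llh = np.log(norm.pdf(u, loc=mean, scale=stddev))        # base Normal log-density
log_llh -= np.sum(np.log(1.0 - t ** 2 + 1e-6), axis=-1, keepdims=True)  # tanh Jacobian correction
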
Example #5
    def test_policy_for_boolean_action_space(self):
        # state_space (NN is a simple single fc-layer relu network (2 units), random biases, random weights).
        state_space = FloatBox(shape=(4, ), add_batch_rank=True)

        # action_space (simple boolean).
        action_space = BoolBox(add_batch_rank=True)

        policy = Policy(
            network_spec=config_from_path("configs/test_simple_nn.json"),
            action_space=action_space)
        test = ComponentTest(component=policy,
                             input_spaces=dict(
                                 nn_inputs=state_space,
                                 actions=action_space,
                             ),
                             action_space=action_space)
        policy_params = test.read_variable_values(policy.variable_registry)

        # Some NN inputs.
        batch_size = 32
        states = state_space.sample(batch_size)
        # Raw NN-output.
        expected_nn_output = np.matmul(
            states,
            ComponentTest.read_params("policy/test-network/hidden-layer",
                                      policy_params))

        test.test(("get_nn_outputs", states),
                  expected_outputs=expected_nn_output,
                  decimals=5)

        # Raw action layer output; expected shape=(32,): 32=batch (single action unit squeezed out).
        expected_action_layer_output = np.squeeze(np.matmul(
            expected_nn_output,
            ComponentTest.read_params(
                "policy/action-adapter-0/action-network/action-layer",
                policy_params)),
                                                  axis=-1)
        test.test(
            ("get_adapter_outputs", states),
            expected_outputs=dict(adapter_outputs=expected_action_layer_output,
                                  nn_outputs=expected_nn_output),
            decimals=5)

        # Logits, parameters (sigmoid probs) and the resulting log-probs.
        expected_probs_output = sigmoid(expected_action_layer_output)
        test.test(
            ("get_adapter_outputs_and_parameters", states,
             ["adapter_outputs", "parameters", "log_probs"]),
            expected_outputs=dict(adapter_outputs=expected_action_layer_output,
                                  parameters=expected_probs_output,
                                  log_probs=np.log(expected_probs_output)),
            decimals=5)

        expected_actions = expected_action_layer_output > 0.0
        test.test(("get_action", states, ["action"]),
                  expected_outputs=dict(action=expected_actions))

        # Get action AND log-llh.
        out = test.test(("get_action_and_log_likelihood", states))
        action = out["action"]
        llh = out["log_likelihood"]

        # Action log-probs.
        expected_action_log_llh_output = np.log(
            np.array([
                expected_probs_output[i] if action[i] else 1.0 -
                expected_probs_output[i] for i in range(batch_size)
            ]))
        test.test(("get_log_likelihood", [states, action], "log_likelihood"),
                  expected_outputs=dict(
                      log_likelihood=expected_action_log_llh_output),
                  decimals=5)
        recursive_assert_almost_equal(expected_action_log_llh_output,
                                      llh,
                                      decimals=5)

        # Stochastic sample.
        out = test.test(("get_stochastic_action", states),
                        expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.bool_)
        self.assertTrue(out["action"].shape == (batch_size, ))

        # Deterministic sample.
        test.test(("get_deterministic_action", states), expected_outputs=None)
        self.assertTrue(out["action"].dtype == np.bool_)
        self.assertTrue(out["action"].shape == (batch_size, ))

        # Distribution's entropy.
        out = test.test(("get_entropy", states), expected_outputs=None)
        self.assertTrue(out["entropy"].dtype == np.float32)
        self.assertTrue(out["entropy"].shape == (batch_size, ))
Example #6
    def test_policy_for_discrete_container_action_space(self):
        # state_space.
        state_space = FloatBox(shape=(4, ), add_batch_rank=True)

        # Container action space.
        action_space = dict(type="dict",
                            a=IntBox(2),
                            b=IntBox(3),
                            add_batch_rank=True)
        flat_float_action_space = dict(type="dict",
                                       a=FloatBox(shape=(2, )),
                                       b=FloatBox(shape=(3, )),
                                       add_batch_rank=True)

        policy = Policy(
            network_spec=config_from_path("configs/test_simple_nn.json"),
            action_space=action_space)
        test = ComponentTest(component=policy,
                             input_spaces=dict(
                                 nn_input=state_space,
                                 actions=action_space,
                                 probabilities=flat_float_action_space,
                                 parameters=flat_float_action_space,
                                 logits=flat_float_action_space),
                             action_space=action_space)
        policy_params = test.read_variable_values(policy.variables)

        # Some NN inputs (batch size=2).
        states = state_space.sample(2)
        # Raw NN-output.
        expected_nn_output = np.matmul(
            states,
            policy_params["policy/test-network/hidden-layer/dense/kernel"])
        test.test(("get_nn_output", states),
                  expected_outputs=dict(output=expected_nn_output),
                  decimals=6)

        # Raw action layers' output.
        expected_action_layer_outputs = dict(
            a=np.matmul(
                expected_nn_output, policy_params[
                    "policy/action-adapter-0/action-network/action-layer/dense/kernel"]
            ),
            b=np.matmul(
                expected_nn_output, policy_params[
                    "policy/action-adapter-1/action-network/action-layer/dense/kernel"]
            ))
        test.test(("get_action_layer_output", states),
                  expected_outputs=dict(output=expected_action_layer_outputs),
                  decimals=5)

        # Logits and parameters (probs); log-probs are skipped here (numerically unstable for small probs).
        expected_probabilities_output = dict(
            a=np.array(softmax(expected_action_layer_outputs["a"], axis=-1),
                       dtype=np.float32),
            b=np.array(softmax(expected_action_layer_outputs["b"], axis=-1),
                       dtype=np.float32))
        test.test(
            ("get_logits_probabilities_log_probs", states,
             ["logits", "probabilities"]),
            expected_outputs=dict(logits=expected_action_layer_outputs,
                                  probabilities=expected_probabilities_output),
            decimals=5)

        print("Probs: {}".format(expected_probabilities_output))

        expected_actions = dict(a=np.argmax(expected_action_layer_outputs["a"],
                                            axis=-1),
                                b=np.argmax(expected_action_layer_outputs["b"],
                                            axis=-1))
        test.test(("get_action", states),
                  expected_outputs=dict(action=expected_actions))

        # Stochastic sample.
        out = test.test(
            ("get_stochastic_action", states),
            expected_outputs=None)  # dict(action=expected_actions))
        self.assertTrue(out["action"]["a"].dtype == np.int32)
        self.assertTrue(out["action"]["a"].shape == (2, ))
        self.assertTrue(out["action"]["b"].dtype == np.int32)
        self.assertTrue(out["action"]["b"].shape == (2, ))

        # Deterministic sample.
        test.test(("get_deterministic_action", states),
                  expected_outputs=None)  # dict(action=expected_actions))
        self.assertTrue(out["action"]["a"].dtype == np.int32)
        self.assertTrue(out["action"]["a"].shape == (2, ))
        self.assertTrue(out["action"]["b"].dtype == np.int32)
        self.assertTrue(out["action"]["b"].shape == (2, ))

        # Distribution's entropy.
        out = test.test(
            ("get_entropy", states),
            expected_outputs=None)  # dict(entropy=expected_h), decimals=3)
        self.assertTrue(out["entropy"]["a"].dtype == np.float32)
        self.assertTrue(out["entropy"]["a"].shape == (2, ))
        self.assertTrue(out["entropy"]["b"].dtype == np.float32)
        self.assertTrue(out["entropy"]["b"].shape == (2, ))

        # Action log-probs.
        expected_action_log_prob_output = dict(
            a=np.log(
                np.array([
                    expected_probabilities_output["a"][0][expected_actions["a"]
                                                          [0]],
                    expected_probabilities_output["a"][1][expected_actions["a"]
                                                          [1]]
                ])),
            b=np.log(
                np.array([
                    expected_probabilities_output["b"][0][expected_actions["b"]
                                                          [0]],
                    expected_probabilities_output["b"][1][expected_actions["b"]
                                                          [1]]
                ])),
        )
        test.test(("get_action_log_probs", [states, expected_actions]),
                  expected_outputs=dict(
                      action_log_probs=expected_action_log_prob_output,
                      logits=expected_action_layer_outputs),
                  decimals=5)
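
For the Dict action space above, each component keeps its own categorical log-prob (the outputs stay a dict rather than being summed); a toy sketch of that indexing:

import numpy as np

probs = dict(a=np.array([[0.7, 0.3], [0.4, 0.6]]),
             b=np.array([[0.2, 0.5, 0.3], [0.1, 0.1, 0.8]]))      # per-component softmax probs, batch=2
acts = dict(a=np.array([0, 1]), b=np.array([1, 2]))
log_probs = {k: np.log(p[np.arange(len(acts[k])), acts[k]])       # one log-prob vector per component
             for k, p in probs.items()}
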
Example #7
    def test_policy_for_discrete_container_action_space(self):
        # state_space.
        state_space = FloatBox(shape=(4, ), add_batch_rank=True)

        # Container action space.
        action_space = dict(type="dict",
                            a=BoolBox(),
                            b=IntBox(3),
                            add_batch_rank=True)

        policy = Policy(
            network_spec=config_from_path("configs/test_simple_nn.json"),
            action_space=action_space)
        test = ComponentTest(component=policy,
                             input_spaces=dict(
                                 nn_inputs=state_space,
                                 actions=action_space,
                             ),
                             action_space=action_space)
        policy_params = test.read_variable_values(policy.variable_registry)

        # Some NN inputs (batch size=32).
        batch_size = 32
        states = state_space.sample(batch_size)
        # Raw NN-output.
        expected_nn_output = np.matmul(
            states,
            policy_params["policy/test-network/hidden-layer/dense/kernel"])
        test.test(("get_nn_outputs", states),
                  expected_outputs=expected_nn_output,
                  decimals=6)

        # Raw action layers' output.
        expected_action_layer_outputs = dict(
            a=np.squeeze(
                np.matmul(
                    expected_nn_output, policy_params[
                        "policy/action-adapter-0/action-network/action-layer/dense/kernel"]
                )),
            b=np.matmul(
                expected_nn_output, policy_params[
                    "policy/action-adapter-1/action-network/action-layer/dense/kernel"]
            ))
        test.test(("get_adapter_outputs", states),
                  expected_outputs=dict(
                      adapter_outputs=expected_action_layer_outputs,
                      nn_outputs=expected_nn_output),
                  decimals=5)

        # Logits and parameters (probs); log-probs are skipped here (numerically unstable for small probs).
        expected_probs_output = dict(
            a=np.array(sigmoid(expected_action_layer_outputs["a"]),
                       dtype=np.float32),
            b=np.array(softmax(expected_action_layer_outputs["b"], axis=-1),
                       dtype=np.float32))
        test.test(("get_adapter_outputs_and_parameters", states,
                   ["adapter_outputs", "parameters"]),
                  expected_outputs=dict(
                      adapter_outputs=expected_action_layer_outputs,
                      parameters=dict(a=expected_probs_output["a"],
                                      b=expected_action_layer_outputs["b"])),
                  decimals=5)

        print("Probs: {}".format(expected_probs_output))

        expected_actions = dict(a=expected_probs_output["a"] > 0.5,
                                b=np.argmax(expected_action_layer_outputs["b"],
                                            axis=-1))
        test.test(("get_action", states, ["action"]),
                  expected_outputs=dict(action=expected_actions))

        out = test.test(("get_action_and_log_likelihood", states))
        action = out["action"]
        llh = out["log_likelihood"]

        # Action log-likelihood (sum of the composite llhs).
        expected_action_llh_output = \
            np.log(np.array([expected_probs_output["a"][i] if action["a"][i] else 1.0 - expected_probs_output["a"][i] for i in range(batch_size)])) + \
            np.log(np.array([expected_probs_output["b"][i][action["b"][i]] for i in range(batch_size)]))
        test.test(("get_log_likelihood", [states, action]),
                  expected_outputs=dict(
                      log_likelihood=expected_action_llh_output,
                      adapter_outputs=expected_action_layer_outputs),
                  decimals=5)
        recursive_assert_almost_equal(expected_action_llh_output,
                                      llh,
                                      decimals=5)

        # Stochastic sample.
        out = test.test(
            ("get_stochastic_action", states),
            expected_outputs=None)  # dict(action=expected_actions))
        self.assertTrue(out["action"]["a"].dtype == np.bool_)
        self.assertTrue(out["action"]["a"].shape == (batch_size, ))
        self.assertTrue(out["action"]["b"].dtype == np.int32)
        self.assertTrue(out["action"]["b"].shape == (batch_size, ))

        # Deterministic sample.
        test.test(("get_deterministic_action", states),
                  expected_outputs=None)  # dict(action=expected_actions))
        self.assertTrue(out["action"]["a"].dtype == np.bool_)
        self.assertTrue(out["action"]["a"].shape == (batch_size, ))
        self.assertTrue(out["action"]["b"].dtype == np.int32)
        self.assertTrue(out["action"]["b"].shape == (batch_size, ))

        # Distribution's entropy.
        out = test.test(
            ("get_entropy", states),
            expected_outputs=None)  # dict(entropy=expected_h), decimals=3)
        self.assertTrue(out["entropy"]["a"].dtype == np.float32)
        self.assertTrue(out["entropy"]["a"].shape == (batch_size, ))
        self.assertTrue(out["entropy"]["b"].dtype == np.float32)
        self.assertTrue(out["entropy"]["b"].shape == (batch_size, ))
Example #8
    def test_policy_for_bounded_continuous_action_space(self):
        """
        https://github.com/rlgraph/rlgraph/issues/43
        """
        nn_input_space = FloatBox(shape=(4,), add_batch_rank=True)
        action_space = FloatBox(low=-1.0, high=1.0, shape=(1,), add_batch_rank=True)
        # Double the shape for alpha/beta params.
        action_space_parameters = FloatBox(shape=(2,), add_batch_rank=True)

        policy = Policy(network_spec=config_from_path("configs/test_simple_nn.json"), action_space=action_space)
        test = ComponentTest(
            component=policy,
            input_spaces=dict(
                nn_input=nn_input_space,
                actions=action_space,
                logits=FloatBox(shape=(1,), add_batch_rank=True),
                probabilities=FloatBox(add_batch_rank=True),
                parameters=action_space_parameters,
            ),
            action_space=action_space
        )

        policy_params = test.read_variable_values(policy.variables)

        # Some NN inputs.
        nn_input = nn_input_space.sample(size=3)
        # Raw NN-output.
        expected_nn_output = np.matmul(nn_input, policy_params["policy/test-network/hidden-layer/dense/kernel"])
        test.test(("get_nn_output", nn_input), expected_outputs=dict(output=expected_nn_output))

        # Raw action layer output.
        expected_raw_logits = np.matmul(
            expected_nn_output, policy_params["policy/action-adapter-0/action-network/action-layer/dense/kernel"]
        )
        test.test(("get_action_layer_output", nn_input), expected_outputs=dict(output=expected_raw_logits),
                  decimals=5)

        # Parameter (alpha/betas).
        expected_parameters_output = np.log(np.exp(expected_raw_logits) + 1.0) + 1.0
        test.test(("get_logits_parameters_log_probs", nn_input, ["logits", "parameters"]), expected_outputs=dict(
            logits=expected_raw_logits, parameters=expected_parameters_output
        ), decimals=5)

        print("Params: {}".format(expected_parameters_output))

        actions = test.test(("get_action", nn_input))["action"]
        self.assertTrue(actions.dtype == np.float32)
        self.assertGreaterEqual(actions.min(), -1.0)
        self.assertLessEqual(actions.max(), 1.0)
        self.assertTrue(actions.shape == (3, 1))

        # Action log-probs.
        actions_scaled_back = (actions + 1.0) / 2.0
        expected_action_log_prob_output = np.log(beta.pdf(actions_scaled_back, expected_parameters_output[:, 1], expected_parameters_output[:, 0]))
        expected_action_log_prob_output = np.array([[expected_action_log_prob_output[0][0]], [expected_action_log_prob_output[1][1]], [expected_action_log_prob_output[2][2]]])
        test.test(("get_action_log_probs", [nn_input, actions]),
                  expected_outputs=dict(action_log_probs=expected_action_log_prob_output,
                                        logits=expected_raw_logits), decimals=5)

        # Stochastic sample.
        actions = test.test(("get_stochastic_action", nn_input))["action"]
        self.assertTrue(actions.dtype == np.float32)
        self.assertGreaterEqual(actions.min(), -1.0)
        self.assertLessEqual(actions.max(), 1.0)
        self.assertTrue(actions.shape == (3, 1))

        # Deterministic sample.
        actions = test.test(("get_deterministic_action", nn_input))["action"]
        self.assertTrue(actions.dtype == np.float32)
        self.assertGreaterEqual(actions.min(), -1.0)
        self.assertLessEqual(actions.max(), 1.0)
        self.assertTrue(actions.shape == (3, 1))

        # Distribution's entropy.
        entropy = test.test(("get_entropy", nn_input))["entropy"]
        self.assertTrue(entropy.dtype == np.float32)
        self.assertTrue(entropy.shape == (3, 1))
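
The explicit diagonal indexing near the end of Example #8 exists because scipy broadcasts the (3, 1) rescaled actions against the (3,) alpha/beta vectors to a (3, 3) grid; only the diagonal pairs each sample with its own parameters. A toy sketch:

import numpy as np
from scipy.stats import beta

x = np.array([[0.2], [0.5], [0.8]])             # (3, 1) actions rescaled to [0, 1]
a = np.array([1.5, 2.0, 3.0])                   # (3,) alpha parameters
b = np.array([2.5, 1.2, 1.8])                   # (3,) beta parameters
full = beta.pdf(x, a, b)                        # broadcasts to shape (3, 3)
per_sample = np.diag(full)[:, None]             # (3, 1): density of sample i under parameters i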