def test_policy_for_bounded_continuous_action_space(self):
    """
    https://github.com/rlgraph/rlgraph/issues/43
    """
    nn_input_space = FloatBox(shape=(4,), add_batch_rank=True)
    action_space = FloatBox(low=-1.0, high=1.0, shape=(1,), add_batch_rank=True)
    # Double the shape for alpha/beta params.
    # action_space_parameters = Tuple(FloatBox(shape=(1,)), FloatBox(shape=(1,)), add_batch_rank=True)

    policy = Policy(network_spec=config_from_path("configs/test_simple_nn.json"), action_space=action_space)
    test = ComponentTest(
        component=policy,
        input_spaces=dict(nn_inputs=nn_input_space, actions=action_space),
        action_space=action_space
    )
    policy_params = test.read_variable_values(policy.variable_registry)

    # Some NN inputs.
    nn_input = nn_input_space.sample(size=3)

    # Raw NN-output.
    expected_nn_output = np.matmul(
        nn_input, ComponentTest.read_params("policy/test-network/hidden-layer", policy_params)
    )
    test.test(("get_nn_outputs", nn_input), expected_outputs=expected_nn_output)

    # Raw action layer output.
    expected_raw_logits = np.matmul(
        expected_nn_output,
        ComponentTest.read_params("policy/action-adapter-0/action-network/action-layer", policy_params)
    )
    test.test(
        ("get_adapter_outputs", nn_input),
        expected_outputs=dict(adapter_outputs=expected_raw_logits, nn_outputs=expected_nn_output),
        decimals=5
    )

    # Parameters (alphas/betas).
    expected_alpha_parameters = np.log(np.exp(expected_raw_logits[:, 0:1]) + 1.0) + 1.0
    expected_beta_parameters = np.log(np.exp(expected_raw_logits[:, 1:]) + 1.0) + 1.0
    expected_parameters = tuple([expected_alpha_parameters, expected_beta_parameters])
    test.test(
        ("get_adapter_outputs_and_parameters", nn_input, ["adapter_outputs", "parameters"]),
        expected_outputs=dict(adapter_outputs=expected_raw_logits, parameters=expected_parameters),
        decimals=5
    )

    print("Params: {}".format(expected_parameters))

    action = test.test(("get_action", nn_input))["action"]
    self.assertTrue(action.dtype == np.float32)
    self.assertGreaterEqual(action.min(), -1.0)
    self.assertLessEqual(action.max(), 1.0)
    self.assertTrue(action.shape == (3, 1))

    out = test.test(("get_action_and_log_likelihood", nn_input))
    action = out["action"]
    llh = out["log_likelihood"]

    # Action log-probs.
    actions_scaled_back = (action + 1.0) / 2.0
    expected_action_log_llh_output = np.log(
        beta.pdf(actions_scaled_back, expected_alpha_parameters, expected_beta_parameters)
    )
    # expected_action_log_prob_output = np.array([[expected_action_log_prob_output[0][0]],
    #     [expected_action_log_prob_output[1][1]], [expected_action_log_prob_output[2][2]]])
    test.test(
        ("get_log_likelihood", [nn_input, action], "log_likelihood"),
        expected_outputs=dict(log_likelihood=expected_action_log_llh_output),
        decimals=5
    )
    recursive_assert_almost_equal(expected_action_log_llh_output, llh, decimals=5)

    # Stochastic sample.
    actions = test.test(("get_stochastic_action", nn_input))["action"]
    self.assertTrue(actions.dtype == np.float32)
    self.assertGreaterEqual(actions.min(), -1.0)
    self.assertLessEqual(actions.max(), 1.0)
    self.assertTrue(actions.shape == (3, 1))

    # Deterministic sample.
    actions = test.test(("get_deterministic_action", nn_input))["action"]
    self.assertTrue(actions.dtype == np.float32)
    self.assertGreaterEqual(actions.min(), -1.0)
    self.assertLessEqual(actions.max(), 1.0)
    self.assertTrue(actions.shape == (3, 1))

    # Distribution's entropy.
    entropy = test.test(("get_entropy", nn_input))["entropy"]
    self.assertTrue(entropy.dtype == np.float32)
    self.assertTrue(entropy.shape == (3, 1))
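# --- Illustrative sketch (not part of the test suite) -----------------------
# A minimal NumPy example of the alpha/beta parameterization assumed by the
# expectations above: raw adapter outputs go through a shifted softplus
# (log(exp(x) + 1) + 1) so that alpha, beta > 1, and a Beta variate in [0, 1]
# is rescaled linearly into the action bounds. All names (`raw_logits`, `low`,
# `high`) are illustrative; using the Beta mean as the deterministic action is
# only one plausible choice, not necessarily what the component under test does.
def _beta_params_sketch(raw_logits, low=-1.0, high=1.0):
    import numpy as np
    from scipy.stats import beta as beta_dist

    alpha = np.log(np.exp(raw_logits[:, 0:1]) + 1.0) + 1.0
    beta_ = np.log(np.exp(raw_logits[:, 1:]) + 1.0) + 1.0
    # Mean of the Beta lives in [0, 1]; rescale into [low, high].
    mean_01 = alpha / (alpha + beta_)
    action = mean_01 * (high - low) + low
    # Log-likelihood of an action: scale back into [0, 1] first.
    scaled = (action - low) / (high - low)
    log_llh = np.log(beta_dist.pdf(scaled, alpha, beta_))
    return action, log_llh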
def test_policy_for_discrete_action_space(self):
    # state_space (NN is a simple single fc-layer relu network (2 units), random biases, random weights).
    state_space = FloatBox(shape=(4,), add_batch_rank=True)
    # action_space (5 possible actions).
    action_space = IntBox(5, add_batch_rank=True)
    flat_float_action_space = FloatBox(shape=(5,), add_batch_rank=True)

    policy = Policy(network_spec=config_from_path("configs/test_simple_nn.json"), action_space=action_space)
    test = ComponentTest(
        component=policy,
        input_spaces=dict(
            nn_input=state_space,
            actions=action_space,
            logits=flat_float_action_space,
            probabilities=flat_float_action_space
        ),
        action_space=action_space
    )
    policy_params = test.read_variable_values(policy.variables)

    # Some NN inputs (4 input nodes, batch size=2).
    states = np.array([[-0.08, 0.4, -0.05, -0.55], [13.0, -14.0, 10.0, -16.0]])

    # Raw NN-output.
    expected_nn_output = np.matmul(states, policy_params["policy/test-network/hidden-layer/dense/kernel"])
    test.test(("get_nn_output", states), expected_outputs=dict(output=expected_nn_output), decimals=6)

    # Raw action layer output; expected shape=(2,5): 2=batch, 5=action categories.
    expected_action_layer_output = np.matmul(
        expected_nn_output,
        policy_params["policy/action-adapter-0/action-network/action-layer/dense/kernel"]
    )
    test.test(
        ("get_action_layer_output", states),
        expected_outputs=dict(output=expected_action_layer_output),
        decimals=5
    )

    # Logits, parameters (probs) and skip log-probs (numerically unstable for small probs).
    expected_probabilities_output = softmax(expected_action_layer_output, axis=-1)
    test.test(
        ("get_logits_probabilities_log_probs", states, ["logits", "probabilities"]),
        expected_outputs=dict(
            logits=expected_action_layer_output,
            probabilities=np.array(expected_probabilities_output, dtype=np.float32)
        ),
        decimals=5
    )

    print("Probs: {}".format(expected_probabilities_output))

    expected_actions = np.argmax(expected_action_layer_output, axis=-1)
    test.test(("get_action", states), expected_outputs=dict(action=expected_actions))

    # Action log-probs.
    expected_action_log_prob_output = np.log(np.array([
        expected_probabilities_output[0][expected_actions[0]],
        expected_probabilities_output[1][expected_actions[1]],
    ]))
    test.test(
        ("get_action_log_probs", [states, expected_actions]),
        expected_outputs=dict(
            action_log_probs=expected_action_log_prob_output,
            logits=expected_action_layer_output
        ),
        decimals=5
    )

    # Stochastic sample.
    out = test.test(("get_stochastic_action", states), expected_outputs=None)  # dict(action=expected_actions))
    self.assertTrue(out["action"].dtype == np.int32)
    self.assertTrue(out["action"].shape == (2,))

    # Deterministic sample.
    test.test(("get_deterministic_action", states), expected_outputs=None)  # dict(action=expected_actions))
    self.assertTrue(out["action"].dtype == np.int32)
    self.assertTrue(out["action"].shape == (2,))

    # Distribution's entropy.
    out = test.test(("get_entropy", states), expected_outputs=None)  # dict(entropy=expected_h), decimals=3)
    self.assertTrue(out["entropy"].dtype == np.float32)
    self.assertTrue(out["entropy"].shape == (2,))

    # Action log-probs.
    expected_action_log_prob_output = dict(
        action_log_probs=np.log(np.array([
            expected_probabilities_output[0][expected_actions[0]],
            expected_probabilities_output[1][expected_actions[1]]
        ])),
        logits=expected_action_layer_output
    )
    test.test(
        ("get_action_log_probs", [states, expected_actions]),
        expected_outputs=expected_action_log_prob_output,
        decimals=5
    )
def test_policy_for_discrete_action_space(self):
    # state_space (NN is a simple single fc-layer relu network (2 units), random biases, random weights).
    state_space = FloatBox(shape=(4,), add_batch_rank=True)
    # action_space (5 possible actions).
    action_space = IntBox(5, add_batch_rank=True)

    policy = Policy(network_spec=config_from_path("configs/test_simple_nn.json"), action_space=action_space)
    test = ComponentTest(
        component=policy,
        input_spaces=dict(nn_inputs=state_space, actions=action_space),
        action_space=action_space
    )
    policy_params = test.read_variable_values(policy.variable_registry)

    # Some NN inputs (4 input nodes, batch size=2).
    states = np.array([[-0.08, 0.4, -0.05, -0.55], [13.0, -14.0, 10.0, -16.0]])

    # Raw NN-output.
    expected_nn_output = np.matmul(
        states, ComponentTest.read_params("policy/test-network/hidden-layer", policy_params)
    )
    test.test(("get_nn_outputs", states), expected_outputs=expected_nn_output, decimals=5)

    # Raw action layer output; expected shape=(2,5): 2=batch, 5=action categories.
    expected_action_layer_output = np.matmul(
        expected_nn_output,
        ComponentTest.read_params("policy/action-adapter-0/action-network/action-layer", policy_params)
    )
    test.test(
        ("get_adapter_outputs", states),
        expected_outputs=dict(adapter_outputs=expected_action_layer_output, nn_outputs=expected_nn_output),
        decimals=5
    )

    # Logits, parameters (probs) and skip log-probs (numerically unstable for small probs).
    expected_parameters_output = np.maximum(softmax(expected_action_layer_output, axis=-1), SMALL_NUMBER)
    test.test(
        ("get_adapter_outputs_and_parameters", states, ["adapter_outputs", "parameters", "log_probs"]),
        expected_outputs=dict(
            adapter_outputs=expected_action_layer_output,
            parameters=np.array(expected_parameters_output, dtype=np.float32),
            log_probs=np.log(expected_parameters_output)
        ),
        decimals=5
    )

    expected_actions = np.argmax(expected_action_layer_output, axis=-1)
    test.test(("get_action", states, ["action"]), expected_outputs=dict(action=expected_actions))

    # Get action AND log-llh.
    out = test.test(("get_action_and_log_likelihood", states))
    action = out["action"]
    llh = out["log_likelihood"]

    # Action log-probs.
    expected_action_log_llh_output = np.log(np.array([
        expected_parameters_output[0][action[0]],
        expected_parameters_output[1][action[1]]
    ]))
    test.test(
        ("get_log_likelihood", [states, action], "log_likelihood"),
        expected_outputs=dict(log_likelihood=expected_action_log_llh_output),
        decimals=5
    )
    recursive_assert_almost_equal(expected_action_log_llh_output, llh, decimals=5)

    # Stochastic sample.
    out = test.test(("get_stochastic_action", states), expected_outputs=None)
    self.assertTrue(out["action"].dtype == np.int32 or out["action"].dtype == np.int64)
    self.assertTrue(out["action"].shape == (2,))

    # Deterministic sample.
    test.test(("get_deterministic_action", states), expected_outputs=None)
    self.assertTrue(out["action"].dtype == np.int32 or out["action"].dtype == np.int64)
    self.assertTrue(out["action"].shape == (2,))

    # Distribution's entropy.
    out = test.test(("get_entropy", states), expected_outputs=None)
    self.assertTrue(out["entropy"].dtype == np.float32)
    self.assertTrue(out["entropy"].shape == (2,))
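# --- Illustrative sketch (not part of the test suite) -----------------------
# Minimal NumPy version of the categorical parameter/log-prob expectations used
# above: softmax over the adapter outputs, clipped from below by a small
# constant before taking the log (to avoid log(0)), with the greedy action
# being the argmax. `small_number` stands in for the library's SMALL_NUMBER
# constant; all names here are illustrative only.
def _categorical_params_sketch(adapter_outputs, small_number=1e-6):
    import numpy as np

    # Numerically stable softmax along the last axis.
    shifted = adapter_outputs - np.max(adapter_outputs, axis=-1, keepdims=True)
    exp = np.exp(shifted)
    probs = np.maximum(exp / np.sum(exp, axis=-1, keepdims=True), small_number)
    log_probs = np.log(probs)
    # Greedy (deterministic) action = argmax over the last axis.
    greedy_actions = np.argmax(adapter_outputs, axis=-1)
    return probs, log_probs, greedy_actions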
def test_policy_for_bounded_continuous_action_space_using_squashed_normal(self):
    """
    Same test case, but with a different bounded continuous distribution (squashed normal).
    """
    nn_input_space = FloatBox(shape=(4,), add_batch_rank=True)
    action_space = FloatBox(low=-2.0, high=1.0, shape=(1,), add_batch_rank=True)

    policy = Policy(
        network_spec=config_from_path("configs/test_simple_nn.json"),
        action_space=action_space,
        distributions_spec=dict(bounded_distribution_type="squashed-normal")
    )
    test = ComponentTest(
        component=policy,
        input_spaces=dict(nn_inputs=nn_input_space, actions=action_space),
        action_space=action_space
    )
    policy_params = test.read_variable_values(policy.variable_registry)

    # Some NN inputs.
    nn_input = nn_input_space.sample(size=3)

    # Raw NN-output.
    expected_nn_output = np.matmul(
        nn_input, ComponentTest.read_params("policy/test-network/hidden-layer", policy_params)
    )
    test.test(("get_nn_outputs", nn_input), expected_outputs=expected_nn_output)

    # Raw action layer output.
    expected_raw_logits = np.matmul(
        expected_nn_output,
        ComponentTest.read_params("policy/action-adapter-0/action-network/action-layer", policy_params)
    )
    test.test(
        ("get_adapter_outputs", nn_input),
        expected_outputs=dict(adapter_outputs=expected_raw_logits, nn_outputs=expected_nn_output),
        decimals=5
    )

    # Parameters (mean/stddev).
    expected_mean_parameters = expected_raw_logits[:, 0:1]
    expected_log_stddev_parameters = np.clip(expected_raw_logits[:, 1:2], MIN_LOG_STDDEV, MAX_LOG_STDDEV)
    expected_parameters = tuple([expected_mean_parameters, np.exp(expected_log_stddev_parameters)])
    test.test(
        ("get_adapter_outputs_and_parameters", nn_input, ["adapter_outputs", "parameters"]),
        expected_outputs=dict(adapter_outputs=expected_raw_logits, parameters=expected_parameters),
        decimals=5
    )

    print("Params: {}".format(expected_parameters))

    action = test.test(("get_action", nn_input))["action"]
    self.assertTrue(action.dtype == np.float32)
    self.assertGreaterEqual(action.min(), -2.0)
    self.assertLessEqual(action.max(), 1.0)
    self.assertTrue(action.shape == (3, 1))

    out = test.test(("get_action_and_log_likelihood", nn_input))
    action = out["action"]
    llh = out["log_likelihood"]

    # Action log-probs.
    actions_tanh_d = (action + 2.0) / 3.0 * 2.0 - 1.0
    actions_unsquashed = np.arctanh(actions_tanh_d)
    expected_action_log_llh_output = np.log(
        norm.pdf(actions_unsquashed, loc=expected_parameters[0], scale=expected_parameters[1])
    )
    expected_action_log_llh_output -= np.sum(
        np.log(1 - actions_tanh_d ** 2 + SMALL_NUMBER), axis=-1, keepdims=True
    )
    # expected_action_log_prob_output = np.array([[expected_action_log_prob_output[0][0]],
    #     [expected_action_log_prob_output[1][1]], [expected_action_log_prob_output[2][2]]])
    test.test(
        ("get_log_likelihood", [nn_input, action], "log_likelihood"),
        expected_outputs=dict(log_likelihood=expected_action_log_llh_output),
        decimals=5
    )
    recursive_assert_almost_equal(expected_action_log_llh_output, llh, decimals=5)

    # Stochastic sample.
    actions = test.test(("get_stochastic_action", nn_input))["action"]
    self.assertTrue(actions.dtype == np.float32)
    self.assertGreaterEqual(actions.min(), -2.0)
    self.assertLessEqual(actions.max(), 1.0)
    self.assertTrue(actions.shape == (3, 1))

    # Deterministic sample.
    actions = test.test(("get_deterministic_action", nn_input))["action"]
    self.assertTrue(actions.dtype == np.float32)
    self.assertGreaterEqual(actions.min(), -2.0)
    self.assertLessEqual(actions.max(), 1.0)
    self.assertTrue(actions.shape == (3, 1))

    # Distribution's entropy.
    entropy = test.test(("get_entropy", nn_input))["entropy"]
    self.assertTrue(entropy.dtype == np.float32)
    self.assertTrue(entropy.shape == (3, 1))
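# --- Illustrative sketch (not part of the test suite) -----------------------
# The squashed-normal expectations above follow the usual tanh change-of-
# variables: a Normal sample is squashed with tanh and rescaled into
# [low, high], so the log-likelihood of an observed action undoes both steps
# and subtracts the Jacobian term sum(log(1 - tanh(x)^2)). `eps` mirrors the
# SMALL_NUMBER constant used in the test; all names are illustrative.
def _squashed_normal_log_llh_sketch(action, mean, stddev, low=-2.0, high=1.0, eps=1e-6):
    import numpy as np
    from scipy.stats import norm

    # Undo the rescaling from [low, high] back into tanh's range (-1, 1) ...
    tanh_d = (action - low) / (high - low) * 2.0 - 1.0
    # ... and undo the tanh squashing itself.
    unsquashed = np.arctanh(tanh_d)
    log_llh = np.log(norm.pdf(unsquashed, loc=mean, scale=stddev))
    # Jacobian correction for the tanh squashing.
    log_llh -= np.sum(np.log(1.0 - tanh_d ** 2 + eps), axis=-1, keepdims=True)
    return log_llh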
def test_policy_for_boolean_action_space(self):
    # state_space (NN is a simple single fc-layer relu network (2 units), random biases, random weights).
    state_space = FloatBox(shape=(4,), add_batch_rank=True)
    # action_space (simple boolean).
    action_space = BoolBox(add_batch_rank=True)

    policy = Policy(network_spec=config_from_path("configs/test_simple_nn.json"), action_space=action_space)
    test = ComponentTest(
        component=policy,
        input_spaces=dict(nn_inputs=state_space, actions=action_space),
        action_space=action_space
    )
    policy_params = test.read_variable_values(policy.variable_registry)

    # Some NN inputs.
    batch_size = 32
    states = state_space.sample(batch_size)

    # Raw NN-output.
    expected_nn_output = np.matmul(
        states, ComponentTest.read_params("policy/test-network/hidden-layer", policy_params)
    )
    test.test(("get_nn_outputs", states), expected_outputs=expected_nn_output, decimals=5)

    # Raw action layer output; expected shape=(batch_size,) after squeezing the single output node.
    expected_action_layer_output = np.squeeze(
        np.matmul(
            expected_nn_output,
            ComponentTest.read_params("policy/action-adapter-0/action-network/action-layer", policy_params)
        ),
        axis=-1
    )
    test.test(
        ("get_adapter_outputs", states),
        expected_outputs=dict(adapter_outputs=expected_action_layer_output, nn_outputs=expected_nn_output),
        decimals=5
    )

    # Logits, parameters (probs) and skip log-probs (numerically unstable for small probs).
    expected_probs_output = sigmoid(expected_action_layer_output)
    test.test(
        ("get_adapter_outputs_and_parameters", states, ["adapter_outputs", "parameters", "log_probs"]),
        expected_outputs=dict(
            adapter_outputs=expected_action_layer_output,
            parameters=expected_probs_output,
            log_probs=np.log(expected_probs_output)
        ),
        decimals=5
    )

    expected_actions = expected_action_layer_output > 0.0
    test.test(("get_action", states, ["action"]), expected_outputs=dict(action=expected_actions))

    # Get action AND log-llh.
    out = test.test(("get_action_and_log_likelihood", states))
    action = out["action"]
    llh = out["log_likelihood"]

    # Action log-probs.
    expected_action_log_llh_output = np.log(np.array([
        expected_probs_output[i] if action[i] else 1.0 - expected_probs_output[i]
        for i in range(batch_size)
    ]))
    test.test(
        ("get_log_likelihood", [states, action], "log_likelihood"),
        expected_outputs=dict(log_likelihood=expected_action_log_llh_output),
        decimals=5
    )
    recursive_assert_almost_equal(expected_action_log_llh_output, llh, decimals=5)

    # Stochastic sample.
    out = test.test(("get_stochastic_action", states), expected_outputs=None)
    self.assertTrue(out["action"].dtype == np.bool_)
    self.assertTrue(out["action"].shape == (batch_size,))

    # Deterministic sample.
    test.test(("get_deterministic_action", states), expected_outputs=None)
    self.assertTrue(out["action"].dtype == np.bool_)
    self.assertTrue(out["action"].shape == (batch_size,))

    # Distribution's entropy.
    out = test.test(("get_entropy", states), expected_outputs=None)
    self.assertTrue(out["entropy"].dtype == np.float32)
    self.assertTrue(out["entropy"].shape == (batch_size,))
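# --- Illustrative sketch (not part of the test suite) -----------------------
# The boolean action space above is modeled as a Bernoulli: the single adapter
# output per sample is squashed with a sigmoid into p = P(action=True), the
# greedy action is logit > 0 (equivalently p > 0.5), and the log-likelihood of
# an observed action is log(p) or log(1 - p). Names are illustrative only.
def _bernoulli_log_llh_sketch(logits, actions):
    import numpy as np

    p_true = 1.0 / (1.0 + np.exp(-logits))
    greedy = logits > 0.0
    log_llh = np.where(actions, np.log(p_true), np.log(1.0 - p_true))
    return p_true, greedy, log_llh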
def test_policy_for_discrete_container_action_space(self):
    # state_space.
    state_space = FloatBox(shape=(4,), add_batch_rank=True)
    # Container action space.
    action_space = dict(type="dict", a=IntBox(2), b=IntBox(3), add_batch_rank=True)
    flat_float_action_space = dict(
        type="dict", a=FloatBox(shape=(2,)), b=FloatBox(shape=(3,)), add_batch_rank=True
    )

    policy = Policy(network_spec=config_from_path("configs/test_simple_nn.json"), action_space=action_space)
    test = ComponentTest(
        component=policy,
        input_spaces=dict(
            nn_input=state_space,
            actions=action_space,
            probabilities=flat_float_action_space,
            parameters=flat_float_action_space,
            logits=flat_float_action_space
        ),
        action_space=action_space
    )
    policy_params = test.read_variable_values(policy.variables)

    # Some NN inputs (batch size=2).
    states = state_space.sample(2)

    # Raw NN-output.
    expected_nn_output = np.matmul(states, policy_params["policy/test-network/hidden-layer/dense/kernel"])
    test.test(("get_nn_output", states), expected_outputs=dict(output=expected_nn_output), decimals=6)

    # Raw action layers' output.
    expected_action_layer_outputs = dict(
        a=np.matmul(
            expected_nn_output,
            policy_params["policy/action-adapter-0/action-network/action-layer/dense/kernel"]
        ),
        b=np.matmul(
            expected_nn_output,
            policy_params["policy/action-adapter-1/action-network/action-layer/dense/kernel"]
        )
    )
    test.test(
        ("get_action_layer_output", states),
        expected_outputs=dict(output=expected_action_layer_outputs),
        decimals=5
    )

    # Logits, parameters (probs) and skip log-probs (numerically unstable for small probs).
    expected_probabilities_output = dict(
        a=np.array(softmax(expected_action_layer_outputs["a"], axis=-1), dtype=np.float32),
        b=np.array(softmax(expected_action_layer_outputs["b"], axis=-1), dtype=np.float32)
    )
    test.test(
        ("get_logits_probabilities_log_probs", states, ["logits", "probabilities"]),
        expected_outputs=dict(
            logits=expected_action_layer_outputs,
            probabilities=expected_probabilities_output
        ),
        decimals=5
    )

    print("Probs: {}".format(expected_probabilities_output))

    expected_actions = dict(
        a=np.argmax(expected_action_layer_outputs["a"], axis=-1),
        b=np.argmax(expected_action_layer_outputs["b"], axis=-1)
    )
    test.test(("get_action", states), expected_outputs=dict(action=expected_actions))

    # Stochastic sample.
    out = test.test(("get_stochastic_action", states), expected_outputs=None)  # dict(action=expected_actions))
    self.assertTrue(out["action"]["a"].dtype == np.int32)
    self.assertTrue(out["action"]["a"].shape == (2,))
    self.assertTrue(out["action"]["b"].dtype == np.int32)
    self.assertTrue(out["action"]["b"].shape == (2,))

    # Deterministic sample.
    test.test(("get_deterministic_action", states), expected_outputs=None)  # dict(action=expected_actions))
    self.assertTrue(out["action"]["a"].dtype == np.int32)
    self.assertTrue(out["action"]["a"].shape == (2,))
    self.assertTrue(out["action"]["b"].dtype == np.int32)
    self.assertTrue(out["action"]["b"].shape == (2,))

    # Distribution's entropy.
    out = test.test(("get_entropy", states), expected_outputs=None)  # dict(entropy=expected_h), decimals=3)
    self.assertTrue(out["entropy"]["a"].dtype == np.float32)
    self.assertTrue(out["entropy"]["a"].shape == (2,))
    self.assertTrue(out["entropy"]["b"].dtype == np.float32)
    self.assertTrue(out["entropy"]["b"].shape == (2,))

    # Action log-probs.
    expected_action_log_prob_output = dict(
        a=np.log(np.array([
            expected_probabilities_output["a"][0][expected_actions["a"][0]],
            expected_probabilities_output["a"][1][expected_actions["a"][1]]
        ])),
        b=np.log(np.array([
            expected_probabilities_output["b"][0][expected_actions["b"][0]],
            expected_probabilities_output["b"][1][expected_actions["b"][1]]
        ])),
    )
    test.test(
        ("get_action_log_probs", [states, expected_actions]),
        expected_outputs=dict(
            action_log_probs=expected_action_log_prob_output,
            logits=expected_action_layer_outputs
        ),
        decimals=5
    )
def test_policy_for_discrete_container_action_space(self):
    # state_space.
    state_space = FloatBox(shape=(4,), add_batch_rank=True)
    # Container action space.
    action_space = dict(type="dict", a=BoolBox(), b=IntBox(3), add_batch_rank=True)

    policy = Policy(network_spec=config_from_path("configs/test_simple_nn.json"), action_space=action_space)
    test = ComponentTest(
        component=policy,
        input_spaces=dict(nn_inputs=state_space, actions=action_space),
        action_space=action_space
    )
    policy_params = test.read_variable_values(policy.variable_registry)

    # Some NN inputs (batch size=32).
    batch_size = 32
    states = state_space.sample(batch_size)

    # Raw NN-output.
    expected_nn_output = np.matmul(states, policy_params["policy/test-network/hidden-layer/dense/kernel"])
    test.test(("get_nn_outputs", states), expected_outputs=expected_nn_output, decimals=6)

    # Raw action layers' output.
    expected_action_layer_outputs = dict(
        a=np.squeeze(np.matmul(
            expected_nn_output,
            policy_params["policy/action-adapter-0/action-network/action-layer/dense/kernel"]
        )),
        b=np.matmul(
            expected_nn_output,
            policy_params["policy/action-adapter-1/action-network/action-layer/dense/kernel"]
        )
    )
    test.test(
        ("get_adapter_outputs", states),
        expected_outputs=dict(adapter_outputs=expected_action_layer_outputs, nn_outputs=expected_nn_output),
        decimals=5
    )

    # Logits, parameters (probs) and skip log-probs (numerically unstable for small probs).
    expected_probs_output = dict(
        a=np.array(sigmoid(expected_action_layer_outputs["a"]), dtype=np.float32),
        b=np.array(softmax(expected_action_layer_outputs["b"], axis=-1), dtype=np.float32)
    )
    test.test(
        ("get_adapter_outputs_and_parameters", states, ["adapter_outputs", "parameters"]),
        expected_outputs=dict(
            adapter_outputs=expected_action_layer_outputs,
            parameters=dict(a=expected_probs_output["a"], b=expected_action_layer_outputs["b"])
        ),
        decimals=5
    )

    print("Probs: {}".format(expected_probs_output))

    expected_actions = dict(
        a=expected_probs_output["a"] > 0.5,
        b=np.argmax(expected_action_layer_outputs["b"], axis=-1)
    )
    test.test(("get_action", states, ["action"]), expected_outputs=dict(action=expected_actions))

    out = test.test(("get_action_and_log_likelihood", states))
    action = out["action"]
    llh = out["log_likelihood"]

    # Action log-likelihood (sum of the composite llhs).
    expected_action_llh_output = \
        np.log(np.array([
            expected_probs_output["a"][i] if action["a"][i] else 1.0 - expected_probs_output["a"][i]
            for i in range(batch_size)
        ])) + \
        np.log(np.array([
            expected_probs_output["b"][i][action["b"][i]] for i in range(batch_size)
        ]))
    test.test(
        ("get_log_likelihood", [states, action]),
        expected_outputs=dict(
            log_likelihood=expected_action_llh_output,
            adapter_outputs=expected_action_layer_outputs
        ),
        decimals=5
    )
    recursive_assert_almost_equal(expected_action_llh_output, llh, decimals=5)

    # Stochastic sample.
    out = test.test(("get_stochastic_action", states), expected_outputs=None)  # dict(action=expected_actions))
    self.assertTrue(out["action"]["a"].dtype == np.bool_)
    self.assertTrue(out["action"]["a"].shape == (batch_size,))
    self.assertTrue(out["action"]["b"].dtype == np.int32)
    self.assertTrue(out["action"]["b"].shape == (batch_size,))

    # Deterministic sample.
    test.test(("get_deterministic_action", states), expected_outputs=None)  # dict(action=expected_actions))
    self.assertTrue(out["action"]["a"].dtype == np.bool_)
    self.assertTrue(out["action"]["a"].shape == (batch_size,))
    self.assertTrue(out["action"]["b"].dtype == np.int32)
    self.assertTrue(out["action"]["b"].shape == (batch_size,))

    # Distribution's entropy.
    out = test.test(("get_entropy", states), expected_outputs=None)  # dict(entropy=expected_h), decimals=3)
    self.assertTrue(out["entropy"]["a"].dtype == np.float32)
    self.assertTrue(out["entropy"]["a"].shape == (batch_size,))
    self.assertTrue(out["entropy"]["b"].dtype == np.float32)
    self.assertTrue(out["entropy"]["b"].shape == (batch_size,))
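# --- Illustrative sketch (not part of the test suite) -----------------------
# For container (Dict) action spaces, the expectations above treat the
# components as independent, so the joint log-likelihood is simply the sum of
# the per-component log-likelihoods (Bernoulli for "a", categorical for "b").
# `probs_a` / `probs_b` stand in for the per-component distribution parameters
# computed in the test; all names are illustrative.
def _container_log_llh_sketch(probs_a, probs_b, actions_a, actions_b):
    import numpy as np

    # Bernoulli component: log(p) if True, log(1 - p) if False.
    llh_a = np.where(actions_a, np.log(probs_a), np.log(1.0 - probs_a))
    # Categorical component: pick each sample's probability of its chosen class.
    llh_b = np.log(probs_b[np.arange(len(actions_b)), actions_b])
    return llh_a + llh_b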
def test_policy_for_bounded_continuous_action_space(self):
    """
    https://github.com/rlgraph/rlgraph/issues/43
    """
    nn_input_space = FloatBox(shape=(4,), add_batch_rank=True)
    action_space = FloatBox(low=-1.0, high=1.0, shape=(1,), add_batch_rank=True)
    # Double the shape for alpha/beta params.
    action_space_parameters = FloatBox(shape=(2,), add_batch_rank=True)

    policy = Policy(network_spec=config_from_path("configs/test_simple_nn.json"), action_space=action_space)
    test = ComponentTest(
        component=policy,
        input_spaces=dict(
            nn_input=nn_input_space,
            actions=action_space,
            logits=FloatBox(shape=(1,), add_batch_rank=True),
            probabilities=FloatBox(add_batch_rank=True),
            parameters=action_space_parameters,
        ),
        action_space=action_space
    )
    policy_params = test.read_variable_values(policy.variables)

    # Some NN inputs.
    nn_input = nn_input_space.sample(size=3)

    # Raw NN-output.
    expected_nn_output = np.matmul(nn_input, policy_params["policy/test-network/hidden-layer/dense/kernel"])
    test.test(("get_nn_output", nn_input), expected_outputs=dict(output=expected_nn_output))

    # Raw action layer output.
    expected_raw_logits = np.matmul(
        expected_nn_output,
        policy_params["policy/action-adapter-0/action-network/action-layer/dense/kernel"]
    )
    test.test(("get_action_layer_output", nn_input), expected_outputs=dict(output=expected_raw_logits), decimals=5)

    # Parameter (alpha/betas).
    expected_parameters_output = np.log(np.exp(expected_raw_logits) + 1.0) + 1.0
    test.test(
        ("get_logits_parameters_log_probs", nn_input, ["logits", "parameters"]),
        expected_outputs=dict(logits=expected_raw_logits, parameters=expected_parameters_output),
        decimals=5
    )

    print("Params: {}".format(expected_parameters_output))

    actions = test.test(("get_action", nn_input))["action"]
    self.assertTrue(actions.dtype == np.float32)
    self.assertGreaterEqual(actions.min(), -1.0)
    self.assertLessEqual(actions.max(), 1.0)
    self.assertTrue(actions.shape == (3, 1))

    # Action log-probs.
    actions_scaled_back = (actions + 1.0) / 2.0
    expected_action_log_prob_output = np.log(
        beta.pdf(actions_scaled_back, expected_parameters_output[:, 1], expected_parameters_output[:, 0])
    )
    expected_action_log_prob_output = np.array([
        [expected_action_log_prob_output[0][0]],
        [expected_action_log_prob_output[1][1]],
        [expected_action_log_prob_output[2][2]]
    ])
    test.test(
        ("get_action_log_probs", [nn_input, actions]),
        expected_outputs=dict(action_log_probs=expected_action_log_prob_output, logits=expected_raw_logits),
        decimals=5
    )

    # Stochastic sample.
    actions = test.test(("get_stochastic_action", nn_input))["action"]
    self.assertTrue(actions.dtype == np.float32)
    self.assertGreaterEqual(actions.min(), -1.0)
    self.assertLessEqual(actions.max(), 1.0)
    self.assertTrue(actions.shape == (3, 1))

    # Deterministic sample.
    actions = test.test(("get_deterministic_action", nn_input))["action"]
    self.assertTrue(actions.dtype == np.float32)
    self.assertGreaterEqual(actions.min(), -1.0)
    self.assertLessEqual(actions.max(), 1.0)
    self.assertTrue(actions.shape == (3, 1))

    # Distribution's entropy.
    entropy = test.test(("get_entropy", nn_input))["entropy"]
    self.assertTrue(entropy.dtype == np.float32)
    self.assertTrue(entropy.shape == (3, 1))
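# --- Illustrative sketch (not part of the test suite) -----------------------
# Why the diagonal extraction above is needed: passing a (3, 1) action array
# together with (3,)-shaped alpha/beta vectors to scipy's beta.pdf broadcasts
# to a (3, 3) matrix whose entry [i, j] is the density of action i under the
# parameters of sample j. Only the diagonal (i == j) pairs each action with its
# own parameters. Names are illustrative.
def _diagonal_llh_sketch(actions_scaled, alphas, betas):
    import numpy as np
    from scipy.stats import beta as beta_dist

    dense = beta_dist.pdf(actions_scaled, alphas, betas)  # shape (3, 3) via broadcasting
    per_sample = np.diagonal(dense).reshape(-1, 1)        # shape (3, 1)
    return np.log(per_sample)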