def test_shared_value_function_policy_for_discrete_container_action_space(self):
    # state_space (NN is a simple single fc-layer relu network (2 units), random biases, random weights).
    state_space = FloatBox(shape=(5,), add_batch_rank=True)

    # action_space (complex nested container action space).
    action_space = dict(
        type="dict",
        a=IntBox(2),
        b=Dict(b1=IntBox(3), b2=IntBox(4)),
        add_batch_rank=True
    )
    # flat_float_action_space = dict(
    #     type="dict",
    #     a=FloatBox(shape=(2,)),
    #     b=Dict(b1=FloatBox(shape=(3,)), b2=FloatBox(shape=(4,))),
    #     add_batch_rank=True
    # )

    # Policy with baseline action adapter.
    shared_value_function_policy = SharedValueFunctionPolicy(
        network_spec=config_from_path("configs/test_lrelu_nn.json"),
        action_space=action_space
    )
    test = ComponentTest(
        component=shared_value_function_policy,
        input_spaces=dict(
            nn_inputs=state_space,
            actions=action_space,
        ),
        action_space=action_space,
    )
    policy_params = test.read_variable_values(shared_value_function_policy.variable_registry)
    base_scope = "shared-value-function-policy/action-adapter-"

    # Some NN inputs (batch size=2).
    states = state_space.sample(size=2)

    # Raw NN-output.
    expected_nn_output = relu(np.matmul(
        states,
        policy_params["shared-value-function-policy/test-network/hidden-layer/dense/kernel"]
    ), 0.1)
    test.test(("get_nn_outputs", states), expected_outputs=expected_nn_output, decimals=5)

    # Raw action layers' outputs.
    expected_action_layer_outputs = dict(
        a=np.matmul(
            expected_nn_output,
            policy_params[base_scope + "0/action-network/action-layer/dense/kernel"]
        ),
        b=dict(
            b1=np.matmul(
                expected_nn_output,
                policy_params[base_scope + "1/action-network/action-layer/dense/kernel"]
            ),
            b2=np.matmul(
                expected_nn_output,
                policy_params[base_scope + "2/action-network/action-layer/dense/kernel"]
            )
        )
    )
    test.test(
        ("get_adapter_outputs", states),
        expected_outputs=dict(
            adapter_outputs=expected_action_layer_outputs,
            nn_outputs=expected_nn_output
        ),
        decimals=5
    )

    # State-values.
    expected_state_value_output = np.matmul(
        expected_nn_output,
        policy_params["shared-value-function-policy/value-function-node/dense-layer/dense/kernel"]
    )
    test.test(
        ("get_state_values", states, ["state_values"]),
        expected_outputs=dict(state_values=expected_state_value_output),
        decimals=5
    )

    # Logits: one for each action choice per item in the batch (simply the remaining output nodes).
    test.test(
        ("get_state_values_adapter_outputs_and_parameters", states, ["state_values", "adapter_outputs"]),
        expected_outputs=dict(
            state_values=expected_state_value_output,
            adapter_outputs=expected_action_layer_outputs
        ),
        decimals=5
    )

    # Parameters: for categorical distributions these are the raw logits themselves.
    # The softmaxed probabilities are still computed here for the log-likelihood checks below.
    expected_probs_output = dict(
        a=softmax(expected_action_layer_outputs["a"], axis=-1),
        b=dict(
            b1=softmax(expected_action_layer_outputs["b"]["b1"], axis=-1),
            b2=softmax(expected_action_layer_outputs["b"]["b2"], axis=-1)
        )
    )
    test.test(
        ("get_adapter_outputs_and_parameters", states, ["adapter_outputs", "parameters"]),
        expected_outputs=dict(
            adapter_outputs=expected_action_layer_outputs,
            parameters=expected_action_layer_outputs
        ),
        decimals=5
    )
    print("Probs: {}".format(expected_probs_output))

    # Action sample.
    expected_actions = dict(
        a=np.argmax(expected_action_layer_outputs["a"], axis=-1),
        b=dict(
            b1=np.argmax(expected_action_layer_outputs["b"]["b1"], axis=-1),
            b2=np.argmax(expected_action_layer_outputs["b"]["b2"], axis=-1)
        )
    )
    test.test(("get_action", states, ["action"]), expected_outputs=dict(action=expected_actions))

    out = test.test(("get_action_and_log_likelihood", states))
    action = out["action"]
    llh = out["log_likelihood"]

    # Action log-likelihood: sum of the log-likelihoods of all container components.
    expected_action_llh_output = \
        np.log(np.array([
            expected_probs_output["a"][0][action["a"][0]],
            expected_probs_output["a"][1][action["a"][1]]
        ])) + \
        np.log(np.array([
            expected_probs_output["b"]["b1"][0][action["b"]["b1"][0]],
            expected_probs_output["b"]["b1"][1][action["b"]["b1"][1]]
        ])) + \
        np.log(np.array([
            expected_probs_output["b"]["b2"][0][action["b"]["b2"][0]],
            expected_probs_output["b"]["b2"][1][action["b"]["b2"][1]]
        ]))
    test.test(
        ("get_log_likelihood", [states, action]),
        expected_outputs=dict(
            log_likelihood=expected_action_llh_output,
            adapter_outputs=expected_action_layer_outputs
        ),
        decimals=5
    )
    recursive_assert_almost_equal(expected_action_llh_output, llh, decimals=5)

    # Stochastic sample.
    out = test.test(("get_stochastic_action", states), expected_outputs=None)["action"]
    self.assertTrue(out["a"].dtype == np.int32)
    self.assertTrue(out["a"].shape == (2,))
    self.assertTrue(out["b"]["b1"].dtype == np.int32)
    self.assertTrue(out["b"]["b1"].shape == (2,))
    self.assertTrue(out["b"]["b2"].dtype == np.int32)
    self.assertTrue(out["b"]["b2"].shape == (2,))

    # Deterministic sample.
    out = test.test(("get_deterministic_action", states), expected_outputs=None)["action"]
    self.assertTrue(out["a"].dtype == np.int32)
    self.assertTrue(out["a"].shape == (2,))
    self.assertTrue(out["b"]["b1"].dtype == np.int32)
    self.assertTrue(out["b"]["b1"].shape == (2,))
    self.assertTrue(out["b"]["b2"].dtype == np.int32)
    self.assertTrue(out["b"]["b2"].shape == (2,))

    # Distribution's entropy.
    out = test.test(("get_entropy", states), expected_outputs=None)["entropy"]
    self.assertTrue(out["a"].dtype == np.float32)
    self.assertTrue(out["a"].shape == (2,))
    self.assertTrue(out["b"]["b1"].dtype == np.float32)
    self.assertTrue(out["b"]["b1"].shape == (2,))
    self.assertTrue(out["b"]["b2"].dtype == np.float32)
    self.assertTrue(out["b"]["b2"].shape == (2,))
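# --- Illustrative sketch (not part of the original test file). The expected
# log-likelihood above is the sum of the per-component categorical
# log-likelihoods of the Dict action. A minimal NumPy version for a single
# batch item, assuming `probs` maps flattened component names to softmaxed
# probability vectors:
def _container_log_likelihood_sketch(probs, action):
    import numpy as np
    # probs: e.g. {"a": np.array([...]), "b1": np.array([...]), "b2": np.array([...])}
    # action: dict of int category indices with the same keys.
    return sum(np.log(probs[key][action[key]]) for key in probs)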
def test_policy_for_discrete_container_action_space(self):
    # state_space.
    state_space = FloatBox(shape=(4,), add_batch_rank=True)

    # Container action space.
    action_space = dict(type="dict", a=BoolBox(), b=IntBox(3), add_batch_rank=True)

    policy = Policy(
        network_spec=config_from_path("configs/test_simple_nn.json"),
        action_space=action_space
    )
    test = ComponentTest(
        component=policy,
        input_spaces=dict(
            nn_inputs=state_space,
            actions=action_space,
        ),
        action_space=action_space
    )
    policy_params = test.read_variable_values(policy.variable_registry)

    # Some NN inputs (batch size=32).
    batch_size = 32
    states = state_space.sample(batch_size)

    # Raw NN-output.
    expected_nn_output = np.matmul(
        states, policy_params["policy/test-network/hidden-layer/dense/kernel"]
    )
    test.test(("get_nn_outputs", states), expected_outputs=expected_nn_output, decimals=6)

    # Raw action layers' outputs.
    expected_action_layer_outputs = dict(
        a=np.squeeze(np.matmul(
            expected_nn_output,
            policy_params["policy/action-adapter-0/action-network/action-layer/dense/kernel"]
        )),
        b=np.matmul(
            expected_nn_output,
            policy_params["policy/action-adapter-1/action-network/action-layer/dense/kernel"]
        )
    )
    test.test(
        ("get_adapter_outputs", states),
        expected_outputs=dict(
            adapter_outputs=expected_action_layer_outputs,
            nn_outputs=expected_nn_output
        ),
        decimals=5
    )

    # Parameters: sigmoided logits for the Bernoulli component ("a"), raw logits for the
    # categorical component ("b"). Skip log-probs (numerically unstable for small probs).
    expected_probs_output = dict(
        a=np.array(sigmoid(expected_action_layer_outputs["a"]), dtype=np.float32),
        b=np.array(softmax(expected_action_layer_outputs["b"], axis=-1), dtype=np.float32)
    )
    test.test(
        ("get_adapter_outputs_and_parameters", states, ["adapter_outputs", "parameters"]),
        expected_outputs=dict(
            adapter_outputs=expected_action_layer_outputs,
            parameters=dict(a=expected_probs_output["a"], b=expected_action_layer_outputs["b"])
        ),
        decimals=5
    )
    print("Probs: {}".format(expected_probs_output))

    expected_actions = dict(
        a=expected_probs_output["a"] > 0.5,
        b=np.argmax(expected_action_layer_outputs["b"], axis=-1)
    )
    test.test(("get_action", states, ["action"]), expected_outputs=dict(action=expected_actions))

    out = test.test(("get_action_and_log_likelihood", states))
    action = out["action"]
    llh = out["log_likelihood"]

    # Action log-likelihood (sum of the composite llhs).
    expected_action_llh_output = \
        np.log(np.array([
            expected_probs_output["a"][i] if action["a"][i] else 1.0 - expected_probs_output["a"][i]
            for i in range(batch_size)
        ])) + \
        np.log(np.array([
            expected_probs_output["b"][i][action["b"][i]] for i in range(batch_size)
        ]))
    test.test(
        ("get_log_likelihood", [states, action]),
        expected_outputs=dict(
            log_likelihood=expected_action_llh_output,
            adapter_outputs=expected_action_layer_outputs
        ),
        decimals=5
    )
    recursive_assert_almost_equal(expected_action_llh_output, llh, decimals=5)

    # Stochastic sample.
    out = test.test(("get_stochastic_action", states), expected_outputs=None)  # dict(action=expected_actions)
    self.assertTrue(out["action"]["a"].dtype == np.bool_)
    self.assertTrue(out["action"]["a"].shape == (batch_size,))
    self.assertTrue(out["action"]["b"].dtype == np.int32)
    self.assertTrue(out["action"]["b"].shape == (batch_size,))

    # Deterministic sample.
    out = test.test(("get_deterministic_action", states), expected_outputs=None)  # dict(action=expected_actions)
    self.assertTrue(out["action"]["a"].dtype == np.bool_)
    self.assertTrue(out["action"]["a"].shape == (batch_size,))
    self.assertTrue(out["action"]["b"].dtype == np.int32)
    self.assertTrue(out["action"]["b"].shape == (batch_size,))

    # Distribution's entropy.
    out = test.test(("get_entropy", states), expected_outputs=None)  # dict(entropy=expected_h), decimals=3
    self.assertTrue(out["entropy"]["a"].dtype == np.float32)
    self.assertTrue(out["entropy"]["a"].shape == (batch_size,))
    self.assertTrue(out["entropy"]["b"].dtype == np.float32)
    self.assertTrue(out["entropy"]["b"].shape == (batch_size,))
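# --- Illustrative sketch (not part of the original test file). For the BoolBox
# component above, the Bernoulli log-likelihood term is log(p) for a True
# action and log(1 - p) for a False one, which is what the list comprehension
# in expected_action_llh_output spells out element-wise:
def _bernoulli_log_likelihood_sketch(p, action):
    import numpy as np
    p = np.asarray(p, dtype=np.float64)
    action = np.asarray(action, dtype=np.bool_)
    return np.log(np.where(action, p, 1.0 - p))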
def test_sac_agent_component_on_fake_env(self):
    config = config_from_path("configs/sac_component_for_fake_env_test.json")

    # Arbitrary state space; the state is not used in this example.
    state_space = FloatBox(shape=(2,))
    continuous_action_space = FloatBox(low=-1.0, high=1.0)
    terminal_space = BoolBox(add_batch_rank=True)

    policy = Policy.from_spec(config["policy"], action_space=continuous_action_space)
    policy.add_components(Synchronizable(), expose_apis="sync")
    q_function = ValueFunction.from_spec(config["value_function"])

    agent_component = SACAgentComponent(
        agent=None,
        policy=policy,
        q_function=q_function,
        preprocessor=PreprocessorStack.from_spec([]),
        memory=ReplayMemory.from_spec(config["memory"]),
        discount=config["discount"],
        initial_alpha=config["initial_alpha"],
        target_entropy=None,
        optimizer=AdamOptimizer.from_spec(config["optimizer"]),
        vf_optimizer=AdamOptimizer.from_spec(config["value_function_optimizer"], scope="vf-optimizer"),
        alpha_optimizer=None,
        q_sync_spec=SyncSpecification(sync_interval=10, sync_tau=1.0),
        num_q_functions=2
    )

    test = ComponentTest(
        component=agent_component,
        input_spaces=dict(
            states=state_space.with_batch_rank(),
            preprocessed_states=state_space.with_batch_rank(),
            actions=continuous_action_space.with_batch_rank(),
            rewards=FloatBox(add_batch_rank=True),
            next_states=state_space.with_batch_rank(),
            terminals=terminal_space,
            batch_size=int,
            preprocessed_s_prime=state_space.with_batch_rank(),
            importance_weights=FloatBox(add_batch_rank=True),
            preprocessed_next_states=state_space.with_batch_rank(),
            deterministic=bool,
            weights="variables:{}".format(policy.scope),
            # TODO: how to provide the space for multiple component variables?
            # q_weights=Dict(
            #     q_0="variables:{}".format(q_function.scope),
            #     q_1="variables:{}".format(agent_component._q_functions[1].scope),
            # )
        ),
        action_space=continuous_action_space,
        build_kwargs=dict(
            optimizer=agent_component._optimizer,
            build_options=dict(
                vf_optimizer=agent_component.vf_optimizer,
            ),
        )
    )

    policy_loss = []
    vf_loss = []
    # This test simulates an env that always requires actions to be close to the max-pdf
    # value of a loc=0.5, scale=0.2 normal, regardless of any state inputs.
    # The component should learn to produce actions like that (close to 0.5).
    true_mean = 0.5
    target_dist = stats.norm(loc=true_mean, scale=0.2)
    batch_size = 100
    for _ in range(5000):
        action_sample = continuous_action_space.sample(batch_size)
        rewards = target_dist.pdf(action_sample)
        result = test.test(("update_from_external_batch", [
            state_space.sample(batch_size),
            action_sample,
            rewards,
            [True] * batch_size,
            state_space.sample(batch_size),
            [1.0] * batch_size  # importance
        ]))
        policy_loss.append(result["actor_loss"])
        vf_loss.append(result["critic_loss"])

    # Both losses should have decreased over the course of training.
    self.assertTrue(np.mean(policy_loss[:100]) > np.mean(policy_loss[-100:]))
    self.assertTrue(np.mean(vf_loss[:100]) > np.mean(vf_loss[-100:]))

    # The learned q-values should roughly match the reward function (the target pdf).
    action_sample = np.linspace(-1, 1, batch_size)
    q_values = test.test(("get_q_values", [state_space.sample(batch_size), action_sample]))
    for q_val in q_values:
        q_val = q_val.flatten()
        np.testing.assert_allclose(q_val, target_dist.pdf(action_sample), atol=0.2)

    # The learned policy should produce actions close to the target mean.
    action_sample, _ = test.test(("action_from_preprocessed_state", [state_space.sample(batch_size), False]))
    action_sample = action_sample.flatten()
    np.testing.assert_allclose(np.mean(action_sample), true_mean, atol=0.1)
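# --- Illustrative sketch (not part of the original test file). The fake env in
# the test above rewards actions by the pdf of a Normal(loc=0.5, scale=0.2),
# independent of state, so the reward-maximizing action is the mode at 0.5:
def _fake_env_reward_sketch(actions):
    from scipy import stats
    return stats.norm(loc=0.5, scale=0.2).pdf(actions)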
def test_metrics(self):
    """
    Tests metric collection for 1 and multiple environments.
    """
    agent_config = config_from_path("configs/apex_agent_cartpole.json")
    ray_spec = agent_config["execution_spec"].pop("ray_spec")
    ray_spec["worker_spec"]["worker_sample_size"] = 100
    worker_spec = ray_spec["worker_spec"]
    worker = RayValueWorker.as_remote().remote(
        agent_config, ray_spec["worker_spec"], self.env_spec, auto_build=True
    )

    print("Testing statistics for 1 environment:")
    # Run for a while:
    task = worker.execute_and_get_timesteps.remote(100, break_on_terminal=False)
    sleep(1)
    # Include a transition between the calls.
    task = worker.execute_and_get_timesteps.remote(100, break_on_terminal=False)
    sleep(1)

    # Retrieve result.
    result = ray.get(task)
    print("Task results:")
    print(result.get_metrics())

    # Get worker metrics.
    task = worker.get_workload_statistics.remote()
    result = ray.get(task)
    print("Worker statistics:")
    # In CartPole, num timesteps == reward -> both must be the same.
    print("Cartpole episode rewards: {}".format(result["episode_rewards"]))
    print("Cartpole episode timesteps: {}".format(result["episode_timesteps"]))
    recursive_assert_almost_equal(result["episode_rewards"], result["episode_timesteps"])

    # Now repeat this, but for multiple environments.
    print("Testing statistics for 4 environments:")
    worker_spec["num_worker_environments"] = 4
    worker_spec["num_background_environments"] = 2
    worker = RayValueWorker.as_remote().remote(
        agent_config, ray_spec["worker_spec"], self.env_spec, auto_build=True
    )

    task = worker.execute_and_get_timesteps.remote(100, break_on_terminal=False)
    sleep(1)
    result = ray.get(task)
    task = worker.execute_and_get_timesteps.remote(100, break_on_terminal=False)
    sleep(1)
    result = ray.get(task)

    task = worker.get_workload_statistics.remote()
    result = ray.get(task)
    print("Multi-env statistics:")
    print("Cartpole episode rewards: {}".format(result["episode_rewards"]))
    print("Cartpole episode timesteps: {}".format(result["episode_timesteps"]))
    recursive_assert_almost_equal(result["episode_rewards"], result["episode_timesteps"])
def test_shared_value_function_policy_for_discrete_action_space_with_time_rank_folding(self):
    # state_space (NN is a simple single fc-layer relu network (2 units), random biases, random weights).
    state_space = FloatBox(shape=(3,), add_batch_rank=True, add_time_rank=True)

    # action_space (4 possible actions).
    action_space = IntBox(4, add_batch_rank=True, add_time_rank=True)
    flat_float_action_space = FloatBox(shape=(4,), add_batch_rank=True, add_time_rank=True)

    # Policy with baseline action adapter AND batch-apply over the entire policy (NN + ActionAdapter + distr.).
    network_spec = config_from_path("configs/test_lrelu_nn.json")
    # Add folding to the network.
    network_spec["fold_time_rank"] = True
    shared_value_function_policy = SharedValueFunctionPolicy(
        network_spec=network_spec,
        action_adapter_spec=dict(unfold_time_rank=True),
        action_space=action_space,
        value_unfold_time_rank=True
    )
    test = ComponentTest(
        component=shared_value_function_policy,
        input_spaces=dict(
            nn_input=state_space,
            actions=action_space,
            probabilities=flat_float_action_space,
            parameters=flat_float_action_space,
            logits=flat_float_action_space
        ),
        action_space=action_space,
    )
    policy_params = test.read_variable_values(shared_value_function_policy.variables)

    # Some NN inputs.
    states = state_space.sample(size=(2, 3))
    states_folded = np.reshape(states, newshape=(6, 3))

    # Raw NN-output (3 hidden nodes), still time-rank-folded: shape=(6, 3).
    expected_nn_output = relu(np.matmul(
        states_folded,
        policy_params["shared-value-function-policy/test-network/hidden-layer/dense/kernel"]
    ), 0.1)
    test.test(("get_nn_output", states), expected_outputs=dict(output=expected_nn_output), decimals=5)

    # Raw action layer output. Expected shape=(6, 4) while folded (6=batch*time,
    # 4=action categories), then unfolded by the action adapter to (2, 3, 4).
    expected_action_layer_output = np.matmul(
        expected_nn_output,
        policy_params["shared-value-function-policy/action-adapter-0/action-network/"
                      "action-layer/dense/kernel"]
    )
    expected_action_layer_output = np.reshape(expected_action_layer_output, newshape=(2, 3, 4))
    test.test(
        ("get_action_layer_output", states),
        expected_outputs=dict(output=expected_action_layer_output),
        decimals=5
    )

    # State-values: one for each item in the batch.
    expected_state_value_output = np.matmul(
        expected_nn_output,
        policy_params["shared-value-function-policy/value-function-node/dense-layer/dense/kernel"]
    )
    expected_state_value_output_unfolded = np.reshape(expected_state_value_output, newshape=(2, 3, 1))
    test.test(
        ("get_state_values", states),
        expected_outputs=dict(state_values=expected_state_value_output_unfolded),
        decimals=5
    )

    expected_action_layer_output_unfolded = np.reshape(expected_action_layer_output, newshape=(2, 3, 4))
    test.test(
        ("get_state_values_logits_probabilities_log_probs", states, ["state_values", "logits"]),
        expected_outputs=dict(
            state_values=expected_state_value_output_unfolded,
            logits=expected_action_layer_output_unfolded
        ),
        decimals=5
    )

    # Parameters (probabilities): softmaxed logits.
    expected_probabilities_output = softmax(expected_action_layer_output_unfolded, axis=-1)
    test.test(
        ("get_logits_probabilities_log_probs", states, ["logits", "probabilities"]),
        expected_outputs=dict(
            logits=expected_action_layer_output_unfolded,
            probabilities=expected_probabilities_output
        ),
        decimals=5
    )
    print("Probs: {}".format(expected_probabilities_output))

    expected_actions = np.argmax(expected_action_layer_output_unfolded, axis=-1)
    test.test(("get_action", states), expected_outputs=dict(action=expected_actions))

    # Action log-probs.
    expected_action_log_prob_output = np.log(np.array([
        [
            expected_probabilities_output[0][0][expected_actions[0][0]],
            expected_probabilities_output[0][1][expected_actions[0][1]],
            expected_probabilities_output[0][2][expected_actions[0][2]],
        ],
        [
            expected_probabilities_output[1][0][expected_actions[1][0]],
            expected_probabilities_output[1][1][expected_actions[1][1]],
            expected_probabilities_output[1][2][expected_actions[1][2]],
        ]
    ]))
    test.test(
        ("get_action_log_probs", [states, expected_actions]),
        expected_outputs=dict(
            action_log_probs=expected_action_log_prob_output,
            logits=expected_action_layer_output_unfolded
        ),
        decimals=5
    )

    # Deterministic sample.
    out = test.test(("get_deterministic_action", states), expected_outputs=None)
    self.assertTrue(out["action"].dtype == np.int32)
    self.assertTrue(out["action"].shape == (2, 3))  # Make sure output is unfolded.

    # Stochastic sample.
    out = test.test(("get_stochastic_action", states), expected_outputs=None)
    self.assertTrue(out["action"].dtype == np.int32)
    self.assertTrue(out["action"].shape == (2, 3))  # Make sure output is unfolded.

    # Distribution's entropy.
    out = test.test(("get_entropy", states), expected_outputs=None)
    self.assertTrue(out["entropy"].dtype == np.float32)
    self.assertTrue(out["entropy"].shape == (2, 3))  # Make sure output is unfolded.
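# --- Illustrative sketch (not part of the original test file). Time-rank
# folding merges the batch and time ranks so that plain dense ops can be
# applied, and unfolding restores them afterwards; the np.reshape calls in the
# test above emulate exactly this:
def _fold_unfold_sketch(x):
    import numpy as np
    batch, time = x.shape[0], x.shape[1]
    folded = np.reshape(x, (batch * time,) + x.shape[2:])            # (B*T, ...)
    unfolded = np.reshape(folded, (batch, time) + folded.shape[1:])  # (B, T, ...)
    return folded, unfolded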
def test_environment_stepper_on_2x2_grid_world(self):
    preprocessor_spec = [dict(
        type="reshape",
        flatten=True,
        flatten_categories=self.grid_world_2x2_action_space.num_categories
    )]
    network_spec = config_from_path("configs/test_simple_nn.json")
    # Try to find a NN that outputs greedy actions: down in the start state and right in
    # state=1 (to reach the goal).
    network_spec["layers"][0]["weights_spec"] = [
        [0.5, -0.5], [-0.1, 0.1], [-0.2, 0.2], [-0.4, 0.2]
    ]
    network_spec["layers"][0]["biases_spec"] = False
    exploration_spec = None
    actor_component = ActorComponent(
        preprocessor_spec,
        dict(
            network_spec=network_spec,
            action_adapter_spec=dict(
                weights_spec=[[0.1, -0.5, 0.5, 0.1], [0.4, 0.2, -0.2, 0.2]],
                biases_spec=False
            ),
            action_space=self.grid_world_2x2_action_space,
            deterministic=True
        ),
        exploration_spec
    )
    environment_stepper = EnvironmentStepper(
        environment_spec=dict(type="grid_world", world="2x2"),
        actor_component_spec=actor_component,
        state_space=self.grid_world_2x2_state_space,
        reward_space="float32",
        add_action_probs=True,
        action_probs_space=self.grid_world_2x2_action_probs_space,
        num_steps=5
    )

    test = ComponentTest(
        component=environment_stepper,
        action_space=self.grid_world_2x2_action_space,
    )

    # Step 5 times through the Env and collect results.
    expected = (
        np.array([False, True, False, True, False]),  # t_
        np.array([0, 1, 0, 1, 0, 1]),  # s' (raw); num_steps+1 entries due to state stitching.
        np.array([
            [0.21869287, 0.17905058, 0.36056358, 0.24169299],
            [0.2547221, 0.2651175, 0.23048209, 0.24967825],
            [0.21869287, 0.17905058, 0.36056358, 0.24169299],
            [0.2547221, 0.2651175, 0.23048209, 0.24967825],
            [0.21869287, 0.17905058, 0.36056358, 0.24169299]
        ], dtype=np.float32)
    )
    out = test.test("step", expected_outputs=expected, decimals=2)
    print(out)

    # Step again; check whether the stitching of states etc. works.
    expected = (
        np.array([True, False, True, False, True]),  # t_
        np.array([1, 0, 1, 0, 1, 0]),  # s' (raw)
        np.array([
            [0.2547221, 0.2651175, 0.23048209, 0.24967825],
            [0.21869287, 0.17905058, 0.36056358, 0.24169299],
            [0.2547221, 0.2651175, 0.23048209, 0.24967825],
            [0.21869287, 0.17905058, 0.36056358, 0.24169299],
            [0.2547221, 0.2651175, 0.23048209, 0.24967825]
        ], dtype=np.float32)
    )
    out = test.test("step", expected_outputs=expected)
    print(out)

    # Make sure we close the session (to shut down the Env on the server).
    test.terminate()
def test_policy_for_discrete_action_space(self):
    # state_space (NN is a simple single fc-layer relu network (2 units), random biases, random weights).
    state_space = FloatBox(shape=(4,), add_batch_rank=True)

    # action_space (5 possible actions).
    action_space = IntBox(5, add_batch_rank=True)

    policy = Policy(
        network_spec=config_from_path("configs/test_simple_nn.json"),
        action_space=action_space
    )
    test = ComponentTest(
        component=policy,
        input_spaces=dict(nn_input=state_space),
        action_space=action_space
    )
    policy_params = test.read_variable_values(policy.variable_registry)

    # Some NN inputs (4 input nodes, batch size=2).
    states = np.array([[-0.08, 0.4, -0.05, -0.55], [13.0, -14.0, 10.0, -16.0]])

    # Raw NN-output.
    expected_nn_output = np.matmul(
        states, policy_params["policy/test-network/hidden-layer/dense/kernel"]
    )
    test.test(("get_nn_output", states), expected_outputs=expected_nn_output, decimals=6)

    # Raw action layer output. Expected shape=(2, 5): 2=batch, 5=action categories.
    expected_action_layer_output = np.matmul(
        expected_nn_output, policy_params["policy/action-adapter/action-layer/dense/kernel"]
    )
    expected_action_layer_output = np.reshape(expected_action_layer_output, newshape=(2, 5))
    test.test(
        ("get_adapter_outputs", states, ["adapter_outputs"]),
        expected_outputs=dict(adapter_outputs=expected_action_layer_output),
        decimals=5
    )

    expected_actions = np.argmax(expected_action_layer_output, axis=-1)
    test.test(("get_action", states, ["action"]), expected_outputs=dict(action=expected_actions))

    # Logits, parameters (probs) and log-probs.
    expected_probabilities_output = softmax(expected_action_layer_output, axis=-1)
    test.test(
        ("get_adapter_outputs_and_parameters", states, [0, 1, 2]),
        expected_outputs=dict(
            adapter_outputs=expected_action_layer_output,
            parameters=expected_probabilities_output,
            log_probs=np.log(expected_probabilities_output)
        ),
        decimals=5
    )
    print("Probs: {}".format(expected_probabilities_output))

    # Deterministic sample.
    out = test.test(("get_deterministic_action", states), expected_outputs=None)
    self.assertTrue(out["action"].dtype == np.int32)
    self.assertTrue(out["action"].shape == (2,))

    # Stochastic sample.
    out = test.test(("get_stochastic_action", states), expected_outputs=None)
    self.assertTrue(out["action"].dtype == np.int32)
    self.assertTrue(out["action"].shape == (2,))

    # Distribution's entropy.
    out = test.test(("get_entropy", states), expected_outputs=None)
    self.assertTrue(out["entropy"].dtype == np.float32)
    self.assertTrue(out["entropy"].shape == (2,))
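# --- Illustrative sketch (not part of the original test file). The expected
# log-probs above are computed as np.log(softmax(logits)); for very small
# probabilities the numerically stabler route is the shifted log-sum-exp form
# of log-softmax:
def _log_softmax_sketch(logits):
    import numpy as np
    shifted = logits - np.max(logits, axis=-1, keepdims=True)
    return shifted - np.log(np.sum(np.exp(shifted), axis=-1, keepdims=True))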
def test_dqn_functionality(self):
    """
    Creates a DQNAgent and runs it for a few steps in a GridWorld to vigorously test
    all steps of the learning process.
    """
    env = GridWorld(world="2x2", save_mode=True)  # no holes, just fire
    agent = Agent.from_spec(  # type: DQNAgent
        config_from_path("configs/dqn_agent_for_functionality_test.json"),
        double_q=True,
        dueling_q=True,
        state_space=env.state_space,
        action_space=env.action_space,
        store_last_memory_batch=True,
        store_last_q_table=True,
        discount=0.95
    )
    worker = SingleThreadedWorker(
        env_spec=lambda: GridWorld(world="2x2", save_mode=True), agent=agent
    )
    test = AgentTest(worker=worker)

    # Helper python DQNLossFunction object.
    loss_func = DQNLossFunction(backend="python", double_q=True, discount=agent.discount)
    loss_func.when_input_complete(
        input_spaces=dict(
            loss_per_item=[
                spaces.FloatBox(shape=(4,), add_batch_rank=True),
                spaces.IntBox(4, add_batch_rank=True),
                spaces.FloatBox(add_batch_rank=True),
                spaces.BoolBox(add_batch_rank=True),
                spaces.FloatBox(shape=(4,), add_batch_rank=True),
                spaces.FloatBox(shape=(4,), add_batch_rank=True)
            ]
        ),
        action_space=env.action_space
    )

    matrix1_qnet = np.array([[0.9] * 2] * 4)
    matrix2_qnet = np.array([[0.8] * 5] * 2)
    matrix1_target_net = np.array([[0.9] * 2] * 4)
    matrix2_target_net = np.array([[0.8] * 5] * 2)

    a = self._calculate_action(0, matrix1_qnet, matrix2_qnet)

    # 1st step -> Expect insert into python-buffer.
    # action: up (0)
    test.step(1, reset=True)
    # Environment's new state.
    test.check_env("state", 0)
    # Agent's buffer.
    test.check_agent("states_buffer", [[1.0, 0.0, 0.0, 0.0]], key_or_index="env_0")  # <- prev state (preprocessed)
    test.check_agent("actions_buffer", [a], key_or_index="env_0")
    test.check_agent("rewards_buffer", [-1.0], key_or_index="env_0")
    test.check_agent("terminals_buffer", [False], key_or_index="env_0")
    # Memory contents.
    test.check_var("replay-memory/index", 0)
    test.check_var("replay-memory/size", 0)
    test.check_var("replay-memory/memory/states", np.array([[0] * 4] * agent.memory.capacity))
    test.check_var("replay-memory/memory/actions", np.array([0] * agent.memory.capacity))
    test.check_var("replay-memory/memory/rewards", np.array([0] * agent.memory.capacity))
    test.check_var("replay-memory/memory/terminals", np.array([False] * agent.memory.capacity))
    # Check policy and target-policy weights (should be the same).
    test.check_var("policy/neural-network/hidden/dense/kernel", matrix1_qnet)
    test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_qnet)
    test.check_var("policy/dueling-action-adapter/action-layer/dense/kernel", matrix2_qnet)
    test.check_var("target-policy/dueling-action-adapter/action-layer/dense/kernel", matrix2_qnet)

    # 2nd step -> expect insert into memory (and python buffer should be empty again).
    # action: up (0)
    # Also check the policy and target-policy values (should be equal at this point).
    test.step(1)
    test.check_env("state", 0)
    test.check_agent("states_buffer", [], key_or_index="env_0")
    test.check_agent("actions_buffer", [], key_or_index="env_0")
    test.check_agent("rewards_buffer", [], key_or_index="env_0")
    test.check_agent("terminals_buffer", [], key_or_index="env_0")
    test.check_var("replay-memory/index", 2)
    test.check_var("replay-memory/size", 2)
    test.check_var("replay-memory/memory/states", np.array(
        [[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]] +
        [[0.0, 0.0, 0.0, 0.0]] * (agent.memory.capacity - 2)
    ))
    test.check_var("replay-memory/memory/actions", np.array([0, 0] + [0] * (agent.memory.capacity - 2)))
    test.check_var("replay-memory/memory/rewards", np.array([-1.0, -1.0] + [0.0] * (agent.memory.capacity - 2)))
    test.check_var("replay-memory/memory/terminals", np.array([False, True] + [False] * (agent.memory.capacity - 2)))
    # Check policy and target-policy weights (should be the same).
    test.check_var("policy/neural-network/hidden/dense/kernel", matrix1_qnet)
    test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_qnet)
    test.check_var("policy/dueling-action-adapter/action-layer/dense/kernel", matrix2_qnet)
    test.check_var("target-policy/dueling-action-adapter/action-layer/dense/kernel", matrix2_qnet)

    # 3rd and 4th step -> expect another insert into memory (and python buffer should be empty again).
    # actions: down (2), up (0) <- exploring is True = more random actions
    # Expect an update to the policy variables (leave target as is (no sync yet)).
    test.step(2, use_exploration=True)
    test.check_env("state", 0)
    test.check_agent("states_buffer", [], key_or_index="env_0")
    test.check_agent("actions_buffer", [], key_or_index="env_0")
    test.check_agent("rewards_buffer", [], key_or_index="env_0")
    test.check_agent("terminals_buffer", [], key_or_index="env_0")
    test.check_var("replay-memory/index", 4)
    test.check_var("replay-memory/size", 4)
    test.check_var("replay-memory/memory/states", np.array(
        [[1.0, 0.0, 0.0, 0.0]] * 3 + [[0.0, 1.0, 0.0, 0.0]] +
        [[0.0, 0.0, 0.0, 0.0]] * (agent.memory.capacity - 4)
    ))
    test.check_var("replay-memory/memory/actions", np.array([0, 0, 2, 0] + [0] * (agent.memory.capacity - 4)))
    test.check_var("replay-memory/memory/rewards", np.array(
        [-1.0] * 4 +  # + [-3.0]
        [0.0] * (agent.memory.capacity - 4)
    ))
    test.check_var("replay-memory/memory/terminals", np.array(
        [False, True] * 2 + [False] * (agent.memory.capacity - 4)
    ))
    # Get the latest memory batch.
    expected_batch = dict(
        states=np.array([[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]]),
        actions=np.array([0, 1]),
        rewards=np.array([-1.0, -3.0]),
        terminals=np.array([False, True]),
        next_states=np.array([[1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]])
    )
    test.check_agent("last_memory_batch", expected_batch)

    # Calculate the weight updates and check against the weights actually updated by the DQNAgent.
    mat_updated = self._helper_update_matrix(
        expected_batch, matrix1_qnet, matrix2_qnet, matrix1_target_net, matrix2_target_net,
        agent, loss_func
    )
    # Check policy and target-policy weights (policy should be updated now).
    test.check_var("policy/neural-network/hidden/dense/kernel", mat_updated[0], decimals=4)
    test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_target_net)
    test.check_var("policy/dueling-action-adapter/action-layer/dense/kernel", mat_updated[1], decimals=4)
    test.check_var("target-policy/dueling-action-adapter/action-layer/dense/kernel", matrix2_target_net)

    matrix1_qnet = mat_updated[0]
    matrix2_qnet = mat_updated[1]

    # 5th step -> Another buffer update check.
    # action: down (2) (weights have been updated -> different actions)
    test.step(1)
    test.check_env("state", 3)
    # <- all empty b/c we reached the end of the episode (buffer gets force-flushed).
    test.check_agent("states_buffer", [], key_or_index="env_0")
    test.check_agent("actions_buffer", [], key_or_index="env_0")
    test.check_agent("rewards_buffer", [], key_or_index="env_0")
    test.check_agent("terminals_buffer", [], key_or_index="env_0")
    test.check_agent("last_memory_batch", expected_batch)
    test.check_var("replay-memory/index", 5)
    test.check_var("replay-memory/size", 5)
    test.check_var("replay-memory/memory/states", np.array(
        [[1.0, 0.0, 0.0, 0.0]] * 4 + [[0.0, 0.0, 1.0, 0.0]] +
        [[0.0, 0.0, 0.0, 0.0]] * (agent.memory.capacity - 5)
    ))
    test.check_var("replay-memory/memory/actions", np.array([0, 0, 0, 1, 2, 0]))
    test.check_var("replay-memory/memory/rewards", np.array([-1.0] * 3 + [-3.0, 1.0, 0.0]))
    test.check_var("replay-memory/memory/terminals", np.array([False, True] * 2 + [True, False]))
    test.check_var("policy/neural-network/hidden/dense/kernel", matrix1_qnet, decimals=4)
    test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_target_net)
    test.check_var("policy/dueling-action-adapter/action-layer/dense/kernel", matrix2_qnet, decimals=4)
    test.check_var("target-policy/dueling-action-adapter/action-layer/dense/kernel", matrix2_target_net)

    # 6th/7th step (with exploration enabled) -> Another buffer update check.
    # actions: up, down (0, 2)
    test.step(2, use_exploration=True)
    test.check_env("state", 1)
    # <- all empty again; flushed after the 6th step (when the buffer was full).
    test.check_agent("states_buffer", [], key_or_index="env_0")
    test.check_agent("actions_buffer", [], key_or_index="env_0")
    test.check_agent("rewards_buffer", [], key_or_index="env_0")
    test.check_agent("terminals_buffer", [], key_or_index="env_0")
    test.check_agent("last_memory_batch", expected_batch)
    test.check_var("replay-memory/index", 1)  # index has been rolled over (memory capacity is 6)
    test.check_var("replay-memory/size", 6)
    test.check_var("replay-memory/memory/states", np.array(
        [[1.0, 0.0, 0.0, 0.0]] * 4 + [[0.0, 0.0, 1.0, 0.0]] + [[1.0, 0.0, 0.0, 0.0]]
    ))
    test.check_var("replay-memory/memory/actions", np.array([2, 0, 0, 1, 2, 0]))
    test.check_var("replay-memory/memory/rewards", np.array([-1.0] * 3 + [-3.0, 1.0, -1.0]))
    test.check_var("replay-memory/memory/terminals", np.array([True, True, False, True, True, False]))
    test.check_var("policy/neural-network/hidden/dense/kernel", matrix1_qnet, decimals=4)
    test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_target_net)
    test.check_var("policy/dueling-action-adapter/action-layer/dense/kernel", matrix2_qnet, decimals=4)
    test.check_var("target-policy/dueling-action-adapter/action-layer/dense/kernel", matrix2_target_net)

    # 8th step -> Another buffer update check plus weights update and sync.
    # action: down (2)
    test.step(1)
    test.check_env("state", 1)
    test.check_agent("states_buffer", [1], key_or_index="env_0")
    test.check_agent("actions_buffer", [2], key_or_index="env_0")
    test.check_agent("rewards_buffer", [-1.0], key_or_index="env_0")
    test.check_agent("terminals_buffer", [False], key_or_index="env_0")

    expected_batch = dict(
        states=np.array([[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]]),
        actions=np.array([0, 1]),
        rewards=np.array([-1.0, -3.0]),
        terminals=np.array([True, True]),
        # TODO: <- This is wrong and must be fixed (the next-state of the first item is
        # from a previous insert and unrelated to the first item).
        next_states=np.array([[1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0]])
    )
    test.check_agent("last_memory_batch", expected_batch)
    test.check_var("replay-memory/index", 1)
    test.check_var("replay-memory/size", 6)
    test.check_var("replay-memory/memory/states", np.array(
        [[1.0, 0.0, 0.0, 0.0]] * 4 + [[0.0, 0.0, 1.0, 0.0]] + [[1.0, 0.0, 0.0, 0.0]]
    ))
    test.check_var("replay-memory/memory/actions", np.array([2, 0, 0, 1, 2, 0]))
    test.check_var("replay-memory/memory/rewards", np.array([-1.0, -1.0, -1.0, -3.0, 1.0, -1.0]))
    test.check_var("replay-memory/memory/terminals", np.array([True, True, False, True, True, False]))

    # Assume that the sync happens first (matrices are already the same when updating).
    mat_updated = self._helper_update_matrix(
        expected_batch, matrix1_qnet, matrix2_qnet, matrix1_qnet, matrix2_qnet, agent, loss_func
    )
    # Now the target-net should again be 1 step behind the policy-net.
    test.check_var("policy/neural-network/hidden/dense/kernel", mat_updated[0], decimals=2)
    test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_qnet, decimals=2)  # again: old matrix
    test.check_var("policy/dueling-action-adapter/action-layer/dense/kernel", mat_updated[1], decimals=2)
    test.check_var("target-policy/dueling-action-adapter/action-layer/dense/kernel", matrix2_qnet, decimals=2)
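# --- Illustrative sketch (not part of the original test file). The double-Q
# target checked via DQNLossFunction(double_q=True) above selects the argmax
# action with the online network but evaluates it with the target network:
def _double_q_target_sketch(rewards, terminals, q_next_online, q_next_target, discount):
    import numpy as np
    best_actions = np.argmax(q_next_online, axis=-1)
    q_eval = q_next_target[np.arange(len(best_actions)), best_actions]
    return rewards + discount * np.where(terminals, 0.0, q_eval)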
def test_sac_agent_component_functionality(self):
    config = config_from_path("configs/sac_component_for_fake_env_test.json")

    # Arbitrary state space; the state is not used in this example.
    state_space = FloatBox(shape=(8,))
    continuous_action_space = FloatBox(shape=(1,), low=-2.0, high=2.0)
    terminal_space = BoolBox(add_batch_rank=True)
    rewards_space = FloatBox(add_batch_rank=True)

    policy = Policy.from_spec(config["policy"], action_space=continuous_action_space)
    policy.add_components(Synchronizable(), expose_apis="sync")
    q_function = SACValueNetwork.from_spec(config["value_function"])

    class DummyAgent(object):
        def __init__(self):
            self.graph_executor = None

    dummy_agent = DummyAgent()
    agent_component = SACAgentComponent(
        agent=dummy_agent,
        policy=policy,
        q_function=q_function,
        preprocessor=PreprocessorStack.from_spec([]),
        memory=ReplayMemory.from_spec(config["memory"]),
        discount=config["discount"],
        initial_alpha=config["initial_alpha"],
        target_entropy=None,
        optimizer=AdamOptimizer.from_spec(config["optimizer"]),
        vf_optimizer=AdamOptimizer.from_spec(config["value_function_optimizer"], scope="vf-optimizer"),
        alpha_optimizer=None,
        q_sync_spec=SyncSpecification(sync_interval=10, sync_tau=1.0),
        num_q_functions=2
    )

    test = ComponentTest(
        component=agent_component,
        input_spaces=dict(
            increment=int,
            episode_reward=float,
            states=state_space.with_batch_rank(),
            preprocessed_states=state_space.with_batch_rank(),
            env_actions=continuous_action_space.with_batch_rank(),
            actions=continuous_action_space.with_batch_rank(),
            rewards=rewards_space,
            next_states=state_space.with_batch_rank(),
            terminals=terminal_space,
            batch_size=int,
            importance_weights=FloatBox(add_batch_rank=True),
            deterministic=bool,
            weights="variables:{}".format(policy.scope),
            time_percentage=float
            # TODO: how to provide the space for multiple component variables?
            # q_weights=Dict(
            #     q_0="variables:{}".format(q_function.scope),
            #     q_1="variables:{}".format(agent_component._q_functions[1].scope),
            # )
        ),
        action_space=continuous_action_space,
        build_kwargs=dict(
            optimizer=agent_component._optimizer,
            build_options=dict(
                vf_optimizer=agent_component.vf_optimizer,
            ),
        ),
        auto_build=False
    )
    dummy_agent.graph_executor = test.graph_executor
    test.build()

    batch_size = 10
    action_sample = continuous_action_space.with_batch_rank().sample(batch_size)
    rewards = rewards_space.sample(batch_size)

    # Check whether an update runs ok.
    result = test.test(("update_from_external_batch", [
        state_space.sample(batch_size),
        action_sample,
        rewards,
        [True] * batch_size,
        state_space.sample(batch_size),
        [1.0] * batch_size  # importance
    ]))
    self.assertTrue(result["actor_loss"].dtype == np.float32)
    self.assertTrue(result["critic_loss"].dtype == np.float32)

    action_sample = np.linspace(-1, 1, batch_size).reshape((batch_size, 1))
    q_values = test.test(("get_q_values", [state_space.sample(batch_size), action_sample]))
    for q_val in q_values:
        self.assertTrue(q_val.dtype == np.float32)
        self.assertTrue(q_val.shape == (batch_size, 1))

    action_sample, _ = test.test(("action_from_preprocessed_state", [state_space.sample(batch_size), False]))
    self.assertTrue(action_sample.dtype == np.float32)
    self.assertTrue(action_sample.shape == (batch_size, 1))
def test_batched_backend_equivalence(self):
    return  # NOTE: early return deactivates this test; the body below is kept for reference.
    """
    Tests if the Python and TensorFlow backends return the same output for a standard
    DQN-style preprocessing stack.
    """
    env_spec = dict(
        type="openai",
        gym_env="Pong-v0",
        frameskip=4,
        max_num_noops=30,
        episodic_life=True
    )
    # Test with batching because we assume vector environments to be the normal case going forward.
    env = SequentialVectorEnv(num_environments=4, env_spec=env_spec, num_background_envs=2)
    in_space = env.state_space

    agent_config = config_from_path("configs/ray_apex_for_pong.json")
    preprocessing_spec = deepcopy(agent_config["preprocessing_spec"])

    # Set up python preprocessor.
    scopes = [preprocessor["scope"] for preprocessor in preprocessing_spec]
    # Set backend to python.
    for spec in preprocessing_spec:
        spec["backend"] = "python"
    python_processor = PreprocessorStack(*preprocessing_spec, backend="python")
    for sub_comp_scope in scopes:
        python_processor.sub_components[sub_comp_scope].create_variables(
            dict(preprocessing_inputs=in_space)
        )
    python_processor.reset()

    # To have the use case we considered so far, use the agent interface for the TF backend.
    agent_config.pop("type")
    agent = ApexAgent(state_space=env.state_space, action_space=env.action_space, **agent_config)

    # Generate a few states from random set points. Test if the preprocessed states are almost equal.
    states = np.asarray(env.reset_all())
    actions, agent_preprocessed_states = agent.get_action(
        states=states, use_exploration=False, extra_returns="preprocessed_states"
    )
    print("TensorFlow preprocessed shape: {}".format(np.asarray(agent_preprocessed_states).shape))
    python_preprocessed_states = python_processor.preprocess(states)
    print("Python preprocessed shape: {}".format(np.asarray(python_preprocessed_states).shape))
    print("Asserting (almost) equal values:")
    for tf_state, python_state in zip(agent_preprocessed_states, python_preprocessed_states):
        flat_tf = np.ndarray.flatten(tf_state)
        flat_python = np.ndarray.flatten(python_state)
        for x, y in zip(flat_tf, flat_python):
            recursive_assert_almost_equal(x, y, decimals=3)

    states, _, _, _ = env.step(actions)
    actions, agent_preprocessed_states = agent.get_action(
        states=states, use_exploration=False, extra_returns="preprocessed_states"
    )
    print("TensorFlow preprocessed shape: {}".format(np.asarray(agent_preprocessed_states).shape))
    python_preprocessed_states = python_processor.preprocess(states)
    print("Python preprocessed shape: {}".format(np.asarray(python_preprocessed_states).shape))
    print("Asserting (almost) equal values:")
    recursive_assert_almost_equal(agent_preprocessed_states, python_preprocessed_states, decimals=3)
def test_policy_for_discrete_container_action_space(self):
    # state_space.
    state_space = FloatBox(shape=(4,), add_batch_rank=True)

    # Container action space.
    action_space = dict(type="dict", a=IntBox(2), b=IntBox(3), add_batch_rank=True)
    flat_float_action_space = dict(
        type="dict",
        a=FloatBox(shape=(2,)),
        b=FloatBox(shape=(3,)),
        add_batch_rank=True
    )

    policy = Policy(
        network_spec=config_from_path("configs/test_simple_nn.json"),
        action_space=action_space
    )
    test = ComponentTest(
        component=policy,
        input_spaces=dict(
            nn_input=state_space,
            actions=action_space,
            probabilities=flat_float_action_space,
            logits=flat_float_action_space
        ),
        action_space=action_space
    )
    policy_params = test.read_variable_values(policy.variables)

    # Some NN inputs (batch size=2).
    states = state_space.sample(2)

    # Raw NN-output.
    expected_nn_output = np.matmul(
        states, policy_params["policy/test-network/hidden-layer/dense/kernel"]
    )
    test.test(("get_nn_output", states), expected_outputs=dict(output=expected_nn_output), decimals=6)

    # Raw action layers' outputs.
    expected_action_layer_outputs = dict(
        a=np.matmul(
            expected_nn_output,
            policy_params["policy/action-adapter-0/action-network/action-layer/dense/kernel"]
        ),
        b=np.matmul(
            expected_nn_output,
            policy_params["policy/action-adapter-1/action-network/action-layer/dense/kernel"]
        )
    )
    test.test(
        ("get_action_layer_output", states),
        expected_outputs=dict(output=expected_action_layer_outputs),
        decimals=5
    )

    # Logits and parameters (probs); skip log-probs (numerically unstable for small probs).
    expected_probabilities_output = dict(
        a=np.array(softmax(expected_action_layer_outputs["a"], axis=-1), dtype=np.float32),
        b=np.array(softmax(expected_action_layer_outputs["b"], axis=-1), dtype=np.float32)
    )
    test.test(
        ("get_logits_probabilities_log_probs", states, ["logits", "probabilities"]),
        expected_outputs=dict(
            logits=expected_action_layer_outputs,
            probabilities=expected_probabilities_output
        ),
        decimals=5
    )
    print("Probs: {}".format(expected_probabilities_output))

    expected_actions = dict(
        a=np.argmax(expected_action_layer_outputs["a"], axis=-1),
        b=np.argmax(expected_action_layer_outputs["b"], axis=-1)
    )
    test.test(("get_action", states), expected_outputs=dict(action=expected_actions))

    # Stochastic sample.
    out = test.test(("get_stochastic_action", states), expected_outputs=None)  # dict(action=expected_actions)
    self.assertTrue(out["action"]["a"].dtype == np.int32)
    self.assertTrue(out["action"]["a"].shape == (2,))
    self.assertTrue(out["action"]["b"].dtype == np.int32)
    self.assertTrue(out["action"]["b"].shape == (2,))

    # Deterministic sample.
    out = test.test(("get_deterministic_action", states), expected_outputs=None)  # dict(action=expected_actions)
    self.assertTrue(out["action"]["a"].dtype == np.int32)
    self.assertTrue(out["action"]["a"].shape == (2,))
    self.assertTrue(out["action"]["b"].dtype == np.int32)
    self.assertTrue(out["action"]["b"].shape == (2,))

    # Distribution's entropy.
    out = test.test(("get_entropy", states), expected_outputs=None)  # dict(entropy=expected_h), decimals=3
    self.assertTrue(out["entropy"]["a"].dtype == np.float32)
    self.assertTrue(out["entropy"]["a"].shape == (2,))
    self.assertTrue(out["entropy"]["b"].dtype == np.float32)
    self.assertTrue(out["entropy"]["b"].shape == (2,))

    # Action log-probs.
    expected_action_log_prob_output = dict(
        a=np.log(np.array([
            expected_probabilities_output["a"][0][expected_actions["a"][0]],
            expected_probabilities_output["a"][1][expected_actions["a"][1]]
        ])),
        b=np.log(np.array([
            expected_probabilities_output["b"][0][expected_actions["b"][0]],
            expected_probabilities_output["b"][1][expected_actions["b"][1]]
        ])),
    )
    test.test(
        ("get_action_log_probs", [states, expected_actions]),
        expected_outputs=dict(
            action_log_probs=expected_action_log_prob_output,
            logits=expected_action_layer_outputs
        ),
        decimals=5
    )
def test_policy_for_bounded_continuous_action_space(self):
    """
    https://github.com/rlgraph/rlgraph/issues/43
    """
    nn_input_space = FloatBox(shape=(4,), add_batch_rank=True)
    action_space = FloatBox(low=-1.0, high=1.0, shape=(1,), add_batch_rank=True)
    # Double the shape for the alpha/beta parameters.
    action_space_parameters = FloatBox(shape=(2,), add_batch_rank=True)

    policy = Policy(
        network_spec=config_from_path("configs/test_simple_nn.json"),
        action_space=action_space
    )
    test = ComponentTest(
        component=policy,
        input_spaces=dict(
            nn_input=nn_input_space,
            actions=action_space,
            logits=FloatBox(shape=(1,), add_batch_rank=True),
            probabilities=FloatBox(add_batch_rank=True),
            parameters=action_space_parameters,
        ),
        action_space=action_space
    )
    policy_params = test.read_variable_values(policy.variables)

    # Some NN inputs.
    nn_input = nn_input_space.sample(size=3)

    # Raw NN-output.
    expected_nn_output = np.matmul(nn_input, policy_params["policy/test-network/hidden-layer/dense/kernel"])
    test.test(("get_nn_output", nn_input), expected_outputs=dict(output=expected_nn_output))

    # Raw action layer output.
    expected_raw_logits = np.matmul(
        expected_nn_output,
        policy_params["policy/action-adapter-0/action-network/action-layer/dense/kernel"]
    )
    test.test(
        ("get_action_layer_output", nn_input),
        expected_outputs=dict(output=expected_raw_logits),
        decimals=5
    )

    # Parameters (alpha/beta): softplus(logits) + 1.
    expected_parameters_output = np.log(np.exp(expected_raw_logits) + 1.0) + 1.0
    test.test(
        ("get_logits_parameters_log_probs", nn_input, ["logits", "parameters"]),
        expected_outputs=dict(
            logits=expected_raw_logits,
            parameters=expected_parameters_output
        ),
        decimals=5
    )
    print("Params: {}".format(expected_parameters_output))

    actions = test.test(("get_action", nn_input))["action"]
    self.assertTrue(actions.dtype == np.float32)
    self.assertGreaterEqual(actions.min(), -1.0)
    self.assertLessEqual(actions.max(), 1.0)
    self.assertTrue(actions.shape == (3, 1))

    # Action log-probs: scale the actions back into [0, 1] and evaluate the Beta pdf there.
    actions_scaled_back = (actions + 1.0) / 2.0
    # beta.pdf broadcasts the (3, 1) actions against the (3,) parameter vectors to a
    # (3, 3) matrix; pick the matching diagonal entries.
    expected_action_log_prob_output = np.log(beta.pdf(
        actions_scaled_back, expected_parameters_output[:, 1], expected_parameters_output[:, 0]
    ))
    expected_action_log_prob_output = np.array([
        [expected_action_log_prob_output[0][0]],
        [expected_action_log_prob_output[1][1]],
        [expected_action_log_prob_output[2][2]]
    ])
    test.test(
        ("get_action_log_probs", [nn_input, actions]),
        expected_outputs=dict(
            action_log_probs=expected_action_log_prob_output,
            logits=expected_raw_logits
        ),
        decimals=5
    )

    # Stochastic sample.
    actions = test.test(("get_stochastic_action", nn_input))["action"]
    self.assertTrue(actions.dtype == np.float32)
    self.assertGreaterEqual(actions.min(), -1.0)
    self.assertLessEqual(actions.max(), 1.0)
    self.assertTrue(actions.shape == (3, 1))

    # Deterministic sample.
    actions = test.test(("get_deterministic_action", nn_input))["action"]
    self.assertTrue(actions.dtype == np.float32)
    self.assertGreaterEqual(actions.min(), -1.0)
    self.assertLessEqual(actions.max(), 1.0)
    self.assertTrue(actions.shape == (3, 1))

    # Distribution's entropy.
    entropy = test.test(("get_entropy", nn_input))["entropy"]
    self.assertTrue(entropy.dtype == np.float32)
    self.assertTrue(entropy.shape == (3, 1))
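# --- Illustrative sketch (not part of the original test file). The bounded
# continuous policy above parameterizes a Beta distribution via
# softplus(logits) + 1 (keeping both shape parameters >= 1) and maps its [0, 1]
# samples into the action bounds; actions_scaled_back in the test inverts this
# mapping. A minimal NumPy/SciPy version:
def _beta_action_sampling_sketch(logits, low=-1.0, high=1.0):
    import numpy as np
    from scipy.stats import beta as beta_dist
    params = np.log(np.exp(logits) + 1.0) + 1.0  # softplus(logits) + 1
    # The test passes params[:, 1] and params[:, 0] as the two Beta shape
    # parameters; which column is alpha vs. beta is a convention of the policy.
    unit_sample = beta_dist(params[..., 1], params[..., 0]).rvs()  # sample in [0, 1]
    return low + unit_sample * (high - low)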
def test_policy_for_discrete_action_space_with_dueling_layer(self):
    np.random.seed(10)
    # state_space (NN is a simple single fc-layer relu network (2 units), random biases, random weights).
    nn_input_space = FloatBox(shape=(3,), add_batch_rank=True)

    # action_space (2 possible actions).
    action_space = IntBox(2, add_batch_rank=True)
    flat_float_action_space = FloatBox(shape=(2,), add_batch_rank=True)

    # Policy with dueling logic.
    policy = DuelingPolicy(
        network_spec=config_from_path("configs/test_lrelu_nn.json"),
        action_adapter_spec=dict(
            pre_network_spec=[
                dict(type="dense", units=10, activation="lrelu", activation_params=[0.1])
            ]
        ),
        units_state_value_stream=10,
        action_space=action_space
    )
    test = ComponentTest(
        component=policy,
        input_spaces=dict(
            nn_input=nn_input_space,
            actions=action_space,
            probabilities=flat_float_action_space,
            parameters=flat_float_action_space,
            logits=flat_float_action_space
        ),
        action_space=action_space
    )
    policy_params = test.read_variable_values(policy.variables)

    # Some NN inputs.
    nn_input = nn_input_space.sample(size=3)

    # Raw NN-output.
    expected_nn_output = relu(np.matmul(
        nn_input, policy_params["dueling-policy/test-network/hidden-layer/dense/kernel"]
    ), 0.1)
    test.test(("get_nn_output", nn_input), expected_outputs=dict(output=expected_nn_output))

    # Raw action layer output (advantages).
    expected_raw_advantages = np.matmul(relu(np.matmul(
        expected_nn_output,
        policy_params["dueling-policy/action-adapter-0/action-network/dense-layer/dense/kernel"]
    ), 0.1), policy_params["dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel"])
    test.test(
        ("get_action_layer_output", nn_input),
        expected_outputs=dict(output=expected_raw_advantages),
        decimals=5
    )

    # Single state-values (one per batch item).
    expected_state_values = np.matmul(relu(np.matmul(
        expected_nn_output,
        policy_params["dueling-policy/dense-layer-state-value-stream/dense/kernel"]
    )), policy_params["dueling-policy/state-value-node/dense/kernel"])
    test.test(
        ("get_state_values", nn_input),
        expected_outputs=dict(state_values=expected_state_values),
        decimals=5
    )

    # Q-values (dueling aggregation): state-values plus mean-centered advantages.
    expected_q_values_output = expected_state_values + expected_raw_advantages - \
        np.mean(expected_raw_advantages, axis=-1, keepdims=True)
    test.test(
        ("get_logits_probabilities_log_probs", nn_input, "logits"),
        expected_outputs=dict(logits=expected_q_values_output),
        decimals=5
    )

    # Parameters (probabilities): softmaxed q-values.
    expected_probabilities_output = softmax(expected_q_values_output, axis=-1)
    test.test(
        ("get_logits_probabilities_log_probs", nn_input, ["logits", "probabilities"]),
        expected_outputs=dict(
            logits=expected_q_values_output,
            probabilities=expected_probabilities_output
        ),
        decimals=5
    )
    print("Probs: {}".format(expected_probabilities_output))

    expected_actions = np.argmax(expected_q_values_output, axis=-1)
    test.test(("get_action", nn_input), expected_outputs=dict(action=expected_actions))

    # Action log-probs.
    expected_action_log_prob_output = np.log(np.array([
        expected_probabilities_output[0][expected_actions[0]],
        expected_probabilities_output[1][expected_actions[1]],
        expected_probabilities_output[2][expected_actions[2]],
    ]))
    test.test(
        ("get_action_log_probs", [nn_input, expected_actions]),
        expected_outputs=dict(
            action_log_probs=expected_action_log_prob_output,
            logits=expected_q_values_output
        ),
        decimals=5
    )

    # Stochastic sample.
    out = test.test(("get_stochastic_action", nn_input), expected_outputs=None)
    self.assertTrue(out["action"].dtype == np.int32)
    self.assertTrue(out["action"].shape == (3,))

    # Deterministic sample.
    out = test.test(("get_deterministic_action", nn_input), expected_outputs=None)
    self.assertTrue(out["action"].dtype == np.int32)
    self.assertTrue(out["action"].shape == (3,))

    # Distribution's entropy.
    out = test.test(("get_entropy", nn_input), expected_outputs=None)
    self.assertTrue(out["entropy"].dtype == np.float32)
    self.assertTrue(out["entropy"].shape == (3,))
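# --- Illustrative sketch (not part of the original test file). The dueling
# aggregation verified above combines a scalar state-value stream with
# mean-centered advantages: Q(s, a) = V(s) + A(s, a) - mean_a A(s, a).
def _dueling_q_sketch(state_values, advantages):
    import numpy as np
    return state_values + advantages - np.mean(advantages, axis=-1, keepdims=True)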
def test_shared_value_function_policy_for_discrete_container_action_space_with_time_rank_folding( self): # state_space (NN is a simple single fc-layer relu network (2 units), random biases, random weights). state_space = FloatBox(shape=(6, ), add_batch_rank=True, add_time_rank=True) # Action_space. action_space = Tuple(IntBox(2), IntBox(3), Dict(a=IntBox(4), ), add_batch_rank=True, add_time_rank=True) #flat_float_action_space = Tuple( # FloatBox(shape=(2,)), # FloatBox(shape=(3,)), # Dict( # a=FloatBox(shape=(4,)), # ), # add_batch_rank=True, # add_time_rank=True #) # Policy with baseline action adapter AND batch-apply over the entire policy (NN + ActionAdapter + distr.). network_spec = config_from_path("configs/test_lrelu_nn.json") network_spec["fold_time_rank"] = True network_spec["unfold_time_rank"] = True shared_value_function_policy = SharedValueFunctionPolicy( network_spec=network_spec, action_adapter_spec=dict(fold_time_rank=True, unfold_time_rank=True), action_space=action_space, value_fold_time_rank=True, value_unfold_time_rank=True) test = ComponentTest( component=shared_value_function_policy, input_spaces=dict( nn_inputs=state_space, actions=action_space, ), action_space=action_space, ) policy_params = test.read_variable_values( shared_value_function_policy.variable_registry) base_scope = "shared-value-function-policy/action-adapter-" # Some NN inputs. states = state_space.sample(size=(2, 3)) states_folded = np.reshape(states, newshape=(6, 6)) # Raw NN-output (still folded). expected_nn_output = np.reshape(relu( np.matmul( states_folded, policy_params[ "shared-value-function-policy/test-network/hidden-layer/dense/kernel"] ), 0.1), newshape=(2, 3, 3)) test.test(("get_nn_outputs", states), expected_outputs=expected_nn_output, decimals=5) # Raw action layer output; Expected shape=(3,3): 3=batch, 2=action categories + 1 state value expected_action_layer_output = tuple([ np.matmul( expected_nn_output, policy_params[base_scope + "0/action-network/action-layer/dense/kernel"]), np.matmul( expected_nn_output, policy_params[base_scope + "1/action-network/action-layer/dense/kernel"]), dict(a=np.matmul( expected_nn_output, policy_params[base_scope + "2/action-network/action-layer/dense/kernel"])) ]) expected_action_layer_output_unfolded = tuple([ np.reshape(expected_action_layer_output[0], newshape=(2, 3, 2)), np.reshape(expected_action_layer_output[1], newshape=(2, 3, 3)), dict(a=np.reshape(expected_action_layer_output[2]["a"], newshape=(2, 3, 4))) ]) test.test(("get_adapter_outputs", states), expected_outputs=dict( adapter_outputs=expected_action_layer_output_unfolded, nn_outputs=expected_nn_output), decimals=5) # State-values: One for each item in the batch. expected_state_value_output = np.matmul( expected_nn_output, policy_params[ "shared-value-function-policy/value-function-node/dense-layer/dense/kernel"] ) expected_state_value_output_unfolded = np.reshape( expected_state_value_output, newshape=(2, 3, 1)) test.test(("get_state_values", states, ["state_values"]), expected_outputs=dict( state_values=expected_state_value_output_unfolded), decimals=5) test.test(("get_state_values_adapter_outputs_and_parameters", states, ["state_values", "adapter_outputs"]), expected_outputs=dict( state_values=expected_state_value_output_unfolded, adapter_outputs=expected_action_layer_output_unfolded), decimals=5) # Parameter (probabilities). Softmaxed logits. 
expected_probs_output = tuple([ softmax(expected_action_layer_output_unfolded[0], axis=-1), softmax(expected_action_layer_output_unfolded[1], axis=-1), dict(a=softmax(expected_action_layer_output_unfolded[2]["a"], axis=-1)) ]) test.test(("get_adapter_outputs_and_parameters", states, ["adapter_outputs", "parameters"]), expected_outputs=dict( adapter_outputs=expected_action_layer_output_unfolded, parameters=expected_action_layer_output_unfolded), decimals=5) print("Probs: {}".format(expected_probs_output)) expected_actions = tuple([ np.argmax(expected_action_layer_output_unfolded[0], axis=-1), np.argmax(expected_action_layer_output_unfolded[1], axis=-1), dict(a=np.argmax(expected_action_layer_output_unfolded[2]["a"], axis=-1), ) ]) test.test(("get_action", states, ["action"]), expected_outputs=dict(action=expected_actions)) out = test.test(("get_action_and_log_likelihood", states)) action = out["action"] llh = out["log_likelihood"] # Action log-likelihood. expected_action_llh_output = np.log( np.array([ [ expected_probs_output[0][0][0][action[0][0][0]], expected_probs_output[0][0][1][action[0][0][1]], expected_probs_output[0][0][2][action[0][0][2]], ], [ expected_probs_output[0][1][0][action[0][1][0]], expected_probs_output[0][1][1][action[0][1][1]], expected_probs_output[0][1][2][action[0][1][2]], ] ])) + np.log( np.array([[ expected_probs_output[1][0][0][action[1][0][0]], expected_probs_output[1][0][1][action[1][0][1]], expected_probs_output[1][0][2][action[1][0][2]], ], [ expected_probs_output[1][1][0][action[1][1][0]], expected_probs_output[1][1][1][action[1][1][1]], expected_probs_output[1][1][2][action[1][1][2]], ]]) ) + np.log( np.array([[ expected_probs_output[2]["a"][0][0][action[2]["a"][0][0]], expected_probs_output[2]["a"][0][1][action[2]["a"][0][1]], expected_probs_output[2]["a"][0][2][action[2]["a"][0][2]], ], [ expected_probs_output[2]["a"][1][0][action[2] ["a"][1][0]], expected_probs_output[2]["a"][1][1][action[2] ["a"][1][1]], expected_probs_output[2]["a"][1][2][action[2] ["a"][1][2]], ]])) test.test(("get_log_likelihood", [states, action]), expected_outputs=dict( log_likelihood=expected_action_llh_output, adapter_outputs=expected_action_layer_output_unfolded), decimals=5) recursive_assert_almost_equal(expected_action_llh_output, llh, decimals=5) # Deterministic sample. out = test.test(("get_deterministic_action", states), expected_outputs=None) self.assertTrue(out["action"][0].dtype == np.int32) self.assertTrue( out["action"][0].shape == (2, 3)) # Make sure output is unfolded. self.assertTrue(out["action"][1].dtype == np.int32) self.assertTrue( out["action"][1].shape == (2, 3)) # Make sure output is unfolded. self.assertTrue(out["action"][2]["a"].dtype == np.int32) self.assertTrue(out["action"][2]["a"].shape == ( 2, 3)) # Make sure output is unfolded. # Stochastic sample. out = test.test(("get_stochastic_action", states), expected_outputs=None) self.assertTrue(out["action"][0].dtype == np.int32) self.assertTrue( out["action"][0].shape == (2, 3)) # Make sure output is unfolded. self.assertTrue(out["action"][1].dtype == np.int32) self.assertTrue( out["action"][1].shape == (2, 3)) # Make sure output is unfolded. self.assertTrue(out["action"][2]["a"].dtype == np.int32) self.assertTrue(out["action"][2]["a"].shape == ( 2, 3)) # Make sure output is unfolded. # Distribution's entropy. 
out = test.test(("get_entropy", states), expected_outputs=None) self.assertTrue(out["entropy"][0].dtype == np.float32) self.assertTrue( out["entropy"][0].shape == (2, 3)) # Make sure output is unfolded. self.assertTrue(out["entropy"][1].dtype == np.float32) self.assertTrue( out["entropy"][1].shape == (2, 3)) # Make sure output is unfolded. self.assertTrue(out["entropy"][2]["a"].dtype == np.float32) self.assertTrue(out["entropy"][2]["a"].shape == ( 2, 3)) # Make sure output is unfolded.
def test_environment_stepper_on_deterministic_env_with_returning_action_probs(self):
    preprocessor_spec = [dict(type="divide", divisor=2)]
    network_spec = config_from_path("configs/test_simple_nn.json")
    exploration_spec = None
    actor_component = ActorComponent(
        preprocessor_spec,
        dict(network_spec=network_spec, action_space=self.deterministic_env_action_space),
        exploration_spec
    )
    environment_stepper = EnvironmentStepper(
        environment_spec=dict(type="deterministic_env", steps_to_terminal=6),
        actor_component_spec=actor_component,
        state_space=self.deterministic_env_state_space,
        reward_space="float32",
        add_action_probs=True,
        action_probs_space=self.deterministic_action_probs_space,
        num_steps=3
    )
    test = ComponentTest(
        component=environment_stepper,
        action_space=self.deterministic_env_action_space,
    )

    weights = test.read_variable_values(environment_stepper.actor_component.policy.variables)
    policy_scope = "environment-stepper/actor-component/policy/"
    weights_hid = weights[policy_scope + "test-network/hidden-layer/dense/kernel"]
    biases_hid = weights[policy_scope + "test-network/hidden-layer/dense/bias"]
    weights_action = weights[policy_scope + "action-adapter-0/action-network/action-layer/dense/kernel"]
    biases_action = weights[policy_scope + "action-adapter-0/action-network/action-layer/dense/bias"]

    # Step 3 times through the Env and collect results.
    expected = (
        # t_
        np.array([False, False, False]),
        # s' (raw)
        np.array([[0.0], [1.0], [2.0], [3.0]]),
        # action probs (the states fed to the network are preprocessed: divided by 2).
        np.array([
            softmax(dense_layer(dense_layer(np.array([0.0]), weights_hid, biases_hid), weights_action, biases_action)),
            softmax(dense_layer(dense_layer(np.array([0.5]), weights_hid, biases_hid), weights_action, biases_action)),
            softmax(dense_layer(dense_layer(np.array([1.0]), weights_hid, biases_hid), weights_action, biases_action))
        ])
    )
    test.test("step", expected_outputs=expected, decimals=3)

    # Step again; check whether the stitching of states (etc.) works.
    expected = (
        np.array([False, False, True]),
        # s' (raw)
        np.array([[3.0], [4.0], [5.0], [0.0]]),
        np.array([
            softmax(dense_layer(dense_layer(np.array([1.5]), weights_hid, biases_hid), weights_action, biases_action)),
            softmax(dense_layer(dense_layer(np.array([2.0]), weights_hid, biases_hid), weights_action, biases_action)),
            softmax(dense_layer(dense_layer(np.array([2.5]), weights_hid, biases_hid), weights_action, biases_action))
        ])
    )
    test.test("step", expected_outputs=expected, decimals=3)

    # Make sure we close the session (to shut down the Env on the server).
    test.terminate()
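
# Sketch of the forward pass the expectations above recompute by hand (illustration only;
# it assumes the `dense_layer` test helper is a plain affine map x @ W + b; the real
# helper's signature or activation handling may differ):
def _expected_action_probs_sketch(state, w_hid, b_hid, w_act, b_act):
    hidden = np.matmul(state, w_hid) + b_hid  # hidden dense layer
    logits = np.matmul(hidden, w_act) + b_act  # action layer -> logits
    exps = np.exp(logits - np.max(logits, axis=-1, keepdims=True))  # numerically stable softmax
    return exps / np.sum(exps, axis=-1, keepdims=True)
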
def test_policy_for_discrete_action_space_with_dueling_layer(self):
    # state_space (NN is a simple single fc-layer relu network (2 units), random biases, random weights).
    nn_input_space = FloatBox(shape=(5,), add_batch_rank=True)

    # Action space.
    action_space = Dict(
        dict(
            a=Tuple(IntBox(2), IntBox(3)),
            b=Dict(dict(ba=IntBox(4)))
        ),
        add_batch_rank=True
    )
    #flat_float_action_space = Dict(dict(
    #    a=Tuple(FloatBox(shape=(2,)), FloatBox(shape=(3,))),
    #    b=Dict(dict(ba=FloatBox(shape=(4,))))
    #), add_batch_rank=True)

    # Policy with dueling logic.
    policy = DuelingPolicy(
        network_spec=config_from_path("configs/test_lrelu_nn.json"),
        # Make all sub action adapters the same.
        action_adapter_spec=dict(pre_network_spec=[
            dict(type="dense", units=5, activation="lrelu", activation_params=[0.2])
        ]),
        units_state_value_stream=2,
        action_space=action_space
    )
    test = ComponentTest(
        component=policy,
        input_spaces=dict(
            nn_inputs=nn_input_space,
            actions=action_space,
            #logits=flat_float_action_space,
            #parameters=flat_float_action_space
        ),
        action_space=action_space
    )
    policy_params = test.read_variable_values(policy.variable_registry)

    # Some NN inputs.
    nn_input = nn_input_space.sample(size=3)
    # Raw NN-output.
    expected_nn_output = relu(np.matmul(
        nn_input,
        policy_params["dueling-policy/test-network/hidden-layer/dense/kernel"]
    ), 0.2)
    test.test(("get_nn_outputs", nn_input), expected_outputs=expected_nn_output, decimals=5)

    # Raw advantages: each adapter first runs its pre-network (dense + lrelu), then its action layer.
    expected_raw_advantages = dict(
        a=(
            np.matmul(
                relu(np.matmul(
                    expected_nn_output,
                    policy_params["dueling-policy/action-adapter-0/action-network/dense-layer/dense/kernel"]
                ), 0.2),
                policy_params["dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel"]
            ),
            np.matmul(
                relu(np.matmul(
                    expected_nn_output,
                    policy_params["dueling-policy/action-adapter-1/action-network/dense-layer/dense/kernel"]
                ), 0.2),
                policy_params["dueling-policy/action-adapter-1/action-network/action-layer/dense/kernel"]
            ),
        ),
        b=dict(ba=np.matmul(
            relu(np.matmul(
                expected_nn_output,
                policy_params["dueling-policy/action-adapter-2/action-network/dense-layer/dense/kernel"]
            ), 0.2),
            policy_params["dueling-policy/action-adapter-2/action-network/action-layer/dense/kernel"]
        ))
    )

    # Single state values.
    expected_state_values = np.matmul(
        relu(np.matmul(
            expected_nn_output,
            policy_params["dueling-policy/dense-layer-state-value-stream/dense/kernel"]
        )),
        policy_params["dueling-policy/state-value-node/dense/kernel"]
    )
    test.test(
        ("get_state_values", nn_input, ["state_values"]),
        expected_outputs=dict(state_values=expected_state_values),
        decimals=5
    )

    # Q-values (dueling aggregation): state-value + advantage - mean(advantage), per component.
    expected_q_values_output = dict(
        a=(
            expected_state_values + expected_raw_advantages["a"][0] -
            np.mean(expected_raw_advantages["a"][0], axis=-1, keepdims=True),
            expected_state_values + expected_raw_advantages["a"][1] -
            np.mean(expected_raw_advantages["a"][1], axis=-1, keepdims=True),
        ),
        b=dict(
            ba=expected_state_values + expected_raw_advantages["b"]["ba"] -
            np.mean(expected_raw_advantages["b"]["ba"], axis=-1, keepdims=True)
        )
    )
    test.test(
        ("get_adapter_outputs", nn_input),
        expected_outputs=dict(
            adapter_outputs=expected_q_values_output,
            nn_outputs=expected_nn_output,
            advantages=expected_raw_advantages,
            q_values=expected_q_values_output
        ),
        decimals=5
    )

    test.test(
        ("get_adapter_outputs_and_parameters", nn_input, ["adapter_outputs"]),
        expected_outputs=dict(adapter_outputs=expected_q_values_output),
        decimals=5
    )

    # Parameters (probabilities): softmaxed q-values.
    expected_probs_output = dict(
        a=(
            softmax(expected_q_values_output["a"][0], axis=-1),
            softmax(expected_q_values_output["a"][1], axis=-1)
        ),
        b=dict(ba=np.maximum(softmax(expected_q_values_output["b"]["ba"], axis=-1), SMALL_NUMBER))
    )
    expected_log_probs_output = dict(
        a=(
            np.log(expected_probs_output["a"][0]),
            np.log(expected_probs_output["a"][1])
        ),
        b=dict(ba=np.log(expected_probs_output["b"]["ba"]))
    )
    test.test(
        ("get_adapter_outputs_and_parameters", nn_input, ["adapter_outputs", "parameters", "log_probs"]),
        expected_outputs=dict(
            adapter_outputs=expected_q_values_output,
            parameters=expected_q_values_output,
            log_probs=expected_log_probs_output
        ),
        decimals=5
    )
    print("Probs: {}".format(expected_probs_output))

    expected_actions = dict(
        a=(
            np.argmax(expected_q_values_output["a"][0], axis=-1),
            np.argmax(expected_q_values_output["a"][1], axis=-1)
        ),
        b=dict(ba=np.argmax(expected_q_values_output["b"]["ba"], axis=-1))
    )
    test.test(("get_action", nn_input, ["action"]), expected_outputs=dict(action=expected_actions))

    out = test.test(("get_action_and_log_likelihood", nn_input))
    action = out["action"]
    llh = out["log_likelihood"]

    # Action log-likelihood: sum of the component log-probs of the sampled actions.
    expected_action_llh_output = np.array([
        expected_log_probs_output["a"][0][0][action["a"][0][0]],
        expected_log_probs_output["a"][0][1][action["a"][0][1]],
        expected_log_probs_output["a"][0][2][action["a"][0][2]],
    ]) + np.array([
        expected_log_probs_output["a"][1][0][action["a"][1][0]],
        expected_log_probs_output["a"][1][1][action["a"][1][1]],
        expected_log_probs_output["a"][1][2][action["a"][1][2]],
    ]) + np.array([
        expected_log_probs_output["b"]["ba"][0][action["b"]["ba"][0]],
        expected_log_probs_output["b"]["ba"][1][action["b"]["ba"][1]],
        expected_log_probs_output["b"]["ba"][2][action["b"]["ba"][2]],
    ])
    test.test(
        ("get_log_likelihood", [nn_input, action]),
        expected_outputs=dict(
            log_likelihood=expected_action_llh_output,
            adapter_outputs=expected_q_values_output
        ),
        decimals=5
    )
    recursive_assert_almost_equal(expected_action_llh_output, llh, decimals=5)

    # Stochastic sample.
    out = test.test(("get_stochastic_action", nn_input), expected_outputs=None)
    self.assertTrue(out["action"]["a"][0].dtype == np.int32)
    self.assertTrue(out["action"]["a"][0].shape == (3,))
    self.assertTrue(out["action"]["a"][1].dtype == np.int32)
    self.assertTrue(out["action"]["a"][1].shape == (3,))
    self.assertTrue(out["action"]["b"]["ba"].dtype == np.int32)
    self.assertTrue(out["action"]["b"]["ba"].shape == (3,))

    # Deterministic sample.
    out = test.test(("get_deterministic_action", nn_input), expected_outputs=None)
    self.assertTrue(out["action"]["a"][0].dtype == np.int32)
    self.assertTrue(out["action"]["a"][0].shape == (3,))
    self.assertTrue(out["action"]["a"][1].dtype == np.int32)
    self.assertTrue(out["action"]["a"][1].shape == (3,))
    self.assertTrue(out["action"]["b"]["ba"].dtype == np.int32)
    self.assertTrue(out["action"]["b"]["ba"].shape == (3,))

    # Distribution's entropy.
    out = test.test(("get_entropy", nn_input), expected_outputs=None)
    self.assertTrue(out["entropy"]["a"][0].dtype == np.float32)
    self.assertTrue(out["entropy"]["a"][0].shape == (3,))
    self.assertTrue(out["entropy"]["a"][1].dtype == np.float32)
    self.assertTrue(out["entropy"]["a"][1].shape == (3,))
    self.assertTrue(out["entropy"]["b"]["ba"].dtype == np.float32)
    self.assertTrue(out["entropy"]["b"]["ba"].shape == (3,))
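
# Sketch of the dueling aggregation checked above (the standard dueling-DQN identity):
# Q(s, a) = V(s) + A(s, a) - mean_a' A(s, a'). Subtracting the mean advantage keeps the
# (V, A) decomposition identifiable. Illustration only; the names are hypothetical.
def _dueling_q_sketch(state_value, advantages):
    # state_value: shape (batch, 1); advantages: shape (batch, num_actions).
    return state_value + advantages - np.mean(advantages, axis=-1, keepdims=True)
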
def test_environment_stepper_on_deterministic_env_with_action_probs_lstm(self):
    internal_states_space = Tuple(FloatBox(shape=(3,)), FloatBox(shape=(3,)))
    preprocessor_spec = [dict(type="multiply", factor=0.1)]
    network_spec = config_from_path("configs/test_lstm_nn.json")
    exploration_spec = None
    actor_component = ActorComponent(
        preprocessor_spec,
        dict(network_spec=network_spec, action_space=self.deterministic_env_action_space),
        exploration_spec
    )
    environment_stepper = EnvironmentStepper(
        environment_spec=dict(type="deterministic_env", steps_to_terminal=3),
        actor_component_spec=actor_component,
        state_space=self.deterministic_env_state_space,
        reward_space="float32",
        internal_states_space=internal_states_space,
        add_action_probs=True,
        action_probs_space=self.deterministic_action_probs_space,
        num_steps=4,
    )
    test = ComponentTest(
        component=environment_stepper,
        action_space=self.deterministic_env_action_space,
    )

    weights = test.read_variable_values(environment_stepper.actor_component.policy.variables)
    policy_scope = "environment-stepper/actor-component/policy/"
    weights_lstm = weights[policy_scope + "test-lstm-network/lstm-layer/lstm-cell/kernel"]
    biases_lstm = weights[policy_scope + "test-lstm-network/lstm-layer/lstm-cell/bias"]
    weights_action = weights[policy_scope + "action-adapter-0/action-network/action-layer/dense/kernel"]
    biases_action = weights[policy_scope + "action-adapter-0/action-network/action-layer/dense/bias"]

    # Step 4 times through the Env and collect results. The LSTM state is threaded through
    # (the env terminates after 3 steps, so step 4 sees the reset state 0.0 again).
    lstm_1 = lstm_layer(np.array([[[0.0]]]), weights_lstm, biases_lstm)
    lstm_2 = lstm_layer(np.array([[[0.1]]]), weights_lstm, biases_lstm, lstm_1[1])
    lstm_3 = lstm_layer(np.array([[[0.2]]]), weights_lstm, biases_lstm, lstm_2[1])
    lstm_4 = lstm_layer(np.array([[[0.0]]]), weights_lstm, biases_lstm, lstm_3[1])
    expected = (
        np.array([False, False, True, False]),
        # s' (raw)
        np.array([[0.0], [1.0], [2.0], [0.0], [1.0]]),
        # action probs
        np.array([
            softmax(dense_layer(np.squeeze(lstm_1[0]), weights_action, biases_action)),
            softmax(dense_layer(np.squeeze(lstm_2[0]), weights_action, biases_action)),
            softmax(dense_layer(np.squeeze(lstm_3[0]), weights_action, biases_action)),
            softmax(dense_layer(np.squeeze(lstm_4[0]), weights_action, biases_action)),
        ]),
        # internal states
        (
            np.squeeze(np.array([[[0.0, 0.0, 0.0]], lstm_1[1][0], lstm_2[1][0], lstm_3[1][0], lstm_4[1][0]])),
            np.squeeze(np.array([[[0.0, 0.0, 0.0]], lstm_1[1][1], lstm_2[1][1], lstm_3[1][1], lstm_4[1][1]]))
        )
    )
    test.test("step", expected_outputs=expected)

    # Make sure we close the session (to shut down the Env on the server).
    test.terminate()
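
# Rough numpy sketch of a single LSTM step for reference (illustration only; it assumes
# the TF-style LSTMCell layout the `lstm_layer` test helper appears to emulate: a kernel
# of shape (input_dim + units, 4 * units), gate order i, j, f, o, and a forget bias of 1.0):
def _lstm_step_sketch(x, c, h, kernel, bias, forget_bias=1.0):
    gates = np.matmul(np.concatenate([x, h], axis=-1), kernel) + bias
    i, j, f, o = np.split(gates, 4, axis=-1)  # input, new-input, forget, output gates
    sigmoid = lambda z: 1.0 / (1.0 + np.exp(-z))
    c_new = sigmoid(f + forget_bias) * c + sigmoid(i) * np.tanh(j)  # new cell state
    h_new = sigmoid(o) * np.tanh(c_new)  # new hidden state (the layer's output)
    return h_new, (c_new, h_new)
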
def test_double_dqn_on_2x2_grid_world_with_container_actions(self):
    """
    Creates a double DQNAgent and runs it via a Runner on a simple 2x2 GridWorld
    using container actions.
    """
    # ftj = forward + turn + jump
    env_spec = dict(world="2x2", action_type="ftj", state_representation="xy+orientation")
    dummy_env = GridWorld.from_spec(env_spec)
    agent_config = config_from_path("configs/dqn_agent_for_2x2_gridworld_with_container_actions.json")
    preprocessing_spec = agent_config.pop("preprocessing_spec")

    agent = DQNAgent.from_spec(
        agent_config,
        double_q=True,
        dueling_q=False,
        state_space=FloatBox(shape=(4,)),
        action_space=dummy_env.action_space,
        execution_spec=dict(seed=15),
        store_last_q_table=True
    )

    time_steps = 10000
    worker = SingleThreadedWorker(
        env_spec=lambda: GridWorld.from_spec(env_spec),
        agent=agent,
        preprocessing_spec=preprocessing_spec,
        worker_executes_preprocessing=True,
        render=False
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)

    print("LAST q-table:\n{}".format(agent.last_q_table))

    self.assertEqual(results["timesteps_executed"], time_steps)
    self.assertEqual(results["env_frames"], time_steps)
    self.assertGreaterEqual(results["mean_episode_reward"], -2.0)
    self.assertGreaterEqual(results["max_episode_reward"], -1.0)
    self.assertLessEqual(results["episodes_executed"], time_steps / 3)

    # Check q-table for correct values.
    expected_q_values_per_state = {
        (0., 0., -1., 0.): {"forward": (-5.0, -1.0, -1.0), "jump": (0.0, -1.0)},
        (0., 0., 1., 0.): {"forward": (0.0, -1.0, -1.0), "jump": (0.0, -1.0)},
        (0., 0., 0., -1.): {"forward": (0.0, -1.0, -1.0), "jump": (0.0, -1.0)},
        (0., 0., 0., 1.): {"forward": (0.0, -1.0, -1.0), "jump": (0.0, -1.0)},
        (0., 1., -1., 0.): {"forward": (0.0, -1.0, -1.0), "jump": (0.0, -1.0)},
        (0., 1., 1., 0.): {"forward": (0.0, -1.0, -1.0), "jump": (0.0, -1.0)},
        (0., 1., 0., -1.): {"forward": (0.0, -1.0, -1.0), "jump": (0.0, -1.0)},
        (0., 1., 0., 1.): {"forward": (0.0, -1.0, -1.0), "jump": (0.0, -1.0)},
    }
    for state, q_values_forward, q_values_jump in zip(
            agent.last_q_table["states"],
            agent.last_q_table["q_values"]["forward"],
            agent.last_q_table["q_values"]["jump"]):
        state, q_values_forward, q_values_jump = tuple(state), tuple(q_values_forward), tuple(q_values_jump)
        assert state in expected_q_values_per_state, \
            "ERROR: state '{}' not expected in q-table as it's a terminal state!".format(state)
        recursive_assert_almost_equal(q_values_forward, expected_q_values_per_state[state]["forward"], decimals=0)
        recursive_assert_almost_equal(q_values_jump, expected_q_values_per_state[state]["jump"], decimals=0)
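
# Sketch of the double-Q target the agent above enables via `double_q=True` (illustration
# only; the variable names and discount value are placeholders, not the config's values):
# the online net selects argmax actions in s', the target net evaluates them, which
# reduces the overestimation bias of vanilla DQN.
def _double_q_target_sketch(rewards, terminals, q_online_sp, q_target_sp, gamma=0.99):
    # q_online_sp / q_target_sp: Q-values for s' from the online / target net, shape (batch, A).
    best_actions = np.argmax(q_online_sp, axis=-1)  # action selection: online net
    q_next = q_target_sp[np.arange(len(best_actions)), best_actions]  # evaluation: target net
    return rewards + gamma * (1.0 - terminals.astype(np.float32)) * q_next
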
def test_shared_value_function_policy_for_discrete_action_space(self):
    # state_space (NN is a simple single fc-layer relu network (2 units), random biases, random weights).
    state_space = FloatBox(shape=(4,), add_batch_rank=True)

    # action_space (3 possible actions).
    action_space = IntBox(3, add_batch_rank=True)
    flat_float_action_space = FloatBox(shape=(3,), add_batch_rank=True)

    # Policy with baseline action adapter.
    shared_value_function_policy = SharedValueFunctionPolicy(
        network_spec=config_from_path("configs/test_lrelu_nn.json"),
        action_space=action_space
    )
    test = ComponentTest(
        component=shared_value_function_policy,
        input_spaces=dict(
            nn_input=state_space,
            actions=action_space,
            probabilities=flat_float_action_space,
            parameters=flat_float_action_space,
            logits=flat_float_action_space
        ),
        action_space=action_space,
    )
    policy_params = test.read_variable_values(shared_value_function_policy.variables)

    # Some NN inputs (4 input nodes, batch size=3).
    states = state_space.sample(size=3)
    # Raw NN-output (3 hidden nodes).
    expected_nn_output = relu(np.matmul(
        states,
        policy_params["shared-value-function-policy/test-network/hidden-layer/dense/kernel"]
    ), 0.1)
    test.test(("get_nn_output", states), expected_outputs=dict(output=expected_nn_output), decimals=5)

    # Raw action layer output; expected shape=(3, 3): 3=batch, 3=action categories.
    expected_action_layer_output = np.matmul(
        expected_nn_output,
        policy_params["shared-value-function-policy/action-adapter-0/action-network/action-layer/dense/kernel"]
    )
    test.test(("get_action_layer_output", states), expected_outputs=dict(output=expected_action_layer_output), decimals=5)

    # State-values: One for each item in the batch.
    expected_state_value_output = np.matmul(
        expected_nn_output,
        policy_params["shared-value-function-policy/value-function-node/dense-layer/dense/kernel"]
    )
    test.test(("get_state_values", states), expected_outputs=dict(state_values=expected_state_value_output), decimals=5)

    # Logits-values.
    test.test(
        ("get_state_values_logits_probabilities_log_probs", states, ["state_values", "logits"]),
        expected_outputs=dict(state_values=expected_state_value_output, logits=expected_action_layer_output),
        decimals=5
    )

    # Parameters (probabilities): softmaxed logits.
    expected_probabilities_output = softmax(expected_action_layer_output, axis=-1)
    test.test(
        ("get_logits_probabilities_log_probs", states, ["logits", "probabilities"]),
        expected_outputs=dict(
            logits=expected_action_layer_output,
            probabilities=expected_probabilities_output
        ),
        decimals=5
    )
    print("Probs: {}".format(expected_probabilities_output))

    expected_actions = np.argmax(expected_action_layer_output, axis=-1)
    test.test(("get_action", states), expected_outputs=dict(action=expected_actions))

    # Stochastic sample.
    out = test.test(("get_stochastic_action", states), expected_outputs=None)
    self.assertTrue(out["action"].dtype == np.int32)
    self.assertTrue(out["action"].shape == (3,))

    # Deterministic sample.
    out = test.test(("get_deterministic_action", states), expected_outputs=None)
    self.assertTrue(out["action"].dtype == np.int32)
    self.assertTrue(out["action"].shape == (3,))

    # Distribution's entropy.
    out = test.test(("get_entropy", states), expected_outputs=None)
    self.assertTrue(out["entropy"].dtype == np.float32)
    self.assertTrue(out["entropy"].shape == (3,))
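
# Closing note (illustration only): the deterministic-action checks in these tests rely on
# softmax being monotonic, so the max-likelihood action of a categorical head can be read
# straight off the logits: argmax(softmax(logits)) == argmax(logits).
def _deterministic_action_sketch(logits):
    # logits: shape (batch, num_categories) -> int32 action indices, shape (batch,).
    return np.argmax(logits, axis=-1).astype(np.int32)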