def test_actor_component_with_lstm_network(self):
    # State space and internal-state space.
    state_space = FloatBox(shape=(2,), add_batch_rank=True, add_time_rank=True, time_major=False)
    internal_states_space = Tuple(FloatBox(shape=(3,)), FloatBox(shape=(3,)), add_batch_rank=True)
    time_step_space = IntBox()
    # action_space.
    action_space = IntBox(2, add_batch_rank=True, add_time_rank=True)

    preprocessor = PreprocessorStack.from_spec([
        dict(type="convert_type", to_dtype="float"),
        dict(type="divide", divisor=10)
    ])
    policy = Policy(network_spec=config_from_path("configs/test_lstm_nn.json"), action_space=action_space)
    exploration = Exploration(epsilon_spec=dict(decay_spec=dict(
        type="linear_decay", from_=1.0, to_=0.1, start_timestep=0, num_timesteps=100
    )))
    actor_component = ActorComponent(preprocessor, policy, exploration)

    test = ComponentTest(
        component=actor_component,
        input_spaces=dict(
            states=state_space,
            other_nn_inputs=Tuple(internal_states_space, add_batch_rank=True),
            time_step=time_step_space
        ),
        action_space=action_space
    )

    # Some state inputs (batch size=2, seq-len=1000; batch-major).
    np.random.seed(10)
    states = state_space.sample(size=(1000, 2))
    initial_internal_states = internal_states_space.zeros(size=2)  # only batch-rank
    time_steps = time_step_space.sample(1000)

    # Run a single time-step n times to simulate acting and env interaction with an LSTM.
    # NOTE: `np.float`/`np.int` are deprecated aliases; use the builtins instead.
    preprocessed_states = np.ndarray(shape=(1000, 2, 2), dtype=float)
    actions = np.ndarray(shape=(1000, 2, 1), dtype=int)
    for i, time_step in enumerate(time_steps):
        ret = test.test((
            "get_preprocessed_state_and_action",
            # Expand the time dim at the 1st slot as we are time-major == False.
            [np.expand_dims(states[i], 1), tuple([initial_internal_states]), time_step]
        ))
        preprocessed_states[i] = ret["preprocessed_state"][:, 0, :]  # take out the time-rank again
        actions[i] = ret["action"]
        # Check c/h-state shapes.
        self.assertEqual(ret["nn_outputs"][1][0].shape, (2, 3))  # batch-size=2, LSTM units=3
        self.assertEqual(ret["nn_outputs"][1][1].shape, (2, 3))

    # Check all preprocessed states (easy: just divided by 10).
    expected_preprocessed_state = states / 10
    recursive_assert_almost_equal(preprocessed_states, expected_preprocessed_state)

    # Check the exploration functionality over the actions.
    # Not checking the mean as we are mostly in the non-exploratory region; that's why the stddev should be small.
    stddev_actions = actions.std()
    self.assertGreater(stddev_actions, 0.4)
    self.assertLess(stddev_actions, 0.6)
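
# A minimal sketch (not part of the original test) of the linear epsilon decay the
# Exploration component above is configured with (from_=1.0 down to to_=0.1 over
# 100 timesteps). The function name and signature are illustrative assumptions,
# not RLgraph's actual implementation.
def linear_decay_sketch(time_step, from_=1.0, to_=0.1, start_timestep=0, num_timesteps=100):
    # Clip elapsed time into [0, num_timesteps], then interpolate linearly.
    t = np.clip(time_step - start_timestep, 0, num_timesteps)
    return from_ + (to_ - from_) * (t / num_timesteps)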
def test_simple_actor_component(self):
    # state_space (NN is a simple single fc-layer relu network (2 units), random biases, random weights).
    state_space = FloatBox(shape=(5,), add_batch_rank=True)
    # action_space.
    action_space = IntBox(10)

    preprocessor = PreprocessorStack.from_spec(
        [dict(type="convert_type", to_dtype="float"), dict(type="multiply", factor=2)]
    )
    policy = Policy(network_spec=config_from_path("configs/test_simple_nn.json"), action_space=action_space)
    exploration = Exploration()  # no exploration
    actor_component = ActorComponent(preprocessor, policy, exploration)

    test = ComponentTest(
        component=actor_component, input_spaces=dict(states=state_space), action_space=action_space
    )

    # Get and check some actions.
    actor_component_params = test.read_variable_values(actor_component.variable_registry)

    # Some state inputs (5 input nodes, batch size=2).
    states = state_space.sample(2)
    # Expected NN-output.
    expected_nn_output = np.matmul(
        states * 2,
        actor_component_params["actor-component/policy/test-network/hidden-layer/dense/kernel"]
    )
    # Raw action-layer output.
    expected_action_layer_output = np.matmul(
        expected_nn_output,
        actor_component_params["actor-component/policy/action-adapter-0/action-network/action-layer/dense/kernel"]
    )
    # Final actions (max-likelihood/greedy pick).
    expected_actions = np.argmax(expected_action_layer_output, axis=-1)
    expected_preprocessed_state = states * 2
    test.test(("get_preprocessed_state_and_action", states), expected_outputs=dict(
        preprocessed_state=expected_preprocessed_state, action=expected_actions, nn_outputs=expected_nn_output
    ), decimals=5)

    # Get actions and action-probs by calling a different API-method.
    states = state_space.sample(5)
    # Get and check some actions.
    actor_component_params = test.read_variable_values(actor_component.variable_registry)
    # Expected NN-output.
    expected_nn_output = np.matmul(
        states * 2,
        actor_component_params["actor-component/policy/test-network/hidden-layer/dense/kernel"]
    )
    # Raw action-layer output.
    expected_action_layer_output = np.matmul(
        expected_nn_output,
        actor_component_params["actor-component/policy/action-adapter-0/action-network/action-layer/dense/kernel"]
    )
    # No reshape necessary (simple action space); softmax to get probs.
    expected_action_probs = softmax(expected_action_layer_output)
    # Final actions (max-likelihood/greedy pick).
    expected_actions = np.argmax(expected_action_layer_output, axis=-1)
    expected_preprocessed_state = states * 2
    test.test(("get_preprocessed_state_action_and_action_probs", states), expected_outputs=dict(
        preprocessed_state=expected_preprocessed_state, action=expected_actions,
        action_probs=expected_action_probs, nn_outputs=expected_nn_output
    ), decimals=5)
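
# A minimal numpy sketch (illustrative only) of the softmax used in the test above
# to turn raw action-layer outputs into action probabilities; the test relies on a
# `softmax` helper, whose semantics this sketch assumes.
def softmax_sketch(x, axis=-1):
    shifted = x - np.max(x, axis=axis, keepdims=True)  # subtract max for numerical stability
    e = np.exp(shifted)
    return e / np.sum(e, axis=axis, keepdims=True)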