def test_multi_lstm_layer(self):
    return  # TODO: finish this test case.
    # NOTE: The body below still references variational-auto-encoder scopes (copied from another test)
    # and must be rewritten for MultiLSTMLayer before re-enabling.

    # Tests a double MultiLSTMLayer.
    input_spaces = dict(
        inputs=FloatBox(shape=(3,), add_batch_rank=True, add_time_rank=True),
        initial_c_and_h_states=Tuple(
            Tuple(FloatBox(shape=(5,)), FloatBox(shape=(5,))),
            Tuple(FloatBox(shape=(5,)), FloatBox(shape=(5,))),
            add_batch_rank=True
        )
    )
    multi_lstm_layer = MultiLSTMLayer(
        num_lstms=2,
        units=5,
        # Full skip connections (x goes into both layers, out0 goes into layer1).
        skip_connections=[[True, False], [True, True]]
    )

    # Do not seed, we calculate expectations manually.
    test = ComponentTest(component=multi_lstm_layer, input_spaces=input_spaces)

    # Batch of size=n, time-steps=m.
    input_ = input_spaces["inputs"].sample((2, 3))

    global_scope = "variational-auto-encoder/"
    # Calculate output manually.
    var_dict = test.read_variable_values(multi_lstm_layer.variable_registry)

    encoder_network_out = dense_layer(
        input_,
        var_dict[global_scope + "encoder-network/encoder-layer/dense/kernel"],
        var_dict[global_scope + "encoder-network/encoder-layer/dense/bias"]
    )
    expected_mean = dense_layer(
        encoder_network_out,
        var_dict[global_scope + "mean-layer/dense/kernel"],
        var_dict[global_scope + "mean-layer/dense/bias"]
    )
    expected_stddev = dense_layer(
        encoder_network_out,
        var_dict[global_scope + "stddev-layer/dense/kernel"],
        var_dict[global_scope + "stddev-layer/dense/bias"]
    )
    out = test.test(("encode", input_), expected_outputs=None)
    recursive_assert_almost_equal(out["mean"], expected_mean, decimals=5)
    recursive_assert_almost_equal(out["stddev"], expected_stddev, decimals=5)
    self.assertTrue(out["z_sample"].shape == (3, 1))

    test.terminate()
def test_impala_actor_compilation(self):
    """
    Tests IMPALA agent compilation (actor).
    """
    try:
        from rlgraph.environments.deepmind_lab import DeepmindLabEnv
    except ImportError:
        print("Deepmind Lab not installed: Will skip this test.")
        return

    agent_config = config_from_path("configs/impala_agent_for_deepmind_lab_env.json")
    env = DeepmindLabEnv(
        level_id="seekavoid_arena_01", observations=["RGB_INTERLEAVED", "INSTR"], frameskip=4
    )
    actor_agent = IMPALAAgent.from_spec(
        agent_config,
        type="actor",
        state_space=env.state_space,
        action_space=env.action_space,
        internal_states_space=Tuple(FloatBox(shape=(256,)), FloatBox(shape=(256,)), add_batch_rank=True),
        # Avoid session-creation hanging in docker by disabling monitoring.
        execution_spec=dict(disable_monitoring=True)
    )
    # Start the Specifiable Server with the Env manually.
    actor_agent.environment_stepper.environment_server.start()
    print("Compiled IMPALA type=actor agent.")
    actor_agent.environment_stepper.environment_server.stop()
def test_keras_style_one_container_input_space(self):
    # Define one container input Space.
    input_space = Tuple(IntBox(3), FloatBox(shape=(4,)), add_batch_rank=True)

    # One-hot flatten the int tensor.
    flatten_layer_out = ReShape(flatten=True, flatten_categories=True)(input_space[0])
    # Run the float tensor through two dense layers.
    dense_1_out = DenseLayer(units=3, scope="d1")(input_space[1])
    dense_2_out = DenseLayer(units=5, scope="d2")(dense_1_out)
    # Concat everything.
    cat_out = ConcatLayer()(flatten_layer_out, dense_2_out)

    # Use the `outputs` arg to allow your network to trace back the data flow until the input space.
    # `inputs` is not needed here as we only have one single input (the Tuple).
    neural_net = NeuralNetwork(outputs=cat_out)

    test = ComponentTest(component=neural_net, input_spaces=dict(inputs=input_space))

    var_dict = neural_net.variable_registry
    w1_value = test.read_variable_values(var_dict["neural-network/d1/dense/kernel"])
    b1_value = test.read_variable_values(var_dict["neural-network/d1/dense/bias"])
    w2_value = test.read_variable_values(var_dict["neural-network/d2/dense/kernel"])
    b2_value = test.read_variable_values(var_dict["neural-network/d2/dense/bias"])

    # Batch of size=n.
    input_ = input_space.sample(4)

    expected = np.concatenate([  # concat everything
        one_hot(input_[0]),  # int flattening
        dense_layer(dense_layer(input_[1], w1_value, b1_value), w2_value, b2_value)  # float -> 2 x dense
    ], axis=-1)
    out = test.test(("call", tuple([input_])), expected_outputs=expected)

    test.terminate()
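# (editor's note) The manual expectation math in these tests relies on small numpy helpers
# (`one_hot`, `dense_layer`, `softmax`) imported from rlgraph's test utilities. A minimal sketch of
# what such helpers might look like, assuming linear (no-activation) dense layers and standard
# one-hot/softmax semantics; the `_sketch` suffix marks these as illustrative, not the real helpers:

import numpy as np


def one_hot_sketch(x, depth):
    """One-hot encodes the int tensor `x` into `depth` categories (appended as a new last axis)."""
    out = np.zeros(x.shape + (depth,), dtype=np.float32)
    np.put_along_axis(out, np.expand_dims(x, axis=-1), 1.0, axis=-1)
    return out


def dense_layer_sketch(x, kernel, bias):
    """A dense layer without activation: x @ W + b."""
    return np.matmul(x, kernel) + bias


def softmax_sketch(logits, axis=-1):
    """Numerically stable softmax along `axis`."""
    shifted = logits - np.max(logits, axis=axis, keepdims=True)
    e = np.exp(shifted)
    return e / np.sum(e, axis=axis, keepdims=True)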
def test_calculate_gradients(self):
    return  # Test case deactivated.

    optimizer = GradientDescentOptimizer(learning_rate=0.01)
    x = tf.Variable(2, name="x", dtype=tf.float32)
    log_x = tf.log(x)
    loss = tf.square(x=log_x)

    test = ComponentTest(
        component=optimizer,
        input_spaces=dict(
            loss=FloatBox(),
            variables=Dict({"x": FloatBox()}),
            loss_per_item=FloatBox(add_batch_rank=True),
            grads_and_vars=Tuple(Tuple(float, float))
        )
    )
    print(test.test(("calculate_gradients", [dict(x=x), loss]), expected_outputs=None))
def test_environment_stepper_on_deterministic_env_with_action_probs_lstm(self):
    internal_states_space = Tuple(FloatBox(shape=(3,)), FloatBox(shape=(3,)))
    preprocessor_spec = [dict(type="multiply", factor=0.1)]
    network_spec = config_from_path("configs/test_lstm_nn.json")
    exploration_spec = None
    actor_component = ActorComponent(
        preprocessor_spec,
        dict(network_spec=network_spec, action_space=self.deterministic_env_action_space),
        exploration_spec
    )
    environment_stepper = EnvironmentStepper(
        environment_spec=dict(type="deterministic_env", steps_to_terminal=3),
        actor_component_spec=actor_component,
        state_space=self.deterministic_env_state_space,
        reward_space="float32",
        internal_states_space=internal_states_space,
        add_action_probs=True,
        action_probs_space=self.deterministic_action_probs_space,
        num_steps=4,
    )
    test = ComponentTest(
        component=environment_stepper,
        action_space=self.deterministic_env_action_space,
    )

    weights = test.read_variable_values(environment_stepper.actor_component.policy.variable_registry)
    policy_scope = "environment-stepper/actor-component/policy/"
    weights_lstm = weights[policy_scope + "test-lstm-network/lstm-layer/lstm-cell/kernel"]
    biases_lstm = weights[policy_scope + "test-lstm-network/lstm-layer/lstm-cell/bias"]
    weights_action = weights[policy_scope + "action-adapter-0/action-network/action-layer/dense/kernel"]
    biases_action = weights[policy_scope + "action-adapter-0/action-network/action-layer/dense/bias"]

    # Step 4 times (num_steps=4) through the Env and collect results.
    lstm_1 = lstm_layer(np.array([[[0.0]]]), weights_lstm, biases_lstm)
    lstm_2 = lstm_layer(np.array([[[0.1]]]), weights_lstm, biases_lstm, lstm_1[1])
    lstm_3 = lstm_layer(np.array([[[0.2]]]), weights_lstm, biases_lstm, lstm_2[1])
    lstm_4 = lstm_layer(np.array([[[0.0]]]), weights_lstm, biases_lstm, lstm_3[1])
    expected = (
        np.array([False, False, True, False]),
        np.array([[0.0], [1.0], [2.0], [0.0], [1.0]]),  # s' (raw)
        # action probs
        np.array([
            softmax(dense_layer(np.squeeze(lstm_1[0]), weights_action, biases_action)),
            softmax(dense_layer(np.squeeze(lstm_2[0]), weights_action, biases_action)),
            softmax(dense_layer(np.squeeze(lstm_3[0]), weights_action, biases_action)),
            softmax(dense_layer(np.squeeze(lstm_4[0]), weights_action, biases_action)),
        ]),
        # internal states
        (
            np.squeeze(np.array([[[0.0, 0.0, 0.0]], lstm_1[1][0], lstm_2[1][0], lstm_3[1][0], lstm_4[1][0]])),
            np.squeeze(np.array([[[0.0, 0.0, 0.0]], lstm_1[1][1], lstm_2[1][1], lstm_3[1][1], lstm_4[1][1]]))
        )
    )
    test.test("step", expected_outputs=expected)

    # Make sure we close the session (to shut down the Env on the server).
    test.terminate()
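# (editor's note) `lstm_layer` above replays the TF LSTM-cell math in numpy to build expected step-wise
# outputs. A minimal single-step sketch, assuming TF1's BasicLSTMCell conventions (one fused kernel over
# [x, h], gate order i, g, f, o, forget_bias=1.0); the real helper also iterates over the time axis:

import numpy as np


def _sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))


def lstm_cell_step_sketch(x_t, kernel, bias, c, h, forget_bias=1.0):
    """One LSTM step for a single time slice; returns (output, (new_c, new_h))."""
    # TF concatenates the input and the previous h-state, then applies one fused kernel/bias.
    gates = np.matmul(np.concatenate([x_t, h], axis=-1), kernel) + bias
    i, g, f, o = np.split(gates, 4, axis=-1)
    new_c = _sigmoid(f + forget_bias) * c + _sigmoid(i) * np.tanh(g)
    new_h = _sigmoid(o) * np.tanh(new_c)
    return new_h, (new_c, new_h)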
def _prepare_loss_function_test(loss_function):
    test = ComponentTest(
        component=loss_function,
        input_spaces=dict(
            alpha=float,
            log_probs_next_sampled=FloatBox(shape=(1,), add_batch_rank=True),
            q_values_next_sampled=Tuple(FloatBox(shape=(1,)), FloatBox(shape=(1,)), add_batch_rank=True),
            q_values=Tuple(FloatBox(shape=(1,)), FloatBox(shape=(1,)), add_batch_rank=True),
            log_probs_sampled=FloatBox(shape=(1,), add_batch_rank=True),
            q_values_sampled=Tuple(FloatBox(shape=(1,)), FloatBox(shape=(1,)), add_batch_rank=True),
            rewards=FloatBox(add_batch_rank=True),
            terminals=BoolBox(add_batch_rank=True),
            loss_per_item=FloatBox(add_batch_rank=True)
        ),
        action_space=IntBox(2, shape=(), add_batch_rank=True)
    )
    return test
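# (editor's note) Hypothetical usage of the helper above. The "loss" API-method name and the argument
# order are assumptions inferred from the input-space names, not confirmed by this snippet:
#
#     test = _prepare_loss_function_test(my_loss_function)
#     test.test(
#         ("loss", [alpha, log_probs_next_sampled, q_values_next_sampled, q_values,
#                   log_probs_sampled, q_values_sampled, rewards, terminals]),
#         expected_outputs=None
#     )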
def test_multi_input_stream_neural_network_with_tuple(self):
    # Space must contain batch dimension (otherwise, NNLayer will complain).
    input_space = Tuple(
        IntBox(3, shape=()),
        FloatBox(shape=(8,)),
        IntBox(4, shape=()),
        add_batch_rank=True
    )

    multi_input_nn = MultiInputStreamNeuralNetwork(
        input_network_specs=(
            [{"type": "reshape", "flatten": True, "flatten_categories": True}],  # intbox -> flatten
            [{"type": "dense", "units": 2}],  # floatbox -> dense
            [{"type": "reshape", "flatten": True, "flatten_categories": True}]   # intbox -> flatten
        ),
        post_network_spec=[{"type": "dense", "units": 3}],
    )

    test = ComponentTest(component=multi_input_nn, input_spaces=dict(inputs=input_space))

    # Batch of size=n.
    nn_inputs = input_space.sample(3)

    global_scope_pre = "multi-input-stream-nn/input-stream-nn-"
    global_scope_post = "multi-input-stream-nn/post-concat-nn/dense-layer/dense/"
    # Calculate output manually.
    var_dict = test.read_variable_values()

    flat_0 = one_hot(nn_inputs[0], depth=3)
    dense_1 = dense_layer(
        nn_inputs[1],
        var_dict[global_scope_pre + "1/dense-layer/dense/kernel"],
        var_dict[global_scope_pre + "1/dense-layer/dense/bias"]
    )
    flat_2 = one_hot(nn_inputs[2], depth=4)
    concat_out = np.concatenate((flat_0, dense_1, flat_2), axis=-1)
    expected = dense_layer(concat_out, var_dict[global_scope_post + "kernel"], var_dict[global_scope_post + "bias"])

    test.test(("call", tuple([nn_inputs])), expected_outputs=expected)

    test.terminate()
def test_impala_actor_compilation(self):
    """
    Tests IMPALA agent compilation (actor).
    """
    return  # Test case deactivated.
    if get_backend() == "pytorch":
        return
    try:
        from rlgraph.environments.deepmind_lab import DeepmindLabEnv
    except ImportError:
        print("Deepmind Lab not installed: Will skip this test.")
        return

    agent_config = config_from_path("configs/impala_agent_for_deepmind_lab_env.json")
    env_spec = dict(level_id="seekavoid_arena_01", observations=["RGB_INTERLEAVED", "INSTR"], frameskip=4)
    dummy_env = DeepmindLabEnv.from_spec(env_spec)
    agent = IMPALAAgent.from_spec(
        agent_config,
        type="actor",
        state_space=dummy_env.state_space,
        action_space=dummy_env.action_space,
        internal_states_space=Tuple(FloatBox(shape=(256,)), FloatBox(shape=(256,)), add_batch_rank=False),
        environment_spec=default_dict(dict(type="deepmind-lab"), env_spec),
        # Avoid session-creation hanging in docker by disabling monitoring.
        execution_spec=dict(
            session_config=dict(
                type="monitored-training-session",
                auto_start=False
            ),
            disable_monitoring=True
        )
    )
    # Start the Specifiable Server with the Env manually (monitoring is disabled).
    agent.environment_stepper.environment_server.start_server()
    print("Compiled {}".format(agent))
    agent.environment_stepper.environment_server.stop_server()
    agent.terminate()
def test_impala_learner_compilation(self):
    """
    Tests IMPALA agent compilation (learner).
    """
    try:
        from rlgraph.environments.deepmind_lab import DeepmindLabEnv
    except ImportError:
        print("Deepmind Lab not installed: Will skip this test.")
        return

    agent_config = config_from_path("configs/impala_agent_for_deepmind_lab_env.json")
    env = DeepmindLabEnv(
        level_id="seekavoid_arena_01", observations=["RGB_INTERLEAVED", "INSTR"], frameskip=4
    )
    learner_agent = IMPALAAgent.from_spec(
        agent_config,
        type="learner",
        state_space=env.state_space,
        action_space=env.action_space,
        internal_states_space=Tuple(FloatBox(shape=(256,)), FloatBox(shape=(256,)), add_batch_rank=True),
    )
    print("Compiled IMPALA type=learner agent.")
def test_lstm_nn_with_custom_apply(self):
    # Space must contain batch dimension (otherwise, NNLayer will complain).
    units = 3
    batch_size = 2
    time_steps = 4
    input_nodes = 2
    input_space = FloatBox(shape=(input_nodes,), add_batch_rank=True, add_time_rank=True)
    internal_states_space = Tuple(FloatBox(shape=(units,)), FloatBox(shape=(units,)), add_batch_rank=True)

    def custom_apply(self, input_, internal_states=None):
        d0_out = self.get_sub_component_by_name("d0").apply(input_)
        lstm_out = self.get_sub_component_by_name("lstm").apply(d0_out, internal_states)
        d1_out = self.get_sub_component_by_name("d1").apply(lstm_out["output"])
        return dict(output=d1_out, last_internal_states=lstm_out["last_internal_states"])

    # Create a simple neural net with the above custom API-method.
    neural_net = NeuralNetwork(
        DenseLayer(units, scope="d0"),
        LSTMLayer(units, scope="lstm"),
        DenseLayer(units, scope="d1"),
        api_methods={("apply", custom_apply)}
    )

    # Do not seed, we calculate expectations manually.
    test = ComponentTest(
        component=neural_net,
        input_spaces=dict(input_=input_space, internal_states=internal_states_space)
    )

    # Batch of size=2, time-steps=4.
    input_ = input_space.sample((batch_size, time_steps))
    internal_states = internal_states_space.sample(batch_size)

    # Calculate output manually.
    w0_value = test.read_variable_values(neural_net.variable_registry["neural-network/d0/dense/kernel"])
    b0_value = test.read_variable_values(neural_net.variable_registry["neural-network/d0/dense/bias"])
    w1_value = test.read_variable_values(neural_net.variable_registry["neural-network/d1/dense/kernel"])
    b1_value = test.read_variable_values(neural_net.variable_registry["neural-network/d1/dense/bias"])
    lstm_w_value = test.read_variable_values(neural_net.variable_registry["neural-network/lstm/lstm-cell/kernel"])
    lstm_b_value = test.read_variable_values(neural_net.variable_registry["neural-network/lstm/lstm-cell/bias"])

    d0_out = dense_layer(input_, w0_value, b0_value)
    lstm_out, last_internal_states = lstm_layer(
        d0_out, lstm_w_value, lstm_b_value, initial_internal_states=internal_states, time_major=False
    )
    d1_out = dense_layer(lstm_out, w1_value, b1_value)
    expected = dict(output=d1_out, last_internal_states=last_internal_states)

    test.test(("apply", [input_, internal_states]), expected_outputs=expected, decimals=5)

    test.terminate()
def __init__(self, file_name=None, worker_id=0, base_port=5005, seed=0, docker_training=False, no_graphics=False,
             timeout_wait=30, train_mode=True, **kwargs):
    """
    Args:
        file_name (Optional[str]): Name of the Unity environment binary.
        worker_id (int): Number to add to `base_port`. Used for asynchronous agent scenarios.
        base_port (int): Port number to connect to the Unity environment. `worker_id` increments on top of this.
        seed (int): Random seed to pass to the Unity environment.
        docker_training (bool): Informs this class, whether the process is being run within a container.
            Default: False.
        no_graphics (bool): Whether to run the Unity simulator in no-graphics mode. Default: False.
        timeout_wait (int): Time (in seconds) to wait for a connection from the environment.
        train_mode (bool): Whether to run in training mode, speeding up the simulation. Default: True.
    """
    # First create the UnityMLAgentsEnvironment to get state and action spaces, then create the RLgraph
    # Environment instance.
    self.mlagents_env = UnityEnvironment(
        file_name, worker_id, base_port, seed, docker_training, no_graphics
    )
    all_brain_info = self.mlagents_env.reset()
    # Get all possible information from AllBrainInfo.
    # TODO: Which scene do we pick?
    self.scene_key = next(iter(all_brain_info))
    first_brain_info = all_brain_info[self.scene_key]
    num_environments = len(first_brain_info.agents)

    state_space = {}
    if len(first_brain_info.vector_observations[0]) > 0:
        state_space["vector"] = get_space_from_op(first_brain_info.vector_observations[0])
        # TODO: This is a hack.
        if state_space["vector"].dtype == np.float64:
            state_space["vector"].dtype = np.float32
    if len(first_brain_info.visual_observations) > 0:
        state_space["visual"] = get_space_from_op(first_brain_info.visual_observations[0])
    if first_brain_info.text_observations[0]:
        state_space["text"] = get_space_from_op(first_brain_info.text_observations[0])

    if len(state_space) == 1:
        self.state_key = next(iter(state_space))
        state_space = state_space[self.state_key]
    else:
        self.state_key = None
        state_space = Dict(state_space)

    brain_params = next(iter(self.mlagents_env.brains.values()))
    if brain_params.vector_action_space_type == "discrete":
        highs = brain_params.vector_action_space_size
        # MultiDiscrete (Tuple(IntBox)).
        if any(h != highs[0] for h in highs):
            action_space = Tuple([IntBox(h) for h in highs])
        # Normal IntBox:
        else:
            action_space = IntBox(
                low=np.zeros_like(highs, dtype=np.int32),
                high=np.array(highs, dtype=np.int32),
                shape=(len(highs),)
            )
    else:
        action_space = get_space_from_op(first_brain_info.action_masks[0])
    if action_space.dtype == np.float64:
        action_space.dtype = np.float32

    super(MLAgentsEnv, self).__init__(
        num_environments=num_environments, state_space=state_space, action_space=action_space, **kwargs
    )

    # Caches the last observation we made (after stepping or resetting).
    self.last_state = None
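# (editor's note) Worked example of the discrete action-space branch in __init__ above (values are
# illustrative): brain sizes [2, 2, 3] differ, so the space becomes Tuple(IntBox(2), IntBox(2), IntBox(3));
# uniform sizes [3, 3] collapse to IntBox(low=[0, 0], high=[3, 3], shape=(2,)).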
class IMPALAAgent(Agent):
    """
    An Agent implementing the IMPALA algorithm described in [1]. The Agent contains both learner and actor
    API-methods, which will be put into the graph depending on the type ("single", "actor" or "learner").

    [1] IMPALA: Scalable Distributed Deep-RL with Importance Weighted Actor-Learner Architectures - Espeholt, Soyer,
        Munos et al. - 2018 (https://arxiv.org/abs/1802.01561)
    """
    default_internal_states_space = Tuple(FloatBox(shape=(256,)), FloatBox(shape=(256,)), add_batch_rank=False)
    default_environment_spec = dict(
        type="deepmind_lab", level_id="seekavoid_arena_01", observations=["RGB_INTERLEAVED", "INSTR"], frameskip=4
    )

    def __init__(self, discount=0.99, fifo_queue_spec=None, architecture="large", environment_spec=None,
                 feed_previous_action_through_nn=True, feed_previous_reward_through_nn=True,
                 weight_pg=None, weight_baseline=None, weight_entropy=None, worker_sample_size=100, **kwargs):
        """
        Args:
            discount (float): The discount factor gamma.
            architecture (str): Which IMPALA architecture to use. One of "small" or "large". Will be ignored if
                `network_spec` is given explicitly in kwargs. Default: "large".
            fifo_queue_spec (Optional[dict,FIFOQueue]): The spec for the FIFOQueue to use for the IMPALA algorithm.
            environment_spec (dict): The spec for constructing an Environment object for an actor-type IMPALA agent.
            feed_previous_action_through_nn (bool): Whether to add the previous action as another input channel to
                the ActionComponent's (NN's) input at each step. This is only possible if the state space is already
                a Dict. It will be added under the key "previous_action". Default: True.
            feed_previous_reward_through_nn (bool): Whether to add the previous reward as another input channel to
                the ActionComponent's (NN's) input at each step. This is only possible if the state space is already
                a Dict. It will be added under the key "previous_reward". Default: True.
            weight_pg (float): See IMPALALossFunction Component.
            weight_baseline (float): See IMPALALossFunction Component.
            weight_entropy (float): See IMPALALossFunction Component.
            worker_sample_size (int): How many steps the actor will perform in the environment each sample-run.

        Keyword Args:
            type (str): One of "single", "actor" or "learner". Default: "single".
        """
        type_ = kwargs.pop("type", "single")
        assert type_ in ["single", "actor", "learner"]
        self.type = type_
        self.worker_sample_size = worker_sample_size

        # Network-spec by default is a "large architecture" IMPALA network.
        self.network_spec = kwargs.pop(
            "network_spec",
            dict(type="rlgraph.components.neural_networks.impala.impala_networks.{}IMPALANetwork".format(
                "Large" if architecture == "large" else "Small"
            ))
        )
        if isinstance(self.network_spec, dict) and "type" in self.network_spec and \
                "IMPALANetwork" in self.network_spec["type"]:
            self.network_spec = default_dict(
                self.network_spec,
                dict(worker_sample_size=1 if self.type == "actor" else self.worker_sample_size + 1)
            )

        # Depending on the job-type, remove the pieces from the Agent-spec/graph we won't need.
        self.exploration_spec = kwargs.pop("exploration_spec", None)
        optimizer_spec = kwargs.pop("optimizer_spec", None)
        observe_spec = kwargs.pop("observe_spec", None)

        self.feed_previous_action_through_nn = feed_previous_action_through_nn
        self.feed_previous_reward_through_nn = feed_previous_reward_through_nn

        # Run everything in a single process.
        if self.type == "single":
            environment_spec = environment_spec or self.default_environment_spec
            update_spec = kwargs.pop("update_spec", None)
        # Actors won't need to learn (no optimizer needed in graph).
elif self.type == "actor": optimizer_spec = None update_spec = kwargs.pop("update_spec", dict(do_updates=False)) environment_spec = environment_spec or self.default_environment_spec # Learners won't need to explore (act) or observe (insert into Queue). else: observe_spec = None update_spec = kwargs.pop("update_spec", None) environment_spec = None # Add previous-action/reward preprocessors to env-specific preprocessor spec. # TODO: remove this empty hard-coded preprocessor. self.preprocessing_spec = kwargs.pop( "preprocessing_spec", dict( type="dict-preprocessor-stack", preprocessors=dict( # Flatten actions. previous_action=[ dict(type="reshape", flatten=True, flatten_categories=kwargs.get( "action_space").num_categories) ], # Bump reward and convert to float32, so that it can be concatenated by the Concat layer. previous_reward=[dict(type="reshape", new_shape=(1, ))]))) # Limit communication in distributed mode between each actor and the learner (never between actors). execution_spec = kwargs.pop("execution_spec", None) if execution_spec is not None and execution_spec.get( "mode") == "distributed": default_dict( execution_spec["session_config"], dict(type="monitored-training-session", allow_soft_placement=True, device_filters=["/job:learner/task:0"] + ([ "/job:actor/task:{}".format( execution_spec["distributed_spec"]["task_index"]) ] if self.type == "actor" else ["/job:learner/task:0"]))) # If Actor, make non-chief in either case (even if task idx == 0). if self.type == "actor": execution_spec["distributed_spec"]["is_chief"] = False # Hard-set device to the CPU for actors. execution_spec["device_strategy"] = "custom" execution_spec[ "default_device"] = "/job:{}/task:{}/cpu".format( self.type, execution_spec["distributed_spec"]["task_index"]) self.policy_spec = kwargs.pop("policy_spec", dict()) # TODO: Create some auto-setting based on LSTM inside the NN. default_dict( self.policy_spec, dict(type="shared-value-function-policy", deterministic=False, reuse_variable_scope="shared-policy", action_space=kwargs.get("action_space"))) # Now that we fixed the Agent's spec, call the super constructor. super(IMPALAAgent, self).__init__(discount=discount, preprocessing_spec=self.preprocessing_spec, network_spec=self.network_spec, policy_spec=self.policy_spec, exploration_spec=self.exploration_spec, optimizer_spec=optimizer_spec, observe_spec=observe_spec, update_spec=update_spec, execution_spec=execution_spec, name=kwargs.pop( "name", "impala-{}-agent".format(self.type)), **kwargs) # Always use 1st learner as the parameter server for all policy variables. if self.execution_spec["mode"] == "distributed" and self.execution_spec[ "distributed_spec"]["cluster_spec"]: self.policy.propagate_sub_component_properties( dict(device=dict(variables="/job:learner/task:0/cpu"))) # Check whether we have an RNN. self.has_rnn = self.policy.neural_network.has_rnn() # Check, whether we are running with GPU. self.has_gpu = self.execution_spec["gpu_spec"]["gpus_enabled"] is True and \ self.execution_spec["gpu_spec"]["num_gpus"] > 0 # Some FIFO-queue specs. self.fifo_queue_keys = ["terminals", "states"] + \ (["actions"] if not self.feed_previous_action_through_nn else []) + \ (["rewards"] if not self.feed_previous_reward_through_nn else []) + \ ["action_probs"] + \ (["initial_internal_states"] if self.has_rnn else []) # Define FIFO record space. # Note that only states and internal_states (RNN) contain num-steps+1 items, all other sub-records only contain # num-steps items. 
        self.fifo_record_space = Dict(
            {
                "terminals": bool,
                "action_probs": FloatBox(shape=(self.action_space.num_categories,)),
            },
            add_batch_rank=False,
            add_time_rank=self.worker_sample_size
        )
        self.fifo_record_space["states"] = self.state_space.with_time_rank(self.worker_sample_size + 1)
        # Feed previous action/reward through the NN (as part of the state) or keep them as extra record channels?
        if self.feed_previous_action_through_nn:
            self.fifo_record_space["states"]["previous_action"] = \
                self.action_space.with_time_rank(self.worker_sample_size + 1)
        else:
            self.fifo_record_space["actions"] = self.action_space.with_time_rank(self.worker_sample_size)
        if self.feed_previous_reward_through_nn:
            self.fifo_record_space["states"]["previous_reward"] = FloatBox(add_time_rank=self.worker_sample_size + 1)
        else:
            self.fifo_record_space["rewards"] = FloatBox(add_time_rank=self.worker_sample_size)
        if self.has_rnn:
            self.fifo_record_space["initial_internal_states"] = \
                self.internal_states_space.with_time_rank(False)

        # Create our FIFOQueue (actors will enqueue, learner(s) will dequeue).
        self.fifo_queue = FIFOQueue.from_spec(
            fifo_queue_spec or dict(capacity=1),
            reuse_variable_scope="shared-fifo-queue",
            only_insert_single_records=True,
            record_space=self.fifo_record_space,
            device="/job:learner/task:0/cpu"
            if self.execution_spec["mode"] == "distributed" and self.execution_spec["distributed_spec"]["cluster_spec"]
            else None
        )

        # Remove `states` key from input_spaces: not needed.
        del self.input_spaces["states"]

        # Add all our sub-components to the core.
        if self.type == "single":
            pass

        elif self.type == "actor":
            # No learning, no loss function.
            self.loss_function = None
            # A Dict Splitter to split things from the EnvStepper.
            self.env_output_splitter = ContainerSplitter(tuple_length=4, scope="env-output-splitter")
            self.states_dict_splitter = None
            # Slice some data from the EnvStepper (e.g. only the first internal states are needed).
            self.internal_states_slicer = Slice(scope="internal-states-slicer", squeeze=True)
            # Merge back to insert into FIFO.
            self.fifo_input_merger = DictMerger(*self.fifo_queue_keys)
            # Dummy Flattener to calculate the action-probs space.
            dummy_flattener = ReShape(flatten=True, flatten_categories=self.action_space.num_categories)
            self.environment_stepper = EnvironmentStepper(
                environment_spec=environment_spec,
                actor_component_spec=ActorComponent(self.preprocessor, self.policy, self.exploration),
                state_space=self.state_space.with_batch_rank(),
                reward_space=float,  # TODO <- float64 for deepmind? may not work for other envs
                internal_states_space=self.internal_states_space,
                num_steps=self.worker_sample_size,
                add_previous_action_to_state=True,
                add_previous_reward_to_state=True,
                add_action_probs=True,
                action_probs_space=dummy_flattener.get_preprocessed_space(self.action_space)
            )
            sub_components = [
                self.environment_stepper, self.env_output_splitter, self.internal_states_slicer,
                self.fifo_input_merger, self.fifo_queue
            ]
        # Learner.
        else:
            self.environment_stepper = None

            # A Dict splitter to split up items from the queue.
            self.fifo_input_merger = None
            self.fifo_output_splitter = ContainerSplitter(*self.fifo_queue_keys, scope="fifo-output-splitter")
            self.states_dict_splitter = ContainerSplitter(
                *list(self.fifo_record_space["states"].keys()), scope="states-dict-splitter"
            )
            self.internal_states_slicer = None

            self.transposer = Transpose(scope="transposer", device=dict(ops="/job:learner/task:0/cpu"))
            self.staging_area = StagingArea(num_data=len(self.fifo_queue_keys))

            # Create an IMPALALossFunction with some parameters.
            self.loss_function = IMPALALossFunction(
                discount=self.discount,
                weight_pg=weight_pg,
                weight_baseline=weight_baseline,
                weight_entropy=weight_entropy,
                slice_actions=self.feed_previous_action_through_nn,
                slice_rewards=self.feed_previous_reward_through_nn,
                device="/job:learner/task:0/gpu"
            )

            self.policy.propagate_sub_component_properties(
                dict(device=dict(variables="/job:learner/task:0/cpu", ops="/job:learner/task:0/gpu"))
            )
            for component in [self.staging_area, self.preprocessor, self.optimizer]:
                component.propagate_sub_component_properties(dict(device="/job:learner/task:0/gpu"))

            sub_components = [
                self.fifo_output_splitter, self.fifo_queue, self.states_dict_splitter, self.transposer,
                self.staging_area, self.preprocessor, self.policy, self.loss_function, self.optimizer
            ]

        if self.type != "single":
            # Add all the agent's sub-components to the root.
            self.root_component.add_components(*sub_components)

            # Define the Agent's (root Component's) API.
            self.define_graph_api(*sub_components)

        if self.type != "single" and self.auto_build:
            if self.type == "learner":
                build_options = dict(
                    build_device_context="/job:learner/task:0/cpu",
                    pin_global_variable_device="/job:learner/task:0/cpu"
                )
                self._build_graph(
                    [self.root_component], self.input_spaces, optimizer=self.optimizer,
                    build_options=build_options
                )
            else:
                self._build_graph(
                    [self.root_component], self.input_spaces, optimizer=self.optimizer,
                    build_options=None
                )

            self.graph_built = True

            if self.has_gpu:
                # Get the 1st return op of API-method `stage` of sub-component `staging-area` (the stage-op).
                self.stage_op = self.root_component.sub_components["staging-area"].api_methods["stage"]. \
                    out_op_columns[0].op_records[0].op
                # Initialize the stage.
                self.graph_executor.monitored_session.run_step_fn(
                    lambda step_context: step_context.session.run(self.stage_op)
                )
                # TODO: remove after full refactor.
                self.dequeue_op = self.root_component.sub_components["fifo-queue"].api_methods["get_records"]. \
                    out_op_columns[0].op_records[0].op
            if self.type == "actor":
                self.enqueue_op = self.root_component.sub_components["fifo-queue"].api_methods["insert_records"]. \
                    out_op_columns[0].op_records[0].op

    def define_graph_api(self, *sub_components):
        # TODO: Unify agents with/w/o synchronizable policy.
        # TODO: Unify Agents with/w/o get_action method (w/ env-stepper vs w/o).
        #global_scope_base = "environment-stepper/actor-component/" if self.type == "actor" else ""
        #super(IMPALAAgent, self).define_graph_api(
        #    global_scope_base + "policy",
        #    global_scope_base + "dict-preprocessor-stack"
        #)

        # Assemble the specific agent.
        if self.type == "single":
            pass
        elif self.type == "actor":
            self.define_graph_api_actor(*sub_components)
        else:
            self.define_graph_api_learner(*sub_components)

    def define_graph_api_actor(self, env_stepper, env_output_splitter, internal_states_slicer, merger, fifo_queue):
        """
        Defines the API-methods used by an IMPALA actor. Actors only step through an environment (n-steps at a time),
        collect the results and push them into the FIFO queue. Results include: The actions actually taken, the
        discounted accumulated returns for each action, and the probability of each taken action according to the
        behavior policy.

        Args:
            env_stepper (EnvironmentStepper): The EnvironmentStepper Component to step through the Env n steps in a
                single op call.
            fifo_queue (FIFOQueue): The FIFOQueue Component used to enqueue env sample runs (n-step).
        """
        # Perform n-steps in the env and insert the results into our FIFO-queue.
        @rlgraph_api(component=self.root_component)
        def perform_n_steps_and_insert_into_fifo(self_):
            # Take n steps in the environment.
            step_results = env_stepper.step()

            split_output = env_output_splitter.split(step_results)
            # Slice off the initial internal state (so the learner can re-feed-forward from that internal-state).
            initial_internal_states = internal_states_slicer.slice(split_output[-1], 0)  # -1=internal states
            to_merge = split_output[:-1] + (initial_internal_states,)
            record = merger.merge(*to_merge)

            # Insert results into the FIFOQueue.
            insert_op = fifo_queue.insert_records(record)

            return insert_op, split_output[0]  # 0=terminals

    def define_graph_api_learner(self, fifo_output_splitter, fifo_queue, states_dict_splitter,
                                 transposer, staging_area, preprocessor, policy, loss_function, optimizer):
        """
        Defines the API-methods used by an IMPALA learner. Its job is basically: Pull a batch from the FIFOQueue,
        split it up into its components and pass these through the loss function and into the optimizer for a
        learning update.

        Args:
            fifo_output_splitter (ContainerSplitter): The ContainerSplitter Component to split up a batch from the
                queue along its items.
            fifo_queue (FIFOQueue): The FIFOQueue Component used to enqueue env sample runs (n-step).
            states_dict_splitter (ContainerSplitter): The ContainerSplitter Component to split the state components
                into its single parts.
            transposer (Transpose): A space-agnostic Transpose to flip batch- and time-ranks of all state-components.
            staging_area (StagingArea): A possible GPU staging-area component.
            preprocessor (PreprocessorStack): A preprocessing Component for the states (may be a
                DictPreprocessorStack as well).
            policy (Policy): The Policy Component to update.
            loss_function (IMPALALossFunction): The IMPALALossFunction Component.
            optimizer (Optimizer): The optimizer that we use to calculate an update and apply it.
        """
        @rlgraph_api(component=self.root_component)
        def get_queue_size(self_):
            return fifo_queue.get_size()

        @rlgraph_api(component=self.root_component)
        def update_from_memory(self_):
            # Pull n records from the queue.
            # Note that everything will come out as batch-major and must be transposed before the main-LSTM.
            # This is done by the network itself for all network inputs:
            # - preprocessed_s
            # - preprocessed_last_s_prime
            # But must still be done for actions, rewards, terminals here in this API-method via separate ReShapers.
            records = fifo_queue.get_records(self.update_spec["batch_size"])

            split_record = fifo_output_splitter.split(records)
            actions = None
            rewards = None
            if self.feed_previous_action_through_nn and self.feed_previous_reward_through_nn:
                terminals, states, action_probs_mu, initial_internal_states = split_record
            else:
                terminals, states, actions, rewards, action_probs_mu, initial_internal_states = split_record

            # Flip everything to time-major.
            # TODO: Create components that are less input-space sensitive (those that have no variables should
            # TODO: be reused for any kind of processing).
            states = transposer.apply(states)
            terminals = transposer.apply(terminals)
            action_probs_mu = transposer.apply(action_probs_mu)
            if self.feed_previous_action_through_nn is False:
                actions = transposer.apply(actions)
            if self.feed_previous_reward_through_nn is False:
                rewards = transposer.apply(rewards)

            # If we use a GPU: Put everything on the staging area (adds 1 time step of policy lag, but makes
            # copying data into the GPU more efficient).
            if self.has_gpu:
                stage_op = staging_area.stage(states, terminals, action_probs_mu, initial_internal_states)
                # Get data from the stage again and continue.
                states, terminals, action_probs_mu, initial_internal_states = staging_area.unstage()
            else:
                # TODO: No-op component?
                stage_op = None

            # Preprocess actions and rewards inside the state (actions: flatten one-hot, rewards: expand).
            preprocessed_states = preprocessor.preprocess(states)

            # Only retrieve logits and do faster sparse softmax in loss.
            out = policy.get_state_values_logits_probabilities_log_probs(
                preprocessed_states, initial_internal_states
            )
            state_values_pi = out["state_values"]
            logits = out["logits"]
            #current_internal_states = out["last_internal_states"]

            # Isolate actions and rewards from states.
            if self.feed_previous_action_through_nn or self.feed_previous_reward_through_nn:
                states_split = states_dict_splitter.split(states)
                actions = states_split[-2]
                rewards = states_split[-1]

            # Calculate the loss.
            loss, loss_per_item = loss_function.loss(
                logits, action_probs_mu, state_values_pi, actions, rewards, terminals
            )
            policy_vars = policy._variables()

            # Pass vars and loss values into the optimizer.
            step_op, loss, loss_per_item = optimizer.step(policy_vars, loss, loss_per_item)

            # Return optimizer op and all loss values.
            # TODO: Make it possible to return None from API-method without messing with the meta-graph.
            return step_op, (stage_op if stage_op else step_op), loss, loss_per_item

    def get_action(self, states, internal_states=None, use_exploration=True, extra_returns=None):
        pass

    def _observe_graph(self, preprocessed_states, actions, internals, rewards, terminals):
        self.graph_executor.execute(("insert_records", [preprocessed_states, actions, rewards, terminals]))

    def update(self, batch=None):
        if batch is None:
            # Include stage_op or not?
            if self.has_gpu:
                return self.graph_executor.execute("update_from_memory")
            else:
                return self.graph_executor.execute(("update_from_memory", None, ([0, 2, 3, 4])))
        else:
            raise RLGraphError("Cannot call update-from-batch on an IMPALA Agent.")

    def __repr__(self):
        return "IMPALAAgent(type={})".format(self.type)
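# (editor's note) Minimal construction sketch for IMPALAAgent, mirroring the compilation tests in this repo;
# `env` stands for any Environment exposing state_space/action_space, and the config path is the one used by
# those tests (treat concrete values as illustrative):
#
#     agent_config = config_from_path("configs/impala_agent_for_deepmind_lab_env.json")
#     actor_agent = IMPALAAgent.from_spec(
#         agent_config,
#         type="actor",
#         state_space=env.state_space,
#         action_space=env.action_space,
#         internal_states_space=IMPALAAgent.default_internal_states_space,
#         execution_spec=dict(disable_monitoring=True)
#     )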
class TestEnvironmentStepper(unittest.TestCase):
    """
    Tests for the EnvironmentStepper Component, using simple deterministic Envs as well as
    openAI-gym and DeepmindLab Envs.
    """
    deterministic_env_state_space = FloatBox(shape=(1,))
    deterministic_env_action_space = IntBox(2)
    deterministic_action_probs_space = FloatBox(shape=(2,), add_batch_rank=True)

    internal_states_space = Tuple(FloatBox(shape=(256,)), FloatBox(shape=(256,)), add_batch_rank=True)
    internal_states_space_test_lstm = Tuple(FloatBox(shape=(3,)), FloatBox(shape=(3,)), add_batch_rank=True)

    action_probs_space = FloatBox(shape=(4,), add_batch_rank=True)

    time_steps = 500

    def test_environment_stepper_on_deterministic_env(self):
        preprocessor_spec = None
        network_spec = config_from_path("configs/test_simple_nn.json")
        exploration_spec = None
        actor_component = ActorComponent(
            preprocessor_spec,
            dict(network_spec=network_spec, action_space=self.deterministic_env_action_space),
            exploration_spec
        )
        environment_stepper = EnvironmentStepper(
            environment_spec=dict(type="deterministic_env", steps_to_terminal=5),
            actor_component_spec=actor_component,
            state_space=self.deterministic_env_state_space,
            reward_space="float32",
            num_steps=3
        )
        test = ComponentTest(
            component=environment_stepper,
            action_space=self.deterministic_env_action_space,
        )

        # Reset the stepper.
        test.test("reset")

        # Step 3 times through the Env and collect results.
        expected = (
            None,
            (
                np.array([True, False, False, False]),   # t_
                np.array([[0.0], [1.0], [2.0], [3.0]]),  # s' (raw)
            )
        )
        test.test("step", expected_outputs=expected)

        # Step again, check whether stitching of states/etc. works.
        expected = (
            None,
            (
                np.array([False, False, True, False]),   # t_
                np.array([[3.0], [4.0], [0.0], [1.0]]),  # s' (raw)
            )
        )
        test.test("step", expected_outputs=expected)

        # Make sure we close the session (to shut down the Env on the server).
        test.terminate()

    def test_environment_stepper_on_deterministic_env_with_returning_action_probs(self):
        preprocessor_spec = [dict(type="divide", divisor=2)]
        network_spec = config_from_path("configs/test_simple_nn.json")
        exploration_spec = None
        actor_component = ActorComponent(
            preprocessor_spec,
            dict(network_spec=network_spec, action_space=self.deterministic_env_action_space),
            exploration_spec
        )
        environment_stepper = EnvironmentStepper(
            environment_spec=dict(type="deterministic_env", steps_to_terminal=6),
            actor_component_spec=actor_component,
            state_space=self.deterministic_env_state_space,
            reward_space="float32",
            add_action_probs=True,
            action_probs_space=self.deterministic_action_probs_space,
            num_steps=3
        )
        test = ComponentTest(
            component=environment_stepper,
            action_space=self.deterministic_env_action_space,
        )

        weights = test.read_variable_values(environment_stepper.actor_component.policy.variables)
        policy_scope = "environment-stepper/actor-component/policy/"
        weights_hid = weights[policy_scope + "test-network/hidden-layer/dense/kernel"]
        biases_hid = weights[policy_scope + "test-network/hidden-layer/dense/bias"]
        weights_action = weights[policy_scope + "action-adapter/action-layer/dense/kernel"]
        biases_action = weights[policy_scope + "action-adapter/action-layer/dense/bias"]

        # Reset the stepper.
        test.test("reset")

        # Step 3 times through the Env and collect results.
        expected = (
            None,
            (
                # t_
                np.array([True, False, False, False]),
                # s' (raw)
                np.array([[0.0], [1.0], [2.0], [3.0]]),
                # action probs
                np.array([
                    [0.0, 0.0],  # <- init (no input gets sent through NN).
                    softmax(dense_layer(
                        dense_layer(np.array([0.0]), weights_hid, biases_hid), weights_action, biases_action
                    )),
                    softmax(dense_layer(
                        dense_layer(np.array([0.5]), weights_hid, biases_hid), weights_action, biases_action
                    )),
                    softmax(dense_layer(
                        dense_layer(np.array([1.0]), weights_hid, biases_hid), weights_action, biases_action
                    ))
                ])
            )
        )
        test.test("step", expected_outputs=expected, decimals=3)

        # Step again, check whether stitching of states/etc. works.
        expected = (
            None,
            (
                np.array([False, False, False, True]),
                np.array([[3.0], [4.0], [5.0], [0.0]]),  # s' (raw)
                np.array([
                    [0.0, 0.0],  # <- init (no input gets sent through NN).
                    softmax(dense_layer(
                        dense_layer(np.array([1.5]), weights_hid, biases_hid), weights_action, biases_action
                    )),
                    softmax(dense_layer(
                        dense_layer(np.array([2.0]), weights_hid, biases_hid), weights_action, biases_action
                    )),
                    softmax(dense_layer(
                        dense_layer(np.array([2.5]), weights_hid, biases_hid), weights_action, biases_action
                    ))
                ])
            )
        )
        test.test("step", expected_outputs=expected, decimals=3)

        # Make sure we close the session (to shut down the Env on the server).
        test.terminate()

    def test_environment_stepper_on_deterministic_env_with_action_probs_lstm(self):
        internal_states_space = Tuple(FloatBox(shape=(3,)), FloatBox(shape=(3,)))
        preprocessor_spec = [dict(type="multiply", factor=0.1)]
        network_spec = config_from_path("configs/test_lstm_nn.json")
        exploration_spec = None
        actor_component = ActorComponent(
            preprocessor_spec,
            dict(network_spec=network_spec, action_space=self.deterministic_env_action_space),
            exploration_spec
        )
        environment_stepper = EnvironmentStepper(
            environment_spec=dict(type="deterministic_env", steps_to_terminal=3),
            actor_component_spec=actor_component,
            state_space=self.deterministic_env_state_space,
            reward_space="float32",
            internal_states_space=internal_states_space,
            add_action_probs=True,
            action_probs_space=self.deterministic_action_probs_space,
            num_steps=4,
        )
        test = ComponentTest(
            component=environment_stepper,
            action_space=self.deterministic_env_action_space,
        )

        weights = test.read_variable_values(environment_stepper.actor_component.policy.variables)
        policy_scope = "environment-stepper/actor-component/policy/"
        weights_lstm = weights[policy_scope + "test-lstm-network/lstm-layer/lstm-cell/kernel"]
        biases_lstm = weights[policy_scope + "test-lstm-network/lstm-layer/lstm-cell/bias"]
        weights_action = weights[policy_scope + "action-adapter/action-layer/dense/kernel"]
        biases_action = weights[policy_scope + "action-adapter/action-layer/dense/bias"]

        # Reset the stepper.
        test.test("reset")

        # Step 4 times (num_steps=4) through the Env and collect results.
        lstm_1 = lstm_layer(np.array([[[0.0]]]), weights_lstm, biases_lstm)
        lstm_2 = lstm_layer(np.array([[[0.1]]]), weights_lstm, biases_lstm, lstm_1[1])
        lstm_3 = lstm_layer(np.array([[[0.2]]]), weights_lstm, biases_lstm, lstm_2[1])
        lstm_4 = lstm_layer(np.array([[[0.0]]]), weights_lstm, biases_lstm, lstm_3[1])
        expected = (
            None,
            (
                np.array([True, False, False, True, False]),
                np.array([[0.0], [1.0], [2.0], [0.0], [1.0]]),  # s' (raw)
                # action probs
                np.array([
                    [0.0, 0.0],
                    softmax(dense_layer(np.squeeze(lstm_1[0]), weights_action, biases_action)),
                    softmax(dense_layer(np.squeeze(lstm_2[0]), weights_action, biases_action)),
                    softmax(dense_layer(np.squeeze(lstm_3[0]), weights_action, biases_action)),
                    softmax(dense_layer(np.squeeze(lstm_4[0]), weights_action, biases_action)),
                ]),
                # internal states (c- and h-states).
                (
                    np.squeeze(np.array(
                        [[[0.0, 0.0, 0.0]], lstm_1[1][0], lstm_2[1][0], lstm_3[1][0], lstm_4[1][0]]
                    )),
                    np.squeeze(np.array(
                        [[[0.0, 0.0, 0.0]], lstm_1[1][1], lstm_2[1][1], lstm_3[1][1], lstm_4[1][1]]
                    ))
                )
            )
        )
        test.test("step", expected_outputs=expected)

        # Make sure we close the session (to shut down the Env on the server).
        test.terminate()

    def test_environment_stepper_on_pong(self):
        environment_spec = dict(type="openai_gym", gym_env="Pong-v0", frameskip=4, seed=10)
        dummy_env = Environment.from_spec(environment_spec)
        state_space = dummy_env.state_space
        action_space = dummy_env.action_space
        agent_config = config_from_path("configs/dqn_agent_for_pong.json")
        actor_component = ActorComponent(
            agent_config["preprocessing_spec"],
            dict(
                network_spec=agent_config["network_spec"],
                action_adapter_spec=agent_config["action_adapter_spec"],
                action_space=action_space
            ),
            agent_config["exploration_spec"]
        )
        environment_stepper = EnvironmentStepper(
            environment_spec=environment_spec,
            actor_component_spec=actor_component,
            state_space=state_space,
            reward_space="float",
            add_reward=True,
            num_steps=self.time_steps
        )
        test = ComponentTest(
            component=environment_stepper,
            action_space=action_space,
        )

        # Step `time_steps` times through the Env and collect results.
        # 1st return value is the step-op (None), 2nd return value is the tuple of collected items (one entry
        # per step), with each step containing: terminals, (raw) next-states and rewards.

        # Reset the stepper.
        test.test("reset")
        time_start = time.monotonic()
        out = test.test("step")
        time_end = time.monotonic()
        print("Done running {} steps in env-stepper env in {}sec.".format(
            environment_stepper.num_steps, time_end - time_start
        ))

        # Check types of outputs.
        self.assertTrue(out[0] is None)
        self.assertTrue(isinstance(out[1], DataOpTuple))  # the step results as a tuple (see below)

        # Check types of single data.
        #self.assertTrue(out[1][0].dtype == np.float32)  # preprocessed states
        #self.assertTrue(out[1][0].min() >= 0.0)  # make sure we have pixels / 255
        #self.assertTrue(out[1][0].max() <= 1.0)
        #self.assertTrue(out[1][1].dtype == np.int32)  # actions
        #self.assertTrue(out[1][2].dtype == np.float32)  # rewards
        #self.assertTrue(out[1][3].dtype == np.float32)  # episode return
        self.assertTrue(out[1][0].dtype == np.bool_)  # next-state is terminal?
        self.assertTrue(out[1][1].dtype == np.uint8)  # next state (raw, not preprocessed)
        self.assertTrue(out[1][1].min() >= 0)  # make sure we have pixels
        self.assertTrue(out[1][1].max() <= 255)
        self.assertTrue(out[1][2].dtype == np.float32)  # rewards
        self.assertTrue(out[1][2].min() >= -1.0)  # -1.0 to 1.0
        self.assertTrue(out[1][2].max() <= 1.0)

        # Check whether episode returns match single rewards (including resetting after each terminal signal).
        #episode_returns = 0.0
        #for i in range(environment_stepper.num_steps):
        #    episode_returns += out[2][i]
        #    self.assertAlmostEqual(episode_returns, out[1][3][i])
        #    # Terminal: Reset accumulated episode-return before next step.
        #    if out[1][4][i] is np.bool_(True):
        #        episode_returns = 0.0

        # Make sure we close the session (to shut down the Env on the server).
        test.terminate()

    def test_compare_with_non_env_stepper(self):
        environment_spec = dict(type="openai_gym", gym_env="Pong-v0", frameskip=4, seed=10)
        dummy_env = Environment.from_spec(environment_spec)
        state_space = dummy_env.state_space.with_batch_rank()
        action_space = dummy_env.action_space
        agent_config = config_from_path("configs/dqn_agent_for_pong.json")
        actor_component = ActorComponent(
            agent_config["preprocessing_spec"],
            dict(
                network_spec=agent_config["network_spec"],
                action_adapter_spec=agent_config["action_adapter_spec"],
                action_space=action_space
            ),
            agent_config["exploration_spec"]
        )
        test = ComponentTest(
            component=actor_component,
            input_spaces=dict(states=state_space),
            action_space=action_space,
        )
        s = dummy_env.reset()
        time_start = time.monotonic()
        for i in range(self.time_steps):
            out = test.test(("get_preprocessed_state_and_action", np.array([s])))
            #preprocessed_s = out["preprocessed_state"]
            a = out["action"]
            # Act in env.
            s, r, t, _ = dummy_env.step(a[0])  # remove batch
            if t is True:
                s = dummy_env.reset()
        time_end = time.monotonic()
        print("Done running {} steps in bare-metal env in {}sec.".format(
            self.time_steps, time_end - time_start
        ))
        test.terminate()

    def test_environment_stepper_on_deepmind_lab(self):
        try:
            from rlgraph.environments.deepmind_lab import DeepmindLabEnv
        except ImportError:
            print("DeepmindLab not installed: Skipping this test case.")
            return

        env_spec = dict(
            type="deepmind_lab", level_id="seekavoid_arena_01", observations=["RGB_INTERLEAVED"], frameskip=4
        )
        dummy_env = Environment.from_spec(env_spec)
        state_space = dummy_env.state_space
        action_space = dummy_env.action_space
        actor_component = ActorComponent(
            # Preprocessor spec (only divide and flatten the image).
            [
                {"type": "divide", "divisor": 255},
                {"type": "reshape", "flatten": True}
            ],
            # Policy spec.
            dict(network_spec="../configs/test_lstm_nn.json", action_space=action_space),
            # Exploration spec.
            Exploration(epsilon_spec=dict(decay_spec=dict(
                type="linear_decay", from_=1.0, to_=0.1, start_timestep=0, num_timesteps=100
            )))
        )
        environment_stepper = EnvironmentStepper(
            environment_spec=env_spec,
            actor_component_spec=actor_component,
            state_space=state_space,
            reward_space="float32",
            internal_states_space=self.internal_states_space_test_lstm,
            num_steps=1000,
            # Add both prev-action and -reward into the state sent through the network.
            #add_previous_action_to_state=True,
            #add_previous_reward_to_state=True,
            add_action_probs=True,
            action_probs_space=FloatBox(shape=(9,), add_batch_rank=True)
        )
        test = ComponentTest(
            component=environment_stepper,
            action_space=action_space,
        )

        # Reset the stepper.
        test.test("reset")

        # Step n times through the Env and collect results.
        # 1st return value is the step-op (None), 2nd return value is the tuple of collected items (one entry
        # per step): terminals, (raw) next-states, action-probs and internal-states.
        time_start = time.monotonic()
        steps = 10
        out = None
        for _ in range(steps):
            out = test.test("step")
        time_total = time.monotonic() - time_start
        print("Done running {}x{} steps in Deepmind Lab env using IMPALA network in {}sec. ({} actions/sec)".format(
            steps, environment_stepper.num_steps, time_total, environment_stepper.num_steps * steps / time_total
        ))

        # Check types of outputs.
        self.assertTrue(out[0] is None)
        self.assertTrue(isinstance(out[1], DataOpTuple))  # the step results as a tuple (see below)

        # Check types of single data.
        #self.assertTrue(out[0].dtype == np.float32)
        #self.assertTrue(out[0].min() >= 0.0)  # make sure we have pixels / 255
        #self.assertTrue(out[0].max() <= 1.0)
        #self.assertTrue(out[1].dtype == np.int32)  # actions
        #self.assertTrue(out[2].dtype == np.float32)  # rewards
        #self.assertTrue(out[0].dtype == np.float32)  # episode return
        self.assertTrue(out[1][0].dtype == np.bool_)  # next-state is terminal?
        self.assertTrue(out[1][1].dtype == np.uint8)  # next state (raw, not preprocessed)
        self.assertTrue(out[1][1].min() >= 0)  # make sure we have pixels
        self.assertTrue(out[1][1].max() <= 255)

        # action probs (test whether sum to one).
        #self.assertTrue(out[1][6].dtype == np.float32)
        #self.assertTrue(out[1][6].min() >= 0.0)
        #self.assertTrue(out[1][6].max() <= 1.0)
        #recursive_assert_almost_equal(
        #    out[1][6].sum(axis=-1, keepdims=False),
        #    np.ones(shape=(environment_stepper.num_steps,)), decimals=4
        #)

        # internal states (c- and h-state)
        self.assertTrue(out[3][0].dtype == np.float32)
        self.assertTrue(out[3][1].dtype == np.float32)
        self.assertTrue(out[3][0].shape == (environment_stepper.num_steps, 3))
        self.assertTrue(out[3][1].shape == (environment_stepper.num_steps, 3))

        # Check whether episode returns match single rewards (including terminal signals).
        #episode_returns = 0.0
        #for i in range(environment_stepper.num_steps):
        #    episode_returns += out[0][i]
        #    self.assertAlmostEqual(episode_returns, out[3][i])
        #    # Terminal: Reset for next step.
        #    if out[4][i] is np.bool_(True):
        #        episode_returns = 0.0

        test.terminate()
class TestFIFOQueue(unittest.TestCase):
    """
    Tests sampling and insertion behaviour of the FIFOQueue class.
    """
    record_space = Dict(
        states=dict(state1=float, state2=float, state3=bool),
        actions=dict(action1=float, action2=Tuple(float, float)),
        reward=float,
        terminals=BoolBox(),
        add_batch_rank=True
    )
    capacity = 10

    input_spaces = dict(records=record_space, num_records=int)

    def test_enqueue_dequeue(self):
        """
        Simply tests the insert op without checking internal logic.
        """
        fifo_queue = FIFOQueue(capacity=self.capacity, record_space=self.record_space)
        test = ComponentTest(component=fifo_queue, input_spaces=self.input_spaces)

        first_record = self.record_space.sample(size=1)
        test.test(("insert_records", first_record), expected_outputs=None)
        test.test("get_size", expected_outputs=1)

        further_records = self.record_space.sample(size=5)
        test.test(("insert_records", further_records), expected_outputs=None)
        test.test("get_size", expected_outputs=6)

        expected = dict()
        for (k1, v1), (k2, v2) in zip(
                flatten_op(first_record).items(), flatten_op(further_records).items()
        ):
            expected[k1] = np.concatenate((v1, v2[:4]))
        expected = unflatten_op(expected)

        test.test(("get_records", 5), expected_outputs=expected)
        test.test("get_size", expected_outputs=1)

    def test_capacity(self):
        """
        Tests if insert correctly blocks when capacity is reached.
        """
        fifo_queue = FIFOQueue(capacity=self.capacity, record_space=self.record_space)
        test = ComponentTest(component=fifo_queue, input_spaces=self.input_spaces)

        def run(expected_):
            # Wait 2 seconds.
            time.sleep(2)
            # Pull something out of the queue again to continue.
            test.test(("get_records", 2), expected_outputs=expected_)

        # Insert one more element than capacity.
        records = self.record_space.sample(size=self.capacity + 1)

        expected = dict()
        for key, value in flatten_op(records).items():
            expected[key] = value[:2]
        expected = unflatten_op(expected)

        # Start a thread to save this one from getting stuck due to capacity overflow.
        thread = threading.Thread(target=run, args=(expected,))
        thread.start()

        print("Going over capacity: blocking ...")
        test.test(("insert_records", records), expected_outputs=None)
        print("Dequeued some items in another thread. Unblocked.")

        thread.join()

    def test_fifo_queue_with_distributed_tf(self):
        """
        Tests if the FIFO is correctly shared between two processes running in distributed tf.
        """
        cluster_spec = dict(source=["localhost:22222"], target=["localhost:22223"])

        def run1():
            fifo_queue_1 = FIFOQueue(
                capacity=self.capacity, device="/job:source/task:0/cpu", record_space=self.record_space
            )
            test_1 = ComponentTest(
                component=fifo_queue_1,
                input_spaces=self.input_spaces,
                execution_spec=dict(
                    mode="distributed",
                    distributed_spec=dict(job="source", task_index=0, cluster_spec=cluster_spec)
                )
            )
            # Insert elements from source.
            records = self.record_space.sample(size=self.capacity)
            print("inserting into source-side queue ...")
            test_1.test(("insert_records", records), expected_outputs=None)
            print("size of source-side queue:")
            print(test_1.test("get_size", expected_outputs=None))
            # Pull two samples out.
            print("pulling from source-side queue:")
            print(test_1.test(("get_records", 2), expected_outputs=None))

            test_1.terminate()

        def run2():
            fifo_queue_2 = FIFOQueue(
                capacity=self.capacity, device="/job:source/task:0/cpu", record_space=self.record_space
            )
            test_2 = ComponentTest(
                component=fifo_queue_2,
                input_spaces=self.input_spaces,
                execution_spec=dict(
                    mode="distributed",
                    distributed_spec=dict(job="target", task_index=0, cluster_spec=cluster_spec)
                )
            )
            # Dequeue elements in target.
print("size of target-side queue:") print(test_2.test("get_size", expected_outputs=None)) print("pulling from target-side queue:") print(test_2.test(("get_records", 5), expected_outputs=None)) test_2.terminate() # Start thread to save this one from getting stuck due to capacity overflow. thread_1 = threading.Thread(target=run1) thread_2 = threading.Thread(target=run2) thread_1.start() thread_2.start() thread_1.join() thread_2.join()
def test_keras_style_complex_multi_stream_nn(self):
    # 3 inputs.
    input_spaces = [
        Dict({
            "img": FloatBox(shape=(6, 6, 3)),
            "int": IntBox(3)
        }, add_batch_rank=True, add_time_rank=True),
        FloatBox(shape=(2,), add_batch_rank=True),
        Tuple(IntBox(2), TextBox(), add_batch_rank=True, add_time_rank=True)
    ]

    # Same NN as in the test above, only using some of the sub-Spaces from the input spaces.
    # Tests whether this NN can automatically add the correct splitters.
    folded_text = ReShape(fold_time_rank=True)(input_spaces[2][1])
    # String layer will create batched AND time-ranked (individual words) hash outputs (int64).
    string_bucket_out, lengths = StringToHashBucket(num_hash_buckets=5)(folded_text)
    # Batched and time-ranked embedding output (floats) with embed dim=n.
    embedding_out = EmbeddingLookup(embed_dim=10, vocab_size=5)(string_bucket_out)
    # Pass embeddings through a text LSTM and use the last output (reduce time-rank).
    string_lstm_out, _ = LSTMLayer(units=2, return_sequences=False, scope="lstm-layer-txt")(
        embedding_out, sequence_length=lengths
    )
    # Unfold to get the original time-rank back.
    string_lstm_out_unfolded = ReShape(unfold_time_rank=True)(string_lstm_out, input_spaces[2][1])

    # Parallel image stream via 1 CNN layer plus dense.
    folded_img = ReShape(fold_time_rank=True, scope="img-fold")(input_spaces[0]["img"])
    cnn_out = Conv2DLayer(filters=1, kernel_size=2, strides=2)(folded_img)
    unfolded_cnn_out = ReShape(unfold_time_rank=True, scope="img-unfold")(cnn_out, input_spaces[0]["img"])
    unfolded_cnn_out_flattened = ReShape(flatten=True, scope="img-flat")(unfolded_cnn_out)
    dense_out = DenseLayer(units=2, scope="dense-0")(unfolded_cnn_out_flattened)

    # Concat everything.
    concat_out = ConcatLayer()(string_lstm_out_unfolded, dense_out)

    # LSTM output has batch+time.
    main_lstm_out, internal_states = LSTMLayer(units=2, scope="lstm-layer-main")(concat_out)

    dense1_after_lstm_out = DenseLayer(units=3, scope="dense-1")(main_lstm_out)
    dense2_after_lstm_out = DenseLayer(units=2, scope="dense-2")(dense1_after_lstm_out)
    dense3_after_lstm_out = DenseLayer(units=1, scope="dense-3")(dense2_after_lstm_out)

    # A NN with 3 outputs.
    neural_net = NeuralNetwork(
        inputs=input_spaces, outputs=[dense3_after_lstm_out, main_lstm_out, internal_states]
    )

    test = ComponentTest(component=neural_net, input_spaces=dict(inputs=input_spaces))

    # Batch of size=4, time-steps=2.
    sample_shape = (4, 2)
    input_ = [
        input_spaces[0].sample(sample_shape),
        input_spaces[1].sample(sample_shape[0]),
        input_spaces[2].sample(sample_shape)
    ]

    out = test.test(("call", tuple(input_)), expected_outputs=None)

    # Main output (Dense out after LSTM).
    self.assertTrue(out[0].shape == sample_shape + (1,))  # 1=1 unit in dense layer
    self.assertTrue(out[0].dtype == np.float32)
    # main-LSTM out.
    self.assertTrue(out[1].shape == sample_shape + (2,))  # 2=2 LSTM units
    self.assertTrue(out[1].dtype == np.float32)
    # main-LSTM internal-states.
    self.assertTrue(out[2][0].shape == sample_shape[:1] + (2,))  # 2=2 LSTM units
    self.assertTrue(out[2][0].dtype == np.float32)
    self.assertTrue(out[2][1].shape == sample_shape[:1] + (2,))  # 2=2 LSTM units
    self.assertTrue(out[2][1].dtype == np.float32)

    test.terminate()
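# (editor's note) The fold/unfold ReShape pattern used above, expressed in plain numpy; a sketch of the
# semantics (merge batch and time ranks, process, then restore them), not rlgraph's implementation:

import numpy as np

batch, time = 4, 2
x = np.random.randn(batch, time, 5).astype(np.float32)

# fold_time_rank=True: (batch, time, ...) -> (batch*time, ...)
folded = x.reshape((-1,) + x.shape[2:])
assert folded.shape == (8, 5)

# unfold_time_rank=True: restore (batch, time, ...) using the original input as the shape reference.
unfolded = folded.reshape((batch, time) + folded.shape[1:])
assert np.allclose(unfolded, x)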