def __init__(self, config: Dict[str, any], result_dir: str, cache_stats: CacheInformation):
    super().__init__(config, result_dir, cache_stats)
    # evaluation specific variables
    self.observation_seen = 0
    self.episode_reward = 0
    self.checkpoint_steps = config['checkpoint_steps']

    self._incomplete_experiences = TTLCache(InMemoryStorage())
    self._incomplete_experiences.expired_entry_callback(self._observe_expired_incomplete_experience)
    self.experimental_reward = config.get('experimental_reward', False)
    agent_config = config['agent_config']
    self.converter = CachingStrategyRLConverter()
    # action space: should cache: true or false
    # state space: [capacity (1), query key (1), query result set (num_indexes)]
    fields_in_state = len(CachingAgentSystemState.__slots__)
    self.agent = Agent.from_spec(agent_config,
                                 state_space=FloatBox(shape=(fields_in_state,)),
                                 action_space=IntBox(2))

    self.logger = logging.getLogger(__name__)
    name = 'rl_caching_strategy'
    self.reward_logger = create_file_logger(name=f'{name}_reward_logger', result_dir=self.result_dir)
    self.loss_logger = create_file_logger(name=f'{name}_loss_logger', result_dir=self.result_dir)
    self.observation_logger = create_file_logger(name=f'{name}_observation_logger', result_dir=self.result_dir)
    self.entry_hits_logger = create_file_logger(name=f'{name}_entry_hits_logger', result_dir=self.result_dir)
    self.key_vocab = Vocabulary()
def test_embedding_lookup_layer(self):
    # Input space for lookup indices (double indices for picking 2 rows per batch item).
    input_space = IntBox(shape=(2,), add_batch_rank=True)

    embedding = EmbeddingLookup(embed_dim=5, vocab_size=4, initializer_spec=np.array([
        [1.0, 2.0, 3.0, 4.0, 5.0],
        [6.0, 7.0, 8.0, 9.0, 10.0],
        [11.0, 12.0, 13.0, 14.0, 15.0],
        [16.0, 17.0, 18.0, 19.0, 20.0]
    ]))
    test = ComponentTest(component=embedding, input_spaces=dict(ids=input_space))

    # Pull a batch of 3 (2 vocabs each) from the embedding matrix.
    inputs = np.array([[0, 1], [3, 2], [2, 1]])
    expected = np.array([
        [[1.0, 2.0, 3.0, 4.0, 5.0], [6.0, 7.0, 8.0, 9.0, 10.0]],
        [[16.0, 17.0, 18.0, 19.0, 20.0], [11.0, 12.0, 13.0, 14.0, 15.0]],
        [[11.0, 12.0, 13.0, 14.0, 15.0], [6.0, 7.0, 8.0, 9.0, 10.0]]
    ])
    test.test(("apply", inputs), expected_outputs=expected, decimals=5)
def test_keras_style_one_container_input_space(self):
    # Define one container input Space.
    input_space = Tuple(IntBox(3), FloatBox(shape=(4,)), add_batch_rank=True)

    # One-hot flatten the int tensor.
    flatten_layer_out = ReShape(flatten=True, flatten_categories=True)(input_space[0])
    # Run the float tensor through two dense layers.
    dense_1_out = DenseLayer(units=3, scope="d1")(input_space[1])
    dense_2_out = DenseLayer(units=5, scope="d2")(dense_1_out)
    # Concat everything.
    cat_out = ConcatLayer()(flatten_layer_out, dense_2_out)

    # Use the `outputs` arg to allow your network to trace back the data flow until the input space.
    # `inputs` is not needed here as we only have one single input (the Tuple).
    neural_net = NeuralNetwork(outputs=cat_out)

    test = ComponentTest(component=neural_net, input_spaces=dict(inputs=input_space))

    var_dict = neural_net.variable_registry
    w1_value = test.read_variable_values(var_dict["neural-network/d1/dense/kernel"])
    b1_value = test.read_variable_values(var_dict["neural-network/d1/dense/bias"])
    w2_value = test.read_variable_values(var_dict["neural-network/d2/dense/kernel"])
    b2_value = test.read_variable_values(var_dict["neural-network/d2/dense/bias"])

    # Batch of size=n.
    input_ = input_space.sample(4)

    expected = np.concatenate([  # concat everything
        one_hot(input_[0]),  # int flattening
        dense_layer(dense_layer(input_[1], w1_value, b1_value), w2_value, b2_value)  # float -> 2 x dense
    ], axis=-1)
    out = test.test(("call", tuple([input_])), expected_outputs=expected)

    test.terminate()
def test_double_dqn_on_2x2_grid_world_single_action_to_container(self):
    """
    Tests how DQN solves a mapping of a single integer to multiple actions (as opposed to using
    container actions).
    """
    # ftj = forward + turn + jump
    env_spec = dict(world="2x2", action_type="ftj", state_representation="xy+orientation")
    agent_config = config_from_path("configs/dqn_agent_for_2x2_gridworld_single_to_container.json")
    preprocessing_spec = agent_config.pop("preprocessing_spec")

    action_space = IntBox(0, 18)
    agent = DQNAgent.from_spec(
        agent_config,
        huber_loss=True,
        double_q=True,
        dueling_q=True,
        state_space=FloatBox(shape=(4,)),
        action_space=action_space,
        store_last_q_table=True
    )

    time_steps = 10000
    worker = SingleThreadedWorker(
        env_spec=lambda: GridWorld.from_spec(env_spec),
        agent=agent,
        preprocessing_spec=preprocessing_spec,
        worker_executes_preprocessing=True,
        render=False
    )
    results = worker.execute_timesteps(time_steps, use_exploration=True)

    print(results)
def test_multi_input_stream_neural_network_with_dict(self):
    # Space must contain batch dimension (otherwise, NNLayer will complain).
    input_space = Dict(
        a=FloatBox(shape=(3,)),
        b=IntBox(4, shape=()),
        add_batch_rank=True
    )

    multi_input_nn = MultiInputStreamNeuralNetwork(
        input_network_specs=dict(
            a=[],
            b=[{"type": "reshape", "flatten": True, "flatten_categories": True}]
        ),
        post_network_spec=[{"type": "dense", "units": 2}],
    )

    test = ComponentTest(component=multi_input_nn, input_spaces=dict(inputs=input_space))

    # Batch of size=n.
    nn_inputs = input_space.sample(5)

    global_scope = "multi-input-stream-nn/post-concat-nn/dense-layer/dense/"
    # Calculate output manually.
    var_dict = test.read_variable_values()

    b_flat = one_hot(nn_inputs["b"], depth=4)
    concat_out = np.concatenate((nn_inputs["a"], b_flat), axis=-1)
    expected = dense_layer(concat_out, var_dict[global_scope + "kernel"], var_dict[global_scope + "bias"])

    test.test(("call", nn_inputs), expected_outputs=expected)

    test.terminate()
def test_memory_compilation(self):
    # Builds a memory and returns build stats.
    env = OpenAIGymEnv("Pong-v0", frameskip=4, max_num_noops=30, episodic_life=True)

    record_space = Dict(
        states=env.state_space,
        actions=env.action_space,
        rewards=float,
        terminals=BoolBox(),
        add_batch_rank=True
    )
    input_spaces = dict(
        # insert: records
        records=record_space,
        # get_records: num_records
        num_records=int,
        # update_records: indices, update
        indices=IntBox(add_batch_rank=True),
        update=FloatBox(add_batch_rank=True)
    )
    input_spaces.pop("num_records")
    memory = MemPrioritizedReplay(capacity=20000)
    test = ComponentTest(component=memory, input_spaces=input_spaces, auto_build=False)
    return test.build()
def build_input_tokens(self):
    """
    Tokenizes vocabulary used for state representations for input to the Q-network by assigning
    integers to vocab words (query operators and query operands, i.e. attributes represented in
    the workload).

    Exposed through self.system_spec and self.states_spec.
    """
    self.system_spec["state_dim"] = self.input_sequence_size

    vocab = {}
    vocab_size = 0

    #
    # tokenize
    #

    # special tokens
    pad_token = 'pad'
    vocab[pad_token] = vocab_size
    vocab_size += 1

    ## state = ...
    ## ... query
    # operands
    for col in self.cols:
        vocab[col] = vocab_size
        vocab_size += 1
    # operators
    if self.include_default_operators:
        for op in self.query_ops:
            vocab[op] = vocab_size
            vocab_size += 1
        for op in self.query_selection_ops:
            vocab[op] = vocab_size
            vocab_size += 1

    ## ... + context TODO
    for col in self.cols:
        vocab[col + '_idx'] = vocab_size
        vocab_size += 1

    # delimits / demarcates compound indices
    idx_token = 'idx'
    vocab[idx_token] = vocab_size

    #
    # specific input schema, i.e. a vector of vocabulary tokens in Z^n (n = specified input size),
    # to be embedded in the embedding layer
    #
    self.states_spec = IntBox(low=0, high=vocab_size, shape=(self.input_sequence_size,))

    self.system_spec['vocab'] = vocab
    self.system_spec['vocab_size'] = len(vocab)
    self.system_spec['index_token'] = idx_token
    self.system_spec['pad_token'] = pad_token
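A minimal, self-contained sketch (not part of the original source) of how a vocabulary built this way might turn a parsed query into the fixed-length integer vector described by `states_spec`. The column/operator names, the helper name `encode_query`, and the sequence length are illustrative assumptions.

# Hypothetical illustration only: vocab contents and sizes are assumptions, not taken from the code above.
def encode_query(query_tokens, vocab, pad_token='pad', input_sequence_size=6):
    """Map query tokens to vocab ids and right-pad to the fixed input length."""
    ids = [vocab[token] for token in query_tokens if token in vocab]
    ids = ids[:input_sequence_size]
    ids += [vocab[pad_token]] * (input_sequence_size - len(ids))
    return ids

# Example with an assumed vocab of two columns and one operator:
vocab = {'pad': 0, 'col_a': 1, 'col_b': 2, '>': 3}
print(encode_query(['col_a', '>', 'col_b'], vocab))  # -> [1, 3, 2, 0, 0, 0]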
def test_specifiable_server(self):
    action_space = IntBox(2)
    state_space = FloatBox()
    env_spec = dict(type="random_env", state_space=state_space, action_space=action_space, deterministic=True)
    # Create the server, but don't start it yet. This will be done fully automatically by the tf-Session.
    specifiable_server = SpecifiableServer(Environment, env_spec, dict(step_flow=[state_space, float, bool]),
                                           "terminate")

    # ret are ops now in the graph.
    ret1 = specifiable_server.step_flow(action_space.sample())
    ret2 = specifiable_server.step_flow(action_space.sample())

    # Check all 3 outputs of the Env step (next state, reward, terminal).
    self.assertEqual(ret1[0].shape, ())
    self.assertEqual(ret1[0].dtype, convert_dtype("float32"))
    self.assertEqual(ret1[1].shape, ())
    self.assertEqual(ret1[1].dtype, convert_dtype("float32"))
    self.assertEqual(ret1[2].shape, ())
    self.assertEqual(ret1[2].dtype, convert_dtype("bool"))
    self.assertEqual(ret2[0].shape, ())
    self.assertEqual(ret2[0].dtype, convert_dtype("float32"))
    self.assertEqual(ret2[1].shape, ())
    self.assertEqual(ret2[1].dtype, convert_dtype("float32"))
    self.assertEqual(ret2[2].shape, ())
    self.assertEqual(ret2[2].dtype, convert_dtype("bool"))

    # Start the session and run the ops, then check their actual values.
    with tf.train.SingularMonitoredSession(hooks=[SpecifiableServerHook()]) as sess:
        out1 = sess.run(ret1)
        out2 = sess.run(ret2)

        # next state
        self.assertAlmostEqual(out1[0], 0.7713, places=4)
        self.assertAlmostEqual(out2[0], 0.7488, places=4)
        # reward
        self.assertAlmostEqual(out1[1], 0.0208, places=4)
        self.assertAlmostEqual(out2[1], 0.4985, places=4)
        # terminal
        self.assertTrue(out1[2] is np.bool_(False))
        self.assertTrue(out2[2] is np.bool_(False))
def test_policy_for_discrete_action_space(self):
    # state_space (NN is a simple single fc-layer relu network (2 units), random biases, random weights).
    state_space = FloatBox(shape=(4,), add_batch_rank=True)
    # action_space (5 possible actions).
    action_space = IntBox(5, add_batch_rank=True)

    policy = Policy(network_spec=config_from_path("configs/test_simple_nn.json"), action_space=action_space)
    test = ComponentTest(
        component=policy,
        input_spaces=dict(nn_input=state_space),
        action_space=action_space
    )
    policy_params = test.read_variable_values(policy.variable_registry)

    # Some NN inputs (4 input nodes, batch size=2).
    states = np.array([[-0.08, 0.4, -0.05, -0.55], [13.0, -14.0, 10.0, -16.0]])
    # Raw NN-output.
    expected_nn_output = np.matmul(states, policy_params["policy/test-network/hidden-layer/dense/kernel"])
    test.test(("get_nn_output", states), expected_outputs=dict(output=expected_nn_output), decimals=6)

    # Raw action layer output; expected shape=(2, 5): 2=batch, 5=action categories.
    expected_action_layer_output = np.matmul(
        expected_nn_output, policy_params["policy/action-adapter/action-layer/dense/kernel"]
    )
    expected_action_layer_output = np.reshape(expected_action_layer_output, newshape=(2, 5))
    test.test(("get_action_layer_output", states),
              expected_outputs=dict(output=expected_action_layer_output), decimals=5)

    expected_actions = np.argmax(expected_action_layer_output, axis=-1)
    test.test(("get_action", states), expected_outputs=dict(action=expected_actions, last_internal_states=None))

    # Logits, parameters (probs) and skip log-probs (numerically unstable for small probs).
    expected_probabilities_output = softmax(expected_action_layer_output, axis=-1)
    test.test(("get_logits_parameters_log_probs", states, [0, 1, 2]), expected_outputs=dict(
        logits=expected_action_layer_output,
        parameters=expected_probabilities_output,
        log_probs=np.log(expected_probabilities_output)
    ), decimals=5)

    print("Probs: {}".format(expected_probabilities_output))

    # Deterministic sample.
    out = test.test(("get_deterministic_action", states), expected_outputs=None)
    self.assertTrue(out["action"].dtype == np.int32)
    self.assertTrue(out["action"].shape == (2,))

    # Stochastic sample.
    out = test.test(("get_stochastic_action", states), expected_outputs=None)
    self.assertTrue(out["action"].dtype == np.int32)
    self.assertTrue(out["action"].shape == (2,))

    # Distribution's entropy.
    out = test.test(("get_entropy", states), expected_outputs=None)
    self.assertTrue(out["entropy"].dtype == np.float32)
    self.assertTrue(out["entropy"].shape == (2,))
def test_slice_without_squeeze(self):
    slicer = Slice(squeeze=False)
    input_space = FloatBox(shape=(1, 4, 5), add_batch_rank=True)
    test = ComponentTest(component=slicer, input_spaces=dict(
        inputs=input_space,
        start_index=IntBox(),
        end_index=IntBox()
    ))

    # Batch of 4 samples.
    inputs = input_space.sample(size=4)

    expected = np.asarray([inputs[1]])  # Add the not-squeezed rank back to expected.
    test.test(("slice", [inputs, 1, 2]), expected_outputs=expected)

    expected = inputs[0:2]
    test.test(("slice", [inputs, 0, 2]), expected_outputs=expected)

    expected = np.asarray([inputs[0]])
    test.test(("slice", [inputs, 0, 1]), expected_outputs=expected)
def test_slice_with_squeeze(self):
    slicer = Slice(squeeze=True)
    input_space = FloatBox(shape=(2, 2, 3), add_batch_rank=True, add_time_rank=True, time_major=True)
    test = ComponentTest(component=slicer, input_spaces=dict(
        preprocessing_inputs=input_space,
        start_index=IntBox(),
        end_index=IntBox()
    ))

    # Time-steps=3, batch=5.
    inputs = input_space.sample(size=(3, 5))

    expected = inputs[1]
    test.test(("slice", [inputs, 1, 2]), expected_outputs=expected)

    expected = inputs[0:2]
    test.test(("slice", [inputs, 0, 2]), expected_outputs=expected)

    expected = inputs[0]
    test.test(("slice", [inputs, 0, 1]), expected_outputs=expected)
def test_multi_input_stream_neural_network_with_tuple(self):
    # Space must contain batch dimension (otherwise, NNLayer will complain).
    input_space = Tuple(
        IntBox(3, shape=()),
        FloatBox(shape=(8,)),
        IntBox(4, shape=()),
        add_batch_rank=True
    )

    multi_input_nn = MultiInputStreamNeuralNetwork(
        input_network_specs=(
            [{"type": "reshape", "flatten": True, "flatten_categories": True}],  # intbox -> flatten
            [{"type": "dense", "units": 2}],                                     # floatbox -> dense
            [{"type": "reshape", "flatten": True, "flatten_categories": True}]   # intbox -> flatten
        ),
        post_network_spec=[{"type": "dense", "units": 3}],
    )

    test = ComponentTest(component=multi_input_nn, input_spaces=dict(inputs=input_space))

    # Batch of size=n.
    nn_inputs = input_space.sample(3)

    global_scope_pre = "multi-input-stream-nn/input-stream-nn-"
    global_scope_post = "multi-input-stream-nn/post-concat-nn/dense-layer/dense/"

    # Calculate output manually.
    var_dict = test.read_variable_values()

    flat_0 = one_hot(nn_inputs[0], depth=3)
    dense_1 = dense_layer(
        nn_inputs[1],
        var_dict[global_scope_pre + "1/dense-layer/dense/kernel"],
        var_dict[global_scope_pre + "1/dense-layer/dense/bias"]
    )
    flat_2 = one_hot(nn_inputs[2], depth=4)
    concat_out = np.concatenate((flat_0, dense_1, flat_2), axis=-1)
    expected = dense_layer(concat_out, var_dict[global_scope_post + "kernel"], var_dict[global_scope_post + "bias"])

    test.test(("call", tuple([nn_inputs])), expected_outputs=expected)

    test.terminate()
def test_container_actions(self):
    # Test container actions with embedding.
    vocab_size = 100
    embed_dim = 128
    # ID/state space.
    state_space = IntBox(vocab_size, shape=(10,))

    # Container action space.
    actions_space = {}
    num_outputs = 3
    for i in range(3):
        actions_space['action_{}'.format(i)] = IntBox(low=0, high=num_outputs)
    actions_space = Dict(actions_space)

    agent_config = config_from_path("configs/dqfd_container.json")
    agent_config["network_spec"] = [
        dict(type="embedding", embed_dim=embed_dim, vocab_size=vocab_size),
        dict(type="reshape", flatten=True),
        dict(type="dense", units=embed_dim, activation="relu", scope="dense_1")
    ]
    agent = DQFDAgent.from_spec(agent_config, state_space=state_space, action_space=actions_space)

    terminals = BoolBox(add_batch_rank=True)
    rewards = FloatBox(add_batch_rank=True)
    agent.observe_demos(
        preprocessed_states=agent.preprocessed_state_space.with_batch_rank().sample(1),
        actions=actions_space.with_batch_rank().sample(1),
        rewards=rewards.sample(1),
        next_states=agent.preprocessed_state_space.with_batch_rank().sample(1),
        terminals=terminals.sample(1),
    )
def get_preprocessed_space(self, space):
    # TODO map of allowed conversions in utils?
    if isinstance(space, IntBox):
        if self.to_dtype == "float" or self.to_dtype == "float32" or self.to_dtype == "np.float" \
                or self.to_dtype == "tf.float32" or self.to_dtype == "torch.float32":
            return FloatBox(shape=space.shape, low=space.low, high=space.high,
                            add_batch_rank=space.has_batch_rank, add_time_rank=space.has_time_rank)
        elif self.to_dtype == "bool":
            if space.low == 0 and space.high == 1:
                return BoolBox(shape=space.shape, add_batch_rank=space.has_batch_rank,
                               add_time_rank=space.has_time_rank)
            else:
                raise RLGraphError("ERROR: Conversion from IntBox to BoolBox not allowed if low is not 0 and "
                                   "high is not 1.")
    elif isinstance(space, BoolBox):
        if self.to_dtype == "float" or self.to_dtype == "float32" or self.to_dtype == "np.float" \
                or self.to_dtype == "tf.float32" or self.to_dtype == "torch.float32":
            return FloatBox(shape=space.shape, low=0.0, high=1.0,
                            add_batch_rank=space.has_batch_rank, add_time_rank=space.has_time_rank)
        elif self.to_dtype == "int" or self.to_dtype == "int32" or self.to_dtype == "np.int32" or \
                self.to_dtype == "tf.int32" or self.to_dtype == "torch.int32":
            return IntBox(shape=space.shape, low=0, high=1,
                          add_batch_rank=space.has_batch_rank, add_time_rank=space.has_time_rank)
    elif isinstance(space, FloatBox):
        if self.to_dtype == "int" or self.to_dtype == "int32" or self.to_dtype == "np.int32" or \
                self.to_dtype == "tf.int32" or self.to_dtype == "torch.int32":
            return IntBox(shape=space.shape, low=space.low, high=space.high,
                          add_batch_rank=space.has_batch_rank, add_time_rank=space.has_time_rank)
        # Wrong conversion.
        else:
            raise RLGraphError("ERROR: Space conversion from: {} to type {} not supported".format(
                space, self.to_dtype))
    # No conversion.
    return space
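The chained equality checks above can also be expressed as membership tests. A rough standalone sketch (an assumption for illustration, covering only the IntBox branch and living outside the component class) of the same conversion table:

# Illustrative sketch only; `to_dtype` strings follow the method above.
def convert_int_box(space, to_dtype):
    if to_dtype in ("float", "float32", "np.float", "tf.float32", "torch.float32"):
        return FloatBox(shape=space.shape, low=space.low, high=space.high,
                        add_batch_rank=space.has_batch_rank, add_time_rank=space.has_time_rank)
    if to_dtype == "bool":
        if space.low == 0 and space.high == 1:
            return BoolBox(shape=space.shape, add_batch_rank=space.has_batch_rank,
                           add_time_rank=space.has_time_rank)
        raise RLGraphError("Conversion from IntBox to BoolBox requires low=0 and high=1.")
    # No conversion.
    return space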
def test_update_throughput(self):
    env = Environment.from_spec(self.env_spec)
    # TODO comment in for multi gpu
    # config_from_path("configs/multi_gpu_ray_apex_for_pong.json"),
    config = config_from_path("configs/ray_apex_for_pong.json")

    # Adjust to usable GPUs for test system.
    num_gpus = [1]
    for gpu_count in num_gpus:
        config["execution_spec"]["gpu_spec"]["num_gpus"] = gpu_count
        config["execution_spec"]["gpu_spec"]["per_process_gpu_memory_fraction"] = 1.0 / gpu_count

        agent = Agent.from_spec(
            # TODO replace with config from above
            config_from_path("configs/ray_apex_for_pong.json"),
            state_space=env.state_space,
            # Try with "reduced" action space (actually only 3 actions: up, down, no-op).
            action_space=env.action_space
        )

        batch_space = Dict(
            states=agent.preprocessed_state_space,
            actions=env.action_space,
            rewards=FloatBox(),
            next_states=agent.preprocessed_state_space,
            terminals=IntBox(low=0, high=1),
            importance_weights=FloatBox(),
            add_batch_rank=True
        )

        batch_size = 512 * gpu_count
        num_samples = 50
        samples = [batch_space.sample(batch_size) for _ in range(num_samples)]

        times = []
        throughputs = []
        for sample in samples:
            start = time.perf_counter()
            agent.update(sample)
            runtime = time.perf_counter() - start
            times.append(runtime)
            throughputs.append(batch_size / runtime)

        print("Throughput: {} samples / s ({}) for {} GPUs".format(
            np.mean(throughputs), np.std(throughputs), gpu_count))
def test_random_env(self):
    """
    Tests deterministic functionality of RandomEnv.
    """
    env = RandomEnv(state_space=FloatBox(shape=(2, 2)), action_space=IntBox(2), deterministic=True)

    # Simple test runs with fixed actions.
    s = env.reset()
    recursive_assert_almost_equal(s, np.array([[0.77132064, 0.02075195], [0.63364823, 0.74880388]]))
    s, r, t, _ = env.step(env.action_space.sample())
    recursive_assert_almost_equal(s, np.array([[0.1980629, 0.7605307], [0.1691108, 0.0883398]]))
    s, r, t, _ = env.step(env.action_space.sample())
    recursive_assert_almost_equal(r, np.array(0.7217553))
    s, r, t, _ = env.step(env.action_space.sample())
    self.assertEqual(t, False)
    s, r, t, _ = env.step(env.action_space.sample())
    recursive_assert_almost_equal(s, np.array([[0.4418332, 0.434014], [0.617767, 0.5131382]]))
    s, r, t, _ = env.step(env.action_space.sample())
def build_output_tokens(self):
    """
    Tokenizes vocabulary used for action representations for the output of the Q-network.

    Exposed through self.system_spec and self.actions_spec.

    Recall that the action representation maps an index field (a candidate index field) to a
    decision. E.g. suppose we allow indices on up to 3 cols and allow indices to be ASC or DESC;
    then each action component is specified in [0, 6], where 0 corresponds to a noop, {1, 2}
    correspond to an ASC or DESC index on the 1st query attribute, {3, 4} correspond to an ASC
    or DESC index on the 2nd query attribute, etc.

    {0: 1, 1: 0, 2: 0} is an action specifying an (ascending) index on the 1st query attribute,
    and noops for the 2 remaining allowed columns of the compound index.

    n.b. actions_spec comes from action branching architectures
    (https://arxiv.org/abs/1711.08946) TODO dig deeper into that
    """
    noop_idx = 0
    idxs = []
    self.actions_spec = {}

    # not sure whether ASC / DESC can be specified
    # see LIFT paper for this representation in particular
    n_outputs = 1 + self.max_fields_per_index  # 1 + 2 * self.max_fields_per_index

    for i in range(self.max_fields_per_index):
        idxs.append('index_column{}'.format(i))
        self.actions_spec['index_column{}'.format(i)] = IntBox(low=0, high=n_outputs)  # ?

    self.actions_spec = Dict(self.actions_spec, add_batch_rank=True)

    self.system_spec['idxs'] = idxs
    self.system_spec['n_outputs'] = n_outputs
    self.system_spec['noop_idx'] = noop_idx
    self.system_spec['max_fields_per_index'] = self.max_fields_per_index
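A small hedged example (column names and `max_fields_per_index=3` are assumptions, not taken from the original) of decoding one sampled action dict from this `actions_spec` back into a compound-index column list, following the noop/attribute mapping described in the docstring:

# Illustrative decoding only; query_cols and the helper name decode_action are hypothetical.
def decode_action(action, query_cols):
    """Turn {'index_column0': k, ...} into the ordered list of columns to index (0 = noop)."""
    index_cols = []
    for i in range(len(query_cols)):
        choice = action['index_column{}'.format(i)]
        if choice != 0 and choice - 1 < len(query_cols):
            col = query_cols[choice - 1]
            if col not in index_cols:
                index_cols.append(col)
    return index_cols

# {0: 1, 1: 0, 2: 0} -> index on the 1st query attribute only.
print(decode_action({'index_column0': 1, 'index_column1': 0, 'index_column2': 0},
                    ['col_a', 'col_b', 'col_c']))  # -> ['col_a']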
def _prepare_loss_function_test(loss_function):
    test = ComponentTest(
        component=loss_function,
        input_spaces=dict(
            alpha=float,
            log_probs_next_sampled=FloatBox(shape=(1,), add_batch_rank=True),
            q_values_next_sampled=Tuple(FloatBox(shape=(1,)), FloatBox(shape=(1,)), add_batch_rank=True),
            q_values=Tuple(FloatBox(shape=(1,)), FloatBox(shape=(1,)), add_batch_rank=True),
            log_probs_sampled=FloatBox(shape=(1,), add_batch_rank=True),
            q_values_sampled=Tuple(FloatBox(shape=(1,)), FloatBox(shape=(1,)), add_batch_rank=True),
            rewards=FloatBox(add_batch_rank=True),
            terminals=BoolBox(add_batch_rank=True),
            loss_per_item=FloatBox(add_batch_rank=True)
        ),
        action_space=IntBox(2, shape=(), add_batch_rank=True)
    )
    return test
def __init__(self, config: Dict[str, any], result_dir: str, cache_stats: CacheInformation):
    super().__init__(config, result_dir, cache_stats)
    # evaluation specific variables
    self.observation_seen = 0
    self.episode_reward = 0
    self.checkpoint_steps = config['checkpoint_steps']

    self._incomplete_experiences = TTLCache(InMemoryStorage())
    self._incomplete_experiences.expired_entry_callback(self._observe_expired_incomplete_experience)
    self.view_of_the_cache = {}  # type: Dict[str, Dict[str, any]]
    self._end_episode_observation = {ObservationType.Invalidate, ObservationType.Miss, ObservationType.Expiration}

    # TODO refactor into common RL interface for all strategies
    # Agent configuration (can be shared with others).
    agent_config = config['agent_config']
    fields_in_state = len(EvictionAgentSystemState.__slots__)
    self.converter = EvictionStrategyRLConverter(self.result_dir)

    # State: the fields of the key in question to observe.
    # Action: whether or not to evict that key.
    self.agent = Agent.from_spec(agent_config,
                                 state_space=FloatBox(shape=(fields_in_state,)),
                                 action_space=IntBox(low=0, high=2))

    self.logger = logging.getLogger(__name__)
    name = 'rl_eviction_strategy'
    self.reward_logger = create_file_logger(name=f'{name}_reward_logger', result_dir=self.result_dir)
    self.loss_logger = create_file_logger(name=f'{name}_loss_logger', result_dir=self.result_dir)
    self.observation_logger = create_file_logger(name=f'{name}_observation_logger', result_dir=self.result_dir)
    self.key_vocab = Vocabulary()
def test_prioritized_replay(self):
    """
    Tests individual and chunked insert and sampling performance of prioritized replay memory.
    """
    record_space = Dict(
        states=self.env.state_space,
        actions=self.env.action_space,
        reward=float,
        terminals=BoolBox(),
        add_batch_rank=True
    )
    input_spaces = dict(
        insert_records=record_space,
        get_records=int,
        update_records=[
            IntBox(shape=(), add_batch_rank=True),
            FloatBox(shape=(), add_batch_rank=True)
        ]
    )

    memory = PrioritizedReplay(capacity=self.capacity, next_states=True, alpha=self.alpha, beta=self.beta)
    test = ComponentTest(component=memory, input_spaces=input_spaces, enable_profiler=self.enable_profiler)

    records = [record_space.sample(size=1) for _ in range(self.inserts)]
    start = time.monotonic()
    for record in records:
        test.test(("insert_records", record), expected_outputs=None)
    end = time.monotonic() - start
    tp = len(records) / end

    print('#### Testing Prioritized Replay memory ####')
    print('Testing insert performance:')
    print('Inserted {} separate records, throughput: {} records/s, total time: {} s'.format(
        len(records), tp, end))

    record_chunks = [record_space.sample(size=self.chunk_size) for _ in range(self.inserts)]
    start = time.monotonic()
    for chunk in record_chunks:
        test.test(("insert_records", chunk), expected_outputs=None)
    end = time.monotonic() - start
    tp = len(record_chunks) * self.chunk_size / end
    print('Inserted {} record chunks of size {}, throughput: {} records/s, total time: {} s'.format(
        len(record_chunks), self.chunk_size, tp, end))

    print('Testing sample performance:')
    start = time.monotonic()
    for _ in range(self.samples):
        test.test(("get_records", self.sample_batch_size), expected_outputs=None)
    end = time.monotonic() - start
    tp = self.samples / end
    print('Sampled {} batches of size {}, throughput: {} sample-ops/s, total time: {} s'.format(
        self.samples, self.sample_batch_size, tp, end))
def test_custom_margin_demos_with_container_actions(self):
    # Tests if using different margins per sample works.
    # Same state for both demos, but different actions.
    vocab_size = 100
    embed_dim = 8
    # ID/state space.
    state_space = IntBox(vocab_size, shape=(10,))

    # Container action space.
    actions_space = {}
    num_outputs = 3
    for i in range(3):
        actions_space['action_{}'.format(i)] = IntBox(low=0, high=num_outputs)
    actions_space = Dict(actions_space)

    agent_config = config_from_path("configs/dqfd_container.json")
    agent_config["network_spec"] = [
        dict(type="embedding", embed_dim=embed_dim, vocab_size=vocab_size),
        dict(type="reshape", flatten=True),
        dict(type="dense", units=embed_dim, activation="relu", scope="dense_1")
    ]
    agent = DQFDAgent.from_spec(agent_config, state_space=state_space, action_space=actions_space)

    terminals = BoolBox(add_batch_rank=True)
    rewards = FloatBox(add_batch_rank=True)

    # Create a set of demos.
    demo_states = agent.preprocessed_state_space.with_batch_rank().sample(2)
    # Same state.
    demo_states[1] = demo_states[0]
    demo_actions = actions_space.with_batch_rank().sample(2)

    for name, action in actions_space.items():
        demo_actions[name][0] = 0
        demo_actions[name][1] = 1

    # Both demo rewards are zero; the margins below encode which action is preferred.
    demo_rewards = rewards.sample(2, fill_value=.0)
    demo_rewards[0] = 0
    demo_rewards[1] = 0

    # One action is encouraged, one is discouraged.
    margins = np.asarray([0.5, -0.5])

    demo_next_states = agent.preprocessed_state_space.with_batch_rank().sample(2)
    demo_terminals = terminals.sample(2, fill_value=False)

    # When using margins, need to use an external batch.
    batch = dict(
        states=demo_states,
        actions=demo_actions,
        rewards=demo_rewards,
        next_states=demo_next_states,
        importance_weights=np.ones_like(demo_rewards),
        terminals=demo_terminals,
    )
    # Fit demos with custom margins.
    for _ in range(10000):
        agent.update(batch=batch, update_from_demos=False, apply_demo_loss_to_batch=True,
                     expert_margins=margins)

    # Evaluate demos for the state -> should return the encouraged action.
    agent_actions = agent.get_action(np.array([demo_states[0]]), apply_preprocessing=False,
                                     use_exploration=False)
    print("learned action = ", agent_actions)
def __init__(self, file_name=None, worker_id=0, base_port=5005, seed=0, docker_training=False, no_graphics=False,
             timeout_wait=30, train_mode=True, **kwargs):
    """
    Args:
        file_name (Optional[str]): Name of the Unity environment binary.
        base_port (int): Port number to connect to the Unity environment. `worker_id` increments on top of this.
        worker_id (int): Number to add to `base_port`. Used for asynchronous agent scenarios.
        docker_training (bool): Informs this class whether the process is being run within a container.
            Default: False.
        no_graphics (bool): Whether to run the Unity simulator in no-graphics mode. Default: False.
        timeout_wait (int): Time (in seconds) to wait for a connection from the environment.
        train_mode (bool): Whether to run in training mode, speeding up the simulation. Default: True.
    """
    # First create the UnityMLAgentsEnvironment to get state and action spaces, then create the
    # RLgraph Environment instance.
    self.mlagents_env = UnityEnvironment(file_name, worker_id, base_port, seed, docker_training, no_graphics)
    all_brain_info = self.mlagents_env.reset()
    # Get all possible information from AllBrainInfo.
    # TODO: Which scene do we pick?
    self.scene_key = next(iter(all_brain_info))
    first_brain_info = all_brain_info[self.scene_key]
    num_environments = len(first_brain_info.agents)

    state_space = {}
    if len(first_brain_info.vector_observations[0]) > 0:
        state_space["vector"] = get_space_from_op(first_brain_info.vector_observations[0])
        # TODO: This is a hack.
        if state_space["vector"].dtype == np.float64:
            state_space["vector"].dtype = np.float32
    if len(first_brain_info.visual_observations) > 0:
        state_space["visual"] = get_space_from_op(first_brain_info.visual_observations[0])
    if first_brain_info.text_observations[0]:
        state_space["text"] = get_space_from_op(first_brain_info.text_observations[0])

    if len(state_space) == 1:
        self.state_key = next(iter(state_space))
        state_space = state_space[self.state_key]
    else:
        self.state_key = None
        state_space = Dict(state_space)

    brain_params = next(iter(self.mlagents_env.brains.values()))
    if brain_params.vector_action_space_type == "discrete":
        highs = brain_params.vector_action_space_size
        # MultiDiscrete (Tuple(IntBox)).
        if any(h != highs[0] for h in highs):
            action_space = Tuple([IntBox(h) for h in highs])
        # Normal IntBox:
        else:
            action_space = IntBox(
                low=np.zeros_like(highs, dtype=np.int32),
                high=np.array(highs, dtype=np.int32),
                shape=(len(highs),)
            )
    else:
        action_space = get_space_from_op(first_brain_info.action_masks[0])
    if action_space.dtype == np.float64:
        action_space.dtype = np.float32

    super(MLAgentsEnv, self).__init__(
        num_environments=num_environments, state_space=state_space, action_space=action_space, **kwargs
    )

    # Caches the last observation we made (after stepping or resetting).
    self.last_state = None
def test_keras_style_complex_multi_stream_nn(self):
    # 3 inputs.
    input_spaces = [
        Dict({
            "img": FloatBox(shape=(6, 6, 3)),
            "int": IntBox(3)
        }, add_batch_rank=True, add_time_rank=True),
        FloatBox(shape=(2,), add_batch_rank=True),
        Tuple(IntBox(2), TextBox(), add_batch_rank=True, add_time_rank=True)
    ]

    # Same NN as in the test above, only using some of the sub-Spaces from the input spaces.
    # Tests whether this NN can automatically add the correct splitters.
    folded_text = ReShape(fold_time_rank=True)(input_spaces[2][1])
    # String layer will create batched AND time-ranked (individual words) hash outputs (int64).
    string_bucket_out, lengths = StringToHashBucket(num_hash_buckets=5)(folded_text)
    # Batched and time-ranked embedding output (floats) with embed dim=n.
    embedding_out = EmbeddingLookup(embed_dim=10, vocab_size=5)(string_bucket_out)
    # Pass embeddings through a text LSTM and use the last output (reduce time-rank).
    string_lstm_out, _ = LSTMLayer(units=2, return_sequences=False, scope="lstm-layer-txt")(
        embedding_out, sequence_length=lengths
    )
    # Unfold to get the original time-rank back.
    string_lstm_out_unfolded = ReShape(unfold_time_rank=True)(string_lstm_out, input_spaces[2][1])

    # Parallel image stream via 1 CNN layer plus dense.
    folded_img = ReShape(fold_time_rank=True, scope="img-fold")(input_spaces[0]["img"])
    cnn_out = Conv2DLayer(filters=1, kernel_size=2, strides=2)(folded_img)
    unfolded_cnn_out = ReShape(unfold_time_rank=True, scope="img-unfold")(cnn_out, input_spaces[0]["img"])
    unfolded_cnn_out_flattened = ReShape(flatten=True, scope="img-flat")(unfolded_cnn_out)
    dense_out = DenseLayer(units=2, scope="dense-0")(unfolded_cnn_out_flattened)

    # Concat everything.
    concat_out = ConcatLayer()(string_lstm_out_unfolded, dense_out)

    # LSTM output has batch+time.
    main_lstm_out, internal_states = LSTMLayer(units=2, scope="lstm-layer-main")(concat_out)

    dense1_after_lstm_out = DenseLayer(units=3, scope="dense-1")(main_lstm_out)
    dense2_after_lstm_out = DenseLayer(units=2, scope="dense-2")(dense1_after_lstm_out)
    dense3_after_lstm_out = DenseLayer(units=1, scope="dense-3")(dense2_after_lstm_out)

    # A NN with 3 outputs.
    neural_net = NeuralNetwork(inputs=input_spaces, outputs=[dense3_after_lstm_out, main_lstm_out, internal_states])

    test = ComponentTest(component=neural_net, input_spaces=dict(inputs=input_spaces))

    # Batch of size=n.
    sample_shape = (4, 2)
    input_ = [input_spaces[0].sample(sample_shape), input_spaces[1].sample(sample_shape[0]),
              input_spaces[2].sample(sample_shape)]

    out = test.test(("call", tuple(input_)), expected_outputs=None)

    # Main output (Dense out after LSTM).
    self.assertTrue(out[0].shape == sample_shape + (1,))  # 1=1 unit in dense layer
    self.assertTrue(out[0].dtype == np.float32)
    # main-LSTM out.
    self.assertTrue(out[1].shape == sample_shape + (2,))  # 2=2 LSTM units
    self.assertTrue(out[1].dtype == np.float32)
    # main-LSTM internal-states.
    self.assertTrue(out[2][0].shape == sample_shape[:1] + (2,))  # 2=2 LSTM units
    self.assertTrue(out[2][0].dtype == np.float32)
    self.assertTrue(out[2][1].shape == sample_shape[:1] + (2,))  # 2=2 LSTM units
    self.assertTrue(out[2][1].dtype == np.float32)

    test.terminate()
def run_dqn(exp, steps=25000, combinatorial=False):
    #
    # can't account for all configurations, but be sure agent is of a reasonably small size
    #
    vocab_size = 6
    state_size = 6

    #
    # queries, rewards for actions per query
    #
    dqn_queries, _, actions = data(exp)
    repr_builder = RepresentationBuilder()
    get_query, get_reward = repr_builder.build_dqn(dqn_queries, actions, K=state_size, prob=0.67)

    #
    # agent
    #
    import json
    # config is a bit big to copy
    with open('/Users/jeremywelborn/rlautoindex/conf/dqn.json', 'r') as f:
        config = json.load(f)
    agent_config = config['agent']

    # any further adjustments?
    agent_config['memory_spec']['type'] = 'replay'
    agent_config['exploration_spec']['epsilon_spec']['decay_spec']['num_timesteps'] = int(steps * .75)
    agent_config['network_spec'][0]['embed_dim'] = 64  # reduce capacity
    agent_config['network_spec'][2]['units'] = 64
    agent_config['network_spec'][0]['vocab_size'] = vocab_size

    # replicate representations defined in Schema
    state_spec = IntBox(low=0, high=vocab_size, shape=(state_size,))
    if not combinatorial:
        n_outputs = 1 + 3
        action_spec = {}
        for i in range(3):
            action_spec['candidate_index_column{}'.format(i)] = IntBox(low=0, high=n_outputs)
        action_spec = Dict(action_spec, add_batch_rank=True)
    else:
        perm_idx_2_perm = []
        for r in range(3 + 1):
            perm_idx_2_perm.extend(itertools.permutations(range(3), r=r))
        perm_idx_2_perm = list(map(list, perm_idx_2_perm))
        # [[], [0], [1], [2], [0, 1], [0, 2], [1, 0], [1, 2], [2, 0], [2, 1],
        #  [0, 1, 2], [0, 2, 1], [1, 0, 2], [1, 2, 0], [2, 0, 1], [2, 1, 0]]
        # action is a scalar corresponding to a particular permutation of query attributes
        action_spec = IntBox(low=0, high=len(perm_idx_2_perm))

    task_graph = TaskGraph()
    task = Task(agent_config, state_space=state_spec, action_space=action_spec)
    task_graph.add_task(task)
    task_graph.get_task("").unwrap().timesteps = 0
    controller = SystemController(None, None)  # have to have for updates...
    controller.task_graph = task_graph
    controller.set_update_schedule(agent_config["update_spec"])

    print("params: {}".format(task.agent.graph_builder.num_trainable_parameters))  # TODO yikes

    #
    # train agent
    #
    step = 0
    record = []
    running_avg_reward = deque(maxlen=1000)
    start = time.time()
    while step < steps:
        step += 1

        if step != 0 and step % 1000 == 0:
            print('running avg reward after {}/{} steps is {}'.format(step, steps, np.mean(running_avg_reward)))
            record.append((step, np.mean(running_avg_reward), time.time() - start))

        query_idx, query = get_query()
        agent_action = task_graph.act_task("", query, apply_preprocessing=True)

        # replicate representation conversions defined in Converter
        # hack - same as how query_cols are stored with query in actual training loop
        attr_tokens = [foo_token, bar_token, baz_token]
        # count tokens that are column tokens
        n_attrs = len([attr_token for attr_token in query[:3] if attr_token in attr_tokens])

        if not combinatorial:
            action = []
            for key in ['candidate_index_column{}'.format(i) for i in range(3)]:
                action_val = agent_action[key][0]
                if action_val != 0:  # if not a noop
                    if n_attrs > action_val - 1:  # if a valid action
                        col = query[:n_attrs][action_val - 1]
                        if col not in action:
                            action.append(col)
        else:
            action = []
            perm_idx = agent_action
            perm = perm_idx_2_perm[perm_idx]
            # ignore case like query==[foo] with a permutation of length 2
            if len(perm) == n_attrs:
                for query_attr_idx in perm:
                    # ignore case like query==[foo] with a permutation referencing a 1st attribute
                    # when there is only a 0th attribute
                    if n_attrs > query_attr_idx:
                        col = query[:n_attrs][query_attr_idx]
                        # if col not in action:  # no repeats in this representation
                        action.append(col)

        reward = get_reward(query_idx, action)
        running_avg_reward.append(reward)

        # TODO what to do with s_t+1???
        task_graph.observe_task("", query, agent_action, [], reward, query, False)
        controller.update_if_necessary()

    return record
class TestPythonPrioritizedReplay(unittest.TestCase):
    """
    Tests sampling and insertion behaviour of the mem_prioritized_replay module.
    """
    record_space = Dict(
        states=dict(state1=float, state2=float),
        actions=dict(action1=float),
        reward=float,
        terminals=BoolBox(),
        add_batch_rank=True
    )
    apex_space = Dict(
        states=FloatBox(shape=(4,)),
        actions=FloatBox(shape=(2,)),
        reward=float,
        terminals=BoolBox(),
        weights=FloatBox(),
        add_batch_rank=True
    )

    memory_variables = ["size", "index", "max-priority"]

    capacity = 10
    alpha = 1.0
    beta = 1.0
    max_priority = 1.0

    input_spaces = dict(
        # insert: records
        records=record_space,
        # get_records: num_records
        num_records=int,
        # update_records: indices, update
        indices=IntBox(add_batch_rank=True),
        update=FloatBox(add_batch_rank=True)
    )

    # TODO These methods are all graph fns now -> unify backend tests.
    def test_insert(self):
        """
        Simply tests insert op without checking internal logic.
        """
        memory = MemPrioritizedReplay(capacity=self.capacity, next_states=True, alpha=self.alpha, beta=self.beta)
        memory.create_variables(self.input_spaces)

        observation = memory.record_space_flat.sample(size=1)
        memory.insert_records(observation)

        # Test chunked insert.
        observation = memory.record_space_flat.sample(size=5)
        memory.insert_records(observation)

        # Also test Apex version.
        memory = ApexMemory(capacity=self.capacity, alpha=self.alpha, beta=self.beta)
        observation = self.apex_space.sample(size=5)
        for i in range_(5):
            memory.insert_records((
                observation['states'][i],
                observation['actions'][i],
                observation['reward'][i],
                observation['terminals'][i],
                observation['states'][i],
                observation["weights"][i]
            ))

    def test_update_records(self):
        """
        Tests update records logic.
        """
        memory = MemPrioritizedReplay(capacity=self.capacity, next_states=True)
        memory.create_variables(self.input_spaces)

        # Insert a few elements.
        observation = memory.record_space_flat.sample(size=2)
        memory.insert_records(observation)

        # Fetch elements and their indices.
        num_records = 2
        batch = memory.get_records(num_records)
        indices = batch[1]
        self.assertEqual(num_records, len(indices))

        # Does not return anything.
        memory.update_records(indices, np.asarray([0.1, 0.2]))

        # Test Apex memory.
        memory = ApexMemory(capacity=self.capacity, alpha=self.alpha, beta=self.beta)
        observation = self.apex_space.sample(size=5)
        for i in range_(5):
            memory.insert_records((
                ray_compress(observation["states"][i]),
                observation["actions"][i],
                observation["reward"][i],
                observation["terminals"][i],
                observation["weights"][i]
            ))

        # Fetch elements and their indices.
        num_records = 5
        batch = memory.get_records(num_records)
        indices = batch[1]
        self.assertEqual(num_records, len(indices))

        # Does not return anything.
        memory.update_records(indices, np.random.uniform(size=10))

    def test_segment_tree_insert_values(self):
        """
        Tests if segment tree inserts into correct positions.
        """
        memory = MemPrioritizedReplay(capacity=self.capacity, next_states=True, alpha=self.alpha, beta=self.beta)
        memory.create_variables(self.input_spaces)

        priority_capacity = 1
        while priority_capacity < self.capacity:
            priority_capacity *= 2

        sum_segment_values = memory.merged_segment_tree.sum_segment_tree.values
        min_segment_values = memory.merged_segment_tree.min_segment_tree.values

        self.assertEqual(sum(sum_segment_values), 0)
        self.assertEqual(sum(min_segment_values), float('inf'))
        self.assertEqual(len(sum_segment_values), 2 * priority_capacity)
        self.assertEqual(len(min_segment_values), 2 * priority_capacity)

        # Insert 1 element.
        observation = memory.record_space_flat.sample(size=1)
        memory.insert_records(observation)

        # Check insert positions.
        # Initial insert is at priority capacity.
        print(sum_segment_values)
        print(min_segment_values)
        start = priority_capacity
        while start >= 1:
            self.assertEqual(sum_segment_values[start], 1.0)
            self.assertEqual(min_segment_values[start], 1.0)
            start = int(start / 2)

        # Insert another element.
        observation = memory.record_space_flat.sample(size=1)
        memory.insert_records(observation)

        # Index shifted by 1.
        start = priority_capacity + 1
        self.assertEqual(sum_segment_values[start], 1.0)
        self.assertEqual(min_segment_values[start], 1.0)
        start = int(start / 2)
        while start >= 1:
            # 1 + 1 is 2 on the segment.
            self.assertEqual(sum_segment_values[start], 2.0)
            # min is still 1.
            self.assertEqual(min_segment_values[start], 1.0)
            start = int(start / 2)

    def test_tree_insert(self):
        """
        Tests inserting into the segment tree and querying segments.
        """
        memory = ApexMemory(capacity=4)
        tree = memory.merged_segment_tree.sum_segment_tree
        tree.insert(2, 1.0)
        tree.insert(3, 3.0)
        assert np.isclose(tree.get_sum(), 4.0)
        assert np.isclose(tree.get_sum(0, 2), 0.0)
        assert np.isclose(tree.get_sum(0, 3), 1.0)
        assert np.isclose(tree.get_sum(2, 3), 1.0)
        assert np.isclose(tree.get_sum(2, -1), 1.0)
        assert np.isclose(tree.get_sum(2, 4), 4.0)

    def test_prefixsum_idx(self):
        """
        Tests fetching the index corresponding to a prefix sum.
        """
        memory = ApexMemory(capacity=4)
        tree = memory.merged_segment_tree.sum_segment_tree
        tree.insert(2, 1.0)
        tree.insert(3, 3.0)

        self.assertEqual(tree.index_of_prefixsum(0.0), 2)
        self.assertEqual(tree.index_of_prefixsum(0.5), 2)
        self.assertEqual(tree.index_of_prefixsum(0.99), 2)
        self.assertEqual(tree.index_of_prefixsum(1.01), 3)
        self.assertEqual(tree.index_of_prefixsum(3.0), 3)
        self.assertEqual(tree.index_of_prefixsum(4.0), 3)

        memory = ApexMemory(capacity=4)
        tree = memory.merged_segment_tree.sum_segment_tree
        tree.insert(0, 0.5)
        tree.insert(1, 1.0)
        tree.insert(2, 1.0)
        tree.insert(3, 3.0)
        self.assertEqual(tree.index_of_prefixsum(0.0), 0)
        self.assertEqual(tree.index_of_prefixsum(0.55), 1)
        self.assertEqual(tree.index_of_prefixsum(0.99), 1)
        self.assertEqual(tree.index_of_prefixsum(1.51), 2)
        self.assertEqual(tree.index_of_prefixsum(3.0), 3)
        self.assertEqual(tree.index_of_prefixsum(5.50), 3)
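For reference, a minimal pure-Python sum-segment-tree sketch (an assumption of how such a structure typically works, not RLgraph's implementation) that reproduces the prefix-sum behaviour the tests above check:

# Minimal sum segment tree over a fixed, power-of-two capacity; illustrative only.
class TinySumTree:
    def __init__(self, capacity):
        self.capacity = capacity
        self.values = [0.0] * (2 * capacity)  # leaves live at indices [capacity, 2*capacity)

    def insert(self, index, priority):
        i = index + self.capacity
        self.values[i] = priority
        i //= 2
        while i >= 1:  # propagate the new sum up to the root
            self.values[i] = self.values[2 * i] + self.values[2 * i + 1]
            i //= 2

    def index_of_prefixsum(self, prefix_sum):
        i = 1
        while i < self.capacity:  # descend to the leaf holding the given prefix mass
            if self.values[2 * i] > prefix_sum:
                i = 2 * i
            else:
                prefix_sum -= self.values[2 * i]
                i = 2 * i + 1
        return i - self.capacity

tree = TinySumTree(capacity=4)
tree.insert(2, 1.0)
tree.insert(3, 3.0)
print(tree.index_of_prefixsum(0.99))  # -> 2 (mass below 1.0 belongs to leaf 2)
print(tree.index_of_prefixsum(1.01))  # -> 3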
class TestEnvironmentStepper(unittest.TestCase):
    """
    Tests for the EnvironmentStepper Component using a simple RandomEnv.
    """
    deterministic_env_state_space = FloatBox(shape=(1,))
    deterministic_env_action_space = IntBox(2)
    deterministic_action_probs_space = FloatBox(shape=(2,), add_batch_rank=True)

    internal_states_space = Tuple(FloatBox(shape=(256,)), FloatBox(shape=(256,)), add_batch_rank=True)
    internal_states_space_test_lstm = Tuple(FloatBox(shape=(3,)), FloatBox(shape=(3,)), add_batch_rank=True)

    action_probs_space = FloatBox(shape=(4,), add_batch_rank=True)

    time_steps = 500

    def test_environment_stepper_on_deterministic_env(self):
        preprocessor_spec = None
        network_spec = config_from_path("configs/test_simple_nn.json")
        exploration_spec = None
        actor_component = ActorComponent(
            preprocessor_spec,
            dict(network_spec=network_spec, action_space=self.deterministic_env_action_space),
            exploration_spec
        )
        environment_stepper = EnvironmentStepper(
            environment_spec=dict(type="deterministic_env", steps_to_terminal=5),
            actor_component_spec=actor_component,
            state_space=self.deterministic_env_state_space,
            reward_space="float32",
            num_steps=3
        )

        test = ComponentTest(
            component=environment_stepper,
            action_space=self.deterministic_env_action_space,
        )

        # Reset the stepper.
        test.test("reset")

        # Step 3 times through the Env and collect results.
        expected = (
            None,
            (
                np.array([True, False, False, False]),    # t_
                np.array([[0.0], [1.0], [2.0], [3.0]]),   # s' (raw)
            )
        )
        test.test("step", expected_outputs=expected)

        # Step again, check whether stitching of states/etc. works.
        expected = (
            None,
            (
                np.array([False, False, True, False]),    # t_
                np.array([[3.0], [4.0], [0.0], [1.0]]),   # s' (raw)
            )
        )
        test.test("step", expected_outputs=expected)

        # Make sure we close the session (to shut down the Env on the server).
        test.terminate()

    def test_environment_stepper_on_deterministic_env_with_returning_action_probs(self):
        preprocessor_spec = [dict(type="divide", divisor=2)]
        network_spec = config_from_path("configs/test_simple_nn.json")
        exploration_spec = None
        actor_component = ActorComponent(
            preprocessor_spec,
            dict(network_spec=network_spec, action_space=self.deterministic_env_action_space),
            exploration_spec
        )
        environment_stepper = EnvironmentStepper(
            environment_spec=dict(type="deterministic_env", steps_to_terminal=6),
            actor_component_spec=actor_component,
            state_space=self.deterministic_env_state_space,
            reward_space="float32",
            add_action_probs=True,
            action_probs_space=self.deterministic_action_probs_space,
            num_steps=3
        )

        test = ComponentTest(
            component=environment_stepper,
            action_space=self.deterministic_env_action_space,
        )

        weights = test.read_variable_values(environment_stepper.actor_component.policy.variables)
        weights_hid = weights["environment-stepper/actor-component/policy/test-network/hidden-layer/dense/kernel"]
        biases_hid = weights["environment-stepper/actor-component/policy/test-network/hidden-layer/dense/bias"]
        weights_action = weights["environment-stepper/actor-component/policy/action-adapter/action-layer/dense/kernel"]
        biases_action = weights["environment-stepper/actor-component/policy/action-adapter/action-layer/dense/bias"]

        # Reset the stepper.
        test.test("reset")

        # Step 3 times through the Env and collect results.
        expected = (
            None,
            (
                # t_
                np.array([True, False, False, False]),
                # s' (raw)
                np.array([[0.0], [1.0], [2.0], [3.0]]),
                # action probs
                np.array([
                    [0.0, 0.0],  # <- init (no input gets sent through NN).
                    softmax(dense_layer(dense_layer(np.array([0.0]), weights_hid, biases_hid),
                                        weights_action, biases_action)),
                    softmax(dense_layer(dense_layer(np.array([0.5]), weights_hid, biases_hid),
                                        weights_action, biases_action)),
                    softmax(dense_layer(dense_layer(np.array([1.0]), weights_hid, biases_hid),
                                        weights_action, biases_action))
                ])
            )
        )
        test.test("step", expected_outputs=expected, decimals=3)

        # Step again, check whether stitching of states/etc. works.
        expected = (
            None,
            (
                np.array([False, False, False, True]),
                np.array([[3.0], [4.0], [5.0], [0.0]]),  # s' (raw)
                np.array([
                    [0.0, 0.0],  # <- init (no input gets sent through NN).
                    softmax(dense_layer(dense_layer(np.array([1.5]), weights_hid, biases_hid),
                                        weights_action, biases_action)),
                    softmax(dense_layer(dense_layer(np.array([2.0]), weights_hid, biases_hid),
                                        weights_action, biases_action)),
                    softmax(dense_layer(dense_layer(np.array([2.5]), weights_hid, biases_hid),
                                        weights_action, biases_action))
                ])
            )
        )
        test.test("step", expected_outputs=expected, decimals=3)

        # Make sure we close the session (to shut down the Env on the server).
        test.terminate()

    def test_environment_stepper_on_deterministic_env_with_action_probs_lstm(self):
        internal_states_space = Tuple(FloatBox(shape=(3,)), FloatBox(shape=(3,)))
        preprocessor_spec = [dict(type="multiply", factor=0.1)]
        network_spec = config_from_path("configs/test_lstm_nn.json")
        exploration_spec = None
        actor_component = ActorComponent(
            preprocessor_spec,
            dict(network_spec=network_spec, action_space=self.deterministic_env_action_space),
            exploration_spec
        )
        environment_stepper = EnvironmentStepper(
            environment_spec=dict(type="deterministic_env", steps_to_terminal=3),
            actor_component_spec=actor_component,
            state_space=self.deterministic_env_state_space,
            reward_space="float32",
            internal_states_space=internal_states_space,
            add_action_probs=True,
            action_probs_space=self.deterministic_action_probs_space,
            num_steps=4,
        )

        test = ComponentTest(
            component=environment_stepper,
            action_space=self.deterministic_env_action_space,
        )

        weights = test.read_variable_values(environment_stepper.actor_component.policy.variables)
        weights_lstm = weights["environment-stepper/actor-component/policy/test-lstm-network/"
                               "lstm-layer/lstm-cell/kernel"]
        biases_lstm = weights["environment-stepper/actor-component/policy/test-lstm-network/lstm-layer/lstm-cell/bias"]
        weights_action = weights["environment-stepper/actor-component/policy/action-adapter/action-layer/dense/kernel"]
        biases_action = weights["environment-stepper/actor-component/policy/action-adapter/action-layer/dense/bias"]

        # Reset the stepper.
        test.test("reset")

        # Step 4 times through the Env and collect results.
        lstm_1 = lstm_layer(np.array([[[0.0]]]), weights_lstm, biases_lstm)
        lstm_2 = lstm_layer(np.array([[[0.1]]]), weights_lstm, biases_lstm, lstm_1[1])
        lstm_3 = lstm_layer(np.array([[[0.2]]]), weights_lstm, biases_lstm, lstm_2[1])
        lstm_4 = lstm_layer(np.array([[[0.0]]]), weights_lstm, biases_lstm, lstm_3[1])
        expected = (
            None,
            (
                np.array([True, False, False, True, False]),
                np.array([[0.0], [1.0], [2.0], [0.0], [1.0]]),  # s' (raw)
                np.array([
                    [0.0, 0.0],
                    softmax(dense_layer(np.squeeze(lstm_1[0]), weights_action, biases_action)),
                    softmax(dense_layer(np.squeeze(lstm_2[0]), weights_action, biases_action)),
                    softmax(dense_layer(np.squeeze(lstm_3[0]), weights_action, biases_action)),
                    softmax(dense_layer(np.squeeze(lstm_4[0]), weights_action, biases_action)),
                ]),  # action probs
                # internal states
                (
                    np.squeeze(np.array([[[0.0, 0.0, 0.0]], lstm_1[1][0], lstm_2[1][0],
                                         lstm_3[1][0], lstm_4[1][0]])),
                    np.squeeze(np.array([[[0.0, 0.0, 0.0]], lstm_1[1][1], lstm_2[1][1],
                                         lstm_3[1][1], lstm_4[1][1]]))
                )
            )
        )
        test.test("step", expected_outputs=expected)

        # Make sure we close the session (to shut down the Env on the server).
        test.terminate()

    def test_environment_stepper_on_pong(self):
        environment_spec = dict(type="openai_gym", gym_env="Pong-v0", frameskip=4, seed=10)
        dummy_env = Environment.from_spec(environment_spec)
        state_space = dummy_env.state_space
        action_space = dummy_env.action_space
        agent_config = config_from_path("configs/dqn_agent_for_pong.json")
        actor_component = ActorComponent(
            agent_config["preprocessing_spec"],
            dict(network_spec=agent_config["network_spec"],
                 action_adapter_spec=agent_config["action_adapter_spec"],
                 action_space=action_space),
            agent_config["exploration_spec"]
        )
        environment_stepper = EnvironmentStepper(
            environment_spec=environment_spec,
            actor_component_spec=actor_component,
            state_space=state_space,
            reward_space="float",
            add_reward=True,
            num_steps=self.time_steps
        )

        test = ComponentTest(
            component=environment_stepper,
            action_space=action_space,
        )

        # Step `time_steps` times through the Env and collect results.
        # 1st return value is the step-op (None), 2nd return value is the tuple of items (one entry per step),
        # with each step containing: terminals, (raw) next-states, rewards.

        # Reset the stepper.
        test.test("reset")

        time_start = time.monotonic()
        out = test.test("step")
        time_end = time.monotonic()
        print("Done running {} steps in env-stepper env in {}sec.".format(
            environment_stepper.num_steps, time_end - time_start))

        # Check types of outputs.
        self.assertTrue(out[0] is None)
        self.assertTrue(isinstance(out[1], DataOpTuple))  # the step results as a tuple (see below)

        # Check types of single data.
        #self.assertTrue(out[1][0].dtype == np.float32)  # preprocessed states
        #self.assertTrue(out[1][0].min() >= 0.0)  # make sure we have pixels / 255
        #self.assertTrue(out[1][0].max() <= 1.0)
        #self.assertTrue(out[1][1].dtype == np.int32)  # actions
        #self.assertTrue(out[1][2].dtype == np.float32)  # rewards
        #self.assertTrue(out[1][3].dtype == np.float32)  # episode return
        self.assertTrue(out[1][0].dtype == np.bool_)   # next-state is terminal?
        self.assertTrue(out[1][1].dtype == np.uint8)   # next state (raw, not preprocessed)
        self.assertTrue(out[1][1].min() >= 0)          # make sure we have pixels
        self.assertTrue(out[1][1].max() <= 255)
        self.assertTrue(out[1][2].dtype == np.float32) # rewards
        self.assertTrue(out[1][2].min() >= -1.0)       # -1.0 to 1.0
        self.assertTrue(out[1][2].max() <= 1.0)

        # Check whether episode returns match single rewards (including resetting after each terminal signal).
        #episode_returns = 0.0
        #for i in range(environment_stepper.num_steps):
        #    episode_returns += out[2][i]
        #    self.assertAlmostEqual(episode_returns, out[1][3][i])
        #    # Terminal: Reset accumulated episode-return before next step.
        #    if out[1][4][i] is np.bool_(True):
        #        episode_returns = 0.0

        # Make sure we close the session (to shut down the Env on the server).
        test.terminate()

    def test_compare_with_non_env_stepper(self):
        environment_spec = dict(type="openai_gym", gym_env="Pong-v0", frameskip=4, seed=10)
        dummy_env = Environment.from_spec(environment_spec)
        state_space = dummy_env.state_space.with_batch_rank()
        action_space = dummy_env.action_space
        agent_config = config_from_path("configs/dqn_agent_for_pong.json")
        actor_component = ActorComponent(
            agent_config["preprocessing_spec"],
            dict(network_spec=agent_config["network_spec"],
                 action_adapter_spec=agent_config["action_adapter_spec"],
                 action_space=action_space),
            agent_config["exploration_spec"]
        )
        test = ComponentTest(
            component=actor_component,
            input_spaces=dict(states=state_space),
            action_space=action_space,
        )
        s = dummy_env.reset()
        time_start = time.monotonic()
        for i in range(self.time_steps):
            out = test.test(("get_preprocessed_state_and_action", np.array([s])))
            #preprocessed_s = out["preprocessed_state"]
            a = out["action"]
            # Act in env.
            s, r, t, _ = dummy_env.step(a[0])  # remove batch
            if t is True:
                s = dummy_env.reset()
        time_end = time.monotonic()
        print("Done running {} steps in bare-metal env in {}sec.".format(self.time_steps, time_end - time_start))
        test.terminate()

    def test_environment_stepper_on_deepmind_lab(self):
        try:
            from rlgraph.environments.deepmind_lab import DeepmindLabEnv
        except ImportError:
            print("DeepmindLab not installed: Skipping this test case.")
            return

        env_spec = dict(
            type="deepmind_lab", level_id="seekavoid_arena_01", observations=["RGB_INTERLEAVED"], frameskip=4
        )
        dummy_env = Environment.from_spec(env_spec)
        state_space = dummy_env.state_space
        action_space = dummy_env.action_space
        actor_component = ActorComponent(
            # Preprocessor spec (only divide and flatten the image).
            [
                {"type": "divide", "divisor": 255},
                {"type": "reshape", "flatten": True}
            ],
            # Policy spec.
            dict(network_spec="../configs/test_lstm_nn.json", action_space=action_space),
            # Exploration spec.
            Exploration(epsilon_spec=dict(decay_spec=dict(
                type="linear_decay", from_=1.0, to_=0.1, start_timestep=0, num_timesteps=100
            )))
        )
        environment_stepper = EnvironmentStepper(
            environment_spec=env_spec,
            actor_component_spec=actor_component,
            state_space=state_space,
            reward_space="float32",
            internal_states_space=self.internal_states_space_test_lstm,
            num_steps=1000,
            # Add both prev-action and -reward into the state sent through the network.
            #add_previous_action_to_state=True,
            #add_previous_reward_to_state=True,
            add_action_probs=True,
            action_probs_space=FloatBox(shape=(9,), add_batch_rank=True)
        )

        test = ComponentTest(
            component=environment_stepper,
            action_space=action_space,
        )
        # Reset the stepper.
        test.test("reset")

        # Step n times through the Env and collect results.
        # 1st return value is the step-op (None), 2nd return value is the tuple of items (one entry per step),
        # with each step containing: terminals, (raw) next-states, etc.
        time_start = time.monotonic()
        steps = 10
        out = None
        for _ in range(steps):
            out = test.test("step")
        time_total = time.monotonic() - time_start
        print("Done running {}x{} steps in Deepmind Lab env using IMPALA network in {}sec. ({} actions/sec)".format(
            steps, environment_stepper.num_steps, time_total,
            environment_stepper.num_steps * steps / time_total))

        # Check types of outputs.
        self.assertTrue(out[0] is None)
        self.assertTrue(isinstance(out[1], DataOpTuple))  # the step results as a tuple (see below)

        # Check types of single data.
        #self.assertTrue(out[0].dtype == np.float32)
        #self.assertTrue(out[0].min() >= 0.0)  # make sure we have pixels / 255
        #self.assertTrue(out[0].max() <= 1.0)
        #self.assertTrue(out[1].dtype == np.int32)  # actions
        #self.assertTrue(out[2].dtype == np.float32)  # rewards
        #self.assertTrue(out[0].dtype == np.float32)  # episode return
        self.assertTrue(out[1][0].dtype == np.bool_)  # next-state is terminal?
        self.assertTrue(out[1][1].dtype == np.uint8)  # next state (raw, not preprocessed)
        self.assertTrue(out[1][1].min() >= 0)         # make sure we have pixels
        self.assertTrue(out[1][1].max() <= 255)

        # action probs (test whether they sum to one).
        #self.assertTrue(out[1][6].dtype == np.float32)
        #self.assertTrue(out[1][6].min() >= 0.0)
        #self.assertTrue(out[1][6].max() <= 1.0)
        #recursive_assert_almost_equal(out[1][6].sum(axis=-1, keepdims=False),
        #                              np.ones(shape=(environment_stepper.num_steps,)), decimals=4)

        # internal states (c- and h-state)
        self.assertTrue(out[3][0].dtype == np.float32)
        self.assertTrue(out[3][1].dtype == np.float32)
        self.assertTrue(out[3][0].shape == (environment_stepper.num_steps, 3))
        self.assertTrue(out[3][1].shape == (environment_stepper.num_steps, 3))

        # Check whether episode returns match single rewards (including terminal signals).
        #episode_returns = 0.0
        #for i in range(environment_stepper.num_steps):
        #    episode_returns += out[0][i]
        #    self.assertAlmostEqual(episode_returns, out[3][i])
        #    # Terminal: Reset for next step.
        #    if out[4][i] is np.bool_(True):
        #        episode_returns = 0.0

        test.terminate()
class TestPrioritizedReplay(unittest.TestCase): """ Tests sampling and insertion behaviour of the prioritized_replay module. """ record_space = Dict(states=dict(state1=float, state2=float), actions=dict(action1=float), reward=float, terminals=BoolBox(), add_batch_rank=True) memory_variables = ["size", "index", "max-priority"] capacity = 10 alpha = 1.0 beta = 1.0 max_priority = 1.0 input_spaces = dict( # insert: records records=record_space, # get_records: num_records num_records=int, # update_records: indices, update indices=IntBox(add_batch_rank=True), update=FloatBox(add_batch_rank=True)) def test_insert(self): """ Simply tests insert op without checking internal logic. """ memory = PrioritizedReplay(capacity=self.capacity, alpha=self.alpha, beta=self.beta) test = ComponentTest(component=memory, input_spaces=self.input_spaces) observation = self.record_space.sample(size=1) test.test(("insert_records", observation), expected_outputs=None) def test_capacity(self): """ Tests if insert correctly manages capacity. """ memory = PrioritizedReplay(capacity=self.capacity, alpha=self.alpha, beta=self.beta) test = ComponentTest(component=memory, input_spaces=self.input_spaces) # Internal state variables. memory_variables = memory.get_variables(self.memory_variables, global_scope=False) buffer_size = memory_variables['size'] buffer_index = memory_variables['index'] max_priority = memory_variables['max-priority'] size_value, index_value, max_priority_value = test.read_variable_values( buffer_size, buffer_index, max_priority) # Assert indices 0 before insert. self.assertEqual(size_value, 0) self.assertEqual(index_value, 0) self.assertEqual(max_priority_value, 1.0) # Insert one more element than capacity observation = self.record_space.sample(size=self.capacity + 1) test.test(("insert_records", observation), expected_outputs=None) size_value, index_value = test.read_variable_values( buffer_size, buffer_index) # Size should be equivalent to capacity when full. self.assertEqual(size_value, self.capacity) # Index should be one over capacity due to modulo. self.assertEqual(index_value, 1) def test_batch_retrieve(self): """ Tests if retrieval correctly manages capacity. """ memory = PrioritizedReplay(capacity=self.capacity, alpha=self.alpha, beta=self.beta) test = ComponentTest(component=memory, input_spaces=self.input_spaces) # Insert 2 Elements. observation = non_terminal_records(self.record_space, 2) test.test(("insert_records", observation), expected_outputs=None) # Assert we can now fetch 2 elements. num_records = 2 batch = test.test(("get_records", num_records), expected_outputs=None) records = batch[0] print('Result batch = {}'.format(records)) self.assertEqual(2, len(records['terminals'])) # We allow repeat indices in sampling. num_records = 5 batch = test.test(("get_records", num_records), expected_outputs=None) records = batch[0] self.assertEqual(5, len(records['terminals'])) # Now insert over capacity, note all elements here are non-terminal. observation = non_terminal_records(self.record_space, self.capacity) test.test(("insert_records", observation), expected_outputs=None) # Assert we can fetch exactly capacity elements. num_records = self.capacity batch = test.test(("get_records", num_records), expected_outputs=None) records = batch[0] self.assertEqual(self.capacity, len(records['terminals'])) def test_update_records(self): """ Tests update records logic. 
""" memory = PrioritizedReplay(capacity=self.capacity) test = ComponentTest(component=memory, input_spaces=self.input_spaces) # Insert a few Elements. observation = non_terminal_records(self.record_space, 2) test.test(("insert_records", observation), expected_outputs=None) # Fetch elements and their indices. num_records = 2 batch = test.test(("get_records", num_records), expected_outputs=None) indices = batch[1] self.assertEqual(num_records, len(indices)) # 0.3, 0.5, 1.0]) input_params = [indices, np.asarray([0.1, 0.2])] # Does not return anything test.test(("update_records", input_params), expected_outputs=None) def test_segment_tree_insert_values(self): """ Tests if segment tree inserts into correct positions. """ memory = PrioritizedReplay(capacity=self.capacity, alpha=self.alpha, beta=self.beta) test = ComponentTest(component=memory, input_spaces=self.input_spaces) priority_capacity = 1 while priority_capacity < self.capacity: priority_capacity *= 2 memory_variables = memory.get_variables( ["sum-segment-tree", "min-segment-tree"], global_scope=False) sum_segment_tree = memory_variables['sum-segment-tree'] min_segment_tree = memory_variables['min-segment-tree'] sum_segment_values, min_segment_values = test.read_variable_values( sum_segment_tree, min_segment_tree) self.assertEqual(sum(sum_segment_values), 0) self.assertEqual(sum(min_segment_values), float('inf')) self.assertEqual(len(sum_segment_values), 2 * priority_capacity) self.assertEqual(len(min_segment_values), 2 * priority_capacity) # Insert 1 Element. observation = non_terminal_records(self.record_space, 1) test.test(("insert_records", observation), expected_outputs=None) # Fetch segment tree. sum_segment_values, min_segment_values = test.read_variable_values( sum_segment_tree, min_segment_tree) # Check insert positions # Initial insert is at priority capacity print(sum_segment_values) print(min_segment_values) start = priority_capacity while start >= 1: self.assertEqual(sum_segment_values[start], 1.0) self.assertEqual(min_segment_values[start], 1.0) start = int(start / 2) # Insert another Element. observation = non_terminal_records(self.record_space, 1) test.test(("insert_records", observation), expected_outputs=None) # Fetch segment tree. sum_segment_values, min_segment_values = test.read_variable_values( sum_segment_tree, min_segment_tree) print(sum_segment_values) print(min_segment_values) # Index shifted 1 start = priority_capacity + 1 self.assertEqual(sum_segment_values[start], 1.0) self.assertEqual(min_segment_values[start], 1.0) start = int(start / 2) while start >= 1: # 1 + 1 is 2 on the segment. self.assertEqual(sum_segment_values[start], 2.0) # min is still 1. self.assertEqual(min_segment_values[start], 1.0) start = int(start / 2)
def __init__(self, world="4x4", save_mode=False, action_type="udlr", reward_function="sparse", state_representation="discrete"): """ Args: world (Union[str,List[str]]): Either a string to map into `MAPS` or a list of strings describing the rows of the world (e.g. ["S ", " G"] for a two-row/two-column world with start and goal state). save_mode (bool): Whether to replace holes (H) with walls (W). Default: False. action_type (str): Which action space to use. Chose between "udlr" (up, down, left, right), which is a discrete action space and "ftj" (forward + turn + jump), which is a container multi-discrete action space. "ftjb" is the same as "ftj", except that sub-action "jump" is a boolean. reward_function (str): One of sparse: hole=-5, fire=-3, goal=1, all other steps=-0.1 rich: hole=-100, fire=-10, goal=50, all other steps=-0.1 state_representation (str): - "discrete": An int representing the field on the grid, 0 meaning the upper left field, 1 the one below, etc.. - "xy": The x and y grid position tuple. - "xy+orientation": The x and y grid position tuple plus the orientation (if any) as tuple of 2 values of the actor. - "camera": A 3-channel image where each field in the grid-world is one pixel and the 3 channels are used to indicate different items in the scene (walls, holes, the actor, etc..). """ # Build our map. if isinstance(world, str): self.description = world world = self.MAPS[world] else: self.description = "custom-map" world = np.array(list(map(list, world))) # Apply safety switch. world[world == 'H'] = ("H" if not save_mode else "F") # `world` is a list of lists that needs to be indexed using y/x pairs (first row, then column). self.world = world self.n_row, self.n_col = self.world.shape (start_y, ), (start_x, ) = np.nonzero(self.world == "S") # Init pygame (if installed) for visualizations. if pygame is not None: self.pygame_field_size = 30 pygame.init() self.pygame_agent = pygame.image.load( os.path.join(os.path.dirname(os.path.abspath(__file__)), "images/agent.png")) # Create basic grid Surface for reusage. self.pygame_basic_surface = self.grid_to_surface() self.pygame_display_set = False # Figure out our state space. assert state_representation in [ "discrete", "xy", "xy+orientation", "camera" ] self.state_representation = state_representation # Discrete states (single int from 0 to n). if self.state_representation == "discrete": state_space = IntBox(self.n_row * self.n_col) # x/y position (2 ints). elif self.state_representation == "xy": state_space = IntBox(low=(0, 0), high=(self.n_col, self.n_row), shape=(2, )) # x/y position + orientation (3 ints). elif self.state_representation == "xy+orientation": state_space = IntBox(low=(0, 0, 0, 0), high=(self.n_col, self.n_row, 1, 1)) # Camera outputting a 2D color image of the world. else: state_space = IntBox(0, 255, shape=(self.n_row, self.n_col, 3)) self.default_start_pos = self.get_discrete_pos(start_x, start_y) self.discrete_pos = self.default_start_pos assert reward_function in ["sparse", "rich"] # TODO: "potential"-based reward self.reward_function = reward_function # Store the goal position for proximity calculations (for "potential" reward function). (self.goal_y, ), (self.goal_x, ) = np.nonzero(self.world == "G") # Specify the actual action spaces. self.action_type = action_type action_space = IntBox(4) if self.action_type == "udlr" else Dict( dict(forward=IntBox(3), turn=IntBox(3), jump=(IntBox(2) if self.action_type == "ftj" else BoolBox()))) # Call the super's constructor. 
super(GridWorld, self).__init__(state_space=state_space, action_space=action_space) # Reset ourselves. self.state = None self.orientation = None # int: 0, 90, 180, 270 self.camera_pixels = None # only used if state_representation == 'camera' self.reward = None self.is_terminal = None self.reset(randomize=False)
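# A short usage sketch for the constructor documented above: build the default 4x4 map
# and the two-row custom map from the docstring, then inspect the spaces the chosen
# options produce. (Illustrative only; it assumes GridWorld is importable from
# rlgraph.environments and exposes the state_space/action_space attributes set via the
# super constructor above.)
from rlgraph.environments import GridWorld

# Default 4x4 map, discrete states, up/down/left/right actions.
env = GridWorld(world="4x4", action_type="udlr", state_representation="discrete")
print(env.state_space)   # IntBox(16): one int per grid field
print(env.action_space)  # IntBox(4): up, down, left, right

# Custom 2x2 map from the docstring, container action space, camera states.
custom = GridWorld(world=["S ", " G"], action_type="ftj", state_representation="camera")
print(custom.state_space)   # IntBox(0, 255, shape=(2, 2, 3)): one RGB pixel per field
print(custom.action_space)  # Dict(forward=IntBox(3), turn=IntBox(3), jump=IntBox(2))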