def test_dddqn_n_step_memory_insertion_n_step_samples_only(self):
    """
    Tests the n-step post-processing and memory-insertions of DDDQN
    (with the n_step_only option set to True).
    """
    # Create an Env object.
    env = GridWorld("2x2", actors=1)

    # Create a very standard DDDQN.
    dqn_config = DDDQNConfig.make(
        "{}/../configs/dddqn_grid_world_2x2_learning.json".format(
            os.path.dirname(__file__)),
        n_step=2,  # fix n-step to 2, just in case.
        gamma=0.5,  # fix gamma for unique-memory-checks purposes
        epsilon=[1.0, 0.5],  # fix epsilon to get lots of random actions.
        preprocessor=Preprocessor(lambda inputs_: tf.one_hot(
            inputs_, depth=env.actors[0].state_space.num_categories)),
        state_space=env.actors[0].state_space,
        action_space=env.actors[0].action_space)
    algo = DDDQN(config=dqn_config, name="my-dddqn")

    # Point actor(s) to the algo.
    env.point_all_actors_to_algo(algo)

    # Run for n ticks, then check memory contents for correct n-step tuples.
    for _ in range(5):
        env.run(ticks=100, sync=True, render=False)
        self._check_2x2_grid_world_mem(algo.memory, n_step_only=True)

    env.terminate()
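# The memory check above relies on how 2-step records are built before insertion. A minimal
# sketch of that post-processing (hypothetical helper, not the library's implementation),
# assuming gamma=0.5 as fixed in the test: with n_step_only=True, only such 2-step tuples
# (and no raw 1-step tuples) should end up in the memory.
def _two_step_record_sketch(r_t, r_tp1, s_tp2, gamma=0.5):
    # Discounted 2-step reward r_t + gamma * r_{t+1}, paired with s_{t+2} as the next state.
    return r_t + gamma * r_tp1, s_tp2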
def test_dads_compilation(self):
    """
    Tests the c'tor of DADS.
    """
    env = GridWorld("4-room", actors=2)

    # Create a Config.
    config = DADSConfig.make(
        "{}/../configs/dads_grid_world_4room_learning.json".format(
            os.path.dirname(__file__)),
        state_space=env.actors[0].state_space,
        action_space=env.actors[0].action_space)
    dads = DADS(config, name="my-dads")
    print("DADS built ({}).".format(dads))

    env.terminate()
def test_dddqn_learning_on_grid_world_2x2(self):
    # Create an Env object.
    env = GridWorld("2x2", actors=1)

    # Add the preprocessor.
    preprocessor = Preprocessor(
        lambda inputs_: tf.one_hot(inputs_, depth=env.actors[0].state_space.num_categories)
    )
    # Create a Config.
    dqn_config = DDDQNConfig.make(
        "{}/../configs/dddqn_grid_world_2x2_learning.json".format(os.path.dirname(__file__)),
        preprocessor=preprocessor,
        state_space=env.actors[0].state_space,
        action_space=env.actors[0].action_space
    )
    # Create an Algo object.
    algo = DDDQN(config=dqn_config, name="my-dddqn")

    # Point actor(s) to the algo.
    env.point_all_actors_to_algo(algo)

    # Run and wait for env to complete.
    env.run(ticks=3000, sync=True, render=debug.RenderEnvInLearningTests)

    # Check last n episode returns.
    n = 10
    mean_last_n = np.mean(env.historic_episodes_returns[-n:])
    print("Avg return over last {} episodes: {}".format(n, mean_last_n))
    self.assertTrue(mean_last_n >= 0.6)

    # Check learnt Q-function (using our dueling layer).
    a_and_v = algo.Q(one_hot(np.array([0, 0, 0, 0, 1, 1, 1, 1]), depth=4))
    q = dueling(a_and_v, np.array([0, 1, 2, 3, 0, 1, 2, 3]))
    print(q)
    self.assertTrue(q[1] < min(q[2:]) and q[1] < q[0])  # q(s=0,a=right) is the worst
    check(q[5], 1.0, atol=0.4)  # Q(1,->) is close to 1.0.
    # self.assertTrue(q[5] > max(q[:4]) and q[5] > max(q[6:]))  # q(s=1,a=right) is the best
    # check(q, [0.8, -5.0, 0.9, 0.8, 0.8, 1.0, 0.9, 0.9], decimals=1)  # a=up,down,left,right

    env.terminate()
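# The `dueling` call above combines the network's advantage and value outputs into Q-values.
# A minimal sketch of the standard aggregation Q(s,a) = V(s) + A(s,a) - mean_a' A(s,a')
# (hypothetical helper; the column layout of `a_and_v` is an assumption, not the library's API):
def _dueling_sketch(a_and_v, actions):
    advantages, v = a_and_v[:, :-1], a_and_v[:, -1]  # assume: advantages first, state-value last
    q_all = v[:, None] + advantages - np.mean(advantages, axis=-1, keepdims=True)
    return q_all[np.arange(len(actions)), actions]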
def test_dqn2015_learning_on_4x4_grid_world_with_n_actors(self):
    # Create an Env object.
    env = GridWorld("4x4", actors=8)

    # Add the preprocessor.
    preprocessor = Preprocessor(lambda inputs_: tf.one_hot(
        inputs_, depth=env.actors[0].state_space.num_categories))
    # Create a Config.
    config = DQN2015Config.make(  # type: DQN2015Config
        "{}/../configs/dqn2015_grid_world_4x4_learning_n_actors.json".format(
            os.path.dirname(__file__)),
        preprocessor=preprocessor,
        state_space=env.actors[0].state_space,
        action_space=env.actors[0].action_space)
    # Create an Algo object.
    algo = DQN2015(config=config, name="my-dqn")

    # Point actor(s) to the algo.
    env.point_all_actors_to_algo(algo)

    # Run and wait for env to complete.
    env.run(ticks=4000, sync=True, render=debug.RenderEnvInLearningTests)

    # Check last n episode returns.
    n = 10
    mean_last_n = np.mean(env.historic_episodes_returns[-n:])
    print("Avg return over last {} episodes: {}".format(n, mean_last_n))
    self.assertTrue(mean_last_n >= -0.4)

    # Check learnt Q-function for states 0 and 1, action=down (should be larger than 0.0, ideally 0.5).
    action_values = algo.Q(preprocessor(np.array([0, 1])))
    self.assertTrue(action_values[0][2] >= 0.0)
    self.assertTrue(action_values[1][2] >= 0.0)

    env.terminate()
def test_dads_learning_on_grid_world_4room(self):
    # Create an Env object.
    env = GridWorld("4-room")

    # Add the preprocessor.
    preprocessor = Preprocessor(
        lambda inputs_: tf.one_hot(inputs_, depth=env.actors[0].state_space.num_categories)
    )
    # Create a Config.
    config = DADSConfig.make(
        "{}/../configs/dads_grid_world_4room_learning.json".format(os.path.dirname(__file__)),
        preprocessor=preprocessor,
        state_space=env.actors[0].state_space,
        action_space=env.actors[0].action_space
    )
    # Create an Algo object.
    algo = DADS(config=config, name="my-dads")

    # Point actor(s) to the algo.
    env.point_all_actors_to_algo(algo)

    # Run and wait for env to complete.
    env.run(ticks=3000, sync=True, render=debug.RenderEnvInLearningTests)

    # Check last n episode returns.
    n = 10
    mean_last_n = np.mean(env.historic_episodes_returns[-n:])
    print("Avg return over last {} episodes: {}".format(n, mean_last_n))
    self.assertTrue(mean_last_n >= 0.3)

    # Check learnt Q-function.
    check(algo.q(
        np.array([[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0]])
    ), [[0.8, -5.0, 0.9, 0.8], [0.8, 1.0, 0.9, 0.9]], decimals=1)  # a=up,down,left,right

    env.terminate()
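# DADS trains its policy on an intrinsic reward derived from a learned skill-dynamics model
# q(s'|s, z). A minimal sketch of that reward as given in the DADS paper (names, signature and
# the `log_q`/`prior_skills` arguments are assumptions, not this library's API):
def _dads_intrinsic_reward_sketch(log_q, s, s_next, z, prior_skills):
    # log q(s'|s,z) - log( (1/L) * sum_i q(s'|s,z_i) ), with z_i drawn from the skill prior.
    log_q_skill = log_q(s, s_next, z)
    log_q_prior = np.log(np.mean([np.exp(log_q(s, s_next, z_i)) for z_i in prior_skills]))
    return log_q_skill - log_q_prior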
def test_saving_then_loading_to_get_exact_same_algo(self):
    env = GridWorld("2x2", actors=1)
    state_space = env.actors[0].state_space.with_batch()
    action_space = env.actors[0].action_space.with_batch()

    # Create a very simple DQN2015.
    dqn = DQN2015(config=DQN2015Config.make(
        "{}/../configs/dqn2015_grid_world_2x2_learning.json".format(
            os.path.dirname(__file__)),
        preprocessor=lambda inputs_: tf.one_hot(
            inputs_, depth=state_space.num_categories),
        state_space=state_space,
        action_space=action_space), name="my-dqn")

    # Point actor(s) to the algo.
    env.point_all_actors_to_algo(dqn)

    dqn.save("test.json")

    env.terminate()
def test_sac_learning_on_grid_world_2x2(self):
    # Create an Env object.
    env = GridWorld("2x2", actors=1)

    # Add the preprocessor (not strictly necessary, as the NN would one-hot automatically, but
    # faster, as states are then stored in memory already preprocessed and won't have to be
    # preprocessed again for batch-updates).
    preprocessor = Preprocessor(lambda inputs_: tf.one_hot(
        inputs_, depth=env.actors[0].state_space.num_categories))
    # Create a Config.
    config = SACConfig.make(
        "{}/../configs/sac_grid_world_2x2_learning.json".format(
            os.path.dirname(__file__)),
        preprocessor=preprocessor,
        state_space=env.actors[0].state_space,
        action_space=env.actors[0].action_space,
        summaries=[
            "Ls_critic[0]", "L_actor", "L_alpha", "alpha",
            ("Q(0,^)", "Q[0]({'s': np.array([[1., 0., 0., 0.]]), 'a': np.array([0])})"),
            ("Q(0,->)", "Q[0]({'s': np.array([[1., 0., 0., 0.]]), 'a': np.array([1])})"),
            ("Q(0,v)", "Q[0]({'s': np.array([[1., 0., 0., 0.]]), 'a': np.array([2])})"),
            ("Q(0,<-)", "Q[0]({'s': np.array([[1., 0., 0., 0.]]), 'a': np.array([3])})"),
            ("Q(1,->)", "Q[0]({'s': np.array([[0., 1., 0., 0.]]), 'a': np.array([1])})")
        ])
    # Create an Algo object.
    algo = SAC(config=config, name="my-sac")

    # Point actor(s) to the algo.
    env.point_all_actors_to_algo(algo)

    # Run and wait for env to complete.
    env.run(ticks=700, sync=True, render=debug.RenderEnvInLearningTests)

    # Check learnt Q-function.
    q = algo.Q[0](dict(
        s=one_hot(np.array([0, 0, 0, 0, 1, 1, 1, 1]), depth=4),
        a=np.array([0, 1, 2, 3, 0, 1, 2, 3])))
    print(q)
    self.assertTrue(q[1] < min(q[2:]) and q[1] < q[0])  # q(s=0,a=right) is the worst
    check(q[5], 1.0, decimals=1)  # Q(1,->) is close to 1.0.
    # check(q, [0.8, -5.0, 0.9, 0.8, 0.8, 1.0, 0.9, 0.9], decimals=1)  # a=up,down,left,right

    # Check last n episode returns.
    n = 10
    mean_last_n = np.mean(env.historic_episodes_returns[-n:])
    print("Avg return over last {} episodes: {}".format(n, mean_last_n))
    self.assertTrue(mean_last_n >= 0.7)

    env.terminate()
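# The Q-values checked above are trained towards SAC's entropy-regularized (soft) target.
# A minimal sketch of that target for a single transition (standard SAC formulation; the
# argument names are assumptions, not this library's API):
def _sac_soft_target_sketch(r, terminal, q_targets_s_next_a_next, log_pi_a_next, alpha, gamma):
    # y = r + gamma * (min_j Qt_j(s', a') - alpha * log pi(a'|s')), with a' sampled from pi(.|s').
    soft_value = min(q_targets_s_next_a_next) - alpha * log_pi_a_next
    return r + gamma * (1.0 - float(terminal)) * soft_value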
def test_dqn2015_functionality(self):
    # Fake q-net/qt-net used for this test.
    def q(s, a):
        return np.sum(dense(dense(s, weights_q[0], weights_q[1]), weights_q[2], weights_q[3]) *
                      one_hot(a, depth=4), axis=-1)

    def qt(s):
        return dense(dense(s, weights_qt[0], weights_qt[1]), weights_qt[2], weights_qt[3])

    env = GridWorld("2x2", actors=1)
    state_space = env.actors[0].state_space.with_batch()
    action_space = env.actors[0].action_space.with_batch()

    # Add the preprocessor.
    preprocessor = Preprocessor(
        lambda inputs_: tf.one_hot(inputs_, depth=state_space.num_categories)
    )
    preprocessed_space = preprocessor(state_space)

    # Add the Q-network.
    i = K.layers.Input(shape=preprocessed_space.shape, dtype=preprocessed_space.dtype)
    o = K.layers.Dense(2, activation="linear")(i)  # keep it very simple
    # o = K.layers.Dense(256)(o)
    q_network = K.Model(inputs=i, outputs=o)

    # Create a very simple DQN2015.
    dqn = DQN2015(config=DQN2015Config.make(
        "{}/../configs/dqn2015_grid_world_2x2_functionality.json".format(os.path.dirname(__file__)),
        preprocessor=preprocessor,
        q_network=q_network,
        state_space=state_space,
        action_space=action_space
    ), name="my-dqn")

    # Check slot of "x" in flattened mem.
    check(dqn.memory.next_record_setup["x"][1], [3])
    self.assertTrue(dqn.memory.batch_size is None)
    check(dqn.Q.get_weights(), dqn.Qt.get_weights())

    # Point actor(s) to the algo.
    env.point_all_actors_to_algo(dqn)

    # Set our weights fixed.
    weights = [
        np.array([[0.1, 0.1], [0.2, 0.2], [0.3, 0.3], [0.4, 0.4]]),  # hidden layer kernel
        np.array([0.0, 0.0]),  # hidden layer bias
        np.array([[-0.4, -0.3, -0.2, -0.1], [0.4, 0.3, 0.2, 0.1]]),  # output layer kernel
        np.array([0.1, 0.1, 1.0, 0.0])  # output layer bias
    ]
    dqn.Q.set_weights(weights)

    # Perform one step in the env.
    expected_action = np.argmax(dqn.Q(dqn.Phi(env.state)), axis=-1)
    check(expected_action, 2)  # expect to go down
    env.run(ticks=1)  # ts=0 -> do nothing
    # Check action taken.
    check(dqn.a.value, expected_action)
    # Check state of the env after action taken.
    check(env.state[0], 1)
    check(env.reward[0], -0.1)
    check(env.terminal[0], False)
    # Check memory of dqn (after one time step, should still be empty).
    check(dqn.memory.size, 0)
    self.assertTrue(dqn.memory.batch_size is None)

    # Perform one step in the env.
    expected_action = np.argmax(dqn.Q(dqn.Phi(env.state)), axis=-1)
    check(expected_action, 2)  # expect to go down
    env.run(ticks=1)  # ts=1 -> no sync, no update
    # Check action taken.
    check(dqn.a.value, expected_action)
    # Check state of the env after action taken.
    check(env.state[0], 1)
    check(env.reward[0], -0.1)
    check(env.terminal[0], False)
    # Check memory of dqn.
    check(dqn.memory.size, 1)
    self.assertTrue(dqn.memory.batch_size == 1)  # batch_size is now established.
    check(dqn.memory.memory, [
        np.array([2, 0, 0, 0]),
        np.array([-0.1, 0., 0., 0.]),
        np.array([False, False, False, False]),
        np.array([[1., 0., 0., 0.], [0., 0., 0., 0.], [0., 0., 0., 0.], [0., 0., 0., 0.]])
    ])
    # Check next states.
    check(dqn.memory.next_records, [[np.array([[0., 1., 0., 0.]])]])

    # Perform one step in the env.
    # What are the weights after the update?
    weights_q_before_update = dqn.Q.get_weights()
    weights_q = copy.deepcopy(weights_q_before_update)
    weights_qt = dqn.Qt.get_weights()
    # Check action taken (action is picked before(!) the update).
    expected_action = np.argmax(dqn.Q(dqn.Phi(np.array([1]))), axis=-1)
    env.run(ticks=1)  # ts=2 -> no sync, do update
    weights_q_after_update = dqn.Q.get_weights()
    check(dqn.a.value, expected_action)

    # Check new weight values after the update.
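    # The loop below verifies the optimizer step numerically: it perturbs each weight w by
    # eps=0.0001, re-evaluates the loss, approximates the gradient as
    # dL/dw ~= (L(w + eps) - L(w)) / eps, and expects a plain SGD update, i.e.
    # w_after ~= w_before - learning_rate * dL/dw.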
    loss = DQN2015Loss()(dqn.memory.last_records_pulled, q, qt, dqn.config)
    for i, matrix in enumerate(weights_q_before_update):
        for idx in np.ndindex(matrix.shape):
            weights_q = copy.deepcopy(weights_q_before_update)
            weights_q[i][idx] += 0.0001
            lossd = DQN2015Loss()(dqn.memory.last_records_pulled, q, qt, dqn.config)
            dL_over_dw = (lossd - loss) / 0.0001
            check(weights_q_after_update[i][idx],
                  weights_q_before_update[i][idx] - dL_over_dw * dqn.optimizer.learning_rate(0.0),
                  decimals=3)

    # Check state of the env after action taken.
    check(env.state[0], 1)
    check(env.reward[0], -0.1)
    check(env.terminal[0], False)
    # Check memory of dqn.
    check(dqn.memory.size, 2)
    check(dqn.memory.memory, [
        np.array([2, 2, 0, 0]),
        np.array([-0.1, -0.1, 0., 0.]),
        np.array([False, False, False, False]),
        np.array([[1., 0., 0., 0.], [0., 1., 0., 0.], [0., 0., 0., 0.], [0., 0., 0., 0.]])
    ])
    # Check next states.
    check(dqn.memory.next_records, [[np.array([[0., 1., 0., 0.]])]])

    env.terminate()
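# The DQN2015Loss used above is built on the classic (Mnih et al., 2015) TD target. A minimal
# sketch of that target (hypothetical helper; signature and names are assumptions, not this
# library's API), with `qt` returning the target-network's Q-values for a batch of next states:
def _dqn2015_td_targets_sketch(r, terminal, s_next, qt, gamma):
    # y = r + gamma * max_a' Qt(s', a'); terminal transitions bootstrap with 0.
    return r + gamma * np.max(qt(s_next), axis=-1) * (1.0 - terminal.astype(np.float32))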