def test_dddqn_n_step_memory_insertion_n_step_samples_only(self):
        """
        Tests the n-step post-processing and memory-insertions of DDDQN (with the n_step_only option set to True).
        """
        # Create an Env object.
        env = GridWorld("2x2", actors=1)
        # Create a very standard DDDQN.
        dqn_config = DDDQNConfig.make(
            "{}/../configs/dddqn_grid_world_2x2_learning.json".format(
                os.path.dirname(__file__)),
            n_step=2,  # fix n-step to 2, just in case.
            gamma=0.5,  # fix gamma for unique-memory-checks purposes
            epsilon=[1.0, 0.5],  # fix epsilon to get lots of random actions.
            preprocessor=Preprocessor(lambda inputs_: tf.one_hot(
                inputs_, depth=env.actors[0].state_space.num_categories)),
            state_space=env.actors[0].state_space,
            action_space=env.actors[0].action_space)
        algo = DDDQN(config=dqn_config, name="my-dddqn")

        # Point actor(s) to the algo.
        env.point_all_actors_to_algo(algo)

        # Run for n ticks, then check memory contents for correct n-step tuples.
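        # With n_step=2 and gamma=0.5, each stored tuple should hold the discounted 2-step
        # return r_t + 0.5 * r_(t+1) and the state reached two steps later (s_(t+2)).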
        for _ in range(5):
            env.run(ticks=100, sync=True, render=False)
            self._check_2x2_grid_world_mem(algo.memory, n_step_only=True)

        env.terminate()

    def test_dads_compilation(self):
        """
        Tests the c'tor of DADS.
        """
        env = GridWorld("4-room", actors=2)
        # Create a Config (for the 4-room GridWorld).
        config = DADSConfig.make(
            "{}/../configs/dads_grid_world_4room_learning.json".format(
                os.path.dirname(__file__)),
            state_space=env.actors[0].state_space,
            action_space=env.actors[0].action_space)
        dads = DADS(config, name="my-dads")
        print("DADS built ({}).".format(dads))

        env.terminate()

    def test_dddqn_learning_on_grid_world_2x2(self):
        # Create an Env object.
        env = GridWorld("2x2", actors=1)

        # Add the preprocessor.
        preprocessor = Preprocessor(
            lambda inputs_: tf.one_hot(inputs_, depth=env.actors[0].state_space.num_categories)
        )
        # Create a Config.
        dqn_config = DDDQNConfig.make(
            "{}/../configs/dddqn_grid_world_2x2_learning.json".format(os.path.dirname(__file__)),
            preprocessor=preprocessor,
            state_space=env.actors[0].state_space,
            action_space=env.actors[0].action_space
        )

        # Create an Algo object.
        algo = DDDQN(config=dqn_config, name="my-dddqn")

        # Point actor(s) to the algo.
        env.point_all_actors_to_algo(algo)

        # Run and wait for env to complete.
        env.run(ticks=3000, sync=True, render=debug.RenderEnvInLearningTests)

        # Check last n episode returns.
        n = 10
        mean_last_n = np.mean(env.historic_episodes_returns[-n:])
        print("Avg return over last {} episodes: {}".format(n, mean_last_n))
        self.assertTrue(mean_last_n >= 0.6)

        # Check learnt Q-function (using our dueling layer).
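        # The dueling layer presumably combines the advantage and value streams via the standard
        # aggregation Q(s,a) = V(s) + A(s,a) - mean_a[A(s,a)].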
        a_and_v = algo.Q(one_hot(np.array([0, 0, 0, 0, 1, 1, 1, 1]), depth=4))
        q = dueling(a_and_v, np.array([0, 1, 2, 3, 0, 1, 2, 3]))
        print(q)
        self.assertTrue(q[1] < min(q[2:]) and q[1] < q[0])  # q(s=0,a=right) is the worst
        check(q[5], 1.0, atol=0.4)  # Q(1,->) is close to 1.0.
        #self.assertTrue(q[5] > max(q[:4]) and q[5] > max(q[6:]))  # q(s=1,a=right) is the best
        #check(q, [0.8, -5.0, 0.9, 0.8, 0.8, 1.0, 0.9, 0.9], decimals=1)  # a=up,down,left,right

        env.terminate()

    def test_dqn2015_learning_on_4x4_grid_world_with_n_actors(self):
        # Create an Env object.
        env = GridWorld("4x4", actors=8)

        # Add the preprocessor.
        preprocessor = Preprocessor(lambda inputs_: tf.one_hot(
            inputs_, depth=env.actors[0].state_space.num_categories))

        # Create a Config.
        config = DQN2015Config.make(  # type: DQN2015Config
            "{}/../configs/dqn2015_grid_world_4x4_learning_n_actors.json".
            format(os.path.dirname(__file__)),
            preprocessor=preprocessor,
            state_space=env.actors[0].state_space,
            action_space=env.actors[0].action_space)

        # Create an Algo object.
        algo = DQN2015(config=config, name="my-dqn")

        # Point actor(s) to the algo.
        env.point_all_actors_to_algo(algo)

        # Run and wait for env to complete.
        env.run(ticks=4000, sync=True, render=debug.RenderEnvInLearningTests)

        # Check last n episode returns.
        n = 10
        mean_last_n = np.mean(env.historic_episodes_returns[-n:])
        print("Avg return over last {} episodes: {}".format(n, mean_last_n))
        self.assertTrue(mean_last_n >= -0.4)

        # Check learnt Q-function for states 0 and 1, action=down (should be larger than 0.0, ideally 0.5).
        action_values = algo.Q(preprocessor(np.array([0, 1])))
        self.assertTrue(action_values[0][2] >= 0.0)
        self.assertTrue(action_values[1][2] >= 0.0)

        env.terminate()

    def test_dads_learning_on_grid_world_4room(self):
        # Create an Env object.
        env = GridWorld("4-room")

        # Add the preprocessor.
        preprocessor = Preprocessor(
            lambda inputs_: tf.one_hot(inputs_, depth=env.actors[0].state_space.num_categories)
        )
        # Create a Config.
        config = DADSConfig.make(
            "{}/../configs/dads_grid_world_4room_learning.json".format(os.path.dirname(__file__)),
            preprocessor=preprocessor,
            state_space=env.actors[0].state_space,
            action_space=env.actors[0].action_space
        )

        # Create an Algo object.
        algo = DADS(config=config, name="my-dads")

        # Point actor(s) to the algo.
        env.point_all_actors_to_algo(algo)

        # Run and wait for env to complete.
        env.run(ticks=3000, sync=True, render=debug.RenderEnvInLearningTests)

        # Check last n episode returns.
        n = 10
        mean_last_n = np.mean(env.historic_episodes_returns[-n:])
        print("Avg return over last {} episodes: {}".format(n, mean_last_n))
        self.assertTrue(mean_last_n >= 0.3)

        # Check learnt Q-function.
        check(algo.q(
            np.array([[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0]])
        ), [[0.8, -5.0, 0.9, 0.8], [0.8, 1.0, 0.9, 0.9]], decimals=1)  # a=up,down,left,right

        env.terminate()

    def test_saving_then_loading_to_get_exact_same_algo(self):
        env = GridWorld("2x2", actors=1)
        state_space = env.actors[0].state_space.with_batch()
        action_space = env.actors[0].action_space.with_batch()

        # Create a very simple DQN2015.
        dqn = DQN2015(config=DQN2015Config.make(
            "{}/../configs/dqn2015_grid_world_2x2_learning.json".format(
                os.path.dirname(__file__)),
            preprocessor=lambda inputs_: tf.one_hot(
                inputs_, depth=state_space.num_categories),
            state_space=state_space,
            action_space=action_space),
                      name="my-dqn")

        # Point actor(s) to the algo.
        env.point_all_actors_to_algo(dqn)

        dqn.save("test.json")

        env.terminate()

    def test_sac_learning_on_grid_world_2x2(self):
        # Create an Env object.
        env = GridWorld("2x2", actors=1)

        # Add the preprocessor (not strictly necessary, as the NN would one-hot automatically, but faster since
        # states are then stored in memory already preprocessed and don't have to be preprocessed again for batch-updates).
        preprocessor = Preprocessor(lambda inputs_: tf.one_hot(
            inputs_, depth=env.actors[0].state_space.num_categories))

        # Create a Config.
        config = SACConfig.make(
            "{}/../configs/sac_grid_world_2x2_learning.json".format(
                os.path.dirname(__file__)),
            preprocessor=preprocessor,
            state_space=env.actors[0].state_space,
            action_space=env.actors[0].action_space,
            summaries=[
                "Ls_critic[0]", "L_actor", "L_alpha", "alpha",
                ("Q(0,^)",
                 "Q[0]({'s': np.array([[1., 0., 0., 0.]]), 'a': np.array([0])})"
                 ),
                ("Q(0,->)",
                 "Q[0]({'s': np.array([[1., 0., 0., 0.]]), 'a': np.array([1])})"
                 ),
                ("Q(0,v)",
                 "Q[0]({'s': np.array([[1., 0., 0., 0.]]), 'a': np.array([2])})"
                 ),
                ("Q(0,<-)",
                 "Q[0]({'s': np.array([[1., 0., 0., 0.]]), 'a': np.array([3])})"
                 ),
                ("Q(1,->)",
                 "Q[0]({'s': np.array([[0., 1., 0., 0.]]), 'a': np.array([1])})"
                 )
            ])

        # Create an Algo object.
        algo = SAC(config=config, name="my-sac")

        # Point actor(s) to the algo.
        env.point_all_actors_to_algo(algo)

        # Run and wait for env to complete.
        env.run(ticks=700, sync=True, render=debug.RenderEnvInLearningTests)

        # Check learnt Q-function.
        q = algo.Q[0](dict(s=one_hot(np.array([0, 0, 0, 0, 1, 1, 1, 1]),
                                     depth=4),
                           a=np.array([0, 1, 2, 3, 0, 1, 2, 3])))
        print(q)
        self.assertTrue(q[1] < min(q[2:])
                        and q[1] < q[0])  # q(s=0,a=right) is the worst
        check(q[5], 1.0, decimals=1)  # Q(1,->) is close to 1.0.
        #check(q, [0.8, -5.0, 0.9, 0.8, 0.8, 1.0, 0.9, 0.9], decimals=1)  # a=up,down,left,right

        # Check last n episode returns.
        n = 10
        mean_last_n = np.mean(env.historic_episodes_returns[-n:])
        print("Avg return over last {} episodes: {}".format(n, mean_last_n))
        self.assertTrue(mean_last_n >= 0.7)

        env.terminate()

    def test_dqn2015_functionality(self):
        # Fake q-net/qt-net used for this test.
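        # They re-implement the Q-/target-net forward pass (hidden layer + output layer) in plain
        # NumPy; `q` additionally selects the taken action's value via the one-hot mask.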
        def q(s, a):
            return np.sum(dense(dense(s, weights_q[0], weights_q[1]), weights_q[2], weights_q[3]) * one_hot(a, depth=4), axis=-1)

        def qt(s):
            return dense(dense(s, weights_qt[0], weights_qt[1]), weights_qt[2], weights_qt[3])

        env = GridWorld("2x2", actors=1)
        state_space = env.actors[0].state_space.with_batch()
        action_space = env.actors[0].action_space.with_batch()

        # Add the preprocessor.
        preprocessor = Preprocessor(
            lambda inputs_: tf.one_hot(inputs_, depth=state_space.num_categories)
        )
        preprocessed_space = preprocessor(state_space)

        # Add the Q-network.
        i = K.layers.Input(shape=preprocessed_space.shape, dtype=preprocessed_space.dtype)
        o = K.layers.Dense(2, activation="linear")(i)  # keep it very simple
        # o = K.layers.Dense(256)(o)
        q_network = K.Model(inputs=i, outputs=o)

        # Create a very simple DQN2015.
        dqn = DQN2015(config=DQN2015Config.make(
            "{}/../configs/dqn2015_grid_world_2x2_functionality.json".format(os.path.dirname(__file__)),
            preprocessor=preprocessor,
            q_network=q_network,
            state_space=state_space,
            action_space=action_space
        ), name="my-dqn")

        # Check slot of "x" in flattened mem.
        check(dqn.memory.next_record_setup["x"][1], [3])
        self.assertTrue(dqn.memory.batch_size is None)

        check(dqn.Q.get_weights(), dqn.Qt.get_weights())

        # Point actor(s) to the algo.
        env.point_all_actors_to_algo(dqn)

        # Set our weights fixed.
        weights = [
            np.array([[0.1, 0.1], [0.2, 0.2], [0.3, 0.3], [0.4, 0.4]]),  # hidden layer kernel
            np.array([0.0, 0.0]),  # hidden layer bias
            np.array([[-0.4, -0.3, -0.2, -0.1], [0.4, 0.3, 0.2, 0.1]]),  # output layer kernel
            np.array([0.1, 0.1, 1.0, 0.0])  # output layer bias
        ]
        dqn.Q.set_weights(weights)

        # Perform one step in the env.
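        # With the fixed weights above (and assuming linear activations throughout), the forward
        # pass for state 0 yields Q = [0.1, 0.1, 1.0, 0.0], so the greedy action is 2 (down).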
        expected_action = np.argmax(dqn.Q(dqn.Phi(env.state)), axis=-1)
        check(expected_action, 2)  # expect to go down
        env.run(ticks=1)  # ts=0 -> do nothing
        # Check action taken.
        check(dqn.a.value, expected_action)
        # Check state of the env after action taken.
        check(env.state[0], 1)
        check(env.reward[0], -0.1)
        check(env.terminal[0], False)
        # Check memory of dqn (after one time step, should still be empty).
        check(dqn.memory.size, 0)
        self.assertTrue(dqn.memory.batch_size is None)

        # Perform one step in the env.
        expected_action = np.argmax(dqn.Q(dqn.Phi(env.state)), axis=-1)
        check(expected_action, 2)  # expect to go down
        env.run(ticks=1)  # ts=1 -> no sync, no update
        # Check action taken.
        check(dqn.a.value, expected_action)
        # Check state of the env after action taken.
        check(env.state[0], 1)
        check(env.reward[0], -0.1)
        check(env.terminal[0], False)
        # Check memory of dqn.
        check(dqn.memory.size, 1)
        self.assertTrue(dqn.memory.batch_size == 1)  # batch_size is now established.
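        # The flat memory columns appear to be: actions, rewards, terminals and the (already
        # one-hot preprocessed) states, each with a capacity of 4 slots.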
        check(dqn.memory.memory, [
            np.array([2, 0, 0, 0]),
            np.array([-0.1, 0., 0., 0.]),
            np.array([False, False, False, False]),
            np.array([[1., 0., 0., 0.], [0., 0., 0., 0.], [0., 0., 0., 0.], [0., 0., 0., 0.]])
        ])
        # Check next states.
        check(dqn.memory.next_records, [[np.array([[0., 1., 0., 0.]])]])

        # Perform one step in the env.
        # Capture the current weights so we can verify their values after the upcoming update.
        weights_q_before_update = dqn.Q.get_weights()
        weights_q = copy.deepcopy(weights_q_before_update)
        weights_qt = dqn.Qt.get_weights()

        # Compute the expected action now (the action is picked *before* the update).
        expected_action = np.argmax(dqn.Q(dqn.Phi(np.array([1]))), axis=-1)

        env.run(ticks=1)  # ts=2 -> no sync, do update
        weights_q_after_update = dqn.Q.get_weights()
        check(dqn.a.value, expected_action)

        # Check new weight values after the update.
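        # The expected values are derived numerically: perturb each weight by +0.0001, recompute
        # the loss, take the finite-difference gradient and apply one plain SGD step
        # (w -= lr * dL/dw), then compare against the weights the algo actually produced.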
        loss = DQN2015Loss()(dqn.memory.last_records_pulled, q, qt, dqn.config)
        for i, matrix in enumerate(weights_q_before_update):
            for idx in np.ndindex(matrix.shape):
                weights_q = copy.deepcopy(weights_q_before_update)
                weights_q[i][idx] += 0.0001
                lossd = DQN2015Loss()(dqn.memory.last_records_pulled, q, qt, dqn.config)
                dL_over_dw = (lossd - loss) / 0.0001
                check(weights_q_after_update[i][idx], weights_q_before_update[i][idx] - dL_over_dw * dqn.optimizer.learning_rate(0.0), decimals=3)

        # Check state of the env after action taken.
        check(env.state[0], 1)
        check(env.reward[0], -0.1)
        check(env.terminal[0], False)
        # Check memory of dqn.
        check(dqn.memory.size, 2)
        check(dqn.memory.memory, [
            np.array([2, 2, 0, 0]),
            np.array([-0.1, -0.1, 0., 0.]),
            np.array([False, False, False, False]),
            np.array([[1., 0., 0., 0.], [0., 1., 0., 0.], [0., 0., 0., 0.], [0., 0., 0., 0.]])
        ])
        # Check next states.
        check(dqn.memory.next_records, [[np.array([[0., 1., 0., 0.]])]])

        env.terminate()