Example #1
    def test_sac_2x2_grid_world_with_container_actions(self):
        """
        Creates a SAC agent and runs it via a Runner on a simple 2x2 GridWorld using container actions.
        """
        # ftj = forward + turn + jump
        env_spec = dict(world="2x2", action_type="ftj", state_representation="xy+orientation")
        dummy_env = GridWorld.from_spec(env_spec)
        agent_config = config_from_path("configs/sac_agent_for_2x2_gridworld_with_container_actions.json")
        preprocessing_spec = agent_config.pop("preprocessing_spec")

        agent = SACAgent.from_spec(
            agent_config,
            state_space=FloatBox(shape=(4,)),
            action_space=dummy_env.action_space,
        )

        time_steps = 10000
        worker = SingleThreadedWorker(
            env_spec=lambda: GridWorld.from_spec(env_spec),
            agent=agent,
            preprocessing_spec=preprocessing_spec,
            worker_executes_preprocessing=False,
            render=False
        )
        results = worker.execute_timesteps(time_steps, use_exploration=True)
        print(results)
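
    def example_assert_basic_worker_results(self, results, time_steps):
        """
        Hedged sketch (not part of the original suite): the SAC test above only prints its
        results. Assuming the results dict exposes the same keys as the other worker tests
        in this file, basic sanity checks could be factored out like this.
        """
        self.assertEqual(results["timesteps_executed"], time_steps)
        self.assertEqual(results["env_frames"], time_steps)
        self.assertLessEqual(results["episodes_executed"], time_steps)
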
    def test_double_dqn_on_2x2_grid_world(self):
        """
        Creates a double DQNAgent and runs it via a Runner on a simple 2x2 GridWorld.
        """
        env_spec = dict(world="2x2")
        dummy_env = GridWorld.from_spec(env_spec)
        agent_config = config_from_path(
            "configs/dqn_agent_for_2x2_gridworld.json")
        preprocessing_spec = agent_config.pop("preprocessing_spec")
        agent = DQNAgent.from_spec(
            agent_config,
            dueling_q=False,
            state_space=self.grid_world_2x2_flattened_state_space,
            action_space=dummy_env.action_space,
            execution_spec=dict(seed=10),
            update_spec=dict(update_interval=4,
                             batch_size=24,
                             sync_interval=32),
            optimizer_spec=dict(type="adam", learning_rate=0.05),
            store_last_q_table=True)

        time_steps = 1000
        worker = SingleThreadedWorker(
            env_spec=lambda: GridWorld.from_spec(env_spec),
            agent=agent,
            preprocessing_spec=preprocessing_spec,
            worker_executes_preprocessing=True)
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        print("STATES:\n{}".format(agent.last_q_table["states"]))
        print("\n\nQ(s,a)-VALUES:\n{}".format(
            np.round_(agent.last_q_table["q_values"], decimals=2)))

        self.assertEqual(results["timesteps_executed"], time_steps)
        self.assertEqual(results["env_frames"], time_steps)
        self.assertGreaterEqual(results["mean_episode_reward"], -4.5)
        self.assertGreaterEqual(results["max_episode_reward"], 0.0)
        self.assertLessEqual(results["episodes_executed"], 350)

        # Check q-table for correct values.
        expected_q_values_per_state = {
            (1.0, 0, 0, 0): (-1, -5, 0, -1),
            (0, 1.0, 0, 0): (-1, 1, 0, 0)
        }
        for state, q_values in zip(agent.last_q_table["states"],
                                   agent.last_q_table["q_values"]):
            state, q_values = tuple(state), tuple(q_values)
            assert state in expected_q_values_per_state, \
                "ERROR: state '{}' not expected in q-table as it's a terminal state!".format(state)
            recursive_assert_almost_equal(q_values,
                                          expected_q_values_per_state[state],
                                          decimals=0)
    def test_2x2_grid_world_using_flow_methods(self):
        """
        Tests a minimalistic 2x2 GridWorld using the flow-style step method (step_flow).
        """
        env = GridWorld(world="2x2")

        # Simple test runs with fixed actions.
        # X=player's position
        s, r, t = env.step_flow(2)  # down: [" H", "XG"]
        self.assertTrue(s == 1)
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)
        s, r, t = env.step_flow(1)  # right: [" H", " X"]
        self.assertTrue(s == 0)
        self.assertTrue(r == 1.0)
        self.assertTrue(t)

        s, r, t = env.step_flow(1)  # right: [" X", " G"] -> in the hole
        self.assertTrue(s == 0)
        self.assertTrue(r == -5.0)
        self.assertTrue(t)

        # Run against a wall.
        s, r, t = env.step_flow(3)  # left: ["XH", " G"]
        self.assertTrue(s == 0)
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)
        s, r, t = env.step_flow(2)  # down: [" H", "XG"]
        self.assertTrue(s == 1)
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)
        s, r, t = env.step_flow(0)  # up: ["XH", " G"]
        self.assertTrue(s == 0)
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)
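
    def example_step_flow_rollout(self, num_steps=20):
        """
        Hedged sketch (not part of the original suite): unlike step(), step_flow() returns
        only (state, reward, terminal) and, as the test above shows, already continues from
        a fresh reset after a terminal step, so no explicit reset() is needed in the loop.
        Actions 0-3 are assumed to mean up/right/down/left as in the tests here.
        """
        env = GridWorld(world="2x2")
        for _ in range(num_steps):
            s, r, t = env.step_flow(np.random.randint(0, 4))
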
    def test_ppo_agent_faulty_op_visualization(self):
        """
        Creates a PPOAgent with a badly connected network and visualizes the root component.
        """
        agent_config = config_from_path(
            "configs/ppo_agent_for_2x2_gridworld.json")
        # Sabotage the NN.
        agent_config["network_spec"] = [{
            "type": "dense",
            "units": 10
        }, {
            "type": "embedding",
            "embed_dim": 3,
            "vocab_size": 4
        }]
        env = GridWorld(world="2x2")
        # Build Agent and hence trigger the Space error.
        try:
            ppo_agent = PPOAgent.from_spec(
                agent_config,
                state_space=GridWorld.grid_world_2x2_flattened_state_space,
                action_space=env.action_space)
        except RLGraphSpaceError as e:
            print("Seeing expected RLGraphSpaceError ({}). Test ok.".format(e))
        else:
            raise RLGraphError(
                "Not seeing expected RLGraphSpaceError with faulty input Space to embed layer of PPO!"
            )
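
    def example_working_network_spec(self):
        """
        Hedged sketch (not part of the original suite): a network_spec that would avoid the
        Space error provoked above, by keeping every layer dense so the float state vector
        is never fed into an embedding lookup. Layer keys follow the dense layer shown in
        the sabotaged spec.
        """
        return [
            {"type": "dense", "units": 10},
            {"type": "dense", "units": 10}
        ]
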
    def test_double_dqn_on_2x2_grid_world_single_action_to_container(self):
        """
        Tests how DQN solves a mapping of a single integer to multiple actions (as opposed to using
        container actions).
        """
        # ftj = forward + turn + jump
        env_spec = dict(world="2x2",
                        action_type="ftj",
                        state_representation="xy+orientation")
        agent_config = config_from_path(
            "configs/dqn_agent_for_2x2_gridworld_single_to_container.json")
        preprocessing_spec = agent_config.pop("preprocessing_spec")

        action_space = IntBox(0, 18)
        agent = DQNAgent.from_spec(agent_config,
                                   huber_loss=True,
                                   double_q=True,
                                   dueling_q=True,
                                   state_space=FloatBox(shape=(4, )),
                                   action_space=action_space,
                                   store_last_q_table=True)

        time_steps = 10000
        worker = SingleThreadedWorker(
            env_spec=lambda: GridWorld.from_spec(env_spec),
            agent=agent,
            preprocessing_spec=preprocessing_spec,
            worker_executes_preprocessing=True,
            render=False)
        results = worker.execute_timesteps(time_steps, use_exploration=True)
        print(results)
Example #6
    def test_ppo_on_2x2_grid_world(self):
        """
        Creates a PPO Agent and runs it via a Runner on the 2x2 Grid World Env.
        """
        env = GridWorld(world="2x2")
        agent = PPOAgent.from_spec(
            config_from_path("configs/ppo_agent_for_2x2_gridworld.json"),
            state_space=GridWorld.grid_world_2x2_flattened_state_space,
            action_space=env.action_space,
            execution_spec=dict(seed=15),
        )

        time_steps = 3000
        worker = SingleThreadedWorker(
            env_spec=lambda: env,
            agent=agent,
            worker_executes_preprocessing=True,
            preprocessing_spec=GridWorld.grid_world_2x2_preprocessing_spec
        )
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        print(results)

        # Assume we have learned something.
        self.assertGreater(results["mean_episode_reward"], -0.2)
Example #7
    def test_multi_gpu_ppo_agent_learning_test_gridworld_2x2(self):
        """
        Tests if the multi gpu strategy can learn successfully on a multi gpu system, but
        also runs on a CPU-only system using fake-GPU logic for testing purposes.
        """
        env_spec = dict(type="grid-world", world="2x2")
        dummy_env = GridWorld.from_spec(env_spec)
        agent_config = config_from_path(
            "configs/multi_gpu_ppo_for_2x2_gridworld.json")
        preprocessing_spec = agent_config.pop("preprocessing_spec")
        agent = PPOAgent.from_spec(
            agent_config,
            state_space=self.grid_world_2x2_flattened_state_space,
            action_space=dummy_env.action_space,
        )

        time_steps = 10000
        worker = SingleThreadedWorker(env_spec=env_spec,
                                      agent=agent,
                                      worker_executes_preprocessing=True,
                                      preprocessing_spec=preprocessing_spec)
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        # Assume we have learned something.
        # TODO: This test needs more tuning. -1.0 is not great for the 2x2 grid world.
        self.assertGreater(results["mean_episode_reward"], -1.0)
    def test_ppo_agent_visualization(self):
        """
        Creates a PPOAgent and visualizes meta-graph (no APIs) and the NN-component.
        """
        env = GridWorld(world="2x2")
        env.render()
        ppo_agent = PPOAgent.from_spec(
            config_from_path("configs/ppo_agent_for_2x2_gridworld.json"),
            state_space=GridWorld.grid_world_2x2_flattened_state_space,
            action_space=env.action_space)

        # Test graphviz component-graph drawing.
        draw_meta_graph(ppo_agent.root_component,
                        output=rlgraph_dir + "/ppo.gv",
                        apis=False,
                        graph_fns=False)
        self.assertTrue(os.path.isfile(rlgraph_dir + "/ppo.gv"))
        # Test graphviz component-graph w/ API drawing (only the Policy component).
        draw_meta_graph(ppo_agent.policy.neural_network,
                        output=rlgraph_dir + "/ppo_nn.gv",
                        apis=True)
        self.assertTrue(os.path.isfile(rlgraph_dir + "/ppo_nn.gv"))
Example #9
    def test_impala_single_agent_compilation(self):
        """
        Tests IMPALA agent compilation (single-node mode).
        """
        env = GridWorld("2x2")
        agent = IMPALAAgent.from_spec(
            config_from_path("configs/impala_agent_for_2x2_gridworld.json"),
            state_space=env.state_space,
            action_space=env.action_space,
            update_spec=dict(batch_size=16),
            optimizer_spec=dict(type="adam", learning_rate=0.05))
        agent.terminate()
        print("Compiled IMPALA type=single agent.")
    def test_ppo_on_2x2_grid_world_with_container_actions(self):
        """
        Creates a PPO agent and runs it via a Runner on a simple 2x2 GridWorld using container actions.
        """
        # -----
        # |^|H|
        # -----
        # | |G|  ^=start, looking up
        # -----

        # ftj = forward + turn + jump
        env_spec = dict(world="2x2",
                        action_type="ftj",
                        state_representation="xy+orientation")
        dummy_env = GridWorld.from_spec(env_spec)
        agent_config = config_from_path(
            "configs/ppo_agent_for_2x2_gridworld_with_container_actions.json")
        preprocessing_spec = agent_config.pop("preprocessing_spec")

        agent = PPOAgent.from_spec(agent_config,
                                   state_space=FloatBox(shape=(4, )),
                                   action_space=dummy_env.action_space)

        time_steps = 5000
        worker = SingleThreadedWorker(
            env_spec=lambda: GridWorld.from_spec(env_spec),
            agent=agent,
            preprocessing_spec=preprocessing_spec,
            worker_executes_preprocessing=True,
            render=False)
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        print(results)

        self.assertEqual(results["timesteps_executed"], time_steps)
        self.assertEqual(results["env_frames"], time_steps)
        self.assertLessEqual(results["episodes_executed"], time_steps)
        # Assume we have learned something.
        self.assertGreaterEqual(results["mean_episode_reward"], -2.0)
Example #11
    def test_multi_gpu_dqn_agent_learning_test_gridworld_2x2(self):
        """
        Tests if the multi gpu strategy can learn successfully on a multi gpu system, but
        also runs on a CPU-only system using fake-GPU logic for testing purposes.
        """
        env_spec = dict(type="grid-world", world="2x2")
        dummy_env = GridWorld.from_spec(env_spec)
        agent_config = config_from_path(
            "configs/multi_gpu_dqn_for_2x2_gridworld.json")
        preprocessing_spec = agent_config.pop("preprocessing_spec")
        agent = DQNAgent.from_spec(
            agent_config,
            state_space=self.grid_world_2x2_flattened_state_space,
            action_space=dummy_env.action_space,
        )

        time_steps = 1000
        worker = SingleThreadedWorker(env_spec=env_spec,
                                      agent=agent,
                                      worker_executes_preprocessing=True,
                                      preprocessing_spec=preprocessing_spec)
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        # Merge q-tables of all four GPUs:
        agent.last_q_table["q_values"] = agent.last_q_table[
            "q_values"].reshape((48, 4))

        print("STATES:\n{}".format(agent.last_q_table["states"]))
        print("\n\nQ(s,a)-VALUES:\n{}".format(
            np.round_(agent.last_q_table["q_values"], decimals=2)))

        self.assertEqual(results["timesteps_executed"], time_steps)
        self.assertEqual(results["env_frames"], time_steps)
        self.assertGreaterEqual(results["mean_episode_reward"], -4.5)
        self.assertGreaterEqual(results["max_episode_reward"], 0.0)
        self.assertLessEqual(results["episodes_executed"], time_steps / 2)

        # Check q-table for correct values.
        expected_q_values_per_state = {
            (1.0, 0, 0, 0): (-1, -5, 0, -1),
            (0, 1.0, 0, 0): (-1, 1, 0, 0)
        }
        for state, q_values in zip(agent.last_q_table["states"],
                                   agent.last_q_table["q_values"]):
            state, q_values = tuple(state), tuple(q_values)
            assert state in expected_q_values_per_state, \
                "ERROR: state '{}' not expected in q-table as it's a terminal state!".format(state)
            recursive_assert_almost_equal(q_values,
                                          expected_q_values_per_state[state],
                                          decimals=0)
Example #12
    def test_impala_single_agent_compilation(self):
        """
        Tests IMPALA agent compilation (single-node mode).
        """
        return  # Test currently disabled; everything below is unreachable.
        if get_backend() == "pytorch":
            return
        env = GridWorld("2x2")
        agent = IMPALAAgent.from_spec(
            config_from_path("configs/impala_agent_for_2x2_gridworld.json"),
            state_space=env.state_space,
            action_space=env.action_space,
            update_spec=dict(batch_size=16),
            optimizer_spec=dict(type="adam", learning_rate=0.05),
            # Monitoring can make session-creation hang in docker, so disable it.
            execution_spec=dict(disable_monitoring=True))
        agent.terminate()
        print("Compiled {}".format(agent))
    def test_impala_on_2x2_grid_world(self):
        """
        Creates a single IMPALAAgent and runs it via a simple loop on a 2x2 GridWorld.
        """
        env = GridWorld("2x2")
        agent = IMPALAAgent.from_spec(
            config_from_path("configs/impala_agent_for_2x2_gridworld.json"),
            state_space=env.state_space,
            action_space=env.action_space,
            execution_spec=dict(seed=12),
            update_spec=dict(batch_size=16),
            optimizer_spec=dict(type="adam", learning_rate=0.05))

        learn_updates = 50
        for i in range(learn_updates):
            ret = agent.update()
            mean_return = self._calc_mean_return(ret)
            print("i={} Loss={:.4} Avg-reward={:.2}".format(
                i, float(ret[1]), mean_return))

        # Assume we have learned something.
        self.assertGreater(mean_return, -0.1)

        # Check the last action probs for the 2 valid next_states (start (after a reset) and one below start).
        action_probs = ret[3]["action_probs"].reshape((80, 4))
        next_states = ret[3]["states"][:, 1:].reshape((80, ))
        for s_, probs in zip(next_states, action_probs):
            # Start state:
            # - Assume we picked "right" in state=1 (in order to step into goal state).
            # - OR we picked "up" or "left" in state=0 (unlikely, but possible).
            if s_ == 0:
                recursive_assert_almost_equal(probs[0], 0.0, decimals=2)
                self.assertTrue(probs[1] > 0.99 or probs[2] > 0.99)
                recursive_assert_almost_equal(probs[3], 0.0, decimals=2)
            # One below start:
            # - Assume we picked "down" in start state with very large probability.
            # - OR we picked "left" or "down" in state=1 (unlikely, but possible).
            elif s_ == 1:
                recursive_assert_almost_equal(probs[0], 0.0, decimals=2)
                self.assertTrue(probs[1] > 0.99 or probs[2] > 0.99)
                recursive_assert_almost_equal(probs[3], 0.0, decimals=2)

        agent.terminate()
Example #14
    def test_multi_gpu_dqn_agent_learning_test_gridworld_2x2(self):
        """
        Tests if the multi gpu strategy can learn successfully on a multi gpu system.

        THIS TEST REQUIRES A MULTI GPU SYSTEM.
        """
        #root_logger.setLevel(DEBUG)  # test
        env = GridWorld("2x2")
        agent = DQNAgent.from_spec(
            config_from_path("configs/multi_gpu_dqn_for_2x2_gridworld.json"),
            dueling_q=False,
            state_space=env.state_space,
            action_space=env.action_space,
            observe_spec=dict(buffer_size=100),
            # Rule of thumb for multi-GPU (with n GPUs): n-fold batch-size and learning rate w/ respect to 1 GPU.
            update_spec=dict(update_interval=4, batch_size=48, sync_interval=32),
            optimizer_spec=dict(type="adam", learning_rate=0.15),
            store_last_q_table=True
        )

        time_steps = 400
        worker = SingleThreadedWorker(env_spec=lambda: env, agent=agent, worker_executes_preprocessing=False)
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        print("STATES:\n{}".format(agent.last_q_table["states"]))
        print("\n\nQ(s,a)-VALUES:\n{}".format(np.round_(agent.last_q_table["q_values"], decimals=2)))

        self.assertEqual(results["timesteps_executed"], time_steps)
        self.assertEqual(results["env_frames"], time_steps)
        self.assertGreaterEqual(results["mean_episode_reward"], -4.5)
        self.assertGreaterEqual(results["max_episode_reward"], 0.0)
        self.assertLessEqual(results["episodes_executed"], 250)

        # Check q-table for correct values.
        expected_q_values_per_state = {
            (1.0, 0, 0, 0): (-1, -5, 0, -1),
            (0, 1.0, 0, 0): (-1, 1, 0, 0)
        }
        for state, q_values in zip(agent.last_q_table["states"], agent.last_q_table["q_values"]):
            state, q_values = tuple(state), tuple(q_values)
            assert state in expected_q_values_per_state, \
                "ERROR: state '{}' not expected in q-table as it's a terminal state!".format(state)
            recursive_assert_almost_equal(q_values, expected_q_values_per_state[state], decimals=0)
Example #15
    def test_impala_on_2x2_grid_world(self):
        """
        Creates a single IMPALAAgent and runs it via the IMPALAWorker on a simple 2x2 GridWorld.
        """
        env = GridWorld("2x2")
        agent = IMPALAAgent.from_spec(
            config_from_path("configs/impala_agent_for_2x2_gridworld.json"),
            state_space=env.state_space,
            action_space=env.action_space,
            execution_spec=dict(seed=12),
            update_spec=dict(update_interval=4, batch_size=16),
            optimizer_spec=dict(type="adam", learning_rate=0.05),
        )

        learn_updates = 1000
        # Setup the queue runner.
        agent.call_api_method("setup_queue_runner")
        for _ in range(learn_updates):
            agent.update()

        #print("STATES:\n{}".format(agent.last_q_table["states"]))
        #print("\n\nQ(s,a)-VALUES:\n{}".format(np.round_(agent.last_q_table["q_values"], decimals=2)))

        #self.assertEqual(results["timesteps_executed"], time_steps)
        #self.assertEqual(results["env_frames"], time_steps)
        #self.assertGreaterEqual(results["mean_episode_reward"], -3.5)
        #self.assertGreaterEqual(results["max_episode_reward"], 0.0)
        #self.assertLessEqual(results["episodes_executed"], 350)

        # Check q-table for correct values.
        expected_q_values_per_state = {
            (1.0, 0, 0, 0): (-1, -5, 0, -1),
            (0, 1.0, 0, 0): (-1, 1, 0, 0)
        }
        for state, q_values in zip(agent.last_q_table["states"],
                                   agent.last_q_table["q_values"]):
            state, q_values = tuple(state), tuple(q_values)
            assert state in expected_q_values_per_state, \
                "ERROR: state '{}' not expected in q-table as it's a terminal state!".format(state)
            recursive_assert_almost_equal(q_values,
                                          expected_q_values_per_state[state],
                                          decimals=0)
Example #16
    def test_weights_getting_setting(self):
        """
        Tests getting and setting of the Agent's weights.
        """
        env = GridWorld(world="2x2")
        agent = Agent.from_spec(
            config_from_path("configs/dqn_agent_for_functionality_test.json"),
            state_space=env.state_space,
            action_space=env.action_space)

        weights = agent.get_weights()
        new_weights = {}
        for key, weight in weights["policy_weights"].items():
            new_weights[key] = weight + 0.01

        agent.set_weights(new_weights)
        new_actual_weights = agent.get_weights()

        recursive_assert_almost_equal(new_actual_weights["policy_weights"],
                                      new_weights)
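
    def example_copy_weights_between_agents(self, source_agent, target_agent):
        """
        Hedged sketch (not part of the original suite): copies policy weights from one Agent
        to another via the same get_weights()/set_weights() API exercised above, assuming
        both agents share the same network architecture.
        """
        weights = source_agent.get_weights()
        target_agent.set_weights(weights["policy_weights"])
        # Verify the copy with the helper used throughout these tests.
        recursive_assert_almost_equal(
            target_agent.get_weights()["policy_weights"],
            weights["policy_weights"])
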
Example #17
    def test_multi_gpu_dqn_agent_learning_test_gridworld_2x2(self):
        """
        Tests if the multi gpu strategy can learn successfully on a multi gpu system, but
        also runs on a CPU-only system using fake-GPU logic for testing purposes.
        """
        env_spec = dict(type="grid-world", world="2x2")
        dummy_env = GridWorld.from_spec(env_spec)
        agent_config = config_from_path(
            "configs/multi_gpu_dqn_for_2x2_gridworld.json")
        preprocessing_spec = agent_config.pop("preprocessing_spec")
        agent = DQNAgent.from_spec(
            agent_config,
            state_space=self.grid_world_2x2_flattened_state_space,
            action_space=dummy_env.action_space,
        )

        time_steps = 2000
        worker = SingleThreadedWorker(env_spec=env_spec,
                                      agent=agent,
                                      worker_executes_preprocessing=True,
                                      preprocessing_spec=preprocessing_spec)
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        self.assertEqual(results["timesteps_executed"], time_steps)
        self.assertEqual(results["env_frames"], time_steps)
        self.assertGreaterEqual(results["mean_episode_reward"], -4.5)
        self.assertGreaterEqual(results["max_episode_reward"], 0.0)
        self.assertLessEqual(results["episodes_executed"], time_steps / 2)

        # Check all learnt Q-values.
        q_values = agent.graph_executor.execute(
            ("get_q_values", one_hot(np.array([0, 1]), depth=4)))[:]
        recursive_assert_almost_equal(q_values[0], (0.8, -5, 0.9, 0.8),
                                      decimals=1)
        recursive_assert_almost_equal(q_values[1], (0.8, 1.0, 0.9, 0.9),
                                      decimals=1)
    def test_double_dqn_on_2x2_grid_world_with_container_actions(self):
        """
        Creates a double DQNAgent and runs it via a Runner on a simple 2x2 GridWorld using container actions.
        """
        # ftj = forward + turn + jump
        env_spec = dict(world="2x2",
                        action_type="ftj",
                        state_representation="xy+orientation")
        dummy_env = GridWorld.from_spec(env_spec)
        agent_config = config_from_path(
            "configs/dqn_agent_for_2x2_gridworld_with_container_actions.json")
        preprocessing_spec = agent_config.pop("preprocessing_spec")

        agent = DQNAgent.from_spec(agent_config,
                                   double_q=True,
                                   dueling_q=False,
                                   state_space=FloatBox(shape=(4, )),
                                   action_space=dummy_env.action_space,
                                   execution_spec=dict(seed=15),
                                   store_last_q_table=True)

        time_steps = 10000
        worker = SingleThreadedWorker(
            env_spec=lambda: GridWorld.from_spec(env_spec),
            agent=agent,
            preprocessing_spec=preprocessing_spec,
            worker_executes_preprocessing=True,
            render=False)
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        print("LAST q-table:\n{}".format(agent.last_q_table))

        self.assertEqual(results["timesteps_executed"], time_steps)
        self.assertEqual(results["env_frames"], time_steps)
        self.assertGreaterEqual(results["mean_episode_reward"], -7)
        self.assertGreaterEqual(results["max_episode_reward"], -1.0)
        self.assertLessEqual(results["episodes_executed"], time_steps / 3)

        # Check q-table for correct values.
        expected_q_values_per_state = {
            (0., 0., -1., 0.): {
                "forward": (-5.0, -1.0, -1.0),
                "jump": (0.0, -1.0)
            },
            (0., 0., 1., 0.): {
                "forward": (0.0, -1.0, -1.0),
                "jump": (0.0, -1.0)
            },
            (0., 0., 0., -1.): {
                "forward": (0.0, -1.0, -1.0),
                "jump": (0.0, -1.0)
            },
            (0., 0., 0., 1.): {
                "forward": (0.0, -1.0, -1.0),
                "jump": (0.0, -1.0)
            },
            (0., 1., -1., 0.): {
                "forward": (0.0, -1.0, -1.0),
                "jump": (0.0, -1.0)
            },
            (0., 1., 1., 0.): {
                "forward": (0.0, -1.0, -1.0),
                "jump": (0.0, -1.0)
            },
            (0., 1., 0., -1.): {
                "forward": (0.0, -1.0, -1.0),
                "jump": (0.0, -1.0)
            },
            (0., 1., 0., 1.): {
                "forward": (0.0, -1.0, -1.0),
                "jump": (0.0, -1.0)
            },
        }
        for state, q_values_forward, q_values_jump in zip(
                agent.last_q_table["states"],
                agent.last_q_table["q_values"]["forward"],
                agent.last_q_table["q_values"]["jump"]):
            state, q_values_forward, q_values_jump = tuple(state), tuple(
                q_values_forward), tuple(q_values_jump)
            assert state in expected_q_values_per_state, \
                "ERROR: state '{}' not expected in q-table as it's a terminal state!".format(state)
            recursive_assert_almost_equal(
                q_values_forward,
                expected_q_values_per_state[state]["forward"],
                decimals=0)
            recursive_assert_almost_equal(
                q_values_jump,
                expected_q_values_per_state[state]["jump"],
                decimals=0)
    def test_long_chain_grid_world(self):
        """
        Tests a minimalistic long-chain GridWorld.
        """
        env = GridWorld(world="long-chain")

        # Simple test runs with fixed actions.
        # X=player's position
        s = env.reset()  # ["X                                              G"]
        self.assertTrue(s == 33)
        s, r, t, _ = env.step(2)  # down: ["X                                              G"]
        self.assertTrue(s == 33)
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)
        s, r, t, _ = env.step(1)  # right: ["SX                                             G"]
        self.assertTrue(s == 34)
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)

        env.reset()  # ["X                                              G"]
        # Right, left, down, up, right -> Move one right each iteration.
        for x in range(20):
            s, r, t, _ = env.step(1)
            self.assertTrue(s == x + 33 + 1)
            recursive_assert_almost_equal(r, -0.1)
            self.assertTrue(not t)
            s, r, t, _ = env.step(3)
            self.assertTrue(s == x + 33)
            recursive_assert_almost_equal(r, -0.1)
            self.assertTrue(not t)
            s, r, t, _ = env.step(2)
            self.assertTrue(s == x + 33)
            recursive_assert_almost_equal(r, -0.1)
            self.assertTrue(not t)
            s, r, t, _ = env.step(0)
            self.assertTrue(s == x + 33)
            recursive_assert_almost_equal(r, -0.1)
            self.assertTrue(not t)
            s, r, t, _ = env.step(1)
            self.assertTrue(s == x + 33 + 1)
            recursive_assert_almost_equal(r, -0.1)
            self.assertTrue(not t)
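
    def example_long_chain_walk_to_goal(self):
        """
        Hedged sketch (not part of the original suite): starting from state 33 as in the
        test above, keep stepping right (action 1) until a terminal state is reached, which
        per the rendering comments should be the goal G at the right end of the chain.
        """
        env = GridWorld(world="long-chain")
        env.reset()
        t = False
        while not t:
            s, r, t, _ = env.step(1)
        # Goal reward assumed to be 1.0, as in the 2x2/4x4 worlds.
        self.assertTrue(r == 1.0)
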
    def test_4x4_grid_world_with_container_actions(self):
        """
        Tests a 4x4 GridWorld using forward+turn+jump container actions.
        """
        env = GridWorld(world="4x4", action_type="ftj", state_representation="xy+orientation")

        # Simple test runs with fixed actions.

        # Fall into hole.
        s = env.reset()  # [0, 0, 0] (x, y, orientation)
        recursive_assert_almost_equal(s, [0, 0, 0, 1])
        s, r, t, _ = env.step(dict(turn=2, forward=2))  # turn=2 (right), move=2 (forward), jump=0
        recursive_assert_almost_equal(s, [1, 0, 1, 0])
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)
        s, r, t, _ = env.step(dict(turn=2, forward=1))  # turn=2 (right), move=1 (stay), jump=0
        recursive_assert_almost_equal(s, [1, 0, 0, -1])
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)
        s, r, t, _ = env.step(dict(turn=1, forward=2))  # turn=1 (no turn), move=2 (forward), jump=0
        recursive_assert_almost_equal(s, [1, 1, 0, -1])
        self.assertTrue(r == -5.0)
        self.assertTrue(t)

        # Jump quite a lot and reach goal.
        env.reset()  # [0, 0, 0] (x, y, orientation)
        s, r, t, _ = env.step(dict(turn=2, forward=1))
        recursive_assert_almost_equal(s, [0, 0, 1, 0])
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)
        s, r, t, _ = env.step(dict(turn=1, forward=1, jump=1))
        recursive_assert_almost_equal(s, [2, 0, 1, 0])
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)
        s, r, t, _ = env.step(dict(turn=2, forward=2))
        recursive_assert_almost_equal(s, [2, 1, 0, -1])
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)
        s, r, t, _ = env.step(dict(turn=1, forward=2, jump=1))
        recursive_assert_almost_equal(s, [2, 3, 0, -1])
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)
        s, r, t, _ = env.step(dict(turn=2, forward=0))
        recursive_assert_almost_equal(s, [3, 3, -1, 0])
        self.assertTrue(r == 1.0)
        self.assertTrue(t)

        # Run against a wall.
        env.reset()  # [0, 0, 0] (x, y, orientation)
        s, r, t, _ = env.step(dict(turn=1, forward=0))
        recursive_assert_almost_equal(s, [0, 1, 0, 1])
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)
        s, r, t, _ = env.step(dict(turn=0, forward=2))
        recursive_assert_almost_equal(s, [0, 1, -1, 0])
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)

        # Jump over a hole (no reset).
        s, r, t, _ = env.step(dict(turn=2, forward=1))  # turn around
        s, r, t, _ = env.step(dict(turn=2, forward=1))
        recursive_assert_almost_equal(s, [0, 1, 1, 0])
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)
        s, r, t, _ = env.step(dict(turn=1, forward=1, jump=1))
        recursive_assert_almost_equal(s, [2, 1, 1, 0])
        recursive_assert_almost_equal(r, -0.1)
        self.assertTrue(not t)
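
    def example_random_ftj_rollout(self, num_steps=20):
        """
        Hedged sketch (not part of the original suite): rolls out the 4x4 ftj GridWorld with
        uniformly random container actions. The component ranges (turn: 0-2, forward: 0-2,
        jump: 0-1) are taken from the fixed-action steps in the test above.
        """
        env = GridWorld(world="4x4", action_type="ftj", state_representation="xy+orientation")
        env.reset()
        for _ in range(num_steps):
            action = dict(turn=np.random.randint(0, 3),
                          forward=np.random.randint(0, 3),
                          jump=np.random.randint(0, 2))
            s, r, t, _ = env.step(action)
            if t:
                env.reset()
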
Example #21
    def test_dqn_functionality(self):
        """
        Creates a DQNAgent and runs it for a few steps in a GridWorld to vigorously test
        all steps of the learning process.
        """
        env = GridWorld(world="2x2", save_mode=True)  # no holes, just fire
        agent = Agent.from_spec(  # type: DQNAgent
            config_from_path("configs/dqn_agent_for_functionality_test.json"),
            double_q=True,
            dueling_q=True,
            state_space=env.state_space,
            action_space=env.action_space,
            discount=0.95)
        worker = SingleThreadedWorker(
            env_spec=lambda: GridWorld(world="2x2", save_mode=True),
            agent=agent)
        test = AgentTest(worker=worker)

        # Helper python DQNLossFunction object.
        loss_func = DQNLossFunction(backend="python",
                                    double_q=True,
                                    discount=agent.discount)
        loss_func.when_input_complete(input_spaces=dict(loss_per_item=[
            spaces.FloatBox(shape=(4, ), add_batch_rank=True),
            spaces.IntBox(4, add_batch_rank=True),
            spaces.FloatBox(add_batch_rank=True),
            spaces.BoolBox(add_batch_rank=True),
            spaces.FloatBox(shape=(4, ), add_batch_rank=True),
            spaces.FloatBox(shape=(4, ), add_batch_rank=True)
        ]),
                                      action_space=env.action_space)

        matrix1_qnet = np.array([[0.9] * 2] * 4)
        matrix2_qnet = np.array([[0.8] * 5] * 2)
        matrix1_target_net = np.array([[0.9] * 2] * 4)
        matrix2_target_net = np.array([[0.8] * 5] * 2)

        a = self._calculate_action(0, matrix1_qnet, matrix2_qnet)

        # 1st step -> Expect insert into python-buffer.
        # action: up (0)
        test.step(1, reset=True)
        # Environment's new state.
        test.check_env("state", 0)
        # Agent's buffer.
        test.check_agent("states_buffer", [[1.0, 0.0, 0.0, 0.0]],
                         key_or_index="env_0")  # <- prev state (preprocessed)
        test.check_agent("actions_buffer", [a], key_or_index="env_0")
        test.check_agent("rewards_buffer", [-1.0], key_or_index="env_0")
        test.check_agent("terminals_buffer", [False], key_or_index="env_0")
        # Memory contents.
        test.check_var("replay-memory/index", 0)
        test.check_var("replay-memory/size", 0)
        test.check_var("replay-memory/memory/states",
                       np.array([[0] * 4] * agent.memory.capacity))
        test.check_var("replay-memory/memory/actions",
                       np.array([0] * agent.memory.capacity))
        test.check_var("replay-memory/memory/rewards",
                       np.array([0] * agent.memory.capacity))
        test.check_var("replay-memory/memory/terminals",
                       np.array([False] * agent.memory.capacity))
        # Check policy and target-policy weights (should be the same).
        test.check_var("dueling-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet)
        test.check_var("target-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet)
        test.check_var(
            "dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            matrix2_qnet)
        test.check_var(
            "target-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            matrix2_qnet)

        # 2nd step -> expect insert into memory (and python buffer should be empty again).
        # action: up (0)
        # Also check the policy and target policy values (Should be equal at this point).
        test.step(1)
        test.check_env("state", 0)
        test.check_agent("states_buffer", [], key_or_index="env_0")
        test.check_agent("actions_buffer", [], key_or_index="env_0")
        test.check_agent("rewards_buffer", [], key_or_index="env_0")
        test.check_agent("terminals_buffer", [], key_or_index="env_0")
        test.check_var("replay-memory/index", 2)
        test.check_var("replay-memory/size", 2)
        test.check_var(
            "replay-memory/memory/states",
            np.array([[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]] +
                     [[0.0, 0.0, 0.0, 0.0]] * (agent.memory.capacity - 2)))
        test.check_var("replay-memory/memory/actions",
                       np.array([0, 0] + [0] * (agent.memory.capacity - 2)))
        test.check_var(
            "replay-memory/memory/rewards",
            np.array([-1.0, -1.0] + [0.0] * (agent.memory.capacity - 2)))
        test.check_var(
            "replay-memory/memory/terminals",
            np.array([False, True] + [False] * (agent.memory.capacity - 2)))
        # Check policy and target-policy weights (should be the same).
        test.check_var("dueling-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet)
        test.check_var("target-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet)
        test.check_var(
            "dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            matrix2_qnet)
        test.check_var(
            "target-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            matrix2_qnet)

        # 3rd and 4th step -> expect another insert into memory (and python buffer should be empty again).
        # actions: down (2), up (0)  <- use_exploration=True -> more random actions
        # Expect an update to the policy variables (leave target as is (no sync yet)).
        test.step(2, use_exploration=True)
        test.check_env("state", 0)
        test.check_agent("states_buffer", [], key_or_index="env_0")
        test.check_agent("actions_buffer", [], key_or_index="env_0")
        test.check_agent("rewards_buffer", [], key_or_index="env_0")
        test.check_agent("terminals_buffer", [], key_or_index="env_0")
        test.check_var("replay-memory/index", 4)
        test.check_var("replay-memory/size", 4)
        test.check_var(
            "replay-memory/memory/states",
            np.array([[1.0, 0.0, 0.0, 0.0]] * 3 + [[0.0, 1.0, 0.0, 0.0]] +
                     [[0.0, 0.0, 0.0, 0.0]] * (agent.memory.capacity - 4)))
        test.check_var(
            "replay-memory/memory/actions",
            np.array([0, 0, 2, 0] + [0] * (agent.memory.capacity - 4)))
        test.check_var(
            "replay-memory/memory/rewards",
            np.array([-1.0] * 4 +  # + [-3.0] +
                     [0.0] * (agent.memory.capacity - 4)))
        test.check_var(
            "replay-memory/memory/terminals",
            np.array([False, True] * 2 + [False] *
                     (agent.memory.capacity - 4)))
        # Get the latest memory batch.
        expected_batch = dict(states=np.array([[1.0, 0.0, 0.0, 0.0],
                                               [1.0, 0.0, 0.0, 0.0]]),
                              actions=np.array([0, 1]),
                              rewards=np.array([-1.0, -3.0]),
                              terminals=np.array([False, True]),
                              next_states=np.array([[1.0, 0.0, 0.0, 0.0],
                                                    [0.0, 0.0, 0.0, 0.0]]))
        test.check_agent("last_memory_batch", expected_batch)

        # Calculate the weight updates and check against actually update weights by the AgentDQN.
        mat_updated = self._helper_update_matrix(expected_batch, matrix1_qnet,
                                                 matrix2_qnet,
                                                 matrix1_target_net,
                                                 matrix2_target_net, agent,
                                                 loss_func)
        # Check policy and target-policy weights (policy should be updated now).
        test.check_var("dueling-policy/neural-network/hidden/dense/kernel",
                       mat_updated[0],
                       decimals=4)
        test.check_var("target-policy/neural-network/hidden/dense/kernel",
                       matrix1_target_net)
        test.check_var(
            "dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            mat_updated[1],
            decimals=4)
        test.check_var(
            "target-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            matrix2_target_net)

        matrix1_qnet = mat_updated[0]
        matrix2_qnet = mat_updated[1]

        # 5th step -> Another buffer update check.
        # action: down (2) (weights have been updated -> different actions)
        test.step(1)
        test.check_env("state", 3)
        test.check_agent(
            "states_buffer", [], key_or_index="env_0"
        )  # <- all empty b/c we reached end of episode (buffer gets force-flushed)
        test.check_agent("actions_buffer", [], key_or_index="env_0")
        test.check_agent("rewards_buffer", [], key_or_index="env_0")
        test.check_agent("terminals_buffer", [], key_or_index="env_0")
        test.check_agent("last_memory_batch", expected_batch)
        test.check_var("replay-memory/index", 5)
        test.check_var("replay-memory/size", 5)
        test.check_var(
            "replay-memory/memory/states",
            np.array([[1.0, 0.0, 0.0, 0.0]] * 4 + [[0.0, 0.0, 1.0, 0.0]] +
                     [[0.0, 0.0, 0.0, 0.0]] * (agent.memory.capacity - 5)))
        test.check_var("replay-memory/memory/actions",
                       np.array([0, 0, 0, 1, 2, 0]))
        test.check_var("replay-memory/memory/rewards",
                       np.array([-1.0] * 3 + [-3.0, 1.0, 0.0]))
        test.check_var("replay-memory/memory/terminals",
                       np.array([False, True] * 2 + [True, False]))
        test.check_var("dueling-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet,
                       decimals=4)
        test.check_var("target-policy/neural-network/hidden/dense/kernel",
                       matrix1_target_net)
        test.check_var(
            "dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            mat_updated[1],
            decimals=4)
        test.check_var(
            "target-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            matrix2_target_net)

        # 6th/7th step (with exploration enabled) -> Another buffer update check.
        # action: up, down (0, 2)
        test.step(2, use_exploration=True)
        test.check_env("state", 1)
        test.check_agent(
            "states_buffer", [], key_or_index="env_0"
        )  # <- all empty again; flushed after 6th step (when buffer was full).
        test.check_agent("actions_buffer", [], key_or_index="env_0")
        test.check_agent("rewards_buffer", [], key_or_index="env_0")
        test.check_agent("terminals_buffer", [], key_or_index="env_0")
        test.check_agent("last_memory_batch", expected_batch)
        test.check_var("replay-memory/index",
                       1)  # index has been rolled over (memory capacity is 6)
        test.check_var("replay-memory/size", 6)
        test.check_var(
            "replay-memory/memory/states",
            np.array([[1.0, 0.0, 0.0, 0.0]] * 4 + [[0.0, 0.0, 1.0, 0.0]] +
                     [[1.0, 0.0, 0.0, 0.0]]))
        test.check_var("replay-memory/memory/actions",
                       np.array([2, 0, 0, 1, 2, 0]))
        test.check_var("replay-memory/memory/rewards",
                       np.array([-1.0] * 3 + [-3.0, 1.0, -1.0]))
        test.check_var("replay-memory/memory/terminals",
                       np.array([True, True, False, True, True, False]))

        test.check_var("dueling-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet,
                       decimals=4)
        test.check_var("target-policy/neural-network/hidden/dense/kernel",
                       matrix1_target_net)
        test.check_var(
            "dueling-policy/dueling-action-adapter/action-layer/dense/kernel",
            matrix2_qnet,
            decimals=4)
        test.check_var(
            "target-policy/dueling-action-adapter/action-layer/dense/kernel",
            matrix2_target_net)

        # 8th step -> Another buffer update check and weights update and sync.
        # action: down (2)
        test.step(1)
        test.check_env("state", 1)
        test.check_agent("states_buffer", [1], key_or_index="env_0")
        test.check_agent("actions_buffer", [2], key_or_index="env_0")
        test.check_agent("rewards_buffer", [-1.0], key_or_index="env_0")
        test.check_agent("terminals_buffer", [False], key_or_index="env_0")
        expected_batch = dict(
            states=np.array([[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]]),
            actions=np.array([0, 1]),
            rewards=np.array([-1.0, -3.0]),
            terminals=np.array([True, True]),
            next_states=np.array([[1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0]])
            # TODO: <- This is wrong and must be fixed
            # (next-state of first item is from a previous insert and unrelated to first item)
        )
        test.check_agent("last_memory_batch", expected_batch)
        test.check_var("replay-memory/index", 1)
        test.check_var("replay-memory/size", 6)
        test.check_var(
            "replay-memory/memory/states",
            np.array([[1.0, 0.0, 0.0, 0.0]] * 4 + [[0.0, 0.0, 1.0, 0.0]] +
                     [[1.0, 0.0, 0.0, 0.0]]))
        test.check_var("replay-memory/memory/actions",
                       np.array([2, 0, 0, 1, 2, 0]))
        test.check_var("replay-memory/memory/rewards",
                       np.array([-1.0, -1.0, -1.0, -3.0, 1.0, -1.0]))
        test.check_var("replay-memory/memory/terminals",
                       np.array([True, True, False, True, True, False]))

        # Assume that the sync happens first (matrices are already the same when updating).
        mat_updated = self._helper_update_matrix(expected_batch, matrix1_qnet,
                                                 matrix2_qnet, matrix1_qnet,
                                                 matrix2_qnet, agent,
                                                 loss_func)

        # Now target-net should be again 1 step behind policy-net.
        test.check_var("dueling-policy/neural-network/hidden/dense/kernel",
                       mat_updated[0],
                       decimals=2)
        test.check_var("target-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet,
                       decimals=2)  # again: old matrix
        test.check_var(
            "dueling-policy/dueling-action-adapter/action-layer/dense/kernel",
            mat_updated[1],
            decimals=2)
        test.check_var(
            "target-policy/dueling-action-adapter/action-layer/dense/kernel",
            matrix2_qnet,
            decimals=2)
Example #22
    def test_2x2_grid_world(self):
        """
        Tests a minimalistic 2x2 GridWorld.
        """
        env = GridWorld(world="2x2")

        # Make everything deterministic.
        env.seed(55)

        # Simple test runs with fixed actions.
        # X=player's position
        s = env.reset()  # ["XH", " G"]  X=player's position
        self.assertTrue(s == 0)
        s, r, t, _ = env.step(2)  # down: [" H", "XG"]
        self.assertTrue(s == 1)
        self.assertTrue(r == -1.0)
        self.assertTrue(t is False)
        s, r, t, _ = env.step(1)  # right: [" H", " X"]
        self.assertTrue(s == 3)
        self.assertTrue(r == 1.0)
        self.assertTrue(t is True)

        env.reset()  # ["XH", " G"]  X=player's position
        s, r, t, _ = env.step(1)  # right: [" X", " G"] -> in the hole
        self.assertTrue(s == 2)
        self.assertTrue(r == -5.0)
        self.assertTrue(t is True)

        # Run against a wall.
        env.reset()  # ["XH", " G"]  X=player's position
        s, r, t, _ = env.step(3)  # left: ["XH", " G"]
        self.assertTrue(s == 0)
        self.assertTrue(r == -1.0)
        self.assertTrue(t is False)
        s, r, t, _ = env.step(2)  # down: [" H", "XG"]
        self.assertTrue(s == 1)
        self.assertTrue(r == -1.0)
        self.assertTrue(t is False)
        s, r, t, _ = env.step(0)  # up: ["XH", " G"]
        self.assertTrue(s == 0)
        self.assertTrue(r == -1.0)
        self.assertTrue(t is False)