Example #1
    def test_1_container_1_float_only_flatten(self):
        """
        Adds a single component with 2-to-3 graph_fn to the core and passes one container and one float through it
        with only the flatten option enabled.
        """
        input1_space = spaces.Dict(a=float, b=float, c=spaces.Tuple(float))
        input2_space = spaces.FloatBox(shape=(1, ))

        component = OnlyFlattenDummy(constant_value=5.0)
        test = ComponentTest(component=component,
                             input_spaces=dict(input1=input1_space,
                                               input2=input2_space))

        # Options: only flatten_ops=True.
        in1 = dict(a=5.4, b=3.4, c=tuple([3.2]))
        in2 = np.array([1.2])
        # out1: dict(in1_f key: in1_f value + in2_f[""])
        # out2: dict(in1_f key: in1_f value - in2_f[""])
        # out3: in2_f
        out1 = dict(a=in1["a"] + in2,
                    b=in1["b"] + in2,
                    c=tuple([in1["c"][0] + in2]))
        out2 = dict(a=in1["a"] - in2,
                    b=in1["b"] - in2,
                    c=tuple([in1["c"][0] - in2]))
        out3 = in2
        test.test(("run", [in1, in2]),
                  expected_outputs=[out1, out2, out3],
                  decimals=5)
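
To double-check the arithmetic the flatten-only mapping above is expected to produce (add `input2` to every flattened key of `input1`, subtract it from every key, and pass `input2` through as the third output), here is a small hand-rolled numpy sketch. The helper name `only_flatten_reference` is hypothetical and not part of RLgraph.

import numpy as np

def only_flatten_reference(input1, input2):
    # Hypothetical reference: per flattened key of input1, compute
    # (input1 + input2, input1 - input2) and pass input2 through as the third output.
    flat = dict(a=input1["a"], b=input1["b"], c=input1["c"][0])
    out1 = {k: v + input2 for k, v in flat.items()}
    out2 = {k: v - input2 for k, v in flat.items()}
    return out1, out2, input2

o1, o2, o3 = only_flatten_reference(dict(a=5.4, b=3.4, c=(3.2,)), np.array([1.2]))
assert np.allclose(o1["a"], 6.6) and np.allclose(o2["b"], 2.2) and np.allclose(o3, 1.2)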
Example #2
    def test_faulty_op_catching(self):
        """
        Builds a container component from a DenseLayer and an EmbeddingLookup sub-component, then feeds the
        dense layer's float output into the embedding lookup. EmbeddingLookup expects int inputs, so its
        input-space checking should catch this faulty op during the build phase.
        """
        # Construct some easy component containing a sub-component.
        dense_layer = DenseLayer(units=2, scope="dense-layer")
        string_layer = EmbeddingLookup(embed_dim=3,
                                       vocab_size=4,
                                       scope="embed-layer")
        container_component = Component(dense_layer, string_layer)

        # Add the component's API method.
        @rlgraph_api(component=container_component)
        def test_api(self, a):
            dense_result = self.get_sub_component_by_name("dense-layer").call(
                a)
            # First call dense to get a vector output, then call embedding, which is expecting an int input.
            # This should fail EmbeddingLookup's input space checking (only during the build phase).
            return self.get_sub_component_by_name("embed-layer").call(
                dense_result)

        # Test graphviz component graph drawing.
        draw_meta_graph(container_component, apis=True)

        test = ComponentTest(
            component=container_component,
            input_spaces=dict(
                a=spaces.FloatBox(shape=(4, ), add_batch_rank=True)))
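
The failure this test expects hinges on a basic fact: an embedding lookup is indexed by integer ids, while a DenseLayer emits a float vector. The plain-numpy sketch below (the `embedding` table and shapes are illustrative, mirroring `embed_dim=3`, `vocab_size=4` from the test) shows the kind of type mismatch that the build-phase input-space check is expected to catch.

import numpy as np

embedding = np.random.randn(4, 3)             # vocab_size=4, embed_dim=3, as in the test above
int_ids = np.array([0, 3, 1])                 # valid input for a lookup: integer ids
print(embedding[int_ids].shape)               # -> (3, 3)

dense_output = np.random.randn(3, 2).astype(np.float32)  # float output, like the DenseLayer's
try:
    embedding[dense_output]                   # float "indices" are not a valid lookup
except IndexError as e:
    print("faulty op caught:", e)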
Example #3
    def test_1_containers_1_float_flattening_splitting(self):
        """
        Adds a single component with 2-to-2 graph_fn to the core and passes one container and one float through it
        with flatten/split options all enabled.
        """
        input1_space = spaces.Dict(a=float, b=spaces.FloatBox(shape=(1, 2)))
        input2_space = spaces.FloatBox(shape=(1, 1))

        component = FlattenSplitDummy()
        test = ComponentTest(component=component, input_spaces=dict(input1=input1_space, input2=input2_space))

        # Options: fsu=flat/split/un-flat.
        in1_fsu = dict(a=np.array(0.234), b=np.array([[0.0, 3.0]]))
        in2_fsu = np.array([[2.0]])
        # Result of sending 'a' keys through graph_fn: (in1[a]+1.0=1.234, in1[a]+in2=2.234)
        # Result of sending 'b' keys through graph_fn: (in1[b]+1.0=[[1, 4]], in1[b]+in2=[[2.0, 5.0]])
        out1_fsu = dict(a=1.234, b=np.array([[1.0, 4.0]]))
        out2_fsu = dict(a=np.array([[2.234]], dtype=np.float32), b=np.array([[2.0, 5.0]]))
        test.test(("run", [in1_fsu, in2_fsu]), expected_outputs=[out1_fsu, out2_fsu])
Example #4
    def test_calling_graph_fn_from_inside_another_graph_fn(self):
        """
        One graph_fn gets called from within another. The inner one must return actual ops so that
        the outer one can process them.
        """
        input_space = spaces.FloatBox(shape=(2, ))
        component = Dummy2NestedGraphFnCalls()
        test = ComponentTest(component=component,
                             input_spaces=dict(input_=input_space))

        input_ = input_space.sample()
        expected = input_ - 1.0
        test.test(("run", input_), expected_outputs=expected, decimals=5)
Example #5
    def __init__(self,
                 state_start=0.0,
                 reward_start=-100.0,
                 steps_to_terminal=10):
        """
        Args:
            state_start (float): State to start with after reset.
            reward_start (float): Reward to start with (after first action) after a reset.
            steps_to_terminal (int): Number of steps after which a terminal signal is raised.
        """
        super(DeterministicEnv, self).__init__(state_space=spaces.FloatBox(),
                                               action_space=spaces.IntBox(2))

        self.state_start = state_start
        self.reward_start = reward_start
        self.steps_to_terminal = steps_to_terminal

        self.state = state_start
        self.reward = reward_start
        self.steps_into_episode = 0
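
The environment's `reset()` and `step()` are not part of this example. The sketch below is only a guess at their shape, based on the constructor's fields (state and reward advancing deterministically, a terminal signal after `steps_to_terminal` steps); it is not taken from the actual `DeterministicEnv` implementation.

    def reset(self):
        # Presumed behavior: start over from the configured start values.
        self.state = self.state_start
        self.reward = self.reward_start
        self.steps_into_episode = 0
        return self.state

    def step(self, actions=None):
        # Presumed behavior: deterministic +1 increments; terminal after `steps_to_terminal` steps.
        self.state += 1
        self.reward += 1.0
        self.steps_into_episode += 1
        terminal = self.steps_into_episode >= self.steps_to_terminal
        return self.state, self.reward, terminal, None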
Example #6
    def test_dqn_functionality(self):
        """
        Creates a DQNAgent and runs it for a few steps in a GridWorld to rigorously test
        all steps of the learning process.
        """
        env = GridWorld(world="2x2", save_mode=True)  # no holes, just fire
        agent = Agent.from_spec(  # type: DQNAgent
            config_from_path("configs/dqn_agent_for_functionality_test.json"),
            double_q=True,
            dueling_q=True,
            state_space=env.state_space,
            action_space=env.action_space,
            discount=0.95)
        worker = SingleThreadedWorker(
            env_spec=lambda: GridWorld(world="2x2", save_mode=True),
            agent=agent)
        test = AgentTest(worker=worker)

        # Helper python DQNLossFunc object.
        loss_func = DQNLossFunction(backend="python",
                                    double_q=True,
                                    discount=agent.discount)
        loss_func.when_input_complete(
            input_spaces=dict(loss_per_item=[
                spaces.FloatBox(shape=(4, ), add_batch_rank=True),
                spaces.IntBox(4, add_batch_rank=True),
                spaces.FloatBox(add_batch_rank=True),
                spaces.BoolBox(add_batch_rank=True),
                spaces.FloatBox(shape=(4, ), add_batch_rank=True),
                spaces.FloatBox(shape=(4, ), add_batch_rank=True)
            ]),
            action_space=env.action_space
        )

        matrix1_qnet = np.array([[0.9] * 2] * 4)
        matrix2_qnet = np.array([[0.8] * 5] * 2)
        matrix1_target_net = np.array([[0.9] * 2] * 4)
        matrix2_target_net = np.array([[0.8] * 5] * 2)
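        # Note (added for clarity, not part of the original test): matrix1_* is presumably the
        # 4->2 hidden-layer kernel and matrix2_* the 2->5 action-layer kernel. With dueling_q=True,
        # the 5 action-layer outputs are presumably split into 1 state-value and 4 advantages and
        # combined as Q(s, a) = V(s) + A(s, a) - mean_a A(s, a) before the argmax inside
        # `_calculate_action` below.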

        a = self._calculate_action(0, matrix1_qnet, matrix2_qnet)

        # 1st step -> Expect insert into python-buffer.
        # action: up (0)
        test.step(1, reset=True)
        # Environment's new state.
        test.check_env("state", 0)
        # Agent's buffer.
        test.check_agent("states_buffer", [[1.0, 0.0, 0.0, 0.0]],
                         key_or_index="env_0")  # <- prev state (preprocessed)
        test.check_agent("actions_buffer", [a], key_or_index="env_0")
        test.check_agent("rewards_buffer", [-1.0], key_or_index="env_0")
        test.check_agent("terminals_buffer", [False], key_or_index="env_0")
        # Memory contents.
        test.check_var("replay-memory/index", 0)
        test.check_var("replay-memory/size", 0)
        test.check_var("replay-memory/memory/states",
                       np.array([[0] * 4] * agent.memory.capacity))
        test.check_var("replay-memory/memory/actions",
                       np.array([0] * agent.memory.capacity))
        test.check_var("replay-memory/memory/rewards",
                       np.array([0] * agent.memory.capacity))
        test.check_var("replay-memory/memory/terminals",
                       np.array([False] * agent.memory.capacity))
        # Check policy and target-policy weights (should be the same).
        test.check_var("dueling-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet)
        test.check_var("target-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet)
        test.check_var(
            "dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            matrix2_qnet)
        test.check_var(
            "target-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            matrix2_qnet)

        # 2nd step -> expect insert into memory (and python buffer should be empty again).
        # action: up (0)
        # Also check the policy and target policy values (Should be equal at this point).
        test.step(1)
        test.check_env("state", 0)
        test.check_agent("states_buffer", [], key_or_index="env_0")
        test.check_agent("actions_buffer", [], key_or_index="env_0")
        test.check_agent("rewards_buffer", [], key_or_index="env_0")
        test.check_agent("terminals_buffer", [], key_or_index="env_0")
        test.check_var("replay-memory/index", 2)
        test.check_var("replay-memory/size", 2)
        test.check_var(
            "replay-memory/memory/states",
            np.array([[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]] +
                     [[0.0, 0.0, 0.0, 0.0]] * (agent.memory.capacity - 2)))
        test.check_var("replay-memory/memory/actions",
                       np.array([0, 0] + [0] * (agent.memory.capacity - 2)))
        test.check_var(
            "replay-memory/memory/rewards",
            np.array([-1.0, -1.0] + [0.0] * (agent.memory.capacity - 2)))
        test.check_var(
            "replay-memory/memory/terminals",
            np.array([False, True] + [False] * (agent.memory.capacity - 2)))
        # Check policy and target-policy weights (should be the same).
        test.check_var("dueling-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet)
        test.check_var("target-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet)
        test.check_var(
            "dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            matrix2_qnet)
        test.check_var(
            "target-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            matrix2_qnet)

        # 3rd and 4th step -> expect another insert into memory (and python buffer should be empty again).
        # actions: down (2), up (0)  <- exploring is True = more random actions
        # Expect an update to the policy variables (leave target as is (no sync yet)).
        test.step(2, use_exploration=True)
        test.check_env("state", 0)
        test.check_agent("states_buffer", [], key_or_index="env_0")
        test.check_agent("actions_buffer", [], key_or_index="env_0")
        test.check_agent("rewards_buffer", [], key_or_index="env_0")
        test.check_agent("terminals_buffer", [], key_or_index="env_0")
        test.check_var("replay-memory/index", 4)
        test.check_var("replay-memory/size", 4)
        test.check_var(
            "replay-memory/memory/states",
            np.array([[1.0, 0.0, 0.0, 0.0]] * 3 + [[0.0, 1.0, 0.0, 0.0]] +
                     [[0.0, 0.0, 0.0, 0.0]] * (agent.memory.capacity - 4)))
        test.check_var(
            "replay-memory/memory/actions",
            np.array([0, 0, 2, 0] + [0] * (agent.memory.capacity - 4)))
        test.check_var(
            "replay-memory/memory/rewards",
            np.array([-1.0] * 4 +  # + [-3.0] +
                     [0.0] * (agent.memory.capacity - 4)))
        test.check_var(
            "replay-memory/memory/terminals",
            np.array([False, True] * 2 + [False] *
                     (agent.memory.capacity - 4)))
        # Get the latest memory batch.
        expected_batch = dict(states=np.array([[1.0, 0.0, 0.0, 0.0],
                                               [1.0, 0.0, 0.0, 0.0]]),
                              actions=np.array([0, 1]),
                              rewards=np.array([-1.0, -3.0]),
                              terminals=np.array([False, True]),
                              next_states=np.array([[1.0, 0.0, 0.0, 0.0],
                                                    [0.0, 0.0, 0.0, 0.0]]))
        test.check_agent("last_memory_batch", expected_batch)

        # Calculate the weight updates and check them against the weights actually updated by the DQNAgent.
        mat_updated = self._helper_update_matrix(expected_batch, matrix1_qnet,
                                                 matrix2_qnet,
                                                 matrix1_target_net,
                                                 matrix2_target_net, agent,
                                                 loss_func)
        # Check policy and target-policy weights (policy should be updated now).
        test.check_var("dueling-policy/neural-network/hidden/dense/kernel",
                       mat_updated[0],
                       decimals=4)
        test.check_var("target-policy/neural-network/hidden/dense/kernel",
                       matrix1_target_net)
        test.check_var(
            "dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            mat_updated[1],
            decimals=4)
        test.check_var(
            "target-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            matrix2_target_net)

        matrix1_qnet = mat_updated[0]
        matrix2_qnet = mat_updated[1]

        # 5th step -> Another buffer update check.
        # action: down (2) (weights have been updated -> different actions)
        test.step(1)
        test.check_env("state", 3)
        test.check_agent(
            "states_buffer", [], key_or_index="env_0"
        )  # <- all empty b/c we reached end of episode (buffer gets force-flushed)
        test.check_agent("actions_buffer", [], key_or_index="env_0")
        test.check_agent("rewards_buffer", [], key_or_index="env_0")
        test.check_agent("terminals_buffer", [], key_or_index="env_0")
        test.check_agent("last_memory_batch", expected_batch)
        test.check_var("replay-memory/index", 5)
        test.check_var("replay-memory/size", 5)
        test.check_var(
            "replay-memory/memory/states",
            np.array([[1.0, 0.0, 0.0, 0.0]] * 4 + [[0.0, 0.0, 1.0, 0.0]] +
                     [[0.0, 0.0, 0.0, 0.0]] * (agent.memory.capacity - 5)))
        test.check_var("replay-memory/memory/actions",
                       np.array([0, 0, 0, 1, 2, 0]))
        test.check_var("replay-memory/memory/rewards",
                       np.array([-1.0] * 3 + [-3.0, 1.0, 0.0]))
        test.check_var("replay-memory/memory/terminals",
                       np.array([False, True] * 2 + [True, False]))
        test.check_var("dueling-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet,
                       decimals=4)
        test.check_var("target-policy/neural-network/hidden/dense/kernel",
                       matrix1_target_net)
        test.check_var(
            "dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            mat_updated[1],
            decimals=4)
        test.check_var(
            "target-policy/action-adapter-0/action-network/action-layer/dense/kernel",
            matrix2_target_net)

        # 6th/7th step (with exploration enabled) -> Another buffer update check.
        # action: up, down (0, 2)
        test.step(2, use_exploration=True)
        test.check_env("state", 1)
        test.check_agent(
            "states_buffer", [], key_or_index="env_0"
        )  # <- all empty again; flushed after 6th step (when buffer was full).
        test.check_agent("actions_buffer", [], key_or_index="env_0")
        test.check_agent("rewards_buffer", [], key_or_index="env_0")
        test.check_agent("terminals_buffer", [], key_or_index="env_0")
        test.check_agent("last_memory_batch", expected_batch)
        test.check_var("replay-memory/index",
                       1)  # index has been rolled over (memory capacity is 6)
        test.check_var("replay-memory/size", 6)
        test.check_var(
            "replay-memory/memory/states",
            np.array([[1.0, 0.0, 0.0, 0.0]] * 4 + [[0.0, 0.0, 1.0, 0.0]] +
                     [[1.0, 0.0, 0.0, 0.0]]))
        test.check_var("replay-memory/memory/actions",
                       np.array([2, 0, 0, 1, 2, 0]))
        test.check_var("replay-memory/memory/rewards",
                       np.array([-1.0] * 3 + [-3.0, 1.0, -1.0]))
        test.check_var("replay-memory/memory/terminals",
                       np.array([True, True, False, True, True, False]))

        test.check_var("dueling-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet,
                       decimals=4)
        test.check_var("target-policy/neural-network/hidden/dense/kernel",
                       matrix1_target_net)
        test.check_var(
            "dueling-policy/dueling-action-adapter/action-layer/dense/kernel",
            matrix2_qnet,
            decimals=4)
        test.check_var(
            "target-policy/dueling-action-adapter/action-layer/dense/kernel",
            matrix2_target_net)

        # 8th step -> Another buffer update check and weights update and sync.
        # action: down (2)
        test.step(1)
        test.check_env("state", 1)
        test.check_agent("states_buffer", [1], key_or_index="env_0")
        test.check_agent("actions_buffer", [2], key_or_index="env_0")
        test.check_agent("rewards_buffer", [-1.0], key_or_index="env_0")
        test.check_agent("terminals_buffer", [False], key_or_index="env_0")
        expected_batch = dict(
            states=np.array([[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]]),
            actions=np.array([0, 1]),
            rewards=np.array([-1.0, -3.0]),
            terminals=np.array([True, True]),
            next_states=np.array([[1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0]])
            # TODO: <- This is wrong and must be fixed
            # (next-state of first item is from a previous insert and unrelated to first item)
        )
        test.check_agent("last_memory_batch", expected_batch)
        test.check_var("replay-memory/index", 1)
        test.check_var("replay-memory/size", 6)
        test.check_var(
            "replay-memory/memory/states",
            np.array([[1.0, 0.0, 0.0, 0.0]] * 4 + [[0.0, 0.0, 1.0, 0.0]] +
                     [[1.0, 0.0, 0.0, 0.0]]))
        test.check_var("replay-memory/memory/actions",
                       np.array([2, 0, 0, 1, 2, 0]))
        test.check_var("replay-memory/memory/rewards",
                       np.array([-1.0, -1.0, -1.0, -3.0, 1.0, -1.0]))
        test.check_var("replay-memory/memory/terminals",
                       np.array([True, True, False, True, True, False]))

        # Assume that the sync happens first (matrices are already the same when updating).
        mat_updated = self._helper_update_matrix(expected_batch, matrix1_qnet,
                                                 matrix2_qnet, matrix1_qnet,
                                                 matrix2_qnet, agent,
                                                 loss_func)

        # Now target-net should be again 1 step behind policy-net.
        test.check_var("dueling-policy/neural-network/hidden/dense/kernel",
                       mat_updated[0],
                       decimals=2)
        test.check_var("target-policy/neural-network/hidden/dense/kernel",
                       matrix1_qnet,
                       decimals=2)  # again: old matrix
        test.check_var(
            "dueling-policy/dueling-action-adapter/action-layer/dense/kernel",
            mat_updated[1],
            decimals=2)
        test.check_var(
            "target-policy/dueling-action-adapter/action-layer/dense/kernel",
            matrix2_qnet,
            decimals=2)
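
The helpers `_calculate_action` and `_helper_update_matrix` are not shown in this example. The sketch below only restates the double-Q TD error that the python-side `DQNLossFunction` (double_q=True, discount=0.95) is expected to build its loss around; the function name and the sample numbers are illustrative, and the real loss additionally reduces the per-item errors (e.g. via a squared/Huber term), which is omitted here.

import numpy as np

def double_q_td_error(q_s, q_sp, q_target_sp, actions, rewards, terminals, discount=0.95):
    # Online net picks a' = argmax_a Q(s', a); target net evaluates it.
    # Terminal transitions get no bootstrap term.
    a_prime = np.argmax(q_sp, axis=-1)
    q_sp_ap = q_target_sp[np.arange(len(a_prime)), a_prime]
    targets = rewards + discount * q_sp_ap * (1.0 - terminals.astype(np.float32))
    q_s_a = q_s[np.arange(len(actions)), actions]
    return targets - q_s_a

# Batch of 2 with 4 actions, mirroring the loss_per_item spaces declared above.
print(double_q_td_error(
    q_s=np.array([[0.1, 0.2, 0.3, 0.4], [0.5, 0.4, 0.3, 0.2]]),
    q_sp=np.array([[0.0, 1.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]]),
    q_target_sp=np.array([[0.0, 1.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]]),
    actions=np.array([0, 1]),
    rewards=np.array([-1.0, -3.0]),
    terminals=np.array([False, True])))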
Example #7
class GridWorld(Environment):
    """
    A classic grid world.

    Possible action spaces are:
    - up, down, left, right
    - forward/halt/backward + turn left/right/no-turn + jump (or not)

    The state space is discrete.

    Field types are:
    'S' : starting point
    ' ' : free space
    'W' : wall (blocks, but can be jumped)
    'H' : hole (terminates episode) (replaced by 'F' in save_mode)
    'F' : fire (usually causing negative reward, but can be jumped)
    'G' : goal state (terminates episode)

    TODO: Create an option to introduce a continuous action space.
    """
    # Some built-in maps.
    MAPS = {
        "chain": [
            "G    S  F G"
        ],
        "2x2": [
            "SH",
            " G"
        ],
        "4x4": [
            "S   ",
            " H H",
            "   H",
            "H  G"
        ],
        "8x8": [
            "S       ",
            "        ",
            "   H    ",
            "     H  ",
            "   H    ",
            " HH   H ",
            " H  H H ",
            "   H   G"
        ],
        "8x16": [
            "S      H        ",
            "   H       HH   ",
            "    FF   WWWWWWW",
            "  H      W      ",
            "    FF   W  H   ",
            "         W      ",
            "    FF   W      ",
            "  H          H G"
        ],
        "16x16": [
            "S      H        ",
            "           HH   ",
            "    FF   W     W",
            "         W      ",
            "WWW FF      H   ",
            "         W      ",
            " FFFF    W      ",
            "  H          H  ",
            "       H        ",
            "   H       HH   ",
            "WWWW     WWWWWWW",
            "  H      W    W ",
            "    FF   W  H W ",
            "WWWW    WW    W ",
            "    FF   W      ",
            "  H          H G"
        ]
    }

    # Some useful class vars.
    grid_world_2x2_preprocessing_spec = [dict(type="reshape", flatten=True, flatten_categories=4)]
    grid_world_4x4_preprocessing_spec = [dict(type="reshape", flatten=True, flatten_categories=16)]
    # Preprocessed state spaces.
    grid_world_2x2_flattened_state_space = spaces.FloatBox(shape=(4,), add_batch_rank=True)
    grid_world_4x4_flattened_state_space = spaces.FloatBox(shape=(16,), add_batch_rank=True)

    def __init__(self, world="4x4", save_mode=False, action_type="udlr",
                 reward_function="sparse", state_representation="discrete"):
        """
        Args:
            world (Union[str,List[str]]): Either a string to map into `MAPS` or a list of strings describing the rows
                of the world (e.g. ["S ", " G"] for a two-row/two-column world with start and goal state).

            save_mode (bool): Whether to replace holes (H) with fire fields (F). Default: False.

            action_type (str): Which action space to use. Choose between "udlr" (up, down, left, right), which is a
                discrete action space, and "ftj" (forward + turn + jump), which is a container (multi-discrete)
                action space.

            reward_function (str): One of
                sparse: hole=-5, fire=-3, goal=+1, all other steps=-1
                rich: hole=-10, fire=-10, goal=+50, all other steps=-1

            state_representation (str):
                - "discrete": An int representing the field on the grid, 0 meaning the upper left field, 1 the one
                    below, etc..
                - "xy": The x and y grid position tuple.
                - "xy+orientation": The x and y grid position tuple plus the orientation (if any) as tuple of 2 values
                    of the actor.
                - "camera": A 3-channel image where each field in the grid-world is one pixel and the 3 channels are
                    used to indicate different items in the scene (walls, holes, the actor, etc..).
        """
        # Build our map.
        if isinstance(world, str):
            self.description = world
            world = self.MAPS[world]
        else:
            self.description = "custom-map"

        world = np.array(list(map(list, world)))
        # Apply safety switch.
        world[world == 'H'] = ("H" if not save_mode else "F")

        # `world` is now a 2D numpy char array that needs to be indexed using y/x pairs (first row, then column).
        self.world = world
        self.n_row, self.n_col = self.world.shape
        (start_x,), (start_y,) = np.nonzero(self.world == "S")

        # Figure out our state space.
        assert state_representation in ["discrete", "xy", "xy+orientation", "camera"]
        self.state_representation = state_representation
        # Discrete states (single int from 0 to n).
        if self.state_representation == "discrete":
            state_space = spaces.IntBox(self.n_row * self.n_col)
        # x/y position (2 ints).
        elif self.state_representation == "xy":
            state_space = spaces.IntBox(low=(0, 0), high=(self.n_col, self.n_row), shape=(2,))
        # x/y position + orientation (4 ints).
        elif self.state_representation == "xy+orientation":
            state_space = spaces.IntBox(low=(0, 0, 0, 0), high=(self.n_col, self.n_row, 1, 1))
        # Camera outputting a 2D color image of the world.
        else:
            state_space = spaces.IntBox(0, 255, shape=(self.n_row, self.n_col, 3))

        self.default_start_pos = self.get_discrete_pos(start_x, start_y)
        self.discrete_pos = self.default_start_pos

        assert reward_function in ["sparse", "rich"]  # TODO: "potential"-based reward
        self.reward_function = reward_function

        # Store the goal position for proximity calculations (for "potential" reward function).
        (self.goal_x,), (self.goal_y,) = np.nonzero(self.world == "G")

        # Specify the actual action spaces.
        self.action_type = action_type
        action_space = spaces.IntBox(4) if self.action_type == "udlr" else spaces.Dict(dict(
            forward=spaces.IntBox(3), turn=spaces.IntBox(3), jump=spaces.IntBox(2)
        ))

        # Call the super's constructor.
        super(GridWorld, self).__init__(state_space=state_space, action_space=action_space)

        # Reset ourselves.
        self.state = None
        self.orientation = None  # int: 0, 90, 180, 270
        self.camera_pixels = None  # only used, if state_representation=='cam'
        self.reward = None
        self.is_terminal = None
        self.reset(randomize=False)

    def seed(self, seed=None):
        if seed is None:
            seed = time.time()
        np.random.seed(seed)
        return seed

    def reset(self, randomize=False):
        """
        Args:
            randomize (bool): Whether to start the new episode in a random position (instead of "S").
                This could be an empty space (" "), the default start ("S") or a fire field ("F").
        """
        if randomize is False:
            self.discrete_pos = self.default_start_pos
        else:
            # Move to a random first position (" ", "S", or "F" (ouch!) are all ok to start in).
            while True:
                self.discrete_pos = random.choice(range(self.n_row * self.n_col))
                if self.world[self.y, self.x] in [" ", "S", "F"]:
                    break

        self.reward = 0.0
        self.is_terminal = False
        self.orientation = 0
        self.refresh_state()
        return self.state

    def reset_flow(self, randomize=False):
        return self.reset(randomize=randomize)

    def step(self, actions, set_discrete_pos=None):
        """
        Action map:
        0: up
        1: right
        2: down
        3: left

        Args:
            actions (Union[int,Dict[str,int]]):
                For "udlr": An integer 0-3 that describes the next action.
                For "ftj": A dict with keys: "turn" (0 (turn left), 1 (no turn), 2 (turn right)), "forward"
                    (0 (backward), 1(stay), 2 (forward)) and "jump" (0 (no jump) and 1 (jump)).

            set_discrete_pos (Optional[int]): An integer to set the current discrete position to before acting.

        Returns:
            tuple: Next state, reward (float), is_terminal (bool), info (usually None).
        """
        # Process possible manual setter instruction.
        if set_discrete_pos is not None:
            assert isinstance(set_discrete_pos, int) and 0 <= set_discrete_pos < self.state_space.flat_dim
            self.discrete_pos = set_discrete_pos

        # Forward, turn, jump container action.
        move = None
        if self.action_type == "ftj":
            actions = self._translate_action(actions)
            # Turn around (0 (left turn), 1 (no turn), 2 (right turn)).
            if "turn" in actions:
                self.orientation += (actions["turn"] - 1) * 90
                self.orientation %= 360  # re-normalize orientation

            # Forward (0=move back, 1=don't move, 2=move forward).
            if "forward" in actions:
                forward = actions["forward"]
                # Translate into classic grid world action (0=up, 1=right, 2=down, 3=left).
                # We are actually moving in some direction.
                if actions["forward"] != 1:
                    if self.orientation == 0 and forward == 2 or self.orientation == 180 and forward == 0:
                        move = 0  # up
                    elif self.orientation == 90 and forward == 2 or self.orientation == 270 and forward == 0:
                        move = 1  # right
                    elif self.orientation == 180 and forward == 2 or self.orientation == 0 and forward == 0:
                        move = 2  # down
                    else:
                        move = 3  # left
            # Up, down, left, right actions.
        else:
            move = actions

        if move is not None:
            # determine the next state based on the transition function
            next_positions = self.get_possible_next_positions(self.discrete_pos, move)
            next_state_idx = np.random.choice(len(next_positions), p=[x[1] for x in next_positions])
            # Update our pos.
            self.discrete_pos = next_positions[next_state_idx][0]

        # Jump? -> Move two fields forward (over walls/fires/holes w/o any damage).
        if self.action_type == "ftj" and "jump" in actions:
            assert actions["jump"] == 0 or actions["jump"] == 1
            if actions["jump"] == 1:
                # Translate into "classic" grid world action (0=up, ..., 3=left) and execute that action twice.
                action = int(self.orientation / 90)
                for i in range(2):
                    # determine the next state based on the transition function
                    next_positions = self.get_possible_next_positions(self.discrete_pos, action, in_air=(i==1))
                    next_state_idx = np.random.choice(len(next_positions), p=[x[1] for x in next_positions])
                    # Update our pos.
                    self.discrete_pos = next_positions[next_state_idx][0]

        next_x = self.discrete_pos // self.n_col
        next_y = self.discrete_pos % self.n_col

        # determine reward and done flag
        next_state_type = self.world[next_y, next_x]
        if next_state_type == "H":
            self.is_terminal = True
            self.reward = -5 if self.reward_function == "sparse" else -10
        elif next_state_type == "F":
            self.is_terminal = False
            self.reward = -3 if self.reward_function == "sparse" else -10
        elif next_state_type in [" ", "S"]:
            self.is_terminal = False
            self.reward = -1
        elif next_state_type == "G":
            self.is_terminal = True
            self.reward = 1 if self.reward_function == "sparse" else 50
        else:
            raise NotImplementedError

        self.refresh_state()

        return self.state, np.array(self.reward, dtype=np.float32), np.array(self.is_terminal), None

    def step_flow(self, actions):
        state, reward, terminal, _ = self.step(actions)
        # Flow Env logic.
        if terminal:
            state = self.reset()

        return state, reward, terminal

    def render(self):
        actor = "X"
        if self.action_type == "ftj":
            actor = "^" if self.orientation == 0 else ">" if self.orientation == 90 else "v" if \
                self.orientation == 180 else "<"

        # paints itself
        for row in range_(len(self.world)):
            for col, val in enumerate(self.world[row]):
                if self.x == col and self.y == row:
                    print(actor, end="")
                else:
                    print(val, end="")
            print()
        print()

    def __str__(self):
        return "GridWorld({})".format(self.description)

    def refresh_state(self):
        # Discrete state.
        if self.state_representation == "discrete":
            # TODO: If ftj-actions, maybe multiply discrete states with orientation (will lead to x4 state space size).
            self.state = np.array(self.discrete_pos, dtype=np.int32)
        # xy position.
        elif self.state_representation == "xy":
            self.state = np.array([self.x, self.y], dtype=np.int32)
        # xy + orientation (only if `self.action_type` supports turns).
        elif self.state_representation == "xy+orientation":
            orient = [0, 1] if self.orientation == 0 else [1, 0] if self.orientation == 90 else [0, -1] \
                if self.orientation == 180 else [-1, 0]
            self.state = np.array([self.x, self.y] + orient, dtype=np.int32)
        # Camera.
        else:
            self.update_cam_pixels()
            self.state = self.camera_pixels

    def get_possible_next_positions(self, discrete_pos, action, in_air=False):
        """
        Given a discrete position value and an action, returns a list of possible next states and
        their probabilities. Only next states with non-zero probabilities will be returned.
        For now: Implemented as a deterministic MDP.

        Args:
            discrete_pos (int): The discrete position to return possible next states for.
            action (int): The action choice.
            in_air (bool): Whether we are actually in the air right now (ignore if we come from "H" or "W").

        Returns:
            List[Tuple[int,float]]: A list of tuples (s', p(s'\|s,a)). Where s' is the next discrete position and
                p(s'\|s,a) is the probability of ending up in that position when in state s and taking action a.
        """
        x = discrete_pos // self.n_col
        y = discrete_pos % self.n_col
        coords = np.array([x, y])

        increments = np.array([[0, -1], [1, 0], [0, 1], [-1, 0]])
        next_coords = np.clip(
            coords + increments[action],
            [0, 0],
            [self.n_row - 1, self.n_col - 1]
        )
        next_pos = self.get_discrete_pos(next_coords[0], next_coords[1])
        pos_type = self.world[y, x]
        next_pos_type = self.world[next_coords[1], next_coords[0]]
        # TODO: Allow stochasticity in this env. Right now, all probs are 1.0.
        # Next field is a wall or we are already terminal. Stay where we are.
        if next_pos_type == "W" or (in_air is False and pos_type in ["H", "G"]):
            return [(discrete_pos, 1.)]
        # Move to next field.
        else:
            return [(next_pos, 1.)]

    def update_cam_pixels(self):
        # Init camera?
        if self.camera_pixels is None:
            self.camera_pixels = np.zeros(shape=(self.n_row, self.n_col, 3), dtype=np.int32)
        self.camera_pixels[:, :, :] = 0  # reset everything

        # 1st channel -> Dangers (fire=127, holes=255).
        # 2nd channel -> Walls (127) and goal (255).
        # 3rd channel -> Actor position (255).
        for row in range_(self.n_row):
            for col in range_(self.n_col):
                field = self.world[row, col]
                if field == "F":
                    self.camera_pixels[row, col, 0] = 127
                elif field == "H":
                    self.camera_pixels[row, col, 0] = 255
                elif field == "W":
                    self.camera_pixels[row, col, 1] = 127
                elif field == "G":
                    self.camera_pixels[row, col, 1] = 255  # will this work (goal==2x wall)?
        # Overwrite player's position.
        self.camera_pixels[self.y, self.x, 2] = 255

    def get_dist_to_goal(self):
        return math.sqrt((self.x - self.goal_x) ** 2 + (self.y - self.goal_y) ** 2)

    def get_discrete_pos(self, x, y):
        """
        Returns a single, discrete int-value.
        Calculated by walking down the rows of the grid first (starting in upper left corner),
        then along the col-axis.

        Args:
            x (int): The x-coordinate.
            y (int): The y-coordinate.

        Returns:
            int: The discrete pos value corresponding to the given x and y.
        """
        return x * self.n_col + y

    @property
    def x(self):
        return self.discrete_pos // self.n_col

    @property
    def y(self):
        return self.discrete_pos % self.n_col

    def _translate_action(self, actions):
        """
        Maps a single integer action to a dict (container) action. This allows us to compare how
        container actions perform against a single large discrete action space that enumerates
        all combinations.

        Args:
            actions (Union[int,dict]): The single integer action to convert (or an already-converted actions dict).

        Returns:
            dict: The converted actions dict.
        """
        # If already dict, do nothing.
        if isinstance(actions, dict):
            return actions
        else:
            # Unpack
            if isinstance(actions, (np.ndarray, list)):
                actions = actions[0]
            # 3 x 3 x 2 = 18 actions
            assert 18 > actions >= 0
            # For "ftj": A dict with keys: "turn" (0 (turn left), 1 (no turn), 2 (turn right)), "forward"
            # (0 (backward), 1(stay), 2 (forward)) and "jump" (0 (no jump) and 1 (jump)).
            converted_actions = {}

            # Mapping:
            # 0 = 0 0 0
            # 1 = 0 0 1
            # 2 = 0 1 0
            # 3 = 0 1 1
            # 4 = 0 2 0
            # 5 = 0 2 1
            # 6 = 1 0 0
            # 7 = 1 0 1
            # 8 = 1 1 0
            # 9 = 1 1 1
            # 10 = 1 2 0
            # 11 = 1 2 1
            # 12 = 2 0 0
            # 13 = 2 0 1
            # 14 = 2 1 0
            # 15 = 2 1 1
            # 16 = 2 2 0
            # 17 = 2 2 1

            # Set turn via range:
            if 6 > actions >= 0:
                converted_actions["turn"] = 0
            elif 12 > actions >= 6:
                converted_actions["turn"] = 1
            elif 18 > actions >= 12:
                converted_actions["turn"] = 2

            if actions in [0, 1, 6, 7, 12, 13]:
                converted_actions["forward"] = 0
            elif actions in [2, 3, 8, 9, 14, 15]:
                converted_actions["forward"] = 1
            elif actions in [4, 5, 10, 11, 16, 17]:
                converted_actions["forward"] = 2

            if actions % 2 == 0:
                converted_actions["jump"] = 0
            else:
                converted_actions["jump"] = 1
            return converted_actions
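
A minimal usage sketch of the class above (illustrative only, not taken from the repo's example scripts); it assumes the `Space.sample()` API seen in Example #4 and that the `Environment` base class exposes `self.action_space`:

if __name__ == "__main__":
    env = GridWorld(world="4x4", save_mode=True, state_representation="xy")
    state = env.reset()
    for _ in range(20):
        action = env.action_space.sample()      # random "udlr" action (IntBox(4))
        state, reward, terminal, _ = env.step(action)
        env.render()
        if terminal:
            state = env.reset()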