def test_1_container_1_float_only_flatten(self):
    """
    Adds a single component with a 2-to-3 graph_fn to the core and passes one container and
    one float through it, with only the flatten option enabled.
    """
    input1_space = spaces.Dict(a=float, b=float, c=spaces.Tuple(float))
    input2_space = spaces.FloatBox(shape=(1,))

    component = OnlyFlattenDummy(constant_value=5.0)
    test = ComponentTest(component=component, input_spaces=dict(input1=input1_space, input2=input2_space))

    # Options: only flatten_ops=True.
    in1 = dict(a=5.4, b=3.4, c=tuple([3.2]))
    in2 = np.array([1.2])
    # out1: dict(in1_f key: in1_f value + in2_f[""])
    # out2: dict(in1_f key: in1_f value - in2_f[""])
    # out3: in2_f[""] (passed through unchanged)
    out1 = dict(a=in1["a"] + in2, b=in1["b"] + in2, c=tuple([in1["c"][0] + in2]))
    out2 = dict(a=in1["a"] - in2, b=in1["b"] - in2, c=tuple([in1["c"][0] - in2]))
    out3 = in2
    test.test(("run", [in1, in2]), expected_outputs=[out1, out2, out3], decimals=5)
def test_faulty_op_catching(self):
    """
    Adds a dense layer and an embedding layer to a container component, then pipes the dense
    layer's float output into the embedding lookup (which expects an int input). The faulty op
    should be caught by the input-space checking during the build phase.
    """
    # Construct some easy component containing a sub-component.
    dense_layer = DenseLayer(units=2, scope="dense-layer")
    string_layer = EmbeddingLookup(embed_dim=3, vocab_size=4, scope="embed-layer")
    container_component = Component(dense_layer, string_layer)

    # Add the component's API method.
    @rlgraph_api(component=container_component)
    def test_api(self, a):
        # First call the dense layer to get a float vector output, then call the embedding,
        # which expects an int input. This should fail EmbeddingLookup's input-space checking
        # (only during the build phase).
        dense_result = self.get_sub_component_by_name("dense-layer").call(a)
        return self.get_sub_component_by_name("embed-layer").call(dense_result)

    # Test graphviz component-graph drawing.
    draw_meta_graph(container_component, apis=True)

    test = ComponentTest(
        component=container_component,
        input_spaces=dict(a=spaces.FloatBox(shape=(4,), add_batch_rank=True))
    )
def test_1_containers_1_float_flattening_splitting(self):
    """
    Adds a single component with a 2-to-2 graph_fn to the core and passes one container and
    one float through it, with the flatten/split options enabled.
    """
    input1_space = spaces.Dict(a=float, b=spaces.FloatBox(shape=(1, 2)))
    input2_space = spaces.FloatBox(shape=(1, 1))

    component = FlattenSplitDummy()
    test = ComponentTest(component=component, input_spaces=dict(input1=input1_space, input2=input2_space))

    # Options: fsu=flatten/split/un-flatten.
    in1_fsu = dict(a=np.array(0.234), b=np.array([[0.0, 3.0]]))
    in2_fsu = np.array([[2.0]])
    # Result of sending the 'a' keys through the graph_fn: (in1[a]+1.0=1.234, in1[a]+in2=2.234).
    # Result of sending the 'b' keys through the graph_fn: (in1[b]+1.0=[[1.0, 4.0]], in1[b]+in2=[[2.0, 5.0]]).
    out1_fsu = dict(a=1.234, b=np.array([[1.0, 4.0]]))
    out2_fsu = dict(a=np.array([[2.234]], dtype=np.float32), b=np.array([[2.0, 5.0]]))
    test.test(("run", [in1_fsu, in2_fsu]), expected_outputs=[out1_fsu, out2_fsu])
def test_calling_graph_fn_from_inside_another_graph_fn(self):
    """
    One graph_fn gets called from within another one. The inner graph_fn must return actual
    ops so that the outer one can handle them.
    """
    input_space = spaces.FloatBox(shape=(2,))
    component = Dummy2NestedGraphFnCalls()
    test = ComponentTest(component=component, input_spaces=dict(input_=input_space))

    input_ = input_space.sample()
    expected = input_ - 1.0
    test.test(("run", input_), expected_outputs=expected, decimals=5)
def __init__(self, state_start=0.0, reward_start=-100.0, steps_to_terminal=10):
    """
    Args:
        state_start (float): The state to start with after a reset.
        reward_start (float): The reward to start with (after the first action) after a reset.
        steps_to_terminal (int): The number of steps after which a terminal signal is raised.
    """
    super(DeterministicEnv, self).__init__(state_space=spaces.FloatBox(), action_space=spaces.IntBox(2))

    self.state_start = state_start
    self.reward_start = reward_start
    self.steps_to_terminal = steps_to_terminal

    self.state = state_start
    self.reward = reward_start
    self.steps_into_episode = 0
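# Illustrative usage sketch (not part of the original file; `step`'s exact signature and the
# 4-tuple return value are assumptions, following the Environment convention used elsewhere
# in this code base):
#
#     env = DeterministicEnv(state_start=0.0, reward_start=-100.0, steps_to_terminal=10)
#     state = env.reset()  # -> state_start
#     for _ in range(10):
#         state, reward, terminal, _ = env.step(env.action_space.sample())
#     # After `steps_to_terminal` steps, `terminal` should be True.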
def test_dqn_functionality(self):
    """
    Creates a DQNAgent and runs it for a few steps in a GridWorld to thoroughly test
    all steps of the learning process.
    """
    env = GridWorld(world="2x2", save_mode=True)  # no holes, just fire
    agent = Agent.from_spec(  # type: DQNAgent
        config_from_path("configs/dqn_agent_for_functionality_test.json"),
        double_q=True,
        dueling_q=True,
        state_space=env.state_space,
        action_space=env.action_space,
        discount=0.95
    )
    worker = SingleThreadedWorker(env_spec=lambda: GridWorld(world="2x2", save_mode=True), agent=agent)
    test = AgentTest(worker=worker)

    # Helper: python-backend DQNLossFunction object.
    loss_func = DQNLossFunction(backend="python", double_q=True, discount=agent.discount)
    loss_func.when_input_complete(
        input_spaces=dict(loss_per_item=[
            spaces.FloatBox(shape=(4,), add_batch_rank=True),
            spaces.IntBox(4, add_batch_rank=True),
            spaces.FloatBox(add_batch_rank=True),
            spaces.BoolBox(add_batch_rank=True),
            spaces.FloatBox(shape=(4,), add_batch_rank=True),
            spaces.FloatBox(shape=(4,), add_batch_rank=True)
        ]),
        action_space=env.action_space
    )

    matrix1_qnet = np.array([[0.9] * 2] * 4)
    matrix2_qnet = np.array([[0.8] * 5] * 2)
    matrix1_target_net = np.array([[0.9] * 2] * 4)
    matrix2_target_net = np.array([[0.8] * 5] * 2)

    a = self._calculate_action(0, matrix1_qnet, matrix2_qnet)

    # 1st step -> Expect insert into python-buffer.
    # action: up (0)
    test.step(1, reset=True)
    # Environment's new state.
    test.check_env("state", 0)
    # Agent's buffer.
    test.check_agent("states_buffer", [[1.0, 0.0, 0.0, 0.0]], key_or_index="env_0")  # <- prev state (preprocessed)
    test.check_agent("actions_buffer", [a], key_or_index="env_0")
    test.check_agent("rewards_buffer", [-1.0], key_or_index="env_0")
    test.check_agent("terminals_buffer", [False], key_or_index="env_0")
    # Memory contents.
    test.check_var("replay-memory/index", 0)
    test.check_var("replay-memory/size", 0)
    test.check_var("replay-memory/memory/states", np.array([[0] * 4] * agent.memory.capacity))
    test.check_var("replay-memory/memory/actions", np.array([0] * agent.memory.capacity))
    test.check_var("replay-memory/memory/rewards", np.array([0] * agent.memory.capacity))
    test.check_var("replay-memory/memory/terminals", np.array([False] * agent.memory.capacity))
    # Check policy and target-policy weights (should be the same).
    test.check_var("dueling-policy/neural-network/hidden/dense/kernel", matrix1_qnet)
    test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_qnet)
    test.check_var("dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel", matrix2_qnet)
    test.check_var("target-policy/action-adapter-0/action-network/action-layer/dense/kernel", matrix2_qnet)

    # 2nd step -> Expect insert into memory (and the python buffer should be empty again).
    # action: up (0)
    # Also check the policy and target-policy values (should be equal at this point).
    test.step(1)
    test.check_env("state", 0)
    test.check_agent("states_buffer", [], key_or_index="env_0")
    test.check_agent("actions_buffer", [], key_or_index="env_0")
    test.check_agent("rewards_buffer", [], key_or_index="env_0")
    test.check_agent("terminals_buffer", [], key_or_index="env_0")
    test.check_var("replay-memory/index", 2)
    test.check_var("replay-memory/size", 2)
    test.check_var(
        "replay-memory/memory/states",
        np.array([[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]] +
                 [[0.0, 0.0, 0.0, 0.0]] * (agent.memory.capacity - 2)))
    test.check_var("replay-memory/memory/actions", np.array([0, 0] + [0] * (agent.memory.capacity - 2)))
    test.check_var(
        "replay-memory/memory/rewards",
        np.array([-1.0, -1.0] + [0.0] * (agent.memory.capacity - 2)))
    test.check_var(
        "replay-memory/memory/terminals",
        np.array([False, True] + [False] * (agent.memory.capacity - 2)))
    # Check policy and target-policy weights (should be the same).
    test.check_var("dueling-policy/neural-network/hidden/dense/kernel", matrix1_qnet)
    test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_qnet)
    test.check_var("dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel", matrix2_qnet)
    test.check_var("target-policy/action-adapter-0/action-network/action-layer/dense/kernel", matrix2_qnet)

    # 3rd and 4th step -> Expect another insert into memory (and the python buffer should be empty again).
    # actions: down (2), up (0) <- use_exploration is True = more random actions
    # Expect an update to the policy variables (leave the target as-is; no sync yet).
    test.step(2, use_exploration=True)
    test.check_env("state", 0)
    test.check_agent("states_buffer", [], key_or_index="env_0")
    test.check_agent("actions_buffer", [], key_or_index="env_0")
    test.check_agent("rewards_buffer", [], key_or_index="env_0")
    test.check_agent("terminals_buffer", [], key_or_index="env_0")
    test.check_var("replay-memory/index", 4)
    test.check_var("replay-memory/size", 4)
    test.check_var(
        "replay-memory/memory/states",
        np.array([[1.0, 0.0, 0.0, 0.0]] * 3 + [[0.0, 1.0, 0.0, 0.0]] +
                 [[0.0, 0.0, 0.0, 0.0]] * (agent.memory.capacity - 4)))
    test.check_var(
        "replay-memory/memory/actions",
        np.array([0, 0, 2, 0] + [0] * (agent.memory.capacity - 4)))
    test.check_var(
        "replay-memory/memory/rewards",
        np.array([-1.0] * 4 +  # + [-3.0] +
                 [0.0] * (agent.memory.capacity - 4)))
    test.check_var(
        "replay-memory/memory/terminals",
        np.array([False, True] * 2 + [False] * (agent.memory.capacity - 4)))
    # Get the latest memory batch.
    expected_batch = dict(
        states=np.array([[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]]),
        actions=np.array([0, 1]),
        rewards=np.array([-1.0, -3.0]),
        terminals=np.array([False, True]),
        next_states=np.array([[1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]])
    )
    test.check_agent("last_memory_batch", expected_batch)

    # Calculate the weight updates and check them against the weights actually updated by the DQNAgent.
    mat_updated = self._helper_update_matrix(
        expected_batch, matrix1_qnet, matrix2_qnet, matrix1_target_net, matrix2_target_net, agent, loss_func)

    # Check policy and target-policy weights (the policy should be updated now).
test.check_var("dueling-policy/neural-network/hidden/dense/kernel", mat_updated[0], decimals=4) test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_target_net) test.check_var( "dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel", mat_updated[1], decimals=4) test.check_var( "target-policy/action-adapter-0/action-network/action-layer/dense/kernel", matrix2_target_net) matrix1_qnet = mat_updated[0] matrix2_qnet = mat_updated[1] # 5th step -> Another buffer update check. # action: down (2) (weights have been updated -> different actions) test.step(1) test.check_env("state", 3) test.check_agent( "states_buffer", [], key_or_index="env_0" ) # <- all empty b/c we reached end of episode (buffer gets force-flushed) test.check_agent("actions_buffer", [], key_or_index="env_0") test.check_agent("rewards_buffer", [], key_or_index="env_0") test.check_agent("terminals_buffer", [], key_or_index="env_0") test.check_agent("last_memory_batch", expected_batch) test.check_var("replay-memory/index", 5) test.check_var("replay-memory/size", 5) test.check_var( "replay-memory/memory/states", np.array([[1.0, 0.0, 0.0, 0.0]] * 4 + [[0.0, 0.0, 1.0, 0.0]] + [[0.0, 0.0, 0.0, 0.0]] * (agent.memory.capacity - 5))) test.check_var("replay-memory/memory/actions", np.array([0, 0, 0, 1, 2, 0])) test.check_var("replay-memory/memory/rewards", np.array([-1.0] * 3 + [-3.0, 1.0, 0.0])) test.check_var("replay-memory/memory/terminals", np.array([False, True] * 2 + [True, False])) test.check_var("dueling-policy/neural-network/hidden/dense/kernel", matrix1_qnet, decimals=4) test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_target_net) test.check_var( "dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel", mat_updated[1], decimals=4) test.check_var( "target-policy/action-adapter-0/action-network/action-layer/dense/kernel", matrix2_target_net) # 6th/7th step (with exploration enabled) -> Another buffer update check. # action: up, down (0, 2) test.step(2, use_exploration=True) test.check_env("state", 1) test.check_agent( "states_buffer", [], key_or_index="env_0" ) # <- all empty again; flushed after 6th step (when buffer was full). test.check_agent("actions_buffer", [], key_or_index="env_0") test.check_agent("rewards_buffer", [], key_or_index="env_0") test.check_agent("terminals_buffer", [], key_or_index="env_0") test.check_agent("last_memory_batch", expected_batch) test.check_var("replay-memory/index", 1) # index has been rolled over (memory capacity is 6) test.check_var("replay-memory/size", 6) test.check_var( "replay-memory/memory/states", np.array([[1.0, 0.0, 0.0, 0.0]] * 4 + [[0.0, 0.0, 1.0, 0.0]] + [[1.0, 0.0, 0.0, 0.0]])) test.check_var("replay-memory/memory/actions", np.array([2, 0, 0, 1, 2, 0])) test.check_var("replay-memory/memory/rewards", np.array([-1.0] * 3 + [-3.0, 1.0, -1.0])) test.check_var("replay-memory/memory/terminals", np.array([True, True, False, True, True, False])) test.check_var("dueling-policy/neural-network/hidden/dense/kernel", matrix1_qnet, decimals=4) test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_target_net) test.check_var( "dueling-policy/dueling-action-adapter/action-layer/dense/kernel", matrix2_qnet, decimals=4) test.check_var( "target-policy/dueling-action-adapter/action-layer/dense/kernel", matrix2_target_net) # 8th step -> Another buffer update check and weights update and sync. 
    # action: down (2)
    test.step(1)
    test.check_env("state", 1)
    test.check_agent("states_buffer", [1], key_or_index="env_0")
    test.check_agent("actions_buffer", [2], key_or_index="env_0")
    test.check_agent("rewards_buffer", [-1.0], key_or_index="env_0")
    test.check_agent("terminals_buffer", [False], key_or_index="env_0")
    expected_batch = dict(
        states=np.array([[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]]),
        actions=np.array([0, 1]),
        rewards=np.array([-1.0, -3.0]),
        terminals=np.array([True, True]),
        next_states=np.array([[1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0]])
        # TODO: <- This is wrong and must be fixed.
        # (The next-state of the first item is from a previous insert and unrelated to the first item.)
    )
    test.check_agent("last_memory_batch", expected_batch)
    test.check_var("replay-memory/index", 1)
    test.check_var("replay-memory/size", 6)
    test.check_var(
        "replay-memory/memory/states",
        np.array([[1.0, 0.0, 0.0, 0.0]] * 4 + [[0.0, 0.0, 1.0, 0.0]] + [[1.0, 0.0, 0.0, 0.0]]))
    test.check_var("replay-memory/memory/actions", np.array([2, 0, 0, 1, 2, 0]))
    test.check_var("replay-memory/memory/rewards", np.array([-1.0, -1.0, -1.0, -3.0, 1.0, -1.0]))
    test.check_var("replay-memory/memory/terminals", np.array([True, True, False, True, True, False]))

    # Assume that the sync happens first (matrices are already the same when updating).
    mat_updated = self._helper_update_matrix(
        expected_batch, matrix1_qnet, matrix2_qnet, matrix1_qnet, matrix2_qnet, agent, loss_func)

    # Now the target-net should again be one step behind the policy-net.
    test.check_var("dueling-policy/neural-network/hidden/dense/kernel", mat_updated[0], decimals=2)
    test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_qnet, decimals=2)  # again: old matrix
    test.check_var("dueling-policy/dueling-action-adapter/action-layer/dense/kernel", mat_updated[1], decimals=2)
    test.check_var("target-policy/dueling-action-adapter/action-layer/dense/kernel", matrix2_qnet, decimals=2)
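# Illustrative sketch (not part of the original test file): `_helper_update_matrix` recomputes the
# agent's update in pure numpy. The double-Q TD target that `DQNLossFunction(double_q=True)` is
# built around selects the argmax action with the policy-net but evaluates it with the target-net.
# A minimal numpy version of that target, assuming hypothetical q-value callables `q_policy` and
# `q_target` that map a batch of states to per-action q-values:
#
#     def double_dqn_targets(rewards, terminals, next_states, q_policy, q_target, discount=0.95):
#         a_max = np.argmax(q_policy(next_states), axis=-1)            # select via policy-net
#         q_sel = q_target(next_states)[np.arange(len(a_max)), a_max]  # evaluate via target-net
#         return rewards + discount * q_sel * (1.0 - terminals)       # no bootstrapping on terminals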
class GridWorld(Environment):
    """
    A classic grid world.

    Possible action spaces are:
    - up, down, left, right
    - forward/halt/backward + turn left/right/no-turn + jump (or not)

    The state space is discrete.

    Field types are:
    'S' : starting point
    ' ' : free space
    'W' : wall (blocks, but can be jumped over)
    'H' : hole (terminates the episode) (replaced by 'F' in save-mode)
    'F' : fire (usually causes a negative reward, but can be jumped over)
    'G' : goal state (terminates the episode)

    TODO: Create an option to introduce a continuous action space.
    """
    # Some built-in maps.
    MAPS = {
        "chain": [
            "G S F G"
        ],
        "2x2": [
            "SH",
            " G"
        ],
        "4x4": [
            "S   ",
            " H H",
            "   H",
            "H  G"
        ],
        "8x8": [
            "S       ",
            "        ",
            "   H    ",
            "     H  ",
            "   H    ",
            " HH   H ",
            " H  H H ",
            "   H   G"
        ],
        "8x16": [
            "S H ",
            " H HH ",
            " FF WWWWWWW",
            " H W ",
            " FF W H ",
            " W ",
            " FF W ",
            " H H G"
        ],
        "16x16": [
            "S H ",
            " HH ",
            " FF W W",
            " W ",
            "WWW FF H ",
            " W ",
            " FFFF W ",
            " H H ",
            " H ",
            " H HH ",
            "WWWW WWWWWWW",
            " H W W ",
            " FF W H W ",
            "WWWW WW W ",
            " FF W ",
            " H H G"
        ]
    }

    # Some useful class vars.
    grid_world_2x2_preprocessing_spec = [dict(type="reshape", flatten=True, flatten_categories=4)]
    grid_world_4x4_preprocessing_spec = [dict(type="reshape", flatten=True, flatten_categories=16)]
    # Preprocessed state spaces.
    grid_world_2x2_flattened_state_space = spaces.FloatBox(shape=(4,), add_batch_rank=True)
    grid_world_4x4_flattened_state_space = spaces.FloatBox(shape=(16,), add_batch_rank=True)

    def __init__(self, world="4x4", save_mode=False, action_type="udlr",
                 reward_function="sparse", state_representation="discrete"):
        """
        Args:
            world (Union[str,List[str]]): Either a string to map into `MAPS` or a list of strings describing
                the rows of the world (e.g. ["S ", " G"] for a two-row/two-column world with start and goal
                states).
            save_mode (bool): Whether to replace holes (H) with fire fields (F). Default: False.
            action_type (str): Which action space to use. Choose between "udlr" (up, down, left, right),
                which is a discrete action space, and "ftj" (forward + turn + jump), which is a container
                multi-discrete action space.
            reward_function (str): One of:
                "sparse": hole=-5, fire=-3, goal=1, all other steps=-1
                "rich": hole=-10, fire=-10, goal=50, all other steps=-1
            state_representation (str): One of:
                - "discrete": An int representing the field on the grid, 0 meaning the upper left field,
                  1 the one below it, etc.
                - "xy": The x and y grid position as a tuple of 2 ints.
                - "xy+orientation": The x and y grid position plus the orientation of the actor (if any)
                  as a tuple of 2 values.
                - "camera": A 3-channel image in which each field of the grid world is one pixel and the
                  3 channels are used to indicate different items in the scene (walls, holes, the actor,
                  etc.).
        """
        # Build our map.
        if isinstance(world, str):
            self.description = world
            world = self.MAPS[world]
        else:
            self.description = "custom-map"
        world = np.array(list(map(list, world)))

        # Apply the safety switch.
        world[world == 'H'] = ("H" if not save_mode else "F")

        # `world` is a list of lists that needs to be indexed using y/x pairs (first row, then column).
        self.world = world
        self.n_row, self.n_col = self.world.shape
        (start_x,), (start_y,) = np.nonzero(self.world == "S")

        # Figure out our state space.
        assert state_representation in ["discrete", "xy", "xy+orientation", "camera"]
        self.state_representation = state_representation
        # Discrete states (single int from 0 to n).
        if self.state_representation == "discrete":
            state_space = spaces.IntBox(self.n_row * self.n_col)
        # x/y position (2 ints).
elif self.state_representation == "xy": state_space = spaces.IntBox(low=(0, 0), high=(self.n_col, self.n_row), shape=(2,)) # x/y position + orientation (3 ints). elif self.state_representation == "xy+orientation": state_space = spaces.IntBox(low=(0, 0, 0, 0), high=(self.n_col, self.n_row, 1, 1)) # Camera outputting a 2D color image of the world. else: state_space = spaces.IntBox(0, 255, shape=(self.n_row, self.n_col, 3)) self.default_start_pos = self.get_discrete_pos(start_x, start_y) self.discrete_pos = self.default_start_pos assert reward_function in ["sparse", "rich"] # TODO: "potential"-based reward self.reward_function = reward_function # Store the goal position for proximity calculations (for "potential" reward function). (self.goal_x,), (self.goal_y,) = np.nonzero(self.world == "G") # Specify the actual action spaces. self.action_type = action_type action_space = spaces.IntBox(4) if self.action_type == "udlr" else spaces.Dict(dict( forward=spaces.IntBox(3), turn=spaces.IntBox(3), jump=spaces.IntBox(2) )) # Call the super's constructor. super(GridWorld, self).__init__(state_space=state_space, action_space=action_space) # Reset ourselves. self.state = None self.orientation = None # int: 0, 90, 180, 270 self.camera_pixels = None # only used, if state_representation=='cam' self.reward = None self.is_terminal = None self.reset(randomize=False) def seed(self, seed=None): if seed is None: seed = time.time() np.random.seed(seed) return seed def reset(self, randomize=False): """ Args: randomize (bool): Whether to start the new episode in a random position (instead of "S"). This could be an empty space (" "), the default start ("S") or a fire field ("F"). """ if randomize is False: self.discrete_pos = self.default_start_pos else: # Move to a random first position (" ", "S", or "F" (ouch!) are all ok to start in). while True: self.discrete_pos = random.choice(range(self.n_row * self.n_col)) if self.world[self.y, self.x] in [" ", "S", "F"]: break self.reward = 0.0 self.is_terminal = False self.orientation = 0 self.refresh_state() return self.state def reset_flow(self, randomize=False): return self.reset(randomize=randomize) def step(self, actions, set_discrete_pos=None): """ Action map: 0: up 1: right 2: down 3: left Args: actions (Optional[int,Dict[str,int]]): For "udlr": An integer 0-3 that describes the next action. For "ftj": A dict with keys: "turn" (0 (turn left), 1 (no turn), 2 (turn right)), "forward" (0 (backward), 1(stay), 2 (forward)) and "jump" (0 (no jump) and 1 (jump)). set_discrete_pos (Optional[int]): An integer to set the current discrete position to before acting. Returns: tuple: State Space (Space), reward (float), is_terminal (bool), info (usually None). """ # Process possible manual setter instruction. if set_discrete_pos is not None: assert isinstance(set_discrete_pos, int) and 0 <= set_discrete_pos < self.state_space.flat_dim self.discrete_pos = set_discrete_pos # Forward, turn, jump container action. move = None if self.action_type == "ftj": actions = self._translate_action(actions) # Turn around (0 (left turn), 1 (no turn), 2 (right turn)). if "turn" in actions: self.orientation += (actions["turn"] - 1) * 90 self.orientation %= 360 # re-normalize orientation # Forward (0=move back, 1=don't move, 2=move forward). if "forward" in actions: forward = actions["forward"] # Translate into classic grid world action (0=up, 1=right, 2=down, 3=left). # We are actually moving in some direction. 
if actions["forward"] != 1: if self.orientation == 0 and forward == 2 or self.orientation == 180 and forward == 0: move = 0 # up elif self.orientation == 90 and forward == 2 or self.orientation == 270 and forward == 0: move = 1 # right elif self.orientation == 180 and forward == 2 or self.orientation == 0 and forward == 0: move = 2 # down else: move = 3 # left # Up, down, left, right actions. else: move = actions if move is not None: # determine the next state based on the transition function next_positions = self.get_possible_next_positions(self.discrete_pos, move) next_state_idx = np.random.choice(len(next_positions), p=[x[1] for x in next_positions]) # Update our pos. self.discrete_pos = next_positions[next_state_idx][0] # Jump? -> Move two fields forward (over walls/fires/holes w/o any damage). if self.action_type == "ftj" and "jump" in actions: assert actions["jump"] == 0 or actions["jump"] == 1 if actions["jump"] == 1: # Translate into "classic" grid world action (0=up, ..., 3=left) and execute that action twice. action = int(self.orientation / 90) for i in range(2): # determine the next state based on the transition function next_positions = self.get_possible_next_positions(self.discrete_pos, action, in_air=(i==1)) next_state_idx = np.random.choice(len(next_positions), p=[x[1] for x in next_positions]) # Update our pos. self.discrete_pos = next_positions[next_state_idx][0] next_x = self.discrete_pos // self.n_col next_y = self.discrete_pos % self.n_col # determine reward and done flag next_state_type = self.world[next_y, next_x] if next_state_type == "H": self.is_terminal = True self.reward = -5 if self.reward_function == "sparse" else -10 elif next_state_type == "F": self.is_terminal = False self.reward = -3 if self.reward_function == "sparse" else -10 elif next_state_type in [" ", "S"]: self.is_terminal = False self.reward = -1 elif next_state_type == "G": self.is_terminal = True self.reward = 1 if self.reward_function == "sparse" else 50 else: raise NotImplementedError self.refresh_state() return self.state, np.array(self.reward, dtype=np.float32), np.array(self.is_terminal), None def step_flow(self, actions): state, reward, terminal, _ = self.step(actions) # Flow Env logic. if terminal: state = self.reset() return state, reward, terminal def render(self): actor = "X" if self.action_type == "ftj": actor = "^" if self.orientation == 0 else ">" if self.orientation == 90 else "v" if \ self.orientation == 180 else "<" # paints itself for row in range_(len(self.world)): for col, val in enumerate(self.world[row]): if self.x == col and self.y == row: print(actor, end="") else: print(val, end="") print() print() def __str__(self): return "GridWorld({})".format(self.description) def refresh_state(self): # Discrete state. if self.state_representation == "discrete": # TODO: If ftj-actions, maybe multiply discrete states with orientation (will lead to x4 state space size). self.state = np.array(self.discrete_pos, dtype=np.int32) # xy position. elif self.state_representation == "xy": self.state = np.array([self.x, self.y], dtype=np.int32) # xy + orientation (only if `self.action_type` supports turns). elif self.state_representation == "xy+orientation": orient = [0, 1] if self.orientation == 0 else [1, 0] if self.orientation == 90 else [0, -1] \ if self.orientation == 180 else [-1, 0] self.state = np.array([self.x, self.y] + orient, dtype=np.int32) # Camera. 
        else:
            self.update_cam_pixels()
            self.state = self.camera_pixels

    def get_possible_next_positions(self, discrete_pos, action, in_air=False):
        """
        Given a discrete position value and an action, returns a list of possible next positions and
        their probabilities. Only next positions with non-zero probabilities are returned.
        For now: implemented as a deterministic MDP.

        Args:
            discrete_pos (int): The discrete position to return possible next positions for.
            action (int): The action choice.
            in_air (bool): Whether we are in the air right now (if so, ignore that we may be coming
                from an "H" or "W" field).

        Returns:
            List[Tuple[int,float]]: A list of tuples (s', p(s'|s,a)), where s' is the next discrete
                position and p(s'|s,a) is the probability of ending up in that position when in state
                s and taking action a.
        """
        x = discrete_pos // self.n_col
        y = discrete_pos % self.n_col
        coords = np.array([x, y])

        increments = np.array([[0, -1], [1, 0], [0, 1], [-1, 0]])
        next_coords = np.clip(
            coords + increments[action],
            [0, 0],
            [self.n_row - 1, self.n_col - 1]
        )
        next_pos = self.get_discrete_pos(next_coords[0], next_coords[1])
        pos_type = self.world[y, x]
        next_pos_type = self.world[next_coords[1], next_coords[0]]
        # TODO: Allow stochasticity in this env. Right now, all probs are 1.0.
        # The next field is a wall or we are already terminal: stay where we are.
        if next_pos_type == "W" or (in_air is False and pos_type in ["H", "G"]):
            return [(discrete_pos, 1.0)]
        # Move to the next field.
        else:
            return [(next_pos, 1.0)]

    def update_cam_pixels(self):
        # Initialize the camera?
        if self.camera_pixels is None:
            self.camera_pixels = np.zeros(shape=(self.n_row, self.n_col, 3), dtype=np.int32)
        self.camera_pixels[:, :, :] = 0  # reset everything

        # 1st channel -> Dangers (fire=127, holes=255).
        # 2nd channel -> Walls (127) and goal (255).
        # 3rd channel -> Actor position (255).
        for row in range_(self.n_row):
            for col in range_(self.n_col):
                field = self.world[row, col]
                if field == "F":
                    self.camera_pixels[row, col, 0] = 127
                elif field == "H":
                    self.camera_pixels[row, col, 0] = 255
                elif field == "W":
                    self.camera_pixels[row, col, 1] = 127
                elif field == "G":
                    self.camera_pixels[row, col, 1] = 255  # will this work (goal == 2x wall)?
        # Overwrite the actor's position.
        self.camera_pixels[self.y, self.x, 2] = 255

    def get_dist_to_goal(self):
        return math.sqrt((self.x - self.goal_x) ** 2 + (self.y - self.goal_y) ** 2)

    def get_discrete_pos(self, x, y):
        """
        Returns a single, discrete int-value, calculated by walking down the rows of the grid first
        (starting in the upper left corner), then along the col-axis (e.g. x=1, y=2 in a 4x4 world
        -> 1 * 4 + 2 = 6).

        Args:
            x (int): The x-coordinate.
            y (int): The y-coordinate.

        Returns:
            int: The discrete position value corresponding to the given x and y.
        """
        return x * self.n_col + y

    @property
    def x(self):
        return self.discrete_pos // self.n_col

    @property
    def y(self):
        return self.discrete_pos % self.n_col

    def _translate_action(self, actions):
        """
        Maps a single integer action to a dict of sub-actions. This allows us to compare how container
        actions perform when, instead, a single discrete action with a large range is used and all
        combinations are enumerated.

        Args:
            actions (Union[int,dict]): The single int action to translate (dicts are passed through
                unchanged).

        Returns:
            dict: The translated actions dict.
        """
        # If already a dict, do nothing.
        if isinstance(actions, dict):
            return actions
        else:
            # Unpack, if necessary.
            if isinstance(actions, (np.ndarray, list)):
                actions = actions[0]
            # 3 x 3 x 2 = 18 actions.
            assert 18 > actions >= 0
            # For "ftj": A dict with keys: "turn" (0 (turn left), 1 (no turn), 2 (turn right)),
            # "forward" (0 (backward), 1 (stay), 2 (forward)) and "jump" (0 (no jump), 1 (jump)).
            converted_actions = {}
            # Mapping: int action -> (turn, forward, jump)
            #  0 = 0 0 0    6 = 1 0 0   12 = 2 0 0
            #  1 = 0 0 1    7 = 1 0 1   13 = 2 0 1
            #  2 = 0 1 0    8 = 1 1 0   14 = 2 1 0
            #  3 = 0 1 1    9 = 1 1 1   15 = 2 1 1
            #  4 = 0 2 0   10 = 1 2 0   16 = 2 2 0
            #  5 = 0 2 1   11 = 1 2 1   17 = 2 2 1

            # Set "turn" via range.
            if 6 > actions >= 0:
                converted_actions["turn"] = 0
            elif 12 > actions >= 6:
                converted_actions["turn"] = 1
            elif 18 > actions >= 12:
                converted_actions["turn"] = 2

            # Set "forward" via enumeration.
            if actions in [0, 1, 6, 7, 12, 13]:
                converted_actions["forward"] = 0
            elif actions in [2, 3, 8, 9, 14, 15]:
                converted_actions["forward"] = 1
            elif actions in [4, 5, 10, 11, 16, 17]:
                converted_actions["forward"] = 2

            # Set "jump" via parity.
            if actions % 2 == 0:
                converted_actions["jump"] = 0
            else:
                converted_actions["jump"] = 1

            return converted_actions
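# Illustrative sketch (not part of the original file): the range/membership/parity checks in
# `_translate_action` amount to a mixed-radix decode of the flat action index, with "turn"
# varying slowest and "jump" fastest. A hypothetical closed-form equivalent:
#
#     def translate_action_compact(action):
#         assert 0 <= action < 18  # 3 (turn) x 3 (forward) x 2 (jump) combinations
#         return dict(turn=action // 6, forward=(action % 6) // 2, jump=action % 2)
#
#     translate_action_compact(13)  # -> {"turn": 2, "forward": 0, "jump": 1}, matching the table above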