def test_apex_weight_syncing(self):
    env = RandomEnv(state_space=spaces.IntBox(2), action_space=spaces.IntBox(2), deterministic=True)
    agent = Agent.from_spec(
        config_from_path("configs/apex_agent_for_random_env.json"),
        state_space=env.state_space,
        action_space=env.action_space
    )

    policy_weights = agent.get_policy_weights()
    print('policy weights: {}'.format(policy_weights))

    # Shift every weight value slightly (the numpy arrays are modified in place).
    for variable, weights in policy_weights.items():
        weights += 0.01
    agent.set_policy_weights(policy_weights)

    # Reading the weights back must return the shifted values.
    new_weights = agent.get_policy_weights()
    recursive_assert_almost_equal(policy_weights, new_weights)
def __init__(self, state_start=0.0, reward_start=-100.0, steps_to_terminal=10):
    """
    Args:
        state_start (float): State to start with after reset.
        reward_start (float): Reward to start with (after first action) after a reset.
        steps_to_terminal (int): Number of steps after which a terminal signal is raised.
    """
    super(DeterministicEnv, self).__init__(state_space=spaces.FloatBox(), action_space=spaces.IntBox(2))

    self.state_start = state_start
    self.reward_start = reward_start
    self.steps_to_terminal = steps_to_terminal

    self.state = state_start
    self.reward = reward_start
    self.steps_into_episode = 0
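# The constructor above only stores the start values and the step counter; the actual
# reset()/step() logic is not shown in this section. The following is a minimal sketch of
# what such a deterministic environment could look like -- the per-step increments of state
# and reward are an assumption for illustration, not taken from this file.
def reset(self):
    self.state = self.state_start
    self.reward = self.reward_start
    self.steps_into_episode = 0
    return self.state

def step(self, actions=None):
    # Assumption: state and reward advance by 1.0 per step, independent of the action.
    self.state += 1.0
    self.reward += 1.0
    self.steps_into_episode += 1
    terminal = self.steps_into_episode >= self.steps_to_terminal
    return self.state, self.reward, terminal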
def __init__(self, world="4x4", save_mode=False, reward_function="sparse", state_representation="discr"):
    """
    Args:
        world (Union[str,List[str]]): Either a string to map into `MAPS` or a list of strings describing the
            rows of the world (e.g. ["S ", " G"] for a two-row/two-column world with start and goal state).
        save_mode (bool): Whether to replace holes (H) with fire fields (F). Default: False.
        reward_function (str): One of
            sparse: hole=-1, fire=-1, goal=50, all other steps=-1
            rich: hole=-100, fire=-10, goal=50
        state_representation (str): One of "discr", "xy_pos", "cam".
    """
    # Build our map.
    if isinstance(world, str):
        self.description = world
        world = self.MAPS[world]
    else:
        self.description = "custom-map"
    world = np.array(list(map(list, world)))
    # Apply safety switch.
    world[world == 'H'] = ("H" if not save_mode else "F")

    # `world` is a list of lists that needs to be indexed using y/x pairs (first row, then column).
    self.world = world
    self.n_row, self.n_col = self.world.shape
    (start_x, ), (start_y, ) = np.nonzero(self.world == "S")

    # Figure out our state space.
    assert state_representation in ["discr", "xy_pos", "cam"]
    self.state_representation = state_representation
    # Discrete states (single int from 0 to n).
    if self.state_representation == "discr":
        state_space = spaces.IntBox(self.n_row * self.n_col)
    # x/y position (2 ints).
    elif self.state_representation == "xy_pos":
        state_space = spaces.IntBox(low=(0, 0), high=(self.n_col, self.n_row), shape=(2, ))
    # Camera outputting a 2D color image of the world.
    else:
        state_space = spaces.IntBox(0, 255, shape=(self.n_row, self.n_col, 3))

    self.default_start_pos = self.get_discrete_pos(start_x, start_y)
    self.discrete_pos = self.default_start_pos

    assert reward_function in ["sparse", "rich"]  # TODO: "potential"-based reward
    self.reward_function = reward_function

    # Store the goal position for proximity calculations (for "potential" reward function).
    (self.goal_x, ), (self.goal_y, ) = np.nonzero(self.world == "G")

    # Call the super's constructor.
    super(GridWorld, self).__init__(state_space=state_space, action_space=spaces.IntBox(4))

    # Reset ourselves.
    self.state = None
    self.camera_pixels = None  # only used if state_representation == 'cam'
    self.reward = None
    self.is_terminal = None
    self.reset(randomize=False)
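# For orientation, a short usage sketch of this constructor, based only on the docstring
# above; the map layout and option values are illustrative.
env = GridWorld(world=["S ", " G"], save_mode=True,
                reward_function="rich", state_representation="xy_pos")
print(env.state_space)   # IntBox with shape (2,) -> x/y grid position
print(env.action_space)  # IntBox(4) -> up/down/left/right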
def test_dqn_functionality(self):
    """
    Creates a DQNAgent and runs it for a few steps in a GridWorld to rigorously test
    all steps of the learning process.
    """
    env = GridWorld(world="2x2", save_mode=True)  # no holes, just fire
    agent = Agent.from_spec(  # type: DQNAgent
        config_from_path("configs/dqn_agent_for_functionality_test.json"),
        double_q=True,
        dueling_q=True,
        state_space=env.state_space,
        action_space=env.action_space,
        discount=0.95
    )
    worker = SingleThreadedWorker(env_spec=lambda: GridWorld(world="2x2", save_mode=True), agent=agent)
    test = AgentTest(worker=worker)

    # Helper python-backend DQNLossFunction object.
    loss_func = DQNLossFunction(backend="python", double_q=True, discount=agent.discount)
    loss_func.when_input_complete(input_spaces=dict(loss_per_item=[
        spaces.FloatBox(shape=(4, ), add_batch_rank=True),
        spaces.IntBox(4, add_batch_rank=True),
        spaces.FloatBox(add_batch_rank=True),
        spaces.BoolBox(add_batch_rank=True),
        spaces.FloatBox(shape=(4, ), add_batch_rank=True),
        spaces.FloatBox(shape=(4, ), add_batch_rank=True)
    ]), action_space=env.action_space)

    matrix1_qnet = np.array([[0.9] * 2] * 4)
    matrix2_qnet = np.array([[0.8] * 5] * 2)
    matrix1_target_net = np.array([[0.9] * 2] * 4)
    matrix2_target_net = np.array([[0.8] * 5] * 2)

    a = self._calculate_action(0, matrix1_qnet, matrix2_qnet)

    # 1st step -> Expect insert into python-buffer.
    # action: up (0)
    test.step(1, reset=True)
    # Environment's new state.
    test.check_env("state", 0)
    # Agent's buffer.
    test.check_agent("states_buffer", [[1.0, 0.0, 0.0, 0.0]], key_or_index="env_0")  # <- prev state (preprocessed)
    test.check_agent("actions_buffer", [a], key_or_index="env_0")
    test.check_agent("rewards_buffer", [-1.0], key_or_index="env_0")
    test.check_agent("terminals_buffer", [False], key_or_index="env_0")
    # Memory contents.
    test.check_var("replay-memory/index", 0)
    test.check_var("replay-memory/size", 0)
    test.check_var("replay-memory/memory/states", np.array([[0] * 4] * agent.memory.capacity))
    test.check_var("replay-memory/memory/actions", np.array([0] * agent.memory.capacity))
    test.check_var("replay-memory/memory/rewards", np.array([0] * agent.memory.capacity))
    test.check_var("replay-memory/memory/terminals", np.array([False] * agent.memory.capacity))
    # Check policy and target-policy weights (should be the same).
    test.check_var("dueling-policy/neural-network/hidden/dense/kernel", matrix1_qnet)
    test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_qnet)
    test.check_var("dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel", matrix2_qnet)
    test.check_var("target-policy/action-adapter-0/action-network/action-layer/dense/kernel", matrix2_qnet)

    # 2nd step -> expect insert into memory (and python buffer should be empty again).
    # action: up (0)
    # Also check the policy and target policy values (should be equal at this point).
    test.step(1)
    test.check_env("state", 0)
    test.check_agent("states_buffer", [], key_or_index="env_0")
    test.check_agent("actions_buffer", [], key_or_index="env_0")
    test.check_agent("rewards_buffer", [], key_or_index="env_0")
    test.check_agent("terminals_buffer", [], key_or_index="env_0")
    test.check_var("replay-memory/index", 2)
    test.check_var("replay-memory/size", 2)
    test.check_var("replay-memory/memory/states",
                   np.array([[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]] +
                            [[0.0, 0.0, 0.0, 0.0]] * (agent.memory.capacity - 2)))
    test.check_var("replay-memory/memory/actions", np.array([0, 0] + [0] * (agent.memory.capacity - 2)))
    test.check_var("replay-memory/memory/rewards", np.array([-1.0, -1.0] + [0.0] * (agent.memory.capacity - 2)))
    test.check_var("replay-memory/memory/terminals", np.array([False, True] + [False] * (agent.memory.capacity - 2)))
    # Check policy and target-policy weights (should be the same).
    test.check_var("dueling-policy/neural-network/hidden/dense/kernel", matrix1_qnet)
    test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_qnet)
    test.check_var("dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel", matrix2_qnet)
    test.check_var("target-policy/action-adapter-0/action-network/action-layer/dense/kernel", matrix2_qnet)

    # 3rd and 4th step -> expect another insert into memory (and python buffer should be empty again).
    # actions: down (2), up (0) <- exploring is True = more random actions
    # Expect an update to the policy variables (leave target as is (no sync yet)).
    test.step(2, use_exploration=True)
    test.check_env("state", 0)
    test.check_agent("states_buffer", [], key_or_index="env_0")
    test.check_agent("actions_buffer", [], key_or_index="env_0")
    test.check_agent("rewards_buffer", [], key_or_index="env_0")
    test.check_agent("terminals_buffer", [], key_or_index="env_0")
    test.check_var("replay-memory/index", 4)
    test.check_var("replay-memory/size", 4)
    test.check_var("replay-memory/memory/states",
                   np.array([[1.0, 0.0, 0.0, 0.0]] * 3 + [[0.0, 1.0, 0.0, 0.0]] +
                            [[0.0, 0.0, 0.0, 0.0]] * (agent.memory.capacity - 4)))
    test.check_var("replay-memory/memory/actions", np.array([0, 0, 2, 0] + [0] * (agent.memory.capacity - 4)))
    test.check_var("replay-memory/memory/rewards",
                   np.array([-1.0] * 4 +  # + [-3.0] +
                            [0.0] * (agent.memory.capacity - 4)))
    test.check_var("replay-memory/memory/terminals", np.array([False, True] * 2 + [False] * (agent.memory.capacity - 4)))

    # Get the latest memory batch.
    expected_batch = dict(
        states=np.array([[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]]),
        actions=np.array([0, 1]),
        rewards=np.array([-1.0, -3.0]),
        terminals=np.array([False, True]),
        next_states=np.array([[1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]])
    )
    test.check_agent("last_memory_batch", expected_batch)

    # Calculate the weight updates and check against the weights actually updated by the DQNAgent.
    mat_updated = self._helper_update_matrix(
        expected_batch, matrix1_qnet, matrix2_qnet, matrix1_target_net, matrix2_target_net, agent, loss_func
    )

    # Check policy and target-policy weights (policy should be updated now).
    test.check_var("dueling-policy/neural-network/hidden/dense/kernel", mat_updated[0], decimals=4)
    test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_target_net)
    test.check_var("dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel", mat_updated[1], decimals=4)
    test.check_var("target-policy/action-adapter-0/action-network/action-layer/dense/kernel", matrix2_target_net)

    matrix1_qnet = mat_updated[0]
    matrix2_qnet = mat_updated[1]

    # 5th step -> Another buffer update check.
    # action: down (2) (weights have been updated -> different actions)
    test.step(1)
    test.check_env("state", 3)
    # <- all empty b/c we reached end of episode (buffer gets force-flushed)
    test.check_agent("states_buffer", [], key_or_index="env_0")
    test.check_agent("actions_buffer", [], key_or_index="env_0")
    test.check_agent("rewards_buffer", [], key_or_index="env_0")
    test.check_agent("terminals_buffer", [], key_or_index="env_0")
    test.check_agent("last_memory_batch", expected_batch)
    test.check_var("replay-memory/index", 5)
    test.check_var("replay-memory/size", 5)
    test.check_var("replay-memory/memory/states",
                   np.array([[1.0, 0.0, 0.0, 0.0]] * 4 + [[0.0, 0.0, 1.0, 0.0]] +
                            [[0.0, 0.0, 0.0, 0.0]] * (agent.memory.capacity - 5)))
    test.check_var("replay-memory/memory/actions", np.array([0, 0, 0, 1, 2, 0]))
    test.check_var("replay-memory/memory/rewards", np.array([-1.0] * 3 + [-3.0, 1.0, 0.0]))
    test.check_var("replay-memory/memory/terminals", np.array([False, True] * 2 + [True, False]))
    test.check_var("dueling-policy/neural-network/hidden/dense/kernel", matrix1_qnet, decimals=4)
    test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_target_net)
    test.check_var("dueling-policy/action-adapter-0/action-network/action-layer/dense/kernel", mat_updated[1], decimals=4)
    test.check_var("target-policy/action-adapter-0/action-network/action-layer/dense/kernel", matrix2_target_net)

    # 6th/7th step (with exploration enabled) -> Another buffer update check.
    # actions: up, down (0, 2)
    test.step(2, use_exploration=True)
    test.check_env("state", 1)
    # <- all empty again; flushed after 6th step (when buffer was full).
    test.check_agent("states_buffer", [], key_or_index="env_0")
    test.check_agent("actions_buffer", [], key_or_index="env_0")
    test.check_agent("rewards_buffer", [], key_or_index="env_0")
    test.check_agent("terminals_buffer", [], key_or_index="env_0")
    test.check_agent("last_memory_batch", expected_batch)
    test.check_var("replay-memory/index", 1)  # index has been rolled over (memory capacity is 6)
    test.check_var("replay-memory/size", 6)
    test.check_var("replay-memory/memory/states",
                   np.array([[1.0, 0.0, 0.0, 0.0]] * 4 + [[0.0, 0.0, 1.0, 0.0]] + [[1.0, 0.0, 0.0, 0.0]]))
    test.check_var("replay-memory/memory/actions", np.array([2, 0, 0, 1, 2, 0]))
    test.check_var("replay-memory/memory/rewards", np.array([-1.0] * 3 + [-3.0, 1.0, -1.0]))
    test.check_var("replay-memory/memory/terminals", np.array([True, True, False, True, True, False]))
    test.check_var("dueling-policy/neural-network/hidden/dense/kernel", matrix1_qnet, decimals=4)
    test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_target_net)
    test.check_var("dueling-policy/dueling-action-adapter/action-layer/dense/kernel", matrix2_qnet, decimals=4)
    test.check_var("target-policy/dueling-action-adapter/action-layer/dense/kernel", matrix2_target_net)

    # 8th step -> Another buffer update check and weights update and sync.
    # action: down (2)
    test.step(1)
    test.check_env("state", 1)
    test.check_agent("states_buffer", [1], key_or_index="env_0")
    test.check_agent("actions_buffer", [2], key_or_index="env_0")
    test.check_agent("rewards_buffer", [-1.0], key_or_index="env_0")
    test.check_agent("terminals_buffer", [False], key_or_index="env_0")
    expected_batch = dict(
        states=np.array([[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]]),
        actions=np.array([0, 1]),
        rewards=np.array([-1.0, -3.0]),
        terminals=np.array([True, True]),
        next_states=np.array([[1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0]])
        # TODO: <- This is wrong and must be fixed
        # (next-state of first item is from a previous insert and unrelated to first item)
    )
    test.check_agent("last_memory_batch", expected_batch)
    test.check_var("replay-memory/index", 1)
    test.check_var("replay-memory/size", 6)
    test.check_var("replay-memory/memory/states",
                   np.array([[1.0, 0.0, 0.0, 0.0]] * 4 + [[0.0, 0.0, 1.0, 0.0]] + [[1.0, 0.0, 0.0, 0.0]]))
    test.check_var("replay-memory/memory/actions", np.array([2, 0, 0, 1, 2, 0]))
    test.check_var("replay-memory/memory/rewards", np.array([-1.0, -1.0, -1.0, -3.0, 1.0, -1.0]))
    test.check_var("replay-memory/memory/terminals", np.array([True, True, False, True, True, False]))

    # Assume that the sync happens first (matrices are already the same when updating).
    mat_updated = self._helper_update_matrix(
        expected_batch, matrix1_qnet, matrix2_qnet, matrix1_qnet, matrix2_qnet, agent, loss_func
    )

    # Now target-net should be again 1 step behind policy-net.
    test.check_var("dueling-policy/neural-network/hidden/dense/kernel", mat_updated[0], decimals=2)
    test.check_var("target-policy/neural-network/hidden/dense/kernel", matrix1_qnet, decimals=2)  # again: old matrix
    test.check_var("dueling-policy/dueling-action-adapter/action-layer/dense/kernel", mat_updated[1], decimals=2)
    test.check_var("target-policy/dueling-action-adapter/action-layer/dense/kernel", matrix2_qnet, decimals=2)
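# The weight checks above rely on `_helper_update_matrix` and the python-backend
# `DQNLossFunction` to recompute the expected update. As a reference for what that
# computation amounts to, below is a small, standalone numpy sketch of the standard
# double-Q TD target (not the exact RLgraph implementation; the function name and the
# squared-error loss variant are illustrative assumptions).
import numpy as np

def double_q_td_targets(rewards, terminals, q_next, q_target_next, discount=0.95):
    """Double-Q targets: the online net selects a', the target net evaluates it.

    q_next / q_target_next: arrays of shape (batch, num_actions) for the next states.
    """
    a_prime = np.argmax(q_next, axis=-1)                       # action selection via online net
    q_sp_ap = q_target_next[np.arange(len(a_prime)), a_prime]  # action evaluation via target net
    return rewards + discount * q_sp_ap * (1.0 - terminals.astype(np.float32))

# Illustrative numbers only: batch of 2 transitions, 4 discrete actions, discount=0.95 as in the test.
targets = double_q_td_targets(
    rewards=np.array([-1.0, -3.0]),
    terminals=np.array([False, True]),
    q_next=np.full((2, 4), 0.9),
    q_target_next=np.full((2, 4), 0.9),
    discount=0.95,
)
td_error = targets - np.array([0.9, 0.9])   # Q(s, a) of the taken actions (illustrative values)
loss_per_item = 0.5 * td_error ** 2         # simple squared-error variant of the per-item loss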
def __init__(self, world="4x4", save_mode=False, action_type="udlr",
             reward_function="sparse", state_representation="discrete"):
    """
    Args:
        world (Union[str,List[str]]): Either a string to map into `MAPS` or a list of strings describing the
            rows of the world (e.g. ["S ", " G"] for a two-row/two-column world with start and goal state).
        save_mode (bool): Whether to replace holes (H) with fire fields (F). Default: False.
        action_type (str): Which action space to use. Choose between "udlr" (up, down, left, right), which is
            a discrete action space, and "ftj" (forward + turn + jump), which is a container (multi-discrete)
            action space.
        reward_function (str): One of
            sparse: hole=-1, fire=-1, goal=50, all other steps=-1
            rich: hole=-100, fire=-10, goal=50
        state_representation (str):
            - "discrete": An int representing the field on the grid, 0 meaning the upper left field,
              1 the one below, etc..
            - "xy": The x and y grid position tuple.
            - "xy+orientation": The x and y grid position tuple plus the orientation (if any) of the actor
              as a tuple of 2 values.
            - "camera": A 3-channel image where each field in the grid-world is one pixel and the 3 channels
              are used to indicate different items in the scene (walls, holes, the actor, etc..).
    """
    # Build our map.
    if isinstance(world, str):
        self.description = world
        world = self.MAPS[world]
    else:
        self.description = "custom-map"
    world = np.array(list(map(list, world)))
    # Apply safety switch.
    world[world == 'H'] = ("H" if not save_mode else "F")

    # `world` is a list of lists that needs to be indexed using y/x pairs (first row, then column).
    self.world = world
    self.n_row, self.n_col = self.world.shape
    (start_x,), (start_y,) = np.nonzero(self.world == "S")

    # Figure out our state space.
    assert state_representation in ["discrete", "xy", "xy+orientation", "camera"]
    self.state_representation = state_representation
    # Discrete states (single int from 0 to n).
    if self.state_representation == "discrete":
        state_space = spaces.IntBox(self.n_row * self.n_col)
    # x/y position (2 ints).
    elif self.state_representation == "xy":
        state_space = spaces.IntBox(low=(0, 0), high=(self.n_col, self.n_row), shape=(2,))
    # x/y position + orientation (4 ints).
    elif self.state_representation == "xy+orientation":
        state_space = spaces.IntBox(low=(0, 0, 0, 0), high=(self.n_col, self.n_row, 1, 1))
    # Camera outputting a 2D color image of the world.
    else:
        state_space = spaces.IntBox(0, 255, shape=(self.n_row, self.n_col, 3))

    self.default_start_pos = self.get_discrete_pos(start_x, start_y)
    self.discrete_pos = self.default_start_pos

    assert reward_function in ["sparse", "rich"]  # TODO: "potential"-based reward
    self.reward_function = reward_function

    # Store the goal position for proximity calculations (for "potential" reward function).
    (self.goal_x,), (self.goal_y,) = np.nonzero(self.world == "G")

    # Specify the actual action spaces.
    self.action_type = action_type
    action_space = spaces.IntBox(4) if self.action_type == "udlr" else spaces.Dict(dict(
        forward=spaces.IntBox(3), turn=spaces.IntBox(3), jump=spaces.IntBox(2)
    ))

    # Call the super's constructor.
    super(GridWorld, self).__init__(state_space=state_space, action_space=action_space)

    # Reset ourselves.
    self.state = None
    self.orientation = None  # int: 0, 90, 180, 270
    self.camera_pixels = None  # only used if state_representation == 'camera'
    self.reward = None
    self.is_terminal = None
    self.reset(randomize=False)
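# A brief usage sketch of this newer constructor with the container action space. The
# action dict at the end is illustrative and assumes the environment accepts actions in
# that per-sub-space form.
env = GridWorld(world="4x4", action_type="ftj", state_representation="xy+orientation")
print(env.action_space)  # Dict(forward=IntBox(3), turn=IntBox(3), jump=IntBox(2))

# One action per sub-space, e.g. move forward, no turn, no jump.
action = dict(forward=1, turn=0, jump=0)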