import numpy as np
import theano
import lasagne
from lasagne.layers import InputLayer
from lasagne.regularization import regularize_network_params, l2

# AgentNet components used throughout this module; the exact import paths may
# differ between AgentNet versions.
from agentnet.environment import SessionPoolEnvironment
from agentnet.learning import qlearning_n_step

# MdpAgent, Controller, MetaController, Atari and get_layer_dtype are assumed to be
# defined elsewhere in this project.


class BasicRLAgent(MdpAgent):  # class header inferred from the super() call below;
                               # base class assumed to match HierarchicalAgent

    def __init__(self, pool, observation_shape, n_actions, n_parallel_games=1,
                 replay_seq_len=20, replay_batch_size=20, pool_size=None,
                 n_steps=3, gamma=0.99):
        """
        :type n_parallel_games: int
        :type n_actions: int
        """
        # parameters for training
        self.n_parallel_games = n_parallel_games
        self.replay_seq_len = replay_seq_len
        self.replay_batch_size = replay_batch_size
        self.pool_size = pool_size
        self.n_steps = n_steps
        self.gamma = gamma
        self.loss = None

        # image observation
        self.observation_layer = InputLayer(observation_shape)
        self.n_actions = n_actions

        self.resolver, self.agent = self.build_model()
        weights = lasagne.layers.get_all_params(self.resolver, trainable=True)
        self.applier_fun = self.agent.get_react_function()

        # prepare replay pool
        env = SessionPoolEnvironment(observations=self.observation_layer,
                                     actions=self.resolver,
                                     agent_memories=self.agent.state_variables)

        preceding_memory_states = list(pool.prev_memory_states)

        # get interaction sessions
        observation_tensor, action_tensor, reward_tensor, _, is_alive_tensor, _ = \
            pool.interact(self.step, n_steps=self.replay_seq_len)

        env.load_sessions(observation_tensor, action_tensor, reward_tensor,
                          is_alive_tensor, preceding_memory_states)

        if pool_size is None:
            batch_env = env
        else:
            batch_env = env.sample_session_batch(self.replay_batch_size)

        self.loss = self.build_loss(batch_env)
        self.eval_fun = self.build_eval_fun(batch_env)

        updates = lasagne.updates.adadelta(self.loss, weights, learning_rate=0.01)
        train_fun = theano.function([], [self.loss], updates=updates)

        super(BasicRLAgent, self).__init__(env, pool, train_fun, pool_size, replay_seq_len)
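
# The constructor above compiles the whole training step once (symbolic loss ->
# adadelta updates -> theano.function) and then calls it repeatedly.  Below is a
# minimal, self-contained sketch of that compile-once / call-many pattern on a toy
# quadratic loss; the names are illustrative only and nothing here is part of the
# agent API.
def _demo_theano_training_step():
    import theano.tensor as T

    params = theano.shared(np.float32([1.0, 2.0, 3.0]), name='params')  # toy "weights"
    toy_loss = T.sum((params - 5.0) ** 2)                               # toy quadratic loss
    toy_updates = lasagne.updates.adadelta(toy_loss, [params], learning_rate=0.01)
    train_step = theano.function([], toy_loss, updates=toy_updates)     # compile once

    for _ in range(10):
        train_step()  # each call performs one adadelta update on params
    return params.get_value()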
class HierarchicalAgent(MdpAgent):

    def __init__(self, pool, observation_shape, n_actions, n_parallel_games=1,
                 replay_seq_len=20, replay_batch_size=20, pool_size=None,
                 n_steps=3, gamma=0.99, split_into=1):  # gru0_size=128
        self.n_parallel_games = n_parallel_games
        self.replay_seq_len = replay_seq_len
        self.replay_batch_size = replay_batch_size
        self.pool_size = pool_size
        self.n_steps = n_steps
        self.n_actions = n_actions
        self.gamma = gamma
        self.split_into = split_into

        self.controller = Controller(observation_shape, n_actions)
        self.metacontroller = MetaController(self.controller)  # , gru0_size)

        # prepare replay pools: one experience replay environment per level of the hierarchy
        self.controller_env = SessionPoolEnvironment(
            observations=self.controller.agent.observation_layers,
            actions=self.controller.resolver,
            agent_memories=self.controller.agent.agent_states)
        self.metacontroller_env = SessionPoolEnvironment(
            observations=self.metacontroller.agent.observation_layers,
            actions=self.metacontroller.resolver,
            agent_memories=self.metacontroller.agent.agent_states)

        # get interaction sessions
        observation_log, action_tensor, extrinsic_reward_log, memory_log, is_alive_tensor, _ = \
            pool.interact(self.step, n_steps=self.replay_seq_len)
        preceding_memory_states = list(pool.prev_memory_states)

        self.reload_pool(observation_log, action_tensor, extrinsic_reward_log,
                         is_alive_tensor, memory_log, preceding_memory_states)

        if pool_size is None:
            controller_batch_env = self.controller_env
            metacontroller_batch_env = self.metacontroller_env
        else:
            controller_batch_env = self.controller_env.sample_session_batch(self.replay_batch_size)
            metacontroller_batch_env = self.metacontroller_env.sample_session_batch(self.replay_batch_size)

        # replay lengths: 50 ticks for the controller, 10 goal steps for the metacontroller
        self.loss = self.build_loss(controller_batch_env, self.controller.agent, 50) + \
            self.build_loss(metacontroller_batch_env, self.metacontroller.agent, 10)
        self.eval_fun = self.build_eval_fun(metacontroller_batch_env)

        weights = self.controller.weights + self.metacontroller.weights
        updates = lasagne.updates.adadelta(self.loss, weights, learning_rate=0.01)

        mean_session_reward = metacontroller_batch_env.rewards.sum(axis=1).mean()
        train_fun = theano.function([], [self.loss, mean_session_reward], updates=updates)

        super(HierarchicalAgent, self).__init__(
            [self.controller_env, self.metacontroller_env],
            pool, train_fun, pool_size, replay_seq_len)

    def reload_pool(self, observation_tensor, action_tensor, extrinsic_reward_tensor,
                    is_alive_tensor, memory_tensor, preceding_memory_states):
        batch_size = observation_tensor.shape[0]

        # the last four memory slots are: metacontroller observation, current goal,
        # metacontroller state value and the within-period tick counter
        meta_obs_log, goal_log, meta_V, itrs = memory_tensor[-4:]
        itr = itrs[0]

        pivot = len(self.controller.agent.state_variables)
        controller_preceding_states = preceding_memory_states[:pivot]
        metacontroller_preceding_states = preceding_memory_states[pivot:-4]

        ### CONTROLLER ###
        # load sessions into the controller's experience replay environment,
        # splitting each session into `split_into` shorter fragments
        # FIXME: controller_preceding_states are currently not passed (see below)
        ctrl_shape = (batch_size * self.split_into, self.replay_seq_len // self.split_into)

        # intrinsic reward = per-tick change of the metacontroller's state value
        intrinsic_rewards = np.concatenate(
            [np.zeros([meta_V.shape[0], 1]), np.diff(meta_V, axis=1)], axis=1)

        self.controller_env.load_sessions(
            [observation_tensor.reshape(ctrl_shape + self.controller.observation_shape[1:]),
             goal_log.reshape(ctrl_shape)],
            action_tensor.reshape(ctrl_shape),
            intrinsic_rewards.reshape(ctrl_shape),
            is_alive_tensor.reshape(ctrl_shape),
            # controller_preceding_states
        )

        ### METACONTROLLER ###
        # the metacontroller only sees the ticks where a new goal is chosen (itr == 0);
        # its reward is the extrinsic reward accumulated between consecutive goal ticks
        extrinsic_reward_sums = np.diff(
            np.concatenate([np.zeros_like(extrinsic_reward_tensor[:, 0, None]),
                            extrinsic_reward_tensor.cumsum(axis=-1)[:, itr == 0]],
                           axis=1))

        self.metacontroller_env.load_sessions(meta_obs_log[:, itr == 0][:, :10],
                                              goal_log[:, itr == 0][:, :10],
                                              extrinsic_reward_sums[:, :10],
                                              is_alive_tensor[:, itr == 0][:, :10],
                                              metacontroller_preceding_states)

    def update_pool(self, observation_tensor, action_tensor, extrinsic_reward_tensor,
                    is_alive_tensor, memory_tensor, preceding_memory_states):
        batch_size = observation_tensor.shape[0]

        meta_obs_log, goal_log, meta_V, itrs = memory_tensor[-4:]
        itr = itrs[0]

        pivot = len(self.controller.agent.state_variables)
        controller_preceding_states = preceding_memory_states[:pivot]
        metacontroller_preceding_states = preceding_memory_states[pivot:-4]

        ### CONTROLLER ###
        # append new sessions to the controller's experience replay environment
        ctrl_shape = (batch_size * self.split_into, self.replay_seq_len // self.split_into)
        intrinsic_rewards = np.concatenate(
            [np.zeros([meta_V.shape[0], 1]), np.diff(meta_V, axis=1)], axis=1)

        self.controller_env.append_sessions(
            [observation_tensor.reshape(ctrl_shape + self.controller.observation_shape[1:]),
             goal_log.reshape(ctrl_shape)],
            action_tensor.reshape(ctrl_shape),
            intrinsic_rewards.reshape(ctrl_shape),
            is_alive_tensor.reshape(ctrl_shape),
            controller_preceding_states,
            max_pool_size=self.pool_size,
        )

        ### METACONTROLLER ###
        extrinsic_reward_sums = np.diff(
            np.concatenate([np.zeros_like(extrinsic_reward_tensor[:, 0, None]),
                            extrinsic_reward_tensor.cumsum(axis=-1)[:, itr == 0]],
                           axis=1))

        self.metacontroller_env.append_sessions(meta_obs_log[:, itr == 0][:, :10],
                                                goal_log[:, itr == 0][:, :10],
                                                extrinsic_reward_sums[:, :10],
                                                is_alive_tensor[:, itr == 0][:, :10],
                                                metacontroller_preceding_states,
                                                max_pool_size=self.pool_size)

    def step(self, env_observation, prev_memories='zeros'):
        """Return actions and new agent states given an observation and the previous state.

        In the default setup, the previous state should be [prev window, ...]."""
        batch_size = self.n_parallel_games

        if prev_memories == 'zeros':
            controller_mem = metacontroller_mem = 'zeros'
            meta_inp = np.zeros(
                (batch_size,) + tuple(self.metacontroller.observation_shape[1:]),
                dtype='float32')
            itr = -1  # the goal will be set by the "if itr == 0" clause below
        else:
            pivot = len(self.controller.agent.state_variables)
            controller_mem, metacontroller_mem = prev_memories[:pivot], prev_memories[pivot:-4]
            meta_inp, goal, meta_V, itrs = prev_memories[-4:]
            itr = itrs[0]

        itr = (itr + 1) % self.metacontroller.period

        # every `period` ticks the metacontroller picks a new goal for the controller
        if itr == 0:
            goal, metacontroller_mem, meta_V = self.metacontroller.step(
                meta_inp, metacontroller_mem, batch_size)

        action, controller_mem, meta_inp = self.controller.step(
            env_observation, goal, controller_mem, batch_size)

        new_memories = controller_mem + metacontroller_mem + \
            [meta_inp, goal, meta_V, [itr] * self.n_parallel_games]

        return action, new_memories

    def build_loss(self, env, agent, replay_seq_len):
        # get agent's Q-values obtained via experience replay
        _, _, _, _, qvalues_seq = agent.get_sessions(
            env,
            # initial_hidden=env.preceding_agent_memories,
            session_length=replay_seq_len,
            batch_size=env.batch_size,
            optimize_experience_replay=True,
        )

        scaled_reward_seq = env.rewards

        elwise_mse_loss = qlearning_n_step.get_elementwise_objective(
            qvalues_seq,
            env.actions[0],
            scaled_reward_seq,
            env.is_alive,
            gamma_or_gammas=self.gamma,
            n_steps=self.n_steps)

        # compute mean over "alive" session fragments
        mse_loss = elwise_mse_loss.sum() / env.is_alive.sum()

        # regularize network weights
        reg_l2 = regularize_network_params(list(agent.state_variables.keys()), l2) * 10 ** -5

        return mse_loss + reg_l2

    def build_eval_fun(self, env):
        mean_session_reward = env.rewards.sum(axis=1).mean() / self.replay_seq_len
        eval_fun = theano.function([], [mean_session_reward])
        return eval_fun
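
# The reward bookkeeping in reload_pool/update_pool above is the trickiest part of the
# hierarchy: the controller is trained on *intrinsic* rewards (per-tick changes of the
# metacontroller's state value meta_V), while the metacontroller is trained on the
# *extrinsic* reward accumulated between consecutive goal-selection ticks.  The toy
# function below (made-up numbers, not part of the agent API) mirrors those two NumPy
# expressions on a single 6-tick session with a goal period of 3.
def _demo_hierarchical_reward_split():
    meta_V = np.array([[0.0, 0.5, 0.7, 0.4, 0.9, 1.0]])   # metacontroller value per tick
    extrinsic = np.array([[0., 1., 0., 0., 0., 2.]])      # environment reward per tick
    itr = np.array([0, 1, 2, 0, 1, 2])                    # tick counter; goal chosen at itr == 0

    # controller: intrinsic reward = per-tick change of meta_V, padded with a leading zero
    intrinsic = np.concatenate(
        [np.zeros([meta_V.shape[0], 1]), np.diff(meta_V, axis=1)], axis=1)
    # -> [[ 0. ,  0.5,  0.2, -0.3,  0.5,  0.1]]

    # metacontroller: extrinsic reward accumulated between consecutive goal ticks
    reward_sums = np.diff(
        np.concatenate([np.zeros_like(extrinsic[:, 0, None]),
                        extrinsic.cumsum(axis=-1)[:, itr == 0]], axis=1))
    # cumsum at the goal ticks (0 and 3) is [0., 1.], so the differences are [[0., 1.]]

    return intrinsic, reward_sums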
class AtariGamePool(object):

    def __init__(self, agent, game_title, n_games, max_size=None, **kwargs):
        """
        A pool that stores several things:
        - game states (gym environments)
        - prev_observations: last agent observations
        - prev_memory_states: last agent hidden states

        :param game_title: name of the game.
            See http://yavar.naddaf.name/ale/list_of_current_games.html
        :param n_games: number of parallel games
        :param kwargs: options passed to Atari when creating a game. See Atari.__init__
        """
        # create atari games
        self.game_kwargs = kwargs
        self.game_title = game_title
        self.games = [Atari(self.game_title, **self.game_kwargs) for _ in range(n_games)]

        # initial observations
        self.prev_observations = [atari.reset() for atari in self.games]

        # agent memory variables (if you use recurrent networks)
        self.prev_memory_states = [
            np.zeros((n_games,) + tuple(mem.output_shape[1:]), dtype=get_layer_dtype(mem))
            for mem in agent.agent_states
        ]

        # save agent
        self.agent = agent
        self.agent_step = agent.get_react_function()

        # create experience replay environment
        self.experience_replay = SessionPoolEnvironment(
            observations=agent.observation_layers,
            actions=agent.action_layers,
            agent_memories=agent.agent_states)

        self.max_size = max_size

    def interact(self, n_steps=100, verbose=False):
        """Generate interaction sessions with the Atari games (OpenAI Gym environments),
        using self.agent_step: a function(observations, *memory_states) ->
        (actions, *new_memory_states).

        Sessions have length n_steps. Whenever a game finishes, it is reset immediately.

        params:
            n_steps: length of an interaction
            verbose: if True, prints a small debug message whenever a game is reset after it ends

        returns:
            observation_log, action_log, reward_log, [memory_logs], is_alive_log, info_log
            a bunch of tensors shaped [batch, tick, size...];
            the only exception is info_log, which is a list of infos indexed [time][batch]
        """
        history_log = []

        for t in range(n_steps):
            res = self.agent_step(self.prev_observations, *self.prev_memory_states)
            actions, new_memory_states = res[0], res[1:]

            new_observations, cur_rewards, is_done, infos = \
                zip(*map(lambda atari, action: atari.step(action), self.games, actions))

            new_observations = np.array(new_observations)

            for game_i in range(len(self.games)):
                if is_done[game_i]:
                    # reset the finished game and zero out the agent's memory for it
                    new_observations[game_i] = self.games[game_i].reset()

                    for m_i in range(len(new_memory_states)):
                        new_memory_states[m_i][game_i] = 0

                    if verbose:
                        print("atari %i reloaded" % game_i)

            # append an observation -> action -> reward tuple
            history_log.append((self.prev_observations, actions, cur_rewards,
                                new_memory_states, is_done, infos))

            self.prev_observations = new_observations
            self.prev_memory_states = new_memory_states

        # cast to numpy arrays
        observation_log, action_log, reward_log, memories_log, is_done_log, info_log = \
            zip(*history_log)

        # tensor dimensions: [batch_i, time_i, observation_size...]
        observation_log = np.array(observation_log).swapaxes(0, 1)

        # [batch, time, units] for each memory tensor
        memories_log = list(map(lambda mem: np.array(mem).swapaxes(0, 1), zip(*memories_log)))

        # [batch_i, time_i]
        action_log = np.array(action_log).swapaxes(0, 1)

        # [batch_i, time_i]
        reward_log = np.array(reward_log).swapaxes(0, 1)

        # [batch_i, time_i]
        is_alive_log = 1 - np.array(is_done_log, dtype='int8').swapaxes(0, 1)

        return observation_log, action_log, reward_log, memories_log, is_alive_log, info_log

    def update(self, n_steps=100, append=False, max_size=None):
        """Create new sessions and add them to the pool.
        Unless append=True, the old sessions are thrown away entirely for simplicity."""
        preceding_memory_states = list(self.prev_memory_states)

        # get interaction sessions
        observation_tensor, action_tensor, reward_tensor, _, is_alive_tensor, _ = \
            self.interact(n_steps=n_steps)

        # load them into the experience replay environment
        if not append:
            self.experience_replay.load_sessions(observation_tensor, action_tensor,
                                                 reward_tensor, is_alive_tensor,
                                                 preceding_memory_states)
        else:
            self.experience_replay.append_sessions(observation_tensor, action_tensor,
                                                   reward_tensor, is_alive_tensor,
                                                   preceding_memory_states,
                                                   max_pool_size=max_size or self.max_size)

    def evaluate(self, n_games=1, save_path="./records", record_video=True,
                 verbose=True, t_max=10000):
        """Play n_games from start to end, record the logs (and optionally an mp4 video),
        and return the mean total reward.

        :param save_path: where to save the report
        :param record_video: if True, records an mp4 video
        :return: mean total reward over n_games (scalar)
        """
        env = Atari(self.game_title, **self.game_kwargs)

        if record_video:
            env.monitor.start(save_path, force=True)
        else:
            # pass a video_callable that never records
            env.monitor.start(save_path, lambda i: False, force=True)

        game_rewards = []
        for _ in range(n_games):
            # initial observation
            observation = env.reset()
            # initial memory
            prev_memories = [
                np.zeros((1,) + tuple(mem.output_shape[1:]), dtype=get_layer_dtype(mem))
                for mem in self.agent.agent_states
            ]

            t = 0
            total_reward = 0
            while True:
                res = self.agent_step(observation[None, ...], *prev_memories)
                action, new_memories = res[0], res[1:]

                observation, reward, done, info = env.step(action[0])
                total_reward += reward
                prev_memories = new_memories

                if done or t >= t_max:
                    if verbose:
                        print("Episode finished after {} timesteps with reward={}".format(
                            t + 1, total_reward))
                    break
                t += 1

            game_rewards.append(total_reward)

        env.monitor.close()
        del env
        return np.mean(game_rewards)
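
# interact() above records its history indexed [time][batch] and then transposes to the
# [batch, time, ...] layout the replay environment expects; is_alive is simply the
# complement of is_done.  A tiny self-contained illustration with made-up numbers
# (not part of the pool API):
def _demo_log_layout():
    # rewards and done-flags for 2 parallel games over 3 ticks, indexed [time][batch]
    reward_history = [(0.0, 1.0), (0.5, 0.0), (1.0, 2.0)]
    is_done_history = [(False, False), (False, True), (True, False)]

    reward_log = np.array(reward_history).swapaxes(0, 1)                      # shape [2, 3]
    is_alive_log = 1 - np.array(is_done_history, dtype='int8').swapaxes(0, 1)

    # reward_log[0] is game 0's rewards over time: [0. , 0.5, 1. ]
    # is_alive_log[1] marks game 1 as "done" at tick 1: [1, 0, 1]
    return reward_log, is_alive_log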