def __init__(self,
             pool,
             observation_shape,
             n_actions,
             n_parallel_games=1,
             replay_seq_len=20,
             replay_batch_size=20,
             pool_size=None,
             n_steps=3,
             gamma=0.99):
    """
    :type n_parallel_games: int
    :type n_actions: int
    """
    # Parameters for training
    self.n_parallel_games = n_parallel_games
    self.replay_seq_len = replay_seq_len
    self.replay_batch_size = replay_batch_size
    self.pool_size = pool_size
    self.n_steps = n_steps
    self.gamma = gamma
    self.loss = None

    # image observation
    self.observation_layer = InputLayer(observation_shape)
    self.n_actions = n_actions

    self.resolver, self.agent = self.build_model()

    weights = lasagne.layers.get_all_params(self.resolver, trainable=True)

    self.applier_fun = self.agent.get_react_function()

    # Prepare replay pool
    env = SessionPoolEnvironment(observations=self.observation_layer,
                                 actions=self.resolver,
                                 agent_memories=self.agent.state_variables)

    preceding_memory_states = list(pool.prev_memory_states)

    # get interaction sessions
    observation_tensor, action_tensor, reward_tensor, _, is_alive_tensor, _ = \
        pool.interact(self.step, n_steps=self.replay_seq_len)

    env.load_sessions(observation_tensor, action_tensor, reward_tensor,
                      is_alive_tensor, preceding_memory_states)

    if pool_size is None:
        batch_env = env
    else:
        batch_env = env.sample_session_batch(self.replay_batch_size)

    self.loss = self.build_loss(batch_env)
    self.eval_fun = self.build_eval_fun(batch_env)

    updates = lasagne.updates.adadelta(self.loss, weights, learning_rate=0.01)

    train_fun = theano.function([], [self.loss], updates=updates)

    super(BasicRLAgent, self).__init__(env, pool, train_fun, pool_size,
                                       replay_seq_len)
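# The build_model / build_loss / build_eval_fun helpers referenced above are defined
# elsewhere and not shown here. As a rough, hypothetical sketch only (not the actual
# implementation), build_loss could follow the same n-step Q-learning recipe used by
# HierarchicalAgent.build_loss below:
#
#     def build_loss(self, env):
#         _, _, _, _, qvalues_seq = self.agent.get_sessions(
#             env,
#             session_length=self.replay_seq_len,
#             batch_size=env.batch_size,
#             optimize_experience_replay=True)
#         elwise_loss = qlearning_n_step.get_elementwise_objective(
#             qvalues_seq, env.actions[0], env.rewards, env.is_alive,
#             gamma_or_gammas=self.gamma, n_steps=self.n_steps)
#         # average the per-tick losses over "alive" session fragments only
#         return elwise_loss.sum() / env.is_alive.sum()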
def test_space_invaders(game_title='SpaceInvaders-v0',
                        n_parallel_games=3,
                        replay_seq_len=2):
    """
    :param game_title: name of atari game in Gym
    :param n_parallel_games: how many games we run in parallel
    :param replay_seq_len: how long is one replay session from a batch
    """
    atari = gym.make(game_title)
    atari.reset()

    # Game Parameters
    n_actions = atari.action_space.n
    observation_shape = (None, ) + atari.observation_space.shape
    action_names = atari.get_action_meanings()
    del atari

    # ##### Agent observations

    # image observation at current tick goes here
    observation_layer = InputLayer(observation_shape, name="images input")

    # reshape to [batch, color, x, y] to allow convolutional layers to work correctly
    observation_reshape = DimshuffleLayer(observation_layer, (0, 3, 1, 2))

    # Agent memory states
    window_size = 3

    # prev state input
    prev_window = InputLayer(
        (None, window_size) + tuple(observation_reshape.output_shape[1:]),
        name="previous window state")

    # our window
    window = WindowAugmentation(observation_reshape,
                                prev_window,
                                name="new window state")

    memory_dict = {window: prev_window}

    # ##### Neural network body
    # you may use any other lasagne layers, including convolutions, batch_norms, maxout, etc.

    # pixel-wise maximum over the temporal window (to avoid flickering)
    window_max = ExpressionLayer(window,
                                 lambda a: a.max(axis=1),
                                 output_shape=(None, ) + window.output_shape[2:])

    # a simple lasagne network (try replacing with any other lasagne network and see what works best)
    nn = DenseLayer(window_max, num_units=50, name='dense0')

    # Agent policy and action picking
    q_eval = DenseLayer(nn,
                        num_units=n_actions,
                        nonlinearity=lasagne.nonlinearities.linear,
                        name="QEvaluator")

    # fakes for a2c
    policy_eval = DenseLayer(nn,
                             num_units=n_actions,
                             nonlinearity=lasagne.nonlinearities.softmax,
                             name="a2c action probas")
    state_value_eval = DenseLayer(nn,
                                  num_units=1,
                                  nonlinearity=None,
                                  name="a2c state values")

    # resolver
    resolver = ProbabilisticResolver(policy_eval, name="resolver")

    # agent
    agent = Agent(observation_layer, memory_dict,
                  (q_eval, policy_eval, state_value_eval), resolver)

    # Since it's a single lasagne network, one can get its weights, output, etc.
    weights = lasagne.layers.get_all_params(resolver, trainable=True)

    # Agent step function
    print('compiling react')
    applier_fun = agent.get_react_function()

    # a nice pythonic interface
    def step(observation, prev_memories='zeros', batch_size=n_parallel_games):
        """ returns actions and new states given observation and prev state.
        Prev state in default setup should be [prev window,]"""
        # default to zeros
        if prev_memories == 'zeros':
            prev_memories = [
                np.zeros((batch_size, ) + tuple(mem.output_shape[1:]),
                         dtype='float32') for mem in agent.agent_states
            ]
        res = applier_fun(np.array(observation), *prev_memories)
        action = res[0]
        memories = res[1:]
        return action, memories

    # # Create and manage a pool of atari sessions to play with
    pool = GamePool(game_title, n_parallel_games)

    observation_log, action_log, reward_log, _, _, _ = pool.interact(step, 50)

    print(np.array(action_names)[np.array(action_log)[:3, :5]])

    # # experience replay pool
    # Create an environment with all default parameters
    env = SessionPoolEnvironment(observations=observation_layer,
                                 actions=resolver,
                                 agent_memories=agent.agent_states)

    def update_pool(env, pool, n_steps=100):
        """ a function that creates new sessions and adds them into the pool,
        throwing the old ones away entirely for simplicity"""
        preceding_memory_states = list(pool.prev_memory_states)

        # get interaction sessions
        observation_tensor, action_tensor, reward_tensor, _, is_alive_tensor, _ = pool.interact(
            step, n_steps=n_steps)

        # load them into experience replay environment
        env.load_sessions(observation_tensor, action_tensor, reward_tensor,
                          is_alive_tensor, preceding_memory_states)

    # load first sessions
    update_pool(env, pool, replay_seq_len)

    # A more sophisticated way of training is to store a large pool of sessions
    # and train on random batches of them (see the sketch after this test).

    # ### Training via experience replay

    # get agent's Q-values, policy, etc. obtained via experience replay
    _env_states, _observations, _memories, _imagined_actions, estimators = agent.get_sessions(
        env,
        session_length=replay_seq_len,
        batch_size=env.batch_size,
        optimize_experience_replay=True,
    )
    (q_values_sequence, policy_sequence, value_sequence) = estimators

    # Evaluating loss function
    scaled_reward_seq = env.rewards
    # For SpaceInvaders, not scaling rewards works reasonably well

    elwise_mse_loss = 0.

    # 1-step algos
    for algo in qlearning, sarsa:
        elwise_mse_loss += algo.get_elementwise_objective(
            q_values_sequence,
            env.actions[0],
            scaled_reward_seq,
            env.is_alive,
            gamma_or_gammas=0.99,
        )

    # qlearning_n_step
    for n in (1, 3, replay_seq_len - 1, replay_seq_len, replay_seq_len + 1, None):
        elwise_mse_loss += qlearning_n_step.get_elementwise_objective(
            q_values_sequence,
            env.actions[0],
            scaled_reward_seq,
            env.is_alive,
            gamma_or_gammas=0.99,
            n_steps=n)

    # a2c n_step
    elwise_mse_loss += a2c_n_step.get_elementwise_objective(
        policy_sequence,
        value_sequence[:, :, 0],
        env.actions[0],
        scaled_reward_seq,
        env.is_alive,
        gamma_or_gammas=0.99,
        n_steps=3)

    # compute mean over "alive" fragments
    mse_loss = elwise_mse_loss.sum() / env.is_alive.sum()

    # regularize network weights
    reg_l2 = regularize_network_params(resolver, l2) * 10**-4

    loss = mse_loss + reg_l2

    # Compute weight updates
    updates = lasagne.updates.adadelta(loss, weights, learning_rate=0.01)

    # mean session reward
    mean_session_reward = env.rewards.sum(axis=1).mean()

    # # Compile train and evaluation functions
    print('compiling')
    train_fun = theano.function([], [loss, mean_session_reward],
                                updates=updates)
    evaluation_fun = theano.function(
        [], [loss, mse_loss, reg_l2, mean_session_reward])
    print("I've compiled!")

    # # Training loop
    for epoch_counter in range(10):
        update_pool(env, pool, replay_seq_len)
        loss, avg_reward = train_fun()
        full_loss, q_loss, l2_penalty, avg_reward_current = evaluation_fun()

        print("epoch %i, loss %.5f, rewards: %.5f" %
              (epoch_counter, full_loss, avg_reward_current))
        print("rec %.3f reg %.3f" % (q_loss, l2_penalty))
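# A hedged sketch of the "large pool of sessions, random batches" variant mentioned in
# test_space_invaders above. It reuses only calls that already appear in this code
# (SessionPoolEnvironment.append_sessions with max_pool_size, sample_session_batch);
# the helper name and the pool_size / batch_size defaults are illustrative, not part
# of the original test.
def append_and_sample(env, pool, step, n_steps=100, pool_size=1000, batch_size=32):
    """Append freshly played sessions to the replay pool (capped at pool_size)
    and return a random batch of stored sessions to train on."""
    preceding_memory_states = list(pool.prev_memory_states)

    # play n_steps more ticks in all parallel games
    observation_tensor, action_tensor, reward_tensor, _, is_alive_tensor, _ = \
        pool.interact(step, n_steps=n_steps)

    # append instead of overwriting; the stored pool is capped at max_pool_size sessions
    env.append_sessions(observation_tensor, action_tensor, reward_tensor,
                        is_alive_tensor, preceding_memory_states,
                        max_pool_size=pool_size)

    # a batch environment over a random subset of the stored sessions
    return env.sample_session_batch(batch_size)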
class HierarchicalAgent(MdpAgent):
    def __init__(self,
                 pool,
                 observation_shape,
                 n_actions,
                 n_parallel_games=1,
                 replay_seq_len=20,
                 replay_batch_size=20,
                 pool_size=None,
                 n_steps=3,
                 gamma=0.99,
                 split_into=1):  # gru0_size=128
        self.n_parallel_games = n_parallel_games
        self.replay_seq_len = replay_seq_len
        self.replay_batch_size = replay_batch_size
        self.pool_size = pool_size
        self.n_steps = n_steps
        self.n_actions = n_actions
        self.gamma = gamma
        self.split_into = split_into

        self.controller = Controller(observation_shape, n_actions)
        self.metacontroller = MetaController(self.controller)  # , gru0_size)

        # Prepare replay pool
        self.controller_env = SessionPoolEnvironment(
            observations=self.controller.agent.observation_layers,
            actions=self.controller.resolver,
            agent_memories=self.controller.agent.agent_states)
        self.metacontroller_env = SessionPoolEnvironment(
            observations=self.metacontroller.agent.observation_layers,
            actions=self.metacontroller.resolver,
            agent_memories=self.metacontroller.agent.agent_states)

        # get interaction sessions
        observation_log, action_tensor, extrinsic_reward_log, memory_log, is_alive_tensor, _ = \
            pool.interact(self.step, n_steps=self.replay_seq_len)
        preceding_memory_states = list(pool.prev_memory_states)

        self.reload_pool(observation_log, action_tensor, extrinsic_reward_log,
                         is_alive_tensor, memory_log, preceding_memory_states)

        if pool_size is None:
            controller_batch_env = self.controller_env
            metacontroller_batch_env = self.metacontroller_env
        else:
            controller_batch_env = self.controller_env.sample_session_batch(
                self.replay_batch_size)
            metacontroller_batch_env = self.metacontroller_env.sample_session_batch(
                self.replay_batch_size)

        self.loss = self.build_loss(controller_batch_env, self.controller.agent, 50) + \
            self.build_loss(metacontroller_batch_env, self.metacontroller.agent, 10)

        self.eval_fun = self.build_eval_fun(metacontroller_batch_env)

        weights = self.controller.weights + self.metacontroller.weights

        updates = lasagne.updates.adadelta(self.loss, weights,
                                           learning_rate=0.01)

        mean_session_reward = metacontroller_batch_env.rewards.sum(
            axis=1).mean()
        train_fun = theano.function([], [self.loss, mean_session_reward],
                                    updates=updates)

        super(HierarchicalAgent, self).__init__(
            [self.controller_env, self.metacontroller_env], pool, train_fun,
            pool_size, replay_seq_len)

    def reload_pool(self, observation_tensor, action_tensor,
                    extrinsic_reward_tensor, is_alive_tensor, memory_tensor,
                    preceding_memory_states):
        batch_size = observation_tensor.shape[0]

        meta_obs_log, goal_log, meta_V, itrs = memory_tensor[-4:]
        itr = itrs[0]

        pivot = len(self.controller.agent.state_variables)
        controller_preceding_states = preceding_memory_states[:pivot]
        metacontroller_preceding_states = preceding_memory_states[pivot:-4]

        ### CONTROLLER ###
        # load sessions into the controller's experience replay environment
        # FIXME: controller_preceding_states are not passed to load_sessions yet
        ctrl_shape = (batch_size * self.split_into,
                      self.replay_seq_len // self.split_into)

        # intrinsic reward = change of the metacontroller's value estimate
        # (see the numeric sketch after this class)
        intrinsic_rewards = np.concatenate(
            [np.zeros([meta_V.shape[0], 1]), np.diff(meta_V, axis=1)], axis=1)

        self.controller_env.load_sessions(
            [
                observation_tensor.reshape(
                    ctrl_shape + self.controller.observation_shape[1:]),
                goal_log.reshape(ctrl_shape)
            ],
            action_tensor.reshape(ctrl_shape),
            intrinsic_rewards.reshape(ctrl_shape),
            is_alive_tensor.reshape(ctrl_shape),
            # controller_preceding_states
        )

        ### METACONTROLLER ###
        # separate case for metacontroller
        extrinsic_reward_sums = np.diff(
            np.concatenate([
                np.zeros_like(extrinsic_reward_tensor[:, 0, None]),
                extrinsic_reward_tensor.cumsum(axis=-1)[:, itr == 0]
            ], axis=1))

        self.metacontroller_env.load_sessions(
            meta_obs_log[:, itr == 0][:, :10],
            goal_log[:, itr == 0][:, :10],
            extrinsic_reward_sums[:, :10],
            is_alive_tensor[:, itr == 0][:, :10],
            metacontroller_preceding_states)

    def update_pool(self, observation_tensor, action_tensor,
                    extrinsic_reward_tensor, is_alive_tensor, memory_tensor,
                    preceding_memory_states):
        batch_size = observation_tensor.shape[0]

        meta_obs_log, goal_log, meta_V, itrs = memory_tensor[-4:]
        itr = itrs[0]

        pivot = len(self.controller.agent.state_variables)
        controller_preceding_states = preceding_memory_states[:pivot]
        metacontroller_preceding_states = preceding_memory_states[pivot:-4]

        ### CONTROLLER ###
        # append sessions to the controller's experience replay environment
        ctrl_shape = (batch_size * self.split_into,
                      self.replay_seq_len // self.split_into)

        intrinsic_rewards = np.concatenate(
            [np.zeros([meta_V.shape[0], 1]), np.diff(meta_V, axis=1)], axis=1)

        self.controller_env.append_sessions(
            [
                observation_tensor.reshape(
                    ctrl_shape + self.controller.observation_shape[1:]),
                goal_log.reshape(ctrl_shape)
            ],
            action_tensor.reshape(ctrl_shape),
            intrinsic_rewards.reshape(ctrl_shape),
            is_alive_tensor.reshape(ctrl_shape),
            controller_preceding_states,
            max_pool_size=self.pool_size,
        )

        ### METACONTROLLER ###
        # separate case for metacontroller
        extrinsic_reward_sums = np.diff(
            np.concatenate([
                np.zeros_like(extrinsic_reward_tensor[:, 0, None]),
                extrinsic_reward_tensor.cumsum(axis=-1)[:, itr == 0]
            ], axis=1))

        self.metacontroller_env.append_sessions(
            meta_obs_log[:, itr == 0][:, :10],
            goal_log[:, itr == 0][:, :10],
            extrinsic_reward_sums[:, :10],
            is_alive_tensor[:, itr == 0][:, :10],
            metacontroller_preceding_states,
            max_pool_size=self.pool_size)

    def step(self, env_observation, prev_memories='zeros'):
        """ returns actions and new states given observation and prev state.
        Prev state in default setup should be [prev window,]"""
        batch_size = self.n_parallel_games

        if prev_memories == 'zeros':
            controller_mem = metacontroller_mem = 'zeros'
            meta_inp = np.zeros(
                (batch_size, ) +
                tuple(self.metacontroller.observation_shape[1:]),
                dtype='float32')
            itr = -1  # goal will be defined by the "if itr == 0" branch below
        else:
            pivot = len(self.controller.agent.state_variables)
            controller_mem = prev_memories[:pivot]
            metacontroller_mem = prev_memories[pivot:-4]
            meta_inp, goal, meta_V, itrs = prev_memories[-4:]
            itr = itrs[0]

        itr = (itr + 1) % self.metacontroller.period

        if itr == 0:
            goal, metacontroller_mem, meta_V = self.metacontroller.step(
                meta_inp, metacontroller_mem, batch_size)

        action, controller_mem, meta_inp = self.controller.step(
            env_observation, goal, controller_mem, batch_size)

        new_memories = controller_mem + metacontroller_mem + [
            meta_inp, goal, meta_V, [itr] * self.n_parallel_games
        ]
        return action, new_memories

    def build_loss(self, env, agent, replay_seq_len):
        # get agent's Q-values obtained via experience replay
        _, _, _, _, qvalues_seq = agent.get_sessions(
            env,
            session_length=replay_seq_len,
            batch_size=env.batch_size,
            optimize_experience_replay=True,
        )

        scaled_reward_seq = env.rewards

        elwise_mse_loss = qlearning_n_step.get_elementwise_objective(
            qvalues_seq,
            env.actions[0],
            scaled_reward_seq,
            env.is_alive,
            gamma_or_gammas=self.gamma,
            n_steps=self.n_steps)

        # compute mean over "alive" fragments
        mse_loss = elwise_mse_loss.sum() / env.is_alive.sum()

        # regularize network weights
        reg_l2 = regularize_network_params(agent.state_variables.keys(),
                                           l2) * 10**-5

        return mse_loss + reg_l2

    def build_eval_fun(self, env):
        mean_session_reward = env.rewards.sum(
            axis=1).mean() / self.replay_seq_len
        eval_fun = theano.function([], [mean_session_reward])
        return eval_fun
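# A small numeric illustration (toy values, not from the original code) of the reward
# bookkeeping used in reload_pool / update_pool above: the controller's intrinsic reward
# at each tick is the change in the metacontroller's value estimate, while the
# metacontroller's reward is obtained by differencing the cumulative extrinsic reward
# taken at the goal-selection ticks (itr == 0).
def demo_reward_bookkeeping():
    meta_V = np.array([[0.0, 0.5, 0.4, 1.0]])  # [batch=1, time=4]
    intrinsic = np.concatenate(
        [np.zeros([meta_V.shape[0], 1]), np.diff(meta_V, axis=1)], axis=1)
    print(intrinsic)  # intrinsic == [[0., 0.5, -0.1, 0.6]]

    extrinsic = np.array([[1.0, 0.0, 2.0, 3.0]])  # per-tick extrinsic rewards
    itr = np.array([0, 1, 0, 1])  # a new goal is selected at ticks 0 and 2
    reward_sums = np.diff(
        np.concatenate([np.zeros_like(extrinsic[:, 0, None]),
                        extrinsic.cumsum(axis=-1)[:, itr == 0]], axis=1))
    print(reward_sums)  # reward_sums == [[1., 2.]]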
def test_memory(game_title='SpaceInvaders-v0',
                n_parallel_games=3,
                replay_seq_len=2):
    """
    :param game_title: name of atari game in Gym
    :param n_parallel_games: how many games we run in parallel
    :param replay_seq_len: how long is one replay session from a batch
    """
    atari = gym.make(game_title)
    atari.reset()

    # Game Parameters
    n_actions = atari.action_space.n
    observation_shape = (None, ) + atari.observation_space.shape
    action_names = atari.get_action_meanings()
    del atari

    # ##### Agent observations

    # image observation at current tick goes here
    observation_layer = InputLayer(observation_shape, name="images input")

    # reshape to [batch, color, x, y] to allow convolutional layers to work correctly
    observation_reshape = DimshuffleLayer(observation_layer, (0, 3, 1, 2))

    # Agent memory states
    memory_dict = OrderedDict([])

    ### Window
    window_size = 3

    # prev state input
    prev_window = InputLayer(
        (None, window_size) + tuple(observation_reshape.output_shape[1:]),
        name="previous window state")

    # our window
    window = WindowAugmentation(observation_reshape,
                                prev_window,
                                name="new window state")

    # pixel-wise maximum over the temporal window (to avoid flickering)
    window_max = ExpressionLayer(window,
                                 lambda a: a.max(axis=1),
                                 output_shape=(None, ) + window.output_shape[2:])

    memory_dict[window] = prev_window

    ### Stack
    # prev stack
    stack_w, stack_h = 4, 5
    stack_inputs = DenseLayer(observation_reshape, stack_w, name="prev_stack")
    stack_controls = DenseLayer(observation_reshape,
                                3,
                                nonlinearity=lasagne.nonlinearities.softmax,
                                name="stack controls")
    prev_stack = InputLayer((None, stack_h, stack_w),
                            name="previous stack state")
    stack = StackAugmentation(stack_inputs, prev_stack, stack_controls)
    memory_dict[stack] = prev_stack

    stack_top = lasagne.layers.SliceLayer(stack, 0, 1)

    ### RNN preset
    prev_rnn = InputLayer((None, 16), name="previous RNN state")
    new_rnn = RNNCell(prev_rnn, observation_reshape)
    memory_dict[new_rnn] = prev_rnn

    ### GRU preset
    prev_gru = InputLayer((None, 16), name="previous GRUcell state")
    new_gru = GRUCell(prev_gru, observation_reshape)
    memory_dict[new_gru] = prev_gru

    ### GRUmemorylayer
    prev_gru1 = InputLayer((None, 15), name="previous GRUcell state")
    new_gru1 = GRUMemoryLayer(15, observation_reshape, prev_gru1)
    memory_dict[new_gru1] = prev_gru1

    # LSTM with peepholes
    prev_lstm0_cell = InputLayer(
        (None, 13), name="previous LSTMCell hidden state [with peepholes]")
    prev_lstm0_out = InputLayer(
        (None, 13), name="previous LSTMCell output state [with peepholes]")
    new_lstm0_cell, new_lstm0_out = LSTMCell(
        prev_lstm0_cell,
        prev_lstm0_out,
        input_or_inputs=observation_reshape,
        peepholes=True,
        name="newLSTM1 [with peepholes]")
    memory_dict[new_lstm0_cell] = prev_lstm0_cell
    memory_dict[new_lstm0_out] = prev_lstm0_out

    # LSTM without peepholes
    prev_lstm1_cell = InputLayer(
        (None, 14), name="previous LSTMCell hidden state [no peepholes]")
    prev_lstm1_out = InputLayer(
        (None, 14), name="previous LSTMCell output state [no peepholes]")
    new_lstm1_cell, new_lstm1_out = LSTMCell(
        prev_lstm1_cell,
        prev_lstm1_out,
        input_or_inputs=observation_reshape,
        peepholes=False,
        name="newLSTM1 [no peepholes]")
    memory_dict[new_lstm1_cell] = prev_lstm1_cell
    memory_dict[new_lstm1_out] = prev_lstm1_out

    ## concat everything
    for i in [flatten(window_max), stack_top, new_rnn, new_gru, new_gru1]:
        print(i.output_shape)

    all_memory = concat([
        flatten(window_max),
        stack_top,
        new_rnn,
        new_gru,
        new_gru1,
        new_lstm0_out,
        new_lstm1_out,
    ])

    # ##### Neural network body
    # you may use any other lasagne layers, including convolutions, batch_norms, maxout, etc.

    # a simple lasagne network (try replacing with any other lasagne network and see what works best)
    nn = DenseLayer(all_memory, num_units=50, name='dense0')

    # Agent policy and action picking
    q_eval = DenseLayer(nn,
                        num_units=n_actions,
                        nonlinearity=lasagne.nonlinearities.linear,
                        name="QEvaluator")

    # resolver
    resolver = EpsilonGreedyResolver(q_eval, epsilon=0.1, name="resolver")

    # agent
    agent = Agent(observation_layer, memory_dict, q_eval, resolver)

    # Since it's a single lasagne network, one can get its weights, output, etc.
    weights = lasagne.layers.get_all_params(resolver, trainable=True)

    # Agent step function
    print('compiling react')
    applier_fun = agent.get_react_function()

    # a nice pythonic interface
    def step(observation, prev_memories='zeros', batch_size=n_parallel_games):
        """ returns actions and new states given observation and prev state.
        Prev state in default setup should be [prev window,]"""
        # default to zeros
        if prev_memories == 'zeros':
            prev_memories = [
                np.zeros((batch_size, ) + tuple(mem.output_shape[1:]),
                         dtype='float32') for mem in agent.agent_states
            ]
        res = applier_fun(np.array(observation), *prev_memories)
        action = res[0]
        memories = res[1:]
        return action, memories

    # # Create and manage a pool of atari sessions to play with
    pool = GamePool(game_title, n_parallel_games)

    observation_log, action_log, reward_log, _, _, _ = pool.interact(step, 50)

    print(np.array(action_names)[np.array(action_log)[:3, :5]])

    # # experience replay pool
    # Create an environment with all default parameters
    env = SessionPoolEnvironment(observations=observation_layer,
                                 actions=resolver,
                                 agent_memories=agent.agent_states)

    def update_pool(env, pool, n_steps=100):
        """ a function that creates new sessions and adds them into the pool,
        throwing the old ones away entirely for simplicity"""
        preceding_memory_states = list(pool.prev_memory_states)

        # get interaction sessions
        observation_tensor, action_tensor, reward_tensor, _, is_alive_tensor, _ = pool.interact(
            step, n_steps=n_steps)

        # load them into experience replay environment
        env.load_sessions(observation_tensor, action_tensor, reward_tensor,
                          is_alive_tensor, preceding_memory_states)

    # load first sessions
    update_pool(env, pool, replay_seq_len)

    # A more sophisticated way of training is to store a large pool of sessions
    # and train on random batches of them.

    # ### Training via experience replay

    # get agent's Q-values obtained via experience replay
    _env_states, _observations, _memories, _imagined_actions, q_values_sequence = agent.get_sessions(
        env,
        session_length=replay_seq_len,
        batch_size=env.batch_size,
        optimize_experience_replay=True,
    )

    # Evaluating loss function
    scaled_reward_seq = env.rewards
    # For SpaceInvaders, not scaling rewards works reasonably well

    elwise_mse_loss = qlearning.get_elementwise_objective(
        q_values_sequence,
        env.actions[0],
        scaled_reward_seq,
        env.is_alive,
        gamma_or_gammas=0.99,
    )

    # compute mean over "alive" fragments
    mse_loss = elwise_mse_loss.sum() / env.is_alive.sum()

    # regularize network weights
    reg_l2 = regularize_network_params(resolver, l2) * 10**-4

    loss = mse_loss + reg_l2

    # Compute weight updates
    updates = lasagne.updates.adadelta(loss, weights, learning_rate=0.01)

    # mean session reward
    mean_session_reward = env.rewards.sum(axis=1).mean()

    # # Compile train and evaluation functions
    print('compiling')
    train_fun = theano.function([], [loss, mean_session_reward],
                                updates=updates)
    evaluation_fun = theano.function(
        [], [loss, mse_loss, reg_l2, mean_session_reward])
    print("I've compiled!")

    # # Training loop
    for epoch_counter in range(10):
        update_pool(env, pool, replay_seq_len)
        loss, avg_reward = train_fun()
        full_loss, q_loss, l2_penalty, avg_reward_current = evaluation_fun()

        print("epoch %i, loss %.5f, rewards: %.5f" %
              (epoch_counter, full_loss, avg_reward_current))
        print("rec %.3f reg %.3f" % (q_loss, l2_penalty))
class AtariGamePool(object):
    def __init__(self, agent, game_title, n_games, max_size=None, **kwargs):
        """
        A pool that stores several
            - game states (gym environment)
            - prev_observations - last agent observations
            - prev memory states - last agent hidden states

        :param game_title: name of the game. See the full list at
            http://yavar.naddaf.name/ale/list_of_current_games.html
        :param n_games: number of parallel games
        :param kwargs: options passed to Atari when creating a game. See Atari.__init__
        """
        # create atari games
        self.game_kwargs = kwargs
        self.game_title = game_title
        self.games = [
            Atari(self.game_title, **self.game_kwargs) for _ in range(n_games)
        ]

        # initial observations
        self.prev_observations = [atari.reset() for atari in self.games]

        # agent memory variables (if you use recurrent networks)
        self.prev_memory_states = [
            np.zeros((n_games, ) + tuple(mem.output_shape[1:]),
                     dtype=get_layer_dtype(mem))
            for mem in agent.agent_states
        ]

        # save agent
        self.agent = agent
        self.agent_step = agent.get_react_function()

        # Create experience replay environment
        self.experience_replay = SessionPoolEnvironment(
            observations=agent.observation_layers,
            actions=agent.action_layers,
            agent_memories=agent.agent_states)

        self.max_size = max_size

    def interact(self, n_steps=100, verbose=False):
        """generate interaction sessions with the atari games (openAI gym environments).
        Sessions will have length n_steps.
        Each time one of the games finishes, it is immediately reset.

        :param n_steps: length of an interaction
        :param verbose: if True, prints a small debug message whenever a game
            gets reset after it ends
        :returns: observation_log, action_log, reward_log, [memory_logs], is_alive_log, info_log
            a bunch of tensors [batch, tick, size...];
            the only exception is info_log, which is a list of infos for [time][batch]
        """
        history_log = []

        for _ in range(n_steps):
            res = self.agent_step(self.prev_observations,
                                  *self.prev_memory_states)
            actions, new_memory_states = res[0], res[1:]

            new_observations, cur_rewards, is_done, infos = \
                zip(*map(lambda atari, action: atari.step(action),
                         self.games, actions))

            new_observations = np.array(new_observations)

            for i in range(len(self.games)):
                if is_done[i]:
                    new_observations[i] = self.games[i].reset()

                    for m_i in range(len(new_memory_states)):
                        new_memory_states[m_i][i] = 0

                    if verbose:
                        print("atari %i reloaded" % i)

            # append observation -> action -> reward tuple
            history_log.append((self.prev_observations, actions, cur_rewards,
                                new_memory_states, is_done, infos))

            self.prev_observations = new_observations
            self.prev_memory_states = new_memory_states

        # cast to numpy arrays
        observation_log, action_log, reward_log, memories_log, is_done_log, info_log = zip(
            *history_log)

        # tensor dimensions
        # [batch_i, time_i, observation_size...]
        observation_log = np.array(observation_log).swapaxes(0, 1)

        # [batch, time, units] for each memory tensor
        memories_log = map(lambda mem: np.array(mem).swapaxes(0, 1),
                           zip(*memories_log))

        # [batch_i, time_i]
        action_log = np.array(action_log).swapaxes(0, 1)

        # [batch_i, time_i]
        reward_log = np.array(reward_log).swapaxes(0, 1)

        # [batch_i, time_i]
        is_alive_log = 1 - np.array(is_done_log, dtype='int8').swapaxes(0, 1)

        return observation_log, action_log, reward_log, memories_log, is_alive_log, info_log

    def update(self, n_steps=100, append=False, max_size=None):
        """ a function that creates new sessions and adds them into the pool,
        throwing the old ones away entirely for simplicity (unless append=True)"""
        preceding_memory_states = list(self.prev_memory_states)

        # get interaction sessions
        observation_tensor, action_tensor, reward_tensor, _, is_alive_tensor, _ = self.interact(
            n_steps=n_steps)

        # load them into experience replay environment
        if not append:
            self.experience_replay.load_sessions(observation_tensor,
                                                 action_tensor, reward_tensor,
                                                 is_alive_tensor,
                                                 preceding_memory_states)
        else:
            self.experience_replay.append_sessions(
                observation_tensor,
                action_tensor,
                reward_tensor,
                is_alive_tensor,
                preceding_memory_states,
                max_pool_size=max_size or self.max_size)

    def evaluate(self, n_games=1, save_path="./records", record_video=True,
                 verbose=True, t_max=10000):
        """
        Plays an entire game from start to end, records the logs
        (and possibly an mp4 video), and returns the reward.

        :param save_path: where to save the report
        :param record_video: if True, records mp4 video
        :return: total reward (scalar)
        """
        env = Atari(self.game_title, **self.game_kwargs)

        if record_video:
            env.monitor.start(save_path, force=True)
        else:
            env.monitor.start(save_path, lambda i: False, force=True)

        game_rewards = []
        for _ in range(n_games):
            # initial observation
            observation = env.reset()
            # initial memory
            prev_memories = [
                np.zeros((1, ) + tuple(mem.output_shape[1:]),
                         dtype=get_layer_dtype(mem))
                for mem in self.agent.agent_states
            ]

            t = 0
            total_reward = 0
            while True:
                res = self.agent_step(observation[None, ...], *prev_memories)
                action, new_memories = res[0], res[1:]

                observation, reward, done, info = env.step(action[0])
                total_reward += reward
                prev_memories = new_memories

                if done or t >= t_max:
                    if verbose:
                        print(
                            "Episode finished after {} timesteps with reward={}"
                            .format(t + 1, total_reward))
                    break
                t += 1
            game_rewards.append(total_reward)

        env.monitor.close()
        del env
        return np.mean(game_rewards)
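# A minimal usage sketch for AtariGamePool. `agent` is assumed to be an agentnet Agent
# built as in the test scripts above, and `train_fun` a compiled theano function that
# trains on pool.experience_replay; the function name, game title and hyperparameters
# below are illustrative only, not part of the original module.
def example_training_loop(agent, train_fun, n_epochs=100):
    pool = AtariGamePool(agent, "SpaceInvaders-v0", n_games=8, max_size=1000)
    for _ in range(n_epochs):
        # play 50 more ticks in every parallel game and append them to the replay pool
        pool.update(n_steps=50, append=True)
        train_fun()
    # play full episodes and report the mean total reward per game
    return pool.evaluate(n_games=3, save_path="./records", record_video=False)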