import numpy as np
import tensorflow as tf
from tensorflow.contrib.layers import fully_connected, batch_norm

# ReplayBuffer (experience replay) and OUProcess (Ornstein-Uhlenbeck exploration
# noise) are assumed to be defined elsewhere in the project.


class NAF(object):
    def __init__(self, sess, env, state_dim, action_dim,
                 max_buffer_size=100000, update_per_iteration=5,
                 mini_batch_size=64, discount=0.99, batch_norm=True,
                 learning_rate=1e-3, tau=0.001, hidden_layers=[200, 200]):
        self.session = sess
        self.env = env
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_lb = self.env.action_space.low
        self.action_ub = self.env.action_space.high
        self.discount = discount
        self.batch_norm = batch_norm
        self.mini_batch_size = mini_batch_size
        self.update_per_iteration = update_per_iteration
        self.hidden_layers = hidden_layers

        self.replay_buffer = ReplayBuffer(max_buffer_size, state_dim, action_dim)
        self.exploration = OUProcess(self.action_dim)

        self.network = {}
        self.network['x'], self.network['u'], self.network['is_train'], self.network['V'], self.network['P'], \
            self.network['M'], self.network['Q'], self.network['variables'] = self.create_networks(is_target=False)

        self.target = {}
        self.target['x'], self.target['u'], _, self.target['V'], self.target['P'], \
            self.target['M'], self.target['Q'], self.target['variables'] = self.create_networks(is_target=True)

        # define the optimization operations
        self.network_optimization = {}
        with tf.name_scope('optimization'):
            self.network_optimization['y'] = tf.placeholder(tf.float32, shape=(None, 1), name='y')
            self.network_optimization['loss'] = tf.reduce_mean(
                tf.squared_difference(self.network['Q'], self.network_optimization['y']), name='loss')
            self.network_optimization['optimize'] = tf.train.AdamOptimizer(
                learning_rate).minimize(self.network_optimization['loss'])

        # define the operations for computing the y value
        self.y_compute = {}
        with tf.name_scope('y'):
            # y = reward + (1 - terminal) * gamma * V
            self.y_compute['r'] = tf.placeholder(tf.float32, shape=(None, 1))
            self.y_compute['t'] = tf.placeholder(tf.int8, shape=(None, 1))
            self.y_compute['v'] = tf.placeholder(tf.float32, shape=(None, 1))
            self.y_compute['y'] = tf.to_float(self.y_compute['t'])
            self.y_compute['y'] = tf.mul(self.y_compute['y'], -1.0)
            self.y_compute['y'] = tf.add(self.y_compute['y'], 1.0)
            self.y_compute['y'] = tf.add(
                self.y_compute['r'],
                tf.mul(tf.mul(self.y_compute['v'], self.discount), self.y_compute['y']))

        # define the soft update operation between the normal networks and the target networks
        self.soft_update_list = []
        with tf.name_scope('soft_update'):
            for source, dest in zip(self.network['variables'], self.target['variables']):
                self.soft_update_list.append(
                    dest.assign(tf.mul(source, tau) + tf.mul(dest, 1.0 - tau)))

        # after defining the computation, we initialize all the variables
        self.session.run(tf.initialize_all_variables())

        summary_writer = tf.train.SummaryWriter('naf.graph', graph_def=self.session.graph)

    def create_networks(self, is_target):
        scope = 'tar_naf' if is_target else 'naf'
        with tf.variable_scope(scope):
            x = tf.placeholder(tf.float32, shape=(None, self.state_dim), name='observation')
            u = tf.placeholder(tf.float32, shape=(None, self.action_dim), name='actions')

            # this determines the mode (training or evaluation) for batch normalization
            if self.batch_norm:
                # the target network is always in evaluation mode
                is_train = False if is_target else tf.placeholder(tf.bool, name='is_train')
            else:
                is_train = None

            # define operations for the value function
            with tf.variable_scope('V'):
                V = x
                # add in the hidden layers
                for hidden_unit_num in self.hidden_layers:
                    if self.batch_norm:
                        V = fully_connected(inputs=V, activation_fn=None, num_outputs=hidden_unit_num)
                        # NOTE: we set updates_collections to None to force the updates of mean and variance in place
                        V = batch_norm(inputs=V, center=True, scale=True, activation_fn=tf.nn.relu,
                                       is_training=is_train, updates_collections=None)
                    else:
                        V = fully_connected(inputs=V, activation_fn=tf.nn.relu, num_outputs=hidden_unit_num)
                # add in the last layer
                V = fully_connected(inputs=V, activation_fn=None, num_outputs=1)

            # define operations for computing the covariance matrix
            with tf.variable_scope('L'):
                L = x
                # add in the hidden layers
                for hidden_unit_num in self.hidden_layers:
                    if self.batch_norm:
                        L = fully_connected(inputs=L, activation_fn=None, num_outputs=hidden_unit_num)
                        # NOTE: we set updates_collections to None to force the updates of mean and variance in place
                        L = batch_norm(inputs=L, center=True, scale=True, activation_fn=tf.nn.relu,
                                       is_training=is_train, updates_collections=None)
                    else:
                        L = fully_connected(inputs=L, activation_fn=tf.nn.relu, num_outputs=hidden_unit_num)
                L = fully_connected(inputs=L, activation_fn=None,
                                    num_outputs=(self.action_dim * (self.action_dim + 1) / 2))

                # construct the upper triangular matrix U
                pivot = 0
                rows = []
                for index in xrange(self.action_dim):
                    count = self.action_dim - index

                    # slice one element at position pivot from the second dimension and apply exp to it
                    # NOTE: the first dimension is the batch; -1 means all elements of that dimension are in the slice
                    diag_elem = tf.exp(tf.slice(L, (0, pivot), (-1, 1)))

                    # slice the next count - 1 elements from the second dimension
                    # count is the number of non-zero elements in each row
                    # NOTE: as index grows, count shrinks
                    non_diag_elems = tf.slice(L, (0, pivot + 1), (-1, count - 1))

                    # concatenate the tensors to form one row of the matrix
                    non_zero_elements = tf.concat(1, (diag_elem, non_diag_elems))

                    # ((0, 0), (index, 0)) are the paddings; the matrix is two-d, so the tuple has two entries
                    # the first (0, 0) pads nothing in the batch dimension
                    # (index, 0) pads `index` zeros before the elements in the second dimension, which is what we want
                    row = tf.pad(non_zero_elements, ((0, 0), (index, 0)))
                    rows.append(row)

                    # skip over the elements we already used
                    pivot += count

                # pack the list of rank-R tensors into one rank-(R+1) tensor
                # axis = 1 means the second dimension
                # NOTE: this gives the upper triangular matrix U, not L
                L = tf.pack(rows, axis=1)

                # covariance matrix P = L * L^{T} = U^{T} * U
                P = tf.batch_matmul(tf.transpose(L, perm=[0, 2, 1]), L)

            # define operations for computing Mu
            with tf.variable_scope('M'):
                M = x
                # add in the hidden layers
                for hidden_unit_num in self.hidden_layers:
                    if self.batch_norm:
                        M = fully_connected(inputs=M, activation_fn=None, num_outputs=hidden_unit_num)
                        # NOTE: we set updates_collections to None to force the updates of mean and variance in place
                        # see https://github.com/tensorflow/tensorflow/issues/1122
                        M = batch_norm(inputs=M, center=True, scale=True, activation_fn=tf.nn.relu,
                                       is_training=is_train, updates_collections=None)
                    else:
                        M = fully_connected(inputs=M, activation_fn=tf.nn.relu, num_outputs=hidden_unit_num)
                # add in the last layer
                M = fully_connected(inputs=M, activation_fn=tf.tanh, num_outputs=self.action_dim)

            # define operations for computing the advantage function
            with tf.name_scope('A'):
                # expand u - M so each instance becomes a column vector for the matrix multiplications
                # NOTE: the result is a 3-d tensor, but we ignore the first dim which is the batch
                Aprime = tf.expand_dims(u - M, -1)
                # keep the batch dimension fixed and transpose each instance into a row vector
                A = tf.transpose(Aprime, perm=[0, 2, 1])
                # A = -1/2 * (u - M)^{T} * P * (u - M)
                A = -tf.batch_matmul(tf.batch_matmul(A, P), Aprime) / 2
                # make sure the shape is batch_size * 1 for A; -1 means that dim is computed automatically
                # after the last step each A is a 1*1 matrix, so we reshape it to get a scalar
                A = tf.reshape(A, [-1, 1])

            with tf.name_scope('Q'):
                Q = A + V

            # get all the trainable variables from this scope
            variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)

            return x, u, is_train, V, P, M, Q, variables

    def predict_target_v(self, x):
        return self.session.run(self.target['V'], feed_dict={self.target['x']: x})

    def get_y(self, v, r, t):
        return self.session.run(self.y_compute['y'],
                                feed_dict={
                                    self.y_compute['r']: r,
                                    self.y_compute['v']: v,
                                    self.y_compute['t']: t
                                })

    def optimize_network(self, x, u, is_train, y):
        if self.batch_norm:
            feed_dict = {
                self.network['x']: x,
                self.network['u']: u,
                self.network['is_train']: is_train,
                self.network_optimization['y']: y
            }
        else:
            feed_dict = {
                self.network['x']: x,
                self.network['u']: u,
                self.network_optimization['y']: y
            }
        return self.session.run(self.network_optimization['optimize'], feed_dict=feed_dict)

    def predict_action(self, x, is_train):
        if self.batch_norm:
            feed_dict = {self.network['x']: x, self.network['is_train']: is_train}
        else:
            feed_dict = {self.network['x']: x}
        return self.session.run([self.network['M'], self.network['P']], feed_dict=feed_dict)

    def get_action(self, s):
        s = np.reshape(s, (1, self.state_dim))
        a, covariance = self.predict_action(s, False)
        return self.exploration.add_noise(a[0], self.action_lb, self.action_ub)

    def soft_update(self):
        self.session.run(self.soft_update_list)

    def learn(self, s, a, sprime, r, terminal):
        # first add the sample to the replay buffer
        self.replay_buffer.add(s, a, sprime, r, terminal)
        # we start learning once we have enough samples for one minibatch
        if self.replay_buffer.get_size() > self.mini_batch_size:
            # we do the update with several batches in each turn
            for i in xrange(self.update_per_iteration):
                state_set, action_set, sprime_set, reward_set, terminal_set = self.replay_buffer.sample_batch(
                    self.mini_batch_size)
                # compute V'
                v = self.predict_target_v(sprime_set)
                # compute y = r + gamma * V'
                y = self.get_y(v, reward_set, terminal_set)
                # optimize the critic using y, with batch normalization in training mode
                self.optimize_network(state_set, action_set, True, y)
                # use soft updates to update the target networks
                self.soft_update()

    def reset_exploration(self):
        self.exploration.reset()
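Wiring the class to an environment is straightforward. The loop below is only a minimal usage sketch, not part of the implementation above: it assumes an OpenAI Gym continuous-control task such as Pendulum-v0, and the episode and step counts are illustrative placeholders rather than tuned values.

import gym

def run_naf_example(num_episodes=200, max_steps=200):
    env = gym.make('Pendulum-v0')
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]

    with tf.Session() as sess:
        agent = NAF(sess, env, state_dim, action_dim)
        for episode in xrange(num_episodes):
            s = env.reset()
            agent.reset_exploration()
            total_reward = 0.0
            for _ in xrange(max_steps):
                # exploratory action = mu(s) + OU noise, clipped to the action bounds
                a = agent.get_action(s)
                sprime, r, done, _ = env.step(a)
                # store the transition and, once the buffer is large enough, run the updates
                agent.learn(s, a, sprime, r, done)
                total_reward += r
                s = sprime
                if done:
                    break
            print('episode %d, reward %.2f' % (episode, total_reward))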
class DDPG(object):
    def __init__(self, sess, env, state_dim, action_dim,
                 max_buffer_size=100000, update_per_iteration=5,
                 mini_batch_size=64, discount=0.99, batch_norm=True,
                 actor_learning_rate=0.0001, critic_learning_rate=0.001,
                 tau=0.001, hidden_layers=[400, 300]):
        self.session = sess
        self.env = env
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_lb = self.env.action_space.low
        self.action_ub = self.env.action_space.high
        self.discount = discount
        self.batch_norm = batch_norm
        self.mini_batch_size = mini_batch_size
        self.update_per_iteration = update_per_iteration
        self.hidden_layers = hidden_layers

        self.replay_buffer = ReplayBuffer(max_buffer_size, state_dim, action_dim)
        self.exploration = OUProcess(self.action_dim)

        # we define the operations that are used in this algorithm
        self.critic = {}
        self.critic['x'], self.critic['u'], self.critic['is_train'], self.critic['q'], \
            self.critic['variables'] = self.create_critic_network(is_target=False)

        self.target_critic = {}
        self.target_critic['x'], self.target_critic['u'], _, self.target_critic['q'], \
            self.target_critic['variables'] = self.create_critic_network(is_target=True)

        self.actor = {}
        self.actor['x'], self.actor['is_train'], self.actor['a'], \
            self.actor['variables'] = self.create_actor_network(is_target=False)

        self.target_actor = {}
        self.target_actor['x'], _, self.target_actor['a'], \
            self.target_actor['variables'] = self.create_actor_network(is_target=True)

        self.critic_optimization = {}
        with tf.name_scope('critic_optimization'):
            self.critic_optimization['y'] = tf.placeholder(tf.float32, shape=(None, 1), name='y')
            self.critic_optimization['loss'] = tf.reduce_mean(
                tf.squared_difference(self.critic['q'], self.critic_optimization['y']), name='loss')
            self.critic_optimization['optimize'] = tf.train.AdamOptimizer(
                critic_learning_rate).minimize(self.critic_optimization['loss'])

        # define the operation to get y
        self.y_compute = {}
        with tf.name_scope('y'):
            # y = reward + (1 - terminal) * gamma * target_q
            self.y_compute['r'] = tf.placeholder(tf.float32, shape=(None, 1))
            self.y_compute['t'] = tf.placeholder(tf.int8, shape=(None, 1))
            self.y_compute['q'] = tf.placeholder(tf.float32, shape=(None, 1))
            temp = tf.to_float(self.y_compute['t'])
            temp = tf.mul(temp, -1.0)
            temp = tf.add(temp, 1.0)
            self.y_compute['y'] = tf.add(
                self.y_compute['r'],
                tf.mul(tf.mul(self.y_compute['q'], self.discount), temp))

        # define the operation to get the gradient of Q with respect to the action
        self.action_gradients = {}
        with tf.name_scope('action_grads'):
            self.action_gradients['action_grads'] = tf.gradients(self.critic['q'], self.critic['u'])

        self.actor_optimization = {}
        with tf.name_scope('actor_optimization'):
            # first define the placeholder for the gradient of Q with respect to the action
            self.actor_optimization['action_grads'] = tf.placeholder(
                tf.float32, shape=(None, self.action_dim))
            # since the actor uses gradient ascent, we add the minus sign
            self.actor_optimization['actor_variable_grads'] = tf.gradients(
                self.actor['a'], self.actor['variables'],
                -self.actor_optimization['action_grads'])
            self.actor_optimization['optimize'] = tf.train.AdamOptimizer(
                actor_learning_rate).apply_gradients(
                    zip(self.actor_optimization['actor_variable_grads'],
                        self.actor['variables']))

        self.soft_update_list = []
        with tf.name_scope('soft_update'):
            for source, dest in zip(self.critic['variables'], self.target_critic['variables']):
                if 'BatchNorm' not in source.name:
                    self.soft_update_list.append(
                        dest.assign(tf.mul(source, tau) + tf.mul(dest, 1.0 - tau)))
            for source, dest in zip(self.actor['variables'], self.target_actor['variables']):
                if 'BatchNorm' not in source.name:
                    self.soft_update_list.append(
                        dest.assign(tf.mul(source, tau) + tf.mul(dest, 1.0 - tau)))

        # after defining the computation, we initialize all the variables
        self.session.run(tf.initialize_all_variables())

        summary_writer = tf.train.SummaryWriter('critic.graph', graph_def=self.session.graph)

    def create_actor_network(self, is_target):
        scope = 'tar_actor' if is_target else 'actor'
        with tf.variable_scope(scope):
            x = tf.placeholder(tf.float32, shape=(None, self.state_dim), name='observation')

            # this determines the mode (training or evaluation) for batch normalization
            if self.batch_norm:
                # the target network is always in evaluation mode
                is_train = False if is_target else tf.placeholder(tf.bool, name='is_train')
            else:
                is_train = None

            net = x
            for hidden_unit_num in self.hidden_layers:
                if self.batch_norm:
                    net = fully_connected(inputs=net, activation_fn=None, num_outputs=hidden_unit_num)
                    # NOTE: we set updates_collections to None to force the updates of mean and variance in place
                    net = batch_norm(inputs=net, center=True, scale=True, activation_fn=tf.nn.relu,
                                     is_training=is_train, updates_collections=None)
                else:
                    net = fully_connected(inputs=net, activation_fn=tf.nn.relu, num_outputs=hidden_unit_num)

            net = fully_connected(
                inputs=net,
                activation_fn=tf.tanh,
                num_outputs=self.action_dim,
                weights_initializer=tf.random_uniform_initializer(-3e-3, 3e-3),
                biases_initializer=tf.random_uniform_initializer(-3e-3, 3e-3))

            # get all the trainable variables from this scope
            variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)

            return x, is_train, net, variables

    def create_critic_network(self, is_target):
        scope = 'tar_critic' if is_target else 'critic'
        with tf.variable_scope(scope):
            x = tf.placeholder(tf.float32, shape=(None, self.state_dim), name='observation')
            u = tf.placeholder(tf.float32, shape=(None, self.action_dim), name='actions')

            # this determines the mode (training or evaluation) for batch normalization
            if self.batch_norm:
                # the target network is always in evaluation mode
                is_train = False if is_target else tf.placeholder(tf.bool, name='is_train')
            else:
                is_train = None

            # first concatenate the inputs
            # NOTE: this is a different architecture from the original paper; we include the action from the first layer
            with tf.name_scope('merge'):
                net = tf.concat(1, [x, u])

            for hidden_unit_num in self.hidden_layers:
                if self.batch_norm:
                    net = fully_connected(inputs=net, activation_fn=None, num_outputs=hidden_unit_num)
                    # NOTE: we set updates_collections to None to force the updates of mean and variance in place
                    net = batch_norm(inputs=net, center=True, scale=True, activation_fn=tf.nn.relu,
                                     is_training=is_train, updates_collections=None)
                else:
                    net = fully_connected(inputs=net, activation_fn=tf.nn.relu, num_outputs=hidden_unit_num)

            net = fully_connected(
                inputs=net,
                activation_fn=None,
                num_outputs=1,
                weights_initializer=tf.random_uniform_initializer(-3e-3, 3e-3),
                biases_initializer=tf.random_uniform_initializer(-3e-3, 3e-3))

            # get all the trainable variables from this scope
            variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)

            return x, u, is_train, net, variables

    # define the functions for executing operations
    def predict_target_q(self, x, u):
        return self.session.run(self.target_critic['q'],
                                feed_dict={
                                    self.target_critic['x']: x,
                                    self.target_critic['u']: u
                                })

    def predict_target_action(self, x):
        return self.session.run(self.target_actor['a'],
                                feed_dict={self.target_actor['x']: x})

    def get_y(self, q, r, t):
        return self.session.run(self.y_compute['y'],
                                feed_dict={
                                    self.y_compute['r']: r,
                                    self.y_compute['q']: q,
                                    self.y_compute['t']: t
                                })

    def optimize_critic(self, x, u, is_train, y):
        if self.batch_norm:
            return self.session.run(self.critic_optimization['optimize'],
                                    feed_dict={
                                        self.critic['x']: x,
                                        self.critic['u']: u,
                                        self.critic['is_train']: is_train,
                                        self.critic_optimization['y']: y
                                    })
        else:
            return self.session.run(self.critic_optimization['optimize'],
                                    feed_dict={
                                        self.critic['x']: x,
                                        self.critic['u']: u,
                                        self.critic_optimization['y']: y
                                    })

    def predict_action(self, x, is_train):
        if self.batch_norm:
            return self.session.run(self.actor['a'],
                                    feed_dict={
                                        self.actor['x']: x,
                                        self.actor['is_train']: is_train
                                    })
        else:
            return self.session.run(self.actor['a'], feed_dict={self.actor['x']: x})

    def action_grads(self, x, u, is_train):
        if self.batch_norm:
            return self.session.run(self.action_gradients['action_grads'],
                                    feed_dict={
                                        self.critic['x']: x,
                                        self.critic['u']: u,
                                        self.critic['is_train']: is_train
                                    })
        else:
            return self.session.run(self.action_gradients['action_grads'],
                                    feed_dict={
                                        self.critic['x']: x,
                                        self.critic['u']: u
                                    })

    def optimize_actor(self, x, a_grads, is_train):
        if self.batch_norm:
            return self.session.run(self.actor_optimization['optimize'],
                                    feed_dict={
                                        self.actor['x']: x,
                                        self.actor['is_train']: is_train,
                                        self.actor_optimization['action_grads']: a_grads
                                    })
        else:
            return self.session.run(self.actor_optimization['optimize'],
                                    feed_dict={
                                        self.actor['x']: x,
                                        self.actor_optimization['action_grads']: a_grads
                                    })

    def soft_update(self):
        self.session.run(self.soft_update_list)

    def get_action(self, s):
        # first make sure s has the valid shape
        s = np.reshape(s, (1, self.state_dim))
        a = self.predict_action(s, False)
        # a is a batch of size 1, so we take its first element
        return self.exploration.add_noise(a[0], self.action_lb, self.action_ub)

    def learn(self, s, a, sprime, r, t):
        # first add the sample to the replay buffer
        self.replay_buffer.add(s, a, sprime, r, t)
        # we start learning once we have enough samples for one minibatch
        if self.replay_buffer.get_size() > self.mini_batch_size:
            # we do the update with several batches in each turn
            for i in xrange(self.update_per_iteration):
                state_set, action_set, sprime_set, reward_set, terminal_set = self.replay_buffer.sample_batch(
                    self.mini_batch_size)

                # first optimize the critic
                # compute Q'
                q = self.predict_target_q(sprime_set, self.predict_target_action(sprime_set))
                # compute y = r + gamma * Q'
                y = self.get_y(q, reward_set, terminal_set)
                # optimize the critic using y, with batch normalization in training mode
                self.optimize_critic(state_set, action_set, True, y)

                # then optimize the actor
                actions = self.predict_action(state_set, True)
                a_grads = self.action_grads(state_set, actions, False)
                # NOTE: tf.gradients returns a list (one entry per tensor in xs), so we take its first element
                self.optimize_actor(state_set, a_grads[0], True)

                # use soft updates to update the target networks
                self.soft_update()

    def reset_exploration(self):
        self.exploration.reset()
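Both agents bootstrap from the same kind of target, y = r + (1 - terminal) * gamma * Q'(s', mu'(s')) (with V' in place of Q' for NAF), so terminal transitions keep only their immediate reward. The NumPy sketch below mirrors the 'y' name scope defined above with made-up numbers, purely to make the terminal masking explicit; it is illustrative and not part of the implementation.

import numpy as np

gamma = 0.99
r = np.array([[1.0], [0.5], [2.0]])          # rewards
t = np.array([[0], [1], [0]])                # terminal flags (1 = episode ended)
q_prime = np.array([[10.0], [8.0], [-3.0]])  # target-network values for s'

# same masking as the graph ops: y = r + (1 - t) * gamma * q_prime
y = r + (1.0 - t) * gamma * q_prime
# -> [[10.9], [0.5], [-0.97]]: the terminal transition keeps only its reward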
def train(file_name):
    # Create folders.
    if not os.path.isdir(SAVE_DIR):
        os.makedirs(SAVE_DIR)
    if not os.path.isdir(FIGURE_TRAINING_DIR):
        os.makedirs(FIGURE_TRAINING_DIR)

    # Obtain environment parameters.
    env = make_atari(ENV_NAME)
    obs_space = env.observation_space
    action_space = env.action_space
    env.close()

    # Build networks.
    main_network = QValueNetwork(obs_space, action_space, name="main_network")
    target_network = QValueNetwork(obs_space, action_space, name="target_network",
                                   auxiliary_network=main_network)
    variables_initializer = tf.global_variables_initializer()

    # Create parallel environments.
    par_env = ParallelEnvironment([make_atari(ENV_NAME) for _ in range(NUM_ENV)])

    replay_buffer = ReplayBuffer(buffer_size=BUFFER_SIZE)

    start_time = time.time()
    list_episodic_reward = []
    episodic_reward = np.zeros(NUM_ENV)
    obs = par_env.reset()

    with tf.Session() as sess:
        # Initialize all variables.
        sess.run(variables_initializer)

        # Only save the main network.
        saver = tf.train.Saver(var_list=main_network.variables)

        # Initialize buffers.
        while replay_buffer.get_size() < INITIAL_BUFFER_SIZE:
            # Sample random actions.
            action = np.random.randint(action_space.n, size=NUM_ENV)

            # Interact with the environment.
            obs_next, reward, done, _ = par_env.step(action)
            episodic_reward += reward
            for i in range(NUM_ENV):
                if done[i]:
                    episodic_reward[i] = 0

            # Store data.
            for i in range(NUM_ENV):
                data = [obs[i], action[i], reward[i], done[i], obs_next[i]]
                replay_buffer.append(data)

            # Update observation.
            obs = obs_next

        step = 0
        next_target_network_update_step = 0
        next_autosave_step = 0
        while step < TOTAL_STEP:
            # Synchronize the target network periodically (target network <- main network).
            if step >= next_target_network_update_step:
                sess.run(target_network.sync_op)
                next_target_network_update_step += TARGET_NETWORK_UPDATE_STEP

            # Sample actions with the epsilon-greedy policy.
            epsilon = EPSILON_MAX - (EPSILON_MAX - EPSILON_MIN) * np.minimum(
                step / EPSILON_DECAY_STEP, 1)
            random_uniform = np.random.uniform(size=NUM_ENV)
            action = np.zeros(NUM_ENV, dtype=np.int32)

            random_action_index = np.argwhere(random_uniform < epsilon)
            if np.shape(random_action_index)[0] > 0:
                action[tuple(np.transpose(random_action_index))] = np.random.randint(
                    action_space.n, size=np.shape(random_action_index)[0])

            greedy_action_index = np.argwhere(random_uniform >= epsilon)
            if np.shape(greedy_action_index)[0] > 0:
                q = sess.run(target_network.q,
                             feed_dict={
                                 target_network.Obs: np.array(obs)[tuple(
                                     np.transpose(greedy_action_index))] / 255.0
                             })
                action[tuple(np.transpose(greedy_action_index))] = np.argmax(q, axis=1)

            # Interact with the environment.
            obs_next, reward, done, _ = par_env.step(action)
            episodic_reward += reward
            for i in range(NUM_ENV):
                if done[i]:
                    list_episodic_reward.append((step, episodic_reward[i]))
                    delta_time = int(time.time() - start_time)
                    print("Step ", step, "/", TOTAL_STEP,
                          ": Time spent = ", delta_time,
                          " s , Episodic reward = ", episodic_reward[i], sep="")
                    episodic_reward[i] = 0

            # Store data.
            for i in range(NUM_ENV):
                data = [obs[i], action[i], reward[i], done[i], obs_next[i]]
                replay_buffer.append(data)

            # Update observation.
            obs = obs_next

            # Learning rate.
            lr = LEARNING_RATE[-1]
            for i in range(len(LR_ANNEAL_STEP)):
                if step < LR_ANNEAL_STEP[i]:
                    lr = LEARNING_RATE[i]
                    break

            for _ in range(NUM_ENV):
                # Sample training data from the replay buffer.
                batch_data = replay_buffer.sample(BATCH_SIZE)
                batch_obs, batch_action, batch_reward, batch_done, batch_obs_next = \
                    [np.array([batch_data[j][i] for j in range(BATCH_SIZE)])
                     for i in range(len(batch_data[0]))]

                # Compute the target Q value:
                #   target_q = r + (1 - done) * REWARD_DISCOUNT * max[q(s', a)]
                q_next = sess.run(target_network.q,
                                  feed_dict={target_network.Obs: batch_obs_next / 255.0})
                max_qnext = np.amax(q_next, axis=1)
                target_q = batch_reward + (1 - batch_done) * REWARD_DISCOUNT * max_qnext

                # Update the main network.
                sess.run(main_network.train_op,
                         feed_dict={
                             main_network.Obs: batch_obs / 255.0,
                             main_network.Action: batch_action,
                             main_network.TargetQ: target_q,
                             main_network.LR: lr
                         })

            # Save the main network periodically.
            if step >= next_autosave_step:
                saver.save(sess, SAVE_DIR + file_name)
                next_autosave_step += AUTOSAVE_STEP

            # Update step.
            step += NUM_ENV

        # Save the main network.
        saver = tf.train.Saver(var_list=main_network.variables)
        saver.save(sess, SAVE_DIR + file_name)

    total_time = int(time.time() - start_time)
    print("Training finished in ", total_time, " s.", sep="")

    # Close the environment.
    par_env.close()

    # Plot the episodic reward against training step curve.
    plot_episodic_reward(list_episodic_reward, file_name)
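Once train() has written a checkpoint to SAVE_DIR, the saved main network can be restored for a greedy evaluation run. The sketch below is illustrative only: it reuses names that appear in train() (QValueNetwork, make_atari, ENV_NAME, SAVE_DIR) and assumes the main QValueNetwork exposes the same Obs placeholder and q tensor that train() uses on the target network; the episode count is arbitrary.

def evaluate(file_name, num_episodes=10):
    # Rebuild the main network graph and restore the saved weights.
    env = make_atari(ENV_NAME)
    main_network = QValueNetwork(env.observation_space, env.action_space,
                                 name="main_network")
    saver = tf.train.Saver(var_list=main_network.variables)

    with tf.Session() as sess:
        saver.restore(sess, SAVE_DIR + file_name)
        for episode in range(num_episodes):
            obs = env.reset()
            done = False
            episodic_reward = 0.0
            while not done:
                # Greedy action from the restored Q-network, with observations
                # scaled by 255 exactly as during training.
                q = sess.run(main_network.q,
                             feed_dict={main_network.Obs: np.array([obs]) / 255.0})
                action = int(np.argmax(q, axis=1)[0])
                obs, reward, done, _ = env.step(action)
                episodic_reward += reward
            print("Episode ", episode, ": reward = ", episodic_reward, sep="")
    env.close()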