def __init__(self, n_states, n_actions, opt, ouprocess=True, mean_var_path=None, supervised=False):
    """Configure a DDPG agent from a parameter dictionary.

    Args:
        n_states: int, dimension of states
        n_actions: int, dimension of actions
        opt: dict, params; this method reads the keys 'alr', 'clr',
            'model', 'batch_size', 'gamma', 'tau' and 'memory_size'
        ouprocess: flag stored on the instance unchanged
        mean_var_path: optional path of a pickle file holding a
            ``(mean, var)`` pair; when it is None or the file does not
            exist, zero vectors of length ``n_states`` are used instead
        supervised: bool, pre-train the actor with supervised learning;
            when true ``_build_actor()`` is called instead of
            ``_build_network()``
    """
    self.n_states, self.n_actions = n_states, n_actions

    # Hyper-parameters copied out of the option dictionary.
    self.alr, self.clr = opt['alr'], opt['clr']
    self.model_name = opt['model']
    self.batch_size = opt['batch_size']
    self.gamma, self.tau = opt['gamma'], opt['tau']
    self.ouprocess = ouprocess

    # Statistics for the state normalizer: load them when a readable file
    # is given, otherwise fall back to all-zero mean and variance.
    have_stats_file = mean_var_path is not None and os.path.exists(mean_var_path)
    if have_stats_file:
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only point mean_var_path at trusted files.
        with open(mean_var_path, 'rb') as stats_file:
            mean, var = pickle.load(stats_file)
    else:
        mean = np.zeros(n_states)
        var = np.zeros(n_states)
    self.normalizer = Normalizer(mean, var)

    if not supervised:
        # Build Network
        self._build_network()
        logger.info('Finish Initializing Networks')
    else:
        self._build_actor()
        logger.info("Supervised Learning Initialized")

    self.replay_memory = PrioritizedReplayMemory(capacity=opt['memory_size'])
    self.noise = OUProcess(n_actions)
    logger.info('DDPG Initialzed!')
def __init__(self,
             sess,
             env,
             state_dim,
             action_dim,
             max_buffer_size=100000,
             update_per_iteration=5,
             mini_batch_size=64,
             discount=0.99,
             batch_norm=True,
             actor_learning_rate=0.0001,
             critic_learning_rate=0.001,
             tau=0.001,
             hidden_layers=None):
    """Build the DDPG graph: actor, critic, their targets and the training ops.

    Args:
        sess: TensorFlow session used to run the variable initializer.
        env: environment; only ``env.action_space.low`` and ``.high``
            are read here (stored as the action bounds).
        state_dim: size of the observation placeholder's second axis.
        action_dim: size of the action placeholder's second axis.
        max_buffer_size: first argument handed to ``ReplayBuffer``.
        update_per_iteration: stored on the instance.
        mini_batch_size: stored on the instance.
        discount: factor applied to the target Q value when forming y.
        batch_norm: stored on the instance.
        actor_learning_rate: learning rate of the actor's Adam optimizer.
        critic_learning_rate: learning rate of the critic's Adam optimizer.
        tau: mixing weight of the soft target update,
            ``target <- tau * source + (1 - tau) * target``.
        hidden_layers: hidden layer widths; ``None`` (the default) means
            ``[400, 300]``. A ``None`` sentinel is used so that the
            default list is not shared between instances.
    """
    self.session = sess
    self.env = env
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.action_lb = self.env.action_space.low
    self.action_ub = self.env.action_space.high
    self.discount = discount
    self.batch_norm = batch_norm
    self.mini_batch_size = mini_batch_size
    self.update_per_iteration = update_per_iteration
    self.hidden_layers = [400, 300] if hidden_layers is None else hidden_layers

    self.replay_buffer = ReplayBuffer(max_buffer_size, state_dim, action_dim)
    self.exploration = OUProcess(self.action_dim)

    # Define the operations used by the algorithm.
    self.critic = {}
    (self.critic['x'], self.critic['u'], self.critic['is_train'],
     self.critic['q'], self.critic['variables']) = \
        self.create_critic_network(is_target=False)

    self.target_critic = {}
    (self.target_critic['x'], self.target_critic['u'], _,
     self.target_critic['q'], self.target_critic['variables']) = \
        self.create_critic_network(is_target=True)

    self.actor = {}
    (self.actor['x'], self.actor['is_train'], self.actor['a'],
     self.actor['variables']) = self.create_actor_network(is_target=False)

    self.target_actor = {}
    (self.target_actor['x'], _, self.target_actor['a'],
     self.target_actor['variables']) = \
        self.create_actor_network(is_target=True)

    # Critic regression: minimize the mean squared difference between q and y.
    self.critic_optimization = {}
    with tf.name_scope('critic_optimization'):
        self.critic_optimization['y'] = tf.placeholder(
            tf.float32, shape=(None, 1), name='y')
        self.critic_optimization['loss'] = tf.reduce_mean(
            tf.squared_difference(self.critic['q'],
                                  self.critic_optimization['y']),
            name='loss')
        self.critic_optimization['optimize'] = tf.train.AdamOptimizer(
            critic_learning_rate).minimize(self.critic_optimization['loss'])

    # Operation computing the regression target y.
    self.y_compute = {}
    with tf.name_scope('y'):
        # y = reward + (1 - terminal) * gamma * target_q
        self.y_compute['r'] = tf.placeholder(tf.float32, shape=(None, 1))
        self.y_compute['t'] = tf.placeholder(tf.int8, shape=(None, 1))
        self.y_compute['q'] = tf.placeholder(tf.float32, shape=(None, 1))
        not_terminal = tf.to_float(self.y_compute['t'])
        not_terminal = tf.mul(not_terminal, -1.0)
        not_terminal = tf.add(not_terminal, 1.0)
        self.y_compute['y'] = tf.add(
            self.y_compute['r'],
            tf.mul(tf.mul(self.y_compute['q'], self.discount), not_terminal))

    # Gradient of Q with respect to the action input of the critic.
    self.action_gradients = {}
    with tf.name_scope('action_grads'):
        self.action_gradients["action_grads"] = tf.gradients(
            self.critic['q'], self.critic['u'])

    self.actor_optimization = {}
    with tf.name_scope('actor_optimization'):
        # Placeholder through which dQ/da is fed back in.
        self.actor_optimization['action_grads'] = tf.placeholder(
            tf.float32, shape=(None, self.action_dim))
        # The actor performs gradient ascent on Q, hence the minus sign.
        self.actor_optimization['actor_variable_grads'] = tf.gradients(
            self.actor['a'], self.actor['variables'],
            -self.actor_optimization['action_grads'])
        self.actor_optimization['optimize'] = tf.train.AdamOptimizer(
            actor_learning_rate).apply_gradients(
                zip(self.actor_optimization['actor_variable_grads'],
                    self.actor['variables']))

    # Soft target updates, critic first and then actor; variables whose
    # name contains 'BatchNorm' are skipped.
    self.soft_update_list = []
    with tf.name_scope("soft_update"):
        for online, target in ((self.critic, self.target_critic),
                               (self.actor, self.target_actor)):
            for source, dest in zip(online['variables'], target['variables']):
                if 'BatchNorm' not in source.name:
                    self.soft_update_list.append(
                        dest.assign(
                            tf.mul(source, tau) + tf.mul(dest, 1.0 - tau)))

    # With the whole graph defined, initialize all the variables.
    self.session.run(tf.initialize_all_variables())
    # The writer is created only for its side effect; it is not kept.
    tf.train.SummaryWriter('critic.graph', graph_def=self.session.graph)
class DDPG(object):
    """DDPG agent whose actor, critic and target networks live in a TensorFlow graph.

    Each network is kept as a dict of its placeholders, its output tensor and
    its trainable variables; the methods below are thin wrappers that run one
    graph operation with the matching feed dict.
    """

    def __init__(self,
                 sess,
                 env,
                 state_dim,
                 action_dim,
                 max_buffer_size=100000,
                 update_per_iteration=5,
                 mini_batch_size=64,
                 discount=0.99,
                 batch_norm=True,
                 actor_learning_rate=0.0001,
                 critic_learning_rate=0.001,
                 tau=0.001,
                 hidden_layers=None):
        """Build the networks and every training operation.

        Args:
            sess: TensorFlow session used to run all operations.
            env: environment; only ``env.action_space.low`` and ``.high``
                are read (stored as the action bounds).
            state_dim: size of the observation vector.
            action_dim: size of the action vector.
            max_buffer_size: first argument handed to ``ReplayBuffer``.
            update_per_iteration: minibatch updates done per ``learn`` call.
            mini_batch_size: number of samples drawn per update.
            discount: factor applied to the target Q value when forming y.
            batch_norm: whether hidden layers are followed by batch
                normalization.
            actor_learning_rate: learning rate of the actor's Adam optimizer.
            critic_learning_rate: learning rate of the critic's Adam optimizer.
            tau: mixing weight of the soft target update,
                ``target <- tau * source + (1 - tau) * target``.
            hidden_layers: hidden layer widths; ``None`` (the default) means
                ``[400, 300]``. A ``None`` sentinel is used so that the
                default list is not shared between instances.
        """
        self.session = sess
        self.env = env
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_lb = self.env.action_space.low
        self.action_ub = self.env.action_space.high
        self.discount = discount
        self.batch_norm = batch_norm
        self.mini_batch_size = mini_batch_size
        self.update_per_iteration = update_per_iteration
        self.hidden_layers = [400, 300] if hidden_layers is None else hidden_layers

        self.replay_buffer = ReplayBuffer(max_buffer_size, state_dim, action_dim)
        self.exploration = OUProcess(self.action_dim)

        # Define the operations used by the algorithm.
        self.critic = {}
        (self.critic['x'], self.critic['u'], self.critic['is_train'],
         self.critic['q'], self.critic['variables']) = \
            self.create_critic_network(is_target=False)

        self.target_critic = {}
        (self.target_critic['x'], self.target_critic['u'], _,
         self.target_critic['q'], self.target_critic['variables']) = \
            self.create_critic_network(is_target=True)

        self.actor = {}
        (self.actor['x'], self.actor['is_train'], self.actor['a'],
         self.actor['variables']) = self.create_actor_network(is_target=False)

        self.target_actor = {}
        (self.target_actor['x'], _, self.target_actor['a'],
         self.target_actor['variables']) = \
            self.create_actor_network(is_target=True)

        # Critic regression: minimize the mean squared difference between q and y.
        self.critic_optimization = {}
        with tf.name_scope('critic_optimization'):
            self.critic_optimization['y'] = tf.placeholder(
                tf.float32, shape=(None, 1), name='y')
            self.critic_optimization['loss'] = tf.reduce_mean(
                tf.squared_difference(self.critic['q'],
                                      self.critic_optimization['y']),
                name='loss')
            self.critic_optimization['optimize'] = tf.train.AdamOptimizer(
                critic_learning_rate).minimize(
                    self.critic_optimization['loss'])

        # Operation computing the regression target y.
        self.y_compute = {}
        with tf.name_scope('y'):
            # y = reward + (1 - terminal) * gamma * target_q
            self.y_compute['r'] = tf.placeholder(tf.float32, shape=(None, 1))
            self.y_compute['t'] = tf.placeholder(tf.int8, shape=(None, 1))
            self.y_compute['q'] = tf.placeholder(tf.float32, shape=(None, 1))
            not_terminal = tf.to_float(self.y_compute['t'])
            not_terminal = tf.mul(not_terminal, -1.0)
            not_terminal = tf.add(not_terminal, 1.0)
            self.y_compute['y'] = tf.add(
                self.y_compute['r'],
                tf.mul(tf.mul(self.y_compute['q'], self.discount),
                       not_terminal))

        # Gradient of Q with respect to the action input of the critic.
        self.action_gradients = {}
        with tf.name_scope('action_grads'):
            self.action_gradients["action_grads"] = tf.gradients(
                self.critic['q'], self.critic['u'])

        self.actor_optimization = {}
        with tf.name_scope('actor_optimization'):
            # Placeholder through which dQ/da is fed back in.
            self.actor_optimization['action_grads'] = tf.placeholder(
                tf.float32, shape=(None, self.action_dim))
            # The actor performs gradient ascent on Q, hence the minus sign.
            self.actor_optimization['actor_variable_grads'] = tf.gradients(
                self.actor['a'], self.actor['variables'],
                -self.actor_optimization['action_grads'])
            self.actor_optimization['optimize'] = tf.train.AdamOptimizer(
                actor_learning_rate).apply_gradients(
                    zip(self.actor_optimization['actor_variable_grads'],
                        self.actor['variables']))

        # Soft target updates, critic first and then actor; variables whose
        # name contains 'BatchNorm' are skipped.
        self.soft_update_list = []
        with tf.name_scope("soft_update"):
            for online, target in ((self.critic, self.target_critic),
                                   (self.actor, self.target_actor)):
                for source, dest in zip(online['variables'],
                                        target['variables']):
                    if 'BatchNorm' not in source.name:
                        self.soft_update_list.append(
                            dest.assign(
                                tf.mul(source, tau) +
                                tf.mul(dest, 1.0 - tau)))

        # With the whole graph defined, initialize all the variables.
        self.session.run(tf.initialize_all_variables())
        # The writer is created only for its side effect; it is not kept.
        tf.train.SummaryWriter('critic.graph', graph_def=self.session.graph)

    def create_actor_network(self, is_target):
        """Create the actor (or target actor) sub-graph.

        Args:
            is_target: build under scope 'tar_actor' instead of 'actor'.

        Returns:
            Tuple ``(x, is_train, net, variables)``: the observation
            placeholder, the batch-norm mode (a bool placeholder, ``False``
            for the target network, or ``None`` without batch norm), the
            tanh action output and the scope's trainable variables.
        """
        scope = 'tar_actor' if is_target else 'actor'
        with tf.variable_scope(scope):
            x = tf.placeholder(tf.float32,
                               shape=(None, self.state_dim),
                               name='observation')

            # Selects training or evaluation mode for batch normalization;
            # the target network is always in evaluation mode.
            if self.batch_norm:
                is_train = False if is_target else tf.placeholder(
                    tf.bool, name='is_train')
            else:
                is_train = None

            net = x
            for hidden_unit_num in self.hidden_layers:
                if self.batch_norm:
                    net = fully_connected(inputs=net,
                                          activation_fn=None,
                                          num_outputs=hidden_unit_num)
                    # NOTE: updates_collections=None forces the moving mean
                    # and variance to be updated in place.
                    net = batch_norm(inputs=net,
                                     center=True,
                                     scale=True,
                                     activation_fn=tf.nn.relu,
                                     is_training=is_train,
                                     updates_collections=None)
                else:
                    net = fully_connected(inputs=net,
                                          activation_fn=tf.nn.relu,
                                          num_outputs=hidden_unit_num)

            net = fully_connected(
                inputs=net,
                activation_fn=tf.tanh,
                num_outputs=self.action_dim,
                weights_initializer=tf.random_uniform_initializer(-3e-3, 3e-3),
                biases_initializer=tf.random_uniform_initializer(-3e-3, 3e-3))

        # Collect every trainable variable created under this scope.
        variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      scope=scope)
        return x, is_train, net, variables

    def create_critic_network(self, is_target):
        """Create the critic (or target critic) sub-graph.

        Args:
            is_target: build under scope 'tar_critic' instead of 'critic'.

        Returns:
            Tuple ``(x, u, is_train, net, variables)``: the observation and
            action placeholders, the batch-norm mode (see
            ``create_actor_network``), the single-unit linear Q output and
            the scope's trainable variables.
        """
        scope = 'tar_critic' if is_target else 'critic'
        with tf.variable_scope(scope):
            x = tf.placeholder(tf.float32,
                               shape=(None, self.state_dim),
                               name='observation')
            u = tf.placeholder(tf.float32,
                               shape=(None, self.action_dim),
                               name='actions')

            # Selects training or evaluation mode for batch normalization;
            # the target network is always in evaluation mode.
            if self.batch_norm:
                is_train = False if is_target else tf.placeholder(
                    tf.bool, name='is_train')
            else:
                is_train = None

            # NOTE: unlike the original paper, the action is concatenated
            # with the observation before the first layer.
            with tf.name_scope('merge'):
                net = tf.concat(1, [x, u])

            for hidden_unit_num in self.hidden_layers:
                if self.batch_norm:
                    net = fully_connected(inputs=net,
                                          activation_fn=None,
                                          num_outputs=hidden_unit_num)
                    # NOTE: updates_collections=None forces the moving mean
                    # and variance to be updated in place.
                    net = batch_norm(inputs=net,
                                     center=True,
                                     scale=True,
                                     activation_fn=tf.nn.relu,
                                     is_training=is_train,
                                     updates_collections=None)
                else:
                    net = fully_connected(inputs=net,
                                          activation_fn=tf.nn.relu,
                                          num_outputs=hidden_unit_num)

            net = fully_connected(
                inputs=net,
                activation_fn=None,
                num_outputs=1,
                weights_initializer=tf.random_uniform_initializer(-3e-3, 3e-3),
                biases_initializer=tf.random_uniform_initializer(-3e-3, 3e-3))

        # Collect every trainable variable created under this scope.
        variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      scope=scope)
        return x, u, is_train, net, variables

    # ------------------------------------------------------------------
    # Wrappers that execute one graph operation each.
    # ------------------------------------------------------------------
    def predict_target_q(self, x, u):
        """Run the target critic on observations ``x`` and actions ``u``."""
        feed = {self.target_critic['x']: x, self.target_critic['u']: u}
        return self.session.run(self.target_critic['q'], feed_dict=feed)

    def predict_target_action(self, x):
        """Run the target actor on observations ``x``."""
        return self.session.run(self.target_actor['a'],
                                feed_dict={self.target_actor['x']: x})

    def get_y(self, q, r, t):
        """Evaluate ``y = r + (1 - t) * discount * q`` in the graph."""
        feed = {
            self.y_compute['r']: r,
            self.y_compute['q']: q,
            self.y_compute['t']: t
        }
        return self.session.run(self.y_compute['y'], feed_dict=feed)

    def optimize_critic(self, x, u, is_train, y):
        """Run one critic optimizer step; ``is_train`` is fed only with batch norm."""
        feed = {
            self.critic['x']: x,
            self.critic['u']: u,
            self.critic_optimization['y']: y
        }
        if self.batch_norm:
            feed[self.critic['is_train']] = is_train
        return self.session.run(self.critic_optimization['optimize'],
                                feed_dict=feed)

    def predict_action(self, x, is_train):
        """Run the actor on ``x``; ``is_train`` is fed only with batch norm."""
        feed = {self.actor['x']: x}
        if self.batch_norm:
            feed[self.actor['is_train']] = is_train
        return self.session.run(self.actor['a'], feed_dict=feed)

    def action_grads(self, x, u, is_train):
        """Return the list produced by ``tf.gradients(q, u)`` for the given batch."""
        feed = {self.critic['x']: x, self.critic['u']: u}
        if self.batch_norm:
            feed[self.critic['is_train']] = is_train
        return self.session.run(self.action_gradients["action_grads"],
                                feed_dict=feed)

    def optimize_actor(self, x, a_grads, is_train):
        """Run one actor optimizer step using the supplied dQ/da values."""
        feed = {
            self.actor['x']: x,
            self.actor_optimization['action_grads']: a_grads
        }
        if self.batch_norm:
            feed[self.actor['is_train']] = is_train
        return self.session.run(self.actor_optimization['optimize'],
                                feed_dict=feed)

    def soft_update(self):
        """Run every soft target-update assignment."""
        self.session.run(self.soft_update_list)

    def get_action(self, s):
        """Return the actor's action for a single state with exploration noise added."""
        # Make sure s has the (1, state_dim) form the placeholder expects.
        s = np.reshape(s, (1, self.state_dim))
        a = self.predict_action(s, False)
        # a holds a minibatch of size one, so its first element is the action.
        return self.exploration.add_noise(a[0], self.action_lb, self.action_ub)

    def learn(self, s, a, sprime, r, t):
        """Store one transition and, once enough are stored, train on minibatches.

        Training starts when the buffer holds more than ``mini_batch_size``
        samples and performs ``update_per_iteration`` updates per call.
        """
        self.replay_buffer.add(s, a, sprime, r, t)
        if self.replay_buffer.get_size() <= self.mini_batch_size:
            return

        # range (not xrange) keeps this runnable on both Python 2 and 3.
        for _ in range(self.update_per_iteration):
            (state_set, action_set, sprime_set, reward_set,
             terminal_set) = self.replay_buffer.sample_batch(
                 self.mini_batch_size)

            # Critic step: Q' from the target networks, then
            # y = r + gamma * Q', then regress the critic onto y.
            q = self.predict_target_q(
                sprime_set, self.predict_target_action(sprime_set))
            y = self.get_y(q, reward_set, terminal_set)
            self.optimize_critic(state_set, action_set, True, y)

            # Actor step.
            actions = self.predict_action(state_set, True)
            a_grads = self.action_grads(state_set, actions, False)
            # tf.gradients returns a list with one entry per differentiated
            # tensor; only the action input was passed, so take entry 0.
            self.optimize_actor(state_set, a_grads[0], True)

            # Move the target networks toward the online networks.
            self.soft_update()

    def reset_exploration(self):
        """Reset the exploration noise process."""
        self.exploration.reset()
def __init__(self,
             sess,
             env,
             state_dim,
             action_dim,
             max_buffer_size=100000,
             update_per_iteration=5,
             mini_batch_size=64,
             discount=0.99,
             batch_norm=True,
             learning_rate=1e-3,
             tau=0.001,
             hidden_layers=None):
    """Build the NAF graph: online and target networks plus the training ops.

    Args:
        sess: TensorFlow session used to run the variable initializer.
        env: environment; only ``env.action_space.low`` and ``.high``
            are read here (stored as the action bounds).
        state_dim: stored on the instance and passed to ``ReplayBuffer``.
        action_dim: stored on the instance and passed to ``ReplayBuffer``
            and ``OUProcess``.
        max_buffer_size: first argument handed to ``ReplayBuffer``.
        update_per_iteration: stored on the instance.
        mini_batch_size: stored on the instance.
        discount: factor applied to the target value when forming y.
        batch_norm: stored on the instance.
        learning_rate: learning rate of the Adam optimizer.
        tau: mixing weight of the soft target update,
            ``target <- tau * source + (1 - tau) * target``.
        hidden_layers: hidden layer widths; ``None`` (the default) means
            ``[200, 200]``. A ``None`` sentinel is used so that the
            default list is not shared between instances.
    """
    self.session = sess
    self.env = env
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.action_lb = self.env.action_space.low
    self.action_ub = self.env.action_space.high
    self.discount = discount
    self.batch_norm = batch_norm
    self.mini_batch_size = mini_batch_size
    self.update_per_iteration = update_per_iteration
    self.hidden_layers = [200, 200] if hidden_layers is None else hidden_layers

    self.replay_buffer = ReplayBuffer(max_buffer_size, state_dim, action_dim)
    self.exploration = OUProcess(self.action_dim)

    self.network = {}
    (self.network['x'], self.network['u'], self.network['is_train'],
     self.network['V'], self.network['P'], self.network['M'],
     self.network['Q'], self.network['variables']) = \
        self.create_networks(is_target=False)

    self.target = {}
    (self.target['x'], self.target['u'], _, self.target['V'],
     self.target['P'], self.target['M'], self.target['Q'],
     self.target['variables']) = self.create_networks(is_target=True)

    # Optimization: minimize the mean squared difference between Q and y.
    self.network_optimization = {}
    with tf.name_scope('optimization'):
        self.network_optimization['y'] = tf.placeholder(
            tf.float32, shape=(None, 1), name='y')
        self.network_optimization['loss'] = tf.reduce_mean(
            tf.squared_difference(self.network['Q'],
                                  self.network_optimization['y']),
            name='loss')
        self.network_optimization['optimize'] = tf.train.AdamOptimizer(
            learning_rate).minimize(self.network_optimization['loss'])

    # Operation computing the regression target y.
    self.y_compute = {}
    with tf.name_scope('y'):
        # y = reward + (1 - terminal) * gamma * V
        self.y_compute['r'] = tf.placeholder(tf.float32, shape=(None, 1))
        self.y_compute['t'] = tf.placeholder(tf.int8, shape=(None, 1))
        self.y_compute['v'] = tf.placeholder(tf.float32, shape=(None, 1))
        not_terminal = tf.to_float(self.y_compute['t'])
        not_terminal = tf.mul(not_terminal, -1.0)
        not_terminal = tf.add(not_terminal, 1.0)
        self.y_compute['y'] = tf.add(
            self.y_compute['r'],
            tf.mul(tf.mul(self.y_compute['v'], self.discount), not_terminal))

    # Soft update from the online network's variables to the target's.
    self.soft_update_list = []
    with tf.name_scope('soft_update'):
        for source, dest in zip(self.network['variables'],
                                self.target['variables']):
            self.soft_update_list.append(
                dest.assign(tf.mul(source, tau) + tf.mul(dest, 1.0 - tau)))

    # With the whole graph defined, initialize all the variables.
    self.session.run(tf.initialize_all_variables())
    # The writer is created only for its side effect; it is not kept.
    tf.train.SummaryWriter('naf.graph', graph_def=self.session.graph)
class NAF(object):
    """Normalized Advantage Function agent built on a TensorFlow session.

    A single network produces the state value V, the action mean M and a
    matrix P = U^T U; Q is formed as ``V - 1/2 (u - M)^T P (u - M)``.
    """

    def __init__(self,
                 sess,
                 env,
                 state_dim,
                 action_dim,
                 max_buffer_size=100000,
                 update_per_iteration=5,
                 mini_batch_size=64,
                 discount=0.99,
                 batch_norm=True,
                 learning_rate=1e-3,
                 tau=0.001,
                 hidden_layers=None):
        """Build the online and target networks plus the training ops.

        Args:
            sess: TensorFlow session used to run all operations.
            env: environment; only ``env.action_space.low`` and ``.high``
                are read (stored as the action bounds).
            state_dim: size of the observation vector.
            action_dim: size of the action vector.
            max_buffer_size: first argument handed to ``ReplayBuffer``.
            update_per_iteration: minibatch updates done per ``learn`` call.
            mini_batch_size: number of samples drawn per update.
            discount: factor applied to the target value when forming y.
            batch_norm: whether hidden layers are followed by batch
                normalization.
            learning_rate: learning rate of the Adam optimizer.
            tau: mixing weight of the soft target update,
                ``target <- tau * source + (1 - tau) * target``.
            hidden_layers: hidden layer widths; ``None`` (the default) means
                ``[200, 200]``. A ``None`` sentinel is used so that the
                default list is not shared between instances.
        """
        self.session = sess
        self.env = env
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_lb = self.env.action_space.low
        self.action_ub = self.env.action_space.high
        self.discount = discount
        self.batch_norm = batch_norm
        self.mini_batch_size = mini_batch_size
        self.update_per_iteration = update_per_iteration
        self.hidden_layers = [200, 200] if hidden_layers is None else hidden_layers

        self.replay_buffer = ReplayBuffer(max_buffer_size, state_dim, action_dim)
        self.exploration = OUProcess(self.action_dim)

        self.network = {}
        (self.network['x'], self.network['u'], self.network['is_train'],
         self.network['V'], self.network['P'], self.network['M'],
         self.network['Q'], self.network['variables']) = \
            self.create_networks(is_target=False)

        self.target = {}
        (self.target['x'], self.target['u'], _, self.target['V'],
         self.target['P'], self.target['M'], self.target['Q'],
         self.target['variables']) = self.create_networks(is_target=True)

        # Optimization: minimize the mean squared difference between Q and y.
        self.network_optimization = {}
        with tf.name_scope('optimization'):
            self.network_optimization['y'] = tf.placeholder(
                tf.float32, shape=(None, 1), name='y')
            self.network_optimization['loss'] = tf.reduce_mean(
                tf.squared_difference(self.network['Q'],
                                      self.network_optimization['y']),
                name='loss')
            self.network_optimization['optimize'] = tf.train.AdamOptimizer(
                learning_rate).minimize(self.network_optimization['loss'])

        # Operation computing the regression target y.
        self.y_compute = {}
        with tf.name_scope('y'):
            # y = reward + (1 - terminal) * gamma * V
            self.y_compute['r'] = tf.placeholder(tf.float32, shape=(None, 1))
            self.y_compute['t'] = tf.placeholder(tf.int8, shape=(None, 1))
            self.y_compute['v'] = tf.placeholder(tf.float32, shape=(None, 1))
            not_terminal = tf.to_float(self.y_compute['t'])
            not_terminal = tf.mul(not_terminal, -1.0)
            not_terminal = tf.add(not_terminal, 1.0)
            self.y_compute['y'] = tf.add(
                self.y_compute['r'],
                tf.mul(tf.mul(self.y_compute['v'], self.discount),
                       not_terminal))

        # Soft update from the online network's variables to the target's.
        self.soft_update_list = []
        with tf.name_scope('soft_update'):
            for source, dest in zip(self.network['variables'],
                                    self.target['variables']):
                self.soft_update_list.append(
                    dest.assign(tf.mul(source, tau) + tf.mul(dest, 1.0 - tau)))

        # With the whole graph defined, initialize all the variables.
        self.session.run(tf.initialize_all_variables())
        # The writer is created only for its side effect; it is not kept.
        tf.train.SummaryWriter('naf.graph', graph_def=self.session.graph)

    def _hidden_stack(self, net, is_train):
        """Apply the configured hidden layers (optionally batch-normalized) to ``net``."""
        for hidden_unit_num in self.hidden_layers:
            if self.batch_norm:
                net = fully_connected(inputs=net,
                                      activation_fn=None,
                                      num_outputs=hidden_unit_num)
                # NOTE: updates_collections=None forces the moving mean and
                # variance to be updated in place,
                # see https://github.com/tensorflow/tensorflow/issues/1122
                net = batch_norm(inputs=net,
                                 center=True,
                                 scale=True,
                                 activation_fn=tf.nn.relu,
                                 is_training=is_train,
                                 updates_collections=None)
            else:
                net = fully_connected(inputs=net,
                                      activation_fn=tf.nn.relu,
                                      num_outputs=hidden_unit_num)
        return net

    def create_networks(self, is_target):
        """Create the NAF (or target NAF) sub-graph.

        Args:
            is_target: build under scope 'tar_naf' instead of 'naf'.

        Returns:
            Tuple ``(x, u, is_train, V, P, M, Q, variables)``: observation
            and action placeholders, the batch-norm mode (a bool
            placeholder, ``False`` for the target network, or ``None``
            without batch norm), the value, the matrix P, the action mean,
            the Q value and the scope's trainable variables.
        """
        scope = 'tar_naf' if is_target else 'naf'
        with tf.variable_scope(scope):
            x = tf.placeholder(tf.float32,
                               shape=(None, self.state_dim),
                               name='observation')
            u = tf.placeholder(tf.float32,
                               shape=(None, self.action_dim),
                               name='actions')

            # Selects training or evaluation mode for batch normalization;
            # the target network is always in evaluation mode.
            if self.batch_norm:
                is_train = False if is_target else tf.placeholder(
                    tf.bool, name='is_train')
            else:
                is_train = None

            # State value function.
            with tf.variable_scope('V'):
                V = self._hidden_stack(x, is_train)
                V = fully_connected(inputs=V,
                                    activation_fn=None,
                                    num_outputs=1)

            # Matrix P, parameterized through a triangular factor.
            with tf.variable_scope('L'):
                L = self._hidden_stack(x, is_train)
                # Integer division: the number of entries in a triangular
                # matrix must be an int on Python 3 as well.
                L = fully_connected(
                    inputs=L,
                    activation_fn=None,
                    num_outputs=(self.action_dim * (self.action_dim + 1) // 2))

                # Construct the upper triangular matrix U row by row.
                pivot = 0
                rows = []
                for index in range(self.action_dim):
                    # Number of non-zero entries in this row; it shrinks as
                    # index grows.
                    count = self.action_dim - index

                    # Diagonal entry: one element at `pivot` along axis 1
                    # (-1 keeps the whole batch axis), passed through exp.
                    diag_elem = tf.exp(tf.slice(L, (0, pivot), (-1, 1)))

                    # The following count - 1 elements are the off-diagonal
                    # entries of the row.
                    non_diag_elems = tf.slice(L, (0, pivot + 1),
                                              (-1, count - 1))

                    non_zero_elements = tf.concat(1,
                                                  (diag_elem, non_diag_elems))

                    # Paddings ((0, 0), (index, 0)): nothing on the batch
                    # axis, `index` zeros in front of the row's entries.
                    row = tf.pad(non_zero_elements, ((0, 0), (index, 0)))
                    rows.append(row)

                    # Skip past the elements consumed by this row.
                    pivot += count

                # Stack the rows along axis 1.
                # NOTE: the result is an upper triangular matrix U, not L.
                L = tf.pack(rows, axis=1)
                # P = L * L^T = U^T * U
                P = tf.batch_matmul(tf.transpose(L, perm=[0, 2, 1]), L)

            # Action mean Mu.
            with tf.variable_scope('M'):
                M = self._hidden_stack(x, is_train)
                M = fully_connected(inputs=M,
                                    activation_fn=tf.tanh,
                                    num_outputs=self.action_dim)

            # Advantage function.
            with tf.name_scope('A'):
                # u - M is (batch, action_dim); a trailing axis turns each
                # sample into a column vector (action_dim, 1).
                diff_col = tf.expand_dims(u - M, -1)
                # Per-sample transpose: row vector (1, action_dim).
                diff_row = tf.transpose(diff_col, perm=[0, 2, 1])
                # A = -1/2 * (u-M)^T * P * (u-M). The row vector must come
                # first: (1,n) x (n,n) x (n,1) -> (1,1). The reverse order
                # only has compatible shapes when action_dim == 1.
                A = -tf.batch_matmul(tf.batch_matmul(diff_row, P),
                                     diff_col) / 2
                # Each A is a 1x1 matrix; flatten to (batch_size, 1).
                A = tf.reshape(A, [-1, 1])

            with tf.name_scope('Q'):
                Q = A + V

        # Collect every trainable variable created under this scope.
        variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      scope=scope)
        return x, u, is_train, V, P, M, Q, variables

    def predict_target_v(self, x):
        """Run the target network's value output on observations ``x``."""
        return self.session.run(self.target['V'],
                                feed_dict={self.target['x']: x})

    def get_y(self, v, r, t):
        """Evaluate ``y = r + (1 - t) * discount * v`` in the graph."""
        feed = {
            self.y_compute['r']: r,
            self.y_compute['v']: v,
            self.y_compute['t']: t
        }
        return self.session.run(self.y_compute['y'], feed_dict=feed)

    def optimize_network(self, x, u, is_train, y):
        """Run one optimizer step; ``is_train`` is fed only with batch norm."""
        feed_dict = {
            self.network['x']: x,
            self.network['u']: u,
            self.network_optimization['y']: y
        }
        if self.batch_norm:
            feed_dict[self.network['is_train']] = is_train
        return self.session.run(self.network_optimization['optimize'],
                                feed_dict=feed_dict)

    def predict_action(self, x, is_train):
        """Return ``[M, P]`` for observations ``x``."""
        feed_dict = {self.network['x']: x}
        if self.batch_norm:
            feed_dict[self.network['is_train']] = is_train
        return self.session.run([self.network['M'], self.network['P']],
                                feed_dict=feed_dict)

    def get_action(self, s):
        """Return the mean action for a single state with exploration noise added."""
        s = np.reshape(s, (1, self.state_dim))
        # P is returned as well but is not used for exploration here.
        a, _ = self.predict_action(s, False)
        return self.exploration.add_noise(a[0], self.action_lb, self.action_ub)

    def soft_update(self):
        """Run every soft target-update assignment."""
        self.session.run(self.soft_update_list)

    def learn(self, s, a, sprime, r, terminal):
        """Store one transition and, once enough are stored, train on minibatches.

        Training starts when the buffer holds more than ``mini_batch_size``
        samples and performs ``update_per_iteration`` updates per call.
        """
        self.replay_buffer.add(s, a, sprime, r, terminal)
        if self.replay_buffer.get_size() <= self.mini_batch_size:
            return

        # range (not xrange) keeps this runnable on both Python 2 and 3.
        for _ in range(self.update_per_iteration):
            (state_set, action_set, sprime_set, reward_set,
             terminal_set) = self.replay_buffer.sample_batch(
                 self.mini_batch_size)

            # V' from the target network, then y = r + gamma * V'.
            v = self.predict_target_v(sprime_set)
            y = self.get_y(v, reward_set, terminal_set)

            # Regress Q onto y with batch normalization in training mode.
            self.optimize_network(state_set, action_set, True, y)

            # Move the target network toward the online network.
            self.soft_update()

    def reset_exploration(self):
        """Reset the exploration noise process."""
        self.exploration.reset()
class DDPG(object):
    """DDPG agent with PyTorch actor/critic networks and a prioritized replay memory."""

    def __init__(self, n_states, n_actions, opt, ouprocess=True, mean_var_path=None, supervised=False):
        """ DDPG Algorithms
        Args:
            n_states: int, dimension of states
            n_actions: int, dimension of actions
            opt: dict, params; the keys 'alr', 'clr', 'model', 'batch_size',
                'gamma', 'tau' and 'memory_size' are read
            ouprocess: bool, add OU-process noise to chosen actions; when
                false the actor is built with ``noisy=True`` instead
            mean_var_path: str or None, pickle file holding ``(mean, var)``
                for the state normalizer; zeros are used when it is None or
                the file does not exist
            supervised, bool, pre-train the actor with supervised learning
        """
        self.n_states = n_states
        self.n_actions = n_actions

        # Params
        self.alr = opt['alr']
        self.clr = opt['clr']
        self.model_name = opt['model']
        self.batch_size = opt['batch_size']
        self.gamma = opt['gamma']
        self.tau = opt['tau']
        self.ouprocess = ouprocess

        if mean_var_path is not None and os.path.exists(mean_var_path):
            # NOTE(review): pickle.load executes arbitrary code from the
            # file; only point mean_var_path at trusted files.
            with open(mean_var_path, 'rb') as f:
                mean, var = pickle.load(f)
        else:
            mean = np.zeros(n_states)
            var = np.zeros(n_states)

        self.normalizer = Normalizer(mean, var)

        if supervised:
            self._build_actor()
            logger.info("Supervised Learning Initialized")
        else:
            # Build Network
            self._build_network()
            logger.info('Finish Initializing Networks')

        self.replay_memory = PrioritizedReplayMemory(capacity=opt['memory_size'])
        self.noise = OUProcess(n_actions)
        logger.info('DDPG Initialzed!')

    @staticmethod
    def totensor(x):
        """Wrap ``x`` in a ``torch.FloatTensor`` inside a ``Variable``."""
        return Variable(torch.FloatTensor(x))

    def _build_actor(self):
        """Create only the actor, its MSE criterion and its Adam optimizer."""
        # The actor is noisy exactly when OU-process noise is not used.
        noisy = not self.ouprocess
        self.actor = Actor(self.n_states, self.n_actions, noisy=noisy)
        self.actor_criterion = nn.MSELoss()
        self.actor_optimizer = optimizer.Adam(lr=self.alr, params=self.actor.parameters())

    def _build_network(self):
        """Create actor, critic, both targets, the loss and both optimizers."""
        # The actor is noisy exactly when OU-process noise is not used.
        noisy = not self.ouprocess
        self.actor = Actor(self.n_states, self.n_actions, noisy=noisy)
        self.target_actor = Actor(self.n_states, self.n_actions)
        self.critic = Critic(self.n_states, self.n_actions)
        self.target_critic = Critic(self.n_states, self.n_actions)

        # if model params are provided, load them (truthiness also copes
        # with model_name being None)
        if self.model_name:
            self.load_model(model_name=self.model_name)
            logger.info("Loading model from file: {}".format(self.model_name))

        # Copy actor's parameters
        self._update_target(self.target_actor, self.actor, tau=1.0)

        # Copy critic's parameters
        self._update_target(self.target_critic, self.critic, tau=1.0)

        self.loss_criterion = nn.MSELoss()
        self.actor_optimizer = optimizer.Adam(lr=self.alr, params=self.actor.parameters(), weight_decay=1e-5)
        self.critic_optimizer = optimizer.Adam(lr=self.clr, params=self.critic.parameters(), weight_decay=1e-5)

    @staticmethod
    def _update_target(target, source, tau):
        """Blend ``source`` parameters into ``target`` in place.

        Each target parameter becomes ``(1 - tau) * target + tau * source``;
        ``tau=1.0`` is a plain copy.
        """
        for (target_param, param) in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(
                target_param.data * (1 - tau) + param.data * tau
            )

    def reset(self, sigma):
        """Reset the noise process with the given ``sigma``."""
        self.noise.reset(sigma)

    def _sample_batch(self):
        """Draw ``batch_size`` transitions from the replay memory.

        Returns:
            Tuple ``(idx, states, next_states, actions, rewards, terminates)``
            where everything after ``idx`` is a plain list. Lists (rather
            than ``map`` objects) are required on Python 3, where ``map`` is
            a lazy iterator that ``torch.FloatTensor`` cannot consume.
        """
        batch, idx = self.replay_memory.sample(self.batch_size)
        # Each transition is (state, action, reward, next_state, terminate).
        states = [item[0].tolist() for item in batch]
        next_states = [item[3].tolist() for item in batch]
        actions = [item[1].tolist() for item in batch]
        rewards = [item[2] for item in batch]
        terminates = [item[4] for item in batch]

        return idx, states, next_states, actions, rewards, terminates

    def add_sample(self, state, action, reward, next_state, terminate):
        """Store one transition, prioritized by its current absolute TD error."""
        # Evaluate all networks in eval mode while computing the error.
        self.critic.eval()
        self.actor.eval()
        self.target_critic.eval()
        self.target_actor.eval()
        batch_state = self.normalizer([state.tolist()])
        batch_next_state = self.normalizer([next_state.tolist()])
        current_value = self.critic(batch_state, self.totensor([action.tolist()]))
        target_action = self.target_actor(batch_next_state)
        # The bootstrap term is masked out for terminal transitions.
        target_value = self.totensor([reward]) \
            + self.totensor([0 if x else 1 for x in [terminate]]) \
            * self.target_critic(batch_next_state, target_action) * self.gamma
        error = float(torch.abs(current_value - target_value).data.numpy()[0])

        self.target_actor.train()
        self.actor.train()
        self.critic.train()
        self.target_critic.train()
        self.replay_memory.add(error, (state, action, reward, next_state, terminate))

    def update(self):
        """ Update the Actor and Critic with a batch data

        Returns:
            Tuple ``(critic_loss, policy_loss)`` read via ``.data[0]``.
        """
        idxs, states, next_states, actions, rewards, terminates = self._sample_batch()
        batch_states = self.normalizer(states)
        batch_next_states = self.normalizer(next_states)
        batch_actions = self.totensor(actions)
        batch_rewards = self.totensor(rewards)
        mask = self.totensor([0 if x else 1 for x in terminates])

        target_next_actions = self.target_actor(batch_next_states).detach()
        target_next_value = self.target_critic(batch_next_states, target_next_actions).detach().squeeze(1)

        current_value = self.critic(batch_states, batch_actions)
        # Rewards, mask and the squeezed target value are 1-D; restore the
        # trailing axis removed by squeeze(1) so the target lines up
        # element-wise with the critic output instead of broadcasting
        # (batch, 1) against (batch,) into a (batch, batch) matrix.
        # NOTE(review): assumes the critic returns (batch, 1), as the
        # squeeze(1) above and the error[i][0]-style indexing imply.
        next_value = (batch_rewards + mask * target_next_value * self.gamma).unsqueeze(1)

        # Update Critic

        # update prioritized memory
        error = torch.abs(current_value - next_value).data.numpy()
        for idx, sample_error in zip(idxs, error):
            self.replay_memory.update(idx, sample_error[0])

        loss = self.loss_criterion(current_value, next_value)
        self.critic_optimizer.zero_grad()
        loss.backward()
        self.critic_optimizer.step()

        # Update Actor: maximize the critic's value of the actor's actions.
        self.critic.eval()
        policy_loss = -self.critic(batch_states, self.actor(batch_states))
        policy_loss = policy_loss.mean()
        self.actor_optimizer.zero_grad()
        policy_loss.backward()

        self.actor_optimizer.step()
        self.critic.train()

        self._update_target(self.target_critic, self.critic, tau=self.tau)
        self._update_target(self.target_actor, self.actor, tau=self.tau)

        # NOTE(review): `.data[0]` is the pre-0.4 PyTorch way of reading a
        # scalar loss; confirm against the torch version in use.
        return loss.data[0], policy_loss.data[0]

    def choose_action(self, x):
        """ Select Action according to the current state
        Args:
            x: np.array, current state
        Returns:
            np.array, the actor's output (plus OU noise when ``ouprocess``
            is set), clipped to [0, 1]
        """
        self.actor.eval()
        act = self.actor(self.normalizer([x.tolist()])).squeeze(0)
        self.actor.train()
        action = act.data.numpy()
        if self.ouprocess:
            action += self.noise.noise()
        return action.clip(0, 1)

    def sample_noise(self):
        """Delegate to the actor's ``sample_noise``."""
        self.actor.sample_noise()

    def load_model(self, model_name):
        """ Load Torch Model from files
        Args:
            model_name: str, model path; '_actor.pth' and '_critic.pth' are
                appended to it
        """
        self.actor.load_state_dict(
            torch.load('{}_actor.pth'.format(model_name))
        )
        self.critic.load_state_dict(
            torch.load('{}_critic.pth'.format(model_name))
        )

    def save_model(self, model_dir, title):
        """ Save Torch Model from files
        Args:
            model_dir: str, model dir
            title: str, model name
        """
        torch.save(
            self.actor.state_dict(),
            '{}/{}_actor.pth'.format(model_dir, title)
        )

        torch.save(
            self.critic.state_dict(),
            '{}/{}_critic.pth'.format(model_dir, title)
        )

    def save_actor(self, path):
        """ save actor network
        Args:
             path, str, path to save
        """
        torch.save(
            self.actor.state_dict(),
            path
        )

    def load_actor(self, path):
        """ load actor network
        Args:
             path, str, path to load
        """
        self.actor.load_state_dict(
            torch.load(path)
        )

    def train_actor(self, batch_data, is_train=True):
        """ Train the actor separately with data
        Args:
            batch_data: tuple, (states, actions)
            is_train: bool, when false the loss is only evaluated (actor in
                eval mode, no optimizer step)
        Return:
            _loss: float, training loss
        """
        states, action = batch_data

        if is_train:
            self.actor.train()
        else:
            self.actor.eval()

        pred = self.actor(self.normalizer(states))
        action = self.totensor(action)
        _loss = self.actor_criterion(pred, action)

        if is_train:
            self.actor_optimizer.zero_grad()
            _loss.backward()
            self.actor_optimizer.step()

        return _loss.data[0]