def __init__(self, name, sess, state_space, action_space, lr=1e-4, gamma=0.99, tau=0.98,
             memory_size=10**6, batch_size=64):
    super().__init__(name, state_space, action_space)

    self.sess = sess
    self.gamma = gamma
    self.lr = lr
    self.tau = tau
    self.memory_size = memory_size
    self.batch_size = batch_size
    self.act_dim = flatten(action_space)

    self.state_ph = tf.placeholder(tf.float32, (None,) + state_space, name='state-ph')
    self.act_ph = tf.placeholder(tf.float32, (None, self.act_dim), name='act-ph')

    with tf.variable_scope('eval'):
        self.eval_scope = tf.get_variable_scope().name
        self.eval_net = self._construct(input_ph=self.state_ph, out_dim=self.act_dim)

    with tf.variable_scope('target'):
        self.target_scope = tf.get_variable_scope().name
        self.target_net = self._construct(input_ph=self.state_ph, out_dim=self.act_dim)

    with tf.name_scope('update'):
        eval_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.eval_scope)
        target_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.target_scope)

        # hard copy: target <- eval
        self.async_op = [tf.assign(t_var, e_var)
                         for e_var, t_var in zip(eval_vars, target_vars)]
        # soft (Polyak) update: target <- tau * eval + (1 - tau) * target
        self.soft_async_op = [tf.assign(t_var, self.tau * e_var + (1. - self.tau) * t_var)
                              for e_var, t_var in zip(eval_vars, target_vars)]

    with tf.name_scope('optimization'):
        # The concrete objective is algorithm-specific; a squared error against an
        # externally supplied target is assumed here so the training op can be built
        # (`target_ph` is an assumed placeholder, not part of the original interface).
        self.target_ph = tf.placeholder(tf.float32, (None, self.act_dim), name='target-ph')
        self.loss = tf.reduce_mean(tf.square(self.eval_net - self.target_ph))
        self.train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss)

    self.replay_buffer = Buffer(self.memory_size)
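# Self-contained sketch (illustration only, not part of the agent): the hard / soft
# (Polyak) target-update pattern built in the 'update' scope above, shown with two
# toy variables. All names below are hypothetical.
import tensorflow as tf

def _target_update_demo(tau=0.98):
    eval_w = tf.Variable(1.0, name='eval_w')
    target_w = tf.Variable(0.0, name='target_w')
    hard_update = tf.assign(target_w, eval_w)                                  # target <- eval
    soft_update = tf.assign(target_w, tau * eval_w + (1. - tau) * target_w)    # Polyak averaging
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(soft_update)   # target moves a fraction tau towards eval
        sess.run(hard_update)   # target becomes an exact copy of eval
        return sess.run([eval_w, target_w])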
def __init__(self, name, sess, state_space, action_space, len_episode, actor_lr=1e-4,
             critic_lr=1e-3, gamma=0.96, epsilon=0.1, update_steps=5):
    super().__init__(name, state_space, action_space)

    self.sess = sess
    self.len_episode = len_episode
    self.actor_lr = actor_lr
    self.critic_lr = critic_lr
    self.gamma = gamma
    self.epsilon = epsilon
    self.update_steps = update_steps
    self.act_dim = flatten(action_space)

    self.state_ph = tf.placeholder(tf.float32, (None,) + self.state_space, name='state-ph')
    self.act_ph = tf.placeholder(tf.float32, (None, self.act_dim), name='action-ph')
    self.adv_ph = tf.placeholder(tf.float32, (None,), name='advantage-ph')
    self.cum_r_ph = tf.placeholder(tf.float32, (None,), name='cum-r-ph')

    with tf.variable_scope('new_policy'):
        self.new_a_scope = tf.get_variable_scope().name
        self.new_logits = self._construct(input_ph=self.state_ph, out_dim=self.act_dim)

    with tf.variable_scope('old_policy'):
        self.old_a_scope = tf.get_variable_scope().name
        self.old_logits = self._construct(input_ph=self.state_ph, out_dim=self.act_dim)

    with tf.variable_scope('critic'):
        self.value = self._construct(input_ph=self.state_ph, out_dim=1)

    with tf.name_scope('optimization'):
        # probability of the taken (one-hot encoded) action under the new and old policies
        new_prob = tf.reduce_sum(self._policy(self.new_logits) * self.act_ph, axis=1)
        old_prob = tf.stop_gradient(
            tf.reduce_sum(self._policy(self.old_logits) * self.act_ph, axis=1))
        ratio = new_prob / (old_prob + 1e-8)

        # clipped surrogate objective, built from the advantage placeholder and epsilon
        surrogate = tf.minimum(
            ratio * self.adv_ph,
            tf.clip_by_value(ratio, 1. - self.epsilon, 1. + self.epsilon) * self.adv_ph)
        self.a_loss = -tf.reduce_mean(surrogate)
        self.c_loss = tf.reduce_mean(
            tf.squared_difference(tf.reshape(self.value, (-1,)), self.cum_r_ph))

        self.a_train_op = tf.train.AdamOptimizer(self.actor_lr).minimize(self.a_loss)
        self.c_train_op = tf.train.AdamOptimizer(self.critic_lr).minimize(self.c_loss)

    with tf.name_scope('update'):
        new_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.new_a_scope)
        old_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.old_a_scope)

        # copy the new-policy weights into the old-policy network before each round of
        # updates (`async` itself is a reserved word in Python 3.7+, hence `async_op`)
        self.async_op = [tf.assign(t_var, e_var) for e_var, t_var in zip(new_vars, old_vars)]

    self.episode = Episode(len_episode)
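# Self-contained sketch (illustration only): the clipped-surrogate term built in the
# optimization block above, computed with NumPy for a single sample so the clipping
# behaviour is easy to inspect. All values and names below are made up.
import numpy as np

def clipped_surrogate(new_prob, old_prob, advantage, epsilon=0.1):
    ratio = new_prob / old_prob
    clipped = np.clip(ratio, 1. - epsilon, 1. + epsilon)
    # PPO keeps the pessimistic (minimum) of the unclipped and clipped terms
    return np.minimum(ratio * advantage, clipped * advantage)

# a ratio of 1.3 with a positive advantage is clipped down to 1.1 * advantage
print(clipped_surrogate(new_prob=0.39, old_prob=0.30, advantage=2.0))  # -> 2.2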
def __init__(self, name, sess, state_space, action_space, len_episode, gamma=0.99,
             actor_lr=1e-4, critic_lr=1e-3):
    super().__init__(name, state_space, action_space)

    self.sess = sess
    self.len_episode = len_episode
    self.gamma = gamma
    self.actor_lr = actor_lr
    self.critic_lr = critic_lr
    self.act_dim = flatten(self.act_space)

    self.state_ph = tf.placeholder(tf.float32, (None,) + self.state_space, name='state-ph')
    self.act_ph = tf.placeholder(tf.int32, (None,), name='act-ph')
    self.cum_r_ph = tf.placeholder(tf.float32, (None,), name='cum_r-ph')

    with tf.variable_scope('emb'):
        emb_layer = self._emb(self.state_ph)

    with tf.variable_scope('policy'):
        self.actor_scope = tf.get_variable_scope().name
        self.policy_logits = self._construct(emb=emb_layer, out_dim=self.act_dim)
        self.policy = tf.nn.softmax(self.policy_logits, axis=1)

    with tf.variable_scope('value'):
        self.critic_scope = tf.get_variable_scope().name
        self.value = self._construct(emb=emb_layer, out_dim=1)

    with tf.name_scope('optimization'):
        act_one_hot = tf.one_hot(self.act_ph, self.act_dim)
        log_policy = tf.reduce_sum(tf.log(self.policy + 1e-8) * act_one_hot, axis=1)

        # advantage = discounted return - baseline; the critic receives no gradient from the actor loss
        advantage = self.cum_r_ph - tf.stop_gradient(tf.reshape(self.value, (-1,)))
        self.a_loss = -tf.reduce_sum(log_policy * advantage)
        self.c_loss = tf.reduce_mean(tf.square(tf.reshape(self.value, (-1,)) - self.cum_r_ph))

        self.a_train_op = tf.train.AdamOptimizer(self.actor_lr).minimize(self.a_loss)

        # restrict the critic update to the value head so it does not touch the observation embedding
        c_optimizer = tf.train.AdamOptimizer(self.critic_lr)
        c_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.critic_scope)
        grad_vars = c_optimizer.compute_gradients(self.c_loss, c_vars)
        self.c_train_op = c_optimizer.apply_gradients(grad_vars)

    self.episode = Episode(self.len_episode)
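# Self-contained sketch (illustration only): one way to compute the discounted returns
# fed into the `cum_r-ph` placeholder above from an episode's reward sequence. The
# function name and signature are hypothetical, not part of the original code.
import numpy as np

def discounted_returns(rewards, gamma=0.99):
    """Return G_t = r_t + gamma * G_{t+1} for every step of one episode."""
    returns = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

print(discounted_returns([1., 0., 1.], gamma=0.9))  # -> [1.81, 0.9, 1.]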
def __init__(self, name, sess, state_space, act_space, lr=1e-3, gamma=0.99, use_double=True,
             use_dueling=True, tau=0.01, batch_size=64, policy_type='e_greedy', memory_size=10**6):
    super(DQN, self).__init__(name, state_space, act_space)

    self.sess = sess
    self.lr = lr
    self.gamma = gamma
    self.use_double = use_double
    self.use_dueling = use_dueling
    self.tau = tau
    self.act_dim = flatten(act_space)
    self.batch_size = batch_size
    self.policy_type = policy_type
    self.replay_buffer = Buffer(memory_size)

    self.state_ph = tf.placeholder(tf.float32, (None,) + self.state_space, name='state-ph')
    self.target_q_ph = tf.placeholder(tf.float32, (None,), name='target-q-ph')
    self.act_ph = tf.placeholder(tf.int32, (None,), name='act-ph')

    with tf.variable_scope('eval-net'):
        self.eval_scope = tf.get_variable_scope().name
        self.eval_q_tf = self._construct(input_ph=self.state_ph, out_dim=self.act_dim)

        # Q-value of the action that was actually taken
        one_hot = tf.one_hot(self.act_ph, self.act_dim)
        self.selected_q_tf = tf.reduce_sum(self.eval_q_tf * one_hot, axis=1, name='selected-q-tf')

    with tf.variable_scope('target-net'):
        self.target_scope = tf.get_variable_scope().name
        self.target_q_tf = self._construct(input_ph=self.state_ph, out_dim=self.act_dim)

    with tf.name_scope('update'):
        e_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.eval_scope)
        t_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.target_scope)

        # hard copy: target <- eval
        self.sync = [tf.assign(t_var, e_var) for e_var, t_var in zip(e_vars, t_vars)]
        # soft (Polyak) update: target <- tau * eval + (1 - tau) * target
        self.soft_sync = [tf.assign(t_var, self.tau * e_var + (1. - self.tau) * t_var)
                          for e_var, t_var in zip(e_vars, t_vars)]

    with tf.name_scope('optimization'):
        self.loss = 0.5 * tf.reduce_mean(tf.square(self.selected_q_tf - self.target_q_ph))
        self.train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss)

    with tf.name_scope('cal_target'):
        if self.use_double:
            # double DQN: select the action with the eval net, evaluate it with the target net
            act = tf.argmax(self.eval_q_tf, axis=1)
            one_hot = tf.one_hot(act, self.act_dim)
            self.reduced_target_tf = tf.reduce_sum(self.target_q_tf * one_hot, axis=1)
        else:
            self.reduced_target_tf = tf.reduce_max(self.target_q_tf, axis=1)

    with tf.name_scope('policy'):
        self.exploration_ph = tf.placeholder(tf.float32, None, name='exploration-ph')
        if self.policy_type == 'e_greedy':
            # boolean flag: explore (take a random action) with probability `exploration_ph`
            self.policy = tf.random.uniform((1,), minval=0., maxval=1.) < self.exploration_ph
        elif self.policy_type == 'boltzman':
            # Boltzmann exploration: softmax over Q-values with temperature `exploration_ph`
            self.policy = tf.nn.softmax(self.eval_q_tf / self.exploration_ph)
        else:
            raise NotImplementedError
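# Usage sketch (illustration only): how the `cal_target` head above is typically combined
# with a replayed batch to produce the value fed into `target-q-ph`. `agent` is assumed to
# be an instance of this DQN class and `batch` a dict of NumPy arrays sampled from its
# replay buffer; both names are hypothetical.
def td_target(agent, batch):
    # max / double-Q value of the next state, evaluated through the target network
    next_q = agent.sess.run(agent.reduced_target_tf,
                            feed_dict={agent.state_ph: batch['next_state']})
    # r + gamma * (1 - done) * Q_target(s', a*)
    return batch['reward'] + agent.gamma * (1. - batch['done']) * next_q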
def __init__(self, name, sess, state_space, action_space, gamma=0.98, actor_lr=1e-4,
             critic_lr=1e-3, memory_size=10**3, batch_size=64, tau=0.01, grad_norm=5.0):
    super(DDPG, self).__init__(name, state_space, action_space)

    self.sess = sess
    self.gamma = gamma
    self.a_lr = actor_lr
    self.c_lr = critic_lr
    self.tau = tau
    self.batch_size = batch_size
    self.act_dim = flatten(action_space)
    self.replay_buffer = Buffer(memory_size)

    self.state_ph = tf.placeholder(tf.float32, (None,) + state_space, name='state-ph')
    self.next_state_ph = tf.placeholder(tf.float32, (None,) + state_space, name='next_state-ph')
    self.reward_ph = tf.placeholder(tf.float32, (None,), name='reward-ph')
    self.done_ph = tf.placeholder(tf.float32, (None,), name='done-ph')

    with tf.variable_scope('policy'):
        p_scope = tf.get_variable_scope().name
        self.logits = self._construct(input_ph=self.state_ph, out_dim=self.act_dim)
        self.policy = tf.nn.softmax(self.logits)

    with tf.variable_scope('target_policy'):
        t_p_scope = tf.get_variable_scope().name
        self.target_logits = self._construct(input_ph=self.next_state_ph, out_dim=self.act_dim)
        self.target_policy = tf.nn.softmax(self.target_logits)

    with tf.variable_scope('value'):
        q_scope = tf.get_variable_scope().name
        self.q = self._construct(
            input_ph=tf.concat([self.state_ph, self.policy], axis=1), out_dim=1)

    with tf.variable_scope('target_value'):
        t_q_scope = tf.get_variable_scope().name
        self.target_q = self._construct(
            input_ph=tf.concat([self.next_state_ph, self.target_policy], axis=1), out_dim=1)
        # one-step TD target: r + gamma * (1 - done) * Q'(s', mu'(s'))
        self.next_q = gamma * (1. - self.done_ph) * tf.reshape(self.target_q, (-1,)) + self.reward_ph

    with tf.name_scope('update'):
        e_p_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=p_scope)
        t_p_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=t_p_scope)
        e_q_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=q_scope)
        t_q_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=t_q_scope)

        # hard copy: target networks <- online networks
        self.sync = [tf.assign(t_var, e_var) for e_var, t_var in zip(e_p_vars, t_p_vars)] + \
                    [tf.assign(t_var, e_var) for e_var, t_var in zip(e_q_vars, t_q_vars)]
        # soft (Polyak) update with rate tau
        self.soft_sync = [tf.assign(t_var, tau * e_var + (1. - tau) * t_var)
                          for e_var, t_var in zip(e_p_vars, t_p_vars)] + \
                         [tf.assign(t_var, tau * e_var + (1. - tau) * t_var)
                          for e_var, t_var in zip(e_q_vars, t_q_vars)]

    with tf.name_scope('optimization'):
        # actor: maximize Q(s, mu(s)); critic: regress Q(s, mu(s)) onto the TD target
        policy_loss = -tf.reduce_mean(self.q)
        value_loss = tf.reduce_mean(
            tf.square(tf.stop_gradient(self.next_q) - tf.reshape(self.q, (-1,))))

        optimizer = tf.train.AdamOptimizer(self.a_lr)
        grad_vars = optimizer.compute_gradients(policy_loss, e_p_vars)
        # clip actor gradients element-wise to [-1, 1]
        grad_vars = [(tf.clip_by_value(grad, -1., 1.), _var) for grad, _var in grad_vars]
        self.p_train = optimizer.apply_gradients(grad_vars)

        optimizer = tf.train.AdamOptimizer(self.c_lr)
        grad_vars = optimizer.compute_gradients(value_loss, e_q_vars)
        self.c_train = optimizer.apply_gradients(grad_vars)

        self.a_loss = policy_loss
        self.c_loss = value_loss
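# Usage sketch (illustration only): one DDPG training step wired from the ops defined above.
# `agent` is assumed to be an instance of this class and `batch` a dict of NumPy arrays
# sampled from its replay buffer; both names are hypothetical.
def ddpg_train_step(agent, batch):
    feed = {
        agent.state_ph: batch['state'],
        agent.next_state_ph: batch['next_state'],
        agent.reward_ph: batch['reward'],
        agent.done_ph: batch['done'],
    }
    # critic first: fit Q(s, mu(s)) to the TD target; then improve the actor;
    # finally move the target networks a small step towards the online networks
    c_loss, _ = agent.sess.run([agent.c_loss, agent.c_train], feed_dict=feed)
    a_loss, _ = agent.sess.run([agent.a_loss, agent.p_train], feed_dict=feed)
    agent.sess.run(agent.soft_sync)
    return a_loss, c_loss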