def __init__(self):
    self.sess = tf.Session()
    self.memory = replay_buffer(max_length=1e5)
    self.tau = 0.995
    self.gamma = 0.99
    self.state_size = 33
    self.output_size = 4
    self.action_limit = 1.0
    self.hidden = [400, 300]
    self.batch_size = 100
    self.pi_lr = 1e-4
    self.q_lr = 1e-4
    self.noise = OU_noise(self.output_size, 1)

    self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \
        cr.placeholders(self.state_size, self.output_size, self.state_size, None, None)

    with tf.variable_scope('main'):
        self.pi, self.q, self.q_pi = cr.mlp_actor_critic(
            self.x_ph, self.a_ph, self.hidden,
            activation=tf.nn.relu,
            output_activation=tf.tanh,
            output_size=self.output_size,
            action_limit=self.action_limit)

    with tf.variable_scope('target'):
        self.pi_targ, _, self.q_pi_targ = cr.mlp_actor_critic(
            self.x2_ph, self.a_ph, self.hidden,
            activation=tf.nn.relu,
            output_activation=tf.tanh,
            output_size=self.output_size,
            action_limit=self.action_limit)

    self.target = tf.stop_gradient(
        self.r_ph + self.gamma * (1 - self.d_ph) * self.q_pi_targ)

    self.pi_loss = -tf.reduce_mean(self.q_pi)
    self.v_loss = tf.reduce_mean((self.q - self.target) ** 2) * 0.5

    self.pi_optimizer = tf.train.AdamOptimizer(self.pi_lr)
    self.v_optimizer = tf.train.AdamOptimizer(self.q_lr)
    self.pi_train = self.pi_optimizer.minimize(
        self.pi_loss, var_list=cr.get_vars('main/pi'))
    self.v_train = self.v_optimizer.minimize(
        self.v_loss, var_list=cr.get_vars('main/q'))

    self.target_update = tf.group([
        tf.assign(v_targ, self.tau * v_targ + (1 - self.tau) * v_main)
        for v_main, v_targ in zip(cr.get_vars('main'), cr.get_vars('target'))
    ])
    self.target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(cr.get_vars('main'), cr.get_vars('target'))
    ])

    self.sess.run(tf.global_variables_initializer())
    self.sess.run(self.target_init)
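All of the agents below lean on two helpers that are not shown in this section, replay_buffer and OU_noise (the cr module, which supplies the placeholder and network builders, is also defined elsewhere and not sketched here). The following is a minimal sketch of what those two helpers are assumed to look like, based only on how they are called (append, get_sample, sample, reset, and the .memory attribute); the real implementations may differ.

import random
from collections import deque
import numpy as np

class replay_buffer:
    # Assumed interface: append(state, next_state, reward, done, action),
    # get_sample(sample_size) -> dict of batched arrays, and a .memory deque.
    def __init__(self, max_length=1e5):
        self.memory = deque(maxlen=int(max_length))

    def append(self, state, next_state, reward, done, action):
        self.memory.append((state, next_state, reward, done, action))

    def get_sample(self, sample_size):
        batch = random.sample(self.memory, sample_size)
        state, next_state, reward, done, action = map(np.asarray, zip(*batch))
        return {'state': state, 'next_state': next_state,
                'reward': reward, 'done': done, 'action': action}

class OU_noise:
    # Ornstein-Uhlenbeck exploration noise for num_worker parallel agents.
    def __init__(self, action_size, num_worker, mu=0.0, theta=0.15, sigma=0.2):
        self.action_size, self.num_worker = action_size, num_worker
        self.mu, self.theta, self.sigma = mu, theta, sigma
        self.reset()

    def reset(self):
        # Restart the noise process at its mean.
        self.X = np.ones((self.num_worker, self.action_size)) * self.mu

    def sample(self):
        # One Euler step of the OU process; returns shape (num_worker, action_size).
        dx = self.theta * (self.mu - self.X) \
             + self.sigma * np.random.randn(self.num_worker, self.action_size)
        self.X += dx
        return self.X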
class SAC:
    def __init__(self):
        self.sess = tf.Session()
        self.state_size = 33
        self.output_size = 4
        self.tau = 0.995
        self.gamma = 0.99
        self.hidden = [400, 300]
        self.batch_size = 64
        self.pi_lr = 1e-3
        self.q_lr = 1e-3
        self.action_limit = 1.0
        self.memory = replay_buffer(1e5)
        self.target_noise = 0.2
        self.noise_clip = 0.1
        self.alpha = 1e-5            # entropy temperature
        self.num_worker = 20
        self.noise = OU_noise(self.output_size, self.num_worker)

        self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \
            cr.placeholders(self.state_size, self.output_size, self.state_size, None, None)

        with tf.variable_scope('main'):
            self.mu, self.pi, self.logp_pi, self.q1, self.q2, self.q1_pi, self.q2_pi, self.v = \
                cr.sac_mlp_actor_critic(
                    x=self.x_ph,
                    a=self.a_ph,
                    hidden=self.hidden,
                    activation=tf.nn.relu,
                    output_activation=tf.tanh,
                    output_size=self.output_size,
                    action_limit=self.action_limit)

        with tf.variable_scope('target'):
            _, _, _, _, _, _, _, self.v_targ = \
                cr.sac_mlp_actor_critic(
                    x=self.x2_ph,
                    a=self.a_ph,
                    hidden=self.hidden,
                    activation=tf.nn.relu,
                    output_activation=tf.tanh,
                    output_size=self.output_size,
                    action_limit=self.action_limit)

        self.pi_params = cr.get_vars('main/pi')
        self.value_params = cr.get_vars('main/q') + cr.get_vars('main/v')

        # Clipped double-Q plus entropy-regularized targets.
        self.min_q_pi = tf.minimum(self.q1_pi, self.q2_pi)
        self.q_backup = tf.stop_gradient(self.r_ph + self.gamma * (1 - self.d_ph) * self.v_targ)
        self.v_backup = tf.stop_gradient(self.min_q_pi - self.alpha * self.logp_pi)

        self.pi_loss = tf.reduce_mean(self.alpha * self.logp_pi - self.q1_pi)
        self.q1_loss = 0.5 * tf.reduce_mean((self.q_backup - self.q1) ** 2)
        self.q2_loss = 0.5 * tf.reduce_mean((self.q_backup - self.q2) ** 2)
        self.v_loss = 0.5 * tf.reduce_mean((self.v_backup - self.v) ** 2)
        self.value_loss = self.q1_loss + self.q2_loss + self.v_loss

        self.pi_optimizer = tf.train.AdamOptimizer(self.pi_lr)
        self.train_pi_op = self.pi_optimizer.minimize(self.pi_loss, var_list=self.pi_params)

        self.value_optimizer = tf.train.AdamOptimizer(self.q_lr)
        with tf.control_dependencies([self.train_pi_op]):
            self.train_value_op = self.value_optimizer.minimize(
                self.value_loss, var_list=self.value_params)

        with tf.control_dependencies([self.train_value_op]):
            self.target_update = tf.group([
                tf.assign(v_targ, self.tau * v_targ + (1 - self.tau) * v_main)
                for v_main, v_targ in zip(cr.get_vars('main'), cr.get_vars('target'))
            ])

        self.step_ops = [
            self.pi_loss, self.q1_loss, self.q2_loss, self.v_loss,
            self.q1, self.q2, self.v, self.logp_pi,
            self.train_pi_op, self.train_value_op, self.target_update
        ]
        self.target_init = tf.group([
            tf.assign(v_targ, v_main)
            for v_main, v_targ in zip(cr.get_vars('main'), cr.get_vars('target'))
        ])

        self.sess.run(tf.global_variables_initializer())
        self.sess.run(self.target_init)

    def update(self):
        data = self.memory.get_sample(sample_size=self.batch_size)
        feed_dict = {
            self.x_ph: data['state'],
            self.x2_ph: data['next_state'],
            self.a_ph: data['action'],
            self.r_ph: data['reward'],
            self.d_ph: data['done']
        }
        self.sess.run(self.step_ops, feed_dict=feed_dict)

    def get_action(self, state, deterministic=False):
        # mu is the deterministic mean action, pi the stochastic squashed-Gaussian sample.
        act_op = self.mu if deterministic else self.pi
        return self.sess.run(act_op, feed_dict={self.x_ph: [state]})[0]

    def test(self):
        env = gym.make('Pendulum-v0')
        while True:
            state = env.reset()
            done = False
            while not done:
                env.render()
                action = self.get_action(state, 0)
                state, _, done, _ = env.step(action)

    def run(self):
        from mlagents.envs import UnityEnvironment
        writer = SummaryWriter('runs/sac')
        num_worker = self.num_worker
        state_size = self.state_size
        output_size = self.output_size
        ep = 0
        train_size = 5
        env = UnityEnvironment(file_name='env/training', worker_id=1)
        default_brain = env.brain_names[0]
        brain = env.brains[default_brain]
        initial_observation = env.reset()
        step = 0
        start_steps = 100000

        # Warm-up: fill the replay buffer with random (noisy) actions and pre-train.
        states = np.zeros([num_worker, state_size])
        for i in range(start_steps):
            actions = np.clip(np.random.randn(num_worker, output_size),
                              -self.action_limit, self.action_limit)
            actions += self.noise.sample()
            env_info = env.step(actions)[default_brain]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            for s, ns, r, d, a in zip(states, next_states, rewards, dones, actions):
                self.memory.append(s, ns, r, d, a)
            states = next_states
            if dones[0]:
                self.noise.reset()
            if i % train_size == 0:
                if len(self.memory.memory) > self.batch_size:
                    self.update()
                print('data storing :', float(i / start_steps))

        # Main loop: act with the learned policy and keep training from the buffer.
        while True:
            ep += 1
            states = np.zeros([num_worker, state_size])
            terminal = False
            score = 0
            while not terminal:
                step += 1
                # Alternative schedule (disabled): act randomly until step > start_steps.
                # if step > start_steps:
                #     actions = [self.get_action(s) for s in states]
                #     action_random = 'False'
                # else:
                #     actions = np.clip(np.random.randn(num_worker, output_size),
                #                       -self.action_limit, self.action_limit)
                #     action_random = 'True'
                actions = [self.get_action(s) for s in states]
                action_random = 'False'
                env_info = env.step(actions)[default_brain]
                next_states = env_info.vector_observations
                rewards = env_info.rewards
                dones = env_info.local_done
                terminal = dones[0]
                for s, ns, r, d, a in zip(states, next_states, rewards, dones, actions):
                    self.memory.append(s, ns, r, d, a)
                score += sum(rewards)
                states = next_states
                if len(self.memory.memory) > self.batch_size:
                    if step % train_size == 0:
                        self.update()
            if ep < 1000:
                print('step : ', step, '| start steps : ', start_steps,
                      '| episode :', ep, '| score : ', score,
                      '| memory size', len(self.memory.memory),
                      '| action random : ', action_random)
            writer.add_scalar('data/reward', score, ep)
            writer.add_scalar('data/memory_size', len(self.memory.memory), ep)
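The source does not show how the class is launched. Assuming the file is run directly, the entry point would look roughly like this (a hypothetical sketch; the actual script may wire it differently):

# Hypothetical entry point (not part of the original class).
if __name__ == '__main__':
    agent = SAC()
    agent.run()    # Unity training loop; expects the 'env/training' build to exist
    # agent.test() # or: render the learned policy in a gym environment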
class TD3:
    def __init__(self):
        self.sess = tf.Session()
        self.state_size = 33
        self.output_size = 4
        self.tau = 0.995
        self.gamma = 0.99
        self.hidden = [400, 300]
        self.batch_size = 64
        self.pi_lr = 1e-3
        self.q_lr = 1e-3
        self.action_limit = 1.0
        self.memory = replay_buffer(1e5)
        self.target_noise = 0.2
        self.noise = OU_noise(self.output_size, 1)
        self.noise_clip = 0.1

        self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \
            cr.placeholders(self.state_size, self.output_size, self.state_size, None, None)

        with tf.variable_scope('main'):
            self.pi, self.q1, self.q2, self.q1_pi = cr.td3_mlp_actor_critic(
                x=self.x_ph,
                a=self.a_ph,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit)

        with tf.variable_scope('target'):
            self.pi_targ, _, _, _ = cr.td3_mlp_actor_critic(
                x=self.x2_ph,
                a=self.a_ph,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit)

        with tf.variable_scope('target', reuse=True):
            # Target policy smoothing: perturb the target action with clipped Gaussian noise.
            self.eps = tf.random_normal(tf.shape(self.pi_targ), stddev=self.target_noise)
            self.epsilon = tf.clip_by_value(self.eps, -self.noise_clip, self.noise_clip)
            self.a_prev = self.pi_targ + self.epsilon
            self.a2 = tf.clip_by_value(self.a_prev, -self.action_limit, self.action_limit)
            _, self.q1_targ, self.q2_targ, self.q1_pi_targ = cr.td3_mlp_actor_critic(
                x=self.x2_ph,
                a=self.a2,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit)

        self.pi_params = cr.get_vars('main/pi')
        self.q_params = cr.get_vars('main/q')

        # Clipped double-Q backup.
        self.min_q_targ = tf.minimum(self.q1_targ, self.q2_targ)
        self.backup = tf.stop_gradient(
            self.r_ph + self.gamma * (1 - self.d_ph) * self.min_q_targ)

        self.pi_loss = -tf.reduce_mean(self.q1_pi)
        self.q1_loss = tf.reduce_mean((self.q1 - self.backup) ** 2)
        self.q2_loss = tf.reduce_mean((self.q2 - self.backup) ** 2)
        self.v_loss = self.q1_loss + self.q2_loss

        self.pi_optimizer = tf.train.AdamOptimizer(self.pi_lr)
        self.q_optimizer = tf.train.AdamOptimizer(self.q_lr)
        self.pi_train = self.pi_optimizer.minimize(self.pi_loss, var_list=self.pi_params)
        self.v_train = self.q_optimizer.minimize(self.v_loss, var_list=self.q_params)

        self.target_update = tf.group([
            tf.assign(v_targ, self.tau * v_targ + (1 - self.tau) * v_main)
            for v_main, v_targ in zip(cr.get_vars('main'), cr.get_vars('target'))
        ])
        self.target_init = tf.group([
            tf.assign(v_targ, v_main)
            for v_main, v_targ in zip(cr.get_vars('main'), cr.get_vars('target'))
        ])

        self.sess.run(tf.global_variables_initializer())
        self.sess.run(self.target_init)

    def update(self):
        data = self.memory.get_sample(sample_size=self.batch_size)
        feed_dict = {
            self.x_ph: data['state'],
            self.x2_ph: data['next_state'],
            self.a_ph: data['action'],
            self.r_ph: data['reward'],
            self.d_ph: data['done']
        }
        q_loss, _ = self.sess.run([self.v_loss, self.v_train], feed_dict=feed_dict)
        pi_loss, _, _ = self.sess.run(
            [self.pi_loss, self.pi_train, self.target_update], feed_dict=feed_dict)
        return q_loss, pi_loss

    def get_action(self, state, epsilon):
        a = self.sess.run(self.pi, feed_dict={self.x_ph: [state]})
        a += epsilon * self.noise.sample()
        return np.clip(a, -self.action_limit, self.action_limit)[0]

    def test(self):
        env = gym.make('Pendulum-v0')
        while True:
            state = env.reset()
            done = False
            while not done:
                env.render()
                action = self.get_action(state, 0)
                next_state, _, done, _ = env.step(action)
                state = next_state

    def run(self):
        from mlagents.envs import UnityEnvironment
        writer = SummaryWriter('runs/td3')
        num_worker = 20
        state_size = 33
        output_size = 4
        epsilon = 1.0
        ep = 0
        train_size = 5
        env = UnityEnvironment(file_name='env/training', worker_id=0)
        default_brain = env.brain_names[0]
        brain = env.brains[default_brain]
        initial_observation = env.reset()
        step = 0
        score = 0
        while True:
            ep += 1
            env_info = env.reset()
            states = np.zeros([num_worker, state_size])
            terminal = False
            self.noise.reset()
            # Linear decay of the exploration-noise scale.
            if epsilon > 0.001:
                epsilon = -ep * 0.005 + 1.0
            while not terminal:
                step += 1
                actions = [self.get_action(s, epsilon) for s in states]
                env_info = env.step(actions)[default_brain]
                next_states = env_info.vector_observations
                rewards = env_info.rewards
                dones = env_info.local_done
                terminal = dones[0]
                for s, ns, r, d, a in zip(states, next_states, rewards, dones, actions):
                    self.memory.append(s, ns, r, d, a)
                score += sum(rewards)
                states = next_states
                if step % train_size == 0:
                    self.update()
            if ep < 1000:
                print('episode :', ep, '| score : ', score, '| epsilon :', epsilon)
            writer.add_scalar('data/reward', score, ep)
            writer.add_scalar('data/epsilon', epsilon, ep)
            writer.add_scalar('data/memory_size', len(self.memory.memory), ep)
            score = 0
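The target built by self.backup above combines target policy smoothing with the clipped double-Q backup. For reference, here is a plain numpy restatement of that computation (a sketch only; q1_targ_fn and q2_targ_fn are hypothetical callables standing in for the target critics, and the graph version above is what actually runs):

import numpy as np

def td3_backup(r, done, pi_targ_a, q1_targ_fn, q2_targ_fn,
               gamma=0.99, target_noise=0.2, noise_clip=0.1, action_limit=1.0):
    """Numpy sketch of the TD3 target: smooth the target action, then take min(Q1, Q2)."""
    eps = np.clip(np.random.normal(0.0, target_noise, size=pi_targ_a.shape),
                  -noise_clip, noise_clip)              # target policy smoothing
    a2 = np.clip(pi_targ_a + eps, -action_limit, action_limit)
    min_q = np.minimum(q1_targ_fn(a2), q2_targ_fn(a2))  # clipped double-Q
    return r + gamma * (1.0 - done) * min_q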
class DQPG:
    def __init__(self):
        self.sess = tf.Session()
        self.state_size = env_set['state']
        self.output_size = env_set['action']
        self.worker_size = env_set['worker']
        self.support_size = 32        # number of quantile atoms
        self.tau = 0.995
        self.gamma = env_set['gamma']
        self.hidden = env_set['hidden']
        self.batch_size = 64
        self.pi_lr = env_set['pi_lr']
        self.q_lr = env_set['q_lr']
        self.action_limit = 1.0
        self.memory = replay_buffer(env_set['mem_size'])
        self.noise = OU_noise(self.output_size, self.worker_size)
        self.kappa = 1.0              # Huber-loss threshold
        self.risk_factor = 0          # 0 = risk-neutral quantile weighting

        self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \
            cr.placeholders(self.state_size, self.output_size, self.state_size, None, None)
        self.risk_factor_ph = tf.placeholder(tf.float32)

        with tf.variable_scope('main'):
            self.pi, self.q, self.q_pi = cr.dqpg_mlp_actor_critic(
                x=self.x_ph,
                a=self.a_ph,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit,
                support_size=self.support_size)

        with tf.variable_scope('target'):
            _, _, self.q_pi_targ = cr.dqpg_mlp_actor_critic(
                x=self.x2_ph,
                a=self.a_ph,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit,
                support_size=self.support_size)

        self.pi_params = cr.get_vars('main/pi')
        self.q_params = cr.get_vars('main/q')

        # Distributional Bellman target over the quantile supports.
        self.backup = tf.stop_gradient(
            tf.expand_dims(self.r_ph, axis=1)
            + self.gamma * tf.expand_dims(1 - self.d_ph, axis=1) * self.q_pi_targ)

        # Risk-sensitive weighting of the quantiles (risk_factor 0 -> uniform weights).
        self.quantile_weight = 1.0 - self.risk_factor_ph * (
            2.0 * tf.reshape(tf.range(0.5 / self.support_size, 1, 1 / self.support_size),
                             [1, self.support_size]) - 1.0)
        self.pi_loss = -tf.reduce_mean(
            tf.reduce_mean(self.q_pi * self.quantile_weight))

        # Quantile-regression Huber loss for the critic.
        logit_valid_tile = tf.tile(tf.expand_dims(self.backup, axis=1),
                                   [1, self.support_size, 1])
        tau = tf.reshape(tf.range(0.5 / self.support_size, 1, 1 / self.support_size),
                         [1, self.support_size])
        tau = tf.tile(tf.expand_dims(tau, axis=2), [1, 1, self.support_size])
        theta_loss_tile = tf.tile(tf.expand_dims(self.q, axis=2),
                                  [1, 1, self.support_size])
        Huber_loss = tf.losses.huber_loss(logit_valid_tile, theta_loss_tile,
                                          reduction=tf.losses.Reduction.NONE,
                                          delta=self.kappa)
        bellman_errors = logit_valid_tile - theta_loss_tile
        Loss = (tf.abs(tau - tf.stop_gradient(tf.to_float(bellman_errors < 0)))
                * Huber_loss) / self.kappa
        self.v_loss = tf.reduce_mean(
            tf.reduce_mean(tf.reduce_sum(Loss, axis=1), axis=1))

        self.value_optimizer = tf.train.AdamOptimizer(self.q_lr)
        self.train_value_op = self.value_optimizer.minimize(
            self.v_loss, var_list=self.q_params)

        self.pi_optimizer = tf.train.AdamOptimizer(self.pi_lr)
        with tf.control_dependencies([self.train_value_op]):
            self.train_pi_op = self.pi_optimizer.minimize(
                self.pi_loss, var_list=self.pi_params)

        with tf.control_dependencies([self.train_pi_op]):
            self.target_update = tf.group([
                tf.assign(v_targ, self.tau * v_targ + (1 - self.tau) * v_main)
                for v_main, v_targ in zip(cr.get_vars('main'), cr.get_vars('target'))
            ])

        self.step_ops = [
            self.pi_loss, self.v_loss,
            self.train_pi_op, self.train_value_op, self.target_update
        ]
        self.target_init = tf.group([
            tf.assign(v_targ, v_main)
            for v_main, v_targ in zip(cr.get_vars('main'), cr.get_vars('target'))
        ])

        self.sess.run(tf.global_variables_initializer())
        self.sess.run(self.target_init)

    def update(self):
        data = self.memory.get_sample(sample_size=self.batch_size)
        feed_dict = {
            self.x_ph: data['state'],
            self.x2_ph: data['next_state'],
            self.a_ph: data['action'],
            self.r_ph: data['reward'],
            self.d_ph: data['done'],
            self.risk_factor_ph: self.risk_factor
        }
        pi_loss, q_loss, _, _, _ = self.sess.run(self.step_ops, feed_dict=feed_dict)
        return q_loss, pi_loss

    def get_action(self, state, epsilon):
        a = self.sess.run(self.pi, feed_dict={self.x_ph: state})
        a += epsilon * self.noise.sample()
        return np.clip(a, -self.action_limit, self.action_limit)

    def run_gym(self):
        import mujoco_py
        import gym
        writer = SummaryWriter('runs/dqpg_' + env_set['env_name'] + '_'
                               + "{:1.1f}".format(self.risk_factor))
        epsilon = 0.1
        ep = 0
        env = gym.make(env_set['env_name'])
        step = 0
        end_ep = env_set['ep_len']
        train_start_ep = 5
        vs = []
        ps = []
        scores = deque(maxlen=10)
        score = 0
        state = env.reset()
        while True:
            ep += 1
            if epsilon > 0.05:
                epsilon = -ep * 2.0 / end_ep + 1.0
            for i in range(1000):
                if ep > end_ep:
                    env.render()
                step += 1
                action = self.get_action([state], epsilon)
                next_state, reward, done, info = env.step(action)
                self.memory.append(state, next_state, reward, done, action[0])
                score += reward
                state = next_state
                if ep > train_start_ep:
                    v, p = self.update()
                    vs.append(v)
                    ps.append(p)
                if done:
                    scores.append(score)
                    score = 0
                    state = env.reset()
            if ep < end_ep:
                print('episode :', ep,
                      '| score : ', "{0:.2f}".format(np.mean(scores)),
                      '| epsilon :', "{0:.2f}".format(epsilon),
                      " | v :", "{0:.2f}".format(np.mean(vs)),
                      " | p :", "{0:.2f}".format(np.mean(ps)))
            writer.add_scalar('data/reward', np.mean(scores), ep)
            writer.add_scalar('data/epsilon', epsilon, ep)
            writer.add_scalar('data/memory_size', len(self.memory.memory), ep)
            writer.add_scalar('loss/value', np.mean(vs), ep)
            writer.add_scalar('loss/policy', np.mean(ps), ep)
            vs.clear()
            ps.clear()

    def run_unity(self):
        from mlagents.envs import UnityEnvironment
        writer = SummaryWriter('runs/dqpg_' + env_set['env_name'] + '_'
                               + "{:1.1f}".format(self.risk_factor))
        epsilon = 1.0
        ep = 0
        env = UnityEnvironment(file_name='env/' + env_set['env_name'], worker_id=0)
        default_brain = env.brain_names[0]
        env_info = env.reset()[default_brain]
        step = 0
        scores = np.zeros([self.worker_size])
        score = deque(maxlen=10)
        end_ep = env_set['ep_len']
        train_start_ep = 5
        vs = []
        ps = []
        while True:
            ep += 1
            if ep == end_ep + 1:
                env_info = env.reset(train_mode=False)[default_brain]
            states = env_info.vector_observations
            if epsilon > 0.05:
                epsilon = -ep * 2.0 / end_ep + 1.0
            for i in range(1000):
                step += 1
                actions = self.get_action(states, epsilon)
                env_info = env.step(actions)[default_brain]
                next_states = env_info.vector_observations
                rewards = env_info.rewards
                dones = env_info.local_done
                for s, ns, r, d, a in zip(states, next_states, rewards, dones, actions):
                    self.memory.append(s, ns, r, d, a)
                scores += rewards
                states = next_states
                for idx, d in enumerate(dones):
                    if d:
                        score.append(scores[idx])
                        scores[idx] = 0
                if ep > train_start_ep:
                    v, p = self.update()
                    vs.append(v)
                    ps.append(p)
            if ep < end_ep:
                print('episode :', ep,
                      '| score : ', "{0:.2f}".format(np.mean(score)),
                      '| epsilon :', "{0:.2f}".format(epsilon),
                      " | v :", "{0:.2f}".format(np.mean(vs)),
                      " | p :", "{0:.2f}".format(np.mean(ps)))
            writer.add_scalar('data/reward', np.mean(score), ep)
            writer.add_scalar('data/epsilon', epsilon, ep)
            writer.add_scalar('data/memory_size', len(self.memory.memory), ep)
            writer.add_scalar('loss/value', np.mean(vs), ep)
            writer.add_scalar('loss/policy', np.mean(ps), ep)
            vs.clear()
            ps.clear()
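DQPG reads its hyperparameters from a module-level env_set dict that is not shown in this section. The sketch below lists exactly the keys the class accesses; the values are illustrative placeholders (the state/action/worker numbers mirror the other agents above), not the settings used in the experiments.

# Illustrative only: these are the keys DQPG reads; the real values live elsewhere.
env_set = {
    'env_name': 'training',   # Unity build under env/, or a gym id for run_gym()
    'state': 33,              # observation size
    'action': 4,              # action size
    'worker': 20,             # number of parallel Unity agents
    'gamma': 0.99,
    'hidden': [400, 300],
    'pi_lr': 1e-3,
    'q_lr': 1e-3,
    'mem_size': 1e5,
    'ep_len': 1000,           # episodes of training before switching to evaluation/rendering
}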