def __init__(self, observation_space, action_space):
    obs_dim = observation_space.shape
    act_dim = action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs = dict()
    ac_kwargs['action_space'] = action_space
    # ac_kwargs['output_activation'] = tf.tanh

    # Inputs to computation graph
    self.x_ph, self.a_ph = core.placeholders_from_spaces(observation_space, action_space)
    self.adv_ph, self.ret_ph, self.logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph
    self.pi, self.logp, self.logp_pi, self.v = core.mlp_actor_critic(
        self.x_ph, self.a_ph, output_activation=tf.tanh, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    self.all_phs = [self.x_ph, self.a_ph, self.adv_ph, self.ret_ph, self.logp_old_ph]

    # Every step, get: action, value, and logprob
    self.get_action_ops = [self.pi, self.v, self.logp_pi]

    # Experience buffer
    steps_per_epoch = 1000
    self.local_steps_per_epoch = steps_per_epoch
    gamma = 0.99
    lam = 0.97
    self.buf = PPOBuffer(obs_dim, act_dim, self.local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    print(var_counts)

    # PPO objectives
    clip_ratio = 0.2
    ratio = tf.exp(self.logp - self.logp_old_ph)  # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(self.adv_ph > 0, (1 + clip_ratio) * self.adv_ph,
                       (1 - clip_ratio) * self.adv_ph)
    self.pi_loss = -tf.reduce_mean(tf.minimum(ratio * self.adv_ph, min_adv))
    self.v_loss = tf.reduce_mean((self.ret_ph - self.v) ** 2)

    # Info (useful to watch during learning)
    self.approx_kl = tf.reduce_mean(self.logp_old_ph - self.logp)  # a sample estimate for KL-divergence, easy to compute
    self.approx_ent = tf.reduce_mean(-self.logp)  # a sample estimate for entropy, also easy to compute
    self.clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    self.clipfrac = tf.reduce_mean(tf.cast(self.clipped, tf.float32))

    # Optimizers
    pi_lr = 3e-4
    vf_lr = 1e-3
    pi_optimizer = tf.train.AdadeltaOptimizer(learning_rate=pi_lr)
    vf_optimizer = tf.train.AdadeltaOptimizer(learning_rate=vf_lr)
    self.train_pi = pi_optimizer.minimize(self.pi_loss)
    self.train_v = vf_optimizer.minimize(self.v_loss)

    self.train_pi_iters = 80
    self.train_v_iters = 80
    self.target_kl = 0.01

    self.sess = tf.Session()
    self.sess.run(tf.global_variables_initializer())
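# The PPOBuffer used above is not defined in this file. Below is a minimal,
# illustrative sketch of the Spinning-Up-style buffer interface the __init__
# assumes (store / finish_path / get, with GAE-lambda advantage estimation).
# Class and helper names here are assumptions for clarity, not the original code.
import numpy as np
import scipy.signal


def discount_cumsum(x, discount):
    # Discounted cumulative sums of vectors, e.g. rewards-to-go.
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]


class PPOBufferSketch:
    def __init__(self, obs_dim, act_dim, size, gamma=0.99, lam=0.95):
        self.obs_buf = np.zeros((size, *obs_dim), dtype=np.float32)
        self.act_buf = np.zeros((size, *act_dim), dtype=np.float32)
        self.adv_buf = np.zeros(size, dtype=np.float32)
        self.rew_buf = np.zeros(size, dtype=np.float32)
        self.ret_buf = np.zeros(size, dtype=np.float32)
        self.val_buf = np.zeros(size, dtype=np.float32)
        self.logp_buf = np.zeros(size, dtype=np.float32)
        self.gamma, self.lam = gamma, lam
        self.ptr, self.path_start_idx, self.max_size = 0, 0, size

    def store(self, obs, act, rew, val, logp):
        assert self.ptr < self.max_size
        self.obs_buf[self.ptr] = obs
        self.act_buf[self.ptr] = act
        self.rew_buf[self.ptr] = rew
        self.val_buf[self.ptr] = val
        self.logp_buf[self.ptr] = logp
        self.ptr += 1

    def finish_path(self, last_val=0):
        # GAE-lambda advantages and rewards-to-go for the finished trajectory.
        path_slice = slice(self.path_start_idx, self.ptr)
        rews = np.append(self.rew_buf[path_slice], last_val)
        vals = np.append(self.val_buf[path_slice], last_val)
        deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1]
        self.adv_buf[path_slice] = discount_cumsum(deltas, self.gamma * self.lam)
        self.ret_buf[path_slice] = discount_cumsum(rews, self.gamma)[:-1]
        self.path_start_idx = self.ptr

    def get(self):
        # Return all data (advantages normalized) in the order of all_phs; reset buffer.
        assert self.ptr == self.max_size
        self.ptr, self.path_start_idx = 0, 0
        adv_mean, adv_std = self.adv_buf.mean(), self.adv_buf.std()
        self.adv_buf = (self.adv_buf - adv_mean) / (adv_std + 1e-8)
        return [self.obs_buf, self.act_buf, self.adv_buf, self.ret_buf, self.logp_buf]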
def __init__(self, observation_space, action_space, ac_kwargs, gamma=0.99, alpha=0.2,
             lr=1e-3, polyak=0.995):
    self.gamma = gamma
    self.alpha = alpha
    self.lr = lr
    self.polyak = polyak

    # Actor-critic and frozen target networks
    self.ac = core.MLPActorCritic(observation_space, action_space, **ac_kwargs)
    self.ac_targ = deepcopy(self.ac)
    self.ac.to(device)
    self.ac_targ.to(device)
    for p in self.ac_targ.parameters():
        p.requires_grad = False

    # Both Q-networks are updated through one optimizer
    self.q_params = itertools.chain(self.ac.q1.parameters(), self.ac.q2.parameters())

    # Learned dynamics model
    self.dynam = dynam.MLPModel(observation_space.shape[0], action_space.shape[0])
    self.dynam.to(device)

    # Count variables
    var_counts = tuple(core.count_vars(module)
                       for module in [self.ac.pi, self.ac.q1, self.ac.q2])
    print('\nInitial parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts)

    # Optimizers
    self.pi_optimizer = Adam(self.ac.pi.parameters(), lr=self.lr)
    self.q_optimizer = Adam(self.q_params, lr=self.lr)
    self.m_optimizer = Adam(self.dynam.parameters(), lr=self.lr * 1.0)
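# For reference: a minimal, hedged sketch of how the frozen target network and
# the stored polyak coefficient above are typically used after each gradient
# step (Spinning-Up-style soft update). This is an illustration, not this
# class's actual update method.
import torch


def soft_update_sketch(ac, ac_targ, polyak):
    with torch.no_grad():
        for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
            # theta_targ <- polyak * theta_targ + (1 - polyak) * theta
            p_targ.data.mul_(polyak)
            p_targ.data.add_((1 - polyak) * p.data)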
def iac(env_config, ac_type, ac_kwargs, rb_type, rb_kwargs, gamma, lr, polyak, batch_size, epochs, start_steps, steps_per_epoch, inc_ep, max_ep_len, test_max_ep_len, number_of_tests_per_epoch, q_pi_sample_size, z_dim, z_type, act_noise, test_without_state, logger_kwargs, seed): logger = EpochLogger(**logger_kwargs) configs = locals().copy() configs.pop("logger") logger.save_config(configs) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = make_env(env_config), make_env(env_config) obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] act_high = env.action_space.high # Inputs to computation graph x_ph, a_ph, z_ph, x2_ph, r_ph, d_ph = core.placeholders( obs_dim, act_dim, z_dim, obs_dim, None, None) actor_critic = core.get_iac_actor_critic(ac_type) # Main outputs from computation graph with tf.variable_scope('main'): pi, q1, q2, q1_pi, q2_pi, v = actor_critic(x_ph, a_ph, z_ph, **ac_kwargs) # Target networks with tf.variable_scope('target'): _, _, _, _, _, v_targ = actor_critic(x2_ph, a_ph, z_ph, **ac_kwargs) # Experience buffer RB = get_replay_buffer(rb_type) replay_buffer = RB(obs_dim, act_dim, **rb_kwargs) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main/v', 'main']) print( '\nNumber of parameters: \t pi: %d, \t q: %d, \t v: %d, \t total: %d\n' % var_counts) # Bellman backup for Q and V function q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * v_targ) min_q_pi = tf.minimum(q1_pi, q2_pi) v_backup = tf.stop_gradient(min_q_pi) # TD3 losses pi_loss = -tf.reduce_mean(q1_pi) q1_loss = 0.5 * tf.reduce_mean((q1 - q_backup)**2) q2_loss = 0.5 * tf.reduce_mean((q2 - q_backup)**2) v_loss = 0.5 * tf.reduce_mean((v - v_backup)**2) value_loss = q1_loss + q2_loss + v_loss # Separate train ops for pi, q policy_optimizer = tf.train.AdamOptimizer(learning_rate=lr) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) train_policy_op = policy_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) if ac_kwargs["pi_separate"]: train_policy_emb_op = policy_optimizer.minimize( pi_loss, var_list=get_vars('main/pi/emb')) train_policy_d_op = policy_optimizer.minimize( pi_loss, var_list=get_vars('main/pi/d')) train_value_op = value_optimizer.minimize(value_loss, var_list=get_vars('main/q') + get_vars('main/v')) # Polyak averaging for target variables target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) def sample_z(size): if z_type == "uniform": return np.random.random_sample(size=size) elif z_type == "gaussian": return np.random.normal(size=size) else: raise Exception("z_type error") def get_action(o, noise_scale): pi_a = sess.run(pi, feed_dict={ x_ph: o.reshape(1, -1), z_ph: sample_z((1, z_dim)) })[0] pi_a += noise_scale * np.random.randn(act_dim) pi_a = np.clip(pi_a, 0, 1) real_a = pi_a * act_high return pi_a, real_a def test_agent(n=10): test_actions = [] for j in range(n): test_actions_ep = [] o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == test_max_ep_len)): # Take deterministic actions at test time (noise_scale=0) if test_without_state: _, real_a = get_action(np.zeros(o.shape), 0) else: _, real_a = get_action(o, 0) 
test_actions_ep.append(real_a) o, r, d, _ = test_env.step(real_a) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) test_actions.append(test_actions_ep) return test_actions start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs rewards = [] rets = [] test_rets = [] max_ret = None # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy (with some noise, via act_noise). """ if t > start_steps: pi_a, real_a = get_action(o, act_noise) else: pi_a, real_a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(real_a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, pi_a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): for _ in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } feed_dict[z_ph] = sample_z((batch_size, z_dim)) # Policy Learning update for key in feed_dict: feed_dict[key] = np.repeat(feed_dict[key], q_pi_sample_size, axis=0) feed_dict[z_ph] = sample_z( (batch_size * q_pi_sample_size, z_dim)) if ac_kwargs["pi_separate"]: if len(rewards) % 2 == 0: outs = sess.run([pi_loss, train_policy_emb_op], feed_dict) else: outs = sess.run([pi_loss, train_policy_d_op], feed_dict) else: outs = sess.run([pi_loss, train_policy_op], feed_dict) logger.store(LossPi=outs[0]) # Q-learning update outs = sess.run([q1_loss, v_loss, q1, v, train_value_op], feed_dict) logger.store(LossQ=outs[0], LossV=outs[1], ValueQ=outs[2], ValueV=outs[3]) logger.store(EpRet=ep_ret, EpLen=ep_len) rewards.append(ep_ret) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # Test the performance of the deterministic version of the agent. test_actions = test_agent(number_of_tests_per_epoch) # Log info about epoch logger.log_tabular('Epoch', epoch) ret = logger.log_tabular('EpRet', average_only=True)[0] test_ret = logger.log_tabular('TestEpRet', average_only=True)[0] logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('ValueQ', average_only=True) logger.log_tabular('ValueV', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() rets.append(ret) test_rets.append(test_ret) if max_ret is None or test_ret > max_ret: max_ret = test_ret best_test_actions = test_actions max_ep_len += inc_ep sess.run(target_update, feed_dict) logger.save_state( { "rewards": rewards, "best_test_actions": best_test_actions, "rets": rets, "test_rets": test_rets, "max_ret": max_ret }, None) util.plot_actions(best_test_actions, act_high, logger.output_dir + '/best_test_actions.png') logger.log("max ret: %f" % max_ret)
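# The replay buffer class returned by get_replay_buffer(rb_type) is not shown
# in this file. The sketch below only illustrates the interface the training
# loop above relies on -- store(o, a, r, o2, d) and sample_batch(batch_size)
# returning a dict of numpy arrays -- and is not the original implementation.
import numpy as np


class SimpleReplayBuffer:
    def __init__(self, obs_dim, act_dim, size=int(1e6)):
        self.obs1_buf = np.zeros((size, obs_dim), dtype=np.float32)
        self.obs2_buf = np.zeros((size, obs_dim), dtype=np.float32)
        self.acts_buf = np.zeros((size, act_dim), dtype=np.float32)
        self.rews_buf = np.zeros(size, dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, obs, act, rew, next_obs, done):
        # Ring buffer: overwrite the oldest entries once full.
        self.obs1_buf[self.ptr] = obs
        self.obs2_buf[self.ptr] = next_obs
        self.acts_buf[self.ptr] = act
        self.rews_buf[self.ptr] = rew
        self.done_buf[self.ptr] = done
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample_batch(self, batch_size=32):
        idxs = np.random.randint(0, self.size, size=batch_size)
        return dict(obs1=self.obs1_buf[idxs],
                    obs2=self.obs2_buf[idxs],
                    acts=self.acts_buf[idxs],
                    rews=self.rews_buf[idxs],
                    done=self.done_buf[idxs])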
def ddpg(env_name, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, act_noise=0.1, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, test=False): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Deterministically computes actions | from policy given states. ``q`` (batch,) | Gives the current estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q_pi`` (batch,) | Gives the composition of ``q`` and | ``pi`` for states in ``x_ph``: | q(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to DDPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = gym.make(env_name), gym.make(env_name) obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) # Main outputs from computation graph with tf.variable_scope('main'): pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs) # Target networks with tf.variable_scope('target'): # Note that the action placeholder going to actor_critic here is #irrelevant, because we only need q_targ(s, pi_targ(s)). 
pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main']) print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' % var_counts) # Bellman backup for Q function backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ) # DDPG losses pi_loss = -tf.reduce_mean(q_pi) q_loss = tf.reduce_mean((q - backup)**2) # Separate train ops for pi, q pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q')) # Polyak averaging for target variables target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph }, outputs={ 'pi': pi, 'q': q }) saver = tf.train.Saver() save_path = './saved_model/' + env_name + '/test' def get_action(o, noise_scale): a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0] a += noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) def test_agent(n=10): for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, _ = test_env.step(get_action(o, 0)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) def save(saver, sess): if not os.path.exists('./saved_model/' + env_name): os.mkdir('./saved_model/' + env_name) ckpt_path = saver.save(sess, save_path) #print('Save ckpt file: {}'.format(ckpt_path)) def load(saver, sess): if os.path.exists('./saved_model/' + env_name): saver.restore(sess, save_path) print('Load model complete.') else: print('There is no saved model.') if test is False: start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy (with some noise, via act_noise). """ if t > start_steps: a = get_action(o, act_noise) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all DDPG updates at the end of the trajectory, in accordance with tuning done by TD3 paper authors. 
""" for _ in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } # Q-learning update outs = sess.run([q_loss, q, train_q_op], feed_dict) logger.store(LossQ=outs[0], QVals=outs[1]) # Policy update outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict) logger.store(LossPi=outs[0]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): #logger.save_state({'env': env}, None) save(saver, sess) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('QVals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() #save(saver, sess) else: load(saver, sess) test_logger = EpochLogger() o, r, d, ep_ret, ep_len, n = env.reset(), 0, False, 0, 0, 0 num_episodes = 100 render = True max_ep_len = 0 while n < num_episodes: if render: env.render() time.sleep(1e-3) a = get_action(o, 0) o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 if d or (ep_len == max_ep_len): test_logger.store(EpRet=ep_ret, EpLen=ep_len) print('Episode %d \t EpRet %.3f \t EpLen %d' % (n, ep_ret, ep_len)) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 n += 1 test_logger.log_tabular('EpRet', with_min_and_max=True) test_logger.log_tabular('EpLen', average_only=True) test_logger.dump_tabular()
def asac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=200, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=5e-4, alpha_start=0.2, batch_size=100, start_steps=10000, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, loss_threshold=0.0001, delta=0.02, sample_step=2000): alpha = Alpha(alpha_start=alpha_start, delta=delta) alpha_t = alpha() logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph #x_ph, a_ph, x2_ph, r_ph, d_ph, ret_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None, None) x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) alpha_ph = core.scale_holder() # Main outputs from computation graph #R, R_next = return_estimate(x_ph, x2_ph, **ac_kwargs) with tf.variable_scope('main'): mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v, Q, Q_pi, R = actor_critic(x_ph, a_ph, **ac_kwargs) # Target value network with tf.variable_scope('target'): _,_,_,_,_,_,_,v_targ, _, _, R_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main/Q', 'main/R', 'main']) print(('\nNumber of parameters: \t pi: %d, \t' + \ 'q1: %d, \t q2: %d, \t v: %d, \t Q: %d, \t R: %d, \t total: %d\n')%var_counts) # Min Double-Q: min_q_pi = tf.minimum(q1_pi, q2_pi) # Targets for Q and V regression q_backup = tf.stop_gradient(r_ph + gamma*(1 - d_ph)*v_targ) v_backup = tf.stop_gradient(min_q_pi - alpha_ph *logp_pi) Q_backup = tf.stop_gradient(r_ph + gamma*(1 - d_ph)*R_targ) R_backup = tf.stop_gradient(Q_pi) adv = Q_pi - R pi_loss = tf.reduce_mean(alpha_ph * logp_pi - q1_pi) q1_loss = 0.5 * tf.reduce_mean((q_backup - q1) ** 2) q2_loss = 0.5 * tf.reduce_mean((q_backup - q2) ** 2) v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2) Q_loss = 0.5*tf.reduce_mean((Q_backup - Q)**2) R_loss = 0.5*tf.reduce_mean((R_backup - R)**2) value_loss = q1_loss + q2_loss + v_loss + Q_loss + R_loss # Policy train op # (has to be separate from value train op, because q1_pi appears in pi_loss) pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) value_params = get_vars('main/q') + get_vars('main/v') + get_vars('main/Q') + get_vars('main/R') with tf.control_dependencies([train_pi_op]): train_value_op = value_optimizer.minimize(value_loss, var_list=value_params) """ R_optimizer = tf.train.AdamOptimizer(learning_rate=lr) train_R_op = R_optimizer.minimize(R_loss, var_list=get_vars('R')) """ # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), 
get_vars('target'))]) # All ops to call during one training step step_ops = [pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, train_pi_op, train_value_op, target_update, R_loss, Q_loss] # Initializing targets to match main variables target_init = tf.group([tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) config = tf.ConfigProto(inter_op_parallelism_threads=30,intra_op_parallelism_threads=5) config.gpu_options.allow_growth = True sess = tf.Session(config=config) sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2, 'v': v, 'Q': Q, 'R': R}) def get_action(o, deterministic=False): act_op = mu if deterministic else pi return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)}) def test_agent(n=10): global sess, mu, pi, q1, q2, q1_pi, q2_pi for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 ret_est = sess.run(R, feed_dict={x_ph: [o]})[0] total_steps = steps_per_epoch * epochs counter = 0 ret_epi = [] obs_epi = [] loss_old = 10000 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy. """ if t > start_steps: a = get_action(o) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. 
""" for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = {x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], alpha_ph: alpha_t } outs = sess.run(step_ops, feed_dict) logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], LossV=outs[3], Q1Vals=outs[4], Q2Vals=outs[5], VVals=outs[6], LogPi=outs[7], LossR=outs[11]) counter += 1 logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 ret_est = sess.run(R, feed_dict={x_ph: [o]})[0] logger.store(RetEst=ret_est) if counter >= 1000: loss_new, _ = logger.get_stats('LossPi') counter = 0 if (loss_old - loss_new)/np.absolute(loss_old) < loss_threshold and t > start_steps: rho_s = np.zeros([sample_step, obs_dim], dtype=np.float32) rho_ptr = 0 for sample_t in range(sample_step): a = get_action(o) o2, r, d, _ = env.step(a) ep_len += 1 d = False if ep_len == max_ep_len else d rho_s[rho_ptr] = o o = o2 if d or (ep_len == max_ep_len): o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 advantages = sess.run(adv, feed_dict={x_ph: rho_s}) alpha.update_alpha(advantages) #alpha.update_alpha(rho_q-rho_v) alpha_t = alpha() print(alpha_t) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 loss_old = 10000 else: loss_old = loss_new # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EntCoeff', alpha_t) logger.log_tabular('RetEst', average_only=True) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('LossR', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, act_noise=0.1, target_noise=0.2, noise_clip=0.5, policy_delay=2, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Deterministically computes actions | from policy given states. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to TD3. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) target_noise (float): Stddev for smoothing noise added to target policy. noise_clip (float): Limit for absolute value of target policy smoothing noise. policy_delay (int): Policy will only be updated once every policy_delay times for each update of the Q-networks. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! 
act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) # Main outputs from computation graph with tf.variable_scope('main'): pi, q1, q2, q1_pi = actor_critic(x_ph, a_ph, **ac_kwargs) # Target policy network with tf.variable_scope('target'): pi_targ, _, _, _ = actor_critic(x2_ph, a_ph, **ac_kwargs) # Target Q networks with tf.variable_scope('target', reuse=True): # Target policy smoothing, by adding clipped noise to target actions epsilon = tf.random_normal(tf.shape(pi_targ), stddev=target_noise) epsilon = tf.clip_by_value(epsilon, -noise_clip, noise_clip) a2 = pi_targ + epsilon a2 = tf.clip_by_value(a2, -act_limit, act_limit) # Target Q-values, using action from target policy _, q1_targ, q2_targ, _ = actor_critic(x2_ph, a2, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main']) print( '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n' % var_counts) # Bellman backup for Q functions, using Clipped Double-Q targets min_q_targ = tf.minimum(q1_targ, q2_targ) backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * min_q_targ) # TD3 losses pi_loss = -tf.reduce_mean(q1_pi) q1_loss = tf.reduce_mean((q1 - backup)**2) q2_loss = tf.reduce_mean((q2 - backup)**2) q_loss = q1_loss + q2_loss # Separate train ops for pi, q pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q')) # Polyak averaging for target variables target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph }, outputs={ 'pi': pi, 'q1': q1, 'q2': q2 }) def get_action(o, noise_scale): a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0] a += noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) def test_agent(n=10): for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, _ = test_env.step(get_action(o, 0)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy (with some noise, via act_noise). 
""" if t > start_steps: a = get_action(o, act_noise) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all TD3 updates at the end of the trajectory (in accordance with source code of TD3 published by original authors). """ for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } q_step_ops = [q_loss, q1, q2, train_q_op] outs = sess.run(q_step_ops, feed_dict) logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2]) if j % policy_delay == 0: # Delayed policy update outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict) logger.store(LossPi=outs[0]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
def ppo(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. 
(Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ # Special function to avoid certain slowdowns from PyTorch + MPI combo. setup_pytorch_for_mpi() # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Random seed seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Create actor-critic module ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) # Sync params across processes sync_params(ac) # Count variables var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n'%var_counts) # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Set up function for computing PPO policy loss def compute_loss_pi(data): obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data['logp'] # Policy loss pi, logp = ac.pi(obs, act) ratio = torch.exp(logp - logp_old) clip_adv = torch.clamp(ratio, 1-clip_ratio, 1+clip_ratio) * adv loss_pi = -(torch.min(ratio * adv, clip_adv)).mean() # Useful extra info approx_kl = (logp_old - logp).mean().item() ent = pi.entropy().mean().item() clipped = ratio.gt(1+clip_ratio) | ratio.lt(1-clip_ratio) clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item() pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac) return loss_pi, pi_info # Set up function for computing value loss def compute_loss_v(data): obs, ret = data['obs'], data['ret'] return ((ac.v(obs) - ret)**2).mean() # Set up optimizers for policy and value function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(): data = buf.get() pi_l_old, pi_info_old = compute_loss_pi(data) pi_l_old = pi_l_old.item() v_l_old = compute_loss_v(data).item() # Train policy with multiple steps of gradient descent for i in range(train_pi_iters): pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) kl = mpi_avg(pi_info['kl']) if kl > 1.5 * target_kl: logger.log('Early stopping at step %d due to reaching max kl.'%i) break loss_pi.backward() mpi_avg_grads(ac.pi) # average grads across MPI processes pi_optimizer.step() logger.store(StopIter=i) # Value function learning for i in range(train_v_iters): vf_optimizer.zero_grad() loss_v = compute_loss_v(data) loss_v.backward() mpi_avg_grads(ac.v) # average grads across MPI processes vf_optimizer.step() # Log changes from update kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf'] logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old)) # Prepare for interaction with environment start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v, logp = 
ac.step(torch.as_tensor(o, dtype=torch.float32)) next_o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, v, logp) logger.store(VVals=v) # Update obs (critical!) o = next_o timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t==local_steps_per_epoch-1 if terminal or epoch_ended: if epoch_ended and not(terminal): print('Warning: trajectory cut off by epoch at %d steps.'%ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32)) else: v = 0 buf.finish_path(v) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs-1): logger.save_state({'env': env}, None) # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular()
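# ac.step(obs) above returns numpy (action, value, logp) with no gradients
# attached. A minimal sketch of what a Spinning-Up-style step usually does is
# given below; it is an assumption about the actor-critic interface, not the
# project's exact core module.
import torch


def step_sketch(ac, obs):
    # obs: torch.FloatTensor of shape (obs_dim,) or (batch, obs_dim)
    with torch.no_grad():
        dist = ac.pi._distribution(obs)                       # policy distribution
        a = dist.sample()                                     # sampled action
        logp_a = ac.pi._log_prob_from_distribution(dist, a)   # log prob of that action
        v = ac.v(obs)                                         # value estimate
    return a.numpy(), v.numpy(), logp_a.numpy()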
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, batch_size=250000, n=100, epochs=100, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=1000, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape sequence_length = n * max_ep_len trials = batch_size // sequence_length # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph # x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) # rew_ph, adv_ph, ret_ph, logp_old_ph = core.placeholders(1, None, None, None) x_ph = tf.placeholder(dtype=tf.int32, shape=(None, sequence_length), name='x_ph') t_ph = tf.placeholder(dtype=tf.int32, shape=(None, sequence_length), name='t_ph') a_ph = tf.placeholder(dtype=tf.int32, shape=(None, sequence_length), name='a_ph') r_ph = tf.placeholder(dtype=tf.float32, shape=(None, sequence_length), name='r_ph') # input_ph = tf.placeholder(dtype=tf.float32, shape=(None, None, n, None), name='rew_ph') adv_ph = tf.placeholder(dtype=tf.float32, shape=(None), name='adv_ph') ret_ph = tf.placeholder(dtype=tf.float32, shape=(None), name='ret_ph') logp_old_ph = tf.placeholder(dtype=tf.float32, shape=(None), name='logp_old_ph') # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, t_ph, a_ph, r_ph, sequence_length, env.action_space.n, env.observation_space.shape[0]) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, t_ph, a_ph, r_ph, adv_ph, ret_ph, logp_old_ph] # for ph in all_phs: # print(ph.shape) # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer buf = PPOBuffer(obs_dim, act_dim, batch_size, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving model_inputs = {'x': x_ph, 't': t_ph, 'a': a_ph, 'r': r_ph} model_outputs = {'pi': pi} logger.setup_tf_saver(sess, inputs=model_inputs, outputs=model_outputs) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} # inputs[a_ph] = np.tril(np.transpose(np.repeat(inputs[a_ph], n).reshape(trials, n, n), [0, 2, 1])) # inputs[rew_ph] = np.tril(np.transpose(np.repeat(inputs[rew_ph], n).reshape(trials, n, n), [0, 2, 1])) # print(inputs[x_ph]) # print(inputs[t_ph]) # print(inputs[a_ph]) # print(inputs[r_ph]) inputs[x_ph] = inputs[x_ph].reshape(trials, sequence_length) inputs[t_ph] = inputs[t_ph].reshape(trials, sequence_length) inputs[a_ph] = inputs[a_ph].reshape(trials, sequence_length) inputs[r_ph] = inputs[r_ph].reshape(trials, 
sequence_length) # print('x:', inputs[x_ph]) # print('t:', inputs[t_ph]) # print('a:', inputs[a_ph]) # print('r:', inputs[r_ph]) # print('ret:', inputs[ret_ph]) # print('adv:', inputs[adv_ph]) # print('logp_old:', inputs[logp_old_ph]) pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) # kl = mpi_avg(kl) # if kl > 1.5 * target_kl: # logger.log('Early stopping at step %d due to reaching max kl.'%i) # break logger.store(StopIter=i) for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() save_itr = 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for trail in range(trials): print('trial:', trail) # last_a = np.zeros(n).reshape(1, n) # last_r = np.zeros(n).reshape(1, n) o_deque = deque(sequence_length * [0], sequence_length) t_deque = deque(sequence_length * [0], sequence_length) last_a = deque(sequence_length * [0], sequence_length) last_r = deque(sequence_length * [0], sequence_length) means = env.sample_tasks(1)[0] # print('task means:', means) action_dict = defaultdict(int) total_reward = 0 env.reset_task(means) o, r, d, ep_ret, ep_len = env.reset(), np.zeros(1), False, 0, 0 for episode in range(sequence_length): # print('episode:', episode) # print('o:', o_deque) # print('d:', t_deque) # print('a:', last_a) # print('r:', last_r) a, v_t, logp_t = sess.run( get_action_ops, feed_dict={ x_ph: np.array(o_deque).reshape(1, sequence_length), t_ph: np.array(t_deque).reshape(1, sequence_length), a_ph: np.array(last_a).reshape(1, sequence_length), r_ph: np.array(last_r).reshape(1, sequence_length) }) # print("a shape:", a.shape) # print("v_t shape:", v_t.shape) # print("logp_t shape:", logp_t.shape) # choosen_a = a[episode, 0] # choosen_v_t = v_t[0, episode] # choosen_logp_t = logp_t[episode] # print('a:', a) choosen_a = a[-1] choosen_v_t = v_t[-1] choosen_logp_t = logp_t[-1] action_dict[choosen_a] += 1 o, r, d, _ = env.step(choosen_a) ep_ret += r ep_len += 1 t = ep_len == max_ep_len total_reward += r o_deque.append(o) t_deque.append(int(d)) last_a.append(choosen_a) last_r.append(r) # save and log buf.store(o, int(t), choosen_a, r, choosen_v_t, choosen_logp_t) logger.store(VVals=v_t) terminal = d or t if terminal or (episode == sequence_length - 1): if not (terminal): print( 'Warning: trajectory cut off by epoch at %d steps.' 
% ep_len) # if trajectory didn't reach terminal state, bootstrap value target if d: last_val = r else: last_val = sess.run( v, feed_dict={ x_ph: np.array(o_deque).reshape(1, sequence_length), t_ph: np.array(t_deque).reshape(1, sequence_length), a_ph: np.array(last_a).reshape(1, sequence_length), r_ph: np.array(last_r).reshape(1, sequence_length) }) last_val = last_val[-1] buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 o_deque[-1] = 0 t_deque[-1] = 0 last_a[-1] = 0 last_r[-1] = 0 print(action_dict) print('average reward:', total_reward / sequence_length) # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, save_itr) save_itr += 1 # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * batch_size) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
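# The recurrent policy above consumes fixed-length histories: at every step the
# deques hold the last `sequence_length` observations, done flags, actions and
# rewards, and update() reshapes the flat buffer into (trials, sequence_length)
# batches. A tiny illustration of that reshaping with toy numbers:
import numpy as np

trials, sequence_length = 2, 4
flat_obs = np.arange(trials * sequence_length)       # as stored by the buffer: shape (8,)
batched = flat_obs.reshape(trials, sequence_length)  # shape (2, 4), one row per trial
# batched -> [[0 1 2 3],
#             [4 5 6 7]]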
def vpg(env, hidden_sizes, seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, pi_lr=3e-4, vf_lr=1e-3, train_v_iters=80, lam=0.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10): """ Vanilla Policy Gradient (with GAE-Lambda for advantage estimation) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to VPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ # Special function to avoid certain slowdowns from PyTorch + MPI combo. 
setup_pytorch_for_mpi() # logger logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # random seeds seed += 1000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) # environment obs_dim = env.observation_space.shape act_dim = env.action_space.shape # build the actor-critic model ac = core.MLPActorCritic(env.observation_space, env.action_space, hidden_sizes) # Sync params across processes sync_params(ac) # Count variables var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # Set up experience buffer. With multiple processes, each process keeps a buffer of length local_steps_per_epoch local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = VPGBuffer(obs_dim, act_dim, size=local_steps_per_epoch, gamma=gamma, lam=lam) # optimizer pi_optimizer = torch.optim.Adam(ac.pi.parameters(), lr=pi_lr) vf_optimizer = torch.optim.Adam(ac.v.parameters(), lr=vf_lr) # setup model saving # logger.setup_pytorch_for_mpi() # interaction start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v, logp = ac.step(torch.as_tensor( o, dtype=torch.float32)) # (act_dim,), (), () next_o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # save buf.store(o, a, r, v, logp) logger.store(VVals=v) # update obs o = next_o timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t == local_steps_per_epoch - 1 if terminal or epoch_ended: # timeout=True, terminal=True, epoch_ended=True/False if epoch_ended and not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32)) else: v = 0 buf.finish_path(v) if terminal: logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # reinitialize episode state # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform VPG update! update(buf, ac, train_v_iters, pi_optimizer, vf_optimizer, logger) # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
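The loop above defers all gradient steps to update(buf, ac, train_v_iters, pi_optimizer, vf_optimizer, logger), which is not shown in this excerpt. A plausible minimal version, assuming buf.get() returns a dict of torch tensors keyed obs, act, ret, adv, logp, and that mpi_avg_grads comes from the same MPI utilities as setup_pytorch_for_mpi and sync_params used above:

def update(buf, ac, train_v_iters, pi_optimizer, vf_optimizer, logger):
    data = buf.get()
    obs, act, adv = data['obs'], data['act'], data['adv']
    ret, logp_old = data['ret'], data['logp']

    def loss_pi_fn():
        pi, logp = ac.pi(obs, act)
        return -(logp * adv).mean(), logp, pi

    def loss_v_fn():
        return ((ac.v(obs) - ret) ** 2).mean()

    pi_l_old, _, _ = loss_pi_fn()
    v_l_old = loss_v_fn()

    # One policy-gradient step: ascend E[logp(a|s) * advantage]
    pi_optimizer.zero_grad()
    loss_pi, _, _ = loss_pi_fn()
    loss_pi.backward()
    mpi_avg_grads(ac.pi)  # average gradients across MPI processes
    pi_optimizer.step()

    # Several regression steps on the value function
    for _ in range(train_v_iters):
        vf_optimizer.zero_grad()
        loss_v = loss_v_fn()
        loss_v.backward()
        mpi_avg_grads(ac.v)
        vf_optimizer.step()

    # Diagnostics after the update
    pi_l_new, logp, pi = loss_pi_fn()
    v_l_new = loss_v_fn()
    logger.store(LossPi=pi_l_old.item(), LossV=v_l_old.item(),
                 KL=(logp_old - logp).mean().item(),
                 Entropy=pi.entropy().mean().item(),
                 DeltaLossPi=(pi_l_new - pi_l_old).item(),
                 DeltaLossV=(v_l_new - v_l_old).item())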
# Instantiate Actor Critic Neural Net and Target Network net = core.MLPActorCritic(env.observation_space, env.action_space) targ_net = deepcopy(net) # Freeze target network for p in targ_net.parameters(): p.requires_grad = False # Experience / Memory Buffer replay_buffer = core.ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=int(1e6)) # Count number of parameters in network param_counts = tuple(core.count_vars(module) for module in [net.pi, net.q]) logger.log('\nNumber of Parameters: \t pi: %d, \t q: %d\n' % param_counts) # Set up optimization functions for policy and q-function pi_optimizer = Adam(net.pi.parameters(), lr=args.pi_lr) q_optimizer = Adam(net.q.parameters(), lr=args.q_lr) ######################################################################################################################## """ Currently need functions here as they rely on cmd line args from run file """ def update(data, net=net, targ_net=targ_net, gamma=args.gamma, polyak=args.polyak, q_optimizer=q_optimizer, pi_optimizer=pi_optimizer): """
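The body of this update function is cut off here before the next definition begins. For context, a minimal DDPG-style update consistent with the surrounding names (net.q, net.pi, targ_net, and the freeze/polyak pattern used throughout this document) could look roughly like the sketch below; the original may differ in its exact loss terms.

import torch

def update(data, net, targ_net, gamma, polyak, q_optimizer, pi_optimizer):
    o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done']

    # Q step: regress Q(s, a) onto the Bellman backup from the target network
    q_optimizer.zero_grad()
    q = net.q(o, a)
    with torch.no_grad():
        backup = r + gamma * (1 - d) * targ_net.q(o2, targ_net.pi(o2))
    loss_q = ((q - backup) ** 2).mean()
    loss_q.backward()
    q_optimizer.step()

    # Policy step: maximize Q(s, pi(s)) with the Q-network frozen
    for p in net.q.parameters():
        p.requires_grad = False
    pi_optimizer.zero_grad()
    loss_pi = -net.q(o, net.pi(o)).mean()
    loss_pi.backward()
    pi_optimizer.step()
    for p in net.q.parameters():
        p.requires_grad = True

    # Polyak averaging of target-network parameters
    with torch.no_grad():
        for p, p_targ in zip(net.parameters(), targ_net.parameters()):
            p_targ.data.mul_(polyak)
            p_targ.data.add_((1 - polyak) * p.data)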
def sac(args, steps_per_epoch=1500, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=3e-4, batch_size=128, start_steps=1000, update_after=1000, update_every=1, num_test_episodes=10, max_ep_len=150, logger_kwargs=dict(), save_freq=1): logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed) torch.set_num_threads(torch.get_num_threads()) actor_critic = core.MLPActorCritic ac_kwargs = dict(hidden_sizes=[args.hid] * args.l) gamma = args.gamma seed = args.seed epochs = args.epochs logger_tensor = Logger(logdir=args.logdir, run_name="{}-{}".format(args.model_name, time.ctime())) logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) env = ML1.get_train_tasks('reach-v1') # Create an environment with task `pick_place` tasks = env.sample_tasks(1) # Sample a task (in this case, a goal variation) env.set_task(tasks[0]) # Set task test_env = ML1.get_train_tasks('reach-v1') # Create an environment with task `pick_place` tasks = env.sample_tasks(1) # Sample a task (in this case, a goal variation) test_env.set_task(tasks[0]) # Set task obs_dim = env.observation_space.shape act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Create actor-critic module and target networks ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) ac_targ = deepcopy(ac) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters()) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables (protip: try to get a feel for how different size networks behave!) 
var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2]) logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts) # Set up function for computing SAC Q-losses def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done'] q1 = ac.q1(o, a) q2 = ac.q2(o, a) # Bellman backup for Q functions with torch.no_grad(): # Target actions come from *current* policy a2, logp_a2 = ac.pi(o2) # Target Q-values q1_pi_targ = ac_targ.q1(o2, a2) q2_pi_targ = ac_targ.q2(o2, a2) q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) backup = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2) # MSE loss against Bellman backup loss_q1 = ((q1 - backup) ** 2).mean() loss_q2 = ((q2 - backup) ** 2).mean() loss_q = loss_q1 + loss_q2 # Useful info for logging q_info = dict(Q1Vals=q1.detach().numpy(), Q2Vals=q2.detach().numpy()) return loss_q, q_info # Set up function for computing SAC pi loss def compute_loss_pi(data): o = data['obs'] pi, logp_pi = ac.pi(o) q1_pi = ac.q1(o, pi) q2_pi = ac.q2(o, pi) q_pi = torch.min(q1_pi, q2_pi) # Entropy-regularized policy loss loss_pi = (alpha * logp_pi - q_pi).mean() # Useful info for logging pi_info = dict(LogPi=logp_pi.detach().numpy()) return loss_pi, pi_info # Set up optimizers for policy and q-function pi_optimizer = Adam(ac.pi.parameters(), lr=3e-4) q_optimizer = Adam(q_params, lr=3e-4) # Set up model saving logger.setup_pytorch_saver(ac) def update(data, logger_tensor, t): # First run one gradient descent step for Q1 and Q2 q_optimizer.zero_grad() loss_q, q_info = compute_loss_q(data) loss_q.backward() q_optimizer.step() # Record things logger.store(LossQ=loss_q.item(), **q_info) logger_tensor.log_value(t, loss_q.item(), "loss q") # Freeze Q-networks so you don't waste computational effort # computing gradients for them during the policy learning step. for p in q_params: p.requires_grad = False # Next run one gradient descent step for pi. pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) loss_pi.backward() pi_optimizer.step() # Unfreeze Q-networks so you can optimize it at next DDPG step. for p in q_params: p.requires_grad = True # Record things logger.store(LossPi=loss_pi.item(), **pi_info) logger_tensor.log_value(t, loss_pi.item(), "loss pi") # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) def get_action(o, deterministic=False): return ac.act(torch.as_tensor(o, dtype=torch.float32), deterministic) def test_agent(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) logger_tensor.log_value(t, ep_ret, "test ep reward") logger_tensor.log_value(t, ep_len, "test ep length") # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. 
Afterwards, # use the learned policy. if t > start_steps: a = get_action(o) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): logger_tensor.log_value(t, ep_ret, "reward") logging.info("> total_steps={} | reward={}".format(t, ep_ret)) logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Update handling if t >= update_after and t % update_every == 0: for j in range(update_every): batch = replay_buffer.sample_batch(batch_size) update(data=batch, logger_tensor = logger_tensor, t = t) # End of epoch handling if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger_tensor.log_value(t, epoch, "epoch") logger.dump_tabular(logger_tensor=logger_tensor,epoch = epoch) ac.save(args.save_model_dir, args.model_name)
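Both SAC loops above sample minibatches from a ReplayBuffer whose implementation is not included in this excerpt. A minimal sketch under the interface these loops assume (store per transition, sample_batch returning a dict of torch tensors keyed obs, obs2, act, rew, done):

import numpy as np
import torch

def combined_shape(length, shape=None):
    if shape is None:
        return (length,)
    return (length, shape) if np.isscalar(shape) else (length, *shape)

class ReplayBuffer:
    # A simple FIFO experience replay buffer.
    def __init__(self, obs_dim, act_dim, size):
        self.obs_buf = np.zeros(combined_shape(size, obs_dim), dtype=np.float32)
        self.obs2_buf = np.zeros(combined_shape(size, obs_dim), dtype=np.float32)
        self.act_buf = np.zeros(combined_shape(size, act_dim), dtype=np.float32)
        self.rew_buf = np.zeros(size, dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, obs, act, rew, next_obs, done):
        self.obs_buf[self.ptr] = obs
        self.obs2_buf[self.ptr] = next_obs
        self.act_buf[self.ptr] = act
        self.rew_buf[self.ptr] = rew
        self.done_buf[self.ptr] = done
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample_batch(self, batch_size=32):
        idxs = np.random.randint(0, self.size, size=batch_size)
        batch = dict(obs=self.obs_buf[idxs], obs2=self.obs2_buf[idxs],
                     act=self.act_buf[idxs], rew=self.rew_buf[idxs],
                     done=self.done_buf[idxs])
        return {k: torch.as_tensor(v, dtype=torch.float32) for k, v in batch.items()}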
def sac(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, update_after=1000, update_every=50, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Soft Actor-Critic (SAC) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, a ``q1`` module, and a ``q2`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q1`` and ``q2`` should accept a batch of observations and a batch of actions as inputs. When called, ``act``, ``q1``, and ``q2`` should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``q1`` (batch,) | Tensor containing one current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) ``q2`` (batch,) | Tensor containing the other current | estimate of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== Calling ``pi`` should return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``logp_pi`` (batch,) | Tensor containing log probabilities of | actions in ``a``. Importantly: gradients | should be able to flow back into ``a``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. 
max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ torch.manual_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Create actor-critic module and target networks ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) ac_targ = deepcopy(ac) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters()) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple( core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2]) # Set up function for computing SAC Q-losses def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] q1 = ac.q1(o, a) q2 = ac.q2(o, a) # Bellman backup for Q functions with torch.no_grad(): # Target actions come from *current* policy a2, logp_a2 = ac.pi(o2) # Target Q-values q1_pi_targ = ac_targ.q1(o2, a2) q2_pi_targ = ac_targ.q2(o2, a2) q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) backup = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2) # MSE loss against Bellman backup loss_q1 = ((q1 - backup)**2).mean() loss_q2 = ((q2 - backup)**2).mean() loss_q = loss_q1 + loss_q2 # Useful info for logging q_info = dict(Q1Vals=q1.detach().numpy(), Q2Vals=q2.detach().numpy()) return loss_q, q_info # Set up function for computing SAC pi loss def compute_loss_pi(data): o = data['obs'] pi, logp_pi = ac.pi(o) q1_pi = ac.q1(o, pi) q2_pi = ac.q2(o, pi) q_pi = torch.min(q1_pi, q2_pi) # Entropy-regularized policy loss loss_pi = (alpha * logp_pi - q_pi).mean() # Useful info for logging pi_info = dict(LogPi=logp_pi.detach().numpy()) return loss_pi, pi_info # Set up optimizers for policy and q-function pi_optimizer = Adam(ac.pi.parameters(), lr=lr) q_optimizer = Adam(q_params, lr=lr) # Set up model saving def update(data): # First run one gradient descent step for Q1 and Q2 q_optimizer.zero_grad() loss_q, q_info = compute_loss_q(data) loss_q.backward() q_optimizer.step() # Record things # Freeze Q-networks so you don't waste computational effort # computing gradients for them during the policy learning step. for p in q_params: p.requires_grad = False # Next run one gradient descent step for pi. pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) loss_pi.backward() pi_optimizer.step() # Unfreeze Q-networks so you can optimize it at next DDPG step. for p in q_params: p.requires_grad = True # Record things # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. 
p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) def get_action(o, deterministic=False): return ac.act(torch.as_tensor(o, dtype=torch.float32), deterministic) def test_agent(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy. if t > start_steps: a = get_action(o) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): o, ep_ret, ep_len = env.reset(), 0, 0 # Update handling if t >= update_after and t % update_every == 0: for j in range(update_every): batch = replay_buffer.sample_batch(batch_size) update(data=batch) # End of epoch handling if (t + 1) % steps_per_epoch == 0: # Test the performance of the deterministic version of the agent. test_agent()
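The pi losses above require ac.pi(o) to return a sampled action together with its log-probability, with gradients flowing through the sample (reparameterization) and with the tanh squashing reflected in the log-prob. A sketch of such a policy head, assuming the usual squashed-Gaussian construction; the class and argument names here are illustrative, not necessarily what core.MLPActorCritic uses:

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal

LOG_STD_MIN, LOG_STD_MAX = -20, 2

class SquashedGaussianMLPActor(nn.Module):
    def __init__(self, obs_dim, act_dim, hidden, act_limit):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(obs_dim, hidden), nn.ReLU(),
                                 nn.Linear(hidden, hidden), nn.ReLU())
        self.mu_layer = nn.Linear(hidden, act_dim)
        self.log_std_layer = nn.Linear(hidden, act_dim)
        self.act_limit = act_limit

    def forward(self, obs, deterministic=False, with_logprob=True):
        h = self.net(obs)
        mu = self.mu_layer(h)
        std = torch.exp(torch.clamp(self.log_std_layer(h), LOG_STD_MIN, LOG_STD_MAX))
        dist = Normal(mu, std)
        # rsample keeps the action differentiable w.r.t. the policy parameters
        pi_action = mu if deterministic else dist.rsample()
        logp_pi = None
        if with_logprob:
            # Gaussian log-prob, corrected for the tanh squashing applied below
            logp_pi = dist.log_prob(pi_action).sum(axis=-1)
            logp_pi -= (2 * (np.log(2) - pi_action - F.softplus(-2 * pi_action))).sum(axis=-1)
        pi_action = self.act_limit * torch.tanh(pi_action)
        return pi_action, logp_pi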
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, trials_per_epoch=2500, steps_per_trial=100, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=1000, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph # x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) x_ph = tf.placeholder(dtype=tf.float32, shape=(None, None, 1), name='x_ph') a_ph = tf.placeholder(dtype=tf.int32, shape=(None, None), name='a_ph') # adv_ph, ret_ph, logp_old_ph, rew_ph = core.placeholders(None, None, None, 1) adv_ph = tf.placeholder(dtype=tf.float32, shape=(None, None), name='adv_ph') ret_ph = tf.placeholder(dtype=tf.float32, shape=(None, None), name='ret_ph') logp_old_ph = tf.placeholder(dtype=tf.float32, shape=(None, None), name='logp_old_ph') rew_ph = tf.placeholder(dtype=tf.float32, shape=(None, None, 1), name='rew_ph') pi_state_ph = tf.placeholder(dtype=tf.float32, shape=(None, NUM_GRU_UNITS), name='pi_state_ph') v_state_ph = tf.placeholder(dtype=tf.float32, shape=(None, NUM_GRU_UNITS), name='v_state_ph') # Initialize rnn states for pi and v # Main outputs from computation graph pi, logp, logp_pi, v, new_pi_state, new_v_state = actor_critic( x_ph, a_ph, rew_ph, pi_state_ph, v_state_ph, NUM_GRU_UNITS, action_space=env.action_space) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph, rew_ph] # Every step, get: action, value, and logprob and reward get_action_ops = [pi, v, logp_pi, new_pi_state, new_v_state] # Experience buffer steps_per_epoch = trials_per_epoch * steps_per_trial local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer( learning_rate=pi_lr).minimize(pi_loss - 0.01 * approx_ent) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) # tf.reset_default_graph() # restore_tf_graph(sess, '..//data//ppo//ppo_s0//simple_save') def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} inputs[pi_state_ph] = np.zeros((trials_per_epoch, NUM_GRU_UNITS)) inputs[v_state_ph] = np.zeros((trials_per_epoch, NUM_GRU_UNITS)) pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) print(pi_l_old, v_l_old) # Training for i in range(train_pi_iters): # print(f'pi:{i}') _, kl = sess.run([train_pi, approx_kl], 
feed_dict=inputs) # print(sess.run(pi_loss, feed_dict=inputs)) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' % i) break logger.store(StopIter=i) for _ in range(train_v_iters): # print(f'v:{_}') sess.run(train_v, feed_dict=inputs) # Log changes from update import datetime print(f'finish one batch training at {datetime.datetime.now()}') pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for trial in range(trials_per_epoch): print(f'trial: {trial}') old_a = np.array([0]).reshape(1, 1) old_r = np.array([0]).reshape((1, 1, 1)) means = env.sample_tasks(1)[0] action_dict = defaultdict(int) for i in range(env.action_space.n): action_dict[i] = 0 env.reset_task_simple(means) task_avg = 0.0 pi_state_t = np.zeros((1, NUM_GRU_UNITS)) v_state_t = np.zeros((1, NUM_GRU_UNITS)) for step in range(steps_per_trial): a, v_t, logp_t, pi_state_t, v_state_t = sess.run( get_action_ops, feed_dict={ x_ph: o.reshape(1, 1, -1), a_ph: old_a, rew_ph: old_r, pi_state_ph: pi_state_t, v_state_ph: v_state_t }) # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) try: o, r, d, _ = env.step(a[0][0]) except: print(a) raise AssertionError action_dict[a[0][0]] += 1 old_a = np.array(a).reshape(1, 1) old_r = np.array([r]).reshape(1, 1, 1) ep_ret += r task_avg += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (step == local_steps_per_epoch - 1): if not (terminal): print( 'Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # logger.log_tabular('Epoch', epoch) # logger.log_tabular('EpRet', with_min_and_max=True) # logger.log_tabular('Means', means) # logger.dump_tabular() print(f'avg in trial {trial}: {task_avg / steps_per_trial}') print(f'Means in trial {trial}: {means}') print(action_dict) # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # saved_path = saver.save(sess, f"/tmp/model_epoch{epoch}.ckpt") # print(f'Model saved in {saved_path}') # Perform PPO update! update() logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
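This RL2-style PPO threads GRU states through the rollout and conditions the policy on the previous action and reward. The recurrent actor_critic it calls is not shown; a rough TF1 sketch of how such a network could be wired for a discrete action space (the function name, layer choices, and the 'pi'/'v' scopes are assumptions chosen to match the variable counting above):

import tensorflow as tf

def rnn_actor_critic(x_ph, a_ph, rew_ph, pi_state_ph, v_state_ph, num_gru_units, action_space):
    n_act = action_space.n
    # Per-timestep input: observation, one-hot previous action, previous reward
    rnn_in = tf.concat([x_ph, tf.one_hot(a_ph, depth=n_act), rew_ph], axis=-1)

    with tf.variable_scope('pi'):
        cell = tf.nn.rnn_cell.GRUCell(num_gru_units)
        out, new_pi_state = tf.nn.dynamic_rnn(cell, rnn_in, initial_state=pi_state_ph)
        logits = tf.layers.dense(out, n_act)
        logp_all = tf.nn.log_softmax(logits)
        pi = tf.random.categorical(tf.reshape(logits, [-1, n_act]), 1)
        pi = tf.reshape(pi, tf.shape(a_ph))
        logp = tf.reduce_sum(tf.one_hot(a_ph, depth=n_act) * logp_all, axis=-1)
        logp_pi = tf.reduce_sum(tf.one_hot(pi, depth=n_act) * logp_all, axis=-1)

    with tf.variable_scope('v'):
        cell_v = tf.nn.rnn_cell.GRUCell(num_gru_units)
        out_v, new_v_state = tf.nn.dynamic_rnn(cell_v, rnn_in, initial_state=v_state_ph)
        v = tf.squeeze(tf.layers.dense(out_v, 1), axis=-1)

    return pi, logp, logp_pi, v, new_pi_state, new_v_state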
def ddpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, n_episodes=10000, replay_size=int(1e6), gamma=0.99, show_steps=50, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, act_noise=0.1, max_ep_len=200, logger_kwargs=dict(), save_freq=1): tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) # Main outputs from computation graph with tf.variable_scope('main'): pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs) # Target networks with tf.variable_scope('target'): # Note that the action placeholder going to actor_critic here is # irrelevant, because we only need q_targ(s, pi_targ(s)). pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main']) print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' % var_counts) # Bellman backup for Q function backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ) # DDPG losses pi_loss = -tf.reduce_mean(q_pi) q_loss = tf.reduce_mean((q - backup)**2) # Separate train ops for pi, q pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q')) # Polyak averaging for target variables target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) def get_action(o, noise_scale): a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0] a += noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) def test_agent(n=5): for j in range(n): o, r, d, ep_ret, ep_len, ep_cost = test_env.reset( ), 0, False, 0, 0, 0 while not (d or (ep_len == 5 * max_ep_len)): # Take deterministic actions at test time (noise_scale=0) test_env.render() a = get_action(o, 0) o, r, d, _, c = test_env.step(a + 0.5 * np.random.rand(), 1) ep_ret += (r - c) ep_len += 1 ep_cost += c test_env.close() print( "\n avg reward {} and episode length {} over {} trials, cost/step {}" .format(ep_ret / n, ep_len / n, n, ep_cost / ep_len)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 for t in range(start_steps): a = env.action_space.sample() o2, r, d, _, c = env.step(a, 1) r -= c replay_buffer.store(o, a, r, o2, d) o = o2 if d: o = env.reset() fails = 0 # Main loop: collect experience in env and update/log each epoch for t in itertools.count(): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. 
Afterwards, use the learned policy (with some noise, via act_noise). """ a = get_action(o, act_noise) # Step the env o2, r, d, _, c = env.step(a, 1) r -= c ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 print("\rSteps {:3}, fails {}".format(t, fails), end="") if t % max_ep_len == 0: """ Perform all DDPG updates at the end of the trajectory, in accordance with tuning done by TD3 paper authors. """ for _ in range(max_ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } # Q-learning update outs = sess.run([q_loss, q, train_q_op], feed_dict) # Policy update outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict) if d: o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 fails += 1 # End of epoch wrap-up if t > 0 and t % (show_steps * max_ep_len) == 0: # Test the performance of the deterministic version of the agent. test_agent()
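The TF1 implementations in this document repeatedly use get_vars and core.count_vars to collect per-scope variables for the optimizers, the polyak target updates, and the parameter counts. A minimal version of these helpers, assuming the usual scope-prefix convention:

import numpy as np
import tensorflow as tf

def get_vars(scope=''):
    # All trainable variables whose names contain the given scope prefix
    return [x for x in tf.trainable_variables() if scope in x.name]

def count_vars(scope=''):
    # Total number of scalar parameters under the scope
    return sum(int(np.prod(v.shape.as_list())) for v in get_vars(scope))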
def rbiflow(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=256, start_steps=1000, update_after=1000, update_every=50, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, eps=0.2, n_explore=32, device='cuda', n_samples=100, cmin=0.25, cmax=1.75, greed=0.01, rand=0.01): """ Rerouted Behavior Improvement (rbiflow) """ device = torch.device(device) logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Create actor-critic module and target networks ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs).to(device) ac_targ = deepcopy(ac) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters()) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size, device=device) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple( core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2]) logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts) def max_reroute(o): b, _ = o.shape o = repeat_and_reshape(o, n_samples) with torch.no_grad(): ai, _ = ac.pi(o) q1 = ac.q1(o, ai) q2 = ac.q2(o, ai) qi = torch.min(q1, q2).unsqueeze(-1) qi = qi.view(n_samples, b, 1) ai = ai.view(n_samples, b, act_dim) rank = torch.argsort(torch.argsort(qi, dim=0, descending=True), dim=0, descending=False) w = cmin * torch.ones_like(ai) m = int((1 - cmin) * n_samples / (cmax - cmin)) w += (cmax - cmin) * (rank < m).float() w += ((1 - cmin) * n_samples - m * (cmax - cmin)) * (rank == m).float() w -= greed w += greed * n_samples * (rank == 0).float() w = w * (1 - rand) + rand w = w / w.sum(dim=0, keepdim=True) prob = torch.distributions.Categorical(probs=w.permute(1, 2, 0)) a = torch.gather(ai.permute(1, 2, 0), 2, prob.sample().unsqueeze(2)).squeeze(2) return a, (ai, w.mean(-1)) # Set up function for computing SAC Q-losses def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] q1 = ac.q1(o, a) q2 = ac.q2(o, a) # Bellman backup for Q functions with torch.no_grad(): # Target actions come from *current* policy a2, logp_a2 = ac.pi(o2) # Target Q-values q1_pi_targ = ac_targ.q1(o2, a2) q2_pi_targ = ac_targ.q2(o2, a2) q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) backup = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2) # MSE loss against Bellman backup loss_q1 = ((q1 - backup)**2).mean() loss_q2 = ((q2 - backup)**2).mean() loss_q = loss_q1 + loss_q2 # Useful info for logging q_info = dict(Q1Vals=q1.detach().cpu().numpy(), Q2Vals=q2.detach().cpu().numpy()) return loss_q, q_info # Set up function for computing SAC pi loss def compute_loss_pi(data): o = data['obs'] _, (ai, w) = max_reroute(o) pi, logp_pi = ac.pi(o) log_ai = ac.pi.log_prob(ai) # Entropy-regularized policy loss loss_pi = (alpha * logp_pi - (log_ai * w).sum(dim=0)).mean() # Useful info for logging pi_info = 
dict(LogPi=logp_pi.detach().cpu().numpy()) return loss_pi, pi_info # Set up optimizers for policy and q-function pi_optimizer = Adam(ac.pi.parameters(), lr=lr) q_optimizer = Adam(q_params, lr=lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(data): # First run one gradient descent step for Q1 and Q2 q_optimizer.zero_grad() loss_q, q_info = compute_loss_q(data) loss_q.backward() q_optimizer.step() # Record things logger.store(LossQ=loss_q.item(), **q_info) # Freeze Q-networks so you don't waste computational effort # computing gradients for them during the policy learning step. for p in q_params: p.requires_grad = False # Next run one gradient descent step for pi. pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) loss_pi.backward() pi_optimizer.step() # Unfreeze Q-networks so you can optimize it at next DDPG step. for p in q_params: p.requires_grad = True # Record things logger.store(LossPi=loss_pi.item(), **pi_info) # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) def get_action(o, deterministic=False): o = torch.as_tensor(o, dtype=torch.float32, device=device) if deterministic: a = ac.act(o, deterministic) else: o = o.unsqueeze(0) a, _ = max_reroute(o) a = a.flatten().cpu().numpy() return a def test_agent(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for t in tqdm(range(total_steps)): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy. if t > start_steps: a = get_action(o) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Update handling if t >= update_after and t % update_every == 0: for j in range(update_every): batch = replay_buffer.sample_batch(batch_size) update(data=batch) # End of epoch handling if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. 
test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
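max_reroute above evaluates n_samples candidate actions per state in one batched forward pass via a repeat_and_reshape helper that is not shown. A plausible sketch, assuming it tiles the observation batch along a leading sample dimension (sample-major, so that the later .view(n_samples, b, ...) calls recover the grouping):

import torch

def repeat_and_reshape(o, n_samples):
    # (batch, obs_dim) -> (n_samples * batch, obs_dim), repeating the whole batch n_samples times
    b, obs_dim = o.shape
    return o.unsqueeze(0).expand(n_samples, b, obs_dim).reshape(n_samples * b, obs_dim)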
def vpg( env_fn, actor_critic, ac_kwargs=dict(), # ac_kwargs holds the network architecture parameters seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, lam=0.97, # gamma and lambda settings pi_lr=3e-4, vf_lr=1e-3, # learning rate settings train_v_iters=80, max_ep_len=1000, logger_kwargs=dict(), save_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to VPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function.
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) # num_procs 是CPU个数 buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # VPG objectives pi_loss = -tf.reduce_mean(logp * adv_ph) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) print( sess.run([ logp, logp_old_ph, tf.reduce_mean(approx_kl), tf.reduce_mean(logp - logp_old_ph) ], feed_dict=inputs)) # Policy gradient step sess.run(train_pi, feed_dict=inputs) # Value function learning for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl = sess.run([pi_loss, v_loss, approx_kl], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) if epoch == epochs - 1: env.render() # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) o, r, d, _ = env.step(a[0]) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' 
% ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform VPG update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
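Several TF1 snippets here build their graph inputs through core.placeholder, core.placeholders, and core.placeholders_from_spaces. A minimal sketch of these helpers in the style assumed throughout (Box observations with a flat shape; other space types omitted):

import tensorflow as tf
from gym.spaces import Box, Discrete

def placeholder(dim=None):
    return tf.placeholder(dtype=tf.float32, shape=(None, dim) if dim else (None,))

def placeholders(*args):
    return [placeholder(dim) for dim in args]

def placeholder_from_space(space):
    if isinstance(space, Box):
        return placeholder(space.shape[0])
    if isinstance(space, Discrete):
        return tf.placeholder(dtype=tf.int32, shape=(None,))
    raise NotImplementedError

def placeholders_from_spaces(*args):
    return [placeholder_from_space(space) for space in args]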
def __init__(self, opt, job): self.opt = opt with tf.Graph().as_default(): tf.set_random_seed(opt.seed) np.random.seed(opt.seed) # Inputs to computation graph self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = core.placeholders(opt.obs_dim, None, opt.obs_dim, None, None) # Main outputs from computation graph with tf.variable_scope('main'): self.q, self.q_x2 = core.q_function(self.x_ph, self.x2_ph, opt.hidden_size, opt.act_dim) # Target value network with tf.variable_scope('target'): self.q_next, _ = core.q_function(self.x2_ph, self.x2_ph, opt.hidden_size, opt.act_dim) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['main']) print('\nNumber of parameters: total: %d\n' % var_counts) a_one_hot = tf.one_hot(tf.cast(self.a_ph, tf.int32), depth=opt.act_dim) q_value = tf.reduce_sum(self.q * a_one_hot, axis=1) # DDQN online_q_x2_a_one_hot = tf.one_hot(tf.argmax(self.q_x2, axis=1), depth=opt.act_dim) q_target = tf.reduce_sum(self.q_next * online_q_x2_a_one_hot, axis=1) # DQN # q_target = tf.reduce_max(self.q_next, axis=1) # Bellman backup for Q functions, using Clipped Double-Q targets q_backup = tf.stop_gradient(self.r_ph + opt.gamma * (1 - self.d_ph) * q_target) # q losses q_loss = 0.5 * tf.reduce_mean((q_backup - q_value) ** 2) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer(learning_rate=opt.lr) value_params = get_vars('main/q') train_value_op = value_optimizer.minimize(q_loss, var_list=value_params) # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([tf.assign(v_targ, opt.polyak * v_targ + (1 - opt.polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) # All ops to call during one training step self.step_ops = [q_loss, self.q, train_value_op, target_update] # Initializing targets to match main variables self.target_init = tf.group([tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target'))]) if job == "learner": config = tf.ConfigProto() config.gpu_options.per_process_gpu_memory_fraction = opt.gpu_fraction config.inter_op_parallelism_threads = 1 config.intra_op_parallelism_threads = 1 self.sess = tf.Session(config=config) else: self.sess = tf.Session( config=tf.ConfigProto( # device_count={'GPU': 0}, intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)) self.sess.run(tf.global_variables_initializer()) if job == "learner": # Set up summary Ops self.train_ops, self.train_vars = self.build_summaries() self.writer = tf.summary.FileWriter( opt.summary_dir + "/" + "^^^^^^^^^^" + str(datetime.datetime.now()) + opt.env_name + "-" + opt.exp_name + "-workers_num:" + str(opt.num_workers) + "%" + str(opt.a_l_ratio), self.sess.graph) self.variables = ray.experimental.tf_utils.TensorFlowVariables( q_loss, self.sess)
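The line marked # DDQN above selects the next-state action with the online network (self.q_x2) but evaluates it with the target network (self.q_next), while the commented-out # DQN line takes the max of the target network directly. A tiny NumPy illustration of the difference, using made-up Q-values:

import numpy as np

q_online_next = np.array([[1.0, 2.5, 2.0]])   # online net Q(s', .)
q_target_next = np.array([[1.2, 0.8, 3.0]])   # target net Q(s', .)

# DDQN: argmax with the online net, value from the target net
a_star = q_online_next.argmax(axis=1)               # -> [1]
ddqn_target = q_target_next[np.arange(1), a_star]   # -> [0.8]

# DQN: max of the target net directly (tends to overestimate)
dqn_target = q_target_next.max(axis=1)              # -> [3.0]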
def sac(env_name='Ant-v2', actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, max_ep_len=1000, save_freq=1): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``mu`` (batch, act_dim) | Computes mean actions from policy | given states. ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. Critical: must be differentiable | with respect to policy parameters all | the way through action sampling. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). ``q2_pi`` (batch,) | Gives the composition of ``q2`` and | ``pi`` for states in ``x_ph``: | q2(x, pi(x)). ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ tf.set_random_seed(seed) np.random.seed(seed) env = gym.make(env_name) obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! 
act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) # Main outputs from computation graph with tf.variable_scope('main'): mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v = actor_critic( x_ph, a_ph, **ac_kwargs) # Target value network with tf.variable_scope('target'): _, _, _, _, _, _, _, v_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main']) print(('\nNumber of parameters: \t pi: %d, \t' + \ 'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n') % var_counts) # Min Double-Q: min_q_pi = tf.minimum(q1_pi, q2_pi) # Targets for Q and V regression q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * v_targ) v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi) # Soft actor-critic losses pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi) q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2) q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2) v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2) value_loss = q1_loss + q2_loss + v_loss # Policy train op # (has to be separate from value train op, because q1_pi appears in pi_loss) pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) value_params = get_vars('main/q') + get_vars('main/v') with tf.control_dependencies([train_pi_op]): train_value_op = value_optimizer.minimize(value_loss, var_list=value_params) # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # All ops to call during one training step step_ops = [ pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, train_pi_op, train_value_op, target_update ] # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving def get_action(o, deterministic=False): act_op = mu if deterministic else pi return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)}) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs tf.summary.FileWriter('./log/', graph=tf.get_default_graph()) replay_buffer = ReplayBuffer(obs_dim=env.observation_space.shape[0], act_dim=env.action_space.shape[0], size=replay_size) episode = 0 for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy. 
""" if t > start_steps: a = get_action(o)[0] else: a = np.clip(env.action_space.sample(), -1, 1) # Step the env env.render() o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) o = o2 if d or (ep_len == max_ep_len): """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. """ episode += 1 for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], } outs = sess.run(step_ops, feed_dict) print("episode %d, reward %d" % (episode, ep_ret)) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 sess.close()
def ddpg(env_config, ac_type, ac_kwargs, rb_type, rb_kwargs, gamma, lr, polyak, batch_size, epochs, start_steps, steps_per_epoch, inc_ep, max_ep_len, test_max_ep_len, number_of_tests_per_epoch, act_noise, logger_kwargs, seed): logger = EpochLogger(**logger_kwargs) configs = locals().copy() configs.pop("logger") logger.save_config(configs) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = make_env(env_config), make_env(env_config) obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_high = env.action_space.high # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) actor_critic = core.get_ddpg_actor_critic(ac_type) # Main outputs from computation graph with tf.variable_scope('main'): pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs) # Target networks with tf.variable_scope('target'): pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) # Experience buffer RB = get_replay_buffer(rb_type) replay_buffer = RB(obs_dim, act_dim, **rb_kwargs) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main']) print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' % var_counts) # Bellman backup for Q function backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ) # DDPG losses pi_loss = -tf.reduce_mean(q_pi) q_loss = tf.reduce_mean((q - backup)**2) # Separate train ops for pi, q pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) q_optimizer = tf.train.AdamOptimizer(learning_rate=lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q')) # Polyak averaging for target variables target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) def get_action(o, noise_scale): pi_a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0] pi_a += noise_scale * np.random.randn(act_dim) pi_a = np.clip(pi_a, 0, 1) real_a = pi_a * act_high return pi_a, real_a def test_agent(n=10): test_actions = [] for j in range(n): test_actions_ep = [] o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == test_max_ep_len)): # Take deterministic actions at test time (noise_scale=0) _, real_a = get_action(o, 0) test_actions_ep.append(real_a) o, r, d, _ = test_env.step(real_a) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) test_actions.append(test_actions_ep) return test_actions start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs actions = [] epoch_actions = [] rewards = [] rets = [] test_rets = [] max_ret = None # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy (with some noise, via act_noise). 
""" if t > start_steps: pi_a, real_a = get_action(o, act_noise) else: pi_a, real_a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(real_a) ep_ret += r ep_len += 1 epoch_actions.append(pi_a) # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, pi_a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all DDPG updates at the end of the trajectory, in accordance with tuning done by TD3 paper authors. """ for _ in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } # Q-learning update outs = sess.run([q_loss, q, train_q_op], feed_dict) logger.store(LossQ=outs[0], QVals=outs[1]) # Policy update outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict) logger.store(LossPi=outs[0]) logger.store(EpRet=ep_ret, EpLen=ep_len) actions.append(np.mean(epoch_actions)) epoch_actions = [] rewards.append(ep_ret) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # Test the performance of the deterministic version of the agent. test_actions = test_agent(number_of_tests_per_epoch) # Log info about epoch logger.log_tabular('Epoch', epoch) ret = logger.log_tabular('EpRet', average_only=True) test_ret = logger.log_tabular('TestEpRet', average_only=True)[0] logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('QVals', average_only=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() rets.append(ret) test_rets.append(test_ret) if max_ret is None or test_ret > max_ret: max_ret = test_ret best_test_actions = test_actions max_ep_len += inc_ep util.plot_actions(test_actions, act_high, logger.output_dir + '/actions%s.png' % epoch) logger.save_state( { "actions": actions, "rewards": rewards, "best_test_actions": best_test_actions, "rets": rets, "test_rets": test_rets, "max_ret": max_ret }, None) util.plot_actions(best_test_actions, act_high, logger.output_dir + '/best_test_actions.png') logger.log("max ret: %f" % max_ret)
def __init__(self, env, test_env, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, entropy_tuning: bool = False, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, update_after=1000, update_every=50, act_noise=0.01, max_ep_len=1000, device='cpu', num_test_episodes=1, save_freq=2, log_mode: List[str] = ["stdout"], log_key: str = "timestep", save_model: str = "checkpoints", checkpoint_path: str = None, log_interval: int = 10, load_model=False, dir_prefix: str = None): torch.manual_seed(seed) np.random.seed(seed) self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") self.seed = seed self.env = env self.test_env = test_env self.obs_dim = env.observation_space.shape self.act_dim = env.action_space.shape[0] self.act_limit = env.action_space.high[0] self.replay_size = replay_size self.batch_size = batch_size #self.noise_scale = act_noise self.load_model = load_model self.log_key = log_key #self.logdir = logdir self.save_model = save_model self.checkpoint_path = checkpoint_path #self.log_interval = log_interval #self.logger = Logger(logdir=logdir, formats=[*log_mode]) #self.pi_lr = pi_lr #self.q_lr = q_lr self.lr = lr self.ac_kwargs = ac_kwargs self.steps_per_epoch = steps_per_epoch self.epochs = epochs self.max_ep_len = max_ep_len self.gamma = gamma self.polyak = polyak self.alpha = alpha self.entropy_tuning = entropy_tuning self.start_steps = start_steps self.update_after = update_after self.update_every = update_every self.save_freq = save_freq self.action_time_step = 0 #no. of updates self.current_timestep = 0 self.current_epoch = 0 self.dir_prefix = dir_prefix # Store the weights and scores in a new directory self.directory = "logs/sac_single_Agent_{}{}/".format( self.dir_prefix, time.strftime("%Y%m%d-%H%M%S")) # appends the timedate os.makedirs(self.directory, exist_ok=True) self.model_dir = os.path.join(self.directory, 'model_param/') os.makedirs(self.model_dir) # Tensorboard writer object self.writer = SummaryWriter(log_dir=self.directory + 'tensorboard/') print("Logging to {}\n".format(self.directory + 'tensorboard/')) #self.test_env = env self.num_test_episodes = num_test_episodes # Create actor-critic module and target networks self.ac = actor_critic(self.env.observation_space, self.env.action_space, **ac_kwargs).to(self.device) self.ac_targ = deepcopy(self.ac).to(self.device) #actually no need of saving the policy parameters as target above, since we do not need any target Actor in SAC. 
if self.load_model: if os.path.exists(self.checkpoint_path): self.ac.load_state_dict( torch.load(os.path.abspath(self.checkpoint_path))) self.ac_targ = deepcopy(self.ac).to(self.device) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in self.ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) self.q_params = itertools.chain(self.ac.q1.parameters(), self.ac.q2.parameters()) # Set up optimizers for policy and q-function self.pi_optimizer = Adam(self.ac.pi.parameters(), lr=self.lr) self.pi_scheduler = StepLR(self.pi_optimizer, step_size=1, gamma=0.96) self.q_optimizer = Adam(self.q_params, lr=self.lr) self.q_scheduler = StepLR(self.q_optimizer, step_size=1, gamma=0.96) # Experience buffer self.replay_buffer = ReplayBuffer(obs_dim=self.obs_dim, act_dim=self.act_dim, size=self.replay_size) # from https://github.com/SforAiDl/genrl/blob/master/genrl/deep/agents/sac/sac.py if self.entropy_tuning: self.target_entropy = -torch.prod( torch.Tensor(self.env.action_space.shape).to( self.device)).item() self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device) self.alpha_optim = Adam([self.log_alpha], lr=self.lr) #else: # self.alpha=self.alpha # no need of action scales setting # action_limit is directly obtained within the MLPActorCritic class # action_bias is not need as for the city learn environment, actions are bounded with -1/3 to +1/3 # and the bias sums to 0 # Assign device if "cuda" in device and torch.cuda.is_available(): self.device = torch.device(device) else: self.device = torch.device("cpu") # Assign seed if seed is not None: set_seeds(seed, self.env) #initialize logs self.empty_logs() # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple( core.count_vars(module) for module in [self.ac.pi, self.ac.q1, self.ac.q2]) print(var_counts) self.logs["var_counts"] = var_counts print( colorize( '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts, 'green', bold=True)) self.writer.add_scalar('Number of parameters/pi', var_counts[0]) self.writer.add_scalar('Number of parameters/q1', var_counts[1]) self.writer.add_scalar('Number of parameters/q2', var_counts[2])
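# ---------------------------------------------------------------------------
# Hedged PyTorch sketch of the automatic entropy tuning prepared above when
# entropy_tuning is True. The temperature loss shown here is the common SAC
# formulation, -E[log_alpha * (log pi(a|s) + target_entropy)]; the logp_pi
# batch is a made-up stand-in, not output from the actor above.
# ---------------------------------------------------------------------------
import torch
from torch.optim import Adam

act_dim = 2
target_entropy = -float(act_dim)                 # -prod(action_space.shape)
log_alpha = torch.zeros(1, requires_grad=True)
alpha_optim = Adam([log_alpha], lr=1e-3)

logp_pi = torch.randn(8)                         # stand-in log pi(a|s) for a batch
alpha_loss = -(log_alpha * (logp_pi + target_entropy).detach()).mean()
alpha_optim.zero_grad()
alpha_loss.backward()
alpha_optim.step()
alpha = log_alpha.exp().item()                   # coefficient used in the SAC losses
print(alpha)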
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, gru_units=256, trials_per_epoch=100, episodes_per_trial=2, n = 100, epochs=100, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=1000, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph\ raw_input_ph = tf.placeholder(dtype=tf.float32, shape=obs_dim, name='raw_input_ph') rescale_image_op = tf.image.resize_images(raw_input_ph, [30, 40]) max_seq_len_ph = tf.placeholder(dtype=tf.int32, shape=(), name='max_seq_len_ph') seq_len_ph = tf.placeholder(dtype=tf.int32, shape=(None,)) # Because we pad zeros at the end of every sequence of length less than max length, we need to mask these zeros out # when computing loss seq_len_mask_ph = tf.placeholder(dtype=tf.int32, shape=(trials_per_epoch, episodes_per_trial * max_ep_len)) # rescaled_image_ph This is a ph because we want to be able to pass in value to this node manually rescaled_image_in_ph = tf.placeholder(dtype=tf.float32, shape=[None, 30, 40, 3], name='rescaled_image_in_ph') a_ph = core.placeholders_from_spaces( env.action_space)[0] conv1 = slim.conv2d(activation_fn=tf.nn.relu, inputs=rescaled_image_in_ph, num_outputs=16, kernel_size=[5,5], stride=2) image_out = slim.flatten(slim.conv2d(activation_fn=tf.nn.relu, inputs=conv1, num_outputs=16, kernel_size=[5,5], stride=2)) rew_ph, adv_ph, ret_ph, logp_old_ph = core.placeholders(1, None, None, None) rnn_state_ph = tf.placeholder(tf.float32, [None, gru_units], name='pi_rnn_state_ph') # Main outputs from computation graph action_encoder_matrix = np.load(r'encoder.npy') pi, logp, logp_pi, v, rnn_state, logits, seq_len_vec, tmp_vec = actor_critic( image_out, a_ph, rew_ph, rnn_state_ph, gru_units, max_seq_len_ph, action_encoder_matrix, seq_len=seq_len_ph, action_space=env.action_space) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [rescaled_image_in_ph, a_ph, adv_ph, ret_ph, logp_old_ph, rew_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi, rnn_state, logits] # Experience buffer buffer_size = trials_per_epoch * episodes_per_trial * max_ep_len buf = PPOBuffer(rescaled_image_in_ph.get_shape().as_list()[1:], act_dim, buffer_size, trials_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n'%var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph>0, (1+clip_ratio)*adv_ph, (1-clip_ratio)*adv_ph) # Need to mask out the padded zeros when computing loss sequence_mask = tf.sequence_mask(seq_len_ph, episodes_per_trial*max_ep_len) # Convert bool tensor to int tensor with 1 and 0 sequence_mask = tf.where(sequence_mask, np.ones(dtype=np.float32, shape=(trials_per_epoch, episodes_per_trial*max_ep_len)), np.zeros(dtype=np.float32, shape=(trials_per_epoch, episodes_per_trial*max_ep_len))) # need to reshape because ratio is a 1-D vector (it is a concatnation of all sequence) for masking and then reshape # it back pi_loss_vec = tf.multiply(sequence_mask, tf.reshape(tf.minimum(ratio * adv_ph, min_adv), tf.shape(sequence_mask))) pi_loss = -tf.reduce_mean(tf.reshape(pi_loss_vec, tf.shape(ratio))) aaa = (ret_ph - v)**2 v_loss_vec = tf.multiply(sequence_mask, tf.reshape((ret_ph - v)**2, tf.shape(sequence_mask))) ccc = tf.reshape(v_loss_vec, tf.shape(v)) v_loss = tf.reduce_mean(tf.reshape(v_loss_vec, tf.shape(v))) # Info (useful to 
watch during learning) approx_kl = tf.reduce_mean(logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean(-logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1+clip_ratio), ratio < (1-clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) train = MpiAdamOptimizer(learning_rate=1e-4).minimize(pi_loss + 0.01 * v_loss - 0.001 * approx_ent) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'rescaled_image_in': rescaled_image_in_ph}, outputs={'pi': pi, 'v': v}) def update(): print(f'Start updating at {datetime.now()}') inputs = {k:v for k,v in zip(all_phs, buf.get())} inputs[rnn_state_ph] = np.zeros((trials_per_epoch, gru_units), np.float32) inputs[max_seq_len_ph] = int(episodes_per_trial * max_ep_len) inputs[seq_len_ph] = buf.seq_len_buf pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) buf.reset() # Training print(f'sequence length = {sess.run(seq_len_vec, feed_dict=inputs)}') for i in range(train_pi_iters): _, kl, pi_loss_i, v_loss_i, ent = sess.run([train_pi, approx_kl, pi_loss, v_loss, approx_ent], feed_dict=inputs) print(f'i: {i}, pi_loss: {pi_loss_i}, v_loss: {v_loss_i}, entropy: {ent}') logger.store(StopIter=i) # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) print(f'Updating finished at {datetime.now()}') start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), np.zeros(1), False, 0, 0 def recenter_rgb(image, min=0.0, max=255.0): ''' :param image: :param min: :param max: :return: an image with rgb value re-centered to [-1, 1] ''' mid = (min + max) / 2.0 return np.apply_along_axis(func1d=lambda x: (x - mid) / mid, axis=2, arr=image) o_rescaled = recenter_rgb(sess.run(rescale_image_op, feed_dict={raw_input_ph: o})) # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for trial in range(trials_per_epoch): # TODO: tweek settings to match the paper # TODO: find a way to generate mazes last_a = np.array(0) last_r = np.array(r) last_rnn_state = np.zeros((1, gru_units), np.float32) step_counter = 0 for episode in range(episodes_per_trial): o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 o_rescaled = recenter_rgb(sess.run(rescale_image_op, feed_dict={raw_input_ph: o})) action_dict = defaultdict(int) # dirty hard coding to make it print in order action_dict[0] = 0 action_dict[1] = 0 action_dict[2] = 0 for step in range(max_ep_len): a, v_t, logp_t, rnn_state_t, logits_t = sess.run( get_action_ops, feed_dict={ rescaled_image_in_ph: np.expand_dims(o_rescaled, 0), a_ph: last_a.reshape(-1,), rew_ph: last_r.reshape(-1,1), rnn_state_ph: last_rnn_state, # v_rnn_state_ph: last_v_rnn_state, max_seq_len_ph: 1, seq_len_ph: [1]}) action_dict[a[0]] += 1 # save and log buf.store(o_rescaled, a, r, v_t, logp_t) logger.store(VVals=v_t) o, r, d, _ = env.step(a[0]) step_counter += 1 o_rescaled = recenter_rgb(sess.run(rescale_image_op, feed_dict={raw_input_ph: o})) ep_ret += r ep_len += 1 last_a = a[0] last_r = np.array(r) last_rnn_state = 
rnn_state_t terminal = d or (ep_len == max_ep_len) if terminal or (step==n-1): if not(terminal): print('Warning: trajectory cut off by epoch at %d steps.'%ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run(v, feed_dict={rescaled_image_in_ph: np.expand_dims(o_rescaled, 0), a_ph: last_a.reshape(-1,), rew_ph: last_r.reshape(-1,1), rnn_state_ph: last_rnn_state, max_seq_len_ph: 1, seq_len_ph: [1]}) buf.finish_path(last_val) logger.store(EpRet=ep_ret, EpLen=ep_len) print(f'episode terminated with {step} steps. epoch:{epoch} trial:{trial} episode:{episode}') break print(action_dict) if step_counter < episodes_per_trial * max_ep_len: buf.pad_zeros(episodes_per_trial * max_ep_len - step_counter) buf.seq_len_buf[trial] = step_counter # pad zeros to sequence buffer after each trial # Save model if (epoch % save_freq == 0) or (epoch == epochs-1): logger.save_state({'env': env}, None) # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch+1)*trials_per_epoch*episodes_per_trial*max_ep_len) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular()
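# ---------------------------------------------------------------------------
# NumPy sketch (toy numbers) of the clipped PPO surrogate assembled above:
# ratio = exp(logp - logp_old), and min_adv reproduces clip(ratio, 1-eps,
# 1+eps) * adv through the sign-dependent tf.where branch.
# ---------------------------------------------------------------------------
import numpy as np

clip_ratio = 0.2
logp = np.array([-1.0, -0.5, -2.0])       # log prob under current policy
logp_old = np.array([-1.1, -0.7, -1.5])   # log prob at collection time
adv = np.array([1.0, -0.5, 2.0])          # advantage estimates

ratio = np.exp(logp - logp_old)                               # pi(a|s) / pi_old(a|s)
min_adv = np.where(adv > 0, (1 + clip_ratio) * adv, (1 - clip_ratio) * adv)
pi_loss = -np.mean(np.minimum(ratio * adv, min_adv))          # clipped surrogate
approx_kl = np.mean(logp_old - logp)                          # KL estimate for early stopping
print(pi_loss, approx_kl)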
def td3(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, update_after=1000, update_every=50, act_noise=0.1, target_noise=0.2, noise_clip=0.5, policy_delay=2, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Twin Delayed Deep Deterministic Policy Gradient (TD3) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, a ``q1`` module, and a ``q2`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q1`` and ``q2`` should accept a batch of observations and a batch of actions as inputs. When called, these should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``pi`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``q1`` (batch,) | Tensor containing one current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) ``q2`` (batch,) | Tensor containing the other current | estimate of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to TD3. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) target_noise (float): Stddev for smoothing noise added to target policy. noise_clip (float): Limit for absolute value of target policy smoothing noise. policy_delay (int): Policy will only be updated once every policy_delay times for each update of the Q-networks. num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. max_ep_len (int): Maximum length of trajectory / episode / rollout. 
logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) torch.manual_seed(seed) np.random.seed(seed) env, test_env = env_fn, env_fn obs_dim = 0 for key in list(env.observation_spec().keys()): obs_dim += env.observation_spec()[key].shape[0] act_dim = env.action_spec().shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_spec().maximum[0] # Create actor-critic module and target networks ac = actor_critic(obs_dim, act_dim, act_limit, **ac_kwargs) ac.to(device) ac_targ = deepcopy(ac) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters()) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple( core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2]) logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts) # Set up function for computing TD3 Q-losses def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] q1 = ac.q1(o, a) q2 = ac.q2(o, a) # Bellman backup for Q functions with torch.no_grad(): pi_targ = ac_targ.pi(o2) # Target policy smoothing epsilon = torch.randn_like(pi_targ) * target_noise epsilon = torch.clamp(epsilon, -noise_clip, noise_clip) a2 = pi_targ + epsilon a2 = torch.clamp(a2, -act_limit, act_limit) # Target Q-values q1_pi_targ = ac_targ.q1(o2, a2) q2_pi_targ = ac_targ.q2(o2, a2) q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) backup = r + gamma * (1 - d) * q_pi_targ # MSE loss against Bellman backup loss_q1 = ((q1 - backup)**2).mean() loss_q2 = ((q2 - backup)**2).mean() loss_q = loss_q1 + loss_q2 # Useful info for logging loss_info = dict(Q1Vals=q1.cpu().detach().numpy(), Q2Vals=q2.cpu().detach().numpy()) return loss_q, loss_info # Set up function for computing TD3 pi loss def compute_loss_pi(data): o = data['obs'] q1_pi = ac.q1(o, ac.pi(o)) return -q1_pi.mean() # Set up optimizers for policy and q-function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) q_optimizer = Adam(q_params, lr=q_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(data, timer): # First run one gradient descent step for Q1 and Q2 q_optimizer.zero_grad() loss_q, loss_info = compute_loss_q(data) loss_q.backward() q_optimizer.step() # Record things logger.store(LossQ=loss_q.cpu().item(), **loss_info) # Possibly update pi and target networks if timer % policy_delay == 0: # Freeze Q-networks so you don't waste computational effort # computing gradients for them during the policy learning step. for p in q_params: p.requires_grad = False # Next run one gradient descent step for pi. pi_optimizer.zero_grad() loss_pi = compute_loss_pi(data) loss_pi.backward() pi_optimizer.step() # Unfreeze Q-networks so you can optimize it at next DDPG step. for p in q_params: p.requires_grad = True # Record things logger.store(LossPi=loss_pi.cpu().item()) # Finally, update target networks by polyak averaging. 
with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) def get_action(o, noise_scale): a = ac.act(torch.as_tensor(o, device=device, dtype=torch.float32)) a += noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) def get_obs(env, a=None): if a is None: time_step = env.reset() else: time_step = env.step(a) y = np.concatenate([ time_step.observation[key] for key in list(time_step.observation.keys()) ]) return y def test_agent(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = get_obs(test_env), False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, d = get_obs(test_env, get_action(o, 0)), time_step.last() r = 0 if time_step.reward is None else time_step.reward ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() o, ep_ret, ep_len = get_obs(env), 0, 0 # Main loop: collect experience in env and update/log each epoch for t in trange(total_steps): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy (with some noise, via act_noise). if t > start_steps: a = get_action(o, act_noise) else: action_spec = env.action_spec() a = np.random.uniform(action_spec.minimum, action_spec.maximum, size=action_spec.shape) # Step the env time_step = env.step(a) o2, d = get_obs(env, a), time_step.last() r = 0 if time_step.reward is None else time_step.reward ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = get_obs(env), 0, 0 # Update handling if t >= update_after and t % update_every == 0: for j in range(update_every): batch = replay_buffer.sample_batch(batch_size) update(data=batch, timer=j) # End of epoch handling if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
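# ---------------------------------------------------------------------------
# PyTorch sketch (toy tensors, not the modules above) of the TD3 target built
# inside compute_loss_q: clipped Gaussian smoothing noise on the target-policy
# action, clamping to the action limit, and the min over the two target Qs.
# ---------------------------------------------------------------------------
import torch

gamma, target_noise, noise_clip, act_limit = 0.99, 0.2, 0.5, 1.0
pi_targ = torch.randn(8, 3)               # stand-in target-policy actions for s'
r, d = torch.randn(8), torch.zeros(8)     # rewards and done flags

eps = (torch.randn_like(pi_targ) * target_noise).clamp(-noise_clip, noise_clip)
a2 = (pi_targ + eps).clamp(-act_limit, act_limit)   # smoothed, clipped target action

q1_pi_targ = torch.randn(8)               # stand-in for Q1_targ(s', a2)
q2_pi_targ = torch.randn(8)               # stand-in for Q2_targ(s', a2)
backup = r + gamma * (1 - d) * torch.min(q1_pi_targ, q2_pi_targ)
print(backup)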
def maxsqn(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=200, replay_size=int(5e5), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=200, start_steps=1000, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``mu`` (batch, act_dim) | Computes mean actions from policy | given states. ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. Critical: must be differentiable | with respect to policy parameters all | the way through action sampling. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). ``q2_pi`` (batch,) | Gives the composition of ``q2`` and | ``pi`` for states in ``x_ph``: | q2(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for policy/value/alpha learning). alpha (float/'auto'): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) / 'auto': alpha is automated. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" # print(max_ep_len,type(max_ep_len)) logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(3), env_fn(1) obs_dim = env.observation_space.shape[0] obs_space = env.observation_space act_dim = env.action_space.n act_space = env.action_space # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, prob_pi_a_ph, d_ph = core.placeholders_from_space( obs_space, act_space, obs_space, None, None, None) ###### if alpha == 'auto': # target_entropy = (-np.prod(env.action_space.n)) # target_entropy = (np.prod(env.action_space.n))/4/10 target_entropy = 0.4 log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=0.0) alpha = tf.exp(log_alpha) ###### # Main outputs from computation graph with tf.variable_scope('main'): mu, pi, q1, q2, v1_x2, v2_x2, pi_log, pi_log_x2 = actor_critic( x_ph, x2_ph, a_ph, alpha, **ac_kwargs) # Target value network with tf.variable_scope('target'): mu_, pi_, q1_, q2_, v1_x2_, v2_x2_, pi_log_, pi_log_x2_ = actor_critic( x_ph, x2_ph, a_ph, alpha, **ac_kwargs) # Experience buffer if isinstance(act_space, Box): a_dim = act_dim elif isinstance(act_space, Discrete): a_dim = 1 replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=a_dim, size=replay_size) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main']) print(('\nNumber of parameters: \t pi: %d, \t' + \ 'q1: %d, \t q2: %d, \t total: %d\n')%var_counts) ###### logp_pi_ = tf.reduce_sum(tf.exp(pi_log_) * pi_log_, axis=1) if isinstance(alpha, tf.Tensor): alpha_loss = tf.reduce_mean( -log_alpha * tf.stop_gradient(logp_pi_ + target_entropy)) alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr, name='alpha_optimizer') train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss, var_list=[log_alpha]) ###### # Min Double-Q: # # scheme 111111 # min_q_pi = tf.minimum(v1_x2_, v2_x2_) # # min_q_pi = tf.minimum(q1_pi_, q2_pi_) # # min_q_pi = tf.minimum(q1_mu_, q2_mu_) # v_backup = min_q_pi - alpha * pi_log_x2 # v_backup = tf.reduce_sum(tf.exp(pi_log_x2)*v_backup, axis=1) # scheme 222222 min_q_pi = tf.minimum(tf.reduce_sum(tf.exp(pi_log_x2) * v1_x2_, axis=1), tf.reduce_sum(tf.exp(pi_log_x2) * v2_x2_, axis=1)) v_backup = min_q_pi - alpha * tf.reduce_sum(tf.exp(pi_log_x2) * pi_log_x2, axis=1) # # scheme 333333 # min_q_pi = tf.minimum(v1_x2_, v2_x2_) # v_backup = min_q_pi - alpha * pi_log_x2 # v_backup = tf.reduce_max(v_backup, axis=1) v_backup = tf.stop_gradient(v_backup) q_backup = r_ph + gamma * (1 - d_ph) * v_backup # Soft actor-critic losses a_one_hot = tf.one_hot(a_ph[..., 0], depth=act_dim) prob_pi_a_cur = tf.reduce_sum(tf.exp(pi_log) * a_one_hot, axis=1) pi_ratio = tf.stop_gradient( tf.clip_by_value(prob_pi_a_cur / prob_pi_a_ph, 0.2, 1.2)) # 0.2, 1.2 # pi_ratio = 1.0 q1_loss = 0.5 * tf.reduce_mean(pi_ratio * (q_backup - q1)**2) q2_loss = 0.5 * tf.reduce_mean(pi_ratio * (q_backup - q2)**2) value_loss = q1_loss + q2_loss # # Policy train op # # (has to be separate from value train op, because q1_pi appears in pi_loss) # pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) # train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) value_params = get_vars('main/q') #with 
tf.control_dependencies([train_pi_op]): train_value_op = value_optimizer.minimize(value_loss, var_list=value_params) # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # All ops to call during one training step if isinstance(alpha, Number): step_ops = [ q1_loss, q2_loss, q1, q2, logp_pi_, tf.identity(alpha), train_value_op, target_update ] else: step_ops = [ q1_loss, q2_loss, q1, q2, logp_pi_, alpha, train_value_op, target_update, train_alpha_op ] # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph }, outputs={ 'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2 }) def get_action(o, deterministic=False): act_op = mu if deterministic else pi return sess.run(act_op, feed_dict={x_ph: np.expand_dims(o, axis=0)})[0] def get_pi_log(o): return sess.run(pi_log, feed_dict={x_ph: np.expand_dims(o, axis=0)})[0] # def get_logp_pi(o): # return sess.run(logp_pi, feed_dict={x_ph: np.expand_dims(o, axis=0)})[0] def test_agent(n=20): # n: number of tests global sess, mu, pi, q1, q2, q1_pi, q2_pi for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # max_ep_len # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() # o = env.reset() ##################### # o, r, d, ep_ret, ep_len = env.step(1)[0], 0, False, 0, 0 ##################### o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs ep_index = 0 test_ep_ret = 0.0 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy. """ # if t > start_steps and 100*t/total_steps > np.random.random(): # greedy, avoid falling into sub-optimum if t > start_steps: a = get_action(o) else: a = env.action_space.sample() np.random.random() # Step the env o2, r, d, _ = env.step(a) #print(a,o2) # o2, r, _, d = env.step(a) ##################### # d = d['ale.lives'] < 5 ##################### ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # # scheme 1 # # logp_pi # logp_pi2 = get_logp_pi(o2) # r_pi = r + gamma * (1 - d) * (- alpha * logp_pi2) # # Store experience to replay buffer # replay_buffer.store(o, a, r_pi, o2, d) # scheme 2 prob_pi_a = np.exp(get_pi_log(o))[a] replay_buffer.store(o, a, prob_pi_a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of episode. Training (ep_len times). 
if d or (ep_len == max_ep_len): # make sure: max_ep_len < steps_per_epoch ep_index += 1 print('episode: {}, ep_len: {}, reward: {}'.format( ep_index, ep_len, ep_ret)) """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. """ for j in range(int(ep_len)): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], prob_pi_a_ph: batch['prob_pi_a'], d_ph: batch['done'], } # step_ops = [q1_loss, q2_loss, q1, q2, logp_pi, alpha, train_pi_op, train_value_op, target_update] outs = sess.run(step_ops, feed_dict) logger.store(LossQ1=outs[0], LossQ2=outs[1], Q1Vals=outs[2], Q2Vals=outs[3], LogPi=outs[4], Alpha=outs[5]) #if d: logger.store(EpRet=ep_ret, EpLen=ep_len) # o = env.reset() ##################### # o, r, d, ep_ret, ep_len = env.step(1)[0], 0, False, 0, 0 ##################### o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent(10) # if logger.get_stats('TestEpRet')[0] >= 190: # print('Recalculating TestEpRet...') # test_agent(100) # test_ep_ret = logger.get_stats('TestEpRet')[0] # logger.store(): store the data; logger.log_tabular(): log the data; logger.dump_tabular(): write the data # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Alpha', average_only=True) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) # logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) # logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) # logger.log_tabular('LossV', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
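# ---------------------------------------------------------------------------
# NumPy sketch (toy data) of the "scheme 2" soft value backup selected above
# for the discrete-action case: the expectation of each target critic under
# the next-state policy, a min over the two critics, minus alpha times the
# negative entropy of pi(.|s').
# ---------------------------------------------------------------------------
import numpy as np

gamma, alpha = 0.99, 0.2
batch, n_act = 4, 3
rng = np.random.default_rng(1)

pi_log_x2 = np.log(rng.dirichlet(np.ones(n_act), size=batch))  # log pi(.|s')
q1_x2 = rng.normal(size=(batch, n_act))                        # target Q1(s', .)
q2_x2 = rng.normal(size=(batch, n_act))                        # target Q2(s', .)
r = rng.normal(size=batch)
d = rng.integers(0, 2, size=batch)

p = np.exp(pi_log_x2)
min_q = np.minimum((p * q1_x2).sum(1), (p * q2_x2).sum(1))     # min of E_pi[Q]
v_backup = min_q - alpha * (p * pi_log_x2).sum(1)              # soft state value
q_backup = r + gamma * (1 - d) * v_backup
print(q_backup)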
def dqn(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, update_after=1000, update_every=50, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.n print(obs_dim, act_dim) # Create actor-critic module and target networks ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) ac_targ = deepcopy(ac) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters()) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple(core.count_vars(module) for module in [ac.q1, ac.q2]) logger.log('\nNumber of parameters: \t q1: %d, \t q2: %d\n' % var_counts) # Set up function for computing SAC Q-losses def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done'] q1 = ac.q1(o) q2 = ac.q2(o) q1_a = q1.gather(1, a.long()).squeeze(-1) q2_a = q2.gather(1, a.long()).squeeze(-1) # Bellman backup for Q functions with torch.no_grad(): # Target Q-values q1_targ = torch.max(ac_targ.q1(o2), dim=1)[0] q2_targ = torch.max(ac_targ.q2(o2), dim=1)[0] q_targ = torch.min(q1_targ, q2_targ) backup = r + gamma * (1 - d) * q_targ # MSE loss against Bellman backup loss_q1 = ((q1_a - backup) ** 2).mean() loss_q2 = ((q2_a - backup) ** 2).mean() loss_q = loss_q1 + loss_q2 # Useful info for logging q_info = dict(Q1Vals=q1.detach().numpy(), Q2Vals=q2.detach().numpy()) return loss_q, q_info # Set up optimizers for policy and q-function q_optimizer = Adam(q_params, lr=lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(data): # First run one gradient descent step for Q1 and Q2 q_optimizer.zero_grad() loss_q, q_info = compute_loss_q(data) loss_q.backward() q_optimizer.step() # Record things logger.store(LossQ=loss_q.item(), **q_info) # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. 
p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) def get_action(o, deterministic=False): return ac.act(torch.as_tensor(o, dtype=torch.float32), deterministic) def test_agent(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy. if t > start_steps or np.random.random() > 0.05: a = get_action(o) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Update handling if t >= update_after and t % update_every == 0: for j in range(update_every): batch = replay_buffer.sample_batch(batch_size) update(data=batch) # End of epoch handling if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
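# ---------------------------------------------------------------------------
# PyTorch sketch (toy tensors) of the Q-learning target used by compute_loss_q
# above: each target network is maximized over actions, and the two maxima are
# combined with a min (a clipped double-Q variant of the DQN target).
# ---------------------------------------------------------------------------
import torch

gamma = 0.99
q1_targ_all = torch.randn(8, 4)          # stand-in for ac_targ.q1(o2), all actions
q2_targ_all = torch.randn(8, 4)          # stand-in for ac_targ.q2(o2), all actions
r, d = torch.randn(8), torch.zeros(8)

q1_targ = q1_targ_all.max(dim=1)[0]      # max_a Q1_targ(s', a)
q2_targ = q2_targ_all.max(dim=1)[0]      # max_a Q2_targ(s', a)
backup = r + gamma * (1 - d) * torch.min(q1_targ, q2_targ)
print(backup)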
def LBPO(env_fn, env_name='', actor_critic=core.MLPActorCriticCost, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, jf_lr=1e-3, penalty_init=1., penalty_lr=5e-2, cost_lim=25, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, target_l2=0.012, logger_kwargs=dict(), save_freq=10, beta=0.01, beta_thres=0.05): """ Lyapunov Barrier Policy Optimization Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. env_name : Name of the environment actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) 
train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. cost_lim (float): Cumulative constraint threshold that we want the agent to respect. target_l2 (float): Hard constraint on KL or a trust region constraint. beta(float): Barrier parameter to control the amount of risk aversion. beta(thres): Barrier parameter for gradient clipping. Set to 0.05 """ # Special function to avoid certain slowdowns from PyTorch + MPI combo. setup_pytorch_for_mpi() # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Random seed seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape if 'Grid' in env_name: ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) else: ac = torch.load('safe_initial_policies/' + env_name + '.pt') # Sync params across processes sync_params(ac) # Count variables var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.Qv1]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Set up penalty params soft_penalty = Variable(torch.exp(torch.Tensor([penalty_init])) - 1, requires_grad=True) penalty_optimizer = torch.optim.Adam([soft_penalty], lr=penalty_lr) print("Beta: {} Beta threshold: {}".format(beta, beta_thres)) constraint_violations = [0] constraint_violations_count = [0] def safe_transform(data, baseline_pi, pi, epsilon, proj_max_dist): # Do a line search max_steps = 10 obs = data['obs'] for step in range(max_steps): ls_alpha = 0.5**step for param1, param2, target_param in zip( ac.pi.parameters(), ac.baseline_pi.parameters(), ac.pi_mix.parameters()): target_param.data.copy_((ls_alpha) * param1.data + (1 - ls_alpha) * param2.data) mix_act = ac.act_pi(ac.pi_mix, obs).detach() epsilon_observed = ac.Qj1(torch.cat( (obs, mix_act), dim=1)) - ac.Qj1( torch.cat((obs, ac.baseline_pi(obs)), dim=1)) if epsilon_observed.mean() <= epsilon or step == max_steps - 1: for param, target_param in zip(ac.pi_mix.parameters(), ac.pi.parameters()): target_param.data.copy_(param.data) break return ls_alpha def conjugate_gradients(Avp, b, nsteps, residual_tol=1e-10): x = torch.zeros(b.size()) r = b.clone() p = b.clone() rdotr = torch.dot(r, r) for i in range(nsteps): _Avp = Avp(p) alpha = rdotr / torch.dot(p, _Avp) x += alpha * p r -= alpha * _Avp new_rdotr = torch.dot(r, r) betta = new_rdotr / rdotr p = r + betta * p rdotr = new_rdotr if rdotr < residual_tol: break return x def linesearch(model, f, x, fullstep, expected_improve_rate, max_backtracks=10, accept_ratio=.1): fval = f().data for (_n_backtracks, stepfrac) in enumerate(.5**np.arange(max_backtracks)): xnew = x + stepfrac * fullstep set_flat_params_to(model, xnew) newfval = f().data actual_improve = fval - newfval expected_improve = expected_improve_rate * 
stepfrac ratio = actual_improve / expected_improve if ratio.item() > accept_ratio and actual_improve.item() > 0: return True, xnew return False, x def trust_region_step(model, get_loss, get_kl, max_kl, damping): loss = get_loss() grads = torch.autograd.grad(loss, model.parameters()) loss_grad = torch.cat([grad.view(-1) for grad in grads]).data def Fvp(v): kl = get_kl() kl = kl.mean() grads = torch.autograd.grad(kl, model.parameters(), create_graph=True) flat_grad_kl = torch.cat([grad.view(-1) for grad in grads]) kl_v = (flat_grad_kl * Variable(v)).sum() grads = torch.autograd.grad(kl_v, model.parameters()) flat_grad_grad_kl = torch.cat( [grad.contiguous().view(-1) for grad in grads]).data return flat_grad_grad_kl + v * damping stepdir = conjugate_gradients(Fvp, -loss_grad, 10) shs = 0.5 * (stepdir * Fvp(stepdir)).sum(0, keepdim=True) lm = torch.sqrt(shs / max_kl) fullstep = stepdir / lm[0] neggdotstepdir = (-loss_grad * stepdir).sum(0, keepdim=True) print(("lagrange multiplier:", lm[0], "grad_norm:", loss_grad.norm())) prev_params = get_flat_params_from(model) success, new_params = linesearch(model, get_loss, prev_params, fullstep, neggdotstepdir / lm[0]) set_flat_params_to(model, new_params) return loss # Set up function for computing PPO policy loss def compute_loss_pi(data, epoch_no=1): obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data[ 'logp'] def get_kl(old_mean=None, new_mean=None): if old_mean is None: mean1 = ac.pi(obs) else: mean1 = old_mean log_std1, std1 = -2.99, 0.05 if new_mean is None: mean0 = torch.autograd.Variable(mean1.data) else: mean0 = new_mean log_std0 = -2.99 std0 = 0.05 kl = log_std1 - log_std0 + (std0**2 + (mean0 - mean1).pow(2)) / ( 2.0 * std1**2) - 0.5 return kl.sum(1, keepdim=True) def get_loss_pi(): if ac.epsilon < 0: loss_pi = (ac.Qj1(torch.cat((obs, ac.pi(obs)), dim=1))).mean() else: # Surrogate objective that matches the gradient of the barrier at \pi=\pi_B if (beta / ac.epsilon) - beta_thres > 0: loss_pi = - (ac.Qv1(torch.cat((obs, ac.pi(obs)),dim=1))).mean() + \ (beta/ac.epsilon)*ac.Qj1(torch.cat((obs, ac.pi(obs)),dim=1)).mean() else: loss_pi = -(ac.Qv1(torch.cat( (obs, ac.pi(obs)), dim=1))).mean() return loss_pi old_mean = ac.pi(obs).detach().data loss_pi = trust_region_step(ac.pi, get_loss_pi, get_kl, target_l2, 0.1) if ac.epsilon >= 0: alpha_mix = safe_transform( data, ac.baseline_pi, ac.pi, ac.epsilon, np.sqrt( np.max((target_l2 + 0.5) * (2.0 * 0.05**2) - 0.05**2, 0))) logger.store(AlphaMix=alpha_mix) if (beta / ac.epsilon) - beta_thres > 0: logger.store(CostGradWeight=(beta / ac.epsilon)) else: logger.store(CostGradWeight=0) else: logger.store(AlphaMix=-1) logger.store(CostGradWeight=-1) # Useful extra info approx_l2 = torch.sqrt(torch.mean( (ac.pi(obs) - data['old_act'])**2)).item() approx_kl = get_kl(old_mean=old_mean, new_mean=ac.pi(obs).detach()).mean().item() ent = 0 clipped = [0] clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item() pi_info = dict(kl=approx_kl, l2=approx_l2, ent=ent, cf=clipfrac) return loss_pi, pi_info # Set up function for computing value loss def compute_loss_v(data): obs, act, ret = data['obs'], data['act'], data['ret'] return ((ac.Qv1(torch.cat((obs, act), dim=1)) - ret)**2).mean(), ((ac.Qv2(torch.cat( (obs, act), dim=1)) - ret)**2).mean() # Set up function for computing value loss def compute_loss_j(data): obs, act, cost_ret = data['obs'], data['act'], data['cost_ret'] return ((ac.Qj1(torch.cat((obs, act), dim=1)) - cost_ret)**2).mean(), ((ac.Qj2(torch.cat( (obs, act), dim=1)) - 
cost_ret)**2).mean() # Set up optimizers for policy and value function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) pi_bc_optimizer = Adam(ac.pi.parameters(), lr=0.001) vf1_optimizer = Adam(ac.Qv1.parameters(), lr=vf_lr) vf2_optimizer = Adam(ac.Qv2.parameters(), lr=vf_lr) jf1_optimizer = Adam(ac.Qj1.parameters(), lr=jf_lr) jf2_optimizer = Adam(ac.Qj2.parameters(), lr=jf_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(epoch_no, constraint_violations, constraint_violations_count): # global soft_penalty, penalty_optimizer data = buf.get() # Update the penalty curr_cost = logger.get_stats('EpCostRet')[0] if curr_cost - cost_lim > 0: logger.log('Warning! Safety constraint is already violated.', 'red') ac.epsilon = (1 - gamma) * (cost_lim - curr_cost) if epoch_no == 0 or ac.epsilon >= 0: ac.baseline_pi = copy.deepcopy(ac.pi) ac.baseline_Qj = copy.deepcopy(ac.Qj1) pi_l_old, v_l_old, j_l_old = 0, 0, 0 pi_info_old = dict(kl=0, l2=0, ent=0, cf=0) if epoch_no == 0: for i in range(train_v_iters): vf1_optimizer.zero_grad() vf2_optimizer.zero_grad() loss_v1, loss_v2 = compute_loss_v(data) loss_v1.backward() loss_v2.backward() mpi_avg_grads(ac.Qv1) # average grads across MPI processes mpi_avg_grads(ac.Qv2) vf1_optimizer.step() vf2_optimizer.step() jf1_optimizer.zero_grad() jf2_optimizer.zero_grad() loss_j1, loss_j2 = compute_loss_j(data) loss_j1.backward() loss_j2.backward() mpi_avg_grads(ac.Qj1) # average grads across MPI processes mpi_avg_grads(ac.Qj2) jf1_optimizer.step() jf2_optimizer.step() # Trust region update for policy loss_pi, pi_info = compute_loss_pi(data, epoch_no=epoch_no) logger.store(StopIter=0) # Value and Cost Value function learning for i in range(train_v_iters): vf1_optimizer.zero_grad() vf2_optimizer.zero_grad() loss_v1, loss_v2 = compute_loss_v(data) loss_v1.backward() loss_v2.backward() mpi_avg_grads(ac.Qv1) # average grads across MPI processes mpi_avg_grads(ac.Qv2) vf1_optimizer.step() vf2_optimizer.step() jf1_optimizer.zero_grad() jf2_optimizer.zero_grad() loss_j1, loss_j2 = compute_loss_j(data) loss_j1.backward() loss_j2.backward() mpi_avg_grads(ac.Qj1) # average grads across MPI processes mpi_avg_grads(ac.Qj2) jf1_optimizer.step() jf2_optimizer.step() # Log changes from update kl, l2, ent, cf = pi_info['kl'], pi_info['l2'], pi_info_old[ 'ent'], pi_info['cf'] logger.store(LossPi=pi_l_old, LossV=v_l_old, LossJ=j_l_old, KL=kl, L2=l2, Entropy=ent, ClipFrac=cf, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v1.item() - v_l_old), DeltaLossJ=(loss_j1.item() - j_l_old), Penalty=torch.nn.functional.softplus(soft_penalty)) # Prepare for interaction with environment start_time = time.time() o, ep_ret, ep_cost_ret, ep_len = env.reset(), 0, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v, j, logp = ac.step(torch.as_tensor(o, dtype=torch.float32)) noise = 0.05 * np.random.randn(*a.shape) # fixed noise a = a + noise next_o, r, d, info = env.step(a) ep_ret += r ep_cost_ret += info.get('cost', 0) ep_len += 1 # save and log buf.store(o, a, r, info.get('cost', 0), v, j, logp, a) logger.store(VVals=v, JVals=j) # Update obs (critical!) o = next_o timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t == local_steps_per_epoch - 1 if terminal or epoch_ended: if epoch_ended and not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' 
% ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: _, v, j, _ = ac.step( torch.as_tensor(o, dtype=torch.float32)) else: v, j = 0, 0 buf.finish_path(v, j) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpCostRet=ep_cost_ret, EpLen=ep_len) o, ep_ret, ep_cost_ret, ep_len = env.reset(), 0, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! update(epoch, constraint_violations, constraint_violations_count) # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpCostRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('JVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('LossJ', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('DeltaLossJ', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('Epsilon', ac.epsilon) logger.log_tabular('CostGradWeight', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Penalty', average_only=True) logger.log_tabular('AlphaMix', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
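The update above leans on two numerical pieces that are easy to get wrong: the conjugate-gradient solve of F x = -g, where F is only available through the Fisher-vector product Fvp, and the rescaling of the resulting direction so that the quadratic form 0.5 x^T F x equals the trust-region radius target_l2. The snippet below is a minimal, self-contained sketch of those two steps on a toy quadratic; the explicit matrix F_mat, the vector grad, and max_kl are illustrative stand-ins, not objects from the code above.

import torch

def cg(Avp, b, nsteps=10, residual_tol=1e-10):
    # Solve A x = b given only the matrix-vector product Avp(v) = A v.
    x = torch.zeros_like(b)
    r, p = b.clone(), b.clone()
    rdotr = torch.dot(r, r)
    for _ in range(nsteps):
        Ap = Avp(p)
        alpha = rdotr / torch.dot(p, Ap)
        x = x + alpha * p
        r = r - alpha * Ap
        new_rdotr = torch.dot(r, r)
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
        if rdotr < residual_tol:
            break
    return x

F_mat = torch.tensor([[3.0, 1.0], [1.0, 2.0]])   # toy positive-definite "Fisher" matrix
grad = torch.tensor([0.5, -1.0])                  # toy policy gradient
fvp = lambda v: F_mat @ v                         # stand-in for the Fvp closure above

max_kl = 0.01                                     # plays the role of target_l2
stepdir = cg(fvp, -grad)
shs = 0.5 * torch.dot(stepdir, fvp(stepdir))
lm = torch.sqrt(shs / max_kl)                     # same scaling as in trust_region_step
fullstep = stepdir / lm
print(0.5 * torch.dot(fullstep, fvp(fullstep)).item())  # ~= max_kl by construction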
def main(): actor_critic = core.MLPActorCritic hidden_size = 64 activation = torch.nn.Tanh seed = 5 steps_per_epoch = 2048 epochs = 1000 gamma = 0.99 lam = 0.97 beta = 3.0 pi_lr = 3e-4 vf_lr = 1e-3 train_pi_iters = 80 train_vf_iters = 80 max_ep_len = 1000 target_kl = 0.01 save_freq = 10 obs_norm = True view_curve = False # make an environment # env = gym.make('CartPole-v0') # env = gym.make('CartPole-v1') # env = gym.make('MountainCar-v0') # env = gym.make('LunarLander-v2') env = gym.make('BipedalWalker-v3') print(f"reward_threshold: {env.spec.reward_threshold}") obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Random seed env.seed(seed) random.seed(seed) torch.manual_seed(seed) np.random.seed(seed) # Create actor-critic module ac = actor_critic(env.observation_space, env.action_space, (hidden_size, hidden_size), activation) # Set up optimizers for policy and value function pi_optimizer = AdamW(ac.pi.parameters(), lr=pi_lr, eps=1e-6) vf_optimizer = AdamW(ac.v.parameters(), lr=vf_lr, eps=1e-6) # Count variables var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v]) # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Prepare for interaction with environment start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 ep_num = 0 ep_ret_buf, eval_ret_buf = [], [] loss_buf = {'pi': [], 'vf': []} obs_normalizer = RunningMeanStd(shape=env.observation_space.shape) # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): env.render() if obs_norm: obs_normalizer.update(np.array([o])) o_norm = np.clip( (o - obs_normalizer.mean) / np.sqrt(obs_normalizer.var), -10, 10) a, v, logp = ac.step( torch.as_tensor(o_norm, dtype=torch.float32)) else: a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32)) next_o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # save and log if obs_norm: buf.store(o_norm, a, r, v, logp) else: buf.store(o, a, r, v, logp) # Update obs o = next_o timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t == local_steps_per_epoch - 1 if terminal or epoch_ended: if timeout or epoch_ended: if obs_norm: obs_normalizer.update(np.array([o])) o_norm = np.clip((o - obs_normalizer.mean) / np.sqrt(obs_normalizer.var), -10, 10) _, v, _ = ac.step( torch.as_tensor(o_norm, dtype=torch.float32)) else: _, v, _ = ac.step( torch.as_tensor(o, dtype=torch.float32)) else: if obs_norm: obs_normalizer.update(np.array([o])) v = 0 buf.finish_path(v) if terminal: ep_ret_buf.append(ep_ret) eval_ret_buf.append(np.mean(ep_ret_buf[-100:])) ep_num += 1 if view_curve: plot(ep_ret_buf, eval_ret_buf, loss_buf) else: print(f'Episode: {ep_num:3} Reward: {ep_ret:3}') if eval_ret_buf[-1] >= env.spec.reward_threshold: print(f"\n{env.spec.id} is solved in {ep_num} episodes!") return o, ep_ret, ep_len = env.reset(), 0, 0 # Perform PPO update! update(buf, train_pi_iters, train_vf_iters, beta, target_kl, ac, pi_optimizer, vf_optimizer, loss_buf)
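main() above depends on a RunningMeanStd helper for observation normalization that is not defined in this excerpt. Assuming it follows the usual Welford-style parallel update (as in the OpenAI Baselines utility of the same name), a minimal sketch compatible with the obs_normalizer.update(np.array([o])) calls and the .mean / .var attributes used above would be:

import numpy as np

class RunningMeanStd:
    # Tracks a running mean and variance over batches of observations.
    def __init__(self, shape=(), epsilon=1e-4):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = epsilon

    def update(self, x):
        # x has shape (batch, *shape); merge batch statistics into running statistics.
        batch_mean, batch_var, batch_count = x.mean(axis=0), x.var(axis=0), x.shape[0]
        delta = batch_mean - self.mean
        tot_count = self.count + batch_count
        new_mean = self.mean + delta * batch_count / tot_count
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m2 = m_a + m_b + delta ** 2 * self.count * batch_count / tot_count
        self.mean, self.var, self.count = new_mean, m2 / tot_count, tot_count

With a helper like this, the per-step normalization in the loop is exactly the np.clip((o - obs_normalizer.mean) / np.sqrt(obs_normalizer.var), -10, 10) expression shown above.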
def asac_v2(actor_critic=core.mlp_actor_critic, seed=0, ac_kwargs=dict(), steps_per_epoch=5000, epochs=200, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=0.001, alpha_start=0.2, batch_size=100, start_steps=10000, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, loss_threshold=0.0001, delta=0.02, sample_step=2000): alpha = Alpha(alpha_start=alpha_start, delta=delta) alpha_t = alpha() tf.set_random_seed(seed) np.random.seed(seed) env = baxter() obs_dim = env.obs_dim act_dim = env.act_dim # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = 0.1 # Share information about action space with policy architecture # Inputs to computation graph #x_ph, a_ph, x2_ph, r_ph, d_ph, ret_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None, None) x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) alpha_ph = core.scale_holder() # Main outputs from computation graph #R, R_next = return_estimate(x_ph, x2_ph, **ac_kwargs) with tf.variable_scope('main'): mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v, Q, Q_pi, R = actor_critic( x_ph, a_ph, **ac_kwargs) # Target value network with tf.variable_scope('target'): _, _, _, _, _, _, _, v_targ, _, _, R_targ = actor_critic( x2_ph, a_ph, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple( core.count_vars(scope) for scope in [ 'main/pi', 'main/q1', 'main/q2', 'main/v', 'main/Q', 'main/R', 'main' ]) print(('\nNumber of parameters: \t pi: %d, \t' + \ 'q1: %d, \t q2: %d, \t v: %d, \t Q: %d, \t R: %d, \t total: %d\n')%var_counts) # Min Double-Q: min_q_pi = tf.minimum(q1_pi, q2_pi) # Targets for Q and V regression q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * v_targ) v_backup = tf.stop_gradient(min_q_pi - alpha_ph * logp_pi) Q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * R_targ) R_backup = tf.stop_gradient(Q_pi) adv = Q_pi - R dQ = Q_backup * (R - Q) pi_loss = tf.reduce_mean(alpha_ph * logp_pi - q1_pi) q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2) q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2) v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2) Q_loss = 0.5 * tf.reduce_mean((Q_backup - Q)**2) R_loss = 0.5 * tf.reduce_mean((R_backup - R)**2) value_loss = q1_loss + q2_loss + v_loss + Q_loss + R_loss # Policy train op # (has to be separate from value train op, because q1_pi appears in pi_loss) pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) value_params = get_vars('main/q') + get_vars('main/v') + get_vars( 'main/Q') + get_vars('main/R') with tf.control_dependencies([train_pi_op]): train_value_op = value_optimizer.minimize(value_loss, var_list=value_params) """ R_optimizer = tf.train.AdamOptimizer(learning_rate=lr) train_R_op = R_optimizer.minimize(R_loss, var_list=get_vars('R')) """ # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # All ops to call during one training step step_ops = [ pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, train_pi_op, 
train_value_op, target_update, R_loss, Q_loss, v_targ ] # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving def get_action(o, deterministic=False): act_op = mu if deterministic else pi return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)}) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch obs1_epi = np.zeros([2 * max_ep_len, obs_dim], dtype=np.float32) obs2_epi = np.zeros([2 * max_ep_len, obs_dim], dtype=np.float32) act_epi = np.zeros([2 * max_ep_len, act_dim], dtype=np.float32) rew_epi = np.zeros([2 * max_ep_len], dtype=np.float32) done_epi = np.zeros([2 * max_ep_len], dtype=np.float32) ptr_epi = 0 alpha_update = False epi_num = 0 for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy. """ if t > start_steps: a = get_action(o["feature"]) else: a = 0.1 - np.random.sample(act_dim) * 0.2 # Step the env o2, r = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o["feature"], a, r, o2["feature"], d) obs1_epi[ptr_epi] = o["feature"] obs2_epi[ptr_epi] = o2["feature"] act_epi[ptr_epi] = a rew_epi[ptr_epi] = r done_epi[ptr_epi] = d ptr_epi += 1 # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): epi_num += 1 print("epi : {}, alpha : {}, return : {}".format( epi_num, alpha_t, ep_ret)) """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. 
""" """ rew_epi[ptr_epi] = sess.run(R, feed_dict={x_ph: [o]})[0] rets_epi = scipy.signal.lfilter([1], [1, float(-gamma)], rew_epi[::-1], axis=0)[::-1] rets_epi = rets_epi[:-1] """ """ v_epi = sess.run(R, feed_dict={x_ph: obs_epi}) q_epi, adv_epi = sess.run([Q, adv], feed_dict={x_ph: obs_epi[:-1], a_ph: act_epi}) rets_epi = rew_epi + gamma*v_epi[1:] if t > start_steps: alpha.update_alpha(adv_epi, np.mean(rets_epi*(v_epi[:-1]-q_epi)) > 0) alpha_t = alpha() print("{} {}".format(np.mean(rets_epi*(v_epi[:-1]-q_epi)), alpha_t)) """ if ptr_epi >= max_ep_len: feed_dict = { x_ph: obs1_epi[:ptr_epi], x2_ph: obs2_epi[:ptr_epi], a_ph: act_epi[:ptr_epi], r_ph: rew_epi[:ptr_epi], d_ph: done_epi[:ptr_epi] } adv_epi, Q_epi, R_epi = sess.run([adv, Q, R], feed_dict) R_next_epi = sess.run(R, feed_dict={x_ph: obs2_epi[:ptr_epi]}) dQ_epi = (rew_epi[:ptr_epi] + gamma * (1 - done_epi[:ptr_epi]) * R_next_epi) * (R_epi - Q_epi) """ ret_epi = np.zeros([ptr_epi], dtype=np.float32) for i in np.arange(ptr_epi)[::-1]: if i == ptr_epi - 1: R_next_epi = sess.run(R, feed_dict={x_ph: [obs2_epi[i]]})[0] ret_epi[i] = rew_epi[i] + gamma*(1 - done_epi[i])*R_next_epi else: ret_epi[i] = rew_epi[i] + gamma*(1 - done_epi[i])*ret_epi[i+1] dQ_epi = ret_epi * (R_epi - Q_epi) """ if t > start_steps: alpha.update_alpha(adv_epi, np.mean(dQ_epi) > 0) alpha_t = alpha() print("{} {}".format(np.mean(dQ_epi), alpha_t)) obs1_epi = np.zeros([max_ep_len * 2, obs_dim], dtype=np.float32) obs2_epi = np.zeros([max_ep_len * 2, obs_dim], dtype=np.float32) act_epi = np.zeros([max_ep_len * 2, act_dim], dtype=np.float32) rew_epi = np.zeros([max_ep_len * 2], dtype=np.float32) done_epi = np.zeros([max_ep_len * 2], dtype=np.float32) ptr_epi = 0 """ batch = replay_buffer.sample_batch(1000) feed_dict = {x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], alpha_ph: alpha_t} dQ_epi = sess.run(dQ, feed_dict) """ for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], alpha_ph: alpha_t } outs = sess.run(step_ops, feed_dict) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
def ppo(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=None, target_kl=0.01, logger_kwargs=dict(), save_freq=10, TensorBoard=True, save_nn=True, save_every=1000, load_latest=False, load_custom=False, LoadPath=None, RTA_type=None): """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) 
train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. TensorBoard (bool): True plots to TensorBoard, False does not save_nn (bool): True saves neural network data, False does not save_every (int): How often to save neural network load_latest (bool): Load last saved neural network data before training load_custom (bool): Load custom neural network data file before training LoadPath (str): Path for custom neural network data file RTA_type (str): RTA framework, either 'CBF', 'SVL', 'ASIF', or 'SBSF' """ # Special function to avoid certain slowdowns from PyTorch + MPI combo. setup_pytorch_for_mpi() # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Instantiate environment env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Random seed for each cpu seed += 1 * proc_id() env.seed(seed) # Create actor-critic module ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) # Load model if True if load_latest: models = glob.glob(f"{PATH}/models/PPO/*") LoadPath = max(models, key=os.path.getctime) ac.load_state_dict(torch.load(LoadPath)) elif load_custom: ac.load_state_dict(torch.load(LoadPath)) # Sync params across processes sync_params(ac) # Count variables var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Set up function for computing PPO policy loss def compute_loss_pi(data): obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data[ 'logp'] # Policy loss pi, logp = ac.pi(obs, act) ratio = torch.exp(logp - logp_old) clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv loss_pi = -(torch.min(ratio * adv, clip_adv)).mean() # Useful extra info approx_kl = (logp_old - logp).mean().item() ent = pi.entropy().mean().item() clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio) clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item() pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac) return loss_pi, pi_info # Set up function for computing value loss def compute_loss_v(data): obs, ret = data['obs'], data['ret'] return ((ac.v(obs) - ret)**2).mean() # Set up optimizers for policy and value function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(): data = buf.get() pi_l_old, pi_info_old = compute_loss_pi(data) pi_l_old = pi_l_old.item() v_l_old = compute_loss_v(data).item() # Train policy with multiple steps of gradient descent for i in range(train_pi_iters): pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) kl = mpi_avg(pi_info['kl']) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break loss_pi.backward() mpi_avg_grads(ac.pi) # average grads across MPI processes pi_optimizer.step() logger.store(StopIter=i) # Value function learning for i in range(train_v_iters): vf_optimizer.zero_grad() loss_v = compute_loss_v(data) loss_v.backward() mpi_avg_grads(ac.v) # average grads across MPI processes vf_optimizer.step() # Log changes from update kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf'] logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old)) # Import RTA if RTA_type == 'CBF': from CBF_for_speed_limit import RTA elif RTA_type == 'SVL': from Simple_velocity_limit import RTA elif RTA_type == 'ASIF': from IASIF import RTA elif RTA_type == 'SBSF': from ISimplex import RTA # Call RTA, define action conversion if RTA_type != 'off': env.RTA_reward = RTA_type rta = RTA(env) def RTA_act(obs, act): act = np.clip(act, -env.force_magnitude, env.force_magnitude) x0 = [obs[0], obs[1], 0, obs[2], obs[3], 0] u_des = np.array([[act[0]], [act[1]], [0]]) u = rta.main(x0, u_des) new_act = [u[0, 0], u[1, 0]] if np.sqrt((act[0] - new_act[0])**2 + (act[1] - new_act[1])**2) < 0.0001: env.RTA_on = False else: env.RTA_on = True return new_act # Prepare for interaction with environment start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 total_episodes = 0 RTA_percent = 0 # Create TensorBoard file if True if TensorBoard and proc_id() == 0: if env_name == 'spacecraft-docking-continuous-v0' or env_name == 'spacecraft-docking-v0': Name = f"{PATH}/runs/Spacecraft-docking-" + current_time elif env_name == 'dubins-aircraft-v0' or env_name == 'dubins-aircraft-continuous-v0': Name = f"{PATH}/runs/Dubins-aircraft-" + current_time writer = SummaryWriter(Name) # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): batch_ret = [] # Track episode returns batch_len = [] # Track episode lengths batch_RTA_percent = [] # Track precentage of time RTA is on env.success = 0 # Track episode success rate env.failure = 0 # Track episode failure rate env.crash = 0 # Track episode crash rate env.overtime = 0 # Track episode over max time/control rate episodes = 0 # Track episodes delta_v = [] # Track episode total delta v for t in range(local_steps_per_epoch): a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32)) if RTA_type != 'off': # If RTA is on, get RTA action RTA_a = RTA_act(o, a) if env.RTA_on: RTA_percent += 1 next_o, r, d, _ = env.step(RTA_a) else: # If RTA is off, pass through desired action next_o, r, d, _ = env.step(a) if env_name == 'spacecraft-docking-continuous-v0' or env_name == 'spacecraft-docking-v0': over_max_vel, _, _ = env.check_velocity(a[0], a[1]) if over_max_vel: RTA_percent += 1 ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, v, logp) logger.store(VVals=v) # Update obs (critical!) o = next_o timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t == local_steps_per_epoch - 1 if terminal or epoch_ended: if epoch_ended and not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' 
% ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32)) else: v = 0 buf.finish_path(v) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) batch_ret.append(ep_ret) batch_len.append(ep_len) episodes += 1 if env_name == 'spacecraft-docking-continuous-v0' or env_name == 'spacecraft-docking-v0': delta_v.append(env.control_input / env.mass_deputy) batch_RTA_percent.append(RTA_percent / ep_len * 100) RTA_percent = 0 o, ep_ret, ep_len = env.reset(), 0, 0 total_episodes += episodes # Track success, failure, crash, overtime rates if episodes != 0: success_rate = env.success / episodes failure_rate = env.failure / episodes crash_rate = env.crash / episodes overtime_rate = env.overtime / episodes else: success_rate = 0 failure_rate = 0 crash_rate = 0 overtime_rate = 0 raise RuntimeError( "No completed episodes; logging will break [increase steps per epoch]" ) # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() # Average data over all cpus avg_batch_ret = mpi_avg(np.mean(batch_ret)) avg_batch_len = mpi_avg(np.mean(batch_len)) avg_success_rate = mpi_avg(success_rate) avg_failure_rate = mpi_avg(failure_rate) avg_crash_rate = mpi_avg(crash_rate) avg_overtime_rate = mpi_avg(overtime_rate) if env_name == 'spacecraft-docking-continuous-v0' or env_name == 'spacecraft-docking-v0': avg_delta_v = mpi_avg(np.mean(delta_v)) avg_RTA_percent = mpi_avg(np.mean(batch_RTA_percent)) if proc_id() == 0: # Only on one cpu # Plot to TensorBoard if True, only on one cpu if TensorBoard: writer.add_scalar('Return', avg_batch_ret, epoch) writer.add_scalar('Episode-Length', avg_batch_len * env.tau, epoch) writer.add_scalar('Success-Rate', avg_success_rate * 100, epoch) writer.add_scalar('Failure-Rate', avg_failure_rate * 100, epoch) writer.add_scalar('Crash-Rate', avg_crash_rate * 100, epoch) writer.add_scalar('Overtime-Rate', avg_overtime_rate * 100, epoch) if env_name == 'spacecraft-docking-continuous-v0' or env_name == 'spacecraft-docking-v0': writer.add_scalar('Delta-V', avg_delta_v, epoch) writer.add_scalar('RTA-on-percent', avg_RTA_percent, epoch) # Save neural network if true, can change to desired location if save_nn and epoch % save_every == 0 and epoch != 0: if not os.path.isdir(f"{PATH}/models"): os.mkdir(f"{PATH}/models") if not os.path.isdir(f"{PATH}/models/PPO"): os.mkdir(f"{PATH}/models/PPO") if env_name == 'spacecraft-docking-continuous-v0' or env_name == 'spacecraft-docking-v0': Name2 = f"{PATH}/models/PPO/Spacecraft-docking-" + current_time + f"-epoch{epoch}.dat" elif env_name ==
'dubins-aircraft-continuous-v0': Name2 = f"{PATH}/models/PPO/Dubins-aircraft-" + current_time + f"-epoch{epoch}.dat" torch.save(ac.state_dict(), Name2) # Average episodes per hour, episode per epoch ep_hr = mpi_avg(total_episodes) * args.cpu / (time.time() - start_time) * 3600 ep_Ep = mpi_avg(total_episodes) * args.cpu / (epoch + 1) # Plot on one cpu if proc_id() == 0: # Save neural network if save_nn: if not os.path.isdir(f"{PATH}/models"): os.mkdir(f"{PATH}/models") if not os.path.isdir(f"{PATH}/models/PPO"): os.mkdir(f"{PATH}/models/PPO") if env_name == 'spacecraft-docking-continuous-v0' or env_name == 'spacecraft-docking-v0': Name2 = f"{PATH}/models/PPO/Spacecraft-docking-" + current_time + "-final.dat" elif env_name == 'dubins-aircraft-v0' or env_name == 'dubins-aircraft-continuous-v0': Name2 = f"{PATH}/models/PPO/Dubins-aircraft-" + current_time + "-final.dat" torch.save(ac.state_dict(), Name2) # Print statistics on episodes print( f"Episodes per hour: {ep_hr:.0f}, Episodes per epoch: {ep_Ep:.0f}, Epochs per hour: {(epoch+1)/(time.time()-start_time)*3600:.0f}" )
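For reference, a call into this ppo() variant might look like the following. The environment id, ac_kwargs, and logger settings are placeholders, and the function additionally assumes module-level names such as env_name, PATH, current_time, and args are set by the surrounding script:

import gym

if __name__ == '__main__':
    ppo(lambda: gym.make(env_name),
        actor_critic=core.MLPActorCritic,
        ac_kwargs=dict(hidden_sizes=(64, 64)),
        steps_per_epoch=4000,
        epochs=50,
        max_ep_len=1000,
        RTA_type='off',  # or 'CBF', 'SVL', 'ASIF', 'SBSF' to enable run time assurance
        logger_kwargs=dict(output_dir='data/ppo', exp_name='ppo_rta_off'))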
def __init__(self, env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=100, epochs=10000, replay_size=int(2000000), gamma=0.99, polyak=0.995, lr=3e-4, p_lr=3e-4, alpha=0.0, batch_size=1024, start_steps=10000, update_after=0, update_every=50, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, algo='SAC'): """ Soft Actor-Critic (SAC) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, a ``q1`` module, and a ``q2`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q1`` and ``q2`` should accept a batch of observations and a batch of actions as inputs. When called, ``act``, ``q1``, and ``q2`` should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``q1`` (batch,) | Tensor containing one current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) ``q2`` (batch,) | Tensor containing the other current | estimate of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== Calling ``pi`` should return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``logp_pi`` (batch,) | Tensor containing log probabilities of | actions in ``a``. Importantly: gradients | should be able to flow back into ``a``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. 
max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ self.logger = EpochLogger(**logger_kwargs) self.logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) self.env, self.test_env = env_fn(), env_fn() self.obs_dim = self.env.observation_space.shape self.act_dim = self.env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! self.act_limit = self.env.action_space.high[0] # Create actor-critic module and target networks self.ac = actor_critic(self.env.observation_space, self.env.action_space, special_policy='awac', **ac_kwargs) self.ac_targ = actor_critic(self.env.observation_space, self.env.action_space, special_policy='awac', **ac_kwargs) self.ac_targ.load_state_dict(self.ac.state_dict()) self.gamma = gamma # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in self.ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) self.q_params = itertools.chain(self.ac.q1.parameters(), self.ac.q2.parameters()) # Experience buffer self.replay_buffer = ReplayBuffer(obs_dim=self.obs_dim, act_dim=self.act_dim, size=replay_size) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple( core.count_vars(module) for module in [self.ac.pi, self.ac.q1, self.ac.q2]) self.logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts) self.algo = algo self.p_lr = p_lr self.lr = lr self.alpha = 0 # # Algorithm specific hyperparams # Set up optimizers for policy and q-function self.pi_optimizer = Adam(self.ac.pi.parameters(), lr=self.p_lr, weight_decay=1e-4) self.q_optimizer = Adam(self.q_params, lr=self.lr) self.num_test_episodes = num_test_episodes self.max_ep_len = max_ep_len self.epochs = epochs self.steps_per_epoch = steps_per_epoch self.update_after = update_after self.update_every = update_every self.batch_size = batch_size self.save_freq = save_freq self.polyak = polyak # Set up model saving self.logger.setup_pytorch_saver(self.ac) print("Running Offline RL algorithm: {}".format(self.algo))
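The constructor freezes the target network's parameters and copies the main weights into it; the target update itself happens in the training loop, which is not part of this excerpt, and is the polyak average written in the docstring. A minimal sketch of that step for this class, assuming a method name of update_targets:

import torch

def update_targets(self):
    # theta_targ <- polyak * theta_targ + (1 - polyak) * theta
    # Target parameters have requires_grad=False, so in-place updates are safe here.
    with torch.no_grad():
        for p, p_targ in zip(self.ac.parameters(), self.ac_targ.parameters()):
            p_targ.data.mul_(self.polyak)
            p_targ.data.add_((1 - self.polyak) * p.data)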
def __init__(self, opt, job): self.opt = opt with tf.Graph().as_default(): tf.set_random_seed(opt.seed) np.random.seed(opt.seed) # Inputs to computation graph self.x_ph, self.a_ph, self.x2_ph = core.placeholders( opt.obs_shape, opt.act_shape, opt.obs_shape) self.r_ph, self.d_ph, self.logp_pi_ph = core.placeholders( (opt.Ln, ), (opt.Ln, ), (opt.Ln, )) # ------ if opt.alpha == 'auto': log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=0.0) alpha_v = tf.exp(log_alpha) else: alpha_v = opt.alpha # ------ # Main outputs from computation graph with tf.variable_scope('main'): mu, pi, logp_pi, self.logp_pi2, q1, q2, q1_pi, q2_pi, q1_mu, q2_mu \ = actor_critic(self.x_ph, self.x2_ph, self.a_ph, alpha_v, use_bn=opt.use_bn, phase=True, coefficent_regularizer=opt.c_regularizer, hidden_sizes=opt.hidden_size, action_space=opt.act_space, model=opt.model) # Target value network with tf.variable_scope('target'): _, _, logp_pi_, _, _, _, q1_pi_, q2_pi_, q1_mu_, q2_mu_ \ = actor_critic(self.x2_ph, self.x2_ph, self.a_ph, alpha_v, use_bn=opt.use_bn, phase=True, coefficent_regularizer=opt.c_regularizer, hidden_sizes=opt.hidden_size, action_space=opt.act_space, model=opt.model) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main']) print(('\nNumber of parameters: \t pi: %d, \t' + 'q1: %d, \t q2: %d, \t total: %d\n') % var_counts) # ------ if isinstance(alpha_v, tf.Tensor): alpha_loss = tf.reduce_mean( -log_alpha * tf.stop_gradient(logp_pi_ + opt.target_entropy)) alpha_optimizer = tf.train.AdamOptimizer( learning_rate=opt.lr, name='alpha_optimizer') train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss, var_list=[log_alpha]) # ------ # Min Double-Q: if opt.use_max: min_q_pi = tf.minimum(q1_mu_, q2_mu_) else: min_q_pi = tf.minimum(q1_pi_, q2_pi_) # x2 # get rid of abnormal explosion # min_q_pi = tf.clip_by_value(min_q_pi, -300.0, 900.0) #### n-step backup q_backup = tf.stop_gradient(min_q_pi) for step_i in reversed(range(opt.Ln)): q_backup = self.r_ph[:, step_i] + \ opt.gamma * (1 - self.d_ph[:, step_i]) * (-alpha_v * self.logp_pi_ph[:, step_i] + q_backup) #### # Soft actor-critic losses q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2) q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2) self.value_loss = q1_loss + q2_loss value_optimizer = tf.train.AdamOptimizer(learning_rate=opt.lr) value_params = get_vars('main/q') bn_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(bn_update_ops): train_value_op = value_optimizer.minimize( self.value_loss, var_list=value_params) # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([ tf.assign(v_targ, opt.polyak * v_targ + (1 - opt.polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # All ops to call during one training step if isinstance(alpha_v, Number): self.step_ops = [ q1_loss, q2_loss, q1, q2, logp_pi_, tf.identity(alpha_v), train_value_op, target_update ] else: self.step_ops = [ q1_loss, q2_loss, q1, q2, logp_pi_, alpha_v, train_value_op, target_update, train_alpha_op ] # Initializing targets to match main variables self.target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) if job == "learner": config = tf.ConfigProto() config.gpu_options.per_process_gpu_memory_fraction = opt.gpu_fraction 
config.inter_op_parallelism_threads = 1 config.intra_op_parallelism_threads = 1 self.sess = tf.Session(config=config) else: self.sess = tf.Session(config=tf.ConfigProto( # device_count={'GPU': 0}, intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)) self.sess.run(tf.global_variables_initializer()) if job == "learner": # Set up summary Ops self.train_ops, self.train_vars = self.build_summaries() self.writer = tf.summary.FileWriter( opt.summary_dir + "/" + "^^^^^^^^^^" + str(datetime.datetime.now()) + opt.env_name + "-" + opt.exp_name + "-workers_num:" + str(opt.num_workers) + "%" + str(opt.a_l_ratio), self.sess.graph) self.variables = ray.experimental.tf_utils.TensorFlowVariables( self.value_loss, self.sess)
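The n-step target built above folds the entropy bonus into every intermediate step: starting from the bootstrapped min(Q1', Q2') estimate, it recurses q <- r_i + gamma * (1 - d_i) * (-alpha * logp_i + q) backwards over the Ln stored steps. A small NumPy check of the same recursion, with purely illustrative values:

import numpy as np

def n_step_soft_backup(rews, dones, logps, bootstrap, gamma=0.99, alpha=0.2):
    # Entropy-regularized n-step target, mirroring the reversed loop above.
    # rews, dones, logps: length-Ln arrays for one stored sub-trajectory.
    q = bootstrap  # min(Q1', Q2') evaluated at the final state
    for i in reversed(range(len(rews))):
        q = rews[i] + gamma * (1.0 - dones[i]) * (-alpha * logps[i] + q)
    return q

rews = np.array([1.0, 0.5, -0.2])      # Ln = 3; values are illustrative only
dones = np.array([0.0, 0.0, 0.0])
logps = np.array([-1.2, -0.9, -1.1])
print(n_step_soft_backup(rews, dones, logps, bootstrap=4.0))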