def asac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
         steps_per_epoch=5000, epochs=200, replay_size=int(1e6), gamma=0.99,
         polyak=0.995, lr=5e-4, alpha_start=0.2, batch_size=100, start_steps=10000,
         max_ep_len=1000, logger_kwargs=dict(), save_freq=1,
         loss_threshold=0.0001, delta=0.02, sample_step=2000):

    alpha = Alpha(alpha_start=alpha_start, delta=delta)
    alpha_t = alpha()

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    # x_ph, a_ph, x2_ph, r_ph, d_ph, ret_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None, None)
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None)
    alpha_ph = core.scale_holder()

    # Main outputs from computation graph
    # R, R_next = return_estimate(x_ph, x2_ph, **ac_kwargs)
    with tf.variable_scope('main'):
        mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v, Q, Q_pi, R = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Target value network
    with tf.variable_scope('target'):
        _, _, _, _, _, _, _, v_targ, _, _, R_targ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in
                       ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main/Q', 'main/R', 'main'])
    print(('\nNumber of parameters: \t pi: %d, \t' +
           'q1: %d, \t q2: %d, \t v: %d, \t Q: %d, \t R: %d, \t total: %d\n') % var_counts)

    # Min Double-Q:
    min_q_pi = tf.minimum(q1_pi, q2_pi)

    # Targets for Q and V regression
    q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * v_targ)
    v_backup = tf.stop_gradient(min_q_pi - alpha_ph * logp_pi)
    Q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * R_targ)
    R_backup = tf.stop_gradient(Q_pi)
    adv = Q_pi - R

    # Soft actor-critic losses
    pi_loss = tf.reduce_mean(alpha_ph * logp_pi - q1_pi)
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1) ** 2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2) ** 2)
    v_loss = 0.5 * tf.reduce_mean((v_backup - v) ** 2)
    Q_loss = 0.5 * tf.reduce_mean((Q_backup - Q) ** 2)
    R_loss = 0.5 * tf.reduce_mean((R_backup - R) ** 2)
    value_loss = q1_loss + q2_loss + v_loss + Q_loss + R_loss

    # Policy train op
    # (has to be separate from value train op, because q1_pi appears in pi_loss)
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_params = get_vars('main/q') + get_vars('main/v') + get_vars('main/Q') + get_vars('main/R')
    with tf.control_dependencies([train_pi_op]):
        train_value_op = value_optimizer.minimize(value_loss, var_list=value_params)

    """
    R_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_R_op = R_optimizer.minimize(R_loss, var_list=get_vars('R'))
    """

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
                                  for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    # All ops to call during one training step
    step_ops = [pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi,
                train_pi_op, train_value_op, target_update, R_loss, Q_loss]

    # Initializing targets to match main variables
    target_init = tf.group([tf.assign(v_targ, v_main)
                            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    config = tf.ConfigProto(inter_op_parallelism_threads=30, intra_op_parallelism_threads=5)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph},
                          outputs={'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2,
                                   'v': v, 'Q': Q, 'R': R})

    def get_action(o, deterministic=False):
        act_op = mu if deterministic else pi
        return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)})

    def test_agent(n=10):
        # sess, mu, pi, q1, q2, q1_pi, q2_pi are captured from the enclosing
        # scope (the original `global` declaration here was incorrect).
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    ret_est = sess.run(R, feed_dict={x_ph: [o]})[0]
    total_steps = steps_per_epoch * epochs

    counter = 0
    ret_epi = []
    obs_epi = []
    loss_old = 10000

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards,
        use the learned policy.
        """
        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):
            """
            Perform all SAC updates at the end of the trajectory.
            This is a slight difference from the SAC specified in the
            original paper.
            """
            for j in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {x_ph: batch['obs1'],
                             x2_ph: batch['obs2'],
                             a_ph: batch['acts'],
                             r_ph: batch['rews'],
                             d_ph: batch['done'],
                             alpha_ph: alpha_t}
                outs = sess.run(step_ops, feed_dict)
                logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2],
                             LossV=outs[3], Q1Vals=outs[4], Q2Vals=outs[5],
                             VVals=outs[6], LogPi=outs[7], LossR=outs[11])
                counter += 1
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
            ret_est = sess.run(R, feed_dict={x_ph: [o]})[0]
            logger.store(RetEst=ret_est)

            if counter >= 1000:
                loss_new, _ = logger.get_stats('LossPi')
                counter = 0
                if (loss_old - loss_new) / np.absolute(loss_old) < loss_threshold and t > start_steps:
                    # Policy loss has plateaued: collect sample_step on-policy
                    # states and adapt the entropy coefficient from the
                    # advantage estimate Q_pi - R on those states.
                    rho_s = np.zeros([sample_step, obs_dim], dtype=np.float32)
                    rho_ptr = 0
                    for sample_t in range(sample_step):
                        a = get_action(o)
                        o2, r, d, _ = env.step(a)
                        ep_len += 1
                        d = False if ep_len == max_ep_len else d
                        rho_s[rho_ptr] = o
                        rho_ptr += 1  # bug fix: the pointer was never advanced
                        o = o2
                        if d or (ep_len == max_ep_len):
                            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                    advantages = sess.run(adv, feed_dict={x_ph: rho_s})
                    alpha.update_alpha(advantages)
                    # alpha.update_alpha(rho_q - rho_v)
                    alpha_t = alpha()
                    print(alpha_t)
                    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                    loss_old = 10000
                else:
                    loss_old = loss_new

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EntCoeff', alpha_t)
            logger.log_tabular('RetEst', average_only=True)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('LossR', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
def sac1_rnn(args, env_fn, actor_critic=core.mlp_actor_critic,
             sac1_dynamic_rnn=core.sac1_dynamic_rnn, ac_kwargs=dict(), seed=0,
             Lb=10, Lt=10, hc_dim=128, steps_per_epoch=3000, epochs=100,
             replay_size=int(5e5), gamma=0.99, reward_scale=1.0, polyak=0.995,
             lr=5e-4, alpha=0.2, h0=1.0, batch_size=150, start_steps=10000,
             max_ep_len_train=1000, max_ep_len_test=1000, logger_kwargs=dict(),
             save_freq=1):
    """
    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``mu``       (batch, act_dim)  | Computes mean actions from policy
                                           | given states.
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``. Critical: must be differentiable
                                           | with respect to policy parameters all
                                           | the way through action sampling.
            ``q1``       (batch,)          | Gives one estimate of Q* for
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q2``       (batch,)          | Gives another estimate of Q* for
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q1_pi``    (batch,)          | Gives the composition of ``q1`` and
                                           | ``pi`` for states in ``x_ph``:
                                           | q1(x, pi(x)).
            ``q2_pi``    (batch,)          | Gives the composition of ``q2`` and
                                           | ``pi`` for states in ``x_ph``:
                                           | q2(x, pi(x)).
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to SAC.

        seed (int): Seed for random number generators.

        Lb (int): Length of the burn-in segment used to warm up the RNN hidden
            state (gradients are stopped through it).

        Lt (int): Length of the training segment that follows the burn-in.

        hc_dim (int): Dimension of the RNN hidden state stored with each chunk.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        reward_scale (float): Divisor applied to episode returns before logging.

        polyak (float): Interpolation factor in polyak averaging for target
            networks. Target networks are updated towards main networks
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually
            close to 1.)

        lr (float): Learning rate (used for policy/value/alpha learning).

        alpha (float/'auto'): Entropy regularization coefficient. (Equivalent to
            inverse of reward scale in the original SAC paper.) / 'auto': alpha is
            automated.

        h0 (float): Multiplier on ``lr`` for the alpha optimizer.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        max_ep_len_train / max_ep_len_test (int): Maximum length of a training /
            test trajectory (episode / rollout).

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.
    """
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn('train'), env_fn('test')
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    ######################################
    # Inputs to computation graph
    # x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None)
    #
    # # Main outputs from computation graph
    # with tf.variable_scope('main'):
    #     mu, pi, logp_pi, q1, q2, q1_pi, q2_pi = actor_critic(x_ph, a_ph, **ac_kwargs)
    #
    # # Target value network
    # with tf.variable_scope('target'):
    #     _, _, logp_pi_, _, _, q1_pi_, q2_pi_ = actor_critic(x2_ph, a_ph, **ac_kwargs)
    ######################################

    obs_ph, hc_ph = core.placeholders((Lb + Lt + 1, obs_dim), (hc_dim,))
    a_ph_all, r_ph_all, d_ph_all, data01_ph = core.placeholders((Lb + Lt, act_dim), (Lb + Lt,),
                                                                (Lb + Lt,), (Lb + Lt,))

    obs_burn = obs_ph[:, :Lb]
    obs_train = obs_ph[:, Lb:]
    obs12_train = data01_ph[:, Lb:]  # mask: 1 for real steps, 0 for padding
    # obs12_train = tf.transpose(obs12_train, perm=[1, 0])
    a_ph = a_ph_all[:, Lb:]
    r_ph = r_ph_all[:, Lb:]
    d_ph = d_ph_all[:, Lb:]

    # Burn-in: run the RNN over the first Lb observations to warm up the
    # hidden state, then stop the gradient (R2D2-style stored-state burn-in).
    # The mask factor zeroes the state when the chunk starts with padding.
    _, state_burn_in = sac1_dynamic_rnn(obs_burn, hc_ph)
    state_burn_in = tf.stop_gradient(state_burn_in) * data01_ph[:, 0][..., tf.newaxis]
    s_outputs, _ = sac1_dynamic_rnn(obs_train, state_burn_in)

    s_ph = s_outputs[:, :-1]
    s2_ph = s_outputs[:, 1:]

    logp_pi, logp_pi2, q1, q2, q1_pi, q2_pi = [None, ] * Lt, [None, ] * Lt, [None, ] * Lt, \
                                              [None, ] * Lt, [None, ] * Lt, [None, ] * Lt
    logp_pi_, q1_pi_, q2_pi_ = [None, ] * Lt, [None, ] * Lt, [None, ] * Lt
    for i in range(Lt):
        # Main outputs from computation graph
        with tf.variable_scope('main', reuse=tf.AUTO_REUSE):
            _, _, logp_pi[i], logp_pi2[i], q1[i], q2[i], q1_pi[i], q2_pi[i] = \
                actor_critic(s_ph[:, i], s2_ph[:, i], a_ph[:, i], **ac_kwargs)

        # Target value network
        with tf.variable_scope('target', reuse=tf.AUTO_REUSE):
            _, _, logp_pi_[i], _, _, _, q1_pi_[i], q2_pi_[i] = \
                actor_critic(s2_ph[:, i], s2_ph[:, i], a_ph[:, i], **ac_kwargs)

    logp_pi, logp_pi2, q1, q2, q1_pi, q2_pi = tf.stack(logp_pi, axis=1), tf.stack(logp_pi2, axis=1), \
                                              tf.stack(q1, axis=1), tf.stack(q2, axis=1), \
                                              tf.stack(q1_pi, axis=1), tf.stack(q2_pi, axis=1)
    logp_pi_, q1_pi_, q2_pi_ = tf.stack(logp_pi_, axis=1), tf.stack(q1_pi_, axis=1), \
                               tf.stack(q2_pi_, axis=1)
    ######################################

    # Experience buffer
    # replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)
    replay_buffer_rnn = ReplayBuffer_RNN(Lb=Lb, Lt=Lt, hc_dim=hc_dim,
                                         obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables
    # var_counts = tuple(core.count_vars(scope) for scope in
    #                    ['main/pi', 'main/q1', 'main/q2', 'rnn'])
    # print(('\nNumber of parameters: \t pi: %d, \t' + 'q1: %d, \t q2: %d, \t rnn: %d\n') % var_counts)
    # print('Number of parameters: \t Total: %d\n' % sum(var_counts))

    ######
    if alpha == 'auto':
        target_entropy = (-np.prod(env.action_space.shape))

        log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=0.0)
        alpha = tf.exp(log_alpha)

        alpha_loss = tf.reduce_mean(-log_alpha * tf.stop_gradient(logp_pi + target_entropy))

        alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr * h0, name='alpha_optimizer')
        train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss, var_list=[log_alpha])
    ######

    # Min Double-Q:
    min_q_pi_ = tf.minimum(q1_pi_, q2_pi_)

    # Targets for Q and V regression
    v_backup = tf.stop_gradient(min_q_pi_ - alpha * logp_pi2)
    q_backup = r_ph + gamma * (1 - d_ph) * v_backup

    # Soft actor-critic losses (masked by obs12_train so padded steps contribute nothing)
    pi_loss = tf.reduce_mean(obs12_train * (alpha * logp_pi - q1_pi))
    q1_loss = 0.5 * tf.reduce_mean(obs12_train * (q_backup - q1) ** 2)
    q2_loss = 0.5 * tf.reduce_mean(obs12_train * (q_backup - q2) ** 2)
    value_loss = q1_loss + q2_loss

    # Policy train op
    # (has to be separate from value train op, because q1_pi appears in pi_loss)
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    pi_params = get_vars('main/pi')
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=pi_params)

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_params = get_vars('main/q') + get_vars('rnn')
    with tf.control_dependencies([train_pi_op]):
        train_value_op = value_optimizer.minimize(value_loss, var_list=value_params)

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
                                  for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    # All ops to call during one training step
    if isinstance(alpha, Number):
        step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, tf.identity(alpha),
                    train_pi_op, train_value_op, target_update]
    else:
        step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha,
                    train_pi_op, train_value_op, target_update, train_alpha_op]

    # Initializing targets to match main variables
    target_init = tf.group([tf.assign(v_targ, v_main)
                            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Inputs to computation graph (single-step graph for action selection)
    x_ph_geta, hc_ph_geta, a_ph_geta = core.placeholders((1, obs_dim), hc_dim, act_dim)
    s_geta, hc_geta = sac1_dynamic_rnn(x_ph_geta, hc_ph_geta)

    # Main outputs from computation graph
    with tf.variable_scope('main', reuse=tf.AUTO_REUSE):
        mu, pi, _, _, _, _, _, _ = actor_critic(s_geta[:, 0], s_geta[:, 0], a_ph_geta, **ac_kwargs)

    # Setup model saving
    # logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph},
    #                       outputs={'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2})
    saver = tf.train.Saver()
    checkpoint_path = logger_kwargs['output_dir'] + '/checkpoints'
    if not os.path.exists(checkpoint_path):
        os.makedirs(checkpoint_path)

    if args.is_test or args.is_restore_train:
        ckpt = tf.train.get_checkpoint_state(checkpoint_path)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            print("Model restored.")

    # def get_action(o, deterministic=False):
    #     act_op = mu if deterministic else pi
    #     return sess.run(act_op, feed_dict={x_ph_geta: o.reshape(1, -1)})[0]

    def get_action(o, hc_0, deterministic=False):
        """Single-step action selection; threads the RNN hidden state
        through successive calls."""
        act_op = mu if deterministic else pi
        action, hc_1 = sess.run([act_op, hc_geta],
                                feed_dict={x_ph_geta: o.reshape(1, 1, obs_dim),
                                           hc_ph_geta: hc_0})
        # time.sleep(0.001)
        return action[0], hc_1

    ############################## test ############################
    if args.is_test:
        test_env = gym.make(args.env)
        ave_ep_ret = 0
        for j in range(10000):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            hc_run = np.zeros((1, hc_dim,), dtype=np.float32)
            while not d:  # (d or (ep_len == 2000)):
                # Bug fix: get_action needs the hidden state; the original
                # stateless call get_action(o) would raise a TypeError.
                a, hc_run = get_action(o, hc_run)
                o, r, d, _ = test_env.step(a)
                ep_ret += r
                ep_len += 1
                if args.test_render:
                    test_env.render()
            ave_ep_ret = (j * ave_ep_ret + ep_ret) / (j + 1)
            print('ep_len', ep_len, 'ep_ret:', ep_ret,
                  'ave_ep_ret:', ave_ep_ret, '({}/10000)'.format(j + 1))
        return

    ############################## train ############################
    def test_agent(n=5):
        # print('test')
        # sess, mu, pi, q1, q2, q1_pi, q2_pi are captured from the enclosing
        # scope (the original `global` declaration here was incorrect).
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            hc_run_test = np.zeros((1, hc_dim,), dtype=np.float32)
            while not (d or (ep_len == max_ep_len_test)):
                # Take deterministic actions at test time
                a_test, hc_run_test = get_action(o, hc_run_test, True)
                o, r, d, _ = test_env.step(a_test)
                time.sleep(0.001)
                ep_ret += r
                ep_len += 1
                # test_env.render()
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    ################################## deques
    obs_hc_queue = deque([], maxlen=Lb + Lt + 1)
    a_r_d_data01_queue = deque([], maxlen=Lb + Lt)
    ################################## deques

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    ################################## deques reset
    t_queue = 1
    hc_run = np.zeros((1, hc_dim,), dtype=np.float32)
    for _i in range(Lb):
        obs_hc_queue.append((np.zeros((obs_dim,), dtype=np.float32),
                             np.zeros((hc_dim,), dtype=np.float32)))
        a_r_d_data01_queue.append((np.zeros((act_dim,), dtype=np.float32), 0.0, False, False))
    obs_hc_queue.append((o, hc_run[0]))
    ################################## deques reset

    total_steps = steps_per_epoch * epochs

    # test_ep_ret = test_ep_ret_1 = -10000.0
    test_ep_ret_best = test_ep_ret = -10000.0

    # Main loop: collect experience in env and update/log each epoch
    start = time.time()
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards,
        use the learned policy.
        """
        if t > start_steps:
            a, hc_run = get_action(o, hc_run)
        else:
            _, hc_run = get_action(o, hc_run)
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        # d = False if ep_len == max_ep_len_train else d

        # Store experience to replay buffer
        # replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        #################################### deques store
        a_r_d_data01_queue.append((a, r, d, True))
        obs_hc_queue.append((o2, hc_run[0]))

        if t_queue % Lt == 0:
            replay_buffer_rnn.store(obs_hc_queue, a_r_d_data01_queue)

        if (d or (ep_len == max_ep_len_train)) and t_queue % Lt != 0:
            # Pad the tail of the episode so a full Lt-chunk can be stored.
            for _0 in range(Lt - t_queue % Lt):
                a_r_d_data01_queue.append((np.zeros((act_dim,), dtype=np.float32), 0.0, False, False))
                obs_hc_queue.append((np.zeros((obs_dim,), dtype=np.float32),
                                     np.zeros((hc_dim,), dtype=np.float32)))
            replay_buffer_rnn.store(obs_hc_queue, a_r_d_data01_queue)

        t_queue += 1
        #################################### deques store

        # End of episode. Training (ep_len times).
        if d or (ep_len == max_ep_len_train):
            """
            Perform all SAC updates at the end of the trajectory.
            This is a slight difference from the SAC specified in the
            original paper.
            """
            for j in range(ep_len):
                batch = replay_buffer_rnn.sample_batch(batch_size)
                feed_dict = {obs_ph: batch['obs'],
                             hc_ph: batch['hc'],
                             a_ph_all: batch['acts'],
                             r_ph_all: batch['rews'],
                             d_ph_all: batch['done'],
                             data01_ph: batch['data01']}
                # step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha,
                #             train_pi_op, train_value_op, target_update]
                outs = sess.run(step_ops, feed_dict)
                logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2],
                             Q1Vals=outs[3][:, 0], Q2Vals=outs[4][:, 0],
                             LogPi=outs[5][:, 0], Alpha=outs[6])

            logger.store(EpRet=ep_ret / reward_scale, EpLen=ep_len)
            print("ep_len", ep_len, "time", time.time() - start)
            start = time.time()

            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

            ################################## deques reset
            t_queue = 1
            hc_run = np.zeros((1, hc_dim,), dtype=np.float32)
            for _i in range(Lb):
                obs_hc_queue.append((np.zeros((obs_dim,), dtype=np.float32),
                                     np.zeros((hc_dim,), dtype=np.float32)))
                a_r_d_data01_queue.append((np.zeros((act_dim,), dtype=np.float32), 0.0, False, False))
            obs_hc_queue.append((o, hc_run[0]))
            ################################## deques reset

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            if epoch < 2000:
                test_agent(25)
                # test_ep_ret = logger.get_stats('TestEpRet')[0]
                # print('TestEpRet', test_ep_ret, 'Best:', test_ep_ret_best)
            else:
                test_agent(25)
                test_ep_ret = logger.get_stats('TestEpRet')[0]
                # logger.epoch_dict['TestEpRet'] = []
                print('TestEpRet', test_ep_ret, 'Best:', test_ep_ret_best)

            # logger.store(): store the data; logger.log_tabular(): log the data;
            # logger.dump_tabular(): write the data

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('Name', name)  # assumes a module-level `name`
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            # test_ep_ret_1 = logger.get_stats('TestEpRet')[0]
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Alpha', average_only=True)
            logger.log_tabular('Q1Vals', with_min_and_max=False)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            # logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            # logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()

            # Save model
            if ((epoch % save_freq == 0) or (epoch == epochs - 1)) and test_ep_ret > test_ep_ret_best:
                save_path = saver.save(sess, checkpoint_path + '/model.ckpt', t)
                print("Model saved in path: %s" % save_path)
                test_ep_ret_best = test_ep_ret
def ddpg(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0,
         steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99,
         polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000,
         update_after=1000, update_every=50, act_noise=0.1, num_test_episodes=10,
         max_ep_len=1000, logger_kwargs=dict(), save_freq=1, info_key='danger',
         init=None):
    """
    Deep Deterministic Policy Gradient (DDPG)

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with an ``act``
            method, a ``pi`` module, and a ``q`` module. The ``act`` method and
            ``pi`` module should accept batches of observations as inputs,
            and ``q`` should accept a batch of observations and a batch of
            actions as inputs. When called, these should return:

            ===========  ================  ======================================
            Call         Output Shape      Description
            ===========  ================  ======================================
            ``act``      (batch, act_dim)  | Numpy array of actions for each
                                           | observation.
            ``pi``       (batch, act_dim)  | Tensor containing actions from policy
                                           | given observations.
            ``q``        (batch,)          | Tensor containing the current estimate
                                           | of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object
            you provided to DDPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target
            networks. Target networks are updated towards main networks
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        update_after (int): Number of env interactions to collect before
            starting to do gradient descent updates. Ensures replay buffer
            is full enough for useful updates.

        update_every (int): Number of env interactions that should elapse
            between gradient descent updates. Note: Regardless of how long
            you wait between updates, the ratio of env steps to gradient steps
            is locked to 1.

        act_noise (float): Stddev for Gaussian exploration noise added to
            policy at training time. (At test time, no noise is added.)

        num_test_episodes (int): Number of episodes to test the deterministic
            policy at the end of each epoch.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

        info_key (str): Key of the env's info dict to log (the per-episode
            maximum is summed over the iteration and averaged per episode).

        init: Optional initializer passed through to the actor-critic
            constructor.
    """
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Create actor-critic module and target networks
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs, init=init)
    ac_targ = deepcopy(ac)

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q])
    logger.log('\nNumber of parameters: \t pi: %d, \t q: %d\n' % var_counts)

    # Set up function for computing DDPG Q-loss
    def compute_loss_q(data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done']

        q = ac.q(o, a)

        # Bellman backup for Q function
        with torch.no_grad():
            q_pi_targ = ac_targ.q(o2, ac_targ.pi(o2))
            backup = r + gamma * (1 - d) * q_pi_targ

        # MSE loss against Bellman backup
        loss_q = ((q - backup) ** 2).mean()

        # Useful info for logging
        loss_info = dict(QVals=q.detach().numpy())

        return loss_q, loss_info

    # Set up function for computing DDPG pi loss
    def compute_loss_pi(data):
        o = data['obs']
        q_pi = ac.q(o, ac.pi(o))
        return -q_pi.mean()

    # Set up optimizers for policy and q-function
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    q_optimizer = Adam(ac.q.parameters(), lr=q_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update(data):
        # First run one gradient descent step for Q.
        q_optimizer.zero_grad()
        loss_q, loss_info = compute_loss_q(data)
        loss_q.backward()
        q_optimizer.step()

        # Freeze Q-network so you don't waste computational effort
        # computing gradients for it during the policy learning step.
        for p in ac.q.parameters():
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        pi_optimizer.zero_grad()
        loss_pi = compute_loss_pi(data)
        loss_pi.backward()
        pi_optimizer.step()

        # Unfreeze Q-network so you can optimize it at next DDPG step.
        for p in ac.q.parameters():
            p.requires_grad = True

        # Record things
        logger.store(LossQ=loss_q.item(), LossPi=loss_pi.item(), **loss_info)

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                # NB: We use in-place operations "mul_", "add_" to update target
                # params, as opposed to "mul" and "add", which would make new tensors.
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)

    def get_action(o, noise_scale):
        a = ac.act(torch.as_tensor(o, dtype=torch.float32))
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent():
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_action(o, 0))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len, tr_info, it_info = env.reset(), 0, 0, 0, 0
    eps_per_iter = 0

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        # This variant deliberately skips the usual uniform-random warmup
        # (`t > start_steps` branch): actions always come from the learned
        # policy, with exploration provided by act_noise.
        a = get_action(o, act_noise)

        # Step the env
        o2, r, d, info = env.step(a)
        ep_ret += r
        ep_len += 1
        tr_info = max(info[info_key], tr_info)

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            it_info += tr_info
            o, ep_ret, ep_len, tr_info = env.reset(), 0, 0, 0
            eps_per_iter += 1

        # Update handling
        if t >= update_after and t % update_every == 0:
            for _ in range(update_every):
                batch = replay_buffer.sample_batch(batch_size)
                update(data=batch)

        # End of epoch handling
        if (t + 1) % steps_per_epoch == 0:
            epoch = (t + 1) // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            perf = logger.get_stats('EpRet')[0]
            for name, param in ac.pi.named_parameters():
                if param.requires_grad:
                    print('Policy params:', param.data)
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('QVals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.log_tabular('Info', it_info / eps_per_iter)
            logger.log_tabular('fail', it_info)
            logger.log_tabular('perf', perf)
            logger.dump_tabular()
            it_info = 0
            eps_per_iter = 0

    return perf
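
# ---------------------------------------------------------------------------
# Minimal usage sketch for `ddpg` above, illustrative only. This variant
# expects env.step() to return an info dict containing `info_key`; the
# wrapper below injects a default 'danger' entry so the demo runs on a
# standard Gym env (the env name and all values here are placeholders).
# ---------------------------------------------------------------------------
def _ddpg_demo():
    import gym

    class _InfoWrapper(gym.Wrapper):
        """Hypothetical wrapper guaranteeing info['danger'] exists."""
        def step(self, action):
            o, r, d, info = self.env.step(action)
            info.setdefault('danger', 0.0)  # 0.0 = "no failure" this step
            return o, r, d, info

    perf = ddpg(lambda: _InfoWrapper(gym.make('Pendulum-v1')),  # any Box env works
                epochs=5, info_key='danger',
                logger_kwargs=dict(output_dir='/tmp/ddpg_demo'))
    print('final average EpRet:', perf)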
def sac1(args, env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
         steps_per_epoch=5000, epochs=100, replay_size=int(2e6), gamma=0.99,
         reward_scale=1.0, polyak=0.995, lr=5e-4, alpha=0.2, batch_size=200,
         start_steps=10000, max_ep_len_train=1000, max_ep_len_test=1000,
         logger_kwargs=dict(), save_freq=1):
    """
    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``mu``       (batch, act_dim)  | Computes mean actions from policy
                                           | given states.
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``. Critical: must be differentiable
                                           | with respect to policy parameters all
                                           | the way through action sampling.
            ``q1``       (batch,)          | Gives one estimate of Q* for
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q2``       (batch,)          | Gives another estimate of Q* for
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q1_pi``    (batch,)          | Gives the composition of ``q1`` and
                                           | ``pi`` for states in ``x_ph``:
                                           | q1(x, pi(x)).
            ``q2_pi``    (batch,)          | Gives the composition of ``q2`` and
                                           | ``pi`` for states in ``x_ph``:
                                           | q2(x, pi(x)).
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to SAC.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        reward_scale (float): Divisor applied to episode returns before logging.

        polyak (float): Interpolation factor in polyak averaging for target
            networks. Target networks are updated towards main networks
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually
            close to 1.)

        lr (float): Learning rate (used for policy/value/alpha learning).

        alpha (float/'auto'): Entropy regularization coefficient. (Equivalent to
            inverse of reward scale in the original SAC paper.) / 'auto': alpha is
            automated.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        max_ep_len_train / max_ep_len_test (int): Maximum length of a training /
            test trajectory (episode / rollout).

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.
    """
    if not args.is_test:
        logger = EpochLogger(**logger_kwargs)
        logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(3), env_fn(1)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        mu, pi, logp_pi, logp_pi2, q1, q2, q1_pi, q2_pi = actor_critic(
            x_ph, x2_ph, a_ph, **ac_kwargs)

    # Target value network
    with tf.variable_scope('target'):
        _, _, logp_pi_, _, _, _, q1_pi_, q2_pi_ = actor_critic(
            x2_ph, x2_ph, a_ph, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main'])
    print(('\nNumber of parameters: \t pi: %d, \t' +
           'q1: %d, \t q2: %d, \t total: %d\n') % var_counts)

    ######
    if alpha == 'auto':
        target_entropy = (-np.prod(env.action_space.shape))

        log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=0.0)
        alpha = tf.exp(log_alpha)

        alpha_loss = tf.reduce_mean(-log_alpha * tf.stop_gradient(logp_pi + target_entropy))

        alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr * 0.1, name='alpha_optimizer')
        train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss, var_list=[log_alpha])
    ######

    # Min Double-Q:
    min_q_pi = tf.minimum(q1_pi_, q2_pi_)

    # Targets for Q and V regression
    v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi2)
    q_backup = r_ph + gamma * (1 - d_ph) * v_backup

    # Soft actor-critic losses
    pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi)
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1) ** 2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2) ** 2)
    value_loss = q1_loss + q2_loss

    # Policy train op
    # (has to be separate from value train op, because q1_pi appears in pi_loss)
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_params = get_vars('main/q')
    with tf.control_dependencies([train_pi_op]):
        train_value_op = value_optimizer.minimize(value_loss, var_list=value_params)

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([
            tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
        ])

    # All ops to call during one training step
    if isinstance(alpha, Number):
        step_ops = [
            pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, tf.identity(alpha),
            train_pi_op, train_value_op, target_update
        ]
    else:
        step_ops = [
            pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha,
            train_pi_op, train_value_op, target_update, train_alpha_op
        ]

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    ############################## save and restore ############################
    saver = tf.train.Saver()
    checkpoint_path = logger_kwargs['output_dir'] + '/checkpoints'
    if not os.path.exists(checkpoint_path):
        os.makedirs(checkpoint_path)

    if args.is_test or args.is_restore_train:
        ckpt = tf.train.get_checkpoint_state(checkpoint_path)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            print("Model restored.")

    def get_action(o, deterministic=False):
        act_op = mu if deterministic else pi
        return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)})[0]

    ############################## test ############################
    if args.is_test:
        test_env = gym.make(args.env)
        ave_ep_ret = 0
        for j in range(10000):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not d:  # (d or (ep_len == 2000)):
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
                if args.test_render:
                    test_env.render()
            ave_ep_ret = (j * ave_ep_ret + ep_ret) / (j + 1)
            print('ep_len', ep_len, 'ep_ret:', ep_ret,
                  'ave_ep_ret:', ave_ep_ret, '({}/10000)'.format(j + 1))
        return

    ############################## train ############################
    def test_agent(n=25):
        # sess, mu, pi, q1, q2, q1_pi, q2_pi are captured from the enclosing
        # scope (the original `global` declaration here was incorrect).
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len_test)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
                # test_env.render()
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    ep_index = 0
    test_ep_ret_best = test_ep_ret = -10000.0

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards,
        use the learned policy.
        """
        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        # d = False if ep_len == max_ep_len_train else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of episode. Training (ep_len times).
        if d or (ep_len == max_ep_len_train):
            ep_index += 1
            print('episode: {}, reward: {}'.format(ep_index, ep_ret / reward_scale))
            """
            Perform all SAC updates at the end of the trajectory.
            This is a slight difference from the SAC specified in the
            original paper.
            """
            for j in range(int(1.5 * ep_len)):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done'],
                }
                # step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha,
                #             train_pi_op, train_value_op, target_update]
                outs = sess.run(step_ops, feed_dict)
                logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2],
                             Q1Vals=outs[3], Q2Vals=outs[4],
                             LogPi=outs[5], Alpha=outs[6])

            logger.store(EpRet=ep_ret / reward_scale, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            test_agent(10)
            # test_ep_ret = logger.get_stats('TestEpRet')[0]
            # print('TestEpRet', test_ep_ret, 'Best:', test_ep_ret_best)
            if logger.get_stats('TestEpRet')[0] >= 280:
                print('Recalculating TestEpRet...')
                test_agent(100)
                test_ep_ret = logger.get_stats('TestEpRet')[0]
                # logger.epoch_dict['TestEpRet'] = []
                if test_ep_ret >= 300:
                    print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                          .format(ep_index, test_ep_ret))
                    exit()
                print('TestEpRet', test_ep_ret, 'Best:', test_ep_ret_best)

            # logger.store(): store the data; logger.log_tabular(): log the data;
            # logger.dump_tabular(): write the data

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('Num_Ep', ep_index)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=False)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Alpha', average_only=True)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            # logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            # logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()

            # Save model
            if ((epoch % save_freq == 0) or (epoch == epochs - 1)) and test_ep_ret > test_ep_ret_best:
                save_path = saver.save(sess, checkpoint_path + '/model.ckpt', t)
                print("Model saved in path: %s" % save_path)
                test_ep_ret_best = test_ep_ret
def maxsqn(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
           steps_per_epoch=5000, epochs=200, replay_size=int(1e6), gamma=0.99,
           polyak=0.995, lr=1e-3, alpha=0.2, batch_size=200, start_steps=5000,
           max_ep_len=1000, logger_kwargs=dict(), save_freq=1):
    """
    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``mu``       (batch, act_dim)  | Computes mean actions from policy
                                           | given states.
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``. Critical: must be differentiable
                                           | with respect to policy parameters all
                                           | the way through action sampling.
            ``q1``       (batch,)          | Gives one estimate of Q* for
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q2``       (batch,)          | Gives another estimate of Q* for
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q1_pi``    (batch,)          | Gives the composition of ``q1`` and
                                           | ``pi`` for states in ``x_ph``:
                                           | q1(x, pi(x)).
            ``q2_pi``    (batch,)          | Gives the composition of ``q2`` and
                                           | ``pi`` for states in ``x_ph``:
                                           | q2(x, pi(x)).
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to SAC.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target
            networks. Target networks are updated towards main networks
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually
            close to 1.)

        lr (float): Learning rate (used for policy/value/alpha learning).

        alpha (float/'auto'): Entropy regularization coefficient. (Equivalent to
            inverse of reward scale in the original SAC paper.) / 'auto': alpha is
            automated.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.
    """
    # print(max_ep_len, type(max_ep_len))
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(3), env_fn(1)
    obs_dim = env.observation_space.shape[0]
    obs_space = env.observation_space
    act_dim = env.action_space.n
    act_space = env.action_space

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders_from_space(obs_space, act_space,
                                                                 obs_space, None, None)

    ######
    if alpha == 'auto':
        # target_entropy = (-np.prod(env.action_space.n))
        # target_entropy = (np.prod(env.action_space.n)) / 4 / 10
        target_entropy = 0.35

        log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=0.0)
        alpha = tf.exp(log_alpha)
    ######

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        v_x, mu, pi, logp_pi, logp_pi2, q1, q2, q1_pi, q2_pi, q1_mu, q2_mu = \
            actor_critic(x_ph, x2_ph, a_ph, alpha, **ac_kwargs)

    # Target value network
    with tf.variable_scope('target'):
        _, _, _, logp_pi_, _, _, _, q1_pi_, q2_pi_, q1_mu_, q2_mu_ = \
            actor_critic(x2_ph, x2_ph, a_ph, alpha, **ac_kwargs)

    # Experience buffer
    if isinstance(act_space, Box):
        a_dim = act_dim
    elif isinstance(act_space, Discrete):
        a_dim = 1
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=a_dim, size=replay_size)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in
                       ['main/pi', 'main/q1', 'main/q2', 'main'])
    print(('\nNumber of parameters: \t pi: %d, \t' +
           'q1: %d, \t q2: %d, \t total: %d\n') % var_counts)

    ######
    if isinstance(alpha, tf.Tensor):
        alpha_loss = tf.reduce_mean(-log_alpha * tf.stop_gradient(logp_pi_ + target_entropy))

        alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr, name='alpha_optimizer')
        train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss, var_list=[log_alpha])
    ######

    # Min Double-Q:
    min_q_pi = tf.minimum(q1_pi_, q2_pi_)
    # min_q_pi = tf.minimum(q1_mu_, q2_mu_)

    # Targets for Q and V regression
    v_backup = tf.stop_gradient(min_q_pi)  # - alpha * logp_pi2)  ############## alpha=0
    q_backup = r_ph + gamma * (1 - d_ph) * v_backup

    # Soft actor-critic losses
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1) ** 2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2) ** 2)
    value_loss = q1_loss + q2_loss

    # # Policy train op
    # # (has to be separate from value train op, because q1_pi appears in pi_loss)
    # pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    # train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_params = get_vars('main/q')
    # with tf.control_dependencies([train_pi_op]):
    train_value_op = value_optimizer.minimize(value_loss, var_list=value_params)

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
                                  for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    # All ops to call during one training step
    if isinstance(alpha, Number):
        step_ops = [q1_loss, q2_loss, q1, q2, logp_pi_, tf.identity(alpha),
                    train_value_op, target_update]
    else:
        step_ops = [q1_loss, q2_loss, q1, q2, logp_pi_, alpha,
                    train_value_op, target_update, train_alpha_op]

    # Initializing targets to match main variables
    target_init = tf.group([tf.assign(v_targ, v_main)
                            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph},
                          outputs={'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2})

    def get_action0(o, deterministic=False):
        act_op = mu if deterministic else pi
        return sess.run(act_op, feed_dict={x_ph: np.expand_dims(o, axis=0)})[0]

    def entropy(logits, alpha):
        logps = log_softmax(np.array(logits) / alpha)
        return -np.sum(np.exp(logps) * logps)

    # entf = lambda x, logits: x * entropy(logits, x) - 0.3
    entf = lambda x, logits: entropy(logits, x) - 0.35

    def softmax_ud(logits):
        logps = log_softmax(np.array(logits))
        return np.exp(logps)

    def get_action(o, deterministic=False):
        if deterministic:
            return sess.run(mu, feed_dict={x_ph: np.expand_dims(o, axis=0)})[0]
        else:
            q_logits = sess.run(v_x, feed_dict={x_ph: np.expand_dims(o, axis=0)})[0]
            try:
                # Find the temperature at which softmax(q / alpha) has entropy 0.35.
                alpha_ad = optimize.bisect(entf, args=q_logits, a=0.0001, b=10000,
                                           xtol=1e-4, rtol=1e-5)
            except Exception:
                print(q_logits)
                alpha_ad = 0.0001
            # alpha_ad = min(1.0, alpha_ad)
            # print(alpha_ad)
            # print(alpha_ad * entropy(q_logits, alpha_ad))
            return np.random.choice(act_dim, 1, p=softmax_ud(q_logits / alpha_ad))[0]

    def test_agent(n=20):
        # n: number of tests
        # sess, mu, pi, q1, q2, q1_pi, q2_pi are captured from the enclosing
        # scope (the original `global` declaration here was incorrect).
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    # o = env.reset()
    #####################
    # o, r, d, ep_ret, ep_len = env.step(1)[0], 0, False, 0, 0
    #####################
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    ep_index = 0
    test_ep_ret = 0.0

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards,
        use the learned policy.
        """
        # if t > start_steps and 100 * t / total_steps > np.random.random():
        #     # greedy, avoid falling into sub-optimum
        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        # print(a, o2)
        # o2, r, _, d = env.step(a)
        #####################
        # d = d['ale.lives'] < 5
        #####################
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of episode. Training (ep_len times).
        if d or (ep_len == max_ep_len):  # make sure: max_ep_len < steps_per_epoch
            ep_index += 1
            print('episode: {}, ep_len: {}, reward: {}'.format(ep_index, ep_len, ep_ret))
            """
            Perform all SAC updates at the end of the trajectory.
            This is a slight difference from the SAC specified in the
            original paper.
            """
            for j in range(int(1.5 * ep_len)):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {x_ph: batch['obs1'],
                             x2_ph: batch['obs2'],
                             a_ph: batch['acts'],
                             r_ph: batch['rews'],
                             d_ph: batch['done'],
                             }
                # step_ops = [q1_loss, q2_loss, q1, q2, logp_pi, alpha,
                #             train_pi_op, train_value_op, target_update]
                outs = sess.run(step_ops, feed_dict)
                logger.store(LossQ1=outs[0], LossQ2=outs[1], Q1Vals=outs[2],
                             Q2Vals=outs[3], LogPi=outs[4], Alpha=outs[5])

            # if d:
            logger.store(EpRet=ep_ret, EpLen=ep_len)

            # o = env.reset()
            #####################
            # o, r, d, ep_ret, ep_len = env.step(1)[0], 0, False, 0, 0
            #####################
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # # Save model
            # if (epoch % save_freq == 0) or (epoch == epochs - 1):
            #     logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent(10)
            if logger.get_stats('TestEpRet')[0] >= 190:
                print('Recalculating TestEpRet...')
                # test_agent(100)
                # test_ep_ret = logger.get_stats('TestEpRet')[0]

            # logger.store(): store the data; logger.log_tabular(): log the data;
            # logger.dump_tabular(): write the data

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Alpha', average_only=True)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            # logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            # logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            # logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
def ppo(env, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=2048, epochs=250, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=100, train_v_iters=70, lam=0.95, max_ep_len=512, target_kl=0.005, logger_kwargs=dict(), save_freq=5): """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. 
        (Always between 0 and 1, close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.
    """

    # Special function to avoid certain slowdowns from PyTorch + MPI combo.
    setup_pytorch_for_mpi()

    # Set up logger and save configuration
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Random seed
    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Instantiate environment (`env` is passed in as a constructor here)
    env = env(
        "PandaPegIn",
        has_offscreen_renderer=True,
        # has_renderer=True,
        use_camera_obs=False,
        control_freq=100,
    )
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Create actor-critic module
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)

    # Load a pretrained model
    # fname = "data/ppo_peg_in_add_delta_pos_plus_plus/ppo_peg_in_add_delta_pos_plus_plus_s0/pyt_save/model24.pt"
    # pre_model = torch.load(fname)
    # ac.pi = pre_model.pi
    # ac.v = pre_model.v

    # Use TensorboardX
    writer = logger.create_writer()

    # Sync params across processes
    sync_params(ac)

    # Count variables
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # Set up experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Set up function for computing PPO policy loss
    def compute_loss_pi(data):
        obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data['logp']

        # Policy loss. Only the pi network changes between iterations;
        # data['obs'], data['act'], data['adv'], data['logp'] stay fixed
        # within one update.
        pi, logp = ac.pi(obs, act)
        ratio = torch.exp(logp - logp_old)
        clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
        loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()

        # Useful extra info
        approx_kl = (logp_old - logp).mean().item()
        ent = pi.entropy().mean().item()
        clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio)
        clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item()
        pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac)

        # loss_pi is to be minimized; pi_info carries the KL divergence,
        # entropy, and clip fraction for this batch.
        return loss_pi, pi_info

    # Set up function for computing value loss
    def compute_loss_v(data):
        obs, ret = data['obs'], data['ret']
        return ((ac.v(obs) - ret)**2).mean()

    # Set up optimizers for policy and value function
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update():
        data = buf.get()  # each update consumes one freshly collected batch

        pi_l_old, pi_info_old = compute_loss_pi(data)
        pi_l_old = pi_l_old.item()
        v_l_old = compute_loss_v(data).item()

        # Train policy with multiple steps of gradient descent: keep
        # minimizing the loss as long as the KL divergence stays within bounds.
        for i in range(train_pi_iters):
            pi_optimizer.zero_grad()
            loss_pi, pi_info = compute_loss_pi(data)
            kl = mpi_avg(pi_info['kl'])
            if kl > 1.5 * target_kl:
                logger.log('Early stopping at step %d due to reaching max kl.' % i)
                break
            loss_pi.backward()
            mpi_avg_grads(ac.pi)  # average grads across MPI processes
            pi_optimizer.step()

        logger.store(StopIter=i)

        # Value function learning
        for i in range(train_v_iters):
            vf_optimizer.zero_grad()
            loss_v = compute_loss_v(data)
            loss_v.backward()
            mpi_avg_grads(ac.v)  # average grads across MPI processes
            vf_optimizer.step()

        # Log changes from update
        kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
        logger.store(LossPi=pi_l_old, LossV=v_l_old,
                     KL=kl, Entropy=ent, ClipFrac=cf,
                     DeltaLossPi=(loss_pi.item() - pi_l_old),
                     DeltaLossV=(loss_v.item() - v_l_old))

    # Prepare for interaction with environment
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Move to the initial position
    pre_action = [0, 0, 0]
    for i in range(4):
        o, _, _, _ = env.step(pre_action)

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        print("epoch:", epoch)
        for t in range(local_steps_per_epoch):
            # Query the policy and value networks for the action, the value
            # estimate, and the log-probability of taking that action.
            a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32))

            next_o, r, d, _ = env.step(a)
            ep_ret += r  # per-episode return
            ep_len += 1  # per-episode length

            # save and log
            buf.store(o, a, r, v, logp)
            logger.store(VVals=v)

            # Update obs (critical!)
            o = next_o

            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t == local_steps_per_epoch - 1

            # Episode died, episode timed out, or the epoch ended.
            if terminal or epoch_ended:
                if epoch_ended and not terminal:
                    print('Warning: trajectory cut off by epoch at %d steps.' % ep_len, flush=True)
                # if trajectory didn't reach terminal state, bootstrap value target
                if timeout or epoch_ended:
                    _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32))
                else:
                    v = 0
                buf.finish_path(v)  # compute GAE advantages and rewards-to-go
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, epoch)

        # Perform PPO update!
        update()

        # Write data to TensorboardX
        # (the hard-coded global step assumes steps_per_epoch == 2048)
        stats_to_write = logger.get_stats('EpRet')
        writer.add_scalar('AverageEpRet', stats_to_write[0], global_step=(epoch + 1) * 2048)

        # Log info about the epoch
        logger.log_tabular('Epoch', epoch)  # epoch index
        logger.log_tabular('EpRet', with_min_and_max=True)  # min/max/mean episode return
        logger.log_tabular('EpLen', average_only=True)  # average episode length
        logger.log_tabular('VVals', with_min_and_max=True)  # min/max/mean value estimates
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)  # total steps so far
        logger.log_tabular('LossPi', average_only=True)  # policy loss before the update
        logger.log_tabular('LossV', average_only=True)  # value loss before the update
        logger.log_tabular('DeltaLossPi', average_only=True)  # policy loss after update minus before
        logger.log_tabular('DeltaLossV', average_only=True)  # value loss after update minus before
        logger.log_tabular('Entropy', average_only=True)  # policy entropy
        logger.log_tabular('KL', average_only=True)  # approximate KL divergence
        logger.log_tabular('ClipFrac', average_only=True)  # fraction of clipped ratios
        logger.log_tabular('StopIter', average_only=True)  # policy iterations taken before early stop
        logger.log_tabular('Time', time.time() - start_time)  # elapsed wall-clock time
        logger.dump_tabular()


# if __name__ == '__main__':
#     import argparse
#     parser = argparse.ArgumentParser()
#     parser.add_argument('--env', type=str, default='HalfCheetah-v2')
#     parser.add_argument('--hid', type=int, default=64)
#     parser.add_argument('--l', type=int, default=2)
#     parser.add_argument('--gamma', type=float, default=0.99)
#     parser.add_argument('--seed', '-s', type=int, default=0)
#     parser.add_argument('--cpu', type=int, default=1)
#     parser.add_argument('--steps', type=int, default=4000)
#     parser.add_argument('--epochs', type=int, default=50)
#     parser.add_argument('--exp_name', type=str, default='ppo')
#     args = parser.parse_args()
#
#     mpi_fork(args.cpu)  # run parallel code with mpi
#
#     from spinup.utils.run_utils import setup_logger_kwargs
#     logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed)
#
#     ppo(lambda: gym.make(args.env), actor_critic=core.MLPActorCritic,
#         ac_kwargs=dict(hidden_sizes=[args.hid]*args.l), gamma=args.gamma,
#         seed=args.seed, steps_per_epoch=args.steps, epochs=args.epochs,
#         logger_kwargs=logger_kwargs)
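# --- Illustrative sketch: what buf.finish_path() computes ---
# PPOBuffer is defined elsewhere in this repo; assuming it follows the standard
# Spinning Up scheme, finish_path() turns a trajectory's rewards and value
# estimates into GAE-lambda advantages and rewards-to-go return targets:
import numpy as np
import scipy.signal

def discount_cumsum_demo(x, discount):
    """y[t] = x[t] + discount * x[t+1] + discount**2 * x[t+2] + ..."""
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]

def finish_path_demo(rews, vals, last_val, gamma=0.99, lam=0.95):
    rews = np.append(rews, last_val)  # bootstrap value for cut-off trajectories
    vals = np.append(vals, last_val)
    deltas = rews[:-1] + gamma * vals[1:] - vals[:-1]    # TD residuals
    adv = discount_cumsum_demo(deltas, gamma * lam)      # GAE-lambda advantages
    ret = discount_cumsum_demo(rews, gamma)[:-1]         # rewards-to-go targets
    return adv, ret

adv_demo, ret_demo = finish_path_demo(rews=[1.0, 0.0, 1.0], vals=[0.5, 0.4, 0.6], last_val=0.0)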
def dqn(env_fn, actor_critic=ActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e5), batch_size=100, gamma=0.99, q_lr=1e-4, start_steps=10000, update_after=1000, update_targ_every=50, num_test_episodes=10, max_ep_len=1000, epsilon=0.01, epsilon_decay=0.99995, logger_kwargs=dict(), writer_kwargs=dict(), save_freq=1): """ DQN (Deep Q-Networks). Reproduce the original paper from Minh et al. """ # Instantiate env env = env_fn() test_env = env_fn() # TODO: might have to assert discrete, or otherwise take only first index of shape or so obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Set up actor (pi) & critic (Q), and data buffer ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) q_targ = copy.deepcopy(ac.q) for p in q_targ.parameters(): p.requires_grad = False q_optimizer = torch.optim.Adam(ac.q.parameters(), lr=q_lr) replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Set RNG seeds torch.manual_seed(seed) np.random.seed(seed) env.seed(seed) env.action_space.seed(seed) # Set up logging logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) logger.setup_pytorch_saver(ac) writer = SummaryWriter(**writer_kwargs) start_time = time.time() total_steps = epochs * steps_per_epoch o = env.reset() op = preprocess_obs(o) # "op" = "observation_preprocessed" ep_return = 0 # episode return, counter ep_length = 0 # episode length, counter for step in range(total_steps): # Take an env step, then store data in replay buffer if step > start_steps: ac.pi.epsilon = max(epsilon, epsilon_decay**step) a = ac.act(torch.as_tensor(op, dtype=torch.float32)) else: a = env.action_space.sample() o2, r, d, _ = env.step(a) o2p = preprocess_obs(o2) replay_buffer.store(op, a, r, o2p, d) # TODO: does DQN paper say to do 1 GD update with mean of minibatch, or many 1-data-point updates? # Sample a random batch from replay buffer and perform one GD step q_optimizer.zero_grad() batch_data = replay_buffer.sample_batch(batch_size) loss_q = compute_loss_q(batch_data, ac, q_targ, gamma) loss_q.backward() q_optimizer.step() # Update target network every so often if (step % update_targ_every == 0) and (step >= update_after): q_targ = copy.deepcopy(ac.q) for p in q_targ.parameters(): p.requires_grad = False # Keep track of episode return and length (for logging purposes) ep_return += r ep_length += 1 # If episode done, reset env if d: o = env.reset() op = preprocess_obs(o) logger.store(EpRet=ep_return, EpLen=ep_length) ep_return = 0 ep_length = 0 else: op = o2p # TODO: confirm: no need for test set if test agent & env are same as training agent & env (e.g. 
# would need a test set if the algo added noise during training but not at test time)

        # If epoch end, then run test episodes to measure average return so far
        if step % steps_per_epoch == steps_per_epoch - 1:
            for ep_i in range(num_test_episodes):
                # turn off epsilon exploration:
                old_epsilon = ac.pi.epsilon
                ac.pi.epsilon = 0
                test_ep_return, test_ep_length = run_test_episode(test_env, ac)
                logger.store(TestEpRet=test_ep_return, TestEpLen=test_ep_length)
                # turn it back on
                ac.pi.epsilon = old_epsilon

        # If epoch end, save models and show logged data
        if step % steps_per_epoch == steps_per_epoch - 1:
            epoch_i = int(step // steps_per_epoch)
            writer.add_scalar("EpRet_mean", logger.get_stats("EpRet")[0], epoch_i)  # first item in `get_stats` is mean
            writer.add_scalar("EpRet_std", logger.get_stats("EpRet")[1], epoch_i)  # 2nd item in `get_stats` is std
            writer.add_scalar("TestEpRet_mean", logger.get_stats("TestEpRet")[0], epoch_i)
            writer.add_scalar("TestEpRet_std", logger.get_stats("TestEpRet")[1], epoch_i)
            writer.add_scalar("epsilon", ac.pi.epsilon, epoch_i)
            logger.save_state({'env': env}, None)  # saves both ac and env
            logger.log_tabular("Epoch", epoch_i)
            logger.log_tabular("EpRet", with_min_and_max=True)
            logger.log_tabular("EpLen", average_only=True)
            logger.log_tabular("TestEpRet", with_min_and_max=True)
            logger.log_tabular("TestEpLen", average_only=True)
            logger.log_tabular("TimeFromStart", time.time() - start_time)
            logger.dump_tabular()

    # Save model at end
    logger.save_state({'env': env}, None)
    writer.close()
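# --- Illustrative sketch: a compute_loss_q consistent with the call above ---
# The real helper is defined elsewhere in this repo; this sketch only assumes
# the batch keys used throughout this file and a Q-network ac.q that maps a
# batch of observations to per-action values (standard DQN TD target):
import torch

def compute_loss_q_demo(data, ac, q_targ, gamma):
    o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done']
    # Q(s, a) for the actions actually taken
    q = ac.q(o).gather(1, a.long().unsqueeze(-1)).squeeze(-1)
    with torch.no_grad():
        # Bootstrap with the max over the frozen target network
        q_next = q_targ(o2).max(dim=1).values
        backup = r + gamma * (1 - d) * q_next
    return ((q - backup) ** 2).mean()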
def sac(env_fn, actor_critic=CNNActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100, replay_size=int(1e4), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=512, start_steps=10000, update_after=100, update_every=1, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, device='cuda'): """ Soft Actor-Critic (SAC) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, a ``q1`` module, and a ``q2`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q1`` and ``q2`` should accept a batch of observations and a batch of actions as inputs. When called, ``act``, ``q1``, and ``q2`` should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``q1`` (batch,) | Tensor containing one current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) ``q2`` (batch,) | Tensor containing the other current | estimate of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== Calling ``pi`` should return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``logp_pi`` (batch,) | Tensor containing log probabilities of | actions in ``a``. Importantly: gradients | should be able to flow back into ``a``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. 
max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tb_writer = SummaryWriter(log_dir=logger_kwargs['output_dir']) torch.manual_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() env = PyTorchWrapper( DictToBoxWrapper( DictTransposeImage( CurriculumWrapper(env, epochs, steps_per_epoch, tb_writer=tb_writer))), device) test_env = PyTorchWrapper(DictToBoxWrapper(DictTransposeImage(env_fn())), device) obs_dim = env.observation_space.shape act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Create actor-critic module and target networks ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) ac_targ = deepcopy(ac) ac.to(device) ac_targ.to(device) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters()) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size, device=device) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple( core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2]) logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts) # Set up function for computing SAC Q-losses def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] q1 = ac.q1(o, a) q2 = ac.q2(o, a) # Bellman backup for Q functions with torch.no_grad(): # Target actions come from *current* policy a2, logp_a2 = ac.pi(o2) # Target Q-values q1_pi_targ = ac_targ.q1(o2, a2) q2_pi_targ = ac_targ.q2(o2, a2) q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) backup = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2) # MSE loss against Bellman backup loss_q1 = ((q1 - backup)**2).mean() loss_q2 = ((q2 - backup)**2).mean() loss_q = loss_q1 + loss_q2 # Useful info for logging q_info = dict(Q1Vals=q1.detach().cpu().numpy(), Q2Vals=q2.detach().cpu().numpy()) return loss_q, q_info # Set up function for computing SAC pi loss def compute_loss_pi(data): o = data['obs'] pi, logp_pi = ac.pi(o) q1_pi = ac.q1(o, pi) q2_pi = ac.q2(o, pi) q_pi = torch.min(q1_pi, q2_pi) # Entropy-regularized policy loss loss_pi = (alpha * logp_pi - q_pi).mean() # Useful info for logging pi_info = dict(LogPi=logp_pi.detach().cpu().numpy()) return loss_pi, pi_info # Set up optimizers for policy and q-function pi_optimizer = Adam(ac.pi.parameters(), lr=lr) q_optimizer = Adam(q_params, lr=lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(data): # First run one gradient descent step for Q1 and Q2 q_optimizer.zero_grad() loss_q, q_info = compute_loss_q(data) loss_q.backward() q_optimizer.step() # Record things logger.store(LossQ=loss_q.item(), **q_info) # Freeze Q-networks so you don't waste computational effort # computing gradients for them during the policy learning step. for p in q_params: p.requires_grad = False # Next run one gradient descent step for pi. 
pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) loss_pi.backward() pi_optimizer.step() # Unfreeze Q-networks so you can optimize it at next DDPG step. for p in q_params: p.requires_grad = True # Record things logger.store(LossPi=loss_pi.item(), **pi_info) # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) def get_action(o, deterministic=False): return ac.act(torch.as_tensor(o, dtype=torch.float32, device=device), deterministic) def test_agent(): eval_episode_returns = [] for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 eval_episode_returns.append(ep_ret) logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) tb_writer.add_scalar("eval_eprewmean_updates", np.mean(eval_episode_returns), epoch) tb_writer.add_scalar("eval_eprewmean_steps", np.mean(eval_episode_returns), (epoch + 1) * steps_per_epoch) tb_writer.add_scalar( "eval_success_rate", np.mean(np.array(eval_episode_returns) > 0).astype(np.float), (epoch + 1) * steps_per_epoch) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy. if t > start_steps: a = get_action(o) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Update handling if t >= update_after and t % update_every == 0: for j in range(update_every): batch = replay_buffer.sample_batch(batch_size) update(data=batch) # End of epoch handling if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. 
test_agent() # Tensorboard log try: tb_writer.add_scalar("eprewmean_updates", logger.get_stats('EpRet')[0], epoch) tb_writer.add_scalar("eprewmean_steps", logger.get_stats('EpRet')[0], (epoch + 1) * steps_per_epoch) except IndexError: pass tb_writer.flush() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
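# --- Illustrative sketch: the squashed-Gaussian sampling a SAC ``pi`` module
# typically performs. CNNActorCritic is defined elsewhere; this sketch assumes
# it follows the usual Spinning Up construction (tanh squashing plus the
# change-of-variables correction, so gradients can flow back into the action):
import math
import torch
import torch.nn.functional as F
from torch.distributions import Normal

def squashed_gaussian_sample_demo(mu, log_std, act_limit):
    dist = Normal(mu, torch.exp(log_std))
    u = dist.rsample()                  # reparameterized sample: grads flow into a
    logp = dist.log_prob(u).sum(-1)
    # Numerically stable form of -log(1 - tanh(u)^2), summed over action dims
    logp = logp - (2 * (math.log(2) - u - F.softplus(-2 * u))).sum(-1)
    a = act_limit * torch.tanh(u)       # squash into [-act_limit, act_limit]
    return a, logp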
def ppo(env_fn, actor_critic=ImgStateActorCriticDictBox, ac_kwargs=dict(), seed=0, steps_per_epoch=4096, epochs=2441, gamma=0.99, clip_ratio=0.1, pi_lr=2.5e-4, vf_lr=2.5e-3, train_pi_iters=16, train_v_iters=16, lam=0.95, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=300, eval_interval=20, num_eval_episodes=32, device='cuda', linear_lr_decay=True, linear_clip_decay=True): """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) 
train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ # Special function to avoid certain slowdowns from PyTorch + MPI combo. setup_pytorch_for_mpi() # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tb_writer = SummaryWriter(log_dir=logger_kwargs['output_dir']) # Random seed seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) initial_clip_ratio = clip_ratio local_steps_per_epoch = int(steps_per_epoch / num_procs()) # Instantiate environment env = env_fn() env = PyTorchWrapper( DictToBoxWrapper( DictTransposeImage( CurriculumWrapper(env, epochs, local_steps_per_epoch, tb_writer=tb_writer))), device) obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Create actor-critic module ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) ac.to(device) # Sync params across processes sync_params(ac) # Count variables var_counts = tuple( core.count_vars(module) for module in [ac.cnn, ac.pi_, ac.v_]) logger.log('\nNumber of parameters: \t cnn: %d, \t pi: %d, \t v: %d\n' % var_counts) # Set up experience buffer buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam, device) # Set up function for computing PPO policy loss def compute_loss_pi(data): obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data[ 'logp'] # Policy loss pi, logp = ac.pi(obs, act) ratio = torch.exp(logp - logp_old) if linear_clip_decay: clip_ratio = initial_clip_ratio * (1 - epoch / epochs) print(clip_ratio) clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv loss_pi = -(torch.min(ratio * adv, clip_adv)).mean() # Useful extra info approx_kl = (logp_old - logp).mean().item() ent = pi.entropy().mean().item() clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio) clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item() pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac) return loss_pi, pi_info # Set up function for computing value loss def compute_loss_v(data): obs, ret = data['obs'], data['ret'] return ((ac.v(obs) - ret)**2).mean() # Set up optimizers for policy and value function pi_optimizer = Adam(list(ac.cnn.parameters()) + list(ac.pi_.parameters()), lr=pi_lr) vf_optimizer = Adam(list(ac.cnn.parameters()) + list(ac.v_.parameters()), lr=vf_lr) lr_decay = lambda e: 1 - e / epochs pi_lr_scheduler = LambdaLR(pi_optimizer, lr_decay) vf_lr_scheduler = LambdaLR(vf_optimizer, lr_decay) # Set up model saving logger.setup_pytorch_saver(ac) def update(): data = buf.get() pi_l_old, pi_info_old = compute_loss_pi(data) pi_l_old = pi_l_old.item() v_l_old = compute_loss_v(data).item() # Train policy with multiple steps of gradient descent for i in range(train_pi_iters): pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) kl = mpi_avg(pi_info['kl']) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break loss_pi.backward() mpi_avg_grads(ac.cnn) # average grads across MPI processes mpi_avg_grads(ac.pi_) # average grads across MPI processes pi_optimizer.step() logger.store(StopIter=i) # Value function learning for i in range(train_v_iters): vf_optimizer.zero_grad() loss_v = compute_loss_v(data) loss_v.backward() mpi_avg_grads(ac.cnn) # average grads across MPI processes mpi_avg_grads(ac.v_) # average grads across MPI processes vf_optimizer.step() if linear_lr_decay: pi_lr_scheduler.step() vf_lr_scheduler.step() # Log changes from update kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf'] logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old)) def eval_policy(): eval_env = PyTorchWrapper( DictToBoxWrapper(DictTransposeImage(env_fn())), device) eval_episode_returns = [] ob = eval_env.reset() eval_ep_ret = 0 while len(eval_episode_returns) < num_eval_episodes: action = ac.act(ob, deterministic=True) ob, rew, done, _ = eval_env.step(action) eval_ep_ret += rew.cpu().numpy() if done: eval_episode_returns.append(eval_ep_ret) ob = eval_env.reset() eval_ep_ret = 0 eval_env.close() tb_writer.add_scalar("eval_eprewmean_updates", np.mean(eval_episode_returns), epoch) tb_writer.add_scalar("eval_eprewmean_steps", np.mean(eval_episode_returns), (epoch + 1) * steps_per_epoch) tb_writer.add_scalar( "eval_success_rate", np.mean(np.array(eval_episode_returns) > 0).astype(np.float), (epoch + 1) * steps_per_epoch) # Prepare for interaction with environment start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 actions = [] # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v, logp = ac.step( torch.as_tensor(o, dtype=torch.float32, device=device)) next_o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, v, logp) logger.store(VVals=v) # Update obs (critical!) o = next_o timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t == local_steps_per_epoch - 1 if terminal or epoch_ended: if epoch_ended and not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: _, v, _ = ac.step( torch.as_tensor(o, dtype=torch.float32, device=device)) else: v = 0 buf.finish_path(v) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! 
update() if epoch % eval_interval == 0 or (epoch == epochs - 1): if epoch == epochs - 1: num_eval_episodes = 100 eval_policy() # Tensorboard log try: tb_writer.add_scalar("eprewmean_updates", logger.get_stats('EpRet')[0], epoch) tb_writer.add_scalar("eprewmean_steps", logger.get_stats('EpRet')[0], (epoch + 1) * steps_per_epoch) except IndexError: pass tb_writer.flush() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
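# --- Illustrative sketch: the linear decay schedules used above, in isolation ---
# LambdaLR scales the base learning rate by lr_decay(epoch) each time step()
# is called; clip_ratio shrinks the trust region analogously. Toy numbers only:
import torch
from torch.optim.lr_scheduler import LambdaLR

epochs = 10
param = torch.nn.Parameter(torch.zeros(1))
opt = torch.optim.Adam([param], lr=2.5e-4)
sched = LambdaLR(opt, lambda e: 1 - e / epochs)   # lr decays linearly toward 0

initial_clip_ratio = 0.1
for epoch in range(epochs):
    clip_ratio = initial_clip_ratio * (1 - epoch / epochs)  # linear clip decay
    opt.step()   # stand-in for the PPO gradient steps
    sched.step()
    print(epoch, opt.param_groups[0]['lr'], clip_ratio)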
def sac(env_fn, env_name, test_env_fns=[], actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, update_after=1000, update_every=50, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, load_dir=None, num_procs=1, clean_every=200): """ Soft Actor-Critic (SAC) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, a ``q1`` module, and a ``q2`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q1`` and ``q2`` should accept a batch of observations and a batch of actions as inputs. When called, ``act``, ``q1``, and ``q2`` should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``q1`` (batch,) | Tensor containing one current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) ``q2`` (batch,) | Tensor containing the other current | estimate of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== Calling ``pi`` should return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``logp_pi`` (batch,) | Tensor containing log probabilities of | actions in ``a``. Importantly: gradients | should be able to flow back into ``a``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. 
num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ from spinup.examples.pytorch.eval_sac import load_pytorch_policy print(f"SAC proc_id {proc_id()}") logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) if proc_id() == 0: writer = SummaryWriter(log_dir=os.path.join( logger.output_dir, str(datetime.datetime.now())), comment=logger_kwargs["exp_name"]) torch.manual_seed(seed) np.random.seed(seed) env = SubprocVecEnv([partial(env_fn, rank=i) for i in range(num_procs)], "spawn") test_env = SubprocVecEnv(test_env_fns, "spawn") obs_dim = env.observation_space.shape act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Create actor-critic module and target networks if load_dir is not None: _, ac = load_pytorch_policy(load_dir, itr="", deterministic=False) else: ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) ac_targ = deepcopy(ac) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters()) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple( core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2]) logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts) # Set up function for computing SAC Q-losses def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] q1 = ac.q1(o, a) q2 = ac.q2(o, a) # Bellman backup for Q functions with torch.no_grad(): # Target actions come from *current* policy a2, logp_a2 = ac.pi(o2) # Target Q-values q1_pi_targ = ac_targ.q1(o2, a2) q2_pi_targ = ac_targ.q2(o2, a2) q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) backup = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2) # MSE loss against Bellman backup loss_q1 = ((q1 - backup)**2).mean() loss_q2 = ((q2 - backup)**2).mean() loss_q = loss_q1 + loss_q2 # Useful info for logging q_info = dict(Q1Vals=q1.detach().numpy(), Q2Vals=q2.detach().numpy()) return loss_q, q_info # Set up function for computing TD feats-losses def compute_loss_feats(data): o, a, r, o2, d, feats = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'], data["feats"] feats = torch.stack(list(feats.values())).T # (nbatch, nfeats) feats1 = ac.q1.predict_feats(o, a) feats2 = ac.q2.predict_feats(o, a) feats_keys = replay_buffer.feats_keys # Bellman backup for feature functions with torch.no_grad(): a2, _ = ac.pi(o2) # Target feature values feats1_targ = ac_targ.q1.predict_feats(o2, a2) feats2_targ = ac_targ.q2.predict_feats(o2, a2) feats_targ = torch.min(feats1_targ, feats2_targ) backup = feats + gamma * (1 - d[:, None]) * feats_targ # MSE loss against Bellman backup loss_feats1 = ((feats1 - backup)**2).mean(axis=0) loss_feats2 = ((feats2 - backup)**2).mean(axis=0) loss_feats = loss_feats1 + loss_feats2 # Useful info for logging feats_info = dict(Feats1Vals=feats1.detach().numpy(), 
                          Feats2Vals=feats2.detach().numpy())
        return loss_feats, feats_info

    # Set up function for computing SAC pi loss
    def compute_loss_pi(data):
        o = data['obs']
        pi, logp_pi = ac.pi(o)
        q1_pi = ac.q1(o, pi)
        q2_pi = ac.q2(o, pi)
        q_pi = torch.min(q1_pi, q2_pi)

        # Entropy-regularized policy loss
        loss_pi = (alpha * logp_pi - q_pi).mean()

        # Useful info for logging
        pi_info = dict(LogPi=logp_pi.detach().numpy())

        return loss_pi, pi_info

    # Set up optimizers for policy and q-function
    pi_optimizer = Adam(ac.pi.parameters(), lr=lr)
    q_optimizer = Adam(q_params, lr=lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update(data, feats_keys):
        # First run one gradient descent step for Q1 and Q2
        q_optimizer.zero_grad()
        loss_q, q_info = compute_loss_q(data)
        loss_q.backward()
        # The feature TD loss is computed here for logging only;
        # no backward pass is taken on it.
        loss_feats, feats_info = compute_loss_feats(data)
        q_optimizer.step()

        # Record things
        logger.store(LossQ=loss_q.item(), **q_info)

        # Feature loss
        keys = [f"LossFeats_{key}" for key in feats_keys]
        for key, val in zip(keys, loss_feats):
            logger.store(**{key: val.item()})

        # Freeze Q-networks so you don't waste computational effort
        # computing gradients for them during the policy learning step.
        for p in q_params:
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        pi_optimizer.zero_grad()
        loss_pi, pi_info = compute_loss_pi(data)
        loss_pi.backward()
        pi_optimizer.step()

        # Unfreeze Q-networks so you can optimize them at the next step.
        for p in q_params:
            p.requires_grad = True

        # Record things
        logger.store(LossPi=loss_pi.item(), **pi_info)

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                # NB: We use in-place operations "mul_", "add_" to update target
                # params, as opposed to "mul" and "add", which would make new tensors.
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)

    def get_action(o, deterministic=False):
        return ac.act(torch.as_tensor(o, dtype=torch.float32), deterministic)

    def test_agent(feats_keys):
        num_envs = len(test_env_fns)
        env_ep_rets = np.zeros(num_envs)
        for j in range(num_test_episodes):
            o, d = test_env.reset(), np.zeros(num_envs, dtype=bool)
            ep_len = np.zeros(num_envs)
            while not (np.all(d) or np.all(ep_len == max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, info = test_env.step(get_action(o, True))
                env_ep_rets += r
                ep_len += 1
        # logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)
        for ti in range(num_envs):
            logger.store(**{f"TestEpRet_{ti}": env_ep_rets[ti] / num_test_episodes})

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), np.zeros(num_procs), np.zeros(num_procs)

    # Main loop: collect experience in env and update/log each epoch
    epoch = 0
    update_times, clean_times = 0, 0
    t = 0
    while t <= total_steps:
        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards,
        # use the learned policy.
if t > start_steps: a = get_action(o) else: a = np.stack([env.action_space.sample() for _ in range(num_procs)]) # Step the env o2, r, d, info = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) if np.all(ep_len == max_ep_len): d.fill(False) # Store experience to replay buffer replay_buffer.store_vec(o, a, r, o2, d, [inf["features"] for inf in info]) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of trajectory handling, assumes all subenvs end at the same time if np.all(d) or np.all(ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) if clean_every > 0 and epoch // clean_every >= clean_times: env.close() test_env.close() env = SubprocVecEnv( [partial(env_fn, rank=i) for i in range(num_procs)], "spawn") test_env = SubprocVecEnv(test_env_fns, "spawn") clean_times += 1 o, ep_ret, ep_len = env.reset(), np.zeros(num_procs), np.zeros( num_procs) # Update handling if t >= update_after and t / update_every > update_times: for j in range(update_every): batch = replay_buffer.sample_batch(batch_size) update(data=batch, feats_keys=replay_buffer.feats_keys) update_times += 1 # End of epoch handling if t // steps_per_epoch > epoch: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): # try: logger.save_state({'env_name': env_name}, None) # logger.save_state({'env': env}, None) #except: #logger.save_state({'env_name': env_name}, None) # Test the performance of the deterministic version of the agent. test_agent(replay_buffer.feats_keys) # Update tensorboard if proc_id() == 0: log_perf_board = ['EpRet', 'EpLen', 'Q1Vals', 'Q2Vals'] + [ f"TestEpRet_{ti}" for ti in range(len(test_env_fns)) ] log_loss_board = ['LogPi', 'LossPi', 'LossQ'] + [ key for key in logger.epoch_dict.keys() if "LossFeats" in key ] log_board = { 'Performance': log_perf_board, 'Loss': log_loss_board } for key, value in log_board.items(): for val in value: mean, std = logger.get_stats(val) if key == 'Performance': writer.add_scalar(key + '/Average' + val, mean, epoch) writer.add_scalar(key + '/Std' + val, std, epoch) else: writer.add_scalar(key + '/' + val, mean, epoch) # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() if proc_id() == 0: writer.flush() import psutil # gives a single float value cpu_percent = psutil.cpu_percent() # gives an object with many fields mem_percent = psutil.virtual_memory().percent print(f"Used cpu avg {cpu_percent}% memory {mem_percent}%") cpu_separate = psutil.cpu_percent(percpu=True) for ci, cval in enumerate(cpu_separate): print(f"\t cpu {ci}: {cval}%") # buf_size = replay_buffer.get_size() # print(f"Replay buffer size: {buf_size//1e6}MB {buf_size // 1e3} KB {buf_size % 1e3} B") t += num_procs if proc_id() == 0: writer.close()
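# --- Illustrative sketch: a store_vec consistent with the vectorized rollout above ---
# The actual ReplayBuffer (which also stores per-transition features) is defined
# elsewhere; this shows the core idea of writing one transition per parallel env
# in a single call, with modular wrap-around of the insertion pointer:
import numpy as np

class VecReplayBufferDemo:
    def __init__(self, obs_dim, act_dim, size):
        self.obs = np.zeros((size, *obs_dim), dtype=np.float32)
        self.obs2 = np.zeros((size, *obs_dim), dtype=np.float32)
        self.act = np.zeros((size, act_dim), dtype=np.float32)
        self.rew = np.zeros(size, dtype=np.float32)
        self.done = np.zeros(size, dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store_vec(self, o, a, r, o2, d):
        n = len(r)  # one transition per parallel env
        idx = (self.ptr + np.arange(n)) % self.max_size
        self.obs[idx], self.obs2[idx] = o, o2
        self.act[idx], self.rew[idx], self.done[idx] = a, r, d
        self.ptr = (self.ptr + n) % self.max_size
        self.size = min(self.size + n, self.max_size)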