def ddpg(env_name, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, act_noise=0.1, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, test=False): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Deterministically computes actions | from policy given states. ``q`` (batch,) | Gives the current estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q_pi`` (batch,) | Gives the composition of ``q`` and | ``pi`` for states in ``x_ph``: | q(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to DDPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = gym.make(env_name), gym.make(env_name) obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) # Main outputs from computation graph with tf.variable_scope('main'): pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs) # Target networks with tf.variable_scope('target'): # Note that the action placeholder going to actor_critic here is #irrelevant, because we only need q_targ(s, pi_targ(s)). 
pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main']) print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' % var_counts) # Bellman backup for Q function backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ) # DDPG losses pi_loss = -tf.reduce_mean(q_pi) q_loss = tf.reduce_mean((q - backup)**2) # Separate train ops for pi, q pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q')) # Polyak averaging for target variables target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph }, outputs={ 'pi': pi, 'q': q }) saver = tf.train.Saver() save_path = './saved_model/' + env_name + '/test' def get_action(o, noise_scale): a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0] a += noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) def test_agent(n=10): for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, _ = test_env.step(get_action(o, 0)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) def save(saver, sess): if not os.path.exists('./saved_model/' + env_name): os.mkdir('./saved_model/' + env_name) ckpt_path = saver.save(sess, save_path) #print('Save ckpt file: {}'.format(ckpt_path)) def load(saver, sess): if os.path.exists('./saved_model/' + env_name): saver.restore(sess, save_path) print('Load model complete.') else: print('There is no saved model.') if test is False: start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy (with some noise, via act_noise). """ if t > start_steps: a = get_action(o, act_noise) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all DDPG updates at the end of the trajectory, in accordance with tuning done by TD3 paper authors. 
""" for _ in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } # Q-learning update outs = sess.run([q_loss, q, train_q_op], feed_dict) logger.store(LossQ=outs[0], QVals=outs[1]) # Policy update outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict) logger.store(LossPi=outs[0]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): #logger.save_state({'env': env}, None) save(saver, sess) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('QVals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() #save(saver, sess) else: load(saver, sess) test_logger = EpochLogger() o, r, d, ep_ret, ep_len, n = env.reset(), 0, False, 0, 0, 0 num_episodes = 100 render = True max_ep_len = 0 while n < num_episodes: if render: env.render() time.sleep(1e-3) a = get_action(o, 0) o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 if d or (ep_len == max_ep_len): test_logger.store(EpRet=ep_ret, EpLen=ep_len) print('Episode %d \t EpRet %.3f \t EpLen %d' % (n, ep_ret, ep_len)) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 n += 1 test_logger.log_tabular('EpRet', with_min_and_max=True) test_logger.log_tabular('EpLen', average_only=True) test_logger.dump_tabular()
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, gru_units=256, trials_per_epoch=100, episodes_per_trial=2, n = 100, epochs=100, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=1000, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph\ raw_input_ph = tf.placeholder(dtype=tf.float32, shape=obs_dim, name='raw_input_ph') rescale_image_op = tf.image.resize_images(raw_input_ph, [30, 40]) max_seq_len_ph = tf.placeholder(dtype=tf.int32, shape=(), name='max_seq_len_ph') seq_len_ph = tf.placeholder(dtype=tf.int32, shape=(None,)) # Because we pad zeros at the end of every sequence of length less than max length, we need to mask these zeros out # when computing loss seq_len_mask_ph = tf.placeholder(dtype=tf.int32, shape=(trials_per_epoch, episodes_per_trial * max_ep_len)) # rescaled_image_ph This is a ph because we want to be able to pass in value to this node manually rescaled_image_in_ph = tf.placeholder(dtype=tf.float32, shape=[None, 30, 40, 3], name='rescaled_image_in_ph') a_ph = core.placeholders_from_spaces( env.action_space)[0] conv1 = slim.conv2d(activation_fn=tf.nn.relu, inputs=rescaled_image_in_ph, num_outputs=16, kernel_size=[5,5], stride=2) image_out = slim.flatten(slim.conv2d(activation_fn=tf.nn.relu, inputs=conv1, num_outputs=16, kernel_size=[5,5], stride=2)) rew_ph, adv_ph, ret_ph, logp_old_ph = core.placeholders(1, None, None, None) rnn_state_ph = tf.placeholder(tf.float32, [None, gru_units], name='pi_rnn_state_ph') # Main outputs from computation graph action_encoder_matrix = np.load(r'encoder.npy') pi, logp, logp_pi, v, rnn_state, logits, seq_len_vec, tmp_vec = actor_critic( image_out, a_ph, rew_ph, rnn_state_ph, gru_units, max_seq_len_ph, action_encoder_matrix, seq_len=seq_len_ph, action_space=env.action_space) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [rescaled_image_in_ph, a_ph, adv_ph, ret_ph, logp_old_ph, rew_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi, rnn_state, logits] # Experience buffer buffer_size = trials_per_epoch * episodes_per_trial * max_ep_len buf = PPOBuffer(rescaled_image_in_ph.get_shape().as_list()[1:], act_dim, buffer_size, trials_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n'%var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph>0, (1+clip_ratio)*adv_ph, (1-clip_ratio)*adv_ph) # Need to mask out the padded zeros when computing loss sequence_mask = tf.sequence_mask(seq_len_ph, episodes_per_trial*max_ep_len) # Convert bool tensor to int tensor with 1 and 0 sequence_mask = tf.where(sequence_mask, np.ones(dtype=np.float32, shape=(trials_per_epoch, episodes_per_trial*max_ep_len)), np.zeros(dtype=np.float32, shape=(trials_per_epoch, episodes_per_trial*max_ep_len))) # need to reshape because ratio is a 1-D vector (it is a concatnation of all sequence) for masking and then reshape # it back pi_loss_vec = tf.multiply(sequence_mask, tf.reshape(tf.minimum(ratio * adv_ph, min_adv), tf.shape(sequence_mask))) pi_loss = -tf.reduce_mean(tf.reshape(pi_loss_vec, tf.shape(ratio))) aaa = (ret_ph - v)**2 v_loss_vec = tf.multiply(sequence_mask, tf.reshape((ret_ph - v)**2, tf.shape(sequence_mask))) ccc = tf.reshape(v_loss_vec, tf.shape(v)) v_loss = tf.reduce_mean(tf.reshape(v_loss_vec, tf.shape(v))) # Info (useful to 
watch during learning) approx_kl = tf.reduce_mean(logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean(-logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1+clip_ratio), ratio < (1-clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) train = MpiAdamOptimizer(learning_rate=1e-4).minimize(pi_loss + 0.01 * v_loss - 0.001 * approx_ent) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'rescaled_image_in': rescaled_image_in_ph}, outputs={'pi': pi, 'v': v}) def update(): print(f'Start updating at {datetime.now()}') inputs = {k:v for k,v in zip(all_phs, buf.get())} inputs[rnn_state_ph] = np.zeros((trials_per_epoch, gru_units), np.float32) inputs[max_seq_len_ph] = int(episodes_per_trial * max_ep_len) inputs[seq_len_ph] = buf.seq_len_buf pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) buf.reset() # Training print(f'sequence length = {sess.run(seq_len_vec, feed_dict=inputs)}') for i in range(train_pi_iters): _, kl, pi_loss_i, v_loss_i, ent = sess.run([train_pi, approx_kl, pi_loss, v_loss, approx_ent], feed_dict=inputs) print(f'i: {i}, pi_loss: {pi_loss_i}, v_loss: {v_loss_i}, entropy: {ent}') logger.store(StopIter=i) # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) print(f'Updating finished at {datetime.now()}') start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), np.zeros(1), False, 0, 0 def recenter_rgb(image, min=0.0, max=255.0): ''' :param image: :param min: :param max: :return: an image with rgb value re-centered to [-1, 1] ''' mid = (min + max) / 2.0 return np.apply_along_axis(func1d=lambda x: (x - mid) / mid, axis=2, arr=image) o_rescaled = recenter_rgb(sess.run(rescale_image_op, feed_dict={raw_input_ph: o})) # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for trial in range(trials_per_epoch): # TODO: tweek settings to match the paper # TODO: find a way to generate mazes last_a = np.array(0) last_r = np.array(r) last_rnn_state = np.zeros((1, gru_units), np.float32) step_counter = 0 for episode in range(episodes_per_trial): o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 o_rescaled = recenter_rgb(sess.run(rescale_image_op, feed_dict={raw_input_ph: o})) action_dict = defaultdict(int) # dirty hard coding to make it print in order action_dict[0] = 0 action_dict[1] = 0 action_dict[2] = 0 for step in range(max_ep_len): a, v_t, logp_t, rnn_state_t, logits_t = sess.run( get_action_ops, feed_dict={ rescaled_image_in_ph: np.expand_dims(o_rescaled, 0), a_ph: last_a.reshape(-1,), rew_ph: last_r.reshape(-1,1), rnn_state_ph: last_rnn_state, # v_rnn_state_ph: last_v_rnn_state, max_seq_len_ph: 1, seq_len_ph: [1]}) action_dict[a[0]] += 1 # save and log buf.store(o_rescaled, a, r, v_t, logp_t) logger.store(VVals=v_t) o, r, d, _ = env.step(a[0]) step_counter += 1 o_rescaled = recenter_rgb(sess.run(rescale_image_op, feed_dict={raw_input_ph: o})) ep_ret += r ep_len += 1 last_a = a[0] last_r = np.array(r) last_rnn_state = 
rnn_state_t terminal = d or (ep_len == max_ep_len) if terminal or (step==n-1): if not(terminal): print('Warning: trajectory cut off by epoch at %d steps.'%ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run(v, feed_dict={rescaled_image_in_ph: np.expand_dims(o_rescaled, 0), a_ph: last_a.reshape(-1,), rew_ph: last_r.reshape(-1,1), rnn_state_ph: last_rnn_state, max_seq_len_ph: 1, seq_len_ph: [1]}) buf.finish_path(last_val) logger.store(EpRet=ep_ret, EpLen=ep_len) print(f'episode terminated with {step} steps. epoch:{epoch} trial:{trial} episode:{episode}') break print(action_dict) if step_counter < episodes_per_trial * max_ep_len: buf.pad_zeros(episodes_per_trial * max_ep_len - step_counter) buf.seq_len_buf[trial] = step_counter # pad zeros to sequence buffer after each trial # Save model if (epoch % save_freq == 0) or (epoch == epochs-1): logger.save_state({'env': env}, None) # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch+1)*trials_per_epoch*episodes_per_trial*max_ep_len) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular()
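
# Rough NumPy sketch of the masking idea used above with tf.sequence_mask: losses
# from padded timesteps are zeroed out before averaging. Unlike the graph code,
# this variant normalizes by the number of valid steps; shapes and lengths here
# are illustrative assumptions.
import numpy as np

def masked_mean(per_step_loss, seq_lens, max_len):
    """Average a (trials, max_len) loss array over the unpadded timesteps only."""
    mask = np.arange(max_len)[None, :] < np.asarray(seq_lens)[:, None]
    return (per_step_loss * mask).sum() / mask.sum()

# Example usage (made-up numbers):
#   masked_mean(np.ones((2, 5)), seq_lens=[3, 5], max_len=5)  # -> 1.0 over 8 valid steps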
def sac(env_fn, seed=0, gamma=.99, lam=.97, hidden_sizes=(200, 100), alpha=.0, v_lr=1e-3, q_lr=1e-3, pi_lr=1e-3, polyak=1e-2, epochs=50, steps_per_epoch=1000, batch_size=100, start_steps=1000, logger_kwargs=dict(), replay_size=int(1e6), max_ep_len=1000, save_freq=1): logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() env = env_fn() # Dimensions obs_dim = env.observation_space.shape[0] act_dim = env.action_space.n # act_limit = env.action_space.high[0] # Placeholders x_ph = tf.placeholder(shape=[None, obs_dim], dtype=tf.float32) a_ph = tf.placeholder(shape=[None, 1], dtype=tf.float32) x2_ph = tf.placeholder(shape=[None, obs_dim], dtype=tf.float32) r_ph = tf.placeholder(shape=[None], dtype=tf.float32) d_ph = tf.placeholder(shape=[None], dtype=tf.float32) # Networks def mlp(x, hidden_sizes=(32, ), activation=tf.tanh, output_activation=None): for h in hidden_sizes[:-1]: x = tf.layers.dense(x, units=h, activation=activation) return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) def mlp_categorical_policy(x, a, hidden_sizes, activation, output_activation, action_space): act_dim = action_space.n logits = mlp(x, list(hidden_sizes) + [act_dim], activation, None) pi_all = tf.nn.softmax(logits) logpi_all = tf.nn.log_softmax(logits) # pi = tf.squeeze(tf.random.categorical(logits,1), axis=1) pi = tf.random.categorical(logits, 1) # a = tf.cast( a, tf.uint8) # logp = tf.reduce_sum(tf.one_hot(a, depth=act_dim) * logp_all, axis=1) # logp_pi = tf.reduce_sum(tf.one_hot( tf.squeeze( pi, axis=1), depth=act_dim) * logp_all, axis=1) return pi, pi_all, logpi_all LOG_STD_MIN = -20 LOG_STD_MAX = 2 with tf.variable_scope("main"): activation = tf.tanh with tf.variable_scope("pi"): pi, pi_all, logpi_all = mlp_categorical_policy( x_ph, a_ph, hidden_sizes, activation, None, env.action_space) print("### DEBUG @ main-discrete.py pi and others' dimensions") print(pi) print(pi_all) print(logpi_all) input() with tf.variable_scope("q1"): q1 = tf.squeeze(mlp(tf.concat([x_ph, a_ph], -1), hidden_sizes + (act_dim, ), activation, None), axis=-1) with tf.variable_scope("q1", reuse=True): q1_pi = tf.squeeze(mlp( tf.concat([x_ph, tf.cast(pi, tf.float32)], axis=-1), hidden_sizes + (act_dim, ), activation, None), axis=-1) with tf.variable_scope("q2"): q2 = tf.squeeze(mlp(tf.concat([x_ph, a_ph], -1), hidden_sizes + (act_dim, ), activation, None), axis=-1) with tf.variable_scope("q2", reuse=True): q2_pi = tf.squeeze(mlp( tf.concat([x_ph, tf.cast(pi, tf.float32)], -1), hidden_sizes + (act_dim, ), activation, None), axis=-1) with tf.variable_scope("v"): # v = mlp( x_ph, hidden_sizes+(1,), activation, None) v = tf.squeeze(mlp(x_ph, hidden_sizes + (1, ), activation, None), axis=-1) with tf.variable_scope("target"): with tf.variable_scope("v"): v_targ = tf.squeeze(mlp(x2_ph, hidden_sizes + (1, ), activation, None), axis=-1) # helpers for var count def get_vars(scope=''): return [x for x in tf.trainable_variables() if scope in x.name] def count_vars(scope=''): v = get_vars(scope) return sum([np.prod(var.shape.as_list()) for var in v]) # Count variables var_counts = tuple( count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main']) print( '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t v: %d, \t total: %d\n' % var_counts) # Targets q_backup_prestop = r_ph + gamma * (1 - d_ph) * v_targ v_backup_prestop = tf.minimum(q1_pi, q2_pi) - alpha * logp_pi q_backup, v_backup = 
tf.stop_gradient(q_backup_prestop), tf.stop_gradient( v_backup_prestop) # Q Loss q1_loss = tf.reduce_mean((q1 - q_backup)**2) q2_loss = tf.reduce_mean((q2 - q_backup)**2) q_loss = q1_loss + q2_loss # V Loss v_loss = tf.reduce_mean((v - v_backup)**2) # Pol loss pi_loss = tf.reduce_mean(-q1_pi + alpha * logp_pi) # Training ops v_trainop = tf.train.AdamOptimizer(v_lr).minimize( v_loss, var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/v")) q_trainop = tf.train.AdamOptimizer(q_lr).minimize( q_loss, var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/q")) pi_trainop = tf.train.AdamOptimizer(pi_lr).minimize( pi_loss, var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/pi")) assert polyak <= .5 # Target update op init_v_target = tf.group([ tf.assign(v_target, v_main) for v_main, v_target in zip( tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/v"), tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target/v")) ]) update_v_target = tf.group([ tf.assign(v_target, (1 - polyak) * v_target + polyak * v_main) for v_main, v_target in zip( tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/v"), tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target/v")) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(init_v_target) # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph }, outputs={ 'pi': pi, 'q1': q1, 'q2': q2, 'v': v }) def test_agent(n=10): for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 # print( o.reshape(-1, 1)) # input() while not (d or (ep_len == max_ep_len)): o, r, d, _ = test_env.step( sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0][0]) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) #Buffer init buffer = ReplayBuffer(obs_dim, 1, replay_size) # Main loop start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs for t in range(total_steps): if t > start_steps: a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0][0] else: a = env.action_space.sample() o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 d = False or (ep_len == max_ep_len) # Still needed ? 
o2 = np.squeeze(o2) buffer.store(o, a, r, o2, d) o = o2 if d or (ep_len == max_ep_len): for j in range(ep_len): batch = buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } # DEBUG: # v_backup_prestop_out = sess.run( v_backup_prestop, feed_dict=feed_dict) # print( v_backup_prestop_out.shape) # print( v_backup_prestop_out) # input() # Value gradient steps v_step_ops = [v_loss, v, v_trainop] outs = sess.run(v_step_ops, feed_dict) logger.store(LossV=outs[0], VVals=outs[1]) # Q Gradient steps q_step_ops = [q_loss, q1, q2, q_trainop] outs = sess.run(q_step_ops, feed_dict) logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2]) # Policy gradient steps # TODO Add entropy logging pi_step_ops = [pi_loss, pi_trainop, update_v_target] outs = sess.run(pi_step_ops, feed_dict=feed_dict) logger.store(LossPi=outs[0]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0., 0 if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Saving the model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
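
# Illustrative NumPy arithmetic (not wired into the graph above) for the two targets
# computed before tf.stop_gradient: the Q backup bootstraps from the target value
# network, and the V backup uses the smaller of the two Q estimates minus the
# entropy term. The alpha value and the sample numbers are hypothetical.
import numpy as np

def soft_backups(r, d, v_targ, q1_pi, q2_pi, logp_pi, gamma=0.99, alpha=0.2):
    """Return (q_backup, v_backup) in the same form as the SAC losses above."""
    q_backup = r + gamma * (1.0 - d) * v_targ
    v_backup = np.minimum(q1_pi, q2_pi) - alpha * logp_pi
    return q_backup, v_backup

# Example usage (made-up numbers):
#   soft_backups(r=np.array([1.0, 0.0]), d=np.array([0.0, 1.0]),
#                v_targ=np.array([5.0, 5.0]),
#                q1_pi=np.array([4.0, 6.0]), q2_pi=np.array([4.5, 5.5]),
#                logp_pi=np.array([-1.2, -0.7]))
#   # -> (array([5.95, 0.  ]), array([4.24, 5.64]))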
def ddpg(env_fn, actor_critic=a2c, ac_kwargs=dict(), seed=0, steps_per_epoch=5000,
         epochs=100, replay_size=int(1e6), gamma=.99, polyak=.995, pi_lr=1e-3,
         q_lr=1e-3, batch_size=100, start_steps=10000, act_noise=.1,
         max_ep_len=1000, logger_kwargs=dict(), save_freq=1):
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    act_limit = env.action_space.high[0]

    ac_kwargs['action_space'] = env.action_space

    x_ph, a_ph, x2_ph, r_ph, d_ph = \
        tf.placeholder(name='x_ph', shape=[None, obs_dim], dtype=tf.float32), \
        tf.placeholder(name='a_ph', shape=[None, act_dim], dtype=tf.float32), \
        tf.placeholder(name='x2_ph', shape=[None, obs_dim], dtype=tf.float32), \
        tf.placeholder(name='r_ph', shape=[None], dtype=tf.float32), \
        tf.placeholder(name='d_ph', shape=[None], dtype=tf.float32)

    # Main networks
    with tf.variable_scope('main'):
        pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Target networks
    with tf.variable_scope('target'):
        pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    replaybuffer = ReplayBuffer(obs_dim, act_dim, replay_size)

    # helpers for var count
    def get_vars(scope=''):
        return [x for x in tf.trainable_variables() if scope in x.name]

    def count_vars(scope=''):
        v = get_vars(scope)
        return sum([np.prod(var.shape.as_list()) for var in v])

    var_counts = tuple(count_vars(scope) for scope in ['main/pi', 'main/q', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' % var_counts)

    # Bellman backup for Q function
    backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ)

    # Losses
    pi_loss = -tf.reduce_mean(q_pi)
    q_loss = tf.reduce_mean((q - backup)**2)

    # Optimizers and train ops
    train_pi_op = tf.train.AdamOptimizer(pi_lr).minimize(pi_loss, var_list=get_vars('main/pi'))
    train_q_op = tf.train.AdamOptimizer(q_lr).minimize(q_loss, var_list=get_vars('main/q'))

    # Update target networks (polyak averaging)
    target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    # Init targets to match main networks
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={'x': x_ph, 'a': a_ph},
                          outputs={'pi': pi, 'q': q})

    def get_actions(o, noise_scale):
        a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_actions(o, 0))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop:
    for t in range(total_steps):
        if t > start_steps:
            a = get_actions(o, act_noise)
        else:
            a = env.action_space.sample()

        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the artificial "done" that comes from hitting the time horizon
        d = False if ep_len == max_ep_len else d

        # Storing experience
        replaybuffer.store(o, a, r, o2, d)
        o = o2

        if d or (ep_len == max_ep_len):
            for _ in range(ep_len):
                batch = replaybuffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done']
                }

                # Q-learning update
                outs = sess.run([q_loss, q, train_q_op], feed_dict)
                logger.store(LossQ=outs[0], QVals=outs[1])

                # Policy update
                outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict)
                logger.store(LossPi=outs[0])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('QVals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
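
# Minimal sketch of the exploration scheme get_actions() implements above: add
# zero-mean Gaussian noise to the deterministic action and clip to the action
# bounds. The noise scale and action limit below are illustrative.
import numpy as np

def noisy_action(a, noise_scale=0.1, act_limit=1.0):
    """Gaussian exploration noise followed by clipping, as in DDPG rollouts."""
    a = np.asarray(a, dtype=np.float64) + noise_scale * np.random.randn(*np.shape(a))
    return np.clip(a, -act_limit, act_limit)

# Example usage: noisy_action(np.array([0.95, -0.2]))  # always stays inside [-1, 1]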
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, batch_size=250000, n=100, epochs=100, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=1000, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape sequence_length = n * max_ep_len trials = batch_size // sequence_length # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph # x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) # rew_ph, adv_ph, ret_ph, logp_old_ph = core.placeholders(1, None, None, None) x_ph = tf.placeholder(dtype=tf.int32, shape=(None, sequence_length), name='x_ph') t_ph = tf.placeholder(dtype=tf.int32, shape=(None, sequence_length), name='t_ph') a_ph = tf.placeholder(dtype=tf.int32, shape=(None, sequence_length), name='a_ph') r_ph = tf.placeholder(dtype=tf.float32, shape=(None, sequence_length), name='r_ph') # input_ph = tf.placeholder(dtype=tf.float32, shape=(None, None, n, None), name='rew_ph') adv_ph = tf.placeholder(dtype=tf.float32, shape=(None), name='adv_ph') ret_ph = tf.placeholder(dtype=tf.float32, shape=(None), name='ret_ph') logp_old_ph = tf.placeholder(dtype=tf.float32, shape=(None), name='logp_old_ph') # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, t_ph, a_ph, r_ph, sequence_length, env.action_space.n, env.observation_space.shape[0]) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, t_ph, a_ph, r_ph, adv_ph, ret_ph, logp_old_ph] # for ph in all_phs: # print(ph.shape) # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer buf = PPOBuffer(obs_dim, act_dim, batch_size, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving model_inputs = {'x': x_ph, 't': t_ph, 'a': a_ph, 'r': r_ph} model_outputs = {'pi': pi} logger.setup_tf_saver(sess, inputs=model_inputs, outputs=model_outputs) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} # inputs[a_ph] = np.tril(np.transpose(np.repeat(inputs[a_ph], n).reshape(trials, n, n), [0, 2, 1])) # inputs[rew_ph] = np.tril(np.transpose(np.repeat(inputs[rew_ph], n).reshape(trials, n, n), [0, 2, 1])) # print(inputs[x_ph]) # print(inputs[t_ph]) # print(inputs[a_ph]) # print(inputs[r_ph]) inputs[x_ph] = inputs[x_ph].reshape(trials, sequence_length) inputs[t_ph] = inputs[t_ph].reshape(trials, sequence_length) inputs[a_ph] = inputs[a_ph].reshape(trials, sequence_length) inputs[r_ph] = inputs[r_ph].reshape(trials, 
sequence_length) # print('x:', inputs[x_ph]) # print('t:', inputs[t_ph]) # print('a:', inputs[a_ph]) # print('r:', inputs[r_ph]) # print('ret:', inputs[ret_ph]) # print('adv:', inputs[adv_ph]) # print('logp_old:', inputs[logp_old_ph]) pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) # kl = mpi_avg(kl) # if kl > 1.5 * target_kl: # logger.log('Early stopping at step %d due to reaching max kl.'%i) # break logger.store(StopIter=i) for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() save_itr = 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for trail in range(trials): print('trial:', trail) # last_a = np.zeros(n).reshape(1, n) # last_r = np.zeros(n).reshape(1, n) o_deque = deque(sequence_length * [0], sequence_length) t_deque = deque(sequence_length * [0], sequence_length) last_a = deque(sequence_length * [0], sequence_length) last_r = deque(sequence_length * [0], sequence_length) means = env.sample_tasks(1)[0] # print('task means:', means) action_dict = defaultdict(int) total_reward = 0 env.reset_task(means) o, r, d, ep_ret, ep_len = env.reset(), np.zeros(1), False, 0, 0 for episode in range(sequence_length): # print('episode:', episode) # print('o:', o_deque) # print('d:', t_deque) # print('a:', last_a) # print('r:', last_r) a, v_t, logp_t = sess.run( get_action_ops, feed_dict={ x_ph: np.array(o_deque).reshape(1, sequence_length), t_ph: np.array(t_deque).reshape(1, sequence_length), a_ph: np.array(last_a).reshape(1, sequence_length), r_ph: np.array(last_r).reshape(1, sequence_length) }) # print("a shape:", a.shape) # print("v_t shape:", v_t.shape) # print("logp_t shape:", logp_t.shape) # choosen_a = a[episode, 0] # choosen_v_t = v_t[0, episode] # choosen_logp_t = logp_t[episode] # print('a:', a) choosen_a = a[-1] choosen_v_t = v_t[-1] choosen_logp_t = logp_t[-1] action_dict[choosen_a] += 1 o, r, d, _ = env.step(choosen_a) ep_ret += r ep_len += 1 t = ep_len == max_ep_len total_reward += r o_deque.append(o) t_deque.append(int(d)) last_a.append(choosen_a) last_r.append(r) # save and log buf.store(o, int(t), choosen_a, r, choosen_v_t, choosen_logp_t) logger.store(VVals=v_t) terminal = d or t if terminal or (episode == sequence_length - 1): if not (terminal): print( 'Warning: trajectory cut off by epoch at %d steps.' 
% ep_len) # if trajectory didn't reach terminal state, bootstrap value target if d: last_val = r else: last_val = sess.run( v, feed_dict={ x_ph: np.array(o_deque).reshape(1, sequence_length), t_ph: np.array(t_deque).reshape(1, sequence_length), a_ph: np.array(last_a).reshape(1, sequence_length), r_ph: np.array(last_r).reshape(1, sequence_length) }) last_val = last_val[-1] buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 o_deque[-1] = 0 t_deque[-1] = 0 last_a[-1] = 0 last_r[-1] = 0 print(action_dict) print('average reward:', total_reward / sequence_length) # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, save_itr) save_itr += 1 # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * batch_size) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
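
# Standalone NumPy sketch of GAE-lambda, which PPOBuffer.finish_path(last_val) is
# assumed to compute from the stored rewards, values and a bootstrap value. The
# reward/value numbers in the usage note are arbitrary; gamma and lam match the
# defaults of this function.
import numpy as np

def gae_advantages(rews, vals, last_val, gamma=0.99, lam=0.97):
    """Generalized Advantage Estimation for one trajectory segment."""
    vals = np.append(vals, last_val)
    deltas = rews + gamma * vals[1:] - vals[:-1]
    adv = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * lam * running
        adv[t] = running
    return adv

# Example usage (made-up trajectory):
#   gae_advantages(np.array([1.0, 1.0, 1.0]), np.array([0.5, 0.5, 0.5]), last_val=0.0)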
def ppo(env_fn, actor_critic=a2c, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=.99, clip_ratio=.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=.97, max_ep_len=1000, target_kl=.01, logger_kwargs=dict(), save_freq=10): logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Share action space structure with the actor_critic ac_kwargs['action_space'] = env.action_space x_ph, a_ph = tf.placeholder( name="x_ph", shape=[None, obs_dim], dtype=tf.float32), \ tf.placeholder( name="a_ph", shape=[None, act_dim], dtype=tf.float32) adv_ph, ret_ph, logp_old_ph = tf.placeholder( name="adv_ph", shape=[None], dtype=tf.float32), \ tf.placeholder( name="ret_ph", shape=[None], dtype=tf.float32), \ tf.placeholder( name="logp_old_ph", shape=[None], dtype=tf.float32) # Main outputs from computation graph # print( actor_critic( x_ph, a_ph, **ac_kwargs)) pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] get_action_ops = [pi, v, logp_pi] local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # helpers for var count def get_vars(scope=''): return [x for x in tf.trainable_variables() if scope in x.name] def count_vars(scope=''): v = get_vars(scope) return sum([np.prod(var.shape.as_list()) for var in v]) var_counts = tuple(count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO Objectives ratio = tf.exp(logp - logp_old_ph) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) v_loss = tf.reduce_mean((ret_ph - v)**2) # Stats to watch approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean(-logp) clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) def mpi_avg(x): """Average a scalar or vector over MPI processes.""" return mpi_sum(x) / num_procs() kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break logger.store(StopIter=i) for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) o, r, d, _ = env.step(a[0]) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
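
# NumPy rendering of the clipped surrogate objective assembled above (probability
# ratio, the positive/negative advantage split for min_adv, then the pessimistic
# minimum). The log-probabilities and advantages in the usage note are invented.
import numpy as np

def ppo_clip_loss(logp, logp_old, adv, clip_ratio=0.2):
    """Negative clipped surrogate objective, i.e. the quantity train_pi minimizes."""
    ratio = np.exp(logp - logp_old)                                   # pi(a|s) / pi_old(a|s)
    min_adv = np.where(adv > 0, (1 + clip_ratio) * adv, (1 - clip_ratio) * adv)
    return -np.mean(np.minimum(ratio * adv, min_adv))

# Example usage (made-up numbers):
#   ppo_clip_loss(np.log([0.3, 0.4]), np.log([0.2, 0.5]), np.array([1.0, -1.0]))  # -> -0.2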
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, trials_per_epoch=2500, steps_per_trial=100, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=1000, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph # x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) x_ph = tf.placeholder(dtype=tf.float32, shape=(None, None, 1), name='x_ph') a_ph = tf.placeholder(dtype=tf.int32, shape=(None, None), name='a_ph') # adv_ph, ret_ph, logp_old_ph, rew_ph = core.placeholders(None, None, None, 1) adv_ph = tf.placeholder(dtype=tf.float32, shape=(None, None), name='adv_ph') ret_ph = tf.placeholder(dtype=tf.float32, shape=(None, None), name='ret_ph') logp_old_ph = tf.placeholder(dtype=tf.float32, shape=(None, None), name='logp_old_ph') rew_ph = tf.placeholder(dtype=tf.float32, shape=(None, None, 1), name='rew_ph') pi_state_ph = tf.placeholder(dtype=tf.float32, shape=(None, NUM_GRU_UNITS), name='pi_state_ph') v_state_ph = tf.placeholder(dtype=tf.float32, shape=(None, NUM_GRU_UNITS), name='v_state_ph') # Initialize rnn states for pi and v # Main outputs from computation graph pi, logp, logp_pi, v, new_pi_state, new_v_state = actor_critic( x_ph, a_ph, rew_ph, pi_state_ph, v_state_ph, NUM_GRU_UNITS, action_space=env.action_space) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph, rew_ph] # Every step, get: action, value, and logprob and reward get_action_ops = [pi, v, logp_pi, new_pi_state, new_v_state] # Experience buffer steps_per_epoch = trials_per_epoch * steps_per_trial local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer( learning_rate=pi_lr).minimize(pi_loss - 0.01 * approx_ent) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) # tf.reset_default_graph() # restore_tf_graph(sess, '..//data//ppo//ppo_s0//simple_save') def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} inputs[pi_state_ph] = np.zeros((trials_per_epoch, NUM_GRU_UNITS)) inputs[v_state_ph] = np.zeros((trials_per_epoch, NUM_GRU_UNITS)) pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) print(pi_l_old, v_l_old) # Training for i in range(train_pi_iters): # print(f'pi:{i}') _, kl = sess.run([train_pi, approx_kl], 
feed_dict=inputs) # print(sess.run(pi_loss, feed_dict=inputs)) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' % i) break logger.store(StopIter=i) for _ in range(train_v_iters): # print(f'v:{_}') sess.run(train_v, feed_dict=inputs) # Log changes from update import datetime print(f'finish one batch training at {datetime.datetime.now()}') pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for trial in range(trials_per_epoch): print(f'trial: {trial}') old_a = np.array([0]).reshape(1, 1) old_r = np.array([0]).reshape((1, 1, 1)) means = env.sample_tasks(1)[0] action_dict = defaultdict(int) for i in range(env.action_space.n): action_dict[i] = 0 env.reset_task_simple(means) task_avg = 0.0 pi_state_t = np.zeros((1, NUM_GRU_UNITS)) v_state_t = np.zeros((1, NUM_GRU_UNITS)) for step in range(steps_per_trial): a, v_t, logp_t, pi_state_t, v_state_t = sess.run( get_action_ops, feed_dict={ x_ph: o.reshape(1, 1, -1), a_ph: old_a, rew_ph: old_r, pi_state_ph: pi_state_t, v_state_ph: v_state_t }) # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) try: o, r, d, _ = env.step(a[0][0]) except: print(a) raise AssertionError action_dict[a[0][0]] += 1 old_a = np.array(a).reshape(1, 1) old_r = np.array([r]).reshape(1, 1, 1) ep_ret += r task_avg += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (step == local_steps_per_epoch - 1): if not (terminal): print( 'Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # logger.log_tabular('Epoch', epoch) # logger.log_tabular('EpRet', with_min_and_max=True) # logger.log_tabular('Means', means) # logger.dump_tabular() print(f'avg in trial {trial}: {task_avg / steps_per_trial}') print(f'Means in trial {trial}: {means}') print(action_dict) # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # saved_path = saver.save(sess, f"/tmp/model_epoch{epoch}.ckpt") # print(f'Model saved in {saved_path}') # Perform PPO update! update() logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
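
# Small sketch of the early-stopping rule used in the update() loop above: stop the
# policy gradient steps once the sample KL estimate exceeds 1.5 * target_kl. The
# log-probabilities in the usage note are invented for illustration.
import numpy as np

def should_stop(logp_old, logp, target_kl=0.01):
    """Same estimator as approx_kl above: mean(logp_old - logp)."""
    return np.mean(logp_old - logp) > 1.5 * target_kl

# Example usage (made-up numbers):
#   should_stop(np.array([-1.00, -0.70, -1.20]), np.array([-1.05, -0.72, -1.30]))
#   # -> True, since the KL estimate (~0.057) exceeds 0.015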
def td3( env_fn, actor_critic=a2c, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=.99, polyak=.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, act_noise=.1, target_noise=.2, noise_clip=.5, policy_delay=2, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): logger = EpochLogger( **logger_kwargs) logger.save_config( locals()) tf.set_random_seed(seed) np.random.seed( seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping act_limit = env.action_space.high[0] # Share action space info with A2C ac_kwargs['action_space'] = env.action_space x_ph, a_ph, x2_ph, r_ph, d_ph = \ tf.placeholder( name='x_ph', shape=(None, obs_dim), dtype=tf.float32), \ tf.placeholder( name='a_ph', shape=(None, act_dim), dtype=tf.float32), \ tf.placeholder( name='x2_ph', shape=(None, obs_dim), dtype=tf.float32),\ tf.placeholder( name='r_ph', shape=(None,), dtype=tf.float32), \ tf.placeholder( name='d_ph', shape=(None,), dtype=tf.float32) # Actor policy and value with tf.variable_scope('main'): pi, q1, q2, q1_pi = actor_critic( x_ph, a_ph, **ac_kwargs) # This is a bit memory inefficient: the Q values created along with the target # policy, and the policy created along with the target Qs, are never referenced, # but they are still declared, at the cost of GPU memory. # Target policy with tf.variable_scope( 'target'): pi_targ, _, _, _ = actor_critic(x2_ph, a_ph, **ac_kwargs) # Target Q networks with tf.variable_scope( 'target', reuse=True): # Target policy smoothing: add clipped noise to the target action epsilon = tf.random_normal( tf.shape( pi_targ), stddev=target_noise) epsilon = tf.clip_by_value( epsilon, -noise_clip, noise_clip) a2 = pi_targ + epsilon a2 = tf.clip_by_value( a2, -act_limit, act_limit) # Target Q-values using actions from target policy _, q1_targ, q2_targ, _ = actor_critic(x2_ph, a2, **ac_kwargs) replaybuffer = ReplayBuffer( obs_dim, act_dim, size=replay_size) # helpers for var count def get_vars(scope=''): return [x for x in tf.trainable_variables() if scope in x.name] def count_vars(scope=''): v = get_vars(scope) return sum([np.prod(var.shape.as_list()) for var in v]) # Count variables var_counts = tuple( count_vars( scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main']) print('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n' % var_counts) # Clipped double Q-learning with Bellman backup min_q_targ = tf.minimum( q1_targ, q2_targ) backup = tf.stop_gradient( r_ph + gamma * (1 - d_ph) * min_q_targ) # TD3 losses pi_loss = - tf.reduce_mean( q1_pi) q1_loss = tf.reduce_mean( (q1 - backup)**2) q2_loss = tf.reduce_mean( (q2 - backup)**2) q_loss = q1_loss + q2_loss # Training ops (restrict each optimizer to its own variables, so the policy # update cannot move the Q-network weights and vice versa) pi_train = tf.train.AdamOptimizer(pi_lr).minimize( pi_loss, var_list=get_vars('main/pi')) q_train = tf.train.AdamOptimizer(q_lr).minimize( q_loss, var_list=get_vars('main/q')) # Polyak averaging for target variables target_update = tf.group( [ tf.assign( v_targ, polyak * v_targ + (1-polyak) * v_main) for v_main, v_targ in zip( get_vars('main'), get_vars('target'))]) target_init = tf.group( [ tf.assign( v_targ, v_main) for v_targ, v_main in zip( get_vars('target'), get_vars('main'))]) sess = tf.Session() sess.run( tf.global_variables_initializer()) sess.run( target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'pi': pi, 'q1': q1, 'q2': q2}) def get_action( o, noise_scale): a = sess.run( pi, feed_dict={ x_ph: o.reshape(1,-1)})[0] a += noise_scale * np.random.randn( act_dim) return np.clip( a, -act_limit, act_limit) def test_agent( n=10): for j in range(
n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not ( d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, _ = test_env.step( get_action( o, 0)) ep_ret += r ep_len += 1 logger.store( TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop for t in range( total_steps): if t > start_steps: a = get_action( o, act_noise) else: a = env.action_space.sample() o2, r, d, _ = env.step( a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time horizon d = False if ep_len == max_ep_len else d o2 = np.squeeze( o2) replaybuffer.store( o, a, r, o2, d) o = o2 if d or ( ep_len == max_ep_len): for j in range( ep_len): batch = replaybuffer.sample_batch( batch_size) feed_dict = {x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } q_step_ops = [q_loss, q1, q2, q_train] outs = sess.run( q_step_ops, feed_dict) logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2]) # Delayed policy and target updates if j % policy_delay == 0: outs = sess.run( [pi_loss, pi_train, target_update], feed_dict) logger.store( LossPi=outs[0]) logger.store( EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Saving the model if (epoch % save_freq == 0) or ( epoch == epochs - 1): logger.save_state({'env': env}, None) test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular()
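# The Q targets above combine target policy smoothing (clipped Gaussian noise on
# the target action) with clipped double-Q learning (the minimum of the two
# target critics). A minimal NumPy sketch of that backup; pi_targ_fn, q1_targ_fn
# and q2_targ_fn are illustrative stand-ins for the target networks, not the TF
# tensors defined above. Both Q losses regress onto this one target, while the
# policy and the polyak-averaged targets move only every policy_delay Q steps.
import numpy as np


def td3_backup(r, d, o2, pi_targ_fn, q1_targ_fn, q2_targ_fn, gamma=0.99,
               act_limit=1.0, target_noise=0.2, noise_clip=0.5):
    """Compute the TD3 Bellman target for a batch of transitions."""
    a2 = pi_targ_fn(o2)                                          # target action
    eps = np.clip(target_noise * np.random.randn(*a2.shape),
                  -noise_clip, noise_clip)                       # smoothing noise
    a2 = np.clip(a2 + eps, -act_limit, act_limit)
    min_q = np.minimum(q1_targ_fn(o2, a2), q2_targ_fn(o2, a2))   # clipped double-Q
    return r + gamma * (1 - d) * min_q


if __name__ == "__main__":
    # Toy check with made-up "networks": batch of 5 transitions, 3-dim obs.
    o2 = np.random.randn(5, 3)
    r, d = np.random.randn(5), np.zeros(5)
    pi_fn = lambda o: np.tanh(o[:, :1])
    q_fn = lambda o, a: -(a ** 2).sum(axis=1)
    print(td3_backup(r, d, o2, pi_fn, q_fn, q_fn).shape)         # (5,)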
def trpo(env_fn, actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=.99, delta=.01, vf_lr=1e-3, train_v_iters=80, damping_coeff=.1, cg_iters=10, backtrack_iters=10, backtrack_coeff=.8, lam=.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10, algo="trpo"): # LOgger tools logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Seed inits seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) # Environment recreation env = env_fn() # Getting obs dims obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] ac_kwargs['action_space'] = env.action_space # Placeholders x_ph, a_ph = tf.placeholder( name="x_ph", shape=[None, obs_dim], dtype=tf.float32), \ tf.placeholder( name="a_ph", shape=[None, act_dim], dtype=tf.float32) adv_ph, ret_ph, logp_old_ph = tf.placeholder( name="adv_ph", shape=[None], dtype=tf.float32), \ tf.placeholder( name="ret_ph", shape=[None], dtype=tf.float32), \ tf.placeholder( name="logp_old_ph", shape=[None], dtype=tf.float32) pi, logp, logp_pi, info, info_phs, d_kl, v = actor_critic( x_ph, a_ph, **ac_kwargs) def keys_as_sorted_list(dict): return sorted(list(dict.keys())) def values_as_sorted_list(dict): return [dict[k] for k in keys_as_sorted_list(dict)] all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph ] + values_as_sorted_list(info_phs) get_action_ops = [pi, v, logp_pi] + values_as_sorted_list(info) # Experience buffer init local_steps_per_epoch = int(steps_per_epoch / num_procs()) info_shapes = {k: v.shape.as_list()[1:] for k, v in info_phs.items()} buf = GAEBuffer(obs_dim, act_dim, local_steps_per_epoch, info_shapes, gamma, lam) # Count variables def get_vars(scope=''): return [x for x in tf.trainable_variables() if scope in x.name] def count_vars(scope=''): v = get_vars(scope) return sum([np.prod(var.shape.as_list()) for var in v]) var_counts = tuple(count_vars(scope) for scope in ["pi", "v"]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # TRPO Losses ratio = tf.exp(logp - logp_old_ph) pi_loss = -tf.reduce_mean(ratio * adv_ph) v_loss = tf.reduce_mean((ret_ph - v)**2) # Optimizer for value function train_vf = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) # CG solver requirements pi_params = get_vars("pi") # Some helpers def flat_concat(xs): return tf.concat([tf.reshape(x, (-1, )) for x in xs], axis=0) def flat_grad(f, params): return flat_concat(tf.gradients(xs=params, ys=f)) def hessian_vector_product(f, params): g = flat_grad(f, params) x = tf.placeholder(tf.float32, shape=g.shape) return x, flat_grad(tf.reduce_sum(g * x), params) def assign_params_from_flat(x, params): flat_size = lambda p: int(np.prod(p.shape.as_list()) ) # the 'int' is important for scalars splits = tf.split(x, [flat_size(p) for p in params]) new_params = [ tf.reshape(p_new, p.shape) for p, p_new in zip(params, splits) ] return tf.group( [tf.assign(p, p_new) for p, p_new in zip(params, new_params)]) gradient = flat_grad(pi_loss, pi_params) v_ph, hvp = hessian_vector_product(d_kl, pi_params) if damping_coeff > 0: hvp += damping_coeff * v_ph # Symbols for getting and setting params get_pi_params = flat_concat(pi_params) set_pi_params = assign_params_from_flat(v_ph, pi_params) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def cg(Ax, b): x = np.zeros_like(b) r = b.copy() p = r.copy() r_dot_old = np.dot(r, r) for _ 
in range(cg_iters): z = Ax(p) alpha = r_dot_old / (np.dot(p, z) + EPS) x += alpha * p r -= alpha * z r_dot_new = np.dot(r, r) p = r + (r_dot_new / r_dot_old) * p r_dot_old = r_dot_new return x def update(): # Prepare hessian func, gradient eval # Always so elegant haha inputs = {k: v for k, v in zip(all_phs, buf.get())} def mpi_avg(x): """Average a scalar or vector over MPI processes.""" return mpi_sum(x) / num_procs() Hx = lambda x: mpi_avg(sess.run(hvp, feed_dict={**inputs, v_ph: x})) g, pi_l_old, v_l_old = sess.run([gradient, pi_loss, v_loss], feed_dict=inputs) g, pi_l_old = mpi_avg(g), mpi_avg(pi_l_old) # Core calculations for TRPO or NPG x = cg(Hx, g) alpha = np.sqrt(2 * delta / (np.dot(x, Hx(x)) + EPS)) # OK old_params = sess.run(get_pi_params) def set_and_eval(step): sess.run(set_pi_params, feed_dict={v_ph: old_params - alpha * x * step}) return mpi_avg(sess.run([d_kl, pi_loss], feed_dict=inputs)) if algo == 'npg': # npg has no backtracking or hard kl constraint enforcement kl, pi_l_new = set_and_eval(step=1.) elif algo == "trpo": for j in range(backtrack_iters): kl, pi_l_new = set_and_eval(step=backtrack_coeff**j) if kl <= delta and pi_l_new <= pi_l_old: logger.log( 'Accepting new params at step %d of line search.' % j) logger.store(BacktrackIters=j) break if j == backtrack_iters - 1: logger.log('Line search failed! Keeping old params.') logger.store(BacktrackIters=j) kl, pi_l_new = set_and_eval(step=0.) # Value function updates for _ in range(train_v_iters): sess.run(train_vf, feed_dict=inputs) v_l_new = sess.run(v_loss, feed_dict=inputs) # Log changes from update logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): agent_outs = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) a, v_t, logp_t, info_t = agent_outs[0][0], agent_outs[ 1], agent_outs[2], agent_outs[3:] # Save and log buf.store(o, a, r, v_t, logp_t, info_t) logger.store(VVals=v_t) o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not terminal: print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) last_val = r if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform TRPO or NPG update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('KL', average_only=True) if algo == 'trpo': logger.log_tabular('BacktrackIters', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
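# The cg() helper above solves H x = g using only Hessian-vector products, never
# forming H explicitly; the step size then comes from sqrt(2 * delta / (x^T H x)).
# A self-contained NumPy check of the same iteration on a small symmetric
# positive-definite matrix (the matrix and right-hand side here are made up for
# the test, and the matvec stands in for the HVP):
import numpy as np


def cg_check(cg_iters=10, eps=1e-8):
    A = np.array([[4., 1.], [1., 3.]])          # SPD test matrix (illustrative)
    b = np.array([1., 2.])
    Ax = lambda v: A @ v                        # matvec, stand-in for the HVP

    x = np.zeros_like(b)
    r = b.copy()                                # residual b - A x (x starts at 0)
    p = r.copy()                                # search direction
    r_dot_old = np.dot(r, r)
    for _ in range(cg_iters):
        z = Ax(p)
        alpha = r_dot_old / (np.dot(p, z) + eps)
        x += alpha * p
        r -= alpha * z
        r_dot_new = np.dot(r, r)
        p = r + (r_dot_new / r_dot_old) * p
        r_dot_old = r_dot_new
    return np.allclose(A @ x, b, atol=1e-6)     # True: CG recovered the solution


if __name__ == "__main__":
    print(cg_check())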
def sac(env_fn, seed=0, gamma=.99, lam=.97, hidden_sizes=(200, 100), alpha=.5, v_lr=1e-3, q_lr=1e-3, pi_lr=1e-3, polyak=1e-2, epochs=50, steps_per_epoch=1000, batch_size=100, start_steps=10000, logger_kwargs=dict(), replay_size=int(1e6), max_ep_len=1000, save_freq=1): logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() env = env_fn() # Dimensions obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] act_limit = env.action_space.high[0] # Placeholders x_ph = tf.placeholder(shape=[None, obs_dim], dtype=tf.float32) a_ph = tf.placeholder(shape=[None, act_dim], dtype=tf.float32) x2_ph = tf.placeholder(shape=[None, obs_dim], dtype=tf.float32) r_ph = tf.placeholder(shape=[None], dtype=tf.float32) d_ph = tf.placeholder(shape=[None], dtype=tf.float32) # Networks def mlp(x, hidden_sizes=(32, ), activation=tf.tanh, output_activation=None): for h in hidden_sizes[:-1]: x = tf.layers.dense(x, units=h, activation=activation) return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) # Why isn't the k used here ? def gaussian_likelihood(x, mu, log_std): EPS = 1e-8 pre_sum = -0.5 * ( ((x - mu) / (tf.exp(log_std) + EPS))**2 + 2 * log_std + np.log(2 * np.pi)) return tf.reduce_sum(pre_sum, axis=1) def clip_but_pass_gradient(x, l=-1., u=1.): clip_up = tf.cast(x > u, tf.float32) clip_low = tf.cast(x < l, tf.float32) return x + tf.stop_gradient((u - x) * clip_up + (l - x) * clip_low) LOG_STD_MIN = -20 LOG_STD_MAX = 2 def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation): act_dim = a.shape.as_list()[-1] net = mlp(x, list(hidden_sizes), activation, activation) mu = tf.layers.dense(net, act_dim, activation=output_activation) """ Because algorithm maximizes trade-off of reward and entropy, entropy must be unique to state---and therefore log_stds need to be a neural network output instead of a shared-across-states learnable parameter vector. But for deep Relu and other nets, simply sticking an activationless dense layer at the end would be quite bad---at the beginning of training, a randomly initialized net could produce extremely large values for the log_stds, which would result in some actions being either entirely deterministic or too random to come back to earth. Either of these introduces numerical instability which could break the algorithm. To protect against that, we'll constrain the output range of the log_stds, to lie within [LOG_STD_MIN, LOG_STD_MAX]. This is slightly different from the trick used by the original authors of SAC---they used tf.clip_by_value instead of squashing and rescaling. I prefer this approach because it allows gradient propagation through log_std where clipping wouldn't, but I don't know if it makes much of a difference. """ log_std = tf.layers.dense(net, act_dim, activation=tf.tanh) log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1) std = tf.exp(log_std) pi = mu + tf.random_normal(tf.shape(mu)) * std logp_pi = gaussian_likelihood(pi, mu, log_std) return mu, pi, logp_pi def apply_squashing_func(mu, pi, logp_pi): mu = tf.tanh(mu) pi = tf.tanh(pi) # To avoid evil machine precision error, strictly clip 1-pi**2 to [0,1] range. 
logp_pi -= tf.reduce_sum( tf.log(clip_but_pass_gradient(1 - pi**2, l=0, u=1) + 1e-6), axis=1) return mu, pi, logp_pi with tf.variable_scope("main"): activation = tf.tanh with tf.variable_scope("pi"): # mu = mlp( x_ph, hidden_sizes, activation, None) # log_std = mlp( mu, (act_dim,), activation, None) # # Avoid out of range log_std. Refer to Github for explanation. # log_std = LOG_STD_MIN + .5 * ( LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1) # # mu = mlp( mu, (act_dim,), activation, None) # # pi = mu + tf.exp( log_std) * tf.random_normal( tf.shape(mu)) # logp_pi = gaussian_likelihood( pi, mu, log_std) # # # Follow SpinningUp Implementation # mu = tf.tanh(mu) # pi = tf.tanh(pi) # # def clip_but_pass_gradient(x, l=-1., u=1.): # clip_up = tf.cast(x > u, tf.float32) # clip_low = tf.cast(x < l, tf.float32) # # What is this supposed to mean even ? # return x + tf.stop_gradient((u - x)*clip_up + (l - x)*clip_low) # # # Shameless copy paste # logp_pi -= tf.reduce_sum(tf.log(clip_but_pass_gradient(1 - pi**2, l=0, u=1) + 1e-6), axis=1) # Not working version bak # squashed_pi = tf.tanh( pi) # # # To be sure # pi = tf.clip_by_value( pi, -act_limit, act_limit) # # # Must take in the squased polic # log_squash_pi = gaussian_likelihood( squashed_pi, mu, log_std) # Shamefull plug mu, pi, logp_pi = mlp_gaussian_policy(x_ph, a_ph, hidden_sizes, tf.tanh, None) mu, pi, logp_pi = apply_squashing_func(mu, pi, logp_pi) with tf.variable_scope("q1"): q1 = tf.squeeze(mlp(tf.concat([x_ph, a_ph], -1), hidden_sizes + (1, ), activation, None), axis=-1) with tf.variable_scope("q1", reuse=True): q1_pi = tf.squeeze(mlp(tf.concat([x_ph, pi], -1), hidden_sizes + (1, ), activation, None), axis=-1) with tf.variable_scope("q2"): q2 = tf.squeeze(mlp(tf.concat([x_ph, a_ph], -1), hidden_sizes + (1, ), activation, None), axis=-1) with tf.variable_scope("q2", reuse=True): q2_pi = tf.squeeze(mlp(tf.concat([x_ph, pi], -1), hidden_sizes + (1, ), activation, None), axis=-1) with tf.variable_scope("v"): # v = mlp( x_ph, hidden_sizes+(1,), activation, None) v = tf.squeeze(mlp(x_ph, hidden_sizes + (1, ), activation, None), axis=-1) with tf.variable_scope("target"): with tf.variable_scope("v"): v_targ = tf.squeeze(mlp(x2_ph, hidden_sizes + (1, ), activation, None), axis=-1) # helpers for var count def get_vars(scope=''): return [x for x in tf.trainable_variables() if scope in x.name] def count_vars(scope=''): v = get_vars(scope) return sum([np.prod(var.shape.as_list()) for var in v]) # Count variables var_counts = tuple( count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main']) print( '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t v: %d, \t total: %d\n' % var_counts) # Targets q_backup_prestop = r_ph + gamma * (1 - d_ph) * v_targ v_backup_prestop = tf.minimum(q1_pi, q2_pi) - alpha * logp_pi q_backup, v_backup = tf.stop_gradient(q_backup_prestop), tf.stop_gradient( v_backup_prestop) # Q Loss q1_loss = tf.reduce_mean((q1 - q_backup)**2) q2_loss = tf.reduce_mean((q2 - q_backup)**2) q_loss = q1_loss + q2_loss # V Loss v_loss = tf.reduce_mean((v - v_backup)**2) # Pol loss pi_loss = tf.reduce_mean(-q1_pi + alpha * logp_pi) # Training ops v_trainop = tf.train.AdamOptimizer(v_lr).minimize( v_loss, var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/v")) q_trainop = tf.train.AdamOptimizer(q_lr).minimize( q_loss, var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/q")) pi_trainop = tf.train.AdamOptimizer(pi_lr).minimize( pi_loss, 
var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/pi")) assert polyak <= .5 # Target update op init_v_target = tf.group([ tf.assign(v_target, v_main) for v_main, v_target in zip( tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/v"), tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target/v")) ]) update_v_target = tf.group([ tf.assign(v_target, (1 - polyak) * v_target + polyak * v_main) for v_main, v_target in zip( tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/v"), tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target/v")) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(init_v_target) # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph }, outputs={ 'pi': pi, 'q1': q1, 'q2': q2, 'v': v }) def test_agent(n=10): for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): o, r, d, _ = test_env.step( sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # Buffer init buffer = ReplayBuffer(obs_dim, act_dim, replay_size) # Main loop start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs for t in range(total_steps): if t > start_steps: a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0] else: a = env.action_space.sample() o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time horizon d = False if ep_len == max_ep_len else d o2 = np.squeeze(o2) buffer.store(o, a, r, o2, d) o = o2 if d or (ep_len == max_ep_len): for j in range(ep_len): batch = buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } # Value gradient steps v_step_ops = [v_loss, v, v_trainop] outs = sess.run(v_step_ops, feed_dict) logger.store(LossV=outs[0], VVals=outs[1]) # Q gradient steps q_step_ops = [q_loss, q1, q2, q_trainop] outs = sess.run(q_step_ops, feed_dict) logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2]) # Policy gradient step, then move the target value network # TODO Add entropy logging pi_step_ops = [pi_loss, pi_trainop, update_v_target] outs = sess.run(pi_step_ops, feed_dict=feed_dict) logger.store(LossPi=outs[0]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0., 0 if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Saving the model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
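# The SAC policy above samples a Gaussian action and squashes it with tanh, so the
# log-likelihood must include the change-of-variables correction
# log|da/du| = log(1 - tanh(u)^2), which is what apply_squashing_func adds.
# A minimal NumPy sketch of the sampled action and its corrected log-prob;
# the inputs (mu, log_std) are illustrative arrays, not the TF tensors above.
import numpy as np


def squashed_gaussian_sample(mu, log_std, eps=1e-6):
    """Sample a tanh-squashed Gaussian action and its log-probability."""
    std = np.exp(log_std)
    u = mu + std * np.random.randn(*mu.shape)                # pre-squash sample
    logp_u = np.sum(-0.5 * (((u - mu) / (std + eps)) ** 2
                            + 2 * log_std + np.log(2 * np.pi)), axis=-1)
    a = np.tanh(u)                                           # squashed action
    logp_a = logp_u - np.sum(np.log(1 - a ** 2 + eps), axis=-1)
    return a, logp_a


if __name__ == "__main__":
    mu, log_std = np.zeros((1, 2)), -0.5 * np.ones((1, 2))
    a, logp_a = squashed_gaussian_sample(mu, log_std)
    print(a.shape, logp_a.shape)                             # (1, 2) (1,)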