def __init__(self, args, env, env_params):
    self.args = args
    # path to save the model
    self.exp_name = '_'.join((self.args.env_name, self.args.alg,
                              str(self.args.seed), datetime.now().isoformat()))
    self.data_path = os.path.join(self.args.save_dir,
                                  '_'.join((self.args.env_name, self.args.alg)),
                                  self.exp_name)
    self.logger = EpochLogger(output_dir=self.data_path, exp_name=self.exp_name)
    self.logger.save_config(args)
    self.env = env
    self.env_params = env_params
    # create the networks
    self.actor_network = actor(env_params)
    self.critic_network = critic(env_params)
    # sync the networks across the cpus
    sync_networks(self.actor_network)
    sync_networks(self.critic_network)
    # build up the target networks
    self.actor_target_network = actor(env_params)
    self.critic_target_network = critic(env_params)
    # load the weights into the target networks
    self.actor_target_network.load_state_dict(self.actor_network.state_dict())
    self.critic_target_network.load_state_dict(self.critic_network.state_dict())
    # if use gpu
    self.rank = MPI.COMM_WORLD.Get_rank()
    if args.cuda:
        device = 'cuda:{}'.format(self.rank % torch.cuda.device_count())
    else:
        device = 'cpu'
    self.device = torch.device(device)
    if self.args.cuda:
        self.actor_network.cuda(self.device)
        self.critic_network.cuda(self.device)
        self.actor_target_network.cuda(self.device)
        self.critic_target_network.cuda(self.device)
    # create the optimizers
    self.actor_optim = torch.optim.Adam(self.actor_network.parameters(),
                                        lr=self.args.lr_actor)
    self.critic_optim = torch.optim.Adam(self.critic_network.parameters(),
                                         lr=self.args.lr_critic)
    # her sampler
    self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k,
                                  self.env.compute_reward)
    # create the replay buffer
    self.buffer = replay_buffer(self.env_params, self.args.buffer_size,
                                self.her_module.sample_her_transitions)
    # create the normalizers
    self.o_norm = normalizer(size=env_params['obs'],
                             default_clip_range=self.args.clip_range)
    self.g_norm = normalizer(size=env_params['goal'],
                             default_clip_range=self.args.clip_range)
    self.logger.setup_pytorch_saver(self.actor_network)
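# The constructor above hard-copies the main weights into the target networks; during
# training, agents of this kind then track the main networks with a soft (polyak)
# update. A minimal PyTorch sketch of that update, assuming a hypothetical `polyak`
# coefficient (not an argument of the class above):
import torch


def soft_update(target_net, source_net, polyak=0.95):
    # theta_targ <- polyak * theta_targ + (1 - polyak) * theta_main
    with torch.no_grad():
        for p_targ, p_src in zip(target_net.parameters(), source_net.parameters()):
            p_targ.data.mul_(polyak)
            p_targ.data.add_((1.0 - polyak) * p_src.data)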
def ppo(env_fn,
        actor_critic=a2c,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=.99,
        clip_ratio=.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=80,
        train_v_iters=80,
        lam=.97,
        max_ep_len=1000,
        target_kl=.01,
        logger_kwargs=dict(),
        save_freq=10):
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Share action space structure with the actor_critic
    ac_kwargs['action_space'] = env.action_space

    x_ph = tf.placeholder(name="x_ph", shape=[None, obs_dim], dtype=tf.float32)
    a_ph = tf.placeholder(name="a_ph", shape=[None, act_dim], dtype=tf.float32)
    adv_ph = tf.placeholder(name="adv_ph", shape=[None], dtype=tf.float32)
    ret_ph = tf.placeholder(name="ret_ph", shape=[None], dtype=tf.float32)
    logp_old_ph = tf.placeholder(name="logp_old_ph", shape=[None], dtype=tf.float32)

    # Main outputs from computation graph
    pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]
    get_action_ops = [pi, v, logp_pi]

    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # helpers for var count
    def get_vars(scope=''):
        return [x for x in tf.trainable_variables() if scope in x.name]

    def count_vars(scope=''):
        v = get_vars(scope)
        return sum([np.prod(var.shape.as_list()) for var in v])

    var_counts = tuple(count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph,
                       (1 - clip_ratio) * adv_ph)
    pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Stats to watch
    approx_kl = tf.reduce_mean(logp_old_ph - logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(-logp)
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def mpi_avg(x):
        """Average a scalar or vector over MPI processes."""
        return mpi_sum(x) / num_procs()

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log('Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)

        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops,
                                      feed_dict={x_ph: o.reshape(1, -1)})

            # save and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a[0])
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not terminal:
                    print('Warning: trajectory cut off by epoch at %d steps.' % ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
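# The `min_adv` construction above is an equivalent way of writing the usual PPO-clip
# surrogate min(r * A, clip(r, 1 - eps, 1 + eps) * A). A small NumPy sketch of the
# per-sample surrogate (illustrative only; the names are not from the graph above):
import numpy as np


def ppo_clip_surrogate(ratio, adv, clip_ratio=0.2):
    # clip(r, 1-eps, 1+eps) * A reduces to (1+eps)*A when A > 0 and (1-eps)*A when A < 0,
    # which is exactly the tf.where(...) expression used in the objective above.
    clipped = np.clip(ratio, 1.0 - clip_ratio, 1.0 + clip_ratio) * adv
    return np.minimum(ratio * adv, clipped)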
def __init__(self, args, env, test_env, env_params): self.args = args # path to save the model if self.args.mmd: self.exp_name = '_'.join( (self.args.env_name, self.args.alg, 'mmd' + str(self.args.beta_mmd), 's' + str(self.args.seed), datetime.now().isoformat())) self.data_path = os.path.join( self.args.save_dir, '_'.join( (self.args.env_name, self.args.alg, 'mmd' + str(self.args.beta_mmd))), self.exp_name) else: self.exp_name = '_'.join( (self.args.env_name, self.args.alg, str(self.args.seed), datetime.now().isoformat())) self.data_path = os.path.join( self.args.save_dir, '_'.join( (self.args.env_name, self.args.alg)), self.exp_name) self.logger = EpochLogger(output_dir=self.data_path, exp_name=self.exp_name) self.logger.save_config(args) self.env = env self.test_env = test_env self.env_params = env_params # create the network self.actor_network = actor(env_params) self.critic_network1 = critic(env_params) self.critic_network2 = critic(env_params) self.advice_network1 = critic(env_params) self.advice_network2 = critic(env_params) # sync the networks across the cpus sync_networks(self.actor_network) sync_networks(self.critic_network1) sync_networks(self.critic_network2) sync_networks(self.advice_network1) sync_networks(self.advice_network2) # build up the target network # self.actor_target_network = actor(env_params) self.critic_target_network1 = critic(env_params) self.critic_target_network2 = critic(env_params) self.advice_target_network1 = critic(env_params) self.advice_target_network2 = critic(env_params) # load the weights into the target networks # self.actor_target_network.load_state_dict(self.actor_network.state_dict()) self.critic_target_network1.load_state_dict( self.critic_network1.state_dict()) self.critic_target_network2.load_state_dict( self.critic_network2.state_dict()) self.advice_target_network1.load_state_dict( self.advice_network1.state_dict()) self.advice_target_network2.load_state_dict( self.advice_network2.state_dict()) # if use gpu self.rank = MPI.COMM_WORLD.Get_rank() self.mpi_size = MPI.COMM_WORLD.Get_size() if args.cuda: device = 'cuda:{}'.format(self.rank % torch.cuda.device_count()) self.device = torch.device(device) if self.args.cuda: self.actor_network.cuda(self.device) self.critic_network1.cuda(self.device) self.critic_network2.cuda(self.device) # self.actor_target_network.cuda(self.device) self.critic_target_network1.cuda(self.device) self.critic_target_network2.cuda(self.device) self.advice_network1.cuda(self.device) self.advice_network2.cuda(self.device) self.advice_target_network1.cuda(self.device) self.advice_target_network2.cuda(self.device) # create the optimizer self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor) self.critic_optim1 = torch.optim.Adam( self.critic_network1.parameters(), lr=self.args.lr_critic) self.critic_optim2 = torch.optim.Adam( self.critic_network2.parameters(), lr=self.args.lr_critic) self.advice_optim1 = torch.optim.Adam( self.advice_network1.parameters(), lr=self.args.lr_critic) self.advice_optim2 = torch.optim.Adam( self.advice_network2.parameters(), lr=self.args.lr_critic) # create the replay buffer self.buffer = ReplayBuffer(self.env_params['obs'], self.env_params['action'], self.args.buffer_size) self.logger.setup_pytorch_saver(self.actor_network) self.obs_mean, self.obs_std = self.buffer.obs_mean, self.buffer.obs_std
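# The agent above keeps two critics (plus "advice" critics) with target copies; agents
# in the TD3/SAC family typically form the Bellman target from the minimum of the two
# target critics to reduce overestimation. A minimal PyTorch sketch of that target,
# with illustrative tensor arguments and gamma (not the class's actual update method):
import torch


def clipped_double_q_target(reward, done, target_q1, target_q2, gamma=0.98):
    # y = r + gamma * (1 - done) * min(Q1'(s', a'), Q2'(s', a'))
    with torch.no_grad():
        return reward + gamma * (1.0 - done) * torch.min(target_q1, target_q2)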
def vpg(env_config, ac_type, ac_kwargs, gamma, lam, epochs, steps_per_epoch, lr, train_v_iters, max_ep_len, logger_kwargs, seed): logger = EpochLogger(**logger_kwargs) configs = locals().copy() configs.pop("logger") logger.save_config(configs) tf.set_random_seed(seed) np.random.seed(seed) env = make_env(env_config) obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] obs_ph, a_ph, adv_ph, ret_ph, logp_old_ph = core.placeholders( obs_dim, act_dim, None, None, None) actor_critic = gaussian_mlp_actor_critic pi, logp, logp_pi, v = actor_critic(obs_ph, a_ph, **ac_kwargs) all_phs = [obs_ph, a_ph, adv_ph, ret_ph, logp_old_ph] get_action_ops = [pi, v, logp_pi] # Experience buffer buf = VPGBuffer(obs_dim, act_dim, steps_per_epoch, gamma, lam) # VPG objectives pi_loss = -tf.reduce_mean(logp * adv_ph) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute # Optimizers train_pi = tf.train.AdamOptimizer(learning_rate=lr).minimize(pi_loss) train_v = tf.train.AdamOptimizer(learning_rate=lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) def update(): buffer_data = buf.get() #util.plot_adv(data[0] * act_high, data[1], logger.output_dir + "/ep_adv%s.png" % epoch) inputs = {k: v for k, v in zip(all_phs, buffer_data)} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) sess.run(train_pi, feed_dict=inputs) # Training for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl, v_new = sess.run( [pi_loss, v_loss, approx_kl, v], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 real_action = env.action_space.default() # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={obs_ph: o.reshape(1, -1)}) buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) delta = np.exp(a[0]) delta = np.clip(delta, 0.9, 1.1) real_action = env.action_space.clip(real_action * delta) o, r, d, _ = env.step(real_action) ep_ret += r ep_len += 1 if ep_len == max_ep_len or t == steps_per_epoch - 1: last_val = sess.run(v, feed_dict={obs_ph: o.reshape(1, -1)}) #print(last_val) buf.finish_path(last_val) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 real_action = env.action_space.default() # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
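# VPGBuffer above is constructed with gamma and lam, which suggests GAE-lambda
# advantage estimation. A self-contained NumPy sketch of that computation for a single
# trajectory (an illustration of the idea, not the buffer's actual code):
import numpy as np


def gae_lambda(rewards, values, last_val, gamma=0.99, lam=0.97):
    # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t); advantages are the (gamma * lam)-
    # discounted sum of deltas, and returns are advantages plus the value baseline.
    values = np.append(values, last_val)
    deltas = rewards + gamma * values[1:] - values[:-1]
    adv = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * lam * running
        adv[t] = running
    return adv, adv + values[:-1]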
def ddpg(env_name, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, act_noise=0.1, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, test=False): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Deterministically computes actions | from policy given states. ``q`` (batch,) | Gives the current estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q_pi`` (batch,) | Gives the composition of ``q`` and | ``pi`` for states in ``x_ph``: | q(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to DDPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = gym.make(env_name), gym.make(env_name) obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) # Main outputs from computation graph with tf.variable_scope('main'): pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs) # Target networks with tf.variable_scope('target'): # Note that the action placeholder going to actor_critic here is #irrelevant, because we only need q_targ(s, pi_targ(s)). 
pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main']) print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' % var_counts) # Bellman backup for Q function backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ) # DDPG losses pi_loss = -tf.reduce_mean(q_pi) q_loss = tf.reduce_mean((q - backup)**2) # Separate train ops for pi, q pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q')) # Polyak averaging for target variables target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph }, outputs={ 'pi': pi, 'q': q }) saver = tf.train.Saver() save_path = './saved_model/' + env_name + '/test' def get_action(o, noise_scale): a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0] a += noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) def test_agent(n=10): for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, _ = test_env.step(get_action(o, 0)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) def save(saver, sess): if not os.path.exists('./saved_model/' + env_name): os.mkdir('./saved_model/' + env_name) ckpt_path = saver.save(sess, save_path) #print('Save ckpt file: {}'.format(ckpt_path)) def load(saver, sess): if os.path.exists('./saved_model/' + env_name): saver.restore(sess, save_path) print('Load model complete.') else: print('There is no saved model.') if test is False: start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy (with some noise, via act_noise). """ if t > start_steps: a = get_action(o, act_noise) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all DDPG updates at the end of the trajectory, in accordance with tuning done by TD3 paper authors. 
""" for _ in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } # Q-learning update outs = sess.run([q_loss, q, train_q_op], feed_dict) logger.store(LossQ=outs[0], QVals=outs[1]) # Policy update outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict) logger.store(LossPi=outs[0]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): #logger.save_state({'env': env}, None) save(saver, sess) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('QVals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() #save(saver, sess) else: load(saver, sess) test_logger = EpochLogger() o, r, d, ep_ret, ep_len, n = env.reset(), 0, False, 0, 0, 0 num_episodes = 100 render = True max_ep_len = 0 while n < num_episodes: if render: env.render() time.sleep(1e-3) a = get_action(o, 0) o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 if d or (ep_len == max_ep_len): test_logger.store(EpRet=ep_ret, EpLen=ep_len) print('Episode %d \t EpRet %.3f \t EpLen %d' % (n, ep_ret, ep_len)) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 n += 1 test_logger.log_tabular('EpRet', with_min_and_max=True) test_logger.log_tabular('EpLen', average_only=True) test_logger.dump_tabular()
    returns = tf.placeholder(dtype=tf.float32, shape=[None, ])
    advs = tf.placeholder(dtype=tf.float32, shape=[None, ])
    log_policy = tf.placeholder(dtype=tf.float32, shape=[None, ])
    return PlaceHolders(states=states,
                        returns=returns,
                        log_policy=log_policy,
                        advs=advs)


logger = EpochLogger()
env = gym.make('LunarLander-v2')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

storage = Storage(steps_per_epoch, action_size, state_size)
ph = create_placeholders(state_size)
actor = build_nets(ph.states, action_size)

ratio = tf.exp(actor.log_policy - ph.log_policy)
min_adv = tf.where(ph.advs > 0, (1 + clip_ratio) * ph.advs,
                   (1 - clip_ratio) * ph.advs)
pi_loss = -tf.reduce_mean(tf.minimum(ratio * ph.advs, min_adv))
v_loss = tf.reduce_mean((ph.returns - actor.baselines)**2)
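# The `returns` placeholder above is the regression target for the value baseline. A
# small NumPy sketch of computing discounted rewards-to-go for one episode (the
# discount factor `gamma` is assumed here; it is not shown in this fragment):
import numpy as np


def rewards_to_go(rewards, gamma=0.99):
    rtg = np.zeros_like(rewards, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        rtg[t] = running
    return rtg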
def ppo(env_config, ac_type, ac_kwargs, clip_ratio, epochs, steps_per_epoch,
        optimizer, lr, train_pi_iters, max_ep_len, target_kl, logger_kwargs,
        seed):
    logger = EpochLogger(**logger_kwargs)
    configs = locals().copy()
    configs.pop("logger")
    logger.save_config(configs)

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = make_env(env_config)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    act_high = env.action_space.high

    obs_ph, a_ph, adv_ph, logp_old_ph = core.placeholders(
        obs_dim, act_dim, None, None)
    all_phs = [obs_ph, a_ph, adv_ph, logp_old_ph]

    actor_critic = get_ppo_actor_critic(ac_type)
    pi, logp, logp_pi = actor_critic(obs_ph, a_ph, **ac_kwargs)

    # Experience buffer
    buf = PPOBuffer(obs_dim, act_dim, steps_per_epoch)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph,
                       (1 - clip_ratio) * adv_ph)

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(logp_old_ph - logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(-logp)  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))

    # Optimizers
    if optimizer == "adam":
        train_pi = tf.train.AdamOptimizer(learning_rate=lr).minimize(pi_loss)
    elif optimizer == "sgd":
        train_pi = tf.train.GradientDescentOptimizer(
            learning_rate=lr).minimize(pi_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    def update():
        print(sess.run(tf.trainable_variables()))
        data = buf.get()
        # util.plot_adv(data[0] * act_high, data[1], logger.output_dir + "/ep_adv%s.png" % epoch)
        inputs = {k: v for k, v in zip(all_phs, data[:4])}
        pi_l_old, ent = sess.run([pi_loss, approx_ent], feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            if kl > 1.5 * target_kl:
                logger.log('Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)

        # Log changes from update
        pi_l_new, kl, cf = sess.run([pi_loss, approx_kl, clipfrac],
                                    feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    real_action = env.action_space.default()
    o, r, d, _ = env.step(real_action)

    episode_actions = []
    episode_obs = []
    episode_actions.append(real_action)
    episode_obs.append(o)
    print(tf.trainable_variables())

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        episode_count = 0
        ep_actions = []
        for t in range(steps_per_epoch):
            a, logp_t = sess.run([pi, logp_pi],
                                 feed_dict={obs_ph: o.reshape(1, -1)})

            delta = np.exp(a[0])
            delta = np.clip(delta, 0.95, 1.05)
            real_action = env.action_space.clip(real_action * delta)
            o, r, d, _ = env.step(real_action)

            buf.store(o, a, r, logp_t)
            ep_actions.append(real_action)
            episode_actions.append(real_action)
            episode_obs.append(o)
            ep_ret += r
            ep_len += 1

            if ep_len == max_ep_len or t == steps_per_epoch - 1:
                buf.finish_path()
                logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                real_action = env.action_space.default()
                o, r, d, _ = env.step(real_action)
                util.plot_seq_obs_and_actions(
                    episode_obs, episode_actions, act_high,
                    logger.output_dir + '/episode_actions_%d_%d.png' %
                    (epoch, episode_count))
                episode_count += 1
                episode_actions = []
                episode_obs = []
                episode_actions.append(real_action)
                episode_obs.append(o)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
        util.plot_actions(ep_actions, act_high,
                          logger.output_dir + '/ep_actions%d.png' % epoch)
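# The rollout above treats the policy output as a multiplicative adjustment: delta =
# exp(a) clipped to a narrow band, then real_action = clip(real_action * delta). A
# NumPy sketch of that transformation; `low`, `high` and `delta_clip` stand in for the
# bounds that env.action_space.clip applies in the code above:
import numpy as np


def scale_action(real_action, policy_out, low, high, delta_clip=(0.95, 1.05)):
    delta = np.clip(np.exp(policy_out), *delta_clip)
    return np.clip(real_action * delta, low, high)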
def sac(env_fn,
        seed=0,
        gamma=.99,
        lam=.97,
        hidden_sizes=(200, 100),
        alpha=.0,
        v_lr=1e-3,
        q_lr=1e-3,
        pi_lr=1e-3,
        polyak=1e-2,
        epochs=50,
        steps_per_epoch=1000,
        batch_size=100,
        start_steps=1000,
        logger_kwargs=dict(),
        replay_size=int(1e6),
        max_ep_len=1000,
        save_freq=1):
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()

    # Dimensions
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n
    # act_limit = env.action_space.high[0]

    # Placeholders
    x_ph = tf.placeholder(shape=[None, obs_dim], dtype=tf.float32)
    a_ph = tf.placeholder(shape=[None, 1], dtype=tf.float32)
    x2_ph = tf.placeholder(shape=[None, obs_dim], dtype=tf.float32)
    r_ph = tf.placeholder(shape=[None], dtype=tf.float32)
    d_ph = tf.placeholder(shape=[None], dtype=tf.float32)

    # Networks
    def mlp(x, hidden_sizes=(32, ), activation=tf.tanh, output_activation=None):
        for h in hidden_sizes[:-1]:
            x = tf.layers.dense(x, units=h, activation=activation)
        return tf.layers.dense(x, units=hidden_sizes[-1],
                               activation=output_activation)

    def mlp_categorical_policy(x, a, hidden_sizes, activation,
                               output_activation, action_space):
        act_dim = action_space.n
        logits = mlp(x, list(hidden_sizes) + [act_dim], activation, None)
        pi_all = tf.nn.softmax(logits)
        logpi_all = tf.nn.log_softmax(logits)
        pi = tf.random.categorical(logits, 1)
        return pi, pi_all, logpi_all

    LOG_STD_MIN = -20
    LOG_STD_MAX = 2

    with tf.variable_scope("main"):
        activation = tf.tanh
        with tf.variable_scope("pi"):
            pi, pi_all, logpi_all = mlp_categorical_policy(
                x_ph, a_ph, hidden_sizes, activation, None, env.action_space)
        with tf.variable_scope("q1"):
            q1 = tf.squeeze(mlp(tf.concat([x_ph, a_ph], -1),
                                hidden_sizes + (act_dim, ), activation, None),
                            axis=-1)
        with tf.variable_scope("q1", reuse=True):
            q1_pi = tf.squeeze(mlp(
                tf.concat([x_ph, tf.cast(pi, tf.float32)], axis=-1),
                hidden_sizes + (act_dim, ), activation, None), axis=-1)
        with tf.variable_scope("q2"):
            q2 = tf.squeeze(mlp(tf.concat([x_ph, a_ph], -1),
                                hidden_sizes + (act_dim, ), activation, None),
                            axis=-1)
        with tf.variable_scope("q2", reuse=True):
            q2_pi = tf.squeeze(mlp(
                tf.concat([x_ph, tf.cast(pi, tf.float32)], -1),
                hidden_sizes + (act_dim, ), activation, None), axis=-1)
        with tf.variable_scope("v"):
            v = tf.squeeze(mlp(x_ph, hidden_sizes + (1, ), activation, None),
                           axis=-1)

    with tf.variable_scope("target"):
        with tf.variable_scope("v"):
            v_targ = tf.squeeze(mlp(x2_ph, hidden_sizes + (1, ), activation, None),
                                axis=-1)

    # log pi(a|s) at the sampled action, needed by the value and policy losses below
    logp_pi = tf.reduce_sum(
        tf.one_hot(tf.squeeze(pi, axis=1), depth=act_dim) * logpi_all, axis=1)

    # helpers for var count
    def get_vars(scope=''):
        return [x for x in tf.trainable_variables() if scope in x.name]

    def count_vars(scope=''):
        v = get_vars(scope)
        return sum([np.prod(var.shape.as_list()) for var in v])

    # Count variables
    var_counts = tuple(
        count_vars(scope)
        for scope in ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t v: %d, \t total: %d\n'
          % var_counts)

    # Targets
    q_backup_prestop = r_ph + gamma * (1 - d_ph) * v_targ
    v_backup_prestop = tf.minimum(q1_pi, q2_pi) - alpha * logp_pi
    q_backup = tf.stop_gradient(q_backup_prestop)
    v_backup = tf.stop_gradient(v_backup_prestop)

    # Q loss
    q1_loss = tf.reduce_mean((q1 - q_backup)**2)
    q2_loss = tf.reduce_mean((q2 - q_backup)**2)
    q_loss = q1_loss + q2_loss

    # V loss
    v_loss = tf.reduce_mean((v - v_backup)**2)

    # Policy loss
    pi_loss = tf.reduce_mean(-q1_pi + alpha * logp_pi)

    # Training ops
    v_trainop = tf.train.AdamOptimizer(v_lr).minimize(
        v_loss,
        var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/v"))
    q_trainop = tf.train.AdamOptimizer(q_lr).minimize(
        q_loss,
        var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/q"))
    pi_trainop = tf.train.AdamOptimizer(pi_lr).minimize(
        pi_loss,
        var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/pi"))

    assert polyak <= .5

    # Target update ops
    init_v_target = tf.group([
        tf.assign(v_target, v_main)
        for v_main, v_target in zip(
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/v"),
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target/v"))
    ])
    update_v_target = tf.group([
        tf.assign(v_target, (1 - polyak) * v_target + polyak * v_main)
        for v_main, v_target in zip(
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/v"),
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target/v"))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(init_v_target)

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={'x': x_ph, 'a': a_ph},
                          outputs={'pi': pi, 'q1': q1, 'q2': q2, 'v': v})

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                o, r, d, _ = test_env.step(
                    sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0][0])
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    # Buffer init
    buffer = ReplayBuffer(obs_dim, 1, replay_size)

    # Main loop
    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    for t in range(total_steps):
        if t > start_steps:
            a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0][0]
        else:
            a = env.action_space.sample()

        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1
        d = False or (ep_len == max_ep_len)  # Still needed ?
        o2 = np.squeeze(o2)

        buffer.store(o, a, r, o2, d)
        o = o2

        if d or (ep_len == max_ep_len):
            for j in range(ep_len):
                batch = buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done']
                }

                # Value gradient steps
                v_step_ops = [v_loss, v, v_trainop]
                outs = sess.run(v_step_ops, feed_dict)
                logger.store(LossV=outs[0], VVals=outs[1])

                # Q gradient steps
                q_step_ops = [q_loss, q1, q2, q_trainop]
                outs = sess.run(q_step_ops, feed_dict)
                logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2])

                # Policy gradient steps
                # TODO Add entropy logging
                pi_step_ops = [pi_loss, pi_trainop, update_v_target]
                outs = sess.run(pi_step_ops, feed_dict=feed_dict)
                logger.store(LossPi=outs[0])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0., 0

        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Saving the model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
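# v_backup_prestop above is the soft value target V(s) ~ min(Q1, Q2) - alpha * log pi,
# and q_backup_prestop is the usual Bellman target. A NumPy sketch of both quantities
# for a batch (illustrative arrays, not the graph ops above):
import numpy as np


def soft_value_target(q1_pi, q2_pi, logp_pi, alpha=0.2):
    return np.minimum(q1_pi, q2_pi) - alpha * logp_pi


def soft_q_target(rewards, dones, v_targ_next, gamma=0.99):
    # mirrors q_backup_prestop: r + gamma * (1 - d) * V_targ(s')
    return rewards + gamma * (1.0 - dones) * v_targ_next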
class Test:
    def __init__(self, args):
        self.args = args
        self.env = gym.make(args.env_name)
        self.env_params = get_env_params(self.env)
        self.video_file = 'data_test/test_video'
        self.output_dir = 'data_test'
        self.exp_name = 'test'
        self.logger = EpochLogger(output_dir=self.output_dir,
                                  exp_name=self.exp_name)
        # self.env = wrappers.Monitor(self.env, self.video_file, force=True)
        device = 'cuda' if args.cuda else 'cpu'
        self.device = torch.device(device)
        # load
        data_file = os.path.join(args.load_fold, 'vars.pkl')
        data = joblib.load(data_file)
        ## load obs_mean obs_std g_mean g_std
        self.obs_mean = data['observation_mean']
        self.obs_std = data['observation_std']
        ## load policy model
        model = {
            'ddpg': actor,
            'td3': actor,
            'sac': actor_sac,
            'gac': actor_gac
        }
        self.actor_network = model[args.alg](self.env_params).to(self.device)
        model_file = os.path.join(args.load_fold, 'pyt_save', 'model.pt')
        self.actor_network.load_state_dict(torch.load(model_file))

    def run(self):
        self._eval_agent()
        self.logger.log_tabular('EpReward')
        self.logger.log_tabular('EpCost')
        self.logger.dump_tabular()

    def _preproc_inputs(self, obs):
        obs_norm = np.clip((obs - self.obs_mean) / self.obs_std,
                           -self.args.clip_range, self.args.clip_range)
        inputs = torch.tensor(obs_norm, dtype=torch.float32).unsqueeze(0)
        if self.args.cuda:
            inputs = inputs.cuda(self.device)
        return inputs

    def _eval_agent(self):
        for _ in range(self.args.n_test_rollouts):
            obs, ep_reward, ep_cost = self.env.reset(), 0, 0
            for _ in range(self.env_params['max_timesteps']):
                if self.args.render:
                    self.env.render()
                    time.sleep(1e-3)
                with torch.no_grad():
                    input_tensor = self._preproc_inputs(obs)
                    if self.args.alg == 'gac':
                        pi = self.actor_network(input_tensor, std=0.5)
                    elif self.args.alg == 'sac':
                        pi, _ = self.actor_network(input_tensor)
                    else:
                        pi = self.actor_network(input_tensor)
                    # convert the actions
                    actions = pi.detach().cpu().numpy().squeeze()
                obs, reward, cost, info = self.env.step(actions)
                ep_reward += reward
                ep_cost += cost
            self.logger.store(EpReward=ep_reward, EpCost=ep_cost)
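# The evaluator above reads 'observation_mean' / 'observation_std' from vars.pkl and
# applies the same clip-and-normalize step used in training. A minimal sketch of the
# kind of running normalizer that typically produces such statistics (an assumption for
# illustration; the project's actual `normalizer` class is not shown here):
import numpy as np


class RunningNormalizer:
    def __init__(self, size, eps=1e-2):
        self.sum = np.zeros(size)
        self.sumsq = np.zeros(size)
        self.count = eps
        self.eps = eps

    def update(self, batch):
        # accumulate sufficient statistics from a (batch, size) array of observations
        self.sum += batch.sum(axis=0)
        self.sumsq += np.square(batch).sum(axis=0)
        self.count += batch.shape[0]

    @property
    def mean(self):
        return self.sum / self.count

    @property
    def std(self):
        var = self.sumsq / self.count - np.square(self.mean)
        return np.sqrt(np.maximum(np.square(self.eps), var))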
def ppo(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=None, target_kl=0.01, logger_kwargs=dict(), save_freq=10, TensorBoard=True, save_nn=True, save_every=1000, load_latest=False, load_custom=False, LoadPath=None, RTA_type=None): """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) 
train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. TensorBoard (bool): True plots to TensorBoard, False does not save_nn (bool): True saves neural network data, False does not save_every (int): How often to save neural network load_latest (bool): Load last saved neural network data before training load_custom (bool): Load custom neural network data file before training LoadPath (str): Path for custom neural network data file RTA_type (str): RTA framework, either 'CBF', 'SVL', 'ASIF', or 'SBSF' """ # Special function to avoid certain slowdowns from PyTorch + MPI combo. setup_pytorch_for_mpi() # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Instantiate environment env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Random seed for each cpu seed += 1 * proc_id() env.seed(seed) # Create actor-critic module ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) # Load model if True if load_latest: models = glob.glob(f"{PATH}/models/PPO/*") LoadPath = max(models, key=os.path.getctime) ac.load_state_dict(torch.load(LoadPath)) elif load_custom: ac.load_state_dict(torch.load(LoadPath)) # Sync params across processes sync_params(ac) # Count variables var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Set up function for computing PPO policy loss def compute_loss_pi(data): obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data[ 'logp'] # Policy loss pi, logp = ac.pi(obs, act) ratio = torch.exp(logp - logp_old) clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv loss_pi = -(torch.min(ratio * adv, clip_adv)).mean() # Useful extra info approx_kl = (logp_old - logp).mean().item() ent = pi.entropy().mean().item() clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio) clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item() pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac) return loss_pi, pi_info # Set up function for computing value loss def compute_loss_v(data): obs, ret = data['obs'], data['ret'] return ((ac.v(obs) - ret)**2).mean() # Set up optimizers for policy and value function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(): data = buf.get() pi_l_old, pi_info_old = compute_loss_pi(data) pi_l_old = pi_l_old.item() v_l_old = compute_loss_v(data).item() # Train policy with multiple steps of gradient descent for i in range(train_pi_iters): pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) kl = mpi_avg(pi_info['kl']) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break loss_pi.backward() mpi_avg_grads(ac.pi) # average grads across MPI processes pi_optimizer.step() logger.store(StopIter=i) # Value function learning for i in range(train_v_iters): vf_optimizer.zero_grad() loss_v = compute_loss_v(data) loss_v.backward() mpi_avg_grads(ac.v) # average grads across MPI processes vf_optimizer.step() # Log changes from update kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf'] logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old)) # Import RTA if RTA_type == 'CBF': from CBF_for_speed_limit import RTA elif RTA_type == 'SVL': from Simple_velocity_limit import RTA elif RTA_type == 'ASIF': from IASIF import RTA elif RTA_type == 'SBSF': from ISimplex import RTA # Call RTA, define action conversion if RTA_type != 'off': env.RTA_reward = RTA_type rta = RTA(env) def RTA_act(obs, act): act = np.clip(act, -env.force_magnitude, env.force_magnitude) x0 = [obs[0], obs[1], 0, obs[2], obs[3], 0] u_des = np.array([[act[0]], [act[1]], [0]]) u = rta.main(x0, u_des) new_act = [u[0, 0], u[1, 0]] if np.sqrt((act[0] - new_act[0])**2 + (act[1] - new_act[1])**2) < 0.0001: env.RTA_on = False else: env.RTA_on = True return new_act # Prepare for interaction with environment start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 total_episodes = 0 RTA_percent = 0 # Create TensorBoard file if True if TensorBoard and proc_id() == 0: if env_name == 'spacecraft-docking-continuous-v0' or env_name == 'spacecraft-docking-v0': Name = f"{PATH}/runs/Spacecraft-docking-" + current_time elif env_name == 'dubins-aircraft-v0' or env_name == 'dubins-aircraft-continuous-v0': Name = f"{PATH}/runs/Dubins-aircraft-" + current_time writer = SummaryWriter(Name) # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): batch_ret = [] # Track episode returns batch_len = [] # Track episode lengths batch_RTA_percent = [] # Track precentage of time RTA is on env.success = 0 # Track episode success rate env.failure = 0 # Track episode failure rate env.crash = 0 # Track episode crash rate env.overtime = 0 # Track episode over max time/control rate episodes = 0 # Track episodes delta_v = [] # Track episode total delta v for t in range(local_steps_per_epoch): a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32)) if RTA_type != 'off': # If RTA is on, get RTA action RTA_a = RTA_act(o, a) if env.RTA_on: RTA_percent += 1 next_o, r, d, _ = env.step(RTA_a) else: # If RTA is off, pass through desired action next_o, r, d, _ = env.step(a) if env_name == 'spacecraft-docking-continuous-v0' or env_name == 'spacecraft-docking-v0': over_max_vel, _, _ = env.check_velocity(a[0], a[1]) if over_max_vel: RTA_percent += 1 ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, v, logp) logger.store(VVals=v) # Update obs (critical!) o = next_o timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t == local_steps_per_epoch - 1 if terminal or epoch_ended: if epoch_ended and not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' 
% ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32)) else: v = 0 buf.finish_path(v) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) batch_ret.append(ep_ret) batch_len.append(ep_len) episodes += 1 if env_name == 'spacecraft-docking-continuous-v0' or env_name == 'spacecraft-docking-v0': delta_v.append(env.control_input / env.mass_deputy) batch_RTA_percent.append(RTA_percent / ep_len * 100) RTA_percent = 0 o, ep_ret, ep_len = env.reset(), 0, 0 total_episodes += episodes # Track success, failure, crash, overtime rates if episodes != 0: success_rate = env.success / episodes failure_rate = env.failure / episodes crash_rate = env.crash / episodes overtime_rate = env.overtime / episodes else: success_rate = 0 failure_rate = 0 crash_rate = 0 overtime_rate = 0 raise ( "No completed episodes logging will break [increase steps per epoch]" ) # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() # Average data over all cpus avg_batch_ret = mpi_avg(np.mean(batch_ret)) avg_batch_len = mpi_avg(np.mean(batch_len)) avg_success_rate = mpi_avg(success_rate) avg_failure_rate = mpi_avg(failure_rate) avg_crash_rate = mpi_avg(crash_rate) avg_overtime_rate = mpi_avg(overtime_rate) if env_name == 'spacecraft-docking-continuous-v0' or env_name == 'spacecraft-docking-v0': avg_delta_v = mpi_avg(np.mean(delta_v)) avg_RTA_percent = mpi_avg(np.mean(batch_RTA_percent)) if proc_id() == 0: # Only on one cpu # Plot to TensorBoard if True, only on one cpu if TensorBoard: writer.add_scalar('Return', avg_batch_ret, epoch) writer.add_scalar('Episode-Length', avg_batch_len * env.tau, epoch) writer.add_scalar('Success-Rate', avg_success_rate * 100, epoch) writer.add_scalar('Failure-Rate', avg_failure_rate * 100, epoch) writer.add_scalar('Crash-Rate', avg_crash_rate * 100, epoch) writer.add_scalar('Overtime-Rate', avg_overtime_rate * 100, epoch) if env_name == 'spacecraft-docking-continuous-v0' or env_name == 'spacecraft-docking-v0': writer.add_scalar('Delta-V', avg_delta_v, epoch) writer.add_scalar('RTA-on-percent', avg_RTA_percent, epoch) # Save neural network if true, can change to desired location if save_nn and epoch % save_every == 0 and epoch != 0: if not os.path.isdir(f"{PATH}/models"): os.mkdir(f"{PATH}/models") if not os.path.isdir(f"{PATH}/models/PPO"): os.mkdir(f"{PATH}/models/PPO") if env_name == 'spacecraft-docking-continuous-v0' or env_name == 'spacecraft-docking-v0': Name2 = f"{PATH}/models/PPO/Spacecraft-docking-" + current_time + f"-epoch{epoch}.dat" elif env_name == 'dubins-aircraft-v0' or env_name == 
'dubins-aircraft-continuous-v0': Name2 = f"{PATH}/models/PPO/Dubins-aircraft-" + current_time + f"-epoch{epoch}.dat" torch.save(ac.state_dict(), Name2) # Average episodes per hour, episode per epoch ep_hr = mpi_avg(total_episodes) * args.cpu / (time.time() - start_time) * 3600 ep_Ep = mpi_avg(total_episodes) * args.cpu / (epoch + 1) # Plot on one cpu if proc_id() == 0: # Save neural network if save_nn: if not os.path.isdir(f"{PATH}/models"): os.mkdir(f"{PATH}/models") if not os.path.isdir(f"{PATH}/models/PPO"): os.mkdir(f"{PATH}/models/PPO") if env_name == 'spacecraft-docking-continuous-v0' or env_name == 'spacecraft-docking-v0': Name2 = f"{PATH}/models/PPO/Spacecraft-docking-" + current_time + "-final.dat" elif env_name == 'dubins-aircraft-v0' or env_name == 'dubins-aircraft-continuous-v0': Name2 = f"{PATH}/models/PPO/Dubins-aircraft-" + current_time + "-final.dat" torch.save(ac.state_dict(), Name2) # Print statistics on episodes print( f"Episodes per hour: {ep_hr:.0f}, Episodes per epoch: {ep_Ep:.0f}, Epochs per hour: {(epoch+1)/(time.time()-start_time)*3600:.0f}" )
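# Model persistence above is plain torch.save / load_state_dict on the actor-critic's
# state_dict. A compact helper pair illustrating the same pattern (the path argument is
# a placeholder, not the run-specific names built above):
import os
import torch


def save_checkpoint(ac, path):
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    torch.save(ac.state_dict(), path)


def load_checkpoint(ac, path):
    ac.load_state_dict(torch.load(path))
    return ac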
def ppo(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=2000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): global RENDER, BONUS """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. 
lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ # Reachability Trainer r_network = R_Network().to(device) trainer = R_Network_Trainer(r_network=r_network, exp_name="random1") episodic_memory = EpisodicMemory(embedding_shape=[EMBEDDING_DIM]) # Special function to avoid certain slowdowns from PyTorch + MPI combo. setup_pytorch_for_mpi() # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Random seed seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env_fn() observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=(3, 64, 64)) action_space = gym.spaces.Discrete(3) obs_dim = observation_space.shape act_dim = action_space.shape # Create actor-critic module ac = actor_critic(observation_space, action_space, **ac_kwargs) # Sync params across processes sync_params(ac) # Count variables var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Set up function for computing PPO policy loss def compute_loss_pi(data): obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data[ 'logp'] # Policy loss pi, logp = ac.pi(obs, act) ratio = torch.exp(logp - logp_old) clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv loss_pi = -(torch.min(ratio * adv, clip_adv)).mean() # Useful extra info approx_kl = (logp_old - logp).mean().item() ent = pi.entropy().mean().item() clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio) clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item() pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac) return loss_pi, pi_info # Set up function for computing value loss def compute_loss_v(data): obs, ret = data['obs'], data['ret'] return ((ac.v(obs) - ret)**2).mean() # Set up optimizers for policy and value function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(): data = buf.get() pi_l_old, pi_info_old = compute_loss_pi(data) pi_l_old = pi_l_old.item() v_l_old = compute_loss_v(data).item() # Train policy with multiple steps of gradient descent for i in range(train_pi_iters): pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) # Entropy bonus loss_pi += pi_info['ent'] * 0.0021 kl = mpi_avg(pi_info['kl']) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break loss_pi.backward() mpi_avg_grads(ac.pi) # average grads across MPI processes pi_optimizer.step() logger.store(StopIter=i) # Value function learning for i in range(train_v_iters): vf_optimizer.zero_grad() loss_v = compute_loss_v(data) loss_v.backward() mpi_avg_grads(ac.v) # average grads across MPI processes vf_optimizer.step() # Log changes from update kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf'] logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old)) # Prepare for interaction with environment start_time = time.time() o, _ = env.reset() env.render() o = o.astype(np.float32) / 255. o = o.transpose(2, 0, 1) ep_ret, ep_len = 0, 0 indices = [] # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): state = torch.as_tensor(o[np.newaxis, ...], dtype=torch.float32) a, v, logp = ac.step(state) next_o, r, d, info = env.step(a) next_o = next_o.astype(np.float32) / 255. d = ep_len == max_ep_len trainer.store_new_state([next_o], [r], [d], [None]) r_network.eval() with torch.no_grad(): state_embedding = r_network.embed_observation( torch.FloatTensor([o]).to(device)).cpu().numpy()[0] aggregated, _, _ = similarity_to_memory( state_embedding, episodic_memory, r_network) curiosity_bonus = 0.03 * (0.5 - aggregated) if BONUS: print(f'{curiosity_bonus:.3f}') if curiosity_bonus > 0 or len(episodic_memory) == 0: idx = episodic_memory.store_new_state(state_embedding) x = int(env.map_scale * info['pose']['x']) y = int(env.map_scale * info['pose']['y']) if idx == len(indices): indices.append((x, y)) else: indices[idx] = (x, y) r_network.train() next_o = next_o.transpose(2, 0, 1) ep_ret += r + curiosity_bonus ep_len += 1 # save and log buf.store(o, a, r, v, logp) logger.store(VVals=v) k = cv2.waitKey(1) if k == ord('s'): RENDER = 1 - RENDER elif k == ord('b'): BONUS = 1 - BONUS if RENDER: env.info['map'] = cv2.flip(env.info['map'], 0) for index in indices: cv2.circle(env.info['map'], index, 3, (0, 0, 255), -1) env.info['map'] = cv2.flip(env.info['map'], 0) env.render() # Update obs (critical!) o = next_o timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t == local_steps_per_epoch - 1 if terminal or epoch_ended: if epoch_ended and not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: state = torch.as_tensor(o[np.newaxis, ...], dtype=torch.float32) _, v, _ = ac.step(state) else: v = 0 buf.finish_path(v) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) print(ep_ret, ep_len, len(episodic_memory)) ep_ret, ep_len = 0, 0 o, _ = env.reset() o = o.astype(np.float32) / 255. o = o.transpose(2, 0, 1) episodic_memory.reset() indices = [] # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! 
if epoch > 4: update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() else: buf.get()
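# Note on the entropy term in update() above: pi_info['ent'] is a detached Python
# float (it comes from .item()), so adding it to loss_pi shifts the reported number
# but contributes nothing to the gradient. A minimal sketch (not the code above) of
# a clipped PPO policy loss with an entropy regularizer that does reach the gradient;
# the Categorical policy and the 0.01 coefficient are illustrative assumptions.
import torch
from torch.distributions import Categorical

def clipped_pi_loss_with_entropy(logits, act, adv, logp_old, clip_ratio=0.2, ent_coef=0.01):
    pi = Categorical(logits=logits)              # current policy
    logp = pi.log_prob(act)                      # log-prob of the actions actually taken
    ratio = torch.exp(logp - logp_old)           # pi(a|s) / pi_old(a|s)
    clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
    surrogate = torch.min(ratio * adv, clip_adv).mean()
    entropy = pi.entropy().mean()                # kept in the graph, unlike .item()
    return -(surrogate + ent_coef * entropy)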
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, trials_per_epoch=2500, steps_per_trial=100, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=1000, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph # x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) x_ph = tf.placeholder(dtype=tf.float32, shape=(None, None, 1), name='x_ph') a_ph = tf.placeholder(dtype=tf.int32, shape=(None, None), name='a_ph') # adv_ph, ret_ph, logp_old_ph, rew_ph = core.placeholders(None, None, None, 1) adv_ph = tf.placeholder(dtype=tf.float32, shape=(None, None), name='adv_ph') ret_ph = tf.placeholder(dtype=tf.float32, shape=(None, None), name='ret_ph') logp_old_ph = tf.placeholder(dtype=tf.float32, shape=(None, None), name='logp_old_ph') rew_ph = tf.placeholder(dtype=tf.float32, shape=(None, None, 1), name='rew_ph') pi_state_ph = tf.placeholder(dtype=tf.float32, shape=(None, NUM_GRU_UNITS), name='pi_state_ph') v_state_ph = tf.placeholder(dtype=tf.float32, shape=(None, NUM_GRU_UNITS), name='v_state_ph') # Initialize rnn states for pi and v # Main outputs from computation graph pi, logp, logp_pi, v, new_pi_state, new_v_state = actor_critic( x_ph, a_ph, rew_ph, pi_state_ph, v_state_ph, NUM_GRU_UNITS, action_space=env.action_space) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph, rew_ph] # Every step, get: action, value, and logprob and reward get_action_ops = [pi, v, logp_pi, new_pi_state, new_v_state] # Experience buffer steps_per_epoch = trials_per_epoch * steps_per_trial local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer( learning_rate=pi_lr).minimize(pi_loss - 0.01 * approx_ent) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) # tf.reset_default_graph() # restore_tf_graph(sess, '..//data//ppo//ppo_s0//simple_save') def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} inputs[pi_state_ph] = np.zeros((trials_per_epoch, NUM_GRU_UNITS)) inputs[v_state_ph] = np.zeros((trials_per_epoch, NUM_GRU_UNITS)) pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) print(pi_l_old, v_l_old) # Training for i in range(train_pi_iters): # print(f'pi:{i}') _, kl = sess.run([train_pi, approx_kl], 
feed_dict=inputs) # print(sess.run(pi_loss, feed_dict=inputs)) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' % i) break logger.store(StopIter=i) for _ in range(train_v_iters): # print(f'v:{_}') sess.run(train_v, feed_dict=inputs) # Log changes from update import datetime print(f'finish one batch training at {datetime.datetime.now()}') pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for trial in range(trials_per_epoch): print(f'trial: {trial}') old_a = np.array([0]).reshape(1, 1) old_r = np.array([0]).reshape((1, 1, 1)) means = env.sample_tasks(1)[0] action_dict = defaultdict(int) for i in range(env.action_space.n): action_dict[i] = 0 env.reset_task_simple(means) task_avg = 0.0 pi_state_t = np.zeros((1, NUM_GRU_UNITS)) v_state_t = np.zeros((1, NUM_GRU_UNITS)) for step in range(steps_per_trial): a, v_t, logp_t, pi_state_t, v_state_t = sess.run( get_action_ops, feed_dict={ x_ph: o.reshape(1, 1, -1), a_ph: old_a, rew_ph: old_r, pi_state_ph: pi_state_t, v_state_ph: v_state_t }) # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) try: o, r, d, _ = env.step(a[0][0]) except: print(a) raise AssertionError action_dict[a[0][0]] += 1 old_a = np.array(a).reshape(1, 1) old_r = np.array([r]).reshape(1, 1, 1) ep_ret += r task_avg += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (step == local_steps_per_epoch - 1): if not (terminal): print( 'Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # logger.log_tabular('Epoch', epoch) # logger.log_tabular('EpRet', with_min_and_max=True) # logger.log_tabular('Means', means) # logger.dump_tabular() print(f'avg in trial {trial}: {task_avg / steps_per_trial}') print(f'Means in trial {trial}: {means}') print(action_dict) # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # saved_path = saver.save(sess, f"/tmp/model_epoch{epoch}.ckpt") # print(f'Model saved in {saved_path}') # Perform PPO update! update() logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
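# The PPOBuffer used by these implementations is assumed to compute GAE-lambda
# advantages and discounted rewards-to-go when a path is finished; a minimal NumPy
# sketch of that computation (function and variable names here are illustrative,
# not the buffer's actual API).
import numpy as np

def gae_advantages_and_returns(rews, vals, last_val, gamma=0.99, lam=0.97):
    """rews, vals: arrays for one trajectory; last_val bootstraps a cut-off path."""
    rews = np.append(np.asarray(rews, dtype=np.float64), last_val)
    vals = np.append(np.asarray(vals, dtype=np.float64), last_val)
    deltas = rews[:-1] + gamma * vals[1:] - vals[:-1]         # TD residuals
    adv = np.zeros_like(deltas)
    ret = np.zeros_like(deltas)
    running_adv, running_ret = 0.0, last_val
    for t in reversed(range(len(deltas))):
        running_adv = deltas[t] + gamma * lam * running_adv   # GAE-lambda advantage
        running_ret = rews[t] + gamma * running_ret           # discounted rewards-to-go
        adv[t] = running_adv
        ret[t] = running_ret
    return adv, ret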
def td3( env_fn, actor_critic=a2c, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=.99, polyak=.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, act_noise=.1, target_noise=.2, noise_clip=.5, policy_delay=2, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): logger = EpochLogger( **logger_kwargs) logger.save_config( locals()) tf.set_random_seed(seed) np.random.seed( seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping act_limit = env.action_space.high[0] # Share action space info with the actor-critic ac_kwargs['action_space'] = env.action_space x_ph, a_ph, x2_ph, r_ph, d_ph = \ tf.placeholder( name='x_ph', shape=(None, obs_dim), dtype=tf.float32), \ tf.placeholder( name='a_ph', shape=(None, act_dim), dtype=tf.float32), \ tf.placeholder( name='x2_ph', shape=(None, obs_dim), dtype=tf.float32),\ tf.placeholder( name='r_ph', shape=(None), dtype=tf.float32), \ tf.placeholder( name='d_ph', shape=(None), dtype=tf.float32) # Actor policy and value with tf.variable_scope('main'): pi, q1, q2, q1_pi = actor_critic( x_ph, a_ph, **ac_kwargs) # This seems a bit memory inefficient: what happens to the q values created # along with the target policy? And to the policy created along with the q targets? # They are not referenced, but still declared, at the cost of GPU memory # Target policy with tf.variable_scope( 'target'): pi_targ, _, _, _ = actor_critic(x2_ph, a_ph, **ac_kwargs) # Target Q networks with tf.variable_scope( 'target', reuse=True): epsilon = tf.random_normal( tf.shape( pi_targ), stddev=target_noise) epsilon = tf.clip_by_value( epsilon, -noise_clip, noise_clip) a2 = pi_targ + epsilon a2 = tf.clip_by_value( a2, -act_limit, act_limit) # Target Q-Values using actions from target policy _, q1_targ, q2_targ, _ = actor_critic(x2_ph, a2, **ac_kwargs) replaybuffer = ReplayBuffer( obs_dim, act_dim, size=replay_size) # helpers for var count def get_vars(scope=''): return [x for x in tf.trainable_variables() if scope in x.name] def count_vars(scope=''): v = get_vars(scope) return sum([np.prod(var.shape.as_list()) for var in v]) # Count variables var_counts = tuple( count_vars( scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main']) print('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n' % var_counts) # Clipped Double Q-Learning Bellman backup min_q_targ = tf.minimum( q1_targ, q2_targ) backup = tf.stop_gradient( r_ph + gamma * (1 -d_ph) * min_q_targ) # TD3 Losses pi_loss = - tf.reduce_mean( q1_pi) q1_loss = tf.reduce_mean( (q1 - backup)**2) q2_loss = tf.reduce_mean( (q2 - backup)**2) q_loss = q1_loss + q2_loss # Training ops pi_train = tf.train.AdamOptimizer(pi_lr).minimize( pi_loss) q_train = tf.train.AdamOptimizer(q_lr).minimize( q_loss) # Polyak-averaged target update target_update = tf.group( [ tf.assign( v_targ, polyak * v_targ + (1-polyak) * v_main) for v_main, v_targ in zip( get_vars('main'), get_vars('target'))]) target_init = tf.group( [ tf.assign( v_targ, v_main) for v_targ, v_main in zip( get_vars('target'), get_vars('main'))]) sess = tf.Session() sess.run( tf.global_variables_initializer()) sess.run( target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'pi': pi, 'q1': q1, 'q2': q2}) def get_action( o, noise_scale): a = sess.run( pi, feed_dict={ x_ph: o.reshape(1,-1)}) a += noise_scale * np.random.randn( act_dim) return np.clip( a, -act_limit, act_limit) def test_agent( n=10): for j in range(
n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0 ,0 while not ( d or (ep_len == max_ep_len)): o, r, d, _ = test_env.step( get_action( o, 0)) ep_ret += r ep_len += 1 logger.store( TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0 , 0 total_steps = steps_per_epoch * epochs # Main loop for t in range( total_steps): if t > start_steps: a = get_action( o, act_noise) else: a = env.action_space.sample() o2, r, d, _ = env.step( a) ep_ret += r ep_len += 1 d = False or ( ep_len == max_ep_len) o2 = np.squeeze( o2) # print( "O2: ", o2) replaybuffer.store( o, a, r, o2, d) o = o2 if d or ( ep_len == max_ep_len): for j in range( ep_len): batch = replaybuffer.sample_batch( batch_size) feed_dict = {x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } q_step_ops = [q_loss, q1, q2, q_train] outs = sess.run( q_step_ops, feed_dict) logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2]) if j % policy_delay == 0: outs = sess.run( [pi_loss, pi_train, target_update], feed_dict) logger.store( LossPi=outs[0]) logger.store( EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Saving the model if (epoch % save_freq == 0) or ( epoch == epochs - 1): logger.save_state({'env': env}, None) test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular()
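# A hypothetical way to launch the td3 routine above; the environment name, the
# epoch budget and the logger directory are illustrative assumptions, not values
# taken from this repository.
import gym

if __name__ == '__main__':
    td3(lambda: gym.make('Pendulum-v0'),
        epochs=10,
        steps_per_epoch=5000,
        logger_kwargs=dict(output_dir='data/td3_pendulum', exp_name='td3_pendulum'))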
def trpo(env_fn, actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=.99, delta=.01, vf_lr=1e-3, train_v_iters=80, damping_coeff=.1, cg_iters=10, backtrack_iters=10, backtrack_coeff=.8, lam=.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10, algo="trpo"): # LOgger tools logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Seed inits seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) # Environment recreation env = env_fn() # Getting obs dims obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] ac_kwargs['action_space'] = env.action_space # Placeholders x_ph, a_ph = tf.placeholder( name="x_ph", shape=[None, obs_dim], dtype=tf.float32), \ tf.placeholder( name="a_ph", shape=[None, act_dim], dtype=tf.float32) adv_ph, ret_ph, logp_old_ph = tf.placeholder( name="adv_ph", shape=[None], dtype=tf.float32), \ tf.placeholder( name="ret_ph", shape=[None], dtype=tf.float32), \ tf.placeholder( name="logp_old_ph", shape=[None], dtype=tf.float32) pi, logp, logp_pi, info, info_phs, d_kl, v = actor_critic( x_ph, a_ph, **ac_kwargs) def keys_as_sorted_list(dict): return sorted(list(dict.keys())) def values_as_sorted_list(dict): return [dict[k] for k in keys_as_sorted_list(dict)] all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph ] + values_as_sorted_list(info_phs) get_action_ops = [pi, v, logp_pi] + values_as_sorted_list(info) # Experience buffer init local_steps_per_epoch = int(steps_per_epoch / num_procs()) info_shapes = {k: v.shape.as_list()[1:] for k, v in info_phs.items()} buf = GAEBuffer(obs_dim, act_dim, local_steps_per_epoch, info_shapes, gamma, lam) # Count variables def get_vars(scope=''): return [x for x in tf.trainable_variables() if scope in x.name] def count_vars(scope=''): v = get_vars(scope) return sum([np.prod(var.shape.as_list()) for var in v]) var_counts = tuple(count_vars(scope) for scope in ["pi", "v"]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # TRPO Losses ratio = tf.exp(logp - logp_old_ph) pi_loss = -tf.reduce_mean(ratio * adv_ph) v_loss = tf.reduce_mean((ret_ph - v)**2) # Optimizer for value function train_vf = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) # CG solver requirements pi_params = get_vars("pi") # Some helpers def flat_concat(xs): return tf.concat([tf.reshape(x, (-1, )) for x in xs], axis=0) def flat_grad(f, params): return flat_concat(tf.gradients(xs=params, ys=f)) def hessian_vector_product(f, params): g = flat_grad(f, params) x = tf.placeholder(tf.float32, shape=g.shape) return x, flat_grad(tf.reduce_sum(g * x), params) def assign_params_from_flat(x, params): flat_size = lambda p: int(np.prod(p.shape.as_list()) ) # the 'int' is important for scalars splits = tf.split(x, [flat_size(p) for p in params]) new_params = [ tf.reshape(p_new, p.shape) for p, p_new in zip(params, splits) ] return tf.group( [tf.assign(p, p_new) for p, p_new in zip(params, new_params)]) gradient = flat_grad(pi_loss, pi_params) v_ph, hvp = hessian_vector_product(d_kl, pi_params) if damping_coeff > 0: hvp += damping_coeff * v_ph # Symbols for getting and setting params get_pi_params = flat_concat(pi_params) set_pi_params = assign_params_from_flat(v_ph, pi_params) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def cg(Ax, b): x = np.zeros_like(b) r = b.copy() p = r.copy() r_dot_old = np.dot(r, r) for _ 
in range(cg_iters): z = Ax(p) alpha = r_dot_old / (np.dot(p, z) + EPS) x += alpha * p r -= alpha * z r_dot_new = np.dot(r, r) p = r + (r_dot_new / r_dot_old) * p r_dot_old = r_dot_new return x def update(): # Prepare hessian func, gradient eval # Always so elegant haha inputs = {k: v for k, v in zip(all_phs, buf.get())} def mpi_avg(x): """Average a scalar or vector over MPI processes.""" return mpi_sum(x) / num_procs() Hx = lambda x: mpi_avg(sess.run(hvp, feed_dict={**inputs, v_ph: x})) g, pi_l_old, v_l_old = sess.run([gradient, pi_loss, v_loss], feed_dict=inputs) g, pi_l_old = mpi_avg(g), mpi_avg(pi_l_old) # Core calculations for TRPO or NPG x = cg(Hx, g) alpha = np.sqrt(2 * delta / (np.dot(x, Hx(x)) + EPS)) # OK old_params = sess.run(get_pi_params) def set_and_eval(step): sess.run(set_pi_params, feed_dict={v_ph: old_params - alpha * x * step}) return mpi_avg(sess.run([d_kl, pi_loss], feed_dict=inputs)) if algo == 'npg': # npg has no backtracking or hard kl constraint enforcement kl, pi_l_new = set_and_eval(step=1.) elif algo == "trpo": for j in range(backtrack_iters): kl, pi_l_new = set_and_eval(step=backtrack_coeff**j) if kl <= delta and pi_l_new <= pi_l_old: logger.log( 'Accepting new params at step %d of line search.' % j) logger.store(BacktrackIters=j) break if j == backtrack_iters - 1: logger.log('Line search failed! Keeping old params.') logger.store(BacktrackIters=j) kl, pi_l_new = set_and_eval(step=0.) # Value function updates for _ in range(train_v_iters): sess.run(train_vf, feed_dict=inputs) v_l_new = sess.run(v_loss, feed_dict=inputs) # Log changes from update logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): agent_outs = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) a, v_t, logp_t, info_t = agent_outs[0][0], agent_outs[ 1], agent_outs[2], agent_outs[3:] # Save and log buf.store(o, a, r, v_t, logp_t, info_t) logger.store(VVals=v_t) o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not terminal: print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) last_val = r if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform TRPO or NPG update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('KL', average_only=True) if algo == 'trpo': logger.log_tabular('BacktrackIters', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
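# Quick self-contained check of the conjugate-gradient iteration used above: solve
# A x = b for a small random symmetric positive-definite A (standing in for the
# Hessian-vector product) and compare against a dense solve. The matrix, sizes and
# epsilon below are made up for illustration.
import numpy as np

def cg_check(n=8, iters=25, eps=1e-8, seed=0):
    rng = np.random.RandomState(seed)
    m = rng.randn(n, n)
    A = m @ m.T + n * np.eye(n)                  # SPD, so CG applies
    b = rng.randn(n)
    x = np.zeros_like(b)
    r, p = b.copy(), b.copy()
    r_dot_old = r @ r
    for _ in range(iters):                       # same recurrence as cg() above
        z = A @ p
        alpha = r_dot_old / (p @ z + eps)
        x += alpha * p
        r -= alpha * z
        r_dot_new = r @ r
        p = r + (r_dot_new / r_dot_old) * p
        r_dot_old = r_dot_new
    return np.max(np.abs(x - np.linalg.solve(A, b)))
# cg_check() should be near machine precision: CG converges in at most n steps
# on an SPD system in exact arithmetic.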
def iac(env_config, ac_type, ac_kwargs, rb_type, rb_kwargs, gamma, lr, polyak, batch_size, epochs, start_steps, steps_per_epoch, inc_ep, max_ep_len, test_max_ep_len, number_of_tests_per_epoch, q_pi_sample_size, z_dim, z_type, act_noise, test_without_state, logger_kwargs, seed): logger = EpochLogger(**logger_kwargs) configs = locals().copy() configs.pop("logger") logger.save_config(configs) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = make_env(env_config), make_env(env_config) obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] act_high = env.action_space.high # Inputs to computation graph x_ph, a_ph, z_ph, x2_ph, r_ph, d_ph = core.placeholders( obs_dim, act_dim, z_dim, obs_dim, None, None) actor_critic = core.get_iac_actor_critic(ac_type) # Main outputs from computation graph with tf.variable_scope('main'): pi, q1, q2, q1_pi, q2_pi, v = actor_critic(x_ph, a_ph, z_ph, **ac_kwargs) # Target networks with tf.variable_scope('target'): _, _, _, _, _, v_targ = actor_critic(x2_ph, a_ph, z_ph, **ac_kwargs) # Experience buffer RB = get_replay_buffer(rb_type) replay_buffer = RB(obs_dim, act_dim, **rb_kwargs) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main/v', 'main']) print( '\nNumber of parameters: \t pi: %d, \t q: %d, \t v: %d, \t total: %d\n' % var_counts) # Bellman backup for Q and V function q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * v_targ) min_q_pi = tf.minimum(q1_pi, q2_pi) v_backup = tf.stop_gradient(min_q_pi) # TD3 losses pi_loss = -tf.reduce_mean(q1_pi) q1_loss = 0.5 * tf.reduce_mean((q1 - q_backup)**2) q2_loss = 0.5 * tf.reduce_mean((q2 - q_backup)**2) v_loss = 0.5 * tf.reduce_mean((v - v_backup)**2) value_loss = q1_loss + q2_loss + v_loss # Separate train ops for pi, q policy_optimizer = tf.train.AdamOptimizer(learning_rate=lr) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) train_policy_op = policy_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) if ac_kwargs["pi_separate"]: train_policy_emb_op = policy_optimizer.minimize( pi_loss, var_list=get_vars('main/pi/emb')) train_policy_d_op = policy_optimizer.minimize( pi_loss, var_list=get_vars('main/pi/d')) train_value_op = value_optimizer.minimize(value_loss, var_list=get_vars('main/q') + get_vars('main/v')) # Polyak averaging for target variables target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) def sample_z(size): if z_type == "uniform": return np.random.random_sample(size=size) elif z_type == "gaussian": return np.random.normal(size=size) else: raise Exception("z_type error") def get_action(o, noise_scale): pi_a = sess.run(pi, feed_dict={ x_ph: o.reshape(1, -1), z_ph: sample_z((1, z_dim)) })[0] pi_a += noise_scale * np.random.randn(act_dim) pi_a = np.clip(pi_a, 0, 1) real_a = pi_a * act_high return pi_a, real_a def test_agent(n=10): test_actions = [] for j in range(n): test_actions_ep = [] o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == test_max_ep_len)): # Take deterministic actions at test time (noise_scale=0) if test_without_state: _, real_a = get_action(np.zeros(o.shape), 0) else: _, real_a = get_action(o, 0) 
test_actions_ep.append(real_a) o, r, d, _ = test_env.step(real_a) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) test_actions.append(test_actions_ep) return test_actions start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs rewards = [] rets = [] test_rets = [] max_ret = None # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy (with some noise, via act_noise). """ if t > start_steps: pi_a, real_a = get_action(o, act_noise) else: real_a = env.action_space.sample() pi_a = real_a / act_high # Step the env o2, r, d, _ = env.step(real_a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, pi_a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): for _ in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } feed_dict[z_ph] = sample_z((batch_size, z_dim)) # Policy Learning update for key in feed_dict: feed_dict[key] = np.repeat(feed_dict[key], q_pi_sample_size, axis=0) feed_dict[z_ph] = sample_z( (batch_size * q_pi_sample_size, z_dim)) if ac_kwargs["pi_separate"]: if len(rewards) % 2 == 0: outs = sess.run([pi_loss, train_policy_emb_op], feed_dict) else: outs = sess.run([pi_loss, train_policy_d_op], feed_dict) else: outs = sess.run([pi_loss, train_policy_op], feed_dict) logger.store(LossPi=outs[0]) # Q-learning update outs = sess.run([q1_loss, v_loss, q1, v, train_value_op], feed_dict) logger.store(LossQ=outs[0], LossV=outs[1], ValueQ=outs[2], ValueV=outs[3]) logger.store(EpRet=ep_ret, EpLen=ep_len) rewards.append(ep_ret) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # Test the performance of the deterministic version of the agent. test_actions = test_agent(number_of_tests_per_epoch) # Log info about epoch logger.log_tabular('Epoch', epoch) ret = logger.log_tabular('EpRet', average_only=True)[0] test_ret = logger.log_tabular('TestEpRet', average_only=True)[0] logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('ValueQ', average_only=True) logger.log_tabular('ValueV', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() rets.append(ret) test_rets.append(test_ret) if max_ret is None or test_ret > max_ret: max_ret = test_ret best_test_actions = test_actions max_ep_len += inc_ep sess.run(target_update, feed_dict) logger.save_state( { "rewards": rewards, "best_test_actions": best_test_actions, "rets": rets, "test_rets": test_rets, "max_ret": max_ret }, None) util.plot_actions(best_test_actions, act_high, logger.output_dir + '/best_test_actions.png') logger.log("max ret: %f" % max_ret)
def ddpg(env_fn, actor_critic=a2c, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=.99, polyak=.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, act_noise=.1, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] act_limit = env.action_space.high[0] ac_kwargs['action_space'] = env.action_space x_ph, a_ph, x2_ph, r_ph, d_ph = \ tf.placeholder( name='x_ph', shape=[None, obs_dim], dtype=tf.float32), \ tf.placeholder( name='a_ph', shape=[None, act_dim], dtype=tf.float32), \ tf.placeholder( name='x2_ph', shape=[None, obs_dim], dtype=tf.float32), \ tf.placeholder( name='r_ph', shape=[None], dtype=tf.float32), \ tf.placeholder( name='d_ph', shape=[None], dtype=tf.float32) # Main networks with tf.variable_scope('main'): pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs) # Target networks with tf.variable_scope('target'): pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) replaybuffer = ReplayBuffer(obs_dim, act_dim, replay_size) # helpers for var count def get_vars(scope=''): return [x for x in tf.trainable_variables() if scope in x.name] def count_vars(scope=''): v = get_vars(scope) return sum([np.prod(var.shape.as_list()) for var in v]) var_counts = tuple( count_vars(scope) for scope in ['main/pi', 'main/q', 'main']) print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' % var_counts) # Bellman backup for Q function backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ) # Losses pi_loss = -tf.reduce_mean(q_pi) q_loss = tf.reduce_mean((q - backup)**2) # Optimizer and train ops train_pi_op = tf.train.AdamOptimizer(pi_lr).minimize( pi_loss, var_list=get_vars('main/pi')) train_q_op = tf.train.AdamOptimizer(q_lr).minimize( q_loss, var_list=get_vars('main/q')) # Update target networks target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # Init targets target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph }, outputs={ 'pi': pi, 'q': q }) def get_actions(o, noise_scale): a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0] a += noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) def test_agent(n=10): for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, _ = test_env.step(get_actions(o, 0)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs # Main loop: for t in range(total_steps): if t > start_steps: a = get_actions(o, act_noise) else: a = env.action_space.sample() o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 d = False if ep_len == max_ep_len else d # Storing experience replaybuffer.store(o, a, r, o2, d) o = o2 if d or (ep_len == max_ep_len): for _ in range(ep_len): batch = replaybuffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'],
a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } # Q-learning update outs = sess.run([q_loss, q, train_q_op], feed_dict) logger.store(LossQ=outs[0], QVals=outs[1]) # Policy update outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict) logger.store(LossPi=outs[0]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('QVals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
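# ReplayBuffer is used by the off-policy routines above but not defined here;
# judging from the batch keys consumed ('obs1', 'obs2', 'acts', 'rews', 'done'),
# a minimal NumPy version would look roughly like this sketch (an assumption
# about the class, not its actual source).
import numpy as np

class SimpleReplayBuffer:
    def __init__(self, obs_dim, act_dim, size):
        self.obs1_buf = np.zeros((size, obs_dim), dtype=np.float32)
        self.obs2_buf = np.zeros((size, obs_dim), dtype=np.float32)
        self.acts_buf = np.zeros((size, act_dim), dtype=np.float32)
        self.rews_buf = np.zeros(size, dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, obs, act, rew, next_obs, done):
        self.obs1_buf[self.ptr] = obs
        self.obs2_buf[self.ptr] = next_obs
        self.acts_buf[self.ptr] = act
        self.rews_buf[self.ptr] = rew
        self.done_buf[self.ptr] = done
        self.ptr = (self.ptr + 1) % self.max_size      # overwrite oldest entries
        self.size = min(self.size + 1, self.max_size)

    def sample_batch(self, batch_size=100):
        idxs = np.random.randint(0, self.size, size=batch_size)
        return dict(obs1=self.obs1_buf[idxs], obs2=self.obs2_buf[idxs],
                    acts=self.acts_buf[idxs], rews=self.rews_buf[idxs],
                    done=self.done_buf[idxs])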
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, batch_size=250000, n=100, epochs=100, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=1000, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape sequence_length = n * max_ep_len trials = batch_size // sequence_length # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph # x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) # rew_ph, adv_ph, ret_ph, logp_old_ph = core.placeholders(1, None, None, None) x_ph = tf.placeholder(dtype=tf.int32, shape=(None, sequence_length), name='x_ph') t_ph = tf.placeholder(dtype=tf.int32, shape=(None, sequence_length), name='t_ph') a_ph = tf.placeholder(dtype=tf.int32, shape=(None, sequence_length), name='a_ph') r_ph = tf.placeholder(dtype=tf.float32, shape=(None, sequence_length), name='r_ph') # input_ph = tf.placeholder(dtype=tf.float32, shape=(None, None, n, None), name='rew_ph') adv_ph = tf.placeholder(dtype=tf.float32, shape=(None), name='adv_ph') ret_ph = tf.placeholder(dtype=tf.float32, shape=(None), name='ret_ph') logp_old_ph = tf.placeholder(dtype=tf.float32, shape=(None), name='logp_old_ph') # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, t_ph, a_ph, r_ph, sequence_length, env.action_space.n, env.observation_space.shape[0]) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, t_ph, a_ph, r_ph, adv_ph, ret_ph, logp_old_ph] # for ph in all_phs: # print(ph.shape) # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer buf = PPOBuffer(obs_dim, act_dim, batch_size, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving model_inputs = {'x': x_ph, 't': t_ph, 'a': a_ph, 'r': r_ph} model_outputs = {'pi': pi} logger.setup_tf_saver(sess, inputs=model_inputs, outputs=model_outputs) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} # inputs[a_ph] = np.tril(np.transpose(np.repeat(inputs[a_ph], n).reshape(trials, n, n), [0, 2, 1])) # inputs[rew_ph] = np.tril(np.transpose(np.repeat(inputs[rew_ph], n).reshape(trials, n, n), [0, 2, 1])) # print(inputs[x_ph]) # print(inputs[t_ph]) # print(inputs[a_ph]) # print(inputs[r_ph]) inputs[x_ph] = inputs[x_ph].reshape(trials, sequence_length) inputs[t_ph] = inputs[t_ph].reshape(trials, sequence_length) inputs[a_ph] = inputs[a_ph].reshape(trials, sequence_length) inputs[r_ph] = inputs[r_ph].reshape(trials, 
sequence_length) # print('x:', inputs[x_ph]) # print('t:', inputs[t_ph]) # print('a:', inputs[a_ph]) # print('r:', inputs[r_ph]) # print('ret:', inputs[ret_ph]) # print('adv:', inputs[adv_ph]) # print('logp_old:', inputs[logp_old_ph]) pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) # kl = mpi_avg(kl) # if kl > 1.5 * target_kl: # logger.log('Early stopping at step %d due to reaching max kl.'%i) # break logger.store(StopIter=i) for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() save_itr = 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for trail in range(trials): print('trial:', trail) # last_a = np.zeros(n).reshape(1, n) # last_r = np.zeros(n).reshape(1, n) o_deque = deque(sequence_length * [0], sequence_length) t_deque = deque(sequence_length * [0], sequence_length) last_a = deque(sequence_length * [0], sequence_length) last_r = deque(sequence_length * [0], sequence_length) means = env.sample_tasks(1)[0] # print('task means:', means) action_dict = defaultdict(int) total_reward = 0 env.reset_task(means) o, r, d, ep_ret, ep_len = env.reset(), np.zeros(1), False, 0, 0 for episode in range(sequence_length): # print('episode:', episode) # print('o:', o_deque) # print('d:', t_deque) # print('a:', last_a) # print('r:', last_r) a, v_t, logp_t = sess.run( get_action_ops, feed_dict={ x_ph: np.array(o_deque).reshape(1, sequence_length), t_ph: np.array(t_deque).reshape(1, sequence_length), a_ph: np.array(last_a).reshape(1, sequence_length), r_ph: np.array(last_r).reshape(1, sequence_length) }) # print("a shape:", a.shape) # print("v_t shape:", v_t.shape) # print("logp_t shape:", logp_t.shape) # choosen_a = a[episode, 0] # choosen_v_t = v_t[0, episode] # choosen_logp_t = logp_t[episode] # print('a:', a) choosen_a = a[-1] choosen_v_t = v_t[-1] choosen_logp_t = logp_t[-1] action_dict[choosen_a] += 1 o, r, d, _ = env.step(choosen_a) ep_ret += r ep_len += 1 t = ep_len == max_ep_len total_reward += r o_deque.append(o) t_deque.append(int(d)) last_a.append(choosen_a) last_r.append(r) # save and log buf.store(o, int(t), choosen_a, r, choosen_v_t, choosen_logp_t) logger.store(VVals=v_t) terminal = d or t if terminal or (episode == sequence_length - 1): if not (terminal): print( 'Warning: trajectory cut off by epoch at %d steps.' 
% ep_len) # if trajectory didn't reach terminal state, bootstrap value target if d: last_val = r else: last_val = sess.run( v, feed_dict={ x_ph: np.array(o_deque).reshape(1, sequence_length), t_ph: np.array(t_deque).reshape(1, sequence_length), a_ph: np.array(last_a).reshape(1, sequence_length), r_ph: np.array(last_r).reshape(1, sequence_length) }) last_val = last_val[-1] buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 o_deque[-1] = 0 t_deque[-1] = 0 last_a[-1] = 0 last_r[-1] = 0 print(action_dict) print('average reward:', total_reward / sequence_length) # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, save_itr) save_itr += 1 # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * batch_size) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
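# The TF graphs above express the clipped surrogate via tf.where (min_adv); a tiny
# NumPy check that this is the same as clamping the probability ratio directly,
# as the PyTorch version does. The test values are arbitrary.
import numpy as np

def surrogate_where(ratio, adv, clip_ratio=0.2):
    min_adv = np.where(adv > 0, (1 + clip_ratio) * adv, (1 - clip_ratio) * adv)
    return np.minimum(ratio * adv, min_adv)

def surrogate_clip(ratio, adv, clip_ratio=0.2):
    return np.minimum(ratio * adv, np.clip(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv)

_ratio = np.array([0.5, 0.9, 1.0, 1.1, 1.5])
_adv = np.array([1.0, -1.0, 2.0, -2.0, 0.5])
assert np.allclose(surrogate_where(_ratio, _adv), surrogate_clip(_ratio, _adv))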
class td3_agent: def __init__(self, args, env, env_params): self.args = args # path to save the model self.exp_name = '_'.join( (self.args.env_name, self.args.alg, str(self.args.seed), datetime.now().isoformat())) self.data_path = os.path.join( self.args.save_dir, '_'.join((self.args.env_name, self.args.alg)), self.exp_name) self.logger = EpochLogger(output_dir=self.data_path, exp_name=self.exp_name) self.logger.save_config(args) self.env = env self.env_params = env_params # create the network self.actor_network = actor(env_params) self.critic_network1 = critic(env_params) self.critic_network2 = critic(env_params) # sync the networks across the cpus sync_networks(self.actor_network) sync_networks(self.critic_network1) sync_networks(self.critic_network2) # build up the target network self.actor_target_network = actor(env_params) self.critic_target_network1 = critic(env_params) self.critic_target_network2 = critic(env_params) # load the weights into the target networks self.actor_target_network.load_state_dict( self.actor_network.state_dict()) self.critic_target_network1.load_state_dict( self.critic_network1.state_dict()) self.critic_target_network2.load_state_dict( self.critic_network2.state_dict()) # if use gpu self.rank = MPI.COMM_WORLD.Get_rank() if args.cuda: device = 'cuda:{}'.format(self.rank % torch.cuda.device_count()) else: device = 'cpu' self.device = torch.device(device) if self.args.cuda: self.actor_network.cuda(self.device) self.critic_network1.cuda(self.device) self.critic_network2.cuda(self.device) self.actor_target_network.cuda(self.device) self.critic_target_network1.cuda(self.device) self.critic_target_network2.cuda(self.device) # create the optimizer self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor) self.critic_optim1 = torch.optim.Adam( self.critic_network1.parameters(), lr=self.args.lr_critic) self.critic_optim2 = torch.optim.Adam( self.critic_network2.parameters(), lr=self.args.lr_critic) # her sampler self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env.compute_reward) # create the replay buffer self.buffer = replay_buffer(self.env_params, self.args.buffer_size, self.her_module.sample_her_transitions) # create the normalizer self.o_norm = normalizer(size=env_params['obs'], default_clip_range=self.args.clip_range) self.g_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range) self.logger.setup_pytorch_saver(self.actor_network) def learn(self): """ train the network """ # start to collect samples for epoch in range(self.args.n_epochs): for _ in range(self.args.n_cycles): mb_obs, mb_ag, mb_g, mb_actions = [], [], [], [] for _ in range(self.args.num_rollouts_per_mpi): # reset the rollouts ep_obs, ep_ag, ep_g, ep_actions = [], [], [], [] # reset the environment observation = self.env.reset() obs = observation['observation'] ag = observation['achieved_goal'] g = observation['desired_goal'] # start to collect samples for t in range(self.env_params['max_timesteps']): with torch.no_grad(): input_tensor = self._preproc_inputs(obs, g) pi = self.actor_network(input_tensor) action = self._select_actions(pi) # feed the actions into the environment observation_new, _, _, info = self.env.step(action) obs_new = observation_new['observation'] ag_new = observation_new['achieved_goal'] # append rollouts ep_obs.append(obs.copy()) ep_ag.append(ag.copy()) ep_g.append(g.copy()) ep_actions.append(action.copy()) # re-assign the observation obs = obs_new ag = ag_new ep_obs.append(obs.copy()) 
ep_ag.append(ag.copy()) mb_obs.append(ep_obs) mb_ag.append(ep_ag) mb_g.append(ep_g) mb_actions.append(ep_actions) # convert them into arrays mb_obs = np.array(mb_obs) mb_ag = np.array(mb_ag) mb_g = np.array(mb_g) mb_actions = np.array(mb_actions) # store the episodes self.buffer.store_episode([mb_obs, mb_ag, mb_g, mb_actions]) self._update_normalizer([mb_obs, mb_ag, mb_g, mb_actions]) for _ in range(self.args.n_batches): # train the network self._update_network() # soft update self._soft_update_target_network(self.actor_target_network, self.actor_network) self._soft_update_target_network(self.critic_target_network1, self.critic_network1) self._soft_update_target_network(self.critic_target_network2, self.critic_network2) # start to do the evaluation success_rate = self._eval_agent() # save some necessary objects # self.logger.save_state will also save pytorch's model implicitly. # self.logger.save_state({'env':self.env, 'o_norm':self.o_norm, 'g_norm':self.g_norm}, None) state = { 'env': self.env, 'o_norm': self.o_norm.get(), 'g_norm': self.g_norm.get() } self.logger.save_state(state, None) t = ((epoch + 1) * self.args.n_cycles * self.args.num_rollouts_per_mpi * MPI.COMM_WORLD.Get_size() * self.env_params['max_timesteps']) self.logger.log_tabular('Epoch', epoch + 1) self.logger.log_tabular('SuccessRate', success_rate) self.logger.log_tabular('LossPi') self.logger.log_tabular('LossQ') self.logger.log_tabular('TotalEnvInteracts', t) self.logger.dump_tabular() # pre_process the inputs def _preproc_inputs(self, obs, g): obs_norm = self.o_norm.normalize(obs) g_norm = self.g_norm.normalize(g) # concatenate the stuffs inputs = np.concatenate([obs_norm, g_norm]) inputs = torch.tensor(inputs, dtype=torch.float32).unsqueeze(0) if self.args.cuda: inputs = inputs.cuda(self.device) return inputs # this function will choose action for the agent and do the exploration def _select_actions(self, pi): action = pi.cpu().numpy().squeeze() # add the gaussian action += self.args.noise_eps * self.env_params[ 'action_max'] * np.random.randn(*action.shape) action = np.clip(action, -self.env_params['action_max'], self.env_params['action_max']) # random actions... 
random_actions = np.random.uniform(low=-self.env_params['action_max'], high=self.env_params['action_max'], \ size=self.env_params['action']) # choose if use the random actions action += np.random.binomial(1, self.args.random_eps, 1)[0] * (random_actions - action) return action # update the normalizer def _update_normalizer(self, episode_batch): mb_obs, mb_ag, mb_g, mb_actions = episode_batch mb_obs_next = mb_obs[:, 1:, :] mb_ag_next = mb_ag[:, 1:, :] # get the number of normalization transitions num_transitions = mb_actions.shape[1] # create the new buffer to store them buffer_temp = { 'obs': mb_obs, 'ag': mb_ag, 'g': mb_g, 'actions': mb_actions, 'obs_next': mb_obs_next, 'ag_next': mb_ag_next, } transitions = self.her_module.sample_her_transitions( buffer_temp, num_transitions) obs, g = transitions['obs'], transitions['g'] # pre process the obs and g transitions['obs'], transitions['g'] = self._preproc_og(obs, g) # update self.o_norm.update(transitions['obs']) self.g_norm.update(transitions['g']) # recompute the stats self.o_norm.recompute_stats() self.g_norm.recompute_stats() def _preproc_og(self, o, g): o = np.clip(o, -self.args.clip_obs, self.args.clip_obs) g = np.clip(g, -self.args.clip_obs, self.args.clip_obs) return o, g # soft update def _soft_update_target_network(self, target, source): for target_param, param in zip(target.parameters(), source.parameters()): target_param.data.copy_((1 - self.args.polyak) * param.data + self.args.polyak * target_param.data) # update the network def _update_network(self): # sample the episodes transitions = self.buffer.sample(self.args.batch_size) # pre-process the observation and goal o, o_next, g = transitions['obs'], transitions[ 'obs_next'], transitions['g'] transitions['obs'], transitions['g'] = self._preproc_og(o, g) transitions['obs_next'], transitions['g_next'] = self._preproc_og( o_next, g) # start to do the update obs_norm = self.o_norm.normalize(transitions['obs']) g_norm = self.g_norm.normalize(transitions['g']) inputs_norm = np.concatenate([obs_norm, g_norm], axis=1) obs_next_norm = self.o_norm.normalize(transitions['obs_next']) g_next_norm = self.g_norm.normalize(transitions['g_next']) inputs_next_norm = np.concatenate([obs_next_norm, g_next_norm], axis=1) # transfer them into the tensor inputs_norm_tensor = torch.tensor(inputs_norm, dtype=torch.float32) inputs_next_norm_tensor = torch.tensor(inputs_next_norm, dtype=torch.float32) actions_tensor = torch.tensor(transitions['actions'], dtype=torch.float32) r_tensor = torch.tensor(transitions['r'], dtype=torch.float32) if self.args.cuda: inputs_norm_tensor = inputs_norm_tensor.cuda(self.device) inputs_next_norm_tensor = inputs_next_norm_tensor.cuda(self.device) actions_tensor = actions_tensor.cuda(self.device) r_tensor = r_tensor.cuda(self.device) # calculate the target Q value function with torch.no_grad(): # do the normalization # concatenate the stuffs actions_next = self.actor_target_network(inputs_next_norm_tensor) actions_next += self.args.noise_eps * self.env_params[ 'action_max'] * torch.randn(actions_next.shape).cuda( self.device) actions_next = torch.clamp(actions_next, -self.env_params['action_max'], self.env_params['action_max']) q_next_value1 = self.critic_target_network1( inputs_next_norm_tensor, actions_next) q_next_value2 = self.critic_target_network2( inputs_next_norm_tensor, actions_next) target_q_value = r_tensor + self.args.gamma * torch.min( q_next_value1, q_next_value2) # clip the q value clip_return = 1 / (1 - self.args.gamma) target_q_value = 
torch.clamp(target_q_value, -clip_return, 0) target_q_value = target_q_value.detach() # the q loss real_q_value1 = self.critic_network1(inputs_norm_tensor, actions_tensor) critic_loss1 = (target_q_value - real_q_value1).pow(2).mean() real_q_value2 = self.critic_network2(inputs_norm_tensor, actions_tensor) critic_loss2 = (target_q_value - real_q_value2).pow(2).mean() # the actor loss actions_real = self.actor_network(inputs_norm_tensor) actor_loss = -torch.min( self.critic_network1(inputs_norm_tensor, actions_real), self.critic_network2(inputs_norm_tensor, actions_real)).mean() actor_loss += self.args.action_l2 * ( actions_real / self.env_params['action_max']).pow(2).mean() # start to update the network self.actor_optim.zero_grad() actor_loss.backward() sync_grads(self.actor_network) self.actor_optim.step() # update the critic_network self.critic_optim1.zero_grad() critic_loss1.backward() sync_grads(self.critic_network1) self.critic_optim1.step() self.critic_optim2.zero_grad() critic_loss2.backward() sync_grads(self.critic_network2) self.critic_optim2.step() self.logger.store(LossPi=actor_loss.detach().cpu().numpy()) self.logger.store(LossQ=(critic_loss1 + critic_loss2).detach().cpu().numpy()) # do the evaluation def _eval_agent(self): total_success_rate = [] for _ in range(self.args.n_test_rollouts): per_success_rate = [] observation = self.env.reset() obs = observation['observation'] g = observation['desired_goal'] for _ in range(self.env_params['max_timesteps']): with torch.no_grad(): input_tensor = self._preproc_inputs(obs, g) pi = self.actor_network(input_tensor) # convert the actions actions = pi.detach().cpu().numpy().squeeze() observation_new, _, _, info = self.env.step(actions) obs = observation_new['observation'] g = observation_new['desired_goal'] per_success_rate.append(info['is_success']) total_success_rate.append(per_success_rate) total_success_rate = np.array(total_success_rate) local_success_rate = np.mean(total_success_rate[:, -1]) global_success_rate = MPI.COMM_WORLD.allreduce(local_success_rate, op=MPI.SUM) return global_success_rate / MPI.COMM_WORLD.Get_size()
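# A minimal PyTorch sketch (standalone tensors, not the agent's own method) of the
# clipped double-Q target computed in _update_network above: with the sparse
# rewards in {-1, 0}, the discounted return lies in [-1/(1-gamma), 0], so the
# target is clamped to that range before the critic regression.
import torch

def clipped_double_q_target(r, q1_next, q2_next, gamma=0.98):
    clip_return = 1.0 / (1.0 - gamma)
    target = r + gamma * torch.min(q1_next, q2_next)
    return torch.clamp(target, -clip_return, 0.0)

# toy example: one failed transition (r = -1) and one successful one (r = 0)
r = torch.tensor([-1.0, 0.0])
q1_next = torch.tensor([-3.0, -0.5])
q2_next = torch.tensor([-2.5, -0.7])
print(clipped_double_q_target(r, q1_next, q2_next))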
def sac(env_fn, seed=0, gamma=.99, lam=.97, hidden_sizes=(200, 100), alpha=.5, v_lr=1e-3, q_lr=1e-3, pi_lr=1e-3, polyak=1e-2, epochs=50, steps_per_epoch=1000, batch_size=100, start_steps=10000, logger_kwargs=dict(), replay_size=int(1e6), max_ep_len=1000, save_freq=1): logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() env = env_fn() # Dimensions obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] act_limit = env.action_space.high[0] # Placeholders x_ph = tf.placeholder(shape=[None, obs_dim], dtype=tf.float32) a_ph = tf.placeholder(shape=[None, act_dim], dtype=tf.float32) x2_ph = tf.placeholder(shape=[None, obs_dim], dtype=tf.float32) r_ph = tf.placeholder(shape=[None], dtype=tf.float32) d_ph = tf.placeholder(shape=[None], dtype=tf.float32) # Networks def mlp(x, hidden_sizes=(32, ), activation=tf.tanh, output_activation=None): for h in hidden_sizes[:-1]: x = tf.layers.dense(x, units=h, activation=activation) return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) # Why isn't the k used here ? def gaussian_likelihood(x, mu, log_std): EPS = 1e-8 pre_sum = -0.5 * ( ((x - mu) / (tf.exp(log_std) + EPS))**2 + 2 * log_std + np.log(2 * np.pi)) return tf.reduce_sum(pre_sum, axis=1) def clip_but_pass_gradient(x, l=-1., u=1.): clip_up = tf.cast(x > u, tf.float32) clip_low = tf.cast(x < l, tf.float32) return x + tf.stop_gradient((u - x) * clip_up + (l - x) * clip_low) LOG_STD_MIN = -20 LOG_STD_MAX = 2 def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation): act_dim = a.shape.as_list()[-1] net = mlp(x, list(hidden_sizes), activation, activation) mu = tf.layers.dense(net, act_dim, activation=output_activation) """ Because algorithm maximizes trade-off of reward and entropy, entropy must be unique to state---and therefore log_stds need to be a neural network output instead of a shared-across-states learnable parameter vector. But for deep Relu and other nets, simply sticking an activationless dense layer at the end would be quite bad---at the beginning of training, a randomly initialized net could produce extremely large values for the log_stds, which would result in some actions being either entirely deterministic or too random to come back to earth. Either of these introduces numerical instability which could break the algorithm. To protect against that, we'll constrain the output range of the log_stds, to lie within [LOG_STD_MIN, LOG_STD_MAX]. This is slightly different from the trick used by the original authors of SAC---they used tf.clip_by_value instead of squashing and rescaling. I prefer this approach because it allows gradient propagation through log_std where clipping wouldn't, but I don't know if it makes much of a difference. """ log_std = tf.layers.dense(net, act_dim, activation=tf.tanh) log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1) std = tf.exp(log_std) pi = mu + tf.random_normal(tf.shape(mu)) * std logp_pi = gaussian_likelihood(pi, mu, log_std) return mu, pi, logp_pi def apply_squashing_func(mu, pi, logp_pi): mu = tf.tanh(mu) pi = tf.tanh(pi) # To avoid evil machine precision error, strictly clip 1-pi**2 to [0,1] range. 
logp_pi -= tf.reduce_sum( tf.log(clip_but_pass_gradient(1 - pi**2, l=0, u=1) + 1e-6), axis=1) return mu, pi, logp_pi with tf.variable_scope("main"): activation = tf.tanh with tf.variable_scope("pi"): # mu = mlp( x_ph, hidden_sizes, activation, None) # log_std = mlp( mu, (act_dim,), activation, None) # # Avoid out of range log_std. Refer to Github for explanation. # log_std = LOG_STD_MIN + .5 * ( LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1) # # mu = mlp( mu, (act_dim,), activation, None) # # pi = mu + tf.exp( log_std) * tf.random_normal( tf.shape(mu)) # logp_pi = gaussian_likelihood( pi, mu, log_std) # # # Follow SpinningUp Implementation # mu = tf.tanh(mu) # pi = tf.tanh(pi) # # def clip_but_pass_gradient(x, l=-1., u=1.): # clip_up = tf.cast(x > u, tf.float32) # clip_low = tf.cast(x < l, tf.float32) # # What is this supposed to mean even ? # return x + tf.stop_gradient((u - x)*clip_up + (l - x)*clip_low) # # # Shameless copy paste # logp_pi -= tf.reduce_sum(tf.log(clip_but_pass_gradient(1 - pi**2, l=0, u=1) + 1e-6), axis=1) # Not working version bak # squashed_pi = tf.tanh( pi) # # # To be sure # pi = tf.clip_by_value( pi, -act_limit, act_limit) # # # Must take in the squased polic # log_squash_pi = gaussian_likelihood( squashed_pi, mu, log_std) # Shamefull plug mu, pi, logp_pi = mlp_gaussian_policy(x_ph, a_ph, hidden_sizes, tf.tanh, None) mu, pi, logp_pi = apply_squashing_func(mu, pi, logp_pi) with tf.variable_scope("q1"): q1 = tf.squeeze(mlp(tf.concat([x_ph, a_ph], -1), hidden_sizes + (1, ), activation, None), axis=-1) with tf.variable_scope("q1", reuse=True): q1_pi = tf.squeeze(mlp(tf.concat([x_ph, pi], -1), hidden_sizes + (1, ), activation, None), axis=-1) with tf.variable_scope("q2"): q2 = tf.squeeze(mlp(tf.concat([x_ph, a_ph], -1), hidden_sizes + (1, ), activation, None), axis=-1) with tf.variable_scope("q2", reuse=True): q2_pi = tf.squeeze(mlp(tf.concat([x_ph, pi], -1), hidden_sizes + (1, ), activation, None), axis=-1) with tf.variable_scope("v"): # v = mlp( x_ph, hidden_sizes+(1,), activation, None) v = tf.squeeze(mlp(x_ph, hidden_sizes + (1, ), activation, None), axis=-1) with tf.variable_scope("target"): with tf.variable_scope("v"): v_targ = tf.squeeze(mlp(x2_ph, hidden_sizes + (1, ), activation, None), axis=-1) # helpers for var count def get_vars(scope=''): return [x for x in tf.trainable_variables() if scope in x.name] def count_vars(scope=''): v = get_vars(scope) return sum([np.prod(var.shape.as_list()) for var in v]) # Count variables var_counts = tuple( count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main']) print( '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t v: %d, \t total: %d\n' % var_counts) # Targets q_backup_prestop = r_ph + gamma * (1 - d_ph) * v_targ v_backup_prestop = tf.minimum(q1_pi, q2_pi) - alpha * logp_pi q_backup, v_backup = tf.stop_gradient(q_backup_prestop), tf.stop_gradient( v_backup_prestop) # Q Loss q1_loss = tf.reduce_mean((q1 - q_backup)**2) q2_loss = tf.reduce_mean((q2 - q_backup)**2) q_loss = q1_loss + q2_loss # V Loss v_loss = tf.reduce_mean((v - v_backup)**2) # Pol loss pi_loss = tf.reduce_mean(-q1_pi + alpha * logp_pi) # Training ops v_trainop = tf.train.AdamOptimizer(v_lr).minimize( v_loss, var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/v")) q_trainop = tf.train.AdamOptimizer(q_lr).minimize( q_loss, var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/q")) pi_trainop = tf.train.AdamOptimizer(pi_lr).minimize( pi_loss, 
var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/pi")) assert polyak <= .5 # Target update op init_v_target = tf.group([ tf.assign(v_target, v_main) for v_main, v_target in zip( tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/v"), tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target/v")) ]) update_v_target = tf.group([ tf.assign(v_target, (1 - polyak) * v_target + polyak * v_main) for v_main, v_target in zip( tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/v"), tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target/v")) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(init_v_target) # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph }, outputs={ 'pi': pi, 'q1': q1, 'q2': q2, 'v': v }) def test_agent(n=10): for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 # print( o.reshape(-1, 1)) # input() while not (d or (ep_len == max_ep_len)): o, r, d, _ = test_env.step( sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) #Buffer init buffer = ReplayBuffer(obs_dim, act_dim, replay_size) # Main loop start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs for t in range(total_steps): if t > start_steps: a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)}) else: a = env.action_space.sample() o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 o2, r, d, _ = env.step(o) d = False or (ep_len == max_ep_len) # Still needed ? o2 = np.squeeze(o2) buffer.store(o, a, r, o2, d) o = o2 if d or (ep_len == max_ep_len): for j in range(ep_len): batch = buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } # DEBUG: # v_backup_prestop_out = sess.run( v_backup_prestop, feed_dict=feed_dict) # print( v_backup_prestop_out.shape) # print( v_backup_prestop_out) # input() # Value gradient steps v_step_ops = [v_loss, v, v_trainop] outs = sess.run(v_step_ops, feed_dict) logger.store(LossV=outs[0], VVals=outs[1]) # Q Gradient steps q_step_ops = [q_loss, q1, q2, q_trainop] outs = sess.run(q_step_ops, feed_dict) logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2]) # Policy gradient steps # TODO Add entropy logging pi_step_ops = [pi_loss, pi_trainop, update_v_target] outs = sess.run(pi_step_ops, feed_dict=feed_dict) logger.store(LossPi=outs[0]) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0., 0 if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Saving the model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
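# A small NumPy sketch of the squashed-Gaussian log-likelihood that
# gaussian_likelihood and apply_squashing_func implement above: for a = tanh(u)
# with u ~ N(mu, std), log pi(a) = log N(u; mu, std) - sum_i log(1 - tanh(u_i)^2 + eps).
import numpy as np

def squashed_gaussian_logp(u, mu, log_std, eps=1e-6):
    std = np.exp(log_std)
    logp_u = -0.5 * np.sum(((u - mu) / std) ** 2 + 2 * log_std + np.log(2 * np.pi), axis=-1)
    correction = np.sum(np.log(1.0 - np.tanh(u) ** 2 + eps), axis=-1)
    return logp_u - correction

u = np.array([[0.3, -1.2]])
print(squashed_gaussian_logp(u, mu=np.zeros(2), log_std=np.zeros(2)))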
def ddpg(env_config, ac_type, ac_kwargs, rb_type, rb_kwargs, gamma, lr, polyak, batch_size, epochs, start_steps, steps_per_epoch, inc_ep, max_ep_len, test_max_ep_len, number_of_tests_per_epoch, act_noise, logger_kwargs, seed): logger = EpochLogger(**logger_kwargs) configs = locals().copy() configs.pop("logger") logger.save_config(configs) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = make_env(env_config), make_env(env_config) obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_high = env.action_space.high # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) actor_critic = core.get_ddpg_actor_critic(ac_type) # Main outputs from computation graph with tf.variable_scope('main'): pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs) # Target networks with tf.variable_scope('target'): pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs) # Experience buffer RB = get_replay_buffer(rb_type) replay_buffer = RB(obs_dim, act_dim, **rb_kwargs) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main']) print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' % var_counts) # Bellman backup for Q function backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ) # DDPG losses pi_loss = -tf.reduce_mean(q_pi) q_loss = tf.reduce_mean((q - backup)**2) # Separate train ops for pi, q pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) q_optimizer = tf.train.AdamOptimizer(learning_rate=lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q')) # Polyak averaging for target variables target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) def get_action(o, noise_scale): pi_a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0] pi_a += noise_scale * np.random.randn(act_dim) pi_a = np.clip(pi_a, 0, 1) real_a = pi_a * act_high return pi_a, real_a def test_agent(n=10): test_actions = [] for j in range(n): test_actions_ep = [] o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == test_max_ep_len)): # Take deterministic actions at test time (noise_scale=0) _, real_a = get_action(o, 0) test_actions_ep.append(real_a) o, r, d, _ = test_env.step(real_a) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) test_actions.append(test_actions_ep) return test_actions start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs actions = [] epoch_actions = [] rewards = [] rets = [] test_rets = [] max_ret = None # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy (with some noise, via act_noise). 
""" if t > start_steps: pi_a, real_a = get_action(o, act_noise) else: pi_a, real_a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(real_a) ep_ret += r ep_len += 1 epoch_actions.append(pi_a) # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, pi_a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all DDPG updates at the end of the trajectory, in accordance with tuning done by TD3 paper authors. """ for _ in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } # Q-learning update outs = sess.run([q_loss, q, train_q_op], feed_dict) logger.store(LossQ=outs[0], QVals=outs[1]) # Policy update outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict) logger.store(LossPi=outs[0]) logger.store(EpRet=ep_ret, EpLen=ep_len) actions.append(np.mean(epoch_actions)) epoch_actions = [] rewards.append(ep_ret) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # Test the performance of the deterministic version of the agent. test_actions = test_agent(number_of_tests_per_epoch) # Log info about epoch logger.log_tabular('Epoch', epoch) ret = logger.log_tabular('EpRet', average_only=True) test_ret = logger.log_tabular('TestEpRet', average_only=True)[0] logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('QVals', average_only=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() rets.append(ret) test_rets.append(test_ret) if max_ret is None or test_ret > max_ret: max_ret = test_ret best_test_actions = test_actions max_ep_len += inc_ep util.plot_actions(test_actions, act_high, logger.output_dir + '/actions%s.png' % epoch) logger.save_state( { "actions": actions, "rewards": rewards, "best_test_actions": best_test_actions, "rets": rets, "test_rets": test_rets, "max_ret": max_ret }, None) util.plot_actions(best_test_actions, act_high, logger.output_dir + '/best_test_actions.png') logger.log("max ret: %f" % max_ret)
def __init__(self, args, env, env_params): self.args = args # path to save the model self.exp_name = '_'.join( (self.args.env_name, self.args.alg, str(self.args.seed), datetime.now().isoformat())) self.data_path = os.path.join( self.args.save_dir, '_'.join((self.args.env_name, self.args.alg)), self.exp_name) self.logger = EpochLogger(output_dir=self.data_path, exp_name=self.exp_name) self.logger.save_config(args) self.env = env self.env_params = env_params # create the network self.actor_network = actor(env_params) self.critic_network1 = critic(env_params) self.critic_network2 = critic(env_params) # sync the networks across the cpus sync_networks(self.actor_network) sync_networks(self.critic_network1) sync_networks(self.critic_network2) # build up the target network # self.actor_target_network = actor(env_params) self.critic_target_network1 = critic(env_params) self.critic_target_network2 = critic(env_params) # load the weights into the target networks # self.actor_target_network.load_state_dict(self.actor_network.state_dict()) self.critic_target_network1.load_state_dict( self.critic_network1.state_dict()) self.critic_target_network2.load_state_dict( self.critic_network2.state_dict()) # if use gpu self.rank = MPI.COMM_WORLD.Get_rank() if args.cuda: device = 'cuda:{}'.format(self.rank % torch.cuda.device_count()) self.device = torch.device(device) if self.args.cuda: self.actor_network.cuda(self.device) self.critic_network1.cuda(self.device) self.critic_network2.cuda(self.device) # self.actor_target_network.cuda(self.device) self.critic_target_network1.cuda(self.device) self.critic_target_network2.cuda(self.device) # create the optimizer self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor) self.critic_optim1 = torch.optim.Adam( self.critic_network1.parameters(), lr=self.args.lr_critic) self.critic_optim2 = torch.optim.Adam( self.critic_network2.parameters(), lr=self.args.lr_critic) # her sampler self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env.compute_reward) # create the replay buffer self.buffer = replay_buffer(self.env_params, self.args.buffer_size, self.her_module.sample_her_transitions) # create the normalizer self.o_norm = normalizer(size=env_params['obs'], default_clip_range=self.args.clip_range) self.g_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range) self.logger.setup_pytorch_saver(self.actor_network) # auto temperature if self.args.alpha < 0.0: # if self.args.alpha < 0.0, # sac will use auto temperature and init alpha = - self.args.alpha self.alpha = -self.args.alpha self.log_alpha = torch.tensor(np.log(self.alpha), dtype=torch.float32, device=device, requires_grad=True) self.target_entropy = -np.prod(env.action_space.shape).astype( np.float32) self.target_entropy = self.target_entropy / 2.0 self.alpha_optim = torch.optim.Adam([self.log_alpha], lr=self.args.lr_actor) else: self.alpha = self.args.alpha self.alpha = torch.tensor(self.alpha)
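# A minimal PyTorch sketch (assumed; the actual update lives elsewhere in this
# agent) of how the log_alpha created above is typically trained in SAC's
# automatic-temperature variant: minimize -log_alpha * (log_pi + target_entropy)
# and read the temperature back as alpha = exp(log_alpha).
import numpy as np
import torch

log_alpha = torch.tensor(np.log(0.2), dtype=torch.float32, requires_grad=True)
alpha_optim = torch.optim.Adam([log_alpha], lr=1e-3)
target_entropy = -2.0                           # e.g. -|A| (halved in the code above)
logp_pi = torch.tensor([-1.3, -0.7, -2.1])      # log-probs from the current policy

alpha_loss = -(log_alpha * (logp_pi + target_entropy).detach()).mean()
alpha_optim.zero_grad()
alpha_loss.backward()
alpha_optim.step()
print(log_alpha.exp().item())                   # updated temperature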
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, gru_units=256, trials_per_epoch=100, episodes_per_trial=2, n = 100, epochs=100, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=1000, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph\ raw_input_ph = tf.placeholder(dtype=tf.float32, shape=obs_dim, name='raw_input_ph') rescale_image_op = tf.image.resize_images(raw_input_ph, [30, 40]) max_seq_len_ph = tf.placeholder(dtype=tf.int32, shape=(), name='max_seq_len_ph') seq_len_ph = tf.placeholder(dtype=tf.int32, shape=(None,)) # Because we pad zeros at the end of every sequence of length less than max length, we need to mask these zeros out # when computing loss seq_len_mask_ph = tf.placeholder(dtype=tf.int32, shape=(trials_per_epoch, episodes_per_trial * max_ep_len)) # rescaled_image_ph This is a ph because we want to be able to pass in value to this node manually rescaled_image_in_ph = tf.placeholder(dtype=tf.float32, shape=[None, 30, 40, 3], name='rescaled_image_in_ph') a_ph = core.placeholders_from_spaces( env.action_space)[0] conv1 = slim.conv2d(activation_fn=tf.nn.relu, inputs=rescaled_image_in_ph, num_outputs=16, kernel_size=[5,5], stride=2) image_out = slim.flatten(slim.conv2d(activation_fn=tf.nn.relu, inputs=conv1, num_outputs=16, kernel_size=[5,5], stride=2)) rew_ph, adv_ph, ret_ph, logp_old_ph = core.placeholders(1, None, None, None) rnn_state_ph = tf.placeholder(tf.float32, [None, gru_units], name='pi_rnn_state_ph') # Main outputs from computation graph action_encoder_matrix = np.load(r'encoder.npy') pi, logp, logp_pi, v, rnn_state, logits, seq_len_vec, tmp_vec = actor_critic( image_out, a_ph, rew_ph, rnn_state_ph, gru_units, max_seq_len_ph, action_encoder_matrix, seq_len=seq_len_ph, action_space=env.action_space) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [rescaled_image_in_ph, a_ph, adv_ph, ret_ph, logp_old_ph, rew_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi, rnn_state, logits] # Experience buffer buffer_size = trials_per_epoch * episodes_per_trial * max_ep_len buf = PPOBuffer(rescaled_image_in_ph.get_shape().as_list()[1:], act_dim, buffer_size, trials_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n'%var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph>0, (1+clip_ratio)*adv_ph, (1-clip_ratio)*adv_ph) # Need to mask out the padded zeros when computing loss sequence_mask = tf.sequence_mask(seq_len_ph, episodes_per_trial*max_ep_len) # Convert bool tensor to int tensor with 1 and 0 sequence_mask = tf.where(sequence_mask, np.ones(dtype=np.float32, shape=(trials_per_epoch, episodes_per_trial*max_ep_len)), np.zeros(dtype=np.float32, shape=(trials_per_epoch, episodes_per_trial*max_ep_len))) # need to reshape because ratio is a 1-D vector (it is a concatnation of all sequence) for masking and then reshape # it back pi_loss_vec = tf.multiply(sequence_mask, tf.reshape(tf.minimum(ratio * adv_ph, min_adv), tf.shape(sequence_mask))) pi_loss = -tf.reduce_mean(tf.reshape(pi_loss_vec, tf.shape(ratio))) aaa = (ret_ph - v)**2 v_loss_vec = tf.multiply(sequence_mask, tf.reshape((ret_ph - v)**2, tf.shape(sequence_mask))) ccc = tf.reshape(v_loss_vec, tf.shape(v)) v_loss = tf.reduce_mean(tf.reshape(v_loss_vec, tf.shape(v))) # Info (useful to 
watch during learning) approx_kl = tf.reduce_mean(logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean(-logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1+clip_ratio), ratio < (1-clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) train = MpiAdamOptimizer(learning_rate=1e-4).minimize(pi_loss + 0.01 * v_loss - 0.001 * approx_ent) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'rescaled_image_in': rescaled_image_in_ph}, outputs={'pi': pi, 'v': v}) def update(): print(f'Start updating at {datetime.now()}') inputs = {k:v for k,v in zip(all_phs, buf.get())} inputs[rnn_state_ph] = np.zeros((trials_per_epoch, gru_units), np.float32) inputs[max_seq_len_ph] = int(episodes_per_trial * max_ep_len) inputs[seq_len_ph] = buf.seq_len_buf pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) buf.reset() # Training print(f'sequence length = {sess.run(seq_len_vec, feed_dict=inputs)}') for i in range(train_pi_iters): _, kl, pi_loss_i, v_loss_i, ent = sess.run([train_pi, approx_kl, pi_loss, v_loss, approx_ent], feed_dict=inputs) print(f'i: {i}, pi_loss: {pi_loss_i}, v_loss: {v_loss_i}, entropy: {ent}') logger.store(StopIter=i) # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) print(f'Updating finished at {datetime.now()}') start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), np.zeros(1), False, 0, 0 def recenter_rgb(image, min=0.0, max=255.0): ''' :param image: :param min: :param max: :return: an image with rgb value re-centered to [-1, 1] ''' mid = (min + max) / 2.0 return np.apply_along_axis(func1d=lambda x: (x - mid) / mid, axis=2, arr=image) o_rescaled = recenter_rgb(sess.run(rescale_image_op, feed_dict={raw_input_ph: o})) # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for trial in range(trials_per_epoch): # TODO: tweek settings to match the paper # TODO: find a way to generate mazes last_a = np.array(0) last_r = np.array(r) last_rnn_state = np.zeros((1, gru_units), np.float32) step_counter = 0 for episode in range(episodes_per_trial): o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 o_rescaled = recenter_rgb(sess.run(rescale_image_op, feed_dict={raw_input_ph: o})) action_dict = defaultdict(int) # dirty hard coding to make it print in order action_dict[0] = 0 action_dict[1] = 0 action_dict[2] = 0 for step in range(max_ep_len): a, v_t, logp_t, rnn_state_t, logits_t = sess.run( get_action_ops, feed_dict={ rescaled_image_in_ph: np.expand_dims(o_rescaled, 0), a_ph: last_a.reshape(-1,), rew_ph: last_r.reshape(-1,1), rnn_state_ph: last_rnn_state, # v_rnn_state_ph: last_v_rnn_state, max_seq_len_ph: 1, seq_len_ph: [1]}) action_dict[a[0]] += 1 # save and log buf.store(o_rescaled, a, r, v_t, logp_t) logger.store(VVals=v_t) o, r, d, _ = env.step(a[0]) step_counter += 1 o_rescaled = recenter_rgb(sess.run(rescale_image_op, feed_dict={raw_input_ph: o})) ep_ret += r ep_len += 1 last_a = a[0] last_r = np.array(r) last_rnn_state = 
rnn_state_t terminal = d or (ep_len == max_ep_len) if terminal or (step==n-1): if not(terminal): print('Warning: trajectory cut off by epoch at %d steps.'%ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run(v, feed_dict={rescaled_image_in_ph: np.expand_dims(o_rescaled, 0), a_ph: last_a.reshape(-1,), rew_ph: last_r.reshape(-1,1), rnn_state_ph: last_rnn_state, max_seq_len_ph: 1, seq_len_ph: [1]}) buf.finish_path(last_val) logger.store(EpRet=ep_ret, EpLen=ep_len) print(f'episode terminated with {step} steps. epoch:{epoch} trial:{trial} episode:{episode}') break print(action_dict) if step_counter < episodes_per_trial * max_ep_len: buf.pad_zeros(episodes_per_trial * max_ep_len - step_counter) buf.seq_len_buf[trial] = step_counter # pad zeros to sequence buffer after each trial # Save model if (epoch % save_freq == 0) or (epoch == epochs-1): logger.save_state({'env': env}, None) # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch+1)*trials_per_epoch*episodes_per_trial*max_ep_len) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular()
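# A tiny NumPy sketch of the padding mask built with tf.sequence_mask above:
# per-step losses beyond each trial's true length are zeroed so the zero-padded
# tails of short trials do not contribute to the PPO / value losses (this sketch
# averages over the valid steps only).
import numpy as np

def masked_average(per_step_loss, seq_lens, max_len):
    # per_step_loss: (num_trials, max_len), seq_lens: (num_trials,)
    mask = (np.arange(max_len)[None, :] < np.asarray(seq_lens)[:, None]).astype(np.float32)
    return (per_step_loss * mask).sum() / mask.sum()

loss = np.ones((2, 4))                 # pretend every valid step has loss 1.0
print(masked_average(loss, seq_lens=[4, 2], max_len=4))   # -> 1.0; padding ignored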
def ppo(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. 
(Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ # GAedit # Special function to avoid certain slowdowns from PyTorch + MPI combo. # setup_pytorch_for_mpi() # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # GAedit # Seed seed = 333 torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env_fn() #GAedit # obs_dim = env.observation_space.shape # act_dim = env.action_space.shape # get the default brain brain_name = env.brain_names[0] brain = env.brains[brain_name] # reset the environment env_info = env.reset(train_mode=True)[brain_name] # number of agents num_agents = len(env_info.agents) # size of each action act_dim = brain.vector_action_space_size # examine the state space obs_dim = env_info.vector_observations.shape[1] #GAedit # Create actor-critic module # ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) ac = actor_critic(obs_dim, act_dim, **ac_kwargs) # GAedit - don't think we need to sync # Sync params across processes # sync_params(ac) # Count variables var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # Set up experience buffer # GAedit # local_steps_per_epoch = int(steps_per_epoch / num_procs()) local_steps_per_epoch = int(steps_per_epoch / num_agents) #GAedit buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch * num_agents, gamma, lam) # buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Set up function for computing PPO policy loss def compute_loss_pi(data): obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data[ 'logp'] # Policy loss pi, logp = ac.pi(obs, act) ratio = torch.exp(logp - logp_old) clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv loss_pi = -(torch.min(ratio * adv, clip_adv)).mean() # Useful extra info approx_kl = (logp_old - logp).mean().item() ent = pi.entropy().mean().item() clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio) clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item() pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac) return loss_pi, pi_info # Set up function for computing value loss def compute_loss_v(data): obs, ret = data['obs'], data['ret'] return ((ac.v(obs) - ret)**2).mean() # Set up optimizers for policy and value function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(): data = buf.get() pi_l_old, pi_info_old = compute_loss_pi(data) pi_l_old = pi_l_old.item() v_l_old = compute_loss_v(data).item() # Train policy with multiple steps of gradient descent for i in range(train_pi_iters): pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) #GAedit # kl = mpi_avg(pi_info['kl']) kl = pi_info['kl'] if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break loss_pi.backward() #GAedit # mpi_avg_grads(ac.pi) # average grads across MPI processes # ac.pi.mean() pi_optimizer.step() logger.store(StopIter=i) # Value function learning for i in range(train_v_iters): vf_optimizer.zero_grad() loss_v = compute_loss_v(data) loss_v.backward() #GAedit # mpi_avg_grads(ac.v) # average grads across MPI processes vf_optimizer.step() # Log changes from update kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf'] logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old)) # Prepare for interaction with environment start_time = time.time() #GAedit # o, ep_ret, ep_len = env.reset(), 0, 0 ep_ret, ep_len = 0, 0 env_info = env.reset(train_mode=True)[brain_name] o = env_info.vector_observations # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32)) # GAedit # next_o, r, d, _ = env.step(a) env_info = env.step(a)[brain_name] next_o, r, d = env_info.vector_observations, env_info.rewards, env_info.local_done #GAedit # ep_ret += r ep_ret += np.mean(r) ep_len += 1 # save and log #GAedit # buf.store(o, a, r, v, logp) for i in range(20): buf.store(o[i], a[i], r[i], v[i], logp[i]) logger.store(VVals=v) # Update obs (critical!) o = next_o timeout = ep_len == max_ep_len # GAedit # terminal = d or timeout terminal = any(d) or timeout epoch_ended = t == local_steps_per_epoch - 1 if terminal or epoch_ended: if epoch_ended and not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32)) else: v = 0 buf.finish_path(v) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) # GAedit # o, ep_ret, ep_len = env.reset(), 0, 0 ep_ret, ep_len = 0, 0 env_info = env.reset(train_mode=True)[brain_name] o = env_info.vector_observations # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
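# A compact PyTorch sketch of the clipped surrogate computed in compute_loss_pi
# above, with dummy tensors in place of the buffer data; approx_kl is the quantity
# compared against 1.5 * target_kl for early stopping.
import torch

logp_old = torch.tensor([-1.0, -0.5, -2.0])
logp_new = torch.tensor([-0.8, -0.9, -1.5])
adv = torch.tensor([1.0, -0.5, 2.0])
clip_ratio = 0.2

ratio = torch.exp(logp_new - logp_old)
clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
loss_pi = -torch.min(ratio * adv, clip_adv).mean()
approx_kl = (logp_old - logp_new).mean().item()
print(loss_pi.item(), approx_kl)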
env = make_env(util.ENV_CONFIG_DIR + env_config) obs = [] actions = [] action_sign = np.array([-1, -1]) for i in range(iterations): current_bound = initial_bound o = env.reset() real_action = env.action_space.default() * 0.5 for t in range(max_ep_len): o, r, d, _ = env.step(real_action) obs.append(o) actions.append(real_action) vp = o vi = np.mean(obs[-5:]) vd = np.mean(np.diff(obs, axis=0)[-5:]) vd = 0 if np.isnan(vd) else vd delta = np.exp((wp * vp + wi * vi + wd * vd) * action_sign) delta = np.clip(delta, 1. / current_bound, current_bound) #print(real_action, o, delta) real_action = env.action_space.clip(real_action * delta) current_bound = np.maximum(final_bound, current_bound * bound_decay) print(np.mean(np.abs(obs[-20:])) * 100) logger_kwargs = setup_logger_kwargs(exp_name, seed, data_dir=util.LOG_DIR + os.path.splitext(env_config)[0]) logger = EpochLogger(**logger_kwargs) #util.plot_seq_obs_and_actions(np.abs(obs), actions, env.action_space.high, logger.output_dir + '/actions.png')
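# A minimal sketch of the multiplicative, PID-flavoured update used in the tuning
# loop above; wp, wi, wd, the bound schedule, and action_sign are hyperparameters
# assumed to be defined earlier in the script (they are not shown here).
import numpy as np

def pid_scale(vp, vi, vd, wp, wi, wd, action_sign, bound):
    # scale factor applied to the current action, kept inside [1/bound, bound]
    delta = np.exp((wp * vp + wi * vi + wd * vd) * action_sign)
    return np.clip(delta, 1.0 / bound, bound)

print(pid_scale(vp=0.2, vi=0.1, vd=-0.05, wp=1.0, wi=0.5, wd=0.1,
                action_sign=np.array([-1.0, -1.0]), bound=2.0))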
def sac(args, steps_per_epoch=1500, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=3e-4, batch_size=128, start_steps=1000, update_after=1000, update_every=1, num_test_episodes=10, max_ep_len=150, logger_kwargs=dict(), save_freq=1): logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed) torch.set_num_threads(torch.get_num_threads()) actor_critic = core.MLPActorCritic ac_kwargs = dict(hidden_sizes=[args.hid] * args.l) gamma = args.gamma seed = args.seed epochs = args.epochs logger_tensor = Logger(logdir=args.logdir, run_name="{}-{}".format(args.model_name, time.ctime())) logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) env = ML1.get_train_tasks('reach-v1') # Create an environment with task `pick_place` tasks = env.sample_tasks(1) # Sample a task (in this case, a goal variation) env.set_task(tasks[0]) # Set task test_env = ML1.get_train_tasks('reach-v1') # Create an environment with task `pick_place` tasks = env.sample_tasks(1) # Sample a task (in this case, a goal variation) test_env.set_task(tasks[0]) # Set task obs_dim = env.observation_space.shape act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Create actor-critic module and target networks ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) ac_targ = deepcopy(ac) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters()) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables (protip: try to get a feel for how different size networks behave!) 
var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2]) logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts) # Set up function for computing SAC Q-losses def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done'] q1 = ac.q1(o, a) q2 = ac.q2(o, a) # Bellman backup for Q functions with torch.no_grad(): # Target actions come from *current* policy a2, logp_a2 = ac.pi(o2) # Target Q-values q1_pi_targ = ac_targ.q1(o2, a2) q2_pi_targ = ac_targ.q2(o2, a2) q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) backup = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2) # MSE loss against Bellman backup loss_q1 = ((q1 - backup) ** 2).mean() loss_q2 = ((q2 - backup) ** 2).mean() loss_q = loss_q1 + loss_q2 # Useful info for logging q_info = dict(Q1Vals=q1.detach().numpy(), Q2Vals=q2.detach().numpy()) return loss_q, q_info # Set up function for computing SAC pi loss def compute_loss_pi(data): o = data['obs'] pi, logp_pi = ac.pi(o) q1_pi = ac.q1(o, pi) q2_pi = ac.q2(o, pi) q_pi = torch.min(q1_pi, q2_pi) # Entropy-regularized policy loss loss_pi = (alpha * logp_pi - q_pi).mean() # Useful info for logging pi_info = dict(LogPi=logp_pi.detach().numpy()) return loss_pi, pi_info # Set up optimizers for policy and q-function pi_optimizer = Adam(ac.pi.parameters(), lr=3e-4) q_optimizer = Adam(q_params, lr=3e-4) # Set up model saving logger.setup_pytorch_saver(ac) def update(data, logger_tensor, t): # First run one gradient descent step for Q1 and Q2 q_optimizer.zero_grad() loss_q, q_info = compute_loss_q(data) loss_q.backward() q_optimizer.step() # Record things logger.store(LossQ=loss_q.item(), **q_info) logger_tensor.log_value(t, loss_q.item(), "loss q") # Freeze Q-networks so you don't waste computational effort # computing gradients for them during the policy learning step. for p in q_params: p.requires_grad = False # Next run one gradient descent step for pi. pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) loss_pi.backward() pi_optimizer.step() # Unfreeze Q-networks so you can optimize it at next DDPG step. for p in q_params: p.requires_grad = True # Record things logger.store(LossPi=loss_pi.item(), **pi_info) logger_tensor.log_value(t, loss_pi.item(), "loss pi") # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) def get_action(o, deterministic=False): return ac.act(torch.as_tensor(o, dtype=torch.float32), deterministic) def test_agent(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) logger_tensor.log_value(t, ep_ret, "test ep reward") logger_tensor.log_value(t, ep_len, "test ep length") # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. 
Afterwards, # use the learned policy. if t > start_steps: a = get_action(o) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): logger_tensor.log_value(t, ep_ret, "reward") logging.info("> total_steps={} | reward={}".format(t, ep_ret)) logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Update handling if t >= update_after and t % update_every == 0: for j in range(update_every): batch = replay_buffer.sample_batch(batch_size) update(data=batch, logger_tensor = logger_tensor, t = t) # End of epoch handling if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger_tensor.log_value(t, epoch, "epoch") logger.dump_tabular(logger_tensor=logger_tensor,epoch = epoch) ac.save(args.save_model_dir, args.model_name)
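# A short PyTorch sketch of the in-place polyak averaging described in the
# comment inside update() above: target <- polyak * target + (1 - polyak) * main,
# performed with mul_/add_ under torch.no_grad() so no new tensors or graph nodes
# are created.
import torch
import torch.nn as nn

main, targ = nn.Linear(4, 2), nn.Linear(4, 2)
targ.load_state_dict(main.state_dict())
polyak = 0.995

with torch.no_grad():
    for p, p_targ in zip(main.parameters(), targ.parameters()):
        p_targ.data.mul_(polyak)
        p_targ.data.add_((1 - polyak) * p.data)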
q_target = q_target.detach() # loss loss = self.loss_function(q_eval, q_target) logger.store(loss=loss.item()) # backprop loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() return loss dqn = DQN() logdir = './DQN/%s' % args.games + '/%i' % int(time.time()) logger_kwargs = setup_logger_kwargs(args.games, args.seed, data_dir=logdir) logger = EpochLogger(**logger_kwargs) kwargs = { 'seed': args.seed, 'learning rate': args.lr, } logger.save_config(kwargs) # model load with check if LOAD and os.path.isfile(PRED_PATH) and os.path.isfile(TARGET_PATH): dqn.load_model() pkl_file = open(RESULT_PATH, 'rb') result = pickle.load(pkl_file) pkl_file.close() print('Load complete!') else: result = []
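# A minimal PyTorch sketch of the detached TD target used just above; the
# (1 - done) factor and the max over next-state Q-values are the standard DQN
# form and are assumed here, since only the detach step appears in this snippet.
import torch

gamma = 0.99
r = torch.tensor([1.0, 0.0])
done = torch.tensor([0.0, 1.0])
q_next = torch.tensor([[0.5, 1.2], [0.3, 0.1]])      # Q_target(s', .)

q_target = r + gamma * (1 - done) * q_next.max(dim=1).values
q_target = q_target.detach()                         # no gradient through the target
print(q_target)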
class gac_agent: def __init__(self, args, env, test_env, env_params): self.args = args # path to save the model if self.args.mmd: self.exp_name = '_'.join( (self.args.env_name, self.args.alg, 'mmd' + str(self.args.beta_mmd), 's' + str(self.args.seed), datetime.now().isoformat())) self.data_path = os.path.join( self.args.save_dir, '_'.join( (self.args.env_name, self.args.alg, 'mmd' + str(self.args.beta_mmd))), self.exp_name) else: self.exp_name = '_'.join( (self.args.env_name, self.args.alg, str(self.args.seed), datetime.now().isoformat())) self.data_path = os.path.join( self.args.save_dir, '_'.join( (self.args.env_name, self.args.alg)), self.exp_name) self.logger = EpochLogger(output_dir=self.data_path, exp_name=self.exp_name) self.logger.save_config(args) self.env = env self.test_env = test_env self.env_params = env_params # create the network self.actor_network = actor(env_params) self.critic_network1 = critic(env_params) self.critic_network2 = critic(env_params) self.advice_network1 = critic(env_params) self.advice_network2 = critic(env_params) # sync the networks across the cpus sync_networks(self.actor_network) sync_networks(self.critic_network1) sync_networks(self.critic_network2) sync_networks(self.advice_network1) sync_networks(self.advice_network2) # build up the target network # self.actor_target_network = actor(env_params) self.critic_target_network1 = critic(env_params) self.critic_target_network2 = critic(env_params) self.advice_target_network1 = critic(env_params) self.advice_target_network2 = critic(env_params) # load the weights into the target networks # self.actor_target_network.load_state_dict(self.actor_network.state_dict()) self.critic_target_network1.load_state_dict( self.critic_network1.state_dict()) self.critic_target_network2.load_state_dict( self.critic_network2.state_dict()) self.advice_target_network1.load_state_dict( self.advice_network1.state_dict()) self.advice_target_network2.load_state_dict( self.advice_network2.state_dict()) # if use gpu self.rank = MPI.COMM_WORLD.Get_rank() self.mpi_size = MPI.COMM_WORLD.Get_size() if args.cuda: device = 'cuda:{}'.format(self.rank % torch.cuda.device_count()) self.device = torch.device(device) if self.args.cuda: self.actor_network.cuda(self.device) self.critic_network1.cuda(self.device) self.critic_network2.cuda(self.device) # self.actor_target_network.cuda(self.device) self.critic_target_network1.cuda(self.device) self.critic_target_network2.cuda(self.device) self.advice_network1.cuda(self.device) self.advice_network2.cuda(self.device) self.advice_target_network1.cuda(self.device) self.advice_target_network2.cuda(self.device) # create the optimizer self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor) self.critic_optim1 = torch.optim.Adam( self.critic_network1.parameters(), lr=self.args.lr_critic) self.critic_optim2 = torch.optim.Adam( self.critic_network2.parameters(), lr=self.args.lr_critic) self.advice_optim1 = torch.optim.Adam( self.advice_network1.parameters(), lr=self.args.lr_critic) self.advice_optim2 = torch.optim.Adam( self.advice_network2.parameters(), lr=self.args.lr_critic) # create the replay buffer self.buffer = ReplayBuffer(self.env_params['obs'], self.env_params['action'], self.args.buffer_size) self.logger.setup_pytorch_saver(self.actor_network) self.obs_mean, self.obs_std = self.buffer.obs_mean, self.buffer.obs_std def learn(self): """ train the network """ # start to collect samples obs, ep_rew, ep_cost, ep_len, done = self.env.reset(), 0, 0, 0, False for epoch in 
range(self.args.n_epochs): for _ in range(self.args.n_train_rollouts): for t in range(self.env_params['max_timesteps']): with torch.no_grad(): input_tensor = self._preproc_inputs(obs) action = self.actor_network(input_tensor) action = action.detach().cpu().numpy().squeeze() # feed the actions into the environment next_obs, reward, done, info = self.env.step( action * self.env_params['action_max']) ep_rew += reward ep_cost += info['cost'] ep_len += 1 self.buffer.store(obs, action, reward, info['cost'], next_obs, done) obs = next_obs if done or (ep_len == self.env_params['max_timesteps'] ) or (t % self.args.n_batches == 0): self.buffer.obs_mean = MPI.COMM_WORLD.allreduce( self.buffer.obs_mean, op=MPI.SUM) / self.mpi_size self.buffer.obs_std = MPI.COMM_WORLD.allreduce( self.buffer.obs_std, op=MPI.SUM) / self.mpi_size self.obs_mean, self.obs_std = self.buffer.obs_mean, self.buffer.obs_std self.buffer.rew_mean = MPI.COMM_WORLD.allreduce( self.buffer.rew_mean, op=MPI.SUM) / self.mpi_size self.buffer.rew_std = MPI.COMM_WORLD.allreduce( self.buffer.rew_std, op=MPI.SUM) / self.mpi_size self.buffer.cost_mean = MPI.COMM_WORLD.allreduce( self.buffer.cost_mean, op=MPI.SUM) / self.mpi_size self.buffer.cost_std = MPI.COMM_WORLD.allreduce( self.buffer.cost_std, op=MPI.SUM) / self.mpi_size for _ in range(self.args.n_batches): # train the network self._update_network() # soft update # self._soft_update_target_network(self.actor_target_network, self.actor_network) self._soft_update_target_network( self.critic_target_network1, self.critic_network1, self.args.polyak) self._soft_update_target_network( self.critic_target_network2, self.critic_network2, self.args.polyak) if done or (ep_len == self.env_params['max_timesteps']): self.logger.store(EpReward=ep_rew, EpCost=ep_cost, EpLen=ep_len) obs, ep_rew, ep_cost, ep_len, done = self.env.reset( ), 0, 0, 0, False # start to do the evaluation self._test_policy() # save some necessary objects state = { 'observation_mean': self.buffer.obs_mean, 'observation_std': self.buffer.obs_std } self.logger.save_state(state, None) t = ((epoch + 1) * self.mpi_size * self.env_params['max_timesteps']) * self.args.n_train_rollouts self.logger.log_tabular('Epoch', epoch + 1) self.logger.log_tabular('EpReward', with_min_and_max=True) self.logger.log_tabular('EpCost', with_min_and_max=True) self.logger.log_tabular('EpLen', average_only=True) self.logger.log_tabular('TestReward', with_min_and_max=True) self.logger.log_tabular('TestCost', with_min_and_max=True) self.logger.log_tabular('TestLen', average_only=True) self.logger.log_tabular('LossPi', average_only=True) self.logger.log_tabular('LossQ', average_only=True) self.logger.log_tabular('MMDEntropy', average_only=True) self.logger.log_tabular('TotalEnvInteracts', t) self.logger.dump_tabular() if MPI.COMM_WORLD.Get_rank() == 0: print("obs_mean=", self.buffer.obs_mean) print("obs_std=", self.buffer.obs_std) print("reward_mean=", self.buffer.rew_mean) print("reward_std=", self.buffer.rew_std) print("cost_mean=", self.buffer.cost_mean) print("cost_std=", self.buffer.cost_std) # pre_process the inputs def _preproc_inputs(self, obs): inputs = ((np.array(obs) - self.obs_mean) / (self.obs_std + 1e-8)).clip(-self.args.clip_range, self.args.clip_range) inputs = torch.tensor(inputs, dtype=torch.float32).unsqueeze(0) if self.args.cuda: inputs = inputs.cuda(self.device) return inputs # soft update def _soft_update_target_network(self, target, source, polyak): for target_param, param in zip(target.parameters(), source.parameters()): 
target_param.data.copy_((1 - polyak) * param.data + polyak * target_param.data) # update the network def _update_network(self): # sample the episodes batches = self.buffer.sample(self.args.batch_size) o = torch.FloatTensor(batches['obs']).to(self.device) o2 = torch.FloatTensor(batches['obs2']).to(self.device) a = torch.FloatTensor(batches['act']).to(self.device) r = torch.FloatTensor(batches['rew']).to(self.device) c = torch.FloatTensor(batches['cost']).to(self.device) d = torch.FloatTensor(batches['done']).to(self.device) # calculate the target Q value function with torch.no_grad(): # do the normalization # concatenate the stuffs a2 = self.actor_network(o2) q_next_value1 = self.critic_target_network1(o2, a2).detach() q_next_value2 = self.critic_target_network2(o2, a2).detach() target_q_value = r + self.args.gamma * (1 - d) * torch.min( q_next_value1, q_next_value2) target_q_value = target_q_value.detach() p_next_value1 = self.advice_target_network1(o2, a2).detach() p_next_value2 = self.advice_target_network2(o2, a2).detach() target_p_value = -c + self.args.gamma * (1 - d) * torch.min( p_next_value1, p_next_value2) target_p_value = target_p_value.detach() # the q loss real_q_value1 = self.critic_network1(o, a) real_q_value2 = self.critic_network2(o, a) critic_loss1 = (target_q_value - real_q_value1).pow(2).mean() critic_loss2 = (target_q_value - real_q_value2).pow(2).mean() # the p loss real_p_value1 = self.advice_network1(o, a) real_p_value2 = self.advice_network2(o, a) advice_loss1 = (target_p_value - real_p_value1).pow(2).mean() advice_loss2 = (target_p_value - real_p_value2).pow(2).mean() # the actor loss o_exp = o.repeat(self.args.expand_batch, 1) a_exp = self.actor_network(o_exp) actor_loss = -torch.min(self.critic_network1(o_exp, a_exp), self.critic_network2(o_exp, a_exp)).mean() actor_loss -= self.args.advice * torch.min( self.advice_network1(o_exp, a_exp), self.advice_network2(o_exp, a_exp)).mean() mmd_entropy = torch.tensor(0.0) if self.args.mmd: # mmd is computationally expensive a_exp_reshape = a_exp.view(self.args.expand_batch, -1, a_exp.shape[-1]).transpose(0, 1) with torch.no_grad(): uniform_actions = (2 * torch.rand_like(a_exp_reshape) - 1) mmd_entropy = mmd(a_exp_reshape, uniform_actions) if self.args.beta_mmd <= 0.0: mmd_entropy.detach_() else: actor_loss += self.args.beta_mmd * mmd_entropy # start to update the network self.actor_optim.zero_grad() actor_loss.backward() sync_grads(self.actor_network) self.actor_optim.step() # update the critic_network self.critic_optim1.zero_grad() critic_loss1.backward() sync_grads(self.critic_network1) self.critic_optim1.step() self.critic_optim2.zero_grad() critic_loss2.backward() sync_grads(self.critic_network2) self.critic_optim2.step() self.logger.store(LossPi=actor_loss.detach().cpu().numpy()) self.logger.store(LossQ=(critic_loss1 + critic_loss2).detach().cpu().numpy()) self.logger.store(MMDEntropy=mmd_entropy.detach().cpu().numpy()) # do the evaluation def _test_policy(self): for _ in range(self.args.n_test_rollouts): obs, ep_rew, ep_cost, ep_len, done = self.test_env.reset( ), 0, 0, 0, False while (not done and ep_len < self.env_params['max_timesteps']): with torch.no_grad(): input_tensor = self._preproc_inputs(obs) action = self.actor_network(input_tensor, std=0.5) action = action.detach().cpu().numpy().squeeze() obs_next, reward, done, info = self.test_env.step(action) obs = obs_next ep_rew += reward ep_cost += info['cost'] ep_len += 1 self.logger.store(TestReward=ep_rew, TestCost=ep_cost, TestLen=ep_len)
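# The mmd() helper called in _update_network above is not shown in this file;
# the following is a generic (assumed) RBF-kernel MMD estimator between two
# batches of action samples shaped (batch, n_samples, act_dim), matching the
# shapes passed to mmd() above.
import torch

def rbf_mmd(x, y, sigma=1.0):
    def avg_kernel(a, b):
        d2 = torch.cdist(a, b).pow(2)                # pairwise squared distances
        return torch.exp(-d2 / (2 * sigma ** 2)).mean(dim=(-2, -1))
    return (avg_kernel(x, x) + avg_kernel(y, y) - 2 * avg_kernel(x, y)).mean()

x = torch.tanh(torch.randn(5, 16, 3))                # policy samples in [-1, 1]
y = 2 * torch.rand(5, 16, 3) - 1                     # uniform reference samples
print(rbf_mmd(x, y).item())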