def vpg(env_config, ac_type, ac_kwargs, gamma, lam, epochs, steps_per_epoch,
        lr, train_v_iters, max_ep_len, logger_kwargs, seed):
    logger = EpochLogger(**logger_kwargs)
    configs = locals().copy()
    configs.pop("logger")
    logger.save_config(configs)

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = make_env(env_config)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    obs_ph, a_ph, adv_ph, ret_ph, logp_old_ph = core.placeholders(
        obs_dim, act_dim, None, None, None)

    actor_critic = gaussian_mlp_actor_critic
    pi, logp, logp_pi, v = actor_critic(obs_ph, a_ph, **ac_kwargs)

    all_phs = [obs_ph, a_ph, adv_ph, ret_ph, logp_old_ph]
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    buf = VPGBuffer(obs_dim, act_dim, steps_per_epoch, gamma, lam)

    # VPG objectives
    pi_loss = -tf.reduce_mean(logp * adv_ph)
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(logp_old_ph - logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(-logp)  # a sample estimate for entropy, also easy to compute

    # Optimizers
    train_pi = tf.train.AdamOptimizer(learning_rate=lr).minimize(pi_loss)
    train_v = tf.train.AdamOptimizer(learning_rate=lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    def update():
        buffer_data = buf.get()
        # util.plot_adv(data[0] * act_high, data[1], logger.output_dir + "/ep_adv%s.png" % epoch)
        inputs = {k: v for k, v in zip(all_phs, buffer_data)}

        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Training
        sess.run(train_pi, feed_dict=inputs)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, v_new = sess.run(
            [pi_loss, v_loss, approx_kl, v], feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    real_action = env.action_space.default()

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops,
                                      feed_dict={obs_ph: o.reshape(1, -1)})

            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            delta = np.exp(a[0])
            delta = np.clip(delta, 0.9, 1.1)
            real_action = env.action_space.clip(real_action * delta)
            o, r, d, _ = env.step(real_action)
            ep_ret += r
            ep_len += 1

            if ep_len == max_ep_len or t == steps_per_epoch - 1:
                last_val = sess.run(v, feed_dict={obs_ph: o.reshape(1, -1)})
                # print(last_val)
                buf.finish_path(last_val)
                logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                real_action = env.action_space.default()

        # Perform VPG update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
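# Sketch (not from the original source): the VPGBuffer used above is assumed to
# compute GAE-Lambda advantages and rewards-to-go inside finish_path(); this
# minimal NumPy version illustrates that calculation under those assumptions.
import numpy as np


def discount_cumsum(x, discount):
    # y[t] = x[t] + discount * x[t+1] + discount^2 * x[t+2] + ...
    x = np.asarray(x, dtype=np.float64)
    out = np.zeros_like(x)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        out[t] = running
    return out


def gae_advantages(rews, vals, last_val, gamma=0.99, lam=0.97):
    # Append the bootstrap value so the last transition has a next-state value.
    rews = np.append(np.asarray(rews, dtype=np.float64), last_val)
    vals = np.append(np.asarray(vals, dtype=np.float64), last_val)
    deltas = rews[:-1] + gamma * vals[1:] - vals[:-1]
    adv = discount_cumsum(deltas, gamma * lam)        # GAE-Lambda advantages
    ret = discount_cumsum(rews, gamma)[:-1]           # rewards-to-go, targets for V
    return adv, ret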
def sac(args, steps_per_epoch=1500, replay_size=int(1e6), gamma=0.99,
        polyak=0.995, lr=1e-3, alpha=3e-4, batch_size=128, start_steps=1000,
        update_after=1000, update_every=1, num_test_episodes=10,
        max_ep_len=150, logger_kwargs=dict(), save_freq=1):
    logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed)

    torch.set_num_threads(torch.get_num_threads())
    actor_critic = core.MLPActorCritic
    ac_kwargs = dict(hidden_sizes=[args.hid] * args.l)
    gamma = args.gamma
    seed = args.seed
    epochs = args.epochs
    logger_tensor = Logger(logdir=args.logdir,
                           run_name="{}-{}".format(args.model_name, time.ctime()))

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    env = ML1.get_train_tasks('reach-v1')  # Create an environment for the `reach-v1` task
    tasks = env.sample_tasks(1)            # Sample a task (in this case, a goal variation)
    env.set_task(tasks[0])                 # Set task

    test_env = ML1.get_train_tasks('reach-v1')  # Create a separate evaluation environment
    tasks = env.sample_tasks(1)                 # Sample a task (in this case, a goal variation)
    test_env.set_task(tasks[0])                 # Set task

    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Create actor-critic module and target networks
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)
    ac_targ = deepcopy(ac)

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False

    # List of parameters for both Q-networks (save this for convenience)
    q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters())

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2])
    logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts)

    # Set up function for computing SAC Q-losses
    def compute_loss_q(data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done']

        q1 = ac.q1(o, a)
        q2 = ac.q2(o, a)

        # Bellman backup for Q functions
        with torch.no_grad():
            # Target actions come from *current* policy
            a2, logp_a2 = ac.pi(o2)

            # Target Q-values
            q1_pi_targ = ac_targ.q1(o2, a2)
            q2_pi_targ = ac_targ.q2(o2, a2)
            q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
            backup = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2)

        # MSE loss against Bellman backup
        loss_q1 = ((q1 - backup) ** 2).mean()
        loss_q2 = ((q2 - backup) ** 2).mean()
        loss_q = loss_q1 + loss_q2

        # Useful info for logging
        q_info = dict(Q1Vals=q1.detach().numpy(), Q2Vals=q2.detach().numpy())

        return loss_q, q_info

    # Set up function for computing SAC pi loss
    def compute_loss_pi(data):
        o = data['obs']
        pi, logp_pi = ac.pi(o)
        q1_pi = ac.q1(o, pi)
        q2_pi = ac.q2(o, pi)
        q_pi = torch.min(q1_pi, q2_pi)

        # Entropy-regularized policy loss
        loss_pi = (alpha * logp_pi - q_pi).mean()

        # Useful info for logging
        pi_info = dict(LogPi=logp_pi.detach().numpy())

        return loss_pi, pi_info

    # Set up optimizers for policy and q-function
    pi_optimizer = Adam(ac.pi.parameters(), lr=3e-4)
    q_optimizer = Adam(q_params, lr=3e-4)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update(data, logger_tensor, t):
        # First run one gradient descent step for Q1 and Q2
        q_optimizer.zero_grad()
        loss_q, q_info = compute_loss_q(data)
        loss_q.backward()
        q_optimizer.step()

        # Record things
        logger.store(LossQ=loss_q.item(), **q_info)
        logger_tensor.log_value(t, loss_q.item(), "loss q")

        # Freeze Q-networks so you don't waste computational effort
        # computing gradients for them during the policy learning step.
        for p in q_params:
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        pi_optimizer.zero_grad()
        loss_pi, pi_info = compute_loss_pi(data)
        loss_pi.backward()
        pi_optimizer.step()

        # Unfreeze Q-networks so you can optimize them at the next SAC step.
        for p in q_params:
            p.requires_grad = True

        # Record things
        logger.store(LossPi=loss_pi.item(), **pi_info)
        logger_tensor.log_value(t, loss_pi.item(), "loss pi")

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                # NB: We use in-place operations "mul_", "add_" to update target
                # params, as opposed to "mul" and "add", which would make new tensors.
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)

    def get_action(o, deterministic=False):
        return ac.act(torch.as_tensor(o, dtype=torch.float32), deterministic)

    def test_agent():
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)
            logger_tensor.log_value(t, ep_ret, "test ep reward")
            logger_tensor.log_value(t, ep_len, "test ep length")

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration.
        # Afterwards, use the learned policy.
        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger_tensor.log_value(t, ep_ret, "reward")
            logging.info("> total_steps={} | reward={}".format(t, ep_ret))
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0

        # Update handling
        if t >= update_after and t % update_every == 0:
            for j in range(update_every):
                batch = replay_buffer.sample_batch(batch_size)
                update(data=batch, logger_tensor=logger_tensor, t=t)

        # End of epoch handling
        if (t + 1) % steps_per_epoch == 0:
            epoch = (t + 1) // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger_tensor.log_value(t, epoch, "epoch")
            logger.dump_tabular(logger_tensor=logger_tensor, epoch=epoch)

    ac.save(args.save_model_dir, args.model_name)
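# Sketch (not from the source): the in-place polyak averaging performed at the
# end of update() above, pulled out into a standalone helper on two torch modules.
import torch
import torch.nn as nn


def polyak_update(net, targ_net, polyak=0.995):
    with torch.no_grad():
        for p, p_targ in zip(net.parameters(), targ_net.parameters()):
            # In-place ops update the existing target tensors instead of
            # allocating new ones.
            p_targ.data.mul_(polyak)
            p_targ.data.add_((1 - polyak) * p.data)


# Example usage:
#   q, q_targ = nn.Linear(4, 1), nn.Linear(4, 1)
#   q_targ.load_state_dict(q.state_dict())   # start the target at the same weights
#   polyak_update(q, q_targ)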
class gac_agent: def __init__(self, args, env, test_env, env_params): self.args = args # path to save the model if self.args.mmd: self.exp_name = '_'.join( (self.args.env_name, self.args.alg, 'mmd' + str(self.args.beta_mmd), 's' + str(self.args.seed), datetime.now().isoformat())) self.data_path = os.path.join( self.args.save_dir, '_'.join( (self.args.env_name, self.args.alg, 'mmd' + str(self.args.beta_mmd))), self.exp_name) else: self.exp_name = '_'.join( (self.args.env_name, self.args.alg, str(self.args.seed), datetime.now().isoformat())) self.data_path = os.path.join( self.args.save_dir, '_'.join( (self.args.env_name, self.args.alg)), self.exp_name) self.logger = EpochLogger(output_dir=self.data_path, exp_name=self.exp_name) self.logger.save_config(args) self.env = env self.test_env = test_env self.env_params = env_params # create the network self.actor_network = actor(env_params) self.critic_network1 = critic(env_params) self.critic_network2 = critic(env_params) self.advice_network1 = critic(env_params) self.advice_network2 = critic(env_params) # sync the networks across the cpus sync_networks(self.actor_network) sync_networks(self.critic_network1) sync_networks(self.critic_network2) sync_networks(self.advice_network1) sync_networks(self.advice_network2) # build up the target network # self.actor_target_network = actor(env_params) self.critic_target_network1 = critic(env_params) self.critic_target_network2 = critic(env_params) self.advice_target_network1 = critic(env_params) self.advice_target_network2 = critic(env_params) # load the weights into the target networks # self.actor_target_network.load_state_dict(self.actor_network.state_dict()) self.critic_target_network1.load_state_dict( self.critic_network1.state_dict()) self.critic_target_network2.load_state_dict( self.critic_network2.state_dict()) self.advice_target_network1.load_state_dict( self.advice_network1.state_dict()) self.advice_target_network2.load_state_dict( self.advice_network2.state_dict()) # if use gpu self.rank = MPI.COMM_WORLD.Get_rank() self.mpi_size = MPI.COMM_WORLD.Get_size() if args.cuda: device = 'cuda:{}'.format(self.rank % torch.cuda.device_count()) self.device = torch.device(device) if self.args.cuda: self.actor_network.cuda(self.device) self.critic_network1.cuda(self.device) self.critic_network2.cuda(self.device) # self.actor_target_network.cuda(self.device) self.critic_target_network1.cuda(self.device) self.critic_target_network2.cuda(self.device) self.advice_network1.cuda(self.device) self.advice_network2.cuda(self.device) self.advice_target_network1.cuda(self.device) self.advice_target_network2.cuda(self.device) # create the optimizer self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor) self.critic_optim1 = torch.optim.Adam( self.critic_network1.parameters(), lr=self.args.lr_critic) self.critic_optim2 = torch.optim.Adam( self.critic_network2.parameters(), lr=self.args.lr_critic) self.advice_optim1 = torch.optim.Adam( self.advice_network1.parameters(), lr=self.args.lr_critic) self.advice_optim2 = torch.optim.Adam( self.advice_network2.parameters(), lr=self.args.lr_critic) # create the replay buffer self.buffer = ReplayBuffer(self.env_params['obs'], self.env_params['action'], self.args.buffer_size) self.logger.setup_pytorch_saver(self.actor_network) self.obs_mean, self.obs_std = self.buffer.obs_mean, self.buffer.obs_std def learn(self): """ train the network """ # start to collect samples obs, ep_rew, ep_cost, ep_len, done = self.env.reset(), 0, 0, 0, False for epoch in 
range(self.args.n_epochs): for _ in range(self.args.n_train_rollouts): for t in range(self.env_params['max_timesteps']): with torch.no_grad(): input_tensor = self._preproc_inputs(obs) action = self.actor_network(input_tensor) action = action.detach().cpu().numpy().squeeze() # feed the actions into the environment next_obs, reward, done, info = self.env.step( action * self.env_params['action_max']) ep_rew += reward ep_cost += info['cost'] ep_len += 1 self.buffer.store(obs, action, reward, info['cost'], next_obs, done) obs = next_obs if done or (ep_len == self.env_params['max_timesteps'] ) or (t % self.args.n_batches == 0): self.buffer.obs_mean = MPI.COMM_WORLD.allreduce( self.buffer.obs_mean, op=MPI.SUM) / self.mpi_size self.buffer.obs_std = MPI.COMM_WORLD.allreduce( self.buffer.obs_std, op=MPI.SUM) / self.mpi_size self.obs_mean, self.obs_std = self.buffer.obs_mean, self.buffer.obs_std self.buffer.rew_mean = MPI.COMM_WORLD.allreduce( self.buffer.rew_mean, op=MPI.SUM) / self.mpi_size self.buffer.rew_std = MPI.COMM_WORLD.allreduce( self.buffer.rew_std, op=MPI.SUM) / self.mpi_size self.buffer.cost_mean = MPI.COMM_WORLD.allreduce( self.buffer.cost_mean, op=MPI.SUM) / self.mpi_size self.buffer.cost_std = MPI.COMM_WORLD.allreduce( self.buffer.cost_std, op=MPI.SUM) / self.mpi_size for _ in range(self.args.n_batches): # train the network self._update_network() # soft update # self._soft_update_target_network(self.actor_target_network, self.actor_network) self._soft_update_target_network( self.critic_target_network1, self.critic_network1, self.args.polyak) self._soft_update_target_network( self.critic_target_network2, self.critic_network2, self.args.polyak) if done or (ep_len == self.env_params['max_timesteps']): self.logger.store(EpReward=ep_rew, EpCost=ep_cost, EpLen=ep_len) obs, ep_rew, ep_cost, ep_len, done = self.env.reset( ), 0, 0, 0, False # start to do the evaluation self._test_policy() # save some necessary objects state = { 'observation_mean': self.buffer.obs_mean, 'observation_std': self.buffer.obs_std } self.logger.save_state(state, None) t = ((epoch + 1) * self.mpi_size * self.env_params['max_timesteps']) * self.args.n_train_rollouts self.logger.log_tabular('Epoch', epoch + 1) self.logger.log_tabular('EpReward', with_min_and_max=True) self.logger.log_tabular('EpCost', with_min_and_max=True) self.logger.log_tabular('EpLen', average_only=True) self.logger.log_tabular('TestReward', with_min_and_max=True) self.logger.log_tabular('TestCost', with_min_and_max=True) self.logger.log_tabular('TestLen', average_only=True) self.logger.log_tabular('LossPi', average_only=True) self.logger.log_tabular('LossQ', average_only=True) self.logger.log_tabular('MMDEntropy', average_only=True) self.logger.log_tabular('TotalEnvInteracts', t) self.logger.dump_tabular() if MPI.COMM_WORLD.Get_rank() == 0: print("obs_mean=", self.buffer.obs_mean) print("obs_std=", self.buffer.obs_std) print("reward_mean=", self.buffer.rew_mean) print("reward_std=", self.buffer.rew_std) print("cost_mean=", self.buffer.cost_mean) print("cost_std=", self.buffer.cost_std) # pre_process the inputs def _preproc_inputs(self, obs): inputs = ((np.array(obs) - self.obs_mean) / (self.obs_std + 1e-8)).clip(-self.args.clip_range, self.args.clip_range) inputs = torch.tensor(inputs, dtype=torch.float32).unsqueeze(0) if self.args.cuda: inputs = inputs.cuda(self.device) return inputs # soft update def _soft_update_target_network(self, target, source, polyak): for target_param, param in zip(target.parameters(), source.parameters()): 
target_param.data.copy_((1 - polyak) * param.data + polyak * target_param.data) # update the network def _update_network(self): # sample the episodes batches = self.buffer.sample(self.args.batch_size) o = torch.FloatTensor(batches['obs']).to(self.device) o2 = torch.FloatTensor(batches['obs2']).to(self.device) a = torch.FloatTensor(batches['act']).to(self.device) r = torch.FloatTensor(batches['rew']).to(self.device) c = torch.FloatTensor(batches['cost']).to(self.device) d = torch.FloatTensor(batches['done']).to(self.device) # calculate the target Q value function with torch.no_grad(): # do the normalization # concatenate the stuffs a2 = self.actor_network(o2) q_next_value1 = self.critic_target_network1(o2, a2).detach() q_next_value2 = self.critic_target_network2(o2, a2).detach() target_q_value = r + self.args.gamma * (1 - d) * torch.min( q_next_value1, q_next_value2) target_q_value = target_q_value.detach() p_next_value1 = self.advice_target_network1(o2, a2).detach() p_next_value2 = self.advice_target_network2(o2, a2).detach() target_p_value = -c + self.args.gamma * (1 - d) * torch.min( p_next_value1, p_next_value2) target_p_value = target_p_value.detach() # the q loss real_q_value1 = self.critic_network1(o, a) real_q_value2 = self.critic_network2(o, a) critic_loss1 = (target_q_value - real_q_value1).pow(2).mean() critic_loss2 = (target_q_value - real_q_value2).pow(2).mean() # the p loss real_p_value1 = self.advice_network1(o, a) real_p_value2 = self.advice_network2(o, a) advice_loss1 = (target_p_value - real_p_value1).pow(2).mean() advice_loss2 = (target_p_value - real_p_value2).pow(2).mean() # the actor loss o_exp = o.repeat(self.args.expand_batch, 1) a_exp = self.actor_network(o_exp) actor_loss = -torch.min(self.critic_network1(o_exp, a_exp), self.critic_network2(o_exp, a_exp)).mean() actor_loss -= self.args.advice * torch.min( self.advice_network1(o_exp, a_exp), self.advice_network2(o_exp, a_exp)).mean() mmd_entropy = torch.tensor(0.0) if self.args.mmd: # mmd is computationally expensive a_exp_reshape = a_exp.view(self.args.expand_batch, -1, a_exp.shape[-1]).transpose(0, 1) with torch.no_grad(): uniform_actions = (2 * torch.rand_like(a_exp_reshape) - 1) mmd_entropy = mmd(a_exp_reshape, uniform_actions) if self.args.beta_mmd <= 0.0: mmd_entropy.detach_() else: actor_loss += self.args.beta_mmd * mmd_entropy # start to update the network self.actor_optim.zero_grad() actor_loss.backward() sync_grads(self.actor_network) self.actor_optim.step() # update the critic_network self.critic_optim1.zero_grad() critic_loss1.backward() sync_grads(self.critic_network1) self.critic_optim1.step() self.critic_optim2.zero_grad() critic_loss2.backward() sync_grads(self.critic_network2) self.critic_optim2.step() self.logger.store(LossPi=actor_loss.detach().cpu().numpy()) self.logger.store(LossQ=(critic_loss1 + critic_loss2).detach().cpu().numpy()) self.logger.store(MMDEntropy=mmd_entropy.detach().cpu().numpy()) # do the evaluation def _test_policy(self): for _ in range(self.args.n_test_rollouts): obs, ep_rew, ep_cost, ep_len, done = self.test_env.reset( ), 0, 0, 0, False while (not done and ep_len < self.env_params['max_timesteps']): with torch.no_grad(): input_tensor = self._preproc_inputs(obs) action = self.actor_network(input_tensor, std=0.5) action = action.detach().cpu().numpy().squeeze() obs_next, reward, done, info = self.test_env.step(action) obs = obs_next ep_rew += reward ep_cost += info['cost'] ep_len += 1 self.logger.store(TestReward=ep_rew, TestCost=ep_cost, TestLen=ep_len)
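# Sketch (hypothetical): the mmd() function called in _update_network above is not
# shown in this file; this is one common choice, a biased RBF-kernel MMD estimator
# between two batches of action samples, with an assumed bandwidth sigma.
import torch


def rbf_mmd(x, y, sigma=1.0):
    # x, y: tensors of shape (batch, n_samples, act_dim)
    def pdist2(a, b):
        # squared Euclidean distances between all sample pairs, per batch element
        return ((a.unsqueeze(2) - b.unsqueeze(1)) ** 2).sum(-1)

    k_xx = torch.exp(-pdist2(x, x) / (2 * sigma ** 2)).mean()
    k_yy = torch.exp(-pdist2(y, y) / (2 * sigma ** 2)).mean()
    k_xy = torch.exp(-pdist2(x, y) / (2 * sigma ** 2)).mean()
    return k_xx + k_yy - 2 * k_xy


# e.g. rbf_mmd(torch.randn(32, 16, 2), 2 * torch.rand(32, 16, 2) - 1)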
class Test:
    def __init__(self, args):
        self.args = args
        self.env = gym.make(args.env_name)
        self.env_params = get_env_params(self.env)
        self.video_file = 'data_test/test_video'
        self.output_dir = 'data_test'
        self.exp_name = 'test'
        self.logger = EpochLogger(output_dir=self.output_dir,
                                  exp_name=self.exp_name)
        # self.env = wrappers.Monitor(self.env, self.video_file, force=True)
        device = 'cuda' if args.cuda else 'cpu'
        self.device = torch.device(device)

        # load saved normalization statistics
        data_file = os.path.join(args.load_fold, 'vars.pkl')
        data = joblib.load(data_file)
        ## load obs_mean obs_std g_mean g_std
        self.obs_mean = data['observation_mean']
        self.obs_std = data['observation_std']

        ## load policy model
        model = {
            'ddpg': actor,
            'td3': actor,
            'sac': actor_sac,
            'gac': actor_gac
        }
        self.actor_network = model[args.alg](self.env_params).to(self.device)
        model_file = os.path.join(args.load_fold, 'pyt_save', 'model.pt')
        self.actor_network.load_state_dict(torch.load(model_file))

    def run(self):
        self._eval_agent()
        self.logger.log_tabular('EpReward')
        self.logger.log_tabular('EpCost')
        self.logger.dump_tabular()

    def _preproc_inputs(self, obs):
        obs_norm = np.clip((obs - self.obs_mean) / self.obs_std,
                           -self.args.clip_range, self.args.clip_range)
        inputs = torch.tensor(obs_norm, dtype=torch.float32).unsqueeze(0)
        if self.args.cuda:
            inputs = inputs.cuda(self.device)
        return inputs

    def _eval_agent(self):
        for _ in range(self.args.n_test_rollouts):
            obs, ep_reward, ep_cost = self.env.reset(), 0, 0
            for _ in range(self.env_params['max_timesteps']):
                if self.args.render:
                    self.env.render()
                    time.sleep(1e-3)
                with torch.no_grad():
                    input_tensor = self._preproc_inputs(obs)
                    if self.args.alg == 'gac':
                        pi = self.actor_network(input_tensor, std=0.5)
                    elif self.args.alg == 'sac':
                        pi, _ = self.actor_network(input_tensor)
                    else:
                        pi = self.actor_network(input_tensor)
                    # convert the actions
                    actions = pi.detach().cpu().numpy().squeeze()
                # gym-style step returns (obs, reward, done, info); the cost
                # signal lives in info, as in the training code above
                obs, reward, done, info = self.env.step(actions)
                ep_reward += reward
                ep_cost += info['cost']
            self.logger.store(EpReward=ep_reward, EpCost=ep_cost)
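# Sketch (not part of the source): the observation preprocessing shared by the
# training and test code above, reduced to a standalone NumPy helper; the default
# clip_range and the 1e-8 epsilon are assumed example values.
import numpy as np


def normalize_obs(obs, obs_mean, obs_std, clip_range=5.0):
    # standardize, then clip to keep extreme observations from dominating
    obs_norm = (np.asarray(obs) - obs_mean) / (obs_std + 1e-8)
    return np.clip(obs_norm, -clip_range, clip_range)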
def ppo(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. 
(Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ # GAedit # Special function to avoid certain slowdowns from PyTorch + MPI combo. # setup_pytorch_for_mpi() # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # GAedit # Seed seed = 333 torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env_fn() #GAedit # obs_dim = env.observation_space.shape # act_dim = env.action_space.shape # get the default brain brain_name = env.brain_names[0] brain = env.brains[brain_name] # reset the environment env_info = env.reset(train_mode=True)[brain_name] # number of agents num_agents = len(env_info.agents) # size of each action act_dim = brain.vector_action_space_size # examine the state space obs_dim = env_info.vector_observations.shape[1] #GAedit # Create actor-critic module # ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) ac = actor_critic(obs_dim, act_dim, **ac_kwargs) # GAedit - don't think we need to sync # Sync params across processes # sync_params(ac) # Count variables var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # Set up experience buffer # GAedit # local_steps_per_epoch = int(steps_per_epoch / num_procs()) local_steps_per_epoch = int(steps_per_epoch / num_agents) #GAedit buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch * num_agents, gamma, lam) # buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Set up function for computing PPO policy loss def compute_loss_pi(data): obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data[ 'logp'] # Policy loss pi, logp = ac.pi(obs, act) ratio = torch.exp(logp - logp_old) clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv loss_pi = -(torch.min(ratio * adv, clip_adv)).mean() # Useful extra info approx_kl = (logp_old - logp).mean().item() ent = pi.entropy().mean().item() clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio) clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item() pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac) return loss_pi, pi_info # Set up function for computing value loss def compute_loss_v(data): obs, ret = data['obs'], data['ret'] return ((ac.v(obs) - ret)**2).mean() # Set up optimizers for policy and value function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(): data = buf.get() pi_l_old, pi_info_old = compute_loss_pi(data) pi_l_old = pi_l_old.item() v_l_old = compute_loss_v(data).item() # Train policy with multiple steps of gradient descent for i in range(train_pi_iters): pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) #GAedit # kl = mpi_avg(pi_info['kl']) kl = pi_info['kl'] if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break loss_pi.backward() #GAedit # mpi_avg_grads(ac.pi) # average grads across MPI processes # ac.pi.mean() pi_optimizer.step() logger.store(StopIter=i) # Value function learning for i in range(train_v_iters): vf_optimizer.zero_grad() loss_v = compute_loss_v(data) loss_v.backward() #GAedit # mpi_avg_grads(ac.v) # average grads across MPI processes vf_optimizer.step() # Log changes from update kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf'] logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old)) # Prepare for interaction with environment start_time = time.time() #GAedit # o, ep_ret, ep_len = env.reset(), 0, 0 ep_ret, ep_len = 0, 0 env_info = env.reset(train_mode=True)[brain_name] o = env_info.vector_observations # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32)) # GAedit # next_o, r, d, _ = env.step(a) env_info = env.step(a)[brain_name] next_o, r, d = env_info.vector_observations, env_info.rewards, env_info.local_done #GAedit # ep_ret += r ep_ret += np.mean(r) ep_len += 1 # save and log #GAedit # buf.store(o, a, r, v, logp) for i in range(20): buf.store(o[i], a[i], r[i], v[i], logp[i]) logger.store(VVals=v) # Update obs (critical!) o = next_o timeout = ep_len == max_ep_len # GAedit # terminal = d or timeout terminal = any(d) or timeout epoch_ended = t == local_steps_per_epoch - 1 if terminal or epoch_ended: if epoch_ended and not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32)) else: v = 0 buf.finish_path(v) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) # GAedit # o, ep_ret, ep_len = env.reset(), 0, 0 ep_ret, ep_len = 0, 0 env_info = env.reset(train_mode=True)[brain_name] o = env_info.vector_observations # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
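# Sketch (illustration only, independent of the Unity-specific adaptations above):
# the PPO-clip policy objective computed by compute_loss_pi, isolated on plain
# tensors to show how the probability ratio and the clipped advantage interact.
import torch


def ppo_clip_loss(logp, logp_old, adv, clip_ratio=0.2):
    ratio = torch.exp(logp - logp_old)
    clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
    # maximize the clipped surrogate, i.e. minimize its negative
    return -(torch.min(ratio * adv, clip_adv)).mean()


# e.g. ppo_clip_loss(torch.randn(8), torch.randn(8), torch.randn(8))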
def ppo(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=2000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): global RENDER, BONUS """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. 
lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ # Reachability Trainer r_network = R_Network().to(device) trainer = R_Network_Trainer(r_network=r_network, exp_name="random1") episodic_memory = EpisodicMemory(embedding_shape=[EMBEDDING_DIM]) # Special function to avoid certain slowdowns from PyTorch + MPI combo. setup_pytorch_for_mpi() # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Random seed seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env_fn() observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=(3, 64, 64)) action_space = gym.spaces.Discrete(3) obs_dim = observation_space.shape act_dim = action_space.shape # Create actor-critic module ac = actor_critic(observation_space, action_space, **ac_kwargs) # Sync params across processes sync_params(ac) # Count variables var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Set up function for computing PPO policy loss def compute_loss_pi(data): obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data[ 'logp'] # Policy loss pi, logp = ac.pi(obs, act) ratio = torch.exp(logp - logp_old) clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv loss_pi = -(torch.min(ratio * adv, clip_adv)).mean() # Useful extra info approx_kl = (logp_old - logp).mean().item() ent = pi.entropy().mean().item() clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio) clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item() pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac) return loss_pi, pi_info # Set up function for computing value loss def compute_loss_v(data): obs, ret = data['obs'], data['ret'] return ((ac.v(obs) - ret)**2).mean() # Set up optimizers for policy and value function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(): data = buf.get() pi_l_old, pi_info_old = compute_loss_pi(data) pi_l_old = pi_l_old.item() v_l_old = compute_loss_v(data).item() # Train policy with multiple steps of gradient descent for i in range(train_pi_iters): pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) # Entropy bonus loss_pi += pi_info['ent'] * 0.0021 kl = mpi_avg(pi_info['kl']) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break loss_pi.backward() mpi_avg_grads(ac.pi) # average grads across MPI processes pi_optimizer.step() logger.store(StopIter=i) # Value function learning for i in range(train_v_iters): vf_optimizer.zero_grad() loss_v = compute_loss_v(data) loss_v.backward() mpi_avg_grads(ac.v) # average grads across MPI processes vf_optimizer.step() # Log changes from update kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf'] logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old)) # Prepare for interaction with environment start_time = time.time() o, _ = env.reset() env.render() o = o.astype(np.float32) / 255. o = o.transpose(2, 0, 1) ep_ret, ep_len = 0, 0 indices = [] # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): state = torch.as_tensor(o[np.newaxis, ...], dtype=torch.float32) a, v, logp = ac.step(state) next_o, r, d, info = env.step(a) next_o = next_o.astype(np.float32) / 255. d = ep_len == max_ep_len trainer.store_new_state([next_o], [r], [d], [None]) r_network.eval() with torch.no_grad(): state_embedding = r_network.embed_observation( torch.FloatTensor([o]).to(device)).cpu().numpy()[0] aggregated, _, _ = similarity_to_memory( state_embedding, episodic_memory, r_network) curiosity_bonus = 0.03 * (0.5 - aggregated) if BONUS: print(f'{curiosity_bonus:.3f}') if curiosity_bonus > 0 or len(episodic_memory) == 0: idx = episodic_memory.store_new_state(state_embedding) x = int(env.map_scale * info['pose']['x']) y = int(env.map_scale * info['pose']['y']) if idx == len(indices): indices.append((x, y)) else: indices[idx] = (x, y) r_network.train() next_o = next_o.transpose(2, 0, 1) ep_ret += r + curiosity_bonus ep_len += 1 # save and log buf.store(o, a, r, v, logp) logger.store(VVals=v) k = cv2.waitKey(1) if k == ord('s'): RENDER = 1 - RENDER elif k == ord('b'): BONUS = 1 - BONUS if RENDER: env.info['map'] = cv2.flip(env.info['map'], 0) for index in indices: cv2.circle(env.info['map'], index, 3, (0, 0, 255), -1) env.info['map'] = cv2.flip(env.info['map'], 0) env.render() # Update obs (critical!) o = next_o timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t == local_steps_per_epoch - 1 if terminal or epoch_ended: if epoch_ended and not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: state = torch.as_tensor(o[np.newaxis, ...], dtype=torch.float32) _, v, _ = ac.step(state) else: v = 0 buf.finish_path(v) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) print(ep_ret, ep_len, len(episodic_memory)) ep_ret, ep_len = 0, 0 o, _ = env.reset() o = o.astype(np.float32) / 255. o = o.transpose(2, 0, 1) episodic_memory.reset() indices = [] # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! 
if epoch > 4: update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() else: buf.get()
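# Sketch (assumptions: the similarity score returned by similarity_to_memory is in
# [0, 1], with values near 1 meaning "already well covered by episodic memory"; the
# R-network itself is not reproduced here): the episodic-curiosity bonus used in
# the loop above, isolated as a pure function.
def curiosity_bonus(similarity_to_memory_score, scale=0.03, threshold=0.5):
    # Positive bonus for novel states (low similarity), negative for familiar ones.
    return scale * (threshold - similarity_to_memory_score)


# e.g. curiosity_bonus(0.1) ->  0.012   (novel state)
#      curiosity_bonus(0.9) -> -0.012   (familiar state)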
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, trials_per_epoch=2500, steps_per_trial=100, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=1000, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph # x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) x_ph = tf.placeholder(dtype=tf.float32, shape=(None, None, 1), name='x_ph') a_ph = tf.placeholder(dtype=tf.int32, shape=(None, None), name='a_ph') # adv_ph, ret_ph, logp_old_ph, rew_ph = core.placeholders(None, None, None, 1) adv_ph = tf.placeholder(dtype=tf.float32, shape=(None, None), name='adv_ph') ret_ph = tf.placeholder(dtype=tf.float32, shape=(None, None), name='ret_ph') logp_old_ph = tf.placeholder(dtype=tf.float32, shape=(None, None), name='logp_old_ph') rew_ph = tf.placeholder(dtype=tf.float32, shape=(None, None, 1), name='rew_ph') pi_state_ph = tf.placeholder(dtype=tf.float32, shape=(None, NUM_GRU_UNITS), name='pi_state_ph') v_state_ph = tf.placeholder(dtype=tf.float32, shape=(None, NUM_GRU_UNITS), name='v_state_ph') # Initialize rnn states for pi and v # Main outputs from computation graph pi, logp, logp_pi, v, new_pi_state, new_v_state = actor_critic( x_ph, a_ph, rew_ph, pi_state_ph, v_state_ph, NUM_GRU_UNITS, action_space=env.action_space) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph, rew_ph] # Every step, get: action, value, and logprob and reward get_action_ops = [pi, v, logp_pi, new_pi_state, new_v_state] # Experience buffer steps_per_epoch = trials_per_epoch * steps_per_trial local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer( learning_rate=pi_lr).minimize(pi_loss - 0.01 * approx_ent) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) # tf.reset_default_graph() # restore_tf_graph(sess, '..//data//ppo//ppo_s0//simple_save') def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} inputs[pi_state_ph] = np.zeros((trials_per_epoch, NUM_GRU_UNITS)) inputs[v_state_ph] = np.zeros((trials_per_epoch, NUM_GRU_UNITS)) pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) print(pi_l_old, v_l_old) # Training for i in range(train_pi_iters): # print(f'pi:{i}') _, kl = sess.run([train_pi, approx_kl], 
feed_dict=inputs) # print(sess.run(pi_loss, feed_dict=inputs)) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' % i) break logger.store(StopIter=i) for _ in range(train_v_iters): # print(f'v:{_}') sess.run(train_v, feed_dict=inputs) # Log changes from update import datetime print(f'finish one batch training at {datetime.datetime.now()}') pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for trial in range(trials_per_epoch): print(f'trial: {trial}') old_a = np.array([0]).reshape(1, 1) old_r = np.array([0]).reshape((1, 1, 1)) means = env.sample_tasks(1)[0] action_dict = defaultdict(int) for i in range(env.action_space.n): action_dict[i] = 0 env.reset_task_simple(means) task_avg = 0.0 pi_state_t = np.zeros((1, NUM_GRU_UNITS)) v_state_t = np.zeros((1, NUM_GRU_UNITS)) for step in range(steps_per_trial): a, v_t, logp_t, pi_state_t, v_state_t = sess.run( get_action_ops, feed_dict={ x_ph: o.reshape(1, 1, -1), a_ph: old_a, rew_ph: old_r, pi_state_ph: pi_state_t, v_state_ph: v_state_t }) # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) try: o, r, d, _ = env.step(a[0][0]) except: print(a) raise AssertionError action_dict[a[0][0]] += 1 old_a = np.array(a).reshape(1, 1) old_r = np.array([r]).reshape(1, 1, 1) ep_ret += r task_avg += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (step == local_steps_per_epoch - 1): if not (terminal): print( 'Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # logger.log_tabular('Epoch', epoch) # logger.log_tabular('EpRet', with_min_and_max=True) # logger.log_tabular('Means', means) # logger.dump_tabular() print(f'avg in trial {trial}: {task_avg / steps_per_trial}') print(f'Means in trial {trial}: {means}') print(action_dict) # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # saved_path = saver.save(sess, f"/tmp/model_epoch{epoch}.ckpt") # print(f'Model saved in {saved_path}') # Perform PPO update! update() logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
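# Sketch (illustrative only; the real code feeds tf placeholders x_ph/a_ph/rew_ph
# directly, and the dict keys below are just labels): at every step the recurrent
# actor-critic above receives the current observation plus the *previous* action
# and reward, all laid out as (batch=1, time=1, ...), which is what the reshapes
# in the rollout loop produce.
import numpy as np


def make_step_feed(o, prev_a, prev_r):
    return {
        'x':   np.asarray(o, dtype=np.float32).reshape(1, 1, -1),
        'a':   np.asarray([prev_a], dtype=np.int32).reshape(1, 1),
        'rew': np.asarray([prev_r], dtype=np.float32).reshape(1, 1, 1),
    }


# e.g. make_step_feed(o=[0.3], prev_a=0, prev_r=0.0)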
def td3(env_fn, actor_critic=a2c, ac_kwargs=dict(), seed=0,
        steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=.99,
        polyak=.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000,
        act_noise=.1, target_noise=.2, noise_clip=.5, policy_delay=2,
        max_ep_len=1000, logger_kwargs=dict(), save_freq=1):
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping
    act_limit = env.action_space.high[0]

    # Share action space info with A2C
    ac_kwargs['action_space'] = env.action_space

    x_ph, a_ph, x2_ph, r_ph, d_ph = \
        tf.placeholder(name='x_ph', shape=(None, obs_dim), dtype=tf.float32), \
        tf.placeholder(name='a_ph', shape=(None, act_dim), dtype=tf.float32), \
        tf.placeholder(name='x2_ph', shape=(None, obs_dim), dtype=tf.float32), \
        tf.placeholder(name='r_ph', shape=(None), dtype=tf.float32), \
        tf.placeholder(name='d_ph', shape=(None), dtype=tf.float32)

    # Actor policy and value
    with tf.variable_scope('main'):
        pi, q1, q2, q1_pi = actor_critic(x_ph, a_ph, **ac_kwargs)

    # This seems a bit memory inefficient: what happens to the Q-values created
    # along with the target policy, or to the policy created along with the
    # target Qs? They are never referenced, but are still declared, at the
    # cost of GPU memory.

    # Target policy
    with tf.variable_scope('target'):
        pi_targ, _, _, _ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Target Q networks
    with tf.variable_scope('target', reuse=True):
        # Target policy smoothing: clipped noise on the target action
        epsilon = tf.random_normal(tf.shape(pi_targ), stddev=target_noise)
        epsilon = tf.clip_by_value(epsilon, -noise_clip, noise_clip)
        a2 = pi_targ + epsilon
        a2 = tf.clip_by_value(a2, -act_limit, act_limit)

        # Target Q-values using actions from target policy
        _, q1_targ, q2_targ, _ = actor_critic(x2_ph, a2, **ac_kwargs)

    replaybuffer = ReplayBuffer(obs_dim, act_dim, size=replay_size)

    # Helpers for var count
    def get_vars(scope=''):
        return [x for x in tf.trainable_variables() if scope in x.name]

    def count_vars(scope=''):
        v = get_vars(scope)
        return sum([np.prod(var.shape.as_list()) for var in v])

    # Count variables
    var_counts = tuple(count_vars(scope)
                       for scope in ['main/pi', 'main/q1', 'main/q2', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n'
          % var_counts)

    # Clipped double-Q learning Bellman backup
    min_q_targ = tf.minimum(q1_targ, q2_targ)
    backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * min_q_targ)

    # TD3 losses
    pi_loss = -tf.reduce_mean(q1_pi)
    q1_loss = tf.reduce_mean((q1 - backup)**2)
    q2_loss = tf.reduce_mean((q2 - backup)**2)
    q_loss = q1_loss + q2_loss

    # Training ops
    pi_train = tf.train.AdamOptimizer(pi_lr).minimize(pi_loss)
    q_train = tf.train.AdamOptimizer(q_lr).minimize(q_loss)

    # Polyak-averaged target update
    target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_targ, v_main in zip(get_vars('target'), get_vars('main'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={'x': x_ph, 'a': a_ph},
                          outputs={'pi': pi, 'q1': q1, 'q2': q2})

    def get_action(o, noise_scale):
        a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                o, r, d, _ = test_env.step(get_action(o, 0))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop
    for t in range(total_steps):
        if t > start_steps:
            a = get_action(o, act_noise)
        else:
            a = env.action_space.sample()

        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time horizon
        d = False if ep_len == max_ep_len else d

        o2 = np.squeeze(o2)
        # print("O2: ", o2)
        replaybuffer.store(o, a, r, o2, d)
        o = o2

        if d or (ep_len == max_ep_len):
            for j in range(ep_len):
                batch = replaybuffer.sample_batch(batch_size)
                feed_dict = {x_ph: batch['obs1'],
                             x2_ph: batch['obs2'],
                             a_ph: batch['acts'],
                             r_ph: batch['rews'],
                             d_ph: batch['done']}
                q_step_ops = [q_loss, q1, q2, q_train]
                outs = sess.run(q_step_ops, feed_dict)
                logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2])

                # Delayed policy and target updates
                if j % policy_delay == 0:
                    outs = sess.run([pi_loss, pi_train, target_update], feed_dict)
                    logger.store(LossPi=outs[0])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Saving the model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
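# The td3() graph above builds its Bellman target from the smoothed target
# policy and the minimum of the two target critics. The numpy sketch below
# restates that computation outside the TF graph, just to make the data flow
# explicit; td3_targets and the lambda "networks" in the commented example are
# hypothetical stand-ins, not objects defined above.
import numpy as np


def td3_targets(r, d, o2, pi_targ, q1_targ, q2_targ, gamma=0.99,
                act_limit=1.0, target_noise=0.2, noise_clip=0.5):
    """Clipped double-Q target with target policy smoothing.

    pi_targ, q1_targ and q2_targ are callables standing in for the target
    networks (the real ones above are TF graph nodes).
    """
    # Target policy smoothing: clipped Gaussian noise on the target action
    a2 = pi_targ(o2)
    eps = np.clip(target_noise * np.random.randn(*a2.shape), -noise_clip, noise_clip)
    a2 = np.clip(a2 + eps, -act_limit, act_limit)
    # Clipped double-Q: take the smaller of the two target critics
    min_q = np.minimum(q1_targ(o2, a2), q2_targ(o2, a2))
    # Bellman backup; (1 - d) zeroes the bootstrap at true terminals
    return r + gamma * (1.0 - d) * min_q


# Example with dummy "networks" on a batch of 4 transitions:
# backup = td3_targets(r=np.ones(4), d=np.zeros(4), o2=np.random.randn(4, 3),
#                      pi_targ=lambda o: np.tanh(o[:, :2]),
#                      q1_targ=lambda o, a: o.sum(1) + a.sum(1),
#                      q2_targ=lambda o, a: o.sum(1) - a.sum(1))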
def trpo(env_fn, actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=.99, delta=.01, vf_lr=1e-3, train_v_iters=80, damping_coeff=.1, cg_iters=10, backtrack_iters=10, backtrack_coeff=.8, lam=.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10, algo="trpo"): # LOgger tools logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Seed inits seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) # Environment recreation env = env_fn() # Getting obs dims obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] ac_kwargs['action_space'] = env.action_space # Placeholders x_ph, a_ph = tf.placeholder( name="x_ph", shape=[None, obs_dim], dtype=tf.float32), \ tf.placeholder( name="a_ph", shape=[None, act_dim], dtype=tf.float32) adv_ph, ret_ph, logp_old_ph = tf.placeholder( name="adv_ph", shape=[None], dtype=tf.float32), \ tf.placeholder( name="ret_ph", shape=[None], dtype=tf.float32), \ tf.placeholder( name="logp_old_ph", shape=[None], dtype=tf.float32) pi, logp, logp_pi, info, info_phs, d_kl, v = actor_critic( x_ph, a_ph, **ac_kwargs) def keys_as_sorted_list(dict): return sorted(list(dict.keys())) def values_as_sorted_list(dict): return [dict[k] for k in keys_as_sorted_list(dict)] all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph ] + values_as_sorted_list(info_phs) get_action_ops = [pi, v, logp_pi] + values_as_sorted_list(info) # Experience buffer init local_steps_per_epoch = int(steps_per_epoch / num_procs()) info_shapes = {k: v.shape.as_list()[1:] for k, v in info_phs.items()} buf = GAEBuffer(obs_dim, act_dim, local_steps_per_epoch, info_shapes, gamma, lam) # Count variables def get_vars(scope=''): return [x for x in tf.trainable_variables() if scope in x.name] def count_vars(scope=''): v = get_vars(scope) return sum([np.prod(var.shape.as_list()) for var in v]) var_counts = tuple(count_vars(scope) for scope in ["pi", "v"]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # TRPO Losses ratio = tf.exp(logp - logp_old_ph) pi_loss = -tf.reduce_mean(ratio * adv_ph) v_loss = tf.reduce_mean((ret_ph - v)**2) # Optimizer for value function train_vf = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) # CG solver requirements pi_params = get_vars("pi") # Some helpers def flat_concat(xs): return tf.concat([tf.reshape(x, (-1, )) for x in xs], axis=0) def flat_grad(f, params): return flat_concat(tf.gradients(xs=params, ys=f)) def hessian_vector_product(f, params): g = flat_grad(f, params) x = tf.placeholder(tf.float32, shape=g.shape) return x, flat_grad(tf.reduce_sum(g * x), params) def assign_params_from_flat(x, params): flat_size = lambda p: int(np.prod(p.shape.as_list()) ) # the 'int' is important for scalars splits = tf.split(x, [flat_size(p) for p in params]) new_params = [ tf.reshape(p_new, p.shape) for p, p_new in zip(params, splits) ] return tf.group( [tf.assign(p, p_new) for p, p_new in zip(params, new_params)]) gradient = flat_grad(pi_loss, pi_params) v_ph, hvp = hessian_vector_product(d_kl, pi_params) if damping_coeff > 0: hvp += damping_coeff * v_ph # Symbols for getting and setting params get_pi_params = flat_concat(pi_params) set_pi_params = assign_params_from_flat(v_ph, pi_params) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def cg(Ax, b): x = np.zeros_like(b) r = b.copy() p = r.copy() r_dot_old = np.dot(r, r) for _ 
in range(cg_iters): z = Ax(p) alpha = r_dot_old / (np.dot(p, z) + EPS) x += alpha * p r -= alpha * z r_dot_new = np.dot(r, r) p = r + (r_dot_new / r_dot_old) * p r_dot_old = r_dot_new return x def update(): # Prepare hessian func, gradient eval # Always so elegant haha inputs = {k: v for k, v in zip(all_phs, buf.get())} def mpi_avg(x): """Average a scalar or vector over MPI processes.""" return mpi_sum(x) / num_procs() Hx = lambda x: mpi_avg(sess.run(hvp, feed_dict={**inputs, v_ph: x})) g, pi_l_old, v_l_old = sess.run([gradient, pi_loss, v_loss], feed_dict=inputs) g, pi_l_old = mpi_avg(g), mpi_avg(pi_l_old) # Core calculations for TRPO or NPG x = cg(Hx, g) alpha = np.sqrt(2 * delta / (np.dot(x, Hx(x)) + EPS)) # OK old_params = sess.run(get_pi_params) def set_and_eval(step): sess.run(set_pi_params, feed_dict={v_ph: old_params - alpha * x * step}) return mpi_avg(sess.run([d_kl, pi_loss], feed_dict=inputs)) if algo == 'npg': # npg has no backtracking or hard kl constraint enforcement kl, pi_l_new = set_and_eval(step=1.) elif algo == "trpo": for j in range(backtrack_iters): kl, pi_l_new = set_and_eval(step=backtrack_coeff**j) if kl <= delta and pi_l_new <= pi_l_old: logger.log( 'Accepting new params at step %d of line search.' % j) logger.store(BacktrackIters=j) break if j == backtrack_iters - 1: logger.log('Line search failed! Keeping old params.') logger.store(BacktrackIters=j) kl, pi_l_new = set_and_eval(step=0.) # Value function updates for _ in range(train_v_iters): sess.run(train_vf, feed_dict=inputs) v_l_new = sess.run(v_loss, feed_dict=inputs) # Log changes from update logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): agent_outs = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) a, v_t, logp_t, info_t = agent_outs[0][0], agent_outs[ 1], agent_outs[2], agent_outs[3:] # Save and log buf.store(o, a, r, v_t, logp_t, info_t) logger.store(VVals=v_t) o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not terminal: print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) last_val = r if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform TRPO or NPG update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('KL', average_only=True) if algo == 'trpo': logger.log_tabular('BacktrackIters', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
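# The cg() helper inside trpo() is a bare conjugate-gradient solver: it solves
# H x = g given only the Hessian-vector product Hx(v), and the step is then
# scaled so the quadratic KL model stays within delta. Below is a
# self-contained numpy version of the same routine with a toy quadratic model
# so the update direction can be checked in isolation; cg_sketch, H and g are
# illustrative only.
import numpy as np

EPS = 1e-8


def cg_sketch(Ax, b, iters=10):
    """Conjugate gradient: approximately solve A x = b from a matvec Ax(v)."""
    x = np.zeros_like(b)
    r = b.copy()          # residual of the initial guess x = 0
    p = r.copy()
    r_dot_old = np.dot(r, r)
    for _ in range(iters):
        z = Ax(p)
        alpha = r_dot_old / (np.dot(p, z) + EPS)
        x += alpha * p
        r -= alpha * z
        r_dot_new = np.dot(r, r)
        p = r + (r_dot_new / r_dot_old) * p
        r_dot_old = r_dot_new
    return x


# Example: recover the TRPO search direction and maximal step for a toy model
# with Hessian H, gradient g and KL radius delta = 0.01.
# H = np.array([[2.0, 0.3], [0.3, 1.0]])   # symmetric positive definite
# g = np.array([1.0, -0.5])
# x = cg_sketch(lambda v: H @ v, g)
# full_step = np.sqrt(2 * 0.01 / (x @ (H @ x) + EPS)) * x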
def iac(env_config, ac_type, ac_kwargs, rb_type, rb_kwargs, gamma, lr, polyak, batch_size, epochs, start_steps, steps_per_epoch, inc_ep, max_ep_len, test_max_ep_len, number_of_tests_per_epoch, q_pi_sample_size, z_dim, z_type, act_noise, test_without_state, logger_kwargs, seed): logger = EpochLogger(**logger_kwargs) configs = locals().copy() configs.pop("logger") logger.save_config(configs) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = make_env(env_config), make_env(env_config) obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] act_high = env.action_space.high # Inputs to computation graph x_ph, a_ph, z_ph, x2_ph, r_ph, d_ph = core.placeholders( obs_dim, act_dim, z_dim, obs_dim, None, None) actor_critic = core.get_iac_actor_critic(ac_type) # Main outputs from computation graph with tf.variable_scope('main'): pi, q1, q2, q1_pi, q2_pi, v = actor_critic(x_ph, a_ph, z_ph, **ac_kwargs) # Target networks with tf.variable_scope('target'): _, _, _, _, _, v_targ = actor_critic(x2_ph, a_ph, z_ph, **ac_kwargs) # Experience buffer RB = get_replay_buffer(rb_type) replay_buffer = RB(obs_dim, act_dim, **rb_kwargs) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main/v', 'main']) print( '\nNumber of parameters: \t pi: %d, \t q: %d, \t v: %d, \t total: %d\n' % var_counts) # Bellman backup for Q and V function q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * v_targ) min_q_pi = tf.minimum(q1_pi, q2_pi) v_backup = tf.stop_gradient(min_q_pi) # TD3 losses pi_loss = -tf.reduce_mean(q1_pi) q1_loss = 0.5 * tf.reduce_mean((q1 - q_backup)**2) q2_loss = 0.5 * tf.reduce_mean((q2 - q_backup)**2) v_loss = 0.5 * tf.reduce_mean((v - v_backup)**2) value_loss = q1_loss + q2_loss + v_loss # Separate train ops for pi, q policy_optimizer = tf.train.AdamOptimizer(learning_rate=lr) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) train_policy_op = policy_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) if ac_kwargs["pi_separate"]: train_policy_emb_op = policy_optimizer.minimize( pi_loss, var_list=get_vars('main/pi/emb')) train_policy_d_op = policy_optimizer.minimize( pi_loss, var_list=get_vars('main/pi/d')) train_value_op = value_optimizer.minimize(value_loss, var_list=get_vars('main/q') + get_vars('main/v')) # Polyak averaging for target variables target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) def sample_z(size): if z_type == "uniform": return np.random.random_sample(size=size) elif z_type == "gaussian": return np.random.normal(size=size) else: raise Exception("z_type error") def get_action(o, noise_scale): pi_a = sess.run(pi, feed_dict={ x_ph: o.reshape(1, -1), z_ph: sample_z((1, z_dim)) })[0] pi_a += noise_scale * np.random.randn(act_dim) pi_a = np.clip(pi_a, 0, 1) real_a = pi_a * act_high return pi_a, real_a def test_agent(n=10): test_actions = [] for j in range(n): test_actions_ep = [] o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == test_max_ep_len)): # Take deterministic actions at test time (noise_scale=0) if test_without_state: _, real_a = get_action(np.zeros(o.shape), 0) else: _, real_a = get_action(o, 0) 
test_actions_ep.append(real_a) o, r, d, _ = test_env.step(real_a) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) test_actions.append(test_actions_ep) return test_actions start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_steps = steps_per_epoch * epochs rewards = [] rets = [] test_rets = [] max_ret = None # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy (with some noise, via act_noise). """ if t > start_steps: pi_a, real_a = get_action(o, act_noise) else: pi_a, real_a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(real_a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, pi_a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): for _ in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } feed_dict[z_ph] = sample_z((batch_size, z_dim)) # Policy Learning update for key in feed_dict: feed_dict[key] = np.repeat(feed_dict[key], q_pi_sample_size, axis=0) feed_dict[z_ph] = sample_z( (batch_size * q_pi_sample_size, z_dim)) if ac_kwargs["pi_separate"]: if len(rewards) % 2 == 0: outs = sess.run([pi_loss, train_policy_emb_op], feed_dict) else: outs = sess.run([pi_loss, train_policy_d_op], feed_dict) else: outs = sess.run([pi_loss, train_policy_op], feed_dict) logger.store(LossPi=outs[0]) # Q-learning update outs = sess.run([q1_loss, v_loss, q1, v, train_value_op], feed_dict) logger.store(LossQ=outs[0], LossV=outs[1], ValueQ=outs[2], ValueV=outs[3]) logger.store(EpRet=ep_ret, EpLen=ep_len) rewards.append(ep_ret) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # End of epoch wrap-up if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # Test the performance of the deterministic version of the agent. test_actions = test_agent(number_of_tests_per_epoch) # Log info about epoch logger.log_tabular('Epoch', epoch) ret = logger.log_tabular('EpRet', average_only=True)[0] test_ret = logger.log_tabular('TestEpRet', average_only=True)[0] logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('ValueQ', average_only=True) logger.log_tabular('ValueV', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() rets.append(ret) test_rets.append(test_ret) if max_ret is None or test_ret > max_ret: max_ret = test_ret best_test_actions = test_actions max_ep_len += inc_ep sess.run(target_update, feed_dict) logger.save_state( { "rewards": rewards, "best_test_actions": best_test_actions, "rets": rets, "test_rets": test_rets, "max_ret": max_ret }, None) util.plot_actions(best_test_actions, act_high, logger.output_dir + '/best_test_actions.png') logger.log("max ret: %f" % max_ret)
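# Before its policy update, iac() repeats every entry of the sampled batch
# q_pi_sample_size times and draws a fresh latent z for each copy, so the
# policy loss is averaged over several z samples per state. The helper below
# is a minimal numpy sketch of that expansion with dummy data;
# expand_batch_with_z is a hypothetical name, not a function used above.
import numpy as np


def expand_batch_with_z(batch, q_pi_sample_size, z_dim, z_type="gaussian"):
    """Tile a batch of transitions and pair each copy with a fresh z sample."""
    expanded = {k: np.repeat(v, q_pi_sample_size, axis=0) for k, v in batch.items()}
    n = next(iter(expanded.values())).shape[0]
    if z_type == "uniform":
        z = np.random.random_sample(size=(n, z_dim))
    else:
        z = np.random.normal(size=(n, z_dim))
    return expanded, z


# Example with a dummy 2-transition batch:
# expanded, z = expand_batch_with_z({"obs1": np.zeros((2, 3)), "acts": np.ones((2, 1))},
#                                   q_pi_sample_size=4, z_dim=5)
# expanded["obs1"].shape, z.shape  -> (8, 3), (8, 5)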
def ppo(env_config, ac_type, ac_kwargs, clip_ratio, epochs, steps_per_epoch, optimizer, lr, train_pi_iters, max_ep_len, target_kl, logger_kwargs, seed): logger = EpochLogger(**logger_kwargs) configs = locals().copy() configs.pop("logger") logger.save_config(configs) tf.set_random_seed(seed) np.random.seed(seed) env = make_env(env_config) obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] act_high = env.action_space.high obs_ph, a_ph, adv_ph, logp_old_ph = core.placeholders( obs_dim, act_dim, None, None) all_phs = [obs_ph, a_ph, adv_ph, logp_old_ph] actor_critic = get_ppo_actor_critic(ac_type) pi, logp, logp_pi = actor_critic(obs_ph, a_ph, **ac_kwargs) # Experience buffer buf = PPOBuffer(obs_dim, act_dim, steps_per_epoch) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) # Optimizers if optimizer == "adam": train_pi = tf.train.AdamOptimizer(learning_rate=lr).minimize(pi_loss) elif optimizer == "sgd": train_pi = tf.train.GradientDescentOptimizer( learning_rate=lr).minimize(pi_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) def update(): print(sess.run(tf.trainable_variables())) data = buf.get() #util.plot_adv(data[0] * act_high, data[1], logger.output_dir + "/ep_adv%s.png" % epoch) inputs = {k: v for k, v in zip(all_phs, data[:4])} pi_l_old, ent = sess.run([pi_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break logger.store(StopIter=i) # Log changes from update pi_l_new, kl, cf = sess.run([pi_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 real_action = env.action_space.default() o, r, d, _ = env.step(real_action) episode_actions = [] episode_obs = [] episode_actions.append(real_action) episode_obs.append(o) print(tf.trainable_variables()) # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): episode_count = 0 ep_actions = [] for t in range(steps_per_epoch): a, logp_t = sess.run([pi, logp_pi], feed_dict={obs_ph: o.reshape(1, -1)}) delta = np.exp(a[0]) delta = np.clip(delta, 0.95, 1.05) real_action = env.action_space.clip(real_action * delta) o, r, d, _ = env.step(real_action) buf.store(o, a, r, logp_t) ep_actions.append(real_action) episode_actions.append(real_action) episode_obs.append(o) ep_ret += r ep_len += 1 if ep_len == max_ep_len or t == steps_per_epoch - 1: buf.finish_path() logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 real_action = env.action_space.default() o, r, d, _ = env.step(real_action) util.plot_seq_obs_and_actions( episode_obs, episode_actions, act_high, logger.output_dir + '/episode_actions_%d_%d.png' % (epoch, episode_count)) episode_count += 1 episode_actions = [] episode_obs = [] episode_actions.append(real_action) episode_obs.append(o) # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() util.plot_actions(ep_actions, act_high, logger.output_dir + '/ep_actions%d.png' % epoch)
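# The ppo() graph above writes the clipped surrogate with a tf.where: min_adv
# is (1 + clip_ratio) * adv for positive advantages and (1 - clip_ratio) * adv
# for negative ones, and the loss takes the minimum against ratio * adv, which
# is equivalent to the usual min(r * A, clip(r, 1 - eps, 1 + eps) * A) form.
# The numpy sketch below reproduces the loss and the logged diagnostics;
# ppo_clip_objective is a hypothetical helper, not part of the code above.
import numpy as np


def ppo_clip_objective(logp, logp_old, adv, clip_ratio=0.2):
    """Clipped surrogate loss plus the approximate KL and clip-fraction stats."""
    ratio = np.exp(logp - logp_old)
    min_adv = np.where(adv > 0, (1 + clip_ratio) * adv, (1 - clip_ratio) * adv)
    pi_loss = -np.mean(np.minimum(ratio * adv, min_adv))
    approx_kl = np.mean(logp_old - logp)
    clipfrac = np.mean((ratio > 1 + clip_ratio) | (ratio < 1 - clip_ratio))
    return pi_loss, approx_kl, clipfrac


# Example: a policy that drifted slightly from the old one
# logp_old = np.array([-1.0, -0.5, -2.0])
# logp = logp_old + np.array([0.3, -0.1, 0.0])
# ppo_clip_objective(logp, logp_old, adv=np.array([1.0, -0.5, 0.2]))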
def sac(env_fn, seed=0, gamma=.99, lam=.97, hidden_sizes=(200, 100), alpha=.5, v_lr=1e-3, q_lr=1e-3, pi_lr=1e-3, polyak=1e-2, epochs=50, steps_per_epoch=1000, batch_size=100, start_steps=10000, logger_kwargs=dict(), replay_size=int(1e6), max_ep_len=1000, save_freq=1): logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() env = env_fn() # Dimensions obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] act_limit = env.action_space.high[0] # Placeholders x_ph = tf.placeholder(shape=[None, obs_dim], dtype=tf.float32) a_ph = tf.placeholder(shape=[None, act_dim], dtype=tf.float32) x2_ph = tf.placeholder(shape=[None, obs_dim], dtype=tf.float32) r_ph = tf.placeholder(shape=[None], dtype=tf.float32) d_ph = tf.placeholder(shape=[None], dtype=tf.float32) # Networks def mlp(x, hidden_sizes=(32, ), activation=tf.tanh, output_activation=None): for h in hidden_sizes[:-1]: x = tf.layers.dense(x, units=h, activation=activation) return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) # Why isn't the k used here ? def gaussian_likelihood(x, mu, log_std): EPS = 1e-8 pre_sum = -0.5 * ( ((x - mu) / (tf.exp(log_std) + EPS))**2 + 2 * log_std + np.log(2 * np.pi)) return tf.reduce_sum(pre_sum, axis=1) def clip_but_pass_gradient(x, l=-1., u=1.): clip_up = tf.cast(x > u, tf.float32) clip_low = tf.cast(x < l, tf.float32) return x + tf.stop_gradient((u - x) * clip_up + (l - x) * clip_low) LOG_STD_MIN = -20 LOG_STD_MAX = 2 def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation): act_dim = a.shape.as_list()[-1] net = mlp(x, list(hidden_sizes), activation, activation) mu = tf.layers.dense(net, act_dim, activation=output_activation) """ Because algorithm maximizes trade-off of reward and entropy, entropy must be unique to state---and therefore log_stds need to be a neural network output instead of a shared-across-states learnable parameter vector. But for deep Relu and other nets, simply sticking an activationless dense layer at the end would be quite bad---at the beginning of training, a randomly initialized net could produce extremely large values for the log_stds, which would result in some actions being either entirely deterministic or too random to come back to earth. Either of these introduces numerical instability which could break the algorithm. To protect against that, we'll constrain the output range of the log_stds, to lie within [LOG_STD_MIN, LOG_STD_MAX]. This is slightly different from the trick used by the original authors of SAC---they used tf.clip_by_value instead of squashing and rescaling. I prefer this approach because it allows gradient propagation through log_std where clipping wouldn't, but I don't know if it makes much of a difference. """ log_std = tf.layers.dense(net, act_dim, activation=tf.tanh) log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1) std = tf.exp(log_std) pi = mu + tf.random_normal(tf.shape(mu)) * std logp_pi = gaussian_likelihood(pi, mu, log_std) return mu, pi, logp_pi def apply_squashing_func(mu, pi, logp_pi): mu = tf.tanh(mu) pi = tf.tanh(pi) # To avoid evil machine precision error, strictly clip 1-pi**2 to [0,1] range. 
logp_pi -= tf.reduce_sum( tf.log(clip_but_pass_gradient(1 - pi**2, l=0, u=1) + 1e-6), axis=1) return mu, pi, logp_pi with tf.variable_scope("main"): activation = tf.tanh with tf.variable_scope("pi"): # mu = mlp( x_ph, hidden_sizes, activation, None) # log_std = mlp( mu, (act_dim,), activation, None) # # Avoid out of range log_std. Refer to Github for explanation. # log_std = LOG_STD_MIN + .5 * ( LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1) # # mu = mlp( mu, (act_dim,), activation, None) # # pi = mu + tf.exp( log_std) * tf.random_normal( tf.shape(mu)) # logp_pi = gaussian_likelihood( pi, mu, log_std) # # # Follow SpinningUp Implementation # mu = tf.tanh(mu) # pi = tf.tanh(pi) # # def clip_but_pass_gradient(x, l=-1., u=1.): # clip_up = tf.cast(x > u, tf.float32) # clip_low = tf.cast(x < l, tf.float32) # # What is this supposed to mean even ? # return x + tf.stop_gradient((u - x)*clip_up + (l - x)*clip_low) # # # Shameless copy paste # logp_pi -= tf.reduce_sum(tf.log(clip_but_pass_gradient(1 - pi**2, l=0, u=1) + 1e-6), axis=1) # Not working version bak # squashed_pi = tf.tanh( pi) # # # To be sure # pi = tf.clip_by_value( pi, -act_limit, act_limit) # # # Must take in the squased polic # log_squash_pi = gaussian_likelihood( squashed_pi, mu, log_std) # Shamefull plug mu, pi, logp_pi = mlp_gaussian_policy(x_ph, a_ph, hidden_sizes, tf.tanh, None) mu, pi, logp_pi = apply_squashing_func(mu, pi, logp_pi) with tf.variable_scope("q1"): q1 = tf.squeeze(mlp(tf.concat([x_ph, a_ph], -1), hidden_sizes + (1, ), activation, None), axis=-1) with tf.variable_scope("q1", reuse=True): q1_pi = tf.squeeze(mlp(tf.concat([x_ph, pi], -1), hidden_sizes + (1, ), activation, None), axis=-1) with tf.variable_scope("q2"): q2 = tf.squeeze(mlp(tf.concat([x_ph, a_ph], -1), hidden_sizes + (1, ), activation, None), axis=-1) with tf.variable_scope("q2", reuse=True): q2_pi = tf.squeeze(mlp(tf.concat([x_ph, pi], -1), hidden_sizes + (1, ), activation, None), axis=-1) with tf.variable_scope("v"): # v = mlp( x_ph, hidden_sizes+(1,), activation, None) v = tf.squeeze(mlp(x_ph, hidden_sizes + (1, ), activation, None), axis=-1) with tf.variable_scope("target"): with tf.variable_scope("v"): v_targ = tf.squeeze(mlp(x2_ph, hidden_sizes + (1, ), activation, None), axis=-1) # helpers for var count def get_vars(scope=''): return [x for x in tf.trainable_variables() if scope in x.name] def count_vars(scope=''): v = get_vars(scope) return sum([np.prod(var.shape.as_list()) for var in v]) # Count variables var_counts = tuple( count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main']) print( '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t v: %d, \t total: %d\n' % var_counts) # Targets q_backup_prestop = r_ph + gamma * (1 - d_ph) * v_targ v_backup_prestop = tf.minimum(q1_pi, q2_pi) - alpha * logp_pi q_backup, v_backup = tf.stop_gradient(q_backup_prestop), tf.stop_gradient( v_backup_prestop) # Q Loss q1_loss = tf.reduce_mean((q1 - q_backup)**2) q2_loss = tf.reduce_mean((q2 - q_backup)**2) q_loss = q1_loss + q2_loss # V Loss v_loss = tf.reduce_mean((v - v_backup)**2) # Pol loss pi_loss = tf.reduce_mean(-q1_pi + alpha * logp_pi) # Training ops v_trainop = tf.train.AdamOptimizer(v_lr).minimize( v_loss, var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/v")) q_trainop = tf.train.AdamOptimizer(q_lr).minimize( q_loss, var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/q")) pi_trainop = tf.train.AdamOptimizer(pi_lr).minimize( pi_loss, 
        var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                   scope="main/pi"))

    assert polyak <= .5

    # Target update ops
    init_v_target = tf.group([
        tf.assign(v_target, v_main)
        for v_main, v_target in zip(
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/v"),
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target/v"))
    ])
    update_v_target = tf.group([
        tf.assign(v_target, (1 - polyak) * v_target + polyak * v_main)
        for v_main, v_target in zip(
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/v"),
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target/v"))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(init_v_target)

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={'x': x_ph, 'a': a_ph},
                          outputs={'pi': pi, 'q1': q1, 'q2': q2, 'v': v})

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            # print( o.reshape(-1, 1))
            # input()
            while not (d or (ep_len == max_ep_len)):
                o, r, d, _ = test_env.step(
                    sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)}))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    # Buffer init
    buffer = ReplayBuffer(obs_dim, act_dim, replay_size)

    # Main loop
    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    for t in range(total_steps):
        if t > start_steps:
            a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})
        else:
            a = env.action_space.sample()

        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time horizon
        d = False if ep_len == max_ep_len else d

        # Still needed ?
        o2 = np.squeeze(o2)
        buffer.store(o, a, r, o2, d)
        o = o2

        if d or (ep_len == max_ep_len):
            for j in range(ep_len):
                batch = buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done']
                }
                # DEBUG:
                # v_backup_prestop_out = sess.run( v_backup_prestop, feed_dict=feed_dict)
                # print( v_backup_prestop_out.shape)
                # print( v_backup_prestop_out)
                # input()

                # Value gradient steps
                v_step_ops = [v_loss, v, v_trainop]
                outs = sess.run(v_step_ops, feed_dict)
                logger.store(LossV=outs[0], VVals=outs[1])

                # Q gradient steps
                q_step_ops = [q_loss, q1, q2, q_trainop]
                outs = sess.run(q_step_ops, feed_dict)
                logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2])

                # Policy gradient steps
                # TODO Add entropy logging
                pi_step_ops = [pi_loss, pi_trainop, update_v_target]
                outs = sess.run(pi_step_ops, feed_dict=feed_dict)
                logger.store(LossPi=outs[0])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0., 0

        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Saving the model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
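# mlp_gaussian_policy() and apply_squashing_func() above sample an action from
# a state-dependent Gaussian, squash it with tanh, and correct the log-density
# with a log(1 - tanh(u)^2) term per action dimension. The numpy sketch below
# walks through the same computation on a small batch so the shapes and the
# correction term are easy to inspect; squashed_gaussian_sample is a
# hypothetical name, not a function defined above.
import numpy as np


def squashed_gaussian_sample(mu, log_std):
    """Sample a tanh-squashed Gaussian action and its corrected log-probability."""
    std = np.exp(log_std)
    u = mu + std * np.random.randn(*mu.shape)            # pre-squash sample
    logp_u = np.sum(
        -0.5 * (((u - mu) / (std + 1e-8))**2 + 2 * log_std + np.log(2 * np.pi)),
        axis=-1)
    a = np.tanh(u)                                       # squashed action in (-1, 1)
    # log pi(a) = log pi(u) - sum_i log(1 - tanh(u_i)^2)
    logp_a = logp_u - np.sum(np.log(np.clip(1 - a**2, 0, 1) + 1e-6), axis=-1)
    return a, logp_a


# Example: a batch of 4 two-dimensional actions
# a, logp = squashed_gaussian_sample(mu=np.zeros((4, 2)), log_std=-0.5 * np.ones((4, 2)))
# a.shape, logp.shape  -> (4, 2), (4,)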