def sac(env_fn, actor_critic=MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000,
        epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2,
        batch_size=100, start_steps=10000, update_after=1000, update_every=50,
        num_test_episodes=10, max_ep_len=1000, policy_type=1, logger_kwargs=dict(),
        save_freq=1000, save_dir=None):
    torch.manual_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    opp_policy = Policy(game=env, player_num=False)
    test_env = SoccerPLUS(visual=False)
    test_opp_policy = Policy(game=test_env, player_num=False)
    obs_dim = env.n_features
    act_dim = env.n_actions

    # Action limit for clamping (unused here: the action space is discrete)
    # act_limit = env.action_space.high[0]

    # Create actor-critic module and target networks
    ac = actor_critic(obs_dim, act_dim, **ac_kwargs)
    ac_targ = deepcopy(ac)
    if torch.cuda.is_available():
        ac.cuda()
        ac_targ.cuda()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    if args.cpc:
        cpc = CPC(timestep=args.timestep, obs_dim=4, hidden_sizes=[args.hid] * args.l,
                  z_dim=args.z_dim, c_dim=args.c_dim, device=device)
    else:
        cpc = None

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False

    # Experience buffer
    T = Counter()  # training step
    E = Counter()  # training episode
    replay_buffer = ReplayBufferOppo(obs_dim=obs_dim, max_size=args.replay_size, cpc=args.cpc,
                                     cpc_model=cpc, writer=writer_cpc, T=T)

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(count_vars(module) for module in [ac.pi, ac.q1, ac.q2])

    # Set up optimizers for policy and q-functions
    pi_optimizer = Adam(ac.pi.parameters(), lr=lr)
    q1_optimizer = Adam(ac.q1.parameters(), lr=lr)
    q2_optimizer = Adam(ac.q2.parameters(), lr=lr)
    if args.cpc:
        cpc_optimizer = Adam(cpc.parameters(), lr=args.lr, eps=1e-4)

    # Helper: expand a probability vector over actions into
    # (probs, log-probs, sampled action, greedy action).
    def get_actions_info(a_prob):
        a_dis = Categorical(a_prob)
        max_a = torch.argmax(a_prob)
        sample_a = a_dis.sample().cpu()
        # Avoid log(0): add a tiny constant only where the probability is exactly zero.
        z = a_prob == 0.0
        z = z.float() * 1e-20
        log_a_prob = torch.log(a_prob + z)
        return a_prob, log_a_prob, sample_a, max_a

    # Set up function for computing SAC Q-losses
    def compute_loss_q(data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done']

        # Bellman backup for Q functions
        with torch.no_grad():
            # Target actions come from *current* policy
            a_prob, log_a_prob, sample_a, max_a = get_actions_info(ac.pi(o2))

            # Target Q-values
            q1_pi_targ = ac_targ.q1(o2)
            q2_pi_targ = ac_targ.q2(o2)
            q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
            backup = r + gamma * (1 - d) * torch.sum(a_prob * (q_pi_targ - alpha * log_a_prob), dim=1)

        # MSE loss against Bellman backup
        q1 = ac.q1(o).gather(1, a.unsqueeze(-1).long())
        q2 = ac.q2(o).gather(1, a.unsqueeze(-1).long())
        loss_q1 = F.mse_loss(q1, backup.unsqueeze(-1))
        loss_q2 = F.mse_loss(q2, backup.unsqueeze(-1))
        loss_q = loss_q1 + loss_q2

        return loss_q

    # Set up function for computing SAC pi loss
    def compute_loss_pi(data):
        o = data['obs']
        a_prob, log_a_prob, sample_a, max_a = get_actions_info(ac.pi(o))
        q1_pi = ac.q1(o)
        q2_pi = ac.q2(o)
        q_pi = torch.min(q1_pi, q2_pi)

        # Entropy-regularized policy loss
        loss_pi = torch.sum(a_prob * (alpha * log_a_prob - q_pi), dim=1, keepdim=True).mean()
        entropy = torch.sum(log_a_prob * a_prob, dim=1).detach()

        # Useful info for logging
        pi_info = dict(LogPi=entropy.cpu().numpy())

        return loss_pi, entropy
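    # Both losses above use the discrete-action form of SAC: the policy outputs a full
    # probability vector over actions, so expectations over actions are computed exactly
    # rather than via the reparameterized sample used in continuous SAC:
    #   backup(r, s', d) = r + gamma * (1 - d) * sum_a' pi(a'|s') * (min_i Q_i_targ(s', a') - alpha * log pi(a'|s'))
    #   loss_pi(s)       = sum_a pi(a|s) * (alpha * log pi(a|s) - min_i Q_i(s, a)), averaged over the batch,
    # which is what the torch.sum(..., dim=1) reductions over the action dimension implement.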
    def update():
        data = replay_buffer.sample_trans(args.batch_size, device=device)

        # First run one gradient descent step for Q1 and Q2
        q1_optimizer.zero_grad()
        q2_optimizer.zero_grad()
        loss_q = compute_loss_q(data)
        loss_q.backward()
        nn.utils.clip_grad_norm_(ac.parameters(), max_norm=10, norm_type=2)
        q1_optimizer.step()
        q2_optimizer.step()

        # Next run one gradient descent step for pi.
        # Note: the Q-networks are not frozen here, so this backward pass also writes
        # gradients into ac.q1/ac.q2; those are discarded by zero_grad() at the start
        # of the next update.
        pi_optimizer.zero_grad()
        loss_pi, entropy = compute_loss_pi(data)
        loss_pi.backward()
        nn.utils.clip_grad_norm_(ac.parameters(), max_norm=10, norm_type=2)
        pi_optimizer.step()

        # Learning-rate adjustment (the decay schedule itself is currently disabled)
        if t >= update_after:
            # lr = max(args.lr * 2 ** (-(t - update_after) * 0.0001), 1e-10)
            _adjust_learning_rate(q1_optimizer, max(lr, 1e-10))
            _adjust_learning_rate(q2_optimizer, max(lr, 1e-10))
            _adjust_learning_rate(pi_optimizer, max(lr, 1e-10))

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                p_targ.data.copy_((1 - polyak) * p.data + polyak * p_targ.data)

        writer.add_scalar("training/pi_loss", loss_pi.detach().item(), t)
        writer.add_scalar("training/q_loss", loss_q.detach().item(), t)
        writer.add_scalar("training/entropy", entropy.detach().mean().item(), t)
        writer.add_scalar("training/lr", lr, t)

    def update_cpc():
        data, indexes, min_len = replay_buffer.sample_traj(args.cpc_batch)
        data = data[:, :, 3:]
        cpc_optimizer.zero_grad()
        c_hidden = cpc.init_hidden(len(data), args.c_dim)
        acc, loss, latents = cpc(data, c_hidden)
        # replay_buffer.update_latent(indexes, min_len, latents.detach())
        loss.backward()
        # Gradient clipping
        nn.utils.clip_grad_norm_(cpc.parameters(), max_norm=20, norm_type=2)
        cpc_optimizer.step()

        writer_cpc.add_scalar("learner/cpc_acc", acc, t)
        writer_cpc.add_scalar("learner/cpc_loss", loss.detach().item(), t)

    def get_action(o, greedy=False):
        if len(o.shape) == 1:
            o = np.expand_dims(o, axis=0)
        a_prob = ac.act(torch.as_tensor(o, dtype=torch.float32, device=device), greedy)
        a_prob, log_a_prob, sample_a, max_a = get_actions_info(a_prob)
        action = sample_a if not greedy else max_a
        return action.item()

    def get_opp_policy(p):
        p_sample = np.random.rand()
        if p_sample < p:
            return args.opp1
        else:
            return args.opp2

    def test_agent(epoch, t_opp, writer):
        if num_test_episodes == 0:
            return
        with torch.no_grad():
            win = 0
            total_ret = 0
            total_len = 0
            for j in range(num_test_episodes):
                o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
                while not (d or (ep_len == max_ep_len)):
                    # Take deterministic actions at test time
                    o2, r, d, _ = test_env.step(get_action(o, True), test_opp_policy.get_actions(t_opp))
                    r *= 10
                    # test_env.render()
                    o = o2
                    ep_ret += r
                    ep_len += 1
                total_ret += ep_ret
                total_len += ep_len
                if ep_ret == 50:
                    win += 1
            mean_score = total_ret / num_test_episodes
            win_rate = win / num_test_episodes
            mean_len = total_len / num_test_episodes
            print("opponent:\t{}\ntest epoch:\t{}\nmean score:\t{:.1f}\nwin_rate:\t{}\nmean len:\t{}".format(
                t_opp, epoch, mean_score, win_rate, mean_len))
            writer.add_scalar("test/mean_score", mean_score, epoch)
            writer.add_scalar("test/win_rate", win_rate, epoch)
            writer.add_scalar("test/mean_len", mean_len, epoch)

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    scores = []
    trajectory, meta = [], []
    o, ep_ret, ep_len = env.reset(), 0, 0
    discard = False
    opp = get_opp_policy(args.p1)
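    # Interaction schedule for the main loop below: actions are uniform-random for the first
    # start_steps environment steps; once t >= update_after, a burst of update_every gradient
    # updates runs every update_every steps; the opponent is drawn from the args.p1 mixture
    # until args.change_step and from the args.p2 mixture afterwards; when args.cpc is set,
    # the CPC encoder is refreshed on an episode-count schedule; the greedy policy is
    # evaluated against both fixed opponents every save_freq steps.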
    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        T.increment()

        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards,
        # use the learned policy.
        with torch.no_grad():
            if t >= start_steps:
                a = get_action(o)
            else:
                a = np.random.randint(act_dim)

        # Step the env
        o2, r, d, info = env.step(a, opp_policy.get_actions(opp))
        if info.get('no_data_receive', False):
            discard = True
        env.render()
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len or discard else d

        # Store experience to replay buffer
        # replay_buffer.store(o, a, r, o2, d)
        e = E.value()
        transition = (o, a, r, o2, d)
        trajectory.append(transition)
        meta.append([opp, 1, e, ep_len, r, a])

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len) or discard:
            scores.append(ep_ret)
            logger.info("total_step:{}, total_episode:{}, opp:{}, round len:{}, round score:{}, "
                        "100 mean score:{}, 10 mean score:{}".format(
                            t, e, opp, ep_len, ep_ret, np.mean(scores[-100:]), np.mean(scores[-10:])))
            writer.add_scalar("metrics/round_score", ep_ret, t)
            writer.add_scalar("metrics/round_step", ep_len, t)
            writer.add_scalar("metrics/alpha", alpha, t)
            o, ep_ret, ep_len = env.reset(), 0, 0
            replay_buffer.store(trajectory, meta=meta)
            trajectory, meta = [], []
            E.increment()
            if t <= args.change_step:
                opp = get_opp_policy(args.p1)
            else:
                opp = get_opp_policy(args.p2)
            discard = False

        # Update handling
        if t >= update_after and t % update_every == 0:
            for j in range(update_every):
                update()

        # CPC update handling
        if args.cpc and e > args.cpc_batch * 2 and e % args.cpc_update_freq == 0:
            for _ in range(args.cpc_update_freq):
                update_cpc()

        if t >= update_after and t % save_freq == 0:
            # Test the performance of the deterministic version of the agent.
            test_agent(t, args.opp1, writer_1)
            test_agent(t, args.opp2, writer_3)
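
# update() calls _adjust_learning_rate(), which is not shown in this section. A minimal
# sketch, assuming the helper simply overwrites the learning rate of every parameter group
# of the given optimizer (the name and signature come from the calls above; the body is an
# assumption):
def _adjust_learning_rate(optimizer, new_lr):
    for param_group in optimizer.param_groups:
        param_group['lr'] = new_lr
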
if args.cpc:
    global_cpc = CPC(timestep=args.timestep, obs_dim=4, hidden_sizes=[args.hid] * args.l,
                     z_dim=args.z_dim, c_dim=args.c_dim, device=device)
else:
    global_cpc = None

# Create shared models for the actor
global_ac_targ = deepcopy(global_ac)
shared_ac = deepcopy(global_ac).cpu()

# Create optimizers
pi_optimizer = Adam(global_ac.pi.parameters(), lr=args.lr, eps=1e-4)
q1_optimizer = Adam(global_ac.q1.parameters(), lr=args.lr, eps=1e-4)
q2_optimizer = Adam(global_ac.q2.parameters(), lr=args.lr, eps=1e-4)
alpha_optim = Adam([global_ac.log_alpha], lr=args.lr, eps=1e-4)
if args.cpc:
    cpc_optimizer = Adam(global_cpc.parameters(), lr=args.lr, eps=1e-4)
env.close()
del env

# Training setup
T = Counter()  # training steps
E = Counter()  # training episodes
replay_buffer = ReplayBufferOppo(obs_dim=obs_dim, max_size=args.replay_size, cpc=args.cpc,
                                 cpc_model=global_cpc, writer=writer, E=E)
# bufferopp1 = ReplayBufferOppo(obs_dim=obs_dim, max_size=args.replay_size, cpc=args.cpc,
#                               cpc_model=global_cpc, writer=writer, E=E)
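
# Counter is used above as a shared step/episode counter with increment() and value(). A
# minimal sketch, assuming a process-safe integer counter (the class name and the two
# methods come from the calls above; the implementation is an assumption):
import multiprocessing as mp

class Counter:
    def __init__(self):
        self._count = mp.Value('i', 0)

    def increment(self):
        with self._count.get_lock():
            self._count.value += 1

    def value(self):
        return self._count.value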