def test_proc(global_ac, env, opp, args, device): scores, wins, m_score, win_rate = [], [], 0, 0 o, ep_ret, ep_len = env.reset(), 0, 0 discard = False local_t = 0 local_e = 0 opp_policy = Policy(game=env, player_num=False) while local_e < args.test_episode: with torch.no_grad(): a = global_ac.get_action(o, greedy=True, device=device) # Step the env # o2, r, d, info = env.step(a) o2, r, d, info = env.step(a, opp_policy.get_actions(opp)) if info.get('no_data_receive', False): discard = True ep_ret += r ep_len += 1 d = False if (ep_len == args.max_ep_len) or discard else d o = o2 local_t += 1 # End of trajectory handling if d or (ep_len == args.max_ep_len) or discard: # logger.store(EpRet=ep_ret, EpLen=ep_len) local_e += 1 if info.get('win', False): wins.append(1) else: wins.append(0) scores.append(ep_ret) o, ep_ret, ep_len = env.reset(), 0, 0 discard = False m_score = np.mean(scores) win_rate = np.mean(wins) return m_score, win_rate, local_t
def main(): # env = gym.make('CartPole-v0') # obs_dim = env.observation_space.shape[0] # act_dim = env.action_space.n env = SoccerPLUS(visual=False) obs_dim = env.n_features act_dim = env.n_actions learning_rate = 0.0001 gamma = 0.98 hidden = 256 n_rollout = 10 policy_type = 1 opp_policy = Policy(game=env, player_num=False) model = ActorCritic(obs_dim, act_dim, hidden, learning_rate, gamma) # Training Loop print_interval = 100 score = 0.0 n_epi = 0 while True: n_epi += 1 done = False s = env.reset() while not done: for t in range(n_rollout): prob = model.pi(torch.from_numpy(s).float()) m = Categorical(prob) a = m.sample().item() # s_prime, r, done, info = env.step(a) s_prime, r, done, info = env.step( a, opp_policy.get_actions(policy_type)) env.render() model.put_data((s, a, r, s_prime, done)) s = s_prime score += r if done: break model.train_net() if n_epi % print_interval == 0 and n_epi != 0: print("# of episode :{}, avg score : {:.1f}".format( n_epi, score / print_interval)) score = 0.0 env.close()
def sac(env_fn, actor_critic=MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, update_after=1000, update_every=50, num_test_episodes=10, max_ep_len=1000, policy_type=1, logger_kwargs=dict(), save_freq=1000, save_dir=None): torch.manual_seed(seed) np.random.seed(seed) env = env_fn() opp_policy = Policy(game=env, player_num=False) test_env = SoccerPLUS(visual=False) test_opp_policy = Policy(game=test_env, player_num=False) obs_dim = env.n_features act_dim = env.n_actions #env.n_actions # Action limit for clamping: critically, assumes all dimensions share the same bound! # act_limit = env.action_space.high[0] # Create actor-critic module and target networks ac = actor_critic(obs_dim, act_dim, **ac_kwargs) ac_targ = deepcopy(ac) if torch.cuda.is_available(): ac.cuda() ac_targ.cuda() # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, size=replay_size) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple(count_vars(module) for module in [ac.pi, ac.q1, ac.q2]) # Set up optimizers for policy and q-function pi_optimizer = Adam(ac.pi.parameters(), lr=lr) q1_optimizer = Adam(ac.q1.parameters(), lr=lr) q2_optimizer = Adam(ac.q2.parameters(), lr=lr) # Set up model saving # product action def get_actions_info(a_prob): a_dis = Categorical(a_prob) max_a = torch.argmax(a_prob) sample_a = a_dis.sample().cpu() z = a_prob == 0.0 z = z.float() * 1e-20 log_a_prob = torch.log(a_prob + z) return a_prob, log_a_prob, sample_a, max_a # Set up function for computing SAC Q-losses def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] # Bellman backup for Q functions with torch.no_grad(): # Target actions come from *current* policy a_prob, log_a_prob, sample_a, max_a = get_actions_info(ac.pi(o2)) # Target Q-values q1_pi_targ = ac_targ.q1(o2) q2_pi_targ = ac_targ.q2(o2) q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) backup = r + gamma * (1 - d) * torch.sum( a_prob * (q_pi_targ - alpha * log_a_prob), dim=1) # MSE loss against Bellman backup q1 = ac.q1(o).gather(1, a.unsqueeze(-1).long()) q2 = ac.q2(o).gather(1, a.unsqueeze(-1).long()) loss_q1 = F.mse_loss(q1, backup.unsqueeze(-1)) loss_q2 = F.mse_loss(q2, backup.unsqueeze(-1)) loss_q = loss_q1 + loss_q2 return loss_q # Set up function for computing SAC pi loss def compute_loss_pi(data): o = data['obs'] a_prob, log_a_prob, sample_a, max_a = get_actions_info(ac.pi(o)) q1_pi = ac.q1(o) q2_pi = ac.q2(o) q_pi = torch.min(q1_pi, q2_pi) # Entropy-regularized policy loss loss_pi = torch.sum(a_prob * (alpha * log_a_prob - q_pi), dim=1, keepdim=True).mean() entropy = torch.sum(log_a_prob * a_prob, dim=1).detach() # Useful info for logging pi_info = dict(LogPi=entropy.cpu().numpy()) return loss_pi, entropy def update(data): # First run one gradient descent step for Q1 and Q2 q1_optimizer.zero_grad() q2_optimizer.zero_grad() loss_q = compute_loss_q(data) loss_q.backward() nn.utils.clip_grad_norm_(ac.parameters(), max_norm=10, norm_type=2) q1_optimizer.step() q2_optimizer.step() # Next run one gradient descent step for pi. pi_optimizer.zero_grad() loss_pi, entropy = compute_loss_pi(data) loss_pi.backward() nn.utils.clip_grad_norm_(ac.parameters(), max_norm=10, norm_type=2) pi_optimizer.step() # Unfreeze Q-networks so you can optimize it at next DDPG step. # for p in q_params: # p.requires_grad = True # Record things if t >= update_after: # lr = max(args.lr * 2 ** (-(t-update_after) * 0.0001), 1e-10) _adjust_learning_rate(q1_optimizer, max(lr, 1e-10)) _adjust_learning_rate(q2_optimizer, max(lr, 1e-10)) _adjust_learning_rate(pi_optimizer, max(lr, 1e-10)) # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): p_targ.data.copy_((1 - polyak) * p.data + polyak * p_targ.data) writer.add_scalar("training/pi_loss", loss_pi.detach().item(), t) writer.add_scalar("training/q_loss", loss_q.detach().item(), t) writer.add_scalar("training/entropy", entropy.detach().mean().item(), t) writer.add_scalar("training/lr", lr, t) def get_action(o, greedy=False): if len(o.shape) == 1: o = np.expand_dims(o, axis=0) a_prob = ac.act( torch.as_tensor( o, dtype=torch.float32, device=torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")), greedy) a_prob, log_a_prob, sample_a, max_a = get_actions_info(a_prob) action = sample_a if not greedy else max_a return action.item() def test_agent(epoch, t_opp, writer): if num_test_episodes == 0: return with torch.no_grad(): win = 0 total_ret = 0 total_len = 0 for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o2, r, d, _ = test_env.step( get_action(o, True), test_opp_policy.get_actions(t_opp)) r *= 10 # test_env.render() o = o2 ep_ret += r ep_len += 1 total_ret += ep_ret total_len += ep_len if (ep_ret == 50): win += 1 mean_score = total_ret / num_test_episodes win_rate = win / num_test_episodes mean_len = total_len / num_test_episodes logger.info( "opponent:\t{}\ntest epoch:\t{}\nmean score:\t{:.1f}\nwin_rate:\t{}\nmean len:\t{}" .format(t_opp, epoch, mean_score, win_rate, mean_len)) writer.add_scalar("test/mean_score", mean_score, epoch) writer.add_scalar("test/win_rate", win_rate, epoch) writer.add_scalar("test/mean_len", mean_len, epoch) def test_opp(epoch, p, writer): if num_test_episodes == 0: return with torch.no_grad(): win = 0 total_ret = 0 for j in range(num_test_episodes): t_opp = get_opp_policy(p) o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o2, r, d, _ = test_env.step( get_action(o, True), test_opp_policy.get_actions(t_opp)) r *= 10 # test_env.render() o = o2 ep_ret += r ep_len += 1 total_ret += ep_ret if (ep_ret == 50): win += 1 mean_score = total_ret / num_test_episodes win_rate = win / num_test_episodes logger.info( "p:\t{}\ntest epoch:\t{}\nmean score:\t{:.1f}\nwin_rate:\t{}\n" .format(p, epoch, mean_score, win_rate)) writer.add_scalar("test/mean_score", mean_score, epoch) writer.add_scalar("test/win_rate", win_rate, epoch) def get_opp_policy(p): p_sample = np.random.rand() if p_sample < p: return 4 else: return 5 # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() scores = [] o, ep_ret, ep_len = env.reset(), 0, 0 discard = False episode = 0 opp = get_opp_policy(args.p) # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy. with torch.no_grad(): if t >= start_steps: a = get_action(o) else: a = np.random.randint(act_dim) # Step the env o2, r, d, info = env.step(a, opp_policy.get_actions(opp)) # r = int(r * 0.2) if info.get('no_data_receive', False): discard = True env.render() ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len or discard else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) writer.add_scalar("learner/buffer_size", replay_buffer.size, t) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of trajectory handling if d or (ep_len == max_ep_len) or discard: scores.append(ep_ret) # print("total_step {},round len:{}, round score: {}, 100 mean score: {}, 10 mean Score: {}".format(t,ep_len, ep_ret, np.mean(scores[-100:]),np.mean(scores[-10:]))) logger.info( "total_step {}, episode: {}, opp:{}, round len:{}, round score: {}, 100 mean score: {}, 10 mean Score: {}" .format(t, episode, opp, ep_len, ep_ret, np.mean(scores[-100:]), np.mean(scores[-10:]))) writer.add_scalar("metrics/round_score", ep_ret, t) writer.add_scalar("metrics/round_step", ep_len, t) writer.add_scalar("metrics/alpha", alpha, t) o, ep_ret, ep_len = env.reset(), 0, 0 opp = get_opp_policy(args.p) episode += 1 discard = False # Update handling if t >= update_after and t % update_every == 0: for j in range(update_every): batch = replay_buffer.sample_batch( batch_size, device=torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")) update(data=batch) # End of epoch handling # if (t + 1) % steps_per_epoch == 0: # epoch = (t + 1) // steps_per_epoch # # Save model # if t % save_freq == 0 and t > 0: # torch.save(ac.state_dict(), os.path.join(save_dir, "model")) # print("Saving model at episode:{}".format(t)) if t >= update_after and t % save_freq == 0: # Test the performance of the deterministic version of the agent. test_agent(t, 4, writer_1) test_agent(t, 5, writer_3) test_opp(t, args.p, writer_opp)
def sac( rank, E, T, args, model_q, buffer_q, device=None, tensorboard_dir=None, ): torch.manual_seed(args.seed + rank) np.random.seed(args.seed + rank) # writer = GlobalSummaryWriter.getSummaryWriter() tensorboard_dir = os.path.join(tensorboard_dir, str(rank)) if not os.path.exists(tensorboard_dir): os.makedirs(tensorboard_dir) writer = SummaryWriter(log_dir=tensorboard_dir) # if args.exp_name == "test": # env = gym.make("CartPole-v0") # elif args.non_station: # env = make_ftg_ram_nonstation(args.env, p2_list=args.opp_list, total_episode=args.station_rounds,stable=args.stable) # else: # env = make_ftg_ram(args.env, p2=args.p2) # obs_dim = env.observation_space.shape[0] # act_dim = env.action_space.n env = SoccerPLUS() opp_policy = Policy(game=env, player_num=False) opps = [] for opp in args.opp_list: opps += [opp] * args.opp_freq opp = opps[0] obs_dim = env.n_features act_dim = env.n_actions ac_kwargs = dict(hidden_sizes=[args.hid] * args.l) local_ac = MLPActorCritic(obs_dim, act_dim, **ac_kwargs) print("set up child process env") # Prepare for interaction with environment scores, wins = [], [] # meta data is purely for experiment analysis trajectory, meta = [], [] o, ep_ret, ep_len = env.reset(), 0, 0 discard = False local_t, local_e = 0, 0 if not model_q.empty(): print("Process {}\t Initially LOADING...".format(rank)) received_obj = model_q.get() model_dict = deepcopy(received_obj) local_ac.load_state_dict(model_dict) print("Process {}\t Initially Loading FINISHED!!!".format(rank)) del received_obj # Main loop: collect experience in env and update/log each epoch while T.value() < args.episode: with torch.no_grad(): if E.value() <= args.update_after: a = np.random.randint(act_dim) else: a = local_ac.get_action(o, device=device) # print(o) # Step the env o2, r, d, info = env.step(a, opp_policy.get_actions(opp)) env.render() if info.get('no_data_receive', False): discard = True ep_ret += r ep_len += 1 d = False if (ep_len == args.max_ep_len) or discard else d # send the transition to main process # if hasattr(env, 'p2'): # opp = env.p2 # else: # opp = None transition = (o, a, r, o2, d) trajectory.append(transition) meta.append([opp, rank, local_e, ep_len, r, a]) o = o2 local_t += 1 # End of trajectory handling if d or (ep_len == args.max_ep_len) or discard: e = E.value() t = T.value() send_data = (trajectory, meta) buffer_q.put(send_data, ) local_e += 1 # logger.store(EpRet=ep_ret, EpLen=ep_len) if info.get('win', False): wins.append(1) else: wins.append(0) scores.append(ep_ret) m_score = np.mean(scores[-100:]) win_rate = np.mean(wins[-100:]) # print( # "Process\t{}\topponent:{},\t# of local episode :{},\tglobal episode {},\tglobal step {}\tround score: {},\tmean score : {:.1f},\twin rate:{},\tsteps: {}".format( # rank, opp, local_e, e, t, ep_ret, m_score, win_rate, ep_len)) writer.add_scalar("actor/round_score", ep_ret, local_e) writer.add_scalar("actor/mean_score", m_score.item(), local_e) writer.add_scalar("actor/win_rate", win_rate.item(), local_e) writer.add_scalar("actor/round_step", ep_len, local_e) writer.add_scalar("actor/learner_actor_speed", e, local_e) o, ep_ret, ep_len = env.reset(), 0, 0 opp = opps[local_e % len(opps)] discard = False trajectory, meta = list(), list() if not model_q.empty(): # print("Process {}\tLOADING model at Global\t{},local\t{} EPISODE...".format(rank, e, local_e)) received_obj = model_q.get() model_dict = deepcopy(received_obj) local_ac.load_state_dict(model_dict) # print("Process {}\tLOADED new mode at Global\t{},local\t{}!!!".format(rank, e, local_e)) del received_obj print("Process {}\tActor Ended".format(rank))