# The three training scripts below (MAVPG on markov soccer, GDA on iterated
# matching pennies, and MAVPG on the coin game) live in separate files in the
# repo; their common imports are listed once here. The project-local names
# they rely on (the policy/critic networks, the CGD/RCGD/LOLA1 optimizers, the
# environments, and the get_two_state/get_advantage helpers) come from
# repo-specific modules whose paths are not shown in this excerpt.
import argparse
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import torch
import yaml
from torch.distributions import Categorical
from torch.utils.tensorboard import SummaryWriter

# markov soccer environment (assumed: OpenSpiel, from the rl_environment API used below)
from open_spiel.python import rl_environment


# === Script 1: MAVPG on markov soccer ===
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='MAVPG markov soccer')
    parser.add_argument('--n-epochs', type=int, default=30001, metavar='N',
                        help='number of epochs to train (default: 30001)')
    parser.add_argument('--n-eps', type=int, default=10, metavar='N',
                        help='number of episodes in an epoch (default: 10)')
    parser.add_argument('--lamda1', type=float, default=0.5, metavar='LAM',
                        help='weight on performance of agent 1 (default: 0.5)')
    parser.add_argument('--lamda2', type=float, default=0.5, metavar='LAM',
                        help='weight on performance of agent 2 (default: 0.5)')
    parser.add_argument('--lr-p', type=float, default=0.01, metavar='LR',
                        help='mavpg learning rate for actors (default: 0.01)')
    parser.add_argument('--lr-q1', type=float, default=0.01, metavar='LR',
                        help='critic 1 learning rate (default: 0.01)')
    parser.add_argument('--lr-q2', type=float, default=0.01, metavar='LR',
                        help='critic 2 learning rate (default: 0.01)')
    parser.add_argument('--gamma', type=float, default=0.99, metavar='GAMMA',
                        help='discount factor (default: 0.99)')
    parser.add_argument('--tau', type=float, default=0.95, metavar='TAU',
                        help='GAE factor (default: 0.95)')
    parser.add_argument('--obs-dim', type=int, default=12, metavar='DIM',
                        help='dimension of observation space of each agent (default: 12)')
    parser.add_argument('--state-dim', type=int, default=12, metavar='DIM',
                        help='dimension of state space (default: 12)')
    parser.add_argument('--beta', type=float, default=0.99, metavar='MOM',
                        help='momentum (default: 0.99)')
    parser.add_argument('--eps', type=float, default=1e-8, metavar='EPS',
                        help='epsilon (default: 1e-8)')
    parser.add_argument('--run-num', type=int, default=0, metavar='NUM',
                        help='index of experiment run (default: 0)')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='save model parameters or not (default: False)')
    parser.add_argument('--rms', action='store_true', default=False,
                        help='use mavpg with rms or not (default: False)')
    parser.add_argument('--cuda', action='store_true', default=False,
                        help='use cuda or not (default: False)')
    args = parser.parse_args()
    """###################### Hyperparameters ########################
    n_epochs, n_eps, lamda1, lamda2, lr_p, lr_q1, lr_q2, gamma, tau,
    obs_dim, state_dim, cuda, save_model, beta, eps, run_num, rms
    ###############################################################"""

    use_cuda = args.cuda and torch.cuda.is_available()
    print(use_cuda, args.cuda, torch.cuda.is_available())
    device = torch.device("cuda" if use_cuda else "cpu")

    env_location = '../tensorboard/markov_soccer'
    experiment_name = '/mavpg_lr=' + str(args.lr_p)
    model_mavpg = env_location + experiment_name + '/model'
    data_mavpg = env_location + experiment_name + '/data'
    if not os.path.exists(model_mavpg):
        os.makedirs(model_mavpg)
    if not os.path.exists(data_mavpg):
        os.makedirs(data_mavpg)
    writer = SummaryWriter(data_mavpg)

    game = 'markov_soccer'
    env = rl_environment.Environment(game)
    action_dim = env.action_spec()['num_actions']

    p1 = policy(args.obs_dim, action_dim).to(device)
    p2 = policy(args.obs_dim, action_dim).to(device)
    q1 = critic(args.state_dim).to(device)
    q2 = critic(args.state_dim).to(device)

    if not args.rms:
        policy_optim = CGD(p1.parameters(), p2.parameters(), lr=args.lr_p, device=device)
    else:
        policy_optim = RCGD(p1.parameters(), p2.parameters(), lr=args.lr_p,
                            beta=args.beta, eps=args.eps, device=device)
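    # Editorial note (hedged): CGD/RCGD are the repo's coupled two-player
    # optimizers; judging by the constructor arguments, RCGD appears to be an
    # RMSProp-preconditioned variant of CGD (hence the --beta and --eps
    # hyperparameters), while CGD takes plain gradient-sized steps.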
    optim_q1 = torch.optim.Adam(q1.parameters(), lr=args.lr_q1)
    optim_q2 = torch.optim.Adam(q2.parameters(), lr=args.lr_q2)  # fixed: was args.lr_q1

    n_wins_a = n_wins_a_500 = 0
    n_wins_b = n_wins_b_500 = 0

    type_obs = 'full' if args.state_dim == args.obs_dim else 'partial'
    # File to keep all training logs, e.g.
    # ../tensorboard/markov_soccer/mavpg_lr=0.01/ms_full_mavpg_lr=0.01_run_num=0.txt
    logs_file_path = env_location + experiment_name + '/ms_' + type_obs + '_mavpg_lr=' + \
        str(args.lr_p) + '_run_num=' + str(args.run_num) + '.txt'
    f_ptr = open(logs_file_path, 'a')

    avg_t_opt = 0
    avg_t_p_opt = 0
    avg_t_q_opt = 0
    t_episode = []

    start = time.time()
    for epoch in range(args.n_epochs):
        state_a = []
        state_b = []
        action_a_b = []
        reward_a = []
        reward_b = []
        timestep = env.reset()
        a_status, b_status, s_a, s_b = get_two_state(timestep)
        time_in_epoch = 0
        avg_time_in_epoch = 0
        for eps in range(args.n_eps):
            time_in_episode = 0
            timestep = env.reset()
            a_status, b_status, s_a, s_b = get_two_state(timestep)
            while not timestep.last():
                time_in_episode += 1
                # pi1/pi2 are the softmax outputs over the action space
                pi1 = p1(torch.FloatTensor(s_a).to(device))
                dist1 = Categorical(pi1)
                action1 = dist1.sample().cpu()
                pi2 = p2(torch.FloatTensor(s_b).to(device))
                dist2 = Categorical(pi2)
                action2 = dist2.sample().cpu()
                action = np.array([action1, action2])

                state_a.append(torch.FloatTensor(s_a))
                state_b.append(torch.FloatTensor(s_b))
                action_a_b.append(torch.FloatTensor(action))

                timestep = env.step(action)
                a_status, b_status, s_a, s_b = get_two_state(timestep)
                rew_a = timestep.rewards[0]
                rew_b = timestep.rewards[1]
                reward_a.append(torch.FloatTensor([rew_a]))
                reward_b.append(torch.FloatTensor([rew_b]))
                if timestep.last():  # fixed: was `timestep.last == True` (missing call)
                    # fixed: log the scalar final reward, not the reward list
                    writer.add_scalar('reward_zero_sum for agent1', rew_a, epoch)
                    break

            t_episode.append(time_in_episode)
            s = 'epoch number: {}, time in episode {}: {}, Done reached'.format(epoch, eps, time_in_episode)
            f_ptr.write(s + '\n')
            if rew_a > 0:
                s = 'A won episode {} of epoch {}'.format(eps, epoch)
                f_ptr.write(s + '\n')
                n_wins_a += 1
                n_wins_a_500 += 1
            if rew_b > 0:
                s = 'B won episode {} of epoch {}'.format(eps, epoch)
                f_ptr.write(s + '\n')
                n_wins_b += 1
                n_wins_b_500 += 1
            time_in_epoch += time_in_episode
            s = 'Total time in episode {} of epoch {}: {}'.format(eps, epoch, time_in_episode)
            f_ptr.write(s + '\n\n')
            writer.add_scalar('Total time in this episode', time_in_episode, epoch * args.n_eps + eps)

        avg_time_in_epoch = time_in_epoch / args.n_eps
        s = 'Total time in epoch {}: {}'.format(epoch, time_in_epoch)
        f_ptr.write(s + '\n')
        s = 'Average time in epoch {}: {}'.format(epoch, avg_time_in_epoch)
        f_ptr.write(s + '\n\n')
        writer.add_scalar('Total time in this epoch', time_in_epoch, epoch)
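        # Note: each epoch concatenates args.n_eps episodes into one batch
        # (state_a/state_b/action_a_b/reward_a/reward_b grow across the episode
        # loop), and a single coupled policy update is performed on the whole
        # batch below.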
        writer.add_scalar('Average time in this epoch', avg_time_in_epoch, epoch)

        start_opt = time.time()
        val1 = q1(torch.stack(state_a).to(device))
        v1 = val1.detach().squeeze().cpu()  # fixed: move to CPU before appending the CPU terminal value
        v1 = torch.cat((v1, torch.FloatTensor([0])))  # append terminal value V = 0
        r1 = torch.cat(reward_a)  # fixed: was torch.tensor(<list of tensors>)
        # advantage1 is detached and lives on `device`
        advantage1 = get_advantage(v1, r1, args.gamma, args.tau).to(device)

        val2 = q2(torch.stack(state_b).to(device))
        v2 = val2.detach().squeeze().cpu()
        v2 = torch.cat((v2, torch.FloatTensor([0])))
        r2 = torch.cat(reward_b)
        advantage2 = get_advantage(v2, r2, args.gamma, args.tau).to(device)

        # TD(0) critic losses; the bootstrap target r_t + gamma * V(s_{t+1}) uses
        # the detached values v1/v2 (fixed: the original differenced
        # val1[1:] - val1[:-1], whose shapes do not line up with r1)
        q1_loss = (r1.to(device) + args.gamma * v1[1:].to(device) - val1.squeeze()).pow(2).mean()
        optim_q1.zero_grad()
        q1_loss.backward()
        optim_q1.step()
        q2_loss = (r2.to(device) + args.gamma * v2[1:].to(device) - val2.squeeze()).pow(2).mean()
        optim_q2.zero_grad()
        q2_loss.backward()
        optim_q2.step()
        end_q = time.time()
        t_q_opt = end_q - start_opt
        avg_t_q_opt = (avg_t_q_opt * epoch + t_q_opt) / (epoch + 1)

        s = 'Mean advantage for agent 1 (eta_new - eta_old): {}\nMean advantage for agent 2 -(eta_new - eta_old): {}'.format(
            advantage1.mean().cpu(), advantage2.mean().cpu())
        f_ptr.write(s + '\n')
        writer.add_scalar('Mean advantage for agent1', advantage1.mean().cpu(), epoch)
        writer.add_scalar('Mean advantage for agent2', advantage2.mean().cpu(), epoch)

        action_both = torch.stack(action_a_b).to(device)  # fixed: moved to device for log_prob
        pi1_a_s = p1(torch.stack(state_a).to(device))
        dist1 = Categorical(pi1_a_s)
        log_prob1 = dist1.log_prob(action_both[:, 0])
        pi2_a_s = p2(torch.stack(state_b).to(device))
        dist2 = Categorical(pi2_a_s)
        log_prob2 = dist2.log_prob(action_both[:, 1])

        # Running sums of log-probabilities up to step t, used in the mixed
        # second-order term; equivalent to torch.cumsum(log_prob, dim=0)[:-1]
        cum_log_prob1 = torch.zeros(log_prob1.shape[0] - 1).to(device)
        cum_log_prob2 = torch.zeros(log_prob2.shape[0] - 1).to(device)
        cum_log_prob1[0] = log_prob1[0]
        cum_log_prob2[0] = log_prob2[0]
        for i in range(1, log_prob1.shape[0] - 1):
            cum_log_prob1[i] = cum_log_prob1[i - 1] + log_prob1[i]
            cum_log_prob2[i] = cum_log_prob2[i - 1] + log_prob2[i]

        lp_x_1 = (log_prob1 * advantage1).mean()
        lp_x_2 = (log_prob1 * advantage2).mean()
        lp_y_1 = (log_prob2 * advantage1).mean()
        lp_y_2 = (log_prob2 * advantage2).mean()
        lp_x = args.lamda1 * lp_x_1 - args.lamda2 * lp_x_2
        lp_y = args.lamda1 * lp_y_1 - args.lamda2 * lp_y_2

        mh1_1 = (log_prob1 * log_prob2 * advantage1).mean()
        mh2_1 = log_prob1[1:] * cum_log_prob2 * advantage1[1:]
        mh2_1 = mh2_1.sum() / (mh2_1.size(0) - args.n_eps + 1)
        mh3_1 = log_prob2[1:] * cum_log_prob1 * advantage1[1:]
        mh3_1 = mh3_1.sum() / (mh3_1.size(0) - args.n_eps + 1)
        mh1_2 = (log_prob1 * log_prob2 * advantage2).mean()
        mh2_2 = log_prob1[1:] * cum_log_prob2 * advantage2[1:]
        mh2_2 = mh2_2.sum() / (mh2_2.size(0) - args.n_eps + 1)
        mh3_2 = log_prob2[1:] * cum_log_prob1 * advantage2[1:]
        mh3_2 = mh3_2.sum() / (mh3_2.size(0) - args.n_eps + 1)
        mh_1 = mh1_1 + mh2_1 + mh3_1
        mh_2 = mh1_2 + mh2_2 + mh3_2
        mh = args.lamda1 * mh_1 - args.lamda2 * mh_2

        policy_optim.zero_grad()
        policy_optim.step(lp_x, lp_y, mh)
        end_opt = time.time()
        t_p_opt = end_opt - end_q
        avg_t_p_opt = (avg_t_p_opt * epoch + t_p_opt) / (epoch + 1)
        t_opt = end_opt - start_opt
        avg_t_opt = (avg_t_opt * epoch + t_opt) / (epoch + 1)
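        # Editorial note (hedged): lp_x and lp_y are the score-function
        # surrogates whose first derivatives give the two policy gradients,
        # while mh is a surrogate whose mixed derivative D^2_xy approximates
        # the game-interaction (bilinear) term of the MAVPG/CoPG-style update;
        # the cumulative log-probability products account for cross-time terms
        # within an episode. CGD/RCGD presumably obtain the required
        # Hessian-vector products from these scalars by double automatic
        # differentiation (see the standalone sketch after the coin-game
        # script below).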
        if (epoch + 1) % 500 == 0:
            # wins per epoch over the last 500 epochs (each epoch has args.n_eps episodes)
            s = '\nA won {} of games in last 500 epochs | B won {} of games in last 500 epochs'.format(
                n_wins_a_500 / 500, n_wins_b_500 / 500)
            f_ptr.write(s + '\n')
            writer.add_scalar('Win rate last 500 episodes for agent1', n_wins_a_500 / 500, epoch)
            writer.add_scalar('Win rate last 500 episodes for agent2', n_wins_b_500 / 500, epoch)
            n_wins_a_500 = 0
            n_wins_b_500 = 0

        tot_games = max(n_wins_a + n_wins_b, 1)  # fixed: guard against division by zero before any win
        s = '\nA won {} of games till now | B won {} of games till now'.format(
            n_wins_a / tot_games, n_wins_b / tot_games)
        f_ptr.write(s + '\n' + '#' * 114 + '\n\n')
        writer.add_scalar('Entropy for agent1', dist1.entropy().mean().detach(), epoch)
        writer.add_scalar('Entropy for agent2', dist2.entropy().mean().detach(), epoch)
        writer.add_scalar('Cum win rate for agent1', n_wins_a / tot_games, epoch)
        writer.add_scalar('Cum win rate for agent2', n_wins_b / tot_games, epoch)
        writer.add_scalar('Time/avg_t_opt', avg_t_opt, epoch)
        writer.add_scalar('Time/avg_t_p_opt', avg_t_p_opt, epoch)
        writer.add_scalar('Time/avg_t_q_opt', avg_t_q_opt, epoch)
        writer.add_scalar('Time/t_opt', t_opt, epoch)
        writer.add_scalar('Time/t_p_opt', t_p_opt, epoch)
        writer.add_scalar('Time/t_q_opt', t_q_opt, epoch)

        if args.save_model and epoch % 500 == 0:
            torch.save(p1.state_dict(), model_mavpg + '/policy_agent1_' + str(epoch) + '.pth')
            torch.save(p2.state_dict(), model_mavpg + '/policy_agent2_' + str(epoch) + '.pth')
            torch.save(q1.state_dict(), model_mavpg + '/value_agent1_' + str(epoch) + '.pth')
            torch.save(q2.state_dict(), model_mavpg + '/value_agent2_' + str(epoch) + '.pth')

    end = time.time()
    total_time = end - start
    s = 'Total time taken: {} seconds \n\n'.format(total_time)
    f_ptr.write(s)
    np_file = env_location + experiment_name + '/ms_mavpg_lr=' + str(args.lr_p) + '_n_wins_time.npz'
    with open(np_file, 'wb') as np_f:
        np.savez(np_f,
                 n_wins_a=np.array(n_wins_a),
                 n_wins_b=np.array(n_wins_b),
                 time_in_episode=np.array(t_episode),
                 total_time=np.array(total_time))
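
# Hedged sketch (editorial addition, not the repo's code): a GAE(lambda)
# implementation consistent with the call get_advantage(values, rewards,
# gamma, tau) used above, where `values` carries an appended terminal 0 so
# that values[t + 1] is always defined. The real helper lives elsewhere in the
# repo; this only illustrates the recursion
#   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t),   A_t = delta_t + gamma * tau * A_{t+1}
def get_advantage_sketch(values, rewards, gamma, tau):
    advantages = torch.zeros(len(rewards))
    gae = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        gae = delta + gamma * tau * gae
        advantages[t] = gae
    return advantages
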
# === Script 2: GDA on iterated matching pennies ===
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='GDA iterated matching pennies')
    parser.add_argument('--n-epochs', type=int, default=151, metavar='N',
                        help='number of epochs to train (default: 151)')
    parser.add_argument('--repeated-steps', type=int, default=200, metavar='N',
                        help='number of repetitions of the matrix game, not known to the agents (default: 200)')
    parser.add_argument('--lr-p1', type=float, default=0.01, metavar='LR',
                        help='gda learning rate for actor 1 (default: 0.01)')
    parser.add_argument('--lr-p2', type=float, default=0.01, metavar='LR',
                        help='gda learning rate for actor 2 (default: 0.01)')
    parser.add_argument('--lr-q1', type=float, default=0.01, metavar='LR',
                        help='critic 1 learning rate (default: 0.01)')
    parser.add_argument('--lr-q2', type=float, default=0.01, metavar='LR',
                        help='critic 2 learning rate (default: 0.01)')
    parser.add_argument('--beta', type=float, default=0.99, metavar='MOM',
                        help='momentum (default: 0.99)')
    parser.add_argument('--eps', type=float, default=1e-8, metavar='EPS',
                        help='epsilon (default: 1e-8)')
    parser.add_argument('--run-num', type=int, default=0, metavar='NUM',
                        help='index of experiment run (default: 0)')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='save model parameters or not (default: False)')
    parser.add_argument('--rms', action='store_true', default=False,
                        help='use gda with rms or not (default: False)')
    parser.add_argument('--cuda', action='store_true', default=False,
                        help='use cuda or not (default: False)')
    parser.add_argument('--tensorboard', action='store_true', default=False,
                        help='use tensorboard or not (default: False)')
    parser.add_argument('--activation-function', type=str, default='tan',
                        help='which activation function to use (relu or tan, default: tan)')
    parser.add_argument('--policy', type=str, default='mlp',
                        help='which type of policy to use (lstm or mlp, default: mlp)')
    parser.add_argument('--gamma', type=float, default=0.99, metavar='GAMMA',
                        help='discount factor (default: 0.99)')
    parser.add_argument('--tau', type=float, default=0.95, metavar='TAU',
                        help='GAE factor (default: 0.95)')
    parser.add_argument('--logs', action='store_true', default=False,
                        help='write data to logs or not (default: False)')
    args = parser.parse_args()
    """###################### Hyperparameters ########################
    n_epochs, repeated_steps, lr_p1, lr_p2, lr_q1, lr_q2, gamma, tau,
    cuda, save_model, beta, eps, run_num, rms, tensorboard, logs
    ###############################################################"""

    use_cuda = args.cuda and torch.cuda.is_available()
    print(use_cuda, args.cuda, torch.cuda.is_available())
    device = torch.device("cuda" if use_cuda else "cpu")

    env_location = '../tensorboard/iterated_matching_pennies'
    experiment_name = '/gda_lr=' + str(args.lr_p1) + '_' + str(args.lr_p2) + \
        '/run_' + str(args.run_num) + '_' + args.activation_function + '_' + args.policy
    model_gda = env_location + experiment_name + '/model'
    data_gda = env_location + experiment_name + '/data'
    if not os.path.exists(model_gda):
        os.makedirs(model_gda)
    if not os.path.exists(data_gda):
        os.makedirs(data_gda)

    env = iterated_matching_pennies(args.repeated_steps)

    # Observation indices: 0: HH, 1: HT, 2: TH, 3: TT, 4: initial state
    # (the observation is the joint action taken in the previous iteration)
    obs_dim = 5
    # Combined observation of both agents: s1 = [o1, o2], s2 = [o2, o1]
    state_dim = 10
    # H: Heads, T: Tails
    action_dim = 2

    if args.policy == 'mlp':
        if args.activation_function == 'tan':
            p1 = MLP_policy_tan(obs_dim, action_dim).to(device)
            p2 = MLP_policy_tan(obs_dim, action_dim).to(device)
        else:
            p1 = MLP_policy_relu(obs_dim, action_dim).to(device)
            p2 = MLP_policy_relu(obs_dim, action_dim).to(device)
    else:
        if args.activation_function == 'tan':
            p1 = LSTM_policy_tan(obs_dim, action_dim).to(device)
            p2 = LSTM_policy_tan(obs_dim, action_dim).to(device)
        else:
            p1 = LSTM_policy_relu(obs_dim, action_dim).to(device)
            p2 = LSTM_policy_relu(obs_dim, action_dim).to(device)
    q1 = value_network(state_dim).to(device)
    q2 = value_network(state_dim).to(device)

    if not args.rms:
        optim_p1 = torch.optim.SGD(p1.parameters(), lr=args.lr_p1)
        optim_p2 = torch.optim.SGD(p2.parameters(), lr=args.lr_p2)
    else:
        # fixed: the class is torch.optim.RMSprop (the original spelled it RMSProp)
        optim_p1 = torch.optim.RMSprop(p1.parameters(), lr=args.lr_p1, momentum=args.beta, eps=args.eps)
        optim_p2 = torch.optim.RMSprop(p2.parameters(), lr=args.lr_p2, momentum=args.beta, eps=args.eps)
    optim_q1 = torch.optim.Adam(q1.parameters(), lr=args.lr_q1)
    optim_q2 = torch.optim.Adam(q2.parameters(), lr=args.lr_q2)

    if args.logs:
        logs_file_path = env_location + experiment_name + '/imp_gda_lr=' + \
            str(args.lr_p1) + '_' + str(args.lr_p2) + '.txt'
        f_ptr = open(logs_file_path, 'a')
    if args.tensorboard:
        writer = SummaryWriter(data_gda)

    prob_h_1 = []
    prob_t_1 = []
    prob_h_2 = []
    prob_t_2 = []
    avg_rew_1 = []
    avg_rew_2 = []
    total_t_p_opt = 0
    total_t_q_opt = 0
    avg_t_p_opt = 0
    avg_t_q_opt = 0

    start = time.time()
    for epoch in range(args.n_epochs):
        state_1 = []
        state_2 = []
        action_1_2 = []
        reward_1 = []
        reward_2 = []
        observations, rewards, done = env.reset()
        while not done:
            pi1_a_s = p1(torch.FloatTensor(observations[0]).to(device))
            dist1 = Categorical(pi1_a_s)
            action1 = dist1.sample().cpu()
            pi2_a_s = p2(torch.FloatTensor(observations[1]).to(device))
            dist2 = Categorical(pi2_a_s)
            action2 = dist2.sample().cpu()
            action = np.array([action1, action2])

            state_1.append(torch.FloatTensor(np.array([observations[0], observations[1]]).reshape(state_dim)))
            state_2.append(torch.FloatTensor(np.array([observations[1], observations[0]]).reshape(state_dim)))
            action_1_2.append(torch.FloatTensor(action))

            observations, rewards, done = env.step(action)
            reward_1.append(torch.FloatTensor([rewards[0]]))
            reward_2.append(torch.FloatTensor([rewards[1]]))
            if done:
                break

        val1 = q1(torch.stack(state_1).to(device))
        v1 = val1.detach().squeeze().cpu()
        v1 = torch.cat((v1, torch.FloatTensor([0])))  # append terminal value V = 0
        r1 = torch.cat(reward_1)  # fixed: was torch.tensor(<list of tensors>)
        # advantage1 is detached and lives on `device`
        advantage1 = get_advantage(v1, r1, args.gamma, args.tau).to(device)

        val2 = q2(torch.stack(state_2).to(device))
        v2 = val2.detach().squeeze().cpu()
        v2 = torch.cat((v2, torch.FloatTensor([0])))
        r2 = torch.cat(reward_2)
        advantage2 = get_advantage(v2, r2, args.gamma, args.tau).to(device)

        avg_rew_1.append(sum(reward_1) / len(reward_1))
        avg_rew_2.append(sum(reward_2) / len(reward_2))
        if args.tensorboard:
            writer.add_scalar('Average reward in the epoch/Agent1', sum(reward_1) / len(reward_1), epoch)
            writer.add_scalar('Average reward in the epoch/Agent2', sum(reward_2) / len(reward_2), epoch)
            writer.add_scalar('Mean advantage in the epoch/Agent1', advantage1.mean().cpu(), epoch)
            writer.add_scalar('Mean advantage in the epoch/Agent2', advantage2.mean().cpu(), epoch)

        # TD(0) critic losses with detached bootstrap targets (fixed, as in the
        # soccer script: the original differenced val[1:] - val[:-1], whose
        # shapes do not line up with r)
        q1_loss = (r1.to(device) + args.gamma * v1[1:].to(device) - val1.squeeze()).pow(2).mean()
        q2_loss = (r2.to(device) + args.gamma * v2[1:].to(device) - val2.squeeze()).pow(2).mean()
        optim_q1.zero_grad()
        optim_q2.zero_grad()
        start_q = time.time()
        q1_loss.backward()
        optim_q1.step()
        q2_loss.backward()
        optim_q2.step()
        end_q = time.time()
        t_q_opt = end_q - start_q
        total_t_q_opt += t_q_opt
        avg_t_q_opt = (avg_t_q_opt * epoch + t_q_opt) / (epoch + 1)

        action_both = torch.stack(action_1_2).to(device)
        # Mean action per epoch (0: Heads, 1: Tails); the original comment said
        # "cooperate or defect", a copy-over from a prisoner's dilemma script
        if args.tensorboard:
            writer.add_scalar('Action/Agent1', torch.mean(action_both[:, 0]), epoch)
            writer.add_scalar('Action/Agent2', torch.mean(action_both[:, 1]), epoch)

        pi1_a_s = p1((torch.stack(state_1)[:, :obs_dim]).to(device))
        dist1 = Categorical(pi1_a_s)
        log_prob1 = dist1.log_prob(action_both[:, 0])
        # Note (hedged): state_2 = [o2, o1], so [:, obs_dim:] selects o1; in this
        # game both agents appear to observe the same previous joint action, so
        # this coincides with the o2 that p2 saw during play
        pi2_a_s = p2((torch.stack(state_2)[:, obs_dim:]).to(device))
        dist2 = Categorical(pi2_a_s)
        log_prob2 = dist2.log_prob(action_both[:, 1])

        if args.tensorboard:
            writer.add_scalar('Entropy/Agent1', dist1.entropy().mean().detach(), epoch)
            writer.add_scalar('Entropy/Agent2', dist2.entropy().mean().detach(), epoch)

        # GDA: each agent independently ascends its own advantage; minimizing
        # log_prob * (-advantage) is gradient ascent on the return surrogate
        objective1 = (log_prob1 * (-advantage1)).mean()
        optim_p1.zero_grad()
        objective2 = (log_prob2 * (-advantage2)).mean()
        optim_p2.zero_grad()
        start_p = time.time()
        objective1.backward()
        optim_p1.step()
        objective2.backward()
        optim_p2.step()
        end_p = time.time()
        t_p_opt = end_p - start_p
        total_t_p_opt += t_p_opt
        avg_t_p_opt = total_t_p_opt / (epoch + 1)
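        # Editorial note (hedged): this is plain simultaneous gradient
        # descent-ascent; both objectives are formed from the same on-policy
        # batch before either optimizer steps, and each agent's update ignores
        # the other's learning, in contrast to the coupled CGD/RCGD step used
        # in the MAVPG scripts.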
        if args.tensorboard:
            writer.add_scalar('Mean_prob_heads/Agent1', pi1_a_s.data[:, 0].mean(), epoch)
            writer.add_scalar('Mean_prob_tails/Agent1', pi1_a_s.data[:, 1].mean(), epoch)
            writer.add_scalar('Mean_prob_heads/Agent2', pi2_a_s.data[:, 0].mean(), epoch)
            writer.add_scalar('Mean_prob_tails/Agent2', pi2_a_s.data[:, 1].mean(), epoch)
            writer.add_scalar('Time/avg_t_p_opt', avg_t_p_opt, epoch)
            writer.add_scalar('Time/avg_t_q_opt', avg_t_q_opt, epoch)
            writer.add_scalar('Time/t_p_opt', t_p_opt, epoch)
            writer.add_scalar('Time/t_q_opt', t_q_opt, epoch)
        if args.logs:
            s = ('Epoch: {}\nMean probability of Heads/Agent1: {}, Mean probability of Tails/Agent1: {}\n'
                 'Mean probability of Heads/Agent2: {}, Mean probability of Tails/Agent2: {}\n\n').format(
                epoch, pi1_a_s.data[:, 0].mean(), pi1_a_s.data[:, 1].mean(),
                pi2_a_s.data[:, 0].mean(), pi2_a_s.data[:, 1].mean())
            f_ptr.write(s)
            s = 'Average reward in the epoch for agent 1: {}\nAverage reward in the epoch for agent 2: {}\n\n'.format(
                sum(reward_1) / len(reward_1), sum(reward_2) / len(reward_2))
            f_ptr.write(s)
            s = ('Time for policy optimization in this epoch: {} seconds\n'
                 'Time for critic optimization in this epoch: {} seconds\n\n').format(t_p_opt, t_q_opt)
            f_ptr.write(s + '#' * 79 + '\n\n')

        prob_h_1.append(pi1_a_s.data[:, 0].mean())
        prob_t_1.append(pi1_a_s.data[:, 1].mean())
        prob_h_2.append(pi2_a_s.data[:, 0].mean())
        prob_t_2.append(pi2_a_s.data[:, 1].mean())

        if args.save_model and epoch % 50 == 0:
            torch.save(p1.state_dict(), model_gda + '/policy_agent1_' + str(epoch) + '.pth')
            torch.save(p2.state_dict(), model_gda + '/policy_agent2_' + str(epoch) + '.pth')
            torch.save(q1.state_dict(), model_gda + '/value_agent1_' + str(epoch) + '.pth')
            torch.save(q2.state_dict(), model_gda + '/value_agent2_' + str(epoch) + '.pth')

    end = time.time()
    total_time = end - start
    if args.logs:
        s = ('Total time taken: {} seconds\nTotal time for policy optimization steps only: {} seconds\n'
             'Total time for critic optimization steps only: {} seconds\n').format(
            total_time, total_t_p_opt, total_t_q_opt)
        f_ptr.write(s)
        s = ('Average time for policy optimization steps only: {} seconds\n'
             'Average time for critic optimization steps only: {} seconds\n\n').format(avg_t_p_opt, avg_t_q_opt)
        f_ptr.write(s)

    np_file = env_location + experiment_name + '/imp_gda_lr=' + \
        str(args.lr_p1) + '_' + str(args.lr_p2) + '_rew_prob_h_t_time.npz'
    with open(np_file, 'wb') as np_f:
        np.savez(np_f,
                 avg_rew_1=np.array(avg_rew_1), avg_rew_2=np.array(avg_rew_2),
                 prob_h_1=np.array(prob_h_1), prob_t_1=np.array(prob_t_1),
                 prob_h_2=np.array(prob_h_2), prob_t_2=np.array(prob_t_2),
                 total_time=np.array(total_time),
                 total_time_p_opt=np.array(total_t_p_opt), total_time_q_opt=np.array(total_t_q_opt),
                 avg_time_p_opt=np.array(avg_t_p_opt), avg_time_q_opt=np.array(avg_t_q_opt))

    fig = plt.figure(figsize=(15, 15))
    ax = plt.subplot()
    ax.clear()
    fig.suptitle('Probabilities of heads and tails v/s #iterations/epochs (repeated steps = {})'.format(
        args.repeated_steps), fontsize=25)
    plt.xlabel(r'$Iterations/Epochs$', fontsize=20)
    plt.ylabel(r'$Probability$', fontsize=20)
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)
    ax.plot(np.array(prob_h_1), label='GDA, Prob_H1, lr1 = {}, lr2 = {}'.format(args.lr_p1, args.lr_p2))
    ax.plot(np.array(prob_t_1), label='GDA, Prob_T1, lr1 = {}, lr2 = {}'.format(args.lr_p1, args.lr_p2))
    ax.plot(np.array(prob_h_2), label='GDA, Prob_H2, lr1 = {}, lr2 = {}'.format(args.lr_p1, args.lr_p2))
    ax.plot(np.array(prob_t_2), label='GDA, Prob_T2, lr1 = {}, lr2 = {}'.format(args.lr_p1, args.lr_p2))
    plt.legend(loc='upper right')
    plt.grid()
    plt.savefig(env_location + experiment_name + '/imp_gda_lr=' +
                str(args.lr_p1) + '_' + str(args.lr_p2) + '_prob_h_t.png')

    fig = plt.figure(figsize=(15, 15))
    ax = plt.subplot()
    ax.clear()
    fig.suptitle('Avg reward/epoch for Agent1 & Agent2 v/s #iterations/epochs (repeated steps = {})'.format(
        args.repeated_steps), fontsize=20)
    plt.xlabel(r'$Iterations/Epochs$', fontsize=20)
    plt.ylabel(r'$Average\ reward\ per\ epoch$', fontsize=20)
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)
    ax.plot(np.array(avg_rew_1), label='GDA, Avg rew/epoch Ag1, lr1 = {}, lr2 = {}'.format(args.lr_p1, args.lr_p2))
    ax.plot(np.array(avg_rew_2), label='GDA, Avg rew/epoch Ag2, lr1 = {}, lr2 = {}'.format(args.lr_p1, args.lr_p2))
    plt.legend(loc='upper right')
    plt.grid()
    plt.savefig(env_location + experiment_name + '/imp_gda_lr=' +
                str(args.lr_p1) + '_' + str(args.lr_p2) + '_avg_rew_per_epoch.png')
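
# Hedged sketch (editorial addition): a minimal environment with the
# reset()/step() interface the loop above expects. Observation indices follow
# the comment in the script (0: HH, 1: HT, 2: TH, 3: TT, 4: initial state);
# here both agents see the same one-hot of the previous joint action, which
# may differ in detail from the repo's iterated_matching_pennies. Rewards are
# zero-sum: agent 1 gets +1 on a match (HH or TT) and -1 otherwise.
class IteratedMatchingPenniesSketch:
    def __init__(self, repeated_steps):
        self.repeated_steps = repeated_steps
        self.t = 0

    def _obs(self, idx):
        one_hot = np.zeros(5, dtype=np.float32)
        one_hot[idx] = 1.0
        return [one_hot, one_hot]

    def reset(self):
        self.t = 0
        return self._obs(4), np.zeros(2, dtype=np.float32), False

    def step(self, action):
        self.t += 1
        idx = 2 * int(action[0]) + int(action[1])  # HH=0, HT=1, TH=2, TT=3
        r1 = 1.0 if int(action[0]) == int(action[1]) else -1.0
        rewards = np.array([r1, -r1], dtype=np.float32)
        return self._obs(idx), rewards, self.t >= self.repeated_steps
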
# === Script 3: MAVPG on the coin game ===
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='MAVPG coin game')
    parser.add_argument('--n-epochs', type=int, default=151, metavar='N',
                        help='number of epochs to train (default: 151)')
    parser.add_argument('--n-eps', type=int, default=10, metavar='N',
                        help='number of episodes in an epoch (default: 10)')
    parser.add_argument('--max-steps', type=int, default=500, metavar='N',
                        help='maximum number of steps for which the episode lasts (default: 500)')
    parser.add_argument('--interval', type=int, default=50, metavar='N',
                        help='interval of epochs to plot stats for last N steps and save model (default: N = 50)')
    parser.add_argument('--lamda1', type=float, default=0.5, metavar='LAM',
                        help='weight on performance of agent 1 (default: 0.5)')
    parser.add_argument('--lamda2', type=float, default=0.5, metavar='LAM',
                        help='weight on performance of agent 2 (default: 0.5)')
    parser.add_argument('--state-dim', type=int, default=4, metavar='DIM',
                        help='dimension of the square matrix in the state input (default: 4)')
    parser.add_argument('--action-dim', type=int, default=5, metavar='DIM',
                        help='number of actions (default: 5 - UP, DOWN, LEFT, RIGHT, NOOP)')
    parser.add_argument('--num-players', type=int, default=2, metavar='N',
                        help='number of players in the game - one color is given to each player (default: 2)')
    parser.add_argument('--num-coins', type=int, default=1, metavar='N',
                        help='number of coins in the game (default: 1)')
    parser.add_argument('--lr-p', type=float, default=0.5, metavar='LR',
                        help='mavpg learning rate for actors (default: 0.5)')
    parser.add_argument('--lr-q1', type=float, default=0.01, metavar='LR',
                        help='critic 1 learning rate (default: 0.01)')
    parser.add_argument('--lr-q2', type=float, default=0.01, metavar='LR',
                        help='critic 2 learning rate (default: 0.01)')
    parser.add_argument('--beta', type=float, default=0.99, metavar='MOM',
                        help='momentum (default: 0.99)')
    parser.add_argument('--eps', type=float, default=1e-8, metavar='EPS',
                        help='epsilon (default: 1e-8)')
    parser.add_argument('--run-num', type=int, default=0, metavar='NUM',
                        help='index of experiment run (default: 0)')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='save model parameters or not (default: False)')
    parser.add_argument('--rms', action='store_true', default=False,
                        help='use mavpg with rms or not (default: False)')
    parser.add_argument('--cuda', action='store_true', default=False,
                        help='use cuda or not (default: False)')
    parser.add_argument('--tensorboard', action='store_true', default=False,
                        help='use tensorboard or not (default: False)')
    parser.add_argument('--activation-function', type=str, default='relu',
                        help='which activation function to use (relu or tanh, default: relu)')
    parser.add_argument('--policy', type=str, default='mlp',
                        help='which type of policy to use (lstm or mlp, default: mlp)')
    parser.add_argument('--hidden-dim', type=int, default=32, metavar='DIM',
                        help='number of features in the hidden state of the LSTM (default: 32)')
    parser.add_argument('--conv', action='store_true', default=False,
                        help='use convolutions or not (default: False)')
    parser.add_argument('--gamma', type=float, default=0.99, metavar='GAMMA',
                        help='discount factor (default: 0.99)')
    parser.add_argument('--tau', type=float, default=0.95, metavar='TAU',
                        help='GAE factor (default: 0.95)')
    parser.add_argument('--logs', action='store_true', default=False,
                        help='write data to logs or not (default: False)')
    parser.add_argument('--lola', action='store_true', default=False,
                        help='whether to use LOLA or not (default: False)')
    args = parser.parse_args()

    use_cuda = args.cuda and torch.cuda.is_available()
    if args.cuda and not torch.cuda.is_available():
        raise Exception('torch.cuda is not available!')
    device = torch.device("cuda" if use_cuda else "cpu")

    env_location = '../tensorboard/coin_game'
    experiment_name = '/mavpg/run_' + str(args.run_num)
    job_path = r'../coin_game/job_coin1.yaml'
    if os.path.exists(env_location + experiment_name):
        raise Exception('Run index {} already exists!'.format(args.run_num))
    os.makedirs(env_location + experiment_name)

    info_str = env_location + experiment_name + '/info_mavpg_run_' + str(args.run_num) + '.txt'
    with open(info_str, 'a') as info_ptr:
        info_ptr.write('This is run number {} of MAVPG (LOLA1). The hyperparameters are:\n\n'.format(args.run_num))
        info_ptr.write('Running torch version {}\n\n'.format(torch.__version__))
        for arg in vars(args):
            info_ptr.write(str(arg) + ' ' + str(getattr(args, arg)) + '\n')
        with open(job_path) as f_y:
            d_y = yaml.full_load(f_y)
            info_ptr.write('\nResources: {}'.format(d_y['spec']['containers'][0]['resources']))
        if use_cuda:
            current_device = torch.cuda.current_device()
            info_ptr.write('\ntorch.cuda.device(current_device): {}'.format(torch.cuda.device(current_device)))
            info_ptr.write('\ntorch.cuda.device_count(): {}'.format(torch.cuda.device_count()))
            info_ptr.write('\ntorch.cuda.get_device_name(current_device): {}\n\n'.format(
                torch.cuda.get_device_name(current_device)))

    # fixed: the original queried torch.cuda.current_device() unconditionally,
    # which fails on CPU-only machines
    current_device = torch.cuda.current_device() if use_cuda else None

    model_mavpg = env_location + experiment_name + '/model'
    data_mavpg = env_location + experiment_name + '/data'
    if not os.path.exists(model_mavpg):
        os.makedirs(model_mavpg)
    if not os.path.exists(data_mavpg):
        os.makedirs(data_mavpg)
    FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor

    if args.num_players > 2:
        env = coin_game_Np(args.max_steps, args.state_dim, args.action_dim, args.num_players, args.num_coins)
    else:
        env = coin_game_2p(args.max_steps, args.state_dim)

    # One grid channel per player position and per coin color
    n_channels = args.num_coins * args.num_players + args.num_players
    if args.policy == 'mlp':
        p1 = MLP_policy(args.state_dim, n_channels, args.action_dim, args.conv, args.activation_function).to(device)
        p2 = MLP_policy(args.state_dim, n_channels, args.action_dim, args.conv, args.activation_function).to(device)
    elif args.policy == 'lstm':
        p1 = LSTM_policy(args.state_dim, n_channels, args.action_dim, args.hidden_dim,
                         args.conv, args.activation_function).to(device)
        p2 = LSTM_policy(args.state_dim, n_channels, args.action_dim, args.hidden_dim,
                         args.conv, args.activation_function).to(device)
    else:
        raise Exception('Policy type not recognized')
    q1 = MLP_value(args.state_dim, n_channels, args.conv, args.activation_function).to(device)
    q2 = MLP_value(args.state_dim, n_channels, args.conv, args.activation_function).to(device)
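    # Editorial note (hedged): with --lola the script swaps in LOLA1, which by
    # its name appears to implement a first-order LOLA-style
    # (learning-with-opponent-learning-awareness) update behind the same step
    # interface as CGD/RCGD; it is constructed in the optimizer selection a
    # few lines below.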
    with open(info_str, 'a') as info_ptr:
        info_ptr.write('Policy1 architecture:\n{}\n'.format(next(p1.named_modules())))
        info_ptr.write('Policy2 architecture:\n{}\n'.format(next(p2.named_modules())))
        info_ptr.write('Value1 architecture:\n{}\n'.format(next(q1.named_modules())))
        info_ptr.write('Value2 architecture:\n{}\n'.format(next(q2.named_modules())))

    if args.lola:
        policy_optim = LOLA1(p1.parameters(), p2.parameters(), lr=args.lr_p, device=device)
    else:
        if not args.rms:
            policy_optim = CGD(p1.parameters(), p2.parameters(), lr=args.lr_p, device=device)
        else:
            policy_optim = RCGD(p1.parameters(), p2.parameters(), lr=args.lr_p,
                                beta=args.beta, eps=args.eps, device=device)
    optim_q1 = torch.optim.Adam(q1.parameters(), lr=args.lr_q1)
    optim_q2 = torch.optim.Adam(q2.parameters(), lr=args.lr_q2)

    if args.logs:
        logs_file_path = env_location + experiment_name + '/coin_mavpg_run_' + str(args.run_num) + '.txt'
        f_ptr = open(logs_file_path, 'a')
        f_ptr.write('This is run number {} of MAVPG\n\n'.format(args.run_num))
    if args.tensorboard:
        writer = SummaryWriter(data_mavpg)

    n_pR_cR = n_pR_cB = n_pR_cR_interval = n_pR_cB_interval = 0
    n_pB_cR = n_pB_cB = n_pB_cR_interval = n_pB_cB_interval = 0
    n_draw = n_draw_interval = 0
    avg_rew_1 = []
    avg_rew_2 = []
    arr_pR_cR = []
    arr_pR_cB = []
    arr_pB_cR = []
    arr_pB_cB = []
    arr_draw = []
    total_t_p_opt = 0
    total_t_q_opt = 0
    total_t_play = 0
    avg_t_p_opt = 0
    avg_t_q_opt = 0
    avg_t_play = 0
    t_episodes = []
    num_episodes = 0

    def print_mem(tag):
        # Debug scaffolding: report current and peak CUDA memory (no-op on CPU).
        # fixed: the original repeated these prints verbatim at twenty numbered
        # checkpoints and called torch.cuda unconditionally
        if not use_cuda:
            return
        print('{} Allocated: {} GB\n'.format(
            tag, round(torch.cuda.memory_allocated(current_device) / 1024**3, 1)))
        print('{} Max memory allocated: {} GB\n'.format(
            tag, round(torch.cuda.max_memory_allocated(current_device) / 1024**3, 1)))

    start = time.time()
    print_mem('1')
    for epoch in range(args.n_epochs):
        state_1_2 = []
        if args.policy == 'lstm':
            hidden_state_1 = []
            cell_state_1 = []
            hidden_state_2 = []
            cell_state_2 = []
        action_1_2 = []
        reward_1 = []
        reward_2 = []
        state, rewards, done, info = env.reset()
        time_in_epoch = 0
        avg_time_in_epoch = 0
        n_rr_e = n_rb_e = n_br_e = n_bb_e = n_d_e = 0

        start_play = time.time()
        for eps in range(args.n_eps):
            time_in_episode = 0
            num_episodes += 1
            state, rewards, done, info = env.reset()
            if args.policy == 'lstm':
                hx_1 = torch.zeros(1, args.hidden_dim).to(device)
                cx_1 = torch.zeros(1, args.hidden_dim).to(device)
                hx_2 = torch.zeros(1, args.hidden_dim).to(device)
                cx_2 = torch.zeros(1, args.hidden_dim).to(device)
            while not done:
                time_in_episode += 1
                if args.policy == 'lstm':
                    hidden_state_1.append(hx_1)
                    cell_state_1.append(cx_1)
                    hidden_state_2.append(hx_2)
                    cell_state_2.append(cx_2)
                    pi1_a_s, hx_1, cx_1 = p1((FloatTensor(state).unsqueeze(0)).to(device), hx_1, cx_1)
                    # fixed: the original passed (hx_2, hx_2) instead of (hx_2, cx_2)
                    pi2_a_s, hx_2, cx_2 = p2((FloatTensor(state).unsqueeze(0)).to(device), hx_2, cx_2)
                else:
                    pi1_a_s = p1((FloatTensor(state).unsqueeze(0)).to(device))
                    pi2_a_s = p2((FloatTensor(state).unsqueeze(0)).to(device))
                dist1 = Categorical(pi1_a_s)
                action1 = dist1.sample()
                dist2 = Categorical(pi2_a_s)
                action2 = dist2.sample()
                action = np.array([action1.cpu(), action2.cpu()])

                state_1_2.append(FloatTensor(state))
                action_1_2.append(FloatTensor(action))
                state, rewards, done, info = env.step(action)
                reward_1.append(torch.FloatTensor([rewards[0]]))
                reward_2.append(torch.FloatTensor([rewards[1]]))
                if done:
                    break

            t_episodes.append(time_in_episode)
            s = 'Epoch number: {}, timesteps in episode {}: {}, Done reached\n'.format(epoch, eps, time_in_episode)
            if args.logs:
                f_ptr.write(s)
            if info[0] == 'RED':
                s = '{} player (player1) got {} coin in episode {} of epoch {}\n'.format(info[0], info[1], eps, epoch)
                if args.logs:
                    f_ptr.write(s)
                if info[1] == 'RED':
                    n_pR_cR += 1
                    n_pR_cR_interval += 1
                    n_rr_e += 1
                elif info[1] == 'BLUE':
                    n_pR_cB += 1
                    n_pR_cB_interval += 1
                    n_rb_e += 1
            elif info[0] == 'BLUE':
                s = '{} player (player2) got {} coin in episode {} of epoch {}\n'.format(info[0], info[1], eps, epoch)
                if args.logs:
                    f_ptr.write(s)
                if info[1] == 'RED':
                    n_pB_cR += 1
                    n_pB_cR_interval += 1
                    n_br_e += 1
                elif info[1] == 'BLUE':
                    n_pB_cB += 1
                    n_pB_cB_interval += 1
                    n_bb_e += 1
            elif info[0] == 'NONE':
                s = 'NONE of the players got the coin in episode {} of epoch {}\n'.format(eps, epoch)
                n_draw += 1
                n_draw_interval += 1
                n_d_e += 1
                if args.logs:
                    f_ptr.write(s)

            # Cumulative pick-up / draw rates over all episodes so far
            arr_pR_cR.append(n_pR_cR / num_episodes)
            arr_pR_cB.append(n_pR_cB / num_episodes)
            arr_pB_cR.append(n_pB_cR / num_episodes)
            arr_pB_cB.append(n_pB_cB / num_episodes)
            arr_draw.append(n_draw / num_episodes)

            time_in_epoch += time_in_episode
            s = 'Total timesteps in episode {} of epoch {}: {}\n\n'.format(eps, epoch, time_in_episode)
            if args.logs:
                f_ptr.write(s)
            if args.tensorboard:
                writer.add_scalar('Timesteps/Total timesteps in episode', time_in_episode,
                                  epoch * args.n_eps + eps)

        end_play = time.time()
        t_play = end_play - start_play
        total_t_play += t_play
        avg_t_play = total_t_play / (epoch + 1)
        avg_time_in_epoch = time_in_epoch / args.n_eps
        s = 'Total number of episodes in epoch {}: {}\n'.format(epoch, args.n_eps)
        if args.logs:
            f_ptr.write(s)
        s = 'Total timesteps in epoch {}: {}\n'.format(epoch, time_in_epoch)
        if args.logs:
            f_ptr.write(s)
        s = 'Average timesteps in epoch {}: {}\n'.format(epoch, avg_time_in_epoch)
        if args.logs:
            f_ptr.write(s)
        print_mem('2')

        val1 = q1(torch.stack(state_1_2).to(device))
        v1 = val1.detach().squeeze().cpu()
        v1 = torch.cat((v1, torch.FloatTensor([0])))  # append terminal value V = 0
        r1 = torch.cat(reward_1)  # fixed: was torch.tensor(<list of tensors>)
        print_mem('3')
        # advantage1 is detached and lives on `device`
        advantage1 = get_advantage(v1, r1, args.gamma, args.tau).to(device)
        print_mem('4')
        val2 = q2(torch.stack(state_1_2).to(device))
        v2 = val2.detach().squeeze().cpu()
        v2 = torch.cat((v2, torch.FloatTensor([0])))
        r2 = torch.cat(reward_2)
        print_mem('5')
        advantage2 = get_advantage(v2, r2, args.gamma, args.tau).to(device)
        print_mem('6')
        avg_rew_1.append(sum(reward_1) / len(reward_1))
        avg_rew_2.append(sum(reward_2) / len(reward_2))
        s = '\nAverage reward in this epoch for Agent1 (RED): {}\nAverage reward in this epoch for Agent2 (BLUE): {}\n'.format(
            sum(reward_1) / len(reward_1), sum(reward_2) / len(reward_2))
        if args.logs:
            f_ptr.write(s)
        s = '\nMean advantage for Agent1 (RED) (eta_new - eta_old): {}\nMean advantage for Agent2 (BLUE) -(eta_new - eta_old): {}\n'.format(
            advantage1.mean().cpu(), advantage2.mean().cpu())
        if args.logs:
            f_ptr.write(s)
        print_mem('7')

        # TD(0) critic losses with detached bootstrap targets (fixed, as in the
        # other scripts: the original differenced val[1:] - val[:-1], whose
        # shapes do not line up with r)
        q1_loss = (r1.to(device) + args.gamma * v1[1:].to(device) - val1.squeeze()).pow(2).mean()
        q2_loss = (r2.to(device) + args.gamma * v2[1:].to(device) - val2.squeeze()).pow(2).mean()
        optim_q1.zero_grad()
        optim_q2.zero_grad()
        print_mem('8')
        start_q = time.time()
        q1_loss.backward()
        optim_q1.step()
        q2_loss.backward()
        optim_q2.step()
        end_q = time.time()
        print_mem('9')
        t_q_opt = end_q - start_q
        total_t_q_opt += t_q_opt
        avg_t_q_opt = total_t_q_opt / (epoch + 1)

        action_both = torch.stack(action_1_2).to(device)
        print_mem('10')
        if args.policy == 'lstm':
            pi1_a_s, _, _ = p1(torch.stack(state_1_2).to(device),
                               torch.cat(hidden_state_1).to(device),
                               torch.cat(cell_state_1).to(device))
            pi2_a_s, _, _ = p2(torch.stack(state_1_2).to(device),
                               torch.cat(hidden_state_2).to(device),
                               torch.cat(cell_state_2).to(device))
        else:
            pi1_a_s = p1(torch.stack(state_1_2).to(device))
            pi2_a_s = p2(torch.stack(state_1_2).to(device))
        print_mem('11')
        dist1 = Categorical(pi1_a_s)
        log_prob1 = dist1.log_prob(action_both[:, 0])
        print_mem('12')
        dist2 = Categorical(pi2_a_s)
        log_prob2 = dist2.log_prob(action_both[:, 1])
        print_mem('13')
        # Running sums of log-probabilities (see the soccer script); equivalent
        # to torch.cumsum(log_prob, dim=0)[:-1]
        cum_log_prob1 = torch.zeros(log_prob1.shape[0] - 1).to(device)
        cum_log_prob2 = torch.zeros(log_prob2.shape[0] - 1).to(device)
        cum_log_prob1[0] = log_prob1[0]
        cum_log_prob2[0] = log_prob2[0]
        print_mem('14')
        for i in range(1, log_prob1.shape[0] - 1):
            cum_log_prob1[i] = cum_log_prob1[i - 1] + log_prob1[i]
            cum_log_prob2[i] = cum_log_prob2[i - 1] + log_prob2[i]
        print_mem('15')

        # Score-function surrogates; minimizing log_prob * (-advantage) is
        # gradient ascent on each agent's own return
        lp_x_1 = (log_prob1 * (-advantage1)).mean()
        lp_x_2 = (log_prob1 * (-advantage2)).mean()
        lp_y_1 = (log_prob2 * (-advantage1)).mean()
        lp_y_2 = (log_prob2 * (-advantage2)).mean()
        print_mem('16')

        mh1_1 = (log_prob1 * log_prob2 * (-advantage1)).mean()
        mh2_1 = (log_prob1[1:] * cum_log_prob2 * (-advantage1[1:])).mean()
        mh3_1 = (log_prob2[1:] * cum_log_prob1 * (-advantage1[1:])).mean()
        print_mem('17')
        mh1_2 = (log_prob1 * log_prob2 * (-advantage2)).mean()
        mh2_2 = (log_prob1[1:] * cum_log_prob2 * (-advantage2[1:])).mean()
        mh3_2 = (log_prob2[1:] * cum_log_prob1 * (-advantage2[1:])).mean()
        print_mem('18')
        mh_1 = mh1_1 + mh2_1 + mh3_1
        mh_2 = mh1_2 + mh2_2 + mh3_2
        print_mem('19')

        policy_optim.zero_grad()
        start_p = time.time()
        policy_optim.step(lp_x_1, lp_x_2, lp_y_1, lp_y_2, mh_1, mh_2)
        end_p = time.time()
        print_mem('20')
        t_p_opt = end_p - start_p
        total_t_p_opt += t_p_opt
        avg_t_p_opt = total_t_p_opt / (epoch + 1)
        grad_x, grad_x_y_mh, grad_y, grad_y_x_mh = policy_optim.getinfo()

        s = '\nRED player got {} RED coins in epoch {} | RED player got {} BLUE coins in epoch {}\nBLUE player got {} RED coins in epoch {} | BLUE player got {} BLUE coins in epoch {}\n'.format(
            n_rr_e, epoch, n_rb_e, epoch, n_br_e, epoch, n_bb_e, epoch)
        if args.logs:
            f_ptr.write(s)
        s = 'Number of draws in epoch {}: {}\n'.format(epoch, n_d_e)
        if args.logs:
            f_ptr.write(s)
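        # Editorial note (hedged): unlike the soccer script, which pre-combines
        # the surrogates with lamda1/lamda2 before calling step(lp_x, lp_y, mh),
        # this script hands the four policy-gradient surrogates and the two
        # interaction surrogates to the optimizer separately, so the weighting
        # of the two agents' objectives is presumably applied inside
        # CGD/RCGD/LOLA1.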
        if (epoch + 1) % args.interval == 0:
            s = '\nRED player got {} RED coins in last {} epochs | RED player got {} BLUE coins in last {} epochs\n'.format(
                n_pR_cR_interval, args.interval, n_pR_cB_interval, args.interval)
            if args.logs:
                f_ptr.write(s)
            s = 'BLUE player got {} RED coins in last {} epochs | BLUE player got {} BLUE coins in last {} epochs\n'.format(
                n_pB_cR_interval, args.interval, n_pB_cB_interval, args.interval)
            if args.logs:
                f_ptr.write(s)
            s = 'Number of draws in last {} epochs: {}\n'.format(args.interval, n_draw_interval)
            if args.logs:
                f_ptr.write(s)
            s = 'RED-player-RED-coin rate in last {} epochs: {} | RED-player-BLUE-coin rate in last {} epochs: {}\n'.format(
                args.interval, n_pR_cR_interval / (args.interval * args.n_eps),
                args.interval, n_pR_cB_interval / (args.interval * args.n_eps))
            if args.logs:
                f_ptr.write(s)
            s = 'BLUE-player-RED-coin rate in last {} epochs: {} | BLUE-player-BLUE-coin rate in last {} epochs: {}\n'.format(
                args.interval, n_pB_cR_interval / (args.interval * args.n_eps),
                args.interval, n_pB_cB_interval / (args.interval * args.n_eps))
            if args.logs:
                f_ptr.write(s)
            s = 'DRAW rate in last {} epochs: {}\n'.format(
                args.interval, n_draw_interval / (args.interval * args.n_eps))
            if args.logs:
                f_ptr.write(s)
            if args.tensorboard:
                writer.add_scalar('Coin take rate {} epochs/RED player_RED coin'.format(args.interval),
                                  n_pR_cR_interval / (args.interval * args.n_eps), epoch)
                writer.add_scalar('Coin take rate {} epochs/RED player_BLUE coin'.format(args.interval),
                                  n_pR_cB_interval / (args.interval * args.n_eps), epoch)
                writer.add_scalar('Coin take rate {} epochs/BLUE player_RED coin'.format(args.interval),
                                  n_pB_cR_interval / (args.interval * args.n_eps), epoch)
                writer.add_scalar('Coin take rate {} epochs/BLUE player_BLUE coin'.format(args.interval),
                                  n_pB_cB_interval / (args.interval * args.n_eps), epoch)
                writer.add_scalar('Coin take rate {} epochs/Draw rate'.format(args.interval),
                                  n_draw_interval / (args.interval * args.n_eps), epoch)
            n_pR_cR_interval = n_pR_cB_interval = 0
            n_pB_cR_interval = n_pB_cB_interval = 0
            n_draw_interval = 0

        tot_games = (epoch + 1) * args.n_eps
        s = '\nRED player got {} RED coins till now | RED player got {} BLUE coins till now\n'.format(
            n_pR_cR, n_pR_cB)
        if args.logs:
            f_ptr.write(s)
        s = 'BLUE player got {} RED coins till now | BLUE player got {} BLUE coins till now\n'.format(
            n_pB_cR, n_pB_cB)
        if args.logs:
            f_ptr.write(s)
        s = 'Number of draws till now: {}\n\n'.format(n_draw)
        if args.logs:
            f_ptr.write(s)
        s = 'RED-player-RED-coin rate till now: {} | RED-player-BLUE-coin rate till now: {}\n'.format(
            n_pR_cR / tot_games, n_pR_cB / tot_games)
        if args.logs:
            f_ptr.write(s)
        s = 'BLUE-player-RED-coin rate till now: {} | BLUE-player-BLUE-coin rate till now: {}\n'.format(
            n_pB_cR / tot_games, n_pB_cB / tot_games)
        if args.logs:
            f_ptr.write(s)
        s = 'DRAW rate till now: {}\n'.format(n_draw / tot_games)
        if args.logs:
            f_ptr.write(s)
        s = '\nTime for game play in this epoch: {} seconds\n'.format(t_play)
        if args.logs:
            f_ptr.write(s)
        s = '\nTime for policy optimization in this epoch: {} seconds\nTime for critic optimization in this epoch: {} seconds\n\n'.format(
            t_p_opt, t_q_opt)
        if args.logs:
            f_ptr.write(s + '#' * 114 + '\n\n')
        if args.tensorboard:
            writer.add_scalar('Timesteps/Total timesteps in this epoch', time_in_epoch, epoch)
            writer.add_scalar('Timesteps/Average timesteps in one episode in this epoch', avg_time_in_epoch, epoch)
            writer.add_scalar('Average reward in the epoch/Agent1', sum(reward_1) / len(reward_1), epoch)
            writer.add_scalar('Average reward in the epoch/Agent2', sum(reward_2) / len(reward_2), epoch)
            writer.add_scalar('Mean advantage in the epoch/Agent1', advantage1.mean().cpu(), epoch)
            writer.add_scalar('Mean advantage in the epoch/Agent2', advantage2.mean().cpu(), epoch)
            writer.add_scalar('Entropy/Agent1', dist1.entropy().mean().detach().cpu(), epoch)
            writer.add_scalar('Entropy/Agent2', dist2.entropy().mean().detach().cpu(), epoch)
            writer.add_scalar('Coin take rate/RED player_RED coin', n_pR_cR / tot_games, epoch)
            writer.add_scalar('Coin take rate/RED player_BLUE coin', n_pR_cB / tot_games, epoch)
            writer.add_scalar('Coin take rate/BLUE player_RED coin', n_pB_cR / tot_games, epoch)
            writer.add_scalar('Coin take rate/BLUE player_BLUE coin', n_pB_cB / tot_games, epoch)
            writer.add_scalar('Coin take rate/Draw rate', n_draw / tot_games, epoch)
            writer.add_scalar('Time/Avg time for policy optimization', avg_t_p_opt, epoch)
            writer.add_scalar('Time/Avg time for critic optimization', avg_t_q_opt, epoch)
            writer.add_scalar('Time/Avg time for game play', avg_t_play, epoch)
            writer.add_scalar('Time/Total time for policy optimization', t_p_opt, epoch)
            writer.add_scalar('Time/Total time for critic optimization', t_q_opt, epoch)
            writer.add_scalar('Time/Total time for game play', t_play, epoch)
            writer.add_scalar('Norm/grad_x(f)', grad_x, epoch)
            writer.add_scalar('Norm/grad_xy(f)_grad_y(g)', grad_x_y_mh, epoch)
            writer.add_scalar('Norm/grad_y(g)', grad_y, epoch)
            writer.add_scalar('Norm/grad_yx(g)_grad_x(f)', grad_y_x_mh, epoch)

        if args.save_model and epoch == args.n_epochs - 1:
            torch.save(p1.state_dict(), model_mavpg + '/policy_agent1_' + str(epoch) + '.pth')
            torch.save(p2.state_dict(), model_mavpg + '/policy_agent2_' + str(epoch) + '.pth')
            torch.save(q1.state_dict(), model_mavpg + '/value_agent1_' + str(epoch) + '.pth')
            torch.save(q2.state_dict(), model_mavpg + '/value_agent2_' + str(epoch) + '.pth')
        print('#' * 80)

    end = time.time()
    total_time = end - start
    if args.logs:
        s = ('Total time taken: {} seconds\nTotal time for game play only: {} seconds\n'
             'Total time for policy optimization steps only: {} seconds\n'
             'Total time for critic optimization steps only: {} seconds\n').format(
            total_time, total_t_play, total_t_p_opt, total_t_q_opt)
        f_ptr.write(s)
        s = ('Average time for policy optimization steps only: {} seconds\n'
             'Average time for game play only: {} seconds\n'
             'Average time for critic optimization steps only: {} seconds\n\n').format(
            avg_t_p_opt, avg_t_play, avg_t_q_opt)
        f_ptr.write(s)

    np_file = env_location + experiment_name + '/coin_mavpg_run_' + \
        str(args.run_num) + '_lr=' + str(args.lr_p) + '_stuff.npz'
    with open(np_file, 'wb') as np_f:
        np.savez(np_f,
                 avg_rew_1=np.array(avg_rew_1), avg_rew_2=np.array(avg_rew_2),
                 n_pR_cR=np.array(n_pR_cR), n_pR_cB=np.array(n_pR_cB),
                 n_pB_cR=np.array(n_pB_cR), n_pB_cB=np.array(n_pB_cB),
                 n_draw=np.array(n_draw),
                 arr_pR_cR=np.array(arr_pR_cR), arr_pR_cB=np.array(arr_pR_cB),
                 arr_pB_cR=np.array(arr_pB_cR), arr_pB_cB=np.array(arr_pB_cB),
                 arr_draw=np.array(arr_draw),
                 total_time=np.array(total_time), total_play_time=np.array(total_t_play),
                 total_time_p_opt=np.array(total_t_p_opt), total_time_q_opt=np.array(total_t_q_opt),
                 avg_time_p_opt=np.array(avg_t_p_opt), avg_play_time=np.array(avg_t_play),
                 avg_time_q_opt=np.array(avg_t_q_opt), time_in_episodes=np.array(t_episodes))
    fig = plt.figure(figsize=(15, 15))
    ax = plt.subplot()
    ax.clear()
    fig.suptitle('Coin picking rates for RED and BLUE agents', fontsize=25)
    plt.xlabel(r'$Number\ of\ episodes\ (Num\ epochs\ =\ Total\ num\ eps/num\ eps\ per\ epoch)$', fontsize=20)
    plt.ylabel(r'$Coin\ picking\ rate$', fontsize=20)
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)
    ax.plot(np.array(arr_pR_cR), label='MAVPG, RED player picks RED coin, lr = {}'.format(args.lr_p))
    ax.plot(np.array(arr_pR_cB), label='MAVPG, RED player picks BLUE coin, lr = {}'.format(args.lr_p))
    ax.plot(np.array(arr_pB_cR), label='MAVPG, BLUE player picks RED coin, lr = {}'.format(args.lr_p))
    ax.plot(np.array(arr_pB_cB), label='MAVPG, BLUE player picks BLUE coin, lr = {}'.format(args.lr_p))
    ax.plot(np.array(arr_draw), label='MAVPG, DRAW, lr = {}'.format(args.lr_p))
    plt.legend(loc='upper right')
    plt.grid()
    plt.savefig(env_location + experiment_name + '/coin_mavpg_run_' +
                str(args.run_num) + '_lr=' + str(args.lr_p) + '_pick_rate.png')

    fig = plt.figure(figsize=(15, 15))
    ax = plt.subplot()
    ax.clear()
    fig.suptitle('Avg reward per epoch for Agent1 (RED) & Agent2 (BLUE) v/s #epochs/iterations', fontsize=25)
    plt.xlabel(r'$Epochs/Iterations$', fontsize=20)
    plt.ylabel(r'$Average\ reward\ per\ epoch$', fontsize=20)
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)
    ax.plot(np.array(avg_rew_1), label='MAVPG, Avg rew per epoch Ag1 (RED), lr = {}'.format(args.lr_p))
    ax.plot(np.array(avg_rew_2), label='MAVPG, Avg rew per epoch Ag2 (BLUE), lr = {}'.format(args.lr_p))
    plt.legend(loc='upper right')
    plt.grid()
    plt.savefig(env_location + experiment_name + '/coin_mavpg_run_' +
                str(args.run_num) + '_lr=' + str(args.lr_p) + '_avg_rew_per_epoch.png')
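
# Hedged sketch (editorial addition): the coupled optimizers above need mixed
# second-order terms such as D^2_xy(mh) applied to a vector, which PyTorch can
# compute by differentiating twice. A minimal, self-contained example of such
# a mixed Hessian-vector product:
def mixed_hvp_example():
    x = torch.tensor([1.0, 2.0], requires_grad=True)
    y = torch.tensor([3.0], requires_grad=True)
    f = x.sum() * y.sum()  # toy bilinear objective f(x, y)
    # First derivative w.r.t. x, keeping the graph for a second differentiation
    gx, = torch.autograd.grad(f, x, create_graph=True)
    v = torch.ones_like(x)
    # (d/dy)(gx . v): the mixed Hessian D^2_yx f applied to v
    hvp, = torch.autograd.grad(gx, y, grad_outputs=v)
    return hvp  # tensor([2.]) for this toy f
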
# === Excerpt: logging / GAE fragment from a separate (CoPG-style) training loop ===
for l, p in enumerate(p2.parameters()):
    if l == 0:
        writer.add_scalar('Controller/Agent2', p.data, t_eps)
    else:
        writer.add_scalar('controller_std/Agent2', p.data, t_eps)
# writer.add_scalar('Action_prob/agent1', action1_prob[3], t_eps)
# writer.add_scalar('Action_prob/agent2', action2_prob[3], t_eps)

val1 = q(torch.stack(mat_state1))
val1 = val1.detach()
next_value = 0  # because currently we end only when done, which is equivalent to having no next state
returns_np1 = get_advantage(next_value, torch.stack(mat_reward1), val1, torch.stack(mat_done),
                            gamma=0.99, tau=0.95)
returns1 = torch.cat(returns_np1)
advantage_mat1 = returns1 - val1.transpose(0, 1)

val2 = q(torch.stack(mat_state2))
val2 = val2.detach()
next_value = 0
returns_np2 = get_advantage(next_value, torch.stack(mat_reward2), val2, torch.stack(mat_done),
                            gamma=0.99, tau=0.95)
returns2 = torch.cat(returns_np2)
advantage_mat2 = returns2 - val2.transpose(0, 1)

writer.add_scalar('Advantage/agent1', advantage_mat1.mean(), t_eps)
writer.add_scalar('Advantage/agent2', advantage_mat2.mean(), t_eps)
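
# Hedged sketch (editorial addition): the get_advantage variant called in the
# excerpt above has a different signature from the earlier scripts; judging by
# the call, it returns per-step GAE-based returns, using the done mask to stop
# bootstrapping at episode boundaries. A consistent implementation might look
# like this (the repo's actual helper may differ):
def get_advantage_masked_sketch(next_value, reward_mat, value_mat, masks, gamma=0.99, tau=0.95):
    values = torch.cat((value_mat, torch.FloatTensor([[next_value]])))
    gae = 0.0
    returns = []
    for t in reversed(range(len(reward_mat))):
        delta = reward_mat[t] + gamma * values[t + 1] * masks[t] - values[t]
        gae = delta + gamma * tau * gae * masks[t]
        returns.insert(0, gae + values[t])
    return returns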