Example #1
def main():
	# training settings
	parser = argparse.ArgumentParser(description = 'MAVPG markov soccer')
	parser.add_argument('--n-epochs', type = int, default = 30001, metavar = 'N',
						help = 'number of epochs to train (default: 30001)')
	parser.add_argument('--n-eps', type = int, default = 10, metavar = 'N',
						help = 'number of episodes in an epoch (default: 10)')
	parser.add_argument('--lamda1', type = float, default = 0.5, metavar = 'LAM',
						help = 'weight on performance of agent 1 (default: 0.5)')
	parser.add_argument('--lamda2', type = float, default = 0.5, metavar = 'LAM',
						help = 'weight on performance of agent 2 (default: 0.5)')
	parser.add_argument('--lr-p', type = float, default = 0.01, metavar = 'LR',
						help = 'mavpg learning rate for actors (default: 0.01)')
	parser.add_argument('--lr-q1', type = float, default = 0.01, metavar = 'LR',
						help = 'critic 1 learning rate (default: 0.01)')
	parser.add_argument('--lr-q2', type = float, default = 0.01, metavar = 'LR',
						help = 'critic 2 learning rate (default: 0.01)')
	parser.add_argument('--gamma', type = float, default = 0.99, metavar = 'GAMMA',
						help = 'discount factor (default: 0.99)')
	parser.add_argument('--tau', type = float, default = 0.95, metavar = 'TAU',
						help = 'GAE factor (default: 0.95)')
	parser.add_argument('--obs-dim', type = int, default = 12, metavar = 'DIM',
						help = 'dimension of observation space of each agent (default: 12)')
	parser.add_argument('--state-dim', type = int, default = 12, metavar = 'DIM',
						help = 'dimension of state space (default: 12)')
	parser.add_argument('--beta', type = float, default = 0.99, metavar = 'MOM',
						help = 'momentum (default: 0.99)')
	parser.add_argument('--eps', type = float, default = 1e-8, metavar = 'EPS',
						help = 'epsilon (default: 1e-8)')
	parser.add_argument('--run-num', type = int, default = 0, metavar = 'NUM',
						help = 'index of experiment run (default: 0)')
	parser.add_argument('--save-model', action = 'store_true', default = False,
						help = 'save model parameters or not (default: False)')
	parser.add_argument('--rms', action = 'store_true', default = False,
						help = 'use mavpg with rms or not (default: False)')
	parser.add_argument('--cuda', action = 'store_true', default = False,
						help = 'use cuda or not (default: False)')

	args = parser.parse_args()

	"""###################### Hyperparameters ########################
	n_epochs, n_eps, lamda1, lamda2, lr_p, lr_q1, lr_q2, gamma, tau, 
	obs_dim, state_dim, cuda, save_model, beta, eps, run_num, rms
	###############################################################"""

	use_cuda = args.cuda and torch.cuda.is_available()
	print(use_cuda, args.cuda, torch.cuda.is_available())
	device = torch.device("cuda" if use_cuda else "cpu")

	env_location = '../tensorboard/markov_soccer'
	experiment_name = '/mavpg_lr=' + str(args.lr_p)
	model_mavpg = env_location + experiment_name + '/model'
	data_mavpg = env_location + experiment_name + '/data'

	if not os.path.exists(model_mavpg):
		os.makedirs(model_mavpg)
	if not os.path.exists(data_mavpg):
		os.makedirs(data_mavpg)

	writer = SummaryWriter(data_mavpg)

	game = 'markov_soccer'
	env = rl_environment.Environment(game)
	action_dim = env.action_spec()['num_actions']

	p1 = policy(args.obs_dim, action_dim).to(device)
	p2 = policy(args.obs_dim, action_dim).to(device)
	q1 = critic(args.state_dim).to(device)
	q2 = critic(args.state_dim).to(device)

	if not args.rms:
		policy_optim = CGD(p1.parameters(), p2.parameters(), lr = args.lr_p, device = device)
	else:
		policy_optim = RCGD(p1.parameters(), p2.parameters(), lr = args.lr_p, beta = args.beta, eps = args.eps, device = device)
	optim_q1 = torch.optim.Adam(q1.parameters(), lr = args.lr_q1)
	optim_q2 = torch.optim.Adam(q2.parameters(), lr = args.lr_q2)

	n_wins_a = n_wins_a_500 = 0
	n_wins_b = n_wins_b_500 = 0

	type_obs = 'full' if args.state_dim == args.obs_dim else 'partial'
	
	# file to keep all training logs
	# log_file_path = ../tensorboard/markov_soccer/mavpg_lr=0.01/ms_full_mavpg_lr=0.01_run_num=0.txt
	logs_file_path = env_location+experiment_name+'/ms_'+type_obs+'_mavpg_lr='+str(args.lr_p)+'_run_num='+str(args.run_num)+'.txt'
	f_ptr = open(logs_file_path, 'a')

	# file to track the time in episodes
	# time_in_episode_file_path = ../tensorboard/markov_soccer/mavpg_lr=0.01/time_in_episode_full_mavpg_lr=0.01_run_num=0.txt
	
	#file_path = '../tensorboard/markov_soccer/copg_lr0_01/avg_time_in_epoch_run0.txt'
	#f_t2 = open(file_path, 'a')

	avg_t_opt = 0
	avg_t_p_opt = 0
	avg_t_q_opt = 0
	t_episode = []

	start = time.time()
	for epoch in range(args.n_epochs):
		state_a = []
		state_b = []
		action_a_b = []
		reward_a = []
		reward_b = []

		timestep = env.reset()
		a_status, b_status, s_a, s_b = get_two_state(timestep)

		time_in_epoch = 0
		avg_time_in_epoch = 0
		#avg_time_in_eps_per_epoch = 0
		for eps in range(args.n_eps):
			time_in_episode = 0
			#print(timestep.last())
			timestep = env.reset()
			a_status, b_status, s_a, s_b = get_two_state(timestep)
			while timestep.last() == False:

				#print('HI')
				time_in_episode += 1
				# pi1 == softmax output
				pi1 = p1(torch.FloatTensor(s_a).to(device))
				dist1 = Categorical(pi1)
				action1 = dist1.sample().cpu()

				# pi2 == softmax output
				pi2 = p2(torch.FloatTensor(s_b).to(device))
				dist2 = Categorical(pi2)
				action2 = dist2.sample().cpu()

				action = np.array([action1, action2])

				state_a.append(torch.FloatTensor(s_a))
				state_b.append(torch.FloatTensor(s_b))
				action_a_b.append(torch.FloatTensor(action))
				
				timestep = env.step(action)
				a_status, b_status, s_a, s_b = get_two_state(timestep)

				rew_a = timestep.rewards[0]
				rew_b = timestep.rewards[1]

				reward_a.append(torch.FloatTensor([rew_a]))
				reward_b.append(torch.FloatTensor([rew_b]))
				
				"""if timestep.last() == True:
					done == True
				else:
					done == False"""

				# 0 if either A or B wins (and hence game ends), 1 if game lasts more than 1000 steps (game ended in draw)
				#done_eps.append(torch.FloatTensor([1-done]))

				if timestep.last():
					writer.add_scalar('reward_zero_sum for agent1', rew_a, epoch)
					#timestep = env.reset()
					#a_status, b_status, s_a, s_b = get_two_state(timestep)
					break

			t_episode.append(time_in_episode)
			s = 'epoch number: {}, time in episode {}: {}, Done reached'.format(epoch, eps, time_in_episode)
			#print(s)
			f_ptr.write(s + '\n')
			if rew_a > 0:
				s = 'A won episode {} of epoch {}'.format(eps, epoch)
				# print(s)
				f_ptr.write(s + '\n')
				n_wins_a += 1
				n_wins_a_500 += 1
			if rew_b > 0:
				s = 'B won episode {} of epoch {}'.format(eps, epoch)
				# print(s)
				f_ptr.write(s + '\n')
				n_wins_b += 1
				n_wins_b_500 += 1

			time_in_epoch += time_in_episode
			s = 'Total time in episode {} of epoch {}: {}'.format(eps, epoch, time_in_episode)
			#print(s)
			f_ptr.write(s + '\n\n')
			writer.add_scalar('Total time in this episode', time_in_episode, epoch * args.n_eps + eps)
			#avg_time_in_epoch = time_in_episode/n_eps #(avg_time_in_eps_per_epoch * eps + time_in_episode)/(eps + 1)
			#writer.add_scalar('Average time in this epoch', avg_time_in_epoch, n_epochs)

		avg_time_in_epoch = time_in_epoch/args.n_eps
		s = 'Total time in epoch {}: {}'.format(epoch, time_in_epoch)
		#print(s)
		f_ptr.write(s + '\n')
		s = 'Average time in epoch {}: {}'.format(epoch, avg_time_in_epoch)
		#print(s)
		f_ptr.write(s + '\n\n')
		writer.add_scalar('Total time in this epoch', time_in_epoch, epoch)
		writer.add_scalar('Average time in this epoch', avg_time_in_epoch, epoch)

		start_opt = time.time()

		val1 = q1(torch.stack(state_a).to(device))
		v1 = val1.detach().squeeze()
		v1 = torch.cat((v1, torch.FloatTensor([0]).to(device)))
		r1 = torch.tensor(reward_a)

		# advantage1 is on gpu and detached
		advantage1 = get_advantage(v1.cpu(), r1, args.gamma, args.tau).to(device)

		val2 = q2(torch.stack(state_b).to(device))
		v2 = val2.detach().squeeze()
		v2 = torch.cat((v2, torch.FloatTensor([0]).to(device)))
		r2 = torch.tensor(reward_b)
		
		advantage2 = get_advantage(v2.cpu(), r2, args.gamma, args.tau).to(device)

		q1_loss = (r1.to(device) + args.gamma * val1[1:] - val1[:-1]).pow(2).mean()
		optim_q1.zero_grad()
		q1_loss.backward()
		optim_q1.step()

		q2_loss = (r2.to(device) + args.gamma * val2[1:] - val2[:-1]).pow(2).mean()
		optim_q2.zero_grad()
		q2_loss.backward()
		optim_q2.step()

		end_q = time.time()
		t_q_opt = end_q - start_opt
		avg_t_q_opt = (avg_t_q_opt * epoch + t_q_opt)/(epoch + 1)

		s = 'Mean advantage for agent 1 (eta_new - eta_old): {}\nMean advantage for agent 2 -(eta_new - eta_old): {}'.format(advantage1.mean().cpu(), advantage2.mean().cpu())
		# print(s)
		f_ptr.write(s + '\n')
		writer.add_scalar('Mean advantage for agent1', advantage1.mean().cpu(), epoch)
		writer.add_scalar('Mean advantage for agent2', advantage2.mean().cpu(), epoch)

		action_both = torch.stack(action_a_b)

		pi1_a_s = p1(torch.stack(state_a).to(device))
		dist1 = Categorical(pi1_a_s)
		log_prob1 = dist1.log_prob(action_both[:,0])

		pi2_a_s = p2(torch.stack(state_b).to(device))
		dist2 = Categorical(pi2_a_s)
		log_prob2 = dist2.log_prob(action_both[:,1])

		cum_log_prob1 = torch.zeros(log_prob1.shape[0]-1).to(device)
		cum_log_prob2 = torch.zeros(log_prob2.shape[0]-1).to(device)
		cum_log_prob1[0] = log_prob1[0]
		cum_log_prob2[0] = log_prob2[0]
		
		for i in range(1, log_prob1.shape[0]-1):
			cum_log_prob1[i] = cum_log_prob1[i-1] + log_prob1[i]
			cum_log_prob2[i] = cum_log_prob2[i-1] + log_prob2[i]

		lp_x_1 = (log_prob1 * advantage1).mean()
		lp_x_2 = (log_prob1 * advantage2).mean()
		lp_y_1 = (log_prob2 * advantage1).mean()
		lp_y_2 = (log_prob2 * advantage2).mean()

		lp_x = args.lamda1 * lp_x_1 - args.lamda2 * lp_x_2
		lp_y = args.lamda1 * lp_y_1 - args.lamda2 * lp_y_2

		mh1_1 = (log_prob1 * log_prob2 * advantage1).mean()
		mh2_1 = log_prob1[1:] * cum_log_prob2 * advantage1[1:]
		mh2_1 = mh2_1.sum()/(mh2_1.size(0)-args.n_eps+1)
		mh3_1 = log_prob2[1:] * cum_log_prob1 * advantage1[1:]
		mh3_1 = mh3_1.sum()/(mh3_1.size(0)-args.n_eps+1)

		mh1_2 = (log_prob1 * log_prob2 * advantage2).mean()
		mh2_2 = log_prob1[1:] * cum_log_prob2 * advantage2[1:]
		mh2_2 = mh2_2.sum()/(mh2_2.size(0)-args.n_eps+1)
		mh3_2 = log_prob2[1:] * cum_log_prob1 * advantage2[1:]
		mh3_2 = mh3_2.sum()/(mh3_2.size(0)-args.n_eps+1)
			
		mh_1 = mh1_1 + mh2_1 + mh3_1		
		mh_2 = mh1_2 + mh2_2 + mh3_2

		mh = args.lamda1 * mh_1 - args.lamda2 * mh_2

		policy_optim.zero_grad()
		policy_optim.step(lp_x, lp_y, mh)
		#policy_optim.step(log_prob1, log_prob2, cum_log_prob1, cum_log_prob2, advantage1, advantage2, lamda1, lamda2, n_eps)

		end_opt = time.time()
		t_p_opt = end_opt - end_q
		avg_t_p_opt = (avg_t_p_opt * epoch + t_p_opt)/(epoch + 1)

		t_opt = end_opt - start_opt
		avg_t_opt = (avg_t_opt * epoch + t_opt)/(epoch + 1)

		if (epoch + 1) % 500 == 0:
			s = '\nA won {} of games in last 500 epochs | B won {} of games in last 500 epochs'.format(n_wins_a_500/500, n_wins_b_500/500)
			#print(s)
			f_ptr.write(s + '\n')
			writer.add_scalar('Win rate last 500 episodes for agent1', (n_wins_a_500)/500, epoch)
			writer.add_scalar('Win rate last 500 episodes for agent2', (n_wins_b_500)/500, epoch)
			n_wins_a_500 = 0
			n_wins_b_500 = 0

		tot_games = n_wins_a + n_wins_b
		s = '\nA won {} of games till now | B won {} of games till now'.format(n_wins_a/tot_games, n_wins_b/tot_games)
		#print(s)
		#print('\n')
		#print('##################################################################################################################')
		#print('\n')
		f_ptr.write(s+'\n'+'##################################################################################################################'+'\n\n')

		writer.add_scalar('Entropy for agent1', dist1.entropy().mean().detach(), epoch)
		writer.add_scalar('Entropy for agent2', dist2.entropy().mean().detach(), epoch)

		writer.add_scalar('Cum win rate for agent1', (n_wins_a/tot_games), epoch)
		writer.add_scalar('Cum win rate for agent2', (n_wins_b/tot_games), epoch)

		writer.add_scalar('Time/avg_t_opt', avg_t_opt, epoch)
		writer.add_scalar('Time/avg_t_p_opt', avg_t_p_opt, epoch)
		writer.add_scalar('Time/avg_t_q_opt', avg_t_q_opt, epoch)
		writer.add_scalar('Time/t_opt', t_opt, epoch)
		writer.add_scalar('Time/t_p_opt', t_p_opt, epoch)
		writer.add_scalar('Time/t_q_opt', t_q_opt, epoch)

		if args.save_model and epoch % 500 == 0:
			#print(epoch)
			torch.save(p1.state_dict(), model_mavpg + '/policy_agent1_' + str(epoch) + ".pth")
			torch.save(p2.state_dict(), model_mavpg + '/policy_agent2_' + str(epoch) + ".pth")
			torch.save(q1.state_dict(), model_mavpg + '/value_agent1_' + str(epoch) + ".pth")
			torch.save(q2.state_dict(), model_mavpg + '/value_agent2_' + str(epoch) + ".pth")

	end = time.time()
	total_time = end - start
	s = 'Total time taken: {} seconds \n\n'.format(total_time)
	f_ptr.write(s)
	np_file = env_location+experiment_name+'/ms_mavpg_lr='+str(args.lr_p)+'_n_wins_time.npz'
	with open(np_file, 'wb') as np_f:
		np.savez(np_f, n_wins_a = np.array(n_wins_a), n_wins_b = np.array(n_wins_b), time_in_episode = np.array(t_episode), total_time = np.array(total_time))
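
The training loop above calls a get_advantage helper with the signature get_advantage(values, rewards, gamma, tau); its definition is not part of this excerpt. The sketch below is one minimal way to implement it, assuming it computes Generalized Advantage Estimation (GAE, consistent with the --tau "GAE factor" argument) over the reward sequence and a value sequence that has one extra terminal entry (the caller appends a trailing 0); the project's actual helper may differ.

import torch

def get_advantage(values, rewards, gamma, tau):
    # Sketch under the assumptions above:
    #   values  - 1-D tensor of length T+1 (terminal value of 0 appended by the caller)
    #   rewards - 1-D tensor of length T
    # Returns a 1-D tensor of T advantages, treating the batch as a single trajectory.
    advantages = torch.zeros(rewards.shape[0])
    gae = 0.0
    for t in reversed(range(rewards.shape[0])):
        delta = rewards[t] + gamma * values[t + 1] - values[t]  # TD residual
        gae = delta + gamma * tau * gae  # discounted, tau-weighted sum of residuals
        advantages[t] = gae
    return advantages

With this signature, the call get_advantage(v1.cpu(), r1, args.gamma, args.tau) above returns one advantage per stored transition, which is then moved back to the device.
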
def main():
    # training settings
    parser = argparse.ArgumentParser(
        description='GDA iterated matching pennies')
    parser.add_argument('--n-epochs',
                        type=int,
                        default=151,
                        metavar='N',
                        help='number of epochs to train (default: 151)')
    parser.add_argument(
        '--repeated-steps',
        type=int,
        default=200,
        metavar='N',
        help=
        'number of repetitions of the matrix game, not known to the agents (default: 200)'
    )
    parser.add_argument('--lr-p1',
                        type=float,
                        default=0.01,
                        metavar='LR',
                        help='gda learning rate for actor 1 (default: 0.01)')
    parser.add_argument('--lr-p2',
                        type=float,
                        default=0.01,
                        metavar='LR',
                        help='gda learning rate for actor 2 (default: 0.01)')
    parser.add_argument('--lr-q1',
                        type=float,
                        default=0.01,
                        metavar='LR',
                        help='critic 1 learning rate (default: 0.01)')
    parser.add_argument('--lr-q2',
                        type=float,
                        default=0.01,
                        metavar='LR',
                        help='critic 2 learning rate (default: 0.01)')
    parser.add_argument('--beta',
                        type=float,
                        default=0.99,
                        metavar='MOM',
                        help='momentum (default: 0.99)')
    parser.add_argument('--eps',
                        type=float,
                        default=1e-8,
                        metavar='EPS',
                        help='epsilon (default: 1e-8)')
    parser.add_argument('--run-num',
                        type=int,
                        default=0,
                        metavar='NUM',
                        help='index of experiment run (default: 0)')
    parser.add_argument('--save-model',
                        action='store_true',
                        default=False,
                        help='save model parameters or not (default: False)')
    parser.add_argument('--rms',
                        action='store_true',
                        default=False,
                        help='use gda with rms or not (default: False)')
    parser.add_argument('--cuda',
                        action='store_true',
                        default=False,
                        help='use cuda or not (default: False)')
    parser.add_argument('--tensorboard',
                        action='store_true',
                        default=False,
                        help='use tensorboard or not (default: False)')
    parser.add_argument(
        '--activation-function',
        type=str,
        default='tan',
        help='which activation function to use (relu or tan, default: tan)')
    parser.add_argument(
        '--policy',
        type=str,
        default='mlp',
        help='which type of policy to use (lstm or mlp, default: mlp)')
    parser.add_argument('--gamma',
                        type=float,
                        default=0.99,
                        metavar='GAMMA',
                        help='discount factor (default: 0.99)')
    parser.add_argument('--tau',
                        type=float,
                        default=0.95,
                        metavar='TAU',
                        help='GAE factor (default: 0.95)')
    parser.add_argument('--logs',
                        action='store_true',
                        default=False,
                        help='write data to logs or not (default: False)')

    args = parser.parse_args()
    """###################### Hyperparameters ########################
	n_epochs, n_eps, lamda1, lamda2, lr_p, lr_q1, lr_q2, gamma, tau, 
	obs_dim, state_dim, cuda, save_model, beta, eps, run_num, rms
	###############################################################"""

    use_cuda = args.cuda and torch.cuda.is_available()
    print(use_cuda, args.cuda, torch.cuda.is_available())
    #if args.cuda and not torch.cuda.is_available():
    #	print('CUDA CONFLICT')
    #	raise 'error'
    device = torch.device("cuda" if use_cuda else "cpu")

    env_location = '../tensorboard/iterated_matching_pennies'
    experiment_name = '/gda_lr=' + str(args.lr_p1) + '_' + str(
        args.lr_p2) + '/run_' + str(
            args.run_num) + '_' + args.activation_function + '_' + args.policy
    model_gda = env_location + experiment_name + '/model'
    data_gda = env_location + experiment_name + '/data'

    if not os.path.exists(model_gda):
        os.makedirs(model_gda)
    if not os.path.exists(data_gda):
        os.makedirs(data_gda)


#	try:
#		env = iterated_matching_pennies(args.repeated_steps)
#	except:
#		env = iterated_matching_pennies(200)

    env = iterated_matching_pennies(args.repeated_steps)
    # 0: HH, 1: HT, 2: TH, 3: TT, 4: initial state (observation is the action taken in the previous iteration)
    obs_dim = 5
    # combined observation of both the agents: s1 = [o1, o2], s2 = [o2, o1]
    state_dim = 10
    # H: Heads, T: Tails
    action_dim = 2

    if args.policy == 'mlp':
        if args.activation_function == 'tan':
            p1 = MLP_policy_tan(obs_dim, action_dim).to(device)
            p2 = MLP_policy_tan(obs_dim, action_dim).to(device)
        else:
            p1 = MLP_policy_relu(obs_dim, action_dim).to(device)
            p2 = MLP_policy_relu(obs_dim, action_dim).to(device)
    else:
        if args.activation_function == 'tan':
            p1 = LSTM_policy_tan(obs_dim, action_dim).to(device)
            p2 = LSTM_policy_tan(obs_dim, action_dim).to(device)
        else:
            p1 = LSTM_policy_relu(obs_dim, action_dim).to(device)
            p2 = LSTM_policy_relu(obs_dim, action_dim).to(device)

    q1 = value_network(state_dim).to(device)
    q2 = value_network(state_dim).to(device)

    if not args.rms:
        optim_p1 = torch.optim.SGD(p1.parameters(), lr=args.lr_p1)
        optim_p2 = torch.optim.SGD(p2.parameters(), lr=args.lr_p2)
    else:
        optim_p1 = torch.optim.RMSprop(p1.parameters(),
                                       lr=args.lr_p1,
                                       momentum=args.beta,
                                       eps=args.eps)
        optim_p2 = torch.optim.RMSprop(p2.parameters(),
                                       lr=args.lr_p2,
                                       momentum=args.beta,
                                       eps=args.eps)

    optim_q1 = torch.optim.Adam(q1.parameters(), lr=args.lr_q1)
    optim_q2 = torch.optim.Adam(q2.parameters(), lr=args.lr_q2)

    if args.logs:
        logs_file_path = env_location + experiment_name + '/imp_gda_lr=' + str(
            args.lr_p1) + '_' + str(args.lr_p2) + '.txt'
        f_ptr = open(logs_file_path, 'a')

    if args.tensorboard:
        writer = SummaryWriter(data_gda)

    prob_h_1 = []
    prob_t_1 = []
    prob_h_2 = []
    prob_t_2 = []
    avg_rew_1 = []
    avg_rew_2 = []

    total_t_p_opt = 0
    total_t_q_opt = 0
    avg_t_p_opt = 0
    avg_t_q_opt = 0
    start = time.time()
    for epoch in range(args.n_epochs):
        state_1 = []
        state_2 = []
        action_1_2 = []
        reward_1 = []
        reward_2 = []
        observations, rewards, done = env.reset()

        while done == False:
            pi1_a_s = p1(torch.FloatTensor(observations[0]).to(device))
            dist1 = Categorical(pi1_a_s)
            action1 = dist1.sample().cpu()

            pi2_a_s = p2(torch.FloatTensor(observations[1]).to(device))
            dist2 = Categorical(pi2_a_s)
            action2 = dist2.sample().cpu()

            action = np.array([action1, action2])

            state_1.append(
                torch.FloatTensor(
                    np.array([observations[0],
                              observations[1]]).reshape(state_dim)))
            state_2.append(
                torch.FloatTensor(
                    np.array([observations[1],
                              observations[0]]).reshape(state_dim)))
            action_1_2.append(torch.FloatTensor(action))

            observations, rewards, done = env.step(action)
            reward_1.append(torch.FloatTensor([rewards[0]]))
            reward_2.append(torch.FloatTensor([rewards[1]]))

            if done == True:
                break

        val1 = q1(torch.stack(state_1).to(device))
        v1 = val1.detach().squeeze()
        v1 = torch.cat((v1, torch.FloatTensor([0]).to(device)))
        r1 = torch.tensor(reward_1)

        # advantage1 is on gpu and detached
        advantage1 = get_advantage(v1.cpu(), r1, args.gamma,
                                   args.tau).to(device)

        val2 = q2(torch.stack(state_2).to(device))
        v2 = val2.detach().squeeze()
        v2 = torch.cat((v2, torch.FloatTensor([0]).to(device)))
        r2 = torch.tensor(reward_2)

        advantage2 = get_advantage(v2.cpu(), r2, args.gamma,
                                   args.tau).to(device)

        avg_rew_1.append(sum(reward_1) / len(reward_1))
        avg_rew_2.append(sum(reward_2) / len(reward_2))

        if args.tensorboard:
            writer.add_scalar('Average reward in the epoch/Agent1',
                              sum(reward_1) / len(reward_1), epoch)
            writer.add_scalar('Average reward in the epoch/Agent2',
                              sum(reward_2) / len(reward_2), epoch)
            writer.add_scalar('Mean advantage in the epoch/Agent1',
                              advantage1.mean().cpu(), epoch)
            writer.add_scalar('Mean advantage in the epoch/Agent2',
                              advantage2.mean().cpu(), epoch)

        q1_loss = (r1.to(device) + args.gamma * val1[1:] -
                   val1[:-1]).pow(2).mean()
        q2_loss = (r2.to(device) + args.gamma * val2[1:] -
                   val2[:-1]).pow(2).mean()
        optim_q1.zero_grad()
        optim_q2.zero_grad()

        start_q = time.time()
        q1_loss.backward()
        optim_q1.step()
        q2_loss.backward()
        optim_q2.step()
        end_q = time.time()

        t_q_opt = end_q - start_q
        total_t_q_opt += t_q_opt
        avg_t_q_opt = (avg_t_q_opt * epoch + t_q_opt) / (epoch + 1)

        action_both = torch.stack(action_1_2)

        # action is either heads or tails
        if args.tensorboard:
            writer.add_scalar('Action/Agent1', torch.mean(action_both[:, 0]),
                              epoch)
            writer.add_scalar('Action/agent2', torch.mean(action_both[:, 1]),
                              epoch)

        pi1_a_s = p1((torch.stack(state_1)[:, :obs_dim]).to(device))
        dist1 = Categorical(pi1_a_s)
        log_prob1 = dist1.log_prob(action_both[:, 0])

        # state_2 is [o2, o1], so agent 2's own observation is the first obs_dim entries
        pi2_a_s = p2((torch.stack(state_2)[:, :obs_dim]).to(device))
        dist2 = Categorical(pi2_a_s)
        log_prob2 = dist2.log_prob(action_both[:, 1])

        if args.tensorboard:
            writer.add_scalar('Entropy/Agent1',
                              dist1.entropy().mean().detach(), epoch)
            writer.add_scalar('Entropy/Agent2',
                              dist2.entropy().mean().detach(), epoch)

        objective1 = (log_prob1 * (-advantage1)).mean()
        optim_p1.zero_grad()
        objective2 = (log_prob2 * (-advantage2)).mean()
        optim_p2.zero_grad()

        start_p = time.time()
        objective1.backward()
        optim_p1.step()
        objective2.backward()
        optim_p2.step()
        end_p = time.time()

        t_p_opt = end_p - start_p
        total_t_p_opt += t_p_opt
        avg_t_p_opt = total_t_p_opt / (epoch + 1)

        if args.tensorboard:
            writer.add_scalar('Mean_prob_heads/Agent1',
                              pi1_a_s.data[:, 0].mean(), epoch)
            writer.add_scalar('Mean_prob_tails/Agent1',
                              pi1_a_s.data[:, 1].mean(), epoch)
            writer.add_scalar('Mean_prob_heads/Agent2',
                              pi2_a_s.data[:, 0].mean(), epoch)
            writer.add_scalar('Mean_prob_tails/Agent2',
                              pi2_a_s.data[:, 1].mean(), epoch)

            writer.add_scalar('Time/avg_t_p_opt', avg_t_p_opt, epoch)
            writer.add_scalar('Time/avg_t_q_opt', avg_t_q_opt, epoch)
            writer.add_scalar('Time/t_p_opt', t_p_opt, epoch)
            writer.add_scalar('Time/t_q_opt', t_q_opt, epoch)

        if args.logs:
            s = 'Epoch: {}\nMean probability of Heads/Agent1: {}, Mean probability of Tails/Agent1: {}\nMean probability of Heads/Agent2: {}, Mean probability of Tails/Agent2: {}\n\n'.format(
                epoch, pi1_a_s.data[:, 0].mean(), pi1_a_s.data[:, 1].mean(),
                pi2_a_s.data[:, 0].mean(), pi2_a_s.data[:, 1].mean())
            f_ptr.write(s)
            s = 'Average reward in the epoch for agent 1: {}\nAverage reward in the epoch for agent 2: {}\n\n'.format(
                sum(reward_1) / len(reward_1),
                sum(reward_2) / len(reward_2))
            f_ptr.write(s)
            s = 'Time for policy optimization in this epoch: {} seconds\nTime for critic optimization in this epoch: {} seconds\n\n'.format(
                t_p_opt, t_q_opt)
            f_ptr.write(
                s +
                '###############################################################################\n\n'
            )

        prob_h_1.append(pi1_a_s.data[:, 0].mean())
        prob_t_1.append(pi1_a_s.data[:, 1].mean())
        prob_h_2.append(pi2_a_s.data[:, 0].mean())
        prob_t_2.append(pi2_a_s.data[:, 1].mean())

        if args.save_model and epoch % 50 == 0:
            #print(epoch)
            torch.save(p1.state_dict(),
                       model_gda + '/policy_agent1_' + str(epoch) + ".pth")
            torch.save(p2.state_dict(),
                       model_gda + '/policy_agent2_' + str(epoch) + ".pth")
            torch.save(q1.state_dict(),
                       model_gda + '/value_agent1_' + str(epoch) + ".pth")
            torch.save(q2.state_dict(),
                       model_gda + '/value_agent2_' + str(epoch) + ".pth")

    end = time.time()
    total_time = end - start

    if args.logs:
        s = 'Total time taken: {} seconds\nTotal time for policy optimization steps only: {} seconds\nTotal time for critic optimization steps only: {} seconds\n'.format(
            total_time, total_t_p_opt, total_t_q_opt)
        f_ptr.write(s)
        s = 'Average time for policy optimization steps only: {} seconds\nAverage time for critic optimization steps only: {} seconds\n\n'.format(
            avg_t_p_opt, avg_t_q_opt)
        f_ptr.write(s)
        np_file = env_location + experiment_name + '/imp_gda_lr=' + str(
            args.lr_p1) + '_' + str(args.lr_p2) + '_rew_prob_h_t_time.npz'
        with open(np_file, 'wb') as np_f:
            np.savez(np_f, avg_rew_1 = np.array(avg_rew_1), avg_rew_2 = np.array(avg_rew_2), prob_h_1 = np.array(prob_h_1),\
              prob_t_1 = np.array(prob_t_1), prob_h_2 = np.array(prob_h_2), prob_t_2 = np.array(prob_t_2),\
              total_time = np.array(total_time), total_time_p_opt = np.array(total_t_p_opt), total_time_q_opt = np.array(total_t_q_opt),\
              avg_time_p_opt = np.array(avg_t_p_opt), avg_time_q_opt = np.array(avg_t_q_opt))

    fig = plt.figure(figsize=(15, 15))
    ax = plt.subplot()
    ax.clear()
    fig.suptitle(
        'Probabilities of heads and tails v/s #iterations/epochs (repeated steps = {})'
        .format(args.repeated_steps),
        fontsize=25)
    plt.xlabel('$Iterations/Epochs$', fontsize=20)
    plt.ylabel('$Probability$', fontsize=20)
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)
    ax.plot(np.array(prob_h_1),
            label='GDA, Prob_H1, lr1 = {}, lr2 = {}'.format(
                args.lr_p1, args.lr_p2))
    ax.plot(np.array(prob_t_1),
            label='GDA, Prob_T1, lr1 = {}, lr2 = {}'.format(
                args.lr_p1, args.lr_p2))
    ax.plot(np.array(prob_h_2),
            label='GDA, Prob_H2, lr1 = {}, lr2 = {}'.format(
                args.lr_p1, args.lr_p2))
    ax.plot(np.array(prob_t_2),
            label='GDA, Prob_T2, lr1 = {}, lr2 = {}'.format(
                args.lr_p1, args.lr_p2))
    plt.legend(loc='upper right')
    plt.grid()
    plt.savefig(env_location + experiment_name + '/imp_gda_lr=' +
                str(args.lr_p1) + '_' + str(args.lr_p2) + '_prob_h_t.png')

    fig = plt.figure(figsize=(15, 15))
    ax = plt.subplot()
    ax.clear()
    fig.suptitle(
        'Avg reward/epoch for Agent1 & Agent2 v/s #iterations/epochs (repeated steps = {})'
        .format(args.repeated_steps),
        fontsize=20)
    plt.xlabel(r'$Iterations/Epochs$', fontsize=20)
    plt.ylabel(r'$Average\ reward\ per\ epoch$', fontsize=20)
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)
    ax.plot(np.array(avg_rew_1),
            label='GDA, Avg rew/epoch Ag1, lr1 = {}, lr2 = {}'.format(
                args.lr_p1, args.lr_p2))
    ax.plot(np.array(avg_rew_2),
            label='GDA, Avg rew/epoch Ag2, lr1 = {}, lr2 = {}'.format(
                args.lr_p1, args.lr_p2))
    plt.legend(loc='upper right')
    plt.grid()
    plt.savefig(env_location + experiment_name + '/imp_gda_lr=' +
                str(args.lr_p1) + '_' + str(args.lr_p2) +
                '_avg_rew_per_epoch.png')
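
The loop above interacts with an iterated_matching_pennies environment whose definition is not included in this excerpt. The following sketch shows one way such an environment could look, assuming the standard zero-sum matching-pennies payoff (+1 to player 1 and -1 to player 2 when the two actions match, and the reverse otherwise) and the observation encoding described in the comments above (a 5-dimensional one-hot of the previous joint action, index 4 for the initial state, each agent seeing the joint action from its own perspective); the real class may differ.

import numpy as np

class iterated_matching_pennies:
    # Minimal sketch: the matrix game is repeated for a fixed number of steps and
    # each observation is a one-hot encoding of the previous joint action.
    def __init__(self, repeated_steps):
        self.repeated_steps = repeated_steps
        self.t = 0

    @staticmethod
    def _one_hot(idx):
        obs = np.zeros(5, dtype=np.float32)
        obs[idx] = 1.0
        return obs

    def reset(self):
        self.t = 0
        # index 4 marks the initial state (no previous joint action yet)
        return [self._one_hot(4), self._one_hot(4)], [0.0, 0.0], False

    def step(self, action):
        # action: [a1, a2], with the assumed encoding 0 = Heads, 1 = Tails
        a1, a2 = int(action[0]), int(action[1])
        self.t += 1
        r1 = 1.0 if a1 == a2 else -1.0  # player 1 wins when the pennies match
        # each agent observes the previous joint action from its own perspective
        obs = [self._one_hot(2 * a1 + a2), self._one_hot(2 * a2 + a1)]
        done = self.t >= self.repeated_steps
        return obs, [r1, -r1], done

Under these assumptions, observations[0] and observations[1] are exactly the 5-dimensional inputs consumed by p1 and p2 in the loop above, and done becomes True after repeated_steps plays.
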
def main():
    # training settings
    parser = argparse.ArgumentParser(description='MAVPG coin game')
    parser.add_argument('--n-epochs',
                        type=int,
                        default=151,
                        metavar='N',
                        help='number of epochs to train (default: 151)')
    parser.add_argument('--n-eps',
                        type=int,
                        default=10,
                        metavar='N',
                        help='number of episodes in an epoch (default: 10)')
    parser.add_argument(
        '--max-steps',
        type=int,
        default=500,
        metavar='N',
        help=
        'maximum number of steps for which the episode lasts (default: 500)')
    parser.add_argument(
        '--interval',
        type=int,
        default=50,
        metavar='N',
        help=
        'Interval of epochs to plot stats for last N steps and save model (default: N = 50)'
    )
    parser.add_argument('--lamda1',
                        type=float,
                        default=0.5,
                        metavar='LAM',
                        help='weight on performance of agent 1 (default: 0.5)')
    parser.add_argument('--lamda2',
                        type=float,
                        default=0.5,
                        metavar='LAM',
                        help='weight on performance of agent 2 (default: 0.5)')
    parser.add_argument(
        '--state-dim',
        type=int,
        default=4,
        metavar='DIM',
        help='dimension of the square matrix in the state input (default: 4)')
    parser.add_argument(
        '--action-dim',
        type=int,
        default=5,
        metavar='DIM',
        help='number of actions (default: 5 - UP, DOWN, LEFT, RIGHT, NOOP)')
    parser.add_argument(
        '--num-players',
        type=int,
        default=2,
        metavar='N',
        help=
        'number of players in the game - one color is given to each player (default: 2)'
    )
    parser.add_argument('--num-coins',
                        type=int,
                        default=1,
                        metavar='N',
                        help='number of coins in the game (default: 1)')
    parser.add_argument('--lr-p',
                        type=float,
                        default=0.5,
                        metavar='LR',
                        help='mavpg learning rate for actors (default: 0.5)')
    parser.add_argument('--lr-q1',
                        type=float,
                        default=0.01,
                        metavar='LR',
                        help='critic 1 learning rate (default: 0.01)')
    parser.add_argument('--lr-q2',
                        type=float,
                        default=0.01,
                        metavar='LR',
                        help='critic 2 learning rate (default: 0.01)')
    parser.add_argument('--beta',
                        type=float,
                        default=0.99,
                        metavar='MOM',
                        help='momentum (default: 0.99)')
    parser.add_argument('--eps',
                        type=float,
                        default=1e-8,
                        metavar='EPS',
                        help='epsilon (default: 1e-8)')
    parser.add_argument('--run-num',
                        type=int,
                        default=0,
                        metavar='NUM',
                        help='index of experiment run (default: 0)')
    parser.add_argument('--save-model',
                        action='store_true',
                        default=False,
                        help='save model parameters or not (default: False)')
    parser.add_argument('--rms',
                        action='store_true',
                        default=False,
                        help='use mavpg with rms or not (default: False)')
    parser.add_argument('--cuda',
                        action='store_true',
                        default=False,
                        help='use cuda or not (default: False)')
    parser.add_argument('--tensorboard',
                        action='store_true',
                        default=False,
                        help='use tensorboard or not (default: False)')
    parser.add_argument(
        '--activation-function',
        type=str,
        default='relu',
        help='which activation function to use (relu or tanh, default: relu)')
    parser.add_argument(
        '--policy',
        type=str,
        default='mlp',
        help='which type of policy to use (lstm or mlp, default: mlp)')
    parser.add_argument(
        '--hidden-dim',
        type=int,
        default=32,
        metavar='DIM',
        help='number of features in the hidden state of the LSTM (default: 32)'
    )
    parser.add_argument('--conv',
                        action='store_true',
                        default=False,
                        help='use convolutions or not (default: False)')
    parser.add_argument('--gamma',
                        type=float,
                        default=0.99,
                        metavar='GAMMA',
                        help='discount factor (default: 0.99)')
    parser.add_argument('--tau',
                        type=float,
                        default=0.95,
                        metavar='TAU',
                        help='GAE factor (default: 0.95)')
    parser.add_argument('--logs',
                        action='store_true',
                        default=False,
                        help='write data to logs or not (default: False)')
    parser.add_argument('--lola',
                        action='store_true',
                        default=False,
                        help='whether to use LOLA or not (default: False)')

    args = parser.parse_args()

    use_cuda = args.cuda and torch.cuda.is_available()
    #print(use_cuda, args.cuda, torch.cuda.is_available())
    if args.cuda and not torch.cuda.is_available():
        raise Exception('torch.cuda is not available!')
    #if args.cuda and not torch.cuda.is_available():
    #	print('CUDA CONFLICT')
    #	raise 'error'
    device = torch.device("cuda" if use_cuda else "cpu")

    env_location = '../tensorboard/coin_game'
    experiment_name = '/mavpg/run_' + str(
        args.run_num
    )  #+'_mavpg_lr='+str(args.lr_p)+'_'+args.activation_function+'_'+args.policy
    job_path = r'../coin_game/job_coin1.yaml'
    if os.path.exists(env_location + experiment_name):
        raise Exception('Run index {} already exists!'.format(args.run_num))
    os.makedirs(env_location + experiment_name)
    info_str = env_location + experiment_name + '/info_mavpg_run_' + str(
        args.run_num) + '.txt'
    with open(info_str, 'a') as info_ptr:
        info_ptr.write(
            'This is run number {} of MAVPG (LOLA1). The hyperparameters are:\n\n'
            .format(args.run_num))
        info_ptr.write('Running torch version {}\n\n'.format(
            torch.__version__))
        for arg in vars(args):
            info_ptr.write(str(arg) + ' ' + str(getattr(args, arg)) + '\n')
        #info_ptr.write('\nuse_cuda == args.cuda and torch.cuda.is_available() == {}\n'.format(use_cuda))
        with open(job_path) as f_y:
            d_y = yaml.full_load(f_y)
            info_ptr.write('\nResources: {}'.format(
                d_y['spec']['containers'][0]['resources']))
        if use_cuda:
            current_device = torch.cuda.current_device()
            info_ptr.write('\ntorch.cuda.device(current_device): {}'.format(
                torch.cuda.device(current_device)))
            info_ptr.write('\ntorch.cuda.device_count(): {}'.format(
                torch.cuda.device_count()))
            info_ptr.write(
                '\ntorch.cuda.get_device_name(current_device): {}\n\n'.format(
                    torch.cuda.get_device_name(current_device)))

    current_device = torch.cuda.current_device()

    model_mavpg = env_location + experiment_name + '/model'
    data_mavpg = env_location + experiment_name + '/data'

    if not os.path.exists(model_mavpg):
        os.makedirs(model_mavpg)
    if not os.path.exists(data_mavpg):
        os.makedirs(data_mavpg)

    FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor

    if args.num_players > 2:
        env = coin_game_Np(args.max_steps, args.state_dim, args.action_dim,
                           args.num_players, args.num_coins)
    else:
        env = coin_game_2p(args.max_steps, args.state_dim)

    if args.policy == 'mlp':
        p1 = MLP_policy(args.state_dim,
                        args.num_coins * args.num_players + args.num_players,
                        args.action_dim, args.conv,
                        args.activation_function).to(device)
        p2 = MLP_policy(args.state_dim,
                        args.num_coins * args.num_players + args.num_players,
                        args.action_dim, args.conv,
                        args.activation_function).to(device)
    elif args.policy == 'lstm':
        p1 = LSTM_policy(args.state_dim,
                         args.num_coins * args.num_players + args.num_players,
                         args.action_dim, args.hidden_dim, args.conv,
                         args.activation_function).to(device)
        p2 = LSTM_policy(args.state_dim,
                         args.num_coins * args.num_players + args.num_players,
                         args.action_dim, args.hidden_dim, args.conv,
                         args.activation_function).to(device)
    else:
        raise Exception('Policy type not recognized')

    q1 = MLP_value(args.state_dim,
                   args.num_coins * args.num_players + args.num_players,
                   args.conv, args.activation_function).to(device)
    q2 = MLP_value(args.state_dim,
                   args.num_coins * args.num_players + args.num_players,
                   args.conv, args.activation_function).to(device)

    with open(info_str, 'a') as info_ptr:
        info_ptr.write('Policy1 architecture:\n{}\n'.format(
            next(p1.named_modules())))
        info_ptr.write('Policy2 architecture:\n{}\n'.format(
            next(p2.named_modules())))
        info_ptr.write('Value1 architecture:\n{}\n'.format(
            next(q1.named_modules())))
        info_ptr.write('Value2 architecture:\n{}\n'.format(
            next(q2.named_modules())))

    if args.lola:
        policy_optim = LOLA1(p1.parameters(),
                             p2.parameters(),
                             lr=args.lr_p,
                             device=device)
    else:
        if not args.rms:
            policy_optim = CGD(p1.parameters(),
                               p2.parameters(),
                               lr=args.lr_p,
                               device=device)
        else:
            policy_optim = RCGD(p1.parameters(),
                                p2.parameters(),
                                lr=args.lr_p,
                                beta=args.beta,
                                eps=args.eps,
                                device=device)
    optim_q1 = torch.optim.Adam(q1.parameters(), lr=args.lr_q1)
    optim_q2 = torch.optim.Adam(q2.parameters(), lr=args.lr_q2)

    if args.logs:
        logs_file_path = env_location + experiment_name + '/coin_mavpg_run_' + str(
            args.run_num) + '.txt'
        f_ptr = open(logs_file_path, 'a')
        s = 'This is run number {} of MAVPG\n\n'.format(args.run_num)
        f_ptr.write(s)

    if args.tensorboard:
        writer = SummaryWriter(data_mavpg)

    n_pR_cR = n_pR_cB = n_pR_cR_interval = n_pR_cB_interval = 0
    n_pB_cR = n_pB_cB = n_pB_cR_interval = n_pB_cB_interval = 0
    n_draw = n_draw_interval = 0

    avg_rew_1 = []
    avg_rew_2 = []
    arr_pR_cR = []
    arr_pR_cB = []
    arr_pB_cR = []
    arr_pB_cB = []
    arr_draw = []

    total_t_p_opt = 0
    total_t_q_opt = 0
    total_t_play = 0
    avg_t_p_opt = 0
    avg_t_q_opt = 0
    avg_t_play = 0
    t_episodes = []
    num_episodes = 0
    start = time.time()

    print('1Allocated: {} GB'.format(
        round(torch.cuda.memory_allocated(current_device) / 1024**3, 1)))
    print('\n')
    print('1Max memory allocated: {} GB'.format(
        round(torch.cuda.max_memory_allocated(current_device) / 1024**3, 1)))
    print('\n')

    for epoch in range(args.n_epochs):
        state_1_2 = []
        if args.policy == 'lstm':
            hidden_state_1 = []
            cell_state_1 = []
            hidden_state_2 = []
            cell_state_2 = []
        action_1_2 = []
        reward_1 = []
        reward_2 = []
        state, rewards, done, info = env.reset()
        time_in_epoch = 0
        avg_time_in_epoch = 0
        n_rr_e = n_rb_e = n_br_e = n_bb_e = n_d_e = 0

        start_play = time.time()
        for eps in range(args.n_eps):
            time_in_episode = 0
            num_episodes += 1
            state, rewards, done, info = env.reset()
            if args.policy == 'lstm':
                hx_1 = torch.zeros(1, args.hidden_dim).to(device)
                cx_1 = torch.zeros(1, args.hidden_dim).to(device)
                hx_2 = torch.zeros(1, args.hidden_dim).to(device)
                cx_2 = torch.zeros(1, args.hidden_dim).to(device)
            while done == False:
                time_in_episode += 1
                if args.policy == 'lstm':
                    hidden_state_1.append(hx_1)
                    cell_state_1.append(cx_1)
                    hidden_state_2.append(hx_2)
                    cell_state_2.append(cx_2)
                    pi1_a_s, hx_1, cx_1 = p1(
                        (FloatTensor(state).unsqueeze(0)).to(device), hx_1,
                        cx_1)
                    pi2_a_s, hx_2, cx_2 = p2(
                        (FloatTensor(state).unsqueeze(0)).to(device), hx_2,
                        cx_2)
                else:
                    pi1_a_s = p1((FloatTensor(state).unsqueeze(0)).to(device))
                    pi2_a_s = p2((FloatTensor(state).unsqueeze(0)).to(device))

                dist1 = Categorical(pi1_a_s)
                action1 = dist1.sample()

                dist2 = Categorical(pi2_a_s)
                action2 = dist2.sample()

                action = np.array([action1.cpu(), action2.cpu()])

                state_1_2.append(FloatTensor(state))
                action_1_2.append(FloatTensor(action))

                state, rewards, done, info = env.step(action)
                reward_1.append(torch.FloatTensor([rewards[0]]))
                reward_2.append(torch.FloatTensor([rewards[1]]))

                if done == True:
                    break
            t_episodes.append(time_in_episode)
            s = 'Epoch number: {}, timesteps in episode {}: {}, Done reached\n'.format(
                epoch, eps, time_in_episode)
            #print(s)
            if args.logs:
                f_ptr.write(s)
            if info[0] == 'RED':
                s = '{} player (player1) got {} coin in episode {} of epoch {}\n'.format(
                    info[0], info[1], eps, epoch)
                # print(s)
                if args.logs:
                    f_ptr.write(s)
                if info[1] == 'RED':
                    n_pR_cR += 1
                    n_pR_cR_interval += 1
                    n_rr_e += 1
                    #arr_pR_cR.append(n_pR_cR/num_episodes)
                elif info[1] == 'BLUE':
                    n_pR_cB += 1
                    n_pR_cB_interval += 1
                    n_rb_e += 1
                    #arr_pR_cB.append(n_pR_cB/num_episodes)
            elif info[0] == 'BLUE':
                s = '{} player (player2) got {} coin in episode {} of epoch {}\n'.format(
                    info[0], info[1], eps, epoch)
                # print(s)
                if args.logs:
                    f_ptr.write(s)
                if info[1] == 'RED':
                    n_pB_cR += 1
                    n_pB_cR_interval += 1
                    n_br_e += 1
                    #arr_pB_cR.append(n_pB_cR/num_episodes)
                elif info[1] == 'BLUE':
                    n_pB_cB += 1
                    n_pB_cB_interval += 1
                    n_bb_e += 1
                    #arr_pB_cB.append(n_pB_cB/num_episodes)
            elif info[0] == 'NONE':
                s = 'NONE of the players got the coin in episode {} of epoch {}\n'.format(
                    eps, epoch)
                n_draw += 1
                n_draw_interval += 1
                n_d_e += 1
                #arr_draw.append(n_draw/num_episodes)
                if args.logs:
                    f_ptr.write(s)
            arr_pR_cR.append(n_pR_cR / num_episodes)
            arr_pR_cB.append(n_pR_cB / num_episodes)
            arr_pB_cR.append(n_pB_cR / num_episodes)
            arr_pB_cB.append(n_pB_cB / num_episodes)
            arr_draw.append(n_draw / num_episodes)
            time_in_epoch += time_in_episode
            s = 'Total timesteps in episode {} of epoch {}: {}\n\n'.format(
                eps, epoch, time_in_episode)
            #print(s)
            if args.logs:
                f_ptr.write(s)
            if args.tensorboard:
                writer.add_scalar('Timesteps/Total timesteps in episode',
                                  time_in_episode, epoch * args.n_eps + eps)
            #avg_time_in_epoch = time_in_episode/n_eps #(avg_time_in_eps_per_epoch * eps + time_in_episode)/(eps + 1)
            #writer.add_scalar('Average time in this epoch', avg_time_in_epoch, n_epochs)
        end_play = time.time()

        t_play = end_play - start_play
        total_t_play += t_play
        avg_t_play = total_t_play / (epoch + 1)

        avg_time_in_epoch = time_in_epoch / args.n_eps
        s = 'Total number of episodes in epoch {}: {}\n'.format(
            epoch, args.n_eps)
        if args.logs:
            f_ptr.write(s)
        s = 'Total timesteps in epoch {}: {}\n'.format(epoch, time_in_epoch)
        #print(s)
        if args.logs:
            f_ptr.write(s)
        s = 'Average timesteps in epoch {}: {}\n'.format(
            epoch, avg_time_in_epoch)
        #print(s)
        if args.logs:
            f_ptr.write(s)

        print('2Allocated: {} GB'.format(
            round(torch.cuda.memory_allocated(current_device) / 1024**3, 1)))
        print('\n')
        print('2Max memory allocated: {} GB'.format(
            round(
                torch.cuda.max_memory_allocated(current_device) / 1024**3, 1)))
        print('\n')

        val1 = q1(torch.stack(state_1_2).to(device))
        # v1 is (1, n)
        v1 = val1.detach().squeeze().cpu()
        v1 = torch.cat((v1, torch.FloatTensor([0])))
        r1 = torch.tensor(reward_1)

        print('3Allocated: {} GB'.format(
            round(torch.cuda.memory_allocated(current_device) / 1024**3, 1)))
        print('\n')
        print('3Max memory allocated: {} GB'.format(
            round(
                torch.cuda.max_memory_allocated(current_device) / 1024**3, 1)))
        print('\n')

        # advantage1 is on gpu and detached
        # advantage1 is (1, n)
        advantage1 = get_advantage(v1, r1, args.gamma, args.tau).to(device)

        print('4Allocated: {} GB'.format(
            round(torch.cuda.memory_allocated(current_device) / 1024**3, 1)))
        print('\n')
        print('4Max memory allocated: {} GB'.format(
            round(
                torch.cuda.max_memory_allocated(current_device) / 1024**3, 1)))
        print('\n')

        val2 = q2(torch.stack(state_1_2).to(device))
        v2 = val2.detach().squeeze().cpu()
        v2 = torch.cat((v2, torch.FloatTensor([0])))
        r2 = torch.tensor(reward_2)

        print('5Allocated: {} GB'.format(
            round(torch.cuda.memory_allocated(current_device) / 1024**3, 1)))
        print('\n')
        print('5Max memory allocated: {} GB'.format(
            round(
                torch.cuda.max_memory_allocated(current_device) / 1024**3, 1)))
        print('\n')

        advantage2 = get_advantage(v2, r2, args.gamma, args.tau).to(device)

        print('6Allocated: {} GB'.format(
            round(torch.cuda.memory_allocated(current_device) / 1024**3, 1)))
        print('\n')
        print('6Max memory allocated: {} GB'.format(
            round(
                torch.cuda.max_memory_allocated(current_device) / 1024**3, 1)))
        print('\n')

        avg_rew_1.append(sum(reward_1) / len(reward_1))
        avg_rew_2.append(sum(reward_2) / len(reward_2))

        s = '\nAverage reward in this epoch for Agent1 (RED): {}\nAverage reward in this epoch for Agent2 (BLUE): {}\n'.format(
            sum(reward_1) / len(reward_1),
            sum(reward_2) / len(reward_2))
        # print(s)
        if args.logs:
            f_ptr.write(s)
        s = '\nMean advantage for Agent1 (RED) (eta_new - eta_old): {}\nMean advantage for Agent2 (BLUE) -(eta_new - eta_old): {}\n'.format(
            advantage1.mean().cpu(),
            advantage2.mean().cpu())
        # print(s)
        if args.logs:
            f_ptr.write(s)

        print('7Allocated: {} GB'.format(
            round(torch.cuda.memory_allocated(current_device) / 1024**3, 1)))
        print('\n')
        print('7Max memory allocated: {} GB'.format(
            round(
                torch.cuda.max_memory_allocated(current_device) / 1024**3, 1)))
        print('\n')

        q1_loss = (r1.to(device) + args.gamma * val1[1:] -
                   val1[:-1]).pow(2).mean()
        q2_loss = (r2.to(device) + args.gamma * val2[1:] -
                   val2[:-1]).pow(2).mean()
        optim_q1.zero_grad()
        optim_q2.zero_grad()

        print('8Allocated: {} GB'.format(
            round(torch.cuda.memory_allocated(current_device) / 1024**3, 1)))
        print('\n')
        print('8Max memory allocated: {} GB'.format(
            round(
                torch.cuda.max_memory_allocated(current_device) / 1024**3, 1)))
        print('\n')

        start_q = time.time()
        q1_loss.backward()
        optim_q1.step()
        q2_loss.backward()
        optim_q2.step()
        end_q = time.time()

        print('9Allocated: {} GB'.format(
            round(torch.cuda.memory_allocated(current_device) / 1024**3, 1)))
        print('\n')
        print('9Max memory allocated: {} GB'.format(
            round(
                torch.cuda.max_memory_allocated(current_device) / 1024**3, 1)))
        print('\n')

        t_q_opt = end_q - start_q
        total_t_q_opt += t_q_opt
        avg_t_q_opt = total_t_q_opt / (epoch + 1)

        action_both = torch.stack(action_1_2).to(device)

        print('10Allocated: {} GB'.format(
            round(torch.cuda.memory_allocated(current_device) / 1024**3, 1)))
        print('\n')
        print('10Max memory allocated: {} GB'.format(
            round(
                torch.cuda.max_memory_allocated(current_device) / 1024**3, 1)))
        print('\n')

        if args.policy == 'lstm':
            pi1_a_s, _, _ = p1(
                torch.stack(state_1_2).to(device),
                torch.cat(hidden_state_1).to(device),
                torch.cat(cell_state_1).to(device))
            pi2_a_s, _, _ = p2(
                torch.stack(state_1_2).to(device),
                torch.cat(hidden_state_2).to(device),
                torch.cat(cell_state_2).to(device))
        else:
            pi1_a_s = p1(torch.stack(state_1_2).to(device))
            pi2_a_s = p2(torch.stack(state_1_2).to(device))

        print('11Allocated: {} GB'.format(
            round(torch.cuda.memory_allocated(current_device) / 1024**3, 1)))
        print('\n')
        print('11Max memory allocated: {} GB'.format(
            round(
                torch.cuda.max_memory_allocated(current_device) / 1024**3, 1)))
        print('\n')

        dist1 = Categorical(pi1_a_s)
        log_prob1 = dist1.log_prob(action_both[:, 0])

        print('12Allocated: {} GB'.format(
            round(torch.cuda.memory_allocated(current_device) / 1024**3, 1)))
        print('\n')
        print('12Max memory allocated: {} GB'.format(
            round(
                torch.cuda.max_memory_allocated(current_device) / 1024**3, 1)))
        print('\n')

        dist2 = Categorical(pi2_a_s)
        log_prob2 = dist2.log_prob(action_both[:, 1])

        print('13Allocated: {} GB'.format(
            round(torch.cuda.memory_allocated(current_device) / 1024**3, 1)))
        print('\n')
        print('13Max memory allocated: {} GB'.format(
            round(
                torch.cuda.max_memory_allocated(current_device) / 1024**3, 1)))
        print('\n')

        cum_log_prob1 = torch.zeros(log_prob1.shape[0] - 1).to(device)
        cum_log_prob2 = torch.zeros(log_prob2.shape[0] - 1).to(device)
        cum_log_prob1[0] = log_prob1[0]
        cum_log_prob2[0] = log_prob2[0]

        print('14Allocated: {} GB'.format(
            round(torch.cuda.memory_allocated(current_device) / 1024**3, 1)))
        print('\n')
        print('14Max memory allocated: {} GB'.format(
            round(
                torch.cuda.max_memory_allocated(current_device) / 1024**3, 1)))
        print('\n')

        for i in range(1, log_prob1.shape[0] - 1):
            cum_log_prob1[i] = cum_log_prob1[i - 1] + log_prob1[i]
            cum_log_prob2[i] = cum_log_prob2[i - 1] + log_prob2[i]

        print('15Allocated: {} GB'.format(
            round(torch.cuda.memory_allocated(current_device) / 1024**3, 1)))
        print('\n')
        print('15Max memory allocated: {} GB'.format(
            round(
                torch.cuda.max_memory_allocated(current_device) / 1024**3, 1)))
        print('\n')

        lp_x_1 = (log_prob1 * (-advantage1)).mean()
        lp_x_2 = (log_prob1 * (-advantage2)).mean()
        lp_y_1 = (log_prob2 * (-advantage1)).mean()
        lp_y_2 = (log_prob2 * (-advantage2)).mean()

        print('16Allocated: {} GB'.format(
            round(torch.cuda.memory_allocated(current_device) / 1024**3, 1)))
        print('\n')
        print('16Max memory allocated: {} GB'.format(
            round(
                torch.cuda.max_memory_allocated(current_device) / 1024**3, 1)))
        print('\n')

        # f -> trying to maximize both agents' performance
        #lp_x = args.lamda1 * lp_x_1 + args.lamda2 * lp_x_2
        # g -> trying to maximize both agents' performance
        #lp_y = args.lamda1 * lp_y_1 + args.lamda2 * lp_y_2

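        # Bilinear terms coupling the two agents' log-probabilities; the optimizer differentiates these for the mixed grad_xy / grad_yx corrections logged as 'Norm/...' scalars below.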
        mh1_1 = (log_prob1 * log_prob2 * (-advantage1)).mean()
        mh2_1 = (log_prob1[1:] * cum_log_prob2 * (-advantage1[1:])).mean()
        #mh2_1 = mh2_1.sum()/(mh2_1.size(0)-args.n_eps+1)
        mh3_1 = (log_prob2[1:] * cum_log_prob1 * (-advantage1[1:])).mean()
        #mh3_1 = mh3_1.sum()/(mh3_1.size(0)-args.n_eps+1)

        print('17Allocated: {} GB'.format(
            round(torch.cuda.memory_allocated(current_device) / 1024**3, 1)))
        print('\n')
        print('17Max memory allocated: {} GB'.format(
            round(
                torch.cuda.max_memory_allocated(current_device) / 1024**3, 1)))
        print('\n')

        mh1_2 = (log_prob1 * log_prob2 * (-advantage2)).mean()
        mh2_2 = (log_prob1[1:] * cum_log_prob2 * (-advantage2[1:])).mean()
        #mh2_2 = mh2_2.sum()/(mh2_2.size(0)-args.n_eps+1)
        mh3_2 = (log_prob2[1:] * cum_log_prob1 * (-advantage2[1:])).mean()
        #mh3_2 = mh3_2.sum()/(mh3_2.size(0)-args.n_eps+1)

        print('18Allocated: {} GB'.format(
            round(torch.cuda.memory_allocated(current_device) / 1024**3, 1)))
        print('\n')
        print('18Max memory allocated: {} GB'.format(
            round(
                torch.cuda.max_memory_allocated(current_device) / 1024**3, 1)))
        print('\n')

        mh_1 = mh1_1 + mh2_1 + mh3_1
        mh_2 = mh1_2 + mh2_2 + mh3_2

        #mh = args.lamda1 * mh_1 + args.lamda2 * mh_2

        print('19Allocated: {} GB'.format(
            round(torch.cuda.memory_allocated(current_device) / 1024**3, 1)))
        print('\n')
        print('19Max memory allocated: {} GB'.format(
            round(
                torch.cuda.max_memory_allocated(current_device) / 1024**3, 1)))
        print('\n')

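        # Simultaneous MAVPG update of both policies from the first-order (lp_*) and mixed (mh_*) terms, with the step timed.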
        policy_optim.zero_grad()

        start_p = time.time()
        policy_optim.step(lp_x_1, lp_x_2, lp_y_1, lp_y_2, mh_1, mh_2)
        end_p = time.time()

        print('20Allocated: {} GB'.format(
            round(torch.cuda.memory_allocated(current_device) / 1024**3, 1)))
        print('\n')
        print('20Max memory allocated: {} GB'.format(
            round(
                torch.cuda.max_memory_allocated(current_device) / 1024**3, 1)))
        print('\n')

        t_p_opt = end_p - start_p
        total_t_p_opt += t_p_opt
        avg_t_p_opt = total_t_p_opt / (epoch + 1)

        grad_x, grad_x_y_mh, grad_y, grad_y_x_mh = policy_optim.getinfo()

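        # Per-epoch logging of coin pick-ups and draws; every args.interval epochs the interval counters are also reported and reset.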
        s = '\nRED player got {} RED coins in epoch {} | RED player got {} BLUE coins in epoch {}\nBLUE player got {} RED coins in epoch {} | BLUE player got {} BLUE coins in epoch {}\n'.format(
            n_rr_e, epoch, n_rb_e, epoch, n_br_e, epoch, n_bb_e, epoch)
        if args.logs:
            f_ptr.write(s)
        s = 'Number of draws in epoch {}: {}\n'.format(epoch, n_d_e)
        if args.logs:
            f_ptr.write(s)
        if (epoch + 1) % args.interval == 0:
            s = '\nRED player got {} RED coins in last {} epochs | RED player got {} BLUE coins in last {} epochs\n'.format(
                n_pR_cR_interval, args.interval, n_pR_cB_interval,
                args.interval)
            if args.logs:
                f_ptr.write(s)
            s = 'BLUE player got {} RED coins in last {} epochs | BLUE player got {} BLUE coins in last {} epochs\n'.format(
                n_pB_cR_interval, args.interval, n_pB_cB_interval,
                args.interval)
            if args.logs:
                f_ptr.write(s)
            s = 'Number of draws in last {} epochs: {}\n'.format(
                args.interval, n_draw_interval)
            if args.logs:
                f_ptr.write(s)
            s = 'RED-player-RED-coin rate in last {} epochs: {} | RED-player-BLUE-coin rate in last {} epochs: {}\n'.format(
                args.interval, n_pR_cR_interval / (args.interval * args.n_eps),
                args.interval, n_pR_cB_interval / (args.interval * args.n_eps))
            if args.logs:
                f_ptr.write(s)
            s = 'BLUE-player-RED-coin rate in last {} epochs: {} | BLUE-player-BLUE-coin rate in last {} epochs: {}\n'.format(
                args.interval, n_pB_cR_interval / (args.interval * args.n_eps),
                args.interval, n_pB_cB_interval / (args.interval * args.n_eps))
            if args.logs:
                f_ptr.write(s)
            s = 'DRAW rate in last {} epochs: {}\n'.format(
                args.interval, n_draw_interval / (args.interval * args.n_eps))
            if args.logs:
                f_ptr.write(s)
            if args.tensorboard:
                writer.add_scalar(
                    'Coin take rate {} epochs/RED player_RED coin'.format(
                        args.interval),
                    (n_pR_cR_interval) / (args.interval * args.n_eps), epoch)
                writer.add_scalar(
                    'Coin take rate {} epochs/RED player_BLUE coin'.format(
                        args.interval),
                    (n_pR_cB_interval) / (args.interval * args.n_eps), epoch)
                writer.add_scalar(
                    'Coin take rate {} epochs/BLUE player_RED coin'.format(
                        args.interval),
                    (n_pB_cR_interval) / (args.interval * args.n_eps), epoch)
                writer.add_scalar(
                    'Coin take rate {} epochs/BLUE player_BLUE coin'.format(
                        args.interval),
                    (n_pB_cB_interval) / (args.interval * args.n_eps), epoch)
                writer.add_scalar(
                    'Coin take rate {} epochs/Draw rate'.format(args.interval),
                    (n_draw_interval) / (args.interval * args.n_eps), epoch)
            n_pR_cR_interval = n_pR_cB_interval = 0
            n_pB_cR_interval = n_pB_cB_interval = 0
            n_draw_interval = 0

        tot_games = (epoch + 1) * args.n_eps
        s = '\nRED player got {} RED coins till now | RED player got {} BLUE coins till now\n'.format(
            n_pR_cR, n_pR_cB)
        if args.logs:
            f_ptr.write(s)
        s = 'BLUE player got {} RED coins till now | BLUE player got {} BLUE coins till now\n'.format(
            n_pB_cR, n_pB_cB)
        if args.logs:
            f_ptr.write(s)
        s = 'Number of draws till now: {}\n\n'.format(n_draw)
        if args.logs:
            f_ptr.write(s)
        s = 'RED-player-RED-coin rate till now: {} | RED-player-BLUE-coin rate till now: {}\n'.format(
            n_pR_cR / tot_games, n_pR_cB / tot_games)
        if args.logs:
            f_ptr.write(s)
        s = 'BLUE-player-RED-coin rate till now: {} | BLUE-player-BLUE-coin rate till now: {}\n'.format(
            n_pB_cR / tot_games, n_pB_cB / tot_games)
        if args.logs:
            f_ptr.write(s)
        s = 'DRAW rate till now: {}\n'.format(n_draw / tot_games)
        if args.logs:
            f_ptr.write(s)
        s = '\nTime for game play in this epoch: {} seconds\n'.format(t_play)
        if args.logs:
            f_ptr.write(s)
        s = '\nTime for policy optimization in this epoch: {} seconds\nTime for critic optimization in this epoch: {} seconds\n\n'.format(
            t_p_opt, t_q_opt)
        if args.logs:
            f_ptr.write(
                s +
                '##################################################################################################################'
                + '\n\n')

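        # TensorBoard scalars: timesteps, average rewards, mean advantages, policy entropies, cumulative coin-take rates, timing, and gradient norms.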
        if args.tensorboard:
            writer.add_scalar('Timesteps/Total timesteps in this epoch',
                              time_in_epoch, epoch)
            writer.add_scalar(
                'Timesteps/Average timesteps in one episode in this epoch',
                avg_time_in_epoch, epoch)

            writer.add_scalar('Average reward in the epoch/Agent1',
                              sum(reward_1) / len(reward_1), epoch)
            writer.add_scalar('Average reward in the epoch/Agent2',
                              sum(reward_2) / len(reward_2), epoch)

            writer.add_scalar('Mean advantage in the epoch/Agent1',
                              advantage1.mean().cpu(), epoch)
            writer.add_scalar('Mean advantage in the epoch/Agent2',
                              advantage2.mean().cpu(), epoch)

            writer.add_scalar('Entropy/Agent1',
                              dist1.entropy().mean().detach().cpu(), epoch)
            writer.add_scalar('Entropy/Agent2',
                              dist2.entropy().mean().detach().cpu(), epoch)

            writer.add_scalar('Coin take rate/RED player_RED coin',
                              (n_pR_cR) / tot_games, epoch)
            writer.add_scalar('Coin take rate/RED player_BLUE coin',
                              (n_pR_cB) / tot_games, epoch)
            writer.add_scalar('Coin take rate/BLUE player_RED coin',
                              (n_pB_cR) / tot_games, epoch)
            writer.add_scalar('Coin take rate/BLUE player_BLUE coin',
                              (n_pB_cB) / tot_games, epoch)
            writer.add_scalar('Coin take rate/Draw rate', (n_draw) / tot_games,
                              epoch)

            writer.add_scalar('Time/Avg time for policy optimization',
                              avg_t_p_opt, epoch)
            writer.add_scalar('Time/Avg time for critic optimization',
                              avg_t_q_opt, epoch)
            writer.add_scalar('Time/Avg time for game play', avg_t_play, epoch)
            writer.add_scalar('Time/Total time for policy optimization',
                              t_p_opt, epoch)
            writer.add_scalar('Time/Total time for critic optimization',
                              t_q_opt, epoch)
            writer.add_scalar('Time/Total time for game play', t_play, epoch)

            writer.add_scalar('Norm/grad_x(f)', grad_x, epoch)
            writer.add_scalar('Norm/grad_xy(f)_grad_y(g)', grad_x_y_mh, epoch)
            writer.add_scalar('Norm/grad_y(g)', grad_y, epoch)
            writer.add_scalar('Norm/grad_yx(g)_grad_x(f)', grad_y_x_mh, epoch)

        if args.save_model and epoch == args.n_epochs - 1:  #epoch % args.interval == 0:
            #print(epoch)
            torch.save(p1.state_dict(),
                       model_mavpg + '/policy_agent1_' + str(epoch) + ".pth")
            torch.save(p2.state_dict(),
                       model_mavpg + '/policy_agent2_' + str(epoch) + ".pth")
            torch.save(q1.state_dict(),
                       model_mavpg + '/value_agent1_' + str(epoch) + ".pth")
            torch.save(q2.state_dict(),
                       model_mavpg + '/value_agent2_' + str(epoch) + ".pth")

        print(
            '################################################################################'
        )

    end = time.time()
    total_time = end - start
    if args.logs:
        s = 'Total time taken: {} seconds\nTotal time for game play only: {} seconds\nTotal time for policy optimization steps only: {} seconds\nTotal time for critic optimization steps only: {} seconds\n'.format(
            total_time, total_t_play, total_t_p_opt, total_t_q_opt)
        f_ptr.write(s)
        s = 'Average time for policy optimization steps only: {} seconds\nAverage time for game play only: {} seconds\nAverage time for critic optimization steps only: {} seconds\n\n'.format(
            avg_t_p_opt, avg_t_play, avg_t_q_opt)
        f_ptr.write(s)
        np_file = env_location + experiment_name + '/coin_mavpg_run_' + str(
            args.run_num) + '_lr=' + str(args.lr_p) + '_stuff.npz'
        with open(np_file, 'wb') as np_f:
            np.savez(np_f, avg_rew_1 = np.array(avg_rew_1), avg_rew_2 = np.array(avg_rew_2), n_pR_cR = np.array(n_pR_cR),
                     n_pR_cB = np.array(n_pR_cB), n_pB_cR = np.array(n_pB_cR), n_pB_cB = np.array(n_pB_cB),
                     n_draw = np.array(n_draw), arr_pR_cR = np.array(arr_pR_cR), arr_pR_cB = np.array(arr_pR_cB),
                     arr_pB_cR = np.array(arr_pB_cR), arr_pB_cB = np.array(arr_pB_cB), arr_draw = np.array(arr_draw),
                     total_time = np.array(total_time), total_play_time = np.array(total_t_play),
                     total_time_p_opt = np.array(total_t_p_opt), total_time_q_opt = np.array(total_t_q_opt),
                     avg_time_p_opt = np.array(avg_t_p_opt), avg_play_time = np.array(avg_t_play),
                     avg_time_q_opt = np.array(avg_t_q_opt), time_in_episodes = np.array(t_episodes))

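    # Figure 1: coin-picking rates of both players (and the draw rate) over training.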
    fig = plt.figure(figsize=(15, 15))
    ax = plt.subplot()
    ax.clear()
    fig.suptitle('Coin picking rates for RED and BLUE agents', fontsize=25)
    plt.xlabel(
        r'$Number\ of\ episodes\ (Num\ epochs\ =\ Total\ num\ eps/num\ eps\ per\ epoch)$',
        fontsize=20)
    plt.ylabel(r'$Coin\ picking\ rate$', fontsize=20)
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)
    ax.plot(np.array(arr_pR_cR),
            label='MAVPG, RED player picks RED coin, lr = {}'.format(
                args.lr_p))
    ax.plot(np.array(arr_pR_cB),
            label='MAVPG, RED player picks BLUE coin, lr = {}'.format(
                args.lr_p))
    ax.plot(np.array(arr_pB_cR),
            label='MAVPG, BLUE player picks RED coin, lr = {}'.format(
                args.lr_p))
    ax.plot(np.array(arr_pB_cB),
            label='MAVPG, BLUE player picks BLUE coin, lr = {}'.format(
                args.lr_p))
    ax.plot(np.array(arr_draw), label='MAVPG, DRAW, lr = {}'.format(args.lr_p))
    plt.legend(loc='upper right')
    plt.grid()
    plt.savefig(env_location + experiment_name + '/coin_mavpg_run_' +
                str(args.run_num) + '_lr=' + str(args.lr_p) + '_pick_rate.png')

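    # Figure 2: average reward per epoch for both agents.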
    fig = plt.figure(figsize=(15, 15))
    ax = plt.subplot()
    ax.clear()
    fig.suptitle(
        'Avg reward per epoch for Agent1 (RED) & Agent2 (BLUE) v/s #epochs/iterations',
        fontsize=25)
    plt.xlabel(r'$Epochs/Iterations$', fontsize=20)
    plt.ylabel(r'$Average\ reward\ per\ epoch$', fontsize=20)
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)
    ax.plot(np.array(avg_rew_1),
            label='MAVPG, Avg rew per epoch Ag1 (RED), lr = {}'.format(
                args.lr_p))
    ax.plot(np.array(avg_rew_2),
            label='MAVPG, Avg rew per epoch Ag2 (BLUE), lr = {}'.format(
                args.lr_p))
    plt.legend(loc='upper right')
    plt.grid()
    plt.savefig(env_location + experiment_name + '/coin_mavpg_run_' +
                str(args.run_num) + '_lr=' + str(args.lr_p) +
                '_avg_rew_per_epoch.png')

    for l, p in enumerate(p2.parameters()):
        if l == 0:
            writer.add_scalar('Controller/Agent2', p.data, t_eps)
        else:
            writer.add_scalar('controller_std/Agent2', p.data, t_eps)


    # writer.add_scalar('Action_prob/agent1', action1_prob[3], t_eps)
    # writer.add_scalar('Action_prob/agent2', action2_prob[3], t_eps)

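    # Critic value estimates and GAE returns for both agents; advantages are returns minus the detached values.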
    val1 = q(torch.stack(mat_state1))
    val1 = val1.detach()
    next_value = 0  # because currently we only end when the episode is done, which is equivalent to having no next state
    returns_np1 = get_advantage(next_value, torch.stack(mat_reward1), val1, torch.stack(mat_done), gamma=0.99, tau=0.95)

    returns1 = torch.cat(returns_np1)
    advantage_mat1 = returns1 - val1.transpose(0,1)


    val2 = q(torch.stack(mat_state2))
    val2 = val2.detach()
    next_value = 0  # because currently we only end when the episode is done, which is equivalent to having no next state
    returns_np2 = get_advantage(next_value, torch.stack(mat_reward2), val2, torch.stack(mat_done), gamma=0.99, tau=0.95)

    returns2 = torch.cat(returns_np2)
    advantage_mat2 = returns2 - val2.transpose(0,1)

    writer.add_scalar('Advantage/agent1', advantage_mat1.mean(), t_eps)
    writer.add_scalar('Advantage/agent2', advantage_mat2.mean(), t_eps)