train_maddpg.py (forked from maxmax1992/DDPG-MADDPG)
# import sys
# sys.path.append('.')
import argparse
from collections import deque

import numpy as np
from torch.utils.tensorboard import SummaryWriter

from MADDPG_trainer import MADDPG_Trainer
from utils import make_multiagent_env, map_to_tensors

def get_args():
    parser = argparse.ArgumentParser(description=None)
    parser.add_argument('--env', default="simple", type=str, help='multiagent-particle-envs scenario name')
    parser.add_argument('--n_eps', default=10000, type=int, help='number of training episodes')
    parser.add_argument('--T', default=25, type=int, help='maximum timesteps per episode')
    parser.add_argument("--render", action="store_true", help="render the environment")
    parser.add_argument("--use_writer", action="store_true", help="log training metrics to TensorBoard")
    parser.add_argument("--use_ounoise", action="store_true", help="use Ornstein-Uhlenbeck exploration noise")
    parser.add_argument("--lograte", default=100, type=int, help="log running reward every N episodes")
    parser.add_argument("--train_freq", default=100, type=int, help="train every N environment timesteps")
    parser.add_argument("--buffer_length", default=int(1e6), type=int, help="replay buffer capacity")
    parser.add_argument("--batch_size", default=1024, type=int, help="batch size for training")
    return parser.parse_args()
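
# Example invocation (assumes the multiagent-particle-envs scenarios wrapped
# by make_multiagent_env are available on this machine):
#
#   python train_maddpg.py --use_writer --train_freq 100 --batch_size 1024
#
# Note that learn_episodic_MADDPG below currently hard-codes the scenario to
# "simple_speaker_listener", overriding the --env flag.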
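
# The --use_ounoise flag is consumed inside MADDPG_Trainer; the class below is
# only a minimal sketch of Ornstein-Uhlenbeck exploration noise to show what
# the flag toggles. The name and all parameter values here are assumptions,
# not the repo's actual implementation.
class _OUNoiseSketch:
    def __init__(self, dim, mu=0.0, theta=0.15, sigma=0.2):
        self.mu, self.theta, self.sigma = mu, theta, sigma
        self.state = np.ones(dim) * mu

    def reset(self):
        self.state[:] = self.mu

    def sample(self):
        # dx = theta * (mu - x) dt + sigma * dW: mean-reverting Gaussian noise,
        # giving temporally correlated exploration for continuous actions.
        self.state += self.theta * (self.mu - self.state) \
            + self.sigma * np.random.randn(*self.state.shape)
        return self.state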
def learn_episodic_MADDPG(args):
    # NOTE: the scenario is hard-coded here and overrides the --env flag.
    args.env = "simple_speaker_listener"
    # args.discrete_action = True
    env = make_multiagent_env(args.env)
    if not args.use_writer:
        print("not using writer")
    n_agents = len(env.agents)
    action_spaces = [act_sp.n for act_sp in env.action_space]
    observation_spaces = [ob_sp.shape[0] for ob_sp in env.observation_space]
    log_dir = "maddpg_test_run"
    writer = SummaryWriter(log_dir) if args.use_writer else None
    running_rewards = deque([], maxlen=args.lograte)
    trainer = MADDPG_Trainer(n_agents, action_spaces, observation_spaces, writer, args)
    trainer.eval()
    timesteps = 0
    episode_rewards = [0.0]
    for ep in range(args.n_eps):
        observations = env.reset()
        trainer.reset()
        for t in range(args.T):
            timesteps += 1
            actions = trainer.get_actions(observations)
            actions = [a.cpu().numpy() for a in actions]
            next_obs, rewards, dones, _ = env.step(actions)
            trainer.store_transitions(*map_to_tensors(observations, actions, rewards, next_obs, dones))
            # t runs from 0 to args.T - 1, so compare against args.T - 1 to
            # detect the time limit (the original `t >= args.T` was never true).
            done = all(dones) or t >= args.T - 1
            if timesteps % args.train_freq == 0:
                trainer.prep_training()
                trainer.sample_and_train(args.batch_size)
                trainer.eval()
            observations = next_obs
            if args.render:
                env.render()
            # Episode reward is accumulated over all agents, then averaged
            # per agent for logging below.
            episode_rewards[-1] += np.sum(rewards)
            if done:
                break
        if args.use_writer:
            writer.add_scalar('rewards', episode_rewards[-1] / n_agents, ep)
        running_rewards.append(episode_rewards[-1] / n_agents)
        episode_rewards.append(0)
        if (ep + 1) % args.lograte == 0:
            print(f"episode: {ep + 1}, running episode rewards: {np.mean(running_rewards)}")
    # TODO ADD logging to the
    if args.use_writer:
        # export_scalars_to_json is a tensorboardX API that
        # torch.utils.tensorboard's SummaryWriter does not provide; closing
        # the writer flushes the event files instead.
        writer.close()
    return 0
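
# map_to_tensors is imported from utils.py; the sketch below is only an
# assumption about its shape, inferred from the call site above: it converts
# the per-agent numpy values of one transition into torch tensors so that
# trainer.store_transitions(*...) receives five lists of tensors.
def _map_to_tensors_sketch(obs, actions, rewards, next_obs, dones):
    import torch

    def to_tensors(xs):
        return [torch.as_tensor(np.asarray(x), dtype=torch.float32) for x in xs]

    return (to_tensors(obs), to_tensors(actions), to_tensors(rewards),
            to_tensors(next_obs), to_tensors(dones))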
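
# Illustrative sketch of the centralized-critic target that the MADDPG update
# in trainer.sample_and_train is presumed to compute (Lowe et al., 2017,
# "Multi-Agent Actor-Critic for Mixed Cooperative-Competitive Environments").
# Every name here (critic, target_critic, target_actors, gamma) is a
# hypothetical stand-in, not the repo's actual API.
def _maddpg_critic_loss_sketch(critic, target_critic, target_actors,
                               obs, acts, rews_i, next_obs, dones, gamma=0.95):
    """One agent's critic loss: y = r_i + gamma * (1 - done) * Q'(x', a1', ..., aN')."""
    import torch
    import torch.nn.functional as F

    with torch.no_grad():
        # Each target actor acts on its own next observation; the centralized
        # critic conditions on the concatenation of all observations/actions.
        next_acts = [pi(o) for pi, o in zip(target_actors, next_obs)]
        q_next = target_critic(torch.cat(next_obs, dim=1), torch.cat(next_acts, dim=1))
        y = rews_i + gamma * (1.0 - dones) * q_next
    q = critic(torch.cat(obs, dim=1), torch.cat(acts, dim=1))
    return F.mse_loss(q, y)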

if __name__ == '__main__':
    args = get_args()
    learn_episodic_MADDPG(args)