def train_agents_qmix(env, training_agents, models, args):
    buffer = ReplayBuffer(size=args.buffer_size)
    mac = MultiAgentController(env, training_agents, models, args)

    for step_idx in range(args.n_steps):
        episode = generate_episode(env, args)
        buffer.insert_list(episode)
        if len(buffer) < args.batch_size:
            continue  # wait until the buffer holds at least one full batch

        batch = buffer.sample(args.batch_size)
        loss = mac.update(batch)
        if step_idx % args.sync_interval == 0:
            mac.sync_networks()  # copy the online network into the target network

        ## logging
        ex.log_scalar('loss', loss)
        if step_idx % args.log_interval == 0:
            episode = generate_episode(env, args, test_mode=True)
            if step_idx == 0:
                # treat the very first test episode as a loss so the win curves start at 0
                episode[-1].rewards["blue"] = 0
                episode[-1].rewards["red"] = 1
            ex.log_scalar('length', len(episode))
            ex.log_scalar('reward', episode[-1].rewards["blue"])
            ex.log_scalar('win_blue', int(episode[-1].rewards["blue"] == 1))
            ex.log_scalar('win_red', int(episode[-1].rewards["red"] == 1))
            ex.log_scalar('epsilon', training_agents[0].scheduler())

    return training_agents
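# train_agents_qmix only relies on three ReplayBuffer operations: insert_list,
# sample and __len__. The class below is a minimal, illustrative stand-in with
# that interface (FIFO storage, uniform sampling); it is an assumption about the
# buffer's behaviour, not the project's actual ReplayBuffer implementation.
import random
from collections import deque

class SimpleReplayBuffer:
    """Fixed-size FIFO experience buffer with uniform random sampling (sketch)."""

    def __init__(self, size):
        self.content = deque(maxlen=size)  # oldest experiences are evicted first

    def insert_list(self, experiences):
        # append a whole episode (a list of experience tuples) at once
        self.content.extend(experiences)

    def sample(self, batch_size):
        # uniform sampling without replacement
        return random.sample(list(self.content), batch_size)

    def __len__(self):
        return len(self.content)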
def train_agents_reinforce(env, training_agents, args):
    epi_len, nwins = 0, 0
    n_episodes = 0

    for step_idx in range(int(args.n_steps / args.n_episodes_per_step)):
        # collect a fresh batch of full episodes for the on-policy update
        batch = []
        for _ in range(args.n_episodes_per_step):
            episode = generate_episode(env, args)
            n_episodes += 1
            batch.extend(episode)

            epi_len += len(episode)
            reward = episode[-1].rewards["blue"]

            ex.log_scalar('length', len(episode))
            ex.log_scalar('reward', reward)
            ex.log_scalar('win_blue', int(episode[-1].rewards["blue"] == 1))
            ex.log_scalar('win_red', int(episode[-1].rewards["red"] == 1))

            if episode[-1].rewards["blue"] == 1:
                nwins += 1

        for agent in training_agents:
            loss = agent.update(batch)
            ex.log_scalar(f'loss{agent.id}', loss['loss'])

        s = f"Step {step_idx}: "
        s += f"Average length: {epi_len/args.n_episodes_per_step:5.2f} - "
        s += f"win ratio: {nwins/args.n_episodes_per_step:4.3f} - "
        print(s)
        epi_len, nwins = 0, 0

    return training_agents
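# agent.update(batch) performs a REINFORCE-style policy-gradient update, whose
# central ingredient is the discounted return following each logged action. The
# helper below is a hypothetical, simplified illustration of that return
# computation only; the actual PGAgent.update lives elsewhere in the repository.
def discounted_returns(rewards, gamma):
    """Compute G_t = r_t + gamma * G_{t+1} over one episode (illustrative)."""
    returns, g = [], 0.0
    for r in reversed(rewards):
        g = r + gamma * g
        returns.append(g)
    return list(reversed(returns))

# Example: a 3-step episode with only a terminal win reward of 1 and gamma = 0.9
# gives returns [0.81, 0.9, 1.0] for the three recorded steps.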
def test_replay(model_file, agent_type='qmix', period=None):
    import yaml
    from utilities import get_args
    from os.path import expanduser

    with open('default_config.yaml', 'r') as f:
        args = get_args(yaml.safe_load(f))
    path = expanduser("~") + args.path  # model files live under the user's home directory

    # override a few environment settings before replay
    args.gamma = 0.8
    args.max_episode_length = 30
    args.step_penalty = 0.05
    args.a_terrain = True

    if agent_type == 'qmix':
        model = QMixModel(input_shape=args.n_inputs, n_actions=args.n_actions, args=args)
        target = QMixModel(input_shape=args.n_inputs, n_actions=args.n_actions, args=args)
        model.load_state_dict(torch.load(path + model_file))
        target.load_state_dict(torch.load(path + model_file))
        models = {"model": model, "target": target}
        team_blue = [QMIXAgent(idx, "blue", args) for idx in range(args.n_friends)]
    elif agent_type == 'reinforce':
        models = RNNModel(input_shape=args.n_inputs, n_actions=args.n_actions, args=args)
        models.load_state_dict(torch.load(path + model_file))
        team_blue = [PGAgent(idx, "blue", args) for idx in range(args.n_friends)]

    for agent in team_blue:
        agent.set_model(models)

    team_red = [Agent(args.n_friends + idx, "red") for idx in range(args.n_enemies)]
    agents = team_blue + team_red
    env = RestrictedEnvironment(agents, args)

    # keep generating episodes until a short one (< 6 steps) is found, then visualize it
    while True:
        episode = generate_episode(env, args)
        print(len(episode))
        if len(episode) < 6:
            visualize(env, episode, period=period)
            break
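# test_replay expects utilities.get_args to turn the loaded YAML dictionary into
# an object with attribute access (args.gamma, args.n_friends, ...). A minimal
# stand-in with that interface is sketched below; the real get_args may add
# defaults or validation, so this is only an assumption for illustration.
from types import SimpleNamespace

def get_args_sketch(config_dict):
    """Wrap a plain config dict so its values can be read as attributes (sketch)."""
    return SimpleNamespace(**config_dict)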
def train(args):
    team_blue = [PGAgent(idx, "blue", args) for idx in range(args.n_friends)]
    team_red = [Agent(args.n_friends + idx, "red") for idx in range(args.n_enemies)]

    training_agents = team_blue
    agents = team_blue + team_red

    if args.env_type == 'normal':
        env = Environment(agents, args)
    elif args.env_type == 'restricted':
        env = RestrictedEnvironment(agents, args)

    args.n_actions = 6 + args.n_enemies  # 6 fixed actions + 1 aim action per enemy
    args.n_inputs = (4 + 3 * (args.n_friends - 1) + 3 * args.n_enemies
                     + args.n_enemies)  # see process function in models.py

    # setup model
    model = RNNModel(input_shape=args.n_inputs, n_actions=args.n_actions, args=args)
    for agent in training_agents:
        agent.set_model(model)

    epi_len, nwins = 0, 0
    n_episodes = 0
    ex.log_scalar('win', 0.0, step=n_episodes + 1)  # force the win curve to start at 0

    for step_idx in range(int(args.n_steps / args.n_episodes_per_step)):
        batch = []
        for _ in range(args.n_episodes_per_step):
            episode = generate_episode(env, args)
            n_episodes += 1
            batch.extend(episode)

            epi_len += len(episode)
            reward = episode[-1].rewards["blue"]

            ex.log_scalar('length', len(episode), step=n_episodes)
            ex.log_scalar('reward', reward, step=n_episodes)
            ex.log_scalar('win', int(episode[-1].rewards["blue"] == 1), step=n_episodes + 1)

            if episode[-1].rewards["blue"] == 1:
                nwins += 1

        for agent in training_agents:
            stats = agent.update(batch)
            ex.log_scalar(f'loss{agent.id}', stats["loss"], step=n_episodes)
            ex.log_scalar(f'grads{agent.id}', stats["grads_l2"], step=n_episodes)
            ex.log_scalar(f'grads_var{agent.id}', stats["grads_var"], step=n_episodes)

        s = f"Step {step_idx}: "
        s += f"Average length: {epi_len/args.n_episodes_per_step:5.2f} - "
        s += f"win ratio: {nwins/args.n_episodes_per_step:4.3f} - "
        print(s)
        epi_len, nwins = 0, 0

    #_ = generate_episode(env, render=True)

    from os.path import expanduser
    home = expanduser("~")
    #for agent in training_agents:
    #    agent.save(home + args.path + f'RUN_{get_run_id()}_AGENT{agent.id}.p')
    torch.save(model.state_dict(), home + args.path + f'RUN_{get_run_id()}.torch')
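# Worked example of the sizes computed above, assuming n_friends = 3 and
# n_enemies = 3 (the meaning of each per-agent feature is defined by the
# process function in models.py):
#   n_actions = 6 + 3                       = 9
#   n_inputs  = 4 + 3 * (3 - 1) + 3 * 3 + 3 = 22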
def train(args):
    team_blue = [QMIXAgent(idx, "blue", args) for idx in range(args.n_friends)]
    team_red = [Agent(idx + args.n_friends, "red") for idx in range(args.n_enemies)]

    training_agents = team_blue
    agents = team_blue + team_red

    if args.env_type == 'normal':
        env = Environment(agents, args)
    elif args.env_type == 'restricted':
        env = RestrictedEnvironment(agents, args)

    args.n_actions = 6 + args.n_enemies  # 6 fixed actions + 1 aim action per enemy
    args.n_inputs = (4 + 3 * (args.n_friends - 1) + 3 * args.n_enemies
                     + args.n_enemies)  # see process function in models.py

    models = generate_models(args.n_inputs, args.n_actions, args)
    for agent in training_agents:
        agent.set_model(models)

    buffer = ReplayBuffer(size=args.buffer_size)
    mac = MultiAgentController(env, training_agents, models, args)

    for step_idx in range(args.n_steps):
        episode = generate_episode(env, args)
        buffer.insert_list(episode)
        if len(buffer) < args.batch_size:
            continue  # wait until the buffer holds at least one full batch

        batch = buffer.sample(args.batch_size)
        loss = mac.update(batch)
        if step_idx % args.sync_interval == 0:
            mac.sync_networks()  # copy the online network into the target network

        ## logging
        ex.log_scalar('loss', loss)
        if step_idx % args.log_interval == 0:
            episode = generate_episode(env, args, test_mode=True)
            if step_idx == 0:
                # treat the very first test episode as a loss so the win curves start at 0
                episode[-1].rewards["blue"] = 0
                episode[-1].rewards["red"] = 1
            ex.log_scalar('length', len(episode), step=step_idx)
            ex.log_scalar('reward', episode[-1].rewards["blue"], step=step_idx)
            ex.log_scalar('win_blue', int(episode[-1].rewards["blue"] == 1), step=step_idx)
            ex.log_scalar('win_red', int(episode[-1].rewards["red"] == 1), step=step_idx)
            ex.log_scalar('epsilon', training_agents[0].scheduler(), step=step_idx)

        if PRINT and step_idx > 0 and step_idx % PRINT_INTERVAL == 0:
            print(f"Step {step_idx}: loss = {loss}, reward = {episode[-1].rewards['blue']}")
            #episode = generate_episode(env, render=True)

        if args.save_model and step_idx > 0 and step_idx % args.save_model_interval == 0:
            from os.path import expanduser
            home = expanduser("~")
            torch.save(models["model"].state_dict(),
                       home + args.path + f'RUN_{get_run_id()}_MODEL.torch')
            if args.use_mixer:
                torch.save(mac.mixer.state_dict(),
                           home + args.path + f'RUN_{get_run_id()}_MIXER.torch')
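# Both training loops log training_agents[0].scheduler() as 'epsilon', so the
# scheduler is assumed to be a zero-argument callable returning the current
# exploration rate. The linear-decay sketch below is a hypothetical example of
# such an object; the start/end values and decay horizon are illustrative only.
class LinearEpsilonScheduler:
    """Linearly anneal epsilon from `start` to `end` over `decay_steps` (sketch)."""

    def __init__(self, start=1.0, end=0.05, decay_steps=10000):
        self.start, self.end, self.decay_steps = start, end, decay_steps
        self.t = 0

    def step(self):
        # advance one environment step; call wherever exploration progresses
        self.t = min(self.t + 1, self.decay_steps)

    def __call__(self):
        # current epsilon, as queried by the logging code above
        frac = self.t / self.decay_steps
        return self.start + frac * (self.end - self.start)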