def worker(worker_id, master_end, worker_end, game_params, map_name,
           obs_proc_params, action_dict):
    master_end.close()  # Forbid worker to use the master end for messaging
    np.random.seed()  # re-seed so that each worker draws a different env seed below
    env = init_game(game_params, map_name, random_seed=np.random.randint(10000))
    op = ObsProcesser(**obs_proc_params)

    while True:
        cmd, data = worker_end.recv()
        if cmd == 'step':
            obs = env.step([data])
            state_trg_dict, _ = op.get_state(obs)  # returns (state_dict, names_dict)
            state_trg = merge_screen_and_minimap(state_trg_dict)
            reward = obs[0].reward
            done = obs[0].last()

            # Always bootstrap when the episode finishes (in MoveToBeacon there is no real end)
            if done:
                bootstrap = True
            else:
                bootstrap = False

            # state_trg is the state used as the next state for the update;
            # state is the new state used to decide the next action
            # (they differ if the episode ends and another one begins)
            if done:
                obs = reset_and_skip_first_frame(env)
                state_dict, _ = op.get_state(obs)  # returns (state_dict, names_dict)
                state = merge_screen_and_minimap(state_dict)
            else:
                state = state_trg

            available_actions = obs[0].observation.available_actions
            action_mask = get_action_mask(available_actions, action_dict)
            worker_end.send((state, reward, done, bootstrap, state_trg, action_mask))

        elif cmd == 'reset':
            obs = reset_and_skip_first_frame(env)
            state_dict, _ = op.get_state(obs)  # returns (state_dict, names_dict)
            state = merge_screen_and_minimap(state_dict)
            available_actions = obs[0].observation.available_actions
            action_mask = get_action_mask(available_actions, action_dict)
            worker_end.send((state, action_mask))

        elif cmd == 'close':
            worker_end.close()
            break
        else:
            raise NotImplementedError
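# The training functions below drive several copies of worker() through a
# ParallelEnv object that is not part of this listing. The sketch below is a
# hypothetical master-side counterpart, inferred only from the Pipe protocol
# visible in worker() ('step'/'reset'/'close' commands and the tuples it sends
# back); the class name ParallelEnvSketch and all implementation details are
# assumptions, not the repository's actual ParallelEnv.
import multiprocessing as mp
import numpy as np


class ParallelEnvSketch:
    def __init__(self, n_workers, game_params, map_name, obs_proc_params, action_dict):
        self.master_ends, self.processes = [], []
        for worker_id in range(n_workers):
            master_end, worker_end = mp.Pipe()
            p = mp.Process(target=worker,
                           args=(worker_id, master_end, worker_end,
                                 game_params, map_name, obs_proc_params, action_dict),
                           daemon=True)
            p.start()
            worker_end.close()  # the master keeps only its own end of each pipe
            self.master_ends.append(master_end)
            self.processes.append(p)

    def reset(self):
        # Ask every worker for its initial state and action mask
        for m in self.master_ends:
            m.send(('reset', None))
        states, masks = zip(*[m.recv() for m in self.master_ends])
        return np.stack(states), np.stack(masks)

    def step(self, actions):
        # One action per worker; replies are stacked along the batch dimension
        for m, a in zip(self.master_ends, actions):
            m.send(('step', a))
        replies = [m.recv() for m in self.master_ends]
        s, r, done, bootstrap, s_trg, mask = zip(*replies)
        return (np.stack(s), np.stack(r), np.stack(done),
                np.stack(bootstrap), np.stack(s_trg), np.stack(mask))

    def close(self):
        for m in self.master_ends:
            m.send(('close', None))
        for p in self.processes:
            p.join()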
def main():
    # Environment parameters
    RESOLUTION = args.res
    game_params = dict(feature_screen=RESOLUTION,
                       feature_minimap=RESOLUTION,
                       action_space="FEATURES")
    game_names = [
        'MoveToBeacon', 'CollectMineralShards', 'DefeatRoaches',
        'FindAndDefeatZerglings', 'DefeatZerglingsAndBanelings',
        'CollectMineralsAndGas', 'BuildMarines'
    ]
    map_name = args.map_name
    if map_name not in game_names:
        raise Exception("map name " + map_name + " not recognized.")
    env = init_game(game_params, map_name)

    # Action and state space params
    if args.select_all_layers:
        obs_proc_params = {'select_all': True}
    else:
        obs_proc_params = {
            'screen_names': args.screen_names,
            'minimap_names': args.minimap_names
        }
    op = ObsProcesser(**obs_proc_params)
    screen_channels, minimap_channels = op.get_n_channels()
    in_channels = screen_channels + minimap_channels
    action_dict = get_action_dict(args.action_names)
    print(action_dict)
    action_space = len(action_dict)

    # A2C params
    spatial_model = net.FullyConvSpatial
    nonspatial_model = net.FullyConvNonSpatial
    embed_dim = args.embed_dim
    spatial_dict = {"in_channels": in_channels}
    nonspatial_dict = {'resolution': RESOLUTION, 'kernel_size': 3, 'stride': 2}

    HPs = dict(action_space=action_space,
               n_steps=args.n_steps,
               H=7e-2,
               spatial_model=spatial_model,
               nonspatial_model=nonspatial_model,
               n_features=args.n_features,
               n_channels=args.n_channels,
               spatial_dict=spatial_dict,
               nonspatial_dict=nonspatial_dict,
               action_dict=action_dict)

    if torch.cuda.is_available():
        HPs['device'] = 'cuda'
    else:
        HPs['device'] = 'cpu'
    print("Using device " + HPs['device'])

    version = args.A2C_version
    if version == 1:
        HPs = {**HPs, 'embed_dim': embed_dim}
        agent = SpatialA2C_v1(env=env, **HPs)
    elif version == 2:
        # no action embedding
        agent = SpatialA2C_v2(env=env, **HPs)
    elif version == 3:
        agent = SpatialA2C_v3(env=env, **HPs)
    else:
        raise Exception("Version not implemented.")
    env.close()

    # Training args
    train_dict = dict(n_train_processes=args.n_train_processes,
                      max_train_steps=args.max_train_steps,
                      unroll_length=args.traj_length,
                      test_interval=args.test_interval,
                      inspection_interval=args.inspection_interval)

    # Create paths if they do not exist
    if not os.path.isdir(args.save_dir):
        os.system("mkdir " + args.save_dir)
    if not os.path.isdir(args.save_dir + map_name):
        os.system("mkdir " + args.save_dir + map_name)

    # Actual training
    results = train_batched_A2C(agent,
                                game_params,
                                map_name,
                                args.lr,
                                obs_proc_params=obs_proc_params,
                                action_dict=action_dict,
                                save_path=args.save_dir + map_name,
                                **train_dict)
    score, losses, trained_agent, PID = results

    # Save results
    save = True
    keywords = [
        map_name,
        'lr-' + str(args.lr),
        str(args.n_steps) + '-steps',
        str(args.res) + '-res',
        str(args.max_train_steps) + "-env-steps",
        str(args.traj_length) + "-unroll-len",
        str(in_channels) + '-in-channels'
    ]
    if save:
        save_dir = args.save_dir + map_name + "/"
        os.system('mkdir ' + save_dir)
        keywords.append(PID)
        filename = '_'.join(keywords)
        filename = 'S_' + filename
        print("Save at " + save_dir + filename)
        train_session_dict = dict(game_params=game_params,
                                  HPs=HPs,
                                  score=score,
                                  n_epochs=len(score),
                                  keywords=keywords,
                                  losses=losses)
        np.save(save_dir + filename, train_session_dict)
        torch.save(trained_agent, save_dir + "agent_" + PID)
    else:
        print("Nothing saved")
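# main() reads its configuration from a module-level `args` object that is not
# shown in this listing. A minimal argparse sketch that would supply the
# attributes accessed above could look like the following; the option names
# mirror those attributes, while the types, nargs choices, and defaults are
# assumptions made only for illustration.
import argparse

parser = argparse.ArgumentParser(description="Batched A2C training on PySC2 minigames (sketch)")
parser.add_argument('--res', type=int, default=32)
parser.add_argument('--map_name', type=str, default='MoveToBeacon')
parser.add_argument('--select_all_layers', action='store_true')
parser.add_argument('--screen_names', nargs='*', default=[])
parser.add_argument('--minimap_names', nargs='*', default=[])
parser.add_argument('--action_names', nargs='*', default=[])
parser.add_argument('--embed_dim', type=int)
parser.add_argument('--n_steps', type=int)
parser.add_argument('--n_features', type=int)
parser.add_argument('--n_channels', type=int)
parser.add_argument('--A2C_version', type=int, default=1)
parser.add_argument('--n_train_processes', type=int)
parser.add_argument('--max_train_steps', type=int)
parser.add_argument('--traj_length', type=int)
parser.add_argument('--test_interval', type=int, default=100)
parser.add_argument('--inspection_interval', type=int, default=120000)
parser.add_argument('--save_dir', type=str, default='../Results/')
parser.add_argument('--lr', type=float)
args = parser.parse_args()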
def train_batched_A2C(agent,
                      game_params,
                      map_name,
                      lr,
                      n_train_processes,
                      max_train_steps,
                      unroll_length,
                      obs_proc_params,
                      action_dict,
                      test_interval=100,
                      num_tests=5,
                      inspection_interval=120000,
                      save_path=None):
    if save_path is None:
        save_path = "../Results/" + map_name
    replay_dict = dict(save_replay_episodes=num_tests,
                       replay_dir='Replays/',
                       replay_prefix='A2C_' + map_name)
    test_env = init_game(game_params, map_name, **replay_dict)  # save just test episodes
    op = ObsProcesser(**obs_proc_params)
    envs = ParallelEnv(n_train_processes, game_params, map_name,
                       obs_proc_params, action_dict)

    optimizer = torch.optim.Adam(agent.AC.parameters(), lr=lr)
    #H_schedule = H_linear_schedule(agent.H, agent.H/10, max_train_steps)

    PID = gen_PID()
    print("Process ID: ", PID)
    score = []
    critic_losses = []
    actor_losses = []
    entropy_losses = []

    step_idx = 0
    s, a_mask = envs.reset()  # reset manually only at the beginning
    while step_idx < max_train_steps:
        s_lst, r_lst, done_lst, bootstrap_lst, s_trg_lst = \
            list(), list(), list(), list(), list()
        log_probs = []
        entropies = []
        for _ in range(unroll_length):
            a, log_prob, entropy = agent.step(s, a_mask)  # variables with gradient
            log_probs.append(log_prob)
            entropies.append(entropy)

            s_prime, r, done, bootstrap, s_trg, a_mask = envs.step(a)
            s_lst.append(s)
            r_lst.append(r)
            done_lst.append(done)
            bootstrap_lst.append(bootstrap)
            s_trg_lst.append(s_trg)

            s = s_prime
            step_idx += 1  #n_train_processes

        # all variables without gradient, batch first, then episode length
        s_lst = np.array(s_lst).transpose(1, 0, 2, 3, 4)
        r_lst = np.array(r_lst).transpose(1, 0)
        done_lst = np.array(done_lst).transpose(1, 0)
        bootstrap_lst = np.array(bootstrap_lst).transpose(1, 0)
        s_trg_lst = np.array(s_trg_lst).transpose(1, 0, 2, 3, 4)

        critic_loss, actor_loss, entropy_term = agent.compute_ac_loss(
            r_lst, log_probs, entropies, s_lst, done_lst, bootstrap_lst, s_trg_lst)

        loss = (critic_loss + actor_loss).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        critic_losses.append(critic_loss.item())
        actor_losses.append(actor_loss.item())
        entropy_losses.append(entropy_term.item())

        #H = H_schedule.get_H(step_idx)
        #agent.H = H

        ### Test time ###
        if step_idx % test_interval == 0:
            if not os.path.isdir(save_path + '/Logging/'):
                os.system('mkdir ' + save_path + '/Logging/')
            if step_idx // test_interval == 1:
                with open(save_path + '/Logging/' + PID + '.txt', 'a+') as f:
                    print("#Steps,score", file=f)
            avg_score = test(step_idx, agent, test_env, PID, op, action_dict,
                             num_tests, save_path)
            score.append(avg_score)

        if inspection and (step_idx % inspection_interval == 0):
            inspector = inspection_test(step_idx, agent, test_env, PID, op,
                                        action_dict)
            # save episode for inspection and model weights at that point
            if not os.path.isdir(save_path):
                os.system('mkdir ' + save_path)
            if not os.path.isdir(save_path + '/Inspection/'):
                os.system('mkdir ' + save_path + '/Inspection/')
            if not os.path.isdir(save_path + '/Checkpoints/'):
                os.system('mkdir ' + save_path + '/Checkpoints/')
            inspector.save_dict(path=save_path + '/Inspection/')
            torch.save(agent.AC.state_dict(),
                       save_path + '/Checkpoints/' + PID + '_' + str(step_idx))
            torch.save(optimizer.state_dict(),
                       save_path + '/Checkpoints/optim_' + PID + '_' + str(step_idx))

    envs.close()
    losses = dict(critic_losses=critic_losses,
                  actor_losses=actor_losses,
                  entropies=entropy_losses)
    return score, losses, agent, PID
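# The commented-out H_schedule lines in train_batched_A2C refer to an
# H_linear_schedule class that is not part of this listing. A minimal sketch,
# assuming (from the commented calls) a constructor H_linear_schedule(H_init,
# H_final, max_steps) and a get_H(step_idx) method that linearly anneals the
# entropy coefficient, could look like this; the implementation below is an
# illustration, not the repository's actual class.
class H_linear_schedule:
    def __init__(self, H_init, H_final, max_steps):
        self.H_init = H_init
        self.H_final = H_final
        self.max_steps = max_steps

    def get_H(self, step_idx):
        # Clamp the fraction of training completed to [0, 1] and interpolate
        frac = min(max(step_idx / self.max_steps, 0.0), 1.0)
        return self.H_init + frac * (self.H_final - self.H_init)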
def train_from_checkpoint(agent,
                          PID,
                          step_idx,
                          filename,
                          game_params,
                          map_name,
                          lr,
                          n_train_processes,
                          max_train_steps,
                          unroll_length,
                          obs_proc_params,
                          action_dict,
                          test_interval=100,
                          num_tests=5,
                          inspection_interval=120000,
                          save_path=None):
    if save_path is None:
        save_path = "../Results/" + map_name
    replay_dict = dict(save_replay_episodes=num_tests,
                       replay_dir='Replays/',
                       replay_prefix='A2C_' + map_name)
    test_env = init_game(game_params, map_name, **replay_dict)  # save just test episodes
    op = ObsProcesser(**obs_proc_params)
    envs = ParallelEnv(n_train_processes, game_params, map_name,
                       obs_proc_params, action_dict)

    optimizer = torch.optim.Adam(agent.AC.parameters(), lr=lr)

    ### Different from train_batched_A2C ###
    # Load checkpoints
    agent.AC.load_state_dict(
        torch.load(save_path + '/Checkpoints/' + PID + "_" + str(step_idx)))
    # for backward compatibility with runs that did not save the optimizer state dict
    if os.path.isfile(save_path + '/Checkpoints/optim_' + PID + "_" + str(step_idx)):
        print("Loading optimizer checkpoint " + PID + "_" + str(step_idx))
        optimizer.load_state_dict(
            torch.load(save_path + '/Checkpoints/optim_' + PID + "_" + str(step_idx)))

    max_train_steps = max_train_steps + step_idx  # add initial offset

    # Load score and losses up to step_idx if available
    if os.path.isfile(save_path + filename + '.npy'):
        # .item() unwraps the pickled dict from the 0-d array returned by np.load
        train_session_dict = np.load(save_path + filename + '.npy',
                                     allow_pickle=True).item()
        losses = train_session_dict['losses']
        # Cut everything at step_idx, assuming that the test and inspection
        # intervals remained the same
        score = train_session_dict['score'][:step_idx // test_interval]
        print("len(score): ", len(score))
        critic_losses = losses['critic_losses'][:step_idx // unroll_length]
        actor_losses = losses['actor_losses'][:step_idx // unroll_length]
        # the losses dict is saved with the key 'entropies' (see train_batched_A2C)
        entropy_losses = losses['entropies'][:step_idx // unroll_length]
        print("len(critic_losses): ", len(critic_losses))
    else:
        print("Unfortunately it wasn't possible to load the session dictionary at "
              + save_path + filename + '.npy')
        score = []
        critic_losses = []
        actor_losses = []
        entropy_losses = []

    #PID = gen_PID()  # already defined
    #step_idx = 0  # already defined
    ### End of new part ###

    print("Process ID: ", PID)

    s, a_mask = envs.reset()  # reset manually only at the beginning
    while step_idx < max_train_steps:
        s_lst, r_lst, done_lst, bootstrap_lst, s_trg_lst = \
            list(), list(), list(), list(), list()
        log_probs = []
        entropies = []
        for _ in range(unroll_length):
            a, log_prob, entropy = agent.step(s, a_mask)  # variables with gradient
            log_probs.append(log_prob)
            entropies.append(entropy)

            s_prime, r, done, bootstrap, s_trg, a_mask = envs.step(a)
            s_lst.append(s)
            r_lst.append(r)
            done_lst.append(done)
            bootstrap_lst.append(bootstrap)
            s_trg_lst.append(s_trg)

            s = s_prime
            step_idx += 1  #n_train_processes

        # all variables without gradient, batch first, then episode length
        s_lst = np.array(s_lst).transpose(1, 0, 2, 3, 4)
        r_lst = np.array(r_lst).transpose(1, 0)
        done_lst = np.array(done_lst).transpose(1, 0)
        bootstrap_lst = np.array(bootstrap_lst).transpose(1, 0)
        s_trg_lst = np.array(s_trg_lst).transpose(1, 0, 2, 3, 4)

        critic_loss, actor_loss, entropy_term = agent.compute_ac_loss(
            r_lst, log_probs, entropies, s_lst, done_lst, bootstrap_lst, s_trg_lst)

        loss = (critic_loss + actor_loss).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        critic_losses.append(critic_loss.item())
        actor_losses.append(actor_loss.item())
        entropy_losses.append(entropy_term.item())

        ### Test time ###
        if step_idx % test_interval == 0:
            if not os.path.isdir(save_path + '/Logging/'):
                os.system('mkdir ' + save_path + '/Logging/')
            if step_idx // test_interval == 1:
                with open(save_path + '/Logging/' + PID + '.txt', 'a+') as f:
                    print("#Steps,score", file=f)
            avg_score = test(step_idx, agent, test_env, PID, op, action_dict,
                             num_tests, save_path)
            score.append(avg_score)

        if inspection and (step_idx % inspection_interval == 0):
            inspector = inspection_test(step_idx, agent, test_env, PID, op,
                                        action_dict)
            # save episode for inspection and model weights at that point
            if not os.path.isdir(save_path):
                os.system('mkdir ' + save_path)
            if not os.path.isdir(save_path + '/Inspection/'):
                os.system('mkdir ' + save_path + '/Inspection/')
            if not os.path.isdir(save_path + '/Checkpoints/'):
                os.system('mkdir ' + save_path + '/Checkpoints/')
            inspector.save_dict(path=save_path + '/Inspection/')
            torch.save(agent.AC.state_dict(),
                       save_path + '/Checkpoints/' + PID + '_' + str(step_idx))
            torch.save(optimizer.state_dict(),
                       save_path + '/Checkpoints/optim_' + PID + '_' + str(step_idx))

    envs.close()
    losses = dict(critic_losses=critic_losses,
                  actor_losses=actor_losses,
                  entropies=entropy_losses)
    return score, losses, agent, PID
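# Hypothetical resume call for train_from_checkpoint: PID, step_idx and filename
# identify a previous run whose weights were saved under
# save_path + '/Checkpoints/' + PID + '_' + str(step_idx). All values below are
# placeholders; note that filename is concatenated to save_path without an extra
# separator, so it should start with '/'.
results = train_from_checkpoint(agent,
                                PID='a1b2c3',
                                step_idx=120000,
                                filename='/S_MoveToBeacon_previous_run',
                                game_params=game_params,
                                map_name='MoveToBeacon',
                                lr=1e-4,
                                n_train_processes=4,
                                max_train_steps=100000,
                                unroll_length=60,
                                obs_proc_params=obs_proc_params,
                                action_dict=action_dict,
                                save_path='../Results/MoveToBeacon')
score, losses, trained_agent, PID = results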
# Alternative variant of train_batched_A2C: it takes a max_episode_steps argument
# instead of action_dict/save_path, resets the environments at the start of every
# unroll, and writes checkpoints to a fixed MoveToBeacon path.
def train_batched_A2C(agent, game_params, map_name, lr, n_train_processes,
                      max_train_steps, unroll_length, max_episode_steps,
                      obs_proc_params, test_interval=100, num_tests=5):
    replay_dict = dict(save_replay_episodes=num_tests,
                       replay_dir='Replays/',
                       replay_prefix='A2C_' + map_name)
    test_env = init_game(game_params, map_name, max_episode_steps,
                         **replay_dict)  # save just test episodes
    op = ObsProcesser(**obs_proc_params)
    envs = ParallelEnv(n_train_processes, game_params, map_name,
                       max_episode_steps, obs_proc_params)

    optimizer = torch.optim.Adam(agent.AC.parameters(), lr=lr)

    PID = gen_PID()
    print("Process ID: ", PID)
    score = []
    critic_losses = []
    actor_losses = []
    entropy_losses = []

    step_idx = 0
    while step_idx < max_train_steps:
        s_lst, r_lst, done_lst, bootstrap_lst, s_trg_lst = \
            list(), list(), list(), list(), list()
        log_probs = []
        entropies = []
        s, a_mask = envs.reset()  # reset at the start of every unroll
        for _ in range(unroll_length):
            a, log_prob, entropy = agent.step(s, a_mask)  # variables with gradient
            log_probs.append(log_prob)
            entropies.append(entropy)

            s_prime, r, done, bootstrap, s_trg, a_mask = envs.step(a)
            s_lst.append(s)
            r_lst.append(r)
            done_lst.append(done)
            bootstrap_lst.append(bootstrap)
            s_trg_lst.append(s_trg)

            s = s_prime
            step_idx += 1  #n_train_processes

        # all variables without gradient
        s_lst = np.array(s_lst).transpose(1, 0, 2, 3, 4)
        r_lst = np.array(r_lst).transpose(1, 0)
        done_lst = np.array(done_lst).transpose(1, 0)
        bootstrap_lst = np.array(bootstrap_lst).transpose(1, 0)
        s_trg_lst = np.array(s_trg_lst).transpose(1, 0, 2, 3, 4)

        critic_loss, actor_loss, entropy_term = agent.compute_ac_loss(
            r_lst, log_probs, entropies, s_lst, done_lst, bootstrap_lst, s_trg_lst)

        loss = (critic_loss + actor_loss).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        critic_losses.append(critic_loss.item())
        actor_losses.append(actor_loss.item())
        entropy_losses.append(entropy_term.item())

        ### Test time ###
        if step_idx % test_interval == 0:
            if inspection:
                avg_score, inspector = test(step_idx, agent, test_env, PID, op,
                                            num_tests)
                # save episode for inspection and model weights at that point
                inspector.save_dict()
                torch.save(agent.AC.state_dict(),
                           "../Results/MoveToBeacon/Checkpoints/" + PID + "_" + str(step_idx))
            else:
                avg_score = test(step_idx, agent, test_env, PID, op, num_tests)
            score.append(avg_score)

    envs.close()
    losses = dict(critic_losses=critic_losses,
                  actor_losses=actor_losses,
                  entropies=entropy_losses)
    return score, losses, agent, PID