import time

from gym_derk.envs import DerkEnv


def benchmark(simulation_only=False, format="csv", arenas=[1, 16, 128, 256, 512]):
    """Run benchmark

    Args:
        simulation_only: Skip sending actions and observations, to get a
            benchmark of just how the simulation is performing
    """
    delim = ', ' if format == 'csv' else ' | '
    first = True
    for n_arenas in arenas:
        env_start = time.time()
        env = DerkEnv(n_arenas=n_arenas,
                      turbo_mode=True,
                      session_args={'debug_no_observations': simulation_only})
        if first:
            first = False
            print('simulation_only=' + str(simulation_only) + ' ' + env.app.get_webgl_renderer())
            if format == 'csv':
                print('"n_arenas", "create env", "reset", "run"')
            else:
                print('n_arenas | create env | reset | run')
                print('--- | --- | --- | ---')
        print(str(n_arenas) + delim, end="")
        print(str(time.time() - env_start) + delim, end="")
        # action_space.sample() can take a lot of time, so run it once outside the loop
        action_n = None if simulation_only else [
            env.action_space.sample() for i in range(env.n_agents)
        ]
        n_samples = 20
        reset_time = 0
        step_time = 0
        for i in range(n_samples):
            reset_start = time.time()
            observation_n = env.reset()
            reset_time = reset_time + time.time() - reset_start
            run_start = time.time()
            while True:
                observation_n, reward_n, done_n, info_n = env.step(action_n)
                if all(done_n):
                    break
            step_time = step_time + time.time() - run_start
        print(str(reset_time / n_samples) + delim, end="")
        print(str(step_time / n_samples))
        env.close()
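

# A minimal usage sketch (this __main__ guard is not part of the original
# benchmark code): `format` accepts 'csv' for comma-separated output, and
# any other value falls through to the markdown-table branch above.
if __name__ == '__main__':
    benchmark(simulation_only=True, format='md', arenas=[1, 16, 128])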
        # Clipped PPO surrogate: clamp the policy probability ratio
        # (computed above as `ratio`) to [1 - eps_clip, 1 + eps_clip],
        # then weight by the minibatch advantages.
        surrogate_loss2 = torch.clamp(ratio, 1 - self.eps_clip,
                                      1 + self.eps_clip).sum(axis=1) * minibatch_adv
        loss = (-torch.min(surrogate_loss1, surrogate_loss2)
                - self.entropy_coeff * entropy.sum(axis=1))
        self.optimizer.zero_grad()
        loss.mean().backward()
        self.optimizer.step()


device = "cuda:0"
ITERATIONS = 1000000
discount = 1
agent = nn_agent(512, device)
env = DerkEnv(n_arenas=800,
              turbo_mode=True,
              reward_function=win_loss_reward_function,
              home_team=classes_team_config,
              away_team=classes_team_config)

past_selves_ratio = 0.2
save_model_every = 10
eval_against_gap = 40
past_models = []
portion_controlled_by_curr = 1 - (past_selves_ratio / 2)
model_checkpoint_schedule = [int(i**1.5) for i in range(1000)]
save_folder = "checkpoints/PPO-GAE-" + str(time.time())
os.mkdir(save_folder)

for iteration in range(ITERATIONS):
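

# Sketch, not from the original file: the "PPO-GAE" checkpoint name above
# suggests the advantages used as `minibatch_adv` come from Generalized
# Advantage Estimation. A standalone version of that computation could look
# like the following; `rewards`, `values`, and `dones` are assumed to be
# per-step sequences for one rollout, and gamma/lam are assumed
# hyperparameters.
def compute_gae(rewards, values, dones, gamma=0.99, lam=0.95):
    advantages = torch.zeros(len(rewards))
    last_adv = 0.0
    for t in reversed(range(len(rewards))):
        # Bootstrap from the next state's value, zeroed at episode ends
        next_value = 0.0 if dones[t] or t + 1 >= len(values) else values[t + 1]
        # TD residual: r_t + gamma * V(s_{t+1}) - V(s_t)
        delta = rewards[t] + gamma * next_value - values[t]
        # Exponentially weighted sum of residuals, reset at episode ends
        last_adv = delta + gamma * lam * (0.0 if dones[t] else last_adv)
        advantages[t] = last_adv
    return advantages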
]
tail_weapons = ["HealingGland", "VampireGland", "ParalyzingDart"]

max_arenas = 800
teams_per_member = max_arenas // (len(league) // 2)
n_arenas = (len(league) * teams_per_member) // 2
random_configs = [{
    "slots": [
        random.choice(arm_weapons),
        random.choice(misc_weapons),
        random.choice(tail_weapons)
    ]
} for i in range(3 * n_arenas // 2)]
env = DerkEnv(n_arenas=n_arenas,
              turbo_mode=True,
              home_team=random_configs,
              away_team=random_configs)

for i in range(1):
    # Randomize matchings between league members
    scrambled_team_IDS = np.random.permutation(env.n_agents // 3)
    league_agent_mappings = []
    for member in range(len(league)):
        member_matches = scrambled_team_IDS[teams_per_member * member:
                                            teams_per_member * (member + 1)]
        # Each team owns three consecutive agent slots; expand team indices
        # to agent indices.
        league_agent_mappings.append(
            np.concatenate([(member_matches * 3) + j for j in range(3)], axis=0))
    observation = [[] for i in range(len(league))]
    action = [[] for i in range(len(league))]
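

# Illustrative helper (an assumption, not part of the original script): with
# the mappings built above, one league member's rows of any flat per-agent
# array (observations, rewards, ...) can be selected with numpy fancy
# indexing.
def member_view(flat_array, league_agent_mappings, member_id):
    # Rows of an (n_agents, ...) array belonging to one league member
    return np.asarray(flat_array)[league_agent_mappings[member_id]]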
misc_weapons = [
    "FrogLegs", "IronBubblegum", "HeliumBubblegum", "Shell", "Trombone"
]
tail_weapons = ["HealingGland", "VampireGland", "ParalyzingDart"]

n_arenas = 80
random_configs = [{
    "slots": [
        random.choice(arm_weapons),
        random.choice(misc_weapons),
        random.choice(tail_weapons)
    ]
} for i in range(3 * n_arenas // 2)]
env = DerkEnv(n_arenas=n_arenas,
              turbo_mode=True,
              reward_function=win_loss_reward_function,
              home_team=random_configs,
              away_team=random_configs)

save_model_every = 100
eval_against_gap = 100
past_models = []
model_checkpoint_schedule = [int(i**1.5) for i in range(1000)]
save_folder = "checkpoints/PPO-LSTM-" + str(time.time())
os.mkdir(save_folder)

for iteration in range(ITERATIONS):
    print("\n-----------------------------ITERATION " + str(iteration) +
          "-----------------------------")
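    # Illustrative lines, not from the original file: one way the
    # model_checkpoint_schedule above might be consumed is to snapshot the
    # current model whenever the iteration number appears in it (`agent` is
    # an assumed handle to the network being trained, and `copy` is assumed
    # to be imported).
    if iteration in model_checkpoint_schedule:
        past_models.append(copy.deepcopy(agent))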
"timeScaling": 0.8, } env = DerkEnv(mode="train", turbo_mode=True, home_team=[{ 'primaryColor': '#ff00ff' }, { 'primaryColor': '#00ff00', 'slots': ['Talons', None, None] }, { 'primaryColor': '#ff0000', 'rewardFunction': { 'healTeammate1': 1 } }], away_team=[{ 'primaryColor': '#c0c0c0' }, { 'primaryColor': 'navy', 'slots': ['Talons', None, None] }, { 'primaryColor': 'red', 'rewardFunction': { 'healTeammate1': 1 } }], session_args={"reward_function": REWARD_FUNCTION}) if os.path.exists(NPZ_FILENAME): with np.load(NPZ_FILENAME) as data:
        return total_log_prob

    def update(self, obs, act, adv):
        # Vanilla policy-gradient step: maximize log-prob weighted by advantage
        logprob_pi = self.get_log_prob(obs, torch.Tensor(act).to(self.device))
        self.optimizer.zero_grad()
        loss = torch.sum(-logprob_pi * torch.Tensor(adv).to(self.device))
        loss.backward()
        self.optimizer.step()


device = "cuda:0"
ITERATIONS = 1000000
discount = 0.99
agent = nn_agent(512, device)
env = DerkEnv(n_arenas=400, turbo_mode=True, reward_function=reward_function)

save_model_every = 10
play_against_gap = 30
past_models = []

for iteration in range(ITERATIONS):
    print("\n-----------------------------ITERATION " + str(iteration) +
          "-----------------------------")
    # Periodically snapshot the current agent so it can serve as a past opponent
    if iteration % save_model_every == 0:
        past_models.append(copy.deepcopy(agent))
    observation = []
    done = []
    action = []
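

# Sketch (an assumption, not from the original file): with the plain
# policy-gradient update above, `adv` is typically the discounted
# return-to-go at each timestep, e.g. for one episode's reward sequence:
def discounted_returns(rewards, discount=0.99):
    returns = [0.0] * len(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + discount * running
        returns[t] = running
    return returns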
# f.add_media("protein.gif")
# f.add_text("#hashtag")

env = DerkEnv(mode="train",
              turbo_mode=True,
              n_arenas=args.n_arenas,
              home_team=[{
                  'primaryColor': '#3AA8C1'
              }, {
                  'primaryColor': '#BD559C',
                  'slots': ['Talons', None, None]
              }, {
                  'primaryColor': '#832A0D',
                  'rewardFunction': {'healTeammate1': 1}
              }],
              away_team=[{
                  'primaryColor': '#2D5DA1'
              }, {
                  'primaryColor': '#D05340',
                  'slots': ['Talons', None, None]
              }, {
                  'primaryColor': '#FBE870',
                  'rewardFunction': {'healTeammate1': 1}
              }],
              session_args={"reward_function": REWARD_FUNCTION})

main(env,
     n_episodes=10000,
     start_training_at=max(args.batch_size * 2, 200),
def main():
    seed = 2531
    np.random.seed(seed)
    torch.manual_seed(seed)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    pretrained = True
    learning_rate = 5e-3
    batch_size = 256
    per_epoch_updating = 1
    max_game_history_size = 300
    game_epochs = 30_000
    training_epochs = 40

    estimator = QNet().to(device)
    env = DerkEnv(mode="train",
                  turbo_mode=True,
                  home_team=[{
                      'primaryColor': '#ff00ff'
                  }, {
                      'primaryColor': '#00ff00',
                      'slots': ['Talons', None, None]
                  }, {
                      'primaryColor': '#ff0000',
                      'rewardFunction': {'healTeammate1': 1}
                  }],
                  away_team=[{
                      'primaryColor': '#c0c0c0'
                  }, {
                      'primaryColor': 'navy',
                      'slots': ['Talons', None, None]
                  }, {
                      'primaryColor': 'red',
                      'rewardFunction': {'healTeammate1': 1}
                  }],
                  session_args={"reward_function": estimator.reward_function})
    game_history = GameHistory(max_game_history_size)
    if exists(config.weights_path) and exists(config.reward_function_path) and pretrained:
        estimator.load_parameters(config.weights_path, config.reward_function_path)
    optimizer = optim.Adam(estimator.parameters(), lr=learning_rate)
    loss_func = nn.MSELoss()
    agent = DerkAgent(env.n_agents, estimator, device=device)

    try:
        for i_epoch in range(1, game_epochs + 1):
            # Linearly anneal epsilon from 0.2 down to a floor of 0.01
            epsilon = max(0.01, 0.2 - 0.01 * (i_epoch / 200))
            agent.update_epsilon(epsilon)
            game_history.reset()
            epoch_games_history_collection(env, agent, game_history)
            epoch_training(estimator, optimizer, loss_func, game_history,
                           batch_size, training_epochs, device)
            if i_epoch % per_epoch_updating == 0:
                print(f'Games epoch: {i_epoch} - Total reward: {env.total_reward}')
                agent.update_estimator(estimator)
                save_parameters(estimator, config.weights_history_path, i_epoch)
    except KeyboardInterrupt:
        print('Interrupted')
    finally:
        print('*Game closing*')
        env.close()
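

# Standard entry point (assumed; the fragment above does not show how
# main() is invoked):
if __name__ == '__main__':
    main()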