def main():
    """Run one CoolGame episode between two MCTS agents and print the trajectory.

    Demo entry point: builds a TorchBot-vs-NailBot task (botA_type=1,
    botB_type=2 — presumably the bot legend used elsewhere; TODO confirm)
    and pits two identically-configured MCTS agents against each other.
    """
    task = generate_task('CoolGame-v0', EnvType.MULTIAGENT_SIMULTANEOUS_ACTION,
                         botA_type=1, botB_type=2)
    mcts_config = {
        'budget': 10,
        'rollout_budget': 1000,
        'selection_phase': 'ucb1',
        'exploration_factor_ucb1': 4  # Might need to tweak this?
    }
    mcts_r1 = build_MCTS_Agent(task, mcts_config, agent_name='P1: MCTS')
    mcts_r2 = build_MCTS_Agent(task, mcts_config, agent_name='P2: MCTS')
    # NOTE: removed unused Random/Human agents and the commented-out
    # gif-rendering call that previously cluttered this demo.
    t = task.run_episode([mcts_r1, mcts_r2], training=False)
    print(t)
def generate_evaluation_matrix(cool_game_params, benchmarking_episodes, mcts_budget):
    """Benchmark the three CoolGame bot matchups and build a winrate matrix.

    Bot legend: 0 = SawBot, 1 = TorchBot, 2 = NailBot.
    Winrates are measured from an MCTS agent playing both sides, logged to
    the module logger and to wandb, and arranged into a 3x3 antisymmetric-style
    matrix where entry [i][j] is bot i's winrate against bot j.
    """
    import gym_cool_game

    # All three pairwise tasks share the same environment parameters.
    def make_task(bot_a, bot_b):
        return generate_task('CoolGame-v0',
                             EnvType.MULTIAGENT_SIMULTANEOUS_ACTION,
                             botA_type=bot_a, botB_type=bot_b,
                             **cool_game_params)

    saw_vs_torch_task = make_task(0, 1)
    saw_vs_nail_task = make_task(0, 2)
    torch_vs_nail_task = make_task(1, 2)

    mcts_config = {
        'budget': mcts_budget,
        'rollout_budget': 1000,
        'selection_phase': 'ucb1',
        'exploration_factor_ucb1': 4  # Might need to tweak this?
    }
    mcts_agent = build_MCTS_Agent(saw_vs_torch_task, mcts_config,
                                  agent_name='MCTS agent')

    saw_vs_torch = compute_matchup_winrates(mcts_agent, saw_vs_torch_task,
                                            'Saw vs Torch',
                                            benchmarking_episodes, mcts_budget)
    saw_vs_nail = compute_matchup_winrates(mcts_agent, saw_vs_nail_task,
                                           'Saw vs Nail',
                                           benchmarking_episodes, mcts_budget)
    torch_vs_nail = compute_matchup_winrates(mcts_agent, torch_vs_nail_task,
                                             'Torch vs Nail',
                                             benchmarking_episodes, mcts_budget)

    bench_msg = f'episodes={benchmarking_episodes} MCTS_budget={mcts_budget}'
    winrates_msg = f'winrates=saw:[{saw_vs_torch}, {saw_vs_nail}] nail:[{torch_vs_nail}]'
    logger.info(bench_msg)
    logger.info(winrates_msg)
    logger.info(f'params={cool_game_params}')
    wandb.log({
        'Winrate_Saw_vs_Torch': saw_vs_torch,
        'Winrate_Saw_vs_Nail': saw_vs_nail,
        'Winrate_Torch_vs_Nail': torch_vs_nail
    })

    # Row i / column j: winrate of bot i against bot j (diagonal is 0).
    return np.array([[0., saw_vs_torch, saw_vs_nail],
                     [1. - saw_vs_torch, 0., torch_vs_nail],
                     [1. - saw_vs_nail, 1. - torch_vs_nail, 0.]])
def generate_evaluation_matrix(cool_game_params, benchmarking_episodes,
                               mcts_budget, logger: logging.Logger):
    """Benchmark the three CoolGame bot matchups and build a winrate matrix.

    Bot legend: 0 = SawBot, 1 = TorchBot, 2 = NailBot.
    Entry [i][j] of the returned 3x3 matrix is bot i's winrate against bot j.
    """
    import gym_cool_game

    # The three pairwise tasks only differ in which bots face each other.
    def make_task(bot_a, bot_b):
        return generate_task('CoolGame-v0',
                             EnvType.MULTIAGENT_SIMULTANEOUS_ACTION,
                             botA_type=bot_a, botB_type=bot_b,
                             **cool_game_params)

    saw_vs_torch_task = make_task(0, 1)
    saw_vs_nail_task = make_task(0, 2)
    torch_vs_nail_task = make_task(1, 2)

    mcts_config = {'budget': mcts_budget, 'rollout_budget': 10}
    mcts_agent = build_MCTS_Agent(saw_vs_torch_task, mcts_config,
                                  agent_name='MCTS agent')

    saw_vs_torch = compute_matchup_winrates(mcts_agent, saw_vs_torch_task,
                                            'Saw vs Torch',
                                            benchmarking_episodes, mcts_budget,
                                            logger)
    saw_vs_nail = compute_matchup_winrates(mcts_agent, saw_vs_nail_task,
                                           'Saw vs Nail',
                                           benchmarking_episodes, mcts_budget,
                                           logger)
    torch_vs_nail = compute_matchup_winrates(mcts_agent, torch_vs_nail_task,
                                             'Torch vs Nail',
                                             benchmarking_episodes, mcts_budget,
                                             logger)

    bench_msg = f'episodes={benchmarking_episodes} MCTS_budget={mcts_budget}'
    winrates_msg = f'winrates=saw:[{saw_vs_torch}, {saw_vs_nail}] nail:[{torch_vs_nail}]'
    logger.info(bench_msg)
    logger.info(winrates_msg)
    logger.info(f'params={cool_game_params}')

    return np.array([[0., saw_vs_torch, saw_vs_nail],
                     [1. - saw_vs_torch, 0., torch_vs_nail],
                     [1. - saw_vs_nail, 1. - torch_vs_nail, 0.]])
def generate_evaluation_matrix(cool_game_params, logger):
    """Benchmark the three CoolGame bot matchups and build a winrate matrix.

    Bot legend: 0 = SawBot, 1 = TorchBot, 2 = NailBot.
    Entry [i][j] of the returned 3x3 matrix is bot i's winrate against bot j;
    the lower triangle holds the complementary winrates (1 - w).
    """
    benchmarking_episodes = 1
    mcts_budget = 1
    saw_vs_torch_task = generate_task('CoolGame-v0',
                                      EnvType.MULTIAGENT_SIMULTANEOUS_ACTION,
                                      botA_type=0, botB_type=1,
                                      **cool_game_params)
    saw_vs_nail_task = generate_task('CoolGame-v0',
                                     EnvType.MULTIAGENT_SIMULTANEOUS_ACTION,
                                     botA_type=0, botB_type=2,
                                     **cool_game_params)
    torch_vs_nail_task = generate_task('CoolGame-v0',
                                       EnvType.MULTIAGENT_SIMULTANEOUS_ACTION,
                                       botA_type=1, botB_type=2,
                                       **cool_game_params)
    mcts_config = {'budget': mcts_budget}
    mcts_agent = build_MCTS_Agent(saw_vs_torch_task, mcts_config,
                                  agent_name='MCTS agent')

    # saw_winrates[0]: Saw vs Torch, saw_winrates[1]: Saw vs Nail
    saw_winrates = benchmark_agents_on_tasks(
        tasks=[saw_vs_torch_task, saw_vs_nail_task],
        agents=[mcts_agent],
        populate_all_agents=True,
        num_episodes=benchmarking_episodes)
    # nail_winrate[0]: Torch vs Nail
    nail_winrate = benchmark_agents_on_tasks(
        tasks=[torch_vs_nail_task],
        agents=[mcts_agent],
        populate_all_agents=True,
        num_episodes=benchmarking_episodes)

    bench_msg = f'episodes={benchmarking_episodes} MCTS_budget={mcts_budget}'
    winrates_msg = f'winrates=saw:{saw_winrates} nail:{nail_winrate}'
    logger.info(bench_msg)
    logger.info(winrates_msg)
    logger.info(f'params={cool_game_params}')

    # Bug fix: the lower triangle previously used bare negation (-w) instead
    # of the complementary winrate (1 - w), and entry [2][0] reused
    # saw_winrates[0] (Saw vs Torch) instead of saw_winrates[1] (Saw vs Nail).
    return np.array([[0., saw_winrates[0], saw_winrates[1]],
                     [1. - saw_winrates[0], 0., nail_winrate[0]],
                     [1. - saw_winrates[1], 1. - nail_winrate[0], 0.]])
def test_multiagent_sequential_tasks_with_model_based_agents_run_faster_on_parallel(
        env_name):
    """Check that running episodes across all CPU cores beats a single env.

    NOTE(review): this is a wall-clock comparison, so it can be flaky on
    loaded or low-core machines.
    """
    task = generate_task(env_name, EnvType.MULTIAGENT_SEQUENTIAL_ACTION)
    mcts_config = {
        'budget': 10,
        'rollout_budget': 100,
        'use_dirichlet': False,
        'dirichlet_alpha': 1,
        'selection_phase': 'ucb1',
        'exploration_factor_ucb1': 1
    }
    agent_vector = [build_MCTS_Agent(task, mcts_config, 'Test-MCTS-Random')
                    for _ in range(task.num_agents)]

    num_episodes = 10

    # Baseline: a single environment, purely sequential execution.
    sequential_start = time.time()
    _ = task.run_episodes(agent_vector, num_episodes=num_episodes,
                          num_envs=1, training=False)
    total_single = time.time() - sequential_start
    print('Sequential: ', total_single)

    # Parallel: one environment per available CPU core.
    parallel_start = time.time()
    _ = task.run_episodes(agent_vector, num_episodes=num_episodes,
                          num_envs=multiprocessing.cpu_count(), training=False)
    total_multiple = time.time() - parallel_start

    print('Parallel: ', total_multiple,
          'Sequential: ', total_single,
          'Diff: ', total_single - total_multiple)
    assert total_multiple < total_single
def estimate_agent_strength(agent: regym.rl_algorithms.agents.Agent,
                            task: regym.environments.Task,
                            desired_winrate: float,
                            initial_mcts_config: Dict,
                            benchmarking_episodes: int = 200) -> int:
    '''
    Computes the MCTS budget required to bring :param: agent's winrate
    below :param: desired_winrate on :param: task.

    :param agent: Agent whose strength is being estimated.
    :param task: Two-player task to benchmark on.
    :param desired_winrate: Winrate threshold below which the search stops.
    :param initial_mcts_config: Base MCTS config; its 'budget' entry is the
                                starting point of the linear search.
    :param benchmarking_episodes: Episodes per budget, split evenly across
                                  both player positions.
    :returns: First budget at which the average winrate drops below
              :param: desired_winrate (None if never reached before 2000).
    '''
    config = initial_mcts_config.copy()
    for budget in range(initial_mcts_config['budget'], 2000, 1):
        logger.info(f'Starting benchmarking with BUDGET: {budget}')
        config['budget'] = budget
        mcts_agent = build_MCTS_Agent(task, config, f'MCTS-{budget}')
        # Play both positions to average out any first-move advantage.
        # Bug fix: use integer division so run_episodes gets an int count.
        traj_1 = task.run_episodes([agent, mcts_agent],
                                   num_episodes=(benchmarking_episodes // 2),
                                   num_envs=-1, training=False)
        logger.info('Half way through')
        traj_2 = task.run_episodes([mcts_agent, agent],
                                   num_episodes=(benchmarking_episodes // 2),
                                   num_envs=-1, training=False)
        traj_1_winners = [extract_winner(t) for t in traj_1]  # Our agent is pos: 0
        traj_2_winners = [extract_winner(t) for t in traj_2]  # Our agent is pos: 1
        pos_0_winrate = traj_1_winners.count(0) / len(traj_1)
        pos_1_winrate = traj_2_winners.count(1) / len(traj_2)
        avg_winrate = (pos_0_winrate + pos_1_winrate) / 2
        logger.info(
            f'WINRATES: Total = {avg_winrate}\tPos 0 = {pos_0_winrate}\t Pos 1 = {pos_1_winrate}'
        )
        logger.info('')
        if avg_winrate < desired_winrate:
            # Bug fix: previously returned avg_winrate (a float) despite the
            # docstring and the -> int annotation promising a budget.
            return budget
def test_train_vanilla_exit_against_random_connect4(Connect4Task,
                                                    expert_iteration_config_dict,
                                                    mcts_config_dict):
    """Long-running training test: ExIt agent vs a budget-1 MCTS opponent on Connect4."""
    # Train worthy params
    expert_iteration_config_dict['use_apprentice_in_expert'] = True
    expert_iteration_config_dict['games_per_iteration'] = 100
    expert_iteration_config_dict['mcts_budget'] = 200
    expert_iteration_config_dict['mcts_rollout_budget'] = 10000
    expert_iteration_config_dict['initial_memory_size'] = 6000
    expert_iteration_config_dict['memory_size_increase_frequency'] = 2
    expert_iteration_config_dict['end_memory_size'] = 30000
    expert_iteration_config_dict['dirichlet_alpha'] = 1
    expert_iteration_config_dict['batch_size'] = 256
    expert_iteration_config_dict['num_epochs_per_iteration'] = 4

    ex_it = build_ExpertIteration_Agent(
        Connect4Task, expert_iteration_config_dict,
        agent_name=f"ExIt-test:{expert_iteration_config_dict['mcts_budget']}")
    # Bug fix: `summary_writer` was previously undefined at the
    # parallel_learn_against_fix_opponent call below (only the attribute on
    # ex_it.algorithm was assigned), raising a NameError at runtime.
    summary_writer = SummaryWriter('expert_iteration_test')
    ex_it.algorithm.summary_writer = summary_writer

    # Weakest possible fixed opponent: a single-node-budget MCTS agent.
    mcts_config_dict['budget'] = 1
    mcts_agent = build_MCTS_Agent(
        Connect4Task, mcts_config_dict,
        agent_name=f"MCTS:{mcts_config_dict['budget']}")

    parallel_learn_against_fix_opponent(
        ex_it,
        fixed_opponent=mcts_agent,
        agent_position=0,  # Doesn't matter in Connect4
        task=Connect4Task,
        training_episodes=5000,
        test_episodes=100,
        benchmarking_episodes=20,
        benchmark_every_n_episodes=500,
        reward_tolerance=0.2,
        maximum_average_reward=1.0,
        evaluation_method='last',
        show_progress=True,
        num_envs=torch.multiprocessing.cpu_count(),
        summary_writer=summary_writer)
def estimate_agent_strength(
        agent: regym.rl_algorithms.agents.Agent,
        task: regym.environments.Task,
        desired_winrate: float,
        initial_mcts_config: Dict,
        logger,
        benchmarking_episodes: int = 200) -> Tuple[int, pd.DataFrame]:
    '''
    Computes the MCTS budget for an MCTS agent (with :param: initial_mcts_config)
    required to bring :param: agent below :param: desired_winrate in :param: task.
    Both player positions are benchmarked, as the game may not be symmetrical.

    :param agent: Agent whose strength is being estimated.
    :param task: Two-player task to benchmark on.
    :param desired_winrate: Winrate threshold below which the search stops.
    :param initial_mcts_config: Base MCTS config; its 'budget' entry is the
                                starting point of the linear search.
    :param logger: Logger used to report per-budget progress.
    :param benchmarking_episodes: Episodes per budget, split evenly across
                                  both player positions.
    :returns: (budget, pd.DataFrame) — the first budget at which the average
              winrate drops below the threshold, and a log of all winrates
              observed during strength estimation. Implicitly returns None
              if the threshold is never reached before budget 2000.
    '''
    df = pd.DataFrame(columns=('test_agent_id', 'mcts_budget',
                               'winrate_pos_0', 'winrate_pos_1',
                               'avg_winrate'))
    config = initial_mcts_config.copy()
    for budget in range(initial_mcts_config['budget'], 2000, 1):
        logger.info(f'Starting benchmarking with BUDGET: {budget}')
        config['budget'] = budget
        mcts_agent = build_MCTS_Agent(task, config, f'MCTS-{budget}')
        # Play both positions to average out any first-move advantage.
        traj_1 = task.run_episodes([agent, mcts_agent],
                                   num_episodes=(benchmarking_episodes // 2),
                                   num_envs=-1, training=False)
        traj_2 = task.run_episodes([mcts_agent, agent],
                                   num_episodes=(benchmarking_episodes // 2),
                                   num_envs=-1, training=False)
        winrates_1 = [
            len(list(filter(lambda t: t.winner == a_i, traj_1))) / len(traj_1)
            for a_i in range(2)
        ]
        winrates_2 = [
            len(list(filter(lambda t: t.winner == a_i, traj_2))) / len(traj_2)
            for a_i in range(2)
        ]
        # :param: agent sits at position 0 in traj_1 and position 1 in traj_2.
        avg_winrate = (winrates_1[0] + winrates_2[1]) / 2
        # Bug fix: DataFrame.append was deprecated in pandas 1.4 and removed
        # in pandas 2.0; pd.concat is the supported replacement.
        row = pd.DataFrame([{
            'test_agent_id': agent.handled_experiences,
            'mcts_budget': budget,
            'winrate_pos_0': winrates_1[0],
            'winrate_pos_1': winrates_2[1],
            'avg_winrate': avg_winrate
        }])
        df = pd.concat([df, row], ignore_index=True)
        logger.info(
            f'WINRATES: Total = {avg_winrate}\tPos 0 = {winrates_1[0]}\t Pos 1 = {winrates_2[1]}'
        )
        if avg_winrate < desired_winrate:
            return budget, df
os.mkdir(args.name)

### To refactor at some point
# Checkpoint filenames encode the training episode; sort by it.
# (ExIt checkpoints would instead use: int(x.split('_')[-1][:-3]))
def sort_fn(x):  # PPO test training
    return int(x.split('/')[-1].split('_')[0])

sorted_population = load_population_from_path(path=args.path, sort_fn=sort_fn)
sorted_population.sort(key=lambda agent: agent.finished_episodes)
for agent in sorted_population:
    agent.requires_environment_model = False
    agent.training = False
###

# Taken from MCTS equivalent strength benchmarking
mcts_budgets = [29, 42, 42, 38, 45, 56, 48, 49, 51, 42, 53, 46, 35,
                49, 49, 42, 45, 40, 45, 42, 47, 38, 42, 47, 45, 37,
                42, 35, 39, 25, 38, 34, 33, 38, 40]
mcts_population = []
for budget in mcts_budgets:
    mcts_config = {'budget': budget, 'rollout_budget': 100,
                   'selection_phase': 'ucb1',
                   'exploration_factor_ucb1': 1.41,
                   'use_dirichlet': False,
                   'dirichlet_alpha': None}
    mcts_population.append(
        build_MCTS_Agent(
            generate_task('Connect4-v0', EnvType.MULTIAGENT_SEQUENTIAL_ACTION),
            mcts_config,
            agent_name=f'MCTS:{budget}'))

main(population=sorted_population + mcts_population, name=args.name)
def run_parallel_task_with_random_agent(env_name, env_type, num_envs,
                                        num_episodes,
                                        model_based_agents=False):
    """Run episodes over parallel environments and sanity-check the trajectories.

    :param env_name: Gym environment id to generate the task from.
    :param env_type: EnvType of the task (single agent or multiagent).
    :param num_envs: Number of parallel environments to use.
    :param num_episodes: Lower bound on the number of trajectories requested.
    :param model_based_agents: If True, use minimal-budget MCTS agents
                               (which require an environment model) instead
                               of random agents.
    """
    task = generate_task(env_name, env_type)
    # Random agents, either MCTS or random
    if model_based_agents:
        mcts_config = {
            'budget': 1,
            'rollout_budget': 0,
            'use_dirichlet': False,
            'dirichlet_alpha': 1,
            'selection_phase': 'ucb1',
            'exploration_factor_ucb1': 1,
            'expose_tree_in_predictions': True
        }
        agent_vector = [
            build_MCTS_Agent(task, mcts_config, 'Test-MCTS-Random')
            for _ in range(task.num_agents)
        ]
    else:
        agent_vector = [
            build_Random_Agent(task, {}, 'Test-Random')
            for _ in range(task.num_agents)
        ]

    # The number of environments is larger than number of
    # episodes because we want to test if we can generate
    # a specific number of trajectories regardless of the
    # number of environments used to generate them
    trajectories = task.run_episodes(agent_vector,
                                     num_episodes=num_episodes,
                                     num_envs=num_envs,
                                     training=True,
                                     store_extra_information=True)
    # Bug fix: removed a leftover debugger breakpoint
    # (`import pdbr; pdbr.set_trace()`) that halted every run here.

    # The number of trajectories is lower-bounded by :param: num_episodes
    # but it is possible that multiple environments finish at the same time
    assert (len(trajectories) >= num_episodes) and \
           (len(trajectories) <= (num_episodes + num_envs))
    # All trajectories finish with a "done" flag
    assert all([t[-1].done for t in trajectories])
    # All timesteps except for last one in all trajectories don't have "done" set
    for t in trajectories:
        assert all([not timestep.done for timestep in t[:-1]])

    # ASSUMPTION: observation and succ_observation are numpy arrays
    if env_type == EnvType.SINGLE_AGENT:
        # Each timestep's succ_observation matches the next timestep's observation
        assert all([(ex_1.succ_observation == ex_2.observation).all()
                    for t in trajectories
                    for ex_1, ex_2 in zip(t, t[1:])])
    else:
        # Same consistency check, but per agent
        assert all([
            (ex_1.succ_observation[a_i] == ex_2.observation[a_i]).all()
            for t in trajectories
            for ex_1, ex_2 in zip(t, t[1:])
            for a_i in range(task.num_agents)
        ])