def main(path: str, name: str):
    task = generate_task('Connect4-v0', EnvType.MULTIAGENT_SEQUENTIAL_ACTION)

    #sort_fn = lambda x: int(x.split('_')[-1][:-3])  # ExIt
    sort_fn = lambda x: int(x.split('/')[-1].split('_')[0])  # PPO test training
    sorted_population = load_population_from_path(path=path, sort_fn=sort_fn)

    for agent in sorted_population:
        print(agent.algorithm.num_updates)
        agent.requires_environment_model = False
        agent.training = False

    winrate_matrix = compute_winrate_matrix_metagame(population=sorted_population,
                                                     episodes_per_matchup=1000,
                                                     task=task)
    maxent_nash, nash_averaging = compute_nash_averaging(winrate_matrix,
                                                         perform_logodds_transformation=True)

    winrate_matrix = np.array(winrate_matrix)
    print('Saving winrate matrix, max-entropy Nash equilibrium of the game '
          'defined by the winrate matrix, and Nash averaging')
    np.savetxt(f'{name}_winrate_matrix.csv', winrate_matrix, delimiter=', ')
    np.savetxt(f'{name}_maxent_nash.csv', maxent_nash, delimiter=', ')
    np.savetxt(f'{name}_nash_averaging.csv', nash_averaging, delimiter=', ')

    ax = plot_winrate_matrix(winrate_matrix)
    plt.show()
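# Side note on perform_logodds_transformation=True: a minimal numpy sketch of the
# standard log-odds map used before Nash averaging (Balduzzi et al., 2018), assuming
# this is what the flag applies internally; compute_nash_averaging's actual
# implementation is not reproduced here.
import numpy as np

toy_winrates = np.array([[0.5, 0.7],
                         [0.3, 0.5]])                      # toy 2-agent winrate matrix
toy_payoffs = np.log(toy_winrates / (1.0 - toy_winrates))  # antisymmetric payoff matrix
print(toy_payoffs)                                         # toy_payoffs[i, j] == -toy_payoffs[j, i]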
def initialize_experiment(experiment_config, agents_config, self_play_configs):
    task = create_task_from_config(experiment_config['environment'])
    sp_schemes = initialize_training_schemes(self_play_configs, task)
    agents = initialize_agents(task, agents_config)

    initial_menagerie = []
    base_path = experiment_config['experiment_id']
    menagerie_path = f"{base_path}/menagerie/{sp_schemes[0].name}-{experiment_config['algorithms'][0]}"

    # Load a pre-trained agent, if there is one (there might be a menagerie but no trained agent)
    if os.path.exists(base_path) and (os.listdir(base_path) != ['menagerie']):
        logger = logging.getLogger('LOADING AGENT AND MENAGERIE')
        logger.info(f'Attempting to load agent from: {base_path}/')
        agent = load_existing_agent_and_update_task(base_path, task)
        assert os.path.exists(menagerie_path), f'Menagerie should be present at {menagerie_path}'
        initial_menagerie = load_population_from_path(menagerie_path, show_progress=True)
        initial_menagerie.sort(key=lambda agent: agent.finished_episodes)
        logger.info(f'Loaded agent with {agent.finished_episodes} episodes under its belt')
        logger.info(f'Loaded menagerie containing {len(initial_menagerie)} agents')

    return task, sp_schemes, agents, initial_menagerie
def single_experiment(task: Task, agents: List, selfplay_schemes: List[SelfPlayTrainingScheme],
                      checkpoint_at_iterations: List[int], base_path: str, seed: int,
                      benchmarking_episodes: int):
    trained_agent_paths = []
    for sp_scheme in selfplay_schemes:
        for agent in agents:
            training_agent = agent.clone(training=True)
            path = f'{base_path}/{sp_scheme.name}-{agent.name}'
            trained_agent_paths += [path]
            train_and_evaluate(task=task,
                               self_play_scheme=sp_scheme,
                               training_agent=training_agent,
                               checkpoint_at_iterations=checkpoint_at_iterations,
                               benchmarking_episodes=benchmarking_episodes,
                               base_path=path,
                               seed=seed)
            # Self-play schemes like PSRO contain useful information
            dill.dump(sp_scheme, open(f'{path}/{sp_scheme.name}.pickle', 'wb'))

    logging.info('Computing relative performances')
    relative_performances_path = f'{base_path}/relative_performances/'
    if not os.path.exists(relative_performances_path):
        os.mkdir(relative_performances_path)
    compute_relative_pop_performance_all_populations(trained_agent_paths, task,
                                                     benchmarking_episodes,
                                                     base_path=relative_performances_path)

    logging.info('Loading all trained agents')
    joint_trained_population = reduce(lambda succ, path: succ + load_population_from_path(path),
                                      trained_agent_paths, [])

    logging.info('START winrate matrix computation of all trained policies')
    final_winrate_matrix = compute_winrate_matrix_metagame(joint_trained_population,
                                                           episodes_per_matchup=5,
                                                           task=task)
    logging.info('START Nash averaging computation of all trained policies')
    maxent_nash, nash_avg = compute_nash_averaging(final_winrate_matrix,
                                                   perform_logodds_transformation=True)

    logging.info('Experiment FINISHED!')
    dill.dump(final_winrate_matrix, open(f'{base_path}/final_winrate_matrix.pickle', 'wb'))
    dill.dump(maxent_nash, open(f'{base_path}/final_maxent_nash.pickle', 'wb'))
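# The reduce call above simply concatenates the populations loaded from each trained
# agent directory. A self-contained illustration with a stand-in loader (the directory
# names are hypothetical, not the experiment's actual paths):
from functools import reduce

def fake_load(path):
    # Stand-in for load_population_from_path, which deserialises agents from disk.
    return [f'{path}/checkpoint_0', f'{path}/checkpoint_1']

trained_paths = ['runs/naive_sp-ppo', 'runs/psro-ppo']
joint_population = reduce(lambda succ, p: succ + fake_load(p), trained_paths, [])
print(joint_population)  # all checkpoints from all directories, in path order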
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('MCTS_benchmarking')
fh = logging.FileHandler('mcts_strength_benchmark.logs')
fh.setLevel(logging.INFO)
logger.addHandler(fh)

parser = argparse.ArgumentParser(
    description='Estimates the skill of agents by playing against increasingly strong MCTS agents')
parser.add_argument('--path', required=True,
                    help='Path to directory containing trained agents to be benchmarked')
parser.add_argument('--num_stack', required=True,
                    help='Number of frames stacked by the FrameStack wrapper')
args = parser.parse_args()

population = load_population_from_path(path=args.path, show_progress=True)
population.sort(key=lambda agent: agent.finished_episodes)
for agent in population:
    agent.requires_environment_model = False
    agent.training = False
    # If not using frame stacking: TODO
    # If using frame stacking:
    agent.state_preprocess_fn = flatten_last_dim_and_batch_vector_observation

main(population, logger, int(args.num_stack))
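# Generic illustration of why a state_preprocess_fn is needed once frame stacking is on:
# the stacked observation must be flattened and given a leading batch dimension before
# it reaches the network. This is only a guess at the shape bookkeeping that
# flatten_last_dim_and_batch_vector_observation performs; its real behaviour lives in regym.
import numpy as np

num_stack, obs_dim = 4, 42                    # e.g. four stacked 6x7 Connect4 boards
stacked_obs = np.zeros((num_stack, obs_dim))  # one environment observation
batched = stacked_obs.reshape(1, -1)          # shape (1, num_stack * obs_dim) = (1, 168)
print(batched.shape)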
parser.add_argument('--config', required=True,
                    help='Path to YAML config file containing info about environment and agents')
parser.add_argument('--opponents_path', required=True,
                    help='Path to directory containing agents to train against (opponents)')
args = parser.parse_args()

multiprocessing.set_start_method('forkserver')

exper_config, agents_config = load_configs(args.config)
task, agents = initialize_experiment(exper_config, agents_config)

# Wrap each loaded opponent's network so it can act as a fixed test agent in the task
test_agents = load_population_from_path(args.opponents_path)
test_agents = [build_NeuralNet_Agent(task,
                                     {'neural_net': t_a.algorithm.model,
                                      'pre_processing_fn': batch_vector_observation},
                                     f'TestAgent: {t_a.handled_experiences}')
               for t_a in test_agents]

summary_writer = SummaryWriter('Exit-TrainAgainstTestAgents')
regym.rl_algorithms.expert_iteration.expert_iteration_loss.summary_writer = summary_writer

train_against_fixed_agent(task, agents, test_agents, exper_config, summary_writer)
def create_wrapper(num_stack: int):
    frame_stack_wrapper = partial(FrameStack, num_stack=num_stack)
    return [frame_stack_wrapper]


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Computes winrate matrices and Nash averagings for test agents '
                    'of paper "On Opponent Modelling in Expert Iteration"')
    parser.add_argument('--path', required=True,
                        help='Path to directory containing trained agents to be benchmarked')
    parser.add_argument('--name', required=True,
                        help='Identifier, used in file creation')
    args = parser.parse_args()

    os.mkdir(args.name)

    ### To refactor at some point
    #sort_fn = lambda x: int(x.split('_')[-1][:-3])  # ExIt
    sort_fn = lambda x: int(x.split('/')[-1].split('_')[0])  # PPO test training
    sorted_population = load_population_from_path(path=args.path, sort_fn=sort_fn)
    sorted_population.sort(key=lambda agent: agent.finished_episodes)
    for agent in sorted_population:
        agent.requires_environment_model = False
        agent.training = False
    ###

    # Taken from MCTS equivalent strength benchmarking
    mcts_budgets = [29, 42, 42, 38, 45, 56, 48, 49, 51, 42, 53, 46, 35, 49, 49,
                    42, 45, 40, 45, 42, 47, 38, 42, 47, 45, 37, 42, 35, 39, 25,
                    38, 34, 33, 38, 40]

    mcts_population = []
    for budget in mcts_budgets:
        initial_mcts_config = {'budget': budget,
                               'rollout_budget': 100,
                               'selection_phase': 'ucb1',
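# Illustration of the wrapper-factory pattern used by create_wrapper above, shown on a
# standard Gym environment (CartPole is only a stand-in; the experiments hand the factory
# to regym's task-creation machinery for Connect4 rather than applying it directly):
from functools import partial
import gym
from gym.wrappers import FrameStack

make_frame_stack = partial(FrameStack, num_stack=4)
env = make_frame_stack(gym.make('CartPole-v1'))
print(env.observation_space.shape)  # (4, 4): four stacked 4-dimensional observations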
def load_population(path):
    sort_fn = lambda x: int(x.split('/')[-1].split('_')[0])  # PPO test training
    return load_population_from_path(path=path, sort_fn=sort_fn)
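# What the two sort_fn variants used throughout these scripts extract, shown on
# hypothetical checkpoint paths (the naming conventions below are assumptions):
exit_style_path = 'menagerie/agent_1500.pt'  # ExIt-style: '<name>_<episodes>.pt'
ppo_style_path = 'runs/3000_checkpoint.pt'   # PPO-style:  '<episodes>_<name>.pt'

exit_key = int(exit_style_path.split('_')[-1][:-3])         # strips '.pt' -> 1500
ppo_key = int(ppo_style_path.split('/')[-1].split('_')[0])  # leading token -> 3000
print(exit_key, ppo_key)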