def main(path: str, name: str):
    task = generate_task('Connect4-v0', EnvType.MULTIAGENT_SEQUENTIAL_ACTION)

    #sort_fn = lambda x: int(x.split('_')[-1][:-3])  # ExIt
    sort_fn = lambda x: int(x.split('/')[-1].split('_')[0])  # PPO test training
    sorted_population = load_population_from_path(path=path, sort_fn=sort_fn)

    for agent in sorted_population:
        print(agent.algorithm.num_updates)
        agent.requires_environment_model = False
        agent.training = False

    winrate_matrix = compute_winrate_matrix_metagame(
        population=sorted_population, episodes_per_matchup=1000, task=task)
    maxent_nash, nash_averaging = compute_nash_averaging(
        winrate_matrix, perform_logodds_transformation=True)

    winrate_matrix = np.array(winrate_matrix)
    print(
        'Saving winrate_matrix, max-entropy Nash equilibrium for game defined by winrate matrix and Nash averaging'
    )
    np.savetxt(f'{name}_winrate_matrix.csv', winrate_matrix, delimiter=', ')
    np.savetxt(f'{name}_maxent_nash.csv', maxent_nash, delimiter=', ')
    np.savetxt(f'{name}_nash_averaging.csv', nash_averaging, delimiter=', ')

    ax = plot_winrate_matrix(winrate_matrix)

    plt.show()
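A minimal command-line wrapper for main() might look like the sketch below; the --path/--name argument names simply mirror the __main__ block further down in this document, and the argparse import is included only to keep the sketch self-contained.

if __name__ == '__main__':
    import argparse

    # Hypothetical CLI entry point for main(); argument names follow the
    # --path/--name convention used in the __main__ block further below.
    parser = argparse.ArgumentParser(
        description='Winrate matrix and Nash averaging for a population of trained agents')
    parser.add_argument('--path', required=True, help='Path to directory containing trained agents')
    parser.add_argument('--name', required=True, help='Identifier used as prefix for the output CSV files')
    args = parser.parse_args()
    main(path=args.path, name=args.name)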
Example 2
def initialize_experiment(experiment_config, agents_config, self_play_configs):
    task = create_task_from_config(experiment_config['environment'])
    sp_schemes = initialize_training_schemes(self_play_configs, task)
    agents = initialize_agents(task, agents_config)
    initial_menagerie = []

    base_path = experiment_config['experiment_id']
    menagerie_path = f"{base_path}/menagerie/{sp_schemes[0].name}-{experiment_config['algorithms'][0]}"
    # Load pre-trained agent, if there is any (there might be a menagerie but not a trained agent)
    if os.path.exists(base_path) and (os.listdir(base_path) != ['menagerie']):
        logger = logging.getLogger('LOADING AGENT AND MENAGERIE')
        logger.info(f"Attempting to load agent from: {base_path}/")
        agent = load_existing_agent_and_update_task(base_path, task)
        assert os.path.exists(menagerie_path), f'Menagerie should be present at {menagerie_path}'
        initial_menagerie = load_population_from_path(menagerie_path, show_progress=True)
        initial_menagerie.sort(key=lambda agent: agent.finished_episodes)
        logger.info(f'Loaded agent, with {agent.finished_episodes} episodes under its belt')
        logger.info(f'Loaded menagerie containing {len(initial_menagerie)} agents')

    return task, sp_schemes, agents, initial_menagerie
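For reference, initialize_experiment only reads a handful of keys from experiment_config ('environment', 'experiment_id', 'algorithms'). The sketch below shows a hypothetical minimal call; the nested structure expected by create_task_from_config, initialize_agents and initialize_training_schemes is an assumption, not a documented schema.

# Hypothetical minimal configs: only the keys touched by initialize_experiment
# are shown; their nested contents are assumptions for illustration.
experiment_config = {
    'environment': {'name': 'Connect4-v0'},
    'experiment_id': 'experiments/connect4_run_0',
    'algorithms': ['ppo'],
}
agents_config = {'ppo': {}}          # per-algorithm hyperparameters (assumed shape)
self_play_configs = {'naive': {}}    # per-scheme options (assumed shape)

task, sp_schemes, agents, initial_menagerie = initialize_experiment(
    experiment_config, agents_config, self_play_configs)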
Example 3
def single_experiment(task: Task, agents: List, selfplay_schemes: List[SelfPlayTrainingScheme],
               checkpoint_at_iterations: List[int], base_path: str, seed: int,
               benchmarking_episodes: int):
    trained_agent_paths = []
    for sp_scheme in selfplay_schemes:
        for agent in agents:
            training_agent = agent.clone(training=True)
            path = f'{base_path}/{sp_scheme.name}-{agent.name}'
            trained_agent_paths += [path]
            train_and_evaluate(task=task, self_play_scheme=sp_scheme,
                               training_agent=training_agent,
                               checkpoint_at_iterations=checkpoint_at_iterations,
                               benchmarking_episodes=benchmarking_episodes,
                               base_path=path, seed=seed)
            # Self-play schemes like PSRO contain useful information
            dill.dump(sp_scheme, open(f'{path}/{sp_scheme.name}.pickle', 'wb'))

    logging.info('Computing relative performances')
    relative_performances_path = f'{base_path}/relative_performances/'
    if not os.path.exists(relative_performances_path): os.mkdir(relative_performances_path)
    compute_relative_pop_performance_all_populations(trained_agent_paths, task,
                                                     benchmarking_episodes,
                                                     base_path=relative_performances_path)

    logging.info('Loading all trained agents')
    joint_trained_population = reduce(lambda succ, path: succ + load_population_from_path(path),
                                      trained_agent_paths, [])
    logging.info('START winrate matrix computation of all trained policies')
    final_winrate_matrix = compute_winrate_matrix_metagame(joint_trained_population,
                                                           episodes_per_matchup=5,
                                                           task=task)
    logging.info('START Nash averaging computation of all trained policies')
    maxent_nash, nash_avg = compute_nash_averaging(final_winrate_matrix,
                                                   perform_logodds_transformation=True)
    logging.info('Experiment FINISHED!')
    dill.dump(final_winrate_matrix,
                open(f'{base_path}/final_winrate_matrix.pickle', 'wb'))
    dill.dump(maxent_nash,
                open(f'{base_path}/final_maxent_nash.pickle', 'wb'))
Example 4

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger('MCTS_benchmarking')
    fh = logging.FileHandler('mcts_strength_benchmark.logs')
    fh.setLevel(logging.INFO)
    logger.addHandler(fh)

    parser = argparse.ArgumentParser(
        description=
        'Estimates the skill of agents by playing against increasingly strong MCTS agents'
    )
    parser.add_argument(
        '--path',
        required=True,
        help='Path to directory containing trained agents to be benchmarked')
    parser.add_argument('--num_stack',
                        required=True,
                        help='Number of FrameStack(s)')
    args = parser.parse_args()

    population = load_population_from_path(path=args.path, show_progress=True)
    population.sort(key=lambda agent: agent.finished_episodes)

    for agent in population:
        agent.requires_environment_model = False
        agent.training = False
        # If not using frame stack: TODO
        # If using frame stack
        agent.state_preprocess_fn = flatten_last_dim_and_batch_vector_observation

    main(population, logger, int(args.num_stack))
Example 5
    parser.add_argument(
        '--config',
        required=True,
        help=
        'path to YAML config file containing info about environment and agents'
    )
    parser.add_argument(
        '--opponents_path',
        required=True,
        help='path to directory containing agents to train against (opponents)'
    )
    args = parser.parse_args()

    multiprocessing.set_start_method('forkserver')
    exper_config, agents_config = load_configs(args.config)
    task, agents = initialize_experiment(exper_config, agents_config)

    test_agents = load_population_from_path(args.opponents_path)
    test_agents = [
        build_NeuralNet_Agent(
            task, {
                'neural_net': t_a.algorithm.model,
                'pre_processing_fn': batch_vector_observation
            }, f'TestAgent: {t_a.handled_experiences}') for t_a in test_agents
    ]
    summary_writer = SummaryWriter('Exit-TrainAgainstTestAgents')
    regym.rl_algorithms.expert_iteration.expert_iteration_loss.summary_writer = summary_writer

    train_against_fixed_agent(task, agents, test_agents, exper_config,
                              summary_writer)
def create_wrapper(num_stack: int):
    frame_stack_wrapper = partial(FrameStack, num_stack=num_stack)
    return [frame_stack_wrapper]
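create_wrapper returns a list of partially-applied wrapper constructors rather than wrapped environments, so each entry can be applied to an environment later. A small usage sketch, assuming the FrameStack referenced above is gym.wrappers.FrameStack(env, num_stack) and using a placeholder environment name:

import gym

# Sketch only: assumes create_wrapper (and its FrameStack / partial imports)
# is defined in this module, and that FrameStack is gym.wrappers.FrameStack.
env = gym.make('CartPole-v1')  # placeholder environment for illustration
for wrapper in create_wrapper(num_stack=4):
    env = wrapper(env)  # equivalent to FrameStack(env, num_stack=4)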


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Computes winrate matrices and Nash averagings for test agents of paper "On Opponent Modelling in Expert Iteration"')
    parser.add_argument('--path', required=True, help='Path to directory containing trained agents to be benchmarked')
    parser.add_argument('--name', required=True, help='Identifier, used in file creation')
    args = parser.parse_args()
    os.mkdir(args.name)

    ### To refactor at some point
    #sort_fn = lambda x: int(x.split('_')[-1][:-3])  # ExIt
    sort_fn = lambda x: int(x.split('/')[-1].split('_')[0])  # PPO test training
    sorted_population = load_population_from_path(path=args.path, sort_fn=sort_fn)
    sorted_population.sort(key=lambda agent: agent.finished_episodes)

    for agent in sorted_population:
        agent.requires_environment_model = False
        agent.training = False
    ###

    # Taken from MCTS equivalent strength benchmarking
    mcts_budgets = [29, 42, 42, 38, 45, 56, 48, 49, 51, 42, 53, 46, 35, 49, 49,
                    42, 45, 40, 45, 42, 47, 38, 42, 47, 45, 37, 42, 35, 39, 25,
                    38, 34, 33, 38, 40]
    mcts_population = []
    for budget in mcts_budgets:
        initial_mcts_config = {'budget': budget, 'rollout_budget': 100,
                               'selection_phase': 'ucb1',
Example 7
def load_population(path):
    sort_fn = lambda x: int(x.split('/')[-1].split('_')[0])  # PPO test training
    return load_population_from_path(path=path, sort_fn=sort_fn)
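The sort_fn above assumes checkpoint filenames whose basename starts with an integer (e.g. an episode count) followed by an underscore; the path below is purely illustrative:

# Illustration of the filename convention sort_fn relies on: the integer
# before the first underscore of the basename becomes the sort key.
example_path = '/experiments/run_0/15000_ppo_agent.pt'  # hypothetical checkpoint name
key = int(example_path.split('/')[-1].split('_')[0])
assert key == 15000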