Example #1
def self_play(ctx):
    """Train by self-play, retraining from self-played frames and changing best player when
    new trained player beats currently best player.

    Args:
        ctx (click.core.Context): context object.
            Parameters for training:
                * 'game' (string)                     : game name (Default: tictactoe)
                * 'max_iter' (int)                    : number of train process iterations
                                                        (Default: -1)
                * 'min_examples' (int)                : minimum number of examples required to start
                                                        training the neural net; if -1, no threshold.
                                                        (Default: -1)
                * 'policy_warmup' (int)               : how many stochastic warm-up steps the
                                                        deterministic policy should take (Default: 12)
                * 'n_self_plays' (int)                : number of self played episodes
                                                        (Default: 100)
                * 'n_tournaments' (int)               : number of tournament episodes (Default: 20)
                * 'save_checkpoint_folder' (string)   : folder to save best models
                                                        (Default: "checkpoints")
                * 'save_checkpoint_filename' (string) : filename of best model (Default: "best")
                * 'save_self_play_log_path' (string)  : where to save self-play logs.
                                                        (Default: "./logs/self-play.log")
                * 'save_tournament_log_path' (string) : where to save tournament logs.
                                                        (Default: "./logs/tournament.log")
                * 'update_threshold' (float)          : required threshold to become the new best player
                                                        (Default: 0.55)
    """

    cfg = ctx.obj
    coach = Coach(cfg)

    # Create TensorBoard logger
    tb_logger = TensorBoardLogger(
        utils.create_tensorboard_log_dir(cfg.logging['tensorboard_log_folder'],
                                         'score'))

    # Resume the iteration counter from already trained epochs
    iteration = coach.global_epoch // cfg.training['epochs']
    max_iter = cfg.self_play["max_iter"]

    while max_iter == -1 or iteration < max_iter:
        iter_counter_str = "{:03d}/{:03d}".format(iteration + 1, max_iter) \
            if max_iter > 0 else "{:03d}/inf".format(iteration + 1)

        coach.play("Self-play  " + iter_counter_str)

        # Proceed to training only if the minimum number of examples was gathered
        if len(coach.storage.big_bag) <= cfg.self_play["min_examples"]:
            log.warning("Skipping training, gather at least %d training examples first!",
                        cfg.self_play["min_examples"])
            continue

        coach.train()
        coach.evaluate("Tournament " + iter_counter_str, tournament_mode=True)

        # Log current player's score
        tb_logger.log_scalar("Best score", coach.best_score, iteration)

        # Increment iterator
        iteration += 1
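
The command reads all of its settings from `ctx.obj`. Below is a minimal sketch of a config object covering the sections this snippet touches, assuming plain dictionaries for `self_play`, `training` and `logging`; key names and defaults come from the code and docstring above, while the `training` and `logging` values are placeholders and the project's real config class (used by `Coach`) likely carries more fields.

# Minimal config stub mirroring the sections that self_play() reads.
# Defaults are taken from the docstring where available; 'epochs' and the
# TensorBoard folder are assumed placeholders, not real project values.
class StubConfig:
    def __init__(self):
        self.self_play = {
            "game": "tictactoe",
            "max_iter": -1,        # -1: loop until interrupted
            "min_examples": -1,    # -1: no minimum-examples threshold
            "policy_warmup": 12,
            "n_self_plays": 100,
            "n_tournaments": 20,
            "update_threshold": 0.55,
        }
        self.training = {"epochs": 10}                                   # assumed
        self.logging = {"tensorboard_log_folder": "./logs/tensorboard"}  # assumed

With click, such an object is typically attached in the group callback (e.g. `ctx.obj = StubConfig()`) so that every sub-command sees the same configuration.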
Example #2
def human_play(ctx, model_path, n_games):
    """Play `n_games` with trained model.

        Args:
            model_path: (string): Path to trained model.
    """

    cfg = ctx.obj
    coach = Coach(cfg, model_path)

    coach.current_mind.players[1] = HumanPlayer(cfg.mdp)
    coach.eval_callbacks.append(BoardRender(cfg.env, render=True, fancy=True))
    coach.scoreboard = Tournament()

    coach.evaluate(desc="Test models: Human vs. {}".format(
        model_path.split("/")[-1]),
                   n_games=n_games)

    log.info("Human vs. %s results: %s",
             model_path.split("/")[-1], coach.scoreboard.results)
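
The CLI wiring is not part of the snippet. A hypothetical sketch of how `model_path` and `n_games` could be supplied through click follows; the argument and option names are illustrative assumptions, not the project's actual interface.

import click

# Hypothetical CLI registration for human_play(); names and defaults are
# illustrative only and may differ from the project's real command setup.
@click.command()
@click.argument("model_path")
@click.option("--n-games", "n_games", default=2, show_default=True,
              help="Number of games to play against the model.")
@click.pass_context
def human_play_cmd(ctx, model_path, n_games):
    human_play(ctx, model_path, n_games)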
Example #3
def clash(ctx, first_model_path, second_model_path, render, n_games):
    """Test two models. Play `n_games` between themselves.

        Args:
            first_model_path: (string): Path to player one model.
            second_model_path (string): Path to player two model.
    """

    cfg = ctx.obj
    coach = Coach(cfg,
                  current_ckpt=first_model_path,
                  best_ckpt=second_model_path)

    coach.scoreboard = Tournament()
    coach.evaluate(desc="Test models: {} vs {}".format(
        first_model_path.split("/")[-1],
        second_model_path.split("/")[-1]),
                   render_mode=render,
                   n_games=n_games)

    log.info("%s vs %s results: %s",
             first_model_path.split("/")[-1],
             second_model_path.split("/")[-1], coach.scoreboard.results)
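
Because `clash` only reads `ctx.obj`, it can also be exercised without the CLI. The sketch below assumes the function is callable as shown (no decorators appear in the snippet) and reuses `StubConfig` from the sketch after Example #1; in practice `Coach` will likely require a complete project config, and the checkpoint paths are placeholders.

from types import SimpleNamespace

# clash() only touches ctx.obj, so any object exposing an `obj` attribute
# with the project config can stand in for a click.core.Context here.
fake_ctx = SimpleNamespace(obj=StubConfig())  # a full config is likely needed in practice

clash(fake_ctx,
      first_model_path="checkpoints/best.ckpt",        # placeholder path
      second_model_path="checkpoints/candidate.ckpt",  # placeholder path
      render=True,
      n_games=10)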