def self_play(ctx):
    """Train by self-play, retraining from self-played frames and replacing the best
    player whenever a newly trained player beats the currently best player.

    Args:
        ctx (click.core.Context): context object.

        Parameters for training:

            * 'game'                     (string) : game name. (Default: tictactoe)
            * 'max_iter'                 (int)    : number of train process iterations. (Default: -1)
            * 'min_examples'             (int)    : minimum number of examples required to start
                                                    training the nn; if -1, no threshold. (Default: -1)
            * 'policy_warmup'            (int)    : how many stochastic warmup steps the deterministic
                                                    policy should take. (Default: 12)
            * 'n_self_plays'             (int)    : number of self-played episodes. (Default: 100)
            * 'n_tournaments'            (int)    : number of tournament episodes. (Default: 20)
            * 'save_checkpoint_folder'   (string) : folder to save best models. (Default: "checkpoints")
            * 'save_checkpoint_filename' (string) : filename of best model. (Default: "best")
            * 'save_self_play_log_path'  (string) : where to save self-play logs.
                                                    (Default: "./logs/self-play.log")
            * 'save_tournament_log_path' (string) : where to save tournament logs.
                                                    (Default: "./logs/tournament.log")
            * 'update_threshold'         (float)  : required threshold to become the new best player.
                                                    (Default: 0.55)
    """

    cfg = ctx.obj
    coach = Coach(cfg)

    # Create TensorBoard logger
    tb_logger = TensorBoardLogger(utils.create_tensorboard_log_dir(
        cfg.logging['tensorboard_log_folder'], 'score'))

    iteration = coach.global_epoch // cfg.training['epochs']
    while cfg.self_play["max_iter"] == -1 or iteration < cfg.self_play["max_iter"]:
        iter_counter_str = "{:03d}/{:03d}".format(iteration + 1, cfg.self_play["max_iter"]) \
            if cfg.self_play["max_iter"] > 0 else "{:03d}/inf".format(iteration + 1)

        coach.play("Self-play " + iter_counter_str)

        # Proceed to training only if the minimum-examples threshold is fulfilled
        if len(coach.storage.big_bag) <= cfg.self_play["min_examples"]:
            log.warning("Skipping training, gather at least %d training examples first!",
                        cfg.self_play["min_examples"])
            continue

        coach.train()
        coach.evaluate("Tournament " + iter_counter_str, tournament_mode=True)

        # Log current player's score
        tb_logger.log_scalar("Best score", coach.best_score, iteration)

        # Increment iteration counter
        iteration += 1
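
# Illustrative sketch only: the exact type of `cfg` (the click context object) is
# project-specific. This hypothetical stand-in merely shows the shape of the self-play
# settings read above, with names and defaults taken from the docstring.
_EXAMPLE_SELF_PLAY_CFG = {
    'game': 'tictactoe',                        # game name
    'max_iter': -1,                             # -1 -> iterate until interrupted
    'min_examples': -1,                         # -1 -> no minimum-examples threshold
    'policy_warmup': 12,                        # stochastic warmup steps for the policy
    'n_self_plays': 100,                        # self-played episodes per iteration
    'n_tournaments': 20,                        # tournament episodes per evaluation
    'save_checkpoint_folder': 'checkpoints',
    'save_checkpoint_filename': 'best',
    'save_self_play_log_path': './logs/self-play.log',
    'save_tournament_log_path': './logs/tournament.log',
    'update_threshold': 0.55,                   # win rate required to become the new best player
}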
def human_play(ctx, model_path, n_games):
    """Play `n_games` against a trained model.

    Args:
        ctx (click.core.Context): context object.
        model_path (string): Path to trained model.
        n_games (int): Number of games to play.
    """

    cfg = ctx.obj
    coach = Coach(cfg, model_path)
    coach.current_mind.players[1] = HumanPlayer(cfg.mdp)
    coach.eval_callbacks.append(BoardRender(cfg.env, render=True, fancy=True))
    coach.scoreboard = Tournament()

    coach.evaluate(desc="Test models: Human vs. {}".format(model_path.split("/")[-1]),
                   n_games=n_games)

    log.info("Human vs. %s results: %s",
             model_path.split("/")[-1], coach.scoreboard.results)
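
# Illustrative sketch only: assuming `human_play` is registered as a click command
# (decorators are not shown in this section), another command could forward to it with
# click's standard `Context.invoke`; the model path below is a placeholder.
#
#     ctx.invoke(human_play, model_path="path/to/model", n_games=2)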
def clash(ctx, first_model_path, second_model_path, render, n_games):
    """Test two models against each other by playing `n_games` between them.

    Args:
        ctx (click.core.Context): context object.
        first_model_path (string): Path to player one model.
        second_model_path (string): Path to player two model.
        render: Render mode for evaluation games.
        n_games (int): Number of games to play.
    """

    cfg = ctx.obj
    coach = Coach(cfg, current_ckpt=first_model_path, best_ckpt=second_model_path)
    coach.scoreboard = Tournament()

    coach.evaluate(desc="Test models: {} vs {}".format(first_model_path.split("/")[-1],
                                                       second_model_path.split("/")[-1]),
                   render_mode=render, n_games=n_games)

    log.info("%s vs %s results: %s",
             first_model_path.split("/")[-1], second_model_path.split("/")[-1],
             coach.scoreboard.results)
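
# Illustrative sketch only: a hypothetical head-to-head between two saved checkpoints,
# again assuming `clash` is exposed as a click command; both paths are placeholders.
#
#     ctx.invoke(clash, first_model_path="path/to/best_model",
#                second_model_path="path/to/candidate_model",
#                render=False, n_games=10)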