import os
from itertools import product
from shutil import copyfile

import numpy as np

# NOTE: the imports above are the ones this excerpt appears to rely on
# (product, np, os, copyfile). Project-local helpers (get_player,
# get_strategy_types, get_other_player, Board, Tiles,
# RepresentationGenerator, generate_debug_visualisation,
# set_optimizer_params, ...) are assumed to be imported elsewhere.


def initialise_rule_based_players(self):
    # One fixed opponent per rule-based strategy, used for evaluation.
    self.players = {}
    for strat in self.strategy_types:
        self.players[strat] = get_player("computer", None, strat)
    # Separate per-seat pools so opponents for players 1 and 2 can be
    # drawn independently.
    self.players_rand = {1: [], 2: []}
    for p, strat in product([1, 2], self.strategy_types):
        self.players_rand[p].append(get_player("computer", None, strat))
def play_game(gameplay, params_1, params_2):
    board, tiles = Board(), Tiles()
    player_1 = get_player(params_1["player_type"], board,
                          params_1["strategy_type"], params=params_1)
    player_2 = get_player(params_2["player_type"], board,
                          params_2["strategy_type"], params=params_2)
    winner, _ = gameplay.play_test_game(player_1, player_2)
    return winner
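# A minimal usage sketch for play_game above, assuming a gameplay object
# that exposes play_test_game (as called above). The "Gameplay" name is a
# hypothetical placeholder; only the "player_type"/"strategy_type" keys and
# the "random" strategy name are grounded in this excerpt:
#
#     gameplay = Gameplay()  # hypothetical constructor
#     params_1 = {"player_type": "computer", "strategy_type": "random"}
#     params_2 = {"player_type": "computer", "strategy_type": "random"}
#     winner = play_game(gameplay, params_1, params_2)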
def initialise_game(self, player_to_start):
    super().initialise_game()
    strat_1, strat_2 = get_strategy_types(self.params)
    self.players[1] = get_player(self.params[1]["player_type"], self.board,
                                 strat_1, params=self.params[1])
    self.players[2] = get_player(self.params[2]["player_type"], self.board,
                                 strat_2, params=self.params[2])
    # turn_of is seeded with the *other* player; the surrounding game loop
    # appears to switch turns before the first move, so player_to_start
    # ends up with the opening move.
    self.turn_of = get_other_player(player_to_start)
    self.other = get_other_player(self.turn_of)
    self.representation = RepresentationGenerator(self.params)
def initialise_game(self, _):
    super().initialise_game()
    strat_1, strat_2 = get_strategy_types(self.params)
    self.players[1] = get_player(self.params[1]["player_type"], self.board,
                                 strat_1, params=self.params[1])
    self.players[2] = get_player(self.params[2]["player_type"], self.board,
                                 strat_2, params=self.params[2])
    # The requested starter is ignored; pick the opening player at random.
    self.turn_of = np.random.choice([1, 2])
    self.other = get_other_player(self.turn_of)
    # Both players draw their initial tiles.
    self.players[self.other].pick_up(self.tiles)
    self.players[self.turn_of].pick_up(self.tiles)
    self.representation = RepresentationGenerator()
def play_game(gameplay, params):
    strat_1, strat_2 = get_strategy_types(params)
    player_1 = get_player(params[1]["player_type"], None, strat_1,
                          params=params[1])
    player_2 = get_player(params[2]["player_type"], None, strat_2,
                          params=params[2])
    winner, _ = gameplay.play_test_game(player_1, player_2)
    return winner
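# Usage sketch for this single-dict variant: params is keyed by player
# number, matching the params[1]/params[2] lookups above, and
# get_strategy_types(params) is assumed to pull both per-player strategies
# out of the same dict. The strategy values shown are illustrative:
#
#     params = {
#         1: {"player_type": "computer", "strategy_type": "rl"},
#         2: {"player_type": "computer", "strategy_type": "random"},
#     }
#     winner = play_game(gameplay, params)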
def train(self):
    self.initialise_rule_based_players()

    # Two RL players that generate self-play data, plus a separate player
    # used only for evaluation.
    self.training_p1 = get_player(
        "computer", None, "rl",
        params={"max_eval_batch_size": self.p.max_eval_batch_size})
    self.training_p2 = get_player(
        "computer", None, "rl",
        params={"max_eval_batch_size": self.p.max_eval_batch_size})
    self.training_p1.strategy.set_model(self.get_new_network())
    self.training_p2.strategy.set_model(self.get_new_network())

    self.test_player = get_player(
        "computer", None, "rl",
        params={"max_eval_batch_size": self.p.max_eval_batch_size})
    self.test_player.strategy.set_model(self.get_new_network())

    if self.p.restore_ckpt_dir is not None:
        print("Loading checkpoint")
        load_ckpt_path = os.path.join(self.p.restore_ckpt_dir, "latest.pth")
        self.load_model(load_ckpt_path)
        self.load_optimiser(self.p.restore_ckpt_dir)
        set_optimizer_params(self.optimizer, lr=self.lr_tracker,
                             weight_decay=self.p.weight_decay)
    self.save_model(self.latest_ckpt_path)

    # The self-play opponents start from the best self-play checkpoint when
    # restoring, otherwise from the freshly saved initial model.
    if self.p.restore_ckpt_dir is not None:
        load_ckpt_path = os.path.join(self.p.restore_ckpt_dir, "best_self.pth")
    else:
        load_ckpt_path = self.latest_ckpt_path
    self.training_p1.strategy.load_model(load_ckpt_path)
    self.training_p2.strategy.load_model(load_ckpt_path)

    # Seed the replay buffer: with the restored RL players if available,
    # otherwise with random players.
    if self.p.restore_ckpt_dir is not None:
        p1, p2 = self.training_p1, self.training_p2
    else:
        p1 = get_player("computer", None, "random")
        p2 = get_player("computer", None, "random")
    self.fill_replay_buffer(p1, p2)
    self.add_n_games_to_replay_buffer(self.training_p1, self.training_p2, 2)

    self.net = self.net.to(self.device)
    running_loss, running_error = 0.0, 0.0
    best_win_rate_rule = 0.0
    self.training_finished = False
    self.add_graph_to_logs()

    print("Start training")
    while not self.training_finished:
        print("Step", self.current_step)
        self.add_n_games_to_replay_buffer(
            self.training_p1, self.training_p2, self.p.episodes_per_step)
        avg_loss, abs_error, vis_inputs = self.apply_n_learning_updates(
            self.p.updates_per_step)
        running_loss += avg_loss
        running_error += abs_error

        if self.current_step > 0 and self.current_step % int(
                self.p.log_every_n_steps) == 0:
            avg_running_loss = running_loss / float(self.p.log_every_n_steps)
            mean_abs_error = running_error / float(self.p.log_every_n_steps)
            self.write_metrics_to_tensorboard(avg_running_loss,
                                              mean_abs_error)
            running_loss, running_error = 0.0, 0.0

        if self.current_step % int(self.p.vis_every_n_steps) == 0:
            vis_figs = generate_debug_visualisation(vis_inputs)
            self.writer.add_figure('examples', vis_figs,
                                   global_step=self.current_step)

        if self.current_step % int(self.p.test_every_n_steps) == 0:
            self.save_model(self.latest_ckpt_path)
            copyfile(self.latest_ckpt_path,
                     os.path.join(self.logs_dir,
                                  f"ckpt-{self.current_step}.pth"))
            self.test_player.strategy.load_model(self.latest_ckpt_path)
            self.save_optimiser()

            # Evaluate the latest model against the current self-play
            # opponent.
            print(f"Playing {self.p.n_test_games} test games against self")
            self_win_rate, _, _ = self.play_n_test_games(
                self.test_player, self.training_p1, self.p.n_test_games,
                learn=False)
            self.writer.add_scalar('win_rates/rl', self_win_rate,
                                   self.current_step)
            print(f"Win rate: {self_win_rate:.2f}")
            if self_win_rate > self.p.improvement_threshold:
                # Promote the latest model to be the new self-play opponent.
                print("Best self model improved!")
                self.training_p1.strategy.load_model(self.latest_ckpt_path)
                self.training_p2.strategy.load_model(self.latest_ckpt_path)
                copyfile(self.latest_ckpt_path, self.best_self_ckpt_path)
                self.best_self_model_step = self.current_step

            # Evaluate against each rule-based strategy; the summed win rate
            # decides whether the "best rule" checkpoint improved.
            win_rate_rule = 0.0
            for strat in self.strategy_types:
                print(f"Playing {self.p.n_other_games} test games "
                      f"against {strat}")
                win_rate, _, _ = self.play_n_test_games(
                    self.test_player, self.players[strat],
                    self.p.n_other_games, learn=False)
                self.writer.add_scalar(f'win_rates/{strat}', win_rate,
                                       self.current_step)
                print(f"Win rate: {win_rate:.2f}")
                win_rate_rule += win_rate
            if win_rate_rule >= best_win_rate_rule:
                best_win_rate_rule = win_rate_rule
                print("Best rule model improved!")
                copyfile(self.latest_ckpt_path, self.best_rule_ckpt_path)
                self.best_rule_model_step = self.current_step

            self.save_training_state()

        self.step_learning_rate_scheduling()
        self.current_step += 1

    print(f"Training finished after {self.current_step} steps...")
    print(f"Final best self model was at step {self.best_self_model_step}")
    print(f"Final best rule model was at step {self.best_rule_model_step}")
    self.writer.close()
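# A minimal driver sketch for train(). "Trainer" is a hypothetical name for
# the class these methods belong to; the only grounded requirement is that
# its hyperparameter object `p` carries the fields read above
# (episodes_per_step, updates_per_step, log_every_n_steps,
# vis_every_n_steps, test_every_n_steps, n_test_games, n_other_games,
# improvement_threshold, max_eval_batch_size, restore_ckpt_dir, ...):
#
#     trainer = Trainer(params)  # hypothetical constructor
#     trainer.train()            # runs until self.training_finished is set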