def test_stockfish_player_vs_stockfish_player(self):
    winner, result, board = play_game(StockfishPlayer(depth=5), StockfishPlayer(depth=10))
    self.assertEqual(winner, 'Black')
    self.assertIn('checkmate', result)
    self.assertEqual(board.fen(), '6k1/5pp1/7p/1n1pb1qK/8/8/8/8 w - - 10 59')
def test_worker(args, shared_model, total_steps, optimizer):
    args.environment.clip_rewards = False
    env = make_env(args)
    log_path = '{}/{}'.format(args.train.experiment_path, 'log.txt')
    logging.basicConfig(filename=log_path, level=logging.INFO)
    logging.info("STARTED TRAINING PROCESS {}".format(
        time.strftime("%Y.%m.%d_%H:%M", time.localtime())))

    model = ActorCritic(env.observation_space.shape, env.action_space.n)
    model.eval()

    start_time = time.time()
    reward_history = []
    while True:
        # Pull the latest weights from the shared (trainer) model before evaluating.
        model.load_state_dict(shared_model.state_dict())
        if (len(reward_history) + 1) % args.train.save_frequency == 0:
            save_progress(args, model, optimizer, total_steps.value)

        total_reward, _ = play_game(model, env)
        reward_history.append(total_reward)

        log_message = "Time {}, num steps {}, FPS {:.0f}, curr episode reward {}, mean episode reward: {}".format(
            time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
            total_steps.value,
            total_steps.value / (time.time() - start_time),
            total_reward,
            np.mean(reward_history[-60:]),
        )
        print(log_message)
        logging.info(log_message)
        time.sleep(60)
def test_random_player_vs_random_player(self):
    winner, result, board = play_game(RandomPlayer(), RandomPlayer(), move_limit=100)
    self.assertIn(winner, [None, 'White', 'Black'])
    self.assertTrue('draw' in result or 'checkmate' in result)
    self.assertIsInstance(board, chess.Board)
def main(_):
    game = pyspiel.load_game("kuhn_poker")
    action_string = None
    env_configs = {"players": 2}
    env = rl_environment.Environment(game, **env_configs)
    num_actions = env.action_spec()["num_actions"]
    agents = [
        tabular_policy_from_csv(game, "./kuhn_policy.csv"),
    ]

    # Play FLAGS.games_to_play games with the trained agent against a
    # command-line player or a random player.
    # Define end player
    end_player = 1  # 0: Agent, 1: Other, (2: Ties)
    gains = [0, 0]
    wins = [0, 0, 0]
    for i in range(FLAGS.games_to_play):
        state = game.new_initial_state()
        time_step = env.reset()
        results = play_game(game, state, agents, end_player,
                            print_output=False, interactive=False)
        gains[0] += results[1 - end_player]
        gains[1] += results[end_player]
        if results[1 - end_player] > results[end_player]:
            wins[0] += 1
        elif results[1 - end_player] == results[end_player]:
            wins[2] += 1
        else:
            wins[1] += 1
        # Switch end player (P0 becomes P1 and vice versa).
        end_player = 1 - end_player

    for pid in range(game.num_players()):
        print("Final utility for Player {} is {}".format(pid, gains[pid]))
    print("\n")
    for pid in range(game.num_players()):
        print("Player {} won {} out of {} games".format(
            pid, wins[pid], FLAGS.games_to_play))
    print("With {} ties out of {} games".format(wins[2], FLAGS.games_to_play))
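# The FLAGS.games_to_play reference above assumes an absl-py flag defined at
# module level, roughly like the sketch below (the default value is a
# placeholder, not taken from the source):
from absl import app, flags

FLAGS = flags.FLAGS
flags.DEFINE_integer("games_to_play", 1000, "Number of evaluation games to play.")

if __name__ == "__main__":
    app.run(main)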
def test_worker(args, shared_model, total_steps, optimizer):
    args.environment.clip_rewards = False
    env = make_env(args.environment)
    log_path = '{}/{}'.format(args.train.experiment_folder, 'log.txt')
    logging.basicConfig(filename=log_path, level=logging.INFO)
    logging.info("STARTED TRAINING PROCESS {}".format(
        time.strftime("%Y.%m.%d_%H:%M", time.localtime())))

    # Build the base model, then stack on the UNREAL-style auxiliary-task
    # wrappers that are enabled in the config.
    model = ActorCritic(env.observation_space.shape, env.action_space.n)
    model = BaseWrapper(model)
    if args.train.use_pixel_control or args.train.use_reward_prediction:
        model = ExperienceWrapper(model)
    if args.train.use_pixel_control:
        model = PixelControlWrapper(model, args.train.gamma, args.train.pc_coef)
    if args.train.use_reward_prediction:
        model = RewardPredictionWrapper(model, args.train.rp_coef)
    if args.train.use_value_replay:
        model = ValueReplayWrapper(model)
    model.config = args
    model.eval()

    start_time = time.time()
    reward_history = []
    while True:
        model.load_state_dict(shared_model.state_dict())
        if (len(reward_history) + 1) % args.train.save_frequency == 0:
            save_progress(args, model, optimizer, total_steps.value)

        stats = play_game(model, env)
        reward_history.append(stats['total_reward'])

        log_message = (
            'Time {}, num steps {}, FPS {:.0f}, '
            'curr episode reward {:.2f}, mean episode reward: {:.2f}, '
            'mean policy loss {:.2f}, mean value loss {:.2f}, '
            'mean entropy percentage {:.2f}'
        ).format(
            time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
            total_steps.value,
            total_steps.value / (time.time() - start_time),
            stats['total_reward'],
            np.mean(reward_history[-60:]),
            stats['policy_loss'],
            stats['value_loss'],
            stats['entropy'],
        )
        if args.train.use_pixel_control:
            log_message += ', pixel control loss %.2f' % stats['pc_loss']
        if args.train.use_reward_prediction:
            log_message += ', reward prediction loss %.2f' % stats['rp_loss']
        if args.train.use_value_replay:
            log_message += ', value replay loss %.2f' % stats['vr_loss']
        print(log_message)
        logging.info(log_message)
        time.sleep(60)
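# The chained wrappers above follow a decorator pattern: each wrapper holds
# the model built so far, forwards to it, and layers its own weighted
# auxiliary loss on top. A minimal sketch of that pattern; the class and
# method names here are illustrative only, the real wrappers' internals are
# not shown in this snippet.
import torch.nn as nn

class AuxiliaryWrapper(nn.Module):  # hypothetical name, for illustration
    def __init__(self, model, coef=1.0):
        super().__init__()
        self.model = model
        self.coef = coef

    def forward(self, *args, **kwargs):
        # Delegate the forward pass to the wrapped model unchanged.
        return self.model(*args, **kwargs)

    def loss(self, batch):
        # Total loss = wrapped model's loss + this wrapper's weighted term.
        return self.model.loss(batch) + self.coef * self.auxiliary_loss(batch)

    def auxiliary_loss(self, batch):
        raise NotImplementedError  # e.g. pixel control, reward prediction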
env = wrap_deepmind(gym.make(param['env']), frame_stack=True)
dqn = model.DQN(num_actions=env.action_space.n).to(device)
target_dqn = copy.deepcopy(dqn)

# Epsilon-greedy wrapper around the network: exploit with probability 1 - th,
# otherwise sample a random action.
def dqn_epsilon_agent(state, net=dqn, th=0.05):
    if random.random() > th:
        yhat = net(default_states_preprocessor(state))
        return int(yhat.argmax().cpu().numpy())
    else:
        return env.action_space.sample()

optimizer = optim.Adam(dqn.parameters(), lr=param['lr'])

# Warm up the replay buffer before training starts.
# (eps, memory, param and device are defined earlier in the script.)
for _ in range(5):
    game = utils.play_game(env, agent=dqn_epsilon_agent, th=eps.get(0), memory=memory)

step = 0
metrics = {}
metrics['episode'] = 0
while True:
    metrics['episode'] += 1

    ## PLAY GAME
    metrics['epsilon'] = eps.get(step)
    game = utils.play_game(env, agent=dqn_epsilon_agent, th=metrics['epsilon'], memory=memory)
    metrics['run_reward'], metrics['run_episode_steps'] = game['cum_reward'], game['steps']
    step += metrics['run_episode_steps']

    ## TRAIN
    for _ in range(metrics['run_episode_steps'] // param['batch_size']):
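        # The source is truncated at this point. A standard DQN TD update is
        # sketched below as the loop body; the memory.sample return layout,
        # param['gamma'], and `import torch` at the top of the script are
        # assumptions, not shown in the snippet.
        states, actions, rewards, dones, next_states = memory.sample(param['batch_size'])
        q = dqn(states).gather(1, actions.unsqueeze(1)).squeeze(1)
        with torch.no_grad():
            next_q = target_dqn(next_states).max(1)[0]
            target = rewards + param['gamma'] * next_q * (1.0 - dones)
        loss = torch.nn.functional.mse_loss(q, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # Periodically syncing the target network is also assumed, e.g.:
    # target_dqn.load_state_dict(dqn.state_dict())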
def test_stockfish_player_vs_random_player(self):
    winner, result, board = play_game(StockfishPlayer(depth=5), RandomPlayer())
    self.assertEqual(winner, 'White')
    self.assertIn('checkmate', result)
    self.assertIsInstance(board, chess.Board)
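# The three tests above pin down play_game's contract: it takes a white and a
# black player plus an optional move_limit, and returns (winner, result, board)
# with winner in {None, 'White', 'Black'}. A minimal sketch of such a helper
# using python-chess; the player interface (a .move(board) method) is an
# assumption, not confirmed by the tests.
import chess

def play_game(white, black, move_limit=None):
    board = chess.Board()
    players = {chess.WHITE: white, chess.BLACK: black}
    while not board.is_game_over():
        if move_limit is not None and board.fullmove_number > move_limit:
            return None, 'draw by move limit', board
        board.push(players[board.turn].move(board))
    if board.is_checkmate():
        # The side to move at game over is the side that was mated.
        winner = 'Black' if board.turn == chess.WHITE else 'White'
        return winner, 'win by checkmate', board
    return None, 'draw', board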
# Train a DQN agent on the first Gridworld layout, then roll it out.
agent, _, _ = trainingDQN.trainDQN(
    file_name="env1",
    env=GridworldEnv(1),
    batch_size=128,
    gamma=0.999,
    eps_start=0.9,
    eps_end=0.05,
    eps_decay=1000,
    is_plot=False,
    num_episodes=500,
    max_num_steps_per_episode=1000,
    learning_rate=0.0001,
    memory_replay_size=10000,
)
play_game(GridworldEnv(1), agent)
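# play_game here presumably rolls the trained policy out in the environment.
# A minimal sketch of such a helper, assuming a gym-style env and a greedy
# action method on the agent; both interfaces are assumptions, the real
# helper is not shown:
def play_game(env, agent, max_steps=1000):
    state = env.reset()
    total_reward = 0.0
    for _ in range(max_steps):
        action = agent.select_action(state)  # hypothetical method name
        state, reward, done, _ = env.step(action)
        total_reward += reward
        if done:
            break
    return total_reward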
import pandas as pd
from tqdm import tqdm

from utils import play_game
# DeepQLearningAgent, NoviceAgent and TicTacToe come from project modules
# whose imports are not shown in this snippet.

board_size = 3
version = 'v03'
log_frequency = 500
episodes = 100000

agent = DeepQLearningAgent(board_size, use_target_net=True, buffer_size=10000)
agent_random = NoviceAgent(board_size)
env = TicTacToe()

# Cold-start problem: seed the replay buffer with some fully random games
# before the first training step.
_, _ = play_game(agent, agent_random, env, epsilon=1, n_games=100, record=True)
# Train on those games.
agent.train_agent()

# Main training procedure.
win_counts = {'win': 0, 'lose': 0, 'draw': 0}
loss_history = []
reward_history = []
epsilon = 0.9
decay = 0.99
epsilon_end = 0.01

# Training loop.
model_logs = {'iteration': [], 'reward_mean': [], 'reward_dev': [],
              'wins': [], 'draws': [], 'loss': []}
for index in tqdm(range(episodes)):
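    # The source is truncated here. A plausible loop body, inferred from the
    # variables set up above (epsilon decay, model_logs, log_frequency); the
    # exact return values of play_game and train_agent are assumptions.
    rewards, _ = play_game(agent, agent_random, env, epsilon=epsilon,
                           n_games=1, record=True)
    reward_history.append(rewards)
    loss = agent.train_agent()
    loss_history.append(loss)
    epsilon = max(epsilon * decay, epsilon_end)
    if (index + 1) % log_frequency == 0:
        model_logs['iteration'].append(index + 1)
        model_logs['loss'].append(sum(loss_history[-log_frequency:]) / log_frequency)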