Example #1
    def test_stockfish_player_vs_stockfish_player(self):
        winner, result, board = play_game(StockfishPlayer(depth=5),
                                          StockfishPlayer(depth=10))
        self.assertEqual(winner, 'Black')
        self.assertTrue('checkmate' in result)
        self.assertEqual(board.fen(),
                         '6k1/5pp1/7p/1n1pb1qK/8/8/8/8 w - - 10 59')
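Example #1 (like the chess tests in Examples #3 and #7 below) only shows play_game from the caller's side. The following is a minimal sketch of a compatible implementation built on python-chess; it is an assumption inferred from the tests, not the original project's code, and the players' get_move method is likewise assumed.

import chess

def play_game(white_player, black_player, move_limit=None):
    # Hypothetical sketch inferred from the tests above; not the original implementation.
    board = chess.Board()
    players = (white_player, black_player)
    moves = 0
    while not board.is_game_over():
        if move_limit is not None and moves >= move_limit:
            return None, 'draw (move limit reached)', board
        move = players[moves % 2].get_move(board)  # get_move() is an assumed player API
        board.push(move)
        moves += 1
    if board.is_checkmate():
        winner = 'White' if board.result() == '1-0' else 'Black'
        return winner, 'checkmate', board
    return None, 'draw', board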
Example #2
def test_worker(args, shared_model, total_steps, optimizer):
    args.environment.clip_rewards = False
    env = make_env(args)

    log_path = '{}/{}'.format(args.train.experiment_path, 'log.txt')
    logging.basicConfig(filename=log_path, level=logging.INFO)
    logging.info("STARTED TRAINING PROCESS {}".format(
        time.strftime("%Y.%m.%d_%H:%M", time.localtime())))

    model = ActorCritic(env.observation_space.shape, env.action_space.n)
    model.eval()

    start_time = time.time()

    reward_history = []
    while True:
        model.load_state_dict(shared_model.state_dict())
        if (len(reward_history) + 1) % args.train.save_frequency == 0:
            save_progress(args, model, optimizer, total_steps.value)
        total_reward, _ = play_game(model, env)
        reward_history.append(total_reward)

        log_message = "Time {}, num steps {}, FPS {:.0f}, curr episode reward {}, mean episode reward: {}".format(
            time.strftime("%Hh %Mm %Ss",
                          time.gmtime(time.time() - start_time)),
            total_steps.value,
            total_steps.value / (time.time() - start_time),
            total_reward,
            np.mean(reward_history[-60:]),
        )
        print(log_message)
        logging.info(log_message)
        time.sleep(60)
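In this worker, play_game(model, env) is consumed only for the first element of its return value. A minimal sketch of a compatible rollout helper follows; the greedy action selection, the (logits, value) output of the model, and the gym-style reset/step API are assumptions, not the original project's code.

import torch

def play_game(model, env):
    # Assumed rollout: run one episode greedily and return (total_reward, steps).
    obs = env.reset()
    total_reward, steps, done = 0.0, 0, False
    while not done:
        with torch.no_grad():
            logits, _value = model(torch.as_tensor(obs, dtype=torch.float32).unsqueeze(0))
        action = int(logits.argmax(dim=-1))
        obs, reward, done, _info = env.step(action)
        total_reward += reward
        steps += 1
    return total_reward, steps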
Example #3
    def test_random_player_vs_random_player(self):
        winner, result, board = play_game(RandomPlayer(),
                                          RandomPlayer(),
                                          move_limit=100)

        self.assertIn(winner, [None, 'White', 'Black'])
        self.assertTrue('draw' in result or 'checkmate' in result)
        self.assertIsInstance(board, chess.Board)
Example #4
def main(_):
    game = pyspiel.load_game("kuhn_poker")
    action_string = None

    env_configs = {"players": 2}
    env = rl_environment.Environment(game, **env_configs)
    num_actions = env.action_spec()["num_actions"]

    agents = [
        tabular_policy_from_csv(game, "./kuhn_policy.csv"),
    ]

    # Play a number of games with the trained agent against
    # a command-line player or a random player

    # Define end player
    end_player = 1

    # 0: Agent, 1: Other, (2: Ties)
    gains = [0, 0]
    wins = [0, 0, 0]

    for i in range(FLAGS.games_to_play):
        state = game.new_initial_state()
        time_step = env.reset()

        results = play_game(game,
                            state,
                            agents,
                            end_player,
                            print_output=False,
                            interactive=False)

        gains[0] += results[1 - end_player]
        gains[1] += results[end_player]

        if results[1 - end_player] > results[end_player]:
            wins[0] += 1
        elif results[1 - end_player] == results[end_player]:
            wins[2] += 1
        else:
            wins[1] += 1

        # Switch the end player (P0 becomes P1 and vice versa)
        end_player = 1 - end_player

    for pid in range(game.num_players()):
        print("Final utility for Player {} is {}".format(pid, gains[pid]))

    print("\n")

    for pid in range(game.num_players()):
        print("Player {} won {} out of {} games".format(
            pid, wins[pid], FLAGS.games_to_play))

    print("With {} ties out of {} games".format(wins[2], num_games))
Example #5
def test_worker(args, shared_model, total_steps, optimizer):
    args.environment.clip_rewards = False
    env = make_env(args.environment)

    log_path = '{}/{}'.format(args.train.experiment_folder, 'log.txt')
    logging.basicConfig(filename=log_path, level=logging.INFO)
    logging.info("STARTED TRAINING PROCESS {}".format(time.strftime("%Y.%m.%d_%H:%M", time.localtime())))

    model = ActorCritic(env.observation_space.shape, env.action_space.n)
    model = BaseWrapper(model)
    if (args.train.use_pixel_control or
            args.train.use_reward_prediction):
        model = ExperienceWrapper(model)
    if args.train.use_pixel_control:
        model = PixelControlWrapper(model, args.train.gamma, args.train.pc_coef)
    if args.train.use_reward_prediction:
        model = RewardPredictionWrapper(model, args.train.rp_coef)
    if args.train.use_value_replay:
        model = ValueReplayWrapper(model)
    model.config = args
    model.eval()

    start_time = time.time()

    reward_history = []
    while True:
        model.load_state_dict(shared_model.state_dict())
        if (len(reward_history) + 1) % args.train.save_frequency == 0:
            save_progress(args, model, optimizer, total_steps.value)
        stats = play_game(model, env)
        reward_history.append(stats['total_reward'])

        log_message = (
                'Time {}, num steps {}, FPS {:.0f}, '+
                'curr episode reward {:.2f}, mean episode reward: {:.2f}, '+
                'mean policy loss {:.2f}, mean value loss {:.2f}, '+
                'mean entropy percentage {:.2f}'
            ).format(
            time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
            total_steps.value,
            total_steps.value / (time.time() - start_time),
            stats['total_reward'],
            np.mean(reward_history[-60:]),
            stats['policy_loss'],
            stats['value_loss'],
            stats['entropy']
        )
        if args.train.use_pixel_control:
            log_message += ', pixel control loss %.2f' %stats['pc_loss']
        if args.train.use_reward_prediction:
            log_message += ', reward prediction loss %.2f' %stats['rp_loss']
        if args.train.use_value_replay:
            log_message += ', value replay loss %.2f' %stats['vr_loss']
        print(log_message)
        logging.info(log_message)
        time.sleep(60)
Example #6
    env = wrap_deepmind(gym.make(param['env']), frame_stack = True)
    dqn = model.DQN(num_actions = env.action_space.n).to(device)
    target_dqn = copy.deepcopy(dqn)
    
    def dqn_epsilon_agent(state, net = dqn, th = 0.05):
        if random.random() > th:
            yhat = net(default_states_preprocessor(state))
            return int(yhat.argmax().cpu().numpy())
        else:
            return env.action_space.sample()

    optimizer = optim.Adam(dqn.parameters(), lr = param['lr'])

    # Warmup buffer
    for _ in range(5):
        game = utils.play_game(env, agent = dqn_epsilon_agent, th = eps.get(0), memory = memory)

    step = 0
    metrics = {}
    metrics['episode'] = 0
    while True:
        metrics['episode'] += 1

        ## PLAY GAME
        metrics['epsilon'] = eps.get(step)
        game = utils.play_game(env, agent = dqn_epsilon_agent, th = metrics['epsilon'], memory = memory)
        metrics['run_reward'], metrics['run_episode_steps'] = game['cum_reward'], game['steps']
        step += metrics['run_episode_steps']

        ## TRAIN
        for _ in range(metrics['run_episode_steps']//param['batch_size']):
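The snippet above breaks off inside the ## TRAIN loop. Below is a hypothetical continuation of that block, assuming torch is imported at module level (the surrounding code already uses it), that memory.sample() returns torch tensors in the layout shown, and that the target network is refreshed on a fixed episode interval; none of this is taken from the original code.

            # Hypothetical training step (assumed memory.sample() batch layout):
            states, actions, rewards, next_states, dones = memory.sample(param['batch_size'])
            q_values = dqn(states).gather(1, actions.long().unsqueeze(1)).squeeze(1)
            with torch.no_grad():
                next_q = target_dqn(next_states).max(dim=1).values
                # dones assumed to be a float tensor of 0/1 flags
                targets = rewards + param['gamma'] * next_q * (1 - dones)
            loss = torch.nn.functional.smooth_l1_loss(q_values, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Assumed: refresh the target network every few episodes
        if metrics['episode'] % 10 == 0:
            target_dqn.load_state_dict(dqn.state_dict())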
Example #7
    def test_stockfish_player_vs_random_player(self):
        winner, result, board = play_game(StockfishPlayer(depth=5),
                                          RandomPlayer())
        self.assertEqual(winner, 'White')
        self.assertTrue('checkmate' in result)
        self.assertIsInstance(board, chess.Board)
Example #8
# trainingDQN.trainDQN(file_name="env1",
#                     env=GridworldEnv(1),
#                     batch_size=128,
#                     gamma=0.999,
#                     eps_start=0.9,
#                     eps_end=0.05,
#                     eps_decay=1000,
#                     is_plot=True,
#                     num_episodes=500,
#                     max_num_steps_per_episode=1000,
#                     learning_rate=0.0001,
#                     memory_replay_size=10000,
#                 )

agent, _, _ = trainingDQN.trainDQN(
    file_name="env1",
    env=GridworldEnv(1),
    batch_size=128,
    gamma=0.999,
    eps_start=0.9,
    eps_end=0.05,
    eps_decay=1000,
    is_plot=False,
    num_episodes=500,
    max_num_steps_per_episode=1000,
    learning_rate=0.0001,
    memory_replay_size=10000,
)

play_game(GridworldEnv(1), agent)
Example #9
# %matplotlib inline
# from IPython.display import clear_output
import pandas as pd
from utils import play_game
from tqdm import tqdm

board_size = 3
version = 'v03'
log_frequency = 500
episodes = 100000

agent = DeepQLearningAgent(board_size, use_target_net=True, buffer_size=10000)
agent_random = NoviceAgent(board_size)
env = TicTacToe()
# cold start problem, add some games to buffer for training
_, _ = play_game(agent, agent_random, env, epsilon=1, n_games=100, record=True)
# train on those games
agent.train_agent()

# main training procedure
win_counts = {'win':0, 'lose':0, 'draw':0}
loss_history = []
reward_history = []

epsilon = 0.9
decay = 0.99
epsilon_end = 0.01

# training loop
model_logs = {'iteration':[], 'reward_mean':[], 'reward_dev':[], 'wins':[],  'draws':[], 'loss':[]}
for index in tqdm(range(episodes)):
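The excerpt ends at the top of the training loop. The following is a rough sketch of how the body of that loop could continue, with the play_game keyword usage mirrored from the warm-up call above; the epsilon decay schedule, the return-value handling, and the logging are assumptions rather than the original code.

    # Hypothetical loop body; the exact agent/env API is an assumption.
    epsilon = max(epsilon * decay, epsilon_end)
    # play one recorded game against the novice opponent, then train on the buffer
    rewards, _ = play_game(agent, agent_random, env, epsilon=epsilon, n_games=1, record=True)
    reward_history.append(rewards)
    loss_history.append(agent.train_agent())

    if (index + 1) % log_frequency == 0:
        recent = reward_history[-log_frequency:]
        print('episode {}: mean reward {:.3f}, epsilon {:.3f}'.format(
            index + 1, sum(recent) / len(recent), epsilon))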