Example #1
    def arena(self, agent1, agent2, mcts_args, games_to_play=10):
        mcts1 = MCTS(agent1, mcts_args)
        mcts2 = MCTS(agent2, mcts_args)
        results = []

        for i in range(games_to_play):  # could be wrapped in tqdm() for a progress bar
            # Alternate which agent moves first from game to game.
            if i % 2 == 0:
                player1 = mcts1
                player2 = mcts2
            else:
                player1 = mcts2
                player2 = mcts1

            env = TicTacToeEnv()

            done = False
            while not done:
                first_player_move = env.fpt  # True when the first player is to move
                if first_player_move:
                    probs = player1.getProbs(env, temp=0)  # temp=0 -> greedy play
                else:
                    probs = player2.getProbs(env, temp=0)

                # Sample a flat action index and convert it to (row, col).
                action = np.random.choice(probs.shape[0], p=probs.reshape(-1,))
                _, reward, done, _ = env.step((action // env.size, action % env.size))
                if reward == -1:
                    print('Repeated move!')
                if done:
                    # Record the result from player1's (the first mover's) perspective.
                    results.append(reward if first_player_move else -reward)
        return results
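A minimal follow-up sketch, not part of the original example, for summarizing the list that arena() returns; it assumes the even/odd alternation above, so agent1 is the first mover (player1) in even-numbered games.

def summarize_arena(results):
    # Flip odd-numbered games so every score is expressed from agent1's perspective.
    scores = [r if i % 2 == 0 else -r for i, r in enumerate(results)]
    wins = sum(1 for s in scores if s > 0)
    losses = sum(1 for s in scores if s < 0)
    draws = len(scores) - wins - losses
    return wins, draws, losses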
Example #2
def main():
    log_dir = '/Users/adam/Documents/projects/td_tic_tac_toe/log/leaf2'
    env = TicTacToeEnv()
    model = ValueModel(env.feature_vector_size, 100)
    # agent = SimpleAgent('agent_0', model, env)
    # agent = TDAgent('agent_0', model, env)
    # agent = ForwardAgent('agent_0', model, env)
    # agent = BackwardAgent('agent_0', model, env)
    agent = LeafAgent('agent_0', model, env)
    human = HumanAgent(env)

    with tf.train.SingularMonitoredSession(checkpoint_dir=log_dir) as sess:
        agent.sess = sess
        env.sess = sess
        players = [human, agent]
        env.play(players, verbose=True)
Example #3
def learn_on_policy(episodes, epsilon=0.1, discount_factor=0.9):
    env = TicTacToeEnv()
    agents = [MCOPA('O', epsilon), MCOPA('X', epsilon)]

    start_mark = 'O'
    env.set_start_mark(start_mark)
    for i in range(episodes):
        episode = i + 1
        state = env.reset()
        _, mark = state
        steps = []
        done = False
        while not done:
            agent = agent_by_mark(agents, mark)
            actions = env.available_actions()
            action = agent.act(state, actions)
            next_state, reward, done, _ = env.step(action)
            steps.append((state, reward))
            if done:
                break
            _, mark = state = next_state

        steps.reverse()
        G = 0
        # Within one tic-tac-toe episode every state is unique, so repeated
        # visits do not need to be checked for.
        for step in steps:
            _, mark = step[0]
            agent = agent_by_mark(agents, mark)
            G = step[1] + discount_factor * G
            agent.update(step[0], G)

        # rotate start
        start_mark = next_mark(start_mark)
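For context, a hedged sketch of what MCOPA.update(state, G) presumably does: an every-visit Monte Carlo running average of the observed returns. The attribute names below (values, visit_counts) are assumptions, not taken from the source.

# Hypothetical MCOPA.update: incremental every-visit Monte Carlo average.
def update(self, state, G):
    self.visit_counts[state] = self.visit_counts.get(state, 0) + 1
    v = self.values.get(state, 0.0)
    self.values[state] = v + (G - v) / self.visit_counts[state]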
Example #4
    def play_game(self, agent):
        train_samples = []
        episode_step = 0
        env = TicTacToeEnv()

        while True:
            episode_step += 1
            # Sample with temperature 1 early in the game, then switch to greedy play.
            temp = int(episode_step < self.temp_thres)
            probs = self.mcts.getProbs(env, temp=temp)

            action = np.random.choice(probs.shape[0], p=probs.reshape(-1,))
            # Augment the training data with board symmetries of the current position.
            for board_s, probs_s in getBoardSims(env.getPBoard(), probs):
                train_samples.append([board_s, probs_s, env.fpt])
            _, reward, done, _ = env.step((action // env.size, action % env.size))
            if done:
                final_player = env.fpt
                # Flip the sign of the final reward depending on which player was to
                # move when each sample was recorded.
                return [(x[0], x[1], reward if final_player != x[2] else -reward)
                        for x in train_samples]
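getBoardSims is not shown in this excerpt. Below is a hedged sketch of the usual symmetry augmentation it presumably performs for a square board; only the call signature comes from the example above, the body is an assumption.

import numpy as np

# Hypothetical getBoardSims: yield the 8 symmetries of a square board together
# with the correspondingly transformed move-probability grid.
def getBoardSims(board, probs):
    probs = probs.reshape(board.shape)
    for k in range(4):
        b, p = np.rot90(board, k), np.rot90(probs, k)
        yield b, p.reshape(-1)
        yield np.fliplr(b), np.fliplr(p).reshape(-1)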
Example #5
def main():
    env = TicTacToeEnv()
    model = ValueModel(env.feature_vector_size, 100)

    # agent = SimpleAgent('agent_0', model, env)
    # agent = TDAgent('agent_0', model, env)
    # agent = ForwardAgent('agent_0', model, env)
    # agent = BackwardAgent('agent_0', model, env)
    agent = LeafAgent('agent_0', model, env)

    random_agent = RandomAgent(env)

    log_dir = "./log/leaf"

    summary_op = tf.summary.merge_all()
    summary_writer = tf.summary.FileWriter(log_dir)

    scaffold = tf.train.Scaffold(summary_op=summary_op)
    with tf.train.MonitoredTrainingSession(checkpoint_dir=log_dir,
                                           scaffold=scaffold) as sess:
        agent.sess = sess
        env.sess = sess

        while True:
            episode_count = sess.run(agent.episode_count)
            if episode_count % 1000 == 0:
                # Periodically evaluate the agent against a random opponent.
                results = random_agent.test(agent)

                sess.run(agent.update_random_agent_test_results,
                         feed_dict={
                             random_agent_test_: result
                             for random_agent_test_, result in zip(
                                 agent.random_agent_test_s, results)
                         })
                print(episode_count, ':', results)

                if results[2] + results[5] == 0:
                    final_summary = sess.run(summary_op)
                    summary_writer.add_summary(final_summary,
                                               global_step=episode_count)
                    break
            else:
                agent.train(.2)
            sess.run(agent.increment_episode_count)
Example #6
def learn_off_policy(episodes, discount_factor=0.9):
    env = TicTacToeEnv()
    agents = [MCOffPA('O'),
              MCOffPA('X')]

    start_mark = 'O'
    env.set_start_mark(start_mark)
    for i in range(episodes):
        state = env.reset()
        _, mark = state
        steps = []
        done = False
        while not done:
            agent = agent_by_mark(agents, mark)
            actions = env.available_actions()
            action = random.choice(actions)
            next_state, reward, done, _ = env.step(action)
            steps.append((state, reward, action, actions))
            if done:
                break
            _, mark = state = next_state
            
        steps.reverse()
        G = 0
        W = 1
        
        # Within one tic-tac-toe episode every state is unique, so repeated
        # visits do not need to be checked for.
        for step in steps:
            _, mark = step[0]
            agent = agent_by_mark(agents, mark)
            G = step[1] + discount_factor * G
            agent.update(step[0], G, W)
            # If the agent's target policy would not have chosen the behaviour
            # action, the importance ratio becomes zero, so stop here.
            if agent.act(step[0], step[3]) != step[2]:
                break

            # The behaviour policy picks uniformly among available actions, so
            # b(a|s) = 1/len(actions) and W *= pi(a|s)/b(a|s) = len(actions).
            W = W * len(step[3])
            
        # rotate start
        start_mark = next_mark(start_mark)
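A hedged sketch of what MCOffPA.update(state, G, W) typically looks like with weighted importance sampling; the attribute names below (values, cumulative_weights) are assumptions, not taken from the source. The cumulative weight plays the role the visit count plays in the on-policy case.

# Hypothetical MCOffPA.update: weighted importance-sampling Monte Carlo update.
def update(self, state, G, W):
    self.cumulative_weights[state] = self.cumulative_weights.get(state, 0.0) + W
    v = self.values.get(state, 0.0)
    self.values[state] = v + (W / self.cumulative_weights[state]) * (G - v)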
Example #7
import numpy as np
from env import TicTacToeEnv
from time import sleep

if __name__ == '__main__':
    # Scripted sequence of moves to demo the rendered board.
    env = TicTacToeEnv(render=True)
    sleep(2)
    env.step((16, 17))
    sleep(2)
    env.step((16, 16))
    sleep(2)
    env.step((17, 16))
    sleep(2)
    env.step((17, 17))
    sleep(2)
    env.step((14, 16))
    sleep(2)
    env.step((14, 14))
    sleep(2)
    env.step((14, 15))
    sleep(2)
    env.step((13, 13))
    env.window.mainloop()
Example #8
class RandomBot:
    def __init__(self):
        self.type = 'computer'

    def get_action(self, board, info):
        # Restrict the search to the active region of the board, pick a uniformly
        # random empty cell, and translate it back to absolute board coordinates.
        board_slice = board[info['up']:info['down'] + 1,
                            info['left']:info['right'] + 1]
        choices = np.transpose(np.nonzero(board_slice == 0))
        choice = np.random.choice(choices.shape[0], size=1)
        out = choices[choice[0], :]
        return out[0] + info['up'], out[1] + info['left']


if __name__ == '__main__':
    env = TicTacToeEnv(render=True)

    player1 = AlphaBot('AlphaZero/models/net_updates_866.pth', {
        'c': 1.,
        'num_sims': 25,
        'sleep_time': 0.15
    })
    # player1 = HumanPlayer()

    player2 = HumanPlayer()
    # player2 = AlphaBot('AlphaZero/models/net_updates_859.pth',
    #                    {'c': 1., 'num_sims': 25, 'sleep_time': 0.15})

    host = GameHost(player1, player2, env)
    host.start_game()
    env.window.mainloop()
Example #9
def play(max_episode=10):
    episode = 0
    start_mark = 'O'
    env = TicTacToeEnv()
    agents = [BaseAgent('O'), BaseAgent('X')]

    while episode < max_episode:
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False
        while not done:
            env.show_turn(True, mark)

            agent = agent_by_mark(agents, mark)
            ava_actions = env.available_actions()
            action = agent.act(state, ava_actions)
            state, reward, done, info = env.step(action)
            env.render()

            if done:
                env.show_result(True, mark, reward)
                break
            else:
                _, mark = state

        # rotate start
        start_mark = next_mark(start_mark)
        episode += 1
Example #10
def play(show_number):
    env = TicTacToeEnv(show_number=show_number)
    agents = [MinimaxAgent('O'),
              HumanAgent('X')]
    episode = 0
    while True:
        state = env.reset()
        _, mark = state
        done = False
        env.render()
        while not done:
            agent = agent_by_mark(agents, mark)
            env.show_turn(True, mark)
            ava_actions = env.available_actions()
            if mark == 'O':
                # MinimaxAgent.act returns a pair; only the action is used here.
                n, action = agent.act(state, ava_actions)
            else:
                action = agent.act(state, ava_actions)
            if action is None:
                sys.exit()

            state, reward, done, info = env.step(action)

            print('')
            env.render()
            if done:
                env.show_result(True, mark, reward)
                break
            mark = next_mark(mark)

        episode += 1
Example #11
def _bench(max_episode, model_file, show_result=True):
    """Benchmark given model.

    Args:
        max_episode (int): Episode count to benchmark.
        model_file (str): Learned model file name to benchmark.
        show_result (bool): Output result to stdout.

    Returns:
        (str): Benchmark result as a JSON string.
    """
    minfo = load_model(model_file)
    agents = [BaseAgent('O'), TDAgent('X', 0, 0)]
    show = False

    start_mark = 'O'
    env = TicTacToeEnv()
    env.set_start_mark(start_mark)

    episode = 0
    results = []
    for i in tqdm(range(max_episode)):
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False
        while not done:
            agent = agent_by_mark(agents, mark)
            ava_actions = env.available_actions()
            action = agent.act(state, ava_actions)
            state, reward, done, info = env.step(action)
            if show:
                env.show_turn(True, mark)
                env.render(mode='human')

            if done:
                if show:
                    env.show_result(True, mark, reward)
                results.append(reward)
                break
            else:
                _, mark = state

        # rotate start
        start_mark = next_mark(start_mark)
        episode += 1

    o_win = results.count(1)
    x_win = results.count(-1)
    draw = len(results) - o_win - x_win
    mfile = model_file.replace(CWD + os.sep, '')
    minfo.update(
        dict(base_win=o_win, td_win=x_win, draw=draw, model_file=mfile))
    result = json.dumps(minfo)

    if show_result:
        print(result)
    return result
Example #12
def _play(load_file, vs_agent, show_number):
    """Play with learned model.

    Make TD agent and adversarial agnet to play with.
    Play and switch starting mark when the game finished.
    TD agent behave no exploring action while in play mode.

    Args:
        load_file (str):
        vs_agent (object): Enemy agent of TD agent.
        show_number (bool): Whether show grid number for visual hint.
    """
    load_model(load_file)
    env = TicTacToeEnv(show_number=show_number)
    td_agent = TDAgent('X', 0, 0)  # prevent exploring
    start_mark = 'O'
    agents = [vs_agent, td_agent]

    while True:
        # start agent rotation
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False

        # show start board for human agent
        if mark == 'O':
            env.render(mode='human')

        while not done:
            agent = agent_by_mark(agents, mark)
            human = isinstance(agent, HumanAgent)

            env.show_turn(True, mark)
            ava_actions = env.available_actions()
            if human:
                action = agent.act(ava_actions)
                if action is None:
                    sys.exit()
            else:
                action = agent.act(state, ava_actions)

            state, reward, done, info = env.step(action)

            env.render(mode='human')
            if done:
                env.show_result(True, mark, reward)
                break
            else:
                _, mark = state

        # rotate start
        start_mark = next_mark(start_mark)
Example #13
def _learn(max_episode, epsilon, alpha, save_file):
    """Learn by episodes.

    Make two TD agent, and repeat self play for given episode count.
    Update state values as reward coming from the environment.

    Args:
        max_episode (int): Episode count.
        epsilon (float): Probability of exploration.
        alpha (float): Step size.
        save_file: File name to save result.
    """
    reset_state_values()

    env = TicTacToeEnv()
    agents = [TDAgent('O', epsilon, alpha), TDAgent('X', epsilon, alpha)]

    start_mark = 'O'
    for i in tqdm(range(max_episode)):
        episode = i + 1
        env.show_episode(False, episode)

        # reset agent for new episode
        for agent in agents:
            agent.episode_rate = episode / float(max_episode)

        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False
        while not done:
            agent = agent_by_mark(agents, mark)
            ava_actions = env.available_actions()
            env.show_turn(False, mark)
            action = agent.act(state, ava_actions)

            # update (no rendering)
            nstate, reward, done, info = env.step(action)
            agent.backup(state, nstate, reward)

            if done:
                env.show_result(False, mark, reward)
                # set terminal state value
                set_state_value(state, reward)

            _, mark = state = nstate

        # rotate start
        start_mark = next_mark(start_mark)

    # save states
    save_model(save_file, max_episode, epsilon, alpha)
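agent.backup is not shown in this excerpt. Below is a hedged sketch of the one-step TD(0) state-value backup it presumably performs; set_state_value appears in the example above, while get_state_value and the undiscounted form are assumptions.

# Hypothetical TDAgent.backup: one-step TD(0) update of the state-value table.
def backup(self, state, nstate, reward):
    v = get_state_value(state)
    target = reward + get_state_value(nstate)
    set_state_value(state, v + self.alpha * (target - v))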
Example #14
import numpy as np
import pandas as pd
from env import TicTacToeEnv
from agent import QLearningAgent

env = TicTacToeEnv()
agent = QLearningAgent(env)

# Train for 1,000,000 self-play games, printing progress every 10,000 games.
for game_nr in range(1000000):
    if game_nr % 10000 == 0:
        print(game_nr)
    done = False
    s = env.reset().copy()
    # print('Init', s)
    while not done:
        a = agent.take_action(s)
        r, s_, done, _ = env.step(a)
        agent.learn(s, a, r, s_, done)
        # print(s, a, r, s_, done)
        s = s_.copy()

# Collect the learned state values (V) and visit counts (N) into one table.
V = pd.DataFrame.from_dict(agent._V,
                           orient='index',
                           dtype=np.float32,
                           columns=['V'])
N = pd.DataFrame.from_dict(agent._N,
                           orient='index',
                           dtype=np.uint32,
                           columns=['N'])
df = V.merge(N, how='left', left_index=True, right_index=True)
# Expand each state tuple in the index into its own columns.
states = pd.DataFrame(df.index.values.tolist(), index=df.index)
Example #15
def play(show_number):

    env = TicTacToeEnv(show_number=show_number)
    agents = [HumanAgent(HUMAN_MARK)]
    episode = 0
    j = 0
    while True:

        state = env.reset()
        _, mark = state
        done = False
        env.render()
        i = 0
        if j == 0:
            # Build the game tree once and reuse it across episodes.
            Papa = Node(state, None, [1, 2, 3, 4, 5, 6, 7, 8, 9], 0)
            Papa.fill()
            j += 1
        action = Papa.maxAddress
        current = Papa.children[Papa.maxAddress]
        print("X's Turn")
        while not done:
            pre_action = action
            pre_current = current
            ava_actions = env.available_actions()
            if i % 2 == 0 and i != 0:
                print("X's Turn")
                print("Previous Action: ", pre_action)
                action, current = pre_current.reach_child(pre_action)
                print("Playing: ", action)
            elif i % 2 == 1:
                print("O's Turn")
                action = agents[0].act(state, ava_actions)

            i += 1
            if action is None:
                sys.exit()

            # Tree actions are 1-based; the environment expects 0-based actions.
            state, reward, done, info = env.step(action - 1)

            print('')
            env.render()
            if done:
                env.show_result(True, mark, reward)
                break
        mark = next_mark(mark)

        episode += 1