def play(show_number):
    """Play an endless series of games: MinimaxAgent ('O') vs HumanAgent ('X').

    Each game resets the environment, renders the board, and then alternates
    turns until ``env.step`` reports the game is done.  The process exits via
    ``sys.exit()`` as soon as an agent returns ``None`` as its action.

    Args:
        show_number: Forwarded to ``TicTacToeEnv(show_number=...)``.
    """
    env = TicTacToeEnv(show_number=show_number)
    players = [MinimaxAgent('O'), HumanAgent('X')]
    episode = 0

    while True:
        state = env.reset()
        _board, mark = state
        env.render()

        # The inner loop is only ever left through the ``break`` below.
        while True:
            current = agent_by_mark(players, mark)
            env.show_turn(True, mark)
            choices = env.available_actions()
            move = current.act(state, choices)
            if move is None:
                sys.exit()

            state, reward, finished, _info = env.step(move)

            print('')
            env.render()
            if finished:
                env.show_result(True, mark, reward)
                break

            # The mark is advanced locally with next_mark rather than read
            # back from the new state; the unpack only checks its shape.
            _board, _turn = state
            mark = next_mark(mark)

        episode += 1
Exemple #2
0
def play(max_episode=10):
    """Let two BaseAgents ('O' and 'X') play ``max_episode`` games.

    The starting mark is handed to the environment before every game and is
    switched with ``next_mark`` after every game.  Each move is announced with
    ``env.show_turn`` and the board is rendered after it.

    Args:
        max_episode: Number of games to play.
    """
    episode = 0
    start_mark = 'O'
    env = TicTacToeEnv()
    players = [BaseAgent('O'), BaseAgent('X')]

    while episode < max_episode:
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state

        # Left only through the ``break`` once the environment reports done.
        while True:
            env.show_turn(True, mark)

            mover = agent_by_mark(players, mark)
            choices = env.available_actions()
            move = mover.act(state, choices)
            state, reward, finished, _info = env.step(move)
            env.render()

            if finished:
                env.show_result(True, mark, reward)
                break
            _, mark = state

        # the other mark opens the next game
        start_mark = next_mark(start_mark)
        episode += 1
def _bench(max_episode, model_file, show_result=True):
    """Benchmark given model.

    Loads the model, then plays ``max_episode`` games of BaseAgent ('O')
    against a TDAgent ('X') created with epsilon=0 and alpha=0.  The starting
    mark alternates every game.  The final reward of each game is tallied:
    a reward of 1 counts as a BaseAgent win, -1 as a TDAgent win, anything
    else as a draw.

    Args:
        max_episode (int): Episode count to benchmark.
        model_file (str): Learned model file name to benchmark.
        show_result (bool): Output result to stdout.

    Returns:
        (str): JSON-encoded benchmark result, i.e. the info returned by
            ``load_model`` updated with ``base_win``, ``td_win``, ``draw``
            and ``model_file``.
    """
    minfo = load_model(model_file)
    agents = [BaseAgent('O'), TDAgent('X', 0, 0)]
    # Flip to True to trace every move (state/reward/action, turn, board).
    show = False

    start_mark = 'O'
    env = TicTacToeEnv()

    results = []
    for _ in tqdm(range(max_episode)):
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False
        while not done:
            agent = agent_by_mark(agents, mark)
            ava_actions = env.available_actions()
            action = agent.act(state, ava_actions)
            state, reward, done, _info = env.step(action)
            if show:
                # Per-move tracing stays behind the flag so the progress bar
                # and the final result are the only default output.
                print((state, reward, action))
                env.show_turn(True, mark)
                env.render(mode='human')

            if done:
                if show:
                    env.show_result(True, mark, reward)
                results.append(reward)
                break
            else:
                _, mark = state

        # rotation start
        start_mark = next_mark(start_mark)

    o_win = results.count(1)
    x_win = results.count(-1)
    draw = len(results) - o_win - x_win
    # Report the model path with the CWD prefix stripped.
    mfile = model_file.replace(CWD + os.sep, '')
    minfo.update(dict(base_win=o_win, td_win=x_win, draw=draw,
                      model_file=mfile))
    result = json.dumps(minfo)

    if show_result:
        print(result)
    return result
Exemple #4
0
def _learnhuman(epsilon, alpha, save_file, load_file, vs_agent, show_number):
    """Keep training a TD agent ('O') in games against ``vs_agent``, forever.

    The model is loaded once from ``load_file``.  Every game starts with mark
    'X'; after every move ``backup`` is called on the agent that just moved,
    and when a game ends the pre-move state is assigned the final reward via
    ``set_state_value``.  The model is saved to ``save_file`` after each game.
    The process exits through ``sys.exit()`` when a HumanAgent returns
    ``None``.

    Args:
        epsilon: Passed to ``TDAgent`` and ``save_model``.
        alpha: Passed to ``TDAgent`` and ``save_model``.
        save_file: Destination handed to ``save_model``.
        load_file: Source handed to ``load_model``.
        vs_agent: Opponent agent of the TD agent.
        show_number: Forwarded to ``TicTacToeEnv(show_number=...)``.
    """
    load_model(load_file)
    env = TicTacToeEnv(show_number=show_number)
    start_mark = 'X'
    players = [vs_agent, TDAgent('O', epsilon, alpha)]
    max_episode = 0

    while True:
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state

        # Show the untouched board when 'X' is about to open the game.
        if mark == 'X':
            env.render(mode='human')

        # Left only through the ``break`` at the end of a game.
        while True:
            mover = agent_by_mark(players, mark)
            is_human = isinstance(mover, HumanAgent)
            print("==================================")
            env.show_turn(True, mark)
            choices = env.available_actions()

            # HumanAgent.act takes only the available actions here.
            if is_human:
                action = mover.act(choices)
            else:
                action = mover.act(state, choices)
            print("action is %s" % (action))
            if is_human and action is None:
                sys.exit()

            nstate, reward, done, _info = env.step(action)
            mover.backup(state, nstate, reward)

            env.render(mode='human')
            if done:
                print("Return reward : " + str(reward))
                env.show_result(True, mark, reward)
                time.sleep(1)
                # terminal value is written for the state *before* the move
                set_state_value(state, reward)
                break

            state = nstate
            _, mark = state

        # The start mark is deliberately not rotated in this mode.
        max_episode += 1
        save_model(save_file, max_episode, epsilon, alpha)
def _play(load_file, vs_agent, show_number):
    """Play with learned model.

    Builds a TD agent ('X', epsilon=0 and alpha=0 so it does not explore)
    and pits it against ``vs_agent`` in an endless series of games.  The
    starting mark is switched after every finished game.  The process exits
    through ``sys.exit()`` when a HumanAgent returns ``None``.

    Args:
        load_file (str): Model file handed to ``load_model``.
        vs_agent (object): Enemy agent of TD agent.
        show_number (bool): Whether show grid number for visual hint.
    """
    load_model(load_file)
    env = TicTacToeEnv(show_number=show_number)
    players = [vs_agent, TDAgent('X', 0, 0)]  # zeros prevent exploring
    start_mark = 'O'

    while True:
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state

        # Show the untouched board when 'O' is about to open the game.
        if mark == 'O':
            env.render(mode='human')

        # Left only through the ``break`` at the end of a game.
        while True:
            mover = agent_by_mark(players, mark)
            is_human = isinstance(mover, HumanAgent)

            env.show_turn(True, mark)
            choices = env.available_actions()
            if not is_human:
                action = mover.act(state, choices)
            else:
                # HumanAgent.act takes only the available actions here.
                action = mover.act(choices)
                if action is None:
                    sys.exit()

            state, reward, done, _info = env.step(action)
            print((state, reward, action))

            env.render(mode='human')
            if done:
                env.show_result(True, mark, reward)
                break
            _, mark = state

        # the other mark opens the next game
        start_mark = next_mark(start_mark)
def _learn(max_episode, epsilon, alpha, save_file):
    """Learn by episodes.

    Resets the state values, creates two TD agents ('O' and 'X') and lets
    them play each other for ``max_episode`` games.  After each move the
    moving agent's ``backup`` is called; when a game ends the pre-move state
    receives the final reward via ``set_state_value``.  The starting mark
    alternates between games and the result is saved once at the end.

    Args:
        max_episode (int): Episode count.
        epsilon (float): Probability of exploration.
        alpha (float): Step size.
        save_file: File name to save result.
    """
    reset_state_values()

    env = TicTacToeEnv()
    learners = [TDAgent('O', epsilon, alpha),
                TDAgent('X', epsilon, alpha)]

    start_mark = 'O'
    # Episodes are numbered from 1.
    for episode, _ in enumerate(tqdm(range(max_episode)), 1):
        env.show_episode(False, episode)

        # tell every agent how far into the run this episode is
        progress = episode / float(max_episode)
        for learner in learners:
            learner.episode_rate = progress

        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        finished = False
        while not finished:
            mover = agent_by_mark(learners, mark)
            choices = env.available_actions()
            env.show_turn(False, mark)
            action = mover.act(state, choices)

            # advance the environment (no rendering)
            nstate, reward, finished, _info = env.step(action)
            print((state, reward, action))
            mover.backup(state, nstate, reward)

            if finished:
                env.show_result(False, mark, reward)
                # terminal value is written for the state *before* the move
                set_state_value(state, reward)

            state = nstate
            _, mark = state

        # the other mark opens the next game
        start_mark = next_mark(start_mark)

    # persist what was learned
    save_model(save_file, max_episode, epsilon, alpha)
    def simulate(self, node: Node, my_env: TicTacToeEnv) -> float:
        """
        MCTS: Simulation stage.
            - Play uniformly random moves on ``my_env`` until it is done,
              then score the last observed state.

        ``my_env`` is advanced in place.  If it is already done, the node's
        own state is scored.  Scoring is delegated to ``compute_reward``
        (original note: won=1, tie=0.5, lost=0).
        """
        last_state = node.state
        while not my_env.done:
            choices = my_env.available_actions()
            move = random.choice(choices)
            last_state, _reward, _done, _info = my_env.step(move)
        return self.compute_reward(last_state)
Exemple #8
0
def play_against(agent_mc, agent_2, max_episode=10, bench=True):
    """Play ``max_episode`` games between two agents and score the outcome.

    The starting mark alternates between games, beginning with 'O'.  The
    final reward of each game is recorded: 1 counts as an 'O' win, -1 as an
    'X' win, anything else as a draw.

    Args:
        agent_mc: First agent; looked up by its mark via ``agent_by_mark``.
        agent_2: Second agent; looked up by its mark via ``agent_by_mark``.
        max_episode (int): Number of games to play.
        bench (bool): When false, the win/draw tally is printed to stdout.

    Returns:
        float: ``(o_wins - x_wins) / max_episode``; ``0.0`` when no games
            are requested (``max_episode <= 0``).
    """
    tally_format = "O_WINS = {},X_WINS = {},DRAW = {}"

    # Nothing to play: avoid dividing by zero below and report an empty tally.
    if max_episode <= 0:
        if not bench:
            print(tally_format.format(0, 0, 0))
        return 0.0

    start_mark = 'O'
    env = TicTacToeEnv()
    agents = [agent_mc, agent_2]
    results = []

    for _ in range(max_episode):
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state

        done = False
        while not done:
            agent = agent_by_mark(agents, mark)
            ava_actions = env.available_actions()
            action = agent.act(state, ava_actions)
            state, reward, done, _ = env.step(action)

            if done:
                results.append(reward)
            else:
                _, mark = state

        # the other mark opens the next game
        start_mark = next_mark(start_mark)

    o_win = results.count(1)
    x_win = results.count(-1)
    draw = len(results) - o_win - x_win

    if not bench:
        print(tally_format.format(o_win, x_win, draw))

    return float(o_win - x_win) / max_episode
Exemple #9
0
def play(show_number):
    """Play an endless series of games: HumanAgent ('X') vs RolloutAgent ('O').

    The terminal is cleared (``clear`` shell command) and the board redrawn
    before the first move and after every move.  After each finished game
    the function sleeps 10 seconds before starting the next one.  The
    process exits via ``sys.exit()`` when an agent returns ``None``.

    Args:
        show_number: Forwarded to ``TicTacToeEnv(show_number=...)``.
    """
    env = TicTacToeEnv(show_number=show_number)
    ROLLOUTS = 1000
    REWARD_FACTOR = 1  # because rollout agent is player 2
    players = [
        HumanAgent('X'),
        RolloutAgent('O', ROLLOUTS, REWARD_FACTOR),
    ]

    def redraw():
        # wipe the terminal, leave one blank line, draw the board
        os.system('clear')
        print('')
        env.render()

    episode = 0
    while True:
        state = env.reset()
        _, mark = state
        redraw()

        # Left only through the ``break`` at the end of a game.
        while True:
            mover = agent_by_mark(players, mark)
            env.show_turn(True, mark)
            choices = env.available_actions()
            # The rollout agent additionally needs the environment itself.
            if mover.agenttype == 'rollout':
                action = mover.act(choices, env)
            else:
                action = mover.act(choices)
            if action is None:
                sys.exit()

            state, reward, finished, _info = env.step(action)

            redraw()
            if finished:
                env.show_result(True, mark, reward)
                break
            _, mark = state

        time.sleep(10)
        episode += 1
    def act(self, state, my_env: TicTacToeEnv):
        """Choose a move: take a win, otherwise block one, otherwise random.

        Every available action is first tried from ``state``; the first one
        whose resulting board has a positive game status mapping (via
        ``tomark``) to this agent's mark is returned.  Failing that, the
        same search is repeated with the mark to move flipped, returning the
        first action that would give ``self.opponent_mark`` the game.  If
        neither search succeeds a random available action is returned.
        """
        candidates = my_env.available_actions()

        # --- 1: a move that wins for us ---
        for move in candidates:
            board_after = after_action_state(state, move)[0]
            status = check_game_status(board_after)
            if status > 0 and tomark(status) == self.mark:
                return move

        # --- 2: a move the opponent would win with ---
        # Same board, but pretend it is the other side's turn.
        flipped = (state[0], next_mark(state[1]))
        for move in candidates:
            board_after = after_action_state(flipped, move)[0]
            status = check_game_status(board_after)
            if status > 0 and tomark(status) == self.opponent_mark:
                # occupy that square ourselves
                return move

        # --- 3: nothing urgent ---
        return random.choice(candidates)
    def expand(self, node: Node, my_env: TicTacToeEnv):
        """
        MCTS: Expansion stage.
          - When the env is done, ``node`` and ``my_env`` come back untouched.
          - Otherwise one ``Node`` per available action is created with
            ``parent=node``; if ``node`` then has children, a random child
            is picked, its action is stepped in ``my_env`` and that child
            is returned together with the env.
        """
        # Terminal position: nothing to expand.
        if my_env.done:
            return node, my_env

        # One child per legal move.  The new Node is not kept here;
        # NOTE(review): presumably passing parent= attaches it to
        # node.children - confirm against the Node class.
        for move in my_env.available_actions():
            child_state = after_action_state(node.state, move)
            Node(child_state, move, parent=node)

        if not node.children:
            return node, my_env

        # Descend into a random child and keep the env in sync with it.
        chosen = random.choice(node.children)
        my_env.step(chosen.action)
        return chosen, my_env
def play(max_episode=10):
    """Let two BaseAgents ('O' and 'X') play ``max_episode`` games.

    A game runs until ``env.done`` is set.  The mark to move is read from
    the current state at the top of every turn, so after the loop ``mark``
    is the mark that made the last move.  The starting mark is switched
    after every game.

    Args:
        max_episode: Number of games to play.
    """
    start_mark = 'O'
    env = TicTacToeEnv()
    players = [BaseAgent('O'), BaseAgent('X')]

    for _game in range(max_episode):
        env.set_start_mark(start_mark)
        state = env.reset()

        while not env.done:
            _board, mark = state
            env.show_turn(True, mark)

            mover = agent_by_mark(players, mark)
            choices = env.available_actions()
            move = mover.act(state, choices)
            state, reward, _done, _info = env.step(move)
            env.render()

        # mark / reward still hold the values from the final move
        env.show_result(True, mark, reward)

        # the other mark opens the next game
        start_mark = next_mark(start_mark)
Exemple #13
0
def play_game(qagent):
    """Play one rendered game of ``qagent`` against a RandomAgent ('X').

    'O' is set as the starting mark.  The board is rendered before every
    move; after each move the board the mover saw and the chosen action are
    printed.  When the game ends the final board and the result are shown.

    Args:
        qagent: Agent to play against the random opponent; it is looked up
            by its mark via ``agent_by_mark``.
    """
    env = TicTacToeEnv()
    players = [qagent, RandomAgent('X')]
    env.set_start_mark('O')

    state = env.reset()
    board, mark = state
    finished = False
    while not finished:
        env.render()
        mover = agent_by_mark(players, mark)
        choices = env.available_actions()
        env.show_turn(False, mark)
        action = mover.act(state, choices)

        nstate, reward, finished, _info = env.step(action)
        # ``board`` is still the pre-move board at this point
        print(f'state: {board}, action: {action}')
        if finished:
            env.render()
            env.show_result(True, mark, reward)

        state = nstate
        board, mark = state
def _learnhuman1(epsilon, alpha, save_file, load_file, vs_agent, show_number):
    """Train a TD agent ('O') against ``vs_agent`` forever, logging to MySQL.

    Every game starts with mark 'X'.  Each TD-agent move is written to the
    ``Tictactoe_state`` table and each game result to ``Check_win``.  After
    every move ``backup`` is called on the agent that just moved; when a
    game ends the pre-move state receives the final reward via
    ``set_state_value``.  The model is saved after each game.

    Opening rule: the human's first move of a game is compared with the
    last remembered opening (``human_temp``).  If it is the same, the TD
    agent's reply is overridden with the reply remembered in ``agent_temp``
    (initially 6); if it differs, the TD agent's own choice is used and
    remembered for next time.

    The function only returns by raising: ``sys.exit()`` is called when a
    HumanAgent returns ``None``.  The database connection is closed on every
    way out.

    Args:
        epsilon: Passed to ``TDAgent`` and ``save_model``.
        alpha: Passed to ``TDAgent`` and ``save_model``.
        save_file: Destination handed to ``save_model``.
        load_file: Source handed to ``load_model``.
        vs_agent: Opponent agent of the TD agent.
        show_number: Forwarded to ``TicTacToeEnv(show_number=...)``.
    """
    # Values are bound by the driver (%s placeholders) instead of being
    # interpolated into the SQL text, so quotes inside the stringified
    # lists cannot break the statement.
    insert_state_sql = (
        "INSERT INTO Tictactoe_state(EPISODE, NSTATE, NVALUE, CHOOSE, PICK, "
        "STATE_NOW, ACTION, NOTE) "
        "VALUES(%s, %s, %s, %s, %s, %s, %s, %s);")
    insert_result_sql = (
        "INSERT INTO Check_win(REWARD, WIN_STATE) VALUES(%s, %s);")

    connection = pymysql.connect(host="localhost",
                                 user="******",
                                 passwd="",
                                 database="tictactoe")
    # The game loop never ends normally; try/finally makes sure the
    # connection is released when sys.exit() or an error unwinds it.
    try:
        cursor = connection.cursor()

        load_model(load_file)
        env = TicTacToeEnv(show_number=show_number)
        start_mark = 'X'
        agents = [vs_agent, TDAgent('O', epsilon, alpha)]
        max_episode = 0
        agent_temp = 6  # Set Start position at 6 to td_agent
        human_temp = 0
        while True:
            # start agent rotation
            env.set_start_mark(start_mark)
            state = env.reset()
            _, mark = state
            done = False
            turns = 0
            human_diff = False
            # Note stored with each logged move; once set on the agent's
            # first reply it is kept for the rest of the game.
            db_note = ''

            # show start board for human agent
            if mark == 'X':
                env.render(mode='human')

            while not done:
                agent = agent_by_mark(agents, mark)
                human = isinstance(agent, HumanAgent)
                print(
                    "======================================Switch Turn======================================"
                )
                env.show_turn(True, mark)
                ava_actions = env.available_actions()
                if human:
                    action = agent.act(ava_actions)
                    if action is None:
                        sys.exit()
                    turns += 1
                    # Remember whether this opening differs from the last
                    # remembered one.
                    if turns == 1 and human_temp != action:
                        human_temp = action
                        human_diff = True
                    nstate, reward, done, info = env.step(action)
                else:
                    action = agent.act(state, ava_actions)
                    db_nstate, db_nvalue, db_choose, db_pick = agent.get_db()
                    if turns == 1 and not human_diff:
                        # Same opening as before: replay the stored reply.
                        action = agent_temp
                        db_note = 'repeat'
                        print("------------> Human start in the same position")
                        print("------------> [Fix]Agent action is %s" %
                              (action + 1))
                    elif turns == 1:
                        # New opening: keep the agent's choice for next time.
                        agent_temp = action
                        db_note = 'different'
                        print(
                            "------------> Human start in the diiferent position")
                        print("------------> [New]Agent action is %s" %
                              (action + 1))
                    else:
                        print("------------> Agent action is %s" % (action + 1))
                    nstate, reward, done, info = env.step(action)
                    # int()/str() mirror the former %d / '%s' formatting.
                    cursor.execute(insert_state_sql, (
                        int(max_episode), str(db_nstate), str(db_nvalue),
                        int(db_choose), int(db_pick), str(remove_X(nstate)),
                        int(action), db_note))
                    connection.commit()

                agent.backup(state, nstate, reward)
                env.render(mode='human')
                if done:
                    db_win_state = "draw" if reward == 0 else mark
                    print("Return reward : " + str(reward))
                    cursor.execute(insert_result_sql,
                                   (int(reward), db_win_state))
                    connection.commit()
                    env.show_result(True, mark, reward)
                    time.sleep(1)
                    # set terminal state value (for the pre-move state)
                    set_state_value(state, reward)
                    break
                else:
                    _, mark = state = nstate

            # The start mark is deliberately not rotated in this mode.
            max_episode += 1
            save_model(save_file, max_episode, epsilon, alpha)
    finally:
        connection.close()
Exemple #15
0
def _learnhuman1(epsilon, alpha, save_file, load_file, vs_agent, show_number):
    """Train a TD agent ('O') against ``vs_agent`` forever, with a fixed
    reply to repeated openings.

    Every game starts with mark 'X'.  The human's first move of a game is
    compared with the last remembered opening: if it is the same, the TD
    agent's reply is overridden with the remembered reply (initially 6);
    if it differs, the TD agent's own choice is used and remembered.
    After every move ``backup`` is called on the agent that just moved; at
    the end of a game the pre-move state receives the final reward via
    ``set_state_value`` and the model is saved.  The process exits through
    ``sys.exit()`` when a HumanAgent returns ``None``.

    Args:
        epsilon: Passed to ``TDAgent`` and ``save_model``.
        alpha: Passed to ``TDAgent`` and ``save_model``.
        save_file: Destination handed to ``save_model``.
        load_file: Source handed to ``load_model``.
        vs_agent: Opponent agent of the TD agent.
        show_number: Forwarded to ``TicTacToeEnv(show_number=...)``.
    """
    load_model(load_file)
    env = TicTacToeEnv(show_number=show_number)
    start_mark = 'X'
    players = [vs_agent, TDAgent('O', epsilon, alpha)]
    max_episode = 0
    agent_temp = 6  # reply the TD agent starts out with
    human_temp = 0  # last remembered human opening

    while True:
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        turns = 0
        human_diff = False

        # Show the untouched board when 'X' is about to open the game.
        if mark == 'X':
            env.render(mode='human')

        # Left only through the ``break`` at the end of a game.
        while True:
            mover = agent_by_mark(players, mark)
            is_human = isinstance(mover, HumanAgent)
            print(
                "======================================Switch Turn======================================"
            )
            env.show_turn(True, mark)
            choices = env.available_actions()

            if is_human:
                action = mover.act(choices)
                turns += 1
                opening_changed = turns == 1 and human_temp != action
                if opening_changed:
                    human_temp = action
                    human_diff = True
                if action is None:
                    sys.exit()
            else:
                action = mover.act(state, choices)
                if turns != 1:
                    print("------------> Agent action is %s" % (action + 1))
                elif human_diff:
                    # new opening: keep the agent's own answer for next time
                    agent_temp = action
                    print(
                        "------------> Human start in the diiferent position")
                    print("------------> [New]Agent action is %s" %
                          (action + 1))
                else:
                    # repeated opening: replay the remembered answer
                    action = agent_temp
                    print("------------> Human start in the same position")
                    print("------------> [Fix]Agent action is %s" %
                          (action + 1))

            nstate, reward, done, _info = env.step(action)
            mover.backup(state, nstate, reward)

            env.render(mode='human')
            if done:
                print("Return reward : " + str(reward))

                env.show_result(True, mark, reward)
                time.sleep(1)
                # terminal value is written for the state *before* the move
                set_state_value(state, reward)
                break

            state = nstate
            _, mark = state

        # The start mark is deliberately not rotated in this mode.
        max_episode += 1
        save_model(save_file, max_episode, epsilon, alpha)
Exemple #16
0
def train_agents(opponent,
                 max_episode,
                 epsilon,
                 epsilon_decay,
                 alpha,
                 alpha_decay,
                 gamma,
                 render=False):
    """Train a Q-learning agent ('O') for ``max_episode`` games.

    The 'X' side is a RandomAgent when ``opponent == 'random'`` and a second
    QAgent otherwise.  After every move ``update`` is called on the agent
    that just moved.  When a game ends the final reward is recorded for 'O'
    and its negation for 'X', and the pre-move state receives the reward
    via ``set_state_value``.  The starting mark alternates between games.

    Args:
        opponent: ``'random'`` selects a RandomAgent for 'X'; any other
            value selects a second QAgent.
        max_episode: Number of games to play.
        epsilon, epsilon_decay, alpha, alpha_decay, gamma: Forwarded
            unchanged to every ``QAgent`` constructor.
        render: When true, the board is rendered before every move and
            after the final one; also passed to ``env.show_result``.

    Returns:
        tuple: ``(agent_rewards, o_agent)`` where ``agent_rewards`` maps
            'O' and 'X' to their per-game reward lists and ``o_agent`` is
            the agent playing 'O'.
    """
    reset_state_values()

    env = TicTacToeEnv()

    def new_q_agent(mark):
        # Both sides share every hyper-parameter; only the mark differs.
        return QAgent(env.observation_space.n, env.action_space.n, mark,
                      epsilon, epsilon_decay, alpha, alpha_decay, gamma)

    o_player = new_q_agent('O')
    if opponent == 'random':
        x_player = RandomAgent('X')
    else:  # Two Q agents
        x_player = new_q_agent('X')
    players = [o_player, x_player]

    start_mark = 'O'
    agent_rewards = {'O': [], 'X': []}
    # Episodes are numbered from 1.
    for episode, _ in enumerate(tqdm(range(max_episode)), 1):
        env.show_episode(False, episode)

        # tell every agent how far into the run this episode is
        progress = episode / float(max_episode)
        for player in players:
            player.episode_rate = progress

        env.set_start_mark(start_mark)
        state = env.reset()
        board, mark = state
        finished = False
        while not finished:
            if render:
                env.render()
            mover = agent_by_mark(players, mark)
            choices = env.available_actions()
            env.show_turn(False, mark)
            action = mover.act(state, choices)

            # advance the environment, then let the mover learn from
            # (board before, board after, action, reward, done)
            nstate, reward, finished, _info = env.step(action)
            mover.update(board, nstate[0], action, reward, finished)

            if finished:
                if render:
                    env.render()
                env.show_result(render, mark, reward)
                # terminal value is written for the state *before* the move
                set_state_value(state, reward)
                agent_rewards['O'].append(reward)
                agent_rewards['X'].append(-reward)

            state = nstate
            board, mark = state

        # the other mark opens the next game
        start_mark = next_mark(start_mark)

    return agent_rewards, agent_by_mark(players, 'O')