def run():
    env = TicTacToeEnv()
    state = torch.tensor([one_hot_board(env.reset())], dtype=torch.float).to(model.device)
    for step in range(n_step):
        t = np.clip(step / eps_steps, 0, 1)
        eps = (1 - t) * eps_start + t * eps_end
        print('\r'+f'step----{step},epsilon----{eps}', flush=True, end='')
        action, _ = model.select_action(state, eps)
        next_state, reward, done, _ = env.step(action.item())
        next_state=one_hot_board(next_state)
        
        if not done:
            next_state, _, done,  _ = env.step(
                model.select_dummy_action(next_state))
            next_state=one_hot_board(next_state)
            next_state = torch.tensor(
                [next_state], dtype=torch.float).to(model.device)

        if done:

            next_state = None

        model.Memory.memorize(state, action, next_state,
                              torch.tensor([reward], device=model.device))

        state = next_state

        model.learn()

        if done:
            state = torch.tensor(
                [one_hot_board(env.reset())], dtype=torch.float).to(model.device)

        if step % target_update == 0:
            model.target_update()
def play(show_number):
    env = TicTacToeEnv(show_number=show_number)
    agents = [MinimaxAgent('O'),
              HumanAgent('X')]
    episode = 0
    while True:
        state = env.reset()
        _, mark = state
        done = False
        env.render()
        while not done:
            agent = agent_by_mark(agents, mark)
            env.show_turn(True, mark)
            ava_actions = env.available_actions()
            action = agent.act(state, ava_actions)
            if action is None:
                sys.exit()

            state, reward, done, info = env.step(action)
        
            print('')
            env.render()
            if done:
                env.show_result(True, mark, reward)
                break
            else:
                _, _ = state
            mark = next_mark(mark)

        episode += 1
Beispiel #3
0
def play(max_episode=10):
    episode = 0
    start_mark = 'O'
    env = TicTacToeEnv()
    agents = [BaseAgent('O'),
              BaseAgent('X')]

    while episode < max_episode:
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False
        while not done:
            env.show_turn(True, mark)

            agent = agent_by_mark(agents, mark)
            ava_actions = env.available_actions()
            action = agent.act(state, ava_actions)
            state, reward, done, info = env.step(action)
            env.render()

            if done:
                env.show_result(True, mark, reward)
                break
            else:
                _, mark = state

        # rotate start
        start_mark = next_mark(start_mark)
        episode += 1
def _bench(max_episode, model_file, show_result=True):
    """Benchmark given model.

    Args:
        max_episode (int): Episode count to benchmark.
        model_file (str): Learned model file name to benchmark.
        show_result (bool): Output result to stdout.

    Returns:
        (dict): Benchmark result.
    """
    minfo = load_model(model_file)
    agents = [BaseAgent('O'), TDAgent('X', 0, 0)]
    show = False

    start_mark = 'O'
    env = TicTacToeEnv()
    env.set_start_mark(start_mark)

    episode = 0
    results = []
    for i in tqdm(range(max_episode)):
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False
        while not done:
            agent = agent_by_mark(agents, mark)
            ava_actions = env.available_actions()
            action = agent.act(state, ava_actions)
            state, reward, done, info = env.step(action)
            print((state,reward,action))
            if show:
                env.show_turn(True, mark)
                env.render(mode='human')

            if done:
                if show:
                    env.show_result(True, mark, reward)
                results.append(reward)
                break
            else:
                _, mark = state

        # rotation start
        start_mark = next_mark(start_mark)
        episode += 1

    o_win = results.count(1)
    x_win = results.count(-1)
    draw = len(results) - o_win - x_win
    mfile = model_file.replace(CWD + os.sep, '')
    minfo.update(dict(base_win=o_win, td_win=x_win, draw=draw,
                      model_file=mfile))
    result = json.dumps(minfo)

    if show_result:
        print(result)
    return result
Beispiel #5
0
def _learnhuman(epsilon, alpha, save_file, load_file, vs_agent, show_number):
    load_model(load_file)
    env = TicTacToeEnv(show_number=show_number)
    start_mark = 'X'
    agents = [vs_agent, TDAgent('O', epsilon, alpha)]
    max_episode = 0

    while True:
        # start agent rotation
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False

        # show start board for human agent
        if mark == 'X':
            env.render(mode='human')

        while not done:
            agent = agent_by_mark(agents, mark)
            human = isinstance(agent, HumanAgent)
            print("==================================")
            env.show_turn(True, mark)
            ava_actions = env.available_actions()
            # print(ava_actions)
            if human:
                action = agent.act(ava_actions)
                print("action is %s" % (action))
                if action is None:
                    sys.exit()
            else:
                action = agent.act(state, ava_actions)
                print("action is %s" % (action))
            ###
            nstate, reward, done, info = env.step(action)
            agent.backup(state, nstate, reward)

            env.render(mode='human')
            if done:
                print("Return reward : " + str(reward))
                env.show_result(True, mark, reward)
                time.sleep(1)
                # if reward == 1:
                # _conlearn(700, epsilon, alpha, save_file, load_file)
                # set terminal state value
                set_state_value(state, reward)
                break
            else:
                _, mark = state = nstate

        # rotation start
        # start_mark = next_mark(start_mark)
        max_episode += 1
        # print(max_episode)
        save_model(save_file, max_episode, epsilon, alpha)
def _play(load_file, vs_agent, show_number):
    """Play with learned model.

    Make TD agent and adversarial agnet to play with.
    Play and switch starting mark when the game finished.
    TD agent behave no exploring action while in play mode.

    Args:
        load_file (str):
        vs_agent (object): Enemy agent of TD agent.
        show_number (bool): Whether show grid number for visual hint.
    """
    load_model(load_file)
    env = TicTacToeEnv(show_number=show_number)
    td_agent = TDAgent('X', 0, 0)  # prevent exploring
    start_mark = 'O'
    agents = [vs_agent, td_agent]

    while True:
        # start agent rotation
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False

        # show start board for human agent
        if mark == 'O':
            env.render(mode='human')

        while not done:
            agent = agent_by_mark(agents, mark)
            human = isinstance(agent, HumanAgent)

            env.show_turn(True, mark)
            ava_actions = env.available_actions()
            if human:
                action = agent.act(ava_actions)
                if action is None:
                    sys.exit()
            else:
                action = agent.act(state, ava_actions)

            state, reward, done, info = env.step(action)
            print((state,reward,action))

            env.render(mode='human')
            if done:
                env.show_result(True, mark, reward)
                break
            else:
                _, mark = state

        # rotation start
        start_mark = next_mark(start_mark)
def _learn(max_episode, epsilon, alpha, save_file):
    """Learn by episodes.

    Make two TD agent, and repeat self play for given episode count.
    Update state values as reward coming from the environment.

    Args:
        max_episode (int): Episode count.
        epsilon (float): Probability of exploration.
        alpha (float): Step size.
        save_file: File name to save result.
    """
    reset_state_values()

    env = TicTacToeEnv()
    agents = [TDAgent('O', epsilon, alpha),
              TDAgent('X', epsilon, alpha)]

    start_mark = 'O'
    for i in tqdm(range(max_episode)):
        episode = i + 1
        env.show_episode(False, episode)

        # reset agent for new episode
        for agent in agents:
            agent.episode_rate = episode / float(max_episode)

        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False
        while not done:
            agent = agent_by_mark(agents, mark)
            ava_actions = env.available_actions()
            env.show_turn(False, mark)
            action = agent.act(state, ava_actions)

            # update (no rendering)
            nstate, reward, done, info = env.step(action)
            print((state,reward,action))
            agent.backup(state, nstate, reward)

            if done:
                env.show_result(False, mark, reward)
                # set terminal state value
                set_state_value(state, reward)

            _, mark = state = nstate

        # rotate start
        start_mark = next_mark(start_mark)

    # save states
    save_model(save_file, max_episode, epsilon, alpha)
Beispiel #8
0
def play_against(agent_mc,agent_2,max_episode = 10,bench = True):
	start_mark = 'O'

	env = TicTacToeEnv()
	env.set_start_mark(start_mark)
	agents = [agent_mc,agent_2]

	episode = 0
	results = []

	for i in range(max_episode):
		env.set_start_mark(start_mark)
		state = env.reset()
		_,mark = state

		done = False

		while not done:
			agent = agent_by_mark(agents,mark)

			ava_actions = env.available_actions()

			# print(agent.mark)

			# if agent.mark == 'O':
				# print(agent.Q[state])

			action = agent.act(state,ava_actions)

			

			state,reward,done,_ = env.step(action)

			# env.render()

			if done:
				results.append(reward)
				break
			else:
				_,mark = state
		start_mark = next_mark(start_mark)
		episode += 1

	o_win = results.count(1)
	x_win = results.count(-1)
	draw = len(results) - o_win - x_win

	if bench == False:
		print("O_WINS = {},X_WINS = {},DRAW = {}".format(o_win,x_win,draw))


	return float(o_win-x_win)/(max_episode)
Beispiel #9
0
def play(show_number):
    env = TicTacToeEnv(show_number=show_number)
    # print("101", env.available_actions())
    ROLLOUTS = 1000
    REWARD_FACTOR = 1  # because rollout agent is player 2
    agents = [
        HumanAgent('X'),
        #   HumanAgent('X')]
        RolloutAgent('O', ROLLOUTS, REWARD_FACTOR)
    ]
    episode = 0
    while True:
        state = env.reset()
        _, mark = state
        done = False
        _ = os.system('clear')
        print('')
        env.render()
        while not done:
            agent = agent_by_mark(agents, mark)
            env.show_turn(True, mark)
            ava_actions = env.available_actions()
            if agent.agenttype == 'rollout':
                action = agent.act(ava_actions, env)
            else:
                action = agent.act(ava_actions)
            if action is None:
                sys.exit()

            state, reward, done, info = env.step(action)

            _ = os.system('clear')
            print('')
            env.render()
            if done:
                env.show_result(True, mark, reward)
                # print("Reward:", reward)
                break
            else:
                _, mark = state
        time.sleep(10)
        episode += 1
def play(max_episode=10):
    start_mark = 'O'
    env = TicTacToeEnv()
    agents = [BaseAgent('O'),
              BaseAgent('X')]

    for _ in range(max_episode):
        env.set_start_mark(start_mark)
        state = env.reset()
        while not env.done:
            _, mark = state
            env.show_turn(True, mark)
            
            agent = agent_by_mark(agents, mark)
            ava_actions = env.available_actions()
            action = agent.act(state, ava_actions)
            state, reward, done, info = env.step(action)
            env.render()

        env.show_result(True, mark, reward)

        # rotate start
        start_mark = next_mark(start_mark)
Beispiel #11
0
def play_game(qagent):
    env = TicTacToeEnv()
    opponent = RandomAgent('X')
    start_mark = 'O'
    env.set_start_mark(start_mark)
    state = env.reset()
    s, mark = state
    done = False
    agents = [qagent, opponent]
    while not done:
        env.render()
        agent = agent_by_mark(agents, mark)
        ava_actions = env.available_actions()
        env.show_turn(False, mark)
        action = agent.act(state, ava_actions)

        nstate, reward, done, info = env.step(action)
        print(f'state: {s}, action: {action}')
        if done:
            env.render()
            env.show_result(True, mark, reward)

        s, mark = state = nstate
def _learnhuman1(epsilon, alpha, save_file, load_file, vs_agent, show_number):
    connection = pymysql.connect(host="localhost",
                                 user="******",
                                 passwd="",
                                 database="tictactoe")
    cursor = connection.cursor()

    load_model(load_file)
    env = TicTacToeEnv(show_number=show_number)
    start_mark = 'X'
    agents = [vs_agent, TDAgent('O', epsilon, alpha)]
    max_episode = 0
    agent_temp = 6  #Set Start position at 6 to td_agent
    human_temp = 0
    while True:
        # start agent rotation
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False
        turns = 0
        human_diff = False

        # variable for save to database
        db_nstate = []
        db_nvalue = []
        db_choose = 0
        db_pick = 0
        db_current_state = ''
        db_action = 0
        db_note = ''

        # show start board for human agent
        if mark == 'X':
            env.render(mode='human')

        while not done:
            agent = agent_by_mark(agents, mark)
            human = isinstance(agent, HumanAgent)
            print(
                "======================================Switch Turn======================================"
            )
            env.show_turn(True, mark)
            ava_actions = env.available_actions()
            # print(ava_actions)
            if human:
                action = agent.act(ava_actions)
                turns += 1
                if turns == 1 and human_temp != action:
                    human_temp = action
                    human_diff = True
                    # print("Human action is %s"%(action))
                    # print("Turns == %s" % (turns))
                    # print("human_temp == %s" %(human_temp))
                    # print("human_diff == %s" %(human_diff))
                if action is None:
                    sys.exit()
                nstate, reward, done, info = env.step(action)
            else:
                action = agent.act(state, ava_actions)
                db_nstate, db_nvalue, db_choose, db_pick = agent.get_db()
                if turns == 1 and human_diff == False:
                    action = agent_temp
                    db_note = 'repeat'
                    print("------------> Human start in the same position")
                    print("------------> [Fix]Agent action is %s" %
                          (action + 1))
                elif turns == 1 and human_diff == True:
                    agent_temp = action
                    db_note = 'different'
                    print(
                        "------------> Human start in the diiferent position")
                    print("------------> [New]Agent action is %s" %
                          (action + 1))
                else:
                    print("------------> Agent action is %s" % (action + 1))
                nstate, reward, done, info = env.step(action)
                db_table_tictactoe_state = "INSERT INTO Tictactoe_state(EPISODE, NSTATE, NVALUE, CHOOSE, PICK, STATE_NOW, ACTION, NOTE) \
                    VALUES(%d, '%s', '%s', %d, %d, '%s', %d, '%s');" % (
                    max_episode, db_nstate, db_nvalue, db_choose, db_pick,
                    remove_X(nstate), action, db_note)
                cursor.execute(db_table_tictactoe_state)
                connection.commit()
            ###
            agent.backup(state, nstate, reward)
            env.render(mode='human')
            if done:
                db_win_state = ''
                if reward == 0:
                    db_win_state = "draw"
                else:
                    db_win_state = mark
                print("Return reward : " + str(reward))
                db_table_check_win = "INSERT INTO Check_win(REWARD, WIN_STATE) VALUES(%d, '%s');" % (
                    reward, db_win_state)
                cursor.execute(db_table_check_win)
                connection.commit()
                env.show_result(True, mark, reward)
                time.sleep(1)
                # if reward == 1:
                # _conlearn(700, epsilon, alpha, save_file, load_file)
                # set terminal state value
                set_state_value(state, reward)
                break
            else:
                _, mark = state = nstate
                connection.commit()

            # insert2 = "INSERT INTO Tictactoe_state(EPISODE, NSTATE, REWARD, ALL_CHOICE, CHOOSE, PICK, STATE_NOW, \
            #     ACTION, NOTE) VALUES(1, 'asdasd', 1, 'asdasd', 1, 1, 'asdasd', 1, 'asdasasd');"
            # cursor.execute(insert2)
            # connection.commit()
        # rotation start
        # start_mark = next_mark(start_mark)

        max_episode += 1
        # print(max_episode)
        save_model(save_file, max_episode, epsilon, alpha)

    connection.close()
def play(num_games, verbose=True):
    """
    Test out two agents playing against each other.
    Displays progress and result.

    Parameters:
    -----------
    num_games: int
        How many games to simulate
    verbose: bool
        If true, display play information during each game
        If false, just display progress bar as simulations progress.
    """
    # Print header
    print("-" * 30)
    print(f"Playing {num_games} games")
    print("  * Player X: {}".format(players["X"].name))
    print("  * Player O: {}".format(players["O"].name))
    print("-" * 30)

    # select random starting player
    start_mark = random.choice(["X", "O"])

    # keep track of who won
    winners = []

    # if verbose is false, display progress bar
    if not verbose:
        myrange = trange
    else:
        myrange = range

    for _ in myrange(num_games):

        # set up board
        env = TicTacToeEnv()
        env.set_start_mark(start_mark)
        state = env.reset()

        # init the agents
        agents = [players["X"]("X"), players["O"]("O")]

        # play until game is done
        while not env.done:
            _, mark = state
            if verbose:
                env.show_turn(True, mark)
            agent = agent_by_mark(agents, mark)
            action = agent.act(state, copy(env))
            state, reward, _, _ = env.step(action)
            if verbose:
                env.render()

        # append winner to list (-1=X, 1=0, 0=tie)
        winners.append(reward)

        # print out result
        if verbose:
            env.show_result(True, mark, reward)

        # rotate start
        start_mark = next_mark(start_mark)

    # tally and display final stats
    c = Counter(winners)
    total = c[-1] + c[1] + c[0]
    print("\nX won {} ({:.2%})".format(c[-1], c[-1] / total))
    print("O won {} ({:.2%})".format(c[1], c[1] / total))
    print("Tied  {} ({:.2%})".format(c[0], c[0] / total))
Beispiel #14
0
def _learnhuman1(epsilon, alpha, save_file, load_file, vs_agent, show_number):
    load_model(load_file)
    env = TicTacToeEnv(show_number=show_number)
    start_mark = 'X'
    agents = [vs_agent, TDAgent('O', epsilon, alpha)]
    max_episode = 0
    agent_temp = 6  #Set Start position at 6 to td_agent
    human_temp = 0
    while True:
        # start agent rotation
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False
        turns = 0
        human_diff = False

        # show start board for human agent
        if mark == 'X':
            env.render(mode='human')

        while not done:
            agent = agent_by_mark(agents, mark)
            human = isinstance(agent, HumanAgent)
            print(
                "======================================Switch Turn======================================"
            )
            env.show_turn(True, mark)
            ava_actions = env.available_actions()
            # print(ava_actions)
            if human:
                action = agent.act(ava_actions)
                turns += 1
                if turns == 1 and human_temp != action:
                    human_temp = action
                    human_diff = True
                    # print("Human action is %s"%(action))
                    # print("Turns == %s" % (turns))
                    # print("human_temp == %s" %(human_temp))
                    # print("human_diff == %s" %(human_diff))
                if action is None:
                    sys.exit()
            else:
                action = agent.act(state, ava_actions)
                if turns == 1 and human_diff == False:
                    action = agent_temp
                    print("------------> Human start in the same position")
                    print("------------> [Fix]Agent action is %s" %
                          (action + 1))
                elif turns == 1 and human_diff == True:
                    agent_temp = action
                    print(
                        "------------> Human start in the diiferent position")
                    print("------------> [New]Agent action is %s" %
                          (action + 1))
                else:
                    print("------------> Agent action is %s" % (action + 1))
            ###
            nstate, reward, done, info = env.step(action)
            agent.backup(state, nstate, reward)

            env.render(mode='human')
            if done:
                print("Return reward : " + str(reward))

                env.show_result(True, mark, reward)
                time.sleep(1)
                # if reward == 1:
                # _conlearn(700, epsilon, alpha, save_file, load_file)
                # set terminal state value
                set_state_value(state, reward)
                break
            else:
                _, mark = state = nstate

        # rotation start
        # start_mark = next_mark(start_mark)
        max_episode += 1
        # print(max_episode)
        save_model(save_file, max_episode, epsilon, alpha)
Beispiel #15
0
def train_agents(opponent,
                 max_episode,
                 epsilon,
                 epsilon_decay,
                 alpha,
                 alpha_decay,
                 gamma,
                 render=False):
    reset_state_values()

    env = TicTacToeEnv()
    if opponent == 'random':
        agents = [
            QAgent(env.observation_space.n, env.action_space.n, 'O', epsilon,
                   epsilon_decay, alpha, alpha_decay, gamma),
            RandomAgent('X')
        ]
    else:  # Two Q agents
        agents = [
            QAgent(env.observation_space.n, env.action_space.n, 'O', epsilon,
                   epsilon_decay, alpha, alpha_decay, gamma),
            QAgent(env.observation_space.n, env.action_space.n, 'X', epsilon,
                   epsilon_decay, alpha, alpha_decay, gamma)
        ]

    start_mark = 'O'
    agent_rewards = {'O': [], 'X': []}
    episode = 0
    for i in tqdm(range(max_episode)):
        episode += 1
        env.show_episode(False, episode)

        # reset agent for new episode
        for agent in agents:
            agent.episode_rate = episode / float(max_episode)

        env.set_start_mark(start_mark)
        state = env.reset()
        s, mark = state
        done = False
        while not done:
            if render:
                env.render()
            agent = agent_by_mark(agents, mark)
            ava_actions = env.available_actions()
            env.show_turn(False, mark)
            action = agent.act(state, ava_actions)

            # update (no rendering)
            nstate, reward, done, info = env.step(action)
            agent.update(s, nstate[0], action, reward, done)

            if done:
                if render:
                    env.render()
                env.show_result(render, mark, reward)
                # set terminal state value
                set_state_value(state, reward)
                agent_rewards['O'].append(reward)
                agent_rewards['X'].append(-reward)

            s, mark = state = nstate

        # rotate start
        start_mark = next_mark(start_mark)

    return agent_rewards, agent_by_mark(agents, 'O')