def _learn(max_episode, epsilon, alpha, save_file):
    """Learn by episodes.

    Make two TD agent, and repeat self play for given episode count.
    Update state values as reward coming from the environment.

    Args:
        max_episode (int): Episode count.
        epsilon (float): Probability of exploration.
        alpha (float): Step size.
        save_file: File name to save result.
    """
    # Start from a clean value table; previous training is discarded.
    reset_state_values()
    env = TicTacToeEnv()
    agents = [TDAgent('O', epsilon, alpha, env),
              TDAgent('X', epsilon, alpha, env)]

    start_mark = 'O'
    for i in tqdm(range(max_episode)):
        episode = i + 1
        env.show_episode(False, episode)

        # reset agent for new episode
        # (fraction of training completed; presumably used by the agent to
        # anneal exploration — TODO confirm against TDAgent)
        for agent in agents:
            agent.episode_rate = episode / float(max_episode)

        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False
        while not done:
            agent = agent_by_mark(agents, mark)
            ava_actions = env.available_actions()
            env.show_turn(False, mark)
            action = agent.act(state, ava_actions)

            # update (no rendering)
            nstate, reward, done, info = env.step(action)
            # TD backup toward the post-move state, before advancing `state`.
            agent.backup(state, nstate, reward)

            if done:
                env.show_result(False, mark, reward)
                # set terminal state value
                # NOTE: writes the value of the pre-step state, not `nstate`.
                set_state_value(state, reward)

            # Advance to the next state and read whose turn it is from it.
            _, mark = state = nstate

        # rotate start
        start_mark = next_mark(start_mark)

    # save states
    save_model(save_file, max_episode, epsilon, alpha)
def _play(load_file, vs_agent, show_number):
    """Play with learned model.

    Make TD agent and adversarial agent to play with. Play and switch
    starting mark when the game finished. TD agent behaves with no exploring
    action while in play mode.

    Args:
        load_file (str): File name of the saved model to load.
        vs_agent (object): Enemy agent of TD agent.
        show_number (bool): Whether show grid number for visual hint.
    """
    load_model(load_file)
    env = TicTacToeEnv(show_number=show_number)
    # epsilon=0 and alpha=0: greedy play, no learning updates.
    td_agent = TDAgent('X', 0, 0, env)  # prevent exploring

    start_mark = 'O'
    agents = [vs_agent, td_agent]
    # Loops forever; a human opponent quits via act() returning None below.
    while True:
        # start agent rotation
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False

        # show start board for human agent
        # (presumably 'O' is the vs_agent/human mark — TODO confirm)
        if mark == 'O':
            env.render(mode='human')

        while not done:
            agent = agent_by_mark(agents, mark)
            human = isinstance(agent, HumanAgent)

            env.show_turn(True, mark)
            ava_actions = env.available_actions()
            if human:
                # HumanAgent takes only the available actions; a None
                # result means the user asked to quit.
                action = agent.act(ava_actions)
                if action is None:
                    sys.exit()
            else:
                action = agent.act(state, ava_actions)

            state, reward, done, info = env.step(action)

            env.render(mode='human')
            if done:
                env.show_result(True, mark, reward)
                break
            else:
                _, mark = state

        # rotation start
        start_mark = next_mark(start_mark)
def play(max_episode=10):
    """Run rendered self-play games between two base agents.

    Plays `max_episode` games, alternating which mark opens each game,
    and shows the board after every move plus the final result.

    Args:
        max_episode (int): Number of games to play.
    """
    env = TicTacToeEnv()
    agents = [Agent('O'), Agent('X')]
    start_mark = 'O'

    for _ in range(max_episode):
        env.set_start_mark(start_mark)
        state = env.reset()

        while not env.done:
            _, mark = state
            env.show_turn(True, mark)
            mover = agent_by_mark(agents, mark)
            candidates = env.available_actions()
            move = mover.act(state, candidates)
            state, reward, done, info = env.step(move)
            env.render()

        # `mark` and `reward` are left over from the game's final move.
        env.show_result(True, mark, reward)

        # alternate the opening mark for the next game
        start_mark = next_mark(start_mark)