Example #1
def play(max_episode=10):
    episode = 0
    start_mark = 'O'
    env = TicTacToeEnv()
    agents = [BaseAgent('O'),
              BaseAgent('X')]

    while episode < max_episode:
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False
        while not done:
            env.show_turn(True, mark)

            agent = agent_by_mark(agents, mark)
            ava_actions = env.available_actions()
            action = agent.act(state, ava_actions)
            state, reward, done, info = env.step(action)
            env.render()

            if done:
                env.show_result(True, mark, reward)
                break
            else:
                _, mark = state

        # rotate start
        start_mark = next_mark(start_mark)
        episode += 1
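
The loops above and in the examples that follow all lean on two small helpers, next_mark and agent_by_mark. A minimal sketch of the behavior these examples appear to assume (the library's own implementations may differ):

def next_mark(mark):
    # assumed behavior: alternate between the two players' marks
    return 'X' if mark == 'O' else 'O'


def agent_by_mark(agents, mark):
    # assumed behavior: return the agent whose turn it is
    for agent in agents:
        if agent.mark == mark:
            return agent
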
def play_base(env):
    load_model(MC_MODEL_FILE)
    agents = [BaseAgent('O'), OnPolicyMCAgent('X', 0, 1)]

    start_mark = 'X'
    test_cases = 10
    while test_cases:
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False
        while not done:
            agent = agent_by_mark(agents, mark)
            ava_actions = env.available_actions()
            env.show_turn(False, mark)
            action = agent.act(state, ava_actions)
            next_state, reward, done, _ = env.step(action)
            state = next_state
            if done:
                env.show_result(True, mark, reward)
                break
            else:
                _, mark = state

        # rotation start
        start_mark = next_mark(start_mark)
        test_cases -= 1
def play(show_number):
    env = TicTacToeEnv(show_number=show_number)
    agents = [MinimaxAgent('O'),
              HumanAgent('X')]
    episode = 0
    while True:
        state = env.reset()
        _, mark = state
        done = False
        env.render()
        while not done:
            agent = agent_by_mark(agents, mark)
            env.show_turn(True, mark)
            ava_actions = env.available_actions()
            action = agent.act(state, ava_actions)
            if action is None:
                sys.exit()

            state, reward, done, info = env.step(action)
        
            print('')
            env.render()
            if done:
                env.show_result(True, mark, reward)
                break
            else:
                _, mark = state

        episode += 1
def _bench(max_episode, model_file, show_result=True):
    """Benchmark given model.

    Args:
        max_episode (int): Episode count to benchmark.
        model_file (str): Learned model file name to benchmark.
        show_result (bool): Output result to stdout.

    Returns:
        (str): Benchmark result as a JSON string.
    """
    minfo = load_model(model_file)
    agents = [BaseAgent('O'), TDAgent('X', 0, 0)]
    show = False

    start_mark = 'O'
    env = TicTacToeEnv()
    env.set_start_mark(start_mark)

    episode = 0
    results = []
    for i in tqdm(range(max_episode)):
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False
        while not done:
            agent = agent_by_mark(agents, mark)
            ava_actions = env.available_actions()
            action = agent.act(state, ava_actions)
            state, reward, done, info = env.step(action)
            print((state,reward,action))
            if show:
                env.show_turn(True, mark)
                env.render(mode='human')

            if done:
                if show:
                    env.show_result(True, mark, reward)
                results.append(reward)
                break
            else:
                _, mark = state

        # rotation start
        start_mark = next_mark(start_mark)
        episode += 1

    o_win = results.count(1)
    x_win = results.count(-1)
    draw = len(results) - o_win - x_win
    mfile = model_file.replace(CWD + os.sep, '')
    minfo.update(dict(base_win=o_win, td_win=x_win, draw=draw,
                      model_file=mfile))
    result = json.dumps(minfo)

    if show_result:
        print(result)
    return result
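
Because _bench returns the benchmark result as a JSON string, a caller would normally decode it before use. A hedged usage sketch; the model file name below is hypothetical:

import json

# hypothetical file name; pass whatever file the training step actually saved
summary = json.loads(_bench(1000, 'td_model.dat', show_result=False))
print(summary['base_win'], summary['td_win'], summary['draw'])
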
Example #5
	def learn(self,env,num_episodes,agent_2,rndm):
		returns_sum = defaultdict(float)
		returns_count = defaultdict(float)
		unique_states = set()
		# number_unique = []
		mean_returns = []
		# low_returns = []
		# high_returns = []


		start_mark = 'X'
		for episode in range(1,num_episodes+1):
			if episode%1000 == 0:
				print("\rEpisode {}/{}.".format(episode,num_episodes),end="")
				sys.stdout.flush()


			episodes = self.generate_episode(env,self.policy,start_mark)


			sa_in_episode = set([(tuple(x[0]),x[1]) for x in episodes])
		
			# print(Q)

			# for x in episodes:
				# unique_states.add(x[0])


			for state,action in sa_in_episode:
				sa_pair = (state,action)
				first_occurence_idx = next(i for i,x in enumerate(episodes) if x[0] == state and x[1] == action)
				# print(sa_pair)
				# print(first_occurence_idx)
				G = sum([x[2]*(self.discount_factor**i) for i,x in enumerate(episodes[first_occurence_idx:])])
				# print(G)
				returns_sum[sa_pair] += G
				returns_count[sa_pair] += 1.0
				self.Q[state][action] = returns_sum[sa_pair] / returns_count[sa_pair]

			# print(self.Q)

			for s,a in rndm:
				self.backup[(s,a)].append(deepcopy(self.Q[s][a]))

			start_mark = next_mark(start_mark)

			mu = play_against(self,agent_2,10)
			self.unique_states.append(len(self.Q.keys()))
			mean_returns.append(mu)
			# low_returns.append(low)
			# high_returns.append(high)
		
		save_model('Mc_OnPolicy_agent',num_episodes,self.epsilon,self.discount_factor,'Mc_OnPolicy',self.Q)
		return mean_returns
def _play(load_file, vs_agent, show_number):
    """Play with learned model.

    Make TD agent and adversarial agnet to play with.
    Play and switch starting mark when the game finished.
    TD agent behave no exploring action while in play mode.

    Args:
        load_file (str):
        vs_agent (object): Enemy agent of TD agent.
        show_number (bool): Whether show grid number for visual hint.
    """
    load_model(load_file)
    env = TicTacToeEnv(show_number=show_number)
    td_agent = TDAgent('X', 0, 0)  # prevent exploring
    start_mark = 'O'
    agents = [vs_agent, td_agent]

    while True:
        # start agent rotation
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False

        # show start board for human agent
        if mark == 'O':
            env.render(mode='human')

        while not done:
            agent = agent_by_mark(agents, mark)
            human = isinstance(agent, HumanAgent)

            env.show_turn(True, mark)
            ava_actions = env.available_actions()
            if human:
                action = agent.act(ava_actions)
                if action is None:
                    sys.exit()
            else:
                action = agent.act(state, ava_actions)

            state, reward, done, info = env.step(action)
            print((state,reward,action))

            env.render(mode='human')
            if done:
                env.show_result(True, mark, reward)
                break
            else:
                _, mark = state

        # rotation start
        start_mark = next_mark(start_mark)
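
A hedged usage sketch for _play: a human opponent playing 'O' against the loaded TD agent, with grid numbers shown. The file name is hypothetical.

# hypothetical file name; use the model produced by the corresponding _learn call
_play('td_model.dat', HumanAgent('O'), show_number=True)
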
def _learn(max_episode, epsilon, alpha, save_file):
    """Learn by episodes.

    Make two TD agents and repeat self-play for the given episode count.
    Update state values with the rewards coming from the environment.

    Args:
        max_episode (int): Episode count.
        epsilon (float): Probability of exploration.
        alpha (float): Step size.
        save_file (str): File name to save the result.
    """
    reset_state_values()

    env = TicTacToeEnv()
    agents = [TDAgent('O', epsilon, alpha),
              TDAgent('X', epsilon, alpha)]

    start_mark = 'O'
    for i in tqdm(range(max_episode)):
        episode = i + 1
        env.show_episode(False, episode)

        # reset agent for new episode
        for agent in agents:
            agent.episode_rate = episode / float(max_episode)

        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False
        while not done:
            agent = agent_by_mark(agents, mark)
            ava_actions = env.available_actions()
            env.show_turn(False, mark)
            action = agent.act(state, ava_actions)

            # update (no rendering)
            nstate, reward, done, info = env.step(action)
            print((state,reward,action))
            agent.backup(state, nstate, reward)

            if done:
                env.show_result(False, mark, reward)
                # set terminal state value
                set_state_value(state, reward)

            _, mark = state = nstate

        # rotate start
        start_mark = next_mark(start_mark)

    # save states
    save_model(save_file, max_episode, epsilon, alpha)
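
agent.backup(state, nstate, reward) is where the TD(0) value update presumably happens. A minimal sketch of such an update, assuming a state-value lookup get_state_value alongside the set_state_value used above; this illustrates the rule, not necessarily the library's actual TDAgent.backup:

def backup(self, state, nstate, reward):
    # TD(0): nudge V(state) toward V(nstate) by step size alpha.
    # The reward argument is unused here because terminal rewards are
    # written separately via set_state_value in _learn.
    current = get_state_value(state)   # assumed counterpart of set_state_value
    target = get_state_value(nstate)
    set_state_value(state, current + self.alpha * (target - current))
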
Example #8
def play_against(agent_mc,agent_2,max_episode = 10,bench = True):
	start_mark = 'O'

	env = TicTacToeEnv()
	env.set_start_mark(start_mark)
	agents = [agent_mc,agent_2]

	episode = 0
	results = []

	for i in range(max_episode):
		env.set_start_mark(start_mark)
		state = env.reset()
		_,mark = state

		done = False

		while not done:
			agent = agent_by_mark(agents,mark)

			ava_actions = env.available_actions()

			# print(agent.mark)

			# if agent.mark == 'O':
				# print(agent.Q[state])

			action = agent.act(state,ava_actions)
			state,reward,done,_ = env.step(action)

			# env.render()

			if done:
				results.append(reward)
				break
			else:
				_,mark = state
		start_mark = next_mark(start_mark)
		episode += 1

	o_win = results.count(1)
	x_win = results.count(-1)
	draw = len(results) - o_win - x_win

	if bench == False:
		print("O_WINS = {},X_WINS = {},DRAW = {}".format(o_win,x_win,draw))


	return float(o_win-x_win)/(max_episode)
 def act(self, state, ava_actions):
     board, mark = state
     nboard = list(board[:])
     if check_game_status(nboard) < 0:
         min = 100
         max = -100
         action_min = ava_actions[0]
         action_max = ava_actions[0]
         if mark == 'O':
             for action in ava_actions:
                 nboard[action] = 1
                 mark = next_mark(mark)
                 value, q = self.act(
                     (tuple(nboard), mark),
                     [p for p in ava_actions if p != action])
                 if (value < min):
                     min = value
                     action_min = action
                 nboard[action] = 0  #backtrack
                 mark = next_mark(mark)
             return min, action_min
         else:
             for action in ava_actions:
                 nboard[action] = 2
                 mark = next_mark(mark)
                 value, m = self.act(
                     (tuple(nboard), mark),
                     [p for p in ava_actions if p != action])
                 if (value > max):
                     max = value
                     action_max = action
                 nboard[action] = 0  #backtrack
                 mark = next_mark(mark)
             return max, action_max
     else:
         return check_game_status(nboard), 12
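
This minimax-style act and the rule-based agent below both rely on the status convention of check_game_status: a negative value while the game is still in progress, 0 for a draw, and a positive player code for a win, with tocode/tomark converting between codes and the 'O'/'X' marks. A hedged sketch of those conversions as the examples appear to assume them:

def tocode(mark):
    # assumed mapping: 'O' -> 1, 'X' -> 2 (matches the 1/2 cell values above)
    return 1 if mark == 'O' else 2


def tomark(code):
    return 'O' if code == 1 else 'X'
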
Example #10
	def generate_episode(self,env,policy,start_mark):
		episodes = []
		
		env.set_start_mark(start_mark)
		state = env.reset()

		done = False
		iteration = 0

		while not done:
			available_actions = env.available_actions()
			action = policy(state,available_actions,False)
			nstate,reward,done,_ = env.step(action)

			episodes.append((state,action,reward))
			state = nstate
			iteration += 1

			start_mark = next_mark(start_mark)

		return episodes
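
generate_episode delegates action selection to the policy callable it receives (self.policy or self.behaviour_policy in the MC agents elsewhere in this section). A minimal epsilon-greedy sketch with a matching call signature; treating the third argument as a "greedy only" flag is an assumption:

import random
import numpy as np

def policy(self, state, available_actions, greedy_only):
    # epsilon-greedy over the tabular Q-values of the legal moves
    if not greedy_only and random.random() < self.epsilon:
        return random.choice(available_actions)
    q_values = [self.Q[state][a] for a in available_actions]
    return available_actions[int(np.argmax(q_values))]
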
def find_loc_prob(state, aval_actions, action, win_count, loss_count, step):
    aval_actions.remove(action)
    state = after_action_state(state, action)
    game_status = check_game_status(state[0])

    if (game_status == 0 or game_status == tocode(next_mark(state[1]))):
        win_count = win_count + step

        if (game_status == 0):  # a draw is counted as a win for both players
            loss_count = loss_count + step

        return win_count, loss_count
    elif (game_status == tocode(state[1])):
        loss_count = loss_count + step
        return win_count, loss_count
    else:
        for action in aval_actions:
            temp = aval_actions.copy()
            loss_count, win_count = find_loc_prob(state, temp, action, loss_count, win_count, step/5)

    return win_count, loss_count
def learn(env):
    max_episode = MAX_EPISODE
    epsilon = EPSILON
    agents = [OnPolicyMCAgent('X', epsilon), OnPolicyMCAgent('O', epsilon)]
    agents[0].orig_actions = env.available_actions()
    agents[1].orig_actions = env.available_actions()

    start_mark = 'O'

    #iterating through episodes
    for episode in tqdm(range(max_episode)):

        #env.show_episode(False,episode+1)
        agents[0].trans_list = []
        agents[1].trans_list = []
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False
        while not done:
            agent = agent_by_mark(agents, mark)
            ava_actions = env.available_actions()
            env.show_turn(False, mark)
            action = agent.act(state, ava_actions)
            next_state, reward, done, _ = env.step(action)
            #agent.update(state, new_state)
            agent.trans_list.append((state, action, reward))

            state = next_state
            _, mark = state

        agents[0].update()
        agents[1].update()

        if done:
            env.show_result(False, mark, reward)

        start_mark = next_mark(start_mark)

    save_model(MC_MODEL_FILE)
    def act(self, state, my_env: TicTacToeEnv):
        available_actions = my_env.available_actions()
        # --- Step 1: play winning move, if possible ---
        for action in available_actions:
            nstate = after_action_state(state, action)
            gstatus = check_game_status(nstate[0])
            if gstatus > 0:
                if tomark(gstatus) == self.mark:
                    return action

        # --- Step 2: block opponent from winning ---
        # imagine the opponent was playing
        rev_state = (state[0], next_mark(state[1]))
        for action in available_actions:
            nstate = after_action_state(rev_state, action)
            gstatus = check_game_status(nstate[0])
            if gstatus > 0:
                # if they can make a winning move, play that
                if tomark(gstatus) == self.opponent_mark:
                    return action

        return random.choice(available_actions)
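
Both this rule-based agent and the find_loc_prob examples look one move ahead through after_action_state without touching the environment. A hedged sketch of the behavior these call sites assume: place the current player's mark on a copy of the board and pass the turn to the other player.

def after_action_state(state, action):
    # state is a (board, mark) pair; board is a tuple of 9 cell codes
    board, mark = state
    nboard = list(board)
    nboard[action] = tocode(mark)      # the player to move places their mark
    return tuple(nboard), next_mark(mark)
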
Example #14
def find_loc_prob(state, aval_actions, action, win_count, loss_count, step):
    aval_actions.remove(action)
    state = after_action_state(state, action)
    game_status = check_game_status(state[0])
    print("Action = {}".format(action))

    if (game_status == 0 or game_status == tocode(next_mark(state[1]))):
        win_count = win_count + step
        return win_count, loss_count
    elif (game_status == tocode(state[1])):
        loss_count = loss_count + step
        return win_count, loss_count
    else:
        for action in aval_actions:
            print("Calling recurssively for step {}".format(step))
            print(
                "Win count and Loss count till this step = {} and {} for mark {}"
                .format(win_count, loss_count, state[1]))
            loss_count, win_count = find_loc_prob(state, aval_actions, action,
                                                  loss_count, win_count,
                                                  step - 1)

    return win_count, loss_count
Example #15
def play(max_episode=10):
    start_mark = 'O'
    env = TicTacToeEnv()
    agents = [BaseAgent('O'),
              BaseAgent('X')]

    for _ in range(max_episode):
        env.set_start_mark(start_mark)
        state = env.reset()
        while not env.done:
            _, mark = state
            env.show_turn(True, mark)
            
            agent = agent_by_mark(agents, mark)
            ava_actions = env.available_actions()
            action = agent.act(state, ava_actions)
            state, reward, done, info = env.step(action)
            env.render()

        env.show_result(True, mark, reward)

        # rotate start
        start_mark = next_mark(start_mark)
Example #16
def train_agents(opponent,
                 max_episode,
                 epsilon,
                 epsilon_decay,
                 alpha,
                 alpha_decay,
                 gamma,
                 render=False):
    reset_state_values()

    env = TicTacToeEnv()
    if opponent == 'random':
        agents = [
            QAgent(env.observation_space.n, env.action_space.n, 'O', epsilon,
                   epsilon_decay, alpha, alpha_decay, gamma),
            RandomAgent('X')
        ]
    else:  # Two Q agents
        agents = [
            QAgent(env.observation_space.n, env.action_space.n, 'O', epsilon,
                   epsilon_decay, alpha, alpha_decay, gamma),
            QAgent(env.observation_space.n, env.action_space.n, 'X', epsilon,
                   epsilon_decay, alpha, alpha_decay, gamma)
        ]

    start_mark = 'O'
    agent_rewards = {'O': [], 'X': []}
    episode = 0
    for i in tqdm(range(max_episode)):
        episode += 1
        env.show_episode(False, episode)

        # reset agent for new episode
        for agent in agents:
            agent.episode_rate = episode / float(max_episode)

        env.set_start_mark(start_mark)
        state = env.reset()
        s, mark = state
        done = False
        while not done:
            if render:
                env.render()
            agent = agent_by_mark(agents, mark)
            ava_actions = env.available_actions()
            env.show_turn(False, mark)
            action = agent.act(state, ava_actions)

            # update (no rendering)
            nstate, reward, done, info = env.step(action)
            agent.update(s, nstate[0], action, reward, done)

            if done:
                if render:
                    env.render()
                env.show_result(render, mark, reward)
                # set terminal state value
                set_state_value(state, reward)
                agent_rewards['O'].append(reward)
                agent_rewards['X'].append(-reward)

            s, mark = state = nstate

        # rotate start
        start_mark = next_mark(start_mark)

    return agent_rewards, agent_by_mark(agents, 'O')
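
QAgent.update is not shown here. A hedged sketch of the one-step Q-learning update it presumably applies, with self.q assumed to map a state to an array of action values and alpha/gamma taken from the constructor arguments above; the signature matches the call in train_agents, but the internals are an assumption:

def update(self, state, next_state, action, reward, done):
    # One-step Q-learning: bootstrap from the best next action,
    # except on terminal transitions where the target is the reward alone.
    target = reward if done else reward + self.gamma * max(self.q[next_state])
    self.q[state][action] += self.alpha * (target - self.q[state][action])
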
Example #17
	def learn(self,env,num_episodes,agent_2,rndm):

		mean_returns = []
		C = defaultdict(lambda: np.zeros(env.action_space.n))

		start_mark = 'X'


		for episode in range(1,num_episodes+1):
			if episode%1000 == 0:
				print("\rEpisode {}/{}".format(episode,num_episodes),end="")
				sys.stdout.flush()

			episodes = self.generate_episode(env,self.behaviour_policy,start_mark)

			G = 0.0
			W = 1.0

			t_initial = len(episodes)
			for t in range(len(episodes))[::-1]:
				state,action,reward = episodes[t]
				# print("State = {}, Action = {}, reward = {},W = {}".format(state,action,reward,W))
				G = self.discount_factor*G + reward
				C[state][action] += W

				# if (W/C[state][action]) != 1.0:
					# print(W/C[state][action])

				# print(self.Q[state])

				# if t_initial-t > 4:
					# print(t_initial-t)

				self.Q[state][action] += (W/C[state][action]) * (G - self.Q[state][action])
				# print(self.Q[state])


				x = np.nonzero(state[0])
				y = []
				for i in range(9):
					if i in x[0]:
						continue
					else:
						y.append(i)

				# print(state)
				# print(y)
				y = np.array(y) 

				# print(self.target_policy(state,y))
				if action != self.target_policy(state,y):
					break


				W = W*(len(y))

			start_mark = next_mark(start_mark)

			mu = play_against(self,agent_2,10)
			self.unique_states.append(len(self.Q.keys()))
			mean_returns.append(mu)

			for s,a in rndm:
				self.backup[(s,a)].append(deepcopy(self.Q[s][a]))
			# low_returns.append(low)
			# high_returns.append(high)


		save_model('Mc_OffPolicy_agent.dat',num_episodes,None,self.discount_factor,'Mc_OffPolicy',self.Q)

		return mean_returns
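
In the weighted-importance-sampling loop above, W = W*(len(y)) multiplies in the per-step importance ratio: with a greedy, deterministic target policy (probability 1 for its chosen action) and a behaviour policy that picks uniformly among the len(y) legal moves (probability 1/len(y)), the ratio is len(y), and the inner loop breaks as soon as the episode's action disagrees with the target policy. A hedged sketch of a greedy target_policy consistent with how it is called above:

import numpy as np

def target_policy(self, state, available_actions):
    # deterministic target: the legal move with the highest Q-value
    q_values = [self.Q[state][a] for a in available_actions]
    return available_actions[int(np.argmax(q_values))]
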
 def opponent_mark(self):
     return next_mark(self.mark)
def play(num_games, verbose=True):
    """
    Test out two agents playing against each other.
    Displays progress and result.

    Parameters:
    -----------
    num_games: int
        How many games to simulate
    verbose: bool
        If true, display play information during each game
        If false, just display progress bar as simulations progress.
    """
    # Print header
    print("-" * 30)
    print(f"Playing {num_games} games")
    print("  * Player X: {}".format(players["X"].name))
    print("  * Player O: {}".format(players["O"].name))
    print("-" * 30)

    # select random starting player
    start_mark = random.choice(["X", "O"])

    # keep track of who won
    winners = []

    # if verbose is false, display progress bar
    if not verbose:
        myrange = trange
    else:
        myrange = range

    for _ in myrange(num_games):

        # set up board
        env = TicTacToeEnv()
        env.set_start_mark(start_mark)
        state = env.reset()

        # init the agents
        agents = [players["X"]("X"), players["O"]("O")]

        # play until game is done
        while not env.done:
            _, mark = state
            if verbose:
                env.show_turn(True, mark)
            agent = agent_by_mark(agents, mark)
            action = agent.act(state, copy(env))
            state, reward, _, _ = env.step(action)
            if verbose:
                env.render()

        # append winner to list (-1=X, 1=O, 0=tie)
        winners.append(reward)

        # print out result
        if verbose:
            env.show_result(True, mark, reward)

        # rotate start
        start_mark = next_mark(start_mark)

    # tally and display final stats
    c = Counter(winners)
    total = c[-1] + c[1] + c[0]
    print("\nX won {} ({:.2%})".format(c[-1], c[-1] / total))
    print("O won {} ({:.2%})".format(c[1], c[1] / total))
    print("Tied  {} ({:.2%})".format(c[0], c[0] / total))