Example #1
    def setUp(self):
        self.env = DQN.env
        (self.player_states, (self.community_infos,
                              self.community_cards)) = self.env.reset()
        (self.player_infos, self.player_hands) = zip(*self.player_states)
        self.current_state = ((self.player_infos, self.player_hands),
                              (self.community_infos, self.community_cards))
        self.state = DQN.create_np_array(self.player_infos, self.player_hands,
                                         self.community_cards,
                                         self.community_infos)
        self.state_set = utilities.convert_list_to_tupleA(
            self.player_states[self.env.learner_bot.get_seat()],
            self.current_state[1])
        self._round = utilities.which_round(self.community_cards)
        self.current_player = self.community_infos[-3]
        self.learner_bot, self.villain = self.env.learner_bot, self.env.villain
        Q = defaultdict(lambda: np.zeros(self.env.action_space.n))
        self.agent = DQN.DQNAgent(DQN.state_size,
                                  DQN.action_size)  # initialise agent

        self.policy = DQN.make_epsilon_greedy_policy(Q, self.agent.epsilon,
                                                     self.env.action_space.n)
        self.villain_action = DQN.get_action_policy(
            self.player_infos, self.community_infos, self.community_cards,
            self.env, self._round, self.env.n_seats, self.state_set,
            self.policy, self.villain)
        self.learner_action = self.agent.act(self.state, self.player_infos,
                                             self.community_infos,
                                             self.community_cards, self.env,
                                             self._round, self.env.n_seats,
                                             self.state_set, self.policy)
Example #2
def generate_episode(env, n_seats):
    # state observation
    episode = []
    (player_states, (community_infos, community_cards)) = env.reset()
    (player_infos, player_hands) = zip(*player_states)
    current_state = ((player_infos, player_hands), (community_infos,
                                                    community_cards))

    env.render(mode='human', initial=True, delay=delay)
    terminal = False
    while not terminal:

        _round = utilities.which_round(community_cards)
        current_player = community_infos[-3]
        a = env._current_player.currentbet
        actions = get_action_policy(player_infos, community_infos,
                                    community_cards, env, _round, n_seats)
        (player_states, (community_infos, community_cards)
         ), action, rewards, terminal, info = env.step(actions)
        current_state = (player_states, (community_infos, community_cards))
        episode.append((current_state, action, env.learner_bot.reward))
        env.render(mode='human', delay=delay)

    return episode
Example #3
def mc_control_epsilon_greedy(num_episodes,
                              discount_factor=1.0,
                              epsilon=0.1,
                              is_with_rendering=with_render):
    """
    Monte Carlo Control using Epsilon-Greedy policies.
    Finds an optimal epsilon-greedy policy.
    
    Args:
        env: OpenAI gym environment (a module-level global in this snippet, not a parameter).
        num_episodes: Number of episodes to sample.
        discount_factor: Gamma discount factor.
        epsilon: Chance to sample a random action. Float between 0 and 1.
    
    Returns:
        A tuple (Q, policy).
        Q is a dictionary mapping state -> action values.
        policy is a function that takes an observation as an argument and returns
        action probabilities.
    """

    # Keeps track of sum and count of returns for each state
    # to calculate an average. We could use an array to save all
    # returns (like in the book) but that's memory inefficient.
    returns_sum = defaultdict(float)
    returns_count = defaultdict(float)

    # The final action-value function.
    # A nested dictionary that maps state -> (action -> action-value).
    Q = defaultdict(lambda: np.zeros(env.action_space.n))

    # The policy we're following
    policy = make_epsilon_greedy_policy(Q, env.action_space.n, epsilon)

    episode_list = []
    stacks_over_time = {}
    for index, player in env._player_dict.items():
        stacks_over_time.update({player.get_seat(): [player.stack]})
    for i_episode in range(1, num_episodes + 1):
        if is_with_rendering:
            print("\n\n********{}*********".format(i_episode))

        # Print out which episode we're on, useful for debugging.
        if i_episode % 10 == 0:
            print("\rEpisode {}/{}.".format(i_episode, num_episodes), end="")
            sys.stdout.flush()

        # Generate an episode.
        # An episode is an array of (state, action, reward) tuples
        # episode = generate_episode_control(env, env.n_seats, policy)

        episode = []
        (player_states, (community_infos, community_cards)) = env.reset()
        (player_infos, player_hands) = zip(*player_states)
        current_state = ((player_infos, player_hands), (community_infos,
                                                        community_cards))
        utilities.compress_bucket(current_state, env, pre=True)
        # print(env.level_raises)
        # Only want the state set that is relevant to learner bot every step.
        state_set = utilities.convert_list_to_tupleA(
            player_states[env.learner_bot.get_seat()], current_state[1])

        if is_with_rendering:
            env.render(mode='human', initial=True, delay=delay)
        terminal = False
        while not terminal:

            _round = utilities.which_round(community_cards)
            current_player = community_infos[-3]
            a = env._current_player.currentbet
            action = get_action_policy(player_infos, community_infos,
                                       community_cards, env, _round,
                                       env.n_seats, state_set, policy)
            # print(env.level_raises)
            (player_states, (community_infos, community_cards)
             ), action, rewards, terminal, info = env.step(action)

            utilities.compress_bucket(player_states, env)
            parsed_return_state = utilities.convert_step_return_to_set(
                (current_state, action, env.learner_bot.reward))
            action = utilities.convert_step_return_to_action(action)
            episode.append(
                (parsed_return_state, action, env.learner_bot.reward))
            current_state = (player_states, (community_infos, community_cards)
                             )  # state = next_state
            if is_with_rendering:
                env.render(mode='human', delay=delay)

        is_end_game = utilities.do_necessary_env_cleanup(
            env)  # assign new positions, remove players if stack < 0 etc ..
        stack_list = env.report_game(requested_attributes=["stack"])
        count_existing_players = 0
        for stack_record_index, stack_record in env._player_dict.items():
            arr = stacks_over_time[stack_record_index] + [
                stack_list[stack_record_index]
            ]
            stacks_over_time.update({stack_record_index: arr})
            if stack_list[stack_record_index] != 0:
                count_existing_players += 1
        episode_list.append(episode)

        # Find all (state, action) pairs we've visited in this episode
        # We convert each state to a tuple so that we can use it as a dict key
        sa_in_episode = {(tuple(sar[0]), sar[1]) for sar in episode}
        for state, action in sa_in_episode:
            state = state[0]
            sa_pair = (state, action)
            # Find the first occurrence of the (state, action) pair in the episode
            first_occurrence_idx = next(i for i, x in enumerate(episode)
                                        if x[0][0] == state and x[1] == action)
            # Sum up all rewards since the first occurrence
            G = sum(x[2] * (discount_factor**i)
                    for i, x in enumerate(episode[first_occurrence_idx:]))
            # Calculate average return for this state over all sampled episodes
            returns_sum[sa_pair] += G
            returns_count[sa_pair] += 1.0
            Q[state][action] = returns_sum[sa_pair] / returns_count[sa_pair]

        # The policy is improved implicitly by changing the Q dictionary
        if is_end_game:
            break

    # Episode end
    for player_idx, stack in stacks_over_time.items():
        if player_idx == 0:
            plt.plot(stack, label="Player {} - Learner".format(player_idx))
        else:
            plt.plot(stack, label="Player {}".format(player_idx))

    p1_stack_t = list(stacks_over_time.values())[0]
    p2_stack_t = list(stacks_over_time.values())[1]
    # diffs = [j-i for i, j in zip(p1_stack_t[:-1], p1_stack_t[1:])]
    # import statistics
    # lost_avg = statistics.mean(diffs)
    won_avg = p1_stack_t[-1] - p1_stack_t[0]
    # print(p1_stack_t)
    print('mbb/g:{}'.format(won_avg / num_episodes))
    plt.ylabel('Stack Size')
    plt.xlabel('Episode')
    plt.legend()
    if with_graph:
        plt.show()

    return Q, policy
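
The helper make_epsilon_greedy_policy is called in Examples #1 and #3 but its definition is not shown. Below is a minimal sketch of such a helper, assuming the (Q, epsilon, nA) argument order used in Example #1; Example #3 passes the arguments in a different order, so the repository's actual signature may differ.

import numpy as np

def make_epsilon_greedy_policy(Q, epsilon, nA):
    # Returns a policy function that maps an observation to a vector of
    # action probabilities: every action receives epsilon / nA, and the
    # greedy action under Q gets the remaining (1 - epsilon) mass.
    def policy_fn(observation):
        A = np.ones(nA, dtype=float) * epsilon / nA
        best_action = np.argmax(Q[observation])
        A[best_action] += 1.0 - epsilon
        return A
    return policy_fn
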
Example #4
        episode = []
        (player_states, (community_infos, community_cards)) = env.reset()
        (player_infos, player_hands) = zip(*player_states)
        current_state = ((player_infos, player_hands), (community_infos, community_cards))
        utilities.compress_bucket(current_state, env, pre=True)
        state = create_np_array(player_infos, player_hands, community_cards, community_infos)

        # Only want the state set that is relevant to learner bot every step. 
        state_set = utilities.convert_list_to_tupleA(player_states[env.learner_bot.get_seat()], current_state[1])

        if with_render:
            env.render(mode='human', initial=True, delay=delay)
        terminal = False
        while not terminal:

            _round = utilities.which_round(community_cards)
            current_player = community_infos[-3]
            if current_player != 0:
                action = get_action_policy(player_infos, community_infos, community_cards, env, _round, env.n_seats, state_set, policy, villain)
            else:
                action = agent.act(state, player_infos, community_infos, community_cards, env, _round, env.n_seats, state_set, policy)
            
            # STEP - set a breakpoint on the following line to observe actions taken one by one
            (player_states, (community_infos, community_cards)), action, rewards, terminal, info = env.step(action)

            utilities.compress_bucket(player_states, env)
            action = utilities.convert_step_return_to_action(action)
            ps = list(zip(*player_states))
            next_state = create_np_array(ps[0], ps[1], community_cards, community_infos) # Numpy array
            agent.remember(state, action, env.learner_bot.reward, next_state, terminal)
            state = next_state
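
The agent.remember call above stores each transition for experience replay before the DQN is trained on sampled minibatches. As an assumption about how DQNAgent might hold those transitions (the repository's implementation may differ), here is a minimal deque-backed buffer sketch with the same remember interface.

import random
from collections import deque

class ReplayMemory:
    # Fixed-size buffer of (state, action, reward, next_state, terminal) tuples.
    def __init__(self, capacity=2000):
        self.memory = deque(maxlen=capacity)

    def remember(self, state, action, reward, next_state, terminal):
        # The oldest transition is dropped automatically once capacity is reached.
        self.memory.append((state, action, reward, next_state, terminal))

    def sample(self, batch_size=32):
        # Uniformly sample a minibatch for one training step.
        return random.sample(self.memory, min(batch_size, len(self.memory)))
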
Example #5
    def populate_info_pre_action(self):
        self._round = utilities.which_round(self.community_cards)
        self.current_player = self.community_infos[-3]