def setUp(self):
    self.env = DQN.env
    (self.player_states, (self.community_infos, self.community_cards)) = self.env.reset()
    (self.player_infos, self.player_hands) = zip(*self.player_states)
    self.current_state = ((self.player_infos, self.player_hands),
                          (self.community_infos, self.community_cards))
    self.state = DQN.create_np_array(self.player_infos, self.player_hands,
                                     self.community_cards, self.community_infos)
    # Only the state set relevant to the learner bot.
    self.state_set = utilities.convert_list_to_tupleA(
        self.player_states[self.env.learner_bot.get_seat()], self.current_state[1])
    self._round = utilities.which_round(self.community_cards)
    self.current_player = self.community_infos[-3]
    self.learner_bot, self.villain = self.env.learner_bot, self.env.villain
    Q = defaultdict(lambda: np.zeros(self.env.action_space.n))
    self.agent = DQN.DQNAgent(DQN.state_size, DQN.action_size)  # initialise agent
    self.policy = DQN.make_epsilon_greedy_policy(Q, self.agent.epsilon,
                                                 self.env.action_space.n)
    self.villain_action = DQN.get_action_policy(
        self.player_infos, self.community_infos, self.community_cards, self.env,
        self._round, self.env.n_seats, self.state_set, self.policy, self.villain)
    self.learner_action = self.agent.act(
        self.state, self.player_infos, self.community_infos, self.community_cards,
        self.env, self._round, self.env.n_seats, self.state_set, self.policy)
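# A hypothetical companion test for this fixture. Per the docstring of
# mc_control_epsilon_greedy below, the epsilon-greedy policy maps an
# observation to action probabilities, so the returned vector should cover
# the action space and sum to 1. This is a sketch, not part of the repo.
def test_policy_returns_distribution(self):
    probs = self.policy(self.state_set)
    self.assertEqual(len(probs), self.env.action_space.n)
    self.assertAlmostEqual(sum(probs), 1.0)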
def generate_episode(env, n_seats):
    # Roll out one hand and record (state, action, reward) tuples.
    episode = []
    (player_states, (community_infos, community_cards)) = env.reset()
    (player_infos, player_hands) = zip(*player_states)
    current_state = ((player_infos, player_hands), (community_infos, community_cards))
    env.render(mode='human', initial=True, delay=delay)
    terminal = False
    while not terminal:
        _round = utilities.which_round(community_cards)
        current_player = community_infos[-3]
        current_bet = env._current_player.currentbet
        actions = get_action_policy(player_infos, community_infos, community_cards,
                                    env, _round, n_seats)
        (player_states, (community_infos, community_cards)), action, rewards, terminal, info = env.step(actions)
        current_state = (player_states, (community_infos, community_cards))
        episode.append((current_state, action, env.learner_bot.reward))
        env.render(mode='human', delay=delay)
    return episode
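# Example usage (a sketch: `env` is assumed to be the module-level poker
# environment used elsewhere in this file):
#
#   episode = generate_episode(env, env.n_seats)
#   for state, action, reward in episode:
#       print(action, reward)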
def mc_control_epsilon_greedy(num_episodes, discount_factor=1.0, epsilon=0.1,
                              is_with_rendering=with_render):
    """
    Monte Carlo Control using Epsilon-Greedy policies.
    Finds an optimal epsilon-greedy policy.

    Args:
        env: OpenAI gym environment (read from the enclosing module scope).
        num_episodes: Number of episodes to sample.
        discount_factor: Gamma discount factor.
        epsilon: Chance to sample a random action. Float between 0 and 1.

    Returns:
        A tuple (Q, policy).
        Q is a dictionary mapping state -> action values.
        policy is a function that takes an observation as an argument and
        returns action probabilities.
    """
    # Keeps track of sum and count of returns for each state
    # to calculate an average. We could use an array to save all
    # returns (like in the book) but that's memory inefficient.
    returns_sum = defaultdict(float)
    returns_count = defaultdict(float)

    # The final action-value function.
    # A nested dictionary that maps state -> (action -> action-value).
    Q = defaultdict(lambda: np.zeros(env.action_space.n))

    # The policy we're following. Argument order matches the other call
    # sites: (Q, epsilon, nA).
    policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

    episode_list = []
    stacks_over_time = {}
    for index, player in env._player_dict.items():
        stacks_over_time.update({player.get_seat(): [player.stack]})

    for i_episode in range(1, num_episodes + 1):
        if is_with_rendering:
            print("\n\n********{}*********".format(i_episode))
        # Print out which episode we're on, useful for debugging.
        if i_episode % 10 == 0:
            print("\rEpisode {}/{}.".format(i_episode, num_episodes), end="")
            sys.stdout.flush()

        # Generate an episode: an array of (state, action, reward) tuples.
        # episode = generate_episode_control(env, env.n_seats, policy)
        episode = []
        (player_states, (community_infos, community_cards)) = env.reset()
        (player_infos, player_hands) = zip(*player_states)
        current_state = ((player_infos, player_hands),
                         (community_infos, community_cards))
        utilities.compress_bucket(current_state, env, pre=True)
        # print(env.level_raises)

        # Only want the state set that is relevant to the learner bot every step.
        state_set = utilities.convert_list_to_tupleA(
            player_states[env.learner_bot.get_seat()], current_state[1])

        if is_with_rendering:
            env.render(mode='human', initial=True, delay=delay)
        terminal = False
        while not terminal:
            _round = utilities.which_round(community_cards)
            current_player = community_infos[-3]
            current_bet = env._current_player.currentbet
            action = get_action_policy(player_infos, community_infos, community_cards,
                                       env, _round, env.n_seats, state_set, policy)
            # print(env.level_raises)
            (player_states, (community_infos, community_cards)), action, rewards, terminal, info = env.step(action)
            utilities.compress_bucket(player_states, env)
            parsed_return_state = utilities.convert_step_return_to_set(
                (current_state, action, env.learner_bot.reward))
            action = utilities.convert_step_return_to_action(action)
            episode.append((parsed_return_state, action, env.learner_bot.reward))
            current_state = (player_states,
                             (community_infos, community_cards))  # state = next_state
            if is_with_rendering:
                env.render(mode='human', delay=delay)

        # Assign new positions, remove players if stack < 0, etc.
        is_end_game = utilities.do_necessary_env_cleanup(env)
        stack_list = env.report_game(requested_attributes=["stack"])
        count_existing_players = 0
        for stack_record_index, stack_record in env._player_dict.items():
            arr = stacks_over_time[stack_record_index] + [stack_list[stack_record_index]]
            stacks_over_time.update({stack_record_index: arr})
            if stack_list[stack_record_index] != 0:
                count_existing_players += 1
        episode_list.append(episode)

        # Find all (state, action) pairs we've visited in this episode.
        # We convert each state to a tuple so that we can use it as a dict key.
        sa_in_episode = set([(tuple(sar[0]), sar[1]) for sar in episode])
        for state, action in sa_in_episode:
            state = state[0]
            sa_pair = (state, action)
            # Find the first occurrence of the (state, action) pair in the episode.
            first_occurrence_idx = next(i for i, x in enumerate(episode)
                                        if x[0][0] == state and x[1] == action)
            # Sum up all rewards since the first occurrence.
            G = sum([x[2] * (discount_factor**i)
                     for i, x in enumerate(episode[first_occurrence_idx:])])
            # Calculate average return for this state over all sampled episodes.
            returns_sum[sa_pair] += G
            returns_count[sa_pair] += 1.0
            Q[state][action] = returns_sum[sa_pair] / returns_count[sa_pair]
            # The policy is improved implicitly by changing the Q dictionary.

        if is_end_game:
            break  # Episode end

    for player_idx, stack in stacks_over_time.items():
        if player_idx == 0:
            plt.plot(stack, label="Player {} - Learner".format(player_idx))
        else:
            plt.plot(stack, label="Player {}".format(player_idx))

    p1_stack_t = list(stacks_over_time.values())[0]
    p2_stack_t = list(stacks_over_time.values())[1]
    # diffs = [j-i for i, j in zip(p1_stack_t[:-1], p1_stack_t[1:])]
    # import statistics
    # lost_avg = statistics.mean(diffs)
    won_avg = p1_stack_t[len(p1_stack_t) - 1] - p1_stack_t[0]
    # print(p1_stack_t)
    print('mbb/g:{}'.format(won_avg / num_episodes))
    plt.ylabel('Stack Size')
    plt.xlabel('Episode')
    plt.legend()
    if with_graph:
        plt.show()
    return Q, policy
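# `make_epsilon_greedy_policy` is called above but not defined in this
# section. A minimal sketch, assuming the (Q, epsilon, nA) argument order
# used at the call sites and the convention from the classic Monte Carlo
# control notebooks this loop follows; the repo's own definition may differ.
def make_epsilon_greedy_policy(Q, epsilon, nA):
    def policy_fn(observation):
        # Spread epsilon's probability mass uniformly over all nA actions,
        # then give the remaining (1 - epsilon) to the greedy action.
        A = np.ones(nA, dtype=float) * epsilon / nA
        best_action = np.argmax(Q[observation])
        A[best_action] += (1.0 - epsilon)
        return A
    return policy_fn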
episode = []
(player_states, (community_infos, community_cards)) = env.reset()
(player_infos, player_hands) = zip(*player_states)
current_state = ((player_infos, player_hands), (community_infos, community_cards))
utilities.compress_bucket(current_state, env, pre=True)
state = create_np_array(player_infos, player_hands, community_cards, community_infos)

# Only want the state set that is relevant to the learner bot every step.
state_set = utilities.convert_list_to_tupleA(
    player_states[env.learner_bot.get_seat()], current_state[1])

if with_render:
    env.render(mode='human', initial=True, delay=delay)
terminal = False
while not terminal:
    _round = utilities.which_round(community_cards)
    current_player = community_infos[-3]
    if current_player != 0:  # villain to act
        action = get_action_policy(player_infos, community_infos, community_cards, env,
                                   _round, env.n_seats, state_set, policy, villain)
    else:  # seat 0 is the learner bot
        action = agent.act(state, player_infos, community_infos, community_cards, env,
                           _round, env.n_seats, state_set, policy)
    # STEP - set a breakpoint on the following line to observe actions one by one.
    (player_states, (community_infos, community_cards)), action, rewards, terminal, info = env.step(action)
    utilities.compress_bucket(player_states, env)
    action = utilities.convert_step_return_to_action(action)
    ps = list(zip(*player_states))
    next_state = create_np_array(ps[0], ps[1], community_cards, community_infos)  # NumPy array
    agent.remember(state, action, env.learner_bot.reward, next_state, terminal)
    state = next_state
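# After remembering transitions, a DQN loop of this shape typically trains
# from replayed minibatches once the hand ends. A sketch, assuming the agent
# exposes `memory` and a `replay(batch_size)` method (standard in the small
# Keras-style DQN agents this code resembles); `batch_size` is a
# hypothetical constant, not taken from the repo.
batch_size = 32
if len(agent.memory) > batch_size:
    agent.replay(batch_size)  # fit the Q-network on sampled transitions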
def populate_info_pre_action(self):
    """Cache the current round and acting player before an action is chosen."""
    self._round = utilities.which_round(self.community_cards)
    self.current_player = self.community_infos[-3]
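# Sketch of how this helper slots into the step loops above (assuming
# self.community_cards and self.community_infos are refreshed from
# env.step() first):
#
#   self.populate_info_pre_action()
#   if self.current_player != 0:
#       ...  # villain acts, as in the DQN loop above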