Example #1
import matplotlib.pyplot as plt
import matplotlib.patches as patches

# Player, Game, state_lookup and reverse_transforms come from the surrounding project
def show_move_values(player: Player, game: Game):
    """Show learned values for game state.

    Args:
        player: instance of Player class
        game: instance of Game class
    """

    match_state, transform = state_lookup(game.state, player.value_map)
    if 'k' in transform['args']:
        transform['args']['k'] = -transform['args']['k']
    action_values = player.value_map.get(match_state, None)[game.mark]
    adj_values = reverse_transforms(action_values, transform, game.ind_to_loc)

    # draw the 3x3 board grid
    _, ax = plt.subplots(figsize=(4.5, 4.5))
    _ = plt.plot([1, 1], [0, -3], 'k-', linewidth=4)
    _ = plt.plot([2, 2], [0, -3], 'k-', linewidth=4)
    _ = plt.plot([0, 3], [-1, -1], 'k-', linewidth=4)
    _ = plt.plot([0, 3], [-2, -2], 'k-', linewidth=4)
    for x, y in game.ind_to_loc:
        if game.state[x, y] != 0:
            # occupied square: draw the player's mark
            mark = 'x' if game.state[x, y] == 1 else 'o'
            plt.text(y + 0.275, -x - 0.725, mark, size=60)
        else:
            # empty square: show the learned value, rounded to 2 decimals
            plt.text(y + 0.35, -x - 0.575, round(adj_values[x, y], 2), size=15)
            # shade the square in proportion to its learned value
            square = patches.Rectangle((y, -x - 1),
                                       1,
                                       1,
                                       linewidth=0,
                                       edgecolor='none',
                                       facecolor='r',
                                       alpha=adj_values[x, y] * 0.75)
            ax.add_patch(square)
    _ = ax.axis('off')
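
For context, a minimal usage sketch (hypothetical: it assumes Player and Game can be constructed with no arguments and that the player already carries a learned value_map):

player = Player()   # assumed to already hold a learned value_map
game = Game()       # assumed to expose .state, .mark and .ind_to_loc as used above
show_move_values(player, game)
plt.show()
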
Example #2
    def _policy(self, marker: int, game: Game) -> Tuple[int, int]:
        """Choose a move for the given marker based on learned action values."""
        match_state, transform = state_lookup(game.state, self.value_map)
        if 'k' in transform['args']:
            transform['args']['k'] = -transform['args']['k']
        action_values = self.value_map.get(match_state, None)[marker]
        adj_values = reverse_transforms(action_values, transform, game.ind_to_loc)
        actions = list(adj_values)
        raw_values = [adj_values[a] for a in actions]
        if sum(raw_values) <= 0:
            # no learned signal yet: fall back to a uniform distribution
            values = [1/len(raw_values) for _ in raw_values]
        else:
            # normalize values into a probability distribution
            values = [v/sum(raw_values) for v in raw_values]
        loc_inds = list(range(len(values)))
        if self.explore:
            # explore - take action with probability proportional to value
            loc_ind = np.random.choice(loc_inds, p=values)
        else:
            # exploit - take action with highest value
            loc_ind = loc_inds[np.argmax(values)]
        loc = actions[loc_ind]
        return loc
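
The explore branch samples a move with probability proportional to its normalized value; a standalone sketch with made-up numbers, independent of the project code:

import numpy as np

adj_values = {(0, 0): 0.2, (0, 1): 0.6, (1, 1): 0.2}     # illustrative values only
actions = list(adj_values)
raw_values = [adj_values[a] for a in actions]
probs = [v / sum(raw_values) for v in raw_values]        # -> [0.2, 0.6, 0.2]
choice = actions[np.random.choice(len(actions), p=probs)]
print(choice)   # (0, 1) roughly 60% of the time
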
Example #3
def test_reverse_transform():
    # arrange
    action_values = {
        (0, 0): 0,
        (0, 1): 0.1,
        (0, 2): 0.2,
        (1, 0): 0.3,
        (1, 1): 0.4,
        (1, 2): 0.5,
        (2, 0): 0.6,
        (2, 1): 0.7,
        (2, 2): 0.8
    }
    transform = {'func': np.fliplr, 'args': {}}
    expected_values = [action_values[act] for act in action_values]

    # act
    adj_values = reverse_transforms(action_values, transform, Game.ind_to_loc)
    values = [adj_values[act] for act in adj_values]

    # assert
    assert len(adj_values) == len(action_values)
    assert set(values) == set(expected_values)
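
As a reminder of what the fliplr transform does to board coordinates (illustrative only, using plain numpy):

import numpy as np

board = np.arange(9).reshape(3, 3)
flipped = np.fliplr(board)
# fliplr mirrors the board left-right, so column j maps to column 2 - j
assert flipped[0, 0] == board[0, 2]
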
Example #4
    def process_reward(self, reward: Union[int, float],
                       ind_to_loc: List[Tuple]) -> List[ValueMod]:
        """Update value map given reward.

        Args:
            reward: int or float, reward value
            ind_to_loc: list of tuple, game state index to board location map

        Returns:
            reward_mods: list of ValueMod, modifications to value for each move
        """

        temporal_discount = 1
        reward_mods = []
        # if the reward is 0, no update
        #     UNLESS the temporal_discount_rate is 0
        #     in this case, we need to process 0 rewards to transfer learning to earlier states
        #     we could use eligibility traces to update each state on every move,
        #     but that is less intuitive and won't be implemented here
        # if there's a non-zero reward (win/loss), assign credit to all moves (temporal difference)
        #     less credit is given to earlier moves, according to the temporal_discount_rate
        if (reward == 0) and (self.temporal_discount_rate > 0):
            return []
        elif reward == 0:
            entries = [self.buffer[-1]]
        else:
            entries = self.buffer[::-1]

        for entry in entries:
            # find the current value of (state, marker, move) combo
            match_state, transform = state_lookup(entry.state, self.value_map)
            if 'k' in transform['args']:
                transform['args']['k'] = -transform['args']['k']
            action_values = self.value_map[match_state][entry.marker]
            adj_values = reverse_transforms(action_values, transform,
                                            ind_to_loc)
            current = adj_values[entry.move]

            # TODO: after a player's move, there is no valid move for that marker
            # so the maximum future value is likely not being found correctly
            # option: use the next state in the buffer (after competitor's turn), if there is one
            #  - dependent on quality of competitor's move
            #  - if no more states in buffer, have to use last state
            # find the maximum value in the state resulting from the current move
            new_state = np.copy(entry.state)
            new_state[entry.move[0], entry.move[1]] = entry.marker
            new_match_state, _ = state_lookup(new_state, self.value_map)
            new_action_values = self.value_map[new_match_state][entry.marker]
            if isinstance(new_action_values, dict):
                max_future = max(
                    [new_action_values[a] for a in new_action_values])
            else:
                max_future = new_action_values

            # use the Bellman equation to update the current value
            updated = np.clip(
                current + temporal_discount * self.learning_rate *
                (reward + (self.discount_rate * max_future - current)),
                a_min=0,
                a_max=1)

            # reverse the transform to find the proper move to update, then apply it
            undo = transform
            undo['args'] = {k: -undo['args'][k] for k in undo['args']}
            adj_move = [
                k
                for k in reverse_transforms({entry.move: 0}, undo, ind_to_loc)
            ][0]
            self.value_map[match_state][entry.marker][adj_move] = updated

            # update temporal discount and record modification to value map
            temporal_discount *= self.temporal_discount_rate
            mod = ValueMod(state=match_state,
                           move=adj_move,
                           previous=current,
                           new=updated)
            reward_mods.append(mod)

        return reward_mods
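
The update inside the loop is a clipped temporal-difference rule (the "Bellman equation" the comment refers to). A worked numeric sketch with assumed, illustrative hyperparameters (learning_rate=0.25, discount_rate=0.9, first move so temporal_discount=1), not taken from the project's defaults:

import numpy as np

current, reward, max_future = 0.4, 1, 0.7
learning_rate, discount_rate, temporal_discount = 0.25, 0.9, 1
updated = np.clip(
    current + temporal_discount * learning_rate *
    (reward + discount_rate * max_future - current),
    a_min=0, a_max=1)
# 0.4 + 0.25 * (1 + 0.9*0.7 - 0.4) = 0.4 + 0.25 * 1.23 = 0.7075
print(updated)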