def show_move_values(player: Player, game: Game):
    """Show learned values for game state.

    Args:
        player: instance of Player class
        game: instance of Game class
    """
    match_state, transform = state_lookup(game.state, player.value_map)
    if 'k' in transform['args']:
        transform['args']['k'] = -transform['args']['k']
    action_values = player.value_map.get(match_state, None)[game.mark]
    adj_values = reverse_transforms(action_values, transform, game.ind_to_loc)

    # draw the board grid
    _, ax = plt.subplots(figsize=(4.5, 4.5))
    _ = plt.plot([1, 1], [0, -3], 'k-', linewidth=4)
    _ = plt.plot([2, 2], [0, -3], 'k-', linewidth=4)
    _ = plt.plot([0, 3], [-1, -1], 'k-', linewidth=4)
    _ = plt.plot([0, 3], [-2, -2], 'k-', linewidth=4)
    for x, y in game.ind_to_loc:
        if game.state[x, y] != 0:
            # occupied square: draw the mark
            mark = 'x' if game.state[x, y] == 1 else 'o'
            plt.text(y + 0.275, -x - 0.725, mark, size=60)
        else:
            # open square: print the learned value and shade it proportionally
            plt.text(y + 0.35, -x - 0.575, round(adj_values[x, y], 2), size=15)
            square = patches.Rectangle((y, -x - 1), 1, 1, linewidth=0, edgecolor='none',
                                       facecolor='r', alpha=adj_values[x, y] * 0.75)
            ax.add_patch(square)
    _ = ax.axis('off')
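A minimal usage sketch, assuming `Player` and `Game` can be constructed with no arguments (their real signatures may differ) and that the player has already been trained:

```python
import matplotlib.pyplot as plt

# hypothetical usage; the constructor calls below are assumptions
game = Game()
trained = Player()
game.state[1, 1] = 1          # pretend 'x' has taken the center square
show_move_values(trained, game)
plt.show()
```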
def _policy(self, marker: int, game: Game) -> Tuple[int, int]:
    """Choose a board location based on the learned values for the current state."""
    match_state, transform = state_lookup(game.state, self.value_map)
    if 'k' in transform['args']:
        transform['args']['k'] = -transform['args']['k']
    action_values = self.value_map.get(match_state, None)[marker]
    adj_values = reverse_transforms(action_values, transform, game.ind_to_loc)
    actions = [a for a in adj_values]
    raw_values = [adj_values[a] for a in actions]
    # normalize the values into a probability distribution;
    # if every value is zero, fall back to a uniform distribution
    if sum(raw_values) <= 0:
        values = [1 / len(raw_values) for v in raw_values]
    else:
        values = [v / sum(raw_values) for v in raw_values]
    loc_inds = [i for i in range(len(values))]
    if self.explore:
        # take action with probability proportional to value
        loc_ind = np.random.choice(loc_inds, p=values)
    else:
        # exploit - take action with highest value
        loc_ind = loc_inds[np.argmax(values)]
    loc = actions[loc_ind]
    return loc
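To make the explore/exploit branch concrete, here is an isolated sketch of the same selection step on a hand-made value dictionary; nothing below depends on the `Player` or `Game` classes:

```python
import numpy as np

adj_values = {(0, 0): 0.1, (0, 2): 0.6, (2, 1): 0.3}   # toy action values
actions = list(adj_values)
raw_values = [adj_values[a] for a in actions]

# normalize to a probability distribution, falling back to uniform when all values are zero
total = sum(raw_values)
probs = [1 / len(raw_values)] * len(raw_values) if total <= 0 else [v / total for v in raw_values]

explore = True
if explore:
    # exploration: sample a move with probability proportional to its learned value
    loc = actions[np.random.choice(len(actions), p=probs)]
else:
    # exploitation: always take the highest-valued move
    loc = actions[int(np.argmax(probs))]
print(loc)
```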
def test_reverse_transform():
    # arrange
    action_values = {
        (0, 0): 0, (0, 1): 0.1, (0, 2): 0.2,
        (1, 0): 0.3, (1, 1): 0.4, (1, 2): 0.5,
        (2, 0): 0.6, (2, 1): 0.7, (2, 2): 0.8
    }
    transform = {'func': np.fliplr, 'args': {}}
    expected_values = [action_values[act] for act in action_values]

    # act
    adj_values = reverse_transforms(action_values, transform, Game.ind_to_loc)
    values = [adj_values[act] for act in adj_values]

    # assert
    assert len(adj_values) == len(action_values)
    assert set(values) == set(expected_values)
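For context on the transform being reversed here, `np.fliplr` mirrors the board left to right, so column y moves to column 2 - y on a 3x3 grid; labeling each square with its flattened index makes that visible:

```python
import numpy as np

board = np.arange(9).reshape(3, 3)
print(board)
# [[0 1 2]
#  [3 4 5]
#  [6 7 8]]
print(np.fliplr(board))
# [[2 1 0]
#  [5 4 3]
#  [8 7 6]]
```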
def process_reward(self, reward: Union[int, float],
                   ind_to_loc: List[Tuple]) -> List[ValueMod]:
    """Update value map given reward.

    Args:
        reward: int or float, reward value
        ind_to_loc: list of tuple, game state index to board location map

    Returns:
        reward_mods: list of ValueMod, modifications to value for each move
    """
    temporal_discount = 1
    reward_mods = []
    # if the reward is 0, there is no update UNLESS the temporal_discount_rate is 0;
    # in that case we need to process 0 rewards to transfer learning to earlier states.
    # we could use eligibility traces to update each state on every move,
    # but that is less intuitive and won't be implemented here.
    # if there's a non-zero reward (win/loss), assign credit to all moves (temporal difference),
    # with less credit given to earlier moves according to the temporal_discount_rate
    if (reward == 0) and (self.temporal_discount_rate > 0):
        return []
    elif reward == 0:
        entries = [self.buffer[-1]]
    else:
        entries = self.buffer[::-1]
    for entry in entries:
        # find the current value of the (state, marker, move) combo
        match_state, transform = state_lookup(entry.state, self.value_map)
        if 'k' in transform['args']:
            transform['args']['k'] = -transform['args']['k']
        action_values = self.value_map[match_state][entry.marker]
        adj_values = reverse_transforms(action_values, transform, ind_to_loc)
        current = adj_values[entry.move]

        # TODO: after a player's move, there is no valid move for that marker,
        # so the maximum future value is likely not being found correctly.
        # option: use the next state in the buffer (after the competitor's turn), if there is one
        # - dependent on quality of the competitor's move
        # - if there are no more states in the buffer, have to use the last state
        # find the maximum value in the state resulting from the current move
        new_state = np.copy(entry.state)
        new_state[entry.move[0], entry.move[1]] = entry.marker
        new_match_state, _ = state_lookup(new_state, self.value_map)
        new_action_values = self.value_map[new_match_state][entry.marker]
        if isinstance(new_action_values, dict):
            max_future = max([new_action_values[a] for a in new_action_values])
        else:
            max_future = new_action_values

        # use the Bellman equation to update the current value
        updated = np.clip(
            current + temporal_discount * self.learning_rate *
            (reward + (self.discount_rate * max_future - current)),
            a_min=0, a_max=1)

        # reverse the transform to find the proper move to update.. and apply it
        undo = transform
        undo['args'] = {k: -undo['args'][k] for k in undo['args']}
        adj_move = [k for k in reverse_transforms({entry.move: 0}, undo, ind_to_loc)][0]
        self.value_map[match_state][entry.marker][adj_move] = updated

        # update temporal discount and record modification to value map
        temporal_discount *= self.temporal_discount_rate
        mod = ValueMod(state=match_state, move=adj_move, previous=current, new=updated)
        reward_mods.append(mod)
    return reward_mods
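To see the update rule in isolation, here is a minimal numeric sketch with made-up values for the rates, the stored value, and the best reachable future value (in the real code these come from the Player attributes and the value map):

```python
import numpy as np

learning_rate = 0.25
discount_rate = 0.9
temporal_discount = 1.0      # multiplied by temporal_discount_rate for each earlier move
current = 0.4                # stored value of this (state, marker, move) combo
max_future = 0.7             # best value reachable from the resulting state
reward = 1                   # terminal reward for a win

updated = np.clip(
    current + temporal_discount * learning_rate *
    (reward + discount_rate * max_future - current),
    a_min=0, a_max=1)
print(updated)               # approximately 0.7075: the value shifts toward the discounted return
```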