def _build_heuristic_distance(self):
     """
     Build the heuristic distance map used by the search, via Dijkstra's algorithm
     """
     goal = self.goal
     frontier = PriorityQueueImproved('min',
                                      f=self.heuristic_distance.__getitem__)
     # For all exit positions (We don't care about other players)
     for pos in self.exit_positions[self.colour]:
         # Set the initial heuristic to 1 and add the position to the frontier
         self.heuristic_distance[pos] = 1
         frontier.append(pos)
     # While search is not ended
     while frontier:
         pos = frontier.pop()
         q, r = pos
         # Relax all positions reachable by a step or a jump from here
         cost = self.heuristic_distance[pos]
         for dq, dr in self.moves:
             for move in range(1, 3):
                 next_pos = (q + dq * move, r + dr * move)
                 # Skip positions that are off the board or occupied by a
                 # piece in the goal state; otherwise relax them to cost + 1
                 if (not State.inboard(next_pos)
                         or next_pos in goal.pos_to_piece):
                     continue
                 # Get value in dictionary
                 h_val = self.heuristic_distance.get(next_pos, None)
                 # Not yet visited, or a cheaper path has been found
                 if h_val is None or h_val > cost + 1:
                     # Update dictionary entry
                     self.heuristic_distance[next_pos] = cost + 1
                     # Update the value in queue
                     frontier.append(next_pos)
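# A minimal, self-contained sketch of the same pattern used above: seed the
# frontier with the goal cells at cost 1, then relax neighbours reachable by
# a step or a jump. It uses the standard library's heapq instead of the
# project's PriorityQueueImproved, takes the board test and move set as
# hypothetical parameters, and ignores blocking pieces for brevity.
import heapq

def dijkstra_distance_sketch(goal_cells, in_board, moves):
    """Map every reachable cell to its distance-to-goal (goal cells = 1)."""
    distance = {cell: 1 for cell in goal_cells}
    frontier = [(1, cell) for cell in goal_cells]
    heapq.heapify(frontier)
    while frontier:
        cost, (q, r) = heapq.heappop(frontier)
        if cost > distance[(q, r)]:
            continue  # stale queue entry, a shorter path was already found
        for dq, dr in moves:
            for step in (1, 2):  # single-step move or jump over a piece
                nxt = (q + dq * step, r + dr * step)
                if not in_board(nxt):
                    continue
                if nxt not in distance or distance[nxt] > cost + 1:
                    distance[nxt] = cost + 1
                    heapq.heappush(frontier, (cost + 1, nxt))
    return distance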
 def convert_action_perspective(self, action, convert_to):
     """
     Convert an action (already in the format produced by convert_action_to)
     between the referee's board perspective and this player's perspective.
     :param action: the action to translate, in the format (fr, to, type)
     :param convert_to: the perspective to convert to ("player" or "referee")
     :return: the converted action
     """
     # No conversion is needed for a pass action
     if action == "PASS":
         return action
     fr, to, move = action
     if convert_to == "player":
         new_fr = State.rotate_pos(self.colour, "red", fr)
         new_to = State.rotate_pos(self.colour, "red", to)
     elif convert_to == "referee":
         new_fr = State.rotate_pos("red", self.colour, fr)
         new_to = State.rotate_pos("red", self.colour, to)
     else:
         raise ValueError(convert_to + " mode is not valid")
     return new_fr, new_to, move
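# The conversion above delegates to State.rotate_pos, which is not shown
# here. For illustration only: on an axial hex grid, rotating the board by
# 120 degrees maps (q, r) -> (r, -q - r), and three such rotations return
# the original coordinate; whether State.rotate_pos works exactly this way
# is an assumption of this sketch.
def rotate_120(pos):
    q, r = pos
    return (r, -q - r)

assert rotate_120(rotate_120(rotate_120((0, -3)))) == (0, -3)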
    def __init__(self,
                 colour,
                 search_algorithm=None,
                 game_type=Game,
                 evaluator=player_evaluator,
                 initial_state=None):
        """
        This method is called once at the beginning of the game to initialise
        your player. You should use this opportunity to set up your own internal
        representation of the game state, and any other information about the
        game state you would like to maintain for the duration of the game.

        The parameter colour will be a string representing the player your
        program will play as (Red, Green or Blue). The value will be one of the
        strings "red", "green", or "blue" correspondingly.

        Any valid board state can be passed to the (UCT) Agent; however,
        the Agent assumes the states it is given are valid.
        """
        self.colour = colour
        self.search_algorithm = search_algorithm

        self.code_map = State.code_map
        self.rev_code_map = State.rev_code_map

        # cycle the players:
        #    player:: red:   red -> red,   green -> green, blue -> blue
        #    player:: green: red -> blue,  green -> red,   blue -> green
        #    player:: blue:  red -> green, green -> blue,  blue -> red
        self.referee_to_player_mapping = State.perspective_mapping[colour]
        self.player_to_referee_mapping = {
            value: key
            for key, value in self.referee_to_player_mapping.items()
        }

        # The initial player is red; convert it to the rotated perspective
        state = State(self.start_config,
                      colour=self.referee_to_player_mapping["red"])

        # The colour of the game differs from the colour of the state
        self.game = game_type("red", state)
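# Illustration of the perspective cycle documented in the comment above,
# written out for a hypothetical "green" player: referee colours map to the
# player's internal colours, and the inverted dictionary maps them back.
referee_to_player = {"red": "blue", "green": "red", "blue": "green"}
player_to_referee = {v: k for k, v in referee_to_player.items()}
assert player_to_referee == {"blue": "red", "red": "green", "green": "blue"}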
# Node class used to wrap states (and rewards) during training
node_type = SimpleRLNode2

# Behaviour policy used when generating episodes:
# "greedy" uses the agent's greedy_policy, "choice" uses its choice_policy
policy = "greedy"
# policy = "choice"

# Debug delay in seconds between printed states during training; 0 disables it
# debug = 0.001
# debug = 0.1
debug = 0

# Exploration parameter passed to the behaviour policy
# explore = 0
# explore = 0.1
# explore = 0.2
# explore = 0.5
explore = 1

# Learning rate for the value network
# theta = 0.05
# theta = 0.01
theta = 0.005
# theta = 0.001
# theta = 0.0005

# Discount factor used in the TD target
gamma = 1
# gamma = 0.99

initial_state = State(Player.start_config, "red")
game = Game("red", initial_state)
agent.td_train(game, initial_state, debug=debug,
               node_type=node_type, policy=policy,
               explore=explore, theta=theta, gamma=gamma)
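# Note: td_train (shown below) runs n self-play episodes, performs a one-step
# TD update of the value network after every move, writes a checkpoint every
# checkpoint_interval episodes, and saves the final weights when training ends.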

def parse_state(file_name):
    # Load a board state from a JSON file and build a State from it
    with open(file_name) as f:
        pos_dict, colour, completed = JsonParser(json.load(f)).parse()
    return State(pos_dict, colour, completed)


def change_state_color(state, color):
    # Return the same board configuration viewed as the given colour
    return State(state.pos_to_piece, color, state.completed)
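# Hypothetical usage of the two helpers above; the file name is made up and
# the JSON layout is whatever JsonParser expects in this project.
# state = parse_state("boards/sample_state.json")
# green_view = change_state_color(state, "green")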
    def td_train(self, game, initial_state=None, explore=0.1, n=1000,
                 theta=0.05, checkpoint_interval=20, gamma=0.9,
                 node_type=InitialRLNode, policy="choice", debug=0):
        # TODO make it possible to plug in other agents
        self.network.learning_rate = theta
        initial_node = node_type(initial_state, rewards=(0, 0, 0))

        if policy == "greedy":
            policy = self.greedy_policy
        elif policy == "choice":
            policy = self.choice_policy
        else:
            raise ValueError("Invalid policy")

        # Generate episodes of game based on current policy
        # -> Update the value for each player
        losses = []

        episodes = []
        count = 0
        length = 0
        for i in range(n):
            node = initial_node
            # We record all three players simultaneously
            loss = 0

            episode_actions = []
            episode_states = []
            episode_rewards = []

            # while not game.terminal_state(node.state):
            while True:

                # TODO replace this by any policy for bootstrapping
                # TODO use the current value to compute!

                current_colour = node.state.colour
                current_code = node.state.code_map[current_colour]

                # Rotate the state to make current colour be red
                current_state = node.state.red_perspective(current_colour)
                # Get the results
                action, next_node = policy(game, current_state,
                                           explore=explore,
                                           node_type=node_type, train=True)

                # Update
                # Model assumption: the turn order (whose turn it is to act)
                # --------------------
                # g   b   r   g   b   r   g ...
                # 1   2   3   4   5   6   7
                # In reality we should compute three values for each node,
                # but we cheat here and only compute the value w.r.t. the
                # current actor. The values look like this (for r only):
                #                     * here now
                #         v   v'      v''
                #             o
                #           /
                # o - o - o - o           o
                #   ^       \           /
                # Other       o - o - o - o
                # branch                \
                # unknown                 o
                # p_s[r]:[0   1   2   *]
                # We say that vt'' updates vt', and vt' + reward updates vt.

                # # (Experimental, try to solve the after state problem)
                # # Update estimation of v' based on v''
                # # Then update the estimation v based on v'
                # # Get y
                # # 1. Get current state (already have)
                # # 2. Get feature vector
                # current_state_vector = self.feature_extractor(current_state)
                # # 3. Compute v'
                # v_prime = \
                #     self.network.forward(np.array([current_state_vector]))
                # ### THERE is no reward here!
                # # 4. Get y from v'
                # y = v_prime
                #
                # # Get X
                # # 1. Get prev state
                # prev_state = player_states[current_code][1]
                # # 2. Get feature vector as X
                # prev_state_vector = \
                #     self.feature_extractor(prev_state)
                # X = np.array([prev_state_vector])
                # # Backward propagation
                # self.network.backward(X, y)

                # Update the estimation of previous state v and v'
                # Get y
                # 1. Get next state
                next_state = next_node.state
                # 2. Get feature vector
                next_state_vector = self.feature_extractor(next_state)
                # 3. Compute v'
                v_prime = \
                    self.network.forward(np.array([next_state_vector]))
                # 4. Get reward
                reward = next_node.rewards[0]
                # 5. Get v' + reward as y
                y = gamma * v_prime + reward
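                # (i.e. the semi-gradient TD(0) target: y = r + gamma * V(s'))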
                # Get X
                # 1. Get current state
                # 2. Get feature vector as X
                current_state_vector = self.feature_extractor(current_state)
                X = np.array([current_state_vector])
                # Backward propagation
                y_hat_old = self.network.forward(X)
                # if y[0][0] != y_hat_old[0][0]:
                #     print("====================")
                #     print(y_hat_old, y)
                self.network.backward(X, y)
                y_hat = self.network.forward(X)
                # if y[0][0] != y_hat[0][0]:
                    # print(y_hat, y)
                    # assert abs(y[0][0] - y_hat[0][0]) < abs(y[0][0] - y_hat_old[0][0])
                    # print("====================")
                loss += self.network.loss.compute(y_hat, y)
                count += 1

                if game.terminal_state(next_node.state):
                    break

                fr, to, move = action
                fr = State.rotate_pos("red", current_colour, fr)
                to = State.rotate_pos("red", current_colour, to)
                action = (fr, to, move)

                node = next_node
                # Back to original perspective
                node.original_perspective(current_colour)

                episode_actions.append(action)
                episode_states.append(node.state)
                episode_rewards.append(node.rewards)

                if debug:
                    print(node)
                    sleep(debug)

            print(len(episode_states))
            print(f"Episode: {i}")

            # Store them for now
            episodes.append((episode_states, episode_actions, episode_rewards))
            length += len(episode_states)

            if i % checkpoint_interval == checkpoint_interval - 1:
                losses.append((i, loss))
                print(f"Episode: {i}\n"
                      f"        loss={loss/count}\n"
                      f"        average episode={length/checkpoint_interval}")
                count = 0
                length = 0
                self.network.save_checkpoint()
        self.network.save_final()
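# A minimal, self-contained illustration of the update performed inside the
# training loop above: the TD(0) target y = reward + gamma * V(s') is
# regressed onto the current estimate V(s). The single linear "network",
# the feature vectors and the learning rate below are hypothetical stand-ins
# for self.network and self.feature_extractor.
import numpy as np

def td0_step(w, x, x_next, reward, gamma=1.0, lr=0.005):
    """One semi-gradient TD(0) step for a linear value function V(s) = w . x."""
    target = reward + gamma * float(np.dot(w, x_next))  # y = r + gamma * V(s')
    error = target - float(np.dot(w, x))                # TD error y - V(s)
    return w + lr * error * x                           # move V(s) towards y

w = np.zeros(3)
s, s_next = np.array([1.0, 0.0, 2.0]), np.array([0.0, 1.0, 1.0])
w = td0_step(w, s, s_next, reward=1.0)  # after observing a reward of 1
print(float(np.dot(w, s)))              # V(s) has moved above 0 towards y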
 def __init__(self, colour, state):
     super().__init__(state, State({}, "red"))
     # self.evaluator = evaluator
     self.colour = colour
     self._build_heuristic_distance()