def select(self, c=1.5, csi=1., b=1., variance=False):
        """
         Select one of the child actions based on UCT rule
         :param c: UCB exploration constant
         :param csi: exploration constant
         :param b: parameter such that the rewards belong to [0, b]
         """
        if not variance:
            uct_upper_bound = np.array([
                child_action.Q +
                c * np.sqrt(np.log(self.n) /
                            (child_action.n)) if child_action.n > 0 else np.inf
                for child_action in self.child_actions
            ])
            winner = argmax(uct_upper_bound)
            return self.child_actions[winner]

        if self.n > 0:
            logp = np.log(self.n)
        else:
            logp = -np.inf

        bound = np.array([
            child_action.Q +
            np.sqrt(csi * child_action.sigma * logp / child_action.n) +
            3 * c * b * csi * logp / child_action.n if child_action.n > 0
            and not np.isinf(child_action.sigma).any() else np.inf
            for child_action in self.child_actions
        ])

        winner = argmax(bound)
        return self.child_actions[winner]
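
A minimal, self-contained sketch (the Child stub and the tie-breaking argmax are stand-ins, not the original classes) of how the plain-UCT branch above ranks children:

import numpy as np

class Child:
    """Stand-in for a child action with a value estimate Q and visit count n."""
    def __init__(self, Q, n):
        self.Q, self.n = Q, n

def argmax(x):
    # break ties uniformly at random among maximal entries
    x = np.asarray(x)
    return np.random.choice(np.flatnonzero(x == x.max()))

parent_n = 10
children = [Child(Q=0.4, n=5), Child(Q=0.6, n=4), Child(Q=0.0, n=0)]
ucb = np.array([
    ch.Q + 1.5 * np.sqrt(np.log(parent_n) / ch.n) if ch.n > 0 else np.inf
    for ch in children
])
print(argmax(ucb))  # 2: the unvisited child wins because its bound is infinite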
Example #2
def select(self):
    UCT = np.array([
        child.Q + self.c * (np.sqrt(self.n + 1) / (child.n + 1))
        for child in self.children
    ])
    winner = argmax(UCT)
    return self.children[winner]
    def pi_wrapper(self, s, current_depth, max_depth):
        # Compute the reduced budget as a function of the search root depth
        if self.scheduler_params:
            l = self.schedule(current_depth,
                              k=self.scheduler_params["slope"],
                              mid=self.scheduler_params["mid"],
                              width=current_depth + max_depth)
            # self.scheduler_budget = max(int(self.budget * (1 - l)), self.scheduler_params["min_budget"])
            self.scheduler_budget = max(int(self.budget * l),
                                        self.scheduler_params["min_budget"])
            # print("\nDepth: {}\nBudget: {}".format(current_depth, self.scheduler_budget))

        if self.mcts_only:
            self.search(self.n_mcts, self.c_dpw, self.mcts_env, max_depth)
            state, pi, V = self.return_results(
                self.temp)  # TODO put 0 if the network is enabled
            self.curr_probs.append(pi)
            a_w = argmax(pi)
            # max_p = np.max(pi)
            # a_w = np.random.choice(np.argwhere(pi == max_p)[0])
        else:
            pi_w = self.get_model().predict_pi(s).flatten()
            self.curr_probs.append(pi_w)
            max_p = np.max(pi_w)
            a_w = np.random.choice(np.argwhere(pi_w == max_p)[0])
        return a_w
    def pi_wrapper(self, s, current_depth, max_depth):
        # Compute the reduced budget as a function of the search root depth
        if self.scheduler_params:
            l = self.schedule(current_depth,
                              k=self.scheduler_params["slope"],
                              mid=self.scheduler_params["mid"],
                              width=current_depth + max_depth)
            # self.scheduler_budget = max(int(self.budget * (1 - l)), self.scheduler_params["min_budget"])
            self.scheduler_budget = max(int(self.budget * l),
                                        self.scheduler_params["min_budget"])
            # print("\nDepth: {}\nBudget: {}".format(current_depth, self.scheduler_budget))

        if self.mcts_only:
            if len(self.get_env().get_available_actions(
                    self.get_mcts().owner)) > 1:
                self.search(self.n_mcts, self.c_dpw[self._current_agent],
                            self.mcts_env, max_depth)
                state, pi, V = self.return_results(
                    self.temp)  # TODO put 0 if the network is enabled
            else:
                pi = np.zeros(self.get_mcts().na)
                pi[0] = 1.
            self.curr_probs.append(pi)
            a_w = argmax(pi)
            # max_p = np.max(pi)
            # a_w = np.random.choice(np.argwhere(pi == max_p)[0])
        else:
            raise NotImplementedError(
                "No policy network has been implemented for RaceStrategy")
        return a_w
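
The schedule() called in both pi_wrapper variants above is not shown in these excerpts. A hedged sketch (the logistic shape and parameter meanings are assumptions, not the original implementation) of how the budget could shrink with the root depth:

import numpy as np

def schedule(depth, k, mid, width):
    # Assumed form: a factor in (0, 1) that decays as `depth` grows past `mid`,
    # normalised by `width`; the repo's real schedule may differ.
    return 1.0 / (1.0 + np.exp(k * (depth - mid) / max(width, 1)))

budget, min_budget = 1000, 50
factor = schedule(depth=15, k=4.0, mid=10, width=40)
scheduler_budget = max(int(budget * factor), min_budget)  # 377 with these toy numbers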
Example #5
    def select(self, c=1.5):
        ''' Select one of the child actions based on UCT rule '''

        UCT = np.array(
            [child_action.Q + prior * c * (np.sqrt(self.n + 1) / (child_action.n + 1)) for child_action, prior in
             zip(self.child_actions, self.priors)])
        winner = argmax(UCT)
        return self.child_actions[winner]
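
A stand-alone sketch (all values invented) of how priors, typically a policy network's softmax output, enter the prior-weighted bound above:

import numpy as np

priors = np.array([0.7, 0.2, 0.1])   # e.g. policy-head softmax over actions
Q = np.array([0.1, 0.3, 0.0])        # current value estimates per child
n_child = np.array([3, 1, 0])        # visit counts per child
n_parent = n_child.sum()

uct = Q + priors * 1.5 * (np.sqrt(n_parent + 1) / (n_child + 1))
best = int(np.argmax(uct))           # index of the child action to follow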
Example #6
def select(self, c=1.5):
    """ Select one of the child actions based on UCT rule """
    # TODO check here
    UCT = np.array([
        child_action.Q + c * np.sqrt(np.log(self.n) / child_action.n)
        if child_action.n > 0 else np.inf
        for child_action in self.child_actions
    ])
    winner = argmax(UCT)
    return self.child_actions[winner]
Example #7
def select(self, c=1.5):
    """ Select one of the child actions based on UCT rule """
    uct = np.array([
        child_action.q + prior * c * (np.sqrt(self.n + 1) /
                                      (child_action.n + 1))
        for child_action, prior in zip(self.child_actions, self.priors)
    ])
    winner = argmax(uct)
    return self.child_actions[winner]

def select(self, c=1.5):
    """ Select one of the child actions based on UCT rule """
    # TODO check here
    uct_upper_bound = np.array([
        child_action.Q + c * (np.sqrt(self.n + 1) / (child_action.n + 1))
        for child_action in self.child_actions
    ])
    winner = argmax(uct_upper_bound)
    return self.child_actions[winner]
Example #9
    def _viterbi_decode(self, feats):

        # initialize list to keep track of backpointers
        backpointers = []

        # initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars.unsqueeze(0)

        bsz, time, dim = feats.unsqueeze(0).size()

        for feat in feats:

            # calculate scores of next tag per tag
            forward_var = forward_var.view(bsz, 1, self.tagset_size)
            trans_scores = self.log_transitions.unsqueeze(0)
            next_tag_vars = forward_var + trans_scores

            # get best next tags and viterbi vars
            _, idx = torch.max(next_tag_vars, 2)
            best_tag_ids = idx.view(bsz, -1)
            indices = torch.transpose(best_tag_ids.unsqueeze(0), 1, 2)
            viterbivars_t = torch.gather(next_tag_vars, 2, indices).squeeze(2)

            # add emission scores and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (viterbivars_t + feat).view(1, -1)
            backpointers.append(best_tag_ids[0].tolist())

        # transition to STOP_TAG
        terminal_var = forward_var + self.log_transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # follow the back pointers to decode the best path
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)

        # pop off the start tag (we don't want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path
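
The argmax applied to terminal_var above is a local helper, not numpy's. A common definition for a 1 x tagset_size tensor (this is an assumption following the usual PyTorch BiLSTM-CRF tutorial pattern, not necessarily the original helper):

import torch

def argmax(vec):
    # return the column index of the maximum value as a plain Python int
    _, idx = torch.max(vec, 1)
    return idx.item()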
Example #10
def maximally_correlating_ordering(correlations):
    """Given the correlation matrix of columns of some matrices a and b, return
       ordering indices such that b[:, ordering] correlates maximally with a.
       Maximally correlating pairs of columns are chosen greedily.
    """
    c = np.array(correlations)
    n = len(c)
    permutation = np.zeros(n, dtype=int)
    correlation_signs = np.zeros(n, dtype=int)
    # In a greedy fashion, match best correlating columns with each other
    for _ in range(n):
        # Find the best correlation of columns that are both still available
        a_col, b_col = argmax(abs(c))
        permutation[a_col] = b_col
        correlation_signs[a_col] = np.sign(c[a_col, b_col])
        # Mark found columns as unavailable
        c[a_col, :] = 0
        c[:, b_col] = 0
    return permutation, correlation_signs
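
Note that argmax(abs(c)) above is unpacked into a (row, column) pair, so the helper must return 2-D indices. A hedged sketch of a compatible argmax plus a toy call (the helper's actual definition is not shown in this excerpt):

import numpy as np

def argmax(x):
    # return the (row, col) position of the largest entry
    x = np.asarray(x)
    return np.unravel_index(np.argmax(x), x.shape)

corr = np.array([[0.1, -0.9],
                 [0.8,  0.2]])
perm, signs = maximally_correlating_ordering(corr)
# perm  -> [1, 0]: column 1 of b pairs with column 0 of a, and vice versa
# signs -> [-1, 1]: the first matched pair is negatively correlated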
Example #11
def select(self, stochastic=True):
    qs = np.zeros(self.na)
    # score each child action by its (possibly stochastic) Q estimate
    for i, a in enumerate(self.child_actions):
        qs[i] = a.q(stochastic)
    return self.child_actions[argmax(qs)]