def select(self, c=1.5, csi=1., b=1., variance=False):
    """
    Select one of the child actions based on UCT rule
    :param c: UCB exploration constant
    :param csi: exploration constant
    :param b: parameter such that the rewards belong to [0, b]
    """
    if not variance:
        uct_upper_bound = np.array([
            child_action.Q + c * np.sqrt(np.log(self.n) / child_action.n)
            if child_action.n > 0 else np.inf
            for child_action in self.child_actions
        ])
        winner = argmax(uct_upper_bound)
        return self.child_actions[winner]

    if self.n > 0:
        logp = np.log(self.n)
    else:
        logp = -np.inf

    bound = np.array([
        child_action.Q
        + np.sqrt(csi * child_action.sigma * logp / child_action.n)
        + 3 * c * b * csi * logp / child_action.n
        if child_action.n > 0 and not np.isinf(child_action.sigma).any()
        else np.inf
        for child_action in self.child_actions
    ])
    winner = argmax(bound)
    return self.child_actions[winner]

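# The two branches above correspond to the classic UCB1 bound and a
# variance-aware (UCB-V-style) bound. The standalone sketch below evaluates
# both formulas on made-up child statistics; q, n_child and sigma are
# illustrative and not part of the original Node class.
import numpy as np

q = np.array([0.4, 0.55, 0.3])        # mean return per child action
n_child = np.array([10, 5, 0])        # visit counts per child action
sigma = np.array([0.02, 0.08, 0.0])   # empirical return variance per child
n_parent = n_child.sum()
c, csi, b = 1.5, 1.0, 1.0
logp = np.log(n_parent)
safe_n = np.maximum(n_child, 1)       # avoid division by zero inside np.where

# UCB1: unvisited children get an infinite bound so they are tried first
ucb1 = np.where(n_child > 0, q + c * np.sqrt(logp / safe_n), np.inf)

# Variance-aware bound: exploration shrinks when the empirical variance is low
ucb_v = np.where(n_child > 0,
                 q + np.sqrt(csi * sigma * logp / safe_n)
                 + 3 * c * b * csi * logp / safe_n,
                 np.inf)

print(np.argmax(ucb1), np.argmax(ucb_v))  # index of the selected child
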
def select(self):
    # Select the child with the highest UCT score; the +1 terms keep the
    # score finite for unvisited children
    UCT = np.array([
        child.Q + self.c * (np.sqrt(self.n + 1) / (child.n + 1))
        for child in self.children
    ])
    winner = argmax(UCT)
    return self.children[winner]

def pi_wrapper(self, s, current_depth, max_depth):
    # Compute the reduced budget as a function of the search root depth
    if self.scheduler_params:
        l = self.schedule(current_depth,
                          k=self.scheduler_params["slope"],
                          mid=self.scheduler_params["mid"],
                          width=current_depth + max_depth)
        # self.scheduler_budget = max(int(self.budget * (1 - l)), self.scheduler_params["min_budget"])
        self.scheduler_budget = max(int(self.budget * l),
                                    self.scheduler_params["min_budget"])
        # print("\nDepth: {}\nBudget: {}".format(current_depth, self.scheduler_budget))

    if self.mcts_only:
        self.search(self.n_mcts, self.c_dpw, self.mcts_env, max_depth)
        state, pi, V = self.return_results(self.temp)  # TODO put 0 if the network is enabled
        self.curr_probs.append(pi)
        a_w = argmax(pi)
        # max_p = np.max(pi)
        # a_w = np.random.choice(np.argwhere(pi == max_p)[0])
    else:
        pi_w = self.get_model().predict_pi(s).flatten()
        self.curr_probs.append(pi_w)
        max_p = np.max(pi_w)
        # Break ties between equally probable actions at random
        a_w = np.random.choice(np.argwhere(pi_w == max_p).flatten())
    return a_w

def pi_wrapper(self, s, current_depth, max_depth):
    # Compute the reduced budget as a function of the search root depth
    if self.scheduler_params:
        l = self.schedule(current_depth,
                          k=self.scheduler_params["slope"],
                          mid=self.scheduler_params["mid"],
                          width=current_depth + max_depth)
        # self.scheduler_budget = max(int(self.budget * (1 - l)), self.scheduler_params["min_budget"])
        self.scheduler_budget = max(int(self.budget * l),
                                    self.scheduler_params["min_budget"])
        # print("\nDepth: {}\nBudget: {}".format(current_depth, self.scheduler_budget))

    if self.mcts_only:
        if len(self.get_env().get_available_actions(self.get_mcts().owner)) > 1:
            self.search(self.n_mcts, self.c_dpw[self._current_agent],
                        self.mcts_env, max_depth)
            state, pi, V = self.return_results(self.temp)  # TODO put 0 if the network is enabled
        else:
            # Only one action is available, so the policy is deterministic
            pi = np.zeros(self.get_mcts().na)
            pi[0] = 1.
        self.curr_probs.append(pi)
        a_w = argmax(pi)
        # max_p = np.max(pi)
        # a_w = np.random.choice(np.argwhere(pi == max_p)[0])
    else:
        raise NotImplementedError(
            "No policy network has been implemented for RaceStrategy")
    return a_w

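# Both pi_wrapper variants shrink the search budget with the depth of the
# search root through self.schedule. The original schedule function is not
# shown in these snippets; the sketch below assumes a logistic curve
# parameterized by slope, midpoint and width, which matches the keyword
# arguments but is only an assumption.
import numpy as np

def logistic_schedule(depth, k, mid, width):
    # Hypothetical schedule: logistic function of the normalized root depth
    x = depth / max(width, 1)
    return 1.0 / (1.0 + np.exp(-k * (x - mid)))

budget, min_budget, max_depth = 1000, 50, 40
for current_depth in (0, 10, 20, 40):
    l = logistic_schedule(current_depth, k=10, mid=0.5,
                          width=current_depth + max_depth)
    scheduler_budget = max(int(budget * l), min_budget)
    print(current_depth, scheduler_budget)
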
def select(self, c=1.5):
    ''' Select one of the child actions based on UCT rule '''
    UCT = np.array([
        child_action.Q + prior * c * (np.sqrt(self.n + 1) / (child_action.n + 1))
        for child_action, prior in zip(self.child_actions, self.priors)
    ])
    winner = argmax(UCT)
    return self.child_actions[winner]

def select(self, c=1.5):
    """ Select one of the child actions based on UCT rule """
    # TODO check here
    UCT = np.array([
        child_action.Q + c * np.sqrt(np.log(self.n) / child_action.n)
        if child_action.n > 0 else np.inf
        for child_action in self.child_actions
    ])
    winner = argmax(UCT)
    return self.child_actions[winner]

def select(self, c=1.5):
    """ Select one of the child actions based on UCT rule """
    uct = np.array([
        child_action.q + prior * c * (np.sqrt(self.n + 1) / (child_action.n + 1))
        for child_action, prior in zip(self.child_actions, self.priors)
    ])
    winner = argmax(uct)
    return self.child_actions[winner]

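# The two prior-weighted variants above resemble the PUCT score used in
# AlphaZero-style search, with the prior scaling only the exploration term.
# Standalone sketch on made-up statistics; q, n_child and priors are
# illustrative, not taken from the snippets above.
import numpy as np

q = np.array([0.2, 0.5, 0.1])        # mean value per child action
n_child = np.array([8, 3, 1])        # visit counts per child action
priors = np.array([0.6, 0.3, 0.1])   # policy prior per child action
n_parent = n_child.sum()
c = 1.5

scores = q + priors * c * (np.sqrt(n_parent + 1) / (n_child + 1))
print(np.argmax(scores))  # index of the selected child
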
def select(self, c=1.5):
    """ Select one of the child actions based on UCT rule """
    # TODO check here
    uct_upper_bound = np.array([
        child_action.Q + c * (np.sqrt(self.n + 1) / (child_action.n + 1))
        for child_action in self.child_actions
    ])
    winner = argmax(uct_upper_bound)
    return self.child_actions[winner]

def _viterbi_decode(self, feats):
    # initialize list to keep track of backpointers
    backpointers = []

    # initialize the viterbi variables in log space
    init_vvars = torch.full((1, self.tagset_size), -10000.)
    init_vvars[0][self.tag_to_ix[START_TAG]] = 0

    # forward_var at step i holds the viterbi variables for step i-1
    forward_var = init_vvars.unsqueeze(0)
    bsz, time, dim = feats.unsqueeze(0).size()

    for feat in feats:
        # calculate scores of next tag per tag
        forward_var = forward_var.view(bsz, 1, self.tagset_size)
        trans_scores = self.log_transitions.unsqueeze(0)
        next_tag_vars = forward_var + trans_scores

        # get best next tags and viterbi vars
        _, idx = torch.max(next_tag_vars, 2)
        best_tag_ids = idx.view(bsz, -1)
        indices = torch.transpose(best_tag_ids.unsqueeze(0), 1, 2)
        viterbivars_t = torch.gather(next_tag_vars, 2, indices).squeeze(2)

        # add emission scores and assign forward_var to the set
        # of viterbi variables we just computed
        forward_var = (viterbivars_t + feat).view(1, -1)
        backpointers.append(best_tag_ids[0].tolist())

    # transition to STOP_TAG
    terminal_var = forward_var + self.log_transitions[self.tag_to_ix[STOP_TAG]]
    best_tag_id = argmax(terminal_var)
    path_score = terminal_var[0][best_tag_id]

    # follow the back pointers to decode the best path
    best_path = [best_tag_id]
    for bptrs_t in reversed(backpointers):
        best_tag_id = bptrs_t[best_tag_id]
        best_path.append(best_tag_id)

    # pop off the start tag (we don't want to return that to the caller)
    start = best_path.pop()
    assert start == self.tag_to_ix[START_TAG]  # sanity check
    best_path.reverse()
    return path_score, best_path

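# For reference, the same max-then-backtrack recursion on a tiny hand-written
# model (2 tags, 3 time steps) in plain NumPy. All scores are made up for
# illustration and are unrelated to the CRF above.
import numpy as np

emissions = np.array([[0.9, 0.1],    # score of each tag at step 0
                      [0.2, 0.8],    # step 1
                      [0.6, 0.4]])   # step 2
transitions = np.array([[0.7, 0.3],  # transitions[i, j]: score of tag j -> tag i
                        [0.4, 0.6]])

n_steps, n_tags = emissions.shape
viterbi = emissions[0].copy()
backpointers = []

for t in range(1, n_steps):
    # scores[i, j]: best score ending in tag i at step t via tag j at step t-1
    scores = transitions + viterbi[None, :]
    backpointers.append(scores.argmax(axis=1))
    viterbi = scores.max(axis=1) + emissions[t]

# Backtrack from the best final tag
best_tag = int(viterbi.argmax())
best_path = [best_tag]
for best_prev in reversed(backpointers):
    best_tag = int(best_prev[best_tag])
    best_path.append(best_tag)
best_path.reverse()

print(best_path, viterbi.max())
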
def maximally_correlating_ordering(correlations):
    """Given the correlation matrix of columns of some matrices a and b,
    return ordering indices such that b[:, ordering] correlates maximally
    with a. Maximally correlating pairs of columns are chosen greedily.
    """
    c = np.array(correlations)
    n = len(c)
    permutation = np.zeros(n, dtype=int)
    correlation_signs = np.zeros(n, dtype=int)

    # In a greedy fashion, match best correlating columns with each other
    for _ in range(n):
        # Find the best correlation of columns that are both still available
        a_col, b_col = argmax(abs(c))
        permutation[a_col] = b_col
        correlation_signs[a_col] = np.sign(c[a_col, b_col])
        # Mark found columns as unavailable
        c[a_col, :] = 0
        c[:, b_col] = 0

    return permutation, correlation_signs

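# maximally_correlating_ordering relies on an argmax helper that returns a
# (row, column) pair for a 2-D input, unlike the scalar argmax used in the
# other snippets. Minimal usage sketch on a made-up 3x3 correlation matrix,
# substituting np.unravel_index for that helper.
import numpy as np

correlations = np.array([[ 0.1,  0.9, -0.2],
                         [-0.8,  0.3,  0.1],
                         [ 0.2, -0.1,  0.7]])

c = correlations.copy()
n = len(c)
permutation = np.zeros(n, dtype=int)
signs = np.zeros(n, dtype=int)

for _ in range(n):
    # Stand-in for the snippet's 2-D argmax helper
    a_col, b_col = np.unravel_index(np.argmax(np.abs(c)), c.shape)
    permutation[a_col] = b_col
    signs[a_col] = int(np.sign(correlations[a_col, b_col]))
    c[a_col, :] = 0
    c[:, b_col] = 0

print(permutation, signs)  # -> [1 0 2] and [ 1 -1  1]
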
def select(self, stochastic=True):
    # Collect the (possibly sampled) Q-value of every child action
    qs = np.zeros(self.na)
    for i, a in enumerate(self.child_actions):
        qs[i] = a.q(stochastic)
    # Return the child action with the highest value
    return self.child_actions[argmax(qs)]