def return_results(self, decision_type, backup_policy, temperature, c):
    """Aggregate backward-pass statistics into a policy target, value and action.

    Args:
        decision_type: 'count' picks the most-visited action, 'mean' picks the
            tried action with the highest mean Q.
        backup_policy: forwarded to get_backward_counts.
        temperature: temperature for normalizing the counts into probs.
        c: exploration constant forwarded to get_backward_counts.

    Returns:
        probs: temperature-normalized backward-count distribution.
        V: count-weighted mean of the child Q values, as a length-1 array.
        a: index of the selected child action.

    Raises:
        ValueError: if decision_type is not 'count' or 'mean'.
    """
    counts = self.get_backward_counts(backward_policy=backup_policy, c=c)
    probs = stable_normalizer(counts, temperature)
    Q = np.array([child_action.Q for child_action in self.child_actions],
                 dtype='float32')
    # Count-weighted mean of Q. The [None] applies to the whole quotient so V
    # is a length-1 array (as in the sibling return_results); the original
    # attached [None] to the denominator only, which relied on numpy-scalar
    # indexing and obscured the intent.
    V = (np.sum(counts * Q) / np.sum(counts))[None]
    if decision_type == 'count':
        a = my_argmax(counts)
    elif decision_type == 'mean':
        # untried actions get -inf so they can never be selected
        Q_ = np.array([child_action.Q if child_action.n > 0 else -np.Inf
                       for child_action in self.child_actions])
        a = my_argmax(Q_)
    else:
        # previously an unknown decision_type fell through to a NameError
        raise ValueError('Unknown decision_type: {}'.format(decision_type))
    return probs, V, a
def ucb_backward_sample(self, c):
    """UCB sample for the backward pass (does not use sigma_tree).

    Untried actions get a -inf exploration term so they can never be picked.
    """
    sqrt_n = np.sqrt(self.n)
    means = []
    bonuses = []
    for child_action in self.child_actions:
        means.append(child_action.Q)
        if child_action.n > 0:
            bonuses.append(c * (sqrt_n / child_action.n))
        else:
            bonuses.append(-np.Inf)  # prevents selecting an untried action
    Q = np.array(means, dtype='float32')
    U = np.array(bonuses, dtype='float32')
    scores = np.squeeze(Q + U)
    return my_argmax(scores)
def thompson_policy_sample(self):
    """Thompson sample for the backward pass."""
    # not used right now
    draws = np.array([
        child_action.Q + np.random.normal(0, 1) / np.sqrt(child_action.n)
        if child_action.n > 0
        else -np.Inf  # can't select an untried action
        for child_action in self.child_actions
    ])
    return my_argmax(draws)
def select(self, c):
    """Select one of the child actions based on the UCT rule."""
    exploit = np.array([ca.Q for ca in self.child_actions], dtype='float32')
    explore = np.array(
        [c * (np.sqrt(self.n) / ca.n) if ca.n > 0 else np.Inf
         for ca in self.child_actions],
        dtype='float32')
    if self.sigma_tree:
        # scale the exploration bonus by the per-action sigma estimates
        # (in-place so the float32 dtype of the bonus vector is kept)
        explore *= np.array([ca.sigma_t for ca in self.child_actions])
    winner = my_argmax(exploit + explore)
    return self.child_actions[winner]
def return_results(self, decision_type='count', loss_type='count', V_decision='on-policy', temperature=1):
    """Aggregate search results into a policy target, value estimate and action.

    Args:
        decision_type: 'count' picks the most-visited action, 'mean' picks the
            tried action with the highest mean Q.
        loss_type: 'count' returns temperature-normalized counts as probs,
            'Q' returns the raw Q values (still needs a logsumexp downstream).
        V_decision: 'on-policy' (count-weighted mean of Q) or 'max' (max Q).
        temperature: temperature for normalizing the counts into probs.

    Returns:
        probs: policy target over child actions.
        a_list: indices of the child actions.
        V: value estimate as a length-1 array.
        a_chosen: environment index of the selected child action.
        a_argmax: position of the selected child action in child_actions.

    Raises:
        ValueError: on an unrecognized decision_type, loss_type or V_decision.
    """
    # aggregate some results
    counts = np.array(
        [child_action.n for child_action in self.child_actions],
        dtype='float32')
    Q = np.array([child_action.Q for child_action in self.child_actions],
                 dtype='float32')
    a_list = [child_action.index for child_action in self.child_actions]
    # decision
    if decision_type == 'count':
        a_argmax = my_argmax(counts)
    elif decision_type == 'mean':
        # untried actions get -inf so they can never be selected
        Q2 = np.array([
            child_action.Q if child_action.n > 0 else -np.Inf
            for child_action in self.child_actions
        ])
        a_argmax = my_argmax(Q2)
    else:
        raise ValueError('Unknown decision_type: {}'.format(decision_type))
    a_chosen = self.child_actions[a_argmax].index
    # loss
    if loss_type == 'count':
        probs = stable_normalizer(counts, temperature)
    elif loss_type == 'Q':
        probs = Q  # needs logsumexp
    else:
        raise ValueError('Unknown loss_type: {}'.format(loss_type))
    # estimate V
    # Bug fix: the default argument is 'on-policy' (hyphen) but the original
    # branch tested 'on_policy' (underscore), so calling with the default left
    # V unbound and raised NameError. Accept both spellings for compatibility.
    if V_decision in ('on-policy', 'on_policy'):
        V = np.sum((counts / np.sum(counts)) * Q)[None]
    elif V_decision == 'max':
        V = np.max(Q)[None]
    else:
        raise ValueError('Unknown V_decision: {}'.format(V_decision))
    return probs, a_list, V, a_chosen, a_argmax
def get_backward_counts(self, backward_policy, c):
    """Return a vector of counts to be used as policy in the backward pass.

    Args:
        backward_policy: 'on-policy' (raw visitation counts), 'off-policy'
            (one-hot on the greedy tried action), 'ucb' or 'ucb-<c>' (UCB
            backward sampling, optionally with an embedded constant), or
            'thompson' (Thompson backward sampling).
        c: fallback exploration constant for the 'ucb' policies when no
            constant is embedded in the policy string.

    Returns:
        float32 numpy array of one count per child action.

    Raises:
        ValueError: if backward_policy is not one of the recognized options.
    """
    if backward_policy == 'on-policy':
        counts = [child_action.n for child_action in self.child_actions]
    elif backward_policy == 'off-policy':
        # one-hot on the best tried action; untried actions get -inf
        Q = np.array([
            child_action.Q if child_action.n > 0 else -np.Inf
            for child_action in self.child_actions
        ])
        counts = [0 for _ in range(len(self.child_actions))]
        counts[my_argmax(Q)] += 1
    elif 'ucb' in backward_policy:
        # an exploration constant may be embedded as 'ucb-<c>'
        try:
            _, c = backward_policy.split('-')
        except ValueError:
            # was a bare except:; only the failed unpack should fall back to
            # the c argument
            pass
        backward_a = self.ucb_backward_sample(float(c))
        self.child_actions[backward_a].backward_n += 1
        counts = [child_action.backward_n
                  for child_action in self.child_actions]
    elif backward_policy == 'thompson':
        backward_a = self.thompson_policy_sample()
        self.child_actions[backward_a].backward_n += 1
        counts = [child_action.backward_n
                  for child_action in self.child_actions]
    else:
        # previously an unknown policy fell through to a NameError
        raise ValueError('Unknown backward_policy: {}'.format(backward_policy))
    return np.array(counts, dtype='float32')
def select(self, c):
    """Select one of the child actions based on the UCT rule."""
    # first check whether we need to add a child
    self.add_child_actions()
    q_vals = np.array([ca.Q for ca in self.child_actions], dtype='float32')
    u_vals = np.array([
        c * (np.sqrt(self.n) / ca.n) if ca.n >= 1 else np.Inf
        for ca in self.child_actions
    ], dtype='float32')
    if self.use_prior:
        u_vals *= np.array(self.priors, dtype='float32')
    if self.sigma_tree:
        u_vals *= np.array(self.sigma_actions_t, dtype='float32')
    scores = np.squeeze(q_vals + u_vals)
    winner = my_argmax(scores)
    # guard against NaNs leaking out of the UCT computation
    if np.any(np.isnan(scores)):
        print('Q (means): {}, U (UCB): {}'.format(q_vals, u_vals))
        raise ValueError('Nans produced in select step')
    return self.child_actions[winner]