Example 1
 def return_results(self,decision_type,backup_policy,temperature,c):
     ''' Aggregate search results: policy target (probs), value estimate (V) and chosen action (a) '''
     counts = self.get_backward_counts(backward_policy=backup_policy,c=c)
     probs = stable_normalizer(counts,temperature)
     Q = np.array([child_action.Q for child_action in self.child_actions],dtype='float32')
     V = np.array([np.sum(counts*Q)/np.sum(counts)]) # count-weighted mean of the child Q values

     if decision_type == 'count':
         a = my_argmax(counts)
     elif decision_type == 'mean':
         Q_ = np.array([child_action.Q if child_action.n > 0 else -np.inf for child_action in self.child_actions])
         a = my_argmax(Q_)
     else:
         raise ValueError('Unknown decision_type: {}'.format(decision_type))
     return probs,V,a
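All of these snippets assume two helper functions, my_argmax and stable_normalizer, that are not shown here. A plausible minimal sketch of both, assuming my_argmax is an argmax with random tie-breaking and stable_normalizer turns visit counts into a temperature-scaled probability vector:

import numpy as np

def my_argmax(x):
    ''' argmax with random tie-breaking (assumed behaviour, not shown in the examples) '''
    x = np.asarray(x)
    winners = np.flatnonzero(x == np.max(x))
    return np.random.choice(winners)

def stable_normalizer(counts, temperature):
    ''' assumed: rescale counts for numerical stability, apply the temperature, normalize to a distribution '''
    counts = np.asarray(counts, dtype='float64')
    scaled = (counts / np.max(counts)) ** (1.0 / temperature)
    return scaled / np.sum(scaled)

Random tie-breaking matters because counts and Q values are frequently tied early in the search, and always picking the first maximum would bias the tree.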
Example 2
 def ucb_backward_sample(self,c):
     ''' UCB sample for the backward pass. Does not use sigma_tree. Note the -np.inf in U, which prevents selecting an untried action '''
     Q = np.array([child_action.Q for child_action in self.child_actions],dtype='float32')
     U = np.array([c * (np.sqrt(self.n)/child_action.n) if child_action.n > 0 else -np.inf for child_action in self.child_actions],dtype='float32')
     scores = np.squeeze(Q + U)
     winner = my_argmax(scores)
     return winner
Example 3
 def thompson_policy_sample(self):
     ''' Thompson sample for backward pass '''
     # not used right now
     samples = []
     for child_action in self.child_actions:
         if child_action.n > 0:
             # Gaussian sample around Q, with noise shrinking as 1/sqrt(n)
             samples.append(child_action.Q + np.random.normal(0,1)/np.sqrt(child_action.n))
         else:
             samples.append(-np.inf) # can't select an untried action
     return my_argmax(np.array(samples))
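A small standalone illustration of how the two backward samplers above (Examples 2 and 3) score a set of hypothetical child statistics; the (Q, n) pairs, the parent count and the exploration constant are invented for the example. In both cases the untried action receives -inf, so it can never be chosen in the backward pass:

import numpy as np

child_stats = [(0.5, 6), (0.8, 3), (0.2, 3), (0.0, 0)]  # hypothetical (Q, n) per child
parent_n = sum(n for _, n in child_stats)                # parent visit count, here 12
c = 1.0                                                  # exploration constant

# UCB score as in ucb_backward_sample: Q + c*sqrt(N)/n, untried actions get -inf
ucb = np.array([q + c * np.sqrt(parent_n) / n if n > 0 else -np.inf
                for q, n in child_stats])

# Thompson draw as in thompson_policy_sample: Q plus Gaussian noise shrinking with 1/sqrt(n)
thompson = np.array([q + np.random.normal(0, 1) / np.sqrt(n) if n > 0 else -np.inf
                     for q, n in child_stats])

print('UCB scores     :', ucb)       # deterministic given the statistics
print('Thompson sample:', thompson)  # stochastic, one fresh draw per call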
Example 4
 def select(self,c):
     ''' Select one of the child actions based on UCT rule '''
     Q = np.array([child_action.Q for child_action in self.child_actions],dtype='float32')
     # untried actions get an infinite bonus, so they are always selected first
     U = np.array([c * (np.sqrt(self.n)/child_action.n) if child_action.n > 0 else np.inf for child_action in self.child_actions],dtype='float32')
     if self.sigma_tree:
         sigma_actions_t = np.array([child_action.sigma_t for child_action in self.child_actions])
         U *= sigma_actions_t
     scores = Q + U
     winner = my_argmax(scores)
     return self.child_actions[winner]
Example 5
    def return_results(self,
                       decision_type='count',
                       loss_type='count',
                       V_decision='on-policy',
                       temperature=1):
        # aggregate some results
        counts = np.array(
            [child_action.n for child_action in self.child_actions],
            dtype='float32')
        Q = np.array([child_action.Q for child_action in self.child_actions],
                     dtype='float32')
        a_list = [child_action.index for child_action in self.child_actions]

        # decision
        if decision_type == 'count':
            a_argmax = my_argmax(counts)
        elif decision_type == 'mean':
            Q2 = np.array([
                child_action.Q if child_action.n > 0 else -np.inf
                for child_action in self.child_actions
            ])
            a_argmax = my_argmax(Q2)
        a_chosen = self.child_actions[a_argmax].index

        # loss
        if loss_type == 'count':
            probs = stable_normalizer(counts, temperature)
        elif loss_type == 'Q':
            probs = Q  # needs logsumexp

        # estimate V
        if V_decision == 'on-policy':
            V = np.sum((counts / np.sum(counts)) * Q)[None]
        elif V_decision == 'max':
            V = np.max(Q)[None]

        return probs, a_list, V, a_chosen, a_argmax
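A quick worked example of the aggregation that return_results performs, with invented counts and Q values (np.argmax stands in for my_argmax to keep the snippet self-contained): the 'count' decision picks the most visited action, and the on-policy value estimate is the count-weighted mean of the child Q values:

import numpy as np

counts = np.array([10., 4., 1.], dtype='float32')    # hypothetical visit counts
Q = np.array([0.6, 0.9, 0.1], dtype='float32')       # hypothetical child means

a_argmax = np.argmax(counts)                          # 'count' decision -> action 0
V_on_policy = np.sum((counts / np.sum(counts)) * Q)  # (10*0.6 + 4*0.9 + 1*0.1)/15 ~= 0.647
V_max = np.max(Q)                                     # 'max' decision -> 0.9

print(a_argmax, V_on_policy, V_max)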
Example 6
 def get_backward_counts(self,backward_policy,c):
     ''' returns a vector of counts to be used as policy in the backward pass '''
     if backward_policy == 'on-policy':
         counts = [child_action.n for child_action in self.child_actions]
     elif backward_policy == 'off-policy':
         Q = np.array([child_action.Q if child_action.n > 0 else -np.inf for child_action in self.child_actions])
         counts = [0 for i in range(len(self.child_actions))]
         index = my_argmax(Q)
         counts[index] += 1
     elif 'ucb' in backward_policy:
         try:
             _,c = backward_policy.split('-') # e.g. 'ucb-2.0' encodes its own exploration constant
         except ValueError:
             pass # no constant encoded in the name: keep the c that was passed in
         backward_a = self.ucb_backward_sample(float(c))
         self.child_actions[backward_a].backward_n += 1
         counts = [child_action.backward_n for child_action in self.child_actions]
     elif backward_policy == 'thompson':
         backward_a = self.thompson_policy_sample()
         self.child_actions[backward_a].backward_n += 1
         counts = [child_action.backward_n for child_action in self.child_actions]
     else:
         raise ValueError('Unknown backward_policy: {}'.format(backward_policy))
     return np.array(counts,dtype='float32')
Example 7
    def select(self, c):
        ''' Select one of the child actions based on UCT rule '''
        # first check whether we need to add a child
        self.add_child_actions()

        Q = np.array([child_action.Q for child_action in self.child_actions],
                     dtype='float32')
        U = np.array([
            c * (np.sqrt(self.n) / child_action.n)
            if child_action.n >= 1 else np.inf
            for child_action in self.child_actions
        ],
                     dtype='float32')
        if self.use_prior:
            U *= np.array(self.priors, dtype='float32')
        if self.sigma_tree:
            U *= np.array(self.sigma_actions_t, dtype='float32')
        scores = np.squeeze(Q + U)
        if np.any(np.isnan(scores)):
            print('Q (means): {}, U (UCB): {}'.format(Q, U))
            raise ValueError('Nans produced in select step')
        winner = my_argmax(scores)
        return self.child_actions[winner]
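To see what the two optional multipliers in this last select variant do, here is a toy computation with invented numbers: the prior and the per-action sigma values simply rescale the exploration bonus U before it is added to Q, so low-prior or low-uncertainty actions receive a smaller bonus:

import numpy as np

c = 1.0
Q       = np.array([0.5, 0.3, 0.4], dtype='float32')   # hypothetical child means
n_child = np.array([5.,  3.,  1. ], dtype='float32')   # hypothetical child visit counts
n_node  = np.sum(n_child)                               # parent visit count, here 9
priors  = np.array([0.6, 0.3, 0.1], dtype='float32')   # hypothetical prior over actions
sigma_t = np.array([1.0, 0.5, 2.0], dtype='float32')   # hypothetical per-action uncertainty

U = c * np.sqrt(n_node) / n_child   # basic UCT bonus: [0.6, 1.0, 3.0]
U = U * priors * sigma_t            # use_prior and sigma_tree both rescale the bonus
scores = Q + U                      # [0.86, 0.45, 1.0]
print(scores, np.argmax(scores))    # the low-count, high-sigma action wins here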