def softmax_selection(self, state_index, Q):
    possible_actions = self.env.get_possible_actions(state_index)
    # With a single available action there is nothing to choose.
    if len(possible_actions) == 1:
        return 0
    # Q is assumed to be aligned with possible_actions.
    probabilities = utils.softmax(Q, self.inv_temp)
    action_idx = np.random.choice(list(range(len(possible_actions))), p=probabilities)
    return action_idx

def select_action(self, state_idx, softmax=True):
    # TODO: sample from a categorical distribution over next states.
    # This one-step lookahead is purely local: it amounts to hill-climbing
    # (gradient-style ascent) on the value function over the state graph.
    # TODO: maybe switch to a state-action SR, M(sa, sa), so that values are
    # computed over state-action pairs directly in the two-step task.
    V = self.M_hat @ self.R_hat  # state values from the estimated SR and rewards
    next_states = [self.env.get_next_state(state_idx, a) for a in range(self.env.nr_actions)]
    Q = [V[s] for s in next_states]  # value of the state each action leads to
    probabilities = utils.softmax(Q, self.beta)
    return np.random.choice(list(range(self.env.nr_actions)), p=probabilities)

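# Illustrative sketch (hypothetical numbers, not from this codebase): the line
# V = self.M_hat @ self.R_hat above uses the successor-representation identity
# V = M R, where M holds expected discounted future state occupancies and R the
# estimated one-step rewards, so a single matrix-vector product yields every
# state's value. For a 3-state chain with discount 0.9 and reward in the last
# state:

import numpy as np

M_hat = np.array([[1.0, 0.9, 0.81],
                  [0.0, 1.0, 0.90],
                  [0.0, 0.0, 1.00]])
R_hat = np.array([0.0, 0.0, 1.0])
V = M_hat @ R_hat  # array([0.81, 0.9, 1.0]): value grows as the reward gets closer
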
def select_action(self, state_idx, softmax=True):
    # TODO: sample from a categorical distribution over next states.
    # This one-step lookahead is purely local: gradient-based planning by
    # hill-climbing on the value function over the state graph.
    # TODO: maybe switch to a state-action SR, M(sa, sa), so that values are
    # computed over state-action pairs directly in the two-step task.
    next_states = [self.env.get_next_state(state_idx, a) for a in range(self.env.nr_actions)]
    Q = [self.compute_V(s) for s in next_states]  # value of the state each action leads to
    probabilities = utils.softmax(Q, self.beta)
    try:
        return np.random.choice(list(range(self.env.nr_actions)), p=probabilities)
    except ValueError as err:
        # np.random.choice raises ValueError when the probabilities are invalid
        # (e.g. NaN or not summing to 1); re-raise with context instead of
        # returning an undefined action.
        raise ValueError(f'Invalid action probabilities {probabilities} '
                         f'in state {state_idx}') from err

def softmax_selection(self, state_index, Q):
    probabilities = utils.softmax(Q, self.inv_temp)
    action_idx = np.random.choice(list(range(self.env.nr_actions)), p=probabilities)
    return action_idx

def softmax_selection(self, state_idx):
    probabilities = utils.softmax(self.Q[state_idx], self.beta)
    action_idx = np.random.choice(list(range(self.env.nr_actions)), p=probabilities)
    return action_idx

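# All of the selection rules above rely on utils.softmax(Q, beta). That helper
# is not shown here; the sketch below is an assumption about its behaviour (a
# numerically stable softmax with inverse temperature), not the project's
# actual implementation.

import numpy as np

def softmax(q_values, inv_temp):
    # Subtracting the max before exponentiating avoids overflow for large
    # |inv_temp * q| while leaving the resulting distribution unchanged.
    x = inv_temp * np.asarray(q_values, dtype=float)
    x = x - np.max(x)
    exp_x = np.exp(x)
    return exp_x / exp_x.sum()

# Example: a high inverse temperature concentrates mass on the best action.
# softmax([1.0, 2.0, 3.0], inv_temp=5.0) -> approx [0.00005, 0.0067, 0.993]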