def eps_greedy_choice_linear(self, state, epsilon):
    """Epsilon-greedy action selection under linear function approximation."""
    Qa = np.zeros(2)
    # epsilon greedy policy: exploit with probability 1 - epsilon
    if random.random() > epsilon:
        for action in Actions.get_values():
            phi = self.feature_computation(state, action)
            Qa[action] = sum(phi * self.theta)  # Q(s, a) = phi(s, a) . theta
        a_next = Actions.get_action(np.argmax(Qa))
    else:
        # explore: pick hit or stick uniformly at random
        a_next = Actions.hit if random.random() < 0.5 else Actions.stick
    # value estimate of the chosen action
    phi = self.feature_computation(state, a_next)
    my_Qa = sum(phi * self.theta)
    return [a_next, my_Qa]
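# A minimal standalone sketch (not part of the class above) of the linear
# action-value estimate used in eps_greedy_choice_linear: with a binary feature
# vector phi(s, a) and weight vector theta, sum(phi * theta) is simply the dot
# product phi . theta. The feature length of 36 is an assumption borrowed from
# the usual Easy21 coarse coding and is illustrative only.
import numpy as np

phi = np.zeros(36)
phi[[4, 17]] = 1.0                      # two overlapping feature cuboids active
theta = np.random.default_rng(0).normal(size=36)
q_estimate = float(np.dot(phi, theta))  # equivalent to sum(phi * theta)
assert np.isclose(q_estimate, sum(phi * theta))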
def eps_greedy_choice(self, state):
    """Epsilon-greedy action selection from the tabular action-value function Q."""
    # collect visits to the current state
    try:
        visits_to_state = sum(self.N[state.dl_sum - 1, state.pl_sum - 1, :])
    except IndexError:
        visits_to_state = 0
    # compute epsilon from the visit count: decays as the state is seen more often
    curr_epsilon = self.n0 / (self.n0 + visits_to_state)
    # epsilon greedy policy
    if random.random() < curr_epsilon:
        # explore: pick hit or stick uniformly at random
        return Actions.hit if random.random() < 0.5 else Actions.stick
    else:
        # exploit: greedy action under the current Q estimates
        return Actions.get_action(
            np.argmax(self.Q[state.dl_sum - 1, state.pl_sum - 1, :]))
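# A minimal standalone sketch (not part of the class above) of the exploration
# schedule used in eps_greedy_choice: epsilon_t = N0 / (N0 + N(s_t)) starts at 1
# for an unvisited state and decays toward 0 as the state is visited more often.
# The value n0 = 100 and the visit counts below are illustrative assumptions.
import numpy as np

n0 = 100.0
visit_counts = np.array([0, 10, 100, 1000, 10000])
epsilons = n0 / (n0 + visit_counts)
print(epsilons)  # [1.0, 0.909..., 0.5, 0.0909..., 0.0099...]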