Example 1: epsilon-greedy action selection with a linear function approximator
    def eps_greedy_choice_linear(self, state, epsilon):
        """Pick an action epsilon-greedily from the linear estimate Q(s, a) = phi(s, a) . theta."""
        Qa = np.zeros(2)  # one action-value slot per action

        # epsilon greedy policy: exploit with probability 1 - epsilon
        if random.random() > epsilon:
            for action in Actions.get_values():
                phi = self.feature_computation(state, action)
                Qa[action] = np.dot(phi, self.theta)
            a_next = Actions.get_action(np.argmax(Qa))
        else:
            # explore: pick hit or stick uniformly at random
            a_next = Actions.hit if random.random() < 0.5 else Actions.stick

        # value estimate of the chosen action under the current weights
        phi = self.feature_computation(state, a_next)
        my_Qa = np.dot(phi, self.theta)
        return [a_next, my_Qa]
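
The snippet above relies on an `Actions` helper and a `feature_computation` method that are defined elsewhere in the class and not shown here. The following is only a minimal sketch of what they might look like, assuming two integer-valued actions and a binary coarse coding over (dealer sum, player sum, action), a common choice for blackjack-style linear approximation; the class layout, the interval boundaries, and the `State` tuple are illustrative assumptions, not the original implementation.

    from collections import namedtuple

    import numpy as np

    # Assumed state container, matching the attribute names used in these examples.
    State = namedtuple("State", ["dl_sum", "pl_sum"])


    class Actions:
        # Hypothetical two-action helper consistent with the calls used above.
        hit = 0
        stick = 1

        @staticmethod
        def get_values():
            return [Actions.hit, Actions.stick]

        @staticmethod
        def get_action(index):
            return Actions.hit if index == Actions.hit else Actions.stick


    def feature_computation(state, action):
        # Illustrative binary coarse coding: every overlapping (dealer, player, action)
        # cuboid that the pair falls into switches one feature on.
        dealer_bins = [(1, 4), (4, 7), (7, 10)]
        player_bins = [(1, 6), (4, 9), (7, 12), (10, 15), (13, 18), (16, 21)]
        phi = np.zeros(len(dealer_bins) * len(player_bins) * 2)
        i = 0
        for d_lo, d_hi in dealer_bins:
            for p_lo, p_hi in player_bins:
                for a in (Actions.hit, Actions.stick):
                    if (d_lo <= state.dl_sum <= d_hi
                            and p_lo <= state.pl_sum <= p_hi
                            and a == action):
                        phi[i] = 1.0
                    i += 1
        return phi


    # Usage: dealer showing 5, player total 14, action "hit"
    phi = feature_computation(State(dl_sum=5, pl_sum=14), Actions.hit)
    print(int(phi.sum()))  # number of active features for this (state, action), here 2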
Example 2: epsilon-greedy action selection from a tabular Q, with epsilon decayed by the state's visit count
    def eps_greedy_choice(self, state):
        """Pick an action epsilon-greedily from the tabular Q, decaying epsilon with the visit count."""
        # collect visits to this state; fall back to 0 if the state indexes outside the table
        try:
            visits_to_state = np.sum(self.N[state.dl_sum - 1,
                                            state.pl_sum - 1, :])
        except IndexError:
            visits_to_state = 0

        # compute epsilon with the visit-based schedule: epsilon = N0 / (N0 + N(s))
        curr_epsilon = self.n0 / (self.n0 + visits_to_state)

        # epsilon greedy policy: explore with probability epsilon, otherwise act greedily on Q
        if random.random() < curr_epsilon:
            return Actions.hit if random.random() < 0.5 else Actions.stick
        else:
            return Actions.get_action(
                np.argmax(self.Q[state.dl_sum - 1, state.pl_sum - 1, :]))
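
The schedule `curr_epsilon = n0 / (n0 + visits_to_state)` makes exploration fade as a state is visited more often (a common GLIE-style choice). A small stand-alone sketch of how that decay behaves, assuming an arbitrary N0 = 100 since the example does not show what `self.n0` is set to:

    n0 = 100  # assumed constant; the method above reads it from self.n0

    for visits in (0, 10, 100, 1000, 10000):
        epsilon = n0 / (n0 + visits)
        print(f"N(s) = {visits:5d} -> epsilon = {epsilon:.3f}")

    # N(s) =     0 -> epsilon = 1.000  (unseen state: always explore)
    # N(s) =   100 -> epsilon = 0.500
    # N(s) = 10000 -> epsilon = 0.010  (well-visited state: almost always greedy)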