Example #1
    def MC_control(self, iterations):

        # Initialise
        self.iter = iterations
        self.method = "MC_control"
        count_wins = 0
        episode_pairs = []

        # Loop over episodes (complete game runs)
        for episode in xrange(self.iter):

            # reset state action pair list
            episode_pairs = []

            # get initial state for current episode
            my_state = self.env.get_initial_state()

            # Execute until game ends
            while not my_state.term:

                # choose action with epsilon greedy policy
                my_action = self.eps_greedy_choice(my_state)

                # store action state pairs
                episode_pairs.append((my_state, my_action))

                # update visits
                self.N[my_state.dl_sum - 1, my_state.pl_sum - 1,
                       Actions.get_value(my_action)] += 1

                # execute action
                my_state = self.env.step(my_state, my_action)

            # Update Action value function accordingly
            for curr_s, curr_a in episode_pairs:
                step = 1.0 / (self.N[curr_s.dl_sum - 1, curr_s.pl_sum - 1,
                                     Actions.get_value(curr_a)])
                error = my_state.rew - self.Q[curr_s.dl_sum - 1,
                                              curr_s.pl_sum - 1,
                                              Actions.get_value(curr_a)]
                self.Q[curr_s.dl_sum - 1, curr_s.pl_sum - 1,
                       Actions.get_value(curr_a)] += step * error

            #if episode%10000==0: print "Episode: %d, Reward: %d" %(episode, my_state.rew)
            count_wins = count_wins + 1 if my_state.rew == 1 else count_wins

        print float(count_wins) / self.iter * 100

        # Derive value function
        for d in xrange(self.env.dl_values):
            for p in xrange(self.env.pl_values):
                self.V[d, p] = max(self.Q[d, p, :])
Example #2
    def eps_greedy_choice_linear(self, state, epsilon):

        Qa = np.zeros(2)

        # epsilon greedy policy
        if random.random() > epsilon:
            for action in Actions.get_values():
                phi = self.feature_computation(state, action)
                Qa[action] = sum(phi*self.theta)
            a_next = Actions.get_action(np.argmax(Qa))
        else:
            a_next = Actions.hit if random.random()<0.5 else Actions.stick
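        # evaluate Q(state, a_next) under the current linear weights for the chosen action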
        phi = self.feature_computation(state, a_next)
        my_Qa = sum(phi*self.theta)
        return [a_next, my_Qa]
Example #3
    def eps_greedy_choice_linear(self, state, epsilon):

        Qa = np.zeros(2)

        # epsilon greedy policy
        if random.random() > epsilon:
            for action in Actions.get_values():
                phi = self.feature_computation(state, action)
                Qa[action] = sum(phi * self.theta)
            a_next = Actions.get_action(np.argmax(Qa))
        else:
            a_next = Actions.hit if random.random() < 0.5 else Actions.stick
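        # evaluate Q(state, a_next) under the current linear weights for the chosen action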
        phi = self.feature_computation(state, a_next)
        my_Qa = sum(phi * self.theta)
        return [a_next, my_Qa]
Example #4
 def __init__(self):
     self.card_min = 1  # min absolute val of card
     self.card_max = 10  # max absolute val of card
     self.dl_values = 10  # possible values for dl in state
     self.pl_values = 21  # possible values for pl in state
     self.act_values = len(
         Actions.get_values())  # number of possible actions
Example #5
    def MC_control(self, iterations):

        # Initialise
        self.iter = iterations
        self.method = "MC_control"
        count_wins = 0
        episode_pairs = []

        # Loop over episodes (complete game runs)
        for episode in xrange(self.iter):

            # reset state action pair list
            episode_pairs = []

            # get initial state for current episode
            my_state = self.env.get_initial_state()

            # Execute until game ends
            while not my_state.term:

                # choose action with epsilon greedy policy
                my_action = self.eps_greedy_choice(my_state)

                # store action state pairs
                episode_pairs.append((my_state, my_action))

                # update visits
                self.N[my_state.dl_sum-1, my_state.pl_sum-1, Actions.get_value(my_action)] += 1

                # execute action
                my_state = self.env.step(my_state, my_action)

            # Update Action value function accordingly
            for curr_s, curr_a in episode_pairs:
                step = 1.0  / (self.N[curr_s.dl_sum-1, curr_s.pl_sum-1, Actions.get_value(curr_a)])
                error = my_state.rew - self.Q[curr_s.dl_sum-1, curr_s.pl_sum-1, Actions.get_value(curr_a)]
                self.Q[curr_s.dl_sum-1, curr_s.pl_sum-1, Actions.get_value(curr_a)] += step * error

            #if episode%10000==0: print "Episode: %d, Reward: %d" %(episode, my_state.rew)
            count_wins = count_wins+1 if my_state.rew==1 else count_wins

        print float(count_wins)/self.iter*100

        # Derive value function
        for d in xrange(self.env.dl_values):
            for p in xrange(self.env.pl_values):
                self.V[d,p] = max(self.Q[d, p, :])
Example #6
    def eps_greedy_choice(self, state):

        # collect visits
        try:
            visits_to_state = sum(self.N[state.dl_sum-1, state.pl_sum-1, :])
        except IndexError:
            # out-of-range state (e.g. after bust/terminal): treat as unvisited
            visits_to_state = 0

        # compute decaying exploration rate: eps = n0 / (n0 + visits to this state)
        curr_epsilon = self.n0 / (self.n0 + visits_to_state)

        # epsilon greedy policy
        if random.random() < curr_epsilon:
            return Actions.hit if random.random()<0.5 else Actions.stick
        else:
            return Actions.get_action(np.argmax(self.Q[state.dl_sum-1, state.pl_sum-1, :]))
Example #7
    def eps_greedy_choice(self, state):

        # collect visits
        try:
            visits_to_state = sum(self.N[state.dl_sum - 1,
                                         state.pl_sum - 1, :])
        except IndexError:
            # out-of-range state (e.g. after bust/terminal): treat as unvisited
            visits_to_state = 0

        # compute decaying exploration rate: eps = n0 / (n0 + visits to this state)
        curr_epsilon = self.n0 / (self.n0 + visits_to_state)

        # epsilon greedy policy
        if random.random() < curr_epsilon:
            return Actions.hit if random.random() < 0.5 else Actions.stick
        else:
            return Actions.get_action(
                np.argmax(self.Q[state.dl_sum - 1, state.pl_sum - 1, :]))
Example #8
 def __init__(self):
     self.card_min = 1    # min absolute val of card
     self.card_max = 10   # max absolute val of card
     self.dl_values = 10  # possible values for dl in state
     self.pl_values = 21  # possible values for pl in state
     self.act_values = len(Actions.get_values())  # number of possible actions
Example #9
    def TD_control_linear(self, iterations, mlambda, avg_it):

        self.mlambda = float(mlambda)
        self.iter = iterations
        self.method = "Sarsa_control_linear_approx"

        epsilon = 0.05
        alpha = 0.01

        l_mse = 0
        e_mse = np.zeros((avg_it,self.iter))
        monte_carlo_Q = pickle.load(open("Data/Qval_func_1000000_MC_control.pkl", "rb"))
        n_elements = monte_carlo_Q.shape[0]*monte_carlo_Q.shape[1]*2

        for my_it in xrange(avg_it):

            self.Q = np.zeros((self.env.dl_values, self.env.pl_values, self.env.act_values))
            self.LinE = np.zeros(len(self.d_edges)*len(self.p_edges)*2)
            self.theta = np.random.random(36)*0.2
            #self.theta = np.zeros(len(self.d_edges)*len(self.p_edges)*2)
            count_wins = 0

            # Loop over episodes (complete game runs)
            for episode in xrange(self.iter):

                # reset eligibility traces for the new episode
                self.LinE = np.zeros_like(self.theta)
                s = self.env.get_initial_state()

                # choose the initial action with an epsilon-greedy policy over the linear value estimates
                if np.random.random() < 1-epsilon:
                    Qa = -100000
                    a = None
                    for act in Actions.get_values():
                        phi_curr = self.feature_computation(s,act)
                        Q =  sum(self.theta*phi_curr)
                        if Q > Qa:
                            Qa = Q
                            a = act
                            phi = phi_curr
                else:
                    a = Actions.stick if np.random.random()<0.5 else Actions.hit
                    phi = self.feature_computation(s,a)
                    Qa = sum(self.theta*phi)

                # Execute until game ends
                while not s.term:

                    # Accumulating traces
                    self.LinE[phi==1] += 1

                    # execute action
                    s_next = self.env.step(s, a)

                    # compute delta
                    delta = s_next.rew - sum(self.theta*phi)

                    # choose next action with epsilon greedy policy
                    if np.random.random() < 1-epsilon:
                        Qa = float(-100000)
                        a = None
                        for act in Actions.get_values():
                            phi_curr = self.feature_computation(s_next,act)
                            Q =  sum(self.theta*phi_curr)
                            if Q > Qa:
                                Qa = Q
                                a = act
                                phi = phi_curr
                    else:
                        a = Actions.stick if np.random.random()<0.5 else Actions.hit
                        phi = self.feature_computation(s_next,a)
                        Qa = sum(self.theta*phi)

                    # delta
                    delta += Qa
                    self.theta += alpha*delta*self.LinE
                    self.LinE = self.mlambda*self.LinE

                    # reassign s and a
                    s = s_next

                #if episode%10000==0: print "Episode: %d, Reward: %d" %(episode, s_next.rew)
                count_wins = count_wins+1 if s_next.rew==1 else count_wins

                self.Q = self.deriveQ()
                e_mse[my_it, episode] = np.sum(np.square(self.Q-monte_carlo_Q))/float(n_elements)

            print float(count_wins)/self.iter*100

            self.Q = self.deriveQ()
            l_mse += np.sum(np.square(self.Q-monte_carlo_Q))

        if mlambda==0 or mlambda==1:
            plt.plot(e_mse.mean(axis=0))
            plt.xlabel('episode')
            plt.ylabel('MSE vs Monte Carlo Q')
            plt.show()
        # Derive value function
        for d in xrange(self.env.dl_values):
            for p in xrange(self.env.pl_values):
                self.V[d,p] = max(self.Q[d, p, :])

        #print self.theta

        return l_mse/float(n_elements)
Example #10
    def TD_control(self, iterations, mlambda, avg_it):

        self.mlambda = float(mlambda)
        self.iter = iterations
        self.method = "Sarsa_control"

        l_mse = 0
        e_mse = np.zeros((avg_it,self.iter))

        monte_carlo_Q = pickle.load(open("Data/Qval_func_1000000_MC_control.pkl", "rb"))
        n_elements = monte_carlo_Q.shape[0]*monte_carlo_Q.shape[1]*2

        for my_it in xrange(avg_it):

            self.N = np.zeros((self.env.dl_values, self.env.pl_values, self.env.act_values))
            self.Q = np.zeros((self.env.dl_values, self.env.pl_values, self.env.act_values))
            self.E = np.zeros((self.env.dl_values, self.env.pl_values, self.env.act_values))
            count_wins = 0

            # Loop over episodes (complete game runs)
            for episode in xrange(self.iter):

                self.E = np.zeros((self.env.dl_values, self.env.pl_values, self.env.act_values))
                s = self.env.get_initial_state()
                a = self.eps_greedy_choice(s)

                # Execute until game ends
                while not s.term:

                    # update visit count
                    self.N[s.dl_sum-1, s.pl_sum-1, Actions.get_value(a)] += 1

                    # execute action
                    s_next = self.env.step(s, a)

                    # choose next action with epsilon greedy policy
                    a_next = self.eps_greedy_choice(s_next)

                    # update action value function
                    alpha = 1.0  / (self.N[s.dl_sum-1, s.pl_sum-1,  Actions.get_value(a)])
                    try:
                        delta = s_next.rew + self.Q[s_next.dl_sum-1, s_next.pl_sum-1, Actions.get_value(a_next)] -\
                            self.Q[s.dl_sum-1, s.pl_sum-1, Actions.get_value(a)]
                    except IndexError:
                        # terminal next state lies outside the table: use the reward alone as the target
                        delta = s_next.rew - self.Q[s.dl_sum-1, s.pl_sum-1, Actions.get_value(a)]
                    self.E[s.dl_sum-1, s.pl_sum-1, Actions.get_value(a)] += 1

                    # backward-view SARSA(lambda): update Q along the eligibility traces, then decay them
                    update = alpha*delta*self.E
                    self.Q = self.Q+update
                    self.E = self.mlambda*self.E

                    # reassign s and a
                    s = s_next
                    a = a_next

                #if episode%10000==0: print "Episode: %d, Reward: %d" %(episode, s_next.rew)
                count_wins = count_wins+1 if s_next.rew==1 else count_wins

                e_mse[my_it, episode] = np.sum(np.square(self.Q-monte_carlo_Q))/float(n_elements)

            print float(count_wins)/self.iter*100
            l_mse += np.sum(np.square(self.Q-monte_carlo_Q))/float(n_elements)
            #print n_elements

        if mlambda==0 or mlambda==1:
            plt.plot(e_mse.mean(axis=0))
            plt.xlabel('episode')
            plt.ylabel('MSE vs Monte Carlo Q')
            plt.show()

        # Derive value function
        for d in xrange(self.env.dl_values):
            for p in xrange(self.env.pl_values):
                self.V[d,p] = max(self.Q[d, p, :])

        return float(l_mse)/avg_it
Example #11
    def TD_control_linear(self, iterations, mlambda, avg_it):

        self.mlambda = float(mlambda)
        self.iter = iterations
        self.method = "Sarsa_control_linear_approx"

        epsilon = 0.05
        alpha = 0.01

        l_mse = 0
        e_mse = np.zeros((avg_it, self.iter))
        monte_carlo_Q = pickle.load(
            open("Data/Qval_func_1000000_MC_control.pkl", "rb"))
        n_elements = monte_carlo_Q.shape[0] * monte_carlo_Q.shape[1] * 2

        for my_it in xrange(avg_it):

            self.Q = np.zeros(
                (self.env.dl_values, self.env.pl_values, self.env.act_values))
            self.LinE = np.zeros(len(self.d_edges) * len(self.p_edges) * 2)
            self.theta = np.random.random(36) * 0.2
            #self.theta = np.zeros(len(self.d_edges)*len(self.p_edges)*2)
            count_wins = 0

            # Loop over episodes (complete game runs)
            for episode in xrange(self.iter):

                # reset eligibility traces for the new episode
                self.LinE = np.zeros_like(self.theta)
                s = self.env.get_initial_state()

                # choose the initial action with an epsilon-greedy policy over the linear value estimates
                if np.random.random() < 1 - epsilon:
                    Qa = -100000
                    a = None
                    for act in Actions.get_values():
                        phi_curr = self.feature_computation(s, act)
                        Q = sum(self.theta * phi_curr)
                        if Q > Qa:
                            Qa = Q
                            a = act
                            phi = phi_curr
                else:
                    a = Actions.stick if np.random.random() < 0.5 else Actions.hit
                    phi = self.feature_computation(s, a)
                    Qa = sum(self.theta * phi)

                # Execute until game ends
                while not s.term:

                    # Accumulating traces
                    self.LinE[phi == 1] += 1

                    # execute action
                    s_next = self.env.step(s, a)

                    # compute delta
                    delta = s_next.rew - sum(self.theta * phi)

                    # choose next action with epsilon greedy policy
                    if np.random.random() < 1 - epsilon:
                        Qa = float(-100000)
                        a = None
                        for act in Actions.get_values():
                            phi_curr = self.feature_computation(s_next, act)
                            Q = sum(self.theta * phi_curr)
                            if Q > Qa:
                                Qa = Q
                                a = act
                                phi = phi_curr
                    else:
                        a = Actions.stick if np.random.random() < 0.5 else Actions.hit
                        phi = self.feature_computation(s_next, a)
                        Qa = sum(self.theta * phi)

                    # delta
                    delta += Qa
                    self.theta += alpha * delta * self.LinE
                    self.LinE = self.mlambda * self.LinE

                    # reassign s and a
                    s = s_next

                #if episode%10000==0: print "Episode: %d, Reward: %d" %(episode, s_next.rew)
                count_wins = count_wins + 1 if s_next.rew == 1 else count_wins

                self.Q = self.deriveQ()
                e_mse[my_it, episode] = np.sum(
                    np.square(self.Q - monte_carlo_Q)) / float(n_elements)

            print float(count_wins) / self.iter * 100

            self.Q = self.deriveQ()
            l_mse += np.sum(np.square(self.Q - monte_carlo_Q))

        if mlambda == 0 or mlambda == 1:
            plt.plot(e_mse.mean(axis=0))
            plt.xlabel('episode')
            plt.ylabel('MSE vs Monte Carlo Q')
            plt.show()
        # Derive value function
        for d in xrange(self.env.dl_values):
            for p in xrange(self.env.pl_values):
                self.V[d, p] = max(self.Q[d, p, :])

        #print self.theta

        return l_mse / float(n_elements)
Example #12
    def TD_control(self, iterations, mlambda, avg_it):

        self.mlambda = float(mlambda)
        self.iter = iterations
        self.method = "Sarsa_control"

        l_mse = 0
        e_mse = np.zeros((avg_it, self.iter))

        monte_carlo_Q = pickle.load(
            open("Data/Qval_func_1000000_MC_control.pkl", "rb"))
        n_elements = monte_carlo_Q.shape[0] * monte_carlo_Q.shape[1] * 2

        for my_it in xrange(avg_it):

            self.N = np.zeros(
                (self.env.dl_values, self.env.pl_values, self.env.act_values))
            self.Q = np.zeros(
                (self.env.dl_values, self.env.pl_values, self.env.act_values))
            self.E = np.zeros(
                (self.env.dl_values, self.env.pl_values, self.env.act_values))
            count_wins = 0

            # Loop over episodes (complete game runs)
            for episode in xrange(self.iter):

                self.E = np.zeros((self.env.dl_values, self.env.pl_values,
                                   self.env.act_values))
                s = self.env.get_initial_state()
                a = self.eps_greedy_choice(s)

                # Execute until game ends
                while not s.term:

                    # update visit count
                    self.N[s.dl_sum - 1, s.pl_sum - 1,
                           Actions.get_value(a)] += 1

                    # execute action
                    s_next = self.env.step(s, a)

                    # choose next action with epsilon greedy policy
                    a_next = self.eps_greedy_choice(s_next)

                    # update action value function
                    alpha = 1.0 / (self.N[s.dl_sum - 1, s.pl_sum - 1,
                                          Actions.get_value(a)])
                    try:
                        delta = s_next.rew + self.Q[s_next.dl_sum-1, s_next.pl_sum-1, Actions.get_value(a_next)] -\
                            self.Q[s.dl_sum-1, s.pl_sum-1, Actions.get_value(a)]
                    except IndexError:
                        # terminal next state lies outside the table: use the reward alone as the target
                        delta = s_next.rew - self.Q[s.dl_sum - 1, s.pl_sum - 1,
                                                    Actions.get_value(a)]
                    self.E[s.dl_sum - 1, s.pl_sum - 1,
                           Actions.get_value(a)] += 1

                    # backward-view SARSA(lambda): update Q along the eligibility traces, then decay them
                    update = alpha * delta * self.E
                    self.Q = self.Q + update
                    self.E = self.mlambda * self.E

                    # reassign s and a
                    s = s_next
                    a = a_next

                #if episode%10000==0: print "Episode: %d, Reward: %d" %(episode, s_next.rew)
                count_wins = count_wins + 1 if s_next.rew == 1 else count_wins

                e_mse[my_it, episode] = np.sum(
                    np.square(self.Q - monte_carlo_Q)) / float(n_elements)

            print float(count_wins) / self.iter * 100
            l_mse += np.sum(
                np.square(self.Q - monte_carlo_Q)) / float(n_elements)
            #print n_elements

        if mlambda == 0 or mlambda == 1:
            plt.plot(e_mse.mean(axis=0))
            plt.xlabel('episode')
            plt.ylabel('MSE vs Monte Carlo Q')
            plt.show()

        # Derive value function
        for d in xrange(self.env.dl_values):
            for p in xrange(self.env.pl_values):
                self.V[d, p] = max(self.Q[d, p, :])

        return float(l_mse) / avg_it
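Note: every example above relies on an Actions helper (hit, stick, get_values, get_value, get_action) whose implementation is not shown. The sketch below is only an assumed, minimal version that is consistent with how those names are used (integer-coded actions indexing the last axis of N and Q, and Qa = np.zeros(2)); it is not the original implementation.

class Actions(object):
    # hypothetical sketch: two integer-coded actions, matching Qa = np.zeros(2)
    hit = 0
    stick = 1

    @staticmethod
    def get_values():
        # integer codes of all actions, used as the last index of N and Q
        return [Actions.hit, Actions.stick]

    @staticmethod
    def get_value(action):
        # action -> integer code (identity here, since actions are already ints)
        return action

    @staticmethod
    def get_action(value):
        # integer code (e.g. from np.argmax) -> action
        return Actions.hit if value == Actions.hit else Actions.stick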