Example no. 1
    def monte_carlo_control(self, iters):
        """
        Monte-Carlo control algorithm
        """
        num_wins = 0
        optimal_policy = np.zeros((self.env.dealer_values, self.env.player_values))
        for episode in range(0, iters):
            state_episode = self.env.get_initial_state()
            reward_episode = 0
            history = []
            # sample one episode with the current epsilon-greedy policy
            while not state_episode.terminal:
                action = self.epsilon_greedy(state_episode)

                history.append([state_episode, action, reward_episode])
                # update number of visits
                self.N[state_episode.dealer_card - 1, state_episode.player_sum - 1, Action.get_value(action)] += 1

                [reward_episode, state_episode] = self.env.step(state_episode, action)

            # the Monte-Carlo return used for every step is the final reward of
            # the episode (no discounting; intermediate rewards are zero here)
            Gt = reward_episode
            # update Q towards the return with a step size of 1 / N(s, a)
            for state, action, reward in history:
                step_size = 1.0 / self.N[state.dealer_card - 1, state.player_sum - 1, Action.get_value(action)]
                error = Gt - self.Q[state.dealer_card - 1, state.player_sum - 1, Action.get_value(action)]
                self.Q[state.dealer_card - 1, state.player_sum - 1, Action.get_value(action)] += step_size * error
            if Gt == 1:
                num_wins += 1

        print("Win percentage: %.3f" % (100.0 * num_wins / iters))
        # update policy and state-value function based on the action-value function
        for (dealer_sum, player_sum), value in np.ndenumerate(self.V):
            if self.Q[dealer_sum, player_sum, 1] > self.Q[dealer_sum, player_sum, 0]:
                optimal_policy[dealer_sum, player_sum] = 1
            self.V[dealer_sum, player_sum] = max(self.Q[dealer_sum, player_sum, :])
        return optimal_policy
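
The method above relies on an epsilon_greedy helper that is not shown in this example. The sketch below is an assumption about what it might look like: a GLIE-style schedule eps = N0 / (N0 + N(s)) with an illustrative constant N0 = 100, using the same self.N / self.Q indexing as the code above.

    def epsilon_greedy(self, state):
        # Assumed helper (not part of the original example): epsilon decays
        # with the number of visits to the state, eps = N0 / (N0 + N(s)).
        if state.terminal:
            # the choice is irrelevant once the episode has ended
            return Action.stick
        N0 = 100.0  # illustrative constant
        visits = np.sum(self.N[state.dealer_card - 1, state.player_sum - 1, :])
        epsilon = N0 / (N0 + visits)
        if np.random.rand() < epsilon:
            # explore: random action
            return Action.hit if np.random.rand() < 0.5 else Action.stick
        # exploit: greedy action with respect to the current Q estimate
        greedy_index = np.argmax(self.Q[state.dealer_card - 1, state.player_sum - 1, :])
        return Action.hit if greedy_index == Action.get_value(Action.hit) else Action.stick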
Example no. 2
    def linear_sarsa(self, iters, lambda_, compare_to_monctecarlo=False):
        """
        SARSA(lambda) with linear function approximation
        """
        if compare_to_monctecarlo:
            # build a reference Q with a long Monte-Carlo run for the MSE comparison
            monte_carlo_iterations = 1000000
            env = Environment()
            agent = Agent(env)
            agent.monte_carlo_control(monte_carlo_iterations)
            Q_monte_carlo = agent.Q
            mse_all = []
            
        for episode in range(0, iters):
            E = np.zeros(self.number_of_features)  # eligibility trace over the feature weights
            # initialize state and action
            state = self.env.get_initial_state()
            reward = 0
            action = self.epsilon_greedy_linear_constant(state)
            while not state.terminal:
                # update number of visits
                self.N[state.dealer_card - 1, state.player_sum - 1, Action.get_value(action)] += 1
                [reward, state_forward] = self.env.step(state, action)                 
                action_forward = self.epsilon_greedy_linear_constant(state_forward)  
                
                # SARSA target: r + Q(s', a'), or just r when s' is terminal
                if not state_forward.terminal:
                    current_estimate = reward + self.estimate_Q(state_forward, action_forward)
                else:
                    current_estimate = reward
                    
                previous_estimate = self.estimate_Q(state, action)
                delta = current_estimate - previous_estimate

                # accumulating eligibility trace over the active features
                E = np.add(E, self.get_feature_vector(state, action))
                step_size = 0.01
                self.weights += step_size * delta * E
                # decay the trace (gamma = 1 in this environment)
                E = lambda_ * E

                action = action_forward
                state = state_forward
            if compare_to_monctecarlo:
                mse_all.append(compute_mse(self.approximation_to_Q(), Q_monte_carlo))
  
        if compare_to_monctecarlo:
            plt.plot(range(0, iters), mse_all, 'r-')
            plt.xlabel("episodes")
            plt.ylabel("MSE")
            # plt.title("lambda = 0")
            plt.show()
            
        # fill the tabular Q and V from the learned linear approximation
        for (dealer_sum, player_sum), value in np.ndenumerate(self.V):
            s = State(dealer_sum + 1, player_sum + 1)
            self.Q[dealer_sum, player_sum, 0] = np.dot(self.get_feature_vector(s, Action.hit), self.weights)
            self.Q[dealer_sum, player_sum, 1] = np.dot(self.get_feature_vector(s, Action.stick), self.weights)
            self.V[dealer_sum, player_sum] = max(self.estimate_Q(s, Action.hit), self.estimate_Q(s, Action.stick))
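
Example no. 2 depends on helpers that are not shown here: epsilon_greedy_linear_constant, get_feature_vector, estimate_Q and approximation_to_Q. The sketch below is one plausible implementation, assuming the usual binary coarse coding over overlapping dealer intervals {[1,4], [4,7], [7,10]} and player intervals {[1,6], [4,9], ..., [16,21]}, i.e. 3 x 6 x 2 = 36 features (so self.number_of_features is assumed to be 36), and a constant exploration rate of 0.05.

    def get_feature_vector(self, state, action):
        # Assumed helper: binary coarse coding over overlapping dealer/player
        # intervals and the chosen action (3 x 6 x 2 = 36 features).
        dealer_intervals = [(1, 4), (4, 7), (7, 10)]
        player_intervals = [(1, 6), (4, 9), (7, 12), (10, 15), (13, 18), (16, 21)]
        phi = np.zeros(self.number_of_features)
        idx = 0
        for d_lo, d_hi in dealer_intervals:
            for p_lo, p_hi in player_intervals:
                for a in (Action.hit, Action.stick):
                    if (d_lo <= state.dealer_card <= d_hi
                            and p_lo <= state.player_sum <= p_hi
                            and a == action):
                        phi[idx] = 1.0
                    idx += 1
        return phi

    def estimate_Q(self, state, action):
        # Linear approximation: Q(s, a) is the dot product of the feature
        # vector with the learned weights.
        return np.dot(self.get_feature_vector(state, action), self.weights)

    def epsilon_greedy_linear_constant(self, state, epsilon=0.05):
        # Assumed helper: constant-epsilon exploration over the approximated Q.
        if np.random.rand() < epsilon:
            return Action.hit if np.random.rand() < 0.5 else Action.stick
        if self.estimate_Q(state, Action.hit) >= self.estimate_Q(state, Action.stick):
            return Action.hit
        return Action.stick

    def approximation_to_Q(self):
        # Assumed helper: expand the linear approximation into a full Q table
        # for comparison against the Monte-Carlo estimate.
        Q = np.zeros((self.env.dealer_values, self.env.player_values, self.env.action_values))
        for (dealer_sum, player_sum), _ in np.ndenumerate(Q[:, :, 0]):
            s = State(dealer_sum + 1, player_sum + 1)
            Q[dealer_sum, player_sum, 0] = self.estimate_Q(s, Action.hit)
            Q[dealer_sum, player_sum, 1] = self.estimate_Q(s, Action.stick)
        return Q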
Example no. 3
    def td_learning(self, iters, lambda_, compare_to_monctecarlo=False, trace=Trace.accumulating):
        """
        Tabular SARSA(lambda) algorithm with eligibility traces
        """
        if compare_to_monctecarlo:
            monte_carlo_iterations = 1000000
            env = Environment()
            agent = Agent(env)
            agent.monte_carlo_control(monte_carlo_iterations)
            Q_monte_carlo = agent.Q
            mse_all = []
            
        for episode in range(0, iters):
            E = np.zeros((self.env.dealer_values, self.env.player_values, self.env.action_values))
            
            #initialize state and action          
            state = self.env.get_initial_state()
            reward = 0
            action = self.epsilon_greedy(state)
            while not state.terminal:
                # update number of visits
                self.N[state.dealer_card - 1, state.player_sum - 1, Action.get_value(action)] += 1
                [reward, state_forward] = self.env.step(state, action)                 
                action_forward = self.epsilon_greedy(state_forward)  
                
                if not state_forward.terminal:
                    current_estimate = reward + self.Q[state_forward.dealer_card - 1, state_forward.player_sum - 1, Action.get_value(action_forward)]
                else:
                    current_estimate = reward
                    
                idx = (state.dealer_card - 1, state.player_sum - 1, Action.get_value(action))
                previous_estimate = self.Q[idx]
                delta = current_estimate - previous_estimate

                step_size = 1.0 / self.N[idx]
                # update the eligibility trace of the current state-action pair
                if trace == Trace.accumulating:
                    E[idx] += 1
                elif trace == Trace.replacing:
                    E[idx] = 1
                elif trace == Trace.dutch:
                    E[idx] = E[idx] + step_size * (1 - E[idx])

                if trace == Trace.dutch:
                    # dutch traces already fold the step size into E
                    self.Q += delta * E
                else:
                    self.Q += step_size * delta * E
                E = lambda_ * E
              
                action = action_forward
                state = state_forward
            
            if compare_to_monctecarlo:
                mse_all.append(compute_mse(self.Q, Q_monte_carlo))
  
        if compare_to_monctecarlo:
            plt.plot(range(0, iters), mse_all, 'r-')
            plt.xlabel("episodes")
            plt.ylabel("MSE")
            # plt.title("lambda = 1")
            plt.show()
                                 
        #update policy based on action-value function
        for (dealer_sum, player_sum), value in np.ndenumerate(self.V):
            self.V[dealer_sum, player_sum] = max(self.Q[dealer_sum, player_sum, :])
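
Neither compute_mse nor a driver appears in these examples. The sketch below assumes compute_mse is the element-wise mean squared error between two Q tables of the same shape; the iteration count and lambda in the driver are illustrative only, and Environment, Agent and Trace are the classes used throughout the examples, with numpy and matplotlib imported at module level as in the original code.

def compute_mse(Q_estimate, Q_reference):
    # Assumed helper: mean squared error between two Q tables of equal shape.
    return np.mean((Q_estimate - Q_reference) ** 2)

if __name__ == "__main__":
    env = Environment()
    agent = Agent(env)
    # Tabular SARSA(lambda) with accumulating traces, compared against a
    # long Monte-Carlo run (illustrative settings).
    agent.td_learning(10000, lambda_=0.5, compare_to_monctecarlo=True,
                      trace=Trace.accumulating)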