def monte_carlo_control(self, iters):
    """Monte-Carlo control: every-visit MC updates with step size 1/N(s, a)."""
    num_wins = 0
    optimal_policy = np.zeros((self.env.dealer_values, self.env.player_values))

    for episode in range(iters):
        state_episode = self.env.get_initial_state()
        reward_episode = 0
        history = []

        # sample one episode with the epsilon-greedy behaviour policy
        while not state_episode.terminal:
            action = self.epsilon_greedy(state_episode)
            history.append([state_episode, action, reward_episode])

            # update the number of visits to this state-action pair
            self.N[state_episode.dealer_card - 1,
                   state_episode.player_sum - 1,
                   Action.get_value(action)] += 1

            reward_episode, state_episode = self.env.step(state_episode, action)

        # the return G_t is the final reward of the episode (intermediate
        # rewards are zero and there is no discounting)
        Gt = reward_episode

        # update Q for every state-action pair visited in the episode
        for state, action, reward in history:
            step_size = 1.0 / self.N[state.dealer_card - 1,
                                     state.player_sum - 1,
                                     Action.get_value(action)]
            error = Gt - self.Q[state.dealer_card - 1,
                                state.player_sum - 1,
                                Action.get_value(action)]
            self.Q[state.dealer_card - 1,
                   state.player_sum - 1,
                   Action.get_value(action)] += step_size * error

        # count each winning episode once for the statistic printed below
        if Gt == 1:
            num_wins += 1

    print("Percentage of wins: %.3f" % (num_wins / iters * 100.0))

    # derive the greedy policy and the state-value function from Q
    for (dealer_sum, player_sum), value in np.ndenumerate(self.V):
        if self.Q[dealer_sum, player_sum, 1] > self.Q[dealer_sum, player_sum, 0]:
            optimal_policy[dealer_sum, player_sum] = 1
        self.V[dealer_sum, player_sum] = max(self.Q[dealer_sum, player_sum, :])

    return optimal_policy
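
# compute_mse is called by linear_sarsa and td_learning below but is not defined
# in this section. The sketch below is an assumption of what it does (mean
# squared error between two action-value tables of the same shape), not the
# original implementation.
import numpy as np

def compute_mse(Q_estimate, Q_target):
    """Mean squared error between two Q tables (assumed helper)."""
    return np.mean((Q_estimate - Q_target) ** 2)
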
def linear_sarsa(self, iters, lambda_, compare_to_montecarlo=False):
    """Sarsa(lambda) with a linear function approximation of the action-value function."""
    if compare_to_montecarlo:
        # train a Monte-Carlo baseline to measure the approximation error against
        monte_carlo_iterations = 1000000
        env = Environment()
        agent = Agent(env)
        agent.monte_carlo_control(monte_carlo_iterations)
        Q_monte_carlo = agent.Q
        mse_all = []

    for episode in range(iters):
        # eligibility trace over the feature weights
        E = np.zeros(self.number_of_features)

        # initialize state and action
        state = self.env.get_initial_state()
        reward = 0
        action = self.epsilon_greedy_linear_constant(state)

        while not state.terminal:
            # update the number of visits to this state-action pair
            self.N[state.dealer_card - 1,
                   state.player_sum - 1,
                   Action.get_value(action)] += 1

            reward, state_forward = self.env.step(state, action)
            action_forward = self.epsilon_greedy_linear_constant(state_forward)

            # TD target: bootstrap from Q(s', a') unless s' is terminal
            if not state_forward.terminal:
                current_estimate = reward + self.estimate_Q(state_forward, action_forward)
            else:
                current_estimate = reward
            previous_estimate = self.estimate_Q(state, action)
            delta = current_estimate - previous_estimate

            # accumulate the trace, take a gradient step with a constant step
            # size, then decay the trace by lambda
            E += self.get_feature_vector(state, action)
            step_size = 0.01
            self.weights += step_size * delta * E
            E = lambda_ * E

            action = action_forward
            state = state_forward

        if compare_to_montecarlo:
            mse_all.append(compute_mse(self.approximation_to_Q(), Q_monte_carlo))

    if compare_to_montecarlo:
        plt.plot(range(iters), mse_all, 'r-')
        plt.xlabel("episodes")
        plt.ylabel("MSE")
        plt.show()

    # expand the learned weights back into the tabular Q and V
    for (dealer_sum, player_sum), value in np.ndenumerate(self.V):
        s = State(dealer_sum + 1, player_sum + 1)
        self.Q[dealer_sum, player_sum, 0] = np.dot(self.get_feature_vector(s, Action.hit), self.weights)
        self.Q[dealer_sum, player_sum, 1] = np.dot(self.get_feature_vector(s, Action.stick), self.weights)
        self.V[dealer_sum, player_sum] = max(self.estimate_Q(s, Action.hit), self.estimate_Q(s, Action.stick))
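
# get_feature_vector and estimate_Q are used by linear_sarsa above but are not
# defined in this section. The sketch below is purely illustrative: it assumes
# a binary coarse coding over overlapping dealer/player intervals and the two
# actions, with hit mapped to index 0 and stick to index 1 as in the rest of
# the code. The actual intervals and self.number_of_features may differ.
DEALER_INTERVALS_SKETCH = [(1, 4), (4, 7), (7, 10)]
PLAYER_INTERVALS_SKETCH = [(1, 6), (4, 9), (7, 12), (10, 15), (13, 18), (16, 21)]

def get_feature_vector_sketch(state, action):
    """Binary feature vector phi(s, a) under the assumed coarse coding."""
    phi = np.zeros(len(DEALER_INTERVALS_SKETCH) * len(PLAYER_INTERVALS_SKETCH) * 2)
    i = 0
    for d_lo, d_hi in DEALER_INTERVALS_SKETCH:
        for p_lo, p_hi in PLAYER_INTERVALS_SKETCH:
            for a in (0, 1):
                if (d_lo <= state.dealer_card <= d_hi
                        and p_lo <= state.player_sum <= p_hi
                        and Action.get_value(action) == a):
                    phi[i] = 1
                i += 1
    return phi

def estimate_Q_sketch(weights, state, action):
    """Linear estimate Q(s, a) = phi(s, a) . w under the same assumption."""
    return np.dot(get_feature_vector_sketch(state, action), weights)
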
def td_learning(self, iters, lambda_, compare_to_montecarlo=False, trace=Trace.accumulating):
    """Tabular Sarsa(lambda) with accumulating, replacing or dutch eligibility traces."""
    if compare_to_montecarlo:
        # train a Monte-Carlo baseline to measure the TD estimates against
        monte_carlo_iterations = 1000000
        env = Environment()
        agent = Agent(env)
        agent.monte_carlo_control(monte_carlo_iterations)
        Q_monte_carlo = agent.Q
        mse_all = []

    for episode in range(iters):
        # eligibility trace over all state-action pairs
        E = np.zeros((self.env.dealer_values, self.env.player_values, self.env.action_values))

        # initialize state and action
        state = self.env.get_initial_state()
        reward = 0
        action = self.epsilon_greedy(state)

        while not state.terminal:
            # update the number of visits to this state-action pair
            self.N[state.dealer_card - 1,
                   state.player_sum - 1,
                   Action.get_value(action)] += 1

            reward, state_forward = self.env.step(state, action)
            action_forward = self.epsilon_greedy(state_forward)

            # TD target: bootstrap from Q(s', a') unless s' is terminal
            if not state_forward.terminal:
                current_estimate = reward + self.Q[state_forward.dealer_card - 1,
                                                   state_forward.player_sum - 1,
                                                   Action.get_value(action_forward)]
            else:
                current_estimate = reward
            previous_estimate = self.Q[state.dealer_card - 1,
                                       state.player_sum - 1,
                                       Action.get_value(action)]
            delta = current_estimate - previous_estimate

            step_size = 1.0 / self.N[state.dealer_card - 1,
                                     state.player_sum - 1,
                                     Action.get_value(action)]

            # update the eligibility trace of the current state-action pair
            idx = (state.dealer_card - 1, state.player_sum - 1, Action.get_value(action))
            if trace == Trace.accumulating:
                E[idx] += 1
            elif trace == Trace.replacing:
                E[idx] = 1
            elif trace == Trace.dutch:
                E[idx] += step_size * (1 - E[idx])

            # dutch traces already fold the step size into the trace update
            if trace == Trace.dutch:
                self.Q += delta * E
            else:
                self.Q += step_size * delta * E
            E = lambda_ * E

            action = action_forward
            state = state_forward

        if compare_to_montecarlo:
            mse_all.append(compute_mse(self.Q, Q_monte_carlo))

    if compare_to_montecarlo:
        plt.plot(range(iters), mse_all, 'r-')
        plt.xlabel("episodes")
        plt.ylabel("MSE")
        plt.show()

    # update the state-value function from the learned action values
    for (dealer_sum, player_sum), value in np.ndenumerate(self.V):
        self.V[dealer_sum, player_sum] = max(self.Q[dealer_sum, player_sum, :])
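
# A possible driver, mirroring how the comparison branches above construct the
# environment and agent. The 1000000 count matches monte_carlo_iterations
# above; the 10000-episode counts and lambda values are illustrative choices.
if __name__ == "__main__":
    env = Environment()

    agent = Agent(env)
    agent.monte_carlo_control(1000000)

    agent = Agent(env)
    agent.td_learning(10000, lambda_=0.5, compare_to_montecarlo=True, trace=Trace.accumulating)

    agent = Agent(env)
    agent.linear_sarsa(10000, lambda_=0.5, compare_to_montecarlo=True)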