import pickle
import random
from collections import defaultdict
from multiprocessing import Process

import numpy as np

# Project-local dependencies assumed to be importable/defined elsewhere in the
# repo: the environment, utilities, policies and plotting modules, the
# monte_carlo routine, the _random_action helper, and the HIT/STICK action
# constants.


def epsilon_greedy_lfa(epsilon, theta, state_features):
    """
    Epsilon-greedy policy for linear function approximation: returns a random
    action with probability epsilon, otherwise the highest-valued action.

    :param epsilon: random action probability
    :param theta: weight vector for the state-action feature vector
    :param state_features: current state feature vector
    :return: (estimated action value, action to take)
    """
    # exploration: pick a random action with probability epsilon
    if random.random() < epsilon:
        act = _random_action()
        val = utilities.get_state_action_features(state_features, act).dot(theta)
        return val, act
    # exploitation: pick the action with the highest estimated value
    else:
        value_HIT = utilities.get_state_action_features(state_features, 0).dot(theta)
        value_STICK = utilities.get_state_action_features(state_features, 1).dot(theta)
        if value_HIT > value_STICK:
            return value_HIT, HIT
        elif value_STICK > value_HIT:
            return value_STICK, STICK
        # tie: break it with a random action
        else:
            act = _random_action()
            val = utilities.get_state_action_features(state_features, act).dot(theta)
            return val, act
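# A minimal usage sketch (not part of the original module): it assumes the
# project's environment and utilities modules resolve as described above and
# that actions are encoded as 0 (HIT) and 1 (STICK), matching the feature
# indices used in epsilon_greedy_lfa. The _demo_epsilon_greedy name is
# hypothetical.
def _demo_epsilon_greedy(epsilon=0.05):
    """Draw an initial state and pick a single action with the policy above."""
    # with all-zero weights both action values tie, so the tie-breaking
    # (random) branch is exercised unless exploration fires first
    theta = np.zeros(36)
    state = environment.State()
    state_features = utilities.get_state_features(state)
    value, action = epsilon_greedy_lfa(epsilon, theta, state_features)
    return value, action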
def linear_function_approximation(l=0.9, max_episodes=1000,
                                  policy=policies.epsilon_greedy_lfa, n_zero=100,
                                  gamma=1, plot_learning_curve=True, multiproc=True):
    """
    Value function approximation using coarse coding, trained with SARSA(lambda).

    :param l: lambda parameter (eligibility trace decay)
    :param gamma: discounting rate
    :param max_episodes: stop learning after this many episodes
    :param policy: exploration strategy to use
    :param n_zero: epsilon greedy constant (unused here; epsilon is fixed below)
    :param plot_learning_curve: whether to record and plot the per-episode MSE
        against the Monte Carlo value function (only for lambda in {0, 1})
    :param multiproc: whether to use multiprocessing when doing plots
        (warning! turn off if running multiple algorithms on Mac or Windows
        simultaneously)
    :return: value function after max_episodes
    """
    # weight vector for the state-action feature vector
    theta = np.random.random(36) * 0.2
    # random move probability
    epsilon = 0.05
    # step-size parameter
    alpha = 0.01

    # learning curve plotting: load (or recompute) the Monte Carlo baseline
    if l in {0, 1} and plot_learning_curve:
        learning_curve = []
        try:
            with open("Data/MC_value_function.pickle", "rb") as f:
                mc_values = pickle.load(f)
        except (IOError, OSError, pickle.UnpicklingError):
            mc_values = monte_carlo(iterations=1000000)

    for episode in range(max_episodes):
        # accumulating eligibility trace over the state-action feature vector
        eligibility_trace = np.zeros(36)

        # initial state, action [SA..] and their feature vectors
        state = environment.State()
        state_features_current = utilities.get_state_features(state)
        q_a_current, action_current = policy(epsilon, theta, state_features_current)
        features_current = utilities.get_state_action_features(
            state_features_current, action_current)

        while not state.terminal:
            # update eligibility trace (accumulating)
            eligibility_trace = np.add(eligibility_trace, features_current)

            # take a step, get reward [..R..]
            state, reward = environment.step(state, action_current)
            if reward is None:
                reward = 0

            # follow-up state, action [..SA]
            state_features_next = utilities.get_state_features(state)
            q_a_next, action_next = policy(epsilon, theta, state_features_next)
            features_next = utilities.get_state_action_features(
                state_features_next, action_next)

            # TD error; do not bootstrap from a terminal state
            if state.terminal:
                delta = reward - q_a_current
            else:
                delta = reward + gamma * q_a_next - q_a_current

            # update weights, then decay the trace
            theta = np.add(theta, alpha * delta * eligibility_trace)
            eligibility_trace *= gamma * l

            # move on to the next state-action pair
            features_current = features_next
            action_current = action_next
            q_a_current = q_a_next

        # calculate the value function over the whole (player, dealer, action) grid
        value_function = defaultdict(float)
        for player in range(1, 22):
            for dealer in range(1, 11):
                for action in [0, 1]:
                    s = environment.State(dealer, player)
                    phi = utilities.get_state_action_features(
                        utilities.get_state_features(s), action)
                    value_function[(s.player_sum, s.dealer_first_card, action)] = phi.dot(theta)

        # record the episode MSE for the learning curve
        if l in {0, 1} and plot_learning_curve:
            learning_curve.append(
                (episode, utilities.calculate_mse(mc_values, value_function)))

    # plot learning curves
    if l in {0, 1} and plot_learning_curve:
        if multiproc:
            # create a new process so computation can continue after plotting
            p = Process(target=plotting.plot_learning_curve, args=(learning_curve, l))
            p.start()
        else:
            plotting.plot_learning_curve(learning_curve, l)

    return value_function
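# A minimal run sketch (an assumption, not part of the original repo's entry
# point): train SARSA(lambda) with the defaults above, skipping the learning
# curve so the Monte Carlo baseline is not needed, then print the learned
# value of a few (player_sum, dealer_first_card, action) triples.
if __name__ == "__main__":
    q = linear_function_approximation(l=0.9, max_episodes=1000,
                                      plot_learning_curve=False)
    for key in [(21, 10, 1), (16, 10, 0), (11, 2, 0)]:
        print("Q%s = %.4f" % (key, q[key]))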