import pickle
import random
from collections import defaultdict
from multiprocessing import Process

import numpy as np

# Project-local modules assumed to sit alongside this file; `policies` is
# assumed to export the epsilon_greedy_lfa policy defined below, and the
# `monte_carlo` module the Monte Carlo baseline used for the learning curve.
import environment
import plotting
import policies
import utilities
from monte_carlo import monte_carlo

# Action encoding inferred from the feature lookups below: HIT is action 0,
# STICK is action 1.
HIT = 0
STICK = 1


def _random_action():
    """ Uniformly random action (assumed helper, defined here for completeness). """
    return random.choice([HIT, STICK])


def epsilon_greedy_lfa(epsilon, theta, state_features):
    """ Epsilon greedy policy, for linear function approximation,
    which returns random action with probability epsilon, highest value action otherwise.

    :param epsilon: random action probability
    :param value_function: [state_action_feature_vector] value function
    :param state: current state feature vector
    :return: action to take
    """

    # exploration
    if random.random() < epsilon:
        act = _random_action()
        val = utilities.get_state_action_features(state_features, act).dot(theta)
        return val, act
    # exploitation
    else:
        value_HIT = utilities.get_state_action_features(state_features, 0).dot(theta)
        value_STICK = utilities.get_state_action_features(state_features, 1).dot(theta)

        if value_HIT > value_STICK:
            return value_HIT, HIT
        elif value_STICK > value_HIT:
            return value_STICK, STICK
        else:
            act = _random_action()
            val = utilities.get_state_action_features(state_features, act).dot(theta)
            return val, act
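

def _demo_epsilon_greedy_lfa():
    """ Hedged usage sketch for the policy above (not called by the training
    code); assumes environment.State() and the 36-dim coarse coding used in
    utilities. With all-zero weights every Q(s, a) is 0, so the tie-breaking
    branch returns a random action. """
    state = environment.State()
    phi_s = utilities.get_state_features(state)
    theta_demo = np.zeros(36)
    q, action = epsilon_greedy_lfa(0.05, theta_demo, phi_s)
    return q, action
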
def linear_function_approximation(l=0.9,
                                  max_episodes=1000,
                                  policy=policies.epsilon_greedy_lfa,
                                  n_zero=100,
                                  gamma=1,
                                  plot_learning_curve=True,
                                  multiproc=True):
    """ Value function approximation using coarse coding

    :param l: lambda parameter
    :param gamma: discounting rate
    :param max_episodes: stop learning after this many episodes
    :param policy: exploration strategy to use
    :param n_zero: epsilon greedy constant (only applicable if epsilon greedy policy is used)
    :param multiproc: whether to use multiprocessing when doing plots or not (warning! turn off if running multiple
        algorithms on mac or windows simultaneously)
    :return: value function after max_episodes
    """
    # weights for the state-action feature vector; the 36 features are assumed
    # to come from a 3 (dealer) x 6 (player) x 2 (action) coarse coding
    theta = np.random.random(36) * 0.2
    # random move probability
    epsilon = 0.05
    # step-size parameter
    alpha = 0.01

    # learning curve plotting (per-episode MSE against a Monte Carlo baseline)
    if l in {0, 1} and plot_learning_curve:
        learning_curve = []
        try:
            with open("Data/MC_value_function.pickle", "rb") as f:
                mc_values = pickle.load(f)
        except (IOError, pickle.UnpicklingError):
            # no cached baseline available, recompute it
            mc_values = monte_carlo(iterations=1000000)

    for episode in range(max_episodes):

        # eligibility trace over the 36 state-action features
        eligibility_trace = np.zeros(36)

        # initial state, action [SA..], and set of features
        state = environment.State()
        # calculate features for the given state
        state_features_current = utilities.get_state_features(state)
        # get action from this state
        q_a_current, action_current = policy(epsilon, theta,
                                             state_features_current)
        # combine into the full state-action feature vector
        features_current = utilities.get_state_action_features(
            state_features_current, action_current)

        while not state.terminal:

            # update eligibility trace (accumulating)
            eligibility_trace = np.add(eligibility_trace, features_current)

            # take a step, get reward [..R..]
            state, reward = environment.step(state, action_current)
            if reward is None:
                reward = 0

            # follow up state, action [..SA]
            state_features_next = utilities.get_state_features(state)
            q_a_next, action_next = policy(epsilon, theta, state_features_next)
            features_next = utilities.get_state_action_features(
                state_features_next, action_next)

            # TD error: R + gamma * Q(S', A') - Q(S, A)
            delta = reward + gamma * q_a_next - q_a_current
            # update weights
            theta = np.add(theta, alpha * delta * eligibility_trace)
            # decay trace
            eligibility_trace *= gamma * l

            # shift (S, A) <- (S', A'), including its action value estimate
            features_current = features_next
            action_current = action_next
            q_a_current = q_a_next

        # tabulate the current value function estimate from the weights
        value_function = defaultdict(float)
        for player in range(1, 22):
            for dealer in range(1, 11):
                for action in [0, 1]:
                    s = environment.State(dealer, player)
                    phi = utilities.get_state_action_features(
                        utilities.get_state_features(s), action)
                    value_function[(s.player_sum, s.dealer_first_card,
                                    action)] = phi.dot(theta)

        # get the episode MSE for plotting learning curve
        if l in {0, 1} and plot_learning_curve:
            learning_curve.append(
                (episode, utilities.calculate_mse(mc_values, value_function)))

    # plot learning curves
    if l in {0, 1} and plot_learning_curve:
        if multiproc:
            # create a new process so computation can continue after plotting
            p = Process(target=plotting.plot_learning_curve,
                        args=(
                            learning_curve,
                            l,
                        ))
            p.start()
        else:
            plotting.plot_learning_curve(learning_curve, l)

    return value_function
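

if __name__ == "__main__":
    # Hedged usage sketch: train for a small number of episodes without
    # plotting and inspect one entry of the returned value function. Exact
    # numbers depend on the project-local environment and utilities modules.
    vf = linear_function_approximation(l=0.0, max_episodes=1000,
                                       plot_learning_curve=False)
    # estimated value of sticking (action 1) on player sum 18 vs dealer card 10
    print(vf[(18, 10, 1)])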