def _test_linear_function_approximation(averaging_runs=1, plot_learning_curve=False, multiproc=True):
    """ Test the SARSA LFA algorithm

    :param averaging_runs: Number of runs to use in averaging SARSA LFA results
    :param plot_learning_curve: Whether to plot the learning curve for lambda = 0 and lambda = 1
    :param multiproc: Whether to use multiprocessing when plotting
    """
    print "\nTesting linear function approximation..."

    try:
        mc_value_function = pickle.load(
            open("Data/MC_value_function.pickle", "rb"))
    except (IOError, EOFError):
        # no cached Monte Carlo baseline; recompute it
        mc_value_function = rl_algorithms.monte_carlo(iterations=1000000)
    average_mse = []
    lambda_space = np.linspace(0, 1, 11)

    # get MSE errors over all averaging runs
    for i in xrange(averaging_runs):
        mse = []
        for _lambda in lambda_space:
            sarsa_value_function = rl_algorithms.linear_function_approximation(
                l=_lambda, plot_learning_curve=plot_learning_curve, multiproc=multiproc
            )
            mse.append(utilities.calculate_mse(mc_value_function, sarsa_value_function))
        average_mse.append(mse)
        if (i % 5) == 0:
            print i

    # average scores from n runs of SARSA and plot
    average_mse = [float(sum(col)) / len(col) for col in zip(*average_mse)]
    average_lambda_mse = zip(lambda_space, average_mse)

    for entry in average_lambda_mse:
        print ("--------------------")
        print ("Lambda: %.1f" % entry[0])
        print ("Mean Squared Error: %.4f" % entry[1])
        print ("\n")

    plotting.plot_sarsa_mse(average_lambda_mse)
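
# Both test helpers score the learned values against the Monte Carlo baseline
# with utilities.calculate_mse, which is not shown in this listing. A minimal,
# hypothetical sketch of such a helper, assuming both value functions are
# dicts keyed by (player, dealer, action) tuples and the baseline's keys
# define the states that are compared:
def calculate_mse(mc_value_function, learned_value_function):
    squared_errors = [
        (learned_value_function.get(key, 0.0) - mc_value_function[key]) ** 2
        for key in mc_value_function
    ]
    return float(sum(squared_errors)) / len(squared_errors)
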
def _test_sarsa(averaging_runs=1, plot_learning_curve=False, multiproc=True):
    """ Test the SARSA algorithm

    :param averaging_runs: Number of runs to use in averaging SARSA results
    :param plot_learning_curve: Whether to plot the learning curve for lambda = 0 and lambda = 1
    :param multiproc: Whether to use multiprocessing when plotting
    """
    print '\nTesting SARSA...'

    try:
        mc_value_function = pickle.load(
            open("Data/MC_value_function.pickle", "rb"))
    except (IOError, EOFError):
        # no cached Monte Carlo baseline; recompute it
        mc_value_function = rl_algorithms.monte_carlo(iterations=1000000)
    average_mse = []
    lambda_space = np.linspace(0, 1, 11)

    # get MSE errors over all averaging runs
    for i in xrange(averaging_runs):
        mse = []
        for _lambda in lambda_space:
            sarsa_value_function = rl_algorithms.sarsa_lambda(
                l=_lambda, plot_learning_curve=plot_learning_curve, multiproc=multiproc
            )
            mse.append(
                utilities.calculate_mse(mc_value_function,
                                        sarsa_value_function))
        average_mse.append(mse)
        if (i % 5) == 0:
            print i

    # average scores from n runs of SARSA and plot
    average_mse = [float(sum(col)) / len(col) for col in zip(*average_mse)]
    average_lambda_mse = zip(lambda_space, average_mse)

    for entry in average_lambda_mse:
        print('--------------------')
        print('Lambda: %.1f' % entry[0])
        print('Mean Squared Error: %.4f' % entry[1])
        print('\n')

    plotting.plot_sarsa_mse(average_lambda_mse)
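
# _test_sarsa falls back to rl_algorithms.monte_carlo when the cached baseline
# pickle is missing. If the baseline should instead be generated once up
# front, a sketch along these lines would do it (build_mc_baseline is not part
# of the project code; it assumes monte_carlo returns a picklable dict):
import os
import pickle

import rl_algorithms


def build_mc_baseline(path="Data/MC_value_function.pickle", iterations=1000000):
    """ Run the Monte Carlo baseline once and cache it for the test helpers """
    mc_value_function = rl_algorithms.monte_carlo(iterations=iterations)
    directory = os.path.dirname(path)
    if directory and not os.path.isdir(directory):
        os.makedirs(directory)
    with open(path, "wb") as handle:
        pickle.dump(mc_value_function, handle)
    return mc_value_function
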
def sarsa_lambda(l=0.9,
                 max_episodes=1000,
                 policy=policies.epsilon_greedy,
                 n_zero=100,
                 gamma=1,
                 plot_learning_curve=True,
                 multiproc=True):
    """ Applies eligibility trace version of Sarsa to the game Easy21

    :param l: lambda parameter
    :param max_episodes: stop learning after this many episodes
    :param policy: exploration strategy to use
    :param n_zero: epsilon greedy constant (only applicable if epsilon greedy policy is used)
    :param gamma: discounting rate
    :param plot_learning_curve: whether to turn on plotting of learning curve for lambda = 0 and 1
    :param multiproc: whether to use multiprocessing when doing plots or not (warning! turn off if running multiple
        algorithms on mac or windows simultaneously)
    :return: value function after max_episodes
    """
    # (player, dealer, action) key
    value_function = defaultdict(float)
    # (player, dealer) key
    counter_state = defaultdict(int)
    # (player, dealer, action) key
    counter_state_action = defaultdict(int)
    # no. of wins to calculate the percentage of wins at the end
    wins = 0

    # learning curve plotting
    if l in {0, 1} and plot_learning_curve:
        learning_curve = []
        try:
            mc_values = pickle.load(open("Data/MC_value_function.pickle",
                                         "rb"))
        except (IOError, EOFError):
            # no cached Monte Carlo baseline; recompute it
            mc_values = monte_carlo(iterations=1000000)

    for episode in range(max_episodes):

        # current (player, dealer, action)
        eligibility_trace = defaultdict(float)

        # initial state, action [SA..]
        state = environment.State()
        player_current = state.player_sum
        dealer_current = state.dealer_first_card
        epsilon = n_zero / float(n_zero + counter_state[
            (player_current, dealer_current)])
        action_current = policy(epsilon, value_function, state)

        while not state.terminal:

            # update counts
            counter_state[(player_current, dealer_current)] += 1
            counter_state_action[(player_current, dealer_current,
                                  action_current)] += 1

            # take a step, get reward [..R..]
            [state, reward] = environment.step(state, action_current)
            if reward is None:
                reward = 0

            # follow up state, action [..SA]
            player_next = state.player_sum
            dealer_next = state.dealer_first_card
            epsilon = n_zero / float(n_zero +
                                     counter_state[(player_next, dealer_next)])
            action_next = policy(epsilon, value_function, state)

            delta = reward + gamma * value_function[(player_next, dealer_next, action_next)] - \
                value_function[(player_current, dealer_current, action_current)]

            alpha = 1.0 / counter_state_action[(player_current, dealer_current,
                                                action_current)]

            eligibility_trace[(player_current, dealer_current,
                               action_current)] += 1

            # update the values
            for key in value_function:
                value_function[key] += alpha * delta * eligibility_trace[key]
                eligibility_trace[key] *= gamma * l

            player_current = player_next
            dealer_current = dealer_next
            action_current = action_next

        # use it later to calculate the percentage of wins
        if reward == 1:
            wins += 1

        # get the episode MSE for plotting learning curve
        if l in {0, 1} and plot_learning_curve:
            learning_curve.append(
                (episode, utilities.calculate_mse(mc_values, value_function)))

    # plot learning curve
    if l in {0, 1} and plot_learning_curve:
        if multiproc:
            # create a new process so computation can continue after plotting
            p = Process(target=plotting.plot_learning_curve,
                        args=(
                            learning_curve,
                            l,
                        ))
            p.start()
        else:
            plotting.plot_learning_curve(learning_curve, l)

    # report the fraction of episodes won
    print 'Fraction of wins: %.3f' % (float(wins) / max_episodes)
    return value_function
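
# sarsa_lambda only assumes that policy(epsilon, value_function, state)
# returns an action; the default comes from policies.epsilon_greedy, which is
# not shown in this listing. A hypothetical sketch of that interface, assuming
# the two Easy21 actions are encoded as 0 and 1 (as in
# linear_function_approximation below):
import random


def epsilon_greedy(epsilon, value_function, state):
    actions = [0, 1]
    if random.random() < epsilon:
        # explore: pick a random action
        return random.choice(actions)
    # exploit: pick the action with the highest current estimate
    # (value_function is a defaultdict, so unseen pairs read as 0.0)
    q_values = [value_function[(state.player_sum, state.dealer_first_card, a)]
                for a in actions]
    return actions[q_values.index(max(q_values))]
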
def linear_function_approximation(l=0.9,
                                  max_episodes=1000,
                                  policy=policies.epsilon_greedy_lfa,
                                  n_zero=100,
                                  gamma=1,
                                  plot_learning_curve=True,
                                  multiproc=True):
    """ Value function approximation using coarse coding

    :param l: lambda parameter
    :param gamma: discounting rate
    :param max_episodes: stop learning after this many episodes
    :param policy: exploration strategy to use
    :param n_zero: epsilon greedy constant (only applicable if epsilon greedy policy is used)
    :param plot_learning_curve: whether to turn on plotting of learning curve for lambda = 0 and 1
    :param multiproc: whether to use multiprocessing when doing plots or not (warning! turn off if running multiple
        algorithms on mac or windows simultaneously)
    :return: value function after max_episodes
    """
    # weights vector for the state_action feature vector
    theta = np.random.random(36) * 0.2
    # random move probability
    epsilon = 0.05
    # step-size parameter
    alpha = 0.01

    # learning curve plotting
    if l in {0, 1} and plot_learning_curve:
        learning_curve = []
        try:
            mc_values = pickle.load(open("Data/MC_value_function.pickle",
                                         "rb"))
        except (IOError, EOFError):
            # no cached Monte Carlo baseline; recompute it
            mc_values = monte_carlo(iterations=1000000)

    for episode in range(max_episodes):

        # key is state_action feature vector
        eligibility_trace = np.zeros(36)

        # initial state, action [SA..], and set of features
        state = environment.State()
        # calculate features for the given state
        state_features_current = utilities.get_state_features(state)
        # get action from this state
        q_a_current, action_current = policy(epsilon, theta,
                                             state_features_current)
        # calculate final state, action feature vector
        features_current = utilities.get_state_action_features(
            state_features_current, action_current)

        while not state.terminal:

            # update eligibility trace (accumulating)
            eligibility_trace = np.add(eligibility_trace, features_current)

            # take a step, get reward [..R..]
            [state, reward] = environment.step(state, action_current)
            if reward is None:
                reward = 0

            # follow up state, action [..SA]
            state_features_next = utilities.get_state_features(state)
            q_a_next, action_next = policy(epsilon, theta, state_features_next)
            features_next = utilities.get_state_action_features(
                state_features_next, action_next)

            # calculate state value difference
            delta = reward + gamma * q_a_next - q_a_current
            # update weights
            theta = np.add(theta, alpha * delta * eligibility_trace)
            # update trace
            eligibility_trace *= gamma * l

            features_current = features_next
            action_current = action_next

        # calculate value function
        value_function = defaultdict(float)
        for player in xrange(1, 22):
            for dealer in xrange(1, 11):
                for action in [0, 1]:
                    s = environment.State(dealer, player)
                    phi = utilities.get_state_action_features(
                        utilities.get_state_features(s), action)
                    value_function[(s.player_sum, s.dealer_first_card,
                                    action)] = phi.dot(theta)

        # get the episode MSE for plotting learning curve
        if l in {0, 1} and plot_learning_curve:
            learning_curve.append(
                (episode, utilities.calculate_mse(mc_values, value_function)))

    # plot learning curves
    if l in {0, 1} and plot_learning_curve:
        if multiproc:
            # create a new process so computation can continue after plotting
            p = Process(target=plotting.plot_learning_curve,
                        args=(
                            learning_curve,
                            l,
                        ))
            p.start()
        else:
            plotting.plot_learning_curve(learning_curve, l)

    return value_function
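
# The 36-dimensional theta matches a coarse coding of the Easy21 state-action
# space. The feature helpers are imported from utilities and are not shown
# here; a hypothetical sketch of what they might look like, assuming the usual
# overlapping intervals (3 dealer ranges x 6 player ranges, doubled for the
# two actions):
import numpy as np

DEALER_INTERVALS = [(1, 4), (4, 7), (7, 10)]
PLAYER_INTERVALS = [(1, 6), (4, 9), (7, 12), (10, 15), (13, 18), (16, 21)]


def get_state_features(state):
    # binary indicator for every (dealer interval, player interval) pair
    features = np.zeros((len(DEALER_INTERVALS), len(PLAYER_INTERVALS)))
    for i, (d_lo, d_hi) in enumerate(DEALER_INTERVALS):
        for j, (p_lo, p_hi) in enumerate(PLAYER_INTERVALS):
            if d_lo <= state.dealer_first_card <= d_hi and \
                    p_lo <= state.player_sum <= p_hi:
                features[i, j] = 1.0
    return features.flatten()


def get_state_action_features(state_features, action):
    # one block of 18 state features per action -> 36-dimensional phi
    features = np.zeros(36)
    offset = action * 18
    features[offset:offset + 18] = state_features
    return features

# Under this layout, phi.dot(theta) in the loop above is simply the sum of the
# active weights for the chosen action.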