Code example #1
File: lf_approx.py  Project: izmendi/rl-coursework
def calculate_mse(action_value_function):
    """Mean-squared error between the learned linear approximation and
    the Monte-Carlo result stored in mc_result.dat."""
    # load(), LinearFunction, and State are helpers defined elsewhere
    # in this project.
    mc_action_value_function = load('mc_result.dat')
    linear_function = LinearFunction()
    mse, count = 0.0, 0

    # sweep every (dealer, player, action) combination of the game
    for dealer in range(1, 11):
        for player in range(1, 22):
            for action in range(0, 2):
                state = State(dealer=dealer, player=player)
                linear_function.update(state)
                features = linear_function.get_features()

                mc_reward = mc_action_value_function[(dealer, player, action)]
                reward = action_value_function[(tuple(features), action)]
                mse += (reward - mc_reward) ** 2
                count += 1

    return mse / count
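
The snippet assumes an 18-dimensional feature vector produced by LinearFunction. A minimal sketch of what that class presumably computes, assuming the standard Easy21 coarse coding (3 overlapping dealer intervals x 6 overlapping player intervals = 18 binary features); the class name, interval boundaries, and State fields below are assumptions for illustration, not taken from this project:

# Hypothetical sketch -- not the project's actual LinearFunction.
from collections import namedtuple

StateStub = namedtuple('StateStub', ['dealer', 'player'])

# Assumed coarse coding: 3 dealer intervals x 6 player intervals = 18 features.
DEALER_INTERVALS = [(1, 4), (4, 7), (7, 10)]
PLAYER_INTERVALS = [(1, 6), (4, 9), (7, 12), (10, 15), (13, 18), (16, 21)]

class LinearFunctionSketch:
    def update(self, state):
        # remember the state whose features will be queried next
        self.state = state

    def get_features(self):
        # one binary feature per (dealer interval, player interval) pair
        feats = []
        for d_lo, d_hi in DEALER_INTERVALS:
            for p_lo, p_hi in PLAYER_INTERVALS:
                active = (d_lo <= self.state.dealer <= d_hi and
                          p_lo <= self.state.player <= p_hi)
                feats.append(1 if active else 0)
        return feats

lf = LinearFunctionSketch()
lf.update(StateStub(dealer=5, player=14))
print(lf.get_features())  # 18 zeros/ones; overlapping intervals can both fire

Because the intervals overlap, a single state typically activates several features at once, which is what gives the coarse coding its generalization.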
Code example #2
File: lf_approx.py  Project: izmendi/rl-coursework
from collections import defaultdict
from datetime import datetime

import numpy as np
import matplotlib.pyplot as plt

# State, step, HIT, STICK, LinearFunction, epsilon_greedy_policy,
# update_action_value_function, and calculate_mse are defined elsewhere
# in this project.


def sarsa(lambd):
    """SARSA(lambda) with a linear approximator over 18 coarse-coded
    binary features, using separate parameter vectors for HIT and STICK."""
    n_episodes = 1000
    epi_batch = 100
    action_value_function = defaultdict(float)
    linear_function = LinearFunction()
    params_hit = np.zeros(18)
    params_stick = np.zeros(18)
    epsilon = 0.05  # fixed exploration rate
    alpha = 0.01    # fixed learning rate

    # record a learning curve only for the lambdas that get plotted
    if lambd == 0.0 or lambd == 1.0:
        mses = []

    for episode in range(n_episodes):
        if episode % epi_batch == 0 and (lambd == 0.0 or lambd == 1.0):
            mses.append(calculate_mse(action_value_function))

        # initialize state, features, action, and eligibility traces
        state = State()
        linear_function.update(state)
        current_feats = linear_function.get_features()
        action = epsilon_greedy_policy(action_value_function, state, epsilon, current_feats)
        eligibility_hit = np.zeros(18)
        eligibility_stick = np.zeros(18)

        while not state.terminal:
            # accumulate the eligibility trace of the action being taken
            np_feats = np.array(current_feats)
            if action == HIT:
                eligibility_hit = np.add(eligibility_hit, np_feats)
            else:
                eligibility_stick = np.add(eligibility_stick, np_feats)

            reward = step(state, action)
            linear_function.update(state)
            new_features = linear_function.get_features()

            # TD error: reward minus the estimated value of the state just acted in
            delta_hit = reward - np.array(current_feats).dot(params_hit)
            delta_stick = reward - np.array(current_feats).dot(params_stick)

            # refresh the cached action value of the new state from the
            # current linear parameters
            if action == HIT:
                update_action_value_function(action_value_function, (new_features, action), params_hit)
            else:
                update_action_value_function(action_value_function, (new_features, action), params_stick)

            # complete the TD target with the new state's estimated value
            if action == HIT:
                delta_hit += action_value_function[(tuple(new_features), HIT)]
            else:
                delta_stick += action_value_function[(tuple(new_features), STICK)]

            # gradient step on the parameters, then decay the traces by lambda
            params_hit = np.add(params_hit, alpha * delta_hit * eligibility_hit)
            params_stick = np.add(params_stick, alpha * delta_stick * eligibility_stick)
            eligibility_hit = eligibility_hit * lambd
            eligibility_stick = eligibility_stick * lambd

            # choose the next action and roll the features forward
            action = epsilon_greedy_policy(action_value_function, state, epsilon, new_features)
            current_feats = new_features


    if lambd == 0.0 or lambd == 1.0:
        # final point of the learning curve
        mses.append(calculate_mse(action_value_function))

        # plot the MSE learning curve
        print("Plotting learning curve for lambda =", lambd)
        x = range(0, n_episodes + 1, epi_batch)
        plt.figure()
        plt.title(r'Learning curve of MSE against Episodes @ $\lambda$ = ' + str(lambd))
        plt.xlabel("episode number")
        plt.xlim([0, n_episodes])
        plt.xticks(range(0, n_episodes + 1, epi_batch))
        plt.ylabel("Mean-Squared Error (MSE)")
        plt.plot(x, mses)
        fname = "lapprox_mse_lambda%f_%s.png" % (lambd, str(datetime.now()))
        plt.savefig(fname)
        # plt.show()

    return calculate_mse(action_value_function)
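
sarsa also relies on epsilon_greedy_policy, which is not shown on this page. A minimal sketch consistent with the call sites above; the signature and the HIT/STICK encoding are inferred from this snippet, not from the project source:

import random

HIT, STICK = 0, 1  # assumed encoding; the project may define these differently

def epsilon_greedy_policy(action_value_function, state, epsilon, features):
    # explore with probability epsilon, otherwise act greedily on the
    # cached action values; `state` matches the call site above, but the
    # features already encode it
    if random.random() < epsilon:
        return random.choice([HIT, STICK])
    key = tuple(features)
    if action_value_function[(key, HIT)] >= action_value_function[(key, STICK)]:
        return HIT
    return STICK

Note that calling sarsa(0.0) or sarsa(1.0) additionally saves the MSE learning-curve plot to disk; any other lambda value just returns the final MSE.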