Code example #1
def test_sarsa_gives_expected_value_when_parameters_are_valid():
    trajectory = [
        (1, 3),
        (2, 0),
    ]
    rewards = [4.]
    q = {
        1: {
            3: 10.
        },
        2: {
            0: 20.
        },
    }
    alpha = 0.3
    gamma = 0.2
    s, a = trajectory[-2]
    s_p, a_p = trajectory[-1]
    expected = (alpha * rewards[-1] + alpha * gamma * q[s_p][a_p] +
                (1 - alpha) * q[s][a])
    control = SarsaControl(alpha, gamma, q=q)
    control.update(trajectory, rewards)
    q = control.get_q()
    actual = q[s][a]
    assert expected == actual
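
The expected value in this test is the SARSA target written out in expanded form. It is algebraically identical to the more common incremental form of the update; the short sketch below (an illustration only, not the SarsaControl implementation) shows the equivalence using the same names as the test:

def sarsa_update(q, s, a, reward, s_p, a_p, alpha, gamma):
    # Incremental form: Q(s, a) <- Q(s, a) + alpha * (reward + gamma * Q(s', a') - Q(s, a)).
    # Expanding the right-hand side gives the expression asserted in the test:
    #     alpha * reward + alpha * gamma * Q(s', a') + (1 - alpha) * Q(s, a)
    return q[s][a] + alpha * (reward + gamma * q[s_p][a_p] - q[s][a])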
Code example #2
def test_sarsa_replaces_previous_value_with_reward_plus_next_value_when_alpha_is_one_and_gamma_is_one(
):
    trajectory = [
        (1, 3),
        (2, 0),
    ]
    rewards = [4.]
    q = {
        1: {
            3: 10.
        },
        2: {
            0: 20.
        },
    }
    alpha = 1.
    gamma = 1.
    s_p, a_p = trajectory[-1]
    expected = rewards[-1] + q[s_p][a_p]
    control = SarsaControl(alpha, gamma, q=q)
    control.update(trajectory, rewards)
    q = control.get_q()
    s, a = trajectory[-2]
    actual = q[s][a]
    assert expected == actual
Code example #3
def test_sarsa_throws_exception_when_current_state_is_not_in_q():
    trajectory = [
        (1, 3),
        (2, 0),
    ]
    rewards = [4.]
    q = {
        1: {
            3: 10.
        },
    }
    alpha = 0.5
    gamma = 0.5
    control = SarsaControl(alpha, gamma, q=q)
    with pytest.raises(Exception) as excinfo:
        control.update(trajectory, rewards)
    assert "current state 2 not found in q function" in str(excinfo.value)
Code example #4
def test_sarsa_throws_exception_when_previous_state_is_not_in_q():
    trajectory = [
        (1, 3),
        (2, 0),
    ]
    rewards = [0.]
    q = {
        2: {
            0: 20.
        },
    }
    alpha = 0.5
    gamma = 0.5
    control = SarsaControl(alpha, gamma, q=q)
    with pytest.raises(Exception) as excinfo:
        control.update(trajectory, rewards)
    assert "previous state 1 not found in q function" in str(excinfo.value)
Code example #5
def test_sarsa_has_no_effect_on_q_when_alpha_is_zero():
    trajectory = [
        (1, 3),
        (2, 0),
    ]
    rewards = [4.]
    q = {
        1: {
            3: 10.
        },
        2: {
            0: 20.
        },
    }
    alpha = 0.
    gamma = 0.5
    expected = copy.deepcopy(q)
    control = SarsaControl(alpha, gamma, q=q)
    control.update(trajectory, rewards)
    actual = control.get_q()
    assert expected == actual
Code example #6
def test_sarsa_throws_exception_when_reward_for_current_step_has_already_been_decided(
):
    trajectory = [
        (1, 3),
        (2, 0),
    ]
    rewards = [0., 1.]
    q = {
        1: {
            3: 10.
        },
        2: {
            0: 20.
        },
    }
    alpha = 0.
    gamma = 0.5
    control = SarsaControl(alpha, gamma, q=q)
    with pytest.raises(Exception) as excinfo:
        control.update(trajectory, rewards)
    assert "Length of trajectory and rewards lists are the same; current state-action pair shouldn't yet have a reward when doing Sarsa" in str(
        excinfo.value)
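
Taken together, the tests above specify the behaviour of SarsaControl.update: it updates the second-to-last state-action pair, raises when either state is missing from the q function, and rejects a rewards list that is as long as the trajectory. The following is a minimal sketch consistent with those tests; the attribute names (_q, _alpha, _gamma) and the exception types are assumptions, not the actual SarsaControl source.

def update(self, trajectory, rewards):
    # In SARSA the reward for the newest state-action pair hasn't arrived yet,
    # so rewards must be exactly one element shorter than trajectory.
    if len(rewards) >= len(trajectory):
        raise ValueError(
            "Length of trajectory and rewards lists are the same; current "
            "state-action pair shouldn't yet have a reward when doing Sarsa")
    s, a = trajectory[-2]      # previous state-action pair (the one updated)
    s_p, a_p = trajectory[-1]  # current state-action pair (the bootstrap target)
    if s not in self._q:
        raise KeyError("previous state {} not found in q function".format(s))
    if s_p not in self._q:
        raise KeyError("current state {} not found in q function".format(s_p))
    # Incremental SARSA update on the previous pair.
    self._q[s][a] += self._alpha * (
        rewards[-1] + self._gamma * self._q[s_p][a_p] - self._q[s][a])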
Code example #7
def main():
    start_time = time()

    # Extract the features of the environment
    # TODO: This has to be automatable.
    n_states = GRID[0] * GRID[1] * GRID[2] * GRID[3]
    n_actions = gym.make(ENV_NAME).action_space.n
    min_values = gym.make(ENV_NAME).observation_space.low
    max_values = gym.make(ENV_NAME).observation_space.high
    # Modify min/max values to something more appropriate for the environment
    min_values[1] = -1.
    min_values[2] = -0.25
    min_values[3] = -2.0
    max_values[1] = 1.
    max_values[2] = 0.25
    max_values[3] = 2.0
    state_encoder = StateEncoder().fit(GRID, min_values, max_values)

    q = {}
    if START_FROM_MODEL:
        # Load an existing q-function
        with open("best_q.pkl", "rb") as q_file:
            q = pickle.load(q_file)
    else:
        # TODO: I don't think this is a good idea.  Defeats the point of using a
        #       hash table, and many of the states won't be visitable.  In
        #       addition, I have a feeling that the initial value shouldn't be a
        #       fixed number but rather evolve over time as more information is
        #       known.
        for state in range(n_states):
            q[state] = {key: INITIAL_VALUE for key in range(n_actions)}
    control = SarsaControl(
        ALPHA,
        GAMMA,
        epsilon=EPSILON,
        random_policy=RandomPolicy(n_actions),
        q=q,
    )

    # Outer loop over episodes
    for episode in range(NUM_EPSIODES):
        # Create the initial environment and register the initial state
        env, env_raw, observation = make_envs(
            ENV_NAME,
            output_movie=OUTPUT_MOVIE,
            max_steps_per_episode=MAX_STEPS_PER_EPISODE,
        )
        state = state_encoder.transform(observation)

        # Generate the trajectory for this episode
        trajectory = []
        rewards = []
        cumul_reward = 0.
        for timestep in range(MAX_STEPS_PER_EPISODE):
            # Render the current frame
            if OUTPUT_MOVIE:
                env.render()
                sleep(FRAME_RATE)

            # Choose the next action for the current state and record it in
            # the trajectory
            action = control.next_action(state)
            trajectory.append((state, action))
            # Update the q function based on the trajectory for this episode
            if timestep > 0:
                control.update(trajectory, rewards)
                q = control.get_q()
            observation, reward, done, _ = env.step(action)
            rewards.append(reward)
            state = state_encoder.transform(observation)
            cumul_reward += reward

            # Output time step results to screen.  Only do this when outputting
            # movies to avoid slowing down the training.
            if OUTPUT_MOVIE:
                print(
                    step_statistics(timestep + 1, reward, cumul_reward, state))
            # If we're finished with this episode, output episode statistics
            # and break
            if done:
                elapsed_time = time() - start_time
                print(
                    run_summary(elapsed_time, episode + 1, timestep + 1,
                                cumul_reward))
                break

        close_envs(env, env_raw)
        if episode % MODEL_SAVE_FREQUENCY == 0:
            print("Outputting model...")
            with open("best_q.pkl", "wb") as q_file:
                pickle.dump(q, q_file)
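
main() depends on a StateEncoder with a fit/transform interface that maps the continuous observation onto one of n_states discrete states. A minimal sketch of such an encoder is shown below, assuming it simply buckets each observation dimension onto the grid and flattens the bucket indices into a single integer; the real StateEncoder may differ.

import numpy as np


class StateEncoder:
    """Discretises a continuous observation onto a fixed grid of buckets."""

    def fit(self, grid, min_values, max_values):
        # grid: number of buckets per observation dimension, e.g. (8, 8, 8, 8).
        self._grid = np.asarray(grid)
        self._min = np.asarray(min_values, dtype=float)
        self._max = np.asarray(max_values, dtype=float)
        return self

    def transform(self, observation):
        # Clip to the configured range, scale to [0, 1], then bucket per dimension.
        obs = np.clip(observation, self._min, self._max)
        scaled = (obs - self._min) / (self._max - self._min)
        buckets = np.minimum((scaled * self._grid).astype(int), self._grid - 1)
        # Flatten the per-dimension bucket indices into a single state id in
        # range(prod(grid)), matching how n_states is computed in main().
        return int(np.ravel_multi_index(buckets, tuple(self._grid)))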