def test_sarsa_gives_expected_value_when_parameters_are_valid():
    trajectory = [
        (1, 3),
        (2, 0),
    ]
    rewards = [4.]
    q = {
        1: {3: 10.},
        2: {0: 20.},
    }
    alpha = 0.3
    gamma = 0.2

    s, a = trajectory[-2]
    s_p, a_p = trajectory[-1]
    expected = (alpha * rewards[-1] + alpha * gamma * q[s_p][a_p] +
                (1 - alpha) * q[s][a])

    control = SarsaControl(alpha, gamma, q=q)
    control.update(trajectory, rewards)
    q = control.get_q()
    actual = q[s][a]

    assert expected == actual


def test_sarsa_replaces_previous_value_with_reward_plus_next_value_when_alpha_is_one_and_gamma_is_one():
    trajectory = [
        (1, 3),
        (2, 0),
    ]
    rewards = [4.]
    q = {
        1: {3: 10.},
        2: {0: 20.},
    }
    alpha = 1.
    gamma = 1.

    s_p, a_p = trajectory[-1]
    expected = rewards[-1] + q[s_p][a_p]

    control = SarsaControl(alpha, gamma, q=q)
    control.update(trajectory, rewards)
    q = control.get_q()
    s, a = trajectory[-2]
    actual = q[s][a]

    assert expected == actual


def test_sarsa_throws_exception_when_current_state_is_not_in_q():
    trajectory = [
        (1, 3),
        (2, 0),
    ]
    rewards = [4.]
    q = {
        1: {3: 10.},
    }
    alpha = 0.5
    gamma = 0.5

    control = SarsaControl(alpha, gamma, q=q)
    with pytest.raises(Exception) as excinfo:
        control.update(trajectory, rewards)

    assert "current state 2 not found in q function" in str(excinfo.value)


def test_sarsa_throws_exception_when_previous_state_is_not_in_q():
    trajectory = [
        (1, 3),
        (2, 0),
    ]
    rewards = [0.]
    q = {
        2: {0: 20.},
    }
    alpha = 0.5
    gamma = 0.5

    control = SarsaControl(alpha, gamma, q=q)
    with pytest.raises(Exception) as excinfo:
        control.update(trajectory, rewards)

    assert "previous state 1 not found in q function" in str(excinfo.value)


def test_sarsa_has_no_effect_on_q_when_alpha_is_zero():
    trajectory = [
        (1, 3),
        (2, 0),
    ]
    rewards = [4.]
    q = {
        1: {3: 10.},
        2: {0: 20.},
    }
    alpha = 0.
    gamma = 0.5

    expected = copy.deepcopy(q)

    control = SarsaControl(alpha, gamma, q=q)
    control.update(trajectory, rewards)
    actual = control.get_q()

    assert expected == actual


def test_sarsa_throws_exception_when_reward_for_current_step_has_already_been_decided():
    trajectory = [
        (1, 3),
        (2, 0),
    ]
    rewards = [0., 1.]
    q = {
        1: {3: 10.},
        2: {0: 20.},
    }
    alpha = 0.
    gamma = 0.5

    control = SarsaControl(alpha, gamma, q=q)
    with pytest.raises(Exception) as excinfo:
        control.update(trajectory, rewards)

    assert "Length of trajectory and rewards lists are the same; current state-action pair shouldn't yet have a reward when doing Sarsa" in str(
        excinfo.value)
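# For reference, a minimal sketch of the update these tests exercise. This is
# an assumption about what SarsaControl does internally, not the project's
# actual implementation: given the previous pair (s, a), the current pair
# (s', a'), and the latest reward r, Sarsa moves Q(s, a) a step of size alpha
# toward r + gamma * Q(s', a').
def sarsa_update_sketch(q, trajectory, rewards, alpha, gamma):
    # trajectory should hold one more (state, action) pair than rewards: the
    # current pair has not received its reward yet.
    if len(trajectory) == len(rewards):
        raise ValueError(
            "Length of trajectory and rewards lists are the same; current "
            "state-action pair shouldn't yet have a reward when doing Sarsa")
    s, a = trajectory[-2]
    s_p, a_p = trajectory[-1]
    if s not in q:
        raise KeyError("previous state {} not found in q function".format(s))
    if s_p not in q:
        raise KeyError("current state {} not found in q function".format(s_p))
    # Standard Sarsa update:
    # Q(s, a) <- Q(s, a) + alpha * (r + gamma * Q(s', a') - Q(s, a))
    q[s][a] += alpha * (rewards[-1] + gamma * q[s_p][a_p] - q[s][a])
    return q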
def main():
    start_time = time()

    # Extract the features of the environment
    # TODO: This has to be automatable.
    n_states = GRID[0] * GRID[1] * GRID[2] * GRID[3]
    n_actions = gym.make(ENV_NAME).action_space.n
    min_values = gym.make(ENV_NAME).observation_space.low
    max_values = gym.make(ENV_NAME).observation_space.high

    # Modify min/max values to something more appropriate for the environment
    min_values[1] = -1.
    min_values[2] = -0.25
    min_values[3] = -2.0
    max_values[1] = 1.
    max_values[2] = 0.25
    max_values[3] = 2.0
    state_encoder = StateEncoder().fit(GRID, min_values, max_values)

    q = {}
    if START_FROM_MODEL:
        # Load an existing q-function
        with open("best_q.pkl", "rb") as q_file:
            q = pickle.load(q_file)
    else:
        # TODO: I don't think this is a good idea. It defeats the point of
        #       using a hash table, and many of the states won't be
        #       visitable. In addition, I have a feeling that the initial
        #       value shouldn't be a fixed number but rather evolve over time
        #       as more information is known.
        for state in range(n_states):
            q[state] = {key: INITIAL_VALUE for key in range(n_actions)}

    control = SarsaControl(
        ALPHA,
        GAMMA,
        epsilon=EPSILON,
        random_policy=RandomPolicy(n_actions),
        q=q,
    )

    # Outer loop over episodes
    for episode in range(NUM_EPSIODES):
        # Create the initial environment and register the initial state
        env, env_raw, observation = make_envs(
            ENV_NAME,
            output_movie=OUTPUT_MOVIE,
            max_steps_per_episode=MAX_STEPS_PER_EPISODE,
        )
        state = state_encoder.transform(observation)

        # Generate the trajectory for this episode
        trajectory = []
        rewards = []
        cumul_reward = 0.
        for timestep in range(MAX_STEPS_PER_EPISODE):
            # Render the current frame
            if OUTPUT_MOVIE:
                env.render()
                sleep(FRAME_RATE)

            # Predict the current action and generate the reward
            action = control.next_action(state)
            trajectory.append((state, action))

            # Update the q function based on the trajectory for this episode
            if timestep > 0:
                control.update(trajectory, rewards)
                q = control.get_q()

            observation, reward, done, _ = env.step(action)
            rewards.append(reward)
            state = state_encoder.transform(observation)
            cumul_reward += reward

            # Output time step results to screen. Only do this when
            # outputting movies to avoid slowing down the training.
            if OUTPUT_MOVIE:
                print(
                    step_statistics(timestep + 1, reward, cumul_reward,
                                    state))

            # If we're finished with this episode, output episode statistics
            # and break
            if done:
                elapsed_time = time() - start_time
                print(
                    run_summary(elapsed_time, episode + 1, timestep + 1,
                                cumul_reward))
                break

        close_envs(env, env_raw)

        if episode % MODEL_SAVE_FREQUENCY == 0:
            print("Outputting model...")
            with open("best_q.pkl", "wb") as q_file:
                pickle.dump(q, q_file)
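
# A rough sketch of the discretisation that state_encoder is assumed to
# perform above. This is hypothetical (the real StateEncoder lives elsewhere
# in the project), but it shows the idea: each observation dimension is
# bucketed into GRID[i] bins between its min and max value, and the bucket
# indices are flattened into a single integer in range(n_states).
def encode_state_sketch(observation, grid, min_values, max_values):
    state = 0
    for i, n_bins in enumerate(grid):
        span = max_values[i] - min_values[i]
        bucket = int((observation[i] - min_values[i]) / span * n_bins)
        bucket = min(max(bucket, 0), n_bins - 1)  # clamp to a valid bin
        state = state * n_bins + bucket
    return state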