Example #1
def test_value_iteration():

    # run policy iteration on v_pi
    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    mdp_agent_v_pi_policy_iteration = ActionValueMdpAgent(
        'test', random_state, 1,
        TabularStateActionValueEstimator(mdp_environment, None, None))
    iterate_policy_v_pi(mdp_agent_v_pi_policy_iteration, mdp_environment,
                        0.001, True)

    # run value iteration on v_pi
    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    mdp_agent_v_pi_value_iteration = ActionValueMdpAgent(
        'test', random_state, 1,
        TabularStateActionValueEstimator(mdp_environment, None, None))
    iterate_value_v_pi(mdp_agent_v_pi_value_iteration, mdp_environment, 0.001,
                       1, True)

    assert mdp_agent_v_pi_policy_iteration.pi == mdp_agent_v_pi_value_iteration.pi

    # run value iteration on q_pi
    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    mdp_agent_q_pi_value_iteration = ActionValueMdpAgent(
        'test', random_state, 1,
        TabularStateActionValueEstimator(mdp_environment, None, None))
    iterate_value_q_pi(mdp_agent_q_pi_value_iteration, mdp_environment, 0.001,
                       1, True)

    assert mdp_agent_q_pi_value_iteration.pi == mdp_agent_v_pi_policy_iteration.pi
Example #2
def test_run():

    random_state = RandomState(12345)

    mdp_environment: GamblersProblem = GamblersProblem(
        'gamblers problem', random_state=random_state, T=None, p_h=0.4)

    agent = ActionValueMdpAgent(
        'test', random_state, 1,
        TabularStateActionValueEstimator(mdp_environment, None, None))

    monitor = Monitor()
    state = mdp_environment.reset_for_new_run(agent)
    agent.reset_for_new_run(state)
    mdp_environment.run(agent, monitor)

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_run.pickle', 'wb') as file:
    #     pickle.dump(monitor, file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_run.pickle',
              'rb') as file:
        fixture = pickle.load(file)

    assert monitor.t_average_reward == fixture.t_average_reward
Example #3
def test_human_player():

    random_state = RandomState(12345)

    human = Human()

    def mock_input(*_) -> str:
        s = human.most_recent_state
        selected_a = sample_list_item(s.AA,
                                      probs=None,
                                      random_state=random_state)
        return selected_a.name

    human.get_input = mock_input

    mancala: Mancala = Mancala(random_state=random_state,
                               T=None,
                               initial_count=4,
                               player_2=human)

    epsilon = 0.05

    p1 = ActionValueMdpAgent(
        'player 1', random_state, 1,
        TabularStateActionValueEstimator(mancala, epsilon, None))

    state = mancala.reset_for_new_run(p1)
    p1.reset_for_new_run(state)
    a = p1.act(0)
    state, reward = mancala.advance(state, 0, a, p1)

    assert mancala.board[7].count == 0 and state.i == 1 and reward.i == 2
Example #4
def test_off_policy_monte_carlo_with_function_approximation():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    q_S_A = ApproximateStateActionValueEstimator(
        mdp_environment,
        0.05,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment),
        None,
        False,
        None,
        None
    )

    # target agent
    mdp_agent = ActionValueMdpAgent(
        'test',
        random_state,
        1,
        q_S_A
    )

    # episode generation (behavior) policy
    off_policy_agent = ActionValueMdpAgent(
        'test',
        random_state,
        1,
        TabularStateActionValueEstimator(mdp_environment, None, None)
    )

    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=100,
        num_episodes_per_improvement=1,
        update_upon_every_visit=True,
        planning_environment=None,
        make_final_policy_greedy=False,
        off_policy_agent=off_policy_agent
    )

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_off_policy_monte_carlo_with_function_approximationo.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_off_policy_monte_carlo_with_function_approximationo.pickle', 'rb') as file:
        pi_fixture, q_S_A_fixture = pickle.load(file)

    assert mdp_agent.pi == pi_fixture and q_S_A == q_S_A_fixture
    assert str(mdp_agent.pi.estimator[mdp_environment.SS[5]][mdp_environment.SS[5].AA[1]]).startswith('-2.4305')

    # make greedy
    q_S_A.epsilon = 0.0
    assert q_S_A.improve_policy(mdp_agent, None, PolicyImprovementEvent.MAKING_POLICY_GREEDY) == -1
    assert mdp_agent.pi.estimator.epsilon == 0.0
Example #5
def test_learn():

    random_state = RandomState(12345)

    mancala: Mancala = Mancala(random_state=random_state,
                               T=None,
                               initial_count=4,
                               player_2=None)

    p1 = ActionValueMdpAgent(
        'player 1', random_state, 1,
        TabularStateActionValueEstimator(mancala, 0.05, None))

    checkpoint_path = iterate_value_q_pi(
        agent=p1,
        environment=mancala,
        num_improvements=3,
        num_episodes_per_improvement=100,
        update_upon_every_visit=False,
        planning_environment=None,
        make_final_policy_greedy=False,
        num_improvements_per_checkpoint=3,
        checkpoint_path=tempfile.NamedTemporaryFile(delete=False).name)

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_mancala.pickle', 'wb') as file:
    #     pickle.dump(p1.pi, file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_mancala.pickle',
              'rb') as file:
        fixture = pickle.load(file)

    assert tabular_pi_legacy_eq(p1.pi, fixture)

    resumed_p1 = resume_from_checkpoint(checkpoint_path=checkpoint_path,
                                        resume_function=iterate_value_q_pi,
                                        num_improvements=2)

    # run same number of improvements without checkpoint...result should be the same.
    random_state = RandomState(12345)
    mancala: Mancala = Mancala(random_state=random_state,
                               T=None,
                               initial_count=4,
                               player_2=None)
    no_checkpoint_p1 = ActionValueMdpAgent(
        'player 1', random_state, 1,
        TabularStateActionValueEstimator(mancala, 0.05, None))

    iterate_value_q_pi(agent=no_checkpoint_p1,
                       environment=mancala,
                       num_improvements=5,
                       num_episodes_per_improvement=100,
                       update_upon_every_visit=False,
                       planning_environment=None,
                       make_final_policy_greedy=False)

    assert no_checkpoint_p1.pi == resumed_p1.pi
Example #6
def test_learn():

    random_state = RandomState(12345)
    gym = Gym(random_state=random_state, T=None, gym_id='CartPole-v1')
    q_S_A = TabularStateActionValueEstimator(gym, 0.05, 0.001)
    mdp_agent = ActionValueMdpAgent('agent', random_state, 1, q_S_A)

    iterate_value_q_pi(agent=mdp_agent,
                       environment=gym,
                       num_improvements=10,
                       num_episodes_per_improvement=100,
                       num_updates_per_improvement=None,
                       alpha=0.1,
                       mode=Mode.SARSA,
                       n_steps=1,
                       planning_environment=None,
                       make_final_policy_greedy=False)

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_gym.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_gym.pickle',
              'rb') as file:
        fixture_pi, fixture_q_S_A = pickle.load(file)

    assert tabular_pi_legacy_eq(mdp_agent.pi, fixture_pi) and tabular_estimator_legacy_eq(q_S_A, fixture_q_S_A)
Example #7
def test_sarsa_iterate_value_q_pi_with_trajectory_planning():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    q_S_A = TabularStateActionValueEstimator(mdp_environment, 0.05, None)
    mdp_agent = ActionValueMdpAgent('test', random_state, 1, q_S_A)

    planning_environment = TrajectorySamplingMdpPlanningEnvironment(
        'test planning', random_state, StochasticEnvironmentModel(), 10, None)

    iterate_value_q_pi(agent=mdp_agent,
                       environment=mdp_environment,
                       num_improvements=100,
                       num_episodes_per_improvement=1,
                       num_updates_per_improvement=None,
                       alpha=0.1,
                       mode=Mode.SARSA,
                       n_steps=1,
                       planning_environment=planning_environment,
                       make_final_policy_greedy=True)

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_td_iteration_of_value_q_pi_planning.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(
            f'{os.path.dirname(__file__)}/fixtures/test_td_iteration_of_value_q_pi_planning.pickle',
            'rb') as file:
        pi_fixture, q_S_A_fixture = pickle.load(file)

    assert tabular_pi_legacy_eq(mdp_agent.pi, pi_fixture) and tabular_estimator_legacy_eq(q_S_A, q_S_A_fixture)
Example #8
def test_evaluate_q_pi():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    q_S_A = TabularStateActionValueEstimator(mdp_environment, None, None)
    mdp_agent = ActionValueMdpAgent('test', random_state, 1, q_S_A)

    evaluated_states, _ = evaluate_q_pi(agent=mdp_agent,
                                        environment=mdp_environment,
                                        num_episodes=1000,
                                        exploring_starts=True,
                                        update_upon_every_visit=False)

    assert len(q_S_A) == len(evaluated_states) + 2  # terminal states aren't evaluated
    assert all(s in q_S_A for s in evaluated_states)

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_monte_carlo_evaluation_of_state_action_value.pickle', 'wb') as file:
    #     pickle.dump(q_S_A, file)

    with open(
            f'{os.path.dirname(__file__)}/fixtures/test_monte_carlo_evaluation_of_state_action_value.pickle',
            'rb') as file:
        fixture = pickle.load(file)

    assert tabular_estimator_legacy_eq(q_S_A, fixture)
Example #9
def test_iterate_value_q_pi_with_pdf():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    q_S_A = TabularStateActionValueEstimator(mdp_environment, 0.1, None)
    mdp_agent = ActionValueMdpAgent(
        'test',
        random_state,
        1,
        q_S_A
    )
    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=3000,
        num_episodes_per_improvement=1,
        update_upon_every_visit=False,
        planning_environment=None,
        make_final_policy_greedy=False,
        num_improvements_per_plot=1500,
        pdf_save_path=tempfile.NamedTemporaryFile(delete=False).name
    )

    with pytest.raises(ValueError, match='Epsilon must be >= 0'):
        q_S_A.epsilon = -1.0
        q_S_A.improve_policy(mdp_agent, states=None, event=PolicyImprovementEvent.MAKING_POLICY_GREEDY)

    q_S_A.epsilon = 0.0
    assert q_S_A.improve_policy(mdp_agent, None, PolicyImprovementEvent.MAKING_POLICY_GREEDY) == 14
Example #10
def test_n_step_q_learning_iterate_value_q_pi():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    q_S_A = TabularStateActionValueEstimator(mdp_environment, 0.05, None)
    mdp_agent = ActionValueMdpAgent('test', random_state, 1, q_S_A)

    iterate_value_q_pi(agent=mdp_agent,
                       environment=mdp_environment,
                       num_improvements=10,
                       num_episodes_per_improvement=100,
                       num_updates_per_improvement=None,
                       alpha=0.1,
                       mode=Mode.Q_LEARNING,
                       n_steps=3,
                       planning_environment=None,
                       make_final_policy_greedy=False)

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_td_n_step_q_learning_iteration_of_value_q_pi.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(
            f'{os.path.dirname(__file__)}/fixtures/test_td_n_step_q_learning_iteration_of_value_q_pi.pickle',
            'rb') as file:
        fixture_pi, fixture_q_S_A = pickle.load(file)

    assert tabular_pi_legacy_eq(mdp_agent.pi, fixture_pi) and tabular_estimator_legacy_eq(q_S_A, fixture_q_S_A)
Example #11
def test_iterate_value_q_pi():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    q_S_A = TabularStateActionValueEstimator(mdp_environment, 0.1, None)
    mdp_agent = ActionValueMdpAgent(
        'test',
        random_state,
        1,
        q_S_A
    )
    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=3000,
        num_episodes_per_improvement=1,
        update_upon_every_visit=False,
        planning_environment=None,
        make_final_policy_greedy=False
    )

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_monte_carlo_iteration_of_value_q_pi.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_monte_carlo_iteration_of_value_q_pi.pickle', 'rb') as file:
        pi_fixture, q_S_A_fixture = pickle.load(file)

    assert tabular_pi_legacy_eq(mdp_agent.pi, pi_fixture) and tabular_estimator_legacy_eq(q_S_A, q_S_A_fixture)
Example #12
def test_q_learning_iterate_value_q_pi_function_approximation_with_formula():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, 20)
    q_S_A = ApproximateStateActionValueEstimator(
        mdp_environment, 0.05,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        StateActionIdentityFeatureExtractor(mdp_environment),
        f'C(s, levels={[s.i for s in mdp_environment.SS]}):C(a, levels={[a.i for a in mdp_environment.SS[0].AA]})',
        False, None, None)
    mdp_agent = ActionValueMdpAgent('test', random_state, 1, q_S_A)

    iterate_value_q_pi(agent=mdp_agent,
                       environment=mdp_environment,
                       num_improvements=5,
                       num_episodes_per_improvement=5,
                       num_updates_per_improvement=None,
                       alpha=None,
                       mode=Mode.Q_LEARNING,
                       n_steps=None,
                       planning_environment=None,
                       make_final_policy_greedy=False)

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_q_learning_iterate_value_q_pi_function_approximation.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(
            f'{os.path.dirname(__file__)}/fixtures/test_q_learning_iterate_value_q_pi_function_approximation.pickle',
            'rb') as file:
        pi_fixture, q_S_A_fixture = pickle.load(file)

    assert np.allclose(mdp_agent.pi.estimator.model.model.coef_,
                       pi_fixture.estimator.model.model.coef_)
Example #13
def test_invalid_iterate_value_q_pi():

    # target agent
    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    mdp_agent = ActionValueMdpAgent(
        'test',
        random_state,
        1,
        TabularStateActionValueEstimator(mdp_environment, 0.0, None)
    )

    # episode generation (behavior) policy
    off_policy_agent = ActionValueMdpAgent(
        'test',
        random_state,
        1,
        TabularStateActionValueEstimator(mdp_environment, 0.0, None)
    )

    with pytest.raises(ValueError, match='Planning environments are not currently supported for Monte Carlo iteration.'):
        iterate_value_q_pi(
            agent=mdp_agent,
            environment=mdp_environment,
            num_improvements=100,
            num_episodes_per_improvement=1,
            update_upon_every_visit=True,
            planning_environment=TrajectorySamplingMdpPlanningEnvironment('foo', random_state, StochasticEnvironmentModel(), 100, None),
            make_final_policy_greedy=False,
            off_policy_agent=off_policy_agent
        )

    # test warning...no off-policy agent with epsilon=0.0
    mdp_agent.q_S_A.epsilon = 0.0
    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=100,
        num_episodes_per_improvement=1,
        update_upon_every_visit=True,
        planning_environment=None,
        make_final_policy_greedy=False,
        off_policy_agent=None
    )
Example #14
def test_q_learning_iterate_value_q_pi_function_approximation_policy_ne():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, 20)
    epsilon = 0.05
    q_S_A_1 = ApproximateStateActionValueEstimator(
        mdp_environment, epsilon,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment), None, False, None, None)
    mdp_agent_1 = ActionValueMdpAgent('test', random_state, 1, q_S_A_1)

    iterate_value_q_pi(agent=mdp_agent_1,
                       environment=mdp_environment,
                       num_improvements=5,
                       num_episodes_per_improvement=10,
                       num_updates_per_improvement=None,
                       alpha=None,
                       mode=Mode.Q_LEARNING,
                       n_steps=None,
                       planning_environment=None,
                       make_final_policy_greedy=True)

    q_S_A_2 = ApproximateStateActionValueEstimator(
        mdp_environment, epsilon,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment), None, False, None, None)

    mdp_agent_2 = ActionValueMdpAgent('test', random_state, 1, q_S_A_2)

    iterate_value_q_pi(agent=mdp_agent_2,
                       environment=mdp_environment,
                       num_improvements=5,
                       num_episodes_per_improvement=5,
                       num_updates_per_improvement=None,
                       alpha=None,
                       mode=Mode.Q_LEARNING,
                       n_steps=None,
                       planning_environment=None,
                       make_final_policy_greedy=True)

    assert mdp_agent_1.pi.estimator != mdp_agent_2.pi.estimator
    assert mdp_agent_1.pi.estimator.model != mdp_agent_2.pi.estimator.model
Example #15
def test_invalid_improve_policy_with_q_pi():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    epsilon = 0.0
    mdp_agent = ActionValueMdpAgent(
        'test', random_state, 1,
        TabularStateActionValueEstimator(mdp_environment, epsilon, None))

    with pytest.raises(ValueError, match='Epsilon must be >= 0'):
        improve_policy_with_q_pi(mdp_agent, {}, -1)
Example #16
def dump_agent() -> str:

    # create dummy mdp agent for runner
    # noinspection PyTypeChecker
    stochastic_mdp_agent = ActionValueMdpAgent('foo', RandomState(12345), 1.0,
                                               DummyQSA())
    agent_path = tempfile.NamedTemporaryFile(delete=False).name
    with open(agent_path, 'wb') as f:
        pickle.dump(stochastic_mdp_agent, f)

    return agent_path
Example #17
def test_q_learning_iterate_value_q_pi_tabular_policy_ne():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, 20)
    epsilon = 0.05
    q_S_A_1 = TabularStateActionValueEstimator(mdp_environment, epsilon, None)

    mdp_agent_1 = ActionValueMdpAgent('test', random_state, 1, q_S_A_1)

    iterate_value_q_pi(agent=mdp_agent_1,
                       environment=mdp_environment,
                       num_improvements=5,
                       num_episodes_per_improvement=10,
                       num_updates_per_improvement=None,
                       alpha=None,
                       mode=Mode.Q_LEARNING,
                       n_steps=None,
                       planning_environment=None,
                       make_final_policy_greedy=True)

    q_S_A_2 = TabularStateActionValueEstimator(mdp_environment, epsilon, None)

    mdp_agent_2 = ActionValueMdpAgent('test', random_state, 1, q_S_A_2)

    iterate_value_q_pi(agent=mdp_agent_2,
                       environment=mdp_environment,
                       num_improvements=5,
                       num_episodes_per_improvement=5,
                       num_updates_per_improvement=None,
                       alpha=None,
                       mode=Mode.Q_LEARNING,
                       n_steps=None,
                       planning_environment=None,
                       make_final_policy_greedy=True)

    test_state = mdp_environment.SS[5]
    test_action = test_state.AA[0]

    assert q_S_A_1 != q_S_A_2
    assert q_S_A_1[test_state] != q_S_A_2[test_state]
    assert q_S_A_1[test_state][test_action] != q_S_A_2[test_state][test_action]
Example #18
def test_policy_iteration():

    # state-value policy iteration
    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    mdp_agent_v_pi = ActionValueMdpAgent(
        'test', random_state, 1,
        TabularStateActionValueEstimator(mdp_environment, None, None))
    iterate_policy_v_pi(mdp_agent_v_pi, mdp_environment, 0.001, True)

    # action-value policy iteration
    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    mdp_agent_q_pi = ActionValueMdpAgent(
        'test', random_state, 1,
        TabularStateActionValueEstimator(mdp_environment, None, None))

    iterate_policy_q_pi(mdp_agent_q_pi, mdp_environment, 0.001, True)

    # should get the same policy
    assert mdp_agent_v_pi.pi == mdp_agent_q_pi.pi
Example #19
def test_evaluate_v_pi():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    mdp_agent = ActionValueMdpAgent(
        'test', random_state, 1,
        TabularStateActionValueEstimator(mdp_environment, None, None))
    v_pi, _ = evaluate_v_pi(agent=mdp_agent,
                            environment=mdp_environment,
                            theta=0.001,
                            num_iterations=None,
                            update_in_place=True)

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    mdp_agent = ActionValueMdpAgent(
        'test', random_state, 1,
        TabularStateActionValueEstimator(mdp_environment, None, None))
    v_pi_not_in_place, _ = evaluate_v_pi(agent=mdp_agent,
                                         environment=mdp_environment,
                                         theta=0.001,
                                         num_iterations=None,
                                         update_in_place=False)

    assert list(v_pi.keys()) == list(v_pi_not_in_place.keys())

    np.testing.assert_allclose(list(v_pi.values()),
                               list(v_pi_not_in_place.values()),
                               atol=0.01)

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_iterative_policy_evaluation_of_state_value.pickle', 'wb') as file:
    #     pickle.dump(v_pi, file)

    with open(
            f'{os.path.dirname(__file__)}/fixtures/test_iterative_policy_evaluation_of_state_value.pickle',
            'rb') as file:
        fixture = pickle.load(file)

    assert v_pi == fixture
Example #20
def test_agent_invalid_action():

    random = RandomState()
    agent = ActionValueMdpAgent(
        'foo', random, 1.0,
        TabularStateActionValueEstimator(Gridworld.example_4_1(random, None),
                                         None, None))

    # test None action
    agent.__act__ = lambda t: None

    with pytest.raises(ValueError, match='Agent returned action of None'):
        agent.act(0)

    # test infeasible action
    action = Action(1, 'foo')
    agent.__act__ = lambda t: action
    state = MdpState(1, [], False)
    agent.sense(state, 0)
    with pytest.raises(
            ValueError,
            match=f'Action {action} is not feasible in state {state}'):
        agent.act(0)
Example #21
def test_invalid_epsilon_iterate_value_q_pi():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    mdp_agent = ActionValueMdpAgent(
        'test', random_state, 1,
        TabularStateActionValueEstimator(mdp_environment, 0.0, None))

    with pytest.raises(ValueError,
                       match='epsilon must be strictly > 0 for TD-learning'):
        iterate_value_q_pi(agent=mdp_agent,
                           environment=mdp_environment,
                           num_improvements=10,
                           num_episodes_per_improvement=100,
                           num_updates_per_improvement=None,
                           alpha=0.1,
                           mode=Mode.Q_LEARNING,
                           n_steps=3,
                           planning_environment=None,
                           make_final_policy_greedy=False)
Example #22
    def train_thread_target():

        random_state = RandomState(12345)
        mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
        mdp_agent = ActionValueMdpAgent(
            'test', random_state, 1,
            TabularStateActionValueEstimator(mdp_environment, 0.1, None))

        iterate_value_q_pi(agent=mdp_agent,
                           environment=mdp_environment,
                           num_improvements=1000000,
                           num_episodes_per_improvement=10,
                           num_updates_per_improvement=None,
                           alpha=0.1,
                           mode=Mode.SARSA,
                           n_steps=None,
                           planning_environment=None,
                           make_final_policy_greedy=False,
                           thread_manager=thread_manager,
                           num_improvements_per_plot=10)
Example #23
def test_iterate_value_q_pi_with_pdf():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    mdp_agent = ActionValueMdpAgent(
        'test', random_state, 1,
        TabularStateActionValueEstimator(mdp_environment, 0.05, None))

    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=10,
        num_episodes_per_improvement=100,
        num_updates_per_improvement=None,
        alpha=0.1,
        mode=Mode.Q_LEARNING,
        n_steps=1,
        planning_environment=None,
        make_final_policy_greedy=False,
        num_improvements_per_plot=5,
        pdf_save_path=tempfile.NamedTemporaryFile(delete=False).name)
Example #24
def test_gamblers_problem():

    random_state = RandomState(12345)
    mdp_environment: GamblersProblem = GamblersProblem(
        'gamblers problem', random_state=random_state, T=None, p_h=0.4)
    mdp_agent_v_pi_value_iteration = ActionValueMdpAgent(
        'test', random_state, 1,
        TabularStateActionValueEstimator(mdp_environment, None, None))

    v_pi = iterate_value_v_pi(mdp_agent_v_pi_value_iteration, mdp_environment,
                              0.001, 1, True)

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_gamblers_problem.pickle', 'wb') as file:
    #     pickle.dump(v_pi, file)

    with open(
            f'{os.path.dirname(__file__)}/fixtures/test_gamblers_problem.pickle',
            'rb') as file:
        fixture = pickle.load(file)

    assert v_pi == fixture
Example #25
def test_evaluate_q_pi_invalid_n_steps():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    epsilon = 0.05
    mdp_agent = ActionValueMdpAgent(
        'test',
        random_state,
        1,
        TabularStateActionValueEstimator(mdp_environment, epsilon, None)
    )

    with pytest.raises(ValueError):
        evaluate_q_pi(
            agent=mdp_agent,
            environment=mdp_environment,
            num_episodes=5,
            num_updates_per_improvement=None,
            alpha=0.1,
            mode=Mode.Q_LEARNING,
            n_steps=-1,
            planning_environment=None
        )
Example #26
def test_q_learning_iterate_value_q_pi_function_approximation_invalid_formula():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, 20)
    q_S_A = ApproximateStateActionValueEstimator(
        mdp_environment, 0.05,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment),
        f'C(s, levels={[s.i for s in mdp_environment.SS]}):C(a, levels={[a.i for a in mdp_environment.SS[0].AA]})',
        False, None, None)
    mdp_agent = ActionValueMdpAgent('test', random_state, 1, q_S_A)

    with pytest.raises(ValueError, match='Invalid combination of formula'):
        iterate_value_q_pi(agent=mdp_agent,
                           environment=mdp_environment,
                           num_improvements=5,
                           num_episodes_per_improvement=5,
                           num_updates_per_improvement=None,
                           alpha=None,
                           mode=Mode.Q_LEARNING,
                           n_steps=None,
                           planning_environment=None,
                           make_final_policy_greedy=False)
Example #27
def test_evaluate_v_pi():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    mdp_agent = ActionValueMdpAgent(
        'test', random_state, 1,
        TabularStateActionValueEstimator(mdp_environment, None, None))

    v_pi = evaluate_v_pi(agent=mdp_agent,
                         environment=mdp_environment,
                         num_episodes=1000)

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_monte_carlo_evaluation_of_state_value.pickle', 'wb') as file:
    #     pickle.dump(v_pi, file)

    with open(
            f'{os.path.dirname(__file__)}/fixtures/test_monte_carlo_evaluation_of_state_value.pickle',
            'rb') as file:
        fixture = pickle.load(file)

    assert v_pi == fixture
Example #28
def evaluate_v_pi(
        agent: ActionValueMdpAgent,
        environment: MdpEnvironment,
        num_episodes: int
) -> Dict[MdpState, float]:
    """
    Perform Monte Carlo evaluation of an agent's policy within an environment, returning state values. Uses a random
    action on the first time step to maintain exploration (exploring starts). This evaluation approach is only
    marginally useful in practice, as the state-value estimates require a model of the environmental dynamics (i.e.,
    the transition-reward probability distribution) in order to be applied. See `evaluate_q_pi` in this module for a
    more feature-rich and useful evaluation approach (i.e., state-action value estimation). This evaluation function
    operates over rewards obtained at the end of episodes, so it is only appropriate for episodic tasks.

    :param agent: Agent.
    :param environment: Environment.
    :param num_episodes: Number of episodes to execute.
    :return: Dictionary of MDP states and their estimated values under the agent's policy.
    """

    logging.info(f'Running Monte Carlo evaluation of v_pi for {num_episodes} episode(s).')

    v_pi: Dict[MdpState, IncrementalSampleAverager] = {
        terminal_state: IncrementalSampleAverager()
        for terminal_state in environment.terminal_states
    }

    episodes_per_print = max(1, int(num_episodes * 0.05))  # guard against a zero print interval for small episode counts
    for episode_i in range(num_episodes):

        # start the environment in a random state
        state = environment.reset_for_new_run(agent)
        agent.reset_for_new_run(state)

        # simulate until episode termination, keeping a trace of states and their immediate rewards, as well as the
        # times of their first visits.
        t = 0
        state_first_t = {}
        t_state_reward = []
        while not state.terminal and (environment.T is None or t < environment.T):

            if state not in state_first_t:
                state_first_t[state] = t

            if t == 0:
                a = sample_list_item(state.AA, None, environment.random_state)
            else:
                a = agent.act(t)

            next_state, reward = environment.advance(state, t, a, agent)
            t_state_reward.append((t, state, reward))
            state = next_state
            t += 1

            agent.sense(state, t)

        # work backwards through the trace to calculate discounted returns. working backward ensures that the return g
        # at each time step t is properly discounted.
        g = 0
        for t, state, reward in reversed(t_state_reward):

            g = agent.gamma * g + reward.r

            # if the current time step was the first visit to the state, then g is the discounted sample value. add it
            # to our average.
            if state_first_t[state] == t:

                if state not in v_pi:
                    v_pi[state] = IncrementalSampleAverager()

                v_pi[state].update(g)

        episodes_finished = episode_i + 1
        if episodes_finished % episodes_per_print == 0:
            logging.info(f'Finished {episodes_finished} of {num_episodes} episode(s).')

    return {
        s: v_pi[s].get_value()
        for s in v_pi
    }
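
# A minimal, standalone sketch of the backward first-visit return computation used in evaluate_v_pi above. The values
# below (gamma, the trace, and the state names) are hypothetical and not part of the rlai API: the return recursion is
# G_t = R_{t+1} + gamma * G_{t+1}, and a state's sample value is the return at the time of its first visit.
gamma = 0.5
t_state_reward = [(0, 's0', 0.0), (1, 's1', 0.0), (2, 's2', 1.0)]  # (t, state, reward) trace of one episode
state_first_t = {'s0': 0, 's1': 1, 's2': 2}  # time of each state's first visit
first_visit_returns = {}
g = 0.0
for t, state, reward in reversed(t_state_reward):
    g = gamma * g + reward  # discount the return accumulated so far, then add the reward
    if state_first_t[state] == t:  # only the first visit contributes a sample value
        first_visit_returns[state] = g
assert first_visit_returns == {'s2': 1.0, 's1': 0.5, 's0': 0.25}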
Example #29
def evaluate_q_pi(
    agent: ActionValueMdpAgent, environment: MdpEnvironment, num_episodes: int,
    num_updates_per_improvement: Optional[int], alpha: Optional[float],
    mode: Mode, n_steps: Optional[int],
    planning_environment: Optional[MdpPlanningEnvironment]
) -> Tuple[Set[MdpState], float]:
    """
    Perform temporal-difference (TD) evaluation of an agent's policy within an environment, returning state-action
    values. This evaluation function implements both on-policy TD learning (SARSA) and off-policy TD learning
    (Q-learning and expected SARSA); n-step updates are implemented for all learning modes.

    :param agent: Agent containing target policy to be optimized.
    :param environment: Environment.
    :param num_episodes: Number of episodes to execute.
    :param num_updates_per_improvement: Number of state-action value updates to execute for each iteration of policy
    improvement, or None for policy improvement per specified number of episodes.
    :param alpha: Constant step size to use when updating Q-values, or None for 1/n step size.
    :param mode: Evaluation mode (see `rlai.gpi.temporal_difference.evaluation.Mode`).
    :param n_steps: Number of steps to accumulate rewards before updating estimated state-action values. Must be in the
    range [1, inf], or None for an infinite number of steps (Monte Carlo evaluation).
    :param planning_environment: Planning environment to learn through experience gained during evaluation, or None to
    not learn an environment model.
    :return: 2-tuple of (1) set of only those states that were evaluated, and (2) the average reward obtained per
    episode.
    """

    if n_steps is not None and n_steps < 1:
        raise ValueError(
            'The value of n_steps must be in range [1, inf], or None.')

    logging.info(
        f'Running temporal-difference evaluation of q_pi for {num_episodes} episode(s).'
    )

    evaluated_states = set()

    planning = isinstance(environment, MdpPlanningEnvironment)

    # prioritized sampling requires access to the bootstrapped state-action value function, and it also requires
    # access to the state-action value estimators.
    if isinstance(environment, PrioritizedSweepingMdpPlanningEnvironment):
        environment.bootstrap_function = partial(
            get_bootstrapped_state_action_value,
            mode=mode,
            agent=agent,
            q_S_A=agent.q_S_A,
            environment=environment)
        environment.q_S_A = agent.q_S_A

    # run episodes
    episode_reward_averager = IncrementalSampleAverager()
    episodes_per_print = max(1, int(num_episodes * 0.05))
    for episode_i in range(num_episodes):

        # reset the environment for the new run, and reset the agent accordingly.
        curr_state = environment.reset_for_new_run(agent)
        agent.reset_for_new_run(curr_state)

        # simulate until episode termination. begin by taking an action in the first state.
        curr_t = 0
        curr_a = agent.act(curr_t)
        total_reward = 0.0
        # dictionary from time steps to tuples of state, action, and truncated return
        t_state_a_g: Dict[int, Tuple[MdpState, Action, float]] = {}
        next_state_q_s_a = 0.0
        while not curr_state.terminal and (environment.T is None
                                           or curr_t < environment.T):

            advance_result, next_reward = environment.advance(state=curr_state,
                                                              t=curr_t,
                                                              a=curr_a,
                                                              agent=agent)

            logging.debug(f'Obtained reward:  {next_reward}\n')

            # in the case of a planning-based advancement, the planning environment returns a 3-tuple of the current
            # state, current action, and next state. this is because the planning environment may revise any one of
            # these variables to conduct the planning process (e.g., by prioritized sweeping).
            if planning:
                curr_state, curr_a, next_state = advance_result
            else:
                next_state = advance_result

            next_t = curr_t + 1
            agent.sense(next_state, next_t)

            # if we're building an environment model, then update it with the transition we just observed.
            if planning_environment is not None:
                planning_environment.model.update(curr_state, curr_a,
                                                  next_state, next_reward)

            # initialize the n-step, truncated return accumulator at the current time for the current state and action.
            t_state_a_g[curr_t] = (curr_state, curr_a, 0.0)

            # ask the agent to shape the reward, returning the time steps whose returns should be updated and the shaped
            # reward associated with each. if n_steps is None, then shape the reward all the way back to the start
            # (equivalent to infinite n_steps, or monte carlo returns). if n_steps is not None, then shape the reward
            # for n-step updates.
            if n_steps is None:
                first_t = 0
            else:
                # in 1-step td, the earliest time step is the final time step; in 2-step, the earliest time step is the
                # prior time step, etc.
                first_t = max(0, curr_t - n_steps + 1)
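                # worked example (hypothetical values): with n_steps=2 and curr_t=5, first_t = max(0, 5 - 2 + 1) = 4,
                # so the shaped reward below is credited to the returns at time steps 4 and 5 only.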

            t_shaped_reward = agent.shape_reward(reward=next_reward,
                                                 first_t=first_t,
                                                 final_t=curr_t)

            # update return accumulators with shaped rewards
            t_state_a_g.update({
                return_t: (t_state_a_g[return_t][0], t_state_a_g[return_t][1],
                           t_state_a_g[return_t][2] + shaped_reward)
                for return_t, shaped_reward in t_shaped_reward.items()

                # reward shapers might return invalid time steps. ignore these.
                if return_t in t_state_a_g
            })

            # get the next state's bootstrapped value and next action, based on the bootstrapping mode. note that the
            # bootstrapped next state-action value is only used if we're performing n-step updates below.
            next_state_q_s_a, next_a = get_bootstrapped_state_action_value(
                state=next_state,
                t=next_t,
                mode=mode,
                agent=agent,
                q_S_A=agent.q_S_A,
                environment=environment)

            # only update if n_steps is finite (not monte carlo)
            if n_steps is not None:
                update_q_S_A(
                    q_S_A=agent.q_S_A,
                    n_steps=n_steps,
                    curr_t=curr_t,
                    t_state_a_g=t_state_a_g,
                    agent=agent,
                    next_state_q_s_a=next_state_q_s_a,
                    alpha=alpha,
                    evaluated_states=evaluated_states,
                    planning_environment=planning_environment,
                    num_updates_per_improvement=num_updates_per_improvement)

            # advance the episode
            curr_t = next_t
            curr_state = next_state
            curr_a = next_a

            total_reward += next_reward.r

        # flush out the remaining n-step updates. if we terminated because we reached a terminal state, then all next-
        # state values for the updates are zero. if instead we terminated because we reached the maximum number of time
        # steps, then use the bootstrapped next-state value instead. this is an important distinction because these
        # values can be dramatically different, and when discounting is not used (or is very small), the resulting value
        # estimates can be correspondingly different.
        if curr_state.terminal:
            next_state_q_s_a = 0.0

        flush_n_steps = len(t_state_a_g) + 1
        while len(t_state_a_g) > 0:
            update_q_S_A(
                q_S_A=agent.q_S_A,
                n_steps=flush_n_steps,
                curr_t=curr_t,
                t_state_a_g=t_state_a_g,
                agent=agent,
                next_state_q_s_a=next_state_q_s_a,
                alpha=alpha,
                evaluated_states=evaluated_states,
                planning_environment=planning_environment,
                num_updates_per_improvement=num_updates_per_improvement)
            curr_t += 1

        episode_reward_averager.update(total_reward)

        episodes_finished = episode_i + 1
        if episodes_finished % episodes_per_print == 0:
            logging.info(
                f'Finished {episodes_finished} of {num_episodes} episode(s).')

    return evaluated_states, episode_reward_averager.get_value()
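
# A minimal, standalone sketch of the bootstrapped next state-action values that distinguish the evaluation modes named
# in the docstring of evaluate_q_pi above. The Q-values, policy probabilities, and reward below are hypothetical and
# not the rlai implementation: SARSA bootstraps on the action the policy actually selects in the next state, Q-learning
# on the greedy action, and expected SARSA on the policy's expectation over actions.
next_q = {'left': 1.0, 'right': 3.0}   # Q(s', a) for each action feasible in s'
pi_next = {'left': 0.5, 'right': 0.5}  # pi(a | s'), used by expected SARSA
selected_a = 'left'                    # action selected by the policy in s'

sarsa_value = next_q[selected_a]                                    # 1.0
q_learning_value = max(next_q.values())                             # 3.0
expected_sarsa_value = sum(pi_next[a] * next_q[a] for a in next_q)  # 2.0

# one-step TD target for Q(s, a) with reward r and discount gamma; an n-step target accumulates n rewards before
# adding the discounted bootstrapped value.
r, gamma = 1.0, 0.5
td_target = r + gamma * sarsa_value  # swap in the mode's bootstrapped value
assert td_target == 1.5
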

def test_policy_overrides():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, 20)

    epsilon = 0.05

    q_S_A = ApproximateStateActionValueEstimator(
        mdp_environment, epsilon,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment), None, False, None, None)

    mdp_agent = ActionValueMdpAgent('test', random_state, 1, q_S_A)

    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=10,
        num_episodes_per_improvement=20,
        num_updates_per_improvement=None,
        alpha=None,
        mode=Mode.Q_LEARNING,
        n_steps=None,
        planning_environment=None,
        make_final_policy_greedy=True,
    )

    random_state = RandomState(12345)

    mdp_environment_2: Gridworld = Gridworld.example_4_1(random_state, 20)

    q_S_A_2 = ApproximateStateActionValueEstimator(
        mdp_environment_2, epsilon,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment_2), None, False, None, None)

    mdp_agent_2 = ActionValueMdpAgent('test', random_state, 1, q_S_A_2)

    iterate_value_q_pi(agent=mdp_agent_2,
                       environment=mdp_environment_2,
                       num_improvements=10,
                       num_episodes_per_improvement=20,
                       num_updates_per_improvement=None,
                       alpha=None,
                       mode=Mode.Q_LEARNING,
                       n_steps=None,
                       planning_environment=None,
                       make_final_policy_greedy=True)

    assert isinstance(mdp_agent_2.most_recent_state, MdpState) and mdp_agent_2.most_recent_state in mdp_agent_2.pi

    with pytest.raises(ValueError,
                       match='Attempted to check for None in policy.'):
        # noinspection PyTypeChecker
        if None in mdp_agent_2.pi:  # pragma no cover
            pass

    assert mdp_agent.pi == mdp_agent_2.pi
    assert not (mdp_agent.pi != mdp_agent_2.pi)