Example #1
def test_value_iteration():

    # run policy iteration on v_pi
    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    mdp_agent_v_pi_policy_iteration = ActionValueMdpAgent(
        'test', random_state, 1,
        TabularStateActionValueEstimator(mdp_environment, None, None))
    iterate_policy_v_pi(mdp_agent_v_pi_policy_iteration, mdp_environment,
                        0.001, True)

    # run value iteration on v_pi
    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    mdp_agent_v_pi_value_iteration = ActionValueMdpAgent(
        'test', random_state, 1,
        TabularStateActionValueEstimator(mdp_environment, None, None))
    iterate_value_v_pi(mdp_agent_v_pi_value_iteration, mdp_environment, 0.001,
                       1, True)

    assert mdp_agent_v_pi_policy_iteration.pi == mdp_agent_v_pi_value_iteration.pi

    # run value iteration on q_pi
    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    mdp_agent_q_pi_value_iteration = ActionValueMdpAgent(
        'test', random_state, 1,
        TabularStateActionValueEstimator(mdp_environment, None, None))
    iterate_value_q_pi(mdp_agent_q_pi_value_iteration, mdp_environment, 0.001,
                       1, True)

    assert mdp_agent_q_pi_value_iteration.pi == mdp_agent_v_pi_policy_iteration.pi
Example #2
def test_sarsa_iterate_value_q_pi_with_trajectory_planning():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    q_S_A = TabularStateActionValueEstimator(mdp_environment, 0.05, None)
    mdp_agent = ActionValueMdpAgent('test', random_state, 1, q_S_A)

    planning_environment = TrajectorySamplingMdpPlanningEnvironment(
        'test planning', random_state, StochasticEnvironmentModel(), 10, None)

    iterate_value_q_pi(agent=mdp_agent,
                       environment=mdp_environment,
                       num_improvements=100,
                       num_episodes_per_improvement=1,
                       num_updates_per_improvement=None,
                       alpha=0.1,
                       mode=Mode.SARSA,
                       n_steps=1,
                       planning_environment=planning_environment,
                       make_final_policy_greedy=True)

    # uncomment the following lines and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_td_iteration_of_value_q_pi_planning.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(
            f'{os.path.dirname(__file__)}/fixtures/test_td_iteration_of_value_q_pi_planning.pickle',
            'rb') as file:
        pi_fixture, q_S_A_fixture = pickle.load(file)

    assert tabular_pi_legacy_eq(mdp_agent.pi, pi_fixture)
    assert tabular_estimator_legacy_eq(q_S_A, q_S_A_fixture)
Example #3
def test_iterate_value_q_pi():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    q_S_A = TabularStateActionValueEstimator(mdp_environment, 0.1, None)

    mdp_agent = StochasticMdpAgent('test', random_state,
                                   q_S_A.get_initial_policy(), 1)

    iterate_value_q_pi(agent=mdp_agent,
                       environment=mdp_environment,
                       num_improvements=3000,
                       num_episodes_per_improvement=1,
                       update_upon_every_visit=False,
                       planning_environment=None,
                       make_final_policy_greedy=False,
                       q_S_A=q_S_A)

    # uncomment the following lines and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_monte_carlo_iteration_of_value_q_pi.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(
            f'{os.path.dirname(__file__)}/fixtures/test_monte_carlo_iteration_of_value_q_pi.pickle',
            'rb') as file:
        pi_fixture, q_S_A_fixture = pickle.load(file)

    assert tabular_pi_legacy_eq(mdp_agent.pi, pi_fixture)
    assert tabular_estimator_legacy_eq(q_S_A, q_S_A_fixture)
Example #4
def test_q_learning_iterate_value_q_pi_function_approximation_with_formula():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, 20)
    q_S_A = ApproximateStateActionValueEstimator(
        mdp_environment, 0.05,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        StateActionIdentityFeatureExtractor(mdp_environment),
        f'C(s, levels={[s.i for s in mdp_environment.SS]}):C(a, levels={[a.i for a in mdp_environment.SS[0].AA]})',
        False, None, None)
    mdp_agent = ActionValueMdpAgent('test', random_state, 1, q_S_A)

    iterate_value_q_pi(agent=mdp_agent,
                       environment=mdp_environment,
                       num_improvements=5,
                       num_episodes_per_improvement=5,
                       num_updates_per_improvement=None,
                       alpha=None,
                       mode=Mode.Q_LEARNING,
                       n_steps=None,
                       planning_environment=None,
                       make_final_policy_greedy=False)

    # uncomment the following lines and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_q_learning_iterate_value_q_pi_function_approximation.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(
            f'{os.path.dirname(__file__)}/fixtures/test_q_learning_iterate_value_q_pi_function_approximation.pickle',
            'rb') as file:
        pi_fixture, q_S_A_fixture = pickle.load(file)

    assert np.allclose(mdp_agent.pi.estimator.model.model.coef_,
                       pi_fixture.estimator.model.model.coef_)
Example #5
def test_evaluate_q_pi():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    q_S_A = TabularStateActionValueEstimator(mdp_environment, None, None)
    mdp_agent = ActionValueMdpAgent('test', random_state, 1, q_S_A)

    evaluated_states, _ = evaluate_q_pi(agent=mdp_agent,
                                        environment=mdp_environment,
                                        num_episodes=1000,
                                        exploring_starts=True,
                                        update_upon_every_visit=False)

    # terminal states aren't evaluated
    assert len(q_S_A) == len(evaluated_states) + 2
    assert all(s in q_S_A for s in evaluated_states)

    # uncomment the following lines and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_monte_carlo_evaluation_of_state_action_value.pickle', 'wb') as file:
    #     pickle.dump(q_S_A, file)

    with open(
            f'{os.path.dirname(__file__)}/fixtures/test_monte_carlo_evaluation_of_state_action_value.pickle',
            'rb') as file:
        fixture = pickle.load(file)

    assert tabular_estimator_legacy_eq(q_S_A, fixture)
Example #6
def test_iterate_value_q_pi_with_pdf():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    q_S_A = TabularStateActionValueEstimator(mdp_environment, 0.1, None)

    mdp_agent = StochasticMdpAgent('test', random_state,
                                   q_S_A.get_initial_policy(), 1)

    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=3000,
        num_episodes_per_improvement=1,
        update_upon_every_visit=False,
        planning_environment=None,
        make_final_policy_greedy=False,
        q_S_A=q_S_A,
        num_improvements_per_plot=1500,
        pdf_save_path=tempfile.NamedTemporaryFile(delete=False).name)

    with pytest.raises(ValueError, match='Epsilon must be >= 0'):
        q_S_A.epsilon = -1.0
        q_S_A.improve_policy(mdp_agent,
                             states=None,
                             event=PolicyImprovementEvent.MAKING_POLICY_GREEDY)

    q_S_A.epsilon = 0.0
    assert q_S_A.improve_policy(
        mdp_agent, None, PolicyImprovementEvent.MAKING_POLICY_GREEDY) == 14
Example #7
def test_iterate_value_q_pi_with_pdf():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    q_S_A = TabularStateActionValueEstimator(mdp_environment, 0.05, None)

    mdp_agent = StochasticMdpAgent('test', random_state,
                                   q_S_A.get_initial_policy(), 1)

    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=10,
        num_episodes_per_improvement=100,
        num_updates_per_improvement=None,
        alpha=0.1,
        mode=Mode.Q_LEARNING,
        n_steps=1,
        planning_environment=None,
        make_final_policy_greedy=False,
        q_S_A=q_S_A,
        num_improvements_per_plot=5,
        pdf_save_path=tempfile.NamedTemporaryFile(delete=False).name)
Example #8
def test_n_step_q_learning_iterate_value_q_pi():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    q_S_A = TabularStateActionValueEstimator(mdp_environment, 0.05, None)

    mdp_agent = StochasticMdpAgent('test', random_state,
                                   q_S_A.get_initial_policy(), 1)

    iterate_value_q_pi(agent=mdp_agent,
                       environment=mdp_environment,
                       num_improvements=10,
                       num_episodes_per_improvement=100,
                       num_updates_per_improvement=None,
                       alpha=0.1,
                       mode=Mode.Q_LEARNING,
                       n_steps=3,
                       planning_environment=None,
                       make_final_policy_greedy=False,
                       q_S_A=q_S_A)

    # uncomment the following lines and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_td_n_step_q_learning_iteration_of_value_q_pi.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(
            f'{os.path.dirname(__file__)}/fixtures/test_td_n_step_q_learning_iteration_of_value_q_pi.pickle',
            'rb') as file:
        fixture_pi, fixture_q_S_A = pickle.load(file)

    assert tabular_pi_legacy_eq(mdp_agent.pi, fixture_pi)
    assert tabular_estimator_legacy_eq(q_S_A, fixture_q_S_A)
Example #9
def test_q_learning_iterate_value_q_pi_function_approximation_invalid_formula():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, 20)

    q_S_A = ApproximateStateActionValueEstimator(
        mdp_environment, 0.05,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment),
        f'C(s, levels={[s.i for s in mdp_environment.SS]}):C(a, levels={[a.i for a in mdp_environment.SS[0].AA]})',
        False, None, None)

    mdp_agent = StochasticMdpAgent('test', random_state,
                                   q_S_A.get_initial_policy(), 1)

    with pytest.raises(ValueError, match='Invalid combination of formula'):
        iterate_value_q_pi(agent=mdp_agent,
                           environment=mdp_environment,
                           num_improvements=5,
                           num_episodes_per_improvement=5,
                           num_updates_per_improvement=None,
                           alpha=None,
                           mode=Mode.Q_LEARNING,
                           n_steps=None,
                           planning_environment=None,
                           make_final_policy_greedy=False,
                           q_S_A=q_S_A)
Example #10
def test_human_agent():

    agent = Human()

    a1 = Action(0, 'Foo')
    a2 = Action(1, 'Bar')

    state = MdpState(1, [a1, a2], False)
    agent.sense(state, 0)

    call_num = 0

    def mock_input(*_) -> str:

        nonlocal call_num
        if call_num == 0:
            call_num += 1
            return 'asdf'
        else:
            return 'Bar'

    agent.get_input = mock_input  # MagicMock(return_value='Bar')

    assert agent.act(0) == a2

    with pytest.raises(NotImplementedError):
        rng = RandomState(12345)
        Human.init_from_arguments([], rng, Gridworld.example_4_1(rng, None))
Example #11
def test_check_marginal_probabilities():

    random = RandomState()
    gridworld = Gridworld.example_4_1(random, None)
    gridworld.p_S_prime_R_given_S_A[gridworld.SS[0]][gridworld.a_left][gridworld.SS[0]][Reward(1, -1)] = 1.0

    with pytest.raises(ValueError, match='Expected next-state/next-reward marginal probability of 1.0, but got 2.0'):
        gridworld.check_marginal_probabilities()
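
The invariant exercised above is that, for every state-action pair, the next-state/next-reward probabilities must sum to 1.0; the test perturbs one entry so that the marginal becomes 2.0 and the check raises. The sketch below spells out that check against the nested p_S_prime_R_given_S_A dictionary used throughout these examples; the helper name and tolerance are hypothetical, not part of the library.

def assert_marginals_sum_to_one(gridworld, tolerance=1e-9):
    # hypothetical helper: for each (s, a), the probabilities p(s', r | s, a)
    # summed over all next states s' and rewards r must equal 1.0
    for s in gridworld.p_S_prime_R_given_S_A:
        for a in gridworld.p_S_prime_R_given_S_A[s]:
            marginal = sum(
                p
                for s_prime in gridworld.p_S_prime_R_given_S_A[s][a]
                for p in gridworld.p_S_prime_R_given_S_A[s][a][s_prime].values()
            )
            assert abs(marginal - 1.0) < tolerance, (
                'Expected next-state/next-reward marginal probability of 1.0, '
                f'but got {marginal}'
            )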
Example #12
def test_off_policy_monte_carlo_with_function_approximation():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    q_S_A = ApproximateStateActionValueEstimator(
        mdp_environment,
        0.05,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment),
        None,
        False,
        None,
        None
    )

    # target agent
    mdp_agent = ActionValueMdpAgent(
        'test',
        random_state,
        1,
        q_S_A
    )

    # episode generation (behavior) policy
    off_policy_agent = ActionValueMdpAgent(
        'test',
        random_state,
        1,
        TabularStateActionValueEstimator(mdp_environment, None, None)
    )

    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=100,
        num_episodes_per_improvement=1,
        update_upon_every_visit=True,
        planning_environment=None,
        make_final_policy_greedy=False,
        off_policy_agent=off_policy_agent
    )

    # uncomment the following lines and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_off_policy_monte_carlo_with_function_approximationo.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_off_policy_monte_carlo_with_function_approximationo.pickle', 'rb') as file:
        pi_fixture, q_S_A_fixture = pickle.load(file)

    assert mdp_agent.pi == pi_fixture and q_S_A == q_S_A_fixture
    assert str(mdp_agent.pi.estimator[mdp_environment.SS[5]][mdp_environment.SS[5].AA[1]]).startswith('-2.4305')

    # make greedy
    q_S_A.epsilon = 0.0
    assert q_S_A.improve_policy(mdp_agent, None, PolicyImprovementEvent.MAKING_POLICY_GREEDY) == -1
    assert mdp_agent.pi.estimator.epsilon == 0.0
Example #13
def test_value_iteration():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    # run policy iteration on v_pi
    mdp_agent_v_pi_policy_iteration = StochasticMdpAgent(
        'test',
        random_state,
        TabularPolicy(None, mdp_environment.SS),
        1
    )

    iterate_policy_v_pi(
        mdp_agent_v_pi_policy_iteration,
        mdp_environment,
        0.001,
        True
    )

    # run value iteration on v_pi
    mdp_agent_v_pi_value_iteration = StochasticMdpAgent(
        'test',
        random_state,
        TabularPolicy(None, mdp_environment.SS),
        1
    )

    iterate_value_v_pi(
        mdp_agent_v_pi_value_iteration,
        mdp_environment,
        0.001,
        1,
        True
    )

    assert mdp_agent_v_pi_policy_iteration.pi == mdp_agent_v_pi_value_iteration.pi

    # run value iteration on q_pi
    mdp_agent_q_pi_value_iteration = StochasticMdpAgent(
        'test',
        random_state,
        TabularPolicy(None, mdp_environment.SS),
        1
    )

    iterate_value_q_pi(
        mdp_agent_q_pi_value_iteration,
        mdp_environment,
        0.001,
        1,
        True
    )

    assert mdp_agent_q_pi_value_iteration.pi == mdp_agent_v_pi_policy_iteration.pi
Example #14
def main():

    random = RandomState(12345)
    gridworld = Gridworld.example_4_1(random, None)

    # the bottom-right corner (3,3) is a goal state. get the states surrounding
    # this goal. these will become the sticky states.
    sticky_states = [
        gridworld.grid[2, 2], gridworld.grid[2, 3], gridworld.grid[3, 2]
    ]

    # amplify all negative rewards in the sticky states by a factor of 10, keeping the probabilities the same.
    for sticky_state in sticky_states:
        for a in gridworld.p_S_prime_R_given_S_A[sticky_state]:
            for s_prime in gridworld.p_S_prime_R_given_S_A[sticky_state][a]:
                gridworld.p_S_prime_R_given_S_A[sticky_state][a][s_prime] = {
                    Reward(r.i, (r.r * 10.0 if r.r < 0.0 else r.r)): p
                    for r, p in gridworld.p_S_prime_R_given_S_A[sticky_state][a][s_prime].items()
                }

    epsilon = 0.1

    q_S_A = TabularStateActionValueEstimator(
        environment=gridworld,
        epsilon=epsilon,
        continuous_state_discretization_resolution=None)

    pi = q_S_A.get_initial_policy()

    mdp_agent = StochasticMdpAgent(name='agent',
                                   random_state=random,
                                   pi=pi,
                                   gamma=1.0)

    # iterate the agent's policy using Q-learning temporal differencing
    iterate_value_q_pi(agent=mdp_agent,
                       environment=gridworld,
                       num_improvements=20,
                       num_episodes_per_improvement=100,
                       num_updates_per_improvement=None,
                       alpha=None,
                       mode=Mode.Q_LEARNING,
                       n_steps=None,
                       planning_environment=None,
                       make_final_policy_greedy=True,
                       q_S_A=q_S_A,
                       num_improvements_per_plot=20)

    for s in pi:
        print(f'State {s.i}:')
        for a in pi[s]:
            if pi[s][a] > 0.0:
                print(f'\tPr({a.name}):  {pi[s][a]}')
Example #15
def test_invalid_improve_policy_with_q_pi():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    epsilon = 0.0
    mdp_agent = ActionValueMdpAgent(
        'test', random_state, 1,
        TabularStateActionValueEstimator(mdp_environment, epsilon, None))

    with pytest.raises(ValueError, match='Epsilon must be >= 0'):
        improve_policy_with_q_pi(mdp_agent, {}, -1)
Example #16
def test_policy_iteration():

    # state-value policy iteration
    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    mdp_agent_v_pi = ActionValueMdpAgent(
        'test', random_state, 1,
        TabularStateActionValueEstimator(mdp_environment, None, None))
    iterate_policy_v_pi(mdp_agent_v_pi, mdp_environment, 0.001, True)

    # action-value policy iteration
    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    mdp_agent_q_pi = ActionValueMdpAgent(
        'test', random_state, 1,
        TabularStateActionValueEstimator(mdp_environment, None, None))

    iterate_policy_q_pi(mdp_agent_q_pi, mdp_environment, 0.001, True)

    # should get the same policy
    assert mdp_agent_v_pi.pi == mdp_agent_q_pi.pi
Example #17
def test_evaluate_v_pi():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    mdp_agent = ActionValueMdpAgent(
        'test', random_state, 1,
        TabularStateActionValueEstimator(mdp_environment, None, None))
    v_pi, _ = evaluate_v_pi(agent=mdp_agent,
                            environment=mdp_environment,
                            theta=0.001,
                            num_iterations=None,
                            update_in_place=True)

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    mdp_agent = ActionValueMdpAgent(
        'test', random_state, 1,
        TabularStateActionValueEstimator(mdp_environment, None, None))
    v_pi_not_in_place, _ = evaluate_v_pi(agent=mdp_agent,
                                         environment=mdp_environment,
                                         theta=0.001,
                                         num_iterations=None,
                                         update_in_place=False)

    assert list(v_pi.keys()) == list(v_pi_not_in_place.keys())

    np.testing.assert_allclose(list(v_pi.values()),
                               list(v_pi_not_in_place.values()),
                               atol=0.01)

    # uncomment the following lines and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_iterative_policy_evaluation_of_state_value.pickle', 'wb') as file:
    #     pickle.dump(v_pi, file)

    with open(
            f'{os.path.dirname(__file__)}/fixtures/test_iterative_policy_evaluation_of_state_value.pickle',
            'rb') as file:
        fixture = pickle.load(file)

    assert v_pi == fixture
Example #18
def test_check_state_and_action_lists():

    random = RandomState(12345)
    gw = Gridworld.example_4_1(random, T=None)
    fex = GridworldFeatureExtractor(gw)

    states = [MdpState(i=None, AA=[], terminal=False)]
    actions = [Action(0)]
    fex.check_state_and_action_lists(states, actions)

    with pytest.raises(ValueError, match='Expected '):
        actions.clear()
        fex.check_state_and_action_lists(states, actions)
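
Judging from the test above, check_state_and_action_lists raises when the state and action lists handed to the feature extractor are out of step (clearing the actions triggers a ValueError whose message starts with 'Expected '). A minimal, hypothetical sketch of that kind of guard follows; the real method lives on the feature extractor, and its exact message is not shown here.

def check_state_and_action_lists(states, actions):
    # hypothetical guard: feature extraction pairs each state with an action,
    # so the two lists must be of equal length
    num_states = len(states)
    num_actions = len(actions)
    if num_states != num_actions:
        raise ValueError(
            'Expected equal-length state and action lists, but got '
            f'{num_states} states and {num_actions} actions.')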
Example #19
def test_q_learning_iterate_value_q_pi_function_approximation_policy_ne():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, 20)

    epsilon = 0.05

    q_S_A_1 = ApproximateStateActionValueEstimator(
        mdp_environment, epsilon,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment), None, False, None, None)

    mdp_agent_1 = StochasticMdpAgent('test', random_state,
                                     q_S_A_1.get_initial_policy(), 1)

    iterate_value_q_pi(agent=mdp_agent_1,
                       environment=mdp_environment,
                       num_improvements=5,
                       num_episodes_per_improvement=10,
                       num_updates_per_improvement=None,
                       alpha=None,
                       mode=Mode.Q_LEARNING,
                       n_steps=None,
                       planning_environment=None,
                       make_final_policy_greedy=True,
                       q_S_A=q_S_A_1)

    q_S_A_2 = ApproximateStateActionValueEstimator(
        mdp_environment, epsilon,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment), None, False, None, None)

    mdp_agent_2 = StochasticMdpAgent('test', random_state,
                                     q_S_A_2.get_initial_policy(), 1)

    iterate_value_q_pi(agent=mdp_agent_2,
                       environment=mdp_environment,
                       num_improvements=5,
                       num_episodes_per_improvement=5,
                       num_updates_per_improvement=None,
                       alpha=None,
                       mode=Mode.Q_LEARNING,
                       n_steps=None,
                       planning_environment=None,
                       make_final_policy_greedy=True,
                       q_S_A=q_S_A_2)

    assert mdp_agent_1.pi.estimator != mdp_agent_2.pi.estimator
    assert mdp_agent_1.pi.estimator.model != mdp_agent_2.pi.estimator.model
Example #20
def test_q_learning_iterate_value_q_pi_tabular_policy_ne():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, 20)

    epsilon = 0.05

    q_S_A_1 = TabularStateActionValueEstimator(mdp_environment, epsilon, None)

    mdp_agent_1 = StochasticMdpAgent('test', random_state,
                                     q_S_A_1.get_initial_policy(), 1)

    iterate_value_q_pi(agent=mdp_agent_1,
                       environment=mdp_environment,
                       num_improvements=5,
                       num_episodes_per_improvement=10,
                       num_updates_per_improvement=None,
                       alpha=None,
                       mode=Mode.Q_LEARNING,
                       n_steps=None,
                       planning_environment=None,
                       make_final_policy_greedy=True,
                       q_S_A=q_S_A_1)

    q_S_A_2 = TabularStateActionValueEstimator(mdp_environment, epsilon, None)

    mdp_agent_2 = StochasticMdpAgent('test', random_state,
                                     q_S_A_2.get_initial_policy(), 1)

    iterate_value_q_pi(agent=mdp_agent_2,
                       environment=mdp_environment,
                       num_improvements=5,
                       num_episodes_per_improvement=5,
                       num_updates_per_improvement=None,
                       alpha=None,
                       mode=Mode.Q_LEARNING,
                       n_steps=None,
                       planning_environment=None,
                       make_final_policy_greedy=True,
                       q_S_A=q_S_A_2)

    test_state = mdp_environment.SS[5]
    test_action = test_state.AA[0]

    assert q_S_A_1 != q_S_A_2
    assert q_S_A_1[test_state] != q_S_A_2[test_state]
    assert q_S_A_1[test_state][test_action] != q_S_A_2[test_state][test_action]
Example #21
def test_invalid_improve_policy_with_q_pi():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    epsilon = 0.0

    q_S_A = TabularStateActionValueEstimator(mdp_environment, epsilon, None)

    # target agent
    mdp_agent = StochasticMdpAgent('test', random_state,
                                   q_S_A.get_initial_policy(), 1)

    with pytest.raises(ValueError, match='Epsilon must be >= 0'):
        improve_policy_with_q_pi(mdp_agent, {}, -1)
Example #22
def test_invalid_iterate_value_q_pi():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    q_S_A = TabularStateActionValueEstimator(mdp_environment, 0.0, None)

    # target agent
    mdp_agent = StochasticMdpAgent('test', random_state,
                                   q_S_A.get_initial_policy(), 1)

    # episode generation (behavior) policy
    off_policy_agent = StochasticMdpAgent('test', random_state,
                                          q_S_A.get_initial_policy(), 1)

    with pytest.raises(
            ValueError,
            match='Planning environments are not currently supported for Monte Carlo iteration.'
    ):
        iterate_value_q_pi(
            agent=mdp_agent,
            environment=mdp_environment,
            num_improvements=100,
            num_episodes_per_improvement=1,
            update_upon_every_visit=True,
            planning_environment=TrajectorySamplingMdpPlanningEnvironment(
                'foo', random_state, StochasticEnvironmentModel(), 100, None),
            make_final_policy_greedy=False,
            q_S_A=q_S_A,
            off_policy_agent=off_policy_agent)

    # test warning...no off-policy agent with epsilon=0.0
    q_S_A.epsilon = 0.0
    iterate_value_q_pi(agent=mdp_agent,
                       environment=mdp_environment,
                       num_improvements=100,
                       num_episodes_per_improvement=1,
                       update_upon_every_visit=True,
                       planning_environment=None,
                       make_final_policy_greedy=False,
                       q_S_A=q_S_A,
                       off_policy_agent=None)
Example #23
def test_q_learning_iterate_value_q_pi_function_approximation_no_formula():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, 20)

    q_S_A = ApproximateStateActionValueEstimator(
        mdp_environment, 0.05,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment), None, False, None, None)

    mdp_agent = StochasticMdpAgent('test', random_state,
                                   q_S_A.get_initial_policy(), 1)

    iterate_value_q_pi(agent=mdp_agent,
                       environment=mdp_environment,
                       num_improvements=10,
                       num_episodes_per_improvement=20,
                       num_updates_per_improvement=None,
                       alpha=None,
                       mode=Mode.Q_LEARNING,
                       n_steps=None,
                       planning_environment=None,
                       make_final_policy_greedy=True,
                       q_S_A=q_S_A)

    # uncomment the following lines and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_q_learning_iterate_value_q_pi_function_approximation_no_formula.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(
            f'{os.path.dirname(__file__)}/fixtures/test_q_learning_iterate_value_q_pi_function_approximation_no_formula.pickle',
            'rb') as file:
        pi_fixture, q_S_A_fixture = pickle.load(file)

    assert np.allclose(mdp_agent.pi.estimator.model.model.coef_,
                       pi_fixture.estimator.model.model.coef_)
    assert (mdp_agent.pi.format_state_action_probs(mdp_environment.SS) ==
            pi_fixture.format_state_action_probs(mdp_environment.SS))
    assert (mdp_agent.pi.format_state_action_values(mdp_environment.SS) ==
            pi_fixture.format_state_action_values(mdp_environment.SS))
Example #24
    def train_thread_target():
        random_state = RandomState(12345)

        mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

        q_S_A = TabularStateActionValueEstimator(mdp_environment, 0.1, None)

        mdp_agent = StochasticMdpAgent('test', random_state,
                                       q_S_A.get_initial_policy(), 1)

        iterate_value_q_pi(agent=mdp_agent,
                           environment=mdp_environment,
                           num_improvements=1000000,
                           num_episodes_per_improvement=10,
                           update_upon_every_visit=False,
                           planning_environment=None,
                           make_final_policy_greedy=False,
                           q_S_A=q_S_A,
                           thread_manager=thread_manager,
                           num_improvements_per_plot=10)
Example #25
    def train_thread_target():

        random_state = RandomState(12345)
        mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
        mdp_agent = ActionValueMdpAgent(
            'test', random_state, 1,
            TabularStateActionValueEstimator(mdp_environment, 0.1, None))

        iterate_value_q_pi(agent=mdp_agent,
                           environment=mdp_environment,
                           num_improvements=1000000,
                           num_episodes_per_improvement=10,
                           num_updates_per_improvement=None,
                           alpha=0.1,
                           mode=Mode.SARSA,
                           n_steps=None,
                           planning_environment=None,
                           make_final_policy_greedy=False,
                           thread_manager=thread_manager,
                           num_improvements_per_plot=10)
Example #26
def test_invalid_epsilon_iterate_value_q_pi():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    mdp_agent = ActionValueMdpAgent(
        'test', random_state, 1,
        TabularStateActionValueEstimator(mdp_environment, 0.0, None))

    with pytest.raises(ValueError,
                       match='epsilon must be strictly > 0 for TD-learning'):
        iterate_value_q_pi(agent=mdp_agent,
                           environment=mdp_environment,
                           num_improvements=10,
                           num_episodes_per_improvement=100,
                           num_updates_per_improvement=None,
                           alpha=0.1,
                           mode=Mode.Q_LEARNING,
                           n_steps=3,
                           planning_environment=None,
                           make_final_policy_greedy=False)
Example #27
def test_policy_iteration():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    # state-value policy iteration
    mdp_agent_v_pi = StochasticMdpAgent(
        'test',
        random_state,
        TabularPolicy(None, mdp_environment.SS),
        1
    )

    iterate_policy_v_pi(
        mdp_agent_v_pi,
        mdp_environment,
        0.001,
        True
    )

    # action-value policy iteration
    mdp_agent_q_pi = StochasticMdpAgent(
        'test',
        random_state,
        TabularPolicy(None, mdp_environment.SS),
        1
    )

    iterate_policy_q_pi(
        mdp_agent_q_pi,
        mdp_environment,
        0.001,
        True
    )

    # should get the same policy
    assert mdp_agent_v_pi.pi == mdp_agent_q_pi.pi
Example #28
def test_agent_invalid_action():

    random = RandomState()
    agent = ActionValueMdpAgent(
        'foo', random, 1.0,
        TabularStateActionValueEstimator(Gridworld.example_4_1(random, None),
                                         None, None))

    # test None action
    agent.__act__ = lambda t: None

    with pytest.raises(ValueError, match='Agent returned action of None'):
        agent.act(0)

    # test infeasible action
    action = Action(1, 'foo')
    agent.__act__ = lambda t: action
    state = MdpState(1, [], False)
    agent.sense(state, 0)
    with pytest.raises(
            ValueError,
            match=f'Action {action} is not feasible in state {state}'):
        agent.act(0)
Example #29
def test_evaluate_v_pi():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    mdp_agent = StochasticMdpAgent('test', random_state,
                                   TabularPolicy(None, mdp_environment.SS), 1)

    v_pi = evaluate_v_pi(agent=mdp_agent,
                         environment=mdp_environment,
                         num_episodes=1000)

    # uncomment the following lines and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_monte_carlo_evaluation_of_state_value.pickle', 'wb') as file:
    #     pickle.dump(v_pi, file)

    with open(
            f'{os.path.dirname(__file__)}/fixtures/test_monte_carlo_evaluation_of_state_value.pickle',
            'rb') as file:
        fixture = pickle.load(file)

    assert v_pi == fixture
Example #30
def test_evaluate_q_pi_invalid_n_steps():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    epsilon = 0.05

    q_S_A = TabularStateActionValueEstimator(mdp_environment, epsilon, None)

    mdp_agent = StochasticMdpAgent('test', random_state,
                                   q_S_A.get_initial_policy(), 1)

    with pytest.raises(ValueError):
        evaluate_q_pi(agent=mdp_agent,
                      environment=mdp_environment,
                      num_episodes=5,
                      num_updates_per_improvement=None,
                      alpha=0.1,
                      mode=Mode.Q_LEARNING,
                      n_steps=-1,
                      planning_environment=None,
                      q_S_A=q_S_A)
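
Taken together, the examples above share one setup pattern: seed a RandomState, build the gridworld from example 4.1, attach a state-action value estimator to an agent, and run one of the iterate_* routines. The sketch below distills that pattern using only calls that appear in the tests above; the argument values are illustrative, and imports of the library classes (Gridworld, TabularStateActionValueEstimator, ActionValueMdpAgent, iterate_value_q_pi, Mode) are omitted because the examples do not show their module paths.

from numpy.random import RandomState

random_state = RandomState(12345)

# 4x4 gridworld from Sutton and Barto's example 4.1 (second argument None, as
# in most of the examples above)
mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

# epsilon-greedy tabular estimator attached to an action-value agent
# (constructor arguments follow the examples above)
q_S_A = TabularStateActionValueEstimator(mdp_environment, 0.05, None)
mdp_agent = ActionValueMdpAgent('test', random_state, 1, q_S_A)

# iterate the agent's policy using Q-learning temporal differencing; the
# specific argument values here are illustrative only
iterate_value_q_pi(agent=mdp_agent,
                   environment=mdp_environment,
                   num_improvements=10,
                   num_episodes_per_improvement=100,
                   num_updates_per_improvement=None,
                   alpha=0.1,
                   mode=Mode.Q_LEARNING,
                   n_steps=1,
                   planning_environment=None,
                   make_final_policy_greedy=True)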