Example #1
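Both examples below assume the imports sketched here. The tempfile, pytest, and numpy imports are standard; the rlai module paths are assumptions based on the package layout these tests appear to target and may differ in other versions of the library.

import tempfile

import pytest
from numpy.random import RandomState

# NOTE: the rlai paths below are assumed, not verified; check the installed
# version of rlai for the actual locations of these classes and functions.
from rlai.agents.mdp import StochasticMdpAgent
from rlai.environments.mdp import (
    Gridworld,
    TrajectorySamplingMdpPlanningEnvironment,
)
from rlai.gpi import PolicyImprovementEvent
from rlai.gpi.monte_carlo.iteration import iterate_value_q_pi
from rlai.planning.environment_models import StochasticEnvironmentModel
from rlai.value_estimation.tabular import TabularStateActionValueEstimator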
def test_iterate_value_q_pi_with_pdf():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    q_S_A = TabularStateActionValueEstimator(mdp_environment, 0.1, None)

    mdp_agent = StochasticMdpAgent('test', random_state,
                                   q_S_A.get_initial_policy(), 1)

    # Run 3000 improvement iterations (one episode each), plotting every 1500
    # improvements and saving the plots to a temporary PDF.
    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=3000,
        num_episodes_per_improvement=1,
        update_upon_every_visit=False,
        planning_environment=None,
        make_final_policy_greedy=False,
        q_S_A=q_S_A,
        num_improvements_per_plot=1500,
        pdf_save_path=tempfile.NamedTemporaryFile(delete=False).name)

    # A negative epsilon is invalid and should raise a ValueError.
    with pytest.raises(ValueError, match='Epsilon must be >= 0'):
        q_S_A.epsilon = -1.0
        q_S_A.improve_policy(mdp_agent,
                             states=None,
                             event=PolicyImprovementEvent.MAKING_POLICY_GREEDY)

    # An epsilon of 0.0 is valid when making the policy greedy.
    q_S_A.epsilon = 0.0
    assert q_S_A.improve_policy(
        mdp_agent, None, PolicyImprovementEvent.MAKING_POLICY_GREEDY) == 14
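
A note on the pdf_save_path argument above: tempfile.NamedTemporaryFile(delete=False) creates a file that persists after its handle is closed, so the saved PDF remains on disk when the test finishes. Below is a minimal, standard-library-only sketch of that pattern with explicit cleanup; the pdf_path variable is introduced here for illustration and is not part of the original test.

import os
import tempfile

# Create a named temporary file that survives its handle being closed.
pdf_path = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf').name

# ... a call such as iterate_value_q_pi(..., pdf_save_path=pdf_path) would
# write its plots to this path ...

# Remove the file explicitly once it is no longer needed.
os.remove(pdf_path)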
Example #2
def test_invalid_iterate_value_q_pi():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    q_S_A = TabularStateActionValueEstimator(mdp_environment, 0.0, None)

    # target agent
    mdp_agent = StochasticMdpAgent('test', random_state,
                                   q_S_A.get_initial_policy(), 1)

    # episode generation (behavior) policy
    off_policy_agent = StochasticMdpAgent('test', random_state,
                                          q_S_A.get_initial_policy(), 1)

    # Monte Carlo iteration does not support planning environments, so passing
    # one should raise a ValueError.
    with pytest.raises(
            ValueError,
            match='Planning environments are not currently supported for Monte Carlo iteration.'
    ):
        iterate_value_q_pi(
            agent=mdp_agent,
            environment=mdp_environment,
            num_improvements=100,
            num_episodes_per_improvement=1,
            update_upon_every_visit=True,
            planning_environment=TrajectorySamplingMdpPlanningEnvironment(
                'foo', random_state, StochasticEnvironmentModel(), 100, None),
            make_final_policy_greedy=False,
            q_S_A=q_S_A,
            off_policy_agent=off_policy_agent)

    # Test the warning case: no off-policy agent with epsilon=0.0.
    q_S_A.epsilon = 0.0
    iterate_value_q_pi(agent=mdp_agent,
                       environment=mdp_environment,
                       num_improvements=100,
                       num_episodes_per_improvement=1,
                       update_upon_every_visit=True,
                       planning_environment=None,
                       make_final_policy_greedy=False,
                       q_S_A=q_S_A,
                       off_policy_agent=None)