def test_value_iteration():

    # run policy iteration on v_pi
    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    mdp_agent_v_pi_policy_iteration = ActionValueMdpAgent(
        'test',
        random_state,
        1,
        TabularStateActionValueEstimator(mdp_environment, None, None)
    )
    iterate_policy_v_pi(mdp_agent_v_pi_policy_iteration, mdp_environment, 0.001, True)

    # run value iteration on v_pi
    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    mdp_agent_v_pi_value_iteration = ActionValueMdpAgent(
        'test',
        random_state,
        1,
        TabularStateActionValueEstimator(mdp_environment, None, None)
    )
    iterate_value_v_pi(mdp_agent_v_pi_value_iteration, mdp_environment, 0.001, 1, True)

    assert mdp_agent_v_pi_policy_iteration.pi == mdp_agent_v_pi_value_iteration.pi

    # run value iteration on q_pi
    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    mdp_agent_q_pi_value_iteration = ActionValueMdpAgent(
        'test',
        random_state,
        1,
        TabularStateActionValueEstimator(mdp_environment, None, None)
    )
    iterate_value_q_pi(mdp_agent_q_pi_value_iteration, mdp_environment, 0.001, 1, True)

    assert mdp_agent_q_pi_value_iteration.pi == mdp_agent_v_pi_policy_iteration.pi
def test_sarsa_iterate_value_q_pi_with_trajectory_planning():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    q_S_A = TabularStateActionValueEstimator(mdp_environment, 0.05, None)
    mdp_agent = ActionValueMdpAgent('test', random_state, 1, q_S_A)

    planning_environment = TrajectorySamplingMdpPlanningEnvironment(
        'test planning',
        random_state,
        StochasticEnvironmentModel(),
        10,
        None
    )

    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=100,
        num_episodes_per_improvement=1,
        num_updates_per_improvement=None,
        alpha=0.1,
        mode=Mode.SARSA,
        n_steps=1,
        planning_environment=planning_environment,
        make_final_policy_greedy=True
    )

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_td_iteration_of_value_q_pi_planning.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_td_iteration_of_value_q_pi_planning.pickle', 'rb') as file:
        pi_fixture, q_S_A_fixture = pickle.load(file)

    assert tabular_pi_legacy_eq(mdp_agent.pi, pi_fixture) and tabular_estimator_legacy_eq(q_S_A, q_S_A_fixture)
def test_iterate_value_q_pi():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    q_S_A = TabularStateActionValueEstimator(mdp_environment, 0.1, None)
    mdp_agent = StochasticMdpAgent('test', random_state, q_S_A.get_initial_policy(), 1)

    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=3000,
        num_episodes_per_improvement=1,
        update_upon_every_visit=False,
        planning_environment=None,
        make_final_policy_greedy=False,
        q_S_A=q_S_A
    )

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_monte_carlo_iteration_of_value_q_pi.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_monte_carlo_iteration_of_value_q_pi.pickle', 'rb') as file:
        pi_fixture, q_S_A_fixture = pickle.load(file)

    assert tabular_pi_legacy_eq(mdp_agent.pi, pi_fixture) and tabular_estimator_legacy_eq(q_S_A, q_S_A_fixture)
def test_q_learning_iterate_value_q_pi_function_approximation_with_formula():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, 20)

    q_S_A = ApproximateStateActionValueEstimator(
        mdp_environment,
        0.05,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        StateActionIdentityFeatureExtractor(mdp_environment),
        f'C(s, levels={[s.i for s in mdp_environment.SS]}):C(a, levels={[a.i for a in mdp_environment.SS[0].AA]})',
        False,
        None,
        None
    )

    mdp_agent = ActionValueMdpAgent('test', random_state, 1, q_S_A)

    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=5,
        num_episodes_per_improvement=5,
        num_updates_per_improvement=None,
        alpha=None,
        mode=Mode.Q_LEARNING,
        n_steps=None,
        planning_environment=None,
        make_final_policy_greedy=False
    )

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_q_learning_iterate_value_q_pi_function_approximation.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_q_learning_iterate_value_q_pi_function_approximation.pickle', 'rb') as file:
        pi_fixture, q_S_A_fixture = pickle.load(file)

    assert np.allclose(mdp_agent.pi.estimator.model.model.coef_, pi_fixture.estimator.model.model.coef_)
def test_evaluate_q_pi():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    q_S_A = TabularStateActionValueEstimator(mdp_environment, None, None)
    mdp_agent = ActionValueMdpAgent('test', random_state, 1, q_S_A)

    evaluated_states, _ = evaluate_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_episodes=1000,
        exploring_starts=True,
        update_upon_every_visit=False
    )

    assert len(q_S_A) == len(evaluated_states) + 2  # terminal states aren't evaluated
    assert all(s in q_S_A for s in evaluated_states)

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_monte_carlo_evaluation_of_state_action_value.pickle', 'wb') as file:
    #     pickle.dump(q_S_A, file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_monte_carlo_evaluation_of_state_action_value.pickle', 'rb') as file:
        fixture = pickle.load(file)

    assert tabular_estimator_legacy_eq(q_S_A, fixture)
def test_iterate_value_q_pi_with_pdf():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    q_S_A = TabularStateActionValueEstimator(mdp_environment, 0.1, None)
    mdp_agent = StochasticMdpAgent('test', random_state, q_S_A.get_initial_policy(), 1)

    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=3000,
        num_episodes_per_improvement=1,
        update_upon_every_visit=False,
        planning_environment=None,
        make_final_policy_greedy=False,
        q_S_A=q_S_A,
        num_improvements_per_plot=1500,
        pdf_save_path=tempfile.NamedTemporaryFile(delete=False).name
    )

    with pytest.raises(ValueError, match='Epsilon must be >= 0'):
        q_S_A.epsilon = -1.0
        q_S_A.improve_policy(mdp_agent, states=None, event=PolicyImprovementEvent.MAKING_POLICY_GREEDY)

    q_S_A.epsilon = 0.0
    assert q_S_A.improve_policy(mdp_agent, None, PolicyImprovementEvent.MAKING_POLICY_GREEDY) == 14
def test_iterate_value_q_pi_with_pdf():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    q_S_A = TabularStateActionValueEstimator(mdp_environment, 0.05, None)
    mdp_agent = StochasticMdpAgent('test', random_state, q_S_A.get_initial_policy(), 1)

    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=10,
        num_episodes_per_improvement=100,
        num_updates_per_improvement=None,
        alpha=0.1,
        mode=Mode.Q_LEARNING,
        n_steps=1,
        planning_environment=None,
        make_final_policy_greedy=False,
        q_S_A=q_S_A,
        num_improvements_per_plot=5,
        pdf_save_path=tempfile.NamedTemporaryFile(delete=False).name
    )
def test_n_step_q_learning_iterate_value_q_pi():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    q_S_A = TabularStateActionValueEstimator(mdp_environment, 0.05, None)
    mdp_agent = StochasticMdpAgent('test', random_state, q_S_A.get_initial_policy(), 1)

    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=10,
        num_episodes_per_improvement=100,
        num_updates_per_improvement=None,
        alpha=0.1,
        mode=Mode.Q_LEARNING,
        n_steps=3,
        planning_environment=None,
        make_final_policy_greedy=False,
        q_S_A=q_S_A
    )

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_td_n_step_q_learning_iteration_of_value_q_pi.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_td_n_step_q_learning_iteration_of_value_q_pi.pickle', 'rb') as file:
        fixture_pi, fixture_q_S_A = pickle.load(file)

    assert tabular_pi_legacy_eq(mdp_agent.pi, fixture_pi) and tabular_estimator_legacy_eq(q_S_A, fixture_q_S_A)
def test_q_learning_iterate_value_q_pi_function_approximation_invalid_formula():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, 20)

    q_S_A = ApproximateStateActionValueEstimator(
        mdp_environment,
        0.05,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment),
        f'C(s, levels={[s.i for s in mdp_environment.SS]}):C(a, levels={[a.i for a in mdp_environment.SS[0].AA]})',
        False,
        None,
        None
    )

    mdp_agent = StochasticMdpAgent('test', random_state, q_S_A.get_initial_policy(), 1)

    with pytest.raises(ValueError, match='Invalid combination of formula'):
        iterate_value_q_pi(
            agent=mdp_agent,
            environment=mdp_environment,
            num_improvements=5,
            num_episodes_per_improvement=5,
            num_updates_per_improvement=None,
            alpha=None,
            mode=Mode.Q_LEARNING,
            n_steps=None,
            planning_environment=None,
            make_final_policy_greedy=False,
            q_S_A=q_S_A
        )
def test_human_agent():

    agent = Human()

    a1 = Action(0, 'Foo')
    a2 = Action(1, 'Bar')
    state = MdpState(1, [a1, a2], False)
    agent.sense(state, 0)

    call_num = 0

    def mock_input(*_) -> str:
        nonlocal call_num
        if call_num == 0:
            call_num += 1
            return 'asdf'
        else:
            return 'Bar'

    agent.get_input = mock_input  # MagicMock(return_value='Bar')

    assert agent.act(0) == a2

    with pytest.raises(NotImplementedError):
        rng = RandomState(12345)
        Human.init_from_arguments([], rng, Gridworld.example_4_1(rng, None))
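# A hedged alternative to the hand-written mock_input closure in test_human_agent above: unittest.mock's MagicMock
# accepts a side_effect sequence and returns one item per call ('asdf' first, then 'Bar'), which the inline
# MagicMock comment above alludes to. This sketch assumes the same Human, Action, and MdpState classes used in the
# test; the function name is hypothetical.
def test_human_agent_with_magicmock():

    from unittest.mock import MagicMock

    agent = Human()
    a1 = Action(0, 'Foo')
    a2 = Action(1, 'Bar')
    agent.sense(MdpState(1, [a1, a2], False), 0)

    # the first (invalid) input is rejected, and the second matches the 'Bar' action
    agent.get_input = MagicMock(side_effect=['asdf', 'Bar'])

    assert agent.act(0) == a2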
def test_check_marginal_probabilities():

    random = RandomState()
    gridworld = Gridworld.example_4_1(random, None)

    gridworld.p_S_prime_R_given_S_A[gridworld.SS[0]][gridworld.a_left][gridworld.SS[0]][Reward(1, -1)] = 1.0

    with pytest.raises(ValueError, match='Expected next-state/next-reward marginal probability of 1.0, but got 2.0'):
        gridworld.check_marginal_probabilities()
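# Why the test above expects a marginal of 2.0: for each state-action pair, the next-state/next-reward probabilities
# p(s', r | s, a) must sum to 1.0, and assigning an additional entry a probability of 1.0 pushes the sum to 2.0. The
# sketch below checks that constraint on plain nested dicts (state -> action -> next state -> reward -> probability);
# it only illustrates the idea and is not rlai's check_marginal_probabilities implementation.
def check_marginals(p_S_prime_R_given_S_A, tolerance=1e-9):
    for s in p_S_prime_R_given_S_A:
        for a in p_S_prime_R_given_S_A[s]:
            marginal = sum(
                p_S_prime_R_given_S_A[s][a][s_prime][r]
                for s_prime in p_S_prime_R_given_S_A[s][a]
                for r in p_S_prime_R_given_S_A[s][a][s_prime]
            )
            if abs(marginal - 1.0) > tolerance:
                raise ValueError(f'Expected next-state/next-reward marginal probability of 1.0, but got {marginal}')


# a single state-action pair: the original 1.0 entry plus an injected 1.0 entry gives a marginal of 2.0
p = {'s0': {'left': {'s0': {'r0': 1.0, 'r1': 1.0}}}}
try:
    check_marginals(p)
except ValueError as e:
    print(e)  # Expected next-state/next-reward marginal probability of 1.0, but got 2.0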
def test_off_policy_monte_carlo_with_function_approximation():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    q_S_A = ApproximateStateActionValueEstimator(
        mdp_environment,
        0.05,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment),
        None,
        False,
        None,
        None
    )

    # target agent
    mdp_agent = ActionValueMdpAgent(
        'test',
        random_state,
        1,
        q_S_A
    )

    # episode generation (behavior) policy
    off_policy_agent = ActionValueMdpAgent(
        'test',
        random_state,
        1,
        TabularStateActionValueEstimator(mdp_environment, None, None)
    )

    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=100,
        num_episodes_per_improvement=1,
        update_upon_every_visit=True,
        planning_environment=None,
        make_final_policy_greedy=False,
        off_policy_agent=off_policy_agent
    )

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_off_policy_monte_carlo_with_function_approximationo.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_off_policy_monte_carlo_with_function_approximationo.pickle', 'rb') as file:
        pi_fixture, q_S_A_fixture = pickle.load(file)

    assert mdp_agent.pi == pi_fixture and q_S_A == q_S_A_fixture
    assert str(mdp_agent.pi.estimator[mdp_environment.SS[5]][mdp_environment.SS[5].AA[1]]).startswith('-2.4305')

    # make greedy
    q_S_A.epsilon = 0.0
    assert q_S_A.improve_policy(mdp_agent, None, PolicyImprovementEvent.MAKING_POLICY_GREEDY) == -1
    assert mdp_agent.pi.estimator.epsilon == 0.0
def test_value_iteration():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    # run policy iteration on v_pi
    mdp_agent_v_pi_policy_iteration = StochasticMdpAgent(
        'test',
        random_state,
        TabularPolicy(None, mdp_environment.SS),
        1
    )

    iterate_policy_v_pi(
        mdp_agent_v_pi_policy_iteration,
        mdp_environment,
        0.001,
        True
    )

    # run value iteration on v_pi
    mdp_agent_v_pi_value_iteration = StochasticMdpAgent(
        'test',
        random_state,
        TabularPolicy(None, mdp_environment.SS),
        1
    )

    iterate_value_v_pi(
        mdp_agent_v_pi_value_iteration,
        mdp_environment,
        0.001,
        1,
        True
    )

    assert mdp_agent_v_pi_policy_iteration.pi == mdp_agent_v_pi_value_iteration.pi

    # run value iteration on q_pi
    mdp_agent_q_pi_value_iteration = StochasticMdpAgent(
        'test',
        random_state,
        TabularPolicy(None, mdp_environment.SS),
        1
    )

    iterate_value_q_pi(
        mdp_agent_q_pi_value_iteration,
        mdp_environment,
        0.001,
        1,
        True
    )

    assert mdp_agent_q_pi_value_iteration.pi == mdp_agent_v_pi_policy_iteration.pi
def main():

    random = RandomState(12345)
    gridworld = Gridworld.example_4_1(random, None)

    # the bottom-right corner (3,3) is a goal state. get the states surrounding this goal. these will become the
    # sticky states.
    sticky_states = [
        gridworld.grid[2, 2],
        gridworld.grid[2, 3],
        gridworld.grid[3, 2]
    ]

    # amplify all negative rewards in the sticky states by a factor of 10, keeping the probabilities the same.
    for sticky_state in sticky_states:
        for a in gridworld.p_S_prime_R_given_S_A[sticky_state]:
            for s_prime in gridworld.p_S_prime_R_given_S_A[sticky_state][a]:
                gridworld.p_S_prime_R_given_S_A[sticky_state][a][s_prime] = {
                    Reward(r.i, (r.r * 10.0 if r.r < 0.0 else r.r)): gridworld.p_S_prime_R_given_S_A[sticky_state][a][s_prime][r]
                    for r in gridworld.p_S_prime_R_given_S_A[sticky_state][a][s_prime]
                }

    epsilon = 0.1

    q_S_A = TabularStateActionValueEstimator(
        environment=gridworld,
        epsilon=epsilon,
        continuous_state_discretization_resolution=None
    )

    pi = q_S_A.get_initial_policy()

    mdp_agent = StochasticMdpAgent(
        name='agent',
        random_state=random,
        pi=pi,
        gamma=1.0
    )

    # iterate the agent's policy using q-learning temporal differencing
    iterate_value_q_pi(
        agent=mdp_agent,
        environment=gridworld,
        num_improvements=20,
        num_episodes_per_improvement=100,
        num_updates_per_improvement=None,
        alpha=None,
        mode=Mode.Q_LEARNING,
        n_steps=None,
        planning_environment=None,
        make_final_policy_greedy=True,
        q_S_A=q_S_A,
        num_improvements_per_plot=20
    )

    for s in pi:
        print(f'State {s.i}:')
        for a in pi[s]:
            if pi[s][a] > 0.0:
                print(f'\tPr({a.name}): {pi[s][a]}')
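# The nested loop in main() above rebuilds the innermost reward-to-probability dict so that negative reward values are
# scaled by 10 while the probabilities themselves are left unchanged. The sketch below shows the same rewrite on plain
# (id, value) tuples instead of rlai's Reward objects; the function and argument names are illustrative only.
def amplify_negative_rewards(reward_probabilities, factor=10.0):
    """Return a new reward->probability dict with negative reward values scaled by the given factor."""
    return {
        (reward_id, reward_value * factor if reward_value < 0.0 else reward_value): probability
        for (reward_id, reward_value), probability in reward_probabilities.items()
    }


# a -1.0 reward becomes -10.0, a +1.0 reward is unchanged, and the probabilities still sum to 1.0
print(amplify_negative_rewards({(0, -1.0): 0.75, (1, 1.0): 0.25}))
# {(0, -10.0): 0.75, (1, 1.0): 0.25}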
def test_invalid_improve_policy_with_q_pi():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    epsilon = 0.0

    mdp_agent = ActionValueMdpAgent(
        'test',
        random_state,
        1,
        TabularStateActionValueEstimator(mdp_environment, epsilon, None)
    )

    with pytest.raises(ValueError, match='Epsilon must be >= 0'):
        improve_policy_with_q_pi(mdp_agent, {}, -1)
def test_policy_iteration():

    # state-value policy iteration
    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    mdp_agent_v_pi = ActionValueMdpAgent(
        'test',
        random_state,
        1,
        TabularStateActionValueEstimator(mdp_environment, None, None)
    )
    iterate_policy_v_pi(mdp_agent_v_pi, mdp_environment, 0.001, True)

    # action-value policy iteration
    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    mdp_agent_q_pi = ActionValueMdpAgent(
        'test',
        random_state,
        1,
        TabularStateActionValueEstimator(mdp_environment, None, None)
    )
    iterate_policy_q_pi(mdp_agent_q_pi, mdp_environment, 0.001, True)

    # should get the same policy
    assert mdp_agent_v_pi.pi == mdp_agent_q_pi.pi
def test_evaluate_v_pi():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    mdp_agent = ActionValueMdpAgent(
        'test',
        random_state,
        1,
        TabularStateActionValueEstimator(mdp_environment, None, None)
    )

    v_pi, _ = evaluate_v_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        theta=0.001,
        num_iterations=None,
        update_in_place=True
    )

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    mdp_agent = ActionValueMdpAgent(
        'test',
        random_state,
        1,
        TabularStateActionValueEstimator(mdp_environment, None, None)
    )

    v_pi_not_in_place, _ = evaluate_v_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        theta=0.001,
        num_iterations=None,
        update_in_place=False
    )

    assert list(v_pi.keys()) == list(v_pi_not_in_place.keys())
    np.testing.assert_allclose(list(v_pi.values()), list(v_pi_not_in_place.values()), atol=0.01)

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_iterative_policy_evaluation_of_state_value.pickle', 'wb') as file:
    #     pickle.dump(v_pi, file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_iterative_policy_evaluation_of_state_value.pickle', 'rb') as file:
        fixture = pickle.load(file)

    assert v_pi == fixture
def test_check_state_and_action_lists():

    random = RandomState(12345)
    gw = Gridworld.example_4_1(random, T=None)
    fex = GridworldFeatureExtractor(gw)

    states = [MdpState(i=None, AA=[], terminal=False)]
    actions = [Action(0)]
    fex.check_state_and_action_lists(states, actions)

    with pytest.raises(ValueError, match='Expected '):
        actions.clear()
        fex.check_state_and_action_lists(states, actions)
def test_q_learning_iterate_value_q_pi_function_approximation_policy_ne():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, 20)
    epsilon = 0.05

    q_S_A_1 = ApproximateStateActionValueEstimator(
        mdp_environment,
        epsilon,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment),
        None,
        False,
        None,
        None
    )

    mdp_agent_1 = StochasticMdpAgent('test', random_state, q_S_A_1.get_initial_policy(), 1)

    iterate_value_q_pi(
        agent=mdp_agent_1,
        environment=mdp_environment,
        num_improvements=5,
        num_episodes_per_improvement=10,
        num_updates_per_improvement=None,
        alpha=None,
        mode=Mode.Q_LEARNING,
        n_steps=None,
        planning_environment=None,
        make_final_policy_greedy=True,
        q_S_A=q_S_A_1
    )

    q_S_A_2 = ApproximateStateActionValueEstimator(
        mdp_environment,
        epsilon,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment),
        None,
        False,
        None,
        None
    )

    mdp_agent_2 = StochasticMdpAgent('test', random_state, q_S_A_2.get_initial_policy(), 1)

    iterate_value_q_pi(
        agent=mdp_agent_2,
        environment=mdp_environment,
        num_improvements=5,
        num_episodes_per_improvement=5,
        num_updates_per_improvement=None,
        alpha=None,
        mode=Mode.Q_LEARNING,
        n_steps=None,
        planning_environment=None,
        make_final_policy_greedy=True,
        q_S_A=q_S_A_2
    )

    assert mdp_agent_1.pi.estimator != mdp_agent_2.pi.estimator
    assert mdp_agent_1.pi.estimator.model != mdp_agent_2.pi.estimator.model
def test_q_learning_iterate_value_q_pi_tabular_policy_ne():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, 20)
    epsilon = 0.05

    q_S_A_1 = TabularStateActionValueEstimator(mdp_environment, epsilon, None)
    mdp_agent_1 = StochasticMdpAgent('test', random_state, q_S_A_1.get_initial_policy(), 1)

    iterate_value_q_pi(
        agent=mdp_agent_1,
        environment=mdp_environment,
        num_improvements=5,
        num_episodes_per_improvement=10,
        num_updates_per_improvement=None,
        alpha=None,
        mode=Mode.Q_LEARNING,
        n_steps=None,
        planning_environment=None,
        make_final_policy_greedy=True,
        q_S_A=q_S_A_1
    )

    q_S_A_2 = TabularStateActionValueEstimator(mdp_environment, epsilon, None)
    mdp_agent_2 = StochasticMdpAgent('test', random_state, q_S_A_2.get_initial_policy(), 1)

    iterate_value_q_pi(
        agent=mdp_agent_2,
        environment=mdp_environment,
        num_improvements=5,
        num_episodes_per_improvement=5,
        num_updates_per_improvement=None,
        alpha=None,
        mode=Mode.Q_LEARNING,
        n_steps=None,
        planning_environment=None,
        make_final_policy_greedy=True,
        q_S_A=q_S_A_2
    )

    test_state = mdp_environment.SS[5]
    test_action = test_state.AA[0]

    assert q_S_A_1 != q_S_A_2
    assert q_S_A_1[test_state] != q_S_A_2[test_state]
    assert q_S_A_1[test_state][test_action] != q_S_A_2[test_state][test_action]
def test_invalid_improve_policy_with_q_pi():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    epsilon = 0.0
    q_S_A = TabularStateActionValueEstimator(mdp_environment, epsilon, None)

    # target agent
    mdp_agent = StochasticMdpAgent('test', random_state, q_S_A.get_initial_policy(), 1)

    with pytest.raises(ValueError, match='Epsilon must be >= 0'):
        improve_policy_with_q_pi(mdp_agent, {}, -1)
def test_invalid_iterate_value_q_pi():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    q_S_A = TabularStateActionValueEstimator(mdp_environment, 0.0, None)

    # target agent
    mdp_agent = StochasticMdpAgent('test', random_state, q_S_A.get_initial_policy(), 1)

    # episode generation (behavior) policy
    off_policy_agent = StochasticMdpAgent('test', random_state, q_S_A.get_initial_policy(), 1)

    with pytest.raises(ValueError, match='Planning environments are not currently supported for Monte Carlo iteration.'):
        iterate_value_q_pi(
            agent=mdp_agent,
            environment=mdp_environment,
            num_improvements=100,
            num_episodes_per_improvement=1,
            update_upon_every_visit=True,
            planning_environment=TrajectorySamplingMdpPlanningEnvironment(
                'foo',
                random_state,
                StochasticEnvironmentModel(),
                100,
                None
            ),
            make_final_policy_greedy=False,
            q_S_A=q_S_A,
            off_policy_agent=off_policy_agent
        )

    # test warning...no off-policy agent with epsilon=0.0
    q_S_A.epsilon = 0.0
    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=100,
        num_episodes_per_improvement=1,
        update_upon_every_visit=True,
        planning_environment=None,
        make_final_policy_greedy=False,
        q_S_A=q_S_A,
        off_policy_agent=None
    )
def test_q_learning_iterate_value_q_pi_function_approximation_no_formula():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, 20)

    q_S_A = ApproximateStateActionValueEstimator(
        mdp_environment,
        0.05,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment),
        None,
        False,
        None,
        None
    )

    mdp_agent = StochasticMdpAgent('test', random_state, q_S_A.get_initial_policy(), 1)

    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=10,
        num_episodes_per_improvement=20,
        num_updates_per_improvement=None,
        alpha=None,
        mode=Mode.Q_LEARNING,
        n_steps=None,
        planning_environment=None,
        make_final_policy_greedy=True,
        q_S_A=q_S_A
    )

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_q_learning_iterate_value_q_pi_function_approximation_no_formula.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_q_learning_iterate_value_q_pi_function_approximation_no_formula.pickle', 'rb') as file:
        pi_fixture, q_S_A_fixture = pickle.load(file)

    assert np.allclose(mdp_agent.pi.estimator.model.model.coef_, pi_fixture.estimator.model.model.coef_)
    assert mdp_agent.pi.format_state_action_probs(mdp_environment.SS) == pi_fixture.format_state_action_probs(mdp_environment.SS)
    assert mdp_agent.pi.format_state_action_values(mdp_environment.SS) == pi_fixture.format_state_action_values(mdp_environment.SS)
def train_thread_target():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    q_S_A = TabularStateActionValueEstimator(mdp_environment, 0.1, None)
    mdp_agent = StochasticMdpAgent('test', random_state, q_S_A.get_initial_policy(), 1)

    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=1000000,
        num_episodes_per_improvement=10,
        update_upon_every_visit=False,
        planning_environment=None,
        make_final_policy_greedy=False,
        q_S_A=q_S_A,
        thread_manager=thread_manager,
        num_improvements_per_plot=10
    )
def train_thread_target():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    mdp_agent = ActionValueMdpAgent(
        'test',
        random_state,
        1,
        TabularStateActionValueEstimator(mdp_environment, 0.1, None)
    )

    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=1000000,
        num_episodes_per_improvement=10,
        num_updates_per_improvement=None,
        alpha=0.1,
        mode=Mode.SARSA,
        n_steps=None,
        planning_environment=None,
        make_final_policy_greedy=False,
        thread_manager=thread_manager,
        num_improvements_per_plot=10
    )
def test_invalid_epsilon_iterate_value_q_pi():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    mdp_agent = ActionValueMdpAgent(
        'test',
        random_state,
        1,
        TabularStateActionValueEstimator(mdp_environment, 0.0, None)
    )

    with pytest.raises(ValueError, match='epsilon must be strictly > 0 for TD-learning'):
        iterate_value_q_pi(
            agent=mdp_agent,
            environment=mdp_environment,
            num_improvements=10,
            num_episodes_per_improvement=100,
            num_updates_per_improvement=None,
            alpha=0.1,
            mode=Mode.Q_LEARNING,
            n_steps=3,
            planning_environment=None,
            make_final_policy_greedy=False
        )
def test_policy_iteration():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    # state-value policy iteration
    mdp_agent_v_pi = StochasticMdpAgent(
        'test',
        random_state,
        TabularPolicy(None, mdp_environment.SS),
        1
    )

    iterate_policy_v_pi(
        mdp_agent_v_pi,
        mdp_environment,
        0.001,
        True
    )

    # action-value policy iteration
    mdp_agent_q_pi = StochasticMdpAgent(
        'test',
        random_state,
        TabularPolicy(None, mdp_environment.SS),
        1
    )

    iterate_policy_q_pi(
        mdp_agent_q_pi,
        mdp_environment,
        0.001,
        True
    )

    # should get the same policy
    assert mdp_agent_v_pi.pi == mdp_agent_q_pi.pi
def test_agent_invalid_action():

    random = RandomState()
    agent = ActionValueMdpAgent(
        'foo',
        random,
        1.0,
        TabularStateActionValueEstimator(Gridworld.example_4_1(random, None), None, None)
    )

    # test None action
    agent.__act__ = lambda t: None
    with pytest.raises(ValueError, match='Agent returned action of None'):
        agent.act(0)

    # test infeasible action
    action = Action(1, 'foo')
    agent.__act__ = lambda t: action
    state = MdpState(1, [], False)
    agent.sense(state, 0)
    with pytest.raises(ValueError, match=f'Action {action} is not feasible in state {state}'):
        agent.act(0)
def test_evaluate_v_pi():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    mdp_agent = StochasticMdpAgent('test', random_state, TabularPolicy(None, mdp_environment.SS), 1)

    v_pi = evaluate_v_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_episodes=1000
    )

    # uncomment the following line and run test to update fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_monte_carlo_evaluation_of_state_value.pickle', 'wb') as file:
    #     pickle.dump(v_pi, file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_monte_carlo_evaluation_of_state_value.pickle', 'rb') as file:
        fixture = pickle.load(file)

    assert v_pi == fixture
def test_evaluate_q_pi_invalid_n_steps():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    epsilon = 0.05
    q_S_A = TabularStateActionValueEstimator(mdp_environment, epsilon, None)
    mdp_agent = StochasticMdpAgent('test', random_state, q_S_A.get_initial_policy(), 1)

    with pytest.raises(ValueError):
        evaluate_q_pi(
            agent=mdp_agent,
            environment=mdp_environment,
            num_episodes=5,
            num_updates_per_improvement=None,
            alpha=0.1,
            mode=Mode.Q_LEARNING,
            n_steps=-1,
            planning_environment=None,
            q_S_A=q_S_A
        )