def test_learn():
    """
    Test TD learning against the CartPole Gym environment.
    """

    random_state = RandomState(12345)

    gym = Gym(
        random_state=random_state,
        T=None,
        gym_id='CartPole-v1'
    )

    q_S_A = TabularStateActionValueEstimator(gym, 0.05, 0.001)

    mdp_agent = StochasticMdpAgent(
        'agent',
        random_state,
        q_S_A.get_initial_policy(),
        1
    )

    iterate_value_q_pi(
        agent=mdp_agent,
        environment=gym,
        num_improvements=10,
        num_episodes_per_improvement=100,
        num_updates_per_improvement=None,
        alpha=0.1,
        mode=Mode.SARSA,
        n_steps=1,
        planning_environment=None,
        make_final_policy_greedy=False,
        q_S_A=q_S_A
    )

    # uncomment the following lines and run the test to update the fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_gym.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_gym.pickle', 'rb') as file:
        fixture_pi, fixture_q_S_A = pickle.load(file)

    assert tabular_pi_legacy_eq(mdp_agent.pi, fixture_pi) and tabular_estimator_legacy_eq(q_S_A, fixture_q_S_A)

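# The dump/load round trip above is the fixture pattern used throughout these tests: train once, pickle the
# resulting policy and estimator, and assert that later runs reproduce them. A minimal self-contained sketch of
# the idea follows; save_fixture and load_fixture are hypothetical helper names, not part of rlai.

def save_fixture(path, *objects):
    # serialize the trained objects so that future test runs can compare against them
    with open(path, 'wb') as file:
        pickle.dump(objects, file)

def load_fixture(path):
    # deserialize the previously saved objects for comparison
    with open(path, 'rb') as file:
        return pickle.load(file)
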
def test_sarsa_iterate_value_q_pi_make_greedy():
    """
    Test SARSA iteration of q_pi, making the final policy greedy.
    """

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    q_S_A = TabularStateActionValueEstimator(mdp_environment, 0.05, None)

    mdp_agent = StochasticMdpAgent(
        'test',
        random_state,
        q_S_A.get_initial_policy(),
        1
    )

    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=10,
        num_episodes_per_improvement=100,
        num_updates_per_improvement=None,
        alpha=0.1,
        mode=Mode.SARSA,
        n_steps=1,
        planning_environment=None,
        make_final_policy_greedy=True,
        q_S_A=q_S_A
    )

    # uncomment the following lines and run the test to update the fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_td_iteration_of_value_q_pi_make_greedy.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_td_iteration_of_value_q_pi_make_greedy.pickle', 'rb') as file:
        pi_fixture, q_S_A_fixture = pickle.load(file)

    assert tabular_pi_legacy_eq(mdp_agent.pi, pi_fixture) and tabular_estimator_legacy_eq(q_S_A, q_S_A_fixture)

def test_q_learning_iterate_value_q_pi_function_approximation_invalid_formula():
    """
    Test that an invalid model formula raises a ValueError during Q-learning iteration of q_pi with function
    approximation.
    """

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, 20)

    q_S_A = ApproximateStateActionValueEstimator(
        mdp_environment,
        0.05,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment),
        f'C(s, levels={[s.i for s in mdp_environment.SS]}):C(a, levels={[a.i for a in mdp_environment.SS[0].AA]})',
        False,
        None,
        None
    )

    mdp_agent = StochasticMdpAgent(
        'test',
        random_state,
        q_S_A.get_initial_policy(),
        1
    )

    with pytest.raises(ValueError, match='Invalid combination of formula'):
        iterate_value_q_pi(
            agent=mdp_agent,
            environment=mdp_environment,
            num_improvements=5,
            num_episodes_per_improvement=5,
            num_updates_per_improvement=None,
            alpha=None,
            mode=Mode.Q_LEARNING,
            n_steps=None,
            planning_environment=None,
            make_final_policy_greedy=False,
            q_S_A=q_S_A
        )

def test_iterate_value_q_pi_with_pdf():
    """
    Test Q-learning iteration of q_pi with plots saved to a PDF.
    """

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    q_S_A = TabularStateActionValueEstimator(mdp_environment, 0.05, None)

    mdp_agent = StochasticMdpAgent(
        'test',
        random_state,
        q_S_A.get_initial_policy(),
        1
    )

    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=10,
        num_episodes_per_improvement=100,
        num_updates_per_improvement=None,
        alpha=0.1,
        mode=Mode.Q_LEARNING,
        n_steps=1,
        planning_environment=None,
        make_final_policy_greedy=False,
        q_S_A=q_S_A,
        num_improvements_per_plot=5,
        pdf_save_path=tempfile.NamedTemporaryFile(delete=False).name
    )

def test_n_step_q_learning_iterate_value_q_pi():
    """
    Test n-step Q-learning iteration of q_pi.
    """

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    q_S_A = TabularStateActionValueEstimator(mdp_environment, 0.05, None)

    mdp_agent = ActionValueMdpAgent('test', random_state, 1, q_S_A)

    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=10,
        num_episodes_per_improvement=100,
        num_updates_per_improvement=None,
        alpha=0.1,
        mode=Mode.Q_LEARNING,
        n_steps=3,
        planning_environment=None,
        make_final_policy_greedy=False
    )

    # uncomment the following lines and run the test to update the fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_td_n_step_q_learning_iteration_of_value_q_pi.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_td_n_step_q_learning_iteration_of_value_q_pi.pickle', 'rb') as file:
        fixture_pi, fixture_q_S_A = pickle.load(file)

    assert tabular_pi_legacy_eq(mdp_agent.pi, fixture_pi) and tabular_estimator_legacy_eq(q_S_A, fixture_q_S_A)

def test_q_learning_iterate_value_q_pi_function_approximation_with_formula():
    """
    Test Q-learning iteration of q_pi with function approximation and an explicit model formula.
    """

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, 20)

    q_S_A = ApproximateStateActionValueEstimator(
        mdp_environment,
        0.05,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        StateActionIdentityFeatureExtractor(mdp_environment),
        f'C(s, levels={[s.i for s in mdp_environment.SS]}):C(a, levels={[a.i for a in mdp_environment.SS[0].AA]})',
        False,
        None,
        None
    )

    mdp_agent = ActionValueMdpAgent('test', random_state, 1, q_S_A)

    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=5,
        num_episodes_per_improvement=5,
        num_updates_per_improvement=None,
        alpha=None,
        mode=Mode.Q_LEARNING,
        n_steps=None,
        planning_environment=None,
        make_final_policy_greedy=False
    )

    # uncomment the following lines and run the test to update the fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_q_learning_iterate_value_q_pi_function_approximation.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_q_learning_iterate_value_q_pi_function_approximation.pickle', 'rb') as file:
        pi_fixture, q_S_A_fixture = pickle.load(file)

    assert np.allclose(mdp_agent.pi.estimator.model.model.coef_, pi_fixture.estimator.model.model.coef_)

def main():

    random = RandomState(12345)

    gridworld = Gridworld.example_4_1(random, None)

    # the bottom-right corner (3,3) is a goal state. get the states surrounding this goal. these will become the
    # sticky states.
    sticky_states = [
        gridworld.grid[2, 2],
        gridworld.grid[2, 3],
        gridworld.grid[3, 2]
    ]

    # amplify all negative rewards in the sticky states by a factor of 10, keeping the probabilities the same.
    for sticky_state in sticky_states:
        for a in gridworld.p_S_prime_R_given_S_A[sticky_state]:
            for s_prime in gridworld.p_S_prime_R_given_S_A[sticky_state][a]:
                gridworld.p_S_prime_R_given_S_A[sticky_state][a][s_prime] = {
                    Reward(r.i, (r.r * 10.0 if r.r < 0.0 else r.r)): gridworld.p_S_prime_R_given_S_A[sticky_state][a][s_prime][r]
                    for r in gridworld.p_S_prime_R_given_S_A[sticky_state][a][s_prime]
                }

    epsilon = 0.1

    q_S_A = TabularStateActionValueEstimator(
        environment=gridworld,
        epsilon=epsilon,
        continuous_state_discretization_resolution=None
    )

    pi = q_S_A.get_initial_policy()

    mdp_agent = StochasticMdpAgent(
        name='agent',
        random_state=random,
        pi=pi,
        gamma=1.0
    )

    # iterate the agent's policy using Q-learning temporal differencing
    iterate_value_q_pi(
        agent=mdp_agent,
        environment=gridworld,
        num_improvements=20,
        num_episodes_per_improvement=100,
        num_updates_per_improvement=None,
        alpha=None,
        mode=Mode.Q_LEARNING,
        n_steps=None,
        planning_environment=None,
        make_final_policy_greedy=True,
        q_S_A=q_S_A,
        num_improvements_per_plot=20
    )

    for s in pi:
        print(f'State {s.i}:')
        for a in pi[s]:
            if pi[s][a] > 0.0:
                print(f'\tPr({a.name}): {pi[s][a]}')

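# The nested comprehension above can be hard to read. The following toy illustration applies the same
# transformation to plain dicts standing in for rlai's State/Action/Reward objects (the structure and values
# here are assumptions for illustration only): each innermost mapping of reward to probability is rebuilt with
# negative rewards multiplied by 10 and the probabilities left unchanged.

p = {'s': {'a': {'s_prime': {-1.0: 0.5, 2.0: 0.5}}}}
for s in p:
    for a in p[s]:
        for s_prime in p[s][a]:
            p[s][a][s_prime] = {
                (r * 10.0 if r < 0.0 else r): pr
                for r, pr in p[s][a][s_prime].items()
            }
assert p['s']['a']['s_prime'] == {-10.0: 0.5, 2.0: 0.5}
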
def main():

    random_state = RandomState(12345)

    environment = Gym(
        random_state=random_state,
        T=None,
        gym_id='CartPole-v1',
        continuous_action_discretization_resolution=None,
        render_every_nth_episode=100
    )

    model = SKLearnSGD(
        loss='squared_loss',
        alpha=0.0,
        learning_rate='constant',
        eta0=0.0001,
        scale_eta0_for_y=False
    )

    feature_extractor = CartpoleFeatureExtractor(
        environment=environment
    )

    q_S_A = ApproximateStateActionValueEstimator(
        environment=environment,
        epsilon=0.02,
        model=model,
        feature_extractor=feature_extractor,
        formula=None,
        plot_model=False,
        plot_model_per_improvements=None,
        plot_model_bins=None
    )

    agent = StochasticMdpAgent(
        name='Cartpole Agent',
        random_state=random_state,
        pi=q_S_A.get_initial_policy(),
        gamma=0.95
    )

    iterate_value_q_pi(
        agent=agent,
        environment=environment,
        num_improvements=15000,
        num_episodes_per_improvement=1,
        num_updates_per_improvement=1,
        alpha=None,
        mode=Mode.SARSA,
        n_steps=None,
        planning_environment=None,
        make_final_policy_greedy=True,
        q_S_A=q_S_A,
        num_improvements_per_plot=100
    )

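# For orientation only: SKLearnSGD wraps scikit-learn's stochastic gradient descent regressor, so the model
# configured above corresponds roughly to the plain scikit-learn estimator below. The exact argument mapping is
# an assumption; consult rlai's SKLearnSGD source for the authoritative wrapper behavior. Note that the
# 'squared_loss' name was renamed to 'squared_error' in newer scikit-learn releases.

from sklearn.linear_model import SGDRegressor

sgd = SGDRegressor(
    loss='squared_loss',       # squared-error loss
    alpha=0.0,                 # no regularization penalty
    learning_rate='constant',  # fixed step size throughout training
    eta0=0.0001                # the constant step size
)
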
def test_q_learning_iterate_value_q_pi_function_approximation_policy_ne():
    """
    Test that different training schedules produce unequal function-approximation policies.
    """

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, 20)

    epsilon = 0.05

    q_S_A_1 = ApproximateStateActionValueEstimator(
        mdp_environment,
        epsilon,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment),
        None,
        False,
        None,
        None
    )

    mdp_agent_1 = StochasticMdpAgent(
        'test',
        random_state,
        q_S_A_1.get_initial_policy(),
        1
    )

    iterate_value_q_pi(
        agent=mdp_agent_1,
        environment=mdp_environment,
        num_improvements=5,
        num_episodes_per_improvement=10,
        num_updates_per_improvement=None,
        alpha=None,
        mode=Mode.Q_LEARNING,
        n_steps=None,
        planning_environment=None,
        make_final_policy_greedy=True,
        q_S_A=q_S_A_1
    )

    q_S_A_2 = ApproximateStateActionValueEstimator(
        mdp_environment,
        epsilon,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment),
        None,
        False,
        None,
        None
    )

    mdp_agent_2 = StochasticMdpAgent(
        'test',
        random_state,
        q_S_A_2.get_initial_policy(),
        1
    )

    iterate_value_q_pi(
        agent=mdp_agent_2,
        environment=mdp_environment,
        num_improvements=5,
        num_episodes_per_improvement=5,
        num_updates_per_improvement=None,
        alpha=None,
        mode=Mode.Q_LEARNING,
        n_steps=None,
        planning_environment=None,
        make_final_policy_greedy=True,
        q_S_A=q_S_A_2
    )

    assert mdp_agent_1.pi.estimator != mdp_agent_2.pi.estimator
    assert mdp_agent_1.pi.estimator.model != mdp_agent_2.pi.estimator.model

def test_q_learning_iterate_value_q_pi_tabular_policy_ne():
    """
    Test that different training schedules produce unequal tabular estimators and values.
    """

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, 20)

    epsilon = 0.05

    q_S_A_1 = TabularStateActionValueEstimator(mdp_environment, epsilon, None)

    mdp_agent_1 = StochasticMdpAgent(
        'test',
        random_state,
        q_S_A_1.get_initial_policy(),
        1
    )

    iterate_value_q_pi(
        agent=mdp_agent_1,
        environment=mdp_environment,
        num_improvements=5,
        num_episodes_per_improvement=10,
        num_updates_per_improvement=None,
        alpha=None,
        mode=Mode.Q_LEARNING,
        n_steps=None,
        planning_environment=None,
        make_final_policy_greedy=True,
        q_S_A=q_S_A_1
    )

    q_S_A_2 = TabularStateActionValueEstimator(mdp_environment, epsilon, None)

    mdp_agent_2 = StochasticMdpAgent(
        'test',
        random_state,
        q_S_A_2.get_initial_policy(),
        1
    )

    iterate_value_q_pi(
        agent=mdp_agent_2,
        environment=mdp_environment,
        num_improvements=5,
        num_episodes_per_improvement=5,
        num_updates_per_improvement=None,
        alpha=None,
        mode=Mode.Q_LEARNING,
        n_steps=None,
        planning_environment=None,
        make_final_policy_greedy=True,
        q_S_A=q_S_A_2
    )

    test_state = mdp_environment.SS[5]
    test_action = test_state.AA[0]

    assert q_S_A_1 != q_S_A_2
    assert q_S_A_1[test_state] != q_S_A_2[test_state]
    assert q_S_A_1[test_state][test_action] != q_S_A_2[test_state][test_action]

def test_q_learning_iterate_value_q_pi_function_approximation_no_formula():
    """
    Test Q-learning iteration of q_pi with function approximation and no model formula.
    """

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, 20)

    q_S_A = ApproximateStateActionValueEstimator(
        mdp_environment,
        0.05,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment),
        None,
        False,
        None,
        None
    )

    mdp_agent = StochasticMdpAgent(
        'test',
        random_state,
        q_S_A.get_initial_policy(),
        1
    )

    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=10,
        num_episodes_per_improvement=20,
        num_updates_per_improvement=None,
        alpha=None,
        mode=Mode.Q_LEARNING,
        n_steps=None,
        planning_environment=None,
        make_final_policy_greedy=True,
        q_S_A=q_S_A
    )

    # uncomment the following lines and run the test to update the fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_q_learning_iterate_value_q_pi_function_approximation_no_formula.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_q_learning_iterate_value_q_pi_function_approximation_no_formula.pickle', 'rb') as file:
        pi_fixture, q_S_A_fixture = pickle.load(file)

    assert np.allclose(mdp_agent.pi.estimator.model.model.coef_, pi_fixture.estimator.model.model.coef_)
    assert mdp_agent.pi.format_state_action_probs(mdp_environment.SS) == pi_fixture.format_state_action_probs(mdp_environment.SS)
    assert mdp_agent.pi.format_state_action_values(mdp_environment.SS) == pi_fixture.format_state_action_values(mdp_environment.SS)

def train_thread_target():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    mdp_agent = ActionValueMdpAgent(
        'test',
        random_state,
        1,
        TabularStateActionValueEstimator(mdp_environment, 0.1, None)
    )

    # thread_manager is a free variable supplied by the enclosing scope; it lets the owning code pause or abort
    # this very long training run.
    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=1000000,
        num_episodes_per_improvement=10,
        num_updates_per_improvement=None,
        alpha=0.1,
        mode=Mode.SARSA,
        n_steps=None,
        planning_environment=None,
        make_final_policy_greedy=False,
        thread_manager=thread_manager,
        num_improvements_per_plot=10
    )

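# A minimal sketch of how a target like the one above might be driven, using only the standard library: run the
# training loop in a background thread so the owning code can signal the externally defined thread_manager and
# join with a timeout rather than blocking on the full one million improvements. The helper name
# run_training_briefly is hypothetical.

import threading

def run_training_briefly():
    thread = threading.Thread(target=train_thread_target)
    thread.start()
    # ... the owning code would signal thread_manager here to pause or abort training ...
    thread.join(timeout=5.0)
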
def test_invalid_epsilon_iterate_value_q_pi():
    """
    Test that epsilon=0.0 raises a ValueError during TD iteration of q_pi.
    """

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    mdp_agent = ActionValueMdpAgent(
        'test',
        random_state,
        1,
        TabularStateActionValueEstimator(mdp_environment, 0.0, None)
    )

    with pytest.raises(ValueError, match='epsilon must be strictly > 0 for TD-learning'):
        iterate_value_q_pi(
            agent=mdp_agent,
            environment=mdp_environment,
            num_improvements=10,
            num_episodes_per_improvement=100,
            num_updates_per_improvement=None,
            alpha=0.1,
            mode=Mode.Q_LEARNING,
            n_steps=3,
            planning_environment=None,
            make_final_policy_greedy=False
        )

def test_policy_overrides():
    """
    Test the containment and equality overrides of function-approximation policies.
    """

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, 20)

    epsilon = 0.05

    q_S_A = ApproximateStateActionValueEstimator(
        mdp_environment,
        epsilon,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment),
        None,
        False,
        None,
        None
    )

    mdp_agent = ActionValueMdpAgent('test', random_state, 1, q_S_A)

    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=10,
        num_episodes_per_improvement=20,
        num_updates_per_improvement=None,
        alpha=None,
        mode=Mode.Q_LEARNING,
        n_steps=None,
        planning_environment=None,
        make_final_policy_greedy=True
    )

    # rerun the same training from the same random seed; the resulting policies should be equal.
    random_state = RandomState(12345)

    mdp_environment_2: Gridworld = Gridworld.example_4_1(random_state, 20)

    q_S_A_2 = ApproximateStateActionValueEstimator(
        mdp_environment_2,
        epsilon,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment_2),
        None,
        False,
        None,
        None
    )

    mdp_agent_2 = ActionValueMdpAgent('test', random_state, 1, q_S_A_2)

    iterate_value_q_pi(
        agent=mdp_agent_2,
        environment=mdp_environment_2,
        num_improvements=10,
        num_episodes_per_improvement=20,
        num_updates_per_improvement=None,
        alpha=None,
        mode=Mode.Q_LEARNING,
        n_steps=None,
        planning_environment=None,
        make_final_policy_greedy=True
    )

    assert isinstance(mdp_agent_2.most_recent_state, MdpState) and mdp_agent_2.most_recent_state in mdp_agent_2.pi

    with pytest.raises(ValueError, match='Attempted to check for None in policy.'):
        # noinspection PyTypeChecker
        if None in mdp_agent_2.pi:  # pragma: no cover
            pass

    assert mdp_agent.pi == mdp_agent_2.pi
    assert not (mdp_agent.pi != mdp_agent_2.pi)