Example #1
def test_learn():

    random_state = RandomState(12345)

    gym = Gym(random_state=random_state, T=None, gym_id='CartPole-v1')

    q_S_A = TabularStateActionValueEstimator(gym, 0.05, 0.001)

    mdp_agent = StochasticMdpAgent('agent', random_state,
                                   q_S_A.get_initial_policy(), 1)

    iterate_value_q_pi(agent=mdp_agent,
                       environment=gym,
                       num_improvements=10,
                       num_episodes_per_improvement=100,
                       num_updates_per_improvement=None,
                       alpha=0.1,
                       mode=Mode.SARSA,
                       n_steps=1,
                       planning_environment=None,
                       make_final_policy_greedy=False,
                       q_S_A=q_S_A)

    # uncomment the following lines and run the test to update the fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_gym.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(f'{os.path.dirname(__file__)}/fixtures/test_gym.pickle',
              'rb') as file:
        fixture_pi, fixture_q_S_A = pickle.load(file)

    assert tabular_pi_legacy_eq(mdp_agent.pi, fixture_pi)
    assert tabular_estimator_legacy_eq(q_S_A, fixture_q_S_A)
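Several of the examples below repeat the same pickled-fixture pattern: train the agent, optionally re-serialize the resulting policy and estimator, and then compare the fresh results against the stored artifact. As a reference, here is a minimal, self-contained sketch of that pattern; save_fixture and load_fixture are hypothetical helpers, not part of the rlai API.

import pickle


def save_fixture(path, pi, q_S_A):
    # overwrite the stored fixture with freshly trained objects
    with open(path, 'wb') as file:
        pickle.dump((pi, q_S_A), file)


def load_fixture(path):
    # read back the (policy, estimator) pair written by save_fixture
    with open(path, 'rb') as file:
        return pickle.load(file)

In the tests shown here, the save step is kept commented out so that fixtures are only regenerated deliberately.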
Example #2
def test_sarsa_iterate_value_q_pi_make_greedy():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    q_S_A = TabularStateActionValueEstimator(mdp_environment, 0.05, None)

    mdp_agent = StochasticMdpAgent('test', random_state,
                                   q_S_A.get_initial_policy(), 1)

    iterate_value_q_pi(agent=mdp_agent,
                       environment=mdp_environment,
                       num_improvements=10,
                       num_episodes_per_improvement=100,
                       num_updates_per_improvement=None,
                       alpha=0.1,
                       mode=Mode.SARSA,
                       n_steps=1,
                       planning_environment=None,
                       make_final_policy_greedy=True,
                       q_S_A=q_S_A)

    # uncomment the following lines and run the test to update the fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_td_iteration_of_value_q_pi_make_greedy.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(
            f'{os.path.dirname(__file__)}/fixtures/test_td_iteration_of_value_q_pi_make_greedy.pickle',
            'rb') as file:
        pi_fixture, q_S_A_fixture = pickle.load(file)

    assert tabular_pi_legacy_eq(mdp_agent.pi, pi_fixture)
    assert tabular_estimator_legacy_eq(q_S_A, q_S_A_fixture)
Example #3
def test_q_learning_iterate_value_q_pi_function_approximation_invalid_formula():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, 20)

    q_S_A = ApproximateStateActionValueEstimator(
        mdp_environment, 0.05,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment),
        f'C(s, levels={[s.i for s in mdp_environment.SS]}):C(a, levels={[a.i for a in mdp_environment.SS[0].AA]})',
        False, None, None)

    mdp_agent = StochasticMdpAgent('test', random_state,
                                   q_S_A.get_initial_policy(), 1)

    with pytest.raises(ValueError, match='Invalid combination of formula'):
        iterate_value_q_pi(agent=mdp_agent,
                           environment=mdp_environment,
                           num_improvements=5,
                           num_episodes_per_improvement=5,
                           num_updates_per_improvement=None,
                           alpha=None,
                           mode=Mode.Q_LEARNING,
                           n_steps=None,
                           planning_environment=None,
                           make_final_policy_greedy=False,
                           q_S_A=q_S_A)
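The formula passed to ApproximateStateActionValueEstimator above is a patsy-style interaction of categorical state and action terms, built with an f-string over the environment's state and action identifiers. A toy illustration of what that string expands to, using made-up identifier lists for a 2-state, 2-action environment:

# hypothetical identifiers standing in for [s.i for s in mdp_environment.SS]
# and [a.i for a in mdp_environment.SS[0].AA]
state_ids = [0, 1]
action_ids = [0, 1]

formula = f'C(s, levels={state_ids}):C(a, levels={action_ids})'
print(formula)  # C(s, levels=[0, 1]):C(a, levels=[0, 1])

In this test the formula is combined with GridworldFeatureExtractor, which the estimator rejects with the ValueError asserted above; Example #6 pairs the same formula with StateActionIdentityFeatureExtractor and trains successfully.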
Example #4
def test_iterate_value_q_pi_with_pdf():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)

    q_S_A = TabularStateActionValueEstimator(mdp_environment, 0.05, None)

    mdp_agent = StochasticMdpAgent('test', random_state,
                                   q_S_A.get_initial_policy(), 1)

    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=10,
        num_episodes_per_improvement=100,
        num_updates_per_improvement=None,
        alpha=0.1,
        mode=Mode.Q_LEARNING,
        n_steps=1,
        planning_environment=None,
        make_final_policy_greedy=False,
        q_S_A=q_S_A,
        num_improvements_per_plot=5,
        pdf_save_path=tempfile.NamedTemporaryFile(delete=False).name)
Example #5
def test_n_step_q_learning_iterate_value_q_pi():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    q_S_A = TabularStateActionValueEstimator(mdp_environment, 0.05, None)
    mdp_agent = ActionValueMdpAgent('test', random_state, 1, q_S_A)

    iterate_value_q_pi(agent=mdp_agent,
                       environment=mdp_environment,
                       num_improvements=10,
                       num_episodes_per_improvement=100,
                       num_updates_per_improvement=None,
                       alpha=0.1,
                       mode=Mode.Q_LEARNING,
                       n_steps=3,
                       planning_environment=None,
                       make_final_policy_greedy=False)

    # uncomment the following lines and run the test to update the fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_td_n_step_q_learning_iteration_of_value_q_pi.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(
            f'{os.path.dirname(__file__)}/fixtures/test_td_n_step_q_learning_iteration_of_value_q_pi.pickle',
            'rb') as file:
        fixture_pi, fixture_q_S_A = pickle.load(file)

    assert tabular_pi_legacy_eq(mdp_agent.pi, fixture_pi)
    assert tabular_estimator_legacy_eq(q_S_A, fixture_q_S_A)
Example #6
def test_q_learning_iterate_value_q_pi_function_approximation_with_formula():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, 20)
    q_S_A = ApproximateStateActionValueEstimator(
        mdp_environment, 0.05,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        StateActionIdentityFeatureExtractor(mdp_environment),
        f'C(s, levels={[s.i for s in mdp_environment.SS]}):C(a, levels={[a.i for a in mdp_environment.SS[0].AA]})',
        False, None, None)
    mdp_agent = ActionValueMdpAgent('test', random_state, 1, q_S_A)

    iterate_value_q_pi(agent=mdp_agent,
                       environment=mdp_environment,
                       num_improvements=5,
                       num_episodes_per_improvement=5,
                       num_updates_per_improvement=None,
                       alpha=None,
                       mode=Mode.Q_LEARNING,
                       n_steps=None,
                       planning_environment=None,
                       make_final_policy_greedy=False)

    # uncomment the following lines and run the test to update the fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_q_learning_iterate_value_q_pi_function_approximation.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(
            f'{os.path.dirname(__file__)}/fixtures/test_q_learning_iterate_value_q_pi_function_approximation.pickle',
            'rb') as file:
        pi_fixture, q_S_A_fixture = pickle.load(file)

    assert np.allclose(mdp_agent.pi.estimator.model.model.coef_,
                       pi_fixture.estimator.model.model.coef_)
Example #7
def main():

    random = RandomState(12345)
    gridworld = Gridworld.example_4_1(random, None)

    # the bottom-right corner (3,3) is a goal state. get the states surrounding this goal. these will become the sticky
    # states.
    sticky_states = [
        gridworld.grid[2, 2], gridworld.grid[2, 3], gridworld.grid[3, 2]
    ]

    # amplify all negative rewards in the sticky states by a factor of 10, keeping
    # the transition probabilities the same (see the sketch after this example for
    # the structure of the transition dictionary).
    for sticky_state in sticky_states:
        for a in gridworld.p_S_prime_R_given_S_A[sticky_state]:
            for s_prime in gridworld.p_S_prime_R_given_S_A[sticky_state][a]:
                reward_probs = \
                    gridworld.p_S_prime_R_given_S_A[sticky_state][a][s_prime]
                gridworld.p_S_prime_R_given_S_A[sticky_state][a][s_prime] = {
                    Reward(r.i, r.r * 10.0 if r.r < 0.0 else r.r): reward_probs[r]
                    for r in reward_probs
                }

    epsilon = 0.1

    q_S_A = TabularStateActionValueEstimator(
        environment=gridworld,
        epsilon=epsilon,
        continuous_state_discretization_resolution=None)

    pi = q_S_A.get_initial_policy()

    mdp_agent = StochasticMdpAgent(name='agent',
                                   random_state=random,
                                   pi=pi,
                                   gamma=1.0)

    # iterate the agent's policy using q-learning temporal differencing
    iterate_value_q_pi(agent=mdp_agent,
                       environment=gridworld,
                       num_improvements=20,
                       num_episodes_per_improvement=100,
                       num_updates_per_improvement=None,
                       alpha=None,
                       mode=Mode.Q_LEARNING,
                       n_steps=None,
                       planning_environment=None,
                       make_final_policy_greedy=True,
                       q_S_A=q_S_A,
                       num_improvements_per_plot=20)

    for s in pi:
        print(f'State {s.i}:')
        for a in pi[s]:
            if pi[s][a] > 0.0:
                print(f'\tPr({a.name}):  {pi[s][a]}')
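The reward-amplification loop above rewrites entries of the nested transition model p_S_prime_R_given_S_A, which maps state -> action -> next state -> reward -> probability. Below is a minimal sketch of the same transform on a plain dictionary of that shape; the namedtuple stands in for rlai's Reward (an identifier plus a reward value), and the states and probabilities are made up for illustration.

from collections import namedtuple

# stand-in for rlai's Reward, which pairs an identifier with a reward value
Reward = namedtuple('Reward', ['i', 'r'])

# state -> action -> next state -> reward -> probability
p = {
    'state_2_2': {
        'down': {
            'goal': {Reward(0, -1.0): 0.9, Reward(1, 0.0): 0.1}
        }
    }
}

for a in p['state_2_2']:
    for s_prime in p['state_2_2'][a]:
        reward_probs = p['state_2_2'][a][s_prime]
        p['state_2_2'][a][s_prime] = {
            Reward(r.i, r.r * 10.0 if r.r < 0.0 else r.r): reward_probs[r]
            for r in reward_probs
        }

print(p['state_2_2']['down']['goal'])
# {Reward(i=0, r=-10.0): 0.9, Reward(i=1, r=0.0): 0.1}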
Example #8
def main():

    random_state = RandomState(12345)

    environment = Gym(
        random_state=random_state,
        T=None,
        gym_id='CartPole-v1',
        continuous_action_discretization_resolution=None,
        render_every_nth_episode=100
    )

    model = SKLearnSGD(
        loss='squared_loss',
        alpha=0.0,
        learning_rate='constant',
        eta0=0.0001,
        scale_eta0_for_y=False
    )

    feature_extractor = CartpoleFeatureExtractor(
        environment=environment
    )

    q_S_A = ApproximateStateActionValueEstimator(
        environment=environment,
        epsilon=0.02,
        model=model,
        feature_extractor=feature_extractor,
        formula=None,
        plot_model=False,
        plot_model_per_improvements=None,
        plot_model_bins=None
    )

    agent = StochasticMdpAgent(
        name='Cartpole Agent',
        random_state=random_state,
        pi=q_S_A.get_initial_policy(),
        gamma=0.95
    )

    iterate_value_q_pi(
        agent=agent,
        environment=environment,
        num_improvements=15000,
        num_episodes_per_improvement=1,
        num_updates_per_improvement=1,
        alpha=None,
        mode=Mode.SARSA,
        n_steps=None,
        planning_environment=None,
        make_final_policy_greedy=True,
        q_S_A=q_S_A,
        num_improvements_per_plot=100
    )
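SKLearnSGD here appears to wrap a scikit-learn stochastic-gradient-descent regressor, with the keyword arguments other than scale_eta0_for_y presumably passed through to the underlying estimator. For reference, a roughly equivalent raw scikit-learn configuration is sketched below; note that 'squared_loss' is the older spelling of this loss, which recent scikit-learn releases name 'squared_error'.

from sklearn.linear_model import SGDRegressor

# roughly the configuration requested above: squared-error loss, effectively
# no regularization (alpha=0.0), and a small constant learning rate
model = SGDRegressor(
    loss='squared_error',
    alpha=0.0,
    learning_rate='constant',
    eta0=0.0001
)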
Example #9
def test_q_learning_iterate_value_q_pi_function_approximation_policy_ne():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, 20)

    epsilon = 0.05

    q_S_A_1 = ApproximateStateActionValueEstimator(
        mdp_environment, epsilon,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment), None, False, None, None)

    mdp_agent_1 = StochasticMdpAgent('test', random_state,
                                     q_S_A_1.get_initial_policy(), 1)

    iterate_value_q_pi(agent=mdp_agent_1,
                       environment=mdp_environment,
                       num_improvements=5,
                       num_episodes_per_improvement=10,
                       num_updates_per_improvement=None,
                       alpha=None,
                       mode=Mode.Q_LEARNING,
                       n_steps=None,
                       planning_environment=None,
                       make_final_policy_greedy=True,
                       q_S_A=q_S_A_1)

    q_S_A_2 = ApproximateStateActionValueEstimator(
        mdp_environment, epsilon,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment), None, False, None, None)

    mdp_agent_2 = StochasticMdpAgent('test', random_state,
                                     q_S_A_2.get_initial_policy(), 1)

    iterate_value_q_pi(agent=mdp_agent_2,
                       environment=mdp_environment,
                       num_improvements=5,
                       num_episodes_per_improvement=5,
                       num_updates_per_improvement=None,
                       alpha=None,
                       mode=Mode.Q_LEARNING,
                       n_steps=None,
                       planning_environment=None,
                       make_final_policy_greedy=True,
                       q_S_A=q_S_A_2)

    assert mdp_agent_1.pi.estimator != mdp_agent_2.pi.estimator
    assert mdp_agent_1.pi.estimator.model != mdp_agent_2.pi.estimator.model
Example #10
def test_q_learning_iterate_value_q_pi_tabular_policy_ne():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, 20)

    epsilon = 0.05

    q_S_A_1 = TabularStateActionValueEstimator(mdp_environment, epsilon, None)

    mdp_agent_1 = StochasticMdpAgent('test', random_state,
                                     q_S_A_1.get_initial_policy(), 1)

    iterate_value_q_pi(agent=mdp_agent_1,
                       environment=mdp_environment,
                       num_improvements=5,
                       num_episodes_per_improvement=10,
                       num_updates_per_improvement=None,
                       alpha=None,
                       mode=Mode.Q_LEARNING,
                       n_steps=None,
                       planning_environment=None,
                       make_final_policy_greedy=True,
                       q_S_A=q_S_A_1)

    q_S_A_2 = TabularStateActionValueEstimator(mdp_environment, epsilon, None)

    mdp_agent_2 = StochasticMdpAgent('test', random_state,
                                     q_S_A_2.get_initial_policy(), 1)

    iterate_value_q_pi(agent=mdp_agent_2,
                       environment=mdp_environment,
                       num_improvements=5,
                       num_episodes_per_improvement=5,
                       num_updates_per_improvement=None,
                       alpha=None,
                       mode=Mode.Q_LEARNING,
                       n_steps=None,
                       planning_environment=None,
                       make_final_policy_greedy=True,
                       q_S_A=q_S_A_2)

    test_state = mdp_environment.SS[5]
    test_action = test_state.AA[0]

    assert q_S_A_1 != q_S_A_2
    assert q_S_A_1[test_state] != q_S_A_2[test_state]
    assert q_S_A_1[test_state][test_action] != q_S_A_2[test_state][test_action]
Example #11
def test_q_learning_iterate_value_q_pi_function_approximation_no_formula():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, 20)

    q_S_A = ApproximateStateActionValueEstimator(
        mdp_environment, 0.05,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment), None, False, None, None)

    mdp_agent = StochasticMdpAgent('test', random_state,
                                   q_S_A.get_initial_policy(), 1)

    iterate_value_q_pi(agent=mdp_agent,
                       environment=mdp_environment,
                       num_improvements=10,
                       num_episodes_per_improvement=20,
                       num_updates_per_improvement=None,
                       alpha=None,
                       mode=Mode.Q_LEARNING,
                       n_steps=None,
                       planning_environment=None,
                       make_final_policy_greedy=True,
                       q_S_A=q_S_A)

    # uncomment the following lines and run the test to update the fixture
    # with open(f'{os.path.dirname(__file__)}/fixtures/test_q_learning_iterate_value_q_pi_function_approximation_no_formula.pickle', 'wb') as file:
    #     pickle.dump((mdp_agent.pi, q_S_A), file)

    with open(
            f'{os.path.dirname(__file__)}/fixtures/test_q_learning_iterate_value_q_pi_function_approximation_no_formula.pickle',
            'rb') as file:
        pi_fixture, q_S_A_fixture = pickle.load(file)

    assert np.allclose(mdp_agent.pi.estimator.model.model.coef_,
                       pi_fixture.estimator.model.model.coef_)
    assert mdp_agent.pi.format_state_action_probs(mdp_environment.SS) == \
        pi_fixture.format_state_action_probs(mdp_environment.SS)
    assert mdp_agent.pi.format_state_action_values(mdp_environment.SS) == \
        pi_fixture.format_state_action_values(mdp_environment.SS)
Example #12
    def train_thread_target():

        random_state = RandomState(12345)
        mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
        mdp_agent = ActionValueMdpAgent(
            'test', random_state, 1,
            TabularStateActionValueEstimator(mdp_environment, 0.1, None))

        iterate_value_q_pi(agent=mdp_agent,
                           environment=mdp_environment,
                           num_improvements=1000000,
                           num_episodes_per_improvement=10,
                           num_updates_per_improvement=None,
                           alpha=0.1,
                           mode=Mode.SARSA,
                           n_steps=None,
                           planning_environment=None,
                           make_final_policy_greedy=False,
                           thread_manager=thread_manager,
                           num_improvements_per_plot=10)
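Unlike the other examples, this one is an inner function rather than a complete test: it closes over a thread_manager defined in the enclosing scope (not shown) and is intended to run a long improvement loop off the main thread. A hypothetical sketch of how such a target might be driven with the standard library; the daemon flag and join timeout are illustrative choices, not part of the original.

import threading

# run the training loop in a background thread so the caller can continue
# (e.g., to pause or abort training via the thread manager)
train_thread = threading.Thread(target=train_thread_target, daemon=True)
train_thread.start()

# ... interact with thread_manager here ...

train_thread.join(timeout=5.0)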
Example #13
def test_invalid_epsilon_iterate_value_q_pi():

    random_state = RandomState(12345)
    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, None)
    mdp_agent = ActionValueMdpAgent(
        'test', random_state, 1,
        TabularStateActionValueEstimator(mdp_environment, 0.0, None))

    with pytest.raises(ValueError,
                       match='epsilon must be strictly > 0 for TD-learning'):
        iterate_value_q_pi(agent=mdp_agent,
                           environment=mdp_environment,
                           num_improvements=10,
                           num_episodes_per_improvement=100,
                           num_updates_per_improvement=None,
                           alpha=0.1,
                           mode=Mode.Q_LEARNING,
                           n_steps=3,
                           planning_environment=None,
                           make_final_policy_greedy=False)
Example #14
def test_policy_overrides():

    random_state = RandomState(12345)

    mdp_environment: Gridworld = Gridworld.example_4_1(random_state, 20)

    epsilon = 0.05

    q_S_A = ApproximateStateActionValueEstimator(
        mdp_environment, epsilon,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment), None, False, None, None)

    mdp_agent = ActionValueMdpAgent('test', random_state, 1, q_S_A)

    iterate_value_q_pi(
        agent=mdp_agent,
        environment=mdp_environment,
        num_improvements=10,
        num_episodes_per_improvement=20,
        num_updates_per_improvement=None,
        alpha=None,
        mode=Mode.Q_LEARNING,
        n_steps=None,
        planning_environment=None,
        make_final_policy_greedy=True,
    )

    random_state = RandomState(12345)

    mdp_environment_2: Gridworld = Gridworld.example_4_1(random_state, 20)

    q_S_A_2 = ApproximateStateActionValueEstimator(
        mdp_environment_2, epsilon,
        SKLearnSGD(random_state=random_state, scale_eta0_for_y=False),
        GridworldFeatureExtractor(mdp_environment_2), None, False, None, None)

    mdp_agent_2 = ActionValueMdpAgent('test', random_state, 1, q_S_A_2)

    iterate_value_q_pi(agent=mdp_agent_2,
                       environment=mdp_environment_2,
                       num_improvements=10,
                       num_episodes_per_improvement=20,
                       num_updates_per_improvement=None,
                       alpha=None,
                       mode=Mode.Q_LEARNING,
                       n_steps=None,
                       planning_environment=None,
                       make_final_policy_greedy=True)

    assert isinstance(mdp_agent_2.most_recent_state, MdpState)
    assert mdp_agent_2.most_recent_state in mdp_agent_2.pi

    with pytest.raises(ValueError,
                       match='Attempted to check for None in policy.'):
        # noinspection PyTypeChecker
        if None in mdp_agent_2.pi:  # pragma no cover
            pass

    assert mdp_agent.pi == mdp_agent_2.pi
    assert not (mdp_agent.pi != mdp_agent_2.pi)