def test_is_unwrappable_to():
    assert is_unwrappable_to(make_env('FrozenLake-v0'), TimeLimit)
    assert is_unwrappable_to(make_env('FrozenLake-v0'), DiscreteEnv)
    assert is_unwrappable_to(feature_wrapper.make('FrozenLake-v0'),
                             FrozenLakeFeatureWrapper)
    assert is_unwrappable_to(feature_wrapper.make('FrozenLake8x8-v0'),
                             FrozenLakeFeatureWrapper)
    assert is_unwrappable_to(feature_wrapper.make('FrozenLake-v0'),
                             feature_wrapper.FeatureWrapper)
    env = feature_wrapper.make('FrozenLake-v0')
    reward_function = FeatureBasedRewardFunction(env, 'random')
    env = RewardWrapper(env, reward_function)
    assert is_unwrappable_to(env, RewardWrapper)
    assert is_unwrappable_to(env, feature_wrapper.FeatureWrapper)
    assert is_unwrappable_to(env, DiscreteEnv)
    assert is_unwrappable_to(env, gym.Env)
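

# For intuition, a minimal sketch of what an is_unwrappable_to-style check does
# (not the library's actual implementation): walk gym's wrapper chain via the
# .env attribute and report whether any layer is an instance of the target class.
def _is_unwrappable_to_sketch(env, target_cls):
    while True:
        if isinstance(env, target_cls):
            return True
        if isinstance(env, gym.Wrapper):
            env = env.env  # peel off one wrapper layer
        else:
            return False

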
def test_feature_count():
    env = feature_wrapper.make('FrozenLake-v0')
    # create dummy data:
    path = [1, 2, 2, 3, 3, 3, 4, 4, 4, 4]
    features = [to_one_hot(i, 16) for i in path]

    trajs = [{'features': features}]
    result = feature_count(env, trajs, gamma=1.0)
    desired = np.array(
        [0., 1., 2., 3., 4., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
    assert isinstance(result, np.ndarray)
    assert np.allclose(result, desired)

    # two times the same traj should get the same feature count:
    trajs = [{'features': features}, {'features': features}]
    result = feature_count(env, trajs, gamma=1.0)
    desired = np.array(
        [0., 1., 2., 3., 4., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
    assert isinstance(result, np.ndarray)
    assert np.allclose(result, desired)

    # repeating a traj twice should double feature count (with gamma 1)
    trajs = [{'features': features + features}]
    result = feature_count(env, trajs, gamma=1.0)
    desired = np.array(
        [0., 2., 4., 6., 8., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
    assert isinstance(result, np.ndarray)
    assert np.allclose(result, desired)

    # test gamma 0.9:
    trajs = [{'features': features}]
    result = feature_count(env, trajs, gamma=.9)
    x = .9**6 + .9**7 + .9**8 + .9**9
    desired = np.array([
        0., 1., .9 + .81, .729 + .6561 + .59049, x, 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.
    ])
    assert isinstance(result, np.ndarray)
    assert np.allclose(result, desired)

    # test gamma 0:
    result = feature_count(env, trajs, gamma=0)
    desired = np.array(
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
    assert isinstance(result, np.ndarray)
    assert np.allclose(result, desired)
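

# The quantity tested above is the discounted feature expectation, i.e. the
# mean over trajectories of sum_t gamma**t * features[t]. A minimal numpy
# sketch of that formula (not the library's feature_count implementation):
def _feature_count_sketch(trajs, gamma):
    counts = []
    for traj in trajs:
        feats = np.array(traj['features'], dtype=float)
        discounts = gamma ** np.arange(len(feats))
        counts.append((discounts[:, None] * feats).sum(axis=0))
    return np.mean(counts, axis=0)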


# Example #3

def make_wrapped_env(env_id: str,
                     with_feature_wrapper: bool = False,
                     reward_function_factory: Callable = None,
                     with_model_wrapper: bool = False):
    """Make an environment, potentially wrapped in FeatureWrapper, RewardWrapper,
    and BaseWorldModelWrapper.

    Parameters
    ----------
    env_id: str
        The environment's id, e.g. 'FrozenLake-v0'.
    with_feature_wrapper: bool
        Whether to use a feature wrapper.
    reward_function_factory: Callable
        A function which returns a new reward function when called. If this is
        provided, the environment will be wrapped in a RewardWrapper using
        the returned reward function.
    with_model_wrapper: bool
        Whether to use a BaseWorldModelWrapper.

    Returns
    -------
    gym.Env
        A gym environment, potentially wrapped.
    """
    assert env_id in ENV_IDS
    if with_feature_wrapper:
        assert env_id in feature_wrapper.feature_wrappable_envs()
        env = feature_wrapper.make(env_id)
    else:
        env = make_env(env_id)

    if reward_function_factory is not None:
        reward_function = reward_function_factory(env)
        assert isinstance(reward_function, BaseRewardFunction)
        env = RewardWrapper(env, reward_function)

    if with_model_wrapper:
        if utils.wrapper.is_unwrappable_to(env, DiscreteEnv):
            env = DiscreteEnvModelWrapper(env)
        elif utils.wrapper.is_unwrappable_to(env, MazeWorld):
            env = MazeModelWrapper(env)
        else:
            raise NotImplementedError()
    return env
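

# Hypothetical usage sketch of make_wrapped_env. The factory below is purely
# illustrative (it mirrors the FeatureBasedRewardFunction(env, 'random') pattern
# used elsewhere in these examples) and is not part of the library.
def _random_reward_factory(env):
    return FeatureBasedRewardFunction(env, 'random')

_wrapped_env = make_wrapped_env('FrozenLake-v0',
                                with_feature_wrapper=True,
                                reward_function_factory=_random_reward_factory,
                                with_model_wrapper=True)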


# Example #4

store_to = 'data/frozen/expert/'
no_episodes = 1000
max_steps_per_episode = 100


def rl_alg_factory(env):
    '''Return an RL algorithm which is used both for the expert and
    in the IRL loop.'''
    return TabularQ(env)


# Apprenticeship IRL assumes that rewards are linear in features.
# FrozenLake, however, doesn't provide features. Since it is small enough
# for tabular methods, we simply use a wrapper that encodes each state as
# a one-hot feature vector (see the short illustration below).
env = feature_wrapper.make('FrozenLake-v0')
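
# Quick illustration of why one-hot features make the linearity assumption
# trivial (a sketch; numpy assumed imported as np, as in the other examples):
# with phi(s) the one-hot vector of state s, any tabular reward r(s) equals
# w . phi(s) with w[s] = r(s).
_w = np.arange(16, dtype=float)    # a hypothetical per-state reward table
_phi = np.eye(16)                  # one-hot feature vectors for all 16 states
assert np.allclose(_phi @ _w, _w)  # the linear form reads the table back exactly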

# Generate expert trajectories.
expert_agent = rl_alg_factory(env)
print('Training expert agent...')
expert_agent.train(15)
print('Done training expert')
expert_trajs = collect_trajs(env, expert_agent, no_episodes,
                             max_steps_per_episode, store_to)

# If expert data has already been generated, you can comment out the previous
# block and instead load the trajectories from file by uncommenting the next
# two lines:
# with open(store_to + 'trajs.pkl', 'rb') as f:
#     expert_trajs = pickle.load(f)
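
# A quick sanity check on the collected data (a sketch: it assumes collect_trajs
# returns one dict per episode with the keys used elsewhere in these examples):
assert len(expert_trajs) == no_episodes
assert {'states', 'actions', 'rewards', 'features'} <= set(expert_trajs[0])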


# Example #5

def quick_run_alg(alg_class, config=None):
    # avoid a shared mutable default argument for config:
    if config is None:
        config = {}
    env = feature_wrapper.make('FrozenLake-v0')
    reward_function = FeatureBasedRewardFunction(env, 'random')
    env = RewardWrapper(env, reward_function)

    def rl_alg_factory(env):
        return ValueIteration(env, {})

    expert_trajs = [{
        'states': [
            0, 0, 4, 0, 4, 8, 4, 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 4, 4, 8, 9,
            8, 8, 9, 10, 14, 15
        ],
        'actions': [
            0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1,
            3, 3, 1, 0, 1
        ],
        'rewards': [
            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
            1.0
        ],
        'true_rewards': [],
        'features': [
            np.array([
                1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.
            ])
        ]
    }, {
        'states': [0, 4, 8, 8, 9, 10, 6, 2, 6, 10, 14, 15],
        'actions': [0, 0, 3, 3, 1, 0, 2, 0, 2, 0, 1],
        'rewards': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
        'true_rewards': [],
        'features': [
            np.array([
                0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.
            ])
        ]
    }]
    metrics = []
    alg = alg_class(env, expert_trajs, rl_alg_factory, metrics, config)
    alg.train(2, 2, 2)
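

# Hedged usage sketch: any class exposing the constructor signature used above
# (env, expert_trajs, rl_alg_factory, metrics, config) plus a train(...) method
# can be smoke-tested through quick_run_alg. The dummy class below is purely
# illustrative and not part of the library.
class _DummyIRLAlg:
    def __init__(self, env, expert_trajs, rl_alg_factory, metrics, config):
        self.env = env
        self.expert_trajs = expert_trajs

    def train(self, *args):
        pass  # a real IRL algorithm would alternate reward and policy updates here

# quick_run_alg(_DummyIRLAlg)

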
def test_value_iteration():
    # gamma = 1.0
    env = gym.make('FrozenLake-v0')
    agent = ValueIteration(env, {'gamma': 1.0})
    agent.train(10)
    state_values = agent.state_values
    assert isinstance(state_values, np.ndarray)
    assert state_values.shape == (17, )
    # argmax should be state just before frisbee
    # (15 is final state, 16 is absorbing state)
    assert np.argmax(state_values) == 14
    assert state_values[14] > 0.93 and state_values[14] < 0.95
    assert state_values[15] == 0

    # gamma = 0.9
    env = gym.make('FrozenLake-v0')
    agent = ValueIteration(env, {'gamma': 0.9})
    agent.train(10)
    state_values = agent.state_values
    assert isinstance(state_values, np.ndarray)
    assert state_values.shape == (17, )
    # argmax should be state just before frisbee
    # (15 is final state, 16 is absorbing state)
    assert np.argmax(state_values) == 14
    assert state_values[14] > 0.63 and state_values[14] < 0.65
    # holes and frisbee should have zero value:
    for i in [5, 7, 11, 12, 15]:
        assert state_values[i] == 0

    # check some q values:
    # in the second-to-last state (14) the best action is 1 (down in gym's
    # FrozenLake encoding): under slippery dynamics it can only stay put,
    # slip left onto frozen ice, or slip right into the goal.
    assert np.argmax(agent.q_values[14, :]) == 1
    assert np.min(agent.q_values) == 0
    assert np.max(agent.q_values) <= 1

    # check policy:
    for i in range(16):
        assert np.isclose(np.sum(agent.policy(i)), 1.)
        assert np.min(agent.policy(i)) >= 0.
        assert np.argmax(agent.q_values[i, :]) == np.argmax(agent.policy(i))

    # check softmax policy
    old_state_values = agent.state_values
    old_q_values = agent.q_values
    agent = ValueIteration(env, {'gamma': 0.9, 'temperature': 0.1})
    agent.train(10)
    assert np.all(agent.state_values <= old_state_values)
    # at least initial state should now have lower value:
    assert agent.state_values[0] < old_state_values[0]
    assert np.all(agent.q_values <= old_q_values)
    # check policy:
    for i in range(16):
        assert np.isclose(np.sum(agent.policy(i)), 1.)
        assert np.min(agent.policy(i)) >= 0.
        assert np.argmax(agent.q_values[i, :]) == np.argmax(agent.policy(i))
        # ordering of probabilities should stay the same with softmax
        assert np.all(
            np.argsort(old_q_values[i, :]) == np.argsort(agent.policy(i)))

    # test policy array:
    policy_array = agent.policy_array()
    assert policy_array.shape == (17, 4)
    for i in range(16):
        assert np.all(agent.policy(i) == policy_array[i, :])

    # check that the true reward isn't leaked through the reward wrapper:
    # with an all-zero reward function, every state value must be zero.
    env = feature_wrapper.make('FrozenLake-v0')
    reward_function = FeatureBasedRewardFunction(env, np.zeros(16))
    env = RewardWrapper(env, reward_function)
    agent = ValueIteration(env, {})
    agent.train(10)
    assert np.all(agent.state_values == 0)
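

# For reference, the tabular Bellman backup that a value-iteration agent like
# the one tested above performs, written as a generic numpy sketch (not the
# library's ValueIteration implementation):
def _value_iteration_sketch(transitions, rewards, gamma, n_iters=100):
    """transitions: (S, A, S) array of P(s'|s, a); rewards: (S, A) array."""
    values = np.zeros(rewards.shape[0])
    for _ in range(n_iters):
        q_values = rewards + gamma * transitions @ values  # Bellman backup, shape (S, A)
        values = q_values.max(axis=1)
    return values, q_values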


# Example #7

# Define important script constants here:
store_to = 'data/pendulum/expert/'
no_episodes = 1000
max_steps_per_episode = 200


def rl_alg_factory(env):
    '''Return an RL algorithm which is used both for the expert and
    in the IRL loop.'''
    return PPO(env)


# Pendulum has features that can easily be extracted from the previous state;
# see PendulumFeatureWrapper.
env = feature_wrapper.make('Pendulum-v0')

# Generate expert trajectories.
# expert_agent = rl_alg_factory(env)
# print('Training expert agent...')
# expert_agent.train(15)
# print('Done training expert')
# expert_trajs = collect_trajs(env, expert_agent, no_episodes,
#                              max_steps_per_episode, store_to)

# The expert-training block above is commented out because expert data has
# already been generated; the trajectories are loaded from file instead. To
# regenerate them, uncomment the block above and comment out the next two lines:
with open(store_to + 'trajs.pkl', 'rb') as f:
    expert_trajs = pickle.load(f)