Example #1
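The snippets on this page are excerpts from tests and helpers built around make_wrapped_env and are not self-contained. A minimal import block along the lines of the sketch below would be needed to run them; the module paths are hypothetical placeholders, not the package's actual layout.

# Assumed imports for the snippets below. The module paths are
# hypothetical placeholders -- adjust them to the real package layout.
import numpy as np

from irl_benchmark.wrappers import (            # hypothetical path
    make_wrapped_env, unwrap_env, is_unwrappable_to,
    FeatureWrapper, RewardWrapper, BaseWorldModelWrapper)
from irl_benchmark.rewards import (             # hypothetical path
    FeatureBasedRewardFunction, TabularRewardFunction)
from irl_benchmark.envs import MazeWorld        # hypothetical path
from irl_benchmark.rl import ValueIteration     # hypothetical path
from irl_benchmark.trajs import collect_trajs   # hypothetical path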
def test_frozen_features():
    env = make_wrapped_env('FrozenLake-v0', with_feature_wrapper=True)
    feature_wrapper = unwrap_env(env, FeatureWrapper)
    d = feature_wrapper.feature_dimensionality()
    ranges = feature_wrapper.feature_range()
    print(ranges)
    for i in range(16):
        feature = feature_wrapper.features(None, None, i)
        assert feature.shape == d
        assert np.all(feature >= ranges[0])
        assert np.all(feature <= ranges[1])
def test_frozenlake_fully_wrapped_transitions():
    env = make_wrapped_env(
        'FrozenLake-v0', with_feature_wrapper=True, with_model_wrapper=True)
    transitions = env.get_transition_array()

    # assert probability sums to 1.0
    for s in range(transitions.shape[0]):
        for a in range(transitions.shape[1]):
            # use isclose to be robust to floating-point rounding
            assert np.isclose(transitions[s, a].sum(), 1.0)

    assert isinstance(transitions, np.ndarray)
    assert transitions.shape == (17, 4, 17)
Example #3
def test_maze1_features():
    env = make_wrapped_env('MazeWorld1-v0', with_feature_wrapper=True)
    maze_env = unwrap_env(env, MazeWorld)
    feature_wrapper = unwrap_env(env, FeatureWrapper)
    d = feature_wrapper.feature_dimensionality()
    ranges = feature_wrapper.feature_range()
    print(ranges)
    # sample every 13th of the 10240 states
    # (range(10240, 13) would be empty and skip the checks entirely)
    for i in range(0, 10240, 13):
        for a in range(10):
            feature = feature_wrapper.features(maze_env.index_to_state(i), a,
                                               None)
            assert feature.shape == d
            assert np.all(feature >= ranges[0])
            assert np.all(feature <= ranges[1])
Example #4
def test_tabular_function():
    def reward_function_factory(env):
        params = np.zeros(64)
        params[-1] = 1.
        return TabularRewardFunction(env, params)

    env = make_wrapped_env('FrozenLake8x8-v0',
                           reward_function_factory=reward_function_factory,
                           with_model_wrapper=True)
    agent = ValueIteration(env)
    agent.train(1)
    trajs = collect_trajs(env, agent, 10)
    for traj in trajs:
        for i in range(len(traj['rewards'])):
            assert np.isclose(traj['rewards'][i], traj['true_rewards'][i])
def case_make_wrapped(env_id):
    env = make_wrapped_env(env_id)
    assert not is_unwrappable_to(env, FeatureWrapper)
    assert not is_unwrappable_to(env, RewardWrapper)
    assert not is_unwrappable_to(env, BaseWorldModelWrapper)

    env = make_wrapped_env(env_id, with_feature_wrapper=True)
    assert is_unwrappable_to(env, FeatureWrapper)
    assert not is_unwrappable_to(env, RewardWrapper)
    assert not is_unwrappable_to(env, BaseWorldModelWrapper)

    env = make_wrapped_env(env_id,
                           with_feature_wrapper=True,
                           with_model_wrapper=True)
    assert is_unwrappable_to(env, FeatureWrapper)
    assert not is_unwrappable_to(env, RewardWrapper)
    assert is_unwrappable_to(env, BaseWorldModelWrapper)

    def rew_fun_fact(env):
        return FeatureBasedRewardFunction(env, 'random')

    env = make_wrapped_env(env_id,
                           with_feature_wrapper=True,
                           reward_function_factory=rew_fun_fact,
                           with_model_wrapper=False)
    assert is_unwrappable_to(env, FeatureWrapper)
    assert is_unwrappable_to(env, RewardWrapper)
    assert not is_unwrappable_to(env, BaseWorldModelWrapper)

    env = make_wrapped_env(env_id,
                           with_feature_wrapper=True,
                           reward_function_factory=rew_fun_fact,
                           with_model_wrapper=True)
    assert is_unwrappable_to(env, FeatureWrapper)
    assert is_unwrappable_to(env, RewardWrapper)
    assert is_unwrappable_to(env, BaseWorldModelWrapper)
def test_update_parameters_frozen_feature():
    def rew_fun_factory(env):
        return FeatureBasedRewardFunction(env, 'random')

    env = make_wrapped_env(
        'FrozenLake-v0',
        with_feature_wrapper=True,
        reward_function_factory=rew_fun_factory)

    reward_wrapper = unwrap_env(env, RewardWrapper)
    params = np.copy(reward_wrapper.reward_function.parameters)
    domain = reward_wrapper.reward_function.domain()
    rews = reward_wrapper.reward_function.reward(domain)
    reward_wrapper.update_reward_parameters(2 * params)
    rews2 = reward_wrapper.reward_function.reward(domain)
    assert np.all(np.isclose(2 * rews, rews2))
    reward_wrapper.update_reward_parameters(np.zeros_like(params))
    rews3 = reward_wrapper.reward_function.reward(domain)
    assert np.all(np.isclose(rews3, np.zeros_like(rews3)))
def test_get_reward_matrix_wrapped_feature():

    true_rews = np.random.randn(65)
    true_rews[-1] = 0

    def reward_function_factory(env):
        return FeatureBasedRewardFunction(env, true_rews[:-1])

    env = make_wrapped_env(
        'FrozenLake8x8-v0',
        with_feature_wrapper=True,
        reward_function_factory=reward_function_factory,
        with_model_wrapper=True)

    transitions = env.get_transition_array()
    rewards = env.get_reward_array()

    assert rewards.shape == (65, 4)
    assert transitions.shape == (65, 4, 65)

    for s in range(64 + 1):
        for a in range(4):
            assert np.isclose(rewards[s, a],
                              transitions[s, a, :].dot(true_rews))
def quick_run_alg(alg_class, config=None):
    # avoid a mutable default argument
    config = {} if config is None else config
    def reward_function_factory(env):
        return FeatureBasedRewardFunction(env, 'random')

    env = make_wrapped_env('FrozenLake-v0',
                           with_feature_wrapper=True,
                           reward_function_factory=reward_function_factory,
                           with_model_wrapper=True)

    def rl_alg_factory(env):
        return ValueIteration(env, {})

    expert_trajs = [{
        'states': [
            0, 0, 4, 0, 4, 8, 4, 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 4, 4, 8, 9,
            8, 8, 9, 10, 14, 15
        ],
        'actions': [
            0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1,
            3, 3, 1, 0, 1
        ],
        'rewards': [
            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
            1.0
        ],
        'true_rewards': [],
        'features': [
            np.array([
                1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.
            ])
        ]
    }, {
        'states': [0, 4, 8, 8, 9, 10, 6, 2, 6, 10, 14, 15],
        'actions': [0, 0, 3, 3, 1, 0, 2, 0, 2, 0, 1],
        'rewards': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
        'true_rewards': [],
        'features': [
            np.array([
                0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.
            ])
        ]
    }]
    metrics = []
    alg = alg_class(env, expert_trajs, rl_alg_factory, metrics, config)
    alg.train(2, 2, 2)
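quick_run_alg works with any IRL algorithm class whose constructor matches alg_class(env, expert_trajs, rl_alg_factory, metrics, config) and which provides a train method taking three counts. The stub below is purely illustrative (it is not a class from the package) and only makes that contract explicit:

class DummyIRLAlgorithm:
    # Minimal stand-in matching the interface quick_run_alg expects.
    def __init__(self, env, expert_trajs, rl_alg_factory, metrics, config):
        self.env = env
        self.expert_trajs = expert_trajs
        self.rl_alg_factory = rl_alg_factory
        self.metrics = metrics
        self.config = config

    def train(self, no_irl_iterations, no_rl_episodes, no_irl_episodes):
        # A real algorithm would alternate reward updates and RL here.
        pass

quick_run_alg(DummyIRLAlgorithm)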
def test_maze_indices():
    env = make_wrapped_env('MazeWorld0-v0', with_model_wrapper=True)
    for i in range(10240):
        assert i == env.state_to_index(env.index_to_state(i))
Example #10
def new_f():
    # inner closure: `f` and `key` are free variables from the enclosing
    # scope of the helper this fragment was excerpted from
    return f(make_wrapped_env(key, with_feature_wrapper=True))
Example #11
    def __init__(self, env_id: str, expert_trajs_path: str,
                 irl_alg_factory: Callable[[gym.Env, List[Dict[str, list]]],
                                           BaseIRLAlgorithm],
                 metrics: List[BaseMetric], rl_config: dict, irl_config: dict,
                 run_config: dict):
        """

        Parameters
        ----------
        env_id: str
            The environment id of a gym environment.
        expert_trajs_path: str
            A path to the folder where expert trajectories are stored. The file with
            expert trajectories must be expert_trajs_path/trajs.data.
        irl_alg_factory: Callable[[gym.Env, List[Dict[str, list]]], BaseIRLAlgorithm]
            A factory function which takes a gym environment and expert trajectories and
            returns an instance of a BaseIRLAlgorithm subclass.
        metrics: List[BaseMetric]
            The metrics to be evaluated after running the IRL algorithm.
        rl_config: dict
            Configuration passed to the RL algorithm.
        irl_config: dict
            Configuration passed to the IRL algorithm.
        run_config: dict
            A dictionary containing the configuration of the run. Required fields are:
            'reward_function': subclass of BaseRewardFunction, e.g. FeatureBasedRewardFunction
            (passed as the class itself, not an instance).
            'requires_features': bool, whether the environment is wrapped in a FeatureWrapper.
            'requires_transitions': bool, whether the environment is wrapped in a model
            wrapper exposing transition dynamics.
            'no_expert_trajs': int, number of expert trajectories to be used.
            'no_irl_iterations': int, number of iterations the IRL algorithm is run for.
            'no_rl_episodes_per_irl_iteration': int, how many episodes the RL agent
            is allowed to run each iteration.
            'no_irl_episodes_per_irl_iteration': int, how many episodes can be sampled
            for the IRL algorithm each iteration.
            'no_metric_episodes_per_irl_iteration': int, how many episodes are collected
            for evaluating the metrics each iteration.
        """

        action_in_domain = env_id in ENV_IDS_ACTION_IN_DOMAIN
        next_state_in_domain = env_id in ENV_IDS_NEXT_STATE_IN_DOMAIN

        def reward_function_factory(env):
            return run_config['reward_function'](
                env,
                parameters='random',
                action_in_domain=action_in_domain,
                next_state_in_domain=next_state_in_domain)

        print('Making run environment.')
        self.env = make_wrapped_env(
            env_id,
            with_feature_wrapper=run_config['requires_features'],
            reward_function_factory=reward_function_factory,
            with_model_wrapper=run_config['requires_transitions'])

        # load expert trajs:
        print('Loading expert demonstrations from ' + str(expert_trajs_path))
        self.expert_trajs = load_stored_trajs(expert_trajs_path)
        print('Loaded expert demonstrations.')
        # use only specified number of expert trajs
        assert len(self.expert_trajs) >= run_config['no_expert_trajs']
        self.expert_trajs = self.expert_trajs[:run_config['no_expert_trajs']]
        self.irl_alg_factory = irl_alg_factory
        # Metrics are only passed as classes and need to be instantiated
        instantiated_metrics = []
        # collect all information relevant for certain metric __init__s:
        metric_input = {
            'env':
            self.env,
            'expert_trajs':
            self.expert_trajs,
            'true_reward':
            truth.make_true_reward(env_id),
            'no_trajs_for_metrics':
            run_config['no_metric_episodes_per_irl_iteration']
        }
        # instantiate metrics:
        print('Instantiate metrics.')
        for metric in metrics:
            instantiated_metrics.append(metric(metric_input))
        self.metrics = instantiated_metrics

        self.rl_config = rl_config
        self.irl_config = irl_config

        self.run_config = run_config
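A run_config covering every field the docstring lists (and that the constructor body reads) might look like the sketch below; the concrete values are placeholders, and FeatureBasedRewardFunction is assumed to be importable as in the import sketch at the top of the page.

# Sketch of a run_config with all the fields described above.
# Concrete values are placeholders, not recommendations.
run_config = {
    'reward_function': FeatureBasedRewardFunction,   # the class, not an instance
    'requires_features': True,
    'requires_transitions': True,
    'no_expert_trajs': 100,
    'no_irl_iterations': 50,
    'no_rl_episodes_per_irl_iteration': 1000,
    'no_irl_episodes_per_irl_iteration': 1000,
    'no_metric_episodes_per_irl_iteration': 100,
}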
Example #12
def test_value_iteration():
    # gamma = 1.0
    env = make_wrapped_env('FrozenLake-v0', with_model_wrapper=True)
    agent = ValueIteration(env, {'gamma': 1.0})
    agent.train(10)
    state_values = agent.state_values
    assert isinstance(state_values, np.ndarray)
    assert state_values.shape == (17, )
    # argmax should be state just before frisbee
    # (15 is final state, 16 is absorbing state)
    assert np.argmax(state_values) == 14
    assert state_values[14] > 0.93 and state_values[14] < 0.95
    assert state_values[15] == 0

    # gamma = 0.9
    env = make_wrapped_env('FrozenLake-v0', with_model_wrapper=True)
    agent = ValueIteration(env, {'gamma': 0.9})
    agent.train(10)
    state_values = agent.state_values
    assert isinstance(state_values, np.ndarray)
    assert state_values.shape == (17, )
    # argmax should be state just before frisbee
    # (15 is final state, 16 is absorbing state)
    assert np.argmax(state_values) == 14
    assert state_values[14] > 0.63 and state_values[14] < 0.65
    # holes and frisbee should have zero value:
    for i in [5, 7, 11, 12, 15]:
        assert state_values[i] == 0

    # check some q values:
    # go right in second to last state
    assert np.argmax(agent.q_values[14, :]) == 1
    assert np.min(agent.q_values) == 0
    assert np.max(agent.q_values) <= 1

    # check policy:
    for i in range(16):
        assert np.isclose(np.sum(agent.policy(i)), 1.)
        assert np.min(agent.policy(i)) >= 0.
        assert np.argmax(agent.q_values[i, :]) == np.argmax(agent.policy(i))

    # check softmax policy
    old_state_values = agent.state_values
    old_q_values = agent.q_values
    agent = ValueIteration(env, {'gamma': 0.9, 'temperature': 0.1})
    agent.train(10)
    assert np.all(agent.state_values <= old_state_values)
    # at least initial state should now have lower value:
    assert agent.state_values[0] < old_state_values[0]
    assert np.all(agent.q_values <= old_q_values)
    # check policy:
    for i in range(16):
        assert np.isclose(np.sum(agent.policy(i)), 1.)
        assert np.min(agent.policy(i)) >= 0.
        assert np.argmax(agent.q_values[i, :]) == np.argmax(agent.policy(i))
        # ordering of probabilities should stay the same with softmax
        assert np.all(
            np.argsort(old_q_values[i, :]) == np.argsort(agent.policy(i)))

    # test policy array:
    policy_array = agent.policy_array()
    assert policy_array.shape == (17, 4)
    for i in range(16):
        assert np.all(agent.policy(i) == policy_array[i, :])

    # check if true reward isn't leaked:
    def reward_function_factory(env):
        return FeatureBasedRewardFunction(env, np.zeros(16))

    env = make_wrapped_env('FrozenLake-v0',
                           with_feature_wrapper=True,
                           reward_function_factory=reward_function_factory,
                           with_model_wrapper=True)
    agent = ValueIteration(env, {})
    agent.train(10)
    # with an all-zero reward function every state value should be zero,
    # i.e. the true environment reward is not leaked to the agent
    assert np.all(agent.state_values == 0)
Example #13
def test_random_featb_function():
    env = make_wrapped_env('FrozenLake-v0', with_feature_wrapper=True)
    rf = FeatureBasedRewardFunction(env, 'random')