import numpy as np

# The remaining names (make_wrapped_env, unwrap_env, FeatureWrapper, ...) are
# provided by the surrounding package; their import paths are omitted here.


def test_frozen_features():
    env = make_wrapped_env('FrozenLake-v0', with_feature_wrapper=True)
    feature_wrapper = unwrap_env(env, FeatureWrapper)
    d = feature_wrapper.feature_dimensionality()
    ranges = feature_wrapper.feature_range()
    print(ranges)
    for i in range(16):
        feature = feature_wrapper.features(None, None, i)
        assert feature.shape == d
        assert np.all(feature >= ranges[0])
        assert np.all(feature <= ranges[1])
def test_frozenlake_fully_wrapped_transitions():
    env = make_wrapped_env(
        'FrozenLake-v0', with_feature_wrapper=True, with_model_wrapper=True)
    transitions = env.get_transition_array()
    assert isinstance(transitions, np.ndarray)
    assert transitions.shape == (17, 4, 17)
    # each (state, action) slice must be a probability distribution;
    # use np.isclose rather than == to tolerate floating-point error:
    for s in range(transitions.shape[0]):
        for a in range(transitions.shape[1]):
            assert np.isclose(transitions[s, a].sum(), 1.0)
def test_maze1_features():
    env = make_wrapped_env('MazeWorld1-v0', with_feature_wrapper=True)
    maze_env = unwrap_env(env, MazeWorld)
    feature_wrapper = unwrap_env(env, FeatureWrapper)
    d = feature_wrapper.feature_dimensionality()
    ranges = feature_wrapper.feature_range()
    print(ranges)
    # check a subsample of the 10240 states (every 13th):
    for i in range(0, 10240, 13):
        for a in range(10):
            feature = feature_wrapper.features(
                maze_env.index_to_state(i), a, None)
            assert feature.shape == d
            assert np.all(feature >= ranges[0])
            assert np.all(feature <= ranges[1])
def test_tabular_function():
    def reward_function_factory(env):
        # reward of 1 in the last of the 64 states, 0 elsewhere:
        params = np.zeros(64)
        params[-1] = 1.
        return TabularRewardFunction(env, params)

    env = make_wrapped_env(
        'FrozenLake8x8-v0',
        reward_function_factory=reward_function_factory,
        with_model_wrapper=True)
    agent = ValueIteration(env)
    agent.train(1)
    trajs = collect_trajs(env, agent, 10)
    # with the true parameters above, wrapped and true rewards must agree:
    for traj in trajs:
        for i in range(len(traj['rewards'])):
            assert np.isclose(traj['rewards'][i], traj['true_rewards'][i])
def case_make_wrapped(env_id):
    # no wrappers requested:
    env = make_wrapped_env(env_id)
    assert not is_unwrappable_to(env, FeatureWrapper)
    assert not is_unwrappable_to(env, RewardWrapper)
    assert not is_unwrappable_to(env, BaseWorldModelWrapper)

    # feature wrapper only:
    env = make_wrapped_env(env_id, with_feature_wrapper=True)
    assert is_unwrappable_to(env, FeatureWrapper)
    assert not is_unwrappable_to(env, RewardWrapper)
    assert not is_unwrappable_to(env, BaseWorldModelWrapper)

    # feature and model wrappers:
    env = make_wrapped_env(
        env_id, with_feature_wrapper=True, with_model_wrapper=True)
    assert is_unwrappable_to(env, FeatureWrapper)
    assert not is_unwrappable_to(env, RewardWrapper)
    assert is_unwrappable_to(env, BaseWorldModelWrapper)

    def rew_fun_fact(env):
        return FeatureBasedRewardFunction(env, 'random')

    # feature and reward wrappers:
    env = make_wrapped_env(
        env_id,
        with_feature_wrapper=True,
        reward_function_factory=rew_fun_fact,
        with_model_wrapper=False)
    assert is_unwrappable_to(env, FeatureWrapper)
    assert is_unwrappable_to(env, RewardWrapper)
    assert not is_unwrappable_to(env, BaseWorldModelWrapper)

    # all three wrappers:
    env = make_wrapped_env(
        env_id,
        with_feature_wrapper=True,
        reward_function_factory=rew_fun_fact,
        with_model_wrapper=True)
    assert is_unwrappable_to(env, FeatureWrapper)
    assert is_unwrappable_to(env, RewardWrapper)
    assert is_unwrappable_to(env, BaseWorldModelWrapper)
def test_update_parameters_frozen_feature():
    def rew_fun_factory(env):
        return FeatureBasedRewardFunction(env, 'random')

    env = make_wrapped_env(
        'FrozenLake-v0',
        with_feature_wrapper=True,
        reward_function_factory=rew_fun_factory)
    reward_wrapper = unwrap_env(env, RewardWrapper)
    params = np.copy(reward_wrapper.reward_function.parameters)
    domain = reward_wrapper.reward_function.domain()
    rews = reward_wrapper.reward_function.reward(domain)

    # doubling the parameters of a linear reward function doubles the rewards:
    reward_wrapper.update_reward_parameters(2 * params)
    rews2 = reward_wrapper.reward_function.reward(domain)
    assert np.all(np.isclose(2 * rews, rews2))

    # zero parameters give zero reward everywhere:
    reward_wrapper.update_reward_parameters(np.zeros_like(params))
    rews3 = reward_wrapper.reward_function.reward(domain)
    assert np.all(np.isclose(rews3, np.zeros_like(rews3)))
def test_get_reward_matrix_wrapped_feature():
    # random reward over the 64 states plus one absorbing state with reward 0:
    true_rews = np.random.randn(65)
    true_rews[-1] = 0

    def reward_function_factory(env):
        return FeatureBasedRewardFunction(env, true_rews[:-1])

    env = make_wrapped_env(
        'FrozenLake8x8-v0',
        with_feature_wrapper=True,
        reward_function_factory=reward_function_factory,
        with_model_wrapper=True)
    transitions = env.get_transition_array()
    rewards = env.get_reward_array()
    assert rewards.shape == (65, 4)
    assert transitions.shape == (65, 4, 65)
    # rewards[s, a] must equal the expected next-state reward under the model:
    for s in range(65):
        for a in range(4):
            assert np.isclose(rewards[s, a],
                              transitions[s, a, :].dot(true_rews))
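# The per-entry check above is equivalent to a single tensor contraction:
# viewing transitions as T[s, a, s'] and true_rews as r[s'], the expected
# immediate reward is R = T . r. A minimal vectorized sketch (not part of
# the original suite; the helper name is hypothetical):
def _sketch_vectorized_reward_check(transitions, rewards, true_rews):
    # np.dot contracts the last axis of transitions with true_rews,
    # yielding the (65, 4) expected-reward matrix in one step:
    expected = transitions.dot(true_rews)
    assert np.allclose(rewards, expected)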
def quick_run_alg(alg_class, config=None):
    # avoid a mutable default argument:
    config = config if config is not None else {}

    def reward_function_factory(env):
        return FeatureBasedRewardFunction(env, 'random')

    env = make_wrapped_env(
        'FrozenLake-v0',
        with_feature_wrapper=True,
        reward_function_factory=reward_function_factory,
        with_model_wrapper=True)

    def rl_alg_factory(env):
        return ValueIteration(env, {})

    def one_hot(index, length=16):
        # one-hot feature vector, as produced by FrozenLake's feature wrapper:
        vector = np.zeros(length)
        vector[index] = 1.
        return vector

    # two hard-coded expert trajectories; each feature vector is the one-hot
    # encoding of the state reached by the corresponding transition:
    expert_trajs = [{
        'states': [
            0, 0, 4, 0, 4, 8, 4, 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 4, 4, 8, 9,
            8, 8, 9, 10, 14, 15
        ],
        'actions': [
            0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1,
            3, 3, 1, 0, 1
        ],
        'rewards': [
            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
            1.0
        ],
        'true_rewards': [],
        'features': [
            one_hot(s) for s in [
                0, 4, 0, 4, 8, 4, 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 4, 4, 8, 9,
                8, 8, 9, 10, 14, 15
            ]
        ]
    }, {
        'states': [0, 4, 8, 8, 9, 10, 6, 2, 6, 10, 14, 15],
        'actions': [0, 0, 3, 3, 1, 0, 2, 0, 2, 0, 1],
        'rewards': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
        'true_rewards': [],
        'features':
        [one_hot(s) for s in [4, 8, 8, 9, 10, 6, 2, 6, 10, 14, 15]]
    }]

    metrics = []
    alg = alg_class(env, expert_trajs, rl_alg_factory, metrics, config)
    # train(no_irl_iterations, no_rl_episodes_per_irl_iteration,
    #       no_irl_episodes_per_irl_iteration)
    alg.train(2, 2, 2)
def test_maze_indices():
    env = make_wrapped_env('MazeWorld0-v0', with_model_wrapper=True)
    for i in range(10240):
        assert i == env.state_to_index(env.index_to_state(i))
def new_f():
    # `f` and `key` are taken from the enclosing scope:
    return f(make_wrapped_env(key, with_feature_wrapper=True))
def __init__(self, env_id: str, expert_trajs_path: str,
             irl_alg_factory: Callable[[gym.Env, List[Dict[str, list]]],
                                       BaseIRLAlgorithm],
             metrics: List[BaseMetric], rl_config: dict, irl_config: dict,
             run_config: dict):
    """
    Parameters
    ----------
    env_id: str
        The environment id of a gym environment.
    expert_trajs_path: str
        A path to the folder where expert trajectories are stored. The file
        with expert trajectories must be expert_trajs_path/trajs.data.
    irl_alg_factory: Callable[[gym.Env, List[Dict[str, list]]], BaseIRLAlgorithm]
        A factory function which takes a gym environment and expert
        trajectories and returns a subclass of BaseIRLAlgorithm.
    metrics: List[BaseMetric]
        The metrics to be evaluated after running the IRL algorithm.
    rl_config: dict
        A dictionary containing the configuration of the RL algorithm.
    irl_config: dict
        A dictionary containing the configuration of the IRL algorithm.
    run_config: dict
        A dictionary containing the configuration of the run. Required
        fields are:
        'reward_function': subclass of BaseRewardFunction, e.g.
            FeatureBasedRewardFunction.
        'requires_features': bool, whether the environment needs a
            feature wrapper.
        'requires_transitions': bool, whether the environment needs a
            model wrapper.
        'no_expert_trajs': int, number of expert trajectories to be used.
        'no_irl_iterations': int, number of iterations the IRL algorithm
            is run for.
        'no_rl_episodes_per_irl_iteration': int, how many episodes the RL
            agent is allowed to run each iteration.
        'no_irl_episodes_per_irl_iteration': int, how many episodes can be
            sampled for the IRL algorithm each iteration.
        'no_metric_episodes_per_irl_iteration': int, how many episodes are
            sampled for evaluating the metrics each iteration.
    """
    action_in_domain = env_id in ENV_IDS_ACTION_IN_DOMAIN
    next_state_in_domain = env_id in ENV_IDS_NEXT_STATE_IN_DOMAIN

    def reward_function_factory(env):
        return run_config['reward_function'](
            env,
            parameters='random',
            action_in_domain=action_in_domain,
            next_state_in_domain=next_state_in_domain)

    print('Making run environment.')
    self.env = make_wrapped_env(
        env_id,
        with_feature_wrapper=run_config['requires_features'],
        reward_function_factory=reward_function_factory,
        with_model_wrapper=run_config['requires_transitions'])

    # load expert trajectories:
    print('Loading expert demonstrations from ' + str(expert_trajs_path))
    self.expert_trajs = load_stored_trajs(expert_trajs_path)
    print('Loaded expert demonstrations.')

    # use only the specified number of expert trajectories:
    assert len(self.expert_trajs) >= run_config['no_expert_trajs']
    self.expert_trajs = self.expert_trajs[:run_config['no_expert_trajs']]

    self.irl_alg_factory = irl_alg_factory

    # metrics are only passed as classes and need to be instantiated;
    # collect all information relevant for the metric __init__s:
    metric_input = {
        'env': self.env,
        'expert_trajs': self.expert_trajs,
        'true_reward': truth.make_true_reward(env_id),
        'no_trajs_for_metrics':
        run_config['no_metric_episodes_per_irl_iteration']
    }
    # instantiate metrics:
    print('Instantiating metrics.')
    instantiated_metrics = []
    for metric in metrics:
        instantiated_metrics.append(metric(metric_input))
    self.metrics = instantiated_metrics

    self.rl_config = rl_config
    self.irl_config = irl_config
    self.run_config = run_config
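# A hedged usage sketch of the constructor above. The enclosing class name
# ('Run'), the path, and the IRL algorithm are assumptions for illustration,
# not taken from this file:
#
#     def irl_alg_factory(env, expert_trajs):
#         return SomeIRLAlgorithm(env, expert_trajs, ...)  # hypothetical
#
#     run = Run('FrozenLake-v0', 'data/frozen_lake/expert', irl_alg_factory,
#               [], rl_config={}, irl_config={},
#               run_config={
#                   'reward_function': FeatureBasedRewardFunction,
#                   'requires_features': True,
#                   'requires_transitions': True,
#                   'no_expert_trajs': 10,
#                   'no_irl_iterations': 2,
#                   'no_rl_episodes_per_irl_iteration': 2,
#                   'no_irl_episodes_per_irl_iteration': 2,
#                   'no_metric_episodes_per_irl_iteration': 5,
#               })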
def test_value_iteration():
    # gamma = 1.0
    env = make_wrapped_env('FrozenLake-v0', with_model_wrapper=True)
    agent = ValueIteration(env, {'gamma': 1.0})
    agent.train(10)
    state_values = agent.state_values
    assert isinstance(state_values, np.ndarray)
    assert state_values.shape == (17, )
    # argmax should be the state just before the frisbee
    # (15 is the final state, 16 the absorbing state):
    assert np.argmax(state_values) == 14
    assert 0.93 < state_values[14] < 0.95
    assert state_values[15] == 0

    # gamma = 0.9
    env = make_wrapped_env('FrozenLake-v0', with_model_wrapper=True)
    agent = ValueIteration(env, {'gamma': 0.9})
    agent.train(10)
    state_values = agent.state_values
    assert isinstance(state_values, np.ndarray)
    assert state_values.shape == (17, )
    assert np.argmax(state_values) == 14
    assert 0.63 < state_values[14] < 0.65
    # holes and the frisbee should have zero value:
    for i in [5, 7, 11, 12, 15]:
        assert state_values[i] == 0

    # check some q-values: go right in the second-to-last state
    assert np.argmax(agent.q_values[14, :]) == 1
    assert np.min(agent.q_values) == 0
    assert np.max(agent.q_values) <= 1

    # check policy:
    for i in range(16):
        assert np.isclose(np.sum(agent.policy(i)), 1.)
        assert np.min(agent.policy(i)) >= 0.
        assert np.argmax(agent.q_values[i, :]) == np.argmax(agent.policy(i))

    # check softmax policy:
    old_state_values = agent.state_values
    old_q_values = agent.q_values
    agent = ValueIteration(env, {'gamma': 0.9, 'temperature': 0.1})
    agent.train(10)
    assert np.all(agent.state_values <= old_state_values)
    # at least the initial state should now have a lower value:
    assert agent.state_values[0] < old_state_values[0]
    assert np.all(agent.q_values <= old_q_values)
    for i in range(16):
        assert np.isclose(np.sum(agent.policy(i)), 1.)
        assert np.min(agent.policy(i)) >= 0.
        assert np.argmax(agent.q_values[i, :]) == np.argmax(agent.policy(i))
        # the ordering of probabilities should stay the same with softmax:
        assert np.all(
            np.argsort(old_q_values[i, :]) == np.argsort(agent.policy(i)))

    # test policy array:
    policy_array = agent.policy_array()
    assert policy_array.shape == (17, 4)
    for i in range(16):
        assert np.all(agent.policy(i) == policy_array[i, :])

    # check that the true reward isn't leaked: with an all-zero reward
    # function, every state value must be zero.
    def reward_function_factory(env):
        return FeatureBasedRewardFunction(env, np.zeros(16))

    env = make_wrapped_env(
        'FrozenLake-v0',
        with_feature_wrapper=True,
        reward_function_factory=reward_function_factory,
        with_model_wrapper=True)
    agent = ValueIteration(env, {})
    agent.train(10)
    assert np.all(agent.state_values == 0)
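# The temperature checks above rely on a Boltzmann (softmax) policy over
# q-values: lower temperature concentrates probability mass on the greedy
# action while preserving the ordering of actions, which is exactly what the
# argsort assertion verifies. A minimal sketch of the assumed form (the
# actual ValueIteration internals may differ):
def _sketch_softmax_policy(q_row, temperature):
    z = (q_row - np.max(q_row)) / temperature  # numerically stabilized logits
    p = np.exp(z)
    return p / p.sum()  # non-negative, sums to 1, order-preserving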
def test_random_featb_function():
    # smoke test: constructing a feature-based reward function with random
    # parameters on a feature-wrapped env should not raise.
    env = make_wrapped_env('FrozenLake-v0', with_feature_wrapper=True)
    FeatureBasedRewardFunction(env, 'random')