def __init__(self, env: gym.Env, config: Union[None, dict] = None):
    """

    Parameters
    ----------
    env: gym.Env
        An environment that can be unwrapped to a BaseWorldModelWrapper.
    config: dict
        Configuration of hyperparameters.
    """
    assert is_unwrappable_to(env, BaseWorldModelWrapper)
    super(ValueIteration, self).__init__(env, config)
    self.model_wrapper = unwrap_env(env, BaseWorldModelWrapper)
    # +1 for absorbing state
    self.no_states = self.model_wrapper.n_states() + 1
    self.no_actions = env.action_space.n
    self.transitions = self.model_wrapper.get_transition_array()
    # will be filled in beginning of training:
    self.rewards = None
    # will be filled during training:
    self.state_values = None
    self.q_values = None
    # whenever self._policy is None, it will be re-calculated
    # based on current self.q_values when calling policy().
    self._policy = None
def test_make_maze1():
    env = make_env('MazeWorld1-v0')
    assert is_unwrappable_to(env, MazeWorld)
    walls, rews = get_maps(MAP1)
    maze_env = unwrap_env(env, MazeWorld)
    assert np.all(maze_env.map_walls == walls)
    assert np.all(maze_env.map_rewards == rews)
def __init__(self,
             env: gym.Env,
             expert_trajs: List[Dict[str, list]],
             rl_alg_factory: Callable[[gym.Env], BaseRLAlgorithm],
             metrics: List[BaseMetric] = [],
             config: Union[dict, None] = None):
    """

    Parameters
    ----------
    env: gym.Env
        The gym environment to be trained on.
        Needs to be wrapped in a RewardWrapper to not leak the true reward function.
    expert_trajs: List[dict]
        A list of trajectories.
        Each trajectory is a dictionary with keys
        ['states', 'actions', 'rewards', 'true_rewards', 'features'].
        The values of each dictionary are lists.
        See :func:`irl_benchmark.irl.collect.collect_trajs`.
    rl_alg_factory: Callable[[gym.Env], BaseRLAlgorithm]
        A function which returns a new RL algorithm when called.
    metrics: List[BaseMetric]
        A list of metrics to evaluate; results are collected in self.metric_results.
    config: dict
        A dictionary containing algorithm-specific parameters.
    """
    assert is_unwrappable_to(env, RewardWrapper)
    self.env = env
    self.expert_trajs = expert_trajs
    self.rl_alg_factory = rl_alg_factory
    self.metrics = metrics
    # one independent result list per metric
    # (note: [[]] * len(metrics) would replicate the same list object):
    self.metric_results = [[] for _ in metrics]
    self.config = preprocess_config(self, IRL_CONFIG_DOMAINS, config)
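
# Hedged usage sketch (an assumption-laden illustration, not part of the class):
# a concrete subclass such as MaxEntIRL is typically given an env whose reward
# is replaced by a learnable, randomly initialized reward function, plus a
# factory that spawns fresh RL learners on demand. The expert trajectory
# placeholder below stands in for trajectories collected via
# irl_benchmark.irl.collect.collect_trajs; config=None assumes defaults are
# filled in by preprocess_config. All example_* names are hypothetical.


def example_rl_alg_factory(env):
    # a fresh planning agent per call; assumes ValueIteration's default config
    return ValueIteration(env)


example_env = make_wrapped_env(
    'FrozenLake-v0',
    with_feature_wrapper=True,
    reward_function_factory=lambda e: FeatureBasedRewardFunction(e, 'random'),
    with_model_wrapper=True)
example_expert_trajs = []  # placeholder: use collect_trajs in practice
example_irl_alg = MaxEntIRL(example_env, example_expert_trajs,
                            example_rl_alg_factory, metrics=[], config=None)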
def __init__(self, env: gym.Env, expert_trajs: List[Dict[str, list]],
             rl_alg_factory: Callable[[gym.Env], BaseRLAlgorithm],
             metrics: List[BaseMetric], config: dict):
    """See :class:`irl_benchmark.irl.algorithms.base_algorithm.BaseIRLAlgorithm`."""
    assert is_unwrappable_to(env, DiscreteEnv)
    assert is_unwrappable_to(env, FeatureWrapper)
    super(MaxEntIRL, self).__init__(env, expert_trajs, rl_alg_factory,
                                    metrics, config)
    # get transition matrix (with absorbing state)
    self.transition_matrix = get_transition_matrix(self.env)
    self.n_states, self.n_actions, _ = self.transition_matrix.shape
    # get map of features for all states:
    feature_wrapper = unwrap_env(env, FeatureWrapper)
    self.feat_map = feature_wrapper.feature_array()
def train(self, no_episodes: int):
    """Train the agent.

    Parameters
    ----------
    no_episodes: int
        Not used in this algorithm (since it assumes known transition dynamics).
    """
    assert is_unwrappable_to(
        self.env, gym.envs.toy_text.discrete.DiscreteEnv) or is_unwrappable_to(
            self.env, MazeWorld)

    # extract reward function from env (using wrapped reward function if available):
    self.rewards = self.model_wrapper.get_reward_array()

    # initialize state values:
    state_values = np.zeros([self.no_states])

    while True:  # stops when state values converge
        # remember old values for error computation
        old_state_values = state_values.copy()
        # calculate Q-values:
        q_values = self.rewards + \
            self.config['gamma'] * self.transitions.dot(state_values)
        # calculate state values either with maximum or mellow maximum:
        if self.config['temperature'] is None:
            # using default maximum operator:
            state_values = self._argmax_state_values(q_values)
        else:
            # using softmax:
            state_values = self._softmax_state_values(q_values)
        # stopping condition: check if state values converged
        # (almost no change since last iteration):
        if np.allclose(
                state_values, old_state_values, atol=self.config['epsilon']):
            break

    # persist learned state values and Q-values:
    self.state_values = state_values
    self.q_values = q_values
    # flag to tell other methods that policy needs to be updated based on new values:
    self._policy = None
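
# Hedged usage sketch for the planner above, assuming that make_wrapped_env
# with with_model_wrapper=True wraps FrozenLake-v0 in a DiscreteEnvModelWrapper
# and that the default config (config=None) provides 'gamma', 'temperature'
# and 'epsilon'. The policy() call mirrors the comment in __init__ and is
# likewise an assumption about the public interface; example_* names are
# hypothetical.
example_env = make_wrapped_env('FrozenLake-v0', with_model_wrapper=True)
example_agent = ValueIteration(example_env)
example_agent.train(no_episodes=0)  # episode count is ignored, dynamics are known
example_policy = example_agent.policy()  # derived lazily from the learned q_values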
def test_is_unwrappable_to():
    assert is_unwrappable_to(make_env('FrozenLake-v0'), TimeLimit)
    assert is_unwrappable_to(make_env('FrozenLake-v0'), DiscreteEnv)
    assert is_unwrappable_to(
        feature_wrapper.make('FrozenLake-v0'), FrozenLakeFeatureWrapper)
    assert is_unwrappable_to(
        feature_wrapper.make('FrozenLake8x8-v0'), FrozenLakeFeatureWrapper)
    assert is_unwrappable_to(
        feature_wrapper.make('FrozenLake-v0'), feature_wrapper.FeatureWrapper)
    env = feature_wrapper.make('FrozenLake-v0')
    reward_function = FeatureBasedRewardFunction(env, 'random')
    env = RewardWrapper(env, reward_function)
    assert is_unwrappable_to(env, RewardWrapper)
    assert is_unwrappable_to(env, feature_wrapper.FeatureWrapper)
    assert is_unwrappable_to(env, DiscreteEnv)
    assert is_unwrappable_to(env, gym.Env)
def feature_count(env, trajs: List[Dict[str, list]],
                  gamma: float) -> np.ndarray:
    """Return empirical discounted feature counts of input trajectories.

    Parameters
    ----------
    env: gym.Env
        A gym environment, wrapped in a feature wrapper.
    trajs: List[Dict[str, list]]
        A list of trajectories.
        Each trajectory is a dictionary with keys
        ['states', 'actions', 'rewards', 'true_rewards', 'features'].
        The values of each dictionary are lists.
        See :func:`irl_benchmark.irl.collect.collect_trajs`.
    gamma: float
        The discount factor. Must be in range [0., 1.].

    Returns
    -------
    np.ndarray
        A numpy array containing discounted feature counts. The shape
        is the same as the trajectories' feature shapes. One scalar
        feature count per feature.
    """
    assert is_unwrappable_to(env, FeatureWrapper)
    # Initialize feature count sum to zeros of correct shape:
    feature_dim = unwrap_env(env, FeatureWrapper).feature_dimensionality()
    # feature_dim is a 1-tuple,
    # extract the feature dimensionality as integer:
    assert len(feature_dim) == 1
    feature_dim = feature_dim[0]
    feature_count_sum = np.zeros(feature_dim)
    for traj in trajs:
        assert traj['features']  # empty lists are False in python
        # gammas is a vector containing [gamma^0, gamma^1, ..., gamma^(l-1)]
        # where l is the length of the trajectory:
        gammas = gamma**np.arange(len(traj['features']))
        traj_feature_count = np.sum(
            gammas.reshape(-1, 1) * np.array(traj['features']).reshape(
                (-1, feature_dim)),
            axis=0)
        # add trajectory's feature count:
        feature_count_sum += traj_feature_count
    # divide feature_count_sum by number of trajectories to normalize:
    result = feature_count_sum / len(trajs)
    return result
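
# Hedged usage sketch for feature_count: a single hand-built trajectory with
# zero feature vectors, following the dict structure documented above. In
# practice trajectories come from irl_benchmark.irl.collect.collect_trajs;
# the zero features here are dummies, used only to show the expected shapes,
# and the example_* names are hypothetical.
example_env = feature_wrapper.make('FrozenLake-v0')
example_feat_dim = unwrap_env(example_env,
                              FeatureWrapper).feature_dimensionality()[0]
example_trajs = [{
    'states': [0, 1],
    'actions': [0],
    'rewards': [0.0],
    'true_rewards': [0.0],
    'features': [np.zeros(example_feat_dim), np.zeros(example_feat_dim)],
}]
example_counts = feature_count(example_env, example_trajs, gamma=0.9)
assert example_counts.shape == (example_feat_dim, )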
def get_reward_array(self):
    env = unwrap_env(self.env, DiscreteEnv)
    # adding +1 to account for absorbing state
    # (reached whenever game ended)
    n_states = env.observation_space.n + 1
    n_actions = env.action_space.n
    if is_unwrappable_to(self.env, RewardWrapper):
        # get the reward function:
        reward_wrapper = unwrap_env(self.env, RewardWrapper)
        reward_function = reward_wrapper.reward_function
    else:
        reward_function = None
    rewards = np.zeros([n_states, n_actions])
    # iterate over all "from" states:
    for state, transitions_given_state in env.P.items():
        # iterate over all actions:
        for action, outcomes in transitions_given_state.items():
            # iterate over all possible outcomes:
            for probability, next_state, reward, done in outcomes:
                if reward_function is not None:
                    if done and state == next_state:
                        # don't output reward for reaching state if game is over
                        # and already in that state.
                        reward = 0
                    else:
                        rew_input = reward_wrapper.get_reward_input_for(
                            state, action, next_state)
                        reward = reward_function.reward(rew_input)
                rewards[state, action] += reward * probability
    # reward of absorbing state is zero:
    rewards[-1, :] = 0.0
    return rewards
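
# Hedged illustration of the array built above, assuming get_reward_array is
# called on a DiscreteEnvModelWrapper (defined further below) around
# FrozenLake-v0, which has 16 states and 4 actions. Without a RewardWrapper,
# the entries are the expected true rewards per state-action pair; the
# example_* names are hypothetical.
example_env = make_env('FrozenLake-v0')
example_model = DiscreteEnvModelWrapper(example_env)
example_rewards = example_model.get_reward_array()
assert example_rewards.shape == (16 + 1, 4)  # +1 row for the absorbing state
assert np.all(example_rewards[-1, :] == 0.0)  # absorbing state yields no reward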
def __init__(self, env: gym.Env, config: dict):
    """

    Parameters
    ----------
    env: gym.Env
        A DiscreteEnv environment
    config: dict
        Configuration of hyperparameters.
    """
    assert is_unwrappable_to(env, gym.envs.toy_text.discrete.DiscreteEnv)
    super(ValueIteration, self).__init__(env, config)
    self.no_states = env.observation_space.n + 1  # + 1 for absorbing state
    self.no_actions = env.action_space.n
    self.transitions = get_transition_matrix(env)
    # will be filled in beginning of training:
    self.rewards = None
    # will be filled during training:
    self.state_values = None
    self.q_values = None
    # whenever self._policy is None, it will be re-calculated
    # based on current self.q_values when calling policy().
    self._policy = None
def case_make_wrapped(env_id):
    env = make_wrapped_env(env_id)
    assert not is_unwrappable_to(env, FeatureWrapper)
    assert not is_unwrappable_to(env, RewardWrapper)
    assert not is_unwrappable_to(env, BaseWorldModelWrapper)

    env = make_wrapped_env(env_id, with_feature_wrapper=True)
    assert is_unwrappable_to(env, FeatureWrapper)
    assert not is_unwrappable_to(env, RewardWrapper)
    assert not is_unwrappable_to(env, BaseWorldModelWrapper)

    env = make_wrapped_env(
        env_id, with_feature_wrapper=True, with_model_wrapper=True)
    assert is_unwrappable_to(env, FeatureWrapper)
    assert not is_unwrappable_to(env, RewardWrapper)
    assert is_unwrappable_to(env, BaseWorldModelWrapper)

    def rew_fun_fact(env):
        return FeatureBasedRewardFunction(env, 'random')

    env = make_wrapped_env(
        env_id,
        with_feature_wrapper=True,
        reward_function_factory=rew_fun_fact,
        with_model_wrapper=False)
    assert is_unwrappable_to(env, FeatureWrapper)
    assert is_unwrappable_to(env, RewardWrapper)
    assert not is_unwrappable_to(env, BaseWorldModelWrapper)

    env = make_wrapped_env(
        env_id,
        with_feature_wrapper=True,
        reward_function_factory=rew_fun_fact,
        with_model_wrapper=True)
    assert is_unwrappable_to(env, FeatureWrapper)
    assert is_unwrappable_to(env, RewardWrapper)
    assert is_unwrappable_to(env, BaseWorldModelWrapper)
def test_make_frozen8():
    env = make_env('FrozenLake8x8-v0')
    assert is_unwrappable_to(env, FrozenLakeEnv)
def __init__(self, env):
    assert is_unwrappable_to(env, DiscreteEnv)
    super(DiscreteEnvModelWrapper, self).__init__(env)
def _get_model_arrays(self, return_transitions=True, return_rewards=True):
    if return_rewards:
        if is_unwrappable_to(self.env, RewardWrapper):
            reward_wrapper = unwrap_env(self.env, RewardWrapper)
        else:
            reward_wrapper = None
    assert return_transitions or return_rewards

    # +1 for absorbing state:
    n_states = self.n_states() + 1
    absorbing_s = n_states - 1
    num_rewards = n_actions = self.maze_env.action_space.n
    paths = self.maze_env.paths

    if return_transitions:
        coords_trans_state = []
        coords_trans_action = []
        coords_trans_next_state = []
        trans_data = []

        def add_transition(s, a, sn, p):
            coords_trans_state.append(s)
            coords_trans_action.append(a)
            coords_trans_next_state.append(sn)
            trans_data.append(p)

    if return_rewards:
        rewards = np.zeros((n_states, n_actions))

    for s in tqdm(range(n_states - 1)):
        for a in range(n_actions):
            state = self.index_to_state(s)
            if return_rewards and reward_wrapper is not None:
                rew_input = reward_wrapper.get_reward_input_for(
                    state, a, None)
                wrapped_reward = reward_wrapper.reward_function.reward(
                    rew_input).item()
            if np.sum(state[num_rewards:]) == 0:
                if return_transitions:
                    add_transition(s, a, absorbing_s, 1.)
                if return_rewards:
                    if reward_wrapper is None:
                        rewards[s, a] = 0
                    else:
                        rewards[s, a] = wrapped_reward
                continue
            pos_index = int(np.where(state[:num_rewards] > 0)[0][0])
            path = paths[pos_index][a]
            if len(path) == 1 or pos_index == a:
                assert pos_index == a
                if return_transitions:
                    add_transition(s, a, s, 1. - RANDOM_QUIT_CHANCE)
                    add_transition(s, a, absorbing_s, RANDOM_QUIT_CHANCE)
                if return_rewards:
                    if reward_wrapper is None:
                        rewards[s, a] = REWARD_MOVE
                        if state[num_rewards + a] != 0:
                            rews_where = self.maze_env.rews_where
                            rewards[s, a] += float(
                                self.maze_env.map_rewards[
                                    rews_where[0][a],
                                    rews_where[1][a]]) * (1 - RANDOM_QUIT_CHANCE)
                    else:
                        rewards[s, a] = wrapped_reward
                continue
            success_prob = (1 - RANDOM_QUIT_CHANCE)**(len(path) - 1)
            if return_transitions:
                new_state = get_next_state(state, a, num_rewards)
                new_s = self.state_to_index(new_state)
                add_transition(s, a, new_s, success_prob)
                add_transition(s, a, absorbing_s, 1. - success_prob)
            if return_rewards:
                if reward_wrapper is None:
                    if state[num_rewards + a] == 0:
                        # if reward is already collected at this field:
                        rew_value = 0
                    else:
                        rews_where = self.maze_env.rews_where
                        rew_value = float(
                            self.maze_env.map_rewards[rews_where[0][a],
                                                      rews_where[1][a]])
                    possible_distances = np.arange(1, len(path))
                    prob_getting_to_distance = (
                        1 - RANDOM_QUIT_CHANCE)**possible_distances
                    prob_stopping_at_distance = np.ones_like(
                        possible_distances, dtype=np.float32)
                    prob_stopping_at_distance[:-1] = RANDOM_QUIT_CHANCE
                    expected_walking_distance = np.sum(
                        possible_distances * prob_getting_to_distance *
                        prob_stopping_at_distance)
                    weighted_reward = expected_walking_distance * REWARD_MOVE \
                        + success_prob * rew_value
                    rewards[s, a] = weighted_reward
                else:
                    rewards[s, a] = wrapped_reward

    for a in range(n_actions):
        if return_transitions:
            add_transition(absorbing_s, a, absorbing_s, 1.)
        if return_rewards:
            rewards[absorbing_s, a] = 0

    if return_transitions:
        coords = np.array([
            coords_trans_state, coords_trans_action, coords_trans_next_state
        ])
        transitions = sparse.COO(coords, trans_data)

    if return_transitions:
        if return_rewards:
            return transitions, rewards
        return transitions
    return rewards
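
# Hedged sketch of how these model arrays are presumably consumed: the public
# get_transition_array / get_reward_array methods used by ValueIteration above
# are assumed to delegate to _get_model_arrays with the matching flags.
# MazeWorld1-v0 is the env id used in the tests of this repo; the example_*
# names are hypothetical.
example_env = make_env('MazeWorld1-v0')
example_model = MazeModelWrapper(example_env)
example_transitions = example_model.get_transition_array()
example_rewards = example_model.get_reward_array()
# example_transitions: sparse.COO of shape (n_states, n_actions, n_states),
# example_rewards: dense (n_states, n_actions) array; both include the
# absorbing state as the last state index.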
def __init__(self, env):
    assert is_unwrappable_to(env, MazeWorld)
    super(MazeModelWrapper, self).__init__(env)
    self.maze_env = unwrap_env(self.env, MazeWorld)