def test_maze1_features():
    env = make_wrapped_env('MazeWorld1-v0', with_feature_wrapper=True)
    maze_env = unwrap_env(env, MazeWorld)
    feature_wrapper = unwrap_env(env, FeatureWrapper)
    d = feature_wrapper.feature_dimensionality()
    ranges = feature_wrapper.feature_range()
    print(ranges)
    # sample every 13th state index (10 * 2**10 == 10240 states in total):
    for i in range(0, 10240, 13):
        for a in range(10):
            feature = feature_wrapper.features(maze_env.index_to_state(i), a,
                                               None)
            assert feature.shape == d
            assert np.all(feature >= ranges[0])
            assert np.all(feature <= ranges[1])
def __init__(self, env: gym.Env, expert_trajs: List[Dict[str, list]],
             rl_alg_factory: Callable[[gym.Env], BaseRLAlgorithm],
             metrics: List[BaseMetric], config: dict):
    """See :class:`irl_benchmark.irl.algorithms.base_algorithm.BaseIRLAlgorithm`."""
    super(MaxEntIRL, self).__init__(env, expert_trajs, rl_alg_factory,
                                    metrics, config)
    # get transition matrix (with absorbing state)
    self.transition_matrix = unwrap_env(
        env, BaseWorldModelWrapper).get_transition_array()
    self.n_states, self.n_actions, _ = self.transition_matrix.shape
    # get map of features for all states:
    feature_wrapper = unwrap_env(env, FeatureWrapper)
    self.feat_map = feature_wrapper.feature_array()
def get_transition_array(self):
    env = unwrap_env(self.env, DiscreteEnv)
    # adding +1 to account for absorbing state
    # (reached whenever game ended)
    n_states = env.observation_space.n + 1
    n_actions = env.action_space.n
    transitions = np.zeros([n_states, n_actions, n_states])
    # iterate over all "from" states:
    for state, transitions_given_state in env.P.items():
        # iterate over all actions:
        for action, outcomes in transitions_given_state.items():
            # iterate over all possible outcomes:
            for probability, next_state, _, done in outcomes:
                # add transition probability T(s, a, s')
                transitions[state, action, next_state] += probability
                if done:
                    # outcome was marked as ending the game.
                    # if game is done and state == next_state,
                    # map to absorbing state instead:
                    if state == next_state:
                        transitions[state, action, next_state] = 0
                    # map next state to absorbing state.
                    # make sure that next state wasn't mapped to any other state yet:
                    assert np.sum(transitions[next_state, :, :-1]) == 0
                    transitions[next_state, :, -1] = 1.0
    # specify transition probabilities for absorbing state:
    # returning to itself for all actions.
    transitions[-1, :, -1] = 1.0
    return transitions
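# Illustration only (not part of the library): a minimal numpy sketch of how a
# terminal self-loop from a DiscreteEnv is remapped to an explicit absorbing
# state, mirroring get_transition_array above. The 2-state toy MDP is made up.
import numpy as np

n_states, n_actions = 2 + 1, 1           # +1 for the absorbing state
T = np.zeros((n_states, n_actions, n_states))
T[0, 0, 1] = 1.0                         # state 0 -> state 1, episode ends there
T[1, 0, 1] = 1.0                         # DiscreteEnv encodes a terminal self-loop
T[1, 0, 1] = 0.0                         # remove the self-loop ...
T[1, :, -1] = 1.0                        # ... and map the done state to the absorbing state
T[-1, :, -1] = 1.0                       # absorbing state returns to itself
assert np.allclose(T.sum(axis=2), 1.0)   # every (s, a) row remains a distribution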
def test_make_maze1():
    env = make_env('MazeWorld1-v0')
    assert is_unwrappable_to(env, MazeWorld)
    walls, rews = get_maps(MAP1)
    maze_env = unwrap_env(env, MazeWorld)
    assert np.all(maze_env.map_walls == walls)
    assert np.all(maze_env.map_rewards == rews)
def __init__(self, env: gym.Env, config: Union[None, dict] = None):
    """
    Parameters
    ----------
    env: gym.Env
        A DiscreteEnv environment.
    config: dict
        Configuration of hyperparameters.
    """
    assert is_unwrappable_to(env, BaseWorldModelWrapper)
    super(ValueIteration, self).__init__(env, config)
    self.model_wrapper = unwrap_env(env, BaseWorldModelWrapper)
    # +1 for absorbing state
    self.no_states = self.model_wrapper.n_states() + 1
    self.no_actions = env.action_space.n
    self.transitions = self.model_wrapper.get_transition_array()
    # will be filled in beginning of training:
    self.rewards = None
    # will be filled during training:
    self.state_values = None
    self.q_values = None
    # whenever self._policy is None, it will be re-calculated
    # based on current self.q_values when calling policy().
    self._policy = None
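# Illustration only, and not the library's implementation: a minimal value
# iteration backup using the array shapes set up above (transitions of shape
# (n_states, n_actions, n_states), rewards of shape (n_states, n_actions)).
# The toy dynamics, rewards and gamma are assumed values.
import numpy as np

n_states, n_actions = 3, 2
gamma = 0.9
transitions = np.full((n_states, n_actions, n_states), 1.0 / n_states)
rewards = np.array([[0.0, 1.0],
                    [0.5, 0.0],
                    [0.0, 0.0]])
state_values = np.zeros(n_states)
for _ in range(100):
    # Q(s, a) = R(s, a) + gamma * sum_s' T(s, a, s') * V(s')
    q_values = rewards + gamma * transitions.dot(state_values)
    state_values = q_values.max(axis=1)
print(state_values)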
def features(self, current_state: np.ndarray, action: int,
             next_state: None) -> np.ndarray:
    """Return features to be saved in step method's info dictionary.

    There are four feature variables: expected walking distance,
    probability of reaching a small reward field, probability of reaching
    a medium reward field, probability of reaching a large reward field.
    Only one of the last three values will be non-zero.
    """
    maze_env = unwrap_env(self.env, MazeWorld)
    # can only calculate features for a single state-action pair.
    assert len(current_state.shape) == 1
    # special case: not at any position:
    if np.sum(current_state[:maze_env.num_rewards]) == 0:
        return np.array([1, 0, 0, 0])
    path_len = maze_env.get_path_len(current_state, action)
    # special case: all rewards collected:
    if np.sum(current_state[maze_env.num_rewards:]) == 0:
        return np.zeros(4)
    assert path_len > 0
    # special case: walking to current position
    if path_len == 1:
        # assert that agent is walking to its current position:
        assert current_state[action] == 1.0
        expected_walking_distance = 1.0
    else:
        # calculate expected walking distance feature:
        possible_distances = np.arange(1, path_len)
        prob_getting_to_distance = (1 - RANDOM_QUIT_CHANCE)**possible_distances
        prob_stopping_at_distance = np.ones_like(
            possible_distances, dtype=np.float32)
        prob_stopping_at_distance[:-1] = RANDOM_QUIT_CHANCE
        expected_walking_distance = np.sum(
            possible_distances * prob_getting_to_distance *
            prob_stopping_at_distance)
    # coin collection probabilities:
    ccps = np.zeros(3)
    rew_value = maze_env.get_rew_value(current_state, action)
    if rew_value != 0.:
        assert rew_value in [REWARD_SMALL, REWARD_MEDIUM, REWARD_LARGE]
        rew_value_index = [REWARD_SMALL, REWARD_MEDIUM,
                           REWARD_LARGE].index(rew_value)
        if path_len == 1:
            ccps[rew_value_index] = (1 - RANDOM_QUIT_CHANCE)
        else:
            ccps[rew_value_index] = (1 - RANDOM_QUIT_CHANCE)**(path_len - 1)
    return np.concatenate((np.array([expected_walking_distance]), ccps))
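# Illustration only: the expected-walking-distance term computed in `features`
# above, for an assumed path length and quit probability. Values are made up.
import numpy as np

RANDOM_QUIT_CHANCE = 0.1   # assumed value for illustration
path_len = 5

possible_distances = np.arange(1, path_len)                      # 1, 2, 3, 4
prob_getting_to_distance = (1 - RANDOM_QUIT_CHANCE)**possible_distances
prob_stopping_at_distance = np.ones_like(possible_distances, dtype=np.float32)
prob_stopping_at_distance[:-1] = RANDOM_QUIT_CHANCE              # last entry stays 1.0
expected_walking_distance = np.sum(
    possible_distances * prob_getting_to_distance * prob_stopping_at_distance)
print(expected_walking_distance)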
def test_frozen_features():
    env = make_wrapped_env('FrozenLake-v0', with_feature_wrapper=True)
    feature_wrapper = unwrap_env(env, FeatureWrapper)
    d = feature_wrapper.feature_dimensionality()
    ranges = feature_wrapper.feature_range()
    print(ranges)
    for i in range(16):
        feature = feature_wrapper.features(None, None, i)
        assert feature.shape == d
        assert np.all(feature >= ranges[0])
        assert np.all(feature <= ranges[1])
def train(self, no_irl_iterations: int,
          no_rl_episodes_per_irl_iteration: int,
          no_irl_episodes_per_irl_iteration: int):
    """Train the MaxEnt IRL algorithm. See abstract base class for parameter types."""
    sa_visit_count, P0 = self.sa_visitations()
    # calculate feature expectations
    expert_feature_count = self.feature_count(self.expert_trajs, gamma=1.0)
    # initialize the parameters
    reward_function = FeatureBasedRewardFunction(self.env, 'random')
    theta = reward_function.parameters
    agent = self.rl_alg_factory(self.env)

    irl_iteration_counter = 0
    while irl_iteration_counter < no_irl_iterations:
        irl_iteration_counter += 1
        if self.config['verbose']:
            print('IRL ITERATION ' + str(irl_iteration_counter))
        reward_wrapper = unwrap_env(self.env, RewardWrapper)
        reward_wrapper.update_reward_parameters(theta)
        # compute policy
        agent.train(no_rl_episodes_per_irl_iteration)
        policy = agent.policy_array()
        state_values = agent.state_values
        q_values = agent.q_values
        # occupancy measure (discard absorbing state)
        d = self.occupancy_measure(policy=policy, initial_state_dist=P0)[:-1]
        # log-likelihood gradient
        grad = -(expert_feature_count - np.dot(self.feat_map.T, d))
        # gradient descent
        theta -= self.config['lr'] * grad
        evaluation_input = {
            'irl_agent': agent,
            'irl_reward': reward_function
        }
        self.evaluate_metrics(evaluation_input)
    return theta
def get_reward_array(self):
    env = unwrap_env(self.env, DiscreteEnv)
    # adding +1 to account for absorbing state
    # (reached whenever game ended)
    n_states = env.observation_space.n + 1
    n_actions = env.action_space.n
    if is_unwrappable_to(self.env, RewardWrapper):
        # get the reward function:
        reward_wrapper = unwrap_env(self.env, RewardWrapper)
        reward_function = reward_wrapper.reward_function
    else:
        reward_function = None
    rewards = np.zeros([n_states, n_actions])
    # iterate over all "from" states:
    for state, transitions_given_state in env.P.items():
        # iterate over all actions:
        for action, outcomes in transitions_given_state.items():
            # iterate over all possible outcomes:
            for probability, next_state, reward, done in outcomes:
                if reward_function is not None:
                    if done and state == next_state:
                        # don't output reward for reaching state if game
                        # is over and already in that state.
                        reward = 0
                    else:
                        rew_input = reward_wrapper.get_reward_input_for(
                            state, action, next_state)
                        reward = reward_function.reward(rew_input)
                rewards[state, action] += reward * probability
    # reward of absorbing state is zero:
    rewards[-1, :] = 0.0
    return rewards
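# Illustration only: the expected immediate reward for one (state, action) pair
# is the probability-weighted sum over outcomes, as accumulated in
# get_reward_array above. Probabilities and rewards below are made up.
outcomes = [(0.7, 1, 1.0, False),   # (probability, next_state, reward, done)
            (0.3, 2, 0.0, False)]
expected_reward = sum(probability * reward
                      for probability, _, reward, _ in outcomes)
assert abs(expected_reward - 0.7) < 1e-8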
def feature_count(env, trajs: List[Dict[str, list]],
                  gamma: float) -> np.ndarray:
    """Return empirical discounted feature counts of input trajectories.

    Parameters
    ----------
    env: gym.Env
        A gym environment, wrapped in a feature wrapper.
    trajs: List[Dict[str, list]]
        A list of trajectories. Each trajectory is a dictionary with keys
        ['states', 'actions', 'rewards', 'true_rewards', 'features'].
        The values of each dictionary are lists.
        See :func:`irl_benchmark.irl.collect.collect_trajs`.
    gamma: float
        The discount factor. Must be in range [0., 1.].

    Returns
    -------
    np.ndarray
        A numpy array containing discounted feature counts. The shape is
        the same as the trajectories' feature shapes. One scalar feature
        count per feature.
    """
    assert is_unwrappable_to(env, FeatureWrapper)
    # Initialize feature count sum to zeros of correct shape:
    feature_dim = unwrap_env(env, FeatureWrapper).feature_dimensionality()
    # feature_dim is a 1-tuple,
    # extract the feature dimensionality as integer:
    assert len(feature_dim) == 1
    feature_dim = feature_dim[0]
    feature_count_sum = np.zeros(feature_dim)
    for traj in trajs:
        assert traj['features']  # empty lists are False in Python
        # gammas is a vector containing [gamma^0, gamma^1, ..., gamma^(l-1)]
        # where l is the length of the trajectory:
        gammas = gamma**np.arange(len(traj['features']))
        traj_feature_count = np.sum(
            gammas.reshape(-1, 1) * np.array(traj['features']).reshape(
                (-1, feature_dim)),
            axis=0)
        # add trajectory's feature count:
        feature_count_sum += traj_feature_count
    # divide feature_count_sum by number of trajectories to normalize:
    result = feature_count_sum / len(trajs)
    return result
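# Illustration only: the discounted feature count of a single toy trajectory,
# computed the same vectorized way as in feature_count above. The feature
# values and gamma are made up.
import numpy as np

gamma = 0.9
features = np.array([[1.0, 0.0],
                     [0.0, 1.0],
                     [1.0, 1.0]])            # one feature vector per time step
gammas = gamma**np.arange(len(features))     # [gamma^0, gamma^1, gamma^2]
traj_feature_count = np.sum(gammas.reshape(-1, 1) * features, axis=0)
# expected result: [1 + 0.81, 0.9 + 0.81] = [1.81, 1.71]
print(traj_feature_count)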
def __init__(self, env: gym.Env, expert_trajs: List[Dict[str, list]],
             rl_alg_factory: Callable[[gym.Env], BaseRLAlgorithm],
             metrics: List[BaseMetric], config: dict):
    super(MaxCausalEntIRL, self).__init__(env, expert_trajs, rl_alg_factory,
                                          metrics, config)
    assert is_unwrappable_to(env, DiscreteEnv)
    assert is_unwrappable_to(env, FeatureWrapper)
    # get transition matrix (with absorbing state)
    self.transition_matrix = get_transition_matrix(self.env)
    self.n_states, self.n_actions, _ = self.transition_matrix.shape
    # get map of features for all states:
    feature_wrapper = unwrap_env(env, FeatureWrapper)
    self.feat_map = feature_wrapper.feature_array()
def train(self, no_irl_iterations: int,
          no_rl_episodes_per_irl_iteration: int,
          no_irl_episodes_per_irl_iteration: int):
    """Train algorithm. See abstract base class for parameter types."""
    # calculate feature expectations
    expert_feature_count = self.feature_count(self.expert_trajs, gamma=1.0)
    # start with an agent
    agent = self.rl_alg_factory(self.env)
    reward_wrapper = unwrap_env(self.env, RewardWrapper)
    theta = reward_wrapper.reward_function.parameters

    irl_iteration_counter = 0
    while irl_iteration_counter < no_irl_iterations:
        irl_iteration_counter += 1
        if self.config['verbose']:
            print('IRL ITERATION ' + str(irl_iteration_counter))
        # compute policy
        agent.train(no_rl_episodes_per_irl_iteration)
        policy = agent.policy_array()
        # compute state visitation frequencies, discard absorbing state
        svf = self.expected_svf(policy)[:-1]
        # compute gradients
        grad = (expert_feature_count - self.feat_map.T.dot(svf))
        # update params
        theta += self.config['lr'] * grad
        reward_wrapper.update_reward_parameters(theta)
        evaluation_input = {
            'irl_agent': agent,
            'irl_reward': reward_wrapper.reward_function
        }
        self.evaluate_metrics(evaluation_input)
    return theta
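# Illustration only: the gradient step shared by the MaxEnt-style train loops
# above, with made-up numbers. theta moves so that the feature counts induced
# by the state visitation frequencies approach the expert feature counts.
import numpy as np

lr = 0.1
feat_map = np.array([[1.0, 0.0],
                     [0.0, 1.0],
                     [1.0, 1.0]])            # one feature vector per state
svf = np.array([0.5, 0.3, 0.2])              # state visitation frequencies
expert_feature_count = np.array([0.9, 0.6])
grad = expert_feature_count - feat_map.T.dot(svf)
theta = np.zeros(2)
theta += lr * grad                           # -> [0.02, 0.01]
print(theta)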
def test_update_parameters_frozen_feature():
    def rew_fun_factory(env):
        return FeatureBasedRewardFunction(env, 'random')

    env = make_wrapped_env(
        'FrozenLake-v0',
        with_feature_wrapper=True,
        reward_function_factory=rew_fun_factory)
    reward_wrapper = unwrap_env(env, RewardWrapper)
    params = np.copy(reward_wrapper.reward_function.parameters)
    domain = reward_wrapper.reward_function.domain()
    rews = reward_wrapper.reward_function.reward(domain)
    reward_wrapper.update_reward_parameters(2 * params)
    rews2 = reward_wrapper.reward_function.reward(domain)
    assert np.all(np.isclose(2 * rews, rews2))
    reward_wrapper.update_reward_parameters(np.zeros_like(params))
    rews3 = reward_wrapper.reward_function.reward(domain)
    assert np.all(np.isclose(rews3, np.zeros_like(rews3)))
def feature_array(self) -> np.ndarray:
    """Get features for the entire domain as an array.

    Has to be overwritten in each feature wrapper.
    Wrappers for large environments will not implement this method.

    Returns
    -------
    np.ndarray
        The features for the entire domain as an array.
        Shape: (n_states, n_actions, d).
    """
    maze_world = unwrap_env(self.env, MazeWorld)
    num_rewards = maze_world.num_rewards
    n_states = num_rewards * 2**num_rewards
    feature_array = np.zeros((n_states, num_rewards, 4))
    for s in range(n_states):
        for a in range(num_rewards):
            state = maze_world.index_to_state(s)
            feature = self.features(state, a, None)
            feature_array[s, a, :] = feature
    return feature_array
def train(self, no_irl_iterations: int,
          no_rl_episodes_per_irl_iteration: int,
          no_irl_episodes_per_irl_iteration: int
          ) -> Tuple[BaseRewardFunction, BaseRLAlgorithm]:
    """Train the apprenticeship learning IRL algorithm.

    Parameters
    ----------
    no_irl_iterations: int
        The number of iterations the algorithm should be run.
    no_rl_episodes_per_irl_iteration: int
        The number of episodes the RL algorithm is allowed to run in each
        iteration of the IRL algorithm.
    no_irl_episodes_per_irl_iteration: int
        The number of episodes permitted to be run in each iteration to
        update the current reward estimate (e.g. to estimate state
        frequencies of the currently optimal policy).

    Returns
    -------
    Tuple[BaseRewardFunction, BaseRLAlgorithm]
        The estimated reward function and an RL agent trained for this
        estimate.
    """
    # Initialize training with a random agent.
    agent = RandomAgent(self.env)

    irl_iteration_counter = 0
    while irl_iteration_counter < no_irl_iterations:
        irl_iteration_counter += 1
        if self.config['verbose']:
            print('IRL ITERATION ' + str(irl_iteration_counter))

        # Estimate feature count of current agent.
        trajs = collect_trajs(
            self.env,
            agent,
            no_trajectories=no_irl_episodes_per_irl_iteration)
        current_feature_count = self.feature_count(
            trajs, gamma=self.config['gamma'])
        print('CURRENT FEATURE COUNT:')
        print(current_feature_count)

        # add new feature count to list of feature counts
        self.feature_counts.append(current_feature_count)
        # for SVM mode:
        self.labels.append(-1.)
        # convert to numpy array:
        feature_counts = np.array(self.feature_counts)
        labels = np.array(self.labels)

        # update reward coefficients based on mode specified in config:
        if self.config['mode'] == 'projection':
            # projection mode:
            if irl_iteration_counter == 1:
                # initialize feature_count_bar in first iteration,
                # set to first non-expert feature count:
                feature_count_bar = feature_counts[1]
            else:
                # not first iteration.
                # calculate line through last feature_count_bar and
                # last non-expert feature count:
                line = feature_counts[-1] - feature_count_bar
                # new feature_count_bar is orthogonal projection of
                # expert's feature count onto the line:
                feature_count_bar += np.dot(
                    line, feature_counts[0] - feature_count_bar) / np.dot(
                        line, line) * line
            reward_coefficients = feature_counts[0] - feature_count_bar
            # compute distance as L2 norm of reward coefficients (t^(i) in paper):
            distance = np.linalg.norm(reward_coefficients, ord=2)
        elif self.config['mode'] == 'svm':
            # svm mode:
            # create quadratic programming problem definition:
            weights = cvx.Variable(feature_counts.shape[1])
            bias = cvx.Variable()
            objective = cvx.Minimize(cvx.norm(weights, 2))
            constraints = [
                cvx.multiply(labels, (feature_counts * weights + bias)) >= 1
            ]
            problem = cvx.Problem(objective, constraints)
            # solve quadratic program:
            problem.solve()
            if weights.value is None:
                # TODO: we need to handle empty solution better.
                raise RuntimeError(
                    'Empty solution set for linearly separable SVM.')
            if self.config['verbose']:
                # print support vectors
                # (which of the last iterations were relevant for the current result?)
                svm_classifications = feature_counts.dot(
                    weights.value) + bias.value
                support_vectors = np.where(
                    np.isclose(np.abs(svm_classifications), 1))[0]
                print('The support vectors are from iterations number ' +
                      str(support_vectors))
            reward_coefficients = weights.value
            distance = 2 / problem.value
        else:
            raise NotImplementedError()

        if self.config['verbose']:
            print('Distance: ' + str(distance))
        self.distances.append(distance)
        print(reward_coefficients)

        # update reward function
        reward_wrapper = unwrap_env(self.env, RewardWrapper)
        reward_wrapper.update_reward_parameters(reward_coefficients)

        # check stopping criterion:
        if distance <= self.config['epsilon']:
            if self.config['verbose']:
                print("Feature counts matched within " +
                      str(self.config['epsilon']) + ".")
            break

        # create new RL-agent
        agent = self.rl_alg_factory(self.env)
        # train agent (with new reward function)
        agent.train(no_rl_episodes_per_irl_iteration)
        evaluation_input = {
            'irl_agent': agent,
            'irl_reward': reward_wrapper.reward_function
        }
        self.evaluate_metrics(evaluation_input)

    return reward_wrapper.reward_function, agent
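# Illustration only: one projection update of the 'projection' mode above
# (Abbeel & Ng style apprenticeship learning), with made-up feature counts.
import numpy as np

mu_expert = np.array([1.0, 0.5])             # expert feature count
feature_count_bar = np.array([0.2, 0.1])     # previous projected feature count
mu_new = np.array([0.8, 0.9])                # feature count of the latest agent
line = mu_new - feature_count_bar
feature_count_bar = feature_count_bar + np.dot(
    line, mu_expert - feature_count_bar) / np.dot(line, line) * line
reward_coefficients = mu_expert - feature_count_bar
distance = np.linalg.norm(reward_coefficients, ord=2)
print(reward_coefficients, distance)         # -> [0.32, -0.24], 0.4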
def test_unwrap():
    env = make_env('FrozenLake-v0')
    assert env.env is unwrap_env(env, DiscreteEnv)
    # No unwrapping needed:
    assert env is unwrap_env(env, gym.Env)
    # Unwrap all the way:
    assert env.env is unwrap_env(env)
    env = FrozenLakeFeatureWrapper(env)
    assert env.env.env is unwrap_env(env, DiscreteEnv)
    # No unwrapping needed:
    assert env is unwrap_env(env, FrozenLakeFeatureWrapper)
    # Unwrap all the way:
    assert env.env.env is unwrap_env(env)
    # check types:
    assert isinstance(unwrap_env(env, DiscreteEnv), DiscreteEnv)
    assert isinstance(
        unwrap_env(env, feature_wrapper.FeatureWrapper),
        feature_wrapper.FeatureWrapper)
    assert isinstance(
        unwrap_env(env, FrozenLakeFeatureWrapper), FrozenLakeFeatureWrapper)
    assert isinstance(
        unwrap_env(env, FrozenLakeFeatureWrapper),
        feature_wrapper.FeatureWrapper)
    assert isinstance(unwrap_env(env), gym.Env)
def __init__(self, env):
    assert is_unwrappable_to(env, MazeWorld)
    super(MazeModelWrapper, self).__init__(env)
    self.maze_env = unwrap_env(self.env, MazeWorld)
def _get_model_arrays(self, return_transitions=True, return_rewards=True):
    if return_rewards:
        if is_unwrappable_to(self.env, RewardWrapper):
            reward_wrapper = unwrap_env(self.env, RewardWrapper)
        else:
            reward_wrapper = None
    assert return_transitions or return_rewards
    # +1 for absorbing state:
    n_states = self.n_states() + 1
    absorbing_s = n_states - 1
    num_rewards = n_actions = self.maze_env.action_space.n
    paths = self.maze_env.paths

    if return_transitions:
        coords_trans_state = []
        coords_trans_action = []
        coords_trans_next_state = []
        trans_data = []

    def add_transition(s, a, sn, p):
        coords_trans_state.append(s)
        coords_trans_action.append(a)
        coords_trans_next_state.append(sn)
        trans_data.append(p)

    if return_rewards:
        rewards = np.zeros((n_states, n_actions))

    for s in tqdm(range(n_states - 1)):
        for a in range(n_actions):
            state = self.index_to_state(s)
            if return_rewards and reward_wrapper is not None:
                rew_input = reward_wrapper.get_reward_input_for(
                    state, a, None)
                wrapped_reward = reward_wrapper.reward_function.reward(
                    rew_input).item()
            if np.sum(state[num_rewards:]) == 0:
                if return_transitions:
                    add_transition(s, a, absorbing_s, 1.)
                if return_rewards:
                    if reward_wrapper is None:
                        rewards[s, a] = 0
                    else:
                        rewards[s, a] = wrapped_reward
                continue
            pos_index = int(np.where(state[:num_rewards] > 0)[0][0])
            path = paths[pos_index][a]
            if len(path) == 1 or pos_index == a:
                assert pos_index == a
                if return_transitions:
                    add_transition(s, a, s, 1. - RANDOM_QUIT_CHANCE)
                    add_transition(s, a, absorbing_s, RANDOM_QUIT_CHANCE)
                if return_rewards:
                    if reward_wrapper is None:
                        rewards[s, a] = REWARD_MOVE
                        if state[num_rewards + a] != 0:
                            rews_where = self.maze_env.rews_where
                            rewards[s, a] += float(
                                self.maze_env.map_rewards[rews_where[0][a],
                                                          rews_where[1][a]]
                            ) * (1 - RANDOM_QUIT_CHANCE)
                    else:
                        rewards[s, a] = wrapped_reward
                continue
            success_prob = (1 - RANDOM_QUIT_CHANCE)**(len(path) - 1)
            if return_transitions:
                new_state = get_next_state(state, a, num_rewards)
                new_s = self.state_to_index(new_state)
                add_transition(s, a, new_s, success_prob)
                add_transition(s, a, absorbing_s, 1. - success_prob)
            if return_rewards:
                if reward_wrapper is None:
                    if state[num_rewards + a] == 0:
                        # reward is already collected at this field:
                        rew_value = 0
                    else:
                        rews_where = self.maze_env.rews_where
                        rew_value = float(
                            self.maze_env.map_rewards[rews_where[0][a],
                                                      rews_where[1][a]])
                    possible_distances = np.arange(1, len(path))
                    prob_getting_to_distance = (
                        1 - RANDOM_QUIT_CHANCE)**possible_distances
                    prob_stopping_at_distance = np.ones_like(
                        possible_distances, dtype=np.float32)
                    prob_stopping_at_distance[:-1] = RANDOM_QUIT_CHANCE
                    expected_walking_distance = np.sum(
                        possible_distances * prob_getting_to_distance *
                        prob_stopping_at_distance)
                    weighted_reward = (expected_walking_distance * REWARD_MOVE
                                       + success_prob * rew_value)
                    rewards[s, a] = weighted_reward
                else:
                    rewards[s, a] = wrapped_reward

    for a in range(n_actions):
        if return_transitions:
            add_transition(absorbing_s, a, absorbing_s, 1.)
        if return_rewards:
            rewards[absorbing_s, a] = 0

    if return_transitions:
        coords = np.array([
            coords_trans_state, coords_trans_action, coords_trans_next_state
        ])
        transitions = sparse.COO(coords, trans_data)
    if return_transitions:
        if return_rewards:
            return transitions, rewards
        return transitions
    return rewards