Example 1
def test_tab_featb_functions():
    env = feature_make('FrozenLake8x8-v0')
    params = np.zeros(64)
    params[-1] = 1.
    rf = FeatureBasedRewardFunction(env, params)
    domain = rf.domain()
    rf2 = TabularRewardFunction(env, params)
    rf_true = make_true_reward('FrozenLake8x8-v0')
    rew1 = rf.reward(domain)
    rew2 = rf2.reward(domain)
    rew_true = rf_true.reward(domain)
    assert np.all(rew_true == rew1)
    assert np.all(rew1 == rew2)
    assert rew_true.shape == rew1.shape
    assert rew1.shape == rew2.shape
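For intuition on why the two reward functions agree: with a one-hot encoding of the state space, the feature-based reward is just a table lookup, so a FeatureBasedRewardFunction with these parameters coincides with the corresponding TabularRewardFunction. A minimal numpy sketch of that equivalence (plain numpy, not the library API):

import numpy as np

n_states = 64
params = np.zeros(n_states)
params[-1] = 1.                      # reward only in the final state

one_hot = np.eye(n_states)           # row s is the one-hot feature vector of state s
feature_based_rewards = one_hot.dot(params)   # reward computed from features
tabular_rewards = params                      # reward read from a table

assert np.all(feature_based_rewards == tabular_rewards)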
Example 2
    def train(self, no_irl_iterations: int,
              no_rl_episodes_per_irl_iteration: int,
              no_irl_episodes_per_irl_iteration: int):
        """Train algorithm. See abstract base class for parameter types."""

        # calculate feature expectations
        expert_feature_count = self.feature_count(self.expert_trajs, gamma=1.0)

        # start with an agent
        agent = self.rl_alg_factory(self.env)

        reward_function = FeatureBasedRewardFunction(self.env, 'random')
        self.env.update_reward_function(reward_function)
        theta = reward_function.parameters

        irl_iteration_counter = 0
        while irl_iteration_counter < no_irl_iterations:
            irl_iteration_counter += 1

            if self.config['verbose']:
                print('IRL ITERATION ' + str(irl_iteration_counter))
            # compute policy
            agent.train(no_rl_episodes_per_irl_iteration)

            policy = agent.policy_array()

            # compute state visitation frequencies, discard absorbing state
            svf = self.expected_svf(policy)[:-1]

            # compute gradients
            grad = (expert_feature_count - self.feat_map.T.dot(svf))

            # update params
            theta += self.config['lr'] * grad

            reward_function.update_parameters(theta)

            evaluation_input = {
                'irl_agent': agent,
                'irl_reward': reward_function
            }
            self.evaluate_metrics(evaluation_input)

        return theta
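The heart of this loop is the maximum-entropy gradient: expert feature counts minus the feature counts induced by the current policy's state visitation frequencies. A self-contained numpy sketch of one such update step, with made-up shapes and values (none of these names come from the library):

import numpy as np

n_states, n_features = 16, 16
feat_map = np.eye(n_states)                     # feature vector per state, shape (n_states, n_features)
expert_feature_count = np.random.rand(n_features)
svf = np.random.dirichlet(np.ones(n_states))    # state visitation frequencies of the current policy

theta = np.zeros(n_features)
lr = 0.1
grad = expert_feature_count - feat_map.T.dot(svf)
theta += lr * grad                              # one gradient ascent step on the log-likelihood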
Example 3
    def train(self, no_irl_iterations: int,
              no_rl_episodes_per_irl_iteration: int,
              no_irl_episodes_per_irl_iteration: int):
        """

        """

        sa_visit_count, P0 = self.sa_visitations()

        # calculate feature expectations
        expert_feature_count = self.feature_count(self.expert_trajs, gamma=1.0)

        # initialize the parameters
        reward_function = FeatureBasedRewardFunction(self.env, 'random')
        theta = reward_function.parameters

        agent = self.rl_alg_factory(self.env)

        irl_iteration_counter = 0

        while irl_iteration_counter < no_irl_iterations:
            irl_iteration_counter += 1

            if self.config['verbose']:
                print('IRL ITERATION ' + str(irl_iteration_counter))

            reward_wrapper = unwrap_env(self.env, RewardWrapper)
            reward_wrapper.update_reward_parameters(theta)

            # compute policy
            agent.train(no_rl_episodes_per_irl_iteration)

            policy = agent.policy_array()
            state_values = agent.state_values
            q_values = agent.q_values

            # occupancy measure
            d = self.occupancy_measure(policy=policy,
                                       initial_state_dist=P0)[:-1]

            # log-likelihood gradient
            grad = -(expert_feature_count - np.dot(self.feat_map.T, d))

            # gradient descent (on the negative log-likelihood)
            theta -= self.config['lr'] * grad

            evaluation_input = {
                'irl_agent': agent,
                'irl_reward': reward_function
            }
            self.evaluate_metrics(evaluation_input)

        return theta
def test_is_unwrappable_to():
    assert is_unwrappable_to(make_env('FrozenLake-v0'), TimeLimit)
    assert is_unwrappable_to(make_env('FrozenLake-v0'), DiscreteEnv)
    assert is_unwrappable_to(feature_wrapper.make('FrozenLake-v0'),
                             FrozenLakeFeatureWrapper)
    assert is_unwrappable_to(feature_wrapper.make('FrozenLake8x8-v0'),
                             FrozenLakeFeatureWrapper)
    assert is_unwrappable_to(feature_wrapper.make('FrozenLake-v0'),
                             feature_wrapper.FeatureWrapper)
    env = feature_wrapper.make('FrozenLake-v0')
    reward_function = FeatureBasedRewardFunction(env, 'random')
    env = RewardWrapper(env, reward_function)
    assert is_unwrappable_to(env, RewardWrapper)
    assert is_unwrappable_to(env, feature_wrapper.FeatureWrapper)
    assert is_unwrappable_to(env, DiscreteEnv)
    assert is_unwrappable_to(env, gym.Env)
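As background for what these assertions check: gym wrappers keep the environment they wrap in their .env attribute, so an is_unwrappable_to-style helper can simply walk that chain until it reaches an instance of the requested class. A hedged sketch of the idea (not the library's actual implementation):

import gym

def can_unwrap_to(env, target_class):
    """Return True if env, or any environment it wraps, is an instance of target_class."""
    while True:
        if isinstance(env, target_class):
            return True
        if isinstance(env, gym.Wrapper):
            env = env.env  # step one wrapper deeper
        else:
            return False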
Example 5
    def train(self,
              step_size=1e-2,
              time_limit=60,
              n_trajs=10000,
              verbose=False):
        '''Train for at most time_limit seconds w/ n_trajs non-expert trajs.
        Args:
        step_size -- `float`, size of each gradient ascent step
        time_limit -- `int`, number of seconds to train
        n_trajs -- `int`, number of non-expert trajs to be collected
        verbose -- `bool`, if true print gradient norms and reward weights
        Returns nothing.
        '''
        t0 = time.time()
        reward_coefficients = self.reward_function.parameters
        trajs = collect_trajs(self.env, self.baseline_agent, n_trajs,
                              self.horizon)

        # Estimate subgradient based on collected trajectories, then
        # update reward coefficients.
        if verbose:
            print('Starting subgradient ascent...')
        iteration_counter = 0
        while time.time() < t0 + time_limit:
            # replace the previous with the following line when using pdb
            #  for _ in range(50):
            subgrads = self.subgradients(trajs, reward_coefficients)
            reward_coefficients += step_size * subgrads
            reward_coefficients /= np.linalg.norm(reward_coefficients)
            iteration_counter += 1
            if verbose and iteration_counter < 10:
                print('ITERATION ' + str(iteration_counter) + ' grad norm: ' +
                      str(np.linalg.norm(subgrads)))
                print('ITERATION ' + str(iteration_counter) +
                      ' reward coefficients: ' + str(reward_coefficients))
        if verbose:
            print('Final reward coefficients: ' + str(reward_coefficients))

        self.reward_function = FeatureBasedRewardFunction(
            self.env_rew, reward_coefficients)
        self.env_rew.update_reward_function(self.reward_function)
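The loop above is projected subgradient ascent: after every step the coefficient vector is renormalised onto the unit L2 sphere. A toy numpy sketch of the same update pattern, using a dummy subgradient in place of self.subgradients:

import numpy as np

def dummy_subgradient(w):
    # Stand-in for self.subgradients(trajs, w); pulls w towards the all-ones direction.
    return 1. - w

w = np.random.randn(4)
step_size = 1e-2
for _ in range(100):
    w += step_size * dummy_subgradient(w)
    w /= np.linalg.norm(w)  # project back onto the unit sphere
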
def reward_function_factory(env):
    return FeatureBasedRewardFunction(env, 'random')
Example 7
# Make a FrozenLake environment wrapped to provide a one-hot encoding of the
# state space as features.
env = feature_wrapper.make('FrozenLake-v0')

# Generate expert trajectories.
expert_agent = rl_alg_factory(env)
print('Training expert agent...')
expert_agent.train(15)
print('Done training expert')
expert_trajs = collect_trajs(env, expert_agent, no_episodes,
                             max_steps_per_episode, store_to)

# You can comment out the previous block once expert data has been generated,
# and instead load the trajectories from file by uncommenting the next two
# lines:
# with open(store_to + 'trajs.pkl', 'rb') as f:
#     expert_trajs = pickle.load(f)

# Provide random reward function as initial reward estimate.
# This probably isn't really required.
reward_function = FeatureBasedRewardFunction(env, np.random.normal(size=16))
env = RewardWrapper(env, reward_function)

# Run the projection algorithm for up to 10 minutes.
appr_irl = ApprIRL(env, expert_trajs, rl_alg_factory, proj=True)
appr_irl.train(time_limit=600,
               rl_time_per_iteration=45,
               eps=0,
               no_trajs=100,
               max_steps_per_episode=100,
               verbose=True)
Example 8
def quick_run_alg(alg_class, config=None):
    # Avoid a mutable default argument.
    if config is None:
        config = {}
    env = feature_wrapper.make('FrozenLake-v0')
    reward_function = FeatureBasedRewardFunction(env, 'random')
    env = RewardWrapper(env, reward_function)

    def rl_alg_factory(env):
        return ValueIteration(env, {})

    expert_trajs = [{
        'states': [
            0, 0, 4, 0, 4, 8, 4, 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 4, 4, 8, 9,
            8, 8, 9, 10, 14, 15
        ],
        'actions': [
            0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1,
            3, 3, 1, 0, 1
        ],
        'rewards': [
            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
            1.0
        ],
        'true_rewards': [],
        'features': [
            np.array([
                1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.
            ])
        ]
    }, {
        'states': [0, 4, 8, 8, 9, 10, 6, 2, 6, 10, 14, 15],
        'actions': [0, 0, 3, 3, 1, 0, 2, 0, 2, 0, 1],
        'rewards': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
        'true_rewards': [],
        'features': [
            np.array([
                0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.
            ]),
            np.array([
                0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.
            ])
        ]
    }]
    metrics = []
    alg = alg_class(env, expert_trajs, rl_alg_factory, metrics, config)
    alg.train(2, 2, 2)
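The long 'features' lists in these hand-written trajectories are just one-hot encodings (16 states for FrozenLake-v0) of the state reached after each action, so they can be generated instead of typed out. A short sketch that reproduces the feature list of the second trajectory above:

import numpy as np

n_states = 16
states = [0, 4, 8, 8, 9, 10, 6, 2, 6, 10, 14, 15]
# One feature vector per transition, encoding the successor state:
features = [np.eye(n_states)[s] for s in states[1:]]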
Example 9
    def train(self,
              time_limit=300,
              rl_time_per_iteration=30,
              eps=0,
              no_trajs=1000,
              max_steps_per_episode=1000,
              verbose=False):
        '''Accumulate feature counts and estimate reward function.

        Args:
          time_limit: total training time in seconds.
          rl_time_per_iteration: RL training time per iteration in seconds.
          eps: terminate if distance to expert feature counts is below eps.
          no_trajs: number of trajectories to collect per iteration.
          max_steps_per_episode: maximum number of steps per collected episode.
          verbose: more verbose prints at runtime if true.

        Returns nothing.
        '''
        t0 = time.time()

        if verbose:
            alg_mode = 'projection' if self.proj else 'SVM'
            print('Running Apprenticeship IRL in mode: ' + alg_mode)

        # start with random agent:
        agent = RandomAgent(self.env)

        iteration_counter = 0
        while time.time() < t0 + time_limit:
            iteration_counter += 1
            if verbose:
                print('ITERATION ' + str(iteration_counter))
            trajs = collect_trajs(self.env, agent,
                                  no_episodes=no_trajs,
                                  max_steps_per_episode=max_steps_per_episode)
            if verbose:
                print('Average true reward per episode: '
                      + str(true_reward_per_traj(trajs)))
            current_feature_count = self.feature_count(trajs)
            self.feature_counts.append(current_feature_count)
            self.labels.append(-1.0)

            feature_counts = np.array(self.feature_counts)
            labels = np.array(self.labels)

            if self.proj:
                # using projection version of the algorithm
                if iteration_counter == 1:
                    feature_count_bar = feature_counts[1]
                else:
                    line = feature_counts[-1] - feature_count_bar
                    feature_count_bar += np.dot(
                        line, feature_counts[0] - feature_count_bar) / np.dot(
                            line, line) * line
                reward_coefficients = feature_counts[0] - feature_count_bar
                distance = np.linalg.norm(reward_coefficients)

            else:
                # using SVM version of the algorithm ("max-margin" in
                # the paper, not to be confused with max-margin planning)
                w = cvx.Variable(feature_counts.shape[1])
                b = cvx.Variable()

                objective = cvx.Minimize(cvx.norm(w, 2))
                constraints = [
                    cvx.multiply(labels, (feature_counts * w + b)) >= 1
                ]

                problem = cvx.Problem(objective, constraints)
                problem.solve()
                if w.value is None:
                    print('NO MORE SVM SOLUTION!!')
                    return

                svm_classifications = feature_counts.dot(w.value) + b.value
                support_vector_rows = np.where(
                    np.isclose(np.abs(svm_classifications), 1))[0]

                reward_coefficients = w.value
                distance = 2 / problem.value

                if verbose:
                    print('The support vectors are from iterations number ' +
                          str(support_vector_rows))
            if verbose:
                print('Reward coefficients: ' + str(reward_coefficients))
                print('Distance: ' + str(distance))

            self.distances.append(distance)

            self.reward_function = FeatureBasedRewardFunction(
                self.env, reward_coefficients)
            self.env.update_reward_function(self.reward_function)

            if distance <= eps:
                if verbose:
                    print("Feature counts matched within " + str(eps) + ".")
                break

            if time.time() + rl_time_per_iteration >= t0 + time_limit:
                break

            agent = self.rl_alg_factory(self.env)
            agent.train(rl_time_per_iteration)
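The projection branch follows Abbeel & Ng's apprenticeship learning update: feature_count_bar is moved to the orthogonal projection of the expert feature count (feature_counts[0]) onto the line through the previous feature_count_bar and the newest non-expert feature count. A standalone 2D numpy illustration of that projection (toy numbers only):

import numpy as np

expert = np.array([1.0, 1.0])   # plays the role of feature_counts[0]
bar = np.array([0.0, 0.0])      # previous feature_count_bar
latest = np.array([2.0, 0.0])   # newest non-expert feature count

line = latest - bar
bar = bar + np.dot(line, expert - bar) / np.dot(line, line) * line
reward_coefficients = expert - bar
distance = np.linalg.norm(reward_coefficients)
print(bar, reward_coefficients, distance)  # [1. 0.] [0. 1.] 1.0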
Example 10
    def train(self,
              feat_map,
              time_limit=300,
              rl_time_per_iteration=15,
              verbose=False):
        """
        Maximum Entropy Inverse Reinforcement Learning (Maxent IRL)
        inputs:
          feat_map    NxD matrix - the features for each state
          P_a         NxN_ACTIONSxN matrix - P_a[s0, a, s1] is the transition prob of
                                             landing at state s1 when taking action
                                             a at state s0
          gamma       float - RL discount factor
          trajs       a list of demonstrations
          lr          float - learning rate
          n_iters     int - number of optimization steps
        returns
          rewards     Nx1 vector - recovered state rewards
        """
        t0 = time.time()

        # init parameters
        theta = np.random.uniform(size=(feat_map.shape[1], ))

        # calc feature expectations
        feat_exp = np.zeros([feat_map.shape[1]])
        for episode in self.expert_trajs:
            for state in episode['states']:
                feat_exp += feat_map[state]
        feat_exp = feat_exp / len(self.expert_trajs)
        #print(feat_exp)

        agent = self.rl_alg_factory(self.env)

        # training
        iteration_counter = 0
        while time.time() < t0 + time_limit:
            iteration_counter += 1
            if verbose:
                print('iteration: {}'.format(iteration_counter))

            reward_function_estimate = FeatureBasedRewardFunction(
                self.env, theta)
            self.env.update_reward_function(reward_function_estimate)

            # compute policy
            agent.train(time_limit=rl_time_per_iteration)

            policy = agent.pi

            # compute state visitation frequencies
            svf = self.expected_svf(policy)

            # compute gradients
            grad = -(feat_exp - feat_map.T.dot(svf))

            # update params
            theta += self.lr * grad

        # Store the final reward function estimate and return its parameters.
        self.reward_function = reward_function_estimate
        return theta
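Note that the feature expectation above is the undiscounted per-trajectory average of visited-state features. If a discounted variant were wanted, it could look like the following sketch (the gamma weighting is an assumption, not part of the code above):

import numpy as np

def discounted_feature_expectation(trajs, feat_map, gamma=0.99):
    """Gamma-discounted empirical feature expectation over a list of trajectories."""
    feat_exp = np.zeros(feat_map.shape[1])
    for episode in trajs:
        for t, state in enumerate(episode['states']):
            feat_exp += (gamma ** t) * feat_map[state]
    return feat_exp / len(trajs)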
def reward_function_factory(env):
    return FeatureBasedRewardFunction(env, true_rews[:-1])
Example 12
    def train(self, no_irl_iterations: int,
              no_rl_episodes_per_irl_iteration: int,
              no_irl_episodes_per_irl_iteration: int):
        """

        """

        sa_visit_count, P0 = self.sa_visitations()

        # mean_s_visit_count = np.sum(sa_visit_count, 1) / len(self.expert_trajs)

        # calculate feature expectations
        expert_feature_count = self.feature_count(self.expert_trajs, gamma=1.0)

        # mean_feature_count = np.dot(self.feat_map.T, expert_feature_count )

        # initialize the parameters
        # theta = np.random.rand(self.feat_map.shape[1])

        reward_function = FeatureBasedRewardFunction(self.env, 'random')
        theta = reward_function.parameters

        agent = self.rl_alg_factory(self.env)

        irl_iteration_counter = 0

        while irl_iteration_counter < no_irl_iterations:
            irl_iteration_counter += 1

            if self.config['verbose']:
                print('IRL ITERATION ' + str(irl_iteration_counter))

            reward_function_estimate = FeatureBasedRewardFunction(
                self.env, theta)
            self.env.update_reward_function(reward_function_estimate)

            # compute policy
            agent.train(no_rl_episodes_per_irl_iteration)

            policy = agent.policy_array()
            state_values = agent.state_values
            q_values = agent.q_values

            # Log-Likelihood
            # l = np.sum(sa_visit_count * (q_values - state_values.T))  # check: broadcasting works as intended or not

            # occupancy measure
            d = self.occupancy_measure(policy=policy, P0=P0)[:-1]

            # log-likelihood gradient
            grad = -(expert_feature_count - np.dot(self.feat_map.T, d))

            # gradient descent (on the negative log-likelihood)
            theta -= self.config['lr'] * grad

            evaluation_input = {
                'irl_agent': agent,
                'irl_reward': reward_function_estimate
            }
            self.evaluate_metrics(evaluation_input)

        return theta
Example 13
import gym
import numpy as np

from irl_benchmark.irl.algorithms.maxent.me_irl import MaxEnt
from irl_benchmark.irl.collect import collect_trajs
from irl_benchmark.irl.feature.feature_wrapper import FrozenLakeFeatureWrapper
from irl_benchmark.irl.reward.reward_function import FeatureBasedRewardFunction
from irl_benchmark.irl.reward.reward_wrapper import RewardWrapper
from irl_benchmark.rl.algorithms.value_iteration import ValueIteration
from irl_benchmark.utils.utils import get_transition_matrix

store_to = 'data/frozen/expert/'
no_episodes = 1000
max_steps_per_episode = 1000

env = gym.make('FrozenLake8x8-v0')
env = FrozenLakeFeatureWrapper(env)
initial_reward_function_estimate = FeatureBasedRewardFunction(
    env=env, parameters=np.zeros(64))
env = RewardWrapper(env=env, reward_function=initial_reward_function_estimate)

# Generate expert trajectories.
expert_agent = ValueIteration(env)
print('Training expert agent...')
expert_agent.train(30)
expert_trajs = collect_trajs(env, expert_agent, no_episodes,
                             max_steps_per_episode, store_to)

feat_map = np.eye(64)

transition_dynamics = get_transition_matrix(env)


def rl_alg_factory(env):
Example 14
def reward_function_factory(env):
    return FeatureBasedRewardFunction(env, np.zeros(16))
Example 15
    def train(self, no_irl_iterations: int,
              no_rl_episodes_per_irl_iteration: int,
              no_irl_episodes_per_irl_iteration: int
              ) -> Tuple[BaseRewardFunction, BaseRLAlgorithm]:
        """Train the apprenticeship learning IRL algorithm.

        Parameters
        ----------
        no_irl_iterations: int
            The number of iterations the algorithm should be run for.
        no_rl_episodes_per_irl_iteration: int
            The number of episodes the RL algorithm is allowed to run in
            each iteration of the IRL algorithm.
        no_irl_episodes_per_irl_iteration: int
            The number of episodes permitted to be run in each iteration
            to update the current reward estimate (e.g. to estimate state frequencies
            of the currently optimal policy).

        Returns
        -------
        Tuple[BaseRewardFunction, BaseRLAlgorithm]
            The estimated reward function and an RL agent trained for this estimate.
        """

        # Initialize training with a random agent.
        agent = RandomAgent(self.env)

        irl_iteration_counter = 0
        while irl_iteration_counter < no_irl_iterations:
            irl_iteration_counter += 1

            if self.config['verbose']:
                print('IRL ITERATION ' + str(irl_iteration_counter))

            # Estimate feature count of current agent.
            trajs = collect_trajs(
                self.env,
                agent,
                no_trajectories=no_irl_episodes_per_irl_iteration)
            current_feature_count = self.feature_count(
                trajs, gamma=self.config['gamma'])

            # add new feature count to list of feature counts
            self.feature_counts.append(current_feature_count)
            # for SVM mode:
            self.labels.append(-1.)

            # convert to numpy array:
            feature_counts = np.array(self.feature_counts)
            labels = np.array(self.labels)

            # update reward coefficients based on mode specified in config:
            if self.config['mode'] == 'projection':
                # projection mode:
                if irl_iteration_counter == 1:
                    # initialize feature_count_bar in first iteration
                    # set to first non-expert feature count:
                    feature_count_bar = feature_counts[1]
                else:
                    # not first iteration.
                    # calculate line through last feature_count_bar and
                    # last non-expert feature count:
                    line = feature_counts[-1] - feature_count_bar
                    # new feature_count_bar is orthogonal projection of
                    # expert's feature count onto the line:
                    feature_count_bar += np.dot(
                        line, feature_counts[0] - feature_count_bar) / np.dot(
                            line, line) * line
                reward_coefficients = feature_counts[0] - feature_count_bar
                # compute distance as L2 norm of reward coefficients (t^(i) in paper):
                distance = np.linalg.norm(reward_coefficients, ord=2)

            elif self.config['mode'] == 'svm':
                # svm mode:
                # create quadratic programming problem definition:
                weights = cvx.Variable(feature_counts.shape[1])
                bias = cvx.Variable()
                objective = cvx.Minimize(cvx.norm(weights, 2))
                constraints = [
                    cvx.multiply(labels,
                                 (feature_counts * weights + bias)) >= 1
                ]
                problem = cvx.Problem(objective, constraints)
                # solve quadratic program:
                problem.solve()

                if weights.value is None:
                    # TODO: we need to handle empty solution better.
                    raise RuntimeError(
                        'Empty solution set for linearly separable SVM.')

                if self.config['verbose']:
                    # print support vectors
                    # (which previous iterations were relevant for the current result?)
                    svm_classifications = feature_counts.dot(
                        weights.value) + bias.value
                    support_vectors = np.where(
                        np.isclose(np.abs(svm_classifications), 1))[0]
                    print('The support vectors are from iterations number ' +
                          str(support_vectors))

                reward_coefficients = weights.value
                distance = 2 / problem.value

            else:
                raise NotImplementedError()

            if self.config['verbose']:
                print('Distance: ' + str(distance))

            self.distances.append(distance)

            # create new reward function with current coefficient estimate
            reward_function = FeatureBasedRewardFunction(
                self.env, reward_coefficients)
            # update reward function
            assert isinstance(self.env, RewardWrapper)
            self.env.update_reward_function(reward_function)

            # TODO: see messages with max about order of training & deducing
            # check stopping criterion:
            if distance <= self.config['epsilon']:
                if self.config['verbose']:
                    print("Feature counts matched within " +
                          str(self.config['epsilon']) + ".")
                break

            # create new RL-agent
            agent = self.rl_alg_factory(self.env)
            # train agent (with new reward function)
            agent.train(no_rl_episodes_per_irl_iteration)

            evaluation_input = {
                'irl_agent': agent,
                'irl_reward': reward_function
            }
            self.evaluate_metrics(evaluation_input)

        return reward_function, agent
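In SVM mode, minimising the L2 norm of the weights subject to the labelled margin constraints yields a margin of 2 / ||w||, which is exactly the quantity stored in distance. A toy cvxpy check on linearly separable 2D points (same solver pattern as above, but with the modern @ operator and made-up data):

import cvxpy as cvx
import numpy as np

points = np.array([[2.0, 2.0], [2.0, 3.0], [-2.0, -2.0], [-3.0, -2.0]])
labels = np.array([1.0, 1.0, -1.0, -1.0])

weights = cvx.Variable(2)
bias = cvx.Variable()
problem = cvx.Problem(
    cvx.Minimize(cvx.norm(weights, 2)),
    [cvx.multiply(labels, points @ weights + bias) >= 1])
problem.solve()

margin = 2 / problem.value  # width of the separating margin
print(weights.value, bias.value, margin)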
Example 16
    def __init__(self,
                 env,
                 expert_trajs,
                 rl_alg_factory,
                 baseline_agent=None,
                 gamma=.8,
                 horizon=20,
                 delta=.05,
                 eps=None):
        '''Set environment, RL agent factory, expert trajectories, and parameters.
        Args:
        env -- wrapped environment; unwrap_env must get envs of the following
               types from it: gym.Env, FeatureWrapper, RewardWrapper
        expert_trajs -- `list` of expert trajectories
        rl_alg_factory -- function that takes an environment
                          and returns an RL agent
        baseline_agent -- `RLAlgorithm`, used to get non-optimal trajectories.
                          If None, a RandomAgent will be used.
        gamma -- `float`, discount factor; note that large values won't work
                 well for environments like FrozenLake where discounting is the
                 only incentive to quickly reach the goal state
        horizon -- `int`, fixed length of trajectories to be considered
        delta -- confidence that feature count difference between output policy
                 and expert policy is less than 2 * epsilon
        eps -- `float` or None; if None, then epsilons will be calculated
                    to guarantee matching expert feature counts within epsilon
                    with confidence delta (via Hoeffding's inequality). But
                    this requires the range of feature values.
        NOTE: Performance of the algorithm might depend on epsilons in a way
        I don't currently understand, as the epsilons occur in the expression
        used to approximate the relevant subgradient (ibid., p. 187, eq. 7).
        '''
        # Initialize base class and put remaining args into attributes.
        super(RelEnt, self).__init__(env, expert_trajs, rl_alg_factory)
        self.gamma = gamma
        self.horizon = horizon
        self.delta = delta

        # Compute remaining attributes.
        # Set gym.Env and FeatureWrapper envs as attributes.
        self.env_gym = unwrap_env(self.env, gym.Env)
        self.env_feat = unwrap_env(self.env, FeatureWrapper)
        self.env_rew = unwrap_env(self.env, RewardWrapper)
        if baseline_agent is not None:
            self.baseline_agent = baseline_agent
        else:
            self.baseline_agent = RandomAgent(self.env_gym)
        # Set expert trajs, and features.
        self.n_trajs = len(self.expert_trajs)
        self.n_features = self.env_feat.feature_shape()[0]
        assert isinstance(self.n_features, int)  # Should be dim of vector.
        # Initialize random reward function.
        self.reward_function = FeatureBasedRewardFunction(
            self.env_rew, np.random.randn(self.n_features))
        # Calculate expert feature counts.
        self.expert_feature_count = self.feature_count(self.expert_trajs)
        # Set tolerance epsilon (one per feature) for not matching
        # expert feature counts.
        self.epsilons = np.zeros(self.n_features)
        if eps is not None:
            self.epsilons = eps
        else:
            # Calculate epsilons via Hoeffding (ibid., p. 184).
            max_features = self.env_feat.feature_range()[1]
            min_features = self.env_feat.feature_range()[0]
            self.epsilons = max_features - min_features
            scale = np.sqrt(-np.log(1 - self.delta) / (2 * self.n_trajs))
            scale *= (self.gamma**(self.horizon + 1) - 1) / (self.gamma - 1)
            self.epsilons *= scale
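For a concrete sense of how tight the Hoeffding-based tolerance is: with the defaults delta = 0.05, gamma = 0.8, horizon = 20 and, say, 1000 expert trajectories, the scale factor applied to the feature range works out as below (illustrative numbers, not taken from the library):

import numpy as np

delta, gamma, horizon, n_trajs = 0.05, 0.8, 20, 1000
scale = np.sqrt(-np.log(1 - delta) / (2 * n_trajs))
scale *= (gamma ** (horizon + 1) - 1) / (gamma - 1)
print(scale)  # about 0.025, i.e. epsilons of roughly 2.5% of each feature's range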
Example 17
def reward_function_factory(env):
    params = np.zeros(64)
    params[-1] = 1.
    return FeatureBasedRewardFunction(env, params)
def test_value_iteration():
    # gamma = 1.0
    env = gym.make('FrozenLake-v0')
    agent = ValueIteration(env, {'gamma': 1.0})
    agent.train(10)
    state_values = agent.state_values
    assert isinstance(state_values, np.ndarray)
    assert state_values.shape == (17, )
    # argmax should be state just before frisbee
    # (15 is final state, 16 is absorbing state)
    assert np.argmax(state_values) == 14
    assert state_values[14] > 0.93 and state_values[14] < 0.95
    assert state_values[15] == 0

    # gamma = 0.9
    env = gym.make('FrozenLake-v0')
    agent = ValueIteration(env, {'gamma': 0.9})
    agent.train(10)
    state_values = agent.state_values
    assert isinstance(state_values, np.ndarray)
    assert state_values.shape == (17, )
    # argmax should be state just before frisbee
    # (15 is final state, 16 is absorbing state)
    assert np.argmax(state_values) == 14
    assert state_values[14] > 0.63 and state_values[14] < 0.65
    # holes and frisbee should have zero value:
    for i in [5, 7, 11, 12, 15]:
        assert state_values[i] == 0

    # check some q values:
    # go right in second to last state
    assert np.argmax(agent.q_values[14, :]) == 1
    assert np.min(agent.q_values) == 0
    assert np.max(agent.q_values) <= 1

    # check policy:
    for i in range(16):
        assert np.isclose(np.sum(agent.policy(i)), 1.)
        assert np.min(agent.policy(i)) >= 0.
        assert np.argmax(agent.q_values[i, :]) == np.argmax(agent.policy(i))

    # check softmax policy
    old_state_values = agent.state_values
    old_q_values = agent.q_values
    agent = ValueIteration(env, {'gamma': 0.9, 'temperature': 0.1})
    agent.train(10)
    assert np.all(agent.state_values <= old_state_values)
    # at least initial state should now have lower value:
    assert agent.state_values[0] < old_state_values[0]
    assert np.all(agent.q_values <= old_q_values)
    # check policy:
    for i in range(16):
        assert np.isclose(np.sum(agent.policy(i)), 1.)
        assert np.min(agent.policy(i)) >= 0.
        assert np.argmax(agent.q_values[i, :]) == np.argmax(agent.policy(i))
        # ordering of probabilities should stay the same with softmax
        assert np.all(
            np.argsort(old_q_values[i, :]) == np.argsort(agent.policy(i)))

    # test policy array:
    policy_array = agent.policy_array()
    assert policy_array.shape == (17, 4)
    for i in range(16):
        assert np.all(agent.policy(i) == policy_array[i, :])

    # check if true reward isn't leaked:
    env = feature_wrapper.make('FrozenLake-v0')
    reward_function = FeatureBasedRewardFunction(env, np.zeros(16))
    env = RewardWrapper(env, reward_function)
    agent = ValueIteration(env, {})
    agent.train(10)
    # With an all-zero reward function, no state should have nonzero value.
    assert np.all(agent.state_values == 0)
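The state values asserted in this test come from the standard Bellman backup that value iteration performs. A minimal, self-contained sketch of that backup for a generic tabular MDP (the transition tensor P and state reward vector r are assumptions, not taken from the test):

import numpy as np

def tabular_value_iteration(P, r, gamma=0.9, n_iterations=1000):
    """P: (n_states, n_actions, n_states) transition probabilities, r: (n_states,) state rewards."""
    n_states, n_actions, _ = P.shape
    v = np.zeros(n_states)
    q = np.zeros((n_states, n_actions))
    for _ in range(n_iterations):
        q = r[:, None] + gamma * P.dot(v)  # expected return of each state-action pair
        v = q.max(axis=1)                  # greedy backup
    return v, q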
Example 19
import gym
import numpy as np

from irl_benchmark.irl.algorithms.maxent.me_irl import MaxEnt
from irl_benchmark.irl.collect import collect_trajs
from irl_benchmark.irl.feature.feature_wrapper import FrozenLakeFeatureWrapper
from irl_benchmark.irl.reward.reward_function import FeatureBasedRewardFunction
from irl_benchmark.irl.reward.reward_wrapper import RewardWrapper
from irl_benchmark.metrics.inverse_learning_error import ILE
from irl_benchmark.rl.algorithms.value_iteration import ValueIteration
from irl_benchmark.utils.utils import get_transition_matrix

store_to = 'data/frozen/expert/'
no_episodes = 1000
max_steps_per_episode = 1000

env = gym.make('FrozenLake8x8-v0')
env = FrozenLakeFeatureWrapper(env)
initial_reward_function_estimate = FeatureBasedRewardFunction(
    env, np.zeros(64))
env = RewardWrapper(env, initial_reward_function_estimate)

# Generate expert trajectories.
expert_agent = ValueIteration(env)
print('Training expert agent...')
expert_agent.train(10)
expert_trajs = collect_trajs(env, expert_agent, no_episodes,
                             max_steps_per_episode, store_to)

feat_map = np.eye(64)

transition_dynamics = get_transition_matrix(env)


def rl_alg_factory(env):
Example 20
def maze_world_0(env):
    parameters = np.array(
        [REWARD_MOVE, REWARD_SMALL, REWARD_MEDIUM, REWARD_LARGE])
    print('Create env for true reward function')
    return FeatureBasedRewardFunction(env, parameters, action_in_domain=True)
Example 21
def test_random_featb_function():
    env = make_wrapped_env('FrozenLake-v0', with_feature_wrapper=True)
    rf = FeatureBasedRewardFunction(env, 'random')