Example 1
    def generate_traj_if_not_exists(self, evaluation_input: dict):
        assert 'irl_agent' in evaluation_input.keys()
        if 'irl_trajs' not in evaluation_input:
            print('generating new trajs for metrics')
            evaluation_input['irl_trajs'] = collect_trajs(
                self.env, evaluation_input['irl_agent'], 100)
        else:
            print('reuse generated trajs for metric')
        return evaluation_input['irl_trajs']
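A minimal usage sketch of the caching behaviour above; the metric object (`metric`) and the trained agent (`trained_agent`) are hypothetical stand-ins, only the method itself comes from the code above:

# Hypothetical usage: 'metric' exposes the method above, 'trained_agent' is
# any trained agent. Trajectories are collected once and then reused.
evaluation_input = {'irl_agent': trained_agent}
trajs = metric.generate_traj_if_not_exists(evaluation_input)        # collects 100 trajs
trajs_again = metric.generate_traj_if_not_exists(evaluation_input)  # returns the cached list
assert trajs is trajs_again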
Example 2
def test_tabular_function():
    def reward_function_factory(env):
        params = np.zeros(64)
        params[-1] = 1.
        return TabularRewardFunction(env, params)

    env = make_wrapped_env('FrozenLake8x8-v0',
                           reward_function_factory=reward_function_factory,
                           with_model_wrapper=True)
    agent = ValueIteration(env)
    agent.train(1)
    trajs = collect_trajs(env, agent, 10)
    for traj in trajs:
        for i in range(len(traj['rewards'])):
            assert np.isclose(traj['rewards'][i], traj['true_rewards'][i])
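For context, the tabular parameterization in this test is just a per-state lookup, so `params[-1] = 1.` assigns reward 1 to state 63, the goal tile of FrozenLake8x8, and 0 everywhere else. A minimal NumPy sketch of that lookup (independent of the library's `TabularRewardFunction`):

import numpy as np

params = np.zeros(64)
params[-1] = 1.

def tabular_reward(state: int) -> float:
    # A tabular reward is a lookup indexed by the discrete state.
    return params[state]

assert tabular_reward(63) == 1. and tabular_reward(0) == 0.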
Example 3
    def train(self,
              step_size=1e-2,
              time_limit=60,
              n_trajs=10000,
              verbose=False):
        '''Train for at most time_limit seconds with n_trajs non-expert trajs.

        Args:
        step_size -- `float`, size of each gradient ascent step
        time_limit -- `int`, number of seconds to train
        n_trajs -- `int`, number of non-expert trajs to be collected
        verbose -- `bool`, if true print gradient norms and reward weights

        Returns nothing.
        '''
        t0 = time.time()
        reward_coefficients = self.reward_function.parameters
        trajs = collect_trajs(self.env, self.baseline_agent, n_trajs,
                              self.horizon)

        # Estimate subgradient based on collected trajectories, then
        # update reward coefficients.
        if verbose:
            print('Starting subgradient ascent...')
        iteration_counter = 0
        while time.time() < t0 + time_limit:
            # when debugging with pdb, replace the while condition above with:
            #     for _ in range(50):
            subgrads = self.subgradients(trajs, reward_coefficients)
            reward_coefficients += step_size * subgrads
            reward_coefficients /= np.linalg.norm(reward_coefficients)
            iteration_counter += 1
            if verbose and iteration_counter < 10:
                print('ITERATION ' + str(iteration_counter) + ' grad norm: ' +
                      str(np.linalg.norm(subgrads)))
                print('ITERATION ' + str(iteration_counter) +
                      ' reward coefficients: ' + str(reward_coefficients))
        if verbose:
            print('Final reward coefficients: ' + str(reward_coefficients))

        self.reward_function = FeatureBasedRewardFunction(
            self.env_rew, reward_coefficients)
        self.env_rew.update_reward_function(self.reward_function)
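The loop above is normalized subgradient ascent: step along the subgradient, then rescale the reward coefficients to unit L2 norm. A stripped-down sketch of just that update, with a dummy subgradient standing in for `self.subgradients(trajs, reward_coefficients)`:

import numpy as np

def ascent_step(coefficients, subgradient, step_size=1e-2):
    # One subgradient ascent step followed by projection onto the unit sphere.
    coefficients = coefficients + step_size * subgradient
    return coefficients / np.linalg.norm(coefficients)

coefficients = np.full(4, 0.5)
dummy_subgradient = np.array([1., 0., 0., 0.])  # stand-in for self.subgradients(...)
coefficients = ascent_step(coefficients, dummy_subgradient)
assert np.isclose(np.linalg.norm(coefficients), 1.)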
Example 4
    def generate_traj_if_not_exists(self, evaluation_input: dict):
        """ Generate trajectories and store them in evaluation input.
        If evaluation input already contains trajectories, do nothing.

        Parameters
        ----------
        evaluation_input

        Returns
        -------
        dict
            evaluation_input
        """
        assert 'irl_agent' in evaluation_input.keys()
        if 'irl_trajs' not in evaluation_input:
            print('generating new trajs for metrics')
            evaluation_input['irl_trajs'] = collect_trajs(
                self.env, evaluation_input['irl_agent'], self.no_trajs)
        else:
            print('reuse generated trajs for metric')
        return evaluation_input['irl_trajs']
Example 5
    in the IRL loop.'''
    return TabularQ(env)


# Apprenticeship IRL assumes that rewards are linear in features.
# However, FrozenLake doesn't provide features. It is sufficiently small
# to work with tabular methods. Therefore, we just use a wrapper that uses
# a one-hot encoding of the state space as features.
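# (Illustration only, not part of this script: with FrozenLake-v0's 16
# discrete states, such a one-hot wrapper maps e.g. state 5 to np.eye(16)[5],
# a length-16 vector that is 1 at index 5 and 0 elsewhere.)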
env = feature_wrapper.make('FrozenLake-v0')

# Generate expert trajectories.
expert_agent = rl_alg_factory(env)
print('Training expert agent...')
expert_agent.train(15)
print('Done training expert')
expert_trajs = collect_trajs(env, expert_agent, no_episodes,
                             max_steps_per_episode, store_to)

# If expert data has already been generated, you can comment out the previous
# block and load the trajectories from file by uncommenting the next two lines:
# with open(store_to + 'trajs.pkl', 'rb') as f:
#     expert_trajs = pickle.load(f)

# Provide a random reward function as the initial reward estimate.
# (This probably isn't strictly required.)
reward_function = FeatureBasedRewardFunction(env, np.random.normal(size=16))
env = RewardWrapper(env, reward_function)

# Run the projection algorithm for up to 10 minutes.
appr_irl = ApprIRL(env, expert_trajs, rl_alg_factory, proj=True)
appr_irl.train(time_limit=600,
Example 6
    def train(self,
              time_limit=300,
              rl_time_per_iteration=30,
              eps=0,
              no_trajs=1000,
              max_steps_per_episode=1000,
              verbose=False):
        '''Accumulate feature counts and estimate reward function.

        Args:
          time_limit: total training time in seconds.
          rl_time_per_iteration: RL training time per iteration in seconds.
          eps: terminate if distance to expert feature counts is below eps.
          no_trajs: number of trajectories to collect per iteration.
          max_steps_per_episode: maximum number of steps per episode.
          verbose: more verbose prints at runtime if true.

        Returns nothing.
        '''
        t0 = time.time()

        if verbose:
            alg_mode = 'projection' if self.proj else 'SVM'
            print('Running Apprenticeship IRL in mode: ' + alg_mode)

        # start with random agent:
        agent = RandomAgent(self.env)

        iteration_counter = 0
        while time.time() < t0 + time_limit:
            iteration_counter += 1
            if verbose:
                print('ITERATION ' + str(iteration_counter))
            trajs = collect_trajs(self.env, agent,
                                  no_episodes=no_trajs,
                                  max_steps_per_episode=max_steps_per_episode)
            if verbose:
                print('Average true reward per episode: '
                      + str(true_reward_per_traj(trajs)))
            current_feature_count = self.feature_count(trajs)
            self.feature_counts.append(current_feature_count)
            self.labels.append(-1.0)

            feature_counts = np.array(self.feature_counts)
            labels = np.array(self.labels)

            if self.proj:
                # using projection version of the algorithm
                if iteration_counter == 1:
                    feature_count_bar = feature_counts[1]
                else:
                    line = feature_counts[-1] - feature_count_bar
                    feature_count_bar += np.dot(
                        line, feature_counts[0] - feature_count_bar) / np.dot(
                            line, line) * line
                reward_coefficients = feature_counts[0] - feature_count_bar
                distance = np.linalg.norm(reward_coefficients)

            else:
                # using SVM version of the algorithm ("max-margin" in
                # the paper, not to be confused with max-margin planning)
                w = cvx.Variable(feature_counts.shape[1])
                b = cvx.Variable()

                objective = cvx.Minimize(cvx.norm(w, 2))
                constraints = [
                    cvx.multiply(labels, (feature_counts * w + b)) >= 1
                ]

                problem = cvx.Problem(objective, constraints)
                problem.solve()
                if w.value is None:
                    print('NO MORE SVM SOLUTION!!')
                    return

                y_result = feature_counts.dot(w.value) + b.value
                support_vector_rows = np.where(
                    np.isclose(np.abs(y_result), 1))[0]

                reward_coefficients = w.value
                distance = 2 / problem.value

                if verbose:
                    print('The support vectors are from iterations number ' +
                          str(support_vector_rows))
            if verbose:
                print('Reward coefficients: ' + str(reward_coefficients))
                print('Distance: ' + str(distance))

            self.distances.append(distance)

            self.reward_function = FeatureBasedRewardFunction(
                self.env, reward_coefficients)
            self.env.update_reward_function(self.reward_function)

            if distance <= eps:
                if verbose:
                    print("Feature counts matched within " + str(eps) + ".")
                break

            if time.time() + rl_time_per_iteration >= t0 + time_limit:
                break

            agent = self.rl_alg_factory(self.env)
            agent.train(rl_time_per_iteration)
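The projection branch above is the geometric update of the projection variant of apprenticeship learning (Abbeel & Ng): `feature_count_bar` is moved to the orthogonal projection of the expert feature count (`feature_counts[0]`) onto the line through the previous `feature_count_bar` and the newest non-expert feature count. A self-contained NumPy sketch of that single step, using made-up two-dimensional feature counts:

import numpy as np

mu_expert = np.array([1.0, 0.0])   # expert feature count (made-up numbers)
mu_bar = np.array([0.0, 1.0])      # previous feature_count_bar
mu_latest = np.array([0.5, 0.0])   # newest non-expert feature count

# Project mu_expert orthogonally onto the line through mu_bar and mu_latest.
line = mu_latest - mu_bar
mu_bar = mu_bar + np.dot(line, mu_expert - mu_bar) / np.dot(line, line) * line

reward_coefficients = mu_expert - mu_bar
distance = np.linalg.norm(reward_coefficients)  # t^(i) in the paper
print(reward_coefficients, distance)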
Example 7
# Run this script to generate all expert data.

# FROZEN LAKE:
env = feature_wrapper.make('FrozenLake-v0')


def rl_alg_factory(env):
    return ValueIteration(env, {'gamma': 0.9})


expert_agent = rl_alg_factory(env)
expert_agent.train(None)
expert_trajs = collect_trajs(env,
                             expert_agent,
                             10000,
                             None,
                             'data/frozen/expert/',
                             verbose=True)

# FROZEN LAKE 8x8:
env = feature_wrapper.make('FrozenLake8x8-v0')


def rl_alg_factory(env):
    return ValueIteration(env, {'gamma': 0.9})


expert_agent = rl_alg_factory(env)
expert_agent.train(None)
expert_trajs = collect_trajs(env,
                             expert_agent,
Example 8
    def train(self, no_irl_iterations: int,
              no_rl_episodes_per_irl_iteration: int,
              no_irl_episodes_per_irl_iteration: int
              ) -> Tuple[BaseRewardFunction, BaseRLAlgorithm]:
        """Train the apprenticeship learning IRL algorithm.

        Parameters
        ----------
        no_irl_iterations: int
            The number of iterations for which the algorithm should be run.
        no_rl_episodes_per_irl_iteration: int
            The number of episodes the RL algorithm is allowed to run in
            each iteration of the IRL algorithm.
        no_irl_episodes_per_irl_iteration: int
            The number of episodes permitted to be run in each iteration
            to update the current reward estimate (e.g. to estimate state
            frequencies of the currently optimal policy).

        Returns
        -------
        Tuple[BaseRewardFunction, BaseRLAlgorithm]
            The estimated reward function and an RL agent trained for this
            estimate.
        """

        # Initialize training with a random agent.
        agent = RandomAgent(self.env)

        irl_iteration_counter = 0
        while irl_iteration_counter < no_irl_iterations:
            irl_iteration_counter += 1

            if self.config['verbose']:
                print('IRL ITERATION ' + str(irl_iteration_counter))

            # Estimate feature count of current agent.
            trajs = collect_trajs(
                self.env,
                agent,
                no_trajectories=no_irl_episodes_per_irl_iteration)
            current_feature_count = self.feature_count(
                trajs, gamma=self.config['gamma'])

            # add new feature count to list of feature counts
            self.feature_counts.append(current_feature_count)
            # for SVM mode:
            self.labels.append(-1.)

            # convert to numpy array:
            feature_counts = np.array(self.feature_counts)
            labels = np.array(self.labels)

            # update reward coefficients based on mode specified in config:
            if self.config['mode'] == 'projection':
                # projection mode:
                if irl_iteration_counter == 1:
                    # initialize feature_count_bar in first iteration
                    # set to first non-expert feature count:
                    feature_count_bar = feature_counts[1]
                else:
                    # not first iteration.
                    # calculate line through last feature_count_bar and
                    # last non-expert feature count:
                    line = feature_counts[-1] - feature_count_bar
                    # new feature_count_bar is orthogonal projection of
                    # expert's feature count onto the line:
                    feature_count_bar += np.dot(
                        line, feature_counts[0] - feature_count_bar) / np.dot(
                            line, line) * line
                reward_coefficients = feature_counts[0] - feature_count_bar
                # compute distance as L2 norm of reward coefficients (t^(i) in paper):
                distance = np.linalg.norm(reward_coefficients, ord=2)

            elif self.config['mode'] == 'svm':
                # svm mode:
                # create quadratic programming problem definition:
                weights = cvx.Variable(feature_counts.shape[1])
                bias = cvx.Variable()
                objective = cvx.Minimize(cvx.norm(weights, 2))
                constraints = [
                    cvx.multiply(labels,
                                 (feature_counts * weights + bias)) >= 1
                ]
                problem = cvx.Problem(objective, constraints)
                # solve quadratic program:
                problem.solve()

                if weights.value is None:
                    # TODO: we need to handle empty solution better.
                    raise RuntimeError(
                        'Empty solution set for linearly separable SVM.')

                if self.config['verbose']:
                    # print support vectors
                    # (which past iterations were relevant to the current result?)
                    svm_classifications = feature_counts.dot(
                        weights.value) + bias.value
                    support_vectors = np.where(
                        np.isclose(np.abs(svm_classifications), 1))[0]
                    print('The support vectors are from iterations number ' +
                          str(support_vectors))

                reward_coefficients = weights.value
                distance = 2 / problem.value

            else:
                raise NotImplementedError()

            if self.config['verbose']:
                print('Distance: ' + str(distance))

            self.distances.append(distance)

            # create new reward function with current coefficient estimate
            reward_function = FeatureBasedRewardFunction(
                self.env, reward_coefficients)
            # update reward function
            assert isinstance(self.env, RewardWrapper)
            self.env.update_reward_function(reward_function)

            # TODO: see messages with max about order of training & deducing
            # check stopping criterion:
            if distance <= self.config['epsilon']:
                if self.config['verbose']:
                    print("Feature counts matched within " +
                          str(self.config['epsilon']) + ".")
                break

            # create new RL-agent
            agent = self.rl_alg_factory(self.env)
            # train agent (with new reward function)
            agent.train(no_rl_episodes_per_irl_iteration)

            evaluation_input = {
                'irl_agent': agent,
                'irl_reward': reward_function
            }
            self.evaluate_metrics(evaluation_input)

        return reward_function, agent
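In SVM mode, the quadratic program above is a hard-margin SVM separating the expert feature count (stored first) from the non-expert feature counts (each appended with label -1); the learned weights become the reward coefficients, and the margin 2 / ||w|| is used as the distance. A standalone sketch with made-up feature counts; note that recent cvxpy releases expect `@` instead of `*` for the matrix-vector product used in the constraint above:

import cvxpy as cvx
import numpy as np

# Row 0: expert feature count (label +1); remaining rows: non-expert (-1).
feature_counts = np.array([[1.0, 0.9],
                           [0.1, 0.2],
                           [0.3, 0.1]])   # made-up numbers
labels = np.array([1.0, -1.0, -1.0])

weights = cvx.Variable(feature_counts.shape[1])
bias = cvx.Variable()
objective = cvx.Minimize(cvx.norm(weights, 2))
constraints = [cvx.multiply(labels, feature_counts @ weights + bias) >= 1]
problem = cvx.Problem(objective, constraints)
problem.solve()

reward_coefficients = weights.value
distance = 2 / problem.value  # margin between expert and non-expert feature counts
print(reward_coefficients, distance)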
Example 9
        return TabularQ(env)


# RelEnt IRL assumes that rewards are linear in features.
# However, FrozenLake doesn't provide features. It is sufficiently small
# to work with tabular methods. Therefore, we just use a wrapper that uses
# a one-hot encoding of the state space as features.
env = gym.make('FrozenLake-v0')
env = FrozenLakeFeatureWrapper(env)

# Generate expert trajectories.
expert_agent = rl_alg_factory(env, lp=True)
print('Training expert agent...')
expert_agent.train(600)
print('Done training expert')
expert_trajs = collect_trajs(env, expert_agent, no_episodes,
                             max_steps_per_episode, store_to)
expert_performance = avg_undiscounted_return(expert_trajs)
print('The expert reached the goal in ' + str(expert_performance) +
      ' of trajs.')

# If expert data has already been generated, you can comment out the previous
# block and load the trajectories from file by uncommenting the next two lines:
# with open(store_to + 'trajs.pkl', 'rb') as f:
#     expert_trajs = pickle.load(f)

# Provide a random reward function as the initial reward estimate.
# (This probably isn't strictly required.)
n_features = unwrap_env(env, FeatureWrapper).feature_shape()[0]
reward_function = FeatureBasedRewardFunction(env,
                                             np.random.normal(size=n_features))
Example 10
# TODO: this is an example for Sayan, delete later.

store_to = 'data/frozen/expert/'
no_episodes = 500


def rl_alg_factory(env):
    return ValueIteration(env, {'gamma': 0.9})


env = feature_wrapper.make('FrozenLake-v0')

expert_agent = rl_alg_factory(env)
expert_agent.train(15)
expert_trajs = collect_trajs(env, expert_agent, no_episodes, None, store_to)

# wrap env in random reward function to prevent leaking true reward:
reward_function = FeatureBasedRewardFunction(env, np.random.normal(size=16))
env = RewardWrapper(env, reward_function)

# Run apprenticeship IRL for up to 50 IRL iterations.
irl_config = {'gamma': 0.9, 'verbose': True}
appr_irl = ApprIRL(env, expert_trajs, rl_alg_factory, irl_config)
reward_function, rl_agent = appr_irl.train(
    no_irl_iterations=50,
    no_rl_episodes_per_irl_iteration=no_episodes,
    no_irl_episodes_per_irl_iteration=no_episodes)

print(reward_function.parameters)