Example 1
def main():
    """Main function"""

    # Test BV MaxLikelihood IRL on ElementWorld
    import numpy as np

    from scipy.optimize import minimize

    from mdp_extras import vi, BoltzmannExplorationPolicy

    # from multimodal_irl.envs import CanonicalPuddleWorldEnv, puddle_world_extras
    from multimodal_irl.envs import ElementWorldEnv, element_world_extras

    # env = CanonicalPuddleWorldEnv(wind=0.0)
    # xtr, phi, gt_rewards = puddle_world_extras(env)
    # reward = gt_rewards["dry"]

    env = ElementWorldEnv(wind=0.1, num_elements=3)
    xtr, phi, gt_rewards = element_world_extras(env)
    reward = gt_rewards[0]
    print(reward.theta)

    scale = 2.0
    _, q_star = vi(xtr, phi, reward)
    pi_star = BoltzmannExplorationPolicy(q_star, scale=scale)
    demo_star = pi_star.get_rollouts(env, 10)
    phi_bar_star = phi.demo_average(demo_star, xtr.gamma)
    print(phi_bar_star)

    x0 = np.zeros(len(phi))
    bounds = np.array([(-10.0, 0.0) for _ in range(len(phi))])
    res = minimize(
        bv_maxlikelihood_irl,
        x0,
        args=(xtr, phi, demo_star, None, scale),
        jac=True,
        bounds=bounds,
        options=dict(disp=True),
    )

    print(res)

Example 2
def bv_maxlikelihood_irl(
    x,
    xtr,
    phi,
    rollouts,
    weights=None,
    boltzmann_scale=0.5,
    qge_tol=1e-3,
    nll_only=False,
):
    """Compute the average rollout Negative Log Likelihood (and gradient) for ML-IRL

    This method is biased to prefer shorter paths through any MDP.

    TODO ajs 29/Oct/2020 Support SoftMax Q function from Babes-Vroman 2011 paper via
        nb_smq_value_iteration()

    Args:
        x (numpy array): Current reward function parameter vector estimate
        xtr (mdp_extras.DiscreteExplicitExtras): Extras object for the MDP being
            optimized
        phi (mdp_extras.FeatureFunction): Feature function to use with linear reward
            parameters. We require len(phi) == len(x).
        rollouts (list): List of (s, a) rollouts.

        weights (numpy array): Optional path weights for weighted IRL problems
        boltzmann_scale (float): Optimality parameter for Boltzmann policy. Babes-Vroman
            use 0.5. Values closer to 1.0 cause slower convergence, but values closer to
            0 model the demonstrations as being non-expert. Empirically I find 0.2 works
            well.
        qge_tol (float): Tolerance for q-gradient estimation.
        nll_only (bool): If true, only return NLL
    """

    if weights is None:
        weights = np.ones(len(rollouts)) / len(rollouts)

    # Compute Q*, pi* for current reward guess
    reward = Linear(x)
    _, q_star = vi(xtr, phi, reward)

    # To use the soft Q function from Babes-Vroman's paper, uncomment below
    # q_star = nb_smq_value_iteration(
    #     xtr.t_mat, xtr.gamma, *reward.structured(xtr, phi), boltzmann_scale
    # )
    pi = BoltzmannExplorationPolicy(q_star, scale=boltzmann_scale)

    if not nll_only:
        # Get Q* gradient for current reward parameters
        dq_dtheta = q_grad_fpi(reward.theta, xtr, phi, tol=qge_tol)

    # Sweep demonstrated state-action pairs
    nll = 0
    nll_grad = np.zeros_like(x)
    num_sa_samples = 0
    for path, weight in zip(rollouts, weights):
        for s, a in path[:-1]:
            num_sa_samples += 1
            ell_theta = pi.prob_for_state_action(s, a)

            # Accumulate negative log likelihood of demonstration data
            nll += -1 * weight * np.log(ell_theta)

            if not nll_only:
                expected_action_grad = np.sum(
                    [
                        pi.prob_for_state_action(s, b) * dq_dtheta[s, b, :]
                        for b in xtr.actions
                    ],
                    axis=0,
                )
                dl_dtheta = boltzmann_scale * (expected_action_grad -
                                               dq_dtheta[s, a, :])
                nll_grad += weight * dl_dtheta

    # Convert NLL and gradient to average, not sum
    # This makes for consistent magnitude values regardless of dataset size
    nll /= len(rollouts)
    nll_grad /= len(rollouts)

    if nll_only:
        return nll
    else:
        return nll, nll_grad
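For reference, with a Boltzmann policy pi_theta(a|s) proportional to exp(beta * Q_theta(s, a)),
the per state-action terms accumulated in the loop above correspond to (a sketch of the
underlying algebra, writing beta for boltzmann_scale):

    -\log \pi_\theta(a \mid s) = -\beta \, Q_\theta(s, a) + \log \sum_b \exp\big(\beta \, Q_\theta(s, b)\big)

    \nabla_\theta \big[ -\log \pi_\theta(a \mid s) \big]
        = \beta \Big( \sum_b \pi_\theta(b \mid s) \, \nabla_\theta Q_\theta(s, b) - \nabla_\theta Q_\theta(s, a) \Big)

i.e. dl_dtheta above is boltzmann_scale times (the policy-expected Q-gradient minus the
Q-gradient of the demonstrated action).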
Example 3
def maxlikelihood_ml_path(xtr,
                          phi,
                          reward,
                          start,
                          goal,
                          max_path_length,
                          boltzmann_scale=0.5):
    """Find the ML path from s1 to sg under a MaxLikelihood model

    If transitions can inccur +ve rewards the returned paths may contain loops

    NB ajs 14/Jan/2020 The log likelihood of the path that we compute internally
        is fine for doing viterbi ML path inference, but it's not the actual path
        log likelihood - it's not normalized, and the gamma time offset
        is incorrect (depending on what start time the Viterbi alg picks).

    Args:
        xtr (DiscreteExplicitExtras): MDP Extras object
        phi (FeatureFunction): MDP Feature function
        reward (Linear): Linear reward function
        start (int): Starting state
        goal (int): End state
        max_path_length (int): Maximum allowable path length to search

        boltzmann_scale (float): Boltzmann scale parameter

    Returns:
        (list): Maximum Likelihood path from start to goal under the given reward
            function, or None if no path is possible
    """

    _, q_star = vi(xtr, phi, reward)

    # Initialize an SxA LL Viterbi trellis
    sa_lls = np.zeros(
        (len(xtr.states), len(xtr.actions), max_path_length)) - np.inf
    for a in xtr.actions:
        sa_lls[goal, a, :] = boltzmann_scale * q_star[goal, a]

    # Suppress divide by zero - we take logs of many zeroes here
    with np.errstate(divide="ignore"):

        # Walk backward to propagate the maximum LL
        for t in range(max_path_length - 2, -1, -1):

            # Max-Reduce over actions to compute state LLs
            # (it's a max because we get to choose our actions)
            s_lls = np.max(sa_lls, axis=1)

            # Sweep end states
            for s2 in xtr.states:

                if np.isneginf(s_lls[s2, t + 1]):
                    # Skip this state - it hasn't been reached by probability messages yet
                    continue

                # Sweep actions
                for a in xtr.actions:

                    # Sweep starting states
                    for s1 in xtr.states:

                        if xtr.terminal_state_mask[s1]:
                            # We can't step forward from terminal states - skip this one
                            continue

                        transition_ll = boltzmann_scale * q_star[
                            s1, a] + np.log(xtr.t_mat[s1, a, s2])

                        if np.isneginf(transition_ll):
                            # This transition is impossible - skip
                            continue

                        # Store the max because we're after the maximum likelihood path
                        sa_lls[s1, a,
                               t] = max(sa_lls[s1, a, t],
                                        transition_ll + s_lls[s2, t + 1])

    # Max-reduce to get state/action ML trellises for convenience
    s_lls = np.max(sa_lls, axis=1)

    # Identify our starting time
    if np.isneginf(np.max(s_lls[start])):
        # There is no feasible path from start to goal within max_path_length
        return None
    start_time = np.argmax(s_lls[start, :])

    # Walk forward from start state, start time to re-construct path
    state = start
    time = start_time
    ml_path = []
    while state != goal:
        action = np.argmax(sa_lls[state, :, time])
        ml_path.append((state, action))
        successor_states = [s for (a, s) in xtr.children[state] if a == action]

        # Choose successor state with highest log likelihood at time + 1
        ml = -np.inf
        next_state = None
        for s2 in successor_states:
            s2_ll = s_lls[s2, time + 1]
            if s2_ll >= ml:
                next_state = s2
                ml = s2_ll

        state = next_state
        time = time + 1

    # Add final (goal) state
    ml_path.append((state, None))

    return ml_path
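A minimal usage sketch for maxlikelihood_ml_path, assuming the ElementWorld helpers used in
the other examples; the start/goal choice below (most likely initial state, first terminal
state) is illustrative only:

import numpy as np

from multimodal_irl.envs import ElementWorldEnv, element_world_extras

env = ElementWorldEnv(wind=0.1, num_elements=3)
xtr, phi, gt_rewards = element_world_extras(env)

start = int(np.argmax(xtr.p0s))                 # most likely starting state
goal = int(np.argmax(xtr.terminal_state_mask))  # first terminal state
path = maxlikelihood_ml_path(xtr, phi, gt_rewards[0], start, goal, max_path_length=30)
print(path)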
Example 4
def main():
    """Main function"""

    import numpy as np
    from scipy.stats import dirichlet, multinomial, multivariate_normal, norm

    from mdp_extras import vi, OptimalPolicy
    from multimodal_irl.envs.element_world import ElementWorldEnv, element_world_extras

    # ch_dpm_birl and cluster_compaction are assumed to be importable from the
    # DPM-BIRL module shown in a later example

    demos_per_mode = 10

    num_elements = 4
    env = ElementWorldEnv(num_elements=num_elements, wind=0.1)
    xtr, phi, rewards_gt = element_world_extras(env)
    demos = []
    for reward in rewards_gt:
        _, q_star = vi(xtr, phi, reward)
        pi_star = OptimalPolicy(q_star)
        cluster_demos = pi_star.get_rollouts(env, demos_per_mode)
        demos.extend(cluster_demos)

    print("GT Rewards:")
    print(np.array([r.theta for r in rewards_gt]))

    # Algorithm hyper-params
    concentration = 10.0
    max_initial_clusters = num_elements
    reward_mean = np.mean(rewards_gt[0].theta) * np.ones_like(
        rewards_gt[0].theta)
    reward_covariance = np.var(rewards_gt[0].theta) * np.ones_like(
        rewards_gt[0].theta)
    max_iterations = 10000
    reward_bounds = (-10.0, 0.0)

    # Initialise clusters
    initial_clusters = []
    for cluster_idx, cluster_size in enumerate(
            multinomial(
                len(demos),
                *dirichlet(
                    np.ones(max_initial_clusters) * concentration /
                    max_initial_clusters).rvs(1),
            ).rvs(1)[0]):
        for _ in range(cluster_size):
            initial_clusters.append(cluster_idx)
    initial_clusters = cluster_compaction(np.array(initial_clusters))
    num_initial_clusters = len(set(initial_clusters))

    # Define the reward prior and associated access functions
    reward_prior = multivariate_normal(mean=reward_mean, cov=reward_covariance)
    reward_prior_sample = lambda: reward_prior.rvs(1)
    reward_prior_log_pdf = lambda theta: reward_prior.logpdf(theta)
    reward_prior_log_pdf_grad = lambda theta: np.array(
        [(1.0 / norm(mu_d, sigma_d).pdf(theta_d)) *
         (-1.0 / sigma_d / np.sqrt(2 * np.pi)) * ((theta_d - mu_d) / sigma_d) *
         (np.exp(-0.5 * ((theta_d - mu_d) / sigma_d)**2)) for reward_dim,
         (theta_d, mu_d, sigma_d) in enumerate(
             zip(theta, reward_mean, reward_covariance))]
    )  # Believe it or not, this computes the derivative of the multivariate normal log pdf (with diagonal covariance)
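    # Note: the expression above simplifies algebraically to the closed-form gradient of a
    # diagonal-covariance Gaussian log pdf; an equivalent one-liner would be
    #   reward_prior_log_pdf_grad = lambda theta: -(np.asarray(theta) - reward_mean) / reward_covariance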

    # Initialise rewards
    initial_rewards = np.clip(reward_prior.rvs(num_initial_clusters),
                              *reward_bounds)
    if len(set(initial_clusters)) == 1:
        initial_rewards = np.array([initial_rewards])

    # Set initial clusters and rewards to ground truth
    # print("Initialising with ground truth rewards and clusters")
    # initial_clusters = np.concatenate(
    #     [[i] * demos_per_mode for i in range(num_elements)]
    # )
    # initial_rewards = np.array([r.theta for r in rewards_gt])

    with np.errstate(divide="raise", over="raise", invalid="raise"):
        rewards_learned = ch_dpm_birl(
            xtr,
            phi,
            demos,
            initial_clusters,
            initial_rewards,
            max_iterations,
            reward_prior_sample,
            reward_prior_log_pdf,
            reward_prior_log_pdf_grad,
            dirichlet_prior_concentration=concentration,
            reward_bounds=reward_bounds,
        )

    print("Learned Rewards:")
    print(rewards_learned)

    print("Done")
Example 5
def main():

    import gym
    import random
    import itertools as it

    import numpy as np
    import scipy as sp
    import scipy.stats  # makes sp.stats available below

    from mdp_extras import (
        vi,
        OptimalPolicy,
        Indicator,
        Linear,
        BoltzmannExplorationPolicy,
    )
    from mdp_extras.envs import nchain_extras

    n = 2
    env = gym.make("NChain-v0", n=n)
    xtr, phi, reward_gt = nchain_extras(env, gamma=0.9)

    rollouts = OptimalPolicy(vi(xtr, phi,
                                reward_gt)[1]).get_rollouts(env,
                                                            10,
                                                            max_path_length=10)

    mean = np.zeros_like(reward_gt.theta)
    r_prior = sp.stats.multivariate_normal(mean, 5.0)

    r = r_prior.rvs(1)
    _, q = vi(xtr, phi, Linear(r))
    pi = OptimalPolicy(q, stochastic=False)
    print(r.reshape(n, -1))

    num_acceptances = 0
    for i in it.count():
        r_tilde = sp.stats.multivariate_normal(r).rvs(1)
        _, q_tilde = vi(xtr, phi, Linear(r_tilde))

        pi_actions = np.argmax([pi.prob_for_state(s) for s in xtr.states],
                               axis=1)
        q_actions = np.argmax(q_tilde, axis=1)

        if not np.all(pi_actions == q_actions):
            # We've reached a new policy equivalence class

            pib_tilde = BoltzmannExplorationPolicy(q_tilde)
            pib = BoltzmannExplorationPolicy(q)
            logprob_tilde = 0.0
            logprob = 0.0
            for p in rollouts:
                logprob_tilde += pib_tilde.path_log_action_probability(p)
                logprob += pib.path_log_action_probability(p)
            logprob_tilde += r_prior.logpdf(r_tilde)
            logprob += r_prior.logpdf(r)

            acceptance_logprob = logprob_tilde - logprob
            acceptance_prob = np.exp(acceptance_logprob)
            acceptance_prob = min(1, acceptance_prob)

            if np.random.rand() <= acceptance_prob:
                # Accept proposal
                r = r_tilde
                q = q_tilde
                pi = OptimalPolicy(q_tilde)
                num_acceptances += 1
                print(f"Accept ({num_acceptances / (i+1) * 100}%)")
                print(r.reshape(n, -1))
            else:
                # Reject
                print(f"Reject ({num_acceptances / (i+1) * 100}%)")
                continue
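For reference, the accept/reject step above is a standard Metropolis-Hastings update with a
symmetric Gaussian proposal (so the proposal densities cancel), giving

    \alpha = \min\left(1, \frac{P(D \mid \tilde{r}) \, p(\tilde{r})}{P(D \mid r) \, p(r)}\right),
    \qquad
    P(D \mid r) = \prod_{\tau \in D} \prod_{(s, a) \in \tau} \pi^B_r(a \mid s)

where pi^B_r is the Boltzmann policy for the Q* of reward r, and p is the Gaussian reward prior.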
Example 6
def ch_dpm_birl(
        xtr,
        phi,
        demonstrations,
        initial_clusters,
        initial_rewards,
        max_iterations,
        reward_prior_sample,
        reward_prior_log_pdf,
        reward_prior_log_pdf_grad,
        dirichlet_prior_concentration=1.0,
        reward_prior_confidence=1.0,
        langevin_scale_param=0.001,
        reward_bounds=(None, None),
):
    """A Metropolis Hastings algorithm for DPM-BIRL

    TODO check this implementation against the following;
     * https://github.com/erensezener/IRL_Algorithms/tree/ea19532d61f229f03e254f1ba938deb814e2aca7/irl_multiple_experts/DPM_BIRL
     * https://github.com/s-arora-1987/sawyer_i2rl_project_workspace

    TODO ajs 3/Feb/21 This function leaks memory - after 3000 iterations it is hogging 15Gb.

    Args:
        xtr (mdp_extras.Extras): MDP Extras
        phi (mdp_extras.FeatureFunction): Feature function object
        demonstrations (list): List of (s, a) demonstration trajectories
        initial_clusters (list): Initial cluster allocations (one int for each demonstration)
        initial_rewards (numpy array): Array of initial reward parameters - one row for each initial reward function
        max_iterations (int): Maximum number of MH iterations
        reward_prior_sample (callable): A lambda that returns a single sample from the reward prior
        reward_prior_log_pdf (callable): A lambda that returns the reward prior log pdf for a reward parameter vector
        reward_prior_log_pdf_grad (callable): A lambda that returns the reward prior log pdf gradient wrt. a reward
            vector

        dirichlet_prior_concentration (float): Dirichlet distribution concentration
            parameter - set to 1.0 for uninformed prior
        reward_prior_confidence (float): Boltzmann confidence parameter for MaxLikelihood IRL
            model
        langevin_scale_param (float): Positive scale for the Langevin dynamics step size - should be
            tuned up or down until the average acceptance probability for the MH process is ~0.574.
            Acceptance probabilities are inversely proportional to this parameter's size.
        reward_bounds (tuple): Tuple of low, high reward parameter bounds. Set the respective entry to None to
            remove that reward bound.

    Returns:
        (numpy array): Cluster indices for each demonstration
        (numpy array): Array of learned reward functions
    """

    assert (len(initial_rewards.shape) >
            1), "Passed initial rewards must be two-dimensional"

    # This is the optimal acceptance ratio for Langevin dynamics MH-MCMC (see the paper by Roberts and Rosenthal).
    TARGET_REWARD_ACCEPTANCE_PROB_RATIO = 0.574

    # Sanitize initial clusters and rewards
    clusters = cluster_compaction(initial_clusters)
    print("Initial clusters:")
    print(clusters)

    rewards = np.clip(initial_rewards, *reward_bounds)
    print("Initial Rewards:")
    print(rewards)

    # Solve for Boltzmann Policy for each reward parameter
    boltzmann_policies = [
        BoltzmannExplorationPolicy(
            vi(xtr, phi, Linear(r))[1], reward_prior_confidence)
        for r in rewards
    ]

    # Loop until max number of iterations
    log_acceptance_probs = []
    for t in range(max_iterations):

        print(f"Iteration {t+1}")

        # Loop over each demonstration
        # We assume the list of clusters is always compact
        # print("Updating demonstration memberships...")
        for demo_idx, demo in enumerate(demonstrations):
            demo_cluster = clusters[demo_idx]
            demo_cluster_boltzmann_policy = boltzmann_policies[demo_cluster]

            # TODO ajs sample new cluster assignment from the full conditional posterior

            # Sample a new cluster for this trajectory from Eq 5
            cluster_probs = demo_reallocation_probabilities(
                clusters, demo_cluster, dirichlet_prior_concentration)
            demo_cluster_new = np.random.choice(list(range(
                len(cluster_probs))),
                                                p=cluster_probs)

            if demo_cluster_new == demo_cluster:
                # We didn't move the demonstration - nothing to do
                continue
            elif demo_cluster_new not in clusters:
                # We selected a new/empty cluster, sample a new reward function from the prior and apply bounds
                demo_cluster_new_reward = reward_prior_sample()
                demo_cluster_new_reward = np.clip(demo_cluster_new_reward,
                                                  *reward_bounds)
                demo_cluster_boltzmann_policy_new = BoltzmannExplorationPolicy(
                    vi(xtr, phi, Linear(demo_cluster_new_reward))[1],
                    reward_prior_confidence,
                )
            else:
                # We selected to move the demonstration to an existing cluster
                demo_cluster_new_reward = rewards[demo_cluster_new]
                demo_cluster_boltzmann_policy_new = boltzmann_policies[
                    demo_cluster_new]

            # Compute acceptance probability under Eq. 5 (in log space)
            if cluster_probs[demo_cluster_new] == 0.0:
                loga = -np.inf
            else:
                loga = np.log(
                    cluster_probs[demo_cluster_new]
                ) + demo_cluster_boltzmann_policy_new.path_log_action_probability(
                    demo)

            if cluster_probs[demo_cluster] == 0.0:
                logb = -np.inf
            else:
                logb = np.log(
                    cluster_probs[demo_cluster]
                ) + demo_cluster_boltzmann_policy.path_log_action_probability(
                    demo)

            if np.isneginf(logb):
                acceptance_logprob_ratio = np.inf
            else:
                acceptance_logprob_ratio = loga - logb
            acceptance_logprob = min(np.log(1.0), acceptance_logprob_ratio)
            acceptance_prob = np.exp(acceptance_logprob)

            # Accept/reject the new cluster+reward assignment
            if np.random.rand() <= acceptance_prob:

                if demo_cluster_new not in clusters:
                    # Accept the new cluster and reward
                    clusters[demo_idx] = demo_cluster_new

                    # We spawned a new cluster - add it to the reward list
                    rewards = np.concatenate(
                        (rewards, [demo_cluster_new_reward]), axis=0)
                    boltzmann_policies.append(
                        demo_cluster_boltzmann_policy_new)
                else:
                    # We added this trajectory to an existing cluster
                    # Accept the new cluster and reward
                    clusters[demo_idx] = demo_cluster_new

                # Run a compaction step, removing any empty clusters
                clusters, rewards, boltzmann_policies = cluster_compaction(
                    clusters, rewards, boltzmann_policies)
            else:
                # Reject the new cluster and reward - don't change anything
                continue

        print("Current clusters:", clusters)

        # print("Updating rewards...")
        for cluster_idx in range(len(set(clusters))):
            # Update reward function estimates based on current clusters
            reward = rewards[cluster_idx]
            policy = boltzmann_policies[cluster_idx]
            q_star = policy.q
            demos = [
                demonstrations[idx] for idx, c in enumerate(clusters)
                if c == cluster_idx
            ]

            # Take a single IRL step to update this reward function
            q_star_grad = q_grad_fpi(reward, xtr, phi)
            reward_log_grad = log_urpd_grad(
                xtr,
                reward,
                q_star_grad,
                demos,
                reward_prior_log_pdf_grad,
                reward_prior_confidence,
            )
            new_reward = reward_improvement_step(
                reward,
                reward_log_grad,
                langevin_scale_param,
            )

            # Apply reward constraints here - projection into a box constraint is truncation
            new_reward = np.clip(new_reward, *reward_bounds)

            # Compute acceptance probability of new reward
            _, new_q_star = vi(xtr, phi, Linear(new_reward))
            new_q_grad = q_grad_fpi(new_reward, xtr, phi)
            new_reward_log_urpd = log_urpd(
                xtr,
                new_reward,
                new_q_star,
                demos,
                reward_prior_log_pdf,
                reward_prior_confidence,
            )
            new_reward_log_grad = log_urpd_grad(
                xtr,
                new_reward,
                new_q_grad,
                demos,
                reward_prior_log_pdf_grad,
                reward_prior_confidence,
            )
            new_reward_log_g = log_g(new_reward, new_reward_log_grad, reward,
                                     langevin_scale_param)

            reward_log_urpd = log_urpd(
                xtr,
                reward,
                q_star,
                demos,
                reward_prior_log_pdf,
                reward_prior_confidence,
            )
            reward_log_g = log_g(reward, reward_log_grad, new_reward,
                                 langevin_scale_param)

            log_ratio = (new_reward_log_urpd + new_reward_log_g -
                         (reward_log_urpd + reward_log_g))
            log_acceptance_probs.append(log_ratio)
            accept_logprob = min(np.log(1.0), log_ratio)
            accept_prob = np.exp(accept_logprob)

            # print(f"R_{cluster_idx}: Log ratio is {log_ratio} ({np.exp(log_ratio)})")

            # Accept/reject new reward
            if np.random.rand() <= accept_prob:
                # Accept new reward
                # print(
                #     f"Accepting R_{cluster_idx} change from\n{reward} to\n{new_reward}"
                # )
                rewards[cluster_idx] = new_reward

                # Solve for policy under new cluster reward
                cluster_policy_new = BoltzmannExplorationPolicy(
                    new_q_star, reward_prior_confidence)
                boltzmann_policies[cluster_idx] = cluster_policy_new
            else:
                # Reject new reward
                # print(f"Rejecting change of R_{cluster_idx}")
                continue

        # Run a compaction step, removing any empty clusters
        clusters, rewards, boltzmann_policies = cluster_compaction(
            clusters, rewards, boltzmann_policies)

        print("Current rewards:")
        print(rewards)

        print(
            f"Mean acceptance ratio is {np.exp(np.mean(log_acceptance_probs))} - target is {TARGET_REWARD_ACCEPTANCE_PROB_RATIO}"
        )

    # Return learned reward ensemble
    return clusters, rewards
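For reference, the reward update above is a Metropolis-adjusted Langevin (MALA) step: the
proposal is shifted along the gradient of the un-normalized reward posterior density
(log_urpd), and the acceptance uses the asymmetric proposal density g computed by log_g
(the exact step-size parameterization is inside reward_improvement_step / log_g, which are
not shown here). The standard MALA acceptance probability this implements is

    \alpha = \min\left(1, \frac{\tilde{p}(\theta' \mid D_k) \, g(\theta \mid \theta')}{\tilde{p}(\theta \mid D_k) \, g(\theta' \mid \theta)}\right)

where theta is the current cluster reward, theta' the proposed reward, and D_k the
demonstrations currently assigned to cluster k. Tuning langevin_scale_param toward the
~0.574 target acceptance ratio is the usual MALA heuristic noted in the docstring.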
Example 7
def element_world_v4(
    num_elements,
    num_demos,
    demo_skew,
    num_clusters,
    wind,
    algorithm,
    initialisation,
    width,
    gamma,
    max_demonstration_length,
    reward_range,
    num_init_restarts,
    em_nll_tolerance,
    em_resp_tolerance,
    max_iterations,
    boltzmann_scale,
    skip_ml_paths,
    reward_initialisation,
    _log,
    _seed,
    _run,
):
    """ElementWorld Sacred Experiment"""

    # Construct EW
    _log.info(f"{_seed}: Preparing environment...")
    env = ElementWorldEnv(width=width,
                          num_elements=num_elements,
                          wind=wind,
                          gamma=gamma)
    xtr, phi, gt_rewards = element_world_extras(env)

    mode_proportions = geometric_distribution(demo_skew, num_elements)
    demos_per_mode = np.floor(mode_proportions * num_demos)

    # Ensure every mode has at least 1 demo
    demos_per_mode = np.maximum(demos_per_mode, 1)

    # Ensure correct number of demos are present
    while np.sum(demos_per_mode) > num_demos:
        demos_per_mode[np.argmax(demos_per_mode)] -= 1
    while np.sum(demos_per_mode) < num_demos:
        demos_per_mode[np.argmin(demos_per_mode)] += 1

    # Convert to int
    demos_per_mode = demos_per_mode.astype(int)

    # Solve, get train dataset
    train_demos = []
    train_gt_resp = []
    for ri, (reward,
             num_element_demos) in enumerate(zip(gt_rewards, demos_per_mode)):
        resp_row = np.zeros(num_elements)
        resp_row[ri] = 1.0
        for _ in range(num_element_demos):
            train_gt_resp.append(resp_row)
        _, q_star = vi(xtr, phi, reward)
        # pi_star = OptimalPolicy(q_star)
        pi_star = BoltzmannExplorationPolicy(q_star, scale=boltzmann_scale)
        train_demos.extend(
            pi_star.get_rollouts(env,
                                 num_element_demos,
                                 max_path_length=max_demonstration_length))
    train_gt_resp = np.array(train_gt_resp)
    train_gt_mixture_weights = np.sum(train_gt_resp, axis=0) / num_demos

    # Solve, get test dataset
    test_demos = []
    test_gt_resp = []
    for ri, (reward,
             num_element_demos) in enumerate(zip(gt_rewards, demos_per_mode)):
        resp_row = np.zeros(num_elements)
        resp_row[ri] = 1.0
        for _ in range(num_element_demos):
            test_gt_resp.append(resp_row)
        _, q_star = vi(xtr, phi, reward)
        # pi_star = OptimalPolicy(q_star)
        pi_star = BoltzmannExplorationPolicy(q_star, scale=boltzmann_scale)
        test_demos.extend(
            pi_star.get_rollouts(env,
                                 num_element_demos,
                                 max_path_length=max_demonstration_length))
    test_gt_resp = np.array(test_gt_resp)
    test_gt_mixture_weights = np.sum(test_gt_resp, axis=0) / num_demos

    if reward_initialisation == "MLE":
        # We use the current IRL model for Maximum Likelihood initialisation of the
        # reward parameters
        if algorithm == "MaxEnt":
            solver = MaxEntEMSolver()
            xtr_p, train_demos_p = padding_trick(xtr, train_demos)
            _, test_demos_p = padding_trick(xtr, test_demos)
        elif algorithm == "MaxLik":
            solver = MaxLikEMSolver()
            xtr_p = xtr
            train_demos_p = train_demos
            test_demos_p = test_demos
        elif algorithm == "SigmaGIRL":
            solver = SigmaGIRLEMSolver()
            xtr_p = xtr
            train_demos_p = train_demos
            test_demos_p = test_demos
        else:
            raise ValueError
    elif reward_initialisation == "MeanOnly":
        # We use a 'mean only' solver to do the reward initialisation
        solver = MeanOnlyEMSolver()
        xtr_p = xtr
        train_demos_p = train_demos
        test_demos_p = test_demos
    else:
        raise ValueError()

    # Initialize Mixture
    t0 = datetime.now()
    if initialisation == "Random":
        # Initialize uniformly at random
        init_mode_weights, init_rewards = solver.init_random(
            phi, num_clusters, reward_range)
    elif initialisation == "KMeans":
        # Initialize with K-Means (hard) clustering
        init_mode_weights, init_rewards = solver.init_kmeans(
            xtr_p, phi, train_demos_p, num_clusters, reward_range,
            num_init_restarts)
    elif initialisation == "GMM":
        # Initialize with GMM (soft) clustering
        init_mode_weights, init_rewards = solver.init_gmm(
            xtr_p, phi, train_demos_p, num_clusters, reward_range,
            num_init_restarts)
    elif initialisation == "Supervised":
        # We always have uniform clusters in supervised experiments
        assert num_clusters == num_elements

        if isinstance(solver, MaxEntEMSolver):
            # Apply padding trick
            xtr_p, train_demos_p = padding_trick(xtr, train_demos)

            # Learn rewards with ground truth responsibility matrix
            learn_rewards = solver.mstep(xtr_p, phi, train_gt_resp,
                                         train_demos_p, reward_range)

            # Compute baseline NLL
            mixture_nll = solver.mixture_nll(xtr_p, phi,
                                             train_gt_mixture_weights,
                                             learn_rewards, train_demos_p)
        elif isinstance(solver, MaxLikEMSolver):

            # Learn rewards with ground truth responsibility matrix
            learn_rewards = solver.mstep(xtr, phi, train_gt_resp, train_demos,
                                         reward_range)

            # Compute baseline NLL
            mixture_nll = solver.mixture_nll(xtr, phi,
                                             train_gt_mixture_weights,
                                             learn_rewards, train_demos)

        else:
            raise ValueError()

        # No initial solution for supervised experiment
        init_resp = None
        init_mode_weights = None
        init_rewards = None
        init_eval = None
        init_eval_train = None

        # Skip BV training
        train_iterations = np.nan
        resp_history = [train_gt_resp]
        mode_weights_history = [train_gt_mixture_weights]
        rewards_history = [learn_rewards]
        nll_history = [mixture_nll]
        train_reason = "Supervised baseline mixture - no training needed"

    else:
        raise ValueError

    def post_em_iteration(solver, iteration, resp, mode_weights, rewards, nll,
                          nll_delta, resp_delta):
        _log.info(f"{_seed}: Iteration {iteration} ended")
        _run.log_scalar("training.nll", nll)
        _run.log_scalar("training.nll_delta", nll_delta)
        _run.log_scalar("training.resp_delta", resp_delta)
        for mw_idx, mw in enumerate(mode_weights):
            _run.log_scalar(f"training.mw{mw_idx+1}", mw)
        for reward_idx, reward in enumerate(rewards):
            for theta_idx, theta_val in enumerate(reward.theta):
                _run.log_scalar(f"training.r{reward_idx+1}.t{theta_idx+1}",
                                theta_val)

    _log.info(
        f"{_seed}: Initialisation done - switching to MLE reward model for EM alg"
    )
    if algorithm == "MaxEnt":
        solver = MaxEntEMSolver(post_it=post_em_iteration)
        xtr_p, train_demos_p = padding_trick(xtr, train_demos)
        _, test_demos_p = padding_trick(xtr, test_demos)
    elif algorithm == "MaxLik":
        solver = MaxLikEMSolver(post_it=post_em_iteration)
        xtr_p = xtr
        train_demos_p = train_demos
        test_demos_p = test_demos
    elif algorithm == "SigmaGIRL":
        solver = SigmaGIRLEMSolver(post_it=post_em_iteration)
        xtr_p = xtr
        train_demos_p = train_demos
        test_demos_p = test_demos
    else:
        raise ValueError

    # Evaluate the initial mixture and run EM loop
    if initialisation != "Supervised":
        # Get initial responsibility matrix
        init_resp = solver.estep(xtr_p, phi, init_mode_weights, init_rewards,
                                 test_demos_p)
        init_resp_train = solver.estep(xtr_p, phi, init_mode_weights,
                                       init_rewards, train_demos_p)

        # Evaluate initial mixture
        _log.info(f"{_seed}: Evaluating initial solution (test set)")
        init_eval = element_world_eval(
            xtr,
            phi,
            test_demos,
            test_gt_resp,
            test_gt_mixture_weights,
            gt_rewards,
            init_resp,
            init_mode_weights,
            init_rewards,
            solver,
            skip_ml_paths,
        )
        _log.info(f"{_seed}: Evaluating initial solution (train set)")
        init_eval_train = element_world_eval(
            xtr,
            phi,
            train_demos,
            train_gt_resp,
            train_gt_mixture_weights,
            gt_rewards,
            init_resp_train,
            init_mode_weights,
            init_rewards,
            solver,
            skip_ml_paths,
            non_data_perf=init_eval,
        )

        # MI-IRL algorithm
        _log.info(f"{_seed}: BV-EM Loop")
        (
            train_iterations,
            resp_history,
            mode_weights_history,
            rewards_history,
            nll_history,
            train_reason,
        ) = bv_em(
            solver,
            xtr_p,
            phi,
            train_demos_p,
            num_clusters,
            reward_range,
            mode_weights=init_mode_weights,
            rewards=init_rewards,
            nll_tolerance=em_nll_tolerance,
            resp_tolerance=em_resp_tolerance,
            max_iterations=max_iterations,
        )
        _log.info(f"{_seed}: BV-EM Loop terminated, reason = {train_reason}")

    t1 = datetime.now()
    learn_resp = resp_history[-1]
    learn_mode_weights = mode_weights_history[-1]
    learn_rewards = rewards_history[-1]
    learn_nll = nll_history[-1]
    train_duration = (t1 - t0).total_seconds()

    # Derive Responsibility matrix for test paths
    learn_resp_test = solver.estep(xtr_p, phi, learn_mode_weights,
                                   learn_rewards, test_demos_p)

    # Evaluate final mixture
    _log.info(f"{_seed}: Evaluating final mixture (test set)")
    learn_eval = element_world_eval(
        xtr,
        phi,
        test_demos,
        test_gt_resp,
        test_gt_mixture_weights,
        gt_rewards,
        learn_resp_test,
        learn_mode_weights,
        learn_rewards,
        solver,
        skip_ml_paths,
    )
    _log.info(f"{_seed}: Evaluating final mixture (train set)")
    learn_eval_train = element_world_eval(
        xtr,
        phi,
        train_demos,
        train_gt_resp,
        train_gt_mixture_weights,
        gt_rewards,
        learn_resp,
        learn_mode_weights,
        learn_rewards,
        solver,
        skip_ml_paths,
        non_data_perf=learn_eval,
    )

    out_str = (
        "{}: Finished after {} iterations ({}) =============================\n"
        "NLL: {:.2f} -> {:.2f} (train), {:.2f} -> {:.2f} (test)\n"
        "ANID: {:.2f} -> {:.2f} (train), {:.2f} -> {:.2f} (test)\n"
        "EVD: {:.2f} -> {:.2f} (train), {:.2f} -> {:.2f} (test)\n"
        "Mode Weights: {} -> {}\n"
        "===================================================\n".format(
            _seed,
            train_iterations,
            train_reason,
            np.nan if init_eval_train is None else init_eval_train["nll"],
            learn_eval_train["nll"],
            np.nan if init_eval is None else init_eval["nll"],
            learn_eval["nll"],
            np.nan if init_eval_train is None else init_eval_train["anid"],
            learn_eval_train["anid"],
            np.nan if init_eval is None else init_eval["anid"],
            learn_eval["anid"],
            np.nan if init_eval_train is None else init_eval_train["mcf_evd"],
            learn_eval_train["mcf_evd"],
            np.nan if init_eval is None else init_eval["mcf_evd"],
            learn_eval["mcf_evd"],
            init_mode_weights,
            learn_mode_weights,
        ))
    print(out_str, flush=True)

    # Dump experimental results to artifact
    _log.info(f"{_seed}: Done...")
    result_fname = f"{_seed}.result"
    with open(result_fname, "wb") as file:
        pickle.dump(
            {
                # Initial soln
                "init_resp": [] if init_resp is None else init_resp.tolist(),
                "init_mode_weights": []
                if init_mode_weights is None else init_mode_weights.tolist(),
                "init_rewards": [] if init_rewards is None else
                [np.array(r.theta).tolist() for r in init_rewards],
                "init_eval": {} if init_eval is None else init_eval,
                "init_eval_train":
                {} if init_eval_train is None else init_eval_train,
                # Final soln
                "learn_resp":
                learn_resp.tolist(),
                "learn_mode_weights":
                learn_mode_weights.tolist(),
                "learn_rewards":
                [np.array(r.theta).tolist() for r in learn_rewards],
                "learn_eval":
                learn_eval,
                "learn_eval_train":
                learn_eval_train,
                # Training details
                "train_iterations":
                train_iterations,
                "train_duration":
                train_duration,
                "resp_history":
                np.array(resp_history).tolist(),
                "mode_weights_history":
                np.array(mode_weights_history).tolist(),
                "rewards_history":
                np.array([[r.theta for r in r1r2r3]
                          for r1r2r3 in rewards_history]).tolist(),
                "nll_history":
                np.array(nll_history).tolist(),
                "train_reason":
                train_reason,
            },
            file,
        )
    _run.add_artifact(result_fname)
    os.remove(result_fname)

    _log.info(f"{_seed}: Done")

    return float(learn_nll)
Example 8
def canonical_puddle_world(
    transition_type,
    environment,
    gt_num_clusters,
    tr_rollouts_per_mode,
    te_rollouts_per_mode,
    algorithm,
    initialisation,
    num_init_restarts,
    num_clusters,
    reward_range,
    tolerance,
    _log,
    _run,
):

    _log.info("Loading...")

    if transition_type == "Stochastic":
        wind = 0.2
    elif transition_type == "Deterministic":
        wind = 0.0
    else:
        raise ValueError

    if environment == "CanonicalPuddleWorld":
        env = CanonicalPuddleWorldEnv(wind=wind)
    else:
        raise ValueError

    xtr, phi, gt_rewards = puddle_world_extras(env)
    gt_rewards = list(gt_rewards.values())

    if gt_num_clusters == 3:
        pass
    elif gt_num_clusters == 2:
        # Drop 'any' mode
        gt_rewards = gt_rewards[:gt_num_clusters]
    else:
        raise ValueError

    # Get rollouts
    q_stars = []
    pi_stars = []
    tr_rollouts_structured = []
    tr_rollouts = []
    te_rollouts_structured = []
    te_rollouts = []
    for reward in gt_rewards:
        # Get Q* function
        _, q_star = vi(xtr, phi, reward=reward)
        q_stars.append(q_star)

        # Get optimal stochastic policy
        pi_star = OptimalPolicy(q_star)
        pi_stars.append(pi_star)

        # Sample training rollouts from optimal policy
        _tr_rollouts = pi_star.get_rollouts(env, tr_rollouts_per_mode)
        tr_rollouts_structured.append(_tr_rollouts)
        tr_rollouts.extend(_tr_rollouts)

        # Sample distinct testing rollouts from optimal policy
        _te_rollouts = pi_star.get_rollouts(env, te_rollouts_per_mode)
        te_rollouts_structured.append(_te_rollouts)
        te_rollouts.extend(_te_rollouts)

    # Get solver object
    if algorithm == "MaxEnt":
        solver = MaxEntEMSolver()

        # Apply padding trick
        xtr_p, tr_rollouts_p = padding_trick(xtr, tr_rollouts)

    elif algorithm == "MaxLik":
        solver = MaxLikEMSolver()

        # Dummy padded variables
        xtr_p = xtr
        tr_rollouts_p = tr_rollouts

    else:
        raise ValueError

    # Lambda to get ground truth responsibility matrix
    gt_resp = lambda k, rpm: (np.concatenate(
        [np.repeat([np.eye(k)[r, :]], rpm, 0) for r in range(k)],
        0,
    ))

    def eval_clustering(
        gt_resp,
        learned_resp,
    ):
        """Evaluate a mixture model's clustering performance"""

        # Compute cluster metrics
        nid = normalized_information_distance(gt_resp, learned_resp)
        anid = adjusted_normalized_information_distance(gt_resp, learned_resp)

        return nid, anid

    def eval_rewards(
        gt_mode_weights,
        gt_rewards,
        learned_mode_weights,
        learned_rewards,
    ):
        """Evaluate a mixture model's reward performance"""
        gt_num_clusters = len(gt_mode_weights)
        num_clusters = len(learned_mode_weights)

        # Compute reward recovery metrics
        ile_mat = np.zeros((num_clusters, gt_num_clusters))
        evd_mat = np.zeros((num_clusters, gt_num_clusters))
        for learned_mode_idx in range(num_clusters):
            for gt_mode_idx in range(gt_num_clusters):
                ile, evd = ile_evd(
                    xtr,
                    phi,
                    gt_rewards[gt_mode_idx],
                    learned_rewards[learned_mode_idx],
                )
                ile_mat[learned_mode_idx, gt_mode_idx] = ile
                evd_mat[learned_mode_idx, gt_mode_idx] = evd
        mcf_ile, mcf_ile_flowdict = min_cost_flow_error_metric(
            learned_mode_weights, gt_mode_weights, ile_mat)
        mcf_evd, mcf_evd_flowdict = min_cost_flow_error_metric(
            learned_mode_weights, gt_mode_weights, evd_mat)

        return (mcf_ile, mcf_ile_flowdict, mcf_evd, mcf_evd_flowdict)

    _log.info("Initialising...")
    t0 = datetime.now()

    if initialisation == "Random":
        # Initialize uniformly at random
        st_mode_weights, st_rewards = solver.init_random(
            phi, num_clusters, reward_range)
    elif initialisation == "KMeans":
        # Initialize with K-Means (hard) clustering
        st_mode_weights, st_rewards = solver.init_kmeans(
            xtr_p, phi, tr_rollouts_p, num_clusters, reward_range,
            num_init_restarts)
    elif initialisation == "GMM":
        # Initialize with GMM (soft) clustering
        st_mode_weights, st_rewards = solver.init_gmm(xtr_p, phi,
                                                      tr_rollouts_p,
                                                      num_clusters,
                                                      reward_range,
                                                      num_init_restarts)
    elif initialisation == "Baseline":

        # This is a baseline experiment - simply set the clusters to the ground truth
        # model

        # We always have uniform clusters in these experiments
        assert num_clusters == gt_num_clusters

        # Use ground truth responsibility matrix and cluster weights
        _resp = gt_resp(num_clusters, tr_rollouts_per_mode)
        st_mode_weights = np.sum(_resp, axis=0) / len(_resp)

        # Learn rewards with ground truth responsibility matrix
        st_rewards = solver.mstep(xtr_p, phi, _resp, tr_rollouts_p,
                                  reward_range)

        # Compute baseline NLL
        _nll = solver.mixture_nll(xtr_p, phi, st_mode_weights, st_rewards,
                                  tr_rollouts_p)

        iterations = 0
        tr_resp_history = [_resp]
        mode_weights_history = [st_mode_weights]
        rewards_history = [st_rewards]
        tr_nll_history = [_nll]
        reason = "Baseline model - EM loop skipped"

        # No initial solution for baseline models
        init_nid = np.nan
        init_anid = np.nan
        init_nll = np.nan
        init_mcf_ile = np.nan
        init_mcf_ile_flowdict = {}
        init_mcf_evd = np.nan
        init_mcf_evd_flowdict = {}

    else:
        raise ValueError

    if initialisation != "Baseline":
        _log.info("Evaluating initial solution...")

        init_learned_resp = solver.estep(xtr_p, phi, st_mode_weights,
                                         st_rewards, te_rollouts)
        init_nid, init_anid = eval_clustering(
            gt_resp(gt_num_clusters, te_rollouts_per_mode), init_learned_resp)
        init_nll = solver.mixture_nll(xtr_p, phi, st_mode_weights, st_rewards,
                                      te_rollouts)
        (
            init_mcf_ile,
            init_mcf_ile_flowdict,
            init_mcf_evd,
            init_mcf_evd_flowdict,
        ) = eval_rewards(
            np.ones(gt_num_clusters) / gt_num_clusters,
            gt_rewards,
            st_mode_weights,
            st_rewards,
        )

    _log.info("Solving...")
    if initialisation != "Baseline":
        (
            iterations,
            tr_resp_history,
            mode_weights_history,
            rewards_history,
            tr_nll_history,
            reason,
        ) = bv_em(
            solver,
            xtr_p,
            phi,
            tr_rollouts_p,
            num_clusters,
            reward_range,
            mode_weights=st_mode_weights,
            rewards=st_rewards,
            nll_tolerance=tolerance,
        )

    t1 = datetime.now()

    # Log training progress after experiment - timestamps will be wrong
    for it in range(iterations + 1):
        _run.log_scalar("training.mode_weights",
                        mode_weights_history[it].tolist())
        _run.log_scalar("training.rewards",
                        [r.theta.tolist() for r in rewards_history[it]])
        _run.log_scalar("training.nll", float(tr_nll_history[it]))

    tr_learned_resp = tr_resp_history[-1]
    learned_mode_weights = mode_weights_history[-1]
    learned_rewards = rewards_history[-1]
    tr_nll = tr_nll_history[-1]
    duration = (t1 - t0).total_seconds()

    _log.info("Evaluating...")

    # Evaluate training set clustering
    tr_nid, tr_anid = eval_clustering(
        gt_resp(gt_num_clusters, tr_rollouts_per_mode), tr_learned_resp)
    # Evaluate test set clustering
    te_learned_resp = solver.estep(xtr_p, phi, learned_mode_weights,
                                   learned_rewards, te_rollouts)
    te_nid, te_anid = eval_clustering(
        gt_resp(gt_num_clusters, te_rollouts_per_mode), te_learned_resp)
    # Evaluate test set NLL
    te_nll = solver.mixture_nll(xtr_p, phi, learned_mode_weights,
                                learned_rewards, te_rollouts)

    # Evaluate reward performance
    (mcf_ile, mcf_ile_flowdict, mcf_evd, mcf_evd_flowdict) = eval_rewards(
        np.ones(gt_num_clusters) / gt_num_clusters,
        gt_rewards,
        learned_mode_weights,
        learned_rewards,
    )

    _log.info("Done...")

    return {
        # Mixture Initialization
        "st_mode_weights":
        st_mode_weights.tolist(),
        "st_rewards": [st_r.theta.tolist() for st_r in st_rewards],
        "st_nll":
        float(tr_nll_history[0]),
        #
        # Initial solution evaluation
        "init_nid":
        init_nid,
        "init_anid":
        init_anid,
        "init_nll":
        init_nll,
        "init_mcf_ile":
        init_mcf_ile,
        "init_mcf_ile_flowdict":
        init_mcf_ile_flowdict,
        "init_mcf_evd":
        init_mcf_evd,
        "init_mcf_evd_flowdict":
        init_mcf_evd_flowdict,
        #
        # Learned model
        "iterations":
        int(iterations),
        "duration":
        float(duration),
        "learned_mode_weights":
        learned_mode_weights.tolist(),
        "learned_rewards":
        [learned_r.theta.tolist() for learned_r in learned_rewards],
        "reason":
        reason,
        #
        # Training set performance
        "tr_learned_resp":
        tr_learned_resp.tolist(),
        "tr_nll":
        float(tr_nll),
        "tr_normalized_information_distance":
        float(tr_nid),
        "tr_normalized_information_distance_adjusted":
        float(tr_anid),
        #
        # Test set performance
        "te_learned_resp":
        te_learned_resp.tolist(),
        "te_nll":
        float(te_nll),
        "te_normalized_information_distance":
        float(te_nid),
        "te_normalized_information_distance_adjusted":
        float(te_anid),
        #
        # Reward performance
        "min_cost_flow_ile":
        float(mcf_ile),
        "min_cost_flow_ile_flow":
        mcf_ile_flowdict,
        "min_cost_flow_evd":
        float(mcf_evd),
        "min_cost_flow_evd_flow":
        mcf_evd_flowdict,
    }
Example 9
def main():
    """Main function"""

    import gym
    import warnings

    import numpy as np
    import pandas as pd
    import seaborn as sns
    import matplotlib.pyplot as plt

    from tqdm import tqdm
    from scipy.optimize import minimize

    from mdp_extras import (
        vi,
        OptimalPolicy,
        padding_trick,
        UniformRandomPolicy,
        PaddedMDPWarning,
        Linear,
    )
    from mdp_extras.envs import nchain_extras, frozen_lake_extras

    from unimodal_irl import sw_maxent_irl, sw_modelfree_maxent_irl, mean_ci, ile_evd

    n = 5
    env = gym.make("NChain-v0", n=n)
    xtr, phi, reward_gt = nchain_extras(env, gamma=0.9)

    _, q_star = vi(xtr, phi, reward_gt)
    pi_star = OptimalPolicy(q_star)
    max_path_length = 10

    num_paths = 40
    demo_star = pi_star.get_rollouts(env,
                                     num_paths,
                                     max_path_length=max_path_length)
    phi_bar = phi.demo_average(demo_star, xtr.gamma)

    with warnings.catch_warnings():
        warnings.filterwarnings(action="ignore", category=PaddedMDPWarning)
        # Compute ground truth values
        gt_nll, gt_grad = sw_maxent_irl(reward_gt.theta, xtr, phi, phi_bar,
                                        max_path_length)

    print(f"GT: {gt_nll:.3f} {gt_grad}")

    pi_ref = UniformRandomPolicy(len(xtr.actions))

    # num_reference_paths = 20
    for num_reference_paths in 2**np.arange(13):

        nll_errs = []
        grad_errs = []
        for rep in range(100):

            pi_ref_demos = []
            for _ in range(num_reference_paths):
                path_len = np.random.randint(1, max_path_length + 1)
                pi_ref_demos.extend(
                    pi_ref.get_rollouts(env, 1, max_path_length=path_len))

            nll, grad = sw_modelfree_maxent_irl(
                reward_gt.theta,
                xtr,
                phi,
                phi_bar,
                max_path_length,
                pi_ref,
                pi_ref_demos,
                nll_only=False,
            )
            # print(f"IS: {nll:.3f} {grad}")
            nll_err = np.sqrt((nll - gt_nll)**2)
            grad_err = np.linalg.norm(gt_grad - grad)
            nll_errs.append(nll_err)
            grad_errs.append(grad_err)

        print(
            f"IS ({num_reference_paths}): {np.mean(nll_errs):.3f} {np.mean(grad_errs):.3f}"
        )

    print("Howedy")
Example 10
def ile_evd(
    xtr,
    phi,
    reward_gt,
    reward_test,
    *,
    p=1,
    vi_kwargs={},
    policy_kwargs={},
    pe_kwargs={},
    ret_gt_value=False,
    gt_policy_value=None,
):
    """Find Inverse Learning Error and Expected Value Difference metrics

    Inverse Learning Error is defined in "Inverse reinforcement learning in partially
    observable environments" by Choi and Kim, 2011.

    Expected Value Difference is defined in "Nonlinear inverse reinforcement learning
    with Gaussian processes" by Levine et al., 2011. EVD is essentially a weighted
    version of ILE that only considers states with non-zero starting probability.

    Args:
        xtr (mdp_extras.DiscreteExplicitExtras): MDP extras object
        phi (mdp_extras.FeatureFunction): Feature function for MDP
        reward_gt (mdp_extras.RewardFunction): Ground Truth reward function for MDP
        reward_test (mdp_extras.RewardFunction): Learned reward function for MDP

        p (int): p-Norm to use for ILE, Choi and Kim and other papers recommend p=1
        vi_kwargs (dict): Extra keyword args for mdp_extras.vi Value Iteration method
        policy_kwargs (dict): Extra keyword args for mdp_extras.OptimalPolicy
        pe_kwargs (dict): Extra keyword args for mdp_extras.pi_eval Policy Evaluation method
        ret_gt_value (bool): If true, also return the GT policy state value function,
            can be used for speeding up future calls
        gt_policy_value (numpy array): Optional ground truth policy state value function
            - used for speeding up this function with multiple calls

    Returns:
        (float): Inverse Learning Error metric
        (float): Expected Value Difference metric
    """

    if gt_policy_value is None:
        # Get GT policy state value function
        gt_policy_value, _ = vi(xtr, phi, reward_gt, **vi_kwargs)

    # Get test policy state value function under GT reward
    v_star_test, q_star_test = vi(xtr, phi, reward_test, **vi_kwargs)
    pi_star_test = OptimalPolicy(q_star_test, stochastic=False, **policy_kwargs)
    test_policy_value = pi_eval(xtr, phi, reward_gt, pi_star_test, **pe_kwargs)

    value_delta = gt_policy_value - test_policy_value
    ile = np.linalg.norm(value_delta, ord=p)
    evd = xtr.p0s @ value_delta

    if evd < 0:
        warnings.warn(
            f"EVD is < 0 (by {0 - evd}) - possible loss of accuracy due to numerical rounding"
        )
        evd = 0.0
    if ile < 0:
        warnings.warn(
            f"ILE is < 0 (by {0 - ile}) - possible loss of accuracy due to numerical rounding"
        )
        ile = 0.0

    if not ret_gt_value:
        return ile, evd
    else:
        return ile, evd, gt_policy_value
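A minimal usage sketch for ile_evd, assuming the ElementWorld helpers from the earlier
examples; treating one ground truth mode as the 'learned' reward is illustrative only:

from multimodal_irl.envs import ElementWorldEnv, element_world_extras

env = ElementWorldEnv(wind=0.1, num_elements=3)
xtr, phi, gt_rewards = element_world_extras(env)

# Score ground truth mode 1 against mode 0
ile, evd = ile_evd(xtr, phi, gt_rewards[0], gt_rewards[1])
print(f"ILE: {ile:.3f}, EVD: {evd:.3f}")

# Cache the GT policy value to speed up repeated comparisons against the same GT reward
ile, evd, gt_value = ile_evd(xtr, phi, gt_rewards[0], gt_rewards[1], ret_gt_value=True)
ile2, evd2 = ile_evd(xtr, phi, gt_rewards[0], gt_rewards[2], gt_policy_value=gt_value)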