def main(): """Main function""" # Test BV on PuddleWorld from scipy.optimize import minimize # from multimodal_irl.envs import CanonicalPuddleWorldEnv, puddle_world_extras from multimodal_irl.envs import ElementWorldEnv, element_world_extras # env = CanonicalPuddleWorldEnv(wind=0.0) # xtr, phi, gt_rewards = puddle_world_extras(env) # reward = gt_rewards["dry"] env = ElementWorldEnv(wind=0.1, num_elements=3) xtr, phi, gt_rewards = element_world_extras(env) reward = gt_rewards[0] print(reward.theta) scale = 2.0 _, q_star = vi(xtr, phi, reward) pi_star = BoltzmannExplorationPolicy(q_star, scale=scale) demo_star = pi_star.get_rollouts(env, 10) phi_bar_star = phi.demo_average(demo_star, xtr.gamma) print(phi_bar_star) x0 = np.zeros(len(phi)) bounds = np.array([(-10.0, 0.0) for _ in range(len(phi))]) res = minimize( bv_maxlikelihood_irl, x0, args=(xtr, phi, demo_star, None, scale), jac=True, bounds=bounds, options=dict(disp=True), ) print(res) pass
def bv_maxlikelihood_irl(
    x,
    xtr,
    phi,
    rollouts,
    weights=None,
    boltzmann_scale=0.5,
    qge_tol=1e-3,
    nll_only=False,
):
    """Compute the average rollout Negative Log Likelihood (and gradient) for ML-IRL

    This method is biased to prefer shorter paths through any MDP.

    TODO ajs 29/Oct/2020 Support SoftMax Q function from the Babes-Vroman et al. 2011
        paper via nb_smq_value_iteration()

    Args:
        x (numpy array): Current reward function parameter vector estimate
        xtr (mdp_extras.DiscreteExplicitExtras): Extras object for the MDP being
            optimized
        phi (mdp_extras.FeatureFunction): Feature function to use with linear reward
            parameters. We require len(phi) == len(x).
        rollouts (list): List of (s, a) rollouts.

        weights (numpy array): Optional path weights for weighted IRL problems
        boltzmann_scale (float): Optimality parameter for the Boltzmann policy.
            Babes-Vroman et al. use 0.5. Values closer to 1.0 cause slower convergence,
            but values closer to 0 model the demonstrations as being non-expert.
            Empirically, 0.2 works well.
        qge_tol (float): Tolerance for q-gradient estimation.
        nll_only (bool): If true, only return the NLL

    Returns:
        (float): Average NLL of the demonstration data under the current reward
        (numpy array): Gradient of the NLL wrt. the reward parameters (only returned
            if nll_only is False)
    """

    if weights is None:
        weights = np.ones(len(rollouts)) / len(rollouts)

    # Compute Q*, pi* for current reward guess
    reward = Linear(x)
    _, q_star = vi(xtr, phi, reward)
    # To use the soft Q function from Babes-Vroman's paper, uncomment below
    # q_star = nb_smq_value_iteration(
    #     xtr.t_mat, xtr.gamma, *reward.structured(xtr, phi), boltzmann_scale
    # )
    pi = BoltzmannExplorationPolicy(q_star, scale=boltzmann_scale)

    if not nll_only:
        # Get Q* gradient for current reward parameters
        dq_dtheta = q_grad_fpi(reward.theta, xtr, phi, tol=qge_tol)

    # Sweep demonstrated state-action pairs
    nll = 0
    nll_grad = np.zeros_like(x)
    num_sa_samples = 0
    for path, weight in zip(rollouts, weights):
        for s, a in path[:-1]:
            num_sa_samples += 1
            ell_theta = pi.prob_for_state_action(s, a)

            # Accumulate negative log likelihood of demonstration data
            nll += -1 * weight * np.log(ell_theta)

            if not nll_only:
                expected_action_grad = np.sum(
                    [
                        pi.prob_for_state_action(s, b) * dq_dtheta[s, b, :]
                        for b in xtr.actions
                    ],
                    axis=0,
                )
                dl_dtheta = boltzmann_scale * (
                    expected_action_grad - dq_dtheta[s, a, :]
                )
                nll_grad += weight * dl_dtheta

    # Convert NLL and gradient to an average (not a sum) so magnitudes are consistent
    # regardless of dataset size
    nll /= len(rollouts)
    nll_grad /= len(rollouts)

    if nll_only:
        return nll
    else:
        return nll, nll_grad


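# Aside (not part of the original module): a minimal sketch of one way to sanity-check
# the analytic gradient returned by bv_maxlikelihood_irl against a finite-difference
# estimate. It assumes `xtr`, `phi`, `demo_star` and `scale` have been constructed as
# in the main() function above; the helper name is hypothetical.
def _check_bv_gradient(xtr, phi, demo_star, scale=0.5):
    from scipy.optimize import check_grad

    x0 = np.zeros(len(phi))
    # check_grad compares the analytic Jacobian against central differences at x0
    err = check_grad(
        lambda x: bv_maxlikelihood_irl(x, xtr, phi, demo_star, None, scale, nll_only=True),
        lambda x: bv_maxlikelihood_irl(x, xtr, phi, demo_star, None, scale)[1],
        x0,
    )
    print(f"Finite-difference gradient error: {err:.2e}")

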
def maxlikelihood_ml_path(
    xtr, phi, reward, start, goal, max_path_length, boltzmann_scale=0.5
):
    """Find the ML path from start to goal under a MaxLikelihood model

    If transitions can incur +ve rewards the returned paths may contain loops.

    NB ajs 14/Jan/2020 The log likelihood of the path that we compute internally is
        fine for doing Viterbi ML path inference, but it's not the actual path log
        likelihood - it's not normalized, and the gamma time offset is incorrect
        (depending on what start time the Viterbi alg picks).

    Args:
        xtr (DiscreteExplicitExtras): MDP Extras object
        phi (FeatureFunction): MDP Feature function
        reward (Linear): Linear reward function
        start (int): Starting state
        goal (int): End state
        max_path_length (int): Maximum allowable path length to search

        boltzmann_scale (float): Boltzmann scale parameter

    Returns:
        (list): Maximum Likelihood path from start to goal under the given MaxEnt
            reward function, or None if no path is possible
    """

    _, q_star = vi(xtr, phi, reward)

    # Initialize an SxA LL Viterbi trellis
    sa_lls = np.zeros((len(xtr.states), len(xtr.actions), max_path_length)) - np.inf
    for a in xtr.actions:
        sa_lls[goal, a, :] = boltzmann_scale * q_star[goal, a]

    # Suppress divide by zero - we take logs of many zeroes here
    with np.errstate(divide="ignore"):

        # Walk backward to propagate the maximum LL
        for t in range(max_path_length - 2, -1, -1):

            # Max-Reduce over actions to compute state LLs
            # (it's a max because we get to choose our actions)
            s_lls = np.max(sa_lls, axis=1)

            # Sweep end states
            for s2 in xtr.states:
                if np.isneginf(s_lls[s2, t + 1]):
                    # Skip this state - it hasn't been reached by probability messages yet
                    continue

                # Sweep actions
                for a in xtr.actions:

                    # Sweep starting states
                    for s1 in xtr.states:

                        if xtr.terminal_state_mask[s1]:
                            # We can't step forward from terminal states - skip this one
                            continue

                        transition_ll = boltzmann_scale * q_star[s1, a] + np.log(
                            xtr.t_mat[s1, a, s2]
                        )
                        if np.isneginf(transition_ll):
                            # This transition is impossible - skip
                            continue

                        # Store the max because we're after the maximum likelihood path
                        sa_lls[s1, a, t] = max(
                            sa_lls[s1, a, t], transition_ll + s_lls[s2, t + 1]
                        )

    # Max-reduce to get state/action ML trellises for convenience
    s_lls = np.max(sa_lls, axis=1)

    # Identify our starting time
    if np.isneginf(np.max(s_lls[start])):
        # There is no feasible path from start to goal of length <= max_path_length
        return None
    start_time = np.argmax(s_lls[start, :])

    # Walk forward from the start state and start time to re-construct the path
    state = start
    time = start_time
    ml_path = []
    while state != goal:
        action = np.argmax(sa_lls[state, :, time])
        ml_path.append((state, action))

        successor_states = [s for (a, s) in xtr.children[state] if a == action]

        # Choose successor state with highest log likelihood at time + 1
        ml = -np.inf
        next_state = None
        for s2 in successor_states:
            s2_ll = s_lls[s2, time + 1]
            if s2_ll >= ml:
                next_state = s2
                ml = s2_ll

        state = next_state
        time = time + 1

    # Add final (goal) state
    ml_path.append((state, None))

    return ml_path


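# A minimal usage sketch (not part of the original module). Assumptions: an
# ElementWorldEnv built as in the other main() functions in this repo, and that
# xtr.states is an indexable sequence of integer state ids. It recovers the
# maximum-likelihood path between two states under a ground-truth reward.
def _demo_ml_path():
    from multimodal_irl.envs import ElementWorldEnv, element_world_extras

    env = ElementWorldEnv(wind=0.1, num_elements=3)
    xtr, phi, gt_rewards = element_world_extras(env)
    reward = gt_rewards[0]

    # Hypothetical choice of endpoints - any pair of state ids works
    start, goal = xtr.states[0], xtr.states[-1]
    path = maxlikelihood_ml_path(xtr, phi, reward, start, goal, max_path_length=20)
    if path is None:
        print("No feasible path within the length budget")
    else:
        print("ML path (state, action) pairs:", path)

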
def main(): """Main function""" from mdp_extras import vi, OptimalPolicy from multimodal_irl.envs.element_world import ElementWorldEnv, element_world_extras demos_per_mode = 10 num_elements = 4 env = ElementWorldEnv(num_elements=num_elements, wind=0.1) xtr, phi, rewards_gt = element_world_extras(env) demos = [] for reward in rewards_gt: _, q_star = vi(xtr, phi, reward) pi_star = OptimalPolicy(q_star) cluster_demos = pi_star.get_rollouts(env, demos_per_mode) demos.extend(cluster_demos) print("GT Rewards:") print(np.array([r.theta for r in rewards_gt])) # Algorithm hyper-params concentration = 10.0 max_initial_clusters = num_elements reward_mean = np.mean(rewards_gt[0].theta) * np.ones_like( rewards_gt[0].theta) reward_covariance = np.var(rewards_gt[0].theta) * np.ones_like( rewards_gt[0].theta) max_iterations = 10000 reward_bounds = (-10.0, 0.0) # Initialise clusters initial_clusters = [] for cluster_idx, cluster_size in enumerate( multinomial( len(demos), *dirichlet( np.ones(max_initial_clusters) * concentration / max_initial_clusters).rvs(1), ).rvs(1)[0]): for _ in range(cluster_size): initial_clusters.append(cluster_idx) initial_clusters = cluster_compaction(np.array(initial_clusters)) num_initial_clusters = len(set(initial_clusters)) # Define the reward prior and associated access functions reward_prior = multivariate_normal(mean=reward_mean, cov=reward_covariance) reward_prior_sample = lambda: reward_prior.rvs(1) reward_prior_log_pdf = lambda theta: reward_prior.logpdf(theta) reward_prior_log_pdf_grad = lambda theta: np.array( [(1.0 / norm(mu_d, sigma_d).pdf(theta_d)) * (-1.0 / sigma_d / np.sqrt(2 * np.pi)) * ((theta_d - mu_d) / sigma_d) * (np.exp(-0.5 * ((theta_d - mu_d) / sigma_d)**2)) for reward_dim, (theta_d, mu_d, sigma_d) in enumerate( zip(theta, reward_mean, reward_covariance))] ) # Believe it or not, this computes the derivative of the multivariate normal log pdf (with diagonal covariance) # Initialise rewards initial_rewards = np.clip(reward_prior.rvs(num_initial_clusters), *reward_bounds) if len(set(initial_clusters)) == 1: initial_rewards = np.array([initial_rewards]) # Set initial clusters and rewards to ground truth # print("Initialising with ground truth rewards and clusters") # initial_clusters = np.concatenate( # [[i] * demos_per_mode for i in range(num_elements)] # ) # initial_rewards = np.array([r.theta for r in rewards_gt]) with np.errstate(divide="raise", over="raise", invalid="raise"): rewards_learned = ch_dpm_birl( xtr, phi, demos, initial_clusters, initial_rewards, max_iterations, reward_prior_sample, reward_prior_log_pdf, reward_prior_log_pdf_grad, dirichlet_prior_concentration=concentration, reward_bounds=reward_bounds, ) print("Learned Rewards:") print(rewards_learned) print("Done")
def main():
    import gym
    import random
    import itertools as it

    from mdp_extras import (
        vi,
        OptimalPolicy,
        Indicator,
        Linear,
        BoltzmannExplorationPolicy,
    )
    from mdp_extras.envs import nchain_extras

    n = 2
    env = gym.make("NChain-v0", n=n)
    xtr, phi, reward_gt = nchain_extras(env, gamma=0.9)

    rollouts = OptimalPolicy(vi(xtr, phi, reward_gt)[1]).get_rollouts(
        env, 10, max_path_length=10
    )

    mean = np.zeros_like(reward_gt.theta)
    r_prior = sp.stats.multivariate_normal(mean, 5.0)

    r = r_prior.rvs(1)
    _, q = vi(xtr, phi, Linear(r))
    pi = OptimalPolicy(q, stochastic=False)
    print(r.reshape(n, -1))

    num_acceptances = 0
    for i in it.count():

        # Propose a new reward from a Gaussian centered on the current reward
        r_tilde = sp.stats.multivariate_normal(r).rvs(1)
        _, q_tilde = vi(xtr, phi, Linear(r_tilde))

        pi_actions = np.argmax([pi.prob_for_state(s) for s in xtr.states], axis=1)
        q_actions = np.argmax(q_tilde, axis=1)
        if not np.all(pi_actions == q_actions):
            # We've reached a new policy equivalence class
            pib_tilde = BoltzmannExplorationPolicy(q_tilde)
            pib = BoltzmannExplorationPolicy(q)

            logprob_tilde = 0.0
            logprob = 0.0
            for p in rollouts:
                logprob_tilde += pib_tilde.path_log_action_probability(p)
                logprob += pib.path_log_action_probability(p)
            logprob_tilde += r_prior.logpdf(r_tilde)
            logprob += r_prior.logpdf(r)

            acceptance_logprob = logprob_tilde - logprob
            acceptance_prob = np.exp(acceptance_logprob)
            acceptance_prob = min(1, acceptance_prob)
            if np.random.rand() <= acceptance_prob:
                # Accept proposal
                r = r_tilde
                q = q_tilde
                pi = OptimalPolicy(q_tilde)
                num_acceptances += 1
                print(f"Accept ({num_acceptances / (i + 1) * 100}%)")
                print(r.reshape(n, -1))
            else:
                # Reject
                print(f"Reject ({num_acceptances / (i + 1) * 100}%)")
                continue


def ch_dpm_birl(
    xtr,
    phi,
    demonstrations,
    initial_clusters,
    initial_rewards,
    max_iterations,
    reward_prior_sample,
    reward_prior_log_pdf,
    reward_prior_log_pdf_grad,
    dirichlet_prior_concentration=1.0,
    reward_prior_confidence=1.0,
    langevin_scale_param=0.001,
    reward_bounds=(None, None),
):
    """A Metropolis Hastings algorithm for DPM-BIRL

    TODO check this implementation against the following:
     * https://github.com/erensezener/IRL_Algorithms/tree/ea19532d61f229f03e254f1ba938deb814e2aca7/irl_multiple_experts/DPM_BIRL
     * https://github.com/s-arora-1987/sawyer_i2rl_project_workspace

    TODO ajs 3/Feb/21 This function leaks memory - after 3000 iterations it is hogging
        15Gb.

    Args:
        xtr (mdp_extras.Extras): MDP Extras
        phi (mdp_extras.FeatureFunction): Feature function object
        demonstrations (list): List of (s, a) demonstration trajectories
        initial_clusters (list): Initial cluster allocations (one int for each
            demonstration)
        initial_rewards (numpy array): Array of initial reward parameters - one row
            for each initial reward function
        max_iterations (int): Maximum number of MH iterations
        reward_prior_sample (callable): A lambda that returns a single sample from the
            reward prior
        reward_prior_log_pdf (callable): A lambda that returns the reward prior log pdf
            for a reward parameter vector
        reward_prior_log_pdf_grad (callable): A lambda that returns the reward prior
            log pdf gradient wrt. a reward vector

        dirichlet_prior_concentration (float): Dirichlet distribution concentration
            parameter - set to 1.0 for an uninformed prior
        reward_prior_confidence (float): Boltzmann confidence parameter for the
            MaxLikelihood IRL model
        langevin_scale_param (float): Positive scale for the Langevin dynamics step
            size - should be tuned down and/or up until the average acceptance
            probability for the MH process is ~0.574. Acceptance probabilities are
            inversely proportional to this parameter's size.
        reward_bounds (tuple): Tuple of low, high reward parameter bounds. Set the
            respective entry to None to remove that reward bound.

    Returns:
        (numpy array): Cluster indices for each demonstration
        (numpy array): Array of learned reward functions
    """

    assert (
        len(initial_rewards.shape) > 1
    ), "Passed initial rewards must be two-dimensional"

    # This is the optimal acceptance ratio for Langevin dynamics MH-MCMC (see paper by R & R)
    TARGET_REWARD_ACCEPTANCE_PROB_RATIO = 0.574

    # Sanitize initial clusters and rewards
    clusters = cluster_compaction(initial_clusters)
    print("Initial clusters:")
    print(clusters)
    rewards = np.clip(initial_rewards, *reward_bounds)
    print("Initial Rewards:")
    print(rewards)

    # Solve for a Boltzmann Policy for each reward parameter vector
    boltzmann_policies = [
        BoltzmannExplorationPolicy(vi(xtr, phi, Linear(r))[1], reward_prior_confidence)
        for r in rewards
    ]

    # Loop until max number of iterations
    log_acceptance_probs = []
    for t in range(max_iterations):
        print(f"Iteration {t + 1}")

        # Loop over each demonstration
        # We assume the list of clusters is always compact
        # print("Updating demonstration memberships...")
        for demo_idx, demo in enumerate(demonstrations):
            demo_cluster = clusters[demo_idx]
            demo_cluster_boltzmann_policy = boltzmann_policies[demo_cluster]

            # TODO ajs sample new cluster assignment from the full conditional posterior

            # Sample a new cluster for this trajectory from Eq 5
            cluster_probs = demo_reallocation_probabilities(
                clusters, demo_cluster, dirichlet_prior_concentration
            )
            demo_cluster_new = np.random.choice(
                list(range(len(cluster_probs))), p=cluster_probs
            )

            if demo_cluster_new == demo_cluster:
                # We didn't move the demonstration - nothing doing
                continue
            elif demo_cluster_new not in clusters:
                # We selected a new/empty cluster - sample a new reward function from
                # the prior and apply bounds
                demo_cluster_new_reward = reward_prior_sample()
                demo_cluster_new_reward = np.clip(
                    demo_cluster_new_reward, *reward_bounds
                )
                demo_cluster_boltzmann_policy_new = BoltzmannExplorationPolicy(
                    vi(xtr, phi, Linear(demo_cluster_new_reward))[1],
                    reward_prior_confidence,
                )
            else:
                # We selected to move the demonstration to an existing cluster
                demo_cluster_new_reward = rewards[demo_cluster_new]
                demo_cluster_boltzmann_policy_new = boltzmann_policies[demo_cluster_new]

            # Compute acceptance probability under Eq. 5 (in log space)
            if cluster_probs[demo_cluster_new] == 0.0:
                loga = -np.inf
            else:
                loga = np.log(
                    cluster_probs[demo_cluster_new]
                ) + demo_cluster_boltzmann_policy_new.path_log_action_probability(demo)

            if cluster_probs[demo_cluster] == 0.0:
                logb = -np.inf
            else:
                logb = np.log(
                    cluster_probs[demo_cluster]
                ) + demo_cluster_boltzmann_policy.path_log_action_probability(demo)

            if np.isneginf(logb):
                acceptance_logprob_ratio = np.inf
            else:
                acceptance_logprob_ratio = loga - logb
            acceptance_logprob = min(np.log(1.0), acceptance_logprob_ratio)
            acceptance_prob = np.exp(acceptance_logprob)

            # Accept/reject the new cluster+reward assignment
            if np.random.rand() <= acceptance_prob:
                if demo_cluster_new not in clusters:
                    # Accept the new cluster and reward
                    clusters[demo_idx] = demo_cluster_new
                    # We spawned a new cluster - add it to the reward list
                    rewards = np.concatenate(
                        (rewards, [demo_cluster_new_reward]), axis=0
                    )
                    boltzmann_policies.append(demo_cluster_boltzmann_policy_new)
                else:
                    # We added this trajectory to an existing cluster
                    # Accept the new cluster and reward
                    clusters[demo_idx] = demo_cluster_new

                # Run a compaction step, removing any empty clusters
                clusters, rewards, boltzmann_policies = cluster_compaction(
                    clusters, rewards, boltzmann_policies
                )
            else:
                # Reject the new cluster and reward - don't change anything
                continue

        print("Current clusters:", clusters)

        # print("Updating rewards...")
        for cluster_idx in range(len(set(clusters))):
            # Update reward function estimates based on current clusters
            reward = rewards[cluster_idx]
            policy = boltzmann_policies[cluster_idx]
            q_star = policy.q
            demos = [
                demonstrations[idx]
                for idx, c in enumerate(clusters)
                if c == cluster_idx
            ]

            # Take a single IRL step to update this reward function
            q_star_grad = q_grad_fpi(reward, xtr, phi)
            reward_log_grad = log_urpd_grad(
                xtr,
                reward,
                q_star_grad,
                demos,
                reward_prior_log_pdf_grad,
                reward_prior_confidence,
            )
            new_reward = reward_improvement_step(
                reward,
                reward_log_grad,
                langevin_scale_param,
            )

            # Apply reward constraints here - projection into a box constraint is truncation
            new_reward = np.clip(new_reward, *reward_bounds)

            # Compute acceptance probability of the new reward
            _, new_q_star = vi(xtr, phi, Linear(new_reward))
            new_q_grad = q_grad_fpi(new_reward, xtr, phi)

            new_reward_log_urpd = log_urpd(
                xtr,
                new_reward,
                new_q_star,
                demos,
                reward_prior_log_pdf,
                reward_prior_confidence,
            )
            new_reward_log_grad = log_urpd_grad(
                xtr,
                new_reward,
                new_q_grad,
                demos,
                reward_prior_log_pdf_grad,
                reward_prior_confidence,
            )
            new_reward_log_g = log_g(
                new_reward, new_reward_log_grad, reward, langevin_scale_param
            )

            reward_log_urpd = log_urpd(
                xtr,
                reward,
                q_star,
                demos,
                reward_prior_log_pdf,
                reward_prior_confidence,
            )
            reward_log_g = log_g(
                reward, reward_log_grad, new_reward, langevin_scale_param
            )

            log_ratio = (
                new_reward_log_urpd
                + new_reward_log_g
                - (reward_log_urpd + reward_log_g)
            )
            log_acceptance_probs.append(log_ratio)
            accept_logprob = min(np.log(1.0), log_ratio)
            accept_prob = np.exp(accept_logprob)
            # print(f"R_{cluster_idx}: Log ratio is {log_ratio} ({np.exp(log_ratio)})")

            # Accept/reject new reward
            if np.random.rand() <= accept_prob:
                # Accept new reward
                # print(
                #     f"Accepting R_{cluster_idx} change from\n{reward} to\n{new_reward}"
                # )
                rewards[cluster_idx] = new_reward

                # Solve for policy under new cluster reward
                cluster_policy_new = BoltzmannExplorationPolicy(
                    new_q_star, reward_prior_confidence
                )
                boltzmann_policies[cluster_idx] = cluster_policy_new
            else:
                # Reject new reward
                # print(f"Rejecting change of R_{cluster_idx}")
                continue

        # Run a compaction step, removing any empty clusters
        clusters, rewards, boltzmann_policies = cluster_compaction(
            clusters, rewards, boltzmann_policies
        )

        print("Current rewards:")
        print(rewards)
        print(
            f"Mean acceptance ratio is {np.exp(np.mean(log_acceptance_probs))} - "
            f"target is {TARGET_REWARD_ACCEPTANCE_PROB_RATIO}"
        )

    # Return learned reward ensemble
    return clusters, rewards


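# Aside (not part of the original module): `reward_improvement_step` and `log_g` are
# referenced above but not defined in this file. Under a standard Metropolis-Adjusted
# Langevin (MALA) scheme - which the ~0.574 target acceptance ratio suggests - the
# proposal and its log transition density would look roughly like the hypothetical
# sketch below, with `delta` playing the role of langevin_scale_param. This is an
# illustration of the general technique, not the repository's actual implementation.
def _mala_proposal(theta, log_post_grad, delta):
    # theta' = theta + (delta^2 / 2) * grad log p(theta) + delta * N(0, I)
    return (
        theta
        + 0.5 * delta ** 2 * log_post_grad
        + delta * np.random.normal(size=np.shape(theta))
    )


def _mala_log_g(theta_from, log_post_grad_from, theta_to, delta):
    # Log density of proposing theta_to from theta_from (up to an additive constant)
    mean = theta_from + 0.5 * delta ** 2 * log_post_grad_from
    return -np.sum((np.asarray(theta_to) - mean) ** 2) / (2 * delta ** 2)

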
def element_world_v4(
    num_elements,
    num_demos,
    demo_skew,
    num_clusters,
    wind,
    algorithm,
    initialisation,
    width,
    gamma,
    max_demonstration_length,
    reward_range,
    num_init_restarts,
    em_nll_tolerance,
    em_resp_tolerance,
    max_iterations,
    boltzmann_scale,
    skip_ml_paths,
    reward_initialisation,
    _log,
    _seed,
    _run,
):
    """ElementWorld Sacred Experiment"""

    # Construct EW
    _log.info(f"{_seed}: Preparing environment...")
    env = ElementWorldEnv(
        width=width, num_elements=num_elements, wind=wind, gamma=gamma
    )
    xtr, phi, gt_rewards = element_world_extras(env)

    mode_proportions = geometric_distribution(demo_skew, num_elements)
    demos_per_mode = np.floor(mode_proportions * num_demos)

    # Ensure every mode has at least 1 demo
    demos_per_mode = np.maximum(demos_per_mode, 1)

    # Ensure correct number of demos are present
    while np.sum(demos_per_mode) > num_demos:
        demos_per_mode[np.argmax(demos_per_mode)] -= 1
    while np.sum(demos_per_mode) < num_demos:
        demos_per_mode[np.argmin(demos_per_mode)] += 1

    # Convert to int
    demos_per_mode = demos_per_mode.astype(int)

    # Solve, get train dataset
    train_demos = []
    train_gt_resp = []
    for ri, (reward, num_element_demos) in enumerate(zip(gt_rewards, demos_per_mode)):
        resp_row = np.zeros(num_elements)
        resp_row[ri] = 1.0
        for _ in range(num_element_demos):
            train_gt_resp.append(resp_row)
        _, q_star = vi(xtr, phi, reward)
        # pi_star = OptimalPolicy(q_star)
        pi_star = BoltzmannExplorationPolicy(q_star, scale=boltzmann_scale)
        train_demos.extend(
            pi_star.get_rollouts(
                env, num_element_demos, max_path_length=max_demonstration_length
            )
        )
    train_gt_resp = np.array(train_gt_resp)
    train_gt_mixture_weights = np.sum(train_gt_resp, axis=0) / num_demos

    # Solve, get test dataset
    test_demos = []
    test_gt_resp = []
    for ri, (reward, num_element_demos) in enumerate(zip(gt_rewards, demos_per_mode)):
        resp_row = np.zeros(num_elements)
        resp_row[ri] = 1.0
        for _ in range(num_element_demos):
            test_gt_resp.append(resp_row)
        _, q_star = vi(xtr, phi, reward)
        # pi_star = OptimalPolicy(q_star)
        pi_star = BoltzmannExplorationPolicy(q_star, scale=boltzmann_scale)
        test_demos.extend(
            pi_star.get_rollouts(
                env, num_element_demos, max_path_length=max_demonstration_length
            )
        )
    test_gt_resp = np.array(test_gt_resp)
    test_gt_mixture_weights = np.sum(test_gt_resp, axis=0) / num_demos

    if reward_initialisation == "MLE":
        # We use the current IRL model for Maximum Likelihood initialisation of the
        # reward parameters
        if algorithm == "MaxEnt":
            solver = MaxEntEMSolver()
            xtr_p, train_demos_p = padding_trick(xtr, train_demos)
            _, test_demos_p = padding_trick(xtr, test_demos)
        elif algorithm == "MaxLik":
            solver = MaxLikEMSolver()
            xtr_p = xtr
            train_demos_p = train_demos
            test_demos_p = test_demos
        elif algorithm == "SigmaGIRL":
            solver = SigmaGIRLEMSolver()
            xtr_p = xtr
            train_demos_p = train_demos
            test_demos_p = test_demos
        else:
            raise ValueError
    elif reward_initialisation == "MeanOnly":
        # We use a 'mean only' solver to do the reward initialisation
        solver = MeanOnlyEMSolver()
        xtr_p = xtr
        train_demos_p = train_demos
        test_demos_p = test_demos
    else:
        raise ValueError()

    # Initialize Mixture
    t0 = datetime.now()
    if initialisation == "Random":
        # Initialize uniformly at random
        init_mode_weights, init_rewards = solver.init_random(
            phi, num_clusters, reward_range
        )
    elif initialisation == "KMeans":
        # Initialize with K-Means (hard) clustering
        init_mode_weights, init_rewards = solver.init_kmeans(
            xtr_p, phi, train_demos_p, num_clusters, reward_range, num_init_restarts
        )
    elif initialisation == "GMM":
        # Initialize with GMM (soft) clustering
        init_mode_weights, init_rewards = solver.init_gmm(
            xtr_p, phi, train_demos_p, num_clusters, reward_range, num_init_restarts
        )
    elif initialisation == "Supervised":
        # We always have uniform clusters in supervised experiments
        assert num_clusters == num_elements

        if isinstance(solver, MaxEntEMSolver):
            # Apply padding trick
            xtr_p, train_demos_p = padding_trick(xtr, train_demos)

            # Learn rewards with ground truth responsibility matrix
            learn_rewards = solver.mstep(
                xtr_p, phi, train_gt_resp, train_demos_p, reward_range
            )

            # Compute baseline NLL
            mixture_nll = solver.mixture_nll(
                xtr_p, phi, train_gt_mixture_weights, learn_rewards, train_demos_p
            )
        elif isinstance(solver, MaxLikEMSolver):
            # Learn rewards with ground truth responsibility matrix
            learn_rewards = solver.mstep(
                xtr, phi, train_gt_resp, train_demos, reward_range
            )

            # Compute baseline NLL
            mixture_nll = solver.mixture_nll(
                xtr, phi, train_gt_mixture_weights, learn_rewards, train_demos
            )
        else:
            raise ValueError()

        # No initial solution for supervised experiment
        init_resp = None
        init_mode_weights = None
        init_rewards = None
        init_eval = None
        init_eval_train = None

        # Skip BV training
        train_iterations = np.nan
        resp_history = [train_gt_resp]
        mode_weights_history = [train_gt_mixture_weights]
        rewards_history = [learn_rewards]
        nll_history = [mixture_nll]
        train_reason = "Supervised baseline mixture - no training needed"
    else:
        raise ValueError

    def post_em_iteration(
        solver, iteration, resp, mode_weights, rewards, nll, nll_delta, resp_delta
    ):
        _log.info(f"{_seed}: Iteration {iteration} ended")
        _run.log_scalar("training.nll", nll)
        _run.log_scalar("training.nll_delta", nll_delta)
        _run.log_scalar("training.resp_delta", resp_delta)
        for mw_idx, mw in enumerate(mode_weights):
            _run.log_scalar(f"training.mw{mw_idx + 1}", mw)
        for reward_idx, reward in enumerate(rewards):
            for theta_idx, theta_val in enumerate(reward.theta):
                _run.log_scalar(
                    f"training.r{reward_idx + 1}.t{theta_idx + 1}", theta_val
                )

    _log.info(
        f"{_seed}: Initialisation done - switching to MLE reward model for EM alg"
    )
    if algorithm == "MaxEnt":
        solver = MaxEntEMSolver(post_it=post_em_iteration)
        xtr_p, train_demos_p = padding_trick(xtr, train_demos)
        _, test_demos_p = padding_trick(xtr, test_demos)
    elif algorithm == "MaxLik":
        solver = MaxLikEMSolver(post_it=post_em_iteration)
        xtr_p = xtr
        train_demos_p = train_demos
        test_demos_p = test_demos
    elif algorithm == "SigmaGIRL":
        solver = SigmaGIRLEMSolver(post_it=post_em_iteration)
        xtr_p = xtr
        train_demos_p = train_demos
        test_demos_p = test_demos
    else:
        raise ValueError

    # Evaluate the initial mixture and run EM loop
    if initialisation != "Supervised":
        # Get initial responsibility matrices
        init_resp = solver.estep(
            xtr_p, phi, init_mode_weights, init_rewards, test_demos_p
        )
        init_resp_train = solver.estep(
            xtr_p, phi, init_mode_weights, init_rewards, train_demos_p
        )

        # Evaluate initial mixture
        _log.info(f"{_seed}: Evaluating initial solution (test set)")
        init_eval = element_world_eval(
            xtr,
            phi,
            test_demos,
            test_gt_resp,
            test_gt_mixture_weights,
            gt_rewards,
            init_resp,
            init_mode_weights,
            init_rewards,
            solver,
            skip_ml_paths,
        )
        _log.info(f"{_seed}: Evaluating initial solution (train set)")
        init_eval_train = element_world_eval(
            xtr,
            phi,
            train_demos,
            train_gt_resp,
            train_gt_mixture_weights,
            gt_rewards,
            init_resp_train,
            init_mode_weights,
            init_rewards,
            solver,
            skip_ml_paths,
            non_data_perf=init_eval,
        )

        # MI-IRL algorithm
        _log.info(f"{_seed}: BV-EM Loop")
        (
            train_iterations,
            resp_history,
            mode_weights_history,
            rewards_history,
            nll_history,
            train_reason,
        ) = bv_em(
            solver,
            xtr_p,
            phi,
            train_demos_p,
            num_clusters,
            reward_range,
            mode_weights=init_mode_weights,
            rewards=init_rewards,
            nll_tolerance=em_nll_tolerance,
            resp_tolerance=em_resp_tolerance,
            max_iterations=max_iterations,
        )
        _log.info(f"{_seed}: BV-EM Loop terminated, reason = {train_reason}")

    t1 = datetime.now()

    learn_resp = resp_history[-1]
    learn_mode_weights = mode_weights_history[-1]
    learn_rewards = rewards_history[-1]
    learn_nll = nll_history[-1]
    train_duration = (t1 - t0).total_seconds()

    # Derive responsibility matrix for test paths
    learn_resp_test = solver.estep(
        xtr_p, phi, learn_mode_weights, learn_rewards, test_demos_p
    )

    # Evaluate final mixture
    _log.info(f"{_seed}: Evaluating final mixture (test set)")
    learn_eval = element_world_eval(
        xtr,
        phi,
        test_demos,
        test_gt_resp,
        test_gt_mixture_weights,
        gt_rewards,
        learn_resp_test,
        learn_mode_weights,
        learn_rewards,
        solver,
        skip_ml_paths,
    )
    _log.info(f"{_seed}: Evaluating final mixture (train set)")
    learn_eval_train = element_world_eval(
        xtr,
        phi,
        train_demos,
        train_gt_resp,
        train_gt_mixture_weights,
        gt_rewards,
        learn_resp,
        learn_mode_weights,
        learn_rewards,
        solver,
        skip_ml_paths,
        non_data_perf=learn_eval,
    )

    out_str = (
        "{}: Finished after {} iterations ({}) =============================\n"
        "NLL: {:.2f} -> {:.2f} (train), {:.2f} -> {:.2f} (test)\n"
        "ANID: {:.2f} -> {:.2f} (train), {:.2f} -> {:.2f} (test)\n"
        "EVD: {:.2f} -> {:.2f} (train), {:.2f} -> {:.2f} (test)\n"
        "Mode Weights: {} -> {}\n"
        "===================================================\n".format(
            _seed,
            train_iterations,
            train_reason,
            np.nan if init_eval_train is None else init_eval_train["nll"],
            learn_eval_train["nll"],
            np.nan if init_eval is None else init_eval["nll"],
            learn_eval["nll"],
            np.nan if init_eval_train is None else init_eval_train["anid"],
            learn_eval_train["anid"],
            np.nan if init_eval is None else init_eval["anid"],
            learn_eval["anid"],
            np.nan if init_eval_train is None else init_eval_train["mcf_evd"],
            learn_eval_train["mcf_evd"],
            np.nan if init_eval is None else init_eval["mcf_evd"],
            learn_eval["mcf_evd"],
            init_mode_weights,
            learn_mode_weights,
        )
    )
    print(out_str, flush=True)

    # Dump experimental results to artifact
    _log.info(f"{_seed}: Done...")
    result_fname = f"{_seed}.result"
    with open(result_fname, "wb") as file:
        pickle.dump(
            {
                # Initial soln
                "init_resp": [] if init_resp is None else init_resp.tolist(),
                "init_mode_weights": []
                if init_mode_weights is None
                else init_mode_weights.tolist(),
                "init_rewards": []
                if init_rewards is None
                else [np.array(r.theta).tolist() for r in init_rewards],
                "init_eval": {} if init_eval is None else init_eval,
                "init_eval_train": {} if init_eval_train is None else init_eval_train,
                # Final soln
                "learn_resp": learn_resp.tolist(),
                "learn_mode_weights": learn_mode_weights.tolist(),
                "learn_rewards": [np.array(r.theta).tolist() for r in learn_rewards],
                "learn_eval": learn_eval,
                "learn_eval_train": learn_eval_train,
                # Training details
                "train_iterations": train_iterations,
                "train_duration": train_duration,
                "resp_history": np.array(resp_history).tolist(),
                "mode_weights_history": np.array(mode_weights_history).tolist(),
                "rewards_history": np.array(
                    [[r.theta for r in r1r2r3] for r1r2r3 in rewards_history]
                ).tolist(),
                "nll_history": np.array(nll_history).tolist(),
                "train_reason": train_reason,
            },
            file,
        )
    _run.add_artifact(result_fname)
    os.remove(result_fname)

    _log.info(f"{_seed}: Done")

    return float(learn_nll)


def canonical_puddle_world(
    transition_type,
    environment,
    gt_num_clusters,
    tr_rollouts_per_mode,
    te_rollouts_per_mode,
    algorithm,
    initialisation,
    num_init_restarts,
    num_clusters,
    reward_range,
    tolerance,
    _log,
    _run,
):
    _log.info("Loading...")

    if transition_type == "Stochastic":
        wind = 0.2
    elif transition_type == "Deterministic":
        wind = 0.0
    else:
        raise ValueError

    if environment == "CanonicalPuddleWorld":
        env = CanonicalPuddleWorldEnv(wind=wind)
    else:
        raise ValueError

    xtr, phi, gt_rewards = puddle_world_extras(env)
    gt_rewards = list(gt_rewards.values())
    if gt_num_clusters == 3:
        pass
    elif gt_num_clusters == 2:
        # Drop 'any' mode
        gt_rewards = gt_rewards[:gt_num_clusters]
    else:
        raise ValueError

    # Get rollouts
    q_stars = []
    pi_stars = []
    tr_rollouts_structured = []
    tr_rollouts = []
    te_rollouts_structured = []
    te_rollouts = []
    for reward in gt_rewards:
        # Get Q* function
        _, q_star = vi(xtr, phi, reward=reward)
        q_stars.append(q_star)

        # Get optimal stochastic policy
        pi_star = OptimalPolicy(q_star)
        pi_stars.append(pi_star)

        # Sample training rollouts from optimal policy
        _tr_rollouts = pi_star.get_rollouts(env, tr_rollouts_per_mode)
        tr_rollouts_structured.append(_tr_rollouts)
        tr_rollouts.extend(_tr_rollouts)

        # Sample distinct testing rollouts from optimal policy
        _te_rollouts = pi_star.get_rollouts(env, te_rollouts_per_mode)
        te_rollouts_structured.append(_te_rollouts)
        te_rollouts.extend(_te_rollouts)

    # Get solver object
    if algorithm == "MaxEnt":
        solver = MaxEntEMSolver()

        # Apply padding trick
        xtr_p, tr_rollouts_p = padding_trick(xtr, tr_rollouts)
    elif algorithm == "MaxLik":
        solver = MaxLikEMSolver()

        # Dummy padded variables
        xtr_p = xtr
        tr_rollouts_p = tr_rollouts
    else:
        raise ValueError

    # Lambda to get ground truth responsibility matrix
    gt_resp = lambda k, rpm: np.concatenate(
        [np.repeat([np.eye(k)[r, :]], rpm, 0) for r in range(k)], 0
    )

    def eval_clustering(gt_resp, learned_resp):
        """Evaluate a mixture model's clustering performance"""
        # Compute cluster metrics
        nid = normalized_information_distance(gt_resp, learned_resp)
        anid = adjusted_normalized_information_distance(gt_resp, learned_resp)
        return nid, anid

    def eval_rewards(
        gt_mode_weights, gt_rewards, learned_mode_weights, learned_rewards
    ):
        """Evaluate a mixture model's reward performance"""
        gt_num_clusters = len(gt_mode_weights)
        num_clusters = len(learned_mode_weights)

        # Compute reward recovery metrics
        ile_mat = np.zeros((num_clusters, gt_num_clusters))
        evd_mat = np.zeros((num_clusters, gt_num_clusters))
        for learned_mode_idx in range(num_clusters):
            for gt_mode_idx in range(gt_num_clusters):
                ile, evd = ile_evd(
                    xtr,
                    phi,
                    gt_rewards[gt_mode_idx],
                    learned_rewards[learned_mode_idx],
                )
                ile_mat[learned_mode_idx, gt_mode_idx] = ile
                evd_mat[learned_mode_idx, gt_mode_idx] = evd
        mcf_ile, mcf_ile_flowdict = min_cost_flow_error_metric(
            learned_mode_weights, gt_mode_weights, ile_mat
        )
        mcf_evd, mcf_evd_flowdict = min_cost_flow_error_metric(
            learned_mode_weights, gt_mode_weights, evd_mat
        )

        return mcf_ile, mcf_ile_flowdict, mcf_evd, mcf_evd_flowdict

    _log.info("Initialising...")
    t0 = datetime.now()
    if initialisation == "Random":
        # Initialize uniformly at random
        st_mode_weights, st_rewards = solver.init_random(
            phi, num_clusters, reward_range
        )
    elif initialisation == "KMeans":
        # Initialize with K-Means (hard) clustering
        st_mode_weights, st_rewards = solver.init_kmeans(
            xtr_p, phi, tr_rollouts_p, num_clusters, reward_range, num_init_restarts
        )
    elif initialisation == "GMM":
        # Initialize with GMM (soft) clustering
        st_mode_weights, st_rewards = solver.init_gmm(
            xtr_p, phi, tr_rollouts_p, num_clusters, reward_range, num_init_restarts
        )
    elif initialisation == "Baseline":
        # This is a baseline experiment - simply set the clusters to the ground truth
        # model

        # We always have uniform clusters in these experiments
        assert num_clusters == gt_num_clusters

        # Use ground truth responsibility matrix and cluster weights
        _resp = gt_resp(num_clusters, tr_rollouts_per_mode)
        st_mode_weights = np.sum(_resp, axis=0) / len(_resp)

        # Learn rewards with ground truth responsibility matrix
        st_rewards = solver.mstep(xtr_p, phi, _resp, tr_rollouts_p, reward_range)

        # Compute baseline NLL
        _nll = solver.mixture_nll(
            xtr_p, phi, st_mode_weights, st_rewards, tr_rollouts_p
        )

        iterations = 0
        tr_resp_history = [_resp]
        mode_weights_history = [st_mode_weights]
        rewards_history = [st_rewards]
        tr_nll_history = [_nll]
        reason = "Baseline model - EM loop skipped"

        # No initial solution for baseline models
        init_nid = np.nan
        init_anid = np.nan
        init_nll = np.nan
        init_mcf_ile = np.nan
        init_mcf_ile_flowdict = {}
        init_mcf_evd = np.nan
        init_mcf_evd_flowdict = {}
    else:
        raise ValueError

    if initialisation != "Baseline":
        _log.info("Evaluating initial solution...")
        init_learned_resp = solver.estep(
            xtr_p, phi, st_mode_weights, st_rewards, te_rollouts
        )
        init_nid, init_anid = eval_clustering(
            gt_resp(gt_num_clusters, te_rollouts_per_mode), init_learned_resp
        )
        init_nll = solver.mixture_nll(
            xtr_p, phi, st_mode_weights, st_rewards, te_rollouts
        )
        (
            init_mcf_ile,
            init_mcf_ile_flowdict,
            init_mcf_evd,
            init_mcf_evd_flowdict,
        ) = eval_rewards(
            np.ones(gt_num_clusters) / gt_num_clusters,
            gt_rewards,
            st_mode_weights,
            st_rewards,
        )

    _log.info("Solving...")
    if initialisation != "Baseline":
        (
            iterations,
            tr_resp_history,
            mode_weights_history,
            rewards_history,
            tr_nll_history,
            reason,
        ) = bv_em(
            solver,
            xtr_p,
            phi,
            tr_rollouts_p,
            num_clusters,
            reward_range,
            mode_weights=st_mode_weights,
            rewards=st_rewards,
            nll_tolerance=tolerance,
        )
    t1 = datetime.now()

    # Log training progress after experiment - timestamps will be wrong for it
    for it in range(iterations + 1):
        _run.log_scalar("training.mode_weights", mode_weights_history[it].tolist())
        _run.log_scalar(
            "training.rewards", [r.theta.tolist() for r in rewards_history[it]]
        )
        _run.log_scalar("training.nll", float(tr_nll_history[it]))

    tr_learned_resp = tr_resp_history[-1]
    learned_mode_weights = mode_weights_history[-1]
    learned_rewards = rewards_history[-1]
    tr_nll = tr_nll_history[-1]
    duration = (t1 - t0).total_seconds()

    _log.info("Evaluating...")

    # Evaluate training set clustering
    tr_nid, tr_anid = eval_clustering(
        gt_resp(gt_num_clusters, tr_rollouts_per_mode), tr_learned_resp
    )

    # Evaluate test set clustering
    te_learned_resp = solver.estep(
        xtr_p, phi, learned_mode_weights, learned_rewards, te_rollouts
    )
    te_nid, te_anid = eval_clustering(
        gt_resp(gt_num_clusters, te_rollouts_per_mode), te_learned_resp
    )

    # Evaluate test set NLL
    te_nll = solver.mixture_nll(
        xtr_p, phi, learned_mode_weights, learned_rewards, te_rollouts
    )

    # Evaluate reward performance
    mcf_ile, mcf_ile_flowdict, mcf_evd, mcf_evd_flowdict = eval_rewards(
        np.ones(gt_num_clusters) / gt_num_clusters,
        gt_rewards,
        learned_mode_weights,
        learned_rewards,
    )

    _log.info("Done...")

    return {
        # Mixture Initialization
        "st_mode_weights": st_mode_weights.tolist(),
        "st_rewards": [st_r.theta.tolist() for st_r in st_rewards],
        "st_nll": float(tr_nll_history[0]),
        #
        # Initial solution evaluation
        "init_nid": init_nid,
        "init_anid": init_anid,
        "init_nll": init_nll,
        "init_mcf_ile": init_mcf_ile,
        "init_mcf_ile_flowdict": init_mcf_ile_flowdict,
        "init_mcf_evd": init_mcf_evd,
        "init_mcf_evd_flowdict": init_mcf_evd_flowdict,
        #
        # Learned model
        "iterations": int(iterations),
        "duration": float(duration),
        "learned_mode_weights": learned_mode_weights.tolist(),
        "learned_rewards": [
            learned_r.theta.tolist() for learned_r in learned_rewards
        ],
        "reason": reason,
        #
        # Training set performance
        "tr_learned_resp": tr_learned_resp.tolist(),
        "tr_nll": float(tr_nll),
        "tr_normalized_information_distance": float(tr_nid),
        "tr_normalized_information_distance_adjusted": float(tr_anid),
        #
        # Test set performance
        "te_learned_resp": te_learned_resp.tolist(),
        "te_nll": float(te_nll),
        "te_normalized_information_distance": float(te_nid),
        "te_normalized_information_distance_adjusted": float(te_anid),
        #
        # Reward performance
        "min_cost_flow_ile": float(mcf_ile),
        "min_cost_flow_ile_flow": mcf_ile_flowdict,
        "min_cost_flow_evd": float(mcf_evd),
        "min_cost_flow_evd_flow": mcf_evd_flowdict,
    }


def main(): """Main function""" import gym import warnings import numpy as np import pandas as pd import seaborn as sns import matplotlib.pyplot as plt from tqdm import tqdm from scipy.optimize import minimize from mdp_extras import ( vi, OptimalPolicy, padding_trick, UniformRandomPolicy, PaddedMDPWarning, Linear, ) from mdp_extras.envs import nchain_extras, frozen_lake_extras from unimodal_irl import sw_maxent_irl, sw_modelfree_maxent_irl, mean_ci, ile_evd n = 5 env = gym.make("NChain-v0", n=n) xtr, phi, reward_gt = nchain_extras(env, gamma=0.9) _, q_star = vi(xtr, phi, reward_gt) pi_star = OptimalPolicy(q_star) max_path_length = 10 num_paths = 40 demo_star = pi_star.get_rollouts(env, num_paths, max_path_length=max_path_length) phi_bar = phi.demo_average(demo_star, xtr.gamma) with warnings.catch_warnings(): warnings.filterwarnings(action="ignore", category=PaddedMDPWarning) # Compute ground truth values gt_nll, gt_grad = sw_maxent_irl(reward_gt.theta, xtr, phi, phi_bar, max_path_length) print(f"GT: {gt_nll:.3f} {gt_grad}") pi_ref = UniformRandomPolicy(len(xtr.actions)) # num_reference_paths = 20 for num_reference_paths in 2**np.arange(13): nll_errs = [] grad_errs = [] for rep in range(100): pi_ref_demos = [] for _ in range(num_reference_paths): path_len = np.random.randint(1, max_path_length + 1) pi_ref_demos.extend( pi_ref.get_rollouts(env, 1, max_path_length=path_len)) nll, grad = sw_maxent_irl_modelfree( reward_gt.theta, xtr, phi, phi_bar, max_path_length, pi_ref, pi_ref_demos, nll_only=False, ) # print(f"IS: {nll:.3f} {grad}") nll_err = np.sqrt((nll - gt_nll)**2) grad_err = np.linalg.norm(gt_grad - grad) nll_errs.append(nll_err) grad_errs.append(grad_err) print( f"IS ({num_reference_paths}): {np.mean(nll_err):.3f} {np.mean(grad_err):.3f}" ) print("Howedy")
def ile_evd(
    xtr,
    phi,
    reward_gt,
    reward_test,
    *,
    p=1,
    vi_kwargs={},
    policy_kwargs={},
    pe_kwargs={},
    ret_gt_value=False,
    gt_policy_value=None,
):
    """Find Inverse Learning Error and Expected Value Difference metrics

    Inverse Learning Error is defined in "Inverse reinforcement learning in partially
    observable environments." by Choi and Kim, 2011.

    Expected Value Difference is defined in "Nonlinear inverse reinforcement learning
    with gaussian processes." by Levine, et al. 2011. EVD is essentially a weighted
    version of ILE that only considers states with non-zero starting probability.

    Args:
        xtr (mdp_extras.DiscreteExplicitExtras): MDP extras object
        phi (mdp_extras.FeatureFunction): Feature function for MDP
        reward_gt (mdp_extras.RewardFunction): Ground Truth reward function for MDP
        reward_test (mdp_extras.RewardFunction): Learned reward function for MDP

        p (int): p-Norm to use for ILE; Choi and Kim and other papers recommend p=1
        vi_kwargs (dict): Extra keyword args for the mdp_extras.vi Value Iteration
            method
        policy_kwargs (dict): Extra keyword args for mdp_extras.OptimalPolicy
        pe_kwargs (dict): Extra keyword args for the mdp_extras.pi_eval Policy
            Evaluation method
        ret_gt_value (bool): If true, also return the GT policy state value function,
            which can be used to speed up future calls
        gt_policy_value (numpy array): Optional ground truth policy state value
            function - used to speed up this function over multiple calls

    Returns:
        (float): Inverse Learning Error metric
        (float): Expected Value Difference metric
    """

    if gt_policy_value is None:
        # Get GT policy state value function
        gt_policy_value, _ = vi(xtr, phi, reward_gt, **vi_kwargs)

    # Get test policy state value function under GT reward
    v_star_test, q_star_test = vi(xtr, phi, reward_test, **vi_kwargs)
    pi_star_test = OptimalPolicy(q_star_test, stochastic=False, **policy_kwargs)
    test_policy_value = pi_eval(xtr, phi, reward_gt, pi_star_test, **pe_kwargs)

    value_delta = gt_policy_value - test_policy_value
    ile = np.linalg.norm(value_delta, ord=p)
    evd = xtr.p0s @ value_delta

    if evd < 0:
        warnings.warn(
            f"EVD is < 0 (by {0 - evd}) - possible loss of accuracy due to numerical rounding"
        )
        evd = 0.0

    if ile < 0:
        warnings.warn(
            f"ILE is < 0 (by {0 - ile}) - possible loss of accuracy due to numerical rounding"
        )
        ile = 0.0

    if not ret_gt_value:
        return ile, evd
    else:
        return ile, evd, gt_policy_value


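# A minimal usage sketch (not part of the original module), assuming an environment
# and extras built as elsewhere in this repo, a ground-truth reward `reward_gt`, and a
# learned reward `reward_learned` produced by one of the IRL routines above. The
# helper name is hypothetical.
def _demo_ile_evd(xtr, phi, reward_gt, reward_learned):
    # First call also returns the GT value function...
    ile, evd, gt_value = ile_evd(
        xtr, phi, reward_gt, reward_learned, ret_gt_value=True
    )
    print(f"ILE = {ile:.3f}, EVD = {evd:.3f}")

    # ...which can be re-used to avoid re-solving the GT MDP on subsequent calls
    ile2, evd2 = ile_evd(xtr, phi, reward_gt, reward_learned, gt_policy_value=gt_value)
    print(f"Re-using cached GT value: ILE = {ile2:.3f}, EVD = {evd2:.3f}")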