def test_scot(test, policy_opt, values_opt, seed, horizon, traj_limit):
    trajs = scot(test.env, test.env.weights, H=horizon, seed=seed, verbose=False)
    lens = [len(t) for t in trajs]
    print("traj_limit", traj_limit)
    if len(trajs) > traj_limit:
        trajs = sample(trajs, traj_limit)

    # student's inferred reward function from the SCOT trajectories
    r_weights = max_likelihood_irl(trajs, test.env, step_size=0.2, eps=1.0e-03,
                                   max_steps=1000, verbose=False)

    # student's policy and value function under the student's reward function
    values_MLIRL, policy_MLIRL = value_iteration(mdp=test.env, r_weights=r_weights)
    # value of the student's policy under the teacher's (true) reward function
    values_MLIRL, _ = value_iteration(mdp=test.env, policy=policy_MLIRL)

    policy_similarity = np.sum(policy_MLIRL == policy_opt) / policy_MLIRL.shape[0]
    print("Policy similarity for SCOT: {}".format(policy_similarity))

    # NOTE: for now, the start state distribution start_dist does not have an
    # element for the state nS
    total_value_opt = np.dot(test.env.start_dist, values_opt)
    #print("Optimal expected value: {}".format(total_value_opt))
    total_value_MLIRL = np.dot(test.env.start_dist, values_MLIRL)
    print("Max Likelihood IRL expected value: {}".format(total_value_MLIRL))
    value_gain_MLIRL = total_value_MLIRL / total_value_opt
    print("Value gain of Max Likelihood IRL: {}".format(value_gain_MLIRL))
    print(len(trajs), sum(lens))
    return policy_similarity, total_value_MLIRL, value_gain_MLIRL

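# Illustrative sketch (an addition, not part of the original pipeline): the
# test harnesses in this file all report the same two metrics. This toy
# example, with made-up policies and values, shows how each metric behaves.
def _metrics_example():
    policy_opt = np.array([0, 1, 1, 2])      # teacher's optimal action per state
    policy_student = np.array([0, 1, 2, 2])  # student's learned action per state
    # policy similarity: fraction of states where the actions agree -> 0.75
    similarity = np.sum(policy_student == policy_opt) / policy_opt.shape[0]

    start_dist = np.array([0.25, 0.25, 0.25, 0.25])
    values_opt = np.array([10.0, 8.0, 6.0, 4.0])
    values_student = np.array([9.0, 8.0, 5.0, 4.0])
    # value gain: expected student value / expected optimal value,
    # both weighted by the start-state distribution -> ~0.93
    value_gain = np.dot(start_dist, values_student) / np.dot(start_dist, values_opt)
    return similarity, value_gain
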
def test_QLearning(test, policy_opt, values_opt, horizon, traj_limit):
    value_function_est, _, Q_policy, num_trajs = q_learning(
        test.wrapper,
        **{'n_samp': traj_limit * horizon, 'step_size': 0.1, 'epsilon': 0.1,
           'horizon': horizon, 'traj_limit': traj_limit})
    print("num_trajs", num_trajs)

    # value of the student's Q-learning policy under the teacher's (true) reward function
    values_QL, _ = value_iteration(mdp=test.env, policy=Q_policy)

    policy_similarity = np.sum(Q_policy == policy_opt) / Q_policy.shape[0]
    print("Policy similarity for Q Learning: {}".format(policy_similarity))

    # NOTE: for now, the start state distribution start_dist does not have an
    # element for the state nS
    total_value_opt = np.dot(test.env.start_dist, values_opt)
    #print("Optimal expected value: {}".format(total_value_opt))
    total_value_QL = np.dot(test.env.start_dist, values_QL)
    print("True QL expected value: {}".format(total_value_QL))
    value_gain_QL = total_value_QL / total_value_opt
    print("Value gain of true QL: {}".format(value_gain_QL))

    total_value_est_QL = np.dot(test.env.start_dist, value_function_est)
    print("Estimated QL expected value: {}".format(total_value_est_QL))
    value_gain_est_QL = total_value_est_QL / total_value_opt
    print("Value gain of estimated QL: {}".format(value_gain_est_QL))
    return (policy_similarity, total_value_QL, total_value_est_QL,
            value_gain_QL, value_gain_est_QL)

def test_PI(test, policy_opt, values_opt, horizon, traj_limit):
    print("testPI")
    est_values_PI, policy_PI = policy_iteration(
        test.env, test.agent, every_visit_monte_carlo,
        kwargs={'n_eps': traj_limit, 'eps_len': horizon})
    # Alternative policy evaluation routines:
    #est_values_PI, policy_PI = policy_iteration(
    #    test.env, test.agent, temporal_difference,
    #    kwargs={'n_samp': 1000, 'step_size': 0.1, 'horizon': horizon, 'traj_limit': traj_limit})
    #est_values_PI, policy_PI = policy_iteration(
    #    test.env, test.agent, first_visit_monte_carlo,
    #    kwargs={'n_eps': traj_limit, 'eps_len': horizon})

    # value of the student's PI policy under the teacher's (true) reward function
    values_PI, _ = value_iteration(mdp=test.env, policy=policy_PI)

    policy_similarity = np.sum(policy_PI == policy_opt) / policy_PI.shape[0]
    print("Policy similarity for PI: {}".format(policy_similarity))

    # NOTE: for now, the start state distribution start_dist does not have an
    # element for the state nS
    total_value_opt = np.dot(test.env.start_dist, values_opt)
    #print("Optimal expected value: {}".format(total_value_opt))
    total_value_PI = np.dot(test.env.start_dist, values_PI)
    print("True PI expected value: {}".format(total_value_PI))
    value_gain_PI = total_value_PI / total_value_opt
    print("Value gain of true PI: {}".format(value_gain_PI))

    total_value_est_PI = np.dot(test.env.start_dist, est_values_PI)
    print("Estimated PI expected value: {}".format(total_value_est_PI))
    value_gain_est_PI = total_value_est_PI / total_value_opt
    print("Value gain of estimated PI: {}".format(value_gain_est_PI))
    return (policy_similarity, total_value_PI, total_value_est_PI,
            value_gain_PI, value_gain_est_PI)

def scot(mdp, w, s_start=None, m=None, H=None, seed=None, verbose=False):
    """
    Implements the Set Cover Optimal Teaching (SCOT) algorithm from
    "Machine Teaching for Inverse Reinforcement Learning: Algorithms and
    Applications", Brown and Niekum (2019).

    Args:
        mdp: MDP environment
        w (np.array): weights of the linear reward function of the expert
            teacher agent (featurization computed by the MDP environment)
        s_start: list of possible initial states
        m (int): number of sample demonstration trajectories to draw per start state
        H (int): horizon (max length) of demonstration trajectories
        seed (int): random seed
        verbose (bool): whether to print out algorithmic runtime information

    Returns:
        D: list of maximally informative machine teaching trajectories,
           each represented as a list of (s, a, r, s') experience tuples
    """
    if seed is not None:
        np.random.seed(seed)
    else:
        np.random.seed(2)

    # compute optimal policy pi_opt (using a variation of VI code from HW1)
    _, teacher_pol_det = value_iteration(mdp)

    # convert teacher policy to a stochastic policy
    teacher_pol = det2stoch_policy(teacher_pol_det, mdp.nS, mdp.nA)

    # compute expected feature counts mu[s] and mu_sa[s][a] under the optimal policy
    mu, mu_sa = get_feature_counts(mdp, teacher_pol)

    # compute the BEC of the teacher as the list of vectors defining the
    # halfspaces of linear reward function parameters implied by the teacher's
    # policy: one constraint per (s, a) pair with a != pi*(s)
    BEC = np.empty((mdp.nS * (mdp.nA - 1), w.shape[0]))
    i = 0
    for s in range(mdp.nS):
        for a in range(mdp.nA):
            if a != teacher_pol_det[s]:
                BEC[i] = mu[s] - mu_sa[s, a]
                i += 1

    # remove trivial, duplicate, and redundant half-space constraints
    BEC = refineBEC(w, BEC)

    if verbose:
        print("BEC", BEC)

    # (1) compute candidate demonstration trajectories

    # number of demonstration trajectories to sample per start state
    if m is None:
        m = int(np.ceil(1 / (1.0 - 0.95 * mdp.noise)))

    teacher = Agent(teacher_pol, mdp.nS, mdp.nA)
    wrapper = Wrapper(mdp, teacher, False)
    demo_trajs = []

    # limit trajectory length to guarantee SCOT termination; increase the max
    # trajectory length for stochastic environments (one option is the number
    # of iterations required in the VI/feature count computations)
    if H is None:
        H = mdp.nS

    # sample demonstration trajectories from each start state: either every
    # state with non-zero probability mass in the MDP's start state
    # distribution, or a given set of start states s_start
    if s_start is None:
        for s in range(mdp.nS):
            if mdp.start_dist[s] > 0.0:
                demo_trajs += wrapper.eval_episodes(m, s, horizon=H)[1]
    else:
        for s in s_start:
            demo_trajs += wrapper.eval_episodes(m, s, horizon=H)[1]

    if verbose:
        print("number of demonstration trajectories:")
        print(len(demo_trajs))

    # (2) greedy set cover algorithm to compute maximally informative trajectories
    U = set()
    for i in range(BEC.shape[0]):
        U.add(tuple(BEC[i].tolist()))
    D = []
    C = set()
    U_sub_C = U - C

    if verbose:
        print("number of BEC constraints before set cover")
        print(len(U))

    # Greedy set cover: identify the smallest sub-collection of S whose union
    # equals the universe. For example, for the universe U = {1,2,3,4,5} and
    # the collection of sets S = {{1,2,3}, {2,4}, {3,4}, {4,5}}, the greedy
    # algorithm covers U with {1,2,3} and {4,5}.
    BECs_trajs = []
    for traj in demo_trajs:
        BECs_trajs.append(compute_traj_BEC(traj, mu, mu_sa, mdp, w))

    while len(U_sub_C) > 0 and len(BECs_trajs) > 0:
        # cardinality of the intersection between BEC(traj|pi*) and U \ C
        t_list = []
        BEC_list = []
        for BEC_traj in BECs_trajs:
            BEC_list.append(BEC_traj)
            t_list.append(len(BEC_traj.intersection(U_sub_C)))

        # argmax over t_list to find the greedy trajectory
        t_greedy_index = t_list.index(max(t_list))
        t_greedy = demo_trajs[t_greedy_index]
        del BECs_trajs[t_greedy_index]
        del demo_trajs[t_greedy_index]

        D.append(t_greedy)
        C = C.union(BEC_list[t_greedy_index])
        U_sub_C = U - C
        if len(BECs_trajs) == 0:
            print("BEC_trajs empty")
            print(len(U_sub_C))
            print(U_sub_C)

    if verbose:
        print("trajectories", D)
        lens = [len(s) for s in D]
        print(len(D), lens)
    return D

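# Illustrative sketch (an addition): the greedy set cover step of scot() on
# the toy instance from the comment above, U = {1,2,3,4,5} and
# S = {{1,2,3}, {2,4}, {3,4}, {4,5}}. Each trajectory's BEC plays the role of
# a set; scot() repeatedly takes the trajectory covering the most remaining
# constraints, exactly as this loop takes the set covering the most remaining
# elements.
def _greedy_set_cover_example():
    U = {1, 2, 3, 4, 5}
    S = [{1, 2, 3}, {2, 4}, {3, 4}, {4, 5}]
    cover, covered = [], set()
    while covered != U and S:
        # greedy choice: the set with the largest intersection with U \ covered
        best = max(S, key=lambda s: len(s & (U - covered)))
        S.remove(best)
        cover.append(best)
        covered |= best
    return cover  # [{1, 2, 3}, {4, 5}]
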
def max_likelihood_irl(D, mdp, step_size=0.01, eps=1e-02, max_steps=float("inf"), verbose=False):
    """
    Maximum Likelihood IRL: returns the maximally likely reward function
    under a Boltzmann policy for the given demonstrations.

    See Vroman et al. 2011 (http://www.icml-2011.org/papers/478_icmlpaper.pdf)
    for the original paper and Ratia 2012 (https://arxiv.org/pdf/1202.1558.pdf)
    for the likelihood gradient formula.

    :param D: list of trajectories of an optimal policy in the given MDP mdp
    :param mdp: MDP environment
    :param step_size: gradient ascent step size, float
    :param eps: convergence criterion, float
    :param max_steps: max number of steps in gradient ascent, int
    :param verbose: verbosity of algorithmic reporting
    :return: r_weights: reward weights as an np array
    """
    # initialize reward weights
    r_weights = np.random.rand(*mdp.weights.shape)
    print("Initial reward weights:")
    print(r_weights)

    # get the state-action pairs observed in the trajectories (technically the
    # only input the algorithm needs; this extraction could move to the caller)
    sa_traj = []
    traj_states = []
    for traj in D:
        for i in range(len(traj)):
            sa_traj.append([traj[i][0], traj[i][1]])
            traj_states.append(traj[i][0])
    sa_traj = np.array(sa_traj)

    # convergence criteria
    iters = 0
    delta = eps + 1
    while delta > eps and iters < max_steps:
        iters += 1

        # compute the value function and optimal policy under the current
        # reward estimate (max of 100 iterations, in line with Vroman et al.)
        values, policy = value_iteration(mdp, r_weights=r_weights)
        policy = det2stoch_policy(policy, mdp.nS, mdp.nA)

        # compute Q-values for the states visited in the trajectories
        Qvalues = np.array([[
            mdp.reward(s, r_weights) + mdp.gamma * np.sum(
                [mdp.P[s, a, succ] * values[succ] for succ in range(mdp.nS)])
            for a in range(mdp.nA)
        ] for s in traj_states])

        # compute state-action likelihoods for all trajectory state-action
        # pairs under a Boltzmann exploration policy (softmax with each
        # exponent multiplied by the temperature beta); beta = 0.5 as in
        # Vroman et al.
        beta = 0.5
        # subtract the max Q-value so the likelihoods contain no large
        # intermediate exponential values
        Qvalues_normalized = beta * (Qvalues - np.max(Qvalues))
        likelihoods = np.exp(Qvalues_normalized)
        for j in range(len(traj_states)):
            likelihood_sum = np.sum(likelihoods[j])
            likelihoods[j, :] /= likelihood_sum

        # compute state-action feature counts under the current optimal policy
        mu, mu_sa = get_feature_counts(mdp, policy, tol=1.0e-03)

        # likelihood gradient
        grad = beta * np.sum(np.array(
            [(mu_sa[sa_traj[sa_idx][0], sa_traj[sa_idx][1]] -
              np.sum(np.array([
                  likelihoods[sa_idx, b] * mu_sa[sa_traj[sa_idx][0], b]
                  for b in range(mdp.nA)
              ]), axis=0)) for sa_idx in range(len(sa_traj))]), axis=0)

        # gradient ascent step with a 1/iters step size schedule
        r_weights_old = np.copy(r_weights)
        # L2_reg_factor = 0.001
        r_weights += step_size / iters * grad  # - L2_reg_factor * r_weights

        # normalize r_weights to constrain the L2 norm and force a unique
        # reward weight vector
        r_weights /= np.linalg.norm(r_weights, ord=2)

        # convergence criterion
        delta = np.linalg.norm(r_weights - r_weights_old, 1)
        if verbose:
            print(delta)

    if verbose:
        print("MLIRL iterations: {}".format(iters))
    return r_weights

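# Illustrative sketch (an addition): the Boltzmann action likelihoods used in
# the MLIRL gradient above, with P(a|s) proportional to exp(beta * Q(s, a)).
# The max Q-value is subtracted before exponentiating, as in
# max_likelihood_irl, which leaves the softmax unchanged while keeping the
# intermediate exponentials small.
def _boltzmann_likelihoods_example(beta=0.5):
    Q_row = np.array([1.0, 2.0, 0.5])           # Q(s, a) for one state
    z = np.exp(beta * (Q_row - np.max(Q_row)))  # numerically stable exponents
    return z / z.sum()                          # likelihoods sum to 1
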
        else:
            curr_state = next_state

    V = np.max(Q, axis=1)
    policy = np.argmax(Q, axis=1)
    return V, Q, policy, num_trajs


if __name__ == '__main__':
    np.random.seed(2)
    from tests import BrownNiekum, Random
    from algorithms.value_iteration import value_iteration

    test = BrownNiekum()
    test.env.render()
    value_function_opt, policy = value_iteration(test.env)
    # q_learning returns (V, Q, policy, num_trajs)
    value_function_est, _, Q_policy, _ = q_learning(
        test.wrapper, **{
            'n_samp': 50000,
            'step_size': 0.1,
            'epsilon': 0.1
        })

    # compare value functions
    print('Optimal policy: {}\nPolicy from Q-learning: {}'.format(
        policy, Q_policy))
    print('Optimal value function from Value Iteration: {}'.format(
        value_function_opt))
    print('Estimated value function from {}: {}'.format(
        q_learning.__name__, value_function_est))

def main():
    # for i in range(50, 100):
    #     np.random.seed(i)
    #     test = get_env()  # default BrownNiekum()
    #     print(i)
    #     trajs = scot(test.env, test.env.weights, seed=i+1, verbose=False)

    horizon = 20
    traj_limit = 30
    #print(test_scot(test, policy_opt, values_opt, seed, horizon))
    #print(test_PI(test, policy_opt, values_opt, horizon))

    num_tests = 10

    MLIRL_policy_similarity_list = []
    total_value_MLIRL_list = []
    value_gain_MLIRL_list = []

    PI_policy_similarity_list = []
    total_value_PI_list = []
    total_value_est_PI_list = []
    value_gain_PI_list = []
    value_gain_est_PI_list = []

    QL_policy_similarity_list = []
    total_value_QL_list = []
    total_value_est_QL_list = []
    value_gain_QL_list = []
    value_gain_est_QL_list = []

    baseline_policy_similarity_list = []
    total_value_baseline_list = []
    value_gain_baseline_list = []

    for i in range(num_tests):
        np.random.seed(i)
        test = get_env()  # default BrownNiekum()
        test.env.render()

        # optimal value and policy under the teacher's (true) reward function
        values_opt, policy_opt = value_iteration(mdp=test.env)
        print(i)

        #MLIRL_policy_similarity, total_value_MLIRL, value_gain_MLIRL = \
        #    test_scot(test, policy_opt, values_opt, i, horizon, traj_limit)
        (PI_policy_similarity, total_value_PI, total_value_est_PI,
         value_gain_PI, value_gain_est_PI) = test_PI(
            test, policy_opt, values_opt, horizon, traj_limit)
        #QL_policy_similarity, total_value_QL, total_value_est_QL, \
        #    value_gain_QL, value_gain_est_QL = \
        #    test_QLearning(test, policy_opt, values_opt, horizon, traj_limit)
        #baseline_policy_similarity, total_value_baseline, value_gain_baseline = \
        #    test_baseline(test, policy_opt, values_opt, i, horizon, traj_limit)

        #MLIRL_policy_similarity_list.append(MLIRL_policy_similarity)
        #total_value_MLIRL_list.append(total_value_MLIRL)
        #value_gain_MLIRL_list.append(value_gain_MLIRL)

        PI_policy_similarity_list.append(PI_policy_similarity)
        total_value_PI_list.append(total_value_PI)
        total_value_est_PI_list.append(total_value_est_PI)
        value_gain_PI_list.append(value_gain_PI)
        value_gain_est_PI_list.append(value_gain_est_PI)

        """
        QL_policy_similarity_list.append(QL_policy_similarity)
        total_value_QL_list.append(total_value_QL)
        total_value_est_QL_list.append(total_value_est_QL)
        value_gain_QL_list.append(value_gain_QL)
        value_gain_est_QL_list.append(value_gain_est_QL)

        baseline_policy_similarity_list.append(baseline_policy_similarity)
        total_value_baseline_list.append(total_value_baseline)
        value_gain_baseline_list.append(value_gain_baseline)
        """

    def report(name, vals):
        # only report statistics for methods that actually ran: np.mean of an
        # empty list emits a RuntimeWarning and returns nan
        if vals:
            print(name, np.mean(vals), np.var(vals))

    report("MLIRL_policy_similarity", MLIRL_policy_similarity_list)
    report("total_value_MLIRL", total_value_MLIRL_list)
    report("value_gain_MLIRL", value_gain_MLIRL_list)

    report("PI_policy_similarity", PI_policy_similarity_list)
    report("total_value_PI", total_value_PI_list)
    report("total_value_est_PI", total_value_est_PI_list)
    report("value_gain_PI", value_gain_PI_list)
    report("value_gain_est_PI", value_gain_est_PI_list)

    report("QL_policy_similarity", QL_policy_similarity_list)
    report("total_value_QL", total_value_QL_list)
    report("total_value_est_QL", total_value_est_QL_list)
    report("value_gain_QL", value_gain_QL_list)
    report("value_gain_est_QL", value_gain_est_QL_list)

    report("baseline_policy_similarity", baseline_policy_similarity_list)
    report("total_value_baseline", total_value_baseline_list)
    report("value_gain_baseline", value_gain_baseline_list)

def init_policy(self):
    _, policy = value_iteration(self.env)
    print('Policy from VI: {}'.format(policy))
    return policy

def init_policy(self):
    _, policy = value_iteration(self.env)
    print('Policy from VI: {}'.format(policy))
    policy = det2stoch_policy(policy, self.env.nS, self.env.nA)
    return policy

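# Hedged sketch (an addition): det2stoch_policy is not defined in this
# section. Based on how it is used above, it is assumed to expand a
# deterministic policy (one action index per state) into a one-hot nS x nA
# stochastic policy matrix, roughly like this:
#
#   def det2stoch_policy(policy_det, nS, nA):
#       pi = np.zeros((nS, nA))
#       pi[np.arange(nS), policy_det] = 1.0
#       return pi
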
                break
            traj = []
        else:
            curr_state = next_state

    return V_pi


if __name__ == '__main__':
    np.random.seed(2)
    from tests import BrownNiekum, Random
    from algorithms.value_iteration import value_iteration

    test = BrownNiekum()
    test.env.render()
    value_function_opt, policy = value_iteration(test.env, verbose=True)

    # change this to test other evaluation functions
    policy_eval_func = temporal_difference
    if policy_eval_func == temporal_difference:
        n_samples = 500000
        step_size = 0.1
        horizon = 50
        value_function_est = policy_eval_func(
            test.wrapper,
            **{'n_samp': n_samples, 'step_size': step_size, 'horizon': horizon})
        print('Running {} with {} samples, step size {}, and horizon {}'.format(
            policy_eval_func.__name__, n_samples, step_size, horizon))
    elif policy_eval_func == every_visit_monte_carlo:
        eps_len = 500
        n_eps = 50000
        value_function_est = policy_eval_func(
            test.wrapper, **{'eps_len': eps_len, 'n_eps': n_eps})
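
# Illustrative sketch (an addition): every-visit Monte Carlo evaluation, the
# policy_eval_func variant used by test_PI above. The discounted return from
# every occurrence of a state in an episode updates that state's running
# average value. The env interface here (reset()/step() under a fixed policy)
# is a hypothetical stand-in for this repo's Wrapper, not its actual API.
def _every_visit_mc_sketch(env, gamma, nS, n_eps, eps_len):
    V = np.zeros(nS)
    counts = np.zeros(nS)
    for _ in range(n_eps):
        # roll out one episode of (state, reward) pairs
        s = env.reset()
        episode = []
        for _ in range(eps_len):
            s_next, r, done = env.step()
            episode.append((s, r))
            if done:
                break
            s = s_next
        # accumulate the discounted return backwards over the episode,
        # updating the running average at every visit of each state
        G = 0.0
        for s, r in reversed(episode):
            G = r + gamma * G
            counts[s] += 1
            V[s] += (G - V[s]) / counts[s]
    return V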