def calc_reward_gradient(demo, mdp_r, R, eta=1.0): """calculate the gradient of the loglikelihood wrt reward""" num_states, num_actions = mdp_r.num_states, mdp_r.num_actions #solve mdp with given reward R mdp = copy.deepcopy(mdp_r) mdp.set_reward(R) pi_star, V_star = mdp_solver.policy_iteration(mdp) Q_star = mdp_solver.calc_qvals(mdp, pi_star, V_star, mdp.gamma) #calculate gradient of R (|s|x1 vector of rewards per state) gradient = np.zeros((num_states, 1)) #precompute (I-\gammaT^\pi)^-1 num_states, num_actions = mdp.num_states, mdp.num_actions T_pi = np.array([np.dot(pi_star[x], mdp.T[x]) for x in range(num_states)]) #print "T_pi" #print T_pi #TODO check on inverse for better numerical stability Ws = [np.dot(mdp.T[:,a,:], np.linalg.inv(np.eye(mdp.num_states) - mdp.gamma * T_pi)) for a in range(num_actions)] for i in range(num_states): #iterate over reward elements r_i_grad = 0 for s,a in demo: #iterate over demonstrations #print s,a,"demo pair" deriv_log_num = eta * partial_Q_wrt_R(s,a,i,mdp,pi_star,Ws[a]) deriv_log_denom = 1.0/np.sum(np.exp(eta * Q_star[s,:])) \ * np.sum([np.exp(eta * Q_star[s,b]) * eta * partial_Q_wrt_R(s,b,i,mdp,pi_star,Ws[b]) for b in range(num_actions)]) #print "deriv_log_num",deriv_log_num #print "deriv_log_denom", deriv_log_denom r_i_grad += deriv_log_num - deriv_log_denom gradient[i] = r_i_grad return gradient
def calc_reward_gradient(demo, mdp_r, R, eta=1.0): """calculate the gradient of the loglikelihood wrt reward""" num_states, num_actions = mdp_r.num_states, mdp_r.num_actions #solve mdp with given reward R mdp = copy.deepcopy(mdp_r) mdp.set_reward(R) pi_star, V_star = mdp_solver.policy_iteration(mdp) Q_star = mdp_solver.calc_qvals(mdp, pi_star, V_star, mdp.gamma) #calculate gradient of R (|s|x1 vector of rewards per state) gradient = np.zeros((num_states, 1)) for i in range(num_states): #iterate over reward elements r_i_grad = 0 for s, a in demo: #iterate over demonstrations #print s,a,"demo pair" deriv_log_num = eta * partial_Q_wrt_R(s, a, i, mdp, pi_star) deriv_log_denom = 1.0/np.sum(np.exp(eta * Q_star[s,:])) \ * np.sum([np.exp(eta * Q_star[s,b]) * eta * partial_Q_wrt_R(s,b,i,mdp,pi_star) for b in range(num_actions)]) #print "deriv_log_num",deriv_log_num #print "deriv_log_denom", deriv_log_denom r_i_grad += deriv_log_num - deriv_log_denom gradient[i] = r_i_grad return gradient
def calc_reward_gradient(demo, mdp_r, R, eta=1.0): """calculate the gradient of the loglikelihood wrt reward""" num_states, num_actions = mdp_r.num_states, mdp_r.num_actions #solve mdp with given reward R mdp = copy.deepcopy(mdp_r) mdp.set_reward(R) pi_star, V_star = mdp_solver.policy_iteration(mdp) Q_star = mdp_solver.calc_qvals(mdp, pi_star, V_star, mdp.gamma) #calculate gradient gradient = np.zeros((num_states, num_actions)) one_over_Lr = 1.0 / np.array([sa_likelihood(s,a,Q_star,eta) for s,a in demo]) #print "one_over", one_over_Lr for x in range(num_states): for a in range(num_actions): delL_delRxa = np.array([partial_Lsu_partial_Rxa(s, u, x, a, mdp, Q_star, pi_star, eta) for s,u in demo]) #print "delL_delR", delL_delRxa #print "debug", np.dot(one_over_Lr, delL_delRxa) gradient[x,a] = np.dot(one_over_Lr, delL_delRxa) return gradient
import numpy as np
import mdp_solver
import gridworld
import util
import birl_optimized as birl

##test script for running gradient descent for bayesian inverse reinforcement learning
##domain is a simple grid world (see gridworld.py)
##TODO I haven't incorporated a prior, so this is really maximum likelihood rather than bayesian irl

reward = [[0,  0, 0, -1, 0,  0, 0],
          [0, -1, 0, -1, 0, -1, 0],
          [0, -1, 0, -1, 0, -1, 0],
          [1, -1, 0,  0, 0, -1, 0]]  #true expert reward
terminals = [21]  #the +1 goal state (21) is terminal; you can change this if you want
gamma = 0.95  #discount factor for mdp
grid = gridworld.GridWorld(reward, terminals, gamma)  #create grid world
print "expert reward"
util.print_reward(grid)

pi_star, V_star = mdp_solver.policy_iteration(grid)  #solve for expert policy
print "expert policy"
print pi_star
util.print_policy(grid, pi_star)
print "expert value function"
util.print_grid(grid, np.reshape(V_star, (grid.rows, grid.cols)))
Q_star = mdp_solver.calc_qvals(grid, pi_star, V_star, gamma)
print "expert Q-values"
print Q_star

#give optimal action in each (non-terminal) state as demonstration
#we could also give demonstrations in only some states, or noisy demonstrations, to see what happens
demo = [(state, np.argmax(Q_star[state, :])) for state in range(grid.num_states) if state not in terminals]
print "demonstration", demo
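
##a sketch (not in the original script) timing the optimized gradient from birl_optimized,
##which precomputes W_a = T_a (I - gamma*T^pi)^-1 once, against the plain birl module;
##assumes both modules expose the same calc_reward_gradient signature and that the grid
##exposes its current reward as grid.R
import timeit
import birl as birl_plain

start = timeit.default_timer()
birl.calc_reward_gradient(demo, grid, grid.R, eta=1.0)
print "optimized gradient time", timeit.default_timer() - start

start = timeit.default_timer()
birl_plain.calc_reward_gradient(demo, grid, grid.R, eta=1.0)
print "unoptimized gradient time", timeit.default_timer() - start
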
import numpy as np
import mdp_solver
import simple_chain
import util
import birl

##test script for running gradient descent for bayesian inverse reinforcement learning
##domain is a simple markov chain (see simple_chain.py)
##TODO I haven't incorporated a prior, so this is really maximum likelihood rather than bayesian irl

reward = [1, 0, 0, 0, 0, 1]  #true expert reward
terminals = [0]  #state 0 is terminal; you can change this if you want
gamma = 0.9  #discount factor for mdp
chain = simple_chain.SimpleChain(reward, terminals, gamma)  #create markov chain
print "expert reward"
print chain.reward

pi_star, V_star = mdp_solver.policy_iteration(chain)  #solve for expert policy
print "expert policy"
print pi_star
print "expert value function"
print V_star
Q_star = mdp_solver.calc_qvals(chain, pi_star, V_star, gamma)
print "expert Q-values"
print Q_star

#give optimal action in each (non-terminal) state as demonstration
#we could also give demonstrations in only some states, or noisy demonstrations, to see what happens
demo = [(state, np.argmax(Q_star[state, :])) for state in range(chain.num_states) if state not in terminals]
print "demonstration", demo

####### gradient descent starting from random guess at expert's reward
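##the script stops at the header above; a sketch of the missing descent loop, mirroring
##run_experiment in the grid-world runtime script -- the random initialization, num_steps,
##and step-size constant c are assumptions, and SimpleChain is assumed to expose its
##current reward as .R the way GridWorld does
reward_guess = [np.random.randint(-10, 10) for _ in range(chain.num_states)]
mdp = simple_chain.SimpleChain(reward_guess, terminals, gamma)  #chain with guessed reward
num_steps = 10
c = 0.5  #step-size constant; worth experimenting with
for step in range(num_steps):
    step_size = c / np.sqrt(step + 1)  #decaying step size
    grad = birl.calc_reward_gradient(demo, mdp, mdp.R, eta=1.0)  #gradient of demo log-likelihood
    mdp.set_reward(mdp.R + step_size * grad)  #gradient ascent on the log-likelihood
print "recovered reward"
print mdp.R
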
import numpy as np
import timeit
import mdp_solver
import gridworld
import util
import birl

def run_experiment(size):
    print "size", size
    rows = size
    cols = size
    reward = np.reshape([np.random.randint(-10, 10) for _ in range(rows * cols)], (rows, cols))  #true expert reward
    terminals = []  #no terminals, you can change this if you want
    gamma = 0.9  #discount factor for mdp
    grid = gridworld.GridWorld(reward, terminals, gamma)  #create grid world

    pi_star, V_star = mdp_solver.policy_iteration(grid)  #solve for expert policy
    Q_star = mdp_solver.calc_qvals(grid, pi_star, V_star, gamma)

    #give optimal action in each (non-terminal) state as demonstration
    demo = [(state, np.argmax(Q_star[state, :])) for state in range(grid.num_states) if state not in terminals]

    ####### gradient descent starting from random guess at expert's reward
    reward_guess = np.reshape([np.random.randint(-10, 10) for _ in range(grid.num_states)], (grid.rows, grid.cols))
    mdp = gridworld.GridWorld(reward_guess, terminals, gamma)  #new grid world with reward_guess as reward

    start = timeit.default_timer()
    num_steps = 10
    c = 0.5  #we should experiment with step sizes
    print "----- gradient descent ------"
    for step in range(num_steps):
        #calculate optimal policy and Q-values for current estimate of reward
        pi, V = mdp_solver.policy_iteration(mdp)
        Q = mdp_solver.calc_qvals(mdp, pi, V, gamma)
        print "log-likelihood", birl.demo_log_likelihood(demo, Q)
        step_size = c / np.sqrt(step + 1)  #decaying step size
        #calculate gradient of log-likelihood wrt reward and take an ascent step
        grad = birl.calc_reward_gradient(demo, mdp, mdp.R, eta=1.0)
        R_new = mdp.R + step_size * grad
        mdp.set_reward(R_new)  #update mdp with new reward
    stop = timeit.default_timer()

    pi, V = mdp_solver.policy_iteration(mdp)  #resulting optimal policy
    print "policy difference", np.linalg.norm(pi_star - pi)
    runtime = stop - start
    print "runtime for size", size, "=", runtime
    with open("../results/runtime_size" + str(size) + ".txt", "w") as f:
        f.write(str(runtime))
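
##example driver (not in the original file); the grid sizes here are assumptions,
##chosen to show how runtime scales with the size of the state space
if __name__ == "__main__":
    for size in [4, 6, 8, 10]:
        run_experiment(size)
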
import numpy as np
import mdp_solver
import gridworld
import util
import birl

#gradient descent on reward
reward = [[0,  0, 0],
          [0, -1, 0],
          [1, -1, 0]]
terminals = [6]  #the +1 state (6) is terminal
gamma = 0.9
simple_world = gridworld.GridWorld(reward, terminals, gamma)
print "reward"
util.print_reward(simple_world)

pi_star, V_star = mdp_solver.policy_iteration(simple_world)
print "optimal policy"
util.print_policy(simple_world, pi_star)
Q_star = mdp_solver.calc_qvals(simple_world, pi_star, V_star, gamma)
print "q-vals"
print Q_star

#give optimal action in each state as demonstration
demo = [(state, np.argmax(Q_star[state, :])) for state in range(simple_world.num_states)]
print demo

#compute the gradient of R_guess
#TODO get an actual guess and update it towards the real R
num_states = simple_world.num_states
num_actions = simple_world.num_actions
print "gradient"
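##the script stops after printing the label; a plausible continuation (an assumption),
##evaluating the gradient at the true reward -- simple_world is assumed to expose its
##current reward as .R the way the runtime script uses mdp.R
grad = birl.calc_reward_gradient(demo, simple_world, simple_world.R, eta=1.0)
print grad  #|S|x1 gradient of the demo log-likelihood wrt each state's reward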