#give optimal action in each state as demonstration
demo = [(state, np.argmax(Q_star[state, :])) for state in range(simple_world.num_states)]
print demo

#compute the gradient of R_guess
#TODO: get an actual guess and update it towards the real R
num_states = simple_world.num_states
num_actions = simple_world.num_actions
print "gradient"
print birl.calc_reward_gradient(demo, simple_world, simple_world.R, eta=1.0)

#test out the log posterior
print "log-likelihood true reward", birl.demo_log_likelihood(demo, Q_star)

reward2 = np.reshape([[0, 0, 0], [0, -1, 0], [-1, -1, 1]], (num_states, 1))
#print reward2

#set reward to false values
simple_world.set_reward(reward2)

#calculate the new policy and Q-values under the false reward
pi_star, V_star = mdp_solver.policy_iteration(simple_world)
print "false reward"
util.print_reward(simple_world)
util.print_policy(simple_world, pi_star)
Q_star = mdp_solver.calc_qvals(simple_world, pi_star, V_star, gamma)
print "log-likelihood false reward", birl.demo_log_likelihood(demo, Q_star)

#try out gradient steps to see if the likelihood goes up
#I want to try with reward only dependent on states, not actions...
for i in range(10):
    #minimal gradient-ascent step (body assumed; the loops below do the same with line search / regularization)
    grad = birl.calc_reward_gradient(demo, simple_world, simple_world.R, eta=1.0)
    simple_world.set_reward(simple_world.R + 0.1 * grad)
    pi_star, V_star = mdp_solver.policy_iteration(simple_world)
    Q_star = mdp_solver.calc_qvals(simple_world, pi_star, V_star, gamma)
    print "log-likelihood after step", i, birl.demo_log_likelihood(demo, Q_star)
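
#For reference: a minimal sketch of the Boltzmann-rational likelihood that
#birl.demo_log_likelihood is assumed to compute above (eta is the demonstrator
#confidence parameter; the actual implementation in birl may differ).
def demo_log_likelihood_sketch(demo, Q, eta=1.0):
    #sum over demonstrated (state, action) pairs of the log softmax probability:
    #log P(a|s) = eta*Q[s,a] - log sum_a' exp(eta*Q[s,a'])
    total = 0.0
    for state, action in demo:
        total += eta * Q[state, action] - np.log(np.sum(np.exp(eta * Q[state, :])))
    return total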
step_size = 1 #BTLS initialization with step size = 1
alpha = 0.01
beta = 0.5 #BTLS parameters: alpha sets how much improvement the Armijo condition requires, beta sets how fast the step size shrinks
print "----- gradient descent ------"
for step in range(num_steps):
    step_size = 1 #reset the BTLS step size at the start of each iteration
    flag = 0 #set to 1 once the BTLS search should finish
    #calculate optimal policy for current estimate of reward
    pi, V = mdp_solver.policy_iteration(mdp)
    #print "new policy"
    #print pi
    #calculate Q values for current estimate of reward
    Q = mdp_solver.calc_qvals(mdp, pi, V, gamma)
    #print "new Qvals"
    #print log-likelihood
    print "log-likelihood posterior", birl.demo_log_likelihood(demo, Q)
    #calculate gradient of posterior wrt reward
    grad = birl.calc_reward_gradient(demo, mdp, mdp.R, eta=1.0)
    #backtracking line search: shrink step_size until the ascent Armijo condition holds
    mdp_temp = copy.deepcopy(mdp) #working copy so trial steps don't overwrite mdp.R (assumes copy is imported at the top of the file)
    while flag == 0:
        R_temp = mdp.R + step_size * grad
        mdp_temp.set_reward(R_temp)
        pi_temp, V_temp = mdp_solver.policy_iteration(mdp_temp)
        Q_temp = mdp_solver.calc_qvals(mdp_temp, pi_temp, V_temp, gamma)
        func = birl.demo_log_likelihood(demo, Q_temp)
        approx = birl.demo_log_likelihood(demo, Q) + alpha * step_size * pow(la.norm(grad, 2), 2)
        if func >= approx:
            flag = 1 #Armijo condition satisfied, accept this step size
        else:
            step_size = beta * step_size #shrink the step and try again
    #update reward
    R_new = mdp.R + step_size * grad
    mdp.set_reward(R_new)
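
#A minimal sketch of the backtracking (Armijo) line search used above, pulled out
#as a standalone helper. eval_loglik is a hypothetical callable mapping a candidate
#reward vector to the demo log-likelihood; it stands in for the
#policy_iteration / calc_qvals / demo_log_likelihood chain in the loop above.
def backtracking_step_size(R, grad, eval_loglik, alpha=0.01, beta=0.5, t0=1.0):
    #shrink t until the ascent Armijo condition holds:
    #f(R + t*grad) >= f(R) + alpha * t * ||grad||^2
    t = t0
    f_R = eval_loglik(R)
    grad_norm_sq = np.dot(grad.flatten(), grad.flatten())
    while eval_loglik(R + t * grad) < f_R + alpha * t * grad_norm_sq:
        t = beta * t
    return t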
lam = 0.0 #l1 regularization weight
num_steps = 20
step_size = 0.1 #we should experiment with step sizes
#c = 1.0 #for a decreasing step-size schedule
print "----- gradient descent ------"
for step in range(num_steps):
    print "iter", step
    #calculate optimal policy for current estimate of reward
    pi, V = mdp_solver.policy_iteration(mdp)
    #print "new policy"
    #print pi
    #calculate Q values for current estimate of reward
    Q = mdp_solver.calc_qvals(mdp, pi, V, gamma)
    #print "new Qvals"
    #print log-likelihood
    print "log-likelihood posterior", birl.demo_log_likelihood(demo, Q)
    #calculate subgradient of the posterior wrt the reward, minus l1 regularization on the reward
    subgrad = birl.calc_l1regularized_reward_gradient(demo, mdp, mdp.R, lam, eta=1.0)
    #update stepsize (optional decreasing schedule)
    #step_size = c / np.sqrt(step + 1)
    #print "stepsize", step_size
    #update reward
    R_new = mdp.R + step_size * subgrad
    #print "new reward"
    #print R_new
    #update mdp with new reward
    mdp.set_reward(R_new)
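
#A minimal sketch of what calc_l1regularized_reward_gradient is assumed to compute
#above: the log-likelihood gradient plus a subgradient of -lam * ||R||_1, using
#sign(R) as the l1 subgradient (0 at R = 0). The actual implementation in birl may differ.
def l1regularized_reward_gradient_sketch(demo, mdp, R, lam, eta=1.0):
    loglik_grad = birl.calc_reward_gradient(demo, mdp, R, eta=eta)
    return loglik_grad - lam * np.sign(R)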