def iter(cls, model, Q):
    # Compute V(s) = max_{a} Q(s,a)
    V = util.classes.NumMap()
    for s in model.S():
        V_s = util.classes.NumMap()
        for a in model.A(s):
            V_s[a] = Q[(s, a)]
        if len(V_s) > 0:
            V[s] = V_s.max()
        else:
            V[s] = 0.0

    # QQ(s,a) = R(s,a) + gamma*sum_{s'} T(s,a,s')*V(s')
    QQ = util.classes.NumMap()
    for s in model.S():
        for a in model.A(s):
            value = model.R(s, a)
            T = model.T(s, a)
            value += sum([model.gamma * t * V[s_prime] for (s_prime, t) in T.items()])
            QQ[(s, a)] = value

    # To find the log policy, find the max at each state and then create a new Q
    # with each (s,a) = oldQ - (max for that state)
    return QQ
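# Illustrative sketch, not part of the solver above: the same Bellman backup
# Q(s,a) <- R(s,a) + gamma * sum_{s'} T(s,a,s') * max_{a'} Q(s',a') written with
# plain numpy arrays on a hypothetical toy MDP. The names and array shapes
# (R, T, gamma) are assumptions for illustration only.
import numpy as np

def _toy_q_backup(R, T, gamma, Q):
    """R: (S,A) rewards, T: (S,A,S) transition probabilities, Q: (S,A) Q-values."""
    V = Q.max(axis=1)               # V(s) = max_a Q(s,a)
    return R + gamma * T.dot(V)     # one synchronous backup, broadcast over (S,A)

# Example usage on a random 3-state, 2-action MDP:
# S, A = 3, 2
# R = np.random.random((S, A))
# T = np.random.dirichlet(np.ones(S), size=(S, A))   # (S,A,S), each row sums to 1
# Q = np.zeros((S, A))
# for _ in range(100):
#     Q = _toy_q_backup(R, T, 0.9, Q)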
def solve(self, model, initial, true_samples, other_policy):
    '''
    Returns a pair (agent, weights) where the agent attempts to generalize
    from the behavior observed in the samples, and weights is what was
    combined with the MDP to generate the agent.

    model: an MDP with a linear reward function. Parameters WILL be overwritten.
    initial: initial distribution over states
    true_samples: a list of sample trajectories [ (s_t, a_t) ] of the
        (supposedly) optimal policy.
    other_policy: stored on the solver as self.other_policy.
    '''
    # Initial weight vector
    # w_0 = model.feature_function.params
    self.other_policy = other_policy

    # Uniform distribution over all states, kept as a fallback initial distribution
    self.full_initial = util.classes.NumMap()
    for s in model.S():
        self.full_initial[s] = 1.0
    self.full_initial = self.full_initial.normalize()

    # Compute feature expectations of the expert = mu_E from the samples
    (mu_E, sa_freq) = self.feature_expectations2(model, true_samples)
    print("True Samples", mu_E)

    self.Q_value = None
    w = numpy.random.random((model.reward_function.dim,))
    # w = numpy.zeros((model.reward_function.dim,))

    # Optimize the MaxEnt objective over reward weights with BFGS
    w_opt = scipy.optimize.fmin_bfgs(self.maxEntObjValue, w, self.maxEntObjGradient,
                                     (model, initial, mu_E, len(true_samples[0]), sa_freq),
                                     self._epsilon, maxiter=self._max_iter)
    print(w_opt)

    model.reward_function.params = w_opt
    agent = self._solver.solve(model)
    return (agent, w_opt)
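# Illustrative sketch of the optimizer call pattern used above: fmin_bfgs takes an
# objective, a starting point, an optional analytic gradient, and extra args that
# are forwarded to both callables. The quadratic objective here is a made-up
# stand-in for maxEntObjValue/maxEntObjGradient, purely to show the calling
# convention; the target vector and learning setup are assumptions.
import numpy
import scipy.optimize

def _toy_objective(w, target):
    return 0.5 * numpy.sum((w - target) ** 2)

def _toy_gradient(w, target):
    return w - target

# Example usage:
# w0 = numpy.zeros(3)
# w_best = scipy.optimize.fmin_bfgs(_toy_objective, w0, _toy_gradient,
#                                   args=(numpy.array([1.0, 2.0, 3.0]),),
#                                   gtol=1e-6, maxiter=100)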
def QValueSoftMaxSolve(model, thresh=1):
    # Soft (log-sum-exp) value iteration: v(s) = log sum_a exp(Q(s,a)),
    # returning log pi(a|s) = Q(s,a) - v(s).
    v = util.classes.NumMap()
    for s in model.S():
        v[s] = 0.0

    diff = 100.0
    while diff >= thresh:
        vp = v

        # Q(s,a) = R(s,a) + gamma*sum_{s'} T(s,a,s')*v(s')
        Q = util.classes.NumMap()
        for s in model.S():
            for a in model.A(s):
                value = model.R(s, a)
                T = model.T(s, a)
                value += sum([model.gamma * t * v[s_prime] for (s_prime, t) in T.items()])
                Q[(s, a)] = value

        # v(s) = log sum_a exp(Q(s,a)); the max action value is subtracted first
        # for numerical stability and added back outside the log.
        v = util.classes.NumMap()
        for s in model.S():
            maxx = None
            for a in model.A(s):
                if maxx is None or Q[(s, a)] > maxx:
                    maxx = Q[(s, a)]
            e_sum = 0
            for a in model.A(s):
                e_sum += math.exp(Q[(s, a)] - maxx)
            v[s] = maxx + math.log(e_sum)

        diff = max(abs(value - vp[s]) for (s, value) in v.iteritems())

    # log pi(a|s) = Q(s,a) - v(s)
    logp = util.classes.NumMap()
    for (sa, value) in Q.iteritems():
        logp[sa] = value - v[sa[0]]
    return logp
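# Illustrative sketch under toy assumptions, not the solver above: the same
# soft-Bellman recursion written with numpy arrays, using scipy's logsumexp in
# place of the manual max-subtraction trick. R is an assumed (S,A) reward array,
# T an assumed (S,A,S) transition array, and gamma is assumed < 1.
import numpy as np
from scipy.special import logsumexp

def _toy_soft_value_iteration(R, T, gamma, thresh=1e-4):
    S, A = R.shape
    v = np.zeros(S)
    while True:
        Q = R + gamma * T.dot(v)          # soft Q backup, shape (S,A)
        v_new = logsumexp(Q, axis=1)      # soft max over actions
        if np.max(np.abs(v_new - v)) < thresh:
            return Q - v_new[:, None]     # log pi(a|s)
        v = v_new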
def project(model, agent, initial, t_max):
    '''
    Does a full state-space projection, allowing each state at each timestep
    to have < 1.0 probability of occupancy. Returns a list of dicts, one
    distribution over states per timestep.
    '''
    # Build the initial state distribution
    result = [{}, ]
    for s in model.S():
        if s in initial.keys():
            result[0][s] = initial[s]
        else:
            result[0][s] = 0

    t = 1
    while t < t_max:
        dist = {}
        for s in model.S():
            if result[t - 1][s] == 0:
                continue
            # Push the occupancy of s forward through the policy and the
            # transition function.
            for (a, pr_a) in agent.actions(s).items():
                for (s_prime, pr_s_prime) in model.T(s, a).items():
                    if s_prime not in dist:
                        dist[s_prime] = pr_a * pr_s_prime * result[t - 1][s]
                    else:
                        dist[s_prime] += pr_a * pr_s_prime * result[t - 1][s]
        # States never reached this step get probability 0
        for s in model.S():
            if s not in dist:
                dist[s] = 0
        result.append(dist)
        t += 1
    return result
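# Illustrative sketch under toy assumptions, not the function above: the same
# forward occupancy projection written with matrices. P_pi is an assumed (S,S)
# state-to-state transition matrix induced by a fixed policy, so each step is
# just a vector-matrix product d_{t+1} = d_t P_pi.
import numpy as np

def _toy_project(P_pi, initial, t_max):
    """initial: (S,) distribution, P_pi: (S,S) row-stochastic -> list of (S,) arrays."""
    result = [np.asarray(initial, dtype=float)]
    for _ in range(t_max - 1):
        result.append(result[-1].dot(P_pi))
    return result

# Example: a 3-state cycle visited deterministically.
# P = np.array([[0, 1, 0], [0, 0, 1], [1, 0, 0]], dtype=float)
# dists = _toy_project(P, [1.0, 0.0, 0.0], t_max=4)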
def iter(cls, model, Q):
    # Compute V(s) = max_{a} Q(s,a)
    V = util.classes.NumMap()
    for s in model.S():
        V_s = util.classes.NumMap()
        for a in model.A(s):
            V_s[a] = Q[(s, a)]
        if len(V_s) > 0:
            V[s] = V_s.max()
        else:
            V[s] = 0.0

    # QQ(s,a) = R(s,a) + gamma*sum_{s'} T(s,a,s')*V(s')
    QQ = util.classes.NumMap()
    for s in model.S():
        for a in model.A(s):
            value = model.R(s, a)
            T = model.T(s, a)
            value += sum([model.gamma * t * V[s_prime] for (s_prime, t) in T.items()])
            QQ[(s, a)] = value
    return QQ
def solve(self, model, true_samples, s_r_prior, gridsize, max_states):
    '''
    Returns an agent (policy) that attempts to generalize from the behavior
    observed in the samples, using a Metropolis-style random walk over
    candidate rewards.

    model: an MDP with a linear reward function. Parameters WILL be overwritten.
    true_samples: a list of sample trajectories [ (s_t, a_t) ] of the
        (supposedly) optimal policy.
    s_r_prior: prior over (policy, reward) pairs; its prior() method supplies
        the acceptance ratio.
    gridsize: grid resolution passed to BayesianPatrolReward for the proposals.
    max_states: only states with location[0] in [0, max_states) are checked
        against the demonstrated actions.
    '''
    # Initial reward hypothesis and corresponding policy
    # w_0 = model.feature_function.params
    R = patrol.rewardbayesian.BayesianPatrolReward(len(model.S()) * len(model.A()), gridsize)
    model.reward_function = R
    pi = self._solver.solve(model)

    for i in range(self._max_iter):
        # Propose a neighboring reward and evaluate the current policy under it
        R_tilde = R.randNeighbor()
        model.reward_function = R_tilde
        Q_pi = self._q_value_solver.solve(model)

        # Is the current policy suboptimal anywhere along the demonstrations?
        found_worse = False
        for history in true_samples:
            for (s, a) in history:
                if s.location[0] >= 0 and s.location[0] < max_states and \
                        Q_pi[(s, pi.actions(s).keys()[0])] < Q_pi[(s, a)]:
                    # print(a, Q_pi[(s, a)], pi.actions(s).keys()[0], Q_pi[(s, pi.actions(s).keys()[0])])
                    found_worse = True
                    break

        if found_worse:
            # Re-solve for the proposed reward and accept (pi, R) jointly
            pi_tilde = self._solver.solve(model)
            chance = min(1, s_r_prior.prior(pi_tilde, R_tilde) / s_r_prior.prior(pi, R))
            if random.random() < chance:
                pi = pi_tilde
                R = R_tilde
        else:
            # Keep the current policy; accept only the proposed reward
            chance = min(1, s_r_prior.prior(pi, R_tilde) / s_r_prior.prior(pi, R))
            if random.random() < chance:
                R = R_tilde

    model.reward_function = R
    return pi
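# Illustrative sketch with assumed toy densities, not the solver above: the
# Metropolis acceptance rule used in the reward random walk, in isolation.
# Propose a neighbor, then accept with probability min(1, p(proposal)/p(current)).
import math
import random

def _toy_metropolis_walk(log_prior, propose, x0, n_steps):
    """log_prior: x -> log p(x); propose: x -> random neighbor of x."""
    x = x0
    chain = [x]
    for _ in range(n_steps):
        x_new = propose(x)
        log_ratio = log_prior(x_new) - log_prior(x)
        # accept with probability min(1, p(x_new)/p(x))
        if log_ratio >= 0 or random.random() < math.exp(log_ratio):
            x = x_new
        chain.append(x)
    return chain

# Example: random walk on a line with a Gaussian-shaped prior.
# chain = _toy_metropolis_walk(lambda x: -0.5 * x * x,
#                              lambda x: x + random.choice([-0.1, 0.1]),
#                              x0=1.0, n_steps=1000)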
def maxEntObjGradient(self, w, model, initial, mu_E, true_samples_len, sa_freq):
    # Gradient of the (negated) MaxEnt objective: -(mu_E - mu_w), where mu_w are
    # the feature expectations of the policy induced by the weights w.
    if self.Q_value is None:
        model.reward_function.params = w
        agent = self._solver.solve(model)  # shouldn't be doing this!
    else:
        # Greedy policy with respect to the cached Q-values
        policy = {}
        for s in model.S():
            actions = util.classes.NumMap()
            for a in model.A(s):
                actions[a] = self.Q_value[(s, a)]
            policy[s] = actions.argmax()
        agent = mdp.agent.MapAgent(policy)

    # Estimate the learner's feature expectations by sampling trajectories
    samples = self.generate_samples(model, agent, initial, true_samples_len)
    _mu = self.feature_expectations(model, samples)
    print(w, mu_E - _mu)
    return -(mu_E - _mu)
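# Illustrative sketch under toy assumptions, not this solver's update rule: for a
# linear reward R(s,a) = w . phi(s,a), the MaxEnt gradient with respect to w is
# mu_E - mu_w (expert minus learner feature expectations), so the negated value
# returned above can drive either a minimizer like fmin_bfgs or a plain ascent
# loop such as this one. The callable, learning rate, and step count are assumptions.
import numpy as np

def _toy_weight_ascent(mu_E, feature_expectations_of, w0, lr=0.1, n_steps=100):
    """feature_expectations_of: w -> (k,) learner feature expectations."""
    w = np.asarray(w0, dtype=float)
    for _ in range(n_steps):
        grad = -(np.asarray(mu_E) - feature_expectations_of(w))  # same sign convention as above
        w = w - lr * grad                                        # i.e. w += lr * (mu_E - mu_w)
    return w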
def solve(self, model):
    '''Returns a map of (state, action) => q-value determined by this solver'''
    # Run Q-value iteration for a fixed number of sweeps
    Q = util.classes.NumMap()
    for i in range(self._max_iter):
        Q = self.iter(model, Q)

    # Compute V(s) = max_{a} Q(s,a)
    V = util.classes.NumMap()
    for s in model.S():
        V_s = util.classes.NumMap()
        for a in model.A(s):
            V_s[a] = Q[(s, a)]
        if len(V_s) > 0:
            V[s] = V_s.max()
        else:
            V[s] = 0.0

    # Normalize: return Q(s,a) - V(s) for each (s,a)
    returnQ = util.classes.NumMap()
    for (sa, value) in Q.iteritems():
        returnQ[sa] = value - V[sa[0]]
    return returnQ
def init():
    global pub
    global all_stop_pub
    global pub_status
    global model
    global policy
    global lastpublish
    global savedPose
    global amclmessageReceived
    global ground_truth
    global add_delay
    global otherPose
    global lastGoal
    global go
    global maxsightdistance
    global dr_client
    global home
    global timestep
    global current_cg, curr_st, next_st

    rospy.init_node('patrolcontrol')

    global recordConvTraj, useRegions
    recordConvTraj = 0
    useRegions = 0
    recordConvTraj = int(rospy.get_param("~recordConvTraj"))
    observableStates = int(rospy.get_param("~observableStates"))
    useRegions = int(rospy.get_param("~useRegions"))
    data_log = str(rospy.get_param("~dataLog"))

    rospy.Subscriber("amcl_pose", PoseWithCovarianceStamped, handle_pose)
    pub = rospy.Publisher("move_base_simple/goal", PoseStamped)
    all_stop_pub = rospy.Publisher("all_stop_enabled", std_msgs.msg.Bool)
    initialpose = rospy.Publisher("initialpose", PoseWithCovarianceStamped)
    rospy.Subscriber("base_pose_ground_truth", Odometry, save_ground_truth)
    pub_status = rospy.Publisher("/study_attacker_status", String, latch=True)

    global skip_interaction
    if not skip_interaction:
        if add_delay:
            rospy.Subscriber("/robot_0/base_pose_ground_truth", Odometry, handle_other)
        else:
            rospy.Subscriber("/robot_1/base_pose_ground_truth", Odometry, handle_other)

    random.seed()
    np.random.seed()

    ## Initialize constants
    p_fail = 0.05
    longHallway = 10
    shortSides = 2
    patrolAreaSize = longHallway + shortSides + shortSides
    observableStateLow = patrolAreaSize / 2 - 1
    observableStateHigh = patrolAreaSize / 2
    observableStateLow = 0
    observableStateHigh = patrolAreaSize

    # calculate farness for each node in the patrolled area
    farness = np.zeros(patrolAreaSize)
    for i in range(patrolAreaSize):
        sum = 0
        for j in range(patrolAreaSize):
            sum += abs(i - j)
        farness[i] = sum

    global m
    global ogmap
    current_cg = None

    ## Create reward function and model for the selected map
    if m == "boydright":
        mapparams = boydrightMapParams(False)
        ogmap = OGMap(*mapparams)
        ## Create Model
        model = patrol.model.PatrolModel(p_fail, None, ogmap.theMap())
        model.gamma = 0.9
    elif m == "largeGridPatrol":
        current_cg = 4
        mapparams = largeGridMapParams(False)
        ogmap = OGMap(*mapparams)
        ## Create Model
        # model = patrol.model.PatrolModel(p_fail, PatrolState(np.array( [6, 8, 0] )), ogmap.theMap())
        model = patrol.model.PatrolExtendedModel(0.01,
                                                 PatrolExtendedState(np.array([-1, -1, -1]), 0),
                                                 ogmap.theMap())
        model.gamma = 0.9
    elif m == "boydright2":
        mapparams = boydright2MapParams(False)
        ogmap = OGMap(*mapparams)
        ## Create Model
        model = patrol.model.PatrolModel(p_fail, None, ogmap.theMap())
        model.gamma = 0.9
    else:
        mapparams = boyd2MapParams(False)
        ogmap = OGMap(*mapparams)
        ## Create Model
        model = patrol.model.PatrolModel(p_fail, None, ogmap.theMap())
        model.gamma = 0.9

    # Call external solver here instead of the Python-based one
    args = ["boydsimple_t", ]
    import subprocess
    p = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdin = m + "\n"
    stdin += "1.0\n"
    (transitionfunc, stderr) = p.communicate(stdin)

    args = ["boydpatroller", ]
    p = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdin = m + "\n"
    stdin += str(useRegions) + "\n"
    stdin += transitionfunc

    f = open(home + "/patrolstudy/toupload/patrollerT.log", "w")
    f.write(stdin)
    f.close()

    (stdout, stderr) = p.communicate(stdin)
    print("\n computed patroller's policy")

    if m == "largeGridPatrol":
        correctpolicy = home + "/patrolstudy/largeGridpolicy_mdppatrol"
    if m == "boydright":
        correctpolicy = home + "/patrolstudy/boydright_policy"
    if m == "boydright2":
        correctpolicy = home + "/patrolstudy/boydright2_policy"
    if m == "boyd2":
        correctpolicy = home + "/patrolstudy/boydpolicy_mdppatrolcontrol"

    # stdout now needs to be parsed into a hash of state => action, which is then
    # sent to a map agent
    # correctpolicy = "/home/saurabh/patrolstudy/largeGridpolicy_mdppatrol"
    f = open(correctpolicy, "w")
    f.write("")
    f.close()
    f = open(correctpolicy, "w")

    outtraj = ""
    p = {}
    stateactions = stdout.split("\n")
    for stateaction in stateactions:
        temp = stateaction.split(" = ")
        if len(temp) < 2:
            continue
        state = temp[0]
        action = temp[1]
        outtraj += state + " = " + action + "\n"

        # ps = patrol.model.PatrolState(np.array([int(pieces[0]), int(pieces[1]), int(pieces[2])]))
        if m == "largeGridPatrol":
            state = state.split(";")
            loc = state[0]
            current_goal = int(state[1])
            loc = loc[1:len(loc) - 1]
            pieces = loc.split(",")
            location = np.array([int(pieces[0]), int(pieces[1]), int(pieces[2])])
            ps = patrol.model.PatrolExtendedState(location, current_goal)
        else:
            state = state[1:len(state) - 1]
            pieces = state.split(",")
            ps = patrol.model.PatrolState(np.array([int(pieces[0]), int(pieces[1]), int(pieces[2])]))

        if action == "MoveForwardAction":
            a = patrol.model.PatrolActionMoveForward()
        elif action == "TurnLeftAction":
            a = patrol.model.PatrolActionTurnLeft()
        elif action == "TurnAroundAction":
            a = patrol.model.PatrolActionTurnAround()
        elif action == "TurnRightAction":
            a = patrol.model.PatrolActionTurnRight()
        else:
            a = patrol.model.PatrolActionStop()
        p[ps] = a

    f.write(outtraj)
    f.close()
    # pub_status.publish(String("abortrun"))

    from mdp.agent import MapAgent
    policy = MapAgent(p)

    # sample trajectories for m == "largeGridPatrol"
    global recordConvTraj
    if m == "largeGridPatrol" and recordConvTraj == 1:
        from util.classes import NumMap
        state_dict = {}
        for s in model.S():
            if s.location[0] <= 7 and s.location[1] <= 1:
                state_dict[s] = 1.0
        # state_dict = { PatrolState([0,0,0]):1.0 }
        initial = NumMap(state_dict).normalize()

        # data_log = "/home/saurabh/patrolstudy/toupload/recd_convertedstates_14_14_200states_LGR.log"
        f = open(data_log, "w")
        from mdp.simulation import sample_traj
        n_samples = 45
        t_max = 50
        outtraj = ""
        # outtraj2 = "observableStates: "
        # outtraj2 += str(observableStates)
        # s = ogmap.toState((6, 0, 0), False)
        # s = PatrolState(np.array( [6, 0, 0] ))
        # outtraj2 += str(largeGridisvisible(s, observableStates))
        # pub_status.publish(String("datacollected"))
        pair_ct = 0
        for i in range(n_samples):
            traj_list = sample_traj(model, t_max, initial, policy)
            for (s, a, s_p) in traj_list:
                # outtraj2 += str(s.location[0:3])
                # outtraj2 += str(largeGridisvisible(s, observableStates))+"\n"
                if largeGridisvisible(s, observableStates):
                    s = s.location[0:3]
                    # print(s)
                    # print(sap[1].__class__.__name__)
                    outtraj += str(s[0]) + "," + str(s[1]) + "," + str(s[2]) + ","
                    if a.__class__.__name__ == "PatrolActionMoveForward":
                        outtraj += "PatrolActionMoveForward"
                    elif a.__class__.__name__ == "PatrolActionTurnLeft":
                        outtraj += "PatrolActionTurnLeft"
                    elif a.__class__.__name__ == "PatrolActionTurnRight":
                        outtraj += "PatrolActionTurnRight"
                    elif a.__class__.__name__ == "PatrolActionTurnAround":
                        outtraj += "PatrolActionTurnAround"
                    else:
                        outtraj += "PatrolActionStop"
                else:
                    outtraj += "None"
                outtraj += "\n"
                pair_ct += 1
                # outtraj2 += "\n"
                if pair_ct >= 200:
                    outtraj += "ENDREC\n"
                    pair_ct = 0
        f.write(outtraj)
        # f.write(outtraj2)
        f.close()

    # Alternative Python-based solvers (unused):
    # policy = mdp.solvers.PolicyIteration(30, mdp.solvers.IteratingPolicyEvaluator2(.5)).solve(model)
    # policy = mdp.solvers.PolicyIteration(30, mdp.solvers.IteratingPolicyEvaluator(20)).solve(model)
    # policy = mdp.solvers.ValueIteration(.5).solve(model)
    print("Finished solving mdp")

    has_slowed_down = False
    dr_client = dynamic_reconfigure.client.Client("move_base_node/TrajectoryPlannerROS")
    print("dr_client initialized")

    r = rospy.Rate(8)
    while not rospy.is_shutdown():
        # print "savedPose is not None and otherPose is not None and canSee(savedPose, otherPose)"
        # print (savedPose is not None and otherPose is not None and canSee(savedPose, otherPose))
        if m == "largeGridPatrol":
            print current_cg

        if savedPose is not None and otherPose is not None and canSee(savedPose, otherPose):
            # The other robot is visible: slow down and steer around it
            if not add_delay:
                params = {'max_vel_x': 0.2}
                config = dr_client.update_configuration(params)
                has_slowed_down = True
            if not add_delay and lastGoal is not None:
                a = otherPose.orientation
                angles2 = tf.transformations.euler_from_quaternion([a.x, a.y, a.z, a.w])
                if angles2[2] < 0:
                    angles2 = (0, 0, 2 * math.pi + angles2[2])
                otherState = ogmap.toState((otherPose.position.x, otherPose.position.y, angles2[2]), False)
                if lastGoal.conflicts(otherState):
                    # if rospy.get_time() - lastpublish >= timestep:
                    # Walk the policy forward until the goal no longer conflicts
                    lg = lastGoal
                    while lg.conflicts(otherState):
                        if m == "largeGridPatrol":
                            curr_st = PatrolExtendedState(lg.location, current_cg)
                            # print "(lg.conflicts(otherState)) next_st = policy.actions(curr_st).keys()[0].apply(curr_st)"
                            next_st = policy.actions(curr_st).keys()[0].apply(curr_st)
                            lg = PatrolState(next_st.location)
                        else:
                            lg = policy.actions(lg).keys()[0].apply(lg)
                    sendGoal(lg)
                else:
                    nextGoal()
            if add_delay and go:
                go = False
                topub = std_msgs.msg.Bool()
                topub.data = True
                all_stop_pub.publish(topub)
        elif savedPose is not None:
            # The other robot is not visible: restore speed and continue patrolling
            if otherPose is not None and not canSee(savedPose, otherPose) and has_slowed_down:
                params = {'max_vel_x': 0.5}
                config = dr_client.update_configuration(params)
                has_slowed_down = False
            # print "nextGoal() in elif savedPose is not None"
            nextGoal()

        time.sleep(0.25)
def solve(self, model, initial, true_samples, other_policy):
    '''
    Returns a pair (agent, weights) where the agent attempts to generalize
    from the behavior observed in the samples, and weights is what was
    combined with the MDP to generate the agent.

    model: an MDP with a linear reward function. Parameters WILL be overwritten.
    initial: initial distribution over states
    true_samples: a list of sample trajectories [ (s_t, a_t) ] of the
        (supposedly) optimal policy.
    other_policy: stored on the solver as self.other_policy.
    '''
    # Initial weight vector
    # w_0 = model.feature_function.params
    self.other_policy = other_policy

    # Uniform distribution over all states
    self.full_initial = util.classes.NumMap()
    for s in model.S():
        self.full_initial[s] = 1.0
    self.full_initial = self.full_initial.normalize()

    # Compute feature expectations of the expert = mu_E from the samples
    mu_E = self.feature_expectations2(model, true_samples)
    print("True Samples", mu_E)

    # Pick random policy pi^(0)
    agent = mdp.agent.RandomAgent(model.A())

    # Calculate feature expectations of pi^(0) = mu^(0)
    samples = self.generate_samples(model, agent, initial, len(true_samples[0]))
    mu = self.feature_expectations(model, samples)
    # mu = self.feature_expectations2(model, initial, agent )

    lastT = 0
    for i in range(self._max_iter):
        # Perform projection to new weights w^(i)
        if i == 0:
            mu_bar = mu
        else:
            mmmb = mu - mu_bar
            mu_bar = mu_bar + numpy.dot(mmmb, mu_E - mu_bar) / numpy.dot(mmmb, mmmb) * mmmb
        w = mu_E - mu_bar
        t = numpy.linalg.norm(mu_E - mu_bar)
        w[0] = abs(w[0])
        print(w)
        model.reward_function.params = w

        print 'IRLApproxSolver Iteration #{}, t = {:4.4f}'.format(i, t)
        if t < self._epsilon:
            break
        if abs(t - lastT) < .000001:
            break
        lastT = t

        # Compute optimal policy using R(s,a) = dot( feature_f(s,a), w^(i) )
        if numpy.linalg.norm(mu) == 0:
            agent = mdp.agent.RandomAgent(model.A())
        else:
            agent = self._solver.solve(model)

        # Compute feature expectations of pi^(i) = mu^(i)
        samples = self.generate_samples(model, agent, initial, len(true_samples[0]))
        mu = self.feature_expectations(model, samples)
        print(mu)
        # mu = self.feature_expectations2(model, initial, agent)

    # Restore initial weight vector
    # model.feature_function.params = w_0
    return (agent, w)
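# Illustrative sketch with assumed toy vectors: the projection step used in the
# loop above, in the style of the projection variant of apprenticeship learning
# (Abbeel and Ng). mu_bar is moved toward the new feature expectations mu along
# the connecting line, and the weight vector is the residual mu_E - mu_bar.
import numpy as np

def _toy_projection_step(mu_E, mu, mu_bar):
    d = mu - mu_bar                     # direction from old mu_bar toward mu
    mu_bar_new = mu_bar + (np.dot(d, mu_E - mu_bar) / np.dot(d, d)) * d
    w = mu_E - mu_bar_new               # new reward weights
    t = np.linalg.norm(w)               # margin; iteration stops when t < epsilon
    return w, t, mu_bar_new

# Example with 3-dimensional feature expectations:
# mu_E = np.array([1.0, 0.5, 0.2])
# mu_bar0 = np.array([0.2, 0.2, 0.2])
# w, t, mu_bar = _toy_projection_step(mu_E, np.array([0.4, 0.3, 0.1]), mu_bar0)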
def stupidPythonIdiots():
    global resetPub
    global perceptPub
    global calcPub
    global mdpId
    global states
    global actions

    print(mdpId)
    rospy.wait_for_service('irlsimulate')
    initService()
    initNode()
    print(mdpId)

    p_fail = 0.05
    longHallway = 10
    shortSides = 4
    patrolAreaSize = longHallway + shortSides + shortSides
    observableStateLow = 7
    observableStateHigh = 8

    # calculate farness for each node in the patrolled area
    farness = np.zeros(patrolAreaSize)
    for i in range(patrolAreaSize):
        sum = 0
        for j in range(patrolAreaSize):
            sum += abs(i - j)
        farness[i] = sum

    ## Create reward function
    reward = patrol.reward.PatrolReward(patrolAreaSize, farness, observableStateLow, observableStateHigh)
    reward_weights = np.zeros(reward.dim)
    reward_weights[0] = .2
    reward_weights[1] = .35
    reward_weights[2] = .45
    reward_weights[3] = 0
    reward_weights[4] = 0
    reward.params = reward_weights

    ## Create Model
    model = patrol.model.PatrolModel(p_fail, longHallway, shortSides)
    model.reward_function = reward
    model.gamma = 0.999
    states = model.S()
    actions = model.A()

    ## Create initial distribution
    initial = util.classes.NumMap()
    for s in model.S():
        initial[s] = 1.0
    initial = initial.normalize()

    ## Define feature function (approximate methods only)
    # feature_function = mdp.etc.StateActionFeatureFunction(model)
    # feature_function = mdp.etc.StateFeatureFunction(model)
    # feature_function = gridworld.etc.GWLocationFF(model)

    ## Define player
    # policy = mdp.agent.HumanAgent(model)
    opt_policy = mdp.solvers.ValueIteration(50).solve(model)

    # Publish the observable portion of a simulated trajectory as percepts
    j = 0
    for (s, a, r) in mdp.simulation.simulate(model, opt_policy, initial, 68):
        if s.location[0] < observableStateLow:
            pass
        elif s.location[0] > observableStateHigh:
            pass
        else:
            perceptPub.publish(percept(mdpId=mdpId, state=stateToId(s), action=actionToId(a), time=j))
        j += 1

    # Distribution concentrated on the center of the observable window
    centerObs = util.classes.NumMap()
    for s in model.S():
        centerObs[s] = 0
        if s.location[0] == (observableStateLow + observableStateHigh) / 2:
            centerObs[s] = 1
    centerObs = centerObs.normalize()

    s = mdpId
    calcPub.publish(String(s))
    raw_input("Percepts Sent, Press Enter to continue...")

    # Retrieve the estimated policy from the IRL service and compare it to pi*
    policyPxy = rospy.ServiceProxy('irlpolicy', policy)
    est_p = policyPxy(policyRequest(mdpId))
    est_policy = util.classes.NumMap()
    for (i, a) in enumerate(est_p.policy):
        est_policy[idToState(i)] = idToAction(a)

    mdp.etc.policy_report(opt_policy, est_policy, mdp.solvers.ExactPolicyEvaluator(), model, centerObs)
    for s in model.S():
        print 's = %s, pi*(s) = %s, pi_E(s) = %s' % (s, opt_policy.actions(s), est_policy.actions(s))
    print 'pi* and pi_E disagree on {} of {} states'.format(
        len([s for s in model.S() if opt_policy.actions(s) != est_policy.actions(s)]),
        len(model.S()))

    simulatePxy = rospy.ServiceProxy('irlsimulate', simulate)
    enc_policy = simulatePxy(simulateRequest(mdpId)).state_actions