class Simulator:

    def __init__(self, actions, observations, goal, occlusions,
                 discount=1.0, xsize=15, ysize=15):
        self.NumActions = actions
        self.NumObservations = observations
        self.Discount = discount
        self.Knowledge = Knowledge()
        self.RewardRange = 100
        self.GoalPos = goal
        self.XSize = xsize
        self.YSize = ysize
        self.Occlusions = occlusions
        self.ASTAR = AStar(self.XSize, self.YSize, self.Occlusions)

        assert 0 < self.Discount <= 1.0

    def Validate(self, state):
        return True

    def LocalMove(self, state, history, stepObs, status):
        return 1

    def GenerateLegal(self, state, history, actions, status):
        # By default every action is legal; subclasses can restrict this.
        for a in range(self.NumActions):
            actions.append(a)
        return actions

    def GeneratePreferred(self, state, history, actions, status):
        return actions

    def SelectRandom(self, state, history, status):
        # Prefer smart rollout actions, then legal ones, then uniform random.
        if self.Knowledge.RolloutLevel >= MOVES.SMART:
            actions = self.GeneratePreferred(state, history, [], status)
            if actions:
                return actions[Random(0, len(actions))]

        if self.Knowledge.RolloutLevel >= MOVES.LEGAL:
            actions = self.GenerateLegal(state, history, [], status)
            if actions:
                return actions[Random(0, len(actions))]

        return Random(0, self.NumActions)

    def SelectASTARRandom(self, state, history, status):
        # With probability 0.5 follow the A* path towards the goal,
        # otherwise fall back to the random rollout policy.
        if Bernoulli(0.5):
            self.ASTAR.__init__(self.XSize, self.YSize, self.Occlusions)
            self.ASTAR.InitializeGrid(state.AgentPos.Copy(), self.GoalPos.Copy())
            path = self.ASTAR.Solve()
            pathPos = COORD(path[1][0], path[1][1])
            for action in range(self.NumActions):
                agentPos = state.AgentPos.Copy()
                nextAgentPos = agentPos + Compass[action]
                if nextAgentPos == pathPos:
                    break
            return action

        return self.SelectRandom(state, history, status)

    def Prior(self, state, history, vnode, status):
        # Initialise the children of a new search-tree node according to
        # the configured knowledge level.
        if self.Knowledge.TreeLevel == MOVES.PURE:
            vnode.SetChildren(0, 0)
        else:
            vnode.SetChildren(LargeInteger, -Infinity)

        if self.Knowledge.TreeLevel >= MOVES.LEGAL:
            actions = self.GenerateLegal(state, history, [], status)
            for action in actions:
                qnode = vnode.Child(action)
                qnode.Value.Set(0, 0)
                qnode.AMAF.Set(0, 0)
                vnode.Children[action] = qnode

        if self.Knowledge.TreeLevel >= MOVES.SMART:
            actions = self.GeneratePreferred(state, history, [], status)
            if actions:
                for action in actions:
                    qnode = vnode.Child(action)
                    qnode.Value.Set(self.Knowledge.SmartTreeCount,
                                    self.Knowledge.SmartTreeValue)
                    qnode.AMAF.Set(self.Knowledge.SmartTreeCount,
                                   self.Knowledge.SmartTreeValue)
                    vnode.Children[action] = qnode

        return vnode

    # ---- Displays

    def DisplayBeliefs(self, beliefState):
        return

    def DisplayState(self, state):
        return

    def DisplayAction(self, state, action):
        print("Action ", action)

    def DisplayObservation(self, state, observation):
        print("Observation ", observation)

    def DisplayReward(self, reward):
        print("Reward ", reward)

    # ---- Accessors

    def SetKnowledge(self, knowledge):
        self.Knowledge = knowledge

    def GetNumActions(self):
        return self.NumActions

    def GetNumObservations(self):
        return self.NumObservations

    def GetDiscount(self):
        return self.Discount

    def GetHyperbolicDiscount(self, t, kappa=0.277, sigma=1.0):
        return 1.0 / np.power(1.0 + kappa * float(t), sigma)

    def GetRewardRange(self):
        return self.RewardRange

    def GetHorizon(self, accuracy, undiscountedHorizon):
        if self.Discount == 1:
            return undiscountedHorizon
        return np.log(accuracy) / np.log(self.Discount)
class Experiment:

    def __init__(self, real):
        self.Real = real
        self.ASTAR = AStar(self.Real.XSize, self.Real.YSize, self.Real.Occlusions)
        self.PRQL = PRQL(self.Real)

    def Run(self, policy):
        undiscountedReturn = 0.0
        discountedReturn = 0.0
        discount = 1.0

        state = self.Real.CreateStartState()
        currentState = self.Real.Copy(state)

        t = 0
        while True:
            try:
                action = policy[t]
            except IndexError:
                # The policy has run out of steps; fall back to the first
                # action that follows the A* path towards the goal.
                self.ASTAR.__init__(self.Real.XSize, self.Real.YSize,
                                    self.Real.Occlusions)
                self.ASTAR.InitializeGrid(state.AgentPos.Copy(),
                                          self.Real.GoalPos.Copy())
                path = self.ASTAR.Solve()
                pathPos = COORD(path[1][0], path[1][1])
                for action in range(self.Real.NumActions):
                    agentPos = state.AgentPos.Copy()
                    nextAgentPos = self.Real.NextPos(agentPos, action)
                    if nextAgentPos == pathPos:
                        break

            terminal, state, observation, reward = self.Real.Step(state, action)
            currentState = self.Real.Copy(state)

            undiscountedReturn += reward
            discountedReturn += reward * discount
            discount *= self.Real.GetDiscount()

            if terminal:
                return reward
            t += 1

        return None

    def DiscountedReturn(self, policies, predatorHome, occlusions):
        if not policies:
            return 0.0

        self.Real.__init__(self.Real.XSize, self.Real.YSize, occlusions=occlusions)
        self.Real.PredatorHome = predatorHome
        self.PRQL.__init__(self.Real, policies=policies)

        newPolicyLibrary = self.PRQL.RemoveDuplicates(policies)
        self.PRQL.PolicyLibrary = newPolicyLibrary

        survivalrates = 0
        trajGraphDistances = []
        for j in range(2):
            terminalRewards = []
            successTrajectoryIndices = []
            # Collect N episodes that produce a terminal reward; episodes
            # that do not are re-run rather than counted.
            i = 0
            while i < self.PRQL.N:
                self.PRQL.PolicyIndex = self.PRQL.ChoosePolicy()
                policy = self.PRQL.PolicyLibrary[self.PRQL.PolicyIndex]
                terminalReward = self.Run(policy)
                if not terminalReward:
                    continue
                terminalRewards.append(terminalReward)
                if terminalReward > 0:
                    successTrajectoryIndices.append(self.PRQL.PolicyIndex)
                i += 1

            survivalrates += float(
                sum(reward > 0 for reward in terminalRewards)) / float(
                    len(terminalRewards))
            trajGraphDistances.append(
                self.PRQL.gDistance(successTrajectoryIndices))

        return survivalrates / 2., np.mean(trajGraphDistances)
def GetPath(occlusions):
    ASTAR = AStar(XSize, YSize, occlusions)
    ASTAR.InitializeGrid(AgentHome, GoalPos)
    path = ASTAR.Solve()
    return path
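
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): a standalone
# comparison of the geometric discount used by GetDiscount()/GetHorizon()
# with the hyperbolic discount implemented in GetHyperbolicDiscount().
# The kappa/sigma values mirror that method's defaults; the gamma and
# accuracy values below are assumed purely for this demonstration.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import numpy as np

    kappa, sigma = 0.277, 1.0   # defaults of GetHyperbolicDiscount
    gamma = 0.95                # assumed geometric discount factor
    accuracy = 0.01             # assumed accuracy for the horizon example

    for t in range(0, 11, 2):
        geometric = gamma ** t
        hyperbolic = 1.0 / np.power(1.0 + kappa * float(t), sigma)
        print("t=%2d  geometric=%.3f  hyperbolic=%.3f"
              % (t, geometric, hyperbolic))

    # Horizon needed to reach the given accuracy under geometric
    # discounting, matching GetHorizon(): log(accuracy) / log(discount).
    print("horizon:", np.log(accuracy) / np.log(gamma))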