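# ModelAgent couples a HORDQ value learner (self.agent) with per-action
# dynamics models learned as Orange decision trees (self.DynamicLearner) and a
# SARSA-style reward model (self.rewardAgent).  Until the dynamics model has
# data (isModelReady), it behaves like the plain SARSA/HORDQ learner; once the
# model is ready, actions are chosen by planning over the learned model, with
# an epsilon-greedy fallback to random actions.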
class ModelAgent(Agent):
    def __init__(self):
        self.actionList = getAllAction()
        gamma = 0.8
        alpha = 0.05
        #initialQ = MaxStepReward/(1-gamma)
        initialQ = 0
        dumpCount = 100000
        #self.agent = LinearHORDQ(0.05, 0.1, 0.8, self.actionList, initialQ, dumpCount, pseudoReward)
        self.agent = LambdaHORDQ(alpha, 0.05, gamma, self.actionList, initialQ, dumpCount, 0)
        self.rewardAgent = RewardSarsa(alpha, 0.05, 0, self.actionList, initialQ, dumpCount)
        #self.agent = LambdaSARSA(0.10, 0.05, 0.90, actionList, initialQ, dumpCount)
        self.totalStep = 0
        #self.rewardList = []
        self.distList = []
        self.feaList = {}
        for action in self.actionList:
            self.feaList[action] = []
        self.rewardFeaList = []
        self.episodeNum = -1
        self.lastPlan = []
        self.DynamicLearner = {}
        self.obsList = [] #TODO: remove me

    def setParam(self, epsilon, pseudoReward, type):
        #self.agentType = SarsaAgent
        self.agentType = type
        self.initPseudoReward = pseudoReward
        self.epsilon = epsilon
        #too hacky
        if self.agentType == AgentType.SarsaAgent:
            self.HORDQ_episilon = self.epsilon
        else:
            self.HORDQ_episilon = 0.00 #disable exploration for HORDQ + model-based agent
        self.agent.pseudoReward = self.initPseudoReward

    def AgentType(self):
        return self.agentType

    #return True when we can start using our model
    def isModelReady(self):
        if self.AgentType() == AgentType.SarsaAgent:
            return False
        if self.DynamicLearner == {} or self.DynamicLearner[0].empty():
            return False
        return True

    #prune both dynamic and reward data
    def prune(self):
        for action in self.actionList:
            self.feaList[action], self.DynamicLearner[action].treeList = self.DynamicLearner[action].prune(self.feaList[action])
            print "action learner ", action, " nodes: ", orngTree.countNodes(self.DynamicLearner[action].treeList[0])
            print "action learner leaves: ", self.DynamicLearner[action].treeList[0].count_leaves()
        #self.rewardFeaList, self.RewardLearner.treeList = self.RewardLearner.prune(self.rewardFeaList)
        #print "reward learner nodes: ", orngTree.countNodes(self.RewardLearner.treeList[0])
        #print "reward learner leaves: ", self.RewardLearner.treeList[0].count_leaves()

    def planning(self, state, initActionRange):
        if len(initActionRange) == 1: #nothing to choose
            return initActionRange[0]
        MaxNode = 300
        path = Optimize(state, self.DynamicLearner, self.rewardAgent, MaxNode, self.lastPlan, initActionRange)
        self.lastPlan = path
        return path[0]

    def initLearner(self):
        if self.DynamicLearner == {}:
            commonVar = getCommonVar()
            classVarList = getClassVar()
            #rewardVar = orange.FloatVariable("reward")
            #self.RewardLearner = Learner(commonVar, [rewardVar], 6000)
            commonVar.pop(0)
            for action in self.actionList:
                if action == 9:
                    maxFeature = 10000
                else:
                    maxFeature = 3000
                self.DynamicLearner[action] = Learner(commonVar, classVarList, maxFeature)

    def agent_init(self, taskSpecString):
        print "init type", self.agentType
        print "pseudo reward: ", self.initPseudoReward
        #parse action
        print "begin: ", self.totalStep
        feaNum = len(self.feaList[9])
        print "feaNum", feaNum
        print "SARSA Num:", len(self.agent.Q)
        self.initLearner()
        if feaNum == 0:
            return
        if not self.AgentType() == AgentType.SarsaAgent:
            self.prune()

    def agent_start(self, obs):
        state = WorldState(obs)
        self.lastState = state
        fea = getSarsaFeature(state, NoTask)
        if self.isModelReady():
            self.agent.epsilon = self.HORDQ_episilon
            possibleAction = self.agent.getPossibleAction(fea)
            action = self.planning(state, possibleAction)
            action = self.agent.start(fea, action)
        else:
            if self.AgentType() == AgentType.SarsaAgent:
                self.agent.epsilon = self.HORDQ_episilon
            else:
                self.agent.epsilon = 0.05 #encourage exploration
            action = self.agent.start(fea, NoTask)
        rewardFea = getRewardFeature(state, NoTask)
        self.rewardAgent.start(rewardFea, action)
        self.stepNum = 0
        self.lastAction = action
        self.lastLastAction = action #guard: agent_end reads this even if the episode ends before the first step
        self.distList.append(()) #put a dummy one
        self.episodeNum = self.episodeNum + 1
        return makeAction(action)

    def agent_step(self, reward, obs):
        #self.obsList.append(obs)
        #if reward < -0.01 + epsilon and reward > -0.01 - epsilon:
            #reward = -1
        state = WorldState(obs)
        fea = getSarsaFeature(state, self.lastAction)
        lastMario = self.lastState.mario
        mario = state.mario
        #for internal reward system
        dx = mario.x - lastMario.x
        reward = reward + dx
        modelReward = 0
        if isMarioInPit(state):
            print "in pit !!!!!!!"
            #reward = reward + InPitPenalty #no pit penalty for HORDQ
            modelReward = InPitPenalty
        if not self.isModelReady():
            #fea = getSarsaFeature(obs)
            action = self.agent.step(reward, fea, NoTask)
        else:
            #epsilon-greedy policy
            if random.random() < self.epsilon:
                #select randomly
                action = self.actionList[int(random.random()*len(self.actionList))]
                print "random!!"
            else:
                possibleAction = self.agent.getPossibleAction(fea)
                #if fea[0] == (): #if no monster is around, pass control to the planner
                    #possibleAction = self.actionList
                action = self.planning(state, possibleAction)
                print "planning", action
            self.agent.pseudoReward = 10000
            action = self.agent.step(reward, fea, action)
            self.agent.pseudoReward = self.initPseudoReward
        #state.dump()
        print "step loc:", self.stepNum, " ", mario.x, " ", mario.y, " ", mario.sx, " ", mario.sy
        #state.path = []
        #state.reward = 0
        #nextState, isValid = ExpandPath([0], state, self.DynamicLearner, self.RewardLearner)
        #nextState.dump()
        #print "pred loc:", nextState.mario.x, " ", nextState.mario.y, " ", nextState.mario.sx, " ", nextState.mario.sy
        #print "backoff reward: ", nextState.reward
        #nextState, isValid = ExpandPath([action], state, self.DynamicLearner, self.RewardLearner)
        #nextState.dump()
        #print "pred loc:", nextState.mario.x, " ", nextState.mario.y, " ", nextState.mario.sx, " ", nextState.mario.sy
        #print "pred reward:", action, " ", nextState.reward
        lastActionId = self.lastAction
        deltaX = mario.x - (lastMario.x + lastMario.sx)
        deltaY = mario.y - (lastMario.y + lastMario.sy)
        aX = mario.sx - lastMario.sx
        aY = mario.sy - lastMario.sy
        classVar = [round(aX, Precision), round(aY, Precision), round(deltaX, Precision), round(deltaY, Precision)]
        rewardClassVar = [round(modelReward, 0)]
        modelFea = getModelFeature(self.lastState, classVar)
        #rewardFea = getTrainFeature(self.lastState, rewardClassVar, lastActionId) #don't learn the pseudo reward
        if self.isModelReady():
            #TODO: too dirty
            #predictModelClass = self.DynamicLearner[lastActionId].getClass(modelFea)
            #predictModelClass = [round(v, 1) for v in predictModelClass]
            #print "feature: ", lastActionId, " ", modelFea
            #print "predict: ", predictModelClass
            predictModelClass = self.DynamicLearner[lastActionId].getClass(modelFea)
            predictModelClass = [round(v, 1) for v in predictModelClass]
            roundClassVar = [round(v, 1) for v in classVar]
            print "feature: ", lastActionId, " ", modelFea
            print "predict: ", predictModelClass
            if not roundClassVar == predictModelClass:
                self.feaList[lastActionId].append(modelFea)
            else:
                print "pass model-------------"
        else:
            if not self.AgentType() == AgentType.SarsaAgent:
                self.feaList[lastActionId].append(modelFea)
        rewardFea = getRewardFeature(state, self.lastAction)
        print "before pre reward: ", self.rewardAgent.getQ(rewardFea, action)
        self.rewardAgent.step(rewardFea, modelReward, action)
        print "pre reward: ", self.rewardAgent.getQ(rewardFea, action)
print "reward: ", modelReward #if self.isModelReady(): #predictRewardClass = self.RewardLearner.getClass(rewardFea) #predictRewardClass = [round(v, 0) for v in predictRewardClass] #print "reward: ", modelReward #print "pre reward: ", predictRewardClass #if not rewardClassVar == predictRewardClass: #self.rewardFeaList.append(rewardFea) #else: #print "pass reward-------------" #else: #if not self.AgentType() == SarsaAgent: #self.rewardFeaList.append(rewardFea) self.lastState = state self.lastLastAction = self.lastAction self.lastAction = action self.stepNum = self.stepNum + 1 self.distList[len(self.distList)-1] = (self.totalStep + self.stepNum, self.lastState.mario.x, self.episodeNum, 0) return makeAction(action) def agent_end(self, reward): modelReward = reward if reward == -10.0: reward = DeathPenalty modelReward = InPitPenalty lastActionId = self.lastAction #rewardFea = getTrainFeature(self.lastState, [round(modelReward, 0)], lastActionId) #don't learn the pseudo reward #if not self.AgentType() == SarsaAgent: #self.rewardFeaList.append(rewardFea) #if self.isModelReady(): #preReward, = self.RewardLearner.getClass(rewardFea) #print "pre reward: ", reward rewardFea = getRewardFeature(self.lastState, self.lastLastAction) print "before pre reward: ", self.rewardAgent.getQ(rewardFea, self.lastAction) self.agent.end(reward) self.rewardAgent.end(round(modelReward, 0)) print "pre reward: ", self.rewardAgent.getQ(rewardFea, self.lastAction) print "end: ", reward, " step: ", self.stepNum, " dist:", self.lastState.mario.x self.totalStep = self.totalStep + self.stepNum #if self.DynamicLearner.empty() or self.RewardLearner.empty(): #else: #self.agent.end(reward) #self.rewardList.append(reward) if reward < 10: #only the reward on the finish line counts reward = 0 self.distList[len(self.distList)-1] = (self.totalStep, self.lastState.mario.x, self.episodeNum, reward) #print the decision tree #treeList = getClassifier(self.feaList) #for tree in treeList: #printTree(treeList[2]) #res = classify(data, treeList) #if self.episodeNum % 10000 == 0: #print "dump:", self.episodeNum #tool.Save(self, "mario" + str(self.episodeNum) + ".db") def agent_cleanup(self): pass def agent_freeze(self): pass def agent_message(self, inMessage): pass