def valueIteration(discountFactor):
    """Run value iteration for the predator on the 11x11 wrap-around grid.

    A state is a ``(predloc, preyloc)`` pair of distinct grid cells.  Every
    sweep builds a fresh value table from the previous one; sweeping stops
    once the largest change within a sweep is no longer above 0.01.

    Args:
        discountFactor: factor applied to the value of successor states.

    Returns:
        A tuple ``(numIt, values, policy)``: the number of sweeps performed,
        the final value table keyed by ``(predloc, preyloc)``, and a function
        mapping a ``(predloc, preyloc)`` state to the best predator move
        recorded for it.
    """
    size = 11
    threshold = 0.01

    cells = [(col, row) for col in range(size) for row in range(size)]
    # Every ordered pair of distinct cells, predator position varying slowest.
    states = [(hunter, quarry)
              for hunter in cells
              for quarry in cells
              if hunter != quarry]

    values = dict.fromkeys(states, 0)
    bestMoves = {}
    agent = Agent(0, 0)
    history = []
    numIt = 0

    change = 1
    while change > threshold:
        change = 0
        updated = {}
        for state in states:
            here, target = state
            agent.setLocation(here)
            prey = Prey(*target)
            previous = values[state]

            # Pick the move with the highest expected value under the
            # previous sweep's table.
            topValue, topMove = 0, (0, 0)
            for _, step in agent.getMoveList():
                landing = ((here[0] + step[0]) % size,
                           (here[1] + step[1]) % size)
                if landing == target:
                    # Stepping onto the prey: fixed reward, no successor term.
                    expected = 10.0
                else:
                    expected = 0
                    for chance, nextPrey in prey.expand(landing):
                        expected += (chance * discountFactor
                                     * values[(landing, nextPrey)])
                # ">=" lets a later move replace an earlier one of equal value.
                if expected >= topValue:
                    topValue, topMove = expected, step

            updated[state] = topValue
            bestMoves[state] = topMove
            change = max(change, np.abs(topValue - previous))

        values = updated
        history.append(change)
        numIt += 1

    def policy(state):
        """Return the best move recorded for a ``(predloc, preyloc)`` state."""
        here, target = state
        agent.setLocation(here)
        # The prey object is built here but not consulted for the lookup.
        prey = Prey(*target)
        return bestMoves[(here, target)]

    return numIt, values, policy
def valueIteration(discountFactor=0.8, epsilon=0.01):
    """Run value iteration over relative predator/prey positions.

    The predator is fixed at (5, 5) and the prey is placed at every offset
    from it whose coordinates both lie in -5..5, except the zero offset.
    Value-table keys are whatever ``rewriteStates(predloc, preyloc)`` returns
    for those positions.

    Args:
        discountFactor: factor applied to successor-state values
            (default 0.8).
        epsilon: sweeping stops once the largest value change within a sweep
            is no longer above this threshold (default 0.01).  At least one
            sweep is always performed.

    Returns:
        A function taking a ``(predloc, preyloc)`` state and returning the
        best predator move recorded for ``rewriteStates(predloc, preyloc)``.
    """
    span = range(-5, 6)
    # the relative positions vary from -5 up to 5, in both dimensions
    alldiffs = [(dx, dy) for dx in span for dy in span if (dx, dy) != (0, 0)]
    values = {(dx, dy): 0 for dx in span for dy in span}
    bestMoves = {}
    agent = Agent(0, 0)

    # we place the predator in the middle of the world,
    # we are allowed to do this, since the positions are encoded relatively
    centre = (5, 5)

    while True:
        delta = 0
        # Keep the zero-offset key (value 0) in every sweep's table.  The
        # initial table contains it, but the loop below only writes keys for
        # non-zero offsets (assuming rewriteStates maps distinct cells to a
        # non-zero offset), so without this a lookup of it would succeed in
        # the first sweep and raise KeyError in later ones.
        newValues = {(0, 0): 0}
        for diff in alldiffs:
            preyloc = (centre[0] + diff[0], centre[1] + diff[1])
            curKey = rewriteStates(centre, preyloc)
            agent.setLocation(centre)
            prey = Prey(*preyloc)
            temp = values[curKey]

            bestVal = 0
            bestMove = (0, 0)
            for _, predMove in agent.getMoveList():
                newPredloc = agent.locAfterMove(predMove)
                if newPredloc == preyloc:
                    # Stepping onto the prey: fixed reward, no successor term.
                    preySum = 10.0
                else:
                    preySum = 0
                    for preyProb, newPreyloc in prey.expand(newPredloc):
                        # using rewriteStates we use relative positions
                        nextKey = rewriteStates(newPredloc, newPreyloc)
                        preySum += preyProb * discountFactor * values[nextKey]
                # "<=" lets a later move replace an earlier one of equal value.
                if bestVal <= preySum:
                    bestVal = preySum
                    bestMove = predMove

            newValues[curKey] = bestVal
            bestMoves[curKey] = bestMove
            delta = max(delta, np.abs(bestVal - temp))

        values = newValues
        if not delta > epsilon:
            break

    def policy(state):
        """Return the best move recorded for a ``(predloc, preyloc)`` state."""
        predloc, preyloc = state
        # NOTE(review): neither call below influences the returned move; both
        # are kept in case Agent/Prey have effects that callers rely on --
        # confirm before removing.
        agent.setLocation(predloc)
        prey = Prey(*preyloc)
        return bestMoves[rewriteStates(predloc, preyloc)]

    return policy