def __init__(self, policy):
    """Precompute state values by value iteration on the predator-prey world.

    Runs synchronous (in-place) value iteration with discount 0.9 until the
    largest per-sweep change drops below 1e-5, then stores the value table
    and bookkeeping attributes on the instance.
    """
    # world object; the starting state is irrelevant for enumerating states
    grid = World((0, 0), (1, 1))
    gamma = 0.9
    V = dict.fromkeys(grid.allStates(), 0)
    change = 1
    while abs(change) > 0.00001:
        change = 0
        for s in grid.allStates():
            grid.setState(s)
            previous = V[s]
            # every value is 0 or positive, so 0 is a safe floor for the max
            best = 0
            for a in grid.moveList():
                # reward 10 for a move that reaches the catch position (0,0)
                expected = 10 if grid.posAfterMove(a) == (0, 0) else 0
                # expectation over the stochastic prey response
                expected += sum(p * gamma * V[s2] for s2, p in grid.nextPreyStates())
                if expected > best:
                    best = expected
            V[s] = best
            change = max(change, abs(previous - best))
    V[(0, 0)] = 10
    self.value = V
    self.actionList = []
    self.allList = []
    self.bottomPolicy = policy
    self.discount = gamma
def MCon(episodes, initValue=15, epsilon=0.1, alpha=0.5, discount=0.9):
    """First-visit on-policy Monte Carlo control with an epsilon-greedy policy.

    episodes  -- number of episodes to run
    initValue -- optimistic initial Q value for every (state, action) pair
    epsilon   -- exploration rate of the epsilon-greedy behaviour policy
    alpha     -- accepted for interface compatibility but unused: Q is the
                 plain average of the observed first-visit returns
    discount  -- discount factor gamma

    Returns steps: steps[i] is the number of moves episode i took until the
    predator caught the prey.
    """
    # world object; the starting state is irrelevant for enumerating states
    world = World((0, 0), (1, 1))
    # Q value table and return list for every (s, a) pair
    Q = {}
    R = {}
    for state in world.allStates():
        for move in world.moveList():
            Q[state, move] = initValue   # optimistic init drives exploration
            R[state, move] = []          # returns = cumulative discounted rewards
    steps = [0] * episodes
    for i in range(episodes):
        iterations = 0
        world.setState((-5, -5))
        firstVisit = {}  # (state, action) -> time step of first occurrence
        # generate one episode under the current epsilon-greedy policy
        while True:
            state = world.position
            action = epsGreedyPolicy(state, world, Q, epsilon)
            world.move(action)
            if (state, action) not in firstVisit:
                firstVisit[(state, action)] = iterations  # used for discounting
            iterations += 1
            if world.stopState():  # predator caught the prey
                break
            world.performPreyMove()  # prey moves stochastically
        steps[i] = iterations
        # every reward is 0 except the terminal +10, so the first-visit
        # return is simply 10 * discount^(T - t_first)
        for pair, t in firstVisit.items():
            R[pair].append(10.0 * discount ** (iterations - t))
            Q[pair] = np.mean(R[pair])  # policy improvement happens inside epsGreedyPolicy
    return steps
def Qlearning(episodes, policy, startState=(-5,-5), initValue=15, policyParam=0.1, alpha=0.4, discount=0.9):
    """Tabular Q-learning for the predator-prey world.

    episodes    -- number of episodes to run
    policy      -- behaviour policy callable: policy(state, world, Q, policyParam)
                   (epsilon for E-greedy, tau for softmax)
    startState  -- predator start position for every episode
    initValue   -- initial Q value for every (state, action) pair
    policyParam -- single parameter forwarded to the behaviour policy
    alpha       -- learning rate
    discount    -- discount factor gamma

    Returns steps: steps[i] is the number of moves episode i needed.
    """
    # world object; the starting state is irrelevant for enumerating states
    world = World((0, 0), (1, 1))
    Q = {(s, a): initValue for s in world.allStates() for a in world.moveList()}
    steps = [0] * episodes
    for epi in range(episodes):
        world.setState(startState)
        count = 0
        caught = False
        while not caught:
            s = world.position
            a = policy(s, world, Q, policyParam)
            world.move(a)
            count += 1
            caught = world.stopState()
            if caught:
                # terminal transition: immediate reward 10, successor value 0
                target = 10
            else:
                # prey moves stochastically before we bootstrap
                world.performPreyMove()
                s2 = world.position
                # immediate reward is zero on non-terminal transitions
                target = discount * max(Q[s2, a2] for a2 in world.moveList())
            # the Q(s,a) update rule
            Q[s, a] += alpha * (target - Q[s, a])
        steps[epi] = count
    return steps
def minimax(episodes,initial_state,epsilon, decay, gamma, alpha_pred=1.0, alpha_prey=1.0): # initialization might be too expansive Q_pred = dict() Q_prey = dict() V_pred = dict() V_prey = dict() pi_pred = dict() pi_prey = dict() initValue = 1.0 # initialisation world = World((5,5),initial_state) for state in world.allStates(): V_pred[state] = 1.0 V_prey[state] = 1.0 for action in world.allMoveList(): pi_pred[(state,action)]=1.0/len(world.allMoveList()) for prey_move in world.singleMoveList(): Q_pred[(state, action, prey_move)]=1.0 Q_prey[(state, action, prey_move)]=1.0 for action in world.singleMoveList(): pi_prey[(state,action)]=1.0/len(world.singleMoveList()) # absorbing states terminal_state = tuple([(0,0)] * len(initial_state)) V_pred[terminal_state] = 0.0 V_prey[terminal_state] = 0.0 steps = [0]*episodes rewards = [0]*episodes for epi in range(episodes): # initialize world world = World((5,5),initial_state) # print "Begin Pred", V_pred[world.position] # print "End Prey", V_prey[world.position] # for s in world.singleMoveList(): # print s, "Pred", V_pred[(s,)] # print s, "Prey", V_pred[(s,)] # for a in world.allMoveList(): # for a2 in world.singleMoveList(): # print s, "Q", a, a2, Q_pred[(state,a,a2)] iterations =0 while not world.stopState(): state = world.position # choose action action_pred = minimax_policy(epsilon, pi_pred, state, world.allMoveList()) action_prey = minimax_policy(epsilon, pi_prey, state, world.singleMoveList()) reward = world.move(action_prey,action_pred) iterations +=1 new_state = world.position # update Q # if (state,action_prey) not in Q_prey: # Q_prey[state,action_prey] = initValue # if (state,action_pred) not in Q_pred: # Q_pred[state,action_pred] = initValue Q_pred[(state,action_pred,action_prey)] = (1.0-alpha_pred)*Q_pred[(state,action_pred,action_prey)] + alpha_pred*(reward[1]+ gamma* V_pred[new_state]) Q_prey[(state,action_pred,action_prey)] = (1.0-alpha_prey)*Q_prey[(state,action_pred,action_prey)] + alpha_prey*(reward[0]+ gamma* 
V_prey[new_state]) # update pi # adapted from example: http://abel.ee.ucla.edu/cvxopt/examples/tutorial/lp.html ## PREDATOR update # constraint to minimize w.r.t. prey action minConstr = [[1.0] + [-Q_pred[(state,a_pred,a_prey)] for a_pred in world.allMoveList()] for a_prey in world.singleMoveList()] # constrinat to keep every pi(a) positive posConstr = [] for i in range(1,len(world.allMoveList())+1): new_row = [0.0] * (len(world.allMoveList())+1) new_row[i] = -1.0 posConstr.append(new_row) normGreater = [0.0] + [1.0] * len(world.allMoveList()) normSmaller = [0.0] + [-1.0] * len(world.allMoveList()) A = matrix([normGreater, normSmaller] + minConstr + posConstr).trans() b = matrix([ 1.0, -1.0] + [0.0] * (len(world.singleMoveList()) + len(world.allMoveList())) ) # -1 V and 0 for all pi(s,a) c = matrix([ -1.0 ] + [0.0] * len(world.allMoveList())) sol=solvers.lp(c,A,b) V_pred[state] = sol['x'][0] for a_pred, x in zip(world.allMoveList(),sol['x'][1:]): pi_pred[(state,a_pred)] = x # ## PREY update # constraint to minimize w.r.t. 
prey action minConstr = [[1.0] + [-Q_prey[(state,a_pred,a_prey)] for a_prey in world.singleMoveList()] for a_pred in world.allMoveList()] # # constriant to keep every pi(a) positive posConstr = [] for i in range(1,len(world.singleMoveList())+1): new_row = [0.0] * (len(world.singleMoveList())+1) new_row[i] = -1.0 posConstr.append(new_row) normGreater = [0.0] + [ 1.0] * len(world.singleMoveList()) normSmaller = [0.0] + [-1.0] * len(world.singleMoveList()) A = matrix([normGreater, normSmaller] + minConstr + posConstr).trans() b = matrix([ 1.0, -1.0] + [0.0] * (len(world.allMoveList()) + len(world.singleMoveList())) ) # -1 V and 0 for all pi(s,a) c = matrix([ -1.0 ] + [0.0] * len(world.singleMoveList())) sol=solvers.lp(c,A,b) V_prey[state] = sol['x'][0] for a_prey, x in zip(world.singleMoveList(),sol['x'][1:]): pi_prey[(state,a_prey)] = x alpha_pred *= decay alpha_prey *= decay if epi > 0 and epi % 50 == 0: print "Episode",epi steps[epi] = iterations if reward[1] > 0: rewards[epi] = 1 return steps, rewards
def MCoff(episodes, behaPolicy, matches=None, initValue=15, discount=0.9):
    """Off-policy Monte Carlo control with weighted importance sampling.

    behaPolicy -- dict with keys (state, action) and value P(action | state);
                  the behaviour policy that generates episodes
    matches    -- optional list; per episode, the length of the episode tail
                  on which behaviour and greedy target policy agree is appended
    initValue  -- initial Q value for every (state, action) pair
    discount   -- discount factor gamma

    The target policy is greedy w.r.t. Q.  Returns steps: per episode, the
    number of moves the greedy policy needs to catch the prey when evaluated
    afterwards (capped at 2000).
    """
    if matches is None:  # avoid the shared mutable-default pitfall
        matches = []
    world = World((0, 0), (1, 1))
    movelist = world.moveList()

    def behave(world):
        # sample an action from the behaviour policy in the current state
        return world.pickElementWithProbs(
            [(move, behaPolicy[(world.position, move)]) for move in movelist])

    # per-pair Q value and weighted-importance-sampling accumulators
    Q = {}
    num = {}
    denum = {}
    for state in world.allStates():
        for move in movelist:
            num[state, move] = 0.0
            denum[state, move] = 0.0
            Q[state, move] = float(initValue)
    steps = [0] * episodes
    for epi in range(episodes):
        # --- generate an episode with the behaviour policy ---
        world.setState((-5, -5))
        episode = []
        while True:
            action = behave(world)
            episode.append((world.position, action))
            world.move(action)
            if world.stopState():
                break
            world.performPreyMove()
        T = len(episode)
        # --- find the episode tail on which every behaviour action agrees
        #     with the greedy target policy; record first occurrences ---
        matchingHistory = {}  # (state, action) -> earliest time step in the tail
        last = 0              # first time step of the matching tail
        for i, (state, action) in enumerate(episode[::-1]):
            actionValues = [(Q[state, m], m) for m in movelist]
            bestActions = [actionValues[j][1] for j in maxIndices(actionValues)]
            matchingHistory[(state, action)] = T - i - 1
            if action not in bestActions:
                last = T - i
                break
        matches.append(T - last)
        # --- weighted importance sampling update over the matching tail ---
        for (state, action), t in matchingHistory.items():
            if t >= last - 1:
                # greedy target is deterministic on the tail, so the
                # importance weight is the product of 1/b(a|s)
                w = np.prod([1.0 / behaPolicy[episode[j]] for j in range(t, T)])
                # BUGFIX: accumulate under the pair's own key (the original
                # used the stale loop variable `move`), and discount the
                # terminal +10 by the remaining steps: return = gamma^{T-t}*10
                num[(state, action)] += w * (10.0 * discount ** (T - t))
                denum[(state, action)] += w
                Q[(state, action)] = num[(state, action)] / float(denum[(state, action)])
        # --- evaluate the greedy policy from the start state ---
        world.setState((-5, -5))
        iterations = 0
        while True:
            iterations += 1
            # BUGFIX: act on the current position (the original read a stale
            # `state` left over from the loop above)
            s = world.position
            # NOTE(review): tuple order here ((action, value)) differs from the
            # matching loop above ((value, action)); verify maxIndices handles both
            actionValues = [(m, Q[s, m]) for m in movelist]
            bestAction = random.choice([actionValues[j][0] for j in maxIndices(actionValues)])
            world.move(bestAction)
            if world.stopState() or iterations > 2000:
                break
            world.performPreyMove()
        steps[epi] = iterations
    return steps