def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0
    "*** YOUR CODE HERE ***"
    # Values start at 0 (the Counter default); run exactly self.iterations
    # batch sweeps of the Bellman backup.
    for _ in range(self.iterations):
        newValues = util.Counter()
        for state in mdp.getStates():
            tempValues = util.Counter()
            for action in mdp.getPossibleActions(state):
                for newState, prob in mdp.getTransitionStatesAndProbs(state, action):
                    tempValues[action] += prob * (mdp.getReward(state, action, newState)
                                                  + self.discount * self.values[newState])
            # argMax() of an empty Counter is None and Counter[None] is 0, so
            # terminal states (no legal actions) keep a value of 0.
            newValues[state] = tempValues[tempValues.argMax()]
        self.values = newValues
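For reference, every sweep in these submissions is meant to implement the same batch Bellman optimality backup, with V_0 = 0, gamma the discount, and T and R taken from mdp.getTransitionStatesAndProbs and mdp.getReward:

\[ V_{k+1}(s) \;=\; \max_{a \in A(s)} \sum_{s'} T(s, a, s') \,\bigl[ R(s, a, s') + \gamma\, V_k(s') \bigr] \]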
def __init__(self, mdp, discount = 0.9, iterations = 100): """ Your value iteration agent should take an mdp on construction, run the indicated number of iterations and then act according to the resulting policy. Some useful mdp methods you will use: mdp.getStates() mdp.getPossibleActions(state) mdp.getTransitionStatesAndProbs(state, action) mdp.getReward(state, action, nextState) """ self.mdp = mdp self.discount = discount self.iterations = iterations self.values = util.Counter() # A Counter is a dict with default 0 "*** YOUR CODE HERE ***" allStates = mdp.getStates() vPrimes = util.Counter() # A Counter is a dict with default 0 iteration = 0 while iteration < iterations: for s in allStates: if mdp.isTerminal(s): vPrimes[s] = mdp.getReward(s, None, s); else: sreward = mdp.getReward(s, None, s) vPrimes[s] = sreward + discount * self.utilOfBestAction(mdp, s ) for s in allStates: self.values[s] = vPrimes[s] iteration +=1
def __init__(self, mdp, discount=0.9, iterations=100): """ Your value iteration agent should take an mdp on construction, run the indicated number of iterations and then act according to the resulting policy. Some useful mdp methods you will use: mdp.getStates() mdp.getPossibleActions(state) mdp.getTransitionStatesAndProbs(state, action) mdp.getReward(state, action, nextState) mdp.isTerminal(state) """ self.mdp = mdp self.discount = discount self.iterations = iterations self.values = util.Counter() # A Counter is a dict with default 0 iteration = 1 while iteration <= self.iterations: updated_values = self.values.copy() for possibleNextState in mdp.getStates(): if mdp.isTerminal(possibleNextState) == True: for possibleAction in mdp.getPossibleActions( possibleNextState): possibleValue = 0 for possibleTransition in mdp.getTransitionStatesAndProbs( possibleNextState, possibleAction): #Following Bellman's equation possibleValue += possibleTransition[1] * ( mdp.getReward(possibleNextState, possibleAction, possibleTransition[0]) + discount * self.values[possibleTransition[0]]) updated_values[possibleNextState] = possibleValue else: maxStateValue = float("-inf") for possibleAction in mdp.getPossibleActions( possibleNextState): possibleValue = 0 for possibleTransition in mdp.getTransitionStatesAndProbs( possibleNextState, possibleAction): #Following Bellman's equation possibleValue += possibleTransition[1] * ( mdp.getReward(possibleNextState, possibleAction, possibleTransition[0]) + discount * self.values[possibleTransition[0]]) if possibleValue > maxStateValue: maxStateValue = possibleValue updated_values[possibleNextState] = maxStateValue self.values = updated_values iteration += 1
def __init__(self, mdp, discount = 0.9, iterations = 100): """ Your value iteration agent should take an mdp on construction, run the indicated number of iterations and then act according to the resulting policy. Some useful mdp methods you will use: mdp.getStates() mdp.getPossibleActions(state) mdp.getTransitionStatesAndProbs(state, action) mdp.getReward(state, action, nextState) mdp.isTerminal(state) """ self.mdp = mdp self.discount = discount self.iterations = iterations self.values = util.Counter() # A Counter is a dict with default 0 # Write value iteration code here "*** YOUR CODE HERE ***" self.depth = 1 self.qTable = {} self.vTable = {} for state in mdp.getStates(): self.vTable[state] = 0 self.qTable[state] = {} for action in mdp.getPossibleActions(state): self.qTable[state][action] = 0 while self.depth < self.iterations + 1: self.tempTable = {} for state in mdp.getStates(): self.stateValue = 0 if not mdp.isTerminal(state): self.stateValue = -9999 for action in mdp.getPossibleActions(state): self.Qtotal = 0 for nextState,prob in mdp.getTransitionStatesAndProbs(state,action): self.reward = mdp.getReward(state, action, nextState) self.Qtotal += prob * (self.reward + self.discount * self.vTable[nextState]) #print "###state:",state,"Next",nextState,"reward:",self.reward,"Qtotal",self.Qtotal,"Value:",self.vTable[nextState] self.qTable[state][action] = self.Qtotal #print self.qTable[state][action] self.stateValue = max(self.stateValue,self.qTable[state][action]) else: self.tempTable[state] = 0 self.tempTable[state] = self.stateValue self.vTable = self.tempTable self.depth += 1 for state in mdp.getStates(): self.stateValue = -9999 for action in mdp.getPossibleActions(state): self.Qtotal = 0 for nextState,prob in mdp.getTransitionStatesAndProbs(state,action): self.reward = mdp.getReward(state, action, nextState) self.Qtotal += prob * (self.reward + self.discount * self.vTable[nextState]) self.qTable[state][action] = self.Qtotal
def __init__(self, mdp, discount = 0.9, iterations = 100): """ Your value iteration agent should take an mdp on construction, run the indicated number of iterations and then act according to the resulting policy. Some useful mdp methods you will use: mdp.getStates() mdp.getPossibleActions(state) mdp.getTransitionStatesAndProbs(state, action) mdp.getReward(state) mdp.isTerminal(state) """ self.mdp = mdp self.discount = discount self.iterations = iterations self.values = util.Counter() # A Counter is a dict with default 0 temp = util.Counter() #to keep track of values while iterating #use this for argmax states = mdp.getStates() ValueIterationAgent.policy = dict.fromkeys(states, '') actionUtilities = {} #initialize utility fn for state in states: if mdp.isTerminal(state): self.values[state] = mdp.getReward(state) temp[state] = mdp.getReward(state) looper = 0 #returns list of tuples - state and # Write value iteration code here """ U(s) = R(s) + Y * (max value after trying each action)(T(s, a, s') * U(s)) Do this for all states Return policy Handle case for no available actions """ #loop until we've hit the right number of iterations while looper < self.iterations: for state in states: actionUtilities = {} possibleActions = mdp.getPossibleActions(state) #start new Bellman eqn - add reward for each state if len(possibleActions) == 0: #terminal state newUtility = 0 elif len(possibleActions) == 1: #1 possible action: exit actionUtilities[possibleActions[0]] = self.computeQValueFromValues(state, possibleActions[0]) ValueIterationAgent.policy[state] = possibleActions[0] newUtility = actionUtilities[possibleActions[0]] else: for action in possibleActions: #multiple possible actions; try them all actionUtilities[action] = self.computeQValueFromValues(state, action) #get the utility for each action at the given state ValueIterationAgent.policy[state] = max(actionUtilities, key=actionUtilities.get) #update the policy for this state newUtility = actionUtilities[ValueIterationAgent.policy[state]] temp[state] = newUtility self.values = temp.copy() looper = looper + 1 "*** YOUR CODE HERE ***"
def __init__(self, mdp, discount=0.9, iterations=100): """ Your value iteration agent should take an mdp on construction, run the indicated number of iterations and then act according to the resulting policy. Some useful mdp methods you will use: mdp.getStates() mdp.getPossibleActions(state) mdp.getTransitionStatesAndProbs(state, action) mdp.getReward(state, action, nextState) mdp.isTerminal(state) """ self.mdp = mdp self.discount = discount self.iterations = iterations self.values = util.Counter() # A Counter is a dict with default 0 # Write value iteration code here "*** YOUR CODE HERE ***" oldCounter = util.Counter() states = mdp.getStates() while self.iterations > 0: for state in states: if self.mdp.isTerminal(state): self.values[state] = 0 else: actions = mdp.getPossibleActions(state) # if len(actions) > 0: # initialize maxAct by calculating the first action from the action list of the given state firstAct = actions[0] listOfNextStateAndProbPairs = mdp.getTransitionStatesAndProbs( state, firstAct) for pair in listOfNextStateAndProbPairs: value = 0 for pair in listOfNextStateAndProbPairs: (nextState, prob) = pair reward = mdp.getReward(state, firstAct, nextState) value = value + prob * (reward + self.discount * (oldCounter[nextState])) # compare and choose the best action value maxAct = value for action in actions: listOfNextStateAndProbPairs = mdp.getTransitionStatesAndProbs( state, action) value = 0 for pair in listOfNextStateAndProbPairs: (nextState, prob) = pair reward = mdp.getReward(state, action, nextState) value = value + prob * (reward + self.discount * (oldCounter[nextState])) maxAct = max(maxAct, value) # update the state value for the given state self.values[state] = maxAct oldCounter = self.values.copy() self.iterations = self.iterations - 1
def runValueIteration(self):
    "*** YOUR CODE HERE ***"
    mdp = self.mdp
    values = self.values
    discount = self.discount

    # Predecessors of each state: every state that can reach it with
    # nonzero probability under some action.
    predecessors = {}
    for state in mdp.getStates():
        preList = []
        for preState in mdp.getStates():
            for action in mdp.getPossibleActions(preState):
                if state in [pair[0]
                             for pair in mdp.getTransitionStatesAndProbs(preState, action)
                             if pair[1] > 0]:
                    preList.append(preState)
                    break
        predecessors[state] = preList

    # Seed the priority queue with every non-terminal state, prioritized by
    # how far its current value is from a one-step Bellman backup.
    queue = util.PriorityQueue()
    for s in mdp.getStates():
        if not mdp.isTerminal(s):
            actions = mdp.getPossibleActions(s)
            realValue = max(
                sum(prob * (mdp.getReward(s, action, nextState) + (discount * values[nextState]))
                    for (nextState, prob) in mdp.getTransitionStatesAndProbs(s, action))
                for action in actions)
            diff = abs(realValue - values[s])
            queue.push(s, 0 - diff)

    for _ in range(self.iterations):
        if queue.isEmpty():
            return
        s = queue.pop()
        if not mdp.isTerminal(s):
            actions = mdp.getPossibleActions(s)
            values[s] = max(
                sum(prob * (mdp.getReward(s, action, nextState) + (discount * values[nextState]))
                    for (nextState, prob) in mdp.getTransitionStatesAndProbs(s, action))
                for action in actions)
        # Requeue any predecessor whose value is now stale by more than theta.
        for p in predecessors[s]:
            actions = mdp.getPossibleActions(p)
            realValue = max(
                sum(prob * (mdp.getReward(p, action, nextState) + (discount * values[nextState]))
                    for (nextState, prob) in mdp.getTransitionStatesAndProbs(p, action))
                for action in actions)
            diff = abs(realValue - values[p])
            if diff > self.theta:
                queue.update(p, 0 - diff)
def __init__(self, mdp, discount=0.9, iterations=100): """ Your value iteration agent should take an mdp on construction, run the indicated number of iterations and then act according to the resulting policy. Some useful mdp methods you will use: mdp.getStates() mdp.getPossibleActions(state) mdp.getTransitionStatesAndProbs(state, action) mdp.getReward(state, action, nextState) """ self.mdp = mdp self.discount = discount self.iterations = iterations self.values = util.Counter() # A Counter is a dict with default 0 "*** YOUR CODE HERE ***" #define a dictionary to store values in iterations for each state self.valueRecord = {} for state in mdp.getStates(): self.valueRecord[state] = [] for i in range(0, self.iterations + 1): for state in mdp.getStates(): if i == 0 or mdp.isTerminal(state): self.valueRecord[state].append(0) continue actions = mdp.getPossibleActions(state) #store the current reward as value and return if 'exit' in actions: self.valueRecord[state].append( mdp.getReward( state, 'exit', mdp.getTransitionStatesAndProbs(state, 'exit')[0][0])) continue max = float("-inf") #find the action that maximize the value for action in actions: summax = 0 for (s, p) in mdp.getTransitionStatesAndProbs(state, action): summax += p * ( mdp.getReward(state, action, s) + self.discount * self.valueRecord[s][i - 1]) if summax > max: max = summax self.valueRecord[state].append(max) #store the final value we get from iteration into values for k, v in self.valueRecord.items(): self.values[k] = v[iterations]
def __init__(self, mdp, discount=0.9, iterations=100): """ Your value iteration agent should take an mdp on construction, run the indicated number of iterations and then act according to the resulting policy. Some useful mdp methods you will use: mdp.getStates() mdp.getPossibleActions(state) mdp.getTransitionStatesAndProbs(state, action) mdp.getReward(state, action, nextState) mdp.isTerminal(state) """ self.mdp = mdp self.discount = discount self.iterations = iterations self.values = util.Counter() # A Counter is a dict with default 0 self.actions = util.Counter() tempValues = util.Counter() # Write value iteration code here for k in range(0, iterations): for state in mdp.getStates(): maxAction = float("-inf") for action in mdp.getPossibleActions(state): total = 0 for nextState, prob in mdp.getTransitionStatesAndProbs( state, action): total += prob * ( self.values[nextState] * discount + mdp.getReward(state, action, nextState)) maxAction = max(maxAction, total) tempValues[state] = maxAction for state in mdp.getStates(): if tempValues[state] > float("-inf"): self.values[state] = tempValues[state] for state in mdp.getStates(): maxAction = None maxActionValue = float("-inf") for action in mdp.getPossibleActions(state): total = 0 for nextState, prob in mdp.getTransitionStatesAndProbs( state, action): total += prob * (self.values[nextState] * discount + mdp.getReward(state, action, nextState)) if total > maxActionValue: maxActionValue = total maxAction = action self.actions[state] = maxAction
def __init__(self, mdp, discount=0.9, iterations=200): """ Your value iteration agent should take an mdp on construction, run the indicated number of iterations and then act according to the resulting policy. Some useful mdp methods you will use: mdp.getStates() mdp.getPossibleActions(state) mdp.getTransitionStatesAndProbs(state, action) mdp.getReward(state, action, nextState) """ self.mdp = mdp self.discount = discount self.iterations = iterations self.values = util.Counter() # A Counter is a dict with default 0 self.q_values = {} self.best_action = {} # calculate utilities values for i in range(self.iterations): next_values = util.Counter() for s in mdp.getStates(): updated = False for a in mdp.getPossibleActions(s): action_value = 0.0 for t in mdp.getTransitionStatesAndProbs(s, a): r = mdp.getReward(s, a, t[0]) action_value += t[1] * (r + discount * self.values[t[0]]) if not updated or action_value > next_values[s]: next_values[s] = action_value updated = True self.values = next_values # with the given utilities, calculate q-values p = False for s in mdp.getStates(): self.best_action[s] = None max_action_value = -10000000 for a in mdp.getPossibleActions(s): action_value = 0.0 for t in mdp.getTransitionStatesAndProbs(s, a): r = mdp.getReward(s, a, t[0]) action_value += t[1] * (r + discount * self.values[t[0]]) self.q_values[(s, a)] = action_value if action_value > max_action_value: max_action_value = action_value self.best_action[s] = a
def __init__(self, mdp, discount=0.9, iterations=100): """ Your value iteration agent should take an mdp on construction, run the indicated number of iterations and then act according to the resulting policy. Some useful mdp methods you will use: mdp.getStates() mdp.getPossibleActions(state) mdp.getTransitionStatesAndProbs(state, action) mdp.getReward(state, action, nextState) mdp.isTerminal(state) """ self.mdp = mdp self.discount = discount self.iterations = iterations self.values = util.Counter() # A Counter is a dict with default 0 iteration_values = util.Counter() for i in range(iterations): for state in mdp.getStates(): max_action_value = -99999 max_action = None actions = mdp.getPossibleActions(state) if len(actions) == 0: max_action_value = mdp.getReward(state, None, None) max_action = None max_state_prime = None else: for action in actions: summation = 0 for state_prime, prob in mdp.getTransitionStatesAndProbs( state, action): if mdp.isTerminal(state_prime): iteration_values[state] = mdp.getReward( state, 'exit', 'TERMINAL_STATE') else: utility = self.values[state_prime] summation += utility * prob if summation > max_action_value: max_action_value = summation max_action = action iteration_values[state] = mdp.getReward( state, None, None) + discount * max_action_value # Update at the end of each iteration self.values = iteration_values.copy()
def __init__(self, mdp, discount=0.9, iterations=100): """ Your value iteration agent should take an mdp on construction, run the indicated number of iterations and then act according to the resulting policy. Some useful mdp methods you will use: mdp.getStates() mdp.getPossibleActions(state) mdp.getTransitionStatesAndProbs(state, action) mdp.getReward(state, action, nextState) mdp.isTerminal(state) """ self.mdp = mdp self.discount = discount self.iterations = iterations self.values = util.Counter() # A Counter is a dict with default 0 self.tempvalues = util.Counter() # Write value iteration code here for i in range(0, self.iterations): newVal = self.values.copy() for state in mdp.getStates(): if not mdp.isTerminal(state): #worst V value possible maxV = float("-inf") for action in mdp.getPossibleActions(state): v = 0 #transitions = [newState, probability] for transition in mdp.getTransitionStatesAndProbs( state, action): v = v + transition[1] * ( mdp.getReward(state, action, transition[0]) + discount * self.values[transition[0]]) if v > maxV: maxV = v newVal[state] = maxV else: #state is terminal for action in mdp.getPossibleActions(state): v = 0 for transition in mdp.getTransitionStatesAndProbs( state, action): v = v + transition[1] * ( mdp.getReward(state, action, transition[0]) + discount * self.values[transition[0]]) newVal[state] = v #update whole V values self.values = newVal
def __init__(self, mdp, discount = 0.9, iterations = 100): """ Your value iteration agent should take an mdp on construction, run the indicated number of iterations and then act according to the resulting policy. Some useful mdp methods you will use: mdp.getStates() mdp.getPossibleActions(state) mdp.getTransitionStatesAndProbs(state, action) mdp.getReward(state, action, nextState) mdp.isTerminal(state) """ self.mdp = mdp self.discount = discount self.iterations = iterations self.values = util.Counter() # A Counter is a dict with default 0 for i in range(0,iterations): b = self.values.copy() #print ('b',b) #print ('all',mdp.getStates()) for s in mdp.getStates(): if s == 'TERMINAL_STATE': self.values[s]= 0 else: #print ('s',s) qlist = [] for a in mdp.getPossibleActions(s): if a =='exit': qlist.append(mdp.getReward(s,a,(mdp.getTransitionStatesAndProbs(s,a)))) else: #print('a',a) spsum = 0 for sp in mdp.getTransitionStatesAndProbs(s,a): #print('sp',sp) #print(mdp.getReward(s,a,sp[0])) spsum =spsum+ (sp[1]*(mdp.getReward(s,a,sp[0])+self.discount*b[sp[0]])) #print ('spsum',spsum) #print('i',i) qlist.append(spsum) #print qlist self.values[s] = max(qlist) while len(qlist) > 0 : qlist.pop() # Write value iteration code here "*** YOUR CODE HERE ***"
def __init__(self, mdp, discount=0.9, iterations=100): """ Your value iteration agent should take an mdp on construction, run the indicated number of iterations and then act according to the resulting policy. Some useful mdp methods you will use: mdp.getStates() mdp.getPossibleActions(state) mdp.getTransitionStatesAndProbs(state, action) mdp.getReward(state, action, nextState) """ self.mdp = mdp self.discount = discount self.iterations = iterations self.utilities = {} self.qvalues = {} states = mdp.getStates() for state in states: self.utilities[state] = 0 self.qvalues[state] = util.Counter() for i in range(iterations): newUtilities = {} for state in states: if self.mdp.isTerminal(state): continue childQs = [] for action in mdp.getPossibleActions(state): q_value = 0 for transition in mdp.getTransitionStatesAndProbs( state, action): q_value += transition[1]*(mdp.getReward(state,action,transition[0]) + \ discount*self.utilities[transition[0]]) childQs.append(q_value) newUtilities[state] = max(childQs) self.utilities.update(newUtilities) """ q-values are a dictionary from states to dictionaries of action => qvalue mappings""" for state in states: for action in mdp.getPossibleActions(state): q_value = 0 for transition in mdp.getTransitionStatesAndProbs( state, action): q_value += transition[1]*(mdp.getReward(state,action,transition[0]) + \ discount*self.utilities[transition[0]]) self.qvalues[state][action] = q_value
def __init__(self, mdp, discount = 0.9, iterations = 100): """ Your value iteration agent should take an mdp on construction, run the indicated number of iterations and then act according to the resulting policy. Some useful mdp methods you will use: mdp.getStates() mdp.getPossibleActions(state) mdp.getTransitionStatesAndProbs(state, action) mdp.getReward(state, action, nextState) """ self.mdp = mdp self.discount = discount self.iterations = iterations self.values = util.Counter() # A Counter is a dict with default 0 "*** YOUR CODE HERE ***" for i in range(iterations): nextValues = util.Counter() for state in mdp.getStates(): rewardsPossible = util.Counter() for action in mdp.getPossibleActions(state): nextPossible = size(mdp.getTransitionStatesAndProbs(state, action))[1] newRewards = util.Counter() for tmpState in range(nextPossible): nextState = mdp.getTransitionStatesAndProbs(state, action)[tmpState][0] prob = mdp.getTransitionStatesAndProbs(state, action)[tmpState][1] rewards = mdp.getReward(state, action, tmpState) newRewards[tmpState] = prob * (rewards + self.discount * self.values[nextState]) rewardsPossible[action] = newRewards.totalCount() nextValues[state] = rewardsPossible[rewardsPossible.argMax()] self.values = nextValues
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    while self.iterations > 0:
        prev_values = self.values.copy()
        for state in mdp.getStates():
            actions = mdp.getPossibleActions(state)
            if not actions:
                continue
            self.values[state] = max(
                [sum([prob * (mdp.getReward(state, act, state1) + discount * prev_values[state1])
                      for state1, prob in mdp.getTransitionStatesAndProbs(state, act)])
                 for act in actions])
        self.iterations -= 1
def computeQValueFromValues(self, state, action):
    """
    Compute the Q-value of action in state from the
    value function stored in self.values.
    """
    "*** YOUR CODE HERE ***"
    discount = self.discount
    values = self.values
    mdp = self.mdp

    # set initial q value
    qv = 0
    tStatesAndProbs = mdp.getTransitionStatesAndProbs(state, action)
    # keep track of pairs seen so far
    j = 0
    while j < len(tStatesAndProbs):
        # extract tState and prob from this member of the list
        tState = tStatesAndProbs[j][0]
        prob = tStatesAndProbs[j][1]
        # calculate the Q-value the same way we calculated V above
        qv = qv + ((discount * values[tState]) + mdp.getReward(state, action, tState)) * prob
        # increment
        j = j + 1
    return qv
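The same backup can be written more compactly; a minimal sketch of an equivalent computeQValueFromValues, assuming the same self.mdp, self.values and self.discount attributes used above:

def computeQValueFromValues(self, state, action):
    # Equivalent compact form of the loop above: one Bellman Q-backup,
    # summing over the (nextState, prob) pairs for this (state, action).
    return sum(prob * (self.mdp.getReward(state, action, nextState)
                       + self.discount * self.values[nextState])
               for nextState, prob in self.mdp.getTransitionStatesAndProbs(state, action))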
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    for i in range(self.iterations):
        temp = self.values.copy()
        for item in mdp.getStates():
            value_list = []
            if mdp.isTerminal(item):
                self.values[item] = 0
                continue
            for k in mdp.getPossibleActions(item):
                sum_value = 0
                for x in mdp.getTransitionStatesAndProbs(item, k):
                    reward_next_state = mdp.getReward(item, k, x[0])
                    sum_value += x[1] * (reward_next_state + self.discount * temp[x[0]])
                value_list.append(sum_value)
            self.values[item] = max(value_list)
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    for k in range(self.iterations):
        self.values_old = self.values.copy()
        for s in mdp.getStates():
            if not self.mdp.isTerminal(s):
                self.values[s] = max(
                    [sum([T * (mdp.getReward(s, a, sp) + self.discount * self.values_old[sp])
                          for (sp, T) in mdp.getTransitionStatesAndProbs(s, a)])
                     for a in mdp.getPossibleActions(s)])
def __init__(self, mdp, discount = 0.9, iterations = 100): """ Your value iteration agent should take an mdp on construction, run the indicated number of iterations and then act according to the resulting policy. Some useful mdp methods you will use: mdp.getStates() mdp.getPossibleActions(state) mdp.getTransitionStatesAndProbs(state, action) mdp.getReward(state, action, nextState) """ self.mdp = mdp self.discount = discount self.iterations = iterations self.values = util.Counter() # A Counter is a dict with default 0 "*** YOUR CODE HERE ***" for s in mdp.getStates(): self.values[s] = 0 "for a in mdp.getPossibleActions(s):" "for ac in mdp.getTransitionStatesAndProbs(s,a):" " print ac[0]" "print ac[1]" "copy_value = self.values.copy()" "for c in mdp.getStates():" " print copy_value[c]" i=0 "self.states = mdp.getStates()" while i < iterations: copy_value = self.values.copy() for s in mdp.getStates(): if not mdp.isTerminal(s): self.values[s] = mdp.getReward(s,'north',s) + discount * max([sum([copy_value[s1] * p for (s1,p) in mdp.getTransitionStatesAndProbs(s,a)]) for a in mdp.getPossibleActions(s)]) i = i + 1
def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    for _ in range(iterations):
        updateBatch = self.values.copy()
        for state in mdp.getStates():
            self.values[state] = max(
                [sum([prob * (mdp.getReward(state, action, transitionState)
                              + discount * updateBatch[transitionState])
                      for transitionState, prob in mdp.getTransitionStatesAndProbs(state, action)
                      if prob != 0])
                 for action in mdp.getPossibleActions(state)] or [0])
def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0
    "*** YOUR CODE HERE ***"
    for n in range(iterations):
        V = self.values.copy()
        for s in mdp.getStates():
            action_values = []
            for a in mdp.getPossibleActions(s):
                action_value = 0
                for s_, P in mdp.getTransitionStatesAndProbs(s, a):
                    action_value += P * (mdp.getReward(s, a, s_) + discount * V[s_])
                action_values.append(action_value)
            self.values[s] = max(action_values or [0])
def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0
    "*** YOUR CODE HERE ***"
    for i in range(0, iterations):
        preValues = self.values.copy()
        for state in mdp.getStates():
            curValue = util.Counter()
            for action in mdp.getPossibleActions(state):
                for (nextState, prob) in mdp.getTransitionStatesAndProbs(state, action):
                    curValue[action] += prob * (mdp.getReward(state, action, nextState)
                                                + discount * preValues[nextState])
            self.values[state] = curValue[curValue.argMax()]
def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    while self.iterations > 0:
        prevValues = self.values.copy()
        for state in self.mdp.getStates():
            actionValues = {}
            for action in mdp.getPossibleActions(state):
                actionValues[action] = 0
                for (nextState, prob) in self.mdp.getTransitionStatesAndProbs(state, action):
                    actionValues[action] += prob * (mdp.getReward(state, action, nextState)
                                                    + self.discount * prevValues[nextState])
            try:
                self.values[state] = max(actionValues.values())
            except ValueError:
                # No legal actions (terminal state): its value stays 0.
                self.values[state] = 0
        self.iterations -= 1
def __init__(self, mdp, discount=0.9, iterations=100): """ Your value iteration agent should take an mdp on construction, run the indicated number of iterations and then act according to the resulting policy. Some useful mdp methods you will use: mdp.getStates() mdp.getPossibleActions(state) mdp.getTransitionStatesAndProbs(state, action) mdp.getReward(state) """ self.mdp = mdp self.discount = discount self.iterations = iterations self.values = util.Counter( ) # A Counter is a dict with default values as 0 "*** YOUR CODE HERE ***" states = mdp.getStates() for k in range(0, iterations): for state in states: actions = [] for action in mdp.getPossibleActions(state): trans_prob = mdp.getTransitionStatesAndProbs(state, action) actions.append( sum(self.values[tp[0], k - 1] * tp[1] for tp in trans_prob)) if actions: max_prob = max(actions) else: max_prob = 0 self.values[state, k] = mdp.getReward(state) + discount * max_prob
def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0
    "*** YOUR CODE HERE ***"
    for i in range(iterations):
        newValues = util.Counter()
        for state in mdp.getStates():
            values = []
            for action in mdp.getPossibleActions(state):
                qvalue = sum([(discount * self.values[newState] + mdp.getReward(state, action, newState)) * prob
                              for newState, prob in mdp.getTransitionStatesAndProbs(state, action)])
                values.append(qvalue)
            if len(values) > 0:
                newValues[state] = max(values)
        # Copy the batch of new values over for every state (iterating over the
        # keys already in self.values would miss states on the first sweep).
        for state in mdp.getStates():
            self.values[state] = newValues[state]
def __init__(self, mdp, discount = 0.9, iterations = 100): """ Your value iteration agent should take an mdp on construction, run the indicated number of iterations and then act according to the resulting policy. Some useful mdp methods you will use: mdp.getStates() mdp.getPossibleActions(state) mdp.getTransitionStatesAndProbs(state, action) mdp.getReward(state, action, nextState) """ self.mdp = mdp self.discount = discount self.iterations = iterations self.values = util.Counter() # A Counter is a dict with default 0 for i in range(iterations): lastValues = copy.deepcopy(self.values) for s in mdp.getStates(): aCounter = util.Counter() for a in mdp.getPossibleActions(s): for s2 in mdp.getStates(): aCounter[a] += self.T(s,a,s2) * (mdp.getReward(s,a,s2) + discount*lastValues[s2]) self.values[s] = aCounter[aCounter.argMax()]
def __init__(self, mdp, discount=0.9, iterations=100): """ Your value iteration agent should take an mdp on construction, run the indicated number of iterations and then act according to the resulting policy. Some useful mdp methods you will use: mdp.getStates() mdp.getPossibleActions(state) mdp.getTransitionStatesAndProbs(state, action) mdp.getReward(state, action, nextState) mdp.isTerminal(state) """ self.mdp = mdp self.discount = discount self.iterations = iterations self.values = util.Counter() # A Counter is a dict with default 0 # Write value iteration code here for k in range(1, iterations + 1): values = copy(self.values) for state in mdp.getStates(): actions = mdp.getPossibleActions(state) if len(actions) > 0: self.values[state] = max([ sum([ prob * (mdp.getReward(state, action, next) + discount * values[next]) for next, prob in mdp.getTransitionStatesAndProbs( state, action) ]) for action in mdp.getPossibleActions(state) ])
def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    for i in range(iterations):
        nextValues = util.Counter()
        for state in mdp.getStates():
            if mdp.isTerminal(state):
                continue
            first = True
            for action in mdp.getPossibleActions(state):
                qValue = 0
                for (nextState, prob) in mdp.getTransitionStatesAndProbs(state, action):
                    reward = mdp.getReward(state, action, nextState)
                    qValue += prob * (reward + discount * self.values[nextState])
                if first:
                    maxQValue = qValue
                    first = False
                elif qValue > maxQValue:
                    maxQValue = qValue
            nextValues[state] = maxQValue
        self.values = nextValues
def __init__(self, mdp, discount=0.9, iterations=100): """ Your value iteration agent should take an mdp on construction, run the indicated number of iterations and then act according to the resulting policy. Some useful mdp methods you will use: mdp.getStates() mdp.getPossibleActions(state) mdp.getTransitionStatesAndProbs(state, action) mdp.getReward(state, action, nextState) mdp.isTerminal(state) """ self.mdp = mdp self.discount = discount self.iterations = iterations self.values = util.Counter() # A Counter is a dict with default 0 # Write value iteration code here "*** YOUR CODE HERE ***" # keep track of the number of iterations we have done so far i = 0 # final output value v = 0 # get all the states states = mdp.getStates() # for each of the specified iterations: while i < iterations: # save the current self.values oldSV = self.values.copy() # increment our variable for number of iterations i = i + 1 # for each of the states, for s in states: # get the value at this state v = util.Counter() # look at all possible actions from that state actions = mdp.getPossibleActions(s) # for each state action pair ... for a in actions: # get the transition states and the probablilities of # reaching those states tStatesAndProbs = mdp.getTransitionStatesAndProbs(s, a) # keep track of the number of pairs we have seen so far j = 0 # print tStatesAndProbs # for each pair in tStatesAndProbs, while j < len(tStatesAndProbs): # extract tState and Prob from this member of the list tState = tStatesAndProbs[j][0] prob = tStatesAndProbs[j][1] # set the value associated with that move # make sure to account for prob and discount v[a] = v[a] + (mdp.getReward(s, a, tState) + discount * oldSV[tState]) * prob # increment j = j + 1 # return self.values[s] = v[v.argMax()]
def __init__(self, mdp, discount = 0.9, iterations = 100): """ Your value iteration agent should take an mdp on construction, run the indicated number of iterations and then act according to the resulting policy. Some useful mdp methods you will use: mdp.getStates() mdp.getPossibleActions(state) mdp.getTransitionStatesAndProbs(state, action) mdp.getReward(state) mdp.isTerminal(state) """ self.mdp = mdp self.discount = discount self.iterations = iterations self.values = util.Counter() # A Counter is a dict with default 0 # Write value iteration code here "*** YOUR CODE HERE ***" temp = util.Counter() for i in xrange(iterations): temp = self.values.copy() for j in mdp.getStates(): vlist = [] actions = mdp.getPossibleActions(j) if not mdp.isTerminal(j): for k in actions: tran = mdp.getTransitionStatesAndProbs(j, k) val = 0 for m in tran: val += m[1] * (mdp.getReward(j, k, m[0]) + self.discount * temp[m[0]]) vlist.append(val) self.values[j] = max(vlist)
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    for i in range(iterations):
        oldv = self.values.copy()
        for s in mdp.getStates():
            if mdp.isTerminal(s):
                self.values[s] = 0
                continue
            maxu = None
            for a in mdp.getPossibleActions(s):
                eu = 0
                for (sp, p) in mdp.getTransitionStatesAndProbs(s, a):
                    r = mdp.getReward(s, a, sp)
                    r += self.discount * oldv[sp]
                    eu += p * r
                if maxu is None or eu > maxu:
                    maxu = eu
            self.values[s] = maxu
def __init__(self, mdp, discount=0.9, iterations=100): """ Your value iteration agent should take an mdp on construction, run the indicated number of iterations and then act according to the resulting policy. Some useful mdp methods you will use: mdp.getStates() mdp.getPossibleActions(state) mdp.getTransitionStatesAndProbs(state, action) mdp.getReward(state, action, nextState) mdp.isTerminal(state) """ self.mdp = mdp self.discount = discount self.iterations = iterations self.values = util.Counter() # A Counter is a dict with default 0 for _ in range(0, iterations): self.newValues = util.Counter() for st in mdp.getStates(): if len(mdp.getPossibleActions(st)) != 0: maxV = -sys.maxint for act in mdp.getPossibleActions(st): newV = 0 for tst, prob in mdp.getTransitionStatesAndProbs( st, act): r = mdp.getReward(st, act, tst) newV += prob * (r + discount * self.values[tst]) if newV > maxV: maxV = newV self.newValues[st] = maxV else: self.newValues[st] = self.values[st] self.values = self.newValues
def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()

    for i in range(iterations):  # running the alg on the indicated number of iterations
        y = self.values.copy()  # V sub k-1
        for state in mdp.getStates():
            actions = util.Counter()
            if not mdp.isTerminal(state):
                for possibleActions in mdp.getPossibleActions(state):
                    for transitionState, prob in mdp.getTransitionStatesAndProbs(state, possibleActions):
                        value_iteration = prob * (mdp.getReward(state, possibleActions, transitionState)
                                                  + (discount * y[transitionState]))
                        actions[possibleActions] += value_iteration
                self.values[state] = actions[actions.argMax()]
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    for i in range(self.iterations):  # run the indicated iterations
        valuesCopy = self.values.copy()  # a copy of the Counter
        for state in self.mdp.getStates():
            tmpValues = util.Counter()
            for action in self.mdp.getPossibleActions(state):
                for nextState, prob in self.mdp.getTransitionStatesAndProbs(state, action):
                    tmpValues[action] += prob * (mdp.getReward(state, action, nextState)
                                                 + self.discount * valuesCopy[nextState])
            self.values[state] = tmpValues[tmpValues.argMax()]  # keep the highest action value
def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0
    "*** YOUR CODE HERE ***"
    # OUR CODE HERE
    # Note: I think we should use the util.Counter thing?
    for times in range(0, iterations):
        # values from previous iteration so we don't update over them while iterating
        prevVals = self.values.copy()
        # iterate through all states
        for state in mdp.getStates():
            # will store the action-values for this iteration
            value = util.Counter()
            for action in mdp.getPossibleActions(state):
                for transitionState, probability in mdp.getTransitionStatesAndProbs(state, action):
                    # expected value: probability * (reward for the transition + discounted old value)
                    value[action] += probability * (mdp.getReward(state, action, transitionState)
                                                    + discount * prevVals[transitionState])
            # update to the new value from this iteration;
            # argMax() returns the action with the largest value
            self.values[state] = value[value.argMax()]
def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    states = mdp.getStates()
    for i in range(iterations):
        lastValues = self.values.copy()
        for s in states:
            actions = mdp.getPossibleActions(s)
            if not actions:
                continue
            values = []
            for a in actions:
                total = 0
                for s2, p in mdp.getTransitionStatesAndProbs(s, a):
                    total += p * lastValues[s2]
                values.append(mdp.getReward(s, None, None) + self.discount * total)
            self.values[s] = max(values)
def __init__(self, mdp, discount = 0.9, iterations = 100): """ Your value iteration agent should take an mdp on construction, run the indicated number of iterations and then act according to the resulting policy. Some useful mdp methods you will use: mdp.getStates() mdp.getPossibleActions(state) mdp.getTransitionStatesAndProbs(state, action) mdp.getReward(state, action, nextState) mdp.isTerminal(state) """ self.mdp = mdp self.discount = discount self.iterations = iterations self.values = util.Counter() # A Counter is a dict with default 0 helper_vector = util.Counter() # Copy of vectors to be used for batch updating for i in range(self.iterations): for state in mdp.getStates(): if mdp.isTerminal(state): continue if mdp.getPossibleActions(state): helper_vector[state] = sum([transition[1]*(mdp.getReward(state,mdp.getPossibleActions(state)[0],transition[0])+self.discount*self.values[transition[0]]) for transition in mdp.getTransitionStatesAndProbs(state, mdp.getPossibleActions(state)[0])] ) for action in mdp.getPossibleActions(state): helper_vector[state] = max(helper_vector[state],sum([ transition[1]*(mdp.getReward(state, action, transition[0])+self.discount*self.values[transition[0]]) for transition in mdp.getTransitionStatesAndProbs(state, action)] )) for state in helper_vector: self.values[state] = helper_vector[state]
def computeActionFromValues(self, state):
    """
    The policy is the best action in the given state
    according to the values currently stored in self.values.

    You may break ties any way you see fit.  Note that if
    there are no legal actions, which is the case at the
    terminal state, you should return None.
    """
    "*** YOUR CODE HERE ***"
    mdp = self.mdp
    possibleActions = mdp.getPossibleActions(state)
    valuesForAction = util.Counter()
    if mdp.isTerminal(state):
        return None
    for action in possibleActions:
        total = 0
        for nextState, prob in mdp.getTransitionStatesAndProbs(state, action):
            total = total + prob * (mdp.getReward(state, action, nextState)
                                    + self.discount * self.values[nextState])
        valuesForAction[action] = total
    if valuesForAction.totalCount() == 0:
        # All Q-values are zero: break the tie by returning the first legal action.
        return possibleActions[0]
    return valuesForAction.argMax()
def computeActionFromValues(self, state):
    """
    The policy is the best action in the given state
    according to the values currently stored in self.values.

    You may break ties any way you see fit.  Note that if
    there are no legal actions, which is the case at the
    terminal state, you should return None.
    """
    "*** YOUR CODE HERE ***"
    mdp = self.mdp
    actions = mdp.getPossibleActions(state)
    if len(actions) == 0:
        # No legal actions (terminal state): return None, as the docstring requires.
        return None
    best_action = (float("-inf"), None)
    for a in actions:
        value_of_action = 0
        for new_state, prob in mdp.getTransitionStatesAndProbs(state, a):
            reward = mdp.getReward(state, a, new_state)
            value_of_action += prob * (reward + self.discount * self.values[new_state])
        if value_of_action >= best_action[0]:
            best_action = (value_of_action, a)
    return best_action[1]
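A minimal usage sketch (hypothetical): it assumes these methods live on the project's ValueIterationAgent class, that most variants store the value table in self.values (a few use self.utilities instead), and that mdp is any object exposing the methods listed in the docstrings above.

# Hypothetical usage: construct the agent, then read off values and the greedy policy.
agent = ValueIterationAgent(mdp, discount=0.9, iterations=100)
for state in mdp.getStates():
    # V(state) after 100 sweeps, and the greedy action (None at terminal states)
    print state, agent.values[state], agent.computeActionFromValues(state)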
def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0
    "*** YOUR CODE HERE ***"
    # Init : Not required
    # Value iteration
    for i in range(iterations):
        old_values = self.values.copy()
        for state in mdp.getStates():
            value_state_action = []
            for action in mdp.getPossibleActions(state):
                val = 0
                transition = mdp.getTransitionStatesAndProbs(state, action)
                for sstate, prob_s_a_ss in transition:
                    val += prob_s_a_ss * (mdp.getReward(state, action, sstate)
                                          + discount * old_values[sstate])
                value_state_action.append(val)
            if value_state_action:
                self.values[state] = max(value_state_action)
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    for i in xrange(iterations):
        new_values = self.values.copy()
        for state in mdp.getStates():
            actions = mdp.getPossibleActions(state)
            qValues = []
            for action in actions:
                probs = mdp.getTransitionStatesAndProbs(state, action)
                qvs = [p * (self.values[nextState] * discount + mdp.getReward(state, action, nextState))
                       for nextState, p in probs]
                qValues.append(sum(qvs))
            if len(qValues) > 0:
                new_values[state] = max(qValues)
            else:
                new_values[state] = 0
        self.values = new_values
def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0
    "*** YOUR CODE HERE ***"
    for times in range(iterations):
        V = self.values.copy()
        for state in mdp.getStates():
            action_values = util.Counter()
            for action in mdp.getPossibleActions(state):
                for trans_state, prob in mdp.getTransitionStatesAndProbs(state, action):
                    action_values[action] += prob * (mdp.getReward(state, action, trans_state)
                                                     + discount * V[trans_state])
            self.values[state] = action_values[action_values.argMax()]
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0
    newValues = util.Counter()

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    allStates = mdp.getStates()
    for i in range(iterations):
        for s in allStates:
            if mdp.isTerminal(s):
                continue
            mx = float("-inf")
            for a in mdp.getPossibleActions(s):
                score = 0
                for (sp, tp) in mdp.getTransitionStatesAndProbs(s, a):
                    score += tp * (mdp.getReward(s, a, sp) + self.discount * self.values[sp])
                if score > mx:
                    mx = score
            newValues[s] = mx
        self.values = newValues.copy()
def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    for i in range(iterations):
        previous = self.values.copy()
        for state in mdp.getStates():
            possibleActions = mdp.getPossibleActions(state)
            if len(possibleActions) == 0:
                continue
            results = []
            for action in possibleActions:
                total = 0
                for (nextState, prob) in mdp.getTransitionStatesAndProbs(state, action):
                    total += prob * previous[nextState]
                results.append(total)
            self.values[state] = mdp.getReward(state) + (discount * max(results))
def __init__(self, mdp, discount = 0.9, iterations = 100): """ Your value iteration agent should take an mdp on construction, run the indicated number of iterations and then act according to the resulting policy. Some useful mdp methods you will use: mdp.getStates() mdp.getPossibleActions(state) mdp.getTransitionStatesAndProbs(state, action) mdp.getReward(state, action, nextState) """ self.mdp = mdp self.discount = discount self.iterations = iterations self.values = util.Counter() # A Counter is a dict with default 0 self.qvalues = util.Counter() self.bestact = util.Counter() "*** YOUR CODE HERE ***" states = mdp.getStates() for i in range(iterations): v = util.Counter() for state in states: if mdp.isTerminal(state): continue value = {action: sum(prob * (mdp.getReward(state,action,next_state) + discount*self.values[next_state]) for next_state, prob in mdp.getTransitionStatesAndProbs(state, action)) for action in mdp.getPossibleActions(state)} self.bestact[state] = max(value, key=value.get) v[state] = value[self.bestact[state]] for action in value.keys(): self.qvalues[state,action] = value[action] self.values = v.copy()
def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
      Your value iteration agent should take an mdp on construction, run the
      indicated number of iterations and then act according to the resulting policy.

      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
          mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    mdpStates = mdp.getStates()
    for iteration in xrange(iterations):
        newValues = util.Counter()
        for state in mdpStates:
            if self.mdp.isTerminal(state):
                continue
            actionValues = -sys.maxint - 1
            for action in mdp.getPossibleActions(state):
                total = 0
                for transitionState, prob in mdp.getTransitionStatesAndProbs(state, action):
                    total += prob * (mdp.getReward(state, action, transitionState)
                                     + discount * self.values[transitionState])
                if total > actionValues:
                    actionValues = total
            newValues[state] = actionValues
        self.values = newValues
def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
      Your value iteration agent should take an mdp on construction, run the
      indicated number of iterations and then act according to the resulting policy.

      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0
    self.policy = util.Counter()
    oldValues = util.Counter()

    states = mdp.getStates()
    for x in xrange(0, iterations):
        for state in states:
            possibleActions = mdp.getPossibleActions(state)
            qValues = util.Counter()
            for action in possibleActions:
                qValue = 0
                for nextState, prob in mdp.getTransitionStatesAndProbs(state, action):
                    qValue += prob * (mdp.getReward(state, action, nextState) + discount * oldValues[nextState])
                qValues[action] = qValue
            bestAction = qValues.argMax()
            self.values[state] = qValues[bestAction]
        for value in self.values:
            oldValues[value] = self.values[value]
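# Note on the constructors above: each one keeps two value tables (newValues /
# oldValues, or a .copy() of self.values) so that every state in sweep k+1 is
# backed up from the values of sweep k. That is the "batch" form of value
# iteration. Updating self.values in place while sweeping the states would
# instead give a Gauss-Seidel-style update, which also converges when every
# state keeps being updated, but produces different intermediate estimates;
# the two-table bookkeeping is the reason for the .copy() calls around each sweep.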
def computeQValueFromValues(self, state, action):
    """
      Compute the Q-value of action in state from the
      value function stored in self.values.
    """
    "*** YOUR CODE HERE ***"
    mdp = self.mdp
    discount = self.discount
    values = self.values
    # getTransitionStatesAndProbs returns (nextState, prob) pairs, e.g.
    # [((0, 1), 1.0), ((0, 0), 0.0), ((0, 2), 0.0)]
    total = 0
    for resultState, prob in mdp.getTransitionStatesAndProbs(state, action):
        total += prob * (mdp.getReward(state, action, resultState) + discount * values[resultState])
    return total
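# What the method above computes, written out: the one-step lookahead Q-value
#
#     Q(s, a) = sum_{s'} T(s, a, s') * (R(s, a, s') + gamma * V(s'))
#
# For instance, if the only successor reached with nonzero probability is s'
# with T(s, a, s') = 1.0, this reduces to R(s, a, s') + gamma * V(s').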
def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
      Your value iteration agent should take an mdp on construction, run the
      indicated number of iterations and then act according to the resulting policy.

      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.utilities = {}
    self.qvalues = {}

    states = mdp.getStates()
    for state in states:
        self.utilities[state] = 0
        self.qvalues[state] = util.Counter()

    for i in range(iterations):
        newUtilities = {}
        for state in states:
            if self.mdp.isTerminal(state):
                continue
            childQs = []
            for action in mdp.getPossibleActions(state):
                q_value = 0
                for transition in mdp.getTransitionStatesAndProbs(state, action):
                    q_value += transition[1] * (mdp.getReward(state, action, transition[0])
                                                + discount * self.utilities[transition[0]])
                childQs.append(q_value)
            newUtilities[state] = max(childQs)
        self.utilities.update(newUtilities)

    """ q-values are a dictionary from states to dictionaries of action => qvalue mappings """
    for state in states:
        for action in mdp.getPossibleActions(state):
            q_value = 0
            for transition in mdp.getTransitionStatesAndProbs(state, action):
                q_value += transition[1] * (mdp.getReward(state, action, transition[0])
                                            + discount * self.utilities[transition[0]])
            self.qvalues[state][action] = q_value
def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
      Your value iteration agent should take an mdp on construction, run the
      indicated number of iterations and then act according to the resulting policy.

      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    "*** YOUR CODE HERE ***"
    ## get dict of transitions for all (state, action, destination):
    self.T = dict()
    self.R = dict()
    self.D = dict()
    for state in self.mdp.getStates():
        if self.mdp.isTerminal(state):
            continue
        actions = self.mdp.getPossibleActions(state)
        for action in actions:
            for (destination, prob) in mdp.getTransitionStatesAndProbs(state, action):
                self.T[(state, action, destination)] = prob
                self.R[(state, action, destination)] = mdp.getReward(state, action, destination)
                if (state, action) not in self.D:
                    self.D[(state, action)] = [destination]
                else:
                    self.D[(state, action)] += [destination]

    for _ in range(iterations):
        copyVals = util.Counter()
        for state in self.mdp.getStates():
            if self.mdp.isTerminal(state):
                continue
            actions = self.mdp.getPossibleActions(state)
            bestVals = []
            for action in actions:
                bestVals += [self.getQValue(state, action)]
            copyVals[state] = max(bestVals)
        self.values = copyVals.copy()
def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
      Your value iteration agent should take an mdp on construction, run the
      indicated number of iterations and then act according to the resulting policy.

      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
          mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    # keep track of the number of iterations we have done so far
    i = 0
    # final output value
    v = 0
    # get all the states
    states = mdp.getStates()
    # for each of the specified iterations:
    while i < iterations:
        # save the current self.values
        oldSV = self.values.copy()
        # increment our variable for number of iterations
        i = i + 1
        # for each of the states,
        for s in states:
            # get the value at this state
            v = util.Counter()
            # look at all possible actions from that state
            actions = mdp.getPossibleActions(s)
            # for each state action pair ...
            for a in actions:
                # get the transition states and the probabilities of
                # reaching those states
                tStatesAndProbs = mdp.getTransitionStatesAndProbs(s, a)
                # keep track of the number of pairs we have seen so far
                j = 0
                # for each pair in tStatesAndProbs,
                while j < len(tStatesAndProbs):
                    # extract tState and prob from this member of the list
                    tState = tStatesAndProbs[j][0]
                    prob = tStatesAndProbs[j][1]
                    # set the value associated with that move
                    # make sure to account for prob and discount
                    v[a] = v[a] + (mdp.getReward(s, a, tState) + discount * oldSV[tState]) * prob
                    # increment
                    j = j + 1
            # keep the value of the best action for this state
            self.values[s] = v[v.argMax()]
def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
      Your value iteration agent should take an mdp on construction, run the
      indicated number of iterations and then act according to the resulting policy.

      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
          mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0
    self.actions = util.Counter()
    tempValues = util.Counter()

    # Write value iteration code here
    for k in range(0, iterations):
        for state in mdp.getStates():
            maxAction = float("-inf")
            for action in mdp.getPossibleActions(state):
                total = 0
                for nextState, prob in mdp.getTransitionStatesAndProbs(state, action):
                    total += prob * (self.values[nextState] * discount + mdp.getReward(state, action, nextState))
                maxAction = max(maxAction, total)
            tempValues[state] = maxAction
        for state in mdp.getStates():
            if tempValues[state] > float("-inf"):
                self.values[state] = tempValues[state]

    for state in mdp.getStates():
        maxAction = None
        maxActionValue = float("-inf")
        for action in mdp.getPossibleActions(state):
            total = 0
            for nextState, prob in mdp.getTransitionStatesAndProbs(state, action):
                total += prob * (self.values[nextState] * discount + mdp.getReward(state, action, nextState))
            if total > maxActionValue:
                maxActionValue = total
                maxAction = action
        self.actions[state] = maxAction
def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
      Your value iteration agent should take an mdp on construction, run the
      indicated number of iterations and then act according to the resulting policy.

      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations

    "*** YOUR CODE HERE ***"
    "value at each state"
    self.V = util.Counter()
    self.tempV = util.Counter()
    "Q for each state,action pair"
    self.Q = util.Counter()
    "policy for each state = best action to take"
    self.P = util.Counter()

    gamma = self.discount
    for iter in range(1, self.iterations + 1):
        for state in mdp.getStates():
            "There is a Q for each (state,action) pair, so index this by state and keep a list of all actions"
            self.Q[state] = util.Counter()
            "Cycle through each possible action for the given state"
            for action in mdp.getPossibleActions(state):
                for neighborStateAndTransitionProb in mdp.getTransitionStatesAndProbs(state, action):
                    [neighborState, T_s_a_sp] = neighborStateAndTransitionProb
                    "Compute the Q values for this state and the available actions"
                    R_s_a_sp = mdp.getReward(state, action, neighborState)
                    self.Q[state][action] += T_s_a_sp * (R_s_a_sp + gamma * self.V[neighborState])
            "As long as there were actions at this state, find the one that produces the largest Q value"
            if len(self.Q[state]) > 0:
                maxQstate = -1000000
                maxQAction = None
                for key, value in self.Q[state].items():
                    if value > maxQstate:
                        maxQstate = value
                        maxQAction = key
                    elif value == maxQstate:
                        [maxQstate, maxQAction] = random.choice([[maxQstate, maxQAction], [value, key]])
                if maxQstate == -1000000:
                    maxQstate = 0.0
                "Find the policy (or best action) that corresponds to the best Q value"
                self.P[state] = maxQAction
                "Choose the value of the state to be the max Q value that the state has"
                self.tempV[state] = self.Q[state][maxQAction]
        "After all states have been updated, store tempV in V before the next iteration"
        self.V = self.tempV.copy()
def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
      Your value iteration agent should take an mdp on construction, run the
      indicated number of iterations and then act according to the resulting policy.

      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
          mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    "*** MY CODE BEGINS ***"
    k = 0
    while k < iterations:
        val = self.values.copy()  # before each iteration, copy one.
        for s in mdp.getStates():
            if not mdp.isTerminal(s):
                max = -999999
                for action in mdp.getPossibleActions(s):
                    v = 0
                    for pos_pro in mdp.getTransitionStatesAndProbs(s, action):
                        v = v + pos_pro[1] * (mdp.getReward(s, action, pos_pro[0]) + discount * self.values[pos_pro[0]])
                    if v > max:
                        max = v
                val[s] = max
            else:
                for action in mdp.getPossibleActions(s):
                    v = 0
                    for pos_pro in mdp.getTransitionStatesAndProbs(s, action):
                        v = v + pos_pro[1] * (mdp.getReward(s, action, pos_pro[0]) + discount * self.values[pos_pro[0]])
                    val[s] = v
        k = k + 1
        for s in mdp.getStates():
            self.values[s] = val[s]
def computeQValueFromValues(self, state, action):
    """
      Compute the Q-value of action in state from the
      value function stored in self.values.
    """
    "*** YOUR CODE HERE ***"
    mdp = self.mdp
    gamma = self.discount
    successors = mdp.getTransitionStatesAndProbs(state, action)
    return sum([successor[1] * (mdp.getReward(state, action, successor[0])
                                + gamma * self.getValue(successor[0]))
                for successor in successors])
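# Once the values and Q-values above are in place, acting "according to the
# resulting policy" just means taking an argmax over the legal actions. The
# body below is a minimal sketch of one way to do that, assuming a
# computeQValueFromValues like the ones above and util.Counter's argMax();
# it is one possible implementation, not the only acceptable one.
def computeActionFromValues(self, state):
    """Return the best action in `state` under self.values, or None if there are no legal actions."""
    actions = self.mdp.getPossibleActions(state)
    if not actions:
        return None
    qValues = util.Counter()
    for action in actions:
        qValues[action] = self.computeQValueFromValues(state, action)
    # argMax() returns a key with the highest value (ties broken arbitrarily).
    return qValues.argMax()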