def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    for j in range(self.iterations):
        newVal = self.values.copy()
        for s in mdp.getStates():
            # terminal states keep their default value of 0
            if not mdp.isTerminal(s):
                # compute the Q-value of every action and keep the best one
                k = [self.computeQValueFromValues(s, action)
                     for action in mdp.getPossibleActions(s)]
                newVal[s] = max(k)
        self.values = newVal
def runValueIteration(self): "*** YOUR CODE HERE ***" mdp = self.mdp predecessors = {state: set() for state in mdp.getStates()} for state in mdp.getStates(): for action in mdp.getPossibleActions(state): for nextState, prob in mdp.getTransitionStatesAndProbs( state, action): if prob > 0: predecessors[nextState].add(state) queue = util.PriorityQueue() for state in mdp.getStates(): if not mdp.isTerminal(state): diff = abs(self.values[state] - max([ self.getQValue(state, action) for action in mdp.getPossibleActions(state) ])) queue.update(state, -diff) for i in range(self.iterations): if queue.isEmpty(): break state = queue.pop() if not mdp.isTerminal(state): self.values[state] = max([ self.getQValue(state, action) for action in mdp.getPossibleActions(state) ]) for pred in predecessors[state]: diff = abs(self.values[pred] - max([ self.getQValue(pred, action) for action in mdp.getPossibleActions(pred) ])) if diff > self.theta: queue.update(pred, -diff)
def runValueIteration(self): "*** YOUR CODE HERE ***" mdp = self.mdp state = mdp.getStates() predecessor_list = {} discount = self.discount iterations = self.iterations theta = self.theta for temp_state in state: predecessor_list[temp_state] = self.getpredecessor(temp_state) pq = util.PriorityQueue() for temp_state in state: if not mdp.isTerminal(temp_state): pq.push(temp_state, -self.find_difference(temp_state)) for i in range(iterations): if pq.isEmpty(): return cur_state = pq.pop() if not mdp.isTerminal(cur_state): action = self.getAction(cur_state) self.values[cur_state] = self.getQValue(cur_state, action) for pre in predecessor_list[cur_state]: diff_pre = self.find_difference(pre) if diff_pre > theta: pq.update(pre, -diff_pre)
def runValueIteration(self): "*** YOUR CODE HERE ***" mdp = self.mdp values = self.values discount = self.discount predecessors = {} for state in mdp.getStates(): preList = [] for preState in mdp.getStates(): for action in mdp.getPossibleActions(preState): if state in [ pair[0] for pair in mdp.getTransitionStatesAndProbs( preState, action) if pair[1] > 0 ]: preList.append(preState) break predecessors[state] = preList queue = util.PriorityQueue() for s in mdp.getStates(): if not mdp.isTerminal(s): actions = mdp.getPossibleActions(s) realValue = max( sum(prob * (mdp.getReward(s, action, nextState) + (discount * values[nextState])) for (nextState, prob ) in mdp.getTransitionStatesAndProbs(s, action)) for action in actions) diff = abs(realValue - values[s]) queue.push(s, 0 - diff) for _ in range(self.iterations): if queue.isEmpty(): return s = queue.pop() if not mdp.isTerminal(s): actions = mdp.getPossibleActions(s) values[s] = max( sum(prob * (mdp.getReward(s, action, nextState) + (discount * values[nextState])) for (nextState, prob ) in mdp.getTransitionStatesAndProbs(s, action)) for action in actions) for p in predecessors[s]: actions = mdp.getPossibleActions(p) realValue = max( sum(prob * (mdp.getReward(p, action, nextState) + (discount * values[nextState])) for (nextState, prob ) in mdp.getTransitionStatesAndProbs(p, action)) for action in actions) diff = abs(realValue - values[p]) if diff > self.theta: queue.update(p, 0 - diff)
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    while self.iterations > 0:
        batchValues = util.Counter()
        for state in mdp.getStates():
            if mdp.isTerminal(state):
                continue
            maxM = float("-inf")
            for action in mdp.getPossibleActions(state):
                statesProbs = mdp.getTransitionStatesAndProbs(state, action)
                sumU = 0  # expected utility of the successor states
                Rs = 0    # expected immediate reward
                for nextState, prob in statesProbs:
                    sumU += self.values[nextState] * prob
                    Rs += mdp.getReward(state, action, nextState) * prob
                v = Rs + sumU * discount
                if v > maxM:
                    maxM = v
            batchValues[state] = maxM
        self.values = batchValues
        self.iterations -= 1

    # extract the greedy policy from the converged values
    self.policy = {}
    for state in mdp.getStates():
        if mdp.isTerminal(state):
            self.policy[state] = None
            continue
        QValues = [self.getQValue(state, action)
                   for action in mdp.getPossibleActions(state)]
        self.policy[state] = mdp.getPossibleActions(state)[QValues.index(max(QValues))]
def get_average_reward(agent, mdp):
    # follow the agent's policy from the start state, always taking the
    # first listed successor, and average the state values along the way
    current_state = mdp.getStartState()
    iterations = 0
    total = 0
    while not mdp.isTerminal(current_state) and iterations < 1000:
        total += agent.getValue(current_state)
        action = agent.getPolicy(current_state)
        next_states = mdp.getTransitionStatesAndProbs(current_state, action)
        current_state = next_states[0][0]
        iterations += 1
    if mdp.isTerminal(current_state):
        return total / iterations
    return 0
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    for ite in range(iterations):  # repeat
        stateValues = util.Counter()  # initially all values are 0
        for state in mdp.getStates():
            if not mdp.isTerminal(state):  # if not a terminal state
                # maximize the expected utility of each state
                stateValues[state] = max(
                    self.computeQValueFromValues(state, action)
                    for action in mdp.getPossibleActions(state))
        self.values = stateValues
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    states = mdp.getStates()
    for _ in range(0, self.iterations):
        # create temporary values for the k-th iteration update
        tempValues = util.Counter()
        # loop over states to find the max Q-value
        for state in states:
            if not mdp.isTerminal(state):
                actions = mdp.getPossibleActions(state)
                tempValues[state] = max(
                    [self.getQValue(state, action) for action in actions])
        # set values to the k-th iteration's updated values
        self.values = tempValues
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    for i in range(iterations):
        # batch update: compute all new values from the previous sweep
        vals = self.values.copy()
        for s in mdp.getStates():
            if mdp.isTerminal(s):
                vals[s] = 0
            else:
                vals[s] = self.computeQValueFromValues(
                    s, self.computeActionFromValues(s))
        self.values = vals
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    states = mdp.getStates()
    for i in range(iterations):
        # Make a new dictionary to memorize new values
        newValues = util.Counter()
        # For every state...
        for curState in states:
            if mdp.isTerminal(curState):
                continue
            action = self.getAction(curState)
            # Compute new Q value
            newValues[curState] = self.computeQValueFromValues(curState, action)
        # Memorize new values
        self.values = newValues.copy()
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
    """
    "*** YOUR CODE HERE ***"
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # value of each state; a Counter is a dict with default 0

    # run for the desired number of iterations
    for i in range(iterations):
        new_values = self.values.copy()
        for s in mdp.getStates():
            if not mdp.isTerminal(s):
                # equivalently: for each action, sum
                # p * (R(s, a, s') + discount * V(s')) over its transitions
                # and keep the best action's total
                new_values[s] = max(self.getQValue(s, a)
                                    for a in mdp.getPossibleActions(s))
        self.values = new_values
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    for k in range(iterations):
        tmp = util.Counter()
        for s in mdp.getStates():
            if mdp.isTerminal(s):
                tmp[s] = 0
            else:
                maior = float("-inf")  # best Q-value seen so far
                for a in mdp.getPossibleActions(s):
                    qv = self.computeQValueFromValues(s, a)
                    if qv > maior:
                        maior = qv
                tmp[s] = maior
        self.values = tmp
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    "*** YOUR CODE HERE ***"
    i = 0
    while i < iterations:
        copy_value = self.values.copy()
        for s in mdp.getStates():
            if not mdp.isTerminal(s):
                # Bellman update: best action's expected reward plus the
                # discounted value of its successors
                self.values[s] = max(
                    sum(p * (mdp.getReward(s, a, s1) + discount * copy_value[s1])
                        for (s1, p) in mdp.getTransitionStatesAndProbs(s, a))
                    for a in mdp.getPossibleActions(s))
        i = i + 1
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    states = mdp.getStates()
    for i in range(iterations):
        # batch update: compute the new values from the previous sweep
        # before overwriting self.values
        newValues = self.values.copy()
        for state in states:
            if not mdp.isTerminal(state):
                actionValue = -float('inf')
                for action in mdp.getPossibleActions(state):
                    qValue = self.computeQValueFromValues(state, action)
                    actionValue = max(actionValue, qValue)
                newValues[state] = actionValue
        self.values = newValues
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    # Outer loop runs once per iteration; each pass recomputes the value
    # of every state.
    for i in range(0, iterations):
        newDict = util.Counter()  # create another dictionary to prevent mutation
        for state in mdp.getStates():
            if mdp.isTerminal(state):
                newDict[state] = 0  # set zero when it's a terminal node
            else:
                newDict[state] = max(
                    self.getQValue(state, action)
                    for action in mdp.getPossibleActions(state))
        self.values = newDict  # update the raw values
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    # the mdp (the decision process) is known here, unlike in Q-learning,
    # where the model is not known
    for iteration in range(iterations):  # iterate `iterations` times
        # the values of the current step; step k + 1 always starts from
        # the values of step k
        currentStepValues = self.values.copy()
        # the full list of possible states; in our case, every square on the map
        for nextState in mdp.getStates():
            # a terminal state never changes its value
            if mdp.isTerminal(nextState):
                continue
            # recompute this state's value from the previous values;
            # this update is in fact the only operation in an iteration
            currentStepValues[nextState] = self.computeQValueFromValues(
                nextState, self.computeActionFromValues(nextState))
        self.values = currentStepValues.copy()  # save the new values
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    for i in range(iterations):
        nextValues = util.Counter()
        for state in mdp.getStates():
            if mdp.isTerminal(state):
                continue
            first = True
            for action in mdp.getPossibleActions(state):
                qValue = 0
                for (nextState, prob) in mdp.getTransitionStatesAndProbs(state, action):
                    reward = mdp.getReward(state, action, nextState)
                    qValue += prob * (reward + discount * self.values[nextState])
                if first:
                    maxQValue = qValue
                    first = False
                elif qValue > maxQValue:
                    maxQValue = qValue
            nextValues[state] = maxQValue
        self.values = nextValues
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    states = self.mdp.getStates()
    for iteration in range(iterations):
        # batch update: read the previous sweep's values, write the new
        # ones only after the full sweep
        newValues = self.values.copy()
        for state in states:
            if not mdp.isTerminal(state):
                action = self.computeActionFromValues(state)
                newValues[state] = self.computeQValueFromValues(state, action)
        self.values = newValues
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    for _ in range(self.iterations):
        stateValue = util.Counter()  # fresh values for this sweep
        for state in self.mdp.getStates():
            if not mdp.isTerminal(state):  # check if the given state is terminal
                actions = list()  # Q-values of the possible actions
                for action in self.mdp.getPossibleActions(state):
                    # get the Q-value of each possible action
                    actions.append(self.getQValue(state, action))
                if actions:
                    # the best action's value becomes the state's value
                    stateValue[state] = max(actions)
        self.values = stateValue.copy()
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    "*** YOUR CODE HERE ***"
    for x in range(0, iterations):
        statevalues = util.Counter()
        for state in mdp.getStates():
            if mdp.isTerminal(state):
                statevalues[state] = 0
                continue
            actionvalue = float("-inf")
            for action in mdp.getPossibleActions(state):
                S_next = self.getQValue(state, action)
                if actionvalue < S_next:
                    actionvalue = S_next
            statevalues[state] = actionvalue
        self.values = statevalues
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    "*** YOUR CODE HERE ***"
    allStates = mdp.getStates()
    vPrimes = util.Counter()  # A Counter is a dict with default 0
    iteration = 0
    while iteration < iterations:
        for s in allStates:
            if mdp.isTerminal(s):
                vPrimes[s] = mdp.getReward(s, None, s)
            else:
                sreward = mdp.getReward(s, None, s)
                vPrimes[s] = sreward + discount * self.utilOfBestAction(mdp, s)
        for s in allStates:
            self.values[s] = vPrimes[s]
        iteration += 1
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    # while still iterations:
    #   for each state, for each action, get Q(state, action)
    #   store the largest Q(state, action) in the Counter
    for i in range(self.iterations):
        # copy so that updates in this sweep read last sweep's values
        newValues = self.values.copy()
        for state in mdp.getStates():
            if not mdp.isTerminal(state):
                v = [self.computeQValueFromValues(state, action)
                     for action in mdp.getPossibleActions(state)]
                newValues[state] = max(v)
        self.values = newValues
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    for i in range(iterations):
        valuesNew = util.Counter()
        for state in mdp.getStates():
            if not mdp.isTerminal(state):
                vals = util.Counter()
                for possact in mdp.getPossibleActions(state):
                    vals[possact] = self.computeQValueFromValues(state, possact)
                valuesNew[state] = max(vals.values())
        for st2 in valuesNew:
            self.values[st2] = valuesNew[st2]
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0
    self.qvalues = util.Counter()
    self.bestact = util.Counter()

    "*** YOUR CODE HERE ***"
    states = mdp.getStates()
    for i in range(iterations):
        v = util.Counter()
        for state in states:
            if mdp.isTerminal(state):
                continue
            # expected discounted return of every action
            value = {action: sum(prob * (mdp.getReward(state, action, next_state)
                                         + discount * self.values[next_state])
                                 for next_state, prob in
                                 mdp.getTransitionStatesAndProbs(state, action))
                     for action in mdp.getPossibleActions(state)}
            self.bestact[state] = max(value, key=value.get)
            v[state] = value[self.bestact[state]]
            for action in value.keys():
                self.qvalues[state, action] = value[action]
        self.values = v.copy()
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    for i in range(iterations):
        tmp = self.values.copy()
        for state in mdp.getStates():
            if not mdp.isTerminal(state):
                q_value = float("-inf")
                for action in mdp.getPossibleActions(state):
                    q_value = max(q_value,
                                  self.computeQValueFromValues(state, action))
                tmp[state] = q_value
        self.values = tmp
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    for loop in range(self.iterations):
        value_dict = util.Counter()
        mdp_list = mdp.getStates()
        for state in mdp_list:
            # check whether the state is terminal
            if not mdp.isTerminal(state):
                # make a counter and get the tuple of possible actions
                counter_dict = util.Counter()
                actions_tuple = mdp.getPossibleActions(state)
                # iterate and get the Q-value of each action
                for action in actions_tuple:
                    counter_dict[action] = self.computeQValueFromValues(state, action)
                # take the max of the counter_dict values
                value_dict[state] = max(counter_dict.values())
        # make a copy and update the values
        self.values = value_dict.copy()
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    "*** YOUR CODE HERE ***"
    for time in range(iterations):
        values = util.Counter()
        for state in mdp.getStates():
            if mdp.isTerminal(state):
                values[state] = 0
            else:
                maxValue = float("-inf")
                for action in mdp.getPossibleActions(state):
                    maxValue = max(maxValue, self.getQValue(state, action))
                values[state] = maxValue
        self.values = values
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    allStates = mdp.getStates()
    for i in range(iterations):
        newValues = util.Counter()  # fresh values each sweep
        for s in allStates:
            if mdp.isTerminal(s):
                continue
            mx = float("-inf")
            for a in mdp.getPossibleActions(s):
                score = 0
                for (sp, tp) in mdp.getTransitionStatesAndProbs(s, a):
                    score += tp * (mdp.getReward(s, a, sp)
                                   + self.discount * self.values[sp])
                if score > mx:
                    mx = score
            newValues[s] = mx
        self.values = newValues.copy()
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0
    self.tempvalues = self.values.copy()

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    for i in range(0, self.iterations):
        for allstate in mdp.getStates():
            if not mdp.isTerminal(allstate):
                value = []
                for possibleaction in mdp.getPossibleActions(allstate):
                    value.append(self.computeQValueFromValues(allstate, possibleaction))
                self.tempvalues[allstate] = max(value)
        self.values = self.tempvalues.copy()
def computeActionFromValues(self, state):
    """
    The policy is the best action in the given state according to the
    values currently stored in self.values.

    You may break ties any way you see fit. Note that if there are no
    legal actions, which is the case at the terminal state, you should
    return None.
    """
    "*** YOUR CODE HERE ***"
    mdp = self.mdp
    possibleActions = mdp.getPossibleActions(state)
    maxActionValue = float('-inf')
    maxAction = None
    if not possibleActions or mdp.isTerminal(state):
        return None
    for action in possibleActions:
        actionSum = self.getQValue(state, action)
        # find the maximum action
        if maxActionValue < actionSum:
            maxAction = action
            maxActionValue = actionSum
    return maxAction
def runValueIteration(self):
    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    mdp = self.mdp
    for i in range(0, self.iterations):
        newValues = {}
        statesInMDP = mdp.getStates()
        for currentState in statesInMDP:
            if not mdp.isTerminal(currentState):
                actions = mdp.getPossibleActions(currentState)
                maxActionReward = -math.inf
                for currentAction in actions:
                    currentActionReward = self.computeQValueFromValues(
                        currentState, currentAction)
                    if currentActionReward > maxActionReward:
                        maxActionReward = currentActionReward
                newValues[currentState] = maxActionReward
            else:
                newValues[currentState] = self.getValue(currentState)
        self.values = newValues
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    "*** YOUR CODE HERE ***"
    for i in range(0, iterations):
        new_values = self.values.copy()
        for s in mdp.getStates():
            if mdp.isTerminal(s):
                new_values[s] = 0.0
            else:
                values = []
                actions = mdp.getPossibleActions(s)
                for anaction in actions:
                    values.append(self.getQValue(s, anaction))
                new_values[s] = max(values)
        self.values = new_values
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    for i in range(0, iterations):
        valuesCopy = self.values.copy()
        for state in mdp.getStates():
            if mdp.isTerminal(state):
                valuesCopy[state] = 0
                continue
            maximumScore = float("-inf")
            for action in mdp.getPossibleActions(state):
                currQVal = self.computeQValueFromValues(state, action)
                maximumScore = max(maximumScore, currQVal)
            valuesCopy[state] = maximumScore
        self.values = valuesCopy
def computeActionFromValues(self, state):
    """
    The policy is the best action in the given state according to the
    values currently stored in self.values.

    You may break ties any way you see fit. Note that if there are no
    legal actions, which is the case at the terminal state, you should
    return None.
    """
    "*** YOUR CODE HERE ***"
    mdp = self.mdp
    possibleActions = mdp.getPossibleActions(state)
    valuesForAction = util.Counter()
    if mdp.isTerminal(state):
        return None
    for action in possibleActions:
        total = 0
        for nextState, prob in mdp.getTransitionStatesAndProbs(state, action):
            total += prob * (mdp.getReward(state, action, nextState)
                             + self.discount * self.values[nextState])
        valuesForAction[action] = total
    # argMax on an all-zero Counter is unreliable, so fall back to the
    # first legal action
    if valuesForAction.totalCount() == 0:
        return possibleActions[0]
    return valuesForAction.argMax()
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()

    for i in range(iterations):  # run the algorithm for the indicated number of iterations
        y = self.values.copy()   # V_{k-1}
        for state in mdp.getStates():
            if not mdp.isTerminal(state):
                actions = util.Counter()
                for possibleAction in mdp.getPossibleActions(state):
                    for transitionState, prob in mdp.getTransitionStatesAndProbs(state, possibleAction):
                        value_iteration = prob * (
                            mdp.getReward(state, possibleAction, transitionState)
                            + discount * y[transitionState])
                        actions[possibleAction] += value_iteration
                self.values[state] = actions[actions.argMax()]
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0
    helper_vector = util.Counter()  # copy of the values, used for batch updating

    for i in range(self.iterations):
        for state in mdp.getStates():
            if mdp.isTerminal(state):
                continue
            actions = mdp.getPossibleActions(state)
            if actions:
                helper_vector[state] = max(
                    sum(prob * (mdp.getReward(state, action, nextState)
                                + self.discount * self.values[nextState])
                        for nextState, prob in
                        mdp.getTransitionStatesAndProbs(state, action))
                    for action in actions)
        for state in helper_vector:
            self.values[state] = helper_vector[state]
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    for i in range(self.iterations):
        valCopy = self.values.copy()
        for state in self.mdp.getStates():
            if not mdp.isTerminal(state):
                maxVal = float("-inf")
                for action in mdp.getPossibleActions(state):
                    maxVal = max(maxVal,
                                 self.computeQValueFromValues(state, action))
                valCopy[state] = maxVal
        self.values = valCopy
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    for i in range(iterations):
        temp = self.values.copy()
        for j in mdp.getStates():
            if not mdp.isTerminal(j):
                vlist = []
                for k in mdp.getPossibleActions(j):
                    val = 0
                    for m in mdp.getTransitionStatesAndProbs(j, k):
                        val += m[1] * (mdp.getReward(j, k, m[0])
                                       + self.discount * temp[m[0]])
                    vlist.append(val)
                self.values[j] = max(vlist)
def computeActionFromValues(self, state):
    """
    The policy is the best action in the given state according to the
    values currently stored in self.values.

    You may break ties any way you see fit. Note that if there are no
    legal actions, which is the case at the terminal state, you should
    return None.
    """
    "*** YOUR CODE HERE ***"
    mdp = self.mdp
    max_val = float("-inf")
    direction = None
    # there are no legal actions at the terminal state
    if mdp.isTerminal(state):
        return None
    for action in mdp.getPossibleActions(state):
        Q_value = self.computeQValueFromValues(state, action)
        if max_val < Q_value:
            max_val = Q_value
            direction = action
    return direction
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    for i in range(iterations):
        oldv = self.values.copy()
        for s in mdp.getStates():
            if mdp.isTerminal(s):
                self.values[s] = 0
                continue
            maxu = None
            for a in mdp.getPossibleActions(s):
                eu = 0  # expected utility of taking action a
                for (sp, p) in mdp.getTransitionStatesAndProbs(s, a):
                    r = mdp.getReward(s, a, sp)
                    r += self.discount * oldv[sp]
                    eu += p * r
                if maxu is None or eu > maxu:
                    maxu = eu
            self.values[s] = maxu
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    states = self.mdp.getStates()
    for i in range(self.iterations):
        tempValue = util.Counter()
        for s in states:
            if mdp.isTerminal(s):
                tempValue[s] = 0
            else:
                val = float("-inf")
                for a in self.mdp.getPossibleActions(s):
                    temp_value = self.computeQValueFromValues(s, a)
                    val = max(temp_value, val)
                tempValue[s] = val
        self.values = tempValue
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    for i in range(0, iterations):
        next_values = util.Counter()  # fresh values each sweep
        for state in mdp.getStates():
            if not mdp.isTerminal(state):
                arr = []
                for action in self.mdp.getPossibleActions(state):
                    arr.append(self.getQValue(state, action))
                next_values[state] = max(arr)
        self.values = next_values.copy()
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    for index in range(self.iterations):
        nextValues = util.Counter()
        for state in mdp.getStates():
            if mdp.isTerminal(state):
                # terminal states have 0 value; their actual 'value' is
                # the transition reward
                continue
            qValues = []
            for action in mdp.getPossibleActions(state):
                qValues.append(self.computeQValueFromValues(state, action))
            nextValues[state] = max(qValues)
        self.values = nextValues
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0
    self.ValuesDup = util.Counter()

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    iterations = self.iterations
    while iterations > 0:
        # batch update: write into the duplicate, then swap it in
        self.ValuesDup = self.values.copy()
        for astate in mdp.getStates():
            if not mdp.isTerminal(astate):
                QVallist = [self.computeQValueFromValues(astate, action)
                            for action in mdp.getPossibleActions(astate)]
                self.ValuesDup[astate] = max(QVallist)
        self.values = self.ValuesDup.copy()
        iterations -= 1
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    self.qTable = {}
    self.vTable = {}
    for state in mdp.getStates():
        self.vTable[state] = 0
        self.qTable[state] = {}
        for action in mdp.getPossibleActions(state):
            self.qTable[state][action] = 0

    for depth in range(self.iterations):
        tempTable = {}
        for state in mdp.getStates():
            if mdp.isTerminal(state):
                tempTable[state] = 0
                continue
            stateValue = float("-inf")
            for action in mdp.getPossibleActions(state):
                Qtotal = 0
                for nextState, prob in mdp.getTransitionStatesAndProbs(state, action):
                    reward = mdp.getReward(state, action, nextState)
                    Qtotal += prob * (reward + self.discount * self.vTable[nextState])
                self.qTable[state][action] = Qtotal
                stateValue = max(stateValue, self.qTable[state][action])
            tempTable[state] = stateValue
        self.vTable = tempTable

    # refresh the Q-table once more against the final values
    for state in mdp.getStates():
        for action in mdp.getPossibleActions(state):
            Qtotal = 0
            for nextState, prob in mdp.getTransitionStatesAndProbs(state, action):
                reward = mdp.getReward(state, action, nextState)
                Qtotal += prob * (reward + self.discount * self.vTable[nextState])
            self.qTable[state][action] = Qtotal
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    "*** YOUR CODE HERE ***"
    states = mdp.getStates()
    # iterate through the user-defined number of iterations
    for num in range(iterations):
        temp = util.Counter()
        # compute U_{t+1} for all states
        for state in states:
            if mdp.isTerminal(state):
                self.values[state] = 0
                continue
            actions = mdp.getPossibleActions(state)
            maxVal = float("-inf")
            # iterate through the transitions of each action and sum up values
            for action in actions:
                transitions = mdp.getTransitionStatesAndProbs(state, action)
                totalSum = 0
                for nextState, probability in transitions:
                    reward = mdp.getReward(state, action, nextState)
                    # value of the nextState from the previous iteration
                    UtValue = self.values[nextState]
                    # standard value iteration update
                    totalSum += probability * (reward + discount * UtValue)
                maxVal = max(maxVal, totalSum)
            # assigning self.values[state] = maxVal directly would let later
            # states in this sweep read already-updated values, so stage the
            # update in temp instead
            temp[state] = maxVal
        for state in states:
            self.values[state] = temp[state]
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    "*** YOUR CODE HERE ***"
    oldValues = util.Counter()
    self.policy = util.Counter()
    states = self.mdp.getStates()
    for k in range(iterations):
        for state in states:
            actionValues = util.Counter()
            actions = self.mdp.getPossibleActions(state)
            for action in actions:
                for nextState, prob in self.mdp.getTransitionStatesAndProbs(state, action):
                    actionValues[action] += prob * (
                        mdp.getReward(state, action, nextState)
                        + self.discount * oldValues[nextState])
            best = actionValues.argMax()
            self.values[state] = actionValues[best]
            if mdp.isTerminal(state):
                self.policy[state] = None
            else:
                self.policy[state] = best
        oldValues = self.values.copy()
def computeValue(self, mdp, state, discount):
    actions = mdp.getPossibleActions(state)
    valueList = []
    if mdp.isTerminal(state):
        return
    for action in actions:
        value = 0
        for nextState, prob in mdp.getTransitionStatesAndProbs(state, action):
            value += prob * (mdp.getReward(state, action, nextState)
                             + discount * self.getValue(nextState))
        valueList.append(value)
    self.tmpValues[state] = max(valueList)
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    # fill every state with some action.
    self.actions = dict()
    for state in mdp.getStates():
        stateActions = mdp.getPossibleActions(state)
        if len(stateActions) > 0:
            action = stateActions[0]
            self.actions[state] = action

    for i in range(iterations):
        # make a copy of all the values.
        # this copy will get modified in the for-loop,
        # and at the end of the loop,
        # the new values will become the real values.
        nextValues = self.values.copy()
        # for every state, if it isn't a terminal state
        # (you can't do any action on a terminal state):
        for state in mdp.getStates():
            if not mdp.isTerminal(state):
                # get the best action.
                action = self.computeActionFromValues(state)
                self.actions[state] = action
                # get the value for doing the currently stored action.
                nextValues[state] = self.computeQValueFromValues(state, action)
        # copy the new values over the old values.
        self.values.update(nextValues)
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # NOTE: The naming conventions are a bit off. This is to emphasize the
    # connection between code and the formulas we were provided with.
    # Enjoy!
    values_temp = util.Counter()
    for i in range(iterations):
        for state in mdp.getStates():
            Q_max = float("-inf")
            # A terminal state has no actions, so we must be careful to
            # reset the value to zero here.
            if mdp.isTerminal(state):
                Q_max = 0.0
            # This is a trivial loop to find the 'best' possible action in
            # the current state, according to computed Q values. This is
            # essentially the Pythonic way of saying the following:
            #   V_k+1(s) <- max_a Q(s, a)
            for action in mdp.getPossibleActions(state):
                Q = self.getQValue(state, action)
                if Q > Q_max:
                    Q_max = Q
            values_temp[state] = Q_max
        # Store the new values.
        self.values = values_temp
        values_temp = util.Counter()
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    for i in range(self.iterations):
        newValues = self.values.copy()
        for state in self.mdp.getStates():
            if mdp.isTerminal(state):
                continue
            possibleActions = self.mdp.getPossibleActions(state)
            maxVal = float("-inf")
            for action in possibleActions:
                val = self.computeQValueFromValues(state, action)
                if val > maxVal:
                    maxVal = val
            newValues[state] = maxVal
        self.values = newValues.copy()
def __init__(self, mdp, discount=0.9, iterations=1000):
    """
    Your cyclic value iteration agent should take an mdp on construction,
    run the indicated number of iterations, and then act according to the
    resulting policy. Each iteration updates the value of only one state,
    which cycles through the states list. If the chosen state is terminal,
    nothing happens in that iteration.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = collections.defaultdict(float)
    states = self.mdp.getStates()
    for state in states:
        self.values[state] = 0

    "*** YOUR CODE HERE ***"
    for i in range(iterations):
        # each iteration updates exactly one state, cycling through the list
        state = states[i % len(states)]
        if not mdp.isTerminal(state):
            reward = mdp.getReward(state)
            actions = mdp.getPossibleActions(state)
            EUActions = []
            for action in actions:
                # expected utility of this action under the current values
                EUAction = 0
                transitions = mdp.getTransitionStatesAndProbs(state, action)
                for transitionState, transitionProbability in transitions:
                    EUAction += self.values[transitionState] * transitionProbability
                EUActions.append(EUAction)
            maxEU = max(EUActions)
            self.values[state] = reward + discount * maxEU
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0
    self.action_array = {}
    self.last_values = util.Counter()

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    for state in mdp.getStates():
        self.values[state] = 0.0
    while self.iterations > 0:
        # batch update: stage new values, swap them in after the sweep
        newValues = self.values.copy()
        for state in mdp.getStates():
            if mdp.isTerminal(state):
                continue
            # Q-value of every legal action in this state
            qact = {action: self.computeQValueFromValues(state, action)
                    for action in mdp.getPossibleActions(state)}
            self.action_array[state] = max(qact, key=qact.get)
            newValues[state] = qact[self.action_array[state]]
        self.values = newValues
        self.last_values = self.values.copy()
        self.iterations -= 1
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    for iterate in range(0, self.iterations):
        Unew = util.Counter()
        for state in mdp.getStates():
            if mdp.isTerminal(state):
                Unew[state] = 0
                continue
            actions = mdp.getPossibleActions(state)
            if not actions:
                Unew[state] = 0
                continue
            maxQvalue = float("-inf")
            for a in actions:
                qValue = self.getQValue(state, a)
                if qValue > maxQvalue:
                    maxQvalue = qValue
            Unew[state] = maxQvalue
        self.values = Unew
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    for iteration in range(iterations):
        newvalues = util.Counter()
        for state in mdp.getStates():
            if mdp.isTerminal(state):
                newvalues[state] = 0
                continue
            possibleActions = mdp.getPossibleActions(state)
            if not possibleActions:
                newvalues[state] = 0
                continue
            maxActionValue = float('-inf')
            for action in possibleActions:
                actionsum = self.getQValue(state, action)
                # find the maximum action value
                if maxActionValue < actionsum:
                    maxActionValue = actionsum
            # maxActionValue is now V_{k+1} after this iteration
            newvalues[state] = maxActionValue
        self.values = newvalues
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    # state = coordinate on the grid
    for iteration in range(iterations):
        newValues = util.Counter()
        oldValues = self.values.copy()
        allStates = mdp.getStates()
        for state in allStates:
            if not mdp.isTerminal(state):
                rewardsForActions = util.Counter()
                possibleActions = mdp.getPossibleActions(state)
                for possibleAction in possibleActions:
                    for newState, prob in mdp.getTransitionStatesAndProbs(state, possibleAction):
                        rewardForTransition = mdp.getReward(state, possibleAction, newState)
                        rewardsForActions[possibleAction] += prob * (
                            rewardForTransition + discount * oldValues[newState])
                maxAction = rewardsForActions.argMax()
                newValues[state] = rewardsForActions[maxAction]
        self.values = newValues
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    counter = 0
    for state in self.mdp.getStates():
        self.values[state] = 0
    while counter < iterations:
        newValues = util.Counter()
        for state in self.mdp.getStates():
            if not mdp.isTerminal(state):
                actions = self.mdp.getPossibleActions(state)
                maxValue = float("-inf")
                for action in actions:
                    value = 0
                    for nextState, prob in self.mdp.getTransitionStatesAndProbs(state, action):
                        # a terminal successor's value is 0 by default, so
                        # no special case is needed for it
                        discounted = discount * self.getValue(nextState)
                        value += (self.mdp.getReward(state, action, nextState)
                                  + discounted) * prob
                    if value > maxValue:
                        maxValue = value
                newValues[state] = maxValue
        for state in newValues:
            self.values[state] = newValues[state]
        counter += 1
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the resulting
    policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    k = 0
    while k < iterations:
        val = self.values.copy()  # before each iteration, copy the values
        for s in mdp.getStates():
            if not mdp.isTerminal(s):
                best = float("-inf")  # avoid shadowing the built-in max()
                for action in mdp.getPossibleActions(s):
                    v = 0
                    for pos_pro in mdp.getTransitionStatesAndProbs(s, action):
                        v += pos_pro[1] * (mdp.getReward(s, action, pos_pro[0])
                                           + discount * self.values[pos_pro[0]])
                    if v > best:
                        best = v
                val[s] = best
            else:
                # a terminal state has no actions, so its value stays 0
                val[s] = 0
        k = k + 1
        for s in mdp.getStates():
            self.values[s] = val[s]