Example #1
0
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0
        helper_vector = util.Counter() # Copy of vectors to be used for batch updating 
        
        for i in range(self.iterations):
            for state in mdp.getStates():
                if mdp.isTerminal(state):
                    continue
                qValues = [sum([transition[1]*(mdp.getReward(state, action, transition[0])+self.discount*self.values[transition[0]])
                        for transition in mdp.getTransitionStatesAndProbs(state, action)])
                    for action in mdp.getPossibleActions(state)]
                if qValues:
                    helper_vector[state] = max(qValues)
            for state in helper_vector:
                self.values[state] = helper_vector[state]
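For reference, every constructor in this collection computes the same batch Bellman backup: V_{k+1}(s) = max_a sum_{s'} T(s,a,s') * [R(s,a,s') + discount * V_k(s')]. A minimal sketch of one synchronous sweep, assuming the project's util.Counter and the mdp methods listed in the docstrings (the helper name oneSweep is illustrative, not part of the assignment scaffold):

import util

def oneSweep(mdp, values, discount):
    # One synchronous sweep: build V_{k+1} from V_k for every non-terminal state.
    # Terminal states (and states with no legal actions) keep the default value of 0.
    newValues = util.Counter()
    for state in mdp.getStates():
        actions = mdp.getPossibleActions(state)
        if mdp.isTerminal(state) or not actions:
            continue
        newValues[state] = max(
            sum(prob * (mdp.getReward(state, action, nextState) + discount * values[nextState])
                for nextState, prob in mdp.getTransitionStatesAndProbs(state, action))
            for action in actions)
    return newValues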
 def __init__(self, mdp, discount = 0.9, iterations = 100):
   """
     Your value iteration agent should take an mdp on
     construction, run the indicated number of iterations
     and then act according to the resulting policy.
   
     Some useful mdp methods you will use:
         mdp.getStates()
         mdp.getPossibleActions(state)
         mdp.getTransitionStatesAndProbs(state, action)
         mdp.getReward(state, action, nextState)
   """
   self.mdp = mdp
   self.discount = discount
   self.iterations = iterations
   self.values = util.Counter() # A Counter is a dict with default 0
   
   "*** YOUR CODE HERE ***"
    for i in range(iterations):
      nextValues = util.Counter()
      for state in mdp.getStates():
        rewardsPossible = util.Counter()
        for action in mdp.getPossibleActions(state):
          newRewards = util.Counter()
          for nextState, prob in mdp.getTransitionStatesAndProbs(state, action):
            reward = mdp.getReward(state, action, nextState)
            newRewards[nextState] += prob * (reward + self.discount * self.values[nextState])
          rewardsPossible[action] = newRewards.totalCount()
        if rewardsPossible:
          # the best action's value becomes the new estimate for this state
          nextValues[state] = rewardsPossible[rewardsPossible.argMax()]
      # batch update: adopt the new values only after sweeping every state
      self.values = nextValues
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        iteration = 1
        while iteration <= self.iterations:

            updated_values = self.values.copy()

            for possibleNextState in mdp.getStates():

                if mdp.isTerminal(possibleNextState):
                    for possibleAction in mdp.getPossibleActions(
                            possibleNextState):
                        possibleValue = 0
                        for possibleTransition in mdp.getTransitionStatesAndProbs(
                                possibleNextState, possibleAction):
                            #Following Bellman's equation
                            possibleValue += possibleTransition[1] * (
                                mdp.getReward(possibleNextState,
                                              possibleAction,
                                              possibleTransition[0]) +
                                discount * self.values[possibleTransition[0]])
                        updated_values[possibleNextState] = possibleValue

                else:

                    maxStateValue = float("-inf")
                    for possibleAction in mdp.getPossibleActions(
                            possibleNextState):
                        possibleValue = 0
                        for possibleTransition in mdp.getTransitionStatesAndProbs(
                                possibleNextState, possibleAction):
                            #Following Bellman's equation
                            possibleValue += possibleTransition[1] * (
                                mdp.getReward(possibleNextState,
                                              possibleAction,
                                              possibleTransition[0]) +
                                discount * self.values[possibleTransition[0]])
                        if possibleValue > maxStateValue:
                            maxStateValue = possibleValue
                    updated_values[possibleNextState] = maxStateValue

            self.values = updated_values
            iteration += 1
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        self.depth = 1
        self.qTable = {}
        self.vTable = {}
        for state in mdp.getStates():
            self.vTable[state] = 0
            self.qTable[state] = {}
            for action in mdp.getPossibleActions(state):
                
                self.qTable[state][action] = 0
        
        while self.depth < self.iterations + 1:
            self.tempTable = {}
            for state in mdp.getStates():
                self.stateValue = 0
                if not mdp.isTerminal(state):
                    self.stateValue = -9999
                    for action in mdp.getPossibleActions(state):
                        self.Qtotal = 0
                        for nextState,prob in mdp.getTransitionStatesAndProbs(state,action):
                            self.reward = mdp.getReward(state, action, nextState)
                            self.Qtotal += prob * (self.reward + self.discount * self.vTable[nextState])
                            #print "###state:",state,"Next",nextState,"reward:",self.reward,"Qtotal",self.Qtotal,"Value:",self.vTable[nextState]
                        self.qTable[state][action] = self.Qtotal
                        #print self.qTable[state][action]
                        self.stateValue = max(self.stateValue,self.qTable[state][action])
                self.tempTable[state] = self.stateValue
            self.vTable = self.tempTable
            self.depth += 1
            
        for state in mdp.getStates():
            self.stateValue = -9999
            for action in mdp.getPossibleActions(state):
                self.Qtotal = 0
                for nextState,prob in mdp.getTransitionStatesAndProbs(state,action):
                    self.reward = mdp.getReward(state, action, nextState)
                    self.Qtotal += prob * (self.reward + self.discount * self.vTable[nextState])
                self.qTable[state][action] = self.Qtotal
Example #5
0
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        oldCounter = util.Counter()
        states = mdp.getStates()
        while self.iterations > 0:
            for state in states:
                if self.mdp.isTerminal(state):
                    self.values[state] = 0
                else:
                    actions = mdp.getPossibleActions(state)
                    # if len(actions) > 0:
                    # initialize maxAct by calculating the first action from the action list of the given state
                    firstAct = actions[0]
                    listOfNextStateAndProbPairs = mdp.getTransitionStatesAndProbs(
                        state, firstAct)
                    value = 0
                    for pair in listOfNextStateAndProbPairs:
                        (nextState, prob) = pair
                        reward = mdp.getReward(state, firstAct, nextState)
                        value = value + prob * (reward + self.discount *
                                                (oldCounter[nextState]))
                    # compare and choose the best action value
                    maxAct = value
                    for action in actions:
                        listOfNextStateAndProbPairs = mdp.getTransitionStatesAndProbs(
                            state, action)
                        value = 0
                        for pair in listOfNextStateAndProbPairs:
                            (nextState, prob) = pair
                            reward = mdp.getReward(state, action, nextState)
                            value = value + prob * (reward + self.discount *
                                                    (oldCounter[nextState]))
                        maxAct = max(maxAct, value)
                    # update the state value for the given state
                    self.values[state] = maxAct
            oldCounter = self.values.copy()
            self.iterations = self.iterations - 1
Example #6
0
 def runValueIteration(self):
     "*** YOUR CODE HERE ***"
     mdp = self.mdp
     values = self.values
     discount = self.discount
     predecessors = {}
     for state in mdp.getStates():
         preList = []
         for preState in mdp.getStates():
             for action in mdp.getPossibleActions(preState):
                 if state in [
                         pair[0]
                         for pair in mdp.getTransitionStatesAndProbs(
                             preState, action) if pair[1] > 0
                 ]:
                     preList.append(preState)
                     break
         predecessors[state] = preList
     queue = util.PriorityQueue()
     for s in mdp.getStates():
         if not mdp.isTerminal(s):
             actions = mdp.getPossibleActions(s)
             realValue = max(
                 sum(prob * (mdp.getReward(s, action, nextState) +
                             (discount * values[nextState]))
                     for (nextState, prob
                          ) in mdp.getTransitionStatesAndProbs(s, action))
                 for action in actions)
             diff = abs(realValue - values[s])
             queue.push(s, 0 - diff)
     for _ in range(self.iterations):
         if queue.isEmpty():
             return
         s = queue.pop()
         if not mdp.isTerminal(s):
             actions = mdp.getPossibleActions(s)
             values[s] = max(
                 sum(prob * (mdp.getReward(s, action, nextState) +
                             (discount * values[nextState]))
                     for (nextState, prob
                          ) in mdp.getTransitionStatesAndProbs(s, action))
                 for action in actions)
         for p in predecessors[s]:
             actions = mdp.getPossibleActions(p)
             realValue = max(
                 sum(prob * (mdp.getReward(p, action, nextState) +
                             (discount * values[nextState]))
                     for (nextState, prob
                          ) in mdp.getTransitionStatesAndProbs(p, action))
                 for action in actions)
             diff = abs(realValue - values[p])
             if diff > self.theta:
                 queue.update(p, 0 - diff)
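A note on the priority convention used here: the course's util.PriorityQueue pops its lowest-priority entry first, which is why this prioritized-sweeping solution pushes 0 - diff, so that the state with the largest Bellman error is processed first. A tiny illustrative check (the state names are made up; util is assumed to be the project's utility module):

import util

queue = util.PriorityQueue()
queue.push('A', -0.5)   # Bellman error 0.5
queue.push('B', -2.0)   # Bellman error 2.0
print(queue.pop())      # 'B' -- the largest-error state comes out first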
Example #7
0
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        oldCounter = util.Counter()
        states = mdp.getStates()
        while self.iterations>0:
          for state in states:
            if self.mdp.isTerminal(state):
              self.values[state] = 0
            else:
              actions = mdp.getPossibleActions(state)
              # if len(actions) > 0:
                # initialize maxAct by calculating the first action from the action list of the given state
              firstAct = actions[0]
              listOfNextStateAndProbPairs = mdp.getTransitionStatesAndProbs(state, firstAct)
              value = 0
              for pair in listOfNextStateAndProbPairs:
                (nextState, prob) = pair
                reward = mdp.getReward(state, firstAct, nextState)
                value = value + prob * (reward + self.discount * (oldCounter[nextState]))
              # compare and choose the best action value
              maxAct = value
              for action in actions:
                listOfNextStateAndProbPairs = mdp.getTransitionStatesAndProbs(state, action)
                value = 0
                for pair in listOfNextStateAndProbPairs:
                  (nextState, prob) = pair
                  reward = mdp.getReward(state, action, nextState)
                  value = value + prob * (reward + self.discount * (oldCounter[nextState]))
                maxAct = max(maxAct, value)
              # update the state value for the given state
              self.values[state] = maxAct
          oldCounter = self.values.copy()
          self.iterations = self.iterations - 1
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
      Your value iteration agent should take an mdp on
      construction, run the indicated number of iterations
      and then act according to the resulting policy.
    
      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
    """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        "*** YOUR CODE HERE ***"
        #define a dictionary to store values in iterations for each state
        self.valueRecord = {}
        for state in mdp.getStates():
            self.valueRecord[state] = []

        for i in range(0, self.iterations + 1):
            for state in mdp.getStates():
                if i == 0 or mdp.isTerminal(state):
                    self.valueRecord[state].append(0)
                    continue
                actions = mdp.getPossibleActions(state)
                #store the current reward as value and return
                if 'exit' in actions:
                    self.valueRecord[state].append(
                        mdp.getReward(
                            state, 'exit',
                            mdp.getTransitionStatesAndProbs(state,
                                                            'exit')[0][0]))
                    continue
                maxValue = float("-inf")
                #find the action that maximizes the value
                for action in actions:
                    summax = 0
                    for (s,
                         p) in mdp.getTransitionStatesAndProbs(state, action):
                        summax += p * (
                            mdp.getReward(state, action, s) +
                            self.discount * self.valueRecord[s][i - 1])
                    if summax > maxValue:
                        maxValue = summax
                self.valueRecord[state].append(maxValue)
        #store the final value we get from iteration into values
        for k, v in self.valueRecord.items():
            self.values[k] = v[iterations]
Example #9
0
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0
        self.actions = util.Counter()

        tempValues = util.Counter()

        # Write value iteration code here
        for k in range(0, iterations):
            for state in mdp.getStates():
                maxAction = float("-inf")
                for action in mdp.getPossibleActions(state):
                    total = 0
                    for nextState, prob in mdp.getTransitionStatesAndProbs(
                            state, action):
                        total += prob * (
                            self.values[nextState] * discount +
                            mdp.getReward(state, action, nextState))
                    maxAction = max(maxAction, total)
                tempValues[state] = maxAction
            for state in mdp.getStates():
                if tempValues[state] > float("-inf"):
                    self.values[state] = tempValues[state]

        for state in mdp.getStates():
            maxAction = None
            maxActionValue = float("-inf")
            for action in mdp.getPossibleActions(state):
                total = 0
                for nextState, prob in mdp.getTransitionStatesAndProbs(
                        state, action):
                    total += prob * (self.values[nextState] * discount +
                                     mdp.getReward(state, action, nextState))
                if total > maxActionValue:
                    maxActionValue = total
                    maxAction = action
            self.actions[state] = maxAction
Example #10
0
    def __init__(self, mdp, discount=0.9, iterations=200):
        """
      Your value iteration agent should take an mdp on
      construction, run the indicated number of iterations
      and then act according to the resulting policy.
    
      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
    """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0
        self.q_values = {}
        self.best_action = {}

        # calculate utilities values
        for i in range(self.iterations):
            next_values = util.Counter()
            for s in mdp.getStates():
                updated = False
                for a in mdp.getPossibleActions(s):
                    action_value = 0.0

                    for t in mdp.getTransitionStatesAndProbs(s, a):
                        r = mdp.getReward(s, a, t[0])
                        action_value += t[1] * (r +
                                                discount * self.values[t[0]])

                    if not updated or action_value > next_values[s]:
                        next_values[s] = action_value
                        updated = True
            self.values = next_values

        # with the given utilities, calculate q-values
        for s in mdp.getStates():
            self.best_action[s] = None
            max_action_value = -10000000
            for a in mdp.getPossibleActions(s):
                action_value = 0.0
                for t in mdp.getTransitionStatesAndProbs(s, a):
                    r = mdp.getReward(s, a, t[0])
                    action_value += t[1] * (r + discount * self.values[t[0]])
                self.q_values[(s, a)] = action_value
                if action_value > max_action_value:
                    max_action_value = action_value
                    self.best_action[s] = a
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.
          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0
        self.tempvalues = util.Counter()

        # Write value iteration code here
        for i in range(0, self.iterations):
            newVal = self.values.copy()

            for state in mdp.getStates():
                if not mdp.isTerminal(state):
                    #worst V value possible
                    maxV = float("-inf")
                    for action in mdp.getPossibleActions(state):
                        v = 0
                        #transitions = [newState, probability]
                        for transition in mdp.getTransitionStatesAndProbs(
                                state, action):
                            v = v + transition[1] * (
                                mdp.getReward(state, action, transition[0]) +
                                discount * self.values[transition[0]])

                        if v > maxV:
                            maxV = v
                    newVal[state] = maxV
                else:
                    #state is terminal
                    for action in mdp.getPossibleActions(state):
                        v = 0
                        for transition in mdp.getTransitionStatesAndProbs(
                                state, action):
                            v = v + transition[1] * (
                                mdp.getReward(state, action, transition[0]) +
                                discount * self.values[transition[0]])
                        newVal[state] = v
            #update whole V values
            self.values = newVal
Example #12
0
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        #       print mdp.getStates() # ['TERMINAL_STATE', (0, 0), (0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1), (2, 2), (3, 0), (3, 1), (3, 2)]
        #       s = mdp.getStates()[1]
        #       print mdp.getPossibleActions(s) # ('north', 'west', 'south', 'east')
        #       a = mdp.getPossibleActions(s)[0]
        #       print mdp.getTransitionStatesAndProbs(s, a) # [((0, 1), 0.8), ((1, 0), 0.1), ((0, 0), 0.1)]
        #       ns = mdp.getTransitionStatesAndProbs(s, a)[0][0]
        #       print mdp.getReward(s, a, ns)  # 0.0
        #       raise Exception

        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        # value iteration
        for i in xrange(iterations):
            newvalues = self.values.copy()
            for state in mdp.getStates():
                if state == 'TERMINAL_STATE': continue
                candidates = []
                for action in mdp.getPossibleActions(state):  # normal state
                    # non-exit actions: the reward term is dropped here, which assumes the
                    # gridworld living reward is 0, so sum_s' T*V(s') alone is the backup
                    if len(mdp.getTransitionStatesAndProbs(state, action)) > 1:
                        candidates.append( sum([probability*self.getValue(nextstate) \
                                for nextstate, probability in mdp.getTransitionStatesAndProbs(state, action)]) )

                if len(candidates) == 0:  # TERMINAL_STATE
                    for action in mdp.getPossibleActions(state):  # terminal
                        candidates.append( sum([mdp.getReward(state, action, nextstate) \
                            for nextstate, _ in mdp.getTransitionStatesAndProbs(state, action) \
                            if nextstate=='TERMINAL_STATE']) )
                    newvalues[state] = max(candidates)
                else:
                    newvalues[state] = discount * max(candidates)
            self.values = newvalues
Example #13
0
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
      Your value iteration agent should take an mdp on
      construction, run the indicated number of iterations
      and then act according to the resulting policy.
    
      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
    """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations

        self.utilities = {}
        self.qvalues = {}
        states = mdp.getStates()
        for state in states:
            self.utilities[state] = 0
            self.qvalues[state] = util.Counter()

        for i in range(iterations):
            newUtilities = {}
            for state in states:
                if self.mdp.isTerminal(state):
                    continue
                childQs = []
                for action in mdp.getPossibleActions(state):
                    q_value = 0
                    for transition in mdp.getTransitionStatesAndProbs(
                            state, action):
                        q_value += transition[1]*(mdp.getReward(state,action,transition[0]) + \
                            discount*self.utilities[transition[0]])
                    childQs.append(q_value)
                newUtilities[state] = max(childQs)
            self.utilities.update(newUtilities)
        """ q-values are a dictionary from states to dictionaries of action => qvalue mappings"""

        for state in states:
            for action in mdp.getPossibleActions(state):
                q_value = 0
                for transition in mdp.getTransitionStatesAndProbs(
                        state, action):
                    q_value += transition[1]*(mdp.getReward(state,action,transition[0]) + \
                        discount*self.utilities[transition[0]])
                self.qvalues[state][action] = q_value
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0
        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        for i in range(0, iterations):
            b = self.values.copy()  # V_{k-1}, read while building V_k
            for s in mdp.getStates():
                if s == 'TERMINAL_STATE':
                    self.values[s] = 0
                else:
                    qlist = []
                    for a in mdp.getPossibleActions(s):
                        spsum = 0
                        for sp in mdp.getTransitionStatesAndProbs(s, a):
                            spsum = spsum + sp[1] * (mdp.getReward(s, a, sp[0]) + self.discount * b[sp[0]])
                        qlist.append(spsum)
                    self.values[s] = max(qlist)
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0
        while self.iterations > 0:
            prev_values = self.values.copy()
            for state in mdp.getStates():
                actions = mdp.getPossibleActions(state)
                if not actions:
                    continue
                self.values[state] = max([sum([prob*(mdp.getReward(state, act, state1) + discount*prev_values[state1])
                                               for state1, prob in mdp.getTransitionStatesAndProbs(state, act)])
                                          for act in actions])
            self.iterations -= 1
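The solution above, like most in this collection, simply counts down a fixed number of sweeps. A related sketch, not taken from any submission: the same batch backup can instead stop once the largest per-state change drops below a tolerance (epsilon is an illustrative parameter, and util is assumed to be the project's utility module):

import util

def runUntilConverged(mdp, discount, epsilon=1e-6):
    # Batch value iteration that sweeps until the largest per-state change is below epsilon.
    values = util.Counter()
    while True:
        prev = values.copy()
        for state in mdp.getStates():
            actions = mdp.getPossibleActions(state)
            if not actions:
                continue
            values[state] = max(
                sum(prob * (mdp.getReward(state, action, nextState) + discount * prev[nextState])
                    for nextState, prob in mdp.getTransitionStatesAndProbs(state, action))
                for action in actions)
        if max(abs(values[s] - prev[s]) for s in mdp.getStates()) < epsilon:
            return values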
 def __init__(self, mdp, discount = 0.9, iterations = 100):
   """
     Your value iteration agent should take an mdp on
     construction, run the indicated number of iterations
     and then act according to the resulting policy.
   
     Some useful mdp methods you will use:
         mdp.getStates()
         mdp.getPossibleActions(state)
         mdp.getTransitionStatesAndProbs(state, action)
         mdp.getReward(state, action, nextState)
   """
   self.mdp = mdp
   self.discount = discount
   self.iterations = iterations
   self.values = util.Counter() # A Counter is a dict with default 0
   self.policy = util.Counter()
   oldValues = util.Counter()
   states = mdp.getStates()
   for x in xrange(0,iterations):
       for state in states:
           possibleActions = mdp.getPossibleActions(state)
           qValues = util.Counter()
           for action in possibleActions:
               qValue = 0;
               for nextState, prob in mdp.getTransitionStatesAndProbs(state,action):
                   qValue += prob*(mdp.getReward(state, action, nextState)+discount*oldValues[nextState])
               qValues[action] = qValue
           bestAction = qValues.argMax()
           self.values[state] = qValues[bestAction]
       for value in self.values:
           oldValues[value] = self.values[value]
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        for _ in range(iterations):
            updateBatch = self.values.copy()
            for state in mdp.getStates():
                self.values[state] = max([
                    sum([prob*(mdp.getReward(state, action, transitionState) + discount*updateBatch[transitionState])
                        for transitionState, prob 
                        in mdp.getTransitionStatesAndProbs(state, action)
                        if prob != 0])
                    for action in mdp.getPossibleActions(state)] or [0])
Example #18
0
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        for k in range(self.iterations):
            self.values_old = self.values.copy()
            for s in mdp.getStates():
                if not self.mdp.isTerminal(s):
                    self.values[s] = max(
                        [
                            sum(
                                [
                                    T * (mdp.getReward(s, a, sp) + self.discount * self.values_old[sp])
                                    for (sp, T) in mdp.getTransitionStatesAndProbs(s, a)
                                ]
                            )
                            for a in mdp.getPossibleActions(s)
                        ]
                    )
    def computeQValueFromValues(self, state, action):
        """
          Compute the Q-value of action in state from the
          value function stored in self.values.
        """
        "*** YOUR CODE HERE ***"
        # get the discount
        discount = self.discount
        # get the values
        values = self.values
        # get the mdp
        mdp = self.mdp
        # set initial q value
        qv = 0
        # 
        tStatesAndProbs = mdp.getTransitionStatesAndProbs(state, action)
        # keep track of pairs seen so far
        j = 0 
        while j < len(tStatesAndProbs):
          # extract tState and Prob from this member of the list
          tState = tStatesAndProbs[j][0]
          prob = tStatesAndProbs[j][1]
          # calculate the qv the same way we calculated v above
          qv = qv + ((discount * values[tState]) + mdp.getReward(state, action, tState)) * prob
          # increment
          j = j + 1

        return qv
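A small hypothetical sanity check for this Q-value routine: once value iteration has fully converged, the stored value of a state should equal the best Q-value over its legal actions. The helper name and tolerance below are illustrative only; it relies only on attributes these agents already define (mdp, values, computeQValueFromValues):

def valueMatchesBestQ(agent, state, tolerance=1e-9):
    # After convergence, V(s) should equal the max over legal actions of Q(s, a).
    actions = agent.mdp.getPossibleActions(state)
    if not actions:
        return True
    bestQ = max(agent.computeQValueFromValues(state, action) for action in actions)
    return abs(agent.values[state] - bestQ) < tolerance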
 def runValueIteration(self):
     "*** YOUR CODE HERE ***"
     mdp = self.mdp
     predecessors = {state: set() for state in mdp.getStates()}
     for state in mdp.getStates():
         for action in mdp.getPossibleActions(state):
             for nextState, prob in mdp.getTransitionStatesAndProbs(
                     state, action):
                 if prob > 0:
                     predecessors[nextState].add(state)
     queue = util.PriorityQueue()
     for state in mdp.getStates():
         if not mdp.isTerminal(state):
             diff = abs(self.values[state] - max([
                 self.getQValue(state, action)
                 for action in mdp.getPossibleActions(state)
             ]))
             queue.update(state, -diff)
     for i in range(self.iterations):
         if queue.isEmpty():
             break
         state = queue.pop()
         if not mdp.isTerminal(state):
             self.values[state] = max([
                 self.getQValue(state, action)
                 for action in mdp.getPossibleActions(state)
             ])
         for pred in predecessors[state]:
             diff = abs(self.values[pred] - max([
                 self.getQValue(pred, action)
                 for action in mdp.getPossibleActions(pred)
             ]))
             if diff > self.theta:
                 queue.update(pred, -diff)
Example #21
0
 def __init__(self, mdp, discount=0.9, iterations=100):
     """
     Your value iteration agent should take an mdp on
     construction, run the indicated number of iterations
     and then act according to the resulting policy.
     
     Some useful mdp methods you will use:
       mdp.getStates()
       mdp.getPossibleActions(state)
       mdp.getTransitionStatesAndProbs(state, action)
       mdp.getReward(state)
     """
     self.mdp = mdp
     self.discount = discount
     self.iterations = iterations
     self.values = util.Counter()  # A Counter is a dict with default values as 0
     "*** YOUR CODE HERE ***"
     states = mdp.getStates()
     for k in range(0, iterations):
         for state in states:
             actions = []
             for action in mdp.getPossibleActions(state):
                 trans_prob = mdp.getTransitionStatesAndProbs(state, action)
                 actions.append(
                     sum(self.values[tp[0], k - 1] * tp[1]
                         for tp in trans_prob))
             if actions:
                 max_prob = max(actions)
             else:
                 max_prob = 0
             self.values[state,
                         k] = mdp.getReward(state) + discount * max_prob
 def __init__(self, mdp, discount = 0.9, iterations = 100):
   """
     Your value iteration agent should take an mdp on
     construction, run the indicated number of iterations
     and then act according to the resulting policy.
   
     Some useful mdp methods you will use:
         mdp.getStates()
         mdp.getPossibleActions(state)
         mdp.getTransitionStatesAndProbs(state, action)
         mdp.getReward(state, action, nextState)
   """
   self.mdp = mdp
   self.discount = discount
   self.iterations = iterations
   self.values = util.Counter() # A Counter is a dict with default 0
    
   "*** YOUR CODE HERE ***"
   for i in range(0, iterations):
     preValues = self.values.copy()
     for state in mdp.getStates():
       curValue = util.Counter()
       for action in mdp.getPossibleActions(state):
         for (nextState, prob) in mdp.getTransitionStatesAndProbs(state, action):
           curValue[action] += prob*(mdp.getReward(state, action, nextState)+discount*preValues[nextState])
       self.values[state] = curValue[curValue.argMax()]
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        # Write value iteration code here
        for k in range(1, iterations + 1):
            values = self.values.copy()
            for state in mdp.getStates():
                actions = mdp.getPossibleActions(state)
                if len(actions) > 0:
                    self.values[state] = max([
                        sum([
                            prob * (mdp.getReward(state, action, nextState) +
                                    discount * values[nextState])
                            for nextState, prob in mdp.getTransitionStatesAndProbs(
                                state, action)
                        ]) for action in actions
                    ])
  def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
      Your value iteration agent should take an mdp on
      construction, run the indicated number of iterations
      and then act according to the resulting policy.
    
      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter() # A Counter is a dict with default 0
    self.qvalues = util.Counter()
    self.bestact = util.Counter()

    "*** YOUR CODE HERE ***"
    states = mdp.getStates()

    for i in range(iterations):
        v = util.Counter()
        for state in states:
            if mdp.isTerminal(state):
                continue
            value = {action: sum(prob * (mdp.getReward(state,action,next_state) + discount*self.values[next_state])
                    for next_state, prob in mdp.getTransitionStatesAndProbs(state, action))
                    for action in mdp.getPossibleActions(state)}
            self.bestact[state] = max(value, key=value.get)
            v[state] = value[self.bestact[state]] 
            for action in value.keys():
                self.qvalues[state,action] = value[action]
        self.values = v.copy()
Example #25
0
  def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
      Your value iteration agent should take an mdp on
      construction, run the indicated number of iterations
      and then act according to the resulting policy.
    
      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
          
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter() # A Counter is a dict with default 0

    for i in range(iterations):
        nextValues = util.Counter()
        for state in mdp.getStates(): 
            if mdp.isTerminal(state): continue
            first = True
            for action in mdp.getPossibleActions(state):
                qValue = 0
                for (nextState, prob) in mdp.getTransitionStatesAndProbs(state, action):
                    reward = mdp.getReward(state, action, nextState)
                    qValue += prob * (reward + discount*self.values[nextState])
                if first:
                    maxQValue = qValue
                    first = False
                elif qValue > maxQValue:
                    maxQValue = qValue
            nextValues[state] = maxQValue
        self.values = nextValues
Example #26
0
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"

        for i in range(iterations):
            values = self.values.copy()  # values from the previous iteration
            for state in mdp.getStates():
                actionsCounter = util.Counter()
                for action in mdp.getPossibleActions(state):
                    for nextState, prob in mdp.getTransitionStatesAndProbs(
                            state, action):
                        actionsCounter[action] += prob * (
                            mdp.getReward(state, action, nextState) +
                            discount * values[nextState])
                self.values[state] = actionsCounter[actionsCounter.argMax()]
Example #27
0
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0
        for _ in range(0, iterations):
            self.newValues = util.Counter()
            for st in mdp.getStates():
                if len(mdp.getPossibleActions(st)) != 0:
                    maxV = float("-inf")
                    for act in mdp.getPossibleActions(st):
                        newV = 0
                        for tst, prob in mdp.getTransitionStatesAndProbs(
                                st, act):
                            r = mdp.getReward(st, act, tst)
                            newV += prob * (r + discount * self.values[tst])
                        if newV > maxV: maxV = newV
                    self.newValues[st] = maxV
                else:
                    self.newValues[st] = self.values[st]
            self.values = self.newValues
 def __init__(self, mdp, discount = 0.9, iterations = 100):
   """
     Your value iteration agent should take an mdp on
     construction, run the indicated number of iterations
     and then act according to the resulting policy.
   
     Some useful mdp methods you will use:
         mdp.getStates()
         mdp.getPossibleActions(state)
         mdp.getTransitionStatesAndProbs(state, action)
         mdp.getReward(state, action, nextState)
   """
   self.mdp = mdp
   self.discount = discount
   self.iterations = iterations
   self.values = util.Counter() # A Counter is a dict with default 0
    
   "*** YOUR CODE HERE ***"
   currentIterationCounter = 1
   for state in mdp.getStates():
     self.values[state] = mdp.getReward(state, 'Stop', state)
   while currentIterationCounter < self.iterations:
     newValues = util.Counter()
     for state in mdp.getStates():
       tempValues = util.Counter()
       for action in mdp.getPossibleActions(state):
         for newStateAndProb in mdp.getTransitionStatesAndProbs(state, action):
           newState = newStateAndProb[0]
           prob = newStateAndProb[1]
           tempValues[action] += prob*(mdp.getReward(state, action, newState)+self.discount*self.values[newState])
       newValues[state] = tempValues[tempValues.argMax()]
     currentIterationCounter += 1
     for state in mdp.getStates():
       self.values[state] = newValues[state]
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() 
        
        for i in range(iterations): # running the alg on the indicated number of iterations
            y = self.values.copy() #V sub k-1
            
            for state in mdp.getStates():
                actions = util.Counter()
                
                if not mdp.isTerminal(state):
                    for possibleActions in mdp.getPossibleActions(state):

                        for transitionState, prob in mdp.getTransitionStatesAndProbs(state, possibleActions):
                                value_iteration = prob * (mdp.getReward(state, possibleActions, transitionState) + (discount* y[transitionState]))
                                actions[possibleActions] += value_iteration
                    self.values[state] = actions[actions.argMax()] 
Example #30
0
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0
        newValues = util.Counter()
        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        allStates = mdp.getStates()

        for i in range(iterations):
            for s in allStates:
                if mdp.isTerminal(s): continue
                mx = float("-inf")
                for a in mdp.getPossibleActions(s):
                    score = 0
                    for (sp, tp) in mdp.getTransitionStatesAndProbs(s, a):
                        score += tp * (mdp.getReward(s, a, sp) +
                                       self.discount * self.values[sp])
                    if score > mx:
                        mx = score
                newValues[s] = mx
            self.values = newValues.copy()
Example #31
0
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        for i in range(iterations):
            oldv = self.values.copy()
            for s in mdp.getStates():
                if (mdp.isTerminal(s)):
                    self.values[s] = 0
                    continue
                maxu = None
                for a in mdp.getPossibleActions(s):
                    eu = 0
                    for (sp, p) in mdp.getTransitionStatesAndProbs(s, a):
                        r = mdp.getReward(s, a, sp)
                        r += self.discount * oldv[sp]
                        eu += p * r
                    if (maxu is None or eu > maxu): maxu = eu
                self.values[s] = maxu
    def computeQValueFromValues(self, state, action):
        """
          Compute the Q-value of action in state from the
          value function stored in self.values.
        """
        "*** YOUR CODE HERE ***"
        mdp = self.mdp
        discount = self.discount
        values = self.values

        qValue = 0
        # getTransitionStatesAndProbs returns (nextState, prob) pairs,
        # e.g. [((0, 1), 1.0), ((0, 0), 0.0), ((0, 2), 0.0)]
        for resultState, prob in mdp.getTransitionStatesAndProbs(state, action):
            qValue += prob * (mdp.getReward(state, action, resultState) +
                              discount * values[resultState])

        return qValue
Example #33
0
 def __init__(self, mdp, discount = 0.9, iterations = 100):
   """
     Your value iteration agent should take an mdp on
     construction, run the indicated number of iterations
     and then act according to the resulting policy.
   
     Some useful mdp methods you will use:
         mdp.getStates()
         mdp.getPossibleActions(state)
         mdp.getTransitionStatesAndProbs(state, action)
         mdp.getReward(state, action, nextState)
   """
   self.mdp = mdp
   self.discount = discount
   self.iterations = iterations
   self.values = util.Counter() # A Counter is a dict with default 0
    
   "*** YOUR CODE HERE ***"
   # OUR CODE HERE
   #Note: I think we should use the util.Counter thing?
   for times in range(0, iterations):
     #values from previous iteration so we don't update over them while iterating
     prevVals = self.values.copy()
     #iterate through all states
     for state in mdp.getStates():
       #will store the action-value for the iteration
       value = util.Counter()
       for action in mdp.getPossibleActions(state):
         for transitionState, probability in mdp.getTransitionStatesAndProbs(state, action):
           #expected value, probability * reward for the state with the discount * reward
           value[action] += probability * (mdp.getReward( state, action, transitionState) + discount * prevVals[transitionState])
       #update the values to the new value from the iteration
       #the .argMax() function returns the one with the largest value
       self.values[state] = value[value.argMax()]
Example #34
0
    def __init__(self, mdp, discount=0.9, iterations=100, theta=1e-5):
        """
          Your prioritized sweeping value iteration agent should take an mdp on
          construction, run the indicated number of iterations,
          and then act according to the resulting policy.

        """

        self.theta = theta
        self.pq = util.PriorityQueue()
        self.predecessors = util.Counter()

        states = mdp.getStates()
        for s in states:
            actions = mdp.getPossibleActions(s)
            for a in actions:
                # record s as a predecessor of every state it can reach with
                # non-zero probability (not just the first transition)
                for (s_next, p) in mdp.getTransitionStatesAndProbs(s, a):
                    if self.predecessors[s_next] == 0:
                        self.predecessors[s_next] = set()
                    if p != 0:
                        self.predecessors[s_next].add(s)

        ValueIterationAgent.__init__(self, mdp, discount, iterations)
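Example #34 only builds the predecessor sets before handing off to the plain agent. A minimal sketch of the sweeping loop that would normally follow, assuming util.PriorityQueue exposes push, pop, isEmpty and update, that self.values has already been initialised (e.g. to the zero Counter created by the base class), and that computeQValueFromValues behaves as in the other examples:

        # hypothetical sweeping loop, not part of the example above
        for s in states:
            if mdp.isTerminal(s):
                continue
            bestQ = max(self.computeQValueFromValues(s, a)
                        for a in mdp.getPossibleActions(s))
            self.pq.push(s, -abs(self.values[s] - bestQ))

        for _ in range(iterations):
            if self.pq.isEmpty():
                break
            s = self.pq.pop()
            if not mdp.isTerminal(s):
                self.values[s] = max(self.computeQValueFromValues(s, a)
                                     for a in mdp.getPossibleActions(s))
            # the Counter defaults to 0 for states with no recorded predecessors
            for p in (self.predecessors[s] or []):
                bestQ = max(self.computeQValueFromValues(p, a)
                            for a in mdp.getPossibleActions(p))
                if abs(self.values[p] - bestQ) > self.theta:
                    self.pq.update(p, -abs(self.values[p] - bestQ))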
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"

        # keep track of the number of iterations we have done so far
        i = 0
        # final output value
        v = 0
        # get all the states
        states = mdp.getStates()
        # for each of the specified iterations:
        while i < iterations:
            # save the current self.values
            oldSV = self.values.copy()
            # increment our variable for number of iterations
            i = i + 1
            # for each of the states,
            for s in states:
                # get the value at this state
                v = util.Counter()
                # look at all possible actions from that state
                actions = mdp.getPossibleActions(s)
                # for each state action pair ...
                for a in actions:
                    # get the transition states and the probabilities of
                    # reaching those states
                    tStatesAndProbs = mdp.getTransitionStatesAndProbs(s, a)
                    # keep track of the number of pairs we have seen so far
                    j = 0
                    # print tStatesAndProbs
                    # for each pair in tStatesAndProbs,
                    while j < len(tStatesAndProbs):
                        # extract tState and Prob from this member of the list
                        tState = tStatesAndProbs[j][0]
                        prob = tStatesAndProbs[j][1]
                        # set the value associated with that move
                        # make sure to account for prob and discount
                        v[a] = v[a] + (mdp.getReward(s, a, tState) +
                                       discount * oldSV[tState]) * prob
                        # increment
                        j = j + 1
                # return
                self.values[s] = v[v.argMax()]
    def computeQValueFromValues(self, state, action):
        """
          Compute the Q-value of action in state from the
          value function stored in self.values.
        """
        "*** YOUR CODE HERE ***"
        # get the discount
        discount = self.discount
        # get the values
        values = self.values
        # get the mdp
        mdp = self.mdp
        # set initial q value
        qv = 0
        #
        tStatesAndProbs = mdp.getTransitionStatesAndProbs(state, action)
        # keep track of pairs seen so far
        j = 0
        while j < len(tStatesAndProbs):
            # extract tState and Prob from this member of the list
            tState = tStatesAndProbs[j][0]
            prob = tStatesAndProbs[j][1]
            # calculate the qv the same way we calculated v above
            qv = qv + ((discount * values[tState]) +
                       mdp.getReward(state, action, tState)) * prob
            # increment
            j = j + 1

        return qv
Example #37
0
 def __init__(self, mdp, discount=0.9, iterations=100):
     """
   Your value iteration agent should take an mdp on
   construction, run the indicated number of iterations
   and then act according to the resulting policy.
 
   Some useful mdp methods you will use:
       mdp.getStates()
       mdp.getPossibleActions(state)
       mdp.getTransitionStatesAndProbs(state, action)
       mdp.getReward(state, action, nextState)
 """
     self.mdp = mdp
     self.discount = discount
     self.iterations = iterations
     self.values = util.Counter()  # A Counter is a dict with default 0
     "*** YOUR CODE HERE ***"
     for i in range(iterations):
         newValues = util.Counter()
         for state in mdp.getStates():
             values = []
             for action in mdp.getPossibleActions(state):
                 qvalue = sum([
                     (discount * self.values[newState] +
                      mdp.getReward(state, action, newState)) * prob
                     for newState, prob in mdp.getTransitionStatesAndProbs(
                         state, action)
                 ])
                 values.append(qvalue)
             if len(values) > 0:
                 newValues[state] = max(values)
          for state in newValues:
              self.values[state] = newValues[state]
Example #38
0
 def __init__(self, mdp, discount = 0.9, iterations = 100):
   """
     Your value iteration agent should take an mdp on
     construction, run the indicated number of iterations
     and then act according to the resulting policy.
   
     Some useful mdp methods you will use:
         mdp.getStates()
         mdp.getPossibleActions(state)
         mdp.getTransitionStatesAndProbs(state, action)
         mdp.getReward(state, action, nextState)
   """
   self.mdp = mdp
   self.discount = discount
   self.iterations = iterations
   self.values = util.Counter() # A Counter is a dict with default 0
   
   "*** YOUR CODE HERE ***"
   for n in range(iterations):
       V = self.values.copy()
       for s in mdp.getStates():
           action_values = []
           for a in mdp.getPossibleActions(s):
               action_value = 0
               for s_, P in mdp.getTransitionStatesAndProbs(s, a):
                   action_value += P * (mdp.getReward(s, a, s_) + discount * V[s_])
               action_values.append(action_value)
           self.values[s] = max(action_values or [0])
Example #39
0
  def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
      Your value iteration agent should take an mdp on
      construction, run the indicated number of iterations
      and then act according to the resulting policy.
    
      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter() # A Counter is a dict with default 0
     
    "*** YOUR CODE HERE ***"
    # Init : Not required

    # Value iteration
    for i in range(iterations):
        old_values = self.values.copy()
        for state in mdp.getStates():
            value_state_action = []
            for action in mdp.getPossibleActions(state):
                val = 0 
                transition = mdp.getTransitionStatesAndProbs(state,action)
                for sstate,prob_s_a_ss in transition:
                    val += prob_s_a_ss*(mdp.getReward(state,action,sstate) + discount*old_values[sstate])
                value_state_action.append(val)
            if value_state_action : self.values[state] = max(value_state_action)
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        # Write value iteration code here
        for i in xrange(iterations):
            new_values = self.values.copy()

            for state in mdp.getStates():
                actions = mdp.getPossibleActions(state)
                qValues = []
                for action in actions:
                    probs = mdp.getTransitionStatesAndProbs(state, action)
                    qvs = [
                        p * (self.values[nextState] * discount +
                             mdp.getReward(state, action, nextState))
                        for nextState, p in probs
                    ]
                    qValues.append(sum(qvs))
                if len(qValues) > 0:
                    new_values[state] = max(qValues)
                else:
                    new_values[state] = 0

            self.values = new_values
 def __init__(self, mdp, discount = 0.9, iterations = 100):
   """
     Your value iteration agent should take an mdp on
     construction, run the indicated number of iterations
     and then act according to the resulting policy.
   
     Some useful mdp methods you will use:
         mdp.getStates()
         mdp.getPossibleActions(state)
         mdp.getTransitionStatesAndProbs(state, action)
         mdp.getReward(state, action, nextState)
   """
   self.mdp = mdp
   self.discount = discount
   self.iterations = iterations
   self.values = util.Counter() # A Counter is a dict with default 0
    
   "*** YOUR CODE HERE ***"
   for times in range(iterations):
       V = self.values.copy()
       for state in mdp.getStates():
           action_values = util.Counter()
           for action in mdp.getPossibleActions(state):
               for trans_state, prob in mdp.getTransitionStatesAndProbs(state, action):
                   action_values[action] += prob * (mdp.getReward( state, action, trans_state) + discount * V[trans_state])
           self.values[state] = action_values[action_values.argMax()]
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0
        while self.iterations > 0:
            prev_values = self.values.copy()
            for state in mdp.getStates():
                actions = mdp.getPossibleActions(state)
                if not actions:
                    continue
                self.values[state] = max([
                    sum([
                        prob * (mdp.getReward(state, act, state1) +
                                discount * prev_values[state1])
                        for state1, prob in mdp.getTransitionStatesAndProbs(
                            state, act)
                    ]) for act in actions
                ])
            self.iterations -= 1
Example #43
0
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        for i in range(iterations):
            previous = self.values.copy()
            for state in mdp.getStates():
                possibleActions = mdp.getPossibleActions(state)
                if len(possibleActions) == 0: continue
                results = []
                for action in possibleActions:
                    total = 0
                    for (nextState, prob) in mdp.getTransitionStatesAndProbs(state,action):
                        total += (prob * previous[nextState])
                    results.append(total)
                self.values[state] = mdp.getReward(state) + (discount * max(results))
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        temp = util.Counter()
        for i in xrange(iterations):
            temp = self.values.copy()
            for j in mdp.getStates():
                vlist = []
                actions = mdp.getPossibleActions(j)
                if not mdp.isTerminal(j):
                    for k in actions:
                        tran = mdp.getTransitionStatesAndProbs(j, k)
                        val = 0
                        for m in tran:
                            val += m[1] * (mdp.getReward(j, k, m[0]) + self.discount * temp[m[0]])
                        vlist.append(val)
                    self.values[j] = max(vlist)
Example #45
0
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        states = mdp.getStates()
        for i in range(iterations):
            lastValues = self.values.copy()
            for s in states:
                actions = mdp.getPossibleActions(s)
                if not actions:
                    continue
                values = []
                for a in actions:
                    sum = 0
                    for s2, p in mdp.getTransitionStatesAndProbs(s,a):
                        sum += p * lastValues[s2]
                    values.append(mdp.getReward(s,None,None) + self.discount*sum)
                self.values[s] = max(values)
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0
        mdpStates = mdp.getStates()

        for iteration in xrange(iterations):
          newValues = util.Counter()
          for state in mdpStates:
            if self.mdp.isTerminal(state):
              continue
            actionValues = -sys.maxint - 1
            for action in mdp.getPossibleActions(state):
              sum = 0
              for transitionState, prob in mdp.getTransitionStatesAndProbs(state, action):
                sum += prob*(mdp.getReward(state, action, transitionState) + discount * self.values[transitionState])
              if sum > actionValues:
                actionValues = sum
            newValues[state] = actionValues
          self.values = newValues
    def computeActionFromValues(self, state):
        """
          The policy is the best action in the given state
          according to the values currently stored in self.values.

          You may break ties any way you see fit.  Note that if
          there are no legal actions, which is the case at the
          terminal state, you should return None.
        """
        "*** YOUR CODE HERE ***"
        mdp = self.mdp
        actions = mdp.getPossibleActions(state)
        if len(actions) == 0:
            return None
        best_action = (float("-inf"), None)
        for a in actions:
            transitions = mdp.getTransitionStatesAndProbs(state,a)
            value_of_action = 0
            for y in transitions:
                prob = y[1]
                new_state = y[0]
                reward = mdp.getReward(state,a,new_state)
                next = self.values[new_state]
                val = prob*(reward + self.discount*next)
                value_of_action += val
                
            if value_of_action >= best_action[0]:
                best_action = (value_of_action, a)

        return best_action[1]
Example #48
0
    def computeActionFromValues(self, state):
        """
          The policy is the best action in the given state
          according to the values currently stored in self.values.

          You may break ties any way you see fit.  Note that if
          there are no legal actions, which is the case at the
          terminal state, you should return None.
        """
        "*** YOUR CODE HERE ***"
        mdp = self.mdp
        possibleActions = mdp.getPossibleActions(state)
        valuesForAction = util.Counter()
        if (mdp.isTerminal(state)):
            return None

        for action in possibleActions:
            tp = mdp.getTransitionStatesAndProbs(state, action)
            sum = 0
            for i in range(len(tp)):
                nextState, prob = tp[i]
                sum = sum + prob * (mdp.getReward(state, action, nextState) +
                                    self.discount * self.values[nextState])
            valuesForAction[action] = sum

        if (valuesForAction.totalCount() == 0):
            return possibleActions[0]
        return valuesForAction.argMax()
 def __init__(self, mdp, discount = 0.9, iterations = 100):
   """
     Your value iteration agent should take an mdp on
     construction, run the indicated number of iterations
     and then act according to the resulting policy.
   
     Some useful mdp methods you will use:
         mdp.getStates()
         mdp.getPossibleActions(state)
         mdp.getTransitionStatesAndProbs(state, action)
         mdp.getReward(state, action, nextState)
   """
   self.mdp = mdp
   self.discount = discount
   self.iterations = iterations
   
   self.utilities = {}
   self.qvalues = {}
   states = mdp.getStates()
   for state in states:
       self.utilities[state] = 0
       self.qvalues[state] = util.Counter()
       
   for i in range(iterations):
       newUtilities = {}
       for state in states:
           if self.mdp.isTerminal(state):
               continue
           childQs = []
           for action in mdp.getPossibleActions(state):
               q_value = 0
               for transition in mdp.getTransitionStatesAndProbs(state,action):
                   q_value += transition[1]*(mdp.getReward(state,action,transition[0]) + \
                       discount*self.utilities[transition[0]])
               childQs.append(q_value)
           newUtilities[state] = max(childQs)
       self.utilities.update(newUtilities)
   
   """ q-values are a dictionary from states to dictionaries of action => qvalue mappings"""
   
   for state in states:
       for action in mdp.getPossibleActions(state):
           q_value = 0
           for transition in mdp.getTransitionStatesAndProbs(state,action):
               q_value += transition[1]*(mdp.getReward(state,action,transition[0]) + \
                   discount*self.utilities[transition[0]])
           self.qvalues[state][action] = q_value
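This variant keeps state values in self.utilities and per-state Q-value Counters in self.qvalues instead of the usual self.values. A minimal sketch of accessors that could read them back, assuming the standard getValue / getQValue / getPolicy method names from the project skeleton:

 def getValue(self, state):
   # value accumulated by the iteration above
   return self.utilities[state]

 def getQValue(self, state, action):
   # Q-values were precomputed once after the final sweep
   return self.qvalues[state][action]

 def getPolicy(self, state):
   # util.Counter.argMax() returns None for an empty Counter (terminal states)
   if self.mdp.isTerminal(state):
     return None
   return self.qvalues[state].argMax()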
  def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
      Your value iteration agent should take an mdp on
      construction, run the indicated number of iterations
      and then act according to the resulting policy.
    
      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter() # A Counter is a dict with default 0
     
    "*** YOUR CODE HERE ***"
    ## get dict of transitions for all (state, action, destination):
    self.T = dict()
    self.R = dict()
    self.D = dict()
    
    for state in self.mdp.getStates():
        if self.mdp.isTerminal(state):
            continue
        actions = self.mdp.getPossibleActions(state)
        for action in actions:
            for (destination, prob ) in mdp.getTransitionStatesAndProbs(state, action):
                self.T[(state, action, destination)] = prob
                self.R[(state, action, destination)] = mdp.getReward(state, action, destination)
                if (state,action) not in self.D:
                    self.D[(state, action)] = [destination]
                else:
                    self.D[(state, action)] += [destination]
    #print "----------T:\n", self.T, '\n-------R:\n', self.R, '\n------D:\n', self.D
    for _ in range(iterations):
        #print "\n----------SELF.VALUES\n", self.values
        copyVals = util.Counter() 
        for state in self.mdp.getStates():
            if self.mdp.isTerminal(state):
                continue
            actions = self.mdp.getPossibleActions(state)

            bestVals = []
            #print state , mdp.getPossibleActions(state)
            for action in actions:
                #for dest in self.D[state,action]:
                    #print "(state,action,dest):", (state, action, dest) , "T:", self.T[(state, action, dest)], "R:",self.R[(state, action, dest)], "k_val:", self.values[dest]
                    #bestVals += [ sum( [ self.T[state, action, dest] * ( self.R[state, action, dest] + self.discount*self.values[dest] ) ] ) ]
                bestVals += [self.getQValue(state,action)]
                #bestVals += [ sum( [ T[state, action, dest] * ( R[state, action, dest] + self.discount*self.values[state] )  
                #                    for  dest in D[state,action] ] ) ]
                
            #print "state", state, "bestvals", bestVals
            copyVals[state] = max(bestVals)
            #self.values[state] = max(bestVals)
            
        self.values = copyVals.copy()
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"

        # keep track of the number of iterations we have done so far
        i = 0
        # final output value
        v = 0
        # get all the states
        states = mdp.getStates()
        # for each of the specified iterations:
        while i < iterations:
          # save the current self.values
          oldSV = self.values.copy()
          # increment our variable for number of iterations
          i = i + 1
          # for each of the states, 
          for s in states:
            # get the value at this state
            v = util.Counter()
            # look at all possible actions from that state
            actions = mdp.getPossibleActions(s)
            # for each state action pair ...
            for a in actions:
              # get the transition states and the probabilities of
              # reaching those states
              tStatesAndProbs = mdp.getTransitionStatesAndProbs(s, a)
              # keep track of the number of pairs we have seen so far
              j = 0
              # print tStatesAndProbs
              # for each pair in tStatesAndProbs, 
              while j < len(tStatesAndProbs):
                # extract tState and Prob from this member of the list
                tState = tStatesAndProbs[j][0]
                prob = tStatesAndProbs[j][1]
                # set the value associated with that move
                # make sure to account for prob and discount
                v[a] = v[a] + (mdp.getReward(s, a, tState) + discount * oldSV[tState]) * prob
                # increment
                j = j + 1
            # return 
            self.values[s] = v[v.argMax()]
Example #52
0
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0
        self.actions = util.Counter()

        tempValues = util.Counter()

        # Write value iteration code here
        for k in range(0,iterations):
          for state in mdp.getStates():
            maxAction = float("-inf")
            for action in mdp.getPossibleActions(state):
              total = 0
              for nextState, prob in mdp.getTransitionStatesAndProbs(state, action):
                total += prob * (self.values[nextState] * discount + mdp.getReward(state, action, nextState))
              maxAction = max(maxAction, total)
            tempValues[state] = maxAction
          for state in mdp.getStates():
            if tempValues[state] > float("-inf"):
              self.values[state] = tempValues[state]

        for state in mdp.getStates():
          maxAction = None
          maxActionValue = float("-inf")
          for action in mdp.getPossibleActions(state):
            total = 0
            for nextState, prob in mdp.getTransitionStatesAndProbs(state, action):
              total += prob * (self.values[nextState] * discount + mdp.getReward(state, action, nextState))
            if total > maxActionValue:
              maxActionValue = total
              maxAction = action
          self.actions[state] = maxAction
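Since Example #52 precomputes the greedy action for every state in self.actions, the policy lookup can reduce to a dictionary read; a minimal sketch of such an accessor, assuming the usual computeActionFromValues hook from the project skeleton:

    def computeActionFromValues(self, state):
        # self.actions was filled in above; terminal states map to None
        return self.actions[state]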
  def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
      Your value iteration agent should take an mdp on
      construction, run the indicated number of iterations
      and then act according to the resulting policy.
    
      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
     
    "*** YOUR CODE HERE ***"
    "value at each state"
    self.V = util.Counter()
    self.tempV = util.Counter()
    "Q for each state,action pair"
    self.Q = util.Counter()
    "policy for each state = best action to take"
    self.P = util.Counter()
    gamma = self.discount

    for iter in range(1,self.iterations+1):
      for state in mdp.getStates():
        "There is a Q for each (state,action) pair, so index this by state and keep a list of all actions"
        self.Q[state] = util.Counter()
        "Cycle through each possible action for the given state"
        for action in mdp.getPossibleActions(state):
          for neighborStateAndTransitionProb in mdp.getTransitionStatesAndProbs(state,action):
            [neighborState, T_s_a_sp] = neighborStateAndTransitionProb  
            "Compute the Q values for this state and the available actions"
            R_s_a_sp = mdp.getReward(state,action,neighborState)
            self.Q[state][action] += T_s_a_sp*(R_s_a_sp+gamma*self.V[neighborState])
            
        "As long as there were actions at this state, find the one that produces the largest Q value"
        if len(self.Q[state]) > 0:
          maxQstate = -1000000
          maxQAction = None
          for key,value in self.Q[state].items():
            if value > maxQstate:
                maxQstate = value
                maxQAction = key
            elif value == maxQstate:
                [maxQstate,maxQAction] = random.choice([[maxQstate,maxQAction],[value,key]])
          if maxQstate == -1000000:
            maxQstate = 0.0
                
          "Find the policy (or best action) that corresponds to the best Q value"
          self.P[state] = maxQAction
          "Choose the value of the state to be the max Q value that the state has"
          self.tempV[state] = self.Q[state][maxQAction]

      "After all states have been updated, store tempV in V before the next iteration"
      self.V = self.tempV.copy()
Example #54
0
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        print "-----------------------------------------------------"
        "*** MY CODE BEGINS ***"
        k = 0
        while k < iterations:
            val = self.values.copy()  #before each iteration, copy one.
            for s in mdp.getStates():
                if mdp.isTerminal(s) == False:
                    max = -999999
                    for action in mdp.getPossibleActions(s):
                        v = 0
                        for pos_pro in mdp.getTransitionStatesAndProbs(s,action):
                            v = v + pos_pro[1]*(mdp.getReward(s,action,pos_pro[0])+discount*self.values[pos_pro[0]])
                        if v > max:
                            max = v
                    val[s] = max
                else:
                    for action in mdp.getPossibleActions(s):
                        v = 0
                        for pos_pro in mdp.getTransitionStatesAndProbs(s,action):
                            v = v + pos_pro[1]*(mdp.getReward(s,action,pos_pro[0])+discount*self.values[pos_pro[0]])
                        val[s] = v
            k = k+1
            for s in mdp.getStates():
                self.values[s] = val[s]
 def __init__(self, mdp, discount = 0.9, iterations = 100):
   """
     Your value iteration agent should take an mdp on
     construction, run the indicated number of iterations
     and then act according to the resulting policy.
   
     Some useful mdp methods you will use:
         mdp.getStates()
         mdp.getPossibleActions(state)
         mdp.getTransitionStatesAndProbs(state, action)
         mdp.getReward(state, action, nextState)
   """
   self.mdp = mdp
   self.discount = discount
   self.iterations = iterations
   self.values = util.Counter() # A Counter is a dict with default 0
    
   "*** YOUR CODE HERE ***"
   
   # Cache the transition model in plain dicts keyed by (state, action, destination):
   # self.P holds transition probabilities, self.R rewards, and self.S the list of
   # reachable destinations for each (state, action) pair.
 
   self.P = dict()
   self.R = dict()
   self.S = dict()
   
   for state in self.mdp.getStates():
       if self.mdp.isTerminal(state):
           continue
       # get the possible actions from this state
       actions = self.mdp.getPossibleActions(state)
       for action in actions:
           #get the destination and prob from (state,action)
           for (destination, prob ) in mdp.getTransitionStatesAndProbs(state, action):
               #store the prob and reward
               self.P[(state, action, destination)] = prob
               self.R[(state, action, destination)] = self.mdp.getReward(state, action, destination)
               
               if (state,action) not in self.S:
                   self.S[(state, action)] = [destination]
               else:
                   self.S[(state, action)] += [destination]
   
   for _ in range(iterations):
       copyVals = util.Counter()
       for state in self.mdp.getStates():
           if self.mdp.isTerminal(state):
               continue
           actions = self.mdp.getPossibleActions(state)
                         
           bestVals = []
           #call getQValue U(s)=max a belong to A(s) Q(s,a)
           for action in actions:
               bestVals += [self.getQValue(state,action)]
           copyVals[state] = max(bestVals)
                                         
       self.values = copyVals.copy()
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
       
        
        i = 0
        
        
        while i<iterations:
            nextValues = util.Counter()
            
            for state in mdp.getStates(): 
                
                stateValues = []
                for action in mdp.getPossibleActions(state):
                    sumValue = 0
                    for item in mdp.getTransitionStatesAndProbs(state, action):
                        nextState = item[0]
                        
                        probability = item[1]
                        reward = mdp.getReward(state,action,nextState)
                        
                        #print "reward", reward
                        sumValue = sumValue + (probability * (reward + (discount * self.values[nextState])))
                        
                        #print "SUMVALUE", sumValue
               
                    stateValues.append(sumValue)
                    
                if len(mdp.getPossibleActions(state)) == 0:
                    nextValues[state] = 0
                else: 
                    nextValues[state] = max(stateValues)
                        
            i+=1
            self.values = nextValues
 def computeQValueFromValues(self, state, action):
     """
       Compute the Q-value of action in state from the
       value function stored in self.values.
     """
     "*** YOUR CODE HERE ***"
     mdp = self.mdp
     gamma = self.discount
     successors = mdp.getTransitionStatesAndProbs(state, action)
     
     return sum([successor[1] * (mdp.getReward(state, action, successor[0]) + gamma * self.getValue(successor[0])) for successor in successors])
 def getNextVal(state):
      # max_action sum_s' T(s, a, s') * (R(s, a, s') + γ * V^(k-1)(s'))
     # functional programming is beautiful
     try:
         return max(map(lambda action:
                 sum(map(lambda nStateAndProb:
                         nStateAndProb[1] * # T(s, a, s')
                         (mdp.getReward(state, action, nStateAndProb[0]) + discount * cur_buffer[nStateAndProb[0]]), # R(s, a, s') + γ * V^(k-1)(s')
                     mdp.getTransitionStatesAndProbs(state, action))),
                 mdp.getPossibleActions(state)))
     except ValueError: # terminal state
         return 0
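getNextVal closes over mdp, discount and cur_buffer from its enclosing scope; a minimal sketch, under that assumption, of the batch-update loop that would drive it:

 # hypothetical driver: cur_buffer freezes V^(k-1) for the whole sweep
 for _ in range(iterations):
     cur_buffer = self.values.copy()
     for state in mdp.getStates():
         self.values[state] = getNextVal(state)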
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0
         
        "*** YOUR CODE HERE ***"
        states = mdp.getStates()

        #Iterate through user-defined number of iterations
        for num in range(iterations):
            temp = util.Counter()

            #Compute Ut+1 for all states
            for state in states:
                
                if mdp.isTerminal(state):
                    self.values[state] = 0
                    continue
                
                actions = mdp.getPossibleActions(state)
                maxVal = float("-inf")  # MIN in the original was presumably a module-level sentinel

                #iterate through trans of each action of the state and sum up values 
                for action in actions:
                    transitions = mdp.getTransitionStatesAndProbs(state, action)
                    totalSum = 0
                    
                    for transition in transitions:
                        #transition[0] = nextState, transition[1] = probability
                        reward = mdp.getReward(state, action, transition[0])
                        #value of the nextState
                        UtValue = self.values[transition[0]]
                        #using formula of value iteration from wikipedia
                        totalSum += transition[1]*(reward + discount * UtValue)
                    maxVal = max(maxVal, totalSum)
                    
                     # write into temp so every state in this sweep is backed up from
                     # the previous iteration's values (batch update), then copy below
                     temp[state] = maxVal
            
            for state in states:
                self.values[state] = temp[state]
Example #60
0
  def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
      Your value iteration agent should take an mdp on
      construction, run the indicated number of iterations
      and then act according to the resulting policy.
    
      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter() # A Counter is a dict with default 0
    while self.iterations > 0:
        batchValues = util.Counter()
        for state in mdp.getStates():
            maxM = -10000

            if mdp.isTerminal(state):
                continue
            for action in mdp.getPossibleActions(state):
                statesProbs = mdp.getTransitionStatesAndProbs(state, action)
                sumU = 0
                Rs = 0
                for stateProb in statesProbs:
                    # expected next-state value and expected immediate reward
                    sumU = sumU + self.values[stateProb[0]]*stateProb[1]
                    Rs = Rs + mdp.getReward(state, action, stateProb[0]) * stateProb[1]
                v = Rs + sumU * discount
                if v > maxM:
                    maxM = v
            batchValues[state] = maxM
        self.values = batchValues
        self.iterations = self.iterations - 1
    self.policy = {}
    for state in mdp.getStates():
        if mdp.isTerminal(state):
            self.policy[state] = None
            continue
        QValues = []
        for action in mdp.getPossibleActions(state):
            QValues.append(self.getQValue(state, action))
        self.policy[state] = mdp.getPossibleActions(state)[QValues.index(max(QValues))]
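Example #60 stores the greedy policy explicitly in self.policy, so the accessors reduce to dictionary reads; a minimal sketch, assuming the standard getPolicy / getAction method names:

  def getPolicy(self, state):
    # None for terminal states, otherwise the precomputed greedy action
    return self.policy[state]

  def getAction(self, state):
    # follow the stored policy directly (no exploration)
    return self.policy[state]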