def __init__(self, mdp, discount = 0.9, iterations = 100):
   """
     Your value iteration agent should take an mdp on
     construction, run the indicated number of iterations
     and then act according to the resulting policy.
   
     Some useful mdp methods you will use:
         mdp.getStates()
         mdp.getPossibleActions(state)
         mdp.getTransitionStatesAndProbs(state, action)
         mdp.getReward(state, action, nextState)
   """
   self.mdp = mdp
   self.discount = discount
   self.iterations = iterations
   self.values = util.Counter() # A Counter is a dict with default 0
    
   "*** YOUR CODE HERE ***"
   currentIterationCounter = 1
   for state in mdp.getStates():
     self.values[state] = mdp.getReward(state, 'Stop', state)
   while (currentIterationCounter != self.iterations):
     newValues = util.Counter()
     for state in mdp.getStates():
       tempValues = util.Counter()
       for action in mdp.getPossibleActions(state):
         for newStateAndProb in mdp.getTransitionStatesAndProbs(state, action):
           newState = newStateAndProb[0]
           prob = newStateAndProb[1]
           tempValues[action] += prob*(mdp.getReward(state, action, newState)+self.discount*self.values[newState])
       newValues[state] = tempValues[tempValues.argMax()]
     currentIterationCounter += 1
     for state in mdp.getStates():
       self.values[state] = newValues[state]
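The examples in this listing show only the constructor; after construction the agent is queried through accessor methods that are not included here. A minimal sketch of such accessors, assuming the util.Counter value table built above (the method names follow the usual project layout but are an assumption, not part of the example):

 def getValue(self, state):
   # V(state) as computed by value iteration
   return self.values[state]

 def computeActionFromValues(self, state):
   # greedy policy: the action with the highest Q-value, or None at a terminal state
   actions = self.mdp.getPossibleActions(state)
   if not actions:
     return None
   qValues = util.Counter()
   for action in actions:
     for nextState, prob in self.mdp.getTransitionStatesAndProbs(state, action):
       qValues[action] += prob * (self.mdp.getReward(state, action, nextState)
                                  + self.discount * self.values[nextState])
   return qValues.argMax()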
Example #2
 def __init__(self, mdp, discount = 0.9, iterations = 100):
   """
     Your value iteration agent should take an mdp on
     construction, run the indicated number of iterations
     and then act according to the resulting policy.
   
     Some useful mdp methods you will use:
         mdp.getStates()
         mdp.getPossibleActions(state)
         mdp.getTransitionStatesAndProbs(state, action)
         mdp.getReward(state, action, nextState)
   """
   self.mdp = mdp
   self.discount = discount
   self.iterations = iterations
   self.values = util.Counter() # A Counter is a dict with default 0
    
   "*** YOUR CODE HERE ***"
   allStates = mdp.getStates() 
   vPrimes = util.Counter() #  A Counter is a dict with default 0
   
   iteration = 0
   while iteration < iterations:
       
        for s in allStates:
            if mdp.isTerminal(s):
                vPrimes[s] = mdp.getReward(s, None, s)
            else:
                # R(s) + discount * expected utility of the best action
                sreward = mdp.getReward(s, None, s)
                vPrimes[s] = sreward + discount * self.utilOfBestAction(mdp, s)
              
       for s in allStates:
           self.values[s] = vPrimes[s]
           
       iteration +=1
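Example #2 calls a helper self.utilOfBestAction(mdp, s) that is not shown. A minimal sketch of what such a helper might look like, assuming the same mdp API (this is an illustration, not the original author's code):

 def utilOfBestAction(self, mdp, s):
     # max over actions of the expected utility of the successor states,
     # i.e. sum over s' of T(s, a, s') * V(s'); reward and discount are applied by the caller
     best = float("-inf")
     for a in mdp.getPossibleActions(s):
         expected = sum(prob * self.values[nextState]
                        for nextState, prob in mdp.getTransitionStatesAndProbs(s, a))
         best = max(best, expected)
     return best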
Example #3
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        iteration = 1
        while iteration <= self.iterations:

            updated_values = self.values.copy()

            for possibleNextState in mdp.getStates():

                if mdp.isTerminal(possibleNextState):
                    for possibleAction in mdp.getPossibleActions(
                            possibleNextState):
                        possibleValue = 0
                        for possibleTransition in mdp.getTransitionStatesAndProbs(
                                possibleNextState, possibleAction):
                            #Following Bellman's equation
                            possibleValue += possibleTransition[1] * (
                                mdp.getReward(possibleNextState,
                                              possibleAction,
                                              possibleTransition[0]) +
                                discount * self.values[possibleTransition[0]])
                        updated_values[possibleNextState] = possibleValue

                else:

                    maxStateValue = float("-inf")
                    for possibleAction in mdp.getPossibleActions(
                            possibleNextState):
                        possibleValue = 0
                        for possibleTransition in mdp.getTransitionStatesAndProbs(
                                possibleNextState, possibleAction):
                            #Following Bellman's equation
                            possibleValue += possibleTransition[1] * (
                                mdp.getReward(possibleNextState,
                                              possibleAction,
                                              possibleTransition[0]) +
                                discount * self.values[possibleTransition[0]])
                        if possibleValue > maxStateValue:
                            maxStateValue = possibleValue
                    updated_values[possibleNextState] = maxStateValue

            self.values = updated_values
            iteration += 1
Example #4
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        self.depth = 1
        self.qTable = {}
        self.vTable = {}
        for state in mdp.getStates():
            self.vTable[state] = 0
            self.qTable[state] = {}
            for action in mdp.getPossibleActions(state):
                
                self.qTable[state][action] = 0
        
        while self.depth < self.iterations + 1:
            self.tempTable = {}
            for state in mdp.getStates():
                self.stateValue = 0
                if not mdp.isTerminal(state):
                    self.stateValue = float("-inf")
                    for action in mdp.getPossibleActions(state):
                        self.Qtotal = 0
                        for nextState,prob in mdp.getTransitionStatesAndProbs(state,action):
                            self.reward = mdp.getReward(state, action, nextState)
                            self.Qtotal += prob * (self.reward + self.discount * self.vTable[nextState])
                            #print "###state:",state,"Next",nextState,"reward:",self.reward,"Qtotal",self.Qtotal,"Value:",self.vTable[nextState]
                        self.qTable[state][action] = self.Qtotal
                        #print self.qTable[state][action]
                        self.stateValue = max(self.stateValue,self.qTable[state][action])
                else:
                    self.tempTable[state] = 0
                self.tempTable[state] = self.stateValue
            self.vTable = self.tempTable
            self.depth += 1
            
        for state in mdp.getStates():
            self.stateValue = float("-inf")
            for action in mdp.getPossibleActions(state):
                self.Qtotal = 0
                for nextState,prob in mdp.getTransitionStatesAndProbs(state,action):
                    self.reward = mdp.getReward(state, action, nextState)
                    self.Qtotal += prob * (self.reward + self.discount * self.vTable[nextState])
                self.qTable[state][action] = self.Qtotal
Example #5
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0
        temp = util.Counter()   #to keep track of values while iterating
        #use this for argmax
        states = mdp.getStates()
        ValueIterationAgent.policy = dict.fromkeys(states, '')
        actionUtilities = {}
        #initialize utility fn
        for state in states:
            if mdp.isTerminal(state):
                self.values[state] = mdp.getReward(state)
            temp[state] = mdp.getReward(state)
        looper = 0
        #getTransitionStatesAndProbs returns a list of (state, probability) tuples
        # Write value iteration code here
        """
            U(s) = R(s) + Y * max over actions a of [ sum over s' of T(s, a, s') * U(s') ]
            Do this for all states
            Return policy
            Handle case for no available actions
        """
        #loop until we've hit the right number of iterations
        while looper < self.iterations:
            for state in states:
                actionUtilities = {}
                possibleActions = mdp.getPossibleActions(state)
                #start new Bellman eqn - add reward for each state
                if len(possibleActions) == 0:   #terminal state
                    newUtility = 0
                elif len(possibleActions) == 1: #1 possible action: exit
                    actionUtilities[possibleActions[0]] = self.computeQValueFromValues(state, possibleActions[0])
                    ValueIterationAgent.policy[state] = possibleActions[0]
                    newUtility = actionUtilities[possibleActions[0]]
                else:
                    for action in possibleActions:  #multiple possible actions; try them all
                        actionUtilities[action] = self.computeQValueFromValues(state, action)   #get the utility for each action at the given state
                    ValueIterationAgent.policy[state] = max(actionUtilities, key=actionUtilities.get)   #update the policy for this state
                    newUtility = actionUtilities[ValueIterationAgent.policy[state]]
                temp[state] = newUtility
            self.values = temp.copy()
            looper = looper + 1
        "*** YOUR CODE HERE ***"
Example #6
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        oldCounter = util.Counter()
        states = mdp.getStates()
        while self.iterations > 0:
            for state in states:
                if self.mdp.isTerminal(state):
                    self.values[state] = 0
                else:
                    actions = mdp.getPossibleActions(state)
                    # if len(actions) > 0:
                    # initialize maxAct by calculating the first action from the action list of the given state
                    firstAct = actions[0]
                    listOfNextStateAndProbPairs = mdp.getTransitionStatesAndProbs(
                        state, firstAct)
                    value = 0
                    for pair in listOfNextStateAndProbPairs:
                        (nextState, prob) = pair
                        reward = mdp.getReward(state, firstAct, nextState)
                        value = value + prob * (reward + self.discount *
                                                (oldCounter[nextState]))
                    # compare and choose the best action value
                    maxAct = value
                    for action in actions:
                        listOfNextStateAndProbPairs = mdp.getTransitionStatesAndProbs(
                            state, action)
                        value = 0
                        for pair in listOfNextStateAndProbPairs:
                            (nextState, prob) = pair
                            reward = mdp.getReward(state, action, nextState)
                            value = value + prob * (reward + self.discount *
                                                    (oldCounter[nextState]))
                        maxAct = max(maxAct, value)
                    # update the state value for the given state
                    self.values[state] = maxAct
            oldCounter = self.values.copy()
            self.iterations = self.iterations - 1
Example #7
 def runValueIteration(self):
     "*** YOUR CODE HERE ***"
     mdp = self.mdp
     values = self.values
     discount = self.discount
     predecessors = {}
     for state in mdp.getStates():
         preList = []
         for preState in mdp.getStates():
             for action in mdp.getPossibleActions(preState):
                 if state in [
                         pair[0]
                         for pair in mdp.getTransitionStatesAndProbs(
                             preState, action) if pair[1] > 0
                 ]:
                     preList.append(preState)
                     break
         predecessors[state] = preList
     queue = util.PriorityQueue()
     for s in mdp.getStates():
         if not mdp.isTerminal(s):
             actions = mdp.getPossibleActions(s)
             realValue = max(
                 sum(prob * (mdp.getReward(s, action, nextState) +
                             (discount * values[nextState]))
                     for (nextState, prob
                          ) in mdp.getTransitionStatesAndProbs(s, action))
                 for action in actions)
             diff = abs(realValue - values[s])
             queue.push(s, 0 - diff)
     for _ in range(self.iterations):
         if queue.isEmpty():
             return
         s = queue.pop()
         if not mdp.isTerminal(s):
             actions = mdp.getPossibleActions(s)
             values[s] = max(
                 sum(prob * (mdp.getReward(s, action, nextState) +
                             (discount * values[nextState]))
                     for (nextState, prob
                          ) in mdp.getTransitionStatesAndProbs(s, action))
                 for action in actions)
         for p in predecessors[s]:
             actions = mdp.getPossibleActions(p)
             realValue = max(
                 sum(prob * (mdp.getReward(p, action, nextState) +
                             (discount * values[nextState]))
                     for (nextState, prob
                          ) in mdp.getTransitionStatesAndProbs(p, action))
                 for action in actions)
             diff = abs(realValue - values[p])
             if diff > self.theta:
                 queue.update(p, 0 - diff)
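Example #7 is the runValueIteration body of a prioritized-sweeping variant: it ranks states by how far their value estimate is off and always updates the most out-of-date state first. It assumes self.theta and self.iterations were set by the constructor; a minimal constructor sketch under that assumption (the theta default and the parent-class call are guesses based on the usual project layout, not taken from the example itself):

 def __init__(self, mdp, discount=0.9, iterations=100, theta=1e-5):
     # theta is the priority threshold: a predecessor is pushed back onto the
     # queue only when its value estimate differs by more than theta
     self.theta = theta
     ValueIterationAgent.__init__(self, mdp, discount, iterations)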
Example #8
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        oldCounter = util.Counter()
        states = mdp.getStates()
        while self.iterations>0:
          for state in states:
            if self.mdp.isTerminal(state):
              self.values[state] = 0
            else:
              actions = mdp.getPossibleActions(state)
              # if len(actions) > 0:
                # initialize maxAct by calculating the first action from the action list of the given state
              firstAct = actions[0]
              listOfNextStateAndProbPairs = mdp.getTransitionStatesAndProbs(state, firstAct)
              value = 0
              for pair in listOfNextStateAndProbPairs:
                (nextState, prob) = pair
                reward = mdp.getReward(state, firstAct, nextState)
                value = value + prob * (reward + self.discount * (oldCounter[nextState]))
              # compare and choose the best action value
              maxAct = value
              for action in actions:
                listOfNextStateAndProbPairs = mdp.getTransitionStatesAndProbs(state, action)
                value = 0
                for pair in listOfNextStateAndProbPairs:
                  (nextState, prob) = pair
                  reward = mdp.getReward(state, action, nextState)
                  value = value + prob * (reward + self.discount * (oldCounter[nextState]))
                maxAct = max(maxAct, value)
              # update the state value for the given state
              self.values[state] = maxAct
          oldCounter = self.values.copy()
          self.iterations = self.iterations - 1
Example #9
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
      Your value iteration agent should take an mdp on
      construction, run the indicated number of iterations
      and then act according to the resulting policy.
    
      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
    """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        "*** YOUR CODE HERE ***"
        #define a dictionary to store values in iterations for each state
        self.valueRecord = {}
        for state in mdp.getStates():
            self.valueRecord[state] = []

        for i in range(0, self.iterations + 1):
            for state in mdp.getStates():
                if i == 0 or mdp.isTerminal(state):
                    self.valueRecord[state].append(0)
                    continue
                actions = mdp.getPossibleActions(state)
                #store the current reward as value and return
                if 'exit' in actions:
                    self.valueRecord[state].append(
                        mdp.getReward(
                            state, 'exit',
                            mdp.getTransitionStatesAndProbs(state,
                                                            'exit')[0][0]))
                    continue
                bestValue = float("-inf")
                #find the action that maximizes the value
                for action in actions:
                    summax = 0
                    for (s, p) in mdp.getTransitionStatesAndProbs(state, action):
                        summax += p * (
                            mdp.getReward(state, action, s) +
                            self.discount * self.valueRecord[s][i - 1])
                    if summax > bestValue:
                        bestValue = summax
                self.valueRecord[state].append(bestValue)
        #store the final value we get from iteration into values
        for k, v in self.valueRecord.items():
            self.values[k] = v[iterations]
Example #10
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0
        self.actions = util.Counter()

        tempValues = util.Counter()

        # Write value iteration code here
        for k in range(0, iterations):
            for state in mdp.getStates():
                maxAction = float("-inf")
                for action in mdp.getPossibleActions(state):
                    total = 0
                    for nextState, prob in mdp.getTransitionStatesAndProbs(
                            state, action):
                        total += prob * (
                            self.values[nextState] * discount +
                            mdp.getReward(state, action, nextState))
                    maxAction = max(maxAction, total)
                tempValues[state] = maxAction
            for state in mdp.getStates():
                if tempValues[state] > float("-inf"):
                    self.values[state] = tempValues[state]

        for state in mdp.getStates():
            maxAction = None
            maxActionValue = float("-inf")
            for action in mdp.getPossibleActions(state):
                total = 0
                for nextState, prob in mdp.getTransitionStatesAndProbs(
                        state, action):
                    total += prob * (self.values[nextState] * discount +
                                     mdp.getReward(state, action, nextState))
                if total > maxActionValue:
                    maxActionValue = total
                    maxAction = action
            self.actions[state] = maxAction
Example #11
    def __init__(self, mdp, discount=0.9, iterations=200):
        """
      Your value iteration agent should take an mdp on
      construction, run the indicated number of iterations
      and then act according to the resulting policy.
    
      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
    """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0
        self.q_values = {}
        self.best_action = {}

        # calculate utilities values
        for i in range(self.iterations):
            next_values = util.Counter()
            for s in mdp.getStates():
                updated = False
                for a in mdp.getPossibleActions(s):
                    action_value = 0.0

                    for t in mdp.getTransitionStatesAndProbs(s, a):
                        r = mdp.getReward(s, a, t[0])
                        action_value += t[1] * (r +
                                                discount * self.values[t[0]])

                    if not updated or action_value > next_values[s]:
                        next_values[s] = action_value
                        updated = True
            self.values = next_values

        # with the given utilities, calculate q-values
        for s in mdp.getStates():
            self.best_action[s] = None
            max_action_value = float("-inf")
            for a in mdp.getPossibleActions(s):
                action_value = 0.0
                for t in mdp.getTransitionStatesAndProbs(s, a):
                    r = mdp.getReward(s, a, t[0])
                    action_value += t[1] * (r + discount * self.values[t[0]])
                self.q_values[(s, a)] = action_value
                if action_value > max_action_value:
                    max_action_value = action_value
                    self.best_action[s] = a
Example #12
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0
        iteration_values = util.Counter()

        for i in range(iterations):
            for state in mdp.getStates():

                max_action_value = float("-inf")
                max_action = None

                actions = mdp.getPossibleActions(state)
                if len(actions) == 0:
                    max_action_value = mdp.getReward(state, None, None)
                    max_action = None
                    max_state_prime = None
                else:
                    for action in actions:
                        summation = 0
                        for state_prime, prob in mdp.getTransitionStatesAndProbs(
                                state, action):
                            if mdp.isTerminal(state_prime):
                                iteration_values[state] = mdp.getReward(
                                    state, 'exit', 'TERMINAL_STATE')
                            else:
                                utility = self.values[state_prime]
                                summation += utility * prob

                        if summation > max_action_value:
                            max_action_value = summation
                            max_action = action

                iteration_values[state] = mdp.getReward(
                    state, None, None) + discount * max_action_value
            # Update at the end of each iteration
            self.values = iteration_values.copy()
Example #13
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.
          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0
        self.tempvalues = util.Counter()

        # Write value iteration code here
        for i in range(0, self.iterations):
            newVal = self.values.copy()

            for state in mdp.getStates():
                if not mdp.isTerminal(state):
                    #worst V value possible
                    maxV = float("-inf")
                    for action in mdp.getPossibleActions(state):
                        v = 0
                        #transitions = [newState, probability]
                        for transition in mdp.getTransitionStatesAndProbs(
                                state, action):
                            v = v + transition[1] * (
                                mdp.getReward(state, action, transition[0]) +
                                discount * self.values[transition[0]])

                        if v > maxV:
                            maxV = v
                    newVal[state] = maxV
                else:
                    #state is terminal
                    for action in mdp.getPossibleActions(state):
                        v = 0
                        for transition in mdp.getTransitionStatesAndProbs(
                                state, action):
                            v = v + transition[1] * (
                                mdp.getReward(state, action, transition[0]) +
                                discount * self.values[transition[0]])
                        newVal[state] = v
            #update whole V values
            self.values = newVal
Example #14
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0
        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        for i in range(0, iterations):
            b = self.values.copy()  # values from the previous sweep
            for s in mdp.getStates():
                if s == 'TERMINAL_STATE':
                    self.values[s] = 0
                else:
                    qlist = []
                    for a in mdp.getPossibleActions(s):
                        if a == 'exit':
                            # 'exit' has a single deterministic successor; its
                            # Q-value is just the exit reward.
                            nextState = mdp.getTransitionStatesAndProbs(s, a)[0][0]
                            qlist.append(mdp.getReward(s, a, nextState))
                        else:
                            spsum = 0
                            for sp in mdp.getTransitionStatesAndProbs(s, a):
                                spsum = spsum + sp[1] * (mdp.getReward(s, a, sp[0]) + self.discount * b[sp[0]])
                            qlist.append(spsum)
                    self.values[s] = max(qlist)
Example #15
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
      Your value iteration agent should take an mdp on
      construction, run the indicated number of iterations
      and then act according to the resulting policy.
    
      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
    """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations

        self.utilities = {}
        self.qvalues = {}
        states = mdp.getStates()
        for state in states:
            self.utilities[state] = 0
            self.qvalues[state] = util.Counter()

        for i in range(iterations):
            newUtilities = {}
            for state in states:
                if self.mdp.isTerminal(state):
                    continue
                childQs = []
                for action in mdp.getPossibleActions(state):
                    q_value = 0
                    for transition in mdp.getTransitionStatesAndProbs(
                            state, action):
                        q_value += transition[1]*(mdp.getReward(state,action,transition[0]) + \
                            discount*self.utilities[transition[0]])
                    childQs.append(q_value)
                newUtilities[state] = max(childQs)
            self.utilities.update(newUtilities)
        """ q-values are a dictionary from states to dictionaries of action => qvalue mappings"""

        for state in states:
            for action in mdp.getPossibleActions(state):
                q_value = 0
                for transition in mdp.getTransitionStatesAndProbs(
                        state, action):
                    q_value += transition[1]*(mdp.getReward(state,action,transition[0]) + \
                        discount*self.utilities[transition[0]])
                self.qvalues[state][action] = q_value
Example #16
 def __init__(self, mdp, discount = 0.9, iterations = 100):
   """
     Your value iteration agent should take an mdp on
     construction, run the indicated number of iterations
     and then act according to the resulting policy.
   
     Some useful mdp methods you will use:
         mdp.getStates()
         mdp.getPossibleActions(state)
         mdp.getTransitionStatesAndProbs(state, action)
         mdp.getReward(state, action, nextState)
   """
   self.mdp = mdp
   self.discount = discount
   self.iterations = iterations
   self.values = util.Counter() # A Counter is a dict with default 0
   
   "*** YOUR CODE HERE ***"
   for i in range(iterations):
     nextValues = util.Counter()
     for state in mdp.getStates():
       rewardsPossible = util.Counter()
       for action in mdp.getPossibleActions(state):
           nextPossible = size(mdp.getTransitionStatesAndProbs(state, action))[1]
           newRewards = util.Counter()
           for tmpState in range(nextPossible):
             nextState  = mdp.getTransitionStatesAndProbs(state, action)[tmpState][0]
             prob = mdp.getTransitionStatesAndProbs(state, action)[tmpState][1]
             rewards = mdp.getReward(state, action, tmpState)
             newRewards[tmpState] = prob * (rewards + self.discount * self.values[nextState])
           rewardsPossible[action] = newRewards.totalCount()
       nextValues[state] = rewardsPossible[rewardsPossible.argMax()]
       
   self.values = nextValues
Example #17
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0
        while self.iterations > 0:
            prev_values = self.values.copy()
            for state in mdp.getStates():
                actions = mdp.getPossibleActions(state)
                if not actions:
                    continue
                self.values[state] = max([sum([prob*(mdp.getReward(state, act, state1) + discount*prev_values[state1])
                                               for state1, prob in mdp.getTransitionStatesAndProbs(state, act)])
                                          for act in actions])
            self.iterations -= 1
Example #18
    def computeQValueFromValues(self, state, action):
        """
          Compute the Q-value of action in state from the
          value function stored in self.values.
        """
        "*** YOUR CODE HERE ***"
        # get the discount
        discount = self.discount
        # get the values
        values = self.values
        # get the mdp
        mdp = self.mdp
        # set initial q value
        qv = 0
        # list of (nextState, prob) pairs for this state and action
        tStatesAndProbs = mdp.getTransitionStatesAndProbs(state, action)
        # keep track of pairs seen so far
        j = 0 
        while j < len(tStatesAndProbs):
          # extract tState and Prob from this member of the list
          tState = tStatesAndProbs[j][0]
          prob = tStatesAndProbs[j][1]
          # calculate the Q-value the same way we calculated V above
          qv = qv + ((discount * values[tState]) + mdp.getReward(state, action, tState)) * prob
          # increment
          j = j + 1

        return qv
Example #19
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        for i in range(self.iterations):
            temp = self.values.copy()
            for item in mdp.getStates():
                value_list = []
                if mdp.isTerminal(item):
                    self.values[item] = 0
                    continue
                for k in mdp.getPossibleActions(item):
                    sum_value = 0
                    for x in mdp.getTransitionStatesAndProbs(item, k):
                        reward_next_state = mdp.getReward(item, k, x[0])
                        sum_value += x[1] * (reward_next_state +
                                             self.discount * temp[x[0]])
                    value_list.append(sum_value)
                self.values[item] = max(value_list)
Example #20
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        for k in range(self.iterations):
            self.values_old = self.values.copy()
            for s in mdp.getStates():
                if not self.mdp.isTerminal(s):
                    self.values[s] = max(
                        [
                            sum(
                                [
                                    T * (mdp.getReward(s, a, sp) + self.discount * self.values_old[sp])
                                    for (sp, T) in mdp.getTransitionStatesAndProbs(s, a)
                                ]
                            )
                            for a in mdp.getPossibleActions(s)
                        ]
                    )
Example #21
 def __init__(self, mdp, discount = 0.9, iterations = 100):
   """
     Your value iteration agent should take an mdp on
     construction, run the indicated number of iterations
     and then act according to the resulting policy.
   
     Some useful mdp methods you will use:
         mdp.getStates()
         mdp.getPossibleActions(state)
         mdp.getTransitionStatesAndProbs(state, action)
         mdp.getReward(state, action, nextState)
   """
   self.mdp = mdp
   self.discount = discount
   self.iterations = iterations
   self.values = util.Counter() # A Counter is a dict with default 0
    
   "*** YOUR CODE HERE ***"
   for s in mdp.getStates():
       self.values[s] = 0
   "for a in mdp.getPossibleActions(s):"
   "for ac in mdp.getTransitionStatesAndProbs(s,a):"
   " print ac[0]"
   "print ac[1]"
   "copy_value = self.values.copy()"
   "for c in mdp.getStates():"
   "   print copy_value[c]"
   i=0
   "self.states = mdp.getStates()"
   while i < iterations:
       copy_value = self.values.copy()
       for s in mdp.getStates():
           if not mdp.isTerminal(s):
               self.values[s] = mdp.getReward(s,'north',s) + discount * max([sum([copy_value[s1] * p for (s1,p) in mdp.getTransitionStatesAndProbs(s,a)]) for a in mdp.getPossibleActions(s)])
       i = i + 1
Example #22
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        for _ in range(iterations):
            updateBatch = self.values.copy()
            for state in mdp.getStates():
                self.values[state] = max([
                    sum([prob*(mdp.getReward(state, action, transitionState) + discount*updateBatch[transitionState])
                        for transitionState, prob 
                        in mdp.getTransitionStatesAndProbs(state, action)
                        if prob != 0])
                    for action in mdp.getPossibleActions(state)] or [0])
Example #23
 def __init__(self, mdp, discount = 0.9, iterations = 100):
   """
     Your value iteration agent should take an mdp on
     construction, run the indicated number of iterations
     and then act according to the resulting policy.
   
     Some useful mdp methods you will use:
         mdp.getStates()
         mdp.getPossibleActions(state)
         mdp.getTransitionStatesAndProbs(state, action)
         mdp.getReward(state, action, nextState)
   """
   self.mdp = mdp
   self.discount = discount
   self.iterations = iterations
   self.values = util.Counter() # A Counter is a dict with default 0
   
   "*** YOUR CODE HERE ***"
   for n in range(iterations):
       V = self.values.copy()
       for s in mdp.getStates():
           action_values = []
           for a in mdp.getPossibleActions(s):
               action_value = 0
               for s_, P in mdp.getTransitionStatesAndProbs(s, a):
                   action_value += P * (mdp.getReward(s, a, s_) + discount * V[s_])
               action_values.append(action_value)
           self.values[s] = max(action_values or [0])
Example #24
 def __init__(self, mdp, discount = 0.9, iterations = 100):
   """
     Your value iteration agent should take an mdp on
     construction, run the indicated number of iterations
     and then act according to the resulting policy.
   
     Some useful mdp methods you will use:
         mdp.getStates()
         mdp.getPossibleActions(state)
         mdp.getTransitionStatesAndProbs(state, action)
         mdp.getReward(state, action, nextState)
   """
   self.mdp = mdp
   self.discount = discount
   self.iterations = iterations
   self.values = util.Counter() # A Counter is a dict with default 0
    
   "*** YOUR CODE HERE ***"
   for i in range(0, iterations):
     preValues = self.values.copy()
     for state in mdp.getStates():
       curValue = util.Counter()
       for action in mdp.getPossibleActions(state):
         for (nextState, prob) in mdp.getTransitionStatesAndProbs(state, action):
           curValue[action] += prob*(mdp.getReward(state, action, nextState)+discount*preValues[nextState])
       self.values[state] = curValue[curValue.argMax()]
Example #25
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0
        # Write value iteration code here
        while self.iterations > 0:
            prevValues = self.values.copy()
            for state in self.mdp.getStates():
                qValues = {}
                for action in mdp.getPossibleActions(state):
                    qValues[action] = 0
                    for (nextState, prob) in self.mdp.getTransitionStatesAndProbs(state, action):
                        qValues[action] += prob * (mdp.getReward(state, action, nextState) + self.discount * prevValues[nextState])
                try:
                    self.values[state] = max(qValues.values())
                except ValueError:
                    # no legal actions (terminal state): value stays 0
                    self.values[state] = 0
            self.iterations -= 1
Example #26
 def __init__(self, mdp, discount=0.9, iterations=100):
     """
     Your value iteration agent should take an mdp on
     construction, run the indicated number of iterations
     and then act according to the resulting policy.
     
     Some useful mdp methods you will use:
       mdp.getStates()
       mdp.getPossibleActions(state)
       mdp.getTransitionStatesAndProbs(state, action)
       mdp.getReward(state)
     """
     self.mdp = mdp
     self.discount = discount
     self.iterations = iterations
     self.values = util.Counter()  # A Counter is a dict with default values as 0
     "*** YOUR CODE HERE ***"
     states = mdp.getStates()
     for k in range(0, iterations):
         for state in states:
             actions = []
             for action in mdp.getPossibleActions(state):
                 trans_prob = mdp.getTransitionStatesAndProbs(state, action)
                 actions.append(
                     sum(self.values[tp[0], k - 1] * tp[1]
                         for tp in trans_prob))
             if actions:
                 max_prob = max(actions)
             else:
                 max_prob = 0
             self.values[state,
                         k] = mdp.getReward(state) + discount * max_prob
Example #27
 def __init__(self, mdp, discount = 0.9, iterations = 100):
   """
     Your value iteration agent should take an mdp on
     construction, run the indicated number of iterations
     and then act according to the resulting policy.
   
     Some useful mdp methods you will use:
         mdp.getStates()
         mdp.getPossibleActions(state)
         mdp.getTransitionStatesAndProbs(state, action)
         mdp.getReward(state, action, nextState)
   """
   self.mdp = mdp
   self.discount = discount
   self.iterations = iterations
   self.values = util.Counter() # A Counter is a dict with default 0
   "*** YOUR CODE HERE ***"
   for i in range(iterations):
       newValues=util.Counter()
       for state in mdp.getStates():
           values=[]
           for action in mdp.getPossibleActions(state):
               qvalue=sum([(discount*self.values[newState]+mdp.getReward(state,action,newState))*prob for newState,prob in mdp.getTransitionStatesAndProbs(state,action)])
               values.append(qvalue)
           if len(values)>0:
               newValues[state]=max(values)
        # update the whole state space so every state is refreshed each sweep
        for state in mdp.getStates():
            self.values[state] = newValues[state]
Example #28
 def __init__(self, mdp, discount = 0.9, iterations = 100):
   """
     Your value iteration agent should take an mdp on
     construction, run the indicated number of iterations
     and then act according to the resulting policy.
   
     Some useful mdp methods you will use:
         mdp.getStates()
         mdp.getPossibleActions(state)
         mdp.getTransitionStatesAndProbs(state, action)
         mdp.getReward(state, action, nextState)
   """
   self.mdp = mdp
   self.discount = discount
   self.iterations = iterations
   self.values = util.Counter() # A Counter is a dict with default 0
   
   for i in range(iterations):
       lastValues = copy.deepcopy(self.values)
       for s in mdp.getStates():
           aCounter = util.Counter()
           for a in mdp.getPossibleActions(s):
               for s2 in mdp.getStates():
                   aCounter[a] += self.T(s,a,s2) * (mdp.getReward(s,a,s2) + discount*lastValues[s2])
           self.values[s] = aCounter[aCounter.argMax()]
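Example #28 uses copy.deepcopy (so the copy module must be imported at module level) and a helper self.T(s, a, s2) that is not shown. A minimal sketch of that helper, assuming the same mdp API (illustrative only, not the original author's code):

 def T(self, state, action, nextState):
     # transition probability T(s, a, s'), looked up from the mdp's
     # (nextState, prob) list; 0 when s' is not a possible successor
     for successor, prob in self.mdp.getTransitionStatesAndProbs(state, action):
         if successor == nextState:
             return prob
     return 0.0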
Example #29
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        # Write value iteration code here
        for k in range(1, iterations + 1):
            values = self.values.copy()  # snapshot for a batch (synchronous) update
            for state in mdp.getStates():
                actions = mdp.getPossibleActions(state)
                if len(actions) > 0:
                    self.values[state] = max([
                        sum([
                            prob * (mdp.getReward(state, action, nextState) +
                                    discount * values[nextState])
                            for nextState, prob in mdp.getTransitionStatesAndProbs(
                                state, action)
                        ]) for action in actions
                    ])
Example #30
  def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
      Your value iteration agent should take an mdp on
      construction, run the indicated number of iterations
      and then act according to the resulting policy.
    
      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
          
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter() # A Counter is a dict with default 0

    for i in range(iterations):
        nextValues = util.Counter()
        for state in mdp.getStates(): 
            if mdp.isTerminal(state): continue
            first = True
            for action in mdp.getPossibleActions(state):
                qValue = 0
                for (nextState, prob) in mdp.getTransitionStatesAndProbs(state, action):
                    reward = mdp.getReward(state, action, nextState)
                    qValue += prob * (reward + discount*self.values[nextState])
                if first:
                    maxQValue = qValue
                    first = False
                elif qValue > maxQValue:
                    maxQValue = qValue
            nextValues[state] = maxQValue
        self.values = nextValues
Example #31
 def __init__(self, mdp, discount=0.9, iterations=100):
     """
   Your value iteration agent should take an mdp on
   construction, run the indicated number of iterations
   and then act according to the resulting policy.
 
   Some useful mdp methods you will use:
       mdp.getStates()
       mdp.getPossibleActions(state)
       mdp.getTransitionStatesAndProbs(state, action)
       mdp.getReward(state, action, nextState)
 """
     self.mdp = mdp
     self.discount = discount
     self.iterations = iterations
     self.values = util.Counter()  # A Counter is a dict with default 0
     "*** YOUR CODE HERE ***"
     for i in range(iterations):
         newValues = util.Counter()
         for state in mdp.getStates():
             values = []
             for action in mdp.getPossibleActions(state):
                 qvalue = sum([
                     (discount * self.values[newState] +
                      mdp.getReward(state, action, newState)) * prob
                     for newState, prob in mdp.getTransitionStatesAndProbs(
                         state, action)
                 ])
                 values.append(qvalue)
             if len(values) > 0:
                 newValues[state] = max(values)
          # update the whole state space so every state is refreshed each sweep
          for state in mdp.getStates():
              self.values[state] = newValues[state]
Example #32
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"

        # keep track of the number of iterations we have done so far
        i = 0
        # get all the states
        states = mdp.getStates()
        # for each of the specified iterations:
        while i < iterations:
            # save the current self.values
            oldSV = self.values.copy()
            # increment our variable for number of iterations
            i = i + 1
            # for each of the states,
            for s in states:
                # Counter of Q-values for each available action from this state
                v = util.Counter()
                # look at all possible actions from that state
                actions = mdp.getPossibleActions(s)
                # for each state action pair ...
                for a in actions:
                    # get the transition states and the probablilities of
                    # reaching those states
                    tStatesAndProbs = mdp.getTransitionStatesAndProbs(s, a)
                    # keep track of the number of pairs we have seen so far
                    j = 0
                    # print tStatesAndProbs
                    # for each pair in tStatesAndProbs,
                    while j < len(tStatesAndProbs):
                        # extract tState and Prob from this member of the list
                        tState = tStatesAndProbs[j][0]
                        prob = tStatesAndProbs[j][1]
                        # set the value associated with that move
                        # make sure to account for prob and discount
                        v[a] = v[a] + (mdp.getReward(s, a, tState) +
                                       discount * oldSV[tState]) * prob
                        # increment
                        j = j + 1
                # store the value of the best action (argMax of the Q-value Counter)
                self.values[s] = v[v.argMax()]
Example #33
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        temp = util.Counter()
        for i in xrange(iterations):
            temp = self.values.copy()
            for j in mdp.getStates():
                vlist = []
                actions = mdp.getPossibleActions(j)
                if not mdp.isTerminal(j):
                    for k in actions:
                        tran = mdp.getTransitionStatesAndProbs(j, k)
                        val = 0
                        for m in tran:
                            val += m[1] * (mdp.getReward(j, k, m[0]) + self.discount * temp[m[0]])
                        vlist.append(val)
                    self.values[j] = max(vlist)
Example #34
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        for i in range(iterations):
            oldv = self.values.copy()
            for s in mdp.getStates():
                if (mdp.isTerminal(s)):
                    self.values[s] = 0
                    continue
                maxu = None
                for a in mdp.getPossibleActions(s):
                    eu = 0
                    for (sp, p) in mdp.getTransitionStatesAndProbs(s, a):
                        r = mdp.getReward(s, a, sp)
                        r += self.discount * oldv[sp]
                        eu += p * r
                    if (maxu is None or eu > maxu): maxu = eu
                self.values[s] = maxu
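
For orientation, a quick usage sketch of any of these agents (the ValueIterationAgent class name and the driver lines are assumptions about the surrounding project; getValue itself appears in later snippets):

# hypothetical driver: build the agent once, then read off the converged values
agent = ValueIterationAgent(mdp, discount=0.9, iterations=100)
for state in mdp.getStates():
    print state, agent.getValue(state)  # getValue is expected to return self.values[state]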
Example #35
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0
        for _ in range(0, iterations):
            self.newValues = util.Counter()
            for st in mdp.getStates():
                if len(mdp.getPossibleActions(st)) != 0:
                    maxV = -sys.maxint
                    for act in mdp.getPossibleActions(st):
                        newV = 0
                        for tst, prob in mdp.getTransitionStatesAndProbs(
                                st, act):
                            r = mdp.getReward(st, act, tst)
                            newV += prob * (r + discount * self.values[tst])
                        if newV > maxV: maxV = newV
                    self.newValues[st] = maxV
                else:
                    self.newValues[st] = self.values[st]
            self.values = self.newValues
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() 
        
        for i in range(iterations): # running the alg on the indicated number of iterations
            y = self.values.copy() #V sub k-1
            
            for state in mdp.getStates():
                actions = util.Counter()
                
                if not mdp.isTerminal(state):
                    for possibleActions in mdp.getPossibleActions(state):

                        for transitionState, prob in mdp.getTransitionStatesAndProbs(state, possibleActions):
                                value_iteration = prob * (mdp.getReward(state, possibleActions, transitionState) + (discount* y[transitionState]))
                                actions[possibleActions] += value_iteration
                    self.values[state] = actions[actions.argMax()] 
Example #37
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        for i in range(self.iterations):  # run the indicated iterations
            valuesCopy = self.values.copy()  # a copy of the Counter
            for state in self.mdp.getStates():
                tmpValues = util.Counter()
                for action in self.mdp.getPossibleActions(state):
                    for nextState, prob in self.mdp.getTransitionStatesAndProbs(
                            state, action):
                        tmpValues[action] += prob * (
                            mdp.getReward(state, action, nextState) +
                            self.discount * valuesCopy[nextState])
                self.values[state] = tmpValues[
                    tmpValues.argMax()]  # return highest value
Example #38
 def __init__(self, mdp, discount = 0.9, iterations = 100):
   """
     Your value iteration agent should take an mdp on
     construction, run the indicated number of iterations
     and then act according to the resulting policy.
   
     Some useful mdp methods you will use:
         mdp.getStates()
         mdp.getPossibleActions(state)
         mdp.getTransitionStatesAndProbs(state, action)
         mdp.getReward(state, action, nextState)
   """
   self.mdp = mdp
   self.discount = discount
   self.iterations = iterations
   self.values = util.Counter() # A Counter is a dict with default 0
    
   "*** YOUR CODE HERE ***"
   # Value iteration: util.Counter tracks the per-action values for each state
   for times in range(0, iterations):
     #values from previous iteration so we don't update over them while iterating
     prevVals = self.values.copy()
     #iterate through all states
     for state in mdp.getStates():
       #will store the action-value for the iteration
       value = util.Counter()
       for action in mdp.getPossibleActions(state):
         for transitionState, probability in mdp.getTransitionStatesAndProbs(state, action):
           #expected value, probability * reward for the state with the discount * reward
           value[action] += probability * (mdp.getReward( state, action, transitionState) + discount * prevVals[transitionState])
       #update the values to the new value from the iteration
       #the .argMax() function returns the one with the largest value
       self.values[state] = value[value.argMax()]
Example #39
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        states = mdp.getStates()
        for i in range(iterations):
            lastValues = self.values.copy()
            for s in states:
                actions = mdp.getPossibleActions(s)
                if not actions:
                    continue
                values = []
                for a in actions:
                    sum = 0
                    for s2, p in mdp.getTransitionStatesAndProbs(s,a):
                        sum += p * lastValues[s2]
                    values.append(mdp.getReward(s,None,None) + self.discount*sum)
                self.values[s] = max(values)
Example #40
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0
        helper_vector = util.Counter() # Copy of vectors to be used for batch updating 
        
        for i in range(self.iterations):
            for state in mdp.getStates():
                if mdp.isTerminal(state):
                    continue
                if mdp.getPossibleActions(state):
                    helper_vector[state] = sum([transition[1]*(mdp.getReward(state,mdp.getPossibleActions(state)[0],transition[0])+self.discount*self.values[transition[0]])
                        for transition in mdp.getTransitionStatesAndProbs(state, mdp.getPossibleActions(state)[0])] )
                for action in mdp.getPossibleActions(state):
                    helper_vector[state] = max(helper_vector[state],sum([ transition[1]*(mdp.getReward(state, action, transition[0])+self.discount*self.values[transition[0]])
                        for transition in mdp.getTransitionStatesAndProbs(state, action)] ))
            for state in helper_vector:
                self.values[state] = helper_vector[state]
Example #41
    def computeActionFromValues(self, state):
        """
          The policy is the best action in the given state
          according to the values currently stored in self.values.

          You may break ties any way you see fit.  Note that if
          there are no legal actions, which is the case at the
          terminal state, you should return None.
        """
        "*** YOUR CODE HERE ***"
        mdp = self.mdp
        possibleActions = mdp.getPossibleActions(state)
        valuesForAction = util.Counter()
        if (mdp.isTerminal(state)):
            return None

        for action in possibleActions:
            tp = mdp.getTransitionStatesAndProbs(state, action)
            sum = 0
            for i in range(len(tp)):
                nextState, prob = tp[i]
                sum = sum + prob * (mdp.getReward(state, action, nextState) +
                                    self.discount * self.values[nextState])
            valuesForAction[action] = sum

        if (valuesForAction.totalCount() == 0):
            return possibleActions[0]
        return valuesForAction.argMax()
    def computeActionFromValues(self, state):
        """
          The policy is the best action in the given state
          according to the values currently stored in self.values.

          You may break ties any way you see fit.  Note that if
          there are no legal actions, which is the case at the
          terminal state, you should return None.
        """
        "*** YOUR CODE HERE ***"
        mdp = self.mdp
        actions = mdp.getPossibleActions(state)
        if len(actions) == 0:
            return None  # no legal actions (e.g. the terminal state), per the docstring
        best_action = (-99999999999999, None)
        for a in actions:
            transitions = mdp.getTransitionStatesAndProbs(state,a)
            value_of_action = 0
            for y in transitions:
                prob = y[1]
                new_state = y[0]
                reward = mdp.getReward(state,a,new_state)
                next = self.values[new_state]
                val = prob*(reward + self.discount*next)
                value_of_action += val
                
            if value_of_action >= best_action[0]:
                best_action = (value_of_action, a)

        return best_action[1]
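
Both computeActionFromValues variants above recompute the expectimax sum inline; an equivalent, shorter sketch that leans on the computeQValueFromValues helper shown in later examples and returns None at terminal states, as the docstring asks:

def computeActionFromValues(self, state):
    actions = self.mdp.getPossibleActions(state)
    if not actions:
        return None  # no legal actions at the terminal state
    # one-step lookahead: pick the action with the largest Q-value under self.values
    return max(actions, key=lambda a: self.computeQValueFromValues(state, a))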
Example #43
  def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
      Your value iteration agent should take an mdp on
      construction, run the indicated number of iterations
      and then act according to the resulting policy.
    
      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter() # A Counter is a dict with default 0
     
    "*** YOUR CODE HERE ***"
    # Init : Not required

    # Value iteration
    for i in range(iterations):
        old_values = self.values.copy()
        for state in mdp.getStates():
            value_state_action = []
            for action in mdp.getPossibleActions(state):
                val = 0 
                transition = mdp.getTransitionStatesAndProbs(state,action)
                for sstate,prob_s_a_ss in transition:
                    val += prob_s_a_ss*(mdp.getReward(state,action,sstate) + discount*old_values[sstate])
                value_state_action.append(val)
            if value_state_action : self.values[state] = max(value_state_action)
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        # Write value iteration code here
        for i in xrange(iterations):
            new_values = self.values.copy()

            for state in mdp.getStates():
                actions = mdp.getPossibleActions(state)
                qValues = []
                for action in actions:
                    probs = mdp.getTransitionStatesAndProbs(state, action)
                    qvs = [
                        p * (self.values[nextState] * discount +
                             mdp.getReward(state, action, nextState))
                        for nextState, p in probs
                    ]
                    qValues.append(sum(qvs))
                if len(qValues) > 0:
                    new_values[state] = max(qValues)
                else:
                    new_values[state] = 0

            self.values = new_values
Example #45
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0
        while self.iterations > 0:
            prev_values = self.values.copy()
            for state in mdp.getStates():
                actions = mdp.getPossibleActions(state)
                if not actions:
                    continue
                self.values[state] = max([
                    sum([
                        prob * (mdp.getReward(state, act, state1) +
                                discount * prev_values[state1])
                        for state1, prob in mdp.getTransitionStatesAndProbs(
                            state, act)
                    ]) for act in actions
                ])
            self.iterations -= 1
Example #46
 def __init__(self, mdp, discount = 0.9, iterations = 100):
   """
     Your value iteration agent should take an mdp on
     construction, run the indicated number of iterations
     and then act according to the resulting policy.
   
     Some useful mdp methods you will use:
         mdp.getStates()
         mdp.getPossibleActions(state)
         mdp.getTransitionStatesAndProbs(state, action)
         mdp.getReward(state, action, nextState)
   """
   self.mdp = mdp
   self.discount = discount
   self.iterations = iterations
   self.values = util.Counter() # A Counter is a dict with default 0
    
   "*** YOUR CODE HERE ***"
   for times in range(iterations):
       V = self.values.copy()
       for state in mdp.getStates():
           action_values = util.Counter()
           for action in mdp.getPossibleActions(state):
               for trans_state, prob in mdp.getTransitionStatesAndProbs(state, action):
                   action_values[action] += prob * (mdp.getReward( state, action, trans_state) + discount * V[trans_state])
           self.values[state] = action_values[action_values.argMax()]
Example #47
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0
        newValues = util.Counter()
        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        allStates = mdp.getStates()

        for i in range(iterations):
            for s in allStates:
                if mdp.isTerminal(s): continue
                mx = float("-inf")
                for a in mdp.getPossibleActions(s):
                    score = 0
                    for (sp, tp) in mdp.getTransitionStatesAndProbs(s, a):
                        score += tp * (mdp.getReward(s, a, sp) +
                                       self.discount * self.values[sp])
                    if score > mx:
                        mx = score
                newValues[s] = mx
            self.values = newValues.copy()
Example #48
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        #states = mdp.getStates()
        #values = {state: 0 for state in states}
        for i in range(iterations):
            previous = self.values.copy()
            for state in mdp.getStates():
                possibleActions = mdp.getPossibleActions(state)
                if len(possibleActions) == 0: continue
                results = []
                for action in possibleActions:
                    total = 0
                    for (nextState, prob) in mdp.getTransitionStatesAndProbs(state,action):
                        total += (prob * previous[nextState])
                    results.append(total)
                self.values[state] = mdp.getReward(state) + (discount * max(results))
  def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
      Your value iteration agent should take an mdp on
      construction, run the indicated number of iterations
      and then act according to the resulting policy.
    
      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter() # A Counter is a dict with default 0
    self.qvalues = util.Counter()
    self.bestact = util.Counter()

    "*** YOUR CODE HERE ***"
    states = mdp.getStates()

    for i in range(iterations):
        v = util.Counter()
        for state in states:
            if mdp.isTerminal(state):
                continue
            value = {action: sum(prob * (mdp.getReward(state,action,next_state) + discount*self.values[next_state])
                    for next_state, prob in mdp.getTransitionStatesAndProbs(state, action))
                    for action in mdp.getPossibleActions(state)}
            self.bestact[state] = max(value, key=value.get)
            v[state] = value[self.bestact[state]] 
            for action in value.keys():
                self.qvalues[state,action] = value[action]
        self.values = v.copy()
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0
        mdpStates = mdp.getStates()

        for iteration in xrange(iterations):
          newValues = util.Counter()
          for state in mdpStates:
            if self.mdp.isTerminal(state):
              continue
            actionValues = -sys.maxint - 1
            for action in mdp.getPossibleActions(state):
              sum = 0
              for transitionState, prob in mdp.getTransitionStatesAndProbs(state, action):
                sum += prob*(mdp.getReward(state, action, transitionState) + discount * self.values[transitionState])
              if sum > actionValues:
                actionValues = sum
            newValues[state] = actionValues
          self.values = newValues
Example #51
    def computeQValueFromValues(self, state, action):
        """
          Compute the Q-value of action in state from the
          value function stored in self.values.
        """
        "*** YOUR CODE HERE ***"
        # get the discount
        discount = self.discount
        # get the values
        values = self.values
        # get the mdp
        mdp = self.mdp
        # set initial q value
        qv = 0
        #
        tStatesAndProbs = mdp.getTransitionStatesAndProbs(state, action)
        # keep track of pairs seen so far
        j = 0
        while j < len(tStatesAndProbs):
            # extract tState and Prob from this member of the list
            tState = tStatesAndProbs[j][0]
            prob = tStatesAndProbs[j][1]
            # calculate the qv the same way we calculated v above
            qv = qv + ((discount * values[tState]) +
                       mdp.getReward(state, action, tState)) * prob
            # increment
            j = j + 1

        return qv
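
As a concrete, made-up check of the loop above: with two successors (prob 0.8, reward 0, value 10) and (prob 0.2, reward 1, value 0) and a discount of 0.9, it returns qv = 0.8*(0.9*10 + 0) + 0.2*(0.9*0 + 1) = 7.2 + 0.2 = 7.4.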
 def __init__(self, mdp, discount = 0.9, iterations = 100):
   """
     Your value iteration agent should take an mdp on
     construction, run the indicated number of iterations
     and then act according to the resulting policy.
   
     Some useful mdp methods you will use:
         mdp.getStates()
         mdp.getPossibleActions(state)
         mdp.getTransitionStatesAndProbs(state, action)
         mdp.getReward(state, action, nextState)
   """
   self.mdp = mdp
   self.discount = discount
   self.iterations = iterations
   self.values = util.Counter() # A Counter is a dict with default 0
   self.policy = util.Counter()
   oldValues = util.Counter()
   states = mdp.getStates()
   for x in xrange(0,iterations):
       for state in states:
           possibleActions = mdp.getPossibleActions(state)
           qValues = util.Counter()
           for action in possibleActions:
                qValue = 0
               for nextState, prob in mdp.getTransitionStatesAndProbs(state,action):
                   qValue += prob*(mdp.getReward(state, action, nextState)+discount*oldValues[nextState])
               qValues[action] = qValue
           bestAction = qValues.argMax()
           self.values[state] = qValues[bestAction]
       for value in self.values:
           oldValues[value] = self.values[value]
    def computeQValueFromValues(self, state, action):
        """
          Compute the Q-value of action in state from the
          value function stored in self.values.
        """
        "*** YOUR CODE HERE ***"
        mdp = self.mdp
        discount = self.discount
        iterations = self.iterations
        values = self.values

        #states = mdp.getStates()
        #print('state is :', state)

        qValues = []

        sum = 0

        for item in mdp.getTransitionStatesAndProbs(
                state, action
        ):  #'mdp.getTransitionStatesAndProbs :', [((0, 1), 1.0), ((0, 0), 0.0), ((0, 2), 0.0)]
            resultState = item[0]
            prob = item[1]
            sum += prob * (mdp.getReward(state, action, resultState) +
                           discount * values[resultState])

        return sum
 def __init__(self, mdp, discount = 0.9, iterations = 100):
   """
     Your value iteration agent should take an mdp on
     construction, run the indicated number of iterations
     and then act according to the resulting policy.
   
     Some useful mdp methods you will use:
         mdp.getStates()
         mdp.getPossibleActions(state)
         mdp.getTransitionStatesAndProbs(state, action)
         mdp.getReward(state, action, nextState)
   """
   self.mdp = mdp
   self.discount = discount
   self.iterations = iterations
   
   self.utilities = {}
   self.qvalues = {}
   states = mdp.getStates()
   for state in states:
       self.utilities[state] = 0
       self.qvalues[state] = util.Counter()
       
   for i in range(iterations):
       newUtilities = {}
       for state in states:
           if self.mdp.isTerminal(state):
               continue
           childQs = []
           for action in mdp.getPossibleActions(state):
               q_value = 0
               for transition in mdp.getTransitionStatesAndProbs(state,action):
                   q_value += transition[1]*(mdp.getReward(state,action,transition[0]) + \
                       discount*self.utilities[transition[0]])
               childQs.append(q_value)
           newUtilities[state] = max(childQs)
       self.utilities.update(newUtilities)
   
   """ q-values are a dictionary from states to dictionaries of action => qvalue mappings"""
   
   for state in states:
       for action in mdp.getPossibleActions(state):
           q_value = 0
           for transition in mdp.getTransitionStatesAndProbs(state,action):
               q_value += transition[1]*(mdp.getReward(state,action,transition[0]) + \
                   discount*self.utilities[transition[0]])
           self.qvalues[state][action] = q_value
Example #55
  def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
      Your value iteration agent should take an mdp on
      construction, run the indicated number of iterations
      and then act according to the resulting policy.
    
      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter() # A Counter is a dict with default 0
     
    "*** YOUR CODE HERE ***"
    ## get dict of transitions for all (state, action, destination):
    self.T = dict()
    self.R = dict()
    self.D = dict()
    
    for state in self.mdp.getStates():
        if self.mdp.isTerminal(state):
            continue
        actions = self.mdp.getPossibleActions(state)
        for action in actions:
            for (destination, prob ) in mdp.getTransitionStatesAndProbs(state, action):
                self.T[(state, action, destination)] = prob
                self.R[(state, action, destination)] = mdp.getReward(state, action, destination)
                if (state,action) not in self.D:
                    self.D[(state, action)] = [destination]
                else:
                    self.D[(state, action)] += [destination]
    #print "----------T:\n", self.T, '\n-------R:\n', self.R, '\n------D:\n', self.D
    for _ in range(iterations):
        #print "\n----------SELF.VALUES\n", self.values
        copyVals = util.Counter() 
        for state in self.mdp.getStates():
            if self.mdp.isTerminal(state):
                continue
            actions = self.mdp.getPossibleActions(state)

            bestVals = []
            #print state , mdp.getPossibleActions(state)
            for action in actions:
                #for dest in self.D[state,action]:
                    #print "(state,action,dest):", (state, action, dest) , "T:", self.T[(state, action, dest)], "R:",self.R[(state, action, dest)], "k_val:", self.values[dest]
                    #bestVals += [ sum( [ self.T[state, action, dest] * ( self.R[state, action, dest] + self.discount*self.values[dest] ) ] ) ]
                bestVals += [self.getQValue(state,action)]
                #bestVals += [ sum( [ T[state, action, dest] * ( R[state, action, dest] + self.discount*self.values[state] )  
                #                    for  dest in D[state,action] ] ) ]
                
            #print "state", state, "bestvals", bestVals
            copyVals[state] = max(bestVals)
            #self.values[state] = max(bestVals)
            
        self.values = copyVals.copy()
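
The commented-out lines above hint at how the cached T, R and D tables feed getQValue; a sketch of that helper under the same assumptions (its actual body is not included in this snippet):

def getQValue(self, state, action):
    # T holds transition probabilities, R the rewards, and D the destinations
    # reachable from each (state, action) pair, all cached in the constructor above
    return sum(self.T[(state, action, dest)] *
               (self.R[(state, action, dest)] + self.discount * self.values[dest])
               for dest in self.D[(state, action)])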
Example #57
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0
        self.actions = util.Counter()

        tempValues = util.Counter()

        # Write value iteration code here
        for k in range(0,iterations):
          for state in mdp.getStates():
            maxAction = float("-inf")
            for action in mdp.getPossibleActions(state):
              total = 0
              for nextState, prob in mdp.getTransitionStatesAndProbs(state, action):
                total += prob * (self.values[nextState] * discount + mdp.getReward(state, action, nextState))
              maxAction = max(maxAction, total)
            tempValues[state] = maxAction
          for state in mdp.getStates():
            if tempValues[state] > float("-inf"):
              self.values[state] = tempValues[state]

        for state in mdp.getStates():
          maxAction = None
          maxActionValue = float("-inf")
          for action in mdp.getPossibleActions(state):
            total = 0
            for nextState, prob in mdp.getTransitionStatesAndProbs(state, action):
              total += prob * (self.values[nextState] * discount + mdp.getReward(state, action, nextState))
            if total > maxActionValue:
              maxActionValue = total
              maxAction = action
          self.actions[state] = maxAction
  def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
      Your value iteration agent should take an mdp on
      construction, run the indicated number of iterations
      and then act according to the resulting policy.
    
      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
     
    "*** YOUR CODE HERE ***"
    "value at each state"
    self.V = util.Counter()
    self.tempV = util.Counter()
    "Q for each state,action pair"
    self.Q = util.Counter()
    "policy for each state = best action to take"
    self.P = util.Counter()
    gamma = self.discount

    for iter in range(1,self.iterations+1):
      for state in mdp.getStates():
        "There is a Q for each (state,action) pair, so index this by state and keep a list of all actions"
        self.Q[state] = util.Counter()
        "Cycle through each possible action for the given state"
        for action in mdp.getPossibleActions(state):
          for neighborStateAndTransitionProb in mdp.getTransitionStatesAndProbs(state,action):
            [neighborState, T_s_a_sp] = neighborStateAndTransitionProb  
            "Compute the Q values for this state and the available actions"
            R_s_a_sp = mdp.getReward(state,action,neighborState)
            self.Q[state][action] += T_s_a_sp*(R_s_a_sp+gamma*self.V[neighborState])
            
        "As long as there were actions at this state, find the one that produces the largest Q value"
        if len(self.Q[state]) > 0:
          maxQstate = -1000000
          maxQAction = None
          for key,value in self.Q[state].items():
            if value > maxQstate:
                maxQstate = value
                maxQAction = key
            elif value == maxQstate:
                [maxQstate,maxQAction] = random.choice([[maxQstate,maxQAction],[value,key]])
          if maxQstate == -1000000:
            maxQstate = 0.0
                
          "Find the policy (or best action) that corresponds to the best Q value"
          self.P[state] = maxQAction
          "Choose the value of the state to be the max Q value that the state has"
          self.tempV[state] = self.Q[state][maxQAction]

      "After all states have been updated, store tempV in V before the next iteration"
      self.V = self.tempV.copy()
Example #59
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        print "-----------------------------------------------------"
        "*** MY CODE BEGINS ***"
        k = 0
        while k < iterations:
            val = self.values.copy()  #before each iteration, copy one.
            for s in mdp.getStates():
                if mdp.isTerminal(s) == False:
                    max = -999999
                    for action in mdp.getPossibleActions(s):
                        v = 0
                        for pos_pro in mdp.getTransitionStatesAndProbs(s,action):
                            v = v + pos_pro[1]*(mdp.getReward(s,action,pos_pro[0])+discount*self.values[pos_pro[0]])
                        if v > max:
                            max = v
                    val[s] = max
                else:
                    for action in mdp.getPossibleActions(s):
                        v = 0
                        for pos_pro in mdp.getTransitionStatesAndProbs(s,action):
                            v = v + pos_pro[1]*(mdp.getReward(s,action,pos_pro[0])+discount*self.values[pos_pro[0]])
                        val[s] = v
            k = k+1
            for s in mdp.getStates():
                self.values[s] = val[s]
 def computeQValueFromValues(self, state, action):
     """
       Compute the Q-value of action in state from the
       value function stored in self.values.
     """
     "*** YOUR CODE HERE ***"
     mdp = self.mdp
     gamma = self.discount
     successors = mdp.getTransitionStatesAndProbs(state, action)
     
     return sum([successor[1] * (mdp.getReward(state, action, successor[0]) + gamma * self.getValue(successor[0])) for successor in successors])
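
All of the constructors above run a fixed number of sweeps; a hedged sketch of an early-stopping variant of the same loop, meant to sit inside __init__ like the snippets above (the 1e-6 tolerance and the maxDelta bookkeeping are our own additions, not part of the assignment interface):

for i in range(iterations):
    newValues = util.Counter()
    maxDelta = 0.0
    for state in mdp.getStates():
        qValues = util.Counter()
        for action in mdp.getPossibleActions(state):
            for nextState, prob in mdp.getTransitionStatesAndProbs(state, action):
                qValues[action] += prob * (mdp.getReward(state, action, nextState)
                                           + self.discount * self.values[nextState])
        newValues[state] = qValues[qValues.argMax()]
        maxDelta = max(maxDelta, abs(newValues[state] - self.values[state]))
    self.values = newValues
    if maxDelta < 1e-6:  # values have effectively converged; stop early
        break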