def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
      Build the agent and run batch value iteration on construction.

      Starting from V_0(s) = 0 for every state, exactly `iterations`
      synchronous Bellman backups are performed:

          V_{k+1}(s) = max_a sum_{s'} T(s, a, s') *
                           (R(s, a, s') + discount * V_k(s'))

      mdp        -- object providing getStates(),
                    getPossibleActions(state),
                    getTransitionStatesAndProbs(state, action) and
                    getReward(state, action, nextState)
      discount   -- factor applied to the value of successor states
      iterations -- number of backups to run; zero or a negative
                    number runs none and leaves every value at 0
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter() # A Counter is a dict with default 0

    # range() is empty for iterations <= 0, so construction always
    # terminates (a `counter != iterations` loop never would there).
    for _ in range(self.iterations):
        # Read only from the previous sweep (self.values) and write into
        # a fresh Counter, so no state sees half-updated values.
        newValues = util.Counter()
        for state in mdp.getStates():
            qValues = []
            for action in mdp.getPossibleActions(state):
                qValue = 0
                for nextState, prob in mdp.getTransitionStatesAndProbs(state, action):
                    reward = mdp.getReward(state, action, nextState)
                    qValue += prob * (reward + self.discount * self.values[nextState])
                qValues.append(qValue)
            # A state without legal actions is worth 0.
            newValues[state] = max(qValues) if qValues else 0
        self.values = newValues
 def __init__(self, mdp, discount = 0.9, iterations = 100):
     """
       Store the MDP and parameters, then run `iterations` sweeps of
       value iteration.

       Each sweep reads from a deep copy of the values taken at its
       start and writes the best action total of every state into
       self.values.  Transition weights come from self.T(s, a, s2),
       evaluated against every state of the MDP.
     """
     self.mdp = mdp
     self.discount = discount
     self.iterations = iterations
     self.values = util.Counter() # A Counter is a dict with default 0

     for _ in range(iterations):
         # Frozen snapshot of the values as they were before this sweep.
         previous = copy.deepcopy(self.values)
         for state in mdp.getStates():
             totals = []
             for action in mdp.getPossibleActions(state):
                 total = 0
                 for successor in mdp.getStates():
                     weight = self.T(state, action, successor)
                     reward = mdp.getReward(state, action, successor)
                     total += weight * (reward + discount * previous[successor])
                 totals.append(total)
             # No legal action: the state is worth 0.
             self.values[state] = max(totals) if totals else 0
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Store the MDP and parameters, then run self.iterations sweeps
          of batch value iteration.

          Q-values come from self.computeQValueFromValues(state, action).
          After each sweep self.values is replaced by a Counter holding
          the best Q-value of every state that has at least one action.
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Give every state an explicit zero entry before the first sweep.
        for s in mdp.getStates():
            self.values[s] = 0

        for _ in range(self.iterations):
            # Collected separately so self.values stays fixed during the sweep.
            sweepValues = util.Counter()
            for s in mdp.getStates():
                scores = [self.computeQValueFromValues(s, a)
                          for a in self.mdp.getPossibleActions(s)]
                if scores:
                    sweepValues[s] = max(scores)
            # Publish the sweep's results in one step.
            self.values = sweepValues
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Store the MDP and parameters, then run `iterations` sweeps of
          batch value iteration.

          In every sweep the Q-value of each (state, action) pair is
          computed with self.getQValue against the values of the
          previous sweep, and each state receives the largest of its
          Q-values.  States without legal actions are worth 0.

          self.values only ever contains states as keys.

          mdp        -- the MDP to solve (getStates, getPossibleActions)
          discount   -- discount factor kept on the agent
          iterations -- number of sweeps to run
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        for _ in range(iterations):
            # Keep the sweep's results apart from self.values: getQValue
            # must see the previous sweep for every state, and the value
            # table must not fill up with (state, action) entries.
            nextValues = util.Counter()
            for state in mdp.getStates():
                qValues = [self.getQValue(state, action)
                           for action in mdp.getPossibleActions(state)]
                # The best stored Q-value is the value of the greedy action.
                nextValues[state] = max(qValues) if qValues else 0
            self.values = nextValues

        """
Example #5
0
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Store the MDP and parameters, then run the requested number
          of sweeps.

          A sweep calls self.computeActionFromValues(state) for every
          state and afterwards copies self.newvalues into self.values
          state by state.
          NOTE(review): presumably computeActionFromValues fills
          self.newvalues as a side effect -- confirm against that method.
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        self.newvalues = util.Counter()

        sweepsLeft = iterations
        while sweepsLeft > 0:
            sweepsLeft -= 1
            # First pass: visit every state against the current values.
            for s in mdp.getStates():
                self.computeActionFromValues(s)
            # Second pass: publish the results only after all states were visited.
            for s in mdp.getStates():
                self.values[s] = self.newvalues[s]
 def __init__(self, mdp, discount = 0.9, iterations = 100):
     """
       Store the MDP and parameters, then run `iterations` sweeps of
       batch value iteration.

       Every non-terminal state is updated with

           V(s) = max_a sum_{s1} p * (R(s, a, s1) + discount * V_prev(s1))

       where the reward is queried for the actual action and successor.
       Terminal states, and states without legal actions, keep their
       current value (initially 0).

       mdp        -- the MDP to solve
       discount   -- factor applied to the value of successor states
       iterations -- number of sweeps to run
     """
     self.mdp = mdp
     self.discount = discount
     self.iterations = iterations
     self.values = util.Counter() # A Counter is a dict with default 0

     # Explicit zero entry for every state.
     for s in mdp.getStates():
         self.values[s] = 0

     for _ in range(iterations):
         # Snapshot of the previous sweep; all reads go through it.
         previous = self.values.copy()
         for s in mdp.getStates():
             if mdp.isTerminal(s):
                 continue
             qValues = [
                 sum(p * (mdp.getReward(s, a, s1) + discount * previous[s1])
                     for (s1, p) in mdp.getTransitionStatesAndProbs(s, a))
                 for a in mdp.getPossibleActions(s)
             ]
             # max() of an empty list raises, so skip actionless states.
             if qValues:
                 self.values[s] = max(qValues)
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Store the MDP and parameters, then run value iteration,
          keeping state values in self.vTable and per-action values
          in self.qTable.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)

          NOTE(review): self.values is created here but never written
          afterwards in this method; presumably the other methods read
          self.vTable / self.qTable instead -- confirm.
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        # self.depth is the 1-based number of the sweep about to run.
        self.depth = 1
        self.qTable = {}
        self.vTable = {}
        # Start every state value and every (state, action) value at 0.
        for state in mdp.getStates():
            self.vTable[state] = 0
            self.qTable[state] = {}
            for action in mdp.getPossibleActions(state):
                
                self.qTable[state][action] = 0
        
        # Runs while depth is 1..iterations, i.e. `iterations` sweeps for
        # an integer count.
        while self.depth < self.iterations + 1:
            # New values of this sweep; self.vTable is only replaced at the
            # end, so every read below sees the previous sweep.
            self.tempTable = {}
            for state in mdp.getStates():
                # Terminal states keep the value 0.
                self.stateValue = 0
                if not mdp.isTerminal(state):
                    # -9999 is the starting point for the running maximum; a
                    # non-terminal state with no actions would keep it.
                    self.stateValue = -9999
                    for action in mdp.getPossibleActions(state):
                        # Expected return of `action`, summed over its outcomes.
                        self.Qtotal = 0
                        for nextState,prob in mdp.getTransitionStatesAndProbs(state,action):
                            self.reward = mdp.getReward(state, action, nextState)
                            self.Qtotal += prob * (self.reward + self.discount * self.vTable[nextState])
                            #print "###state:",state,"Next",nextState,"reward:",self.reward,"Qtotal",self.Qtotal,"Value:",self.vTable[nextState]
                        self.qTable[state][action] = self.Qtotal
                        #print self.qTable[state][action]
                        self.stateValue = max(self.stateValue,self.qTable[state][action])
                else:
                    # Overwritten by the assignment right below, which stores
                    # the same 0 held in self.stateValue.
                    self.tempTable[state] = 0
                self.tempTable[state] = self.stateValue
            self.vTable = self.tempTable
            self.depth += 1
            
        # Final pass: recompute every Q-value from the finished self.vTable so
        # self.qTable matches the final state values.
        for state in mdp.getStates():
            # Assigned but not read again inside this loop.
            self.stateValue = -9999
            for action in mdp.getPossibleActions(state):
                self.Qtotal = 0
                for nextState,prob in mdp.getTransitionStatesAndProbs(state,action):
                    self.reward = mdp.getReward(state, action, nextState)
                    self.Qtotal += prob * (self.reward + self.discount * self.vTable[nextState])
                self.qTable[state][action] = self.Qtotal
Example #8
0
  def __init__(self, mdp, discount = 0.9, iterations = 100):
      """
        Store the MDP and parameters, run batch value iteration and
        derive a greedy policy from the result.

        After construction:
          self.values     -- Counter of state values after `iterations`
                             sweeps (terminal states stay at 0)
          self.policy     -- dict mapping each state to its best action,
                             or None for terminal / actionless states
          self.iterations -- still the value passed in

        mdp        -- the MDP to solve
        discount   -- factor applied to the value of successor states
        iterations -- number of sweeps to run
      """
      self.mdp = mdp
      self.discount = discount
      self.iterations = iterations
      self.values = util.Counter() # A Counter is a dict with default 0
      self.delta = 0

      # Count down on a local so self.iterations keeps the caller's value.
      remaining = self.iterations
      while remaining > 0:
          batchValues = util.Counter()
          for state in mdp.getStates():
              if mdp.isTerminal(state):
                  continue
              qValues = []
              for action in mdp.getPossibleActions(state):
                  expectedReward = 0
                  expectedValue = 0
                  for nextState, prob in mdp.getTransitionStatesAndProbs(state, action):
                      expectedValue += self.values[nextState] * prob
                      expectedReward += mdp.getReward(state, action, nextState) * prob
                  qValues.append(expectedReward + expectedValue * discount)
              # Taking max() of the real Q-values avoids a numeric sentinel
              # that would clamp very negative values.
              if qValues:
                  batchValues[state] = max(qValues)
          self.values = batchValues
          remaining -= 1

      self.policy = {}
      for state in mdp.getStates():
          if mdp.isTerminal(state):
              self.policy[state] = None
              continue
          actions = mdp.getPossibleActions(state)
          if not actions:
              self.policy[state] = None
              continue
          qValues = [self.getQValue(state, action) for action in actions]
          # index() returns the first maximum, so ties go to the earliest action.
          self.policy[state] = actions[qValues.index(max(qValues))]
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Store the MDP and parameters, then run `iterations` sweeps of
          batch value iteration while recording the chosen action of
          every state in self.actions.

          mdp        -- the MDP to solve
          discount   -- discount factor kept on the agent
          iterations -- number of sweeps to run

          Uses self.computeActionFromValues(state) to pick the action and
          self.computeQValueFromValues(state, action) to score it.
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Seed every state that has actions with its first legal action.
        self.actions = dict()
        for state in mdp.getStates():
            stateActions = mdp.getPossibleActions(state)
            if stateActions:
                self.actions[state] = stateActions[0]

        # range() exists on both Python 2 and 3; xrange does not.
        for _ in range(iterations):
            # Work on a copy so every lookup made through self.values during
            # the sweep still sees the previous sweep.
            nextValues = self.values.copy()

            for state in mdp.getStates():
                # No action can be taken in a terminal state.
                if mdp.isTerminal(state):
                    continue
                action = self.computeActionFromValues(state)
                self.actions[state] = action
                # Without a legal action there is nothing to score; the
                # state keeps its previous value.
                if action is not None:
                    nextValues[state] = self.computeQValueFromValues(state, action)

            # Publish the sweep's results.
            self.values.update(nextValues)
Example #10
0
  def __init__(self, mdp, discount = 0.9, iterations = 100):
      """
        Store the MDP and parameters, then run self.iterations sweeps.

        At the start of a sweep the current values are copied into
        self.currentStateValue.  Each state then receives the largest
        self.getQValue(state, action) over its actions, and that action
        is recorded in self.policy (None when no action beat the
        starting bound).
        NOTE(review): presumably getQValue reads self.currentStateValue
        rather than self.values -- confirm against that method.
      """
      self.mdp = mdp
      self.discount = discount
      self.iterations = iterations
      self.values = util.Counter() # A Counter is a dict with default 0

      self.policy = util.Counter()
      self.nextStateValue = util.Counter()
      states = mdp.getStates()

      # Explicit zero entry for every state.
      for s in mdp.getStates():
          self.values[s] = 0

      # Lower bound used as "no action seen yet" marker.
      unset = -99999999
      sweep = 0
      while sweep < self.iterations:
          self.currentStateValue = self.values.copy()
          for s in states:
              best_q, best_action = unset, None
              for candidate in mdp.getPossibleActions(s):
                  q = self.getQValue(s, candidate)
                  # Strict comparison: the first best action wins ties.
                  if q > best_q:
                      best_q, best_action = q, candidate
              if best_q != unset:
                  self.values[s] = best_q
              self.policy[s] = best_action
          sweep += 1
Example #11
0
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Store the MDP and parameters, then run `iterations` sweeps of
          batch value iteration.

          In each sweep every state asks self.getAction(state) for its
          action; when there is one, the state's new value is
          self.getQValue(state, action).  The sweep's Counter replaces
          self.values once all states were processed.
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        allStates = mdp.getStates()
        for _ in range(0, iterations):
            fresh = util.Counter()
            for s in allStates:
                chosen = self.getAction(s)
                if chosen is None:
                    # No action available: the state stays at the default 0.
                    continue
                fresh[s] = self.getQValue(s, chosen)
            self.values = fresh
Example #12
0
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Store the MDP and parameters, then run `iterations` sweeps of
          batch value iteration.

          Each state with at least one legal action receives the largest
          self.getQValue(state, action) over its actions.  States without
          legal actions (such as a terminal state) are left at the
          Counter default of 0, wherever they appear in mdp.getStates().

          mdp        -- the MDP to solve
          discount   -- discount factor kept on the agent
          iterations -- number of sweeps to run
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # The list returned by the MDP is only read, never modified.
        states = mdp.getStates()
        for _ in range(iterations):
            # getQValue reads self.values, so collect the sweep separately.
            newValues = util.Counter()
            for state in states:
                qValues = [self.getQValue(state, action)
                           for action in mdp.getPossibleActions(state)]
                # max() over real Q-values needs no sys.maxint sentinel.
                if qValues:
                    newValues[state] = max(qValues)
            self.values = newValues
Example #13
0
 def __init__(self, mdp, discount=0.9, iterations=100):
     """
     Store the MDP and parameters, then fill self.values for every
     iteration step.

     Values are stored under (state, step) keys.  Step k of a state is
     mdp.getReward(state) plus `discount` times the best expected
     step k-1 value over the state's actions (0 when it has none).
     """
     self.mdp = mdp
     self.discount = discount
     self.iterations = iterations
     self.values = util.Counter()  # A Counter is a dict with default values as 0

     all_states = mdp.getStates()
     for step in range(0, iterations):
         for s in all_states:
             expectations = []
             for a in mdp.getPossibleActions(s):
                 outcomes = mdp.getTransitionStatesAndProbs(s, a)
                 # Expected previous-step value of taking `a` in `s`.
                 expected = 0
                 for outcome in outcomes:
                     expected += self.values[outcome[0], step - 1] * outcome[1]
                 expectations.append(expected)
             best = max(expectations) if expectations else 0
             self.values[s, step] = mdp.getReward(s) + discount * best
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Store the MDP and parameters, then run `iterations` sweeps of
          batch value iteration.

          Every non-terminal state is updated with the largest expected
          return over its actions, computed from a copy of the values
          taken at the start of the sweep.  Terminal states are never
          written and therefore stay at the Counter default of 0.

          mdp        -- the MDP to solve
          discount   -- factor applied to the value of successor states
          iterations -- number of sweeps to run
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        for _ in range(iterations):
            previous = self.values.copy() # V sub k-1

            for state in mdp.getStates():
                if mdp.isTerminal(state):
                    continue
                qValues = util.Counter()
                for action in mdp.getPossibleActions(state):
                    for nextState, prob in mdp.getTransitionStatesAndProbs(state, action):
                        reward = mdp.getReward(state, action, nextState)
                        qValues[action] += prob * (reward + discount * previous[nextState])
                # argMax() of an empty Counter gives a key whose value is
                # the default 0, so actionless states end up at 0.
                self.values[state] = qValues[qValues.argMax()]
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Store the MDP and parameters, then compute the values sweep by
          sweep.

          For sweep numbers 0 through `iterations` (inclusive) the result
          of self.calcValue(states, sweep) is copied into self.values.
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        stateList = mdp.getStates()

        for sweep in range(iterations + 1):
            # Store a copy so later changes to calcValue's result cannot
            # leak into self.values.
            self.values = self.calcValue(stateList, sweep).copy()
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Store the MDP and parameters, then run self.iterations sweeps
          of batch value iteration.

          Each state's new value is the Q-value of the action returned
          by self.computeActionFromValues(state), or 0 when that method
          returns None.  The new Counter replaces self.values after the
          whole sweep.
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        allStates = mdp.getStates()
        for _ in range(0, self.iterations):
            nextValues = util.Counter()
            for s in allStates:
                bestAction = self.computeActionFromValues(s)
                if bestAction is not None:
                    nextValues[s] = self.computeQValueFromValues(s, bestAction)
                else:
                    nextValues[s] = 0
            self.values = nextValues
Example #17
0
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Store the MDP and parameters, then run self.iterations sweeps
          of batch value iteration.

          A sweep starts from a copy of the current values and gives each
          state the largest self.computeQValueFromValues(state, action)
          over its actions, or 0 when it has none.  The copy replaces
          self.values after the sweep.
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        for _ in range(self.iterations):
            updated = self.values.copy()
            for s in mdp.getStates():
                legal = self.mdp.getPossibleActions(s)
                if legal:
                    scores = [self.computeQValueFromValues(s, a) for a in legal]
                    updated[s] = max(scores)
                else:
                    # Nothing can be done here, so the state is worth 0.
                    updated[s] = 0
            self.values = updated
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Store the MDP and parameters, then run `iterations` sweeps via
          self.do_one_iteration(mdp).

          After each sweep self.values becomes a copy of self.new_values.
          NOTE(review): presumably do_one_iteration reads self.values and
          writes self.new_values -- confirm against that method.

          mdp        -- the MDP to solve
          discount   -- discount factor kept on the agent
          iterations -- number of sweeps to run
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0
        self.new_values = util.Counter() # A Counter is a dict with default 0
        self.my_states = mdp.getStates()
        for _ in range(0, iterations):
            self.do_one_iteration(mdp)
            # Copy instead of aliasing: with a plain assignment both names
            # would refer to one Counter from the second sweep on, and
            # writes to new_values would show up in values mid-sweep.
            self.values = self.new_values.copy()
        print("Finished my iterations ")
	def __init__(self, mdp, discount = 0.9, iterations = 100):
		"""
		  Store the MDP and parameters, then run `iterations` sweeps of
		  batch value iteration.

		  Each state's new value is the Q-value of the action returned by
		  self.computeActionFromValues(state); states for which it returns
		  None are worth 0.  self.values stays a util.Counter throughout.

		  mdp        -- the MDP to solve
		  discount   -- discount factor kept on the agent
		  iterations -- number of sweeps to run
		"""
		self.mdp = mdp
		self.discount = discount
		self.iterations = iterations
		self.values = util.Counter() # A Counter is a dict with default 0
		self.startState = (0, 0)

		for _ in range(0, iterations):
			# A Counter (not a plain dict) keeps the default-0 lookup that
			# the rest of the agent expects from self.values.
			currentValues = util.Counter()
			for state in mdp.getStates():
				bestAction = self.computeActionFromValues(state)
				if bestAction is None:
					# No action to score, e.g. a terminal state.
					currentValues[state] = 0
				else:
					currentValues[state] = self.computeQValueFromValues(state, bestAction)
			self.values = currentValues
Example #20
0
 def __init__(self, mdp, discount = 0.9, iterations = 100):
     """
       Store the MDP and parameters, then run `iterations` sweeps of
       batch value iteration.

       Each sweep reads from a copy of the values taken at its start and
       writes, for every state, the largest expected return over its
       actions (0 when the state has no actions).
     """
     self.mdp = mdp
     self.discount = discount
     self.iterations = iterations
     self.values = util.Counter() # A Counter is a dict with default 0

     for _ in range(iterations):
         snapshot = self.values.copy()
         for state in mdp.getStates():
             totals = [
                 sum(prob * (mdp.getReward(state, action, nxt) + discount * snapshot[nxt])
                     for nxt, prob in mdp.getTransitionStatesAndProbs(state, action))
                 for action in mdp.getPossibleActions(state)
             ]
             self.values[state] = max(totals) if totals else 0
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Store the MDP and parameters, then run `iterations` sweeps of
          batch value iteration.

          Each state with at least one legal action receives the largest
          self.computeQValueFromValues(state, action) over its actions;
          all other states stay at the Counter default of 0.  Every state
          returned by mdp.getStates() is updated in every sweep.

          mdp        -- the MDP to solve
          discount   -- discount factor kept on the agent
          iterations -- number of sweeps to run
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        for _ in range(iterations):
            # computeQValueFromValues reads self.values, so the sweep is
            # collected in a separate Counter.
            nextValues = util.Counter()
            for state in mdp.getStates():
                qValues = [self.computeQValueFromValues(state, action)
                           for action in mdp.getPossibleActions(state)]
                if qValues:
                    nextValues[state] = max(qValues)
            # Replace the table as a whole; copying back key by key over
            # self.values would miss every state not yet present in it.
            self.values = nextValues
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Store the MDP and parameters, then run `iterations` sweeps of
          batch value iteration.

          Reads come from a copy of the values taken at the start of each
          sweep.  A state with actions gets the largest expected return
          over them; a state without actions is rewritten with its value
          from the copy.
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        transitions = mdp.getTransitionStatesAndProbs
        reward = mdp.getReward
        for _ in range(iterations):
            snapshot = self.values.copy()
            for state in mdp.getStates():
                if len(mdp.getPossibleActions(state)) == 0:
                    self.values[state] = snapshot[state]
                    continue
                totals = []
                for action in mdp.getPossibleActions(state):
                    total = 0
                    for successor, prob in transitions(state, action):
                        total += prob * (reward(state, action, successor)
                                         + discount * snapshot[successor])
                    totals.append(total)
                self.values[state] = max(totals)
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Store the MDP and parameters, then run `iterations` sweeps of
          batch value iteration on a local table.

          Each sweep reads from a copy of the table and gives every state
          the largest expected return over its actions (0 when it has
          none).  self.values is assigned once, after the last sweep.
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations

        table = util.Counter()
        all_states = mdp.getStates()

        for _ in range(iterations):
            frozen = table.copy()
            for s in all_states:
                returns = []
                for a in mdp.getPossibleActions(s):
                    q = 0
                    for el in mdp.getTransitionStatesAndProbs(s, a):
                        # el is (next state, probability).
                        q += el[1] * (mdp.getReward(s, a, el[0]) + discount * frozen[el[0]])
                    returns.append(q)
                table[s] = max(returns) if returns else 0

        self.values = table
Example #24
0
 def __init__(self, mdp, discount = 0.9, iterations = 100):
     """
       Store the MDP and parameters, then run `iterations` sweeps of
       batch value iteration.

       The values at the start of a sweep are kept in self.prevBatch;
       every state is then overwritten with its largest accumulated
       action total computed from that snapshot.
     """
     self.mdp = mdp
     self.discount = discount
     self.iterations = iterations
     self.values = util.Counter() # A Counter is action dict with default 0

     for _ in range(iterations):
         self.prevBatch = self.values.copy()
         for s in mdp.getStates():
             # Per-action totals; only actions with at least one outcome
             # get an entry.
             totals = util.Counter()
             for a in mdp.getPossibleActions(s):
                 for (nxt, weight) in mdp.getTransitionStatesAndProbs(s, a):
                     gain = mdp.getReward(s, a, nxt) + self.discount * self.prevBatch[nxt]
                     totals[a] += weight * gain
             best = totals.argMax()
             self.values[s] = totals[best]
Example #25
0
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Store the MDP and settings, then run ``iterations`` sweeps of
          batch value iteration.

          mdp:        object offering getStates() and getPossibleActions(state)
          discount:   discount factor, stored on the instance
          iterations: number of full sweeps to perform

          Each sweep builds a fresh table holding, for every state, the
          best value reported by ``self.computeQValueFromValues`` (defined
          elsewhere on the class) over the state's possible actions, and
          only afterwards replaces ``self.values`` with that table.
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        states = mdp.getStates()
        for _ in range(iterations):
            # Must be a Counter, not a plain dict: self.values is replaced
            # by this table and has to keep its default-0 look-up.
            newValues = util.Counter()
            for state in states:
                qValues = util.Counter()
                for action in mdp.getPossibleActions(state):
                    qValues[action] = self.computeQValueFromValues(state, action)
                newValues[state] = qValues[qValues.argMax()]
            self.values = newValues
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Store the MDP and settings, then run ``iterations`` sweeps of
          batch value iteration.

          mdp:        object offering getStates() and getPossibleActions(state)
          discount:   discount factor, stored on the instance
          iterations: number of full sweeps to perform

          In every sweep each state that has at least one possible action
          receives the largest value returned by
          ``self.computeQValueFromValues`` (defined elsewhere on the
          class); states without actions keep the Counter default of 0.
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        states = mdp.getStates()
        for _ in range(iterations):
            newValues = util.Counter()
            for s in states:
                # A real list (not map()): map returns an iterator on
                # Python 3, which supports neither len() nor truth tests
                # on emptiness.
                qValues = [self.computeQValueFromValues(s, a)
                           for a in mdp.getPossibleActions(s)]
                if qValues:
                    newValues[s] = max(qValues)
            # Publish only after the whole sweep has been computed.
            self.values = newValues
Example #27
0
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Store the MDP and settings, then run ``iterations`` sweeps of
          batch value iteration.

          For every state the action returned by
          ``self.computeActionFromValues`` is evaluated with
          ``self.computeQValueFromValues`` (both defined elsewhere on the
          class); the results of a sweep replace ``self.values`` only once
          the sweep is complete.
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # missing keys read as 0

        for _ in range(iterations):
            # self.values keeps playing the role of V_k while V_{k+1} is
            # collected in a separate table.
            upcoming = util.Counter()
            for s in mdp.getStates():
                chosen = self.computeActionFromValues(s)
                upcoming[s] = self.computeQValueFromValues(s, chosen)
            self.values = upcoming
 def __init__(self, mdp, discount = 0.9, iterations = 100):
   """
     Store the MDP and settings, then run ``iterations`` sweeps of batch
     value iteration.

     A sweep copies the current table, accumulates for each state the
     backed-up value of every action in a Counter, and writes the entry
     chosen by the Counter's ``argMax()`` back into ``self.values``.
   """
   self.mdp = mdp
   self.discount = discount
   self.iterations = iterations
   self.values = util.Counter() # default value of a Counter entry is 0

   for _ in range(iterations):
     # Previous sweep's values; self.values is overwritten in place below.
     previous = self.values.copy()
     for state in mdp.getStates():
       scores = util.Counter()
       for action in mdp.getPossibleActions(state):
         for successor, p in mdp.getTransitionStatesAndProbs(state, action):
           target = mdp.getReward(state, action, successor) + discount * previous[successor]
           scores[action] += p * target
       winner = scores.argMax()
       self.values[state] = scores[winner]
Example #29
0
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Store the MDP and settings, then run ``iterations`` sweeps of
          batch value iteration.

          For each state the new value is the maximum of a starting floor
          (0 for terminal states, -infinity otherwise) and the values
          returned by ``self.computeQValueFromValues`` (defined elsewhere
          on the class) for every possible action.
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        for _ in range(iterations):
            fresh = util.Counter()
            for state in mdp.getStates():
                floor = 0 if mdp.isTerminal(state) else float("-inf")
                # The floor comes first so ties resolve in its favour.
                candidates = [floor]
                candidates.extend(
                    self.computeQValueFromValues(state, a)
                    for a in mdp.getPossibleActions(state))
                fresh[state] = max(candidates)
            self.values = fresh
 def __init__(self, mdp, discount = 0.9, iterations = 100):
   """
     Store the MDP and settings, then run ``iterations`` sweeps of value
     iteration.

     mdp:        object offering getStates()
     discount:   discount factor, stored and passed on to computeValue
     iterations: number of full sweeps to perform

     Each sweep calls ``self.computeValue(mdp, state, discount)`` for
     every state and then copies every entry of ``self.tmpValues`` into
     ``self.values``.
   """
   self.mdp = mdp
   self.discount = discount
   self.iterations = iterations
   self.values = util.Counter() # A Counter is a dict with default 0
   self.tmpValues = util.Counter()
   for _ in range(iterations):
     for state in mdp.getStates():
       # NOTE(review): computeValue is defined elsewhere; presumably it
       # stores the state's new value in self.tmpValues -- confirm.
       self.computeValue(mdp, state, discount)
     # Publish the sweep's results only after every state was processed.
     for key in self.tmpValues:
       self.values[key] = self.tmpValues[key]
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Store the MDP and settings, then run ``self.iterations`` sweeps
          of value iteration.

          In a sweep every non-terminal state is assigned, directly in
          ``self.values``, the largest value returned by
          ``self.computeQValueFromValues`` over its possible actions;
          afterwards all entries are mirrored into ``self.ValuesDup``.
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0
        self.ValuesDup = util.Counter()

        sweepsLeft = self.iterations
        while sweepsLeft > 0:
            for s in mdp.getStates():
                if mdp.isTerminal(s) != 0:
                    continue
                # NOTE(review): computeQValueFromValues is defined
                # elsewhere; presumably it reads self.ValuesDup so these
                # in-place writes do not affect the current sweep -- confirm.
                self.values[s] = max([self.computeQValueFromValues(s, a)
                                      for a in mdp.getPossibleActions(s)])
            for s in self.values:
                self.ValuesDup[s] = self.values[s]
            sweepsLeft -= 1
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Store the MDP and settings, then run ``iterations`` sweeps of
          batch value iteration.

          mdp:        object offering getStates(), isTerminal(state),
                      getPossibleActions(state),
                      getTransitionStatesAndProbs(state, action) and
                      getReward(state, action, nextState)
          discount:   discount factor applied to successor values
          iterations: number of full sweeps to perform

          Terminal states, and states without any possible action, keep
          the Counter default of 0.  Every other state gets
              max_a sum_{s'} T(s, a, s') * (R(s, a, s') + discount * V_prev(s'))
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0
        mdpStates = mdp.getStates()

        # range() works on both Python 2 and 3 (xrange is Python 2 only).
        for _ in range(iterations):
            newValues = util.Counter()
            for state in mdpStates:
                if self.mdp.isTerminal(state):
                    continue
                qValues = []
                for action in mdp.getPossibleActions(state):
                    qValue = 0
                    for transitionState, prob in mdp.getTransitionStatesAndProbs(state, action):
                        qValue += prob * (mdp.getReward(state, action, transitionState)
                                          + discount * self.values[transitionState])
                    qValues.append(qValue)
                # Taking max() of the real Q-values avoids a numeric
                # sentinel that could leak into, or clip, the result.
                if qValues:
                    newValues[state] = max(qValues)
            self.values = newValues
Example #33
0
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Store the MDP and settings, then run ``self.iterations`` sweeps
          of batch value iteration.

          A state's new value is the largest result of
          ``self.computeQValueFromValues`` (defined elsewhere on the
          class) over its possible actions, or 0 when it has none.  The
          table of a sweep replaces ``self.values`` once it is complete.
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        for _ in range(self.iterations):
            fresh = util.Counter()
            for state in mdp.getStates():
                legal = mdp.getPossibleActions(state)
                if len(legal):
                    fresh[state] = max(
                        [self.computeQValueFromValues(state, a) for a in legal])
                else:
                    fresh[state] = 0
            self.values = fresh
 def __init__(self, mdp, discount = 0.9, iterations = 100):
   """
     Store the MDP and settings, then run ``iterations`` sweeps of batch
     value iteration.

     mdp:        object offering getStates(), getPossibleActions(state),
                 getTransitionStatesAndProbs(state, action) and
                 getReward(state, action, nextState)
     discount:   discount factor, stored as self.discount
     iterations: number of full sweeps to perform

     Each sweep computes, from the previous sweep's ``self.values``,
         Q(s, a) = sum_{s'} T(s, a, s') * (R(s, a, s') + discount * V(s'))
     for every action, stores the entry picked by the Counter's
     ``argMax()`` per state, and then replaces ``self.values``.
   """
   self.mdp = mdp
   self.discount = discount
   self.iterations = iterations
   self.values = util.Counter() # A Counter is a dict with default 0

   for _ in range(iterations):
     nextValues = util.Counter()
     for state in mdp.getStates():
       rewardsPossible = util.Counter()
       for action in mdp.getPossibleActions(state):
         # The reward is looked up for the successor state itself.
         rewardsPossible[action] = sum(
             prob * (mdp.getReward(state, action, nextState)
                     + self.discount * self.values[nextState])
             for nextState, prob in mdp.getTransitionStatesAndProbs(state, action))
       nextValues[state] = rewardsPossible[rewardsPossible.argMax()]
     # Publish inside the loop so the next sweep builds on this one.
     self.values = nextValues
Example #35
0
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Store the MDP and settings, then run ``self.iterations`` sweeps
          of value iteration.

          mdp:        object offering getStates() and getPossibleActions(state)
          discount:   discount factor, stored on the instance
          iterations: number of full sweeps to perform

          In a sweep every state is assigned the entry chosen by
          ``argMax()`` from a Counter of ``self.getTotalReward(state,
          action)`` results; afterwards the whole table is mirrored into
          ``self.previousValues``.
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0
        self.previousValues = util.Counter()

        gameStates = mdp.getStates()

        # range() works on both Python 2 and 3 (xrange is Python 2 only).
        for _ in range(self.iterations):
            for state in gameStates:
                totalRewardPerAction = util.Counter()
                for action in mdp.getPossibleActions(state):
                    # NOTE(review): getTotalReward is defined elsewhere;
                    # presumably it reads self.previousValues -- confirm.
                    totalRewardPerAction[action] = self.getTotalReward(
                        state, action)
                self.values[state] = totalRewardPerAction[
                    totalRewardPerAction.argMax()]
            for state in gameStates:
                self.previousValues[state] = self.values[state]
  def __init__(self, mdp, discountRate = 0.9, iters = 100):
    """
      Store the MDP and settings, then run ``iters`` sweeps of batch
      value iteration.

      Update rule:
          V_k+1(s) := max_a sum_(s') T(s,a,s')[R(s,a,s') + gV_k(s')]
      where the per-action term comes from ``self.getQValue`` (defined
      elsewhere on the class).  Terminal states are left untouched.
    """
    self.mdp = mdp
    self.discountRate = discountRate
    self.iters = iters
    self.values = util.Counter() # every state starts at the default 0

    everyState = mdp.getStates()
    for _ in range(iters):
      # Work on a copy so look-ups during the sweep still see V_k.
      pending = self.values.copy()
      for s in everyState:
        if not self.mdp.isTerminal(s):
          scores = [self.getQValue(s, a)
                    for a in self.mdp.getPossibleActions(s)]
          pending[s] = max(scores)
      self.values = pending.copy()
Example #37
0
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Store the MDP and settings, then run ``iterations`` sweeps of
          batch value iteration.

          Per state, the backed-up value of every action is accumulated
          in a Counter and the entry selected by its ``argMax()`` is
          written to ``self.values``.
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        for _ in range(iterations):
            # Values from the previous iteration.
            lastSweep = self.values.copy()
            for state in mdp.getStates():
                tally = util.Counter()
                for action in mdp.getPossibleActions(state):
                    outcomes = mdp.getTransitionStatesAndProbs(state, action)
                    for successor, weight in outcomes:
                        payoff = (mdp.getReward(state, action, successor) +
                                  discount * lastSweep[successor])
                        tally[action] += weight * payoff
                top = tally.argMax()
                self.values[state] = tally[top]
Example #38
0
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Store the MDP and settings, then run ``iterations`` sweeps of
          value iteration.

          Non-terminal states are updated in place in ``self.values``
          with the maximum of -infinity and the results of
          ``self.computeQValueFromValues`` over their possible actions.
          ``self.old_values`` is refreshed with a copy after each sweep.
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        self.old_values = self.values.copy()
        for _ in range(iterations):
            for s in mdp.getStates():
                if mdp.isTerminal(s):
                    continue
                # NOTE(review): computeQValueFromValues is defined
                # elsewhere; presumably it reads self.old_values -- confirm.
                candidates = [float('-inf')]
                candidates.extend(self.computeQValueFromValues(s, a)
                                  for a in mdp.getPossibleActions(s))
                self.values[s] = max(candidates)
            self.old_values = self.values.copy()
  def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
      Store the MDP and settings, then run ``iterations`` sweeps of batch
      value iteration.

      Besides ``self.values`` the sweep also records, for every
      non-terminal state, the maximising action in ``self.bestact`` and
      each computed Q-value in ``self.qvalues[state, action]``.
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter() # A Counter is a dict with default 0
    self.qvalues = util.Counter()
    self.bestact = util.Counter()

    allStates = mdp.getStates()

    for _ in range(iterations):
        fresh = util.Counter()
        for state in allStates:
            if mdp.isTerminal(state):
                continue
            qByAction = {}
            for action in mdp.getPossibleActions(state):
                outcomes = mdp.getTransitionStatesAndProbs(state, action)
                qByAction[action] = sum(
                    prob * (mdp.getReward(state,action,next_state) + discount*self.values[next_state])
                    for next_state, prob in outcomes)
            chosen = max(qByAction, key=qByAction.get)
            self.bestact[state] = chosen
            fresh[state] = qByAction[chosen]
            for action in qByAction.keys():
                self.qvalues[state,action] = qByAction[action]
        self.values = fresh.copy()
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Store the MDP and settings, then run ``self.iterations`` sweeps
          of batch value iteration.

          V*(s) = max_a sum_s' T(s, a, s')[R(s, a, s') + gV*(s')], where
          the per-action term is ``self.computeQValueFromValues`` (defined
          elsewhere on the class).  States without any possible action
          receive no entry and therefore read as 0.
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        states = mdp.getStates()
        for _ in range(self.iterations):
            fresh = util.Counter()
            for state in states:
                scores = [self.computeQValueFromValues(state, a)
                          for a in self.mdp.getPossibleActions(state)]
                if scores:
                    # -inf leads the list so the running maximum starts there.
                    fresh[state] = max([float("-inf")] + scores)

            self.values = fresh
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Store the MDP and settings, then run ``self.iterations`` sweeps
          of batch value iteration.

          For each non-terminal state the new value is the maximum of
          -infinity and the results of ``self.computeQValueFromValues``
          (defined elsewhere on the class) over its possible actions.
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        for _ in range(self.iterations):
            # Start from a copy: terminal states are skipped below and so
            # keep their previous entry, and self.values stays unchanged
            # until the sweep is finished.
            updated = self.values.copy()
            for state in mdp.getStates():
                if mdp.isTerminal(state):
                    continue
                scores = [self.computeQValueFromValues(state, a)
                          for a in mdp.getPossibleActions(state)]
                updated[state] = max([float("-inf")] + scores)
            self.values = updated
Example #42
0
 def __init__(self, mdp, discount=0.9, iterations=100):
     """
     Store the MDP and settings, then run ``iterations`` sweeps of batch
     value iteration.

     mdp:        object offering getStates(), getPossibleActions(state),
                 getTransitionStatesAndProbs(state, action) and
                 getReward(state, action, nextState)
     discount:   discount factor applied to successor values
     iterations: number of full sweeps to perform

     Every state with at least one possible action receives
         max_a sum_{s'} T(s, a, s') * (R(s, a, s') + discount * V_prev(s'))
     States without actions keep the Counter default of 0.
     """
     self.mdp = mdp
     self.discount = discount
     self.iterations = iterations
     self.values = util.Counter()  # A Counter is a dict with default 0

     for _ in range(iterations):
         newValues = util.Counter()
         for state in mdp.getStates():
             qvalues = [
                 sum([
                     (discount * self.values[newState] +
                      mdp.getReward(state, action, newState)) * prob
                     for newState, prob in mdp.getTransitionStatesAndProbs(
                         state, action)
                 ])
                 for action in mdp.getPossibleActions(state)
             ]
             if qvalues:
                 newValues[state] = max(qvalues)
         # Replace the table wholesale.  Copying back key-by-key over
         # self.values would skip every state it has no key for yet
         # (it starts out empty).
         self.values = newValues
Example #43
0
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Store the MDP and settings, then run ``iterations`` sweeps of
          batch value iteration.

          For each non-terminal state the results of
          ``self.computeQValueFromValues`` (defined elsewhere on the
          class) are collected per action and their maximum becomes the
          state's new value.  New values are copied into ``self.values``
          only after the sweep is complete.
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        for _ in range(iterations):
            fresh = util.Counter()
            for state in mdp.getStates():
                if mdp.isTerminal(state):
                    continue
                perAction = util.Counter()
                for act in mdp.getPossibleActions(state):
                    perAction[act] = self.computeQValueFromValues(state, act)
                fresh[state] = max(perAction.values())
            for state in fresh:
                self.values[state] = fresh[state]
Example #44
0
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Store the MDP and settings, then run ``iterations`` sweeps of
          batch value iteration.

          mdp:        object offering getStates(), isTerminal(state) and
                      getPossibleActions(state)
          discount:   discount factor, stored on the instance
          iterations: number of full sweeps to perform

          Each non-terminal state with at least one possible action gets
          the largest ``self.getQValue(state, action)`` (defined elsewhere
          on the class); all other states keep their previous value.
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        for _ in range(iterations):
            # Copy so that skipped states carry their value forward and
            # self.values stays frozen during the sweep.
            newValues = self.values.copy()
            for state in mdp.getStates():
                if self.mdp.isTerminal(state):
                    continue

                # Evaluate each action exactly once and take the true
                # maximum; no numeric floor that could clip low Q-values.
                qValues = [self.getQValue(state, action)
                           for action in mdp.getPossibleActions(state)]
                if qValues:
                    newValues[state] = max(qValues)
            self.values = newValues
Example #45
0
  def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
      Store the MDP and settings, then run ``iterations`` sweeps of batch
      value iteration.

      mdp:        object offering getStates(), isTerminal(state) and
                  getPossibleActions(state)
      discount:   discount factor, stored on the instance
      iterations: number of full sweeps to perform

      Terminal states are assigned 0.  Every other state is assigned the
      maximum of ``-INF`` and ``self.getQValue(state, action)`` over its
      possible actions.
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter() # A Counter is a dict with default 0

    # Indentation below uses spaces only; mixing tabs and spaces is
    # rejected by Python 3.
    for _ in range(iterations):
      values = util.Counter()
      for state in mdp.getStates():
        if mdp.isTerminal(state):
          values[state] = 0
        else:
          # INF is expected to be defined at module level (not visible
          # from this method).
          maxValue = -INF
          for action in mdp.getPossibleActions(state):
            maxValue = max(maxValue, self.getQValue(state, action))
          values[state] = maxValue
      self.values = values
Example #46
0
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Store the MDP and settings, then run ``self.iterations`` sweeps
          of batch value iteration.

          A state with at least one possible action gets the largest
          result of ``self.computeQValueFromValues`` (defined elsewhere on
          the class); other states receive no entry and read as 0.
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        for _ in range(self.iterations):
            fresh = util.Counter()
            for state in mdp.getStates():
                scores = []
                for action in mdp.getPossibleActions(state):
                    scores.append(self.computeQValueFromValues(state, action))
                if not scores:
                    continue
                fresh[state] = max(scores)
            self.values = fresh
Example #47
0
  def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
      Store the MDP and settings, then run ``iterations`` sweeps of batch
      value iteration.

      mdp:        object offering getStates(), isTerminal(state),
                  getPossibleActions(state),
                  getTransitionStatesAndProbs(state, action) and
                  getReward(state, action, nextState)
      discount:   discount factor applied to successor values
      iterations: number of full sweeps to perform

      Terminal states, and states without any possible action, keep the
      Counter default of 0.  Every other state gets
          max_a sum_{s'} T(s, a, s') * (R(s, a, s') + discount * V_prev(s'))
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter() # A Counter is a dict with default 0

    for _ in range(iterations):
        nextValues = util.Counter()
        for state in mdp.getStates():
            if mdp.isTerminal(state):
                continue
            qValues = []
            for action in mdp.getPossibleActions(state):
                qValue = 0
                for (nextState, prob) in mdp.getTransitionStatesAndProbs(state, action):
                    reward = mdp.getReward(state, action, nextState)
                    qValue += prob * (reward + discount*self.values[nextState])
                qValues.append(qValue)
            # Only assign when there was something to maximise; this never
            # reuses a maximum left over from a previously visited state.
            if qValues:
                nextValues[state] = max(qValues)
        self.values = nextValues
Example #48
0
  def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
      Store the MDP and settings, then run ``iterations`` sweeps of batch
      value iteration.

      ``self.values`` is updated in place, but every look-up during a
      sweep reads a copy taken at the start of that sweep.  States
      without any possible action are left unchanged.
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter() # A Counter is a dict with default 0

    for _ in range(iterations):
        snapshot = self.values.copy()
        for state in mdp.getStates():
            scores = []
            for action in mdp.getPossibleActions(state):
                total = 0
                for successor, likelihood in mdp.getTransitionStatesAndProbs(state,action):
                    total += likelihood*(mdp.getReward(state,action,successor) + discount*snapshot[successor])
                scores.append(total)
            if not scores:
                continue
            self.values[state] = max(scores)
Example #49
0
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Build the agent and run `iterations` sweeps of batch value
          iteration.

          Each sweep fills a fresh Counter from the values of the previous
          sweep and only then replaces self.values, so no state sees a
          value written during the same sweep.  Terminal states, and states
          that offer no actions, are given the value 0.

          Parameters:
              mdp        -- the Markov decision process to solve
              discount   -- discount factor, stored on self.discount
              iterations -- number of sweeps to run

          mdp / agent methods used:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.isTerminal(state)
              self.computeQValueFromValues(state, action)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        for _ in range(self.iterations):
            next_values = util.Counter()
            for state in mdp.getStates():
                if self.mdp.isTerminal(state):
                    # A terminal state's value never changes.
                    next_values[state] = 0
                    continue
                # computeQValueFromValues reads self.values, which still
                # holds the previous sweep while next_values is being built.
                q_values = [self.computeQValueFromValues(state, action)
                            for action in self.mdp.getPossibleActions(state)]
                # max() of an empty list raises ValueError; a state without
                # actions has no future reward, so it is worth 0.
                next_values[state] = max(q_values) if q_values else 0

            self.values = next_values
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Set up the agent and run `iterations` sweeps of batch value
          iteration.

          Terminal states are skipped, so they keep the Counter default.
          For every other state the stored value is the largest expected
          one-step return over its actions, computed from a snapshot of the
          previous sweep; a non-terminal state without actions gets 0.

          mdp methods used:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        # Created empty; nothing in this method fills it.
        self.optimalActionInState = collections.defaultdict(None)
        for _sweep in range(iterations):
            frozen = self.values.copy()  # values of the previous sweep
            for state in mdp.getStates():
                if not self.mdp.isTerminal(state):
                    if mdp.getPossibleActions(state):
                        best = float("-inf")
                    else:
                        best = 0
                    for action in mdp.getPossibleActions(state):
                        expected = 0
                        for successor, prob in self.mdp.getTransitionStatesAndProbs(state, action):
                            reward = self.mdp.getReward(state, action, successor)
                            expected += prob * (reward + self.discount * frozen[successor])
                        if expected > best:
                            best = expected
                    self.values[state] = best
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Set up the agent and run `iterations` sweeps of batch value
          iteration.

          Each sweep works from a shallow copy of self.values taken at the
          start of the sweep.  A state with at least one action is assigned
          the largest expected one-step return over its actions; a state
          with no actions is left untouched.

          mdp methods used:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        # Write value iteration code here
        for _sweep in range(iterations):
            previous = copy(self.values)
            for state in mdp.getStates():
                actions = mdp.getPossibleActions(state)
                if len(actions) > 0:
                    q_values = []
                    for action in mdp.getPossibleActions(state):
                        # One term per possible successor of (state, action).
                        terms = [
                            prob * (mdp.getReward(state, action, successor) +
                                    discount * previous[successor])
                            for successor, prob in mdp.getTransitionStatesAndProbs(
                                state, action)
                        ]
                        q_values.append(sum(terms))
                    self.values[state] = max(q_values)
Example #52
0
 def __init__(self, mdp, discount = 0.9, iterations = 100):
   """
     Set up the agent and run `iterations` sweeps of batch value
     iteration.

     For every state the expected one-step return of each action is
     accumulated in a scratch Counter keyed by action, using a copy of the
     values from the previous sweep; the state's new value is the entry
     selected by that Counter's argMax().

     mdp methods used:
         mdp.getStates()
         mdp.getPossibleActions(state)
         mdp.getTransitionStatesAndProbs(state, action)
         mdp.getReward(state, action, nextState)
   """
   self.mdp = mdp
   self.discount = discount
   self.iterations = iterations
   self.values = util.Counter() # A Counter is a dict with default 0

   "*** YOUR CODE HERE ***"
   for _sweep in range(iterations):
     # Copy of the previous sweep, so writes below do not affect reads.
     frozen = self.values.copy()
     for state in mdp.getStates():
       q_by_action = util.Counter()
       for action in mdp.getPossibleActions(state):
         for successor, prob in mdp.getTransitionStatesAndProbs(state, action):
           backed_up = mdp.getReward(state, action, successor) + discount * frozen[successor]
           q_by_action[action] += prob * backed_up
       # NOTE(review): with no actions the Counter is empty; this relies on
       # util.Counter.argMax() and the Counter default -- confirm in util.
       best_action = q_by_action.argMax()
       self.values[state] = q_by_action[best_action]
 def __init__(self, mdp, discount = 0.9, iterations = 100):
   """
     Set up the agent and run self.iterations sweeps of value iteration.

     The per-state backup is delegated to mypy.newVal(mdp, state,
     discount, values).  Results of a sweep are collected in a scratch
     Counter and a copy of it becomes self.values once the sweep is done,
     so every backup within a sweep sees the previous sweep's values.

     mdp methods used directly:
         mdp.getStates()
   """
   self.mdp = mdp
   self.discount = discount
   self.iterations = iterations
   self.values = util.Counter() # A Counter is a dict with default 0

   "*** YOUR CODE HERE ***"
   # One scratch Counter is reused; each sweep overwrites every state in it.
   scratch = util.Counter()
   for _sweep in range(self.iterations):
     for state in mdp.getStates():
       backed_up = mypy.newVal(self.mdp, state, self.discount, self.values)
       scratch[state] = backed_up
     # Publish a copy so the next sweep's writes to scratch cannot alias it.
     self.values = scratch.copy()
Example #54
0
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Set up the agent and run `iterations` sweeps of batch value
          iteration for an MDP whose reward depends on the state only.

          For each state with at least one action the new value is
              mdp.getReward(state)
                  + discount * (largest expected previous value over actions)
          computed from a snapshot of the previous sweep.  States with no
          actions are left untouched.

          mdp methods used:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        for _sweep in range(iterations):
            frozen = self.values.copy()  # values of the previous sweep
            for state in mdp.getStates():
                actions = mdp.getPossibleActions(state)
                if len(actions) != 0:
                    best_expected = None
                    for action in actions:
                        expected = 0
                        for successor, prob in mdp.getTransitionStatesAndProbs(state, action):
                            expected += (prob * frozen[successor])
                        # First maximum wins, as with max() over a list.
                        if best_expected is None or expected > best_expected:
                            best_expected = expected
                    self.values[state] = mdp.getReward(state) + (discount * best_expected)
Example #55
0
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Set up the agent and run `iterations` sweeps of batch value
          iteration.

          Each sweep writes into a copy of self.values and swaps it in at
          the end, so self.computeQValueFromValues always reads the values
          of the previous sweep.  Terminal states, and states that offer no
          actions, are given the value 0; every other state gets the
          largest Q-value over its actions, however negative that is.

          Parameters:
              mdp        -- the Markov decision process to solve
              discount   -- discount factor, stored on self.discount
              iterations -- number of sweeps to run

          mdp / agent methods used:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.isTerminal(state)
              self.computeQValueFromValues(state, action)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        for _ in range(iterations):
            next_values = self.values.copy()
            for state in mdp.getStates():
                if mdp.isTerminal(state):
                    next_values[state] = 0
                    continue
                q_values = [self.computeQValueFromValues(state, action)
                            for action in mdp.getPossibleActions(state)]
                # Take the true maximum rather than comparing against a
                # fixed floor, which would clamp very negative Q-values.
                next_values[state] = max(q_values) if q_values else 0
            self.values = next_values
Example #56
0
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Set up the agent and run self.iterations sweeps of batch value
          iteration.

          During a sweep the new values are staged in a helper Counter
          while self.values still holds the previous sweep; the staged
          values are copied into self.values once the sweep is complete.
          Terminal states and states without actions are never staged, so
          they keep the Counter default.

          Parameters:
              mdp        -- the Markov decision process to solve
              discount   -- discount factor, stored on self.discount
              iterations -- number of sweeps to run

          mdp methods used:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0
        helper_vector = util.Counter() # Staging area for batch updating

        for _ in range(self.iterations):
            for state in mdp.getStates():
                if mdp.isTerminal(state):
                    continue
                # Ask for the action list once per state.
                actions = mdp.getPossibleActions(state)
                if not actions:
                    continue
                q_values = []
                for action in actions:
                    transitions = mdp.getTransitionStatesAndProbs(state, action)
                    q_values.append(sum([
                        prob * (mdp.getReward(state, action, next_state)
                                + self.discount * self.values[next_state])
                        for next_state, prob in transitions]))
                # Each action's Q-value is computed exactly once.
                helper_vector[state] = max(q_values)
            for state in helper_vector:
                self.values[state] = helper_vector[state]
Example #57
0
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Set up the agent and run `iterations` sweeps of batch value
          iteration.

          At the start of every sweep the current values are copied into
          self.vks; all backups of that sweep read from self.vks while the
          results are written to self.values.  Per state, the expected
          one-step return of each action is stored in a scratch Counter and
          the entry chosen by its argMax() becomes the state's new value.

          mdp methods used:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        self.vks = util.Counter()
        for _sweep in range(iterations):
            self.vks = self.values.copy()
            for state in mdp.getStates():
                q_by_action = util.Counter()
                for action in mdp.getPossibleActions(state):
                    expected = 0
                    for successor, prob in self.mdp.getTransitionStatesAndProbs(state, action):
                        expected = expected + prob*(self.mdp.getReward(state, action, successor) + self.discount*(self.vks[successor]))
                    # Stored even when there are no transitions, so every
                    # action has an explicit entry.
                    q_by_action[action] = expected
                # NOTE(review): with no actions the Counter is empty; this
                # relies on util.Counter.argMax() and the Counter default.
                self.values[state] = q_by_action[q_by_action.argMax()]
Example #58
0
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Set up the agent and run exactly `iterations` sweeps of batch
          value iteration.

          Each sweep writes into a copy of self.values and swaps it in at
          the end, so self.getQValue always reads the values of the
          previous sweep.  A state gets the largest Q-value over its
          actions; a state with no actions keeps its current value.

          Parameters:
              mdp        -- the Markov decision process to solve
              discount   -- discount factor, stored on self.discount
              iterations -- number of sweeps to run

          mdp / agent methods used:
              mdp.getStates()
              mdp.getPossibleActions(state)
              self.getQValue(state, action)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        "*** YOUR CODE HERE ***"
        # range() exists on both Python 2 and 3; run all requested sweeps.
        for _ in range(iterations):
            next_values = self.values.copy()
            for state in mdp.getStates():
                q_values = [self.getQValue(state, action)
                            for action in mdp.getPossibleActions(state)]
                # Re-storing the old value keeps the state's key present,
                # as the in-place version did.
                next_values[state] = max(q_values) if q_values else next_values[state]
            # Publish only after the whole sweep, so no backup in a sweep
            # sees a value produced by that same sweep.
            self.values = next_values
Example #59
0
 def runValueIteration(self):
     """
       Update self.values one state at a time, in priority-queue order.

       Steps, as written below:
         1. Record self.getpredecessor(s) for every state s.
         2. Push every non-terminal state onto a util.PriorityQueue with
            priority -self.find_difference(s).
         3. Up to self.iterations times: pop a state, set its value to the
            Q-value of the action chosen by self.getAction, then re-queue
            each predecessor whose find_difference exceeds self.theta.
       Returns None; returns early once the queue is empty.

       NOTE(review): find_difference presumably measures how far a state's
       stored value is from its best Q-value, and the priorities are
       negated presumably because util.PriorityQueue pops the smallest
       priority first -- confirm both against their definitions.
     """
     mdp = self.mdp
     state = mdp.getStates()  # the full collection of states, despite the singular name
     predecessor_list = {}
     discount = self.discount  # NOTE(review): bound but never used below
     iterations = self.iterations
     theta = self.theta
     # Map every state to whatever self.getpredecessor reports for it.
     for temp_state in state:
         predecessor_list[temp_state] = self.getpredecessor(temp_state)
     pq = util.PriorityQueue()
     # Seed the queue with every non-terminal state.
     for temp_state in state:
         if not mdp.isTerminal(temp_state):
             pq.push(temp_state, -self.find_difference(temp_state))
     # One popped state is processed per iteration.
     for i in range(iterations):
         if pq.isEmpty():
             return
         cur_state = pq.pop()
         if not mdp.isTerminal(cur_state):
             # New value = Q-value of the action self.getAction selects.
             action = self.getAction(cur_state)
             self.values[cur_state] = self.getQValue(cur_state, action)
         # The value just written may change the predecessors' differences.
         for pre in predecessor_list[cur_state]:
             diff_pre = self.find_difference(pre)
             if diff_pre > theta:
                 pq.update(pre, -diff_pre)