Example 1
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0
        helper_vector = util.Counter() # Copy of vectors to be used for batch updating 
        
        for i in range(self.iterations):
            for state in mdp.getStates():
                if mdp.isTerminal(state):
                    continue
                if mdp.getPossibleActions(state):
                    # seed with the first action's Q-value so the max() below is
                    # correct even when every Q-value is negative
                    firstAction = mdp.getPossibleActions(state)[0]
                    helper_vector[state] = sum([transition[1] * (mdp.getReward(state, firstAction, transition[0]) + self.discount * self.values[transition[0]])
                        for transition in mdp.getTransitionStatesAndProbs(state, firstAction)])
                for action in mdp.getPossibleActions(state):
                    helper_vector[state] = max(helper_vector[state], sum([transition[1] * (mdp.getReward(state, action, transition[0]) + self.discount * self.values[transition[0]])
                        for transition in mdp.getTransitionStatesAndProbs(state, action)]))
            for state in helper_vector:
                self.values[state] = helper_vector[state]
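
Every constructor collected here computes the same batch Bellman backup, starting from V_0(s) = 0 for all states: for each non-terminal state s,

    V_{k+1}(s) = max_a  sum_{s'} T(s, a, s') * [ R(s, a, s') + gamma * V_k(s') ]

where T and R correspond to mdp.getTransitionStatesAndProbs and mdp.getReward, and gamma is the discount factor. Most of the examples stage the "old" values in a copied Counter or second table so that a sweep updates every state from V_k rather than from partially updated values; a few update in place.
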
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        self.optimalActionInState = collections.defaultdict(None)
        for k in range(iterations):
            lastValues = self.values.copy()
            for state in mdp.getStates():
                if self.mdp.isTerminal(state):
                    continue
                maxValue = float("-inf") if mdp.getPossibleActions(state) else 0
                for action in mdp.getPossibleActions(state):
                    theSum = 0
                    for nextState, prob in self.mdp.getTransitionStatesAndProbs(state, action):
                        R = self.mdp.getReward(state, action, nextState)
                        theSum += prob * (R + self.discount * lastValues[nextState])
                    maxValue = max(maxValue,theSum)
                self.values[state] = maxValue
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        for i in range(iterations):
            currentValues = self.values.copy()
            for s in mdp.getStates():
                if not self.mdp.isTerminal(s):
                    # collect the Q-value of every action and keep the best one
                    qValues = [self.getQValue(s, a) for a in mdp.getPossibleActions(s)]
                    currentValues[s] = max(qValues)
            self.values = currentValues
Example 4
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        self.depth = 1
        self.qTable = {}
        self.vTable = {}
        for state in mdp.getStates():
            self.vTable[state] = 0
            self.qTable[state] = {}
            for action in mdp.getPossibleActions(state):
                
                self.qTable[state][action] = 0
        
        while self.depth < self.iterations + 1:
            self.tempTable = {}
            for state in mdp.getStates():
                self.stateValue = 0
                if not mdp.isTerminal(state):
                    self.stateValue = -9999
                    for action in mdp.getPossibleActions(state):
                        self.Qtotal = 0
                        for nextState,prob in mdp.getTransitionStatesAndProbs(state,action):
                            self.reward = mdp.getReward(state, action, nextState)
                            self.Qtotal += prob * (self.reward + self.discount * self.vTable[nextState])
                            #print "###state:",state,"Next",nextState,"reward:",self.reward,"Qtotal",self.Qtotal,"Value:",self.vTable[nextState]
                        self.qTable[state][action] = self.Qtotal
                        #print self.qTable[state][action]
                        self.stateValue = max(self.stateValue,self.qTable[state][action])
                else:
                    self.tempTable[state] = 0
                self.tempTable[state] = self.stateValue
            self.vTable = self.tempTable
            self.depth += 1
            
        for state in mdp.getStates():
            self.stateValue = -9999
            for action in mdp.getPossibleActions(state):
                self.Qtotal = 0
                for nextState,prob in mdp.getTransitionStatesAndProbs(state,action):
                    self.reward = mdp.getReward(state, action, nextState)
                    self.Qtotal += prob * (self.reward + self.discount * self.vTable[nextState])
                self.qTable[state][action] = self.Qtotal
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
       
        
        i = 0
        terminalstates = []
        
        
        while i<iterations:
            nextValues = util.Counter()
            
            for state in mdp.getStates(): 
                
                stateValues = []
                for action in mdp.getPossibleActions(state):
                    sumValue = 0
                    for item in mdp.getTransitionStatesAndProbs(state, action):
                        nextState = item[0]
                        
                        probability = item[1]
                        reward = mdp.getReward(state,action,nextState)
                        
                        #print "reward", reward
                        sumValue = sumValue + (probability * (reward + (discount * self.values[nextState])))
                        
                        #print "SUMVALUE", sumValue
               
                    stateValues.append(sumValue)
                    
                if len(mdp.getPossibleActions(state)) == 0:
                    nextValues[state] = 0
                else: 
                    nextValues[state] = max(stateValues)
                        
            i+=1
            self.values = nextValues
Example 6
  def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
      Your value iteration agent should take an mdp on
      construction, run the indicated number of iterations
      and then act according to the resulting policy.
    
      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter() # A Counter is a dict with default 0
    self.delta = 0
    while(self.iterations > 0):
#         self.delta = 0
        batchValues = util.Counter()
        for state in mdp.getStates():  
            maxM = -10000
                   
            if mdp.isTerminal(state):
                continue 
            for action in mdp.getPossibleActions(state):
                statesProbs = mdp.getTransitionStatesAndProbs(state, action)
                sumU = 0
                Rs = 0
                for stateProb in statesProbs:
#                     if stateProb[0] == 'TERMINAL_STATE':
#                         continue
                    sumU = sumU + self.values[stateProb[0]]*stateProb[1]
                    Rs = Rs + mdp.getReward(state, action, stateProb[0]) * stateProb[1]
#                 if sumU > maxM:
#                     maxM = sumU   
                v = Rs + sumU * discount
                if (v > maxM):
                    maxM = v
            batchValues[state] = maxM
        self.values = batchValues
        self.iterations = self.iterations - 1       
    self.policy = {}
    for state in mdp.getStates():
        if mdp.isTerminal(state):
            self.policy[state] = None
            continue
        QValues = []
        for action in mdp.getPossibleActions(state):
            QValues.append(self.getQValue(state, action))
            self.policy[state] = mdp.getPossibleActions(state)[QValues.index(max (QValues))]
 def __init__(self, mdp, discount = 0.9, iterations = 100):
   """
     Your value iteration agent should take an mdp on
     construction, run the indicated number of iterations
     and then act according to the resulting policy.
   
     Some useful mdp methods you will use:
         mdp.getStates()
         mdp.getPossibleActions(state)
         mdp.getTransitionStatesAndProbs(state, action)
         mdp.getReward(state, action, nextState)
   """
   self.mdp = mdp
   self.discount = discount
   self.iterations = iterations
   self.values = util.Counter() # A Counter is a dict with default 0
    
   "*** YOUR CODE HERE ***"
   currentIterationCounter = 1
   for state in mdp.getStates():
     self.values[state] = mdp.getReward(state, 'Stop', state)
   while (currentIterationCounter != self.iterations):
     newValues = util.Counter()
     for state in mdp.getStates():
       tempValues = util.Counter()
       for action in mdp.getPossibleActions(state):
         for newStateAndProb in mdp.getTransitionStatesAndProbs(state, action):
           newState = newStateAndProb[0]
           prob = newStateAndProb[1]
           tempValues[action] += prob*(mdp.getReward(state, action, newState)+self.discount*self.values[newState])
       newValues[state] = tempValues[tempValues.argMax()]
     currentIterationCounter += 1
     for state in mdp.getStates():
       self.values[state] = newValues[state]
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0
        mdpStates = mdp.getStates()

        for iteration in xrange(iterations):
          newValues = util.Counter()
          for state in mdpStates:
            if self.mdp.isTerminal(state):
              continue
            actionValues = -sys.maxint - 1
            for action in mdp.getPossibleActions(state):
              sum = 0
              for transitionState, prob in mdp.getTransitionStatesAndProbs(state, action):
                sum += prob*(mdp.getReward(state, action, transitionState) + discount * self.values[transitionState])
              if sum > actionValues:
                actionValues = sum
            newValues[state] = actionValues
          self.values = newValues
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        #while still iterations
          #for each state
            #for action in each state
              #get Q(state,action)
            #store largest (state,action) in Counter

        for i in range(self.iterations):
          newValues = self.values.copy()  # copy so the sweep reads the previous iteration's values (batch update)
          for state in mdp.getStates():
            v = [float("-inf")]
            if not mdp.isTerminal(state):
              for action in mdp.getPossibleActions(state):
                v += [self.computeQValueFromValues(state,action)]
              newValues[state] = max(v)
          self.values = newValues
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() 
        
        for i in range(iterations): # running the alg on the indicated number of iterations
            y = self.values.copy() #V sub k-1
            
            for state in mdp.getStates():
                actions = util.Counter()
                
                if  mdp.isTerminal(state) == False:
                    for possibleActions in mdp.getPossibleActions(state):

                        for transitionState, prob in mdp.getTransitionStatesAndProbs(state, possibleActions):
                                value_iteration = prob * (mdp.getReward(state, possibleActions, transitionState) + (discount* y[transitionState]))
                                actions[possibleActions] += value_iteration
                    self.values[state] = actions[actions.argMax()] 
Example 11
  def __init__(self, mdp, discount=0.9, iterations=100):
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0
        allStates = self.mdp.getStates()        

        # Write value iteration code here

        for i in range(iterations):
          interState = util.Counter()
          for state in allStates:
            best = -9999999
            actions = mdp.getPossibleActions(state)
            for action in actions:
              transitions = self.mdp.getTransitionStatesAndProbs(state, action)
              sumTransitions = 0
              for transition in transitions:
                reward = self.mdp.getReward(state, action, transition[0])
                sumTransitions += transition[1]*(reward + discount*self.values[transition[0]])
              best = max(best, sumTransitions)
            if best != -9999999:
              interState[state] = best          
          
          for state in allStates:
            self.values[state] = interState[state]
    def computeActionFromValues(self, state):
        """
          The policy is the best action in the given state
          according to the values currently stored in self.values.

          You may break ties any way you see fit.  Note that if
          there are no legal actions, which is the case at the
          terminal state, you should return None.
        """
        "*** YOUR CODE HERE ***"

        mdp = self.mdp
        possibleActions = mdp.getPossibleActions(state)
        maxActionValue = float('-inf')
        maxAction = None

        if not possibleActions or mdp.isTerminal(state):
            return None

        for action in possibleActions:
            actionSum = self.getQValue(state, action)
                        
            #Find the maximum action
            if maxActionValue < actionSum:
                maxAction = action
                maxActionValue = actionSum

        return maxAction
 def __init__(self, mdp, discount = 0.9, iterations = 100):
   """
     Your value iteration agent should take an mdp on
     construction, run the indicated number of iterations
     and then act according to the resulting policy.
   
     Some useful mdp methods you will use:
         mdp.getStates()
         mdp.getPossibleActions(state)
         mdp.getTransitionStatesAndProbs(state, action)
         mdp.getReward(state, action, nextState)
   """
   self.mdp = mdp
   self.discount = discount
   self.iterations = iterations
   self.values = util.Counter() # A Counter is a dict with default 0
   self.qvalues = util.Counter()
   
   states = mdp.getStates()
   
   for i in range(self.iterations):
       valuesCopy = self.values.copy()
       for state in states:
           actions = mdp.getPossibleActions(state)
           q = []
           for action in actions:
               q.append(self.getQValue(state,action))
           if len(q) == 0:
               valuesCopy[state] = 0
           else: valuesCopy[state] = max(q)
       self.values = valuesCopy
Example 14
 def __init__(self, mdp, discount = 0.9, iterations = 100):
   """
     Your value iteration agent should take an mdp on
     construction, run the indicated number of iterations
     and then act according to the resulting policy.
   
     Some useful mdp methods you will use:
         mdp.getStates()
         mdp.getPossibleActions(state)
         mdp.getTransitionStatesAndProbs(state, action)
         mdp.getReward(state, action, nextState)
   """
   "*** YOUR CODE HERE ***"
   self.mdp = mdp
   self.discount = discount
   self.iterations = iterations
   self.values = util.Counter() # value of each state; a Counter is a dict with default 0
   
   # run for desired number of iterations
   for i in xrange(iterations):
     new_values = self.values.copy()
     for s in mdp.getStates():
       if not mdp.isTerminal(s):
         # the commented code works as well
         #curr_best = float("-inf")
         #for a in mdp.getPossibleActions(s):
         #temp_value = sum([p * (mdp.getReward(s, a, s2) + discount*prev[s2]) for s2, p in mdp.getTransitionStatesAndProbs(s, a)])
         #  if temp_value > curr_best:
         #    curr_best = temp_value
         #self.values[s] = curr_best       
         new_values[s] = max([self.getQValue(s, a) for a in mdp.getPossibleActions(s)])  
     self.values = new_values
Example 15
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        #states = mdp.getStates()
        #values = {state: 0 for state in states}
        for i in range(iterations):
            previous = self.values.copy()
            for state in mdp.getStates():
                possibleActions = mdp.getPossibleActions(state)
                if len(possibleActions) == 0: continue
                results = []
                for action in possibleActions:
                    total = 0
                    for (nextState, prob) in mdp.getTransitionStatesAndProbs(state,action):
                        total += (prob * previous[nextState])
                    results.append(total)
                self.values[state] = mdp.getReward(state) + (discount * max(results))
Example 16
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        for i in range(iterations):
            valuesNew = util.Counter()
            for state in mdp.getStates():
                maxVal = -1
                if not mdp.isTerminal(state):
                    vals = util.Counter()
                    for possact in mdp.getPossibleActions(state):
                        #value = self.computeQValueFromValues(state, possact)
                        #if value > maxVal:
                        #    maxVal = value
                        vals[possact] = self.computeQValueFromValues(state, possact)
                    #valuesNew[state] = maxVal
                    valuesNew[state] = max(vals.values())
            for st2 in valuesNew:
              self.values[st2] = valuesNew[st2]
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0
        while self.iterations > 0:
            prev_values = self.values.copy()
            for state in mdp.getStates():
                actions = mdp.getPossibleActions(state)
                if not actions:
                    continue
                self.values[state] = max([sum([prob*(mdp.getReward(state, act, state1) + discount*prev_values[state1])
                                               for state1, prob in mdp.getTransitionStatesAndProbs(state, act)])
                                          for act in actions])
            self.iterations -= 1
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0
        self.ValuesDup = util.Counter()
        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        iterations  = self.iterations
        while(iterations >0):
            for astate in mdp.getStates():
                if not mdp.isTerminal(astate):
                    
                    QVallist=[]
                    for action in mdp.getPossibleActions(astate):  
                        QVallist += [self.computeQValueFromValues(astate, action)]   
                    self.values[astate]=max(QVallist)
            for states,value in self.values.items():
                self.ValuesDup[states] = self.values[states]
            iterations -= 1
Example 19
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        states = mdp.getStates()
        for k in range(iterations):
          newValues = {}
          for state in states:
            actions = mdp.getPossibleActions(state)
            v = util.Counter()
            for action in actions:
              v[action] = self.computeQValueFromValues(state, action)
            newValues[state] = v[v.argMax()]
          self.values = newValues
Example 20
 def __init__(self, mdp, discount = 0.9, iterations = 100):
   """
     Your value iteration agent should take an mdp on
     construction, run the indicated number of iterations
     and then act according to the resulting policy.
   
     Some useful mdp methods you will use:
         mdp.getStates()
         mdp.getPossibleActions(state)
         mdp.getTransitionStatesAndProbs(state, action)
         mdp.getReward(state, action, nextState)
   """
   self.mdp = mdp
   self.discount = discount
   self.iterations = iterations
   self.values = util.Counter() # A Counter is a dict with default 0
    
   "*** YOUR CODE HERE ***"
   for times in range(iterations):
       V = self.values.copy()
       for state in mdp.getStates():
           action_values = util.Counter()
           for action in mdp.getPossibleActions(state):
               for trans_state, prob in mdp.getTransitionStatesAndProbs(state, action):
                   action_values[action] += prob * (mdp.getReward( state, action, trans_state) + discount * V[trans_state])
           self.values[state] = action_values[action_values.argMax()]
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        state = mdp.getStartState()
        for i in range(0,iterations):
            #print "iteration: ", i
            #iterate once through all states and actions, save q-values
            for state in mdp.getStates():
                for action in mdp.getPossibleActions(state):
                    #compute qValue for each action
                    qValue = self.getQValue(state, action)
                    self.values[(state,action)] = qValue
            #after all qValues are computed, iterate againt through states, save value from optimal policy. these values will be V* for next iteration
            for state in mdp.getStates():
                action = self.getAction(state)
                self.values[state] = self.values[(state, action)] 

        """
Example 22
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        self.vks = util.Counter()
        for i in range(0,iterations):
            self.vks = self.values.copy()
            st = mdp.getStates()
            for s in st:
              a = mdp.getPossibleActions(s)
              qvals = util.Counter()
              for action in a:
                  qvals[action] = 0
                  stp = self.mdp.getTransitionStatesAndProbs(s,action)
                  for ss, prob in stp:
                      qvals[action] = qvals[action] + prob*(self.mdp.getReward(s,action,ss) + self.discount*(self.vks[ss]))
              self.values[s] = qvals[qvals.argMax()]
Example 23
  def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
      Your value iteration agent should take an mdp on
      construction, run the indicated number of iterations
      and then act according to the resulting policy.
    
      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter() # A Counter is a dict with default 0
     
    "*** YOUR CODE HERE ***"
    # Init : Not required

    # Value iteration
    for i in range(iterations):
        old_values = self.values.copy()
        for state in mdp.getStates():
            value_state_action = []
            for action in mdp.getPossibleActions(state):
                val = 0 
                transition = mdp.getTransitionStatesAndProbs(state,action)
                for sstate,prob_s_a_ss in transition:
                    val += prob_s_a_ss*(mdp.getReward(state,action,sstate) + discount*old_values[sstate])
                value_state_action.append(val)
            if value_state_action : self.values[state] = max(value_state_action)
Example 24
  def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
      Your value iteration agent should take an mdp on
      construction, run the indicated number of iterations
      and then act according to the resulting policy.
    
      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter() # A Counter is a dict with default 0
     
    "*** YOUR CODE HERE ***"
    
    for time in range(iterations):
      values = util.Counter()
      for state in mdp.getStates():
        if mdp.isTerminal(state):
          values[state] = 0
        else:
          maxValue = float("-inf")  # was -INF, which is undefined
          for action in mdp.getPossibleActions(state):
            maxValue = max(maxValue, self.getQValue(state, action))
          values[state] = maxValue
      self.values = values
Example 25
  def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
      Your value iteration agent should take an mdp on
      construction, run the indicated number of iterations
      and then act according to the resulting policy.
    
      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
          
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter() # A Counter is a dict with default 0

    for i in range(iterations):
        nextValues = util.Counter()
        for state in mdp.getStates(): 
            if mdp.isTerminal(state): continue
            first = True
            for action in mdp.getPossibleActions(state):
                qValue = 0
                for (nextState, prob) in mdp.getTransitionStatesAndProbs(state, action):
                    reward = mdp.getReward(state, action, nextState)
                    qValue += prob * (reward + discount*self.values[nextState])
                if first:
                    maxQValue = qValue
                    first = False
                elif qValue > maxQValue:
                    maxQValue = qValue
            nextValues[state] = maxQValue
        self.values = nextValues
Example 26
 def __init__(self, mdp, discount = 0.9, iterations = 100):
   """
     Your value iteration agent should take an mdp on
     construction, run the indicated number of iterations
     and then act according to the resulting policy.
   
     Some useful mdp methods you will use:
         mdp.getStates()
         mdp.getPossibleActions(state)
         mdp.getTransitionStatesAndProbs(state, action)
         mdp.getReward(state, action, nextState)
   """
   self.mdp = mdp
   self.discount = discount
   self.iterations = iterations
   self.values = util.Counter() # A Counter is a dict with default 0
    
   "*** YOUR CODE HERE ***"
   # OUR CODE HERE
   #Note: I think we should use the util.Counter thing?
   for times in range(0, iterations):
     #values from previous iteration so we don't update over them while iterating
     prevVals = self.values.copy()
     #iterate through all states
     for state in mdp.getStates():
       #will store the action-value for the iteration
       value = util.Counter()
       for action in mdp.getPossibleActions(state):
         for transitionState, probability in mdp.getTransitionStatesAndProbs(state, action):
           #expected value, probability * reward for the state with the discount * reward
           value[action] += probability * (mdp.getReward( state, action, transitionState) + discount * prevVals[transitionState])
       #update the values to the new value from the iteration
       #the .argMax() function returns the one with the largest value
       self.values[state] = value[value.argMax()]
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Author - Shandheap Shanmuganathan
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default values as 0
        self.count = 1
        while self.count <= iterations:
          for state in mdp.getStates():
            possibleActions = mdp.getPossibleActions(state)
            if len(possibleActions) == 0:
              continue
            QValues = {}
            for action in possibleActions:
              if action == "exit":
                finalScore = self.mdp.getReward(state, action, 'TERMINAL_STATE')
                self.values[state, self.count] = finalScore
                continue
              else:
                QValues[action] = self.getQValue(state, action)
            maxAction = None
            maxQ = -sys.maxint - 1
            for key, value in QValues.iteritems():
              if value > maxQ:
                maxAction = key
                maxQ = value
            if maxQ != -sys.maxint - 1:
              self.values[state, self.count] = maxQ
          self.count += 1
Example 28
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0
        # Write value iteration code here
        while self.iterations > 0:
            junk = self.values.copy()
            for state in self.mdp.getStates():
                garbage = {}
                for action in mdp.getPossibleActions(state):
                    garbage[action] = 0
                    for (nextState, prob) in self.mdp.getTransitionStatesAndProbs(state, action):
                        garbage[action] += prob * (mdp.getReward(state, action, nextState) + self.discount * junk[nextState])
                try:
                    self.values[state] = max(garbage.values())
                except ValueError:
                    self.values[state] = 0
            self.iterations -= 1
Example 29
 def __init__(self, mdp, discount = 0.9, iterations = 100):
   """
     Your value iteration agent should take an mdp on
     construction, run the indicated number of iterations
     and then act according to the resulting policy.
   
     Some useful mdp methods you will use:
         mdp.getStates()
         mdp.getPossibleActions(state)
         mdp.getTransitionStatesAndProbs(state, action)
         mdp.getReward(state, action, nextState)
   """
   self.mdp = mdp
   self.discount = discount
   self.iterations = iterations
   self.values = util.Counter() # A Counter is a dict with default 0
   
   "*** YOUR CODE HERE ***"
   for n in range(iterations):
       V = self.values.copy()
       for s in mdp.getStates():
           action_values = []
           for a in mdp.getPossibleActions(s):
               action_value = 0
               for s_, P in mdp.getTransitionStatesAndProbs(s, a):
                   action_value += P * (mdp.getReward(s, a, s_) + discount * V[s_])
               action_values.append(action_value)
           self.values[s] = max(action_values or [0])
Example 30
 def __init__(self, mdp, discount = 0.9, iterations = 100):
   """
     Your value iteration agent should take an mdp on
     construction, run the indicated number of iterations
     and then act according to the resulting policy.
   
     Some useful mdp methods you will use:
         mdp.getStates()
         mdp.getPossibleActions(state)
         mdp.getTransitionStatesAndProbs(state, action)
         mdp.getReward(state, action, nextState)
   """
   self.mdp = mdp
   self.discount = discount
   self.iterations = iterations
   self.values = util.Counter() # A Counter is a dict with default 0
   
   for i in range(iterations):
       lastValues = copy.deepcopy(self.values)
       for s in mdp.getStates():
           aCounter = util.Counter()
           for a in mdp.getPossibleActions(s):
               for s2 in mdp.getStates():
                   aCounter[a] += self.T(s,a,s2) * (mdp.getReward(s,a,s2) + discount*lastValues[s2])
           self.values[s] = aCounter[aCounter.argMax()]
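
The example above relies on a self.T(s, a, s2) helper that is not shown. A minimal sketch of what such a helper could look like, assuming it simply wraps mdp.getTransitionStatesAndProbs (hypothetical, not part of the original submission):

 def T(self, state, action, nextState):
   # hypothetical helper: probability of reaching nextState by taking
   # action in state (0.0 if nextState is not a listed successor)
   for successor, prob in self.mdp.getTransitionStatesAndProbs(state, action):
       if successor == nextState:
           return prob
   return 0.0

Looping over every state in the inner sum, as the example does, is much slower than looping only over the successors returned by getTransitionStatesAndProbs, but it yields the same values since T is 0 elsewhere.
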
Example 31
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        # Write value iteration code here

        #Grab State from MDP
        State = self.mdp.getStates()[2]

        #Grab the NextState from TransitionStates based on the current state
        NextState = mdp.getTransitionStatesAndProbs(
            State,
            mdp.getPossibleActions(State)[0])

        #List of all States
        States = self.mdp.getStates()

        #MDP States
        MDPStates = self.mdp.getStates()

        for i in range(0, self.iterations):

            TemporaryValue = util.Counter()

            for State in MDPStates:

                #Is the state a terminal state?
                if self.mdp.isTerminal(State):
                    TemporaryValue[State] = 0

                else:
                    MaxPossibleValue = float("-inf")

                    for Action in self.mdp.getPossibleActions(State):
                        #Score is initially 0
                        Score = 0

                        for NextState, Probability in self.mdp.getTransitionStatesAndProbs(
                                State, Action):
                            #Bellman equation
                            Score += Probability * (
                                self.mdp.getReward(State, Action, NextState) +
                                (self.discount * self.values[NextState]))

                        #Set max possible value to whichever is greater, old value or current score.
                        MaxPossibleValue = max(Score, MaxPossibleValue)

                        #Update the temporary value of the state (for next iteration)
                        TemporaryValue[State] = MaxPossibleValue
            self.values = TemporaryValue
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
      Your value iteration agent should take an mdp on
      construction, run the indicated number of iterations
      and then act according to the resulting policy.
    
      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
    """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0
        self.history = {}
        for state in mdp.getStates():
            self.history[state] = []

        for i in range(iterations + 1):
            for state in mdp.getStates():
                if i == 0:
                    self.history[state].append(0)
                    continue

                if mdp.isTerminal(state):
                    self.history[state].append(0)
                    continue

                actions = mdp.getPossibleActions(state)
                if 'exit' in actions:
                    self.history[state].append(
                        mdp.getReward(
                            state, 'exit',
                            mdp.getTransitionStatesAndProbs(state,
                                                            'exit')[0][0]))

                    continue

                best = -99999999  # renamed from "max" to avoid shadowing the builtin
                for action in actions:
                    statesAndProbs = mdp.getTransitionStatesAndProbs(
                        state, action)
                    tempMax = 0
                    for (s, p) in statesAndProbs:
                        tempMax += p * (mdp.getReward(state, action, s) +
                                        self.discount * self.history[s][i - 1])

                    if tempMax > best:
                        best = tempMax

                self.history[state].append(best)

        #print history
        for key, value in self.history.iteritems():
            #print (key, value)
            self.values[key] = value[iterations]

        print self.values

        #print self.values
        "*** YOUR CODE HERE ***"
Example 33
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        print("using discount {}".format(discount))
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0
        self.policies = util.Counter()  # A Counter is a dict with default 0
        delta = 0.01
        # TODO: Implement Policy Iteration.
        # Exit either when the number of iterations is reached,
        # OR until convergence (L2 distance < delta).
        # Print the number of iterations to convergence.
        # To make the comparison FAIR, one iteration is a single sweep over states.
        # Compute the number of steps until policy convergence, but do not stop
        # the algorithm until values converge. #TODO

        # Init values
        for s in mdp.getStates():
            self.values[s] = 0
            if mdp.isTerminal(s):
                continue
            self.policies[s] = mdp.getPossibleActions(s)[0]

        state_iters = 0  # Iterations over state space until policy convergerce
        policy_iters = 0  # Iterations over algorithm until policy convergerce
        algo_iters = 0

        def L2_norm(v1, v2):
            dist = 0
            for k in v1.keys():
                dist += (v1[k] - v2[k])**2
            return dist**(1 / 2)

        policy_stable = False
        values_converged = False

        while not values_converged and algo_iters != iterations:
            # Policy Evaluation
            dist = delta
            while dist >= delta:
                old_values = self.values.copy()
                for s in mdp.getStates():
                    # Skip terminal state
                    if mdp.isTerminal(s):
                        continue
                    v = self.values[s]
                    new_v = 0

                    for s_n, p in mdp.getTransitionStatesAndProbs(
                            s, self.policies[s]):
                        new_v += p * (mdp.getReward(s, self.policies[s], s_n) +
                                      discount * self.values[s_n])
                    self.values[s] = new_v

                # Calculate the new distance
                dist = L2_norm(self.values, old_values)
                if not policy_stable:
                    state_iters += 1
            values_converged = True

            # Policy Improvement
            if not policy_stable:
                policy_iters += 1
                state_iters += 1

            policy_stable = True
            for s in mdp.getStates():
                if mdp.isTerminal(s):
                    continue

                old_action = self.policies[s]

                p_list = list()
                possible_actions = mdp.getPossibleActions(s)
                for a in possible_actions:
                    v_sum = 0
                    for s_n, p in mdp.getTransitionStatesAndProbs(s, a):
                        v_sum += p * (mdp.getReward(s, a, s_n) +
                                      discount * self.values[s_n])
                    p_list.append(v_sum)
                # Assign the maximum value to the current state
                self.policies[s] = possible_actions[np.argmax(p_list)]

                if old_action != self.policies[s]:
                    policy_stable = False
                    values_converged = False

            algo_iters += 1

        print(
            f"Policy Iteration: {state_iters} iterations over the state space")
        print(
            f"Policy Iteration: {policy_iters} iterations until policy convergence"
        )
        print("using discount {}".format(discount))
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        delta = 0.01

        for iteration in range(self.iterations):
            temp_values = util.Counter()
            l2_distance = 0
            for state in self.mdp.getStates():
                value = -np.inf
                if mdp.isTerminal(state):
                    temp_values[state] = 0
                    continue
                for action in mdp.getPossibleActions(state):
                    list = mdp.getTransitionStatesAndProbs(state, action)
                    tmp_value = 0
                    for pair in list:
                        tmp_value += pair[1] * (mdp.getReward(state, action, pair[0]) + self.discount * self.values[pair[0]])
                    value = max(value, tmp_value)
                temp_values[state] = value
                l2_distance = max(l2_distance, np.linalg.norm(value - self.values[state]))

            if l2_distance < delta:
                print(iteration)
                break
            self.values = temp_values

    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"

        max_err = 0.001
        iteration_number = 1
        #U1 = {s:0 for s in mdp.getStates() }
        U = util.Counter()
        while (iteration_number <= self.iterations):
            U = self.values.copy()
            delta = 0

            for s in mdp.getStates():
                T = []
                if (self.mdp.isTerminal(s)):
                    self.values[s] = 0

                else:
                    max_a = mdp.getPossibleActions(s)[0]
                    max_sum = -999999999990
                    for a in mdp.getPossibleActions(s):
                        sum_for_a = 0

                        # T will store a list of (nextState,prob)
                        T = mdp.getTransitionStatesAndProbs(s, a)

                        for pair in T:
                            sum_for_a += pair[1] * U[pair[0]]

                        if (max_sum < sum_for_a):
                            max_sum = sum_for_a
                            max_a = a

                    # pass a successor state, not a (state, prob) pair, to getReward;
                    # keeping the reward outside the sum assumes it does not depend on
                    # the successor, which holds for this project's Gridworld
                    self.values[s] = mdp.getReward(
                        s, max_a, T[0][0]) + self.discount * max_sum

                delta = max(delta, abs(self.values[s] - U[s]))

            if (delta <= max_err * (1 - self.discount) / self.discount):
                for key in U:
                    self.values[key] = U[key]
                break

            #last line inside while
            iteration_number += 1
Example 36
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
      Your value iteration agent should take an mdp on
      construction, run the indicated number of iterations
      and then act according to the resulting policy.
    
      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
    """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        "*** YOUR CODE HERE ***"
        #for state in mdp.getStates()
        #   if Actions.exit in mdp.getPossibleActions(state):
        #   self.values[state] = mdp.getReward(state, action, mdp.getTransitionStatesAndProbs(state, action)[0][0])
        #   WARNING: Assumes the last method call only returns a list of one (state, prob) pair
        #for iter in range(1, iterations):
        #   oldValues = self.values
        #   newValues = util.Counter()
        #   for state in mdp.getStates():
        #       newValues[state] = max([(sum([stateAndProb[1]*(mdp.getReward(state, action, stateAndProb[0]) + self.discount * oldValues[stateAndProb[0]]) for stateAndProb in mdp.getTransitionStatesAndProbs(state, action)])), action) for action in mdp.getPossibleActions(state)], key=lambda x: x[0])
        #   self.values = newValues

        self.qValues = util.Counter()  #To hold Q-values
        #FIXME Eliminate magic numbers
        for state in mdp.getStates():
            if 'exit' in mdp.getPossibleActions(state):
                self.values[state] = mdp.getReward(
                    state, 'exit',
                    mdp.getTransitionStatesAndProbs(state, 'exit')[0][0])
        for iter in range(1, iterations):
            #print "Iter:", iter
            oldValues, newValues = self.values, util.Counter()
            oldQValues, newQValues = self.qValues, util.Counter()
            for state in [
                    state1 for state1 in self.mdp.getStates()
                    if str(state1) != "TERMINAL_STATE"
            ]:
                #print "State:", state
                valueActionPairs = [(sum([
                    stateAndProb[1] *
                    (self.mdp.getReward(state, action, stateAndProb[0]) +
                     self.discount * oldValues[stateAndProb[0]])
                    for stateAndProb in self.mdp.getTransitionStatesAndProbs(
                        state, action)
                ]), action) for action in mdp.getPossibleActions(state)]
                for action in mdp.getPossibleActions(state):
                    newQValues[(state, action)] = [
                        valueActionPair[0]
                        for valueActionPair in valueActionPairs
                        if valueActionPair[1] == action
                    ][0]
                    #FIXME Assumes there is only one
                newValues[state] = max(valueActionPairs, key=lambda x: x[0])[0]
            self.values, self.qValues = newValues, newQValues
Esempio n. 37
0
        display = graphicsGridworldDisplay.GraphicsGridworldDisplay(mdp, opts.gridSize, opts.speed)
    display.start()

    ###########################
    # GET THE AGENT
    ###########################

    import valueIterationAgents, qlearningAgents
    a = None
    if opts.agent == 'value':
        a = valueIterationAgents.ValueIterationAgent(mdp, opts.discount, opts.iters)
    elif opts.agent == 'q':
        #env.getPossibleActions, opts.discount, opts.learningRate, opts.epsilon
        #simulationFn = lambda agent, state: simulation.GridworldSimulation(agent,state,mdp)
        gridWorldEnv = GridworldEnvironment(mdp)
        actionFn = lambda state: mdp.getPossibleActions(state)
        qLearnOpts = {'gamma': opts.discount,
                      'alpha': opts.learningRate,
                      'epsilon': opts.epsilon,
                      'actionFn': actionFn}
        a = qlearningAgents.QLearningAgent(**qLearnOpts)
    elif opts.agent == 'random':
        # # No reason to use the random agent without episodes
        if opts.episodes == 0:
            opts.episodes = 10
        class RandomAgent:
            def getAction(self, state):
                return random.choice(mdp.getPossibleActions(state))
            def getValue(self, state):
                return 0.0
            def getQValue(self, state, action):
Esempio n. 38
0
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        # a lot like doing an expectimax recurrence for some horizon
        # start with v0 = 0, as no time left means no reward
        # given vk, do one ply of expectimax to get vk+1
        # repeat until converged

        states = mdp.getStates()
        # print(states)
        # actions = mdp.getPossibleActions(states[2])
        # print(actions)
        # tranprob = mdp.getTransitionStatesAndProbs(states[2], actions[1])
        # print(tranprob)
        # reward = mdp.getReward(states[2], actions[1], states[2])
        # print(reward)
        # term = mdp.isTerminal(states[0])
        # print(term)

        # all state values start at zero
        # iterate to the depth of the horizon
        for k in range(self.iterations):
            # Use the util.Counter class in util.py, which is a dictionary with a default value of zero.
            # Every iteration updates the values and (implicitly) the policy.
            # self.values is the accumulator: at each step it holds the vk-1 info.
            # For each state, vk is the max over its actions of reward plus the discounted vk-1.
            # "Use the 'batch' version of value iteration where each vector Vk is computed from a fixed vector Vk-1 (like in lecture)"
            # i.e. values are not updated in place.
            vk = util.Counter()
            # for each state, get the max sum over the actions
            for s in states:
                act = mdp.getPossibleActions(s)
                maxk = float("-inf")
                for a in act:
                    # expected value of taking a in s based on the transition probs;
                    # getTransitionStatesAndProbs returns a list of (nextState, prob) pairs
                    staProb = mdp.getTransitionStatesAndProbs(s, a)

                    sum2 = sum(prob * (mdp.getReward(s, a, nextState) +
                                       self.discount * self.values[nextState])
                               for nextState, prob in staProb)

                    maxk = max(maxk, sum2)
                # Handle the case where a state has no available actions in the MDP
                # (no future rewards): leave its value at 0 in the counter.
                if len(act) != 0:
                    vk[s] = maxk

            # after this sweep is done, vk becomes the new vk-1
            self.values = vk.copy()
            # so, at the end of iterations vk is now vk-1

        return
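
The comments above distinguish the "batch" update, where every V_k entry is computed from a frozen copy of V_k-1, from the in-place (Gauss-Seidel) update, where later states in a sweep already see values updated earlier in the same sweep. A minimal sketch of the difference on plain dictionaries; batch_sweep, in_place_sweep and the backup callable are illustrative names, not project code:

def batch_sweep(states, values, backup):
    """One batch sweep: every state is backed up against the same frozen copy."""
    frozen = dict(values)
    return {s: backup(s, frozen) for s in states}

def in_place_sweep(states, values, backup):
    """One Gauss-Seidel sweep: later states see this sweep's earlier updates."""
    for s in states:
        values[s] = backup(s, values)
    return values

# Tiny usage: state 'b' backs up from state 'a', so the two variants differ
# once 'a' has already been updated within the same sweep.
backup = lambda s, vals: vals['a'] + 1 if s == 'b' else 0.0
print(batch_sweep(['a', 'b'], {'a': 5.0, 'b': 0.0}, backup))     # {'a': 0.0, 'b': 6.0}
print(in_place_sweep(['a', 'b'], {'a': 5.0, 'b': 0.0}, backup))  # {'a': 0.0, 'b': 1.0}

The example above uses the batch variant: vk is filled in from the previous self.values and only copied back once the sweep over all states has finished.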
Esempio n. 39
0
 def actionFn(state):
     return mdp.getPossibleActions(state)
Esempio n. 40
0
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
      Your value iteration agent should take an mdp on
      construction, run the indicated number of iterations
      and then act according to the resulting policy.
    
      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
    """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        "*** YOUR CODE HERE ***"
        # for storing the policy:
        self.policy = util.Counter()

        # for storing the Q values:
        self.Q_values = util.Counter()

        all_states = mdp.getStates()
        for i in range(iterations):
            new_values = util.Counter()
            for state in all_states:
                if mdp.isTerminal(state):
                    continue
                actions = mdp.getPossibleActions(state)
                best_value = float("-inf")
                for action in actions:
                    total = 0
                    transitions = mdp.getTransitionStatesAndProbs(state, action)
                    for next_state, prob in transitions:
                        total += prob * (
                            mdp.getReward(state, action, next_state) +
                            discount * self.values[next_state])
                    if total > best_value:
                        best_value = total
                new_values[state] = best_value
            self.values = new_values

        # Calculate best policy
        for state in all_states:
            if mdp.isTerminal(state):
                continue
            best = None
            best_value = float("-inf")
            actions = mdp.getPossibleActions(state)
            for action in actions:
                transitions = mdp.getTransitionStatesAndProbs(state, action)
                total = 0
                for next_state, prob in transitions:
                    total += prob * self.values[next_state]
                if total > best_value:
                    best_value = total
                    best = action
            self.policy[state] = best

        # Calculate all Q Values (Q Value iteration)
        for i in range(iterations):
            new_q_values = util.Counter()
            for state in all_states:
                if mdp.isTerminal(state):
                    continue
                actions = mdp.getPossibleActions(state)
                for action in actions:
                    transitions = mdp.getTransitionStatesAndProbs(state, action)
                    total = 0
                    for next_state, prob in transitions:
                        new_actions = mdp.getPossibleActions(next_state)
                        max_action = float("-inf")
                        if len(new_actions) == 0:
                            max_action = 0
                        for a in new_actions:
                            v = self.Q_values[(next_state, a)]
                            if v > max_action:
                                max_action = v
                        total += prob * (
                            mdp.getReward(state, action, next_state) +
                            discount * max_action)
                    new_q_values[(state, action)] = total
            self.Q_values = new_q_values
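
Note that the policy-extraction pass above ranks actions by the expected value of the successor state alone; the more common greedy extraction ranks them by the full one-step lookahead, i.e. the expected immediate reward plus the discounted successor value. A minimal sketch of that variant against the same mdp interface documented above; extract_greedy_policy and its parameter names are illustrative, not part of the assignment's API:

def extract_greedy_policy(mdp, values, discount):
    """Map each non-terminal state to the action with the best one-step lookahead."""
    policy = {}
    for state in mdp.getStates():
        if mdp.isTerminal(state):
            continue
        best_action, best_q = None, float("-inf")
        for action in mdp.getPossibleActions(state):
            # Q(s, a) = sum over successors of prob * (reward + discount * V(s'))
            q = sum(prob * (mdp.getReward(state, action, next_state) +
                            discount * values[next_state])
                    for next_state, prob in mdp.getTransitionStatesAndProbs(state, action))
            if q > best_q:
                best_action, best_q = action, q
        policy[state] = best_action  # stays None if the state has no actions
    return policy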
Esempio n. 41
0
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
      Your value iteration agent should take an mdp on
      construction, run the indicated number of iterations
      and then act according to the resulting policy.
    
      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
    """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0
        self.policy = util.Counter()
        self.q_value = util.Counter()

        for k in range(iterations):
            previous_values = self.values.copy()
            previous_q = self.q_value.copy()
            for state in mdp.getStates():

                possible_actions = mdp.getPossibleActions(state)
                max_val = -float("inf")
                for action in possible_actions:
                    transitions = mdp.getTransitionStatesAndProbs(state, action)
                    total = 0
                    q_val = 0

                    for nextState, trans_prob in transitions:
                        reward = mdp.getReward(state, action, nextState)
                        previous = discount * previous_values[nextState]
                        total += (reward + previous) * trans_prob

                        list_q = []

                        possible_actions2 = self.mdp.getPossibleActions(nextState)
                        for q_act in possible_actions2:
                            # look up last iteration's Q-value for (nextState, q_act)
                            list_q.append(previous_q[(nextState, q_act)])
                        if len(list_q) == 0:
                            previous_q_val = 0
                        else:
                            previous_q_val = discount * max(list_q)
                        # accumulate the expected Q-value across successor states
                        q_val += (reward + previous_q_val) * trans_prob

                    if max_val < total:
                        max_val = total

                    self.q_value[(state, action)] = q_val
                    self.values[state] = max_val

        for state in mdp.getStates():
            action_values = util.Counter()
            possible_actions = mdp.getPossibleActions(state)
            for action in possible_actions:
                transitions = mdp.getTransitionStatesAndProbs(state, action)
                total = 0
                for nextState, trans_prob in transitions:
                    reward = mdp.getReward(state, action, nextState)
                    previous = discount * self.values[nextState]
                    total += (reward + previous) * trans_prob
                action_values[action] = total

            self.policy[state] = action_values.argMax()
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        # Loop according to the supplied number of iterations.
        for iteration in range(self.iterations):
            # Make a copy of the values to accommodate pass by value.
            val = self.values.copy()
            # Get all the states in the mdp.
            states = mdp.getStates()

            for state in states:
                # Get all the possible actions for the current state.
                actions = mdp.getPossibleActions(state)

                if not mdp.isTerminal(state):
                    best = float("-inf")

                    for action in actions:
                        v = 0
                        # Get all the possible transitions for the state and action.
                        transitions = mdp.getTransitionStatesAndProbs(
                            state, action)

                        for transition in transitions:
                            # Perform the value iteration backup.
                            v = v \
                                + transition[1] \
                                * (mdp.getReward(state, action, transition[0]) + discount * self.values[transition[0]])

                        if v > best:
                            best = v

                        val[state] = best

                else:
                    for action in actions:
                        v = 0
                        # Get all the possible transitions for the state and action.
                        transitions = mdp.getTransitionStatesAndProbs(
                            state, action)

                        for transition in transitions:
                            # Perform value iteration.
                            v = v \
                                + transition[1] \
                                * (mdp.getReward(state, action, transition[0]) + discount * self.values[transition[0]])

                        val[state] = v

            self.values = val
Esempio n. 43
0
 def getAction(self, state):
   return random.choice(mdp.getPossibleActions(state))
Esempio n. 44
0
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        """
        print mdp.getStates()
        for state in mdp.getStates():
          print state
          actions = mdp.getPossibleActions(state)
          for action in actions:
            print action,mdp.getTransitionStatesAndProbs(state,action)
        """
        for i in range(iterations):
          valuesHolder = util.Counter()
          for state in mdp.getStates():
            if mdp.isTerminal(state):
              valuesHolder[state] = mdp.getReward(state,'exit','')
            else:
              valuesHolder[state] = max([self.computeQValueFromValues(state,action) for action in mdp.getPossibleActions(state)])
          self.values = valuesHolder 
Esempio n. 45
0
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        print("using discount {}".format(discount))
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        delta = 0.01

        self.policy = dict()
        total_iterations = 0
        for _ in range(self.iterations):

            for iteration in range(self.iterations):  # policy evaluation
                temp_values = util.Counter()
                l2_distance = 0
                for state in self.mdp.getStates():
                    if state not in self.policy:  # initialize random policy
                        if not self.mdp.getPossibleActions(state):
                            self.policy[state] = None
                        else:
                            # np here assumes "import numpy as np" at the top of the file
                            self.policy[state] = np.random.choice(
                                mdp.getPossibleActions(state))
                    if mdp.isTerminal(state):
                        temp_values[state] = 0
                        continue
                    transitions = mdp.getTransitionStatesAndProbs(
                        state, self.policy[state])
                    value = 0
                    for nextState, prob in transitions:
                        value += prob * (
                            mdp.getReward(state, self.policy[state], nextState) +
                            self.discount * self.values[nextState])
                    temp_values[state] = value
                    l2_distance = max(
                        l2_distance,
                        np.linalg.norm(value - self.values[state]))

                total_iterations += 1
                self.values = temp_values
                if l2_distance < delta:
                    break

            policy_converged = True
            for state in self.mdp.getStates():  # policy improvement
                if mdp.isTerminal(state):
                    continue
                current_value = self.computeQValueFromValues(
                    state, self.policy[state])
                current_action = self.policy[state]
                for action in self.mdp.getPossibleActions(state):
                    if self.computeQValueFromValues(state,
                                                    action) > current_value:
                        # print(current_action, current_value, action, self.computeQValueFromValues(state, action))
                        current_value = self.computeQValueFromValues(
                            state, action)
                        current_action = action
                        policy_converged = False
                self.policy[state] = current_action
            if policy_converged:
                print(total_iterations)
                break
 def __init__(self, mdp, discount = 0.9, iterations = 100):
   """
     Your value iteration agent should take an mdp on
     construction, run the indicated number of iterations
     and then act according to the resulting policy.
   
     Some useful mdp methods you will use:
         mdp.getStates()
         mdp.getPossibleActions(state)
         mdp.getTransitionStatesAndProbs(state, action)
         mdp.getReward(state, action, nextState)
   """
   self.mdp = mdp
   self.discount = discount
   self.iterations = iterations
   self.values = util.Counter() # A Counter is a dict with default 0
    
   "*** YOUR CODE HERE ***"
   for s in mdp.getStates():
       self.values[s] = 0
   "for a in mdp.getPossibleActions(s):"
   "for ac in mdp.getTransitionStatesAndProbs(s,a):"
   " print ac[0]"
   "print ac[1]"
   "copy_value = self.values.copy()"
   "for c in mdp.getStates():"
   "   print copy_value[c]"
   i=0
   "self.states = mdp.getStates()"
   while i < iterations:
       copy_value = self.values.copy()
       for s in mdp.getStates():
           if not mdp.isTerminal(s):
               self.values[s] = mdp.getReward(s,'north',s) + discount * max([sum([copy_value[s1] * p for (s1,p) in mdp.getTransitionStatesAndProbs(s,a)]) for a in mdp.getPossibleActions(s)])
       i = i + 1
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.
        """
        '''
        # Some useful mdp methods you will use:
        # States
        # Actions
        # Transitions & probs
        # Rewards
        # Gamma (discount)
        # Horizon - isTerminal
        Find optimal policy
        print(mdp.getStates())
        # ['TERMINAL_STATE', (0, 0), (0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1), (2, 2), (3, 0), (3, 1), (3, 2)]
        
        hello = mdp.getStates()[10]
        
        print(hello)
        
        print(mdp.getPossibleActions(hello))
        #('north', 'west', 'south', 'east')
        
        mdp.getTransitionStatesAndProbs(hello, 'south')
        #mdp.getTransitionStatesAndProbs(hello, 'south') = [((1, 2), 0.8), ((0, 2), 0.1), ((2, 2), 0.1)]
        
        print(mdp.getReward(hello, 'south', (3,0)))
        # gives you the reward for that transition
    
        print(mdp.isTerminal(hello))
        
        '''

        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        # Write value iteration code here
        # "*** YOUR CODE HERE ***"
        # state = mdp.getStates()
        # states = state[1:]
        # print values
        #value = []
        x = 0
        while x < iterations:
            new_values = self.values.copy()
            # actions = mdp.getPossibleActions()
            states = mdp.getStates()
            for i in states:
                # print i
                # if(mdp.isTerminal(i)==True): it's already initialized with zero
                if not mdp.isTerminal(i):
                    summ = 0
                    maxx_qval = float("-inf")
                    directions = mdp.getPossibleActions(i)
                    print(directions)
                    for j in directions:
                        summ = 0
                        trans = mdp.getTransitionStatesAndProbs(i, j)
                        # trans is a list of (nextState, probability) pairs
                        for k in trans:
                            # print k
                            # print k[0]  # k[1] - probability
                            myreward = mdp.getReward(i, j, k[0])
                            # reward
                            # print myreward
                            summ = summ + ((
                                (discount * self.values[k[0]]) + myreward) *
                                           k[1])
                        qvalue = summ
                        if qvalue > maxx_qval:
                            maxx_qval = qvalue
                            new_values[i] = maxx_qval
            self.values = new_values
            x = x + 1
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        # Write value iteration code here

        # need to find new utilities for all states in S
        states = mdp.getStates()

        # update the values for all iterations
        for i in range(iterations):
            # keep track of new values
            newValues = util.Counter()
            # for each state find the new utility
            for s in states:

                # if we're at a terminal state
                if mdp.isTerminal(s):
                    newValues[s] = self.mdp.getReward(s, None, None)

                else:
                    #find possible actions
                    actions = mdp.getPossibleActions(s)

                    # as long as there are actions to take
                    #if(not mdp.isTerminal(s)):
                    if len(actions) != 0:

                        # update utilities by finding the maximum q value
                        # find qValues based on the actions
                        qValues = []
                        for a in actions:
                            transitionStates = self.mdp.getTransitionStatesAndProbs(
                                s, a)

                            sumOfTransitions = 0.0

                            for ts in transitionStates:
                                nextState = ts[0]
                                prob = ts[1]
                                # value for this state * the probability of getting there
                                sumOfTransitions += prob * self.getValue(
                                    nextState)
                                #sumOfTransitions += prob * (self.mdp.getReward(s, a, nextState) + (self.discount * self.getValue(nextState)))

                            # current reward + the discount factor * the sum over all of the transition states
                            qValue = self.mdp.getReward(
                                s, None,
                                None) + self.discount * sumOfTransitions
                            # qValue = sumOfTransitions

                            qValues.append(qValue)

                        maxQValue = max(qValues)
                        newValues[s] = maxQValue

            self.values = newValues
Esempio n. 49
0
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        print("using discount {}".format(discount))
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0
        self.policies = util.Counter()

        delta = 0.01
        iteration = 1
        # initialize the policies arbitrarily
        for state in mdp.getStates():
            actions = mdp.getPossibleActions(state)
            if len(actions) >= 1:
                self.policies[state] = mdp.getPossibleActions(state)[0]
            else:
                self.policies[state] = None

        policy_loop = 0
        while True:
            policy_loop += 1
            while True:
                # policy evaluation
                iteration += 1
                difference = 0
                for state in mdp.getStates():
                    old_value = self.values[state]
                    action = self.policies[state]
                    if action is None:
                        continue
                    self.values[state] = self.computeQValueFromValues(state, action)
                    difference = max(difference, abs(old_value-self.values[state]))
                if difference < delta or iteration == iterations:
                    break

            if iteration == iterations:
                break

            # policy improvement
            stable = True
            iteration += 1
            for state in mdp.getStates():
                old_policy = self.policies[state]
                self.policies[state] = self.computeActionFromValues(state)
                if old_policy != self.policies[state]:
                    stable = False
            if stable or iteration == iterations:
                break

        print("It took a total of {} iterations to converge.".format(iteration))
        print("It took a total of {} total policy iteration loops to converge.".format(policy_loop))
Esempio n. 50
0
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here

        for i in range(iterations):
            storeValues = util.Counter()
            states = mdp.getStates()
            for s in states:
                actions = mdp.getPossibleActions(s)
                if(len(actions) == 0): continue
                qVals = [self.getQValue(s, a) for a in actions]
                storeValues[s] = max(qVals)
            self.values = storeValues

    def getValue(self, state):
        """
          Return the value of the state (computed in __init__).
        """
        return self.values[state]


    def computeQValueFromValues(self, state, action):
        """
          Compute the Q-value of action in state from the
Esempio n. 51
0
        import graphicsGridworldDisplay
        display = graphicsGridworldDisplay.GraphicsGridworldDisplay(
            mdp, opts.gridSize, opts.speed)
    display.start()

    ###########################
    # GET THE AGENT
    ###########################

    import valueIterationAgents, qlearningAgents, sarsaLambdaAgents
    a = None
    if opts.agent == 'value':
        a = valueIterationAgents.ValueIterationAgent(mdp, opts.discount,
                                                     opts.iters)
    elif opts.agent == 'valueApproximate':
        actionFn = lambda state: mdp.getPossibleActions(state)
        qLearnOpts = {
            'gamma': opts.discount,
            'iterations': opts.iters,
            'mdp': mdp,
            'alpha': opts.learningRate,
            'epsilon': opts.epsilon,
            'extractor': opts.extractor,
            'actionFn': actionFn
        }
        a = valueIterationAgents.ApproximateValueIterAgent(**qLearnOpts)
    elif opts.agent == 'q':
        #env.getPossibleActions, opts.discount, opts.learningRate, opts.epsilon
        #simulationFn = lambda agent, state: simulation.GridworldSimulation(agent,state,mdp)
        actionFn = lambda state: mdp.getPossibleActions(state)
        qLearnOpts = {
def main(myargs):
    sys.argv = myargs.split()
    opts = parseOptions()

    ###########################
    # GET THE GRIDWORLD
    ###########################

    if opts.grid == 'VerticalBridgeGrid':
        opts.gridSize = 120

    import gridworld
    mdpFunction = getattr(gridworld, "get" + opts.grid)
    mdp = mdpFunction()
    mdp.setLivingReward(opts.livingReward)
    mdp.setNoise(opts.noise)
    env = gridworld.GridworldEnvironment(mdp)

    ###########################
    # GET THE DISPLAY ADAPTER
    ###########################

    import textGridworldDisplay
    display = textGridworldDisplay.TextGridworldDisplay(mdp)
    if not opts.textDisplay:
        import graphicsGridworldDisplay
        display = graphicsGridworldDisplay.GraphicsGridworldDisplay(
            mdp, opts.gridSize, opts.speed)
    try:
        display.start()
    except KeyboardInterrupt:
        sys.exit(0)

    ###########################
    # GET THE AGENT
    ###########################

    import valueIterationAgents, qlearningAgents
    a = None
    if opts.agent == 'value':
        a = valueIterationAgents.ValueIterationAgent(mdp, opts.discount,
                                                     opts.iters)
    elif opts.agent == 'q':
        #env.getPossibleActions, opts.discount, opts.learningRate, opts.epsilon
        #simulationFn = lambda agent, state: simulation.GridworldSimulation(agent,state,mdp)
        gridWorldEnv = GridworldEnvironment(mdp)
        actionFn = lambda state: mdp.getPossibleActions(state)
        qLearnOpts = {
            'gamma': opts.discount,
            'alpha': opts.learningRate,
            'epsilon': opts.epsilon,
            'actionFn': actionFn
        }
        a = qlearningAgents.QLearningAgent(**qLearnOpts)
    elif opts.agent == 'random':
        # # No reason to use the random agent without episodes
        if opts.episodes == 0:
            opts.episodes = 10

        class RandomAgent:
            def getAction(self, state):
                return random.choice(mdp.getPossibleActions(state))

            def getValue(self, state):
                return 0.0

            def getQValue(self, state, action):
                return 0.0

            def getPolicy(self, state):
                "NOTE: 'random' is a special policy value; don't use it in your code."
                return 'random'

            def update(self, state, action, nextState, reward):
                pass

        a = RandomAgent()
    else:
        if not opts.manual: raise Exception('Unknown agent type: ' + opts.agent)

    ###########################
    # RUN EPISODES
    ###########################
    # DISPLAY Q/V VALUES BEFORE SIMULATION OF EPISODES
    try:
        if not opts.manual and opts.agent == 'value':
            if opts.valueSteps:
                for i in range(opts.iters):
                    tempAgent = valueIterationAgents.ValueIterationAgent(
                        mdp, opts.discount, i)
                    display.displayValues(tempAgent,
                                          message="VALUES AFTER " + str(i) +
                                          " ITERATIONS")
                    display.pause()

            display.displayValues(a,
                                  message="VALUES AFTER " + str(opts.iters) +
                                  " ITERATIONS")
            display.pause()
            display.displayQValues(a,
                                   message="Q-VALUES AFTER " +
                                   str(opts.iters) + " ITERATIONS")
            display.pause()
    except KeyboardInterrupt:
        sys.exit(0)

    # FIGURE OUT WHAT TO DISPLAY EACH TIME STEP (IF ANYTHING)
    displayCallback = lambda x: None
    if not opts.quiet:
        if opts.manual and opts.agent is None:
            displayCallback = lambda state: display.displayNullValues(state)
        else:
            if opts.agent == 'random':
                displayCallback = lambda state: display.displayValues(
                    a, state, "CURRENT VALUES")
            if opts.agent == 'value':
                displayCallback = lambda state: display.displayValues(
                    a, state, "CURRENT VALUES")
            if opts.agent == 'q':
                displayCallback = lambda state: display.displayQValues(
                    a, state, "CURRENT Q-VALUES")

    messageCallback = lambda x: printString(x)
    if opts.quiet:
        messageCallback = lambda x: None

    # FIGURE OUT WHETHER TO WAIT FOR A KEY PRESS AFTER EACH TIME STEP
    pauseCallback = lambda: None
    if opts.pause:
        pauseCallback = lambda: display.pause()

    # FIGURE OUT WHETHER THE USER WANTS MANUAL CONTROL (FOR DEBUGGING AND DEMOS)
    if opts.manual:
        decisionCallback = lambda state: getUserAction(state, mdp.getPossibleActions)
    else:
        decisionCallback = a.getAction

    # RUN EPISODES
    if opts.episodes > 0:
        print()
        print("RUNNING", opts.episodes, "EPISODES")
        print()
    returns = 0
    for episode in range(1, opts.episodes + 1):
        returns += runEpisode(a, env, opts.discount, decisionCallback,
                              displayCallback, messageCallback, pauseCallback,
                              episode)
    if opts.episodes > 0:
        print()
        print("AVERAGE RETURNS FROM START STATE: " +
              str((returns + 0.0) / opts.episodes))
        print()
        print()

    # DISPLAY POST-LEARNING VALUES / Q-VALUES
    if opts.agent == 'q' and not opts.manual:
        try:
            display.displayQValues(a,
                                   message="Q-VALUES AFTER " +
                                   str(opts.episodes) + " EPISODES")
            display.pause()
            display.displayValues(a,
                                  message="VALUES AFTER " +
                                  str(opts.episodes) + " EPISODES")
            display.pause()
        except KeyboardInterrupt:
            sys.exit(0)
Esempio n. 53
0
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"

        # Note: this recursive helper is never invoked; the loop below uses
        # computeQValueFromValues instead.
        def computeValue(mdp, state, iterationCount):

            if mdp.isTerminal(state):
                # self.values[state] = 0
                return self.values[state]

            # if self.values[state] != 0:
            #     return self.values[state]
            iterationCount -= 1
            if iterationCount < -1:
                return self.values[state]

            actions = mdp.getPossibleActions(state)
            maxExpect = float("-inf")
            for action in actions:
                expect = 0.0
                transactionAndProb = mdp.getTransitionStatesAndProbs(
                    state, action)

                for element in transactionAndProb:
                    # print element
                    nextState = element[0]
                    nextIteration = iterationCount
                    expect += element[1] * (
                        mdp.getReward(state, action, nextState) +
                        computeValue(mdp, nextState, nextIteration) * discount)

                if expect > maxExpect:
                    maxExpect = expect

            self.values[state] = maxExpect
            return maxExpect

        allStates = mdp.getStates()
        for i in range(self.iterations):
            # print "iteration number", i
            temp = util.Counter()
            for state in allStates:
                if self.mdp.isTerminal(state):
                    continue
                maxExpect = float("-inf")
                for action in mdp.getPossibleActions(state):
                    expect = self.computeQValueFromValues(state, action)
                    if expect > maxExpect:
                        maxExpect = expect

                temp[state] = maxExpect
            self.values = temp
Esempio n. 54
0
    def runValueIteration(self):
        "*** YOUR CODE HERE ***"

        mdp = self.mdp
        values = self.values
        discount = self.discount
        iterations = self.iterations
        theta = self.theta
        states = mdp.getStates()

        predecessors = {}  # dict
        for state in states:
            predecessors[state] = set()

        pq = util.PriorityQueue()

        # computes predecessors and puts initial stuff into pq
        for state in states:
            Q_s = util.Counter()

            for action in mdp.getPossibleActions(state):
                # assigning predecessors
                T = mdp.getTransitionStatesAndProbs(state, action)
                for (nextState, prob) in T:
                    if prob != 0:
                        predecessors[nextState].add(state)

                # computing Q values for determining diff's for the pq
                Q_s[action] = self.computeQValueFromValues(state, action)

            if not mdp.isTerminal(state):  # means: if non terminal state
                maxQ_s = Q_s[Q_s.argMax()]
                diff = abs(values[state] - maxQ_s)
                pq.update(state, -diff)

        # now for the actual iterations
        for i in range(iterations):
            if pq.isEmpty():
                return

            state = pq.pop()

            if not mdp.isTerminal(state):
                Q_s = util.Counter()
                for action in mdp.getPossibleActions(state):
                    Q_s[action] = self.computeQValueFromValues(state, action)

                values[state] = Q_s[Q_s.argMax()]

            for p in predecessors[state]:
                Q_p = util.Counter()
                for action in mdp.getPossibleActions(p):
                    # computing Q values for determining diff's for the pq
                    Q_p[action] = self.computeQValueFromValues(p, action)

                #if not mdp.isTerminal(state): # means: if non terminal state
                maxQ_p = Q_p[Q_p.argMax()]
                diff = abs(values[p] - maxQ_p)

                if diff > theta:
                    pq.update(p, -diff)
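
The negative sign on diff above is deliberate: the priority queue pops the entry with the smallest priority first, so pushing -diff makes the state with the largest Bellman error come out first. A tiny illustration, assuming the same util.PriorityQueue behaviour relied on above (update pushes a new item or lowers an existing one's priority; pop returns the lowest-priority item):

pq = util.PriorityQueue()
pq.update('s1', -0.5)   # error 0.5
pq.update('s2', -2.0)   # error 2.0 -> most negative priority
pq.update('s3', -0.1)   # error 0.1
print(pq.pop())         # 's2', the state with the largest error, is processed first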
Esempio n. 55
0
 def update_state(self, mdp, state, vk):
     Q = util.Counter()
     for action in mdp.getPossibleActions(state):
         Q[action] = self.computeQValueFromValues(state, action)
     vk[state] = Q[Q.argMax()]
Esempio n. 56
0
        import graphicsGridworldDisplay
        display = graphicsGridworldDisplay.GraphicsGridworldDisplay(mdp, opts.gridSize, opts.speed)
    try:
        display.start()
    except KeyboardInterrupt:
        sys.exit(0)

    ###########################
    # GET THE AGENT
    ###########################

    #import valueIterationAgents, rtdp #, qlearningAgents
    import sarsa_agents, tree_backup, q_sigma
    a = None
    if opts.agent == 'n_step_sarsa':
        a = sarsa_agents.NStepSarsaAgent(discount=opts.discount, alpha=opts.learningRate, epsilon=opts.epsilon, actionFn=lambda state: mdp.getPossibleActions(state), n=opts.stepn, terminalFn=lambda state: mdp.isTerminal(state))
    elif opts.agent == 'n_step_expected_sarsa':
        a = sarsa_agents.NStepExpectedSarsaAgent(discount=opts.discount, alpha=opts.learningRate, epsilon=opts.epsilon, actionFn=lambda state: mdp.getPossibleActions(state), n=opts.stepn, terminalFn=lambda state: mdp.isTerminal(state))
    elif opts.agent == 'tree_backup':
        a = tree_backup.NStepTreeBackupAgent(discount=opts.discount, alpha=opts.learningRate, epsilon=opts.epsilon, actionFn=lambda state: mdp.getPossibleActions(state), n=opts.stepn, terminalFn=lambda state: mdp.isTerminal(state))
    elif opts.agent == 'qsigma':
        a = q_sigma.QSigmaAgent(discount=opts.discount, alpha=opts.learningRate, epsilon=opts.epsilon, actionFn=lambda state: mdp.getPossibleActions(state), n=opts.stepn, terminalFn=lambda state: mdp.isTerminal(state), sigma=opts.sigma, numEpisodes=opts.episodes)
    elif opts.agent == 'value':
        a = valueIterationAgents.ValueIterationAgent(mdp, env, opts.discount, opts.iters, display)
    elif opts.agent == 'valuegs':
        a = valueIterationAgents.GSValueIterationAgent(mdp, env, opts.discount, opts.iters, display)
    elif opts.agent == 'rtdp':
        a = rtdp.RTDPLearningAgent(mdp, env, opts.discount, opts.iters) #mdp, env, opts.discount, opts.iters)
    elif opts.agent == 'q':
        #env.getPossibleActions, opts.discount, opts.learningRate, opts.epsilon
        #simulationFn = lambda agent, state: simulation.GridworldSimulation(agent,state,mdp)