def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        for _ in range(self.iterations):
            newVal = self.values.copy()
            for s in mdp.getStates():
                if mdp.isTerminal(s):
                    continue  # terminal states keep their default value of 0
                # Bellman update: V(s) = max over actions of Q(s, a)
                newVal[s] = max(self.computeQValueFromValues(s, action)
                                for action in mdp.getPossibleActions(s))
            self.values = newVal
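
# Note: many snippets in this collection call computeQValueFromValues (or its
# getQValue wrapper) without showing it. A minimal sketch based on the mdp
# interface listed in the docstring above -- an assumption, not any one author's code:
def computeQValueFromValues(self, state, action):
    # One-step lookahead: Q(s, a) = sum_{s'} T(s, a, s') * [R(s, a, s') + gamma * V(s')]
    return sum(prob * (self.mdp.getReward(state, action, nextState)
                       + self.discount * self.values[nextState])
               for nextState, prob in self.mdp.getTransitionStatesAndProbs(state, action))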
 def runValueIteration(self):
     "*** YOUR CODE HERE ***"
     mdp = self.mdp
     predecessors = {state: set() for state in mdp.getStates()}
     for state in mdp.getStates():
         for action in mdp.getPossibleActions(state):
             for nextState, prob in mdp.getTransitionStatesAndProbs(
                     state, action):
                 if prob > 0:
                     predecessors[nextState].add(state)
     queue = util.PriorityQueue()
     for state in mdp.getStates():
         if not mdp.isTerminal(state):
             diff = abs(self.values[state] - max([
                 self.getQValue(state, action)
                 for action in mdp.getPossibleActions(state)
             ]))
             queue.update(state, -diff)
     for i in range(self.iterations):
         if queue.isEmpty():
             break
         state = queue.pop()
         if not mdp.isTerminal(state):
             self.values[state] = max([
                 self.getQValue(state, action)
                 for action in mdp.getPossibleActions(state)
             ])
         for pred in predecessors[state]:
             diff = abs(self.values[pred] - max([
                 self.getQValue(pred, action)
                 for action in mdp.getPossibleActions(pred)
             ]))
             if diff > self.theta:
                 queue.update(pred, -diff)
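
 # The prioritized-sweeping snippet above relies on self.theta, which the project's
 # PrioritizedSweepingValueIterationAgent constructor is expected to set (commonly
 # theta = 1e-5): only predecessors whose value estimate is off by more than theta
 # are pushed back onto the queue.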
# Example #3
 def runValueIteration(self):
     "*** YOUR CODE HERE ***"
     mdp = self.mdp
     state = mdp.getStates()
     predecessor_list = {}
     discount = self.discount
     iterations = self.iterations
     theta = self.theta
     for temp_state in state:
         predecessor_list[temp_state] = self.getpredecessor(temp_state)
     pq = util.PriorityQueue()
     for temp_state in state:
         if not mdp.isTerminal(temp_state):
             pq.push(temp_state, -self.find_difference(temp_state))
     for i in range(iterations):
         if pq.isEmpty():
             return
         cur_state = pq.pop()
         if not mdp.isTerminal(cur_state):
             action = self.getAction(cur_state)
             self.values[cur_state] = self.getQValue(cur_state, action)
         for pre in predecessor_list[cur_state]:
             diff_pre = self.find_difference(pre)
             if diff_pre > theta:
                 pq.update(pre, -diff_pre)
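
 # The snippet above assumes two helpers on the same agent. Minimal sketches
 # consistent with how they are called (assumptions, not the original code):
 def getpredecessor(self, state):
     # All states that can reach `state` with nonzero probability under some action.
     preds = set()
     for s in self.mdp.getStates():
         for a in self.mdp.getPossibleActions(s):
             if any(ns == state and p > 0
                    for ns, p in self.mdp.getTransitionStatesAndProbs(s, a)):
                 preds.add(s)
     return preds

 def find_difference(self, state):
     # |V(s) - max_a Q(s, a)|: the priority magnitude used for the queue.
     best = max(self.getQValue(state, a)
                for a in self.mdp.getPossibleActions(state))
     return abs(self.values[state] - best)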
 def runValueIteration(self):
     "*** YOUR CODE HERE ***"
     mdp = self.mdp
     values = self.values
     discount = self.discount
     predecessors = {}
     for state in mdp.getStates():
         preList = []
         for preState in mdp.getStates():
             for action in mdp.getPossibleActions(preState):
                 if state in [
                         pair[0]
                         for pair in mdp.getTransitionStatesAndProbs(
                             preState, action) if pair[1] > 0
                 ]:
                     preList.append(preState)
                     break
         predecessors[state] = preList
     queue = util.PriorityQueue()
     for s in mdp.getStates():
         if not mdp.isTerminal(s):
             actions = mdp.getPossibleActions(s)
             realValue = max(
                 sum(prob * (mdp.getReward(s, action, nextState) +
                             (discount * values[nextState]))
                     for (nextState, prob
                          ) in mdp.getTransitionStatesAndProbs(s, action))
                 for action in actions)
             diff = abs(realValue - values[s])
             queue.push(s, 0 - diff)
     for _ in range(self.iterations):
         if queue.isEmpty():
             return
         s = queue.pop()
         if not mdp.isTerminal(s):
             actions = mdp.getPossibleActions(s)
             values[s] = max(
                 sum(prob * (mdp.getReward(s, action, nextState) +
                             (discount * values[nextState]))
                     for (nextState, prob
                          ) in mdp.getTransitionStatesAndProbs(s, action))
                 for action in actions)
         for p in predecessors[s]:
             actions = mdp.getPossibleActions(p)
             realValue = max(
                 sum(prob * (mdp.getReward(p, action, nextState) +
                             (discount * values[nextState]))
                     for (nextState, prob
                          ) in mdp.getTransitionStatesAndProbs(p, action))
                 for action in actions)
             diff = abs(realValue - values[p])
             if diff > self.theta:
                 queue.update(p, 0 - diff)
# Example #5
  def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
      Your value iteration agent should take an mdp on
      construction, run the indicated number of iterations
      and then act according to the resulting policy.
    
      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter() # A Counter is a dict with default 0
    while self.iterations > 0:
        batchValues = util.Counter()
        for state in mdp.getStates():
            if mdp.isTerminal(state):
                continue
            maxM = float("-inf")
            for action in mdp.getPossibleActions(state):
                statesProbs = mdp.getTransitionStatesAndProbs(state, action)
                sumU = 0
                Rs = 0
                for stateProb in statesProbs:
                    sumU += self.values[stateProb[0]] * stateProb[1]
                    Rs += mdp.getReward(state, action, stateProb[0]) * stateProb[1]
                v = Rs + sumU * discount
                if v > maxM:
                    maxM = v
            batchValues[state] = maxM
        self.values = batchValues
        self.iterations -= 1
    self.policy = {}
    for state in mdp.getStates():
        if mdp.isTerminal(state):
            self.policy[state] = None
            continue
        actions = mdp.getPossibleActions(state)
        QValues = [self.getQValue(state, action) for action in actions]
        self.policy[state] = actions[QValues.index(max(QValues))]
# Example #6
def get_average_reward(agent, mdp):
    current_state = mdp.getStartState()
    iterations = 0
    total = 0
    while not mdp.isTerminal(current_state) and iterations < 1000:
        total += agent.getValue(current_state)
        action = agent.getPolicy(current_state)
        next_states = mdp.getTransitionStatesAndProbs(current_state, action)
        current_state = next_states[0][0]
        iterations += 1

    if mdp.isTerminal(current_state):
        return total / iterations
    return 0
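
# Hypothetical usage (ValueIterationAgent and mdp assumed constructed elsewhere):
# note that the walk above always follows the first successor returned by
# getTransitionStatesAndProbs rather than sampling one by probability.
#
#     agent = ValueIterationAgent(mdp, discount=0.9, iterations=100)
#     print(get_average_reward(agent, mdp))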
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        for ite in range(iterations):  # repeat for the requested number of sweeps
            stateValues = util.Counter()  # initially all values are 0
            for state in mdp.getStates():
                if not mdp.isTerminal(state):
                    # maximize the expected utility over the actions of each state
                    stateValues[state] = max(self.computeQValueFromValues(state, action)
                                             for action in mdp.getPossibleActions(state))
            self.values = stateValues
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0
        # Write value iteration code here
        states = mdp.getStates()
        for _ in range(0, self.iterations):
            # create temporary values for the k-th iteration update
            tempValues = util.Counter()
            # loop over states to find max q value
            for state in states:
                if not mdp.isTerminal(state):
                    actions = mdp.getPossibleActions(state)
                    tempValues[state] = max(
                        [self.getQValue(state, action) for action in actions])
            # set values to k iteration update values
            self.values = tempValues
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        for i in range(iterations):
          # do batch processing (requires a module-level `import copy`)
          vals = copy.deepcopy(self.values)
          for s in mdp.getStates():
            if mdp.isTerminal(s):
              vals[s] = 0
            else:
              vals[s] = self.computeQValueFromValues(s, self.computeActionFromValues(s))
          self.values = vals
# Example #10
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        states = mdp.getStates()
        for i in range(iterations):
            # Make a new dictionary to store the new values
            newValues = util.Counter()
            # For every state...
            for curState in states:
                if mdp.isTerminal(curState): continue
                action = self.getAction(curState)
                # Compute new Q value
                newValues[curState] = self.computeQValueFromValues(
                    curState, action)
            # Store the new values
            self.values = newValues.copy()
# Example #11
 def __init__(self, mdp, discount = 0.9, iterations = 100):
   """
     Your value iteration agent should take an mdp on
     construction, run the indicated number of iterations
     and then act according to the resulting policy.
   
     Some useful mdp methods you will use:
         mdp.getStates()
         mdp.getPossibleActions(state)
         mdp.getTransitionStatesAndProbs(state, action)
         mdp.getReward(state, action, nextState)
   """
   "*** YOUR CODE HERE ***"
   self.mdp = mdp
   self.discount = discount
   self.iterations = iterations
   self.values = util.Counter() # value of each state; a Counter is a dict with default 0
   
   # run for desired number of iterations
   for i in range(iterations):
     new_values = self.values.copy()
     for s in mdp.getStates():
       if not mdp.isTerminal(s):
         # the commented code works as well
         #curr_best = float("-inf")
         #for a in mdp.getPossibleActions(s):
         #temp_value = sum([p * (mdp.getReward(s, a, s2) + discount*prev[s2]) for s2, p in mdp.getTransitionStatesAndProbs(s, a)])
         #  if temp_value > curr_best:
         #    curr_best = temp_value
         #self.values[s] = curr_best       
         new_values[s] = max([self.getQValue(s, a) for a in mdp.getPossibleActions(s)])  
     self.values = new_values
# Example #12
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0
        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        for k in range(iterations):
            tmp = util.Counter()

            for s in mdp.getStates():
                if mdp.isTerminal(s):
                    tmp[s] = 0
                else:

                    maior = float("-inf")
                    for a in mdp.getPossibleActions(s):
                        qv = self.computeQValueFromValues(s, a)
                        if qv > maior:
                            maior = qv
                    tmp[s] = maior
            self.values = tmp
 def __init__(self, mdp, discount = 0.9, iterations = 100):
   """
     Your value iteration agent should take an mdp on
     construction, run the indicated number of iterations
     and then act according to the resulting policy.
   
     Some useful mdp methods you will use:
         mdp.getStates()
         mdp.getPossibleActions(state)
         mdp.getTransitionStatesAndProbs(state, action)
         mdp.getReward(state, action, nextState)
   """
   self.mdp = mdp
   self.discount = discount
   self.iterations = iterations
   self.values = util.Counter() # A Counter is a dict with default 0
    
   "*** YOUR CODE HERE ***"
   for s in mdp.getStates():
       self.values[s] = 0
   i = 0
   while i < iterations:
       copy_value = self.values.copy()
       for s in mdp.getStates():
           if not mdp.isTerminal(s):
               # Bellman update: fold the expected reward into the sum rather than
               # reading the reward for a hard-coded action
               self.values[s] = max(sum(p * (mdp.getReward(s, a, s1) + discount * copy_value[s1])
                                        for (s1, p) in mdp.getTransitionStatesAndProbs(s, a))
                                    for a in mdp.getPossibleActions(s))
       i = i + 1
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        self.oldValues = self.values.copy()
        states = mdp.getStates()
        for i in range(iterations):
            for state in states:
                actions = mdp.getPossibleActions(state)
                if not mdp.isTerminal(state):
                    actionValue = -float('inf')
                    for action in actions:
                        qValue = self.computeQValueFromValues(state, action)
                        actionValue = max(actionValue, qValue)
                    self.values[state] = actionValue
            self.oldValues = self.values.copy()
# Example #15
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        # Outer loop over the number of iterations;
        # each sweep defines a new value for every state
        for i in range(0, iterations):
          newDict = util.Counter() # Create another dictionary to prevent mutation 
          for state in mdp.getStates():
            if mdp.isTerminal(state):
              newDict[state] = 0 # Set zero when it's in terminal node
            else:
              newDict[state] = max([ self.getQValue(state, action) for action in mdp.getPossibleActions(state)])
          self.values = newDict # Update the raw values
# Example #16
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        # mdp is the decision process, which is known here (unlike in Q-learning,
        # where the model is not known)
        for iteration in range(iterations):  # iterate `iterations` times
            currentStepValues = self.values.copy()  # step k + 1 always starts from the values of step k
            for nextState in mdp.getStates():  # every possible state; in our case, every square on the map
                if mdp.isTerminal(nextState):  # the terminal state's value never changes
                    continue
                # recompute this state's value from the previous values; this update
                # is really the only operation in an iteration
                currentStepValues[nextState] = self.computeQValueFromValues(nextState, self.computeActionFromValues(nextState))
            self.values = currentStepValues.copy()  # save the new values
# Example #17
  def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
      Your value iteration agent should take an mdp on
      construction, run the indicated number of iterations
      and then act according to the resulting policy.
    
      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
          
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter() # A Counter is a dict with default 0

    for i in range(iterations):
        nextValues = util.Counter()
        for state in mdp.getStates(): 
            if mdp.isTerminal(state): continue
            first = True
            for action in mdp.getPossibleActions(state):
                qValue = 0
                for (nextState, prob) in mdp.getTransitionStatesAndProbs(state, action):
                    reward = mdp.getReward(state, action, nextState)
                    qValue += prob * (reward + discount*self.values[nextState])
                if first:
                    maxQValue = qValue
                    first = False
                elif qValue > maxQValue:
                    maxQValue = qValue
            nextValues[state] = maxQValue
        self.values = nextValues
# Example #18
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        self.oldValues = self.values.copy()
        states = self.mdp.getStates()
        for iteration in range(iterations):
            for state in states:
                if not mdp.isTerminal(state):
                    action = self.computeActionFromValues(state)
                    value = self.computeQValueFromValues(state, action)
                    self.values[state] = value
            self.oldValues = self.values.copy()
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        stateValue = dict()
        for _ in range(self.iterations):
            maxVal = 0
            for state in self.mdp.getStates():
                if not mdp.isTerminal(state):  # Check if the given state is terminal or not
                    actions = list()  # List to store possible actions available
                    for action in self.mdp.getPossibleActions(state):
                        actions.append(self.getQValue(state, action))  # Get Q value for each of the possible states with given actions
                    if actions:
                        maxVal = max(actions)  # Store the max actions value in order to calculate optimal path
                stateValue[state] = maxVal
            self.values = stateValue.copy()
  def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
      Your value iteration agent should take an mdp on
      construction, run the indicated number of iterations
      and then act according to the resulting policy.
    
      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter() # A Counter is a dict with default 0
    for x in range(0, iterations):
        statevalues = util.Counter()
        for state in mdp.getStates():
            if mdp.isTerminal(state):
                statevalues[state] = 0
                continue
            actionvalue = float("-inf")
            for action in mdp.getPossibleActions(state):
                S_next = self.getQValue(state, action)
                if actionvalue < S_next:
                    actionvalue = S_next
            statevalues[state] = actionvalue
        self.values = statevalues

    "*** YOUR CODE HERE ***"
# Example #21
 def __init__(self, mdp, discount = 0.9, iterations = 100):
   """
     Your value iteration agent should take an mdp on
     construction, run the indicated number of iterations
     and then act according to the resulting policy.
   
     Some useful mdp methods you will use:
         mdp.getStates()
         mdp.getPossibleActions(state)
         mdp.getTransitionStatesAndProbs(state, action)
         mdp.getReward(state, action, nextState)
   """
   self.mdp = mdp
   self.discount = discount
   self.iterations = iterations
   self.values = util.Counter() # A Counter is a dict with default 0
    
   "*** YOUR CODE HERE ***"
   allStates = mdp.getStates() 
   vPrimes = util.Counter() #  A Counter is a dict with default 0
   
   iteration = 0
   while iteration < iterations:
       
       for s in allStates: 
           if mdp.isTerminal(s):
               vPrimes[s] = mdp.getReward(s, None, s)
           else: 
               sreward = mdp.getReward(s, None, s)
               vPrimes[s] = sreward + discount * self.utilOfBestAction(mdp, s )
              
       for s in allStates:
           self.values[s] = vPrimes[s]
           
       iteration +=1
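
 # utilOfBestAction is not shown above; a sketch consistent with how the snippet
 # applies reward and discount outside of it (an assumption, not the original code):
 def utilOfBestAction(self, mdp, state):
     # max over actions of the expected value of the successor states
     return max(sum(prob * self.values[nextState]
                    for nextState, prob in mdp.getTransitionStatesAndProbs(state, action))
                for action in mdp.getPossibleActions(state))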
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        #while still iterations
          #for each state
            #for action in each state
              #get Q(state,action)
            #store largest (state,action) in Counter

        for i in range(self.iterations):
          newValues = self.values.copy()  # copy so this sweep reads last sweep's values (batch update)
          for state in mdp.getStates():
            v = [float("-inf")]
            if not mdp.isTerminal(state):
              for action in mdp.getPossibleActions(state):
                v += [self.computeQValueFromValues(state,action)]
              newValues[state] = max(v)
          self.values = newValues
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        for i in range(iterations):
            valuesNew = util.Counter()
            for state in mdp.getStates():
                if not mdp.isTerminal(state):
                    vals = util.Counter()
                    for possact in mdp.getPossibleActions(state):
                        vals[possact] = self.computeQValueFromValues(state, possact)
                    valuesNew[state] = max(vals.values())
            for st2 in valuesNew:
              self.values[st2] = valuesNew[st2]
  def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
      Your value iteration agent should take an mdp on
      construction, run the indicated number of iterations
      and then act according to the resulting policy.
    
      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter() # A Counter is a dict with default 0
    self.qvalues = util.Counter()
    self.bestact = util.Counter()

    "*** YOUR CODE HERE ***"
    states = mdp.getStates()

    for i in range(iterations):
        v = util.Counter()
        for state in states:
            if mdp.isTerminal(state):
                continue
            value = {action: sum(prob * (mdp.getReward(state,action,next_state) + discount*self.values[next_state])
                    for next_state, prob in mdp.getTransitionStatesAndProbs(state, action))
                    for action in mdp.getPossibleActions(state)}
            self.bestact[state] = max(value, key=value.get)
            v[state] = value[self.bestact[state]] 
            for action in value.keys():
                self.qvalues[state,action] = value[action]
        self.values = v.copy()
# Example #25
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0
        for i in range(iterations):
            tmp = util.Counter()
            for state in mdp.getStates():
                tmp[state] = self.values[state]
            for state in mdp.getStates():
                if not mdp.isTerminal(state):
                    q_value = float("-inf")
                    for action in mdp.getPossibleActions(state):
                        q_value = max(q_value, self.computeQValueFromValues(state, action))
                    tmp[state] = q_value
            self.values = tmp
# Example #26
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        for loop in range(self.iterations):
            value_dict = util.Counter()
            mdp_list = mdp.getStates()
            for state in mdp_list:
                #check whether the state is terminal
                if not mdp.isTerminal(state):
                    #make counter_dict and tuple for possible actions
                    counter_dict = util.Counter()
                    actions_tuple = mdp.getPossibleActions(state)
                    #iterate and get qValue
                    for action in actions_tuple:
                        counter_dict[action] = self.computeQValueFromValues(
                            state, action)
                    #get max of the counter_dict values
                    value_dict[state] = max(counter_dict.values())
            #make a copy and update values
            self.values = value_dict.copy()
# Example #27
  def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
      Your value iteration agent should take an mdp on
      construction, run the indicated number of iterations
      and then act according to the resulting policy.
    
      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter() # A Counter is a dict with default 0
     
    "*** YOUR CODE HERE ***"
    
    for time in range(iterations):
      values = util.Counter()
      for state in mdp.getStates():
        if mdp.isTerminal(state):
          values[state] = 0
        else:
          maxValue = float("-inf")
          for action in mdp.getPossibleActions(state):
            maxValue = max(maxValue, self.getQValue(state, action))
          values[state] = maxValue
      self.values = values
# Example #28
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0
        newValues = util.Counter()
        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        allStates = mdp.getStates()

        for i in range(iterations):
            for s in allStates:
                if mdp.isTerminal(s): continue
                mx = float("-inf")
                for a in mdp.getPossibleActions(s):
                    score = 0
                    for (sp, tp) in mdp.getTransitionStatesAndProbs(s, a):
                        score += tp * (mdp.getReward(s, a, sp) +
                                       self.discount * self.values[sp])
                    if score > mx:
                        mx = score
                newValues[s] = mx
            self.values = newValues.copy()
# Example #29
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """

        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0
        self.tempvalues = self.values.copy()
        # Write value iteration code here
        #print "start location ", mdp.getStates()
        "*** YOUR CODE HERE ***"
        for i in range(0, self.iterations):
            for allstate in mdp.getStates():
                if not mdp.isTerminal(allstate):
                    value = []
                    for possibleaction in mdp.getPossibleActions(allstate):
                        value.append(self.computeQValueFromValues(allstate, possibleaction))
                    self.tempvalues[allstate] = max(value)
            self.values = self.tempvalues.copy()
    def computeActionFromValues(self, state):
        """
          The policy is the best action in the given state
          according to the values currently stored in self.values.

          You may break ties any way you see fit.  Note that if
          there are no legal actions, which is the case at the
          terminal state, you should return None.
        """
        "*** YOUR CODE HERE ***"

        mdp = self.mdp
        possibleActions = mdp.getPossibleActions(state)
        maxActionValue = float('-inf')
        maxAction = None

        if not possibleActions or mdp.isTerminal(state):
            return None

        for action in possibleActions:
            actionSum = self.getQValue(state, action)

            #Find the maximum action
            if maxActionValue < actionSum:
                maxAction = action
                maxActionValue = actionSum

        return maxAction
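
    # Hypothetical usage: extract the greedy policy for every state
    # (agent and mdp assumed constructed elsewhere):
    #
    #     policy = {s: agent.computeActionFromValues(s) for s in mdp.getStates()}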
    def runValueIteration(self):
        # Write value iteration code here
        for i in range(0, self.iterations):
            newValues = util.Counter()
            mdp = self.mdp
            statesInMDP = mdp.getStates()
            for currentState in statesInMDP:
                if (not mdp.isTerminal(currentState)):
                    actions = mdp.getPossibleActions(currentState)
                    maxActionReward = -math.inf
                    for currentAction in actions:
                        currentActionReward = self.computeQValueFromValues(
                            currentState, currentAction)

                        if (currentActionReward > maxActionReward):
                            maxActionReward = currentActionReward
                    newValues[currentState] = maxActionReward
                else:
                    newValues[currentState] = self.getValue(currentState)
            self.values = newValues

    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        "*** YOUR CODE HERE ***"
        for i in range(0, iterations):
            new_values = copy.copy(self.values)  # requires a module-level `import copy`
            for s in mdp.getStates():

                if mdp.isTerminal(s):
                    new_values[s] = 0.0
                else:
                    values = []
                    actions = mdp.getPossibleActions(s)
                    for anaction in actions:
                        values.append(self.getQValue(s, anaction))
                    new_values[s] = max(values)
            self.values = new_values
# Example #33
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        for i in range(0, iterations):
            valuesCopy = self.values.copy()
            for state in mdp.getStates():
                maximumScore = float("-inf")
                for action in mdp.getPossibleActions(state):
                    currQVal = self.computeQValueFromValues(state, action)
                    maximumScore = max(maximumScore, currQVal)
                if mdp.isTerminal(state):
                    valuesCopy[state] = 0
                else:
                    valuesCopy[state] = maximumScore
            self.values = valuesCopy
# Example #34
    def computeActionFromValues(self, state):
        """
          The policy is the best action in the given state
          according to the values currently stored in self.values.

          You may break ties any way you see fit.  Note that if
          there are no legal actions, which is the case at the
          terminal state, you should return None.
        """
        "*** YOUR CODE HERE ***"
        mdp = self.mdp
        possibleActions = mdp.getPossibleActions(state)
        valuesForAction = util.Counter()
        if (mdp.isTerminal(state)):
            return None

        for action in possibleActions:
            tp = mdp.getTransitionStatesAndProbs(state, action)
            total = 0  # renamed from `sum` to avoid shadowing the built-in
            for nextState, prob in tp:
                total += prob * (mdp.getReward(state, action, nextState) +
                                 self.discount * self.values[nextState])
            valuesForAction[action] = total

        if (valuesForAction.totalCount() == 0):
            return possibleActions[0]
        return valuesForAction.argMax()
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() 
        
        for i in range(iterations): # running the alg on the indicated number of iterations
            y = self.values.copy() #V sub k-1
            
            for state in mdp.getStates():
                actions = util.Counter()
                
                if not mdp.isTerminal(state):
                    for possibleActions in mdp.getPossibleActions(state):

                        for transitionState, prob in mdp.getTransitionStatesAndProbs(state, possibleActions):
                                value_iteration = prob * (mdp.getReward(state, possibleActions, transitionState) + (discount* y[transitionState]))
                                actions[possibleActions] += value_iteration
                    self.values[state] = actions[actions.argMax()] 
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0
        helper_vector = util.Counter() # Copy of vectors to be used for batch updating 
        
        for i in range(self.iterations):
            for state in mdp.getStates():
                if mdp.isTerminal(state):
                    continue
                if mdp.getPossibleActions(state):
                    helper_vector[state] = sum([transition[1]*(mdp.getReward(state,mdp.getPossibleActions(state)[0],transition[0])+self.discount*self.values[transition[0]])
                        for transition in mdp.getTransitionStatesAndProbs(state, mdp.getPossibleActions(state)[0])] )
                for action in mdp.getPossibleActions(state):
                    helper_vector[state] = max(helper_vector[state],sum([ transition[1]*(mdp.getReward(state, action, transition[0])+self.discount*self.values[transition[0]])
                        for transition in mdp.getTransitionStatesAndProbs(state, action)] ))
            for state in helper_vector:
                self.values[state] = helper_vector[state]
# Example #38
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        # Write value iteration code here

        for i in range(self.iterations):
            valCopy = self.values.copy()
            for state in self.mdp.getStates():
                maxVal = float("-inf")
                if not mdp.isTerminal(state):
                    for action in mdp.getPossibleActions(state):
                        maxVal = max(
                            maxVal,
                            self.computeQValueFromValues(state, action))
                    valCopy[state] = maxVal
            self.values = valCopy
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        for i in range(iterations):
            temp = self.values.copy()
            for j in mdp.getStates():
                vlist = []
                actions = mdp.getPossibleActions(j)
                if not mdp.isTerminal(j):
                    for k in actions:
                        tran = mdp.getTransitionStatesAndProbs(j, k)
                        val = 0
                        for m in tran:
                            val += m[1] * (mdp.getReward(j, k, m[0]) + self.discount * temp[m[0]])
                        vlist.append(val)
                    self.values[j] = max(vlist)
    def computeActionFromValues(self, state):
        """
          The policy is the best action in the given state
          according to the values currently stored in self.values.

          You may break ties any way you see fit.  Note that if
          there are no legal actions, which is the case at the
          terminal state, you should return None.
        """
        "*** YOUR CODE HERE ***"
        mdp = self.mdp
        max_val = float("-inf")
        direction = None
        if mdp.isTerminal(state):
            return None  # no legal actions at the terminal state
        for action in mdp.getPossibleActions(state):
            Q_value = self.computeQValueFromValues(state, action)
            if max_val < Q_value:
                max_val = Q_value
                direction = action

        return direction
# Example #41
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        for i in range(iterations):
            oldv = self.values.copy()
            for s in mdp.getStates():
                if (mdp.isTerminal(s)):
                    self.values[s] = 0
                    continue
                maxu = None
                for a in mdp.getPossibleActions(s):
                    eu = 0
                    for (sp, p) in mdp.getTransitionStatesAndProbs(s, a):
                        r = mdp.getReward(s, a, sp)
                        r += self.discount * oldv[sp]
                        eu += p * r
                    if (maxu is None or eu > maxu): maxu = eu
                self.values[s] = maxu
# Example #42
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        states = self.mdp.getStates()
        for i in range(self.iterations):
            tempValue = util.Counter()
            for s in states:
                if mdp.isTerminal(s):
                    val = 0
                    tempValue[s] = val
                else:
                    val = float("-inf")
                    for a in self.mdp.getPossibleActions(s):
                        temp_value = self.computeQValueFromValues(s, a)
                        val = max(temp_value, val)
                    tempValue[s] = val
            self.values = tempValue
# Example #43
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        next_values = util.Counter()
        for i in range(0, iterations):
            for state in mdp.getStates():
                if not mdp.isTerminal(state):
                    arr = []

                    for action in self.mdp.getPossibleActions(state):
                        arr.append(self.getQValue(state, action))

                    next_values[state] = max(arr)

            self.values = next_values.copy()
Example #44
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        for index in range(self.iterations):
            nextValues = util.Counter()
            for state in mdp.getStates():
                if mdp.isTerminal(state):
                    # terminal states have 0 value;
                    # their actual 'value' is the transition reward
                    continue

                qValues = []
                for action in mdp.getPossibleActions(state):
                    qValues.append(self.computeQValueFromValues(state, action))
                nextValues[state] = max(qValues)

            self.values = nextValues
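
        # Micro-example of the terminal-state convention above (a sketch,
        # assuming a Gridworld-style deterministic 'exit' action): the reward
        # arrives on the transition INTO the terminal state, so
        #   Q(preTerminal, 'exit') = 1.0 * (exitReward + discount * V(terminal))
        #                          = exitReward,    because V(terminal) = 0.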
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0
        self.ValuesDup = util.Counter()
        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        iterations = self.iterations
        while iterations > 0:
            for astate in mdp.getStates():
                if not mdp.isTerminal(astate):
                    QVallist = []
                    for action in mdp.getPossibleActions(astate):
                        QVallist.append(self.computeQValueFromValues(astate, action))
                    self.values[astate] = max(QVallist)
            # keep a copy of the finished sweep's values
            for state, value in self.values.items():
                self.ValuesDup[state] = value
            iterations -= 1
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        self.qTable = {}
        self.vTable = {}
        for state in mdp.getStates():
            self.vTable[state] = 0
            self.qTable[state] = {}
            for action in mdp.getPossibleActions(state):
                self.qTable[state][action] = 0

        for depth in range(self.iterations):
            tempTable = {}
            for state in mdp.getStates():
                stateValue = 0
                if not mdp.isTerminal(state):
                    stateValue = float("-inf")
                    for action in mdp.getPossibleActions(state):
                        qTotal = 0
                        for nextState, prob in mdp.getTransitionStatesAndProbs(state, action):
                            reward = mdp.getReward(state, action, nextState)
                            qTotal += prob * (reward + self.discount * self.vTable[nextState])
                        self.qTable[state][action] = qTotal
                        stateValue = max(stateValue, qTotal)
                tempTable[state] = stateValue  # terminal states keep value 0
            self.vTable = tempTable

        # one final pass to refresh the Q-table against the finished values
        for state in mdp.getStates():
            for action in mdp.getPossibleActions(state):
                qTotal = 0
                for nextState, prob in mdp.getTransitionStatesAndProbs(state, action):
                    reward = mdp.getReward(state, action, nextState)
                    qTotal += prob * (reward + self.discount * self.vTable[nextState])
                self.qTable[state][action] = qTotal
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0
         
        "*** YOUR CODE HERE ***"
        states = mdp.getStates()

        #Iterate through user-defined number of iterations
        for num in range(iterations):
            temp = util.Counter()

            #Compute Ut+1 for all states
            for state in states:
                
                if mdp.isTerminal(state):
                    self.values[state] = 0
                    continue
                
                actions = mdp.getPossibleActions(state)
                maxVal = float("-inf")

                #iterate through trans of each action of the state and sum up values 
                for action in actions:
                    transitions = mdp.getTransitionStatesAndProbs(state, action)
                    totalSum = 0
                    
                    for transition in transitions:
                        #transition[0] = nextState, transition[1] = probability
                        reward = mdp.getReward(state, action, transition[0])
                        #value of the nextState
                        UtValue = self.values[transition[0]]
                        #the standard value iteration (Bellman) backup
                        totalSum += transition[1]*(reward + discount * UtValue)
                    maxVal = max(maxVal, totalSum)

                #writing to self.values directly here would let later states in
                #this same sweep read the new value (an asynchronous update),
                #so the result is staged in temp and copied in after the sweep
                temp[state] = maxVal
            
            for state in states:
                self.values[state] = temp[state]
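
    # A minimal sketch of the batch sweep pattern the staging above relies on,
    # assuming the computeQValueFromValues helper sketched earlier; the method
    # name runBatchSweeps is hypothetical:
    def runBatchSweeps(self):
        for k in range(self.iterations):
            newValues = util.Counter()
            for s in self.mdp.getStates():
                if self.mdp.isTerminal(s):
                    continue  # terminal states keep the Counter default of 0
                newValues[s] = max(self.computeQValueFromValues(s, a)
                                   for a in self.mdp.getPossibleActions(s))
            self.values = newValues  # swap once per sweep, never mid-sweep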
  def __init__(self, mdp, discount = 0.9, iterations = 100):
    """
      Your value iteration agent should take an mdp on
      construction, run the indicated number of iterations
      and then act according to the resulting policy.
    
      Some useful mdp methods you will use:
          mdp.getStates()
          mdp.getPossibleActions(state)
          mdp.getTransitionStatesAndProbs(state, action)
          mdp.getReward(state, action, nextState)
          mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter() # A Counter is a dict with default 0
     
    "*** YOUR CODE HERE ***"
    oldValues = util.Counter()
    self.policy = util.Counter()

    
    
    states=self.mdp.getStates()
    
    for k in range (iterations):
      

      
      
      for state in states:
        
        
              

        actionValues = util.Counter()
        actions=self.mdp.getPossibleActions(state)
        for action in actions:
          aux=self.mdp.getTransitionStatesAndProbs(state, action)
          for things in aux:
            #print "nextState:",things[0]
            #print "prob:",things[1]
            actionValues[action]+=things[1]*(mdp.getReward(state,action,things[0])+self.discount*oldValues[things[0]])
        tmp = actionValues.argMax()
        self.values[state] = actionValues[tmp]
        if mdp.isTerminal(state):
          self.policy[state] = None
          
        else:
          self.policy[state] = tmp     
      oldValues=self.values.copy()
 def computeValue(self, mdp, state, discount):
   if mdp.isTerminal(state):
     return  # terminal states keep their default value
   valueList = []
   for action in mdp.getPossibleActions(state):
     value = 0
     for nextState, prob in mdp.getTransitionStatesAndProbs(state, action):
       value += prob * (mdp.getReward(state, action, nextState)
                        + discount * self.getValue(nextState))
     valueList.append(value)
   self.tmpValues[state] = max(valueList)
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"

        # fill every state with some action.
        self.actions = dict()
        for state in mdp.getStates():
            stateActions = mdp.getPossibleActions(state)
            if len(stateActions) > 0:
                action = stateActions[0]
                self.actions[state] = action

        for i in range(iterations):
            # make a copy of all the values.
            # this copy will get modified in the for-loop,
            # and at the end of the loop,
            # the new values will become the real values.
            nextValues = self.values.copy()

            # for every state, and if it isn't a terminal state
            # (you can't do any action on a terminal state):
            for state in mdp.getStates():
                if not mdp.isTerminal(state):
                    # get the best action.
                    action = self.computeActionFromValues(state)
                    self.actions[state] = action
                        
                    # get the value for doing the currently stored action.
                    nextValues[state] = self.computeQValueFromValues(state, action)

            # copy the new values over the old values.
            self.values.update(nextValues)
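
    # The example above leans on computeActionFromValues for its greedy
    # policy. A minimal sketch of that helper (a plain argmax over Q-values;
    # the tie-breaking behaviour is an assumption, not fixed by the excerpt):
    def computeActionFromValues(self, state):
        bestAction, bestQ = None, float("-inf")
        for action in self.mdp.getPossibleActions(state):
            q = self.computeQValueFromValues(state, action)
            if q > bestQ:
                bestAction, bestQ = action, q
        return bestAction  # None for terminal states, which have no actions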
Example #51
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        # NOTE: The naming conventions are a bit off. This is to emphasize the
        #       connection between code and the formulas we were provided with.
        #       Enjoy!

        values_temp = util.Counter()
        for i in range(iterations):
            for state in mdp.getStates():
                Q_max = float("-inf")

                # A terminal state has no actions, so we must be careful to
                # reset the value to zero here.
                if mdp.isTerminal(state):
                    Q_max = 0.0

                # This is a trivial loop to find the 'best' possible action in
                # the current state, according to computed Q values.  This is
                # essentially the Pythonic way of saying the following:
                #   V_k+1(s) <- max Q(s, a)
                for action in mdp.getPossibleActions(state):
                    Q = self.getQValue(state, action)
                    if Q > Q_max:
                        Q_max = Q

                values_temp[state] = Q_max

            # Store the new values.
            self.values = values_temp
            values_temp = util.Counter()
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
#        print "Final state data"
#        print self.mdp.getPossibleActions((3,1))
#        print self.mdp.getTransitionStatesAndProbs((3,1),"exit")
#        print "Final state data"
        
        for i in range(self.iterations):
            
            newValues = self.values.copy()
            
            for state in self.mdp.getStates():
#                print "state is "+ str(state)
                if (mdp.isTerminal(state)):
                    continue
                    
                possibleActions = self.mdp.getPossibleActions(state)
#                print possibleActions
                maxVal=-999
                
                for action in possibleActions:
#                    print action
                    val = self.computeQValueFromValues(state,action)
#                    print val
                    
                    if val>maxVal:
                        maxVal = val
                newValues[state] = maxVal
            
            self.values = newValues.copy()
    def __init__(self, mdp, discount = 0.9, iterations = 1000):
        """
          Your cyclic value iteration agent should take an mdp on
          construction, run the indicated number of iterations,
          and then act according to the resulting policy. Each iteration
          updates the value of only one state, which cycles through
          the states list. If the chosen state is terminal, nothing
          happens in that iteration.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = collections.defaultdict(float)
        states = self.mdp.getStates()
        for state in states:
            self.values[state] = 0

        "*** YOUR CODE HERE ***"
        for i in range(iterations):
            state = states[i%len(states)]
            if not mdp.isTerminal(state):
                reward = mdp.getReward(state)                
                actions = mdp.getPossibleActions(state)
                EUActions = []
                for action in actions:
                    EUAction = 0
                    transitions = mdp.getTransitionStatesAndProbs(state, action)
                    for transition in transitions:
                        transitionState = transition[0]
                        transitionUtility = self.values[transitionState]
                        transitionProbability = transition[1]
                        EUAction += transitionUtility*transitionProbability
                    EUActions.append(EUAction)
                maxEU = max(EUActions)
                updatedUtility = reward + discount*maxEU
                self.values[state] = updatedUtility
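        # Note on the reward convention: unlike the other examples, this MDP
        # exposes a state-only reward via mdp.getReward(state), so the backup
        # is V(s) <- R(s) + discount * max_a sum_s' P(s'|s,a) * V(s'),
        # rather than folding R(s, a, s') inside the expectation.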
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0
        self.action_array = {}
        self.last_values = util.Counter()

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        for state in mdp.getStates():
            self.values[state] = 0.0
        while self.iterations > 0:
            for state in mdp.getStates():
                qact = {}
                if mdp.isTerminal(state):
                     continue
                else:
                    currentActions = mdp.getPossibleActions(state)
                    for action in currentActions:
                        qact[action] = self.computeQValueFromValues(state, action)
                        #print action, qact.values()
                    #for v in qact.values():
                    #    tmp.append(v)
                    self.values[state] = max(qact.values())
                    for k, v in qact.items():
                        #print state
                        #print k, v
                        if v == self.values[state]:
                            self.action_array[state] = k

            self.last_values = self.values.copy()
            #print self.last_values
            self.iterations -= 1
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"

        for iterate in range(self.iterations):
          Unew = util.Counter()
          for state in mdp.getStates():
            if mdp.isTerminal(state):
              Unew[state] = 0
              continue

            maxQvalue = float("-inf")
            for a in mdp.getPossibleActions(state):
              qValue = self.getQValue(state, a)
              if qValue > maxQvalue:
                maxQvalue = qValue
                Unew[state] = qValue
          self.values = Unew
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        for iteration in range(iterations):
            newvalues = util.Counter()
            for state in mdp.getStates():
                if mdp.isTerminal(state):
                    newvalues[state] = 0
                    continue

                possibleActions = mdp.getPossibleActions(state)
                if not possibleActions:   # getPossibleActions returns an
                    newvalues[state] = 0  # empty sequence, never None
                    continue

                maxActionValue = float('-inf')
                for action in possibleActions:
                    actionsum = self.getQValue(state, action)
                    #Find the maximum action value
                    if maxActionValue < actionsum:
                        maxActionValue = actionsum

                #maxActionValue is now V_{k+1}(state) for this iteration
                newvalues[state] = maxActionValue
            self.values = newvalues
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        # State = Coordinate
        for iteration in range(iterations):
            newValues = util.Counter()
            oldValues = self.values.copy()
            allStates = mdp.getStates()

            for state in allStates:
                if not mdp.isTerminal(state):
                    rewardsForActions = util.Counter()
                    possibleActions = mdp.getPossibleActions(state)

                    for possibleAction in possibleActions:
                        transitionStatesAndProbs = mdp.getTransitionStatesAndProbs(state, possibleAction)

                        for transitionStateAndProb in transitionStatesAndProbs:
                            newState = transitionStateAndProb[0]
                            prob = transitionStateAndProb[1]
                            rewardForTransition = mdp.getReward(state, possibleAction, newState)

                            rewardsForActions[possibleAction] = rewardsForActions[possibleAction] + prob * (
                                rewardForTransition + (discount * oldValues[newState])
                            )
                    # take the max only after every action's Q-value is summed
                    maxAction = rewardsForActions.argMax()
                    newValues[state] = rewardsForActions[maxAction]
            self.values = newValues
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        counter = 0
        for state in self.mdp.getStates():
            self.values[state] = 0
        while counter < iterations:
            newValues = util.Counter()
            for state in self.mdp.getStates():
                if not mdp.isTerminal(state):
                    maxValue = float("-inf")
                    for action in self.mdp.getPossibleActions(state):
                        value = 0
                        for nextState, prob in self.mdp.getTransitionStatesAndProbs(state, action):
                            # terminal values are always 0, so no special case
                            # for 'TERMINAL_STATE' is needed here
                            discounted = discount * self.getValue(nextState)
                            value += (self.mdp.getReward(state, action, nextState) + discounted) * prob
                        if value > maxValue:
                            maxValue = value
                    newValues[state] = maxValue
            for state in newValues:
                self.values[state] = newValues[state]
            counter += 1
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        print "-----------------------------------------------------"
        "*** MY CODE BEGINS ***"
        k = 0
        while k < iterations:
            val = self.values.copy()  #before each iteration, copy one.
            for s in mdp.getStates():
                if mdp.isTerminal(s) == False:
                    max = -999999
                    for action in mdp.getPossibleActions(s):
                        v = 0
                        for pos_pro in mdp.getTransitionStatesAndProbs(s,action):
                            v = v + pos_pro[1]*(mdp.getReward(s,action,pos_pro[0])+discount*self.values[pos_pro[0]])
                        if v > max:
                            max = v
                    val[s] = max
                else:
                    for action in mdp.getPossibleActions(s):
                        v = 0
                        for pos_pro in mdp.getTransitionStatesAndProbs(s,action):
                            v = v + pos_pro[1]*(mdp.getReward(s,action,pos_pro[0])+discount*self.values[pos_pro[0]])
                        val[s] = v
            k = k+1
            for s in mdp.getStates():
                self.values[s] = val[s]
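
    # A common refinement none of the examples implement: stop sweeping early
    # once the largest per-state change (the Bellman residual) drops below a
    # tolerance. A sketch; the name runUntilConverged and the tol parameter
    # are hypothetical, and computeQValueFromValues is the helper sketched
    # earlier:
    def runUntilConverged(self, tol=1e-6):
        for k in range(self.iterations):
            newValues = util.Counter()
            for s in self.mdp.getStates():
                if not self.mdp.isTerminal(s):
                    newValues[s] = max(self.computeQValueFromValues(s, a)
                                       for a in self.mdp.getPossibleActions(s))
            residual = max(abs(newValues[s] - self.values[s])
                           for s in self.mdp.getStates())
            self.values = newValues
            if residual < tol:
                break  # further sweeps would change the values negligibly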