Example 1
    def __init__(self, mdp, discount=0.9, iterations=100, max_iters=100):
        """
          Your RTDP agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
              mdp.getStartState()

          Other useful functions:
              weighted_choice(choices)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = {
        }  # note, we use a normal python dictionary for RTDPAgent.

        # Write rtdp code here
        start_state = mdp.getStartState()
        start_time = time.time()  #log
        self.setGoalStatesAndRewards()
        #print("precondition time:%f" % (time.time() - start_time))
        for i in range(iterations):
            #print(i)
            self.RTDPTrialReverse(start_state)
Example 2
 def __init__(self, mdp, discount=0.9, iterations=100):
     """
   Your value iteration agent should take an mdp on
   construction, run the indicated number of iterations
   and then act according to the resulting policy.
 
   Some useful mdp methods you will use:
       mdp.getStates()
       mdp.getPossibleActions(state)
       mdp.getTransitionStatesAndProbs(state, action)
       mdp.getReward(state, action, nextState)
 """
     self.mdp = mdp
     self.discount = discount
     self.iterations = iterations
     self.values = util.Counter()  # A Counter is a dict with default 0
     self.tmpValues = util.Counter()
     iterationsCompleted = 0
     startState = mdp.getStartState()
     while (iterationsCompleted < iterations):
         for state in mdp.getStates():
             self.computeValue(mdp, state, discount)
         for key in self.tmpValues:
             self.values[key] = self.tmpValues[key]
         iterationsCompleted += 1
Example 3
 def __init__(self, mdp, discount = 0.9, iterations = 100):
   """
     Your value iteration agent should take an mdp on
     construction, run the indicated number of iterations
     and then act according to the resulting policy.
   
     Some useful mdp methods you will use:
         mdp.getStates()
         mdp.getPossibleActions(state)
         mdp.getTransitionStatesAndProbs(state, action)
         mdp.getReward(state, action, nextState)
   """
   self.mdp = mdp
   self.discount = discount
   self.iterations = iterations
   self.values = util.Counter() # A Counter is a dict with default 0
    self.tmpValues = util.Counter()
    iterationsCompleted = 0
    startState = mdp.getStartState()
   while (iterationsCompleted < iterations):
     for state in mdp.getStates():
       self.computeValue(mdp,state,discount)
     for key in self.tmpValues:
       self.values[key] = self.tmpValues[key]
     iterationsCompleted += 1
Example 4
    def __init__(self, mdp, discount=0.9, iterations=100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter()  # A Counter is a dict with default 0

        startState = mdp.getStartState()
        maxVal = None
        for iteration in xrange(iterations):
            values = self.values.copy()
            for state in mdp.getStates():
                actions = self.mdp.getPossibleActions(state)
                if not actions:
                    values[state] = 0
                else:
                    values[state] = \
                      max(self.computeQValueFromValues(state, action)
                          for action in actions)
            self.values = values
Example 5
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        state = mdp.getStartState()
        for i in range(0,iterations):
            #print "iteration: ", i
            #iterate once through all states and actions, save q-values
            for state in mdp.getStates():
                for action in mdp.getPossibleActions(state):
                    #compute qValue for each action
                    qValue = self.getQValue(state, action)
                    self.values[(state,action)] = qValue
            # After all Q-values are computed, iterate again through the states and save the value of the best action;
            # these values become V* for the next iteration.
            for state in mdp.getStates():
                action = self.getAction(state)
                self.values[state] = self.values[(state, action)] 

        """
Example 6
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        startState = mdp.getStartState()
        maxVal = None
        for iteration in xrange(iterations):
            values = self.values.copy()
            for state in mdp.getStates():
                actions = self.mdp.getPossibleActions(state)
                if not actions:
                    values[state] = 0
                else:
                    values[state] = \
                      max(self.computeQValueFromValues(state, action)
                          for action in actions)
            self.values = values
Example 7
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.
          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0
        """
        1. loop on iterations through states, starting from startState
        2. T(s,a,s') = mdp.getTransitionStatesAndProbs
        3. actions = mdp.getPossibleActions(state)
        4. R(s,a,s') = mdp.getReward(state, action, nextState) ; nextState is from Transition value tuple
        5. PLUS GAMMA (Discount Factor) * ????
        6. FINAL FORMULATION: Summation of T(s,a,s')*(R(s,a,s')+GAMMA*????) ---> is the value in the tuple, action is the current iteration of the state
        7. Do comparison on current value and previous value from Step 6.
        8. Maintain a tuple that saves the highest value associated with an action; return value
        max(self.actions[state], key=lambda d: d[1])
        """
        "*** YOUR CODE HERE ***"

        #Local Declarations
        startState = mdp.getStartState()
        count = 0
        self.actions = dict()
        qVals = dict()
        #loop to iterate through iterations
        while count < self.iterations:
          stateList = self.mdp.getStates()
          #iterate through state list to get each action list
          for s in stateList:
            self.actions[s] = list()
            actionList = self.mdp.getPossibleActions(s)
            #iterate through each action in action list
            for action in actionList:
              temp = self.computeQValueFromValues(s, action)
              self.actions[s].append((action, temp))
            if self.actions[s]:
              qVals[s] = max(self.actions[s], key=lambda d: d[1])[1]
          #update self.values dictionary
          for s in stateList:
            if self.mdp.isTerminal(s):
              self.values[s] = self.mdp.getReward(s, None, None)
            else:
              self.values[s] = qVals[s]
          count += 1
Example 8
    def __init__(self, mdp, discount=0.9, iterations=100, max_iters=100):
        """
          Your RTDP agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
              mdp.getStartState()

          Other useful functions:
              weighted_choice(choices)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = {
        }  # note, we use a normal python dictionary for RTDPAgent.

        # Write rtdp code here
        self.start_state = mdp.getStartState()
        self.states = mdp.getStates()
        self.use_stack = True
        self.stack = []

        for _ in range(self.iterations):
            current_state = self.start_state
            steps = 0
            while not self.mdp.isTerminal(current_state) and steps < max_iters:
                steps += 1
                # Pick the best action and update hash
                action, utility = self.greedyAction(current_state)

                if not self.use_stack:
                    self.values[current_state] = utility
                else:
                    self.stack.append((current_state, utility))

                # Stochastically simulate next state
                current_state = self.pickNextState(current_state, action)

            if self.use_stack and self.stack:
                # Back up the recorded utilities in reverse (deepest state first).
                for state, utility in reversed(self.stack):
                    self.values[state] = utility
                # Reset
                self.stack = []
Example 9
    def __init__(self, mdp, discount, threshold):
        self.mdp = mdp
        self.ɣ = discount
        self.θ = threshold

        # initialize values and policy arbitrarily
        self.V = util.Counter()

        self.π = {}
        a = mdp.getPossibleActions(mdp.getStartState())[0]
        for s in self.mdp.getStates():
            self.π[s] = a
        self.iterate()
Example 10
def get_average_reward(agent, mdp):
    current_state = mdp.getStartState()
    iterations = 0
    total = 0
    while not mdp.isTerminal(current_state) and iterations < 1000:
        total += agent.getValue(current_state)
        action = agent.getPolicy(current_state)
        next_states = mdp.getTransitionStatesAndProbs(current_state, action)
        # Follow the first listed successor (this ignores the transition probabilities).
        current_state = next_states[0][0]
        iterations += 1

    # Guard against division by zero when the start state is already terminal.
    if mdp.isTerminal(current_state) and iterations:
        return total / iterations
    return 0
Example 11
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0
        states = self.mdp.getStates()
        self.actions = dict(zip(states, [None] * len(states)))
        # Write value iteration code here
        state = mdp.getStartState()
        "*** YOUR CODE HERE ***"
        # print('startstate', state)
        # print(states)
        temp0 = util.Counter()
        for i in range(self.iterations):
            for state in states:
                possActions = mdp.getPossibleActions(state)
                # print(mdp.getPossibleActions(state))
                maxQ = float("-inf")
                maxAction = None
                for action in possActions:
                    temp = self.computeQValueFromValues(state, action)
                    if temp > maxQ:
                        maxQ = temp
                        maxAction = action
                        temp0[state] = maxQ
                    self.actions[state] = maxAction
            self.values = temp0.copy()
Example 12
    def __init__(self, mdp, discount = 0.9, iterations = 100):
        """
          Your value iteration agent should take an mdp on
          construction, run the indicated number of iterations
          and then act according to the resulting policy.

          Some useful mdp methods you will use:
              mdp.getStates()
              mdp.getPossibleActions(state)
              mdp.getTransitionStatesAndProbs(state, action)
              mdp.getReward(state, action, nextState)
              mdp.isTerminal(state)
        """
        self.mdp = mdp
        self.discount = discount
        self.iterations = iterations
        self.values = util.Counter() # A Counter is a dict with default 0

        # Write value iteration code here
        "*** YOUR CODE HERE ***"
        #get all states
        if self.mdp.isTerminal(mdp.getStartState()):
            return None
        allStates = mdp.getStates()
        vNew = util.Counter()
        for iteration in range(iterations):
            for newState in allStates:
                # Compute Q-values for all actions from this state.
                actionsFromNewState = mdp.getPossibleActions(newState)
                qValues = []  # store Q-values
                for paction in actionsFromNewState:
                    qValues.append(self.computeQValueFromValues(newState, paction))
                if len(qValues) > 0:
                    vNew[newState] = max(qValues)
            self.values = vNew
            vNew = util.Counter()
Example 13
                'extractor': extractor,
                'actionFn': actionFn}
  agent = ApproximateSarsaAgent(**qLearnOpts)

  if TYPE == "43t" or TYPE == "43i":
    weights = pickle.load(open("weights" + TYPE + ".p", "rb"))
    agent.weights = featureExtractors.keepwayWeightTranslation(weights)
    agent.workingWeights = agent.weights.copy()

  tList = []
  for _ in xrange(episodes):
    t = 0
    lastT = None
    prevState = None
    prevAction = None
    state = mdp.getStartState()

    while True:
      if mdp.isTerminal(state) or t > 1000:
        if lastT is not None:
          agent.update(prevState, prevAction, state, t - lastT)
        break
      
      if mdp.weHaveBall(state):
        if lastT is not None:
          agent.update(prevState, prevAction, state, t - lastT)
          #print prevAction
        prevState = state
        lastT = t

      action = agent.getAction(state)
      if action is not None:
        prevAction = action