def __init__(self, mdp, discount=0.9, iterations=100, max_iters=100):
    """
    Your value RTDP agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the
    resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
        mdp.getStartState()

    Other useful functions:
        weighted_choice(choices)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = {}  # note, we use a normal python dictionary for RTDPAgent.

    # Write rtdp code here
    start_state = mdp.getStartState()
    start_time = time.time()  # log (requires `import time` at module level)
    self.setGoalStatesAndRewards()
    # print("precondition time: %f" % (time.time() - start_time))
    for i in range(iterations):
        # print(i)
        self.RTDPTrialReverse(start_state)
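# The constructor above calls two helpers that are not shown in this snippet:
# setGoalStatesAndRewards() and RTDPTrialReverse(start_state). Below is a
# minimal sketch of what such a trial could look like, assuming a standard
# RTDP trial: simulate a greedy rollout from the start state, then Bellman-back
# up the visited states in reverse order. The body, the 1000-step cap, and the
# assumption that weighted_choice() accepts (item, probability) pairs (as
# returned by getTransitionStatesAndProbs) are guesses, not the original code.
def RTDPTrialReverse(self, state):
    visited = []
    steps = 0
    while not self.mdp.isTerminal(state) and steps < 1000:
        visited.append(state)
        # One-step lookahead: pick the greedy action under the current values.
        actions = self.mdp.getPossibleActions(state)
        qvalues = [(a, sum(prob * (self.mdp.getReward(state, a, ns)
                                   + self.discount * self.values.get(ns, 0.0))
                           for ns, prob in
                           self.mdp.getTransitionStatesAndProbs(state, a)))
                   for a in actions]
        best_action = max(qvalues, key=lambda pair: pair[1])[0]
        # Sample the successor according to the transition probabilities.
        state = weighted_choice(
            self.mdp.getTransitionStatesAndProbs(state, best_action))
        steps += 1
    # Back up the visited states from the end of the trial backwards.
    for s in reversed(visited):
        self.values[s] = max(
            sum(prob * (self.mdp.getReward(s, a, ns)
                        + self.discount * self.values.get(ns, 0.0))
                for ns, prob in self.mdp.getTransitionStatesAndProbs(s, a))
            for a in self.mdp.getPossibleActions(s))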
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run
    the indicated number of iterations and then act according to the
    resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0
    self.tmpValues = util.Counter()

    iterationsCompleted = 0
    startState = mdp.getStartState()
    while iterationsCompleted < iterations:
        # One sweep: computeValue() fills self.tmpValues, and the batch is
        # copied into self.values afterwards so every backup in this sweep
        # reads the values from the previous iteration.
        for state in mdp.getStates():
            self.computeValue(mdp, state, discount)
        for key in self.tmpValues:
            self.values[key] = self.tmpValues[key]
        iterationsCompleted += 1
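# computeValue() is not shown in this snippet. A minimal sketch, assuming it
# performs a single Bellman backup for one state and stores the result in
# self.tmpValues (which the constructor then copies into self.values):
def computeValue(self, mdp, state, discount):
    actions = mdp.getPossibleActions(state)
    if not actions:
        # Terminal states (no legal actions) keep a value of 0.
        self.tmpValues[state] = 0
        return
    qValues = []
    for action in actions:
        q = 0
        for nextState, prob in mdp.getTransitionStatesAndProbs(state, action):
            q += prob * (mdp.getReward(state, action, nextState)
                         + discount * self.values[nextState])
        qValues.append(q)
    self.tmpValues[state] = max(qValues)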
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run
    the indicated number of iterations and then act according to the
    resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    startState = mdp.getStartState()
    for iteration in xrange(iterations):
        # Batch update: compute this sweep against a copy of the old values.
        values = self.values.copy()
        for state in mdp.getStates():
            actions = self.mdp.getPossibleActions(state)
            if not actions:
                values[state] = 0
            else:
                values[state] = max(self.computeQValueFromValues(state, action)
                                    for action in actions)
        self.values = values
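# computeQValueFromValues() belongs to the same agent but is not included in
# this snippet. A minimal sketch of the standard expected-value computation it
# is assumed to perform:
def computeQValueFromValues(self, state, action):
    """Q(s, a) = sum over s' of T(s, a, s') * (R(s, a, s') + gamma * V(s'))."""
    qValue = 0
    for nextState, prob in self.mdp.getTransitionStatesAndProbs(state, action):
        reward = self.mdp.getReward(state, action, nextState)
        qValue += prob * (reward + self.discount * self.values[nextState])
    return qValue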
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run
    the indicated number of iterations and then act according to the
    resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    state = mdp.getStartState()
    for i in range(0, iterations):
        # print "iteration: ", i
        # Iterate once through all states and actions, saving Q-values keyed
        # by (state, action) in the same Counter.
        for state in mdp.getStates():
            for action in mdp.getPossibleActions(state):
                qValue = self.getQValue(state, action)
                self.values[(state, action)] = qValue
        # After all Q-values are computed, iterate again through the states and
        # save the value of the optimal action; these values are V* for the
        # next iteration.
        for state in mdp.getStates():
            action = self.getAction(state)
            self.values[state] = self.values[(state, action)]
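# getAction() and getQValue() for this agent are not shown. Given that the
# constructor stores Q-values under (state, action) keys, policy extraction
# presumably looks something like the sketch below; the method name comes from
# the call above, but this body is an assumption.
def getAction(self, state):
    actions = self.mdp.getPossibleActions(state)
    if not actions:
        return None  # terminal state: no action to take
    # Pick the action whose stored Q-value is largest.
    return max(actions, key=lambda action: self.values[(state, action)])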
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run
    the indicated number of iterations and then act according to the
    resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    """
    Plan:
    1. Loop over the iterations, sweeping through the states starting from startState.
    2. T(s,a,s') = mdp.getTransitionStatesAndProbs(state, action)
    3. actions = mdp.getPossibleActions(state)
    4. R(s,a,s') = mdp.getReward(state, action, nextState); nextState comes from
       the transition tuple.
    5. Plus GAMMA (discount factor) * V(s'), the value of the next state.
    6. Final formulation: Q(s,a) = sum over s' of T(s,a,s') * (R(s,a,s') + GAMMA * V(s')),
       where s' is the state in the transition tuple and a is an action of the
       current state.
    7. Compare the current value against the previous value from step 6.
    8. Maintain a tuple that saves the highest value associated with an action;
       return max(self.actions[state], key=lambda d: d[1]).
    """
    "*** YOUR CODE HERE ***"
    # Local declarations
    startState = mdp.getStartState()
    count = 0
    self.actions = dict()
    qVals = dict()
    # Loop over the requested number of iterations.
    while count < self.iterations:
        stateList = self.mdp.getStates()
        # For each state, compute the Q-value of every legal action.
        for s in stateList:
            self.actions[s] = list()
            actionList = self.mdp.getPossibleActions(s)
            for action in actionList:
                temp = self.computeQValueFromValues(s, action)
                self.actions[s].append((action, temp))
            if self.actions[s]:
                qVals[s] = max(self.actions[s], key=lambda d: d[1])[1]
        # Update the self.values dictionary as a batch after the sweep.
        for s in stateList:
            if self.mdp.isTerminal(s):
                self.values[s] = self.mdp.getReward(s, None, None)
            else:
                self.values[s] = qVals[s]
        count += 1
def __init__(self, mdp, discount=0.9, iterations=100, max_iters=100):
    """
    Your value RTDP agent should take an mdp on construction, run the
    indicated number of iterations and then act according to the
    resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
        mdp.getStartState()

    Other useful functions:
        weighted_choice(choices)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = {}  # note, we use a normal python dictionary for RTDPAgent.

    # Write rtdp code here
    self.start_state = mdp.getStartState()
    self.states = mdp.getStates()
    self.use_stack = True
    self.stack = []
    for _ in range(self.iterations):
        current_state = self.start_state
        steps = 0
        while not self.mdp.isTerminal(current_state) and steps < max_iters:
            steps += 1
            # Pick the best action and record the backed-up value.
            action, utility = self.greedyAction(current_state)
            if not self.use_stack:
                self.values[current_state] = utility
            else:
                self.stack.append((current_state, utility))
            # Stochastically simulate the next state.
            current_state = self.pickNextState(current_state, action)
        if self.use_stack and self.stack:
            # Apply the recorded updates from the end of the trial backwards.
            for i in range(len(self.stack)):
                back_index = len(self.stack) - i - 1
                state, utility = self.stack[back_index]
                self.values[state] = utility
            # Reset for the next trial.
            self.stack = []
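# greedyAction() and pickNextState() are not included in this snippet. A
# minimal sketch of what they might look like, assuming greedyAction() returns
# an (action, Q-value) pair under the current value estimates and
# pickNextState() samples a successor with weighted_choice(), which is assumed
# to accept (item, probability) pairs:
def greedyAction(self, state):
    best_action, best_q = None, float('-inf')
    for action in self.mdp.getPossibleActions(state):
        q = 0.0
        for next_state, prob in self.mdp.getTransitionStatesAndProbs(state, action):
            q += prob * (self.mdp.getReward(state, action, next_state)
                         + self.discount * self.values.get(next_state, 0.0))
        if q > best_q:
            best_action, best_q = action, q
    return best_action, best_q

def pickNextState(self, state, action):
    # Sample the next state according to the transition probabilities.
    return weighted_choice(self.mdp.getTransitionStatesAndProbs(state, action))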
def __init__(self, mdp, discount, threshhold):
    self.mdp = mdp
    self.ɣ = discount
    self.θ = threshhold
    # Initialize values arbitrarily and the policy to each state's first legal
    # action (terminal states, which have no legal actions, get None).
    self.V = util.Counter()
    self.π = {}
    for s in self.mdp.getStates():
        actions = mdp.getPossibleActions(s)
        self.π[s] = actions[0] if actions else None
    self.iterate()
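# iterate() is not shown in this snippet. Given the value table self.V, the
# policy self.π, and the threshold self.θ, it presumably runs policy iteration:
# evaluate the current policy until values change by less than θ, then greedily
# improve the policy, stopping once the policy is stable. The body below is a
# sketch under that assumption.
def iterate(self):
    stable = False
    while not stable:
        # Policy evaluation: iterate V under the current policy π to convergence.
        while True:
            delta = 0
            for s in self.mdp.getStates():
                if self.mdp.isTerminal(s) or self.π[s] is None:
                    continue
                new_v = sum(p * (self.mdp.getReward(s, self.π[s], ns) + self.ɣ * self.V[ns])
                            for ns, p in self.mdp.getTransitionStatesAndProbs(s, self.π[s]))
                delta = max(delta, abs(new_v - self.V[s]))
                self.V[s] = new_v
            if delta < self.θ:
                break
        # Policy improvement: act greedily with respect to the evaluated values.
        stable = True
        for s in self.mdp.getStates():
            actions = self.mdp.getPossibleActions(s)
            if not actions:
                continue
            best = max(actions, key=lambda a: sum(
                p * (self.mdp.getReward(s, a, ns) + self.ɣ * self.V[ns])
                for ns, p in self.mdp.getTransitionStatesAndProbs(s, a)))
            if best != self.π[s]:
                self.π[s] = best
                stable = False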
def get_average_reward(agent, mdp):
    current_state = mdp.getStartState()
    iterations = 0
    total = 0
    while not mdp.isTerminal(current_state) and iterations < 1000:
        total += agent.getValue(current_state)
        action = agent.getPolicy(current_state)
        # Follow the first listed successor of the chosen action.
        next_states = mdp.getTransitionStatesAndProbs(current_state, action)
        current_state = next_states[0][0]
        iterations += 1
    # Guard against division by zero when the start state is already terminal.
    if mdp.isTerminal(current_state) and iterations > 0:
        return total / iterations
    return 0
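# A small usage sketch for get_average_reward(). It assumes `mdp` is an
# already-constructed MDP and that the agent class from the snippets above is
# called ValueIterationAgent and exposes getValue(state) and getPolicy(state);
# those names are assumptions. Note that, as written, the function averages the
# agent's value estimates along the greedy rollout rather than immediate rewards.
agent = ValueIterationAgent(mdp, discount=0.9, iterations=100)
avg = get_average_reward(agent, mdp)
print("average value along the greedy rollout: %s" % avg)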
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run
    the indicated number of iterations and then act according to the
    resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    states = self.mdp.getStates()
    listAction = [None] * len(states)
    self.actions = dict(zip(states, listAction))

    # Write value iteration code here
    state = mdp.getStartState()
    "*** YOUR CODE HERE ***"
    # print('startstate', state)
    # print(states)
    temp0 = util.Counter()
    for i in range(self.iterations):
        for state in states:
            possActions = mdp.getPossibleActions(state)
            # print(mdp.getPossibleActions(state))
            maxQ = float("-inf")
            maxAction = None
            for action in possActions:
                temp = self.computeQValueFromValues(state, action)
                if temp > maxQ:
                    maxQ = temp
                    maxAction = action
            # States with no legal actions (terminal states) keep a value of 0
            # instead of -inf.
            temp0[state] = maxQ if possActions else 0
            self.actions[state] = maxAction
        self.values = temp0.copy()
def __init__(self, mdp, discount=0.9, iterations=100):
    """
    Your value iteration agent should take an mdp on construction, run
    the indicated number of iterations and then act according to the
    resulting policy.

    Some useful mdp methods you will use:
        mdp.getStates()
        mdp.getPossibleActions(state)
        mdp.getTransitionStatesAndProbs(state, action)
        mdp.getReward(state, action, nextState)
        mdp.isTerminal(state)
    """
    self.mdp = mdp
    self.discount = discount
    self.iterations = iterations
    self.values = util.Counter()  # A Counter is a dict with default 0

    # Write value iteration code here
    "*** YOUR CODE HERE ***"
    # If the start state is already terminal there is nothing to compute.
    if self.mdp.isTerminal(mdp.getStartState()):
        return
    allStates = mdp.getStates()
    vNew = util.Counter()
    for iteration in range(iterations):
        for newState in allStates:
            # Compute Q-values for all legal actions in this state.
            actionsFromNewState = mdp.getPossibleActions(newState)
            qValues = []
            for paction in actionsFromNewState:
                qValues.append(self.computeQValueFromValues(newState, paction))
            if len(qValues) > 0:
                vNew[newState] = max(qValues)
        # Commit the batch of new values and start a fresh Counter.
        self.values = vNew
        vNew = util.Counter()
              'extractor': extractor,
              'actionFn': actionFn}
agent = ApproximateSarsaAgent(**qLearnOpts)
if TYPE == "43t" or TYPE == "43i":
    weights = pickle.load(open("weights" + TYPE + ".p", "rb"))
    agent.weights = featureExtractors.keepwayWeightTranslation(weights)
    agent.workingWeights = agent.weights.copy()
tList = []
for _ in xrange(episodes):
    t = 0
    lastT = None
    prevState = None
    prevAction = None
    state = mdp.getStartState()
    while True:
        if mdp.isTerminal(state) or t > 1000:
            # Only update if the agent has actually acted at least once,
            # otherwise lastT is still None.
            if lastT is not None:
                agent.update(prevState, prevAction, state, t - lastT)
            break
        if mdp.weHaveBall(state):
            if lastT != None:
                agent.update(prevState, prevAction, state, t - lastT)
                # print prevAction
            prevState = state
            lastT = t
            action = agent.getAction(state)
            if action != None:
                prevAction = action
        # ... (rest of the episode loop not shown in this fragment)