def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment). We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    valiter = ValueIteration()
    valiter.solve(mdp)

    # Simulate with 20% exploration probability, then set it to 0 after simulation
    rl = QLearningAlgorithm(mdp.actions, mdp.discount(), featureExtractor, explorationProb=0.2)
    util.simulate(mdp, rl, 30000, verbose=False)
    rl.explorationProb = 0

    # Extract the learned policy and replicate the dict that comes from valiter.pi
    rl_result = dict()
    same = 0
    different = 0
    for state in valiter.pi.keys():
        rl_result[state] = rl.getAction(state)
        print rl.getAction(state), valiter.pi[state]
        if rl.getAction(state) == valiter.pi[state]:
            same = same + 1
        else:
            different = different + 1
    print same, different
    return valiter.pi, rl_result

def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment). We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    ql = QLearningAlgorithm(actions=mdp.actions,
                            discount=1,
                            featureExtractor=featureExtractor)
    util.simulate(mdp, ql, numTrials=90000, maxIterations=1000)
    print(ql.numIters)
    ql.explorationProb = 0
    print(ql.explorationProb)
    ql.is_test = True

    vi = ValueIteration()
    vi.solve(mdp)

    match = [ql.getAction(state) == action for state, action in vi.pi.items()]
    # ql_action = [ql.getAction(state) for state, action in vi.pi.items()]
    # take_count = [action == 'Take' for state, action in vi.pi.items()]
    # peek_count = [action == 'Peek' for state, action in vi.pi.items()]
    # quit_count = [action == 'Quit' for state, action in vi.pi.items()]
    # print('Take: {}'.format(sum(take_count) / len(take_count)))
    # print('Peek: {}'.format(sum(peek_count) / len(take_count)))
    # print('Quit: {}'.format(sum(quit_count) / len(take_count)))
    percentage_match = sum(match) / len(match)
    # print(ql_action)
    # print(ql.weights)
    return percentage_match

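# Hypothetical usage sketch for the simulate_QL_over_MDP variant above. smallMDP and
# identityFeatureExtractor are assumed to be the test-case globals used elsewhere in this
# assignment; this is a sketch, not part of the graded code.
if __name__ == '__main__':
    match_rate = simulate_QL_over_MDP(smallMDP, identityFeatureExtractor)
    print('fraction of states where Q-learning matches value iteration: {:.4f}'.format(match_rate))
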
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    original_mdp.computeStates()
    vi = ValueIteration()
    vi.solve(original_mdp)
    rl = util.FixedRLAlgorithm(vi.pi.copy())
    fixed_rewards = util.simulate(modified_mdp, rl, numTrials=10000, maxIterations=1000,
                                  verbose=False, sort=False)

    modified_mdp.computeStates()
    rl = QLearningAlgorithm(modified_mdp.actions, modified_mdp.discount(), featureExtractor, 0.2)
    ql_rewards = util.simulate(modified_mdp, rl, numTrials=10000, maxIterations=1000,
                               verbose=False, sort=False)

    print('fixed policy average reward:', sum(fixed_rewards) / len(fixed_rewards))
    print('Q-learning average reward:', sum(ql_rewards) / len(ql_rewards))

def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment). We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    ql = QLearningAlgorithm(mdp.actions, mdp.discount(), featureExtractor)
    q_rewards = util.simulate(mdp, ql, 30000)
    avg_reward_q = float(sum(q_rewards)) / len(q_rewards)

    vi = ValueIteration()
    vi.solve(mdp)
    rl = util.FixedRLAlgorithm(vi.pi)
    vi_rewards = util.simulate(mdp, rl, 30000)
    avg_reward_vi = float(sum(vi_rewards)) / len(vi_rewards)

    ql.explorationProb = 0
    ql_pi = {}
    for state, _ in vi.pi.items():
        ql_pi[state] = ql.getAction(state)

    p_vi = vi.pi
    diff = 0
    for state in vi.pi.keys():
        if vi.pi[state] != ql_pi[state]:
            diff += 1
    print("difference", diff, "over " + str(len(p_vi.keys())) + " states")
    print("percentage diff ", float(diff) / len(p_vi.keys()))
    print("avg_reward_diff", avg_reward_q - avg_reward_vi)

def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment). We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    rl = QLearningAlgorithm(mdp.actions, mdp.discount(), featureExtractor)
    util.simulate(mdp, rl, 30000)

    zero_weight_count = 0
    total_weight_count = 0
    for key in rl.weights:
        weight = rl.weights[key]
        total_weight_count += 1
        if abs(weight - 0.0) <= 0.00001:
            zero_weight_count += 1
    print "Total Weights: %s, Zero Weights: %s" % (total_weight_count, zero_weight_count)

    rl.explorationProb = 0
    vi = ValueIteration()
    vi.solve(mdp)
    count = 0
    expected_result = 0
    for key in vi.pi:
        count += 1
        # use == rather than `is`: actions are strings, and identity comparison is unreliable
        if vi.pi[key] == rl.getAction(key):
            expected_result += 1
    print "total (state, action) pairs: %s" % (count * 3)
    print "Accuracy of MDP using the featureExtractor: %s" % (float(expected_result) / count * 100)

def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment). We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    vi = ValueIteration()
    vi.solve(mdp)
    viQ = vi.pi

    mdp.computeStates()
    rl = QLearningAlgorithm(mdp.actions, mdp.discount(), featureExtractor, .2)
    util.simulate(mdp, rl, numTrials=30000, maxIterations=10, verbose=False, sort=False)
    rl.explorationProb = 0  # turn off exploration before reading the learned policy

    d = {}
    for state in mdp.states:
        d[state] = rl.getAction(state)

    Diff = 0
    for k in d.keys():
        if d[k] != viQ[k]:
            Diff += 1
    return Diff

def testValueIteration(mdp):
    valueIter = ValueIteration()  # implemented in util.py
    valueIter.solve(mdp, .001)
    states = sorted(valueIter.pi, key=lambda x: len(x))  # sorted by state space
    print('valueIter.pi:')
    for elem in sorted(valueIter.pi):
        print(elem, '\t:\t', valueIter.pi[elem])
    return valueIter

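# Hypothetical usage sketch for testValueIteration; the BlackjackMDP constructor arguments below
# mirror the small test case used elsewhere in this assignment and are an assumption, not graded code.
if __name__ == '__main__':
    demo_mdp = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=10, peekCost=1)
    demo_mdp.computeStates()
    testValueIteration(demo_mdp)
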
def compareQLandVI(targetMDP, featureExtractor):
    QL = QLearningAlgorithm(targetMDP.actions, 1, featureExtractor)
    VI = ValueIteration()
    simulate(targetMDP, QL, numTrials=30000)
    VI.solve(targetMDP)

    diffPolicyStates = []
    QL.explorationProb = 0
    for state in targetMDP.states:
        #print state, QL.getAction(state), VI.pi[state]
        if QL.getAction(state) != VI.pi[state]:
            diffPolicyStates.append(state)
    print "%d/%d = %f%% different states" % (len(diffPolicyStates), len(targetMDP.states),
                                             100.0 * len(diffPolicyStates) / float(len(targetMDP.states)))

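# Hypothetical usage sketch for compareQLandVI; largeMDP and blackjackFeatureExtractor are the
# test-case globals defined elsewhere in this file, so this call is an assumption about that setup.
if __name__ == '__main__':
    largeMDP.computeStates()
    compareQLandVI(largeMDP, blackjackFeatureExtractor)
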
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    valueIteration = ValueIteration()
    valueIteration.solve(original_mdp)
    rl = util.FixedRLAlgorithm(valueIteration.pi)
    rewards = util.simulate(modified_mdp, rl)
    print(sum(rewards) / len(rewards))

    rl = QLearningAlgorithm(original_mdp.actions, original_mdp.discount(), featureExtractor)
    rewards = util.simulate(original_mdp, rl, numTrials=30000)
    rewards = util.simulate(modified_mdp, rl, numTrials=30000)
    print(sum(rewards) / len(rewards))
    # END_YOUR_CODE

def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment). We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    # for reproducibility
    random.seed(123)

    # initialization
    mdp.computeStates()  # to get the whole state space of the MDP
    rl = QLearningAlgorithm(mdp.actions, mdp.discount(), featureExtractor, 0.2)

    # Value iteration part
    algorithm = ValueIteration()
    algorithm.solve(mdp, .001)  # algorithm now contains the value and policy of the mdp.

    # Q-learning part
    util.simulate(mdp, rl, 30000)  # model-free; after 30000 simulated trials the Q-weights are learned.
    rl.explorationProb = 0  # set epsilon to 0 so that .getAction(state) acts as a policy pi.

    Qpi = {}
    comparison = []
    for state in mdp.states:  # get the Q-learning policy and comparison results.
        Qpi[state] = rl.getAction(state)
        comparison.append(int(algorithm.pi[state] == Qpi[state]))

    if featureExtractor == identityFeatureExtractor:
        extractor_name = "identityFeatureExtractor"
        mdp_name = "smallMDP" if mdp.multiplicity == 2 else "largeMDP"
    else:
        extractor_name = "blackjackFeatureExtractor"
        mdp_name = "largeMDP"
    print("The match rate of using %s for %s: %.4f" %
          (extractor_name, mdp_name, sum(comparison) / len(comparison)))
    print("Number of different actions: %d Number of total actions: %d" %
          (len(comparison) - sum(comparison), len(comparison)))

def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    vi = ValueIteration()
    vi.solve(original_mdp)
    rewards = util.simulate(modified_mdp, util.FixedRLAlgorithm(vi.pi), 10000)
    print "Expected reward on modified mdp using original mdp policy: %.4f" % (
        float(sum(rewards)) / len(rewards))

    rewards_new = util.simulate(
        modified_mdp,
        QLearningAlgorithm(modified_mdp.actions, original_mdp.discount(), featureExtractor, 0.1),
        10000)
    print "Expected reward on modified mdp using Q-learning: %.4f" % (
        float(sum(rewards_new)) / len(rewards_new))

def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment). We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    valueIteration = ValueIteration()
    valueIteration.solve(mdp)
    vi_pi = valueIteration.pi

    rl = QLearningAlgorithm(mdp.actions, mdp.discount(), featureExtractor)
    util.simulate(mdp, rl, numTrials=30000, verbose=False)
    rl.explorationProb = 0

    diff, total = 0, len(mdp.states)
    for state in mdp.states:
        if vi_pi[state] != rl.getAction(state):
            diff += 1
    print('{:.3f}'.format(100 * diff / total) + '%')

def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    viter = ValueIteration()
    viter.solve(original_mdp)
    fixed_rl = util.FixedRLAlgorithm(viter.pi)
    print "Expected reward value iteration: ", \
        sum(util.simulate(modified_mdp, fixed_rl, numTrials=30000)) / 30000.0

    ql = QLearningAlgorithm(actions=modified_mdp.actions,
                            discount=modified_mdp.discount(),
                            featureExtractor=featureExtractor)
    print "Expected reward q-learn: ", \
        sum(util.simulate(modified_mdp, ql, numTrials=30000)) / 30000.0

def problem4d():
    originalMDP.computeStates()
    newThresholdMDP.computeStates()

    vi = ValueIteration()
    vi.solve(originalMDP)
    fixedVi = util.FixedRLAlgorithm(vi.pi)
    vi_reward = util.simulate(newThresholdMDP, fixedVi, numTrials=30000)

    QL = QLearningAlgorithm(newThresholdMDP.actions, newThresholdMDP.discount(),
                            blackjackFeatureExtractor, 0.2)
    util.simulate(newThresholdMDP, QL, numTrials=30000)
    QL.explorationProb = 0.0
    QLreward = util.simulate(newThresholdMDP, QL, numTrials=1000)

    print('\n 4d now:')
    print('Value Iteration Reward:{}'.format(sum(vi_reward) / float(len(vi_reward))))
    print('Q-learn Reward:{}'.format(sum(QLreward) / float(len(QLreward))))

def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    ValIter = ValueIteration()
    ValIter.solve(original_mdp, .0001)
    pi_val = ValIter.pi
    fix_policy = util.FixedRLAlgorithm(pi_val)
    old_reward = util.simulate(modified_mdp, fix_policy, 30000, 1000, False, False)

    # use the function parameters rather than the module-level globals
    RL = QLearningAlgorithm(modified_mdp.actions, modified_mdp.discount(), featureExtractor)
    new_reward = util.simulate(modified_mdp, RL, 30000, 1000, False, False)
    print("Reward from old policy:", sum(old_reward),
          "\nReward from new QL policy:", sum(new_reward))

def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    vi = ValueIteration()
    vi.solve(original_mdp)
    fixedRLAlgorithm = util.FixedRLAlgorithm(vi.pi)
    num_trials = 90
    total_rewards = util.simulate(modified_mdp, fixedRLAlgorithm, num_trials)
    expected_reward_fixed_rl = sum(total_rewards) / len(total_rewards)

    ql = QLearningAlgorithm(actions=modified_mdp.actions,
                            discount=1,
                            featureExtractor=featureExtractor)
    util.simulate(modified_mdp, ql, numTrials=30000, maxIterations=1000)
    ql.explorationProb = 0
    total_rewards = util.simulate(modified_mdp, ql, num_trials)
    expected_reward_ql = sum(total_rewards) / len(total_rewards)

    print("Expected reward fixed rl: " + str(expected_reward_fixed_rl))
    print("Expected reward ql: " + str(expected_reward_ql))

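# Hypothetical usage sketch for compare_changed_MDP; originalMDP, newThresholdMDP, and
# blackjackFeatureExtractor are the assignment globals referenced elsewhere in this file,
# so this call is an assumption about that setup.
if __name__ == '__main__':
    compare_changed_MDP(originalMDP, newThresholdMDP, blackjackFeatureExtractor)
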
def compare(mdp):
    mdp.computeStates()
    rl = QLearningAlgorithm(actions=mdp.actions,
                            discount=mdp.discount(),
                            featureExtractor=identityFeatureExtractor)
    util.simulate(mdp, rl, 30000)
    rl.explorationProb = 0.0

    QLearnPolicy = {}
    for state in mdp.states:
        QLearnPolicy[state] = rl.getAction(state)

    vi = ValueIteration()
    vi.solve(mdp)

    matchCount = 0
    for state in mdp.states:
        if QLearnPolicy[state] == vi.pi[state]:
            matchCount += 1
    print('policy match:{}/{}'.format(matchCount, len(mdp.states)))

def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    valiter = ValueIteration()
    valiter.solve(original_mdp)
    orig_pi = valiter.pi
    print valiter.pi

    vi_rl = util.FixedRLAlgorithm(orig_pi)
    vi_result = util.simulate(modified_mdp, vi_rl, 30000, verbose=False)

    orig_rl = QLearningAlgorithm(modified_mdp.actions, modified_mdp.discount(), featureExtractor,
                                 explorationProb=0.2)
    orig_result = util.simulate(modified_mdp, orig_rl, 30000, verbose=False)
    return vi_result, orig_result

def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment). We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    value_iteration = ValueIteration()
    value_iteration.solve(mdp)

    q_learning = QLearningAlgorithm(actions=mdp.actions,
                                    discount=mdp.discount(),
                                    featureExtractor=featureExtractor)
    util.simulate(mdp, q_learning, numTrials=30000, verbose=False, sort=False)
    # turn off exploration only after training, so getAction is greedy in the comparison below
    q_learning.explorationProb = 0.0

    total = 0
    diff = 0
    for state in mdp.states:
        if value_iteration.pi[state] != q_learning.getAction(state):
            diff += 1
        total += 1
    print('different-ratio between q learning and value iteration:')
    print(diff / total)

def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment). We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    ValIter = ValueIteration()
    ValIter.solve(mdp, .0001)
    pi_val = ValIter.pi

    RL = QLearningAlgorithm(mdp.actions, mdp.discount(), featureExtractor)
    util.simulate(mdp, RL, 30000, 1000, False, False)
    RL.explorationProb = 0

    pi_RL = {}
    diff = 0
    for state in mdp.states:
        pi_RL[state] = RL.getAction(state)
        if pi_RL[state] != pi_val[state]:
            diff += 1
    ratio = diff / len(mdp.states)
    print("Different policies:", diff, "\tRatio:", ratio)

def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    original_mdp.computeStates()  # to get the whole state space of the MDP
    modified_mdp.computeStates()

    algorithm = ValueIteration()
    algorithm.solve(original_mdp, .001)
    # algorithm.solve(modified_mdp, .001)
    frl = util.FixedRLAlgorithm(algorithm.pi)

    random.seed(123)
    totalRewards = util.simulate(mdp=modified_mdp, rl=frl, numTrials=30)
    print("*** Expected return for FixedRLAlgorithm (numTrials=30): %.4f \t***" %
          (sum(totalRewards) / len(totalRewards)))
    totalRewards = util.simulate(mdp=modified_mdp, rl=frl, numTrials=30000)
    print("*** Expected return for FixedRLAlgorithm (numTrials=30000): %.4f \t***" %
          (sum(totalRewards) / len(totalRewards)))

    random.seed(123)
    rlQ = QLearningAlgorithm(modified_mdp.actions, modified_mdp.discount(), featureExtractor)
    # model-free: 30 trials here plus 29970 below give 30000 Q-learning trials in total.
    totalRewards = util.simulate(mdp=modified_mdp, rl=rlQ, numTrials=30)
    print("*** Expected return for QLearningAlgorithm (numTrials=30): %.4f \t ***" %
          (sum(totalRewards) / len(totalRewards)))
    totalRewards = util.simulate(mdp=modified_mdp, rl=rlQ, numTrials=29970)
    print("*** Expected return for QLearningAlgorithm (numTrials=30000): %.4f \t ***" %
          (sum(totalRewards) / len(totalRewards)))

def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment). We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    rl = QLearningAlgorithm(actions=mdp.actions,
                            discount=mdp.discount(),
                            featureExtractor=featureExtractor)
    util.simulate(mdp, rl, numTrials=30000)

    viter = ValueIteration()
    viter.solve(mdp, .001)

    rl.explorationProb = 0
    neq = 0
    for state in viter.pi:
        if viter.pi[state] != rl.getAction(state):
            # print state, viter.pi[state], rl.getAction(state)
            neq += 1
    print "Total:", len(viter.pi), ", neq:", neq, ", frac_neq:", neq / float(len(viter.pi))

def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    value_iteration = ValueIteration()
    value_iteration.solve(original_mdp)
    fixed_RL = util.FixedRLAlgorithm(value_iteration.pi)
    rewards1 = util.simulate(modified_mdp, fixed_RL, numTrials=50000, verbose=False, sort=False)

    q_learning = QLearningAlgorithm(actions=modified_mdp.actions,
                                    discount=modified_mdp.discount(),
                                    featureExtractor=featureExtractor)
    rewards2 = util.simulate(modified_mdp, q_learning, numTrials=50000, verbose=False, sort=False)

    print('fixed_RL reward :', sum(rewards1) / len(rewards1))
    print('q_learning reward :', sum(rewards2) / len(rewards2))

# Large test case
largeMDP = BlackjackMDP(cardValues=[1, 3, 5, 8, 10], multiplicity=3, threshold=40, peekCost=1)
largeMDP.computeStates()

if __name__ == '__main__' and args.p_4b == 'small':
    rl = QLearningAlgorithm(smallMDP.actions, smallMDP.discount(), identityFeatureExtractor, 0.2)
    simulated = util.simulate(smallMDP, rl, 30000, verbose=False)
    rl.explorationProb = 0

    # value iteration
    value = ValueIteration()
    value.solve(smallMDP)
    for key in value.pi.keys():
        print 'state:', key
        print 'valit:', value.pi[key]
        print 'RLalg:', rl.getAction(key)
        print '-------------------------'

if __name__ == '__main__' and args.p_4b == 'large':
    rl = QLearningAlgorithm(largeMDP.actions, largeMDP.discount(), identityFeatureExtractor, 0.2)
    simulated = util.simulate(largeMDP, rl, 30000, verbose=False)
    rl.explorationProb = 0

    # value iteration
    value = ValueIteration()

        for takenClass, _ in state[0]:
            if takenClass in action:
                return []
        succs = []
        classesDict = {}  # will contain each class in |action| with a grade associated
        recurse(state[0], 1, action, succs, classesDict)
        return succs

    def discount(self):
        return 1


bulletin = json.loads(open('cartadata.json').read())

# Run Value Iteration
startState = simpleEnroll(bulletin)
# startState = ((("MATH 19", "A"), ("MATH 20", "A"), ("CS 106B", "B+"), ("MATH 21", "A"), ("ECON 1", "A"),
#                ("MATH 51", "A-"), ("CS 107", "A-"), ("CS 106A", "A"), ("CS 109", "A")),
#               "Aut", 2, ())
courseMDP = CourseMDP(startState, bulletin)
vi = ValueIteration()
vi.solve(courseMDP)
pi_vi = vi.pi
value = vi.V
bestAction = pi_vi[startState]

# Print our recommended schedule!
print("Best Action: ", bestAction)

def simulateVI(mdp):
    VIAlgorithm = ValueIteration()
    VIAlgorithm.solve(mdp)
    return VIAlgorithm.pi

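# Hypothetical usage sketch for simulateVI; the BlackjackMDP below mirrors the small test case
# used elsewhere in this assignment and is an assumption, not part of the graded code.
if __name__ == '__main__':
    vi_policy = simulateVI(BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=10, peekCost=1))
    print(len(vi_policy), 'states in the value-iteration policy')
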
mdp1 = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=10, peekCost=1)
rl = QLearningAlgorithm(mdp1.actions, mdp1.discount(), identityFeatureExtractor, 0.2)

random.seed(1)
startState = mdp1.startState()

algo = ValueIteration()
algo.solve(mdp1)
print "pi of Value iteration is:"
#print algo.pi
states = algo.pi.keys()

util.simulate(mdp1, rl, 30000)
rl.explorationProb = 0
pi_rl = {}
for state in states:
    pi_rl[state] = rl.getAction(state)

print "small test case"
#print "pi of reinforcement learning is:"
#print pi_rl
for key in pi_rl.keys():
    # hypothetical completion of the truncated loop: show both policies for each state
    print key, algo.pi[key], pi_rl[key]