def test_hidden(self):
    """4a-hidden: Hidden test for incorporateFeedback(). Run QLearningAlgorithm on
    smallMDP, then ensure that getQ returns reasonable values."""
    smallMDP = self.run_with_solution_if_possible(submission,
        lambda sub_or_sol: sub_or_sol.BlackjackMDP(cardValues=[1, 5], multiplicity=2,
                                                   threshold=10, peekCost=1))
    smallMDP.computeStates()
    rl = submission.QLearningAlgorithm(smallMDP.actions, smallMDP.discount(),
                                       submission.identityFeatureExtractor, 0.2)
    util.simulate(smallMDP, rl, 30000)
def test_basic(self): """4c-basic: Basic test for blackjackFeatureExtractor. Runs QLearningAlgorithm using blackjackFeatureExtractor, then checks to see that Q-values are correct.""" mdp = submission.BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=10, peekCost=1) mdp.computeStates() rl = submission.QLearningAlgorithm(mdp.actions, mdp.discount(), submission.blackjackFeatureExtractor, 0) # We call this here so that the stepSize will be 1 rl.numIters = 1 rl.incorporateFeedback((7, None, (0, 1)), 'Quit', 7, (7, None, None)) self.assertEqual(28, rl.getQ((7, None, (0, 1)), 'Quit')) self.assertEqual(7, rl.getQ((7, None, (1, 0)), 'Quit')) self.assertEqual(14, rl.getQ((2, None, (0, 2)), 'Quit')) self.assertEqual(0, rl.getQ((2, None, (0, 2)), 'Take'))
import util, submission

def simulation(mdp1, feature=submission.identityFeatureExtractor):
    # Train a Q-learning agent, then compare its greedy policy against value iteration.
    learning = submission.QLearningAlgorithm(mdp1.actions, 1, feature)
    rewards = util.simulate(mdp1, learning, numTrials=30000)
    learning.explorationProb = 0  # act greedily when extracting the learned policy
    mdp1.computeStates()  # populate mdp1.states before iterating over them
    vi = submission.ValueIteration()
    vi.solve(mdp1)
    total = 0
    same = 0
    for state in mdp1.states:
        print state, vi.pi[state], learning.getAction(state)
        if vi.pi[state] == learning.getAction(state):
            same += 1
        total += 1
    print "utility %.2f, same action percentage is %.2f" % (sum(rewards) / float(len(rewards)), same / float(total))
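# Example driver for simulation(); assumes submission defines smallMDP (as in the
# script further below). It compares the Q-learning policy with the value-iteration
# policy on the small MDP, first with identity features, then with blackjackFeatureExtractor.
if __name__ == '__main__':
    simulation(submission.smallMDP)
    simulation(submission.smallMDP, feature=submission.blackjackFeatureExtractor)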
def test_basic(self): """4a-basic-0: Basic test for incorporateFeedback() using NumberLineMDP.""" mdp = util.NumberLineMDP() mdp.computeStates() rl = submission.QLearningAlgorithm(mdp.actions, mdp.discount(), submission.identityFeatureExtractor, 0) # We call this here so that the stepSize will be 1 rl.numIters = 1 rl.incorporateFeedback(0, 1, 0, 1) self.assertEqual(0, rl.getQ(0, -1)) self.assertEqual(0, rl.getQ(0, 1)) rl.incorporateFeedback(1, 1, 1, 2) self.assertEqual(0, rl.getQ(0, -1)) self.assertEqual(0, rl.getQ(0, 1)) self.assertEqual(0, rl.getQ(1, -1)) self.assertEqual(1, rl.getQ(1, 1)) rl.incorporateFeedback(2, -1, 1, 1) self.assertEqual(1.9, rl.getQ(2, -1)) self.assertEqual(0, rl.getQ(2, 1))
import util, submission

# util.simulate(mdp, rl, numTrials=10, maxIterations=1000, verbose=False, sort=False)
mdp = submission.smallMDP
rl = submission.QLearningAlgorithm(mdp.actions, mdp.discount(),
                                   submission.identityFeatureExtractor, explorationProb=0)
res = util.simulate(mdp, rl, numTrials=30000)

f = open('sim_res.txt', 'w')
f.write(str(res) + "\n")
f.close()
print len(res)

mdp.computeStates()  # populate mdp.states before extracting the learned policy
pi_rl = {}
for state in mdp.states:
    pi_rl[state] = rl.getAction(state)
print "small test case"
print "pi of reinforcement learning is:"
print pi_rl

algo = submission.ValueIteration()
algo.solve(mdp)
print "pi of value iteration is:"
print algo.pi
import util
import submission

# Solve the original MDP with value iteration and wrap its policy in a fixed RL algorithm.
vi = submission.ValueIteration()
vi.solve(submission.originalMDP)
fixedRLA = util.FixedRLAlgorithm(vi.pi)

# Run the fixed policy on the new-threshold MDP and on the original MDP it was solved for.
rewards = util.simulate(submission.newThresholdMDP, fixedRLA, numTrials=30000)
print "fixed policy on newThresholdMDP: average utility " + str(sum(rewards) / float(len(rewards)))
rewards = util.simulate(submission.originalMDP, fixedRLA, numTrials=30000)
print "fixed policy on originalMDP: average utility " + str(sum(rewards) / float(len(rewards)))

# Q-learning trained directly on the new-threshold MDP with blackjackFeatureExtractor.
mdp2 = submission.newThresholdMDP
learning = submission.QLearningAlgorithm(mdp2.actions, 1, submission.blackjackFeatureExtractor)
rewards = util.simulate(mdp2, learning, numTrials=30000)
print "Q-learning on newThresholdMDP: average utility " + str(sum(rewards) / float(len(rewards)))

# Value iteration solved directly on the new-threshold MDP, for comparison.
vi2 = submission.ValueIteration()
vi2.solve(submission.newThresholdMDP)
fixed2 = util.FixedRLAlgorithm(vi2.pi)
rewards = util.simulate(submission.newThresholdMDP, fixed2, numTrials=30000)
print "value iteration on newThresholdMDP: average utility " + str(sum(rewards) / float(len(rewards)))