def test4a():
    """4a: check incorporateFeedback() Q-value updates on NumberLineMDP.

    Resets rl.numIters to 1 before each feedback call so the step size
    used by the update is 1 (then 0.9 impact shows up in the 1.9 check).
    Prints a progress marker before each grader check.
    """
    mdp = util.NumberLineMDP()
    mdp.computeStates()
    rl = submission.QLearningAlgorithm(mdp.actions, mdp.discount(),
                                       submission.identityFeatureExtractor,
                                       0)
    # We call this here so that the stepSize will be 1
    rl.numIters = 1
    rl.incorporateFeedback(0, 1, 0, 1)
    # Reward 0 on the first transition: no Q-value should change.
    print('1')
    grader.requireIsEqual(0, rl.getQ(0, -1))
    print('2')
    grader.requireIsEqual(0, rl.getQ(0, 1))
    rl.numIters = 1
    rl.incorporateFeedback(1, 1, 1, 2)
    print('3')
    grader.requireIsEqual(0, rl.getQ(0, -1))
    print('4')
    grader.requireIsEqual(0, rl.getQ(0, 1))
    print('5')
    grader.requireIsEqual(0, rl.getQ(1, -1))
    print('6')
    grader.requireIsEqual(1, rl.getQ(1, 1))
    rl.numIters = 1
    rl.incorporateFeedback(2, -1, 1, 1)
    print('7')
    grader.requireIsEqual(1.9, rl.getQ(2, -1))
    print('8')
    grader.requireIsEqual(0, rl.getQ(2, 1))
def test_util():
    """Smoke-test util: run ValueIteration on NumberLineMDP and print V/pi."""
    print("Testing util module : ")
    print("...creating simple mdp instance ... ")
    mdp = util.NumberLineMDP()      # instance of an MDP problem
    solver = util.ValueIteration()  # algorithm instantiated
    solver.solve(mdp)               # algo applied to the MDP problem
    print("Vopt : %s " % solver.V)
    print("optimal_policy : %s " % solver.pi)
    print("... done test_util.\n")
def test1c():
    """1c: computeOptimalPolicy with V(s) = s should choose action +1 everywhere."""
    # state -> value of state: identity values on the line [-5, 5]
    V = collections.Counter({s: s for s in range(-5, 6)})
    mdp = util.NumberLineMDP()
    mdp.computeStates()
    # Gold policy: action 1 for every state.
    goldPi = collections.defaultdict(lambda: 1)
    pi = submission.computeOptimalPolicy(mdp, V)
    for s in range(-5, 6):
        ok = grader.requireIsEqual(goldPi[s], pi[s])
        if not ok:
            print(' state: {}'.format(s))
def testQ(f, V):
    """Compare submission.computeQ(mdp, V, s, a) against gold values.

    f: path to a gold file with one "<state> <action> <value>" triple per line.
    V: value function passed through to computeQ.
    """
    mdp = util.NumberLineMDP()
    goldQ = {}
    # Use a context manager so the gold file is always closed.
    with open(f) as gold:
        for state, action, value in (line.split() for line in gold):
            goldQ[(int(state), int(action))] = float(value)
    for state in range(-5, 6):
        for action in [-1, 1]:
            if not grader.requireIsEqual(goldQ[(state, action)],
                                         submission.computeQ(mdp, V, state, action)):
                print(' state: {}, action: {}'.format(state, action))
def testIteration(algorithm):
    """Run an iteration algorithm on NumberLineMDP and check pi/V against gold.

    algorithm: a solver with solve(mdp, epsilon) that populates .pi and .V.
    Gold policy is action 1 everywhere; gold values come from '1d.gold'
    (one "<state> <value>" pair per line). Values must match within .001.
    """
    mdp = util.NumberLineMDP()
    goldPi = collections.defaultdict(lambda: 1)
    goldV = {}
    # Use a context manager so the gold file is always closed.
    with open('1d.gold') as gold:
        for state, value in (line.split() for line in gold):
            goldV[int(state)] = float(value)
    algorithm.solve(mdp, .0001)
    for state in range(-5, 6):
        if not grader.requireIsEqual(goldPi[state], algorithm.pi[state]):
            print(' action for state: {}'.format(state))
        if not grader.requireIsLessThan(.001, abs(goldV[state] - algorithm.V[state])):
            print(' value for state: {}'.format(state))
def test1b():
    """1b: policyEvaluation of the always-left policy against '1b.gold'.

    Starts from V = 0 for all states, evaluates the policy pi(s) = -1,
    and requires each value to match the gold value within .001.
    """
    V = collections.defaultdict(int)        # initial values: all zero
    pi = collections.defaultdict(lambda: -1)  # fixed policy: always action -1
    mdp = util.NumberLineMDP()
    mdp.computeStates()
    goldV = {}
    # Use a context manager so the gold file is always closed.
    with open('1b.gold') as gold:
        for state, value in (line.split() for line in gold):
            goldV[int(state)] = float(value)
    V = submission.policyEvaluation(mdp, V, pi, .0001)
    for state in range(-5, 6):
        if not grader.requireIsLessThan(.001, abs(goldV[state] - V[state])):
            print(' state: {}'.format(state))
def test_basic(self):
    """4a-basic-0: Basic test for incorporateFeedback() using NumberLineMDP."""
    mdp = util.NumberLineMDP()
    mdp.computeStates()
    learner = submission.QLearningAlgorithm(
        mdp.actions, mdp.discount(), submission.identityFeatureExtractor, 0)
    # We call this here so that the stepSize will be 1
    learner.numIters = 1
    learner.incorporateFeedback(0, 1, 0, 1)
    # Zero reward: neither action's Q-value at state 0 moves.
    for action, expected in ((-1, 0), (1, 0)):
        self.assertEqual(expected, learner.getQ(0, action))
    learner.incorporateFeedback(1, 1, 1, 2)
    # Only (state=1, action=1) should have been updated, to 1.
    for state, action, expected in ((0, -1, 0), (0, 1, 0), (1, -1, 0), (1, 1, 1)):
        self.assertEqual(expected, learner.getQ(state, action))
    learner.incorporateFeedback(2, -1, 1, 1)
    self.assertEqual(1.9, learner.getQ(2, -1))
    self.assertEqual(0, learner.getQ(2, 1))