Example 1
  def test_hidden(self):
    """4a-hidden:  Hidden test for incorporateFeedback(). Run QLearningAlgorithm on smallMDP, then ensure that getQ returns reasonable values."""
    smallMDP = self.run_with_solution_if_possible(submission,
                                                  lambda sub_or_sol: sub_or_sol.BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=10, peekCost=1))
    smallMDP.computeStates()
    rl = submission.QLearningAlgorithm(smallMDP.actions, smallMDP.discount(),
                                       submission.identityFeatureExtractor,
                                       0.2)
    util.simulate(smallMDP, rl, 30000)
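This hidden test only exercises incorporateFeedback() indirectly, through util.simulate. For orientation, below is a minimal sketch of the standard Q-learning update with linear function approximation that such a test targets; the helper name q_learning_update and its explicit parameter list are illustrative assumptions, not the grader's API.

import math

def q_learning_update(weights, featureExtractor, actions, discount,
                      state, action, reward, newState, numIters):
    # Q(s, a) is the dot product of the weight vector with phi(s, a).
    def getQ(s, a):
        return sum(weights.get(f, 0.0) * v for f, v in featureExtractor(s, a))

    eta = 1.0 / math.sqrt(numIters)  # step size; equals 1 when numIters == 1
    # Target value: r + discount * max_a' Q(s', a'); a terminal s' contributes 0.
    if newState is None:
        v_opt = 0.0
    else:
        v_opt = max(getQ(newState, a) for a in actions(newState))
    residual = getQ(state, action) - (reward + discount * v_opt)
    for f, v in featureExtractor(state, action):
        weights[f] = weights.get(f, 0.0) - eta * residual * v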
Example 2
  def test_basic(self):
    """4c-basic:  Basic test for blackjackFeatureExtractor.  Runs QLearningAlgorithm using blackjackFeatureExtractor, then checks to see that Q-values are correct."""
    mdp = submission.BlackjackMDP(cardValues=[1, 5], multiplicity=2,
                                  threshold=10, peekCost=1)
    mdp.computeStates()
    rl = submission.QLearningAlgorithm(mdp.actions, mdp.discount(),
                                       submission.blackjackFeatureExtractor,
                                       0)
    # Set numIters to 1 here so that the stepSize will be 1
    rl.numIters = 1

    rl.incorporateFeedback((7, None, (0, 1)), 'Quit', 7, (7, None, None))
    self.assertEqual(28, rl.getQ((7, None, (0, 1)), 'Quit'))
    self.assertEqual(7, rl.getQ((7, None, (1, 0)), 'Quit'))
    self.assertEqual(14, rl.getQ((2, None, (0, 2)), 'Quit'))
    self.assertEqual(0, rl.getQ((2, None, (0, 2)), 'Take'))
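These expected values pin down the feature set: with a step size of 1 and all weights starting at 0, the single update writes weight 7 into every indicator feature active in ((7, None, (0, 1)), 'Quit'). getQ returning 28 therefore means exactly four features fire, and the partial matches of 7 and 14 come from features shared with the other states (same total; same presence pattern or per-card count). A sketch of a blackjackFeatureExtractor consistent with these assertions follows; the real extractor is whatever submission.py defines.

def blackjackFeatureExtractor(state, action):
    total, nextCard, counts = state
    features = [((total, action), 1)]  # shared by any state with the same total
    if counts is not None:
        presence = tuple(int(c > 0) for c in counts)
        features.append(((presence, action), 1))  # which card types remain
        for card, count in enumerate(counts):
            features.append(((card, count, action), 1))  # exact count per card type
    return features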
Example 3
def simulation(mdp1, feature=submission.identityFeatureExtractor):
    learning = submission.QLearningAlgorithm(mdp1.actions, 1, feature)  # blackjack MDPs use discount 1
    rewards = util.simulate(mdp1, learning, numTrials=30000)

    learning.explorationProb = 0  # act greedily when reading off the learned policy
    vi = submission.ValueIteration()
    vi.solve(mdp1)  # also populates mdp1.states

    total = 0
    same = 0
    for state in mdp1.states:
        print(state, vi.pi[state], learning.getAction(state))
        if vi.pi[state] == learning.getAction(state):
            same += 1
        total += 1
    print("utility %.2f, same-action fraction %.2f" % (sum(rewards) / float(len(rewards)), same / float(total)))
Example 4
    def test_basic(self):
        """4a-basic-0:  Basic test for incorporateFeedback() using NumberLineMDP."""
        mdp = util.NumberLineMDP()
        mdp.computeStates()
        rl = submission.QLearningAlgorithm(mdp.actions, mdp.discount(),
                                           submission.identityFeatureExtractor,
                                           0)
        # Set numIters to 1 here so that the stepSize will be 1
        rl.numIters = 1

        rl.incorporateFeedback(0, 1, 0, 1)
        self.assertEqual(0, rl.getQ(0, -1))
        self.assertEqual(0, rl.getQ(0, 1))

        rl.incorporateFeedback(1, 1, 1, 2)
        self.assertEqual(0, rl.getQ(0, -1))
        self.assertEqual(0, rl.getQ(0, 1))
        self.assertEqual(0, rl.getQ(1, -1))
        self.assertEqual(1, rl.getQ(1, 1))

        rl.incorporateFeedback(2, -1, 1, 1)
        self.assertEqual(1.9, rl.getQ(2, -1))
        self.assertEqual(0, rl.getQ(2, 1))
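Both 4a tests rely on identityFeatureExtractor, under which linear Q-learning degenerates to tabular Q-learning: one indicator feature per (state, action) pair, so each Q-value lives in its own weight. A sketch of that extractor and of the getQ it implies (the self.weights and self.featureExtractor attribute names are assumptions about the class skeleton):

def identityFeatureExtractor(state, action):
    # One indicator feature per (state, action) pair -> tabular Q-learning.
    return [((state, action), 1)]

# Inside QLearningAlgorithm, getQ would then be the usual linear form:
def getQ(self, state, action):
    return sum(self.weights.get(f, 0.0) * v
               for f, v in self.featureExtractor(state, action))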
Example 5
import util, submission

# util.simulate signature, for reference:
# simulate(mdp, rl, numTrials=10, maxIterations=1000, verbose=False, sort=False)

mdp = submission.smallMDP
rl = submission.QLearningAlgorithm(mdp.actions, mdp.discount(),
                                   submission.identityFeatureExtractor,
                                   explorationProb=0)
res = util.simulate(mdp,rl,numTrials=30000)

with open('sim_res.txt', 'w') as f:
    f.write(str(res) + "\n")

print(len(res))

mdp.computeStates()  # populate mdp.states before iterating over it
pi_rl = {}
for state in mdp.states:
    pi_rl[state] = rl.getAction(state)
print("small test case")
print("pi of reinforcement learning is:")
print(pi_rl)

algo = submission.ValueIteration()
algo.solve(mdp)
print "pi of Value iteration is:"
print algo.pi
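The script above prints both policies but never compares them; a small follow-up in the same spirit would quantify the agreement:

matches = sum(1 for s in mdp.states if pi_rl[s] == algo.pi[s])
print("policies agree on %d of %d states" % (matches, len(mdp.states)))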

Example 6
import util
import submission

# Compute the optimal policy for originalMDP, then replay it as a fixed
# policy both on an MDP with a different threshold and on the original.
vi = submission.ValueIteration()
vi.solve(submission.originalMDP)
fixedRLA = util.FixedRLAlgorithm(vi.pi)
rewards = util.simulate(submission.newThresholdMDP, fixedRLA, numTrials=30000)
print("average utility " + str(sum(rewards) / float(len(rewards))))
rewards = util.simulate(submission.originalMDP, fixedRLA, numTrials=30000)
print("average utility " + str(sum(rewards) / float(len(rewards))))

# Q-learning directly on the new-threshold MDP, for comparison.
mdp2 = submission.newThresholdMDP
learning = submission.QLearningAlgorithm(mdp2.actions, 1,
                                         submission.blackjackFeatureExtractor)
rewards = util.simulate(mdp2, learning, numTrials=30000)
print("average utility " + str(sum(rewards) / float(len(rewards))))
# Value iteration on the new-threshold MDP itself gives the optimal baseline.
vi2 = submission.ValueIteration()
vi2.solve(submission.newThresholdMDP)
fixed2 = util.FixedRLAlgorithm(vi2.pi)
rewards = util.simulate(submission.newThresholdMDP, fixed2, numTrials=30000)
print("average utility " + str(sum(rewards) / float(len(rewards))))