Example #1
def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment).  We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    mdp.computeStates()
    allStates = mdp.states
    # Run value iteration.
    solver = util.ValueIteration()
    solver.solve(mdp)
    optimalVIPolicy = solver.pi

    # Run Q-Learning algorithm and compute its optimal policy.
    ql = QLearningAlgorithm(actions=mdp.actions,
                            discount=mdp.discount(),
                            featureExtractor=featureExtractor)
    util.simulate(mdp, ql, numTrials=30000, maxIterations=10000)
    ql.explorationProb = 0.0
    optimalQLPolicy = {state: ql.getAction(state) for state in allStates}

    # Compute some statistics
    numDifferent = sum(1 for state in allStates
                       if optimalQLPolicy[state] != optimalVIPolicy[state])
    print("{} out of {} states have different actions".format(
        numDifferent, len(allStates)))
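
# A sketch of how the helper above might be invoked (assumption: smallMDP, largeMDP
# and identityFeatureExtractor are defined elsewhere in the same assignment module):
simulate_QL_over_MDP(smallMDP, identityFeatureExtractor)
simulate_QL_over_MDP(largeMDP, identityFeatureExtractor)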
Example #2
def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment).  We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    rl = QLearningAlgorithm(mdp.actions, mdp.discount(), featureExtractor) #actions discount feature extractor
    util.simulate(mdp, rl, numTrials=30000)
    rl.explorationProb = 0
    valueIter = util.ValueIteration()
    valueIter.solve(mdp)


    numberOfStates = 0
    numberOfDifferentStates = 0
    for state in mdp.states:
        if state not in valueIter.pi:
            file.write('Pi does not contain state {}\n'.format(state))
        else:
            if valueIter.pi[state] != rl.getAction(state) and state[2] is not None:
                numberOfDifferentStates += 1
                file.write('In state {} Pi gives action {}, but RL gives action {}\n'.format(state, valueIter.pi[state], rl.getAction(state)))
        numberOfStates += 1
    file.write('\n % of different actions = {}%\n'.format(numberOfDifferentStates/numberOfStates*100))
    for weight in rl.weights:
        file.write('weight ({}) =  {} \n'.format(weight, rl.weights[weight]))
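
# Note: the function above writes to a global file handle named `file` that is never
# opened in this excerpt. A minimal sketch of the missing setup (the log file name is
# an assumption):
file = open('ql_vs_vi_comparison.log', 'w')  # hypothetical log file
simulate_QL_over_MDP(smallMDP, identityFeatureExtractor)
file.close()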
Example #3
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE

    val = util.ValueIteration()
    val.solve(original_mdp)
    val_policy = val.pi
    RL1 = util.FixedRLAlgorithm(val_policy)
    result1 = util.simulate(modified_mdp,
                            RL1,
                            numTrials=50000,
                            maxIterations=1000,
                            verbose=False,
                            sort=False)
    avg1 = sum(result1) / float(len(result1))
    print(avg1)
    RL2 = QLearningAlgorithm(modified_mdp.actions,
                             modified_mdp.discount(),
                             featureExtractor,
                             explorationProb=0.2)
    result2 = util.simulate(modified_mdp,
                            RL2,
                            numTrials=50000,
                            maxIterations=1000,
                            verbose=False,
                            sort=False)
    avg2 = sum(result2) / float(len(result2))
    print(avg2)
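
# util.FixedRLAlgorithm (used above) simply replays a fixed policy during simulation.
# A minimal sketch of what such a class presumably looks like (an assumption, not the
# actual util implementation):
class FixedRLAlgorithmSketch:
    def __init__(self, pi):
        self.pi = pi  # dict mapping state -> action

    def getAction(self, state):
        return self.pi[state]  # raises KeyError for states the policy has never seen

    def incorporateFeedback(self, state, action, reward, newState):
        pass  # fixed policy: nothing is learned from the observed transition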
Example #4
def simulate_QL_over_MDP(MDP, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment).  We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    # pass
    RL = QLearningAlgorithm(MDP.actions,
                            MDP.discount(),
                            featureExtractor,
                            explorationProb=0)
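    # Note: explorationProb=0 means the agent never explores during the training
    # trials below; it always acts greedily with respect to its current weights.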
    util.simulate(MDP,
                  RL,
                  numTrials=30000,
                  maxIterations=1000,
                  verbose=False,
                  sort=False)
    MDP.computeStates()
    RL_policy = {}
    for state in MDP.states:
        RL_policy[state] = RL.getAction(state)
    val = util.ValueIteration()
    val.solve(MDP)
    val_policy = val.pi
    sum_ = []
    for key in RL_policy:
        if RL_policy[key] == val_policy[key]:
            sum_.append(1)
        else:
            sum_.append(0)
    print(float(sum(sum_)) / len(RL_policy))
    return RL_policy, val_policy
Example #5
def test_hidden(self):
    """3a-hidden: Hidden test for ValueIteration. Run ValueIteration on BlackjackMDP, then test if V[startState] is correct."""
    mdp = submission.BlackjackMDP(cardValues=[1, 3, 5, 8, 10], multiplicity=3,
                                  threshold=40, peekCost=1)
    startState = mdp.startState()
    alg = util.ValueIteration()
    alg.solve(mdp, .0001)

def test3aHidden():
    mdp = submission.BlackjackMDP(cardValues=[1, 3, 5, 8, 10],
                                  multiplicity=3,
                                  threshold=40,
                                  peekCost=1)
    startState = mdp.startState()
    alg = util.ValueIteration()
    alg.solve(mdp, .0001)
Example #7
def Q4c():
    # s = (3, None, (3,4,0))
    # fv = blackjackFeatureExtractor(s,'Take')
    # print "for state %s , action 'Take' ... \n ... feature vector returned: %s" %(s,fv)

    print "Comparing value iteration ag simulated Q-learning as in 4b but using better featureExtractor:"
    phi = blackjackFeatureExtractor
    mdp = smallMDP  #smallMDP #TOGGLE THIS
    numqtrials = 100  #CHANGE THIS : eg 10, 10000, 300000
    print "...comparison for %s x %s MDP; Q-learning numtrials : %s" % (
        mdp.cardValues, mdp.multiplicity, numqtrials)

    # value iteration:
    solver = util.ValueIteration()  #algorithm instantiated
    solver.solve(mdp)  #algo applied to the MDP problem

    # q-learning simulate :
    rl = QLearningAlgorithm(actions=mdp.actions,
                            discount=mdp.discount(),
                            featureExtractor=phi,
                            explorationProb=0.2)
    totPVs = util.simulate(
        mdp, rl, numTrials=numqtrials,
        verbose=False)  #returns list of totRewards for each trial
    print " ........ # non-zero weights = %s" % sum(
        [1 for k, v in rl.weights.items() if v])

    Vopt_est = max(
        dotProduct(rl.weights, dict(phi(mdp.startState(), a)))
        for a in rl.actions(mdp.startState()))
    print "\n...Comparison of Vopt : "
    print " ... value iteration = expected optimal PV :: optimal utility of startState, stdev: ( %s, 0 )" % (
        solver.V[mdp.startState()])
    print " ... q-learning: avg PV :: utility, stdev over all trials: ( %s, %s ) (see note * below)" % (
        statistics.mean(totPVs), statistics.stdev(totPVs))
    print " ... q-learning: estimated optimal PV :: optimal utility of startState : ( %s, 0 )" % Vopt_est
    # plotQL(totPVs)

    # Comparison of VI and QL policies:
    print "\n...Comparison of policies (rerun with explorationProb = 0) : "
    rl.explorationProb = 0  # rerun QL now with 0 exploration prob (since learned)
    totPVs = util.simulate(mdp, rl, numTrials=numqtrials,
                           verbose=False)  #reruns simulation
    Vopt_est = max(
        dotProduct(rl.weights, dict(phi(mdp.startState(), a)))
        for a in rl.actions(mdp.startState()))
    print " ... q-learning: estimated optimal PV :: optimal utility of startState : ( %s, 0 )" % Vopt_est

    diffs = 0  # counts number of differences in policy btw VI and QL
    for s, p in solver.pi.items():  # using value-iteration policy as starting point
        rlp = max((dotProduct(rl.weights, dict(phi(s, a))), a)
                  for a in rl.actions(s))[1]
        if rlp != p:
            diffs += 1
            print("rlp : %s does not equal VIp : %s for state %s" % (rlp, p, s))
    print("number of different policies btw VI and QL , out of total : %s / %s = %4.2f" % (
        diffs, len(solver.pi), diffs / (1.0 * len(solver.pi))))
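
# The blackjackFeatureExtractor used above is not shown in this excerpt. One common
# design (a sketch under assumptions, not necessarily the version used here) pairs the
# action with the current total, with the pattern of remaining cards, and with the
# count of each card type:
def blackjackFeatureExtractorSketch(state, action):
    total, nextCardIndex, counts = state
    features = [((action, total), 1)]
    if counts is not None:
        features.append(((action, tuple(1 if c > 0 else 0 for c in counts)), 1))
        for i, c in enumerate(counts):
            features.append(((action, i, c), 1))
    return features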
Example #8
def test_util():
    print("Testing util module : ")
    print("...creating simple mdp instance ... ")
    mdp = util.NumberLineMDP()  #instance of an MDP problem
    solver = util.ValueIteration()  #algorithm instantiated
    solver.solve(mdp)  #algo applied to the MDP problem
    print "Vopt : %s " % solver.V
    print "optimal_policy : %s " % solver.pi
    print("... done test_util.\n")
Example #9
def simulate_QL_over_MDP(mdp, featureExtractor, verbose=False):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment).  We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.

    # 4b: with identityFeatureExtractor, RL performs poorly because the chosen
    # feature function Phi generalizes particularly badly (an indicator function of (s, a))

    # BEGIN_YOUR_CODE
    print("simulate_QL_over_MDP")

    # Solve via value iteration
    vi = util.ValueIteration()
    vi.solve(mdp, .0001)
    pi_vi = vi.pi  # pi computed with value iteration

    if verbose:
        print('len pi_vi :  {}'.format(len(pi_vi)))

    # Solve via Q-learning
    mdp.computeStates()
    rl = QLearningAlgorithm(mdp.actions, mdp.discount(), featureExtractor,
                            0.05)  # works better than an exploration rate of 0.2
    util.simulate(mdp, rl, 30000)
    # util.simulate(mdp, rl, numTrials=30000, maxIterations=1000)

    # The full set of states of our mdp is available through the mdp.states attribute,
    # which is initialized by the call to computeStates() above.
    pi_rl = rl.get_pi_opt(mdp.states)  # pi computed with Q-learning (RL)

    if verbose:
        print('len pi_rl :  {}'.format(len(pi_rl)))

    if verbose:
        print('pi : ')
        print('Value Iteration')
        print('Reinforcement Learning')
        print('---')
        for state in mdp.states:
            print('{} : {}'.format(state, pi_vi[state]))
            print('{} : {}'.format(state, pi_rl[state]))
            print('---')

    print('Stats')
    print('Number of possible states: ', len(mdp.states))
    equal = 0.
    for state in mdp.states:  # keys of pi_rl (a subset of the keys of pi_vi, which is exhaustive)
        if pi_vi[state] == pi_rl[state]:
            equal += 1

    print('Matching actions: {0:.2f} %'.format(equal / len(mdp.states) * 100))
    print('---')
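
# rl.get_pi_opt(...) used above is a custom helper, not part of the standard
# QLearningAlgorithm. A plausible sketch (assumption: it reads the greedy policy off
# the learned Q-values):
def get_pi_opt_sketch(rl, states):
    # for each state, pick the action with the highest learned Q-value
    return {s: max(rl.actions(s), key=lambda a: rl.getQ(s, a)) for s in states}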
Example #10
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    print ("compare_changed_MDP")

    # Résolution via Value Iteration
    # pi_original_mdp
    vio = util.ValueIteration()
    vio.solve(original_mdp, .0001)
    pi_original_mdp = vio.pi  # pi computed with value iteration
    print 'Récompenses original_mdp calculée via VI pour startState : '
    print vio.V[original_mdp.startState()]

    # modified_mdp
    vim = util.ValueIteration()
    vim.solve(modified_mdp, .0001)
    pi_modified_mdp = vim.pi  # pi computed with value iteration
    print 'Récompenses modified_mdp calculée via VI pour startState : '
    print vim.V[modified_mdp.startState()]

    # Exploitation de la stratégie Pi définie sur original_mdp
    # en l'appliquant via la simulation sur le nouvel mdp modified_mdp
    fixed_rl = util.FixedRLAlgorithm(pi_original_mdp)
    # totalRewards = util.simulate(newThresholdMDP, fixed_rl, 30000)
    totalRewards = util.simulate(modified_mdp, fixed_rl, numTrials=30000, maxIterations=1000, verbose=False, sort=False)
    print('Moyenne des récompenses sur le nouvel MDP en utilisant la Pi de l\'ancien MDP : ')
    print sum(totalRewards) / len(totalRewards)

    # Résolution via Q-Learning sur original_mdp
    original_mdp.computeStates()
    rl = QLearningAlgorithm(original_mdp.actions, original_mdp.discount(), featureExtractor, 0.2)
    totalRewards = util.simulate(original_mdp, rl, 30000)
    print('Moyenne des récompenses sur l\'ancien MDP en utilisant RL: ')
    print sum(totalRewards) / len(totalRewards)
    totalRewards = util.simulate(modified_mdp, rl, 30000)
    print('Moyenne des récompenses sur le nouveau MDP en utilisant RL entraîné sur l\'ancien: ')
    print sum(totalRewards) / len(totalRewards)
Example #11
def Q4d():
    origMDP = BlackjackMDP(cardValues=[1, 5],
                           multiplicity=2,
                           threshold=10,
                           peekCost=1)
    newThreshMDP = BlackjackMDP(cardValues=[1, 5],
                                multiplicity=2,
                                threshold=9,
                                peekCost=1)

    #run VI on original MDP to obtain policy:
    solver = util.ValueIteration()  #algorithm instantiated
    solver.solve(origMDP)  #algo applied to the MDP problem
    print " ... VI Vopt(startState) = %s ." % (solver.V[origMDP.startState()])
    pi0 = solver.pi

    # apply this policy to an agent (in simulated mdp) playing the **new** MDP:
    numqtrials = 30000
    rl = util.FixedRLAlgorithm(pi0)

    mdp = origMDP
    totPVs = util.simulate(mdp, rl, numTrials=numqtrials, verbose=False)
    print " ... QL: avg PV, stdev using above VI opt policy on same mdp: ( %s, %s ) " % (
        statistics.mean(totPVs), statistics.stdev(totPVs))

    mdp = newThreshMDP
    totPVs = util.simulate(mdp, rl, numTrials=numqtrials, verbose=False)
    print "\n ... QL: avg PV, stdev using above VI opt policy on *NEW* mdp: ( %s, %s ) " % (
        statistics.mean(totPVs), statistics.stdev(totPVs))

    # now skip the fixed policy and use QL :
    phi = identityFeatureExtractor  #blackjackFeatureExtractor

    rl = QLearningAlgorithm(actions=mdp.actions,
                            discount=mdp.discount(),
                            featureExtractor=phi,
                            explorationProb=0.5)
    totPVs = util.simulate(mdp, rl, numTrials=numqtrials, verbose=False)
    Vopt_est = max(
        dotProduct(rl.weights, dict(phi(mdp.startState(), a)))
        for a in rl.actions(mdp.startState()))
    print " ... QL: est. Vopt of startState : %s " % Vopt_est
    # plotQL(totPVs)

    # Comparison of VI and QL policies:
    rl.explorationProb = 0  # rerun QL now with 0 exploration prob (since learned)
    totPVs = util.simulate(mdp, rl, numTrials=numqtrials,
                           verbose=False)  #reruns simulation
    Vopt_est = max(
        dotProduct(rl.weights, dict(phi(mdp.startState(), a)))
        for a in rl.actions(mdp.startState()))
    print " ... QL: est. Vopt of startState re-run (with eps = 0) : %s " % Vopt_est
Example #12
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    valueIterOriginal = util.ValueIteration()
    valueIterOriginal.solve(original_mdp)
    fixedRL = util.FixedRLAlgorithm(valueIterOriginal.pi)
    rewards = util.simulate(modified_mdp, fixedRL)
    print("Fixed RL")
    for reward in rewards:
        print(reward)
    rewardsFromQ = util.simulate(modified_mdp, QLearningAlgorithm(modified_mdp.actions, modified_mdp.discount(), featureExtractor))
    print('QLearn')
    for reward in rewardsFromQ:
        print(reward)
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    ValIter = util.ValueIteration()
    ValIter.solve(original_mdp)
    policyVal = ValIter.pi
    Fixed = util.FixedRLAlgorithm(policyVal)
    print('Rewards for value iteration: ',
          util.simulate(modified_mdp, Fixed))
    QL = QLearningAlgorithm(modified_mdp.actions,
                            modified_mdp.discount(),
                            featureExtractor,
                            explorationProb=0)
    print('Rewards for Q-learning iteration: ',
          util.simulate(modified_mdp, QL))
Example #14
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    solver = util.ValueIteration()
    solver.solve(original_mdp)
    optimalVIOriginalMDPPolicy = solver.pi

    fixedRL = util.FixedRLAlgorithm(optimalVIOriginalMDPPolicy)
    rewards = util.simulate(modified_mdp,
                            fixedRL,
                            numTrials=30000,
                            maxIterations=10000)
    print("Sampled average reward for optimal policy from original MDP is {}.".
          format(sum(rewards) / float(len(rewards))))

    # Train Q-learning.
    ql = QLearningAlgorithm(actions=modified_mdp.actions,
                            discount=modified_mdp.discount(),
                            featureExtractor=featureExtractor)
    trainingRewards = util.simulate(modified_mdp,
                                    ql,
                                    numTrials=30000,
                                    maxIterations=10000)
    print(
        "Sampled average reward for Q-Learning during training is {}.".format(
            sum(trainingRewards) / float(len(trainingRewards))))
    ql.explorationProb = 0.0
    modified_mdp.computeStates()
    learnedQLPolicy = {
        state: ql.getAction(state)
        for state in modified_mdp.states
    }
    fixedQLRL = util.FixedRLAlgorithm(learnedQLPolicy)
    rewardsQL = util.simulate(modified_mdp,
                              fixedQLRL,
                              numTrials=30000,
                              maxIterations=10000)
    print(
        "Sampled average reward for policy learned directly on new problem with Q-Learning is {}."
        .format(sum(rewardsQL) / float(len(rewardsQL))))
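
# A sketch of how compare_changed_MDP might be invoked. The threshold of the modified
# MDP is an assumption (Example #11, for instance, lowers it to 9 instead):
originalMDP = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=10, peekCost=1)
newThresholdMDP = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=15, peekCost=1)
compare_changed_MDP(originalMDP, newThresholdMDP, blackjackFeatureExtractor)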
Example #15
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    vi = util.ValueIteration()
    vi.solve(original_mdp)
    summ = 0
    events = 0
    for i in util.simulate(modified_mdp, util.FixedRLAlgorithm(vi.pi), 10000):
        summ += i
        events += 1
    print(summ * 1.0 / events)
    Qlearning = QLearningAlgorithm(modified_mdp.actions,
                                   modified_mdp.discount(), featureExtractor)
    summ = 0
    events = 0
    for i in util.simulate(modified_mdp, Qlearning, 100):
        summ += i
        events += 1
    print(summ * 1.0 / events)
def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment).  We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    QL = QLearningAlgorithm(mdp.actions,
                            mdp.discount(),
                            featureExtractor,
                            explorationProb=0)
    util.simulate(mdp, QL, numTrials=30000, maxIterations=1000)  # train the Q-learner by simulation
    mdp.computeStates()
    policyQL = {}
    for s in mdp.states:
        policyQL[s] = QL.getAction(s)
    # valueiteration
    ValIter = util.ValueIteration()
    ValIter.solve(mdp)
    policyVal = ValIter.pi
    Intersection = [1 if policyQL[k] == policyVal[k] else 0 for k in policyQL]
    print('MDP accuracy is: ', sum(Intersection) / len(policyQL))
    return
Example #17
def main():
    vi = util.ValueIteration()
    vi.solve(MDP())
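                # (excerpt truncated here; the indented lines below appear to come from
                # the 'Peek' branch of BlackjackMDP.succAndProbReward in the same file)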
                num_all_cards = sum(deck)

                for idx, num in enumerate(deck):
                    if num == 0:
                        continue
                    prob = num / num_all_cards
                    succ_prob_reward_list.append(((card_sum, idx, deck) , prob, - self.peekCost))  # HINT: has the form (new_card_sum, new_peek_idx, new_deck)
        # ---------------------------------------- Peek implementation

        elif action == 'Quit':
            succ_prob_reward_list.append(((card_sum, None, None), 1, card_sum))

        else:
            raise ValueError("Undefined action '{}'".format(action))

        return succ_prob_reward_list
        # END_YOUR_CODE

    def discount(self):
        return 1


if __name__ == '__main__':
    mdp = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=10, peekCost=1)

    algorithm = util.ValueIteration()
    algorithm.solve(mdp, verbose=0)

    for s in algorithm.pi:
        print(f'pi({s}) = {algorithm.pi[s]}')
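
# The excerpt above only shows the 'Peek' and 'Quit' branches of succAndProbReward.
# For reference, a simplified standalone sketch of the 'Take' successors (assumptions:
# deck is a tuple of remaining counts, and the case where a card was peeked on the
# previous turn is ignored):
def take_successors_sketch(card_sum, deck, card_values, threshold):
    results = []  # list of ((new_sum, None, new_deck), prob, reward) triples
    total_cards = sum(deck)
    for idx, count in enumerate(deck):
        if count == 0:
            continue
        prob = count / total_cards
        new_sum = card_sum + card_values[idx]
        new_deck = deck[:idx] + (count - 1,) + deck[idx + 1:]
        if new_sum > threshold:            # bust: the game ends with no reward
            results.append(((new_sum, None, None), prob, 0))
        elif sum(new_deck) == 0:           # deck exhausted: collect the hand's value
            results.append(((new_sum, None, None), prob, new_sum))
        else:
            results.append(((new_sum, None, new_deck), prob, 0))
    return results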
Example #19
def Q4b():
    print "Comparing value iteration ag simulated Q-learning :"

    mdp = largeMDP  #TOGGLE THIS
    numqtrials = 30000  #CHANGE THIS : eg 10, 10000, 300000
    print "...comparison for %s x %s MDP; Q-learning numtrials : %s" % (
        mdp.cardValues, mdp.multiplicity, numqtrials)

    # value iteration
    solver = util.ValueIteration()  #algorithm instantiated
    solver.solve(mdp)  #algo applied to the MDP problem

    # q-learning simulate :
    phi = identityFeatureExtractor
    # phi = blackjackFeatureExtractor

    rl = QLearningAlgorithm(actions=mdp.actions,
                            discount=mdp.discount(),
                            featureExtractor=phi,
                            explorationProb=0.2)
    # simulate_QL_over_MDP(mdp, rl)
    totPVs = util.simulate(
        mdp, rl, numTrials=numqtrials,
        verbose=False)  #returns list of totRewards for each trial
    # print " ........ totPVs : %s " %totPVs
    print " ........ # non-zero weights = %s" % sum(
        [1 for k, v in rl.weights.items() if v])

    # Vopt_est = max(rl.weights[(mdp.startState(),a)] for a in rl.actions(mdp.startState() ) )
    Vopt_est = max(rl.weights[(mdp.startState(), a)]
                   for a in rl.actions(mdp.startState()))

    print "...Comparison of Vopt : "
    print " ... value iteration = expected optimal PV :: optimal utility of startState, stdev: ( %s, 0 )" % (
        solver.V[mdp.startState()])
    print " ... q-learning: avg PV :: utility, stdev over all trials: ( %s, %s ) (see note * below)" % (
        statistics.mean(totPVs), statistics.stdev(totPVs))
    print " ... q-learning: estimated optimal PV :: optimal utility of startState : ( %s, 0 )" % Vopt_est
    # plotQL(totPVs)

    print "...Comparison of policies (rerun with explorationProb = 0) : "
    # rerun QL now with 0 exploration prob (since learned)
    rl.explorationProb = 0
    totPVs = util.simulate(mdp, rl, numTrials=numqtrials,
                           verbose=False)  #reruns simulation
    Vopt_est = max(rl.weights[(mdp.startState(), a)]
                   for a in rl.actions(mdp.startState()))
    print " ... q-learning: estimated optimal PV :: optimal utility of startState : ( %s, 0 )" % Vopt_est
    print " ... # non-zero weights = %s" % sum(
        [1 for k, v in rl.weights.items() if v])

    #sample weights :
    # s = mdp.startState()
    # print "weights for startState : %s" %{k:v for k,v in rl.weights.items() if k[0] == s}
    # print "--> vip = %s" %max((rl.weights[(s,a)],a) for a in rl.actions(s) )[1]

    diffs = 0  # counts number of differences in policy btw VI and QL
    for s, p in solver.pi.items():  # using value-iteration policy as starting point
        rlp = max((rl.weights[(s, a)], a) for a in rl.actions(s))[1]
        if rlp != p:
            diffs += 1
    print("number of different policies btw VI and QL , out of total : %s / %s = %4.2f" % (
        diffs, len(solver.pi), diffs / (1.0 * len(solver.pi))))
Example #20
def mdpsolve(mdp):
    solver = util.ValueIteration()  #algorithm instantiated
    solver.solve(mdp)  #algo applied to the MDP problem
    print "Vopt : %s " % solver.V
    print "optimal_policy : %s " % solver.pi
Example #21
policy_filename = results.output_policy_fn
value_filename = results.output_value_fn

print('loading pkl')
all_flights = {}
with open(r"airport_to_flights_dict.pkl", "rb") as input_file:
    all_flights = pickle.load(input_file)
print('done loading pkl')

np.random.seed(11)

mdp = FlightMDP(initial_origin=initial_origin,
                start_time=datetime.datetime(2015, 1, 11, 8, 30),
                final_destination=destination,
                prune_direct=prune_direct)
alg = util.ValueIteration()
alg.solve(mdp, epsilon)

with open(value_filename, 'wb') as handle:
    pickle.dump(alg.V, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(policy_filename, 'wb') as handle:
    pickle.dump(alg.pi, handle, protocol=pickle.HIGHEST_PROTOCOL)

print('dumped new policies')
print('printing final path')

state = mdp.startState()
path = [(state, None)]
while True:
    print('\n')
Example #22
    featureKey = (state, action)
    featureValue = 1
    return [(featureKey, featureValue)]


############################################################
# Problem 4b: convergence of Q-learning
# Small test case

smallMDP = BlackjackMDP(cardValues=[1, 5],
                        multiplicity=2,
                        threshold=10,
                        peekCost=1)

smallMDP.computeStates()
ValueIterationSolution = util.ValueIteration()
ValueIterationSolution.solve(smallMDP)

rl = QLearningAlgorithm(smallMDP.actions, smallMDP.discount(),
                        identityFeatureExtractor)
rl.explorationProb = 0  # note: set before simulate(), so the agent acts greedily (no exploration) while learning
util.simulate(smallMDP,
              rl,
              numTrials=30000,
              maxIterations=1000,
              verbose=False,
              sort=False)

similar = 0.0
total = 0.0
for s in smallMDP.states: