def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment). We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    rl = QLearningAlgorithm(mdp.actions, mdp.discount(), featureExtractor)
    util.simulate(mdp, rl, 30000)

    # Count how many learned weights are (numerically) zero.
    zero_weight_count = 0
    total_weight_count = 0
    for key in rl.weights:
        weight = rl.weights[key]
        total_weight_count += 1
        if abs(weight) <= 1e-5:
            zero_weight_count += 1
    print "Total Weights: %s, Zero Weights: %s" % (total_weight_count, zero_weight_count)

    # Act greedily from now on and compare against the value-iteration policy.
    rl.explorationProb = 0
    vi = ValueIteration()
    vi.solve(mdp)
    count = 0
    expected_result = 0
    for key in vi.pi:
        count += 1
        if vi.pi[key] == rl.getAction(key):
            expected_result += 1
    print "total (state, action) pairs: %s" % (count * 3)
    print "Accuracy of MDP using the featureExtractor: %s" % (float(expected_result) / count * 100)
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    # Policy 1: value iteration on the original MDP, replayed as a fixed policy on the modified MDP.
    original_mdp.computeStates()
    vi = ValueIteration()
    vi.solve(original_mdp)
    rl = util.FixedRLAlgorithm(vi.pi.copy())
    rewards = util.simulate(modified_mdp, rl, numTrials=10000, maxIterations=1000, verbose=False, sort=False)
    print('Fixed VI policy, average reward: {}'.format(sum(rewards) / float(len(rewards))))

    # Policy 2: Q-learning run directly on the modified MDP.
    modified_mdp.computeStates()
    rl = QLearningAlgorithm(modified_mdp.actions, modified_mdp.discount(), featureExtractor, 0.2)
    rewards = util.simulate(modified_mdp, rl, numTrials=10000, maxIterations=1000, verbose=False, sort=False)
    print('Q-learning, average reward: {}'.format(sum(rewards) / float(len(rewards))))
def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment). We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    ql = QLearningAlgorithm(actions=mdp.actions, discount=1, featureExtractor=featureExtractor)
    util.simulate(mdp, ql, numTrials=90000, maxIterations=1000)
    print(ql.numIters)

    ql.explorationProb = 0
    print(ql.explorationProb)
    ql.is_test = True

    vi = ValueIteration()
    vi.solve(mdp)
    match = [ql.getAction(state) == action for state, action in vi.pi.items()]
    # ql_action = [ql.getAction(state) for state, action in vi.pi.items()]
    # take_count = [action == 'Take' for state, action in vi.pi.items()]
    # peek_count = [action == 'Peek' for state, action in vi.pi.items()]
    # quit_count = [action == 'Quit' for state, action in vi.pi.items()]
    # print('Take: {}'.format(sum(take_count) / len(take_count)))
    # print('Peek: {}'.format(sum(peek_count) / len(take_count)))
    # print('Quit: {}'.format(sum(quit_count) / len(take_count)))
    percentage_match = sum(match) / len(match)
    # print(ql_action)
    # print(ql.weights)
    return percentage_match
def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment). We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    vi = ValueIteration()
    vi.solve(mdp)
    viQ = vi.pi

    mdp.computeStates()
    rl = QLearningAlgorithm(mdp.actions, mdp.discount(), featureExtractor, .2)
    util.simulate(mdp, rl, numTrials=30000, maxIterations=10, verbose=False, sort=False)
    rl.explorationProb = 0  # act greedily when extracting the learned policy

    d = {}
    for state in mdp.states:
        d[state] = rl.getAction(state)
    Diff = 0
    for k in d.keys():
        if d[k] != viQ[k]:
            Diff += 1
    return Diff
def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment). We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    valiter = ValueIteration()
    valiter.solve(mdp)

    # Simulate with 20% exploration probability, and then set to 0 after simulation
    rl = QLearningAlgorithm(mdp.actions, mdp.discount(), featureExtractor, explorationProb=0.2)
    util.simulate(mdp, rl, 30000, verbose=False)
    rl.explorationProb = 0

    # Extract the optimal policies and replicate the dict that comes from valiter.pi
    rl_result = dict()
    same = 0
    different = 0
    for state in valiter.pi.keys():
        rl_result[state] = rl.getAction(state)
        print rl.getAction(state), valiter.pi[state]
        if rl.getAction(state) == valiter.pi[state]:
            same = same + 1
        else:
            different = different + 1
    print same, different
    return valiter.pi, rl_result
def simulaMDP(mdp, extractor, explorationProb):
    value_iterator = ValueIteration()
    value_iterator.solve(mdp)
    policyVi = value_iterator.pi

    mdp.computeStates()
    qLearning = QLearningAlgorithm(mdp.actions, mdp.discount(), extractor, explorationProb)
    util.simulate(mdp, qLearning, 30000, 10, False, False)
    qLearning.explorationProb = 0

    actionsQ = {}
    for state in mdp.states:
        actionsQ[state] = qLearning.getAction(state)
    differentActions = 0
    for state in actionsQ.keys():
        if actionsQ[state] != policyVi[state]:
            differentActions += 1
    return differentActions

# SIMULATIONS (uncomment to test)
# 1 - MDP1 with explorationProb 0.2
# print("Difference on MDP1:", simulaMDP(MDP1, identityFeatureExtractor, 0.2))
# 2 - largeMDP with explorationProb 0
# print("Difference on largeMDP:", simulaMDP(largeMDP, identityFeatureExtractor, 0))
# 3 - largeMDP with explorationProb 0.2 and blackjackFeatureExtractor
# print("Difference on largeMDP:", simulaMDP(largeMDP, blackjackFeatureExtractor, 0.2))
# 4 - largeMDP with explorationProb 0.2 and blackjackFeatureExtractor
# print("Difference on largeMDP:", simulaMDP(largeMDP, blackjackFeatureExtractor, 0.2))
def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment). We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    rl = QLearningAlgorithm(mdp.actions, mdp.discount(), featureExtractor)  # actions, discount, feature extractor
    util.simulate(mdp, rl, numTrials=30000)
    rl.explorationProb = 0

    valueIter = util.ValueIteration()
    valueIter.solve(mdp)

    # `file` is assumed to be an open, writable file handle created by the caller.
    numberOfStates = 0
    numberOfDifferentStates = 0
    for state in mdp.states:
        if state not in valueIter.pi:
            file.write('Pi does not contain state {}\n'.format(state))
        else:
            if valueIter.pi[state] != rl.getAction(state) and state[2] is not None:
                numberOfDifferentStates += 1
                file.write('In state {} Pi gives action {}, but RL gives action {}\n'.format(state, valueIter.pi[state], rl.getAction(state)))
            numberOfStates += 1
    file.write('\n % of different actions = {}%\n'.format(float(numberOfDifferentStates) / numberOfStates * 100))
    for weight in rl.weights:
        file.write('weight ({}) = {} \n'.format(weight, rl.weights[weight]))
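# The function above writes its report through a module-level `file` object that is not defined in
# this snippet. A minimal, hypothetical driver under that assumption (the log filename and the use
# of smallMDP with identityFeatureExtractor are illustrative, not part of the original code):
file = open('q4b_policy_log.txt', 'w')  # assumed global handle used by simulate_QL_over_MDP
simulate_QL_over_MDP(smallMDP, identityFeatureExtractor)
file.close()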
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    # Policy 1: value iteration on the original MDP, replayed (fixed) on the modified MDP.
    val = util.ValueIteration()
    val.solve(original_mdp)
    val_policy = val.pi
    RL1 = util.FixedRLAlgorithm(val_policy)
    result1 = util.simulate(modified_mdp, RL1, numTrials=50000, maxIterations=1000, verbose=False, sort=False)
    avg1 = sum(result1) / float(len(result1))
    print('Average reward, fixed VI policy: {}'.format(avg1))

    # Policy 2: Q-learning trained directly on the modified MDP.
    RL2 = QLearningAlgorithm(modified_mdp.actions, modified_mdp.discount(), featureExtractor, explorationProb=0.2)
    result2 = util.simulate(modified_mdp, RL2, numTrials=50000, maxIterations=1000, verbose=False, sort=False)
    avg2 = sum(result2) / float(len(result2))
    print('Average reward, Q-learning: {}'.format(avg2))
def simulate_QL_over_MDP(MDP, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment). We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    # Learn with 20% exploration, then act greedily when reading off the policy.
    RL = QLearningAlgorithm(MDP.actions, MDP.discount(), featureExtractor, explorationProb=0.2)
    util.simulate(MDP, RL, numTrials=30000, maxIterations=1000, verbose=False, sort=False)
    RL.explorationProb = 0

    MDP.computeStates()
    RL_policy = {}
    for state in MDP.states:
        RL_policy[state] = RL.getAction(state)

    val = util.ValueIteration()
    val.solve(MDP)
    val_policy = val.pi

    sum_ = []
    for key in RL_policy:
        if RL_policy[key] == val_policy[key]:
            sum_.append(1)
        else:
            sum_.append(0)
    print(float(sum(sum_)) / len(RL_policy))
    return RL_policy, val_policy
def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment). We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    ql = QLearningAlgorithm(mdp.actions, mdp.discount(), featureExtractor)
    q_rewards = util.simulate(mdp, ql, 30000)
    avg_reward_q = float(sum(q_rewards)) / len(q_rewards)

    vi = ValueIteration()
    vi.solve(mdp)
    rl = util.FixedRLAlgorithm(vi.pi)
    vi_rewards = util.simulate(mdp, rl, 30000)
    avg_reward_vi = float(sum(vi_rewards)) / len(vi_rewards)

    ql.explorationProb = 0
    ql_pi = {}
    for state, _ in vi.pi.items():
        ql_pi[state] = ql.getAction(state)

    p_vi = vi.pi
    diff = 0
    for state in vi.pi.keys():
        if vi.pi[state] != ql_pi[state]:
            diff += 1
    print("difference", diff, "over " + str(len(p_vi.keys())) + " states")
    print("percentage diff ", float(diff) / len(p_vi.keys()))
    print("avg_reward_diff", avg_reward_q - avg_reward_vi)
def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment). We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    mdp.computeStates()
    allStates = mdp.states

    # Run value iteration.
    solver = util.ValueIteration()
    solver.solve(mdp)
    optimalVIPolicy = solver.pi

    # Run Q-Learning algorithm and compute its optimal policy.
    ql = QLearningAlgorithm(actions=mdp.actions, discount=mdp.discount(), featureExtractor=featureExtractor)
    util.simulate(mdp, ql, numTrials=30000, maxIterations=10000)
    ql.explorationProb = 0.0
    optimalQLPolicy = {state: ql.getAction(state) for state in allStates}

    # Compute some statistics
    numDifferent = sum(1 for state in allStates if optimalQLPolicy[state] != optimalVIPolicy[state])
    print("{} out of {} states have different actions".format(numDifferent, len(allStates)))
def test4aHidden():
    smallMDP = submission.BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=10, peekCost=1)
    mdp = smallMDP
    mdp.computeStates()
    rl = submission.QLearningAlgorithm(mdp.actions, mdp.discount(), submission.identityFeatureExtractor, 0.2)
    util.simulate(mdp, rl, 30000)
def Q4c():
    # s = (3, None, (3,4,0))
    # fv = blackjackFeatureExtractor(s,'Take')
    # print "for state %s , action 'Take' ... \n ... feature vector returned: %s" % (s, fv)
    print "Comparing value iteration against simulated Q-learning as in 4b but using better featureExtractor:"
    phi = blackjackFeatureExtractor
    mdp = smallMDP      # TOGGLE THIS: smallMDP or largeMDP
    numqtrials = 100    # CHANGE THIS: e.g. 10, 10000, 300000
    print "...comparison for %s x %s MDP; Q-learning numtrials : %s" % (mdp.cardValues, mdp.multiplicity, numqtrials)

    # value iteration:
    solver = util.ValueIteration()  # algorithm instantiated
    solver.solve(mdp)               # algo applied to the MDP problem

    # q-learning simulate:
    rl = QLearningAlgorithm(actions=mdp.actions, discount=mdp.discount(), featureExtractor=phi, explorationProb=0.2)
    totPVs = util.simulate(mdp, rl, numTrials=numqtrials, verbose=False)  # returns list of totRewards for each trial
    print " ........ # non-zero weights = %s" % sum([1 for k, v in rl.weights.items() if v])
    Vopt_est = max(dotProduct(rl.weights, dict(phi(mdp.startState(), a))) for a in rl.actions(mdp.startState()))

    print "\n...Comparison of Vopt : "
    print " ... value iteration = expected optimal PV :: optimal utility of startState, stdev: ( %s, 0 )" % (solver.V[mdp.startState()])
    print " ... q-learning: avg PV :: utility, stdev over all trials: ( %s, %s ) (see note * below)" % (statistics.mean(totPVs), statistics.stdev(totPVs))
    print " ... q-learning: estimated optimal PV :: optimal utility of startState : ( %s, 0 )" % Vopt_est
    # plotQL(totPVs)

    # Comparison of VI and QL policies:
    print "\n...Comparison of policies (rerun with explorationProb = 0) : "
    rl.explorationProb = 0  # rerun QL now with 0 exploration prob (since learned)
    totPVs = util.simulate(mdp, rl, numTrials=numqtrials, verbose=False)  # reruns simulation
    Vopt_est = max(dotProduct(rl.weights, dict(phi(mdp.startState(), a))) for a in rl.actions(mdp.startState()))
    print " ... q-learning: estimated optimal PV :: optimal utility of startState : ( %s, 0 )" % Vopt_est

    diffs = 0  # counts number of differences in policy btw VI and QL
    for s, p in solver.pi.items():  # using value-iteration policy as starting point
        rlp = max((dotProduct(rl.weights, dict(phi(s, a))), a) for a in rl.actions(s))[1]
        if rlp != p:
            diffs += 1
            print "rlp : %s does not equal VIp : %s for state %s" % (rlp, p, s)
    print "number of different policies btw VI and QL , out of total : %s / %s = %4.2f" % (diffs, len(solver.pi), diffs / (1.0 * len(solver.pi)))
def simulateQL(mdp):
    mdp.computeStates()
    QLAlgorithm = QLearningAlgorithm(mdp.actions, mdp.discount(), identityFeatureExtractor)
    util.simulate(mdp, QLAlgorithm, 30000)
    QLAlgorithm.explorationProb = 0
    stateAndAction = {}
    for state in mdp.states:
        stateAndAction[state] = QLAlgorithm.getAction(state)
    return stateAndAction
def test_hidden(self):
    """4a-hidden: Hidden test for incorporateFeedback(). Run QLearningAlgorithm on smallMDP,
    then ensure that getQ returns reasonable value."""
    smallMDP = self.run_with_solution_if_possible(submission,
        lambda sub_or_sol: sub_or_sol.BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=10, peekCost=1))
    smallMDP.computeStates()
    rl = submission.QLearningAlgorithm(smallMDP.actions, smallMDP.discount(), submission.identityFeatureExtractor, 0.2)
    util.simulate(smallMDP, rl, 30000)
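# The actual assertion of this hidden test is not shown. A purely illustrative local sanity check,
# assuming only the public QLearningAlgorithm interface (getQ over the start state's actions), that
# could follow the simulate(...) call inside such a test:
#     startState = smallMDP.startState()
#     qValues = [rl.getQ(startState, action) for action in smallMDP.actions(startState)]
#     # After 30000 trials the learned Q-values should at least be finite numbers.
#     assert all(abs(q) < float('inf') for q in qValues)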
def simulate_QL_over_MDP(mdp, featureExtractor, verbose=False):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment). We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # 4b: with identityFeatureExtractor, Q-learning performs poorly because the chosen feature map
    # Phi generalizes particularly badly (it is an indicator function of (s, a)).
    # BEGIN_YOUR_CODE
    print("simulate_QL_over_MDP")

    # Solve via Value Iteration
    vi = util.ValueIteration()
    vi.solve(mdp, .0001)
    pi_vi = vi.pi  # pi computed with value iteration
    if verbose:
        print('len pi_vi : {}'.format(len(pi_vi)))

    # Solve via Q-Learning
    mdp.computeStates()
    rl = QLearningAlgorithm(mdp.actions, mdp.discount(), featureExtractor, 0.05)  # better than an exploration rate of 0.2
    util.simulate(mdp, rl, 30000)
    # util.simulate(mdp, rl, numTrials=30000, maxIterations=1000)
    # The full state space of the mdp is available through mdp.states (an attribute of mdp);
    # this attribute is initialized by the call to computeStates.
    pi_rl = rl.get_pi_opt(mdp.states)  # pi computed with Q-learning (RL)
    if verbose:
        print('len pi_rl : {}'.format(len(pi_rl)))

    if verbose:
        print('pi : ')
        print('Value Iteration')
        print('Reinforcement Learning')
        print('---')
        for state in mdp.states:
            print('{} : {}'.format(state, pi_vi[state]))
            print('{} : {}'.format(state, pi_rl[state]))
            print('---')

    print('Stats')
    print('Number of reachable states : {}'.format(len(mdp.states)))
    equal = 0.
    for state in mdp.states:  # keys of pi_rl (a subset of pi_vi, since pi_vi is exhaustive)
        if pi_vi[state] == pi_rl[state]:
            equal += 1
    print('Matches : {0:.2f} %'.format(equal / len(mdp.states) * 100))
    print('---')
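# `rl.get_pi_opt(mdp.states)` above refers to a helper method added to QLearningAlgorithm that is
# not shown in this snippet. A minimal sketch of what it might look like (the method name is taken
# from the call above; the body is an assumption based on the standard getQ/actions interface):
def get_pi_opt(self, states):
    # Greedy policy with respect to the learned Q-values, one action per state.
    pi = {}
    for state in states:
        pi[state] = max(self.actions(state), key=lambda action: self.getQ(state, action))
    return pi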
def Q4d():
    origMDP = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=10, peekCost=1)
    newThreshMDP = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=9, peekCost=1)

    # run VI on original MDP to obtain policy:
    solver = util.ValueIteration()  # algorithm instantiated
    solver.solve(origMDP)           # algo applied to the MDP problem
    print " ... VI Vopt(startState) = %s ." % (solver.V[origMDP.startState()])
    pi0 = solver.pi

    # apply this policy to an agent (in simulated mdp) playing the **new** MDP:
    numqtrials = 30000
    rl = util.FixedRLAlgorithm(pi0)
    mdp = origMDP
    totPVs = util.simulate(mdp, rl, numTrials=numqtrials, verbose=False)
    print " ... QL: avg PV, stdev using above VI opt policy on same mdp: ( %s, %s ) " % (statistics.mean(totPVs), statistics.stdev(totPVs))
    mdp = newThreshMDP
    totPVs = util.simulate(mdp, rl, numTrials=numqtrials, verbose=False)
    print "\n ... QL: avg PV, stdev using above VI opt policy on *NEW* mdp: ( %s, %s ) " % (statistics.mean(totPVs), statistics.stdev(totPVs))

    # now skip the fixed policy and use QL :
    phi = identityFeatureExtractor  # blackjackFeatureExtractor
    rl = QLearningAlgorithm(actions=mdp.actions, discount=mdp.discount(), featureExtractor=phi, explorationProb=0.5)
    totPVs = util.simulate(mdp, rl, numTrials=numqtrials, verbose=False)
    Vopt_est = max(dotProduct(rl.weights, dict(phi(mdp.startState(), a))) for a in rl.actions(mdp.startState()))
    print " ... QL: est. Vopt of startState : %s " % Vopt_est
    # plotQL(totPVs)

    # Comparison of VI and QL policies:
    rl.explorationProb = 0  # rerun QL now with 0 exploration prob (since learned)
    totPVs = util.simulate(mdp, rl, numTrials=numqtrials, verbose=False)  # reruns simulation
    Vopt_est = max(dotProduct(rl.weights, dict(phi(mdp.startState(), a))) for a in rl.actions(mdp.startState()))
    print " ... QL: est. Vopt of startState re-run (with eps = 0) : %s " % Vopt_est
def compareQLandVI(targetMDP, featureExtractor):
    QL = QLearningAlgorithm(targetMDP.actions, 1, featureExtractor)
    VI = ValueIteration()
    util.simulate(targetMDP, QL, numTrials=30000)
    VI.solve(targetMDP)
    diffPolicyStates = []
    QL.explorationProb = 0
    for state in targetMDP.states:
        # print state, QL.getAction(state), VI.pi[state]
        if QL.getAction(state) != VI.pi[state]:
            diffPolicyStates.append(state)
    print("%d/%d = %f%% different states" % (len(diffPolicyStates), len(targetMDP.states),
                                             100.0 * len(diffPolicyStates) / float(len(targetMDP.states))))
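# A possible driver for compareQLandVI, mirroring questions 4b/4c; the MDP and feature-extractor
# names are the ones used elsewhere in this assignment, and the calls here are illustrative:
compareQLandVI(smallMDP, identityFeatureExtractor)   # 4b, small MDP
compareQLandVI(largeMDP, identityFeatureExtractor)   # 4b, large MDP
compareQLandVI(largeMDP, blackjackFeatureExtractor)  # 4c, domain-specific features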
def problem4c():
    print('\n4c now')
    largeMDP.computeStates()
    QL_1 = QLearningAlgorithm(largeMDP.actions, largeMDP.discount(), identityFeatureExtractor, 0.2)
    QL_2 = QLearningAlgorithm(largeMDP.actions, largeMDP.discount(), blackjackFeatureExtractor, 0.2)
    QLReward_1 = util.simulate(largeMDP, QL_1, numTrials=30000)
    QLReward_2 = util.simulate(largeMDP, QL_2, numTrials=30000)
    print('QL reward using identityFeatureExtractor: {}'.format(sum(QLReward_1) / float(len(QLReward_1))))
    print('QL reward using blackjackFeatureExtractor: {}'.format(sum(QLReward_2) / float(len(QLReward_2))))
def weight_averages():
    mdp = model.DisasterMDP()
    random.seed(42)
    print('=' * 6, 'initialization', '=' * 6)
    qLearningSolver = util.QLearningAlgorithm(mdp.actions, 1, model.joint_bucket_max_feature_extractor)
    print('=' * 6, 'simulating', '=' * 6)
    totalQLRewards, _, _, _ = util.simulate(mdp, qLearningSolver, numTrials=num_trials)
    print('Avg QL Reward:', sum(totalQLRewards) / len(totalQLRewards))

    weights = qLearningSolver.weights
    labels = ['resources', 'severities', 'max', 'joint']
    counter = [0, 0, 0, 0]
    sums = [0, 0, 0, 0]
    for w, val in weights.items():
        # Check 'max_severity' before the generic 'severity' so it isn't shadowed.
        if 'resource' in w:
            if 'severity' in w:
                counter[3] += 1
                sums[3] += abs(val)
            else:
                counter[0] += 1
                sums[0] += abs(val)
        elif 'max_severity' in w:
            counter[2] += 1
            sums[2] += abs(val)
        elif 'severity' in w:
            counter[1] += 1
            sums[1] += abs(val)
    for i in range(len(sums)):
        sums[i] /= counter[i]
    return labels, sums
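# A minimal sketch for visualizing the averages returned by weight_averages(), assuming matplotlib
# is available; the output filename is illustrative:
import matplotlib.pyplot as plt

labels, avg_weights = weight_averages()
plt.bar(range(len(labels)), avg_weights, tick_label=labels)
plt.ylabel('mean |weight|')
plt.title('Average absolute weight per feature group')
plt.savefig('weight_averages.png')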
def main():
    with open('true__op.json') as f:
        op = json.load(f)

    # read parameters
    with open(op['optics']) as f:
        opt_op = json.load(f)
    ctf_op = opt_op['ctf']

    v = TIF.pickle_load(op['maps file'])[op['pid']]  # load a 3D density map
    if not op['intensity_positive']:
        v = -v
    v = GR.rotate_pad_zero(v, angle=op['rotate_angle'], loc_r=op['translation'])  # rotate image
    p = N.squeeze(v.sum(axis=2))  # make a projection image along z-axis

    ctf = IOC.create(size=p.shape, Dz=ctf_op['Dz'], pix_size=ctf_op['pix_size'], voltage=ctf_op['voltage'], Cs=ctf_op['Cs'], sigma=ctf_op['sigma'])['ctf']

    p_var = p.var()
    n_var = p_var / opt_op['snr']

    # simulate a number of images
    imgs = []
    for i in range(op['image_num']):
        print '\r', i, ' ',
        sys.stdout.flush()
        imgs.append(util.simulate(p=p, ctf=ctf, noise_total_var=n_var))

    with open(op['images_out'], 'wb') as f:
        pickle.dump(imgs, f, protocol=-1)
def print_algorithms_compare(mdp, ql, episodes):
    vi = ValueIteration()
    vi.solve(mdp)
    rewards_vi = sum(simulateVI(mdp, vi, episodes))
    rewards_ql = sum(util.simulate(mdp, ql, episodes))
    print("VI | %.4f" % (rewards_vi / episodes))
    print("QL | %.4f" % (rewards_ql / episodes))
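# `simulateVI` is not defined in this snippet. A minimal sketch under the assumption that it simply
# replays the value-iteration policy through util.FixedRLAlgorithm and util.simulate:
def simulateVI(mdp, vi, episodes):
    # Follow the fixed value-iteration policy for the requested number of episodes
    # and return the list of total rewards, one per episode.
    fixedPolicy = util.FixedRLAlgorithm(vi.pi)
    return util.simulate(mdp, fixedPolicy, numTrials=episodes)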
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    # Policy 1: value-iteration policy from the original MDP, replayed on the modified MDP.
    valueIteration = ValueIteration()
    valueIteration.solve(original_mdp)
    rl = util.FixedRLAlgorithm(valueIteration.pi)
    rewards = util.simulate(modified_mdp, rl)
    print('Average reward, fixed VI policy: {}'.format(sum(rewards) / len(rewards)))

    # Policy 2: Q-learning warm-started on the original MDP, then continued on the modified MDP.
    rl = QLearningAlgorithm(original_mdp.actions, original_mdp.discount(), featureExtractor)
    rewards = util.simulate(original_mdp, rl, numTrials=30000)
    rewards = util.simulate(modified_mdp, rl, numTrials=30000)
    print('Average reward, Q-learning: {}'.format(sum(rewards) / len(rewards)))
    # END_YOUR_CODE
def find_weights_distribution():
    mdp = model.DisasterMDP()
    random.seed(42)
    print('=' * 6, 'initialization', '=' * 6)
    qLearningSolver = util.QLearningAlgorithm(mdp.actions, 1, model.joint_bucket_max_feature_extractor)
    print('=' * 6, 'simulating', '=' * 6)
    totalQLRewards, _, _, _ = util.simulate(mdp, qLearningSolver, numTrials=num_trials)
    print('Avg QL Reward:', sum(totalQLRewards) / len(totalQLRewards))

    weights = qLearningSolver.weights
    sorted_weights = sorted(weights.items(), key=lambda kv: abs(kv[1]))
    num_keep = 100
    print('Here are the top {} weights by absolute value'.format(num_keep))
    highest_weights = sorted_weights[-1 * num_keep:]

    labels = ['resources', 'severities', 'max', 'joint']
    counter = [0, 0, 0, 0]
    for w, _ in highest_weights:
        # Check 'max_severity' before the generic 'severity' so it isn't shadowed.
        if 'resource' in w:
            if 'severity' in w:
                counter[3] += 1
            else:
                counter[0] += 1
        elif 'max_severity' in w:
            counter[2] += 1
        elif 'severity' in w:
            counter[1] += 1
    return labels, counter
def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment). We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    # for reproducibility
    random.seed(123)

    # initialization
    mdp.computeStates()  # to get the whole state space of the MDP
    rl = QLearningAlgorithm(mdp.actions, mdp.discount(), featureExtractor, 0.2)

    # Value Iteration part
    algorithm = ValueIteration()
    algorithm.solve(mdp, .001)  # algorithm now contains the Value and Policy of the mdp.

    # Q-Learning part
    util.simulate(mdp, rl, 30000)  # Model-free; simulate 30000 trials. After this, Q-learning has been learned.
    rl.explorationProb = 0  # set ε to 0 and then .getAction(state) works as a policy Π.

    Qpi = {}
    comparison = []
    for state in mdp.states:  # get the Q-learning policy and comparison results.
        Qpi[state] = rl.getAction(state)
        comparison.append(int(algorithm.pi[state] == Qpi[state]))

    if featureExtractor == identityFeatureExtractor:
        if mdp.multiplicity == 2:
            print("The match rate of using identityFeatureExtractor for smallMDP: %.4f"
                  % (sum(comparison) / len(comparison)))
        else:
            print("The match rate of using identityFeatureExtractor for largeMDP: %.4f"
                  % (sum(comparison) / len(comparison)))
    else:
        print("The match rate of using blackjackFeatureExtractor for largeMDP: %.4f"
              % (sum(comparison) / len(comparison)))
    print("Number of different actions: %d  Number of total actions: %d"
          % (len(comparison) - sum(comparison), len(comparison)))
def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment). We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    # util.simulate expects an RLAlgorithm instance, not the QLearningAlgorithm class itself.
    rl = QLearningAlgorithm(mdp.actions, mdp.discount(), featureExtractor)
    listSmall = util.simulate(mdp, rl, 30000)
    print listSmall
def main():
    # smallMDP = BlackjackMDP(cardValues=[1, 5], multiplicity=2,
    #                         threshold=15, peekCost=1)
    # mdp_1 = QLearningAlgorithm(
    #     MDP1.actions, MDP1.discount(), identityFeatureExtractor)
    # mdp_2 = QLearningAlgorithm(
    #     MDP2.actions, MDP1.discount(), identityFeatureExtractor)
    vi = ValueIteration()
    vi.solve(largeMDP)
    for _, val in vi.pi.items():
        print(val)

    l_mdp_identity = QLearningAlgorithm(largeMDP.actions, largeMDP.discount(), identityFeatureExtractor)
    l_mdp_blackjack = QLearningAlgorithm(largeMDP.actions, largeMDP.discount(), blackjackFeatureExtractor)
    print(util.simulate(largeMDP, l_mdp_identity, numTrials=10, maxIterations=30000, verbose=True))
    print(util.simulate(largeMDP, l_mdp_blackjack, numTrials=10, maxIterations=30000, verbose=True))
def simulate_QL_over_MDP(mdp, featureExtractor):
    # NOTE: adding more code to this function is totally optional, but it will probably be useful
    # to you as you work to answer question 4b (a written question on this assignment). We suggest
    # that you add a few lines of code here to run value iteration, simulate Q-learning on the MDP,
    # and then print some stats comparing the policies learned by these two approaches.
    # BEGIN_YOUR_CODE
    valueIteration = ValueIteration()
    valueIteration.solve(mdp)
    vi_pi = valueIteration.pi

    rl = QLearningAlgorithm(mdp.actions, mdp.discount(), featureExtractor)
    util.simulate(mdp, rl, numTrials=30000, verbose=False)
    rl.explorationProb = 0

    diff, total = 0, len(mdp.states)
    for state in mdp.states:
        if vi_pi[state] != rl.getAction(state):
            diff += 1
    print('{:.3f}'.format(100 * diff / total) + '%')
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    vi = ValueIteration()
    vi.solve(original_mdp)
    rewards = util.simulate(modified_mdp, util.FixedRLAlgorithm(vi.pi), 10000)
    print "Expected Reward on modified mdp using original mdp policy: %.4f" % (float(sum(r for r in rewards)) / len(rewards))

    rewards_new = util.simulate(modified_mdp,
                                QLearningAlgorithm(modified_mdp.actions, original_mdp.discount(), featureExtractor, 0.1),
                                10000)
    print "Expected Reward on modified mdp using Q Learning: %.4f" % (float(sum(r for r in rewards_new)) / len(rewards_new))
def compare_changed_MDP(original_mdp, modified_mdp, featureExtractor):
    # NOTE: as in 4b above, adding more code to this function is completely optional, but we've added
    # this partial function here to help you figure out the answer to 4d (a written question).
    # Consider adding some code here to simulate two different policies over the modified MDP
    # and compare the rewards generated by each.
    # BEGIN_YOUR_CODE
    valueIterOriginal = util.ValueIteration()
    valueIterOriginal.solve(original_mdp)
    fixedRL = util.FixedRLAlgorithm(valueIterOriginal.pi)
    rewards = util.simulate(modified_mdp, fixedRL)
    print("Fixed RL")
    for reward in rewards:
        print(reward)

    rewardsFromQ = util.simulate(modified_mdp,
                                 QLearningAlgorithm(modified_mdp.actions, modified_mdp.discount(), featureExtractor))
    print('QLearn')
    for reward in rewardsFromQ:
        print(reward)
def simulation(mdp1, feature=submission.identityFeatureExtractor):
    learning = submission.QLearningAlgorithm(mdp1.actions, 1, feature)
    rewards = util.simulate(mdp1, learning, numTrials=30000)
    learning.explorationProb = 0
    # states = mdp1.computeStates()
    vi = submission.ValueIteration()
    vi.solve(mdp1)
    total = 0
    same = 0
    for state in mdp1.states:
        print state, vi.pi[state], learning.getAction(state)
        if vi.pi[state] == learning.getAction(state):
            same += 1
        total += 1
    print "utility %.2f same action percentage is %.2f" % (sum(rewards) / float(len(rewards)), same / float(total))
import util
import submission

# VI policy learned on originalMDP, replayed on both MDPs.
vi = submission.ValueIteration()
vi.solve(submission.originalMDP)
fixedRLA = util.FixedRLAlgorithm(vi.pi)
rewards = util.simulate(submission.newThresholdMDP, fixedRLA, numTrials=30000)
print "average utility (originalMDP policy on newThresholdMDP): " + str(sum(rewards) / float(len(rewards)))
rewards = util.simulate(submission.originalMDP, fixedRLA, numTrials=30000)
print "average utility (originalMDP policy on originalMDP): " + str(sum(rewards) / float(len(rewards)))

# Q-learning trained directly on the new-threshold MDP.
mdp2 = submission.newThresholdMDP
learning = submission.QLearningAlgorithm(mdp2.actions, 1, submission.blackjackFeatureExtractor)
rewards = util.simulate(mdp2, learning, numTrials=30000)
print "average utility (Q-learning on newThresholdMDP): " + str(sum(rewards) / float(len(rewards)))

# VI policy learned on newThresholdMDP, replayed on newThresholdMDP.
vi2 = submission.ValueIteration()
vi2.solve(submission.newThresholdMDP)
fixed2 = util.FixedRLAlgorithm(vi2.pi)
rewards = util.simulate(submission.newThresholdMDP, fixed2, numTrials=30000)
print "average utility (newThresholdMDP policy on newThresholdMDP): " + str(sum(rewards) / float(len(rewards)))
for player in allPlayers:
    last_name, num, team = player.split("-", 2)
    allPlayers[player].stats = all_player_features[num + "-" + team]

'''
for p in allPlayers.keys():
    print "------------"
    print allPlayers[p].name
    print allPlayers[p].team
    print allPlayers[p].position
    print allPlayers[p].price
    print allPlayers[p].stats
    print "------------"
'''

budget = 100.0
mdp = ComputeRosterMDP(players, budget, allTeams, allPlayers)
rl = util.QLearningAlgorithm(mdp.actions, mdp.discount(), util.fantasyFeatureExtractor)
print "Finished in %s iterations" % rl.numIters
bestSequence, qRewards = util.simulate(mdp, rl, numTrials=1, maxIterations=100, verbose=True)
print "qRewards: %s" % (sum(qRewards) / len(qRewards))
bestSequenceNames = [p.name for p in bestSequence]
print "best set of players is", bestSequenceNames
# print "best set of players", rl.bestSequence
# mdp.computeStates()
def testQL(): deck = poker.Deck() deck.shuffle() mdp = None QL = None human = False oppType = None humanActions = [] #function to load weight from file. #Text in file should be of the format {(feature1): value1, (feature2):value2} def loadWeight(fileName): with open(fileName,'r') as inf: dict_from_file = eval(inf.read()) return collections.Counter(dict_from_file) userInput = raw_input('Type S to simulate QLearning, hit Enter otherwise: ') if(userInput == 'S' or userInput == 's'): print 'What type of opponent would you like to simulate?' print '0. Tight-Aggressive' print '1. Loose-Aggressive' print '2. Tight-Passive' print '3. Loose-Passive' print '4. Random' userInput = int(raw_input('Opponent Type: ')) oppType = '' if userInput == 0: oppType = 'TAG' elif userInput == 1: oppType = 'LAG' elif userInput == 2: oppType = 'TPA' elif userInput == 3: oppType = 'LPA' elif userInput == 4: oppType = 'RANDOM' print 'How many Q-Learning trials do you wish to run?' print 'WARNING: We strongly recommend using 1000 trials or less' print 'In our experience, 1000 gets done in about 10 minutes most cases' print 'Anything over that can take hours' userTrial = int(raw_input('Number of trials: ')) print 'How many tests do you want to run on the generated weight vector?' numIter = int(raw_input('Number of tests: ')) mdp = util.pokerMDP(deck, oppType) QL= QLearningAlgorithm(mdp.actions, mdp.discount(), pokerFeatureExtractor, 0.2) print util.simulate(mdp, QL, numTrials=userTrial, maxIterations=10000) print QL.weights print 'Weight length: %d' %len(QL.weights) else: human = True print 'What type of opponent weight-vector do you wish to start with?' print '0. Tight-Aggressive' print '1. Loose-Aggressive' print '2. Tight-Passive' print '3. Loose-Passive' print '4. Random' userInput = int(raw_input('Opponent Type: ')) oppType = '' if userInput == 0: oppType = 'TAG' mdp = util.pokerMDP(deck, oppType) QL= QLearningAlgorithm(mdp.actions, mdp.discount(), pokerFeatureExtractor, 0.2) QL.weights = loadWeight('w_tag_5k.txt') elif userInput == 1: oppType = 'LAG' mdp = util.pokerMDP(deck, oppType) QL= QLearningAlgorithm(mdp.actions, mdp.discount(), pokerFeatureExtractor, 0.2) QL.weights = loadWeight('w_lag_5k.txt') elif userInput == 2: oppType = 'TPA' mdp = util.pokerMDP(deck, oppType) QL= QLearningAlgorithm(mdp.actions, mdp.discount(), pokerFeatureExtractor, 0.2) QL.weights = loadWeight('w_tpa_5k.txt') elif userInput == 3: oppType = 'LPA' mdp = util.pokerMDP(deck, oppType) QL= QLearningAlgorithm(mdp.actions, mdp.discount(), pokerFeatureExtractor, 0.2) QL.weights = loadWeight('w_lpa_5k.txt') elif userInput == 4: oppType = 'RANDOM' mdp = util.pokerMDP(deck, oppType) QL= QLearningAlgorithm(mdp.actions, mdp.discount(), pokerFeatureExtractor, 0.2) QL.weights = loadWeight('w_random_5k.txt') print 'How many games do you wish to play?' 
print 'Choose more than 10 to enable opponent recognition' numIter = int(raw_input()) def opponentRecognition(l): # Assume opponent plays 10 games # Features: total money bet, number of folds, number of checks, number of raises totalBet = 0 folds = 0 checks = 0 raises = 0 for action in l: if action[0] == 'Fold': folds += 1 elif action[1] == 0: checks += 1 else : raises += 1 totalBet += action[1] betPerRaise = (1.0*totalBet)/raises if folds > 2: # tight if betPerRaise > 6 or 3*raises > checks: return 'TAG' else : return 'TPA' else : # loose if betPerRaise > 6 or 3*raises > checks: return 'LAG' else : return 'LPA' def playGame(QL, deck, table, agent, opp, human): def oppPlay(i,agentAction): oppState = (agent.hand, table.tableCards, table.bettingPot, agentAction, i) if not human: oppAction = opp.determinePolicy(oppState) table.incrementOppBet(oppAction[1]) table.appendAction(oppAction) return oppAction else: actions = mdp.actions(oppState) index = input('Type action index:' + str(actions)) print 'Your Action: ' + str(actions[index]) table.incrementOppBet(actions[index][1]) table.appendAction(actions[index]) humanActions.append(actions[index]) return actions[index] def agentPlay(i, oppAction): agentState = (agent.hand, table.tableCards, table.bettingPot, oppAction, i) agentAction = QL.getAction(agentState) table.incrementAgentBet(agentAction[1]) table.appendAction(agentAction) if human: print 'Agent Action: ' + str(agentAction) return agentAction def determineFullGameWinner(deck, table, agent, opp): cardsNeeded = 5 - len(table.tableCards) if cardsNeeded > 0: for i in range(cardsNeeded): table.flipCard(deck) agentVal = agent.assessHand(table.tableCards) oppVal = opp.assessHand(table.tableCards) agentVal = (agentVal[0], sorted(agentVal[1], reverse=True)) oppVal = (oppVal[0], sorted(oppVal[1], reverse=True)) if agentVal[0] > oppVal[0]: return "Agent" elif agentVal[0] == oppVal[0]: if agentVal[1] > oppVal[1]: return "Agent" elif agentVal[1] < oppVal[1]: return "Opp" return "Tie" return 0 return "Opp" # shuffle deck deck.shuffle() # deal players #table.dealPlayers(agent,opp,deck) if human: print'Your cards are:' + str(opp.hand) oppAction = oppPlay(0, (None,0)) if oppAction[0] == 'Fold': agentUtility = table.getOppBet() return ('OppLeft', agentUtility) agentAction = agentPlay(1,oppAction) if agentAction[0] == 'Fold': if human: print 'Agent\'s hand revealed: ' + str(agent.hand) print 'You win: %d' %table.bettingPot agentUtility = -(table.getAgentBet()) couldHaveWon = determineFullGameWinner(deck, table, agent, opp) if couldHaveWon == "Agent" or couldHaveWon == "Tie": return ('GoodFold', agentUtility) return ('BadFold', agentUtility) # in case agent raises if agentAction[1] > oppAction[1]: oppAction = oppPlay(2, agentAction) if oppAction[0] == 'Fold': agentUtility = table.getOppBet() return ('OppLeft', agentUtility) agentAction = agentPlay(3, oppAction) if agentAction[0] == 'Fold': if human: print 'Agent\'s hand revealed: ' + str(agent.hand) print 'You win: %d' %table.bettingPot agentUtility = -(table.getAgentBet()) couldHaveWon = determineFullGameWinner(deck, table, agent, opp) if couldHaveWon == "Agent" or couldHaveWon == "Tie": return ('GoodFold', agentUtility) return ('BadFold', agentUtility) # deal table - flop table.flipCard(deck) table.flipCard(deck) table.flipCard(deck) if human: print 'Flop: ' + str(table.tableCards) print 'Your cards: ' + str(opp.hand) print 'Pot: ' + str(table.bettingPot) # asses hand oppAction = oppPlay(0, (None,0)) if oppAction[0] == 'Fold': agentUtility = 
table.getOppBet() return ('OppLeft', agentUtility) agentAction = agentPlay(1,oppAction) if agentAction[0] == 'Fold': if human: print 'Agent\'s hand revealed: ' + str(agent.hand) print 'You win: %d' %table.bettingPot agentUtility = -(table.getAgentBet()) couldHaveWon = determineFullGameWinner(deck, table, agent, opp) if couldHaveWon == "Agent" or couldHaveWon == "Tie": return ('GoodFold', agentUtility) return ('BadFold', agentUtility) # in case agent raises if agentAction[1] > oppAction[1]: oppAction = oppPlay(2, agentAction) if oppAction[0] == 'Fold': agentUtility = table.getOppBet() return ('OppLeft', agentUtility) agentAction = agentPlay(3, oppAction) if agentAction[0] == 'Fold': if human: print 'Agent\'s hand revealed: ' + str(agent.hand) print 'You win: %d' %table.bettingPot agentUtility = -(table.getAgentBet()) couldHaveWon = determineFullGameWinner(deck, table, agent, opp) if couldHaveWon == "Agent" or couldHaveWon == "Tie": return ('GoodFold', agentUtility) return ('BadFold', agentUtility) # deal table - turn table.flipCard(deck) if human: print 'Turn: ' + str(table.tableCards) print 'Your cards: ' + str(opp.hand) print 'Pot: ' + str(table.bettingPot) # asses hand oppAction = oppPlay(0, (None,0)) if oppAction[0] == 'Fold': agentUtility = table.getOppBet() return ('OppLeft', agentUtility) agentAction = agentPlay(1,oppAction) if agentAction[0] == 'Fold': if human: print 'Agent\'s hand revealed: ' + str(agent.hand) print 'You win: %d' %table.bettingPot agentUtility = -(table.getAgentBet()) couldHaveWon = determineFullGameWinner(deck, table, agent, opp) if couldHaveWon == "Agent" or couldHaveWon == "Tie": return ('GoodFold', agentUtility) return ('BadFold', agentUtility) # in case agent raises if agentAction[1] > oppAction[1]: oppAction = oppPlay(2, agentAction) if oppAction[0] == 'Fold': agentUtility = table.getOppBet() return ('OppLeft', agentUtility) agentAction = agentPlay(3, oppAction) if agentAction[0] == 'Fold': if human: print 'Agent\'s hand revealed: ' + str(agent.hand) print 'You win: %d' %table.bettingPot agentUtility = -(table.getAgentBet()) couldHaveWon = determineFullGameWinner(deck, table, agent, opp) if couldHaveWon == "Agent" or couldHaveWon == "Tie": return ('GoodFold', agentUtility) return ('BadFold', agentUtility) # deal table - river table.flipCard(deck) if human: print 'River: ' + str(table.tableCards) print 'Your cards: ' + str(opp.hand) print 'Pot: ' + str(table.bettingPot) # asses hand oppAction = oppPlay(0, (None,0)) if oppAction[0] == 'Fold': agentUtility = table.getOppBet() return ('OppLeft', agentUtility) agentAction = agentPlay(1,oppAction) if agentAction[0] == 'Fold': if human: print 'Agent\'s hand revealed: ' + str(agent.hand) print 'You win: %d' %table.bettingPot agentUtility = -(table.getAgentBet()) couldHaveWon = determineFullGameWinner(deck, table, agent, opp) if couldHaveWon == "Agent" or couldHaveWon == "Tie": return ('GoodFold', agentUtility) return ('BadFold', agentUtility) # in case agent raises if agentAction[1] > oppAction[1]: oppAction = oppPlay(2, agentAction) if oppAction[0] == 'Fold': agentUtility = table.getOppBet() return ('OppLeft', agentUtility) agentAction = agentPlay(3, oppAction) if agentAction[0] == 'Fold': if human: print 'Agent\'s hand revealed: ' + str(agent.hand) print 'You win: %d' %table.bettingPot agentUtility = -(table.getAgentBet()) couldHaveWon = determineFullGameWinner(deck, table, agent, opp) if couldHaveWon == "Agent" or couldHaveWon == "Tie": return ('GoodFold', agentUtility) return ('BadFold', agentUtility) agentVal = 
agent.assessHand(table.tableCards) oppVal = opp.assessHand(table.tableCards) agentVal = (agentVal[0], sorted(agentVal[1], reverse=True)) oppVal = (oppVal[0], sorted(oppVal[1], reverse=True)) if human: print 'Agent\'s hand revealed: ' + str(agent.hand) if agentVal[0] > oppVal[0]: if human: print 'You lose ' + str(table.bettingPot) return ('Win', table.getOppBet()) elif agentVal[0] == oppVal[0]: if agentVal[1] > oppVal[1]: if human: print 'You lose ' + str(table.bettingPot) return ('Win', table.getOppBet()) elif agentVal[1] < oppVal[1]: if human: print 'You win ' + str(table.bettingPot) return ('Lose', -(table.getAgentBet())) return ('Win', 0) #Count ties as a win for simplicity if human: print 'You win ' + str(table.bettingPot) return ('Lose', -(table.getAgentBet())) agent = poker.Agent() opp = poker.Opponent(oppType) table = poker.Table(mdp.deck) stateHistory = {} utilityHistory = {} QL.explorationProb = 0 #No more exploration humanMultigameHistory = [] for gameNum in range(numIter): mdp.startState() if human: mdp.table.bettingPot = 0 #Make sure starting pot is zero result = playGame(QL, mdp.deck, mdp.table, mdp.agent, mdp.opponent, human) state, utility = result[0], result[1] if state in stateHistory: stateHistory[state] += 1 else: stateHistory[state] = 1 if utility in utilityHistory: utilityHistory[utility] += 1 else: utilityHistory[utility] = 1 if human: humanMultigameHistory.append(humanActions) if len(humanMultigameHistory) > 10: humanMultigameHistory = humanMultigameHistory[1:] if len(humanMultigameHistory) == 10: humanActs = [] for i in range(len(humanMultigameHistory)): for j in range(len(humanMultigameHistory[i])): humanActs.append(humanMultigameHistory[i][j]) newOppType = opponentRecognition(humanActs) if newOppType != oppType: print 'You\'re playing more like a %s player' %newOppType print 'Loading %s weight vector' %newOppType oppType = newOppType mdp = util.pokerMDP(deck, oppType) QL= QLearningAlgorithm(mdp.actions, mdp.discount(), pokerFeatureExtractor, 0.2) if newOppType == 'TAG': QL.weights = loadWeight('w_tag_5k.txt') elif newOppType == 'LAG': QL.weights = loadWeight('w_lag_5k.txt') elif newOppType == 'TPA': QL.weights = loadWeight('w_tpa_5k.txt') else: #'LPA' QL.weights = loadWeight('w_lpa_5k.txt') deck.reset() agent = poker.Agent() #Easy way to reset agent, opp, table opp = poker.Opponent(oppType) table = poker.Table(mdp.deck) return stateHistory, utilityHistory
    # END_YOUR_CODE


############################################################
# Problem 4b: convergence of Q-learning

# Small test case
smallMDP = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=10, peekCost=1)

# Large test case
largeMDP = BlackjackMDP(cardValues=[1, 3, 5, 8, 10], multiplicity=3, threshold=40, peekCost=1)

vi = ValueIteration()
vi.solve(largeMDP)
ql = QLearningAlgorithm(largeMDP.actions, 1, blackjackFeatureExtractor, explorationProb=0.2)
util.simulate(largeMDP, ql, 30000, 1000, False, False)

ifPrint = False
c = 0.0
for state in largeMDP.states:
    QLpi = max((ql.getQ(state, action), action) for action in largeMDP.actions(state))[1]
    if vi.pi[state] != QLpi:
        c += 1
        ifPrint = True
        print state, 'VI: ', vi.pi[state], 'vs ', 'QL: ', QLpi
print c / len(largeMDP.states)
if not ifPrint:
    print 'All policies are same!'

############################################################
# Problem 4d: What happens when the MDP changes underneath you?!

# Original mdp
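# The trailing "Original mdp" comment suggests the two MDPs for problem 4d are defined next. In the
# standard assignment scaffold they are the small MDP and a copy with a higher threshold, roughly as
# below (the exact parameters here are an assumption):
originalMDP = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=10, peekCost=1)
newThresholdMDP = BlackjackMDP(cardValues=[1, 5], multiplicity=2, threshold=15, peekCost=1)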
def runQLearning():
    global model
    days = list(range(len(X)))
    print len(days)
    randomRewards = 0.0
    testRewards = 0.0

    # Test separately on 100-day periods
    period = 100
    numSets = len(days) / period
    testSets = [n * period for n in range(numSets)]
    for n in testSets:
        print 'Testing on days %d - %d:' % (n, n + period)
        # Test on [n, n + period] examples
        testDays = days[n:n + period]
        # Train on all remaining examples
        trainDays = [d for d in days if d not in testDays]

        # Make train & test MDPs
        trainMDP = GoldMDP(trainDays)
        trainMDP.computeStates()
        testMDP = GoldMDP(testDays)
        testMDP.computeStates()

        # Train linear prediction model on train set
        model = linear_model.LinearRegression()
        model.fit(X[trainDays], Y[trainDays])

        # Measure classification accuracy on test set
        Y_pred = model.predict(X[testDays])
        Y_actual = Y[testDays]
        correct = 0
        for i in range(len(Y_pred)):
            if Y_pred[i] * Y_actual[i] >= 0:
                correct = correct + 1
        print "Accuracy = %.2f" % (float(correct) / len(Y_pred))

        # Learn (reinforcement Q-learning) on trainMDP, choosing all random actions
        rl = QLearningAlgorithm(trainMDP.actions, trainMDP.discount(), predictFeatureExtractor, 1.0)
        rewards = util.simulate(trainMDP, rl, 1)
        print rl.weights
        print rewards
        randomRewards = randomRewards + rewards[0]

        # Run with trained RL algorithm on testMDP, choosing all max actions
        rl.explorationProb = 0.0
        rewards = util.simulate(testMDP, rl, 1)
        print rl.weights
        print rewards
        testRewards = testRewards + rewards[0]
        print "Average rewards per day =", rewards[0] / period

    # Output total profit & average daily profit
    print 'Random rewards = %.2f' % randomRewards
    print 'Test rewards = %.2f' % testRewards
    numDays = (len(days) / period) * period
    print 'Avg random = %.2f' % (randomRewards / numDays)
    print 'Avg test = %.2f' % (testRewards / numDays)