def run_games(learner, hist, iters=100, t_len=100):
    """Drive learning by letting the agent play `iters` games in sequence.

    Appends each game's score to `hist` and prints running statistics.
    """
    for epoch in range(iters):
        # Fresh game instance for this epoch.
        swing = SwingyMonkey(sound=False,
                             text="Epoch %d" % (epoch),
                             tick_length=t_len,
                             action_callback=learner.action_callback,
                             reward_callback=learner.reward_callback)

        # Play until the monkey crashes.
        while swing.game_loop():
            pass

        # Record the final score for this epoch.
        hist.append(swing.score)

        # Rolling average over at most the last 100 games (a slice shorter
        # than 100 is just the whole history, so the value is unchanged).
        avgscore = np.mean(hist[-100:])
        print("epoch:", epoch, "highest:", np.max(hist),
              "current score:", swing.score, "average:", avgscore)

        # Clear per-episode learner state before the next game.
        learner.reset()
    pg.quit()
    return
def run_games(learner, hist, iters=100, t_len=100):
    """Drive learning by letting the agent play a sequence of games.

    Mirrors the current game state onto the learner after every tick.
    """
    for epoch in range(iters):
        swing = SwingyMonkey(sound=False,
                             text="Epoch %d" % (epoch),
                             tick_length=t_len,
                             action_callback=learner.action_callback,
                             reward_callback=learner.reward_callback)
        # Play a full game; keep the learner's view of the state fresh.
        while swing.game_loop():
            learner.last_state = swing.get_state()
        hist.append(swing.score)  # score history
        learner.reset()           # clear per-episode learner state
    return
def run_games(learner, hist, iters=100, t_len=100):
    """Drive learning by letting the agent play a sequence of games.

    Prints the epoch index, best score so far, and mean score after
    each game.
    """
    for epoch in range(iters):
        swing = SwingyMonkey(
            sound=False,
            text="Epoch %d" % (epoch),
            tick_length=t_len,
            action_callback=learner.action_callback,
            reward_callback=learner.reward_callback)

        # Play out the whole game.
        while swing.game_loop():
            pass

        hist.append(swing.score)
        print(f'it: {epoch}')
        print(f'best score: {max(hist)}')
        print(f'average score: {sum(hist)/len(hist)}')

        # Clear per-episode learner state.
        learner.reset()
    pg.quit()
    return
def run_games(learner, hist, iters=100, t_len=100):
    """Drive learning; checkpoint hyper-parameters and score history
    to a timestamped pickle after every game."""
    run_id = int(time.time())
    for epoch in range(iters):
        swing = SwingyMonkey(sound=False,
                             text="Epoch %d" % (epoch),
                             tick_length=t_len,
                             action_callback=learner.action_callback,
                             reward_callback=learner.reward_callback)
        while swing.game_loop():
            pass

        hist.append(swing.score)
        print('epoch: ' + str(epoch) + ', score: ' + str(swing.score) +
              ', gravity: ' + str(swing.gravity) +
              ', running_avg: ' + str(np.average(hist[-10:])))

        # Checkpoint after every game so a crash loses nothing.
        results = {
            'gamma': learner.gamma,
            'eta': learner.eta,
            'epsilon_decay': learner.epsilon_decay,
            'hist': hist,
        }
        with open('results/results_approx_' + str(run_id) + '.p', 'wb') as f:
            pickle.dump(results, f)

        learner.reset()
    pg.quit()
    return
def run_games(learner, hist, iters=100, t_len=100):
    """Drive learning; print a running score table as the agent plays."""
    print("epoch", "\t", "score", "\t", "high", "\t", "avg")
    highscore, avgscore = 0.0, 0.0
    for epoch in range(iters):
        swing = SwingyMonkey(
            sound=False,
            text="Epoch %d" % (epoch),
            tick_length=t_len,
            action_callback=learner.action_callback,
            reward_callback=learner.reward_callback)

        while swing.game_loop():
            pass

        hist.append(swing.score)
        game_score = swing.score
        if game_score > highscore:
            highscore = game_score
        # Incremental mean: avg_n = (n * avg_{n-1} + x) / (n + 1).
        avgscore = (epoch * avgscore + game_score) / (epoch + 1)
        print(epoch, "\t", game_score, "\t", highscore, "\t", avgscore)

        learner.reset()
    return
def run_games(learner, hist, iters=100, t_len=100):
    """Drive learning: play a game, then train the learner on that game."""
    for epoch in range(iters):
        swing = SwingyMonkey(
            sound=False,
            text="Epoch %d" % (epoch),
            tick_length=t_len,
            action_callback=learner.action_callback,
            reward_callback=learner.reward_callback)

        while swing.game_loop():
            pass

        hist.append(swing.score)
        print("[Game #%d / Score: %d / " % (epoch, swing.score), end="")

        # Time the post-game training pass.
        start = time.time()
        learner.train(swing.score)
        elapsed = time.time() - start
        print("training time: %3.3f]" % float(elapsed))

        # Clears last_state/last_action/last_reward and game memory;
        # learned parameters persist across games.
        learner.reset()
    return
def evaluate(gamma=0.4, iters=100, chatter=True): learner = TDValueLearner() learner.gamma = gamma highscore = 0 avgscore = 0.0 for ii in xrange(iters): learner.epsilon = 1/(ii+1) # Make a new monkey object. swing = SwingyMonkey(sound=False, # Don't play sounds. text="Epoch %d" % (ii), # Display the epoch on screen. tick_length=1, # Make game ticks super fast. action_callback=learner.action_callback, reward_callback=learner.reward_callback) # Loop until you hit something. while swing.game_loop(): pass score = swing.get_state()['score'] highscore = max([highscore, score]) avgscore = (ii*avgscore+score)/(ii+1) if chatter: print ii, score, highscore, avgscore # Reset the state of the learner. learner.reset() return -avgscore
def run_games(learner, hist, iters=100, t_len=100): ''' Driver function to simulate learning by having the agent play a sequence of games. ''' for ii in range(iters): print "Epoch: %i |" % ii, # Make a new monkey object. swing = SwingyMonkey( sound=False, # Don't play sounds. text="Epoch %d" % (ii), # Display the epoch on screen. tick_length=t_len, # Make game ticks super fast. action_callback=learner.action_callback, reward_callback=learner.reward_callback) # Loop until you hit something. while swing.game_loop(): pass # Save score history. hist.append(swing.score) if learner.algo == "qlearn": q_filled = float(np.count_nonzero( learner.Q)) * 100 / learner.Q.size print 'score: %d |' % swing.score, 'Q: %s' % str(round( q_filled, 3)) + "%" else: print 'score %d' % swing.score # Reset the state of the learner. learner.reset() return
def run_games(learner, hist, iters=100, t_len=100):
    '''Drive learning by letting the agent play a sequence of games.

    Inside the game loop the learner's callbacks expose:
      * learner.last_action -- False=0/glide, True=1/jump
      * learner.last_state  -- the most recent game state
      * learner.last_reward -- 0; +1 on pass; -5 on hit; -10 off screen
    Gravity (an integer 1-4) can be inferred from the change in monkey
    velocity between ticks whenever the action was glide.
    '''
    for epoch in range(iters):
        swing = SwingyMonkey(sound=False,
                             text="Epoch %d" % (epoch),
                             tick_length=t_len,
                             action_callback=learner.action_callback,
                             reward_callback=learner.reward_callback)
        # Play a full game; SARSA bookkeeping would go in this loop.
        while swing.game_loop():
            pass
        hist.append(swing.score)
        learner.reset()
    return
def run_game():
    """Play one headless game (no video, no audio) and return the
    finished game object so the caller can inspect its final state."""
    swing = SwingyMonkey(visual=False,
                         sound=False,
                         action_callback=learner_class.action_callback,
                         reward_callback=learner_class.reward_callback)
    # Run until the monkey crashes.
    while swing.game_loop():
        pass
    return swing
def testgame(iters=100, show=True):
    """Train a QLearner2 for `iters` games, logging one CSV row per game
    to test_Q2.csv.

    Returns (average score, highest score, final score).
    """
    learner = QLearner2()
    highestscore = 0
    avgscore = 0
    learner.alpha = 0.2
    learner.gamma = 0.6
    alpha = learner.alpha
    gamma = learner.gamma

    # Write the header once; rows are appended per-epoch below.
    with open("test_Q2.csv", "w", newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(
            ["alpha", "gamma", "epoch", "highest", "average", "score", "q"])

    for ii in range(iters):
        # Decaying exploration rate.
        learner.epsilon = 1 / (ii + 1)
        swing = SwingyMonkey(
            sound=False,
            text="Epoch %d" % (ii),
            tick_length=1,
            action_callback=learner.action_callback,
            reward_callback=learner.reward_callback)

        while swing.game_loop():
            pass

        score = swing.get_state()['score']
        highestscore = max([highestscore, score])
        # Incremental running mean of all scores so far.
        avgscore = (ii * avgscore + score) / (ii + 1)
        # Percentage of the Q matrix that has been touched.
        q = round(float(np.count_nonzero(learner.Q)) * 100 / learner.Q.size, 3)
        if show == True:
            print("epoch:", ii, "highest:", highestscore,
                  "current score:", score, "average:", avgscore,
                  "% of Q mx filled:", q)
        # Append this epoch's row to the log.
        with open("test_Q2.csv", "a+", newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerows(
                [[alpha, gamma, ii, highestscore, avgscore, score, q]])

        learner.reset()
    pg.quit()
    return avgscore, highestscore, score
def testgame(iters=100, show=True): learner = QLearner2() highestscore = 0 avgscore = 0 record = {} record['epoch'] = [] record['highest'] = [] record['avg'] = [] record['score'] = [] record['q'] = [] for ii in range(iters): learner.epsilon = 1 / (ii + 1) # Make a new monkey object. swing = SwingyMonkey( sound=False, # Don't play sounds. text="Epoch %d" % (ii), # Display the epoch on screen. tick_length=1, # Make game ticks super fast. action_callback=learner.action_callback, reward_callback=learner.reward_callback) # Loop until you hit something. while swing.game_loop(): pass score = swing.get_state()['score'] highestscore = max([highestscore, score]) avgscore = (ii * avgscore + score) / (ii + 1) q = round(float(np.count_nonzero(learner.Q)) * 100 / learner.Q.size, 3) if show == True: print "epoch:", ii, "highest:", highestscore, "current score:", score, "average:", avgscore, "% of Q mx filled:", q record['epoch'].append(ii) record['highest'].append(highestscore) record['avg'].append(avgscore) record['score'].append(score) record['q'].append(q) pickle.dump(record, open("record12.p", "wb")) # Reset the state of the learner. learner.reset() return avgscore, highestscore, score
def testgame(iters=100,show=True): learner = QLearner2() highestscore = 0 avgscore = 0 record={} record['epoch']=[] record['highest']=[] record['avg']=[] record['score']=[] record['q']=[] for ii in range(iters): learner.epsilon = 1/(ii+1) # Make a new monkey object. swing = SwingyMonkey(sound=False, # Don't play sounds. text="Epoch %d" % (ii), # Display the epoch on screen. tick_length=1, # Make game ticks super fast. action_callback=learner.action_callback, reward_callback=learner.reward_callback) # Loop until you hit something. while swing.game_loop(): pass score = swing.get_state()['score'] highestscore = max([highestscore, score]) avgscore = (ii*avgscore+score)/(ii+1) q=round(float(np.count_nonzero(learner.Q))*100/learner.Q.size,3) if show==True: print "epoch:",ii, "highest:", highestscore, "current score:", score, "average:", avgscore, "% of Q mx filled:", q record['epoch'].append(ii) record['highest'].append(highestscore) record['avg'].append(avgscore) record['score'].append(score) record['q'].append(q) pickle.dump( record, open( "record12.p", "wb" ) ) # Reset the state of the learner. learner.reset() return avgscore,highestscore,score
def run_games(learner, hist, iters=100, t_len=100):
    '''Drive learning, recording per-tick (state, action, reward) traces.

    `hist` is a dict with 'state', 'action' and 'reward' sub-dicts keyed
    by epoch, plus a flat 'score' list.

    During play the learner exposes:
      * learner.last_action -- False=0/glide, True=1/jump
      * learner.last_reward -- 0; +1 on pass; -5 on hit; -10 off screen
    Gravity (an integer 1-4) can be inferred from tick-to-tick velocity
    whenever the action was glide.
    '''
    for epoch in range(iters):
        swing = SwingyMonkey()

        # One trace list per epoch.
        hist['state'][epoch] = []
        hist['action'][epoch] = []
        hist['reward'][epoch] = []

        while swing.game_loop():
            hist['state'][epoch].append(learner.last_state)
            hist['action'][epoch].append(learner.last_action)
            hist['reward'][epoch].append(learner.last_reward)
        # The loop has no break, so this post-loop code is equivalent to
        # the original while/else: record the final transition to see how
        # the monkey failed.
        hist['state'][epoch].append(learner.last_state)
        hist['action'][epoch].append(learner.last_action)
        hist['reward'][epoch].append(learner.last_reward)

        hist['score'].append(swing.score)
        learner.reset()
    return
def run_games(learner, hist, iters = 100, t_len = 100): ''' Driver function to simulate learning by having the agent play a sequence of games. ''' if iters < 20: print "I can't learn that fast! Try more iterations." # DATA-GATHERING PHASE for ii in range(30): # Make a new monkey object. swing = SwingyMonkey(sound=False, # Don't play sounds. text="Epoch %d" % (ii), # Display the epoch on screen. tick_length = t_len, # Make game ticks super fast. action_callback=learner.explore_action_callback, reward_callback=learner.reward_callback) # Loop until you hit something. while swing.game_loop(): pass # Save score history. hist.append(swing.score) # Reset the state of the learner. learner.reset() # EXPLOITATION PHASE for ii in range(iters)[30:]: # Make a new monkey object. swing = SwingyMonkey(sound=False, # Don't play sounds. text="Epoch %d" % (ii), # Display the epoch on screen. tick_length = t_len, # Make game ticks super fast. action_callback=learner.action_callback, reward_callback=learner.reward_callback) # Loop until you hit something. while swing.game_loop(): pass # Save score history. hist.append(swing.score) # Reset the state of the learner. learner.reset() return
def run_games(learner, hist, iters=100, t_len=100): ''' Driver function to simulate learning by having the agent play a sequence of games. ''' for ii in range(iters): # make a new monkey object swing = SwingyMonkey(sound=False, # don't play sounds text="Epoch %d" % (ii), # display the epoch on screen tick_length = t_len, # make game ticks super fast action_callback=learner.action_callback, reward_callback=learner.reward_callback) # pass the screen dimensions to the agent learner.update_specs(swing.screen_height, swing.screen_width) # loop until you hit something while swing.game_loop(): pass # update transition to terminal state learner.update_terminal_transition() # save score history hist.append(swing.score) print 'Epoch %i: current score %i; best score %i' % (ii, swing.score, np.max(hist)) # reset the state of the learner learner.reset() # display score history and stats print '----------' print 'Parameters: %0.2f alpha; %0.2f gamma; %0.2f epsilon' % (learner.alpha, learner.gamma, learner.epsilon) print 'Score history:', hist print 'Best score:', np.max(hist) print 'Average score:', np.mean(hist) print '----------' return np.max(hist)
def run_games(learner, hist, iters = 100, t_len = 100): ''' Driver function to simulate learning by having the agent play a sequence of games. ''' high = 0 avg = 0 for ii in range(iters): # Make a new monkey object. swing = SwingyMonkey(sound=False, # Don't play sounds. text="Epoch %d" % (ii), # Display the epoch on screen. tick_length = t_len, # Make game ticks super fast. action_callback=learner.action_callback, reward_callback=learner.reward_callback) # Loop until you hit something. while swing.game_loop(): pass new_score = swing.score # Save score history. if new_score > high: high = new_score avg = (new_score + ii*avg)/(ii+1.0) print "%i\t%i\t%i\t%s:\t%s"%(ii,new_score,high,avg,np.mean(learner.Q)) hist.append(swing.score) # Reset the state of the learner. learner.reset() print learner.Q print learner.state_counts return
return self.last_action def reward_callback(self, reward): self.last_reward = reward iters = 100 nvars = 3 nstates = 10 alpha = 0.2 gamma = 0.9 epsil = 0.1 learner = Learner(nvars, nstates, alpha, gamma, epsil) for ii in xrange(iters): # Make a new monkey object. swing = SwingyMonkey(sound=False, # Don't play sounds. text="Epoch %d" % (ii), # Display the epoch on screen. tick_length=1, # Make game ticks super fast. action_callback=learner.action_callback, reward_callback=learner.reward_callback) # Loop until you hit something. while swing.game_loop(): print swing.get_state() pass # Reset the state of the learner. learner.reset()
def run_games(learner, hist, policy="random", eps=0.9, gam=0.5, alph=0.75,
              iters=20, t_len=100):
    """Drive learning by having the agent play a sequence of games.

    Collects per-tick (state, action, reward) traces and per-game scores
    into `hist` under 'state_history', 'action_history', 'reward_history'
    and 'score_history'.  With policy="random" the learner's random
    action callback is used; otherwise its learned policy is used and
    learner.fitted is set.
    """
    # Push hyper-parameters into the learner.
    learner.eps = eps
    learner.gam = gam
    learner.alph = alph
    learner.num_actions = 2

    # Accumulators across all epochs.
    total_states = []
    total_actions = []
    total_rewards = []
    total_scores = []

    for epoch in range(iters):
        if policy == "random":
            swing = SwingyMonkey(
                sound=False,
                text="Random Epoch %d" % (epoch),
                tick_length=t_len,
                action_callback=learner.random_actions,
                reward_callback=learner.reward_callback,
            )
        else:
            swing = SwingyMonkey(
                sound=False,
                text="Learned Epoch %d" % (epoch),
                tick_length=t_len,
                action_callback=learner.action_callback,
                reward_callback=learner.reward_callback,
            )
            learner.fitted = True

        # Per-epoch trace.
        states = []
        actions = []
        rewards = []
        tick = 0

        while swing.game_loop():
            states.append(learner.create_state_tuple(learner.last_state))
            actions.append(int(learner.last_action == True))
            rewards.append(learner.last_reward)
            # Once two ticks are recorded, gravity can be inferred and
            # back-filled into every recorded state.
            if learner.learn_g & (tick > 1):
                learner.infer_g(states, actions)
                for idx in range(len(states)):
                    states[idx][-1] = learner.gravity
            tick += 1
        else:
            # Capture the final transition to see how the monkey failed.
            states.append(learner.create_state_tuple(learner.last_state))
            actions.append(int(learner.last_action == True))
            rewards.append(learner.last_reward)

        # Fold this epoch into the cumulative arrays.
        total_scores.append(swing.score)
        total_states += states
        total_actions += actions
        total_rewards += rewards

        # Reset the learner's per-episode state.
        learner.reset()

    hist["state_history"] = hist["state_history"] + total_states
    hist["action_history"] += total_actions
    hist["reward_history"] += total_rewards
    hist["score_history"] += total_scores
    return
return new_action def reward_callback(self, reward): '''This gets called so you can see what reward you get.''' self.last_reward = reward iters = 10000 learner = Learner() for ii in xrange(iters): # Make a new monkey object. swing = SwingyMonkey(sound=False, # Don't play sounds. tick_length=1, # Make game ticks super fast. # Display the epoch on screen and % of Q matrix filled text="Epoch %d " % (ii) + str(round(float(np.count_nonzero(learner.Q))*100/learner.Q.size,3)) + "%", action_callback=learner.action_callback, reward_callback=learner.reward_callback) # Loop until you hit something. while swing.game_loop(): pass # Keep track of the score for that epoch. learner.scores.append(learner.last_state['score']) if learner.last_state['score'] > learner.best_score: print 'New best Q' learner.best_score = learner.last_state['score'] learner.bestQ = learner.Q.copy() print 'score %d' % learner.last_state['score'], str(round(float(np.count_nonzero(learner.Q))*100/learner.Q.size,3)) + "%"
def reward_callback(self, reward):
    '''This gets called so you can see what reward you get.'''
    self.last_reward = reward


iters = 150
learner = Learner()
scores = []

for epoch in xrange(iters):
    # Make a new monkey object.
    swing = SwingyMonkey(sound=False,
                         text="Epoch %d" % (epoch),
                         tick_length=1,
                         action_callback=learner.action_callback,
                         reward_callback=learner.reward_callback)
    # Play until the monkey crashes.
    while swing.game_loop():
        pass
    scores.append(swing.get_score())
    # Clear per-episode learner state.
    learner.reset()

# Plot the learning curve over all epochs.
domain = np.arange(1, iters + 1, 1)
plt.plot(domain, scores)
plt.title("Scores over each Epoch (discount = " + str(learner.discount) + ")")
def run_games(learner, hist, eps=0.5, gam=0.5, alph=0.75, iters=20, t_len=100,
              test=False):
    '''Drive fitted-Q learning: play games, collect (state, action, reward)
    traces, and refit an ExtraTrees Q-estimator after each epoch.  When
    `test` is True the current estimator is used as-is (no refitting).
    Collected histories are placed into `hist` for the caller to save.
    '''
    # Push hyper-parameters into the learner.
    learner.eps = eps
    learner.gam = gam
    learner.alph = alph
    learner.num_actions = 2

    # Training data accumulated across every epoch so far.
    total_states = []
    total_actions = []
    total_rewards = []
    total_scores = []

    for epoch in range(iters):
        swing = SwingyMonkey(sound=False,
                             text="Epoch %d" % (epoch),
                             tick_length=t_len,
                             action_callback=learner.action_callback,
                             reward_callback=learner.reward_callback)

        # Per-epoch trace.
        states = []
        actions = []
        rewards = []
        tick = 0

        while swing.game_loop():
            states.append(learner.create_state_tuple(learner.last_state))
            actions.append(int(learner.last_action == True))
            rewards.append(learner.last_reward)
            # After two ticks gravity can be inferred and back-filled.
            if learner.learn_g & (tick > 1):
                learner.infer_g(states, actions)
                for idx in range(len(states)):
                    states[idx][-1] = learner.gravity
            tick += 1
        else:
            # Capture the final transition to see how the monkey failed.
            states.append(learner.create_state_tuple(learner.last_state))
            actions.append(int(learner.last_action == True))
            rewards.append(learner.last_reward)

        # Fold this epoch into the cumulative training arrays.
        total_scores.append(swing.score)
        total_states += states
        total_actions += actions
        total_rewards += rewards

        if not test:
            if epoch == 0:
                # First epoch: fit directly on observed rewards.
                X_train = np.array([np.append(total_states[kk], total_actions[kk])
                                    for kk in range(len(total_states))])
                y_train = np.array(total_rewards)
                extraTrees = ExtraTreesRegressor(n_estimators=50)
                extraTrees.fit(X_train, y_train)
            else:
                # Later epochs: fitted-Q iteration a la Ernst '06 --
                # regression targets are Bellman backups through the
                # current estimator.
                # NOTE(review): `agent` below is presumably a module-level
                # alias for this learner -- confirm it is defined where
                # this function runs.
                X_train = np.array([np.append(total_states[kk], total_actions[kk])
                                    for kk in range(len(total_rewards) - 1)])
                y_train = np.array([
                    agent.estimator.predict(np.append(total_states[kk], total_actions[kk]))
                    + agent.alph * (total_rewards[kk]
                                    + (agent.gam * np.max([agent.estimator.predict(np.append(total_states[kk + 1], act))
                                                           for act in range(agent.num_actions)]))
                                    - agent.estimator.predict(np.append(total_states[kk], total_actions[kk])))
                    for kk in range(len(total_states) - 1)])
                extraTrees = ExtraTreesRegressor(n_estimators=50)
                extraTrees.fit(X_train, y_train)
            # As the policy improves, explore less.
            if epoch % 10 == 0:
                learner.eps += 0.05
            learner.estimator = extraTrees
            learner.fitted = True
        else:
            learner.fitted = True

        # Reset the learner's per-episode state.
        learner.reset()

    # Hand the collected histories back to the caller.
    hist['state_history'] = total_states
    hist['action_history'] = total_actions
    hist['reward_history'] = total_rewards
    hist['score_history'] = total_scores
    return
'''This gets called so you can see what reward you get.''' self.last_reward = reward iters = 10000 learner = Learner() scorelist=[] for ii in xrange(iters): learning_rate=(learning_rate_start+.5)/(iters/100) # Make a new monkey object. swing = SwingyMonkey(sound=False, # Don't play sounds. text="Epoch %d" % (ii), # Display the epoch on screen. tick_length=1, # Make game ticks super fast. action_callback=learner.action_callback, reward_callback=learner.reward_callback) # Loop until you hit something. while swing.game_loop(): pass #store all values for mins and max calcs -- only need to run once to get values for the find_state_bounds function which saves these values scorelist.append(swing.get_state()['score']) #print swing.get_state() # Reset the state of the learner. learner.reset() #calculate avg score for this approach
score_cur = 0
ii = 0

# Optionally resume from previously saved matrices:
# learner.Q = np.load("Qmat_manual.npy")
# learner.learnTime = np.load("Lmat_manual.npy")

# Train until the episode cap is reached (a score target such as
# `while score_cur < 5000` would work as well).
while ii < 1e5:
    ii += 1
    # Make a new monkey object.
    swing = SwingyMonkey(
        sound=False,
        text="Epoch %d" % (ii),
        tick_length=0,
        action_callback=learner.action_callback,
        reward_callback=learner.reward_callback,
    )
    # Play until the monkey crashes.
    while swing.game_loop():
        pass
    # Collect per-episode diagnostics.
    reward.append(learner.last_reward)
    score_cur = swing.get_state()["score"]
    veloc_cur = swing.get_state()["monkey"]["vel"]
    result_cur = learner.result_callback()
    qnorm = np.linalg.norm(learner.Q)
    score.append(score_cur)
    state_grid.append(learner.state_grid)
    state_num.append(learner.state_num)
# formal learning step iters = 10000 learner = Learner() reward = [] score = [] score_cur = 0 ii = 0 #for ii in xrange(iters): while score_cur < 100: ii += 1 # Make a new monkey object. swing = SwingyMonkey(sound=False, # Don't play sounds. text="Epoch %d" % (ii), # Display the epoch on screen. tick_length=0, # Make game ticks super fast. action_callback=learner.action_callback, reward_callback=learner.reward_callback) # Loop until you hit something. while swing.game_loop(): pass reward.append(learner.last_reward) score_cur = swing.get_state()["score"] score.append(swing.get_state()["score"]) print "################### Score = " + \ str(swing.get_state()["score"]) + " ########################" # Reset the state of the learner. learner.reset()
def run_games(learner, hist, iters=100, t_len=100):
    '''Drive fitted-Q learning with an ExtraTrees model that is refit
    after every game; appends each game's score to `hist`.'''
    # Training data accumulated across all games.
    net_states = []
    net_rewards = []
    net_actions = []

    for epoch in range(iters):
        swing = SwingyMonkey(
            sound=False,
            text="Epoch %d" % (epoch),
            tick_length=t_len,
            action_callback=learner.action_callback,
            reward_callback=learner.reward_callback)

        # Per-game trace.
        iter_states = []
        iter_rewards = []
        iter_actions = []
        tick = 0

        while swing.game_loop():
            state = swing.get_state()
            iter_states.append(np.array(state['tree'].values() +
                                        state['monkey'].values() +
                                        [learner.gravity]))
            iter_rewards.append(learner.last_reward)
            iter_actions.append(int(learner.last_action))
            tick += 1
            # Once two ticks exist, gravity can be inferred and then
            # back-filled into every recorded state.
            if tick > 1 and learner.know_gravity == False:
                learner.learn_gravity(iter_states, iter_actions)
            if learner.know_gravity == True:
                for num in range(len(iter_states)):
                    iter_states[num][-1] = learner.gravity

        # Capture the terminal state as well.
        state = swing.get_state()
        iter_states.append(state['tree'].values() +
                           state['monkey'].values() +
                           [learner.gravity])
        iter_rewards.append(learner.last_reward)
        iter_actions.append(int(learner.last_action))

        # Fold this game into the cumulative training set.
        net_states += iter_states
        net_rewards += iter_rewards
        net_actions += iter_actions

        if epoch == 0:
            # First game: fit directly on observed rewards.
            xtrain = build_training_set(net_states, net_actions)
            ytrain = np.array(net_rewards)
            RF = ExtraTreesRegressor(n_estimators=50)
            RF.fit(xtrain, ytrain)
        else:
            # Fitted-Q iteration: targets are Bellman backups through the
            # current model.
            xtrain = build_training_set(net_states[:-1], net_actions[:-1])
            ytrain = np.array([
                learner.model.predict(np.append(net_states[k], net_actions[k]))
                + learner.alpha * (net_rewards[k]
                                   + learner.gamma * np.max([learner.model.predict(np.append(net_states[k + 1], int(action)))
                                                             for action in learner.actions])
                                   - learner.model.predict(np.append(net_states[k], net_actions[k])))
                for k in range(len(net_states) - 1)])
            RF = ExtraTreesRegressor(n_estimators=50)
            RF.fit(xtrain, ytrain)

        learner.model = RF
        learner.model_trained = True
        # Explore less as the policy improves.
        if epoch % 10 == 0:
            learner.epsilon -= 0.05

        # Save score history.
        hist.append(swing.score)
        # Reset the learner's per-episode state.
        learner.reset()
    return