def run_games(learner, hist, iters=100, t_len=100):
    '''
    Driver function to simulate learning by having the agent play a sequence of games.
    '''
    print("epoch", "\t", "score", "\t", "high", "\t", "avg")
    highscore, avgscore = 0.0, 0.0
    for ii in range(iters):
        # Make a new monkey object.
        swing = SwingyMonkey(sound=False,             # Don't play sounds.
                             text="Epoch %d" % (ii),  # Display the epoch on screen.
                             tick_length=t_len,       # Make game ticks super fast.
                             action_callback=learner.action_callback,
                             reward_callback=learner.reward_callback)

        # Loop until you hit something.
        while swing.game_loop():
            pass

        # Save score history.
        hist.append(swing.score)
        score = swing.score
        highscore = max([highscore, score])
        avgscore = (ii * avgscore + score) / (ii + 1)
        print(ii, "\t", score, "\t", highscore, "\t", avgscore)

        # Reset the state of the learner.
        learner.reset()

    return
def run_games(learner, iters=100, t_len=100):
    '''
    Driver function to simulate learning by having the agent play a sequence of games.
    '''
    # Initialize the results dataframe.
    df = pd.DataFrame(columns=["gravity", "score", "death"])

    # Run iters games.
    for ii in range(iters):
        # Make a new monkey object.
        swing = SwingyMonkey(sound=False,             # Don't play sounds.
                             text="Epoch %d" % (ii),  # Display the epoch on screen.
                             tick_length=t_len,       # Make game ticks super fast.
                             action_callback=learner.action_callback,
                             reward_callback=learner.reward_callback)
        learner.swing = swing

        # Loop until you hit something.
        while swing.game_loop():
            pass

        # Save score history.
        df.loc[len(df)] = [swing.gravity, swing.score, swing.death]

        # Reset the state of the learner.
        learner.reset()

    return df
def run_games(learner, hist, iters=100, t_len=100):
    '''
    Driver function to simulate learning by having the agent play a sequence of games.
    '''
    for ii in range(iters):
        # Make a new monkey object.
        swing = SwingyMonkey(sound=False,             # Don't play sounds.
                             text="Epoch %d" % (ii),  # Display the epoch on screen.
                             tick_length=t_len,       # Make game ticks super fast.
                             action_callback=learner.action_callback,
                             reward_callback=learner.reward_callback)

        # Loop until you hit something.
        while swing.game_loop():
            # This is where we build SARSA arrays utilizing learner.method().
            # You can get the action via learner.last_action (False=0/glide, True=1/jump).
            # You can get the state via learner.last_state.
            # You can get the reward via learner.last_reward (0; +1 if pass; -5 if hit; -10 if fall off screen).
            # Gravity can be inferred by checking the monkey's velocity from time step
            # to time step whenever the action is False; gravity is an integer 1, 2, 3, or 4.
            pass

        # Save score history.
        hist.append(swing.score)

        # Reset the state of the learner.
        learner.reset()

    return
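# A minimal sketch of the gravity-inference idea described in the comments above,
# assuming (as in the standard SwingyMonkey state dict) that the monkey's velocity
# is exposed as state['monkey']['vel']. When the agent glides (action False)
# between two consecutive ticks, the drop in velocity equals the per-tick gravity,
# an integer in {1, 2, 3, 4}. The helper name infer_gravity is hypothetical, not
# part of the original code.
def infer_gravity(prev_state, curr_state, last_action):
    """Return the inferred gravity, or None if this tick is uninformative."""
    if last_action:  # A jump overwrites the velocity, so we learn nothing.
        return None
    drop = prev_state['monkey']['vel'] - curr_state['monkey']['vel']
    return drop if drop in (1, 2, 3, 4) else None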
def run_games(learner, hist, iters=100, t_len=100):
    '''
    Driver function to simulate learning by having the agent play a sequence of games.
    '''
    for ii in range(iters):
        # Make a new monkey object.
        swing = SwingyMonkey(sound=False,             # Don't play sounds.
                             text="Epoch %d" % (ii),  # Display the epoch on screen.
                             tick_length=t_len,       # Make game ticks super fast.
                             action_callback=learner.action_callback,
                             reward_callback=learner.reward_callback)

        # Loop until you hit something.
        while swing.game_loop():
            pass

        # Save score history.
        hist.append(swing.score)
        print("[Game #%d / Score: %d / " % (ii, swing.score), end="")

        # Train the learner on the last game and time how long training takes.
        tic = time.time()
        learner.train(swing.score)
        toc = time.time()
        print("training time: %3.3f]" % float(toc - tic))

        # Reset last_state, last_action, last_reward, and the game memory of the
        # learner (learned parameters are retained).
        learner.reset()

    return
def run_games(learner, hist, iters=100, t_len=1):
    '''
    Driver function to simulate learning by having the agent play a sequence of games.
    '''
    for ii in range(iters):
        # Make a new monkey object.
        swing = SwingyMonkey(sound=False,             # Don't play sounds.
                             text="Epoch %d" % (ii),  # Display the epoch on screen.
                             tick_length=t_len,       # Make game ticks super fast.
                             action_callback=learner.action_callback,
                             reward_callback=learner.reward_callback)

        # Loop until you hit something.
        while swing.game_loop():
            pass

        # Save score history.
        hist.append(swing.score)

        # Reset the state of the learner.
        learner.reset()

    return
def evaluate(gamma=0.4, iters=100, chatter=True):
    learner = TDValueLearner()
    learner.gamma = gamma

    highscore = 0
    avgscore = 0.0
    for ii in range(iters):
        # Decay exploration as training progresses.
        learner.epsilon = 1 / (ii + 1)

        # Make a new monkey object.
        swing = SwingyMonkey(sound=False,             # Don't play sounds.
                             text="Epoch %d" % (ii),  # Display the epoch on screen.
                             tick_length=1,           # Make game ticks super fast.
                             action_callback=learner.action_callback,
                             reward_callback=learner.reward_callback)

        # Loop until you hit something.
        while swing.game_loop():
            pass

        score = swing.get_state()['score']
        highscore = max([highscore, score])
        avgscore = (ii * avgscore + score) / (ii + 1)
        if chatter:
            print(ii, score, highscore, avgscore)

        # Reset the state of the learner.
        learner.reset()

    return -avgscore
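# Because evaluate() returns the negated average score, it can be handed directly
# to any minimizer. Below is a minimal sketch of a coarse grid search over gamma;
# the grid values and the helper name tune_gamma are illustrative assumptions, not
# part of the original code.
def tune_gamma(gammas=(0.1, 0.2, 0.4, 0.6, 0.8), iters=100):
    """Return the gamma with the best (highest) average score over the grid."""
    results = {g: evaluate(gamma=g, iters=iters, chatter=False) for g in gammas}
    best = min(results, key=results.get)  # smallest negated score = best score
    return best, -results[best]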
def run_games(learner, hist, iters=100, t_len=100):
    '''
    Driver function to simulate learning by having the agent play a sequence of games.
    '''
    for ii in range(iters):
        # Make a new monkey object.
        swing = SwingyMonkey(sound=False,             # Don't play sounds.
                             text="Epoch %d" % (ii),  # Display the epoch on screen.
                             tick_length=t_len,       # Make game ticks super fast.
                             action_callback=learner.action_callback,
                             reward_callback=learner.reward_callback)

        # Loop until you hit something, keeping the learner's view of the state fresh.
        while swing.game_loop():
            learner.last_state = swing.get_state()

        # Save score history.
        hist.append(swing.score)

        # Reset the state of the learner.
        learner.reset()

    return
def run_games(learner, hist, iters=100, t_len=100):
    '''
    Driver function to simulate learning by having the agent play a sequence of games.
    '''
    for ii in range(iters):
        # Make a new monkey object.
        swing = SwingyMonkey(sound=False,             # Don't play sounds.
                             text="Epoch %d" % (ii),  # Display the epoch on screen.
                             tick_length=t_len,       # Make game ticks super fast.
                             action_callback=learner.action_callback,
                             reward_callback=learner.reward_callback)

        # Loop until you hit something.
        while swing.game_loop():
            pass

        # Save score history.
        hist.append(swing.score)

        # Report a running average over the last 100 games.
        if len(hist) < 100:
            avgscore = np.mean(hist)
        else:
            avgscore = np.mean(hist[-100:])
        print("epoch:", ii, "highest:", np.max(hist), "current score:", swing.score,
              "average:", avgscore)

        # Reset the state of the learner.
        learner.reset()

    pg.quit()
    return
def run_games(learner, hist, iters=100, t_len=100):
    '''
    Driver function to simulate learning by having the agent play a sequence of games.
    '''
    for ii in range(iters):
        print("Epoch: %i |" % ii, end=" ")

        # Make a new monkey object.
        swing = SwingyMonkey(sound=False,             # Don't play sounds.
                             text="Epoch %d" % (ii),  # Display the epoch on screen.
                             tick_length=t_len,       # Make game ticks super fast.
                             action_callback=learner.action_callback,
                             reward_callback=learner.reward_callback)

        # Loop until you hit something.
        while swing.game_loop():
            pass

        # Save score history.
        hist.append(swing.score)

        if learner.algo == "qlearn":
            # Report what fraction of the Q table has been filled in.
            q_filled = float(np.count_nonzero(learner.Q)) * 100 / learner.Q.size
            print('score: %d |' % swing.score, 'Q: %s' % str(round(q_filled, 3)) + "%")
        else:
            print('score %d' % swing.score)

        # Reset the state of the learner.
        learner.reset()

    return
def run_games(learner, hist, iters=100, t_len=100):
    '''
    Driver function to simulate learning by having the agent play a sequence of games.
    '''
    for ii in range(iters):
        # Make a new monkey object.
        swing = SwingyMonkey(sound=False,             # Don't play sounds.
                             text="Epoch %d" % (ii),  # Display the epoch on screen.
                             tick_length=t_len,       # Make game ticks super fast.
                             action_callback=learner.action_callback,
                             reward_callback=learner.reward_callback)

        # Loop until you hit something.
        # print('Epoch Gravity:', swing.gravity)
        while swing.game_loop():
            pass

        # Save score history.
        hist.append(swing.score)
        print(f'it: {ii}')
        print(f'best score: {max(hist)}')
        print(f'average score: {sum(hist) / len(hist)}')

        # Reset the state of the learner.
        learner.reset()

    pg.quit()
    return
def run_games(learner, hist, iters=1000, t_len=100):
    '''
    Driver function to simulate learning by having the agent play a sequence of games.
    '''
    counter = 0
    while counter < iters:
        try:
            # Make a new monkey object.
            swing = SwingyMonkey(sound=False,  # Don't play sounds.
                                 # Display the epoch and best score on screen.
                                 text=f"Epoch {counter}: {learner.best_score}",
                                 tick_length=t_len,  # Make game ticks super fast.
                                 action_callback=learner.action_callback,
                                 reward_callback=learner.reward_callback)

            # Loop until you hit something.
            while swing.game_loop():
                pass

            # Save score history.
            if swing.score > learner.best_score:
                learner.best_score = swing.score
            hist.append(swing.score)

            # Reset the state of the learner.
            learner.reset()
            counter += 1
        except Exception:
            # Skip a crashed game and retry; don't swallow KeyboardInterrupt.
            pass

    return
def run_games(learner, hist, iters=100, t_len=100):
    '''
    Driver function to simulate learning by having the agent play a sequence of games.
    '''
    timestamp = int(time.time())
    for ii in range(iters):
        # Make a new monkey object.
        swing = SwingyMonkey(sound=False,             # Don't play sounds.
                             text="Epoch %d" % (ii),  # Display the epoch on screen.
                             tick_length=t_len,       # Make game ticks super fast.
                             action_callback=learner.action_callback,
                             reward_callback=learner.reward_callback)

        # Loop until you hit something.
        while swing.game_loop():
            pass

        # Save score history.
        hist.append(swing.score)
        print('epoch: ' + str(ii) + ', score: ' + str(swing.score) +
              ', gravity: ' + str(swing.gravity) +
              ', running_avg: ' + str(np.average(hist[-10:])))

        # Checkpoint the hyperparameters and score history after every game.
        results = {'gamma': learner.gamma,
                   'eta': learner.eta,
                   'epsilon_decay': learner.epsilon_decay,
                   'hist': hist}
        with open('results/results_approx_' + str(timestamp) + '.p', 'wb') as f:
            pickle.dump(results, f)

        # Reset the state of the learner.
        learner.reset()

    pg.quit()
    return
def run_games(learner, iters=100, t_len=100):
    '''
    Driver function to simulate learning by having the agent play a sequence of games.
    '''
    scores = []
    scores1 = []
    scores4 = []
    for ii in range(iters):
        # Make a new monkey object.
        swing = SwingyMonkey(sound=False,             # Don't play sounds.
                             text="Epoch %d" % (ii),  # Display the epoch on screen.
                             tick_length=t_len,       # Make game ticks super fast.
                             action_callback=learner.action_callback,
                             reward_callback=learner.reward_callback)

        # Loop until you hit something.
        while swing.game_loop():
            pass

        # Save score history, bucketed by the gravity of the game.
        scores.append(swing.score)
        if swing.gravity == 1:
            scores1.append(swing.score)
        elif swing.gravity == 4:
            scores4.append(swing.score)

        # Reset the state of the learner.
        learner.reset()

    # Plot the game scores over time, and save to a png.
    plt.plot(range(iters), scores)
    plt.title('Scores')
    plt.get_current_fig_manager().window.showMaximized()
    plt.savefig('scores.png')
    plt.show()
    plt.close()

    # Compute and plot a moving average of the score.
    window = 50
    ma = np.convolve(scores, np.ones(window) / window, mode='valid')
    plt.plot(np.arange(len(ma)) + window, ma)
    plt.title('50-Game Moving Average Score')
    plt.get_current_fig_manager().window.showMaximized()
    plt.savefig('scores_ma.png')
    plt.show()
    plt.close()

    print('When gravity=1: %d games, with an average score of %.3f' %
          (len(scores1), np.mean(scores1)))
    print('When gravity=4: %d games, with an average score of %.3f' %
          (len(scores4), np.mean(scores4)))
    print('For all games: %d games, with an average score of %.3f' %
          (len(scores), np.mean(scores)))

    # Store the scores in a pickle file (pickle requires binary mode).
    pickle.dump((scores, scores1, scores4), open('scores.p', 'wb'))
    return
def run_games(file, learner, hist, iters=100, t_len=10):
    '''
    Driver function to simulate learning by having the agent play a sequence of games.
    '''
    f = open(file + '.txt', 'w')
    f.write('Training History\n')
    for ii in range(iters):
        # Make a new monkey object.
        swing = SwingyMonkey(sound=False,             # Don't play sounds.
                             text="Epoch %d" % (ii),  # Display the epoch on screen.
                             tick_length=t_len,       # Make game ticks super fast.
                             action_callback=learner.action_callback,
                             reward_callback=learner.reward_callback)

        # Loop until you hit something.
        while swing.game_loop():
            pass

        # Save score history.
        hist.append(swing.score)

        # Checkpoint every `mod` epochs (less often early in training).
        if learner.epoch <= 200:
            mod = 50
        else:
            mod = 10
        if learner.epoch % mod == 0:
            curr_best = '\nEpoch {} - Current Best Score: {}\n'.format(
                learner.epoch, np.max(hist))
            print(curr_best)
            f.write(curr_best)
            with open(file + '.pickle', 'wb') as outputfile:
                pickle.dump(learner.Q, outputfile)
            np.save(file, np.array(hist))

        if DECREASING_EPSILON:
            learner.epsilon *= 0.99

        # Reset the state of the learner.
        learner.reset()

    best_score = 'Best Score: {}'.format(np.max(hist))
    print(best_score)
    f.write(best_score + '\n')
    avg_score = 'Average Score: {}'.format(np.mean(hist))
    print(avg_score)
    f.write(avg_score + '\n')
    f.close()
    pg.quit()
    return
def run_game():
    # Make a new monkey object.
    swing = SwingyMonkey(visual=False,  # No video.
                         sound=False,   # No audio.
                         action_callback=learner_class.action_callback,
                         reward_callback=learner_class.reward_callback)

    # Loop until you hit something.
    while swing.game_loop():
        pass

    return swing
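# A minimal usage sketch for the headless run_game() above. It assumes a learner
# instance has already been bound to the module-level name learner_class that
# run_game() closes over; the loop below is illustrative, not original code.
if __name__ == '__main__':
    scores = []
    for _ in range(100):
        swing = run_game()         # Play one game with no video or audio.
        scores.append(swing.score)
        learner_class.reset()      # Clear per-game state between episodes.
    print("mean score over %d games: %.2f" % (len(scores), sum(scores) / len(scores)))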
def run_games(learner, hist, iters=100, t_len=100):
    '''
    Driver function to simulate learning by having the agent play a sequence of games.
    '''
    if iters < 20:
        print("I can't learn that fast! Try more iterations.")

    # DATA-GATHERING PHASE: explore for the first 30 games.
    for ii in range(30):
        # Make a new monkey object.
        swing = SwingyMonkey(sound=False,             # Don't play sounds.
                             text="Epoch %d" % (ii),  # Display the epoch on screen.
                             tick_length=t_len,       # Make game ticks super fast.
                             action_callback=learner.explore_action_callback,
                             reward_callback=learner.reward_callback)

        # Loop until you hit something.
        while swing.game_loop():
            pass

        # Save score history.
        hist.append(swing.score)

        # Reset the state of the learner.
        learner.reset()

    # EXPLOITATION PHASE: play the remaining games greedily.
    for ii in range(30, iters):
        # Make a new monkey object.
        swing = SwingyMonkey(sound=False,             # Don't play sounds.
                             text="Epoch %d" % (ii),  # Display the epoch on screen.
                             tick_length=t_len,       # Make game ticks super fast.
                             action_callback=learner.action_callback,
                             reward_callback=learner.reward_callback)

        # Loop until you hit something.
        while swing.game_loop():
            pass

        # Save score history.
        hist.append(swing.score)

        # Reset the state of the learner.
        learner.reset()

    return
def session(learner, options):
    learner_class = init_learner(learner, options.learner_classes)

    # History dictionaries: epoch # -> whatever.
    rewards = {}
    scores = {}
    history = History(rewards, scores)

    # Save state.
    pvideo = options.video

    print("Starting training phase for %s ..." % (learner))
    max_score = 0
    for t in range(options.train_iters + options.test_iters):
        prev_score = scores[t - 1] if t > 0 else 0

        # Print information about the epoch currently being run.
        if t == options.train_iters:
            print("Starting testing phase for %s ..." % (learner))
            options.video = (options.test_tick > 0)
        if t < options.train_iters:
            print("======= Training epoch %d / %d." % (t, options.train_iters))
        else:
            print("======= Test epoch %d / %d." % (t - options.train_iters, options.test_iters))
        print("Max score: %d. Previous epoch score: %d" % (max_score, prev_score))

        # Make a new monkey object.
        swing = SwingyMonkey(visual=options.video,
                             sound=False,
                             tick_length=options.train_tick if t < options.train_iters else options.test_tick,
                             action_callback=learner_class.action_callback,
                             reward_callback=learner_class.reward_callback)

        # Loop until you hit something.
        episode_rewards = []
        while swing.game_loop():
            if learner_class.last_reward is not None:
                episode_rewards.append(learner_class.last_reward)

        # Collect statistics.
        rewards[t] = copy.deepcopy(episode_rewards)
        scores[t] = copy.deepcopy(swing.score)
        max_score = max(max_score, scores[t])

    # Reset.
    options.video = pvideo
    return history, learner_class
def testgame(iters=100, show=True):
    learner = QLearner2()
    highestscore = 0
    avgscore = 0

    learner.alpha = 0.2
    learner.gamma = 0.6
    alpha = learner.alpha
    gamma = learner.gamma

    # Write the CSV header once; one row per epoch is appended inside the loop.
    with open("test_Q2.csv", "w", newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["alpha", "gamma", "epoch", "highest", "average", "score", "q"])

    for ii in range(iters):
        learner.epsilon = 1 / (ii + 1)

        # Make a new monkey object.
        swing = SwingyMonkey(sound=False,             # Don't play sounds.
                             text="Epoch %d" % (ii),  # Display the epoch on screen.
                             tick_length=1,           # Make game ticks super fast.
                             action_callback=learner.action_callback,
                             reward_callback=learner.reward_callback)

        # Loop until you hit something.
        while swing.game_loop():
            pass

        score = swing.get_state()['score']
        highestscore = max([highestscore, score])
        avgscore = (ii * avgscore + score) / (ii + 1)
        q = round(float(np.count_nonzero(learner.Q)) * 100 / learner.Q.size, 3)
        if show:
            print("epoch:", ii, "highest:", highestscore, "current score:", score,
                  "average:", avgscore, "% of Q mx filled:", q)
        with open("test_Q2.csv", "a+", newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerows([[alpha, gamma, ii, highestscore, avgscore, score, q]])

        # Reset the state of the learner.
        learner.reset()

    pg.quit()
    return avgscore, highestscore, score
def sim_games(learner, iters=None, t_len=50):
    i = 0
    # Demonstrate the learner playing the game; with iters=None this ends only
    # when you close the window manually.
    while iters is None or i < iters:
        # Make a new monkey object.
        swing = SwingyMonkey(sound=False,
                             tick_length=t_len,
                             action_callback=learner.action_callback,
                             reward_callback=learner.reward_callback)

        # Loop until you hit something.
        while swing.game_loop():
            pass

        learner.reset()
        i += 1

    return
def testgame(iters=100, show=True):
    learner = QLearner2()
    highestscore = 0
    avgscore = 0

    # Track per-epoch statistics so they can be pickled for later analysis.
    record = {}
    record['epoch'] = []
    record['highest'] = []
    record['avg'] = []
    record['score'] = []
    record['q'] = []

    for ii in range(iters):
        learner.epsilon = 1 / (ii + 1)

        # Make a new monkey object.
        swing = SwingyMonkey(sound=False,             # Don't play sounds.
                             text="Epoch %d" % (ii),  # Display the epoch on screen.
                             tick_length=1,           # Make game ticks super fast.
                             action_callback=learner.action_callback,
                             reward_callback=learner.reward_callback)

        # Loop until you hit something.
        while swing.game_loop():
            pass

        score = swing.get_state()['score']
        highestscore = max([highestscore, score])
        avgscore = (ii * avgscore + score) / (ii + 1)
        q = round(float(np.count_nonzero(learner.Q)) * 100 / learner.Q.size, 3)
        if show:
            print("epoch:", ii, "highest:", highestscore, "current score:", score,
                  "average:", avgscore, "% of Q mx filled:", q)

        record['epoch'].append(ii)
        record['highest'].append(highestscore)
        record['avg'].append(avgscore)
        record['score'].append(score)
        record['q'].append(q)
        pickle.dump(record, open("record12.p", "wb"))

        # Reset the state of the learner.
        learner.reset()

    return avgscore, highestscore, score
def run_games(learner, hist, iters=100, t_len=100):
    '''
    Driver function to simulate learning by having the agent play a sequence of games.
    '''
    for ii in range(iters):
        # Make a new monkey object.
        swing = SwingyMonkey(sound=False,             # Don't play sounds.
                             text="Epoch %d" % (ii),  # Display the epoch on screen.
                             tick_length=t_len,       # Make game ticks super fast.
                             action_callback=learner.action_callback,
                             reward_callback=learner.reward_callback)

        # Pass the screen dimensions to the agent.
        learner.update_specs(swing.screen_height, swing.screen_width)

        # Loop until you hit something.
        while swing.game_loop():
            pass

        # Update the transition into the terminal state.
        learner.update_terminal_transition()

        # Save score history.
        hist.append(swing.score)
        print('Epoch %i: current score %i; best score %i' % (ii, swing.score, np.max(hist)))

        # Reset the state of the learner.
        learner.reset()

    # Display score history and stats.
    print('----------')
    print('Parameters: %0.2f alpha; %0.2f gamma; %0.2f epsilon' %
          (learner.alpha, learner.gamma, learner.epsilon))
    print('Score history:', hist)
    print('Best score:', np.max(hist))
    print('Average score:', np.mean(hist))
    print('----------')
    return np.max(hist)
def run_games(learner, hist, iters=100, t_len=100):
    '''
    Driver function to simulate learning by having the agent play a sequence of games.
    '''
    for ii in range(iters):
        # Make a new monkey object.
        swing = SwingyMonkey()

        # Initialize history dictionaries for iteration ii.
        hist['state'][ii] = []
        hist['action'][ii] = []
        hist['reward'][ii] = []

        # Loop until you hit something.
        while swing.game_loop():
            # This is where we build SARSA arrays utilizing learner.method().
            # You can get the action via learner.last_action (False=0/glide, True=1/jump).
            # You can get the state via learner.last_state.
            # You can get the reward via learner.last_reward (0; +1 if pass; -5 if hit; -10 if fall off screen).
            # Gravity can be inferred by checking the monkey's velocity from time step
            # to time step whenever the action is False; gravity is an integer 1, 2, 3, or 4.
            hist['state'][ii].append(learner.last_state)
            hist['action'][ii].append(learner.last_action)
            hist['reward'][ii].append(learner.last_reward)
        else:
            # The while/else branch runs once the game ends: record the final
            # state, action, and reward just to see how the monkey failed.
            hist['state'][ii].append(learner.last_state)
            hist['action'][ii].append(learner.last_action)
            hist['reward'][ii].append(learner.last_reward)

        # Save score history.
        hist['score'].append(swing.score)

        # Reset the state of the learner.
        learner.reset()

    return
def run_games(learner, hist, iters=100, t_len=100):
    '''
    Driver function to simulate learning by having the agent play a sequence of games.
    '''
    high = 0
    avg = 0
    for ii in range(iters):
        # Make a new monkey object.
        swing = SwingyMonkey(sound=False,             # Don't play sounds.
                             text="Epoch %d" % (ii),  # Display the epoch on screen.
                             tick_length=t_len,       # Make game ticks super fast.
                             action_callback=learner.action_callback,
                             reward_callback=learner.reward_callback)

        # Loop until you hit something.
        while swing.game_loop():
            pass
        new_score = swing.score

        # Save score history.
        if new_score > high:
            high = new_score
        avg = (new_score + ii * avg) / (ii + 1.0)
        print("%i\t%i\t%i\t%s:\t%s" % (ii, new_score, high, avg, np.mean(learner.Q)))
        hist.append(swing.score)

        # Reset the state of the learner.
        learner.reset()

    print(learner.Q)
    print(learner.state_counts)
    return
def run_games(learner, hist, iters=100, t_len=100):
    '''
    Driver function to simulate learning by having the agent play a sequence of games.
    '''
    net_states = []
    net_rewards = []
    net_actions = []
    for ii in range(iters):
        # Make a new monkey object.
        swing = SwingyMonkey(sound=False,             # Don't play sounds.
                             text="Epoch %d" % (ii),  # Display the epoch on screen.
                             tick_length=t_len,       # Make game ticks super fast.
                             action_callback=learner.action_callback,
                             reward_callback=learner.reward_callback)

        # Loop until you hit something.
        iter_states = []
        iter_rewards = []
        iter_actions = []
        iter_count = 0
        while swing.game_loop():
            state = swing.get_state()
            iter_states.append(np.array(list(state['tree'].values()) +
                                        list(state['monkey'].values()) +
                                        [learner.gravity]))
            iter_rewards.append(learner.last_reward)
            iter_actions.append(int(learner.last_action))
            iter_count += 1
            if iter_count > 1 and not learner.know_gravity:
                learner.learn_gravity(iter_states, iter_actions)
            if learner.know_gravity:
                for num in range(len(iter_states)):
                    iter_states[num][-1] = learner.gravity

        # Record the terminal state as well.
        state = swing.get_state()
        iter_states.append(list(state['tree'].values()) +
                           list(state['monkey'].values()) +
                           [learner.gravity])
        iter_rewards.append(learner.last_reward)
        iter_actions.append(int(learner.last_action))

        # Add this game to the net training set.
        net_states += iter_states
        net_rewards += iter_rewards
        net_actions += iter_actions

        if ii == 0:
            # First epoch: fit directly to the observed rewards.
            xtrain = build_training_set(net_states, net_actions)
            ytrain = np.array(net_rewards)
            RF = ExtraTreesRegressor(n_estimators=50)
            RF.fit(xtrain, ytrain)
        else:
            xtrain = build_training_set(net_states[:-1], net_actions[:-1])
            # Build the Q-state update (one-step Bellman backup through the current model).
            ytrain = np.array([
                learner.model.predict(np.append(net_states[k], net_actions[k])) +
                learner.alpha * (net_rewards[k] +
                                 learner.gamma * np.max([
                                     learner.model.predict(np.append(net_states[k + 1], int(action)))
                                     for action in learner.actions]) -
                                 learner.model.predict(np.append(net_states[k], net_actions[k])))
                for k in range(len(net_states) - 1)])
            RF = ExtraTreesRegressor(n_estimators=50)
            RF.fit(xtrain, ytrain)

        learner.model = RF
        learner.model_trained = True
        if ii % 10 == 0:
            learner.epsilon -= 0.05

        # Save score history.
        hist.append(swing.score)

        # Reset the state of the learner.
        learner.reset()

    return
def run_games(learner, hist, eps=0.5, gam=0.5, alph=0.75, iters=20, t_len=100, test=False):
    '''
    Driver function to simulate learning by having the agent play a sequence of games.
    '''
    # Place epsilon, gamma, and alpha values into the learner.
    learner.eps = eps
    learner.gam = gam
    learner.alph = alph
    learner.num_actions = 2

    # Initialize histories for the Q-function estimator.
    total_states = []
    total_actions = []
    total_rewards = []
    total_scores = []

    for ii in range(iters):
        # Make a new monkey object.
        swing = SwingyMonkey(sound=False,             # Don't play sounds.
                             text="Epoch %d" % (ii),  # Display the epoch on screen.
                             tick_length=t_len,       # Make game ticks super fast.
                             action_callback=learner.action_callback,
                             reward_callback=learner.reward_callback)

        # Initialize history lists for iteration ii.
        states = []
        actions = []
        rewards = []
        loop_counter = 0

        # Loop until you hit something.
        while swing.game_loop():
            states.append(learner.create_state_tuple(learner.last_state))
            actions.append(int(learner.last_action == True))
            rewards.append(learner.last_reward)
            if learner.learn_g and loop_counter > 1:
                learner.infer_g(states, actions)
                for pp in range(len(states)):
                    states[pp][-1] = learner.gravity
            loop_counter += 1
        else:
            # Get the final state, action, and reward to see how the monkey failed.
            states.append(learner.create_state_tuple(learner.last_state))
            actions.append(int(learner.last_action == True))
            rewards.append(learner.last_reward)

        # Append histories from the most recent epoch and create training arrays.
        total_scores.append(swing.score)
        total_states += states
        total_actions += actions
        total_rewards += rewards

        if not test:
            # Iteratively refine the optimal policy after each epoch.
            if ii == 0:
                X_train = np.array([np.append(total_states[kk], total_actions[kk])
                                    for kk in range(len(total_states))])
                y_train = np.array(total_rewards)
                # Build a tree ensemble using first-stage Q-learning.
                extraTrees = ExtraTreesRegressor(n_estimators=50)
                extraTrees.fit(X_train, y_train)
            else:
                # Refit the random forest estimator based on the composite epochs:
                # generate new X(state, action) and y(reward) arrays from the newly
                # run batch, using the current Q-estimator and prior rewards a la Ernst '06.
                X_train = np.array([np.append(total_states[kk], total_actions[kk])
                                    for kk in range(len(total_rewards) - 1)])
                # Construct Bellman's equation to get expected rewards based on the next state.
                y_train = np.array([
                    learner.estimator.predict(np.append(total_states[kk], total_actions[kk])) +
                    learner.alph * (total_rewards[kk] +
                                    learner.gam * np.max([
                                        learner.estimator.predict(np.append(total_states[kk + 1], act))
                                        for act in range(learner.num_actions)]) -
                                    learner.estimator.predict(np.append(total_states[kk], total_actions[kk])))
                    for kk in range(len(total_states) - 1)])
                # Re-fit the regression to refine the optimal policy according to expected reward.
                extraTrees = ExtraTreesRegressor(n_estimators=50)
                extraTrees.fit(X_train, y_train)

            # As we refine the policy, reduce the amount we explore.
            if ii % 10 == 0:
                learner.eps += 0.05
            learner.estimator = extraTrees
            learner.fitted = True
        else:
            learner.fitted = True

        # Reset the state of the learner.
        learner.reset()

    # Place state, action, reward, and score histories to be saved by the wrapper.
    hist['state_history'] = total_states
    hist['action_history'] = total_actions
    hist['reward_history'] = total_rewards
    hist['score_history'] = total_scores
    return
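# The inline Bellman target above is hard to read, so here is an equivalent,
# minimal sketch of the fitted Q-iteration update (a la the Ernst-style batch
# Q-learning referenced in the comments). It assumes, as the code above does,
# that estimator.predict accepts a single (state, action) feature row; the
# helper name fitted_q_targets is hypothetical, not part of the original code.
def fitted_q_targets(estimator, states, actions_taken, rewards, num_actions, alph, gam):
    """One-step Bellman backup of each old Q estimate toward reward + discounted max-Q."""
    targets = []
    for k in range(len(states) - 1):
        q_sa = estimator.predict(np.append(states[k], actions_taken[k]))
        # Greedy one-step lookahead value of the successor state.
        q_next = max(estimator.predict(np.append(states[k + 1], a))
                     for a in range(num_actions))
        # Move the old estimate toward the one-step Bellman backup.
        targets.append(q_sa + alph * (rewards[k] + gam * q_next - q_sa))
    return np.array(targets)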
iters = 10000
learner = Learner()

for ii in range(iters):
    # Make a new monkey object.
    swing = SwingyMonkey(sound=False,    # Don't play sounds.
                         tick_length=1,  # Make game ticks super fast.
                         # Display the epoch on screen and % of Q matrix filled.
                         text="Epoch %d " % (ii) + str(round(
                             float(np.count_nonzero(learner.Q)) * 100 / learner.Q.size, 3)) + "%",
                         action_callback=learner.action_callback,
                         reward_callback=learner.reward_callback)

    # Loop until you hit something.
    while swing.game_loop():
        pass

    # Keep track of the score for that epoch.
    learner.scores.append(learner.last_state['score'])
    if learner.last_state['score'] > learner.best_score:
        print('New best Q')
        learner.best_score = learner.last_state['score']
        learner.bestQ = learner.Q.copy()
    print('score %d' % learner.last_state['score'],
          str(round(float(np.count_nonzero(learner.Q)) * 100 / learner.Q.size, 3)) + "%")

    # Reset the state of the learner.
    learner.reset()

print(np.mean(learner.scores))
def run_games(learner, hist, policy="random", eps=0.9, gam=0.5, alph=0.75, iters=20, t_len=100):
    """
    Driver function to simulate learning by having the agent play a sequence of games.
    """
    # Place epsilon, gamma, and alpha values into the learner.
    learner.eps = eps
    learner.gam = gam
    learner.alph = alph
    learner.num_actions = 2

    # Initialize histories for the Q-function estimator.
    total_states = []
    total_actions = []
    total_rewards = []
    total_scores = []

    for ii in range(iters):
        # Make a new monkey object.
        if policy == "random":
            swing = SwingyMonkey(sound=False,
                                 text="Random Epoch %d" % (ii),
                                 tick_length=t_len,
                                 action_callback=learner.random_actions,
                                 reward_callback=learner.reward_callback)
        else:
            swing = SwingyMonkey(sound=False,                     # Don't play sounds.
                                 text="Learned Epoch %d" % (ii),  # Display the epoch on screen.
                                 tick_length=t_len,               # Make game ticks super fast.
                                 action_callback=learner.action_callback,
                                 reward_callback=learner.reward_callback)
            learner.fitted = True

        # Initialize history lists for iteration ii.
        states = []
        actions = []
        rewards = []
        loop_counter = 0

        # Loop until you hit something.
        while swing.game_loop():
            states.append(learner.create_state_tuple(learner.last_state))
            actions.append(int(learner.last_action == True))
            rewards.append(learner.last_reward)
            if learner.learn_g and loop_counter > 1:
                learner.infer_g(states, actions)
                for pp in range(len(states)):
                    states[pp][-1] = learner.gravity
            loop_counter += 1
        else:
            # Get the final state, action, and reward to see how the monkey failed.
            states.append(learner.create_state_tuple(learner.last_state))
            actions.append(int(learner.last_action == True))
            rewards.append(learner.last_reward)

        # Append histories from the most recent epoch.
        total_scores.append(swing.score)
        total_states += states
        total_actions += actions
        total_rewards += rewards

        # Reset the state of the learner.
        learner.reset()

    hist["state_history"] = hist["state_history"] + total_states
    hist["action_history"] += total_actions
    hist["reward_history"] += total_rewards
    hist["score_history"] += total_scores
    return