def main(**kargs):
    # Resume training from the most recent checkpoint found in weights_dir.
    initial_weights_file, initial_i_frame = latest(kargs['weights_dir'])
    print("Continuing using weights from file: ", initial_weights_file, "from", initial_i_frame)

    if kargs['theano_verbose']:
        theano.config.compute_test_value = 'warn'
        theano.config.exception_verbosity = 'high'
        theano.config.optimizer = 'fast_compile'

    ale = ag.init(display_screen=(kargs['visualize'] == 'ale'), record_dir=kargs['record_dir'])
    game = ag.SpaceInvadersGame(ale)

    def new_game():
        game.ale.reset_game()
        game.finished = False
        game.cum_reward = 0
        game.lives = 4
        return game

    replay_memory = dqn.ReplayMemory(size=kargs['dqn.replay_memory_size']) if not kargs['dqn.no_replay'] else None

    # dqn_algo = q.ConstAlgo([3])
    dqn_algo = dqn.DQNAlgo(game.n_actions(),
                           replay_memory=replay_memory,
                           initial_weights_file=initial_weights_file,
                           build_network=kargs['dqn.network'],
                           updates=kargs['dqn.updates'])

    dqn_algo.replay_start_size = kargs['dqn.replay_start_size']
    dqn_algo.final_epsilon = kargs['dqn.final_epsilon']
    dqn_algo.initial_epsilon = kargs['dqn.initial_epsilon']
    dqn_algo.i_frames = initial_i_frame
    dqn_algo.log_frequency = kargs['dqn.log_frequency']

    # Optional live plotting of the algorithm's internal state: items are pushed onto
    # a queue by the learner and consumed by a daemon thread so training is not blocked.
    import Queue
    dqn_algo.mood_q = Queue.Queue() if kargs['show_mood'] else None

    if kargs['show_mood'] is not None:
        plot = kargs['show_mood']()

        def worker():
            while True:
                item = dqn_algo.mood_q.get()
                plot.show(item)
                dqn_algo.mood_q.task_done()

        import threading
        t = threading.Thread(target=worker)
        t.daemon = True
        t.start()

    print(str(dqn_algo))

    visualizer = ag.SpaceInvadersGameCombined2Visualizer() if kargs['visualize'] == 'q' else q.GameNoVisualizer()
    teacher = q.Teacher(new_game, dqn_algo, visualizer,
                        ag.Phi(skip_every=4), repeat_action=4, sleep_seconds=0)
    teacher.teach(500000)
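# A hedged example of how main() might be invoked. The kwargs keys are the ones the
# function body reads; the concrete values, the 'weights/' directory, and the
# `build_network` / `updates` callables are assumptions for illustration only:
#
#   main(weights_dir='weights/',
#        theano_verbose=False,
#        visualize='q',          # 'q' or 'ale'; anything else disables visualization
#        record_dir=None,
#        show_mood=None,
#        **{'dqn.replay_memory_size': 400000,
#           'dqn.no_replay': False,
#           'dqn.network': build_network,   # hypothetical network factory
#           'dqn.updates': updates,         # hypothetical Theano update rule
#           'dqn.replay_start_size': 50000,
#           'dqn.initial_epsilon': 1.0,
#           'dqn.final_epsilon': 0.1,
#           'dqn.log_frequency': 1})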
def random_on_space_invaders():
    import q_learning as q
    import numpy as np
    import ale_game as ag
    reload(q)
    reload(ag)
    ale = ag.init()
    game = ag.SpaceInvadersGame(ale)
    # game.show_vectorized(game.vectorized(ale.getScreen()))
    teacher = q.Teacher(game, q.RandomAlgo(game.get_actions()),
                        ag.SpaceInvadersGameVectorizedVisualizer())
    teacher.teach(1)
def const_on_space_invaders():
    import teacher as q
    import ale_game as ag
    import dqn
    reload(q)
    reload(ag)
    reload(dqn)
    ale = ag.init()
    game = ag.SpaceInvadersGame(ale)

    def new_game():
        game.ale.reset_game()
        game.finished = False
        game.cum_reward = 0
        return game

    const_algo = q.ConstAlgo([2, 2, 2, 2, 2, 0, 0, 0, 0])
    teacher = q.Teacher(new_game, const_algo, ag.SpaceInvadersGameCombined2Visualizer(),
                        ag.Phi(skip_every=6), repeat_action=6)
    teacher.teach(1)
def sarsa_gd_on_space_invaders():
    import q_learning as q
    import numpy as np
    import ale_game as ag
    import matplotlib.pyplot as plt
    plt.ion()
    reload(q)
    reload(ag)
    ale = ag.init()
    run = '1'
    n_colors = 5

    def state_adapter(scr):
        vect = np.reshape(ag.vectorized(scr, 14, 20), 14 * 20 * n_colors)
        return np.where(vect)[0]

    game = ag.SpaceInvadersGame(ale)
    q_algo1 = q.SARSALambdaGradientDescent(game.get_actions(), game.get_state(),
                                           initial_q=5,
                                           initial_theta=[1] * 14 * 20 * n_colors,
                                           be_positive=False,
                                           state_adapter=state_adapter)
    q_algo1.epsilon = 0.05
    q_algo1.lmbda = 0.99  # 0.9
    q_algo1.gamma = 0.999
    q_algo1.alpha = 0.5

    def new_game():
        game.ale.reset_game()
        game.finished = False
        game.cum_reward = 0
        return game

    # Sanity check: one greedy episode with the vectorized visualizer.
    teacher = q.Teacher(new_game, q_algo1, ag.SpaceInvadersGameVectorizedVisualizer(), repeat_action=3)
    # teacher.single_step(Game)
    q_algo1.epsilon = 0
    q_algo1.log_freq = 1
    teacher.teach(1)

    initial_training = 1000
    training_decay_from = 95
    training_decay_ex = 50

    result_test = []
    result_1 = []
    result_2 = []

    # Fully exploratory warm-up phase.
    teacher = q.Teacher(new_game, q_algo1, q.GameNoVisualizer(), repeat_action=3)
    q_algo1.log_freq = 0.05
    q_algo1.epsilon = 1
    result_1 = teacher.teach(initial_training)

    q_algo1.epsilon = 0
    q_algo1.log_freq = 0.05
    result_test.append(teacher.teach(1))

    # Decay exploration in 1% steps, evaluating greedily after each batch.
    for i in range(training_decay_from):
        q_algo1.epsilon = 1 - i / 100.0  # float division, so epsilon actually decreases
        teacher = q.Teacher(new_game, q_algo1, q.GameNoVisualizer(), repeat_action=3)
        result_2.append(teacher.teach(training_decay_ex))
        q_algo1.epsilon = 0
        result_test.append(teacher.teach(1))

    import cPickle as pickle
    with open('gradient_descent.theta' + run, 'wb') as handle:
        pickle.dump(q_algo1.theta, handle)
    with open('gradient_descent.gamma' + run, 'wb') as handle:
        pickle.dump(q_algo1.gamma, handle)
    with open('gradient_descent.lmbda' + run, 'wb') as handle:
        pickle.dump(q_algo1.lmbda, handle)
    with open('gradient_descent.alpha' + run, 'wb') as handle:
        pickle.dump(q_algo1.alpha, handle)

    # Rolling-mean learning curves for warm-up, decay-phase training, and greedy tests.
    r1 = [a[1] for a in result_1]
    plt.plot(np.array([x[1] - x[0] for x in zip(np.cumsum(r1), np.cumsum(r1)[200:])]) / 200)

    r2 = [a[1] for r in result_2 for a in r]
    plt.plot(np.array([x[1] - x[0] for x in zip(np.cumsum(r2), np.cumsum(r2)[200:])]) / 200)

    r_test = [a[1] for r in result_test for a in r]
    plt.plot(np.array([x[1] - x[0] for x in zip(np.cumsum(r_test), np.cumsum(r_test)[50:])]) / 50)

    # The original code also plotted a `result_4` series here, but `result_4` is never
    # defined in this function (it would raise NameError), so the block is kept commented out:
    # r_4 = [a[1] for a in result_4]
    # plt.plot(np.array([x[1] - x[0] for x in zip(np.cumsum(r_test), np.cumsum(r_4)[2:])]) / 2)

    # Final run with a small residual exploration rate.
    q_algo1.epsilon = 0.1
    teacher = q.Teacher(new_game, q_algo1, q.GameNoVisualizer(), repeat_action=3)
    teacher.teach(100)
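# The learning-curve plots above use a cumulative-sum trick to get a trailing rolling
# mean of per-episode rewards. A minimal self-contained sketch of that computation
# (an illustrative helper, not part of the original module):
def _rolling_mean(values, window):
    """Trailing mean over each `window` consecutive entries of `values`."""
    import numpy as np
    csum = np.cumsum(values)
    # csum[i + window] - csum[i] is the sum of `window` consecutive rewards, which is
    # what zip(np.cumsum(r), np.cumsum(r)[window:]) computes in the plots above.
    return (csum[window:] - csum[:-window]) / float(window)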