def main(**kargs):
    initial_weights_file, initial_i_frame = latest(kargs['weights_dir'])
    print("Continuing using weights from file: ", initial_weights_file, "from", initial_i_frame)

    if kargs['theano_verbose']:
        theano.config.compute_test_value = 'warn'
        theano.config.exception_verbosity = 'high'
        theano.config.optimizer = 'fast_compile'

    ale = ag.init(display_screen=(kargs['visualize'] == 'ale'),
                  record_dir=kargs['record_dir'])
    game = ag.SpaceInvadersGame(ale)

    def new_game():
        game.ale.reset_game()
        game.finished = False
        game.cum_reward = 0
        game.lives = 4
        return game

    replay_memory = dqn.ReplayMemory(size=kargs['dqn.replay_memory_size']) \
        if not kargs['dqn.no_replay'] else None
    # dqn_algo = q.ConstAlgo([3])
    dqn_algo = dqn.DQNAlgo(game.n_actions(),
                           replay_memory=replay_memory,
                           initial_weights_file=initial_weights_file,
                           build_network=kargs['dqn.network'],
                           updates=kargs['dqn.updates'])

    dqn_algo.replay_start_size = kargs['dqn.replay_start_size']
    dqn_algo.final_epsilon = kargs['dqn.final_epsilon']
    dqn_algo.initial_epsilon = kargs['dqn.initial_epsilon']
    dqn_algo.i_frames = initial_i_frame
    dqn_algo.log_frequency = kargs['dqn.log_frequency']

    import Queue
    dqn_algo.mood_q = Queue.Queue() if kargs['show_mood'] else None

    if kargs['show_mood'] is not None:
        plot = kargs['show_mood']()

        def worker():
            while True:
                item = dqn_algo.mood_q.get()
                plot.show(item)
                dqn_algo.mood_q.task_done()

        import threading
        t = threading.Thread(target=worker)
        t.daemon = True
        t.start()

    print(str(dqn_algo))

    visualizer = ag.SpaceInvadersGameCombined2Visualizer() \
        if kargs['visualize'] == 'q' else q.GameNoVisualizer()
    teacher = q.Teacher(new_game, dqn_algo, visualizer,
                        ag.Phi(skip_every=4), repeat_action=4, sleep_seconds=0)
    teacher.teach(500000)
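# `latest` is called above but not defined in this file. A minimal sketch of
# what it is assumed to do: scan `weights_dir` for saved weight files and
# return the newest one together with the frame counter encoded in its name,
# or (None, 0) when starting fresh. The "weights_<n_frames>" naming scheme is
# an assumption, not taken from the original source.
def latest(weights_dir):
    import os
    import re
    candidates = []
    for name in os.listdir(weights_dir):
        m = re.match(r'weights_(\d+)', name)  # hypothetical naming scheme
        if m:
            candidates.append((int(m.group(1)), os.path.join(weights_dir, name)))
    if not candidates:
        return None, 0
    i_frame, path = max(candidates)
    return path, i_frame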
def random_on_space_invaders():
    import q_learning as q
    import numpy as np
    import ale_game as ag
    reload(q)
    reload(ag)
    ale = ag.init()
    game = ag.SpaceInvadersGame(ale)
    # game.show_vectorized(game.vectorized(ale.getScreen()))
    teacher = q.Teacher(game, q.RandomAlgo(game.get_actions()),
                        ag.SpaceInvadersGameVectorizedVisualizer())
    teacher.teach(1)
def dqn_on_space_invaders_play(initial_weights_file, visualize='q', show_mood=False):
    import q_learning as q
    import ale_game as ag
    import dqn
    reload(q)
    reload(ag)
    reload(dqn)
    print("Using weights from file: ", initial_weights_file)
    ale = ag.init(display_screen=(visualize == 'ale'))
    game = ag.SpaceInvadersGame(ale)

    def new_game():
        game.ale.reset_game()
        game.finished = False
        game.cum_reward = 0
        game.lives = 4
        return game

    replay_memory = dqn.ReplayMemory(size=100, grace=10)
    dqn_algo = dqn.DQNAlgo(game.n_actions(),
                           replay_memory=replay_memory,
                           initial_weights_file=initial_weights_file)
    dqn_algo.epsilon = 0.1
    dqn_algo.initial_epsilon = 0.1
    dqn_algo.final_epsilon = 0.1
    dqn_algo.ignore_feedback = True
    dqn_algo.log_frequency = 0

    import Queue
    dqn_algo.mood_q = Queue.Queue() if show_mood else None
    if show_mood:
        plot = Plot()  # assumes a Plot class (mood visualizer) is available in scope

        def worker():
            while True:
                item = dqn_algo.mood_q.get()
                plot.show(item)
                dqn_algo.mood_q.task_done()

        import threading
        t = threading.Thread(target=worker)
        t.daemon = True
        t.start()

    print(str(dqn_algo))

    visualizer = ag.SpaceInvadersGameCombined2Visualizer() \
        if visualize == 'q' else q.GameNoVisualizer()
    teacher = q.Teacher(new_game, dqn_algo, visualizer,
                        ag.Phi(skip_every=4), repeat_action=4, sleep_seconds=0)
    return teacher.teach(100)
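# `Plot()` above is assumed to be a mood visualizer exposing a `show(item)`
# method, consumed by the worker thread. A minimal matplotlib-based sketch,
# assuming items are scalars (the real item layout is not shown in this file):
class Plot(object):
    def __init__(self):
        import matplotlib.pyplot as plt
        plt.ion()
        self.plt = plt
        self.history = []

    def show(self, item):
        # Track every item received and redraw the series.
        self.history.append(item)
        self.plt.clf()
        self.plt.plot(self.history)
        self.plt.draw()
        self.plt.pause(0.001)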
def dqn_on_space_invaders_cpu(visualize=False, theano_verbose=False,
                              initial_weights_file=None, ignore_feedback=False):
    import q_learning as q
    import ale_game as ag
    import dqn
    import theano
    reload(q)
    reload(ag)
    reload(dqn)
    if theano_verbose:
        theano.config.compute_test_value = 'warn'
        theano.config.exception_verbosity = 'high'
        theano.config.optimizer = 'fast_compile'
    ale = ag.init()
    game = ag.SpaceInvadersGame(ale)

    def new_game():
        game.ale.reset_game()
        game.finished = False
        game.cum_reward = 0
        game.lives = 4
        return game

    replay_memory = dqn.ReplayMemory(size=100, grace=10)
    dqn_algo = dqn.DQNAlgo(game.n_actions(),
                           replay_memory=replay_memory,
                           initial_weights_file=initial_weights_file)
    dqn_algo.target_network_update_frequency = 50
    dqn_algo.replay_memory_size = 100
    dqn_algo.replay_start_size = 75
    dqn_algo.epsilon = 0.1
    dqn_algo.initial_epsilon = 0.1
    dqn_algo.final_epsilon = 0.1
    dqn_algo.log_frequency = 10
    dqn_algo.ignore_feedback = ignore_feedback
    # dqn_algo.ignore_feedback = True

    print(str(dqn_algo))

    visualizer = ag.SpaceInvadersGameCombined2Visualizer() \
        if visualize else q.GameNoVisualizer()
    teacher = q.Teacher(new_game, dqn_algo, visualizer,
                        ag.Phi(skip_every=4), repeat_action=4, sleep_seconds=0)
    teacher.teach(500000)
def random_on_space_invaders():
    import q_learning as q
    import ale_game as ag
    reload(q)
    reload(ag)
    ale = ag.init()
    game = ag.SpaceInvadersGame(ale)

    def new_game():
        game.ale.reset_game()
        game.finished = False
        game.cum_reward = 0
        return game

    # game.show_vectorized(game.vectorized(ale.getScreen()))
    teacher = q.Teacher(new_game, q.RandomAlgo(game.get_actions()),
                        ag.SpaceInvadersGameCombined2Visualizer(),
                        ag.Phi(skip_every=6), repeat_action=6)
    teacher.teach(1)
def const_on_space_invaders():
    import teacher as q
    import ale_game as ag
    import dqn
    reload(q)
    reload(ag)
    reload(dqn)
    ale = ag.init()
    game = ag.SpaceInvadersGame(ale)

    def new_game():
        game.ale.reset_game()
        game.finished = False
        game.cum_reward = 0
        return game

    const_algo = q.ConstAlgo([2, 2, 2, 2, 2, 0, 0, 0, 0])
    teacher = q.Teacher(new_game, const_algo,
                        ag.SpaceInvadersGameCombined2Visualizer(),
                        ag.Phi(skip_every=6), repeat_action=6)
    teacher.teach(1)
def sarsa_gd_on_space_invaders():
    import q_learning as q
    import numpy as np
    import ale_game as ag
    import matplotlib.pyplot as plt
    plt.ion()
    reload(q)
    reload(ag)
    ale = ag.init()
    run = '1'
    n_colors = 5

    def state_adapter(scr):
        vect = np.reshape(ag.vectorized(scr, 14, 20), 14 * 20 * n_colors)
        return np.where(vect)[0]

    game = ag.SpaceInvadersGame(ale)
    q_algo1 = q.SARSALambdaGradientDescent(
        game.get_actions(), game.get_state(),
        initial_q=5,
        initial_theta=[1] * 14 * 20 * n_colors,
        be_positive=False,
        state_adapter=state_adapter)
    q_algo1.epsilon = 0.05
    q_algo1.lmbda = 0.99  # 0.9
    q_algo1.gamma = 0.999
    q_algo1.alpha = 0.5

    def new_game():
        game.ale.reset_game()
        game.finished = False
        game.cum_reward = 0
        return game

    teacher = q.Teacher(new_game, q_algo1,
                        ag.SpaceInvadersGameVectorizedVisualizer(),
                        repeat_action=3)
    # teacher.single_step(Game)
    q_algo1.epsilon = 0
    q_algo1.log_freq = 1
    teacher.teach(1)

    initial_training = 1000
    training_decay_from = 95
    training_decay_ex = 50
    result_test = []
    result_1 = []
    result_2 = []

    teacher = q.Teacher(new_game, q_algo1, q.GameNoVisualizer(), repeat_action=3)
    q_algo1.log_freq = 0.05
    q_algo1.epsilon = 1
    result_1 = teacher.teach(initial_training)
    q_algo1.epsilon = 0
    q_algo1.log_freq = 0.05
    result_test.append(teacher.teach(1))

    for i in range(training_decay_from):
        q_algo1.epsilon = 1 - i / 100.0  # float division; `i / 100` truncates to 0 in Python 2
        teacher = q.Teacher(new_game, q_algo1, q.GameNoVisualizer(), repeat_action=3)
        result_2.append(teacher.teach(training_decay_ex))
        q_algo1.epsilon = 0
        result_test.append(teacher.teach(1))

    import cPickle as pickle
    with open('gradient_descent.theta' + run, 'wb') as handle:
        pickle.dump(q_algo1.theta, handle)
    with open('gradient_descent.gamma' + run, 'wb') as handle:
        pickle.dump(q_algo1.gamma, handle)
    with open('gradient_descent.lmbda' + run, 'wb') as handle:
        pickle.dump(q_algo1.lmbda, handle)
    with open('gradient_descent.alpha' + run, 'wb') as handle:
        pickle.dump(q_algo1.alpha, handle)

    # Windowed moving averages of per-game rewards (see the moving_average
    # sketch below).
    r1 = [a[1] for a in result_1]
    plt.plot(np.array([x[1] - x[0] for x in zip(np.cumsum(r1), np.cumsum(r1)[200:])]) / 200)
    r2 = [a[1] for r in result_2 for a in r]
    plt.plot(np.array([x[1] - x[0] for x in zip(np.cumsum(r2), np.cumsum(r2)[200:])]) / 200)
    r_test = [a[1] for r in result_test for a in r]
    plt.plot(np.array([x[1] - x[0] for x in zip(np.cumsum(r_test), np.cumsum(r_test)[50:])]) / 50)
    # `result_4` is never assigned, so this plot is left disabled:
    # r_4 = [a[1] for a in result_4]
    # plt.plot(np.array([x[1] - x[0] for x in zip(np.cumsum(r_test), np.cumsum(r_4)[2:])]) / 2)

    q_algo1.epsilon = 0.1
    teacher = q.Teacher(new_game, q_algo1, q.GameNoVisualizer(), repeat_action=3)
    teacher.teach(100)
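# The plotting code above repeats one idiom: a windowed moving average of
# per-game rewards, written as a zip over shifted cumulative sums. A small
# equivalent helper (illustrative, not from the original source):
import numpy as np

def moving_average(rewards, window):
    # c[window:] - c[:-window] is the sum of each length-`window` slice.
    c = np.cumsum(rewards)
    return (c[window:] - c[:-window]) / float(window)

# e.g. plt.plot(moving_average([a[1] for a in result_1], 200))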
def main(**kargs):
    initial_weights_file, i_total_action = latest(kargs['weights_dir'])
    print("Continuing using weights from file: ", initial_weights_file, "from", i_total_action)

    if kargs['theano_verbose']:
        theano.config.compute_test_value = 'warn'
        theano.config.exception_verbosity = 'high'
        theano.config.optimizer = 'fast_compile'

    if kargs['game'] == 'simple_breakout':
        game = simple_breakout.SimpleBreakout()

        class P(object):
            def __init__(self):
                self.screen_size = 12

            def __call__(self, frames):
                return frames

        phi = P()
    else:
        ale = ag.init(game=kargs['game'],
                      display_screen=(kargs['visualize'] == 'ale'),
                      record_dir=kargs['record_dir'])
        game = ag.ALEGame(ale)
        phi = ag.Phi(method=kargs['phi_method'])

    replay_memory = dqn.ReplayMemory(size=kargs['dqn.replay_memory_size']) \
        if not kargs['dqn.no_replay'] else None
    algo = dqn.DQNAlgo(game.n_actions(),
                       replay_memory=replay_memory,
                       initial_weights_file=initial_weights_file,
                       build_network=kargs['dqn.network'],
                       updates=kargs['dqn.updates'],
                       screen_size=phi.screen_size)

    algo.replay_start_size = kargs['dqn.replay_start_size']
    algo.final_epsilon = kargs['dqn.final_epsilon']
    algo.initial_epsilon = kargs['dqn.initial_epsilon']
    algo.i_action = i_total_action
    algo.log_frequency = kargs['dqn.log_frequency']
    algo.target_network_update_frequency = kargs['target_network_update_frequency']
    algo.final_exploration_frame = kargs['final_exploration_frame']

    import Queue
    algo.mood_q = Queue.Queue() if kargs['show_mood'] else None

    if kargs['show_mood'] is not None:
        plot = kargs['show_mood']()

        def worker():
            while True:
                item = algo.mood_q.get()
                plot.show(item)
                algo.mood_q.task_done()

        import threading
        t = threading.Thread(target=worker)
        t.daemon = True
        t.start()

    print(str(algo))

    if kargs['visualize'] != 'q':
        visualizer = q.GameNoVisualizer()
    else:
        if kargs['game'] == 'simple_breakout':
            visualizer = simple_breakout.SimpleBreakoutVisualizer(algo)
        else:
            visualizer = ag.ALEGameVisualizer(phi.screen_size)

    teacher = q.Teacher(game=game,
                        algo=algo,
                        game_visualizer=visualizer,
                        phi=phi,
                        repeat_action=kargs['repeat_action'],
                        i_total_action=i_total_action,
                        total_n_actions=50000000,
                        max_actions_per_game=10000,
                        skip_n_frames_after_lol=kargs['skip_n_frames_after_lol'],
                        run_test_every_n=kargs['run_test_every_n'])
    teacher.teach()
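# A hypothetical driver for the `main(**kargs)` above. The dotted keys mirror
# the kargs lookups in the function body exactly; the concrete values are
# illustrative assumptions, not settings from the original source, and
# `network.build_nature` / `lasagne.updates.rmsprop` assume those modules are
# importable, as in the other `main` below.
example_kargs = {
    'weights_dir': 'weights',
    'theano_verbose': False,
    'game': 'space_invaders',
    'visualize': 'q',  # 'q' for the game visualizer, 'ale' for ALE's own screen
    'record_dir': None,
    'phi_method': 'resize',  # assumed preprocessing method name
    'dqn.replay_memory_size': 400000,
    'dqn.no_replay': False,
    'dqn.network': network.build_nature,
    'dqn.updates': lasagne.updates.rmsprop,  # any (loss, params) -> updates callable
    'dqn.replay_start_size': 50000,
    'dqn.final_epsilon': 0.1,
    'dqn.initial_epsilon': 1.0,
    'dqn.log_frequency': 1,
    'target_network_update_frequency': 10000,
    'final_exploration_frame': 1000000,
    'show_mood': None,
    'repeat_action': 4,
    'skip_n_frames_after_lol': 30,
    'run_test_every_n': 100000,
}
# main(**example_kargs)  # needs ALE and a weights directory on disk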
def main(game_name, network_type, updates_method,
         target_network_update_frequency, initial_epsilon, final_epsilon,
         test_epsilon, final_exploration_frame, replay_start_size,
         deepmind_rmsprop_epsilon, deepmind_rmsprop_learning_rate,
         deepmind_rmsprop_rho, rmsprop_epsilon, rmsprop_learning_rate,
         rmsprop_rho, phi_type, phi_method, epoch_size, n_training_epochs,
         n_test_epochs, visualize, record_dir, show_mood, replay_memory_size,
         no_replay, repeat_action, skip_n_frames_after_lol,
         max_actions_per_game, weights_dir, algo_initial_state_file,
         log_frequency, theano_verbose):
    args = locals()

    if theano_verbose:
        theano.config.compute_test_value = 'warn'
        theano.config.exception_verbosity = 'high'
        theano.config.optimizer = 'fast_compile'

    if game_name == 'simple_breakout':
        game = simple_breakout.SimpleBreakout()

        class P(object):
            def __init__(self):
                self.screen_size = (12, 12)

            def __call__(self, frames):
                return frames

        phi = P()
    else:
        ale = ag.init(game=game_name,
                      display_screen=(visualize == 'ale'),
                      record_dir=record_dir)
        game = ag.ALEGame(ale)
        if phi_type == '4':
            phi = ag.Phi4(method=phi_method)
        elif phi_type == '1':
            phi = ag.Phi(method=phi_method)
        else:
            raise RuntimeError("Unknown phi: {phi}".format(phi=phi_type))

    if network_type == 'nature':
        build_network = network.build_nature
    elif network_type == 'nature_with_pad':
        build_network = network.build_nature_with_pad
    elif network_type == 'nips':
        build_network = network.build_nips
    elif network_type == 'nature_with_pad_he':
        build_network = network.build_nature_with_pad_he
    elif hasattr(network_type, '__call__'):
        build_network = network_type
    else:
        raise RuntimeError("Unknown network: {network}".format(network=network_type))

    if updates_method == 'deepmind_rmsprop':
        updates = lambda loss, params: u.deepmind_rmsprop(
            loss, params,
            learning_rate=deepmind_rmsprop_learning_rate,
            rho=deepmind_rmsprop_rho,
            epsilon=deepmind_rmsprop_epsilon)
    elif updates_method == 'rmsprop':
        updates = lambda loss, params: lasagne.updates.rmsprop(
            loss, params,
            learning_rate=rmsprop_learning_rate,
            rho=rmsprop_rho,
            epsilon=rmsprop_epsilon)
    else:
        raise RuntimeError("Unknown updates: {updates}".format(updates=updates_method))

    replay_memory = dqn.ReplayMemory(size=replay_memory_size) if not no_replay else None

    def create_algo():
        algo = dqn.DQNAlgo(game.n_actions(),
                           replay_memory=replay_memory,
                           build_network=build_network,
                           updates=updates,
                           screen_size=phi.screen_size)
        algo.replay_start_size = replay_start_size
        algo.final_epsilon = final_epsilon
        algo.initial_epsilon = initial_epsilon
        algo.log_frequency = log_frequency
        algo.target_network_update_frequency = target_network_update_frequency
        algo.final_exploration_frame = final_exploration_frame
        return algo

    algo_train = create_algo()
    algo_test = create_algo()
    algo_test.final_epsilon = test_epsilon
    algo_test.initial_epsilon = test_epsilon
    algo_test.epsilon = test_epsilon

    import Queue
    algo_train.mood_q = Queue.Queue() if show_mood is not None else None

    if show_mood is not None:
        if show_mood == 'plot':
            plot = Plot()
        elif show_mood == 'log':
            plot = Log()
        else:
            raise RuntimeError("Unknown show_mood: {show_mood}".format(show_mood=show_mood))

        def worker():
            while True:
                item = algo_train.mood_q.get()
                plot.show(item)
                algo_train.mood_q.task_done()

        import threading
        t = threading.Thread(target=worker)
        t.daemon = True
        t.start()

    print(str(algo_train))

    if visualize != 'q':
        visualizer = q.GameNoVisualizer()
    else:
        if game_name == 'simple_breakout':
            visualizer = simple_breakout.SimpleBreakoutVisualizer(algo_train)
        else:
            visualizer = ag.ALEGameVisualizer(phi.screen_size)

    teacher = q.Teacher(game=game,
                        algo=algo_train,
                        game_visualizer=visualizer,
                        phi=phi,
                        repeat_action=repeat_action,
                        max_actions_per_game=max_actions_per_game,
                        skip_n_frames_after_lol=skip_n_frames_after_lol,
                        tester=False)
    tester = q.Teacher(game=game,
                       algo=algo_test,
                       game_visualizer=visualizer,
                       phi=phi,
                       repeat_action=repeat_action,
                       max_actions_per_game=max_actions_per_game,
                       skip_n_frames_after_lol=skip_n_frames_after_lol,
                       tester=True)
    q.teach_and_test(teacher, tester,
                     n_epochs=n_training_epochs,
                     frames_to_test_on=n_test_epochs * epoch_size,
                     epoch_size=epoch_size,
                     state_dir=weights_dir,
                     algo_initial_state_file=algo_initial_state_file)
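# `Log` (selected when show_mood == 'log' above) is not defined in this file.
# It is assumed to share `Plot`'s show(item) interface, printing mood items
# instead of plotting them; a minimal sketch:
class Log(object):
    def show(self, item):
        print(item)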
def sarsa_gd_on_space_invaders():
    import q_learning as q
    import numpy as np
    import ale_game as ag
    import matplotlib.pyplot as plt
    import sarsa as ss
    plt.ion()
    reload(ss)
    reload(q)
    reload(ag)
    ale = ag.init()
    run = '1'

    def state_adapter(frames):
        # Take the first index array from np.where's tuple (as the other
        # state_adapter above does) so the emptiness check below works.
        result = np.where(np.reshape(np.concatenate(frames), 80 * 80 * 4) > 0)[0]
        if len(result) == 0:
            return [0]
        else:
            return result

    game = ag.SpaceInvadersGame(ale)
    q_algo1 = ss.SARSALambdaGradientDescent(game.n_actions(),
                                            theta_len=80 * 80 * 4,
                                            state_adapter=state_adapter)
    q_algo1.epsilon = 0.9
    q_algo1.lmbda = 0.99
    q_algo1.gamma = 0.999
    q_algo1.alpha = 0.1

    def new_game():
        game.ale.reset_game()
        game.finished = False
        game.cum_reward = 0
        return game

    result_test = []
    result_1 = []
    result_2 = []

    teacher = q.Teacher(new_game, q_algo1, q.GameNoVisualizer(),
                        phi=ag.Phi(skip_every=6), repeat_action=6)
    q_algo1.epsilon = 1
    q_algo1.log_freq = 1
    result_test.append(teacher.teach(10))

    vis_teacher = q.Teacher(new_game, q_algo1,
                            ag.SpaceInvadersGameCombined2Visualizer(),
                            phi=ag.Phi(skip_every=6), repeat_action=6)
    # teacher.single_step(Game)
    q_algo1.epsilon = 0.1
    q_algo1.log_freq = 1
    # vis_teacher.teach(5)

    for i in xrange(90):
        q_algo1.log_freq = 0.03
        q_algo1.epsilon = 1 - i / 100.0  # float division; `i / 100` truncates to 0 in Python 2
        result_2.append(teacher.teach(50))
        q_algo1.epsilon = 0.1
        result_test.append(teacher.teach(10))

    import cPickle as pickle
    with open('gradient_descent.theta' + run, 'wb') as handle:
        pickle.dump(q_algo1.theta, handle)
    with open('gradient_descent.gamma' + run, 'wb') as handle:
        pickle.dump(q_algo1.gamma, handle)
    with open('gradient_descent.lmbda' + run, 'wb') as handle:
        pickle.dump(q_algo1.lmbda, handle)
    with open('gradient_descent.alpha' + run, 'wb') as handle:
        pickle.dump(q_algo1.alpha, handle)

    # Windowed moving averages of per-game rewards (see moving_average above).
    r1 = [a[1] for a in result_1]
    plt.plot(np.array([x[1] - x[0] for x in zip(np.cumsum(r1), np.cumsum(r1)[200:])]) / 200)
    r2 = [a[1] for r in result_2 for a in r]
    plt.plot(np.array([x[1] - x[0] for x in zip(np.cumsum(r2), np.cumsum(r2)[200:])]) / 200)
    r_test = [a[1] for r in result_test for a in r]
    plt.plot(np.array([x[1] - x[0] for x in zip(np.cumsum(r_test), np.cumsum(r_test)[50:])]) / 50)
    # `result_4` is never assigned, so this plot is left disabled:
    # r_4 = [a[1] for a in result_4]
    # plt.plot(np.array([x[1] - x[0] for x in zip(np.cumsum(r_test), np.cumsum(r_4)[2:])]) / 2)

    q_algo1.epsilon = 0.1
    teacher = q.Teacher(new_game, q_algo1, q.GameNoVisualizer(), repeat_action=3)
    teacher.teach(100)