def evaluation(session, graph_ops, saver):
    saver.restore(session, FLAGS.checkpoint_path)
    print "Restored model weights from ", FLAGS.checkpoint_path
    monitor_env = gym.make(FLAGS.game)
    monitor_env.monitor.start(FLAGS.eval_dir + "/" + FLAGS.experiment + "/eval")

    # Unpack graph ops
    s = graph_ops["s"]
    q_values = graph_ops["q_values"]

    # Wrap env with AtariEnvironment helper class
    env = AtariEnvironment(gym_env=monitor_env,
                           resized_width=FLAGS.resized_width,
                           resized_height=FLAGS.resized_height,
                           agent_history_length=FLAGS.agent_history_length)

    for i_episode in xrange(FLAGS.num_eval_episodes):
        s_t = env.get_initial_state()
        ep_reward = 0
        terminal = False
        while not terminal:
            monitor_env.render()
            readout_t = q_values.eval(session=session, feed_dict={s: [s_t]})
            action_index = np.argmax(readout_t)
            s_t1, r_t, terminal, info = env.step(action_index)
            s_t = s_t1
            ep_reward += r_t
        print ep_reward
    monitor_env.monitor.close()
def evaluation(session, graph_ops, saver):
    saver.restore(session, CHECKPOINT_NAME)
    print "Restored model weights from ", CHECKPOINT_NAME
    monitor_env = gym.make(GAME)
    monitor_env.monitor.start('/tmp/' + EXPERIMENT_NAME + "/eval")

    # Unpack graph ops
    s, a_t, R_t, learning_rate, minimize, p_network, v_network = graph_ops

    # Wrap env with AtariEnvironment helper class
    env = AtariEnvironment(gym_env=monitor_env,
                           resized_width=RESIZED_WIDTH,
                           resized_height=RESIZED_HEIGHT,
                           agent_history_length=AGENT_HISTORY_LENGTH)

    for i_episode in xrange(100):
        s_t = env.get_initial_state()
        ep_reward = 0
        terminal = False
        while not terminal:
            monitor_env.render()
            # Forward the policy network and sample an action from its output distribution
            probs = p_network.eval(session=session, feed_dict={s: [s_t]})[0]
            action_index = sample_policy_action(ACTIONS, probs)
            s_t1, r_t, terminal, info = env.step(action_index)
            s_t = s_t1
            ep_reward += r_t
        print ep_reward
    monitor_env.monitor.close()
def evaluation(session, graph_ops, saver):
    saver.restore(session, CHECKPOINT_NAME)
    print "Restored model weights from ", CHECKPOINT_NAME
    monitor_env = gym.make(GAME)
    monitor_env.monitor.start('/tmp/' + EXPERIMENT_NAME + "/eval")

    # Unpack graph ops
    s, a_t, R_t, minimize, p_network, v_network = graph_ops

    # Wrap env with AtariEnvironment helper class
    env = AtariEnvironment(gym_env=monitor_env,
                           resized_width=RESIZED_WIDTH,
                           resized_height=RESIZED_HEIGHT,
                           agent_history_length=AGENT_HISTORY_LENGTH)

    for i_episode in xrange(100):
        s_t = env.get_initial_state()
        ep_reward = 0
        terminal = False
        while not terminal:
            monitor_env.render()
            # Forward the policy network and sample an action from its output distribution
            probs = p_network.eval(session=session, feed_dict={s: [s_t]})[0]
            action_index = sample_policy_action(ACTIONS, probs)
            s_t1, r_t, terminal, info = env.step(action_index)
            s_t = s_t1
            ep_reward += r_t
        print ep_reward
    monitor_env.monitor.close()
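# NOTE: sample_policy_action() is called by the evaluation snippets above and by the
# A3C actor-learner threads below, but is not defined in any of them. A minimal sketch
# of such a helper, assuming `probs` is a softmax output over the action set; this is
# an illustrative stand-in, not the original implementation. `num_actions` is kept only
# to match the call signature sample_policy_action(ACTIONS, probs).
import numpy as np

def sample_policy_action(num_actions, probs):
    """Sample an action index from a softmax policy output."""
    # Subtract a tiny value so the probabilities never sum to slightly more than 1.0
    # due to floating point error, which would make np.random.multinomial raise.
    probs = probs - np.finfo(np.float32).epsneg
    histogram = np.random.multinomial(1, probs)
    return int(np.argmax(histogram))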
def __init__(self, id, prediction_q, training_q, episode_log_q):
    super(Agent, self).__init__(name="Agent_{}".format(id))
    self.id = id
    self.prediction_q = prediction_q
    self.training_q = training_q
    self.episode_log_q = episode_log_q

    gym_env = gym.make(FLAGS.game)
    gym_env.seed(FLAGS.seed)
    self.env = AtariEnvironment(gym_env=gym_env,
                                resized_width=FLAGS.resized_width,
                                resized_height=FLAGS.resized_height,
                                agent_history_length=FLAGS.agent_history_length)
    self.nb_actions = len(self.env.gym_actions)

    self.wait_q = Queue(maxsize=1)
    self.stop = Value('i', 0)
def evaluation(session, graph_ops, saver):
    saver.restore(session, FLAGS.checkpoint_path)
    print("Restored model weights from ", FLAGS.checkpoint_path)
    monitor_env = gym.make(FLAGS.game)
    # Keep the Monitor-wrapped env so episode recording actually takes effect
    monitor_env = gym.wrappers.Monitor(monitor_env, FLAGS.eval_dir + "/" + FLAGS.experiment + "/eval")

    # Unpack graph ops
    s = graph_ops["s"]
    q_values = graph_ops["q_values"]

    # Wrap env with AtariEnvironment helper class
    if env_type in {'atari'}:
        env = AtariEnvironment(gym_env=monitor_env,
                               resized_width=FLAGS.resized_width,
                               resized_height=FLAGS.resized_height,
                               agent_history_length=FLAGS.agent_history_length)
    else:
        env = CustomEnvironment(gym_env=monitor_env,
                                input_size=FLAGS.input_size,
                                agent_history_length=FLAGS.agent_history_length,
                                extra_args={
                                    'init_with_args': FLAGS.init_with_args,
                                    'setting_file_path': FLAGS.setting_file_path
                                })

    for i_episode in range(FLAGS.num_eval_episodes):
        s_t = env.get_initial_state()
        ep_reward = 0
        terminal = False
        while not terminal:
            monitor_env.render()
            readout_t = q_values.eval(session=session, feed_dict={s: [s_t]})
            action_index = np.argmax(readout_t)
            print("action", action_index)
            s_t1, r_t, terminal, info = env.step(action_index)
            s_t = s_t1
            ep_reward += r_t
        print(ep_reward)
    # gym.wrappers.Monitor exposes close() directly (there is no .monitor attribute)
    monitor_env.close()
def main():
    env = gym.make(ENV_NAME)
    env = AtariEnvironment(gym_env=env,
                           resized_width=RESIZED_WIDTH,
                           resized_height=RESIZED_HEIGHT,
                           sequence_length=SEQUENCE_LENGTH)
    g = tf.Graph()
    with g.as_default(), tf.Session() as sess:
        K.set_session(sess)
        graph_ops = build_graph()
        sess.run(tf.global_variables_initializer())
        times, rewards = train(env, sess, graph_ops)
        print(times)
        print(rewards)
        visualize(np.arange(len(times)), times, "times.png")
        visualize(np.arange(len(rewards)), rewards, "rewards.png")
def run():
    tf.reset_default_graph()

    with tf.Session() as sess:
        with tf.device("/cpu:0"):
            global_step = tf.Variable(0, dtype=tf.int32, name='global_episodes', trainable=False)
            # optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.lr)
            optimizer = tf.train.RMSPropOptimizer(FLAGS.lr, 0.99, 0.0, 1e-6)

            gym_env_monitor = gym.make(FLAGS.game)
            gym_env_monitor.seed(FLAGS.seed)
            gym_env_monitor_wrapper = AtariEnvironment(gym_env=gym_env_monitor,
                                                       resized_width=FLAGS.resized_width,
                                                       resized_height=FLAGS.resized_height,
                                                       agent_history_length=FLAGS.agent_history_length)
            nb_actions = len(gym_env_monitor_wrapper.gym_actions)

            if FLAGS.lstm:
                global_network = ACNetworkLSTM('global', nb_actions, None)
            else:
                global_network = ACNetwork('global', nb_actions, None)

            saver = tf.train.Saver(max_to_keep=5)

            if FLAGS.resume:
                ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
                print("Loading Model from {}".format(ckpt.model_checkpoint_path))
                saver.restore(sess, ckpt.model_checkpoint_path)
            else:
                sess.run(tf.global_variables_initializer())

            pe = PolicyMonitor(
                game=gym_env_monitor_wrapper,
                nb_actions=nb_actions,
                optimizer=optimizer,
                global_step=global_step)

            pe.eval_1000(sess)
class Agent(Process):
    def __init__(self, id, prediction_q, training_q, episode_log_q):
        super(Agent, self).__init__(name="Agent_{}".format(id))
        self.id = id
        self.prediction_q = prediction_q
        self.training_q = training_q
        self.episode_log_q = episode_log_q

        gym_env = gym.make(FLAGS.game)
        gym_env.seed(FLAGS.seed)
        self.env = AtariEnvironment(gym_env=gym_env,
                                    resized_width=FLAGS.resized_width,
                                    resized_height=FLAGS.resized_height,
                                    agent_history_length=FLAGS.agent_history_length)
        self.nb_actions = len(self.env.gym_actions)

        self.wait_q = Queue(maxsize=1)
        self.stop = Value('i', 0)

    def run(self):
        time.sleep(np.random.rand())
        while not self.stop.value:
            if FLAGS.verbose:
                print("Agent_{} started a new episode".format(self.id))
            # total_reward = 0
            # total_length = 0
            for episode_buffer, episode_reward, episode_length in self.run_episode_generator():
                if FLAGS.verbose:
                    print("Agent_{} puts a new episode in the training queue".format(self.id))
                self.training_q.put(episode_buffer)
            print("Agent_{} finished an episode and logs the result in the logs queue".format(self.id))
            self.episode_log_q.put([datetime.now(), episode_reward, episode_length])

    def run_episode_generator(self):
        s, _ = self.env.get_initial_state()
        d = False
        episode_buffer = []
        episode_reward = 0
        episode_step_count = 0

        while not d:
            self.prediction_q.put((self.id, s))
            pi, v = self.wait_q.get()
            a = np.random.choice(pi[0], p=pi[0])
            a = np.argmax(pi == a)
            s1, r, d, info = self.env.step(a)
            r = np.clip(r, -1, 1)

            episode_buffer.append([s, a, pi, r, s1, d, v[0, 0]])
            episode_reward += r
            episode_step_count += 1
            s = s1

            if len(episode_buffer) == FLAGS.max_episode_buffer_size and not d:
                self.prediction_q.put((self.id, s))
                pi, v1 = self.wait_q.get()
                updated_episode_buffer = self.get_training_data(episode_buffer, v1)
                yield updated_episode_buffer, episode_reward, episode_step_count

            if d:
                break

        if len(episode_buffer) != 0:
            updated_episode_buffer = self.get_training_data(episode_buffer, 0)
            yield updated_episode_buffer, episode_reward, episode_step_count

    def discount(self, x):
        return lfilter([1], [1, -FLAGS.gamma], x[::-1], axis=0)[::-1]

    def get_training_data(self, rollout, bootstrap_value):
        rollout = np.array(rollout)
        observations = rollout[:, 0]
        actions = rollout[:, 1]
        pis = rollout[:, 2]
        rewards = rollout[:, 3]
        next_observations = rollout[:, 4]
        values = rollout[:, 5]

        rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
        # discount() already reads FLAGS.gamma internally, so only the reward sequence
        # is passed here (the original called it with an extra gamma argument).
        discounted_rewards = self.discount(rewards_plus)[:-1]
        value_plus = np.asarray(values.tolist() + [bootstrap_value])
        policy_target = discounted_rewards - value_plus[:-1]
        rollout.extend([discounted_rewards])
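# The Agent.discount() helper above computes discounted returns with scipy.signal.lfilter.
# A small standalone check of what that one-liner computes; the gamma value 0.99 and the
# reward sequence are purely illustrative.
import numpy as np
from scipy.signal import lfilter

def discount(x, gamma):
    # Discounted cumulative sum: y[t] = x[t] + gamma * y[t + 1]
    return lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]

rewards = np.array([1.0, 0.0, 0.0, 1.0])
print(discount(rewards, 0.99))  # -> [1.970299, 0.9801, 0.99, 1.0]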
def actor_learner_thread(thread_id, env, session, graph_ops, num_actions, summary_ops, saver):
    """
    Actor-learner thread implementing asynchronous one-step Q-learning,
    as specified in algorithm 1 here: http://arxiv.org/pdf/1602.01783v1.pdf.
    """
    global TMAX, T

    # Unpack graph ops
    s = graph_ops["s"]
    q_values = graph_ops["q_values"]
    st = graph_ops["st"]
    target_q_values = graph_ops["target_q_values"]
    reset_target_network_params = graph_ops["reset_target_network_params"]
    a = graph_ops["a"]
    y = graph_ops["y"]
    grad_update = graph_ops["grad_update"]

    summary_placeholders, update_ops, summary_op = summary_ops

    # Wrap env with AtariEnvironment helper class
    env = AtariEnvironment(gym_env=env,
                           resized_width=FLAGS.resized_width,
                           resized_height=FLAGS.resized_height,
                           agent_history_length=FLAGS.agent_history_length)

    # Initialize network gradients
    s_batch = []
    a_batch = []
    y_batch = []

    final_epsilon = sample_final_epsilon()
    initial_epsilon = 1.0
    epsilon = 1.0

    print "Starting thread ", thread_id, "with final epsilon ", final_epsilon

    time.sleep(3 * thread_id)
    t = 0
    while T < TMAX:
        # Get initial game observation
        s_t = env.get_initial_state()
        terminal = False

        # Set up per-episode counters
        ep_reward = 0
        episode_ave_max_q = 0
        ep_t = 0

        while True:
            # Forward the deep q network, get Q(s,a) values
            readout_t = q_values.eval(session=session, feed_dict={s: [s_t]})

            # Choose next action based on e-greedy policy
            a_t = np.zeros([num_actions])
            action_index = 0
            if random.random() <= epsilon:
                action_index = random.randrange(num_actions)
            else:
                action_index = np.argmax(readout_t)
            a_t[action_index] = 1

            # Scale down epsilon
            if epsilon > final_epsilon:
                epsilon -= (initial_epsilon - final_epsilon) / FLAGS.anneal_epsilon_timesteps

            # Gym executes action in game environment on behalf of actor-learner
            s_t1, r_t, terminal, info = env.step(action_index)

            # Accumulate gradients
            readout_j1 = target_q_values.eval(session=session, feed_dict={st: [s_t1]})
            clipped_r_t = np.clip(r_t, -1, 1)
            if terminal:
                y_batch.append(clipped_r_t)
            else:
                y_batch.append(clipped_r_t + FLAGS.gamma * np.max(readout_j1))

            a_batch.append(a_t)
            s_batch.append(s_t)

            # Update the state and counters
            s_t = s_t1
            T += 1
            t += 1
            ep_t += 1
            ep_reward += r_t
            episode_ave_max_q += np.max(readout_t)

            # Optionally update target network
            if T % FLAGS.target_network_update_frequency == 0:
                session.run(reset_target_network_params)

            # Optionally update online network
            if t % FLAGS.network_update_frequency == 0 or terminal:
                if s_batch:
                    session.run(grad_update, feed_dict={y: y_batch,
                                                        a: a_batch,
                                                        s: s_batch})
                # Clear gradients
                s_batch = []
                a_batch = []
                y_batch = []

            # Save model progress
            if t % FLAGS.checkpoint_interval == 0:
                saver.save(session,
                           FLAGS.checkpoint_dir + "/" + FLAGS.experiment + ".ckpt",
                           global_step=t)

            # Print end of episode stats
            if terminal:
                stats = [ep_reward, episode_ave_max_q / float(ep_t), epsilon]
                for i in range(len(stats)):
                    session.run(update_ops[i],
                                feed_dict={summary_placeholders[i]: float(stats[i])})
                print "THREAD:", thread_id, "/ TIME", T, "/ TIMESTEP", t, "/ EPSILON", epsilon, "/ REWARD", ep_reward, "/ Q_MAX %.4f" % (episode_ave_max_q / float(ep_t)), "/ EPSILON PROGRESS", t / float(FLAGS.anneal_epsilon_timesteps)
                break
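# NOTE: sample_final_epsilon() is not defined in these snippets. A minimal sketch,
# assuming the per-thread final exploration values suggested in the asynchronous
# methods paper (0.1, 0.01 and 0.5, picked with probabilities 0.4, 0.3 and 0.3);
# this is an illustrative stand-in, not the original implementation.
import numpy as np

def sample_final_epsilon():
    """Sample a final epsilon for one actor-learner thread."""
    final_epsilons = np.array([0.1, 0.01, 0.5])
    probabilities = np.array([0.4, 0.3, 0.3])
    return np.random.choice(final_epsilons, p=probabilities)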
parser.add_argument("--save-model-freq", type=int, default=10000, help="save the model once per 10000 training sessions") parser.add_argument("--observation-steps", type=int, default=50000, help="train only after this many stesp (=4 frames)") parser.add_argument("--learning-rate", type=float, default=0.00025, help="learning rate (step size for optimization algo)") parser.add_argument("--target-model-update-freq", type=int, default=10000, help="how often (in steps) to update the target model. Note nature paper says this is in 'number of parameter updates' but their code says steps. see tinyurl.com/hokp4y8") parser.add_argument("--model", help="tensorflow model checkpoint file to initialize from") parser.add_argument("rom", help="rom file to run") args = parser.parse_args() print 'Arguments: %s' % (args) baseOutputDir = 'game-out-' + time.strftime("%Y-%m-%d-%H-%M-%S") os.makedirs(baseOutputDir) State.setup(args) environment = AtariEnvironment(args, baseOutputDir) dqn = dqn.DeepQNetwork(environment.getNumActions(), baseOutputDir, args) replayMemory = replay.ReplayMemory(args) def runEpoch(minEpochSteps, evalWithEpsilon=None): stepStart = environment.getStepNumber() isTraining = True if evalWithEpsilon is None else False startGameNumber = environment.getGameNumber() epochTotalScore = 0 while environment.getStepNumber() - stepStart < minEpochSteps: startTime = lastLogTime = time.time() stateReward = 0
help= "how often (in steps) to update the target model. Note nature paper says this is in 'number of parameter updates' but their code says steps. see tinyurl.com/hokp4y8" ) parser.add_argument("--model", help="tensorflow model checkpoint file to initialize from") parser.add_argument("rom", help="rom file to run") args = parser.parse_args() print 'Arguments: %s' % (args) baseOutputDir = 'game-out-' + time.strftime("%Y-%m-%d-%H-%M-%S") os.makedirs(baseOutputDir) State.setup(args) environment = AtariEnvironment(args, baseOutputDir) dqn = dqn.DeepQNetwork(environment.getNumActions(), baseOutputDir, args) replayMemory = replay.ReplayMemory(args) def runEpoch(minEpochSteps, evalWithEpsilon=None): stepStart = environment.getStepNumber() isTraining = True if evalWithEpsilon is None else False startGameNumber = environment.getGameNumber() epochTotalScore = 0 while environment.getStepNumber() - stepStart < minEpochSteps: startTime = lastLogTime = time.time()
def actor_learner_thread(num, env, session, graph_ops, summary_ops, saver):
    # We use global shared counter T, and TMAX constant
    global TMAX, T

    # Unpack graph ops
    s, a, R, minimize, p_network, v_network = graph_ops

    # Unpack tensorboard summary stuff
    r_summary_placeholder, update_ep_reward, val_summary_placeholder, update_ep_val, summary_op = summary_ops

    # Wrap env with AtariEnvironment helper class
    env = AtariEnvironment(gym_env=env,
                           resized_width=RESIZED_WIDTH,
                           resized_height=RESIZED_HEIGHT,
                           agent_history_length=AGENT_HISTORY_LENGTH)

    time.sleep(5 * num)

    # Set up per-episode counters
    ep_reward = 0
    ep_avg_v = 0
    v_steps = 0
    ep_t = 0

    probs_summary_t = 0

    s_t = env.get_initial_state()
    terminal = False

    while T < TMAX:
        s_batch = []
        past_rewards = []
        a_batch = []

        t = 0
        t_start = t

        while not (terminal or ((t - t_start) == t_max)):
            # Perform action a_t according to policy pi(a_t | s_t)
            probs = session.run(p_network, feed_dict={s: [s_t]})[0]
            action_index = sample_policy_action(ACTIONS, probs)
            a_t = np.zeros([ACTIONS])
            a_t[action_index] = 1

            if probs_summary_t % 100 == 0:
                print "P, ", np.max(probs), "V ", session.run(v_network, feed_dict={s: [s_t]})[0][0]

            s_batch.append(s_t)
            a_batch.append(a_t)

            s_t1, r_t, terminal, info = env.step(action_index)
            ep_reward += r_t

            r_t = np.clip(r_t, -1, 1)
            past_rewards.append(r_t)

            t += 1
            T += 1
            ep_t += 1
            probs_summary_t += 1

            s_t = s_t1

        if terminal:
            R_t = 0
        else:
            R_t = session.run(v_network, feed_dict={s: [s_t]})[0][0]  # Bootstrap from last state

        R_batch = np.zeros(t)
        for i in reversed(range(t_start, t)):
            R_t = past_rewards[i] + GAMMA * R_t
            R_batch[i] = R_t

        session.run(minimize, feed_dict={R: R_batch,
                                         a: a_batch,
                                         s: s_batch})

        # Save progress every 5000 iterations
        if T % CHECKPOINT_INTERVAL == 0:
            saver.save(session, CHECKPOINT_SAVE_PATH, global_step=T)

        if terminal:
            # Episode ended, collect stats and reset game
            session.run(update_ep_reward, feed_dict={r_summary_placeholder: ep_reward})
            print "THREAD:", num, "/ TIME", T, "/ REWARD", ep_reward
            s_t = env.get_initial_state()
            terminal = False
            # Reset per-episode counters
            ep_reward = 0
            ep_t = 0
def run():
    recreate_directory_structure()
    tf.reset_default_graph()

    sess = tf.Session()
    # sess = tf_debug.LocalCLIDebugWrapperSession(sess)
    # sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)
    with sess:
        with tf.device("/cpu:0"):
            global_step = tf.Variable(0, dtype=tf.int32, name='global_episodes', trainable=False)
            # optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.lr)
            optimizer = tf.train.RMSPropOptimizer(FLAGS.lr, 0.99, 0.0, 1e-6)
            num_workers = FLAGS.nb_concurrent
            # num_workers = multiprocessing.cpu_count() - 1
            workers = []
            envs = []

            for i in range(num_workers):
                gym_env = gym.make(FLAGS.game)
                if FLAGS.seed:
                    gym_env.seed(FLAGS.seed)

                if FLAGS.monitor:
                    gym_env = gym.wrappers.Monitor(gym_env, FLAGS.experiments_dir + '/worker_{}'.format(i))
                this_env = AtariEnvironment(gym_env=gym_env,
                                            resized_width=FLAGS.resized_width,
                                            resized_height=FLAGS.resized_height,
                                            agent_history_length=FLAGS.agent_history_length)
                envs.append(this_env)

            nb_actions = len(envs[0].gym_actions)

            if FLAGS.lstm:
                global_network = ACNetworkLSTM('global', nb_actions, None)
            else:
                global_network = ACNetwork('global', nb_actions, None)

            for i in range(num_workers):
                workers.append(Worker(envs[i], sess, i, nb_actions, optimizer, global_step))
            saver = tf.train.Saver(max_to_keep=5)

            # gym_env_monitor = gym.make(FLAGS.game)
            # gym_env_monitor.seed(FLAGS.seed)
            # gym_env_monitor_wrapper = AtariEnvironment(gym_env=gym_env_monitor,
            #                                            resized_width=FLAGS.resized_width,
            #                                            resized_height=FLAGS.resized_height,
            #                                            agent_history_length=FLAGS.agent_history_length)
            # nb_actions = len(gym_env_monitor_wrapper.gym_actions)
            # pe = PolicyMonitor(
            #     game=gym_env_monitor_wrapper,
            #     nb_actions=nb_actions,
            #     optimizer=optimizer,
            #     global_step=global_step
            # )

        coord = tf.train.Coordinator()
        if FLAGS.resume:
            ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
            print("Loading Model from {}".format(ckpt.model_checkpoint_path))
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            sess.run(tf.global_variables_initializer())

        worker_threads = []
        for worker in workers:
            t = threading.Thread(target=(lambda: worker.play(coord, saver)))
            t.start()
            worker_threads.append(t)

        # Start a thread for policy eval task
        # monitor_thread = threading.Thread(target=lambda: pe.continuous_eval(FLAGS.eval_every, sess, coord))
        # monitor_thread.start()

        import time
        while True:
            if FLAGS.show_training:
                for env in envs:
                    # time.sleep(1)
                    # with main_lock:
                    env.env.render()

        coord.join(worker_threads)
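# NOTE: the worker launch loop above builds each thread with a lambda that closes over
# the loop variable `worker`, so every lambda looks `worker` up when the thread runs
# (Python's late binding). A sketch of a variant that pins each worker via a default
# argument; `workers`, `coord` and `saver` are the objects created inside run() and
# the Worker.play() call follows the snippet above.
import threading

def launch_workers(workers, coord, saver):
    worker_threads = []
    for worker in workers:
        # `w=worker` captures the current worker at lambda definition time.
        t = threading.Thread(target=lambda w=worker: w.play(coord, saver))
        t.start()
        worker_threads.append(t)
    return worker_threads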
def main(_):
    # Reproducibility
    tf.reset_default_graph()
    np.random.seed(cfg.random_seed)
    tf.set_random_seed(cfg.random_seed)

    # Logging
    summary_writer = tf.summary.FileWriter(cfg.log_dir)
    if not cfg.evaluate and not tf.gfile.Exists(cfg.save_dir):
        tf.gfile.MakeDirs(cfg.save_dir)
    else:
        assert tf.gfile.Exists(cfg.save_dir)  # TODO handle this

    episode_results_path = os.path.join(cfg.log_dir, "episodeResults.csv")
    episode_results = tf.gfile.GFile(episode_results_path, "w")
    episode_results.write("model_freq={},save_dir={}".format(cfg.model_freq, cfg.save_dir))
    episode_results.write("episode,reward,steps\n")
    episode_results.flush()

    # Setup ALE and DQN graph
    obs_shape = (84, 84, 1)
    input_height, input_width, _ = obs_shape

    dqn = DQN(input_height, input_width, cfg.num_actions)

    # Global step
    global_step = tf.train.get_or_create_global_step()
    increment_step = tf.assign_add(global_step, 1)

    # Save all variables
    vars_to_save = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="agent/q")
    vars_to_save.append(global_step)
    saver = tf.train.Saver(var_list=vars_to_save)

    # Handle loading specific variables
    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    sess = tf.Session(config=sess_config)

    restore_or_initialize_weights(sess, dqn, saver)
    sess.run(dqn.copy_to_target)

    if cfg.evaluate:
        # if in evaluation mode, saver is no longer needed
        saver = None

    # ##### Restoring AEs ########
    if not cfg.evaluate:
        vaes = create_generative_models(sess)
        image_summaries = []
        image_summaries_ph = tf.placeholder(tf.float32, shape=(4, 84, 84, 4),
                                            name="image_summaries_placeholder")
        for i in range(4):
            for j in range(4):
                image_summaries.append(
                    tf.summary.image("VAE_OUT_{}_{}".format(i, j),
                                     tf.reshape(image_summaries_ph[i, :, :, j], (1, 84, 84, 1))))
    # ############################

    if not cfg.evaluate:
        summary_writer.add_graph(tf.get_default_graph())
        summary_writer.add_graph(vaes[0].graph)
        summary_writer.add_graph(vaes[1].graph)
        summary_writer.add_graph(vaes[2].graph)
        summary_writer.flush()

    # Initialize ALE
    postprocess_frame = lambda frame: sess.run(dqn.process_frame, feed_dict={dqn.image: frame})
    env = AtariEnvironment(obs_shape, postprocess_frame)

    # Replay buffer
    if not cfg.evaluate:
        replay_buffer = ExperienceReplay(cfg.replay_buffer_size, obs_shape)

    # Perform random policy to get some training data
    with tqdm(total=cfg.seed_frames, disable=cfg.disable_progress or cfg.evaluate) as pbar:
        seed_steps = 0
        while seed_steps * cfg.frame_skip < cfg.seed_frames and not cfg.evaluate:
            action = np.random.randint(cfg.num_actions)
            reward, next_state, terminal = env.act(action)
            seed_steps += 1

            replay_buffer.append(next_state[:, :, -1, np.newaxis], action, reward, terminal)

            if terminal:
                pbar.update(env.episode_frames)
                env.reset(inc_episode_count=False)

    if cfg.evaluate:
        assert cfg.max_episode_count > 0
    else:
        assert len(replay_buffer) >= cfg.seed_frames // cfg.frame_skip

    # Main training loop
    steps = tf.train.global_step(sess, global_step)
    env.reset(inc_episode_count=False)
    terminal = False

    total = cfg.max_episode_count if cfg.evaluate else cfg.num_frames
    with tqdm(total=total, disable=cfg.disable_progress) as pbar:
        # Loop while we haven't observed our max frame number
        # If we are at our max frame number we will finish the current episode
        while not (
                # We must be evaluating or observed the last frame
                # As well as be terminal
                # As well as seen the maximum episode number
                (steps * cfg.frame_skip > cfg.num_frames or cfg.evaluate)
                and terminal
                and env.episode_count >= cfg.max_episode_count):
            # Epsilon greedy policy with epsilon annealing
            if not cfg.evaluate and steps * cfg.frame_skip < cfg.eps_anneal_over:
                # Only compute epsilon step while we're still annealing epsilon
                epsilon = cfg.eps_initial - steps * ((cfg.eps_initial - cfg.eps_final) / cfg.eps_anneal_over)
            else:
                epsilon = cfg.eps_final

            # Epsilon greedy policy
            if np.random.uniform() < epsilon:
                action = np.random.randint(0, cfg.num_actions)
            else:
                action = sess.run(dqn.action, feed_dict={dqn.S: [env.state]})

            # Perform environment step
            steps = sess.run(increment_step)
            reward, next_state, terminal = env.act(action)

            if not cfg.evaluate:
                replay_buffer.append(next_state[:, :, -1, np.newaxis], action, reward, terminal)

                # Sample and do gradient updates
                if steps % cfg.learning_freq == 0:
                    placeholders = [
                        dqn.S,
                        dqn.actions,
                        dqn.rewards,
                        dqn.S_p,
                        dqn.terminals,
                    ]
                    batch = replay_buffer.sample(cfg.batch_size)
                    train_op = [dqn.train]
                    if steps % (cfg.learning_freq * cfg.model_freq) == 0:
                        experience_batch = batch
                        batch = imagined_batch(vaes, batch[1])
                        if steps / (cfg.learning_freq * cfg.model_freq) < 10:
                            placeholders.append(image_summaries_ph)
                            batch = list(batch)
                            batch.append(batch[0][np.random.randint(0, 32, size=4), :, :, :])
                            train_op.extend(image_summaries)
                    if steps % cfg.log_summary_every:
                        train_op.append(dqn.summary)
                    result = sess.run(train_op, feed_dict=dict(zip(placeholders, batch)))
                    if len(result) > 1:
                        for i in range(1, len(result)):
                            summary_writer.add_summary(result[i], global_step=steps)

                if steps % cfg.target_update_every == 0:
                    sess.run([dqn.copy_to_target])

                if steps % cfg.model_chkpt_every == 0:
                    saver.save(sess, "%s/model_epoch_%04d" % (cfg.save_dir, steps))

            if terminal:
                episode_results.write("%d,%d,%d\n" % (env.episode_count, env.episode_reward, env.episode_frames))
                episode_results.flush()
                # Log episode summaries to Tensorboard
                add_simple_summary(summary_writer, "episode/reward", env.episode_reward, env.episode_count)
                add_simple_summary(summary_writer, "episode/frames", env.episode_frames, env.episode_count)

                pbar.update(env.episode_frames if not cfg.evaluate else 1)
                env.reset()

    episode_results.close()
    tf.logging.info("Finished %d %s" % (
        cfg.max_episode_count if cfg.evaluate else cfg.num_frames,
        "episodes" if cfg.evaluate else "frames",
    ))
parser.add_argument("--target-model-update-freq", type=int, default=10000, help="how often (in steps) to update the target model. Note nature paper says this is in 'number of parameter updates' but their code says steps. see tinyurl.com/hokp4y8") parser.add_argument("--model", help="tensorflow model checkpoint file to initialize from") parser.add_argument("rom", help="rom file to run") args = parser.parse_args() print('Arguments: %s' % (args)) game_name = os.path.splitext(os.path.split(args.rom)[1])[0] baseOutputDir = 'out-'+ game_name + '-' + time.strftime("%Y-%m-%d-%H-%M-%S") os.makedirs(baseOutputDir) logging.info("Training game "+game_name) logging.info("Storing training into "+baseOutputDir) State.setup(args) environment = AtariEnvironment(args, baseOutputDir) dqn_network = dqn.DeepQNetwork(environment.getNumActions(), baseOutputDir, args) replayMemory = replay.ReplayMemory(args) def runEpoch(minEpochSteps, evalWithEpsilon=None): logging.info('Running epoch with min epoch steps: %d' % minEpochSteps) stepStart = environment.getStepNumber() isTraining = True if evalWithEpsilon is None else False startGameNumber = environment.getGameNumber() epochTotalScore = 0 pbar = tqdm(total=minEpochSteps) while environment.getStepNumber() - stepStart < minEpochSteps: startTime = lastLogTime = time.time()
def __init__(self, args):
    super(ActorLearner, self).__init__()

    self.summ_base_dir = args.summ_base_dir

    self.local_step = 0
    self.global_step = args.global_step

    self.actor_id = args.actor_id
    self.alg_type = args.alg_type
    self.max_local_steps = args.max_local_steps
    self.optimizer_type = args.opt_type
    self.optimizer_mode = args.opt_mode
    self.num_actions = args.num_actions
    self.initial_lr = args.initial_lr
    self.lr_annealing_steps = args.lr_annealing_steps

    # Shared mem vars
    self.learning_vars = args.learning_vars
    size = self.learning_vars.size
    self.flat_grads = np.empty(size, dtype=ctypes.c_float)

    if (self.optimizer_mode == "local"):
        if (self.optimizer_type == "rmsprop"):
            self.opt_st = np.ones(size, dtype=ctypes.c_float)
        else:
            self.opt_st = np.zeros(size, dtype=ctypes.c_float)
    elif (self.optimizer_mode == "shared"):
        self.opt_st = args.opt_state

    # rmsprop/momentum
    self.alpha = args.alpha
    # adam
    self.b1 = args.b1
    self.b2 = args.b2
    self.e = args.e

    if args.env == "GYM":
        from atari_environment import AtariEnvironment
        self.emulator = AtariEnvironment(args.game, args.visualize)
    else:
        from emulator import Emulator
        self.emulator = Emulator(args.rom_path, args.game, args.visualize,
                                 self.actor_id, args.random_seed,
                                 args.single_life_episodes)

    self.grads_update_steps = args.grads_update_steps
    self.max_global_steps = args.max_global_steps
    self.gamma = args.gamma

    # Exploration epsilons
    self.epsilon = 1.0
    self.initial_epsilon = 1.0
    self.final_epsilon = generate_final_epsilon()
    self.epsilon_annealing_steps = args.epsilon_annealing_steps

    self.rescale_rewards = args.rescale_rewards
    self.max_achieved_reward = -1000000
    if self.rescale_rewards:
        self.thread_max_reward = 1.0

    # Barrier to synchronize all actors after initialization is done
    self.barrier = args.barrier

    self.summary_ph, self.update_ops, self.summary_ops = self.setup_summaries()

    self.game = args.game
parser.add_argument("--model", help="tensorflow model checkpoint file to initialize from") parser.add_argument("rom", help="rom file to run") args = parser.parse_args() print('Arguments: %s' % (args)) game_name = os.path.splitext(os.path.split(args.rom)[1])[0] baseOutputDir = 'out-' + game_name + '-' + time.strftime("%Y-%m-%d-%H-%M-%S") os.makedirs(baseOutputDir) logging.info("Training game " + game_name) logging.info("Storing training into " + baseOutputDir) State.setup(args) environment = AtariEnvironment(args, baseOutputDir) dqn_network = dqn.DeepQNetwork(environment.getNumActions(), baseOutputDir, args) replayMemory = replay.ReplayMemory(args) def runEpoch(minEpochSteps, evalWithEpsilon=None): logging.info('Running epoch with min epoch steps: %d' % minEpochSteps) stepStart = environment.getStepNumber() isTraining = True if evalWithEpsilon is None else False startGameNumber = environment.getGameNumber() epochTotalScore = 0 pbar = tqdm(total=minEpochSteps)