def test_run(self):
    environment = ale.AleEnvironment('breakout.bin', record_display=False)
    graph = tf.Graph()
    with graph.as_default():
        shared_network = shared.SharedNetwork(IMAGE_WIDTH, IMAGE_HEIGHT,
                                              NUM_CHANNELS, NUM_ACTIONS, 100)
        network0 = a3c.A3CNetwork(IMAGE_WIDTH, IMAGE_HEIGHT, NUM_CHANNELS,
                                  NUM_ACTIONS, 0)
        network1 = a3c.A3CNetwork(IMAGE_WIDTH, IMAGE_HEIGHT, NUM_CHANNELS,
                                  NUM_ACTIONS, 1)
        with self.test_session(graph=graph) as session:
            test_thread = actor_thread.ActorLearnerThread(
                session, environment, shared_network, network0, 10, 0)
            test_thread2 = actor_thread.ActorLearnerThread(
                session, environment, shared_network, network1, 20, 1)
            session.run(tf.initialize_all_variables())
            shared_weight = test_thread.shared_network.weights_and_biases()[0]
            local_weight = test_thread.local_network.weights_and_biases()[0]
            test_thread.synchronize_network()
            session.run(tf.Print(shared_weight, [shared_weight]))
            session.run(tf.Print(local_weight, [local_weight]))
            test_thread.run()
            session.run(tf.Print(shared_weight, [shared_weight]))
            session.run(tf.Print(local_weight, [local_weight]))
            assert shared_network.shared_counter.eval() == 10
            test_thread2.run()
            assert shared_network.shared_counter.eval() == 20
def test_select_action(self):
    environment = ale.AleEnvironment('breakout.bin', record_display=False)
    graph = tf.Graph()
    with graph.as_default():
        shared_network = shared.SharedNetwork(IMAGE_WIDTH, IMAGE_HEIGHT,
                                              NUM_CHANNELS, NUM_ACTIONS, 100)
        network0 = a3c.A3CNetwork(IMAGE_WIDTH, IMAGE_HEIGHT, NUM_CHANNELS,
                                  NUM_ACTIONS, 0)
        with self.test_session(graph=graph) as session:
            test_thread = actor_thread.ActorLearnerThread(
                session, environment, shared_network, network0, 10, 0)
            probabilities = [0.1, 0.2, 0.3, 0.4]
            selected_num = np.zeros(4)
            available_actions = environment.available_actions()
            samples = 10000
            for i in range(samples):
                action = test_thread.select_action_with(
                    available_actions, probabilities)
                selected_num[action] += 1
            allowed_diff = 500
            for i in range(len(selected_num)):
                mean = probabilities[i] * samples
                print 'mean:%d selected_num:%d for action:%d' % (
                    mean, selected_num[i], i)
                self.assertTrue((mean - allowed_diff) < selected_num[i] and
                                selected_num[i] < (mean + allowed_diff))
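# A minimal sketch (not this repository's implementation) of the behaviour the
# test above exercises: drawing an action in proportion to the policy
# probabilities. The real select_action_with() may differ; np.random.choice is
# simply one direct way to sample from a categorical distribution, and
# sample_action below is a hypothetical helper name.
#
#     import numpy as np
#
#     def sample_action(available_actions, probabilities):
#         # Pick an index according to `probabilities`, then map it back to
#         # the corresponding action id.
#         index = np.random.choice(len(probabilities), p=probabilities)
#         return available_actions[index]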
def test_shape(self):
    environment = ale.AleEnvironment('breakout.bin', record_display=False)
    graph = tf.Graph()
    with graph.as_default():
        shared_network = shared.SharedNetwork(IMAGE_WIDTH, IMAGE_HEIGHT,
                                              NUM_CHANNELS, NUM_ACTIONS, 100)
        network0 = a3c.A3CNetwork(IMAGE_WIDTH, IMAGE_HEIGHT, NUM_CHANNELS,
                                  NUM_ACTIONS, 0)
        with self.test_session(graph=graph) as session:
            tf.initialize_all_variables().run()
            test_thread = actor_thread.ActorLearnerThread(
                session, environment, shared_network, network0, 10, 0)
            initial_state = test_thread.get_initial_state(environment)
            state = np.stack(initial_state, axis=-1)
            pi = session.run(test_thread.pi, feed_dict={
                test_thread.state_input: [state, state],
                test_thread.action_input: [[0, 0, 1, 0], [0, 1, 0, 0]]
            })
            print 'pi' + str(np.shape(pi))
            self.assertTrue(np.shape(pi) == (2, 4))
            value = session.run(test_thread.value, feed_dict={
                test_thread.state_input: [state, state],
                test_thread.action_input: [[0, 0, 1, 0], [0, 1, 0, 0]],
                test_thread.reward_input: [[0], [0]]
            })
            print 'value shape: ' + str(
                np.shape(value)) + ' value: ' + str(value)
            self.assertTrue(np.shape(value) == (2, 1))
def start_evaluation():
    checkpoint_dir = FLAGS.checkpoint_dir
    graph = tf.Graph()
    config = tf.ConfigProto()
    shared_network = None
    evaluation_network = None
    show_display = True
    environment = ale.AleEnvironment(FLAGS.rom,
                                     record_display=FLAGS.take_video,
                                     show_display=show_display,
                                     id=0,
                                     shrink=FLAGS.shrink_image,
                                     life_lost_as_end=False)
    with graph.as_default():
        num_actions = len(environment.available_actions())
        device = '/gpu:0' if FLAGS.use_gpu else '/cpu:0'
        shared_network = shared.SharedNetwork(IMAGE_WIDTH, IMAGE_HEIGHT,
                                              NUM_CHANNELS, num_actions, 100,
                                              FLAGS.local_t_max,
                                              FLAGS.global_t_max, device)
        evaluation_network = a3c.A3CNetwork(IMAGE_WIDTH, IMAGE_HEIGHT,
                                            NUM_CHANNELS, num_actions, 0,
                                            device)
        saver = tf.train.Saver(max_to_keep=None)
        with tf.Session(graph=graph, config=config) as session:
            thread_num = 0
            thread = actor_thread.ActorLearnerThread(
                session, environment, shared_network, evaluation_network,
                FLAGS.local_t_max, FLAGS.global_t_max, thread_num)
            thread.set_saver(saver)
            thread.daemon = True
            session.run(tf.initialize_all_variables())
            if FLAGS.trained_file != '':
                saver.restore(session,
                              checkpoint_dir + '/' + FLAGS.trained_file)
            else:
                print 'No trained file specified. Abort evaluation'
                return
            trials = 10
            rewards = []
            for i in range(trials):
                reward = thread.test_run(environment)
                rewards.append(reward)
            maximum = np.max(rewards)
            median = np.median(rewards)
            average = np.average(rewards)
            print 'Evaluation finished. max: %d, med: %d, avg: %f' % (
                maximum, median, average)
def test_loop_listener(self):
    environment = ale.AleEnvironment('breakout.bin', record_display=False)
    graph = tf.Graph()
    with graph.as_default():
        shared_network = shared.SharedNetwork(IMAGE_WIDTH, IMAGE_HEIGHT,
                                              NUM_CHANNELS, NUM_ACTIONS, 100)
        network0 = a3c.A3CNetwork(IMAGE_WIDTH, IMAGE_HEIGHT, NUM_CHANNELS,
                                  NUM_ACTIONS, 0)
        with self.test_session(graph=graph) as session:
            test_thread = actor_thread.ActorLearnerThread(
                session, environment, shared_network, network0, 10, 0)
            test_thread.set_loop_listener(self.loop_listener)
            session.run(tf.initialize_all_variables())
            test_thread.run()
def test_local_gradients(self):
    environment = ale.AleEnvironment('breakout.bin', record_display=False)
    graph = tf.Graph()
    with graph.as_default():
        shared_network = shared.SharedNetwork(IMAGE_WIDTH, IMAGE_HEIGHT,
                                              NUM_CHANNELS, NUM_ACTIONS, 100)
        network0 = a3c.A3CNetwork(IMAGE_WIDTH, IMAGE_HEIGHT, NUM_CHANNELS,
                                  NUM_ACTIONS, 0)
        with self.test_session(graph=graph) as session:
            test_thread = actor_thread.ActorLearnerThread(
                session, environment, shared_network, network0, 10, 0)
            test_thread.reset_gradients()
            self.assertEquals(len(test_thread.local_grads), 10)
            for local_grad in test_thread.local_grads:
                session.run(tf.Print(local_grad, [local_grad]))
def test_play_game(self):
    environment = ale.AleEnvironment('breakout.bin', record_display=False)
    graph = tf.Graph()
    with graph.as_default():
        shared_network = shared.SharedNetwork(IMAGE_WIDTH, IMAGE_HEIGHT,
                                              NUM_CHANNELS, NUM_ACTIONS, 100)
        network0 = a3c.A3CNetwork(IMAGE_WIDTH, IMAGE_HEIGHT, NUM_CHANNELS,
                                  NUM_ACTIONS, 0)
        with self.test_session(graph=graph) as session:
            tf.initialize_all_variables().run()
            test_thread = actor_thread.ActorLearnerThread(
                session, environment, shared_network, network0, 10, 0)
            test_thread.reset_gradients()
            initial_state = test_thread.get_initial_state(environment)
            self.assertEquals(len(initial_state), NUM_CHANNELS)
            history, last_state = test_thread.play_game(initial_state)
            if last_state is not None:
                self.assertEquals(len(history), test_thread.local_t_max)
            self.assertEquals(len(history[0]), 3)
def test_pi(self):
    environment = ale.AleEnvironment('breakout.bin', record_display=False)
    graph = tf.Graph()
    with graph.as_default():
        shared_network = shared.SharedNetwork(IMAGE_WIDTH, IMAGE_HEIGHT,
                                              NUM_CHANNELS, NUM_ACTIONS, 100)
        network0 = a3c.A3CNetwork(IMAGE_WIDTH, IMAGE_HEIGHT, NUM_CHANNELS,
                                  NUM_ACTIONS, 0)
        with self.test_session(graph=graph) as session:
            tf.initialize_all_variables().run()
            test_thread = actor_thread.ActorLearnerThread(
                session, environment, shared_network, network0, 10, 0)
            test_thread.reset_gradients()
            initial_state = test_thread.get_initial_state(environment)
            state = np.stack(initial_state, axis=-1)
            probabilities = session.run(
                test_thread.pi,
                feed_dict={test_thread.state_input: [state]})
            self.assertTrue(
                len(probabilities[0]) ==
                test_thread.local_network.actor_outputs)
            self.assertTrue(0.99 < np.sum(probabilities) and
                            np.sum(probabilities) < 1.01)
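# The sum check above relies on the policy head producing a probability
# distribution, typically via a softmax. A small NumPy illustration of why
# such outputs always sum to one, independent of this repository's network
# code (the softmax helper below is only an illustration):
#
#     import numpy as np
#
#     def softmax(logits):
#         # Subtract the max for numerical stability before exponentiating.
#         exps = np.exp(logits - np.max(logits))
#         return exps / np.sum(exps)
#
#     p = softmax(np.array([1.3, -0.2, 0.5, 2.0]))
#     assert abs(np.sum(p) - 1.0) < 1e-6  # normalized by construction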
def start_training():
    global previous_evaluation_step
    global previous_step
    global summary_writer
    global summary_op
    global maximum_input
    global median_input
    global average_input
    global evaluation_environment

    checkpoint_dir = FLAGS.checkpoint_dir
    summary_dir = FLAGS.summary_dir
    graph = tf.Graph()
    config = tf.ConfigProto()
    # Directory for TensorBoard summaries
    create_dir_if_not_exist(summary_dir)
    # Directory for saved model parameters
    create_dir_if_not_exist(checkpoint_dir)
    if FLAGS.trained_file == '':
        remove_old_files(summary_dir)
        remove_old_files(checkpoint_dir)
    summary_writer = tf.train.SummaryWriter(summary_dir, graph=graph)
    write_training_settings('results')
    networks = []
    shared_network = None
    summary_op = None
    evaluation_environment = ale.AleEnvironment(FLAGS.rom,
                                                record_display=False,
                                                show_display=True,
                                                id=100,
                                                shrink=FLAGS.shrink_image,
                                                life_lost_as_end=False)
    with graph.as_default():
        num_actions = len(evaluation_environment.available_actions())
        maximum_input = tf.placeholder(tf.int32)
        median_input = tf.placeholder(tf.int32)
        average_input = tf.placeholder(tf.int32)
        summary_op = merged_summaries(maximum_input, median_input,
                                      average_input)
        device = '/gpu:0' if FLAGS.use_gpu else '/cpu:0'
        shared_network = shared.SharedNetwork(IMAGE_WIDTH, IMAGE_HEIGHT,
                                              NUM_CHANNELS, num_actions, 100,
                                              FLAGS.local_t_max,
                                              FLAGS.global_t_max, device)
        for i in range(FLAGS.threads_num):
            network = a3c.A3CNetwork(IMAGE_WIDTH, IMAGE_HEIGHT, NUM_CHANNELS,
                                     num_actions, i, device)
            networks.append(network)
        saver = tf.train.Saver(max_to_keep=None)
        with tf.Session(graph=graph, config=config) as session:
            threads = []
            for thread_num in range(FLAGS.threads_num):
                show_display = (thread_num == 0)
                environment = ale.AleEnvironment(FLAGS.rom,
                                                 record_display=False,
                                                 show_display=show_display,
                                                 id=thread_num,
                                                 shrink=FLAGS.shrink_image)
                thread = actor_thread.ActorLearnerThread(
                    session, environment, shared_network,
                    networks[thread_num], FLAGS.local_t_max,
                    FLAGS.global_t_max, thread_num)
                thread.set_saver(saver)
                thread.daemon = True
                if thread_num == 0:
                    thread.set_loop_listener(loop_listener)
                threads.append(thread)
            session.run(tf.initialize_all_variables())
            if FLAGS.trained_file != '':
                saver.restore(session,
                              checkpoint_dir + '/' + FLAGS.trained_file)
                previous_step = threads[0].get_global_step()
                previous_evaluation_step = previous_step
            else:
                print 'No trained file specified. Use default parameter'
            for i in range(FLAGS.threads_num):
                threads[i].start()
            while True:
                try:
                    # join() with a timeout so KeyboardInterrupt can be
                    # caught; exit once no actor thread is alive.
                    ts = [
                        thread.join(10) for thread in threads
                        if thread is not None and thread.isAlive()
                    ]
                    if len(ts) == 0:
                        break
                except KeyboardInterrupt:
                    print 'Ctrl-c received! Sending kill to threads...'
                    for thread in threads:
                        thread.kill_received = True
                    break
            print 'Training finished!!'
def test_accumulate_gradients(self):
    environment = ale.AleEnvironment('breakout.bin', record_display=False)
    graph = tf.Graph()
    local_t_max = 20
    global_t_max = 100
    with graph.as_default():
        shared_network = shared.SharedNetwork(IMAGE_HEIGHT, IMAGE_WIDTH,
                                              NUM_CHANNELS, NUM_ACTIONS, 100,
                                              local_t_max, global_t_max)
        network0 = a3c.A3CNetwork(IMAGE_HEIGHT, IMAGE_WIDTH, NUM_CHANNELS,
                                  NUM_ACTIONS, 0)
        with self.test_session(graph=graph) as session:
            tf.initialize_all_variables().run()
            test_thread = actor_thread.ActorLearnerThread(
                session, environment, shared_network, network0, local_t_max,
                global_t_max, 0)
            test_thread.reset_gradients()
            initial_state = test_thread.get_initial_state(environment)
            self.assertEquals(np.shape(initial_state),
                              (IMAGE_HEIGHT, IMAGE_WIDTH, NUM_CHANNELS))
            history, last_state = test_thread.play_game(initial_state)
            session.run(test_thread.reset_local_grads_ops)
            local_grad = test_thread.local_grads[0]

            # Accumulate gradients one transition at a time, walking the
            # history backwards to build the discounted return R.
            r = 0.0
            gamma = 0.99
            for i in range((test_thread.t - 1) - test_thread.t_start, -1, -1):
                state = history[i]['state']
                action = np.zeros(test_thread.local_network.actor_outputs)
                action[history[i]['action']] = 1
                reward = history[i]['reward']
                value = history[i]['value']
                r = reward + gamma * r
                test_thread.accumulate_gradients([state], [action], [[r]],
                                                 [[value]])
            local_grad_step = local_grad.eval()
            tf.Print(local_grad_step, [local_grad_step]).eval()

            # Reset and accumulate the same transitions again, this time as a
            # single batch, then compare the two accumulated gradients.
            session.run(test_thread.reset_local_grads_ops)
            r = 0
            states_batch = []
            action_batch = []
            r_batch = []
            value_batch = []
            for i in range((test_thread.t - 1) - test_thread.t_start, -1, -1):
                state = history[i]['state']
                action = np.zeros(test_thread.local_network.actor_outputs)
                action[history[i]['action']] = 1
                reward = history[i]['reward']
                value = history[i]['value']
                r = reward + gamma * r
                states_batch.append(state)
                action_batch.append(action)
                r_batch.append([r])
                value_batch.append([value])
            test_thread.accumulate_gradients(states_batch, action_batch,
                                             r_batch, value_batch)
            local_grad_batch = local_grad.eval()
            tf.Print(local_grad_batch, [local_grad_batch]).eval()

            frobenius_norm = np.linalg.norm(local_grad_step - local_grad_batch)
            print 'Frobenius norm between batch and step: ' + str(
                frobenius_norm)
            self.assertTrue(frobenius_norm < 1e-2)
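# The equality checked above holds whenever the loss is a sum over samples,
# since the gradient of a sum is the sum of the per-sample gradients. A small
# NumPy sketch of that identity for a plain linear least-squares loss
# (illustration only, unrelated to the A3C loss or this repository's code):
#
#     import numpy as np
#
#     np.random.seed(0)
#     w = np.random.randn(3)
#     xs = np.random.randn(5, 3)
#     ys = np.random.randn(5)
#
#     def grad(x, y):
#         # d/dw of 0.5 * (w . x - y)^2
#         return (np.dot(w, x) - y) * x
#
#     step_sum = sum(grad(x, y) for x, y in zip(xs, ys))
#     batch = np.dot(xs.T, np.dot(xs, w) - ys)  # same gradient, one shot
#     assert np.linalg.norm(step_sum - batch) < 1e-10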