Example #1
0
    def test_run(self):
        """Two actor-learner threads sharing one network each advance the
        shared step counter up to their own global-step limit (10, then 20).
        """
        environment = ale.AleEnvironment('breakout.bin', record_display=False)
        graph = tf.Graph()
        with graph.as_default():
            shared_network = shared.SharedNetwork(IMAGE_WIDTH, IMAGE_HEIGHT,
                                                  NUM_CHANNELS, NUM_ACTIONS,
                                                  100)
            network0 = a3c.A3CNetwork(IMAGE_WIDTH, IMAGE_HEIGHT, NUM_CHANNELS,
                                      NUM_ACTIONS, 0)
            network1 = a3c.A3CNetwork(IMAGE_WIDTH, IMAGE_HEIGHT, NUM_CHANNELS,
                                      NUM_ACTIONS, 1)
        with self.test_session(graph=graph) as session:
            test_thread = actor_thread.ActorLearnerThread(
                session, environment, shared_network, network0, 10, 0)
            test_thread2 = actor_thread.ActorLearnerThread(
                session, environment, shared_network, network1, 20, 1)
            session.run(tf.initialize_all_variables())
            shared_weight = test_thread.shared_network.weights_and_biases()[0]
            local_weight = test_thread.local_network.weights_and_biases()[0]
            test_thread.synchronize_network()
            # Dump the weights before and after running so divergence between
            # shared and local parameters is visible in the test log.
            session.run(tf.Print(shared_weight, [shared_weight]))
            session.run(tf.Print(local_weight, [local_weight]))
            test_thread.run()
            session.run(tf.Print(shared_weight, [shared_weight]))
            session.run(tf.Print(local_weight, [local_weight]))
            # Use unittest assertions instead of bare `assert`: they are not
            # stripped under `python -O` and report the mismatching values.
            self.assertEqual(shared_network.shared_counter.eval(), 10)

            test_thread2.run()
            self.assertEqual(shared_network.shared_counter.eval(), 20)
Example #2
0
    def test_select_action(self):
        """select_action_with should sample actions according to the supplied
        probability distribution, checked statistically over many draws.
        """
        environment = ale.AleEnvironment('breakout.bin', record_display=False)
        graph = tf.Graph()
        with graph.as_default():
            shared_network = shared.SharedNetwork(IMAGE_WIDTH, IMAGE_HEIGHT,
                                                  NUM_CHANNELS, NUM_ACTIONS,
                                                  100)
            network0 = a3c.A3CNetwork(IMAGE_WIDTH, IMAGE_HEIGHT, NUM_CHANNELS,
                                      NUM_ACTIONS, 0)
        with self.test_session(graph=graph) as session:
            test_thread = actor_thread.ActorLearnerThread(
                session, environment, shared_network, network0, 10, 0)
            probabilities = [0.1, 0.2, 0.3, 0.4]
            selected_num = np.zeros(4)
            available_actions = environment.available_actions()

            samples = 10000
            for _ in range(samples):
                action = test_thread.select_action_with(
                    available_actions, probabilities)
                selected_num[action] += 1

            # Each action's count should lie within +/-allowed_diff of its
            # expected mean; 500 out of 10000 draws is a generous tolerance.
            allowed_diff = 500
            for i, probability in enumerate(probabilities):
                mean = probability * samples
                print('mean:%d selected_num:%d for action:%d' % (
                    mean, selected_num[i], i))
                self.assertTrue(abs(selected_num[i] - mean) < allowed_diff)
Example #3
0
    def test_shape(self):
        """The policy output `pi` should be (batch, num_actions) and the value
        output should be (batch, 1) for a batch of two states.
        """
        environment = ale.AleEnvironment('breakout.bin', record_display=False)
        graph = tf.Graph()
        with graph.as_default():
            shared_network = shared.SharedNetwork(IMAGE_WIDTH, IMAGE_HEIGHT,
                                                  NUM_CHANNELS, NUM_ACTIONS,
                                                  100)
            network0 = a3c.A3CNetwork(IMAGE_WIDTH, IMAGE_HEIGHT, NUM_CHANNELS,
                                      NUM_ACTIONS, 0)
        with self.test_session(graph=graph) as session:
            tf.initialize_all_variables().run()
            test_thread = actor_thread.ActorLearnerThread(
                session, environment, shared_network, network0, 10, 0)
            initial_state = test_thread.get_initial_state(environment)
            # Stack the frame list along the last axis to form one
            # (height, width, channels) state tensor.
            state = np.stack(initial_state, axis=-1)

            pi = session.run(test_thread.pi,
                             feed_dict={
                                 test_thread.state_input: [state, state],
                                 test_thread.action_input: [[0, 0, 1, 0],
                                                            [0, 1, 0, 0]]
                             })
            print('pi' + str(np.shape(pi)))
            # assertEqual (not assertTrue on `==`) reports both shapes on
            # failure.
            self.assertEqual(np.shape(pi), (2, 4))

            value = session.run(test_thread.value,
                                feed_dict={
                                    test_thread.state_input: [state, state],
                                    test_thread.action_input: [[0, 0, 1, 0],
                                                               [0, 1, 0, 0]],
                                    test_thread.reward_input: [[0], [0]]
                                })
            print('value shape: ' + str(
                np.shape(value)) + ' value: ' + str(value))
            self.assertEqual(np.shape(value), (2, 1))
Example #4
0
def start_evaluation():
    """Restore a trained model from FLAGS.trained_file and evaluate it.

    Runs 10 evaluation episodes with a single actor-learner thread and prints
    the max, median and average episode reward. Aborts (with a message) when
    no trained checkpoint file is specified.
    """
    checkpoint_dir = FLAGS.checkpoint_dir
    graph = tf.Graph()
    config = tf.ConfigProto()

    # BUGFIX: show_display was previously assigned only inside the session
    # block, but referenced here first -- a NameError at runtime. Define it
    # before constructing the environment.
    show_display = True
    environment = ale.AleEnvironment(FLAGS.rom,
                                     record_display=FLAGS.take_video,
                                     show_display=show_display,
                                     id=0,
                                     shrink=FLAGS.shrink_image,
                                     life_lost_as_end=False)

    with graph.as_default():
        num_actions = len(environment.available_actions())
        device = '/gpu:0' if FLAGS.use_gpu else '/cpu:0'
        shared_network = shared.SharedNetwork(IMAGE_WIDTH, IMAGE_HEIGHT,
                                              NUM_CHANNELS, num_actions, 100,
                                              FLAGS.local_t_max,
                                              FLAGS.global_t_max, device)
        evaluation_network = a3c.A3CNetwork(IMAGE_WIDTH, IMAGE_HEIGHT,
                                            NUM_CHANNELS, num_actions, 0,
                                            device)
        saver = tf.train.Saver(max_to_keep=None)

    with tf.Session(graph=graph, config=config) as session:
        thread_num = 0
        thread = actor_thread.ActorLearnerThread(
            session, environment, shared_network, evaluation_network,
            FLAGS.local_t_max, FLAGS.global_t_max, thread_num)
        thread.set_saver(saver)
        thread.daemon = True

        session.run(tf.initialize_all_variables())

        # BUGFIX: compare string contents with `!=`, not object identity
        # (`is not`) -- identity comparison of strings is unreliable.
        if FLAGS.trained_file != '':
            saver.restore(session, checkpoint_dir + '/' + FLAGS.trained_file)
        else:
            print('No trained file specified. Abort evaluation')
            return

        trials = 10
        rewards = []
        for _ in range(trials):
            rewards.append(thread.test_run(environment))

        maximum = np.max(rewards)
        median = np.median(rewards)
        average = np.average(rewards)

        print('Evaluation finished. max: %d, med: %d, avg: %f' % (
            maximum, median, average))
Example #5
0
 def test_loop_listener(self):
     """A registered loop listener must not break the thread's run loop."""
     environment = ale.AleEnvironment('breakout.bin', record_display=False)
     graph = tf.Graph()
     with graph.as_default():
         shared_net = shared.SharedNetwork(IMAGE_WIDTH, IMAGE_HEIGHT,
                                           NUM_CHANNELS, NUM_ACTIONS, 100)
         local_net = a3c.A3CNetwork(IMAGE_WIDTH, IMAGE_HEIGHT, NUM_CHANNELS,
                                    NUM_ACTIONS, 0)
     with self.test_session(graph=graph) as session:
         learner = actor_thread.ActorLearnerThread(
             session, environment, shared_net, local_net, 10, 0)
         learner.set_loop_listener(self.loop_listener)
         session.run(tf.initialize_all_variables())
         learner.run()
Example #6
0
 def test_local_gradients(self):
     """After reset_gradients there should be one local gradient tensor per
     trainable variable (10 in this network)."""
     environment = ale.AleEnvironment('breakout.bin', record_display=False)
     graph = tf.Graph()
     with graph.as_default():
         shared_network = shared.SharedNetwork(IMAGE_WIDTH, IMAGE_HEIGHT,
                                               NUM_CHANNELS, NUM_ACTIONS,
                                               100)
         network0 = a3c.A3CNetwork(IMAGE_WIDTH, IMAGE_HEIGHT, NUM_CHANNELS,
                                   NUM_ACTIONS, 0)
     with self.test_session(graph=graph) as session:
         test_thread = actor_thread.ActorLearnerThread(
             session, environment, shared_network, network0, 10, 0)
         test_thread.reset_gradients()
         # assertEqual: `assertEquals` is a deprecated alias.
         self.assertEqual(len(test_thread.local_grads), 10)
         # Print each gradient so its (reset) contents show up in the log.
         for local_grad in test_thread.local_grads:
             session.run(tf.Print(local_grad, [local_grad]))
Example #7
0
    def test_play_game(self):
        """play_game should yield a history of 3-tuples, of length
        local_t_max when the episode did not terminate early."""
        environment = ale.AleEnvironment('breakout.bin', record_display=False)
        graph = tf.Graph()
        with graph.as_default():
            shared_network = shared.SharedNetwork(IMAGE_WIDTH, IMAGE_HEIGHT,
                                                  NUM_CHANNELS, NUM_ACTIONS,
                                                  100)
            network0 = a3c.A3CNetwork(IMAGE_WIDTH, IMAGE_HEIGHT, NUM_CHANNELS,
                                      NUM_ACTIONS, 0)
        with self.test_session(graph=graph) as session:
            tf.initialize_all_variables().run()
            test_thread = actor_thread.ActorLearnerThread(
                session, environment, shared_network, network0, 10, 0)
            test_thread.reset_gradients()
            initial_state = test_thread.get_initial_state(environment)
            # assertEqual: `assertEquals` is a deprecated alias.
            self.assertEqual(len(initial_state), NUM_CHANNELS)

            history, last_state = test_thread.play_game(initial_state)
            # last_state appears to be None when the episode ended before
            # local_t_max steps -- only then can the length be shorter.
            if last_state is not None:
                self.assertEqual(len(history), test_thread.local_t_max)
            self.assertEqual(len(history[0]), 3)
Example #8
0
 def test_pi(self):
     """The policy head should emit one probability per available action,
     and the probabilities should sum to approximately 1."""
     environment = ale.AleEnvironment('breakout.bin', record_display=False)
     graph = tf.Graph()
     with graph.as_default():
         shared_network = shared.SharedNetwork(IMAGE_WIDTH, IMAGE_HEIGHT,
                                               NUM_CHANNELS, NUM_ACTIONS,
                                               100)
         network0 = a3c.A3CNetwork(IMAGE_WIDTH, IMAGE_HEIGHT, NUM_CHANNELS,
                                   NUM_ACTIONS, 0)
     with self.test_session(graph=graph) as session:
         tf.initialize_all_variables().run()
         test_thread = actor_thread.ActorLearnerThread(
             session, environment, shared_network, network0, 10, 0)
         test_thread.reset_gradients()
         initial_state = test_thread.get_initial_state(environment)
         # Stack the frame list along the last axis into one state tensor.
         state = np.stack(initial_state, axis=-1)
         probabilities = session.run(
             test_thread.pi, feed_dict={test_thread.state_input: [state]})
         # assertEqual (not assertTrue on `==`) reports both values on
         # failure.
         self.assertEqual(len(probabilities[0]),
                          test_thread.local_network.actor_outputs)
         # Softmax output: the sum should be 1 up to numerical error.
         self.assertTrue(abs(np.sum(probabilities) - 1.0) < 0.01)
Example #9
0
def start_training():
    """Build the shared/local A3C networks and run FLAGS.threads_num
    actor-learner threads until all finish or Ctrl-C is received.

    Optionally restores parameters from FLAGS.trained_file and resumes the
    global step counters from the checkpoint. Writes summaries for
    TensorBoard and saves checkpoints via the shared Saver.
    """
    global previous_evaluation_step
    global previous_step
    global summary_writer
    global summary_op
    global maximum_input
    global median_input
    global average_input
    global evaluation_environment
    checkpoint_dir = FLAGS.checkpoint_dir
    summary_dir = FLAGS.summary_dir
    graph = tf.Graph()
    config = tf.ConfigProto()

    # Output to tensorboard
    create_dir_if_not_exist(summary_dir)
    # Model parameter saving
    create_dir_if_not_exist(checkpoint_dir)
    # BUGFIX: compare string contents with ==/!=, not object identity
    # (`is`/`is not`), which is unreliable for strings.
    if FLAGS.trained_file == '':
        # Fresh training run: discard stale summaries and checkpoints.
        remove_old_files(summary_dir)
        remove_old_files(checkpoint_dir)

    summary_writer = tf.train.SummaryWriter(summary_dir, graph=graph)
    write_training_settings('results')

    networks = []
    shared_network = None
    summary_op = None

    # Separate environment used only for periodic evaluation episodes.
    evaluation_environment = ale.AleEnvironment(FLAGS.rom,
                                                record_display=False,
                                                show_display=True,
                                                id=100,
                                                shrink=FLAGS.shrink_image,
                                                life_lost_as_end=False)

    with graph.as_default():
        num_actions = len(evaluation_environment.available_actions())
        maximum_input = tf.placeholder(tf.int32)
        median_input = tf.placeholder(tf.int32)
        average_input = tf.placeholder(tf.int32)
        summary_op = merged_summaries(maximum_input, median_input,
                                      average_input)
        device = '/gpu:0' if FLAGS.use_gpu else '/cpu:0'
        shared_network = shared.SharedNetwork(IMAGE_WIDTH, IMAGE_HEIGHT,
                                              NUM_CHANNELS, num_actions, 100,
                                              FLAGS.local_t_max,
                                              FLAGS.global_t_max, device)
        for i in range(FLAGS.threads_num):
            network = a3c.A3CNetwork(IMAGE_WIDTH, IMAGE_HEIGHT, NUM_CHANNELS,
                                     num_actions, i, device)
            networks.append(network)
        saver = tf.train.Saver(max_to_keep=None)

    with tf.Session(graph=graph, config=config) as session:
        threads = []
        for thread_num in range(FLAGS.threads_num):
            # Only the first thread's environment shows its display.
            show_display = (thread_num == 0)
            environment = ale.AleEnvironment(FLAGS.rom,
                                             record_display=False,
                                             show_display=show_display,
                                             id=thread_num,
                                             shrink=FLAGS.shrink_image)
            thread = actor_thread.ActorLearnerThread(
                session, environment, shared_network, networks[thread_num],
                FLAGS.local_t_max, FLAGS.global_t_max, thread_num)
            thread.set_saver(saver)
            thread.daemon = True
            if thread_num == 0:
                thread.set_loop_listener(loop_listener)
            threads.append(thread)

        session.run(tf.initialize_all_variables())
        if FLAGS.trained_file != '':
            saver.restore(session, checkpoint_dir + '/' + FLAGS.trained_file)
            # Resume the step counters from the restored checkpoint.
            previous_step = threads[0].get_global_step()
            previous_evaluation_step = previous_step
        else:
            print('No trained file specified. Use default parameter')

        for i in range(FLAGS.threads_num):
            threads[i].start()

        # Poll with a timeout so KeyboardInterrupt can reach the main
        # thread; exit once every worker has finished.
        while True:
            try:
                ts = [
                    thread.join(10) for thread in threads
                    if thread is not None and thread.isAlive()
                ]
                if len(ts) == 0:
                    break
            except KeyboardInterrupt:
                print('Ctrl-c received! Sending kill to threads...')
                for thread in threads:
                    thread.kill_received = True
                break

        print('Training finished!!')
Example #10
0
    def test_accumulate_gradients(self):
        """Accumulating gradients one step at a time should match
        accumulating them in a single batch (up to small numerical error,
        measured by the Frobenius norm of the difference)."""
        environment = ale.AleEnvironment('breakout.bin', record_display=False)
        graph = tf.Graph()
        local_t_max = 20
        global_t_max = 100
        with graph.as_default():
            shared_network = shared.SharedNetwork(IMAGE_HEIGHT, IMAGE_WIDTH,
                                                  NUM_CHANNELS, NUM_ACTIONS,
                                                  100, local_t_max,
                                                  global_t_max)
            network0 = a3c.A3CNetwork(IMAGE_HEIGHT, IMAGE_WIDTH, NUM_CHANNELS,
                                      NUM_ACTIONS, 0)
        with self.test_session(graph=graph) as session:
            tf.initialize_all_variables().run()
            test_thread = actor_thread.ActorLearnerThread(
                session, environment, shared_network, network0, local_t_max,
                global_t_max, 0)
            test_thread.reset_gradients()
            initial_state = test_thread.get_initial_state(environment)
            # assertEqual: `assertEquals` is a deprecated alias.
            self.assertEqual(np.shape(initial_state),
                              (IMAGE_HEIGHT, IMAGE_WIDTH, NUM_CHANNELS))

            history, last_state = test_thread.play_game(initial_state)

            session.run(test_thread.reset_local_grads_ops)
            local_grad = test_thread.local_grads[0]

            # Pass 1: accumulate one step at a time, walking the history
            # backwards while building the discounted return r.
            r = 0.0
            gamma = 0.99
            for i in range((test_thread.t - 1) - test_thread.t_start, -1, -1):
                state = history[i]['state']
                action = np.zeros(test_thread.local_network.actor_outputs)
                action[history[i]['action']] = 1
                reward = history[i]['reward']
                value = history[i]['value']
                r = reward + gamma * r
                test_thread.accumulate_gradients([state], [action], [[r]],
                                                 [[value]])
                # NOTE(review): local_grad_step keeps the value from the
                # final iteration; this assumes the loop runs at least once.
                local_grad_step = local_grad.eval()
                tf.Print(local_grad_step, [local_grad_step]).eval()

            session.run(test_thread.reset_local_grads_ops)

            # Pass 2: build the same (state, action, return, value) batch
            # and accumulate it in one call.
            r = 0
            states_batch = []
            action_batch = []
            r_batch = []
            value_batch = []
            for i in range((test_thread.t - 1) - test_thread.t_start, -1, -1):
                state = history[i]['state']
                action = np.zeros(test_thread.local_network.actor_outputs)
                action[history[i]['action']] = 1
                reward = history[i]['reward']
                value = history[i]['value']

                r = reward + gamma * r
                states_batch.append(state)
                action_batch.append(action)
                r_batch.append([r])
                value_batch.append([value])

            test_thread.accumulate_gradients(states_batch, action_batch,
                                             r_batch, value_batch)
            local_grad_batch = local_grad.eval()
            tf.Print(local_grad_batch, [local_grad_batch]).eval()

            frobenius_norm = np.linalg.norm(local_grad_step - local_grad_batch)
            print('Frobenius norm between batch and step: ' + str(
                frobenius_norm))
            self.assertTrue(frobenius_norm < 1e-2)