Example #1
# Standard-library and third-party imports used by measure(); project-specific
# helpers (VizdoomWrapper, green, blue) are assumed to come from the surrounding project.
import itertools as it
from random import choice, randint
from time import time

import vizdoom as vzd
from tqdm import trange


def measure(name, iters=5000, **settings):
    print(name)
    for k, v in settings.items():
        print("\t{}: {}".format(k, v))

    # Vizdoom wrapper
    doom_wrapper = VizdoomWrapper(**settings)
    start = time()
    for _ in trange(iters, leave=False):
        current_img, current_misc = doom_wrapper.get_current_state()
        action_index = randint(0, doom_wrapper.actions_num - 1)
        doom_wrapper.make_action(action_index)

        if doom_wrapper.is_terminal():
            doom_wrapper.reset()
    end = time()
    wrapper_t = end - start

    # Vanilla vizdoom:
    doom = vzd.DoomGame()
    if "scenarios_path" not in settings:
        scenarios_path = vzd.__path__[0] + "/scenarios"
    else:
        scenarios_path = settings["scenarios_path"]
    config_file = scenarios_path + "/" + settings["config_file"]
    doom.load_config(config_file)
    doom.set_window_visible(False)
    doom.set_screen_format(vzd.ScreenFormat.GRAY8)
    doom.set_screen_resolution(vzd.ScreenResolution.RES_160X120)
    doom.init()
    # Enumerate all button combinations; the action vector length must match
    # the number of available buttons, not the number of game variables.
    actions = [
        list(a)
        for a in it.product([0, 1],
                            repeat=doom.get_available_buttons_size())
    ]
    start = time()
    frame_skip = settings["frame_skip"]
    for _ in trange(iters, leave=False):
        if doom.is_episode_finished():
            doom.new_episode()
        doom.make_action(choice(actions), frame_skip)

    end = time()
    vanilla_t = end - start
    print(green("\twrapper: {:0.2f} steps/s".format(iters / wrapper_t)))
    print(
        green("\twrapper: {:0.2f} s/1000 steps".format(wrapper_t / iters *
                                                       1000)))
    print(blue("\tvanilla: {:0.2f} steps/s".format(iters / vanilla_t)))
    print(
        blue("\tvanilla: {:0.2f} s/1000 steps\n".format(vanilla_t / iters *
                                                        1000)))
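
A minimal way to invoke the benchmark above; config_file and frame_skip are the only settings keys measure() reads directly, and any further keyword arguments expected by VizdoomWrapper are assumed to have defaults:

if __name__ == "__main__":
    # Hypothetical invocation: "basic.cfg" ships with ViZDoom's bundled scenarios.
    measure("basic scenario", iters=2000, config_file="basic.cfg", frame_skip=4)
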
    def _print_train_log(self, scores, overall_start_time, last_log_time, steps):
        current_time = time.time()
        mean_score = np.mean(scores)
        score_std = np.std(scores)
        min_score = np.min(scores)
        max_score = np.max(scores)

        elapsed_time = current_time - overall_start_time
        global_steps = self._global_steps_counter.get()
        local_steps_per_sec = steps / (current_time - last_log_time)
        global_steps_per_sec = global_steps / elapsed_time
        global_mil_steps_per_hour = global_steps_per_sec * 3600 / 1000000.0
        log(
            "TRAIN: {}(GlobalSteps), {} episodes, mean: {}, min: {}, max: {}, "
            "\nLocalSpd: {:.0f} STEPS/s GlobalSpd: "
            "{} STEPS/s, {:.2f}M STEPS/hour, total elapsed time: {}".format(
                global_steps,
                len(scores),
                green("{:0.3f}±{:0.2f}".format(mean_score, score_std)),
                red("{:0.3f}".format(min_score)),
                blue("{:0.3f}".format(max_score)),
                local_steps_per_sec,
                blue("{:.0f}".format(
                    global_steps_per_sec)),
                global_mil_steps_per_hour,
                sec_to_str(elapsed_time)
            ))
    def test(self, episodes_num=None, deterministic=True):
        if episodes_num is None:
            episodes_num = self.test_episodes_per_epoch

        test_start_time = time.time()
        test_rewards = []
        test_actions = []
        test_frameskips = []
        for _ in trange(episodes_num, desc="Testing", file=sys.stdout,
                        leave=False, disable=not self.enable_progress_bar):
            total_reward, actions, frameskips, _ = self.run_episode(deterministic=deterministic, return_stats=True)
            test_rewards.append(total_reward)
            test_actions += actions
            test_frameskips += frameskips

        self.doom_wrapper.reset()
        if self.local_network.has_state():
            self.local_network.reset_state()

        test_end_time = time.time()
        test_duration = test_end_time - test_start_time
        min_score = np.min(test_rewards)
        max_score = np.max(test_rewards)
        mean_score = np.mean(test_rewards)
        score_std = np.std(test_rewards)
        log(
            "TEST: mean: {}, min: {}, max: {}, test time: {}".format(
                green("{:0.3f}±{:0.2f}".format(mean_score, score_std)),
                red("{:0.3f}".format(min_score)),
                blue("{:0.3f}".format(max_score)),
                sec_to_str(test_duration)))
        return test_rewards, test_actions, test_frameskips
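
A hedged usage sketch for the test() method above, assuming `learner` is an already-constructed and initialized instance of the agent class this method belongs to:

rewards, actions, frameskips = learner.test(episodes_num=10, deterministic=True)
print("mean test reward: {:0.3f}".format(sum(rewards) / len(rewards)))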
Example #4
    def print_epoch_log(prefix, scores, steps, epoch_time):
        mean_score = np.mean(scores)
        score_std = np.std(scores)
        min_score = np.min(scores)
        max_score = np.max(scores)
        episodes = len(scores)

        steps_per_sec = steps / epoch_time
        mil_steps_per_hour = steps_per_sec * 3600 / 1000000.0
        log("{}: Episodes: {}, mean: {}, min: {}, max: {}, "
            " Speed: {:.0f} STEPS/s, {:.2f}M STEPS/hour, time: {}".format(
                prefix, episodes,
                green("{:0.3f}±{:0.2f}".format(mean_score, score_std)),
                red("{:0.3f}".format(min_score)),
                blue("{:0.3f}".format(max_score)), steps_per_sec,
                mil_steps_per_hour, sec_to_str(epoch_time)))
def train_async(q_learning, settings):
    proto_vizdoom = VizdoomWrapper(noinit=True, **settings)
    actions_num = proto_vizdoom.actions_num
    misc_len = proto_vizdoom.misc_len
    img_shape = proto_vizdoom.img_shape
    del proto_vizdoom

    # TODO: target global network
    # This global step counts gradient applications, not performed actions.
    global_train_step = tf.Variable(0, trainable=False, name="global_step")
    global_learning_rate = tf.train.polynomial_decay(
        name="larning_rate",
        learning_rate=settings["initial_learning_rate"],
        end_learning_rate=settings["final_learning_rate"],
        decay_steps=settings["learning_rate_decay_steps"],
        global_step=global_train_step)
    optimizer = ClippingRMSPropOptimizer(learning_rate=global_learning_rate,
                                         **settings["rmsprop"])

    learners = []
    network_class = eval(settings["network_type"])

    global_network = network_class(actions_num=actions_num,
                                   misc_len=misc_len,
                                   img_shape=img_shape,
                                   **settings)

    global_steps_counter = ThreadsafeCounter()
    if q_learning:
        global_target_network = network_class(thread="global_target",
                                              actions_num=actions_num,
                                              misc_len=misc_len,
                                              img_shape=img_shape,
                                              **settings)
        global_network.prepare_unfreeze_op(global_target_network)
        unfreeze_thread = min(1, settings["threads_num"] - 1)
        for i in range(settings["threads_num"]):
            learner = ADQNLearner(thread_index=i,
                                  global_network=global_network,
                                  unfreeze_thread=i == unfreeze_thread,
                                  global_target_network=global_target_network,
                                  optimizer=optimizer,
                                  learning_rate=global_learning_rate,
                                  global_steps_counter=global_steps_counter,
                                  **settings)
            learners.append(learner)
    else:
        for i in range(settings["threads_num"]):
            learner = A3CLearner(thread_index=i,
                                 global_network=global_network,
                                 optimizer=optimizer,
                                 learning_rate=global_learning_rate,
                                 global_steps_counter=global_steps_counter,
                                 **settings)
            learners.append(learner)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    session = tf.Session(config=config)

    log("Initializing variables...")
    session.run(tf.global_variables_initializer())
    log("Initialization finished.\n")

    if q_learning:
        session.run(global_network.ops.unfreeze)

    log(green("Starting training.\n"))

    for l in learners:
        l.run_training(session)
    for l in learners:
        l.join()
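
A sketch of how train_async() might be launched; the keys below are exactly those read in the function above, the values are placeholders, and VizdoomWrapper or the chosen network class may require additional keys not shown here:

example_settings = {
    "initial_learning_rate": 1e-3,
    "final_learning_rate": 1e-4,
    "learning_rate_decay_steps": int(8e7),
    "rmsprop": {},                # kwargs forwarded to ClippingRMSPropOptimizer
    "network_type": "ACNet",      # assumed class name; must be resolvable by eval()
    "threads_num": 8,
}
train_async(q_learning=False, settings=example_settings)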
Example #6
    def train(self, session):

        # Prefill replay memory:
        for _ in trange(self.replay_memory.capacity, desc="Filling replay memory",
                        leave=False, disable=not self.enable_progress_bar, file=sys.stdout):
            if self.doom_wrapper.is_terminal():
                self.doom_wrapper.reset()
            s1 = self.doom_wrapper.get_current_state()
            action_frameskip_index = randint(0, self.actions_num * len(self.frameskips) - 1)
            action_index, frameskip = self.get_action_and_frameskip(action_frameskip_index)
            reward = self.doom_wrapper.make_action(action_index, frameskip)
            terminal = self.doom_wrapper.is_terminal()
            s2 = self.doom_wrapper.get_current_state()
            self.replay_memory.add_transition(s1, action_frameskip_index, s2, reward, terminal)

        overall_start_time = time()
        self.network.update_target_network(session)
        # Create the saver once, outside the epoch loop, so repeated saving does
        # not keep adding duplicate save/restore ops to the graph.
        saver = tf.train.Saver()

        log(green("Starting training.\n"))
        while self._epoch <= self._epochs:
            self.doom_wrapper.reset()
            train_scores = []
            test_scores = []
            train_start_time = time()

            for _ in trange(self.train_steps_per_epoch, desc="Training, epoch {}".format(self._epoch),
                            leave=False, disable=not self.enable_progress_bar, file=sys.stdout):
                self.steps += 1
                s1 = self.doom_wrapper.get_current_state()

                if random() <= self.get_current_epsilon():
                    action_frameskip_index = randint(0, self.actions_num * len(self.frameskips) - 1)
                    action_index, frameskip = self.get_action_and_frameskip(action_frameskip_index)
                else:
                    action_frameskip_index = self.network.get_action(session, s1)
                    action_index, frameskip = self.get_action_and_frameskip(action_frameskip_index)

                reward = self.doom_wrapper.make_action(action_index, frameskip)
                terminal = self.doom_wrapper.is_terminal()
                s2 = self.doom_wrapper.get_current_state()
                self.replay_memory.add_transition(s1, action_frameskip_index, s2, reward, terminal)

                if self.steps % self.update_pattern[0] == 0:
                    for _ in range(self.update_pattern[1]):
                        self.network.train_batch(session, self.replay_memory.get_sample())

                if terminal:
                    train_scores.append(self.doom_wrapper.get_total_reward())
                    self.doom_wrapper.reset()
                if self.steps % self.frozen_steps == 0:
                    self.network.update_target_network(session)

            train_time = time() - train_start_time

            log("Epoch {}".format(self._epoch))
            log("Training steps: {}, epsilon: {}".format(self.steps, self.get_current_epsilon()))
            self.print_epoch_log("TRAIN", train_scores, self.train_steps_per_epoch, train_time)
            test_start_time = time()
            test_steps = 0
            # TESTING
            for _ in trange(self.test_episodes_per_epoch, desc="Testing, epoch {}".format(self._epoch),
                            leave=False, disable=not self.enable_progress_bar, file=sys.stdout):
                self.doom_wrapper.reset()
                while not self.doom_wrapper.is_terminal():
                    test_steps += 1
                    state = self.doom_wrapper.get_current_state()
                    action_frameskip_index = self.network.get_action(session, state)
                    action_index, frameskip = self.get_action_and_frameskip(action_frameskip_index)
                    self.doom_wrapper.make_action(action_index, frameskip)

                test_scores.append(self.doom_wrapper.get_total_reward())

            test_time = time() - test_start_time

            self.print_epoch_log("TEST", test_scores, test_steps, test_time)

            if self.write_summaries:
                log("Writing summaries.")
                train_summary = session.run(self._summaries, {self.scores_placeholder: train_scores})
                self._train_writer.add_summary(train_summary, self.steps)
                if self._run_tests:
                    test_summary = session.run(self._summaries, {self.scores_placeholder: test_scores})
                    self._test_writer.add_summary(test_summary, self.steps)

            # Save model
            if self._epoch % self.save_interval == 0:
                savedir = os.path.dirname(self._model_savefile)
                if not os.path.exists(savedir):
                    log("Creating directory: {}".format(savedir))
                    os.makedirs(savedir)
                log("Saving model to: {}".format(self._model_savefile))
                saver.save(session, self._model_savefile)

            overall_time = time() - overall_start_time
            log("Total elapsed time: {}\n".format(sec_to_str(overall_time)))
            self._epoch += 1