Example #1
    def __init__(self, env: Env, params: dict, model_path: str, log_path: str):
        """Initialize.

        :param env: gym environment. Assuming observation space is a tuple,
            where first component is from original env, and the second is
            temporal goal state.
        :param params: dict of parameters, like `default_parameters`
            (a sketch of such a dict follows this method).
        :param model_path: directory where to save models.
        :param log_path: directory where to save tensorboard logs.
        """
        # Check
        if params["initialize_file"]:
            raise ValueError(
                "Initialization not supported; use resuming option")
        if params["action_bias"]:
            raise ValueError("Action bias is not maintained here")

        # Alias
        original_env = env

        # Load a saved agent for the action bias
        self.biased_agent: Optional[DQN] = None
        if params["action_bias"]:
            loading_params = dict(params)
            loading_params["resume_file"] = params["action_bias"]
            loading_params["action_bias"] = None

            self.biased_agent = TrainStableBaselines(
                env=env,
                params=loading_params,
                model_path=model_path,
                log_path=log_path,
            ).model

        # Collect statistics
        #    (assuming future wrappers do not modify episodes)
        env = MyStatsRecorder(env=env, gamma=params["gamma"])

        # Callbacks
        checkpoint_callback = CustomCheckpointCallback(
            save_path=model_path,
            save_freq=params["save_freq"],
            extra=None,
        )
        stats_logger_callback = StatsLoggerCallback(stats_recorder=env,
                                                    scope="env0")

        callbacks_list = [checkpoint_callback, stats_logger_callback]
        if params["render"]:
            renderer_callback = RendererCallback()
            callbacks_list.append(renderer_callback)

        # If also training a passive agent, log its statistics too
        if params["active_passive_agents"]:

            # Find the reward shaping env
            reward_shaping_env = find_wrapper(env, RewardShapingWrapper)

            passive_stats_env = MyStatsRecorder(
                env=UnshapedEnv(reward_shaping_env),
                gamma=params["gamma"],
            )

            passive_stats_callback = StatsLoggerCallback(
                stats_recorder=passive_stats_env,
                scope="env1",
            )
            callbacks_list.append(passive_stats_callback)

            # Make it move with the original env
            env = UnshapedEnvWrapper(
                shaped_env=env,
                unshaped_env=passive_stats_env,
            )
            original_reward_getter = env.get_reward  # alias
        else:
            original_reward_getter = None

        # Combine callbacks
        all_callbacks = CallbackList(callbacks_list)

        # Define or load
        resuming = bool(params["resume_file"])
        if not resuming:
            # Normalizer
            normalized_env = NormalizeEnvWrapper(
                env=env,
                training=True,
                entry=0,  # Only env features, not temporal goal state
            )
            flat_env = BoxAutomataStates(normalized_env)
            # Saving normalizer too
            checkpoint_callback.saver.extra_model = normalized_env

            # Agent
            model = DQN(
                env=flat_env,
                policy=ModularPolicy,
                policy_kwargs={
                    "layer_norm": params["layer_norm"],
                    "layers": params["layers"],
                    "shared_layers": params["shared_layers"],
                    "dueling": params["dueling"],
                },
                gamma=params["gamma"],
                learning_rate=params["learning_rate"],
                train_freq=params["train_freq"],
                double_q=True,
                batch_size=params["batch_size"],
                buffer_size=params["buffer_size"],
                learning_starts=params["learning_starts"],
                prioritized_replay=True,
                target_network_update_freq=params[
                    "target_network_update_freq"],
                exploration_fraction=params["exploration_fraction"],
                exploration_final_eps=params["exploration_final_eps"],
                exploration_initial_eps=params["exploration_initial_eps"],
                active_passive_agents=params["active_passive_agents"],
                passive_reward_getter=original_reward_getter,
                tensorboard_log=log_path,
                full_tensorboard_log=False,
                verbose=1,
            )
        else:
            # Reload model
            model, extra_model, counters = checkpoint_callback.load(
                path=params["resume_file"])

            # Restore normalizer and env
            normalized_env = extra_model
            normalized_env.set_env(env)
            flat_env = BoxAutomataStates(normalized_env)

            # Restore properties
            model.tensorboard_log = log_path
            model.num_timesteps = counters["step"]
            model.learning_starts = params["learning_starts"] + counters["step"]
            model.set_env(flat_env)
            model.passive_reward_getter = original_reward_getter

        # Store
        self.params = params
        self.resuming = resuming
        self.saver = checkpoint_callback
        self.logger = stats_logger_callback
        self.callbacks = all_callbacks
        self.model: DQN = model
        self.normalized_env = normalized_env
        self.testing_agent = (
            model if not params["test_passive"] else model.passive_agent)
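
# A minimal usage sketch (not part of the source). It assumes the
# __init__ above belongs to the TrainStableBaselines class referenced
# earlier in this example, a Gym environment `my_env` whose observations
# are (env_features, temporal_goal_state) tuples, and hypothetical
# directory names and parameter values; the dict keys mirror those read
# in __init__.
#
#   example_params = {
#       "initialize_file": None,
#       "action_bias": None,
#       "resume_file": None,
#       "render": False,
#       "active_passive_agents": False,
#       "test_passive": False,
#       "gamma": 0.99,
#       "save_freq": 10000,
#       "layer_norm": True,
#       "layers": [64, 64],
#       "shared_layers": 0,
#       "dueling": True,
#       "learning_rate": 5e-4,
#       "train_freq": 1,
#       "batch_size": 32,
#       "buffer_size": 50000,
#       "learning_starts": 1000,
#       "target_network_update_freq": 500,
#       "exploration_fraction": 0.1,
#       "exploration_initial_eps": 1.0,
#       "exploration_final_eps": 0.02,
#   }
#   trainer = TrainStableBaselines(
#       env=my_env,
#       params=example_params,
#       model_path="models/",
#       log_path="logs/",
#   )
#   trainer.model.learn(total_timesteps=100000, callback=trainer.callbacks)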

Example #2

# Note: this snippet starts mid-way through a DQN constructor call; its
# leading arguments (policy, environment, etc.), as well as the
# definitions of CASE_NAME, env, time_steps, and callback used below,
# are not shown in the source.
model = DQN(
    # ... leading arguments truncated in the source ...
    n_cpu_tf_sess=256,
    buffer_size=20000,
    gamma=0.95,
    batch_size=512)

# Timestep count of a previously saved checkpoint to resume from (0 = train from scratch)
load_steps = 0

if load_steps > 0:
    tmp_path = os.path.join('./tmp/%s' % CASE_NAME, "%d" % load_steps)
    del model

    # Reload the saved checkpoint and continue from its timestep counter
    model = DQN.load(tmp_path,
                     learning_rate=0.00025,
                     env=env,
                     verbose=1,
                     tensorboard_log="./dqn_%s_tensorboard/" % CASE_NAME)
    model.num_timesteps = load_steps

model.learn(total_timesteps=int(time_steps), callback=callback)
model.save("dqn_%s" % CASE_NAME)

del model  # remove to demonstrate saving and loading

model = DQN.load("dqn_%s" % CASE_NAME)

# Run the trained agent in the environment
obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
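
# A variant of the evaluation loop above with an explicit episode reset
# (an assumption: only needed when `env` is a plain, non-vectorized Gym
# environment; stable-baselines vectorized environments reset themselves
# automatically, as the plural `dones` above suggests):
#
#   obs = env.reset()
#   while True:
#       action, _states = model.predict(obs, deterministic=True)
#       obs, reward, done, info = env.step(action)
#       env.render()
#       if done:
#           obs = env.reset()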