Code example #1: evaluating a single PPO checkpoint (eval)
    def eval(self, checkpoint_path):
        r"""Evaluates a single checkpoint.

        Args:
            checkpoint_path: path of the checkpoint to evaluate

        Returns:
            None
        """
        self.device = (torch.device("cuda", self.config.TORCH_GPU_ID)
                       if torch.cuda.is_available() else torch.device("cpu"))
        # Map location CPU is almost always better than mapping to a CUDA device.
        ckpt_dict = self.load_checkpoint(checkpoint_path, map_location="cpu")

        if self.config.EVAL.USE_CKPT_CONFIG:
            config = self._setup_eval_config(ckpt_dict["config"])
        else:
            config = self.config.clone()

        ppo_cfg = config.RL.PPO

        config.defrost()
        config.TASK_CONFIG.DATASET.SPLIT = config.EVAL.SPLIT
        config.freeze()

        if len(self.config.VIDEO_OPTION) > 0:
            config.defrost()
            config.TASK_CONFIG.TASK.MEASUREMENTS.append("TOP_DOWN_MAP")
            config.TASK_CONFIG.TASK.MEASUREMENTS.append("COLLISIONS")
            config.freeze()

        self.env = construct_envs(config, get_env_class(config.ENV_NAME))
        self._setup_actor_critic_agent(ppo_cfg)

        self.agent.load_state_dict(ckpt_dict["state_dict"])
        self.actor_critic = self.agent.actor_critic

        # get name of performance metric, e.g. "spl"
        metric_name = self.config.TASK_CONFIG.TASK.MEASUREMENTS[0]
        metric_cfg = getattr(self.config.TASK_CONFIG.TASK, metric_name)
        measure_type = baseline_registry.get_measure(metric_cfg.TYPE)
        assert measure_type is not None, "invalid measurement type {}".format(
            metric_cfg.TYPE)
        self.metric_uuid = measure_type(sim=None, task=None,
                                        config=None)._get_uuid()

        observations = self.env.reset()
        batch = batch_obs(observations, self.device)

        current_episode_reward = torch.zeros(self.env.num_envs,
                                             1,
                                             device=self.device)

        test_recurrent_hidden_states = torch.zeros(
            self.actor_critic.net.num_recurrent_layers,
            self.config.NUM_PROCESSES,
            ppo_cfg.hidden_size,
            device=self.device,
        )
        prev_actions = torch.zeros(self.config.NUM_PROCESSES,
                                   1,
                                   device=self.device,
                                   dtype=torch.long)
        not_done_masks = torch.zeros(self.config.NUM_PROCESSES,
                                     1,
                                     device=self.device)
        stats_episodes = dict()  # dict of dicts that stores stats per episode

        rgb_frames = [
            [] for _ in range(self.config.NUM_PROCESSES)
        ]  # type: List[List[np.ndarray]]
        if len(self.config.VIDEO_OPTION) > 0:
            os.makedirs(self.config.VIDEO_DIR, exist_ok=True)

        self.actor_critic.eval()
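        # Roll out episodes until TEST_EPISODE_COUNT episodes have finished or
        # every environment has been paused.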
        while (len(stats_episodes) < self.config.TEST_EPISODE_COUNT
               and self.env.num_envs > 0):
            current_episodes = self.env.current_episodes()

            with torch.no_grad():
                (
                    _,
                    actions,
                    _,
                    test_recurrent_hidden_states,
                ) = self.actor_critic.act(
                    batch,
                    test_recurrent_hidden_states,
                    prev_actions,
                    not_done_masks,
                    deterministic=False,
                )

                prev_actions.copy_(actions)

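            # Step every environment with its sampled action; each env returns
            # an (observations, reward, done, info) tuple.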
            outputs = self.env.step([a[0].item() for a in actions])

            observations, rewards, dones, infos = [
                list(x) for x in zip(*outputs)
            ]
            batch = batch_obs(observations, self.device)

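            # Mask convention: 0.0 marks an episode that just ended, 1.0 one
            # that is still running.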
            not_done_masks = torch.tensor(
                [[0.0] if done else [1.0] for done in dones],
                dtype=torch.float,
                device=self.device,
            )

            rewards = torch.tensor(rewards,
                                   dtype=torch.float,
                                   device=self.device).unsqueeze(1)
            current_episode_reward += rewards
            next_episodes = self.env.current_episodes()
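            # Pause environments whose upcoming episode has already been
            # evaluated, so they do not contribute duplicate statistics.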
            envs_to_pause = []
            n_envs = self.env.num_envs
            for i in range(n_envs):
                if (
                        next_episodes[i].scene_id,
                        next_episodes[i].episode_id,
                ) in stats_episodes:
                    envs_to_pause.append(i)

                # episode ended
                if not_done_masks[i].item() == 0:
                    episode_stats = dict()
                    episode_stats[self.metric_uuid] = infos[i][
                        self.metric_uuid]
                    episode_stats["success"] = int(
                        infos[i][self.metric_uuid] > 0)
                    episode_stats["reward"] = current_episode_reward[i].item()
                    current_episode_reward[i] = 0
                    # use scene_id + episode_id as unique id for storing stats
                    stats_episodes[(
                        current_episodes[i].scene_id,
                        current_episodes[i].episode_id,
                    )] = episode_stats

                    if len(self.config.VIDEO_OPTION) > 0:
                        generate_video(
                            video_option=self.config.VIDEO_OPTION,
                            video_dir=self.config.VIDEO_DIR,
                            images=rgb_frames[i],
                            episode_id=current_episodes[i].episode_id,
                            checkpoint_idx=0,
                            metric_name=self.metric_uuid,
                            metric_value=infos[i][self.metric_uuid],
                        )

                        rgb_frames[i] = []

                # episode continues
                elif len(self.config.VIDEO_OPTION) > 0:
                    frame = observations_to_image(observations[i], infos[i])
                    rgb_frames[i].append(frame)

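            # Remove paused environments from every rolling tensor/list so that
            # indices stay aligned with the environments still running.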
            (
                self.env,
                test_recurrent_hidden_states,
                not_done_masks,
                current_episode_reward,
                prev_actions,
                batch,
                rgb_frames,
            ) = self._pause_envs(
                envs_to_pause,
                self.env,
                test_recurrent_hidden_states,
                not_done_masks,
                current_episode_reward,
                prev_actions,
                batch,
                rgb_frames,
            )

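        # Average the collected per-episode stats over all evaluated episodes.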
        aggregated_stats = dict()
        for stat_key in next(iter(stats_episodes.values())).keys():
            aggregated_stats[stat_key] = sum(
                [v[stat_key] for v in stats_episodes.values()])
        num_episodes = len(stats_episodes)

        episode_reward_mean = aggregated_stats["reward"] / num_episodes
        episode_metric_mean = aggregated_stats[self.metric_uuid] / num_episodes
        episode_success_mean = aggregated_stats["success"] / num_episodes

        print(f"Average episode reward: {episode_reward_mean:.6f}")
        print(f"Average episode success: {episode_success_mean:.6f}")
        print(f"Average episode {self.metric_uuid}: {episode_metric_mean:.6f}")

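        # Training step stored with the checkpoint (kept for reference; not
        # used by the prints below).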
        if "extra_state" in ckpt_dict and "step" in ckpt_dict["extra_state"]:
            step_id = ckpt_dict["extra_state"]["step"]

        print("eval_reward", {"average reward": episode_reward_mean})
        print(
            f"eval_{self.metric_uuid}",
            {f"average {self.metric_uuid}": episode_metric_mean},
        )
        print("eval_success", {"average success": episode_success_mean})

        self.env.close()
Code example #2: collecting one rollout step (_collect_rollout_step)
    def _collect_rollout_step(self, rollouts, current_episode_reward,
                              episode_rewards, episode_counts, episode_dist):
        pth_time = 0.0
        env_time = 0.0

        t_sample_action = time.time()
        # sample actions
        with torch.no_grad():
            step_observation = {
                k: v[rollouts.step]
                for k, v in rollouts.observations.items()
            }

            (
                values,
                actions,
                actions_log_probs,
                recurrent_hidden_states,
            ) = self.actor_critic.act(
                step_observation,
                rollouts.recurrent_hidden_states[rollouts.step],
                rollouts.prev_actions[rollouts.step],
                rollouts.masks[rollouts.step],
            )

        pth_time += time.time() - t_sample_action

        t_step_env = time.time()

        outputs = self.env.step([a[0].item() for a in actions])
        # dones = [False]
        # if outputs['pointgoal_with_gps_compass'][0] < 0.2:
        #     dones = [True]
        # rewards = -1 * (outputs['pointgoal_with_gps_compass'][0])
        # observations = [outputs]
        observations, rewards, dones, infos = [list(x) for x in zip(*outputs)]

        env_time += time.time() - t_step_env

        t_update_stats = time.time()
        batch = batch_obs(observations)
        rewards = torch.tensor(rewards,
                               dtype=torch.float,
                               device=episode_rewards.device)
        rewards = rewards.unsqueeze(1)

        masks = torch.tensor(
            [[0.0] if done else [1.0] for done in dones],
            dtype=torch.float,
            device=episode_rewards.device,
        )

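        # Accumulate per-env rewards; when an episode ends (mask == 0) its
        # return is folded into episode_rewards/episode_counts and the
        # accumulator is reset.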
        current_episode_reward += rewards
        episode_rewards += (1 - masks) * current_episode_reward
        episode_counts += 1 - masks
        current_episode_reward *= masks
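        # Record the first environment's current distance-to-goal reading; it
        # is returned to the caller for logging.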
        episode_dist = observations[0]['pointgoal_with_gps_compass'][0]

        if self._static_encoder:
            with torch.no_grad():
                batch["visual_features"] = self._encoder(batch)

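        # Store this transition in the rollout buffer for the next PPO update.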
        rollouts.insert(
            batch,
            recurrent_hidden_states,
            actions,
            actions_log_probs,
            values,
            rewards,
            masks,
        )

        pth_time += time.time() - t_update_stats

        return pth_time, env_time, 1, episode_dist
Code example #3: PPO training loop (train) and a random-action baseline (run)
    def train(self, env) -> None:
        r"""Main method for training PPO.

        Args:
            env: vectorized environment to train in (constructed by the caller)

        Returns:
            None
        """
        self.env = env
        ppo_cfg = self.config.RL.PPO
        self.device = (torch.device("cuda", self.config.TORCH_GPU_ID)
                       if torch.cuda.is_available() else torch.device("cpu"))
        if not os.path.isdir(self.config.CHECKPOINT_FOLDER):
            os.makedirs(self.config.CHECKPOINT_FOLDER)
        self._setup_actor_critic_agent(ppo_cfg)

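        # Rollout buffer holding num_steps transitions for every environment.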
        rollouts = RolloutStorage(
            ppo_cfg.num_steps,
            self.env.num_envs,
            self.env.observation_spaces[0],
            self.env.action_spaces[0],
            ppo_cfg.hidden_size,
        )
        rollouts.to(self.device)

        observations = self.env.reset()
        batch = batch_obs(observations)

        for sensor in rollouts.observations:
            rollouts.observations[sensor][0].copy_(batch[sensor])

        # batch and observations may contain shared PyTorch CUDA
        # tensors.  We must explicitly clear them here otherwise
        # they will be kept in memory for the entire duration of training!
        batch = None
        observations = None

        episode_rewards = torch.zeros(self.env.num_envs, 1)
        episode_counts = torch.zeros(self.env.num_envs, 1)
        episode_dist = torch.zeros(self.env.num_envs, 1)
        current_episode_reward = torch.zeros(self.env.num_envs, 1)
        # window_episode_reward = deque(maxlen=ppo_cfg.reward_window_size)
        # window_episode_counts = deque(maxlen=ppo_cfg.reward_window_size)

        window_episode_reward = deque(maxlen=self.config.NUM_UPDATES)
        window_episode_counts = deque(maxlen=self.config.NUM_UPDATES)
        dist_val = deque(maxlen=self.config.NUM_UPDATES)

        t_start = time.time()
        env_time = 0
        pth_time = 0
        count_steps = 0
        count_checkpoints = 0

        lr_scheduler = LambdaLR(
            optimizer=self.agent.optimizer,
            lr_lambda=lambda x: linear_decay(x, self.config.NUM_UPDATES),
        )

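        # Main PPO loop: collect ppo_cfg.num_steps transitions per environment,
        # then run one agent update, logging and checkpointing periodically.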
        for update in range(self.config.NUM_UPDATES):
            if ppo_cfg.use_linear_lr_decay:
                lr_scheduler.step()

            if ppo_cfg.use_linear_clip_decay:
                self.agent.clip_param = ppo_cfg.clip_param * linear_decay(
                    update, self.config.NUM_UPDATES)

            for step in range(ppo_cfg.num_steps):
                (
                    delta_pth_time,
                    delta_env_time,
                    delta_steps,
                    tmp_dist,
                ) = self._collect_rollout_step(
                    rollouts,
                    current_episode_reward,
                    episode_rewards,
                    episode_counts,
                    episode_dist,
                )
                pth_time += delta_pth_time
                env_time += delta_env_time
                count_steps += delta_steps
                episode_dist = tmp_dist

            (
                delta_pth_time,
                value_loss,
                action_loss,
                dist_entropy,
            ) = self._update_agent(ppo_cfg, rollouts)
            pth_time += delta_pth_time

            window_episode_reward.append(episode_rewards.clone())
            window_episode_counts.append(episode_counts.clone())
            dist_val.append(episode_dist)

            losses = [value_loss, action_loss]
            stats = zip(
                ["count", "reward"],
                [window_episode_counts, window_episode_reward],
            )
            deltas = {
                k: ((v[-1] -
                     v[0]).sum().item() if len(v) > 1 else v[0].sum().item())
                for k, v in stats
            }
            deltas["count"] = max(deltas["count"], 1.0)

            print("reward", deltas["reward"] / deltas["count"], count_steps)

            print(
                "losses",
                {k: l
                 for l, k in zip(losses, ["value", "policy"])},
                count_steps,
            )

            # log stats
            if update > 0 and update % self.config.LOG_INTERVAL == 0:
                print("update: {}\tfps: {:.3f}\t".format(
                    update, count_steps / (time.time() - t_start)))

                print("update: {}\tenv-time: {:.3f}s\tpth-time: {:.3f}s\t"
                      "frames: {}".format(update, env_time, pth_time,
                                          count_steps))

                window_rewards = (window_episode_reward[-1] -
                                  window_episode_reward[0]).sum()
                window_counts = (window_episode_counts[-1] -
                                 window_episode_counts[0]).sum()

                if window_counts > 0:
                    print("Average window size {} reward: {:.3f}".format(
                        len(window_episode_reward),
                        (window_rewards / window_counts).item(),
                    ))
                else:
                    print("No episodes finished in the current window")

            # checkpoint model
            if update % self.config.CHECKPOINT_INTERVAL == 0:
                self.save_checkpoint(f"ckpt.{count_checkpoints}.pth",
                                     dict(step=count_steps))
                count_checkpoints += 1

        # Each deque entry is a (num_envs, 1) tensor; stack into a 2-D
        # (num_updates, num_envs) array so that np.savetxt accepts it.
        np.savetxt("window_episode_reward_ppo.csv",
                   torch.stack(list(window_episode_reward)).squeeze(-1).numpy(),
                   delimiter=",")
        np.savetxt("window_episode_counts_ppo.csv",
                   torch.stack(list(window_episode_counts)).squeeze(-1).numpy(),
                   delimiter=",")
        # dist_val holds the per-update distance-to-goal readings collected above.
        np.savetxt("episode_dist_ppo.csv", np.asarray(list(dist_val)), delimiter=",")

        self.env.close()
def run(config, env, max_steps):
    r"""Rolls out a random-action baseline in the given environment for
    max_steps updates, recording rewards, distances to goal, and videos.

    Returns:
        None
    """

    observations = env.reset()
    batch = batch_obs(observations)

    batch = None
    observations = None

    episode_rewards = torch.zeros(env.num_envs, 1)
    episode_counts = torch.zeros(env.num_envs, 1)
    episode_dist = torch.zeros(env.num_envs, 1)
    current_episode_reward = torch.zeros(env.num_envs, 1)

    window_episode_reward = deque(maxlen=max_steps)
    window_episode_counts = deque(maxlen=max_steps)
    dist_val = deque(maxlen=max_steps)

    t_start = time.time()
    env_time = 0
    pth_time = 0
    count_steps = 0
    count_checkpoints = 0

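    # Each "update" rolls out full episodes with randomly sampled actions,
    # accumulating reward and distance-to-goal statistics plus frames for a
    # video.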
    for update in range(max_steps):
        print(update)
        reward_sum = 0
        dist_sum = 0
        iter = 0
        rgb_frames = []
        if len(config.VIDEO_OPTION) > 0:
            os.makedirs(config.VIDEO_DIR, exist_ok=True)

        # get name of performance metric, e.g. "spl"
        metric_name = config.TASK_CONFIG.TASK.MEASUREMENTS[0]
        metric_cfg = getattr(config.TASK_CONFIG.TASK, metric_name)
        measure_type = baseline_registry.get_measure(metric_cfg.TYPE)

        for step in range(500):
            dones = [False]
            while not dones[0]:
                outputs = env.step([env.action_spaces[0].sample()])
                observations, rewards, dones, infos = [
                    list(x) for x in zip(*outputs)
                ]
                reward_sum += rewards[0]
                dist_sum += observations[0]['pointgoal_with_gps_compass'][0]
                iter += 1

                frame = observations_to_image(observations[0], [])
                rgb_frames.append(frame)

        observations = env.reset()
        window_episode_reward.append(reward_sum / iter)
        window_episode_counts.append(iter)
        dist_val.append(dist_sum / iter)

        generate_video(
            video_option=config.VIDEO_OPTION,
            video_dir=config.VIDEO_DIR,
            images=np.array(rgb_frames),
            episode_id=update,
            checkpoint_idx=0,
            metric_name="spl",
            metric_value=1.0,
        )

        rgb_frames = []

    np.savetxt("window_episode_reward_ppo.csv",
               window_episode_reward,
               delimiter=",")
    np.savetxt("window_episode_counts_ppo.csv",
               window_episode_counts,
               delimiter=",")
    # dist_val holds the per-update average distance-to-goal values computed above.
    np.savetxt("episode_dist_ppo.csv", np.asarray(list(dist_val)), delimiter=",")

    env.close()
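
For reference, a minimal driver sketch for the random-action baseline above, assuming the same habitat-baselines helpers already used in code example #1 (get_config, construct_envs, get_env_class); the import paths, config file, and max_steps value are assumptions and may need adjusting to your habitat-baselines version.

# Hypothetical usage sketch -- paths and values are illustrative, and the
# import locations assume a habitat-baselines version matching the code above.
from habitat_baselines.config.default import get_config
from habitat_baselines.common.env_utils import construct_envs
from habitat_baselines.common.environments import get_env_class

config = get_config("habitat_baselines/config/pointnav/ppo_pointnav.yaml")
envs = construct_envs(config, get_env_class(config.ENV_NAME))
run(config, envs, max_steps=10)  # run() closes the environments when finished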