Example #1
def observations_to_image(observation: Dict, info: Dict) -> np.ndarray:
    r"""Generate image of single frame from observation and info
    returned from a single environment step().

    Args:
        observation: observation returned from an environment step().
        info: info returned from an environment step().

    Returns:
        generated image of a single frame.
    """
    egocentric_view = []
    if "rgb" in observation:
        observation_size = observation["rgb"].shape[0]
        egocentric_view.append(observation["rgb"][:, :, :3])

    # draw depth map if observation has depth info
    if "depth" in observation:
        observation_size = observation["depth"].shape[0]
        depth_map = (observation["depth"].squeeze() * 255).astype(np.uint8)
        depth_map = np.stack([depth_map for _ in range(3)], axis=2)
        egocentric_view.append(depth_map)

    assert (len(egocentric_view) >
            0), "Expected at least one visual sensor enabled."
    egocentric_view = np.concatenate(egocentric_view, axis=1)

    # draw collision
    if "collisions" in info and info["collisions"]["is_collision"]:
        egocentric_view = draw_collision(egocentric_view)

    frame = egocentric_view

    if "top_down_map" in info:
        top_down_map = info["top_down_map"]["map"]
        top_down_map = maps.colorize_topdown_map(
            top_down_map, info["top_down_map"]["fog_of_war_mask"])
        map_agent_pos = info["top_down_map"]["agent_map_coord"]
        top_down_map = maps.draw_agent(
            image=top_down_map,
            agent_center_coord=map_agent_pos,
            agent_rotation=info["top_down_map"]["agent_angle"],
            agent_radius_px=top_down_map.shape[0] // 16,
        )

        if top_down_map.shape[0] > top_down_map.shape[1]:
            top_down_map = np.rot90(top_down_map, 1)

        # scale top down map to align with rgb view
        old_h, old_w, _ = top_down_map.shape
        top_down_height = observation_size
        top_down_width = int(float(top_down_height) / old_h * old_w)
        # cv2 resize (dsize is width first)
        top_down_map = cv2.resize(
            top_down_map,
            (top_down_width, top_down_height),
            interpolation=cv2.INTER_CUBIC,
        )
        frame = np.concatenate((egocentric_view, top_down_map), axis=1)
    return frame
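A minimal usage sketch (not part of the original example): collecting one annotated frame per step of an episode. It assumes a habitat Env named `env` with RGB/depth sensors enabled and with the TOP_DOWN_MAP and COLLISIONS measures in the task config, so that `info` contains the keys used above.

# Hypothetical usage sketch, assuming `env` is a habitat.Env configured as described.
frames = []
observation = env.reset()
while not env.episode_over:
    observation = env.step(env.action_space.sample())  # random actions, purely for illustration
    info = env.get_metrics()
    frames.append(observations_to_image(observation, info))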
Example #2
def draw_top_down_map(info, heading, output_size):
    """Generates a map that displays the state of the agent in the given environment,
    for the current frame.

    Args:
        info: environment info for current frame.
        heading: where the agent heads toward.
        output_size: height of output map.
    Returns:
        the output_size x width x 1 map
    """
    top_down_map = maps.colorize_topdown_map(
        info["top_down_map"]["map"], info["top_down_map"]["fog_of_war_mask"])
    original_map_size = top_down_map.shape[:2]
    map_scale = np.array(
        (1, original_map_size[1] * 1.0 / original_map_size[0]))
    new_map_size = np.round(output_size * map_scale).astype(np.int32)
    # OpenCV expects w, h but map size is in h, w
    top_down_map = cv2.resize(top_down_map, (new_map_size[1], new_map_size[0]))

    map_agent_pos = info["top_down_map"]["agent_map_coord"]
    map_agent_pos = np.round(map_agent_pos * new_map_size /
                             original_map_size).astype(np.int32)
    top_down_map = maps.draw_agent(
        top_down_map,
        map_agent_pos,
        heading - np.pi / 2,
        agent_radius_px=top_down_map.shape[0] / 40,
    )
    return top_down_map
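A hedged usage sketch for the function above: the heading is read from habitat's HEADING_SENSOR observation and the map height is matched to the RGB frame. `env` is assumed to be a habitat Env configured with both sensors and the TOP_DOWN_MAP measure.

# Hypothetical usage sketch; sensor/measure names are assumptions about the config.
observations = env.reset()
info = env.get_metrics()
map_image = draw_top_down_map(
    info, observations["heading"][0], observations["rgb"].shape[0])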
Example #3
def draw_top_down_map(info):
    top_down_map = info["top_down_map"]["map"]

    top_down_map = maps.colorize_topdown_map(top_down_map)
    map_agent_pos = info["top_down_map"]["agent_map_coord"]
    top_down_map = maps.draw_agent(
        image=top_down_map,
        agent_center_coord=map_agent_pos,
        agent_rotation=info["top_down_map"]["agent_angle"],
        agent_radius_px=top_down_map.shape[0] // 25,
    )

    return top_down_map
Example #4
def plot_top_down_map(info, dataset='replica', pred=None):
    top_down_map = info["top_down_map"]["map"]
    top_down_map = maps.colorize_topdown_map(
        top_down_map, info["top_down_map"]["fog_of_war_mask"])
    map_agent_pos = info["top_down_map"]["agent_map_coord"]
    if dataset == 'replica':
        agent_radius_px = top_down_map.shape[0] // 16
    else:
        agent_radius_px = top_down_map.shape[0] // 50
    top_down_map = maps.draw_agent(
        image=top_down_map,
        agent_center_coord=map_agent_pos,
        agent_rotation=info["top_down_map"]["agent_angle"],
        agent_radius_px=agent_radius_px)
    if pred is not None:
        from habitat.utils.geometry_utils import quaternion_rotate_vector

        source_rotation = info["top_down_map"]["agent_rotation"]

        rounded_pred = np.round(pred[1])
        direction_vector_agent = np.array(
            [rounded_pred[1], 0, -rounded_pred[0]])
        direction_vector = quaternion_rotate_vector(source_rotation,
                                                    direction_vector_agent)

        grid_size = (
            (maps.COORDINATE_MAX - maps.COORDINATE_MIN) / 10000,
            (maps.COORDINATE_MAX - maps.COORDINATE_MIN) / 10000,
        )
        delta_x = int(-direction_vector[0] / grid_size[0])
        delta_y = int(direction_vector[2] / grid_size[1])

        x = np.clip(map_agent_pos[0] + delta_x,
                    a_min=0,
                    a_max=top_down_map.shape[0])
        y = np.clip(map_agent_pos[1] + delta_y,
                    a_min=0,
                    a_max=top_down_map.shape[1])
        point_padding = 20
        for m in range(x - point_padding, x + point_padding + 1):
            for n in range(y - point_padding, y + point_padding + 1):
                if np.linalg.norm(np.array([m - x, n - y])) <= point_padding and \
                        0 <= m < top_down_map.shape[0] and 0 <= n < top_down_map.shape[1]:
                    top_down_map[m, n] = (0, 255, 255)
        if np.linalg.norm(rounded_pred) < 1:
            assert delta_x == 0 and delta_y == 0

    if top_down_map.shape[0] > top_down_map.shape[1]:
        top_down_map = np.rot90(top_down_map, 1)
    return top_down_map
Example #5
def plot_top_down_map(info, dataset='replica'):
    top_down_map = info["top_down_map"]["map"]
    top_down_map = maps.colorize_topdown_map(
        top_down_map, info["top_down_map"]["fog_of_war_mask"])
    map_agent_pos = info["top_down_map"]["agent_map_coord"]
    if dataset == 'replica':
        agent_radius_px = top_down_map.shape[0] // 16
    else:
        agent_radius_px = top_down_map.shape[0] // 50
    top_down_map = maps.draw_agent(
        image=top_down_map,
        agent_center_coord=map_agent_pos,
        agent_rotation=info["top_down_map"]["agent_angle"],
        agent_radius_px=agent_radius_px)

    if top_down_map.shape[0] > top_down_map.shape[1]:
        top_down_map = np.rot90(top_down_map, 1)
    return top_down_map
Example #6
def draw_top_down_map(info, heading, output_size):
    top_down_map = maps.colorize_topdown_map(info["top_down_map"]["map"])
    original_map_size = top_down_map.shape[:2]
    map_scale = np.array(
        (1, original_map_size[1] * 1.0 / original_map_size[0]))
    new_map_size = np.round(output_size * map_scale).astype(np.int32)
    # OpenCV expects w, h but map size is in h, w
    top_down_map = cv2.resize(top_down_map, (new_map_size[1], new_map_size[0]))

    map_agent_pos = info["top_down_map"]["agent_map_coord"]
    map_agent_pos = np.round(map_agent_pos * new_map_size /
                             original_map_size).astype(np.int32)
    top_down_map = maps.draw_agent(
        top_down_map,
        map_agent_pos,
        heading - np.pi / 2,
        agent_radius_px=top_down_map.shape[0] / 40,
    )
    return top_down_map
Example #7
def topdown_to_image(topdown_info: Dict) -> np.ndarray:
    r"""Convert top-down map info (the "top_down_map" entry of the environment
    info) to an RGB image.
    """
    top_down_map = topdown_info["map"]
    fog_of_war_mask = topdown_info["fog_of_war_mask"]
    top_down_map = maps.colorize_topdown_map(top_down_map, fog_of_war_mask)
    map_agent_pos = topdown_info["agent_map_coord"]

    # Add zero padding
    min_map_size = 200
    if top_down_map.shape[0] != top_down_map.shape[1]:
        H = top_down_map.shape[0]
        W = top_down_map.shape[1]
        if H > W:
            pad_value = (H - W) // 2
            padding = ((0, 0), (pad_value, pad_value), (0, 0))
            map_agent_pos = (map_agent_pos[0], map_agent_pos[1] + pad_value)
        else:
            pad_value = (W - H) // 2
            padding = ((pad_value, pad_value), (0, 0), (0, 0))
            map_agent_pos = (map_agent_pos[0] + pad_value, map_agent_pos[1])
        top_down_map = np.pad(top_down_map,
                              padding,
                              mode="constant",
                              constant_values=255)

    if top_down_map.shape[0] < min_map_size:
        H, W = top_down_map.shape[:2]
        top_down_map = cv2.resize(top_down_map, (min_map_size, min_map_size))
        map_agent_pos = (
            int(map_agent_pos[0] * min_map_size // H),
            int(map_agent_pos[1] * min_map_size // W),
        )
    top_down_map = maps.draw_agent(
        image=top_down_map,
        agent_center_coord=map_agent_pos,
        agent_rotation=topdown_info["agent_angle"],
        agent_radius_px=top_down_map.shape[0] // 16,
    )

    return top_down_map
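A small usage sketch, under the same assumptions as the earlier examples: the "top_down_map" entry of the environment info is converted to a square image and written to disk. imageio is used here only for illustration.

# Hypothetical usage sketch, assuming the TOP_DOWN_MAP measure is enabled.
import imageio

info = env.get_metrics()
imageio.imwrite("top_down.png", topdown_to_image(info["top_down_map"]))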
Example #8
def observations_to_image(observation: Dict, info: Dict) -> np.ndarray:
    r"""Generate image of single frame from observation and info
    returned from a single environment step().

    Args:
        observation: observation returned from an environment step().
        info: info returned from an environment step().

    Returns:
        generated image of a single frame.
    """
    observation_size = observation["rgb"].shape[0]
    egocentric_view = observation["rgb"][:, :, -3:]
    # draw collision
    if "collisions" in info and info["collisions"]["is_collision"]:
        egocentric_view = draw_collision(egocentric_view)

    if "goal_coord_in_camera" in observation:
        _, _, _, xpx, ypx = observation["goal_coord_in_camera"]

        if xpx != -1 and ypx != -1:

            xpx = int(xpx * observation_size + observation_size / 2)
            ypx = int(ypx * observation_size + observation_size / 2)

            egocentric_view = cv2.circle(egocentric_view, (xpx, ypx), 15,
                                         (0, 0, 255), 5)

    # draw depth map if observation has depth info
    if "depth" in observation:
        depth_map = (observation["depth"][:, :, -1] * 255).astype(np.uint8)
        depth_map = np.stack([depth_map for _ in range(3)], axis=2)

        egocentric_view = np.concatenate((egocentric_view, depth_map), axis=1)

    if "goalclass" in observation:
        from habitat.tasks.nav.nav_task_multi_goal import CLASSES
        index = np.nonzero(observation["goalclass"])[0][0]
        classes = list(CLASSES.keys())
        class_name = classes[index]
        cv2.putText(egocentric_view, class_name, (15, 15),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255))

    frame = egocentric_view

    if "top_down_map" in info:
        top_down_map = info["top_down_map"]["map"]
        top_down_map = maps.colorize_topdown_map(
            top_down_map, info["top_down_map"]["fog_of_war_mask"])
        map_agent_pos = info["top_down_map"]["agent_map_coord"]
        top_down_map = maps.draw_agent(
            image=top_down_map,
            agent_center_coord=map_agent_pos,
            agent_rotation=info["top_down_map"]["agent_angle"],
            agent_radius_px=top_down_map.shape[0] // 16,
        )

        if top_down_map.shape[0] > top_down_map.shape[1]:
            top_down_map = np.rot90(top_down_map, 1)

        # scale top down map to align with rgb view
        old_h, old_w, _ = top_down_map.shape
        top_down_height = observation_size
        top_down_width = int(float(top_down_height) / old_h * old_w)
        # cv2 resize (dsize is width first)
        top_down_map = cv2.resize(
            top_down_map,
            (top_down_width, top_down_height),
            interpolation=cv2.INTER_CUBIC,
        )
        frame = np.concatenate((egocentric_view, top_down_map), axis=1)

    return frame
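The per-step frames produced by this function are typically stitched into a video. A hedged sketch using habitat's images_to_video helper (the exact keyword names may differ between habitat versions):

# Hypothetical usage sketch; `frames` is a list of HxWx3 uint8 images from observations_to_image().
from habitat.utils.visualizations.utils import images_to_video

images_to_video(frames, output_dir="videos", video_name="episode_0")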
Example #9
    def train_model(self):
        episode_rewards = deque(maxlen=10)
        current_episode_rewards = np.zeros(self.shell_args.num_processes)
        episode_lengths = deque(maxlen=10)
        current_episode_lengths = np.zeros(self.shell_args.num_processes)
        current_rewards = np.zeros(self.shell_args.num_processes)

        total_num_steps = self.start_iter
        fps_timer = [time.time(), total_num_steps]
        timers = np.zeros(3)
        egomotion_loss = 0

        video_frames = []
        num_episodes = 0
        # self.evaluate_model()

        obs = self.envs.reset()
        if self.compute_surface_normals:
            obs["surface_normals"] = pt_util.depth_to_surface_normals(
                obs["depth"].to(self.device))
        obs["prev_action_one_hot"] = obs[
            "prev_action_one_hot"][:, ACTION_SPACE].to(torch.float32)
        if self.shell_args.algo == "supervised":
            obs["best_next_action"] = pt_util.from_numpy(
                obs["best_next_action"][:, ACTION_SPACE])
        self.rollouts.copy_obs(obs, 0)
        distances = pt_util.to_numpy_array(obs["goal_geodesic_distance"])
        self.train_stats["start_geodesic_distance"][:] = distances
        previous_visual_features = None
        egomotion_pred = None
        prev_action = None
        prev_action_probs = None
        num_updates = (int(self.shell_args.num_env_steps) //
                       self.shell_args.num_forward_rollout_steps
                       ) // self.shell_args.num_processes

        try:
            for iter_count in range(num_updates):
                if self.shell_args.tensorboard:
                    if iter_count % 500 == 0:
                        print("Logging conv summaries")
                        self.logger.network_conv_summary(
                            self.agent, total_num_steps)
                    elif iter_count % 100 == 0:
                        print("Logging variable summaries")
                        self.logger.network_variable_summary(
                            self.agent, total_num_steps)

                if self.shell_args.use_linear_lr_decay:
                    # decrease learning rate linearly
                    update_linear_schedule(self.optimizer.optimizer,
                                           iter_count, num_updates,
                                           self.shell_args.lr)

                if self.shell_args.algo == "ppo" and self.shell_args.use_linear_clip_decay:
                    self.optimizer.clip_param = self.shell_args.clip_param * (
                        1 - iter_count / float(num_updates))

                if hasattr(self.agent.base, "enable_decoder"):
                    if self.shell_args.record_video:
                        self.agent.base.enable_decoder()
                    else:
                        self.agent.base.disable_decoder()

                for step in range(self.shell_args.num_forward_rollout_steps):
                    with torch.no_grad():
                        start_t = time.time()
                        value, action, action_log_prob, recurrent_hidden_states = self.agent.act(
                            {
                                "images":
                                self.rollouts.obs[step],
                                "target_vector":
                                self.rollouts.additional_observations_dict[
                                    "pointgoal"][step],
                                "prev_action_one_hot":
                                self.rollouts.additional_observations_dict[
                                    "prev_action_one_hot"][step],
                            },
                            self.rollouts.recurrent_hidden_states[step],
                            self.rollouts.masks[step],
                        )
                        action_cpu = pt_util.to_numpy_array(action.squeeze(1))
                        translated_action_space = ACTION_SPACE[action_cpu]
                        if not self.shell_args.end_to_end:
                            self.rollouts.additional_observations_dict[
                                "visual_encoder_features"][
                                    self.rollouts.step].copy_(
                                        self.agent.base.visual_encoder_features
                                    )

                        if self.shell_args.use_motion_loss:
                            if self.shell_args.record_video:
                                if previous_visual_features is not None:
                                    egomotion_pred = self.agent.base.predict_egomotion(
                                        self.agent.base.visual_features,
                                        previous_visual_features)
                            previous_visual_features = self.agent.base.visual_features.detach(
                            )

                        timers[1] += time.time() - start_t

                        if self.shell_args.record_video:
                            # Copy so we don't mess with obs itself
                            draw_obs = OrderedDict()
                            for key, val in obs.items():
                                draw_obs[key] = pt_util.to_numpy_array(
                                    val).copy()
                            best_next_action = draw_obs.pop(
                                "best_next_action", None)

                            if prev_action is not None:
                                draw_obs[
                                    "action_taken"] = pt_util.to_numpy_array(
                                        self.agent.last_dist.probs).copy()
                                draw_obs["action_taken"][:] = 0
                                draw_obs["action_taken"][
                                    np.arange(self.shell_args.num_processes),
                                    prev_action] = 1
                                draw_obs[
                                    "action_taken_name"] = SIM_ACTION_TO_NAME[
                                        ACTION_SPACE_TO_SIM_ACTION[
                                            ACTION_SPACE[
                                                prev_action.squeeze()]]]
                                draw_obs[
                                    "action_prob"] = pt_util.to_numpy_array(
                                        prev_action_probs).copy()
                            else:
                                draw_obs["action_taken"] = None
                                draw_obs[
                                    "action_taken_name"] = SIM_ACTION_TO_NAME[
                                        SimulatorActions.STOP]
                                draw_obs["action_prob"] = None
                            prev_action = action_cpu
                            prev_action_probs = self.agent.last_dist.probs.detach(
                            )
                            if (hasattr(self.agent.base, "decoder_outputs")
                                    and self.agent.base.decoder_outputs
                                    is not None):
                                min_channel = 0
                                for key, num_channels in self.agent.base.decoder_output_info:
                                    outputs = self.agent.base.decoder_outputs[:,
                                                                              min_channel:
                                                                              min_channel
                                                                              +
                                                                              num_channels,
                                                                              ...]
                                    draw_obs["output_" +
                                             key] = pt_util.to_numpy_array(
                                                 outputs).copy()
                                    min_channel += num_channels
                            draw_obs["rewards"] = current_rewards.copy()
                            draw_obs["step"] = current_episode_lengths.copy()
                            draw_obs["method"] = self.shell_args.method_name
                            if best_next_action is not None:
                                draw_obs["best_next_action"] = best_next_action
                            if self.shell_args.use_motion_loss:
                                if egomotion_pred is not None:
                                    draw_obs[
                                        "egomotion_pred"] = pt_util.to_numpy_array(
                                            F.softmax(egomotion_pred,
                                                      dim=1)).copy()
                                else:
                                    draw_obs["egomotion_pred"] = None
                            images, titles, normalize = draw_outputs.obs_to_images(
                                draw_obs)
                            if self.shell_args.algo == "supervised":
                                im_inds = [0, 2, 3, 1, 9, 6, 7, 8, 5, 4]
                            else:
                                im_inds = [0, 2, 3, 1, 6, 7, 8, 5]
                            height, width = images[0].shape[:2]
                            subplot_image = drawing.subplot(
                                images,
                                2,
                                5,
                                titles=titles,
                                normalize=normalize,
                                order=im_inds,
                                output_width=max(width, 320),
                                output_height=max(height, 320),
                            )
                            video_frames.append(subplot_image)

                        # save dists from previous step or else on reset they will be overwritten
                        distances = pt_util.to_numpy_array(
                            obs["goal_geodesic_distance"])

                        start_t = time.time()
                        obs, rewards, dones, infos = self.envs.step(
                            translated_action_space)
                        timers[0] += time.time() - start_t
                        obs["reward"] = rewards
                        if self.shell_args.algo == "supervised":
                            obs["best_next_action"] = pt_util.from_numpy(
                                obs["best_next_action"][:, ACTION_SPACE]).to(
                                    torch.float32)
                        obs["prev_action_one_hot"] = obs[
                            "prev_action_one_hot"][:, ACTION_SPACE].to(
                                torch.float32)
                        rewards *= REWARD_SCALAR
                        rewards = np.clip(rewards, -10, 10)

                        if self.shell_args.record_video and not dones[0]:
                            obs["top_down_map"] = infos[0]["top_down_map"]

                        if self.compute_surface_normals:
                            obs["surface_normals"] = pt_util.depth_to_surface_normals(
                                obs["depth"].to(self.device))

                        current_rewards = pt_util.to_numpy_array(rewards)
                        current_episode_rewards += pt_util.to_numpy_array(
                            rewards).squeeze()
                        current_episode_lengths += 1
                        for ii, done_e in enumerate(dones):
                            if done_e:
                                num_episodes += 1
                                if self.shell_args.record_video:
                                    final_rgb = draw_obs["rgb"].transpose(
                                        0, 2, 3, 1).squeeze(0)
                                    if self.shell_args.task == "pointnav":
                                        if infos[ii]["spl"] > 0:
                                            draw_obs[
                                                "action_taken_name"] = "Stop. Success"
                                            draw_obs["reward"] = [
                                                self.configs[0].TASK.
                                                SUCCESS_REWARD
                                            ]
                                            final_rgb[:] = final_rgb * np.float32(
                                                0.5) + np.tile(
                                                    np.array([0, 128, 0],
                                                             dtype=np.uint8),
                                                    (final_rgb.shape[0],
                                                     final_rgb.shape[1], 1),
                                                )
                                        else:
                                            draw_obs[
                                                "action_taken_name"] = "Timeout. Failed"
                                            final_rgb[:] = final_rgb * np.float32(
                                                0.5) + np.tile(
                                                    np.array([128, 0, 0],
                                                             dtype=np.uint8),
                                                    (final_rgb.shape[0],
                                                     final_rgb.shape[1], 1),
                                                )
                                    elif self.shell_args.task == "exploration" or self.shell_args.task == "flee":
                                        draw_obs[
                                            "action_taken_name"] = "End of episode."
                                    final_rgb = final_rgb[np.newaxis,
                                                          ...].transpose(
                                                              0, 3, 1, 2)
                                    draw_obs["rgb"] = final_rgb

                                    images, titles, normalize = draw_outputs.obs_to_images(
                                        draw_obs)
                                    im_inds = [0, 2, 3, 1, 6, 7, 8, 5]
                                    height, width = images[0].shape[:2]
                                    subplot_image = drawing.subplot(
                                        images,
                                        2,
                                        5,
                                        titles=titles,
                                        normalize=normalize,
                                        order=im_inds,
                                        output_width=max(width, 320),
                                        output_height=max(height, 320),
                                    )
                                    video_frames.extend(
                                        [subplot_image] *
                                        (self.configs[0].ENVIRONMENT.
                                         MAX_EPISODE_STEPS + 30 -
                                         len(video_frames)))

                                    if "top_down_map" in infos[0]:
                                        video_dir = os.path.join(
                                            self.shell_args.log_prefix,
                                            "videos")
                                        if not os.path.exists(video_dir):
                                            os.makedirs(video_dir)
                                        im_path = os.path.join(
                                            self.shell_args.log_prefix,
                                            "videos", "total_steps_%d.png" %
                                            total_num_steps)
                                        from habitat.utils.visualizations import maps
                                        import imageio

                                        top_down_map = maps.colorize_topdown_map(
                                            infos[0]["top_down_map"]["map"])
                                        imageio.imsave(im_path, top_down_map)

                                    images_to_video(
                                        video_frames,
                                        os.path.join(
                                            self.shell_args.log_prefix,
                                            "videos"),
                                        "total_steps_%d" % total_num_steps,
                                    )
                                    video_frames = []

                                if self.shell_args.task == "pointnav":
                                    print(
                                        "FINISHED EPISODE %d Length %d Reward %.3f SPL %.4f"
                                        % (
                                            num_episodes,
                                            current_episode_lengths[ii],
                                            current_episode_rewards[ii],
                                            infos[ii]["spl"],
                                        ))
                                    self.train_stats["spl"][ii] = infos[ii][
                                        "spl"]
                                    self.train_stats["success"][
                                        ii] = self.train_stats["spl"][ii] > 0
                                    self.train_stats["end_geodesic_distance"][
                                        ii] = (distances[ii] - self.configs[0].
                                               SIMULATOR.FORWARD_STEP_SIZE)
                                    self.train_stats[
                                        "delta_geodesic_distance"][ii] = (
                                            self.train_stats[
                                                "start_geodesic_distance"][ii]
                                            - self.train_stats[
                                                "end_geodesic_distance"][ii])
                                    self.train_stats["num_steps"][
                                        ii] = current_episode_lengths[ii]
                                elif self.shell_args.task == "exploration":
                                    print(
                                        "FINISHED EPISODE %d Reward %.3f States Visited %d"
                                        % (num_episodes,
                                           current_episode_rewards[ii],
                                           infos[ii]["visited_states"]))
                                    self.train_stats["visited_states"][
                                        ii] = infos[ii]["visited_states"]
                                elif self.shell_args.task == "flee":
                                    print(
                                        "FINISHED EPISODE %d Reward %.3f Distance from start %.4f"
                                        % (num_episodes,
                                           current_episode_rewards[ii],
                                           infos[ii]["distance_from_start"]))
                                    self.train_stats["distance_from_start"][
                                        ii] = infos[ii]["distance_from_start"]

                                self.train_stats["num_episodes"][ii] += 1
                                self.train_stats["reward"][
                                    ii] = current_episode_rewards[ii]

                                if self.shell_args.tensorboard:
                                    log_dict = {
                                        "single_episode/reward":
                                        self.train_stats["reward"][ii]
                                    }
                                    if self.shell_args.task == "pointnav":
                                        log_dict.update({
                                            "single_episode/num_steps":
                                            self.train_stats["num_steps"][ii],
                                            "single_episode/spl":
                                            self.train_stats["spl"][ii],
                                            "single_episode/success":
                                            self.train_stats["success"][ii],
                                            "single_episode/start_geodesic_distance":
                                            self.train_stats[
                                                "start_geodesic_distance"][ii],
                                            "single_episode/end_geodesic_distance":
                                            self.train_stats[
                                                "end_geodesic_distance"][ii],
                                            "single_episode/delta_geodesic_distance":
                                            self.train_stats[
                                                "delta_geodesic_distance"][ii],
                                        })
                                    elif self.shell_args.task == "exploration":
                                        log_dict[
                                            "single_episode/visited_states"] = self.train_stats[
                                                "visited_states"][ii]
                                    elif self.shell_args.task == "flee":
                                        log_dict[
                                            "single_episode/distance_from_start"] = self.train_stats[
                                                "distance_from_start"][ii]
                                    self.logger.dict_log(
                                        log_dict,
                                        step=(total_num_steps +
                                              self.shell_args.num_processes *
                                              step + ii))

                                episode_rewards.append(
                                    current_episode_rewards[ii])
                                current_episode_rewards[ii] = 0
                                episode_lengths.append(
                                    current_episode_lengths[ii])
                                current_episode_lengths[ii] = 0
                                self.train_stats["start_geodesic_distance"][
                                    ii] = obs["goal_geodesic_distance"][ii]

                        # If done then clean the history of observations.
                        masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                   for done_ in dones])
                        bad_masks = torch.FloatTensor(
                            [[0.0]
                             if "bad_transition" in info.keys() else [1.0]
                             for info in infos])

                        self.rollouts.insert(obs, recurrent_hidden_states,
                                             action, action_log_prob, value,
                                             rewards, masks, bad_masks)

                with torch.no_grad():
                    start_t = time.time()
                    next_value = self.agent.get_value(
                        {
                            "images":
                            self.rollouts.obs[-1],
                            "target_vector":
                            self.rollouts.
                            additional_observations_dict["pointgoal"][-1],
                            "prev_action_one_hot":
                            self.rollouts.additional_observations_dict[
                                "prev_action_one_hot"][-1],
                        },
                        self.rollouts.recurrent_hidden_states[-1],
                        self.rollouts.masks[-1],
                    ).detach()
                    timers[1] += time.time() - start_t

                self.rollouts.compute_returns(next_value,
                                              self.shell_args.use_gae,
                                              self.shell_args.gamma,
                                              self.shell_args.tau)

                if not self.shell_args.no_weight_update:
                    start_t = time.time()
                    if self.shell_args.algo == "supervised":
                        (
                            total_loss,
                            action_loss,
                            visual_loss_total,
                            visual_loss_dict,
                            egomotion_loss,
                            forward_model_loss,
                        ) = self.optimizer.update(self.rollouts,
                                                  self.shell_args)
                    else:
                        (
                            total_loss,
                            value_loss,
                            action_loss,
                            dist_entropy,
                            visual_loss_total,
                            visual_loss_dict,
                            egomotion_loss,
                            forward_model_loss,
                        ) = self.optimizer.update(self.rollouts,
                                                  self.shell_args)

                    timers[2] += time.time() - start_t

                self.rollouts.after_update()

                # save for every interval-th episode or for the last epoch
                if iter_count % self.shell_args.save_interval == 0 or iter_count == num_updates - 1:
                    self.save_checkpoint(5, total_num_steps)

                total_num_steps += self.shell_args.num_processes * self.shell_args.num_forward_rollout_steps

                if not self.shell_args.no_weight_update and iter_count % self.shell_args.log_interval == 0:
                    log_dict = {}
                    if len(episode_rewards) > 1:
                        end = time.time()
                        nsteps = total_num_steps - fps_timer[1]
                        fps = int((total_num_steps - fps_timer[1]) /
                                  (end - fps_timer[0]))
                        timers /= nsteps
                        env_spf = timers[0]
                        forward_spf = timers[1]
                        backward_spf = timers[2]
                        print((
                            "{} Updates {}, num timesteps {}, FPS {}, Env FPS "
                            "{}, \n Last {} training episodes: mean/median reward "
                            "{:.3f}/{:.3f}, min/max reward {:.3f}/{:.3f}\n"
                        ).format(
                            datetime.datetime.now(),
                            iter_count,
                            total_num_steps,
                            fps,
                            int(1.0 / env_spf),
                            len(episode_rewards),
                            np.mean(episode_rewards),
                            np.median(episode_rewards),
                            np.min(episode_rewards),
                            np.max(episode_rewards),
                        ))

                        if self.shell_args.tensorboard:
                            log_dict.update({
                                "stats/full_spf":
                                1.0 / (fps + 1e-10),
                                "stats/env_spf":
                                env_spf,
                                "stats/forward_spf":
                                forward_spf,
                                "stats/backward_spf":
                                backward_spf,
                                "stats/full_fps":
                                fps,
                                "stats/env_fps":
                                1.0 / (env_spf + 1e-10),
                                "stats/forward_fps":
                                1.0 / (forward_spf + 1e-10),
                                "stats/backward_fps":
                                1.0 / (backward_spf + 1e-10),
                                "episode/mean_rewards":
                                np.mean(episode_rewards),
                                "episode/median_rewards":
                                np.median(episode_rewards),
                                "episode/min_rewards":
                                np.min(episode_rewards),
                                "episode/max_rewards":
                                np.max(episode_rewards),
                                "episode/mean_lengths":
                                np.mean(episode_lengths),
                                "episode/median_lengths":
                                np.median(episode_lengths),
                                "episode/min_lengths":
                                np.min(episode_lengths),
                                "episode/max_lengths":
                                np.max(episode_lengths),
                            })
                        fps_timer[0] = time.time()
                        fps_timer[1] = total_num_steps
                        timers[:] = 0
                    if self.shell_args.tensorboard:
                        log_dict.update({
                            "loss/action":
                            action_loss,
                            "loss/0_total":
                            total_loss,
                            "loss/visual/0_total":
                            visual_loss_total,
                            "loss/exploration/egomotion":
                            egomotion_loss,
                            "loss/exploration/forward_model":
                            forward_model_loss,
                        })
                        if self.shell_args.algo != "supervised":
                            log_dict.update({
                                "loss/entropy": dist_entropy,
                                "loss/value": value_loss
                            })
                        for key, val in visual_loss_dict.items():
                            log_dict["loss/visual/" + key] = val
                        self.logger.dict_log(log_dict, step=total_num_steps)

                if self.shell_args.eval_interval is not None and total_num_steps % self.shell_args.eval_interval < (
                        self.shell_args.num_processes *
                        self.shell_args.num_forward_rollout_steps):
                    self.save_checkpoint(-1, total_num_steps)
                    self.set_log_iter(total_num_steps)
                    self.evaluate_model()
                    # reset the env datasets
                    self.envs.unwrapped.call(
                        ["switch_dataset"] * self.shell_args.num_processes,
                        [("train", )] * self.shell_args.num_processes)
                    obs = self.envs.reset()
                    if self.compute_surface_normals:
                        obs["surface_normals"] = pt_util.depth_to_surface_normals(
                            obs["depth"].to(self.device))
                    obs["prev_action_one_hot"] = obs[
                        "prev_action_one_hot"][:,
                                               ACTION_SPACE].to(torch.float32)
                    if self.shell_args.algo == "supervised":
                        obs["best_next_action"] = pt_util.from_numpy(
                            obs["best_next_action"][:, ACTION_SPACE])
                    self.rollouts.copy_obs(obs, 0)
                    distances = pt_util.to_numpy_array(
                        obs["goal_geodesic_distance"])
                    self.train_stats["start_geodesic_distance"][:] = distances
                    previous_visual_features = None
                    egomotion_pred = None
                    prev_action = None
                    prev_action_probs = None
        except:
            # Catch all exceptions so a final save can be performed
            import traceback

            traceback.print_exc()
        finally:
            self.save_checkpoint(-1, total_num_steps)
Example #10
    def _eval_checkpoint(self,
                         checkpoint_path: str,
                         writer: TensorboardWriter,
                         checkpoint_index: int = 0,
                         log_diagnostics=[],
                         output_dir='.',
                         label='.',
                         num_eval_runs=1) -> None:
        r"""Evaluates a single checkpoint.

        Args:
            checkpoint_path: path of checkpoint
            writer: tensorboard writer object for logging to tensorboard
            checkpoint_index: index of cur checkpoint for logging

        Returns:
            None
        """
        if checkpoint_index == -1:
            ckpt_file = checkpoint_path.split('/')[-1]
            split_info = ckpt_file.split('.')
            checkpoint_index = split_info[1]
        # Map location CPU is almost always better than mapping to a CUDA device.
        ckpt_dict = self.load_checkpoint(checkpoint_path, map_location="cpu")

        if self.config.EVAL.USE_CKPT_CONFIG:
            config = self._setup_eval_config(ckpt_dict["config"])
        else:
            config = self.config.clone()

        ppo_cfg = config.RL.PPO
        task_cfg = config.TASK_CONFIG.TASK

        config.defrost()
        config.TASK_CONFIG.DATASET.SPLIT = config.EVAL.SPLIT
        config.freeze()

        if len(self.config.VIDEO_OPTION) > 0:
            config.defrost()
            config.TASK_CONFIG.TASK.MEASUREMENTS.append("TOP_DOWN_MAP")
            config.TASK_CONFIG.TASK.MEASUREMENTS.append("COLLISIONS")
            config.freeze()

        logger.info(f"env config: {config}")
        self.envs = construct_envs(config, get_env_class(config.ENV_NAME))
        # pass in aux config if we're doing attention
        aux_cfg = self.config.RL.AUX_TASKS
        self._setup_actor_critic_agent(ppo_cfg, task_cfg, aux_cfg)

        # Check if we accidentally recorded `visual_resnet` in our checkpoint and drop it (it's redundant with `visual_encoder`)
        ckpt_dict['state_dict'] = {
            k: v
            for k, v in ckpt_dict['state_dict'].items()
            if 'visual_resnet' not in k
        }
        self.agent.load_state_dict(ckpt_dict["state_dict"])

        logger.info("agent number of trainable parameters: {}".format(
            sum(param.numel() for param in self.agent.parameters()
                if param.requires_grad)))

        self.actor_critic = self.agent.actor_critic

        observations = self.envs.reset()
        batch = batch_obs(observations, device=self.device)

        current_episode_reward = torch.zeros(self.envs.num_envs,
                                             1,
                                             device=self.device)

        test_recurrent_hidden_states = torch.zeros(
            self.actor_critic.net.num_recurrent_layers,
            self.config.NUM_PROCESSES,
            ppo_cfg.hidden_size,
            device=self.device,
        )
        _, num_recurrent_memories, _ = self._setup_auxiliary_tasks(
            aux_cfg, ppo_cfg, task_cfg, is_eval=True)
        if self.config.RL.PPO.policy in MULTIPLE_BELIEF_CLASSES:
            aux_tasks = self.config.RL.AUX_TASKS.tasks
            num_recurrent_memories = len(self.config.RL.AUX_TASKS.tasks)
            test_recurrent_hidden_states = test_recurrent_hidden_states.unsqueeze(
                2).repeat(1, 1, num_recurrent_memories, 1)

        prev_actions = torch.zeros(self.config.NUM_PROCESSES,
                                   1,
                                   device=self.device,
                                   dtype=torch.long)

        not_done_masks = torch.zeros(self.config.NUM_PROCESSES,
                                     1,
                                     device=self.device)
        stats_episodes = dict()  # dict of dicts that stores stats per episode

        rgb_frames = [[] for _ in range(self.config.NUM_PROCESSES)
                      ]  # type: List[List[np.ndarray]]

        if len(self.config.VIDEO_OPTION) > 0:
            os.makedirs(self.config.VIDEO_DIR, exist_ok=True)

        number_of_eval_episodes = self.config.TEST_EPISODE_COUNT
        if number_of_eval_episodes == -1:
            number_of_eval_episodes = sum(self.envs.number_of_episodes)
        else:
            total_num_eps = sum(self.envs.number_of_episodes)
            if total_num_eps < number_of_eval_episodes:
                logger.warn(
                    f"Config specified {number_of_eval_episodes} eval episodes"
                    f", dataset only has {total_num_eps}.")
                logger.warn(f"Evaluating with {total_num_eps} instead.")
                number_of_eval_episodes = total_num_eps

        videos_cap = 2  # number of videos to generate per checkpoint
        if len(log_diagnostics) > 0:
            videos_cap = 10
        # video_indices = random.sample(range(self.config.TEST_EPISODE_COUNT),
        # min(videos_cap, self.config.TEST_EPISODE_COUNT))
        video_indices = range(10)
        print(f"Videos: {video_indices}")

        total_stats = []
        dones_per_ep = dict()

        # Logging more extensive evaluation stats for analysis
        if len(log_diagnostics) > 0:
            d_stats = {}
            for d in log_diagnostics:
                d_stats[d] = [
                    [] for _ in range(self.config.NUM_PROCESSES)
                ]  # stored as nested list envs x timesteps x k (# tasks)

        pbar = tqdm.tqdm(total=number_of_eval_episodes * num_eval_runs)
        self.agent.eval()
        while (len(stats_episodes) < number_of_eval_episodes * num_eval_runs
               and self.envs.num_envs > 0):
            current_episodes = self.envs.current_episodes()
            with torch.no_grad():
                weights_output = None
                if self.config.RL.PPO.policy in MULTIPLE_BELIEF_CLASSES:
                    weights_output = torch.empty(self.envs.num_envs,
                                                 len(aux_tasks))
                (
                    _,
                    actions,
                    _,
                    test_recurrent_hidden_states,
                ) = self.actor_critic.act(batch,
                                          test_recurrent_hidden_states,
                                          prev_actions,
                                          not_done_masks,
                                          deterministic=False,
                                          weights_output=weights_output)
                prev_actions.copy_(actions)

                for i in range(self.envs.num_envs):
                    if Diagnostics.actions in log_diagnostics:
                        d_stats[Diagnostics.actions][i].append(
                            prev_actions[i].item())
                    if Diagnostics.weights in log_diagnostics:
                        aux_weights = None if weights_output is None else weights_output[
                            i]
                        if aux_weights is not None:
                            d_stats[Diagnostics.weights][i].append(
                                aux_weights.half().tolist())

            outputs = self.envs.step([a[0].item() for a in actions])

            observations, rewards, dones, infos = [
                list(x) for x in zip(*outputs)
            ]
            batch = batch_obs(observations, device=self.device)

            not_done_masks = torch.tensor(
                [[0.0] if done else [1.0] for done in dones],
                dtype=torch.float,
                device=self.device,
            )

            rewards = torch.tensor(rewards,
                                   dtype=torch.float,
                                   device=self.device).unsqueeze(1)
            current_episode_reward += rewards
            next_episodes = self.envs.current_episodes()
            envs_to_pause = []
            n_envs = self.envs.num_envs
            for i in range(n_envs):
                next_k = (
                    next_episodes[i].scene_id,
                    next_episodes[i].episode_id,
                )
                if dones_per_ep.get(next_k, 0) == num_eval_runs:
                    envs_to_pause.append(i)  # wait for the rest

                if not_done_masks[i].item() == 0:
                    episode_stats = dict()

                    episode_stats["reward"] = current_episode_reward[i].item()
                    episode_stats.update(
                        self._extract_scalars_from_info(infos[i]))

                    current_episode_reward[i] = 0
                    # use scene_id + episode_id as unique id for storing stats

                    k = (
                        current_episodes[i].scene_id,
                        current_episodes[i].episode_id,
                    )
                    dones_per_ep[k] = dones_per_ep.get(k, 0) + 1

                    if dones_per_ep.get(k, 0) == 1 and len(
                            self.config.VIDEO_OPTION) > 0 and len(
                                stats_episodes) in video_indices:
                        logger.info(f"Generating video {len(stats_episodes)}")
                        category = getattr(current_episodes[i],
                                           "object_category", "")
                        if category != "":
                            category += "_"
                        try:
                            generate_video(
                                video_option=self.config.VIDEO_OPTION,
                                video_dir=self.config.VIDEO_DIR,
                                images=rgb_frames[i],
                                episode_id=current_episodes[i].episode_id,
                                checkpoint_idx=checkpoint_index,
                                metrics=self._extract_scalars_from_info(
                                    infos[i]),
                                tag=f"{category}{label}",
                                tb_writer=writer,
                            )
                        except Exception as e:
                            logger.warning(str(e))
                    rgb_frames[i] = []

                    stats_episodes[(
                        current_episodes[i].scene_id,
                        current_episodes[i].episode_id,
                        dones_per_ep[k],
                    )] = episode_stats

                    if len(log_diagnostics) > 0:
                        diagnostic_info = dict()
                        for metric in log_diagnostics:
                            diagnostic_info[metric] = d_stats[metric][i]
                            d_stats[metric][i] = []
                        if Diagnostics.top_down_map in log_diagnostics:
                            top_down_map = torch.tensor([])
                            if len(self.config.VIDEO_OPTION) > 0:
                                top_down_map = infos[i]["top_down_map"]["map"]
                                top_down_map = maps.colorize_topdown_map(
                                    top_down_map, fog_of_war_mask=None)
                            diagnostic_info.update(
                                dict(top_down_map=top_down_map))
                        total_stats.append(
                            dict(
                                stats=episode_stats,
                                did_stop=bool(prev_actions[i] == 0),
                                episode_info=attr.asdict(current_episodes[i]),
                                info=diagnostic_info,
                            ))
                    pbar.update()

                # episode continues
                else:
                    if len(self.config.VIDEO_OPTION) > 0:
                        aux_weights = None if weights_output is None else weights_output[
                            i]
                        frame = observations_to_image(
                            observations[i], infos[i],
                            current_episode_reward[i].item(), aux_weights,
                            aux_tasks)
                        rgb_frames[i].append(frame)
                    if Diagnostics.gps in log_diagnostics:
                        d_stats[Diagnostics.gps][i].append(
                            observations[i]["gps"].tolist())
                    if Diagnostics.heading in log_diagnostics:
                        d_stats[Diagnostics.heading][i].append(
                            observations[i]["heading"].tolist())

            (
                self.envs,
                test_recurrent_hidden_states,
                not_done_masks,
                current_episode_reward,
                prev_actions,
                batch,
                rgb_frames,
            ) = self._pause_envs(
                envs_to_pause,
                self.envs,
                test_recurrent_hidden_states,
                not_done_masks,
                current_episode_reward,
                prev_actions,
                batch,
                rgb_frames,
            )

        num_episodes = len(stats_episodes)
        aggregated_stats = dict()
        for stat_key in next(iter(stats_episodes.values())).keys():
            aggregated_stats[stat_key] = (
                sum([v[stat_key]
                     for v in stats_episodes.values()]) / num_episodes)

        for k, v in aggregated_stats.items():
            logger.info(f"Average episode {k}: {v:.4f}")

        step_id = checkpoint_index
        if "extra_state" in ckpt_dict and "step" in ckpt_dict["extra_state"]:
            step_id = ckpt_dict["extra_state"]["step"]

        writer.add_scalars(
            "eval_reward",
            {"average reward": aggregated_stats["reward"]},
            step_id,
        )

        metrics = {k: v for k, v in aggregated_stats.items() if k != "reward"}
        if len(metrics) > 0:
            writer.add_scalars("eval_metrics", metrics, step_id)
            logger.info("eval_metrics")
            logger.info(metrics)
        if len(log_diagnostics) > 0:
            os.makedirs(output_dir, exist_ok=True)
            eval_fn = f"{label}.json"
            with open(os.path.join(output_dir, eval_fn), 'w',
                      encoding='utf-8') as f:
                json.dump(total_stats, f, ensure_ascii=False, indent=4)
        self.envs.close()
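For reference, here is a minimal standalone sketch of the aggregation step above: per-episode metrics are stored under a (scene_id, episode_id, occurrence) key and then averaged metric-by-metric before logging. The episode keys and metric values below are purely illustrative, not taken from a real run.

# Minimal sketch of the stats aggregation pattern used above.
stats_episodes = {
    ("scene_a", "0", 1): {"reward": 2.5, "spl": 0.8, "success": 1.0},
    ("scene_a", "1", 1): {"reward": 1.0, "spl": 0.4, "success": 1.0},
    ("scene_b", "0", 1): {"reward": 0.0, "spl": 0.0, "success": 0.0},
}

num_episodes = len(stats_episodes)
aggregated_stats = {
    stat_key:
    sum(ep[stat_key] for ep in stats_episodes.values()) / num_episodes
    for stat_key in next(iter(stats_episodes.values()))
}

for k, v in aggregated_stats.items():
    print(f"Average episode {k}: {v:.4f}")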
Example No. 11
    def evaluate_model(self):
        self.envs.unwrapped.call(["switch_dataset"] *
                                 self.shell_args.num_processes,
                                 [("val", )] * self.shell_args.num_processes)

        if not os.path.exists(self.eval_dir):
            os.makedirs(self.eval_dir)
        try:
            eval_net_file_name = sorted(
                glob.glob(
                    os.path.join(self.shell_args.log_prefix,
                                 self.shell_args.checkpoint_dirname, "*") +
                    "/*.pt"),
                key=os.path.getmtime,
            )[-1]
            eval_net_file_name = (
                self.shell_args.log_prefix.replace(os.sep, "_") + "_" +
                "_".join(eval_net_file_name.split(os.sep)[-2:])[:-3])
        except IndexError:
            print("Warning, no weights found")
            eval_net_file_name = "random_weights"
        eval_output_file = open(
            os.path.join(self.eval_dir, eval_net_file_name + ".csv"), "w")
        print("Writing results to", eval_output_file.name)

        # Save the evaluated network weights for posterity
        if self.shell_args.save_checkpoints:
            save_model = self.agent
            pt_util.save(
                save_model,
                os.path.join(self.shell_args.log_prefix,
                             self.shell_args.checkpoint_dirname,
                             "eval_weights"),
                num_to_keep=-1,
                iteration=self.log_iter,
            )
            print("Wrote model to file for safe keeping")

        obs = self.envs.reset()
        if self.compute_surface_normals:
            obs["surface_normals"] = pt_util.depth_to_surface_normals(
                obs["depth"].to(self.device))
        obs["prev_action_one_hot"] = obs[
            "prev_action_one_hot"][:, ACTION_SPACE].to(torch.float32)
        recurrent_hidden_states = torch.zeros(
            self.shell_args.num_processes,
            self.agent.recurrent_hidden_state_size,
            dtype=torch.float32,
            device=self.device,
        )
        masks = torch.ones(self.shell_args.num_processes,
                           1,
                           dtype=torch.float32,
                           device=self.device)

        episode_rewards = deque(maxlen=10)
        current_episode_rewards = np.zeros(self.shell_args.num_processes)
        episode_lengths = deque(maxlen=10)
        current_episode_lengths = np.zeros(self.shell_args.num_processes)

        total_num_steps = self.log_iter
        fps_timer = [time.time(), total_num_steps]
        timers = np.zeros(3)

        num_episodes = 0

        print("Config\n", self.configs[0])

        # Initialize every time eval is run rather than just at the start
        dataset_sizes = np.array(
            [len(dataset.episodes) for dataset in self.eval_datasets])

        eval_stats = dict(
            episode_ids=[None for _ in range(self.shell_args.num_processes)],
            num_episodes=np.zeros(self.shell_args.num_processes,
                                  dtype=np.int32),
            num_steps=np.zeros(self.shell_args.num_processes, dtype=np.int32),
            reward=np.zeros(self.shell_args.num_processes, dtype=np.float32),
            spl=np.zeros(self.shell_args.num_processes, dtype=np.float32),
            visited_states=np.zeros(self.shell_args.num_processes,
                                    dtype=np.int32),
            success=np.zeros(self.shell_args.num_processes, dtype=np.int32),
            end_geodesic_distance=np.zeros(self.shell_args.num_processes,
                                           dtype=np.float32),
            start_geodesic_distance=np.zeros(self.shell_args.num_processes,
                                             dtype=np.float32),
            delta_geodesic_distance=np.zeros(self.shell_args.num_processes,
                                             dtype=np.float32),
            distance_from_start=np.zeros(self.shell_args.num_processes,
                                         dtype=np.float32),
        )
        eval_stats_means = dict(
            num_episodes=0,
            num_steps=0,
            reward=0,
            spl=0,
            visited_states=0,
            success=0,
            end_geodesic_distance=0,
            start_geodesic_distance=0,
            delta_geodesic_distance=0,
            distance_from_start=0,
        )
        eval_output_file.write("name,%s,iter,%d\n\n" %
                               (eval_net_file_name, self.log_iter))
        if self.shell_args.task == "pointnav":
            eval_output_file.write((
                "episode_id,num_steps,reward,spl,success,start_geodesic_distance,"
                "end_geodesic_distance,delta_geodesic_distance\n"))
        elif self.shell_args.task == "exploration":
            eval_output_file.write("episode_id,reward,visited_states\n")
        elif self.shell_args.task == "flee":
            eval_output_file.write("episode_id,reward,distance_from_start\n")
        distances = pt_util.to_numpy(obs["goal_geodesic_distance"])
        eval_stats["start_geodesic_distance"][:] = distances
        progress_bar = tqdm.tqdm(total=self.num_eval_episodes_total)
        all_done = False
        iter_count = 0
        video_frames = []
        previous_visual_features = None
        egomotion_pred = None
        prev_action = None
        prev_action_probs = None
        if hasattr(self.agent.base, "enable_decoder"):
            if self.shell_args.record_video:
                self.agent.base.enable_decoder()
            else:
                self.agent.base.disable_decoder()
        while not all_done:
            with torch.no_grad():
                start_t = time.time()
                value, action, action_log_prob, recurrent_hidden_states = self.agent.act(
                    {
                        "images":
                        obs["rgb"].to(self.device),
                        "target_vector":
                        obs["pointgoal"].to(self.device),
                        "prev_action_one_hot":
                        obs["prev_action_one_hot"].to(self.device),
                    },
                    recurrent_hidden_states,
                    masks,
                )
                action_cpu = pt_util.to_numpy(action.squeeze(1))
                translated_action_space = ACTION_SPACE[action_cpu]

                timers[1] += time.time() - start_t

                if self.shell_args.record_video:
                    if self.shell_args.use_motion_loss:
                        if previous_visual_features is not None:
                            egomotion_pred = self.agent.base.predict_egomotion(
                                self.agent.base.visual_features,
                                previous_visual_features)
                        previous_visual_features = self.agent.base.visual_features.detach(
                        )

                    # Copy so we don't mess with obs itself
                    draw_obs = OrderedDict()
                    for key, val in obs.items():
                        draw_obs[key] = pt_util.to_numpy(val).copy()
                    best_next_action = draw_obs.pop("best_next_action", None)

                    if prev_action is not None:
                        draw_obs["action_taken"] = pt_util.to_numpy(
                            self.agent.last_dist.probs).copy()
                        draw_obs["action_taken"][:] = 0
                        draw_obs["action_taken"][
                            np.arange(self.shell_args.num_processes),
                            prev_action] = 1
                        draw_obs["action_taken_name"] = SIM_ACTION_TO_NAME[
                            draw_obs['prev_action'].item()]
                        draw_obs["action_prob"] = pt_util.to_numpy(
                            prev_action_probs).copy()
                    else:
                        draw_obs["action_taken"] = None
                        draw_obs["action_taken_name"] = SIM_ACTION_TO_NAME[
                            SimulatorActions.STOP]
                        draw_obs["action_prob"] = None
                    prev_action = action_cpu
                    prev_action_probs = self.agent.last_dist.probs.detach()
                    if hasattr(
                            self.agent.base, "decoder_outputs"
                    ) and self.agent.base.decoder_outputs is not None:
                        min_channel = 0
                        for key, num_channels in self.agent.base.decoder_output_info:
                            outputs = self.agent.base.decoder_outputs[
                                :, min_channel:min_channel + num_channels, ...]
                            draw_obs["output_" +
                                     key] = pt_util.to_numpy(outputs).copy()
                            min_channel += num_channels
                    draw_obs["rewards"] = eval_stats["reward"]
                    draw_obs["step"] = current_episode_lengths.copy()
                    draw_obs["method"] = self.shell_args.method_name
                    if best_next_action is not None:
                        draw_obs["best_next_action"] = best_next_action
                    if self.shell_args.use_motion_loss:
                        if egomotion_pred is not None:
                            draw_obs["egomotion_pred"] = pt_util.to_numpy(
                                F.softmax(egomotion_pred, dim=1)).copy()
                        else:
                            draw_obs["egomotion_pred"] = None
                    images, titles, normalize = draw_outputs.obs_to_images(
                        draw_obs)
                    im_inds = [0, 2, 3, 1, 6, 7, 8, 5]
                    height, width = images[0].shape[:2]
                    subplot_image = drawing.subplot(
                        images,
                        2,
                        4,
                        titles=titles,
                        normalize=normalize,
                        output_width=max(width, 320),
                        output_height=max(height, 320),
                        order=im_inds,
                        fancy_text=True,
                    )
                    video_frames.append(subplot_image)

                # save distances from the previous step; on reset they would otherwise be overwritten
                distances = pt_util.to_numpy(obs["goal_geodesic_distance"])

                start_t = time.time()
                obs, rewards, dones, infos = self.envs.step(
                    translated_action_space)
                timers[0] += time.time() - start_t
                obs["prev_action_one_hot"] = obs[
                    "prev_action_one_hot"][:, ACTION_SPACE].to(torch.float32)
                rewards *= REWARD_SCALAR
                rewards = np.clip(rewards, -10, 10)

                if self.shell_args.record_video and not dones[0]:
                    obs["top_down_map"] = infos[0]["top_down_map"]

                if self.compute_surface_normals:
                    obs["surface_normals"] = pt_util.depth_to_surface_normals(
                        obs["depth"].to(self.device))

                current_episode_rewards += pt_util.to_numpy(rewards).squeeze()
                current_episode_lengths += 1
                to_pause = []
                for ii, done_e in enumerate(dones):
                    if done_e:
                        num_episodes += 1

                        if self.shell_args.record_video:
                            if "top_down_map" in infos[ii]:
                                video_dir = os.path.join(
                                    self.shell_args.log_prefix, "videos")
                                if not os.path.exists(video_dir):
                                    os.makedirs(video_dir)
                                im_path = os.path.join(
                                    self.shell_args.log_prefix, "videos",
                                    "total_steps_%d.png" % total_num_steps)
                                top_down_map = maps.colorize_topdown_map(
                                    infos[ii]["top_down_map"]["map"])
                                imageio.imsave(im_path, top_down_map)

                            images_to_video(
                                video_frames,
                                os.path.join(self.shell_args.log_prefix,
                                             "videos"),
                                "total_steps_%d" % total_num_steps,
                            )
                            video_frames = []

                        eval_stats["episode_ids"][ii] = infos[ii]["episode_id"]

                        if self.shell_args.task == "pointnav":
                            print(
                                "FINISHED EPISODE %d Length %d Reward %.3f SPL %.4f"
                                % (
                                    num_episodes,
                                    current_episode_lengths[ii],
                                    current_episode_rewards[ii],
                                    infos[ii]["spl"],
                                ))
                            eval_stats["spl"][ii] = infos[ii]["spl"]
                            eval_stats["success"][
                                ii] = eval_stats["spl"][ii] > 0
                            eval_stats["num_steps"][
                                ii] = current_episode_lengths[ii]
                            eval_stats["end_geodesic_distance"][ii] = (
                                infos[ii]["final_distance"] if
                                eval_stats["success"][ii] else distances[ii])
                            eval_stats["delta_geodesic_distance"][ii] = (
                                eval_stats["start_geodesic_distance"][ii] -
                                eval_stats["end_geodesic_distance"][ii])
                        elif self.shell_args.task == "exploration":
                            print(
                                "FINISHED EPISODE %d Reward %.3f States Visited %d"
                                % (num_episodes, current_episode_rewards[ii],
                                   infos[ii]["visited_states"]))
                            eval_stats["visited_states"][ii] = infos[ii][
                                "visited_states"]
                        elif self.shell_args.task == "flee":
                            print(
                                "FINISHED EPISODE %d Reward %.3f Distance from start %.4f"
                                % (num_episodes, current_episode_rewards[ii],
                                   infos[ii]["distance_from_start"]))
                            eval_stats["distance_from_start"][ii] = infos[ii][
                                "distance_from_start"]

                        eval_stats["num_episodes"][ii] += 1
                        eval_stats["reward"][ii] = current_episode_rewards[ii]

                        if eval_stats["num_episodes"][ii] <= dataset_sizes[ii]:
                            progress_bar.update(1)
                            eval_stats_means["num_episodes"] += 1
                            eval_stats_means["reward"] += eval_stats["reward"][
                                ii]
                            if self.shell_args.task == "pointnav":
                                eval_output_file.write(
                                    "%s,%d,%f,%f,%d,%f,%f,%f\n" % (
                                        eval_stats["episode_ids"][ii],
                                        eval_stats["num_steps"][ii],
                                        eval_stats["reward"][ii],
                                        eval_stats["spl"][ii],
                                        eval_stats["success"][ii],
                                        eval_stats["start_geodesic_distance"]
                                        [ii],
                                        eval_stats["end_geodesic_distance"]
                                        [ii],
                                        eval_stats["delta_geodesic_distance"]
                                        [ii],
                                    ))
                                eval_stats_means["num_steps"] += eval_stats[
                                    "num_steps"][ii]
                                eval_stats_means["spl"] += eval_stats["spl"][
                                    ii]
                                eval_stats_means["success"] += eval_stats[
                                    "success"][ii]
                                eval_stats_means[
                                    "start_geodesic_distance"] += eval_stats[
                                        "start_geodesic_distance"][ii]
                                eval_stats_means[
                                    "end_geodesic_distance"] += eval_stats[
                                        "end_geodesic_distance"][ii]
                                eval_stats_means[
                                    "delta_geodesic_distance"] += eval_stats[
                                        "delta_geodesic_distance"][ii]
                            elif self.shell_args.task == "exploration":
                                eval_output_file.write("%s,%f,%d\n" % (
                                    eval_stats["episode_ids"][ii],
                                    eval_stats["reward"][ii],
                                    eval_stats["visited_states"][ii],
                                ))
                                eval_stats_means[
                                    "visited_states"] += eval_stats[
                                        "visited_states"][ii]
                            elif self.shell_args.task == "flee":
                                eval_output_file.write("%s,%f,%f\n" % (
                                    eval_stats["episode_ids"][ii],
                                    eval_stats["reward"][ii],
                                    eval_stats["distance_from_start"][ii],
                                ))
                                eval_stats_means[
                                    "distance_from_start"] += eval_stats[
                                        "distance_from_start"][ii]
                            eval_output_file.flush()
                            if eval_stats["num_episodes"][ii] == dataset_sizes[
                                    ii]:
                                to_pause.append(ii)

                        episode_rewards.append(current_episode_rewards[ii])
                        current_episode_rewards[ii] = 0
                        episode_lengths.append(current_episode_lengths[ii])
                        current_episode_lengths[ii] = 0
                        eval_stats["start_geodesic_distance"][ii] = obs[
                            "goal_geodesic_distance"][ii]

                # If done then clean the history of observations.
                masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                           for done_ in dones]).to(self.device)

                # Iterate in reverse so earlier indices stay valid when multiple envs are paused.
                to_pause.reverse()
                for ii in to_pause:
                    # Pause the finished environments in the vectorized env.
                    print("Pausing env", ii)
                    self.envs.unwrapped.pause_at(ii)
                    current_episode_rewards = np.concatenate(
                        (current_episode_rewards[:ii],
                         current_episode_rewards[ii + 1:]))
                    current_episode_lengths = np.concatenate(
                        (current_episode_lengths[:ii],
                         current_episode_lengths[ii + 1:]))
                    for key in eval_stats:
                        eval_stats[key] = np.concatenate(
                            (eval_stats[key][:ii], eval_stats[key][ii + 1:]))
                    dataset_sizes = np.concatenate(
                        (dataset_sizes[:ii], dataset_sizes[ii + 1:]))

                    for key in obs:
                        if type(obs[key]) == torch.Tensor:
                            obs[key] = torch.cat(
                                (obs[key][:ii], obs[key][ii + 1:]), dim=0)
                        else:
                            obs[key] = np.concatenate(
                                (obs[key][:ii], obs[key][ii + 1:]), axis=0)

                    recurrent_hidden_states = torch.cat(
                        (recurrent_hidden_states[:ii],
                         recurrent_hidden_states[ii + 1:]),
                        dim=0)
                    masks = torch.cat((masks[:ii], masks[ii + 1:]), dim=0)

                if len(dataset_sizes) == 0:
                    progress_bar.close()
                    all_done = True

            total_num_steps += self.shell_args.num_processes

            if iter_count % (self.shell_args.log_interval * 100) == 0:
                log_dict = {}
                if len(episode_rewards) > 1:
                    end = time.time()
                    nsteps = total_num_steps - fps_timer[1]
                    fps = int((total_num_steps - fps_timer[1]) /
                              (end - fps_timer[0]))
                    timers /= nsteps
                    env_spf = timers[0]
                    forward_spf = timers[1]
                    print((
                        "{} Updates {}, num timesteps {}, FPS {}, Env FPS {}, "
                        "\n Last {} training episodes: mean/median reward {:.3f}/{:.3f}, "
                        "min/max reward {:.3f}/{:.3f}\n").format(
                            datetime.datetime.now(),
                            iter_count,
                            total_num_steps,
                            fps,
                            int(1.0 / env_spf),
                            len(episode_rewards),
                            np.mean(episode_rewards),
                            np.median(episode_rewards),
                            np.min(episode_rewards),
                            np.max(episode_rewards),
                        ))

                    if self.shell_args.tensorboard:
                        log_dict.update({
                            "stats/full_spf":
                            1.0 / (fps + 1e-10),
                            "stats/env_spf":
                            env_spf,
                            "stats/forward_spf":
                            forward_spf,
                            "stats/full_fps":
                            fps,
                            "stats/env_fps":
                            1.0 / (env_spf + 1e-10),
                            "stats/forward_fps":
                            1.0 / (forward_spf + 1e-10),
                            "episode/mean_rewards":
                            np.mean(episode_rewards),
                            "episode/median_rewards":
                            np.median(episode_rewards),
                            "episode/min_rewards":
                            np.min(episode_rewards),
                            "episode/max_rewards":
                            np.max(episode_rewards),
                            "episode/mean_lengths":
                            np.mean(episode_lengths),
                            "episode/median_lengths":
                            np.median(episode_lengths),
                            "episode/min_lengths":
                            np.min(episode_lengths),
                            "episode/max_lengths":
                            np.max(episode_lengths),
                        })
                        self.eval_logger.dict_log(log_dict, step=self.log_iter)
                    fps_timer[0] = time.time()
                    fps_timer[1] = total_num_steps
                    timers[:] = 0
            iter_count += 1
        print("Finished testing")
        print("Wrote results to", eval_output_file.name)

        eval_stats_means = {
            key: val / eval_stats_means["num_episodes"]
            for key, val in eval_stats_means.items()
        }
        if self.shell_args.tensorboard:
            log_dict = {"single_episode/reward": eval_stats_means["reward"]}
            if self.shell_args.task == "pointnav":
                log_dict.update({
                    "single_episode/num_steps":
                    eval_stats_means["num_steps"],
                    "single_episode/spl":
                    eval_stats_means["spl"],
                    "single_episode/success":
                    eval_stats_means["success"],
                    "single_episode/start_geodesic_distance":
                    eval_stats_means["start_geodesic_distance"],
                    "single_episode/end_geodesic_distance":
                    eval_stats_means["end_geodesic_distance"],
                    "single_episode/delta_geodesic_distance":
                    eval_stats_means["delta_geodesic_distance"],
                })
            elif self.shell_args.task == "exploration":
                log_dict["single_episode/visited_states"] = eval_stats_means[
                    "visited_states"]
            elif self.shell_args.task == "flee":
                log_dict[
                    "single_episode/distance_from_start"] = eval_stats_means[
                        "distance_from_start"]
            self.eval_logger.dict_log(log_dict, step=self.log_iter)
        self.envs.unwrapped.resume_all()
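As a minimal sketch of the pause-and-shrink pattern used above when one worker runs out of episodes, the snippet below removes the paused index from every per-process array and tensor so the remaining workers stay aligned. The state shapes and the drop_index helper are illustrative; in the trainer the pause itself goes through self.envs.unwrapped.pause_at(ii).

import numpy as np
import torch


def drop_index(arrays, tensors, ii):
    """Remove process ii from per-process numpy arrays and torch tensors."""
    arrays = [np.concatenate((a[:ii], a[ii + 1:])) for a in arrays]
    tensors = [torch.cat((t[:ii], t[ii + 1:]), dim=0) for t in tensors]
    return arrays, tensors


# Illustrative per-process state for three workers.
current_episode_rewards = np.zeros(3)
dataset_sizes = np.array([5, 2, 7])
recurrent_hidden_states = torch.zeros(3, 512)
masks = torch.ones(3, 1)

to_pause = [1]  # worker 1 has finished all of its episodes
for ii in reversed(to_pause):  # reverse so earlier indices stay valid
    # In the trainer, self.envs.unwrapped.pause_at(ii) would be called here.
    arrays, tensors = drop_index(
        [current_episode_rewards, dataset_sizes],
        [recurrent_hidden_states, masks], ii)
    current_episode_rewards, dataset_sizes = arrays
    recurrent_hidden_states, masks = tensors

print(dataset_sizes, recurrent_hidden_states.shape, masks.shape)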
Example No. 12
def obs_to_images(obs):
    img = obs["rgb"].copy()
    images = [img.transpose(0, 2, 3, 1)]

    # Draw top down view
    if "visited_grid" in obs:
        top_down_map = obs["visited_grid"][0, ...]
    elif "top_down_map" in obs:
        top_down_map = maps.colorize_topdown_map(obs["top_down_map"]["map"])
        map_size = 1024
        original_map_size = top_down_map.shape[:2]
        if original_map_size[0] > original_map_size[1]:
            map_scale = np.array(
                (1, original_map_size[1] * 1.0 / original_map_size[0]))
        else:
            map_scale = np.array(
                (original_map_size[0] * 1.0 / original_map_size[1], 1))

        new_map_size = np.round(map_size * map_scale).astype(np.int32)
        # OpenCV expects w, h but map size is in h, w
        top_down_map = cv2.resize(top_down_map,
                                  (new_map_size[1], new_map_size[0]))

        map_agent_pos = obs["top_down_map"]["agent_map_coord"]
        map_agent_pos = np.round(map_agent_pos * new_map_size /
                                 original_map_size).astype(np.int32)
        top_down_map = maps.draw_agent(top_down_map,
                                       map_agent_pos,
                                       obs["heading"] - np.pi / 2,
                                       agent_radius_px=top_down_map.shape[0] /
                                       40)
    else:
        top_down_map = None

    normalize = [True]
    titles = [(
        ("Method: %s" % obs["method"].replace("_", " ")),
        ("Step: %03d Reward: %.3f" %
         (obs["step"][0], obs.get("reward", [0])[0])),
        ("Action: %s" %
         string.capwords(obs["action_taken_name"].replace("_", " "))),
    )]
    images.append(top_down_map)
    if "visited" in obs:
        titles.append((("Visited Cube Count:  %d" % obs["visited"][0]), ))
    elif "distance_from_start" in obs:
        titles.append("Geo Dist From Origin: %.3f" %
                      obs["distance_from_start"][0])
    elif "pointgoal" in obs:
        titles.append((("Euc Dist:  %.3f" % obs["pointgoal"][0, 0]),
                       ("Geo Dist: %.3f" % obs["goal_geodesic_distance"][0])))

    normalize.append(False)

    for key, val in obs.items():
        if key == "depth" or key == "output_depth":
            normalize.append(False)
            val = val[:, 0, ...]
            depth = np.clip(val, -0.5, 0.5)
            depth += 0.5
            depth *= 255
            titles.append(key)
            depth = depth.astype(np.uint8)
            depth = np.reshape(depth, (-1, depth.shape[-1]))
            images.append(depth)
        elif key == "surface_normals" or key == "output_surface_normals":
            titles.append(key)
            normalize.append(False)
            val = val.copy()
            if key == "output_surface_normals":
                # Raw network outputs still need to be normalized to unit length
                val /= np.sqrt(np.sum(val**2, axis=1, keepdims=True))
            surfnorm = (np.clip(
                (val + 1), 0, 2) * 127).astype(np.uint8).transpose(
                    (0, 2, 3, 1))
            images.append(surfnorm)
        elif key == "semantic":
            titles.append(key)
            normalize.append(False)
            seg = (val * 314.159 % 255).astype(np.uint8)
            seg = np.reshape(seg, (-1, seg.shape[-1]))
            images.append(seg)
        elif key == "output_reconstruction":
            titles.append(key)
            normalize.append(False)
            val = np.clip(val, -0.5, 0.5)
            val += 0.5
            val *= 255
            val = val.astype(np.uint8).transpose((0, 2, 3, 1))
            images.append(val)
        elif key in {
                "action_prob", "action_taken", "egomotion_pred",
                "best_next_action"
        }:
            if key == "action_prob":
                titles.append(("Output Distribution",
                               "p(Forward)     p(Left)     p(Right)"))
            else:
                titles.append(key)
            if val is not None:
                normalize.append(True)
                prob_hists = np.concatenate(
                    [draw_probability_hist(pi) for pi in val.copy()], axis=0)
                images.append(prob_hists)

            else:
                images.append(None)
                normalize.append(False)
    images.append(top_down_map)
    normalize.append(True)
    titles = [
        string.capwords(title.replace("_", " "))
        if isinstance(title, str) else title for title in titles
    ]
    return images, titles, normalize
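The aspect-preserving resize applied to the top-down map above can be isolated into a small helper. This is only a sketch, assuming a NumPy uint8 image and OpenCV; resize_topdown_map and the 1024-pixel target are illustrative names, not part of the original code.

import cv2
import numpy as np


def resize_topdown_map(top_down_map: np.ndarray, max_size: int = 1024) -> np.ndarray:
    """Resize so the longer side becomes max_size while keeping the aspect ratio.

    Mirrors the scaling above; note that cv2.resize takes (width, height).
    """
    original_h, original_w = top_down_map.shape[:2]
    if original_h > original_w:
        scale = np.array((1.0, original_w / original_h))
    else:
        scale = np.array((original_h / original_w, 1.0))
    new_h, new_w = np.round(max_size * scale).astype(np.int32)
    return cv2.resize(top_down_map, (int(new_w), int(new_h)))


# A dummy 300x200 RGB map scaled so its longer side is 1024 px -> (1024, 683, 3).
dummy_map = np.zeros((300, 200, 3), dtype=np.uint8)
print(resize_topdown_map(dummy_map).shape)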
Example No. 13
def observations_to_image(observation: Dict,
                          info: Dict,
                          reward,
                          weights_output=None,
                          aux_tasks=[]) -> np.ndarray:
    r"""Generate image of single frame from observation and info
    returned from a single environment step().

    Args:
        observation: observation returned from an environment step().
        info: info returned from an environment step().
        reward: running episode reward for the current frame.
        weights_output: attention weights over the auxiliary tasks, for visualization.
        aux_tasks: names of the auxiliary tasks, used to label the weights strip.

    Returns:
        generated image of a single frame.
    """
    egocentric_view = []
    if "rgb" in observation:
        observation_size = observation["rgb"].shape[0]
        rgb = observation["rgb"]
        if not isinstance(rgb, np.ndarray):
            rgb = rgb.cpu().numpy()

        egocentric_view.append(rgb)

    # draw depth map if observation has depth info
    if "depth" in observation:
        observation_size = observation["depth"].shape[0]
        depth_map = observation["depth"].squeeze() * 255.0
        if not isinstance(depth_map, np.ndarray):
            depth_map = depth_map.cpu().numpy()

        depth_map = depth_map.astype(np.uint8)
        depth_map = np.stack([depth_map for _ in range(3)], axis=2)
        egocentric_view.append(depth_map)

    assert (
        len(egocentric_view) > 0
    ), "Expected at least one visual sensor enabled."
    egocentric_view = np.concatenate(egocentric_view, axis=1)

    # draw collision
    if "collisions" in info and info["collisions"]["is_collision"]:
        egocentric_view = draw_collision(egocentric_view)

    frame = egocentric_view

    if "top_down_map" in info:
        top_down_map = info["top_down_map"]["map"]
        top_down_map = maps.colorize_topdown_map(
            top_down_map, info["top_down_map"]["fog_of_war_mask"]
        )
        map_agent_pos = info["top_down_map"]["agent_map_coord"]
        top_down_map = maps.draw_agent(
            image=top_down_map,
            agent_center_coord=map_agent_pos,
            agent_rotation=info["top_down_map"]["agent_angle"],
            agent_radius_px=top_down_map.shape[0] // 16,
        )

        if top_down_map.shape[0] > top_down_map.shape[1]:
            top_down_map = np.rot90(top_down_map, 1)

        # scale top down map to align with rgb view
        old_h, old_w, _ = top_down_map.shape
        top_down_height = observation_size
        top_down_width = int(float(top_down_height) / old_h * old_w)
        # cv2 resize (dsize is width first)
        top_down_map = cv2.resize(
            top_down_map,
            (top_down_width, top_down_height),
            interpolation=cv2.INTER_CUBIC,
        )
        frame = np.concatenate((egocentric_view, top_down_map), axis=1)

    if weights_output is not None and len(aux_tasks) > 1:
        # add a strip to the right of the video
        strip_height = observation_size # ~256 -> we'll have 5-10 tasks, let's do 24 pixels each
        strip_gap = 24
        strip_width = strip_gap + 12
        strip = np.ones((strip_height, strip_width, 3), dtype=np.uint8) * 255 # white bg

        num_tasks = weights_output.size(0)
        total_height = num_tasks * strip_gap
        offset = int((strip_height - total_height)/2)
        assert offset > 0, "too many aux tasks to visualize"
        for i in range(num_tasks):
            start_height = i * strip_gap + offset
            strength = int(255 * weights_output[i])
            color = np.array([strength, 0, 0])
            if weights_output[i] > 1.001:
                raise Exception(f"weights is {weights_output}, that's too big")
            strip[start_height: start_height + strip_gap] = color

            task_name = AUX_ABBREV.get(aux_tasks[i], aux_tasks[i])
            task_abbrev = task_name[:3]
            cv2.putText(img=strip,
                        text=task_abbrev,
                        org=(2, int(start_height + strip_gap / 2)),
                        fontFace=2,
                        fontScale=0.4,
                        color=(255, 255, 255),  # white; uint8 channels max out at 255
                        thickness=1)
        frame = np.concatenate((frame, strip), axis=1)
    return frame
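The auxiliary-weight strip can also be exercised on its own. The sketch below renders the same kind of strip from a plain list of weights and abbreviated task names; the task names are illustrative and NumPy floats stand in for the torch tensor used above.

import cv2
import numpy as np


def render_weight_strip(weights, task_names, strip_height=256, strip_gap=24,
                        strip_width=36):
    """Render a vertical strip whose red intensity encodes each task weight."""
    strip = np.full((strip_height, strip_width, 3), 255, dtype=np.uint8)  # white bg
    offset = (strip_height - len(weights) * strip_gap) // 2
    assert offset > 0, "too many aux tasks to visualize"
    for i, (weight, name) in enumerate(zip(weights, task_names)):
        start = offset + i * strip_gap
        strip[start:start + strip_gap] = np.array([int(255 * weight), 0, 0])
        cv2.putText(strip, name[:3], (2, start + strip_gap // 2),
                    fontFace=2, fontScale=0.4, color=(255, 255, 255), thickness=1)
    return strip


# Illustrative weights and task abbreviations.
strip = render_weight_strip([0.1, 0.6, 0.3], ["cpc", "inv", "td"])
print(strip.shape)  # (256, 36, 3)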
Example No. 14
            (path_point[0] - 2.04),
            grid_dimensions,
            pathfinder=sim.pathfinder,
        ) for path_point in second_xyz
    ]

    ground_truth = [
        maps.to_grid(
            -(path_point[2]) + 19.25,
            (path_point[0] - 2.04),
            grid_dimensions,
            pathfinder=sim.pathfinder,
        ) for path_point in first_xyz
    ]

    colored_map = maps.colorize_topdown_map(hablab_topdown_map)
    trajectory = np.array(trajectory)
    ground_truth = np.array(ground_truth)

    #plt.figure(figsize=(20, 20))
    fig, ax = plt.subplots()
    ax.imshow(colored_map)
    ax.plot(trajectory[:, 1],
            trajectory[:, 0],
            linewidth=1,
            color='b',
            label='estimated')
    ax.plot(ground_truth[:, 1],
            ground_truth[:, 0],
            linewidth=1,
            color='r',
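The plotting call above is cut off by the excerpt. A self-contained sketch of the same idea, assuming the estimated and ground-truth grid coordinates have already been computed, might look like the following; the 'ground truth' label, the dummy map, and the coordinate values are illustrative.

import matplotlib.pyplot as plt
import numpy as np

# Illustrative (row, col) grid coordinates; in the example above they come from
# maps.to_grid applied to the estimated and ground-truth world-space paths.
trajectory = np.array([[10, 10], [12, 14], [15, 20], [18, 25]])
ground_truth = np.array([[10, 10], [13, 13], [16, 19], [20, 24]])
colored_map = np.full((40, 40, 3), 255, dtype=np.uint8)  # stand-in for the colorized map

fig, ax = plt.subplots()
ax.imshow(colored_map)
ax.plot(trajectory[:, 1], trajectory[:, 0], linewidth=1, color='b', label='estimated')
ax.plot(ground_truth[:, 1], ground_truth[:, 0], linewidth=1, color='r', label='ground truth')
ax.legend()
plt.show()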
Example No. 15
def observations_to_image(observation: Dict,
                          info: Dict,
                          pred=None) -> np.ndarray:
    r"""Generate image of single frame from observation and info
    returned from a single environment step().

    Args:
        observation: observation returned from an environment step().
        info: info returned from an environment step().
        pred: optional (predicted category index, predicted egocentric goal offset)
            pair; when given, the predicted goal is drawn on the top-down map and
            the predicted category label is written above it.

    Returns:
        generated image of a single frame.
    """
    egocentric_view = []
    if "rgb" in observation:
        observation_size = observation["rgb"].shape[0]
        rgb = observation["rgb"]
        if not isinstance(rgb, np.ndarray):
            rgb = rgb.cpu().numpy()

        egocentric_view.append(rgb)

    # draw depth map if observation has depth info
    if "depth" in observation:
        observation_size = observation["depth"].shape[0]
        depth_map = observation["depth"].squeeze() * 255.0
        if not isinstance(depth_map, np.ndarray):
            depth_map = depth_map.cpu().numpy()

        depth_map = depth_map.astype(np.uint8)
        depth_map = np.stack([depth_map for _ in range(3)], axis=2)
        egocentric_view.append(depth_map)

    assert (len(egocentric_view) >
            0), "Expected at least one visual sensor enabled."
    egocentric_view = np.concatenate(egocentric_view, axis=1)

    # draw collision
    if "collisions" in info and info["collisions"]["is_collision"]:
        egocentric_view = draw_collision(egocentric_view)

    frame = egocentric_view

    if "top_down_map" in info:
        top_down_map = info["top_down_map"]["map"]
        top_down_map = maps.colorize_topdown_map(
            top_down_map, info["top_down_map"]["fog_of_war_mask"])
        map_agent_pos = info["top_down_map"]["agent_map_coord"]
        top_down_map = maps.draw_agent(
            image=top_down_map,
            agent_center_coord=map_agent_pos,
            agent_rotation=info["top_down_map"]["agent_angle"],
            agent_radius_px=top_down_map.shape[0] // 16,
        )
        if pred is not None:
            from habitat.utils.geometry_utils import quaternion_rotate_vector

            # current_position = sim.get_agent_state().position
            # agent_state = sim.get_agent_state()
            source_rotation = info["top_down_map"]["agent_rotation"]

            rounded_pred = np.round(pred[1])
            direction_vector_agent = np.array(
                [rounded_pred[1], 0, -rounded_pred[0]])
            direction_vector = quaternion_rotate_vector(
                source_rotation, direction_vector_agent)
            # pred_goal_location = source_position + direction_vector.astype(np.float32)

            grid_size = (
                (maps.COORDINATE_MAX - maps.COORDINATE_MIN) / 10000,
                (maps.COORDINATE_MAX - maps.COORDINATE_MIN) / 10000,
            )
            delta_x = int(-direction_vector[0] / grid_size[0])
            delta_y = int(direction_vector[2] / grid_size[1])

            x = np.clip(map_agent_pos[0] + delta_x,
                        a_min=0,
                        a_max=top_down_map.shape[0])
            y = np.clip(map_agent_pos[1] + delta_y,
                        a_min=0,
                        a_max=top_down_map.shape[1])
            point_padding = 12
            for m in range(x - point_padding, x + point_padding + 1):
                for n in range(y - point_padding, y + point_padding + 1):
                    if np.linalg.norm(np.array([m - x, n - y])) <= point_padding and \
                            0 <= m < top_down_map.shape[0] and 0 <= n < top_down_map.shape[1]:
                        top_down_map[m, n] = (0, 255, 255)
            if np.linalg.norm(rounded_pred) < 1:
                assert delta_x == 0 and delta_y == 0

        if top_down_map.shape[0] > top_down_map.shape[1]:
            top_down_map = np.rot90(top_down_map, 1)

        # scale top down map to align with rgb view
        if pred is None:
            old_h, old_w, _ = top_down_map.shape
            top_down_height = observation_size
            top_down_width = int(float(top_down_height) / old_h * old_w)
            # cv2 resize (dsize is width first)
            top_down_map = cv2.resize(
                top_down_map.astype(np.float32),
                (top_down_width, top_down_height),
                interpolation=cv2.INTER_CUBIC,
            )
        else:
            # draw label
            CATEGORY_INDEX_MAPPING = {
                'chair': 0,
                'table': 1,
                'picture': 2,
                'cabinet': 3,
                'cushion': 4,
                'sofa': 5,
                'bed': 6,
                'chest_of_drawers': 7,
                'plant': 8,
                'sink': 9,
                'toilet': 10,
                'stool': 11,
                'towel': 12,
                'tv_monitor': 13,
                'shower': 14,
                'bathtub': 15,
                'counter': 16,
                'fireplace': 17,
                'gym_equipment': 18,
                'seating': 19,
                'clothes': 20
            }
            index2label = {v: k for k, v in CATEGORY_INDEX_MAPPING.items()}
            pred_label = index2label[pred[0]]
            text_height = int(observation_size * 0.1)

            old_h, old_w, _ = top_down_map.shape
            top_down_height = observation_size - text_height
            top_down_width = int(float(top_down_height) / old_h * old_w)
            # cv2 resize (dsize is width first)
            top_down_map = cv2.resize(
                top_down_map.astype(np.float32),
                (top_down_width, top_down_height),
                interpolation=cv2.INTER_CUBIC,
            )

            top_down_map = np.concatenate(
                [np.ones([text_height, top_down_map.shape[1], 3],
                         dtype=np.int32) * 255,
                 top_down_map],
                axis=0)
            top_down_map = cv2.putText(top_down_map,
                                       'C_t: ' + pred_label.replace('_', ' '),
                                       (10, text_height - 10),
                                       cv2.FONT_HERSHEY_SIMPLEX, 1.4,
                                       (0, 0, 0), 2, cv2.LINE_AA)

        frame = np.concatenate((egocentric_view, top_down_map), axis=1)
    return frame
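The prediction-to-map-pixel projection above can be summarized without the habitat helpers. The sketch below uses a plain 2D rotation in place of quaternion_rotate_vector and an assumed metres-per-cell value standing in for (COORDINATE_MAX - COORDINATE_MIN) / 10000; the function name and axis convention are simplifications, not the original implementation.

import numpy as np


def pred_offset_to_map_coord(agent_map_pos, rounded_pred, agent_heading,
                             map_shape, meters_per_cell=0.025):
    """Project a predicted egocentric (forward, left) offset onto map pixels.

    A plain 2D rotation stands in for quaternion_rotate_vector, and
    meters_per_cell is an assumed stand-in for the habitat grid size.
    """
    forward, left = rounded_pred
    cos_h, sin_h = np.cos(agent_heading), np.sin(agent_heading)
    # Rotate the egocentric offset into the world frame.
    world_dx = cos_h * forward - sin_h * left
    world_dz = sin_h * forward + cos_h * left
    # Convert metres to map cells and clip to the map bounds, as above.
    x = int(np.clip(agent_map_pos[0] - world_dx / meters_per_cell, 0, map_shape[0] - 1))
    y = int(np.clip(agent_map_pos[1] + world_dz / meters_per_cell, 0, map_shape[1] - 1))
    return x, y


print(pred_offset_to_map_coord((500, 500), (1.0, 0.0), np.pi / 4, (1000, 1000)))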