def __init__(
        self,
        root_dir: str,
        split: str = "train",
        seed: int = 5,
        train_ratio: float = 0.9,
    ):
        """Initializes the MPII dataset class, relevant paths and the Joints
        class for remapping of MPII-formatted joints to that of AIT.
        See the joints mapping at
        https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/doc/output.md#hand-output-format

        Args:
            root_dir (str): Path to the directory with image samples.
            split (str): set to 'train', 'val' or 'test'.
            seed (int, optional): Seed for the train/val split. Defaults to 5.
            train_ratio (float, optional): Fraction of data used for training. Defaults to 0.9.
        """
        self.root_dir = root_dir
        self.split = split
        self.seed = seed
        self.train_ratio = train_ratio
        split_set = "train" if self.split in ["train", "val"] else split
        self.image_dir_path = os.path.join(self.root_dir,
                                           f"manual_{split_set}")
        self.label_dir_path = os.path.join(self.root_dir,
                                           f"manual_{split_set}")
        self.img_names = self.get_image_names()
        self.labels = self.get_labels()
        self.indices = self.create_train_val_split()
        # To convert from MPII to AIT format.
        self.joints = Joints()
Example 2
 def __init__(
     self,
     root_dir: str,
     split: str,
     annotor: str = "all",
     seed: int = 5,
     train_ratio: float = 0.9,
 ):
     """Initializes the Interhand dataset class, relevant paths, meta_info jsons,
     dataframes and the Joints class for remapping InterHand-formatted joints to
     that of AIT.
     Args:
         root_dir (str): Path to the directory with image samples.
         split (str): set to 'train', 'val' or 'test'.
          annotor (str, optional): Annotation source to use. Defaults to 'all'. Other
            options are 'human_annot' and 'machine_annot'.
     """
     self.root_dir = root_dir
      # To convert from InterHand to AIT format.
     self.joints = Joints()
     self.seed = seed
     self.train_ratio = train_ratio
     self.annotor = annotor  # "human_annot" and "machine_annot" possible.
     self.annotation_sampling_folder = "InterHand2.6M.annotations.5.fps"
     self.image_sampling_folder = "InterHand2.6M_5fps_batch0/images"
     self._split = split
     self.split = "train" if split in ["train", "val"] else split
     (
         self.image_info,
         self.annotations_info,
         self.camera_info,
         self.joints_dict,
     ) = self.get_meta_info()
     self.indices = self.create_train_val_split()
 def __init__(self, root_dir: str, split: str = "train"):
     self.root_dir = root_dir
     self.split = split
     self.joints_list, self.img_list = self.get_joints_labels_and_images()
     self.img_dict = {item["id"]: item for item in self.img_list}
     self.joints = Joints()
     self.indices = self.create_train_val_split()
Example 4
    def __init__(
        self, root_dir: str, split: str, seed: int = 5, train_ratio: float = 0.9
    ):
        """Initializes the freihand dataset class, relevant paths and the Joints
        class for remapping of freihand formatted joints to that of AIT.

        Args:
            root_dir (str): Path to the directory with image samples.
            split (str): set to 'train', 'val' or 'test'.
            seed (int, optional): Seed for the train/val split. Defaults to 5.
            train_ratio (float, optional): Fraction of data used for training. Defaults to 0.9.
        """
        self.root_dir = root_dir
        self.split = split
        self.seed = seed
        self.train_ratio = train_ratio
        self.labels = self.get_labels()
        self.scale = self.get_scale()
        self.camera_param = self.get_camera_param()
        self.img_names, self.img_path = self.get_image_names()
        self.indices = self.create_train_val_split()
        # To convert from freihand to AIT format.
        self.joints = Joints()
Example 5
import copy
from math import cos, pi, sin
from typing import Tuple, Union

import numpy as np
import torch
from PIL import Image
from src.data_loader.joints import Joints
from src.types import CAMERA_PARAM, JOINTS_3D, JOINTS_25D, SCALE
from src.constants import MANO_MAT
from torch.utils.data import ConcatDataset, DataLoader, Dataset, WeightedRandomSampler
from torchvision import transforms

JOINTS = Joints()
PARENT_JOINT = JOINTS.mapping.ait.wrist
CHILD_JOINT = JOINTS.mapping.ait.index_mcp


def convert_to_2_5D(K: CAMERA_PARAM,
                    joints_3D: JOINTS_3D) -> Tuple[JOINTS_25D, SCALE]:
    """Converts coordinates from 3D to 2.5D
    Refer: https://arxiv.org/pdf/1804.09534.pdf

    Args:
        K (CAMERA_PARAM): 3x3 matrix with camera parameters.
        joints_3D (JOINTS_3D): Original 3D coordinates unscaled.

    Returns:
        Tuple[JOINTS_25D, SCALE]: 2.5 D coordinates and scale information.
    """
    # NOTE: the function body below this point is truncated in the source; the remainder
    # is a reconstruction sketch of the 2.5D conversion described in the referenced paper,
    # using the wrist -> index_mcp bone length as the scale.
    scale = ((joints_3D[CHILD_JOINT] - joints_3D[PARENT_JOINT]) ** 2).sum() ** 0.5
    joints_25D = (K @ joints_3D.T).T
    joints_25D = joints_25D / joints_25D[..., -1:]
    # Scale-normalized, root-relative depth in the last column.
    joints_25D[..., -1] = (joints_3D[..., -1] - joints_3D[PARENT_JOINT, -1]) / scale
    return joints_25D, scale
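
A quick usage sketch of convert_to_2_5D with made-up intrinsics and joints; it assumes the reconstructed tail above and that the repository's src package (for the Joints mapping) is importable. All values are illustrative only.

import torch

# Hypothetical pinhole intrinsics and a random 21x3 joint set, purely for illustration.
K = torch.tensor([[500.0, 0.0, 112.0], [0.0, 500.0, 112.0], [0.0, 0.0, 1.0]])
joints_3D = torch.rand(21, 3) + torch.tensor([0.0, 0.0, 0.5])  # keep depth positive
joints_25D, scale = convert_to_2_5D(K, joints_3D)
print(joints_25D.shape, float(scale))  # (21, 3): u, v and a scale-normalized depth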
Example 6
class IH_DB(Dataset):
    """Class to load samples from the Interhand dataset.
    https://mks0601.github.io/InterHand2.6M/
    Inherits from the Dataset class in  torch.utils.data.
    Note: The keypoints are mapped to format used at AIT.
    Refer to joint_mapping.json in src/data_loader/utils.
    """

    IS_LEFT = True

    def __init__(
        self,
        root_dir: str,
        split: str,
        annotor: str = "all",
        seed: int = 5,
        train_ratio: float = 0.9,
    ):
        """Initializes the Interhand dataset class, relevant paths, meta_info jsons,
        dataframes and the Joints class for remapping InterHand-formatted joints to
        that of AIT.
        Args:
            root_dir (str): Path to the directory with image samples.
            split (str): set to 'train', 'val' or 'test'.
            annotor (str, optional): Annotation source to use. Defaults to 'all'. Other
              options are 'human_annot' and 'machine_annot'.
        """
        self.root_dir = root_dir
        # To convert from InterHand to AIT format.
        self.joints = Joints()
        self.seed = seed
        self.train_ratio = train_ratio
        self.annotor = annotor  # "human_annot" and "machine_annot" possible.
        self.annotation_sampling_folder = "InterHand2.6M.annotations.5.fps"
        self.image_sampling_folder = "InterHand2.6M_5fps_batch0/images"
        self._split = split
        self.split = "train" if split in ["train", "val"] else split
        (
            self.image_info,
            self.annotations_info,
            self.camera_info,
            self.joints_dict,
        ) = self.get_meta_info()
        self.indices = self.create_train_val_split()

    def get_meta_info(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, dict]:
        data = read_json(
            os.path.join(
                self.root_dir,
                self.annotation_sampling_folder,
                self.annotor,
                f"InterHand2.6M_{self.split}_data.json",
            )
        )
        camera_info = pd.DataFrame(
            read_json(
                os.path.join(
                    self.root_dir,
                    self.annotation_sampling_folder,
                    self.annotor,
                    f"InterHand2.6M_{self.split}_camera.json",
                )
            )
        ).T
        joints_dict = read_json(
            os.path.join(
                self.root_dir,
                self.annotation_sampling_folder,
                self.annotor,
                f"InterHand2.6M_{self.split}_joint_3d.json",
            )
        )
        annotations_info = pd.DataFrame(data["annotations"])
        # selecting only single hand images
        annotations_info = annotations_info[
            annotations_info["hand_type"] != "interacting"
        ]
        annotations_info = annotations_info.set_index(np.arange(len(annotations_info)))
        image_info = pd.DataFrame(data["images"]).set_index("id")
        return image_info, annotations_info, camera_info, joints_dict

    def get_camera_params(
        self, camera, capture_id
    ) -> Tuple[np.array, np.array, np.array]:
        camera_param = self.camera_info.loc[str(capture_id)]
        t, r, (fx, fy), (px, py) = (
            camera_param.campos[camera],
            camera_param.camrot[camera],
            camera_param.focal[camera],
            camera_param.princpt[camera],
        )
        intrinsic_param = np.array([[fx, 0, px], [0, fy, py], [0, 0, 1.0]])
        # intrinsic_param = np.array([[fx, 0, px], [0, fy, py]])
        return intrinsic_param, np.array(r), np.array(t)

    def get_joints(
        self, capture_id: Union[int, str], frame_idx: Union[int, str]
    ) -> Tuple[np.array, np.array, bool]:
        joint_item = self.joints_dict[str(capture_id)][str(frame_idx)]
        if joint_item["hand_type"] == "left":
            return (
                np.array(joint_item["world_coord"][-21:]),
                np.array(joint_item["joint_valid"][-21:]),
                IH_DB.IS_LEFT,
            )
        elif joint_item["hand_type"] == "right":
            return (
                np.array(joint_item["world_coord"][:21]),
                np.array(joint_item["joint_valid"][:21]),
                not IH_DB.IS_LEFT,
            )
        else:
            raise NotImplementedError

    def create_train_val_split(self) -> np.array:
        """Creates split for train and val data in mpii
        Raises:
            NotImplementedError: In case the split doesn't match test, train or val.
        Returns:
            np.array: array of indices
        """
        num_images = len(self.annotations_info)
        train_indices, val_indices = train_test_split(
            np.arange(num_images), train_size=self.train_ratio, random_state=self.seed
        )
        if self._split == "train":
            return np.sort(train_indices)
        elif self._split == "val":
            return np.sort(val_indices)
        elif self._split == "test":
            return np.arange(len(self.annotations_info))
        else:
            raise NotImplementedError

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, idx: int) -> dict:
        """Returns a sample corresponding to the index.

        Args:
            idx (int): index

        Returns:
            dict: item with following elements.
                "image" in opencv bgr format.
                "K": camera params
                "joints3D": 3D coordinates of joints in AIT format.
        """

        if torch.is_tensor(idx):
            idx = idx.tolist()
        idx_ = self.indices[idx]
        image_id = self.annotations_info.loc[idx_]["image_id"]
        image_item = self.image_info.loc[image_id]
        image = cv2.imread(
            os.path.join(
                self.root_dir,
                self.image_sampling_folder,
                self.split,
                image_item.file_name,
            )
        )
        joints, joints_valid, is_left = self.get_joints(
            image_item.capture, image_item.frame_idx
        )
        joints, joints_valid = (
            self.joints.interhand_to_ait(joints),
            self.joints.interhand_to_ait(joints_valid),
        )
        intrinsic_camera_matrix, camera_rot, camera_t = self.get_camera_params(
            image_item.camera, image_item.capture
        )
        if is_left:
            image = cv2.flip(image, 1)
            _, W = image.shape[:2]
            intrinsic_camera_matrix = (
                np.float32([[-1, 0, W - 1], [0, 1, 0], [0, 0, 1]])
                @ intrinsic_camera_matrix
            )

        joints_camera_frame = (joints - camera_t) @ camera_rot.T
        # To avoid division by zero.
        joints_camera_frame[:, -1] += 1e-5
        sample = {
            "image": image,
            "K": torch.tensor(intrinsic_camera_matrix).float(),
            "joints3D": torch.tensor(joints_camera_frame).float() / 1000.0,
            "joints_valid": torch.tensor(joints_valid),
        }
        return sample
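
One detail in __getitem__ above deserves a sanity check: horizontally flipping an image of width W sends pixel column u to W - 1 - u, and pre-multiplying the intrinsics by [[-1, 0, W - 1], [0, 1, 0], [0, 0, 1]] makes the unchanged 3D joints project onto exactly those mirrored columns. A tiny numeric check (all values made up):

import numpy as np

W = 512
K = np.array([[500.0, 0.0, 256.0], [0.0, 500.0, 256.0], [0.0, 0.0, 1.0]])
flip = np.float32([[-1, 0, W - 1], [0, 1, 0], [0, 0, 1]])

point = np.array([0.1, 0.2, 1.0])  # a 3D point in the camera frame
p = K @ point                      # projection with the original intrinsics
q = flip @ K @ point               # projection with the flipped intrinsics
u, v = p[0] / p[2], p[1] / p[2]
u_f, v_f = q[0] / q[2], q[1] / q[2]
assert np.isclose(u_f, (W - 1) - u) and np.isclose(v_f, v)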
Example 7
import os
from typing import Union

import cv2
import matplotlib.pyplot as plt
import numpy as np
import torch
from comet_ml import Experiment
from pytorch_lightning.loggers import comet
from src.constants import MASTER_THESIS_DIR
from src.data_loader.joints import Joints
from src.types import JOINTS_3D, JOINTS_25D
from src.utils import read_json
from torchvision import transforms

joints = Joints()


def plot_hand(
    axis: plt.Axes,
    coords_hand: np.array,
    plot_3d: bool = False,
    linewidth: str = "1",
    linestyle: str = "-",
    alpha: float = 1.0,
    ms=1,
):
    """Makes a hand stick figure from the coordinates wither in uv plane or xyz plane on the passed axes object.
    Code adapted from:  https://github.com/lmb-freiburg/freihand/blob/master/utils/eval_util.py

    Args:
        axis (plt.Axes): Axis object on which the hand is drawn.
        coords_hand (np.array): Array of 21 keypoints, either (21, 2) uv or (21, 3) xyz.
        plot_3d (bool, optional): Set to True to draw on a 3D axis. Defaults to False.
        linewidth (str, optional): Width of the bone lines. Defaults to "1".
        linestyle (str, optional): Style of the bone lines. Defaults to "-".
        alpha (float, optional): Transparency of the drawn lines. Defaults to 1.0.
        ms (int, optional): Marker size for the joints. Defaults to 1.
    """
    # NOTE: the body of this function is truncated in the source snippet.

class MPII_DB(Dataset):
    def __init__(
        self,
        root_dir: str,
        split: str = "train",
        seed: int = 5,
        train_ratio: float = 0.9,
    ):
        """Initializes the MPII dataset class, relevant paths and the Joints
        class for remapping of MPII-formatted joints to that of AIT.
        See the joints mapping at
        https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/doc/output.md#hand-output-format

        Args:
            root_dir (str): Path to the directory with image samples.
            split (str): set to 'train', 'val' or 'test'.
            seed (int, optional): Seed for the train/val split. Defaults to 5.
            train_ratio (float, optional): Fraction of data used for training. Defaults to 0.9.
        """
        self.root_dir = root_dir
        self.split = split
        self.seed = seed
        self.train_ratio = train_ratio
        split_set = "train" if self.split in ["train", "val"] else split
        self.image_dir_path = os.path.join(self.root_dir,
                                           f"manual_{split_set}")
        self.label_dir_path = os.path.join(self.root_dir,
                                           f"manual_{split_set}")
        self.img_names = self.get_image_names()
        self.labels = self.get_labels()
        self.indices = self.create_train_val_split()
        # To convert from MPII to AIT format.
        self.joints = Joints()

    def get_image_names(self) -> List[str]:
        """Gets the name of all the files in root_dir.
        Make sure there are only images in that directory as it reads all the file names.

        Returns:
            List[str]: List of image names.
        """

        img_names = [
            file_name for file_name in next(os.walk(self.image_dir_path))[2]
            if ".jpg" in file_name
        ]
        img_names.sort()
        # Removing images whose annotations fall outside the image bounds.
        try:
            img_names.remove("Ricki_unit_8.flv_000003_l.jpg")
            img_names.remove("Ricki_unit_8.flv_000002_l.jpg")
        except Exception as e:
            print(f"Out of frame images not found {e}")
        return img_names

    def get_labels(self) -> Dict[str, dict]:
        label_file_names = [
            file_name for file_name in next(os.walk(self.label_dir_path))[2]
            if ".json" in file_name
        ]
        labels = {
            file_name.replace(".json", ""):
            read_json(os.path.join(self.label_dir_path, file_name))
            for file_name in label_file_names
        }
        return labels

    def create_train_val_split(self) -> np.array:
        """Creates split for train and val data in mpii
        Raises:
            NotImplementedError: In case the split doesn't match test, train or val.

        Returns:
            np.array: array of indices
        """
        num_unique_images = len(self.img_names)
        train_indices, val_indices = train_test_split(
            np.arange(num_unique_images),
            train_size=self.train_ratio,
            random_state=self.seed,
        )
        if self.split == "train":
            return np.sort(train_indices)
        elif self.split == "val":
            return np.sort(val_indices)
        elif self.split == "test":
            return np.arange(len(self.img_names))
        else:
            raise NotImplementedError

    def __len__(self):
        return len(self.indices)

    def __getitem__(self,
                    idx: int) -> Dict[str, Union[np.array, torch.Tensor]]:
        """Returns a sample corresponding to the index.

        Args:
            idx (int): index

        Returns:
            dict: item with following elements.
                "image" in opencv bgr format.
                "K": camera params (Indetity matrix in this case)
                "joints3D": 3D coordinates of joints in AIT format. (z coordinate is 1.0)
        """

        if torch.is_tensor(idx):
            idx = idx.tolist()
        idx_ = self.indices[idx]
        img_name = os.path.join(self.image_dir_path, self.img_names[idx_])
        img = cv2.cvtColor(cv2.imread(img_name), cv2.COLOR_BGR2RGB)
        # MPII follows the same joint-naming strategy as FreiHAND.
        label = self.labels[self.img_names[idx_].replace(".jpg", "")]
        joints3D = self.joints.freihand_to_ait(
            torch.tensor(label["hand_pts"]).float())
        if label["is_left"] == 1:
            # flipping horizontally to make it a right hand
            img = cv2.flip(img, 1)
            # width - x coord
            joints3D[:, 0] = img.shape[1] - joints3D[:, 0]
        camera_param = torch.eye(3).float()
        joints_valid = torch.ones_like(joints3D[..., -1:])
        sample = {
            "image": img,
            "K": camera_param,
            "joints3D": joints3D,
            "joints_valid": joints_valid,
        }

        return sample
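
A minimal usage sketch for MPII_DB, wrapping it in a torch DataLoader; the root_dir below is hypothetical and only stands in for a folder containing the manual_train / manual_test directories.

from torch.utils.data import DataLoader

mpii = MPII_DB(root_dir="/data/mpii_hand", split="train")  # made-up path
loader = DataLoader(mpii, batch_size=1, shuffle=True)
batch = next(iter(loader))
print(batch["image"].shape, batch["K"].shape, batch["joints3D"].shape)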
class YTB_DB(Dataset):
    """Class to load samples from the youtube dataset.
    Inherits from the Dataset class in  torch.utils.data.
    Not to be used for supervised learning!
    The camera matrix is identity to fit with the sample augmenter.
    """
    def __init__(self, root_dir: str, split: str = "train"):
        self.root_dir = root_dir
        self.split = split
        self.joints_list, self.img_list = self.get_joints_labels_and_images()
        self.img_dict = {item["id"]: item for item in self.img_list}
        self.joints = Joints()
        self.indices = self.create_train_val_split()

    def get_joints_labels_and_images(self) -> Tuple[dict, dict]:
        """Returns the dictionary conatinign the bound box of the image and dictionary
        containig image information.

        Returns:
            Tuple[dict, dict]: joints, image_dict
                image_dict
                    - `name` - Image name in the form
                        of `youtube/VIDEO_ID/video/frames/FRAME_ID.png`.
                    - `width` - Width of the image.
                    - `height` - Height of the image.
                    - `id` - Image ID.
                joints
                    - `joints` - 21 joints, containing bound box limits as vertices.
                    - `is_left` - Binary value indicating a right/left hand side.
                    - `image_id` - ID to the corresponding entry in `images`.
                    - `id` - Annotation ID (an image can contain multiple hands).
        """
        data_json_path = os.path.join(self.root_dir,
                                      f"youtube_{self.split}.json")
        joints_path = os.path.join(self.root_dir,
                                   f"youtube_{self.split}_joints.json")
        images_json_path = os.path.join(self.root_dir,
                                        f"youtube_{self.split}_images.json")
        if os.path.exists(joints_path) and os.path.exists(images_json_path):
            return read_json(joints_path), read_json(images_json_path)
        else:
            data_json = read_json(data_json_path)
            images_dict = data_json["images"]
            save_json(images_dict, images_json_path)
            annotations_dict = data_json["annotations"]
            joints = self.get_joints_from_annotations(annotations_dict)
            save_json(joints, joints_path)
            return joints, images_dict

    def get_joints_from_annotations(self, annotations: dict) -> dict:
        """Converts vertices corresponding to mano mesh to 21 coordinates signifying
        the bound box.

        Args:
            annotations (dict): dictionary containing annotations.

        Returns:
            dict: same dictionary as annotations except 'vertices' is removed and
                'joints' key is added.
        """
        optimized_vertices = []
        mano_matrix = torch.load(MANO_MAT)
        for elem in tqdm(annotations):
            # joints_21 = sudo_joint_bound(elem["vertices"])
            joints_21 = get_joints_from_mano_mesh(
                torch.tensor(elem["vertices"]), mano_matrix)
            optimized_vertices.append({
                **{key: val
                   for key, val in elem.items() if key != "vertices"},
                **{
                    "joints": joints_21.tolist()
                },
            })
        return optimized_vertices

    def create_train_val_split(self) -> np.array:
        """Creates split for train and val data in mpii
        Raises:
            NotImplementedError: In case the split doesn't match test, train or val.

        Returns:
            np.array: array of indices
        """
        if self.split == "train":
            return np.arange(len(self.joints_list))
        elif self.split == "val":
            valid_index_df = pd.read_csv(
                os.path.join(self.root_dir,
                             f"youtube_{self.split}_invalid_index.csv"))
            return valid_index_df[valid_index_df.valid]["joint_idx"].values
        elif self.split == "test":
            valid_index_df = pd.read_csv(
                os.path.join(self.root_dir,
                             f"youtube_{self.split}_invalid_index.csv"))
            return valid_index_df[valid_index_df.valid]["joint_idx"].values
        else:
            raise NotImplementedError

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, idx: int) -> dict:
        """Returns a sample corresponding to the index.

        Args:
            idx (int): index

        Returns:
            dict: item with following elements.
                "image" in opencv bgr format.
                "K": camera params
                "joints3D": 3D coordinates of joints in AIT format.
        """

        if torch.is_tensor(idx):
            idx = idx.tolist()
        idx_ = self.indices[idx]
        img_name = os.path.join(
            self.root_dir,
            self.img_dict[self.joints_list[idx_]["image_id"]]["name"])
        img = cv2.cvtColor(cv2.imread(img_name.replace(".png", ".jpg")),
                           cv2.COLOR_BGR2RGB)
        joints3D = self.joints.mano_to_ait(
            torch.tensor(self.joints_list[idx_]["joints"]).float())
        if self.joints_list[idx_]["is_left"] == 1:
            # flipping horizontally to make it a right hand
            img = cv2.flip(img, 1)
            # width - x coord
            joints3D[:, 0] = img.shape[1] - joints3D[:, 0]
        joints_raw = joints3D.clone()
        # joints3D = torch.tensor(self.bbox[idx_]["joints"]).float()

        # The image is cropped and rotated using the 2D projections of these coordinates,
        # so the depth is set to 1.0 to avoid problems. For Procrustes alignment use "joints_raw".
        joints3D[..., -1] = 1.0
        camera_param = torch.eye(3).float()
        joints_valid = torch.zeros_like(joints3D[..., -1:])
        sample = {
            "image": img,
            "K": camera_param,
            "joints3D": joints3D,
            "joints_valid": joints_valid,
            "joints_raw": joints_raw,
        }
        return sample
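
The dataset classes above all return samples with the same core keys ("image", "K", "joints3D", "joints_valid"), so they can be pooled. A sketch using torch's ConcatDataset; the paths are illustrative and this is not necessarily how the repository's training code combines them.

from torch.utils.data import ConcatDataset, DataLoader

pooled = ConcatDataset([
    MPII_DB(root_dir="/data/mpii_hand", split="train"),    # made-up paths
    YTB_DB(root_dir="/data/youtube_hands", split="train"),
])
loader = DataLoader(pooled, batch_size=1, shuffle=True)
print(len(pooled))  # total number of pooled samples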
Example 10
class F_DB(Dataset):
    """Class to load samples from the Freihand dataset.
    Inherits from the Dataset class in  torch.utils.data.
    Note: The keypoints are mapped to format used at AIT.
    Refer to joint_mapping.json in src/data_loader/utils.
    """

    def __init__(
        self, root_dir: str, split: str, seed: int = 5, train_ratio: float = 0.9
    ):
        """Initializes the freihand dataset class, relevant paths and the Joints
        class for remapping of freihand formatted joints to that of AIT.

        Args:
            root_dir (str): Path to the directory with image samples.
            split (str): set to 'train', 'val' or 'test'.
            seed (int, optional): Seed for the train/val split. Defaults to 5.
            train_ratio (float, optional): Fraction of data used for training. Defaults to 0.9.
        """
        self.root_dir = root_dir
        self.split = split
        self.seed = seed
        self.train_ratio = train_ratio
        self.labels = self.get_labels()
        self.scale = self.get_scale()
        self.camera_param = self.get_camera_param()
        self.img_names, self.img_path = self.get_image_names()
        self.indices = self.create_train_val_split()
        # To convert from freihand to AIT format.
        self.joints = Joints()

    def create_train_val_split(self) -> np.array:
        """Creates split for train and val data in freihand

        Raises:
            NotImplementedError: In case the split doesn't match test, train or val.

        Returns:
            np.array: array of indices
        """
        num_unique_images = len(self.camera_param)
        train_indices, val_indices = train_test_split(
            np.arange(num_unique_images),
            train_size=self.train_ratio,
            random_state=self.seed,
        )
        if self.split == "train":
            train_indices = np.sort(train_indices)
            train_indices = np.concatenate(
                (
                    train_indices,
                    train_indices + num_unique_images,
                    train_indices + num_unique_images * 2,
                    train_indices + num_unique_images * 3,
                ),
                axis=0,
            )
            return train_indices
        elif self.split == "val":
            val_indices = np.sort(val_indices)
            val_indices = np.concatenate(
                (
                    val_indices,
                    val_indices + num_unique_images,
                    val_indices + num_unique_images * 2,
                    val_indices + num_unique_images * 3,
                ),
                axis=0,
            )
            return val_indices
        elif self.split == "test":
            return np.arange(len(self.camera_param))
        else:
            raise NotImplementedError

    def get_image_names(self) -> Tuple[List[str], str]:
        """Gets the name of all the files in root_dir.
        Make sure there are only images in that directory as it reads all the file names.

        Returns:
            List[str]: List of image names.
            str: base path for image directory
        """
        if self.split in ["train", "val"]:
            img_path = os.path.join(self.root_dir, "training", "rgb")
        else:
            img_path = os.path.join(self.root_dir, "evaluation", "rgb")
        img_names = next(os.walk(img_path))[2]
        img_names.sort()
        return img_names, img_path

    def get_labels(self) -> list:
        """Extacts the labels(joints coordinates) from the label_json at labels_path
        Returns:
            list: List of all the the coordinates(32650).
        """
        if self.split in ["train", "val"]:
            labels_path = os.path.join(self.root_dir, "training_xyz.json")
            return read_json(labels_path)
        else:
            return None

    def get_scale(self) -> list:
        """Extacts the scale from freihand data."""
        if self.split in ["train", "val"]:
            labels_path = os.path.join(self.root_dir, "training_scale.json")
        else:
            labels_path = os.path.join(self.root_dir, "evaluation_scale.json")
        return read_json(labels_path)

    def get_camera_param(self) -> list:
        """Extacts the camera parameters from the camera_param_json at camera_param_path.
        Returns:
            list: List of camera paramters for all images(32650)
        """
        if self.split in ["train", "val"]:
            camera_param_path = os.path.join(self.root_dir, "training_K.json")
        else:
            camera_param_path = os.path.join(self.root_dir, "evaluation_K.json")
        return read_json(camera_param_path)

    def __len__(self):
        return len(self.indices)

    def create_sudo_bound_box(self, scale) -> Tensor:
        max_bound = torch.tensor([224.0, 224.0])
        min_bound = torch.tensor([0.0, 0.0])
        c = (max_bound + min_bound) / 2.0
        s = ((max_bound - min_bound) / 2.0) * scale
        bound_box = torch.tensor(
            [[0, 0, 0]]
            + [[s[0], s[1], 1]] * 5
            + [[-s[0], s[1], 1]] * 5
            + [[s[0], -s[1], 1]] * 5
            + [[-s[0], -s[1], 1]] * 5
        ) + torch.tensor([c[0], c[1], 0])
        return bound_box.float()

    def __getitem__(self, idx: int) -> dict:
        """Returns a sample corresponding to the index.

        Args:
            idx (int): index

        Returns:
            dict: item with following elements.
                "image" in opencv bgr format.
                "K": camera params
                "joints3D": 3D coordinates of joints in AIT format.
        """

        if torch.is_tensor(idx):
            idx = idx.tolist()
        idx_ = self.indices[idx]
        img_name = os.path.join(self.img_path, self.img_names[idx_])
        img = cv2.cvtColor(cv2.imread(img_name), cv2.COLOR_BGR2RGB)
        if self.labels is not None:
            camera_param = torch.tensor(self.camera_param[idx_ % 32560]).float()
            joints3D = self.joints.freihand_to_ait(
                torch.tensor(self.labels[idx_ % 32560]).float()
            )
        else:
            camera_param = torch.tensor(self.camera_param[idx_]).float()
            joints2d_orthogonal = self.create_sudo_bound_box(scale=BOUND_BOX_SCALE)
            joints3D = convert_2_5D_to_3D(
                joints2d_orthogonal, scale=1.0, K=camera_param.clone()
            )
        joints_valid = torch.ones_like(joints3D[..., -1:])
        sample = {
            "image": img,
            "K": camera_param,
            "joints3D": joints3D,
            "joints_valid": joints_valid,
        }
        return sample
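
FreiHAND ships four renderings of each of its 32,560 unique training frames (hence the % 32560 indexing above), so create_train_val_split splits the unique indices first and then replicates the result across the four blocks. A toy illustration of the same scheme with 10 stand-in images:

import numpy as np
from sklearn.model_selection import train_test_split

num_unique = 10  # stand-in for the 32,560 unique FreiHAND frames
train_idx, _ = train_test_split(np.arange(num_unique), train_size=0.9, random_state=5)
train_idx = np.sort(train_idx)
# Replicate the unique-image split over the four rendered copies, as in create_train_val_split.
train_idx = np.concatenate([train_idx + k * num_unique for k in range(4)])
print(train_idx)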