def run(cfg, model, video_data, num_frames, step_frames, fout):
    frames, labels, video_idx, meta = video_data
    # Truncate to a multiple of cfg.SLOWFAST.ALPHA frames.
    length = (frames.shape[1] // cfg.SLOWFAST.ALPHA) * cfg.SLOWFAST.ALPHA
    features = []
    # Slide a window of num_frames frames over the video with a stride of step_frames.
    for k in range(0, length, step_frames):
        start = k
        end = min(k + num_frames, length)
        inputs = frames[:, start:end]
        # Split the clip into slow and fast pathway tensors.
        slow, fast = utils.pack_pathway_output(cfg, inputs)
        slow = slow.unsqueeze(0).contiguous()
        fast = fast.unsqueeze(0).contiguous()
        if torch.cuda.is_available():
            slow = slow.cuda(non_blocking=True)
            fast = fast.cuda(non_blocking=True)
        feat = model([slow, fast], ftype="video")
        features.append(feat.detach().cpu())

    features = torch.cat(features, dim=0).numpy()
    feat_name = os.path.join(cfg.OUTPUT_DIR, meta["video_name"] + ".feat.npy")
    np.save(feat_name, features)

    meta["feature_shape"] = features.shape
    meta["feature_frame"] = length
    meta["video_feature"] = feat_name
    meta["step_frames"] = step_frames
    json_str = json.dumps(meta)
    fout.write(json_str + "\n")
    fout.flush()
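Every example on this page feeds data through `pack_pathway_output`. For context, here is a minimal sketch of what the SlowFast helper roughly does, assuming a two-pathway model with a `cfg.SLOWFAST.ALPHA` frame-rate ratio and a `cfg.MODEL.SINGLE_PATHWAY_ARCH` list for single-pathway fallbacks (an illustration, not the repo's exact code):

import torch

def pack_pathway_output_sketch(cfg, frames):
    """Sketch: split a C x T x H x W clip into per-pathway tensors."""
    if cfg.MODEL.ARCH in cfg.MODEL.SINGLE_PATHWAY_ARCH:
        # Single-pathway models just get the clip wrapped in a list.
        return [frames]
    # Fast pathway keeps every frame.
    fast_pathway = frames
    # Slow pathway keeps every ALPHA-th frame along the temporal axis.
    slow_pathway = torch.index_select(
        frames,
        1,
        torch.linspace(
            0, frames.shape[1] - 1, frames.shape[1] // cfg.SLOWFAST.ALPHA
        ).long(),
    )
    return [slow_pathway, fast_pathway]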
Example #2
def _get_model_analysis_input(cfg, is_train):
    """
    Return a dummy input for model analysis with batch size 1. The input is
        used for analyzing the model (counting flops and activations etc.).
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        is_train (bool): if True, return the input for training. Otherwise,
            return the input for testing.

    Returns:
        inputs: the input for model analysis.
    """
    rgb_dimension = 3
    if is_train:
        input_tensors = torch.rand(
            rgb_dimension,
            cfg.DATA.NUM_FRAMES,
            cfg.DATA.TRAIN_CROP_SIZE,
            cfg.DATA.TRAIN_CROP_SIZE,
        )
    else:
        input_tensors = torch.rand(
            rgb_dimension,
            cfg.DATA.NUM_FRAMES,
            cfg.DATA.TEST_CROP_SIZE,
            cfg.DATA.TEST_CROP_SIZE,
        )
    model_inputs = pack_pathway_output(cfg, input_tensors)
    for i in range(len(model_inputs)):
        model_inputs[i] = model_inputs[i].unsqueeze(0).cuda(non_blocking=True)

    inputs = (model_inputs, )
    return inputs
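A hedged usage sketch of the function above, assuming fvcore's flop counter and a model already placed on the GPU (the dummy input is moved to CUDA unconditionally); `estimate_gflops` is an illustrative name, not part of the repo:

from fvcore.nn import flop_count

def estimate_gflops(model, cfg, is_train=False):
    # flop_count returns (per-operator GFLOP dict, counter of skipped ops).
    inputs = _get_model_analysis_input(cfg, is_train)
    gflop_dict, _ = flop_count(model, inputs)
    return sum(gflop_dict.values())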
Example #3
def _get_model_analysis_input(cfg):
    """
    Return a dummy input for model analysis with batch size 1. The input is
        used for analyzing the model (counting flops and activations etc.).
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py

    Returns:
        inputs: the input for model analysis.
    """
    spectrogram_dimension = 1
    input_tensors = torch.rand(
        spectrogram_dimension,
        cfg.AUDIO_DATA.NUM_FRAMES,
        cfg.AUDIO_DATA.NUM_FREQUENCIES,
    )
    model_inputs = pack_pathway_output(cfg, input_tensors)
    for i in range(len(model_inputs)):
        model_inputs[i] = model_inputs[i].unsqueeze(0)
        if cfg.NUM_GPUS:
            model_inputs[i] = model_inputs[i].cuda(non_blocking=True)

    inputs = (model_inputs,)
    return inputs
Example #4
def _get_model_analysis_input(cfg, use_train_input):
    """
    Return a dummy input for model analysis with batch size 1. The input is
        used for analyzing the model (counting flops and activations etc.).
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        use_train_input (bool): if True, return the input for training. Otherwise,
            return the input for testing.

    Returns:
        inputs: the input for model analysis.
    """
    rgb_dimension = 3
    if use_train_input:
        if cfg.TRAIN.DATASET in ["imagenet", "imagenetprefetch"]:
            input_tensors = torch.rand(
                rgb_dimension,
                cfg.DATA.TRAIN_CROP_SIZE,
                cfg.DATA.TRAIN_CROP_SIZE,
            )
        else:
            input_tensors = torch.rand(
                rgb_dimension,
                cfg.DATA.NUM_FRAMES,
                cfg.DATA.TRAIN_CROP_SIZE,
                cfg.DATA.TRAIN_CROP_SIZE,
            )
    else:
        if cfg.TEST.DATASET in ["imagenet", "imagenetprefetch"]:
            input_tensors = torch.rand(
                rgb_dimension,
                cfg.DATA.TEST_CROP_SIZE,
                cfg.DATA.TEST_CROP_SIZE,
            )
        else:
            input_tensors = torch.rand(
                rgb_dimension,
                cfg.DATA.NUM_FRAMES,
                cfg.DATA.TEST_CROP_SIZE,
                cfg.DATA.TEST_CROP_SIZE,
            )
    model_inputs = pack_pathway_output(cfg, input_tensors)
    for i in range(len(model_inputs)):
        model_inputs[i] = model_inputs[i].unsqueeze(0)
        if cfg.NUM_GPUS:
            model_inputs[i] = model_inputs[i].cuda(non_blocking=True)

    # If detection is enabled, count flops for one proposal.
    if cfg.DETECTION.ENABLE:
        bbox = torch.tensor([[0, 0, 1.0, 0, 1.0]])
        if cfg.NUM_GPUS:
            bbox = bbox.cuda()
        inputs = (model_inputs, bbox)
    else:
        inputs = (model_inputs, )
    return inputs
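The same dummy input can drive activation counting as well; a minimal sketch, again assuming fvcore and a model on the matching device (`model_stats_sketch` is illustrative):

from fvcore.nn import activation_count, flop_count

def model_stats_sketch(model, cfg, use_train_input=False):
    inputs = _get_model_analysis_input(cfg, use_train_input)
    # Both helpers return (per-operator dict in giga-units, counter of skipped ops).
    gflops, _ = flop_count(model, inputs)
    gactivations, _ = activation_count(model, inputs)
    return sum(gflops.values()), sum(gactivations.values())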
Example #5
    def __getitem__(self, index):
        # Decode video. Meta info is used to perform selective decoding.
        frames = self.sample_frames(index)

        # Perform color normalization.
        frames = utils.tensor_normalize(frames, self.cfg.DATA.MEAN,
                                        self.cfg.DATA.STD)
        # T H W C -> C T H W.
        frames = frames.permute(3, 0, 1, 2)
        frames = utils.pack_pathway_output(self.cfg, frames)

        return frames, index
Example #6
    def prepare_action_inference_input(self, imgs, pred_person_boxes):
        """
        Preprocess the inputs to feed them to our action prediction model.
        The preprocessing of the data is analogous to preprocessing test data in tools/test_net.py.
        Before returning, we reformat the variables so that we can run inference directly with our activity_prediction_model.
        :param imgs: (list of ndarrays with shape (H, W, C), in BGR order and in range [0, 255])
                            the images to preprocess
        :param pred_person_boxes: (ndarray(float32) of shape (num_boxes, 4=x1, y1, x2, y2)) the predicted person boxes
        :return:
            imgs: (list of tensors with shape (1=number_of_batches, C, num_frames, H, W)) the images used for inference
                        Important: they are usually converted to RGB, since Kinetics pre-training uses RGB
            pred_person_boxes: (tensor, shape (num_boxes, 5=batch_idx, x1, y1, x2, y2)) the boxes for the current clip - not normalized.
        """

        if self.cfg.ACTIONRECOGNIZER.IMG_PROC_BACKEND == "pytorch":
            # Transform images to required format for pytorch backend
            if all(img is not None for img in imgs):
                imgs = torch.as_tensor(np.stack(imgs))

            # T H W C -> T C H W.
            imgs = imgs.permute(0, 3, 1, 2)
            # Preprocess images and pred_person_boxes.
            imgs, pred_person_boxes = self.images_and_boxes_preprocessing(
                imgs, boxes=pred_person_boxes)
            # T C H W -> C T H W.
            imgs = imgs.permute(1, 0, 2, 3)

        else:
            # Preprocess images and pred_person_boxes
            imgs, pred_person_boxes = self.images_and_boxes_preprocessing_cv2(
                imgs, boxes=pred_person_boxes)

        # Change to list. If we have a model with multi input arch, a second pathway is created on the basis of imgs
        # Tensor with shape (C, num_frames, H, W) -> List(s) of tensor with same shape
        imgs = utils.pack_pathway_output(self.cfg, imgs)

        # Reformat the tensors included in the list
        # tensor  shape (C, num_frames, H, W) -> shape (1=number_of_batches, C, num_frames, H, W)
        if isinstance(imgs, (list, )):
            for i in range(len(imgs)):
                imgs[i] = torch.unsqueeze(imgs[i], 0)

        # ndarray shape (num_boxes, 4=x1, y1, x2, y2) -> tensor shape (num_boxes, 4=x1, y1, x2, y2)
        pred_person_boxes = torch.from_numpy(pred_person_boxes)
        # For each box, we add the batch_id (in our case always 0)
        # tensor shape (num_boxes, 4=x1, y1, x2, y2) -> tensor shape (num_boxes, 5=batch_id, x1, y1, x2, y2)
        pred_person_boxes = torch.cat(
            [torch.full((pred_person_boxes.shape[0], 1), 0.0), pred_person_boxes],
            dim=1,
        )

        return imgs, pred_person_boxes
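A hedged sketch of calling the method above, assuming the recognizer exposes the `activity_prediction_model` mentioned in the docstring and that it follows the usual SlowFast detection signature `model(inputs, bboxes)`; `predict_actions_sketch` is an illustrative name:

import torch

@torch.no_grad()
def predict_actions_sketch(recognizer, imgs, person_boxes):
    # Pathway list of (1, C, T, H, W) tensors plus (N, 5) boxes with batch ids.
    inputs, boxes = recognizer.prepare_action_inference_input(imgs, person_boxes)
    if torch.cuda.is_available():
        inputs = [inp.cuda(non_blocking=True) for inp in inputs]
        boxes = boxes.cuda(non_blocking=True)
    return recognizer.activity_prediction_model(inputs, boxes)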
Example #7
def get_flop_stats(model, cfg, is_train):
    """
    Compute the gflops for the current model given the config.
    Args:
        model (model): model to compute the flop counts.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        is_train (bool): if True, compute flops for training. Otherwise,
            compute flops for testing.

    Returns:
        float: the total number of gflops of the given model.
    """
    rgb_dimension = 3
    if is_train:
        input_tensors = torch.rand(
            rgb_dimension,
            cfg.DATA.NUM_FRAMES,
            cfg.DATA.TRAIN_CROP_SIZE,
            cfg.DATA.TRAIN_CROP_SIZE,
        )
    else:
        input_tensors = torch.rand(
            rgb_dimension,
            cfg.DATA.NUM_FRAMES,
            cfg.DATA.TEST_CROP_SIZE,
            cfg.DATA.TEST_CROP_SIZE,
        )

    flop_inputs = pack_pathway_output(cfg, input_tensors)
    for i in range(len(flop_inputs)):
        flop_inputs[i] = flop_inputs[i].unsqueeze(0).cuda(non_blocking=True)

    # If detection is enabled, count flops for one proposal.
    if not cfg.MODEL.LSTM:
        if cfg.DETECTION.ENABLE:
            bbox = torch.tensor([[0, 0, 1.0, 0, 1.0]])
            bbox = bbox.cuda()
            inputs = (flop_inputs, bbox)
        else:
            inputs = (flop_inputs, )
    else:
        label_history = torch.zeros(
            [1, 10, cfg.MODEL.NUM_CLASSES[0] + cfg.MODEL.NUM_CLASSES[1]])
        label_history = label_history.cuda()
        inputs = ([flop_inputs, label_history], )

    gflop_dict, _ = flop_count(model, inputs)
    gflops = sum(gflop_dict.values())
    return gflops
Example #8
def get_flop_stats(model, cfg, is_train):
    """
    Compute the gflops for the current model given the config.
    Args:
        model (model): model to compute the flop counts.
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        is_train (bool): if True, compute flops for training. Otherwise,
            compute flops for testing.

    Returns:
        float: the total number of gflops of the given model.
    """
    rgb_dimension = 3
    if is_train:
        input_tensors = torch.rand(
            rgb_dimension,
            cfg.DATA.NUM_FRAMES,
            cfg.DATA.TRAIN_CROP_SIZE,
            cfg.DATA.TRAIN_CROP_SIZE,
        )
    else:
        input_tensors = torch.rand(
            rgb_dimension,
            cfg.DATA.NUM_FRAMES,
            cfg.DATA.TEST_CROP_SIZE,
            cfg.DATA.TEST_CROP_SIZE,
        )
    whitelist_ops = [
        "aten::addmm",
        "aten::_convolution",
        "aten::einsum",
        "aten::matmul",
    ]
    flop_inputs = pack_pathway_output(cfg, input_tensors)
    for i in range(len(flop_inputs)):
        flop_inputs[i] = flop_inputs[i].unsqueeze(0).cuda(non_blocking=True)

    # If detection is enabled, count flops for one proposal.
    if cfg.DETECTION.ENABLE:
        bbox = torch.tensor([[0, 0, 1.0, 0, 1.0]])
        bbox = bbox.cuda()
        inputs = (flop_inputs, bbox)
    else:
        inputs = (flop_inputs,)

    gflop_dict = flop_count(model, inputs, whitelist_ops)
    gflops = sum(gflop_dict.values())
    return gflops
Example #9
def _get_model_analysis_input(cfg, is_train):
    """
    Return a dummy input for model analysis with batch size 1. The input is
        used for analyzing the model (counting flops and activations etc.).
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
        is_train (bool): if True, return the input for training. Otherwise,
            return the input for testing.

    Returns:
        inputs: the input for model analysis.
    """
    rgb_dimension = 3
    if is_train:
        input_tensors = torch.rand(
            rgb_dimension,
            cfg.DATA.NUM_FRAMES,
            cfg.DATA.TRAIN_CROP_SIZE,
            cfg.DATA.TRAIN_CROP_SIZE,
        )
    else:
        input_tensors = torch.rand(
            rgb_dimension,
            cfg.DATA.NUM_FRAMES,
            cfg.DATA.TEST_CROP_SIZE,
            cfg.DATA.TEST_CROP_SIZE,
        )
    input_audio = None
    if cfg.DATA.USE_AUDIO:
        chn = 2 if cfg.DATA.GET_MISALIGNED_AUDIO else 1
        input_audio = torch.rand(
            chn,
            1,
            cfg.DATA.AUDIO_FRAME_NUM,
            cfg.DATA.AUDIO_MEL_NUM,
        )
    model_inputs = pack_pathway_output(cfg, input_tensors, input_audio)
    for i in range(len(model_inputs)):
        model_inputs[i] = model_inputs[i].unsqueeze(0).cuda(non_blocking=True)

    # If detection is enabled, count flops for one proposal.
    if cfg.DETECTION.ENABLE:
        bbox = torch.tensor([[0, 0, 1.0, 0, 1.0]])
        bbox = bbox.cuda()
        inputs = (model_inputs, bbox)
    else:
        inputs = (model_inputs, )
    return inputs
Example #10
def run(cfg, model, video_data, num_frames, step_frames, fout, batch_size):
    frames, labels, video_idx, meta = video_data
    length = frames.shape[1]
    features = []
    classifiers = []
    batch = []
    for k in range(0, length, step_frames):
        start = k
        end = min(k + num_frames, length)
        if len(batch) == batch_size or ((end - start < num_frames)
                                        and len(batch) > 0):
            # forward
            # batch_size x 3 x num_slow_frames x 224 x 224
            b0 = torch.as_tensor(np.stack([b[0] for b in batch])).contiguous()
            # batch_size x 3 x num_fast_frames x 224 x 224
            b1 = torch.as_tensor(np.stack([b[1] for b in batch])).contiguous()
            if torch.cuda.is_available():
                b0 = b0.cuda(non_blocking=True)
                b1 = b1.cuda(non_blocking=True)
            batch = [b0, b1]
            feat, cls = model(batch)
            features.append(feat.detach().cpu())
            classifiers.append(cls.detach().cpu())
            batch = []
        if end - start < num_frames:
            break
        inputs = frames[:, start:end]
        inputs = utils.pack_pathway_output(cfg, inputs)
        batch.append(inputs)

    # length of features: ceil((length - num_frames + 1)/step_frames)
    features = torch.cat(features, dim=0).numpy()
    classifiers = torch.cat(classifiers, dim=0).numpy()
    feat_name = os.path.join(cfg.OUTPUT_DIR, meta["video_name"] + ".feat.npy")
    np.save(feat_name, features)
    cls_name = os.path.join(cfg.OUTPUT_DIR, meta["video_name"] + ".cls.npy")
    np.save(cls_name, classifiers)

    meta["feature_shape"] = features.shape
    meta["cls_shape"] = classifiers.shape
    meta["feature_frame"] = (len(features) - 1) * step_frames + num_frames
    meta["video_feature"] = feat_name
    meta["video_classifier"] = cls_name
    meta["step_frames"] = step_frames
    json_str = json.dumps(meta)
    fout.write(json_str + "\n")
    fout.flush()
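A quick sanity check of the `ceil((length - num_frames + 1)/step_frames)` comment with hypothetical numbers: a 100-frame video with num_frames=32 and step_frames=16 yields full clips at k = 0, 16, 32, 48, 64 and stops at k = 80, i.e. five feature rows, matching the formula.

import math

length, num_frames, step_frames = 100, 32, 16  # hypothetical values
full_clips = sum(
    1
    for k in range(0, length, step_frames)
    if min(k + num_frames, length) - k == num_frames
)
assert full_clips == math.ceil((length - num_frames + 1) / step_frames) == 5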
Example #11
    def __getitem__(self, index):
        """
        Given the video index, return the list of frames, label, and video
        index if the video can be fetched and decoded successfully; otherwise,
        repeatedly find a random video that can be decoded as a replacement.
        Args:
            index (int): the video index provided by the pytorch sampler.
        Returns:
            frames (tensor): the frames sampled from the video. The dimension
                is `channel` x `num frames` x `height` x `width`.
            label (int): the label of the current video.
            index (int): if the video provided by the pytorch sampler can be
                decoded, then return the index of the video. If not, return the
                index of the video replacement that can be decoded.
        """

        frame_seg = torch.zeros(
            (
                3,
                self.out_size,
                self.cfg.DATA.TEST_CROP_SIZE,
                self.cfg.DATA.TEST_CROP_SIZE,
            )
        ).float()

        start = int(index - self.step_size * self.out_size / 2)
        end = int(index + self.step_size * self.out_size / 2)
        max_ind = self.__len__() - 1

        for out_ind, ind in enumerate(range(start, end, self.step_size)):
            if ind < 0 or ind > max_ind:
                continue
            else:
                if self.read_vid_file:
                    frame_seg[:, out_ind, :, :] = self.frames[:, ind, :, :]
                else:
                    frame_seg[:, out_ind, :, :] = self._read_img_file(
                        os.path.join(self.vid_path, self.vid_id), self.frames[ind]
                    )

        # create the pathways
        frame_list = pack_pathway_output(self.cfg, frame_seg)

        return frame_list
Example #12
def process_cv2_inputs(frames, cfg):
    """
    Normalize and prepare inputs as a list of tensors. Each tensor
    corresponds to a unique pathway.
    Args:
        frames (list of array): list of input images (corresponding to one clip) in range [0, 255].
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    inputs = torch.from_numpy(np.array(frames)).float() / 255
    inputs = tensor_normalize(inputs, cfg.DATA.MEAN, cfg.DATA.STD)
    # T H W C -> C T H W.
    inputs = inputs.permute(3, 0, 1, 2)
    # Sample frames for num_frames specified.
    index = torch.linspace(0, inputs.shape[1] - 1, cfg.DATA.NUM_FRAMES).long()
    inputs = torch.index_select(inputs, 1, index)
    inputs = pack_pathway_output(cfg, inputs)
    inputs = [inp.unsqueeze(0) for inp in inputs]
    return inputs
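A hedged end-to-end sketch of feeding OpenCV frames through `process_cv2_inputs`, assuming the frames are already at the model's input resolution and that the model expects RGB (the helper itself does not convert from OpenCV's BGR order); `classify_clip_sketch` and the capture source are illustrative:

import cv2
import torch

@torch.no_grad()
def classify_clip_sketch(model, cfg, camera_index=0):
    cap = cv2.VideoCapture(camera_index)
    frames = []
    while len(frames) < cfg.DATA.NUM_FRAMES:
        ok, frame = cap.read()
        if not ok:
            break
        # The helper expects RGB frames in [0, 255]; OpenCV delivers BGR.
        frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    cap.release()
    inputs = process_cv2_inputs(frames, cfg)
    if torch.cuda.is_available():
        inputs = [inp.cuda(non_blocking=True) for inp in inputs]
    return model(inputs)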
Example #13
    def __getitem__(self, idx):
        """
        Generate corresponding clips, boxes, labels and metadata for given idx.

        Args:
            idx (int): the video index provided by the pytorch sampler.
        Returns:
            frames (tensor): the frames sampled from the video. The dimension
                is `channel` x `num frames` x `height` x `width`.
            label (ndarray): the labels for the corresponding boxes in the current video.
            idx (int): the video index provided by the pytorch sampler.
            extra_data (dict): a dict containing extra data fields, like "boxes",
                "ori_boxes" and "metadata".
        """
        video_idx, sec_idx, sec, center_idx = self._keyframe_indices[idx]
        # Get the frame idxs for current clip.
        seq = utils.get_sequence(
            center_idx,
            self._seq_len // 2,
            self._sample_rate,
            num_frames=len(self._image_paths[video_idx]),
        )

        clip_label_list = self._keyframe_boxes_and_labels[video_idx][sec_idx]
        assert len(clip_label_list) > 0

        # Get boxes and labels for current clip.
        boxes = []
        labels = []
        for box_labels in clip_label_list:
            boxes.append(box_labels[0])
            labels.append(box_labels[1])
        boxes = np.array(boxes)
        # Score is not used.
        boxes = boxes[:, :4].copy()
        ori_boxes = boxes.copy()

        # Load images of current clip.
        image_paths = [self._image_paths[video_idx][frame] for frame in seq]
        imgs = utils.retry_load_images(image_paths,
                                       backend=self.cfg.AVA.IMG_PROC_BACKEND)
        if self.cfg.AVA.IMG_PROC_BACKEND == "pytorch":
            # T H W C -> T C H W.
            imgs = imgs.permute(0, 3, 1, 2)
            # Preprocess images and boxes.
            imgs, boxes = self._images_and_boxes_preprocessing(imgs,
                                                               boxes=boxes)
            # T C H W -> C T H W.
            imgs = imgs.permute(1, 0, 2, 3)
        else:
            # Preprocess images and boxes
            imgs, boxes = self._images_and_boxes_preprocessing_cv2(imgs,
                                                                   boxes=boxes)

        # Construct label arrays.
        label_arrs = np.zeros((len(labels), self._num_classes), dtype=np.int32)
        for i, box_labels in enumerate(labels):
            # AVA label index starts from 1.
            for label in box_labels:
                if label == -1:
                    continue
                assert label >= 1 and label <= 80
                label_arrs[i][label - 1] = 1

        imgs = utils.pack_pathway_output(self.cfg, imgs)
        metadata = [[video_idx, sec]] * len(boxes)

        extra_data = {
            "boxes": boxes,
            "ori_boxes": ori_boxes,
            "metadata": metadata,
        }

        return imgs, label_arrs, idx, extra_data
Example #14
def run(loader, model, cfg):
    model.eval()
    num_frames = cfg.DATA.NUM_FRAMES
    step_frames = int(num_frames / 2)
    fout = open(cfg.TEST.OUTPUT_FEATURE_FILE, "w")
    batch_size = cfg.TEST.BATCH_SIZE
    start_time = time.time()
    for v_ind, (frames, labels, video_idx, meta) in enumerate(loader):
        print("load frames time:", time.time() - start_time)
        # Transfer the data to the current GPU device.
        if v_ind % 10 == 0:
            print("process video index:", v_ind, "total:", len(loader))
        length = frames.shape[1]
        features = []
        classifiers = []
        batch = []
        for k in range(0, length, step_frames):
            start = k
            end = min(k + num_frames, length)
            if len(batch) == batch_size or ((end - start < num_frames) and len(batch) > 0):
                # forward
                # batch_size x 3 x num_slow_frames x 224 x 224
                b0 = torch.as_tensor(np.stack([b[0] for b in batch])).contiguous()
                # batch_size x 3 x num_fast_frames x 224 x 224
                b1 = torch.as_tensor(np.stack([b[1] for b in batch])).contiguous()
                if torch.cuda.is_available():
                    b0 = b0.cuda(non_blocking=True)
                    b1 = b1.cuda(non_blocking=True)
                batch = [b0, b1]
                feat, cls = model(batch)
                features.append(feat.detach().cpu())
                classifiers.append(cls.detach().cpu())
                batch = []
            if end - start < num_frames:
                break
            inputs = frames[:, start:end]
            inputs = utils.pack_pathway_output(cfg, inputs)
            batch.append(inputs)

        # length of features: ceil((length - num_frames + 1)/step_frames)
        features = torch.cat(features, dim=0).numpy()
        classifiers = torch.cat(classifiers, dim=0).numpy()
        feat_name = os.path.join(cfg.OUTPUT_DIR, meta["video_name"] + ".feat.npy")
        np.save(feat_name, features)
        cls_name = os.path.join(cfg.OUTPUT_DIR, meta["video_name"] + ".cls.npy")
        np.save(cls_name, classifiers)

        meta["feature_shape"] = features.shape
        meta["cls_shape"] = classifiers.shape
        meta["feature_frame"] = (len(features)-1) * step_frames + num_frames
        meta["video_feature"] = feat_name
        meta["video_classifier"] = cls_name
        meta["step_frames"] = step_frames
        json_str = json.dumps(meta)
        fout.write(json_str + "\n")
        fout.flush()
        period = time.time() - start_time
        print("video index: %d, length: %d, period: %.2f sec, speed: %.2f sec/f."
              %(v_ind, length, period, period/length))
        start_time = time.time()
    fout.close()
Example #15
    def __getitem__(self, index):
        """
        Given the video index, return the list of frames, label, and video
        index if the video can be fetched and decoded successfully; otherwise,
        repeatedly find a random video that can be decoded as a replacement.
        Args:
            index (int): the video index provided by the pytorch sampler.
        Returns:
            frames (tensor): the frames sampled from the video. The dimension
                is `channel` x `num frames` x `height` x `width`.
            label (int): the label of the current video.
            index (int): if the video provided by pytorch sampler can be
                decoded, then return the index of the video. If not, return the
                index of the video replacement that can be decoded.
        """
        if self.mode in ["train", "val"]:
            # -1 indicates random sampling.
            temporal_sample_index = -1
            spatial_sample_index = -1
            min_scale = self.cfg.DATA.TRAIN_JITTER_SCALES[0]
            max_scale = self.cfg.DATA.TRAIN_JITTER_SCALES[1]
            crop_size = self.cfg.DATA.TRAIN_CROP_SIZE
        elif self.mode in ["test"]:
            temporal_sample_index = (self._spatial_temporal_idx[index] //
                                     self.cfg.TEST.NUM_SPATIAL_CROPS)
            # spatial_sample_index is in [0, 1, 2]. Corresponding to left,
            # center, or right if width is larger than height, and top, middle,
            # or bottom if height is larger than width.
            spatial_sample_index = (self._spatial_temporal_idx[index] %
                                    self.cfg.TEST.NUM_SPATIAL_CROPS)
            min_scale, max_scale, crop_size = [self.cfg.DATA.TEST_CROP_SIZE] * 3
            # The testing is deterministic and no jitter should be performed.
            # min_scale, max_scale, and crop_size are expected to be the same.
            assert len({min_scale, max_scale, crop_size}) == 1
        else:
            raise NotImplementedError("Does not support {} mode".format(
                self.mode))

        # Try to decode and sample a clip from a video. If the video cannot be
        # decoded, repeatedly find a random video replacement that can be decoded.
        for _ in range(self._num_retries):
            video_container = None
            try:
                video_container = container.get_video_container(
                    self._path_to_videos[index],
                    self.cfg.DATA_LOADER.ENABLE_MULTI_THREAD_DECODE,
                )
            except Exception as e:
                logger.info(
                    "Failed to load video from {} with error {}".format(
                        self._path_to_videos[index], e))
            # Select a random video if the current video could not be accessed.
            if video_container is None:
                index = random.randint(0, len(self._path_to_videos) - 1)
                continue

            # Decode video. Meta info is used to perform selective decoding.
            frames = decoder.decode(
                video_container,
                self.cfg.DATA.SAMPLING_RATE,
                self.cfg.DATA.NUM_FRAMES,
                temporal_sample_index,
                self.cfg.TEST.NUM_ENSEMBLE_VIEWS,
                video_meta=self._video_meta[index],
                target_fps=30,
            )

            # If decoding failed (wrong format, video is too short, etc.),
            # select another video.
            if frames is None:
                index = random.randint(0, len(self._path_to_videos) - 1)
                continue

            # Perform color normalization.
            frames = frames.float()
            frames = frames / 255.0
            frames = frames - torch.tensor(self.cfg.DATA.MEAN)
            frames = frames / torch.tensor(self.cfg.DATA.STD)
            # T H W C -> C T H W.
            frames = frames.permute(3, 0, 1, 2)
            # Perform data augmentation.
            frames = self.spatial_sampling(
                frames,
                spatial_idx=spatial_sample_index,
                min_scale=min_scale,
                max_scale=max_scale,
                crop_size=crop_size,
            )

            label = self._labels[index]
            frames = utils.pack_pathway_output(self.cfg, frames)
            return frames, label, index, {}
        else:
            raise RuntimeError(
                "Failed to fetch video after {} retries.".format(
                    self._num_retries))
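One practical note on the `__getitem__` methods above: because each sample returns a list of pathway tensors, PyTorch's default collate transposes that list and stacks each pathway separately across the batch. A minimal iteration sketch, with the dataset and model as placeholders:

import torch
from torch.utils.data import DataLoader

def iterate_sketch(dataset, model, batch_size=8):
    loader = DataLoader(dataset, batch_size=batch_size, num_workers=4)
    for frames, labels, indices, _ in loader:
        # frames[i] is the batched tensor for pathway i, e.g. for SlowFast:
        #   frames[0]: (B, 3, T // ALPHA, H, W)   slow pathway
        #   frames[1]: (B, 3, T, H, W)            fast pathway
        frames = [pathway.cuda(non_blocking=True) for pathway in frames]
        yield model(frames), labels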