Example #1
def get_person_boxes(cfg, object_predictor, mid_frame, frame_provider):
    outputs = object_predictor(mid_frame)
    fields = outputs["instances"]._fields
    pred_classes = fields["pred_classes"]
    selection_mask = pred_classes == 0
    # acquire person boxes
    pred_classes = pred_classes[selection_mask]
    pred_boxes = fields["pred_boxes"].tensor[selection_mask]
    scores = fields["scores"][selection_mask]
    boxes = cv2_transform.scale_boxes(cfg.DATA.TEST_CROP_SIZE, pred_boxes,
                                      frame_provider.display_height,
                                      frame_provider.display_width)
    boxes = torch.cat(
        [torch.full((boxes.shape[0], 1), float(0)).cuda(), boxes], axis=1)

    return boxes, scores
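
A self-contained sketch of the person-filtering step used above, on toy tensors instead of a real detectron2 output (class id 0 is "person" in detectron2's COCO convention); the values are illustrative only:

import torch

# Toy stand-ins for fields["pred_classes"], fields["pred_boxes"].tensor and fields["scores"].
pred_classes = torch.tensor([0, 56, 0])                  # person, non-person, person
pred_boxes = torch.tensor([[10., 20., 50., 80.],
                           [ 0.,  0., 30., 30.],
                           [60., 10., 90., 95.]])
scores = torch.tensor([0.9, 0.8, 0.7])

selection_mask = pred_classes == 0                       # keep only person detections
person_boxes = pred_boxes[selection_mask]                # shape (2, 4)
person_scores = scores[selection_mask]                   # shape (2,)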
Example #2
    def obj_detect(self, mid_frame):

        outputs = self.object_predictor(mid_frame)

        fields = outputs["instances"]._fields
        pred_classes = fields["pred_classes"]
        selection_mask = pred_classes == 0
        # acquire person boxes
        # pred_classes = pred_classes[selection_mask]
        pred_boxes = fields["pred_boxes"].tensor[selection_mask]
        # scores = fields["scores"][selection_mask]
        boxes = cv2_transform.scale_boxes(self.cfg.DATA.TEST_CROP_SIZE,
                                          pred_boxes,
                                          self.frame_provider.display_height,
                                          self.frame_provider.display_width)
        boxes = torch.cat(
            [torch.full((boxes.shape[0], 1), float(0)).cuda(), boxes], axis=1)
        # return boxes
        self.queue_demo.put(boxes)
Example #3
def detector(object_predictor, image, backbone, cfg, display_height, display_width):
    if backbone == 'yolo':
        boxes = object_predictor.detect_image(image)
        boxes = torch.as_tensor(boxes).float().cuda()
        return boxes
    else:
        outputs = object_predictor(image)
        fields = outputs["instances"]._fields
        pred_classes = fields["pred_classes"]
        selection_mask = pred_classes == 0
        # acquire person boxes
        pred_classes = pred_classes[selection_mask]
        pred_boxes = fields["pred_boxes"].tensor[selection_mask]
        scores = fields["scores"][selection_mask]
        boxes = cv2_transform.scale_boxes(cfg.DATA.TEST_CROP_SIZE,
                                            pred_boxes,
                                            display_height,
                                            display_width)
        return boxes
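
Note that, unlike Examples #1 and #2, this variant returns the scaled boxes without the leading batch-index column that the other examples prepend before calling the SlowFast model. A minimal sketch of adding that column afterwards (CPU tensors with illustrative values; move them to CUDA as in the examples above if the model runs on GPU):

import torch

boxes = torch.tensor([[10., 20., 50., 80.],
                      [60., 10., 90., 95.]])                 # (num_boxes, 4): x1, y1, x2, y2
batch_index = torch.full((boxes.shape[0], 1), 0.0)           # every box belongs to clip 0
boxes_with_index = torch.cat([batch_index, boxes], dim=1)    # (num_boxes, 5)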
Example #4
    def _images_and_boxes_preprocessing_cv2(self, imgs, boxes):
        """
        This function performs preprocessing for the input images and
        corresponding boxes for one clip with opencv as backend.

        Args:
            imgs (tensor): the images.
            boxes (ndarray): the boxes for the current clip.

        Returns:
            imgs (tensor): list of preprocessed images.
            boxes (ndarray): preprocessed boxes.
        """

        height, width, _ = imgs[0].shape

        boxes[:, [0, 2]] *= width
        boxes[:, [1, 3]] *= height
        boxes = cv2_transform.clip_boxes_to_image(boxes, height, width)

        # `transform.py` expects a list of np.array. However, for AVA, we only
        # have one np.array.
        boxes = [boxes]

        # The image now is in HWC, BGR format.
        if self._split == "train":  # "train"
            imgs, boxes = cv2_transform.random_short_side_scale_jitter_list(
                imgs,
                min_size=self._jitter_min_scale,
                max_size=self._jitter_max_scale,
                boxes=boxes,
            )
            imgs, boxes = cv2_transform.random_crop_list(imgs,
                                                         self._crop_size,
                                                         order="HWC",
                                                         boxes=boxes)

            # random flip
            imgs, boxes = cv2_transform.horizontal_flip_list(0.5,
                                                             imgs,
                                                             order="HWC",
                                                             boxes=boxes)
        elif self._split == "val":
            # Short side to test_scale. Non-local and STRG use 256.
            imgs = [cv2_transform.scale(self._crop_size, img) for img in imgs]
            boxes = [
                cv2_transform.scale_boxes(self._crop_size, boxes[0], height,
                                          width)
            ]
            imgs, boxes = cv2_transform.spatial_shift_crop_list(
                self._crop_size, imgs, 1, boxes=boxes)

            if self._test_force_flip:
                imgs, boxes = cv2_transform.horizontal_flip_list(1,
                                                                 imgs,
                                                                 order="HWC",
                                                                 boxes=boxes)

        elif self._split == "test":
            # Short side to test_scale. Non-local and STRG use 256.
            imgs = [cv2_transform.scale(self._crop_size, img) for img in imgs]
            boxes = [
                cv2_transform.scale_boxes(self._crop_size, boxes[0], height,
                                          width)
            ]

            if self._test_force_flip:
                imgs, boxes = cv2_transform.horizontal_flip_list(1,
                                                                 imgs,
                                                                 order="HWC",
                                                                 boxes=boxes)
        else:
            raise NotImplementedError("Unsupported split mode {}".format(
                self._split))

        # Convert image to CHW keeping BGR order.
        imgs = [cv2_transform.HWC2CHW(img) for img in imgs]

        # Image [0, 255] -> [0, 1].
        imgs = [img / 255.0 for img in imgs]

        imgs = [
            np.ascontiguousarray(
                # img.reshape((3, self._crop_size, self._crop_size))
                img.reshape((3, imgs[0].shape[1], imgs[0].shape[2])
                            )).astype(np.float32) for img in imgs
        ]

        # Do color augmentation (after divided by 255.0).
        if self._split == "train" and self._use_color_augmentation:
            if not self._pca_jitter_only:
                imgs = cv2_transform.color_jitter_list(
                    imgs,
                    img_brightness=0.4,
                    img_contrast=0.4,
                    img_saturation=0.4,
                )

            imgs = cv2_transform.lighting_list(
                imgs,
                alphastd=0.1,
                eigval=np.array(self._pca_eigval).astype(np.float32),
                eigvec=np.array(self._pca_eigvec).astype(np.float32),
            )

        # Normalize images by mean and std.
        imgs = [
            cv2_transform.color_normalization(
                img,
                np.array(self._data_mean, dtype=np.float32),
                np.array(self._data_std, dtype=np.float32),
            ) for img in imgs
        ]

        # Concat list of images to single ndarray.
        imgs = np.concatenate([np.expand_dims(img, axis=1) for img in imgs],
                              axis=1)

        if not self._use_bgr:
            # Convert image format from BGR to RGB.
            imgs = imgs[::-1, ...]

        imgs = np.ascontiguousarray(imgs)
        imgs = torch.from_numpy(imgs)
        boxes = cv2_transform.clip_boxes_to_image(
            # boxes[0], self._crop_size, self._crop_size.
            boxes[0],
            imgs[0].shape[1],
            imgs[0].shape[2],
        )
        return imgs, boxes
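
The first step of the method above converts AVA-style normalized boxes (coordinates in [0, 1]) into pixel coordinates and clips them to the image. A self-contained numpy sketch of that arithmetic, using np.clip in place of the repo's cv2_transform.clip_boxes_to_image and illustrative sizes:

import numpy as np

height, width = 360, 640
boxes = np.array([[0.10, 0.20, 0.55, 0.90],        # x1, y1, x2, y2 in [0, 1]
                  [0.80, 0.05, 1.20, 0.50]])       # second box runs past the right edge

boxes[:, [0, 2]] *= width                           # x coordinates -> pixels
boxes[:, [1, 3]] *= height                          # y coordinates -> pixels
boxes[:, [0, 2]] = np.clip(boxes[:, [0, 2]], 0.0, width - 1.0)
boxes[:, [1, 3]] = np.clip(boxes[:, [1, 3]], 0.0, height - 1.0)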
Example #5
def demo(cfg):
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging()

    # Print config.
    logger.info("Run demo with config:")
    logger.info(cfg)
    # Build the video model and print model statistics.
    model = model_builder.build_model(cfg)
    model.eval()
    misc.log_model_info(model)

    # Load a checkpoint to test if applicable.
    if cfg.TEST.CHECKPOINT_FILE_PATH != "":
        ckpt = cfg.TEST.CHECKPOINT_FILE_PATH
    elif cu.has_checkpoint(cfg.OUTPUT_DIR):
        ckpt = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
    elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "":
        # If no checkpoint found in TEST.CHECKPOINT_FILE_PATH or in the current
        # checkpoint folder, try to load checkpoint from
        # TRAIN.CHECKPOINT_FILE_PATH and test it.
        ckpt = cfg.TRAIN.CHECKPOINT_FILE_PATH
    else:
        raise NotImplementedError("Unknown way to load checkpoint.")

    cu.load_checkpoint(
        ckpt,
        model,
        cfg.NUM_GPUS > 1,
        None,
        inflation=False,
        convert_from_caffe2="caffe2"
        in [cfg.TEST.CHECKPOINT_TYPE, cfg.TRAIN.CHECKPOINT_TYPE],
    )

    if cfg.DETECTION.ENABLE:
        # Load object detector from detectron2
        dtron2_cfg_file = cfg.DEMO.DETECTRON2_OBJECT_DETECTION_MODEL_CFG
        dtron2_cfg = get_cfg()
        dtron2_cfg.merge_from_file(model_zoo.get_config_file(dtron2_cfg_file))
        dtron2_cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = .5
        dtron2_cfg.MODEL.WEIGHTS = cfg.DEMO.DETECTRON2_OBJECT_DETECTION_MODEL_WEIGHTS
        object_predictor = DefaultPredictor(dtron2_cfg)
        # Load the labels of AVA dataset
        with open(cfg.DEMO.LABEL_FILE_PATH) as f:
            labels = f.read().split('\n')[:-1]
        palette = np.random.randint(64, 128, (len(labels), 3)).tolist()
        boxes = []
    else:
        # Load the labels of the Kinetics-400 dataset
        labels_df = pd.read_csv(cfg.DEMO.LABEL_FILE_PATH)
        labels = labels_df['name'].values

    frame_provider = VideoReader(cfg)
    seq_len = cfg.DATA.NUM_FRAMES * cfg.DATA.SAMPLING_RATE
    frames = []
    pred_labels = []
    s = 0.
    for able_to_read, frame in frame_provider:
        if not able_to_read:
            # when the end of the video is reached, clear the buffer and continue to the next one.
            frames = []
            continue

        if len(frames) != seq_len:
            frame_processed = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame_processed = scale(cfg.DATA.TEST_CROP_SIZE, frame_processed)
            frames.append(frame_processed)
            if cfg.DETECTION.ENABLE and len(frames) == seq_len // 2 - 1:
                mid_frame = frame

        if len(frames) == seq_len:
            start = time()
            if cfg.DETECTION.ENABLE:
                outputs = object_predictor(mid_frame)
                fields = outputs["instances"]._fields
                pred_classes = fields["pred_classes"]
                selection_mask = pred_classes == 0
                # acquire person boxes
                pred_classes = pred_classes[selection_mask]
                pred_boxes = fields["pred_boxes"].tensor[selection_mask]
                scores = fields["scores"][selection_mask]
                boxes = cv2_transform.scale_boxes(
                    cfg.DATA.TEST_CROP_SIZE, pred_boxes,
                    frame_provider.display_height,
                    frame_provider.display_width)
                boxes = torch.cat(
                    [torch.full((boxes.shape[0], 1), float(0)).cuda(), boxes],
                    axis=1)

            inputs = torch.as_tensor(frames).float()
            inputs = inputs / 255.0
            # Perform color normalization.
            inputs = inputs - torch.tensor(cfg.DATA.MEAN)
            inputs = inputs / torch.tensor(cfg.DATA.STD)
            # T H W C -> C T H W.
            inputs = inputs.permute(3, 0, 1, 2)

            # 1 C T H W.
            inputs = inputs.unsqueeze(0)

            # Sample frames for the fast pathway.
            index = torch.linspace(0, inputs.shape[2] - 1,
                                   cfg.DATA.NUM_FRAMES).long()
            fast_pathway = torch.index_select(inputs, 2, index)
            # logger.info('fast_pathway.shape={}'.format(fast_pathway.shape))

            # Sample frames for the slow pathway.
            index = torch.linspace(0, fast_pathway.shape[2] - 1,
                                   fast_pathway.shape[2] //
                                   cfg.SLOWFAST.ALPHA).long()
            slow_pathway = torch.index_select(fast_pathway, 2, index)
            # logger.info('slow_pathway.shape={}'.format(slow_pathway.shape))
            inputs = [slow_pathway, fast_pathway]
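            # Worked example with illustrative values (not taken from any specific
            # config): if DATA.NUM_FRAMES = 32, DATA.SAMPLING_RATE = 2 and
            # SLOWFAST.ALPHA = 4, the buffer holds seq_len = 64 frames, the fast
            # pathway keeps 32 evenly spaced frames and the slow pathway keeps
            # 32 // 4 = 8 of those, so inputs = [(1, C, 8, H, W), (1, C, 32, H, W)].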
            """
            # Transfer the data to the current GPU device.
            if isinstance(inputs, (list,)):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)
            """
            # Perform the forward pass.
            if cfg.DETECTION.ENABLE:
                # When there is nothing in the scene,
                #   use a dummy variable to disable all computations below.
                if not len(boxes):
                    preds = torch.tensor([])
                else:
                    preds = model(inputs, boxes)
            else:
                preds = model(inputs)

            # Gather all the predictions across all the devices to perform ensemble.
            if cfg.NUM_GPUS > 1:
                preds = du.all_gather(preds)[0]

            if cfg.DETECTION.ENABLE:
                # This post-processing is intentionally done on the CPU, since my
                #   laptop GPU (an RTX 2080) runs out of memory; if your GPU is more
                #   powerful, I recommend changing this section so that CUDA does
                #   the processing.
                preds = preds.cpu().detach().numpy()
                pred_masks = preds > .1
                label_ids = [
                    np.nonzero(pred_mask)[0] for pred_mask in pred_masks
                ]
                pred_labels = [[
                    labels[label_id] for label_id in perbox_label_ids
                ] for perbox_label_ids in label_ids]
                # I'm not sure how detectron2 rescales boxes to the original image
                #   size, so I use the SlowFast input boxes and rescale them back
                #   instead; it's safer, and it still works even if the boxes were
                #   not rescaled by cv2_transform.rescale_boxes.
                boxes = boxes.cpu().detach().numpy()
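                # With illustrative sizes, e.g. a 720x1280 display and
                # TEST_CROP_SIZE = 256, ratio = 720 / 256 = 2.8125, which undoes the
                # short-side scaling that cv2_transform.scale_boxes applied above.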
                ratio = np.min([
                    frame_provider.display_height, frame_provider.display_width
                ]) / cfg.DATA.TEST_CROP_SIZE
                boxes = boxes[:, 1:] * ratio
            else:
                # Option 1: single-label inference selected from the highest-probability entry.
                # label_id = preds.argmax(-1).cpu()
                # pred_label = labels[label_id]
                # Option 2: multi-label inference selected from probability entries > threshold.
                label_ids = torch.nonzero(
                    preds.squeeze() > .1).reshape(-1).cpu().detach().numpy()
                pred_labels = labels[label_ids]
                logger.info(pred_labels)
                if not list(pred_labels):
                    pred_labels = ['Unknown']

            # # option 1: remove the oldest frame in the buffer to make place for the new one.
            # frames.pop(0)
            # option 2: empty the buffer
            frames = []
            s = time() - start

        if cfg.DETECTION.ENABLE and pred_labels and boxes.any():
            for box, box_labels in zip(boxes.astype(int), pred_labels):
                cv2.rectangle(frame,
                              tuple(box[:2]),
                              tuple(box[2:]), (0, 255, 0),
                              thickness=2)
                label_origin = box[:2]
                for label in box_labels:
                    label_origin[-1] -= 5
                    (label_width, label_height), _ = cv2.getTextSize(
                        label, cv2.FONT_HERSHEY_SIMPLEX, .5, 2)
                    cv2.rectangle(frame,
                                  (label_origin[0], label_origin[1] + 5),
                                  (label_origin[0] + label_width,
                                   label_origin[1] - label_height - 5),
                                  palette[labels.index(label)], -1)
                    cv2.putText(frame, label, tuple(label_origin),
                                cv2.FONT_HERSHEY_SIMPLEX, .5, (255, 255, 255),
                                1)
                    label_origin[-1] -= label_height + 5
        if not cfg.DETECTION.ENABLE:
            # Display predicted labels to frame.
            y_offset = 50
            cv2.putText(frame,
                        'Action:', (10, y_offset),
                        fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                        fontScale=.65,
                        color=(0, 235, 0),
                        thickness=2)
            for pred_label in pred_labels:
                y_offset += 30
                cv2.putText(frame,
                            '{}'.format(pred_label), (20, y_offset),
                            fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                            fontScale=.65,
                            color=(0, 235, 0),
                            thickness=2)

        # Display prediction speed
        cv2.putText(frame,
                    'Speed: {:.2f}s'.format(s), (10, 25),
                    fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                    fontScale=.65,
                    color=(0, 235, 0),
                    thickness=2)
        # Display the frame
        cv2.imshow('SlowFast', frame)
        # hit Esc to quit the demo.
        key = cv2.waitKey(1)
        if key == 27:
            break

    frame_provider.clean()
Example #6
    def get_predictions(self):
        """
        Predict and append prediction results to each box in each keyframe in
        `self.pred_boxes` dictionary.
        """
        # Set random seed from configs.
        np.random.seed(self.cfg.RNG_SEED)
        torch.manual_seed(self.cfg.RNG_SEED)

        # Setup logging format.
        logging.setup_logging(self.cfg.OUTPUT_DIR)

        # Print config.
        logger.info("Run demo with config:")
        logger.info(self.cfg)
        assert (self.cfg.NUM_GPUS <=
                1), "Cannot run demo visualization on multiple GPUs."

        # Build the video model and print model statistics.
        model = build_model(self.cfg)
        model.eval()
        logger.info("Start loading model info")
        misc.log_model_info(model, self.cfg, use_train_input=False)
        logger.info("Start loading model weights")
        cu.load_test_checkpoint(self.cfg, model)
        logger.info("Finish loading model weights")
        logger.info("Start making predictions for precomputed boxes.")
        for keyframe_idx, boxes_and_labels in tqdm.tqdm(
                self.pred_boxes.items()):
            inputs = self.get_input_clip(keyframe_idx)
            boxes = boxes_and_labels[0]
            boxes = torch.from_numpy(np.array(boxes)).float()

            box_transformed = scale_boxes(
                self.cfg.DATA.TEST_CROP_SIZE,
                boxes,
                self.display_height,
                self.display_width,
            )

            # Pad frame index for each box.
            box_inputs = torch.cat(
                [
                    torch.full((box_transformed.shape[0], 1), float(0)),
                    box_transformed,
                ],
                axis=1,
            )
            if self.cfg.NUM_GPUS:
                # Transfer the data to the current GPU device.
                if isinstance(inputs, (list, )):
                    for i in range(len(inputs)):
                        inputs[i] = inputs[i].cuda(non_blocking=True)
                else:
                    inputs = inputs.cuda(non_blocking=True)

                box_inputs = box_inputs.cuda()

            preds = model(inputs, box_inputs)

            preds = preds.detach()

            if self.cfg.NUM_GPUS:
                preds = preds.cpu()

            boxes_and_labels[1] = preds
Example #7
    def __call__(self, task):
        """
        Returns the prediction results for the current task.
        Args:
            task (TaskInfo object): task object that contain
                the necessary information for action prediction. (e.g. frames, boxes)
        Returns:
            task (TaskInfo object): the same task info object but filled with
                prediction values (a tensor) and the corresponding boxes for
                action detection task.
        """
        if self.cfg.DETECTION.ENABLE:
            task = self.object_detector(task)

        frames, bboxes = task.frames, task.bboxes
        if bboxes is not None:
            bboxes = cv2_transform.scale_boxes(
                self.cfg.DATA.TEST_CROP_SIZE,
                bboxes,
                task.img_height,
                task.img_width,
            )
        if self.cfg.DEMO.INPUT_FORMAT == "BGR":
            frames = [
                cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) for frame in frames
            ]

        frames = [
            cv2_transform.scale(self.cfg.DATA.TEST_CROP_SIZE, frame)
            for frame in frames
        ]
        inputs = process_cv2_inputs(frames, self.cfg)
        if bboxes is not None:
            index_pad = torch.full(
                size=(bboxes.shape[0], 1),
                fill_value=float(0),
                device=bboxes.device,
            )

            # Pad frame index for each box.
            bboxes = torch.cat([index_pad, bboxes], axis=1)
        if self.cfg.NUM_GPUS > 0:
            # Transfer the data to the current GPU device.
            if isinstance(inputs, (list, )):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(device=torch.device(
                        self.gpu_id),
                                               non_blocking=True)
            else:
                inputs = inputs.cuda(device=torch.device(self.gpu_id),
                                     non_blocking=True)
        if self.cfg.DETECTION.ENABLE and not bboxes.shape[0]:
            preds = torch.tensor([])
        else:
            preds = self.model(inputs, bboxes)

        if self.cfg.NUM_GPUS:
            preds = preds.cpu()
            if bboxes is not None:
                bboxes = bboxes.detach().cpu()

        preds = preds.detach()
        task.add_action_preds(preds)
        if bboxes is not None:
            task.add_bboxes(bboxes[:, 1:])

        return task
Example #8
    def images_and_boxes_preprocessing_cv2(self, imgs, boxes):
        """
        This function performs preprocessing for the input images and
        corresponding boxes for one clip with opencv as backend.

        Args:
            imgs (list of ndarrays with len num_frames): the images. Each image
                                    is a ndarray with shape (H, W, C)
            boxes (ndarray): the boxes for the current clip - not normalized. shape (num_boxes, 4 = x1, y1, x2, y2)

        Returns:
            imgs (tensor): list of preprocessed images. shape: (C, num_frames, H, W)
            boxes (ndarray): preprocessed boxes. shape (num_boxes, 4 = x1, y1, x2, y2)
        """

        # Assure that boxes have the right size
        boxes = cv2_transform.clip_boxes_to_image(boxes, self.img_height,
                                                  self.img_width)

        # `transform.py` expects a list of np.array. However, for an AVA-like
        # structure, we only have one np.array.
        boxes = [boxes]

        # The image now is in HWC, BGR format.
        # Short side to test_scale. Non-local and STRG use 256.
        imgs = [cv2_transform.scale(self.crop_size, img) for img in imgs]
        # Boxes have to be adjusted to new image scale
        boxes = [
            cv2_transform.scale_boxes(self.crop_size, boxes[0],
                                      self.img_height, self.img_width)
        ]

        # Convert image to CHW keeping BGR order.
        imgs = [cv2_transform.HWC2CHW(img) for img in imgs]

        # Image [0, 255] -> [0, 1].
        imgs = [img / 255.0 for img in imgs]

        imgs = [
            np.ascontiguousarray(
                img.reshape((3, imgs[0].shape[1],
                             imgs[0].shape[2]))).astype(np.float32)
            for img in imgs
        ]

        # Normalize images by mean and std.
        imgs = [
            cv2_transform.color_normalization(
                img,
                np.array(self.data_mean, dtype=np.float32),
                np.array(self.data_std, dtype=np.float32),
            ) for img in imgs
        ]

        # Concat list of images to single ndarray.
        imgs = np.concatenate([np.expand_dims(img, axis=1) for img in imgs],
                              axis=1)

        if not self.use_bgr:
            # Convert image format from BGR to RGB.
            # Note that Kinetics pre-training uses RGB!
            imgs = imgs[::-1, ...]

        imgs = np.ascontiguousarray(imgs)
        imgs = torch.from_numpy(imgs)
        boxes = cv2_transform.clip_boxes_to_image(boxes[0], imgs[0].shape[1],
                                                  imgs[0].shape[2])

        # If you want to see what the images look like, you can uncomment this:
        # export_image(cfg, imgs.permute(1, 0, 2, 3).data.numpy(), [boxes], "demo", "CHW", True, use_bgr)

        return imgs, boxes
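
The concatenation step in the method above stacks the per-frame CHW arrays into a single (C, T, H, W) array; a small self-contained shape check with illustrative sizes:

import numpy as np

num_frames, height, width = 8, 256, 256
imgs = [np.zeros((3, height, width), dtype=np.float32) for _ in range(num_frames)]

stacked = np.concatenate([np.expand_dims(img, axis=1) for img in imgs], axis=1)
print(stacked.shape)   # (3, 8, 256, 256) == (C, T, H, W)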
Example #9
    def __call__(self, task):
        """
        Returns the prediction results for the current task.
        Args:
            task (TaskInfo object): task object that contain
                the necessary information for action prediction. (e.g. frames, boxes)
        Returns:
            task (TaskInfo object): the same task info object but filled with
                prediction values (a tensor) and the corresponding boxes for
                action detection task.
        """
        # ------ 1. First stage: detection ------
        if self.cfg.DETECTION.ENABLE:
            task = self.object_detector(task)

        # ------ 2. Second stage: recognition ------
        frames, bboxes = task.frames, task.bboxes

        ################################################################################################################
        from slowfast.datasets.utils import pack_pathway_output, tensor_normalize
        from torchvision import transforms
        from PIL import Image
        if self.cfg.DEMO.INPUT_FORMAT == "BGR":
            frames = [
                cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) for frame in frames
            ]

        inputs1 = []
        inputs0 = []
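        # lineSpace is a custom helper in this fork (it is not part of upstream
        # SlowFast's cv2_transform); from its arguments it appears to select evenly
        # spaced frames: 32 out of indices 0..63 into inputs1, then 8 out of the
        # first 32 into inputs0.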
        cv2_transform.lineSpace(0, 63, 32, frames, inputs1)
        cv2_transform.lineSpace(0, 31, 8, inputs1, inputs0)

        inputs0 = [
            cv2_transform.scale(self.cfg.DATA.TEST_CROP_SIZE, frame)
            for frame in inputs0
        ]
        inputs1 = [
            cv2_transform.scale(self.cfg.DATA.TEST_CROP_SIZE, frame)
            for frame in inputs1
        ]

        inputs0 = torch.from_numpy(np.array(inputs0)).float() / 255
        inputs1 = torch.from_numpy(np.array(inputs1)).float() / 255
        inputs0 = tensor_normalize(inputs0, self.cfg.DATA.MEAN,
                                   self.cfg.DATA.STD)
        inputs1 = tensor_normalize(inputs1, self.cfg.DATA.MEAN,
                                   self.cfg.DATA.STD)
        # T H W C -> C T H W.
        inputs0 = inputs0.permute(3, 0, 1, 2)
        inputs1 = inputs1.permute(3, 0, 1, 2)
        inputs0 = inputs0.unsqueeze(0)
        inputs1 = inputs1.unsqueeze(0)
        inputs = [inputs0, inputs1]
        ###############################################################################################################

        if bboxes is not None:
            bboxes = cv2_transform.scale_boxes(
                self.cfg.DATA.TEST_CROP_SIZE,
                bboxes,
                task.img_height,
                task.img_width,
            )
        # if self.cfg.DEMO.INPUT_FORMAT == "BGR":
        #     frames = [
        #         cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) for frame in frames
        #     ]

        # frames = [
        #     cv2_transform.scale(self.cfg.DATA.TEST_CROP_SIZE, frame)
        #     for frame in frames
        # ]

        # change frames to slowfast inputs
        # inputs = process_cv2_inputs(frames, self.cfg)
        # add person cls to bbox
        if bboxes is not None:
            index_pad = torch.full(
                size=(bboxes.shape[0], 1),
                fill_value=float(0),
                device=bboxes.device,
            )

            # Pad frame index for each box.
            bboxes = torch.cat([index_pad, bboxes], axis=1)
        if self.cfg.NUM_GPUS > 0:
            # Transfer the data to the current GPU device.
            if isinstance(inputs, (list, )):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(device=torch.device(
                        self.gpu_id),
                                               non_blocking=True)
            else:
                inputs = inputs.cuda(device=torch.device(self.gpu_id),
                                     non_blocking=True)
        if self.cfg.DETECTION.ENABLE and not bboxes.shape[0]:
            preds = torch.tensor([])
        else:
            # change [1, 3, 8, 224, 224] -> [8, 3, 224, 224]
            bboxes = bboxes.unsqueeze(0).unsqueeze(0)
            inputs[0] = inputs[0].squeeze(0).permute(1, 0, 2, 3)
            inputs[1] = inputs[1].squeeze(0).permute(1, 0, 2, 3)
            ##########################################################
            import numpy
            numpy.set_printoptions(suppress=True)

            # import scipy.io as io
            # inputs0 = inputs[0].squeeze(0).permute(
            #     1, 0, 2, 3)[0].permute(1, 2, 0).data.cpu().numpy()
            # cv2.imwrite("1.jpg", np.array(
            #     inputs0*255, dtype=np.float32))  # dtype=np.uint8
            # print(inputs0)
            # numpy.save("input0.npy", inputs0)
            # result0 = numpy.array(inputs0.reshape(-1, 1))
            # numpy.savetxt("result0.txt", result0)
            # io.savemat("save.mat", {"result0": result0})

            #######################  save .txt file ############################
            # result0 = numpy.array(
            #     inputs[0].cpu().reshape(-1, 1)).astype(np.float32)
            # # result0 = result0.astype('float')
            # # for i in range(10):
            # #     print(result0[i])
            # # exit(0)
            # result0.astype('float32').tofile("input0.txt")
            # result1 = numpy.array(
            #     inputs[1].cpu().reshape(-1, 1)).astype(np.float32)
            # result1.astype('float32').tofile("input1.txt")
            # result0 = numpy.array(
            #     bboxes.cpu().reshape(-1, 1)).astype(np.float32)
            # result0.astype('float32').tofile("input2.txt")

            ##################################### save .npy file ###################
            # numpy.save("input0.npy", inputs[0].cpu().numpy())
            # numpy.save("input1.npy", inputs[1].cpu().numpy())
            # numpy.save("input2.npy", bboxes.cpu().numpy())
            # input0 = torch.from_numpy(np.load("input0.npy")).cuda()
            # input1 = torch.from_numpy(np.load("input1.npy")).cuda()
            # input2 = torch.from_numpy(np.load("input2.npy")).cuda()
            ##########################################################
            preds = self.model(inputs, bboxes)
            # preds = self.model([input0, input1], input2)

            # result_pred = numpy.array(preds.detach().cpu().reshape(-1, 1))
            # numpy.savetxt("result_preds.txt", result_pred)
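            # NOTE: the two lines below are leftover debugging: they print the raw
            # predictions and then terminate the whole demo after the first clip.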
            print(preds)
            exit(0)
            # ---- open with video test ----
            bboxes = bboxes.squeeze(0).squeeze(0)  # change [1, 1, 3, 5] -> [3, 5]
            # ---- open with video test end ----

        if self.cfg.NUM_GPUS:
            preds = preds.cpu()
            if bboxes is not None:
                bboxes = bboxes.detach().cpu()

        preds = preds.detach()
        task.add_action_preds(preds)
        if bboxes is not None:
            task.add_bboxes(bboxes[:, 1:])

        return task
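
The reshaping commented above as [1, 3, 8, 224, 224] -> [8, 3, 224, 224] simply drops the batch dimension and swaps the channel and time axes; a self-contained shape check:

import torch

clip = torch.zeros(1, 3, 8, 224, 224)           # (batch, C, T, H, W)
reshaped = clip.squeeze(0).permute(1, 0, 2, 3)  # -> (T, C, H, W)
print(reshaped.shape)                           # torch.Size([8, 3, 224, 224])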