Example #1
    def __getitem__(self, index):
        # Decode video. Meta info is used to perform selective decoding.
        frames = self.sample_frames(index)

        # Perform color normalization.
        frames = utils.tensor_normalize(frames, self.cfg.DATA.MEAN,
                                        self.cfg.DATA.STD)
        # T H W C -> C T H W.
        frames = frames.permute(3, 0, 1, 2)
        frames = utils.pack_pathway_output(self.cfg, frames)

        return frames, index
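
Example #1 delegates the pathway packing to utils.pack_pathway_output. Below is a rough sketch of what that step produces, mirroring the manual pathway construction in Examples #3 and #4 further down; it is not the library's exact implementation, and the alpha value and single_pathway flag are illustrative stand-ins for the corresponding cfg settings.

import torch

def pack_pathway_output_sketch(frames, alpha=4, single_pathway=False):
    """Sketch: split a C x T x H x W clip into the per-pathway input list."""
    if single_pathway:
        return [frames]
    fast_pathway = frames
    # Slow pathway: sample T // alpha evenly spaced frames along the temporal dim.
    index = torch.linspace(0, frames.shape[1] - 1,
                           frames.shape[1] // alpha).long()
    slow_pathway = torch.index_select(frames, 1, index)
    return [slow_pathway, fast_pathway]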
Example #2
def process_cv2_inputs(frames, cfg):
    """
    Normalize and prepare inputs as a list of tensors. Each tensor
    corresponds to a unique pathway.
    Args:
        frames (list of array): list of input images (corresponding to one clip) in range [0, 255].
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    inputs = torch.from_numpy(np.array(frames)).float() / 255
    inputs = tensor_normalize(inputs, cfg.DATA.MEAN, cfg.DATA.STD)
    # T H W C -> C T H W.
    inputs = inputs.permute(3, 0, 1, 2)
    # Uniformly sample cfg.DATA.NUM_FRAMES frames along the temporal dimension.
    index = torch.linspace(0, inputs.shape[1] - 1, cfg.DATA.NUM_FRAMES).long()
    inputs = torch.index_select(inputs, 1, index)
    inputs = pack_pathway_output(cfg, inputs)
    inputs = [inp.unsqueeze(0) for inp in inputs]
    return inputs
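
A minimal usage sketch for process_cv2_inputs, assuming a SlowFast CfgNode named cfg has already been loaded; the clip path and the frame-collection loop are illustrative.

import cv2

# Collect one clip's worth of RGB frames with OpenCV (path is illustrative).
cap = cv2.VideoCapture("clip.mp4")
frames = []
while len(frames) < cfg.DATA.NUM_FRAMES * cfg.DATA.SAMPLING_RATE:
    ok, frame = cap.read()
    if not ok:
        break
    frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
cap.release()

inputs = process_cv2_inputs(frames, cfg)  # list of 1 x C x T x H x W tensors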
Example #3
    def slowfast_predict(
        self,
        frames,
        labels,
    ):  # slow fast

        start_time = time.time()

        inputs = torch.from_numpy(np.array(frames)).float() / 255.0
        print("frame change time is :", time.time() - start_time)
        inputs = tensor_normalize(inputs, self.cfg.DATA.MEAN,
                                  self.cfg.DATA.STD)
        inputs = inputs.permute(3, 0, 1, 2)

        inputs = inputs.unsqueeze(0)

        # Sample frames for the fast pathway.
        index = torch.linspace(0, inputs.shape[2] - 1,
                               self.cfg.DATA.NUM_FRAMES).long()
        fast_pathway = torch.index_select(inputs, 2, index)

        # Sample frames for the slow pathway.
        index = torch.linspace(
            0, fast_pathway.shape[2] - 1,
            fast_pathway.shape[2] // self.cfg.SLOWFAST.ALPHA).long()
        slow_pathway = torch.index_select(fast_pathway, 2, index)
        # logger.info('slow_pathway.shape={}'.format(slow_pathway.shape))
        inputs = [slow_pathway, fast_pathway]

        # Transfer the data to the current GPU device.
        if isinstance(inputs, (list, )):
            for i in range(len(inputs)):
                inputs[i] = inputs[i].cuda()
        else:
            inputs = inputs.cuda()

        boxes = self.queue_demo.get()

        if not len(boxes):
            preds = torch.tensor([])
        else:
            preds = self.model(inputs, boxes)

        if self.cfg.NUM_GPUS > 1:
            preds = du.all_gather(preds)[0]

        preds = preds.cpu().detach().numpy()
        pred_masks = preds > 0.1
        label_ids = [np.nonzero(pred_mask)[0] for pred_mask in pred_masks]
        pred_labels = [[labels[label_id] for label_id in perbox_label_ids]
                       for perbox_label_ids in label_ids]

        # boxes = boxes.cpu().detach().numpy()
        # ratio = np.min(
        #     [self.frame_provider.display_height, self.frame_provider.display_width]
        # ) / self.cfg.DATA.TEST_CROP_SIZE
        #
        # boxes = boxes[:, 1:] * ratio
        detection_time = time.time()
        print(f"slowfast inference time: {detection_time - start_time}")
        # re=pred_labels
        self.fra(pred_labels)
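
To make the two-pathway sampling above concrete, here is a standalone shape check with illustrative values (NUM_FRAMES=32, SLOWFAST.ALPHA=4):

import torch

clip = torch.rand(1, 3, 32, 224, 224)  # 1 x C x T x H x W
index = torch.linspace(0, clip.shape[2] - 1, 32).long()
fast_pathway = torch.index_select(clip, 2, index)            # keeps all 32 frames
index = torch.linspace(0, fast_pathway.shape[2] - 1,
                       fast_pathway.shape[2] // 4).long()     # every 4th frame
slow_pathway = torch.index_select(fast_pathway, 2, index)     # keeps 8 frames
print(fast_pathway.shape)  # torch.Size([1, 3, 32, 224, 224])
print(slow_pathway.shape)  # torch.Size([1, 3, 8, 224, 224])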
Example #4
def demo(cfg):
    """
    Run inference on an input video or stream from webcam.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging()

    # Print config.
    logger.info("Run demo with config:")
    logger.info(cfg)
    # Build the video model and print model statistics.
    model = build.build_model(cfg)
    model.eval()
    misc.log_model_info(model, cfg)

    # Load a checkpoint to test if applicable.
    if cfg.TEST.CHECKPOINT_FILE_PATH != "":
        ckpt = cfg.TEST.CHECKPOINT_FILE_PATH
    elif cu.has_checkpoint(cfg.OUTPUT_DIR):
        ckpt = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
    elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "":
        # If no checkpoint found in TEST.CHECKPOINT_FILE_PATH or in the current
        # checkpoint folder, try to load checkpoint from
        # TRAIN.CHECKPOINT_FILE_PATH and test it.
        ckpt = cfg.TRAIN.CHECKPOINT_FILE_PATH
    else:
        raise NotImplementedError("Unknown way to load checkpoint.")

    cu.load_checkpoint(
        ckpt,
        model,
        cfg.NUM_GPUS > 1,
        None,
        inflation=False,
        convert_from_caffe2="caffe2"
        in [cfg.TEST.CHECKPOINT_TYPE, cfg.TRAIN.CHECKPOINT_TYPE],
    )

    if cfg.DETECTION.ENABLE:
        # Load object detector from detectron2.
        dtron2_cfg_file = cfg.DEMO.DETECTRON2_OBJECT_DETECTION_MODEL_CFG
        dtron2_cfg = get_cfg()
        dtron2_cfg.merge_from_file(model_zoo.get_config_file(dtron2_cfg_file))
        dtron2_cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5
        dtron2_cfg.MODEL.WEIGHTS = (
            cfg.DEMO.DETECTRON2_OBJECT_DETECTION_MODEL_WEIGHTS)
        logger.info("Initialize detectron2 model.")
        object_predictor = DefaultPredictor(dtron2_cfg)
        # Load the labels of the AVA dataset.
        with open(cfg.DEMO.LABEL_FILE_PATH) as f:
            labels = f.read().split("\n")[:-1]
        palette = np.random.randint(64, 128, (len(labels), 3)).tolist()
        boxes = []
        logger.info("Finish loading detectron2")
    else:
        # Load the labels of the Kinetics-400 dataset.
        labels_df = pd.read_csv(cfg.DEMO.LABEL_FILE_PATH)
        labels = labels_df["name"].values

    frame_provider = VideoReader(cfg)

    seq_len = cfg.DATA.NUM_FRAMES * cfg.DATA.SAMPLING_RATE
    frames = []
    pred_labels = []
    s = 0.0
    for able_to_read, frame in frame_provider:
        if not able_to_read:
            # When the end of the stream is reached, clear the buffer and stop.
            frames = []
            break

        if len(frames) != seq_len:
            frame_processed = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame_processed = scale(cfg.DATA.TEST_CROP_SIZE, frame_processed)
            frames.append(frame_processed)
            if cfg.DETECTION.ENABLE and len(frames) == seq_len // 2 - 1:
                mid_frame = frame

        if len(frames) == seq_len:
            start = time()
            if cfg.DETECTION.ENABLE:
                outputs = object_predictor(mid_frame)
                fields = outputs["instances"]._fields
                pred_classes = fields["pred_classes"]
                selection_mask = pred_classes == 0
                # acquire person boxes.
                pred_classes = pred_classes[selection_mask]
                pred_boxes = fields["pred_boxes"].tensor[selection_mask]
                boxes = cv2_transform.scale_boxes(
                    cfg.DATA.TEST_CROP_SIZE,
                    pred_boxes,
                    frame_provider.display_height,
                    frame_provider.display_width,
                )
                boxes = torch.cat(
                    [torch.full((boxes.shape[0], 1), float(0)).cuda(), boxes],
                    axis=1,
                )
            inputs = tensor_normalize(torch.as_tensor(frames), cfg.DATA.MEAN,
                                      cfg.DATA.STD)

            # T H W C -> C T H W.
            inputs = inputs.permute(3, 0, 1, 2)

            # 1 C T H W.
            inputs = inputs.unsqueeze(0)
            if cfg.MODEL.ARCH in cfg.MODEL.SINGLE_PATHWAY_ARCH:
                # Sample frames for the fast pathway.
                index = torch.linspace(0, inputs.shape[2] - 1,
                                       cfg.DATA.NUM_FRAMES).long()
                inputs = [torch.index_select(inputs, 2, index)]
            elif cfg.MODEL.ARCH in cfg.MODEL.MULTI_PATHWAY_ARCH:
                # Sample frames for the fast pathway.
                index = torch.linspace(0, inputs.shape[2] - 1,
                                       cfg.DATA.NUM_FRAMES).long()
                fast_pathway = torch.index_select(inputs, 2, index)

                # Sample frames for the slow pathway.
                index = torch.linspace(
                    0,
                    fast_pathway.shape[2] - 1,
                    fast_pathway.shape[2] // cfg.SLOWFAST.ALPHA,
                ).long()
                slow_pathway = torch.index_select(fast_pathway, 2, index)
                inputs = [slow_pathway, fast_pathway]
            else:
                raise NotImplementedError("Model arch {} is not in {}".format(
                    cfg.MODEL.ARCH,
                    cfg.MODEL.SINGLE_PATHWAY_ARCH +
                    cfg.MODEL.MULTI_PATHWAY_ARCH,
                ))

            # Transfer the data to the current GPU device.
            if isinstance(inputs, (list, )):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)

            # Perform the forward pass.
            if cfg.DETECTION.ENABLE:
                # When there is nothing in the scene,
                #   use a dummy variable to disable all computations below.
                if not len(boxes):
                    preds = torch.tensor([])
                else:
                    preds = model(inputs, boxes)
            else:
                preds = model(inputs)

            # Gather all the predictions across all the devices to perform ensemble.
            if cfg.NUM_GPUS > 1:
                preds = du.all_gather(preds)[0]

            if cfg.DETECTION.ENABLE:
                # This post-processing is intentionally done on the CPU, since a
                #   laptop RTX 2080 GPU runs out of memory here; if your GPU has
                #   more memory, consider moving this section onto CUDA.
                preds = preds.cpu().detach().numpy()
                pred_masks = preds > 0.1
                label_ids = [
                    np.nonzero(pred_mask)[0] for pred_mask in pred_masks
                ]
                pred_labels = [[
                    labels[label_id] for label_id in perbox_label_ids
                ] for perbox_label_ids in label_ids]
                # It is unclear how detectron2 rescales boxes to the original image
                #   size, so the SlowFast input boxes are rescaled back instead; this
                #   is safer and works even if the boxes were not rescaled by
                #   cv2_transform.scale_boxes.
                boxes = boxes.cpu().detach().numpy()
                ratio = (np.min([
                    frame_provider.display_height,
                    frame_provider.display_width,
                ]) / cfg.DATA.TEST_CROP_SIZE)
                boxes = boxes[:, 1:] * ratio
            else:
                # Option 1: single-label inference selected from the highest-probability entry.
                # label_id = preds.argmax(-1).cpu()
                # pred_label = labels[label_id]
                # Option 2: multi-label inference selected from probability entries > threshold.
                label_ids = (torch.nonzero(
                    preds.squeeze() > 0.1).reshape(-1).cpu().detach().numpy())
                pred_labels = labels[label_ids]
                logger.info(pred_labels)
                if not list(pred_labels):
                    pred_labels = ["Unknown"]

            # # option 1: remove the oldest frame in the buffer to make place for the new one.
            # frames.pop(0)
            # option 2: empty the buffer
            frames = []
            s = time() - start

        if cfg.DETECTION.ENABLE and pred_labels and boxes.any():
            for box, box_labels in zip(boxes.astype(int), pred_labels):
                cv2.rectangle(
                    frame,
                    tuple(box[:2]),
                    tuple(box[2:]),
                    (0, 255, 0),
                    thickness=2,
                )
                label_origin = box[:2]
                for label in box_labels:
                    label_origin[-1] -= 5
                    (label_width, label_height), _ = cv2.getTextSize(
                        label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2)
                    cv2.rectangle(
                        frame,
                        (label_origin[0], label_origin[1] + 5),
                        (
                            label_origin[0] + label_width,
                            label_origin[1] - label_height - 5,
                        ),
                        palette[labels.index(label)],
                        -1,
                    )
                    cv2.putText(
                        frame,
                        label,
                        tuple(label_origin),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        0.5,
                        (255, 255, 255),
                        1,
                    )
                    label_origin[-1] -= label_height + 5
        if not cfg.DETECTION.ENABLE:
            # Display predicted labels to frame.
            y_offset = 50
            cv2.putText(
                frame,
                "Action:",
                (10, y_offset),
                fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                fontScale=0.65,
                color=(0, 235, 0),
                thickness=2,
            )
            for pred_label in pred_labels:
                y_offset += 30
                cv2.putText(
                    frame,
                    "{}".format(pred_label),
                    (20, y_offset),
                    fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                    fontScale=0.65,
                    color=(0, 235, 0),
                    thickness=2,
                )

        # Display prediction speed.
        cv2.putText(
            frame,
            "Speed: {:.2f}s".format(s),
            (10, 25),
            fontFace=cv2.FONT_HERSHEY_SIMPLEX,
            fontScale=0.65,
            color=(0, 235, 0),
            thickness=2,
        )
        frame_provider.display(frame)
        # hit Esc to quit the demo.
        key = cv2.waitKey(1)
        if key == 27:
            break

    frame_provider.clean()
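
Both post-processing branches in the demo follow the same pattern: threshold the per-class scores at 0.1 and map the surviving class indices to label strings. A self-contained sketch with made-up scores:

import numpy as np

labels = ["stand", "walk", "talk to"]                 # illustrative label set
preds = np.array([[0.85, 0.05, 0.40]])                # one box, per-class scores
pred_masks = preds > 0.1
label_ids = [np.nonzero(mask)[0] for mask in pred_masks]
pred_labels = [[labels[i] for i in ids] for ids in label_ids]
print(pred_labels)                                    # [['stand', 'talk to']]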
Example #5
    def __getitem__(self, index):
        """
        Given the video index, return the list of frames, label, and video
        index if the video can be fetched and decoded successfully; otherwise
        repeatedly find a random video that can be decoded as a replacement.
        Args:
            index (int): the video index provided by the pytorch sampler.
        Returns:
            frames (tensor): the frames sampled from the video. The dimensions
                are `channel` x `num frames` x `height` x `width`.
            label (int): the label of the current video.
            index (int): if the video provided by pytorch sampler can be
                decoded, then return the index of the video. If not, return the
                index of the video replacement that can be decoded.
        """
        short_cycle_idx = None
        # When short cycle is used, the input index is a tuple.
        if isinstance(index, tuple):
            index, short_cycle_idx = index

        temporal_sample_index = self._spatial_temporal_idx[
            index] // self._number_of_spatial_crops
        spatial_sample_index = self._spatial_temporal_idx[
            index] % self._number_of_spatial_crops
        sampling_rate = 2

        # Try to decode and sample a clip from a video. If the video cannot be
        # decoded, repeatedly find a random video replacement that can be decoded.
        for i_try in range(self._num_retries):
            video_container = None
            try:
                video_container = container.get_video_container(
                    self._path_to_videos[index],
                    False,
                    "pyav",
                )
            except Exception as e:
                index = random.randint(0, len(self._path_to_videos) - 1)

            # Select another video if the current one could not be accessed.
            if video_container is None:
                if index + 1 < len(self._path_to_videos):
                    index = index + 1
                else:
                    index = index - 1
                continue
            NUM_ENSEMBLE_VIEWS = 10
            NUM_FRAMES = 64
            # Decode video. Meta info is used to perform selective decoding.
            frames = decoder.decode(
                container=video_container,
                sampling_rate=sampling_rate,
                num_frames=NUM_FRAMES,
                clip_idx=temporal_sample_index,
                num_clips=NUM_ENSEMBLE_VIEWS,
                video_meta=None,
                target_fps=30,
                backend="pyav",
                max_spatial_scale=256,
            )
            # If decoding failed (wrong format, video is too short, etc.),
            # select another video.
            if frames is None:
                print('try a new one')
                if index + 1 < len(self._path_to_videos):
                    index = index + 1
                else:
                    index = index - 1
                continue

            # Default Kinetics normalization values (overridden by the CLIP
            # values below).
            datamean = [0.45, 0.45, 0.45]
            datastd = [0.225, 0.225, 0.225]

            # Normalization values used by CLIP:
            datamean = [0.48145466, 0.4578275, 0.40821073]
            datastd = [0.26862954, 0.26130258, 0.27577711]

            frames = utils.tensor_normalize(frames, datamean, datastd)
            frames = frames.permute(3, 0, 1, 2)
            frames = transform.crop_EAC_image(frames, spatial_sample_index)
            label = self._labels[index]

            # Perform color normalization.
            '''if frames.dtype == torch.uint8:
                frames = frames.float()
                frames = frames / 255.0
            if type(datamean) == list:
                mean = torch.tensor(datamean)
            if type(datastd) == list:
                std = torch.tensor(datastd)
            frames = frames - mean
            frames = frames / std
            
            frames = frames.permute(0, 3, 1, 2)
            frames = transform.rescale(frames)
            frames = transform.uniform_crop(frames[0], 244)'''
            '''print(frames.size())
            a = frames.numpy()
            a = (a/a)*255
            a = a.astype(np.uint8)
            im = Image.fromarray(a[0,:,:,:])
            im.save("00.jpeg")'''
            '''im = Image.fromarray(a[1,:,:,:])
            im.save("01.jpeg")
            im = Image.fromarray(a[2,:,:,:])
            im.save("02.jpeg")
            im = Image.fromarray(a[3,:,:,:])
            im.save("03.jpeg")
            im = Image.fromarray(a[4,:,:,:])
            im.save("04.jpeg")'''
            #frames = frames.permute(0, 2, 3, 1)
            # T H W C -> C T H W. ????
            #frames = frames.permute(3, 0, 1, 2)
            #label = self._labels[index]
            return frames, label, index, {}, self._unique_video_idx[
                index], temporal_sample_index, spatial_sample_index, self._path_to_videos[
                    index]
        else:
            raise RuntimeError(
                "Failed to fetch video after {} retries.".format(
                    self._num_retries))
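
The commented-out block in this example spells out what utils.tensor_normalize does to a uint8 T x H x W x C clip. Below is a compact sketch of that behavior, assuming per-channel list mean/std such as the CLIP values above; it is not the library's own code.

import torch

def tensor_normalize_sketch(frames, mean, std):
    # Scale uint8 pixels to [0, 1], then normalize per channel (last dim).
    if frames.dtype == torch.uint8:
        frames = frames.float() / 255.0
    mean = torch.tensor(mean)
    std = torch.tensor(std)
    return (frames - mean) / std

clip = torch.randint(0, 256, (64, 256, 256, 3), dtype=torch.uint8)
out = tensor_normalize_sketch(clip,
                              [0.48145466, 0.4578275, 0.40821073],
                              [0.26862954, 0.26130258, 0.27577711])
print(out.shape)  # torch.Size([64, 256, 256, 3])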
Example #6
    def __call__(self, task):
        """
        Returns the prediction results for the current task.
        Args:
            task (TaskInfo object): task object that contains
                the necessary information for action prediction (e.g. frames, boxes).
        Returns:
            task (TaskInfo object): the same task info object, but filled with
                prediction values (a tensor) and the corresponding boxes for the
                action detection task.
        """
        # ------ 1. First stage: run the object detector ------
        if self.cfg.DETECTION.ENABLE:
            task = self.object_detector(task)

        # ------ 2. Second stage: run action recognition ------
        frames, bboxes = task.frames, task.bboxes

        ################################################################################################################
        from slowfast.datasets.utils import tensor_normalize
        if self.cfg.DEMO.INPUT_FORMAT == "BGR":
            frames = [
                cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) for frame in frames
            ]

        inputs1 = []
        inputs0 = []
        # lineSpace appears to pick evenly spaced frames: 32 of the 64 buffered
        # frames go into inputs1 (fast pathway), then 8 of those 32 into inputs0
        # (slow pathway).
        cv2_transform.lineSpace(0, 63, 32, frames, inputs1)
        cv2_transform.lineSpace(0, 31, 8, inputs1, inputs0)

        inputs0 = [
            cv2_transform.scale(self.cfg.DATA.TEST_CROP_SIZE, frame)
            for frame in inputs0
        ]
        inputs1 = [
            cv2_transform.scale(self.cfg.DATA.TEST_CROP_SIZE, frame)
            for frame in inputs1
        ]

        inputs0 = torch.from_numpy(np.array(inputs0)).float() / 255
        inputs1 = torch.from_numpy(np.array(inputs1)).float() / 255
        inputs0 = tensor_normalize(inputs0, self.cfg.DATA.MEAN,
                                   self.cfg.DATA.STD)
        inputs1 = tensor_normalize(inputs1, self.cfg.DATA.MEAN,
                                   self.cfg.DATA.STD)
        # T H W C -> C T H W.
        inputs0 = inputs0.permute(3, 0, 1, 2)
        inputs1 = inputs1.permute(3, 0, 1, 2)
        inputs0 = inputs0.unsqueeze(0)
        inputs1 = inputs1.unsqueeze(0)
        inputs = [inputs0, inputs1]
        ###############################################################################################################

        if bboxes is not None:
            bboxes = cv2_transform.scale_boxes(
                self.cfg.DATA.TEST_CROP_SIZE,
                bboxes,
                task.img_height,
                task.img_width,
            )
        # if self.cfg.DEMO.INPUT_FORMAT == "BGR":
        #     frames = [
        #         cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) for frame in frames
        #     ]

        # frames = [
        #     cv2_transform.scale(self.cfg.DATA.TEST_CROP_SIZE, frame)
        #     for frame in frames
        # ]

        # change frames to slowfast inputs
        # inputs = process_cv2_inputs(frames, self.cfg)
        # add person cls to bbox
        if bboxes is not None:
            index_pad = torch.full(
                size=(bboxes.shape[0], 1),
                fill_value=float(0),
                device=bboxes.device,
            )

            # Pad frame index for each box.
            bboxes = torch.cat([index_pad, bboxes], axis=1)
        if self.cfg.NUM_GPUS > 0:
            # Transfer the data to the current GPU device.
            if isinstance(inputs, (list, )):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(device=torch.device(
                        self.gpu_id),
                                               non_blocking=True)
            else:
                inputs = inputs.cuda(device=torch.device(self.gpu_id),
                                     non_blocking=True)
        if self.cfg.DETECTION.ENABLE and not bboxes.shape[0]:
            preds = torch.tensor([])
        else:
            # Reshape each pathway: [1, 3, T, H, W] -> [T, 3, H, W]
            # (e.g. [1, 3, 8, 224, 224] -> [8, 3, 224, 224]).
            bboxes = bboxes.unsqueeze(0).unsqueeze(0)
            inputs[0] = inputs[0].squeeze(0).permute(1, 0, 2, 3)
            inputs[1] = inputs[1].squeeze(0).permute(1, 0, 2, 3)
            ##########################################################
            import numpy
            numpy.set_printoptions(suppress=True)

            # import scipy.io as io
            # inputs0 = inputs[0].squeeze(0).permute(
            #     1, 0, 2, 3)[0].permute(1, 2, 0).data.cpu().numpy()
            # cv2.imwrite("1.jpg", np.array(
            #     inputs0*255, dtype=np.float32))  # dtype=np.uint8
            # print(inputs0)
            # numpy.save("input0.npy", inputs0)
            # result0 = numpy.array(inputs0.reshape(-1, 1))
            # numpy.savetxt("result0.txt", result0)
            # io.savemat("save.mat", {"result0": result0})

            #######################  save .txt file ############################
            # result0 = numpy.array(
            #     inputs[0].cpu().reshape(-1, 1)).astype(np.float32)
            # # result0 = result0.astype('float')
            # # for i in range(10):
            # #     print(result0[i])
            # # exit(0)
            # result0.astype('float32').tofile("input0.txt")
            # result1 = numpy.array(
            #     inputs[1].cpu().reshape(-1, 1)).astype(np.float32)
            # result1.astype('float32').tofile("input1.txt")
            # result0 = numpy.array(
            #     bboxes.cpu().reshape(-1, 1)).astype(np.float32)
            # result0.astype('float32').tofile("input2.txt")

            ##################################### save .npy file ###################
            # numpy.save("input0.npy", inputs[0].cpu().numpy())
            # numpy.save("input1.npy", inputs[1].cpu().numpy())
            # numpy.save("input2.npy", bboxes.cpu().numpy())
            # input0 = torch.from_numpy(np.load("input0.npy")).cuda()
            # input1 = torch.from_numpy(np.load("input1.npy")).cuda()
            # input2 = torch.from_numpy(np.load("input2.npy")).cuda()
            ##########################################################
            preds = self.model(inputs, bboxes)
            # preds = self.model([input0, input1], input2)

            # result_pred = numpy.array(preds.detach().cpu().reshape(-1, 1))
            # numpy.savetxt("result_preds.txt", result_pred)
            # Debug leftover: print the raw predictions and stop the process here.
            print(preds)
            exit(0)
            #*****************************   open with video test ##########################
            bboxes = bboxes.squeeze(0).squeeze(0)  # change[1,1,3,5] -->[3,5]
            #*****************************   open with video test end ##########################

        if self.cfg.NUM_GPUS:
            preds = preds.cpu()
            if bboxes is not None:
                bboxes = bboxes.detach().cpu()

        preds = preds.detach()
        task.add_action_preds(preds)
        if bboxes is not None:
            task.add_bboxes(bboxes[:, 1:])

        return task