Code example #1
    def get_input_clip(self, keyframe_idx):
        """
        Get input clip from the video/folder of images for a given
        keyframe index.
        Args:
            keyframe_idx (int): index of the current keyframe.
        Returns:
            clip (list of tensors): formatted input clip(s) corresponding to
                the current keyframe.
        """
        seq = get_sequence(
            keyframe_idx,
            self.seq_length // 2,
            self.cfg.DATA.SAMPLING_RATE,
            self.total_frames,
        )
        clip = []
        for frame_idx in seq:
            self.cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
            was_read, frame = self.cap.read()
            if was_read:
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frame = scale(self.cfg.DATA.TEST_CROP_SIZE, frame)
                clip.append(frame)
            else:
                logger.error(
                    "Unable to read frame. Duplicating previous frame.")
                # Guard against the first read failing, which would leave `clip` empty.
                if clip:
                    clip.append(clip[-1])

        clip = process_cv2_inputs(clip, self.cfg)
        return clip
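For context, `get_sequence` comes from the SlowFast utilities and returns the frame indices sampled around the keyframe. A minimal sketch of the behaviour this caller assumes (indices spaced by the sampling rate and clamped to the valid frame range; the actual implementation may differ):

def get_sequence(center_idx, half_len, sample_rate, num_frames):
    # Indices centered on the keyframe, `sample_rate` apart,
    # clamped to the valid range [0, num_frames - 1].
    seq = list(range(center_idx - half_len, center_idx + half_len, sample_rate))
    return [min(max(idx, 0), num_frames - 1) for idx in seq]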
Code example #2
    def read_frames(self):

        frames, org_frames, success, count_err = [], [], True, 0

        while True:
            success, frame = self.cap.read()
            if success:
                frame_processed = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frame_processed = scale(self.cfg.DATA.TEST_CROP_SIZE,
                                        frame_processed)
                frames.append(frame_processed)
                org_frames.append(frame)
            else:
                count_err += 1
                if count_err % 5 == 0:
                    self.reconnect_cam()
                continue

            if len(frames) % self.seq_len == 0:
                if self.images_queue.qsize() >= 2:
                    self.images_queue.get()

                self.images_queue.put([frames, org_frames])

                frames = []
                org_frames = []
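A hedged usage sketch for the producer loop above: a consumer thread that drains `images_queue` and runs inference on the most recent clip. The names `reader` and `run_inference` are illustrative and not part of the original code.

import queue
import threading

def consume(reader, run_inference):
    # Pull the most recent [frames, org_frames] pair pushed by read_frames().
    while True:
        try:
            frames, org_frames = reader.images_queue.get(timeout=1.0)
        except queue.Empty:
            continue
        run_inference(frames, org_frames)

# threading.Thread(target=consume, args=(reader, run_inference), daemon=True).start()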
Code example #3
File: demo_net.py  Project: lizituo12/SlowFast
    def predictor(self):
        # seq_len = self.cfg.DATA.NUM_FRAMES * self.cfg.DATA.SAMPLING_RATE

        seq_len = 32
        frames = []
        mid_frame = None  # middle frame of the clip; set once enough frames are buffered
        self.frame_provider.start()
        for able_to_read, frame in self.frame_provider:
            sleep_time = 1 / self.output_fps
            if not able_to_read:
                frames = []
                continue

            if len(frames) != seq_len:
                frame_processed = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frame_processed = scale(self.cfg.DATA.TEST_CROP_SIZE,
                                        frame_processed)
                frames.append(frame_processed)
                if self.cfg.DETECTION.ENABLE and len(
                        frames) == seq_len // 2 - 1:
                    mid_frame = frame
            re = '0.00'
            if len(frames) == seq_len:
                # self.start(mid_frame,frames,self.labels)
                # frames = []
                # pool=mp.Pool()
                # pool.apply_async(func=self.start,args=(mid_frame,frames,self.labels),callback=self.fra)
                # arg=[mid_frame,frames,self.labels]
                # result=threading.Thread(target=self.appy_async,args=(self.start,arg,self.fra))
                # result.start()
                self.obj_detect(mid_frame)
                result = self.slowfast_predict(frames, self.labels)
                re = result
                frames = []
            cv2.putText(frame,
                        'Speed: {:}s'.format(re), (10, 25),
                        fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                        fontScale=.65,
                        color=(0, 235, 0),
                        thickness=2)
            # Display the frame
            cv2.imshow('SlowFast', frame)
            # time.sleep(sleep_time)

            key = cv2.waitKey(1)
            if key == 27:
                break
        self.clean()
        self.frame_provider.clean()
Code example #4
def _data_augment_nobox(cn, augment_scale, augment_hflip, fl_u8_bgr):

    imgs = fl_u8_bgr
    height, width, _ = imgs[0].shape

    jmin, jmax = cn.DATA.TRAIN_JITTER_SCALES
    crop_size = cn.DATA.TRAIN_CROP_SIZE

    if augment_scale:
        imgs, _ = sf_cv2_transform.random_short_side_scale_jitter_list(
            imgs, min_size=jmin, max_size=jmax)
        imgs, _ = sf_cv2_transform.random_crop_list(imgs,
                                                    crop_size,
                                                    order="HWC")
    else:
        imgs = [sf_cv2_transform.scale(crop_size, img) for img in imgs]
        imgs, _ = sf_cv2_transform.spatial_shift_crop_list(crop_size, imgs, 1)

    if augment_hflip:
        imgs, _ = sf_cv2_transform.horizontal_flip_list(0.5, imgs, order="HWC")
    imgs = _data_imgs_postprocess(imgs, cn)
    return imgs
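`_data_imgs_postprocess` is not shown in this example. Based on the cv2 preprocessing used elsewhere in this listing (e.g. example #6), a hypothetical sketch of what such a step would do (HWC images in [0, 255] to a normalized (C, T, H, W) tensor) might look like the following; it reuses the `sf_cv2_transform` alias from the snippet above and is not the project's actual implementation.

import numpy as np
import torch

def _data_imgs_postprocess(imgs, cn):
    # Hypothetical sketch: HWC -> CHW, scale to [0, 1], normalize, stack along the time axis.
    imgs = [sf_cv2_transform.HWC2CHW(img) for img in imgs]
    imgs = [img / 255.0 for img in imgs]
    imgs = [
        sf_cv2_transform.color_normalization(
            img.astype(np.float32),
            np.array(cn.DATA.MEAN, dtype=np.float32),
            np.array(cn.DATA.STD, dtype=np.float32),
        )
        for img in imgs
    ]
    imgs = np.concatenate([np.expand_dims(img, axis=1) for img in imgs], axis=1)
    return torch.from_numpy(np.ascontiguousarray(imgs))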
Code example #5
    def preprocess_images(self, imgs):
        """
        Preprocesses & resize imgs to fit the detection model input format.
        This is a faster method to preprocess images than in the original project:
        Preprocessing before took approx 0.015 seconds, now 0,006 --> improvement 50-80 %
        :param imgs: (list[np.ndarray]) a list of original images of shape (H, W, C) (in BGR order) and [0,255].
        :return:
            inputs: a list[{dicts}] of the input images with the following keys:
                image: tensor for a single image in (C, Hnew, Wnew), (BGR or RGB depending on model) and [0,255]
                height: the desired output height (original resolution)
                width: the desired output width (original resolution)
        """
        # Convert images to RGB if required
        if self.input_format == "RGB":
            # whether the model expects BGR inputs or RGB
            imgs = [cv2.cvtColor(img, cv2.COLOR_BGR2RGB) for img in imgs]

        # Resize image: short side to previously calculated short side
        if self.rescale_image:
            imgs = [
                cv2_transform.scale(self.new_short_side, img) for img in imgs
            ]

        inputs = []
        for img in imgs:
            # Append inputs and (H, W, C) -> (C, H, W) and ndarray -> tensor
            inputs.append({
                "image":
                torch.as_tensor(img.astype("float32").transpose(2, 0, 1)),
                "height":
                self.original_img_height,
                "width":
                self.original_img_width
            })

        return inputs
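A hedged usage sketch for the method above, assuming the surrounding class wraps a detectron2 model in `self.model` (detectron2 models accept a list of dicts with `image`, `height`, and `width` keys). The `detector` instance and image paths are illustrative.

import cv2
import torch

imgs = [cv2.imread(p) for p in ["frame_000.jpg", "frame_001.jpg"]]  # BGR, (H, W, C)
inputs = detector.preprocess_images(imgs)
with torch.no_grad():
    outputs = detector.model(inputs)  # one output dict per input image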
Code example #6
    def _images_and_boxes_preprocessing_cv2(self, imgs, boxes):
        """
        This function performs preprocessing for the input images and
        corresponding boxes for one clip with opencv as backend.

        Args:
            imgs (list of ndarrays): the images, each in HWC, BGR format.
            boxes (ndarray): the boxes for the current clip, normalized to [0, 1].

        Returns:
            imgs (tensor): preprocessed images with shape (C, T, H, W).
            boxes (ndarray): preprocessed boxes in pixel coordinates.
        """

        height, width, _ = imgs[0].shape

        boxes[:, [0, 2]] *= width
        boxes[:, [1, 3]] *= height
        boxes = cv2_transform.clip_boxes_to_image(boxes, height, width)

        # The transforms in `cv2_transform.py` expect a list of np.arrays; for AVA
        # there is only one np.array of boxes per clip.
        boxes = [boxes]

        # The image now is in HWC, BGR format.
        if self._split == "train":  # "train"
            imgs, boxes = cv2_transform.random_short_side_scale_jitter_list(
                imgs,
                min_size=self._jitter_min_scale,
                max_size=self._jitter_max_scale,
                boxes=boxes,
            )
            imgs, boxes = cv2_transform.random_crop_list(imgs,
                                                         self._crop_size,
                                                         order="HWC",
                                                         boxes=boxes)

            # random flip
            imgs, boxes = cv2_transform.horizontal_flip_list(0.5,
                                                             imgs,
                                                             order="HWC",
                                                             boxes=boxes)
        elif self._split == "val":
            # Scale the short side to test_scale. Non-local and STRG use 256.
            imgs = [cv2_transform.scale(self._crop_size, img) for img in imgs]
            boxes = [
                cv2_transform.scale_boxes(self._crop_size, boxes[0], height,
                                          width)
            ]
            imgs, boxes = cv2_transform.spatial_shift_crop_list(
                self._crop_size, imgs, 1, boxes=boxes)

            if self._test_force_flip:
                imgs, boxes = cv2_transform.horizontal_flip_list(1,
                                                                 imgs,
                                                                 order="HWC",
                                                                 boxes=boxes)

        elif self._split == "test":
            # Scale the short side to test_scale. Non-local and STRG use 256.
            imgs = [cv2_transform.scale(self._crop_size, img) for img in imgs]
            boxes = [
                cv2_transform.scale_boxes(self._crop_size, boxes[0], height,
                                          width)
            ]

            if self._test_force_flip:
                imgs, boxes = cv2_transform.horizontal_flip_list(1,
                                                                 imgs,
                                                                 order="HWC",
                                                                 boxes=boxes)
        else:
            raise NotImplementedError("Unsupported split mode {}".format(
                self._split))

        # Convert image to CHW keeping BGR order.
        imgs = [cv2_transform.HWC2CHW(img) for img in imgs]

        # Image [0, 255] -> [0, 1].
        imgs = [img / 255.0 for img in imgs]

        imgs = [
            np.ascontiguousarray(
                # img.reshape((3, self._crop_size, self._crop_size))
                img.reshape((3, imgs[0].shape[1], imgs[0].shape[2])
                            )).astype(np.float32) for img in imgs
        ]

        # Do color augmentation (after divided by 255.0).
        if self._split == "train" and self._use_color_augmentation:
            if not self._pca_jitter_only:
                imgs = cv2_transform.color_jitter_list(
                    imgs,
                    img_brightness=0.4,
                    img_contrast=0.4,
                    img_saturation=0.4,
                )

            imgs = cv2_transform.lighting_list(
                imgs,
                alphastd=0.1,
                eigval=np.array(self._pca_eigval).astype(np.float32),
                eigvec=np.array(self._pca_eigvec).astype(np.float32),
            )

        # Normalize images by mean and std.
        imgs = [
            cv2_transform.color_normalization(
                img,
                np.array(self._data_mean, dtype=np.float32),
                np.array(self._data_std, dtype=np.float32),
            ) for img in imgs
        ]

        # Concat list of images to single ndarray.
        imgs = np.concatenate([np.expand_dims(img, axis=1) for img in imgs],
                              axis=1)

        if not self._use_bgr:
            # Convert image format from BGR to RGB.
            imgs = imgs[::-1, ...]

        imgs = np.ascontiguousarray(imgs)
        imgs = torch.from_numpy(imgs)
        boxes = cv2_transform.clip_boxes_to_image(
            # boxes[0], self._crop_size, self._crop_size.
            boxes[0],
            imgs[0].shape[1],
            imgs[0].shape[2],
        )
        return imgs, boxes
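Note that this method assumes the incoming boxes are normalized to [0, 1]; its first two lines convert them to pixel coordinates. A small numeric example of that step:

import numpy as np

boxes = np.array([[0.25, 0.10, 0.75, 0.90]])  # (x1, y1, x2, y2), normalized
height, width = 240, 320
boxes[:, [0, 2]] *= width   # x coordinates -> pixels
boxes[:, [1, 3]] *= height  # y coordinates -> pixels
# boxes is now [[80., 24., 240., 216.]]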
Code example #7
def demo(cfg):
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging()

    # Print config.
    logger.info("Run demo with config:")
    logger.info(cfg)
    # Build the video model and print model statistics.
    model = model_builder.build_model(cfg)
    model.eval()
    misc.log_model_info(model)

    # Load a checkpoint to test if applicable.
    if cfg.TEST.CHECKPOINT_FILE_PATH != "":
        ckpt = cfg.TEST.CHECKPOINT_FILE_PATH
    elif cu.has_checkpoint(cfg.OUTPUT_DIR):
        ckpt = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
    elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "":
        # If no checkpoint found in TEST.CHECKPOINT_FILE_PATH or in the current
        # checkpoint folder, try to load checkpoint from
        # TRAIN.CHECKPOINT_FILE_PATH and test it.
        ckpt = cfg.TRAIN.CHECKPOINT_FILE_PATH
    else:
        raise NotImplementedError("Unknown way to load checkpoint.")

    cu.load_checkpoint(
        ckpt,
        model,
        cfg.NUM_GPUS > 1,
        None,
        inflation=False,
        convert_from_caffe2="caffe2"
        in [cfg.TEST.CHECKPOINT_TYPE, cfg.TRAIN.CHECKPOINT_TYPE],
    )

    if cfg.DETECTION.ENABLE:
        # Load object detector from detectron2
        dtron2_cfg_file = cfg.DEMO.DETECTRON2_OBJECT_DETECTION_MODEL_CFG
        dtron2_cfg = get_cfg()
        dtron2_cfg.merge_from_file(model_zoo.get_config_file(dtron2_cfg_file))
        dtron2_cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = .5
        dtron2_cfg.MODEL.WEIGHTS = cfg.DEMO.DETECTRON2_OBJECT_DETECTION_MODEL_WEIGHTS
        object_predictor = DefaultPredictor(dtron2_cfg)
        # Load the labels of AVA dataset
        with open(cfg.DEMO.LABEL_FILE_PATH) as f:
            labels = f.read().split('\n')[:-1]
        palette = np.random.randint(64, 128, (len(labels), 3)).tolist()
        boxes = []
    else:
        # Load the labels of the Kinetics-400 dataset
        labels_df = pd.read_csv(cfg.DEMO.LABEL_FILE_PATH)
        labels = labels_df['name'].values

    frame_provider = VideoReader(cfg)
    seq_len = cfg.DATA.NUM_FRAMES * cfg.DATA.SAMPLING_RATE
    frames = []
    pred_labels = []
    s = 0.
    for able_to_read, frame in frame_provider:
        if not able_to_read:
            # When the end of the video is reached, clear the buffer and continue to the next one.
            frames = []
            continue

        if len(frames) != seq_len:
            frame_processed = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame_processed = scale(cfg.DATA.TEST_CROP_SIZE, frame_processed)
            frames.append(frame_processed)
            if cfg.DETECTION.ENABLE and len(frames) == seq_len // 2 - 1:
                mid_frame = frame

        if len(frames) == seq_len:
            start = time()
            if cfg.DETECTION.ENABLE:
                outputs = object_predictor(mid_frame)
                fields = outputs["instances"]._fields
                pred_classes = fields["pred_classes"]
                selection_mask = pred_classes == 0
                # acquire person boxes
                pred_classes = pred_classes[selection_mask]
                pred_boxes = fields["pred_boxes"].tensor[selection_mask]
                scores = fields["scores"][selection_mask]
                boxes = cv2_transform.scale_boxes(
                    cfg.DATA.TEST_CROP_SIZE, pred_boxes,
                    frame_provider.display_height,
                    frame_provider.display_width)
                boxes = torch.cat(
                    [torch.full((boxes.shape[0], 1), float(0)).cuda(), boxes],
                    axis=1)

            inputs = torch.as_tensor(frames).float()
            inputs = inputs / 255.0
            # Perform color normalization.
            inputs = inputs - torch.tensor(cfg.DATA.MEAN)
            inputs = inputs / torch.tensor(cfg.DATA.STD)
            # T H W C -> C T H W.
            inputs = inputs.permute(3, 0, 1, 2)

            # 1 C T H W.
            inputs = inputs.unsqueeze(0)

            # Sample frames for the fast pathway.
            index = torch.linspace(0, inputs.shape[2] - 1,
                                   cfg.DATA.NUM_FRAMES).long()
            fast_pathway = torch.index_select(inputs, 2, index)
            # logger.info('fast_pathway.shape={}'.format(fast_pathway.shape))

            # Sample frames for the slow pathway.
            index = torch.linspace(0, fast_pathway.shape[2] - 1,
                                   fast_pathway.shape[2] //
                                   cfg.SLOWFAST.ALPHA).long()
            slow_pathway = torch.index_select(fast_pathway, 2, index)
            # logger.info('slow_pathway.shape={}'.format(slow_pathway.shape))
            inputs = [slow_pathway, fast_pathway]
            """
            # Transfer the data to the current GPU device.
            if isinstance(inputs, (list,)):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)
            """
            # Perform the forward pass.
            if cfg.DETECTION.ENABLE:
                # When there is nothing in the scene,
                #   use a dummy variable to disable all computations below.
                if not len(boxes):
                    preds = torch.tensor([])
                else:
                    preds = model(inputs, boxes)
            else:
                preds = model(inputs)

            # Gather all the predictions across all the devices to perform ensemble.
            if cfg.NUM_GPUS > 1:
                preds = du.all_gather(preds)[0]

            if cfg.DETECTION.ENABLE:
                # This post-processing is intentionally done on the CPU, since the author's
                #   laptop GPU (an RTX 2080) runs out of memory; if your GPU has more memory,
                #   consider changing this section so that CUDA does the processing.
                preds = preds.cpu().detach().numpy()
                pred_masks = preds > .1
                label_ids = [
                    np.nonzero(pred_mask)[0] for pred_mask in pred_masks
                ]
                pred_labels = [[
                    labels[label_id] for label_id in perbox_label_ids
                ] for perbox_label_ids in label_ids]
                # It is unclear how detectron2 rescales boxes to the original image size, so the
                #   SlowFast input boxes are rescaled back instead; this is safer and still works
                #   even if the boxes were not rescaled by cv2_transform.rescale_boxes.
                boxes = boxes.cpu().detach().numpy()
                ratio = np.min([
                    frame_provider.display_height, frame_provider.display_width
                ]) / cfg.DATA.TEST_CROP_SIZE
                boxes = boxes[:, 1:] * ratio
            else:
                ## Option 1: single label inference selected from the highest probability entry.
                # label_id = preds.argmax(-1).cpu()
                # pred_label = labels[label_id]
                # Option 2: multi-label inferencing selected from probability entries > threshold
                label_ids = torch.nonzero(
                    preds.squeeze() > .1).reshape(-1).cpu().detach().numpy()
                pred_labels = labels[label_ids]
                logger.info(pred_labels)
                if not list(pred_labels):
                    pred_labels = ['Unknown']

            # # option 1: remove the oldest frame in the buffer to make place for the new one.
            # frames.pop(0)
            # option 2: empty the buffer
            frames = []
            s = time() - start

        if cfg.DETECTION.ENABLE and pred_labels and boxes.any():
            for box, box_labels in zip(boxes.astype(int), pred_labels):
                cv2.rectangle(frame,
                              tuple(box[:2]),
                              tuple(box[2:]), (0, 255, 0),
                              thickness=2)
                label_origin = box[:2]
                for label in box_labels:
                    label_origin[-1] -= 5
                    (label_width, label_height), _ = cv2.getTextSize(
                        label, cv2.FONT_HERSHEY_SIMPLEX, .5, 2)
                    cv2.rectangle(frame,
                                  (label_origin[0], label_origin[1] + 5),
                                  (label_origin[0] + label_width,
                                   label_origin[1] - label_height - 5),
                                  palette[labels.index(label)], -1)
                    cv2.putText(frame, label, tuple(label_origin),
                                cv2.FONT_HERSHEY_SIMPLEX, .5, (255, 255, 255),
                                1)
                    label_origin[-1] -= label_height + 5
        if not cfg.DETECTION.ENABLE:
            # Display predicted labels to frame.
            y_offset = 50
            cv2.putText(frame,
                        'Action:', (10, y_offset),
                        fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                        fontScale=.65,
                        color=(0, 235, 0),
                        thickness=2)
            for pred_label in pred_labels:
                y_offset += 30
                cv2.putText(frame,
                            '{}'.format(pred_label), (20, y_offset),
                            fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                            fontScale=.65,
                            color=(0, 235, 0),
                            thickness=2)

        # Display prediction speed
        cv2.putText(frame,
                    'Speed: {:.2f}s'.format(s), (10, 25),
                    fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                    fontScale=.65,
                    color=(0, 235, 0),
                    thickness=2)
        # Display the frame
        cv2.imshow('SlowFast', frame)
        # hit Esc to quit the demo.
        key = cv2.waitKey(1)
        if key == 27:
            break

    frame_provider.clean()
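The slow/fast pathway construction in the loop above is the core of the SlowFast input format. A minimal, self-contained sketch of the same index arithmetic, with assumed example values (NUM_FRAMES=32, ALPHA=4, a 64-frame buffer):

import torch

num_frames, alpha = 32, 4                 # example DATA.NUM_FRAMES and SLOWFAST.ALPHA
clip = torch.rand(1, 3, 64, 224, 224)     # (1, C, T, H, W) buffer of 64 frames

fast_idx = torch.linspace(0, clip.shape[2] - 1, num_frames).long()
fast_pathway = torch.index_select(clip, 2, fast_idx)              # (1, 3, 32, 224, 224)

slow_idx = torch.linspace(0, fast_pathway.shape[2] - 1,
                          fast_pathway.shape[2] // alpha).long()
slow_pathway = torch.index_select(fast_pathway, 2, slow_idx)      # (1, 3, 8, 224, 224)

inputs = [slow_pathway, fast_pathway]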
Code example #8
File: predictor.py  Project: AK391/X3D
    def __call__(self, task):
        """
        Returns the prediction results for the current task.
        Args:
            task (TaskInfo object): task object that contains
                the necessary information for action prediction (e.g. frames, boxes).
        Returns:
            task (TaskInfo object): the same task info object but filled with
                prediction values (a tensor) and the corresponding boxes for
                action detection task.
        """
        if self.cfg.DETECTION.ENABLE:
            task = self.object_detector(task)

        frames, bboxes = task.frames, task.bboxes
        if bboxes is not None:
            bboxes = cv2_transform.scale_boxes(
                self.cfg.DATA.TEST_CROP_SIZE,
                bboxes,
                task.img_height,
                task.img_width,
            )
        if self.cfg.DEMO.INPUT_FORMAT == "BGR":
            frames = [
                cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) for frame in frames
            ]

        frames = [
            cv2_transform.scale(self.cfg.DATA.TEST_CROP_SIZE, frame)
            for frame in frames
        ]
        inputs = process_cv2_inputs(frames, self.cfg)
        if bboxes is not None:
            index_pad = torch.full(
                size=(bboxes.shape[0], 1),
                fill_value=float(0),
                device=bboxes.device,
            )

            # Pad frame index for each box.
            bboxes = torch.cat([index_pad, bboxes], axis=1)
        if self.cfg.NUM_GPUS > 0:
            # Transfer the data to the current GPU device.
            if isinstance(inputs, (list, )):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(device=torch.device(
                        self.gpu_id),
                                               non_blocking=True)
            else:
                inputs = inputs.cuda(device=torch.device(self.gpu_id),
                                     non_blocking=True)
        if self.cfg.DETECTION.ENABLE and not bboxes.shape[0]:
            preds = torch.tensor([])
        else:
            preds = self.model(inputs, bboxes)

        if self.cfg.NUM_GPUS:
            preds = preds.cpu()
            if bboxes is not None:
                bboxes = bboxes.detach().cpu()

        preds = preds.detach()
        task.add_action_preds(preds)
        if bboxes is not None:
            task.add_bboxes(bboxes[:, 1:])

        return task
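For reference, `cv2_transform.scale_boxes(size, boxes, height, width)` is assumed here to rescale the boxes by the same factor used to bring the short image side to `size`. A small hedged example of that assumption:

import torch

boxes = torch.tensor([[360.0, 180.0, 720.0, 540.0]])  # (x1, y1, x2, y2) in a 720x1280 frame
factor = 256 / 720                                     # short side 720 scaled to TEST_CROP_SIZE=256
scaled = boxes * factor                                # what scale_boxes is assumed to compute here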
Code example #9
def demo(cfg, backbone):
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Build the video model and print model statistics.
    model = model_builder.build_model(cfg)
    model.eval()
    misc.log_model_info(model)

    # Load a checkpoint to test if applicable.
    if cfg.TEST.CHECKPOINT_FILE_PATH != "":
        ckpt = cfg.TEST.CHECKPOINT_FILE_PATH
    elif cu.has_checkpoint(cfg.OUTPUT_DIR):
        ckpt = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
    elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "":
        ckpt = cfg.TRAIN.CHECKPOINT_FILE_PATH
    else:
        raise NotImplementedError("Unknown way to load checkpoint.")

    cu.load_checkpoint(
        ckpt,
        model,
        cfg.NUM_GPUS > 1,
        None,
        inflation=False,
        convert_from_caffe2="caffe2" in [cfg.TEST.CHECKPOINT_TYPE, cfg.TRAIN.CHECKPOINT_TYPE],
    )
    
    darknetlib_path = '/home/ubuntu/hanhbd/SlowFast/detector/libdarknet.so'
    config_path = '/home/ubuntu/hanhbd/SlowFast/detector/yolov4.cfg'
    meta_path = '/home/ubuntu/hanhbd/SlowFast/detector/coco.data'
    classes_path = '/home/ubuntu/hanhbd/SlowFast/detector/coco.names'
    weight_path = '/home/ubuntu/hanhbd/SlowFast/detector/yolov4.weights'
    
    if backbone == 'yolo':
        object_predictor =  YOLO.get_instance(darknetlib_path, config_path, meta_path, classes_path, weight_path)
    else:
        dtron2_cfg_file = cfg.DEMO.DETECTRON2_OBJECT_DETECTION_MODEL_CFG
        dtron2_cfg = get_cfg()
        dtron2_cfg.merge_from_file(model_zoo.get_config_file(dtron2_cfg_file))
        dtron2_cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = .5
        dtron2_cfg.MODEL.WEIGHTS = cfg.DEMO.DETECTRON2_OBJECT_DETECTION_MODEL_WEIGHTS
        object_predictor = DefaultPredictor(dtron2_cfg)

    with open(cfg.DEMO.LABEL_FILE_PATH) as f:
        labels = f.read().split('\n')[:-1]
    palette = np.random.randint(64, 128, (len(labels), 3)).tolist()
    count_xxx = 0
    seq_len = cfg.DATA.NUM_FRAMES*cfg.DATA.SAMPLING_RATE

    frames = []
    org_frames = []
    mid_frame = None
    pred_labels = []
    draw_imgs = []

    cap  = cv2.VideoCapture(cfg.DEMO.DATA_SOURCE)
    was_read, frame = cap.read()
    display_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    display_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(cap.get(cv2.CAP_PROP_FPS))

    fourcc = cv2.VideoWriter_fourcc(*'DIVX')
    videowriter = cv2.VideoWriter('./result/testset_fighting_05.avi',fourcc, fps, (display_width,display_height))

    while was_read:
        was_read, frame = cap.read()
        if not was_read:
            videowriter.release()
            break

        if len(frames) != seq_len:
            frame_processed = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame_processed = scale(cfg.DATA.TEST_CROP_SIZE, frame_processed)
            frames.append(frame_processed)
            org_frames.append(frame)

        else:
            # predict all person boxes in every frame
            start = time()
            mid_frame = org_frames[seq_len//2 - 2]
            # only draw half of the frames because a sliding window of 1/2 the sequence length is used
            if cfg.DETECTION.ENABLE and len(draw_imgs) == 0:
                for idx in range(seq_len//2 - 1):
                    image = org_frames[idx]
                    boxes = detector(object_predictor , image, backbone, cfg , display_height, display_width )
                    # boxes = object_predictor.detect_image(img)
                    # boxes = torch.as_tensor(boxes).float().cuda()

                    boxes = torch.cat([torch.full((boxes.shape[0], 1), float(0)).cuda(), boxes], axis=1)
                    boxes = boxes.cpu().detach().numpy()
                    if backbone == 'yolo':
                        boxes = boxes[:, 1:]
                    else:
                        ratio = np.min(
                            [display_height, display_width]
                        ) / cfg.DATA.TEST_CROP_SIZE
                        boxes = boxes[:, 1:] * ratio

                    for box in boxes:
                        xmin, ymin, xmax, ymax = box
                        cv2.rectangle(image, (xmin, ymin), (xmax , ymax), (0, 255, 0), thickness=2)

                    draw_imgs.append(image)

            # detect box in mid frame
            if cfg.DETECTION.ENABLE:
                boxes = detector(object_predictor , mid_frame, backbone, cfg , display_height, display_width )
                boxes = torch.cat([torch.full((boxes.shape[0], 1), float(0)).cuda(), boxes], axis=1)

            inputs = torch.from_numpy(np.array(frames)).float()
            inputs = inputs / 255.0
            # Perform color normalization.
            inputs = inputs - torch.tensor(cfg.DATA.MEAN)
            inputs = inputs / torch.tensor(cfg.DATA.STD)
            # T H W C -> C T H W.
            inputs = inputs.permute(3, 0, 1, 2)

            # 1 C T H W.
            inputs = inputs.unsqueeze(0)

            # Sample frames for the fast pathway.
            index = torch.linspace(0, inputs.shape[2] - 1, cfg.DATA.NUM_FRAMES).long()
            fast_pathway = torch.index_select(inputs, 2, index)
            

            # Sample frames for the slow pathway.
            index = torch.linspace(0, fast_pathway.shape[2] - 1, 
                                    fast_pathway.shape[2]//cfg.SLOWFAST.ALPHA).long()
            slow_pathway = torch.index_select(fast_pathway, 2, index)
            inputs = [slow_pathway, fast_pathway]

            # Transfer the data to the current GPU device.
            if isinstance(inputs, (list,)):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)

            # use a dummy variable to disable all computations below.
            if not len(boxes):
                preds = torch.tensor([])
            else:
                preds = model(inputs, boxes)

            # Gather all the predictions across all the devices to perform ensemble.
            if cfg.NUM_GPUS > 1:
                preds = du.all_gather(preds)[0]
                
            # post processing
            preds = preds.cpu().detach().numpy()
            pred_masks = preds > .1
            label_ids = [np.nonzero(pred_mask)[0] for pred_mask in pred_masks]
            pred_labels = [
                [labels[label_id] for label_id in perbox_label_ids]
                for perbox_label_ids in label_ids
            ]
            print(pred_labels)
            boxes = boxes.cpu().detach().numpy()
            if backbone == 'yolo':
                boxes = boxes[:, 1:]
            else:
                ratio = np.min(
                    [display_height, display_width]
                ) / cfg.DATA.TEST_CROP_SIZE
                boxes = boxes[:, 1:] * ratio


            # draw result on mid frame
            if pred_labels and boxes.any():
                for box, box_labels in zip(boxes.astype(int), pred_labels):
                    xmin, ymin, xmax, ymax = box
                    cv2.rectangle(mid_frame, (xmin, ymin), (xmax , ymax), (0, 255, 0), thickness=2)
                
                    label_origin = box[:2]
                    for label in box_labels:
                        label_origin[-1] -= 5
                        (label_width, label_height), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, .5, 2)
                        cv2.rectangle(
                            mid_frame, 
                            (label_origin[0], label_origin[1] + 5), 
                            (label_origin[0] + label_width, label_origin[1] - label_height - 5),
                            palette[labels.index(label)], -1
                        )
                        cv2.putText(
                            mid_frame, label, tuple(label_origin), 
                            cv2.FONT_HERSHEY_SIMPLEX, .5, (255, 255, 255), 1
                        )
                        label_origin[-1] -= label_height + 5

            # append mid frame to the draw array
            draw_imgs.append(mid_frame)

            # write image to videos
            for img_ in draw_imgs:
                videowriter.write(img_)
            print("time process", (time() - start) /64 )
            # slide the frames and org_frames buffers forward (the commented-out variant uses a stride of 1/2 seq_len)
            # frames = frames[seq_len//2 - 1:]
            # org_frames = org_frames[seq_len//2 - 1:]

            frames = frames[1:]
            org_frames = org_frames[1:]
            draw_imgs = draw_imgs[-1:]

            count_xxx += 1
Code example #10
def demo(cfg):
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging()

    # Print config.
    logger.info("Run demo with config:")
    logger.info(cfg)
    # Build the video model and print model statistics.
    model = model_builder.build_model(cfg)
    model.eval()
    misc.log_model_info(model)

    # Load a checkpoint to test if applicable.
    if cfg.TEST.CHECKPOINT_FILE_PATH != "":
        ckpt = cfg.TEST.CHECKPOINT_FILE_PATH
    elif cu.has_checkpoint(cfg.OUTPUT_DIR):
        ckpt = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
    elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "":
        # If no checkpoint found in TEST.CHECKPOINT_FILE_PATH or in the current
        # checkpoint folder, try to load checkpoint from
        # TRAIN.CHECKPOINT_FILE_PATH and test it.
        ckpt = cfg.TRAIN.CHECKPOINT_FILE_PATH
    else:
        raise NotImplementedError("Unknown way to load checkpoint.")

    cu.load_checkpoint(
        ckpt,
        model,
        cfg.NUM_GPUS > 1,
        None,
        inflation=False,
        convert_from_caffe2="caffe2"
        in [cfg.TEST.CHECKPOINT_TYPE, cfg.TRAIN.CHECKPOINT_TYPE],
    )

    # Load the labels of the Kinetics-400 dataset
    labels_df = pd.read_csv(cfg.DEMO.LABEL_FILE_PATH)
    labels = labels_df['name'].values
    img_provider = VideoReader(cfg)
    frames = []
    # # Option 1
    # pred_label = ''
    # Option 2
    pred_labels = []
    s = 0.
    for able_to_read, frame in img_provider:
        if not able_to_read:
            # When the end of the video is reached, clear the buffer and continue to the next one.
            frames = []
            continue

        if len(frames) != cfg.DATA.NUM_FRAMES * cfg.DATA.SAMPLING_RATE:
            frame_processed = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame_processed = scale(256, frame_processed)
            frames.append(frame_processed)

        if len(frames) == cfg.DATA.NUM_FRAMES * cfg.DATA.SAMPLING_RATE:
            start = time()
            # Perform color normalization.
            inputs = torch.tensor(frames).float()
            inputs = inputs / 255.0
            inputs = inputs - torch.tensor(cfg.DATA.MEAN)
            inputs = inputs / torch.tensor(cfg.DATA.STD)
            # T H W C -> C T H W.
            inputs = inputs.permute(3, 0, 1, 2)
            # 1 C T H W.
            inputs = inputs[None, :, :, :, :]
            # Sample frames for the fast pathway.
            index = torch.linspace(0, inputs.shape[2] - 1,
                                   cfg.DATA.NUM_FRAMES).long()
            fast_pathway = torch.index_select(inputs, 2, index)
            logger.info('fast_pathway.shape={}'.format(fast_pathway.shape))
            # Sample frames for the slow pathway.
            index = torch.linspace(0, fast_pathway.shape[2] - 1,
                                   fast_pathway.shape[2] //
                                   cfg.SLOWFAST.ALPHA).long()
            slow_pathway = torch.index_select(fast_pathway, 2, index)
            logger.info('slow_pathway.shape={}'.format(slow_pathway.shape))
            inputs = [slow_pathway, fast_pathway]
            # Transfer the data to the current GPU device.
            if isinstance(inputs, (list, )):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(non_blocking=True)
            else:
                inputs = inputs.cuda(non_blocking=True)

            # Perform the forward pass.
            preds = model(inputs)
            # Gather all the predictions across all the devices to perform ensemble.
            if cfg.NUM_GPUS > 1:
                preds = du.all_gather(preds)[0]

            ## Option 1: single label inference selected from the highest probability entry.
            # label_id = preds.argmax(-1).cpu()
            # pred_label = labels[label_id]
            # Option 2: multi-label inferencing selected from probability entries > threshold
            label_ids = torch.nonzero(
                preds.squeeze() > .1).reshape(-1).cpu().detach().numpy()
            pred_labels = labels[label_ids]
            logger.info(pred_labels)
            if not list(pred_labels):
                pred_labels = ['Unknown']

            # remove the oldest frame in the buffer to make place for the new one.
            # frames.pop(0)
            frames = []
            s = time() - start

        # #************************************************************
        # # Option 1
        # #************************************************************
        # # Display prediction speed to frame
        # cv2.putText(frame, 'Speed: {:.2f}s'.format(s), (20, 30),
        #             fontFace=cv2.FONT_HERSHEY_SIMPLEX,
        #             fontScale=1, color=(0, 235, 0), thickness=3)
        # # Display predicted label to frame.
        # cv2.putText(frame, 'Action: {}'.format(pred_label), (20, 60),
        #             fontFace=cv2.FONT_HERSHEY_SIMPLEX,
        #             fontScale=1, color=(0, 235, 0), thickness=3)
        #************************************************************
        # Option 2
        #************************************************************
        # Display prediction speed to frame
        cv2.putText(frame,
                    'Speed: {:.2f}s'.format(s), (20, 30),
                    fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                    fontScale=1,
                    color=(0, 235, 0),
                    thickness=3)
        # Display predicted labels to frame.
        y_offset = 60
        cv2.putText(frame,
                    'Action:', (20, y_offset),
                    fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                    fontScale=1,
                    color=(0, 235, 0),
                    thickness=3)
        for pred_label in pred_labels:
            y_offset += 30
            cv2.putText(frame,
                        '{}'.format(pred_label), (20, y_offset),
                        fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                        fontScale=1,
                        color=(0, 235, 0),
                        thickness=3)

        # Display the frame
        cv2.imshow('SlowFast', frame)
        # hit Esc to quit the demo.
        key = cv2.waitKey(1)
        if key == 27:
            break

    img_provider.clean()
Code example #11
    def images_and_boxes_preprocessing_cv2(self, imgs, boxes):
        """
        This function performs preprocessing for the input images and
        corresponding boxes for one clip with opencv as backend.

        Args:
            imgs (list of ndarrays with len num_frames): the images. Each image
                                    is a ndarray with shape (H, W, C)
            boxes (ndarray): the boxes for the current clip - not normalized. shape (num_boxes, 4 = x1, y1, x2, y2)

        Returns:
            imgs (tensor): list of preprocessed images. shape: (C, num_frames, H, W)
            boxes (ndarray): preprocessed boxes. shape (num_boxes, 4 = x1, y1, x2, y2)
        """

        # Ensure that the boxes lie within the image bounds
        boxes = cv2_transform.clip_boxes_to_image(boxes, self.img_height,
                                                  self.img_width)

        # The transforms in `cv2_transform.py` expect a list of np.arrays; for an AVA-like
        # structure there is only one np.array of boxes per clip.
        boxes = [boxes]

        # The image now is in HWC, BGR format.
        # Scale the short side to test_scale. Non-local and STRG use 256.
        imgs = [cv2_transform.scale(self.crop_size, img) for img in imgs]
        # Boxes have to be adjusted to new image scale
        boxes = [
            cv2_transform.scale_boxes(self.crop_size, boxes[0],
                                      self.img_height, self.img_width)
        ]

        # Convert image to CHW keeping BGR order.
        imgs = [cv2_transform.HWC2CHW(img) for img in imgs]

        # Image [0, 255] -> [0, 1].
        imgs = [img / 255.0 for img in imgs]

        imgs = [
            np.ascontiguousarray(
                img.reshape((3, imgs[0].shape[1],
                             imgs[0].shape[2]))).astype(np.float32)
            for img in imgs
        ]

        # Normalize images by mean and std.
        imgs = [
            cv2_transform.color_normalization(
                img,
                np.array(self.data_mean, dtype=np.float32),
                np.array(self.data_std, dtype=np.float32),
            ) for img in imgs
        ]

        # Concat list of images to single ndarray.
        imgs = np.concatenate([np.expand_dims(img, axis=1) for img in imgs],
                              axis=1)

        if not self.use_bgr:
            # Convert image format from BGR to RGB.
            # Note that Kinetics pre-training uses RGB!
            imgs = imgs[::-1, ...]

        imgs = np.ascontiguousarray(imgs)
        imgs = torch.from_numpy(imgs)
        boxes = cv2_transform.clip_boxes_to_image(boxes[0], imgs[0].shape[1],
                                                  imgs[0].shape[2])

        # If you want to see what the images look like, you can activate this:
        # export_image(cfg, imgs.permute(1, 0, 2, 3).data.numpy(), [boxes], "demo", "CHW", True, use_bgr)

        return imgs, boxes
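A hedged usage sketch for the method above. `dataset` stands for an instance of the surrounding class; the frames are HWC/BGR arrays and the boxes are already in pixel coordinates, matching the docstring.

import numpy as np

frames = [np.zeros((240, 320, 3), dtype=np.uint8) for _ in range(8)]  # 8 HWC BGR frames
boxes = np.array([[40.0, 30.0, 200.0, 220.0]])                        # (x1, y1, x2, y2) in pixels
imgs, boxes = dataset.images_and_boxes_preprocessing_cv2(frames, boxes)
# imgs: float tensor of shape (3, 8, H', W'); boxes rescaled to the resized resolution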
Code example #12
    def __call__(self, task):
        """
        Returns the prediction results for the current task.
        Args:
            task (TaskInfo object): task object that contains
                the necessary information for action prediction (e.g. frames, boxes).
        Returns:
            task (TaskInfo object): the same task info object but filled with
                prediction values (a tensor) and the corresponding boxes for
                action detection task.
        """
        # ------ 1. First stage: run object detection ------
        if self.cfg.DETECTION.ENABLE:
            task = self.object_detector(task)

        # ------ 2. Second stage: run action recognition ------
        frames, bboxes = task.frames, task.bboxes

        ################################################################################################################
        from slowfast.datasets.utils import pack_pathway_output, tensor_normalize
        from torchvision import transforms
        from PIL import Image
        if self.cfg.DEMO.INPUT_FORMAT == "BGR":
            frames = [
                cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) for frame in frames
            ]

        inputs1 = []
        inputs0 = []
        cv2_transform.lineSpace(0, 63, 32, frames, inputs1)
        cv2_transform.lineSpace(0, 31, 8, inputs1, inputs0)

        inputs0 = [
            cv2_transform.scale(self.cfg.DATA.TEST_CROP_SIZE, frame)
            for frame in inputs0
        ]
        inputs1 = [
            cv2_transform.scale(self.cfg.DATA.TEST_CROP_SIZE, frame)
            for frame in inputs1
        ]

        inputs0 = torch.from_numpy(np.array(inputs0)).float() / 255
        inputs1 = torch.from_numpy(np.array(inputs1)).float() / 255
        inputs0 = tensor_normalize(inputs0, self.cfg.DATA.MEAN,
                                   self.cfg.DATA.STD)
        inputs1 = tensor_normalize(inputs1, self.cfg.DATA.MEAN,
                                   self.cfg.DATA.STD)
        # T H W C -> C T H W.
        inputs0 = inputs0.permute(3, 0, 1, 2)
        inputs1 = inputs1.permute(3, 0, 1, 2)
        inputs0 = inputs0.unsqueeze(0)
        inputs1 = inputs1.unsqueeze(0)
        inputs = [inputs0, inputs1]
        ###############################################################################################################

        if bboxes is not None:
            bboxes = cv2_transform.scale_boxes(
                self.cfg.DATA.TEST_CROP_SIZE,
                bboxes,
                task.img_height,
                task.img_width,
            )
        # if self.cfg.DEMO.INPUT_FORMAT == "BGR":
        #     frames = [
        #         cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) for frame in frames
        #     ]

        # frames = [
        #     cv2_transform.scale(self.cfg.DATA.TEST_CROP_SIZE, frame)
        #     for frame in frames
        # ]

        # change frames to slowfast inputs
        # inputs = process_cv2_inputs(frames, self.cfg)
        # add person cls to bbox
        if bboxes is not None:
            index_pad = torch.full(
                size=(bboxes.shape[0], 1),
                fill_value=float(0),
                device=bboxes.device,
            )

            # Pad frame index for each box.
            bboxes = torch.cat([index_pad, bboxes], axis=1)
        if self.cfg.NUM_GPUS > 0:
            # Transfer the data to the current GPU device.
            if isinstance(inputs, (list, )):
                for i in range(len(inputs)):
                    inputs[i] = inputs[i].cuda(device=torch.device(
                        self.gpu_id),
                                               non_blocking=True)
            else:
                inputs = inputs.cuda(device=torch.device(self.gpu_id),
                                     non_blocking=True)
        if self.cfg.DETECTION.ENABLE and not bboxes.shape[0]:
            preds = torch.tensor([])
        else:
            # change (1, 3, 8, 224, 224) -> (8, 3, 224, 224)
            bboxes = bboxes.unsqueeze(0).unsqueeze(0)
            inputs[0] = inputs[0].squeeze(0).permute(1, 0, 2, 3)
            inputs[1] = inputs[1].squeeze(0).permute(1, 0, 2, 3)
            ##########################################################
            import numpy
            numpy.set_printoptions(suppress=True)

            # import scipy.io as io
            # inputs0 = inputs[0].squeeze(0).permute(
            #     1, 0, 2, 3)[0].permute(1, 2, 0).data.cpu().numpy()
            # cv2.imwrite("1.jpg", np.array(
            #     inputs0*255, dtype=np.float32))  # dtype=np.uint8
            # print(inputs0)
            # numpy.save("input0.npy", inputs0)
            # result0 = numpy.array(inputs0.reshape(-1, 1))
            # numpy.savetxt("result0.txt", result0)
            # io.savemat("save.mat", {"result0": result0})

            #######################  save .txt file ############################
            # result0 = numpy.array(
            #     inputs[0].cpu().reshape(-1, 1)).astype(np.float32)
            # # result0 = result0.astype('float')
            # # for i in range(10):
            # #     print(result0[i])
            # # exit(0)
            # result0.astype('float32').tofile("input0.txt")
            # result1 = numpy.array(
            #     inputs[1].cpu().reshape(-1, 1)).astype(np.float32)
            # result1.astype('float32').tofile("input1.txt")
            # result0 = numpy.array(
            #     bboxes.cpu().reshape(-1, 1)).astype(np.float32)
            # result0.astype('float32').tofile("input2.txt")

            ##################################### save .npy file ###################
            # numpy.save("input0.npy", inputs[0].cpu().numpy())
            # numpy.save("input1.npy", inputs[1].cpu().numpy())
            # numpy.save("input2.npy", bboxes.cpu().numpy())
            # input0 = torch.from_numpy(np.load("input0.npy")).cuda()
            # input1 = torch.from_numpy(np.load("input1.npy")).cuda()
            # input2 = torch.from_numpy(np.load("input2.npy")).cuda()
            ##########################################################
            preds = self.model(inputs, bboxes)
            # preds = self.model([input0, input1], input2)

            # result_pred = numpy.array(preds.detach().cpu().reshape(-1, 1))
            # numpy.savetxt("result_preds.txt", result_pred)
            print(preds)
            exit(0)
            # ----- enable when testing with video -----
            bboxes = bboxes.squeeze(0).squeeze(0)  # change [1, 1, 3, 5] -> [3, 5]
            # ----- end: enable when testing with video -----

        if self.cfg.NUM_GPUS:
            preds = preds.cpu()
            if bboxes is not None:
                bboxes = bboxes.detach().cpu()

        preds = preds.detach()
        task.add_action_preds(preds)
        if bboxes is not None:
            task.add_bboxes(bboxes[:, 1:])

        return task
Code example #13
def get_processed_frame(cfg, frame):
    frame_processed = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    frame_processed = scale(cfg.DATA.TEST_CROP_SIZE, frame_processed)

    return frame_processed
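A hedged usage sketch for this helper; the video path is illustrative, and `cfg` is assumed to be a SlowFast config with `DATA.TEST_CROP_SIZE` set.

import cv2

cap = cv2.VideoCapture("input.mp4")
was_read, frame = cap.read()
if was_read:
    frame_processed = get_processed_frame(cfg, frame)  # RGB, short side scaled to TEST_CROP_SIZE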