Example #1
    def detect(self, frame, frame_number, save_img=True):

        print(
            f'---------------- Started Detection of Frame-{frame_number} ----------------'
        )

        # Run inference
        t0 = time.time()
        img = torch.zeros((1, 3, self.imgsz, self.imgsz),
                          device=self.device)  # init img
        if self.device.type != 'cpu':
            _ = self.model(img.half() if self.half else img)  # warm-up run

        # Reshape input frame
        img, im0 = convert_image(frame, self.imgsz)

        img = torch.from_numpy(img).to(self.device)
        img = img.half() if self.half else img.float()  # uint8 to fp16/32
        img /= 255.0  # 0 - 255 to 0.0 - 1.0
        if img.ndimension() == 3:
            img = img.unsqueeze(0)

        # Inference
        t1 = time_synchronized()
        pred = self.model(img, augment=self.opt.augment)[0]
        print(f'Model inference ran in "{time.time() - t1:.3f}" seconds.')

        # Apply NMS
        t2 = time_synchronized()
        pred = non_max_suppression(pred,
                                   self.opt.conf_thres,
                                   self.opt.iou_thres,
                                   classes=self.opt.classes,
                                   agnostic=self.opt.agnostic_nms)
        print(f'NMS ended in "{time.time() - t2:.3f}" seconds.')
        bboxes = []
        colours = []
        # Process detections
        for i, det in enumerate(pred):  # detections per image

            if len(det):
                # Rescale boxes from img_size to im0 size
                det[:, :4] = scale_coords(img.shape[2:], det[:, :4],
                                          im0.shape).round()
                for xyxy in det:  # each row: x1, y1, x2, y2, conf, cls
                    colours.append(
                        (random.randint(64, 255), random.randint(64, 255),
                         random.randint(64, 255)))
                    bboxes.append(xyxy2xywh2(xyxy))

        if self.save_txt or save_img:
            s = f"\n{len(list(Path(self.save_dir).glob('labels/*.txt')))} labels saved to {self.save_dir}" if self.save_txt else ''
            print(f"Results saved to {self.save_dir}{s}")

        print(
            f'Detection pipeline ended in "{time.time() - t0:.3f}" seconds.\n----------------------'
        )
        return bboxes, colours
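
A minimal usage sketch for the detect() method in Example #1. Only detect(frame, frame_number, save_img) and its (bboxes, colours) return value come from the example; the video path is a placeholder, 'detector' stands for an instance of the (unshown) class that owns detect(), and the bbox layout depends on the undisclosed xyxy2xywh2() helper.

import cv2

# 'detector' is assumed to be an instance of the class that defines detect() above
cap = cv2.VideoCapture('input_video.mp4')  # example path
frame_number = 0
while cap.isOpened():
    success, frame = cap.read()
    if not success:
        break
    frame_number += 1
    bboxes, colours = detector.detect(frame, frame_number, save_img=False)
    # bbox layout assumed to be (x, y, w, h); it depends on xyxy2xywh2()
    for (x, y, w, h), colour in zip(bboxes, colours):
        cv2.rectangle(frame, (int(x), int(y)), (int(x + w), int(y + h)), colour, 2)
    cv2.imshow('detections', frame)
    if cv2.waitKey(1) & 0xFF == 27:  # Esc to quit
        break
cap.release()
cv2.destroyAllWindows()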
Example #2
def detect_and_track4(opt):
    # If you find any errors when loading YOLOv5, try uncommenting the line below and running again.
    # sys.path.insert(0, '../detection/yolov5/weights')

    source, weights, view_img, save_txt, imgsz = opt.source, opt.weights, opt.view_img, opt.save_txt, opt.img_size
    webcam = source.isnumeric() or source.endswith(
        '.txt') or source.lower().startswith(('rtsp://', 'rtmp://', 'http://'))

    # Directories
    save_dir = Path(
        increment_path(Path(opt.project) / opt.name,
                       exist_ok=opt.exist_ok))  # increment run
    (save_dir / 'labels' if save_txt else save_dir).mkdir(
        parents=True, exist_ok=True)  # make dir
    # save deep sort results
    save_results_path = os.path.join(save_dir, "deep-sort_results.txt")
    # Deep SORT configurations
    use_original_deep_sort = 'original' in os.path.split(opt.config_deepsort)[1]
    # Initialize
    set_logging()
    device = select_device(opt.device)
    half = device.type != 'cpu'  # half precision only supported on CUDA
    """ YOLOv5 """
    # Load Detector model
    detector_model = attempt_load(weights,
                                  map_location=device)  # load FP32 model
    stride = int(detector_model.stride.max())  # model stride
    imgsz = check_img_size(imgsz, s=stride)  # check img_size
    if half:
        detector_model.half()  # to FP16
    """ Deep SORT """
    # Set up Deep Sort Tracker
    # Load Tracker Model
    deepsort_config = get_config()
    deepsort_config.merge_from_file(opt.config_deepsort)
    deepsort = build_tracker(deepsort_config,
                             use_cuda=(opt.device != 'cpu'),
                             use_original_deep_sort=use_original_deep_sort)

    # Set Dataloader
    if webcam:
        view_img = check_imshow()
        cudnn.benchmark = True  # set True to speed up constant image size inference
        dataset = LoadStreams(source, img_size=imgsz, stride=stride)
    else:
        save_img = True
        dataset = LoadImages(source, img_size=imgsz, stride=stride)

    # set up video_path
    save_video_path = os.path.join(save_dir, 'test_video.mp4')
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    video_writer = cv2.VideoWriter(
        save_video_path, fourcc, dataset.fps,
        (dataset.input_frame_size[0], dataset.input_frame_size[1]))

    # Get names and colors
    names = detector_model.module.names if hasattr(
        detector_model, 'module') else detector_model.names
    class_list = names
    print('\n- Available classes for detection:\n', names)
    colors_db = [[random.randint(0, 255) for _ in range(3)] for _ in names]

    # Run inference
    if device.type != 'cpu':
        detector_model(
            torch.zeros(1, 3, imgsz, imgsz).to(device).type_as(
                next(detector_model.parameters())))  # run once
    t0 = time.time()
    """ -- Manos Addition -- """
    bboxes = []
    colours = []
    classes = []
    frame_number = 0
    results = []
    show_boxes = True
    try:
        for path, img, im0s, vid_cap in dataset:
            # Original image
            frame = im0s
            frame_number += 1
            """ DETECTION by YOLOv5 """
            # img ==> transformed image for yolov5
            img = torch.from_numpy(img).to(device)
            img = img.half() if half else img.float()  # uint8 to fp16/32
            img /= 255.0  # 0 - 255 to 0.0 - 1.0
            if img.ndimension() == 3:
                img = img.unsqueeze(0)

            # Inference
            t1 = time_synchronized()
            pred = detector_model(img, augment=opt.augment)[0]

            # Apply NMS
            pred = non_max_suppression(pred,
                                       opt.conf_thres,
                                       opt.iou_thres,
                                       classes=opt.classes,
                                       agnostic=opt.agnostic_nms)
            # Initialize bboxes
            bboxes = []
            colours = []
            classes = []
            cls_conf = []
            # Process detections
            for i, det in enumerate(pred):  # detections per image
                if webcam:  # batch_size >= 1
                    p, s, im0, frame_counter_from_dataset_object = path[
                        i], '%g: ' % i, im0s[i].copy(), dataset.count
                else:
                    p, s, im0, frame_counter_from_dataset_object = path, '', im0s, getattr(
                        dataset, 'frame', 0)

                p = Path(p)  # to Path
                s += '%gx%g ' % img.shape[2:]  # print string
                if len(det):
                    # Rescale boxes from img_size to im0 size
                    det[:, :4] = scale_coords(img.shape[2:], det[:, :4],
                                              im0.shape).round()
                    for xmin, ymin, xmax, ymax, conf, cls in det.tolist():
                        w = xmax - xmin
                        h = ymax - ymin
                        # Add a bbox
                        # Deep sort will take bboxes as ['x_center', 'y_center', 'w', 'h']
                        bboxes.append([xmin + w / 2, ymin + h / 2, w, h])
                        # Add current box's color
                        colours.append(colors_db[int(cls)])
                        # Add current box's class
                        classes.append(names[int(cls)])
                        # Add current box's class confidence
                        cls_conf.append(conf)

            print(
                f'○ YOLOv5 frame process done in "{time.time() - t1:.3f}" seconds.'
            )
            """ TRACKING by deep sort"""

            # Deep Sort is already initialized
            if len(bboxes) > 0:
                bboxes_tensor = torch.FloatTensor(bboxes)

                class_indexes = names_to_indexes(classes, class_list)
                classes_tensor = torch.LongTensor(class_indexes)
                cls_conf_tensor = torch.FloatTensor(cls_conf)
            else:
                bboxes_tensor = torch.FloatTensor([]).reshape([0, 4])
                cls_conf_tensor = torch.FloatTensor([])
                classes_tensor = torch.LongTensor([])

            # track objects of 'boat' class
            # mask = classes_tensor == 8
            mask = torch.BoolTensor([True for _ in range(len(bboxes))])
            bbox_xywh = bboxes_tensor[mask]

            # slightly dilate the box height, as in the original Deep SORT demo (optional)
            bbox_xywh[:, 3:] *= 1.2

            # get class confidences
            cls_conf_to_use = cls_conf_tensor[mask]

            # time point to measure deep SORT update duration
            start_deep_sort = time.time()

            # do tracking
            outputs, cls_names = deepsort.update(bbox_xywh, cls_conf_to_use,
                                                 frame, classes_tensor)

            # draw boxes for visualization
            if len(outputs) > 0 and show_boxes:
                bbox_tlwh = []
                bbox_xyxy = outputs[:, :4]
                identities = outputs[:, -1]
                class_names = [
                    class_list[cls_name] if cls_name != -1 else ""
                    for cls_name in cls_names
                ]
                frame = draw_boxes(frame,
                                   bbox_xyxy,
                                   identities,
                                   class_names=class_names)

                for bb_xyxy in bbox_xyxy:
                    bbox_tlwh.append(deepsort.xyxy_to_tlwh(bb_xyxy))

                results.append((frame_number - 1, bbox_tlwh, identities))

            # save results
            write_results(save_results_path, results, 'mot')
            print(
                f'♦ Deep SORT frame process lasted "{time.time() - start_deep_sort:.3f}" seconds.',
                '\n--------------------------------------------------------------'
            )

            # End of pipeline
            waitKey = cv2.waitKey(delay_value)
            if waitKey & 0xFF == 27:
                print('\n- Button Pressed: "Esc".\n')
                break
            elif waitKey & 0xFF == ord('q'):
                print('\n- Button Pressed: "q".\n')
                break
            elif waitKey & 0xFF == ord('b'):
                show_boxes = not show_boxes
            else:
                cv2.imshow('YOLOv5 x Deep SORT', frame)
                video_writer.write(frame)
                continue
    except Exception as e:
        traceback.print_exc()

    print('Ending detection and tracking. Exiting...')
    video_writer.release()
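
A sketch of the argparse options that detect_and_track4() reads. The attribute names are taken directly from the opt.* references in the function above; the defaults are assumptions modeled on YOLOv5's detect.py, and the config path is a placeholder.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--weights', type=str, default='yolov5s.pt')      # detector weights
parser.add_argument('--source', type=str, default='0')                # file, folder, URL or webcam index
parser.add_argument('--img-size', type=int, default=640)
parser.add_argument('--conf-thres', type=float, default=0.25)
parser.add_argument('--iou-thres', type=float, default=0.45)
parser.add_argument('--device', default='')                           # e.g. '0', '0,1' or 'cpu'
parser.add_argument('--view-img', action='store_true')
parser.add_argument('--save-txt', action='store_true')
parser.add_argument('--classes', nargs='+', type=int)                 # filter detections by class index
parser.add_argument('--agnostic-nms', action='store_true')
parser.add_argument('--augment', action='store_true')
parser.add_argument('--project', default='runs/track')
parser.add_argument('--name', default='exp')
parser.add_argument('--exist-ok', action='store_true')
parser.add_argument('--config-deepsort', type=str, default='configs/deep_sort.yaml')  # assumed path
opt = parser.parse_args()

detect_and_track4(opt)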
Example #3
    def forward(self, imgs, size=640, augment=False, profile=False):
        # Inference from various sources. For height=720, width=1280, RGB images example inputs are:
        #   filename:   imgs = 'data/samples/zidane.jpg'
        #   URI:             = 'https://github.com/ultralytics/yolov5/releases/download/v1.0/zidane.jpg'
        #   OpenCV:          = cv2.imread('image.jpg')[:,:,::-1]  # HWC BGR to RGB x(720,1280,3)
        #   PIL:             = Image.open('image.jpg')  # HWC x(720,1280,3)
        #   numpy:           = np.zeros((720,1280,3))  # HWC
        #   torch:           = torch.zeros(16,3,720,1280)  # BCHW
        #   multiple:        = [Image.open('image1.jpg'), Image.open('image2.jpg'), ...]  # list of images

        p = next(self.model.parameters())  # for device and type
        if isinstance(imgs, torch.Tensor):  # torch
            return self.model(imgs.to(p.device).type_as(p), augment,
                              profile)  # inference

        # Pre-process
        n, imgs = (len(imgs), imgs) if isinstance(imgs, list) else (1, [imgs])  # number of images, list of images
        shape0, shape1, files = [], [], []  # image and inference shapes, filenames
        for i, im in enumerate(imgs):
            if isinstance(im, str):  # filename or uri
                im, f = Image.open(requests.get(im, stream=True).raw
                                   if im.startswith('http') else im), im  # open
                im.filename = f  # for uri
            files.append(Path(im.filename).with_suffix('.jpg').name
                         if isinstance(im, Image.Image) else f'image{i}.jpg')
            im = np.array(im)  # to numpy
            if im.shape[0] < 5:  # image in CHW
                im = im.transpose((1, 2, 0))  # reverse dataloader .transpose(2, 0, 1)
            im = im[:, :, :3] if im.ndim == 3 else np.tile(im[:, :, None], 3)  # enforce 3-channel input
            s = im.shape[:2]  # HWC
            shape0.append(s)  # image shape
            g = (size / max(s))  # gain
            shape1.append([y * g for y in s])
            imgs[i] = im  # update
        shape1 = [make_divisible(x, int(self.stride.max()))
                  for x in np.stack(shape1, 0).max(0)]  # inference shape
        x = [letterbox(im, new_shape=shape1, auto=False)[0] for im in imgs]  # pad
        x = np.stack(x, 0) if n > 1 else x[0][None]  # stack
        x = np.ascontiguousarray(x.transpose((0, 3, 1, 2)))  # BHWC to BCHW
        x = torch.from_numpy(x).to(p.device).type_as(p) / 255.  # uint8 to fp16/32

        # Inference
        with torch.no_grad():
            y = self.model(x, augment, profile)[0]  # forward
        y = non_max_suppression(y,
                                conf_thres=self.conf,
                                iou_thres=self.iou,
                                classes=self.classes)  # NMS

        # Post-process
        for i in range(n):
            scale_coords(shape1, y[i][:, :4], shape0[i])

        return Detections(imgs, y, files, self.names)
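
A small usage sketch for the forward() in Example #3, assuming it belongs to YOLOv5's autoShape wrapper as loaded through torch.hub; the image paths and URL are placeholders.

import cv2
import torch
from PIL import Image

model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)  # autoShape-wrapped model

imgs = [
    'data/images/zidane.jpg',                        # local file
    'https://ultralytics.com/images/zidane.jpg',     # URI
    cv2.imread('data/images/bus.jpg')[:, :, ::-1],   # OpenCV array, BGR to RGB
    Image.open('data/images/bus.jpg'),               # PIL image
]
results = model(imgs, size=640)  # dispatches to the forward() above
results.print()                  # per-image detection summary
print(results.xyxy[0])           # [x1, y1, x2, y2, conf, cls] rows for the first image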
Example #4
    def detect_and_track2(self):
        frame_number = 0
        bboxes = []
        colours = []
        end_task = False
        # START LOGGER
        with open(self.dirs.get_command_dir() +
                  f'/tracking_information_frame-{frame_number}.txt',
                  'w') as event_info_stream:
            # LOOP while 'camera' is open
            while self.cam.isOpened():
                # When Escape is pressed
                if end_task:
                    break
                # Read 1 Frame
                success, frame = self.cam.read()
                frame_number += 1
                if not success:
                    write_to_txt(event_info_stream, "Video reached the end...")
                    print('End of video. Exiting...')
                    break
                """ ----------  DETECTION ----------  """
                if self.detection_state:
                    print('Detection...............')
                    bboxes = []
                    colours = []
                    print(
                        f'---------------- Started Detection in Frame-{frame_number} ----------------'
                    )
                    # Run inference
                    t0 = time.time()
                    img = torch.zeros((1, 3, self.imgsz, self.imgsz),
                                      device=self.device)  # init img
                    if self.device.type != 'cpu':
                        _ = self.model(img.half() if self.half else img)  # warm-up run

                    # Reshape input frame
                    img, im0 = convert_image(frame, self.imgsz)
                    img = torch.from_numpy(img).to(self.device)
                    img = img.half() if self.half else img.float()  # uint8 to fp16/32
                    img /= 255.0  # 0 - 255 to 0.0 - 1.0
                    if img.ndimension() == 3:
                        img = img.unsqueeze(0)

                    # Inference
                    t1 = time_synchronized()
                    pred = self.model(img, augment=self.opt.augment)[0]
                    print('Predictions:\n', pred)
                    print(f'Model inference ran in "{time.time() - t1:.3f}" seconds.')

                    t2 = time.time()
                    # Apply NMS
                    pred = non_max_suppression(pred,
                                               self.opt.conf_thres,
                                               self.opt.iou_thres,
                                               classes=self.opt.classes,
                                               agnostic=self.opt.agnostic_nms)
                    time_synchronized()  # sync CUDA before reading the clock
                    print(f'NMS ended in "{time.time() - t2:.3f}" seconds.')

                    # Process predictions
                    t3 = time.time()
                    for i, det in enumerate(pred):  # detections per image

                        if len(det):
                            # Rescale boxes from img_size to im0 size
                            det[:, :4] = scale_coords(img.shape[2:],
                                                      det[:, :4],
                                                      im0.shape).round()
                            for xyxy in det:
                                colours.append((random.randint(64, 255),
                                                random.randint(64, 255),
                                                random.randint(64, 255)))
                                bboxes.append(xyxy2xywh2(xyxy))
                            self.detection_state = False
                            self.initiate_tracking = True
                    print(
                        f'Decoding predictions ended in "{time.time() - t3:.3f}" seconds.'
                    )

                    print(
                        f'Detection pipeline ended in "{time.time() - t0:.3f}" seconds.\n----------------------'
                    )

                    waitKey = cv2.waitKey(delay_value)
                    """ It does not make sense now that I think about it, but there might be a fix """
                    # TODO - add as a feature to run only detection
                    if waitKey & 0xFF in (ord('t'), ord('T')):  # 't' or 'T' pressed
                        self.detection_state = False
                        self.initiate_tracking = True
                        continue
                    # cv2.imshow('Detector', frame)
                else:
                    """ ----------  TRACKING ----------  """
                    success, frame = self.cam.read()
                    if not success:
                        write_to_txt(event_info_stream, "Video reached the end...")
                        print('End of video. Exiting...')
                        break

                    if len(bboxes) <= 0:
                        print(
                            'No objects detected. Entering Detection mode...')
                        self.detection_state = True
                        continue
                    """ Tracking Part"""
                    print(
                        f'---------------- Started Tracking in Frame-{frame_number} ----------------'
                    )
                    # Log everything

                    frame_width = int(self.cam.get(cv2.CAP_PROP_FRAME_WIDTH))
                    frame_height = int(self.cam.get(cv2.CAP_PROP_FRAME_HEIGHT))

                    # The trackers work better if the bounding box is bigger than the object itself
                    bboxes = expand_bboxes(bboxes,
                                           frame_width,
                                           frame_height,
                                           c=self.expansion_constant)

                    # Create MultiTracker object - recreate in order to re-enter new bboxes.
                    if self.initiate_tracking:
                        # Note: newer opencv-contrib builds expose this as cv2.legacy.MultiTracker_create()
                        multiTracker = cv2.MultiTracker_create()

                        # Initialize MultiTracker - You can specify different trackers for every bounding box
                        for bbox, color in zip(bboxes, colours):
                            rect = (int(bbox[0]), int(bbox[1]), int(bbox[2]),
                                    int(bbox[3]))
                            multiTracker.add(
                                createTrackerByName(self.trackerType), frame,
                                rect)

                        self.initiate_tracking = False
                    """ ------------- Start the main Loop ------------- """
                    # get updated location of objects in subsequent frames
                    t1 = time.time()
                    success, boxes = multiTracker.update(frame)
                    # if object is lost go to re-detect
                    if not success:
                        write_to_txt(
                            event_info_stream,
                            "@@@@@@@@@@@@@@@@ TRACKED OBJECT LOST @@@@@@@@@@@@@@@@@@@@@ at frame:"
                            + str(frame_number))
                        # TODO - putText for lost objects outside of the frame of the video - otherwise there won't be a way to remove the "putText"
                        # draw_lost_trackings_text(frame, frame_width, frame_height)
                        self.detection_state = True
                        end_task = False
                        continue

                    ALL_IDs = []  # object IDs collected while drawing the tracked boxes below
                    # ALL_CENTROIDS = []  # centroids collected while drawing the tracked boxes below
                    ALL_bounding_boxes = []  # bounding boxes collected while drawing the tracked boxes below
                    frame_number += 1  # account for the extra frame read in tracking mode

                    # Draw standard Text
                    # display and output
                    text_positionUL = (pos_row, pos_col)  # cols, rows
                    draw_standard_text(frame, frame_number, self.trackerType,
                                       text_positionUL)

                    # draw tracked objects
                    for m, newbox in enumerate(boxes):
                        x, y, w, h = newbox[0], newbox[1], newbox[2], newbox[3]

                        p7 = (int(x), int(y))
                        p8 = (int(x + w), int(y + h))

                        ID_counter = m + 1

                        # Draw bbox
                        cv2.rectangle(frame, p7, p8, colours[m], 2, 1)
                        # Draw bbox's text
                        draw_bbox_text(frame, colours[m], ID_counter, p7)

                        # Coordinates of one box
                        box_total = (p7, p8)

                        ALL_bounding_boxes.append(
                            box_total)  # fill the array during the iteration
                        # ALL_CENTROIDS.append(box_centr)  # fill the array during the iteration
                        ALL_IDs.append(ID_counter)

                    # Show frame
                    print(
                        f'Tracking step took "{time.time() - t1:.3f}" seconds.'
                    )

                    # Write to txt file
                    # TODO - also write to csv with keys: a) video_name --> str, b) frame --> int, c) object_IDs --> list of ints, d) bboxes --> list of ints, e) centroids --> list of ints
                    write_list_to_txt(
                        event_info_stream,
                        [
                            'Processing frame: ' +
                            str(frame_number),  # current frame number
                            str(datetime.now().strftime(
                                "%d-%m-%Y %H:%M:%S")),  # datetime
                            'Object IDs: ' + str(ALL_IDs),
                            'Bounding boxes pixels: ' +
                            str(ALL_bounding_boxes),
                            # Bounding box --> UP left x (as cols),y (as rows) and BR x (as cols),y (as rows)
                            # 'Centroids pixels: ' + str(ALL_CENTROIDS),
                            '--------------------------------------------------'
                        ])

                    # Write frame to output video
                    self.video_writer.write(frame)
                    """ 
                    https://stackoverflow.com/questions/51143458/difference-in-output-with-waitkey0-and-waitkey1/51143586
    
                    1.waitKey(0) will display the window infinitely until any keypress (it is suitable for image display).
                    2.waitKey(1) will display a frame for 1 ms, after which display will be automatically closed
                    So, if you use waitKey(0) you see a still image until you actually press something while for waitKey(1)
                    the function will show a frame for 1 ms only.
                    """
                waitKey = cv2.waitKey(delay_value)
                if waitKey & 0xFF == 27:
                    end_task = True
                    continue
                elif waitKey & 0xFF in (ord('d'), ord('D')):  # 'd' or 'D' pressed
                    self.detection_state = True
                    continue
                else:
                    cv2.imshow('YOLOv5 x ' + self.trackerType, frame)
                    continue
        return
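
Example #4 relies on a createTrackerByName() helper that is not shown above. Below is a hypothetical sketch of such a helper, following the usual OpenCV multi-tracker pattern; it assumes an opencv-contrib build that still exposes these constructors at the top level (newer builds move some of them under cv2.legacy).

import cv2

# Hypothetical factory map; the real createTrackerByName() used in Example #4 is not shown.
TRACKER_FACTORIES = {
    'CSRT': cv2.TrackerCSRT_create,
    'KCF': cv2.TrackerKCF_create,
    'MIL': cv2.TrackerMIL_create,
}

def createTrackerByName(tracker_type):
    try:
        return TRACKER_FACTORIES[tracker_type]()
    except KeyError:
        raise ValueError(f"Unknown tracker type '{tracker_type}'. "
                         f"Available: {list(TRACKER_FACTORIES)}")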