def detect(self, frame, frame_number, save_img=True):
    print(f'---------------- Started Detection of Frame-{frame_number} ----------------')

    # Run inference
    t0 = time.time()
    img = torch.zeros((1, 3, self.imgsz, self.imgsz), device=self.device)  # init img
    _ = self.model(img.half() if self.half else img) if self.device.type != 'cpu' else None  # run once

    # Reshape input frame
    img, im0 = convert_image(frame, self.imgsz)
    img = torch.from_numpy(img).to(self.device)
    img = img.half() if self.half else img.float()  # uint8 to fp16/32
    img /= 255.0  # 0 - 255 to 0.0 - 1.0
    if img.ndimension() == 3:
        img = img.unsqueeze(0)

    # Inference
    t1 = time_synchronized()
    pred = self.model(img, augment=self.opt.augment)[0]
    print(f'Model run in "{time_synchronized() - t1:.3f}" seconds.')

    # Apply NMS
    t2 = time_synchronized()
    pred = non_max_suppression(pred,
                               self.opt.conf_thres,
                               self.opt.iou_thres,
                               classes=self.opt.classes,
                               agnostic=self.opt.agnostic_nms)
    print(f'NMS ended in "{time_synchronized() - t2:.3f}" seconds.')

    bboxes = []
    colours = []

    # Process detections
    for i, det in enumerate(pred):  # detections per image
        if len(det):
            # Rescale boxes from img_size to im0 size
            det[:, :4] = scale_coords(img.shape[2:], det[:, :4], im0.shape).round()
            for xyxy in det:
                colours.append((random.randint(64, 255),
                                random.randint(64, 255),
                                random.randint(64, 255)))
                bboxes.append(xyxy2xywh2(xyxy))

    if self.save_txt or save_img:
        s = f"\n{len(list(Path(self.save_dir).glob('labels/*.txt')))} labels saved to {self.save_dir}" if self.save_txt else ''
        print(f"Results saved to {self.save_dir}{s}")

    print(f'Detection pipeline ended in "{time.time() - t0:.3f}" seconds.\n----------------------')

    return bboxes, colours
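
# NOTE: detect() relies on a helper named xyxy2xywh2 that is not defined in this section.
# The sketch below is an assumption about its behaviour, inferred from the fact that the
# returned boxes are later handed to OpenCV trackers as top-left based (x, y, w, h)
# rectangles; it is not necessarily the project's actual implementation.
def xyxy2xywh2(xyxy):
    """Hypothetical sketch: convert one [x1, y1, x2, y2, ...] detection row
    (extra columns such as conf/class are ignored) to an [x, y, w, h] list."""
    x1, y1, x2, y2 = (float(v) for v in xyxy[:4])
    return [x1, y1, x2 - x1, y2 - y1]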
def detect_and_track4(opt):
    # If you find any errors when loading YOLOv5, try uncommenting the line below and try again.
    # sys.path.insert(0, '../detection/yolov5/weights')
    source, weights, view_img, save_txt, imgsz = opt.source, opt.weights, opt.view_img, opt.save_txt, opt.img_size
    webcam = source.isnumeric() or source.endswith('.txt') or source.lower().startswith(
        ('rtsp://', 'rtmp://', 'http://'))

    # Directories
    save_dir = Path(increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok))  # increment run
    (save_dir / 'labels' if save_txt else save_dir).mkdir(parents=True, exist_ok=True)  # make dir

    # Path for the Deep SORT results
    save_results_path = os.path.join(save_dir, "deep-sort_results.txt")

    # Deep SORT configurations
    use_original_deep_sort = 'original' in os.path.split(opt.config_deepsort)[1]

    # Initialize
    set_logging()
    device = select_device(opt.device)
    half = device.type != 'cpu'  # half precision only supported on CUDA

    """ YOLOv5 """
    # Load detector model
    detector_model = attempt_load(weights, map_location=device)  # load FP32 model
    stride = int(detector_model.stride.max())  # model stride
    imgsz = check_img_size(imgsz, s=stride)  # check img_size
    if half:
        detector_model.half()  # to FP16

    """ Deep SORT """
    # Set up the Deep SORT tracker and load the tracker model
    deepsort_config = get_config()
    deepsort_config.merge_from_file(opt.config_deepsort)
    deepsort = build_tracker(deepsort_config,
                             use_cuda=(opt.device != 'cpu'),
                             use_original_deep_sort=use_original_deep_sort)

    # Set Dataloader
    if webcam:
        view_img = check_imshow()
        cudnn.benchmark = True  # set True to speed up constant image size inference
        dataset = LoadStreams(source, img_size=imgsz, stride=stride)
    else:
        save_img = True
        dataset = LoadImages(source, img_size=imgsz, stride=stride)

    # Set up the output video writer
    save_video_path = os.path.join(save_dir, 'test_video.mp4')
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    video_writer = cv2.VideoWriter(save_video_path, fourcc, dataset.fps,
                                   (dataset.input_frame_size[0], dataset.input_frame_size[1]))

    # Get names and colors
    names = detector_model.module.names if hasattr(detector_model, 'module') else detector_model.names
    class_list = names
    print('\n- Available classes for detection:\n', names)
    colors_db = [[random.randint(0, 255) for _ in range(3)] for _ in names]

    # Run inference once to warm up the model
    if device.type != 'cpu':
        detector_model(torch.zeros(1, 3, imgsz, imgsz).to(device).type_as(next(detector_model.parameters())))

    t0 = time.time()

    """ -- Manos Addition -- """
    bboxes = []
    colours = []
    classes = []
    frame_number = 0
    results = []
    show_boxes = True

    try:
        for path, img, im0s, vid_cap in dataset:
            # Original image
            frame = im0s
            frame_number += 1

            """ DETECTION by YOLOv5 """
            # img ==> transformed image for YOLOv5
            img = torch.from_numpy(img).to(device)
            img = img.half() if half else img.float()  # uint8 to fp16/32
            img /= 255.0  # 0 - 255 to 0.0 - 1.0
            if img.ndimension() == 3:
                img = img.unsqueeze(0)

            # Inference
            t1 = time_synchronized()
            pred = detector_model(img, augment=opt.augment)[0]

            # Apply NMS
            pred = non_max_suppression(pred,
                                       opt.conf_thres,
                                       opt.iou_thres,
                                       classes=opt.classes,
                                       agnostic=opt.agnostic_nms)

            # Initialize bboxes
            bboxes = []
            colours = []
            classes = []
            cls_conf = []

            # Process detections
            for i, det in enumerate(pred):  # detections per image
                if webcam:  # batch_size >= 1
                    p, s, im0, frame_counter_from_dataset_object = path[i], '%g: ' % i, im0s[i].copy(), dataset.count
                else:
                    p, s, im0, frame_counter_from_dataset_object = path, '', im0s, getattr(dataset, 'frame', 0)

                p = Path(p)  # to Path
                s += '%gx%g ' % img.shape[2:]  # print string

                if len(det):
                    # Rescale boxes from img_size to im0 size
                    det[:, :4] = scale_coords(img.shape[2:], det[:, :4], im0.shape).round()

                    for xmin, ymin, xmax, ymax, conf, cls in det.tolist():
                        w = xmax - xmin
                        h = ymax - ymin
                        # Deep SORT takes bboxes as ['x_center', 'y_center', 'w', 'h']
                        bboxes.append([xmin + w / 2, ymin + h / 2, w, h])
                        # Add current box's color
                        colours.append(colors_db[int(cls)])
                        # Add current box's class
                        classes.append(names[int(cls)])
                        # Add current box's class confidence
                        cls_conf.append(conf)

            print(f'○ YOLOv5 frame process done in "{time.time() - t1:.3f}" seconds.')

            """ TRACKING by Deep SORT """
            # Deep SORT is already initialized
            if len(bboxes) > 0:
                bboxes_tensor = torch.FloatTensor(bboxes)
                class_indexes = names_to_indexes(classes, class_list)
                classes_tensor = torch.LongTensor(class_indexes)
                cls_conf_tensor = torch.FloatTensor(cls_conf)
            else:
                bboxes_tensor = torch.FloatTensor([]).reshape([0, 4])
                cls_conf_tensor = torch.FloatTensor([])
                classes_tensor = torch.LongTensor([])

            # Track all classes (uncomment the mask below to track only the 'boat' class)
            # mask = classes_tensor == 8
            mask = torch.BoolTensor([True for _ in range(len(bboxes))])
            bbox_xywh = bboxes_tensor[mask]
            # Expand boxes - this line can be removed
            bbox_xywh[:, 3:] *= 1.2
            # Get class confidences
            cls_conf_to_use = cls_conf_tensor[mask]

            # Time point to measure the Deep SORT update duration
            start_deep_sort = time.time()

            # Do tracking
            outputs, cls_names = deepsort.update(bbox_xywh, cls_conf_to_use, frame, classes_tensor)

            # Draw boxes for visualization
            if len(outputs) > 0 and show_boxes:
                bbox_tlwh = []
                bbox_xyxy = outputs[:, :4]
                identities = outputs[:, -1]
                class_names = [class_list[cls_name] if cls_name != -1 else "" for cls_name in cls_names]
                frame = draw_boxes(frame, bbox_xyxy, identities, class_names=class_names)

                for bb_xyxy in bbox_xyxy:
                    bbox_tlwh.append(deepsort.xyxy_to_tlwh(bb_xyxy))

                results.append((frame_number - 1, bbox_tlwh, identities))

            # Save results
            write_results(save_results_path, results, 'mot')

            print(f'♦ Deep SORT frame process lasted "{time.time() - start_deep_sort:.3f}" seconds.',
                  '\n--------------------------------------------------------------')

            # End of pipeline
            waitKey = cv2.waitKey(delay_value)
            if waitKey & 0xFF == 27:
                print('\n- Button Pressed: "Esc".\n')
                break
            elif waitKey & 0xFF == ord('q'):
                print('\n- Button Pressed: "q".\n')
                break
            elif waitKey & 0xFF == ord('b'):
                show_boxes = not show_boxes
            else:
                cv2.imshow('YOLOv5 x Deep SORT', frame)
                video_writer.write(frame)
                continue
    except Exception:
        traceback.print_exc()

    print('Ending detection and tracking. Exiting...')
    video_writer.release()
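
# NOTE: names_to_indexes() is called above but not defined in this section. Below is a
# minimal sketch, assuming it maps detected class-name strings back to their indices in
# class_list and returns -1 for unknown names (consistent with the cls_name != -1 check
# used when drawing boxes); the project's actual helper may differ.
def names_to_indexes(class_names, class_list):
    """Hypothetical helper: map class-name strings to their index in class_list."""
    return [class_list.index(name) if name in class_list else -1 for name in class_names]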
def forward(self, imgs, size=640, augment=False, profile=False):
    # Inference from various sources. For height=720, width=1280, RGB images example inputs are:
    #   filename:  imgs = 'data/samples/zidane.jpg'
    #   URI:            = 'https://github.com/ultralytics/yolov5/releases/download/v1.0/zidane.jpg'
    #   OpenCV:         = cv2.imread('image.jpg')[:,:,::-1]  # HWC BGR to RGB x(720,1280,3)
    #   PIL:            = Image.open('image.jpg')  # HWC x(720,1280,3)
    #   numpy:          = np.zeros((720,1280,3))  # HWC
    #   torch:          = torch.zeros(16,3,720,1280)  # BCHW
    #   multiple:       = [Image.open('image1.jpg'), Image.open('image2.jpg'), ...]  # list of images

    p = next(self.model.parameters())  # for device and type
    if isinstance(imgs, torch.Tensor):  # torch
        return self.model(imgs.to(p.device).type_as(p), augment, profile)  # inference

    # Pre-process
    n, imgs = (len(imgs), imgs) if isinstance(imgs, list) else (1, [imgs])  # number of images, list of images
    shape0, shape1, files = [], [], []  # image and inference shapes, filenames
    for i, im in enumerate(imgs):
        if isinstance(im, str):  # filename or uri
            im, f = Image.open(requests.get(im, stream=True).raw if im.startswith('http') else im), im  # open
            im.filename = f  # for uri
        files.append(Path(im.filename).with_suffix('.jpg').name if isinstance(im, Image.Image) else f'image{i}.jpg')
        im = np.array(im)  # to numpy
        if im.shape[0] < 5:  # image in CHW
            im = im.transpose((1, 2, 0))  # reverse dataloader .transpose(2, 0, 1)
        im = im[:, :, :3] if im.ndim == 3 else np.tile(im[:, :, None], 3)  # enforce 3ch input
        s = im.shape[:2]  # HWC
        shape0.append(s)  # image shape
        g = (size / max(s))  # gain
        shape1.append([y * g for y in s])
        imgs[i] = im  # update
    shape1 = [make_divisible(x, int(self.stride.max())) for x in np.stack(shape1, 0).max(0)]  # inference shape
    x = [letterbox(im, new_shape=shape1, auto=False)[0] for im in imgs]  # pad
    x = np.stack(x, 0) if n > 1 else x[0][None]  # stack
    x = np.ascontiguousarray(x.transpose((0, 3, 1, 2)))  # BHWC to BCHW
    x = torch.from_numpy(x).to(p.device).type_as(p) / 255.  # uint8 to fp16/32

    # Inference
    with torch.no_grad():
        y = self.model(x, augment, profile)[0]  # forward
    y = non_max_suppression(y, conf_thres=self.conf, iou_thres=self.iou, classes=self.classes)  # NMS

    # Post-process
    for i in range(n):
        scale_coords(shape1, y[i][:, :4], shape0[i])

    return Detections(imgs, y, files, self.names)
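
# NOTE: a short usage sketch for the autoshaped forward() above. Loading the model via
# torch.hub is one common way to obtain such a wrapper; whether this project exposes it
# that way is an assumption, and the image path is only an example.
import torch
from PIL import Image

model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)  # autoshaped YOLOv5 model
img = Image.open('data/samples/zidane.jpg')  # PIL image, HWC RGB
results = model(img, size=640)               # dispatches to forward() above
results.print()                              # per-image detection summary
print(results.xyxy[0])                       # tensor of [x1, y1, x2, y2, conf, cls] rows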
def detect_and_track2(self):
    frame_number = 0
    bboxes = []
    colours = []
    end_task = False
    just_because = 0

    # START LOGGER
    with open(self.dirs.get_command_dir() + '/tracking_information_frame-' + str(frame_number) + '.txt',
              'w') as event_info_stream:

        # LOOP while 'camera' is open
        while self.cam.isOpened():
            # When Escape is pressed
            if end_task:
                break

            # Read 1 frame
            success, frame = self.cam.read()
            frame_number += 1
            if not success:
                write_to_txt(event_info_stream, "Video reached the end...")
                print('End of video. Exiting...')
                break

            """ ---------- DETECTION ---------- """
            if self.detection_state:
                print('Detection...............')
                bboxes = []
                colours = []
                print(f'---------------- Started Detection in Frame-{frame_number} ----------------')

                # Run inference
                t0 = time.time()
                img = torch.zeros((1, 3, self.imgsz, self.imgsz), device=self.device)  # init img
                _ = self.model(img.half() if self.half else img) if self.device.type != 'cpu' else None  # run once

                # Reshape input frame
                img, im0 = convert_image(frame, self.imgsz)
                img = torch.from_numpy(img).to(self.device)
                img = img.half() if self.half else img.float()  # uint8 to fp16/32
                img /= 255.0  # 0 - 255 to 0.0 - 1.0
                if img.ndimension() == 3:
                    img = img.unsqueeze(0)

                # Inference
                t1 = time_synchronized()
                pred = self.model(img, augment=self.opt.augment)[0]
                print('Predictions:\n', pred)
                print(f'Model run in "{time.time() - t1:.3f}" seconds.')

                # Apply NMS
                t2 = time.time()
                pred = non_max_suppression(pred,
                                           self.opt.conf_thres,
                                           self.opt.iou_thres,
                                           classes=self.opt.classes,
                                           agnostic=self.opt.agnostic_nms)
                time_synchronized()
                print(f'NMS ended in "{time.time() - t2:.3f}" seconds.')

                # Process predictions
                t3 = time.time()
                for i, det in enumerate(pred):  # detections per image
                    if len(det):
                        # Rescale boxes from img_size to im0 size
                        det[:, :4] = scale_coords(img.shape[2:], det[:, :4], im0.shape).round()
                        for xyxy in det:
                            colours.append((random.randint(64, 255),
                                            random.randint(64, 255),
                                            random.randint(64, 255)))
                            bboxes.append(xyxy2xywh2(xyxy))

                self.detection_state = False
                self.initiate_tracking = True

                print(f'Decoding predictions ended in "{time.time() - t3:.3f}" seconds.')
                print(f'Detection pipeline ended in "{time.time() - t0:.3f}" seconds.\n----------------------')

                waitKey = cv2.waitKey(delay_value)
                """ It does not make sense now that I think about it, but there might be a fix """
                # TODO - add as a feature to run only detection
                if waitKey & 0xFF == ord('t'):  # 't' pressed
                    self.detection_state = False
                    self.initiate_tracking = True
                    continue
                if waitKey & 0xFF == ord('T'):  # 'T' pressed
                    self.detection_state = False
                    self.initiate_tracking = True
                    continue
                # cv2.imshow('Detector', frame)

            else:
                """ ---------- TRACKING ---------- """
                success, frame = self.cam.read()
                if len(bboxes) <= 0:
                    print('No objects detected. Entering Detection mode...')
                    self.detection_state = True
                    continue

                """ Tracking Part """
                print(f'---------------- Started Tracking in Frame-{frame_number} ----------------')

                # Log everything
                frame_width = int(self.cam.get(cv2.CAP_PROP_FRAME_WIDTH))
                frame_height = int(self.cam.get(cv2.CAP_PROP_FRAME_HEIGHT))

                # The trackers work better if the bounding box is bigger than the object itself
                bboxes = expand_bboxes(bboxes, frame_width, frame_height, c=self.expansion_constant)

                # Create MultiTracker object - recreate it in order to re-enter new bboxes
                if self.initiate_tracking:
                    multiTracker = cv2.MultiTracker_create()

                    # Initialize MultiTracker - you can specify a different tracker for every bounding box
                    for bbox, color in zip(bboxes, colours):
                        rect = (int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3]))
                        multiTracker.add(createTrackerByName(self.trackerType), frame, rect)

                    self.initiate_tracking = False

                """ ------------- Start the main Loop ------------- """
                # Get updated location of objects in subsequent frames
                t1 = time.time()
                success, boxes = multiTracker.update(frame)

                # If an object is lost, go back to re-detect
                if not success:
                    write_to_txt(event_info_stream,
                                 "@@@@@@@@@@@@@@@@ TRACKED OBJECT LOST @@@@@@@@@@@@@@@@@@@@@ at frame:" + str(frame_number))
                    # TODO - putText for lost objects outside of the frame of the video - otherwise there won't be a way to remove the "putText"
                    # draw_lost_trackings_text(frame, frame_width, frame_height)
                    self.detection_state = True
                    end_task = False
                    continue

                ALL_IDs = []  # create an empty array to be filled in the following
                # ALL_CENTROIDS = []  # create an empty array to be filled in the following
                ALL_bounding_boxes = []  # create an empty array to be filled in the following
                frame_number += 1  # frame number counter

                # Draw standard text - display and output
                text_positionUL = (pos_row, pos_col)  # cols, rows
                draw_standard_text(frame, frame_number, self.trackerType, text_positionUL)

                # Draw tracked objects
                for m, newbox in enumerate(boxes):
                    x, y, w, h = newbox[0], newbox[1], newbox[2], newbox[3]
                    p7 = (int(x), int(y))
                    p8 = (int(x + w), int(y + h))
                    ID_counter = m + 1

                    # Draw bbox
                    cv2.rectangle(frame, p7, p8, colours[m], 2, 1)
                    # Draw bbox's text
                    draw_bbox_text(frame, colours[m], ID_counter, p7)

                    # Coordinates of one box
                    box_total = (p7, p8)
                    ALL_bounding_boxes.append(box_total)  # fill the array during the iteration
                    # ALL_CENTROIDS.append(box_centr)  # fill the array during the iteration
                    ALL_IDs.append(ID_counter)

                # Show frame
                print(f'Tracking step took "{time.time() - t1:.3f}" seconds.')

                # Write to txt file
                # TODO - also write to csv with keys: a) video_name --> str, b) frame --> int,
                #        c) object_IDs --> list of ints, d) bboxes --> list of ints, e) centroids --> list of ints
                write_list_to_txt(
                    event_info_stream,
                    [
                        'Processing frame: ' + str(frame_number),  # current frame number
                        str(datetime.now().strftime("%d-%m-%Y %H:%M:%S")),  # datetime
                        'Object IDs: ' + str(ALL_IDs),
                        # Bounding box --> upper-left x (as cols), y (as rows) and bottom-right x (as cols), y (as rows)
                        'Bounding boxes pixels: ' + str(ALL_bounding_boxes),
                        # 'Centroids pixels: ' + str(ALL_CENTROIDS),
                        '--------------------------------------------------'
                    ])

                # Write frame to output video
                self.video_writer.write(frame)

                """
                https://stackoverflow.com/questions/51143458/difference-in-output-with-waitkey0-and-waitkey1/51143586
                1. waitKey(0) will display the window infinitely until any keypress (it is suitable for image display).
                2. waitKey(1) will display a frame for 1 ms, after which the display will be automatically closed.
                So, if you use waitKey(0) you see a still image until you actually press something, while with
                waitKey(1) the function will show a frame for 1 ms only.
                """
                waitKey = cv2.waitKey(delay_value)
                if waitKey & 0xFF == 27:
                    end_task = True
                    continue
                elif waitKey & 0xFF == ord('d'):  # 'd' pressed
                    self.detection_state = True
                    continue
                elif waitKey & 0xFF == ord('D'):  # 'D' pressed
                    self.detection_state = True
                    continue
                else:
                    cv2.imshow('YOLOv5 x ' + self.trackerType, frame)
                    continue
    return
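
# NOTE: createTrackerByName() is used when (re)initialising the MultiTracker above but is
# not defined in this section. Below is a minimal sketch following the usual OpenCV factory
# pattern; the exact constructor names depend on the installed opencv-contrib-python version
# (in OpenCV >= 4.5 most of these live under cv2.legacy), and the project's actual helper
# may differ.
def createTrackerByName(trackerType):
    """Hypothetical factory: return an OpenCV single-object tracker by name."""
    factories = {
        'BOOSTING': cv2.TrackerBoosting_create,
        'MIL': cv2.TrackerMIL_create,
        'KCF': cv2.TrackerKCF_create,
        'TLD': cv2.TrackerTLD_create,
        'MEDIANFLOW': cv2.TrackerMedianFlow_create,
        'MOSSE': cv2.TrackerMOSSE_create,
        'CSRT': cv2.TrackerCSRT_create,
    }
    if trackerType not in factories:
        raise ValueError(f'Unknown tracker type: {trackerType}')
    return factories[trackerType]()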