def mainFunc(args):
    # Set the main function flag
    print("Main Function Start...")

    # Check the GPU device
    print("Number of available GPUs: {}".format(torch.cuda.device_count()))

    # Check whether using distributed running for the network
    is_distributed = initDistributed(args)
    master = True
    if is_distributed and os.environ["RANK"]:
        master = int(os.environ["RANK"]) == 0  # check whether this node is the master node

    # Configuration for device setting
    set_logging()
    if is_distributed:
        device = torch.device('cuda:{}'.format(args.local_rank))
    else:
        device = select_device(args.device)
    half = device.type != 'cpu'  # half precision only supported on CUDA

    # Load the configuration
    config = loadConfig(args.config)

    # CuDNN related settings
    if torch.cuda.is_available():
        cudnn.benchmark = config.DEVICE.CUDNN.BENCHMARK
        cudnn.deterministic = config.DEVICE.CUDNN.DETERMINISTIC
        cudnn.enabled = config.DEVICE.CUDNN.ENABLED

    # Configurations for directories
    save_img, save_dir, source, yolov5_weights, view_img, save_txt, imgsz = \
        False, Path(args.save_dir), args.source, args.weights, args.view_img, args.save_txt, args.img_size
    webcam = source.isnumeric() or source.startswith(
        ('rtsp://', 'rtmp://', 'http://')) or source.endswith('.txt')

    if save_dir == Path('runs/detect'):  # if default
        os.makedirs('runs/detect', exist_ok=True)  # make base
        save_dir = Path(increment_dir(save_dir / 'exp', args.name))  # increment run
    os.makedirs(save_dir / 'labels' if save_txt else save_dir, exist_ok=True)  # make new dir

    # Load yolov5 model for human detection
    model_yolov5 = attempt_load(config.MODEL.PRETRAINED.YOLOV5, map_location=device)
    imgsz = check_img_size(imgsz, s=model_yolov5.stride.max())  # check img_size
    if half:
        model_yolov5.half()  # to FP16

    # Second-stage classifier
    classify = False
    if classify:
        model_classifier = load_classifier(name='resnet101', n=2)  # initialize
        model_classifier.load_state_dict(
            torch.load('weights/resnet101.pt', map_location=device)['model'])  # load weights
        model_classifier.to(device).eval()

    # Load resnet model for human keypoint estimation
    model_resnet = eval('pose_models.' + config.MODEL.NAME.RESNET + '.get_pose_net')(config, is_train=False)
    if config.EVAL.RESNET.MODEL_FILE:
        print('=> loading model from {}'.format(config.EVAL.RESNET.MODEL_FILE))
        model_resnet.load_state_dict(torch.load(config.EVAL.RESNET.MODEL_FILE), strict=False)
    else:
        print('expected model defined in config at EVAL.RESNET.MODEL_FILE')
    model_resnet.to(device)
    model_resnet.eval()

    # Set Dataloader
    vid_path, vid_writer = None, None
    if webcam:
        view_img = True
        dataset = LoadStreams(source, img_size=imgsz)
    else:
        save_img = True
        dataset = LoadImages(source, img_size=imgsz)

    # Input transformation for 2d human pose estimation
    pose_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    # Get names and colors
    names = model_yolov5.module.names if hasattr(model_yolov5, 'module') else model_yolov5.names
    colors = [[random.randint(0, 255) for _ in range(3)] for _ in range(len(names))]

    # Construct filters for filtering 2D/3D human keypoints
    # filters_2d = constructFilters((1, 16, 2), freq=25, mincutoff=1, beta=0.01)  # for test
    # filters_3d = constructFilters((1, 16, 3), freq=25, mincutoff=1, beta=0.01)

    # Run the yolov5 and resnet for 2d human pose estimation
    # with torch.no_grad():
    # Run inference
    t0 = time.time()
    img = torch.zeros((1, 3, imgsz, imgsz), device=device)  # init img
    _ = model_yolov5(img.half() if half else img) if device.type != 'cpu' else None  # run once

    # Process every video frame
    for path, img, im0s, vid_cap in dataset:
        img = torch.from_numpy(img).to(device)
        img = img.half() if half else img.float()  # uint8 to fp16/32
        img /= 255.0  # 0 - 255 to 0.0 - 1.0
        if img.ndimension() == 3:
            img = img.unsqueeze(0)

        # Inference
        t1 = time_synchronized()
        pred_boxes = model_yolov5(img, augment=args.augment)[0]

        # Apply NMS
        pred_boxes = non_max_suppression(pred_boxes, args.conf_thres, args.iou_thres,
                                         classes=args.classes, agnostic=args.agnostic_nms)
        t2 = time_synchronized()

        # Cannot find people, so move on to the next frame
        if pred_boxes[0] is None:
            # show the frame with no human detected
            cv2.namedWindow("2D Human Pose Estimation", cv2.WINDOW_NORMAL)
            cv2.imshow("2D Human Pose Estimation", im0s[0].copy())
            # wait for manual operations
            # with kb.Listener(on_press=on_press) as listener:
            #     listener.join()
            #     return
            # if kb.is_pressed('t'):
            #     return
            print("No Human Detected and Move on.")
            print("-" * 30)
            continue

        # Print time (inference + NMS)
        detect_time = t2 - t1
        detect_fps = 1.0 / detect_time
        print("Human Detection Time: {}, Human Detection FPS: {}".format(detect_time, detect_fps))

        # Apply Classifier
        if classify:  # false
            pred_boxes = apply_classifier(pred_boxes, model_classifier, img, im0s)

        # Estimate 2d human pose (multiple persons)
        centers = []
        scales = []
        for id, boxes in enumerate(pred_boxes):
            if boxes is not None and len(boxes):
                boxes[:, :4] = scale_coords(img.shape[2:], boxes[:, :4], im0s[id].copy().shape).round()
                # convert tensor to list format
                boxes = np.delete(boxes.cpu().numpy(), [-2, -1], axis=1).tolist()
                for l in range(len(boxes)):
                    boxes[l] = [tuple(boxes[l][0:2]), tuple(boxes[l][2:4])]
                # convert box to center and scale
                for box in boxes:
                    center, scale = box_to_center_scale(box, imgsz, imgsz)
                    centers.append(center)
                    scales.append(scale)

        t3 = time_synchronized()
        pred_pose_2d = get_pose_estimation_prediction(config, model_resnet, im0s[0], centers, scales,
                                                      transform=pose_transform, device=device)
        t4 = time_synchronized()

        # Print time (2d human pose estimation)
        estimate_time = t4 - t3
        estimate_fps = 1.0 / estimate_time
        print("Pose Estimation Time: {}, Pose Estimation FPS: {}".format(estimate_time, estimate_fps))

        # Filter the predicted 2d human pose (multiple persons)
        t5 = time_synchronized()
        # if False:  # for test
        if config.EVAL.RESNET.USE_FILTERS_2D:
            # construct filters for every keypoint of every person in 2D
            filters_2d = constructFilters(pred_pose_2d.shape, freq=1, mincutoff=1, beta=0.01)
            print("Shape of filters_2d: ({}, {}, {})".format(
                len(filters_2d), len(filters_2d[0]), len(filters_2d[0][0])))  # for test
            for per in range(pred_pose_2d.shape[0]):
                for kp in range(pred_pose_2d.shape[1]):
                    for coord in range(pred_pose_2d.shape[2]):
                        pred_pose_2d[per][kp][coord] = filters_2d[per][kp][coord](pred_pose_2d[per][kp][coord])
        t6 = time_synchronized()

        # Print time (filter 2d human pose)
        filter_time_2d = t6 - t5
        filter_fps_2d = 1.0 / filter_time_2d
        print("Filter 2D Pose Time: {}, Filter 2D Pose FPS: {}".format(filter_time_2d, filter_fps_2d))

        # Process detections and estimations in 2D
        for i, box in enumerate(pred_boxes):
            if webcam:  # batch_size >= 1
                p, s, im0 = Path(path[i]), '%g: ' % i, im0s[i].copy()
            else:
                p, s, im0 = Path(path), '', im0s

            save_path = str(save_dir / p.name)
            txt_path = str(save_dir / 'labels' / p.stem) + (
                '_%g' % dataset.frame if dataset.mode == 'video' else '')
            s += '%gx%g ' % img.shape[2:]  # print string
            gn = torch.tensor(im0.shape)[[1, 0, 1, 0]]  # normalization gain whwh
            if box is not None and len(box):
                # Rescale boxes from img_size to im0 size
                box[:, :4] = scale_coords(img.shape[2:], box[:, :4], im0.shape).round()

                # Print results
                for c in box[:, -1].unique():
                    n = (box[:, -1] == c).sum()  # detections per class
                    s += '%g %ss, ' % (n, names[int(c)])  # add to string

                # Write results
                for *xyxy, conf, cls in reversed(box):
                    if save_txt:  # Write to file
                        xywh = (xyxy2xywh(torch.tensor(xyxy).view(1, 4)) / gn).view(-1).tolist()  # normalized xywh
                        line = (cls, *xywh, conf) if args.save_conf else (cls, *xywh)  # label format
                        with open(txt_path + '.txt', 'a') as f:
                            f.write(('%g ' * len(line) + '\n') % line)

                    # Add bbox to image
                    if save_img or view_img:
                        label = '%s %.2f' % (names[int(cls)], conf)
                        plot_one_box(xyxy, im0, label=label, color=colors[int(cls)], line_thickness=3)

                # Draw joint keypoints, index numbers and human skeletons for every detected person in 2D
                for person in pred_pose_2d:
                    # draw the human keypoints
                    for idx, coord in enumerate(person):
                        x_coord, y_coord = int(coord[0]), int(coord[1])
                        cv2.circle(im0, (x_coord, y_coord), 1, (0, 0, 255), 5)
                        cv2.putText(im0, str(idx), (x_coord, y_coord),
                                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2, cv2.LINE_AA)

                    # draw the human skeletons in PACIFIC mode
                    for skeleton in PACIFIC_SKELETON_INDEXES:
                        cv2.line(im0,
                                 (int(person[skeleton[0]][0]), int(person[skeleton[0]][1])),
                                 (int(person[skeleton[1]][0]), int(person[skeleton[1]][1])),
                                 skeleton[2], 2)

            # Print time (inference + NMS + estimation)
            print('%sDone. (%.3fs)' % (s, t4 - t1))

            # Stream results
            if view_img:
                detect_text = "Detect FPS:{0:0>5.2f}/{1:0>6.2f}ms".format(detect_fps, detect_time * 1000)
                estimate_text = "Estimate FPS:{0:0>5.2f}/{1:0>6.2f}ms".format(estimate_fps, estimate_time * 1000)
                cv2.putText(im0, detect_text, (10, 30),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2, cv2.LINE_AA)
                cv2.putText(im0, estimate_text, (10, 60),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2, cv2.LINE_AA)
                cv2.namedWindow("2D Human Pose Estimation", cv2.WINDOW_NORMAL)
                cv2.imshow("2D Human Pose Estimation", im0)
                if cv2.waitKey(1) & 0xFF == ord('q'):  # q to quit
                    return  # goto .mainFunc

            # Save results (image with detections)
            if save_img:
                if dataset.mode == 'images':
                    cv2.imwrite(save_path, im0)
                else:
                    if vid_path != save_path:  # new video
                        vid_path = save_path
                        if isinstance(vid_writer, cv2.VideoWriter):
                            vid_writer.release()  # release previous video writer

                        fourcc = 'mp4v'  # output video codec
                        fps = vid_cap.get(cv2.CAP_PROP_FPS)
                        w = int(vid_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
                        h = int(vid_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
                        vid_writer = cv2.VideoWriter(save_path, cv2.VideoWriter_fourcc(*fourcc), fps, (w, h))
                    vid_writer.write(im0)

        # Print time (inference + NMS + estimation + 2d filtering)
        all_process_time = t6 - t1
        all_process_fps = 1.0 / all_process_time
        print("All Process Time: {}, All Process FPS: {}".format(all_process_time, all_process_fps))
        print("-" * 30)

    # Goto label
    # label .mainFunc

    # Print saving results
    if save_txt or save_img:
        print('Results saved to %s' % save_dir)

    # Release video reader and writer, then destroy all opencv windows
    dataset.vid_cap.release()
    vid_writer.release()
    cv2.destroyAllWindows()
    print('Present 2D Human Pose Inference Done. Total Time:(%.3f seconds)' % (time.time() - t0))
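# --- Illustrative sketch only, not part of the original source: a minimal CLI entry point ---
# for mainFunc. The flag names are inferred from the attributes mainFunc reads on `args`;
# the defaults are assumptions and would need to match the real project configuration.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='2D human pose inference (YOLOv5 + ResNet)')
    parser.add_argument('--config', type=str, required=True, help='path to the experiment config file')
    parser.add_argument('--weights', type=str, default='yolov5s.pt', help='yolov5 weights path')
    parser.add_argument('--source', type=str, default='0', help='file/folder, URL, or webcam index')
    parser.add_argument('--img-size', type=int, default=640, help='inference size (pixels)')
    parser.add_argument('--conf-thres', type=float, default=0.25, help='object confidence threshold')
    parser.add_argument('--iou-thres', type=float, default=0.45, help='IoU threshold for NMS')
    parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
    parser.add_argument('--view-img', action='store_true', help='display results')
    parser.add_argument('--save-txt', action='store_true', help='save results to *.txt')
    parser.add_argument('--save-conf', action='store_true', help='save confidences in --save-txt labels')
    parser.add_argument('--classes', nargs='+', type=int, help='filter detections by class index')
    parser.add_argument('--agnostic-nms', action='store_true', help='class-agnostic NMS')
    parser.add_argument('--augment', action='store_true', help='augmented inference')
    parser.add_argument('--save-dir', type=str, default='runs/detect', help='directory to save results')
    parser.add_argument('--name', type=str, default='', help='run name appended to save_dir')
    parser.add_argument('--local_rank', type=int, default=0, help='local rank for distributed runs')
    args = parser.parse_args()

    with torch.no_grad():
        mainFunc(args)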
def detect(save_img=False):
    out, source, weights, view_img, save_txt, imgsz = \
        opt.output, opt.source, opt.weights, opt.view_img, opt.save_txt, opt.img_size
    webcam = source.isnumeric() or source.startswith(
        'rtsp') or source.startswith('http') or source.endswith('.txt')

    # Initialize
    set_logging()
    device = select_device(opt.device)
    if os.path.exists(out):
        shutil.rmtree(out)  # delete output folder
    os.makedirs(out)  # make new output folder
    half = device.type != 'cpu'  # half precision only supported on CUDA

    # Load model
    model = attempt_load(weights, map_location=device)  # load FP32 model
    imgsz = check_img_size(imgsz, s=model.stride.max())  # check img_size
    if half:
        model.half()  # to FP16

    # Second-stage classifier
    classify = False
    if classify:
        modelc = load_classifier(name='resnet101', n=2)  # initialize
        modelc.load_state_dict(
            torch.load('weights/resnet101.pt', map_location=device)['model'])  # load weights
        modelc.to(device).eval()

    # Set Dataloader
    vid_path, vid_writer = None, None
    if webcam:
        view_img = True
        cudnn.benchmark = True  # set True to speed up constant image size inference
        dataset = LoadStreams(source, img_size=imgsz)
    else:
        save_img = True
        dataset = LoadImages(source, img_size=imgsz)

    # Get names and colors
    names = model.module.names if hasattr(model, 'module') else model.names
    colors = [[random.randint(0, 255) for _ in range(3)] for _ in range(len(names))]

    # Run inference
    t0 = time.time()
    img = torch.zeros((1, 3, imgsz, imgsz), device=device)  # init img
    _ = model(img.half() if half else img) if device.type != 'cpu' else None  # run once
    for path, img, im0s, vid_cap in dataset:
        img = torch.from_numpy(img).to(device)
        img = img.half() if half else img.float()  # uint8 to fp16/32
        img /= 255.0  # 0 - 255 to 0.0 - 1.0
        if img.ndimension() == 3:
            img = img.unsqueeze(0)

        # Inference
        t1 = time_synchronized()
        pred = model(img, augment=opt.augment)[0]

        # Apply NMS
        pred = non_max_suppression(pred, opt.conf_thres, opt.iou_thres,
                                   classes=opt.classes, agnostic=opt.agnostic_nms)
        t2 = time_synchronized()

        # Apply Classifier
        if classify:
            pred = apply_classifier(pred, modelc, img, im0s)

        # Process detections
        for i, det in enumerate(pred):  # detections per image
            if webcam:  # batch_size >= 1
                p, s, im0 = path[i], '%g: ' % i, im0s[i].copy()
            else:
                p, s, im0 = path, '', im0s

            save_path = str(Path(out) / Path(p).name)
            txt_path = str(Path(out) / Path(p).stem) + (
                '_%g' % dataset.frame if dataset.mode == 'video' else '')
            s += '%gx%g ' % img.shape[2:]  # print string
            gn = torch.tensor(im0.shape)[[1, 0, 1, 0]]  # normalization gain whwh
            if det is not None and len(det):
                # Rescale boxes from img_size to im0 size
                det[:, :4] = scale_coords(img.shape[2:], det[:, :4], im0.shape).round()

                # Print results
                for c in det[:, -1].unique():
                    n = (det[:, -1] == c).sum()  # detections per class
                    s += '%g %ss, ' % (n, names[int(c)])  # add to string

                # Write results
                for *xyxy, conf, cls in reversed(det):
                    if save_txt:  # Write to file
                        xywh = (xyxy2xywh(torch.tensor(xyxy).view(1, 4)) / gn).view(-1).tolist()  # normalized xywh
                        with open(txt_path + '.txt', 'a') as f:
                            f.write(('%g ' * 5 + '\n') % (cls, *xywh))  # label format

                    if save_img or view_img:  # Add bbox to image
                        label = '%s %.2f' % (names[int(cls)], conf)
                        plot_one_box(xyxy, im0, label=label, color=colors[int(cls)], line_thickness=3)

            # Print time (inference + NMS)
            print('%sDone. (%.3fs)' % (s, t2 - t1))

            # Stream results
            if view_img:
                # cv2.imshow(p, im0)
                cv2.imwrite("C:/Users/lenovo/Desktop/server/output/camera.jpg", im0)
                if cv2.waitKey(1) == ord('q'):  # q to quit
                    raise StopIteration

            # Save results (image with detections)
            if save_img:
                if dataset.mode == 'images':
                    cv2.imwrite(save_path, im0)
                else:
                    if vid_path != save_path:  # new video
                        vid_path = save_path
                        if isinstance(vid_writer, cv2.VideoWriter):
                            vid_writer.release()  # release previous video writer

                        fourcc = 'mp4v'  # output video codec
                        fps = vid_cap.get(cv2.CAP_PROP_FPS)
                        w = int(vid_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
                        h = int(vid_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
                        # vid_writer = cv2.VideoWriter(save_path, cv2.VideoWriter_fourcc(*fourcc), fps, (w, h))
                        vid_writer = cv2.VideoWriter(
                            save_path, cv2.VideoWriter_fourcc('X', '2', '6', '4'), fps, (w, h))
                    vid_writer.write(im0)

    if save_txt or save_img:
        print('Results saved to %s' % Path(out))
        if platform.system() == 'Darwin' and not opt.update:  # MacOS
            os.system('open ' + save_path)

    print('Done. (%.3fs)' % (time.time() - t0))
def detect(save_img=False):
    out, source, weights, view_img, save_txt, imgsz = \
        opt.output, opt.source, opt.weights, opt.view_img, opt.save_txt, opt.img_size
    webcam = source.isnumeric() or source.startswith(
        'rtsp') or source.startswith('http') or source.endswith('.txt')

    # Initialize
    set_logging()
    device = select_device(opt.device)
    if os.path.exists(out):
        shutil.rmtree(out)  # delete output folder
    os.makedirs(out)  # make new output folder
    half = device.type != 'cpu'  # half precision only supported on CUDA

    # Load model
    model = attempt_load(weights, map_location=device)  # load FP32 model
    imgsz = check_img_size(imgsz, s=model.stride.max())  # check img_size
    if half:
        model.half()  # to FP16

    # Second-stage classifier
    classify = False
    if classify:
        modelc = load_classifier(name='resnet101', n=2)  # initialize
        modelc.load_state_dict(
            torch.load('weights/resnet101.pt', map_location=device)['model'])  # load weights
        modelc.to(device).eval()

    # Set Dataloader
    vid_path, vid_writer = None, None
    if webcam:
        view_img = True
        cudnn.benchmark = True  # set True to speed up constant image size inference
        dataset = LoadStreams(source, img_size=imgsz)
    else:
        save_img = True
        dataset = LoadImages(source, img_size=imgsz)

    # Get names and colors
    names = model.module.names if hasattr(model, 'module') else model.names
    colors = [[random.randint(0, 255) for _ in range(3)] for _ in range(len(names))]

    # Find index corresponding to a person
    idx_person = names.index("person")

    # SORT: initialize the tracker
    mot_tracker = sort_module.Sort(max_age=opt.max_age,
                                   min_hits=opt.min_hits,
                                   iou_threshold=opt.iou_threshold)

    # Run inference
    t0 = time.time()
    img = torch.zeros((1, 3, imgsz, imgsz), device=device)  # init img
    _ = model(img.half() if half else img) if device.type != 'cpu' else None  # run once
    for path, img, im0s, vid_cap in dataset:
        img = torch.from_numpy(img).to(device)
        img = img.half() if half else img.float()  # uint8 to fp16/32
        img /= 255.0  # 0 - 255 to 0.0 - 1.0
        if img.ndimension() == 3:
            img = img.unsqueeze(0)

        # Inference
        t1 = time_synchronized()
        pred = model(img, augment=opt.augment)[0]

        # Apply NMS
        pred = non_max_suppression(pred, opt.conf_thres, opt.iou_thres,
                                   classes=opt.classes, agnostic=opt.agnostic_nms)
        t2 = time_synchronized()

        # Apply Classifier
        if classify:
            pred = apply_classifier(pred, modelc, img, im0s)

        # Process detections
        for i, det in enumerate(pred):  # detections per image
            if webcam:  # batch_size >= 1
                p, s, im0 = path[i], '%g: ' % i, im0s[i].copy()
            else:
                p, s, im0 = path, '', im0s

            save_path = str(Path(out) / Path(p).name)
            txt_path = str(Path(out) / Path(p).stem) + (
                '_%g' % dataset.frame if dataset.mode == 'video' else '')
            s += '%gx%g ' % img.shape[2:]  # print string
            gn = torch.tensor(im0.shape)[[1, 0, 1, 0]]  # normalization gain whwh
            if det is not None and len(det):
                # Rescale boxes from img_size to im0 size
                det[:, :4] = scale_coords(img.shape[2:], det[:, :4], im0.shape).round()

                # Print results
                for c in det[:, -1].unique():
                    n = (det[:, -1] == c).sum()  # detections per class
                    s += '%g %ss, ' % (n, names[int(c)])  # add to string

                # SORT: number of people detected
                idxs_ppl = (det[:, -1] == idx_person).nonzero(as_tuple=False).squeeze(
                    dim=1)  # 1. List of indices with 'person' class detections
                dets_ppl = det[idxs_ppl, :-1].to("cpu")  # 2. Torch.tensor with 'person' detections
                print('\n {} people were detected!'.format(len(idxs_ppl)))

                # SORT: feed detections to the tracker
                if len(dets_ppl) != 0:
                    trackers = mot_tracker.update(dets_ppl)
                    for d in trackers:
                        plot_one_box(d[:-1], im0, label='ID' + str(int(d[-1])), color=colors[1], line_thickness=1)

            # Print time (inference + NMS)
            print('%sDone. (%.3fs)' % (s, t2 - t1))

            # Stream results
            if view_img:
                cv2.imshow(p, im0)
                if cv2.waitKey(1) == ord('q'):  # q to quit
                    raise StopIteration

            # Save results (image with detections)
            if save_img:
                if dataset.mode == 'images':
                    cv2.imwrite(save_path, im0)
                else:
                    if vid_path != save_path:  # new video
                        vid_path = save_path
                        if isinstance(vid_writer, cv2.VideoWriter):
                            vid_writer.release()  # release previous video writer

                        fourcc = 'mp4v'  # output video codec
                        fps = vid_cap.get(cv2.CAP_PROP_FPS)
                        w = int(vid_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
                        h = int(vid_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
                        vid_writer = cv2.VideoWriter(save_path, cv2.VideoWriter_fourcc(*fourcc), fps, (w, h))
                    vid_writer.write(im0)

    if save_txt or save_img:
        print('Results saved to %s' % Path(out))
        if platform.system() == 'Darwin' and not opt.update:  # MacOS
            os.system('open ' + save_path)

    print('Done. (%.3fs)' % (time.time() - t0))
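# --- Illustrative sketch only, not part of the original source: the SORT update contract ---
# assumed above. The tracker is fed rows of [x1, y1, x2, y2, score] and, in the reference
# SORT implementation, returns rows of [x1, y1, x2, y2, track_id]; that is why d[:-1] is
# drawn as the box and int(d[-1]) is used as the identity label.
def _sort_update_example(mot_tracker):
    import numpy as np
    dets = np.array([[100., 150., 220., 400., 0.91],   # one 'person' detection per row
                     [300., 160., 380., 390., 0.78]])
    tracks = mot_tracker.update(dets)                  # shape (N, 5): x1, y1, x2, y2, id
    for *xyxy, track_id in tracks:
        print('track', int(track_id), 'box', xyxy)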
def detect(save_img=False):
    out, source, weights, view_img, save_txt, imgsz = \
        opt.output, opt.source, opt.weights, opt.view_img, opt.save_txt, opt.img_size
    webcam = source.isnumeric() or source.endswith(
        '.txt') or source.lower().startswith(
            ('rtsp://', 'rtmp://', 'http://')) or source.lower().startswith('intel')

    # Initialize
    set_logging()
    device = select_device(opt.device)
    folder_main = out.split('/')[0]
    if os.path.exists(out):
        shutil.rmtree(out)  # delete output folder
    folder_features = folder_main + '/features'
    if os.path.exists(folder_features):
        shutil.rmtree(folder_features)  # delete features output folder
    folder_crops = folder_main + '/image_crops'
    if os.path.exists(folder_crops):
        shutil.rmtree(folder_crops)  # delete output folder with object crops
    os.makedirs(out)  # make new output folder
    os.makedirs(folder_features)  # make new features output folder
    os.makedirs(folder_crops)  # make new crops output folder
    half = device.type != 'cpu'  # half precision only supported on CUDA

    # Load model
    model = torch.load(weights[0], map_location=device)['model'].float()  # load to FP32
    model.to(device).eval()
    imgsz = check_img_size(imgsz, s=model.stride.max())  # check img_size
    if half:
        model.half()  # to FP16

    # Second-stage classifier
    classify = False
    if classify:
        modelc = load_classifier(name='resnet101', n=2)  # initialize
        modelc.load_state_dict(
            torch.load('weights/resnet101.pt', map_location=device)['model'])  # load weights
        modelc.to(device).eval()

    # Set Dataloader
    vid_path, vid_writer = None, None
    if webcam:
        view_img = True
        cudnn.benchmark = True  # set True to speed up constant image size inference
        if source.lower().startswith('intel'):
            dataset = LoadRealSense2()
            save_img = True
        else:
            dataset = LoadStreams(source, img_size=imgsz)
    else:
        save_img = True
        dataset = LoadImages(source, img_size=imgsz)

    # Get names and colors
    names = model.module.names if hasattr(model, 'module') else model.names
    colors = [[random.randint(0, 255) for _ in range(3)] for _ in range(len(names))]

    # frames per second
    # TODO if use intel or if use given footage
    fps = 30  # dataset.cap.get(cv2.CAP_PROP_FPS)
    critical_time_frames = opt.time * fps

    # COUNTER: initialization
    counter = VoteCounter(critical_time_frames, fps)
    print('CRITICAL TIME IS ', opt.time, 'sec, or ', counter.critical_time, ' frames')

    # Find index corresponding to a person
    idx_person = names.index("person")

    # Deep SORT: initialize the tracker
    cfg = get_config()
    cfg.merge_from_file(opt.config_deepsort)
    deepsort = DeepSort(cfg.DEEPSORT.REID_CKPT,
                        max_dist=cfg.DEEPSORT.MAX_DIST,
                        min_confidence=cfg.DEEPSORT.MIN_CONFIDENCE,
                        nms_max_overlap=cfg.DEEPSORT.NMS_MAX_OVERLAP,
                        max_iou_distance=cfg.DEEPSORT.MAX_IOU_DISTANCE,
                        max_age=cfg.DEEPSORT.MAX_AGE,
                        n_init=cfg.DEEPSORT.N_INIT,
                        nn_budget=cfg.DEEPSORT.NN_BUDGET,
                        use_cuda=True)

    # AlphaPose: initialization
    # args_p = update_config(opt.config_alphapose)
    # cfg_p = update_config(args_p.ALPHAPOSE.cfg)
    #
    # args_p.ALPHAPOSE.tracking = args_p.ALPHAPOSE.pose_track or args_p.ALPHAPOSE.pose_flow
    #
    # demo = SingleImageAlphaPose(args_p.ALPHAPOSE, cfg_p, device)
    # output_pose = opt.output.split('/')[0] + '/pose'
    # if not os.path.exists(output_pose):
    #     os.mkdir(output_pose)

    # Run inference
    t0 = time.time()
    img = torch.zeros((1, 3, imgsz, imgsz), device=device)  # init img
    _ = model(img.half() if half else img) if device.type != 'cpu' else None  # run once
    for path, img, im0s, vid_cap in dataset:
        img = torch.from_numpy(img).to(device)
        img = img.half() if half else img.float()  # uint8 to fp16/32
        img /= 255.0  # 0 - 255 to 0.0 - 1.0
        if img.ndimension() == 3:
            img = img.unsqueeze(0)

        # Inference
        t1 = time_synchronized()
        pred = model(img, augment=opt.augment)[0]

        # Apply NMS
        pred = non_max_suppression(pred, opt.conf_thres, opt.iou_thres,
                                   classes=opt.classes, agnostic=opt.agnostic_nms)
        t2 = time_synchronized()

        # TODO => COUNTER: draw queueing ROI
        # compute urn centroid (1st frame only) and plot a bounding box around it
        # if dataset.frame == 1:
        #     counter.read_urn_coordinates(opt.urn, im0s, opt.radius)
        # counter.plot_urn_bbox(im0s)

        # Apply Classifier
        if classify:
            pred = apply_classifier(pred, modelc, img, im0s)

        # Process detections
        for i, det in enumerate(pred):  # detections per image
            if webcam:  # batch_size >= 1
                if source.lower().startswith('intel'):
                    p, s, im0, frame = path, '%g: ' % i, im0s[i].copy(), dataset.count
                else:
                    p, s, im0 = path[i], '%g: ' % i, im0s[i].copy()
            else:
                p, s, im0 = path, '', im0s

            save_path = str(Path(out) / Path(p).name)
            print(save_path)
            txt_path = str(Path(out) / Path(p).stem) + (
                '_%g' % dataset.frame if dataset.mode == 'video' else '')
            s += '%gx%g ' % img.shape[2:]  # print string
            gn = torch.tensor(im0.shape)[[1, 0, 1, 0]]  # normalization gain whwh
            if det is not None and len(det):
                # Rescale boxes from img_size to im0 size
                det[:, :4] = scale_coords(img.shape[2:], det[:, :4], im0.shape).round()

                # Print results
                for c in det[:, -1].unique():
                    n = (det[:, -1] == c).sum()  # detections per class
                    s += '%g %ss, ' % (n, names[int(c)])  # add to string

                # Deep SORT: person class only
                idxs_ppl = (det[:, -1] == idx_person).nonzero(as_tuple=False).squeeze(
                    dim=1)  # 1. List of indices with 'person' class detections
                dets_ppl = det[idxs_ppl, :-1]  # 2. Torch.tensor with 'person' detections
                print('\n {} people were detected!'.format(len(idxs_ppl)))

                # Deep SORT: convert data into a proper format
                xywhs = xyxy2xywh(dets_ppl[:, :-1]).to("cpu")
                confs = dets_ppl[:, 4].to("cpu")

                # Deep SORT: feed detections to the tracker
                if len(dets_ppl) != 0:
                    trackers, features = deepsort.update(xywhs, confs, im0)
                    # tracks inside a critical sphere
                    trackers_inside = []
                    for i, d in enumerate(trackers):
                        plot_one_box(d[:-1], im0, label='ID' + str(int(d[-1])), color=colors[1], line_thickness=1)
                        # TODO: queue COUNTER
                        # d_include = counter.centroid_distance(d, im0, colors[1], dataset.frame)
                        # if d_include:
                        #     trackers_inside.append(d)

                    # ALPHAPOSE: show skeletons for bounding boxes inside the critical sphere
                    # if len(trackers_inside) > 0:
                    #     pose = demo.process('frame_' + str(dataset.frame), im0, trackers_inside)
                    #     im0 = demo.vis(im0, pose)
                    #     demo.writeJson([pose], output_pose, form=args_p.ALPHAPOSE.format, for_eval=args_p.ALPHAPOSE.eval)
                    #
                    #     counter.save_features_and_crops(im0, dataset.frame, trackers_inside, features, folder_main)

            cv2.putText(im0, 'Voted ' + str(len(counter.voters_count)), (50, 50),
                        cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 0, 255), 2)

            print('NUM VOTERS', len(counter.voters))
            print(list(counter.voters.keys()))

            # COUNTER
            if len(counter.voters) > 0:
                counter.save_voter_trajectory(dataset.frame, folder_main)

            # Print time (inference + NMS)
            print('%sDone. (%.3fs)' % (s, t2 - t1))

            # Stream results
            if view_img:
                cv2.imshow(p, im0)
                if cv2.waitKey(1) == ord('q'):  # q to quit
                    raise StopIteration

            # Save results (image with detections)
            if save_img:
                if dataset.mode == 'images':
                    cv2.imwrite(save_path, im0)
                else:
                    if vid_path != save_path:  # new video
                        vid_path = save_path
                        if isinstance(vid_writer, cv2.VideoWriter):
                            vid_writer.release()  # release previous video writer

                        fourcc = 'mp4v'  # output video codec
                        if type(vid_cap) is dict:  # RealSense stream has no cv2.VideoCapture
                            # TODO hard code
                            w, h, fps = 640, 480, 6
                        else:
                            fps = vid_cap.get(cv2.CAP_PROP_FPS)
                            w = int(vid_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
                            h = int(vid_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
                        vid_writer = cv2.VideoWriter(save_path, cv2.VideoWriter_fourcc(*fourcc), fps, (w, h))
                    vid_writer.write(im0)

    if save_txt or save_img:
        print('Results saved to %s' % Path(out))
        if platform.system() == 'Darwin' and not opt.update:  # MacOS
            os.system('open ' + save_path)

    print('Done. (%.3fs)' % (time.time() - t0))
def detect(opt, save_img=False):
    out, source, weights, view_img, save_txt, imgsz = \
        opt.output, opt.source, opt.weights, opt.view_img, opt.save_txt, opt.img_size
    webcam = source == '0' or source.startswith('rtsp') or source.startswith(
        'http') or source.endswith('.txt')

    # initialize deepsort
    cfg = get_config()
    cfg.merge_from_file(opt.config_deepsort)
    deepsort = DeepSort(cfg.DEEPSORT.REID_CKPT,
                        max_dist=cfg.DEEPSORT.MAX_DIST,
                        min_confidence=cfg.DEEPSORT.MIN_CONFIDENCE,
                        nms_max_overlap=cfg.DEEPSORT.NMS_MAX_OVERLAP,
                        max_iou_distance=cfg.DEEPSORT.MAX_IOU_DISTANCE,
                        max_age=cfg.DEEPSORT.MAX_AGE,
                        n_init=cfg.DEEPSORT.N_INIT,
                        nn_budget=cfg.DEEPSORT.NN_BUDGET,
                        use_cuda=True)

    # Initialize
    device = select_device(opt.device)
    if os.path.exists(out):
        shutil.rmtree(out)  # delete output folder
    os.makedirs(out)  # make new output folder
    half = device.type != 'cpu'  # half precision only supported on CUDA

    # Read Yaml
    with open(opt.data) as f:
        data_dict = yaml.load(f, Loader=yaml.FullLoader)
    names = data_dict['names']
    print(names)

    # Load model
    # google_utils.attempt_download(weights)
    model = torch.load(weights, map_location=device)['model'].float()  # load to FP32
    # model = torch.save(torch.load(weights, map_location=device), weights)  # update model if SourceChangeWarning
    # model.fuse()
    model.to(device).eval()
    if half:
        model.half()  # to FP16

    # Second-stage classifier
    classify = False
    if classify:
        modelc = torch_utils.load_classifier(name='resnet101', n=2)  # initialize
        modelc.load_state_dict(
            torch.load('weights/resnet101.pt', map_location=device)['model'])  # load weights
        modelc.to(device).eval()

    # Set Dataloader
    vid_path, vid_writer = None, None
    if webcam:
        view_img = True
        cudnn.benchmark = True  # set True to speed up constant image size inference
        dataset = LoadStreams(source, img_size=imgsz)
    else:
        save_img = True
        dataset = LoadImages(source, img_size=imgsz)

    # Get names and colors
    names = model.module.names if hasattr(model, 'module') else model.names
    colors = [[random.randint(0, 255) for _ in range(3)] for _ in range(len(names))]

    # Run inference
    t_fps = time_synchronized()
    frame_fps = 0
    img = torch.zeros((1, 3, imgsz, imgsz), device=device)  # init img
    _ = model(img.half() if half else img) if device.type != 'cpu' else None  # run once
    for path, img, im0s, vid_cap in dataset:
        img = torch.from_numpy(img).to(device)
        img = img.half() if half else img.float()  # uint8 to fp16/32
        img /= 255.0  # 0 - 255 to 0.0 - 1.0
        if img.ndimension() == 3:
            img = img.unsqueeze(0)

        # Inference
        t1 = time_synchronized()
        pred = model(img, augment=opt.augment)[0]

        # Apply NMS
        pred = non_max_suppression(pred, opt.conf_thres, opt.iou_thres,
                                   classes=opt.classes, agnostic=opt.agnostic_nms)
        t2 = time_synchronized()

        # Apply Classifier
        if classify:
            pred = apply_classifier(pred, modelc, img, im0s)

        # Process detections
        for i, det in enumerate(pred):  # detections per image
            if webcam:  # batch_size >= 1
                p, s, im0 = path[i], '%g: ' % i, im0s[i].copy()
            else:
                p, s, im0 = path, '', im0s

            save_path = str(Path(out) / Path(p).name)
            txt_path = str(Path(out) / Path(p).stem) + (
                '_%g' % dataset.frame if dataset.mode == 'video' else '')
            s += '%gx%g ' % img.shape[2:]  # print string
            gn = torch.tensor(im0.shape)[[1, 0, 1, 0]]  # normalization gain whwh
            if det is not None and len(det):
                # Rescale boxes from img_size to im0 size
                det[:, :4] = scale_coords(img.shape[2:], det[:, :4], im0.shape).round()

                # Print results
                # print(det[:, -1].unique())
                for c in det[:, -1].unique():
                    n = (det[:, -1] == c).sum()  # detections per class
                    s += '%g %ss, ' % (n, names[int(c)])  # add to string

                bbox_xywh = []
                confs = []
                # clses = []

                # Write results
                for *xyxy, conf, cls in det:
                    img_h, img_w, _ = im0.shape  # get image shape
                    x_c, y_c, bbox_w, bbox_h = bbox_rel(img_w, img_h, *xyxy)
                    obj = [x_c, y_c, bbox_w, bbox_h]
                    bbox_xywh.append(obj)
                    confs.append([conf.item()])
                    # clses.append([cls.item()])
                    # outputs, clses2 = deepsort.update((torch.Tensor(bbox_xywh)), (torch.Tensor(confs)), (torch.Tensor(clses)), im0)
                    # outputs = deepsort.update((torch.Tensor(bbox_xywh)), (torch.Tensor(confs)), im0)

                    if save_img or view_img:  # Add bbox to image
                        label = '%s %.2f' % (names[int(cls)], conf)
                        plot_one_box(xyxy, im0, label=label, color=colors[int(cls)], line_thickness=3)

                xywhs = torch.Tensor(bbox_xywh)
                confss = torch.Tensor(confs)

                # Pass detections to deepsort
                outputs = deepsort.update(xywhs, confss, im0)

                # draw boxes for visualization
                if len(outputs) > 0:
                    bbox_tlwh = []
                    bbox_xyxy = outputs[:, :4]
                    identities = outputs[:, -1]
                    ori_im = draw_boxes(im0, bbox_xyxy, identities)

            # Print time (inference + NMS)
            # print('%sDone. (%.3fs)' % (s, t2 - t1))
            print('%sDone.' % s)

            # Stream results
            if view_img:
                cv2.imshow(p, im0)
                if cv2.waitKey(1) == ord('q'):  # q to quit
                    raise StopIteration

            # Save results (image with detections)
            if save_img:
                if dataset.mode == 'images':
                    cv2.imwrite(save_path, im0)
                else:
                    if vid_path != save_path:  # new video
                        vid_path = save_path
                        if isinstance(vid_writer, cv2.VideoWriter):
                            vid_writer.release()  # release previous video writer

                        fps = vid_cap.get(cv2.CAP_PROP_FPS)
                        w = int(vid_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
                        h = int(vid_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
                        vid_writer = cv2.VideoWriter(
                            save_path, cv2.VideoWriter_fourcc(*opt.fourcc), fps, (w, h))
                    vid_writer.write(im0)

        frame_fps += 1
        if (time_synchronized() - t_fps) >= 1:
            print('\n')
            print('FPS=%.2f' % frame_fps)
            t_fps = time_synchronized()
            frame_fps = 0
        # print('FPS=%.2f' % (1 / (time_synchronized() - t1)))

    if save_txt or save_img:
        print('Results saved to %s' % os.getcwd() + os.sep + out)
        if platform == 'darwin':  # MacOS
            os.system('open ' + save_path)
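# --- Illustrative sketch only, not part of the original source: one plausible bbox_rel helper. ---
# deepsort.update() above is fed center-x, center-y, width, height boxes, so bbox_rel is assumed
# to convert a corner-format (x1, y1, x2, y2) box into that format. The img_w/img_h parameters
# are accepted only to match the call site; whether the real helper uses them (e.g. for clipping
# or normalization) is an assumption.
def bbox_rel(img_w, img_h, *xyxy):
    x1, y1, x2, y2 = (float(v) for v in xyxy)
    bbox_w = abs(x2 - x1)
    bbox_h = abs(y2 - y1)
    x_c = min(x1, x2) + bbox_w / 2  # box centre, in pixels
    y_c = min(y1, y2) + bbox_h / 2
    return x_c, y_c, bbox_w, bbox_h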