class WebcamDetectionLoader:
    """Read webcam frames on a background thread and run YOLO detection on them.

    Results are pushed onto a bounded LIFO queue as
    ``(inp, orig_img, boxes, scores)`` tuples; ``boxes`` and ``scores`` are
    ``None`` when nothing was detected in the batch.  When the queue is full
    it is emptied before the new result is stored, so readers always get the
    most recent frames.
    """

    def __init__(self, webcam, batchSize=1, queueSize=256):
        # Detector network; config/weights paths are relative to the CWD.
        self.det_model = Darknet("yolo/cfg/yolov3-spp.cfg")
        self.det_model.load_weights('models/yolo/yolov3-spp.weights')
        self.det_model.net_info['height'] = opt.inp_dim
        self.det_inp_dim = int(self.det_model.net_info['height'])
        assert self.det_inp_dim % 32 == 0
        assert self.det_inp_dim > 32
        self.det_model.cuda()
        self.det_model.eval()

        self.stream = cv2.VideoCapture(webcam)
        assert self.stream.isOpened(), 'Cannot open webcam'
        # Set by stop(); checked by update() once per batch.
        self.stopped = False
        self.batchSize = batchSize
        # Newest-first queue holding the detection results.
        self.Q = LifoQueue(maxsize=queueSize)

    def len(self):
        """Return the number of results currently queued."""
        return self.Q.qsize()

    def start(self):
        """Start the daemon worker thread and return ``self``."""
        t = Thread(target=self.update, args=())
        t.daemon = True
        t.start()
        return self

    def _put_latest(self, item):
        """Queue *item*, discarding all older results first if the queue is full."""
        if self.Q.full():
            with self.Q.mutex:
                self.Q.queue.clear()
        self.Q.put(item)

    def update(self):
        """Worker loop: grab a batch, detect, queue the results.

        Runs until ``stop()`` has been called.
        """
        while not self.stopped:
            inp_dim = int(opt.inp_dim)
            img = []
            inp = []
            orig_img = []
            im_dim_list = []
            for _ in range(self.batchSize):
                grabbed, frame = self.stream.read()
                if not grabbed:
                    continue
                img_k, orig_img_k, im_dim_list_k = prep_frame(frame, inp_dim)
                img.append(img_k)
                inp.append(im_to_torch(orig_img_k))
                orig_img.append(orig_img_k)
                im_dim_list.append(im_dim_list_k)

            if not inp:
                # No frame could be grabbed for this batch; nothing to detect.
                continue

            with torch.no_grad():
                # Human detection
                batch = torch.cat(img).cuda()
                im_dim_list = torch.FloatTensor(im_dim_list).repeat(1, 2)
                im_dim_list = im_dim_list.cuda()

                prediction = self.det_model(batch, CUDA=True)
                # NMS process
                dets = dynamic_write_results(prediction, opt.confidence,
                                             opt.num_classes, nms=True,
                                             nms_conf=opt.nms_thesh)
                if isinstance(dets, int) or dets.shape[0] == 0:
                    for k in range(len(inp)):
                        self._put_latest((inp[k], orig_img[k], None, None))
                    continue

                # Column 0 of dets is the index of the image inside the batch.
                im_dim_list = torch.index_select(im_dim_list, 0,
                                                 dets[:, 0].long())
                scaling_factor = torch.min(self.det_inp_dim / im_dim_list,
                                           1)[0].view(-1, 1)

                # Coordinate transfer from network input back to frame size
                # (presumably undoing prep_frame's resize/pad -- confirm).
                dets[:, [1, 3]] -= (self.det_inp_dim - scaling_factor *
                                    im_dim_list[:, 0].view(-1, 1)) / 2
                dets[:, [2, 4]] -= (self.det_inp_dim - scaling_factor *
                                    im_dim_list[:, 1].view(-1, 1)) / 2
                dets[:, 1:5] /= scaling_factor
                for j in range(dets.shape[0]):
                    dets[j, [1, 3]] = torch.clamp(dets[j, [1, 3]], 0.0,
                                                  im_dim_list[j, 0])
                    dets[j, [2, 4]] = torch.clamp(dets[j, [2, 4]], 0.0,
                                                  im_dim_list[j, 1])

                # Move everything to the CPU so that the per-image masks below
                # live on the same device as the tensors they index.
                dets = dets.cpu()
                boxes = dets[:, 1:5]
                scores = dets[:, 5:6]

            for k in range(len(inp)):
                mask = dets[:, 0] == k
                self._put_latest((inp[k], orig_img[k], boxes[mask],
                                  scores[mask]))

    def videoinfo(self):
        """Return ``(fourcc, fps, (width, height))`` as reported by the stream."""
        fourcc = int(self.stream.get(cv2.CAP_PROP_FOURCC))
        fps = self.stream.get(cv2.CAP_PROP_FPS)
        frameSize = (int(self.stream.get(cv2.CAP_PROP_FRAME_WIDTH)),
                     int(self.stream.get(cv2.CAP_PROP_FRAME_HEIGHT)))
        return (fourcc, fps, frameSize)

    def read(self):
        """Return the most recently queued result (blocks while empty)."""
        return self.Q.get()

    def more(self):
        """Return True if at least one result is queued."""
        return self.Q.qsize() > 0

    def stop(self):
        """Ask the worker loop to exit after its current batch."""
        self.stopped = True
class DetectionLoader:
    """Run YOLO detection (on CPU) over batches pulled from an image loader.

    Each processed image is queued as a 7-tuple
    ``(orig_img, im_name, boxes, scores, inps, pt1, pt2)``; the last five
    entries are ``None`` when the image has no detections.  A tuple of seven
    ``None`` values marks the end of the data.
    """

    def __init__(self, dataloder, batchSize=25, queueSize=1024):
        self.det_model = Darknet(
            "joints_detectors/Alphapose/yolo/cfg/yolov3-spp.cfg")
        self.det_model.load_weights(
            'joints_detectors/Alphapose/models/yolo/yolov3-spp.weights')
        self.det_model.net_info['height'] = opt.inp_dim
        self.det_inp_dim = int(self.det_model.net_info['height'])
        assert self.det_inp_dim % 32 == 0
        assert self.det_inp_dim > 32
        # The model is intentionally left on the CPU in this variant.
        self.det_model.eval()

        self.stopped = False
        self.dataloder = dataloder
        self.batchSize = batchSize
        self.datalen = self.dataloder.length()
        leftover = 0
        if (self.datalen) % batchSize:
            leftover = 1
        self.num_batches = self.datalen // batchSize + leftover
        # Result queue: thread-safe in single-process mode, otherwise a
        # multiprocessing queue shared with the worker process.
        if opt.sp:
            self.Q = Queue(maxsize=queueSize)
        else:
            self.Q = mp.Queue(maxsize=queueSize)

    def start(self):
        """Start the worker (thread if ``opt.sp`` else process); return ``self``."""
        if opt.sp:
            t = Thread(target=self.update, name="DetectionLoader", args=())
            t.daemon = True
            t.start()
        else:
            p = mp.Process(target=self.update, args=(), daemon=True)
            p.start()
        return self

    def _put(self, item):
        """Queue *item*, pausing for two seconds first if the queue is full."""
        if self.Q.full():
            time.sleep(2)
        self.Q.put(item)

    def update(self):
        """Worker loop: detect on every batch until the loader returns ``None``."""
        while True:
            sys.stdout.flush()
            print("detection loader len : " + str(self.Q.qsize()))

            img, orig_img, im_name, im_dim_list = self.dataloder.getitem()
            if img is None:
                # End of data: forward the sentinel to the consumer.
                self.Q.put((None, None, None, None, None, None, None))
                return

            with torch.no_grad():
                # Human detection
                prediction = self.det_model(img, CUDA=False)
                # NMS process
                dets = dynamic_write_results(prediction, opt.confidence,
                                             opt.num_classes, nms=True,
                                             nms_conf=opt.nms_thesh)
                if isinstance(dets, int) or dets.shape[0] == 0:
                    # Every image of the batch gets an entry, not only the
                    # first one, so the consumer sees one item per image.
                    for k in range(len(orig_img)):
                        self._put((orig_img[k], im_name[k], None, None, None,
                                   None, None))
                    continue

                dets = dets.cpu()
                # Column 0 of dets is the index of the image inside the batch.
                im_dim_list = torch.index_select(im_dim_list, 0,
                                                 dets[:, 0].long())
                scaling_factor = torch.min(self.det_inp_dim / im_dim_list,
                                           1)[0].view(-1, 1)

                # Coordinate transfer from network input back to image size.
                dets[:, [1, 3]] -= (self.det_inp_dim - scaling_factor *
                                    im_dim_list[:, 0].view(-1, 1)) / 2
                dets[:, [2, 4]] -= (self.det_inp_dim - scaling_factor *
                                    im_dim_list[:, 1].view(-1, 1)) / 2
                dets[:, 1:5] /= scaling_factor
                for j in range(dets.shape[0]):
                    dets[j, [1, 3]] = torch.clamp(dets[j, [1, 3]], 0.0,
                                                  im_dim_list[j, 0])
                    dets[j, [2, 4]] = torch.clamp(dets[j, [2, 4]], 0.0,
                                                  im_dim_list[j, 1])
                boxes = dets[:, 1:5]
                scores = dets[:, 5:6]

            for k in range(len(orig_img)):
                mask = dets[:, 0] == k
                boxes_k = boxes[mask]
                if boxes_k.shape[0] == 0:
                    self._put((orig_img[k], im_name[k], None, None, None,
                               None, None))
                    continue
                # Zero-filled placeholders, one row per detected box.
                inps = torch.zeros(boxes_k.size(0), 3, opt.inputResH,
                                   opt.inputResW)
                pt1 = torch.zeros(boxes_k.size(0), 2)
                pt2 = torch.zeros(boxes_k.size(0), 2)
                self._put((orig_img[k], im_name[k], boxes_k, scores[mask],
                           inps, pt1, pt2))

    def read(self):
        """Return the next queued result (blocks while the queue is empty)."""
        return self.Q.get()

    def len(self):
        """Return the number of queued results."""
        return self.Q.qsize()
class DetectionLoader:
    """Run YOLO detection on the newest batch of a live image loader.

    After each fetch the loader's own queue is cleared, so stale frames are
    skipped.  Results are stored newest-first (LIFO) as 7-tuples
    ``(orig_img, im_name, boxes, scores, inps, pt1, pt2)``; the last five
    entries are ``None`` for images without detections.
    """

    def __init__(self, dataloder, batchSize=1, queueSize=1024):
        self.det_model = Darknet("yolo/cfg/yolov3-spp.cfg")
        self.det_model.load_weights('models/yolo/yolov3-spp.weights')
        self.det_model.net_info['height'] = opt.inp_dim
        self.det_inp_dim = int(self.det_model.net_info['height'])
        assert self.det_inp_dim % 32 == 0
        assert self.det_inp_dim > 32
        self.det_model.cuda()
        self.det_model.eval()

        # Checked by update() before every batch; set via stop().
        self.stopped = False
        self.dataloder = dataloder
        self.batchSize = batchSize
        self.Q = LifoQueue(maxsize=queueSize)

    def start(self):
        """Start the daemon worker thread and return ``self``."""
        t = Thread(target=self.update, args=())
        t.daemon = True
        t.start()
        return self

    def stop(self):
        """Ask the worker loop to exit before it fetches another batch."""
        self.stopped = True

    def _put(self, item):
        """Queue *item*, pausing for two seconds first if the queue is full."""
        if self.Q.full():
            time.sleep(2)
        self.Q.put(item)

    def update(self):
        """Worker loop: fetch, detect and queue until ``stopped`` is set."""
        while not self.stopped:
            img, orig_img, im_name, im_dim_list = self.dataloder.getitem()
            # Drop whatever piled up in the loader while we were busy.
            with self.dataloder.Q.mutex:
                self.dataloder.Q.queue.clear()

            with torch.no_grad():
                # Human detection
                img = img.cuda()
                prediction = self.det_model(img, CUDA=True)
                # NMS process
                dets = dynamic_write_results(prediction, opt.confidence,
                                             opt.num_classes, nms=True,
                                             nms_conf=opt.nms_thesh)
                if isinstance(dets, int) or dets.shape[0] == 0:
                    for k in range(len(orig_img)):
                        self._put((orig_img[k], im_name[k], None, None, None,
                                   None, None))
                    continue

                dets = dets.cpu()
                # Column 0 of dets is the index of the image inside the batch.
                im_dim_list = torch.index_select(im_dim_list, 0,
                                                 dets[:, 0].long())
                scaling_factor = torch.min(self.det_inp_dim / im_dim_list,
                                           1)[0].view(-1, 1)

                # Coordinate transfer from network input back to image size.
                dets[:, [1, 3]] -= (self.det_inp_dim - scaling_factor *
                                    im_dim_list[:, 0].view(-1, 1)) / 2
                dets[:, [2, 4]] -= (self.det_inp_dim - scaling_factor *
                                    im_dim_list[:, 1].view(-1, 1)) / 2
                dets[:, 1:5] /= scaling_factor
                for j in range(dets.shape[0]):
                    dets[j, [1, 3]] = torch.clamp(dets[j, [1, 3]], 0.0,
                                                  im_dim_list[j, 0])
                    dets[j, [2, 4]] = torch.clamp(dets[j, [2, 4]], 0.0,
                                                  im_dim_list[j, 1])
                boxes = dets[:, 1:5]
                scores = dets[:, 5:6]

            for k in range(len(orig_img)):
                mask = dets[:, 0] == k
                boxes_k = boxes[mask]
                if boxes_k.shape[0] == 0:
                    self._put((orig_img[k], im_name[k], None, None, None,
                               None, None))
                    continue
                # Zero-filled placeholders, one row per detected box.
                inps = torch.zeros(boxes_k.size(0), 3, opt.inputResH,
                                   opt.inputResW)
                pt1 = torch.zeros(boxes_k.size(0), 2)
                pt2 = torch.zeros(boxes_k.size(0), 2)
                self._put((orig_img[k], im_name[k], boxes_k, scores[mask],
                           inps, pt1, pt2))

    def read(self):
        """Return the most recently queued result (blocks while empty)."""
        return self.Q.get()

    def len(self):
        """Return the number of queued results."""
        return self.Q.qsize()
"Input resolution of the network. Increase to increase accuracy. Decrease to increase speed", default="416", type=str) return parser.parse_args() cap = cv2.VideoCapture( '/home/gaurav/Desktop/sem6/VR/before_midsem/mini_project/harsh_without_mask.mp4' ) args = arg_parse() confidence = float(args.confidence) nms_thesh = float(args.nms_thresh) num_classes = 80 bbox_attrs = 5 + num_classes yolo = Darknet(args.cfgfile) yolo.load_weights(args.weightsfile) yolo.net_info["height"] = args.reso inp_dim = int(yolo.net_info["height"]) assert inp_dim % 32 == 0 assert inp_dim > 32 yolo.cuda() PATH = '/home/gaurav/Desktop/sem6/VR/before_midsem/mini_project/model/classi4.pkl' net = models.alexnet(pretrained=True) my_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") net.to(my_device) loaded_model = pickle.load(open(PATH, 'rb')) tot_cnt = 0 cnt = 0 out = cv2.VideoWriter('./output_harsh_without_mask.avi',
class DetectionLoader:
    """Run a single-class YOLO detector over batches from an image loader.

    Boxes are scaled from network resolution to image size with independent
    width/height ratios.  Each image is queued as a 7-tuple
    ``(orig_img, im_name, boxes, scores, inps, pt1, pt2)``; the last five
    entries are ``None`` when nothing was detected.  A tuple of seven ``None``
    values is queued when the loader signals the end of the data.
    """

    def __init__(self, dataloder, batchSize=1, queueSize=1024):
        cfg_path = "yolo/cfg/yolov3-single.cfg"
        weights_path = 'models/yolo/01.weights'
        self.det_model = Darknet(cfg_path, reso=int(opt.inp_dim))
        self.det_model.load_weights(weights_path)
        print("Loading cfg from", cfg_path)
        print("Loading weights from", weights_path)
        self.det_model.net_info['height'] = opt.inp_dim
        self.det_inp_dim = int(self.det_model.net_info['height'])
        assert self.det_inp_dim % 32 == 0
        assert self.det_inp_dim > 32
        self.det_model.cuda()
        self.det_model.eval()

        self.stopped = False
        self.dataloder = dataloder
        self.batchSize = batchSize
        self.datalen = self.dataloder.length()
        leftover = 0
        if (self.datalen) % batchSize:
            leftover = 1
        self.num_batches = self.datalen // batchSize + leftover
        # Result queue: thread-safe in single-process mode, otherwise a
        # multiprocessing queue shared with the worker process.
        if opt.sp:
            self.Q = Queue(maxsize=queueSize)
        else:
            self.Q = mp.Queue(maxsize=queueSize)

    def start(self):
        """Start the worker (thread if ``opt.sp`` else process); return ``self``."""
        if opt.sp:
            t = Thread(target=self.update, args=())
            t.daemon = True
            t.start()
        else:
            p = mp.Process(target=self.update, args=())
            p.daemon = True
            p.start()
        return self

    def _put(self, item):
        """Queue *item*, pausing for two seconds first if the queue is full."""
        if self.Q.full():
            time.sleep(2)
        self.Q.put(item)

    def update(self):
        """Worker loop over ``num_batches`` batches of the loader."""
        for _ in range(self.num_batches):
            img, orig_img, im_name, im_dim_list = self.dataloder.getitem()
            if img is None:
                # End of data: forward the sentinel to the consumer.
                self.Q.put((None, None, None, None, None, None, None))
                return

            with torch.no_grad():
                img = img.cuda()
                # Object detection
                prediction = self.det_model(img)
                # NMS process
                dets = dynamic_write_results(prediction, opt.confidence,
                                             opt.num_classes, nms=True,
                                             nms_conf=opt.nms_thesh)
                if isinstance(dets, int) or dets.shape[0] == 0:
                    for k in range(len(orig_img)):
                        self._put((orig_img[k], im_name[k], None, None, None,
                                   None, None))
                    continue

                dets = dets.cpu()
                # Scale boxes from network resolution to image resolution;
                # width and height are scaled independently.
                reso = self.det_inp_dim
                im_dim_list = torch.index_select(im_dim_list, 0,
                                                 dets[:, 0].long())
                w_ratio = im_dim_list[:, 0] / reso
                h_ratio = im_dim_list[:, 1] / reso
                boxes = dets[:, 1:5]  # view: scaling below also updates dets
                boxes[:, 0] = boxes[:, 0] * w_ratio
                boxes[:, 1] = boxes[:, 1] * h_ratio
                boxes[:, 2] = boxes[:, 2] * w_ratio
                boxes[:, 3] = boxes[:, 3] * h_ratio
                scores = dets[:, 5:6]

            for k in range(len(orig_img)):
                mask = dets[:, 0] == k
                boxes_k = boxes[mask]
                if boxes_k.shape[0] == 0:
                    self._put((orig_img[k], im_name[k], None, None, None,
                               None, None))
                    continue
                # Zero-filled placeholders, one row per detected box.
                inps = torch.zeros(boxes_k.size(0), 3, opt.inputResH,
                                   opt.inputResW)
                pt1 = torch.zeros(boxes_k.size(0), 2)
                pt2 = torch.zeros(boxes_k.size(0), 2)
                self._put((orig_img[k], im_name[k], boxes_k, scores[mask],
                           inps, pt1, pt2))

    def read(self):
        """Return the next queued result (blocks while the queue is empty)."""
        return self.Q.get()

    def len(self):
        """Return the number of queued results."""
        return self.Q.qsize()
class DetectionLoader:
    """Run YOLO detection over an indexable dataset on a background thread.

    Results are queued one per image as
    ``(inp, orig_img, im_name, boxes, scores)``; ``boxes`` and ``scores`` are
    ``None`` for every image of a batch in which nothing was detected.
    """

    def __init__(self, dataset, det_model=None, cuda_id=None, batchSize=4,
                 queueSize=256):
        # Build the default detector only when the caller did not supply one;
        # a supplied model is used as-is (not moved to GPU, not put in eval).
        owns_model = det_model is None
        if owns_model:
            det_model = Darknet('yolo/cfg/yolov3.cfg')
            det_model.load_weights('models/yolo/yolov3.weights')
        self.det_model = det_model
        self.det_model.net_info['height'] = opt.inp_dim
        self.det_inp_dim = int(self.det_model.net_info['height'])
        assert self.det_inp_dim % 32 == 0
        assert self.det_inp_dim > 32
        if owns_model:
            self.det_model.cuda()
            self.det_model.eval()

        # GPU index for the input tensors; None selects the current device.
        self.cuda_id = cuda_id
        self.stopped = False
        self.dataset = dataset
        self.batchSize = batchSize
        self.datalen = len(self.dataset)
        leftover = 0
        if (self.datalen) % batchSize:
            leftover = 1
        self.num_batches = self.datalen // batchSize + leftover
        self.Q = Queue(maxsize=queueSize)

    def start(self):
        """Start the daemon worker thread and return ``self``."""
        t = Thread(target=self.update, args=())
        t.daemon = True
        t.start()
        return self

    def _put(self, item):
        """Queue *item*, polling every 0.2 s while the queue is full."""
        while self.Q.full():
            time.sleep(0.2)
        self.Q.put(item)

    def update(self):
        """Worker loop: detect on every batch of the dataset, in order."""
        for i in range(self.num_batches):
            img = []
            inp = []
            orig_img = []
            im_name = []
            im_dim_list = []
            first = i * self.batchSize
            last = min((i + 1) * self.batchSize, self.datalen)
            for k in range(first, last):
                (img_k, inp_k, orig_img_k, im_name_k,
                 im_dim_list_k) = self.dataset[k]
                img.append(img_k)
                inp.append(inp_k)
                orig_img.append(orig_img_k)
                im_name.append(im_name_k)
                im_dim_list.append(im_dim_list_k)

            with torch.no_grad():
                # Human detection.  ``.cuda(None)`` is the same as ``.cuda()``.
                batch = torch.cat(img).cuda(self.cuda_id)
                im_dim_list = torch.FloatTensor(im_dim_list).repeat(1, 2)
                im_dim_list = im_dim_list.cuda(self.cuda_id)

                prediction = self.det_model(batch, CUDA=True)
                # NMS process
                dets = dynamic_write_results(prediction, opt.confidence,
                                             opt.num_classes, nms=True,
                                             nms_conf=opt.nms_thesh,
                                             cuda_id=self.cuda_id)
                if isinstance(dets, int) or dets.shape[0] == 0:
                    for k in range(len(inp)):
                        self._put((inp[k], orig_img[k], im_name[k], None,
                                   None))
                    continue

                # Column 0 of dets is the index of the image inside the batch.
                im_dim_list = torch.index_select(im_dim_list, 0,
                                                 dets[:, 0].long())
                scaling_factor = torch.min(self.det_inp_dim / im_dim_list,
                                           1)[0].view(-1, 1)

                # Coordinate transfer from network input back to image size.
                dets[:, [1, 3]] -= (self.det_inp_dim - scaling_factor *
                                    im_dim_list[:, 0].view(-1, 1)) / 2
                dets[:, [2, 4]] -= (self.det_inp_dim - scaling_factor *
                                    im_dim_list[:, 1].view(-1, 1)) / 2
                dets[:, 1:5] /= scaling_factor
                for j in range(dets.shape[0]):
                    dets[j, [1, 3]] = torch.clamp(dets[j, [1, 3]], 0.0,
                                                  im_dim_list[j, 0])
                    dets[j, [2, 4]] = torch.clamp(dets[j, [2, 4]], 0.0,
                                                  im_dim_list[j, 1])

                # Move to the CPU before masking so that the mask and the
                # tensors it indexes are on the same device.
                dets = dets.cpu()
                boxes = dets[:, 1:5]
                scores = dets[:, 5:6]

            for k in range(len(inp)):
                mask = dets[:, 0] == k
                self._put((inp[k], orig_img[k], im_name[k], boxes[mask],
                           scores[mask]))

    def read(self):
        """Return the next queued result (blocks while the queue is empty)."""
        return self.Q.get()

    def len(self):
        """Return the number of queued results."""
        return self.Q.qsize()
class DetectionLoader:
    """Synchronous YOLO detector that keeps only the best box of an image."""

    def __init__(self, dataloder):
        self.det_model = Darknet("yolo/cfg/yolov3-spp.cfg")
        self.det_model.load_weights('models/yolo/yolov3-spp.weights')
        self.det_model.net_info['height'] = opt.inp_dim
        self.det_inp_dim = int(self.det_model.net_info['height'])
        assert self.det_inp_dim % 32 == 0
        assert self.det_inp_dim > 32
        self.det_model.cuda()
        self.det_model.eval()

        self.stopped = False
        self.dataloder = dataloder

    def detect_image(self, im_path):
        """Detect on the image at *im_path* and return its best detection.

        Returns a 7-tuple ``(ori_im, im_name, boxes, scores, inps, pt1, pt2)``
        for the first image produced by the loader.  The last five entries
        are ``None`` when there is no detection; otherwise ``boxes`` and
        ``scores`` hold the single highest-scoring detection and the
        remaining three are zero-filled placeholders.  Returns ``None`` if
        the loader yields no image.
        """
        im, ori_im, im_name, im_dim_list = self.dataloder.getitem_yolo(im_path)

        with torch.no_grad():
            im = im.cuda()
            prediction = self.det_model(im, CUDA=True)
            # NMS process
            dets = dynamic_write_results(prediction, opt.confidence,
                                         opt.num_classes, nms=True,
                                         nms_conf=opt.nms_thesh)
            if isinstance(dets, int) or dets.shape[0] == 0:
                return (ori_im[0], im_name[0], None, None, None, None, None)

            dets = dets.cpu()
            # Column 0 of dets is the index of the image inside the batch.
            im_dim_list = torch.index_select(im_dim_list, 0,
                                             dets[:, 0].long())
            scaling_factor = torch.min(self.det_inp_dim / im_dim_list,
                                       1)[0].view(-1, 1)

            # Coordinate transfer from network input back to image size.
            dets[:, [1, 3]] -= (self.det_inp_dim - scaling_factor *
                                im_dim_list[:, 0].view(-1, 1)) / 2
            dets[:, [2, 4]] -= (self.det_inp_dim - scaling_factor *
                                im_dim_list[:, 1].view(-1, 1)) / 2
            dets[:, 1:5] /= scaling_factor
            for j in range(dets.shape[0]):
                dets[j, [1, 3]] = torch.clamp(dets[j, [1, 3]], 0.0,
                                              im_dim_list[j, 0])
                dets[j, [2, 4]] = torch.clamp(dets[j, [2, 4]], 0.0,
                                              im_dim_list[j, 1])

            if dets.shape[0] > 1:
                # Pick the best row once, so boxes, scores and dets all refer
                # to the same detection.
                best = dets[:, 5].argmax()
                dets = dets[best].unsqueeze(0)
            boxes = dets[:, 1:5]
            scores = dets[:, 5:6]

        # Only the first image is reported (the loader yields one image).
        if len(ori_im) == 0:
            return None
        mask = dets[:, 0] == 0
        boxes_k = boxes[mask]
        if boxes_k.shape[0] == 0:
            return (ori_im[0], im_name[0], None, None, None, None, None)
        # Zero-filled placeholders, one row per kept box.
        inps = torch.zeros(boxes_k.size(0), 3, opt.inputResH, opt.inputResW)
        pt1 = torch.zeros(boxes_k.size(0), 2)
        pt2 = torch.zeros(boxes_k.size(0), 2)
        return (ori_im[0], im_name[0], boxes_k, scores[mask], inps, pt1, pt2)
class YOLODetector(BaseDetector):
    """YOLOv3 detector wrapper: model loading, pre-processing, NMS and output."""

    def __init__(self, cfg, opt=None):
        super(YOLODetector, self).__init__()

        self.detector_cfg = cfg
        self.detector_opt = opt
        self.model_cfg = cfg.get('CONFIG', 'detector/yolo/cfg/yolov3-spp.cfg')
        self.model_weights = cfg.get('WEIGHTS',
                                     'detector/yolo/data/yolov3-spp.weights')
        self.inp_dim = cfg.get('INP_DIM', 608)
        self.nms_thres = cfg.get('NMS_THRES', 0.6)
        self.confidence = cfg.get('CONFIDENCE', 0.05)
        self.num_classes = cfg.get('NUM_CLASSES', 80)
        self.model = None
        self.load_model()

    def load_model(self):
        """Build the Darknet model, load its weights and move it to the device."""
        args = self.detector_opt

        print('Loading YOLO model..')
        self.model = Darknet(self.model_cfg)
        self.model.load_weights(self.model_weights)
        self.model.net_info['height'] = self.inp_dim
        print("Network successfully loaded")

        if args:
            if len(args.gpus) > 1:
                self.model = torch.nn.DataParallel(
                    self.model, device_ids=args.gpus).to(args.device)
            else:
                self.model.to(args.device)
        else:
            self.model.cuda()
        self.model.eval()

    def image_preprocess(self, img_source):
        """Pre-process an image before it is fed to the detection network.

        Input: image name (str) or raw image data (ndarray or torch.Tensor,
               channel order BGR)
        Output: pre-processed image data (torch.FloatTensor, (1,3,h,w))
        Raises: IOError for any other input type.
        """
        if isinstance(img_source, str):
            img, orig_img, im_dim_list = prep_image(img_source, self.inp_dim)
        elif isinstance(img_source, (torch.Tensor, np.ndarray)):
            img, orig_img, im_dim_list = prep_frame(img_source, self.inp_dim)
        else:
            raise IOError('Unknown image source type: {}'.format(
                type(img_source)))

        return img

    def _to_original_coords(self, dets, dim_list):
        """Map box columns 1-4 of *dets* (in place) to original image size.

        *dim_list* holds one ``(w, h, w, h)`` row per batch image; column 0
        of *dets* selects the row belonging to each detection.
        """
        dim_list = torch.index_select(dim_list, 0, dets[:, 0].long())
        scaling_factor = torch.min(self.inp_dim / dim_list, 1)[0].view(-1, 1)
        dets[:, [1, 3]] -= (self.inp_dim -
                            scaling_factor * dim_list[:, 0].view(-1, 1)) / 2
        dets[:, [2, 4]] -= (self.inp_dim -
                            scaling_factor * dim_list[:, 1].view(-1, 1)) / 2
        dets[:, 1:5] /= scaling_factor
        for i in range(dets.shape[0]):
            dets[i, [1, 3]] = torch.clamp(dets[i, [1, 3]], 0.0,
                                          dim_list[i, 0])
            dets[i, [2, 4]] = torch.clamp(dets[i, [2, 4]], 0.0,
                                          dim_list[i, 1])
        return dets

    def images_detection(self, imgs, orig_dim_list):
        """Detect on a pre-processed mini-batch.

        Input: imgs (torch.FloatTensor, (b,3,h,w)): pre-processed mini-batch
               orig_dim_list (torch.FloatTensor, (b,(w,h,w,h))): original
               image sizes
        Output: dets (CPU tensor, (n,(batch_idx,x1,y1,x2,y2,c,s,idx of cls)))
                with boxes w.r.t. the original image size, or ``0`` when
                nothing was detected.
        """
        args = self.detector_opt
        if not self.model:
            self.load_model()
        with torch.no_grad():
            imgs = imgs.to(args.device) if args else imgs.cuda()
            prediction = self.model(imgs, args=args)
            # Confidence filtering and class-wise NMS
            dets = self.dynamic_write_results(prediction, self.confidence,
                                              self.num_classes, nms=True,
                                              nms_conf=self.nms_thres)
            if isinstance(dets, int) or dets.shape[0] == 0:
                return 0
            dets = dets.cpu()
            return self._to_original_coords(dets, orig_dim_list)

    def dynamic_write_results(self, prediction, confidence, num_classes,
                              nms=True, nms_conf=0.4):
        """Run ``write_results``; rerun with a stricter NMS threshold
        (``nms_conf - 0.05``) if more than 100 detections survive."""
        dets = self.write_results(prediction.clone(), confidence, num_classes,
                                  nms, nms_conf)
        if isinstance(dets, int):
            return dets

        if dets.shape[0] > 100:
            nms_conf -= 0.05
            dets = self.write_results(prediction.clone(), confidence,
                                      num_classes, nms, nms_conf)

        return dets

    def write_results(self, prediction, confidence, num_classes, nms=True,
                      nms_conf=0.4):
        """Filter raw predictions by confidence and apply class-wise NMS.

        Input: prediction (batchsize, num of objects,
               (xc,yc,w,h,box confidence, class scores))
        Output: tensor (n,(batch_ind,x1,y1,x2,y2,c,s,idx of cls)) or ``0``
                when no detection survives.
        """
        args = self.detector_opt
        # Zero out every prediction below the confidence threshold.
        conf_mask = (prediction[:, :, 4] > confidence).float().unsqueeze(2)
        prediction = prediction * conf_mask

        try:
            # Result unused; kept as a guard in case this raises when no
            # prediction survives the threshold -- TODO confirm it still does.
            torch.nonzero(prediction[:, :, 4]).transpose(0, 1).contiguous()
        except Exception:
            return 0

        # (xc,yc,w,h) -> (x1,y1,x2,y2)
        centers = prediction[:, :, :2].clone()
        half_size = prediction[:, :, 2:4] / 2
        prediction[:, :, :2] = centers - half_size
        prediction[:, :, 2:4] = centers + half_size

        batch_size = prediction.size(0)
        detections = []

        for ind in range(batch_size):
            image_pred = prediction[ind]

            # Replace the per-class scores by (best score, best class index).
            max_conf, max_conf_score = torch.max(
                image_pred[:, 5:5 + num_classes], 1)
            max_conf = max_conf.float().unsqueeze(1)
            max_conf_score = max_conf_score.float().unsqueeze(1)
            # image_pred: (n,(x1,y1,x2,y2,c,s,idx of cls))
            image_pred = torch.cat(
                (image_pred[:, :5], max_conf, max_conf_score), 1)

            # Get rid of the zeroed-out entries.
            non_zero_ind = torch.nonzero(image_pred[:, 4])
            image_pred_ = image_pred[non_zero_ind.squeeze(), :].view(-1, 7)

            # Classes detected in this image.
            try:
                img_classes = unique(image_pred_[:, -1])
            except Exception:
                continue

            # NMS is done class-wise.
            for cls in img_classes:
                # NOTE(review): class index 0 is skipped here, while other
                # comments in this class speak of keeping only the human
                # category -- confirm which behavior is intended.
                if cls == 0:
                    continue
                # Detections of this particular class.
                cls_mask = image_pred_ * (image_pred_[:, -1]
                                          == cls).float().unsqueeze(1)
                class_mask_ind = torch.nonzero(cls_mask[:, -2]).squeeze()
                image_pred_class = image_pred_[class_mask_ind].view(-1, 7)

                # Highest objectness confidence first.
                conf_sort_index = torch.sort(image_pred_class[:, 4],
                                             descending=True)[1]
                image_pred_class = image_pred_class[conf_sort_index]

                if nms:
                    if platform.system() != 'Windows':
                        # nms_op input: (n,(x1,y1,x2,y2,c))
                        # nms_op output: input[inds,:], inds
                        nms_op = getattr(nms_wrapper, 'nms')
                        _, inds = nms_op(image_pred_class[:, :5], nms_conf)
                        image_pred_class = image_pred_class[inds]
                    else:
                        # Pure-PyTorch greedy NMS.
                        max_detections = []
                        while image_pred_class.size(0):
                            # Keep the most confident remaining detection.
                            max_detections.append(
                                image_pred_class[0].unsqueeze(0))
                            if len(image_pred_class) == 1:
                                break
                            ious = bbox_iou(max_detections[-1],
                                            image_pred_class[1:], args)
                            # Remove detections with IoU >= NMS threshold.
                            image_pred_class = image_pred_class[1:][
                                ious < nms_conf]
                        image_pred_class = torch.cat(max_detections).data

                # Prefix every detection with the index of its image so that
                # the whole batch can live in one flat tensor.
                batch_ind = image_pred_class.new(image_pred_class.size(0),
                                                 1).fill_(ind)
                detections.append(torch.cat((batch_ind, image_pred_class), 1))

        if not detections:
            return 0
        # output: (n,(batch_ind,x1,y1,x2,y2,c,s,idx of cls))
        return torch.cat(detections)

    def detect_one_img(self, img_name):
        """Detect bounding boxes in one image.

        Input: 'str', full path of image
        Output: '[{"category_id":1,"score":float,"bbox":[x,y,w,h],
                 "image_id":...},...]' or ``None`` when nothing is detected.
        ``image_id`` is the integer value of the file name (without
        extension) when that name is numeric; otherwise the full path string
        is used so that arbitrary file names do not raise.
        """
        args = self.detector_opt
        if not self.model:
            self.load_model()
        if isinstance(self.model, torch.nn.DataParallel):
            self.model = self.model.module

        stem = os.path.basename(img_name).split('.')[0]
        try:
            image_id = int(stem)
        except ValueError:
            image_id = img_name

        dets_results = []
        # Pre-process (scale, normalize, ...) the image.
        img, orig_img, img_dim_list = prep_image(img_name, self.inp_dim)
        with torch.no_grad():
            img_dim_list = torch.FloatTensor([img_dim_list]).repeat(1, 2)
            img = img.to(args.device) if args else img.cuda()
            prediction = self.model(img, args=args)
            # Confidence filtering and class-wise NMS
            dets = self.dynamic_write_results(prediction, self.confidence,
                                              self.num_classes, nms=True,
                                              nms_conf=self.nms_thres)
            if isinstance(dets, int) or dets.shape[0] == 0:
                return None
            dets = dets.cpu()
            dets = self._to_original_coords(dets, img_dim_list)

            for i in range(dets.shape[0]):
                x = float(dets[i, 1])
                y = float(dets[i, 2])
                w = float(dets[i, 3] - dets[i, 1])
                h = float(dets[i, 4] - dets[i, 2])
                dets_results.append({
                    "category_id": 1,
                    "score": float(dets[i, 5]),
                    "bbox": [x, y, w, h],
                    "image_id": image_id,
                })

            return dets_results
os.mkdir(args.outputpath) if not len(videofile): raise IOError('Error: must contain --video') # Load input video fvs = VideoLoader(videofile).start() (fourcc,fps,frameSize) = fvs.videoinfo() # Data writer save_path = os.path.join(args.outputpath, 'AlphaPose_'+videofile.split('/')[-1].split('.')[0]+'.avi') writer = DataWriter(args.save_video, save_path, cv2.VideoWriter_fourcc(*'XVID'), fps, frameSize).start() # Load YOLO model print('Loading YOLO model..') det_model = Darknet("yolo/cfg/yolov3.cfg") det_model.load_weights('models/yolo/yolov3.weights') det_model.net_info['height'] = args.inp_dim det_inp_dim = int(det_model.net_info['height']) assert det_inp_dim % 32 == 0 assert det_inp_dim > 32 det_model.cuda() det_model.eval() # Load pose model pose_dataset = Mscoco() if args.fast_inference: pose_model = InferenNet_fast(4 * 1 + 1, pose_dataset) else: pose_model = InferenNet(4 * 1 + 1, pose_dataset) pose_model.cuda() pose_model.eval()
class DetectionLoader: def __init__(self, dataloder, batchSize=1): # initialize the file video stream along with the boolean # used to indicate if the thread should be stopped or not self.det_model = Darknet("yolo/cfg/yolov3-spp.cfg") self.det_model.load_weights('models/yolo/yolov3-spp.weights') self.det_model.net_info['height'] = opt.inp_dim self.det_inp_dim = int(self.det_model.net_info['height']) assert self.det_inp_dim % 32 == 0 assert self.det_inp_dim > 32 if opt.device == 'GPU': self.det_model.cuda() else: self.det_model.cpu() self.det_model.eval() self.dataloder = dataloder self.batchSize = batchSize self.datalen = self.dataloder.length() leftover = 0 if (self.datalen) % batchSize: leftover = 1 self.num_batches = self.datalen // batchSize + leftover # initialize the list used to store frames read from # the video file self.Q = list() def start(self): # start to dectect person self.update() def update(self): # keep looping the whole dataset for i in range(self.num_batches): img, orig_img, im_name, im_dim_list = self.dataloder.Q[i] with torch.no_grad(): # Human Detection if opt.device == 'GPU': img = img.cuda() else: img = img.cpu() prediction = self.det_model( img, CUDA=True if opt.device == 'GPU' else False) # NMS process dets = dynamic_write_results(prediction, opt.confidence, opt.num_classes, nms=True, nms_conf=opt.nms_thesh) if isinstance(dets, int) or dets.shape[0] == 0: for k in range(len(orig_img)): self.Q.append((orig_img[k], im_name[k], None, None, None, None, None)) continue dets = dets.cpu() im_dim_list = torch.index_select(im_dim_list, 0, dets[:, 0].long()) scaling_factor = torch.min(self.det_inp_dim / im_dim_list, 1)[0].view(-1, 1) # coordinate transfer dets[:, [1, 3]] -= (self.det_inp_dim - scaling_factor * im_dim_list[:, 0].view(-1, 1)) / 2 dets[:, [2, 4]] -= (self.det_inp_dim - scaling_factor * im_dim_list[:, 1].view(-1, 1)) / 2 dets[:, 1:5] /= scaling_factor for j in range(dets.shape[0]): dets[j, [1, 3]] = torch.clamp(dets[j, [1, 3]], 0.0, 
im_dim_list[j, 0]) dets[j, [2, 4]] = torch.clamp(dets[j, [2, 4]], 0.0, im_dim_list[j, 1]) boxes = dets[:, 1:5] scores = dets[:, 5:6] for k in range(len(orig_img)): boxes_k = boxes[dets[:, 0] == k] if isinstance(boxes_k, int) or boxes_k.shape[0] == 0: self.Q.append((orig_img[k], im_name[k], None, None, None, None, None)) continue inps = torch.zeros(boxes_k.size(0), 3, opt.inputResH, opt.inputResW) pt1 = torch.zeros(boxes_k.size(0), 2) pt2 = torch.zeros(boxes_k.size(0), 2) # multiply the score with bounding box height processed_scores = self.cal_scores(scores, boxes_k) self.Q.append( (orig_img[k], im_name[k], boxes_k[np.argmax(processed_scores ):np.argmax(processed_scores) + 1], scores[np.argmax(processed_scores)], inps[np.argmax(processed_scores ):np.argmax(processed_scores) + 1], pt1[np.argmax(processed_scores ):np.argmax(processed_scores) + 1], pt2[np.argmax(processed_scores ):np.argmax(processed_scores) + 1])) def cal_scores(self, scores, boxes): processed_scores = scores.clone() for i in range(boxes.shape[0]): processed_scores[i][0] *= abs(boxes[i][1] - boxes[i][3]) return processed_scores def len(self): # return list len return len(self.Q)
class PeopleDetection():
    """YOLOv3 detector that draws detections on a frame and collects their boxes."""

    def __init__(self):
        """Load the YOLOv3 network, the class names and the drawing palette."""
        super().__init__()
        self.cfgfile = "yolo/cfg/yolov3.cfg"
        self.weightsfile = "yolo/yolov3.weights"
        self.num_classes = 80
        self.confidence = 0.25
        self.nms_thesh = 0.4
        self.reso = 160
        self.start = 0
        self.CUDA = torch.cuda.is_available()
        self.bbox_attrs = 5 + self.num_classes

        self.model = Darknet(self.cfgfile)
        self.model.load_weights(self.weightsfile)
        self.model.net_info["height"] = self.reso
        self.inp_dim = int(self.model.net_info["height"])
        assert self.inp_dim % 32 == 0
        assert self.inp_dim > 32
        if self.CUDA:
            self.model.cuda()
        # Switch to "evaluate" mode before predictions
        self.model.eval()

        self.classes = load_classes('yolo/data/coco.names')
        # Close the palette file deterministically instead of leaking it.
        with open("yolo/pallete", "rb") as palette_file:
            self.colors = pkl.load(palette_file)

    def prep_image(self, img, inp_dim):
        """Prepare an image for the network.

        The image is resized to ``inp_dim`` x ``inp_dim`` (no aspect-ratio
        padding), its channel order is reversed, and it is converted to a
        float tensor in ``[0, 1]`` with a leading batch dimension.

        Returns:
            ``(tensor, original_image, (width, height))``.
        """
        orig_im = img
        dim = orig_im.shape[1], orig_im.shape[0]
        resized = cv2.resize(orig_im, (inp_dim, inp_dim))
        # HWC with reversed channels -> CHW; copy() makes the array contiguous.
        chw = resized[:, :, ::-1].transpose((2, 0, 1)).copy()
        tensor = torch.from_numpy(chw).float().div(255.0).unsqueeze(0)
        return tensor, orig_im, dim

    def write(self, x, img, people_bounding_boxes, only_person=False):
        """Draw one detection on ``img`` and record its box.

        Args:
            x: Detection row; indices 1..4 are ``x1, y1, x2, y2`` and the
                last element is the class index.
            img: Image that is drawn on in place.
            people_bounding_boxes: List that receives ``[x, y, w, h]`` for
                every detection that is drawn.
            only_person: When true, detections whose class is not
                ``'person'`` are skipped entirely.

        Returns:
            ``img`` (the same object that was passed in).
        """
        cls = int(x[-1])
        if only_person and self.classes[cls] != 'person':
            return img

        # Plain Python ints: OpenCV drawing calls expect integer points, not
        # tensor elements.
        c1 = (int(x[1]), int(x[2]))
        c2 = (int(x[3]), int(x[4]))
        label = "{0}".format(self.classes[cls])
        color = random.choice(self.colors)
        cv2.rectangle(img, c1, c2, color, 2)
        people_bounding_boxes.append(
            [c1[0], c1[1], c2[0] - c1[0], c2[1] - c1[1]])

        font_scale = 1.5
        line_thickness = 2
        t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, font_scale,
                                 line_thickness)[0]
        # Filled background for the label, anchored at the box's first corner.
        label_corner = (c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4)
        cv2.rectangle(img, c1, label_corner, color, -1)
        cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4),
                    cv2.FONT_HERSHEY_PLAIN, font_scale, [225, 255, 255],
                    line_thickness)
        return img

    def run(self, frame):
        """Detect persons in ``frame`` and draw them on it.

        Returns:
            ``(frame, found, boxes)`` where ``found`` is ``False`` (and
            ``boxes`` is empty) when ``write_results`` reports no detection,
            otherwise ``True`` with the ``[x, y, w, h]`` boxes that were drawn.
        """
        img, orig_im, _ = self.prep_image(frame, self.inp_dim)
        if self.CUDA:
            img = img.cuda()

        # Inference only: no autograd bookkeeping is needed.
        with torch.no_grad():
            output = self.model(Variable(img), self.CUDA)
        output = write_results(output, self.confidence, self.num_classes,
                               nms=True, nms_conf=self.nms_thesh)
        if isinstance(output, int):
            # write_results signals "no detections" with an int.
            return orig_im, False, []

        # Normalise box coordinates to [0, 1], then scale to the frame size.
        output[:, 1:5] = torch.clamp(output[:, 1:5], 0.0,
                                     float(self.inp_dim)) / self.inp_dim
        output[:, [1, 3]] *= frame.shape[1]
        output[:, [2, 4]] *= frame.shape[0]

        people_bounding_boxes = []
        for detection in output:
            self.write(detection, orig_im, people_bounding_boxes,
                       only_person=True)
        return orig_im, True, people_bounding_boxes
class DetectionLoader:
    """Detection stage that feeds per-image boxes into a bounded queue.

    Each queue item is ``(orig_img, im_name, boxes, scores, inps, pt1, pt2)``;
    the last five fields are ``None`` when an image has no boxes, and an
    all-``None`` tuple is queued when the data loader runs dry.
    """

    def __init__(self, dataloder, batchSize=1, queueSize=1024, use_boxGT=False, gt_json=''):
        """Load the detector and create the output queue.

        Args:
            dataloder: Source exposing ``length()`` and ``getitem()``.
            batchSize: Images per batch delivered by ``dataloder``.
            queueSize: Maximum number of pending results.
            use_boxGT: When true, boxes come from ``gt_json`` instead of the
                detector output.
            gt_json: Path handed to ``box_gt_class`` when ``use_boxGT`` is set.
        """
        self.det_model = Darknet(opt.yolo_model_cfg)
        self.det_model.load_weights(opt.yolo_model_path)
        self.det_model.net_info['height'] = opt.inp_dim
        self.det_inp_dim = int(self.det_model.net_info['height'])
        assert self.det_inp_dim % 32 == 0
        assert self.det_inp_dim > 32
        self.det_model.cuda()
        self.det_model.eval()

        self.stopped = False
        self.dataloder = dataloder
        self.batchSize = batchSize
        self.datalen = self.dataloder.length()
        # A trailing partial batch still counts as one batch.
        whole, remainder = divmod(self.datalen, batchSize)
        self.num_batches = whole + (1 if remainder else 0)

        # Thread-safe queue when opt.sp is set, multiprocessing queue otherwise.
        queue_factory = Queue if opt.sp else mp.Queue
        self.Q = queue_factory(maxsize=queueSize)

        self.use_boxGT = use_boxGT
        if use_boxGT:
            print('loading grondtruth box.')
            self.box_gt = box_gt_class(gt_json)

    def start(self):
        """Launch ``update`` in a daemon thread (``opt.sp``) or daemon process."""
        if opt.sp:
            worker = Thread(target=self.update, args=())
        else:
            worker = mp.Process(target=self.update, args=())
        worker.daemon = True
        worker.start()
        return self

    def _enqueue(self, item):
        """Queue ``item``, pausing two seconds first if the queue is full."""
        if self.Q.full():
            time.sleep(2)
        self.Q.put(item)

    def update(self):
        """Detect on every batch and queue one result tuple per image."""
        for _ in range(self.num_batches):
            img, orig_img, im_name, im_dim_list = self.dataloder.getitem()
            if img is None:
                # End-of-data marker for the consumer.
                self.Q.put((None, None, None, None, None, None, None))
                return

            with torch.no_grad():
                # Human detection followed by NMS.
                prediction = self.det_model(img.cuda(), CUDA=True)
                dets = dynamic_write_results(prediction, opt.confidence,
                                             opt.num_classes, nms=True,
                                             nms_conf=opt.nms_thesh)
                if isinstance(dets, int) or dets.shape[0] == 0:
                    for k, frame in enumerate(orig_img):
                        self._enqueue((frame, im_name[k], None, None, None, None, None))
                    continue

                dets = dets.cpu()
                # Column 0 of dets is the image index within the batch.
                im_dim_list = torch.index_select(im_dim_list, 0, dets[:, 0].long())
                scaling_factor = torch.min(self.det_inp_dim / im_dim_list, 1)[0].view(-1, 1)

                # Coordinate transfer: drop the padding offset, undo the resize.
                pad_x = (self.det_inp_dim - scaling_factor * im_dim_list[:, 0].view(-1, 1)) / 2
                pad_y = (self.det_inp_dim - scaling_factor * im_dim_list[:, 1].view(-1, 1)) / 2
                dets[:, [1, 3]] -= pad_x
                dets[:, [2, 4]] -= pad_y
                dets[:, 1:5] /= scaling_factor

                # Keep every box inside its own image.
                for row in range(dets.shape[0]):
                    max_x = im_dim_list[row, 0]
                    max_y = im_dim_list[row, 1]
                    dets[row, [1, 3]] = torch.clamp(dets[row, [1, 3]], 0.0, max_x)
                    dets[row, [2, 4]] = torch.clamp(dets[row, [2, 4]], 0.0, max_y)
                boxes = dets[:, 1:5]
                scores = dets[:, 5:6]

            for k, frame in enumerate(orig_img):
                name = im_name[k]
                if self.use_boxGT:
                    # Ground-truth boxes are looked up by file name; each gets
                    # a score of one.
                    boxes_k = self.box_gt.get_box(name.split('/')[-1])
                    scores_k = torch.ones((boxes_k.shape[0], 1))
                else:
                    in_image = dets[:, 0] == k
                    boxes_k = boxes[in_image]
                    scores_k = scores[in_image]

                if isinstance(boxes_k, int) or boxes_k.shape[0] == 0:
                    self._enqueue((frame, name, None, None, None, None, None))
                    continue

                count = boxes_k.size(0)
                # Zero-filled placeholders sized to the number of boxes.
                inps = torch.zeros(count, 3, opt.inputResH, opt.inputResW)
                pt1 = torch.zeros(count, 2)
                pt2 = torch.zeros(count, 2)
                self._enqueue((frame, name, boxes_k, scores_k, inps, pt1, pt2))

    def read(self):
        """Return the next result tuple from the queue."""
        return self.Q.get()

    def len(self):
        """Return the current queue size."""
        return self.Q.qsize()
class DetectionLoader:
    """Detection stage that feeds per-image boxes into a bounded queue.

    Each queue item is ``(orig_img, im_name, boxes, scores, inps, pt1, pt2)``;
    the last five fields are ``None`` when an image has no detection, and an
    all-``None`` tuple is queued when the data loader runs dry.
    """

    def __init__(self, dataloder, batchSize=1, queueSize=1024):
        """Load the YOLOv3-SPP detector and create the output queue.

        Args:
            dataloder: Source exposing ``length()`` and ``getitem()``.
            batchSize: Images per batch delivered by ``dataloder``.
            queueSize: Maximum number of pending results.
        """
        self.det_model = Darknet("yolo/cfg/yolov3-spp.cfg")
        self.det_model.load_weights("models/yolo/yolov3-spp.weights")
        # The key must be 'height': a misspelt key only adds an unused entry
        # and leaves the network's configured input size untouched.
        self.det_model.net_info['height'] = opt.inp_dim
        self.det_inp_dim = int(self.det_model.net_info['height'])
        assert self.det_inp_dim % 32 == 0
        assert self.det_inp_dim > 32
        self.det_model.cuda()
        self.det_model.eval()

        self.stopped = False
        self.dataloder = dataloder
        self.batchSize = batchSize
        self.datalen = self.dataloder.length()
        # Ceiling division: a trailing partial batch still counts as a batch.
        leftover = 1 if self.datalen % batchSize else 0
        self.num_batches = self.datalen // batchSize + leftover

        if opt.sp:
            self.Q = Queue(maxsize=queueSize)
        else:
            self.Q = mp.Queue(maxsize=queueSize)

    def start(self):
        """Launch ``update`` in a daemon thread (``opt.sp``) or daemon process."""
        if opt.sp:
            worker = Thread(target=self.update, args=())
        else:
            worker = mp.Process(target=self.update, args=())
        worker.daemon = True
        worker.start()
        return self

    def _enqueue(self, item):
        """Queue ``item``, pausing two seconds first if the queue is full."""
        if self.Q.full():
            time.sleep(2)
        self.Q.put(item)

    def _rescale_to_image(self, dets, im_dim_list):
        """Map detection boxes from network-input to original-image pixels.

        Args:
            dets: Detection rows; column 0 is the image index within the
                batch and columns 1..4 are ``x1, y1, x2, y2``.  Modified in
                place.
            im_dim_list: Per-image dimensions; column 0 is used for the x
                axis and column 1 for the y axis.

        Returns:
            ``(dets, im_dim_list)`` where ``im_dim_list`` now has one row per
            detection.
        """
        im_dim_list = torch.index_select(im_dim_list, 0, dets[:, 0].long())
        scaling_factor = torch.min(self.det_inp_dim / im_dim_list, 1)[0].view(-1, 1)
        widths = im_dim_list[:, 0].view(-1, 1)
        heights = im_dim_list[:, 1].view(-1, 1)

        # Remove the padding offset: x uses the width, y uses the height.
        dets[:, [1, 3]] -= (self.det_inp_dim - scaling_factor * widths) / 2
        dets[:, [2, 4]] -= (self.det_inp_dim - scaling_factor * heights) / 2
        dets[:, 1:5] /= scaling_factor

        # Clamp every box to its own image bounds.
        dets[:, [1, 3]] = torch.min(dets[:, [1, 3]].clamp(min=0.0), widths)
        dets[:, [2, 4]] = torch.min(dets[:, [2, 4]].clamp(min=0.0), heights)
        return dets, im_dim_list

    def update(self):
        """Detect on every batch and queue one result tuple per image."""
        for _ in range(self.num_batches):
            img, orig_img, im_name, im_dim_list = self.dataloder.getitem()
            if img is None:
                # End-of-data marker for the consumer.
                self.Q.put((None, None, None, None, None, None, None))
                return

            # No tensor here needs gradients, so run under torch.no_grad().
            with torch.no_grad():
                # Human detection
                img = img.cuda()
                prediction = self.det_model(img, CUDA=True)
                dets = dynamic_write_results(prediction, opt.confidence,
                                             opt.num_classes, nms=True,
                                             nms_conf=opt.nms_thesh)
                if isinstance(dets, int) or dets.shape[0] == 0:
                    for k in range(len(orig_img)):
                        self._enqueue((orig_img[k], im_name[k],
                                       None, None, None, None, None))
                    continue
                dets = dets.cpu()
                dets, im_dim_list = self._rescale_to_image(dets, im_dim_list)
                boxes = dets[:, 1:5]
                scores = dets[:, 5:6]

            for k in range(len(orig_img)):
                in_image = dets[:, 0] == k
                boxes_k = boxes[in_image]
                if boxes_k.shape[0] == 0:
                    self._enqueue((orig_img[k], im_name[k],
                                   None, None, None, None, None))
                    continue
                count = boxes_k.size(0)
                # Zero-filled placeholders sized to the number of boxes.
                inps = torch.zeros(count, 3, opt.inputResH, opt.inputResW)
                pt1 = torch.zeros(count, 2)
                pt2 = torch.zeros(count, 2)
                self._enqueue((orig_img[k], im_name[k], boxes_k,
                               scores[in_image], inps, pt1, pt2))

    def read(self):
        """Return the next result tuple from the queue."""
        return self.Q.get()

    def len(self):
        """Return the current queue size."""
        return self.Q.qsize()
class PeopleDetection():
    """YOLOv3 detector that draws persons on a frame and counts them."""

    def __init__(self):
        """Load the YOLOv3 network, the class names and the drawing palette."""
        super().__init__()
        self.cfgfile = "yolo/cfg/yolov3.cfg"
        self.weightsfile = "yolo/yolov3.weights"
        self.num_classes = 80
        self.confidence = 0.25
        self.nms_thesh = 0.4
        self.reso = 160
        self.start = 0
        self.CUDA = torch.cuda.is_available()
        self.bbox_attrs = 5 + self.num_classes

        self.model = Darknet(self.cfgfile)
        self.model.load_weights(self.weightsfile)
        self.model.net_info["height"] = self.reso
        self.inp_dim = int(self.model.net_info["height"])
        assert self.inp_dim % 32 == 0
        assert self.inp_dim > 32
        if self.CUDA:
            self.model.cuda()
        self.model.eval()

        self.classes = load_classes('yolo/data/coco.names')
        # Close the palette file deterministically instead of leaking it.
        with open("yolo/pallete", "rb") as palette_file:
            self.colors = pkl.load(palette_file)

    def prep_image(self, img, inp_dim):
        """Prepare an image for the network.

        The image is resized to ``inp_dim`` x ``inp_dim`` (no aspect-ratio
        padding), its channel order is reversed, and it is converted to a
        float tensor in ``[0, 1]`` with a leading batch dimension.

        Returns:
            ``(tensor, original_image, (width, height))``.
        """
        orig_im = img
        dim = orig_im.shape[1], orig_im.shape[0]
        resized = cv2.resize(orig_im, (inp_dim, inp_dim))
        # HWC with reversed channels -> CHW; copy() makes the array contiguous.
        chw = resized[:, :, ::-1].transpose((2, 0, 1)).copy()
        tensor = torch.from_numpy(chw).float().div(255.0).unsqueeze(0)
        return tensor, orig_im, dim

    def write(self, x, img, only_person=False, paintings_bounding_boxes=None):
        """Draw one detection on ``img`` and report whether it counts as a person.

        Args:
            x: Detection row; indices 1..4 are ``x1, y1, x2, y2`` and the
                last element is the class index.
            img: Image that is drawn on in place.
            only_person: When true, only ``'person'`` detections are drawn
                and counted.
            paintings_bounding_boxes: Optional iterable of
                ``(x_min, x_max, y_min, y_max)`` regions; a detection lying
                strictly inside any of them is ignored.

        Returns:
            ``1`` when ``only_person`` is set and a person was drawn,
            otherwise ``0``.
        """
        cls = int(x[-1])
        is_person = self.classes[cls] == 'person'
        if only_person and not is_person:
            return 0

        # Plain Python ints: OpenCV drawing calls expect integer points, not
        # tensor elements.
        c1 = (int(x[1]), int(x[2]))
        c2 = (int(x[3]), int(x[4]))

        # Skip detections that lie inside a painting.
        if paintings_bounding_boxes is not None:
            # NOTE(review): regions are read as (x_min, x_max, y_min, y_max),
            # as documented at this spot originally - confirm with the caller.
            for pbb in paintings_bounding_boxes:
                inside_x = c1[0] > pbb[0] and c2[0] < pbb[1]
                inside_y = c1[1] > pbb[2] and c2[1] < pbb[3]
                if inside_x and inside_y:
                    return 0

        # An all-zero box carries no detection.
        if c1 == (0, 0) and c2 == (0, 0):
            return 0

        label = "{0}".format(self.classes[cls])
        color = random.choice(self.colors)
        cv2.rectangle(img, c1, c2, color, 1)
        t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1, 1)[0]
        # Filled background for the label, anchored at the box's first corner.
        label_corner = (c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4)
        cv2.rectangle(img, c1, label_corner, color, -1)
        cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4),
                    cv2.FONT_HERSHEY_PLAIN, 1, [225, 255, 255], 1)
        return 1 if (only_person and is_person) else 0

    def run(self, frame, paintings_bounding_boxes=None):
        """Detect persons in ``frame``, draw them and count them.

        Returns:
            ``(frame, found, persons)`` where ``found`` is ``False`` (and
            ``persons`` is ``0``) when ``write_results`` reports no
            detection, otherwise ``True`` with the number of persons drawn.
        """
        img, orig_im, _ = self.prep_image(frame, self.inp_dim)
        if self.CUDA:
            img = img.cuda()

        # Inference only: no autograd bookkeeping is needed.
        with torch.no_grad():
            output = self.model(Variable(img), self.CUDA)
        output = write_results(output, self.confidence, self.num_classes,
                               nms=True, nms_conf=self.nms_thesh)
        if isinstance(output, int):
            # write_results signals "no detections" with an int.
            return orig_im, False, 0

        # Normalise box coordinates to [0, 1], then scale to the frame size.
        output[:, 1:5] = torch.clamp(output[:, 1:5], 0.0,
                                     float(self.inp_dim)) / self.inp_dim
        output[:, [1, 3]] *= frame.shape[1]
        output[:, [2, 4]] *= frame.shape[0]

        persons_list = [
            self.write(detection, orig_im, only_person=True,
                       paintings_bounding_boxes=paintings_bounding_boxes)
            for detection in output
        ]
        persons = np.array(persons_list).sum()
        return orig_im, True, persons
class DetectionLoader:
    """Detection stage (YOLO or MTCNN) that feeds boxes into a bounded queue.

    The detector is chosen per batch from ``dataloder.format`` (``'yolo'`` or
    ``'mtcnn'``).  Each queue item is
    ``(orig_img, im_name, boxes, scores, inps, pt1, pt2)``; the last five
    fields are ``None`` when an image has no detection, and an all-``None``
    tuple is queued when the data loader runs dry.
    """

    def __init__(self, dataloder, batchSize=1, queueSize=1024):
        """Load the YOLOv3-SPP detector and create the output queue.

        Args:
            dataloder: Source exposing ``length()``, ``getitem()`` and a
                ``format`` attribute.
            batchSize: Images per batch delivered by ``dataloder``.
            queueSize: Maximum number of pending results.
        """
        self.det_model = Darknet("yolo/cfg/yolov3-spp.cfg")
        self.det_model.load_weights('models/yolo/yolov3-spp.weights')
        self.det_model.net_info['height'] = opt.inp_dim
        self.det_inp_dim = int(opt.inp_dim)
        assert self.det_inp_dim % 32 == 0
        assert self.det_inp_dim > 32
        self.det_model.cuda()
        self.det_model.eval()

        self.stopped = False
        self.dataloder = dataloder
        self.batchSize = batchSize
        self.datalen = self.dataloder.length()
        # Ceiling division: a trailing partial batch still counts as a batch.
        leftover = 1 if self.datalen % batchSize else 0
        self.num_batches = self.datalen // batchSize + leftover

        # initialize the queue used to store results
        if opt.sp:
            self.Q = Queue(maxsize=queueSize)
        else:
            self.Q = mp.Queue(maxsize=queueSize)

    def start(self):
        """Launch ``update`` in a daemon thread (``opt.sp``) or daemon process."""
        if opt.sp:
            worker = Thread(target=self.update, args=())
        else:
            worker = mp.Process(target=self.update, args=())
        worker.daemon = True
        worker.start()
        return self

    def _enqueue(self, item):
        """Queue ``item``, pausing two seconds first if the queue is full."""
        if self.Q.full():
            time.sleep(2)
        self.Q.put(item)

    @staticmethod
    def _faces_to_dets(faces):
        """Convert MTCNN face dicts into detection rows.

        Each face's ``"box"`` is read as ``[x, y, width, height]`` and turned
        into ``[0, x1, y1, x2, y2, confidence, 0.99, 0]``; the leading ``0``
        is the image index within the batch.

        Returns:
            A tensor with one row per face (empty when ``faces`` is empty).
        """
        # NOTE(review): the trailing 0.99 and 0 presumably stand in for the
        # class score and class index of a YOLO row - confirm.
        rows = [
            [
                0,
                face["box"][0],
                face["box"][1],
                face["box"][0] + face["box"][2],
                face["box"][1] + face["box"][3],
                face["confidence"],
                0.99,
                0,
            ]
            for face in faces
        ]
        return torch.tensor(rows)

    def update(self):
        """Detect on every batch and queue one result tuple per image.

        Raises:
            ValueError: If ``dataloder.format`` is neither ``'yolo'`` nor
                ``'mtcnn'``.
        """
        # MTCNN is imported and built only if a batch actually needs it, so
        # YOLO-only runs neither require nor pay for that dependency.
        detector = None
        for _ in range(self.num_batches):
            img, orig_img, im_name, im_dim_list = self.dataloder.getitem()
            if img is None:
                # End-of-data marker for the consumer.
                self.Q.put((None, None, None, None, None, None, None))
                return

            with torch.no_grad():
                det_format = self.dataloder.format
                if det_format == 'yolo':
                    # Human Detection
                    img = img.cuda()
                    prediction = self.det_model(img, CUDA=True)
                    # NMS process
                    dets = dynamic_write_results(prediction, opt.confidence,
                                                 opt.num_classes, nms=True,
                                                 nms_conf=opt.nms_thesh)
                elif det_format == 'mtcnn':
                    # Face detection
                    if detector is None:
                        from mtcnn.mtcnn import MTCNN
                        detector = MTCNN()
                    imgs_np = img.float().mul(255.0).cpu().numpy()
                    # Drops the leading axis, which must have size 1 here.
                    imgs_np = np.squeeze(imgs_np, axis=0)
                    imgs_np = np.transpose(imgs_np, (1, 2, 0))
                    dets = self._faces_to_dets(detector.detect_faces(imgs_np))
                else:
                    raise ValueError(
                        'unsupported detection format: {!r}'.format(det_format))

                if isinstance(dets, int) or dets.shape[0] == 0:
                    for k in range(len(orig_img)):
                        self._enqueue((orig_img[k], im_name[k],
                                       None, None, None, None, None))
                    continue
                dets = dets.cpu()
                # Column 0 of dets is the image index within the batch.
                im_dim_list = torch.index_select(im_dim_list, 0, dets[:, 0].long())
                scaling_factor = torch.min(self.det_inp_dim / im_dim_list, 1)[0].view(-1, 1)

                # coordinate transfer: remove the padding offset, then undo
                # the resize so boxes are in original-image pixels.
                dets[:, [1, 3]] -= (self.det_inp_dim - scaling_factor * im_dim_list[:, 0].view(-1, 1)) / 2
                dets[:, [2, 4]] -= (self.det_inp_dim - scaling_factor * im_dim_list[:, 1].view(-1, 1)) / 2
                dets[:, 1:5] /= scaling_factor
                for j in range(dets.shape[0]):
                    dets[j, [1, 3]] = torch.clamp(dets[j, [1, 3]], 0.0, im_dim_list[j, 0])
                    dets[j, [2, 4]] = torch.clamp(dets[j, [2, 4]], 0.0, im_dim_list[j, 1])
                boxes = dets[:, 1:5]
                scores = dets[:, 5:6]

            for k in range(len(orig_img)):
                in_image = dets[:, 0] == k
                boxes_k = boxes[in_image]
                if boxes_k.shape[0] == 0:
                    self._enqueue((orig_img[k], im_name[k],
                                   None, None, None, None, None))
                    continue
                count = boxes_k.size(0)
                # Zero-filled placeholders sized to the number of boxes.
                inps = torch.zeros(count, 3, opt.inputResH, opt.inputResW)
                pt1 = torch.zeros(count, 2)
                pt2 = torch.zeros(count, 2)
                self._enqueue((orig_img[k], im_name[k], boxes_k,
                               scores[in_image], inps, pt1, pt2))

    def read(self):
        """Return the next result tuple from the queue."""
        return self.Q.get()

    def len(self):
        """Return the current queue size."""
        return self.Q.qsize()
class Yolo:
    """YOLOv3 detector configured from the command line that returns labelled boxes."""

    def __init__(self):
        """Parse the command line, then load class names, palette and network."""
        # set args
        self.args = self.arg_parse()
        self.confidence = float(self.args.confidence)
        self.nms_thesh = float(self.args.nms_thresh)

        # load file
        self.cfgfile = "yolo/cfg/yolov3.cfg"
        self.weightsfile = "yolo/yolov3.weights"
        self.classes = load_classes('yolo/data/coco.names')
        # Close the palette file deterministically instead of leaking it.
        with open("yolo/pallete", "rb") as palette_file:
            self.colors = pkl.load(palette_file)

        # set model
        self.num_classes = 80
        self.bbox_attrs = 5 + self.num_classes
        self.model = Darknet(self.cfgfile)
        self.model.load_weights(self.weightsfile)
        self.model.net_info["height"] = self.args.reso
        self.inp_dim = int(self.model.net_info["height"])
        self.model.eval()

    def arg_parse(self):
        """Parse ``--confidence``, ``--nms_thresh`` and ``--reso`` from ``sys.argv``."""
        parser = argparse.ArgumentParser(description='YOLO v3 Cam Demo')
        parser.add_argument("--confidence", dest="confidence",
                            help="Object Confidence to filter predictions",
                            default=0.25)
        parser.add_argument("--nms_thresh", dest="nms_thresh",
                            help="NMS Threshhold", default=0.4)
        parser.add_argument(
            "--reso", dest='reso',
            help="Input resolution of the network. Increase to increase accuracy. "
                 "Decrease to increase speed",
            default="160", type=str)
        return parser.parse_args()

    def prep_image(self, img, inp_dim):
        """Prepare an image for the network.

        The image is resized to ``inp_dim`` x ``inp_dim`` (no aspect-ratio
        padding), its channel order is reversed, and it is converted to a
        float tensor in ``[0, 1]`` with a leading batch dimension.

        Returns:
            ``(tensor, original_image, (width, height))``.
        """
        orig_im = img
        dim = orig_im.shape[1], orig_im.shape[0]
        resized = cv2.resize(orig_im, (inp_dim, inp_dim))
        # HWC with reversed channels -> CHW; copy() makes the array contiguous.
        chw = resized[:, :, ::-1].transpose((2, 0, 1)).copy()
        tensor = torch.from_numpy(chw).float().div(255.0).unsqueeze(0)
        return tensor, orig_im, dim

    def write(self, x, img):
        """Draw one detection on ``img`` and return its label and box corners.

        Args:
            x: Detection row; indices 1..4 are ``x1, y1, x2, y2`` and the
                last element is the class index.
            img: Image that is drawn on in place.

        Returns:
            ``[label, x1, y1, x2, y2]`` with the coordinates as integer
            NumPy scalars (zero-dimensional arrays).
        """
        box = x[1:5].int()
        # Plain Python ints: OpenCV drawing calls expect integer points, not
        # tensor elements.
        c1 = (int(box[0]), int(box[1]))
        c2 = (int(box[2]), int(box[3]))
        cls = int(x[-1])
        label = "{0}".format(self.classes[cls])
        color = random.choice(self.colors)
        cv2.rectangle(img, c1, c2, color, 1)
        t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1, 1)[0]
        # The label background gets its own corner so the detection's second
        # corner is not overwritten before it is returned.
        label_corner = (c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4)
        cv2.rectangle(img, c1, label_corner, color, -1)
        cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4),
                    cv2.FONT_HERSHEY_PLAIN, 1, [225, 255, 255], 1)
        return [
            label, box[0].numpy(), box[1].numpy(), box[2].numpy(),
            box[3].numpy()
        ]

    def detect_bbox(self, frame, imshow=True):
        """Detect objects in ``frame``, draw them and return their boxes.

        Args:
            frame: Image array; it is drawn on in place.
            imshow: When true, show the annotated frame in a window and
                wait up to three seconds.

        Returns:
            A list of ``[label, x1, y1, x2, y2]`` entries, empty when
            ``write_results`` reports no detection.
        """
        img, orig_im, _ = self.prep_image(frame, self.inp_dim)
        CUDA = False  # inference always runs on the CPU here

        # Inference only: no autograd bookkeeping is needed.
        with torch.no_grad():
            output = self.model(Variable(img), CUDA)
        output = write_results(output, self.confidence, self.num_classes,
                               nms=True, nms_conf=self.nms_thesh)

        if isinstance(output, int):
            # write_results signals "no detections" with an int; indexing it
            # like a tensor would raise.
            bbox = []
        else:
            # Normalise box coordinates to [0, 1], then scale to the frame.
            output[:, 1:5] = torch.clamp(output[:, 1:5], 0.0,
                                         float(self.inp_dim)) / self.inp_dim
            output[:, [1, 3]] *= frame.shape[1]
            output[:, [2, 4]] *= frame.shape[0]
            bbox = [self.write(detection, orig_im) for detection in output]

        if imshow:
            cv2.imshow("frame", orig_im)
            cv2.waitKey(3 * 1000)  # wait and show msec
        return bbox