def init_model(args):
    """Build a Darknet detector from an argument namespace.

    Reads ``args.cfgfile``, ``args.weightsfile`` and ``args.reso``. The class
    names in ``yolo/data/coco.names`` are loaded and printed as a diagnostic.

    Args:
        args: object exposing ``cfgfile``, ``weightsfile`` and ``reso``.

    Returns:
        The Darknet model in evaluation mode. It is moved to the GPU when the
        module-level ``CUDA`` flag is truthy.

    Raises:
        AssertionError: if the resolution is not a multiple of 32 or is not
            greater than 32.
    """
    classes = load_classes('yolo/data/coco.names')
    print("classes")
    print(classes)

    # Set up the neural network
    print("Loading network.....")
    model = Darknet(args.cfgfile)
    model.load_weights(args.weightsfile)
    print("Network successfully loaded")

    # The requested resolution is stored in the network info and read back as
    # an int before it is validated.
    model.net_info["height"] = args.reso
    inp_dim = int(model.net_info["height"])
    assert inp_dim % 32 == 0
    assert inp_dim > 32

    # If there's a GPU available, put the model on GPU
    if CUDA:
        model.cuda()

    # Set the model in evaluation mode
    model.eval()
    return model
class YoloLoader():
    """Holds a YOLOv3-SPP Darknet detector loaded onto the GPU.

    Attributes:
        det_model: the Darknet network in evaluation mode.
        det_inp_dim: network input size taken from the global ``opt.inp_dim``.
    """

    def __init__(self):
        detector = Darknet("yolo/cfg/yolov3-spp.cfg")
        self.det_model = detector
        detector.load_weights('models/yolo/yolov3-spp.weights')

        # Store the configured size on the network, then read it back as int.
        detector.net_info['height'] = opt.inp_dim
        self.det_inp_dim = int(detector.net_info['height'])

        # The input size must be a multiple of 32 and larger than 32.
        assert self.det_inp_dim % 32 == 0
        assert self.det_inp_dim > 32

        detector.cuda()
        detector.eval()
def load_yolo_model(args):
    """Load the YOLOv3-SPP detector onto the GPU.

    Args:
        args: object exposing ``inp_dim``, the requested network input size.

    Returns:
        A ``(model, input_size)`` tuple: the Darknet network in evaluation
        mode and its input size as an int.

    Raises:
        AssertionError: if the input size is not a multiple of 32 or is not
            greater than 32.
    """
    print('loading yolo model ...')

    detector = Darknet("yolo/cfg/yolov3-spp.cfg")
    detector.load_weights('models/yolo/yolov3-spp.weights')
    detector.net_info['height'] = args.inp_dim

    input_size = int(detector.net_info['height'])
    assert input_size % 32 == 0
    assert input_size > 32

    detector.cuda()
    detector.eval()
    return detector, input_size
def __init__(self):
    """Load the YOLOv3-SPP detector and attach it to the instance.

    Sets ``self.det_inp_dim`` (input size read from the global
    ``args.inp_dim``) and ``self.det_model`` (the network on the GPU, in
    evaluation mode).
    """
    detector = Darknet("yolo/cfg/yolov3-spp.cfg")
    detector.load_weights('models/yolo/yolov3-spp.weights')
    detector.net_info['height'] = args.inp_dim

    input_size = int(detector.net_info['height'])
    # The input size must be a multiple of 32 and larger than 32.
    assert input_size % 32 == 0
    assert input_size > 32
    self.det_inp_dim = input_size

    detector.cuda()
    detector.eval()
    self.det_model = detector
class DetectionLoader2:
    """Synchronous YOLOv3-SPP detector wrapper.

    The network is placed on the device identified by the module-level
    ``torchCuda`` and configured from the global ``opt``.

    NOTE(review): ``read`` and ``len`` use ``self.Q``, which this class never
    creates; they raise ``AttributeError`` unless a caller assigns a queue to
    ``Q`` first. Confirm whether they are still needed.
    """

    def __init__(self):
        self.det_model = Darknet("./yolo/cfg/yolov3-spp.cfg")
        self.det_model.load_weights('./models/yolo/yolov3-spp.weights')
        self.det_model.net_info['height'] = opt.inp_dim
        self.det_inp_dim = int(self.det_model.net_info['height'])
        assert self.det_inp_dim % 32 == 0
        assert self.det_inp_dim > 32
        self.det_model.cuda(torchCuda)
        self.det_model.eval()

    def load(self, img, orig_img, im_dim_list):
        """Run detection on a batch and return the result for image 0.

        Args:
            img: pre-processed network input tensor.
            orig_img: indexable collection of original images; only element 0
                is returned.
            im_dim_list: tensor with one row of original dimensions per image.

        Returns:
            ``(orig_img[0], boxes, scores, inps, pt1, pt2)``. ``boxes`` and
            ``scores`` come from detection columns 1-4 and 5; ``inps``,
            ``pt1`` and ``pt2`` are zero-filled buffers with one row per box.
            When the NMS step reports no detections (an ``int`` return), the
            last five elements are ``None``.
        """
        with torch.no_grad():
            # Human Detection
            img = img.cuda(torchCuda)
            prediction = self.det_model(img, CUDA=True)
            # NMS process
            dets = dynamic_write_results(prediction, opt.confidence,
                                         opt.num_classes, nms=True,
                                         nms_conf=opt.nms_thesh)
            # An int result has no tensor methods, so return early instead of
            # failing on ``dets.cpu()`` below.
            if isinstance(dets, int):
                return (orig_img[0], None, None, None, None, None)
            dets = dets.cpu()

            # Column 0 is used as the index of the source image in the batch.
            im_dim_list = torch.index_select(im_dim_list, 0, dets[:, 0].long())
            scaling_factor = torch.min(self.det_inp_dim / im_dim_list, 1)[0].view(-1, 1)

            # coordinate transfer: remove the centring offset, then undo the
            # resize so the boxes are expressed in original-image pixels.
            dets[:, [1, 3]] -= (self.det_inp_dim - scaling_factor * im_dim_list[:, 0].view(-1, 1)) / 2
            dets[:, [2, 4]] -= (self.det_inp_dim - scaling_factor * im_dim_list[:, 1].view(-1, 1)) / 2
            dets[:, 1:5] /= scaling_factor

            # Clamp every box to its own image bounds.
            for j in range(dets.shape[0]):
                dets[j, [1, 3]] = torch.clamp(dets[j, [1, 3]], 0.0, im_dim_list[j, 0])
                dets[j, [2, 4]] = torch.clamp(dets[j, [2, 4]], 0.0, im_dim_list[j, 1])
            boxes = dets[:, 1:5]
            scores = dets[:, 5:6]

        # Only the first image of the batch is returned.
        k = 0
        in_image = dets[:, 0] == k
        boxes_k = boxes[in_image]
        inps = torch.zeros(boxes_k.size(0), 3, opt.inputResH, opt.inputResW)
        pt1 = torch.zeros(boxes_k.size(0), 2)
        pt2 = torch.zeros(boxes_k.size(0), 2)
        return (orig_img[k], boxes_k, scores[in_image], inps, pt1, pt2)

    def read(self):
        """Return the next item from ``self.Q``."""
        return self.Q.get()

    def len(self):
        """Return the number of items in ``self.Q``."""
        return self.Q.qsize()
def load_model(opt):
    """Create the pose network and the YOLOv3-SPP detector on the GPU.

    Args:
        opt: object exposing ``inp_dim``, stored as the detector input height.

    Returns:
        A ``(det_model, pose_model)`` tuple, both in evaluation mode.
    """
    dataset = Mscoco()
    pose_net = InferenNet_fast(4 * 1 + 1, dataset)

    detector = Darknet("yolo/cfg/yolov3-spp.cfg")
    detector.load_weights('models/yolo/yolov3-spp.weights')
    detector.net_info['height'] = opt.inp_dim

    # Pose network first, then the detector, as in the original ordering.
    for net in (pose_net, detector):
        net.cuda()
        net.eval()

    return detector, pose_net
def set_yolo(args):
    """Load class labels and a YOLOv3 Darknet model from ``args["yolo"]``.

    The directory is expected to contain ``coco.names``, ``yolov3.weights``
    and ``yolov3.cfg``. The network input height is fixed at 320.

    Returns:
        A ``(labels, model)`` tuple; the model is on the GPU in evaluation
        mode.
    """
    yolo_dir = args["yolo"]
    sep = os.path.sep

    labels = load_classes(sep.join([yolo_dir, "coco.names"]))
    weights_file = sep.join([yolo_dir, "yolov3.weights"])
    config_file = sep.join([yolo_dir, "yolov3.cfg"])

    print("[INFO] loading YOLO from disk...")
    net = Darknet(config_file)
    net.load_weights(weights_file)
    net.net_info["height"] = 320
    net.cuda()
    net.eval()

    return labels, net
def main():
    """Run live detection on a video file or the default camera.

    Reads ``args.config``, ``args.weights`` and ``args.video`` from the
    module-level ``args``. Each frame is resized to the network size, passed
    through ``do_detect`` and shown with the boxes drawn on it. The process
    exits with status -1 when the capture cannot be opened or a frame cannot
    be read. The capture and the display windows are released on the way out.
    """
    m = Darknet(args.config)
    m.print_network()
    m.load_weights(args.weights)
    print('Loading weights from %s... Done!' % (args.weights))

    # The names file is chosen from the number of classes in the network.
    names_by_class_count = {
        20: 'weights/voc.names',
        80: 'weights/coco.names',
        1: 'weights/logo.names',
    }
    namesfile = names_by_class_count.get(m.num_classes, 'data/names')
    print("{} is used for classification".format(namesfile))
    class_names = load_class_names(namesfile)

    use_cuda = True
    if use_cuda:
        m.cuda()

    # 'none' selects capture device 0 instead of a file.
    cap = cv2.VideoCapture(args.video if args.video != 'none' else 0)
    if not cap.isOpened():
        print("Unable to open camera")
        exit(-1)

    try:
        while True:
            res, img = cap.read()
            if not res:
                print("Unable to read image")
                exit(-1)
            sized = cv2.resize(img, (m.width, m.height))
            bboxes = do_detect(m, sized, 0.5, 0.4, use_cuda)
            draw_img = plot_boxes_cv2(img, bboxes, None, class_names)
            cv2.imshow(args.config, draw_img)
            cv2.waitKey(1)
    finally:
        # Runs for exit() and Ctrl-C as well, so the device is always freed.
        cap.release()
        cv2.destroyAllWindows()
class DarknetModel(object):
    """YOLOv3 Darknet model with fixed settings, restored from ``yolov3.pkl``.

    Detection runs on the CPU because ``CUDA`` is set to ``False``.
    """

    def __init__(self):
        # Fixed inference settings.
        self.scales = "1,2,3"
        self.batch_size = 1
        self.confidence = 0.5
        self.nms_thesh = 0.4
        self.reso = 416
        self.CUDA = False
        self.num_classes = 80

        # Label names and drawing colours.
        self.classes = load_classes('data/coco.names')
        self.colors = load_colors('data/pallete')

        net = Darknet('cfg/yolov3.cfg', self.reso)
        self.model = net
        net.load_state_dict(torch.load('yolov3.pkl'))

        self.inp_dim = self.reso
        assert self.inp_dim % 32 == 0
        assert self.inp_dim > 32

        if self.CUDA:
            net.cuda()
        net.eval()

    def predict(self, filename):
        """Detect objects in the image at *filename* and draw them.

        Returns:
            The original image returned by ``prep_image``, after ``write`` has
            been called on it once per detection row.
        """
        frame = cv2.imread(filename)
        net_input, canvas, dims = prep_image(frame, self.inp_dim)
        dim_tensor = torch.FloatTensor(dims).repeat(1, 2)

        if self.CUDA:
            dim_tensor = dim_tensor.cuda()
            net_input = net_input.cuda()

        raw = self.model(net_input)
        detections = sift_results(raw, self.confidence, self.num_classes,
                                  nms=True, nms_conf=self.nms_thesh)

        # Normalise box coordinates to [0, 1] on the network input, then
        # scale them to the source image width and height.
        side = self.inp_dim
        detections[:, 1:5] = torch.clamp(detections[:, 1:5], 0.0, float(side)) / side
        detections[:, [1, 3]] *= frame.shape[1]
        detections[:, [2, 4]] *= frame.shape[0]

        for row in detections:
            write(row, canvas, self.classes, self.colors)
        return canvas
class DetectionNetwork(object):
    """YOLOv3 detector with drawing helpers.

    The network uses the GPU when ``torch.cuda.is_available()`` is true.
    """

    def __init__(self):
        self.confidence = 0.7
        self.nms_thesh = 0.4
        self.resolution = 640
        self.scales = "1,2,3"

        self.CUDA = torch.cuda.is_available()
        self.num_classes = 80
        self.classes = load_classes('yolo/data/coco.names')

        print("Loading network.....")
        self.model_detect = Darknet('cfg/yolov3.cfg')
        self.model_detect.load_weights('yolo/yolov3.weights')
        print("Network successfully loaded")

        self.model_detect.net_info["height"] = self.resolution
        self.inp_dim = int(self.model_detect.net_info["height"])
        assert self.inp_dim % 32 == 0
        assert self.inp_dim > 32

        if self.CUDA:
            self.model_detect.cuda()
        self.model_detect.eval()

        # NOTE: pickle executes arbitrary code on load; only ship a trusted
        # palette file. The context manager closes the handle on error too.
        with open("yolo/pallete", "rb") as palette_file:
            self.colors = pkl.load(palette_file)

    def write(self, x, org_img):
        """Draw one detection row *x* (box and class label) onto *org_img*.

        Columns 1-4 of *x* are used as the box corners and the last column as
        the class index. The colour is picked at random from the palette.

        Returns:
            *org_img*, which is drawn on in place.
        """
        # Plain ints rather than tensor elements are handed to OpenCV.
        c1 = tuple(int(v) for v in x[1:3])
        c2 = tuple(int(v) for v in x[3:5])
        img = org_img
        cls = int(x[-1])
        label = "{0}".format(self.classes[cls])
        color = random.choice(self.colors)
        cv2.rectangle(img, c1, c2, color, 1)
        t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1, 1)[0]
        # Filled rectangle behind the label text, anchored at the top-left.
        c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4
        cv2.rectangle(img, c1, c2, color, -1)
        cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4),
                    cv2.FONT_HERSHEY_PLAIN, 1, [225, 255, 255], 1)
        return img

    def detect(self, image, im_dim_list):
        """Run detection and map the boxes back to original image pixels.

        Args:
            image: pre-processed network input.
            im_dim_list: tensor with one row of original dimensions per image;
                rows are selected by column 0 of the detections.

        Returns:
            The detection tensor with columns 1-4 rescaled and clamped to the
            source image, or ``None`` when ``write_results`` returns an int.
        """
        # Use the instance's device flag so CPU-only hosts are not asked to
        # run on CUDA.
        prediction = self.model_detect(image, self.CUDA)
        output = write_results(prediction, self.confidence, self.num_classes,
                               nms=True, nms_conf=self.nms_thesh)

        # Check if something was found....
        if isinstance(output, int):
            return None

        objs = [self.classes[int(x[-1])] for x in output]
        print("{0:20s} {1:s}".format("Objects Detected:", " ".join(objs)))
        print("----------------------------------------------------------")

        # Scaling, considering original input resolution
        im_dim_list = torch.index_select(im_dim_list, 0, output[:, 0].long())
        scaling_factor = torch.min(self.inp_dim / im_dim_list, 1)[0].view(-1, 1)
        output[:, [1, 3]] -= (self.inp_dim - scaling_factor * im_dim_list[:, 0].view(-1, 1)) / 2
        output[:, [2, 4]] -= (self.inp_dim - scaling_factor * im_dim_list[:, 1].view(-1, 1)) / 2
        output[:, 1:5] /= scaling_factor

        for i in range(output.shape[0]):
            output[i, [1, 3]] = torch.clamp(output[i, [1, 3]], 0.0, im_dim_list[i, 0])
            output[i, [2, 4]] = torch.clamp(output[i, [2, 4]], 0.0, im_dim_list[i, 1])

        return output

    def visualize_outputs(self, detect_output, draw_image):
        """Draw every row of *detect_output* on *draw_image* and return it."""
        for n_f in range(detect_output.size(0)):
            draw_image = self.write(detect_output[n_f, ...], draw_image)
        return draw_image
class YOLODetector(BaseDetector):
    """YOLO (Darknet) detector configured from a ``cfg`` mapping.

    Recognised ``cfg`` keys and their defaults: ``CONFIG``, ``WEIGHTS``,
    ``INP_DIM`` (608), ``NMS_THRES`` (0.6), ``CONFIDENCE`` (0.05) and
    ``NUM_CLASSES`` (80). ``opt`` may be ``None``; otherwise its ``gpus`` and
    ``device`` attributes select where the model runs.
    """

    def __init__(self, cfg, opt=None):
        super(YOLODetector, self).__init__()

        self.detector_cfg = cfg
        self.detector_opt = opt
        self.model_cfg = cfg.get('CONFIG', 'detector/yolo/cfg/yolov3-spp.cfg')
        self.model_weights = cfg.get('WEIGHTS', 'detector/yolo/data/yolov3-spp.weights')
        self.inp_dim = cfg.get('INP_DIM', 608)
        self.nms_thres = cfg.get('NMS_THRES', 0.6)
        self.confidence = cfg.get('CONFIDENCE', 0.05)
        self.num_classes = cfg.get('NUM_CLASSES', 80)
        self.model = None
        self.load_model()

    def load_model(self):
        """Build the Darknet network, load its weights and place it on a device.

        With options and more than one GPU id the model is wrapped in
        ``DataParallel``. With options and at most one id it is moved to
        ``args.device``. Without options it is moved to the default CUDA
        device.
        """
        args = self.detector_opt

        print('Loading YOLO model..')
        self.model = Darknet(self.model_cfg)
        self.model.load_weights(self.model_weights)
        self.model.net_info['height'] = self.inp_dim
        print("Network successfully loaded")

        if args:
            if len(args.gpus) > 1:
                self.model = torch.nn.DataParallel(
                    self.model, device_ids=args.gpus).to(args.device)
            else:
                self.model.to(args.device)
        else:
            self.model.cuda()
        self.model.eval()

    def image_preprocess(self, img_source):
        """Pre-process an image before it is fed to the detection network.

        Args:
            img_source: image path (``str``) or raw image data
                (``numpy.ndarray`` or ``torch.Tensor``).

        Returns:
            The pre-processed image returned by ``prep_image`` or
            ``prep_frame``.

        Raises:
            IOError: if *img_source* is of any other type.
        """
        if isinstance(img_source, str):
            img, _, _ = prep_image(img_source, self.inp_dim)
        elif isinstance(img_source, (torch.Tensor, np.ndarray)):
            img, _, _ = prep_frame(img_source, self.inp_dim)
        else:
            raise IOError('Unknown image source type: {}'.format(
                type(img_source)))

        return img

    def images_detection(self, imgs, orig_dim_list):
        """Detect objects in a pre-processed mini-batch.

        Args:
            imgs: pre-processed mini-batch, (b, 3, h, w).
            orig_dim_list: original image sizes, (b, (w, h, w, h)).

        Returns:
            A CPU tensor of detections, (n, (batch_idx, x1, y1, x2, y2, c, s,
            idx of cls)), with boxes in original-image coordinates, or ``0``
            when nothing was detected.
        """
        args = self.detector_opt
        if self.model is None:
            self.load_model()
        with torch.no_grad():
            imgs = imgs.to(args.device) if args else imgs.cuda()
            prediction = self.model(imgs, args=args)
            dets = self.dynamic_write_results(prediction, self.confidence,
                                              self.num_classes, nms=True,
                                              nms_conf=self.nms_thres)
            if isinstance(dets, int) or dets.shape[0] == 0:
                return 0
            return self._rescale_to_original(dets.cpu(), orig_dim_list)

    def _rescale_to_original(self, dets, dim_list):
        """Map boxes from network-input coordinates back to each source image.

        *dets* is modified in place and returned. The row of *dim_list* used
        for a detection is selected by the detection's batch index (column 0).
        """
        dim_list = torch.index_select(dim_list, 0, dets[:, 0].long())
        scaling_factor = torch.min(self.inp_dim / dim_list, 1)[0].view(-1, 1)

        # Remove the centring offset, then undo the resize.
        dets[:, [1, 3]] -= (self.inp_dim - scaling_factor * dim_list[:, 0].view(-1, 1)) / 2
        dets[:, [2, 4]] -= (self.inp_dim - scaling_factor * dim_list[:, 1].view(-1, 1)) / 2
        dets[:, 1:5] /= scaling_factor

        # Clamp every box to its own image bounds.
        for i in range(dets.shape[0]):
            dets[i, [1, 3]] = torch.clamp(dets[i, [1, 3]], 0.0, dim_list[i, 0])
            dets[i, [2, 4]] = torch.clamp(dets[i, [2, 4]], 0.0, dim_list[i, 1])
        return dets

    def dynamic_write_results(self, prediction, confidence, num_classes, nms=True, nms_conf=0.4):
        """Run ``write_results``, retrying once when it yields many boxes.

        If the first pass returns more than 100 detections, the NMS threshold
        is lowered by 0.05 and the pass is repeated on a fresh copy of
        *prediction*. *prediction* itself is never modified.

        Returns:
            The detections tensor, or the ``int`` returned by
            ``write_results`` when nothing was detected.
        """
        dets = self.write_results(prediction.clone(), confidence, num_classes, nms, nms_conf)
        if isinstance(dets, int):
            return dets

        if dets.shape[0] > 100:
            nms_conf -= 0.05
            dets = self.write_results(prediction.clone(), confidence, num_classes, nms, nms_conf)

        return dets

    def write_results(self, prediction, confidence, num_classes, nms=True, nms_conf=0.4):
        """Threshold, convert and class-wise NMS the raw network output.

        Args:
            prediction: (batchsize, num of objects, (xc, yc, w, h, box
                confidence, class scores)).
            confidence: box-confidence threshold; rows at or below it are
                zeroed out.
            num_classes: number of class-score columns to consider.
            nms: whether to apply non-maximum suppression.
            nms_conf: IoU threshold passed to the NMS step.

        Returns:
            A tensor (n, (batch_ind, x1, y1, x2, y2, c, s, idx of cls)), or
            ``0`` when no detection survives.
        """
        args = self.detector_opt
        conf_mask = (prediction[:, :, 4] > confidence).float().unsqueeze(2)
        prediction = prediction * conf_mask

        # Probe call kept from the original implementation: a failure here is
        # treated as "no detections".
        try:
            torch.nonzero(prediction[:, :, 4], as_tuple=False).transpose(0, 1).contiguous()
        except Exception:
            return 0

        # the 3rd channel of prediction: (xc,yc,w,h)->(x1,y1,x2,y2)
        box_a = prediction.new(prediction.shape)
        box_a[:, :, 0] = (prediction[:, :, 0] - prediction[:, :, 2] / 2)
        box_a[:, :, 1] = (prediction[:, :, 1] - prediction[:, :, 3] / 2)
        box_a[:, :, 2] = (prediction[:, :, 0] + prediction[:, :, 2] / 2)
        box_a[:, :, 3] = (prediction[:, :, 1] + prediction[:, :, 3] / 2)
        prediction[:, :, :4] = box_a[:, :, :4]

        # One tensor per (image, class); concatenated once at the end.
        collected = []

        for ind in range(prediction.size(0)):
            # select the image from the batch
            image_pred = prediction[ind]

            # Replace the per-class scores with the best class score and the
            # index of that class.
            max_conf, max_conf_score = torch.max(image_pred[:, 5:5 + num_classes], 1)
            max_conf = max_conf.float().unsqueeze(1)
            max_conf_score = max_conf_score.float().unsqueeze(1)
            # image_pred: (n, (x1, y1, x2, y2, c, s, idx of cls))
            image_pred = torch.cat((image_pred[:, :5], max_conf, max_conf_score), 1)

            # Get rid of the zero entries
            non_zero_ind = torch.nonzero(image_pred[:, 4], as_tuple=False)
            image_pred_ = image_pred[non_zero_ind.squeeze(), :].view(-1, 7)

            # Get the various classes detected in the image
            try:
                img_classes = unique(image_pred_[:, -1])
            except Exception:
                continue

            # NMS is done class-wise.
            for cls in img_classes:
                # NOTE(review): class index 0 is skipped here, while the
                # comments carried over at the call sites said "only human
                # category is left" and detect_one_img reports every box with
                # category_id 1. Confirm whether this should be `cls != 0`.
                if cls == 0:
                    continue
                # get the detections with one particular class
                cls_mask = image_pred_ * (image_pred_[:, -1] == cls).float().unsqueeze(1)
                class_mask_ind = torch.nonzero(cls_mask[:, -2], as_tuple=False).squeeze()
                image_pred_class = image_pred_[class_mask_ind].view(-1, 7)

                # sort so the entry with the highest objectness confidence is
                # at the top
                conf_sort_index = torch.sort(image_pred_class[:, 4], descending=True)[1]
                image_pred_class = image_pred_class[conf_sort_index]

                if nms:
                    if platform.system() != 'Windows':
                        # nms_op input: (n, (x1, y1, x2, y2, c))
                        # nms_op output: input[inds, :], inds
                        nms_op = getattr(nms_wrapper, 'nms')
                        _, inds = nms_op(image_pred_class[:, :5], nms_conf)
                        image_pred_class = image_pred_class[inds]
                    else:
                        # Greedy NMS fallback used on Windows.
                        max_detections = []
                        while image_pred_class.size(0):
                            # Keep the detection with the highest confidence.
                            max_detections.append(image_pred_class[0].unsqueeze(0))
                            # Stop if we're at the last detection
                            if len(image_pred_class) == 1:
                                break
                            # IoU against all boxes with lower confidence
                            ious = bbox_iou(max_detections[-1], image_pred_class[1:], args)
                            # Remove detections with IoU >= NMS threshold
                            image_pred_class = image_pred_class[1:][ious < nms_conf]

                        image_pred_class = torch.cat(max_detections).data

                # Prefix each row with the index of its image in the batch so
                # that all detections can live in one flat tensor.
                batch_ind = image_pred_class.new(image_pred_class.size(0), 1).fill_(ind)
                collected.append(torch.cat((batch_ind, image_pred_class), 1))

        if not collected:
            return 0
        # output: (n, (batch_ind, x1, y1, x2, y2, c, s, idx of cls))
        return torch.cat(collected)

    def detect_one_img(self, img_name):
        """Detect boxes in one image file.

        Args:
            img_name: full path of the image. The file name without its
                extension must parse as an integer; it becomes ``image_id``.

        Returns:
            A list of dicts shaped like COCO results,
            ``{"category_id": 1, "score": float, "bbox": [x, y, w, h],
            "image_id": int}``, or ``None`` when nothing was detected.

        Raises:
            ValueError: if the file stem is not an integer.
        """
        args = self.detector_opt
        if self.model is None:
            self.load_model()
        # A single image does not need the DataParallel wrapper.
        if isinstance(self.model, torch.nn.DataParallel):
            self.model = self.model.module
        dets_results = []
        # pre-process (scale, normalize, ...) the image
        img, orig_img, img_dim_list = prep_image(img_name, self.inp_dim)
        with torch.no_grad():
            img_dim_list = torch.FloatTensor([img_dim_list]).repeat(1, 2)
            img = img.to(args.device) if args else img.cuda()
            prediction = self.model(img, args=args)
            dets = self.dynamic_write_results(prediction, self.confidence,
                                              self.num_classes, nms=True,
                                              nms_conf=self.nms_thres)
            if isinstance(dets, int) or dets.shape[0] == 0:
                return None
            dets = self._rescale_to_original(dets.cpu(), img_dim_list)

        image_id = os.path.basename(img_name).split('.')[0]
        for i in range(dets.shape[0]):
            # write results: corner boxes become (x, y, w, h)
            x = float(dets[i, 1])
            y = float(dets[i, 2])
            w = float(dets[i, 3] - dets[i, 1])
            h = float(dets[i, 4] - dets[i, 2])
            dets_results.append({
                "category_id": 1,
                "score": float(dets[i, 5]),
                "bbox": [x, y, w, h],
                "image_id": int(image_id),
            })

        return dets_results
class Alphapose_skeleton:
    """YOLO person detection followed by AlphaPose keypoint estimation.

    Args:
        cuda_id: CUDA device index used for both networks.
        fast_yolo: load the ``yolov3-tiny`` detector instead of ``yolov3``.
    """

    def __init__(self, cuda_id=0, fast_yolo=False):
        # Accumulated wall-clock seconds, reported by ``runtime``.
        self.time_det = 0.0
        self.time_run = 0.0
        self.cuda_id = cuda_id

        self.target_kps = [5, 6, 7, 8, 9, 10]

        # Load yolo detection model
        print('Loading YOLO model..')
        if fast_yolo:
            self.det_model = Darknet('./AlphaPose/yolo/cfg/yolov3-tiny.cfg', self.cuda_id)
            self.det_model.load_weights('./AlphaPose/models/yolo/yolov3-tiny.weights')
        else:
            self.det_model = Darknet('./AlphaPose/yolo/cfg/yolov3.cfg', self.cuda_id)
            self.det_model.load_weights('./AlphaPose/models/yolo/yolov3.weights')

        self.det_model.cuda(self.cuda_id)
        self.det_model.eval()

        # Load pose model
        print('Loading Alphapose pose model..')
        pose_dataset = Mscoco()
        if args.fast_inference:
            self.pose_model = InferenNet_fast(4 * 1 + 1, pose_dataset)
        else:
            self.pose_model = InferenNet(4 * 1 + 1, pose_dataset)
        self.pose_model.cuda(self.cuda_id)
        self.pose_model.eval()

    def run(self, folder_or_imglist, sample_rate):
        """Estimate skeletons for a folder of JPEGs or a list of images.

        Only every ``sample_rate``-th frame is processed; the frames in
        between reuse the result of the preceding processed frame.

        Args:
            folder_or_imglist: directory path (``str``) whose ``*jpg`` files
                are processed in sorted order, or a sequence of images.
            sample_rate: stride between processed frames.

        Returns:
            One list per input frame:
            ``['image_XXXXX.jpg', x, y, score, x, y, score, ...]`` with the
            keypoints of every detected person appended in turn.
        """
        time_run_start = time.time()

        if isinstance(folder_or_imglist, str):
            inputpath = folder_or_imglist
            print(inputpath)
            args.inputpath = inputpath

            # Load input images
            im_names = [img for img in sorted(os.listdir(inputpath)) if img.endswith('jpg')]
            N = len(im_names)
            # Subsample like the list branch so that result j corresponds to
            # frame j * sample_rate in the output loop below.
            sampled_names = [name for i, name in enumerate(im_names) if i % sample_rate == 0]
            dataset = Image_loader(sampled_names, format='yolo')
        else:
            N = len(folder_or_imglist)
            imglist = [img for i, img in enumerate(folder_or_imglist) if i % sample_rate == 0]
            dataset = Image_loader_from_images(imglist, format='yolo')

        # Load detection loader
        test_loader = DetectionLoader(dataset, self.det_model, self.cuda_id).start()
        skeleton_result_list = []
        for _ in range(len(dataset)):
            with torch.no_grad():
                (inp, orig_img, im_name, boxes, scores) = test_loader.read()

                if boxes is None or boxes.nelement() == 0:
                    skeleton_result = None
                else:
                    # Pose Estimation
                    time_det_start = time.time()
                    inps, pt1, pt2 = crop_from_dets(inp, boxes)
                    inps = Variable(inps.cuda(self.cuda_id))

                    hm = self.pose_model(inps)
                    hm_data = hm.cpu().data

                    preds_hm, preds_img, preds_scores = getPrediction(
                        hm_data, pt1, pt2, args.inputResH, args.inputResW,
                        args.outputResH, args.outputResW)
                    skeleton_result = pose_nms(boxes, scores, preds_img, preds_scores)
                    self.time_det += (time.time() - time_det_start)

                skeleton_result_list.append(skeleton_result)

        skeleton_list = []
        j = 0
        # Frames before the second sampled frame use the first result, not
        # whatever the detection loop happened to leave behind.
        skeleton_result = skeleton_result_list[0] if skeleton_result_list else None
        for i in range(N):
            im_name = 'image_{:05d}.jpg'.format(i + 1)

            # Advance to the next processed frame every ``sample_rate`` frames.
            if i == sample_rate * (1 + j):
                j += 1
                skeleton_result = skeleton_result_list[j]

            skeleton_list.append([im_name.split('/')[-1]])
            if skeleton_result is not None:
                for human in skeleton_result:
                    kp_preds = human['keypoints']
                    kp_scores = human['kp_score']

                    for n in range(kp_scores.shape[0]):
                        skeleton_list[-1].append(int(kp_preds[n, 0]))
                        skeleton_list[-1].append(int(kp_preds[n, 1]))
                        skeleton_list[-1].append(round(float(kp_scores[n]), 2))

        self.time_run += (time.time() - time_run_start)
        return skeleton_list

    def runtime(self):
        """Return ``(time_det, time_run)``, the accumulated seconds."""
        return self.time_det, self.time_run

    def save_skeleton(self, skeleton_list, outputpath):
        """Write one space-separated line per skeleton to ``skeleton.txt``.

        The output directory, including missing parents, is created when it
        does not exist.
        """
        os.makedirs(outputpath, exist_ok=True)

        with open(os.path.join(outputpath, 'skeleton.txt'), 'w') as out_file:
            for skeleton in skeleton_list:
                out_file.write(' '.join(str(x) for x in skeleton))
                out_file.write('\n')
class DetectionLoader:
    """Background worker that runs YOLO detection and queues the results.

    Batches are pulled from ``dataloder``, run through the YOLOv3-SPP
    detector, and one tuple per image is pushed onto ``self.Q``. With
    ``opt.sp`` set the worker is a thread and the queue a ``Queue``;
    otherwise a process and an ``mp.Queue`` are used.
    """

    def __init__(self, dataloder, batchSize=1, queueSize=1024):
        detector = Darknet("yolo/cfg/yolov3-spp.cfg")
        self.det_model = detector
        detector.load_weights('models/yolo/yolov3-spp.weights')
        detector.net_info['height'] = opt.inp_dim
        self.det_inp_dim = int(detector.net_info['height'])
        # The input size must be a multiple of 32 and greater than 32.
        assert self.det_inp_dim % 32 == 0
        assert self.det_inp_dim > 32
        detector.cuda()
        detector.eval()

        self.stopped = False
        self.dataloder = dataloder
        self.batchSize = batchSize
        self.datalen = self.dataloder.length()
        # One extra batch when the data does not divide evenly.
        partial_batch = 1 if self.datalen % batchSize else 0
        self.num_batches = self.datalen // batchSize + partial_batch

        # Queue holding the per-image detection results.
        queue_factory = Queue if opt.sp else mp.Queue
        self.Q = queue_factory(maxsize=queueSize)

    def start(self):
        """Start the daemon worker running ``update`` and return ``self``."""
        worker_factory = Thread if opt.sp else mp.Process
        worker = worker_factory(target=self.update, args=())
        worker.daemon = True
        worker.start()
        return self

    def _enqueue(self, item):
        """Push *item*, pausing two seconds first if the queue is full."""
        if self.Q.full():
            time.sleep(2)
        self.Q.put(item)

    def update(self):
        """Detect on every batch and queue one result tuple per image.

        Each tuple is ``(orig_img, im_name, boxes, scores, inps, pt1, pt2)``.
        The last five entries are ``None`` for images without detections, and
        a tuple of seven ``None`` values is queued when the data loader
        returns no image.
        """
        # keep looping the whole dataset
        for _ in range(self.num_batches):
            img, orig_img, im_name, im_dim_list = self.dataloder.getitem()
            if img is None:
                self.Q.put((None, None, None, None, None, None, None))
                return

            with torch.no_grad():
                # Human Detection
                img = img.cuda()
                prediction = self.det_model(img, CUDA=True)
                # NMS process
                dets = dynamic_write_results(prediction, opt.confidence,
                                             opt.num_classes, nms=True,
                                             nms_conf=opt.nms_thesh)
                if isinstance(dets, int) or dets.shape[0] == 0:
                    for k in range(len(orig_img)):
                        self._enqueue((orig_img[k], im_name[k], None, None, None, None, None))
                    continue
                dets = dets.cpu()

                # index_select arguments: the tensor to index, the dimension
                # (0 selects rows), and the indices. This yields the
                # (w, h, w, h) row of the image each detection belongs to.
                im_dim_list = torch.index_select(im_dim_list, 0, dets[:, 0].long())
                # Each entry is the scale factor used when that image was
                # resized to the network input size.
                scaling_factor = torch.min(self.det_inp_dim / im_dim_list, 1)[0].view(-1, 1)

                # coordinate transfer
                dets[:, [1, 3]] -= (self.det_inp_dim - scaling_factor * im_dim_list[:, 0].view(-1, 1)) / 2
                dets[:, [2, 4]] -= (self.det_inp_dim - scaling_factor * im_dim_list[:, 1].view(-1, 1)) / 2

                dets[:, 1:5] /= scaling_factor
                for j in range(dets.shape[0]):
                    # Clamp every coordinate to [0, w] / [0, h] of its image.
                    dets[j, [1, 3]] = torch.clamp(dets[j, [1, 3]], 0.0, im_dim_list[j, 0])
                    dets[j, [2, 4]] = torch.clamp(dets[j, [2, 4]], 0.0, im_dim_list[j, 1])
                boxes = dets[:, 1:5]
                scores = dets[:, 5:6]

            for k in range(len(orig_img)):
                boxes_k = boxes[dets[:, 0] == k]
                if isinstance(boxes_k, int) or boxes_k.shape[0] == 0:
                    self._enqueue((orig_img[k], im_name[k], None, None, None, None, None))
                    continue
                count = boxes_k.size(0)
                inps = torch.zeros(count, 3, opt.inputResH, opt.inputResW)
                pt1 = torch.zeros(count, 2)
                pt2 = torch.zeros(count, 2)
                self._enqueue((orig_img[k], im_name[k], boxes_k,
                               scores[dets[:, 0] == k], inps, pt1, pt2))

    def read(self):
        """Return the next result tuple from the queue."""
        return self.Q.get()

    def len(self):
        """Return the current queue size."""
        return self.Q.qsize()
class DetectionLoader:
    """Camera pipeline: grab a frame, detect people, estimate and plot a pose.

    Args:
        batchSize: used only to derive ``num_batches`` from ``datalen`` (1).
        queueSize: accepted but not used by this class.
        size: side length passed to ``plt.figure(figsize=...)``.
        device: index handed to ``cv2.VideoCapture``.
    """

    def __init__(self, batchSize=1, queueSize=1, size=100, device=0):
        ## camera stream
        self.stream = cv2.VideoCapture(device)
        assert self.stream.isOpened(), 'Cannot capture from camera'
        self.stream.set(cv2.CAP_PROP_BUFFERSIZE, 1)
        self.inp_dim = int(opt.inp_dim)

        ## yolo model
        self.det_model = Darknet("joints_detectors/Alphapose/yolo/cfg/yolov3-spp.cfg")
        self.det_model.load_weights('joints_detectors/Alphapose/models/yolo/yolov3-spp.weights')
        self.det_model.net_info['height'] = opt.inp_dim
        self.det_inp_dim = int(self.det_model.net_info['height'])
        assert self.det_inp_dim % 32 == 0
        assert self.det_inp_dim > 32
        self.det_model.cuda()
        self.det_model.eval()
        self.batchSize = batchSize
        self.datalen = 1
        leftover = 0
        if (self.datalen) % batchSize:
            leftover = 1
        self.num_batches = self.datalen // batchSize + leftover

        ## alphapose model
        fast_inference = True
        pose_dataset = Mscoco()
        if fast_inference:
            self.pose_model = InferenNet_fast(4 * 1 + 1, pose_dataset)
        else:
            self.pose_model = InferenNet(4 * 1 + 1, pose_dataset)
        self.pose_model.cuda()
        self.pose_model.eval()

        ## 2d plotting
        self.fig_in = plt.figure(figsize=(size, size))
        self.ax_in = self.fig_in.add_subplot(1, 1, 1)
        self.ax_in.get_xaxis().set_visible(False)
        self.ax_in.get_yaxis().set_visible(False)
        self.ax_in.set_axis_off()
        self.ax_in.set_title('Input')
        self.initialized = False
        self.size = size
        thismanager = get_current_fig_manager()
        thismanager.window.wm_geometry("+0-1000")

    def update(self):
        """Process one camera frame.

        Returns:
            The keypoints of the highest-ranked person (ranked by
            ``proposal_score * calculate_area(keypoints)``), or ``None`` when
            the frame could not be read or nobody was found.

        Raises:
            NotImplementedError: if detections exist but none of them belongs
                to image 0.
        """
        grabbed, frame = self.stream.read()
        if not grabbed:
            # Nothing to show or process; the caller can simply retry.
            return None

        img_k, self.orig_img, im_dim_list_k = prep_frame(frame, self.inp_dim)
        img = torch.cat([img_k])
        im_dim_list = torch.FloatTensor([im_dim_list_k]).repeat(1, 2)

        with torch.no_grad():
            ### detector
            # Human Detection
            img = img.cuda()
            prediction = self.det_model(img, CUDA=True)
            # NMS process
            dets = dynamic_write_results(prediction, opt.confidence,
                                         opt.num_classes, nms=True,
                                         nms_conf=opt.nms_thesh)
            if isinstance(dets, int) or dets.shape[0] == 0:
                self.visualize2dnoperson()
                return None

            dets = dets.cpu()
            im_dim_list = torch.index_select(im_dim_list, 0, dets[:, 0].long())
            scaling_factor = torch.min(self.det_inp_dim / im_dim_list, 1)[0].view(-1, 1)

            # coordinate transfer
            dets[:, [1, 3]] -= (self.det_inp_dim - scaling_factor * im_dim_list[:, 0].view(-1, 1)) / 2
            dets[:, [2, 4]] -= (self.det_inp_dim - scaling_factor * im_dim_list[:, 1].view(-1, 1)) / 2

            dets[:, 1:5] /= scaling_factor
            for j in range(dets.shape[0]):
                dets[j, [1, 3]] = torch.clamp(dets[j, [1, 3]], 0.0, im_dim_list[j, 0])
                dets[j, [2, 4]] = torch.clamp(dets[j, [2, 4]], 0.0, im_dim_list[j, 1])
            boxes = dets[:, 1:5]
            scores = dets[:, 5:6]

            boxes_k = boxes[dets[:, 0] == 0]
            if isinstance(boxes_k, int) or boxes_k.shape[0] == 0:
                self.visualize2dnoperson()
                raise NotImplementedError

            inps = torch.zeros(boxes_k.size(0), 3, opt.inputResH, opt.inputResW)
            pt1 = torch.zeros(boxes_k.size(0), 2)
            pt2 = torch.zeros(boxes_k.size(0), 2)

            ### processor
            inp = im_to_torch(cv2.cvtColor(self.orig_img, cv2.COLOR_BGR2RGB))
            inps, pt1, pt2 = self.crop_from_dets(inp, boxes, inps, pt1, pt2)

            ### generator
            self.orig_img = np.array(self.orig_img, dtype=np.uint8)

            # Run the pose network on chunks of at most 20 crops.
            datalen = inps.size(0)
            batchSize = 20
            hm = []
            for start in range(0, datalen, batchSize):
                inps_j = inps[start:min(start + batchSize, datalen)].cuda()
                hm.append(self.pose_model(inps_j))
            hm = torch.cat(hm)
            hm = hm.cpu().data

            preds_hm, preds_img, preds_scores = getPrediction(
                hm, pt1, pt2, opt.inputResH, opt.inputResW,
                opt.outputResH, opt.outputResW)
            result = pose_nms(boxes, scores, preds_img, preds_scores)

            if not result:  # No people
                self.visualize2dnoperson()
                return None

            self.kpt = max(
                result,
                key=lambda x: x['proposal_score'].data[0] * calculate_area(x['keypoints']),
            )['keypoints']
            self.visualize2d()
            return self.kpt

    def crop_from_dets(self, img, boxes, inps, pt1, pt2):
        """Crop people from the image according to the detection results.

        Each box is enlarged by 30% (15% per side), limited to the image, and
        cropped with ``cropBox`` into ``inps[i]``. The padded corners are
        stored in ``pt1[i]`` and ``pt2[i]``.

        Note:
            *img* is modified in place: a per-channel constant is subtracted
            from its first three channels before cropping.

        Returns:
            ``(inps, pt1, pt2)``, the same objects that were passed in.
        """
        imght = img.size(1)
        imgwidth = img.size(2)
        tmp_img = img
        tmp_img[0].add_(-0.406)
        tmp_img[1].add_(-0.457)
        tmp_img[2].add_(-0.480)
        for i, box in enumerate(boxes):
            upLeft = torch.Tensor((float(box[0]), float(box[1])))
            bottomRight = torch.Tensor((float(box[2]), float(box[3])))

            ht = bottomRight[1] - upLeft[1]
            width = bottomRight[0] - upLeft[0]

            scaleRate = 0.3

            upLeft[0] = max(0, upLeft[0] - width * scaleRate / 2)
            upLeft[1] = max(0, upLeft[1] - ht * scaleRate / 2)
            # The crop is kept at least 5 pixels wide and high.
            bottomRight[0] = max(
                min(imgwidth - 1, bottomRight[0] + width * scaleRate / 2), upLeft[0] + 5)
            bottomRight[1] = max(
                min(imght - 1, bottomRight[1] + ht * scaleRate / 2), upLeft[1] + 5)

            try:
                inps[i] = cropBox(tmp_img.clone(), upLeft, bottomRight,
                                  opt.inputResH, opt.inputResW)
            except IndexError:
                # Best effort: log the offending box and leave inps[i] as is.
                print(tmp_img.shape)
                print(upLeft)
                print(bottomRight)
                print('===')
            pt1[i] = upLeft
            pt2[i] = bottomRight

        return inps, pt1, pt2

    def visualize2d(self):
        """Show the current frame with the keypoints in ``self.kpt``."""
        if not self.initialized:
            self.image = self.ax_in.imshow(self.orig_img, aspect='equal')
            self.point = self.ax_in.scatter(*self.kpt.T, 5, color='red',
                                            edgecolors='white', zorder=10)
            self.initialized = True
        else:
            self.image.set_data(self.orig_img)
            self.point.set_offsets(self.kpt)

    def visualize2dnoperson(self):
        """Show the current frame without keypoints.

        ``self.initialized`` is left untouched, so the scatter artist is still
        created by the first ``visualize2d`` call.
        """
        if not self.initialized:
            self.image = self.ax_in.imshow(self.orig_img, aspect='equal')
        else:
            self.image.set_data(self.orig_img)
class PeopleDetector:
    """Detect people in single frames with a Darknet YOLOv3 network.

    The network is built from ``cfg_path`` / ``weights_path`` when the object
    is created, moved to the GPU when ``torch.cuda.is_available()`` and put in
    evaluation mode.
    """

    def __init__(self,
                 confidence=0.5,
                 nms_thresh=0.4,
                 resolution=416,
                 weights_path='yolo/weights/yolov3.weights',
                 cfg_path='yolo/cfg/yolov3.cfg',
                 num_classes=80,
                 names_path='yolo/data/coco.names'):
        """Load class names, build the network and load its weights.

        Args:
            confidence: confidence threshold forwarded to ``write_results``.
            nms_thresh: NMS threshold forwarded to ``write_results``.
            resolution: square network input size; must be a multiple of 32
                and larger than 32.
            weights_path: Darknet weights file.
            cfg_path: Darknet configuration file.
            num_classes: number of classes forwarded to ``write_results``.
            names_path: file read by ``load_classes`` for the class names.
        """
        self.confidence = confidence
        self.nms_thesh = nms_thresh
        self.weightsfile = weights_path
        self.cfgfile = cfg_path
        self.CUDA = torch.cuda.is_available()
        self.num_classes = num_classes
        self.classes = load_classes(names_path)

        self.model = Darknet(self.cfgfile)
        self.model.load_weights(self.weightsfile)
        self.model.net_info["height"] = resolution
        self.inp_dim = int(self.model.net_info["height"])
        # The resolution has to be a multiple of 32 and larger than 32.
        assert self.inp_dim % 32 == 0
        assert self.inp_dim > 32

        # If there's a GPU available, put the model on GPU.
        if self.CUDA:
            self.model.cuda()
        # Set the model in evaluation mode.
        self.model.eval()

    def prep_image(self, img):
        """Prepare an image (resize) for inputting to the neural network.

        Returns:
            A tuple ``(tensor, original image, (width, height))``. The tensor
            is the image resized to ``inp_dim x inp_dim`` with its channel
            axis reversed, moved to channel-first layout, scaled by 1/255 and
            given a leading batch axis.
        """
        orig_im = img
        dim = orig_im.shape[1], orig_im.shape[0]
        img = cv2.resize(orig_im, (self.inp_dim, self.inp_dim))
        img_ = img[:, :, ::-1].transpose((2, 0, 1)).copy()
        img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0)
        return img_, orig_im, dim

    def writeSingleLabel(self, x, img, paintBoundingBoxies, color=(0, 0, 255)):
        """Draw one detection and its class label unless it lies inside a painting box.

        Args:
            x: one detection row; ``x[1:5]`` are the box corners and ``x[-1]``
                is the class index.
            img: image drawn on in place.
            paintBoundingBoxies: iterable of ``(x, y, w, h)`` boxes; a
                detection fully contained in any of them is not drawn.
            color: rectangle colour (default red in OpenCV channel order).

        Returns:
            ``img`` when the detection was drawn, ``None`` when it was skipped.
        """
        # Plain Python ints: recent OpenCV bindings reject 0-d tensors as
        # point coordinates, and int() truncates exactly like Tensor.int().
        c1 = tuple(int(v) for v in x[1:3])
        c2 = tuple(int(v) for v in x[3:5])

        # Check whether the person box is inside the box of a painting.
        is_inside = any(
            box[0] <= c1[0] < c2[0] <= (box[0] + box[2])
            and box[1] <= c1[1] < c2[1] <= (box[1] + box[3])
            for box in paintBoundingBoxies)
        if is_inside:
            return

        cls = int(x[-1])
        label = "{0}".format(self.classes[cls])
        cv2.rectangle(img, c1, c2, color, 1)
        t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1, 1)[0]
        # Filled background rectangle sized to the label text.
        c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4
        cv2.rectangle(img, c1, c2, color, -1)
        cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4),
                    cv2.FONT_HERSHEY_PLAIN, 1, [225, 255, 255], 1)
        return img

    def writLabels(self, origin_im, netOutput, paintBoundingBoxies):
        """Draw every detection of ``netOutput`` that is not inside a painting box.

        Returns:
            ``origin_im`` (drawn on in place).
        """
        for detection in netOutput:
            self.writeSingleLabel(detection, origin_im, paintBoundingBoxies)
        return origin_im

    def detectPeopleFromFrame(self, frame):
        """Detect people inside a frame and return their bounding boxes.

        Returns:
            The detection tensor restricted to class id 0 with box coordinates
            expressed in ``frame`` pixels, or ``None`` when ``write_results``
            reports no detection or the rows do not have 8 columns.
        """
        # Prepare an input compatible with the network.
        img, orig_im, dim = self.prep_image(frame)

        # Load the image on the GPU if available.
        if self.CUDA:
            img = img.cuda()

        # Inference.
        with torch.no_grad():
            output = self.model(Variable(img), self.CUDA)

        # Collect the predictions into a single result.
        output = write_results(output,
                               self.confidence,
                               self.num_classes,
                               nms=True,
                               nms_conf=self.nms_thesh)

        # write_results signals "no detection" with an int.
        if isinstance(output, int):
            return None

        # Keep only people --> people id == 0.
        output = output[output[:, -1] < 1]

        # Rescale from network-input coordinates to frame coordinates.
        output[:, 1:5] = torch.clamp(output[:, 1:5], 0.0,
                                     float(self.inp_dim)) / self.inp_dim
        output[:, [1, 3]] *= frame.shape[1]
        output[:, [2, 4]] *= frame.shape[0]

        if output.shape[1] != 8:
            return None
        return output
# Output video file; the webcam identifier is embedded in the file name.
save_path = os.path.join(args.outputpath,
                         'AlphaPose_webcam' + webcam + '.avi')
writer = DataWriter(
    args.save_video,
    save_path,
    cv2.VideoWriter_fourcc(*'XVID'),
    fps,
    frameSize,
).start()

# Build the YOLO detector and configure its input resolution.
print('Loading YOLO model..')
sys.stdout.flush()
det_model = Darknet("yolo/cfg/yolov3.cfg")
det_model.load_weights('models/yolo/yolov3.weights')
det_model.net_info['height'] = args.inp_dim
det_inp_dim = int(det_model.net_info['height'])
# The input size must be a multiple of 32 and larger than 32.
assert det_inp_dim % 32 == 0
assert det_inp_dim > 32
det_model.cuda()
det_model.eval()

# Build the pose network; the fast variant is chosen by a command-line flag.
pose_dataset = Mscoco()
pose_net_factory = InferenNet_fast if args.fast_inference else InferenNet
pose_model = pose_net_factory(4 * 1 + 1, pose_dataset)
pose_model.cuda()
pose_model.eval()

# One empty list per profiling key.
runtime_profile = {stage: [] for stage in ('ld', 'dt', 'dn', 'pt', 'pn')}

print('Starting webcam demo, press Ctrl + C to terminate...')
sys.stdout.flush()
class MSRApose_skeleton():
    """YOLO person detection followed by an MSRA pose network per detected box."""

    def __init__(self, cuda_id=0, fast_yolo=False):
        """Load the YOLO detector and the MSRA pose network.

        Args:
            cuda_id: accepted for interface compatibility; not used here.
            fast_yolo: load the ``yolov3-tiny`` cfg/weights instead of
                ``yolov3``.
        """
        self.time_det = 0.0
        self.time_run = 0.0

        self.num_joints = 17
        self.target_kps = [5, 6, 7, 8, 9, 10]

        # Load yolo detection model
        print('Loading YOLO model..')
        if fast_yolo:
            self.det_model = Darknet('./AlphaPose/yolo/cfg/yolov3-tiny.cfg')
            self.det_model.load_weights(
                './AlphaPose/models/yolo/yolov3-tiny.weights')
        else:
            self.det_model = Darknet("./AlphaPose/yolo/cfg/yolov3.cfg")
            self.det_model.load_weights(
                './AlphaPose/models/yolo/yolov3.weights')

        self.det_model.cuda()
        self.det_model.eval()

        cfg_file = 'MSRAPose/experiments/coco/resnet50/256x192_d256x3_adam_lr1e-3.yaml'
        model_file = 'MSRAPose/models/pytorch/pose_coco/pose_resnet_50_256x192.pth.tar'
        # update config
        update_config(cfg_file)
        config.TEST.MODEL_FILE = model_file

        # cudnn related setting
        cudnn.benchmark = config.CUDNN.BENCHMARK
        torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
        torch.backends.cudnn.enabled = config.CUDNN.ENABLED

        # load pre-trained model
        # NOTE(review): eval() runs text taken from the config file
        # (config.MODEL.NAME); only use config files you trust.
        self.model = eval('models_msra.' + config.MODEL.NAME +
                          '.get_pose_net')(config, is_train=False)
        print('Loading MSRA pose model..')
        print('=> loading model from {}'.format(config.TEST.MODEL_FILE))
        self.model.load_state_dict(torch.load(config.TEST.MODEL_FILE))

        gpus = [int(i) for i in config.GPUS.split(',')]
        self.model = torch.nn.DataParallel(self.model, device_ids=gpus).cuda()
        self.model.eval()

        # image transform
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
        ])

    def _box2cs(self, box, image_width, image_height):
        """Convert an ``(x, y, w, h, ...)`` box into ``(center, scale)``."""
        x, y, w, h = box[:4]
        return self._xywh2cs(x, y, w, h, image_width, image_height)

    def _xywh2cs(self, x, y, w, h, image_width, image_height):
        """Return the box center and its scale in units of 200 pixels.

        The box is first grown along one axis so that ``w / h`` equals
        ``image_width / image_height``; the resulting scale is enlarged by
        1.25 unless ``center[0] == -1``.

        Returns:
            ``(center, scale)``, both float32 arrays of length 2.
        """
        center = np.zeros((2), dtype=np.float32)
        center[0] = x + w * 0.5
        center[1] = y + h * 0.5

        aspect_ratio = image_width * 1.0 / image_height
        pixel_std = 200

        if w > aspect_ratio * h:
            h = w * 1.0 / aspect_ratio
        elif w < aspect_ratio * h:
            w = h * aspect_ratio
        scale = np.array([w * 1.0 / pixel_std, h * 1.0 / pixel_std],
                         dtype=np.float32)
        if center[0] != -1:
            scale = scale * 1.25

        return center, scale

    def detect_skeleton_on_single_human(self, image, box):
        """Run the pose network on one person box of ``image``.

        Args:
            image: image read by opencv2.
            box: ``[x, y, w, h]`` box, or ``None`` to use a box built from
                the image shape.

        Returns:
            ``preds[0]`` from ``get_final_preds`` for this box.
        """
        data_numpy = image.copy()

        # NOTE(review): shape[0]/shape[1] are passed where width/height are
        # expected (both for the default box and the aspect ratio) - confirm
        # this ordering is intended.
        if box is None:
            box = [0, 0, data_numpy.shape[0], data_numpy.shape[1]]
        c, s = self._box2cs(box, data_numpy.shape[0], data_numpy.shape[1])
        r = 0

        trans = get_affine_transform(c, s, r, config.MODEL.IMAGE_SIZE)
        warped = cv2.warpAffine(
            data_numpy,
            trans,
            (int(config.MODEL.IMAGE_SIZE[0]), int(config.MODEL.IMAGE_SIZE[1])),
            flags=cv2.INTER_LINEAR)

        net_input = self.transform(warped).unsqueeze(0)

        with torch.no_grad():
            # compute output heatmap
            output = self.model(net_input)
            output = output.clone().cpu().numpy()

            # compute coordinate
            preds, maxvals = get_final_preds(config, output, np.asarray([c]),
                                             np.asarray([s]))

        return preds[0]

    def run(self, folder_or_imglist, sample_rate):
        """Detect people and their skeletons for a folder or a list of images.

        Args:
            folder_or_imglist: a directory path (``str``) whose ``*jpg``
                files are processed in sorted order, or a list of images.
            sample_rate: accepted for interface compatibility; not used here.

        Returns:
            One list per image: the image file name followed, for every
            joint of every detected person, by ``x, y, 0.8``.
        """
        time_run_start = time.time()

        # isinstance() is required here: comparing type(...) with the string
        # 'str' is never true, which made the folder branch unreachable.
        if isinstance(folder_or_imglist, str):
            inputpath = folder_or_imglist
            print(inputpath)
            args.inputpath = inputpath

            # Load input images
            im_names = [
                img for img in sorted(os.listdir(inputpath))
                if img.endswith('jpg')
            ]
            dataset = Image_loader(im_names, format='yolo')
        else:
            imglist = folder_or_imglist
            dataset = Image_loader_from_images(imglist, format='yolo')

        # Load detection loader
        test_loader = DetectionLoader(dataset, self.det_model).start()
        skeleton_list = []

        for _ in range(len(dataset)):
            with torch.no_grad():
                (inp, orig_img, im_name, boxes, scores) = test_loader.read()

                skeleton_result = []
                if boxes is None or boxes.nelement() == 0:
                    skeleton_result = None
                else:
                    # Pose Estimation
                    time_det_start = time.time()
                    for box in boxes.tolist():
                        x1, y1, x2, y2 = (int(box[0]), int(box[1]),
                                          int(box[2]), int(box[3]))
                        # corner format -> (x, y, w, h)
                        box = [x1, y1, x2 - x1, y2 - y1]
                        skeleton_result.append(
                            self.detect_skeleton_on_single_human(
                                orig_img, box))
                    self.time_det += (time.time() - time_det_start)

                skeleton_list.append([im_name.split('/')[-1]])
                if skeleton_result is not None:
                    for human in skeleton_result:
                        for mat in human:
                            skeleton_list[-1].append(int(mat[0]))
                            skeleton_list[-1].append(int(mat[1]))
                            skeleton_list[-1].append(0.8)

        self.time_run += (time.time() - time_run_start)
        return skeleton_list

    def runtime(self):
        """Return the accumulated ``(pose-estimation time, total run time)``."""
        return self.time_det, self.time_run

    def generate_target_points(self, joints, image_size, sigma):
        '''Render one un-normalised gaussian heatmap per joint.

        :param joints: [num_joints, 3]; only the first two columns (x, y)
            are read
        :param image_size: (width, height) of every heatmap
        :param sigma: gaussian standard deviation in pixels
        :return: float32 array of shape (num_joints, height, width); joints
            whose gaussian lies fully outside the image keep an all-zero map
        '''
        target = np.zeros((self.num_joints, image_size[1], image_size[0]),
                          dtype=np.float32)

        tmp_size = sigma * 3

        for joint_id in range(self.num_joints):
            feat_stride = [1, 1]  # image_size / image_size
            mu_x = int(joints[joint_id][0] / feat_stride[0] + 0.5)
            mu_y = int(joints[joint_id][1] / feat_stride[1] + 0.5)
            # Check that any part of the gaussian is in-bounds
            ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)]
            br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)]
            if ul[0] >= image_size[0] or ul[1] >= image_size[1] \
                    or br[0] < 0 or br[1] < 0:
                # If not, leave this joint's heatmap empty
                continue

            # Generate gaussian
            size = 2 * tmp_size + 1
            x = np.arange(0, size, 1, np.float32)
            y = x[:, np.newaxis]
            x0 = y0 = size // 2
            # The gaussian is not normalized, we want the center value to equal 1
            g = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * sigma**2))

            # Usable gaussian range
            g_x = max(0, -ul[0]), min(br[0], image_size[0]) - ul[0]
            g_y = max(0, -ul[1]), min(br[1], image_size[1]) - ul[1]
            # Image range
            img_x = max(0, ul[0]), min(br[0], image_size[0])
            img_y = max(0, ul[1]), min(br[1], image_size[1])

            v = 1  # every joint is treated as visible
            if v > 0.5:
                target[joint_id][img_y[0]:img_y[1], img_x[0]:img_x[1]] = \
                    g[g_y[0]:g_y[1], g_x[0]:g_x[1]]

        return target

    def generate_target_lines(self, joints, image_size, target_kps):
        """Draw selected limbs of one skeleton on a blank uint8 image.

        Args:
            joints: per-joint sequences whose first two entries are x and y.
            image_size: shape of the image to create.
            target_kps: container tested against the *index* of each limb in
                the limb table; limbs whose index is not in it are skipped.

        Returns:
            The image with 5-pixel-wide limb lines drawn on it.
        """
        # Joint order: Nose, LEye, REye, LEar, REar,
        # LShoulder, RShoulder, LElbow, RElbow, LWrist, RWrist,
        # LHip, RHip, LKnee, Rknee, LAnkle, RAnkle, Neck
        l_pair = [
            (0, 1), (0, 2), (1, 3), (2, 4),  # Head
            (5, 6), (5, 7), (7, 9), (6, 8), (8, 10),  # Hand
            (17, 11), (17, 12),  # Body
            (11, 13), (12, 14), (13, 15), (14, 16)  # Leg
        ]

        line_color = [(0, 215, 255), (0, 255, 204), (0, 134, 255),
                      (0, 255, 50), (77, 255, 222), (77, 196, 255),
                      (77, 135, 255), (191, 255, 77), (77, 255, 77),
                      (77, 222, 255), (255, 156, 127), (0, 127, 255),
                      (255, 127, 77), (0, 77, 255), (255, 77, 36)]

        img = np.zeros(shape=image_size, dtype='uint8')
        part_line = {}
        for n in range(self.num_joints):
            cor_x, cor_y = int(joints[n][0]), int(joints[n][1])
            part_line[n] = (cor_x, cor_y)

        # Draw limbs; a limb needs both of its end points in part_line.
        for i, (start_p, end_p) in enumerate(l_pair):
            if i not in target_kps:
                continue
            if start_p in part_line and end_p in part_line:
                start_xy = part_line[start_p]
                end_xy = part_line[end_p]
                cv2.line(img, start_xy, end_xy, line_color[i], 5)

        return img
class DetectionLoader:
    """Background worker that runs YOLO on loader items and queues the detections."""

    def __init__(self, dataloder, batchSize=1, queueSize=1):
        """Build the detector and the output queue.

        The queue is a thread queue when ``opt.sp`` is set and a
        multiprocessing queue otherwise.
        """
        detector = Darknet(
            "joints_detectors/Alphapose/yolo/cfg/yolov3-spp.cfg")
        detector.load_weights(
            'joints_detectors/Alphapose/models/yolo/yolov3-spp.weights')
        detector.net_info['height'] = opt.inp_dim
        self.det_model = detector
        self.det_inp_dim = int(detector.net_info['height'])
        # The input size must be a multiple of 32 and larger than 32.
        assert self.det_inp_dim % 32 == 0
        assert self.det_inp_dim > 32
        detector.cuda()
        detector.eval()

        # Flag meant to tell the worker to stop.
        self.stopped = False
        self.dataloder = dataloder
        self.batchSize = batchSize
        self.datalen = self.dataloder.length()
        # Number of batches, rounding a trailing partial batch up.
        partial_batch = 1 if self.datalen % batchSize else 0
        self.num_batches = self.datalen // batchSize + partial_batch

        # Queue holding the detection results.
        queue_factory = Queue if opt.sp else mp.Queue
        self.Q = queue_factory(maxsize=queueSize)

    def start(self):
        """Start ``update`` in a daemon thread (``opt.sp``) or daemon process."""
        if opt.sp:
            worker = Thread(target=self.update,
                            name="DetectionLoader",
                            args=(),
                            daemon=True)
        else:
            worker = mp.Process(target=self.update, args=(), daemon=True)
        worker.start()
        return self

    def _to_image_coords(self, dets, im_dim_list):
        """Undo the centring offset and scale of the network input, in place.

        Boxes end up clamped to the size of the image they belong to.
        """
        im_dim_list = torch.index_select(im_dim_list, 0, dets[:, 0].long())
        scaling_factor = torch.min(self.det_inp_dim / im_dim_list,
                                   1)[0].view(-1, 1)
        pad_x = (self.det_inp_dim -
                 scaling_factor * im_dim_list[:, 0].view(-1, 1)) / 2
        pad_y = (self.det_inp_dim -
                 scaling_factor * im_dim_list[:, 1].view(-1, 1)) / 2
        dets[:, [1, 3]] -= pad_x
        dets[:, [2, 4]] -= pad_y
        dets[:, 1:5] /= scaling_factor
        for row in range(dets.shape[0]):
            dets[row, [1, 3]] = torch.clamp(dets[row, [1, 3]], 0.0,
                                            im_dim_list[row, 0])
            dets[row, [2, 4]] = torch.clamp(dets[row, [2, 4]], 0.0,
                                            im_dim_list[row, 1])
        return dets

    def update(self):
        """Consume loader items until it yields ``None`` and queue 7-tuples.

        Each queued tuple is ``(orig_img, im_name, boxes, scores, inps, pt1,
        pt2)``; the last five entries are ``None`` when nothing was detected
        and all seven are ``None`` as the end-of-data marker.
        """
        while True:
            sys.stdout.flush()
            print("detection loader len : " + str(self.Q.qsize()))

            img, orig_img, im_name, im_dim_list = self.dataloder.getitem()
            if img is None:
                # End-of-data marker for the consumer.
                self.Q.put((None, ) * 7)
                return

            with torch.no_grad():
                # Human Detection
                prediction = self.det_model(img.cuda(), CUDA=True)
                # NMS process
                dets = dynamic_write_results(prediction,
                                             opt.confidence,
                                             opt.num_classes,
                                             nms=True,
                                             nms_conf=opt.nms_thesh)
                if isinstance(dets, int) or dets.shape[0] == 0:
                    self.Q.put((orig_img[0], im_name[0]) + (None, ) * 5)
                    continue

                dets = self._to_image_coords(dets.cpu(), im_dim_list)
                boxes = dets[:, 1:5]
                scores = dets[:, 5:6]

            # Only the first image of the batch is forwarded.
            first_image = dets[:, 0] == 0
            boxes_k = boxes[first_image]
            if boxes_k.shape[0] == 0:
                self.Q.put((orig_img[0], im_name[0]) + (None, ) * 5)
                continue

            num_boxes = boxes_k.size(0)
            inps = torch.zeros(num_boxes, 3, opt.inputResH, opt.inputResW)
            pt1 = torch.zeros(num_boxes, 2)
            pt2 = torch.zeros(num_boxes, 2)
            self.Q.put((orig_img[0], im_name[0], boxes_k,
                        scores[dets[:, 0] == 0], inps, pt1, pt2))

    def read(self):
        """Return the next queued result (blocks until one is available)."""
        return self.Q.get()

    def len(self):
        """Return the current size of the result queue."""
        return self.Q.qsize()
class VideoDetectionLoader:
    """Read a video in batches, run YOLO on each batch and queue per-frame results."""

    def __init__(self, path, batchSize=4, queueSize=256):
        """Build the detector, open the video at ``path`` and create the queue."""
        self.det_model = Darknet("yolo/cfg/yolov3-spp.cfg")
        self.det_model.load_weights('models/yolo/yolov3-spp.weights')
        self.det_model.net_info['height'] = opt.inp_dim
        self.det_inp_dim = int(self.det_model.net_info['height'])
        # The input size must be a multiple of 32 and larger than 32.
        assert self.det_inp_dim % 32 == 0
        assert self.det_inp_dim > 32
        self.det_model.cuda()
        self.det_model.eval()

        self.stream = cv2.VideoCapture(path)
        assert self.stream.isOpened(), 'Cannot capture source'
        # Set by stop(); checked by update() before every batch.
        self.stopped = False

        self.batchSize = batchSize
        self.datalen = int(self.stream.get(cv2.CAP_PROP_FRAME_COUNT))
        leftover = 0
        if (self.datalen) % batchSize:
            leftover = 1
        self.num_batches = self.datalen // batchSize + leftover

        # Queue used to store the per-frame detection results.
        self.Q = Queue(maxsize=queueSize)

    def length(self):
        """Return the frame count reported by the video container."""
        return self.datalen

    def len(self):
        """Return the current size of the result queue."""
        return self.Q.qsize()

    def start(self):
        """Start ``update`` in a daemon thread and return ``self``."""
        t = Thread(target=self.update, args=())
        t.daemon = True
        t.start()
        return self

    def _put(self, item):
        """Wait until the queue has room, then enqueue ``item``."""
        while self.Q.full():
            time.sleep(0.2)
        self.Q.put(item)

    def _detect_and_enqueue(self, img, inp, orig_img, im_dim_list):
        """Run the detector on one batch and queue ``(inp, orig_img, boxes, scores)`` per frame.

        ``boxes`` and ``scores`` are ``None`` for every frame of the batch
        when the detector returns nothing for the whole batch.
        """
        with torch.no_grad():
            # Human Detection
            img = Variable(torch.cat(img)).cuda()
            im_dim_list = torch.FloatTensor(im_dim_list).repeat(1, 2)
            im_dim_list = im_dim_list.cuda()

            prediction = self.det_model(img, CUDA=True)
            # NMS process
            dets = dynamic_write_results(prediction,
                                         opt.confidence,
                                         opt.num_classes,
                                         nms=True,
                                         nms_conf=opt.nms_thesh)
            if isinstance(dets, int) or dets.shape[0] == 0:
                for k in range(len(inp)):
                    self._put((inp[k], orig_img[k], None, None))
                return

            im_dim_list = torch.index_select(im_dim_list, 0,
                                             dets[:, 0].long())
            scaling_factor = torch.min(self.det_inp_dim / im_dim_list,
                                       1)[0].view(-1, 1)

            # coordinate transfer
            dets[:, [1, 3]] -= (self.det_inp_dim - scaling_factor *
                                im_dim_list[:, 0].view(-1, 1)) / 2
            dets[:, [2, 4]] -= (self.det_inp_dim - scaling_factor *
                                im_dim_list[:, 1].view(-1, 1)) / 2

            dets[:, 1:5] /= scaling_factor
            for j in range(dets.shape[0]):
                dets[j, [1, 3]] = torch.clamp(dets[j, [1, 3]], 0.0,
                                              im_dim_list[j, 0])
                dets[j, [2, 4]] = torch.clamp(dets[j, [2, 4]], 0.0,
                                              im_dim_list[j, 1])

            # Move everything to the CPU once so the per-frame masks live on
            # the same device as the tensors they index.
            dets = dets.cpu()
            boxes = dets[:, 1:5]
            scores = dets[:, 5:6]

        for k in range(len(inp)):
            in_frame = dets[:, 0] == k
            self._put((inp[k], orig_img[k], boxes[in_frame],
                       scores[in_frame]))

    def update(self):
        """Worker loop: read frames batch by batch until the video ends or ``stop()`` is called.

        Frames already read when a grab fails are still processed and
        queued before the loop ends.
        """
        for i in range(self.num_batches):
            if self.stopped:
                return

            inp_dim = int(opt.inp_dim)
            img = []
            inp = []
            orig_img = []
            im_dim_list = []
            first = i * self.batchSize
            last = min((i + 1) * self.batchSize, self.datalen)
            for _ in range(first, last):
                (grabbed, frame) = self.stream.read()
                # if the `grabbed` boolean is `False`, then we have
                # reached the end of the video file
                if not grabbed:
                    self.stop()
                    break
                # process and collect the frame
                img_k, orig_img_k, im_dim_list_k = prep_frame(frame, inp_dim)
                img.append(img_k)
                inp.append(im_to_torch(orig_img_k))
                orig_img.append(orig_img_k)
                im_dim_list.append(im_dim_list_k)

            if img:
                self._detect_and_enqueue(img, inp, orig_img, im_dim_list)

    def videoinfo(self):
        """Return ``(fourcc, fps, (width, height))`` of the opened video."""
        fourcc = int(self.stream.get(cv2.CAP_PROP_FOURCC))
        fps = self.stream.get(cv2.CAP_PROP_FPS)
        frameSize = (int(self.stream.get(cv2.CAP_PROP_FRAME_WIDTH)),
                     int(self.stream.get(cv2.CAP_PROP_FRAME_HEIGHT)))
        return (fourcc, fps, frameSize)

    def read(self):
        """Return the next queued result (blocks until one is available)."""
        return self.Q.get()

    def more(self):
        """Return True if there are still results in the queue."""
        return self.Q.qsize() > 0

    def stop(self):
        """Ask the worker loop to stop before its next batch."""
        self.stopped = True
class WebcamDetectionLoader:
    """Grab webcam frames, run YOLO on them and keep the newest results in a LIFO queue."""

    def __init__(self, webcam=0, batchSize=1, queueSize=256):
        """Build the detector, open webcam ``int(webcam)`` and create the queue."""
        self.det_model = Darknet("yolo/cfg/yolov3-spp.cfg")
        self.det_model.load_weights('models/yolo/yolov3-spp.weights')
        self.det_model.net_info['height'] = opt.inp_dim
        self.det_inp_dim = int(self.det_model.net_info['height'])
        # The input size must be a multiple of 32 and larger than 32.
        assert self.det_inp_dim % 32 == 0
        assert self.det_inp_dim > 32
        self.det_model.cuda()
        self.det_model.eval()

        self.stream = cv2.VideoCapture(int(webcam))
        assert self.stream.isOpened(), 'Cannot open webcam'
        # Set by stop(); checked by update() before every batch.
        self.stopped = False
        self.batchSize = batchSize

        # LIFO queue, so read() returns the most recent result first.
        self.Q = LifoQueue(maxsize=queueSize)

    def len(self):
        """Return the current size of the result queue."""
        return self.Q.qsize()

    def start(self):
        """Start ``update`` in a daemon thread and return ``self``."""
        t = threading.Thread(target=self.update, args=())
        t.daemon = True
        t.start()
        return self

    def _put(self, item):
        """Enqueue ``item``, discarding all queued results first when the queue is full."""
        if self.Q.full():
            with self.Q.mutex:
                self.Q.queue.clear()
        self.Q.put(item)

    def update(self):
        """Worker loop: grab, detect and queue until ``stop()`` is called.

        Failed grabs are skipped; a batch in which every grab failed is
        dropped without calling the detector.
        """
        print(
            f'WebcamDetectionLoader_update_thread: {threading.current_thread().name}'
        )
        while not self.stopped:
            inp_dim = int(opt.inp_dim)
            img = []
            inp = []
            orig_img = []
            im_dim_list = []
            for _ in range(self.batchSize):
                (grabbed, frame) = self.stream.read()
                # A failed grab yields no frame, so test it before touching
                # the frame at all.
                if not grabbed:
                    continue
                # process and collect the frame
                img_k, orig_img_k, im_dim_list_k = prep_frame(frame, inp_dim)
                img.append(img_k)
                inp.append(im_to_torch(orig_img_k))
                orig_img.append(orig_img_k)
                im_dim_list.append(im_dim_list_k)

            if not img:
                # Nothing was grabbed for this batch.
                continue

            with torch.no_grad():
                # Human Detection
                img = Variable(torch.cat(img)).cuda()
                im_dim_list = torch.FloatTensor(im_dim_list).repeat(1, 2)
                im_dim_list = im_dim_list.cuda()

                prediction = self.det_model(img, CUDA=True)
                # NMS process
                dets = dynamic_write_results(prediction,
                                             opt.confidence,
                                             opt.num_classes,
                                             nms=True,
                                             nms_conf=opt.nms_thesh)
                if isinstance(dets, int) or dets.shape[0] == 0:
                    for k in range(len(inp)):
                        self._put((inp[k], orig_img[k], None, None))
                    continue

                im_dim_list = torch.index_select(im_dim_list, 0,
                                                 dets[:, 0].long())
                scaling_factor = torch.min(self.det_inp_dim / im_dim_list,
                                           1)[0].view(-1, 1)

                # coordinate transfer
                dets[:, [1, 3]] -= (self.det_inp_dim - scaling_factor *
                                    im_dim_list[:, 0].view(-1, 1)) / 2
                dets[:, [2, 4]] -= (self.det_inp_dim - scaling_factor *
                                    im_dim_list[:, 1].view(-1, 1)) / 2

                dets[:, 1:5] /= scaling_factor
                for j in range(dets.shape[0]):
                    dets[j, [1, 3]] = torch.clamp(dets[j, [1, 3]], 0.0,
                                                  im_dim_list[j, 0])
                    dets[j, [2, 4]] = torch.clamp(dets[j, [2, 4]], 0.0,
                                                  im_dim_list[j, 1])

                # Move everything to the CPU once so the per-frame masks
                # live on the same device as the tensors they index.
                dets = dets.cpu()
                boxes = dets[:, 1:5]
                scores = dets[:, 5:6]

            for k in range(len(inp)):
                in_frame = dets[:, 0] == k
                self._put((inp[k], orig_img[k], boxes[in_frame],
                           scores[in_frame]))

    def videoinfo(self):
        """Return ``(fourcc, fps, (width, height))`` of the opened stream."""
        fourcc = int(self.stream.get(cv2.CAP_PROP_FOURCC))
        fps = self.stream.get(cv2.CAP_PROP_FPS)
        frameSize = (int(self.stream.get(cv2.CAP_PROP_FRAME_WIDTH)),
                     int(self.stream.get(cv2.CAP_PROP_FRAME_HEIGHT)))
        return (fourcc, fps, frameSize)

    def read(self):
        """Return the most recently queued result (blocks until one is available)."""
        return self.Q.get()

    def more(self):
        """Return True if there are still results in the queue."""
        return self.Q.qsize() > 0

    def stop(self):
        """Ask the worker loop to stop before its next batch."""
        self.stopped = True
class DetectionLoader:
    """Background worker that runs YOLO detection and pose estimation per frame."""

    def __init__(self, dataloder, batchSize=1, queueSize=1024):
        """Build the detector, the pose network and the LIFO result queue."""
        self.det_model = Darknet("yolo/cfg/yolov3-spp.cfg")
        self.det_model.load_weights('models/yolo/yolov3-spp.weights')
        self.det_model.net_info['height'] = opt.inp_dim
        self.det_inp_dim = int(self.det_model.net_info['height'])
        # The input size must be a multiple of 32 and larger than 32.
        assert self.det_inp_dim % 32 == 0
        assert self.det_inp_dim > 32
        self.det_model.cuda()
        self.det_model.eval()

        self.stopped = False
        self.dataloder = dataloder
        self.batchSize = batchSize
        # LIFO queue, so read() returns the most recent result first.
        self.Q = LifoQueue(maxsize=queueSize)

        pose_dataset = Mscoco()
        if opt.fast_inference:
            self.pose_model = InferenNet_fast(4 * 1 + 1, pose_dataset)
        else:
            self.pose_model = InferenNet(4 * 1 + 1, pose_dataset)
        self.pose_model.cuda()
        self.pose_model.eval()

    def start(self):
        """Start ``update`` in a daemon thread and return ``self``."""
        t = Thread(target=self.update, args=())
        t.daemon = True
        t.start()
        return self

    def update(self):
        """Worker loop: detect people, estimate poses and queue one 7-tuple per frame.

        Queued tuples are ``(orig_img, frame_id, bbox, b_score, kp,
        kp_score, roi)``; the last five entries are ``None`` when the
        detector finds nothing. ``frame_id`` is the integer before the first
        ``'.'`` of the image name.
        """
        while True:
            (img, orig_img, im_name, im_dim_list) = self.dataloder.getitem()
            # Drop whatever the upstream loader has queued in the meantime.
            with self.dataloder.Q.mutex:
                self.dataloder.Q.queue.clear()

            with torch.no_grad():
                # Human Detection
                img = img.cuda()
                prediction = self.det_model(img, CUDA=True)
                frame_id = int(im_name.split('.')[0])

                # NMS process
                dets = dynamic_write_results(prediction,
                                             opt.confidence,
                                             opt.num_classes,
                                             nms=True,
                                             nms_conf=opt.nms_thesh)
                if isinstance(dets, int) or dets.shape[0] == 0:
                    if self.Q.full():
                        time.sleep(2)
                    self.Q.put(
                        (orig_img, frame_id, None, None, None, None, None))
                    continue

                dets = dets.cpu()
                im_dim_list = torch.index_select(im_dim_list, 0,
                                                 dets[:, 0].long())
                scaling_factor = torch.min(self.det_inp_dim / im_dim_list,
                                           1)[0].view(-1, 1)

                # coordinate transfer
                dets[:, [1, 3]] -= (self.det_inp_dim - scaling_factor *
                                    im_dim_list[:, 0].view(-1, 1)) / 2
                dets[:, [2, 4]] -= (self.det_inp_dim - scaling_factor *
                                    im_dim_list[:, 1].view(-1, 1)) / 2

                dets[:, 1:5] /= scaling_factor
                for j in range(dets.shape[0]):
                    dets[j, [1, 3]] = torch.clamp(dets[j, [1, 3]], 0.0,
                                                  im_dim_list[j, 0])
                    dets[j, [2, 4]] = torch.clamp(dets[j, [2, 4]], 0.0,
                                                  im_dim_list[j, 1])
                boxes = dets[:, 1:5]
                scores = dets[:, 5:6]

                # Pose Estimation
                inp = im_to_torch(orig_img)
                inps = torch.zeros(boxes.size(0), 3, opt.inputResH,
                                   opt.inputResW)
                pt1 = torch.zeros(boxes.size(0), 2)
                pt2 = torch.zeros(boxes.size(0), 2)
                inps, pt1, pt2 = crop_from_dets(inp, boxes, inps, pt1, pt2)
                inps = Variable(inps.cuda())

                hm = self.pose_model(inps)

                # `boxes` is a tensor slice here and has already been used,
                # so no `boxes is None` fallback is needed: the empty case is
                # handled right after NMS above.
                preds_hm, preds_img, preds_scores = getPrediction(
                    hm.cpu(), pt1, pt2, opt.inputResH, opt.inputResW,
                    opt.outputResH, opt.outputResW)

                bbox, b_score, kp, kp_score, roi = pose_nms(
                    orig_img, boxes, scores, preds_img, preds_scores)

                if self.Q.full():
                    time.sleep(2)
                self.Q.put(
                    (orig_img, frame_id, bbox, b_score, kp, kp_score, roi))

    def read(self):
        """Return the most recently queued result (blocks until one is available)."""
        return self.Q.get()

    def len(self):
        """Return the current size of the result queue."""
        return self.Q.qsize()
class DetectionLoader:
    """YOLO detection stage that moves items from a load queue to a detection queue."""

    def __init__(self, dataloder, batchSize=1):
        """Build the detector and remember the loader and batch size."""
        detector = Darknet("yolo/cfg/yolov3-spp.cfg")
        detector.load_weights('models/yolo/yolov3-spp.weights')
        detector.net_info['height'] = opt.inp_dim
        self.det_model = detector
        self.det_inp_dim = int(detector.net_info['height'])
        # The input size must be a multiple of 32 and larger than 32.
        assert self.det_inp_dim % 32 == 0
        assert self.det_inp_dim > 32
        detector.cuda()
        detector.eval()

        self.dataloader = dataloder
        self.stopped = False
        self.batchSize = batchSize

    def _to_image_coords(self, dets, im_dim_list):
        """Undo the centring offset and scale of the network input, in place.

        Boxes end up clamped to the size of the image they belong to.
        """
        im_dim_list = torch.index_select(im_dim_list, 0, dets[:, 0].long())
        scaling_factor = torch.min(self.det_inp_dim / im_dim_list,
                                   1)[0].view(-1, 1)
        pad_x = (self.det_inp_dim -
                 scaling_factor * im_dim_list[:, 0].view(-1, 1)) / 2
        pad_y = (self.det_inp_dim -
                 scaling_factor * im_dim_list[:, 1].view(-1, 1)) / 2
        dets[:, [1, 3]] -= pad_x
        dets[:, [2, 4]] -= pad_y
        dets[:, 1:5] /= scaling_factor
        for row in range(dets.shape[0]):
            dets[row, [1, 3]] = torch.clamp(dets[row, [1, 3]], 0.0,
                                            im_dim_list[row, 0])
            dets[row, [2, 4]] = torch.clamp(dets[row, [2, 4]], 0.0,
                                            im_dim_list[row, 1])
        return dets

    def forward(self, Q_load, Q_det):
        """Loop forever: take a batch from ``Q_load``, detect, push per-image results to ``Q_det``.

        Each pushed tuple is ``(inps, orig_img, boxes, scores, pt1, pt2)``;
        every entry except ``orig_img`` is ``None`` when the detector
        returns nothing for the whole batch.
        """
        while True:
            img, orig_img, im_dim_list = Q_load.get()

            with torch.no_grad():
                # Human Detection
                prediction = self.det_model(img.cuda(), CUDA=True)
                # NMS process
                dets = dynamic_write_results(prediction,
                                             opt.confidence,
                                             opt.num_classes,
                                             nms=True,
                                             nms_conf=opt.nms_thesh)

                if isinstance(dets, int) or dets.shape[0] == 0:
                    for image in orig_img:
                        if Q_det.full():
                            time.sleep(0.1)
                        Q_det.put((None, image, None, None, None, None))
                    continue

                dets = self._to_image_coords(dets.cpu(), im_dim_list)
                boxes = dets[:, 1:5]
                scores = dets[:, 5:6]

            for k, image in enumerate(orig_img):
                in_image = dets[:, 0] == k
                boxes_k = boxes[in_image]
                num_boxes = boxes_k.size(0)
                inps = torch.zeros(num_boxes, 3, opt.inputResH,
                                   opt.inputResW)
                pt1 = torch.zeros(num_boxes, 2)
                pt2 = torch.zeros(num_boxes, 2)
                inp = im_to_torch(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
                inps, pt1, pt2 = crop_from_dets(inp, boxes_k, inps, pt1, pt2)

                if Q_det.full():
                    time.sleep(0.1)
                Q_det.put((inps, image, boxes_k, scores[in_image], pt1, pt2))
class DetectionLoader:
    """Background worker that runs YOLO on loader items and keeps results in a LIFO queue."""

    def __init__(self, dataloder, batchSize=1, queueSize=1024):
        """Build the detector and the LIFO result queue."""
        detector = Darknet("yolo/cfg/yolov3-spp.cfg")
        detector.load_weights('models/yolo/yolov3-spp.weights')
        detector.net_info['height'] = opt.inp_dim
        self.det_model = detector
        self.det_inp_dim = int(detector.net_info['height'])
        # The input size must be a multiple of 32 and larger than 32.
        assert self.det_inp_dim % 32 == 0
        assert self.det_inp_dim > 32
        detector.cuda()
        detector.eval()

        self.stopped = False
        self.dataloder = dataloder
        self.batchSize = batchSize
        # LIFO queue, so read() returns the most recent result first.
        self.Q = LifoQueue(maxsize=queueSize)

    def start(self):
        """Start ``update`` in a daemon thread and return ``self``."""
        worker = Thread(target=self.update, args=(), daemon=True)
        worker.start()
        return self

    def _to_image_coords(self, dets, im_dim_list):
        """Undo the centring offset and scale of the network input, in place.

        Boxes end up clamped to the size of the image they belong to.
        """
        im_dim_list = torch.index_select(im_dim_list, 0, dets[:, 0].long())
        scaling_factor = torch.min(self.det_inp_dim / im_dim_list,
                                   1)[0].view(-1, 1)
        pad_x = (self.det_inp_dim -
                 scaling_factor * im_dim_list[:, 0].view(-1, 1)) / 2
        pad_y = (self.det_inp_dim -
                 scaling_factor * im_dim_list[:, 1].view(-1, 1)) / 2
        dets[:, [1, 3]] -= pad_x
        dets[:, [2, 4]] -= pad_y
        dets[:, 1:5] /= scaling_factor
        for row in range(dets.shape[0]):
            dets[row, [1, 3]] = torch.clamp(dets[row, [1, 3]], 0.0,
                                            im_dim_list[row, 0])
            dets[row, [2, 4]] = torch.clamp(dets[row, [2, 4]], 0.0,
                                            im_dim_list[row, 1])
        return dets

    def _put_when_possible(self, item):
        """Sleep two seconds once if the queue is full, then enqueue ``item``."""
        if self.Q.full():
            time.sleep(2)
        self.Q.put(item)

    def update(self):
        """Loop forever over loader items and queue one 7-tuple per image.

        Each tuple is ``(orig_img, im_name, boxes, scores, inps, pt1, pt2)``;
        the last five entries are ``None`` for images without detections.
        """
        while True:
            img, orig_img, im_name, im_dim_list = self.dataloder.getitem()
            # Drop whatever the upstream loader has queued in the meantime.
            with self.dataloder.Q.mutex:
                self.dataloder.Q.queue.clear()

            with torch.no_grad():
                # Human Detection
                prediction = self.det_model(img.cuda(), CUDA=True)
                # NMS process
                dets = dynamic_write_results(prediction,
                                             opt.confidence,
                                             opt.num_classes,
                                             nms=True,
                                             nms_conf=opt.nms_thesh)
                if isinstance(dets, int) or dets.shape[0] == 0:
                    for k in range(len(orig_img)):
                        self._put_when_possible(
                            (orig_img[k], im_name[k]) + (None, ) * 5)
                    continue

                dets = self._to_image_coords(dets.cpu(), im_dim_list)
                boxes = dets[:, 1:5]
                scores = dets[:, 5:6]

            for k in range(len(orig_img)):
                boxes_k = boxes[dets[:, 0] == k]
                if boxes_k.shape[0] == 0:
                    self._put_when_possible(
                        (orig_img[k], im_name[k]) + (None, ) * 5)
                    continue

                num_boxes = boxes_k.size(0)
                inps = torch.zeros(num_boxes, 3, opt.inputResH,
                                   opt.inputResW)
                pt1 = torch.zeros(num_boxes, 2)
                pt2 = torch.zeros(num_boxes, 2)
                if self.Q.full():
                    time.sleep(2)
                self.Q.put((orig_img[k], im_name[k], boxes_k,
                            scores[dets[:, 0] == k], inps, pt1, pt2))

    def read(self):
        """Return the most recently queued result (blocks until one is available)."""
        return self.Q.get()

    def len(self):
        """Return the current size of the result queue."""
        return self.Q.qsize()
class DetectionLoader:
    """Run a per-object YOLO detector over a data loader in the background.

    The weights file is selected by ``obj_id``
    (``models/yolo/<obj_id as two digits>.weights``).  One 7-tuple per image
    is queued: ``(orig_img, im_name, boxes, scores, inps, pt1, pt2)``; the
    last five fields are ``None`` when the image has no detections.  A tuple
    of seven ``None`` values is queued when the loader returns no image.
    """

    def __init__(self, dataloder, obj_id, batchSize=1, queueSize=1024):
        cfg_path = "yolo/cfg/yolov3-single.cfg"
        weights_path = 'models/yolo/{:02d}.weights'.format(obj_id)
        self.det_model = Darknet(cfg_path, reso=int(opt.inp_dim))
        self.det_model.load_weights(weights_path)
        print("Loading YOLO cfg from", cfg_path)
        print("Loading YOLO weights from", weights_path)
        self.det_model.net_info['height'] = opt.inp_dim  # input_dimension
        self.det_inp_dim = int(self.det_model.net_info['height'])
        assert self.det_inp_dim % 32 == 0
        assert self.det_inp_dim > 32
        self.det_model.cuda()
        self.det_model.eval()

        self.stopped = False
        self.dataloder = dataloder
        self.batchSize = batchSize
        self.datalen = self.dataloder.length()
        # Ceiling division: a partial trailing batch still counts as a batch.
        leftover = 1 if self.datalen % batchSize else 0
        self.num_batches = self.datalen // batchSize + leftover

        # Thread-backed queue in single-process mode, process-safe otherwise.
        if opt.sp:
            self.Q = Queue(maxsize=queueSize)
        else:
            self.Q = mp.Queue(maxsize=queueSize)

    def start(self):
        """Launch ``update`` in a daemon thread or process; return ``self``."""
        if opt.sp:
            worker = Thread(target=self.update, args=())
        else:
            worker = mp.Process(target=self.update, args=())
        worker.daemon = True
        worker.start()
        return self

    def _enqueue(self, item):
        """Queue ``item``, pausing two seconds first if the queue is full."""
        if self.Q.full():
            time.sleep(2)
        self.Q.put(item)

    def update(self):
        """Process ``num_batches`` batches and queue the per-image results."""
        for _ in range(self.num_batches):
            img, orig_img, im_name, im_dim_list = self.dataloder.getitem()
            if img is None:
                # The loader ran dry: tell the consumer and stop.
                self.Q.put((None, None, None, None, None, None, None))
                return

            with torch.no_grad():
                img = img.cuda()
                # Critical, use yolo to do object detection here!
                prediction = self.det_model(img)
                # NMS process
                dets = dynamic_write_results(prediction, opt.confidence,
                                             opt.num_classes, nms=True,
                                             nms_conf=opt.nms_thesh)
                if isinstance(dets, int) or dets.shape[0] == 0:
                    for k in range(len(orig_img)):
                        self._enqueue((orig_img[k], im_name[k],
                                       None, None, None, None, None))
                    continue
                dets = dets.cpu()

                # Scale for SIXD dataset: stretch network-space coordinates
                # by (image size / network resolution) on each axis.
                reso = self.det_inp_dim
                im_dim_list = torch.index_select(im_dim_list, 0,
                                                 dets[:, 0].long())
                w, h = im_dim_list[:, 0], im_dim_list[:, 1]
                w_ratio = w / reso
                h_ratio = h / reso
                # ``boxes`` is a view, so the scaling also updates ``dets``.
                boxes = dets[:, 1:5]
                boxes[:, 0] = boxes[:, 0] * w_ratio
                boxes[:, 1] = boxes[:, 1] * h_ratio
                boxes[:, 2] = boxes[:, 2] * w_ratio
                boxes[:, 3] = boxes[:, 3] * h_ratio
                scores = dets[:, 5:6]

            for k in range(len(orig_img)):
                in_image = dets[:, 0] == k
                boxes_k = boxes[in_image]
                if isinstance(boxes_k, int) or boxes_k.shape[0] == 0:
                    self._enqueue((orig_img[k], im_name[k],
                                   None, None, None, None, None))
                    continue
                # Zero-filled placeholders sized by the number of boxes.
                inps = torch.zeros(boxes_k.size(0), 3,
                                   opt.inputResH, opt.inputResW)
                pt1 = torch.zeros(boxes_k.size(0), 2)
                pt2 = torch.zeros(boxes_k.size(0), 2)
                self._enqueue((orig_img[k], im_name[k], boxes_k,
                               scores[in_image], inps, pt1, pt2))

    def read(self):
        """Return the next queued result (blocks while the queue is empty)."""
        return self.Q.get()

    def len(self):
        """Return the approximate number of queued results."""
        return self.Q.qsize()
# Video-demo script fragment (whitespace-collapsed and cut off at the final
# `if CUDA:`, so it is not runnable as shown).
# Steps visible in the fragment:
#   1. load a colour palette via load_colors('data/pallete');
#   2. build Darknet from args.cfgfile at height args.reso and restore its
#      parameters with load_state_dict(torch.load(args.weightsfile));
#   3. assert the input resolution is a multiple of 32 and greater than 32;
#   4. call model.cuda() when CUDA is set, and model.eval();
#   5. open args.video with cv2.VideoCapture and, for each frame read,
#      run prep_image and build im_dim as the frame dims repeated twice.
# NOTE(review): the original indentation is lost, so the nesting of
# model.eval() relative to `if CUDA:` cannot be confirmed from here.
colors = load_colors('data/pallete') num_classes = 80 bbox_attrs = 5 + num_classes model = Darknet(args.cfgfile, height=args.reso) model.load_state_dict(torch.load(args.weightsfile)) model.net_info["height"] = args.reso inp_dim = int(model.net_info["height"]) assert inp_dim % 32 == 0 assert inp_dim > 32 if CUDA: model.cuda() model.eval() cap = cv2.VideoCapture(args.video) assert cap.isOpened(), 'Cannot capture source' frames = 0 start = time.time() while cap.isOpened(): ret, frame = cap.read() if ret: img, orig_im, dim = prep_image(frame, inp_dim) im_dim = torch.FloatTensor(dim).repeat(1, 2) if CUDA:
# Mask-classification demo script fragment (whitespace-collapsed and cut off
# at the final `if ret == False:`, so it is not runnable as shown).
# Steps visible in the fragment:
#   1. open a video from a hard-coded absolute path;
#   2. parse arguments and read the confidence / NMS thresholds;
#   3. build Darknet from args.cfgfile, load args.weightsfile, assert the
#      input resolution is a multiple of 32 and greater than 32, then call
#      yolo.cuda() unconditionally;
#   4. create a pretrained AlexNet, move it to cuda:0 when available (else
#      the CPU), and unpickle an object from PATH into loaded_model;
#   5. open an MJPG VideoWriter at 20 fps with frame size (352, 640) and
#      start reading frames.
# NOTE(review): both input paths are machine-specific absolute paths.
# NOTE(review): pickle.load executes arbitrary code from the file, and the
# handle returned by open(PATH, 'rb') is never closed explicitly.
cap = cv2.VideoCapture( '/home/gaurav/Desktop/sem6/VR/before_midsem/mini_project/harsh_without_mask.mp4' ) args = arg_parse() confidence = float(args.confidence) nms_thesh = float(args.nms_thresh) num_classes = 80 bbox_attrs = 5 + num_classes yolo = Darknet(args.cfgfile) yolo.load_weights(args.weightsfile) yolo.net_info["height"] = args.reso inp_dim = int(yolo.net_info["height"]) assert inp_dim % 32 == 0 assert inp_dim > 32 yolo.cuda() PATH = '/home/gaurav/Desktop/sem6/VR/before_midsem/mini_project/model/classi4.pkl' net = models.alexnet(pretrained=True) my_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") net.to(my_device) loaded_model = pickle.load(open(PATH, 'rb')) tot_cnt = 0 cnt = 0 out = cv2.VideoWriter('./output_harsh_without_mask.avi', cv2.VideoWriter_fourcc(*'MJPG'), 20, (352, 640)) while (True): ret, frame = cap.read() if ret == False:
class DetectionLoader:
    """Detect objects with YOLO, or faces with MTCNN, in the background.

    The detector is chosen per batch from ``dataloder.format`` (``'yolo'``
    or ``'mtcnn'``).  One 7-tuple per image is queued:
    ``(orig_img, im_name, boxes, scores, inps, pt1, pt2)``; the last five
    fields are ``None`` when the image has no detections.  A tuple of seven
    ``None`` values is queued when the loader returns no image.
    """

    def __init__(self, dataloder, batchSize=1, queueSize=1024):
        self.det_model = Darknet("yolo/cfg/yolov3-spp.cfg")
        self.det_model.load_weights('models/yolo/yolov3-spp.weights')
        self.det_model.net_info['height'] = opt.inp_dim
        self.det_inp_dim = int(opt.inp_dim)
        assert self.det_inp_dim % 32 == 0
        assert self.det_inp_dim > 32
        self.det_model.cuda()
        self.det_model.eval()

        self.stopped = False
        self.dataloder = dataloder
        self.batchSize = batchSize
        self.datalen = self.dataloder.length()
        # Ceiling division: a partial trailing batch still counts as a batch.
        leftover = 1 if self.datalen % batchSize else 0
        self.num_batches = self.datalen // batchSize + leftover

        # Thread-backed queue in single-process mode, process-safe otherwise.
        if opt.sp:
            self.Q = Queue(maxsize=queueSize)
        else:
            self.Q = mp.Queue(maxsize=queueSize)

    def start(self):
        """Launch ``update`` in a daemon thread or process; return ``self``."""
        if opt.sp:
            worker = Thread(target=self.update, args=())
        else:
            worker = mp.Process(target=self.update, args=())
        worker.daemon = True
        worker.start()
        return self

    def _enqueue(self, item):
        """Queue ``item``, pausing two seconds first if the queue is full."""
        if self.Q.full():
            time.sleep(2)
        self.Q.put(item)

    def update(self):
        """Process ``num_batches`` batches and queue the per-image results.

        Raises:
            ValueError: if ``dataloder.format`` is neither ``'yolo'`` nor
                ``'mtcnn'``.
        """
        # Built on first use so YOLO-only runs never import mtcnn.
        face_detector = None

        for _ in range(self.num_batches):
            img, orig_img, im_name, im_dim_list = self.dataloder.getitem()
            if img is None:
                # The loader ran dry: tell the consumer and stop.
                self.Q.put((None, None, None, None, None, None, None))
                return

            with torch.no_grad():
                source_format = self.dataloder.format
                if source_format == 'yolo':
                    # Human Detection
                    img = img.cuda()
                    prediction = self.det_model(img, CUDA=True)
                    # NMS process
                    dets = dynamic_write_results(prediction, opt.confidence,
                                                 opt.num_classes, nms=True,
                                                 nms_conf=opt.nms_thesh)
                elif source_format == 'mtcnn':
                    # Face detection
                    if face_detector is None:
                        from mtcnn.mtcnn import MTCNN
                        face_detector = MTCNN()
                    imgs_np = img.float().mul(255.0).cpu().numpy()
                    # squeeze(axis=0) only succeeds for a batch of one image.
                    imgs_np = np.squeeze(imgs_np, axis=0)
                    imgs_np = np.transpose(imgs_np, (1, 2, 0))
                    faces = face_detector.detect_faces(imgs_np)
                    # Rows mirror the layout used for YOLO detections:
                    # [0, x1, y1, x2, y2, confidence, 0.99, 0].
                    # "box" is treated as (x, y, width, height).
                    fac_det = [
                        [
                            0,
                            face["box"][0],
                            face["box"][1],
                            face["box"][0] + face["box"][2],
                            face["box"][1] + face["box"][3],
                            face["confidence"],
                            0.99,
                            0,
                        ]
                        for face in faces
                    ]
                    dets = torch.tensor(fac_det)
                else:
                    raise ValueError(
                        "unsupported dataloader format: {!r}".format(
                            source_format))

                if isinstance(dets, int) or dets.shape[0] == 0:
                    for k in range(len(orig_img)):
                        self._enqueue((orig_img[k], im_name[k],
                                       None, None, None, None, None))
                    continue

                dets = dets.cpu()
                # One image-size row per detection, chosen by column 0.
                im_dim_list = torch.index_select(im_dim_list, 0,
                                                 dets[:, 0].long())
                scaling_factor = torch.min(self.det_inp_dim / im_dim_list,
                                           1)[0].view(-1, 1)

                # coordinate transfer: subtract the centring offset, then
                # divide by the resize factor.
                dets[:, [1, 3]] -= (self.det_inp_dim - scaling_factor
                                    * im_dim_list[:, 0].view(-1, 1)) / 2
                dets[:, [2, 4]] -= (self.det_inp_dim - scaling_factor
                                    * im_dim_list[:, 1].view(-1, 1)) / 2
                dets[:, 1:5] /= scaling_factor

                # Clamp every box to its own image bounds.
                for j in range(dets.shape[0]):
                    dets[j, [1, 3]] = torch.clamp(dets[j, [1, 3]], 0.0,
                                                  im_dim_list[j, 0])
                    dets[j, [2, 4]] = torch.clamp(dets[j, [2, 4]], 0.0,
                                                  im_dim_list[j, 1])
                boxes = dets[:, 1:5]
                scores = dets[:, 5:6]

            for k in range(len(orig_img)):
                in_image = dets[:, 0] == k
                boxes_k = boxes[in_image]
                if isinstance(boxes_k, int) or boxes_k.shape[0] == 0:
                    self._enqueue((orig_img[k], im_name[k],
                                   None, None, None, None, None))
                    continue
                # Zero-filled placeholders sized by the number of boxes.
                inps = torch.zeros(boxes_k.size(0), 3,
                                   opt.inputResH, opt.inputResW)
                pt1 = torch.zeros(boxes_k.size(0), 2)
                pt2 = torch.zeros(boxes_k.size(0), 2)
                self._enqueue((orig_img[k], im_name[k], boxes_k,
                               scores[in_image], inps, pt1, pt2))

    def read(self):
        """Return the next queued result (blocks while the queue is empty)."""
        return self.Q.get()

    def len(self):
        """Return the approximate number of queued results."""
        return self.Q.qsize()
class DetectionLoader:
    """Synchronous YOLO detector that keeps only the best-scoring box.

    ``detect_image`` returns a 7-tuple
    ``(ori_im, im_name, boxes, scores, inps, pt1, pt2)``; the last five
    fields are ``None`` when nothing was detected.
    """

    def __init__(self, dataloder):
        self.det_model = Darknet("yolo/cfg/yolov3-spp.cfg")
        self.det_model.load_weights('models/yolo/yolov3-spp.weights')
        self.det_model.net_info['height'] = opt.inp_dim
        self.det_inp_dim = int(self.det_model.net_info['height'])
        assert self.det_inp_dim % 32 == 0
        assert self.det_inp_dim > 32
        self.det_model.cuda()
        self.det_model.eval()

        self.stopped = False
        self.dataloder = dataloder

    @staticmethod
    def _keep_best(dets):
        """Reduce ``dets`` to its highest-scoring row (score is column 5).

        Returns ``(dets, boxes, scores)`` where ``boxes`` is columns 1-4 and
        ``scores`` is column 5 of the returned ``dets``.  Input with a single
        row is returned unchanged.
        """
        if dets.shape[0] > 1:
            # Pick the index once so dets, boxes and scores stay aligned.
            best = dets[:, 5:6].argmax()
            dets = dets[best].unsqueeze(0)
        return dets, dets[:, 1:5], dets[:, 5:6]

    def detect_image(self, im_path):
        """Detect on the image at ``im_path`` and return the top detection."""
        im, ori_im, im_name, im_dim_list = self.dataloder.getitem_yolo(im_path)
        with torch.no_grad():
            im = im.cuda()
            prediction = self.det_model(im, CUDA=True)
            # NMS process
            dets = dynamic_write_results(prediction, opt.confidence,
                                         opt.num_classes, nms=True,
                                         nms_conf=opt.nms_thesh)
            if isinstance(dets, int) or dets.shape[0] == 0:
                return (ori_im[0], im_name[0], None, None, None, None, None)

            dets = dets.cpu()
            # One image-size row per detection, chosen by column 0.
            im_dim_list = torch.index_select(im_dim_list, 0,
                                             dets[:, 0].long())
            scaling_factor = torch.min(self.det_inp_dim / im_dim_list,
                                       1)[0].view(-1, 1)

            # coordinate transfer: subtract the centring offset, then divide
            # by the resize factor.
            dets[:, [1, 3]] -= (self.det_inp_dim - scaling_factor
                                * im_dim_list[:, 0].view(-1, 1)) / 2
            dets[:, [2, 4]] -= (self.det_inp_dim - scaling_factor
                                * im_dim_list[:, 1].view(-1, 1)) / 2
            dets[:, 1:5] /= scaling_factor

            # Clamp every box to its own image bounds.
            for j in range(dets.shape[0]):
                dets[j, [1, 3]] = torch.clamp(dets[j, [1, 3]], 0.0,
                                              im_dim_list[j, 0])
                dets[j, [2, 4]] = torch.clamp(dets[j, [2, 4]], 0.0,
                                              im_dim_list[j, 1])

            dets, boxes, scores = self._keep_best(dets)

        # len(ori_im) === 1, so this returns on the first iteration.
        for k in range(len(ori_im)):
            in_image = dets[:, 0] == k
            boxes_k = boxes[in_image]
            if isinstance(boxes_k, int) or boxes_k.shape[0] == 0:
                return (ori_im[k], im_name[k], None, None, None, None, None)
            # Zero-filled placeholders sized by the number of boxes.
            inps = torch.zeros(boxes_k.size(0), 3,
                               opt.inputResH, opt.inputResW)
            pt1 = torch.zeros(boxes_k.size(0), 2)
            pt2 = torch.zeros(boxes_k.size(0), 2)
            return (ori_im[k], im_name[k], boxes_k, scores[in_image],
                    inps, pt1, pt2)
class DetectionLoader:
    """Detect people and cars with YOLO on batches from ``dataloder``.

    One 8-tuple per image is queued:
    ``(orig_img, im_name, hm_boxes, hm_scores, inps, pt1, pt2, car_k)``.
    The person fields are ``None`` when the batch has no person detections;
    ``car_k`` is ``None`` when the batch has no car detections.
    """

    def __init__(self, dataloder, batchSize=1, queueSize=1024):
        detector = Darknet("yolo/cfg/yolov3-spp.cfg")
        detector.load_weights('models/yolo/yolov3-spp.weights')
        detector.net_info['height'] = opt.inp_dim
        self.det_model = detector
        self.det_inp_dim = int(detector.net_info['height'])
        assert self.det_inp_dim % 32 == 0
        assert self.det_inp_dim > 32
        detector.cuda()
        detector.eval()

        self.stopped = False
        self.dataloder = dataloder
        self.batchSize = batchSize
        self.datalen = self.dataloder.length()
        # Ceiling division: a partial trailing batch still counts as a batch.
        extra = 1 if self.datalen % batchSize else 0
        self.num_batches = self.datalen // batchSize + extra

        # Thread-backed queue in single-process mode, process-safe otherwise.
        self.Q = Queue(maxsize=queueSize) if opt.sp \
            else mp.Queue(maxsize=queueSize)

    def start(self):
        """Launch ``update`` in a daemon thread or process; return ``self``."""
        runner_cls = Thread if opt.sp else mp.Process
        worker = runner_cls(target=self.update, args=())
        worker.daemon = True
        worker.start()
        return self

    def _enqueue(self, item):
        """Queue ``item``, pausing half a second first if the queue is full."""
        if self.Q.full():
            time.sleep(0.5)
        self.Q.put(item)

    def update(self):
        """Process ``num_batches`` batches and queue the per-image results."""
        for _ in range(self.num_batches):
            img, orig_img, im_name, im_dim_list = self.dataloder.getitem()
            if img is None:
                # NOTE(review): this end sentinel has 7 fields while regular
                # items have 8 - confirm the consumer expects that.
                self.Q.put((None,) * 7)
                return

            start_time = getTime()
            with torch.no_grad():
                # Human Detection
                img = img.cuda()
                prediction = self.det_model(img, CUDA=True)
                # NMS process
                carperson = dynamic_write_results(prediction, opt.confidence,
                                                  opt.num_classes, nms=True,
                                                  nms_conf=opt.nms_thesh)
                if isinstance(carperson, int) or carperson.shape[0] == 0:
                    for idx in range(len(orig_img)):
                        # 8 elements, all detection fields empty
                        self._enqueue((orig_img[idx], im_name[idx]) + (None,) * 6)
                    continue

                ckpt_time, _ = getTime(start_time)

                carperson = carperson.cpu()
                # One image-size row per detection, chosen by column 0.
                im_dim_list = torch.index_select(im_dim_list, 0,
                                                 carperson[:, 0].long())
                scaling_factor = torch.min(self.det_inp_dim / im_dim_list,
                                           1)[0].view(-1, 1)

                # coordinate transfer: subtract the centring offset, then
                # divide by the resize factor.
                offset_x = (self.det_inp_dim
                            - scaling_factor * im_dim_list[:, 0].view(-1, 1)) / 2
                offset_y = (self.det_inp_dim
                            - scaling_factor * im_dim_list[:, 1].view(-1, 1)) / 2
                carperson[:, [1, 3]] -= offset_x
                carperson[:, [2, 4]] -= offset_y
                carperson[:, 1:5] /= scaling_factor

                # Clamp every box to its own image bounds.
                for row in range(carperson.shape[0]):
                    carperson[row, [1, 3]] = torch.clamp(
                        carperson[row, [1, 3]], 0.0, im_dim_list[row, 0])
                    carperson[row, [2, 4]] = torch.clamp(
                        carperson[row, [2, 4]], 0.0, im_dim_list[row, 1])

                def rows_of_class(class_index):
                    # Zero out rows of other classes, then keep the rows whose
                    # second-to-last column is still non-zero.
                    keep = (carperson[:, -1] == class_index).float().unsqueeze(1)
                    masked = carperson * keep
                    picked = torch.nonzero(masked[:, -2]).squeeze()
                    return carperson[picked].view(-1, 8)

                car_dets = rows_of_class(2)  # car
                hm_dets = rows_of_class(0)  # person

                ckpt_time, _ = getTime(ckpt_time)

                has_people = hm_dets.size(0) > 0
                hm_boxes = hm_dets[:, 1:5] if has_people else None
                hm_scores = hm_dets[:, 5:6] if has_people else None
                car_box_conf = car_dets if car_dets.size(0) > 0 else None

            for idx in range(len(orig_img)):  # for k-th image detection.
                car_k = None
                if car_box_conf is not None:
                    car_k = car_box_conf[car_box_conf[:, 0] == idx].numpy()
                    # TODO check here, cls or bg/fg confidence?
                    car_k = car_k[car_k[:, 5] > 0.2]

                if hm_boxes is None:
                    # 8 elements, person fields empty
                    item = (orig_img[idx], im_name[idx]) + (None,) * 5 + (car_k,)
                else:
                    in_image = hm_dets[:, 0] == idx
                    hm_boxes_k = hm_boxes[in_image]
                    hm_scores_k = hm_scores[in_image]
                    count = hm_boxes_k.size(0)
                    # Zero-filled placeholders sized by the number of boxes.
                    inps = torch.zeros(count, 3, opt.inputResH, opt.inputResW)
                    pt1 = torch.zeros(count, 2)
                    pt2 = torch.zeros(count, 2)
                    item = (orig_img[idx], im_name[idx], hm_boxes_k,
                            hm_scores_k, inps, pt1, pt2, car_k)

                self._enqueue(item)

            ckpt_time, _ = getTime(ckpt_time)

    def read(self):
        """Return the next queued result (blocks while the queue is empty)."""
        return self.Q.get()

    def len(self):
        """Return the approximate number of queued results."""
        return self.Q.qsize()
def main():
    """Run YOLO on a video stream, preview it and forward frames to an encoder.

    Reads frames from ``args.input_url``, draws detections on a preview
    window, prints the running FPS and writes each frame that has
    detections to the process built by ``stream_factory``.  Pressing ``q``
    in the preview window stops the loop.  The capture, the windows, the
    encoder's stdin pipe and the MQTT client are released on exit, also when
    an exception interrupts the loop.
    """
    global args
    args = parser.parse_args()

    # Yolo
    confidence = float(args.confidence)
    nms_thesh = float(args.nms_thresh)
    CUDA = torch.cuda.is_available()
    num_classes = 80

    model = Darknet(args.config_file)
    model.load_weights(args.weights_file)
    model.net_info["height"] = args.reso
    inp_dim = int(model.net_info["height"])
    assert inp_dim % 32 == 0
    assert inp_dim > 32
    if CUDA:
        model.cuda()
    model.eval()

    # Class names and colours never change, so load them once up front
    # and close the palette file promptly.
    classes = load_classes('yolo/data/coco.names')
    with open("yolo/pallete", "rb") as palette_file:
        colors = pkl.load(palette_file)

    # Connect
    client = paho.Client()
    host, port = args.broker_url.split(':')
    client.connect(host, int(port))
    # subscribe a system messages
    client.message_callback_add("$SYS/#", system_message)
    client.subscribe("$SYS/#")

    # Open rtsp stream
    cap = cv2.VideoCapture(args.input_url)
    assert cap.isOpened(), 'Cannot capture source {}'.format(args.input_url)

    # Inspect input stream
    input_width = cap.get(cv2.CAP_PROP_FRAME_WIDTH)
    input_height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)
    input_fps = cap.get(cv2.CAP_PROP_FPS)
    print("[input stream] width: {}, height: {}, fps: {}".format(
        input_width, input_height, input_fps))

    # Open output stream
    output_command = stream_factory(args.output_url, input_width,
                                    input_height, input_fps)
    print(output_command)
    output_stream = sp.Popen(output_command, stdin=sp.PIPE, stderr=sp.PIPE)

    frames = 0
    start = time.time()
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # Our detect operations on the frame come here
            img, orig_im, dim = prep_image(frame, inp_dim)
            if CUDA:
                img = img.cuda()

            output = model(Variable(img), CUDA)
            output = write_results(output, confidence, num_classes,
                                   nms=True, nms_conf=nms_thesh)

            if isinstance(output, int):
                # No detections: show the raw frame and move on.
                frames += 1
                print("FPS of the video is {:5.2f}".format(
                    frames / (time.time() - start)))
                cv2.imshow("frame", orig_im)
                key = cv2.waitKey(1)
                if key & 0xFF == ord('q'):
                    break
                continue

            # Clamp boxes to the network input, normalise to [0, 1], then
            # scale to the frame's width (x) and height (y).
            output[:, 1:5] = torch.clamp(output[:, 1:5], 0.0,
                                         float(inp_dim)) / inp_dim
            output[:, [1, 3]] *= frame.shape[1]
            output[:, [2, 4]] *= frame.shape[0]

            # Overlay on screen
            for detection in output:
                write(detection, orig_im, classes, colors)

            # Display the resulting frame
            cv2.imshow("frame", orig_im)
            frames += 1
            print("FPS of the video is {:5.2f}, size: {}".format(
                frames / (time.time() - start), orig_im.size))

            # Write rtmp stream
            output_stream.stdin.write(frame.tobytes())

            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Close
        cap.release()
        cv2.destroyAllWindows()
        try:
            output_stream.stdin.close()
        except BrokenPipeError:
            # The encoder already exited; there is nothing left to flush.
            pass
        client.disconnect()