def load_model(model_file):
    """Create a YOLACT network in eval mode with weights from *model_file*.

    Side effects: switches torch's default tensor type to
    ``torch.cuda.FloatTensor`` and activates the
    ``yolact_plus_resnet50_config`` configuration via ``set_cfg``.

    Args:
        model_file: Path handed unchanged to ``Yolact.load_weights``.

    Returns:
        The ``Yolact`` instance after ``eval()`` has been called on it.
    """
    # Global setup first, exactly as before: tensor type, then configuration.
    torch.set_default_tensor_type('torch.cuda.FloatTensor')
    set_cfg('yolact_plus_resnet50_config')

    model = Yolact()
    model.load_weights(model_file)
    model.eval()
    return model
class YOLACT_MODEL():
    """YOLACT (ResNet-50 config) wrapper that masks an image with its top detection."""

    def __init__(self, opts):
        """Load the network described by *opts*.

        Args:
            opts: Mapping with the keys ``'checkpoint'`` (weights path passed
                to ``Yolact.load_weights``) and ``'threshold'`` (score
                threshold later forwarded to ``display``).
        """
        set_cfg('yolact_resnet50_config')
        cudnn.benchmark = True
        cudnn.fastest = True
        torch.set_default_tensor_type('torch.cuda.FloatTensor')

        self.net = Yolact()
        self.net.load_weights(opts['checkpoint'])
        print("done.")
        self.net.eval()
        self.net = self.net.cuda()

        self.net.detect.use_fast_nms = True
        cfg.mask_proto_debug = False
        self.color_cache = defaultdict(lambda: {})
        self.threshold = opts['threshold']

    def detect(self, img):
        """Run the network on *img* and return the masked image.

        Args:
            img: Anything ``np.array`` can convert to an image array.

        Returns:
            The uint8 numpy image produced by ``display``.
        """
        numpy_image = np.array(img)
        print('starting inference...')
        # Pure inference: no autograd graph is needed for the forward pass or
        # the post-processing, so skip building one.
        with torch.no_grad():
            frame = torch.from_numpy(numpy_image).cuda().float()
            batch = FastBaseTransform()(frame.unsqueeze(0))
            preds = self.net(batch)
            print("done.")
            output_image = self.display(preds, frame, None, None,
                                        undo_transform=False,
                                        score_threshold=self.threshold)
        return output_image

    def display(self, dets_out, img, h, w, undo_transform=True,
                class_color=False, mask_alpha=0.45, top_k=100,
                score_threshold=0.3):
        """Multiply *img* by the first detection mask and return it as uint8.

        Args:
            dets_out: Raw network output, forwarded to ``postprocess``.
            img: Image tensor; its first two dimensions are used as height
                and width.
            h, w: Ignored; both are overwritten from ``img.shape``.
            undo_transform, class_color, mask_alpha: Accepted for interface
                compatibility; not used by this implementation.
            top_k: Maximum number of masks kept before the first is applied.
            score_threshold: Forwarded to ``postprocess``.

        Returns:
            A uint8 numpy array. When the mask branch is disabled or nothing
            was detected, the image is returned unmasked.
        """
        img_gpu = img / 255.0
        h, w, _ = img.shape

        with timer.env('Postprocess'):
            t = postprocess(dets_out, w, h,
                            visualize_lincomb=False,
                            crop_masks=True,
                            score_threshold=score_threshold)
            torch.cuda.synchronize()

        with timer.env('Copy'):
            if cfg.eval_mask_branch:
                # Masks are applied on the GPU, so don't copy
                masks = t[3][:top_k]
                # Indexing an empty result would raise IndexError, so only
                # apply a mask when at least one detection survived.
                if masks.size(0) > 0:
                    mask = masks[0]
                    # A 2-D mask cannot broadcast against an (h, w, channels)
                    # image; add the trailing channel axis when it is missing.
                    if mask.dim() == img_gpu.dim() - 1:
                        mask = mask.unsqueeze(-1)
                    img_gpu = img_gpu * mask

        # Make sure the result is a uint8 array before handing it to the caller
        img_numpy = (img_gpu * 255).byte().cpu().numpy()
        return img_numpy
def __init__(
        self,
        weights='./crow_vision_yolact/data/yolact/weights/weights_yolact_kuka_17/crow_base_35_457142.pth',
        config=None,
        batchsize=1,
        top_k=25,
        score_threshold=0.1,
        display_text=True,
        display_bboxes=True,
        display_masks=True,
        display_scores=True):
    """Configure YOLACT and load a network for inference into ``self.net``.

    Args:
        weights: Checkpoint path given to ``Yolact.load_weights``.
        config: Optional configuration passed to ``set_cfg``. When it
            contains ``'.obj'`` it is treated as a file name and the object
            stored in that file is loaded with ``dill`` and used instead.
        batchsize: Stored as ``self.batchsize``.
        top_k: Stored as ``self.top_k`` and forwarded to ``parse_args``.
        score_threshold: Stored as ``self.score_threshold`` and forwarded
            to ``parse_args``.
        display_text, display_bboxes, display_masks, display_scores:
            Forwarded to ``parse_args`` as ``--display_*`` options.
    """
    self.score_threshold = score_threshold
    self.top_k = top_k
    self.batchsize = batchsize

    # --- YOLACT configuration ---
    if config is not None:
        if '.obj' in config:
            # NOTE(review): dill.load can execute arbitrary code from the
            # file; only load configuration objects from trusted sources.
            with open(config, 'rb') as f:
                config = dill.load(f)
        set_cfg(config)
    self.class_names_tuple = get_class_names_tuple()

    # Same option strings as before, generated from one ordered table.
    cli_options = (
        ('top_k', top_k),
        ('score_threshold', score_threshold),
        ('display_text', display_text),
        ('display_bboxes', display_bboxes),
        ('display_masks', display_masks),
        ('display_scores', display_scores),
    )
    parse_args(['--' + option + '=' + str(value)
                for option, value in cli_options])

    # --- CUDA setup for YOLACT ---
    torch.backends.cudnn.fastest = True
    torch.set_default_tensor_type('torch.cuda.FloatTensor')

    # --- the network itself ---
    with torch.no_grad():
        model = Yolact().cuda(torch.cuda.current_device())
        model.load_weights(weights)
        model.eval()
        model.detect.use_fast_nms = True
        model.detect.use_cross_class_nms = False
    self.net = model
    print("YOLACT network available as self.net")

    # for debug, benchmark
    self.duration = 0.0
def init_model(transform):
    """Parse the command line, configure YOLACT and load the network.

    Args:
        transform: Rebound locally to a CUDA ``DataParallel`` wrapper of
            ``FastBaseTransform`` but never returned.
            NOTE(review): the caller does not observe that rebinding.

    Returns:
        ``(net, args)``: the network wrapped in ``CustomDataParallel`` on
        CUDA, and the parsed arguments (``args.trained_model`` and
        ``args.config`` may have been resolved/filled in here).
    """
    args = parse_args()

    explicit_config = args.config is not None
    if explicit_config:
        print(args.config)
        set_cfg(args.config)
        cfg.mask_proto_debug = False

    # Resolve the symbolic checkpoint names to actual files.
    requested = args.trained_model
    if requested == 'interrupt':
        args.trained_model = SavePath.get_interrupt('weights/')
    elif requested == 'latest':
        args.trained_model = SavePath.get_latest('weights/', cfg.name)

    if args.config is None:
        # TODO: Bad practice? Probably want to do a name lookup instead.
        parsed_path = SavePath.from_str(args.trained_model)
        args.config = parsed_path.model_name + '_config'
        print('Config not specified. Parsed %s from the file name.\n' % args.config)
        set_cfg(args.config)

    if args.detect:
        cfg.eval_mask_branch = False

    if args.dataset is not None:
        set_dataset(args.dataset)

    with torch.no_grad():
        if args.cuda:
            cudnn.fastest = True
        torch.set_default_tensor_type(
            'torch.cuda.FloatTensor' if args.cuda else 'torch.FloatTensor')

        print('Loading model...', end='')
        model = Yolact()
        model.load_weights(args.trained_model)
        model.eval()
        print(' Done.')

        model = model.cuda()
        model = CustomDataParallel(model).cuda()
        transform = torch.nn.DataParallel(FastBaseTransform()).cuda()

    return model, args
def load_weights(filename, cuda):
    """Load YOLACT network weights into the module-level ``ynet``.

    Args:
        filename: Path of the weights file; must not be the empty string.
        cuda: When true, CUDA float tensors become torch's default tensor
            type (and ``cudnn.fastest`` is enabled); otherwise CPU float
            tensors are selected.

    Raises:
        ValueError: If *filename* is the empty string.
    """
    global ynet

    if filename == '':
        raise ValueError('Empty filename for network weights')

    print('#### CUDA ENABLED', cuda)
    print(f'Loading weights from {filename}')

    started_ns = time.perf_counter_ns()
    with torch.no_grad():
        if cuda:
            cudnn.fastest = True
        default_type = 'torch.cuda.FloatTensor' if cuda else 'torch.FloatTensor'
        torch.set_default_tensor_type(default_type)

        ynet = Yolact()
        ynet.load_weights(filename, False)
        ynet.eval()
    finished_ns = time.perf_counter_ns()

    logging.debug(f'Time to load weights: {1e-9 * (finished_ns - started_ns)}')
def convert_to_onnx_with_hydra(cfg: DictConfig):
    """Export a YOLACT checkpoint to ONNX using the ``cfg.onnx`` settings.

    Reads from ``cfg.onnx``: ``yolact_cfg``, ``model_ckpt_path``, the four
    ``model_*`` input dimensions, ``model_onnx_path``, ``verbose`` and
    ``opset_version``.

    Args:
        cfg: Configuration object; also passed to ``createFolderOnnx``.
    """
    # create folder for onnx
    createFolderOnnx(cfg)

    # select the YOLACT configuration
    set_cfg(cfg.onnx.yolact_cfg)

    # Build the network on the CPU in eval mode.
    network = Yolact()
    network.load_weights(cfg.onnx.model_ckpt_path)
    network.eval()
    network = network.cpu()

    # Random tensor with the configured (batch, channel, height, width) sizes.
    input_shape = (
        cfg.onnx.model_batch_size,
        cfg.onnx.model_channel_input,
        cfg.onnx.model_height_input,
        cfg.onnx.model_width_input,
    )
    example_input = torch.rand(input_shape)

    torch.onnx.export(
        network,
        example_input,
        cfg.onnx.model_onnx_path,
        verbose=cfg.onnx.verbose,
        opset_version=cfg.onnx.opset_version,
    )
def main(args):
    """Start the ``yolact_ros`` node: load the network and spin until shutdown.

    Args:
        args: Not used by this function.
    """
    rospy.init_node('yolact_ros')

    package_dir = rospkg.RosPack().get_path('yolact_ros')
    weights_file = package_dir + "/scripts/yolact/weights/yolact_base_54_800000.pth"

    # The configuration name is derived from the weights file name.
    weights_info = SavePath.from_str(weights_file)
    set_cfg(weights_info.model_name + '_config')

    with torch.no_grad():
        results_dir = package_dir + "/scripts/yolact/results"
        if not os.path.exists(results_dir):
            os.makedirs(results_dir)

        cudnn.benchmark = True
        cudnn.fastest = True
        torch.set_default_tensor_type('torch.cuda.FloatTensor')

        print('Loading model...', end='')
        model = Yolact()
        model.load_weights(weights_file)
        model.eval()
        print(' Done.')

        model = model.cuda()
        model.detect.use_fast_nms = True
        cfg.mask_proto_debug = False

        # Keep a reference so the converter stays alive while spinning.
        ic = image_converter(model)
        try:
            rospy.spin()
        except KeyboardInterrupt:
            print("Shutting down")
        cv2.destroyAllWindows()
class YolactWorker(qc.QObject):
    """Qt worker that runs YOLACT on single frames and emits bounding boxes.

    Parameters are changed through the slots below; ``process`` runs the
    network on one frame and reports the result via ``sigProcessed``.
    """

    # Emits the bounding boxes of the detected objects, in
    # (x-left, y-top, width, height) format, together with the frame
    # position. The position is passed along so the receiver can synchronize
    # the displayed image with its bounding boxes.
    sigProcessed = qc.pyqtSignal(np.ndarray, int)
    sigInitialized = qc.pyqtSignal()
    sigError = qc.pyqtSignal(YolactException)

    def __init__(self):
        super(YolactWorker, self).__init__()
        self.mutex = qc.QMutex()
        self._image = None
        self._pos = 0
        self.top_k = 10
        self.cuda = torch.cuda.is_available()
        self.net = None
        self.score_threshold = 0.15
        self.overlap_thresh = 1.0
        self.config = yconfig.cfg
        self.weights_file = ''
        self.config_file = ''
        self.video_file = None

    def setWaitCond(self, waitCond: threading.Event) -> None:
        """Store the event object under the worker mutex."""
        locker = qc.QMutexLocker(self.mutex)
        self._waitCond = waitCond

    @qc.pyqtSlot(bool)
    def enableCuda(self, on):
        """Persist the CUDA choice in the settings and use it from now on."""
        settings.setValue('yolact/cuda', on)
        self.cuda = on

    @qc.pyqtSlot(int)
    def setTopK(self, value):
        """Set the maximum number of detections kept per frame."""
        locker = qc.QMutexLocker(self.mutex)
        self.top_k = value

    @qc.pyqtSlot(int)
    def setBatchSize(self, value):
        """Store the batch size as an int."""
        locker = qc.QMutexLocker(self.mutex)
        self.batch_size = int(value)

    @qc.pyqtSlot(float)
    def setScoreThresh(self, value):
        """Set the score threshold forwarded to ``postprocess``."""
        locker = qc.QMutexLocker(self.mutex)
        self.score_threshold = value

    @qc.pyqtSlot(float)
    def setOverlapThresh(self, value):
        """Merge objects if their bboxes overlap more than this."""
        locker = qc.QMutexLocker(self.mutex)
        self.overlap_thresh = value

    @qc.pyqtSlot(str)
    def setConfig(self, filename):
        """Overlay the YAML file *filename* onto the YOLACT configuration.

        An empty file name is ignored. ``mask_proto_debug`` is forced to
        False unless the file sets it.
        """
        if filename == '':
            return
        self.config_file = filename
        with open(filename, 'r') as cfg_file:
            config = yaml.safe_load(cfg_file)
            for key, value in config.items():
                logging.debug('%r \n%r %r', key, type(value), value)
                setattr(self.config, key, value)
            if 'mask_proto_debug' not in config:
                self.config.mask_proto_debug = False
        logging.debug(yaml.dump(self.config))

    @qc.pyqtSlot(str)
    def setWeights(self, filename: str) -> None:
        """Create the network, load weights from *filename*, emit ``sigInitialized``.

        Raises:
            YolactException: If *filename* is the empty string.
        """
        if filename == '':
            raise YolactException('Empty filename for network weights')
        self.weights_file = filename
        tic = time.perf_counter_ns()
        with torch.no_grad():
            if self.cuda:
                cudnn.fastest = True
                torch.set_default_tensor_type('torch.cuda.FloatTensor')
            else:
                torch.set_default_tensor_type('torch.FloatTensor')
            self.net = Yolact()
            self.net.load_weights(self.weights_file, self.cuda)
            self.net.eval()
            if self.cuda:
                self.net = self.net.cuda()
        toc = time.perf_counter_ns()
        logging.debug('Time to load weights %f s', 1e-9 * (toc - tic))
        self.sigInitialized.emit()

    @qc.pyqtSlot(np.ndarray, int)
    def process(self, image: np.ndarray, pos: int):
        """Detect objects in *image* and emit their boxes via ``sigProcessed``.

        Nothing is returned. ``sigProcessed`` is emitted with ``(boxes, pos)``
        where ``boxes`` holds the detections in (xleft, ytop, width, height)
        format, best score first and at most ``top_k`` of them. If the
        network has not been initialized, ``sigError`` is emitted instead.

        Args:
            image: Frame as a numpy array whose first two dimensions are
                height and width.
            pos: Frame position, passed through unchanged.
        """
        _ts = time.perf_counter()
        logging.debug(f'Received frame {pos}')
        if self.net is None:
            self.sigError.emit(YolactException('Network not initialized'))
            return
        # Partly follows yolact eval.py
        tic = time.perf_counter_ns()
        # The locker must stay bound to its own name for the whole call:
        # rebinding it (e.g. by unpacking a shape into `_`) destroys the
        # locker object and releases the mutex too early.
        locker = qc.QMutexLocker(self.mutex)
        with torch.no_grad():
            if self.cuda:
                image = torch.from_numpy(image).cuda().float()
            else:
                image = torch.from_numpy(image).float()
            batch = FastBaseTransform()(image.unsqueeze(0))
            preds = self.net(batch)
            h, w = image.shape[:2]
            # Temporarily force bbox rescoring and always put the previous
            # setting back, even when postprocess raises.
            save = self.config.rescore_bbox
            self.config.rescore_bbox = True
            try:
                classes, scores, boxes, masks = oututils.postprocess(
                    preds,
                    w,
                    h,
                    visualize_lincomb=False,
                    crop_masks=True,
                    score_threshold=self.score_threshold)
            finally:
                self.config.rescore_bbox = save
            # `postprocess` already applied the score threshold, so only the
            # top-k selection is left to do here.
            idx = scores.argsort(0, descending=True)[:self.top_k]
            boxes = boxes[idx].cpu().numpy()
        if len(boxes) == 0:
            self.sigProcessed.emit(boxes, pos)
            return
        # Convert from top-left bottom-right format to
        # top-left, width, height format
        boxes[:, 2:] = boxes[:, 2:] - boxes[:, :2]
        boxes = np.asanyarray(boxes, dtype=np.int_)
        if self.overlap_thresh < 1:
            dist_matrix = pairwise_distance(new_bboxes=boxes,
                                            bboxes=boxes,
                                            boxtype=OutlineStyle.bbox,
                                            metric=DistanceMetric.ios)
            # For each pair (ii < jj) that is closer than the limit, drop the
            # later box jj.
            bad_idx = [jj for ii in range(dist_matrix.shape[0] - 1)
                       for jj in range(ii + 1, dist_matrix.shape[1])
                       if dist_matrix[ii, jj] < 1 - self.overlap_thresh]
            good_idx = list(set(range(boxes.shape[0])) - set(bad_idx))
            boxes = boxes[good_idx].copy()
        toc = time.perf_counter_ns()
        logging.debug('Time to process single _image: %f s',
                      1e-9 * (toc - tic))
        self.sigProcessed.emit(boxes, pos)
        logging.debug(f'Emitted bboxes for frame {pos}: {boxes}')
        _dt = time.perf_counter() - _ts
        logging.debug(
            f'{__name__}.{self.__class__.__name__}.process: Runtime: {_dt}s')
# # Editor : VIM # File name : convert_weight.py # Author : YunYang1994 # Created date: 2019-07-27 18:07:20 # Description : # #================================================================ import torch import numpy as np from yolact import Yolact with torch.no_grad(): model = Yolact() model.eval() model.load_weights("./yolact_darknet53_54_800000.pth") modules = model.children() def parse_layer(layer, weights): assert isinstance(layer, torch.nn.Conv2d) or isinstance( layer, torch.nn.BatchNorm2d) print("=> Parsing ", layer) if isinstance(layer, torch.nn.Conv2d): weight, bias = layer.weight.detach().numpy(), layer.bias weight = np.transpose( weight, [2, 3, 1, 0]) # k_h, h_w, in_channels, out_channels if bias is None: weights.append([weight]) else:
class YOLACT_MODEL():
    """YOLACT (ResNet-50 config) wrapper that draws masks and/or boxes on an image."""

    def __init__(self, opts):
        """Load the network described by *opts*.

        Args:
            opts: Mapping with the keys ``'checkpoint'`` (weights path),
                ``'threshold'`` (score threshold) and ``'mode'``
                (``"mask_only"`` and ``"box_only"`` switch off the other
                overlay; any other value draws both).
        """
        set_cfg('yolact_resnet50_config')
        cudnn.benchmark = True
        cudnn.fastest = True
        torch.set_default_tensor_type('torch.cuda.FloatTensor')

        self.net = Yolact()
        self.net.load_weights(opts['checkpoint'])
        print("done.")
        self.net.eval()
        self.net = self.net.cuda()

        self.net.detect.use_fast_nms = True
        cfg.mask_proto_debug = False
        self.color_cache = defaultdict(lambda: {})
        self.threshold = opts['threshold']
        self.mode = opts['mode']

    def detect(self, img):
        """Run the network on *img* and return whatever ``display`` returns."""
        numpy_image = np.array(img)
        print('starting inference...')
        frame = torch.from_numpy(numpy_image).cuda().float()
        batch = FastBaseTransform()(frame.unsqueeze(0))
        preds = self.net(batch)
        print("done.")
        return self.display(preds, frame, None, None,
                            undo_transform=False,
                            score_threshold=self.threshold)

    def display(self, dets_out, img, h, w, undo_transform=True,
                class_color=False, mask_alpha=0.45, top_k=100,
                score_threshold=0.3):
        """Draw the detections in *dets_out* onto *img*.

        Args:
            dets_out: Raw network output, forwarded to ``postprocess``.
            img: Image tensor; its first two dimensions give height and width.
            h, w: Ignored; both are overwritten from ``img.shape``.
            undo_transform: When false, colour tuples are reversed before use.
            class_color: Pick colours per class instead of per detection.
            mask_alpha: Opacity of the mask overlay.
            top_k: Maximum number of detections considered.
            score_threshold: Forwarded to ``postprocess``.

        Returns:
            ``(img_numpy, boxes, scores)`` when at least one detection is
            drawn. NOTE(review): with no detections only the uint8 image
            array is returned, not a tuple.
        """
        img_gpu = img / 255.0
        h, w, _ = img.shape

        with timer.env('Postprocess'):
            t = postprocess(dets_out, w, h,
                            visualize_lincomb=False,
                            crop_masks=True,
                            score_threshold=score_threshold)
            torch.cuda.synchronize()

        with timer.env('Copy'):
            if cfg.eval_mask_branch:
                # Masks are drawn on the GPU, so don't copy
                masks = t[3][:top_k]
            classes, scores, boxes = [
                part[:top_k].detach().cpu().numpy() for part in t[:3]
            ]

        # Stop at the first detection with a negative score, if there is one.
        limit = min(top_k, classes.shape[0])
        num_dets_to_consider = next(
            (k for k in range(limit) if scores[k] < 0), limit)

        if num_dets_to_consider == 0:
            # No detections found so just output the original image
            return (img_gpu * 255).byte().detach().cpu().numpy()

        def get_color(j, on_gpu=None):
            """Return the colour for detection *j*, cached per GPU index."""
            color_idx = (classes[j] * 5 if class_color else j * 5) % len(COLORS)

            if on_gpu is not None and color_idx in self.color_cache[on_gpu]:
                return self.color_cache[on_gpu][color_idx]

            color = COLORS[color_idx]
            if not undo_transform:
                # The image might come in as RGB or BRG, depending
                color = (color[2], color[1], color[0])
            if on_gpu is not None:
                color = torch.Tensor(color).to(on_gpu).float() / 255.
                self.color_cache[on_gpu][color_idx] = color
            return color

        show_mask = self.mode != "box_only"
        show_box = self.mode != "mask_only"
        print("mode :", self.mode)
        print("show_mask :", show_mask)
        print("show_box :", show_box)

        # Masks are blended on the GPU with plain tensor operations.
        if show_mask and cfg.eval_mask_branch:
            # After this, mask is of size [num_dets, h, w, 1]
            masks = masks[:num_dets_to_consider, :, :, None]

            # One colour per mask, stacked along the detection axis
            gpu_index = img_gpu.device.index
            colors = torch.cat(
                [get_color(k, on_gpu=gpu_index).view(1, 1, 1, 3)
                 for k in range(num_dets_to_consider)],
                dim=0)
            masks_color = masks.repeat(1, 1, 1, 3) * colors * mask_alpha

            # This is 1 everywhere except for 1-mask_alpha where the mask is
            inv_alph_masks = masks * (-mask_alpha) + 1

            # This whole block should be equivalent to:
            #    for j in range(num_dets_to_consider):
            #        img_gpu = img_gpu * inv_alph_masks[j] + masks_color[j]
            masks_color_summand = masks_color[0]
            if num_dets_to_consider > 1:
                inv_alph_cumul = inv_alph_masks[:(num_dets_to_consider - 1)].cumprod(dim=0)
                masks_color_cumul = masks_color[1:] * inv_alph_cumul
                masks_color_summand += masks_color_cumul.sum(dim=0)

            img_gpu = img_gpu * inv_alph_masks.prod(dim=0) + masks_color_summand

        # Everything below is drawn on the CPU.
        # Note, make sure this is a uint8 tensor or opencv will not anti alias text for whatever reason
        img_numpy = (img_gpu * 255).byte().cpu().numpy()

        if show_box:
            font_face = cv2.FONT_HERSHEY_DUPLEX
            font_scale = 0.6
            font_thickness = 1
            text_color = [255, 255, 255]

            for k in range(num_dets_to_consider - 1, -1, -1):
                x1, y1, x2, y2 = boxes[k, :]
                color = get_color(k)
                score = scores[k]

                cv2.rectangle(img_numpy, (x1, y1), (x2, y2), color, 1)

                _class = cfg.dataset.class_names[classes[k]]
                text_str = '%s: %.2f' % (_class, score)
                text_w, text_h = cv2.getTextSize(
                    text_str, font_face, font_scale, font_thickness)[0]
                text_pt = (x1, y1 - 3)

                # Filled label background, then the label text on top of it
                cv2.rectangle(img_numpy, (x1, y1),
                              (x1 + text_w, y1 - text_h - 4), color, -1)
                cv2.putText(img_numpy, text_str, text_pt, font_face,
                            font_scale, text_color, font_thickness,
                            cv2.LINE_AA)

        return (img_numpy, boxes, scores)
def detect():
    """Run the pear model over a folder of images and compute pear roundness.

    Paths and the configuration name are hard-coded. For every ``.jpg`` /
    ``.png`` file the inference time is printed; when a class-0 (pear)
    detection exists, its mask is turned into an outline and a roundness
    value that is appended to that image's result list. Images that cannot
    be read, and images without a pear, are skipped.
    """
    img_path = '/home/user/dataset/pear/train/JPEGImages'
    save_path = '/home/user/pear_output'
    weight_path = '/home/user/caoliwei/yolact/weights/20200901/yolact_darknet53_1176_20000.pth'

    set_cfg('pear_config')
    with torch.no_grad():
        torch.cuda.set_device(0)

        ######
        # If the input image size is constant, this make things faster (hence why we can use it in a video setting).
        # cudnn.benchmark = True
        # cudnn.fastest = True
        torch.set_default_tensor_type('torch.cuda.FloatTensor')
        ######

        net = Yolact()
        net.load_weights(weight_path)
        net.eval()
        net = net.cuda()
        print('model loaded...')

        net.detect.cross_class_nms = True
        net.detect.use_fast_nms = True
        cfg.mask_proto_debug = False

        os.makedirs(save_path, exist_ok=True)

        img_names = [
            name for name in os.listdir(img_path)
            if name.endswith(('.jpg', '.png'))
        ]
        for img_name in img_names:
            img = cv2.imread(os.path.join(img_path, img_name))
            if img is None:
                # cv2.imread signals failure by returning None; skip the file
                # instead of aborting the whole run.
                print('clw: image_name: %s could not be read, skipped' % img_name)
                continue
            img = torch.from_numpy(img).cuda().float()
            img = FastBaseTransform()(img.unsqueeze(0))

            start = time.time()
            preds = net(img)
            print('clw: image_name: %s, inference time use %.3fs' %
                  (img_name, time.time() - start))

            # NOTE(review): h and w are taken from the transformed batch, not
            # from the image as read from disk.
            h, w = img.shape[2:]
            # classes, scores, boxes, masks, sorted by score. Copied into a
            # list so that the roundness can be appended below.
            result = list(postprocess(preds, w, h,
                                      crop_masks=True,
                                      score_threshold=0.3))
            # TODO: is it necessary to keep only the top_k detections?

            # Walk result[0] in order and take the first entry equal to 0,
            # i.e. the pear; its index selects the corresponding mask.
            pear_idx = next(
                (i for i, cls_id in enumerate(result[0]) if cls_id == 0),
                None)
            if pear_idx is None:
                # No pear in this image: nothing to measure.
                continue
            pear_mask = result[3][pear_idx].cpu().numpy()

            # Extract the outline from the pear mask
            pear_outline = get_outline_from_mask(pear_mask, w, h)
            roundness = compute_roundness(pear_outline)
            result.append(roundness)
class pear_detector(object):
    """Pear detector: YOLACT with the ``pear_config`` plus a roundness measure."""

    def __init__(
            self,
            weight_path='C:/Users/user/yolact_notes/weights/yolact_darknet53_249_2000.pth',
            save_path='C:/Users/user/yolact_notes/pear_output'):
        """Load the network.

        Args:
            weight_path: Checkpoint passed to ``Yolact.load_weights``.
            save_path: Stored as ``self.save_path``; not otherwise used here.
        """
        set_cfg('pear_config')
        self.save_path = save_path
        self.weight_path = weight_path

        self.net = Yolact()
        self.net.load_weights(self.weight_path)
        self.net.eval()
        self.net = self.net.cuda()
        print('model loaded...')

        self.net.detect.cross_class_nms = True
        self.net.detect.use_fast_nms = True

    def detect(self, img):
        """Detect objects in *img* and append the roundness of the first pear.

        Args:
            img: Image as a numpy array. It is copied before use; no channel
                reordering is applied here.

        Returns:
            A list with the ``postprocess`` outputs (classes, scores, boxes,
            masks), followed by the roundness when a class-0 detection
            exists. Errors are printed, not raised; the list then contains
            whatever was produced before the failure (possibly nothing).
        """
        # Defined up front so the final return works even when the failure
        # happens before post-processing.
        result = []
        try:
            print('')
            print(
                '======================== clw: detect of python nn start !! ================================'
            )
            print('img.shape:', img.shape)
            with torch.no_grad():
                torch.cuda.set_device(0)

                ######
                # If the input image size is constant, this make things faster (hence why we can use it in a video setting).
                # cudnn.benchmark = True
                # cudnn.fastest = True
                torch.set_default_tensor_type('torch.cuda.FloatTensor')
                ######

                cfg.mask_proto_debug = False

                # clw note: training loads images with cv2.imread() and
                # BackboneTransform handles the BGR order; at test time
                # FastBaseTransform handles BGR as well, so no ::-1 channel
                # flip should be needed.
                img = img.copy()
                img = torch.from_numpy(img).cuda().float()
                img = FastBaseTransform()(img.unsqueeze(0))

                start = time.time()
                preds = self.net(img)

                # NOTE(review): h and w come from the transformed batch.
                h, w = img.shape[2:]
                # classes, scores, boxes, masks, sorted by score. Copied into
                # a list so that the roundness can be appended below.
                result = list(postprocess(preds, w, h,
                                          crop_masks=True,
                                          score_threshold=0.3))
                # TODO: is it necessary to keep only the top_k detections?
                print('clw: inference time use %.3fs, item nums in result:%d' %
                      (time.time() - start, len(result[0])))

                # Walk result[0] in order and take the first entry equal to
                # 0, i.e. the pear; its index selects the matching mask.
                pear_idx = next(
                    (i for i, cls_id in enumerate(result[0]) if cls_id == 0),
                    None)
                if pear_idx is not None:
                    pear_mask = result[3][pear_idx].cpu().numpy()
                    # Extract the outline from the pear mask
                    pear_outline = get_outline_from_mask(pear_mask, w, h)
                    roundness = compute_roundness(pear_outline)
                    result.append(roundness)
        except Exception:
            # Best effort by design: report the problem and still return.
            traceback.print_exc()

        print(
            '======================== clw: detect of python nn end !! ================================'
        )
        print('')
        return result
class YolactInterface(object):
    """Runs YOLACT (base config) on single images and returns COCO-style instances."""

    def __init__(self, model_pth, output_num=5):
        """Load the network on CUDA device 0.

        Args:
            model_pth: Checkpoint passed to ``Yolact.load_weights``.
            output_num: Maximum number of detections returned by ``run_once``.
        """
        self.output_num = output_num
        with torch.no_grad():
            set_cfg("yolact_base_config")
            torch.cuda.set_device(0)
            cudnn.benchmark = True
            cudnn.fastest = True
            torch.set_default_tensor_type('torch.cuda.FloatTensor')
            self.net = Yolact()
            self.net.load_weights(model_pth)
            self.net.eval()
            self.net = self.net.cuda()
        print("load model complete")

    def run_once(self, src):
        """Run inference on one image.

        Args:
            src: Image array; its first two dimensions are height and width.

        Returns:
            ``{"instances": [...]}`` with one entry per kept detection, as
            built by ``build_up_result``.
        """
        self.net.detect.cross_class_nms = True
        self.net.detect.use_fast_nms = True
        cfg.mask_proto_debug = False
        with torch.no_grad():
            frame = torch.Tensor(src).cuda().float()
            batch = FastBaseTransform()(frame.unsqueeze(0))

            # time.clock() no longer exists on Python 3.8+; perf_counter()
            # is the documented replacement for interval timing.
            time_start = time.perf_counter()
            preds = self.net(batch)
            time_elapsed = time.perf_counter() - time_start

            h, w, _ = src.shape
            t = postprocess(
                preds,
                w,
                h,
                visualize_lincomb=False,
                crop_masks=True,
                score_threshold=0.)  # TODO: give a suitable threshold
            torch.cuda.synchronize()
            # TODO: Only `output_num` objects are kept (for test)
            classes, scores, bboxes, masks = [
                x[:self.output_num].cpu().numpy() for x in t
            ]
            print(time_elapsed)
            instances = self.build_up_result(masks.shape[0], classes, bboxes,
                                             masks, scores)
            return {"instances": instances}

    def build_up_result(self, num, classes, bboxes, masks, scores):
        """Convert the first *num* detections into JSON-friendly dicts.

        Args:
            num: Number of detections to convert.
            classes: Per-detection class ids.
            bboxes: Per-detection boxes as (x1, y1, x2, y2).
            masks: Per-detection 2-D masks.
            scores: Per-detection scores.

        Returns:
            A list of dicts with the keys ``category_id``, ``bbox``
            (``{"b": [x, y, width, height]}``), ``segmentation`` (RLE) and
            ``score``.
        """
        instances = []
        for i in range(num):
            bbox = [
                bboxes[i, 0], bboxes[i, 1],
                bboxes[i, 2] - bboxes[i, 0],
                bboxes[i, 3] - bboxes[i, 1]
            ]
            # Round to the nearest 10th to avoid huge file sizes, as COCO suggests
            bbox = [round(float(x) * 10) / 10 for x in bbox]

            # encode segmentation with RLE
            rle = pycocotools.mask.encode(
                np.asfortranarray(masks[i, :, :].astype(np.uint8)))
            # json.dump doesn't like bytes strings
            rle['counts'] = rle['counts'].decode('ascii')

            instances.append({
                # TODO: origin: get_coco_cat(int(category_id))
                'category_id': int(classes[i]),
                'bbox': {
                    "b": bbox
                },
                "segmentation": rle,
                'score': float(scores[i])
            })
        return instances
def main(argv=None):
    """
    Parses the parameters or, if None, sys.argv and starts prediction mode.

    :param argv: the command-line parameters to parse (list of strings)
    :type: argv: list
    """
    # Every command-line option as (flag, add_argument keyword arguments),
    # in the order in which the options are registered.
    option_table = [
        ('--model', dict(required=True, type=str,
                         help='The trained model to use (.pth file).')),
        ('--config', dict(default="external_config",
                          help='The name of the configuration to use.')),
        ('--top_k', dict(default=5, type=int,
                         help='Further restrict the number of predictions (eg objects) to parse')),
        ('--score_threshold', dict(default=0, type=float,
                                   help='Detections with a score under this threshold will not be considered.')),
        ('--fast_nms', dict(action="store_false",
                            help='Whether to use a faster, but not entirely correct version of NMS.')),
        ('--cross_class_nms', dict(action="store_true",
                                   help='Whether compute NMS cross-class or per-class.')),
        ('--prediction_in', dict(default=None, type=str, required=True,
                                 help='The directory in which to look for images for processing.')),
        ('--prediction_out', dict(default=None, type=str, required=True,
                                  help='The directory to store the results in.')),
        ('--prediction_tmp', dict(default=None, type=str, required=False,
                                  help='The directory to store the results in first, before moving them to the actual output directory.')),
        ('--continuous', dict(action="store_true",
                              help='Whether to continuously poll the input directory or exit once all initial images have been processed.')),
        ('--delete_input', dict(action="store_true",
                                help='Whether to delete the input images rather than moving them to the output directory.')),
        ('--output_polygons', dict(action='store_true',
                                   help='Whether to masks are predicted and polygons should be output in the ROIS CSV files',
                                   required=False, default=False)),
        ('--fit_bbox_to_polygon', dict(action='store_true',
                                       help='When outputting polygons whether to fit the bounding box to the polygon',
                                       required=False, default=False)),
        ('--bbox_as_fallback', dict(default=-1.0, type=float,
                                    help='When outputting polygons the bbox can be used as fallback polygon. This happens if the ratio '
                                         'between the surrounding bbox of the polygon and the bbox is smaller than the specified value. '
                                         'Turned off if < 0.',
                                    required=False)),
        ('--mask_threshold', dict(type=float,
                                  help='The threshold (0-1) to use for determining the contour of a mask',
                                  required=False, default=0.1)),
        ('--mask_nth', dict(type=int,
                            help='To speed polygon detection up, use every nth row and column only',
                            required=False, default=1)),
        ('--output_minrect', dict(action='store_true',
                                  help='When outputting polygons whether to store the minimal rectangle around the objects in the CSV files as well',
                                  required=False, default=False)),
        ('--view_margin', dict(default=2, type=int, required=False,
                               help='The number of pixels to use as margin around the masks when determining the polygon')),
        ('--fully_connected', dict(default='high', choices=['high', 'low'], required=False,
                                   help='When determining polygons, whether regions of high or low values should be fully-connected at isthmuses')),
        ('--output_width_height', dict(action='store_true',
                                       help="Whether to output x/y/w/h instead of x0/y0/x1/y1 in the ROI CSV files",
                                       required=False, default=False)),
        ('--scale', dict(type=float,
                         help='The scale factor to apply to the image (0-1) before processing. Output will be in original dimension space.',
                         required=False, default=1.0)),
        ('--debayer', dict(default="", type=str,
                           help='The OpenCV2 debayering method to use, eg "COLOR_BAYER_BG2BGR"',
                           required=False)),
        ('--output_mask_image', dict(action='store_true', default=False,
                                     help="Whether to output a mask image (PNG) when predictions generate masks (independent of outputting polygons)",
                                     required=False)),
    ]

    cli = argparse.ArgumentParser(description='YOLACT Prediction')
    for flag, spec in option_table:
        cli.add_argument(flag, **spec)
    parsed = cli.parse_args(args=argv)

    # Reject option combinations and values that cannot work.
    if parsed.fit_bbox_to_polygon and (parsed.bbox_as_fallback >= 0):
        raise Exception(
            "Options --fit_bbox_to_polygon and --bbox_as_fallback cannot be used together!"
        )
    debayer = parsed.debayer
    if debayer and not debayer.startswith("COLOR_BAYER_"):
        raise Exception(
            "Expected debayering type to start with COLOR_BAYER_, instead got: "
            + str(parsed.debayer))

    with torch.no_grad():
        # initializing cudnn
        print('Initializing cudnn', end='')
        cudnn.fastest = True
        torch.set_default_tensor_type('torch.cuda.FloatTensor')
        print(' Done.')

        # load configuration and model
        print('Loading config %s' % parsed.config, end='')
        set_cfg(parsed.config)
        cfg.mask_proto_debug = False
        print(' Done.')

        print('Loading model: %s' % parsed.model, end='')
        model = Yolact()
        model.load_weights(parsed.model)
        model.eval()
        model = model.cuda()
        model.detect.use_fast_nms = parsed.fast_nms
        model.detect.use_cross_class_nms = parsed.cross_class_nms
        print(' Done.')

        # Options that are handed to predict() under their own name.
        passthrough = (
            'top_k', 'score_threshold', 'delete_input', 'output_polygons',
            'mask_threshold', 'mask_nth', 'output_minrect', 'view_margin',
            'fully_connected', 'fit_bbox_to_polygon', 'output_width_height',
            'bbox_as_fallback', 'scale', 'debayer', 'continuous',
            'output_mask_image',
        )
        predict(model=model,
                input_dir=parsed.prediction_in,
                output_dir=parsed.prediction_out,
                tmp_dir=parsed.prediction_tmp,
                **{name: getattr(parsed, name) for name in passthrough})
class RunYolact(object):
    """
    Class that runs YOLACT.
    source: https://github.com/dbolya/yolact/issues/9
    """

    def __init__(self,
                 trained_model: str,
                 save_json=True,
                 output_dir=None,
                 output_name="detection",
                 output_num=5,
                 cuda_device=1):
        """
        Initialise YOLACT.

        Args:
            trained_model: Checkpoint passed to ``Yolact.load_weights``.
            save_json: Whether results are collected for a JSON file.
            output_dir: Where the JSON file goes; only used when
                ``save_json`` is true. When it is None nothing is collected.
            output_name: Name handed to ``Detections`` for the JSON file.
            output_num: Maximum number of detections kept per image.
            cuda_device: Index of the CUDA device to select (default 1, the
                value that used to be hard-coded).
        """
        # step 0: initialise members
        self.save_json = save_json
        self.detections = None
        self.output_num = output_num

        # step 1: create the Detections collector if a JSON file was requested
        if self.save_json and output_dir is not None:
            self.detections = Detections(output_dir, output_name)

        # step 2: initialise the YOLACT network
        with torch.no_grad():
            set_cfg("yolact_base_config")
            torch.cuda.set_device(cuda_device)
            cudnn.benchmark = True
            cudnn.fastest = True
            torch.set_default_tensor_type('torch.cuda.FloatTensor')
            self.net = Yolact()
            self.net.load_weights(trained_model)
            self.net.eval()
            self.net = self.net.cuda()
        print("load model complete")

    def run_once(self, src, image_name):
        """
        Run prediction on a single image.

        Args:
            src: Image array; its first two dimensions are height and width.
            image_name: Name under which the detections are recorded.

        The detections are added to ``self.detections`` and dumped. When no
        collector was created in ``__init__`` the results are only computed.
        """
        # step 0: preparation
        self.net.detect.cross_class_nms = True
        self.net.detect.use_fast_nms = True
        cfg.mask_proto_debug = False

        # step 1: prediction
        with torch.no_grad():
            frame = torch.Tensor(src).cuda().float()
            batch = FastBaseTransform()(frame.unsqueeze(0))

            # time.clock() no longer exists on Python 3.8+; perf_counter()
            # is the documented replacement for interval timing.
            time_start = time.perf_counter()
            preds = self.net(batch)
            time_elapsed = time.perf_counter() - time_start

            h, w, _ = src.shape
            # NOTICE: no minimum score threshold is applied here
            t = postprocess(
                preds,
                w,
                h,
                visualize_lincomb=False,
                crop_masks=True,
                score_threshold=0.)  # TODO: give a suitable threshold
            torch.cuda.synchronize()
            # TODO: Only `output_num` objects are kept (for test)
            classes, scores, boxes, masks = [
                x[:self.output_num].cpu().numpy() for x in t
            ]
            print(time_elapsed)

            # Without a collector (JSON output disabled or no output_dir)
            # there is nowhere to record the results.
            if self.detections is None:
                return

            # Add every predicted instance to the detections collector
            for i in range(masks.shape[0]):
                self.detections.add_instance(image_name, i, classes[i],
                                             boxes[i, :], masks[i, :, :],
                                             scores[i])

            # step 2: save all predictions
            self.detections.dump_all()
class MattingService:
    """Cut out the best-scoring detection of an image as an alpha channel."""

    def __init__(self,
                 model_path="./weights/yolact_im700_54_800000.pth",
                 use_cuda=False):
        """Load the YOLACT weights.

        Args:
            model_path: Weight file passed to ``Yolact.load_weights``.
            use_cuda: When true the network and input frames are moved to CUDA.
        """
        print('Loading model...', end='')
        self.use_cuda = use_cuda
        self.trained_model = model_path
        self.net = Yolact()
        self.net.load_weights(self.trained_model)
        self.net.eval()
        if self.use_cuda:
            self.net = self.net.cuda()
        self.net.detect.use_fast_nms = True
        self.net.detect.use_cross_class_nms = False
        cfg.mask_proto_debug = False
        print(' Done.')

    def process(self, image, top_k=1, score_threshold=0.6):
        """Run matting on ``image`` and write the result to disk.

        Args:
            image: Either ``"input_path:output_path"`` or a plain input path.
                For a plain path the result goes to ``results/<stem>.png``.
            top_k: Number of best-scoring detections kept.
            score_threshold: Minimum score for a detection to be used.

        Returns:
            The output path (``input:output`` form), the generated file name
            (plain form), or ``None`` when ``image`` is ``None``.
        """
        # TODO Currently we do not support Fast Mask Re-scroing in evalimage, evalimages, and evalvideo
        if image is None:
            # Previously this fell through to an unbound `_image_name`.
            return None

        with torch.no_grad():
            if ':' in image:
                inp, _image_name = image.split(':')
                self._infer_image(self.net, inp, _image_name, top_k,
                                  score_threshold)
            else:
                _image_name = image.split('/')[-1].split('.')[0] + '.png'
                # cv2.imwrite does not create missing directories.
                os.makedirs('results/', exist_ok=True)
                out = os.path.join('results/', _image_name)
                self._infer_image(self.net, image, out, top_k,
                                  score_threshold)
        return _image_name

    def _infer_image(self, net: "Yolact", path, save_path, top_k,
                     score_threshold):
        """Run the network on the file at ``path`` and save or show the result.

        Raises:
            FileNotFoundError: If OpenCV cannot read ``path``.
        """
        raw = cv2.imread(path)
        if raw is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError('Could not read image: %s' % path)

        frame = torch.from_numpy(raw)
        if self.use_cuda:
            frame = frame.cuda()
        frame = frame.float()

        batch = FastBaseTransform()(frame.unsqueeze(0))
        preds = net(batch)
        img_numpy = self.post_process(preds,
                                      frame,
                                      None,
                                      None,
                                      top_k,
                                      score_threshold,
                                      undo_transform=False)

        if save_path is None:
            # Swap the first and third colour channels for matplotlib; the
            # alpha channel stays last.
            img_numpy = img_numpy[:, :, (2, 1, 0, 3)]
            plt.subplot()
            plt.imshow(img_numpy)
            plt.title(path)
            plt.show()
        else:
            cv2.imwrite(save_path, img_numpy)

    @staticmethod
    def post_process(dets_out,
                     img,
                     h,
                     w,
                     top_k=1,
                     score_threshold=0.6,
                     undo_transform=True):
        """Build a 4-channel image whose alpha is the best detection's mask.

        Note: If undo_transform=False then im_h and im_w are allowed to be None.

        Returns:
            A uint8 array with four channels. When nothing passes
            ``score_threshold`` (or no masks are available) the alpha channel
            is whatever ``cv2.cvtColor`` produced.
        """
        if undo_transform:
            img_numpy = undo_image_transformation(img, w, h)
            # NOTE(review): this branch always moves the image to CUDA,
            # regardless of how the service was configured.
            img_gpu = torch.Tensor(img_numpy).cuda()
        else:
            img_gpu = img / 255.0
            h, w, _ = img.shape

        with timer.env('Postprocess'):
            save = cfg.rescore_bbox
            cfg.rescore_bbox = True
            try:
                t = postprocess(dets_out,
                                w,
                                h,
                                visualize_lincomb=False,
                                crop_masks=False,
                                score_threshold=score_threshold)
            finally:
                # Restore the global config even if postprocess raises.
                cfg.rescore_bbox = save

        with timer.env('Copy'):
            idx = t[1].argsort(0, descending=True)[:top_k]
            # Masks stay on the tensor device, so they are not copied here.
            masks = t[3][idx] if cfg.eval_mask_branch else None
            classes, scores, boxes = [x[idx].cpu().numpy() for x in t[:3]]

        num_dets_to_consider = min(top_k, classes.shape[0])
        for j in range(num_dets_to_consider):
            if scores[j] < score_threshold:
                num_dets_to_consider = j
                break

        final_res = (img_gpu * 255).byte().cpu().numpy()
        final_res = cv2.cvtColor(final_res, cv2.COLOR_RGB2RGBA)

        if num_dets_to_consider == 0 or masks is None:
            return final_res

        # After this, mask is of size [num_dets, h, w, 1]
        masks = masks[:num_dets_to_consider, :, :, None]
        _mask = (masks * 255).byte().cpu().numpy()[0]

        # Then assign the mask to the last channel of the image
        final_res[:, :, 3] = _mask.squeeze()
        return final_res
# Weight file used for the mask-IoU rescoring network.
_MASKIOU_WEIGHTS = 'yolact_resnet50_54_800000.pth'

# Score cutoff that used to be hard-coded; it is kept as a lower bound so
# callers relying on it see no change.
_MIN_SCORE_THRESHOLD = 0.15

# Loaded rescoring networks keyed by weight path, so the weights are read from
# disk once instead of on every postprocess() call.
_MASKIOU_NET_CACHE = {}


def _get_maskiou_net(weights_path=_MASKIOU_WEIGHTS):
    """Return the cached rescoring network, loading it on first use."""
    net = _MASKIOU_NET_CACHE.get(weights_path)
    if net is None:
        net = Yolact()
        net.load_weights(weights_path)
        net.eval()
        _MASKIOU_NET_CACHE[weights_path] = net
    return net


def postprocess(det_output,
                w,
                h,
                batch_idx=0,
                interpolation_mode='bilinear',
                visualize_lincomb=False,
                crop_masks=True,
                score_threshold=0):
    """Turn raw network output into per-detection classes, scores, boxes, masks.

    Args:
        det_output: Indexable network output; ``det_output[batch_idx]`` must
            hold a ``'detection'`` entry.
        w: Target image width.
        h: Target image height.
        batch_idx: Which batch element to process.
        interpolation_mode: Mode passed to ``F.interpolate`` for the masks.
        visualize_lincomb: Accepted for interface compatibility; unused here.
        crop_masks: Whether masks are cropped to their boxes before upsampling.
        score_threshold: Minimum raw score. Values below 0.15 are raised to
            0.15, the cutoff that was previously hard-coded.

    Returns:
        ``classes, scores, boxes, masks``. When there are no detections, a
        list of four references to the same empty tensor is returned instead.

    Note:
        The entries of the ``'detection'`` dict other than ``'proto'`` are
        filtered in place.
    """
    dets = det_output[batch_idx]['detection']

    if dets is None:
        print("detections None")
        return [torch.Tensor()] * 4  # Warning, this is 4 copies of the same thing

    # Honour the caller's threshold without dropping below the old cutoff.
    score_threshold = max(score_threshold, _MIN_SCORE_THRESHOLD)
    keep = dets['score'] > score_threshold

    for k in dets:
        if k != 'proto':
            dets[k] = dets[k][keep]

    if dets['score'].size(0) == 0:
        return [torch.Tensor()] * 4

    # Actually extract everything from dets now
    classes = dets['class']
    boxes = dets['box']
    scores = dets['score']
    masks = dets['mask']

    # At this point masks is only the coefficients
    proto_data = dets['proto']
    masks = proto_data @ masks.t()
    masks = activation_func['sigmoid'](masks)

    # Crop masks before upsampling
    if crop_masks:
        masks = crop(masks, boxes)

    # Permute into the correct output shape [num_dets, proto_h, proto_w]
    masks = masks.permute(2, 0, 1).contiguous()

    # Mask-IoU rescoring is applied unconditionally (cfg.use_maskiou is not
    # consulted): every score is multiplied by the predicted mask IoU of the
    # detection's class.
    net = _get_maskiou_net()
    with torch.no_grad():
        maskiou_p = net.maskiou_net(masks.unsqueeze(1))
        maskiou_p = torch.gather(maskiou_p, dim=1,
                                 index=classes.unsqueeze(1)).squeeze(1)
        scores = scores * maskiou_p

    # Scale masks up to the full image
    masks = F.interpolate(masks.unsqueeze(0), (h, w),
                          mode=interpolation_mode,
                          align_corners=False).squeeze(0)

    # Binarize the masks
    masks.gt_(0.5)

    boxes[:, 0], boxes[:, 2] = sanitize_coordinates(boxes[:, 0], boxes[:, 2],
                                                    w, cast=False)
    boxes[:, 1], boxes[:, 3] = sanitize_coordinates(boxes[:, 1], boxes[:, 3],
                                                    h, cast=False)
    boxes = boxes.long()

    # The original fell off the end here and returned None, although both
    # early exits return four tensors.
    return classes, scores, boxes, masks
class YolactEdgeEngine:
    """YolactEdge inference wrapper that keeps only selected vehicle classes."""

    # Class names kept by prep_display; every other detection is discarded.
    TARGET_CLASSES = ('car', 'truck')

    def __init__(self):
        """Configure the arguments, load the model and convert it to TensorRT."""
        # NOTE(review): parse_args is expected to attach `self.args`
        # (including `mask_proto_debug` and `images`, which are read later
        # but never set in this class) -- confirm against its definition.
        parse_args(self)
        self.args.config = 'yolact_edge_mobilenetv2_config'
        set_cfg(self.args.config)
        self.args.trained_model = '/home/ht/catkin_ws/src/instance_segmentation/scripts/weights/yolact_edge_mobilenetv2_124_10000.pth'
        self.args.top_k = 10
        self.args.score_threshold = 0.3
        self.args.trt_batch_size = 3
        self.args.disable_tensorrt = False
        self.args.use_fp16_tensorrt = False
        self.args.use_tensorrt_safe_mode = True
        self.args.cuda = True
        self.args.fast_nms = True
        self.args.display_masks = True
        self.args.display_bboxes = True
        self.args.display_text = True
        self.args.display_scores = True
        self.args.display_linecomb = False
        self.args.fast_eval = False
        self.args.deterministic = False
        self.args.no_crop = False
        self.args.crop = True
        self.args.calib_images = '/home/ht/catkin_ws/src/instance_segmentation/scripts/data/coco/calib_images'

        setup_logger(logging_level=logging.INFO)
        self.logger = logging.getLogger('yolact.eval')

        # Per-device cache of colour tensors used when drawing masks.
        self.color_cache = defaultdict(dict)

        with torch.no_grad():
            cudnn.benchmark = True
            cudnn.fastest = True
            torch.set_default_tensor_type('torch.cuda.FloatTensor')

            self.logger.info('Loading model...')
            self.net = Yolact(training=False)
            if self.args.trained_model is not None:
                self.net.load_weights(self.args.trained_model, args=self.args)
            else:
                self.logger.warning('No weights loaded!')
            self.net.eval()
            self.logger.info('Model loaded.')

            convert_to_tensorrt(self.net,
                                cfg,
                                self.args,
                                transform=BaseTransform())

    def _prepare_net(self):
        """Move the network to CUDA and apply the NMS / debug settings."""
        self.net = self.net.cuda()
        self.net.detect.use_fast_nms = self.args.fast_nms
        cfg.mask_proto_debug = self.args.mask_proto_debug

    @staticmethod
    def _check_not_video_model():
        """Refuse to run a flow-warping (video) model on still images."""
        if cfg.flow.warp_mode != 'none':
            # Raised explicitly so the check survives `python -O`; the
            # exception type matches the former `assert False`.
            raise AssertionError(
                'Evaluating the image with a video-based model.')

    @staticmethod
    def _full_backbone_extras():
        """Return the `extras` dict passed to the network for a full pass."""
        return {
            "backbone": "full",
            "interrupt": False,
            "keep_statistics": False,
            "moving_statistics": None
        }

    def evaluate(self, train_mode=False, train_cfg=None):
        """Evaluate every image of the ``input:output`` pair in ``args.images``.

        ``train_mode`` and ``train_cfg`` are accepted but unused.
        """
        with torch.no_grad():
            self._prepare_net()
            inp, out = self.args.images.split(':')
            self.evalimages(inp, out)

    def evalimages(self, input_folder: str, output_folder: str):
        """Run ``evalimage`` on every readable image in ``input_folder``.

        The output folder is created if needed. ``evalimage`` currently does
        not write anything, so ``out_path`` is only computed and passed on.
        """
        os.makedirs(output_folder, exist_ok=True)

        print()
        for p in Path(input_folder).glob('*'):
            path = str(p)
            name = os.path.basename(path)
            name = '.'.join(name.split('.')[:-1]) + '.jpg'
            out_path = os.path.join(output_folder, name)

            img = cv2.imread(path)
            if img is None:
                # cv2.imread returns None for anything it cannot decode
                # (sub-directories, non-image files); skip those entries.
                continue
            self.evalimage(img, out_path)
        print('Done.')

    def detect(self, img_in, return_imgs=False):
        """Run batched detection on ``img_in``; see ``evalbatch`` for returns."""
        with torch.no_grad():
            self._prepare_net()
            return self.evalbatch(img_in, return_imgs)

    def evalbatch(self, imgs, return_imgs=False):
        """Run the network on a batch of images.

        Args:
            imgs: Sequence of images that ``np.array`` can stack into a batch.
            return_imgs: Also return the rendered image for every input.

        Returns:
            ``allres`` (one ``ImageResult`` per image), or
            ``(imgs_out, allres)`` when ``return_imgs`` is true.
        """
        frame = torch.from_numpy(np.array(imgs)).cuda().float()
        batch = FastBaseTransform()(frame)

        self._check_not_video_model()

        preds = self.net(batch,
                         extras=self._full_backbone_extras())["pred_outs"]

        imgs_out = []
        allres = []
        for i in range(len(imgs)):
            out = self.prep_display(preds,
                                    frame[i],
                                    None,
                                    None,
                                    undo_transform=False,
                                    batch_idx=i,
                                    create_mask=True,
                                    return_imgs=return_imgs)
            if return_imgs:
                img_out, res = out
                imgs_out.append(img_out)
            else:
                res = out
            allres.append(res)

        if return_imgs:
            return imgs_out, allres
        return allres

    def evalimage(self, img, save_path=None):
        """Run the network on one image and return its ``ImageResult``.

        ``save_path`` is accepted but nothing is written to it.
        """
        frame = torch.from_numpy(img).cuda().float()
        batch = FastBaseTransform()(frame.unsqueeze(0))

        self._check_not_video_model()

        preds = self.net(batch,
                         extras=self._full_backbone_extras())["pred_outs"]

        return self.prep_display(preds,
                                 frame,
                                 None,
                                 None,
                                 undo_transform=False,
                                 create_mask=True)

    def prep_display(self,
                     dets_out,
                     img,
                     h,
                     w,
                     undo_transform=True,
                     class_color=False,
                     mask_alpha=0.45,
                     batch_idx=0,
                     create_mask=False,
                     return_imgs=False):
        """Post-process detections, keep target classes, and render them.

        Args:
            dets_out: Network predictions.
            img: Image tensor. With ``undo_transform=False`` it is expected in
                0-255 range and supplies ``h`` and ``w``.
            h, w: Image size; may be ``None`` when ``undo_transform`` is false.
            undo_transform: Whether ``img`` must be converted back with
                ``undo_image_transformation`` first.
            class_color: Colour by class instead of by detection index.
            mask_alpha: Opacity of the mask overlay.
            batch_idx: Batch element to process.
            create_mask: Build a labelled mask image and an ``ImageResult``.
            return_imgs: With ``create_mask``, also return the rendered image.

        Returns:
            * ``create_mask`` false: the rendered uint8 image.
            * ``create_mask`` true, ``return_imgs`` false: an ``ImageResult``.
            * both true: ``(image, ImageResult)``.
        """
        if undo_transform:
            img_numpy = undo_image_transformation(img, w, h)
            img_gpu = torch.Tensor(img_numpy).cuda()
        else:
            img_gpu = img / 255.0
            h, w, _ = img.shape

        with timer.env('Postprocess'):
            t = postprocess(dets_out,
                            w,
                            h,
                            batch_idx,
                            visualize_lincomb=self.args.display_linecomb,
                            crop_masks=self.args.crop,
                            score_threshold=self.args.score_threshold)
            torch.cuda.synchronize()

        top_k = self.args.top_k
        with timer.env('Copy'):
            # Masks are only available when the mask branch is evaluated.
            masks = t[3][:top_k] if cfg.eval_mask_branch else None
            classes, scores, boxes = [
                x[:top_k].cpu().numpy() for x in t[:3]
            ]

        num_dets_to_consider = min(top_k, classes.shape[0])
        for j in range(num_dets_to_consider):
            if scores[j] < self.args.score_threshold:
                num_dets_to_consider = j
                break

        # Keep only detections whose class is one of TARGET_CLASSES.
        class_names = cfg.dataset.class_names
        idx_fil = [
            i for i in range(num_dets_to_consider)
            if class_names[classes[i]] in self.TARGET_CLASSES
        ]
        num_dets_to_consider = len(idx_fil)

        if num_dets_to_consider == 0:
            # No detection found so just output the original image.
            if not create_mask:
                return (img_gpu * 255).byte().cpu().numpy()
            empty = ImageResult(None, None, None,
                                np.zeros((h, w, 1), dtype='uint8'), 0)
            if return_imgs:
                return (img_gpu * 255).byte().cpu().numpy(), empty
            return empty

        # Apply the class filter to every per-detection array. This used to
        # happen only inside the mask-drawing branch, leaving boxes, scores
        # and classes misaligned with the detection count otherwise.
        classes = classes[idx_fil]
        scores = scores[idx_fil]
        boxes = boxes[idx_fil, :]
        if masks is not None:
            # After this, mask is of size [num_dets, h, w, 1]
            masks = masks[idx_fil, :, :, None]

        result = None
        if create_mask:
            # Detection j contributes the label value 10 * (j + 1).
            mask_img = np.zeros((h, w, 1), dtype='uint8')
            if masks is not None:
                for j in range(num_dets_to_consider):
                    mask_img += 10 * (j + 1) * masks[j].cpu().numpy().astype(
                        np.uint8)
            result = ImageResult(classes, scores, boxes, mask_img,
                                 num_dets_to_consider)
            if not return_imgs:
                return result

        # Selects the colour for a particular index and keeps a per-device
        # colour cache.
        def get_color(j, on_gpu=None):
            color_idx = (classes[j] * 5 if class_color else j * 5) % len(COLORS)

            if on_gpu is not None and color_idx in self.color_cache[on_gpu]:
                return self.color_cache[on_gpu][color_idx]

            color = COLORS[color_idx]
            if not undo_transform:
                # The image might come in as RGB or BRG, depending
                color = (color[2], color[1], color[0])
            if on_gpu is not None:
                color = torch.Tensor(color).to(on_gpu).float() / 255.
                self.color_cache[on_gpu][color_idx] = color
            return color

        if self.args.display_masks and masks is not None:
            # Prepare the RGB image for each mask given its colour.
            colors = torch.cat([
                get_color(j, on_gpu=img_gpu.device.index).view(1, 1, 1, 3)
                for j in range(num_dets_to_consider)
            ], dim=0)
            masks_color = masks.repeat(1, 1, 1, 3) * colors * mask_alpha

            # This is 1 everywhere except for 1-mask_alpha where the mask is
            inv_alph_masks = masks * (-mask_alpha) + 1

            # This whole block should be equivalent to:
            #    for j in range(num_dets_to_consider):
            #        img_gpu = img_gpu * inv_alph_masks[j] + masks_color[j]
            masks_color_summand = masks_color[0]
            if num_dets_to_consider > 1:
                inv_alph_cumul = inv_alph_masks[:(num_dets_to_consider -
                                                  1)].cumprod(dim=0)
                masks_color_cumul = masks_color[1:] * inv_alph_cumul
                masks_color_summand += masks_color_cumul.sum(dim=0)

            img_gpu = img_gpu * inv_alph_masks.prod(
                dim=0) + masks_color_summand

        # Then draw the stuff that needs to be done on the CPU.
        # Note, make sure this is a uint8 tensor or opencv will not anti
        # alias text for whatever reason.
        img_numpy = (img_gpu * 255).byte().cpu().numpy()

        if self.args.display_text or self.args.display_bboxes:
            for j in reversed(range(num_dets_to_consider)):
                # Plain ints keep OpenCV's point parsing happy.
                x1, y1, x2, y2 = (int(v) for v in boxes[j, :])
                color = get_color(j)
                score = scores[j]

                if self.args.display_bboxes:
                    cv2.rectangle(img_numpy, (x1, y1), (x2, y2), color, 1)

                if self.args.display_text:
                    _class = cfg.dataset.class_names[classes[j]]
                    text_str = '%s: %.2f' % (
                        _class, score) if self.args.display_scores else _class
                    text_pt = (x1, y1 - 3)
                    text_color = [255, 255, 255]

                    font_face = cv2.FONT_HERSHEY_DUPLEX
                    font_scale = 0.6
                    font_thickness = 1

                    cv2.putText(img_numpy, text_str, text_pt, font_face,
                                font_scale, text_color, font_thickness,
                                cv2.LINE_AA)

        if not create_mask:
            # Same contract as the zero-detection case above.
            return img_numpy
        return img_numpy, result
class SocialDistance:
    """Webcam social-distance monitor built on a YOLACT++ model."""

    # Pixel distances between box corners that classify a pair of people.
    RED_ZONE_DISTANCE = 250
    GREEN_ZONE_DISTANCE = 300

    def __init__(self, id):
        """Open the video source ``id`` and load the model."""
        # self.cap = cv2.VideoCapture(id)
        self.cap = WebcamVideoStream(src=id).start()
        self.width = 1280  # 640
        self.height = 720  # 360
        self.display_lincomb = False
        self.crop = True
        self.score_threshold = 0.15
        self.top_k = 30
        self.display_masks = True
        self.display_fps = False
        self.display_text = True
        self.display_bboxes = True
        self.display_scores = False
        self.fast_nms = True
        self.cross_class_nms = True
        self.config = 'yolact_plus_base_config'

        print('Config specified. Parsed %s from the file name.\n' %
              self.config)
        set_cfg(self.config)

        print('Loading model...', end='')
        self.trained_model = 'weights/yolact_plus_base_54_800000.pth'
        self.model = Yolact()
        self.model.load_weights(self.trained_model)
        self.model.detect.use_fast_nms = self.fast_nms
        self.model.detect.use_cross_class_nms = self.cross_class_nms
        self.model.eval()
        self.model = self.model.to(device, non_blocking=True)
        print(' Done.')

        self.model_path = SavePath.from_str(self.trained_model)

    def prep_display(self,
                     dets_out,
                     img,
                     h,
                     w,
                     undo_transform=True,
                     class_color=False,
                     mask_alpha=0.45,
                     fps_str=''):
        """Render masks, boxes, labels and distance lines onto the frame.

        Note: If undo_transform=False then im_h and im_w are allowed to be None.

        Side effects: updates the module-level ``log`` dict with
        ``total_person``, ``total_person_in_red_zone`` and
        ``total_person_in_green_zone``.

        Returns:
            The rendered frame as a uint8 numpy array.
        """
        lineThickness = 2

        if undo_transform:
            img_numpy = undo_image_transformation(img, w, h)
            img_gpu = torch.Tensor(img_numpy).cuda()
        else:
            img_gpu = img / 255.0
            h, w, _ = img.shape

        with timer.env('Postprocess'):
            save = cfg.rescore_bbox
            cfg.rescore_bbox = True
            try:
                t = postprocess(dets_out,
                                w,
                                h,
                                visualize_lincomb=self.display_lincomb,
                                crop_masks=self.crop,
                                score_threshold=self.score_threshold)
            finally:
                # Restore the global config even if postprocess raises.
                cfg.rescore_bbox = save

        with timer.env('Copy'):
            if cfg.eval_mask_branch:
                # Masks are drawn on the GPU, so don't copy
                masks = t[3][:self.top_k]
            classes, scores, boxes = [
                x[:self.top_k].cpu().detach().numpy() for x in t[:3]
            ]

        num_dets_to_consider = min(self.top_k, classes.shape[0])
        for j in range(num_dets_to_consider):
            if scores[j] < self.score_threshold:
                num_dets_to_consider = j
                break

        # Selects the colour for a particular index and keeps a per-device
        # colour cache in the module-level `color_cache`.
        def get_color(j, on_gpu=None):
            color_idx = (classes[j] * 5 if class_color else j * 5) % len(COLORS)

            if on_gpu is not None and color_idx in color_cache[on_gpu]:
                return color_cache[on_gpu][color_idx]

            color = COLORS[color_idx]
            if not undo_transform:
                # The image might come in as RGB or BRG, depending
                color = (color[2], color[1], color[0])
            if on_gpu is not None:
                color = torch.Tensor(color).to(on_gpu).float() / 255.
                color_cache[on_gpu][color_idx] = color
            return color

        # First, draw the masks on the GPU where we can do it really fast
        if (self.display_masks and cfg.eval_mask_branch
                and num_dets_to_consider > 0):
            # After this, mask is of size [num_dets, h, w, 1]
            masks = masks[:num_dets_to_consider, :, :, None]

            # Prepare the RGB images for each mask given their colour
            colors = torch.cat([
                get_color(j, on_gpu=img_gpu.device.index).view(1, 1, 1, 3)
                for j in range(num_dets_to_consider)
            ], dim=0)
            masks_color = masks.repeat(1, 1, 1, 3) * colors * mask_alpha

            # This is 1 everywhere except for 1-mask_alpha where the mask is
            inv_alph_masks = masks * (-mask_alpha) + 1

            # This whole block should be equivalent to:
            #    for j in range(num_dets_to_consider):
            #        img_gpu = img_gpu * inv_alph_masks[j] + masks_color[j]
            masks_color_summand = masks_color[0]
            if num_dets_to_consider > 1:
                inv_alph_cumul = inv_alph_masks[:(num_dets_to_consider -
                                                  1)].cumprod(dim=0)
                masks_color_cumul = masks_color[1:] * inv_alph_cumul
                masks_color_summand += masks_color_cumul.sum(dim=0)

            img_gpu = img_gpu * inv_alph_masks.prod(
                dim=0) + masks_color_summand

        if self.display_fps:
            # Draw the box for the fps on the GPU
            font_face = cv2.FONT_HERSHEY_DUPLEX
            font_scale = 0.6
            font_thickness = 1

            text_w, text_h = cv2.getTextSize(fps_str, font_face, font_scale,
                                             font_thickness)[0]

            img_gpu[0:text_h + 8, 0:text_w + 8] *= 0.6  # 1 - Box alpha

        # Then draw the stuff that needs to be done on the cpu
        # Note, make sure this is a uint8 tensor or opencv will not anti alias
        # text for whatever reason
        img_numpy = (img_gpu * 255).byte().cpu().detach().numpy()

        if self.display_fps:
            # Draw the text on the CPU
            text_pt = (4, text_h + 2)
            text_color = [255, 255, 255]

            cv2.putText(img_numpy, fps_str, text_pt, font_face, font_scale,
                        text_color, font_thickness, cv2.LINE_AA)

        if num_dets_to_consider == 0:
            return img_numpy

        if self.display_text or self.display_bboxes:
            distance_boxes = []

            def draw_distance(person_boxes):
                """Draw a line between every pair of people that is too close.

                Each box is represented by its (x2, y2) corner. Pairs closer
                than RED_ZONE_DISTANCE get a red line; pairs closer than
                GREEN_ZONE_DISTANCE a green one. The pair counts are stored
                in ``log``.
                """
                red_pairs = 0  # pairs of people at high risk
                green_pairs = 0
                # Only pairs matter, so enumerate 2-combinations directly
                # instead of filtering all 2**n subsets.
                for first, second in combinations(person_boxes, 2):
                    pt_a = (first[2], first[3])
                    pt_b = (second[2], second[3])
                    # Euclidean distance between the two reference corners.
                    dist = np.linalg.norm(np.array(pt_a) - np.array(pt_b))
                    if dist < self.RED_ZONE_DISTANCE:
                        red_pairs += 1
                        cv2.line(img_numpy, pt_a, pt_b, (0, 0, 255),
                                 lineThickness)
                    elif dist < self.GREEN_ZONE_DISTANCE:
                        green_pairs += 1
                        cv2.line(img_numpy, pt_a, pt_b, (0, 255, 0),
                                 lineThickness)
                log["total_person_in_red_zone"] = red_pairs
                log["total_person_in_green_zone"] = green_pairs

            for j in reversed(range(num_dets_to_consider)):
                x1, y1, x2, y2 = boxes[j, :]
                color = get_color(j)
                score = scores[j]

                if self.display_bboxes:
                    cv2.rectangle(img_numpy, (x1, y1), (x2, y2), color, 1)

                if self.display_text:
                    _class = cfg.dataset.class_names[classes[j]]
                    if _class == "person":
                        # NOTE(review): this stores the count of ALL kept
                        # detections, not only persons -- confirm intent.
                        log["total_person"] = num_dets_to_consider
                        distance_boxes.append(boxes[j, :].tolist())
                        # Kept inside the loop so the drawing order relative
                        # to boxes and labels is unchanged.
                        draw_distance(distance_boxes)

                    text_str = '%s: %.2f' % (
                        _class, score) if self.display_scores else _class

                    font_face = cv2.FONT_HERSHEY_DUPLEX
                    font_scale = 0.6
                    font_thickness = 1

                    text_w, text_h = cv2.getTextSize(text_str, font_face,
                                                     font_scale,
                                                     font_thickness)[0]

                    text_pt = (x1, y1 - 3)
                    text_color = [255, 255, 255]

                    cv2.rectangle(img_numpy, (x1, y1),
                                  (x1 + text_w, y1 - text_h - 4), color, -1)
                    cv2.putText(img_numpy, text_str, text_pt, font_face,
                                font_scale, text_color, font_thickness,
                                cv2.LINE_AA)

        return img_numpy

    def main(self):
        """Grab one frame, annotate it, and return it as JPEG bytes.

        The status texts are read from ``log`` before this frame is analysed,
        so the counts shown are those of the previous call.
        """
        # The frame is read synchronously. The former start-then-join helper
        # thread did the same work but left `q.get()` blocked forever when
        # the read or resize raised inside the thread.
        # If you capture the stream using opencv (cv2.VideoCapture()), use:
        # ret, frame = self.cap.read()
        frame = self.cap.read()
        inputs = cv2.resize(frame, (self.width, self.height))

        # Designing the frame with the necessary infos
        title = "Social Distance Monitoring - COVID19"
        total_person = "Total = {}".format(log["total_person"])
        red_zone = "High Risk = {}".format(log["total_person_in_red_zone"])
        green_zone = "Safe Distance = {}".format(
            log["total_person_in_green_zone"])
        notification_bar_thickness = 3

        overlay = inputs.copy()
        background = inputs.copy()
        opacity = 0.4
        cv2.rectangle(overlay, (0, 0), (1280, 100), (255, 255, 255), -1)
        cv2.rectangle(overlay, (0, 615), (400, 720), (255, 255, 255), -1)
        cv2.addWeighted(overlay, opacity, background, 1 - opacity, 0, inputs)

        # Main title
        cv2.putText(inputs, title, (195, 50), cv2.FONT_HERSHEY_DUPLEX, 1,
                    (0, 0, 0), 2, cv2.LINE_AA)
        # Total person
        cv2.putText(inputs, total_person, (50, 640), cv2.FONT_HERSHEY_DUPLEX,
                    0.8, (0, 0, 0), 2, cv2.LINE_AA)
        # Red-zone line and text
        cv2.line(inputs, (15, 660), (40, 660), (0, 0, 255),
                 notification_bar_thickness)
        cv2.putText(inputs, red_zone, (50, 670), cv2.FONT_HERSHEY_DUPLEX, 0.8,
                    (0, 0, 255), 1, cv2.LINE_AA)
        # Green-zone line and text
        cv2.line(inputs, (15, 700), (40, 700), (0, 255, 0),
                 notification_bar_thickness)
        cv2.putText(inputs, green_zone, (50, 710), cv2.FONT_HERSHEY_DUPLEX,
                    0.8, (0, 255, 0), 1, cv2.LINE_AA)

        with torch.no_grad():
            inputs = torch.from_numpy(inputs).cuda().float()
            images = FastBaseTransform()(inputs.unsqueeze(0))
            images = images.to(device)
            preds = self.model(images)
            frame = self.prep_display(preds,
                                      inputs,
                                      None,
                                      None,
                                      undo_transform=False)

        ret, jpeg = cv2.imencode('.jpg', frame)
        torch.cuda.empty_cache()
        # ndarray.tostring() is a deprecated alias removed in NumPy 2.0;
        # tobytes() returns the same bytes.
        return jpeg.tobytes()
torch.set_default_tensor_type('torch.cuda.FloatTensor') else: torch.set_default_tensor_type('torch.FloatTensor') if args.resume and not args.display: with open(args.ap_data_file, 'rb') as f: ap_data = pickle.load(f) calc_map(ap_data) exit() dataset = None print('Loading model...', end='') net = Yolact() net.load_weights(args.trained_model) net.eval() print(' Done.') if args.cuda: net = net.cuda() net.detect.use_fast_nms = args.fast_nms net.detect.use_cross_class_nms = args.cross_class_nms cfg.mask_proto_debug = args.mask_proto_debug scan = Scan(rgb_paths=rgb_paths, depth_paths=depth_paths, pose_paths=pose_paths, cam_intr=cam_intr, mesh_plot=mesh_plot, scannet_data=scannet_data, mask_net=net, args=args, root_path=root_path, use_gpu=use_gpu)
class DOTMask(): def __init__(self, nn, input_device): """ Initialisation function """ print('Loading model...') self.nn = nn if self.nn == 'yolact': print("Selected NN: Yolact") # Yoloact imports sys.path.append('../nn/yolact/') from yolact import Yolact from data import cfg, set_cfg, set_dataset import torch import torch.backends.cudnn as cudnn set_cfg("yolact_resnet50_config") #set_cfg("yolact_resnet50_config") cfg.eval_mask_branch = True cfg.mask_proto_debug = False cfg.rescore_bbox = True self.net = Yolact() self.net.load_weights("../weights/yolact_resnet50_54_800000.pth") #self.net.load_weights("../weights/yolact_resnet50_54_800000.pth") self.net.eval() cudnn.fastest = True torch.set_default_tensor_type('torch.cuda.FloatTensor') self.net = self.net.cuda() elif self.nn == 'yolact++': print("Selected NN: Yolact++") # Yoloact imports sys.path.append('../nn/yolact/') from yolact import Yolact from data import cfg, set_cfg, set_dataset import torch import torch.backends.cudnn as cudnn set_cfg("yolact_plus_resnet50_config") #set_cfg("yolact_resnet50_config") cfg.eval_mask_branch = True cfg.mask_proto_debug = False cfg.rescore_bbox = True self.net = Yolact() self.net.load_weights("../weights/yolact_plus_resnet50_54_800000.pth") #self.net.load_weights("../weights/yolact_resnet50_54_800000.pth") self.net.eval() cudnn.fastest = True torch.set_default_tensor_type('torch.cuda.FloatTensor') self.net = self.net.cuda() elif self.nn == 'yolact_edge': print("Selected NN: Yolact_edge") #Yoloact_edge imports sys.path.append('../nn/yolact_edge') from yolact import Yolact from data import cfg, set_cfg, set_dataset import torch import torch.backends.cudnn as cudnn set_cfg("yolact_edge_resnet50_config") cfg.eval_mask_branch = True cfg.mask_proto_debug = False cfg.rescore_bbox = True self.net = Yolact() self.net.load_weights("../weights/yolact_edge_resnet50_54_800000.pth") self.net.eval() cudnn.fastest = True torch.set_default_tensor_type('torch.cuda.FloatTensor') self.net = 
self.net.cuda() elif self.nn == 'mrcnn': print("Selected NN: Mask-RCNN") # Keras import keras from keras.models import Model from keras import backend as K K.common.set_image_dim_ordering('tf') # Mask-RCNN sys.path.append('../nn/Mask_RCNN/') from mrcnn import config from mrcnn import utils from mrcnn import model as modellib from inference_config import InferenceConfig self.config = InferenceConfig() self.model = modellib.MaskRCNN( mode="inference", model_dir="../weights/",#"../nn/Mask_RCNN/mrcnn/", config=self.config) # Load weights trained on MS-COCO self.model.load_weights("../weights/mask_rcnn_coco.h5", by_name=True) else: print("no nn defined") self.bridge = CvBridge() self._max_inactive_frames = 10 # Maximum nb of frames before destruction self.next_object_id = 0 # ID for next object self.objects_dict = {} # Detected objects dictionary self.var_init = 0 self.cam_pos_qat = np.array([[0.,0.,0.],[0.,0.,0.,1.]]) self.cam_pos = np.array([[0.,0.,0.],[0.,0.,0.]]) self.dilatation = 1 self.score_threshold = 0.1 self.max_number_observation = 5 self.human_threshold = 0.01 self.object_threshold = 0.3 self.iou_threshold = 0.9 self.selected_classes = [0, 56, 67] self.masked_id = [] #if input_device == 'xtion': # self.human_threshold = 0.1 # self.iou_threshold = 0.3 self.depth_image_pub = rospy.Publisher( "/camera/depth_registered/masked_image_raw", Image,queue_size=1) self.dynamic_depth_image_pub = rospy.Publisher( "/camera/depth_registered/dynamic_masked_image_raw", Image,queue_size=1) self.frame = [] self.depth_frame = [] self.msg_header = std_msgs.msg.Header() self.depth_msg_header = std_msgs.msg.Header() # Class names COCO dataset self.class_names = [ 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 
'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'] def get_active(self, val): for key in self.objects_dict: if self.objects_dict[key]["maskID"] == val: return self.objects_dict[key]["activeObject"] return "Key not exist" def class_selection(self, masks_in, class_ids): """ Function for Mask class selection (Selected classes : 1,40,41,42,57) """ if len(masks_in.shape) > 1: masks=copy.deepcopy(masks_in) x = np.zeros([class_ids.shape[0], masks.shape[1], masks.shape[2]]) for l in range(masks.shape[0]): if (class_ids[l] == 0 or class_ids[l] == 39 or class_ids[l] == 56): x[l, :, :] = masks[l, :, :] else: x[l, :, :] = 0 return x else: x = np.zeros([1, 480, 640]) return x def static_masks_selection(self, masks_in, class_ids): """ Function for static Mask class selection """ if len(masks_in.shape) > 1: masks=copy.deepcopy(masks_in) x = np.zeros([masks.shape[0], masks.shape[1], masks.shape[2]]) for i in self.objects_dict: if not np.in1d(i, self.masked_id): if self.objects_dict[i]["activeObject"] == 1 and self.objects_dict[i]["maskID"] < masks.shape[0] and (class_ids[self.objects_dict[i]["maskID"]] == 0 or class_ids[self.objects_dict[i]["maskID"]] == 39 or class_ids[self.objects_dict[i]["maskID"]] == 56): x[self.objects_dict[i]["maskID"], :, :] = masks[self.objects_dict[i]["maskID"], :, :] elif self.objects_dict[i]["activeObject"] == 0 and self.objects_dict[i]["maskID"] < masks.shape[0]: x[self.objects_dict[i]["maskID"], :, :] = 0 else: pass self.masked_id.append(i) return x else: x = 
np.zeros([1, 480, 640]) return x def read_objects_pose(self): for i in self.objects_dict: if self.objects_dict[i]["classID"]==0: object_type = "Person" elif self.objects_dict[i]["classID"]==39: object_type = "Bottle" elif self.objects_dict[i]["classID"]==56: object_type = "Chair" else: object_type = "Nan" try: (self.objects_dict[i]["worldPose"],rot) = listener.lookupTransform('/map',object_type+'_'+str(i), rospy.Time(0)) except (tf.LookupException, tf.ConnectivityException, tf.ExtrapolationException): continue def handle_objects_pose(self): for i in self.objects_dict: if self.objects_dict[i]["classID"]==0 or self.objects_dict[i]["classID"]==39 or self.objects_dict[i]["classID"]==56: if self.objects_dict[i]["classID"]==0: object_type = "Person" elif self.objects_dict[i]["classID"]==39: object_type = "Bottle" elif self.objects_dict[i]["classID"]==56: object_type = "Chair" else: object_type = "Nan" br = tf.TransformBroadcaster() e_pose = self.objects_dict[i]["estimatedPose"] br.sendTransform((e_pose[0], e_pose[1], e_pose[2]), tf.transformations.quaternion_from_euler(0,0,0), rospy.Time.now(), object_type+'_'+str(i), '/map') def iou_centered_centroid(self, rois_old, rois_new, mask_old, mask_new): # intersection_over_union applied on centered centroid img_v = mask_old.shape[0] img_h = mask_old.shape[1] pad_x_old = int((img_v-(rois_old[3]-rois_old[1]))/2) pad_y_old = int((img_h-(rois_old[2]-rois_old[0]))/2) pad_x_new = int((img_v-(rois_new[3]-rois_new[1]))/2) pad_y_new = int((img_h-(rois_new[2]-rois_new[0]))/2) cropped_mask_old = mask_old[rois_old[1]:rois_old[3], rois_old[0]:rois_old[2]] cropped_mask_new = mask_new[rois_new[1]:rois_new[3], rois_new[0]:rois_new[2]] centered_mask_old = add_padding(cropped_mask_old, pad_y_old, pad_x_old, pad_y_old, pad_x_old) centered_mask_new = add_padding(cropped_mask_new, pad_y_new, pad_x_new, pad_y_new, pad_x_new) centered_mask_old_croped = centered_mask_old[1:478, 1:638] centered_mask_new_croped = centered_mask_new[1:478, 1:638] 
intersection = np.logical_and(centered_mask_old_croped, centered_mask_new_croped) union = np.logical_or(centered_mask_old_croped, centered_mask_new_croped) iou = np.sum(intersection) / np.sum(union) return iou def apply_depth_image_masking(self, image_in, masks): """Apply the given mask to the image. """ image = copy.deepcopy(image_in) image_static = copy.deepcopy(image_in) for i in range(masks.shape[0]): is_active = self.get_active(i) mask = masks[i, :, :] mask = ndimage.binary_dilation(mask, iterations=self.dilatation) if is_active == 1: image[:, :] = np.where(mask == 1, 0, image[:, :]) image_static[:, :] = np.where(mask == 1, 0, image[:, :]) else: image[:, :] = np.where(mask == 1, 0, image[:, :]) return image_static, image def mask_dilatation(self, masks): timebefore = time.time() mask=copy.deepcopy(masks) for i in range(mask.shape[0]): mask[i] = ndimage.binary_dilation(mask[i], iterations=self.dilatation) print("Numpy dilation time : ", - (timebefore - time.time())) return mask def mask_dilatation_cv(self, masks): timebefore = time.time() mask=copy.deepcopy(masks) kernel = np.ones((3,3)) for i in range(mask.shape[0]): mask[i] = cv2.dilate(mask[i],kernel, iterations=self.dilatation) print("cv2 dilation time : ", - (timebefore - time.time())) return mask def get_masking_depth(self, image, mask): """Apply the given mask to the image. 
""" x = np.zeros([image.shape[0], image.shape[1]]) y = np.zeros(mask.shape[0]) for i in range(mask.shape[0]): x[:, :] = np.where(mask[i,:,:] != 1, 0, image[:, :]) x[:, :] = np.where( np.isnan(x[:,:]), 0, x[:, :]) if sum(sum((x[:, :]!=0))) == 0: y[i] = 0 else: y[i] = (x[:, :].sum()/sum(sum((x[:, :]!=0)))) return y def add_object(self, centroid, dimensions, mask_id, class_id, mask_old, rois_old): dt = 0.25 try: (transc, rotc) = listener.lookupTransform('/map', self.tf_camera, rospy.Time(0)) except (tf.LookupException, tf.ConnectivityException, tf.ExtrapolationException): transc = np.array([0.,0.,0.]) rotc = np.array([0.,0.,0.,1.]) euler = tf.transformations.euler_from_quaternion(rotc) rot = tf.transformations.euler_matrix(euler[0],euler[1],euler[2]) h_mat = rot h_mat[0:3,3:] = np.array([transc]).T b = h_mat.dot(np.array([[centroid[0],centroid[1],centroid[2],1]]).T)[0:3,:] y = np.array([b[0,0], b[1,0], b[2,0]]) x = [y[0], y[1], y[2], 0, 0, 0] P = np.eye(len(x)) F = np.array([[ 1, 0, 0, dt, 0, 0], [ 0, 1, 0, 0, dt, 0], [ 0, 0, 1, 0, 0, dt], [ 0, 0, 0, 1, 0, 0], [ 0, 0, 0, 0, 1, 0], [ 0, 0, 0, 0, 0, 1]]) H = np.array([[ 0.001, 0, 0, 0, 0, 0], [ 0, 0.001, 0, 0, 0, 0], [ 0, 0, 0.001, 0, 0, 0]]) if class_id == 1: ax = 0.68 ay = 0.68 az = 0.68 else: ax = 1 ay = 1 az = 1 Q = np.array([[((dt**4)/4)*(ax**2), 0.0, 0.0, ((dt**4)/4)*(ax**3), 0.0, 0.0], [0.0, ((dt**4)/4)*(ay**2), 0.0, 0.0, ((dt**4)/4)*(ay**3), 0.0], [0.0, 0.0, ((dt**4)/4)*(az**2), 0.0, 0.0, ((dt**4)/4)*(az**3)], [((dt**4)/4)*(ax**3), 0.0, 0.0, (dt**2)*(ax**2), 0.0, 0.0], [0.0, ((dt**4)/4)*(ay**3), 0.0, 0.0, (dt**2)*(ax**2), 0.0], [0.0, 0.0, ((dt**4)/4)*(az**3), 0.0, 0.0, (dt**2)*(ax**2)]]) R = np.array([[ 0.8, 0, 0], [ 0, 0.8, 0], [ 0, 0, 1.2]]) self.objects_dict.update({self.next_object_id : { "kalmanFilter" : extendedKalmanFilter(x, P, F, H, Q, R), "centroid" : centroid, "dimension" : dimensions, "classID" : class_id, "roisOld" : rois_old, "maskID" : mask_id, "maskOld" : mask_old, "worldPose" : [0,0,0], 
"estimatedVelocity" : [0,0,0], "estimatedPose" : [0,0,0], "inactiveNbFrame" : 0, "activeObject" : 0}}) self.next_object_id = self.next_object_id+1 def delete_object(self, object_id): del self.objects_dict[object_id] def mask_to_centroid(self, rois, mask_depth): current_centroids = {} current_dimensions = {} for i in range(len(rois)): # 3D centroids from depth frame if args.input == 'tum': fx = 525.0 # focal length x fy = 525.0 # focal length y cx = 319.5 # optical center x cy = 239.5 # optical center y elif args.input == 'xtion': # Asus xtion sensor fx = 525 fy = 525 cx = 319.5 cy = 239.5 elif args.input == 'zed': # Zed sensor left img vga fx = 350.113 fy = 350.113 cx = 336.811 cy = 190.357 else: print("No valid input") # Translation from depth pixel to local point if mask_depth[i] == -1: z = 0 else : z = mask_depth[i] y = (((rois[i,3]+rois[i,1])/2) - cy) * z / fy x = (((rois[i,2]+rois[i,0])/2) - cx) * z / fx # Translation from point to world coord current_centroids.update({i:[x, y, z]}) current_dimensions.update({i:[rois[i,3]-rois[i,1], rois[i,2]-rois[i,0]]}) return current_centroids, current_dimensions def live_analysis(self): """ Function for live stream video masking """ bar = [ " Waiting for frame [= ] ", " Waiting for frame [ = ] ", " Waiting for frame [ = ] ", " Waiting for frame [ = ] ", " Waiting for frame [ = ] ", " Waiting for frame [ =] ", " Waiting for frame [ = ] ", " Waiting for frame [ = ] ", " Waiting for frame [ = ] ", " Waiting for frame [ = ] ", ] idx = 0 while not rospy.is_shutdown(): start_time = time.time() self.masked_id = [] current_frame = self.frame current_depth_frame = self.depth_frame if len(current_frame)==0 or len(current_depth_frame)==0 : print(bar[idx % len(bar)], end= "\r") idx = idx +1 time.sleep(0.1) else: nn_start_time = time.time() if self.nn == 'yolact' or self.nn == 'yolact++' or self.nn == 'yolact_edge': frame = torch.from_numpy(current_frame).cuda().float() batch = FastBaseTransform()(frame.unsqueeze(0)) if self.nn == 
'yolact_edge': extras = {"backbone": "full", "interrupt":False, "keep_statistics":False, "moving_statistics":None} preds = self.net(batch.cuda(), extras=extras) preds = preds["pred_outs"] else: preds = self.net(batch.cuda()) nn_pred_time = time.time() h, w, _ = frame.shape b = {} r = {} b['class_ids'], b['scores'], b['rois'], b['masks'] = postprocess(preds, w, h, score_threshold=self.score_threshold) r['class_ids'] = copy.deepcopy(b['class_ids'].cpu().data.numpy()) r['scores'] = copy.deepcopy(b['scores'].cpu().data.numpy()) r['rois'] = copy.deepcopy(b['rois'].cpu().data.numpy()) r['masks'] = copy.deepcopy(b['masks'].cpu().data.numpy()) elif self.nn == 'mrcnn': results = self.model.detect([current_frame],verbose=1) r = results[0] r['masks'] = np.swapaxes(r['masks'],0,2) r['masks'] = np.swapaxes(r['masks'],1,2) for i in range(r['rois'].shape[0]): buff = r['rois'][i] r['rois'][i] = [buff[1],buff[0],buff[3],buff[2]] r['class_ids'] = r['class_ids'] - 1 ''' Deprecated, did not enhance speed j=0 for i in range(len(r['class_ids'])): if not np.in1d(r['class_ids'][j], self.selected_classes): r['class_ids'] = np.delete(r['class_ids'], j) r['scores']= np.delete(r['scores'], j) r['rois']= np.delete(r['rois'], j,axis=0) r['masks']= np.delete(r['masks'], j, axis=0) else: j=j+1 ''' self.number_observation = min(self.max_number_observation, r['class_ids'].shape[0]) for j in range(self.number_observation): if r['scores'][j] < self.score_threshold: self.number_observation = j break r['class_ids'] = r['class_ids'][:self.number_observation] r['scores'] = r['scores'][:self.number_observation] r['rois'] = r['rois'][:self.number_observation] r['masks'] = r['masks'][:self.number_observation] nn_time = time.time() mask_depth = self.get_masking_depth(current_depth_frame, r['masks']) # Read object tf pose self.read_objects_pose() # Read camera tf pose try: (transc, rotc) = listener.lookupTransform(self.tf_camera,'/map', rospy.Time(0)) except (tf.LookupException, tf.ConnectivityException, 
tf.ExtrapolationException): transc = np.array([0.,0.,0.]) rotc = np.array([0.,0.,0.,1.]) euler = tf.transformations.euler_from_quaternion(rotc) rot = tf.transformations.euler_matrix(euler[0],euler[1],euler[2]) h_mat = rot h_mat[0:3,3:] = np.array([transc]).T objects_to_delete = [] # Main filter update and prediction step if len(r['rois']) == 0: for i in self.objects_dict: self.objects_dict[i]["inactiveNbFrame"] = self.objects_dict[i]["inactiveNbFrame"] + 1 if self.objects_dict[i]["inactiveNbFrame"] > self._max_inactive_frames: objects_to_delete.append(i) for i in objects_to_delete: self.delete_object(i) else : current_centroids, current_dimensions = self.mask_to_centroid(r['rois'],mask_depth) if not self.objects_dict: if not len(current_centroids)==0: for i in range(len(current_centroids)): self.add_object(current_centroids[i], current_dimensions[i], i, r['class_ids'][i], r['masks'][i], r['rois'][i]) for i in self.objects_dict: self.objects_dict[i]["kalmanFilter"].prediction() self.objects_dict[i]["kalmanFilter"].update(self.objects_dict[i]["centroid"], h_mat) self.objects_dict[i]["estimatedPose"] = self.objects_dict[i]["kalmanFilter"].x[0:3] self.objects_dict[i]["estimatedVelocity"] = self.objects_dict[i]["kalmanFilter"].x[3:6] else: objects_pose = np.zeros((len(self.objects_dict),3)) objects_ids = np.zeros((len(self.objects_dict))) index = 0 for i in self.objects_dict: objects_pose[index,] = self.objects_dict[i]["centroid"] objects_ids[index] = i index = index + 1 centroids_pose = np.zeros((len(current_centroids),3)) for i in range(len(current_centroids)): centroids_pose[i,] = current_centroids[i] eucledian_dist_pairwise = np.array(cdist(objects_pose, centroids_pose)).flatten() index_sorted = np.argsort(eucledian_dist_pairwise) used_objects = [] used_centroids = [] for index in range(len(eucledian_dist_pairwise)): object_id = int(index_sorted[index] / len(centroids_pose)) centroid_id = index_sorted[index] % len(centroids_pose) if not np.in1d(object_id, 
used_objects) and not np.in1d(centroid_id, used_centroids):# and (eucledian_dist_pairwise[index]<0.5): if self.objects_dict[objects_ids[object_id]]["classID"] == r['class_ids'][centroid_id]: timebefore = time.time() used_objects.append(object_id) used_centroids.append(centroid_id) self.objects_dict[objects_ids[object_id]]["kalmanFilter"].prediction() self.objects_dict[objects_ids[object_id]]["kalmanFilter"].update(current_centroids[centroid_id], h_mat) self.objects_dict[objects_ids[object_id]]["estimatedPose"] = self.objects_dict[objects_ids[object_id]]["kalmanFilter"].x[0:3] self.objects_dict[objects_ids[object_id]]["estimatedVelocity"] = self.objects_dict[objects_ids[object_id]]["kalmanFilter"].x[3:6] if self.objects_dict[objects_ids[object_id]]["classID"] == 0: max_threshold = self.human_threshold else: max_threshold = self.object_threshold if abs(self.objects_dict[objects_ids[object_id]]["estimatedVelocity"][0])>max_threshold or abs(self.objects_dict[objects_ids[object_id]]["estimatedVelocity"][1])>max_threshold or abs(self.objects_dict[objects_ids[object_id]]["estimatedVelocity"][2])>max_threshold: self.objects_dict[objects_ids[object_id]]["activeObject"] = 1 else: self.objects_dict[objects_ids[object_id]]["activeObject"] = 0 if self.objects_dict[objects_ids[object_id]]["classID"] == 0 and self.objects_dict[objects_ids[object_id]]["activeObject"] == 0: iou = self.iou_centered_centroid(self.objects_dict[objects_ids[object_id]]["roisOld"], r['rois'][centroid_id], self.objects_dict[objects_ids[object_id]]["maskOld"],r['masks'][centroid_id]) if iou<self.iou_threshold: self.objects_dict[objects_ids[object_id]]["activeObject"] = 1 else: x=1 self.objects_dict[objects_ids[object_id]]["centroid"] = centroids_pose[centroid_id] self.objects_dict[objects_ids[object_id]]["dimensions"] = current_dimensions[centroid_id] self.objects_dict[objects_ids[object_id]]["inactiveNbFrame"] = 0 self.objects_dict[objects_ids[object_id]]["maskID"] = centroid_id 
self.objects_dict[objects_ids[object_id]]["maskOld"] = r['masks'][centroid_id] self.objects_dict[objects_ids[object_id]]["roisOld"] = r['rois'][centroid_id] if len(centroids_pose) < len(objects_pose): for index in range(len(eucledian_dist_pairwise)): object_id = int(index_sorted[index] / len(objects_pose)) if not np.in1d(object_id, used_objects): self.objects_dict[objects_ids[object_id]]["inactiveNbFrame"] += 1 self.objects_dict[objects_ids[object_id]]["activeObject"] = 0 if self.objects_dict[objects_ids[object_id]]["inactiveNbFrame"] >= self._max_inactive_frames: self.delete_object(objects_ids[object_id]) used_objects.append(object_id) else: self.objects_dict[objects_ids[object_id]]["kalmanFilter"].prediction() self.objects_dict[objects_ids[object_id]]["estimatedPose"] = self.objects_dict[objects_ids[object_id]]["kalmanFilter"].x_[0:3] self.objects_dict[objects_ids[object_id]]["estimatedVelocity"] = self.objects_dict[objects_ids[object_id]]["kalmanFilter"].x_[3:6] elif len(centroids_pose) > len(objects_pose): buff_id = self.next_object_id for index in range(len(eucledian_dist_pairwise)): centroid_id = index_sorted[index] % len(centroids_pose) if not np.in1d(centroid_id, used_centroids): self.add_object(current_centroids[centroid_id], current_dimensions[centroid_id], centroid_id, r['class_ids'][centroid_id], r['masks'][centroid_id], r['rois'][centroid_id]) self.objects_dict[buff_id]["kalmanFilter"].prediction() self.objects_dict[buff_id]["kalmanFilter"].update(current_centroids[centroid_id], h_mat) self.objects_dict[buff_id]["estimatedPose"] = self.objects_dict[buff_id]["kalmanFilter"].x[0:3] self.objects_dict[buff_id]["estimatedVelocity"] = self.objects_dict[buff_id]["kalmanFilter"].x[3:6] buff_id = buff_id + 1 kalman_time = time.time() # Write objects filter pose to tf self.handle_objects_pose() result_dynamic_depth_image, result_depth_image = self.apply_depth_image_masking(current_depth_frame, r['masks']) DDITS = Image() DDITS = 
self.bridge.cv2_to_imgmsg(result_dynamic_depth_image,'32FC1') DDITS.header = self.depth_msg_header self.dynamic_depth_image_pub.publish(DDITS) DITS = Image() DITS = self.bridge.cv2_to_imgmsg(result_depth_image,'32FC1') DITS.header = self.depth_msg_header self.depth_image_pub.publish(DITS) print_time = time.time() #print(" NN pred time: ", format(nn_pred_time - nn_start_time, '.3f'),", NN post time: ", format(nn_time - nn_pred_time, '.3f'),", NN time: ", format(nn_time - start_time, '.3f'), ", Kalman time: ", format(kalman_time - nn_time, '.3f'), #", Print time: ", format(print_time - kalman_time, '.3f'), ", Total time: ", format(time.time() - start_time, '.3f'), #", FPS :", format(1/(time.time() - start_time), '.2f'), end="\r") def image_callback(self, msg): self.msg_header = msg.header self.frame = self.bridge.imgmsg_to_cv2(msg, "bgr8") def depth_image_callback(self, msg): self.depth_msg_header = msg.header #32FC1 for asus xtion #8UC1 forkicect self.depth_frame = self.bridge.imgmsg_to_cv2(msg, "32FC1")