def test_class_torch():
    """
    Run class-aware NMS three different ways on the same random boxes.

    The strategies are (1) manually grouping the boxes by class and calling
    ``torch_nms`` once per group, (2) passing ``classes`` to ``torch_nms``,
    and (3) calling ``nh.util.non_max_supression`` with ``classes``. The
    number of kept boxes for each strategy and the set differences between
    the first two are printed; nothing is asserted.

    The data is placed on the GPU when CUDA is available and on the CPU
    otherwise, so the check no longer crashes on CPU-only hosts.

    Note:
        ``torch_nms`` is not imported here; it is expected to be available
        in the enclosing module namespace.
    """
    import numpy as np
    import torch
    import netharn as nh
    import ubelt as ub

    thresh = .5
    num = 500

    # Previously the device was hard-coded to 'cuda', which raises on
    # machines without a GPU. Fall back to the CPU in that case.
    has_cuda = torch.cuda.is_available()
    device = 'cuda' if has_cuda else 'cpu'
    # Keep exercising the compiled GPU backend when possible, otherwise let
    # non_max_supression choose its default implementation.
    np_impl = 'gpu' if has_cuda else 'auto'

    rng = nh.util.ensure_rng(0)
    cpu_boxes = nh.util.Boxes.random(num, scale=400.0, rng=rng,
                                     format='tlbr', tensor=True)
    cpu_tlbr = cpu_boxes.to_tlbr().data
    # make all scores unique to ensure comparability
    cpu_scores = torch.Tensor(np.linspace(0, 1, len(cpu_tlbr)))
    cpu_cls = torch.LongTensor(rng.randint(0, 10, len(cpu_tlbr)))

    tlbr = cpu_tlbr.to(device)
    scores = cpu_scores.to(device)
    classes = cpu_cls.to(device)

    # Strategy 1: group indices by class and suppress within each group
    keep1 = []
    groups = ub.group_items(range(len(classes)), classes.cpu().numpy())
    for idxs in groups.values():
        cls_tlbr = tlbr[idxs]
        cls_scores = scores[idxs]
        cls_keep = torch_nms(cls_tlbr, cls_scores, thresh=thresh, bias=0)
        keep1.extend(list(ub.compress(idxs, cls_keep.cpu().numpy())))
    keep1 = sorted(keep1)

    # Strategy 2: let torch_nms handle the classes itself
    keep_ = torch_nms(tlbr, scores, classes=classes, thresh=thresh, bias=0)
    keep2 = np.where(keep_.cpu().numpy())[0].tolist()

    # Strategy 3: the numpy-facing entry point
    keep3 = nh.util.non_max_supression(tlbr.cpu().numpy(),
                                       scores.cpu().numpy(),
                                       classes=classes.cpu().numpy(),
                                       thresh=thresh, bias=0, impl=np_impl)

    print(len(keep1))
    print(len(keep2))
    print(len(keep3))

    print(set(keep1) - set(keep2))
    print(set(keep2) - set(keep1))
def _benchmark():
    """
    Benchmark the available NMS implementations against each other.

    For every problem size a set of random boxes is generated, each backend
    is timed with ``ub.Timerit``, the kept indices are checked against the
    IoU threshold, and the pairwise jaccard overlap between the results of
    the different backends is printed. The timings are plotted at the end.

    GPU backends are only measured when CUDA is available; the torch CPU
    backend is only measured when it is not. The lightnet comparison is
    skipped with a notice when lightnet cannot be imported.

    CommandLine:
        python -m netharn.util.nms.torch_nms _benchmark --show

    SeeAlso:
        PJR Darknet NonMax supression
        https://github.com/pjreddie/darknet/blob/master/src/box.c

        Lightnet NMS
        https://gitlab.com/EAVISE/lightnet/blob/master/lightnet/data/transform/_postprocess.py#L116
    """
    import torch
    import numpy as np
    import netharn as nh
    from netharn.util.nms.torch_nms import torch_nms
    from netharn.util import non_max_supression
    import ubelt as ub
    import itertools as it

    N = 100
    bestof = 10

    ydata = ub.ddict(list)

    # max number of boxes yolo will spit out at a time
    max_boxes = 19 * 19 * 5

    xdata = [
        10, 20, 40, 80, 100, 200, 300, 400, 500, 600, 700, 1000, 1500,
        max_boxes
    ]
    # The full sweep above is deliberately overridden by a short one.
    xdata = [10, 100, 500]

    rng = nh.util.ensure_rng(0)

    thresh = 0.5

    # None of the following depends on the problem size, so it is set up
    # once instead of on every iteration.
    gpu = torch.device('cuda', 0)
    measure_gpu = torch.cuda.is_available()
    measure_cpu = not measure_gpu

    NonMaxSupression = None
    if measure_gpu:
        try:
            from lightnet.data.transform._postprocess import NonMaxSupression
        except ImportError:
            # lightnet is only needed for the comparison timings
            print('lightnet is not installed; skipping lightnet timings')

    def _as_numpy(data):
        # CUDA tensors must be copied to the host explicitly; relying on an
        # implicit conversion inside numpy raises on current PyTorch.
        if torch.is_tensor(data):
            return data.detach().cpu().numpy()
        return np.asarray(data)

    def _ln_output_to_keep(ln_output, ln_boxes):
        # Map each row returned by lightnet back to its index in ln_boxes
        ln_output = _as_numpy(ln_output)
        ln_boxes = _as_numpy(ln_boxes)
        keep = []
        for row in ln_output:
            # Find the index that we kept
            idxs = np.where(np.all(np.isclose(ln_boxes, row), axis=1))[0]
            assert len(idxs) == 1
            keep.append(idxs[0])
        assert np.all(np.isclose(ln_boxes[keep], ln_output))
        return keep

    for num in xdata:
        print('\n\n---- number of boxes = {} ----\n'.format(num))

        outputs = {}

        # Build random test boxes and scores
        cpu_boxes = nh.util.Boxes.random(num, scale=10.0, rng=rng,
                                         format='tlbr', tensor=True)
        cpu_tlbr = cpu_boxes.to_tlbr().data
        # make all scores unique to ensure comparability
        cpu_scores = torch.Tensor(np.linspace(0, 1, len(cpu_tlbr)))
        cpu_cls = torch.LongTensor(rng.randint(0, 10, len(cpu_tlbr)))

        # Format boxes in lightnet format
        cpu_ln_boxes = torch.cat([
            cpu_boxes.to_cxywh().data,
            cpu_scores[:, None],
            cpu_cls.float()[:, None]
        ], dim=-1)

        # Move boxes to numpy
        np_tlbr = cpu_tlbr.numpy()
        np_scores = cpu_scores.numpy()

        if measure_gpu:
            # Move boxes to the GPU
            gpu_tlbr = cpu_tlbr.to(gpu)
            gpu_scores = cpu_scores.to(gpu)
            gpu_ln_boxes = cpu_ln_boxes.to(gpu)

            t1 = ub.Timerit(N, bestof=bestof, label='torch(gpu)')
            for timer in t1:
                with timer:
                    keep = torch_nms(gpu_tlbr, gpu_scores, thresh=thresh)
                    # wait for the kernels so the timing covers the work
                    torch.cuda.synchronize()
            ydata[t1.label].append(t1.min())
            outputs[t1.label] = np.where(keep.cpu().numpy())[0]

            t1 = ub.Timerit(N, bestof=bestof, label='cython(gpu)')
            for timer in t1:
                with timer:
                    keep = non_max_supression(np_tlbr, np_scores,
                                              thresh=thresh, impl='gpu')
                    torch.cuda.synchronize()
            ydata[t1.label].append(t1.min())
            outputs[t1.label] = sorted(keep)

            if NonMaxSupression is not None:
                t1 = ub.Timerit(N, bestof=bestof, label='lightnet-slow(gpu)')
                for timer in t1:
                    with timer:
                        ln_output = NonMaxSupression._nms(
                            gpu_ln_boxes, nms_thresh=thresh,
                            class_nms=False, fast=False)
                        torch.cuda.synchronize()
                # convert lightnet NMS output to keep for consistency
                keep = _ln_output_to_keep(ln_output, gpu_ln_boxes)
                ydata[t1.label].append(t1.min())
                outputs[t1.label] = sorted(keep)

                # The "fast" lightnet variant is intentionally disabled
                if False:
                    t1 = ub.Timerit(N, bestof=bestof,
                                    label='lightnet-fast(gpu)')
                    for timer in t1:
                        with timer:
                            ln_output = NonMaxSupression._nms(
                                gpu_ln_boxes, nms_thresh=thresh,
                                class_nms=False, fast=True)
                            torch.cuda.synchronize()
                    # convert lightnet NMS output to keep for consistency
                    keep = _ln_output_to_keep(ln_output, gpu_ln_boxes)
                    ydata[t1.label].append(t1.min())
                    outputs[t1.label] = sorted(keep)

        if measure_cpu:
            t1 = ub.Timerit(N, bestof=bestof, label='torch(cpu)')
            for timer in t1:
                with timer:
                    keep = torch_nms(cpu_tlbr, cpu_scores, thresh=thresh)
            ydata[t1.label].append(t1.min())
            outputs[t1.label] = np.where(keep.cpu().numpy())[0]

        t1 = ub.Timerit(N, bestof=bestof, label='cython(cpu)')
        for timer in t1:
            with timer:
                keep = non_max_supression(np_tlbr, np_scores,
                                          thresh=thresh, impl='cpu')
        ydata[t1.label].append(t1.min())
        outputs[t1.label] = sorted(keep)

        t1 = ub.Timerit(N, bestof=bestof, label='numpy(cpu)')
        for timer in t1:
            with timer:
                keep = non_max_supression(np_tlbr, np_scores,
                                          thresh=thresh, impl='py')
        ydata[t1.label].append(t1.min())
        outputs[t1.label] = sorted(keep)

        # Check that all kept boxes do not have more than `threshold` ious
        for key, idxs in outputs.items():
            ious = nh.util.box_ious(np_tlbr[idxs], np_tlbr[idxs])
            # ignore the diagonal (self overlap) and the mirrored half
            max_iou = (np.tril(ious) - np.eye(len(ious))).max()
            if max_iou > thresh:
                print('{} produced a bad result with max_iou={}'.format(
                    key, max_iou))

        # Check result consistency:
        print('\nResult stats:')
        for key in sorted(outputs.keys()):
            print(' * {:<20}: num={}'.format(key, len(outputs[key])))

        print('\nResult overlaps (method1, method2: jaccard):')
        datas = []
        for k1, k2 in it.combinations(sorted(outputs.keys()), 2):
            idxs1 = set(outputs[k1])
            idxs2 = set(outputs[k2])
            jaccard = len(idxs1 & idxs2) / len(idxs1 | idxs2)
            datas.append((k1, k2, jaccard))
        # most similar pairs first
        datas = sorted(datas, key=lambda x: -x[2])
        for k1, k2, jaccard in datas:
            print(' * {:<20}, {:<20}: {:0.4f}'.format(k1, k2, jaccard))

    nh.util.mplutil.autompl()
    nh.util.mplutil.multi_plot(xdata, ydata, xlabel='num boxes',
                               ylabel='seconds')
    nh.util.show_if_requested()
def _benchmark():
    """
    Time each NMS backend over a sweep of box counts and plot the results.

    For every box count the torch backend is timed on the CPU (and on the
    GPU when CUDA is available), followed by the numpy-facing backends of
    ``non_max_supression`` ('py', 'cpu', and 'gpu' when CUDA is available).
    After timing, the kept boxes of every backend are checked against the
    IoU threshold and the pairwise jaccard overlap of the kept index sets
    is printed.
    """
    import torch
    import numpy as np
    import netharn as nh
    from netharn.util.nms.torch_nms import torch_nms
    from netharn.util import non_max_supression
    import ubelt as ub
    import itertools as it

    num_iters = 100
    best_of = 10
    iou_thresh = 0.5

    timings = ub.ddict(list)
    sizes = [
        10, 20, 40, 80, 100, 200, 300, 400, 500, 600, 700, 1000, 1500, 2000
    ]

    rng = nh.util.ensure_rng(0)

    for num_boxes in sizes:

        # label -> indices kept by that backend (insertion order matters
        # for the order of the consistency report below)
        kept_by_method = {}

        # Build random test boxes and scores
        cpu_boxes = nh.util.Boxes.random(num_boxes, scale=10.0, rng=rng,
                                         format='tlbr', tensor=True).data
        cpu_scores = torch.Tensor(rng.rand(len(cpu_boxes)))

        clock = ub.Timerit(num_iters, bestof=best_of, label='torch(cpu)')
        for timer in clock:
            with timer:
                flags = torch_nms(cpu_boxes, cpu_scores, thresh=iou_thresh)
        timings[clock.label].append(clock.min())
        kept_by_method[clock.label] = np.where(flags.cpu().numpy())[0]

        has_cuda = torch.cuda.is_available()
        if has_cuda:
            # Move boxes to the GPU
            cuda_boxes = cpu_boxes.cuda()
            cuda_scores = cpu_scores.cuda()

            clock = ub.Timerit(num_iters, bestof=best_of, label='torch(gpu)')
            for timer in clock:
                with timer:
                    flags = torch_nms(cuda_boxes, cuda_scores,
                                      thresh=iou_thresh)
                    torch.cuda.synchronize()
            timings[clock.label].append(clock.min())
            kept_by_method[clock.label] = np.where(flags.cpu().numpy())[0]

        # Move boxes to numpy
        np_boxes = cpu_boxes.cpu().numpy()
        np_scores = cpu_scores.cpu().numpy()

        # The numpy-facing backends only differ in label and impl name
        numpy_backends = [('numpy(cpu)', 'py'), ('cython(cpu)', 'cpu')]
        if has_cuda:
            numpy_backends.append(('cython(gpu)', 'gpu'))

        for label, impl in numpy_backends:
            clock = ub.Timerit(num_iters, bestof=best_of, label=label)
            for timer in clock:
                with timer:
                    kept = non_max_supression(np_boxes, np_scores,
                                              thresh=iou_thresh, impl=impl)
            timings[clock.label].append(clock.min())
            kept_by_method[clock.label] = sorted(kept)

        # Check that all kept boxes do not have more than `threshold` ious
        for method, kept_idxs in kept_by_method.items():
            selected = np_boxes[kept_idxs]
            ious = nh.util.box_ious(selected, selected)
            worst = (np.tril(ious) - np.eye(len(ious))).max()
            if worst > iou_thresh:
                print('{} produced a bad result with max_iou={}'.format(
                    method, worst))

        # Check result consistency:
        print('Result consistency:')
        for name_a, name_b in it.combinations(kept_by_method.keys(), 2):
            set_a = set(kept_by_method[name_a])
            set_b = set(kept_by_method[name_b])
            overlap = len(set_a & set_b) / len(set_a | set_b)
            print('{}, {}: {}'.format(name_a, name_b, overlap))

    nh.util.mplutil.qtensure()
    nh.util.mplutil.multi_plot(sizes, timings, xlabel='num boxes',
                               ylabel='seconds')
def _nms(self, cxywh_score_cls, nms_mode=4):
    """
    Non maximum suppression.

    Source:
        https://www.pyimagesearch.com/2015/02/16/faster-non-maximum-suppression-python/

    Args:
        cxywh_score_cls (tensor): Bounding boxes and scores from
            get_detections. Assumes columns 0:4 are cx, cy, w, h, Column 4
            is confidence, and column 5 is class id.

        nms_mode (int): selects the backend:
            0 - ``torch_nms``, given the class id column
            1 - ``util.non_max_supression`` on numpy copies, classes ignored
            2 - ``util.non_max_supression`` on numpy copies, with classes
            3 - ``util.non_max_supression`` with ``impl='torch'``, with
                classes
            4 - ``torch_nms``, classes ignored (default)

    Return:
        (tensor): Pruned boxes. An empty input is returned unchanged.

    Raises:
        KeyError: if ``nms_mode`` is not one of the values listed above.

    CommandLine:
        python -m netharn.models.yolo2.light_postproc GetBoundingBoxes._nms --profile

    Examples:
        >>> import torch
        >>> torch.random.manual_seed(0)
        >>> anchors = np.array([(1.3221, 1.73145), (3.19275, 4.00944), (5.05587, 8.09892), (9.47112, 4.84053), (11.2364, 10.0071)])
        >>> self = GetBoundingBoxes(anchors=anchors, num_classes=20, conf_thresh=.01, nms_thresh=0.5)
        >>> output = torch.randn(8, 5, 5 + 20, 9, 9)
        >>> boxes_ = self._get_boxes(output.data)
        >>> boxes = torch.Tensor(boxes_[0])
        >>> ans0 = self._nms(boxes, nms_mode=0)
        >>> ans1 = self._nms(boxes, nms_mode=1)
        >>> ans2 = self._nms(boxes, nms_mode=2)

    Ignore:
        >>> from netharn import util
        >>> scores = boxes[..., 4:5]
        >>> classes = boxes[..., 5:6]
        >>> cxywh = util.Boxes(boxes[..., 0:4], 'cxywh')
        >>> tlbr = cxywh.to_tlbr()
        >>> util.non_max_supression(tlbr.data.numpy(), scores.numpy().ravel(), self.nms_thresh)

    Benchmark:
        boxes = torch.Tensor(boxes_[0])

        import ubelt
        for timer in ubelt.Timerit(100, bestof=10, label='nms0+cpu'):
            with timer:
                self._nms(boxes, nms_mode=0)

        for timer in ubelt.Timerit(100, bestof=10, label='nms1+cpu'):
            with timer:
                self._nms(boxes, nms_mode=1)

        boxes = boxes.to()

        import ubelt
        for timer in ubelt.Timerit(100, bestof=10, label='nms0+gpu'):
            with timer:
                self._nms(boxes, nms_mode=0)

        for timer in ubelt.Timerit(100, bestof=10, label='nms1+gpu'):
            with timer:
                self._nms(boxes, nms_mode=1)
    """
    if cxywh_score_cls.numel() == 0:
        # nothing to suppress
        return cxywh_score_cls

    a = cxywh_score_cls[:, :2]
    b = cxywh_score_cls[:, 2:4]
    # convert to tlbr
    tlbr_tensor = torch.cat([a - b / 2, a + b / 2], 1)
    scores = cxywh_score_cls[:, 4]

    if nms_mode == 0:
        from netharn.util.nms.torch_nms import torch_nms
        cls_tensor = cxywh_score_cls[:, 5]
        keep = torch_nms(tlbr_tensor, scores, classes=cls_tensor,
                         thresh=self.nms_thresh, bias=0)
        # torch_nms output is used directly to index the rows
        return cxywh_score_cls[keep]
    elif nms_mode == 1:
        # Dont group by classes, just NMS
        tlbr_np = tlbr_tensor.cpu().numpy().astype(np.float32)
        scores_np = scores.cpu().numpy().astype(np.float32)
        keep = util.non_max_supression(tlbr_np, scores_np, self.nms_thresh,
                                       bias=0)
        keep = sorted(keep)
    elif nms_mode == 2:
        # Group and use NMS
        tlbr_np = tlbr_tensor.cpu().numpy().astype(np.float32)
        scores_np = scores.cpu().numpy().astype(np.float32)
        # `np.int` was only an alias of the builtin `int` and has been
        # removed from NumPy; the builtin yields the same dtype.
        classes_np = cxywh_score_cls[:, 5].cpu().numpy().astype(int)
        keep = util.non_max_supression(tlbr_np, scores_np, self.nms_thresh,
                                       classes=classes_np, bias=0)
        keep = sorted(keep)
    elif nms_mode == 3:
        # Group and use NMS
        classes_np = cxywh_score_cls[:, 5].cpu().numpy().astype(int)
        keep = util.non_max_supression(tlbr_tensor, scores, self.nms_thresh,
                                       classes=classes_np, bias=0,
                                       impl='torch')
        keep = sorted(keep)
    elif nms_mode == 4:
        # Dont group, but use torch
        from netharn.util.nms.torch_nms import torch_nms
        keep = torch_nms(tlbr_tensor, scores, thresh=self.nms_thresh, bias=0)
        return cxywh_score_cls[keep]
    else:
        raise KeyError(nms_mode)
    # modes 1-3 produce a sorted list of integer row indices
    return cxywh_score_cls[torch.LongTensor(keep)]
def non_max_supression(tlbr, scores, thresh, bias=0.0, classes=None,
                       impl='auto'):
    """
    Non-Maximum Suppression

    Dispatches to one of several NMS backends and returns the indices of
    the boxes that survive suppression.

    Args:
        tlbr (ndarray): Nx4 boxes in tlbr format
        scores (ndarray): score for each bbox
        thresh (float): iou threshold
        bias (float): bias for iou computation, either 0 or 1
        classes (ndarray or None): integer classes. If specified NMS is
            done on a per-class basis and the kept indices of all classes
            are concatenated.
        impl (str): name of the implementation. 'auto' resolves to the
            module default, 'py' and 'torch' are handled directly, and any
            other name (e.g. 'cpu' or 'gpu') is looked up in the table of
            registered implementations.

    Returns:
        list or ndarray: indices into ``tlbr`` of the boxes to keep. An
            empty list is returned when there are no input boxes.

    CommandLine:
        python ~/code/netharn/netharn/util/nms/nms_core.py nms
        python ~/code/netharn/netharn/util/nms/nms_core.py nms:0
        python ~/code/netharn/netharn/util/nms/nms_core.py nms:1

    References:
        https://github.com/facebookresearch/Detectron/blob/master/detectron/utils/cython_nms.pyx
        https://www.pyimagesearch.com/2015/02/16/faster-non-maximum-suppression-python/
        https://github.com/bharatsingh430/soft-nms/blob/master/lib/nms/cpu_nms.pyx <- TODO

    Example:
        >>> dets = np.array([
        >>>     [0, 0, 100, 100],
        >>>     [100, 100, 10, 10],
        >>>     [10, 10, 100, 100],
        >>>     [50, 50, 100, 100],
        >>> ], dtype=np.float32)
        >>> scores = np.array([.1, .5, .9, .1])
        >>> thresh = .5
        >>> keep = non_max_supression(dets, scores, thresh, impl='py')
        >>> print('keep = {!r}'.format(keep))
        keep = [2, 1, 3]

    Example:
        >>> import ubelt as ub
        >>> dets = np.array([
        >>>     [0, 0, 100, 100],
        >>>     [100, 100, 10, 10],
        >>>     [10, 10, 100, 100],
        >>>     [50, 50, 100, 100],
        >>>     [100, 100, 150, 101],
        >>>     [120, 100, 180, 101],
        >>>     [150, 100, 200, 101],
        >>> ], dtype=np.float32)
        >>> scores = np.linspace(0, 1, len(dets))
        >>> thresh = .2
        >>> solutions = {}
        >>> for impl in _impls:
        >>>     solutions[impl] = sorted(non_max_supression(dets, scores, thresh, impl=impl))
        >>> print('solutions = {}'.format(ub.repr2(solutions, nl=1)))
        >>> assert ub.allsame(solutions.values())
    """
    # No boxes means nothing can be kept
    if tlbr.shape[0] == 0:
        return []

    if impl == 'auto':
        impl = _automode

    if classes is not None:
        # Run NMS independently inside each class and map the per-class
        # results back to indices into the full input.
        selected = []
        class_to_idxs = ub.group_items(range(len(classes)), classes)
        for member_idxs in class_to_idxs.values():
            local_keep = non_max_supression(tlbr[member_idxs],
                                            scores[member_idxs],
                                            thresh=thresh, bias=bias,
                                            impl=impl)
            selected.extend(list(ub.take(member_idxs, local_keep)))
        return selected

    if impl == 'py':
        return py_nms.py_nms(tlbr, scores, thresh, bias=float(bias))

    if impl == 'torch':
        # Accept either tensors or array-likes
        if not torch.is_tensor(tlbr):
            tlbr = torch.Tensor(tlbr)
            scores = torch.Tensor(scores)
        mask = torch_nms.torch_nms(tlbr, scores, thresh=thresh,
                                   bias=float(bias))
        # convert the boolean mask into integer indices
        return np.where(mask.cpu().numpy())[0]

    # TODO: it would be nice to be able to pass torch tensors here
    backend = _impls[impl]
    boxes32 = tlbr.astype(np.float32)
    scores32 = scores.astype(np.float32)
    if impl != 'gpu':
        return backend(boxes32, scores32, thresh, bias=float(bias))

    # HACK: we should parameterize which device is used
    device_id = torch.cuda.current_device()
    return backend(boxes32, scores32, thresh, bias=float(bias),
                   device_id=device_id)