def get_val_opn_pt_loader(args):
    cfg, _ = data.get_val_cfg_transform(args)
    transform = transforms.video_transform_val(crop_size=args.opn_crop_size)
    if args.opn_flow_folder is None:
        dataset = opn_datasets.OPNVideoDataset(
            cfg['root'], cfg['val_metafile'], transform=transform)
    else:
        dataset = opn_datasets.MotionAwareOPNVideoDataset(
            cfg['root'], args.opn_flow_folder, cfg['val_metafile'],
            transform=transform)
    return data.get_val_dataloader(args, dataset)

def get_val_rot_pt_loader(args):
    cfg, _ = data.get_val_cfg_transform(args)
    if not args.rot_real_prep:
        transform = transforms.video_3DRot_transform_val()
    else:
        transform = transforms.video_3DRot_transform_val((136, 136))
    dataset = datasets.RotVideoDataset(cfg['root'],
                                       cfg['val_metafile'],
                                       num_frames=16,
                                       frame_interval=1,
                                       transform=transform,
                                       fps_conversion_factor=FPS_FACTOR)
    return data.get_val_dataloader(args, dataset)

def benchmark(args):
    app_start = time.time()
    prewarm_iters = 50
    bench_secs = 10

    # grab a single batch; the benchmark re-runs the model on it repeatedly
    val_dataloader = get_val_dataloader(args)
    for nbatch, (tensor_nchw, img_id, (image_heights, image_widths), _,
                 _) in enumerate(val_dataloader):
        tensor_nchw, image_heights, image_widths = [
            t.to('cuda') for t in (tensor_nchw, image_heights, image_widths)
        ]
        break
    batch_dim = tensor_nchw.size(0)

    # background thread that plots GPU utilization / throughput
    update_fps, plot_thread = gpuplot.bg_plot(
        num_gpus=args.num_devices,
        sample_hz=5,
    )
    max_times = 10
    batch_times = []
    last_update = time.time()
    update_period = 0.5

    if args.runtime == 'pytorch':
        print(
            f'Runtime: Pytorch\nPrecision: {args.precision}\nBatch-dim: {args.batch_dim}\nTop-k: {args.topk}'
        )
        model = SSD300(args.topk, args.detection_threshold,
                       args.iou_threshold, args.precision, args.batch_dim,
                       args.trt_path)
        model = model.eval().to('cuda')
        if args.precision == 'fp16':
            tensor_nchw, image_heights, image_widths = [
                t.to(torch.float16)
                for t in (tensor_nchw, image_heights, image_widths)
            ]
        plot_thread.start()

        print('Prewarming model')
        for i in range(prewarm_iters):
            model(tensor_nchw, image_heights, image_widths)
            batch_times = (batch_times + [time.time()])[-max_times:]

        print(f'Beginning benchmark (+{time.time() - app_start:.1f})...')
        start_time = time.time()
        bench_iters = 0
        while True:
            model(tensor_nchw, image_heights, image_widths)
            batch_times = (batch_times + [time.time()])[-max_times:]
            if batch_times[-1] > last_update + update_period and len(
                    batch_times) > 1:
                last_update = batch_times[-1]
                # throughput over the sliding window of recent batches
                update_fps(args.batch_dim * (len(batch_times) - 1) /
                           (batch_times[-1] - batch_times[0]))
            bench_iters += 1
            if time.time() > start_time + bench_secs:
                break
    elif args.runtime == 'trt':
        print(
            f'Runtime: TensorRT\nPrecision: {args.precision}\nBatch-dim: {args.batch_dim}\nTop-k: {args.topk}'
        )
        np_to_torch_type = {
            np.float32: torch.float32,
            np.float16: torch.float16,
            np.int32: torch.int32,
            np.int64: torch.int64,
        }

        # one CUDA context per device; pop them off this thread's context
        # stack so each inference call can push its target context explicitly
        devices = [cuda.Device(i) for i in range(args.num_devices)]
        contexts = [devices[i].make_context() for i in range(args.num_devices)]
        for _ in devices:
            cuda.Context.pop()

        context_detail = []
        for device_id, context in enumerate(contexts):
            context.push()
            try:
                torch_device = torch.device('cuda', device_id)
                streams = [
                    cuda.Stream() for i in range(args.num_streams_per_device)
                ]
                tensors = {
                    name: t.clone().to(torch_device)
                    for name, t in [('tensor_nchw', tensor_nchw),
                                    ('image_heights', image_heights),
                                    ('image_widths', image_widths)]
                }
                model = SSD300(args.topk, args.detection_threshold,
                               args.iou_threshold, args.precision,
                               args.batch_dim, args.trt_path)
                # per-stream output buffers and binding pointers; inputs are
                # shared across streams since the same batch is reused
                trt_outputs = [[] for i in range(args.num_streams_per_device)]
                bindings = [[] for i in range(args.num_streams_per_device)]
                for stream_id in range(args.num_streams_per_device):
                    for binding_name in model.trt_engine:
                        shape = model.trt_engine.get_binding_shape(
                            binding_name)
                        dtype = trt.nptype(
                            model.trt_engine.get_binding_dtype(binding_name))
                        torch_type = np_to_torch_type[dtype]
                        if model.trt_engine.binding_is_input(binding_name):
                            torch_input = tensors[binding_name].to(torch_type)
                            bindings[stream_id].append(
                                int(torch_input.data_ptr()))
                        else:
                            torch_output = torch.zeros(tuple(shape),
                                                       dtype=torch_type,
                                                       device=torch_device)
                            trt_outputs[stream_id].append(torch_output)
                            bindings[stream_id].append(
                                int(torch_output.data_ptr()))
                context_detail.append({
                    'streams': streams,
                    'model': model,
                    'trt_outputs': trt_outputs,
                    'bindings': bindings
                })
            finally:
                context.pop()

        # bounded queue gives backpressure: submission can only run this far
        # ahead of completion
        event_queue = queue.Queue(args.num_devices *
                                  args.num_streams_per_device)

        def sync_streams(update_fps, batch_times, max_times, last_update,
                         update_period):
            # drain completion events in submission order, updating the
            # throughput plot as batches retire
            while True:
                ce = event_queue.get()
                if ce is None:
                    break
                context, e = ce
                context.push()
                e.synchronize()
                context.pop()
                batch_times = (batch_times + [time.time()])[-max_times:]
                if batch_times[-1] > last_update + update_period and len(
                        batch_times) > 1:
                    last_update = batch_times[-1]
                    update_fps(args.batch_dim * (len(batch_times) - 1) /
                               (batch_times[-1] - batch_times[0]))

        sync_thread = threading.Thread(target=sync_streams,
                                       args=(update_fps, batch_times,
                                             max_times, last_update,
                                             update_period))
        sync_thread.start()
        plot_thread.start()

        # for benchmarking purposes, just run the model repeatedly on the
        # initial batch of inputs, round-robining across contexts and streams
        bench_iters = 0
        while True:
            if bench_iters == 0:
                print('Prewarming model')
            elif bench_iters == prewarm_iters:
                print(
                    f'Beginning benchmark (+{time.time() - app_start:.1f})...'
                )
                start_time = time.time()
            elif (bench_iters > prewarm_iters
                  and time.time() > start_time + bench_secs):
                break
            context_id = bench_iters % len(context_detail)
            context = contexts[context_id]
            context.push()
            try:
                detail = context_detail[context_id]
                stream_id = (bench_iters - context_id) % len(
                    detail['streams'])
                stream = detail['streams'][stream_id]
                detail['model'].trt_context.execute_async_v2(
                    bindings=detail['bindings'][stream_id],
                    stream_handle=stream.handle)
                # Event.record() returns the event itself, so the queue
                # receives (context, event) pairs
                event = cuda.Event(cuda.event_flags.DISABLE_TIMING)
                event_queue.put((context, event.record(stream)))
            finally:
                context.pop()
            bench_iters += 1
        event_queue.put(None)
        sync_thread.join()  # wait until every queued event has synchronized
        bench_iters -= prewarm_iters

    total_time = time.time() - start_time
    update_fps(None)
    plot_thread.join()
    print(
        f'{bench_iters} batches, {bench_iters * batch_dim} images, {total_time:.2f} seconds total'
    )
    print(f'{1000 * total_time / (bench_iters * batch_dim):.1f} ms per image')
    print(f'{(bench_iters * batch_dim) / total_time:.1f} FPS')
    if args.output_path:
        with open(args.output_path, 'w') as fout:
            print(
                f'{bench_iters} batches, {bench_iters * batch_dim} images, {total_time:.2f} seconds total',
                file=fout)
            print(
                f'{1000 * total_time / (bench_iters * batch_dim):.1f} ms per image',
                file=fout)
            print(f'{(bench_iters * batch_dim) / total_time:.1f} FPS',
                  file=fout)

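# Hedged sketch of a CLI front end for benchmark(). The repo's real argument
# parser is defined elsewhere; the flag set below is inferred from the `args`
# attributes benchmark() reads, and every default value is an assumption.
import argparse


def make_benchmark_parser():
    p = argparse.ArgumentParser('SSD300 inference benchmark (sketch)')
    p.add_argument('--runtime', choices=('pytorch', 'trt'), default='pytorch')
    p.add_argument('--precision', choices=('fp32', 'fp16'), default='fp32')
    p.add_argument('--batch-dim', type=int, default=32)
    p.add_argument('--topk', type=int, default=200)
    p.add_argument('--detection-threshold', type=float, default=0.05)
    p.add_argument('--iou-threshold', type=float, default=0.5)
    p.add_argument('--trt-path', default=None)  # serialized engine path
    p.add_argument('--num-devices', type=int, default=1)
    p.add_argument('--num-streams-per-device', type=int, default=1)
    p.add_argument('--output-path', default=None)
    return p


# example: benchmark(make_benchmark_parser().parse_args())
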
def build_onnx(args):
    device = torch.device('cpu')

    # trace a single batch through the model to export the graph
    val_dataloader = get_val_dataloader(args)
    for nbatch, (X, img_id, img_size, _, _) in enumerate(val_dataloader):
        inputs = X, img_size[0], img_size[1]
        break
    model = SSD300(args.topk, args.detection_threshold, args.iou_threshold,
                   'fp32', args.batch_dim, None,
                   onnx_export=True).to(device).eval()
    onnx_buf = io.BytesIO()
    torch.onnx.export(model,
                      inputs,
                      onnx_buf,
                      input_names=('tensor_nchw', 'image_heights',
                                   'image_widths'),
                      output_names=('bboxes', 'probs'),
                      opset_version=11,
                      export_params=True)
    onnx_buf.seek(0)
    onnx_module = shape_inference.infer_shapes(onnx.load(onnx_buf))

    # replace the exported graph outputs with the outputs the NMS plugin
    # will produce
    while len(onnx_module.graph.output):
        onnx_module.graph.output.remove(onnx_module.graph.output[0])
    onnx_module.graph.output.extend([
        helper.make_tensor_value_info('num_detections', TensorProto.INT32,
                                      [-1]),
        helper.make_tensor_value_info('nms_bboxes', TensorProto.FLOAT,
                                      [-1, -1, -1]),
        helper.make_tensor_value_info('nms_probs', TensorProto.FLOAT,
                                      [-1, -1]),
        helper.make_tensor_value_info('nms_classes', TensorProto.FLOAT,
                                      [-1, -1]),
    ])

    # graft TensorRT's batched-NMS plugin onto the graph
    graph = gs.import_onnx(onnx_module)
    attrs = {
        'shareLocation': False,
        'numClasses': 80,
        'backgroundLabelId': -1,
        'topK': args.topk,  # per-class, pre NMS
        'keepTopK': args.topk,  # across-classes, per image
        'scoreThreshold': args.detection_threshold,
        'iouThreshold': args.iou_threshold,
        'isNormalized': False,
        'clipBoxes': False,
    }
    ts = graph.tensors()
    nms_layer = graph.layer(op='BatchedNMSDynamic_TRT',
                            attrs=attrs,
                            inputs=[ts['bboxes'], ts['probs']],
                            outputs=[
                                ts['num_detections'], ts['nms_bboxes'],
                                ts['nms_probs'], ts['nms_classes']
                            ])
    graph.cleanup()
    graph.toposort()
    onnx_module = gs.export_onnx(graph)
    onnx_path = os.path.splitext(args.trt_path)[0] + '.onnx'
    print('saving ONNX model to', onnx_path)
    onnx.save(onnx_module, onnx_path)
    return onnx_module

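# Hedged sketch (not the repo's actual build step): one way to compile the
# ONNX file emitted by build_onnx() into the serialized engine that
# SSD300(..., trt_path) loads. Assumes TensorRT 7-era APIs, consistent with
# the get_binding_shape()/binding_is_input() calls used in benchmark() above;
# init_libnvinfer_plugins() registers the BatchedNMSDynamic_TRT plugin.
def build_trt_engine_sketch(onnx_path, trt_path, fp16=True):
    logger = trt.Logger(trt.Logger.WARNING)
    trt.init_libnvinfer_plugins(logger, '')
    explicit_batch = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    with trt.Builder(logger) as builder, \
            builder.create_network(explicit_batch) as network, \
            trt.OnnxParser(network, logger) as parser:
        with open(onnx_path, 'rb') as f:
            if not parser.parse(f.read()):
                for i in range(parser.num_errors):
                    print(parser.get_error(i))
                raise RuntimeError('failed to parse ONNX model')
        config = builder.create_builder_config()
        config.max_workspace_size = 1 << 30  # 1 GiB of builder scratch space
        if fp16:
            config.set_flag(trt.BuilderFlag.FP16)
        engine = builder.build_engine(network, config)
        with open(trt_path, 'wb') as f:
            f.write(engine.serialize())
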
def eval_coco(args):
    device = torch.device(args.device)
    model = SSD300(args.topk, args.detection_threshold, args.iou_threshold,
                   args.precision, args.batch_dim,
                   args.trt_path).to(device).eval()
    dataloader = get_val_dataloader(args)
    inv_map = {v: k for k, v in dataloader.dataset.label_map.items()}
    coco_ground_truth = get_coco_ground_truth(args)
    results = None
    start = time.time()
    for nbatch, (X, img_id, img_size, _, _) in enumerate(dataloader):
        print('Inference batch: {}/{}'.format(nbatch, len(dataloader)),
              end='\r')
        with torch.no_grad():
            batch_dim = X.size(0)
            if args.precision == 'fp16':
                X = X.to(torch.float16)
            X = X.to(device)
            image_heights, image_widths = [i.to(device) for i in img_size]
            # pad a short final batch up to the fixed batch dimension by
            # repeating its last image
            if batch_dim < args.batch_dim:
                num_pad = args.batch_dim - batch_dim
                X = torch.cat([X, X[-1].expand(num_pad, *X[-1].size())],
                              dim=0)
                image_heights = torch.cat(
                    [image_heights, image_heights[-1].repeat(num_pad)], dim=0)
                image_widths = torch.cat(
                    [image_widths, image_widths[-1].repeat(num_pad)], dim=0)
            bboxes, probs, class_indexes, image_indexes = model.forward_coco(
                X, image_heights, image_widths)

            # filter out pad results
            small_batch_filter = image_indexes < batch_dim
            bboxes = bboxes[small_batch_filter]
            probs = probs[small_batch_filter]
            class_indexes = class_indexes[small_batch_filter]
            image_indexes = image_indexes[small_batch_filter]

            mapped_labels = class_indexes.to('cpu')
            mapped_labels.apply_(lambda i: inv_map[i])
            image_ids = img_id[image_indexes]
            # one row per detection: [image_id, bbox(4), score, class], the
            # layout pycocotools' loadRes expects
            batch_results = torch.cat([
                image_ids.cpu().unsqueeze(-1),
                bboxes.cpu(),
                probs.cpu().unsqueeze(-1),
                mapped_labels.unsqueeze(-1)
            ],
                                      dim=1)
            if results is not None:
                results = torch.cat([results, batch_results], dim=0)
            else:
                results = batch_results
    print()
    print(f'DONE (t={time.time() - start:.2f}).')

    results = results.numpy().astype(np.float32)
    coco_detections = coco_ground_truth.loadRes(results)
    E = COCOeval(coco_ground_truth, coco_detections, iouType='bbox')
    E.evaluate()
    E.accumulate()
    # optionally redirect the summary table to a file
    stdout = sys.stdout
    try:
        if args.output_path:
            sys.stdout = open(args.output_path, 'w')
        E.summarize()
    finally:
        if args.output_path:
            sys.stdout.close()
            sys.stdout = stdout
    print('mAP: {:.5f}'.format(E.stats[0]))

def __init__(self, args):
    super().__init__()
    self.batch_dim = args.batch_dim
    self.dataloader = iter(get_val_dataloader(args))
    self.current_batch = None  # for ref-counting
    self.cache_path = 'calibration.cache'

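# Hedged sketch, not the repo's code: the remaining methods TensorRT expects
# from an INT8 calibrator, assuming the class whose __init__ appears above
# subclasses trt.IInt8EntropyCalibrator2 (its __init__ fields suggest so).
# Only the trt method names are API-defined; the bodies are assumptions, and
# `import os` / `import tensorrt as trt` are assumed at module scope.
class Int8CalibratorSketch(trt.IInt8EntropyCalibrator2):
    # ... __init__ as shown above ...

    def get_batch_size(self):
        return self.batch_dim

    def get_batch(self, names):
        try:
            X, _, _, _, _ = next(self.dataloader)
        except StopIteration:
            return None  # tells TensorRT the calibration data is exhausted
        # hold a reference so the device buffer outlives this call
        self.current_batch = X.contiguous().to('cuda')
        return [int(self.current_batch.data_ptr())]

    def read_calibration_cache(self):
        if os.path.exists(self.cache_path):
            with open(self.cache_path, 'rb') as f:
                return f.read()

    def write_calibration_cache(self, cache):
        with open(self.cache_path, 'wb') as f:
            f.write(cache)
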