Example no. 1
def get_val_opn_pt_loader(args):
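    """Build the validation dataloader for the OPN (order-prediction) task.

    The motion-aware dataset variant is used when a precomputed optical-flow
    folder is supplied via args.opn_flow_folder.
    """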
    cfg, _ = data.get_val_cfg_transform(args)
    transform = transforms.video_transform_val(crop_size=args.opn_crop_size)

    if args.opn_flow_folder is None:
        dataset = opn_datasets.OPNVideoDataset(
                cfg['root'], cfg['val_metafile'], 
                transform=transform)
    else:
        dataset = opn_datasets.MotionAwareOPNVideoDataset(
                cfg['root'], args.opn_flow_folder, 
                cfg['val_metafile'], 
                transform=transform)
    return data.get_val_dataloader(args, dataset)
Example no. 2
def get_val_rot_pt_loader(args):
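    """Build the validation dataloader for the 3D-rotation task.

    args.rot_real_prep switches to a 136x136 variant of the validation
    transform.
    """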
    cfg, _ = data.get_val_cfg_transform(args)
    if not args.rot_real_prep:
        transform = transforms.video_3DRot_transform_val()
    else:
        transform = transforms.video_3DRot_transform_val((136, 136))

    dataset = datasets.RotVideoDataset(cfg['root'],
                                       cfg['val_metafile'],
                                       num_frames=16,
                                       frame_interval=1,
                                       transform=transform,
                                       fps_conversion_factor=FPS_FACTOR)
    return data.get_val_dataloader(args, dataset)
def benchmark(args):
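    """Benchmark SSD300 inference throughput on a single cached batch.

    Supports a plain PyTorch runtime and a multi-device, multi-stream
    TensorRT runtime; reports total time, per-image latency, and FPS.
    """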
    app_start = time.time()

    prewarm_iters = 50
    bench_secs = 10

    val_dataloader = get_val_dataloader(args)

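    # pull one batch from the loader; the benchmark reuses it so that data
    # loading does not affect the measurement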
    for nbatch, (tensor_nchw, img_id, (image_heights, image_widths), _,
                 _) in enumerate(val_dataloader):
        tensor_nchw, image_heights, image_widths = [
            t.to('cuda') for t in (tensor_nchw, image_heights, image_widths)
        ]
        break

    batch_dim = tensor_nchw.size(0)

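    # update_fps feeds throughput samples to a background GPU-utilization plot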
    update_fps, plot_thread = gpuplot.bg_plot(
        num_gpus=args.num_devices,
        sample_hz=5,
    )

    # throughput is estimated over a sliding window of recent batch timestamps
    max_times = 10
    batch_times = []
    last_update = time.time()
    update_period = 0.5  # seconds between throughput updates to the plot

    if args.runtime == 'pytorch':
        print(
            f'Runtime: PyTorch\nPrecision: {args.precision}\nBatch-dim: {args.batch_dim}\nTop-k: {args.topk}'
        )
        model = SSD300(args.topk, args.detection_threshold, args.iou_threshold,
                       args.precision, args.batch_dim, args.trt_path)
        model = model.eval().to('cuda')

        if args.precision == 'fp16':
            tensor_nchw, image_heights, image_widths = [
                t.to(torch.float16)
                for t in (tensor_nchw, image_heights, image_widths)
            ]

        plot_thread.start()

        print('Prewarming model')
        for i in range(prewarm_iters):
            model(tensor_nchw, image_heights, image_widths)
            batch_times = (batch_times + [time.time()])[-max_times:]

        print(f'Beginning benchmark (+{time.time() - app_start:.1f})...')
        start_time = time.time()

        bench_iters = 0
        while True:
            model(tensor_nchw, image_heights, image_widths)
            batch_times = (batch_times + [time.time()])[-max_times:]
            if (batch_times[-1] > last_update + update_period
                    and len(batch_times) > 1):
                last_update = batch_times[-1]
                update_fps(args.batch_dim * (len(batch_times) - 1) /
                           (batch_times[-1] - batch_times[0]))
            bench_iters += 1
            if time.time() > start_time + bench_secs:
                break

    elif args.runtime == 'trt':
        print(
            f'Runtime: TensorRT\nPrecision: {args.precision}\nBatch-dim: {args.batch_dim}\nTop-k: {args.topk}'
        )
        np_to_torch_type = {
            np.float32: torch.float32,
            np.float16: torch.float16,
            np.int32: torch.int32,
            np.int64: torch.int64,
        }

        devices = [cuda.Device(i) for i in range(args.num_devices)]
        contexts = [devices[i].make_context() for i in range(args.num_devices)]

        # make_context() leaves each new context pushed on this thread's
        # context stack; pop them all so every use below is an explicit
        # push/pop pair
        for _ in contexts:
            cuda.Context.pop()

        context_detail = []

        for device_id, context in enumerate(contexts):
            context.push()
            try:
                torch_device = torch.device('cuda', device_id)
                streams = [
                    cuda.Stream() for i in range(args.num_streams_per_device)
                ]

                # per-device copies of the cached input batch
                tensors = {
                    'tensor_nchw': tensor_nchw.clone().to(torch_device),
                    'image_heights': image_heights.clone().to(torch_device),
                    'image_widths': image_widths.clone().to(torch_device),
                }

                model = SSD300(args.topk, args.detection_threshold,
                               args.iou_threshold, args.precision,
                               args.batch_dim, args.trt_path)

                # per-stream lists of output tensors and raw device-pointer
                # bindings for execute_async_v2
                trt_outputs = [[] for _ in range(args.num_streams_per_device)]
                bindings = [[] for _ in range(args.num_streams_per_device)]

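                # for each stream, walk the engine's bindings: inputs reuse
                # the device pointers of the cached tensors, outputs get
                # zeroed buffers of the engine-declared shape and dtype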
                for stream_id in range(args.num_streams_per_device):
                    for binding_name in model.trt_engine:
                        shape = model.trt_engine.get_binding_shape(
                            binding_name)
                        dtype = trt.nptype(
                            model.trt_engine.get_binding_dtype(binding_name))
                        torch_type = np_to_torch_type[dtype]

                        if model.trt_engine.binding_is_input(binding_name):
                            torch_input = tensors[binding_name].to(torch_type)
                            bindings[stream_id].append(
                                int(torch_input.data_ptr()))
                        else:
                            torch_output = torch.zeros(tuple(shape),
                                                       dtype=torch_type,
                                                       device=torch_device)
                            trt_outputs[stream_id].append(torch_output)
                            bindings[stream_id].append(
                                int(torch_output.data_ptr()))

                context_detail.append({
                    'streams': streams,
                    'model': model,
                    'trt_outputs': trt_outputs,
                    'bindings': bindings
                })

            finally:
                context.pop()

        event_queue = queue.Queue(args.num_devices *
                                  args.num_streams_per_device)

        def sync_streams(update_fps, batch_times, max_times, last_update,
                         update_period):
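            """Drain completed-batch events from the queue, synchronize on
            each, and periodically report windowed throughput to the plot."""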
            while True:
                ce = event_queue.get()
                if ce is None:
                    break
                else:
                    context, e = ce
                    context.push()
                    e.synchronize()
                    context.pop()

                    batch_times = (batch_times + [time.time()])[-max_times:]
                    if (batch_times[-1] > last_update + update_period
                            and len(batch_times) > 1):
                        last_update = batch_times[-1]
                        update_fps(args.batch_dim * (len(batch_times) - 1) /
                                   (batch_times[-1] - batch_times[0]))

        sync_thread = threading.Thread(target=sync_streams,
                                       args=(update_fps, batch_times,
                                             max_times, last_update,
                                             update_period))
        sync_thread.start()

        plot_thread.start()

        # for benchmarking purposes, just run model repeatedly on initial batch of inputs
        bench_iters = 0
        while True:
            if bench_iters == 0:
                print('Prewarming model')
            elif bench_iters == prewarm_iters:
                print(
                    f'Beginning benchmark (+{time.time() - app_start:.1f})...')
                start_time = time.time()
            elif (bench_iters > prewarm_iters
                  and time.time() > start_time + bench_secs):
                break

            context_id = bench_iters % len(context_detail)
            context = contexts[context_id]
            context.push()
            try:
                detail = context_detail[context_id]
                # round-robin across streams: the stream index advances once
                # per full pass over the contexts
                stream_id = (bench_iters // len(contexts)) % len(
                    detail['streams'])
                stream = detail['streams'][stream_id]
                detail['model'].trt_context.execute_async_v2(
                    bindings=detail['bindings'][stream_id],
                    stream_handle=stream.handle)
                event = cuda.Event(cuda.event_flags.DISABLE_TIMING)
                event_queue.put((context, event.record(stream)))
            finally:
                context.pop()

            bench_iters += 1

        event_queue.put(None)  # sentinel: tell the sync thread to exit
        sync_thread.join()
        bench_iters -= prewarm_iters

    total_time = time.time() - start_time

    update_fps(None)  # a None sample signals the plotting thread to stop
    plot_thread.join()

    summary = (
        f'{bench_iters} batches, {bench_iters * batch_dim} images, '
        f'{total_time:.2f} seconds total\n'
        f'{1000 * total_time / (bench_iters * batch_dim):.1f} ms per image\n'
        f'{(bench_iters * batch_dim) / total_time:.1f} FPS')
    print(summary)

    if args.output_path:
        with open(args.output_path, 'w') as fout:
            print(summary, file=fout)
def build_onnx(args):
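    """Export SSD300 to ONNX and splice TensorRT's BatchedNMSDynamic_TRT
    plugin node in place of the raw bbox/prob outputs."""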
    device = torch.device('cpu')
    val_dataloader = get_val_dataloader(args)

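    # a single real batch provides example inputs for the ONNX trace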
    for nbatch, (X, img_id, img_size, _, _) in enumerate(val_dataloader):
        inputs = X, img_size[0], img_size[1]
        break

    model = SSD300(args.topk,
                   args.detection_threshold,
                   args.iou_threshold,
                   'fp32',
                   args.batch_dim,
                   None,
                   onnx_export=True).to(device).eval()

    onnx_buf = io.BytesIO()
    torch.onnx.export(model,
                      inputs,
                      onnx_buf,
                      input_names=('tensor_nchw', 'image_heights',
                                   'image_widths'),
                      output_names=('bboxes', 'probs'),
                      opset_version=11,
                      export_params=True)
    onnx_buf.seek(0)
    onnx_module = shape_inference.infer_shapes(onnx.load(onnx_buf))

    # drop the exporter's raw outputs; the graph will instead expose the four
    # outputs produced by the BatchedNMSDynamic_TRT plugin added below
    while len(onnx_module.graph.output):
        onnx_module.graph.output.remove(onnx_module.graph.output[0])
    onnx_module.graph.output.extend([
        helper.make_tensor_value_info('num_detections', TensorProto.INT32,
                                      [-1]),
        helper.make_tensor_value_info('nms_bboxes', TensorProto.FLOAT,
                                      [-1, -1, -1]),
        helper.make_tensor_value_info('nms_probs', TensorProto.FLOAT,
                                      [-1, -1]),
        helper.make_tensor_value_info('nms_classes', TensorProto.FLOAT,
                                      [-1, -1]),
    ])

    graph = gs.import_onnx(onnx_module)

    attrs = {
        'shareLocation': False,
        'numClasses': 80,
        'backgroundLabelId': -1,
        'topK': args.topk,  # per-class, pre NMS
        'keepTopK': args.topk,  # across-classes, per image
        'scoreThreshold': args.detection_threshold,
        'iouThreshold': args.iou_threshold,
        'isNormalized': False,
        'clipBoxes': False,
    }

    ts = graph.tensors()

    # splice the TensorRT batched-NMS plugin node into the graph
    graph.layer(op='BatchedNMSDynamic_TRT',
                attrs=attrs,
                inputs=[ts['bboxes'], ts['probs']],
                outputs=[
                    ts['num_detections'], ts['nms_bboxes'], ts['nms_probs'],
                    ts['nms_classes']
                ])

    graph.cleanup()
    graph.toposort()

    onnx_module = gs.export_onnx(graph)
    onnx_path = os.path.splitext(args.trt_path)[0] + '.onnx'
    print('saving ONNX model to', onnx_path)
    onnx.save(onnx_module, onnx_path)
    return onnx_module
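The serialized engine consumed via args.trt_path is presumably built from this ONNX file elsewhere; a minimal sketch of that step, assuming the TensorRT 7/8 builder API (the function name, workspace size, and fp16 flag here are illustrative, not taken from this codebase):
def build_trt_engine(onnx_path, trt_path, fp16=False):
    import tensorrt as trt
    logger = trt.Logger(trt.Logger.WARNING)
    # the BatchedNMSDynamic_TRT plugin must be registered before parsing
    trt.init_libnvinfer_plugins(logger, '')
    builder = trt.Builder(logger)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    parser = trt.OnnxParser(network, logger)
    with open(onnx_path, 'rb') as f:
        if not parser.parse(f.read()):
            raise RuntimeError(parser.get_error(0))
    config = builder.create_builder_config()
    config.max_workspace_size = 1 << 30  # 1 GiB, an arbitrary choice
    if fp16:
        config.set_flag(trt.BuilderFlag.FP16)
    engine = builder.build_engine(network, config)
    with open(trt_path, 'wb') as f:
        f.write(engine.serialize())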
def eval_coco(args):
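    """Run COCO mAP evaluation: batched inference with padding for the final
    partial batch, then scoring via pycocotools' COCOeval."""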
    device = torch.device(args.device)

    model = SSD300(args.topk, args.detection_threshold, args.iou_threshold,
                   args.precision, args.batch_dim,
                   args.trt_path).to(device).eval()

    dataloader = get_val_dataloader(args)
    inv_map = {v: k for k, v in dataloader.dataset.label_map.items()}

    coco_ground_truth = get_coco_ground_truth(args)

    results = None
    start = time.time()

    for nbatch, (X, img_id, img_size, _, _) in enumerate(dataloader):
        print('Inference batch: {}/{}'.format(nbatch, len(dataloader)),
              end='\r')
        with torch.no_grad():
            batch_dim = X.size(0)
            if args.precision == 'fp16':
                X = X.to(torch.float16)
            X = X.to(device)
            image_heights, image_widths = [i.to(device) for i in img_size]

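            # the engine expects a fixed batch size: pad a short final batch
            # by repeating its last sample (pad results are filtered below)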
            if batch_dim < args.batch_dim:
                num_pad = args.batch_dim - batch_dim
                X = torch.cat([X, X[-1].expand(num_pad, *X[-1].size())], dim=0)
                image_heights = torch.cat(
                    [image_heights, image_heights[-1].repeat(num_pad)], dim=0)
                image_widths = torch.cat(
                    [image_widths, image_widths[-1].repeat(num_pad)], dim=0)

            bboxes, probs, class_indexes, image_indexes = model.forward_coco(
                X, image_heights, image_widths)

            # filter out pad results
            small_batch_filter = image_indexes < batch_dim
            bboxes = bboxes[small_batch_filter]
            probs = probs[small_batch_filter]
            class_indexes = class_indexes[small_batch_filter]
            image_indexes = image_indexes[small_batch_filter]

            mapped_labels = class_indexes.to('cpu')
            mapped_labels.apply_(lambda i: inv_map[i])
            image_ids = img_id[image_indexes]

            batch_results = torch.cat([
                image_ids.cpu().unsqueeze(-1),
                bboxes.cpu(),
                probs.cpu().unsqueeze(-1),
                mapped_labels.unsqueeze(-1),
            ], dim=1)

            if results is not None:
                results = torch.cat([results, batch_results], dim=0)
            else:
                results = batch_results

    print()
    print(f'DONE (t={time.time() - start:.2f}).')

    results = results.numpy().astype(np.float32)

    coco_detections = coco_ground_truth.loadRes(results)

    E = COCOeval(coco_ground_truth, coco_detections, iouType='bbox')
    E.evaluate()
    E.accumulate()
    # E.summarize() prints to stdout; temporarily redirect it when an output
    # file is requested
    stdout = sys.stdout
    try:
        if args.output_path:
            sys.stdout = open(args.output_path, 'w')
        E.summarize()
    finally:
        if args.output_path:
            sys.stdout.close()
        sys.stdout = stdout
    print('mAP: {:.5f}'.format(E.stats[0]))
    def __init__(self, args):
        super().__init__()
        self.batch_dim = args.batch_dim
        # draw calibration batches from the validation dataloader
        self.dataloader = iter(get_val_dataloader(args))
        # hold the current batch so its device memory stays referenced
        self.current_batch = None
        self.cache_path = 'calibration.cache'
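This fragment reads like the __init__ of a TensorRT INT8 calibrator (the calibration.cache path and the ref-counted current_batch are the usual giveaways). A minimal sketch of the full class, assuming it subclasses trt.IInt8EntropyCalibrator2 (the class name and the device-pointer handling are illustrative, not confirmed by the source):
import os
import tensorrt as trt

class SSDInt8Calibrator(trt.IInt8EntropyCalibrator2):  # hypothetical name
    def __init__(self, args):
        super().__init__()
        self.batch_dim = args.batch_dim
        self.dataloader = iter(get_val_dataloader(args))
        self.current_batch = None
        self.cache_path = 'calibration.cache'

    def get_batch_size(self):
        return self.batch_dim

    def get_batch(self, names):
        try:
            tensor_nchw, *_ = next(self.dataloader)
        except StopIteration:
            return None  # no more calibration data
        # keep a reference so the device memory stays alive during this step
        self.current_batch = tensor_nchw.to('cuda')
        return [int(self.current_batch.data_ptr())]

    def read_calibration_cache(self):
        if os.path.exists(self.cache_path):
            with open(self.cache_path, 'rb') as f:
                return f.read()

    def write_calibration_cache(self, cache):
        with open(self.cache_path, 'wb') as f:
            f.write(cache)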