def download(split='val2017', reload=False):
    """Downloads COCO split.

    Returns:
        Path to the downloaded split directory
    """
    from io import BytesIO
    from urllib.request import urlopen
    from zipfile import ZipFile
    from ml import hub
    download_url = BASE_DOWNLOAD_URL(split)
    download_dir = Path(os.path.join(hub.get_dir(), 'COCO'))
    download_dir.mkdir(exist_ok=True, parents=True)
    split_dir = Path(os.path.join(download_dir, f'{split}'))
    if split_dir.exists() and any(split_dir.iterdir()) and not reload:
        # already exists
        logging.info(f'Skipping download of COCO: {split} as it already exists')
    else:
        # download and extract in memory
        with urlopen(download_url) as zipresp:
            with ZipFile(BytesIO(zipresp.read())) as zfile:
                zfile.extractall(download_dir)
        logging.info(f'Downloaded COCO: {split}')
    return str(split_dir)
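# A hedged usage sketch (not part of the original source): fetch the val2017 split and
# count the extracted images. The *.jpg glob is an illustrative assumption about the
# archive layout, not something download() guarantees.
def _example_download():
    split_dir = Path(download(split='val2017', reload=False))
    images = sorted(split_dir.glob('*.jpg'))
    logging.info(f'{len(images)} images extracted to {split_dir}')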
def attempt_download(params, epoch=8, model_dir=None, force=False):
    if model_dir is None:
        hub_dir = hub.get_dir()
        model_dir = os.path.join(hub_dir, 'checkpoints')
    try:
        os.makedirs(model_dir)
    except OSError as e:
        import errno
        if e.errno == errno.EEXIST:
            # Directory already exists, ignore.
            pass
        else:
            # Unexpected OSError, re-raise.
            raise
    chkpts = {
        'rfcn_dcn_coco': '0B6T5quL13CdHZ3ZrRVNjcnFmZk0',
        'rfcn_coco': None,
    }
    id = chkpts[params]
    path = f"{model_dir}/{params}-{epoch:04d}.params"
    if hub.download_gdrive(id, path, force=force) != 0:
        raise IOError(f"Failed to download to {path}")
    return path
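# A hedged usage sketch (not part of the original source): download the R-FCN DCN COCO
# checkpoint at the default epoch; assumes the Google Drive id listed above is still reachable.
def _example_attempt_download():
    path = attempt_download('rfcn_dcn_coco', epoch=8)
    logging.info(f'checkpoint saved to {path}')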
def build(name, model, spec, model_dir=None, backend='trt', reload=False, **kwargs):
    r"""
    Args:
        name(str): checkpoint name to save and load
        model(nn.Module): pytorch model on CPU
        spec(Tuple[B, C, H, W]): input shape including dynamic axes as -1
    Kwargs:
        model_dir(str, Path): path to save and load the model checkpoint
        backend(str): deployment backend
        reload(bool): whether to force redeploying the model with the backend
        input_names(List[str]): names of input tensors
        output_names(List[str]): names of output tensors
        batch_size(int): max batch size as dynamic axis 0
        workspace_size(int): max workspace size in bytes for engine building
        fp16(bool): enable FP16 mode
        int8(bool): enable INT8 mode with calibration
        strict_type_constraints(bool): whether the type mode is strictly enforced
        int8_calib_batch_size(int): batch size for INT8 calibration
        int8_calib_preprocess_func(Callable): preprocessing for calibration samples
        min_shapes(Tuple): min input shapes for dynamic axes
        opt_shapes(Tuple): optimal input shapes for dynamic axes
        max_shapes(Tuple): max input shapes for dynamic axes
    """
    from ml import hub
    if model_dir is None:
        hub_dir = hub.get_dir()
        model_dir = os.path.join(hub_dir, 'checkpoints')
    try:
        os.makedirs(model_dir)
    except OSError as e:
        if e.errno == errno.EEXIST:
            pass
        else:
            raise e

    from time import time
    t = time()
    if backend in ['trt', 'tensorrt']:
        # XXX No intermediate ONNX archive
        from . import trt as backend
        chkpt_path = Path(f"{model_dir}/{name}.pth")
        if chkpt_path.exists() and not reload:
            logging.info(f"Loading torch2trt checkpoint from {chkpt_path}")
            chkpt = th.load(chkpt_path)
            predictor = backend.TRTPredictor()
            predictor.load_state_dict(chkpt)
            return predictor
        trt = Path(f"{model_dir}/{name}.trt")
        input_names = kwargs.pop('input_names', None)
        output_names = kwargs.pop('output_names', None)
        if trt.exists() and not reload:
            # Load from a previously saved deployment engine
            logging.info(f"Building TensorRT inference engine from {trt}")
            engine = backend.build(trt)
            if not (input_names and output_names):
                input_names, output_names = get_input_output_names(engine)
            predictor = backend.TRTPredictor(engine=engine,
                                             input_names=input_names,
                                             output_names=output_names)
        else:
            batch_size = kwargs.pop('batch_size', 1)
            workspace_size = kwargs.pop('workspace_size', GiB(2))
            fp16 = kwargs.pop('amp', False)
            fp16 = fp16 or kwargs.pop('fp16', False)  # amp implies fp16
            int8 = kwargs.pop('int8', False)
            strict_type_constraints = kwargs.pop('strict_type_constraints', False)
            int8_calib_batch_size = kwargs.pop('int8_calib_batch_size', 16)
            device = next(model.parameters()).device
            min_shapes = kwargs.get('min_shapes')
            if min_shapes is not None:
                inputs = tuple(th.rand(1, *shape, device=device) for shape in min_shapes)
            else:
                inputs = tuple(th.rand(1, *shape, device=device) for shape in spec)
            predictor = backend.torch2trt(model,
                                          inputs,
                                          max_batch_size=batch_size,
                                          max_workspace_size=workspace_size,
                                          input_names=input_names,
                                          output_names=output_names,
                                          fp16_mode=fp16,
                                          int8_mode=int8,
                                          int8_calib_batch_size=int8_calib_batch_size,
                                          strict_type_constraints=strict_type_constraints,
                                          use_onnx=True,
                                          **kwargs)
            logging.info(f"Saving TensorRT checkpoint to {chkpt_path}")
            io.save(predictor.state_dict(), chkpt_path)
        logging.info(f"Built TensorRT inference engine in {time() - t:.3f}s")
        return predictor
    elif backend == 'onnx':
        onnx_path = Path(f"{model_dir}/{name}.onnx")
        batch_size = kwargs.pop('batch_size', 1)
        workspace_size = kwargs.pop('workspace_size', GiB(1))
        amp = kwargs.pop('amp', False)
        if not onnx_path.exists() or reload:
            export(model, spec, onnx_path, **kwargs)
        import onnxruntime
        from .onnx import ONNXPredictor
        engine = onnxruntime.InferenceSession(str(onnx_path))
        predictor = ONNXPredictor(engine)
        logging.info(f"Built ONNX inference engine at {onnx_path} in {time() - t:.3f}s")
    else:
        raise ValueError(f"Unsupported backend: {backend}")
    return predictor
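# A hedged usage sketch (not part of the original source): build a TensorRT predictor for
# a torchvision ResNet-50. The checkpoint name, input spec, and batch size are illustrative
# assumptions; build() caches the resulting engine under the hub checkpoint directory so
# subsequent calls reload it unless reload=True.
def _example_build_trt():
    import torchvision
    model = torchvision.models.resnet50().eval()  # CPU model; random weights suffice for the sketch
    predictor = build('resnet50-bs8_224x224',
                      model,
                      [(3, 224, 224)],            # per-input CxHxW; batch dim comes from batch_size
                      backend='trt',
                      batch_size=8,
                      fp16=True)
    return predictor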
def deploy(self, name='yolo5x', batch_size=10, spec=(3, 640, 640), fp16=True,
           backend='trt', reload=False, **kwargs):
    r"""Deploy an optimized runtime backend.

    Args:
        batch_size(int): max batch size
        spec(Tuple[int]): preprocessed frame shape, which must be fixed throughout the batch
        amp(bool): mixed precision with FP16
    Kwargs:
        dynamic_axes: dynamic axes for each input ==> {'input_0': {0: 'batch_size', 2: 'height'}}
        min_shapes: min input shapes ==> [(3, 320, 640)]
        max_shapes: max input shapes ==> [(3, 640, 640)]
    """
    from ml import deploy
    module = self.module
    # avoids warning for dynamic ifs
    module.model[-1].onnx_dynamic = True
    # FIXME: workaround for invalid values with different batch sizes in tensorrt;
    # tensorrt output is not consistent with in-place operations
    module.model[-1].inplace = False
    int8 = kwargs.get('int8', False)
    strict = kwargs.get('strict', False)
    if int8:
        from ml import hub
        from ml.vision.datasets.coco import download

        def preprocessor(size=(384, 640)):
            from PIL import Image
            from torchvision import transforms
            trans = transforms.Compose([transforms.Resize(size),
                                        transforms.ToTensor()])
            H, W = size

            def preprocess(image_path, *shape):
                r'''Preprocessing for TensorRT calibration.

                Args:
                    image_path(str): path to image
                    channels(int):
                '''
                image = Image.open(image_path)
                logging.debug(f"image.size={image.size}, mode={image.mode}")
                image = image.convert('RGB')
                C = len(image.mode)
                im = trans(image)
                assert im.shape == (C, H, W)
                return im

            return preprocess

        int8_calib_max = kwargs.get('int8_calib_max', 5000)
        int8_calib_batch_size = kwargs.get('int8_calib_batch_size', max(batch_size, 64))
        cache = f'{name}-COCO2017-val-{int8_calib_max}-{int8_calib_batch_size}.cache'
        cache_path = Path(os.path.join(hub.get_dir(), cache))
        kwargs['int8_calib_cache'] = str(cache_path)
        kwargs['int8_calib_data'] = download(split='val2017', reload=False)
        kwargs['int8_calib_preprocess_func'] = preprocessor()
        kwargs['int8_calib_max'] = int8_calib_max
        kwargs['int8_calib_batch_size'] = int8_calib_batch_size

    device = next(self.module.parameters()).device
    # FIXME: cuda + onnx_dynamic causes the onnx export to fail: https://github.com/ultralytics/yolov5/issues/5439
    self.to('cpu')
    self.engine = deploy.build(
        f"{name}-bs{batch_size}_{spec[-2]}x{spec[-1]}"
        f"{fp16 and '_fp16' or ''}{int8 and '_int8' or ''}{strict and '_strict' or ''}",
        self,
        [spec],
        backend=backend,
        reload=reload,
        batch_size=batch_size,
        fp16=fp16,
        strict_type_constraints=strict,
        **kwargs)
    self.to(device)
    # TODO: avoid storing dummy modules to keep track of module device
    self.dummy = module.model[-1]
    del self.module
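# A hedged usage sketch (not part of the original source): assuming `detector` is an
# instance of the class that defines deploy() above, build an FP16 TensorRT engine for
# 384x640 frames and run batched inference through the cached engine.
#
#     detector.deploy(name='yolo5x', batch_size=10, spec=(3, 384, 640),
#                     fp16=True, backend='trt')
#     preds, *features = detector.engine.predict(frames)  # frames: Bx3x384x640 tensor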
def test_deploy_trt(benchmark, batch, detector, dev, B, fp16, int8, strict, name):
    # FIXME pytorch cuda initialization must be ahead of pycuda
    module = detector.module
    module.model[-1].export = True
    batch = TF.resize(batch, (384, 640)).float()
    h, w = batch.shape[2:]
    kwargs = {}
    if int8:
        import os
        from pathlib import Path
        from ml import hub
        from ml.vision.datasets.coco import download

        def preprocessor(size=(384, 640)):
            from PIL import Image
            from torchvision import transforms
            trans = transforms.Compose([transforms.Resize(size),
                                        transforms.ToTensor()])
            H, W = size

            def preprocess(image_path, *shape):
                r'''Preprocessing for TensorRT calibration.

                Args:
                    image_path(str): path to image
                    channels(int):
                '''
                image = Image.open(image_path)
                logging.debug(f"image.size={image.size}, mode={image.mode}")
                image = image.convert('RGB')
                C = len(image.mode)
                im = trans(image)
                assert im.shape == (C, H, W)
                return im

            return preprocess

        int8_calib_max = 5000
        int8_calib_batch_size = 64
        cache = f'{name}-COCO2017-val-{int8_calib_max}-{int8_calib_batch_size}.cache'
        cache_path = Path(os.path.join(hub.get_dir(), cache))
        kwargs['int8_calib_cache'] = str(cache_path)
        kwargs['int8_calib_data'] = download(split='val2017', reload=False)
        kwargs['int8_calib_preprocess_func'] = preprocessor()
        kwargs['int8_calib_max'] = int8_calib_max
        kwargs['int8_calib_batch_size'] = int8_calib_batch_size

    engine = deploy.build(
        f"yolo5x-bs{B}_{h}x{w}{fp16 and '_fp16' or ''}{int8 and '_int8' or ''}",
        detector,
        [batch.shape[1:]],
        backend='trt',
        reload=False,
        batch_size=B,
        fp16=fp16,
        int8=int8,
        strict_type_constraints=strict,
        **kwargs)
    preds, *features = benchmark(engine.predict, batch[:B].to(dev), sync=True)
    assert len(features) == 3
    with th.no_grad():
        with th.cuda.amp.autocast(enabled=fp16):
            torch_preds, torch_features = detector(batch[:B].to(dev))
    logging.info(f"outputs trt norm={preds.norm().item()}, torch norm={torch_preds.norm().item()}")
    if fp16 or int8:
        pass
        # th.testing.assert_allclose(torch_preds.float(), preds.float(), rtol=2e-02, atol=4e-02)
    else:
        th.testing.assert_allclose(torch_preds.float(), preds.float(), rtol=1e-03, atol=4e-04)
        for torch_feats, feats in zip(torch_features, features):
            th.testing.assert_allclose(torch_feats.float(), feats.float(), rtol=1e-03, atol=4e-04)