def convert_tensorrt(torch_model, ts_model, data_loader):
    """Compile a TorchScript model with TRTorch and save it under ``args.output``.

    TRTorch needs the range of input shapes up front, so every batch of
    ``data_loader`` is passed through ``torch_model.preprocess_image`` to find
    the smallest and largest spatial side that actually occurs.

    Args:
        torch_model: Model exposing ``preprocess_image``; it is only used to
            measure the preprocessed image sizes.
        ts_model: TorchScript module handed to ``trtorch.compile``.
        data_loader: Iterable of batches; the first element of each batch is
            indexed with ``"image"``.

    Raises:
        ValueError: If ``data_loader`` yields no batches, because no input
            shape range can be derived from it.
    """
    # Parse the shapes of all inputs to find the overall size range.
    min_size = None
    max_size = None
    for batch in data_loader:
        inputs = [{"image": batch[0]["image"]}]  # remove other unused keys
        images = torch_model.preprocess_image(inputs)
        _, _, h, w = images.tensor.shape
        short_side, long_side = min(h, w), max(h, w)
        min_size = short_side if min_size is None else min(min_size, short_side)
        max_size = long_side if max_size is None else max(max_size, long_side)

    if min_size is None:
        # Without this guard the placeholder extremes would be sent to
        # TRTorch as a (min > max) shape range.
        raise ValueError(
            "data_loader yielded no batches; cannot derive the input shape "
            "range required by TRTorch"
        )

    def _clamp(size):
        # The "opt" shape has to lie inside [min, max]; this is a no-op
        # whenever the configured size is already inside the observed range.
        # NOTE(review): assumes the cfg test sizes are plain ints - confirm.
        return max(min_size, min(size, max_size))

    compile_settings = {
        "input_shapes": [
            {
                "min": [1, 3, min_size, min_size],
                "opt": [
                    1,
                    3,
                    _clamp(cfg.INPUT.MIN_SIZE_TEST),
                    _clamp(cfg.INPUT.MAX_SIZE_TEST),
                ],
                "max": [1, 3, max_size, max_size],
            },
        ],
        "op_precision": torch.float32,  # use torch.half to run with FP16
    }

    ts_model.eval()
    trt_ts_module = trtorch.compile(ts_model, compile_settings)
    with PathManager.open(os.path.join(args.output, "trt_ts_module.ts"), "wb") as f:
        torch.jit.save(trt_ts_module, f)
def test_compile_script(self):
    """Check that INT8 compilation costs less than 3 points of test accuracy.

    The FP32 accuracy of ``self.model`` is measured first, the model is then
    compiled with TRTorch for INT8 using ``self.calibrator``, and the
    accuracy of the compiled module is measured on the same dataloader.
    """
    fp32_test_acc = self.compute_accuracy(self.testing_dataloader, self.model)
    log(Level.Info, "[Pyt FP32] Test Acc: {:.2f}%".format(100 * fp32_test_acc))

    compile_spec = {
        "input_shapes": [[1, 3, 32, 32]],
        "op_precision": torch.int8,
        "calibrator": self.calibrator,
        "device": {
            "device_type": trtorch.DeviceType.GPU,
            "gpu_id": 0,
            "dla_core": 0,
            "allow_gpu_fallback": False,
        },
    }
    trt_mod = trtorch.compile(self.model, compile_spec)

    int8_test_acc = self.compute_accuracy(self.testing_dataloader, trt_mod)
    log(Level.Info, "[TRT INT8] Test Acc: {:.2f}%".format(100 * int8_test_acc))

    # The accuracies are scaled by 100 for the "%" log lines above, i.e. they
    # are treated as fractions here. Comparing the raw fraction difference
    # against 3 could never fail, so compare in percentage points instead.
    acc_diff_pct = 100 * (fp32_test_acc - int8_test_acc)
    self.assertTrue(abs(acc_diff_pct) < 3)
def test_compile_traced(self):
    """Verify the TRTorch-compiled traced model matches the traced model.

    The largest absolute element-wise difference between both outputs for
    ``self.input`` must stay below 2e-3.
    """
    spec = {"input_shapes": [self.input.shape]}
    compiled = trtorch.compile(self.traced_model, spec)

    actual = compiled(self.input)
    expected = self.traced_model(self.input)
    max_abs_err = (actual - expected).abs().max()
    self.assertTrue(max_abs_err < 2e-3)
def main():
    """Script, TRTorch-compile and save the ConvGelu and Norm modules.

    Both modules are compiled with the same FP32 settings for a fixed
    ``[1, 3, 5, 5]`` input and written to ``conv_gelu.jit`` and ``norm.jit``.
    """
    compile_settings = {
        "input_shapes": [[1, 3, 5, 5]],
        "op_precision": torch.float32,
    }

    # (module class, output file) pairs, processed in this order.
    targets = (
        (ConvGelu, 'conv_gelu.jit'),
        (Norm, 'norm.jit'),
    )
    for module_cls, out_file in targets:
        eager = module_cls().eval().cuda()
        scripted = torch.jit.script(eager)
        compiled = trtorch.compile(scripted, compile_settings)
        torch.jit.save(compiled, out_file)

    print("Generated Torchscript-TRT models.")
def test_compile_script(self):
    """Verify the TRTorch-compiled scripted model matches the scripted model.

    Only the input shape is given in the compile spec. The largest absolute
    element-wise output difference for ``self.input`` must be below 2e-3.
    """
    spec = {"input_shapes": [self.input.shape]}
    compiled = trtorch.compile(self.scripted_model, spec)

    actual = compiled(self.input)
    expected = self.scripted_model(self.input)
    max_abs_err = (actual - expected).abs().max()
    self.assertTrue(max_abs_err < 2e-3)
def test_compile_script(self):
    """Compile the scripted model with an explicit device spec and compare.

    The spec targets GPU 0 with GPU fallback off and ``disable_tf32`` set to
    False. The compiled output must match the scripted model's output for
    ``self.input`` to within 2e-3 (largest absolute difference).
    """
    shapes = [self.input.shape]
    device_spec = {
        "device_type": trtorch.DeviceType.GPU,
        "gpu_id": 0,
        "dla_core": 0,
        "allow_gpu_fallback": False,
        "disable_tf32": False,
    }
    compiled = trtorch.compile(
        self.scripted_model,
        {"input_shapes": shapes, "device": device_spec},
    )

    actual = compiled(self.input)
    expected = self.scripted_model(self.input)
    max_abs_err = (actual - expected).abs().max()
    self.assertTrue(max_abs_err < 2e-3)
def test_compile_script(self):
    """Compile for ``self.target_gpu`` and run while device 1 is current.

    Device 0 is selected before compilation and device 1 afterwards; the
    compiled module is still expected to match the scripted model's output
    to within 2e-3. The device selected last (1) is not restored.
    """
    trtorch.set_device(0)

    shapes = [self.input.shape]
    device_spec = {
        "device_type": trtorch.DeviceType.GPU,
        "gpu_id": self.target_gpu,
        "dla_core": 0,
        "allow_gpu_fallback": False,
        "disable_tf32": False,
    }
    compiled = trtorch.compile(
        self.scripted_model,
        {"input_shapes": shapes, "device": device_spec},
    )

    # Deliberately switch the current device: the module should still run on
    # the correct device ID by context switching.
    trtorch.set_device(1)

    actual = compiled(self.input)
    expected = self.scripted_model(self.input)
    max_abs_err = (actual - expected).abs().max()
    self.assertTrue(max_abs_err < 2e-3)
def test_compile_script(self):
    """Compile with Torch fallback enabled and compare against the original.

    ``aten::max_pool2d`` is listed as a forced fallback op with a minimum
    block size of 1. The compiled output must match the scripted model's
    output for ``self.input`` to within 2e-3 (largest absolute difference).
    """
    shapes = [self.input.shape]
    device_spec = {
        "device_type": trtorch.DeviceType.GPU,
        "gpu_id": 0,
        "dla_core": 0,
        "allow_gpu_fallback": False,
        "disable_tf32": False,
    }
    fallback_spec = {
        "enabled": True,
        "forced_fallback_ops": ["aten::max_pool2d"],
        "min_block_size": 1,
    }
    compiled = trtorch.compile(
        self.scripted_model,
        {
            "input_shapes": shapes,
            "device": device_spec,
            "torch_fallback": fallback_spec,
        },
    )

    actual = compiled(self.input)
    expected = self.scripted_model(self.input)
    max_abs_err = (actual - expected).abs().max()
    self.assertTrue(max_abs_err < 2e-3)
def main():
    """Compare the Elu module's output with its FP16 TRTorch-compiled version.

    The scripted module is compiled for a dynamic input range
    (1024x1x32x32 up to 1024x1x34x34), both variants are run on one random
    half-precision CUDA batch, a slice of each output is printed and the
    outputs are passed to ``cal_max_diff``.
    """
    model = Elu().eval()  # .cuda() intentionally not applied here
    scripted_model = torch.jit.script(model)

    dynamic_range = {
        "min": [1024, 1, 32, 32],
        "opt": [1024, 1, 33, 33],
        "max": [1024, 1, 34, 34],
    }
    compile_settings = {
        "input_shapes": [dynamic_range],
        "op_precision": torch.half,  # Run with FP16
    }
    trt_ts_module = trtorch.compile(scripted_model, compile_settings)

    batch = torch.randn((1024, 1, 32, 32))
    batch = batch.half().to("cuda")

    pytorch_out = model.forward(batch)
    trtorch_out = trt_ts_module(batch)

    print('PyTorch output: \n', pytorch_out[0, :, :, 0])
    print('TRTorch output: \n', trtorch_out[0, :, :, 0])
    cal_max_diff(pytorch_out, trtorch_out)
def simple_trtorch_example(device='cuda'):
    """Walk through ResNet-50 inference with eager PyTorch, TorchScript and TRTorch.

    The example
      1. shows the four sample images ``./data/img0.JPG`` .. ``img3.JPG``
         on a 2x2 grid,
      2. classifies them with the pretrained ResNet-50 loaded through
         ``torch.hub`` and titles each subplot with the predicted description,
      3. benchmarks the eager model, a traced TorchScript model, and the
         TRTorch modules compiled for FP32 and FP16.

    Args:
        device: Device the model and inputs are moved to.
    """
    input_shape = 128, 3, 224, 224  # (batch size, channel, height, width).
    img_paths = ['./data/img{}.JPG'.format(i) for i in range(4)]
    preprocess = resnet50_preprocess()

    # Preview the raw images on a 2x2 grid.
    plt.subplots(nrows=2, ncols=2)
    for i, img_path in enumerate(img_paths):
        img = Image.open(img_path)
        plt.subplot(2, 2, i + 1)
        plt.imshow(img)
        plt.axis('off')

    with open('./data/imagenet_class_index.json', encoding='utf-8') as fd:
        class_index = json.load(fd)
    print('Number of classes in ImageNet: {}'.format(len(class_index)))

    #--------------------
    resnet50_model = torch.hub.load('pytorch/vision:v0.6.0', 'resnet50', pretrained=True)
    resnet50_model.eval()

    def predict(img, model):
        """Return ([predicted class, description], probability) for one image."""
        # Create a mini-batch as expected by the model.
        input_batch = preprocess(img).unsqueeze(0)

        # Move the input and model to GPU for speed if available.
        if torch.cuda.is_available():
            input_batch = input_batch.to(device)
            model.to(device)

        with torch.no_grad():
            output = model(input_batch)

        # output[0] holds the confidence scores over the ImageNet classes.
        sm_output = torch.nn.functional.softmax(output[0], dim=0)
        ind = torch.argmax(sm_output)
        return class_index[str(ind.item())], sm_output[ind]

    # Classify each image once and redraw it with the prediction as title.
    for i, img_path in enumerate(img_paths):
        img = Image.open(img_path)
        pred, prob = predict(img, resnet50_model)
        print('{}: Predicted = {}, Probablility = {}.'.format(img_path, pred, prob))

        plt.subplot(2, 2, i + 1)
        plt.imshow(img)
        plt.axis('off')
        plt.title(pred[1])

    #--------------------
    # Model benchmark without TRTorch/TensorRT
    model = resnet50_model.eval().to(device)
    cuda_cnn_benchmark(model, input_shape, nruns=1000, device=device)

    #--------------------
    # Create TorchScript modules.
    # Tracing.
    print('Start creating a traced model...')
    start_time = time.time()
    traced_model = torch.jit.trace(model, [torch.randn(input_shape, device=device)])
    print('End creating a traced model: {} secs.'.format(time.time() - start_time))
    cuda_cnn_benchmark(traced_model, input_shape, nruns=1000, device=device)

    # Compile with TRTorch.
    import trtorch

    # The compiled module will have precision as specified by "op_precision".
    # Each entry: (label used in the log lines, precision, extra benchmark kwargs).
    precision_runs = (
        ('FP32', torch.float32, {}),                # single precision
        ('FP16', torch.half, {'is_fp16': True}),    # half precision
    )
    for label, precision, bench_kwargs in precision_runs:
        print('Start creating a TRTorch model ({})...'.format(label))
        start_time = time.time()
        trt_ts_model = trtorch.compile(traced_model, {
            'input_shapes': [input_shape],
            'op_precision': precision,
            'workspace_size': 1 << 20,
        })
        print('End creating a TRTorch model ({}): {} secs.'.format(label, time.time() - start_time))
        cuda_cnn_benchmark(trt_ts_model, input_shape, nruns=1000, device=device, **bench_kwargs)
# not all operations supported :/
if trtorch is not None:
    with torch.no_grad():
        # Grab one camera frame to fix the (static) input shape.
        x = cv2_frame_to_cuda(cam.read())
        print(x.shape)
        shape = list(x.shape)

        # The model takes two inputs, so the same static shape is listed
        # twice (one entry per input).
        compile_settings = dict(
            input_shapes=[shape, shape],
            op_precision=torch.half,  # Run with FP16
            num_min_timing_iters=2,  # Default: 2
            num_avg_timing_iters=1,  # Default: 1
            max_batch_size=1,  # Maximum batch size (must be >= 1 to be set, 0 means not set)
        )

        # Trace with the frame as captured, then compile the traced module.
        traced_model = torch.jit.trace(model, [x, x])
        trt_ts_module = trtorch.compile(traced_model, compile_settings)

        # Run the compiled module once on the half-precision frame before
        # saving it.
        x = x.half()
        result = trt_ts_module(x, x)
        torch.jit.save(trt_ts_module, "trt_torchscript_module_fp16.ts")