def tensorrt_backend_pfe_onnx():
    """Benchmark the PFE (pillar feature extractor) ONNX model with the
    onnx-tensorrt backend on fixed all-ones dummy inputs.

    Loads ``pfe.onnx`` from the working directory, builds a TensorRT engine
    on GPU 0, and prints the mean per-run latency over 1000 inference runs.
    """
    # Eight all-ones inputs matching the PFE network's expected layout
    # (max 12000 pillars x 100 points each).
    dense_shape = [1, 1, 12000, 100]
    pillar_x = np.ones(dense_shape, dtype=np.float32)
    pillar_y = np.ones(dense_shape, dtype=np.float32)
    pillar_z = np.ones(dense_shape, dtype=np.float32)
    pillar_i = np.ones(dense_shape, dtype=np.float32)
    num_points_per_pillar = np.ones([1, 12000], dtype=np.float32)
    x_sub_shaped = np.ones(dense_shape, dtype=np.float32)
    y_sub_shaped = np.ones(dense_shape, dtype=np.float32)
    mask = np.ones(dense_shape, dtype=np.float32)
    pfe_inputs = [
        pillar_x, pillar_y, pillar_z, pillar_i, num_points_per_pillar,
        x_sub_shaped, y_sub_shaped, mask
    ]
    print("pfe_inputs length is : ", len(pfe_inputs))

    pfe_model = onnx.load("pfe.onnx")
    engine = backend.prepare(pfe_model, device="CUDA:0", max_batch_size=1)

    # Fix: start the clock AFTER the engine is built (the original included
    # model load + engine build in the measured window), and run exactly
    # n_runs iterations (the original ran range(1, 1000) = 999 iterations
    # but divided the elapsed time by 1000).
    n_runs = 1000
    start = time.time()
    for _ in range(n_runs):
        pfe_outputs = engine.run(pfe_inputs)
    end = time.time()
    print('inference time is : ', (end - start) / n_runs)
    print(pfe_outputs)
def onnx2trt_infer( onnx_model_filename: str, input_values: 'Sequence[np.ndarray]', batch_size: int = 1, workspace_size: int = (1024 * 1024 * 16), ) -> 'Sequence[np.ndarray]': r"""infer model with 'onnx_tensorrt' backend""" import onnx import onnx.optimizer as optimizer import onnx_tensorrt.backend as backend from onnx.utils import polish_model model = onnx.load(onnx_model_filename) passes = optimizer.get_available_passes() passes = list(filter(lambda name: not name.startswith('split_'), passes)) logger.debug('optimizations to perform in ONNX:\n\t%s', passes) model = optimizer.optimize(model, passes=passes) model = polish_model(model) onnx.save(model, onnx_model_filename.rpartition('.onnx')[0] + '.optimized.onnx') engine = backend.prepare( model, device='CUDA', max_batch_size=batch_size, max_workspace_size=workspace_size, ) return engine.run(input_values)
def load_model(self, path):
    """Load an ONNX model from *path* and prepare a TensorRT engine on the
    first GPU, storing both on the instance (``self.model``, ``self.engine``).

    Raises:
        NotImplementedError: when no CUDA device is available — the
            TensorRT backend is CUDA-only.
    """
    # Fail fast: check for CUDA *before* spending time parsing the model.
    # (The original loaded the model first and only then raised.)
    if not torch.cuda.is_available():
        raise NotImplementedError(
            'TensorRT backend does not work for non-CUDA devices.')
    self.model = onnx.load(path)
    # get first gpu
    self.engine = backend.prepare(self.model, device='CUDA:0')
def onnx_infer(image, model_path, device='CUDA:1'):
    """Run one inference of the ONNX model at *model_path* on *image* via the
    onnx-tensorrt backend and print the first output and its shape.

    Args:
        image: input array (or sequence of input arrays) for the engine.
        model_path: path of the ONNX model file.
        device: TensorRT device string; defaults to 'CUDA:1' — the value
            that was previously hard-coded — so existing callers are
            unaffected.

    Returns:
        The first output array.  (The original printed it but returned
        None; returning it is backward compatible.)
    """
    model = onnx.load(model_path)
    engine = backend.prepare(model, device=device)
    output_data = engine.run(image)[0]
    print(output_data)
    print(output_data.shape)
    return output_data
def load_model(path, shape):
    """Load the ONNX model at *path*, build a TensorRT engine on GPU 0 and
    smoke-test it with random input of the given *shape*.

    Args:
        path: path of the ONNX model file.
        shape: shape tuple for the random test input.

    Returns:
        The prepared engine.  (The original built the engine, ran a test
        inference and discarded everything; the leftover ``# return`` stub
        indicates a return was intended, and the function name promises
        one.  Returning the engine is backward compatible.)
    """
    model = onnx.load(path)
    engine = backend.prepare(model, device='CUDA:0')
    # Smoke test with random data of the caller-supplied shape.
    input_data = np.random.random(size=shape).astype(np.float32)
    output_data = engine.run(input_data)
    # NOTE(review): string indexing assumes the backend returns a
    # mapping-like Outputs object with a 'steer' entry — confirm against
    # the installed onnx_tensorrt version.
    print(output_data['steer'])
    print(output_data)
    return engine
def inference_model():
    """Compare PyTorch and TensorRT outputs of VellaDeconv on a fixed
    all-ones input, printing both results and element-wise match ratios.

    Relies on module-level globals: ``VellaDeconv`` (the network class),
    ``model_onnx`` (path of the exported ONNX file) and ``model_tensorrt``
    (path for the serialized engine).
    """
    # Deterministic cuDNN kernels so the PyTorch reference is reproducible.
    torch.backends.cudnn.deterministic = True
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net = VellaDeconv().to(device).eval()
    # NOTE(review): this random input is immediately overwritten by the
    # all-ones array on the next line, so only zeros+1 is actually used
    # (but the call still advances NumPy's global RNG state).
    points = np.random.rand(1, 3, 6, 1)  # (25,100, 2, 2)
    points = np.zeros((1,3,6,1))+1
    points_trch = points  # .reshape((25,100, 2, 2))
    # PyTorch side: wrap the input as a single-element tensor list.
    points_t = [torch.tensor(points_trch, device=device, dtype=torch.float32)]
    points_t_np = points_t[0].detach().cpu().numpy()
    # import pdb; pdb.set_trace()
    torch.cuda.synchronize()
    with torch.no_grad():
        x_conv_trch = net(*points_t)
    torch.cuda.synchronize()
    x_conv_np = x_conv_trch.detach().cpu().numpy()
    # TensorRT side: build an engine from the exported ONNX graph.
    onnx_model = onnx.load(model_onnx)
    tensorrt_engine = backend.prepare(
        onnx_model,
        device="CUDA:0",
        max_workspace_size=536870912,  # 512 MiB
        max_batch_size=1,
        using_fp16=False,               # keep FP32 so outputs are comparable
        serialize_engine=False,
        engine_file_path=model_tensorrt
    )
    # points_trt = points
    points_trt = points_trch
    # Round-trip through a CUDA tensor so both backends see the identical
    # float32 buffer.
    points_nv = torch.tensor(points_trt, device=device, dtype=torch.float32)
    points_np = [points_nv.detach().cpu().numpy()]
    x_conv_nv = tensorrt_engine.run(points_np)
    print(f"x_conv_np is :{x_conv_np}")
    print(f"x_conv_np.shape is: {x_conv_np.shape}")
    print(f"x_conv_np.dtype is: {x_conv_np.dtype}")
    print(f"x_conv_nv is: {x_conv_nv[0]}")
    print(f"x_conv_nv shape is: {x_conv_nv[0].shape}")
    print(x_conv_nv[0].dtype)
    # Fraction of input elements that survived both round trips unchanged.
    print("Input matching percentage")
    print((np.count_nonzero(points_t_np==points_np[0]))/points_t_np.size)
    # Exact match ratio, then a tolerance-based one (floats rarely match
    # bit-for-bit across backends).
    print("Output matching percentage")
    print((np.count_nonzero(x_conv_np==x_conv_nv[0]))/x_conv_np.size)
    print(np.count_nonzero(np.isclose(x_conv_np, x_conv_nv[0], atol=1e-8))/x_conv_np.size)
    # np.savetxt("x_conv_np.txt",x_conv_np)
    print("................")
def tensorrt_backend_rpn_onnx():
    """Benchmark the RPN ONNX model with the onnx-tensorrt backend on a
    fixed all-ones dummy input.

    Loads ``rpn.onnx`` from the working directory, builds a TensorRT engine
    on GPU 0, and prints the mean per-run latency over 1000 inference runs.
    """
    # All-ones dummy feature map matching the RPN input layout.
    rpn_input_features = np.ones([1, 64, 496, 432], dtype=np.float32)

    rpn_model = onnx.load("rpn.onnx")
    engine = backend.prepare(rpn_model, device="CUDA:0", max_batch_size=1)

    # Fix: start the clock AFTER the engine is built (the original included
    # model load + engine build in the measured window), and run exactly
    # n_runs iterations (the original ran range(1, 1000) = 999 iterations
    # but divided the elapsed time by 1000).
    n_runs = 1000
    rpn_start_time = time.time()
    for _ in range(n_runs):
        rpn_outputs = engine.run(rpn_input_features)
    rpn_end_time = time.time()
    print('rpn inference time is : ', (rpn_end_time - rpn_start_time) / n_runs)
    print(rpn_outputs)
def main(input_data_dir, output_data_dir, onnx_name): # sess = onnxruntime.InferenceSession(onnx_name) """ so = onnxruntime.SessionOptions() so.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL sess = onnxruntime.InferenceSession(onnx_name, sess_options=so) sess.set_providers(['CUDAExecutionProvider']) input_name = sess.get_inputs()[0].name label_name = sess.get_outputs()[0].name """ model = onnx.load(onnx_name) # engine = build_engine(onnx_name) engine = backend.prepare(model, device="CUDA:0") # print("The model expects input shape: ", sess.get_inputs()[0].shape) # sess.run(None, {input_name: np.random.rand(1, 3, 1024, 1024).astype(np.float32)}) result = engine.run(np.random.rand(1, 3, 1024, 1024).astype(np.float32))[0] image_path_list = get_image_pathes(input_data_dir) for image_path in tqdm(image_path_list): base_name = Path(image_path).name rgb_image = cv2.imread(image_path) bgr_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR) img_tfmd = transform_image(add_dummy_dim(bgr_image)) img_tfmd_ary = img_tfmd.to("cpu").detach().numpy() # test = add_dummy_dim(bgr_image) # result = sess.run(None, {input_name: np.random.rand(1, 3, 1024, 1024).astype(np.float32)}) start = time.time() result = engine.run(img_tfmd_ary.astype(np.float32))[0] # result = sess.run(None, {input_name: img_tfmd_ary.astype(np.float32)}) end = time.time() print(end - start) prob = result[0][0] label_img = (prob.argmax(0) * 255).astype(np.uint8) cv2.imwrite(str(Path(output_data_dir, base_name)), label_img) cv2.waitKey(10)
import sys
import onnx
import onnx_tensorrt.backend as backend
import numpy as np
import time

# Load the exported CenterNet (DLA-34 backbone) model and build a TensorRT
# engine on the second GPU.
model = onnx.load("centernet_dla34.onnx")
graph = onnx.helper.printable_graph(model.graph)
#print(graph)
engine = backend.prepare(model, device='CUDA:1')
#input_data = np.random.random(size=(1, 3, 512, 512)).astype(np.float32)
# Preprocessed input batch previously saved from the reference pipeline.
images = np.load('images.npy')
#input_data = np.random.random(size=(1, 3, 512, 512)).astype(np.float32)
output_datas = engine.run(images)
# NOTE(review): the three outputs appear to be the CenterNet heads
# (hm / wh / reg, judging by the .npy names below) — confirm against the
# export script.
#print(output_datas[0].shape)
#print(output_datas[0])
#print("===============(output_datas[0]===============================")
#print(output_datas[1].shape)
#print(output_datas[1])
#print("================(output_datas[1]==================================")
#print(output_datas[2].shape)
print(output_datas[2][0])
#print("=================output_datas[2]=================================")
# Reference outputs saved from the PyTorch run, for manual comparison.
#output_hm = np.load('output_hm.npy')
#output_wh = np.load('output_wh.npy')
output_reg = np.load('output_reg.npy')
#print(output_hm.shape)
#print(output_hm)
#print("============= output_hm =====================================")
#print(output_wh.shape)
# Benchmark 1: the onnxruntime session created earlier in this file
# (`onnx_session`, `input_name`, `output_name`), mean latency over `roop`
# runs on a fixed all-ones input.
roop = 20
e = 0.0  # accumulated elapsed seconds
inp = np.ones((1,4,480,640), dtype=np.float32)
for _ in range(roop):
    s = time.time()
    result = onnx_session.run(
        [output_name],
        {input_name: inp}
    )
    e += (time.time() - s)
print(f'elapsed time: {e/roop*1000}ms')
"""
elapsed time: 57.117438316345215ms
"""

# Benchmark 2: the same model through the onnx-tensorrt backend on GPU 0.
import onnx
import onnx_tensorrt.backend as be
model = onnx.load('saved_model_sony_480x640/model_float32.onnx')
engine = be.prepare(model, device='CUDA:0')
e = 0.0  # reset the accumulator for the second measurement
for _ in range(roop):
    s = time.time()
    output = engine.run(inp)[0]
    e += (time.time() - s)
print(f'elapsed time: {e/roop*1000}ms')
"""
elapsed time: 13.761746883392334ms
"""
# (Tail of a preprocessing function whose `def` line lies above this chunk:
# it resizes, tensorizes, normalizes and batches a single image.)
    # NOTE(review): torchvision's Normalize signature is (mean, std) —
    # passing (std, mean) looks swapped; confirm against the definitions
    # of `std` and `mean`.
    loader = transforms.Compose(
        [transforms.Resize(128),
         transforms.ToTensor(),
         transforms.Normalize(std, mean)])
    imagen = loader(imagen).float()
    # Add the batch dimension expected by the network.
    imagen = imagen.unsqueeze(0)
    return imagen


def to_numpy(tensor):
    """Convert a torch tensor to a numpy array, detaching first when the
    tensor still tracks gradients."""
    return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()


if FLAG_DETECCION:
    # Start the runtime session: load the detection ONNX model and prepare
    # a TensorRT engine on the configured device.
    modelo_onnx = onnx.load(path_modelo_deteccion)
    engine = backend.prepare(modelo_onnx, device = device)
    # Read the images and run inference over each one, timing each call.
    paths_imagenes = glob.glob(f"{path_imagenes_deteccion}/*.png")
    imagenes = [tr_deteccion(path_imagen) for path_imagen in paths_imagenes]
    for imagen in imagenes:
        imagen1 = imagen.numpy()
        time_start = time.time()
        # NOTE(review): `entradas` is not defined anywhere in this chunk —
        # this likely should be `imagen1` (computed above and otherwise
        # unused); as written it raises NameError unless `entradas` is set
        # elsewhere. Confirm and fix.
        salidas = engine.run(entradas)
        time_end = time.time()
        total_time = time_end - time_start
        prediccion = salidas
[0, 255, 255], # empty ], dtype=np.float32) FULL_LABEL_MAP = np.arange(len(LABEL_NAMES)).reshape(len(LABEL_NAMES), 1) FULL_COLOR_MAP = colormap[FULL_LABEL_MAP] # load model model = onnx.load("deeplab_pruned.onnx") engine_path = 'deeplab_pruned_' + precision + '.engine' engine = backend.prepare(model, device='CUDA:0', serialize_engine=False, precision=precision, max_batch_size=4, load_engine=load_engine, engine_path=engine_path) def list_images(folder, pattern='*', ext='bmp'): """List the images in a specified folder by pattern and extension Args: folder (str): folder containing the images to list pattern (str, optional): a bash-like pattern of the files to select defaults to * (everything) ext(str, optional): the image extension (defaults to png) Returns: str list: list of (filenames) images matching the pattern in the folder
def test_onnx_for_trt(onnx_path, config_path, model_dir, ckpt_path=None):
    """Cross-check the exported PFE ONNX model (run via onnx-tensorrt)
    against the original PyTorch ``voxel_feature_extractor`` on identical
    random dummy inputs, printing both outputs for manual comparison.

    Args:
        onnx_path: path of the exported PFE .onnx file.
        config_path: pipeline config file (protobuf text format).
        model_dir: checkpoint directory, used when *ckpt_path* is None.
        ckpt_path: explicit checkpoint to restore; otherwise the latest
            checkpoint found in *model_dir* is used.
    """
    # Random dummy inputs for the eight PFE network inputs
    # (max 12000 pillars x 100 points each).
    dummy_dev_pillar_x_ = np.random.random(size=(1, 1, 12000, 100)).astype(np.float32)
    dummy_dev_pillar_y_ = np.random.random(size=(1, 1, 12000, 100)).astype(np.float32)
    dummy_dev_pillar_z_ = np.random.random(size=(1, 1, 12000, 100)).astype(np.float32)
    dummy_dev_pillar_i_ = np.random.random(size=(1, 1, 12000, 100)).astype(np.float32)
    dummy_dev_num_points_per_pillar_ = np.random.random(size=(1, 1, 12000, 1)).astype(
        np.float32)
    dummy_dev_x_coors_for_sub_shaped_ = np.random.random(size=(1, 1, 12000, 100)).astype(
        np.float32)
    dummy_dev_y_coors_for_sub_shaped_ = np.random.random(size=(1, 1, 12000, 100)).astype(
        np.float32)
    dummy_dev_pillar_feature_mask_ = np.random.random(size=(1, 1, 12000, 100)).astype(
        np.float32)

    # --- TensorRT side: load the ONNX graph and run once. ---
    model = onnx.load(onnx_path)
    engine = backend.prepare(model, device='CUDA:0', max_batch_size=1)
    print("model read success")
    print()
    output_data = engine.run(
        (dummy_dev_pillar_x_, dummy_dev_pillar_y_, dummy_dev_pillar_z_,
         dummy_dev_pillar_i_, dummy_dev_num_points_per_pillar_,
         dummy_dev_x_coors_for_sub_shaped_, dummy_dev_y_coors_for_sub_shaped_,
         dummy_dev_pillar_feature_mask_))

    # ########## compare with pytorch output #########################
    for i in range(len(output_data)):
        print(output_data[i].shape)
    # First 100 values of the first output, for eyeballing against the
    # PyTorch print at the end of this function.
    print(output_data[0][0, 0, 0:100])

    # --- PyTorch side: rebuild the network from the pipeline config. ---
    model_dir = pathlib.Path(model_dir)
    config = pipeline_pb2.TrainEvalPipelineConfig()
    with open(config_path, "r") as f:
        proto_str = f.read()
    text_format.Merge(proto_str, config)
    model_cfg = config.model.second
    voxel_generator = voxel_builder.build(model_cfg.voxel_generator)
    # Bird's-eye-view range: (x_min, y_min, x_max, y_max) slice of the
    # point-cloud range.
    bv_range = voxel_generator.point_cloud_range[[0, 1, 3, 4]]
    box_coder = box_coder_builder.build(model_cfg.box_coder)
    target_assigner_cfg = model_cfg.target_assigner
    target_assigner = target_assigner_builder.build(target_assigner_cfg,
                                                    bv_range, box_coder)
    net = second_builder_for_official_onnx_and_cuda.build(
        model_cfg, voxel_generator, target_assigner)
    net.cuda()
    net.eval()
    # Since the model structure was changed for ONNX export, don't blindly
    # restore: latest checkpoint when no explicit one is given.
    if ckpt_path is None:
        torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    else:
        torchplus.train.restore(ckpt_path, net)

    # Move the *same* dummy arrays to the GPU so both backends see
    # identical inputs.
    dummy_dev_pillar_x_ = torch.as_tensor(dummy_dev_pillar_x_, device="cuda")
    dummy_dev_pillar_y_ = torch.as_tensor(dummy_dev_pillar_y_, device="cuda")
    dummy_dev_pillar_z_ = torch.as_tensor(dummy_dev_pillar_z_, device="cuda")
    dummy_dev_pillar_i_ = torch.as_tensor(dummy_dev_pillar_i_, device="cuda")
    dummy_dev_num_points_per_pillar_ = torch.as_tensor(
        dummy_dev_num_points_per_pillar_, device="cuda")
    dummy_dev_x_coors_for_sub_shaped_ = torch.as_tensor(
        dummy_dev_x_coors_for_sub_shaped_, device="cuda")
    dummy_dev_y_coors_for_sub_shaped_ = torch.as_tensor(
        dummy_dev_y_coors_for_sub_shaped_, device="cuda")
    dummy_dev_pillar_feature_mask_ = torch.as_tensor(
        dummy_dev_pillar_feature_mask_, device="cuda")
    output_pytorch = net.voxel_feature_extractor(
        dummy_dev_pillar_x_, dummy_dev_pillar_y_, dummy_dev_pillar_z_,
        dummy_dev_pillar_i_, dummy_dev_num_points_per_pillar_,
        dummy_dev_x_coors_for_sub_shaped_, dummy_dev_y_coors_for_sub_shaped_,
        dummy_dev_pillar_feature_mask_)
    # Same slice as the TensorRT print above.
    print(output_pytorch[0, 0, 0:100])
def test(self):
    """Evaluate the model on every test loader, accumulating PSNR per
    (dataset, scale) pair and optionally saving result images.

    Depending on CLI flags, inference runs through one of three paths:
    ``--ngraph`` (export to ONNX each batch, run on the nGraph CPU
    runtime), ``--tensorrt`` (export to ONNX, run via onnx-tensorrt), or
    the plain PyTorch model.

    Returns:
        PSNR of the last evaluated (dataset, scale) pair (numpy scalar).
    """
    import onnx
    from ngraph_onnx.onnx_importer.importer import import_onnx_model
    import ngraph as ng
    # dim0/dim2/dim3 are published as module globals for code outside this
    # method (presumably the ONNX export helpers); they track the shape of
    # the last LR batch.
    global dim0, dim2, dim3
    torch.set_grad_enabled(False)

    epoch = self.optimizer.get_last_epoch() + 1
    self.ckp.write_log('\nEvaluation:')
    # One new log row: (1, n_datasets, n_scales) PSNR accumulator.
    self.ckp.add_log(
        torch.zeros(1, len(self.loader_test), len(self.scale))
    )
    self.model.eval()

    timer_test = utility.timer()
    if self.args.save_results:
        self.ckp.begin_background()
    # print(self.loader_test)
    for idx_data, d in enumerate(self.loader_test):
        for idx_scale, scale in enumerate(self.scale):
            d.dataset.set_scale(idx_scale)
            print('idx_scale={}'.format(idx_scale))
            # print("len: {}".format(len(d)))
            # for lr, hr, filename, _ in tqdm(d, ncols=80):
            for batch, (lr, hr, filename, _) in enumerate(d):
                print('{} '.format(batch), end='', flush=True)
                lr, hr = self.prepare(lr, hr)
                print('test lr.size: {}'.format(lr.size()))
                # Export the current LR shape through the global side
                # channel (dim1, the channel count, is not tracked).
                dim0 = lr.size()[0]
                dim2 = lr.size()[2]
                dim3 = lr.size()[3]
                # Debug tracing switch for the stage prints below.
                showbug = False
                if showbug: print('stage1', flush=True)
                if self.args.ngraph:
                    # --- nGraph path: torch -> ONNX -> nGraph CPU runtime.
                    # Re-exports the model for every batch (shapes vary).
                    pytorch_model_name = self.args.ngraph
                    pytorch_edsr_model = torch.load(pytorch_model_name).cuda()
                    if showbug: print('stage2-1', flush=True)
                    # print(lr.size())
                    # dummy_input = torch.randn_like(lr, device='cuda')
                    if showbug: print('stage2-2', flush=True)
                    edsr_onnx_filename = '{}.onnx'.format(pytorch_model_name)
                    # print('Export to onnx model {}'.format(edsr_onnx_filename))
                    torch.onnx.export(pytorch_edsr_model,
                                      lr.to(torch.device('cuda')),
                                      edsr_onnx_filename,
                                      export_params=True, verbose=False,
                                      training=False)
                    if showbug: print('stage2-3', flush=True)
                    edsr_onnx_model = onnx.load(edsr_onnx_filename)
                    # print(onnx.helper.printable_graph(edsr_onnx_model.graph))
                    if showbug: print('stage2-4', flush=True)
                    ng_models = import_onnx_model(edsr_onnx_model)
                    # print('Convert to nGreph Model')
                    ng_model = ng_models[0]
                    if showbug: print('stage2-5', flush=True)
                    runtime = ng.runtime(backend_name='CPU')
                    if showbug: print('stage2-6', flush=True)
                    edsr_ng_model = runtime.computation(ng_model['output'],
                                                        *ng_model['inputs'])
                    if showbug: print('stage2-7', flush=True)
                    sr = edsr_ng_model(lr, idx_scale)
                    if showbug: print('stage2-8', flush=True)
                    sr = torch.from_numpy(sr)
                    if showbug: print('stage2-9', flush=True)
                elif self.args.tensorrt:
                    # --- TensorRT path: torch -> ONNX -> onnx-tensorrt.
                    pytorch_model_name = self.args.tensorrt
                    pytorch_edsr_model = torch.load(pytorch_model_name)
                    # lr_np = lr.numpy().astype(np.float32)
                    dummy_input = torch.randn_like(lr, device='cuda')
                    edsr_onnx_filename = '{}.onnx'.format(pytorch_model_name)
                    print('Export to onnx model {}'.format(edsr_onnx_filename))
                    torch.onnx.export(pytorch_edsr_model, dummy_input,
                                      edsr_onnx_filename,
                                      export_params=True, verbose=False,
                                      training=False)
                    import os
                    import onnx
                    edsr_onnx_model = onnx.load(edsr_onnx_filename)
                    # print(onnx.helper.printable_graph(edsr_onnx_model.graph))
                    import tensorrt
                    import onnx_tensorrt.backend as backend
                    import numpy as np
                    tensorrt_engine = backend.prepare(edsr_onnx_model, device='CUDA:0')
                    # lr_np = lr_np.to(torch.device("cuda:0"))
                    # lr.numpy().astype(np.float32)
                    sr = tensorrt_engine.run(lr.numpy().astype(np.float32))[0]
                    sr = torch.from_numpy(sr)
                    print('complete one')
                    # NOTE(review): everything from here to 'complete two'
                    # is an exact duplicate of the block above — the export,
                    # engine build and inference all run twice per batch and
                    # the first result is discarded. Looks like a leftover
                    # debugging/benchmark artifact; confirm and remove.
                    pytorch_model_name = self.args.tensorrt
                    pytorch_edsr_model = torch.load(pytorch_model_name)
                    # lr_np = lr.numpy().astype(np.float32)
                    dummy_input = torch.randn_like(lr, device='cuda')
                    edsr_onnx_filename = '{}.onnx'.format(pytorch_model_name)
                    print('Export to onnx model {}'.format(edsr_onnx_filename))
                    torch.onnx.export(pytorch_edsr_model, dummy_input,
                                      edsr_onnx_filename,
                                      export_params=True, verbose=False,
                                      training=False)
                    import os
                    import onnx
                    edsr_onnx_model = onnx.load(edsr_onnx_filename)
                    # print(onnx.helper.printable_graph(edsr_onnx_model.graph))
                    import tensorrt
                    import onnx_tensorrt.backend as backend
                    import numpy as np
                    tensorrt_engine = backend.prepare(edsr_onnx_model, device='CUDA:0')
                    # lr_np = lr_np.to(torch.device("cuda:0"))
                    # lr.numpy().astype(np.float32)
                    sr = tensorrt_engine.run(lr.numpy().astype(np.float32))[0]
                    sr = torch.from_numpy(sr)
                    print('complete two')
                else:
                    # --- Plain PyTorch path.
                    sr = self.model(lr, idx_scale)
                if showbug: print('stage3', flush=True)
                sr = utility.quantize(sr, self.args.rgb_range)
                if showbug: print('stage4', flush=True)
                save_list = [sr]
                if showbug: print('stage5', flush=True)
                self.ckp.log[-1, idx_data, idx_scale] += utility.calc_psnr(
                    sr, hr, scale, self.args.rgb_range, dataset=d
                )
                if showbug: print('stage6', flush=True)
                if self.args.save_gt:
                    save_list.extend([lr, hr])
                if showbug: print('stage7', flush=True)
                if self.args.save_results:
                    self.ckp.save_results(d, filename[0], save_list, scale)
                if showbug: print('stage8', flush=True)
            # Mean PSNR over the dataset for this scale.
            self.ckp.log[-1, idx_data, idx_scale] /= len(d)
            best = self.ckp.log.max(0)
            psnr = self.ckp.log[-1, idx_data, idx_scale].numpy()
            print('')
            self.ckp.write_log(
                '[{} x{}]\tPSNR: {:.3f} (Best: {:.3f} @epoch {})'.format(
                    d.dataset.name,
                    scale,
                    self.ckp.log[-1, idx_data, idx_scale],
                    best[0][idx_data, idx_scale],
                    best[1][idx_data, idx_scale] + 1
                )
            )

    self.ckp.write_log('Forward: {:.2f}s\n'.format(timer_test.toc()))
    self.ckp.write_log('Saving...')
    if self.args.save_results:
        self.ckp.end_background()
    # NOTE(review): `best` and `psnr` hold values from the last loop
    # iteration; with an empty test loader they would be unbound here.
    if not self.args.test_only:
        self.ckp.save(self, epoch, is_best=(best[1][0, 0] + 1 == epoch))
    self.ckp.write_log(
        'Total: {:.2f}s\n'.format(timer_test.toc()), refresh=True
    )
    torch.set_grad_enabled(True)
    return psnr
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
# import matplotlib.pyplot as plt
import time
import os
import copy
import onnx
import tensorrt
import onnx_tensorrt.backend as backend

# Smoke-test the exported ResNet-18: build a TensorRT engine and run one
# random single-image batch through it.
model = onnx.load("resnet18.onnx")
engine = backend.prepare(model, device='CUDA:0', max_batch_size=64)
input_data = np.random.random(size=(1, 3, 224, 224)).astype(np.float32)
output_data = engine.run(input_data)[0]
print(output_data)
print(output_data.shape)

# Data augmentation and normalization for training
# Just normalization for validation
data_transforms = {
    'train': transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.RandomVerticalFlip(),
        transforms.RandomRotation((-180,180)),
        transforms.RandomAffine(degrees=(-30,30), shear=(-20,20)),
        #transforms.Resize(256),
        #transforms.CenterCrop(224),
def tensorrt_backend_pointpillars_onnx(config_path=None):
    """Run the full PointPillars pipeline with TensorRT for both network
    stages: PFE (TensorRT) -> PillarScatter (PyTorch) -> RPN (TensorRT),
    printing the per-stage inference times and the RPN outputs.

    Args:
        config_path: either a path (str) to a pipeline config in protobuf
            text format, or an already-parsed config object.

    Requires ``pfe.onnx``, ``rpn.onnx`` and ``coors.txt`` in the working
    directory.
    """
    import torch
    from second.protos import pipeline_pb2
    from google.protobuf import text_format
    from second.builder import voxel_builder
    from second.pytorch.models.pointpillars import PointPillarsScatter

    ############################# PFE-Layer TensorRT ################################
    # All-ones dummy inputs matching the PFE layout
    # (max 12000 pillars x 100 points each).
    pillar_x = np.ones([1, 1, 12000, 100], dtype=np.float32)
    pillar_y = np.ones([1, 1, 12000, 100], dtype=np.float32)
    pillar_z = np.ones([1, 1, 12000, 100], dtype=np.float32)
    pillar_i = np.ones([1, 1, 12000, 100], dtype=np.float32)
    num_points_per_pillar = np.ones([1, 12000], dtype=np.float32)
    x_sub_shaped = np.ones([1, 1, 12000, 100], dtype=np.float32)
    y_sub_shaped = np.ones([1, 1, 12000, 100], dtype=np.float32)
    mask = np.ones([1, 1, 12000, 100], dtype=np.float32)
    pfe_inputs = [
        pillar_x, pillar_y, pillar_z, pillar_i, num_points_per_pillar,
        x_sub_shaped, y_sub_shaped, mask
    ]
    pfe_model = onnx.load("pfe.onnx")
    engine = backend.prepare(pfe_model, device="CUDA:0", max_batch_size=1)
    pfe_start_time = time.time()
    pfe_outputs = engine.run(pfe_inputs)
    pfe_end_time = time.time()
    print('inference time is : ', (pfe_end_time - pfe_start_time))

    ###################### PillarScatter Python Coder Transfer #########################
    # numpy --> tensor: reshape the PFE output into (num_pillars, channels)
    # as expected by the scatter stage.
    pfe_outs = np.array(pfe_outputs)
    voxel_features_tensor = torch.from_numpy(pfe_outs)
    voxel_features = voxel_features_tensor.squeeze()
    voxel_features = voxel_features.permute(1, 0)

    # Accept either a config file path or an already-parsed config object.
    if isinstance(config_path, str):
        config = pipeline_pb2.TrainEvalPipelineConfig()
        with open(config_path, "r") as f:
            proto_str = f.read()
        text_format.Merge(proto_str, config)
    else:
        config = config_path
    model_cfg = config.model.second
    vfe_num_filters = list(model_cfg.voxel_feature_extractor.num_filters)
    voxel_generator = voxel_builder.build(model_cfg.voxel_generator)
    grid_size = voxel_generator.grid_size
    # Canvas shape for the scatter stage:
    # (batch, grid_z, grid_y, grid_x, channels).
    output_shape = [1] + grid_size[::-1].tolist() + [vfe_num_filters[-1]]
    num_input_features = vfe_num_filters[-1]
    batch_size = 1
    mid_feature_extractor = PointPillarsScatter(output_shape,
                                                num_input_features,
                                                batch_size)
    device = torch.device("cuda:0")
    # Pillar coordinates previously dumped from the real pipeline.
    coors_numpy = np.loadtxt('coors.txt', dtype=np.int32)
    coors = torch.from_numpy(coors_numpy)
    coors = coors.to(device).cuda()  # CPU Tensor --> GPU Tensor
    voxel_features = voxel_features.to(device).cuda()
    rpn_input_features = mid_feature_extractor(voxel_features, coors)

    ########################### RPN Network TensorRT #################################
    # Back to numpy for the second TensorRT engine.
    rpn_input_features = rpn_input_features.data.cpu().numpy()
    rpn_model = onnx.load("rpn.onnx")
    engine_rpn = backend.prepare(rpn_model, device="CUDA:0", max_batch_size=1)
    rpn_start_time = time.time()
    rpn_outputs = engine_rpn.run(rpn_input_features)
    rpn_end_time = time.time()
    print('rpn inference time is : ', (rpn_end_time - rpn_start_time))
    print(rpn_outputs)