def create_session(self):
    sess_opt = ort.SessionOptions()
    sess_opt.enable_profiling = self.profiling
    sess = ort.InferenceSession(self.model, sess_options=sess_opt, providers=[self.provider])
    return sess
def testSessionOptionsAddConfigEntry(self):
    so = onnxrt.SessionOptions()
    key = "CONFIG_KEY"
    val = "CONFIG_VAL"
    so.add_session_config_entry(key, val)
    self.assertEqual(so.get_session_config_entry(key), val)
def _create_ort_training_session(self):
    # Validating frozen_weights names
    unused_frozen_weights = [n for n in self.options.utils.frozen_weights
                             if n not in [i.name for i in self._onnx_model.graph.initializer]]
    if unused_frozen_weights:
        raise RuntimeError("{} params from 'frozen_weights' not found in the ONNX model.".format(
            unused_frozen_weights))

    # Get loss name from model description
    loss_name = [item.name for item in self.model_desc.outputs if item.is_loss]
    assert len(loss_name) == 1, f"Only one loss output is supported ({len(loss_name)} were specified)"
    loss_name = loss_name[0]

    # Parse optimizer parameters
    optimizer_attributes_map = {}
    optimizer_int_attributes_map = {}
    trainable_params = set()
    for initializer in self._onnx_model.graph.initializer:
        if initializer.name in self.options.utils.frozen_weights:
            continue  # only trainable parameters are passed to the backend
        trainable_params.add(initializer.name)
        optimizer_attributes_map[initializer.name] = {}
        optimizer_int_attributes_map[initializer.name] = {}
        not_in_param_groups = True
        for param_group in self.optim_config.params:
            if initializer.name not in param_group['params']:
                continue  # keep looking for a matching param_group
            not_in_param_groups = False
            for k, v in param_group.items():
                # 'params' is not a hyper parameter, skip it. 'lr' per weight is not supported
                if k == 'params' or k == 'lr':
                    continue
                if isinstance(v, float):
                    optimizer_attributes_map[initializer.name][k] = v
                elif isinstance(v, int):
                    optimizer_int_attributes_map[initializer.name][k] = v
                else:
                    raise ValueError("Optimizer attributes must be either float or int.")

        # set default values for params not found in groups
        if not_in_param_groups:
            for k, v in self.optim_config.defaults.items():
                if k == 'lr':
                    continue
                if isinstance(v, float):
                    optimizer_attributes_map[initializer.name][k] = v
                elif isinstance(v, int):
                    optimizer_int_attributes_map[initializer.name][k] = v
                else:
                    raise ValueError("Optimizer attributes must be either float or int.")

    # TrainingParameters
    ort_parameters = ort.TrainingParameters()
    ort_parameters.loss_output_name = loss_name
    ort_parameters.use_mixed_precision = self.options.mixed_precision.enabled
    ort_parameters.world_rank = self.options.distributed.world_rank
    ort_parameters.world_size = self.options.distributed.world_size
    ort_parameters.gradient_accumulation_steps = self.options.batch.gradient_accumulation_steps
    ort_parameters.allreduce_post_accumulation = self.options.distributed.allreduce_post_accumulation
    ort_parameters.deepspeed_zero_stage = self.options.distributed.deepspeed_zero_optimization.stage
    ort_parameters.enable_grad_norm_clip = self.options.utils.grad_norm_clip
    ort_parameters.set_gradients_as_graph_outputs = False
    ort_parameters.use_invertible_layernorm_grad = self.options.utils.invertible_layer_norm_gradient
    ort_parameters.training_optimizer_name = self.optim_config.name
    ort_parameters.lr_params_feed_name = self.model_desc.learning_rate.name
    ort_parameters.weights_to_train = trainable_params
    ort_parameters.optimizer_attributes_map = optimizer_attributes_map
    ort_parameters.optimizer_int_attributes_map = optimizer_int_attributes_map
    ort_parameters.attn_dropout_recompute = self.options.graph_transformer.attn_dropout_recompute
    ort_parameters.gelu_recompute = self.options.graph_transformer.gelu_recompute
    ort_parameters.transformer_layer_recompute = self.options.graph_transformer.transformer_layer_recompute
    ort_parameters.number_recompute_layers = self.options.graph_transformer.number_recompute_layers
    ort_parameters.model_with_training_graph_path = self.options.debug.model_with_training_graph_path

    # SessionOptions
    session_options = ort.SessionOptions()
    session_options.use_deterministic_compute = self.options.debug.deterministic_compute
    if (self.options.graph_transformer.attn_dropout_recompute
            or self.options.graph_transformer.gelu_recompute
            or self.options.graph_transformer.transformer_layer_recompute):
        session_options.execution_order = ort.ExecutionOrder.PRIORITY_BASED

    # The old ORT session may still exist and occupy GPU memory when the new session is created,
    # which may cause an OOM error. For example, load_state_dict will be called before returning
    # from this function, and it calls _init_session again.
    del self._training_session

    # TrainingSession
    self._training_session = ort.TrainingSession(self._onnx_model.SerializeToString(),
                                                 ort_parameters, session_options)

    # I/O bindings
    self._train_io_binding = self._training_session.io_binding()
    self._eval_io_binding = self._training_session.io_binding()
cv2.namedWindow(WINDOW)
if len(sys.argv) > 1:
    capture = cv2.VideoCapture(sys.argv[1])
    mirror_img = False
else:
    capture = cv2.VideoCapture(2)
    mirror_img = True

if capture.isOpened():
    hasFrame, frame = capture.read()
    frame_ct = 0
else:
    hasFrame = False

onnx_file_name = 'Preprocess1x256x256xBGRxByte.onnx'
sess_options = onnxruntime.SessionOptions()
sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
sess_options.enable_profiling = True
ort_session = onnxruntime.InferenceSession(onnx_file_name, sess_options)
input_name = ort_session.get_inputs()[0].name

while hasFrame:
    img1, img2, scale, pad = resize_pad(frame)
    img_in = np.expand_dims(img1, axis=0).astype(np.uint8)
    ort_inputs = {input_name: img_in}
    ort_outs = ort_session.run(None, ort_inputs)
    imgDisp = ort_outs[0][0]
def getSingleSessionProfilingStartTime():
    so = onnxrt.SessionOptions()
    so.enable_profiling = True
    sess = onnxrt.InferenceSession(get_name("mul_1.onnx"), sess_options=so)
    return sess.get_profiling_start_time_ns()
def inference(onnx_model, model_dir, examples):
    quantized_str = ''
    if 'quantized' in onnx_model:
        quantized_str = 'quantized'
    onnx_inference = []
    # pytorch_inference = []
    # onnx session
    options = ort.SessionOptions()
    options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
    options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
    options.intra_op_num_threads = 16  # does not seem to make a difference, always parallelized
    # options.inter_op_num_threads = multiprocessing.cpu_count()
    # logger.info(onnx_model)
    ort_session = ort.InferenceSession(onnx_model, options)

    # pytorch pretrained model and tokenizer
    if 'bertweet' in onnx_model:
        tokenizer = AutoTokenizer.from_pretrained(model_dir, normalization=True)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_dir)
    tokenizer_str = "TokenizerFast"

    # logger.info("**************** {} ONNX inference with batch tokenization and with {} tokenizer****************".format(
    #     quantized_str, tokenizer_str))

    start_batch_tokenization = time.time()
    tokens_dict = tokenizer.batch_encode_plus(examples, max_length=128)
    token0 = get_tokens(tokens_dict, 0)
    examples_chunks_list = chunkIt(examples, NUM_BATCHES)
    tokens_dict_list = [
        tokenizer.batch_encode_plus(chunk, padding='longest')
        for chunk in examples_chunks_list
    ]
    # tokens_dict_list = [tokenizer.batch_encode_plus(chunk, max_length=128) for chunk in examples_chunks_list]
    minibatches_list = []
    for i, token_batch in enumerate(tokens_dict_list):
        minibatch = {}
        number_examples_in_this_batch = len(token_batch['input_ids'])
        minibatch['input_ids'] = np.stack([
            get_tokens(token_batch, i)['input_ids'][0]
            for i in range(number_examples_in_this_batch)
        ], axis=0)
        minibatch['token_type_ids'] = np.stack([
            get_tokens(token_batch, i)['token_type_ids'][0]
            for i in range(number_examples_in_this_batch)
        ], axis=0)
        minibatch['attention_mask'] = np.stack([
            get_tokens(token_batch, i)['attention_mask'][0]
            for i in range(number_examples_in_this_batch)
        ], axis=0)
        # logger.info('y')
        minibatches_list.append(minibatch)
    # tokens_dict = tokenizer.batch_encode_plus(examples, padding='longest')
    total_batch_tokenization_time = time.time() - start_batch_tokenization
    total_inference_time = 0
    total_build_label_time = 0
    start_onnx_inference_batch = time.time()
    # for i in range(len(examples)):
    for i, minibatch in enumerate(minibatches_list):
        """
        Onnx inference with batch tokenization
        """
        # if i % 100 == 0:
        #     logger.info(i, '/', NUM_BATCHES)
        tokens = get_tokens(tokens_dict, i)
        # inference
        start_inference = time.time()
        # ort_outs = ort_session.run(None, tokens)
        ort_outs = ort_session.run(None, minibatch)
        total_inference_time = total_inference_time + (time.time() - start_inference)
        # build label
        start_build_label = time.time()
        torch_onnx_output = torch.tensor(ort_outs[0], dtype=torch.float32)
        onnx_logits = F.softmax(torch_onnx_output, dim=1)
        logits_label = torch.argmax(onnx_logits, dim=1)
        label = logits_label.detach().cpu().numpy()
        # onnx_inference.append(label[0])
        # onnx_inference.append(onnx_logits.detach().cpu().numpy().tolist())
        # TODO might be able to make this faster by using arrays with pre-defined size instead of mutating lists like this
        onnx_inference = onnx_inference + onnx_logits.detach().cpu().numpy().tolist()
        # onnx_inference.append(onnx_logits.detach().cpu().numpy()[0].tolist())
        # total_build_label_time = total_build_label_time + (time.time() - start_build_label)
        # logger.info(i, label[0], onnx_logits.detach().cpu().numpy()[0].tolist(), type(onnx_logits.detach().cpu().numpy()[0]))
    end_onnx_inference_batch = time.time()
    # logger.info("Total batch tokenization time (in seconds): ", total_batch_tokenization_time)
    # logger.info("Total inference time (in seconds): ", total_inference_time)
    # logger.info("Total build label time (in seconds): ", total_build_label_time)
    # logger.info("Duration ONNX inference (in seconds) with {} and batch tokenization: ".format(tokenizer_str),
    # logger.info("Duration ONNX inference (in seconds): ",
    #             end_onnx_inference_batch - start_onnx_inference_batch,
    #             (end_onnx_inference_batch - start_onnx_inference_batch) / len(examples))
    # logger.info(onnx_inference)
    return onnx_inference
def main():
    args = parse_arguments()
    setup_logger(args.verbose)
    dump_environment()

    enable_past_input = args.enable_past_input

    cache_dir = args.cache_dir
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)

    output_dir = args.output_dir
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    (model_class, tokenizer_class, model_name_or_path) = MODEL_CLASSES[args.model_type]
    tokenizer = tokenizer_class.from_pretrained(model_name_or_path, cache_dir=cache_dir)
    model = model_class.from_pretrained(model_name_or_path, cache_dir=cache_dir)
    model.eval().cpu()

    inputs = tokenizer.encode_plus("Here is an example input for GPT2 model",
                                   add_special_tokens=True,
                                   return_tensors='pt')
    input_ids = inputs['input_ids']
    outputs = model(input_ids=input_ids, past=None)

    num_layer = model.config.n_layer
    present_names = [f'present_{i}' for i in range(num_layer)]
    output_names = ["last_state"] + present_names

    input_names = ['input_ids']
    dynamic_axes = {
        'input_ids': {0: 'batch_size', 1: 'seq_len'},
        'last_state': {0: 'batch_size', 1: 'seq_len'}
    }
    for name in present_names:
        dynamic_axes[name] = {1: 'batch_size', 3: 'seq_len'}

    if enable_past_input:
        past_names = [f'past_{i}' for i in range(num_layer)]
        input_names = ['input_ids'] + past_names
        dummy_past = [torch.zeros(list(outputs[1][0].shape)) for _ in range(num_layer)]
        for name in past_names:
            dynamic_axes[name] = {1: 'batch_size', 3: 'seq_len'}
        export_inputs = (inputs['input_ids'], tuple(dummy_past))
    else:
        export_inputs = (inputs['input_ids'])

    export_model_path = os.path.join(output_dir, 'gpt2_past{}.onnx'.format(int(enable_past_input)))

    torch.onnx.export(model,
                      args=export_inputs,
                      f=export_model_path,
                      input_names=input_names,
                      output_names=output_names,
                      dynamic_axes=dynamic_axes,
                      opset_version=11,
                      do_constant_folding=True,
                      verbose=False)

    # Let's run performance test on PyTorch before updating environment variable.
    past = dummy_past if enable_past_input else None
    outputs = pytorch_inference(model, input_ids, past, total_runs=args.total_runs)

    # setup environment variables before importing onnxruntime.
    setup_environment(args.use_openmp)
    import onnxruntime

    if enable_past_input:
        onnx_model_path = export_model_path
    else:
        onnx_model_path = os.path.join(output_dir,
                                       'gpt2_past{}_out1.onnx'.format(int(enable_past_input)))
        remove_past_outputs(export_model_path, onnx_model_path)

    if args.enable_optimization:
        from optimizer import optimize_model
        m = optimize_model(onnx_model_path,
                           model_type='gpt2',
                           num_heads=12,
                           hidden_size=768,
                           opt_level=0,
                           optimization_options=None)
        onnx_model_path = os.path.join(output_dir,
                                       'gpt2_past{}_optimized.onnx'.format(int(enable_past_input)))
        m.save_model_to_file(onnx_model_path)

    if 'CUDAExecutionProvider' in onnxruntime.get_available_providers():
        logger.warning(
            "onnxruntime-gpu is not built with OpenMP. You might try onnxruntime package to test CPU inference.")

    sess_options = onnxruntime.SessionOptions()

    if args.use_openmp:
        sess_options.intra_op_num_threads = 1
    else:
        sess_options.intra_op_num_threads = psutil.cpu_count(logical=True)
    logger.info(f"session option: intra_op_num_threads={sess_options.intra_op_num_threads}")

    logger.info(f"Start inferencing onnx model: {onnx_model_path}")
    session = onnxruntime.InferenceSession(onnx_model_path,
                                           sess_options,
                                           providers=['CPUExecutionProvider'])

    ort_outputs = onnxruntime_inference(session, input_ids, past, args.total_runs)

    if args.verify_outputs:
        logger.info('PyTorch and OnnxRuntime output 0 (last_state) are close:',
                    numpy.allclose(ort_outputs[0], outputs[0].cpu(), rtol=1e-05, atol=1e-04))

        for layer in range(model.config.n_layer):
            logger.info(
                'PyTorch and OnnxRuntime layer {} state (present_{}) are close:'.format(layer, layer),
                numpy.allclose(ort_outputs[1 + layer], outputs[1][layer].cpu(), rtol=1e-05, atol=1e-04))
def main(): model = GoogleModel(64, 5, 2, 32, 8, True, None, False) batch_size = 1000 input = torch.randn(batch_size, 5, 9, 9) outputs = model(input) model.eval() # model(input)[0][0, 0].item() # print("torch CPU:", timeit.timeit(lambda: model(input)[0][0, 0].item(), number=20)) # model.cuda() # input = input.cuda() # model(input)[0][0, 0].item() # print("torch CUDA:", timeit.timeit(lambda: model(input)[0][0, 0].item(), number=20)) print("Exporting to onnx") torch.onnx.export( model, input, "../../data/onnx/small.onnx", example_outputs=outputs, opset_version=12, input_names=["input"], output_names=["value", "policy"], dynamic_axes={ "input": { 0: "batch_size" }, "value": { 0: "batch_size" }, "policy": { 0: "batch_size" } }, training=TrainingMode.EVAL, ) # print("Quantizing") # quantize_dynamic("../../data/onnx/small.onnx", "../../data/onnx/small_quant.onnx", weight_type=QuantType.QUInt8) print("Loading onnx") model = onnx.load("../../data/onnx/small.onnx") onnx.checker.check_model(model) model = onnx.load("../../data/onnx/small_quant.onnx") onnx.checker.check_model(model) print("Building profile") sess_options = onnxruntime.SessionOptions() sess_options.enable_profiling = True # sess_options.log_severity_level = 0 session = onnxruntime.InferenceSession("../../data/onnx/small.onnx", sess_options=sess_options) print("Running model") onnx_input = input.cpu().numpy() _ = session.run(None, {"input": onnx_input}) _ = session.run(None, {"input": onnx_input}) rounds = 100 delta = timeit.timeit(lambda: session.run(None, {"input": onnx_input}), number=rounds) throughput = batch_size * rounds / delta print(f"onnx cuda (?): {throughput} boards/s")
def load_a_batch(batch_filenames):
    unconcatenated_batch_data = []
    for image_filename in batch_filenames:
        image_filepath = imagenet_path + '/' + image_filename
        nchw_data = load_and_resize_image(image_filepath, height, width)
        unconcatenated_batch_data.append(nchw_data)
    batch_data = np.concatenate(unconcatenated_batch_data, axis=0)
    return batch_data


#print("Device: " + rt.get_device())
sess_options = rt.SessionOptions()
if CPU_THREADS > 0:
    sess_options.enable_sequential_execution = False
    sess_options.session_thread_pool_size = CPU_THREADS
sess = rt.InferenceSession(model_path, sess_options)

input_layer_names = [x.name for x in sess.get_inputs()]  # FIXME: check that input_layer_name belongs to this list
input_layer_name = input_layer_name or input_layer_names[0]

output_layer_names = [x.name for x in sess.get_outputs()]  # FIXME: check that output_layer_name belongs to this list
output_layer_name = output_layer_name or output_layer_names[0]
def test_modulated_deform_conv2d(): try: from mmcv.ops import ModulatedDeformConv2d, get_onnxruntime_op_path except (ImportError, ModuleNotFoundError): pytest.skip('modulated_deform_conv op is not successfully compiled') ort_custom_op_path = get_onnxruntime_op_path() # modulated deform conv config in_channels = 3 out_channels = 64 stride = 1 padding = 0 dilation = 1 groups = 1 deform_groups = 1 kernel_size = 3 input = torch.rand(1, in_channels, 28, 28).cuda() # (n, c, h, w) conv_offset = nn.Conv2d(in_channels=3, out_channels=deform_groups * 3 * kernel_size * kernel_size, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, bias=True).cuda() conv_offset.cuda() out = conv_offset(input) o1, o2, mask = torch.chunk(out, 3, dim=1) offset = torch.cat((o1, o2), dim=1) mask = torch.sigmoid(mask) model_with_bias = ModulatedDeformConv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, deform_groups, bias=True) model_without_bias = ModulatedDeformConv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, deform_groups, bias=False) models = [model_with_bias.cuda(), model_without_bias.cuda()] for model in models: # export and load onnx model with torch.no_grad(): torch.onnx.export(model, (input, offset, mask), onnx_file, export_params=True, keep_initializers_as_inputs=True, input_names=['input', 'offset', 'mask'], opset_version=11) session_options = rt.SessionOptions() if os.path.exists(ort_custom_op_path): session_options.register_custom_ops_library(ort_custom_op_path) # compute onnx_output sess = rt.InferenceSession(onnx_file, session_options) onnx_output = sess.run( None, { 'input': input.cpu().detach().numpy(), 'offset': offset.cpu().detach().numpy(), 'mask': mask.cpu().detach().numpy() })[0] # compute pytorch_output with torch.no_grad(): pytorch_output = model(input, offset, mask).cpu() # allclose assert np.allclose(pytorch_output, onnx_output, atol=1e-3)
def test_deform_conv2d(threshold=1e-3): try: from mmcv.ops import DeformConv2d, get_onnxruntime_op_path except (ImportError, ModuleNotFoundError): pytest.skip('deform_conv op is not successfully compiled') ort_custom_op_path = get_onnxruntime_op_path() if not os.path.exists(ort_custom_op_path): pytest.skip('custom ops for onnxruntime are not compiled.') # deform conv config # modulated deform conv config in_channels = 1 out_channels = 64 stride = 1 padding = 0 dilation = 1 groups = 1 deform_groups = 1 kernel_size = 2 input = [[[[1., 2., 3.], [0., 1., 2.], [3., 5., 2.]]]] offset_weight = [[[0.1, 0.4, 0.6, 0.1]], [[0.3, 0.2, 0.1, 0.3]], [[0.5, 0.5, 0.2, 0.8]], [[0.8, 0.3, 0.9, 0.1]], [[0.3, 0.1, 0.2, 0.5]], [[0.3, 0.7, 0.5, 0.3]], [[0.6, 0.2, 0.5, 0.3]], [[0.4, 0.1, 0.8, 0.4]]] offset_bias = [0.7, 0.1, 0.8, 0.5, 0.6, 0.5, 0.4, 0.7] deform_weight = [[[0.4, 0.2, 0.1, 0.9]]] x = torch.tensor(input) conv_offset = nn.Conv2d(in_channels=in_channels, out_channels=deform_groups * 2 * kernel_size * kernel_size, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, bias=True) conv_offset.weight.data = torch.nn.Parameter( torch.Tensor(offset_weight).reshape(8, 1, 2, 2)) conv_offset.bias.data = torch.nn.Parameter( torch.Tensor(offset_bias).reshape(8)) offset = conv_offset(x) model = DeformConv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, deform_groups) model.weight.data = torch.nn.Parameter( torch.Tensor(deform_weight).reshape(1, 1, 2, 2)) with torch.no_grad(): torch.onnx.export(model, (x, offset), onnx_file, export_params=True, keep_initializers_as_inputs=True, input_names=['input', 'offset'], opset_version=11) session_options = rt.SessionOptions() if os.path.exists(ort_custom_op_path): session_options.register_custom_ops_library(ort_custom_op_path) # compute onnx_output sess = rt.InferenceSession(onnx_file, session_options) onnx_output = sess.run( None, { 'input': x.cpu().detach().numpy(), 'offset': offset.cpu().detach().numpy(), })[0] # compute pytorch_output with torch.no_grad(): pytorch_output = model(x, offset).cpu() # allclose assert np.allclose(pytorch_output, onnx_output, atol=1e-3)
def test_cummax_cummin(key, opset=11): if torch.__version__ == 'parrots': pytest.skip('onnx is not supported in parrots directly') # Note generally `cummax` or `cummin` is exportable to ONNX # as long as the pytorch version >= 1.5.0, since `torch.cummax` # is only supported with torch >= 1.5.0. # But when `cummax` or `cummin` serves as an intermediate component # whose outputs is used as inputs for another modules, it's expected # that pytorch version must be >= 1.7.0. Otherwise error appears like: # `RuntimeError: tuple appears in op that does not forward tuples, # unsupported 'kind: prim::PythonOp`. if version.parse(torch.__version__) < version.parse('1.7.0'): pytest.skip('test_cummax_cummin should be ran with pytorch >= 1.7.0') # register custom op `mmcv::cummax` and `mmcv::cummin` from mmcv.onnx.symbolic import register_extra_symbolics register_extra_symbolics(opset) from mmcv.ops import get_onnxruntime_op_path ort_custom_op_path = get_onnxruntime_op_path() if not os.path.exists(ort_custom_op_path): pytest.skip('custom ops for onnxruntime are not compiled.') input_list = [ # arbitrary shape, e.g. 1-D, 2-D, 3-D, ... torch.rand((2, 3, 4, 1, 5)), torch.rand((1)), torch.rand((2, 0, 1)), # tensor.numel() is 0 torch.FloatTensor(), # empty tensor ] cummax_cummin_funcs = {'cummax': torch.cummax, 'cummin': torch.cummin} for input in input_list: ndims = input.dim() # valid dim range is [-ndims, ndims-1] # test for all `dim` value which is valid for dim in range(-ndims, ndims): cummax_func = partial(cummax_cummin_funcs[key], dim=dim) wrapped_model = WrapFunction(cummax_func).eval() with torch.no_grad(): torch.onnx.export(wrapped_model, input, onnx_file, export_params=True, keep_initializers_as_inputs=True, input_names=['input'], output_names=['output', 'indices'], opset_version=opset) onnx_model = onnx.load(onnx_file) input_all = [node.name for node in onnx_model.graph.input] input_initializer = [ node.name for node in onnx_model.graph.initializer ] net_feed_input = list(set(input_all) - set(input_initializer)) assert (len(net_feed_input) == 1) session_options = rt.SessionOptions() session_options.register_custom_ops_library(ort_custom_op_path) sess = rt.InferenceSession(onnx_file, session_options) ort_output, ort_inds = sess.run(None, {'input': input.detach().numpy()}) pytorch_output, pytorch_inds = wrapped_model(input.clone()) pytorch_output = pytorch_output.detach().numpy() pytorch_inds = pytorch_inds.detach().numpy() assert np.allclose(pytorch_output, ort_output, atol=1e-5) assert np.all(pytorch_inds == ort_inds)
def test_roialign_rotated(): if torch.__version__ == 'parrots': pytest.skip('onnx is not supported in parrots directly') try: from mmcv.ops import get_onnxruntime_op_path, roi_align_rotated except (ImportError, ModuleNotFoundError): pytest.skip('roi_align_aligned op is not successfully compiled') ort_custom_op_path = get_onnxruntime_op_path() if not os.path.exists(ort_custom_op_path): pytest.skip('custom ops for onnxruntime are not compiled.') # roi align config pool_h = 2 pool_w = 2 spatial_scale = 1.0 sampling_ratio = 2 inputs = [([[[[1., 2.], [3., 4.]]]], [[0., 0.5, 0.5, 1., 1., 0]]), ([[[[1., 2.], [3., 4.]]]], [[0., 0.5, 0.5, 1., 1., np.pi / 2]]), ([[[[1., 2.], [3., 4.]], [[4., 3.], [2., 1.]]]], [[0., 0.5, 0.5, 1., 1., 0]]), ([[[[1., 2., 5., 6.], [3., 4., 7., 8.], [9., 10., 13., 14.], [11., 12., 15., 16.]]]], [[0., 1.5, 1.5, 3., 3., 0]]), ([[[[1., 2., 5., 6.], [3., 4., 7., 8.], [9., 10., 13., 14.], [11., 12., 15., 16.]]]], [[0., 1.5, 1.5, 3., 3., np.pi / 2]])] def warpped_function(torch_input, torch_rois): return roi_align_rotated(torch_input, torch_rois, (pool_w, pool_h), spatial_scale, sampling_ratio, True, False) for case in inputs: np_input = np.array(case[0], dtype=np.float32) np_rois = np.array(case[1], dtype=np.float32) input = torch.from_numpy(np_input) rois = torch.from_numpy(np_rois) # compute pytorch_output with torch.no_grad(): pytorch_output = roi_align_rotated(input, rois, (pool_w, pool_h), spatial_scale, sampling_ratio, True, False) # export and load onnx model wrapped_model = WrapFunction(warpped_function) with torch.no_grad(): torch.onnx.export(wrapped_model, (input, rois), onnx_file, export_params=True, keep_initializers_as_inputs=True, input_names=['features', 'rois'], opset_version=11) onnx_model = onnx.load(onnx_file) session_options = rt.SessionOptions() if os.path.exists(ort_custom_op_path): session_options.register_custom_ops_library(ort_custom_op_path) # compute onnx_output input_all = [node.name for node in onnx_model.graph.input] input_initializer = [ node.name for node in onnx_model.graph.initializer ] net_feed_input = list(set(input_all) - set(input_initializer)) assert (len(net_feed_input) == 2) sess = rt.InferenceSession(onnx_file, session_options) onnx_output = sess.run(None, { 'features': input.detach().numpy(), 'rois': rois.detach().numpy() }) onnx_output = onnx_output[0] # allclose assert np.allclose(pytorch_output, onnx_output, atol=1e-3)
def test_softnms():
    if torch.__version__ == 'parrots':
        pytest.skip('onnx is not supported in parrots directly')
    from mmcv.ops import get_onnxruntime_op_path, soft_nms

    # only support pytorch >= 1.7.0
    if version.parse(torch.__version__) < version.parse('1.7.0'):
        warnings.warn('test_softnms should be run with pytorch >= 1.7.0')
        return

    # only support onnxruntime >= 1.5.1
    assert version.parse(rt.__version__) >= version.parse(
        '1.5.1'), 'test_softnms should be run with onnxruntime >= 1.5.1'

    ort_custom_op_path = get_onnxruntime_op_path()
    if not os.path.exists(ort_custom_op_path):
        pytest.skip('softnms for onnxruntime is not compiled.')

    np_boxes = np.array([[6.0, 3.0, 8.0, 7.0], [3.0, 6.0, 9.0, 11.0],
                         [3.0, 7.0, 10.0, 12.0], [1.0, 4.0, 13.0, 7.0]],
                        dtype=np.float32)
    np_scores = np.array([0.6, 0.9, 0.7, 0.2], dtype=np.float32)

    boxes = torch.from_numpy(np_boxes)
    scores = torch.from_numpy(np_scores)

    configs = [[0.3, 0.5, 0.01, 'linear'], [0.3, 0.5, 0.01, 'gaussian'],
               [0.3, 0.5, 0.01, 'naive']]

    session_options = rt.SessionOptions()
    session_options.register_custom_ops_library(ort_custom_op_path)

    for _iou_threshold, _sigma, _min_score, _method in configs:
        pytorch_dets, pytorch_inds = soft_nms(boxes,
                                              scores,
                                              iou_threshold=_iou_threshold,
                                              sigma=_sigma,
                                              min_score=_min_score,
                                              method=_method)
        nms = partial(soft_nms,
                      iou_threshold=_iou_threshold,
                      sigma=_sigma,
                      min_score=_min_score,
                      method=_method)

        wrapped_model = WrapFunction(nms)
        wrapped_model.cpu().eval()
        with torch.no_grad():
            torch.onnx.export(wrapped_model, (boxes, scores),
                              onnx_file,
                              export_params=True,
                              keep_initializers_as_inputs=True,
                              input_names=['boxes', 'scores'],
                              opset_version=11)
        onnx_model = onnx.load(onnx_file)

        # get onnx output
        input_all = [node.name for node in onnx_model.graph.input]
        input_initializer = [node.name for node in onnx_model.graph.initializer]
        net_feed_input = list(set(input_all) - set(input_initializer))
        assert (len(net_feed_input) == 2)

        sess = rt.InferenceSession(onnx_file, session_options)
        onnx_dets, onnx_inds = sess.run(None, {
            'scores': scores.detach().numpy(),
            'boxes': boxes.detach().numpy()
        })

        assert np.allclose(pytorch_dets, onnx_dets, atol=1e-3)
        assert np.allclose(pytorch_inds, onnx_inds, atol=1e-3)
def evaluate(opt):
    # set config
    config = load_config(opt)
    if opt.num_threads > 0:
        torch.set_num_threads(opt.num_threads)
    config['opt'] = opt
    logger.info("%s", config)

    # set path
    set_path(config)

    # prepare test dataset
    test_loader = prepare_datasets(config)

    # load pytorch model checkpoint
    checkpoint = load_checkpoint(config)

    # prepare model and load parameters
    model = load_model(config, checkpoint)
    model.eval()

    # convert to onnx format
    if opt.convert_onnx:
        (x, y) = next(iter(test_loader))
        x = to_device(x, opt.device)
        y = to_device(y, opt.device)
        convert_onnx(config, model, x)
        check_onnx(config)
        logger.info("[ONNX model saved at {}".format(opt.onnx_path))
        return

    # load onnx model for using onnxruntime
    if opt.enable_ort:
        import onnxruntime as ort
        sess_options = ort.SessionOptions()
        sess_options.inter_op_num_threads = opt.num_threads
        sess_options.intra_op_num_threads = opt.num_threads
        ort_session = ort.InferenceSession(opt.onnx_path, sess_options=sess_options)

    # convert to tvm format
    if opt.convert_tvm:
        (x, y) = next(iter(test_loader))
        x = to_device(x, opt.device)
        y = to_device(y, opt.device)
        convert_tvm(config, model, x)
        logger.info("[TVM model saved at {}".format(opt.tvm_path))
        return

    # enable to use dynamic quantized model (pytorch>=1.3.0)
    if opt.enable_dqm and opt.device == 'cpu':
        model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
        print(model)

    # evaluation
    preds = None
    ys = None
    correct = 0
    n_batches = len(test_loader)
    total_examples = 0
    whole_st_time = time.time()
    first_time = time.time()
    first_examples = 0
    total_duration_time = 0.0
    with torch.no_grad():
        for i, (x, y) in enumerate(tqdm(test_loader, total=n_batches)):
            start_time = time.time()
            x = to_device(x, opt.device)
            y = to_device(y, opt.device)

            if opt.enable_ort:
                x = to_numpy(x)
                if config['emb_class'] == 'glove':
                    ort_inputs = {ort_session.get_inputs()[0].name: x}
                if config['emb_class'] in ['bert', 'distilbert', 'albert', 'roberta', 'bart', 'electra']:
                    if config['emb_class'] in ['distilbert', 'bart']:
                        ort_inputs = {
                            ort_session.get_inputs()[0].name: x[0],
                            ort_session.get_inputs()[1].name: x[1]
                        }
                    else:
                        ort_inputs = {
                            ort_session.get_inputs()[0].name: x[0],
                            ort_session.get_inputs()[1].name: x[1],
                            ort_session.get_inputs()[2].name: x[2]
                        }
                logits = ort_session.run(None, ort_inputs)[0]
                logits = to_device(torch.tensor(logits), opt.device)
            else:
                logits = model(x)

            if preds is None:
                preds = to_numpy(logits)
                ys = to_numpy(y)
            else:
                preds = np.append(preds, to_numpy(logits), axis=0)
                ys = np.append(ys, to_numpy(y), axis=0)

            predicted = logits.argmax(1)
            correct += (predicted == y).sum().item()
            cur_examples = y.size(0)
            total_examples += cur_examples
            if i == 0:
                # first one may take longer time, so ignore in computing duration.
                first_time = float((time.time() - first_time) * 1000)
                first_examples = cur_examples
            if opt.num_examples != 0 and total_examples >= opt.num_examples:
                logger.info("[Stop Evaluation] : up to the {} examples".format(total_examples))
                break
            duration_time = float((time.time() - start_time) * 1000)
            if i != 0:
                total_duration_time += duration_time
            '''
            logger.info("[Elapsed Time] : {}ms".format(duration_time))
            '''

    # generate report
    labels = model.labels
    label_names = [v for k, v in sorted(labels.items(), key=lambda x: x[0])]
    preds_ids = np.argmax(preds, axis=1)
    try:
        print(classification_report(ys, preds_ids, target_names=label_names, digits=4))
        print(labels)
        print(confusion_matrix(ys, preds_ids))
    except Exception as e:
        logger.warn(str(e))
    acc = correct / total_examples
    whole_time = float((time.time() - whole_st_time) * 1000)
    avg_time = (whole_time - first_time) / (total_examples - first_examples)

    # write predictions to file
    write_prediction(opt, preds, labels)

    logger.info("[Accuracy] : {:.4f}, {:5d}/{:5d}".format(acc, correct, total_examples))
    logger.info("[Elapsed Time] : {}ms, {}ms on average".format(whole_time, avg_time))
    logger.info("[Elapsed Time(total_duration_time, average)] : {}ms, {}ms".format(
        total_duration_time, total_duration_time / (total_examples - 1)))
def _load_model(args) -> Any: # validation if args.device not in [None, "cpu"] and args.engine != TORCH_ENGINE: raise ValueError( f"device {args.device} is not supported for {args.engine}") if args.fp16 and args.engine != TORCH_ENGINE: raise ValueError(f"half precision is not supported for {args.engine}") if args.quantized_inputs and args.engine == TORCH_ENGINE: raise ValueError(f"quantized inputs not supported for {args.engine}") if args.num_cores is not None and args.engine == TORCH_ENGINE: raise ValueError( f"overriding default num_cores not supported for {args.engine}") if (args.num_cores is not None and args.engine == ORT_ENGINE and onnxruntime.__version__ < "1.7"): raise ValueError( "overriding default num_cores not supported for onnxruntime < 1.7.0. " "If using an older build with OpenMP, try setting the OMP_NUM_THREADS " "environment variable") # scale static ONNX graph to desired image shape if args.engine in [DEEPSPARSE_ENGINE, ORT_ENGINE]: args.model_filepath, _ = modify_yolo_onnx_input_shape( args.model_filepath, args.image_shape) has_postprocessing = yolo_onnx_has_postprocessing(args.model_filepath) # load model if args.engine == DEEPSPARSE_ENGINE: _LOGGER.info(f"Compiling DeepSparse model for {args.model_filepath}") model = compile_model(args.model_filepath, 1, args.num_cores) if args.quantized_inputs and not model.cpu_vnni: _LOGGER.warning("WARNING: VNNI instructions not detected, " "quantization speedup not well supported") elif args.engine == ORT_ENGINE: _LOGGER.info(f"Loading onnxruntime model for {args.model_filepath}") sess_options = onnxruntime.SessionOptions() if args.num_cores is not None: sess_options.intra_op_num_threads = args.num_cores sess_options.log_severity_level = 3 sess_options.graph_optimization_level = ( onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL) onnx_model = onnx.load(args.model_filepath) override_model_batch_size(onnx_model, 1) model = onnxruntime.InferenceSession(onnx_model.SerializeToString(), sess_options=sess_options) elif args.engine == TORCH_ENGINE: _LOGGER.info(f"Loading torch model for {args.model_filepath}") model = torch.load(args.model_filepath) if isinstance(model, dict): model = model["model"] model.to(args.device) model.eval() if args.fp16: _LOGGER.info("Using half precision") model.half() else: _LOGGER.info("Using full precision") model.float() has_postprocessing = True return model, has_postprocessing
def inference(opt): # set config config = load_config(opt) if opt.num_threads > 0: torch.set_num_threads(opt.num_threads) config['opt'] = opt # set path: opt.embedding_path, opt.vocab_path, opt.label_path set_path(config) # load pytorch model checkpoint checkpoint = load_checkpoint(config) # prepare model and load parameters model = load_model(config, checkpoint) model.eval() # load onnx model for using onnxruntime if opt.enable_ort: import onnxruntime as ort sess_options = ort.SessionOptions() sess_options.inter_op_num_threads = opt.num_threads sess_options.intra_op_num_threads = opt.num_threads ort_session = ort.InferenceSession(opt.onnx_path, sess_options=sess_options) # enable to use dynamic quantized model (pytorch>=1.3.0) if opt.enable_dqm and opt.device == 'cpu': model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8) print(model) # prepare tokenizer tokenizer = prepare_tokenizer(config, model) # prepare labels labels = model.labels # inference f_out = open(opt.test_path + '.inference', 'w', encoding='utf-8') total_examples = 0 total_duration_time = 0.0 with torch.no_grad(), open(opt.test_path, 'r', encoding='utf-8') as f: for i, line in enumerate(f): start_time = time.time() sent, label = line.strip().split('\t') x_raw = sent.split() y_raw = label text = ' '.join(x_raw) x = encode_text(config, tokenizer, text) x = to_device(x, opt.device) if opt.enable_ort: x = to_numpy(x) if config['emb_class'] == 'glove': ort_inputs = {ort_session.get_inputs()[0].name: x} if config['emb_class'] in [ 'bert', 'distilbert', 'albert', 'roberta', 'bart', 'electra' ]: if config['emb_class'] in ['distilbert', 'bart']: ort_inputs = { ort_session.get_inputs()[0].name: x[0], ort_session.get_inputs()[1].name: x[1] } else: ort_inputs = { ort_session.get_inputs()[0].name: x[0], ort_session.get_inputs()[1].name: x[1], ort_session.get_inputs()[2].name: x[2] } logits = ort_session.run(None, ort_inputs)[0] logits = to_device(torch.tensor(logits), opt.device) else: logits = model(x) predicted = logits.argmax(1) predicted = to_numpy(predicted)[0] predicted_raw = labels[predicted] f_out.write(text + '\t' + y_raw + '\t' + predicted_raw + '\n') total_examples += 1 if opt.num_examples != 0 and total_examples >= opt.num_examples: logger.info("[Stop Inference] : up to the {} examples".format( total_examples)) break duration_time = float((time.time() - start_time) * 1000) if i != 0: total_duration_time += duration_time logger.info("[Elapsed Time] : {}ms".format(duration_time)) f_out.close() logger.info( "[Elapsed Time(total_duration_time, average)] : {}ms, {}ms".format( total_duration_time, total_duration_time / (total_examples - 1)))
def inference(onnx_model, model_dir, examples, fast_tokenizer, num_threads): quantized_str = '' if 'quantized' in onnx_model: quantized_str = 'quantized' onnx_inference = [] # pytorch_inference = [] # onnx session options = ort.SessionOptions() options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL options.intra_op_num_threads = 1 print(onnx_model) ort_session = ort.InferenceSession(onnx_model, options) # pytorch pretrained model and tokenizer tokenizer = BertTokenizerFast.from_pretrained(model_dir) tokenizer_str = "BertTokenizerFast" print( "**************** {} ONNX inference with batch tokenization and with {} tokenizer****************" .format(quantized_str, tokenizer_str)) start_onnx_inference_batch = time.time() start_batch_tokenization = time.time() tokens_dict = tokenizer.batch_encode_plus(examples, max_length=128) total_batch_tokenization_time = time.time() - start_batch_tokenization total_inference_time = 0 total_build_label_time = 0 for i in range(len(examples)): """ Onnx inference with batch tokenization """ if i % 100 == 0: print('[inference... ]', i, 'out of ', len(examples)) tokens = get_tokens(tokens_dict, i) #inference start_inference = time.time() ort_outs = ort_session.run(None, tokens) total_inference_time = total_inference_time + (time.time() - start_inference) #build label start_build_label = time.time() torch_onnx_output = torch.tensor(ort_outs[0], dtype=torch.float32) onnx_logits = F.softmax(torch_onnx_output, dim=1) logits_label = torch.argmax(onnx_logits, dim=1) label = logits_label.detach().cpu().numpy() # onnx_inference.append(label[0]) onnx_inference.append(onnx_logits.detach().cpu().numpy()[0].tolist()) total_build_label_time = total_build_label_time + (time.time() - start_build_label) # print(i, label[0], onnx_logits.detach().cpu().numpy()[0].tolist(), type(onnx_logits.detach().cpu().numpy()[0]) ) end_onnx_inference_batch = time.time() print("Total batch tokenization time (in seconds): ", total_batch_tokenization_time) print("Total inference time (in seconds): ", total_inference_time) print("Total build label time (in seconds): ", total_build_label_time) print( "Duration ONNX inference (in seconds) with {} and batch tokenization: " .format(tokenizer_str), end_onnx_inference_batch - start_onnx_inference_batch, (end_onnx_inference_batch - start_onnx_inference_batch) / len(examples)) return onnx_inference
def generate_test_data(onnx_file, output_path, batch_size, sequence_length, use_cpu=True, input_tensor_only=False, dictionary_size=DICT_SIZE, test_cases=3): input_data_type = np.int32 for test_case in range(test_cases): input_1 = np.random.randint(dictionary_size, size=(batch_size, sequence_length), dtype=input_data_type) tensor_1 = numpy_helper.from_array(input_1, 'input_ids') actual_seq_len = random.randint(sequence_length - 3, sequence_length) input_2 = np.zeros((batch_size, sequence_length), dtype=input_data_type) temp = np.ones((batch_size, actual_seq_len), dtype=input_data_type) input_2[:temp.shape[0], :temp.shape[1]] = temp tensor_2 = numpy_helper.from_array(input_2, 'attention_mask') input_3 = np.zeros((batch_size, sequence_length), dtype=input_data_type) tensor_3 = numpy_helper.from_array(input_3, 'token_type_ids') path = os.path.join(output_path, 'test_data_set_' + str(test_case)) try: os.mkdir(path) except OSError: print("Creation of the directory %s failed" % path) else: print("Successfully created the directory %s " % path) if input_tensor_only: return sess_options = onnxruntime.SessionOptions() sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL sess = onnxruntime.InferenceSession(onnx_file, sess_options, providers=['CPUExecutionProvider']) input1_name = sess.get_inputs()[0].name output_names = [output.name for output in sess.get_outputs()] inputs = { 'input_ids': input_1, 'attention_mask': input_2, 'token_type_ids': input_3 } print("inputs", inputs) result = sess.run(output_names, inputs) with open(os.path.join(path, 'input_{}.pb'.format(0)), 'wb') as f: f.write(tensor_1.SerializeToString()) with open(os.path.join(path, 'input_{}.pb'.format(1)), 'wb') as f: f.write(tensor_2.SerializeToString()) with open(os.path.join(path, 'input_{}.pb'.format(2)), 'wb') as f: f.write(tensor_3.SerializeToString()) for i, output_name in enumerate(output_names): tensor_result = numpy_helper.from_array( np.asarray(result[i]).reshape((batch_size, sequence_length)), output_names[i]) with open(os.path.join(path, 'output_{}.pb'.format(i)), 'wb') as f: f.write(tensor_result.SerializeToString()) start_time = timeit.default_timer() sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_EXTENDED path_prefix = onnx_file[:-5] #remove .onnx suffix if use_cpu: sess_options.optimized_model_filepath = path_prefix + "_optimized_cpu.onnx" else: sess_options.optimized_model_filepath = path_prefix + "_optimized_gpu.onnx" session = onnxruntime.InferenceSession(onnx_file, sess_options) if use_cpu: session.set_providers(['CPUExecutionProvider']) # use cpu else: if 'CUDAExecutionProvider' not in session.get_providers(): print("Warning: GPU not found") continue outputs = session.run(None, inputs) evalTime = timeit.default_timer() - start_time if outputs[0].tolist() != result[0].tolist(): print( "Error: not same result after optimization. use_cpu={}, no_opt_output={}, opt_output={}" .format(use_cpu, result[0].tolist(), outputs[1].tolist())) print("** Evaluation done in total {} secs".format(evalTime))
num_gpus = torch.cuda.device_count()
print(target, model_name)
model = model_fn(pretrained=True)

# pytorch --> onnx model conversion
if (target != "pytorch"):
    # write model out to onnx
    ref_input = torch.tensor(vidl.get_random(0, batch_len * num_gpus))
    torch.onnx.export(model, (ref_input), "bench_out.onnx",
                      keep_initializers_as_inputs=True, verbose=True, opset_version=10)
    so = ort.SessionOptions()
    so.optimized_model_filepath = "bench_out.onnx.opt"
    session = ort.InferenceSession("bench_out.onnx", so)
    del session
    del so

res = None
if (num_gpus > 0):
    scaled_batch_len = batch_len * num_gpus
else:
    scaled_batch_len = batch_len

latency = []

# PYTORCH BENCH
if target == "pytorch":
def main(): parser = argparse.ArgumentParser(description='Simple ONNX Runtime Test Tool.') parser.add_argument('model_path', help='model path') parser.add_argument('num_iters', nargs='?', type=int, default=1000, help='model run iterations. default=1000') parser.add_argument('--debug', action='store_true', help='pause execution to allow attaching a debugger.') parser.add_argument('--profile', action='store_true', help='enable chrome timeline trace profiling.') args = parser.parse_args() iters = args.num_iters if args.debug: print("Pausing execution ready for debugger to attach to pid: {}".format(os.getpid())) print("Press key to continue.") sys.stdin.read(1) sess_options = None if args.profile: sess_options = onnxrt.SessionOptions() sess_options.enable_profiling = True sess_options.profile_file_prefix = os.path.basename(args.model_path) sess = onnxrt.InferenceSession(args.model_path, sess_options) meta = sess.get_modelmeta() feeds = {} for input_meta in sess.get_inputs(): # replace any symbolic dimensions (value is None) with 1 shape = [dim if dim else 1 for dim in input_meta.shape] if input_meta.type in float_dict: feeds[input_meta.name] = np.random.rand(*shape).astype(float_dict[input_meta.type]) elif input_meta.type in integer_dict: feeds[input_meta.name] = np.random.uniform(high=1000, size=tuple(shape)).astype(integer_dict[input_meta.type]) elif input_meta.type == 'tensor(bool)': feeds[input_meta.name] = np.random.randint(2, size=tuple(shape)).astype('bool') else: print("unsupported input type {} for input {}".format(input_meta.type, input_meta.name)) sys.exit(-1) # Starting with IR4 some initializers provide default values # and can be overridden (available in IR4). For IR < 4 models # the list would be empty for initializer in sess.get_overridable_initializers(): shape = [dim if dim else 1 for dim in initializer.shape] if initializer.type in float_dict: feeds[initializer.name] = np.random.rand(*shape).astype(float_dict[initializer.type]) elif initializer.type in integer_dict: feeds[initializer.name] = np.random.uniform(high=1000, size=tuple(shape)).astype(integer_dict[initializer.type]) elif initializer.type == 'tensor(bool)': feeds[initializer.name] = np.random.randint(2, size=tuple(shape)).astype('bool') else: print("unsupported initializer type {} for initializer {}".format(initializer.type, initializer.name)) sys.exit(-1) start = timer() for i in range(iters): sess.run([], feeds) # fetch all outputs end = timer() print("model: {}".format(meta.graph_name)) print("version: {}".format(meta.version)) print("iterations: {}".format(iters)) print("avg latency: {} ms".format(((end - start) * 1000) / iters)) if args.profile: trace_file = sess.end_profiling() print("trace file written to: {}".format(trace_file)) return 0
def predict(self, data):
    import onnxruntime as ort

    assert self.model is not None

    remainder_sess = None
    sess_options = ort.SessionOptions()
    sess_options.intra_op_num_threads = self.params["nthread"]
    sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
    sess = ort.InferenceSession(self.model.SerializeToString(), sess_options=sess_options)
    if self.remainder_model is not None:
        remainder_sess = ort.InferenceSession(self.remainder_model.SerializeToString(),
                                              sess_options=sess_options)

    batch_size = 1 if self.params["operator"] == "xgb" else self.params["batch_size"]
    input_name = sess.get_inputs()[0].name
    is_regression = data.learning_task == LearningTask.REGRESSION
    if is_regression:
        output_name_index = 0
    else:
        output_name_index = 1
    output_name = sess.get_outputs()[output_name_index].name

    with Timer() as t:
        predict_data = ScoreBackend.get_data(data.X_test)
        total_size = len(predict_data)
        iterations = total_size // batch_size
        iterations += 1 if total_size % batch_size > 0 else 0
        iterations = max(1, iterations)
        self.predictions = np.empty([total_size, self.params["n_classes"]], dtype="f4")
        for i in range(0, iterations):
            start = i * batch_size
            end = min(start + batch_size, total_size)
            if self.params["operator"] == "xgb":
                self.predictions[start:end, :] = sess.run(
                    [output_name], {input_name: predict_data[start:end, :]})
            elif self.params["operator"] in ("lgbm", "rf"):
                if i == iterations - 1 and self.remainder_model is not None:
                    pred = remainder_sess.run([output_name],
                                              {input_name: predict_data[start:end, :]})
                else:
                    pred = sess.run([output_name],
                                    {input_name: predict_data[start:end, :]})
                if is_regression:
                    self.predictions[start:end, :] = pred[0]
                else:
                    self.predictions[start:end, :] = list(
                        map(lambda x: list(x.values()), pred[0]))

    if is_regression:
        self.predictions = self.predictions.flatten()

    del sess
    if remainder_sess is not None:
        del remainder_sess

    return t.interval
    for shp in ishapes:
        ts = np.product(shp)
        #print("reshaping %s with offset %d" % (str(shp), offset), file=sys.stderr)
        inputs.append(read(ts).reshape(shp))

    ret = m.run(None, dict(zip(keys, inputs)))
    #print(ret, file=sys.stderr)
    for r in ret:
        write(r)


if __name__ == "__main__":
    print(ort.get_available_providers(), file=sys.stderr)

    if 'OpenVINOExecutionProvider' in ort.get_available_providers() and 'ONNXCPU' not in os.environ:
        print("OnnxJit is using openvino", file=sys.stderr)
        options = ort.SessionOptions()
        options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL
        provider = 'OpenVINOExecutionProvider'
    elif 'CUDAExecutionProvider' in ort.get_available_providers() and 'ONNXCPU' not in os.environ:
        print("OnnxJit is using CUDA")
        options = ort.SessionOptions()
        options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL
        provider = 'CUDAExecutionProvider'
    else:
        print("OnnxJit is using CPU", file=sys.stderr)
        options = ort.SessionOptions()
        options.intra_op_num_threads = 4
        options.inter_op_num_threads = 8
        options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
        options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
def onnx2tensorrt(onnx_file, trt_file, input_config, verify=False, show=False, dataset='coco', workspace_size=1, verbose=False): import tensorrt as trt onnx_model = onnx.load(onnx_file) input_shape = input_config['input_shape'] # create trt engine and wraper opt_shape_dict = {'input': [input_shape, input_shape, input_shape]} max_workspace_size = get_GiB(workspace_size) trt_engine = onnx2trt( onnx_model, opt_shape_dict, log_level=trt.Logger.VERBOSE if verbose else trt.Logger.ERROR, fp16_mode=False, max_workspace_size=max_workspace_size) save_dir, _ = osp.split(trt_file) if save_dir: os.makedirs(save_dir, exist_ok=True) save_trt_engine(trt_engine, trt_file) print(f'Successfully created TensorRT engine: {trt_file}') if verify: one_img, one_meta = preprocess_example_input(input_config) input_img_cpu = one_img.detach().cpu().numpy() input_img_cuda = one_img.cuda() img = one_meta['show_img'] # Get results from ONNXRuntime ort_custom_op_path = get_onnxruntime_op_path() session_options = ort.SessionOptions() if osp.exists(ort_custom_op_path): session_options.register_custom_ops_library(ort_custom_op_path) sess = ort.InferenceSession(onnx_file, session_options) output_names = [_.name for _ in sess.get_outputs()] ort_outputs = sess.run(None, { 'input': input_img_cpu, }) with_mask = len(output_names) == 3 ort_outputs = [_.squeeze(0) for _ in ort_outputs] ort_dets, ort_labels = ort_outputs[:2] ort_masks = ort_outputs[2] if with_mask else None ort_shapes = [_.shape for _ in ort_outputs] print(f'ONNX Runtime output names: {output_names}, \ output shapes: {ort_shapes}') # Get results from TensorRT trt_model = TRTWraper(trt_file, ['input'], output_names) with torch.no_grad(): trt_outputs = trt_model({'input': input_img_cuda}) trt_outputs = [ trt_outputs[_].detach().cpu().numpy().squeeze(0) for _ in output_names ] trt_dets, trt_labels = trt_outputs[:2] trt_shapes = [_.shape for _ in trt_outputs] print(f'TensorRT output names: {output_names}, \ output shapes: {trt_shapes}') trt_masks = trt_outputs[2] if with_mask else None # Show detection outputs if show: CLASSES = get_classes(dataset) score_thr = 0.35 imshow_det_bboxes(img.copy(), trt_dets, trt_labels, segms=trt_masks, class_names=CLASSES, score_thr=score_thr, win_name='TensorRT') imshow_det_bboxes(img.copy(), ort_dets, ort_labels, segms=ort_masks, class_names=CLASSES, score_thr=score_thr, win_name='ONNXRuntime') # Compare results np.testing.assert_allclose(ort_dets, trt_dets, rtol=1e-03, atol=1e-05) np.testing.assert_allclose(ort_labels, trt_labels) if with_mask: np.testing.assert_allclose(ort_masks, trt_masks, rtol=1e-03, atol=1e-05) print('The numerical values are the same ' + 'between ONNXRuntime and TensorRT')
def testOrtExecutionMode(self):
    opt = onnxrt.SessionOptions()
    self.assertEqual(opt.execution_mode, onnxrt.ExecutionMode.ORT_SEQUENTIAL)
    opt.execution_mode = onnxrt.ExecutionMode.ORT_PARALLEL
    self.assertEqual(opt.execution_mode, onnxrt.ExecutionMode.ORT_PARALLEL)
## conda create -n cv2020 python=3.8
## conda activate cv2020
## conda install onnx protobuf numpy pip six fastapi uvicorn python-multipart -c conda-forge
## pip install opencv-python  # need to install from pip due to QT dependencies on arm64
## ONNXRuntime https://elinux.org/Jetson_Zoo#ONNX_Runtime
## wget https://nvidia.box.com/shared/static/8xgbee5ghhb92i9rrcr04yymg0n3x3t0.whl -O onnxruntime_gpu-1.7.0-cp38-cp38-linux_aarch64.whl
## pip install onnxruntime_gpu-1.7.0-cp38-cp38-linux_aarch64.whl

import cv2
import onnxruntime as rt
import numpy as np

####
sessOptions = rt.SessionOptions()
sessOptions.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL
raccoonModel = rt.InferenceSession('raccoon_sim.onnx', sessOptions)

####
inputStream = cv2.VideoCapture(0)
while True:
    isImageValid, inputImage = inputStream.read()
    if isImageValid:
        ### Pre-processing ###
        inputTensor = cv2.resize(inputImage, (320, 320))
        inputTensor = (inputTensor - [103.53, 116.28, 123.675]) / [57.375, 57.12, 58.395]
        inputTensor = inputTensor.transpose(2, 0, 1)[np.newaxis].astype(np.float32)  #NCHW
        ### Inference ###
def create_ort_training_session_with_optimizer(model, device, training_optimizer_name, lr_params_feed_name,
                                               map_optimizer_attributes, world_rank=-1, world_size=1,
                                               gradient_accumulation_steps=1, bind_parameters=False,
                                               use_mixed_precision=False, allreduce_post_accumulation=False,
                                               deepspeed_zero_stage=0, enable_grad_norm_clip=True,
                                               frozen_weights=[], opset_version=DEFAULT_OPSET_VERSION,
                                               use_deterministic_compute=False,
                                               use_invertible_layernorm_grad=False, enable_adasum=False):
    output_name = model.graph.output[0].name
    ort_parameters = ort.TrainingParameters()
    ort_parameters.loss_output_name = output_name
    ort_parameters.use_mixed_precision = use_mixed_precision
    ort_parameters.world_rank = world_rank
    ort_parameters.world_size = world_size
    ort_parameters.gradient_accumulation_steps = gradient_accumulation_steps
    ort_parameters.allreduce_post_accumulation = allreduce_post_accumulation
    ort_parameters.deepspeed_zero_stage = deepspeed_zero_stage
    ort_parameters.enable_grad_norm_clip = enable_grad_norm_clip
    ort_parameters.set_gradients_as_graph_outputs = False
    ort_parameters.use_invertible_layernorm_grad = use_invertible_layernorm_grad
    ort_parameters.enable_adasum = enable_adasum

    output_types = {}
    for output in model.graph.output:
        output_types[output.name] = output.type.tensor_type

    # pybind does not allow to add directly to ort_parameters.weights_to_train.
    # Have to work around by using a temporary weights_to_train.
    torch_params = {}
    optimizer_attributes_map = {}
    optimizer_int_attributes_map = {}

    unused_frozen_weights = [n for n in frozen_weights if n not in [i.name for i in model.graph.initializer]]
    if unused_frozen_weights:
        raise RuntimeError("{} in frozen_weights not found in model weights.".format(unused_frozen_weights))

    weights_to_train = set()
    for initializer in model.graph.initializer:
        if initializer.name in frozen_weights:
            continue
        weights_to_train.add(initializer.name)
        if map_optimizer_attributes is not None:
            attributes = map_optimizer_attributes(initializer.name)
            optimizer_attributes_map[initializer.name] = {}
            optimizer_int_attributes_map[initializer.name] = {}
            for k, v in attributes.items():
                if isinstance(v, float):
                    optimizer_attributes_map[initializer.name][k] = v
                elif isinstance(v, int):
                    optimizer_int_attributes_map[initializer.name][k] = v
                else:
                    raise ValueError("Optimizer attributes must be either float or int.")
        else:
            optimizer_attributes_map[initializer.name] = {}
            optimizer_int_attributes_map[initializer.name] = {}

    if bind_parameters:
        for initializer in model.graph.initializer:
            torch_tensor = torch.nn.Parameter(torch.as_tensor(numpy_helper.to_array(initializer), device=device))
            delete_input_with_name(model.graph.input, initializer.name)
            model.graph.input.extend(
                [helper.make_tensor_value_info(initializer.name, initializer.data_type, initializer.dims)])
            torch_params[initializer.name] = torch_tensor

        del model.graph.initializer[:]

    ort_parameters.weights_to_train = weights_to_train
    ort_parameters.training_optimizer_name = training_optimizer_name
    ort_parameters.lr_params_feed_name = lr_params_feed_name
    ort_parameters.optimizer_attributes_map = optimizer_attributes_map
    ort_parameters.optimizer_int_attributes_map = optimizer_int_attributes_map

    sessionOptions = ort.SessionOptions()
    sessionOptions.use_deterministic_compute = use_deterministic_compute
    session = ort.TrainingSession(model.SerializeToString(), ort_parameters, sessionOptions)
    train_io_binding = session.io_binding()
    eval_io_binding = session.io_binding()

    if bind_parameters:
        for param in torch_params.keys():
            torch_tensor = torch_params[param]

            train_io_binding.bind_input(param, torch_tensor.device.type, get_device_index(torch_tensor.device),
                                        dtype_torch_to_numpy(torch_params[param].dtype), list(torch_tensor.size()),
                                        torch_tensor.data_ptr())
            eval_io_binding.bind_input(param, torch_tensor.device.type, get_device_index(torch_tensor.device),
                                       dtype_torch_to_numpy(torch_params[param].dtype), list(torch_tensor.size()),
                                       torch_tensor.data_ptr())

    return session, train_io_binding, eval_io_binding, output_name, torch_params, output_types
def convert_to_onnx_and_check(
    job_func,
    print_outlier=False,
    explicit_init=True,
    external_data=False,
    ort_optimize=True,
    opset=None,
):
    check_point = flow.train.CheckPoint()
    if explicit_init:
        # it is a trick to keep check_point.save() from hanging when there is no variable
        @flow.global_function(flow.FunctionConfig())
        def add_var():
            return flow.get_variable(
                name="trick",
                shape=(1,),
                dtype=flow.float,
                initializer=flow.random_uniform_initializer(),
            )

        check_point.init()
    flow_weight_dir = tempfile.TemporaryDirectory()
    check_point.save(flow_weight_dir.name)
    # TODO(daquexian): a more elegant way?
    while not os.path.exists(os.path.join(flow_weight_dir.name, "snapshot_done")):
        pass
    onnx_model_dir = tempfile.TemporaryDirectory()
    onnx_model_path = os.path.join(onnx_model_dir.name, "model.onnx")
    flow.onnx.export(
        job_func,
        flow_weight_dir.name,
        onnx_model_path,
        opset=opset,
        external_data=external_data,
    )
    flow_weight_dir.cleanup()

    ort_sess_opt = ort.SessionOptions()
    ort_sess_opt.graph_optimization_level = (
        ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
        if ort_optimize
        else ort.GraphOptimizationLevel.ORT_DISABLE_ALL)
    sess = ort.InferenceSession(onnx_model_path, sess_options=ort_sess_opt)
    onnx_model_dir.cleanup()
    assert len(sess.get_outputs()) == 1
    assert len(sess.get_inputs()) <= 1
    ipt_dict = OrderedDict()
    for ipt in sess.get_inputs():
        ipt_data = np.random.uniform(low=-10, high=10, size=ipt.shape).astype(np.float32)
        ipt_dict[ipt.name] = ipt_data

    onnx_res = sess.run([], ipt_dict)[0]
    oneflow_res = job_func(*ipt_dict.values()).get().numpy()
    rtol, atol = 1e-2, 1e-5
    if print_outlier:
        a = onnx_res.flatten()
        b = oneflow_res.flatten()
        for i in range(len(a)):
            if np.abs(a[i] - b[i]) > atol + rtol * np.abs(b[i]):
                print("a[{}]={}, b[{}]={}".format(i, a[i], i, b[i]))
    assert np.allclose(onnx_res, oneflow_res, rtol=rtol, atol=atol)
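
# The element-wise tolerance check used above is handy on its own. Below is a small
# standalone variant (a sketch; the helper name and default tolerances are ours, not
# part of the original test utility) that reports outliers and returns the
# np.allclose verdict.
import numpy as np

def report_outliers(result_a, result_b, rtol=1e-2, atol=1e-5):
    # Print every element pair violating |a - b| <= atol + rtol * |b|,
    # mirroring the per-element check in convert_to_onnx_and_check.
    a = np.asarray(result_a).flatten()
    b = np.asarray(result_b).flatten()
    for i, (x, y) in enumerate(zip(a, b)):
        if np.abs(x - y) > atol + rtol * np.abs(y):
            print("a[{}]={}, b[{}]={}".format(i, x, i, y))
    return np.allclose(result_a, result_b, rtol=rtol, atol=atol)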
def __test_export_route(self, module, out_name, mode, input_example=None):
    # select correct extension based on the output format
    ext = {
        DF.ONNX: ".onnx",
        DF.TRTONNX: ".trt.onnx",
        DF.PYTORCH: ".pt",
        DF.TORCHSCRIPT: ".ts",
    }.get(mode, ".onnx")
    out = Path(f"{out_name}{ext}")
    out_name = str(out)
    if out.exists():
        os.remove(out)

    module.eval()
    outputs_fwd = (module.forward(*tuple(input_example.values())) if isinstance(input_example, OrderedDict)
                   else module.forward(*input_example) if isinstance(input_example, tuple)
                   else module.forward(input_example) if input_example is not None
                   else None)
    deploy_input_example = (tuple(input_example.values()) if isinstance(input_example, OrderedDict)
                            else input_example)
    self.nf.deployment_export(
        module=module,
        output=out_name,
        input_example=deploy_input_example,
        d_format=mode,
        output_example=outputs_fwd,
    )

    tol = 5.0e-3
    assert out.exists()

    if mode == DF.TRTONNX:
        data_loader = DefaultDataLoader()
        loader_cache = DataLoaderCache(data_loader)
        profile_shapes = OrderedDict()
        names = list(module.input_ports) + list(module.output_ports)
        names = list(
            filter(
                lambda x: x not in (module._disabled_deployment_input_ports
                                    | module._disabled_deployment_output_ports),
                names,
            ))
        if isinstance(input_example, tuple):
            si = [tuple(input_example[i].shape) for i in range(len(input_example))]
        elif isinstance(input_example, OrderedDict):
            si = [tuple(input_example.values())[i].shape for i in range(len(input_example))]
        else:
            si = [tuple(input_example.shape)]
        if isinstance(outputs_fwd, tuple):
            fi = [tuple(outputs_fwd[i].shape) for i in range(len(outputs_fwd))]
        else:
            fi = [tuple(outputs_fwd.shape)]
        si = si + fi
        i = 0
        for name in names:
            profile_shapes[name] = [si[i]] * 3
            i = i + 1

        onnx_loader = OnnxFileLoader(out_name)
        network_loader = OnnxNetworkLoader(onnx_loader, explicit_precision=False)
        model_loader = BuildEngineLoader(
            network_loader,
            max_workspace_size=1 << 30,
            fp16_mode=False,
            int8_mode=False,
            profile_shapes=profile_shapes,
            write_engine=None,
            calibrator=None,
            layerwise=False,
        )

        with TensorRTRunnerV2(model_loader=model_loader) as active_runner:
            input_metadata = active_runner.get_input_metadata()
            if input_metadata is None:
                logging.critical("For {:}, get_input_metadata() returned None!".format(active_runner.name))
            logging.debug("Runner Inputs: {:}".format(input_metadata))
            feed_dict = loader_cache.load(iteration=0, input_metadata=input_metadata, input_example=input_example)
            inputs = dict()
            input_names = list(input_metadata.keys())
            for i in range(len(input_names)):
                input_name = input_names[i]
                if input_name in module._disabled_deployment_input_ports:
                    continue
                inputs[input_name] = (input_example[input_name].cpu().numpy()
                                      if isinstance(input_example, OrderedDict)
                                      else input_example[i].cpu().numpy()
                                      if isinstance(input_example, tuple)
                                      else input_example.cpu().numpy())
            out_dict = active_runner.infer(feed_dict=feed_dict, output=outputs_fwd)
            for ov in out_dict.values():
                outputs_scr = torch.from_numpy(ov).cuda()
                break
            outputs = []
            outputs.append(copy.deepcopy(out_dict))
            logging.debug("Received outputs: {:}".format(
                ["{:}: {:}".format(name, out.shape) for name, out in out_dict.items()]))
            logging.info("Output Buffers: {:}".format(outputs))

        inpex = []
        for ie in feed_dict.values():  # loader_cache.cache[0].values():
            if ie.dtype.type is np.int32:
                inpex.append(torch.from_numpy(ie).long().cuda())
            else:
                inpex.append(torch.from_numpy(ie).cuda())
            if len(inpex) == len(input_example):
                break
        inpex = tuple(inpex)
        outputs_fwd = module.forward(*inpex)
    elif mode == DF.ONNX:
        # Must recompute because *module* might be different now
        outputs_fwd = (module.forward(*tuple(input_example.values())) if isinstance(input_example, OrderedDict)
                       else module.forward(*input_example) if isinstance(input_example, tuple)
                       else module.forward(input_example))
        sess_options = ort.SessionOptions()
        sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_BASIC
        ort_session = ort.InferenceSession(out_name, sess_options, ['CUDAExecutionProvider'])
        print('Execution Providers: ', ort_session.get_providers())
        inputs = dict()
        input_names = list(module.input_ports)
        ort_inputs = ort_session.get_inputs()
        for i in range(len(input_names)):
            input_name = input_names[i]
            if input_name in module._disabled_deployment_input_ports:
                continue
            inputs[input_name] = (input_example[input_name].cpu().numpy()
                                  if isinstance(input_example, OrderedDict)
                                  else input_example[i].cpu().numpy()
                                  if isinstance(input_example, tuple)
                                  else input_example.cpu().numpy())
        outputs_scr = ort_session.run(None, inputs)
        outputs_scr = torch.from_numpy(outputs_scr[0]).cuda()
    elif mode == DF.TORCHSCRIPT:
        scr = torch.jit.load(out_name)
        if isinstance(module, nemo.backends.pytorch.tutorials.TaylorNet):
            input_example = torch.randn(4, 1).cuda()
            outputs_fwd = module.forward(input_example)
        outputs_scr = (module.forward(*tuple(input_example.values())) if isinstance(input_example, OrderedDict)
                       else module.forward(*input_example) if isinstance(input_example, tuple)
                       else module.forward(input_example))
    elif mode == DF.PYTORCH:
        module.load_state_dict(torch.load(out_name))
        module.eval()
        outputs_scr = (module.forward(*tuple(input_example.values())) if isinstance(input_example, OrderedDict)
                       else module.forward(*input_example) if isinstance(input_example, tuple)
                       else module.forward(input_example))

    outputs_scr = (outputs_scr[0] if isinstance(outputs_scr, tuple) or isinstance(outputs_scr, list)
                   else outputs_scr)
    outputs_fwd = (outputs_fwd[0] if isinstance(outputs_fwd, tuple) or isinstance(outputs_fwd, list)
                   else outputs_fwd)

    assert (outputs_scr - outputs_fwd).norm(p=2) < tol

    if out.exists():
        os.remove(out)
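
# A self-contained sketch of the DF.ONNX branch above, outside the NeMo test harness:
# export a toy torch module with torch.onnx.export, run it through onnxruntime on CPU,
# and compare against the eager forward pass. TinyNet, the file name "tiny.onnx", and
# the reused 5.0e-3 tolerance are illustrative assumptions.
import numpy as np
import torch
import onnxruntime as ort

class TinyNet(torch.nn.Module):  # hypothetical toy module for illustration
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(4, 2)

    def forward(self, x):
        return self.fc(x)

module = TinyNet().eval()
example = torch.randn(1, 4)
torch.onnx.export(module, example, "tiny.onnx", input_names=["x"], output_names=["y"])

sess_options = ort.SessionOptions()
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_BASIC
sess = ort.InferenceSession("tiny.onnx", sess_options, providers=["CPUExecutionProvider"])

with torch.no_grad():
    reference = module(example).numpy()
onnx_out = sess.run(None, {"x": example.numpy()})[0]
assert np.abs(onnx_out - reference).max() < 5.0e-3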
def onnx2tensorrt(onnx_file: str,
                  trt_file: str,
                  config: dict,
                  input_config: dict,
                  model_type: str,
                  img_path: str,
                  fp16: bool = False,
                  verify: bool = False,
                  show: bool = False,
                  workspace_size: int = 1,
                  verbose: bool = False):
    """Convert an ONNX model to a TensorRT engine.

    Args:
        onnx_file (str): Path of the input ONNX file.
        trt_file (str): Path of the output TensorRT engine file.
        config (dict): MMCV configuration of the model.
        input_config (dict): Contains min_shape and max_shape of the input.
        model_type (str): Model type, used to prepare the verification input.
        img_path (str): Path of the image used for verification.
        fp16 (bool): Whether to enable fp16 mode.
        verify (bool): Whether to verify that the outputs of TensorRT and
            ONNX are the same.
        show (bool): Whether to show the outputs of TensorRT and ONNX.
        workspace_size (int): Max workspace size in GiB.
        verbose (bool): Whether to print the log when generating the
            TensorRT model.
    """
    import tensorrt as trt
    min_shape = input_config['min_shape']
    max_shape = input_config['max_shape']
    # create trt engine and wrapper
    opt_shape_dict = {'input': [min_shape, min_shape, max_shape]}
    max_workspace_size = get_GiB(workspace_size)
    trt_engine = onnx2trt(
        onnx_file,
        opt_shape_dict,
        log_level=trt.Logger.VERBOSE if verbose else trt.Logger.ERROR,
        fp16_mode=fp16,
        max_workspace_size=max_workspace_size)
    save_dir, _ = osp.split(trt_file)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    save_trt_engine(trt_engine, trt_file)
    print(f'Successfully created TensorRT engine: {trt_file}')

    if verify:
        inputs = _prepare_input_img(
            model_type=model_type, img_path=img_path, config=config)
        imgs = inputs['imgs']
        img_list = [imgs.unsqueeze(0)]

        if max_shape[0] > 1:
            # concatenate flipped images for batch test
            flip_img_list = [_.flip(-1) for _ in img_list]
            img_list = [
                torch.cat((ori_img, flip_img), 0)
                for ori_img, flip_img in zip(img_list, flip_img_list)
            ]

        # Get results from ONNXRuntime
        ort_custom_op_path = get_onnxruntime_op_path()
        session_options = ort.SessionOptions()
        if osp.exists(ort_custom_op_path):
            session_options.register_custom_ops_library(ort_custom_op_path)
        sess = ort.InferenceSession(onnx_file, session_options)
        sess.set_providers(['CPUExecutionProvider'], [{}])  # use cpu mode
        onnx_output = sess.run(['output'],
                               {'input': img_list[0].detach().numpy()})[0][0]

        # Get results from TensorRT
        trt_model = TRTWrapper(trt_file, ['input'], ['output'])
        with torch.no_grad():
            trt_outputs = trt_model({'input': img_list[0].contiguous().cuda()})
        trt_output = trt_outputs['output'][0].cpu().detach().numpy()

        if show:
            onnx_visualize = onnx_output.transpose(1, 2, 0)
            onnx_visualize = np.clip(onnx_visualize, 0, 1)[:, :, ::-1]
            trt_visualize = trt_output.transpose(1, 2, 0)
            trt_visualize = np.clip(trt_visualize, 0, 1)[:, :, ::-1]
            cv2.imshow('ONNXRuntime', onnx_visualize)
            cv2.imshow('TensorRT', trt_visualize)
            cv2.waitKey()

        np.testing.assert_allclose(
            onnx_output, trt_output, rtol=1e-03, atol=1e-05)
        print('TensorRT and ONNXRuntime output all close.')
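
# A hypothetical invocation of onnx2tensorrt. The config file, ONNX/engine paths,
# input shapes, model_type, and image path below are placeholders for whatever the
# surrounding project actually provides; mmcv's Config is assumed to be available.
from mmcv import Config

cfg = Config.fromfile('configs/example_config.py')  # placeholder config
input_config = {
    'min_shape': [1, 3, 736, 736],
    'max_shape': [1, 3, 736, 736],
}
onnx2tensorrt(
    onnx_file='end2end.onnx',        # placeholder ONNX model
    trt_file='end2end.trt',          # placeholder output engine
    config=cfg,
    input_config=input_config,
    model_type='det',                # placeholder model type
    img_path='demo/demo_image.jpg',  # placeholder verification image
    fp16=False,
    verify=True,
    show=False,
    workspace_size=1,
    verbose=False)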