def test_float16(self):
    @onnx_function(outputs=['z'],
                   input_types=(_Ty.F([1, 1, 6, 1])),
                   output_types=[_Ty.f])
    def transpose_n_matmul(x):
        ox = x.ox  # type: OnnxOperatorBuilderX
        wm = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]).astype(np.float32).reshape([2, 6])
        b = ox.constant(value=wm)
        a = ox.transpose(x, perm=[0, 1, 3, 2])
        c = ox.transpose(b, perm=[1, 0])
        return ox.matmul([a, c])

    m1 = np.array([[2, 3], [4, 5], [6, 7]]).astype(np.float32).reshape([1, 1, 6, 1])
    expected = transpose_n_matmul(m1)

    model = transpose_n_matmul.to_model()
    # Full conversion: every float32 tensor, including graph inputs/outputs, becomes float16.
    f16model = convert_float_to_float16(copy.deepcopy(model))
    actual = _ort_inference(f16model, {'x': m1.astype(np.float16)})
    self.assertTrue(np.allclose(expected, actual))

    # keep_io_types=True keeps float32 inputs/outputs and inserts Cast nodes at the boundaries.
    f16model2 = convert_float_to_float16(copy.deepcopy(model), keep_io_types=True)
    actual2 = _ort_inference(f16model2, {'x': m1})
    self.assertTrue(np.allclose(expected, actual2))
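# A minimal, self-contained sketch of what the test above exercises, without the
# onnx_function DSL: build a tiny float32 MatMul model by hand, convert it with
# convert_float_to_float16, and compare the plain float16 and keep_io_types variants.
# The graph, tensor names, and tolerances here are illustrative, not taken from the test.
import copy

import numpy as np
import onnx
import onnxruntime as ort
from onnx import TensorProto, helper, numpy_helper
from onnxconverter_common.float16 import convert_float_to_float16

w = np.arange(1, 13, dtype=np.float32).reshape(6, 2)
graph = helper.make_graph(
    nodes=[helper.make_node('MatMul', ['x', 'w'], ['z'])],
    name='matmul_demo',
    inputs=[helper.make_tensor_value_info('x', TensorProto.FLOAT, [1, 6])],
    outputs=[helper.make_tensor_value_info('z', TensorProto.FLOAT, [1, 2])],
    initializer=[numpy_helper.from_array(w, name='w')])
model = helper.make_model(graph, opset_imports=[helper.make_opsetid('', 13)])

x = np.array([[2, 3, 4, 5, 6, 7]], dtype=np.float32)

# Full float16 conversion: the graph input now expects float16 data.
f16 = convert_float_to_float16(copy.deepcopy(model))
sess = ort.InferenceSession(f16.SerializeToString(), providers=['CPUExecutionProvider'])
z16 = sess.run(None, {'x': x.astype(np.float16)})[0]

# keep_io_types=True keeps float32 I/O and inserts Cast nodes inside the graph.
f16_io32 = convert_float_to_float16(copy.deepcopy(model), keep_io_types=True)
sess2 = ort.InferenceSession(f16_io32.SerializeToString(), providers=['CPUExecutionProvider'])
z32 = sess2.run(None, {'x': x})[0]

assert np.allclose(x @ w, z16, rtol=1e-2) and np.allclose(x @ w, z32, rtol=1e-2)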
def test_float16_with_loop(self):
    @onnx_function(outputs=['y1', 'y2'],
                   input_types=[_Ty.F([None, None])],
                   output_types=[_Ty.F([None]), _Ty.F([None, None])])
    def loop_test(data):
        ox = data.ox
        shape = ox.shape(data)
        dim_0 = ox.gather([shape, ox.constant(value=0)], axis=0)
        dim_1 = ox.gather([shape, ox.constant(value=np.array([1], dtype=np.int64))], axis=0)
        zeros = ox.constant_of_shape(dim_1, value=0.0)
        is_true = ox.constant(value=True)

        @onnx_function(outputs=['c_o', 'total_o', 'scan_o'],
                       output_types=[_Ty.b, _Ty.F([None]), _Ty.F([None])],
                       input_types=[_Ty.I([1]), _Ty.b, _Ty.F([None])])
        def range_body(iter_n, cond, total):
            ox = iter_n.ox
            iter_scalar = ox.squeeze(iter_n, axes=[0])
            col = ox.gather([data, iter_scalar], axis=0)
            total = ox.add([total, col])
            return (is_true, total, total)

        final_total, scan_res = ox.loop(dim_0, is_true, range_body,
                                        inputs=[zeros],
                                        outputs=['final_total', 'scan_res'])
        return final_total, scan_res

    m1 = np.array([[2, 3], [4, 5], [6, 7]], dtype=np.float32)
    expected_res = loop_test(m1)

    model = loop_test.to_model()
    f16model = convert_float_to_float16(copy.deepcopy(model))
    actual_res = _ort_inference(f16model, {'data': m1.astype(np.float16)})
    for expected, actual in zip(expected_res, actual_res):
        self.assertTrue(np.allclose(expected, actual))
        self.assertTrue(actual.dtype == np.float16)

    f16model2 = convert_float_to_float16(copy.deepcopy(model), keep_io_types=True)
    actual_res2 = _ort_inference(f16model2, {'data': m1})
    for expected, actual2 in zip(expected_res, actual_res2):
        self.assertTrue(np.allclose(expected, actual2))
        self.assertTrue(actual2.dtype == np.float32)
def __init__(self, model_path, use_fp16):
    print(">>> [InferBackend] Creating Engine ...")
    providers = ['CUDAExecutionProvider']
    sess_options = ort.SessionOptions()
    predictor = ort.InferenceSession(model_path,
                                     sess_options=sess_options,
                                     providers=providers)
    if "CUDAExecutionProvider" in predictor.get_providers():
        print(">>> [InferBackend] Use GPU to inference ...")
        if use_fp16:
            from onnxconverter_common import float16
            import onnx
            print(">>> [InferBackend] Use FP16 to inference ...")
            # Convert the float32 model to float16 while keeping float32 inputs/outputs,
            # then rebuild the session on the converted model.
            fp16_model = "fp16_model.onnx"
            onnx_model = onnx.load_model(model_path)
            trans_model = float16.convert_float_to_float16(onnx_model,
                                                           keep_io_types=True)
            onnx.save_model(trans_model, fp16_model)
            sess_options = ort.SessionOptions()
            predictor = ort.InferenceSession(fp16_model,
                                             sess_options=sess_options,
                                             providers=providers)
    else:
        print(">>> [InferBackend] Use CPU to inference ...")
        if use_fp16:
            print(">>> [InferBackend] use_fp16 only takes effect when deploying on GPU ...")
    self.predictor = predictor
    input_name1 = self.predictor.get_inputs()[1].name
    input_name2 = self.predictor.get_inputs()[0].name
    self.input_handles = [input_name1, input_name2]
    print(">>> [InferBackend] Engine Created ...")
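# Hypothetical driver for the backend above. The class name InferBackend, the input
# shapes, and the int64 dtype are assumptions for illustration; the only facts taken
# from the code are that self.input_handles holds the names of the model's second and
# first inputs (in that order) and that self.predictor is an onnxruntime session.
import numpy as np

backend = InferBackend("model.onnx", use_fp16=True)
feeds = {
    # e.g. token ids / segment ids for a BERT-style model (assumed, not from the source)
    name: np.zeros((1, 128), dtype=np.int64)
    for name in backend.input_handles
}
outputs = backend.predictor.run(None, feeds)
print([o.shape for o in outputs])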
def run_attempt(node_block_list, return_model=False):
    # node_block_list names the nodes that must stay in float32; everything else is
    # converted to float16. model0, keep_io_types, feed_dict, res0, validate and
    # get_tensor_values_using_ort come from the enclosing scope.
    print(node_block_list)
    model = float16.convert_float_to_float16(copy.deepcopy(model0),
                                             node_block_list=node_block_list,
                                             keep_io_types=keep_io_types,
                                             disable_shape_infer=True)
    res1 = get_tensor_values_using_ort(model, feed_dict)
    if return_model:
        return validate(res0, res1), model
    else:
        valid = validate(res0, res1)
        print(valid)
        return valid
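# run_attempt above looks like the inner helper of an automatic mixed-precision search:
# it converts a copy of the baseline model while blocking a candidate node set, re-runs
# it, and checks the outputs against the float32 baseline res0. A hedged sketch of how
# such a search might be driven end to end; the model path, feed_dict contents, and the
# validate tolerance are illustrative assumptions, and the auto_convert_mixed_precision
# signature is taken from recent onnxconverter-common releases (treat it as an assumption
# if your version differs).
import copy

import numpy as np
import onnx
from onnxconverter_common import auto_mixed_precision

model0 = onnx.load("model.onnx")                                        # assumed path
feed_dict = {"x": np.random.rand(1, 3, 224, 224).astype(np.float32)}    # assumed input
keep_io_types = True

def validate(res_a, res_b, rtol=1e-2, atol=1e-4):
    # Accept an attempt only if every output stays close to the float32 baseline.
    return all(np.allclose(a, b, rtol=rtol, atol=atol) for a, b in zip(res_a, res_b))

# The public entry point wraps the search: it grows/shrinks node_block_list until the
# converted model passes validation, then returns the converted ModelProto.
converted = auto_mixed_precision.auto_convert_mixed_precision(
    copy.deepcopy(model0), feed_dict, validate_fn=validate, keep_io_types=keep_io_types)
onnx.save(converted, "model_amp.onnx")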
def __init__(self, model_path_prefix, device='cpu', use_quantize=False, use_fp16=False):
    print(">>> [InferBackend] Creating Engine ...")
    onnx_model = paddle2onnx.command.c_paddle_to_onnx(
        model_file=model_path_prefix + ".pdmodel",
        params_file=model_path_prefix + ".pdiparams",
        opset_version=13,
        enable_onnx_checker=True)
    infer_model_dir = model_path_prefix.rsplit("/", 1)[0]
    float_onnx_file = os.path.join(infer_model_dir, "model.onnx")
    with open(float_onnx_file, "wb") as f:
        f.write(onnx_model)

    if device == "gpu":
        providers = ['CUDAExecutionProvider']
        print(">>> [InferBackend] Use GPU to inference ...")
        if use_fp16:
            print(">>> [InferBackend] Use FP16 to inference ...")
            from onnxconverter_common import float16
            import onnx
            # Convert the exported float32 ONNX model to float16, keeping float32 I/O.
            fp16_model_file = os.path.join(infer_model_dir, "fp16_model.onnx")
            onnx_model = onnx.load_model(float_onnx_file)
            trans_model = float16.convert_float_to_float16(onnx_model,
                                                           keep_io_types=True)
            onnx.save_model(trans_model, fp16_model_file)
            onnx_model = fp16_model_file
    else:
        providers = ['CPUExecutionProvider']
        print(">>> [InferBackend] Use CPU to inference ...")
    sess_options = ort.SessionOptions()
    self.predictor = ort.InferenceSession(onnx_model,
                                          sess_options=sess_options,
                                          providers=providers)
    if device == "gpu":
        try:
            assert 'CUDAExecutionProvider' in self.predictor.get_providers()
        except AssertionError:
            raise AssertionError(
                "The environment for GPU inference is not set properly. "
                "A possible cause is that you have installed both onnxruntime and onnxruntime-gpu. "
                "Please run the following commands to reinstall:\n"
                "1) pip uninstall -y onnxruntime onnxruntime-gpu\n"
                "2) pip install onnxruntime-gpu")
    print(">>> [InferBackend] Engine Created ...")
def _prepare_onnx_mode(self):
    import onnx
    import onnxruntime as ort
    import paddle2onnx
    from onnxconverter_common import float16

    onnx_dir = os.path.join(self._task_path, 'onnx')
    if not os.path.exists(onnx_dir):
        os.mkdir(onnx_dir)
    # Export the static Paddle model to a float32 ONNX file once and cache it.
    float_onnx_file = os.path.join(onnx_dir, 'model.onnx')
    if not os.path.exists(float_onnx_file):
        onnx_model = paddle2onnx.command.c_paddle_to_onnx(
            model_file=self._static_model_file,
            params_file=self._static_params_file,
            opset_version=13,
            enable_onnx_checker=True)
        with open(float_onnx_file, "wb") as f:
            f.write(onnx_model)
    # Convert to float16 once (keeping float32 inputs/outputs) and cache the result.
    fp16_model_file = os.path.join(onnx_dir, 'fp16_model.onnx')
    if not os.path.exists(fp16_model_file):
        onnx_model = onnx.load_model(float_onnx_file)
        trans_model = float16.convert_float_to_float16(onnx_model,
                                                       keep_io_types=True)
        onnx.save_model(trans_model, fp16_model_file)
    providers = ['CUDAExecutionProvider']
    sess_options = ort.SessionOptions()
    sess_options.intra_op_num_threads = self._num_threads
    sess_options.inter_op_num_threads = self._num_threads
    self.predictor = ort.InferenceSession(fp16_model_file,
                                          sess_options=sess_options,
                                          providers=providers)
    try:
        assert 'CUDAExecutionProvider' in self.predictor.get_providers()
    except AssertionError:
        raise AssertionError(
            "The environment for GPU inference is not set properly. "
            "A possible cause is that you have installed both onnxruntime and onnxruntime-gpu. "
            "Please run the following commands to reinstall:\n"
            "1) pip uninstall -y onnxruntime onnxruntime-gpu\n"
            "2) pip install onnxruntime-gpu")
def convert_model_float32_to_float16(self, cast_input_output=True):
    """Convert a graph to FLOAT16."""
    from packaging.version import Version
    import onnxconverter_common as oc
    if Version(oc.__version__) > Version("1.7.0"):
        self.model = oc.float16.convert_float_to_float16(self.model,
                                                         keep_io_types=cast_input_output)
        return

    graph = self.model.graph
    initializers = graph.initializer

    # Convert float32 initializers in place.
    for initializer in initializers:
        if initializer.data_type == 1:
            initializer.CopyFrom(
                numpy_helper.from_array(
                    numpy_helper.to_array(initializer).astype(np.float16),
                    initializer.name))

    for node in graph.node:
        # Convert float32 Constant/ConstantOfShape values.
        if node.op_type in ['Constant', 'ConstantOfShape']:
            for att in node.attribute:
                if att.name == 'value' and att.t.data_type == 1:
                    att.CopyFrom(
                        helper.make_attribute(
                            "value",
                            numpy_helper.from_array(
                                numpy_helper.to_array(att.t).astype(np.float16))))
        # Retarget Cast nodes that cast to float32 so they cast to float16 instead.
        if node.op_type == 'Cast':
            for att in node.attribute:
                if att.name == 'to' and att.i == 1:
                    att.CopyFrom(helper.make_attribute("to", int(TensorProto.FLOAT16)))

    if not cast_input_output:
        self.change_input_output_float32_to_float16()
        return

    # Below assumes that we keep input and output data types.
    # Add Cast node to convert input from float32 to float16.
    for input_value_info in graph.input:
        if input_value_info.type.tensor_type.elem_type == TensorProto.FLOAT:
            initializer = self.get_initializer(input_value_info.name)
            if initializer is not None:  # for compatibility with old converters/exporters
                input_value_info.type.tensor_type.elem_type = TensorProto.FLOAT16
            else:
                cast_input = input_value_info.name
                cast_output = input_value_info.name + '_float16'
                self.replace_input_of_all_nodes(cast_input, cast_output)
                cast_node = helper.make_node('Cast', inputs=[cast_input], outputs=[cast_output])
                cast_node.attribute.extend([helper.make_attribute("to", int(TensorProto.FLOAT16))])
                self.add_node(cast_node)

    # Add Cast node to convert output from float16 back to float32.
    for output_value_info in graph.output:
        if output_value_info.type.tensor_type.elem_type == TensorProto.FLOAT:
            cast_input = output_value_info.name + '_float16'
            cast_output = output_value_info.name
            self.replace_output_of_all_nodes(cast_output, cast_input)
            self.replace_input_of_all_nodes(cast_output, cast_input)
            cast_node = helper.make_node('Cast', inputs=[cast_input], outputs=[cast_output])
            cast_node.attribute.extend([helper.make_attribute("to", int(TensorProto.FLOAT))])
            self.add_node(cast_node)
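# The cast_input_output=False branch above calls self.change_input_output_float32_to_float16(),
# which is not shown here. A minimal sketch of what such a helper could look like on the same
# class, assuming it only needs to flip the declared element type of float32 graph inputs and
# outputs in place; this is an illustrative guess, not the original implementation, and a fuller
# version would also update any float32 entries in graph.value_info.
def change_input_output_float32_to_float16(self):
    graph = self.model.graph
    for value_info in list(graph.input) + list(graph.output):
        if value_info.type.tensor_type.elem_type == TensorProto.FLOAT:
            value_info.type.tensor_type.elem_type = TensorProto.FLOAT16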