    def test_float16(self):
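        # Build a small transpose + matmul ONNX function, convert the exported model to float16,
        # and compare ONNX Runtime results against the float32 reference.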
        @onnx_function(outputs=['z'],
                       input_types=(_Ty.F([1, 1, 6, 1])),
                       output_types=[_Ty.f])
        def transpose_n_matmul(x):
            ox = x.ox  # type: OnnxOperatorBuilderX
            wm = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                           12]).astype(np.float32).reshape([2, 6])
            b = ox.constant(value=wm)
            a = ox.transpose(x, perm=[0, 1, 3, 2])
            c = ox.transpose(b, perm=[1, 0])
            return ox.matmul([a, c])

        m1 = np.array([[2, 3], [4, 5],
                       [6, 7]]).astype(np.float32).reshape([1, 1, 6, 1])
        expected = transpose_n_matmul(m1)
        model = transpose_n_matmul.to_model()
        f16model = convert_float_to_float16(copy.deepcopy(model))
        actual = _ort_inference(f16model, {'x': m1.astype(np.float16)})
        self.assertTrue(np.allclose(expected, actual))

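        # With keep_io_types=True the converted model keeps float32 inputs and outputs,
        # so the original float32 array can be fed directly.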
        f16model2 = convert_float_to_float16(copy.deepcopy(model),
                                             keep_io_types=True)
        actual2 = _ort_inference(f16model2, {'x': m1})
        self.assertTrue(np.allclose(expected, actual2))

    def test_float16_with_loop(self):
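        # Same check for a graph that contains a Loop subgraph: convert to float16 and
        # verify both loop outputs still match the float32 reference.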
        @onnx_function(outputs=['y1', 'y2'],
                       input_types=[_Ty.F([None, None])],
                       output_types=[_Ty.F([None]),
                                     _Ty.F([None, None])])
        def loop_test(data):
            ox = data.ox
            shape = ox.shape(data)
            dim_0 = ox.gather([shape, ox.constant(value=0)], axis=0)
            dim_1 = ox.gather(
                [shape,
                 ox.constant(value=np.array([1], dtype=np.int64))],
                axis=0)
            zeros = ox.constant_of_shape(dim_1, value=0.0)
            is_true = ox.constant(value=True)

            @onnx_function(outputs=['c_o', 'total_o', 'scan_o'],
                           output_types=[_Ty.b,
                                         _Ty.F([None]),
                                         _Ty.F([None])],
                           input_types=[_Ty.I([1]), _Ty.b,
                                        _Ty.F([None])])
            def range_body(iter_n, cond, total):
                ox = iter_n.ox
                iter_scalar = ox.squeeze(iter_n, axes=[0])
                col = ox.gather([data, iter_scalar], axis=0)
                total = ox.add([total, col])
                return (is_true, total, total)

            final_total, scan_res = ox.loop(
                dim_0,
                is_true,
                range_body,
                inputs=[zeros],
                outputs=['final_total', 'scan_res'])
            return final_total, scan_res

        m1 = np.array([[2, 3], [4, 5], [6, 7]], dtype=np.float32)
        expected_res = loop_test(m1)

        model = loop_test.to_model()
        f16model = convert_float_to_float16(copy.deepcopy(model))
        actual_res = _ort_inference(f16model, {'data': m1.astype(np.float16)})
        for expected, actual in zip(expected_res, actual_res):
            self.assertTrue(np.allclose(expected, actual))
            self.assertTrue(actual.dtype == np.float16)

        f16model2 = convert_float_to_float16(copy.deepcopy(model),
                                             keep_io_types=True)
        actual_res2 = _ort_inference(f16model2, {'data': m1})
        for expected, actual2 in zip(expected_res, actual_res2):
            self.assertTrue(np.allclose(expected, actual2))
            self.assertTrue(actual2.dtype == np.float32)
Example #3
 def __init__(self, model_path, use_fp16):
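     # Create an ONNX Runtime session; on GPU, optionally convert the model to float16
     # with onnxconverter-common before rebuilding the session from the fp16 file.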
     print(">>> [InferBackend] Creating Engine ...")
     providers = ['CUDAExecutionProvider']
     sess_options = ort.SessionOptions()
     predictor = ort.InferenceSession(
         model_path, sess_options=sess_options, providers=providers)
     if "CUDAExecutionProvider" in predictor.get_providers():
         print(">>> [InferBackend] Use GPU to inference ...")
         if use_fp16:
             from onnxconverter_common import float16
             import onnx
             print(">>> [InferBackend] Use FP16 to inference ...")
             fp16_model = "fp16_model.onnx"
             onnx_model = onnx.load_model(model_path)
             trans_model = float16.convert_float_to_float16(
                 onnx_model, keep_io_types=True)
             onnx.save_model(trans_model, fp16_model)
             sess_options = ort.SessionOptions()
             predictor = ort.InferenceSession(
                 fp16_model, sess_options=sess_options, providers=providers)
     else:
         print(">>> [InferBackend] Use CPU to inference ...")
         if use_fp16:
             print(
                 ">>> [InferBackend] use_fp16 only takes effect when deploying on gpu ..."
             )
     self.predictor = predictor
     input_name1 = self.predictor.get_inputs()[1].name
     input_name2 = self.predictor.get_inputs()[0].name
     self.input_handles = [input_name1, input_name2]
     print(">>> [InferBackend] Engine Created ...")
Example #4
 def run_attempt(node_block_list, return_model=False):
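     # Convert a copy of model0 to float16 while keeping the nodes in node_block_list in float32,
     # then validate against the float32 outputs (model0, keep_io_types, res0, feed_dict and
     # validate come from the enclosing scope).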
     print(node_block_list)
     model = float16.convert_float_to_float16(copy.deepcopy(model0), node_block_list=node_block_list,
                                              keep_io_types=keep_io_types, disable_shape_infer=True)
     res1 = get_tensor_values_using_ort(model, feed_dict)
     if return_model:
         return validate(res0, res1), model
     else:
         valid = validate(res0, res1)
         print(valid)
         return valid
Example #5
    def __init__(self,
                 model_path_prefix,
                 device='cpu',
                 use_quantize=False,
                 use_fp16=False):
        print(">>> [InferBackend] Creating Engine ...")
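        # Export the Paddle inference model to ONNX first; the optional FP16 conversion
        # below operates on this exported file.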
        onnx_model = paddle2onnx.command.c_paddle_to_onnx(
            model_file=model_path_prefix + ".pdmodel",
            params_file=model_path_prefix + ".pdiparams",
            opset_version=13,
            enable_onnx_checker=True)
        infer_model_dir = model_path_prefix.rsplit("/", 1)[0]
        float_onnx_file = os.path.join(infer_model_dir, "model.onnx")
        with open(float_onnx_file, "wb") as f:
            f.write(onnx_model)

        if device == "gpu":
            providers = ['CUDAExecutionProvider']
            print(">>> [InferBackend] Use GPU to inference ...")
            if use_fp16:
                print(">>> [InferBackend] Use FP16 to inference ...")
                from onnxconverter_common import float16
                import onnx
                fp16_model_file = os.path.join(infer_model_dir,
                                               "fp16_model.onnx")
                onnx_model = onnx.load_model(float_onnx_file)
                trans_model = float16.convert_float_to_float16(
                    onnx_model, keep_io_types=True)
                onnx.save_model(trans_model, fp16_model_file)
                onnx_model = fp16_model_file
        else:
            providers = ['CPUExecutionProvider']
            print(">>> [InferBackend] Use CPU to inference ...")

        sess_options = ort.SessionOptions()
        self.predictor = ort.InferenceSession(onnx_model,
                                              sess_options=sess_options,
                                              providers=providers)
        if device == "gpu":
            try:
                assert 'CUDAExecutionProvider' in self.predictor.get_providers()
            except AssertionError:
                raise AssertionError(
                    "The environment for GPU inference is not set properly. "
                    "A possible cause is that you have installed both onnxruntime and onnxruntime-gpu. "
                    "Please run the following commands to reinstall: \n "
                    "1) pip uninstall -y onnxruntime onnxruntime-gpu \n 2) pip install onnxruntime-gpu"
                )
        print(">>> [InferBackend] Engine Created ...")
Example #6
 def _prepare_onnx_mode(self):
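     # Export the static Paddle model to ONNX (once), convert it to float16 with
     # onnxconverter-common, and build a CUDA inference session on the fp16 model.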
     import onnx
     import onnxruntime as ort
     import paddle2onnx
     from onnxconverter_common import float16
     onnx_dir = os.path.join(self._task_path, 'onnx')
     if not os.path.exists(onnx_dir):
         os.mkdir(onnx_dir)
     float_onnx_file = os.path.join(onnx_dir, 'model.onnx')
     if not os.path.exists(float_onnx_file):
         onnx_model = paddle2onnx.command.c_paddle_to_onnx(
             model_file=self._static_model_file,
             params_file=self._static_params_file,
             opset_version=13,
             enable_onnx_checker=True)
         with open(float_onnx_file, "wb") as f:
             f.write(onnx_model)
     fp16_model_file = os.path.join(onnx_dir, 'fp16_model.onnx')
     if not os.path.exists(fp16_model_file):
         onnx_model = onnx.load_model(float_onnx_file)
         trans_model = float16.convert_float_to_float16(onnx_model,
                                                        keep_io_types=True)
         onnx.save_model(trans_model, fp16_model_file)
     providers = ['CUDAExecutionProvider']
     sess_options = ort.SessionOptions()
     sess_options.intra_op_num_threads = self._num_threads
     sess_options.inter_op_num_threads = self._num_threads
     self.predictor = ort.InferenceSession(fp16_model_file,
                                           sess_options=sess_options,
                                           providers=providers)
     try:
         assert 'CUDAExecutionProvider' in self.predictor.get_providers()
     except AssertionError:
          raise AssertionError(
              "The environment for GPU inference is not set properly. "
              "A possible cause is that you have installed both onnxruntime and onnxruntime-gpu. "
             "Please run the following commands to reinstall: \n "
             "1) pip uninstall -y onnxruntime onnxruntime-gpu \n 2) pip install onnxruntime-gpu"
         )
Example #7
    def convert_model_float32_to_float16(self, cast_input_output=True):
        """ Convert a graph to FLOAT16
        """
        from packaging.version import Version
        import onnxconverter_common.float16 as oc
        if Version(oc.__version__) > Version("1.7.0"):
            self.model = oc.convert_float_to_float16(
                self.model, keep_io_types=cast_input_output)
            return

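        # Fallback for older onnxconverter-common releases: convert float32 initializers,
        # Constant/ConstantOfShape values and Cast targets to float16 by hand.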
        graph = self.model.graph
        initializers = graph.initializer

        for initializer in initializers:
            if initializer.data_type == 1:
                initializer.CopyFrom(
                    numpy_helper.from_array(
                        numpy_helper.to_array(initializer).astype(np.float16),
                        initializer.name))

        for node in graph.node:
            if node.op_type in ['Constant', 'ConstantOfShape']:
                for att in node.attribute:
                    if att.name == 'value' and att.t.data_type == 1:
                        att.CopyFrom(
                            helper.make_attribute(
                                "value",
                                numpy_helper.from_array(
                                    numpy_helper.to_array(att.t).astype(
                                        np.float16))))
            if node.op_type == 'Cast':
                for att in node.attribute:
                    if att.name == 'to' and att.i == 1:
                        att.CopyFrom(
                            helper.make_attribute("to",
                                                  int(TensorProto.FLOAT16)))

        if not cast_input_output:
            self.change_input_output_float32_to_float16()
            return

        # Below assumes that we keep input and output data types.
        # Add Cast node to convert input from float32 to float16.
        for input_value_info in graph.input:
            if input_value_info.type.tensor_type.elem_type == TensorProto.FLOAT:
                initializer = self.get_initializer(input_value_info.name)
                if initializer is not None:  # for compatibility for old converter/exporter
                    input_value_info.type.tensor_type.elem_type = TensorProto.FLOAT16
                else:
                    cast_input = input_value_info.name
                    cast_output = input_value_info.name + '_float16'
                    self.replace_input_of_all_nodes(cast_input, cast_output)
                    cast_node = helper.make_node('Cast',
                                                 inputs=[cast_input],
                                                 outputs=[cast_output])
                    cast_node.attribute.extend([
                        helper.make_attribute("to", int(TensorProto.FLOAT16))
                    ])
                    self.add_node(cast_node)

        # Add Cast node to convert output from float16 back to float32.
        for output_value_info in graph.output:
            if output_value_info.type.tensor_type.elem_type == TensorProto.FLOAT:
                cast_input = output_value_info.name + '_float16'
                cast_output = output_value_info.name
                self.replace_output_of_all_nodes(cast_output, cast_input)
                self.replace_input_of_all_nodes(cast_output, cast_input)
                cast_node = helper.make_node('Cast',
                                             inputs=[cast_input],
                                             outputs=[cast_output])
                cast_node.attribute.extend(
                    [helper.make_attribute("to", int(TensorProto.FLOAT))])
                self.add_node(cast_node)
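
For reference, a minimal standalone sketch of the conversion step that the examples above share, assuming an existing float32 ONNX file; the file names below are placeholders, not paths taken from any example above:

import onnx
from onnxconverter_common import float16

# Load the float32 model, convert its float tensors to float16, and save the result.
# keep_io_types=True keeps the graph inputs/outputs in float32 and inserts Cast nodes,
# so callers can keep feeding float32 arrays (as in the keep_io_types examples above).
model = onnx.load_model("model.onnx")  # placeholder path
fp16_model = float16.convert_float_to_float16(model, keep_io_types=True)
onnx.save_model(fp16_model, "model_fp16.onnx")  # placeholder path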