def _guess_type_proto(data_type, dims):
    """Map an ONNX ``TensorProto`` element type onto the matching
    tensor type, instantiated with ``dims``.

    This could be moved to onnxconverter_common.

    Raises RuntimeError for a null dimension and NotImplementedError
    for an unsupported element type.
    """
    # A 0 dimension is never valid here: reject it up front.
    for d in dims:
        if d == 0:
            raise RuntimeError("Dimension should not be null: {}.".format(
                list(dims)))
    # elem_type values are plain ints, so a dispatch table is equivalent
    # to the if-chain.
    table = {
        onnx_proto.TensorProto.FLOAT: FloatTensorType,
        onnx_proto.TensorProto.DOUBLE: DoubleTensorType,
        onnx_proto.TensorProto.STRING: StringTensorType,
        onnx_proto.TensorProto.INT64: Int64TensorType,
        onnx_proto.TensorProto.INT32: Int32TensorType,
        onnx_proto.TensorProto.BOOL: BooleanTensorType,
        onnx_proto.TensorProto.INT8: Int8TensorType,
        onnx_proto.TensorProto.UINT8: UInt8TensorType,
    }
    # Complex types are only available in recent onnxconverter_common.
    if Complex64TensorType is not None:
        table[onnx_proto.TensorProto.COMPLEX64] = Complex64TensorType
        table[onnx_proto.TensorProto.COMPLEX128] = Complex128TensorType
    ctor = table.get(data_type)
    if ctor is not None:
        return ctor(dims)
    raise NotImplementedError(
        "Unsupported data_type '{}'. You may raise an issue "
        "at https://github.com/onnx/sklearn-onnx/issues."
        "".format(data_type))
def _guess_type_proto_str(data_type, dims):
    """Map a textual ONNX type such as ``'tensor(float)'`` onto the
    matching tensor type, instantiated with ``dims``.

    This could be moved to onnxconverter_common.
    """
    # String keys make a lookup table an exact replacement for the
    # original if-chain.
    table = {
        "tensor(float)": FloatTensorType,
        "tensor(double)": DoubleTensorType,
        "tensor(string)": StringTensorType,
        "tensor(int64)": Int64TensorType,
        "tensor(int32)": Int32TensorType,
        "tensor(bool)": BooleanTensorType,
        "tensor(int8)": Int8TensorType,
        "tensor(uint8)": UInt8TensorType,
    }
    # Complex types are only available in recent onnxconverter_common.
    if Complex64TensorType is not None:
        table["tensor(complex64)"] = Complex64TensorType
        table["tensor(complex128)"] = Complex128TensorType
    ctor = table.get(data_type)
    if ctor is not None:
        return ctor(dims)
    raise NotImplementedError(
        "Unsupported data_type '{}'. You may raise an issue "
        "at https://github.com/onnx/sklearn-onnx/issues."
        "".format(data_type))
def test_forgotten_backend_string(self):
    """Passing initial types where the backend string belongs must
    raise a ValueError."""
    from sklearn.preprocessing import LabelEncoder

    model = LabelEncoder()
    data = np.array([1, 4, 5, 2, 0, 2], dtype=np.int32)
    model.fit(data)
    # The second positional argument should be the backend name; here it
    # is the initial-types list instead, which convert must reject.
    with self.assertRaises(ValueError):
        hummingbird.ml.convert(model, [("input", Int32TensorType([6, 1]))])
def _declare_input_variables(topology, raw_model_container, extra_config):
    """Declare the input variables of the computational graph.

    Args:
        topology: The topology the logical variables are declared on.
        raw_model_container: Container the declared inputs are registered
            into (it is part of the topology that will be returned).
        extra_config: May carry ``constants.N_INPUTS``,
            ``constants.INPUT_NAMES`` and ``constants.TEST_INPUT``.

    Returns:
        The list of declared input variables.

    Raises:
        NotImplementedError: When a test input has an unsupported dtype.
    """
    # Declare input variables.
    inputs = []
    n_inputs = extra_config[
        constants.N_INPUTS] if constants.N_INPUTS in extra_config else 1
    if constants.INPUT_NAMES in extra_config:
        assert n_inputs == len(extra_config[constants.INPUT_NAMES])
    if constants.TEST_INPUT in extra_config:
        from onnxconverter_common.data_types import (
            FloatTensorType,
            DoubleTensorType,
            Int32TensorType,
            Int64TensorType,
            StringTensorType,
        )

        # With a single input, TEST_INPUT holds the array itself rather
        # than a list of arrays: normalize to a list.
        test_input = extra_config[constants.TEST_INPUT] if n_inputs > 1 else [
            extra_config[constants.TEST_INPUT]
        ]
        for i in range(n_inputs):
            input = test_input[i]
            input_name = (extra_config[constants.INPUT_NAMES][i]
                          if constants.INPUT_NAMES in extra_config else
                          "input_{}".format(i))
            # Infer the logical tensor type from the test input's dtype.
            if input.dtype == np.float32:
                input_type = FloatTensorType(input.shape)
            elif input.dtype == np.float64:
                input_type = DoubleTensorType(input.shape)
            elif input.dtype == np.int32:
                input_type = Int32TensorType(input.shape)
            elif input.dtype == np.int64:
                input_type = Int64TensorType(input.shape)
            elif input.dtype.kind in constants.SUPPORTED_STRING_TYPES:
                input_type = StringTensorType(input.shape)
            else:
                raise NotImplementedError(
                    "Type {} not supported. Please fill an issue on https://github.com/microsoft/hummingbird/."
                    .format(input.dtype))
            inputs.append(
                topology.declare_logical_variable(input_name,
                                                  type=input_type))
    else:
        # We have no information on the input. Sklearn/Spark-ML always gets as input a single dataframe,
        # therefore by default we start with a single `input` variable.
        # BUG FIX: the guard previously tested `constants.TEST_INPUT in
        # extra_config`, which is always False in this branch, so a
        # user-provided INPUT_NAMES was silently ignored.
        input_name = (extra_config[constants.INPUT_NAMES][0]
                      if constants.INPUT_NAMES in extra_config else "input")
        var = topology.declare_logical_variable(input_name)
        inputs.append(var)

    # The object raw_model_container is a part of the topology we're going to return.
    # We use it to store the inputs of the Sklearn/Spark-ML's computational graph.
    for variable in inputs:
        raw_model_container.add_input(variable)

    return inputs
def test_onnx_no_test_data_int(self):
    """Converting an ONNX-ML model to 'onnx' must work without test data
    (int32 input case)."""
    warnings.filterwarnings("ignore")
    encoder = OneHotEncoder()
    X = np.array([[1, 2, 3]], dtype=np.int32)
    encoder.fit(X)
    # Create ONNX-ML model
    initial_types = [("input", Int32TensorType([X.shape[0], X.shape[1]]))]
    onnx_ml_model = convert_sklearn(
        encoder, initial_types=initial_types, target_opset=11
    )
    # Test onnx requires no test_data
    hb_model = hummingbird.ml.convert(onnx_ml_model, "onnx")
    assert hb_model
def _guess_type_proto(data_type, dims):
    """Translate an ONNX ``TensorProto`` element type into the matching
    tensor type, instantiated with ``dims``.

    This could be moved to onnxconverter_common.
    """
    # elem_type values are plain ints, so a lookup table is equivalent
    # to the original elif-chain.
    proto_to_type = {
        onnx_proto.TensorProto.FLOAT: FloatTensorType,
        onnx_proto.TensorProto.DOUBLE: DoubleTensorType,
        onnx_proto.TensorProto.STRING: StringTensorType,
        onnx_proto.TensorProto.INT64: Int64TensorType,
        onnx_proto.TensorProto.INT32: Int32TensorType,
        onnx_proto.TensorProto.BOOL: BooleanTensorType,
    }
    ctor = proto_to_type.get(data_type)
    if ctor is not None:
        return ctor(dims)
    raise NotImplementedError(
        "Unsupported data_type '{}'. You may raise an issue "
        "at https://github.com/onnx/sklearn-onnx/issues."
        "".format(data_type))
def _guess_numpy_type(data_type, dims):
    """Map a numpy dtype (or scalar type) onto the matching tensor type,
    instantiated with ``dims``.

    This could be moved to onnxconverter_common.

    Raises:
        NotImplementedError: When the dtype has no tensor-type mapping.
    """
    # BUG FIX: `np.str` and `np.bool` were deprecated aliases of the
    # builtins (NumPy 1.20) and removed in NumPy 1.24, so referencing
    # them raises AttributeError on modern NumPy. Use `np.str_` and
    # `(np.bool_, bool)` instead, matching the newer variant of this
    # helper elsewhere in the project.
    if data_type == np.float32:
        return FloatTensorType(dims)
    elif data_type in (np.str_, str, object) or str(data_type) in ('<U1', ):  # noqa
        return StringTensorType(dims)
    elif data_type in (np.int64, np.uint64) or str(data_type) == '<U6':
        return Int64TensorType(dims)
    elif data_type in (np.int32, np.uint32) or str(data_type) in ('<U4', ):  # noqa
        return Int32TensorType(dims)
    elif data_type in (np.bool_, bool):
        return BooleanTensorType(dims)
    else:
        raise NotImplementedError(
            "Unsupported data_type '{}'. You may raise an issue "
            "at https://github.com/onnx/sklearn-onnx/issues."
            "".format(data_type))
def _guess_numpy_type(data_type, dims):
    """Map a numpy dtype (or scalar type) onto the matching tensor type,
    instantiated with ``dims``.

    This could be moved to onnxconverter_common.
    """
    # NOTE: numpy dtype objects compare equal to scalar types via `==`
    # but do not hash the same way, so the dispatch below keeps the
    # original `==`/`in` comparisons (an ordered predicate list) instead
    # of a plain dict lookup.
    def _is_stringish(dt):
        return (dt in (np.str_, str, object) or str(dt) in ('<U1', )
                or (hasattr(dt, 'type') and dt.type is np.str_))  # noqa

    dispatch = [
        (lambda dt: dt == np.float32, FloatTensorType),
        (lambda dt: dt == np.float64, DoubleTensorType),
        (_is_stringish, StringTensorType),
        (lambda dt: dt in (np.int64, ) or str(dt) == '<U6', Int64TensorType),
        (lambda dt: dt in (np.int32, ) or str(dt) in ('<U4', ),  # noqa
         Int32TensorType),
        (lambda dt: dt == np.uint8, UInt8TensorType),
        (lambda dt: dt in (np.bool_, bool), BooleanTensorType),
        (lambda dt: dt in (np.str_, str), StringTensorType),
        (lambda dt: dt == np.int8, Int8TensorType),
        (lambda dt: dt == np.int16, Int16TensorType),
        (lambda dt: dt == np.uint64, UInt64TensorType),
        (lambda dt: dt == np.uint32, UInt32TensorType),
        (lambda dt: dt == np.uint16, UInt16TensorType),
        (lambda dt: dt == np.float16, Float16TensorType),
    ]
    # Complex types are only available in recent onnxconverter_common.
    if Complex64TensorType is not None:
        dispatch.append((lambda dt: dt == np.complex64, Complex64TensorType))
        dispatch.append(
            (lambda dt: dt == np.complex128, Complex128TensorType))
    for matches, tensor_type in dispatch:
        if matches(data_type):
            return tensor_type(dims)
    raise NotImplementedError(
        "Unsupported data_type %r (type=%r). You may raise an issue "
        "at https://github.com/onnx/sklearn-onnx/issues."
        "" % (data_type, type(data_type)))
def from_pb(obj):
    """
    Creates a data type from a protobuf object.
    """
    def get_shape(tt):
        # Collect the concrete dim_value of every dimension.
        return [tt.shape.dim[i].dim_value
                for i in range(len(tt.shape.dim))]

    # A repeated protobuf field (list-like, exposes `extend`) is
    # converted element by element.
    if hasattr(obj, 'extend'):
        return [Variable.from_pb(o) for o in obj]

    name = obj.name
    if not obj.type.tensor_type:
        # Only tensor-typed values are handled here.
        raise NotImplementedError("Unsupported type '{}' as "
                                  "a string ({}).".format(type(obj), obj))
    tt = obj.type.tensor_type
    elem = tt.elem_type
    shape = get_shape(tt)
    # elem_type is a plain int enum, so a dict lookup matches the
    # original elif-chain exactly.
    elem_to_type = {
        onnx_proto.TensorProto.FLOAT: FloatTensorType,
        onnx_proto.TensorProto.BOOL: BooleanTensorType,
        onnx_proto.TensorProto.DOUBLE: DoubleTensorType,
        onnx_proto.TensorProto.STRING: StringTensorType,
        onnx_proto.TensorProto.INT64: Int64TensorType,
        onnx_proto.TensorProto.INT32: Int32TensorType,
    }
    if elem not in elem_to_type:
        raise NotImplementedError("Unsupported type '{}' "
                                  "(elem_type={}).".format(
                                      type(obj.type.tensor_type), elem))
    ty = elem_to_type[elem](shape)
    return Variable(name, name, None, ty)
def parse_sklearn_api_model(model, extra_config=None):
    """
    Puts *scikit-learn* object into an abstract representation so that our framework
    can work seamlessly on models created with different machine learning tools.

    Args:
        model: A model object in scikit-learn format
        extra_config: Optional configuration dict. Recognized keys include
            ``constants.N_INPUTS``, ``constants.INPUT_NAMES``,
            ``constants.TEST_INPUT`` and ``constants.OUTPUT_NAMES``.

    Returns:
        A `onnxconverter_common.topology.Topology` object representing the input model
    """
    assert model is not None, "Cannot convert a mode of type None."

    # Avoid the shared mutable-default pitfall; passing {} explicitly
    # behaves exactly as before.
    if extra_config is None:
        extra_config = {}

    raw_model_container = CommonSklearnModelContainer(model)

    # Declare a computational graph. It will become a representation of
    # the input scikit-learn model after parsing.
    topology = Topology(raw_model_container)

    # Declare an object to provide variables' and operators' naming mechanism.
    # One global scope is enough for parsing scikit-learn models.
    scope = topology.declare_scope("__root__")

    # Declare input variables.
    inputs = []
    n_inputs = extra_config[
        constants.N_INPUTS] if constants.N_INPUTS in extra_config else 1
    if constants.INPUT_NAMES in extra_config:
        assert n_inputs == len(extra_config[constants.INPUT_NAMES])
    if constants.TEST_INPUT in extra_config:
        from onnxconverter_common.data_types import FloatTensorType, DoubleTensorType, Int32TensorType, Int64TensorType

        # With a single input, TEST_INPUT holds the array itself rather
        # than a list of arrays: normalize to a list.
        test_input = extra_config[constants.TEST_INPUT] if n_inputs > 1 else [
            extra_config[constants.TEST_INPUT]
        ]
        for i in range(n_inputs):
            input = test_input[i]
            input_name = (extra_config[constants.INPUT_NAMES][i]
                          if constants.INPUT_NAMES in extra_config else
                          "input_{}".format(i))
            # Infer the logical tensor type from the test input's dtype.
            if input.dtype == np.float32:
                input_type = FloatTensorType(input.shape)
            elif input.dtype == np.float64:
                input_type = DoubleTensorType(input.shape)
            elif input.dtype == np.int32:
                input_type = Int32TensorType(input.shape)
            elif input.dtype == np.int64:
                input_type = Int64TensorType(input.shape)
            else:
                raise RuntimeError(
                    "Type {} not supported. Please fill an issue on https://github.com/microsoft/hummingbird/."
                    .format(type(input.dtype)))
            inputs.append(
                scope.declare_local_variable(input_name, type=input_type))
    else:
        # We have no information on the input. Sklearn always gets as input a single dataframe,
        # therefore by default we start with a single `input` variable.
        # BUG FIX: the guard previously tested `constants.TEST_INPUT in
        # extra_config`, which is always False in this branch, so a
        # user-provided INPUT_NAMES was silently ignored.
        input_name = (extra_config[constants.INPUT_NAMES][0]
                      if constants.INPUT_NAMES in extra_config else "input")
        inputs.append(scope.declare_local_variable(input_name))

    # The object raw_model_container is a part of the topology we're going to return.
    # We use it to store the inputs of the scikit-learn's computational graph.
    for variable in inputs:
        raw_model_container.add_input(variable)

    # Parse the input scikit-learn model into its scope with the topology.
    # Get the outputs of the model.
    outputs = _parse_sklearn_api(scope, model, inputs)

    # Use the output names specified by the user, if any.
    if constants.OUTPUT_NAMES in extra_config:
        assert len(extra_config[constants.OUTPUT_NAMES]) == len(outputs)
        for i in range(len(outputs)):
            outputs[i].raw_name = extra_config[constants.OUTPUT_NAMES][i]

    # The object raw_model_container is a part of the topology we're going to return.
    # We use it to store the outputs of the scikit-learn's computational graph.
    for variable in outputs:
        raw_model_container.add_output(variable)

    return topology
def _convert_onnxml(model, backend, test_input, device, extra_config={}):
    """
    This function converts the specified [ONNX-ML] model into its *backend* counterpart.
    The supported operators can be found at `hummingbird.ml.supported`.

    Args:
        model: An ONNX-ML model (protobuf ``ModelProto``) to convert.
        backend: The target backend name (e.g. "onnx", "torch.jit", tvm).
        test_input: Optional test data used for tracing; generated from the
            model's input schema when None.
        device: The device the converted model is placed on.
        extra_config: Configuration dictionary; it is MUTATED in place
            (N_INPUTS, N_FEATURES, TEST_INPUT, ONNX_INITIALIZERS are set).
            NOTE(review): the mutable default `{}` is shared across calls
            that omit this argument, so those mutations leak between such
            calls — confirm whether callers always pass it explicitly.

    Returns:
        The converted *backend* model produced by `topology_converter`.
    """
    assert model is not None
    assert torch_installed(), "To use Hummingbird you need to install torch."

    import onnx

    # The conversion requires some test input for tracing.
    # Test inputs can be either provided or generate from the input schema of the model.
    # Generate some test input if necessary.
    if test_input is None:
        import torch
        from onnxconverter_common.data_types import FloatTensorType, DoubleTensorType, Int32TensorType, Int64TensorType

        tvm_backend = None
        if tvm_installed():
            import tvm
            tvm_backend = tvm.__name__

        # Get the input information from the ONNX schema.
        initial_types = []
        for input in model.graph.input:
            name = input.name if hasattr(input, "name") else None
            # elem_type is only reachable through type.tensor_type; any
            # missing link means we cannot infer the input's data type.
            data_type = (
                input.type.tensor_type.elem_type
                if hasattr(input, "type") and hasattr(input.type, "tensor_type") and hasattr(input.type.tensor_type, "elem_type")
                else None)
            if name is None:
                raise RuntimeError(
                    "Cannot fetch input name or data_type from the ONNX schema. Please provide some test input."
                )
            if data_type is None:
                raise RuntimeError(
                    "Cannot fetch input data_type from the ONNX schema, or data type is not tensor_type. Please provide some test input."
                )
            if not hasattr(input.type.tensor_type, "shape"):
                raise RuntimeError(
                    "Cannot fetch input shape from ONNX schema. Please provide some test input."
                )
            shape = [dim.dim_value for dim in input.type.tensor_type.shape.dim]
            # 1-D inputs are promoted to a single-row 2-D shape; only 2-D
            # inputs are supported past this point.
            if len(shape) == 1:
                shape = [1, shape[0]]
            assert len(shape) == 2
            # In ONNX dynamic dimensions will have a shape of 0. Fix the 0-shape in the batch dimension if they exist.
            if shape[0] == 0:
                shape[0] = 1
            # Map the TensorProto elem_type codes onto logical tensor types
            # (1=FLOAT, 11=DOUBLE, 6=INT32, 7=INT64).
            if data_type == 1:
                initial_types.append((name, FloatTensorType(shape)))
            elif data_type == 11:
                initial_types.append((name, DoubleTensorType(shape)))
            elif data_type == 6:
                initial_types.append((name, Int32TensorType(shape)))
            elif data_type == 7:
                initial_types.append((name, Int64TensorType(shape)))
            else:
                raise RuntimeError(
                    "Input data type {} not supported. Please fill an issue at https://github.com/microsoft/hummingbird/, or pass some test_input"
                    .format(data_type))

        # All inputs must share one shape; N_FEATURES is derived from it.
        first_shape = initial_types[0][1].shape
        assert all(
            map(lambda x: x[1].shape == first_shape, initial_types)
        ), "Hummingbird currently supports only inputs with same shape."
        extra_config[constants.N_INPUTS] = len(initial_types)
        extra_config[constants.N_FEATURES] = extra_config[
            constants.N_INPUTS] * first_shape[1]

        # Generate some random input data if necessary for the model conversion.
        # Only these backends actually need concrete data for tracing.
        if backend == onnx.__name__ or backend == tvm_backend or backend == torch.jit.__name__:
            test_input = []
            for i, it in enumerate(initial_types):
                if type(it[1]) is FloatTensorType:
                    test_input.append(
                        np.array(np.random.rand(first_shape[0], first_shape[1]),
                                 dtype=np.float32))
                elif type(it[1]) is DoubleTensorType:
                    test_input.append(
                        np.random.rand(first_shape[0], first_shape[1]))
                elif type(it[1]) is Int32TensorType:
                    test_input.append(
                        np.array(np.random.randint(100, size=first_shape),
                                 dtype=np.int32))
                elif type(it[1]) is Int64TensorType:
                    test_input.append(np.random.randint(100, size=first_shape))
                else:
                    raise RuntimeError(
                        "Type {} not supported. Please fill an issue on https://github.com/microsoft/hummingbird/."
                        .format(type(it[1])))
            # Single input: unwrap the list; multiple inputs: use a tuple.
            if extra_config[constants.N_INPUTS] == 1:
                test_input = test_input[0]
            else:
                test_input = tuple(test_input)
            extra_config[constants.TEST_INPUT] = test_input
    # Set the number of features. Some converter requires to know in advance the number of features.
    if constants.N_FEATURES not in extra_config and test_input is not None:
        if len(test_input.shape) < 2:
            extra_config[constants.N_FEATURES] = 1
        else:
            extra_config[constants.N_FEATURES] = test_input.shape[1]

    # Set the initializers. Some converter requires the access to initializers.
    initializers = {} if model.graph.initializer is None else {
        in_.name: in_
        for in_ in model.graph.initializer
    }
    extra_config[constants.ONNX_INITIALIZERS] = initializers

    # Parse ONNX model as our internal data structure (i.e., Topology).
    topology = parse_onnx_api_model(model)

    # Convert the Topology object into a PyTorch model.
    hb_model = topology_converter(topology,
                                  backend,
                                  test_input,
                                  device,
                                  extra_config=extra_config)
    return hb_model