def _dataframe_from_json(path_or_str, schema: Schema = None, pandas_orient: str = "split",
                         precise_float=False) -> pd.DataFrame:
    """
    Parse JSON into a pandas.DataFrame. The caller can pass a schema to ensure correct type
    parsing and to make any necessary conversions (e.g. string -> binary for binary columns).

    :param path_or_str: Path to a JSON file or a JSON string.
    :param schema: MLflow schema used when parsing the data.
    :param pandas_orient: pandas data frame convention used to store the data.
    :param precise_float: Passed through to pandas.read_json; enables the higher-precision
                          float parser.
    :return: pandas.DataFrame.
    """
    if schema is not None:
        dtypes = dict(zip(schema.column_names(), schema.pandas_types()))
        df = pd.read_json(path_or_str, orient=pandas_orient, dtype=dtypes,
                          precise_float=precise_float)
        actual_cols = set(df.columns)
        for type_, name in zip(schema.column_types(), schema.column_names()):
            # Binary data is stored as base64-encoded strings in JSON; decode it back to bytes.
            if type_ == DataType.binary and name in actual_cols:
                df[name] = df[name].map(lambda x: base64.decodebytes(bytes(x, "utf8")))
        return df
    else:
        return pd.read_json(path_or_str, orient=pandas_orient, dtype=False,
                            precise_float=precise_float)
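# A minimal usage sketch (illustrative only; the `_example_*` names are not part of the
# module): serialize a frame with pandas using the matching orient, then parse it back
# without a schema, letting pandas infer the dtypes.
_example_json = pd.DataFrame({"x": [1, 2], "y": [0.5, 1.5]}).to_json(orient="split")
_example_df = _dataframe_from_json(_example_json, pandas_orient="split")
assert _example_df["y"].dtype == np.float64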
def test_spark_schema_inference(pandas_df_with_all_types):
    import pyspark
    from pyspark.sql.types import _parse_datatype_string, StructField, StructType

    # The fixture names each column after its type, so ColSpec(x, x) and t.name below refer
    # to the same strings.
    schema = _infer_schema(pandas_df_with_all_types)
    assert schema == Schema([ColSpec(x, x) for x in pandas_df_with_all_types.columns])
    spark_session = pyspark.sql.SparkSession(pyspark.SparkContext.getOrCreate())
    spark_schema = StructType(
        [StructField(t.name, _parse_datatype_string(t.name), True)
         for t in schema.column_types()])
    sparkdf = spark_session.createDataFrame(pandas_df_with_all_types, schema=spark_schema)
    schema = _infer_schema(sparkdf)
    assert schema == Schema([ColSpec(x, x) for x in pandas_df_with_all_types.columns])
@classmethod
def from_dict(cls, signature_dict: Dict[str, Any]):
    """
    Deserialize from dictionary representation.

    :param signature_dict: Dictionary representation of model signature. Expected dictionary
                           format: `{'inputs': <json string>, 'outputs': <json string>}`

    :return: ModelSignature populated with the data from the dictionary.
    """
    inputs = Schema.from_json(signature_dict["inputs"])
    if "outputs" in signature_dict and signature_dict["outputs"] is not None:
        outputs = Schema.from_json(signature_dict["outputs"])
        return cls(inputs, outputs)
    else:
        return cls(inputs)
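# A minimal round-trip sketch (illustrative only; assumes ModelSignature.to_dict as the
# serialization counterpart). Shown as a comment since from_dict is a method excerpt:
#
#     sig = ModelSignature(inputs=Schema([ColSpec("integer", "x")]))
#     assert ModelSignature.from_dict(sig.to_dict()) == sig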
def test_model_save_load():
    m = Model(artifact_path="some/path", run_id="123",
              flavors={
                  "flavor1": {"a": 1, "b": 2},
                  "flavor2": {"x": 1, "y": 2},
              },
              signature=ModelSignature(
                  inputs=Schema([ColSpec("integer", "x"), ColSpec("integer", "y")]),
                  outputs=Schema([ColSpec(name=None, type="double")])),
              saved_input_example_info={"x": 1, "y": 2})
    assert m.get_input_schema() == m.signature.inputs
    assert m.get_output_schema() == m.signature.outputs

    x = Model(artifact_path="some/other/path", run_id="1234")
    assert x.get_input_schema() is None
    assert x.get_output_schema() is None

    n = Model(artifact_path="some/path", run_id="123",
              flavors={
                  "flavor1": {"a": 1, "b": 2},
                  "flavor2": {"x": 1, "y": 2},
              },
              signature=ModelSignature(
                  inputs=Schema([ColSpec("integer", "x"), ColSpec("integer", "y")]),
                  outputs=Schema([ColSpec(name=None, type="double")])),
              saved_input_example_info={"x": 1, "y": 2})
    n.utc_time_created = m.utc_time_created
    assert m == n

    n.signature = None
    assert m != n

    with TempDir() as tmp:
        m.save(tmp.path("model"))
        o = Model.load(tmp.path("model"))
    assert m == o
    assert m.to_json() == o.to_json()
    assert m.to_yaml() == o.to_yaml()
def test_model_log():
    with TempDir(chdr=True) as tmp:
        experiment_id = kiwi.create_experiment("test")
        sig = ModelSignature(
            inputs=Schema([ColSpec("integer", "x"), ColSpec("integer", "y")]),
            outputs=Schema([ColSpec(name=None, type="double")]))
        input_example = {"x": 1, "y": 2}
        with kiwi.start_run(experiment_id=experiment_id) as r:
            Model.log("some/path", TestFlavor, signature=sig, input_example=input_example)

        local_path = _download_artifact_from_uri(
            "runs:/{}/some/path".format(r.info.run_id), output_path=tmp.path(""))
        loaded_model = Model.load(os.path.join(local_path, "MLmodel"))
        assert loaded_model.run_id == r.info.run_id
        assert loaded_model.artifact_path == "some/path"
        assert loaded_model.flavors == {
            "flavor1": {"a": 1, "b": 2},
            "flavor2": {"x": 1, "y": 2},
        }
        assert loaded_model.signature == sig
        path = os.path.join(local_path, loaded_model.saved_input_example_info["artifact_path"])
        x = _dataframe_from_json(path)
        assert x.to_dict(orient="records")[0] == input_example
def test_spark_type_mapping(pandas_df_with_all_types):
    import pyspark
    from pyspark.sql.types import BooleanType, IntegerType, LongType, FloatType, DoubleType, \
        StringType, BinaryType
    from pyspark.sql.types import StructField, StructType

    assert isinstance(DataType.boolean.to_spark(), BooleanType)
    assert isinstance(DataType.integer.to_spark(), IntegerType)
    assert isinstance(DataType.long.to_spark(), LongType)
    assert isinstance(DataType.float.to_spark(), FloatType)
    assert isinstance(DataType.double.to_spark(), DoubleType)
    assert isinstance(DataType.string.to_spark(), StringType)
    assert isinstance(DataType.binary.to_spark(), BinaryType)

    # The fixture's column names match the type names, so t.name works as the field name here.
    schema = _infer_schema(pandas_df_with_all_types)
    expected_spark_schema = StructType(
        [StructField(t.name, t.to_spark(), True) for t in schema.column_types()])
    actual_spark_schema = schema.as_spark_schema()
    assert expected_spark_schema.jsonValue() == actual_spark_schema.jsonValue()
    spark_session = pyspark.sql.SparkSession(pyspark.SparkContext.getOrCreate())
    sparkdf = spark_session.createDataFrame(pandas_df_with_all_types,
                                            schema=actual_spark_schema)
    schema2 = _infer_schema(sparkdf)
    assert schema == schema2

    # test unnamed columns
    schema = Schema([ColSpec(col.type) for col in schema.columns])
    expected_spark_schema = StructType(
        [StructField(str(i), t.to_spark(), True)
         for i, t in enumerate(schema.column_types())])
    actual_spark_schema = schema.as_spark_schema()
    assert expected_spark_schema.jsonValue() == actual_spark_schema.jsonValue()

    # test that a single unnamed column is mapped to just a single spark type
    schema = Schema([ColSpec(DataType.integer)])
    spark_type = schema.as_spark_schema()
    assert isinstance(spark_type, IntegerType)
def _infer_schema(data: Any) -> Schema:
    """
    Infer an MLflow schema from a dataset.

    This method captures the column names and data types from the user data. The signature
    represents model input and output as data frames with (optionally) named columns and data
    type specified as one of the types defined in :py:class:`DataType`. This method will raise
    an exception if the user data contains incompatible types or is not passed in one of the
    supported formats (containers).

    The input should be one of these:
      - pandas.DataFrame or pandas.Series
      - dictionary of { name -> numpy.ndarray}
      - numpy.ndarray
      - pyspark.sql.DataFrame

    The element types should be mappable to one of :py:class:`mlflow.models.signature.DataType`.

    NOTE: Multidimensional (>2d) arrays (aka tensors) are not supported at this time.

    :param data: Dataset to infer from.

    :return: Schema
    """
    if isinstance(data, dict):
        res = []
        for col in data.keys():
            ary = data[col]
            if not isinstance(ary, np.ndarray):
                raise TypeError("Data in the dictionary must be of type numpy.ndarray")
            dims = len(ary.shape)
            if dims == 1:
                res.append(ColSpec(type=_infer_numpy_array(ary), name=col))
            else:
                raise TensorsNotSupportedException(
                    "Data in the dictionary must be 1-dimensional, "
                    "got shape {}".format(ary.shape))
        return Schema(res)
    elif isinstance(data, pd.Series):
        return Schema([ColSpec(type=_infer_numpy_array(data.values))])
    elif isinstance(data, pd.DataFrame):
        return Schema([
            ColSpec(type=_infer_numpy_array(data[col].values), name=col)
            for col in data.columns
        ])
    elif isinstance(data, np.ndarray):
        if len(data.shape) > 2:
            raise TensorsNotSupportedException(
                "Attempting to infer schema from numpy array with "
                "shape {}".format(data.shape))
        if data.dtype == np.object:
            # Object arrays carry no element type; let pandas infer one per column.
            data = pd.DataFrame(data).infer_objects()
            return Schema([
                ColSpec(type=_infer_numpy_array(data[col].values))
                for col in data.columns
            ])
        if len(data.shape) == 1:
            return Schema([ColSpec(type=_infer_numpy_dtype(data.dtype))])
        elif len(data.shape) == 2:
            return Schema([ColSpec(type=_infer_numpy_dtype(data.dtype))] * data.shape[1])
    elif _is_spark_df(data):
        return Schema([
            ColSpec(type=_infer_spark_type(field.dataType), name=field.name)
            for field in data.schema.fields
        ])
    raise TypeError("Expected one of (pandas.DataFrame, numpy array, "
                    "dictionary of (name -> numpy.ndarray), pyspark.sql.DataFrame) "
                    "but got '{}'".format(type(data)))
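# A minimal sketch of inference on two of the supported containers (illustrative column
# names and values; the `_example_*` names are not part of the module):
_example_dict_schema = _infer_schema({"x": np.array([1, 2, 3], dtype=np.int32)})
assert _example_dict_schema == Schema([ColSpec(DataType.integer, "x")])
_example_df_schema = _infer_schema(pd.DataFrame({"y": [0.5, 1.5]}))
assert _example_df_schema == Schema([ColSpec(DataType.double, "y")])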
def test_schema_inference_on_numpy_array(pandas_df_with_all_types):
    # Drop the int and float columns: type size information is lost when the values are stored
    # as objects, and the 64-bit defaults would be inferred instead.
    pandas_df_with_all_types = pandas_df_with_all_types.drop(columns=["integer", "float"])
    schema = _infer_schema(pandas_df_with_all_types.values)
    assert schema == Schema([ColSpec(x) for x in pandas_df_with_all_types.columns])

    # test objects
    schema = _infer_schema(np.array(["a"], dtype=np.object))
    assert schema == Schema([ColSpec(DataType.string)])
    schema = _infer_schema(np.array([bytes([1])], dtype=np.object))
    assert schema == Schema([ColSpec(DataType.binary)])
    schema = _infer_schema(np.array([bytearray([1]), None], dtype=np.object))
    assert schema == Schema([ColSpec(DataType.binary)])
    schema = _infer_schema(np.array([True, None], dtype=np.object))
    assert schema == Schema([ColSpec(DataType.boolean)])
    schema = _infer_schema(np.array([1.1, None], dtype=np.object))
    assert schema == Schema([ColSpec(DataType.double)])

    # test bytes
    schema = _infer_schema(np.array([bytes([1])], dtype=np.bytes_))
    assert schema == Schema([ColSpec(DataType.binary)])
    schema = _infer_schema(np.array([bytearray([1])], dtype=np.bytes_))
    assert schema == Schema([ColSpec(DataType.binary)])

    # test string
    schema = _infer_schema(np.array(["a"], dtype=np.str))
    assert schema == Schema([ColSpec(DataType.string)])

    # test boolean
    schema = _infer_schema(np.array([True], dtype=np.bool))
    assert schema == Schema([ColSpec(DataType.boolean)])

    # test ints
    for t in [np.uint8, np.uint16, np.int8, np.int16, np.int32]:
        schema = _infer_schema(np.array([1, 2, 3], dtype=t))
        assert schema == Schema([ColSpec("integer")])

    # test longs
    for t in [np.uint32, np.int64]:
        schema = _infer_schema(np.array([1, 2, 3], dtype=t))
        assert schema == Schema([ColSpec("long")])

    # unsigned long is unsupported
    with pytest.raises(MlflowException):
        _infer_schema(np.array([1, 2, 3], dtype=np.uint64))

    # test floats
    for t in [np.float16, np.float32]:
        schema = _infer_schema(np.array([1.1, 2.2, 3.3], dtype=t))
        assert schema == Schema([ColSpec("float")])

    # test doubles
    schema = _infer_schema(np.array([1.1, 2.2, 3.3], dtype=np.float64))
    assert schema == Schema([ColSpec("double")])

    # float128 is unsupported (and not available on all platforms, hence the guard)
    if hasattr(np, "float128"):
        with pytest.raises(MlflowException):
            _infer_schema(np.array([1, 2, 3], dtype=np.float128))
def test_schema_inference_on_dataframe(pandas_df_with_all_types):
    schema = _infer_schema(pandas_df_with_all_types)
    assert schema == Schema([ColSpec(x, x) for x in pandas_df_with_all_types.columns])