def bundle(spark_session, spark_df_schema, spark_pipeline_model):
    """Persist a fitted Spark pipeline and its input schema, then reload
    both and convert them to a PMML byte array via JPMML.

    The function deliberately performs a full save/load round trip so the
    "serve" half exercises exactly the artifacts a serving process would
    read from disk.

    :param spark_session: active SparkSession (used to reach the JVM and
        for the py4j conversion of the restored schema).
    :param spark_df_schema: :class:`StructType` of the training DataFrame.
    :param spark_pipeline_model: fitted :class:`PipelineModel` to export.
    :return: PMML document as a JVM byte array.

    NOTE(review): the schema is stored as a pickled JSON string in
    ``model.schema``; only load this file from trusted locations, since
    ``pickle.load`` executes arbitrary code on malicious input.
    """
    from pyspark.sql.types import _parse_datatype_json_string

    # --- bundle phase: write schema (JSON via pickle) and model to disk ---
    spark_df_schema_as_json = spark_df_schema.json()
    with open('model.schema', 'wb') as pkl_file:
        pickle.dump(spark_df_schema_as_json, pkl_file)
    spark_pipeline_model.write().overwrite().save('model.parquet')

    # --- serve phase: restore both artifacts and hand them to the JVM ---
    with open('model.schema', 'rb') as pkl_file:
        restored_spark_df_schema_as_json = pickle.load(pkl_file)
    restored_spark_df_schema = _parse_datatype_json_string(
        restored_spark_df_schema_as_json)
    restored_spark_df_schema_as_java = _py2java(spark_session,
                                                restored_spark_df_schema)
    restored_spark_pipeline_model = PipelineModel.read().load('model.parquet')
    restored_spark_pipeline_model_as_java = (
        restored_spark_pipeline_model._to_java())
    return spark_session._jvm.org.jpmml.sparkml.ConverterUtil.toPMMLByteArray(
        restored_spark_df_schema_as_java,
        restored_spark_pipeline_model_as_java)
def ext_schema_of_xml_df(df, options=None):
    """Infer the Spark SQL schema of a single-column XML DataFrame using
    spark-xml's JVM-side ``schema_of_xml_df``.

    :param df: DataFrame with exactly one (string) column of XML documents.
    :param options: optional dict of spark-xml reader options, passed
        through to the Scala implementation. Defaults to no options.
        (Fixed: the original used a mutable ``{}`` default argument.)
    :return: the inferred schema as a Python :class:`DataType`.

    NOTE(review): relies on a module-global ``spark`` session rather than
    deriving one from ``df`` — confirm a session is in scope at call time.
    """
    if options is None:
        options = {}
    assert len(df.columns) == 1
    scala_options = spark._jvm.PythonUtils.toScalaMap(options)
    # "package$"/"MODULE$" reaches the Scala package object's singleton,
    # which is where schema_of_xml_df lives on the JVM side.
    java_xml_module = getattr(
        getattr(spark._jvm.com.databricks.spark.xml, "package$"), "MODULE$")
    java_schema = java_xml_module.schema_of_xml_df(df._jdf, scala_options)
    return _parse_datatype_json_string(java_schema.json())
def schema(self):
    """Returns the schema of this DataFrame (represented by a L{StructType}).

    >>> df.schema()
    StructType(List(StructField(age,IntegerType,true),StructField(name,StringType,true)))
    """
    # Ask the JVM DataFrame for its schema as JSON, then rebuild the
    # equivalent Python StructType from that JSON string.
    schema_json = self._jdf.schema().json()
    return _parse_datatype_json_string(schema_json)
def check_datatype(datatype):
    # A DataType must survive a pickle round trip unchanged.
    assert datatype == pickle.loads(pickle.dumps(datatype))
    # It must also survive a round trip through the JVM-side parser:
    # Python JSON -> Scala DataType -> JSON -> Python DataType.
    jvm_datatype = self.spark._jsparkSession.parseDataType(datatype.json())
    roundtripped = _parse_datatype_json_string(jvm_datatype.json())
    assert datatype == roundtripped
def schema(self):
    """Returns the schema of this :class:`DataFrame` as a :class:`types.StructType`.

    >>> df.schema
    StructType(List(StructField(age,IntegerType,true),StructField(name,StringType,true)))
    """
    # Lazily compute the Python-side schema once and memoize it on self.
    if self._schema is None:
        json_form = self._jdf.schema().json()
        self._schema = _parse_datatype_json_string(json_form)
    return self._schema
def schema(self):
    """Returns the schema of this :class:`DataFrame` as a :class:`types.StructType`.

    >>> df.schema
    StructType(List(StructField(age,IntegerType,true),StructField(name,StringType,true)))
    """
    # Lazily parse and cache the schema; the JVM call can raise
    # AttributeError when the underlying Java schema is malformed.
    if self._schema is None:
        try:
            self._schema = _parse_datatype_json_string(self._jdf.schema().json())
        except AttributeError as e:
            # Chain the original error (``from e``) so the root-cause
            # AttributeError traceback is preserved for debugging.
            raise Exception("Unable to parse datatype from schema. %s" % e) from e
    return self._schema
def schema(self):
    """Returns the schema of this :class:`DataFrame` as a :class:`types.StructType`.

    >>> df.schema
    StructType(List(StructField(age,IntegerType,true),StructField(name,StringType,true)))
    """
    # Lazily parse and cache the schema; the JVM call can raise
    # AttributeError when the underlying Java schema is malformed.
    if self._schema is None:
        try:
            self._schema = _parse_datatype_json_string(
                self._jdf.schema().json())
        except AttributeError as e:
            # Chain the original error (``from e``) so the root-cause
            # AttributeError traceback is preserved for debugging.
            raise Exception("Unable to parse datatype from schema. %s" % e) from e
    return self._schema
def imageSchema(self):
    """
    Returns the image schema.

    :return: a :class:`StructType` with a single column of images
       named "image" (nullable).

    .. versionadded:: 2.3.0
    """
    # Fetch the schema from the JVM once, convert it via its JSON
    # representation, and cache the result for subsequent accesses.
    if self._imageSchema is None:
        ctx = SparkContext._active_spark_context
        java_schema = ctx._jvm.org.apache.spark.ml.image.ImageSchema.imageSchema()
        self._imageSchema = _parse_datatype_json_string(java_schema.json())
    return self._imageSchema
def columnSchema(self):
    """
    Returns the schema for the image column.

    :return: a :class:`StructType` for image column,
        ``struct<origin:string, height:int, width:int, nChannels:int,
        mode:int, data:binary>``.

    .. versionadded:: 2.4.0
    """
    # Fetch the column schema from the JVM once, convert it via its JSON
    # representation, and cache the result for subsequent accesses.
    if self._columnSchema is None:
        ctx = SparkContext._active_spark_context
        java_schema = ctx._jvm.org.apache.spark.ml.image.ImageSchema.columnSchema()
        self._columnSchema = _parse_datatype_json_string(java_schema.json())
    return self._columnSchema
def imageSchema(self) -> StructType:
    """
    Returns the image schema.

    Returns
    -------
    :class:`StructType` with a single column of images
       named "image" (nullable) and having the same type returned by
       :meth:`columnSchema`.

    .. versionadded:: 2.3.0
    """
    # Compute once, then serve the cached value on later accesses.
    if self._imageSchema is None:
        ctx = SparkContext._active_spark_context
        assert ctx is not None and ctx._jvm is not None
        java_schema = ctx._jvm.org.apache.spark.ml.image.ImageSchema.imageSchema()
        parsed = _parse_datatype_json_string(java_schema.json())
        # cast() narrows the generic DataType result for the type checker.
        self._imageSchema = cast(StructType, parsed)
    return self._imageSchema
def check_datatype(datatype):
    # A DataType must survive a pickle round trip unchanged.
    assert datatype == pickle.loads(pickle.dumps(datatype))
    # It must also survive a round trip through the JVM-side parser
    # exposed on the SQLContext: Python JSON -> Scala -> JSON -> Python.
    jvm_datatype = self.sqlCtx._ssql_ctx.parseDataType(datatype.json())
    roundtripped = _parse_datatype_json_string(jvm_datatype.json())
    assert datatype == roundtripped
def check_datatype(datatype):
    # A DataType must survive a pickle round trip unchanged.
    assert datatype == pickle.loads(pickle.dumps(datatype))
    # It must also survive a round trip through the JVM session's parser:
    # Python JSON -> Scala DataType -> JSON -> Python DataType.
    jvm_datatype = self.spark._jsparkSession.parseDataType(datatype.json())
    roundtripped = _parse_datatype_json_string(jvm_datatype.json())
    assert datatype == roundtripped
def to_spark_schema(self, avsc_json: str) -> DataType:
    """Convert an Avro schema, given as a JSON string, to a Spark SQL
    :class:`DataType`.

    The Avro JSON is parsed on the JVM, mapped to a Spark SQL type there,
    and then rebuilt on the Python side from its JSON representation.
    """
    parser = self.spark._jvm.org.apache.avro.Schema.Parser()
    avro_schema_jvm = parser.parse(avsc_json)
    sql_type_jvm = self.schema_converters.toSqlType(avro_schema_jvm)
    return _parse_datatype_json_string(sql_type_jvm.dataType().json())
import sys from pyspark.sql.types import StringType from pyspark.serializers import CloudPickleSerializer f = open(sys.argv[1], 'wb') from pyspark.files import SparkFiles from pyspark.sql.functions import pandas_udf from pyspark.sql.types import ArrayType, DataType from pyspark.sql.types import DoubleType, IntegerType, FloatType, LongType, StringType from pyspark.sql.types import _parse_datatype_json_string return_type = _parse_datatype_json_string(sys.argv[2]) print("function return type: " + str(return_type)) archive_path = sys.argv[3] def predict(*args): import pandas from mlflow.pyfunc.spark_model_cache import SparkModelCache from mlflow.pyfunc import load_pyfunc # pylint: disable=cyclic-import # elem_type = IntegerType elem_type = return_type if isinstance(elem_type, ArrayType): elem_type = elem_type.elementType supported_types = [ IntegerType, LongType, FloatType, DoubleType, StringType ] if not any([isinstance(elem_type, x) for x in supported_types]):