def test_parse_datatype_string(self): from pyspark.sql.types import _all_atomic_types, _parse_datatype_string for k, t in _all_atomic_types.items(): if t != NullType: self.assertEqual(t(), _parse_datatype_string(k)) self.assertEqual(IntegerType(), _parse_datatype_string("int")) self.assertEqual(DecimalType(1, 1), _parse_datatype_string("decimal(1 ,1)")) self.assertEqual(DecimalType(10, 1), _parse_datatype_string("decimal( 10,1 )")) self.assertEqual(DecimalType(11, 1), _parse_datatype_string("decimal(11,1)")) self.assertEqual(ArrayType(IntegerType()), _parse_datatype_string("array<int >")) self.assertEqual(MapType(IntegerType(), DoubleType()), _parse_datatype_string("map< int, double >")) self.assertEqual( StructType([ StructField("a", IntegerType()), StructField("c", DoubleType()) ]), _parse_datatype_string("struct<a:int, c:double >")) self.assertEqual( StructType([ StructField("a", IntegerType()), StructField("c", DoubleType()) ]), _parse_datatype_string("a:int, c:double")) self.assertEqual( StructType([ StructField("a", IntegerType()), StructField("c", DoubleType()) ]), _parse_datatype_string("a INT, c DOUBLE"))
def test_spark_schema_inference(pandas_df_with_all_types): import pyspark from pyspark.sql.types import _parse_datatype_string, StructField, StructType pandas_df_with_all_types = pandas_df_with_all_types.drop( columns=["boolean_ext", "integer_ext", "string_ext"]) schema = _infer_schema(pandas_df_with_all_types) assert schema == Schema( [ColSpec(x, x) for x in pandas_df_with_all_types.columns]) spark_session = pyspark.sql.SparkSession( pyspark.SparkContext.getOrCreate()) struct_fields = [] for t in schema.input_types(): # pyspark _parse_datatype_string() expects "timestamp" instead of "datetime" if t == DataType.datetime: struct_fields.append( StructField("datetime", _parse_datatype_string("timestamp"), True)) else: struct_fields.append( StructField(t.name, _parse_datatype_string(t.name), True)) spark_schema = StructType(struct_fields) sparkdf = spark_session.createDataFrame(pandas_df_with_all_types, schema=spark_schema) schema = _infer_schema(sparkdf) assert schema == Schema( [ColSpec(x, x) for x in pandas_df_with_all_types.columns])
def registerJavaFunction(self, name, javaClassName, returnType=None): """Register a Java user-defined function as a SQL function. In addition to a name and the function itself, the return type can be optionally specified. When the return type is not specified we would infer it via reflection. :param name: name of the user-defined function :param javaClassName: fully qualified name of java class :param returnType: the return type of the registered Java function. The value can be either a :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string. >>> from pyspark.sql.types import IntegerType >>> spark.udf.registerJavaFunction( ... "javaStringLength", "test.org.apache.spark.sql.JavaStringLength", IntegerType()) >>> spark.sql("SELECT javaStringLength('test')").collect() [Row(UDF:javaStringLength(test)=4)] >>> spark.udf.registerJavaFunction( ... "javaStringLength2", "test.org.apache.spark.sql.JavaStringLength") >>> spark.sql("SELECT javaStringLength2('test')").collect() [Row(UDF:javaStringLength2(test)=4)] >>> spark.udf.registerJavaFunction( ... "javaStringLength3", "test.org.apache.spark.sql.JavaStringLength", "integer") >>> spark.sql("SELECT javaStringLength3('test')").collect() [Row(UDF:javaStringLength3(test)=4)] """ jdt = None if returnType is not None: if not isinstance(returnType, DataType): returnType = _parse_datatype_string(returnType) jdt = self.sparkSession._jsparkSession.parseDataType(returnType.json()) self.sparkSession._jsparkSession.udf().registerJava(name, javaClassName, jdt)
def add_column(self, colname, coltype, func, *col_names): """ Add a column `colname` of type `coltype` to this `AxsFrame` object by executing function `func` on columns `col_names`. Function `func` has to accept the exact number of arguments as the number of columns specified. The function will be applied to the dataframe data row by row. The arguments passed to the function will match types of the specified columns and the function needs to return an object of type represented by string `coltype`. This method will use `@udf` in the background (see `pyspark.functions.udf`). This function is slower than `add_column` but can produce array or map columns. :param colname: The name of the new column. :param coltype: The name of the new column's type. (See `pyspark.sql.types`) :param func: The function which will produce data for the new column. Needs to return a single object of the type defined by `coltype`. :param col_names: The columns whose data will be supplied to the function `func`. :return: Returns a new AxsFrame with the new column added. """ from pyspark.sql.types import _parse_datatype_string t = _parse_datatype_string(coltype) @udf(returnType=t) def customudf(*cols): return func(*cols) columns = [self._df[cn] for cn in col_names] return wrap(self._df.withColumn(colname, customudf(*columns)), self._table_info)
def add_primitive_column(self, colname, coltype, func, *col_names): """ Add a column `colname` of type `coltype` to this `AxsFrame` object by executing function `func` on columns `col_names`. Function `func` has to accept the exact number of arguments as the number of columns specified. The arguments will be of type Pandas.Series and the function needs to return a Pandas.Series object of type represented by string `coltype`. This method will use `@pandas_udf` in the background (see `pyspark.functions.pandas_udf`). The type of the new column needs to be a primitive type (int, double, etc.). The input columns can be of complex types. This function is faster than `add_column` but cannot *produce* array or map columns. :param colname: The name of the new column. :param coltype: The name of the new column's type. (See `pyspark.sql.types`) :param func: The function which will produce data for the new column. Needs to operate on Pandas.Series objects (of the same number as the number of `col_names`) and return a single Pandas.Series object of the type defined by `coltype`. :param col_names: The columns whose data will be supplied to the function `func`. :return: Returns a new AxsFrame with the new column added. """ from pyspark.sql.types import _parse_datatype_string t = _parse_datatype_string(coltype) @pandas_udf(t) def pudf(*cols): return func(*cols) columns = [self._df[cn] for cn in col_names] return wrap(self._df.withColumn(colname, pudf(*columns)), self._table_info)
def returnType(self): # This makes sure this is called after SparkContext is initialized. # ``_parse_datatype_string`` accesses to JVM for parsing a DDL formatted string. if self._returnType_placeholder is None: if isinstance(self._returnType, DataType): self._returnType_placeholder = self._returnType else: self._returnType_placeholder = _parse_datatype_string( self._returnType) if self.evalType == PythonEvalType.SQL_SCALAR_PANDAS_UDF: try: to_arrow_type(self._returnType_placeholder) except TypeError: raise NotImplementedError( "Invalid returnType with scalar Pandas UDFs: %s is " "not supported" % str(self._returnType_placeholder)) elif self.evalType == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF: if isinstance(self._returnType_placeholder, StructType): try: to_arrow_schema(self._returnType_placeholder) except TypeError: raise NotImplementedError( "Invalid returnType with grouped map Pandas UDFs: " "%s is not supported" % str(self._returnType_placeholder)) else: raise TypeError("Invalid returnType for grouped map Pandas " "UDFs: returnType must be a StructType.") return self._returnType_placeholder
def spark_schema_from_json(j): return spark_types.StructType([ spark_types.StructField( f["name"], spark_types._parse_datatype_string(f.get("type", "string")), f.get("nullable", True), f.get("metadata", None)) for f in j ])
def registerJavaFunction( self, name: str, javaClassName: str, returnType: Optional["DataTypeOrString"] = None, ) -> None: """Register a Java user-defined function as a SQL function. In addition to a name and the function itself, the return type can be optionally specified. When the return type is not specified we would infer it via reflection. .. versionadded:: 2.3.0 Parameters ---------- name : str name of the user-defined function javaClassName : str fully qualified name of java class returnType : :class:`pyspark.sql.types.DataType` or str, optional the return type of the registered Java function. The value can be either a :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string. Examples -------- >>> from pyspark.sql.types import IntegerType >>> spark.udf.registerJavaFunction( ... "javaStringLength", "test.org.apache.spark.sql.JavaStringLength", IntegerType()) ... # doctest: +SKIP >>> spark.sql("SELECT javaStringLength('test')").collect() # doctest: +SKIP [Row(javaStringLength(test)=4)] >>> spark.udf.registerJavaFunction( ... "javaStringLength2", "test.org.apache.spark.sql.JavaStringLength") ... # doctest: +SKIP >>> spark.sql("SELECT javaStringLength2('test')").collect() # doctest: +SKIP [Row(javaStringLength2(test)=4)] >>> spark.udf.registerJavaFunction( ... "javaStringLength3", "test.org.apache.spark.sql.JavaStringLength", "integer") ... # doctest: +SKIP >>> spark.sql("SELECT javaStringLength3('test')").collect() # doctest: +SKIP [Row(javaStringLength3(test)=4)] """ jdt = None if returnType is not None: if not isinstance(returnType, DataType): returnType = _parse_datatype_string(returnType) returnType = cast(DataType, returnType) jdt = self.sparkSession._jsparkSession.parseDataType( returnType.json()) self.sparkSession._jsparkSession.udf().registerJava( name, javaClassName, jdt)
def test_spark_schema_inference(pandas_df_with_all_types): import pyspark from pyspark.sql.types import _parse_datatype_string, StructField, StructType schema = _infer_schema(pandas_df_with_all_types) assert schema == Schema([ColSpec(x, x) for x in pandas_df_with_all_types.columns]) spark_session = pyspark.sql.SparkSession(pyspark.SparkContext.getOrCreate()) spark_schema = StructType( [StructField(t.name, _parse_datatype_string(t.name), True) for t in schema.column_types()]) sparkdf = spark_session.createDataFrame(pandas_df_with_all_types, schema=spark_schema) schema = _infer_schema(sparkdf) assert schema == Schema([ColSpec(x, x) for x in pandas_df_with_all_types.columns])
def test_parse_datatype_string(self): from pyspark.sql.types import _all_atomic_types, _parse_datatype_string for k, t in _all_atomic_types.items(): if t != NullType: self.assertEqual(t(), _parse_datatype_string(k)) self.assertEqual(IntegerType(), _parse_datatype_string("int")) self.assertEqual(DecimalType(1, 1), _parse_datatype_string("decimal(1 ,1)")) self.assertEqual(DecimalType(10, 1), _parse_datatype_string("decimal( 10,1 )")) self.assertEqual(DecimalType(11, 1), _parse_datatype_string("decimal(11,1)")) self.assertEqual( ArrayType(IntegerType()), _parse_datatype_string("array<int >")) self.assertEqual( MapType(IntegerType(), DoubleType()), _parse_datatype_string("map< int, double >")) self.assertEqual( StructType([StructField("a", IntegerType()), StructField("c", DoubleType())]), _parse_datatype_string("struct<a:int, c:double >")) self.assertEqual( StructType([StructField("a", IntegerType()), StructField("c", DoubleType())]), _parse_datatype_string("a:int, c:double")) self.assertEqual( StructType([StructField("a", IntegerType()), StructField("c", DoubleType())]), _parse_datatype_string("a INT, c DOUBLE"))
def returnType(self): # This makes sure this is called after SparkContext is initialized. # ``_parse_datatype_string`` accesses to JVM for parsing a DDL formatted string. if self._returnType_placeholder is None: if isinstance(self._returnType, DataType): self._returnType_placeholder = self._returnType else: self._returnType_placeholder = _parse_datatype_string(self._returnType) if self.evalType == PythonEvalType.SQL_PANDAS_GROUP_MAP_UDF \ and not isinstance(self._returnType_placeholder, StructType): raise ValueError("Invalid returnType: returnType must be a StructType for " "pandas_udf with function type GROUP_MAP") return self._returnType_placeholder
def insert_fg(spark: SparkSession, job_conf: Dict[Any, Any]) -> None: """ Insert data into a feature group. The data path, feature group name and versions are in the configuration file """ feature_store = job_conf.pop("feature_store") fs = get_feature_store_handle(feature_store) fg = fs.get_feature_group(name=job_conf["name"], version=job_conf["version"]) schema = StructType( [StructField(f.name, _parse_datatype_string(f.type), True) for f in fg.features] ) df = get_fg_spark_df(job_conf, schema) fg.insert(df, write_options=job_conf.pop("write_options", {}) or {})
def returnType(self): # This makes sure this is called after SparkContext is initialized. # ``_parse_datatype_string`` accesses to JVM for parsing a DDL formatted string. if self._returnType_placeholder is None: if isinstance(self._returnType, DataType): self._returnType_placeholder = self._returnType else: self._returnType_placeholder = _parse_datatype_string(self._returnType) if self.evalType == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF \ and not isinstance(self._returnType_placeholder, StructType): raise ValueError("Invalid returnType: returnType must be a StructType for " "pandas_udf with function type GROUPED_MAP") elif self.evalType == PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF \ and isinstance(self._returnType_placeholder, (StructType, ArrayType, MapType)): raise NotImplementedError( "ArrayType, StructType and MapType are not supported with " "PandasUDFType.GROUPED_AGG") return self._returnType_placeholder
def returnType(self): # This makes sure this is called after SparkContext is initialized. # ``_parse_datatype_string`` accesses to JVM for parsing a DDL formatted string. if self._returnType_placeholder is None: if isinstance(self._returnType, DataType): self._returnType_placeholder = self._returnType else: self._returnType_placeholder = _parse_datatype_string(self._returnType) if self.evalType == PythonEvalType.SQL_SCALAR_PANDAS_UDF: try: to_arrow_type(self._returnType_placeholder) except TypeError: raise NotImplementedError( "Invalid returnType with scalar Pandas UDFs: %s is " "not supported" % str(self._returnType_placeholder)) elif self.evalType == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF: if isinstance(self._returnType_placeholder, StructType): try: to_arrow_type(self._returnType_placeholder) except TypeError: raise NotImplementedError( "Invalid returnType with grouped map Pandas UDFs: " "%s is not supported" % str(self._returnType_placeholder)) else: raise TypeError("Invalid returnType for grouped map Pandas " "UDFs: returnType must be a StructType.") elif self.evalType == PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF: try: # StructType is not yet allowed as a return type, explicitly check here to fail fast if isinstance(self._returnType_placeholder, StructType): raise TypeError to_arrow_type(self._returnType_placeholder) except TypeError: raise NotImplementedError( "Invalid returnType with grouped aggregate Pandas UDFs: " "%s is not supported" % str(self._returnType_placeholder)) return self._returnType_placeholder
def spark_udf(spark, path, run_id=None, result_type="double"): """ A Spark UDF that can be used to invoke the Python function formatted model. Parameters passed to the UDF are forwarded to the model as a DataFrame where the names are ordinals (0, 1, ...). The predictions are filtered to contain only the columns that can be represented as the ``result_type``. If the ``result_type`` is string or array of strings, all predictions are converted to string. If the result type is not an array type, the left most column with matching type will be returned. >>> predict = mlflow.pyfunc.spark_udf(spark, "/my/local/model") >>> df.withColumn("prediction", predict("name", "age")).show() :param spark: A SparkSession object. :param path: A path containing a :py:mod:`mlflow.pyfunc` model. :param run_id: ID of the run that produced this model. If provided, ``run_id`` is used to retrieve the model logged with MLflow. :param result_type: the return type of the user-defined function. The value can be either a :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string. Only a primitive type or an array (pyspark.sql.types.ArrayType) of primitive types are allowed. The following classes of result type are supported: - "int" or pyspark.sql.types.IntegerType: The leftmost integer that can fit in int32 result is returned or exception is raised if there is none. - "long" or pyspark.sql.types.LongType: The leftmost long integer that can fit in int64 result is returned or exception is raised if there is none. - ArrayType(IntegerType|LongType): Return all integer columns that can fit into the requested size. - "float" or pyspark.sql.types.FloatType: The leftmost numeric result cast to float32 is returned or exception is raised if there is none. - "double" or pyspark.sql.types.DoubleType: The leftmost numeric result cast to double is returned or exception is raised if there is none.. - ArrayType(FloatType|DoubleType): Return all numeric columns cast to the requested type. Exception is raised if there are no numeric columns. - "string" or pyspark.sql.types.StringType: Result is the leftmost column converted to string. - ArrayType(StringType): Return all columns converted to string. :return: Spark UDF which will apply model's prediction method to the data. Default double. """ # Scope Spark import to this method so users don't need pyspark to use non-Spark-related # functionality. from mlflow.pyfunc.spark_model_cache import SparkModelCache from pyspark.sql.functions import pandas_udf from pyspark.sql.types import _parse_datatype_string from pyspark.sql.types import ArrayType, DataType from pyspark.sql.types import DoubleType, IntegerType, FloatType, LongType, StringType if not isinstance(result_type, DataType): result_type = _parse_datatype_string(result_type) elem_type = result_type if isinstance(elem_type, ArrayType): elem_type = elem_type.elementType supported_types = [ IntegerType, LongType, FloatType, DoubleType, StringType ] if not any([isinstance(elem_type, x) for x in supported_types]): raise MlflowException( message= "Invalid result_type '{}'. Result type can only be one of or an array of one " "of the following types types: {}".format(str(elem_type), str(supported_types)), error_code=INVALID_PARAMETER_VALUE) if run_id: path = tracking.artifact_utils._get_model_log_dir(path, run_id) archive_path = SparkModelCache.add_local_model(spark, path) def predict(*args): model = SparkModelCache.get_or_load(archive_path) schema = {str(i): arg for i, arg in enumerate(args)} # Explicitly pass order of columns to avoid lexicographic ordering (i.e., 10 < 2) columns = [str(i) for i, _ in enumerate(args)] pdf = pandas.DataFrame(schema, columns=columns) result = model.predict(pdf) if not isinstance(result, pandas.DataFrame): result = pandas.DataFrame(data=result) elif type(elem_type) == IntegerType: result = result.select_dtypes( [np.byte, np.ubyte, np.short, np.ushort, np.int32]).astype(np.int32) elif type(elem_type) == LongType: result = result.select_dtypes( [np.byte, np.ubyte, np.short, np.ushort, np.int, np.long]) elif type(elem_type) == FloatType: result = result.select_dtypes(include=np.number).astype(np.float32) elif type(elem_type) == DoubleType: result = result.select_dtypes(include=np.number).astype(np.float64) if len(result.columns) == 0: raise MlflowException( message= "The the model did not produce any values compatible with the requested " "type '{}'. Consider requesting udf with StringType or " "Arraytype(StringType).".format(str(elem_type)), error_code=INVALID_PARAMETER_VALUE) if type(elem_type) == StringType: result = result.applymap(str) if type(result_type) == ArrayType: return pandas.Series([row[1].values for row in result.iterrows()]) else: return result[result.columns[0]] return pandas_udf(predict, result_type)
def createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=True): """ Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`. When ``schema`` is a list of column names, the type of each column will be inferred from ``data``. When ``schema`` is ``None``, it will try to infer the schema (column names and types) from ``data``, which should be an RDD of :class:`Row`, or :class:`namedtuple`, or :class:`dict`. When ``schema`` is :class:`pyspark.sql.types.DataType` or a datatype string, it must match the real data, or an exception will be thrown at runtime. If the given schema is not :class:`pyspark.sql.types.StructType`, it will be wrapped into a :class:`pyspark.sql.types.StructType` as its only field, and the field name will be "value", each record will also be wrapped into a tuple, which can be converted to row later. If schema inference is needed, ``samplingRatio`` is used to determined the ratio of rows used for schema inference. The first row will be used if ``samplingRatio`` is ``None``. :param data: an RDD of any kind of SQL data representation(e.g. row, tuple, int, boolean, etc.), or :class:`list`, or :class:`pandas.DataFrame`. :param schema: a :class:`pyspark.sql.types.DataType` or a datatype string or a list of column names, default is ``None``. The data type string format equals to :class:`pyspark.sql.types.DataType.simpleString`, except that top level struct type can omit the ``struct<>`` and atomic types use ``typeName()`` as their format, e.g. use ``byte`` instead of ``tinyint`` for :class:`pyspark.sql.types.ByteType`. We can also use ``int`` as a short name for ``IntegerType``. :param samplingRatio: the sample ratio of rows used for inferring :param verifySchema: verify data types of every row against schema. :return: :class:`DataFrame` .. versionchanged:: 2.1 Added verifySchema. .. note:: Usage with spark.sql.execution.arrow.enabled=True is experimental. >>> l = [('Alice', 1)] >>> spark.createDataFrame(l).collect() [Row(_1=u'Alice', _2=1)] >>> spark.createDataFrame(l, ['name', 'age']).collect() [Row(name=u'Alice', age=1)] >>> d = [{'name': 'Alice', 'age': 1}] >>> spark.createDataFrame(d).collect() [Row(age=1, name=u'Alice')] >>> rdd = sc.parallelize(l) >>> spark.createDataFrame(rdd).collect() [Row(_1=u'Alice', _2=1)] >>> df = spark.createDataFrame(rdd, ['name', 'age']) >>> df.collect() [Row(name=u'Alice', age=1)] >>> from pyspark.sql import Row >>> Person = Row('name', 'age') >>> person = rdd.map(lambda r: Person(*r)) >>> df2 = spark.createDataFrame(person) >>> df2.collect() [Row(name=u'Alice', age=1)] >>> from pyspark.sql.types import * >>> schema = StructType([ ... StructField("name", StringType(), True), ... StructField("age", IntegerType(), True)]) >>> df3 = spark.createDataFrame(rdd, schema) >>> df3.collect() [Row(name=u'Alice', age=1)] >>> spark.createDataFrame(df.toPandas()).collect() # doctest: +SKIP [Row(name=u'Alice', age=1)] >>> spark.createDataFrame(pandas.DataFrame([[1, 2]])).collect() # doctest: +SKIP [Row(0=1, 1=2)] >>> spark.createDataFrame(rdd, "a: string, b: int").collect() [Row(a=u'Alice', b=1)] >>> rdd = rdd.map(lambda row: row[1]) >>> spark.createDataFrame(rdd, "int").collect() [Row(value=1)] >>> spark.createDataFrame(rdd, "boolean").collect() # doctest: +IGNORE_EXCEPTION_DETAIL Traceback (most recent call last): ... Py4JJavaError: ... """ if isinstance(data, DataFrame): raise TypeError("data is already a DataFrame") if isinstance(schema, basestring): schema = _parse_datatype_string(schema) elif isinstance(schema, (list, tuple)): # Must re-encode any unicode strings to be consistent with StructField names schema = [ x.encode('utf-8') if not isinstance(x, str) else x for x in schema ] try: import pandas has_pandas = True except Exception: has_pandas = False if has_pandas and isinstance(data, pandas.DataFrame): from pyspark.sql.utils import require_minimum_pandas_version require_minimum_pandas_version() if self.conf.get("spark.sql.execution.pandas.respectSessionTimeZone").lower() \ == "true": timezone = self.conf.get("spark.sql.session.timeZone") else: timezone = None # If no schema supplied by user then get the names of columns only if schema is None: schema = [ str(x) if not isinstance(x, basestring) else (x.encode('utf-8') if not isinstance(x, str) else x) for x in data.columns ] if self.conf.get("spark.sql.execution.arrow.enabled", "false").lower() == "true" \ and len(data) > 0: try: return self._create_from_pandas_with_arrow( data, schema, timezone) except Exception as e: from pyspark.util import _exception_message if self.conf.get("spark.sql.execution.arrow.fallback.enabled", "true") \ .lower() == "true": msg = ( "createDataFrame attempted Arrow optimization because " "'spark.sql.execution.arrow.enabled' is set to true; however, " "failed by the reason below:\n %s\n" "Attempting non-optimization as " "'spark.sql.execution.arrow.fallback.enabled' is set to " "true." % _exception_message(e)) warnings.warn(msg) else: msg = ( "createDataFrame attempted Arrow optimization because " "'spark.sql.execution.arrow.enabled' is set to true, but has reached " "the error below and will not continue because automatic fallback " "with 'spark.sql.execution.arrow.fallback.enabled' has been set to " "false.\n %s" % _exception_message(e)) warnings.warn(msg) raise data = self._convert_from_pandas(data, schema, timezone) if isinstance(schema, StructType): verify_func = _make_type_verifier( schema) if verifySchema else lambda _: True def prepare(obj): verify_func(obj) return obj elif isinstance(schema, DataType): dataType = schema schema = StructType().add("value", schema) verify_func = _make_type_verifier( dataType, name="field value") if verifySchema else lambda _: True def prepare(obj): verify_func(obj) return obj, else: prepare = lambda obj: obj if isinstance(data, RDD): rdd, schema = self._createFromRDD(data.map(prepare), schema, samplingRatio) else: rdd, schema = self._createFromLocal(map(prepare, data), schema) jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd()) jdf = self._jsparkSession.applySchemaToPythonRDD( jrdd.rdd(), schema.json()) df = DataFrame(jdf, self._wrapped) df._schema = schema return df
def createDataFrame( # type: ignore[misc] self, data: Union["RDD[Any]", Iterable[Any], "PandasDataFrameLike"], schema: Optional[Union[AtomicType, StructType, str]] = None, samplingRatio: Optional[float] = None, verifySchema: bool = True ) -> DataFrame: """ Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`. When ``schema`` is a list of column names, the type of each column will be inferred from ``data``. When ``schema`` is ``None``, it will try to infer the schema (column names and types) from ``data``, which should be an RDD of either :class:`Row`, :class:`namedtuple`, or :class:`dict`. When ``schema`` is :class:`pyspark.sql.types.DataType` or a datatype string, it must match the real data, or an exception will be thrown at runtime. If the given schema is not :class:`pyspark.sql.types.StructType`, it will be wrapped into a :class:`pyspark.sql.types.StructType` as its only field, and the field name will be "value". Each record will also be wrapped into a tuple, which can be converted to row later. If schema inference is needed, ``samplingRatio`` is used to determined the ratio of rows used for schema inference. The first row will be used if ``samplingRatio`` is ``None``. .. versionadded:: 2.0.0 .. versionchanged:: 2.1.0 Added verifySchema. Parameters ---------- data : :class:`RDD` or iterable an RDD of any kind of SQL data representation (:class:`Row`, :class:`tuple`, ``int``, ``boolean``, etc.), or :class:`list`, or :class:`pandas.DataFrame`. schema : :class:`pyspark.sql.types.DataType`, str or list, optional a :class:`pyspark.sql.types.DataType` or a datatype string or a list of column names, default is None. The data type string format equals to :class:`pyspark.sql.types.DataType.simpleString`, except that top level struct type can omit the ``struct<>`` and atomic types use ``typeName()`` as their format, e.g. use ``byte`` instead of ``tinyint`` for :class:`pyspark.sql.types.ByteType`. We can also use ``int`` as a short name for :class:`pyspark.sql.types.IntegerType`. samplingRatio : float, optional the sample ratio of rows used for inferring verifySchema : bool, optional verify data types of every row against schema. Enabled by default. Returns ------- :class:`DataFrame` Notes ----- Usage with spark.sql.execution.arrow.pyspark.enabled=True is experimental. Examples -------- >>> l = [('Alice', 1)] >>> spark.createDataFrame(l).collect() [Row(_1='Alice', _2=1)] >>> spark.createDataFrame(l, ['name', 'age']).collect() [Row(name='Alice', age=1)] >>> d = [{'name': 'Alice', 'age': 1}] >>> spark.createDataFrame(d).collect() [Row(age=1, name='Alice')] >>> rdd = sc.parallelize(l) >>> spark.createDataFrame(rdd).collect() [Row(_1='Alice', _2=1)] >>> df = spark.createDataFrame(rdd, ['name', 'age']) >>> df.collect() [Row(name='Alice', age=1)] >>> from pyspark.sql import Row >>> Person = Row('name', 'age') >>> person = rdd.map(lambda r: Person(*r)) >>> df2 = spark.createDataFrame(person) >>> df2.collect() [Row(name='Alice', age=1)] >>> from pyspark.sql.types import * >>> schema = StructType([ ... StructField("name", StringType(), True), ... StructField("age", IntegerType(), True)]) >>> df3 = spark.createDataFrame(rdd, schema) >>> df3.collect() [Row(name='Alice', age=1)] >>> spark.createDataFrame(df.toPandas()).collect() # doctest: +SKIP [Row(name='Alice', age=1)] >>> spark.createDataFrame(pandas.DataFrame([[1, 2]])).collect() # doctest: +SKIP [Row(0=1, 1=2)] >>> spark.createDataFrame(rdd, "a: string, b: int").collect() [Row(a='Alice', b=1)] >>> rdd = rdd.map(lambda row: row[1]) >>> spark.createDataFrame(rdd, "int").collect() [Row(value=1)] >>> spark.createDataFrame(rdd, "boolean").collect() # doctest: +IGNORE_EXCEPTION_DETAIL Traceback (most recent call last): ... Py4JJavaError: ... """ SparkSession._activeSession = self self._jvm.SparkSession.setActiveSession(self._jsparkSession) if isinstance(data, DataFrame): raise TypeError("data is already a DataFrame") if isinstance(schema, str): schema = cast(Union[AtomicType, StructType, str], _parse_datatype_string(schema)) elif isinstance(schema, (list, tuple)): # Must re-encode any unicode strings to be consistent with StructField names schema = [x.encode('utf-8') if not isinstance(x, str) else x for x in schema] try: import pandas has_pandas = True except Exception: has_pandas = False if has_pandas and isinstance(data, pandas.DataFrame): # Create a DataFrame from pandas DataFrame. return super(SparkSession, self).createDataFrame( # type: ignore[call-overload] data, schema, samplingRatio, verifySchema) return self._create_dataframe( data, schema, samplingRatio, verifySchema # type: ignore[arg-type] )
) # COMMAND ---------- from pyspark.sql.types import _parse_datatype_string aggregate_schema = """ _id string, lifestyle string, mean_BMI double, mean_active_heartrate double, mean_resting_heartrate double, mean_VO2_max double """ assert health_tracker_sample_agg_df.schema == _parse_datatype_string(aggregate_schema) # COMMAND ---------- # MAGIC %md # MAGIC ### Write the Aggregation Dataframe to a Delta Table # MAGIC # MAGIC Use the following path: `goldPath + "health_tracker_sample_agg"` # COMMAND ---------- # TODO ( health_tracker_sample_agg_df.write .format("delta") .mode("overwrite")
def spark_udf(spark, model_uri, result_type="double"): """ A Spark UDF that can be used to invoke the Python function formatted model. Parameters passed to the UDF are forwarded to the model as a DataFrame where the names are ordinals (0, 1, ...). The predictions are filtered to contain only the columns that can be represented as the ``result_type``. If the ``result_type`` is string or array of strings, all predictions are converted to string. If the result type is not an array type, the left most column with matching type is returned. >>> predict = mlflow.pyfunc.spark_udf(spark, "/my/local/model") >>> df.withColumn("prediction", predict("name", "age")).show() :param spark: A SparkSession object. :param model_uri: The location, in URI format, of the MLflow model with the :py:mod:`mlflow.pyfunc` flavor. For example: - ``/Users/me/path/to/local/model`` - ``relative/path/to/local/model`` - ``s3://my_bucket/path/to/model`` - ``runs:/<mlflow_run_id>/run-relative/path/to/model`` For more information about supported URI schemes, see `Referencing Artifacts <https://www.mlflow.org/docs/latest/tracking.html# artifact-locations>`_. :param result_type: the return type of the user-defined function. The value can be either a :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string. Only a primitive type or an array ``pyspark.sql.types.ArrayType`` of primitive type are allowed. The following classes of result type are supported: - "int" or ``pyspark.sql.types.IntegerType``: The leftmost integer that can fit in an ``int32`` or an exception if there is none. - "long" or ``pyspark.sql.types.LongType``: The leftmost long integer that can fit in an ``int64`` or an exception if there is none. - ``ArrayType(IntegerType|LongType)``: All integer columns that can fit into the requested size. - "float" or ``pyspark.sql.types.FloatType``: The leftmost numeric result cast to ``float32`` or an exception if there is none. - "double" or ``pyspark.sql.types.DoubleType``: The leftmost numeric result cast to ``double`` or an exception if there is none. - ``ArrayType(FloatType|DoubleType)``: All numeric columns cast to the requested type or an exception if there are no numeric columns. - "string" or ``pyspark.sql.types.StringType``: The leftmost column converted to ``string``. - ``ArrayType(StringType)``: All columns converted to ``string``. :return: Spark UDF that applies the model's ``predict`` method to the data and returns a type specified by ``result_type``, which by default is a double. """ # Scope Spark import to this method so users don't need pyspark to use non-Spark-related # functionality. from mlflow.pyfunc.spark_model_cache import SparkModelCache from pyspark.sql.functions import pandas_udf from pyspark.sql.types import _parse_datatype_string from pyspark.sql.types import ArrayType, DataType from pyspark.sql.types import DoubleType, IntegerType, FloatType, LongType, StringType if not isinstance(result_type, DataType): result_type = _parse_datatype_string(result_type) elem_type = result_type if isinstance(elem_type, ArrayType): elem_type = elem_type.elementType supported_types = [IntegerType, LongType, FloatType, DoubleType, StringType] if not any([isinstance(elem_type, x) for x in supported_types]): raise MlflowException( message="Invalid result_type '{}'. Result type can only be one of or an array of one " "of the following types types: {}".format(str(elem_type), str(supported_types)), error_code=INVALID_PARAMETER_VALUE) local_model_path = _download_artifact_from_uri(artifact_uri=model_uri) archive_path = SparkModelCache.add_local_model(spark, local_model_path) def predict(*args): model = SparkModelCache.get_or_load(archive_path) schema = {str(i): arg for i, arg in enumerate(args)} # Explicitly pass order of columns to avoid lexicographic ordering (i.e., 10 < 2) columns = [str(i) for i, _ in enumerate(args)] pdf = pandas.DataFrame(schema, columns=columns) result = model.predict(pdf) if not isinstance(result, pandas.DataFrame): result = pandas.DataFrame(data=result) elif type(elem_type) == IntegerType: result = result.select_dtypes([np.byte, np.ubyte, np.short, np.ushort, np.int32]).astype(np.int32) elif type(elem_type) == LongType: result = result.select_dtypes([np.byte, np.ubyte, np.short, np.ushort, np.int, np.long]) elif type(elem_type) == FloatType: result = result.select_dtypes(include=(np.number,)).astype(np.float32) elif type(elem_type) == DoubleType: result = result.select_dtypes(include=(np.number,)).astype(np.float64) if len(result.columns) == 0: raise MlflowException( message="The the model did not produce any values compatible with the requested " "type '{}'. Consider requesting udf with StringType or " "Arraytype(StringType).".format(str(elem_type)), error_code=INVALID_PARAMETER_VALUE) if type(elem_type) == StringType: result = result.applymap(str) if type(result_type) == ArrayType: return pandas.Series([row[1].values for row in result.iterrows()]) else: return result[result.columns[0]] return pandas_udf(predict, result_type)
def createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=True): """ Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`. When ``schema`` is a list of column names, the type of each column will be inferred from ``data``. When ``schema`` is ``None``, it will try to infer the schema (column names and types) from ``data``, which should be an RDD of :class:`Row`, or :class:`namedtuple`, or :class:`dict`. When ``schema`` is :class:`pyspark.sql.types.DataType` or a datatype string, it must match the real data, or an exception will be thrown at runtime. If the given schema is not :class:`pyspark.sql.types.StructType`, it will be wrapped into a :class:`pyspark.sql.types.StructType` as its only field, and the field name will be "value", each record will also be wrapped into a tuple, which can be converted to row later. If schema inference is needed, ``samplingRatio`` is used to determined the ratio of rows used for schema inference. The first row will be used if ``samplingRatio`` is ``None``. :param data: an RDD of any kind of SQL data representation(e.g. row, tuple, int, boolean, etc.), or :class:`list`, or :class:`pandas.DataFrame`. :param schema: a :class:`pyspark.sql.types.DataType` or a datatype string or a list of column names, default is ``None``. The data type string format equals to :class:`pyspark.sql.types.DataType.simpleString`, except that top level struct type can omit the ``struct<>`` and atomic types use ``typeName()`` as their format, e.g. use ``byte`` instead of ``tinyint`` for :class:`pyspark.sql.types.ByteType`. We can also use ``int`` as a short name for ``IntegerType``. :param samplingRatio: the sample ratio of rows used for inferring :param verifySchema: verify data types of every row against schema. :return: :class:`DataFrame` .. versionchanged:: 2.1 Added verifySchema. .. note:: Usage with spark.sql.execution.arrow.enabled=True is experimental. >>> l = [('Alice', 1)] >>> spark.createDataFrame(l).collect() [Row(_1=u'Alice', _2=1)] >>> spark.createDataFrame(l, ['name', 'age']).collect() [Row(name=u'Alice', age=1)] >>> d = [{'name': 'Alice', 'age': 1}] >>> spark.createDataFrame(d).collect() [Row(age=1, name=u'Alice')] >>> rdd = sc.parallelize(l) >>> spark.createDataFrame(rdd).collect() [Row(_1=u'Alice', _2=1)] >>> df = spark.createDataFrame(rdd, ['name', 'age']) >>> df.collect() [Row(name=u'Alice', age=1)] >>> from pyspark.sql import Row >>> Person = Row('name', 'age') >>> person = rdd.map(lambda r: Person(*r)) >>> df2 = spark.createDataFrame(person) >>> df2.collect() [Row(name=u'Alice', age=1)] >>> from pyspark.sql.types import * >>> schema = StructType([ ... StructField("name", StringType(), True), ... StructField("age", IntegerType(), True)]) >>> df3 = spark.createDataFrame(rdd, schema) >>> df3.collect() [Row(name=u'Alice', age=1)] >>> spark.createDataFrame(df.toPandas()).collect() # doctest: +SKIP [Row(name=u'Alice', age=1)] >>> spark.createDataFrame(pandas.DataFrame([[1, 2]])).collect() # doctest: +SKIP [Row(0=1, 1=2)] >>> spark.createDataFrame(rdd, "a: string, b: int").collect() [Row(a=u'Alice', b=1)] >>> rdd = rdd.map(lambda row: row[1]) >>> spark.createDataFrame(rdd, "int").collect() [Row(value=1)] >>> spark.createDataFrame(rdd, "boolean").collect() # doctest: +IGNORE_EXCEPTION_DETAIL Traceback (most recent call last): ... Py4JJavaError: ... """ SparkSession._activeSession = self self._jvm.SparkSession.setActiveSession(self._jsparkSession) if isinstance(data, DataFrame): raise TypeError("data is already a DataFrame") if isinstance(schema, basestring): schema = _parse_datatype_string(schema) elif isinstance(schema, (list, tuple)): # Must re-encode any unicode strings to be consistent with StructField names schema = [x.encode('utf-8') if not isinstance(x, str) else x for x in schema] try: import pandas has_pandas = True except Exception: has_pandas = False if has_pandas and isinstance(data, pandas.DataFrame): from pyspark.sql.utils import require_minimum_pandas_version require_minimum_pandas_version() if self._wrapped._conf.pandasRespectSessionTimeZone(): timezone = self._wrapped._conf.sessionLocalTimeZone() else: timezone = None # If no schema supplied by user then get the names of columns only if schema is None: schema = [str(x) if not isinstance(x, basestring) else (x.encode('utf-8') if not isinstance(x, str) else x) for x in data.columns] if self._wrapped._conf.arrowEnabled() and len(data) > 0: try: return self._create_from_pandas_with_arrow(data, schema, timezone) except Exception as e: from pyspark.util import _exception_message if self._wrapped._conf.arrowFallbackEnabled(): msg = ( "createDataFrame attempted Arrow optimization because " "'spark.sql.execution.arrow.enabled' is set to true; however, " "failed by the reason below:\n %s\n" "Attempting non-optimization as " "'spark.sql.execution.arrow.fallback.enabled' is set to " "true." % _exception_message(e)) warnings.warn(msg) else: msg = ( "createDataFrame attempted Arrow optimization because " "'spark.sql.execution.arrow.enabled' is set to true, but has reached " "the error below and will not continue because automatic fallback " "with 'spark.sql.execution.arrow.fallback.enabled' has been set to " "false.\n %s" % _exception_message(e)) warnings.warn(msg) raise data = self._convert_from_pandas(data, schema, timezone) if isinstance(schema, StructType): verify_func = _make_type_verifier(schema) if verifySchema else lambda _: True def prepare(obj): verify_func(obj) return obj elif isinstance(schema, DataType): dataType = schema schema = StructType().add("value", schema) verify_func = _make_type_verifier( dataType, name="field value") if verifySchema else lambda _: True def prepare(obj): verify_func(obj) return obj, else: prepare = lambda obj: obj if isinstance(data, RDD): rdd, schema = self._createFromRDD(data.map(prepare), schema, samplingRatio) else: rdd, schema = self._createFromLocal(map(prepare, data), schema) jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd()) jdf = self._jsparkSession.applySchemaToPythonRDD(jrdd.rdd(), schema.json()) df = DataFrame(jdf, self._wrapped) df._schema = schema return df
def returnType(self) -> DataType: # This makes sure this is called after SparkContext is initialized. # ``_parse_datatype_string`` accesses to JVM for parsing a DDL formatted string. if self._returnType_placeholder is None: if isinstance(self._returnType, DataType): self._returnType_placeholder = self._returnType else: self._returnType_placeholder = _parse_datatype_string( self._returnType) if self.evalType == PythonEvalType.SQL_SCALAR_PANDAS_UDF or \ self.evalType == PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF: try: to_arrow_type(self._returnType_placeholder) except TypeError: raise NotImplementedError( "Invalid return type with scalar Pandas UDFs: %s is " "not supported" % str(self._returnType_placeholder)) elif self.evalType == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF: if isinstance(self._returnType_placeholder, StructType): try: to_arrow_type(self._returnType_placeholder) except TypeError: raise NotImplementedError( "Invalid return type with grouped map Pandas UDFs or " "at groupby.applyInPandas: %s is not supported" % str(self._returnType_placeholder)) else: raise TypeError( "Invalid return type for grouped map Pandas " "UDFs or at groupby.applyInPandas: return type must be a " "StructType.") elif self.evalType == PythonEvalType.SQL_MAP_PANDAS_ITER_UDF: if isinstance(self._returnType_placeholder, StructType): try: to_arrow_type(self._returnType_placeholder) except TypeError: raise NotImplementedError( "Invalid return type in mapInPandas: " "%s is not supported" % str(self._returnType_placeholder)) else: raise TypeError("Invalid return type in mapInPandas: " "return type must be a StructType.") elif self.evalType == PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF: if isinstance(self._returnType_placeholder, StructType): try: to_arrow_type(self._returnType_placeholder) except TypeError: raise NotImplementedError( "Invalid return type in cogroup.applyInPandas: " "%s is not supported" % str(self._returnType_placeholder)) else: raise TypeError( "Invalid return type in cogroup.applyInPandas: " "return type must be a StructType.") elif self.evalType == PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF: try: # StructType is not yet allowed as a return type, explicitly check here to fail fast if isinstance(self._returnType_placeholder, StructType): raise TypeError to_arrow_type(self._returnType_placeholder) except TypeError: raise NotImplementedError( "Invalid return type with grouped aggregate Pandas UDFs: " "%s is not supported" % str(self._returnType_placeholder)) return self._returnType_placeholder
def createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=True): """ Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`. When ``schema`` is a list of column names, the type of each column will be inferred from ``data``. When ``schema`` is ``None``, it will try to infer the schema (column names and types) from ``data``, which should be an RDD of :class:`Row`, or :class:`namedtuple`, or :class:`dict`. When ``schema`` is :class:`pyspark.sql.types.DataType` or a datatype string, it must match the real data, or an exception will be thrown at runtime. If the given schema is not :class:`pyspark.sql.types.StructType`, it will be wrapped into a :class:`pyspark.sql.types.StructType` as its only field, and the field name will be "value", each record will also be wrapped into a tuple, which can be converted to row later. If schema inference is needed, ``samplingRatio`` is used to determined the ratio of rows used for schema inference. The first row will be used if ``samplingRatio`` is ``None``. :param data: an RDD of any kind of SQL data representation(e.g. row, tuple, int, boolean, etc.), or :class:`list`, or :class:`pandas.DataFrame`. :param schema: a :class:`pyspark.sql.types.DataType` or a datatype string or a list of column names, default is ``None``. The data type string format equals to :class:`pyspark.sql.types.DataType.simpleString`, except that top level struct type can omit the ``struct<>`` and atomic types use ``typeName()`` as their format, e.g. use ``byte`` instead of ``tinyint`` for :class:`pyspark.sql.types.ByteType`. We can also use ``int`` as a short name for ``IntegerType``. :param samplingRatio: the sample ratio of rows used for inferring :param verifySchema: verify data types of every row against schema. :return: :class:`DataFrame` .. versionchanged:: 2.1 Added verifySchema. >>> l = [('Alice', 1)] >>> spark.createDataFrame(l).collect() [Row(_1=u'Alice', _2=1)] >>> spark.createDataFrame(l, ['name', 'age']).collect() [Row(name=u'Alice', age=1)] >>> d = [{'name': 'Alice', 'age': 1}] >>> spark.createDataFrame(d).collect() [Row(age=1, name=u'Alice')] >>> rdd = sc.parallelize(l) >>> spark.createDataFrame(rdd).collect() [Row(_1=u'Alice', _2=1)] >>> df = spark.createDataFrame(rdd, ['name', 'age']) >>> df.collect() [Row(name=u'Alice', age=1)] >>> from pyspark.sql import Row >>> Person = Row('name', 'age') >>> person = rdd.map(lambda r: Person(*r)) >>> df2 = spark.createDataFrame(person) >>> df2.collect() [Row(name=u'Alice', age=1)] >>> from pyspark.sql.types import * >>> schema = StructType([ ... StructField("name", StringType(), True), ... StructField("age", IntegerType(), True)]) >>> df3 = spark.createDataFrame(rdd, schema) >>> df3.collect() [Row(name=u'Alice', age=1)] >>> spark.createDataFrame(df.toPandas()).collect() # doctest: +SKIP [Row(name=u'Alice', age=1)] >>> spark.createDataFrame(pandas.DataFrame([[1, 2]])).collect() # doctest: +SKIP [Row(0=1, 1=2)] >>> spark.createDataFrame(rdd, "a: string, b: int").collect() [Row(a=u'Alice', b=1)] >>> rdd = rdd.map(lambda row: row[1]) >>> spark.createDataFrame(rdd, "int").collect() [Row(value=1)] >>> spark.createDataFrame(rdd, "boolean").collect() # doctest: +IGNORE_EXCEPTION_DETAIL Traceback (most recent call last): ... Py4JJavaError: ... """ if isinstance(data, DataFrame): raise TypeError("data is already a DataFrame") if isinstance(schema, basestring): schema = _parse_datatype_string(schema) try: import pandas has_pandas = True except Exception: has_pandas = False if has_pandas and isinstance(data, pandas.DataFrame): if schema is None: schema = [str(x) for x in data.columns] data = [r.tolist() for r in data.to_records(index=False)] verify_func = _verify_type if verifySchema else lambda _, t: True if isinstance(schema, StructType): def prepare(obj): verify_func(obj, schema) return obj elif isinstance(schema, DataType): dataType = schema schema = StructType().add("value", schema) def prepare(obj): verify_func(obj, dataType) return obj, else: if isinstance(schema, list): schema = [ x.encode('utf-8') if not isinstance(x, str) else x for x in schema ] prepare = lambda obj: obj if isinstance(data, RDD): rdd, schema = self._createFromRDD(data.map(prepare), schema, samplingRatio) else: rdd, schema = self._createFromLocal(map(prepare, data), schema) jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd()) jdf = self._jsparkSession.applySchemaToPythonRDD( jrdd.rdd(), schema.json()) df = DataFrame(jdf, self._wrapped) df._schema = schema return df
def createDataFrame( # type: ignore[misc] self, data: Union[RDD[Any], Iterable[Any], "PandasDataFrameLike", "ArrayLike"], schema: Optional[Union[AtomicType, StructType, str]] = None, samplingRatio: Optional[float] = None, verifySchema: bool = True, ) -> DataFrame: """ Creates a :class:`DataFrame` from an :class:`RDD`, a list, a :class:`pandas.DataFrame` or a :class:`numpy.ndarray`. .. versionadded:: 2.0.0 Parameters ---------- data : :class:`RDD` or iterable an RDD of any kind of SQL data representation (:class:`Row`, :class:`tuple`, ``int``, ``boolean``, etc.), or :class:`list`, :class:`pandas.DataFrame` or :class:`numpy.ndarray`. schema : :class:`pyspark.sql.types.DataType`, str or list, optional a :class:`pyspark.sql.types.DataType` or a datatype string or a list of column names, default is None. The data type string format equals to :class:`pyspark.sql.types.DataType.simpleString`, except that top level struct type can omit the ``struct<>``. When ``schema`` is a list of column names, the type of each column will be inferred from ``data``. When ``schema`` is ``None``, it will try to infer the schema (column names and types) from ``data``, which should be an RDD of either :class:`Row`, :class:`namedtuple`, or :class:`dict`. When ``schema`` is :class:`pyspark.sql.types.DataType` or a datatype string, it must match the real data, or an exception will be thrown at runtime. If the given schema is not :class:`pyspark.sql.types.StructType`, it will be wrapped into a :class:`pyspark.sql.types.StructType` as its only field, and the field name will be "value". Each record will also be wrapped into a tuple, which can be converted to row later. samplingRatio : float, optional the sample ratio of rows used for inferring. The first few rows will be used if ``samplingRatio`` is ``None``. verifySchema : bool, optional verify data types of every row against schema. Enabled by default. .. versionadded:: 2.1.0 Returns ------- :class:`DataFrame` Notes ----- Usage with `spark.sql.execution.arrow.pyspark.enabled=True` is experimental. Examples -------- Create a DataFrame from a list of tuples. >>> spark.createDataFrame([('Alice', 1)]).collect() [Row(_1='Alice', _2=1)] >>> spark.createDataFrame([('Alice', 1)], ['name', 'age']).collect() [Row(name='Alice', age=1)] Create a DataFrame from a list of dictionaries >>> d = [{'name': 'Alice', 'age': 1}] >>> spark.createDataFrame(d).collect() [Row(age=1, name='Alice')] Create a DataFrame from an RDD. >>> rdd = spark.sparkContext.parallelize([('Alice', 1)]) >>> spark.createDataFrame(rdd).collect() [Row(_1='Alice', _2=1)] >>> df = spark.createDataFrame(rdd, ['name', 'age']) >>> df.collect() [Row(name='Alice', age=1)] Create a DataFrame from Row instances. >>> from pyspark.sql import Row >>> Person = Row('name', 'age') >>> person = rdd.map(lambda r: Person(*r)) >>> df2 = spark.createDataFrame(person) >>> df2.collect() [Row(name='Alice', age=1)] Create a DataFrame with the explicit schema specified. >>> from pyspark.sql.types import * >>> schema = StructType([ ... StructField("name", StringType(), True), ... StructField("age", IntegerType(), True)]) >>> df3 = spark.createDataFrame(rdd, schema) >>> df3.collect() [Row(name='Alice', age=1)] Create a DataFrame from a pandas DataFrame. >>> spark.createDataFrame(df.toPandas()).collect() # doctest: +SKIP [Row(name='Alice', age=1)] >>> spark.createDataFrame(pandas.DataFrame([[1, 2]])).collect() # doctest: +SKIP [Row(0=1, 1=2)] Create a DataFrame from an RDD with the schema in DDL formatted string. >>> spark.createDataFrame(rdd, "a: string, b: int").collect() [Row(a='Alice', b=1)] >>> rdd = rdd.map(lambda row: row[1]) >>> spark.createDataFrame(rdd, "int").collect() [Row(value=1)] When the type is unmatched, it throws an exception. >>> spark.createDataFrame(rdd, "boolean").collect() # doctest: +IGNORE_EXCEPTION_DETAIL Traceback (most recent call last): ... Py4JJavaError: ... """ SparkSession._activeSession = self assert self._jvm is not None self._jvm.SparkSession.setActiveSession(self._jsparkSession) if isinstance(data, DataFrame): raise TypeError("data is already a DataFrame") if isinstance(schema, str): schema = cast(Union[AtomicType, StructType, str], _parse_datatype_string(schema)) elif isinstance(schema, (list, tuple)): # Must re-encode any unicode strings to be consistent with StructField names schema = [x.encode("utf-8") if not isinstance(x, str) else x for x in schema] try: import pandas as pd has_pandas = True except Exception: has_pandas = False try: import numpy as np has_numpy = True except Exception: has_numpy = False if has_numpy and isinstance(data, np.ndarray): # `data` of numpy.ndarray type will be converted to a pandas DataFrame, # so pandas is required. from pyspark.sql.pandas.utils import require_minimum_pandas_version require_minimum_pandas_version() if data.ndim not in [1, 2]: raise ValueError("NumPy array input should be of 1 or 2 dimensions.") column_names = ["value"] if data.ndim == 1 else ["_1", "_2"] if schema is None and not self._jconf.arrowPySparkEnabled(): # Construct `schema` from `np.dtype` of the input NumPy array # TODO: Apply the logic below when self._jconf.arrowPySparkEnabled() is True spark_type = _from_numpy_type(data.dtype) if spark_type is not None: schema = StructType( [StructField(name, spark_type, nullable=True) for name in column_names] ) data = pd.DataFrame(data, columns=column_names) if has_pandas and isinstance(data, pd.DataFrame): # Create a DataFrame from pandas DataFrame. return super(SparkSession, self).createDataFrame( # type: ignore[call-overload] data, schema, samplingRatio, verifySchema ) return self._create_dataframe( data, schema, samplingRatio, verifySchema # type: ignore[arg-type] )
def createDataFrame(self, data, schema=None, samplingRatio=None): """ Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`. When ``schema`` is a list of column names, the type of each column will be inferred from ``data``. When ``schema`` is ``None``, it will try to infer the schema (column names and types) from ``data``, which should be an RDD of :class:`Row`, or :class:`namedtuple`, or :class:`dict`. When ``schema`` is :class:`DataType` or datatype string, it must match the real data, or exception will be thrown at runtime. If the given schema is not StructType, it will be wrapped into a StructType as its only field, and the field name will be "value", each record will also be wrapped into a tuple, which can be converted to row later. If schema inference is needed, ``samplingRatio`` is used to determined the ratio of rows used for schema inference. The first row will be used if ``samplingRatio`` is ``None``. :param data: an RDD of any kind of SQL data representation(e.g. row, tuple, int, boolean, etc.), or :class:`list`, or :class:`pandas.DataFrame`. :param schema: a :class:`DataType` or a datatype string or a list of column names, default is None. The data type string format equals to `DataType.simpleString`, except that top level struct type can omit the `struct<>` and atomic types use `typeName()` as their format, e.g. use `byte` instead of `tinyint` for ByteType. We can also use `int` as a short name for IntegerType. :param samplingRatio: the sample ratio of rows used for inferring :return: :class:`DataFrame` .. versionchanged:: 2.0 The schema parameter can be a DataType or a datatype string after 2.0. If it's not a StructType, it will be wrapped into a StructType and each record will also be wrapped into a tuple. >>> l = [('Alice', 1)] >>> spark.createDataFrame(l).collect() [Row(_1=u'Alice', _2=1)] >>> spark.createDataFrame(l, ['name', 'age']).collect() [Row(name=u'Alice', age=1)] >>> d = [{'name': 'Alice', 'age': 1}] >>> spark.createDataFrame(d).collect() [Row(age=1, name=u'Alice')] >>> rdd = sc.parallelize(l) >>> spark.createDataFrame(rdd).collect() [Row(_1=u'Alice', _2=1)] >>> df = spark.createDataFrame(rdd, ['name', 'age']) >>> df.collect() [Row(name=u'Alice', age=1)] >>> from pyspark.sql import Row >>> Person = Row('name', 'age') >>> person = rdd.map(lambda r: Person(*r)) >>> df2 = spark.createDataFrame(person) >>> df2.collect() [Row(name=u'Alice', age=1)] >>> from pyspark.sql.types import * >>> schema = StructType([ ... StructField("name", StringType(), True), ... StructField("age", IntegerType(), True)]) >>> df3 = spark.createDataFrame(rdd, schema) >>> df3.collect() [Row(name=u'Alice', age=1)] >>> spark.createDataFrame(df.toPandas()).collect() # doctest: +SKIP [Row(name=u'Alice', age=1)] >>> spark.createDataFrame(pandas.DataFrame([[1, 2]])).collect() # doctest: +SKIP [Row(0=1, 1=2)] >>> spark.createDataFrame(rdd, "a: string, b: int").collect() [Row(a=u'Alice', b=1)] >>> rdd = rdd.map(lambda row: row[1]) >>> spark.createDataFrame(rdd, "int").collect() [Row(value=1)] >>> spark.createDataFrame(rdd, "boolean").collect() # doctest: +IGNORE_EXCEPTION_DETAIL Traceback (most recent call last): ... Py4JJavaError: ... """ if isinstance(data, DataFrame): raise TypeError("data is already a DataFrame") if isinstance(schema, basestring): schema = _parse_datatype_string(schema) try: import pandas has_pandas = True except Exception: has_pandas = False if has_pandas and isinstance(data, pandas.DataFrame): if schema is None: schema = [str(x) for x in data.columns] data = [r.tolist() for r in data.to_records(index=False)] if isinstance(schema, StructType): def prepare(obj): _verify_type(obj, schema) return obj elif isinstance(schema, DataType): datatype = schema def prepare(obj): _verify_type(obj, datatype) return (obj, ) schema = StructType().add("value", datatype) else: prepare = lambda obj: obj if isinstance(data, RDD): rdd, schema = self._createFromRDD(data.map(prepare), schema, samplingRatio) else: rdd, schema = self._createFromLocal(map(prepare, data), schema) jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd()) jdf = self._jsparkSession.applySchemaToPythonRDD(jrdd.rdd(), schema.json()) df = DataFrame(jdf, self._wrapped) df._schema = schema return df
def createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=True): """ Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`. When ``schema`` is a list of column names, the type of each column will be inferred from ``data``. When ``schema`` is ``None``, it will try to infer the schema (column names and types) from ``data``, which should be an RDD of either :class:`Row`, :class:`namedtuple`, or :class:`dict`. When ``schema`` is :class:`pyspark.sql.types.DataType` or a datatype string, it must match the real data, or an exception will be thrown at runtime. If the given schema is not :class:`pyspark.sql.types.StructType`, it will be wrapped into a :class:`pyspark.sql.types.StructType` as its only field, and the field name will be "value". Each record will also be wrapped into a tuple, which can be converted to row later. If schema inference is needed, ``samplingRatio`` is used to determined the ratio of rows used for schema inference. The first row will be used if ``samplingRatio`` is ``None``. :param data: an RDD of any kind of SQL data representation (e.g. row, tuple, int, boolean, etc.), :class:`list`, or :class:`pandas.DataFrame`. :param schema: a :class:`pyspark.sql.types.DataType` or a datatype string or a list of column names, default is ``None``. The data type string format equals to :class:`pyspark.sql.types.DataType.simpleString`, except that top level struct type can omit the ``struct<>`` and atomic types use ``typeName()`` as their format, e.g. use ``byte`` instead of ``tinyint`` for :class:`pyspark.sql.types.ByteType`. We can also use ``int`` as a short name for ``IntegerType``. :param samplingRatio: the sample ratio of rows used for inferring :param verifySchema: verify data types of every row against schema. :return: :class:`DataFrame` .. versionchanged:: 2.1 Added verifySchema. .. note:: Usage with spark.sql.execution.arrow.pyspark.enabled=True is experimental. .. note:: When Arrow optimization is enabled, strings inside Pandas DataFrame in Python 2 are converted into bytes as they are bytes in Python 2 whereas regular strings are left as strings. When using strings in Python 2, use unicode `u""` as Python standard practice. >>> l = [('Alice', 1)] >>> spark.createDataFrame(l).collect() [Row(_1=u'Alice', _2=1)] >>> spark.createDataFrame(l, ['name', 'age']).collect() [Row(name=u'Alice', age=1)] >>> d = [{'name': 'Alice', 'age': 1}] >>> spark.createDataFrame(d).collect() [Row(age=1, name=u'Alice')] >>> rdd = sc.parallelize(l) >>> spark.createDataFrame(rdd).collect() [Row(_1=u'Alice', _2=1)] >>> df = spark.createDataFrame(rdd, ['name', 'age']) >>> df.collect() [Row(name=u'Alice', age=1)] >>> from pyspark.sql import Row >>> Person = Row('name', 'age') >>> person = rdd.map(lambda r: Person(*r)) >>> df2 = spark.createDataFrame(person) >>> df2.collect() [Row(name=u'Alice', age=1)] >>> from pyspark.sql.types import * >>> schema = StructType([ ... StructField("name", StringType(), True), ... StructField("age", IntegerType(), True)]) >>> df3 = spark.createDataFrame(rdd, schema) >>> df3.collect() [Row(name=u'Alice', age=1)] >>> spark.createDataFrame(df.toPandas()).collect() # doctest: +SKIP [Row(name=u'Alice', age=1)] >>> spark.createDataFrame(pandas.DataFrame([[1, 2]])).collect() # doctest: +SKIP [Row(0=1, 1=2)] >>> spark.createDataFrame(rdd, "a: string, b: int").collect() [Row(a=u'Alice', b=1)] >>> rdd = rdd.map(lambda row: row[1]) >>> spark.createDataFrame(rdd, "int").collect() [Row(value=1)] >>> spark.createDataFrame(rdd, "boolean").collect() # doctest: +IGNORE_EXCEPTION_DETAIL Traceback (most recent call last): ... Py4JJavaError: ... """ SparkSession._activeSession = self self._jvm.SparkSession.setActiveSession(self._jsparkSession) if isinstance(data, DataFrame): raise TypeError("data is already a DataFrame") if isinstance(schema, basestring): schema = _parse_datatype_string(schema) elif isinstance(schema, (list, tuple)): # Must re-encode any unicode strings to be consistent with StructField names schema = [x.encode('utf-8') if not isinstance(x, str) else x for x in schema] try: import pandas has_pandas = True except Exception: has_pandas = False if has_pandas and isinstance(data, pandas.DataFrame): # Create a DataFrame from pandas DataFrame. return super(SparkSession, self).createDataFrame( data, schema, samplingRatio, verifySchema) return self._create_dataframe(data, schema, samplingRatio, verifySchema)
def spark_udf(spark, model_uri, result_type="double"): """ A Spark UDF that can be used to invoke the Python function formatted model. Parameters passed to the UDF are forwarded to the model as a DataFrame where the column names are ordinals (0, 1, ...). On some versions of Spark, it is also possible to wrap the input in a struct. In that case, the data will be passed as a DataFrame with column names given by the struct definition (e.g. when invoked as my_udf(struct('x', 'y'), the model will ge the data as a pandas DataFrame with 2 columns 'x' and 'y'). The predictions are filtered to contain only the columns that can be represented as the ``result_type``. If the ``result_type`` is string or array of strings, all predictions are converted to string. If the result type is not an array type, the left most column with matching type is returned. .. code-block:: python :caption: Example predict = mlflow.pyfunc.spark_udf(spark, "/my/local/model") df.withColumn("prediction", predict("name", "age")).show() :param spark: A SparkSession object. :param model_uri: The location, in URI format, of the MLflow model with the :py:mod:`mlflow.pyfunc` flavor. For example: - ``/Users/me/path/to/local/model`` - ``relative/path/to/local/model`` - ``s3://my_bucket/path/to/model`` - ``runs:/<mlflow_run_id>/run-relative/path/to/model`` - ``models:/<model_name>/<model_version>`` - ``models:/<model_name>/<stage>`` For more information about supported URI schemes, see `Referencing Artifacts <https://www.mlflow.org/docs/latest/concepts.html# artifact-locations>`_. :param result_type: the return type of the user-defined function. The value can be either a ``pyspark.sql.types.DataType`` object or a DDL-formatted type string. Only a primitive type or an array ``pyspark.sql.types.ArrayType`` of primitive type are allowed. The following classes of result type are supported: - "int" or ``pyspark.sql.types.IntegerType``: The leftmost integer that can fit in an ``int32`` or an exception if there is none. - "long" or ``pyspark.sql.types.LongType``: The leftmost long integer that can fit in an ``int64`` or an exception if there is none. - ``ArrayType(IntegerType|LongType)``: All integer columns that can fit into the requested size. - "float" or ``pyspark.sql.types.FloatType``: The leftmost numeric result cast to ``float32`` or an exception if there is none. - "double" or ``pyspark.sql.types.DoubleType``: The leftmost numeric result cast to ``double`` or an exception if there is none. - ``ArrayType(FloatType|DoubleType)``: All numeric columns cast to the requested type or an exception if there are no numeric columns. - "string" or ``pyspark.sql.types.StringType``: The leftmost column converted to ``string``. - ``ArrayType(StringType)``: All columns converted to ``string``. :return: Spark UDF that applies the model's ``predict`` method to the data and returns a type specified by ``result_type``, which by default is a double. """ # Scope Spark import to this method so users don't need pyspark to use non-Spark-related # functionality. from mlflow.pyfunc.spark_model_cache import SparkModelCache from pyspark.sql.functions import pandas_udf from pyspark.sql.types import _parse_datatype_string from pyspark.sql.types import ArrayType, DataType as SparkDataType from pyspark.sql.types import DoubleType, IntegerType, FloatType, LongType, StringType if not isinstance(result_type, SparkDataType): result_type = _parse_datatype_string(result_type) elem_type = result_type if isinstance(elem_type, ArrayType): elem_type = elem_type.elementType supported_types = [ IntegerType, LongType, FloatType, DoubleType, StringType ] if not any([isinstance(elem_type, x) for x in supported_types]): raise MlflowException( message= "Invalid result_type '{}'. Result type can only be one of or an array of one " "of the following types types: {}".format(str(elem_type), str(supported_types)), error_code=INVALID_PARAMETER_VALUE, ) with TempDir() as local_tmpdir: local_model_path = _download_artifact_from_uri( artifact_uri=model_uri, output_path=local_tmpdir.path()) archive_path = SparkModelCache.add_local_model(spark, local_model_path) def predict(*args): model = SparkModelCache.get_or_load(archive_path) input_schema = model.metadata.get_input_schema() pdf = None for x in args: if type(x) == pandas.DataFrame: if len(args) != 1: raise Exception( "If passing a StructType column, there should be only one " "input column, but got %d" % len(args)) pdf = x if pdf is None: args = list(args) if input_schema is None: names = [str(i) for i in range(len(args))] else: names = input_schema.column_names() if len(args) > len(names): args = args[:len(names)] if len(args) < len(names): message = ( "Model input is missing columns. Expected {0} input columns {1}," " but the model received only {2} unnamed input columns" " (Since the columns were passed unnamed they are expected to be in" " the order specified by the schema).".format( len(names), names, len(args))) raise MlflowException(message) pdf = pandas.DataFrame( data={names[i]: x for i, x in enumerate(args)}, columns=names) result = model.predict(pdf) if not isinstance(result, pandas.DataFrame): result = pandas.DataFrame(data=result) elem_type = result_type.elementType if isinstance( result_type, ArrayType) else result_type if type(elem_type) == IntegerType: result = result.select_dtypes( [np.byte, np.ubyte, np.short, np.ushort, np.int32]).astype(np.int32) elif type(elem_type) == LongType: result = result.select_dtypes( [np.byte, np.ubyte, np.short, np.ushort, np.int, np.long]) elif type(elem_type) == FloatType: result = result.select_dtypes(include=(np.number, )).astype( np.float32) elif type(elem_type) == DoubleType: result = result.select_dtypes(include=(np.number, )).astype( np.float64) if len(result.columns) == 0: raise MlflowException( message= "The the model did not produce any values compatible with the requested " "type '{}'. Consider requesting udf with StringType or " "Arraytype(StringType).".format(str(elem_type)), error_code=INVALID_PARAMETER_VALUE, ) if type(elem_type) == StringType: result = result.applymap(str) if type(result_type) == ArrayType: return pandas.Series(result.to_numpy().tolist()) else: return result[result.columns[0]] return pandas_udf(predict, result_type)
def spark_udf(spark, model_uri, features, result_type="double"): """ Create spark pandas udf given the model uri :param spark: spark context :param model_uri: path to model :param features: list containing the feature names :param result_type: result type of the model :return: """ # Scope Spark import to this method so users don't need pyspark to use non-Spark-related # functionality. from mlflow.pyfunc.spark_model_cache import SparkModelCache from pyspark.sql.functions import pandas_udf from pyspark.sql.types import _parse_datatype_string from pyspark.sql.types import ArrayType, DataType from pyspark.sql.types import DoubleType, IntegerType, FloatType, LongType, StringType if not isinstance(result_type, DataType): result_type = _parse_datatype_string(result_type) elem_type = result_type if isinstance(elem_type, ArrayType): elem_type = elem_type.elementType supported_types = [ IntegerType, LongType, FloatType, DoubleType, StringType ] if not any([isinstance(elem_type, x) for x in supported_types]): raise ValueError( "Invalid result_type '{}'. Result type can only be one of or an array of one " "of the following types types: {}".format(str(elem_type), str(supported_types))) with TempDir() as local_tmpdir: local_model_path = _download_artifact_from_uri( artifact_uri=model_uri, output_path=local_tmpdir.path()) archive_path = SparkModelCache.add_local_model(spark, local_model_path) def predict(*args): model = SparkModelCache.get_or_load(archive_path) schema = {features[i]: arg for i, arg in enumerate(args)} pdf = None for x in args: if type(x) == pandas.DataFrame: if len(args) != 1: raise Exception( "If passing a StructType column, there should be only one " "input column, but got %d" % len(args)) pdf = x if pdf is None: pdf = pandas.DataFrame(schema) result = model.predict(pdf) if not isinstance(result, pandas.DataFrame): result = pandas.DataFrame(data=result) elif type(elem_type) == IntegerType: result = result.select_dtypes( [np.byte, np.ubyte, np.short, np.ushort, np.int32]).astype(np.int32) elif type(elem_type) == LongType: result = result.select_dtypes( [np.byte, np.ubyte, np.short, np.ushort, np.int, np.long]) elif type(elem_type) == FloatType: result = result.select_dtypes(include=(np.number, )).astype( np.float32) elif type(elem_type) == DoubleType: result = result.select_dtypes(include=(np.number, )).astype( np.float64) if len(result.columns) == 0: raise ValueError( "The the model did not produce any values compatible with the requested " "type '{}'. Consider requesting udf with StringType or " "Arraytype(StringType).".format(str(elem_type))) if type(elem_type) == StringType: result = result.applymap(str) if type(result_type) == ArrayType: return pandas.Series([row[1].values for row in result.iterrows()]) else: return result[result.columns[0]] return pandas_udf(predict, result_type)