Example 1
 def test_parse_datatype_string(self):
     from pyspark.sql.types import _all_atomic_types, _parse_datatype_string
     for k, t in _all_atomic_types.items():
         if t != NullType:
             self.assertEqual(t(), _parse_datatype_string(k))
     self.assertEqual(IntegerType(), _parse_datatype_string("int"))
     self.assertEqual(DecimalType(1, 1),
                      _parse_datatype_string("decimal(1  ,1)"))
     self.assertEqual(DecimalType(10, 1),
                      _parse_datatype_string("decimal( 10,1 )"))
     self.assertEqual(DecimalType(11, 1),
                      _parse_datatype_string("decimal(11,1)"))
     self.assertEqual(ArrayType(IntegerType()),
                      _parse_datatype_string("array<int >"))
     self.assertEqual(MapType(IntegerType(), DoubleType()),
                      _parse_datatype_string("map< int, double  >"))
     self.assertEqual(
         StructType([
             StructField("a", IntegerType()),
             StructField("c", DoubleType())
         ]), _parse_datatype_string("struct<a:int, c:double >"))
     self.assertEqual(
         StructType([
             StructField("a", IntegerType()),
             StructField("c", DoubleType())
         ]), _parse_datatype_string("a:int, c:double"))
     self.assertEqual(
         StructType([
             StructField("a", IntegerType()),
             StructField("c", DoubleType())
         ]), _parse_datatype_string("a INT, c DOUBLE"))
Example 2
def test_spark_schema_inference(pandas_df_with_all_types):
    import pyspark
    from pyspark.sql.types import _parse_datatype_string, StructField, StructType

    pandas_df_with_all_types = pandas_df_with_all_types.drop(
        columns=["boolean_ext", "integer_ext", "string_ext"])
    schema = _infer_schema(pandas_df_with_all_types)
    assert schema == Schema(
        [ColSpec(x, x) for x in pandas_df_with_all_types.columns])
    spark_session = pyspark.sql.SparkSession(
        pyspark.SparkContext.getOrCreate())

    struct_fields = []
    for t in schema.input_types():
        # pyspark _parse_datatype_string() expects "timestamp" instead of "datetime"
        if t == DataType.datetime:
            struct_fields.append(
                StructField("datetime", _parse_datatype_string("timestamp"),
                            True))
        else:
            struct_fields.append(
                StructField(t.name, _parse_datatype_string(t.name), True))
    spark_schema = StructType(struct_fields)
    sparkdf = spark_session.createDataFrame(pandas_df_with_all_types,
                                            schema=spark_schema)
    schema = _infer_schema(sparkdf)
    assert schema == Schema(
        [ColSpec(x, x) for x in pandas_df_with_all_types.columns])
Example 3
    def registerJavaFunction(self, name, javaClassName, returnType=None):
        """Register a Java user-defined function as a SQL function.

        In addition to a name and the function itself, the return type can be optionally specified.
        When the return type is not specified, it is inferred via reflection.

        :param name: name of the user-defined function
        :param javaClassName: fully qualified name of java class
        :param returnType: the return type of the registered Java function. The value can be either
            a :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string.

        >>> from pyspark.sql.types import IntegerType
        >>> spark.udf.registerJavaFunction(
        ...     "javaStringLength", "test.org.apache.spark.sql.JavaStringLength", IntegerType())
        >>> spark.sql("SELECT javaStringLength('test')").collect()
        [Row(UDF:javaStringLength(test)=4)]

        >>> spark.udf.registerJavaFunction(
        ...     "javaStringLength2", "test.org.apache.spark.sql.JavaStringLength")
        >>> spark.sql("SELECT javaStringLength2('test')").collect()
        [Row(UDF:javaStringLength2(test)=4)]

        >>> spark.udf.registerJavaFunction(
        ...     "javaStringLength3", "test.org.apache.spark.sql.JavaStringLength", "integer")
        >>> spark.sql("SELECT javaStringLength3('test')").collect()
        [Row(UDF:javaStringLength3(test)=4)]
        """

        jdt = None
        if returnType is not None:
            if not isinstance(returnType, DataType):
                returnType = _parse_datatype_string(returnType)
            jdt = self.sparkSession._jsparkSession.parseDataType(returnType.json())
        self.sparkSession._jsparkSession.udf().registerJava(name, javaClassName, jdt)
Example 4
    def add_column(self, colname, coltype, func, *col_names):
        """
        Add a column `colname` of type `coltype` to this `AxsFrame` object by executing function `func` on
        columns `col_names`. Function `func` has to accept the exact number of arguments as the number of columns
        specified. The function will be applied to the dataframe data row by row. The arguments passed to the function
        will match types of the specified columns and the function needs to return an object of type represented by
        string `coltype`.

        This method will use `@udf` in the background (see `pyspark.sql.functions.udf`).

        This function is slower than `add_primitive_column` but can produce array or map columns.

        :param colname: The name of the new column.
        :param coltype: The name of the new column's type. (See `pyspark.sql.types`)
        :param func: The function which will produce data for the new column. Needs to return a single object of the
        type defined by `coltype`.
        :param col_names: The columns whose data will be supplied to the function `func`.
        :return: Returns a new AxsFrame with the new column added.
        """
        from pyspark.sql.types import _parse_datatype_string
        t = _parse_datatype_string(coltype)

        @udf(returnType=t)
        def customudf(*cols):
            return func(*cols)

        columns = [self._df[cn] for cn in col_names]
        return wrap(self._df.withColumn(colname, customudf(*columns)),
                    self._table_info)
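A hypothetical usage sketch for `add_column` (the AxsFrame `frame` and the column names "ra" and "dec" are illustrative assumptions, not taken from the original source):

# Hypothetical: `frame` is assumed to be an existing AxsFrame with numeric columns "ra" and "dec".
def to_coords(ra, dec):
    # Called once per row; must return a value matching the declared "array<double>" type.
    return [float(ra), float(dec)]

frame_with_coords = frame.add_column("coords", "array<double>", to_coords, "ra", "dec")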
Example 5
    def add_primitive_column(self, colname, coltype, func, *col_names):
        """
        Add a column `colname` of type `coltype` to this `AxsFrame` object by executing function `func` on
        columns `col_names`. Function `func` has to accept the exact number of arguments as the number of columns
        specified. The arguments will be of type Pandas.Series and the function needs to return a Pandas.Series
        object of type represented by string `coltype`.

        This method will use `@pandas_udf` in the background (see `pyspark.sql.functions.pandas_udf`).

        The type of the new column needs to be a primitive type (int, double, etc.). The input columns can be of
        complex types.

        This function is faster than `add_column` but cannot *produce* array or map columns.

        :param colname: The name of the new column.
        :param coltype: The name of the new column's type. (See `pyspark.sql.types`)
        :param func: The function which will produce data for the new column. Needs to operate on Pandas.Series objects
        (of the same number as the number of `col_names`) and return a single Pandas.Series object of the type defined
        by `coltype`.
        :param col_names: The columns whose data will be supplied to the function `func`.
        :return: Returns a new AxsFrame with the new column added.
        """
        from pyspark.sql.types import _parse_datatype_string
        t = _parse_datatype_string(coltype)

        @pandas_udf(t)
        def pudf(*cols):
            return func(*cols)

        columns = [self._df[cn] for cn in col_names]
        return wrap(self._df.withColumn(colname, pudf(*columns)),
                    self._table_info)
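A hypothetical usage sketch for `add_primitive_column` (again, `frame` and the column name "mag" are illustrative assumptions):

# Hypothetical: `frame` is assumed to be an existing AxsFrame with a numeric column "mag".
def shift_mag(mag):
    # Receives a pandas.Series and must return a pandas.Series of the declared "double" type.
    return mag + 0.5

frame_with_shift = frame.add_primitive_column("mag_shifted", "double", shift_mag, "mag")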
Example 7
    def returnType(self):
        # This makes sure this is called after SparkContext is initialized.
        # ``_parse_datatype_string`` accesses the JVM to parse a DDL-formatted string.
        if self._returnType_placeholder is None:
            if isinstance(self._returnType, DataType):
                self._returnType_placeholder = self._returnType
            else:
                self._returnType_placeholder = _parse_datatype_string(
                    self._returnType)

        if self.evalType == PythonEvalType.SQL_SCALAR_PANDAS_UDF:
            try:
                to_arrow_type(self._returnType_placeholder)
            except TypeError:
                raise NotImplementedError(
                    "Invalid returnType with scalar Pandas UDFs: %s is "
                    "not supported" % str(self._returnType_placeholder))
        elif self.evalType == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF:
            if isinstance(self._returnType_placeholder, StructType):
                try:
                    to_arrow_schema(self._returnType_placeholder)
                except TypeError:
                    raise NotImplementedError(
                        "Invalid returnType with grouped map Pandas UDFs: "
                        "%s is not supported" %
                        str(self._returnType_placeholder))
            else:
                raise TypeError("Invalid returnType for grouped map Pandas "
                                "UDFs: returnType must be a StructType.")

        return self._returnType_placeholder
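For context, a minimal sketch of what this lazy parsing enables, assuming an active SparkSession named `spark`: a UDF can be declared with a DDL type string, and `_parse_datatype_string` (and hence the JVM) is only touched once the UDF is actually used.

# Minimal sketch, assuming an active SparkSession named `spark`.
from pyspark.sql.functions import udf

times_two = udf(lambda x: x * 2, "int")        # return type kept as a DDL string for now
spark.range(3).select(times_two("id")).show()  # the string is parsed into IntegerType when used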
Example 8
def spark_schema_from_json(j):
    return spark_types.StructType([
        spark_types.StructField(
            f["name"],
            spark_types._parse_datatype_string(f.get("type", "string")),
            f.get("nullable", True), f.get("metadata", None)) for f in j
    ])
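A hedged usage sketch for the helper above, assuming `spark_types` is an alias for `pyspark.sql.types` and a SparkContext is already running (needed because `_parse_datatype_string` goes through the JVM); the field list is made up for illustration:

# Hypothetical input: "type" defaults to "string" and "nullable" to True when omitted.
fields = [
    {"name": "id", "type": "bigint", "nullable": False},
    {"name": "label"},
]
schema = spark_schema_from_json(fields)  # roughly struct<id:bigint,label:string>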
Example 9
    def registerJavaFunction(
        self,
        name: str,
        javaClassName: str,
        returnType: Optional["DataTypeOrString"] = None,
    ) -> None:
        """Register a Java user-defined function as a SQL function.

        In addition to a name and the function itself, the return type can be optionally specified.
        When the return type is not specified, it is inferred via reflection.

        .. versionadded:: 2.3.0

        Parameters
        ----------
        name : str
            name of the user-defined function
        javaClassName : str
            fully qualified name of java class
        returnType : :class:`pyspark.sql.types.DataType` or str, optional
            the return type of the registered Java function. The value can be either
            a :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string.

        Examples
        --------
        >>> from pyspark.sql.types import IntegerType
        >>> spark.udf.registerJavaFunction(
        ...     "javaStringLength", "test.org.apache.spark.sql.JavaStringLength", IntegerType())
        ... # doctest: +SKIP
        >>> spark.sql("SELECT javaStringLength('test')").collect()  # doctest: +SKIP
        [Row(javaStringLength(test)=4)]

        >>> spark.udf.registerJavaFunction(
        ...     "javaStringLength2", "test.org.apache.spark.sql.JavaStringLength")
        ... # doctest: +SKIP
        >>> spark.sql("SELECT javaStringLength2('test')").collect()  # doctest: +SKIP
        [Row(javaStringLength2(test)=4)]

        >>> spark.udf.registerJavaFunction(
        ...     "javaStringLength3", "test.org.apache.spark.sql.JavaStringLength", "integer")
        ... # doctest: +SKIP
        >>> spark.sql("SELECT javaStringLength3('test')").collect()  # doctest: +SKIP
        [Row(javaStringLength3(test)=4)]
        """

        jdt = None
        if returnType is not None:
            if not isinstance(returnType, DataType):
                returnType = _parse_datatype_string(returnType)
            returnType = cast(DataType, returnType)
            jdt = self.sparkSession._jsparkSession.parseDataType(
                returnType.json())
        self.sparkSession._jsparkSession.udf().registerJava(
            name, javaClassName, jdt)
Example 10
def test_spark_schema_inference(pandas_df_with_all_types):
    import pyspark
    from pyspark.sql.types import _parse_datatype_string, StructField, StructType
    schema = _infer_schema(pandas_df_with_all_types)
    assert schema == Schema([ColSpec(x, x) for x in pandas_df_with_all_types.columns])
    spark_session = pyspark.sql.SparkSession(pyspark.SparkContext.getOrCreate())
    spark_schema = StructType(
        [StructField(t.name, _parse_datatype_string(t.name), True)
         for t in schema.column_types()])
    sparkdf = spark_session.createDataFrame(pandas_df_with_all_types, schema=spark_schema)
    schema = _infer_schema(sparkdf)
    assert schema == Schema([ColSpec(x, x) for x in pandas_df_with_all_types.columns])
Example 11
 def test_parse_datatype_string(self):
     from pyspark.sql.types import _all_atomic_types, _parse_datatype_string
     for k, t in _all_atomic_types.items():
         if t != NullType:
             self.assertEqual(t(), _parse_datatype_string(k))
     self.assertEqual(IntegerType(), _parse_datatype_string("int"))
     self.assertEqual(DecimalType(1, 1), _parse_datatype_string("decimal(1  ,1)"))
     self.assertEqual(DecimalType(10, 1), _parse_datatype_string("decimal( 10,1 )"))
     self.assertEqual(DecimalType(11, 1), _parse_datatype_string("decimal(11,1)"))
     self.assertEqual(
         ArrayType(IntegerType()),
         _parse_datatype_string("array<int >"))
     self.assertEqual(
         MapType(IntegerType(), DoubleType()),
         _parse_datatype_string("map< int, double  >"))
     self.assertEqual(
         StructType([StructField("a", IntegerType()), StructField("c", DoubleType())]),
         _parse_datatype_string("struct<a:int, c:double >"))
     self.assertEqual(
         StructType([StructField("a", IntegerType()), StructField("c", DoubleType())]),
         _parse_datatype_string("a:int, c:double"))
     self.assertEqual(
         StructType([StructField("a", IntegerType()), StructField("c", DoubleType())]),
         _parse_datatype_string("a INT, c DOUBLE"))
Example 12
    def returnType(self):
        # This makes sure this is called after SparkContext is initialized.
        # ``_parse_datatype_string`` accesses the JVM to parse a DDL-formatted string.
        if self._returnType_placeholder is None:
            if isinstance(self._returnType, DataType):
                self._returnType_placeholder = self._returnType
            else:
                self._returnType_placeholder = _parse_datatype_string(self._returnType)

        if self.evalType == PythonEvalType.SQL_PANDAS_GROUP_MAP_UDF \
                and not isinstance(self._returnType_placeholder, StructType):
            raise ValueError("Invalid returnType: returnType must be a StructType for "
                             "pandas_udf with function type GROUP_MAP")

        return self._returnType_placeholder
Example 13
File: udf.py Project: zzq1/Spark
    def returnType(self):
        # This makes sure this is called after SparkContext is initialized.
        # ``_parse_datatype_string`` accesses the JVM to parse a DDL-formatted string.
        if self._returnType_placeholder is None:
            if isinstance(self._returnType, DataType):
                self._returnType_placeholder = self._returnType
            else:
                self._returnType_placeholder = _parse_datatype_string(self._returnType)

        if self.evalType == PythonEvalType.SQL_PANDAS_GROUP_MAP_UDF \
                and not isinstance(self._returnType_placeholder, StructType):
            raise ValueError("Invalid returnType: returnType must be a StructType for "
                             "pandas_udf with function type GROUP_MAP")

        return self._returnType_placeholder
Example 14
def insert_fg(spark: SparkSession, job_conf: Dict[Any, Any]) -> None:
    """
    Insert data into a feature group.
    The data path, feature group name and versions are in the configuration file
    """
    feature_store = job_conf.pop("feature_store")
    fs = get_feature_store_handle(feature_store)

    fg = fs.get_feature_group(name=job_conf["name"], version=job_conf["version"])

    schema = StructType(
        [StructField(f.name, _parse_datatype_string(f.type), True) for f in fg.features]
    )

    df = get_fg_spark_df(job_conf, schema)
    fg.insert(df, write_options=job_conf.pop("write_options", {}) or {})
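For illustration only, a hypothetical `job_conf` shaped to match the keys this function reads; none of the values come from the original source:

# Hypothetical configuration for insert_fg(); keys mirror the pops/lookups above.
job_conf = {
    "feature_store": "demo_featurestore",  # placeholder store name
    "name": "transactions_fg",             # placeholder feature group name
    "version": 1,
    "write_options": {},                   # optional; treated as {} when missing
    # ...plus whatever keys get_fg_spark_df() needs to locate the input data
}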
Example 15
    def returnType(self):
        # This makes sure this is called after SparkContext is initialized.
        # ``_parse_datatype_string`` accesses the JVM to parse a DDL-formatted string.
        if self._returnType_placeholder is None:
            if isinstance(self._returnType, DataType):
                self._returnType_placeholder = self._returnType
            else:
                self._returnType_placeholder = _parse_datatype_string(self._returnType)

        if self.evalType == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF \
                and not isinstance(self._returnType_placeholder, StructType):
            raise ValueError("Invalid returnType: returnType must be a StructType for "
                             "pandas_udf with function type GROUPED_MAP")
        elif self.evalType == PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF \
                and isinstance(self._returnType_placeholder, (StructType, ArrayType, MapType)):
            raise NotImplementedError(
                "ArrayType, StructType and MapType are not supported with "
                "PandasUDFType.GROUPED_AGG")

        return self._returnType_placeholder
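A minimal sketch of the case this check targets, assuming an active SparkSession `spark`, the Spark 2.x grouped-map pandas_udf API, and pyarrow installed: the DDL string below is parsed into a StructType, which is exactly what the validation above requires.

# Minimal sketch, assuming an active SparkSession `spark` and the Spark 2.x pandas_udf API.
from pyspark.sql.functions import pandas_udf, PandasUDFType

@pandas_udf("id long, v double", PandasUDFType.GROUPED_MAP)  # DDL string -> StructType
def subtract_mean(pdf):
    # pdf is a pandas.DataFrame for one group; return a pandas.DataFrame matching the schema.
    return pdf.assign(v=pdf.v - pdf.v.mean())

df = spark.createDataFrame([(1, 1.0), (1, 2.0), (2, 3.0)], ("id", "v"))
df.groupby("id").apply(subtract_mean).show()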
Example 16
File: udf.py Project: Brett-A/spark
    def returnType(self):
        # This makes sure this is called after SparkContext is initialized.
        # ``_parse_datatype_string`` accesses the JVM to parse a DDL-formatted string.
        if self._returnType_placeholder is None:
            if isinstance(self._returnType, DataType):
                self._returnType_placeholder = self._returnType
            else:
                self._returnType_placeholder = _parse_datatype_string(self._returnType)

        if self.evalType == PythonEvalType.SQL_SCALAR_PANDAS_UDF:
            try:
                to_arrow_type(self._returnType_placeholder)
            except TypeError:
                raise NotImplementedError(
                    "Invalid returnType with scalar Pandas UDFs: %s is "
                    "not supported" % str(self._returnType_placeholder))
        elif self.evalType == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF:
            if isinstance(self._returnType_placeholder, StructType):
                try:
                    to_arrow_type(self._returnType_placeholder)
                except TypeError:
                    raise NotImplementedError(
                        "Invalid returnType with grouped map Pandas UDFs: "
                        "%s is not supported" % str(self._returnType_placeholder))
            else:
                raise TypeError("Invalid returnType for grouped map Pandas "
                                "UDFs: returnType must be a StructType.")
        elif self.evalType == PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF:
            try:
                # StructType is not yet allowed as a return type, explicitly check here to fail fast
                if isinstance(self._returnType_placeholder, StructType):
                    raise TypeError
                to_arrow_type(self._returnType_placeholder)
            except TypeError:
                raise NotImplementedError(
                    "Invalid returnType with grouped aggregate Pandas UDFs: "
                    "%s is not supported" % str(self._returnType_placeholder))

        return self._returnType_placeholder
Example 17
def spark_udf(spark, path, run_id=None, result_type="double"):
    """
    A Spark UDF that can be used to invoke the Python function formatted model.

    Parameters passed to the UDF are forwarded to the model as a DataFrame where the names are
    ordinals (0, 1, ...).

    The predictions are filtered to contain only the columns that can be represented as the
    ``result_type``. If the ``result_type`` is string or array of strings, all predictions are
    converted to string. If the result type is not an array type, the leftmost column with
    matching type will be returned.

    >>> predict = mlflow.pyfunc.spark_udf(spark, "/my/local/model")
    >>> df.withColumn("prediction", predict("name", "age")).show()

    :param spark: A SparkSession object.
    :param path: A path containing a :py:mod:`mlflow.pyfunc` model.
    :param run_id: ID of the run that produced this model. If provided, ``run_id`` is used to
                   retrieve the model logged with MLflow.
    :param result_type: the return type of the user-defined function. The value can be either a
                        :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string.
                        Only a primitive type or an array (pyspark.sql.types.ArrayType) of primitive
                        types are allowed. The following classes of result type are supported:
                        - "int" or pyspark.sql.types.IntegerType: The leftmost integer that can fit
                          in int32 result is returned or exception is raised if there is none.
                        - "long" or pyspark.sql.types.LongType: The leftmost long integer that can
                          fit in int64 result is returned or exception is raised if there is none.
                        - ArrayType(IntegerType|LongType): Return all integer columns that can fit
                          into the requested size.
                        - "float" or pyspark.sql.types.FloatType: The leftmost numeric result cast
                          to float32 is returned or exception is raised if there is none.
                        - "double" or pyspark.sql.types.DoubleType: The leftmost numeric result cast
                          to double is returned or exception is raised if there is none.
                        - ArrayType(FloatType|DoubleType): Return all numeric columns cast to the
                          requested type. Exception is raised if there are no numeric columns.
                        - "string" or pyspark.sql.types.StringType: Result is the leftmost column
                          converted to string.
                        - ArrayType(StringType): Return all columns converted to string.

    :return: Spark UDF that applies the model's prediction method to the data. The return type defaults to double.
    """

    # Scope Spark import to this method so users don't need pyspark to use non-Spark-related
    # functionality.
    from mlflow.pyfunc.spark_model_cache import SparkModelCache
    from pyspark.sql.functions import pandas_udf
    from pyspark.sql.types import _parse_datatype_string
    from pyspark.sql.types import ArrayType, DataType
    from pyspark.sql.types import DoubleType, IntegerType, FloatType, LongType, StringType

    if not isinstance(result_type, DataType):
        result_type = _parse_datatype_string(result_type)

    elem_type = result_type
    if isinstance(elem_type, ArrayType):
        elem_type = elem_type.elementType

    supported_types = [
        IntegerType, LongType, FloatType, DoubleType, StringType
    ]

    if not any([isinstance(elem_type, x) for x in supported_types]):
        raise MlflowException(
            message=
            "Invalid result_type '{}'. Result type can only be one of or an array of one "
            "of the following types types: {}".format(str(elem_type),
                                                      str(supported_types)),
            error_code=INVALID_PARAMETER_VALUE)

    if run_id:
        path = tracking.artifact_utils._get_model_log_dir(path, run_id)

    archive_path = SparkModelCache.add_local_model(spark, path)

    def predict(*args):
        model = SparkModelCache.get_or_load(archive_path)
        schema = {str(i): arg for i, arg in enumerate(args)}
        # Explicitly pass order of columns to avoid lexicographic ordering (i.e., 10 < 2)
        columns = [str(i) for i, _ in enumerate(args)]
        pdf = pandas.DataFrame(schema, columns=columns)
        result = model.predict(pdf)
        if not isinstance(result, pandas.DataFrame):
            result = pandas.DataFrame(data=result)

        elif type(elem_type) == IntegerType:
            result = result.select_dtypes(
                [np.byte, np.ubyte, np.short, np.ushort,
                 np.int32]).astype(np.int32)

        elif type(elem_type) == LongType:
            result = result.select_dtypes(
                [np.byte, np.ubyte, np.short, np.ushort, np.int, np.long])

        elif type(elem_type) == FloatType:
            result = result.select_dtypes(include=np.number).astype(np.float32)

        elif type(elem_type) == DoubleType:
            result = result.select_dtypes(include=np.number).astype(np.float64)

        if len(result.columns) == 0:
            raise MlflowException(
                message=
                "The the model did not produce any values compatible with the requested "
                "type '{}'. Consider requesting udf with StringType or "
                "Arraytype(StringType).".format(str(elem_type)),
                error_code=INVALID_PARAMETER_VALUE)

        if type(elem_type) == StringType:
            result = result.applymap(str)

        if type(result_type) == ArrayType:
            return pandas.Series([row[1].values for row in result.iterrows()])
        else:
            return result[result.columns[0]]

    return pandas_udf(predict, result_type)
Example 18
    def createDataFrame(self,
                        data,
                        schema=None,
                        samplingRatio=None,
                        verifySchema=True):
        """
        Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`.

        When ``schema`` is a list of column names, the type of each column
        will be inferred from ``data``.

        When ``schema`` is ``None``, it will try to infer the schema (column names and types)
        from ``data``, which should be an RDD of :class:`Row`,
        or :class:`namedtuple`, or :class:`dict`.

        When ``schema`` is :class:`pyspark.sql.types.DataType` or a datatype string, it must match
        the real data, or an exception will be thrown at runtime. If the given schema is not
        :class:`pyspark.sql.types.StructType`, it will be wrapped into a
        :class:`pyspark.sql.types.StructType` as its only field, and the field name will be "value".
        Each record will also be wrapped into a tuple, which can be converted to row later.

        If schema inference is needed, ``samplingRatio`` is used to determine the ratio of
        rows used for schema inference. The first row will be used if ``samplingRatio`` is ``None``.

        :param data: an RDD of any kind of SQL data representation(e.g. row, tuple, int, boolean,
            etc.), or :class:`list`, or :class:`pandas.DataFrame`.
        :param schema: a :class:`pyspark.sql.types.DataType` or a datatype string or a list of
            column names, default is ``None``.  The data type string format equals to
            :class:`pyspark.sql.types.DataType.simpleString`, except that top level struct type can
            omit the ``struct<>`` and atomic types use ``typeName()`` as their format, e.g. use
            ``byte`` instead of ``tinyint`` for :class:`pyspark.sql.types.ByteType`. We can also use
            ``int`` as a short name for ``IntegerType``.
        :param samplingRatio: the sample ratio of rows used for inferring
        :param verifySchema: verify data types of every row against schema.
        :return: :class:`DataFrame`

        .. versionchanged:: 2.1
           Added verifySchema.

        .. note:: Usage with spark.sql.execution.arrow.enabled=True is experimental.

        >>> l = [('Alice', 1)]
        >>> spark.createDataFrame(l).collect()
        [Row(_1=u'Alice', _2=1)]
        >>> spark.createDataFrame(l, ['name', 'age']).collect()
        [Row(name=u'Alice', age=1)]

        >>> d = [{'name': 'Alice', 'age': 1}]
        >>> spark.createDataFrame(d).collect()
        [Row(age=1, name=u'Alice')]

        >>> rdd = sc.parallelize(l)
        >>> spark.createDataFrame(rdd).collect()
        [Row(_1=u'Alice', _2=1)]
        >>> df = spark.createDataFrame(rdd, ['name', 'age'])
        >>> df.collect()
        [Row(name=u'Alice', age=1)]

        >>> from pyspark.sql import Row
        >>> Person = Row('name', 'age')
        >>> person = rdd.map(lambda r: Person(*r))
        >>> df2 = spark.createDataFrame(person)
        >>> df2.collect()
        [Row(name=u'Alice', age=1)]

        >>> from pyspark.sql.types import *
        >>> schema = StructType([
        ...    StructField("name", StringType(), True),
        ...    StructField("age", IntegerType(), True)])
        >>> df3 = spark.createDataFrame(rdd, schema)
        >>> df3.collect()
        [Row(name=u'Alice', age=1)]

        >>> spark.createDataFrame(df.toPandas()).collect()  # doctest: +SKIP
        [Row(name=u'Alice', age=1)]
        >>> spark.createDataFrame(pandas.DataFrame([[1, 2]])).collect()  # doctest: +SKIP
        [Row(0=1, 1=2)]

        >>> spark.createDataFrame(rdd, "a: string, b: int").collect()
        [Row(a=u'Alice', b=1)]
        >>> rdd = rdd.map(lambda row: row[1])
        >>> spark.createDataFrame(rdd, "int").collect()
        [Row(value=1)]
        >>> spark.createDataFrame(rdd, "boolean").collect() # doctest: +IGNORE_EXCEPTION_DETAIL
        Traceback (most recent call last):
            ...
        Py4JJavaError: ...
        """
        if isinstance(data, DataFrame):
            raise TypeError("data is already a DataFrame")

        if isinstance(schema, basestring):
            schema = _parse_datatype_string(schema)
        elif isinstance(schema, (list, tuple)):
            # Must re-encode any unicode strings to be consistent with StructField names
            schema = [
                x.encode('utf-8') if not isinstance(x, str) else x
                for x in schema
            ]

        try:
            import pandas
            has_pandas = True
        except Exception:
            has_pandas = False
        if has_pandas and isinstance(data, pandas.DataFrame):
            from pyspark.sql.utils import require_minimum_pandas_version
            require_minimum_pandas_version()

            if self.conf.get("spark.sql.execution.pandas.respectSessionTimeZone").lower() \
               == "true":
                timezone = self.conf.get("spark.sql.session.timeZone")
            else:
                timezone = None

            # If no schema supplied by user then get the names of columns only
            if schema is None:
                schema = [
                    str(x) if not isinstance(x, basestring) else
                    (x.encode('utf-8') if not isinstance(x, str) else x)
                    for x in data.columns
                ]

            if self.conf.get("spark.sql.execution.arrow.enabled", "false").lower() == "true" \
                    and len(data) > 0:
                try:
                    return self._create_from_pandas_with_arrow(
                        data, schema, timezone)
                except Exception as e:
                    from pyspark.util import _exception_message

                    if self.conf.get("spark.sql.execution.arrow.fallback.enabled", "true") \
                            .lower() == "true":
                        msg = (
                            "createDataFrame attempted Arrow optimization because "
                            "'spark.sql.execution.arrow.enabled' is set to true; however, "
                            "failed by the reason below:\n  %s\n"
                            "Attempting non-optimization as "
                            "'spark.sql.execution.arrow.fallback.enabled' is set to "
                            "true." % _exception_message(e))
                        warnings.warn(msg)
                    else:
                        msg = (
                            "createDataFrame attempted Arrow optimization because "
                            "'spark.sql.execution.arrow.enabled' is set to true, but has reached "
                            "the error below and will not continue because automatic fallback "
                            "with 'spark.sql.execution.arrow.fallback.enabled' has been set to "
                            "false.\n  %s" % _exception_message(e))
                        warnings.warn(msg)
                        raise
            data = self._convert_from_pandas(data, schema, timezone)

        if isinstance(schema, StructType):
            verify_func = _make_type_verifier(
                schema) if verifySchema else lambda _: True

            def prepare(obj):
                verify_func(obj)
                return obj
        elif isinstance(schema, DataType):
            dataType = schema
            schema = StructType().add("value", schema)

            verify_func = _make_type_verifier(
                dataType,
                name="field value") if verifySchema else lambda _: True

            def prepare(obj):
                verify_func(obj)
                return obj,
        else:
            prepare = lambda obj: obj

        if isinstance(data, RDD):
            rdd, schema = self._createFromRDD(data.map(prepare), schema,
                                              samplingRatio)
        else:
            rdd, schema = self._createFromLocal(map(prepare, data), schema)
        jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd())
        jdf = self._jsparkSession.applySchemaToPythonRDD(
            jrdd.rdd(), schema.json())
        df = DataFrame(jdf, self._wrapped)
        df._schema = schema
        return df
Example 19
    def createDataFrame(  # type: ignore[misc]
        self,
        data: Union["RDD[Any]", Iterable[Any], "PandasDataFrameLike"],
        schema: Optional[Union[AtomicType, StructType, str]] = None,
        samplingRatio: Optional[float] = None,
        verifySchema: bool = True
    ) -> DataFrame:
        """
        Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`.

        When ``schema`` is a list of column names, the type of each column
        will be inferred from ``data``.

        When ``schema`` is ``None``, it will try to infer the schema (column names and types)
        from ``data``, which should be an RDD of either :class:`Row`,
        :class:`namedtuple`, or :class:`dict`.

        When ``schema`` is :class:`pyspark.sql.types.DataType` or a datatype string, it must match
        the real data, or an exception will be thrown at runtime. If the given schema is not
        :class:`pyspark.sql.types.StructType`, it will be wrapped into a
        :class:`pyspark.sql.types.StructType` as its only field, and the field name will be "value".
        Each record will also be wrapped into a tuple, which can be converted to row later.

        If schema inference is needed, ``samplingRatio`` is used to determine the ratio of
        rows used for schema inference. The first row will be used if ``samplingRatio`` is ``None``.

        .. versionadded:: 2.0.0

        .. versionchanged:: 2.1.0
           Added verifySchema.

        Parameters
        ----------
        data : :class:`RDD` or iterable
            an RDD of any kind of SQL data representation (:class:`Row`,
            :class:`tuple`, ``int``, ``boolean``, etc.), or :class:`list`, or
            :class:`pandas.DataFrame`.
        schema : :class:`pyspark.sql.types.DataType`, str or list, optional
            a :class:`pyspark.sql.types.DataType` or a datatype string or a list of
            column names, default is None.  The data type string format equals to
            :class:`pyspark.sql.types.DataType.simpleString`, except that top level struct type can
            omit the ``struct<>`` and atomic types use ``typeName()`` as their format, e.g. use
            ``byte`` instead of ``tinyint`` for :class:`pyspark.sql.types.ByteType`.
            We can also use ``int`` as a short name for :class:`pyspark.sql.types.IntegerType`.
        samplingRatio : float, optional
            the sample ratio of rows used for inferring
        verifySchema : bool, optional
            verify data types of every row against schema. Enabled by default.

        Returns
        -------
        :class:`DataFrame`

        Notes
        -----
        Usage with spark.sql.execution.arrow.pyspark.enabled=True is experimental.

        Examples
        --------
        >>> l = [('Alice', 1)]
        >>> spark.createDataFrame(l).collect()
        [Row(_1='Alice', _2=1)]
        >>> spark.createDataFrame(l, ['name', 'age']).collect()
        [Row(name='Alice', age=1)]

        >>> d = [{'name': 'Alice', 'age': 1}]
        >>> spark.createDataFrame(d).collect()
        [Row(age=1, name='Alice')]

        >>> rdd = sc.parallelize(l)
        >>> spark.createDataFrame(rdd).collect()
        [Row(_1='Alice', _2=1)]
        >>> df = spark.createDataFrame(rdd, ['name', 'age'])
        >>> df.collect()
        [Row(name='Alice', age=1)]

        >>> from pyspark.sql import Row
        >>> Person = Row('name', 'age')
        >>> person = rdd.map(lambda r: Person(*r))
        >>> df2 = spark.createDataFrame(person)
        >>> df2.collect()
        [Row(name='Alice', age=1)]

        >>> from pyspark.sql.types import *
        >>> schema = StructType([
        ...    StructField("name", StringType(), True),
        ...    StructField("age", IntegerType(), True)])
        >>> df3 = spark.createDataFrame(rdd, schema)
        >>> df3.collect()
        [Row(name='Alice', age=1)]

        >>> spark.createDataFrame(df.toPandas()).collect()  # doctest: +SKIP
        [Row(name='Alice', age=1)]
        >>> spark.createDataFrame(pandas.DataFrame([[1, 2]])).collect()  # doctest: +SKIP
        [Row(0=1, 1=2)]

        >>> spark.createDataFrame(rdd, "a: string, b: int").collect()
        [Row(a='Alice', b=1)]
        >>> rdd = rdd.map(lambda row: row[1])
        >>> spark.createDataFrame(rdd, "int").collect()
        [Row(value=1)]
        >>> spark.createDataFrame(rdd, "boolean").collect() # doctest: +IGNORE_EXCEPTION_DETAIL
        Traceback (most recent call last):
            ...
        Py4JJavaError: ...
        """
        SparkSession._activeSession = self
        self._jvm.SparkSession.setActiveSession(self._jsparkSession)
        if isinstance(data, DataFrame):
            raise TypeError("data is already a DataFrame")

        if isinstance(schema, str):
            schema = cast(Union[AtomicType, StructType, str], _parse_datatype_string(schema))
        elif isinstance(schema, (list, tuple)):
            # Must re-encode any unicode strings to be consistent with StructField names
            schema = [x.encode('utf-8') if not isinstance(x, str) else x for x in schema]

        try:
            import pandas
            has_pandas = True
        except Exception:
            has_pandas = False
        if has_pandas and isinstance(data, pandas.DataFrame):
            # Create a DataFrame from pandas DataFrame.
            return super(SparkSession, self).createDataFrame(  # type: ignore[call-overload]
                data, schema, samplingRatio, verifySchema)
        return self._create_dataframe(
            data, schema, samplingRatio, verifySchema  # type: ignore[arg-type]
        )
Example 20
# COMMAND ----------

from pyspark.sql.types import _parse_datatype_string

aggregate_schema = """
  _id string,
  lifestyle string,
  mean_BMI double,
  mean_active_heartrate double,
  mean_resting_heartrate double,
  mean_VO2_max double
"""

assert health_tracker_sample_agg_df.schema == _parse_datatype_string(aggregate_schema)

# COMMAND ----------

# MAGIC %md
# MAGIC ### Write the Aggregation Dataframe to a Delta Table
# MAGIC 
# MAGIC Use the following path: `goldPath + "health_tracker_sample_agg"`

# COMMAND ----------

# TODO
(
  health_tracker_sample_agg_df.write
  .format("delta")
  .mode("overwrite")
Example 21
def spark_udf(spark, model_uri, result_type="double"):
    """
    A Spark UDF that can be used to invoke the Python function formatted model.

    Parameters passed to the UDF are forwarded to the model as a DataFrame where the names are
    ordinals (0, 1, ...).

    The predictions are filtered to contain only the columns that can be represented as the
    ``result_type``. If the ``result_type`` is string or array of strings, all predictions are
    converted to string. If the result type is not an array type, the leftmost column with
    matching type is returned.

    >>> predict = mlflow.pyfunc.spark_udf(spark, "/my/local/model")
    >>> df.withColumn("prediction", predict("name", "age")).show()

    :param spark: A SparkSession object.
    :param model_uri: The location, in URI format, of the MLflow model with the
                      :py:mod:`mlflow.pyfunc` flavor. For example:

                      - ``/Users/me/path/to/local/model``
                      - ``relative/path/to/local/model``
                      - ``s3://my_bucket/path/to/model``
                      - ``runs:/<mlflow_run_id>/run-relative/path/to/model``

                      For more information about supported URI schemes, see
                      `Referencing Artifacts <https://www.mlflow.org/docs/latest/tracking.html#
                      artifact-locations>`_.

    :param result_type: the return type of the user-defined function. The value can be either a
        :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string. Only a primitive
        type or an array ``pyspark.sql.types.ArrayType`` of primitive type are allowed.
        The following classes of result type are supported:

        - "int" or ``pyspark.sql.types.IntegerType``: The leftmost integer that can fit in an
          ``int32`` or an exception if there is none.

        - "long" or ``pyspark.sql.types.LongType``: The leftmost long integer that can fit in an
          ``int64`` or an exception if there is none.

        - ``ArrayType(IntegerType|LongType)``: All integer columns that can fit into the requested
          size.

        - "float" or ``pyspark.sql.types.FloatType``: The leftmost numeric result cast to
          ``float32`` or an exception if there is none.

        - "double" or ``pyspark.sql.types.DoubleType``: The leftmost numeric result cast to
          ``double`` or an exception if there is none.

        - ``ArrayType(FloatType|DoubleType)``: All numeric columns cast to the requested type or
          an exception if there are no numeric columns.

        - "string" or ``pyspark.sql.types.StringType``: The leftmost column converted to ``string``.

        - ``ArrayType(StringType)``: All columns converted to ``string``.

    :return: Spark UDF that applies the model's ``predict`` method to the data and returns a
             type specified by ``result_type``, which by default is a double.
    """

    # Scope Spark import to this method so users don't need pyspark to use non-Spark-related
    # functionality.
    from mlflow.pyfunc.spark_model_cache import SparkModelCache
    from pyspark.sql.functions import pandas_udf
    from pyspark.sql.types import _parse_datatype_string
    from pyspark.sql.types import ArrayType, DataType
    from pyspark.sql.types import DoubleType, IntegerType, FloatType, LongType, StringType

    if not isinstance(result_type, DataType):
        result_type = _parse_datatype_string(result_type)

    elem_type = result_type
    if isinstance(elem_type, ArrayType):
        elem_type = elem_type.elementType

    supported_types = [IntegerType, LongType, FloatType, DoubleType, StringType]

    if not any([isinstance(elem_type, x) for x in supported_types]):
        raise MlflowException(
            message="Invalid result_type '{}'. Result type can only be one of or an array of one "
                    "of the following types types: {}".format(str(elem_type), str(supported_types)),
            error_code=INVALID_PARAMETER_VALUE)

    local_model_path = _download_artifact_from_uri(artifact_uri=model_uri)
    archive_path = SparkModelCache.add_local_model(spark, local_model_path)

    def predict(*args):
        model = SparkModelCache.get_or_load(archive_path)
        schema = {str(i): arg for i, arg in enumerate(args)}
        # Explicitly pass order of columns to avoid lexicographic ordering (i.e., 10 < 2)
        columns = [str(i) for i, _ in enumerate(args)]
        pdf = pandas.DataFrame(schema, columns=columns)
        result = model.predict(pdf)
        if not isinstance(result, pandas.DataFrame):
            result = pandas.DataFrame(data=result)

        elif type(elem_type) == IntegerType:
            result = result.select_dtypes([np.byte, np.ubyte, np.short, np.ushort,
                                           np.int32]).astype(np.int32)

        elif type(elem_type) == LongType:
            result = result.select_dtypes([np.byte, np.ubyte, np.short, np.ushort, np.int, np.long])

        elif type(elem_type) == FloatType:
            result = result.select_dtypes(include=(np.number,)).astype(np.float32)

        elif type(elem_type) == DoubleType:
            result = result.select_dtypes(include=(np.number,)).astype(np.float64)

        if len(result.columns) == 0:
            raise MlflowException(
                message="The the model did not produce any values compatible with the requested "
                        "type '{}'. Consider requesting udf with StringType or "
                        "Arraytype(StringType).".format(str(elem_type)),
                error_code=INVALID_PARAMETER_VALUE)

        if type(elem_type) == StringType:
            result = result.applymap(str)

        if type(result_type) == ArrayType:
            return pandas.Series([row[1].values for row in result.iterrows()])
        else:
            return result[result.columns[0]]

    return pandas_udf(predict, result_type)
Example 22
    def createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=True):
        """
        Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`.

        When ``schema`` is a list of column names, the type of each column
        will be inferred from ``data``.

        When ``schema`` is ``None``, it will try to infer the schema (column names and types)
        from ``data``, which should be an RDD of :class:`Row`,
        or :class:`namedtuple`, or :class:`dict`.

        When ``schema`` is :class:`pyspark.sql.types.DataType` or a datatype string, it must match
        the real data, or an exception will be thrown at runtime. If the given schema is not
        :class:`pyspark.sql.types.StructType`, it will be wrapped into a
        :class:`pyspark.sql.types.StructType` as its only field, and the field name will be "value".
        Each record will also be wrapped into a tuple, which can be converted to row later.

        If schema inference is needed, ``samplingRatio`` is used to determine the ratio of
        rows used for schema inference. The first row will be used if ``samplingRatio`` is ``None``.

        :param data: an RDD of any kind of SQL data representation(e.g. row, tuple, int, boolean,
            etc.), or :class:`list`, or :class:`pandas.DataFrame`.
        :param schema: a :class:`pyspark.sql.types.DataType` or a datatype string or a list of
            column names, default is ``None``.  The data type string format equals to
            :class:`pyspark.sql.types.DataType.simpleString`, except that top level struct type can
            omit the ``struct<>`` and atomic types use ``typeName()`` as their format, e.g. use
            ``byte`` instead of ``tinyint`` for :class:`pyspark.sql.types.ByteType`. We can also use
            ``int`` as a short name for ``IntegerType``.
        :param samplingRatio: the sample ratio of rows used for inferring
        :param verifySchema: verify data types of every row against schema.
        :return: :class:`DataFrame`

        .. versionchanged:: 2.1
           Added verifySchema.

        .. note:: Usage with spark.sql.execution.arrow.enabled=True is experimental.

        >>> l = [('Alice', 1)]
        >>> spark.createDataFrame(l).collect()
        [Row(_1=u'Alice', _2=1)]
        >>> spark.createDataFrame(l, ['name', 'age']).collect()
        [Row(name=u'Alice', age=1)]

        >>> d = [{'name': 'Alice', 'age': 1}]
        >>> spark.createDataFrame(d).collect()
        [Row(age=1, name=u'Alice')]

        >>> rdd = sc.parallelize(l)
        >>> spark.createDataFrame(rdd).collect()
        [Row(_1=u'Alice', _2=1)]
        >>> df = spark.createDataFrame(rdd, ['name', 'age'])
        >>> df.collect()
        [Row(name=u'Alice', age=1)]

        >>> from pyspark.sql import Row
        >>> Person = Row('name', 'age')
        >>> person = rdd.map(lambda r: Person(*r))
        >>> df2 = spark.createDataFrame(person)
        >>> df2.collect()
        [Row(name=u'Alice', age=1)]

        >>> from pyspark.sql.types import *
        >>> schema = StructType([
        ...    StructField("name", StringType(), True),
        ...    StructField("age", IntegerType(), True)])
        >>> df3 = spark.createDataFrame(rdd, schema)
        >>> df3.collect()
        [Row(name=u'Alice', age=1)]

        >>> spark.createDataFrame(df.toPandas()).collect()  # doctest: +SKIP
        [Row(name=u'Alice', age=1)]
        >>> spark.createDataFrame(pandas.DataFrame([[1, 2]])).collect()  # doctest: +SKIP
        [Row(0=1, 1=2)]

        >>> spark.createDataFrame(rdd, "a: string, b: int").collect()
        [Row(a=u'Alice', b=1)]
        >>> rdd = rdd.map(lambda row: row[1])
        >>> spark.createDataFrame(rdd, "int").collect()
        [Row(value=1)]
        >>> spark.createDataFrame(rdd, "boolean").collect() # doctest: +IGNORE_EXCEPTION_DETAIL
        Traceback (most recent call last):
            ...
        Py4JJavaError: ...
        """
        SparkSession._activeSession = self
        self._jvm.SparkSession.setActiveSession(self._jsparkSession)
        if isinstance(data, DataFrame):
            raise TypeError("data is already a DataFrame")

        if isinstance(schema, basestring):
            schema = _parse_datatype_string(schema)
        elif isinstance(schema, (list, tuple)):
            # Must re-encode any unicode strings to be consistent with StructField names
            schema = [x.encode('utf-8') if not isinstance(x, str) else x for x in schema]

        try:
            import pandas
            has_pandas = True
        except Exception:
            has_pandas = False
        if has_pandas and isinstance(data, pandas.DataFrame):
            from pyspark.sql.utils import require_minimum_pandas_version
            require_minimum_pandas_version()

            if self._wrapped._conf.pandasRespectSessionTimeZone():
                timezone = self._wrapped._conf.sessionLocalTimeZone()
            else:
                timezone = None

            # If no schema supplied by user then get the names of columns only
            if schema is None:
                schema = [str(x) if not isinstance(x, basestring) else
                          (x.encode('utf-8') if not isinstance(x, str) else x)
                          for x in data.columns]

            if self._wrapped._conf.arrowEnabled() and len(data) > 0:
                try:
                    return self._create_from_pandas_with_arrow(data, schema, timezone)
                except Exception as e:
                    from pyspark.util import _exception_message

                    if self._wrapped._conf.arrowFallbackEnabled():
                        msg = (
                            "createDataFrame attempted Arrow optimization because "
                            "'spark.sql.execution.arrow.enabled' is set to true; however, "
                            "failed by the reason below:\n  %s\n"
                            "Attempting non-optimization as "
                            "'spark.sql.execution.arrow.fallback.enabled' is set to "
                            "true." % _exception_message(e))
                        warnings.warn(msg)
                    else:
                        msg = (
                            "createDataFrame attempted Arrow optimization because "
                            "'spark.sql.execution.arrow.enabled' is set to true, but has reached "
                            "the error below and will not continue because automatic fallback "
                            "with 'spark.sql.execution.arrow.fallback.enabled' has been set to "
                            "false.\n  %s" % _exception_message(e))
                        warnings.warn(msg)
                        raise
            data = self._convert_from_pandas(data, schema, timezone)

        if isinstance(schema, StructType):
            verify_func = _make_type_verifier(schema) if verifySchema else lambda _: True

            def prepare(obj):
                verify_func(obj)
                return obj
        elif isinstance(schema, DataType):
            dataType = schema
            schema = StructType().add("value", schema)

            verify_func = _make_type_verifier(
                dataType, name="field value") if verifySchema else lambda _: True

            def prepare(obj):
                verify_func(obj)
                return obj,
        else:
            prepare = lambda obj: obj

        if isinstance(data, RDD):
            rdd, schema = self._createFromRDD(data.map(prepare), schema, samplingRatio)
        else:
            rdd, schema = self._createFromLocal(map(prepare, data), schema)
        jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd())
        jdf = self._jsparkSession.applySchemaToPythonRDD(jrdd.rdd(), schema.json())
        df = DataFrame(jdf, self._wrapped)
        df._schema = schema
        return df
Example 23
    def returnType(self) -> DataType:
        # This makes sure this is called after SparkContext is initialized.
        # ``_parse_datatype_string`` accesses the JVM to parse a DDL-formatted string.
        if self._returnType_placeholder is None:
            if isinstance(self._returnType, DataType):
                self._returnType_placeholder = self._returnType
            else:
                self._returnType_placeholder = _parse_datatype_string(
                    self._returnType)

        if self.evalType == PythonEvalType.SQL_SCALAR_PANDAS_UDF or \
                self.evalType == PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF:
            try:
                to_arrow_type(self._returnType_placeholder)
            except TypeError:
                raise NotImplementedError(
                    "Invalid return type with scalar Pandas UDFs: %s is "
                    "not supported" % str(self._returnType_placeholder))
        elif self.evalType == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF:
            if isinstance(self._returnType_placeholder, StructType):
                try:
                    to_arrow_type(self._returnType_placeholder)
                except TypeError:
                    raise NotImplementedError(
                        "Invalid return type with grouped map Pandas UDFs or "
                        "at groupby.applyInPandas: %s is not supported" %
                        str(self._returnType_placeholder))
            else:
                raise TypeError(
                    "Invalid return type for grouped map Pandas "
                    "UDFs or at groupby.applyInPandas: return type must be a "
                    "StructType.")
        elif self.evalType == PythonEvalType.SQL_MAP_PANDAS_ITER_UDF:
            if isinstance(self._returnType_placeholder, StructType):
                try:
                    to_arrow_type(self._returnType_placeholder)
                except TypeError:
                    raise NotImplementedError(
                        "Invalid return type in mapInPandas: "
                        "%s is not supported" %
                        str(self._returnType_placeholder))
            else:
                raise TypeError("Invalid return type in mapInPandas: "
                                "return type must be a StructType.")
        elif self.evalType == PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF:
            if isinstance(self._returnType_placeholder, StructType):
                try:
                    to_arrow_type(self._returnType_placeholder)
                except TypeError:
                    raise NotImplementedError(
                        "Invalid return type in cogroup.applyInPandas: "
                        "%s is not supported" %
                        str(self._returnType_placeholder))
            else:
                raise TypeError(
                    "Invalid return type in cogroup.applyInPandas: "
                    "return type must be a StructType.")
        elif self.evalType == PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF:
            try:
                # StructType is not yet allowed as a return type, explicitly check here to fail fast
                if isinstance(self._returnType_placeholder, StructType):
                    raise TypeError
                to_arrow_type(self._returnType_placeholder)
            except TypeError:
                raise NotImplementedError(
                    "Invalid return type with grouped aggregate Pandas UDFs: "
                    "%s is not supported" % str(self._returnType_placeholder))

        return self._returnType_placeholder
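Since returnType accepts either a DataType instance or a DDL-formatted string, and additionally requires Arrow-convertible types for pandas UDFs, here is a small sketch of both declaration styles (assumes Spark 3.x with pyarrow installed and an active SparkSession bound to `spark`):

import pandas as pd
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import LongType

# DDL string: parsed lazily into a LongType by returnType() on first use.
@pandas_udf("long")
def plus_one(s: pd.Series) -> pd.Series:
    return s + 1

# Equivalent declaration with an explicit DataType instance.
@pandas_udf(LongType())
def times_two(s: pd.Series) -> pd.Series:
    return s * 2

spark.range(3).select(plus_one("id"), times_two("id")).show()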
Example 24
0
    def createDataFrame(self,
                        data,
                        schema=None,
                        samplingRatio=None,
                        verifySchema=True):
        """
        Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`.

        When ``schema`` is a list of column names, the type of each column
        will be inferred from ``data``.

        When ``schema`` is ``None``, it will try to infer the schema (column names and types)
        from ``data``, which should be an RDD of :class:`Row`,
        or :class:`namedtuple`, or :class:`dict`.

        When ``schema`` is :class:`pyspark.sql.types.DataType` or a datatype string, it must match
        the real data, or an exception will be thrown at runtime. If the given schema is not
        :class:`pyspark.sql.types.StructType`, it will be wrapped into a
        :class:`pyspark.sql.types.StructType` as its only field, and the field name will be "value".
        Each record will also be wrapped into a tuple, which can be converted to a row later.

        If schema inference is needed, ``samplingRatio`` is used to determine the ratio of
        rows used for schema inference. The first row will be used if ``samplingRatio`` is ``None``.

        :param data: an RDD of any kind of SQL data representation (e.g. row, tuple, int, boolean,
            etc.), or :class:`list`, or :class:`pandas.DataFrame`.
        :param schema: a :class:`pyspark.sql.types.DataType` or a datatype string or a list of
            column names, default is ``None``.  The data type string format equals to
            :class:`pyspark.sql.types.DataType.simpleString`, except that top level struct type can
            omit the ``struct<>`` and atomic types use ``typeName()`` as their format, e.g. use
            ``byte`` instead of ``tinyint`` for :class:`pyspark.sql.types.ByteType`. We can also use
            ``int`` as a short name for ``IntegerType``.
        :param samplingRatio: the sample ratio of rows used for inferring
        :param verifySchema: verify data types of every row against schema.
        :return: :class:`DataFrame`

        .. versionchanged:: 2.1
           Added verifySchema.

        >>> l = [('Alice', 1)]
        >>> spark.createDataFrame(l).collect()
        [Row(_1=u'Alice', _2=1)]
        >>> spark.createDataFrame(l, ['name', 'age']).collect()
        [Row(name=u'Alice', age=1)]

        >>> d = [{'name': 'Alice', 'age': 1}]
        >>> spark.createDataFrame(d).collect()
        [Row(age=1, name=u'Alice')]

        >>> rdd = sc.parallelize(l)
        >>> spark.createDataFrame(rdd).collect()
        [Row(_1=u'Alice', _2=1)]
        >>> df = spark.createDataFrame(rdd, ['name', 'age'])
        >>> df.collect()
        [Row(name=u'Alice', age=1)]

        >>> from pyspark.sql import Row
        >>> Person = Row('name', 'age')
        >>> person = rdd.map(lambda r: Person(*r))
        >>> df2 = spark.createDataFrame(person)
        >>> df2.collect()
        [Row(name=u'Alice', age=1)]

        >>> from pyspark.sql.types import *
        >>> schema = StructType([
        ...    StructField("name", StringType(), True),
        ...    StructField("age", IntegerType(), True)])
        >>> df3 = spark.createDataFrame(rdd, schema)
        >>> df3.collect()
        [Row(name=u'Alice', age=1)]

        >>> spark.createDataFrame(df.toPandas()).collect()  # doctest: +SKIP
        [Row(name=u'Alice', age=1)]
        >>> spark.createDataFrame(pandas.DataFrame([[1, 2]])).collect()  # doctest: +SKIP
        [Row(0=1, 1=2)]

        >>> spark.createDataFrame(rdd, "a: string, b: int").collect()
        [Row(a=u'Alice', b=1)]
        >>> rdd = rdd.map(lambda row: row[1])
        >>> spark.createDataFrame(rdd, "int").collect()
        [Row(value=1)]
        >>> spark.createDataFrame(rdd, "boolean").collect() # doctest: +IGNORE_EXCEPTION_DETAIL
        Traceback (most recent call last):
            ...
        Py4JJavaError: ...
        """
        if isinstance(data, DataFrame):
            raise TypeError("data is already a DataFrame")

        if isinstance(schema, basestring):
            schema = _parse_datatype_string(schema)

        try:
            import pandas
            has_pandas = True
        except Exception:
            has_pandas = False
        if has_pandas and isinstance(data, pandas.DataFrame):
            if schema is None:
                schema = [str(x) for x in data.columns]
            data = [r.tolist() for r in data.to_records(index=False)]

        verify_func = _verify_type if verifySchema else lambda _, t: True
        if isinstance(schema, StructType):

            def prepare(obj):
                verify_func(obj, schema)
                return obj
        elif isinstance(schema, DataType):
            dataType = schema
            schema = StructType().add("value", schema)

            def prepare(obj):
                verify_func(obj, dataType)
                return obj,
        else:
            if isinstance(schema, list):
                schema = [
                    x.encode('utf-8') if not isinstance(x, str) else x
                    for x in schema
                ]
            prepare = lambda obj: obj

        if isinstance(data, RDD):
            rdd, schema = self._createFromRDD(data.map(prepare), schema,
                                              samplingRatio)
        else:
            rdd, schema = self._createFromLocal(map(prepare, data), schema)
        jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd())
        jdf = self._jsparkSession.applySchemaToPythonRDD(
            jrdd.rdd(), schema.json())
        df = DataFrame(jdf, self._wrapped)
        df._schema = schema
        return df
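The non-StructType branch above is what produces the single "value" column; a brief sketch (assumes an active SparkSession bound to `spark`):

from pyspark.sql.types import IntegerType

# A plain DataType schema is wrapped into StructType().add("value", schema)
# and each record into a one-element tuple, mirroring prepare() above.
df = spark.createDataFrame([1, 2, 3], IntegerType())
df.printSchema()  # a single column named "value"
df.show()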
Example 25
0
    def createDataFrame(  # type: ignore[misc]
        self,
        data: Union[RDD[Any], Iterable[Any], "PandasDataFrameLike", "ArrayLike"],
        schema: Optional[Union[AtomicType, StructType, str]] = None,
        samplingRatio: Optional[float] = None,
        verifySchema: bool = True,
    ) -> DataFrame:
        """
        Creates a :class:`DataFrame` from an :class:`RDD`, a list, a :class:`pandas.DataFrame`
        or a :class:`numpy.ndarray`.

        .. versionadded:: 2.0.0

        Parameters
        ----------
        data : :class:`RDD` or iterable
            an RDD of any kind of SQL data representation (:class:`Row`,
            :class:`tuple`, ``int``, ``boolean``, etc.), or :class:`list`,
            :class:`pandas.DataFrame` or :class:`numpy.ndarray`.
        schema : :class:`pyspark.sql.types.DataType`, str or list, optional
            a :class:`pyspark.sql.types.DataType` or a datatype string or a list of
            column names, default is None.  The data type string format equals to
            :class:`pyspark.sql.types.DataType.simpleString`, except that top level struct type can
            omit the ``struct<>``.

            When ``schema`` is a list of column names, the type of each column
            will be inferred from ``data``.

            When ``schema`` is ``None``, it will try to infer the schema (column names and types)
            from ``data``, which should be an RDD of either :class:`Row`,
            :class:`namedtuple`, or :class:`dict`.

            When ``schema`` is :class:`pyspark.sql.types.DataType` or a datatype string, it must
            match the real data, or an exception will be thrown at runtime. If the given schema is
            not :class:`pyspark.sql.types.StructType`, it will be wrapped into a
            :class:`pyspark.sql.types.StructType` as its only field, and the field name will be
            "value". Each record will also be wrapped into a tuple, which can be converted to a row
            later.
        samplingRatio : float, optional
            the sample ratio of rows used for inferring. The first few rows will be used
            if ``samplingRatio`` is ``None``.
        verifySchema : bool, optional
            verify data types of every row against schema. Enabled by default.

            .. versionadded:: 2.1.0

        Returns
        -------
        :class:`DataFrame`

        Notes
        -----
        Usage with `spark.sql.execution.arrow.pyspark.enabled=True` is experimental.

        Examples
        --------
        Create a DataFrame from a list of tuples.

        >>> spark.createDataFrame([('Alice', 1)]).collect()
        [Row(_1='Alice', _2=1)]
        >>> spark.createDataFrame([('Alice', 1)], ['name', 'age']).collect()
        [Row(name='Alice', age=1)]

        Create a DataFrame from a list of dictionaries

        >>> d = [{'name': 'Alice', 'age': 1}]
        >>> spark.createDataFrame(d).collect()
        [Row(age=1, name='Alice')]

        Create a DataFrame from an RDD.

        >>> rdd = spark.sparkContext.parallelize([('Alice', 1)])
        >>> spark.createDataFrame(rdd).collect()
        [Row(_1='Alice', _2=1)]
        >>> df = spark.createDataFrame(rdd, ['name', 'age'])
        >>> df.collect()
        [Row(name='Alice', age=1)]

        Create a DataFrame from Row instances.

        >>> from pyspark.sql import Row
        >>> Person = Row('name', 'age')
        >>> person = rdd.map(lambda r: Person(*r))
        >>> df2 = spark.createDataFrame(person)
        >>> df2.collect()
        [Row(name='Alice', age=1)]

        Create a DataFrame with the explicit schema specified.

        >>> from pyspark.sql.types import *
        >>> schema = StructType([
        ...    StructField("name", StringType(), True),
        ...    StructField("age", IntegerType(), True)])
        >>> df3 = spark.createDataFrame(rdd, schema)
        >>> df3.collect()
        [Row(name='Alice', age=1)]

        Create a DataFrame from a pandas DataFrame.

        >>> spark.createDataFrame(df.toPandas()).collect()  # doctest: +SKIP
        [Row(name='Alice', age=1)]
        >>> spark.createDataFrame(pandas.DataFrame([[1, 2]])).collect()  # doctest: +SKIP
        [Row(0=1, 1=2)]

        Create a DataFrame from an RDD with the schema in a DDL-formatted string.

        >>> spark.createDataFrame(rdd, "a: string, b: int").collect()
        [Row(a='Alice', b=1)]
        >>> rdd = rdd.map(lambda row: row[1])
        >>> spark.createDataFrame(rdd, "int").collect()
        [Row(value=1)]

        When the type is unmatched, it throws an exception.

        >>> spark.createDataFrame(rdd, "boolean").collect() # doctest: +IGNORE_EXCEPTION_DETAIL
        Traceback (most recent call last):
            ...
        Py4JJavaError: ...
        """
        SparkSession._activeSession = self
        assert self._jvm is not None
        self._jvm.SparkSession.setActiveSession(self._jsparkSession)
        if isinstance(data, DataFrame):
            raise TypeError("data is already a DataFrame")

        if isinstance(schema, str):
            schema = cast(Union[AtomicType, StructType, str], _parse_datatype_string(schema))
        elif isinstance(schema, (list, tuple)):
            # Must re-encode any unicode strings to be consistent with StructField names
            schema = [x.encode("utf-8") if not isinstance(x, str) else x for x in schema]

        try:
            import pandas as pd

            has_pandas = True
        except Exception:
            has_pandas = False

        try:
            import numpy as np

            has_numpy = True
        except Exception:
            has_numpy = False

        if has_numpy and isinstance(data, np.ndarray):
            # `data` of numpy.ndarray type will be converted to a pandas DataFrame,
            # so pandas is required.
            from pyspark.sql.pandas.utils import require_minimum_pandas_version

            require_minimum_pandas_version()
            if data.ndim not in [1, 2]:
                raise ValueError("NumPy array input should be of 1 or 2 dimensions.")
            column_names = ["value"] if data.ndim == 1 else ["_1", "_2"]
            if schema is None and not self._jconf.arrowPySparkEnabled():
                # Construct `schema` from `np.dtype` of the input NumPy array
                # TODO: Apply the logic below when self._jconf.arrowPySparkEnabled() is True
                spark_type = _from_numpy_type(data.dtype)
                if spark_type is not None:
                    schema = StructType(
                        [StructField(name, spark_type, nullable=True) for name in column_names]
                    )

            data = pd.DataFrame(data, columns=column_names)

        if has_pandas and isinstance(data, pd.DataFrame):
            # Create a DataFrame from pandas DataFrame.
            return super(SparkSession, self).createDataFrame(  # type: ignore[call-overload]
                data, schema, samplingRatio, verifySchema
            )
        return self._create_dataframe(
            data, schema, samplingRatio, verifySchema  # type: ignore[arg-type]
        )
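The NumPy branch above converts the array to a pandas DataFrame and, when Arrow is disabled, derives the Spark schema from the array dtype. A small sketch (assumes NumPy, pandas, and a SparkSession bound to `spark`, on a Spark version that includes the ndarray support shown above):

import numpy as np

# A 1-D array becomes a single "value" column; a 2-D array becomes "_1" and "_2".
spark.createDataFrame(np.array([1, 2, 3])).show()
spark.createDataFrame(np.array([[1, 2], [3, 4]])).show()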
Example 26
0
    def createDataFrame(self, data, schema=None, samplingRatio=None):
        """
        Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`.

        When ``schema`` is a list of column names, the type of each column
        will be inferred from ``data``.

        When ``schema`` is ``None``, it will try to infer the schema (column names and types)
        from ``data``, which should be an RDD of :class:`Row`,
        or :class:`namedtuple`, or :class:`dict`.

        When ``schema`` is :class:`DataType` or a datatype string, it must match the real data, or
        an exception will be thrown at runtime. If the given schema is not StructType, it will be
        wrapped into a StructType as its only field, and the field name will be "value". Each record
        will also be wrapped into a tuple, which can be converted to a row later.

        If schema inference is needed, ``samplingRatio`` is used to determine the ratio of
        rows used for schema inference. The first row will be used if ``samplingRatio`` is ``None``.

        :param data: an RDD of any kind of SQL data representation (e.g. row, tuple, int, boolean,
            etc.), or :class:`list`, or :class:`pandas.DataFrame`.
        :param schema: a :class:`DataType` or a datatype string or a list of column names, default
            is None.  The data type string format equals to `DataType.simpleString`, except that
            top level struct type can omit the `struct<>` and atomic types use `typeName()` as
            their format, e.g. use `byte` instead of `tinyint` for ByteType. We can also use `int`
            as a short name for IntegerType.
        :param samplingRatio: the sample ratio of rows used for inferring
        :return: :class:`DataFrame`

        .. versionchanged:: 2.0
           The schema parameter can be a DataType or a datatype string after 2.0. If it's not a
           StructType, it will be wrapped into a StructType and each record will also be wrapped
           into a tuple.

        >>> l = [('Alice', 1)]
        >>> spark.createDataFrame(l).collect()
        [Row(_1=u'Alice', _2=1)]
        >>> spark.createDataFrame(l, ['name', 'age']).collect()
        [Row(name=u'Alice', age=1)]

        >>> d = [{'name': 'Alice', 'age': 1}]
        >>> spark.createDataFrame(d).collect()
        [Row(age=1, name=u'Alice')]

        >>> rdd = sc.parallelize(l)
        >>> spark.createDataFrame(rdd).collect()
        [Row(_1=u'Alice', _2=1)]
        >>> df = spark.createDataFrame(rdd, ['name', 'age'])
        >>> df.collect()
        [Row(name=u'Alice', age=1)]

        >>> from pyspark.sql import Row
        >>> Person = Row('name', 'age')
        >>> person = rdd.map(lambda r: Person(*r))
        >>> df2 = spark.createDataFrame(person)
        >>> df2.collect()
        [Row(name=u'Alice', age=1)]

        >>> from pyspark.sql.types import *
        >>> schema = StructType([
        ...    StructField("name", StringType(), True),
        ...    StructField("age", IntegerType(), True)])
        >>> df3 = spark.createDataFrame(rdd, schema)
        >>> df3.collect()
        [Row(name=u'Alice', age=1)]

        >>> spark.createDataFrame(df.toPandas()).collect()  # doctest: +SKIP
        [Row(name=u'Alice', age=1)]
        >>> spark.createDataFrame(pandas.DataFrame([[1, 2]])).collect()  # doctest: +SKIP
        [Row(0=1, 1=2)]

        >>> spark.createDataFrame(rdd, "a: string, b: int").collect()
        [Row(a=u'Alice', b=1)]
        >>> rdd = rdd.map(lambda row: row[1])
        >>> spark.createDataFrame(rdd, "int").collect()
        [Row(value=1)]
        >>> spark.createDataFrame(rdd, "boolean").collect() # doctest: +IGNORE_EXCEPTION_DETAIL
        Traceback (most recent call last):
            ...
        Py4JJavaError: ...
        """
        if isinstance(data, DataFrame):
            raise TypeError("data is already a DataFrame")

        if isinstance(schema, basestring):
            schema = _parse_datatype_string(schema)

        try:
            import pandas
            has_pandas = True
        except Exception:
            has_pandas = False
        if has_pandas and isinstance(data, pandas.DataFrame):
            if schema is None:
                schema = [str(x) for x in data.columns]
            data = [r.tolist() for r in data.to_records(index=False)]

        if isinstance(schema, StructType):
            def prepare(obj):
                _verify_type(obj, schema)
                return obj
        elif isinstance(schema, DataType):
            datatype = schema

            def prepare(obj):
                _verify_type(obj, datatype)
                return (obj, )
            schema = StructType().add("value", datatype)
        else:
            prepare = lambda obj: obj

        if isinstance(data, RDD):
            rdd, schema = self._createFromRDD(data.map(prepare), schema, samplingRatio)
        else:
            rdd, schema = self._createFromLocal(map(prepare, data), schema)
        jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd())
        jdf = self._jsparkSession.applySchemaToPythonRDD(jrdd.rdd(), schema.json())
        df = DataFrame(jdf, self._wrapped)
        df._schema = schema
        return df
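When the schema has to be inferred, ``samplingRatio`` controls how much of the RDD is examined, as noted in the docstring. A minimal sketch (assumes a SparkSession bound to `spark`):

from pyspark.sql import Row

rdd = spark.sparkContext.parallelize([Row(a=i, b=float(i)) for i in range(100)])
# Infer column types from roughly half of the rows instead of only the first row.
df = spark.createDataFrame(rdd, samplingRatio=0.5)
df.printSchema()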
Example 27
0
    def createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=True):
        """
        Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`.

        When ``schema`` is a list of column names, the type of each column
        will be inferred from ``data``.

        When ``schema`` is ``None``, it will try to infer the schema (column names and types)
        from ``data``, which should be an RDD of either :class:`Row`,
        :class:`namedtuple`, or :class:`dict`.

        When ``schema`` is :class:`pyspark.sql.types.DataType` or a datatype string, it must match
        the real data, or an exception will be thrown at runtime. If the given schema is not
        :class:`pyspark.sql.types.StructType`, it will be wrapped into a
        :class:`pyspark.sql.types.StructType` as its only field, and the field name will be "value".
        Each record will also be wrapped into a tuple, which can be converted to a row later.

        If schema inference is needed, ``samplingRatio`` is used to determine the ratio of
        rows used for schema inference. The first row will be used if ``samplingRatio`` is ``None``.

        :param data: an RDD of any kind of SQL data representation (e.g. row, tuple, int, boolean,
            etc.), :class:`list`, or :class:`pandas.DataFrame`.
        :param schema: a :class:`pyspark.sql.types.DataType` or a datatype string or a list of
            column names, default is ``None``.  The data type string format equals to
            :class:`pyspark.sql.types.DataType.simpleString`, except that top level struct type can
            omit the ``struct<>`` and atomic types use ``typeName()`` as their format, e.g. use
            ``byte`` instead of ``tinyint`` for :class:`pyspark.sql.types.ByteType`. We can also use
            ``int`` as a short name for ``IntegerType``.
        :param samplingRatio: the sample ratio of rows used for inferring
        :param verifySchema: verify data types of every row against schema.
        :return: :class:`DataFrame`

        .. versionchanged:: 2.1
           Added verifySchema.

        .. note:: Usage with spark.sql.execution.arrow.pyspark.enabled=True is experimental.

        .. note:: When Arrow optimization is enabled, strings inside Pandas DataFrame in Python
            2 are converted into bytes as they are bytes in Python 2 whereas regular strings are
            left as strings. When using strings in Python 2, use unicode `u""` as Python standard
            practice.

        >>> l = [('Alice', 1)]
        >>> spark.createDataFrame(l).collect()
        [Row(_1=u'Alice', _2=1)]
        >>> spark.createDataFrame(l, ['name', 'age']).collect()
        [Row(name=u'Alice', age=1)]

        >>> d = [{'name': 'Alice', 'age': 1}]
        >>> spark.createDataFrame(d).collect()
        [Row(age=1, name=u'Alice')]

        >>> rdd = sc.parallelize(l)
        >>> spark.createDataFrame(rdd).collect()
        [Row(_1=u'Alice', _2=1)]
        >>> df = spark.createDataFrame(rdd, ['name', 'age'])
        >>> df.collect()
        [Row(name=u'Alice', age=1)]

        >>> from pyspark.sql import Row
        >>> Person = Row('name', 'age')
        >>> person = rdd.map(lambda r: Person(*r))
        >>> df2 = spark.createDataFrame(person)
        >>> df2.collect()
        [Row(name=u'Alice', age=1)]

        >>> from pyspark.sql.types import *
        >>> schema = StructType([
        ...    StructField("name", StringType(), True),
        ...    StructField("age", IntegerType(), True)])
        >>> df3 = spark.createDataFrame(rdd, schema)
        >>> df3.collect()
        [Row(name=u'Alice', age=1)]

        >>> spark.createDataFrame(df.toPandas()).collect()  # doctest: +SKIP
        [Row(name=u'Alice', age=1)]
        >>> spark.createDataFrame(pandas.DataFrame([[1, 2]])).collect()  # doctest: +SKIP
        [Row(0=1, 1=2)]

        >>> spark.createDataFrame(rdd, "a: string, b: int").collect()
        [Row(a=u'Alice', b=1)]
        >>> rdd = rdd.map(lambda row: row[1])
        >>> spark.createDataFrame(rdd, "int").collect()
        [Row(value=1)]
        >>> spark.createDataFrame(rdd, "boolean").collect() # doctest: +IGNORE_EXCEPTION_DETAIL
        Traceback (most recent call last):
            ...
        Py4JJavaError: ...
        """
        SparkSession._activeSession = self
        self._jvm.SparkSession.setActiveSession(self._jsparkSession)
        if isinstance(data, DataFrame):
            raise TypeError("data is already a DataFrame")

        if isinstance(schema, basestring):
            schema = _parse_datatype_string(schema)
        elif isinstance(schema, (list, tuple)):
            # Must re-encode any unicode strings to be consistent with StructField names
            schema = [x.encode('utf-8') if not isinstance(x, str) else x for x in schema]

        try:
            import pandas
            has_pandas = True
        except Exception:
            has_pandas = False
        if has_pandas and isinstance(data, pandas.DataFrame):
            # Create a DataFrame from pandas DataFrame.
            return super(SparkSession, self).createDataFrame(
                data, schema, samplingRatio, verifySchema)
        return self._create_dataframe(data, schema, samplingRatio, verifySchema)
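The note above marks the Arrow path as experimental; a minimal opt-in sketch (assumes pyarrow is installed and a SparkSession is bound to `spark`):

import pandas as pd

spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
pdf = pd.DataFrame({"name": ["Alice"], "age": [1]})
# With the config enabled, this call takes the Arrow-optimized code path from
# the earlier snippet; otherwise it uses the row-by-row conversion.
spark.createDataFrame(pdf).show()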
Example 28
0
def spark_udf(spark, model_uri, result_type="double"):
    """
    A Spark UDF that can be used to invoke the Python function formatted model.

    Parameters passed to the UDF are forwarded to the model as a DataFrame where the column names
    are ordinals (0, 1, ...). On some versions of Spark, it is also possible to wrap the input in a
    struct. In that case, the data will be passed as a DataFrame with column names given by the
    struct definition (e.g. when invoked as my_udf(struct('x', 'y')), the model will get the data as
    a pandas DataFrame with 2 columns 'x' and 'y').

    The predictions are filtered to contain only the columns that can be represented as the
    ``result_type``. If the ``result_type`` is string or array of strings, all predictions are
    converted to string. If the result type is not an array type, the leftmost column with a
    matching type is returned.

    .. code-block:: python
        :caption: Example

        predict = mlflow.pyfunc.spark_udf(spark, "/my/local/model")
        df.withColumn("prediction", predict("name", "age")).show()

    :param spark: A SparkSession object.
    :param model_uri: The location, in URI format, of the MLflow model with the
                      :py:mod:`mlflow.pyfunc` flavor. For example:

                      - ``/Users/me/path/to/local/model``
                      - ``relative/path/to/local/model``
                      - ``s3://my_bucket/path/to/model``
                      - ``runs:/<mlflow_run_id>/run-relative/path/to/model``
                      - ``models:/<model_name>/<model_version>``
                      - ``models:/<model_name>/<stage>``

                      For more information about supported URI schemes, see
                      `Referencing Artifacts <https://www.mlflow.org/docs/latest/concepts.html#
                      artifact-locations>`_.

    :param result_type: the return type of the user-defined function. The value can be either a
        ``pyspark.sql.types.DataType`` object or a DDL-formatted type string. Only a primitive
        type or an array ``pyspark.sql.types.ArrayType`` of primitive type are allowed.
        The following classes of result type are supported:

        - "int" or ``pyspark.sql.types.IntegerType``: The leftmost integer that can fit in an
          ``int32`` or an exception if there is none.

        - "long" or ``pyspark.sql.types.LongType``: The leftmost long integer that can fit in an
          ``int64`` or an exception if there is none.

        - ``ArrayType(IntegerType|LongType)``: All integer columns that can fit into the requested
          size.

        - "float" or ``pyspark.sql.types.FloatType``: The leftmost numeric result cast to
          ``float32`` or an exception if there is none.

        - "double" or ``pyspark.sql.types.DoubleType``: The leftmost numeric result cast to
          ``double`` or an exception if there is none.

        - ``ArrayType(FloatType|DoubleType)``: All numeric columns cast to the requested type or
          an exception if there are no numeric columns.

        - "string" or ``pyspark.sql.types.StringType``: The leftmost column converted to ``string``.

        - ``ArrayType(StringType)``: All columns converted to ``string``.

    :return: Spark UDF that applies the model's ``predict`` method to the data and returns a
             type specified by ``result_type``, which by default is a double.
    """

    # Scope Spark import to this method so users don't need pyspark to use non-Spark-related
    # functionality.
    from mlflow.pyfunc.spark_model_cache import SparkModelCache
    from pyspark.sql.functions import pandas_udf
    from pyspark.sql.types import _parse_datatype_string
    from pyspark.sql.types import ArrayType, DataType as SparkDataType
    from pyspark.sql.types import DoubleType, IntegerType, FloatType, LongType, StringType

    if not isinstance(result_type, SparkDataType):
        result_type = _parse_datatype_string(result_type)

    elem_type = result_type
    if isinstance(elem_type, ArrayType):
        elem_type = elem_type.elementType

    supported_types = [
        IntegerType, LongType, FloatType, DoubleType, StringType
    ]

    if not any([isinstance(elem_type, x) for x in supported_types]):
        raise MlflowException(
            message="Invalid result_type '{}'. Result type can only be one of or an array of one "
            "of the following types: {}".format(str(elem_type), str(supported_types)),
            error_code=INVALID_PARAMETER_VALUE,
        )

    with TempDir() as local_tmpdir:
        local_model_path = _download_artifact_from_uri(
            artifact_uri=model_uri, output_path=local_tmpdir.path())
        archive_path = SparkModelCache.add_local_model(spark, local_model_path)

    def predict(*args):
        model = SparkModelCache.get_or_load(archive_path)
        input_schema = model.metadata.get_input_schema()
        pdf = None

        for x in args:
            if type(x) == pandas.DataFrame:
                if len(args) != 1:
                    raise Exception(
                        "If passing a StructType column, there should be only one "
                        "input column, but got %d" % len(args))
                pdf = x
        if pdf is None:
            args = list(args)
            if input_schema is None:
                names = [str(i) for i in range(len(args))]
            else:
                names = input_schema.column_names()
                if len(args) > len(names):
                    args = args[:len(names)]
                if len(args) < len(names):
                    message = (
                        "Model input is missing columns. Expected {0} input columns {1},"
                        " but the model received only {2} unnamed input columns"
                        " (Since the columns were passed unnamed they are expected to be in"
                        " the order specified by the schema).".format(
                            len(names), names, len(args)))
                    raise MlflowException(message)
            pdf = pandas.DataFrame(
                data={names[i]: x
                      for i, x in enumerate(args)}, columns=names)

        result = model.predict(pdf)

        if not isinstance(result, pandas.DataFrame):
            result = pandas.DataFrame(data=result)

        elem_type = result_type.elementType if isinstance(
            result_type, ArrayType) else result_type

        if type(elem_type) == IntegerType:
            result = result.select_dtypes(
                [np.byte, np.ubyte, np.short, np.ushort,
                 np.int32]).astype(np.int32)
        elif type(elem_type) == LongType:
            # np.int and np.long were removed in newer NumPy releases; the
            # fixed-width aliases cover the same integer widths.
            result = result.select_dtypes(
                [np.byte, np.ubyte, np.short, np.ushort, np.int32, np.int64])

        elif type(elem_type) == FloatType:
            result = result.select_dtypes(include=(np.number, )).astype(
                np.float32)

        elif type(elem_type) == DoubleType:
            result = result.select_dtypes(include=(np.number, )).astype(
                np.float64)

        if len(result.columns) == 0:
            raise MlflowException(
                message="The model did not produce any values compatible with the requested "
                "type '{}'. Consider requesting a udf with StringType or "
                "ArrayType(StringType).".format(str(elem_type)),
                error_code=INVALID_PARAMETER_VALUE,
            )

        if type(elem_type) == StringType:
            result = result.applymap(str)

        if type(result_type) == ArrayType:
            return pandas.Series(result.to_numpy().tolist())
        else:
            return result[result.columns[0]]

    return pandas_udf(predict, result_type)
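Because ``result_type`` accepts either a DataType or a DDL type string, and an ArrayType result returns every compatible column, a hedged usage sketch follows (the registered model name and input column names are hypothetical; `spark` and `df` are assumed to exist):

# Return all numeric prediction columns as a single array<double> column,
# per the ArrayType(FloatType|DoubleType) rule documented above.
predict = spark_udf(spark, "models:/my_model/1", result_type="array<double>")
scored = df.withColumn("predictions", predict("x", "y"))
scored.show()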
Example 29
0
def spark_udf(spark, model_uri, features, result_type="double"):
    """
    Create a Spark pandas UDF for the model at the given URI.

    :param spark: a SparkSession object
    :param model_uri: path or URI of the MLflow model
    :param features: list containing the feature names, in the order the input columns are passed
    :param result_type: result type of the model, as a DataType or a DDL type string
    :return: a pandas UDF that applies the model's ``predict`` method to its input columns
    """
    # Scope Spark import to this method so users don't need pyspark to use non-Spark-related
    # functionality.
    from mlflow.pyfunc.spark_model_cache import SparkModelCache
    from pyspark.sql.functions import pandas_udf
    from pyspark.sql.types import _parse_datatype_string
    from pyspark.sql.types import ArrayType, DataType
    from pyspark.sql.types import DoubleType, IntegerType, FloatType, LongType, StringType

    if not isinstance(result_type, DataType):
        result_type = _parse_datatype_string(result_type)

    elem_type = result_type
    if isinstance(elem_type, ArrayType):
        elem_type = elem_type.elementType

    supported_types = [
        IntegerType, LongType, FloatType, DoubleType, StringType
    ]

    if not any([isinstance(elem_type, x) for x in supported_types]):
        raise ValueError(
            "Invalid result_type '{}'. Result type can only be one of or an array of one "
            "of the following types: {}".format(str(elem_type), str(supported_types)))

    with TempDir() as local_tmpdir:
        local_model_path = _download_artifact_from_uri(
            artifact_uri=model_uri, output_path=local_tmpdir.path())
        archive_path = SparkModelCache.add_local_model(spark, local_model_path)

    def predict(*args):
        model = SparkModelCache.get_or_load(archive_path)
        schema = {features[i]: arg for i, arg in enumerate(args)}
        pdf = None
        for x in args:
            if type(x) == pandas.DataFrame:
                if len(args) != 1:
                    raise Exception(
                        "If passing a StructType column, there should be only one "
                        "input column, but got %d" % len(args))
                pdf = x
        if pdf is None:
            pdf = pandas.DataFrame(schema)
        result = model.predict(pdf)
        if not isinstance(result, pandas.DataFrame):
            result = pandas.DataFrame(data=result)

        # The dtype filtering must not be chained onto the isinstance check above,
        # otherwise it is skipped whenever the model returns a plain array.
        if type(elem_type) == IntegerType:
            result = result.select_dtypes(
                [np.byte, np.ubyte, np.short, np.ushort,
                 np.int32]).astype(np.int32)

        elif type(elem_type) == LongType:
            # np.int and np.long were removed in newer NumPy releases; the
            # fixed-width aliases cover the same integer widths.
            result = result.select_dtypes(
                [np.byte, np.ubyte, np.short, np.ushort, np.int32, np.int64])

        elif type(elem_type) == FloatType:
            result = result.select_dtypes(include=(np.number, )).astype(
                np.float32)

        elif type(elem_type) == DoubleType:
            result = result.select_dtypes(include=(np.number, )).astype(
                np.float64)

        if len(result.columns) == 0:
            raise ValueError(
                "The model did not produce any values compatible with the requested "
                "type '{}'. Consider requesting a udf with StringType or "
                "ArrayType(StringType).".format(str(elem_type)))

        if type(elem_type) == StringType:
            result = result.applymap(str)

        if type(result_type) == ArrayType:
            return pandas.Series([row[1].values for row in result.iterrows()])
        else:
            return result[result.columns[0]]

    return pandas_udf(predict, result_type)
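A hedged usage sketch for this variant (the run URI, feature names, and input DataFrame are hypothetical; `spark` and `df` are assumed to exist). Input columns are bound positionally to the names in ``features``:

predict = spark_udf(spark, "runs:/<run_id>/model",
                    features=["x1", "x2"], result_type="double")
scored = df.withColumn("prediction", predict("x1", "x2"))
scored.show()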