コード例 #1
0
    def wrapped(key_series, value_series):
        """Run the grouped-map UDF on one group and pair output columns with Arrow types.

        ``f`` receives the value columns concatenated into one DataFrame,
        preceded by the grouping key tuple when ``argspec`` declares two
        arguments.

        Returns a list of ``(column, arrow_type)`` tuples, one per field of
        ``return_type``.  Raises TypeError when ``f`` does not return a
        pandas.DataFrame and RuntimeError when the column count differs from
        the schema.
        """
        import pandas as pd

        num_args = len(argspec.args)
        if num_args == 1:
            result = f(pd.concat(value_series, axis=1))
        elif num_args == 2:
            # The key is taken from the first element of every key column.
            key = tuple(s[0] for s in key_series)
            result = f(key, pd.concat(value_series, axis=1))

        if not isinstance(result, pd.DataFrame):
            raise TypeError(
                "Return type of the user-defined function should be "
                "pandas.DataFrame, but is {}".format(type(result)))

        actual_columns = len(result.columns)
        expected_columns = len(return_type)
        if actual_columns != expected_columns:
            raise RuntimeError(
                "Number of columns of the returned pandas.DataFrame "
                "doesn't match specified schema. "
                "Expected: {} Actual: {}".format(len(return_type),
                                                 len(result.columns)))

        # Match columns by schema name only when the user labeled at least one
        # column with a string; otherwise fall back to positional matching.
        match_by_name = assign_cols_by_name and any(
            isinstance(label, basestring) for label in result.columns)

        paired = []
        if match_by_name:
            for field in return_type:
                paired.append((result[field.name], to_arrow_type(field.dataType)))
        else:
            for position, field in enumerate(return_type):
                paired.append((result[result.columns[position]],
                               to_arrow_type(field.dataType)))
        return paired
コード例 #2
0
ファイル: udf.py プロジェクト: atasi1/Tensorflow
    def returnType(self):
        """Resolve the declared return type, validate it for the eval type, and return it.

        The resolved type is cached in ``_returnType_placeholder``.  Raises
        NotImplementedError when the type cannot be converted to Arrow and
        TypeError when a grouped map UDF's return type is not a StructType.
        """
        # Resolution is deferred to this call so that SparkContext is already
        # initialized: ``_parse_datatype_string`` needs the JVM to parse a DDL
        # formatted string.
        if self._returnType_placeholder is None:
            declared = self._returnType
            if not isinstance(declared, DataType):
                declared = _parse_datatype_string(declared)
            self._returnType_placeholder = declared

        resolved = self._returnType_placeholder
        eval_type = self.evalType

        if eval_type == PythonEvalType.SQL_SCALAR_PANDAS_UDF:
            try:
                to_arrow_type(resolved)
            except TypeError:
                raise NotImplementedError(
                    "Invalid returnType with scalar Pandas UDFs: %s is "
                    "not supported" % str(resolved))
        elif eval_type == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF:
            if not isinstance(resolved, StructType):
                raise TypeError("Invalid returnType for grouped map Pandas "
                                "UDFs: returnType must be a StructType.")
            try:
                to_arrow_schema(resolved)
            except TypeError:
                raise NotImplementedError(
                    "Invalid returnType with grouped map Pandas UDFs: "
                    "%s is not supported" % str(resolved))

        return self._returnType_placeholder
コード例 #3
0
ファイル: session.py プロジェクト: CodingCat/spark
    def _create_from_pandas_with_arrow(self, pdf, schema, timezone):
        """
        Build a Spark DataFrame from a pandas.DataFrame through Arrow: the frame is cut into
        row slices, each slice becomes one Arrow record batch, and the batches are handed to
        the JVM. When a StructType schema is given, its field types drive the pandas-to-Arrow
        coercion; a list/tuple schema only supplies column names.
        """
        from pyspark.serializers import ArrowStreamSerializer, _create_batch
        from pyspark.sql.types import from_arrow_schema, to_arrow_type, TimestampType
        from pyspark.sql.utils import require_minimum_pandas_version, \
            require_minimum_pyarrow_version

        require_minimum_pandas_version()
        require_minimum_pyarrow_version()

        from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype

        # Work out the Arrow type each column is coerced to (None = let Arrow infer)
        if isinstance(schema, StructType):
            arrow_types = [to_arrow_type(field.dataType) for field in schema.fields]
        elif isinstance(schema, DataType):
            raise ValueError("Single data type %s is not supported with Arrow" % str(schema))
        else:
            # Without a full schema only timestamps are coerced, to stay compatible with Spark
            arrow_types = []
            for dtype in pdf.dtypes:
                if is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
                    arrow_types.append(to_arrow_type(TimestampType()))
                else:
                    arrow_types.append(None)

        # Rows per slice: ceiling division of the row count by the default parallelism
        rows_per_slice = -(-len(pdf) // self.sparkContext.defaultParallelism)
        # NOTE(review): an empty pdf gives a step of 0, which xrange rejects with ValueError
        slice_starts = xrange(0, len(pdf), rows_per_slice)

        # One Arrow record batch per slice
        safecheck = self._wrapped._conf.arrowSafeTypeConversion()
        batches = []
        for start in slice_starts:
            pdf_slice = pdf[start:start + rows_per_slice]
            columns = []
            for (_, series), arrow_type in zip(pdf_slice.iteritems(), arrow_types):
                columns.append((series, arrow_type))
            batches.append(_create_batch(columns, timezone, safecheck))

        # For a names-only schema, take the types from the first batch and apply the names
        if isinstance(schema, (list, tuple)):
            inferred = from_arrow_schema(batches[0].schema)
            for position, column_name in enumerate(schema):
                inferred.fields[position].name = column_name
                inferred.names[position] = column_name
            schema = inferred

        jsqlContext = self._wrapped._jsqlContext

        def read_stream_file(temp_filename):
            return self._jvm.PythonSQLUtils.readArrowStreamFromFile(jsqlContext, temp_filename)

        def make_rdd_server():
            return self._jvm.ArrowRDDServer(jsqlContext)

        # Ship the batches to the JVM (one batch per partition) and wrap the result
        jrdd = self._sc._serialize_to_jvm(batches, ArrowStreamSerializer(), read_stream_file,
                                          make_rdd_server)
        jdf = self._jvm.PythonSQLUtils.toDataFrame(jrdd, schema.json(), jsqlContext)
        result = DataFrame(jdf, self._wrapped)
        result._schema = schema
        return result
コード例 #4
0
    def _create_from_pandas_with_arrow(self, pdf, schema, timezone):
        """
        Build a Spark DataFrame from a pandas.DataFrame through Arrow: the frame is cut into
        row slices, each slice becomes one Arrow record batch, and the batches are handed to
        the JVM. When a StructType schema is given, its field types drive the pandas-to-Arrow
        coercion; a list/tuple schema only supplies column names.
        """
        from pyspark.serializers import ArrowStreamSerializer, _create_batch
        from pyspark.sql.types import from_arrow_schema, to_arrow_type, TimestampType
        from pyspark.sql.utils import require_minimum_pandas_version, \
            require_minimum_pyarrow_version

        require_minimum_pandas_version()
        require_minimum_pyarrow_version()

        from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype

        # Work out the Arrow type each column is coerced to (None = let Arrow infer)
        if isinstance(schema, StructType):
            arrow_types = [to_arrow_type(field.dataType) for field in schema.fields]
        elif isinstance(schema, DataType):
            raise ValueError("Single data type %s is not supported with Arrow" % str(schema))
        else:
            # Without a full schema only timestamps are coerced, to stay compatible with Spark
            arrow_types = []
            for dtype in pdf.dtypes:
                if is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
                    arrow_types.append(to_arrow_type(TimestampType()))
                else:
                    arrow_types.append(None)

        # Rows per slice: ceiling division of the row count by the default parallelism
        rows_per_slice = -(-len(pdf) // self.sparkContext.defaultParallelism)
        # NOTE(review): an empty pdf gives a step of 0, which xrange rejects with ValueError
        slice_starts = xrange(0, len(pdf), rows_per_slice)

        # One Arrow record batch per slice
        batches = []
        for start in slice_starts:
            pdf_slice = pdf[start:start + rows_per_slice]
            columns = []
            for (_, series), arrow_type in zip(pdf_slice.iteritems(), arrow_types):
                columns.append((series, arrow_type))
            batches.append(_create_batch(columns, timezone))

        # For a names-only schema, take the types from the first batch and apply the names
        if isinstance(schema, (list, tuple)):
            inferred = from_arrow_schema(batches[0].schema)
            for position, column_name in enumerate(schema):
                inferred.fields[position].name = column_name
                inferred.names[position] = column_name
            schema = inferred

        jsqlContext = self._wrapped._jsqlContext

        def read_stream_file(temp_filename):
            return self._jvm.PythonSQLUtils.readArrowStreamFromFile(jsqlContext, temp_filename)

        def make_rdd_server():
            return self._jvm.ArrowRDDServer(jsqlContext)

        # Ship the batches to the JVM (one batch per partition) and wrap the result
        jrdd = self._sc._serialize_to_jvm(batches, ArrowStreamSerializer(), read_stream_file,
                                          make_rdd_server)
        jdf = self._jvm.PythonSQLUtils.toDataFrame(jrdd, schema.json(), jsqlContext)
        result = DataFrame(jdf, self._wrapped)
        result._schema = schema
        return result
コード例 #5
0
ファイル: session.py プロジェクト: zhuchazhucha/spark
    def _create_from_pandas_with_arrow(self, pdf, schema, timezone):
        """
        Build a Spark DataFrame from a pandas.DataFrame through Arrow: the frame is cut into
        row slices, each slice becomes one Arrow record batch, and the batches are handed to
        the JVM. When a StructType schema is given, its field types drive the pandas-to-Arrow
        coercion; a list/tuple schema only supplies column names.
        """
        from pyspark.serializers import ArrowSerializer, _create_batch
        from pyspark.sql.types import from_arrow_schema, to_arrow_type, \
            _old_pandas_exception_message, TimestampType
        try:
            from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype
        except ImportError as e:
            # Re-raise with the message produced by the old-pandas helper
            raise ImportError(_old_pandas_exception_message(e))

        # Work out the Arrow type each column is coerced to (None = let Arrow infer)
        if isinstance(schema, StructType):
            arrow_types = [to_arrow_type(field.dataType) for field in schema.fields]
        elif isinstance(schema, DataType):
            raise ValueError(
                "Single data type %s is not supported with Arrow" %
                str(schema))
        else:
            # Without a full schema only timestamps are coerced, to stay compatible with Spark
            arrow_types = []
            for dtype in pdf.dtypes:
                if is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
                    arrow_types.append(to_arrow_type(TimestampType()))
                else:
                    arrow_types.append(None)

        # Rows per slice: ceiling division of the row count by the default parallelism
        rows_per_slice = -(-len(pdf) // self.sparkContext.defaultParallelism)
        # NOTE(review): an empty pdf gives a step of 0, which xrange rejects with ValueError
        slice_starts = xrange(0, len(pdf), rows_per_slice)

        # One Arrow record batch per slice
        batches = []
        for start in slice_starts:
            pdf_slice = pdf[start:start + rows_per_slice]
            columns = []
            for (_, series), arrow_type in zip(pdf_slice.iteritems(), arrow_types):
                columns.append((series, arrow_type))
            batches.append(_create_batch(columns, timezone))

        # For a names-only schema, take the types from the first batch and apply the names
        if isinstance(schema, (list, tuple)):
            inferred = from_arrow_schema(batches[0].schema)
            for position, column_name in enumerate(schema):
                inferred.fields[position].name = column_name
                inferred.names[position] = column_name
            schema = inferred

        # Hand the Arrow payloads and the schema to the JVM and wrap the result
        jrdd = self._sc._serialize_to_jvm(batches, len(batches), ArrowSerializer())
        jdf = self._jvm.PythonSQLUtils.arrowPayloadToDataFrame(
            jrdd, schema.json(), self._wrapped._jsqlContext)
        result = DataFrame(jdf, self._wrapped)
        result._schema = schema
        return result
コード例 #6
0
ファイル: session.py プロジェクト: aa8y/spark
    def _create_from_pandas_with_arrow(self, pdf, schema, timezone):
        """
        Build a Spark DataFrame from a pandas.DataFrame through Arrow: the frame is cut into
        row slices, each slice becomes one Arrow record batch, and the batches are handed to
        the JVM. When a StructType schema is given, its field types drive the pandas-to-Arrow
        coercion; a list/tuple schema only supplies column names.
        """
        from pyspark.serializers import ArrowSerializer, _create_batch
        from pyspark.sql.types import from_arrow_schema, to_arrow_type, \
            _old_pandas_exception_message, TimestampType
        from pyspark.sql.utils import _require_minimum_pyarrow_version
        try:
            from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype
        except ImportError as e:
            # Re-raise with the message produced by the old-pandas helper
            raise ImportError(_old_pandas_exception_message(e))

        _require_minimum_pyarrow_version()

        # Work out the Arrow type each column is coerced to (None = let Arrow infer)
        if isinstance(schema, StructType):
            arrow_types = [to_arrow_type(field.dataType) for field in schema.fields]
        elif isinstance(schema, DataType):
            raise ValueError("Single data type %s is not supported with Arrow" % str(schema))
        else:
            # Without a full schema only timestamps are coerced, to stay compatible with Spark
            arrow_types = []
            for dtype in pdf.dtypes:
                if is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
                    arrow_types.append(to_arrow_type(TimestampType()))
                else:
                    arrow_types.append(None)

        # Rows per slice: ceiling division of the row count by the default parallelism
        rows_per_slice = -(-len(pdf) // self.sparkContext.defaultParallelism)
        # NOTE(review): an empty pdf gives a step of 0, which xrange rejects with ValueError
        slice_starts = xrange(0, len(pdf), rows_per_slice)

        # One Arrow record batch per slice
        batches = []
        for start in slice_starts:
            pdf_slice = pdf[start:start + rows_per_slice]
            columns = []
            for (_, series), arrow_type in zip(pdf_slice.iteritems(), arrow_types):
                columns.append((series, arrow_type))
            batches.append(_create_batch(columns, timezone))

        # For a names-only schema, take the types from the first batch and apply the names
        if isinstance(schema, (list, tuple)):
            inferred = from_arrow_schema(batches[0].schema)
            for position, column_name in enumerate(schema):
                inferred.fields[position].name = column_name
                inferred.names[position] = column_name
            schema = inferred

        # Hand the Arrow payloads and the schema to the JVM and wrap the result
        jrdd = self._sc._serialize_to_jvm(batches, len(batches), ArrowSerializer())
        jdf = self._jvm.PythonSQLUtils.arrowPayloadToDataFrame(
            jrdd, schema.json(), self._wrapped._jsqlContext)
        result = DataFrame(jdf, self._wrapped)
        result._schema = schema
        return result
コード例 #7
0
ファイル: internal.py プロジェクト: takitsuba/koalas
    def pandas_df(self):
        """ Collect the internal Spark frame as a pandas DataFrame with index and column labels applied. """
        spark_frame = self.spark_internal_df
        pandas_frame = spark_frame.toPandas()

        # An empty result carries no dtype information, so cast every column
        # to the pandas dtype derived from the Spark schema.
        if len(pandas_frame) == 0 and len(spark_frame.schema) > 0:
            dtypes = {}
            for field in spark_frame.schema:
                dtypes[field.name] = to_arrow_type(field.dataType).to_pandas_dtype()
            pandas_frame = pandas_frame.astype(dtypes)

        index_columns = self.index_columns
        if len(index_columns) > 0:
            for position, index_field in enumerate(index_columns):
                # Keep the column as data when it is also a data column; every
                # index level after the first is appended to the existing index.
                pandas_frame = pandas_frame.set_index(
                    index_field,
                    drop=index_field not in self.data_columns,
                    append=position > 0)
            selected = []
            for name in self.column_index:
                selected.append(str(name) if len(name) > 1 else name[0])
            pandas_frame = pandas_frame[selected]

        if self.column_index_level > 1:
            pandas_frame.columns = pd.MultiIndex.from_tuples(self._column_index)
        else:
            pandas_frame.columns = [label[0] for label in self._column_index]
        if self._column_index_names is not None:
            pandas_frame.columns.names = self._column_index_names

        index_names = self.index_names
        if len(index_names) > 0:
            pandas_frame.index.names = index_names
        return pandas_frame
コード例 #8
0
def wrap_scalar_pandas_udf(f, return_type, eval_type):
    """Wrap ``f`` so that every result is validated and paired with its Arrow return type.

    For ``SQL_SCALAR_PANDAS_UDF`` the wrapper returns one
    ``(result, arrow_type)`` tuple and checks the result length against the
    first argument.  For any other eval type it maps over the results yielded
    by ``f`` and only checks that each one is sized.
    """
    arrow_return_type = to_arrow_type(return_type)

    def verify_result_type(result):
        if hasattr(result, "__len__"):
            return result
        if type(return_type) == StructType:
            pd_type = "Pandas.DataFrame"
        else:
            pd_type = "Pandas.Series"
        raise TypeError(
            "Return type of the user-defined function should be "
            "{}, but is {}".format(pd_type, type(result)))

    def verify_result_length(result, length):
        if len(result) == length:
            return result
        raise RuntimeError(
            "Result vector from pandas_udf was not the required length: "
            "expected %d, got %d" % (length, len(result)))

    def scalar_udf(*a):
        checked = verify_result_type(f(*a))
        return verify_result_length(checked, len(a[0])), arrow_return_type

    def iterator_udf(*iterator):
        # The result length verification is done at the end of a partition.
        checked = map(verify_result_type, f(*iterator))
        return map(lambda res: (res, arrow_return_type), checked)

    if eval_type == PythonEvalType.SQL_SCALAR_PANDAS_UDF:
        return scalar_udf
    return iterator_udf
コード例 #9
0
    def pandas_df(self):
        """ Collect the Spark frame as a pandas DataFrame with index and column labels applied. """
        spark_frame = self.spark_df
        pandas_frame = spark_frame.toPandas()

        # An empty result carries no dtype information, so cast every column
        # to the pandas dtype derived from the Spark schema.
        if len(pandas_frame) == 0 and len(spark_frame.schema) > 0:
            dtypes = {}
            for field in spark_frame.schema:
                dtypes[field.name] = to_arrow_type(field.dataType).to_pandas_dtype()
            pandas_frame = pandas_frame.astype(dtypes)

        index_columns = self.index_columns
        if len(index_columns) > 0:
            for position, index_field in enumerate(index_columns):
                # Keep the column as data when it is also a data column; every
                # index level after the first is appended to the existing index.
                pandas_frame = pandas_frame.set_index(
                    index_field,
                    drop=index_field not in self.data_columns,
                    append=position > 0)
            pandas_frame = pandas_frame[self.data_columns]

        if self._column_index is not None:
            pandas_frame.columns = pd.MultiIndex.from_tuples(self._column_index)

        index_names = self.index_names
        if len(index_names) > 0:
            if not isinstance(pandas_frame.index, pd.MultiIndex):
                pandas_frame.index.name = index_names[0]
            else:
                pandas_frame.index.names = index_names
        return pandas_frame
コード例 #10
0
def wrap_cogrouped_map_pandas_udf(f, return_type, argspec):
    """Wrap a cogrouped-map UDF so its DataFrame result is validated and paired with an Arrow type.

    The returned callable takes the left/right key and value series and
    returns a one-element list holding ``(result_frame, arrow_type)``.
    """
    def wrapped(left_key_series, left_value_series, right_key_series,
                right_value_series):
        import pandas as pd

        left_df = pd.concat(left_value_series, axis=1)
        right_df = pd.concat(right_value_series, axis=1)

        num_args = len(argspec.args)
        if num_args == 2:
            result = f(left_df, right_df)
        elif num_args == 3:
            # Take the key from the right side only when the left frame is empty.
            if left_df.empty:
                key_series = right_key_series
            else:
                key_series = left_key_series
            key = tuple(s[0] for s in key_series)
            result = f(key, left_df, right_df)

        if not isinstance(result, pd.DataFrame):
            raise TypeError(
                "Return type of the user-defined function should be "
                "pandas.DataFrame, but is {}".format(type(result)))

        actual_columns = len(result.columns)
        expected_columns = len(return_type)
        if actual_columns != expected_columns:
            raise RuntimeError(
                "Number of columns of the returned pandas.DataFrame "
                "doesn't match specified schema. "
                "Expected: {} Actual: {}".format(len(return_type),
                                                 len(result.columns)))
        return result

    def cogrouped_udf(kl, vl, kr, vr):
        result = wrapped(kl, vl, kr, vr)
        return [(result, to_arrow_type(return_type))]

    return cogrouped_udf
コード例 #11
0
def wrap_bounded_window_agg_pandas_udf(f, return_type):
    """Wrap a bounded-window aggregate UDF.

    The returned callable takes the window begin and end index series
    followed by the input series, applies ``f`` to each window slice, and
    returns ``(pd.Series of per-window results, arrow_type)``.
    """
    arrow_return_type = to_arrow_type(return_type)

    def wrapped(begin_index, end_index, *series):
        import pandas as pd

        # Indexing an np.ndarray is faster than indexing a Series, so the
        # bounds are pulled out as arrays once, up front.
        begin_array = begin_index.values
        end_array = end_index.values

        window_results = []
        for i, begin in enumerate(begin_array):
            end = end_array[i]
            # Notes on the slicing below:
            #  * Building a slice per window is expensive, but there is no
            #    easy way to reduce that cost here.
            #  * s.iloc[i:j] is about 30% faster than s[i:j]; the caveat is
            #    that the slice shares memory with s, so the window function
            #    must not modify its input series.  Needing to do so is rare,
            #    which makes this a reasonable restriction.
            #  * reset_index on the slices would roughly double the cost of
            #    creating them, so it is deliberately not called.
            window_slices = [s.iloc[begin: end] for s in series]
            window_results.append(f(*window_slices))
        return pd.Series(window_results)

    def bounded_window_udf(*a):
        return wrapped(*a), arrow_return_type

    return bounded_window_udf
コード例 #12
0
def wrap_pandas_udf(f, return_type):
    """Wrap a pandas UDF for evaluation, validating the result of column UDFs.

    :param f: the user-defined function operating on pandas objects.
    :param return_type: the declared Spark return type of ``f``.
    :return: a callable taking the input columns positionally.  When
        ``return_type`` is a ``StructType`` it simply forwards to ``f``;
        otherwise it returns ``(result, arrow_return_type)``.
    :raises TypeError: at call time, if the result of a column UDF has no
        ``__len__``.
    :raises RuntimeError: at call time, if the result of a column UDF does not
        have the same length as the first input column.
    """
    # If the return_type is a StructType, it indicates this is a groupby apply udf,
    # and has already been wrapped under apply(), otherwise, it's a vectorized column udf.
    # We can distinguish these two by return type because in groupby apply, we always specify
    # returnType as a StructType, and in vectorized column udf, StructType is not supported.
    #
    # TODO: Look into refactoring use of StructType to be more flexible for future pandas_udfs
    if isinstance(return_type, StructType):
        return lambda *a: f(*a)
    else:
        arrow_return_type = to_arrow_type(return_type)

        def verify_result_length(*a):
            result = f(*a)
            # Anything without a length cannot be a Series-like result.
            if not hasattr(result, "__len__"):
                raise TypeError(
                    "Return type of the user-defined function should be "
                    "Pandas.Series, but is {}".format(type(result)))
            # The output must line up row-for-row with the first input column.
            if len(result) != len(a[0]):
                raise RuntimeError(
                    "Result vector from pandas_udf was not the required length: "
                    "expected %d, got %d" % (len(a[0]), len(result)))
            return result

        return lambda *a: (verify_result_length(*a), arrow_return_type)
コード例 #13
0
def spark_type_to_pandas_dtype(spark_type):
    """ Map a Spark DataType to the numpy dtype used for it on the pandas side. """
    object_backed = (types.DateType, types.StructType, types.UserDefinedType)
    if isinstance(spark_type, object_backed):
        dtype_spec = "object"
    elif isinstance(spark_type, types.TimestampType):
        dtype_spec = "datetime64[ns]"
    else:
        # Every other type goes through its Arrow equivalent.
        dtype_spec = to_arrow_type(spark_type).to_pandas_dtype()
    return np.dtype(dtype_spec)
コード例 #14
0
def wrap_grouped_agg_pandas_udf(f, return_type):
    """Wrap a grouped aggregate UDF so its single result becomes a one-element Series.

    The returned callable yields ``(pd.Series([result]), arrow_type)``.
    """
    arrow_return_type = to_arrow_type(return_type)

    def grouped_agg_udf(*a):
        import pandas as pd
        aggregated = f(*a)
        return pd.Series([aggregated]), arrow_return_type

    return grouped_agg_udf
コード例 #15
0
def wrap_pandas_iter_udf(f, return_type):
    """Wrap an iterator-style pandas UDF.

    Each result produced by ``f`` is checked for a ``__len__`` attribute and
    paired with the Arrow return type.
    """
    arrow_return_type = to_arrow_type(return_type)

    def verify_result_type(result):
        if hasattr(result, "__len__"):
            return result
        if type(return_type) == StructType:
            pd_type = "Pandas.DataFrame"
        else:
            pd_type = "Pandas.Series"
        raise TypeError("Return type of the user-defined function should be "
                        "{}, but is {}".format(pd_type, type(result)))

    def iterator_udf(*iterator):
        checked = map(verify_result_type, f(*iterator))
        return map(lambda res: (res, arrow_return_type), checked)

    return iterator_udf
コード例 #16
0
def wrap_unbounded_window_agg_pandas_udf(f, return_type):
    """Wrap an unbounded-window aggregate UDF.

    Works like the grouped aggregate wrapper, except that the single value
    returned by ``f`` is repeated to the length of the first input series
    instead of being returned once.
    """
    arrow_return_type = to_arrow_type(return_type)

    def unbounded_window_udf(*a):
        import pandas as pd
        aggregated = f(*a)
        window_length = len(a[0])
        return pd.Series([aggregated]).repeat(window_length), arrow_return_type

    return unbounded_window_udf
コード例 #17
0
ファイル: worker.py プロジェクト: Tongzhenguo/spark
def wrap_scalar_pandas_udf(f, return_type):
    """Wrap a scalar pandas UDF so its result is validated and paired with an Arrow type.

    :param f: the user-defined function operating on pandas objects.
    :param return_type: the declared Spark return type of ``f``.
    :return: a callable taking the input columns positionally and returning
        ``(result, arrow_return_type)``.
    :raises TypeError: at call time, if the result has no ``__len__``.
    :raises RuntimeError: at call time, if the result does not have the same
        length as the first input column.
    """
    arrow_return_type = to_arrow_type(return_type)

    def verify_result_length(*a):
        result = f(*a)
        # Anything without a length cannot be a Series-like result.
        if not hasattr(result, "__len__"):
            raise TypeError("Return type of the user-defined function should be "
                            "Pandas.Series, but is {}".format(type(result)))
        # The output must line up row-for-row with the first input column.
        if len(result) != len(a[0]):
            raise RuntimeError("Result vector from pandas_udf was not the required length: "
                               "expected %d, got %d" % (len(a[0]), len(result)))
        return result

    return lambda *a: (verify_result_length(*a), arrow_return_type)
コード例 #18
0
def _to_arrow_type(dt: pt.DataType) -> pa.DataType:
    """Convert a Spark data type to an Arrow type.

    Timestamps map to ``TRIAD_DEFAULT_TIMESTAMP``, structs are converted
    field by field, and everything else is delegated to ``to_arrow_type``.
    """
    if isinstance(dt, pt.TimestampType):
        return TRIAD_DEFAULT_TIMESTAMP
    if not isinstance(dt, pt.StructType):
        return to_arrow_type(dt)

    arrow_fields = []
    for field in dt:
        # Every field is declared nullable; ``field.nullable`` is
        # intentionally not propagated.
        arrow_fields.append(
            pa.field(field.name, _to_arrow_type(field.dataType), nullable=True))
    return pa.struct(arrow_fields)
コード例 #19
0
def wrap_pandas_udf(f, return_type):
    """Wrap a pandas UDF so its result is validated and paired with an Arrow type.

    :param f: the user-defined function operating on pandas objects.
    :param return_type: the declared Spark return type of ``f``.
    :return: a callable taking the input columns positionally and returning
        ``(result, arrow_return_type)``.
    :raises TypeError: at call time, if the result has no ``__len__``.
    :raises RuntimeError: at call time, if the result does not have the same
        length as the first input column.
    """
    arrow_return_type = to_arrow_type(return_type)

    def verify_result_length(*a):
        result = f(*a)
        # Anything without a length cannot be a Series-like result.
        if not hasattr(result, "__len__"):
            raise TypeError("Return type of the user-defined function should be "
                            "Pandas.Series, but is {}".format(type(result)))
        # The output must line up row-for-row with the first input column.
        if len(result) != len(a[0]):
            raise RuntimeError("Result vector from pandas_udf was not the required length: "
                               "expected %d, got %d" % (len(a[0]), len(result)))
        return result

    return lambda *a: (verify_result_length(*a), arrow_return_type)
コード例 #20
0
ファイル: worker.py プロジェクト: Tongzhenguo/spark
    def wrapped(*series):
        """Apply ``f`` to the concatenated input columns and pair each output column,
        by position, with the Arrow type of the matching ``return_type`` field.

        Raises TypeError when ``f`` does not return a pandas.DataFrame and
        RuntimeError when the column count differs from the schema.
        """
        import pandas as pd

        combined = pd.concat(series, axis=1)
        result = f(combined)
        if not isinstance(result, pd.DataFrame):
            raise TypeError("Return type of the user-defined function should be "
                            "pandas.DataFrame, but is {}".format(type(result)))

        actual_columns = len(result.columns)
        expected_columns = len(return_type)
        if actual_columns != expected_columns:
            raise RuntimeError(
                "Number of columns of the returned pandas.DataFrame "
                "doesn't match specified schema. "
                "Expected: {} Actual: {}".format(len(return_type), len(result.columns)))

        paired = []
        for position, field in enumerate(return_type):
            arrow_type = to_arrow_type(field.dataType)
            paired.append((result[result.columns[position]], arrow_type))
        return paired
コード例 #21
0
ファイル: worker.py プロジェクト: atasi1/Tensorflow
    def wrapped(*series):
        """Apply ``f`` to the concatenated input columns and pair each output column,
        by position, with the Arrow type of the matching ``return_type`` field.

        Raises TypeError when ``f`` does not return a pandas.DataFrame and
        RuntimeError when the column count differs from the schema.
        """
        import pandas as pd

        combined = pd.concat(series, axis=1)
        result = f(combined)
        if not isinstance(result, pd.DataFrame):
            raise TypeError("Return type of the user-defined function should be "
                            "pandas.DataFrame, but is {}".format(type(result)))

        actual_columns = len(result.columns)
        expected_columns = len(return_type)
        if actual_columns != expected_columns:
            raise RuntimeError(
                "Number of columns of the returned pandas.DataFrame "
                "doesn't match specified schema. "
                "Expected: {} Actual: {}".format(len(return_type), len(result.columns)))

        paired = []
        for position, field in enumerate(return_type):
            arrow_type = to_arrow_type(field.dataType)
            paired.append((result[result.columns[position]], arrow_type))
        return paired
コード例 #22
0
ファイル: udf.py プロジェクト: Tongzhenguo/spark
    def returnType(self):
        """Resolve the declared return type, validate it for the eval type, and return it.

        The resolved type is cached in ``_returnType_placeholder``.  Raises
        NotImplementedError when the type cannot be converted to Arrow and
        TypeError when a grouped map UDF's return type is not a StructType.
        """
        # Resolution is deferred to this call so that SparkContext is already
        # initialized: ``_parse_datatype_string`` needs the JVM to parse a DDL
        # formatted string.
        if self._returnType_placeholder is None:
            declared = self._returnType
            if not isinstance(declared, DataType):
                declared = _parse_datatype_string(declared)
            self._returnType_placeholder = declared

        resolved = self._returnType_placeholder
        eval_type = self.evalType

        if eval_type == PythonEvalType.SQL_SCALAR_PANDAS_UDF:
            try:
                to_arrow_type(resolved)
            except TypeError:
                raise NotImplementedError(
                    "Invalid returnType with scalar Pandas UDFs: %s is "
                    "not supported" % str(resolved))
        elif eval_type == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF:
            if not isinstance(resolved, StructType):
                raise TypeError("Invalid returnType for grouped map Pandas "
                                "UDFs: returnType must be a StructType.")
            try:
                to_arrow_schema(resolved)
            except TypeError:
                raise NotImplementedError(
                    "Invalid returnType with grouped map Pandas UDFs: "
                    "%s is not supported" % str(resolved))
        elif eval_type == PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF:
            try:
                to_arrow_type(resolved)
            except TypeError:
                raise NotImplementedError(
                    "Invalid returnType with grouped aggregate Pandas UDFs: "
                    "%s is not supported" % str(resolved))

        return self._returnType_placeholder
コード例 #23
0
ファイル: worker.py プロジェクト: Brett-A/spark
    def wrapped(key_series, value_series):
        """Run the grouped-map UDF on one group and pair output columns with Arrow types.

        ``f`` receives the value columns concatenated into one DataFrame,
        preceded by the grouping key tuple when ``argspec`` declares two
        arguments.

        Returns a list of ``(column, arrow_type)`` tuples, one per field of
        ``return_type``.  Raises TypeError when ``f`` does not return a
        pandas.DataFrame and RuntimeError when the column count differs from
        the schema.
        """
        import pandas as pd

        num_args = len(argspec.args)
        if num_args == 1:
            result = f(pd.concat(value_series, axis=1))
        elif num_args == 2:
            # The key is taken from the first element of every key column.
            key = tuple(s[0] for s in key_series)
            result = f(key, pd.concat(value_series, axis=1))

        if not isinstance(result, pd.DataFrame):
            raise TypeError("Return type of the user-defined function should be "
                            "pandas.DataFrame, but is {}".format(type(result)))

        actual_columns = len(result.columns)
        expected_columns = len(return_type)
        if actual_columns != expected_columns:
            raise RuntimeError(
                "Number of columns of the returned pandas.DataFrame "
                "doesn't match specified schema. "
                "Expected: {} Actual: {}".format(len(return_type), len(result.columns)))

        # Match columns by schema name only when the user labeled at least one
        # column with a string; otherwise fall back to positional matching.
        match_by_name = assign_cols_by_name and any(
            isinstance(label, basestring) for label in result.columns)

        paired = []
        if match_by_name:
            for field in return_type:
                paired.append((result[field.name], to_arrow_type(field.dataType)))
        else:
            for position, field in enumerate(return_type):
                paired.append((result[result.columns[position]],
                               to_arrow_type(field.dataType)))
        return paired
コード例 #24
0
ファイル: typehints.py プロジェクト: ynuosoft/spark
def spark_type_to_pandas_dtype(spark_type: types.DataType,
                               *,
                               use_extension_dtypes: bool = False) -> Dtype:
    """ Map a Spark DataType to a pandas dtype, optionally preferring pandas extension dtypes. """

    if use_extension_dtypes and extension_dtypes_available:
        # Integral types
        if isinstance(spark_type, types.ByteType):
            return Int8Dtype()
        if isinstance(spark_type, types.ShortType):
            return Int16Dtype()
        if isinstance(spark_type, types.IntegerType):
            return Int32Dtype()
        if isinstance(spark_type, types.LongType):
            return Int64Dtype()

        # Boolean and string types
        if extension_object_dtypes_available:
            if isinstance(spark_type, types.BooleanType):
                return BooleanDtype()
            if isinstance(spark_type, types.StringType):
                return StringDtype()

        # Fractional types
        if extension_float_dtypes_available:
            if isinstance(spark_type, types.FloatType):
                return Float32Dtype()
            if isinstance(spark_type, types.DoubleType):
                return Float64Dtype()

    # Fallback when no extension dtype was selected above.
    object_backed = (
        types.DateType,
        types.NullType,
        types.ArrayType,
        types.MapType,
        types.StructType,
        types.UserDefinedType,
    )
    if isinstance(spark_type, object_backed):
        dtype_spec = "object"
    elif isinstance(spark_type, types.TimestampType):
        dtype_spec = "datetime64[ns]"
    else:
        # Every other type goes through its Arrow equivalent.
        dtype_spec = to_arrow_type(spark_type).to_pandas_dtype()
    return np.dtype(dtype_spec)
コード例 #25
0
def wrap_scalar_pandas_udf(f, return_type):
    """
    Wrap a scalar pandas UDF so that its result is validated on every call.

    The returned callable invokes ``f`` with its arguments, checks that the
    result has a length and that this length equals the length of the first
    argument, and returns ``(result, arrow_return_type)``.

    Raises (from the returned callable):
        TypeError: if the result has no ``__len__``.
        RuntimeError: if the result length differs from ``len(args[0])``.
    """
    arrow_return_type = to_arrow_type(return_type)

    def checked_call(*a):
        result = f(*a)

        # Anything without a length cannot be the expected pandas container.
        if not hasattr(result, "__len__"):
            if type(return_type) == StructType:
                pd_type = "Pandas.DataFrame"
            else:
                pd_type = "Pandas.Series"
            raise TypeError("Return type of the user-defined function should be "
                            "{}, but is {}".format(pd_type, type(result)))

        # The first argument defines how many rows the result must have.
        expected = len(a[0])
        actual = len(result)
        if actual != expected:
            raise RuntimeError("Result vector from pandas_udf was not the required length: "
                               "expected %d, got %d" % (expected, actual))

        return result, arrow_return_type

    return checked_call
コード例 #26
0
ファイル: frame.py プロジェクト: processout/koalas
    def to_pandas(self):
        """
        Collect this DataFrame into a Pandas DataFrame.

        All fields listed in the metadata are selected and collected, the
        index fields are turned into the pandas index, and the index is
        renamed according to the metadata's index names.

        .. note:: This method should only be used if the resulting Pandas DataFrame is expected
            to be small, as all the data is loaded into the driver's memory.

        Examples
        --------
        >>> df = ks.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
        ...                   columns=['dogs', 'cats'])
        >>> df.to_pandas()
           dogs  cats
        0   0.2   0.3
        1   0.0   0.6
        2   0.6   0.0
        3   0.2   0.1
        """
        metadata = self._metadata

        # Back-quote every field name before selecting it.
        quoted_fields = ['`{}`'.format(name) for name in metadata.all_fields]
        sdf = self._sdf.select(quoted_fields)
        pdf = sdf.toPandas()

        if len(pdf) == 0 and len(sdf.schema) > 0:
            # TODO: push to OSS
            # An empty result is cast column by column to the dtype reported
            # by the Arrow type of each schema field.
            empty_dtypes = {}
            for field in sdf.schema:
                empty_dtypes[field.name] = to_arrow_type(field.dataType).to_pandas_dtype()
            pdf = pdf.astype(empty_dtypes)

        if len(metadata.index_info) > 0:
            for position, index_field in enumerate(metadata.index_fields):
                # Keep the column when it is also a data column; the first
                # index field replaces the index, later ones are appended.
                keep_as_column = index_field in metadata.column_fields
                pdf = pdf.set_index(index_field, drop=not keep_as_column,
                                    append=position > 0)
            pdf = pdf[metadata.column_fields]

        index_names = metadata.index_names
        if len(index_names) > 0:
            if isinstance(pdf.index, pd.MultiIndex):
                pdf.index.names = index_names
            else:
                pdf.index.name = index_names[0]
        return pdf
コード例 #27
0
    def dtype(self):
        """Return the dtype object of the underlying data.

        A Spark timestamp type is reported as ``datetime64[ns]``; every other
        type is reported as the pandas dtype of its Arrow type.

        Examples
        --------
        >>> s = ks.Series([1, 2, 3])
        >>> s.dtype
        dtype('int64')

        >>> s = ks.Series(list('abc'))
        >>> s.dtype
        dtype('O')

        >>> s = ks.Series(pd.date_range('20130101', periods=3))
        >>> s.dtype
        dtype('<M8[ns]')
        """
        spark_type = self.spark_type
        if type(spark_type) != TimestampType:
            arrow_type = to_arrow_type(spark_type)
            return np.dtype(arrow_type.to_pandas_dtype())
        return np.dtype('datetime64[ns]')
コード例 #28
0
ファイル: worker.py プロジェクト: amolthacker/spark
def wrap_grouped_map_pandas_udf(f, return_type, argspec):
    """
    Wrap a grouped-map pandas UDF so that its output is validated.

    ``f`` is called with the value columns concatenated into one
    pandas.DataFrame. When ``argspec`` declares two arguments, the grouping
    key (first element of every key column, as a tuple) is passed first.

    The returned callable takes ``(key_series, value_series)`` and returns a
    one-element list holding ``(result, arrow_type_of_return_type)``.

    Raises (from the returned callable):
        TypeError: if ``f`` does not return a pandas.DataFrame.
        RuntimeError: if the number of returned columns differs from
            ``len(return_type)``.
    """

    def wrapped(key_series, value_series):
        import pandas as pd

        arg_count = len(argspec.args)
        if arg_count == 1:
            result = f(pd.concat(value_series, axis=1))
        elif arg_count == 2:
            key = tuple(s[0] for s in key_series)
            result = f(key, pd.concat(value_series, axis=1))

        if not isinstance(result, pd.DataFrame):
            type_message = ("Return type of the user-defined function should be "
                            "pandas.DataFrame, but is {}")
            raise TypeError(type_message.format(type(result)))

        actual = len(result.columns)
        expected = len(return_type)
        if actual != expected:
            raise RuntimeError(
                "Number of columns of the returned pandas.DataFrame "
                "doesn't match specified schema. "
                "Expected: {} Actual: {}".format(expected, actual))
        return result

    return lambda k, v: [(wrapped(k, v), to_arrow_type(return_type))]
コード例 #29
0
def wrap_grouped_map_pandas_udf(f, return_type, argspec):
    """
    Build the worker-side callable for a grouped-map pandas UDF.

    The user function ``f`` receives the group's value columns joined into a
    single pandas.DataFrame, preceded by the key tuple when ``argspec`` lists
    two arguments. Its result must be a pandas.DataFrame with exactly
    ``len(return_type)`` columns; otherwise TypeError / RuntimeError is raised.

    The returned callable yields ``[(result, arrow_type_of_return_type)]``.
    """

    def wrapped(key_series, value_series):
        import pandas as pd

        n_params = len(argspec.args)
        if n_params == 1:
            result = f(pd.concat(value_series, axis=1))
        elif n_params == 2:
            # The key value is read from the first element of each key column.
            key = tuple(s[0] for s in key_series)
            result = f(key, pd.concat(value_series, axis=1))

        if not isinstance(result, pd.DataFrame):
            raise TypeError("Return type of the user-defined function should be "
                            "pandas.DataFrame, but is {}".format(type(result)))

        returned_cols = len(result.columns)
        schema_cols = len(return_type)
        if returned_cols != schema_cols:
            mismatch = ("Number of columns of the returned pandas.DataFrame "
                        "doesn't match specified schema. "
                        "Expected: {} Actual: {}")
            raise RuntimeError(mismatch.format(schema_cols, returned_cols))
        return result

    return lambda k, v: [(wrapped(k, v), to_arrow_type(return_type))]
コード例 #30
0
ファイル: worker.py プロジェクト: FavioVazquez/spark
def wrap_pandas_udf(f, return_type):
    """
    Wrap a pandas UDF for evaluation, validating the result of column UDFs.

    :param f: the user-defined function to wrap
    :param return_type: the declared Spark return type of ``f``
    :return: for a StructType ``return_type``, a callable that forwards its arguments to
        ``f`` unchanged; otherwise a callable returning ``(result, arrow_return_type)``
        after checking the result

    The callable returned for non-StructType return types raises:

    * TypeError if the result of ``f`` has no ``__len__``
    * RuntimeError if the result length differs from the length of the first argument
    """
    # If the return_type is a StructType, it indicates this is a groupby apply udf,
    # and has already been wrapped under apply(), otherwise, it's a vectorized column udf.
    # We can distinguish these two by return type because in groupby apply, we always specify
    # returnType as a StructType, and in vectorized column udf, StructType is not supported.
    #
    # TODO: Look into refactoring use of StructType to be more flexible for future pandas_udfs
    if isinstance(return_type, StructType):
        return lambda *a: f(*a)
    else:
        arrow_return_type = to_arrow_type(return_type)

        def verify_result_length(*a):
            result = f(*a)
            if not hasattr(result, "__len__"):
                # Message previously misspelled "function" as "functon".
                raise TypeError("Return type of the user-defined function should be "
                                "Pandas.Series, but is {}".format(type(result)))
            # The first argument defines the required number of rows.
            if len(result) != len(a[0]):
                raise RuntimeError("Result vector from pandas_udf was not the required length: "
                                   "expected %d, got %d" % (len(a[0]), len(result)))
            return result

        return lambda *a: (verify_result_length(*a), arrow_return_type)
コード例 #31
0
 def toPandas(self):
     """
     Collect this DataFrame into a pandas DataFrame.

     All fields listed in the metadata are selected and collected, the index
     fields become the pandas index, and the index is renamed according to
     the metadata's index names.
     """
     metadata = self._metadata
     df = self._spark_select(metadata.all_fields)
     pdf = df._spark_toPandas()

     if len(pdf) == 0 and len(df.schema) > 0:
         # TODO: push to OSS
         # An empty result is cast column by column to the dtype reported by
         # the Arrow type of each schema field.
         empty_dtypes = {}
         for field in df.schema:
             empty_dtypes[field.name] = to_arrow_type(field.dataType).to_pandas_dtype()
         pdf = pdf.astype(empty_dtypes)

     if len(metadata.index_info) > 0:
         for position, index_field in enumerate(metadata.index_fields):
             # Keep the column when it is also a data column; the first index
             # field replaces the index, later ones are appended to it.
             keep_as_column = index_field in metadata.column_fields
             pdf = pdf.set_index(index_field, drop=not keep_as_column,
                                 append=position > 0)
         pdf = pdf[metadata.column_fields]

     index_names = metadata.index_names
     if len(index_names) > 0:
         if isinstance(pdf.index, pd.MultiIndex):
             pdf.index.names = index_names
         else:
             pdf.index.name = index_names[0]
     return pdf
コード例 #32
0
    def returnType(self):
        """
        Resolve, validate and return the declared return type of this UDF.

        The declared type is resolved once and cached in
        ``_returnType_placeholder``. For pandas UDF evaluation types the
        resolved type is then checked for Arrow convertibility; grouped map,
        map iterator and cogrouped map UDFs additionally require a StructType,
        while grouped aggregate UDFs reject it.

        Raises NotImplementedError for a type that is not supported by the
        evaluation type, and TypeError when a StructType is required but
        another type was given.
        """
        # Resolution is deferred until here so that it runs after SparkContext is initialized:
        # ``_parse_datatype_string`` accesses the JVM for parsing a DDL formatted string.
        if self._returnType_placeholder is None:
            declared = self._returnType
            if not isinstance(declared, DataType):
                declared = _parse_datatype_string(declared)
            self._returnType_placeholder = declared

        resolved = self._returnType_placeholder
        eval_type = self.evalType

        # Evaluation types whose return type must be a StructType, each with its
        # "unsupported type" and "not a StructType" messages.
        struct_only_udfs = (
            (PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
             "Invalid returnType with grouped map Pandas UDFs: "
             "%s is not supported",
             "Invalid returnType for grouped map Pandas "
             "UDFs: returnType must be a StructType."),
            (PythonEvalType.SQL_MAP_PANDAS_ITER_UDF,
             "Invalid returnType with map iterator Pandas UDFs: "
             "%s is not supported",
             "Invalid returnType for map iterator Pandas "
             "UDFs: returnType must be a StructType."),
            (PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF,
             "Invalid returnType with cogrouped map Pandas UDFs: "
             "%s is not supported",
             "Invalid returnType for cogrouped map Pandas "
             "UDFs: returnType must be a StructType."),
        )

        if eval_type in (PythonEvalType.SQL_SCALAR_PANDAS_UDF,
                         PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF):
            try:
                to_arrow_type(resolved)
            except TypeError:
                raise NotImplementedError(
                    "Invalid returnType with scalar Pandas UDFs: %s is "
                    "not supported" % str(resolved))
        elif eval_type == PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF:
            try:
                # StructType is not yet allowed as a return type, explicitly check here to fail fast
                if isinstance(resolved, StructType):
                    raise TypeError
                to_arrow_type(resolved)
            except TypeError:
                raise NotImplementedError(
                    "Invalid returnType with grouped aggregate Pandas UDFs: "
                    "%s is not supported" % str(resolved))
        else:
            for candidate, unsupported_msg, not_struct_msg in struct_only_udfs:
                if eval_type != candidate:
                    continue
                if not isinstance(resolved, StructType):
                    raise TypeError(not_struct_msg)
                try:
                    to_arrow_type(resolved)
                except TypeError:
                    raise NotImplementedError(unsupported_msg % str(resolved))
                break

        return resolved
コード例 #33
0
def spark_type_to_pandas_dtype(spark_type):
    """
    Translate a Spark DataType into the numpy dtype used on the pandas side.

    Timestamps map to ``datetime64[ns]``; every other type maps to the pandas
    dtype reported by its Arrow type.
    """
    if not isinstance(spark_type, types.TimestampType):
        arrow_type = to_arrow_type(spark_type)
        return np.dtype(arrow_type.to_pandas_dtype())
    return np.dtype('datetime64[ns]')
コード例 #34
0
    def _create_from_pandas_with_arrow(self, pdf, schema, timezone):
        """
        Create a DataFrame from a given pandas.DataFrame by slicing it into partitions, converting
        to Arrow data, then sending to the JVM to parallelize. If a schema is passed in, the
        data types will be used to coerce the data in Pandas to Arrow conversion.

        :param pdf: the pandas.DataFrame to convert
        :param schema: a list/tuple of column names (types are then inferred from ``pdf`` via
            Arrow), or a StructType whose field types are used for coercion
        :param timezone: passed through to ``ArrowStreamPandasSerializer``
        :return: a DataFrame built from the Arrow stream, with ``_schema`` set to the
            resolved schema
        :raises ValueError: if ``schema`` is a DataType that is not a StructType
        """
        from distutils.version import LooseVersion
        from pyspark.serializers import ArrowStreamPandasSerializer
        from pyspark.sql.types import from_arrow_type, to_arrow_type, TimestampType
        from pyspark.sql.utils import require_minimum_pandas_version, \
            require_minimum_pyarrow_version

        # Version gates run before the pandas / pyarrow imports below.
        require_minimum_pandas_version()
        require_minimum_pyarrow_version()

        from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype
        import pyarrow as pa

        # Create the Spark schema from list of names passed in with Arrow types
        if isinstance(schema, (list, tuple)):
            if LooseVersion(pa.__version__) < LooseVersion("0.12.0"):
                # Older pyarrow: derive the Arrow schema from a batch of the first 100 rows.
                temp_batch = pa.RecordBatch.from_pandas(pdf[0:100], preserve_index=False)
                arrow_schema = temp_batch.schema
            else:
                arrow_schema = pa.Schema.from_pandas(pdf, preserve_index=False)
            struct = StructType()
            # Pair each user-supplied name with the Arrow field at the same position.
            for name, field in zip(schema, arrow_schema):
                struct.add(name, from_arrow_type(field.type), nullable=field.nullable)
            # From here on ``schema`` is the StructType built above.
            schema = struct

        # Determine arrow types to coerce data when creating batches
        if isinstance(schema, StructType):
            arrow_types = [to_arrow_type(f.dataType) for f in schema.fields]
        elif isinstance(schema, DataType):
            raise ValueError("Single data type %s is not supported with Arrow" % str(schema))
        else:
            # Any timestamps must be coerced to be compatible with Spark
            # NOTE(review): ``schema.json()`` is called further down, so this branch presumably
            # is not reached with ``schema=None`` - confirm against the caller.
            arrow_types = [to_arrow_type(TimestampType())
                           if is_datetime64_dtype(t) or is_datetime64tz_dtype(t) else None
                           for t in pdf.dtypes]

        # Slice the DataFrame to be batched
        # NOTE(review): for an empty ``pdf`` the step is 0 and ``xrange`` raises ValueError;
        # presumably the caller never passes an empty frame - confirm.
        step = -(-len(pdf) // self.sparkContext.defaultParallelism)  # round int up
        pdf_slices = (pdf[start:start + step] for start in xrange(0, len(pdf), step))

        # Create list of Arrow (columns, type) for serializer dump_stream
        # One inner list per slice; each entry pairs a column with its Arrow type
        # (or None where no coercion type was determined above).
        arrow_data = [[(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)]
                      for pdf_slice in pdf_slices]

        jsqlContext = self._wrapped._jsqlContext

        safecheck = self._wrapped._conf.arrowSafeTypeConversion()
        col_by_name = True  # col by name only applies to StructType columns, can't happen here
        ser = ArrowStreamPandasSerializer(timezone, safecheck, col_by_name)

        # Callbacks handed to ``_serialize_to_jvm`` below.
        def reader_func(temp_filename):
            return self._jvm.PythonSQLUtils.readArrowStreamFromFile(jsqlContext, temp_filename)

        def create_RDD_server():
            return self._jvm.ArrowRDDServer(jsqlContext)

        # Create Spark DataFrame from Arrow stream file, using one batch per partition
        jrdd = self._sc._serialize_to_jvm(arrow_data, ser, reader_func, create_RDD_server)
        jdf = self._jvm.PythonSQLUtils.toDataFrame(jrdd, schema.json(), jsqlContext)
        df = DataFrame(jdf, self._wrapped)
        # Attach the resolved Python-side schema to the new DataFrame.
        df._schema = schema
        return df