Example No. 1
    def toPandas(self):
        """
        Returns the contents of this :class:`DataFrame` as a Pandas ``pandas.DataFrame``.

        This is only available if Pandas is installed and available.

        .. versionadded:: 1.3.0

        Notes
        -----
        This method should only be used if the resulting Pandas :class:`DataFrame` is
        expected to be small, as all the data is loaded into the driver's memory.

        Usage with spark.sql.execution.arrow.pyspark.enabled=True is experimental.

        Examples
        --------
        >>> df.toPandas()  # doctest: +SKIP
           age   name
        0    2  Alice
        1    5    Bob
        """
        from pyspark.sql.dataframe import DataFrame

        assert isinstance(self, DataFrame)

        from pyspark.sql.pandas.utils import require_minimum_pandas_version
        require_minimum_pandas_version()

        import numpy as np
        import pandas as pd

        timezone = self.sql_ctx._conf.sessionLocalTimeZone()

        if self.sql_ctx._conf.arrowPySparkEnabled():
            use_arrow = True
            try:
                from pyspark.sql.pandas.types import to_arrow_schema
                from pyspark.sql.pandas.utils import require_minimum_pyarrow_version

                require_minimum_pyarrow_version()
                to_arrow_schema(self.schema)
            except Exception as e:

                if self.sql_ctx._conf.arrowPySparkFallbackEnabled():
                    msg = (
                        "toPandas attempted Arrow optimization because "
                        "'spark.sql.execution.arrow.pyspark.enabled' is set to true; however, "
                        "failed by the reason below:\n  %s\n"
                        "Attempting non-optimization as "
                        "'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to "
                        "true." % str(e))
                    warnings.warn(msg)
                    use_arrow = False
                else:
                    msg = (
                        "toPandas attempted Arrow optimization because "
                        "'spark.sql.execution.arrow.pyspark.enabled' is set to true, but has "
                        "reached the error below and will not continue because automatic fallback "
                        "with 'spark.sql.execution.arrow.pyspark.fallback.enabled' has been set to "
                        "false.\n  %s" % str(e))
                    warnings.warn(msg)
                    raise

            # Try to use Arrow optimization when the schema is supported and the required version
            # of PyArrow is found, if 'spark.sql.execution.arrow.pyspark.enabled' is enabled.
            if use_arrow:
                try:
                    from pyspark.sql.pandas.types import _check_series_localize_timestamps, \
                        _convert_map_items_to_dict
                    import pyarrow
                    # Rename columns to avoid duplicated column names.
                    tmp_column_names = [
                        'col_{}'.format(i) for i in range(len(self.columns))
                    ]
                    self_destruct = self.sql_ctx._conf.arrowPySparkSelfDestructEnabled(
                    )
                    batches = self.toDF(*tmp_column_names)._collect_as_arrow(
                        split_batches=self_destruct)
                    if len(batches) > 0:
                        table = pyarrow.Table.from_batches(batches)
                        # Ensure only the table has a reference to the batches, so that
                        # self_destruct (if enabled) is effective
                        del batches
                        # Pandas DataFrame created from PyArrow uses datetime64[ns] for date type
                        # values, but we should use datetime.date to match the behavior with when
                        # Arrow optimization is disabled.
                        pandas_options = {'date_as_object': True}
                        if self_destruct:
                            # Configure PyArrow to use as little memory as possible:
                            # self_destruct - free columns as they are converted
                            # split_blocks - create a separate Pandas block for each column
                            # use_threads - convert one column at a time
                            pandas_options.update({
                                'self_destruct': True,
                                'split_blocks': True,
                                'use_threads': False,
                            })
                        pdf = table.to_pandas(**pandas_options)
                        # Rename back to the original column names.
                        pdf.columns = self.columns
                        for field in self.schema:
                            if isinstance(field.dataType, TimestampType):
                                pdf[field.name] = \
                                    _check_series_localize_timestamps(pdf[field.name], timezone)
                            elif isinstance(field.dataType, MapType):
                                pdf[field.name] = \
                                    _convert_map_items_to_dict(pdf[field.name])
                        return pdf
                    else:
                        return pd.DataFrame.from_records([],
                                                         columns=self.columns)
                except Exception as e:
                    # We might have to allow fallback here as well but multiple Spark jobs can
                    # be executed. So, simply fail in this case for now.
                    msg = (
                        "toPandas attempted Arrow optimization because "
                        "'spark.sql.execution.arrow.pyspark.enabled' is set to true, but has "
                        "reached the error below and can not continue. Note that "
                        "'spark.sql.execution.arrow.pyspark.fallback.enabled' does not have an "
                        "effect on failures in the middle of "
                        "computation.\n  %s" % str(e))
                    warnings.warn(msg)
                    raise

        # Below is toPandas without Arrow optimization.
        pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns)
        column_counter = Counter(self.columns)

        dtype = [None] * len(self.schema)
        for fieldIdx, field in enumerate(self.schema):
            # For duplicate column name, we use `iloc` to access it.
            if column_counter[field.name] > 1:
                pandas_col = pdf.iloc[:, fieldIdx]
            else:
                pandas_col = pdf[field.name]

            pandas_type = PandasConversionMixin._to_corrected_pandas_type(
                field.dataType)
            # SPARK-21766: if an integer field is nullable and has null values, it can be
            # inferred by pandas as float column. Once we convert the column with NaN back
            # to integer type e.g., np.int16, we will hit exception. So we use the inferred
            # float type, not the corrected type from the schema in this case.
            if pandas_type is not None and \
                not(isinstance(field.dataType, IntegralType) and field.nullable and
                    pandas_col.isnull().any()):
                dtype[fieldIdx] = pandas_type
            # Ensure we fall back to nullable numpy types, even when whole column is null:
            if isinstance(field.dataType,
                          IntegralType) and pandas_col.isnull().any():
                dtype[fieldIdx] = np.float64
            if isinstance(field.dataType,
                          BooleanType) and pandas_col.isnull().any():
                dtype[fieldIdx] = object  # np.object is a deprecated alias for the builtin object

        df = pd.DataFrame()
        for index, t in enumerate(dtype):
            column_name = self.schema[index].name

            # For duplicate column name, we use `iloc` to access it.
            if column_counter[column_name] > 1:
                series = pdf.iloc[:, index]
            else:
                series = pdf[column_name]

            if t is not None:
                series = series.astype(t, copy=False)

            # `insert` API makes copy of data, we only do it for Series of duplicate column names.
            # `pdf.iloc[:, index] = pdf.iloc[:, index]...` doesn't always work because `iloc` could
            # return a view or a copy depending by context.
            if column_counter[column_name] > 1:
                df.insert(index, column_name, series, allow_duplicates=True)
            else:
                df[column_name] = series

        pdf = df

        if timezone is None:
            return pdf
        else:
            from pyspark.sql.pandas.types import _check_series_convert_timestamps_local_tz
            for field in self.schema:
                # TODO: handle nested timestamps, such as ArrayType(TimestampType())?
                if isinstance(field.dataType, TimestampType):
                    pdf[field.name] = \
                        _check_series_convert_timestamps_local_tz(pdf[field.name], timezone)
            return pdf
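A quick way to exercise both branches of this implementation is to call it from a small script. The sketch below is illustrative and not part of the PySpark source: the app name and rows are assumptions, while the config key is the real Arrow switch the method checks.

# Minimal usage sketch for toPandas(); assumes pyspark, pandas and
# (optionally) pyarrow are installed locally. App name and rows are
# illustrative.
from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .appName("toPandas-demo")
         .config("spark.sql.execution.arrow.pyspark.enabled", "true")
         .getOrCreate())

df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age", "name"])

# With Arrow enabled this takes the _collect_as_arrow() path above; if the
# schema is unsupported or pyarrow is missing, the fallback branch converts
# via self.collect() instead (when fallback is enabled).
pdf = df.toPandas()
print(pdf)

spark.stop()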
Example No. 2
    def test_schema_conversion_roundtrip(self):
        from pyspark.sql.pandas.types import from_arrow_schema, to_arrow_schema
        arrow_schema = to_arrow_schema(self.schema)
        schema_rt = from_arrow_schema(arrow_schema)
        self.assertEqual(self.schema, schema_rt)
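The same round trip can be sketched outside a test class. Only to_arrow_schema and from_arrow_schema come from the example above; the struct below is an illustrative assumption.

# Schema round-trip sketch: Spark StructType -> pyarrow.Schema -> StructType.
# Assumes pyarrow is installed; the fields are illustrative.
from pyspark.sql.types import StructType, StructField, LongType, StringType
from pyspark.sql.pandas.types import from_arrow_schema, to_arrow_schema

schema = StructType([
    StructField("id", LongType(), True),
    StructField("name", StringType(), True),
])

arrow_schema = to_arrow_schema(schema)       # Spark schema -> pyarrow.Schema
schema_rt = from_arrow_schema(arrow_schema)  # pyarrow.Schema -> Spark schema
assert schema == schema_rt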
Example No. 3
    def toPandas(self):
        """
        Returns the contents of this :class:`DataFrame` as a Pandas ``pandas.DataFrame``.

        This is only available if Pandas is installed and available.

        .. note:: This method should only be used if the resulting Pandas :class:`DataFrame` is
            expected to be small, as all the data is loaded into the driver's memory.

        .. note:: Usage with spark.sql.execution.arrow.pyspark.enabled=True is experimental.

        >>> df.toPandas()  # doctest: +SKIP
           age   name
        0    2  Alice
        1    5    Bob
        """
        from pyspark.sql.dataframe import DataFrame

        assert isinstance(self, DataFrame)

        from pyspark.sql.pandas.utils import require_minimum_pandas_version
        require_minimum_pandas_version()

        import numpy as np
        import pandas as pd

        timezone = self.sql_ctx._conf.sessionLocalTimeZone()

        if self.sql_ctx._conf.arrowPySparkEnabled():
            use_arrow = True
            try:
                from pyspark.sql.pandas.types import to_arrow_schema
                from pyspark.sql.pandas.utils import require_minimum_pyarrow_version

                require_minimum_pyarrow_version()
                to_arrow_schema(self.schema)
            except Exception as e:

                if self.sql_ctx._conf.arrowPySparkFallbackEnabled():
                    msg = (
                        "toPandas attempted Arrow optimization because "
                        "'spark.sql.execution.arrow.pyspark.enabled' is set to true; however, "
                        "failed by the reason below:\n  %s\n"
                        "Attempting non-optimization as "
                        "'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to "
                        "true." % _exception_message(e))
                    warnings.warn(msg)
                    use_arrow = False
                else:
                    msg = (
                        "toPandas attempted Arrow optimization because "
                        "'spark.sql.execution.arrow.pyspark.enabled' is set to true, but has "
                        "reached the error below and will not continue because automatic fallback "
                        "with 'spark.sql.execution.arrow.pyspark.fallback.enabled' has been set to "
                        "false.\n  %s" % _exception_message(e))
                    warnings.warn(msg)
                    raise

            # Try to use Arrow optimization when the schema is supported and the required version
            # of PyArrow is found, if 'spark.sql.execution.arrow.pyspark.enabled' is enabled.
            if use_arrow:
                try:
                    from pyspark.sql.pandas.types import _check_dataframe_localize_timestamps
                    import pyarrow
                    batches = self._collect_as_arrow()
                    if len(batches) > 0:
                        table = pyarrow.Table.from_batches(batches)
                        # Pandas DataFrame created from PyArrow uses datetime64[ns] for date type
                        # values, but we should use datetime.date to match the behavior with when
                        # Arrow optimization is disabled.
                        pdf = table.to_pandas(date_as_object=True)
                        return _check_dataframe_localize_timestamps(
                            pdf, timezone)
                    else:
                        return pd.DataFrame.from_records([],
                                                         columns=self.columns)
                except Exception as e:
                    # We might have to allow fallback here as well but multiple Spark jobs can
                    # be executed. So, simply fail in this case for now.
                    msg = (
                        "toPandas attempted Arrow optimization because "
                        "'spark.sql.execution.arrow.pyspark.enabled' is set to true, but has "
                        "reached the error below and can not continue. Note that "
                        "'spark.sql.execution.arrow.pyspark.fallback.enabled' does not have an "
                        "effect on failures in the middle of "
                        "computation.\n  %s" % _exception_message(e))
                    warnings.warn(msg)
                    raise

        # Below is toPandas without Arrow optimization.
        pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns)

        dtype = {}
        for field in self.schema:
            pandas_type = PandasConversionMixin._to_corrected_pandas_type(
                field.dataType)
            # SPARK-21766: if an integer field is nullable and has null values, it can be
            # inferred by pandas as float column. Once we convert the column with NaN back
            # to integer type e.g., np.int16, we will hit exception. So we use the inferred
            # float type, not the corrected type from the schema in this case.
            if pandas_type is not None and \
                not(isinstance(field.dataType, IntegralType) and field.nullable and
                    pdf[field.name].isnull().any()):
                dtype[field.name] = pandas_type
            # Ensure we fall back to nullable numpy types, even when whole column is null:
            if isinstance(field.dataType,
                          IntegralType) and pdf[field.name].isnull().any():
                dtype[field.name] = np.float64
            if isinstance(field.dataType,
                          BooleanType) and pdf[field.name].isnull().any():
                dtype[field.name] = object  # np.object is a deprecated alias for the builtin object

        for f, t in dtype.items():
            pdf[f] = pdf[f].astype(t, copy=False)

        if timezone is None:
            return pdf
        else:
            from pyspark.sql.pandas.types import _check_series_convert_timestamps_local_tz
            for field in self.schema:
                # TODO: handle nested timestamps, such as ArrayType(TimestampType())?
                if isinstance(field.dataType, TimestampType):
                    pdf[field.name] = \
                        _check_series_convert_timestamps_local_tz(pdf[field.name], timezone)
            return pdf
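This older variant keys off the same pair of configuration switches to choose between the Arrow path and the plain self.collect() fallback. A hedged sketch of toggling them at the session level follows; the config keys are real Spark settings, while the session name and data are illustrative.

# Sketch of the configuration this toPandas() checks before choosing a path.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("arrow-fallback-demo").getOrCreate()

# Enable the Arrow path; if schema conversion or the pyarrow version check
# fails, the warn-and-fall-back branch above only runs when fallback is on.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
spark.conf.set("spark.sql.execution.arrow.pyspark.fallback.enabled", "true")

pdf = spark.range(5).toPandas()
print(pdf)

spark.stop()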
Example No. 4
def main():

    # Location of the Flight Service
    host = '127.0.0.1'
    port = '8888'

    # Unique identifier for flight data
    flight_desc = 'spark-flight-descriptor'

    # --------------------------------------------- #
    # Run Spark to put Arrow data to Flight Service #
    # --------------------------------------------- #
    spark = SparkSession \
        .builder \
        .appName('spark-flight') \
        .getOrCreate()

    df = spark.range(10) \
        .select((col('id') % 2).alias('label')).withColumn('data', rand())

    df.show(10)

    # Put the Spark DataFrame to the Flight Service
    SparkFlightConnector.put(df, host, port, flight_desc)

    # ------------------------------------------------------------- #
    # Create a Pandas DataFrame from a pyarrow Flight client reader #
    # ------------------------------------------------------------- #

    # Connect to the Flight service and get endpoints from FlightInfo
    client = pa_flight.connect((host, int(port)))
    desc = pa_flight.FlightDescriptor.for_path(flight_desc)
    info = client.get_flight_info(desc)
    endpoints = info.endpoints

    # Read all flight endpoints into pyarrow Tables
    tables = []
    for e in endpoints:
        flight_reader = client.do_get(e.ticket)
        table = flight_reader.read_all()
        tables.append(table)

    # Convert Tables to a single Pandas DataFrame
    table = pa.concat_tables(tables)
    pdf = table.to_pandas()
    print(f"DataFrame from Flight streams:\n{pdf}")

    # ------------------------------------------------------------- #
    # Create tf.data.Dataset to iterate over Arrow data from Flight #
    # ------------------------------------------------------------- #
    have_tensorflow = False
    try:
        import tensorflow
        import tensorflow_io
        have_tensorflow = True
    except ImportError:
        pass

    if have_tensorflow:
        from tensorflow_flight_dataset import ArrowFlightDataset
        dataset = ArrowFlightDataset.from_schema(host, port, flight_desc,
                                                 to_arrow_schema(df.schema))
        for row in dataset:
            print(row)
        dataset.proc.terminate()

    spark.stop()
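SparkFlightConnector and the running Flight service are external to this snippet, but the client-side conversion it performs (one pyarrow Table per endpoint, then concat_tables and to_pandas) can be exercised on its own with in-memory record batches. The data below is purely illustrative.

# Client-side conversion sketch without a Flight service: build Tables from
# record batches, concatenate, and convert to pandas, mirroring the endpoint
# loop above. Assumes pyarrow and pandas are installed.
import pyarrow as pa

batch = pa.record_batch(
    [pa.array([0, 1, 0, 1]), pa.array([0.1, 0.2, 0.3, 0.4])],
    names=["label", "data"],
)
tables = [pa.Table.from_batches([batch]), pa.Table.from_batches([batch])]

table = pa.concat_tables(tables)
pdf = table.to_pandas()
print(f"DataFrame from in-memory batches:\n{pdf}")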
Example No. 5
def main():

    # Location of the Flight Service
    host = '127.0.0.1'
    port = '8888'

    # Unique identifier for flight data
    flight_desc = 'spark-flight-descriptor'

    # --------------------------------------------- #
    # Run Spark to put Arrow data to Flight Service #
    # --------------------------------------------- #
    spark = SparkSession \
        .builder \
        .appName('spark-flight') \
        .getOrCreate()

    start_time = time.time()
    df = spark.read.format("avro").load(
        "file:///Users/vsowrira/git/test_vary_parallel_read_calls-16-8-16-1/")

    # Put the Spark DataFrame to the Flight Service
    SparkFlightConnector.put(df, host, port, flight_desc)

    # ------------------------------------------------------------- #
    # Create a Pandas DataFrame from a pyarrow Flight client reader #
    # ------------------------------------------------------------- #

    # Connect to the Flight service and get endpoints from FlightInfo
    '''
    client = pa_flight.connect((host, int(port)))
    desc = pa_flight.FlightDescriptor.for_path(flight_desc)
    info = client.get_flight_info(desc)
    endpoints = info.endpoints

    # Read all flight endpoints into pyarrow Tables
    tables = []
    for e in endpoints:
        flight_reader = client.do_get(e.ticket)
        table = flight_reader.read_all()
        tables.append(table)

    # Convert Tables to a single Pandas DataFrame
    table = pa.concat_tables(tables)
    pdf = table.to_pandas()
    print("DataFrame from Flight streams:\n %", pdf)
    '''

    # ------------------------------------------------------------- #
    # Create tf.data.Dataset to iterate over Arrow data from Flight #
    # ------------------------------------------------------------- #
    have_tensorflow = False
    try:
        import tensorflow
        import tensorflow_io
        have_tensorflow = True
    except ImportError:
        pass

    if have_tensorflow:
        from tensorflow_flight_dataset import ArrowFlightDataset
        dataset = ArrowFlightDataset.from_schema(host, port, flight_desc,
                                                 to_arrow_schema(df.schema))
        count = 0
        for row in dataset:
            if count < 10:
                print(row)
                count = count + 1
        dataset.proc.terminate()

    print("Total time taken to read avro data in TF using Spark with Arrow:",
          (time.time() - start_time))

    spark.stop()
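For comparison with the Flight/TensorFlow path timed above, the sketch below times Arrow-enabled toPandas() on synthetic data. It is an illustration under stated assumptions: no Avro input or Flight service is needed, and the row count and app name are made up.

# Timing sketch: Arrow-enabled toPandas() on a synthetic DataFrame shaped
# like the one in Example No. 4. Assumes pyspark, pandas and pyarrow.
import time

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, rand

spark = (SparkSession.builder
         .appName("toPandas-timing")
         .config("spark.sql.execution.arrow.pyspark.enabled", "true")
         .getOrCreate())

df = spark.range(1_000_000).select((col("id") % 2).alias("label")).withColumn("data", rand())

start = time.time()
pdf = df.toPandas()
print("toPandas() with Arrow took %.3f s for %d rows" % (time.time() - start, len(pdf)))

spark.stop()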