Example #1
    def _check_schema(self, rows, expected_schema):
        from oarphpy.spark import RowAdapter
        from oarphpy_test import testutil  # Assumed path; imported at module level in the full suite
        schema = RowAdapter.to_schema(rows[0])
        adapted_rows = [RowAdapter.to_row(r) for r in rows]
        with testutil.LocalSpark.sess() as spark:
            # verifySchema is expensive and improperly errors on mostly empty rows
            df = spark.createDataFrame(adapted_rows,
                                       schema=schema,
                                       verifySchema=False)

            if self._is_spark_2x():
                # Spark 2.x returns schema values in a different order, so we do a more
                # flexible test
                def tokenize(s):
                    import re
                    return sorted(re.split('[<>,]+', s))

                actual = dict((col, tokenize(s)) for col, s in df.dtypes)
                expected = dict(
                    (col, tokenize(s)) for col, s in expected_schema)
                assert actual == expected
            else:
                # Tests are written for Spark 3.x
                assert df.dtypes == expected_schema

        return schema
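
For reference, a short sketch (not from the original suite) of why the token-based comparison above tolerates Spark 2.x's struct field reordering: both orderings reduce to the same sorted token list.

    import re

    def tokenize(s):
        return sorted(re.split('[<>,]+', s))

    # Hypothetical dtype strings; the field order differs, the tokens do not:
    assert tokenize('struct<foo:bigint,bar:string>') == \
           tokenize('struct<bar:string,foo:bigint>')
    # Both yield ['', 'bar:string', 'foo:bigint', 'struct']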
Example #2
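The test below references helper classes `Slotted` and `Unslotted` defined elsewhere in the test module (`oarphpy_test.test_spark`, per the `__pyclass__` strings in the expected schema). A minimal sketch of what they plausibly look like; the attribute names come from the test itself, but the class bodies are assumptions:

    class Slotted(object):
        # A __slots__-based class; RowAdapter records each slot plus __pyclass__
        __slots__ = ('foo', 'bar', '_not_hidden')

        def __init__(self, **kwargs):
            for k in self.__slots__:
                setattr(self, k, kwargs.get(k))


    class Unslotted(object):
        # A plain class; attributes live in __dict__.  The expected schema
        # below implies RowAdapter drops double-underscore-prefixed attrs
        # (e.g. __hidden never appears in the struct for column `f`)
        def __init__(self, **kwargs):
            for k, v in kwargs.items():
                setattr(self, k, v)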
def test_row_adapter():
    import numpy as np
    import pytest

    from pyspark.sql import Row

    from oarphpy.spark import RowAdapter
    from oarphpy_test import testutil  # Assumed path; provides LocalSpark

    rows = [
        Row(
            id=1,
            np_number=np.float32(1.),
            a=np.array([1]),
            b={'foo': np.array([[1]], dtype=np.uint8)},
            c=[np.array([[[1.]], [[2.]], [[3.]]])],
            d=Slotted(foo=5, bar="abc", _not_hidden=1),
            e=[Slotted(foo=6, bar="def", _not_hidden=1)],
            f=Unslotted(meow=4, _not_hidden=1, __hidden=2),
            g=Unslotted(),  # Intentionally empty; adapter should set nothing
            h=Row(i=1, j=2),
        ),

        # Include a mostly empty row below to exercise Spark type validation.
        # Spark will ensure the row below and row above have the same schema;
        # note that `None` (or 'null') is only allowed for Struct / Row types.
        Row(
            id=2,
            np_number=np.float32(2.),
            a=np.array([]),
            b={},
            c=[],
            d=None,
            e=[],
            f=None,
            g=None,
            h=Row(i=3, j=3),
        ),
    ]

    with testutil.LocalSpark.sess() as spark:

        ## Test basic round-trip serialization and adaptation
        _check_serialization(spark, rows, 'basic')

        ## Test Schema Deduction
        mostly_empty = Row(
            id=2,
            np_number=None,
            a=None,
            b={},
            c=[],
            d=None,
            e=[],
            f=None,
            g=None,
            h=None,
        )
        mostly_empty_adapted = RowAdapter.to_row(mostly_empty)

        # Spark can't deduce schema from the empty-ish row ...
        with pytest.raises(ValueError) as excinfo:
            df = spark.createDataFrame([mostly_empty_adapted])
        assert "Some of types cannot be determined" in str(excinfo.value)

        # ... but this works if we tell it the schema!
        schema = RowAdapter.to_schema(rows[0])
        df = spark.createDataFrame([mostly_empty_adapted],
                                   schema=schema,
                                   verifySchema=False)

        EXPECTED_SCHEMA = [
            ('a',
             'struct<__pyclass__:string,dtype:string,order:string,shape:array<bigint>,values:array<bigint>>'
             ),
            ('b',
             'map<string,struct<__pyclass__:string,dtype:string,order:string,shape:array<bigint>,values:array<bigint>>>'
             ),
            ('c',
             'array<struct<__pyclass__:string,dtype:string,order:string,shape:array<bigint>,values:array<double>>>'
             ),
            ('d',
             'struct<__pyclass__:string,_not_hidden:bigint,bar:string,foo:bigint>'
             ),
            ('e',
             'array<struct<__pyclass__:string,_not_hidden:bigint,bar:string,foo:bigint>>'
             ),
            ('f', 'struct<__pyclass__:string,_not_hidden:bigint,meow:bigint>'),
            ('g', 'struct<__pyclass__:string>'),
            ('h', 'struct<i:bigint,j:bigint>'),
            ('id', 'bigint'),
            ('np_number', 'double'),
        ]
        assert df.dtypes == EXPECTED_SCHEMA

        # Check that pyspark retains the empty values in `mostly_empty`
        for colname in sorted(df.columns):
            values = df.select(colname).collect()
            assert len(values) == 1
            assert mostly_empty[colname] == values[0][colname]

        # ... and we can also read/write the empty-ish row!
        _check_serialization(spark, [mostly_empty],
                             'with_schema',
                             schema=schema)
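
The helper `_check_serialization` is not shown on this page. Below is a rough reconstruction of a round-trip check consistent with how it is called above, assuming Parquet as the on-disk format and `RowAdapter.from_row` for decoding; the real helper lives in the oarphpy test suite and may differ:

    def _check_serialization(spark, rows, test_name, schema=None):
        import os
        import tempfile
        from oarphpy.spark import RowAdapter

        # Adapt rows; deduce the schema from the first row unless given
        adapted = [RowAdapter.to_row(r) for r in rows]
        if schema is None:
            schema = RowAdapter.to_schema(rows[0])
        df = spark.createDataFrame(adapted, schema=schema, verifySchema=False)

        # Round-trip through Parquet and decode back to python objects
        path = os.path.join(tempfile.mkdtemp(), test_name)
        df.write.parquet(path)
        df2 = spark.read.parquet(path)
        assert df2.count() == len(rows)
        decoded = [RowAdapter.from_row(r) for r in df2.collect()]
        # The real helper also compares `decoded` against `rows`; exact
        # equality of numpy-bearing rows needs per-field handling, elided here
        return df2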
Example #3
    def test_rowadapter_complex(self):
        import numpy as np
        import pytest

        from pyspark.sql import Row

        from oarphpy.spark import RowAdapter

        # A large-ish example that covers the above cases in aggregate
        rows = [
            Row(
                id=1,
                np_number=np.float32(1.),
                a=np.array([1]),
                b={'foo': np.array([[1]], dtype=np.uint8)},
                c=[np.array([[[1.]], [[2.]], [[3.]]])],
                d=Slotted(foo=5, bar="abc", _not_hidden=1),
                e=[Slotted(foo=6, bar="def", _not_hidden=1)],
                f=Unslotted(meow=4, _not_hidden=1, __hidden=2),
                g=Unslotted(),  # Intentionally empty; adapter should set nothing
                h=Row(i=1, j=2),
            ),

            # Include a mostly empty row below to exercise Spark type validation.
            # Spark will ensure the row below and row above have the same schema;
            # note that `None` (or 'null') is only allowed for Struct / Row types.
            Row(
                id=2,
                np_number=np.float32(2.),
                a=np.array([]),
                b={},
                c=[],
                d=None,
                e=[],
                f=None,
                g=None,
                h=Row(i=3, j=3),
            ),
        ]

        df = self._check_serialization(rows)
        EXPECTED_ALL = """
                                                                                  0                                                1
    id                                                                            1                                                2
    np_number                                                                   1.0                                              2.0
    a                                (oarphpy.spark.Tensor, [1], int64, C, [1], [])  (oarphpy.spark.Tensor, [0], float64, C, [], [])
    b              {'foo': ('oarphpy.spark.Tensor', [1, 1], 'uint8', 'C', [1], [])}                                               {}
    c          [(oarphpy.spark.Tensor, [3, 1, 1], float64, C, [1.0, 2.0, 3.0], [])]                                               []
    d                                  (oarphpy_test.test_spark.Slotted, 5, abc, 1)                                             None
    e                                [(oarphpy_test.test_spark.Slotted, 6, def, 1)]                                               []
    f                                     (oarphpy_test.test_spark.Unslotted, 4, 1)                                             None
    g                                          (oarphpy_test.test_spark.Unslotted,)                                             None
    h                                                                        (1, 2)                                           (3, 3)
    """

        # DEPRECATED: pyspark 2.x is deprecated
        import pyspark
        if pyspark.__version__.startswith('2.'):
            EXPECTED_ALL = """
                                                                                  0                                                1
      id                                                                            1                                                2
      np_number                                                                     1                                                2
      a                                (oarphpy.spark.Tensor, [1], int64, C, [1], [])  (oarphpy.spark.Tensor, [0], float64, C, [], [])
      b              {'foo': ('oarphpy.spark.Tensor', [1, 1], 'uint8', 'C', [1], [])}                                               {}
      c          [(oarphpy.spark.Tensor, [3, 1, 1], float64, C, [1.0, 2.0, 3.0], [])]                                               []
      d                                  (oarphpy_test.test_spark.Slotted, 5, abc, 1)                                             None
      e                                [(oarphpy_test.test_spark.Slotted, 6, def, 1)]                                               []
      f                                     (oarphpy_test.test_spark.Unslotted, 4, 1)                                             None
      g                                          (oarphpy_test.test_spark.Unslotted,)                                             None
      h                                                                        (1, 2)                                           (3, 3)
      """

        self._pandas_compare_str(df.orderBy('id').toPandas().T, EXPECTED_ALL)

        # Test Schema Deduction
        mostly_empty = Row(
            id=2,
            np_number=None,
            a=None,
            b={},
            c=[],
            d=None,
            e=[],
            f=None,
            g=None,
            h=None,
        )
        mostly_empty_adapted = RowAdapter.to_row(mostly_empty)

        # Spark can't deduce schema from the empty-ish row ...
        with pytest.raises(ValueError) as excinfo:
            self._check_serialization([mostly_empty_adapted],
                                      do_adaption=False)
        assert "Some of types cannot be determined" in str(excinfo.value)

        # ... but this works if we tell it the schema!
        schema = RowAdapter.to_schema(rows[0])
        self._check_serialization([mostly_empty_adapted], schema=schema)

        # Let's check that RowAdapter schema deduction works as expected
        EXPECTED_SCHEMA = [
            ('id', 'bigint'),
            ('np_number', 'double'),
            ('a',
             'struct<__pyclass__:string,shape:array<bigint>,dtype:string,order:string,values:array<bigint>,values_packed:binary>'
             ),
            ('b',
             'map<string,struct<__pyclass__:string,shape:array<bigint>,dtype:string,order:string,values:array<bigint>,values_packed:binary>>'
             ),
            ('c',
             'array<struct<__pyclass__:string,shape:array<bigint>,dtype:string,order:string,values:array<double>,values_packed:binary>>'
             ),
            ('d',
             'struct<__pyclass__:string,foo:bigint,bar:string,_not_hidden:bigint>'
             ),
            ('e',
             'array<struct<__pyclass__:string,foo:bigint,bar:string,_not_hidden:bigint>>'
             ),
            ('f', 'struct<__pyclass__:string,meow:bigint,_not_hidden:bigint>'),
            ('g', 'struct<__pyclass__:string>'),
            ('h', 'struct<i:bigint,j:bigint>'),
        ]
        self._check_schema(rows, EXPECTED_SCHEMA)
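
The `_pandas_compare_str` helper used above is likewise not shown. A plausible sketch, comparing whitespace-normalized tokens so that pandas column padding cannot cause spurious failures (a reconstruction, not the actual helper):

    def _pandas_compare_str(self, df, expected):
        import pandas as pd

        # Render the full frame, then compare token streams so that the
        # indentation baked into the EXPECTED_ALL blobs above is irrelevant
        with pd.option_context('display.max_columns', None,
                               'display.width', None):
            actual = str(df)
        assert actual.split() == expected.split(), \
            "Mismatch:\n%s\n!=\n%s" % (actual, expected)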