Example #1
    def _check_schema(self, rows, expected_schema):
        from oarphpy.spark import RowAdapter
        schema = RowAdapter.to_schema(rows[0])
        adapted_rows = [RowAdapter.to_row(r) for r in rows]
        with testutil.LocalSpark.sess() as spark:
            df = spark.createDataFrame(adapted_rows,
                                       schema=schema,
                                       verifySchema=False)
            # verifySchema is expensive and improperly errors on mostly empty rows

            if self._is_spark_2x():
                # Spark 2.x returns schema values in a different order, so we do a more
                # flexible test
                def tokenize(s):
                    import re
                    return sorted(re.split('[<>,]+', s))

                actual = dict((col, tokenize(s)) for col, s in df.dtypes)
                expected = dict(
                    (col, tokenize(s)) for col, s in expected_schema)
                assert actual == expected
            else:
                # Tests are written for Spark 3.x
                assert df.dtypes == expected_schema

        return schema
Example #2
def _check_serialization(spark, rows, testname, schema=None):
    from oarphpy import util
    from oarphpy.spark import RowAdapter

    TEST_TEMPDIR = testutil.test_tempdir('spark_row_adapter_test')

    adapted_rows = [RowAdapter.to_row(r) for r in rows]
    if schema:
        df = spark.createDataFrame(adapted_rows,
                                   schema=schema,
                                   verifySchema=False)
        # verifySchema is expensive and improperly errors on mostly empty rows
    else:
        df = spark.createDataFrame(adapted_rows)
        # Automatically samples both rows to get schema
    outpath = os.path.join(TEST_TEMPDIR, 'rowdata_%s' % testname)
    df.write.parquet(outpath)

    df2 = spark.read.parquet(outpath)
    decoded_wrapped_rows = df2.collect()

    decoded_rows = [RowAdapter.from_row(row) for row in decoded_wrapped_rows]

    # We can't do assert sorted(rows) == sorted(decoded_rows)
    # because numpy syntactic sugar breaks ==
    import pprint

    def sorted_row_str(rowz):
        return pprint.pformat(sorted(rowz, key=lambda row: row['id']))

    assert sorted_row_str(rows) == sorted_row_str(decoded_rows)
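
The helper above leans on module-level imports and the `testutil` harness. As a rough standalone sketch of the same round trip (assuming only a plain local SparkSession and a writable temp directory, rather than testutil.LocalSpark.sess()):

import os
import tempfile

from pyspark.sql import Row, SparkSession
from oarphpy.spark import RowAdapter

# Minimal sketch of the RowAdapter round trip: adapt rows, write Parquet,
# read it back, and un-adapt the decoded rows.
rows = [Row(id=0, x=1.0), Row(id=1, x=2.0)]

spark = SparkSession.builder.master('local[1]').getOrCreate()
adapted_rows = [RowAdapter.to_row(r) for r in rows]
schema = RowAdapter.to_schema(rows[0])
df = spark.createDataFrame(adapted_rows, schema=schema, verifySchema=False)

outpath = os.path.join(tempfile.mkdtemp(), 'rowdata_sketch')
df.write.parquet(outpath)

decoded_rows = [
    RowAdapter.from_row(r) for r in spark.read.parquet(outpath).collect()
]
assert sorted(r['id'] for r in decoded_rows) == [0, 1]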
Example #3
    def test_nonadapted_input(self):
        from oarphpy.spark import RowAdapter

        # RowAdapter leaves bare data input unchanged
        BARE_VALUES = True, 1, 1.0, "moof", bytes(b"moof")
        for datum in BARE_VALUES:
            assert RowAdapter.to_row(datum) == datum
            assert RowAdapter.from_row(datum) == datum
Example #4
    def _check_raw_adaption(self, raw_expected):
        from oarphpy.spark import RowAdapter

        for raw_data, expected_row in raw_expected:
            actual_row = RowAdapter.to_row(raw_data)
            assert actual_row == expected_row

            actual_data = RowAdapter.from_row(expected_row)
            assert actual_data == raw_data
Example #5
    def test_pesky_numpy(self):
        import numpy as np
        from oarphpy.spark import RowAdapter

        # RowAdapter translates pesky numpy-boxed numbers ...
        assert RowAdapter.to_row(Row(x=np.float32(1.))) == Row(x=1.)

        # ... but only one way! In practice, just don't save boxed numbers in rows.
        assert RowAdapter.from_row(
            Row(x=np.float32(1.))) == Row(x=np.float32(1.))
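
A small illustrative sketch of the advice in the comment above (not part of the original test): unbox numpy scalars before building rows so the values round-trip symmetrically.

import numpy as np
from pyspark.sql import Row
from oarphpy.spark import RowAdapter

# Store plain Python numbers instead of numpy-boxed ones; then to_row() and
# from_row() agree in both directions.
row = Row(x=float(np.float32(1.)))
assert RowAdapter.to_row(row) == Row(x=1.)
assert RowAdapter.from_row(RowAdapter.to_row(row)) == row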
Example #6
    def _check_serialization(self, rows, schema=None, do_adaption=True):
        import inspect
        from oarphpy import util
        from oarphpy.spark import RowAdapter

        test_name = inspect.stack()[1][3]

        TEST_TEMPDIR = testutil.test_tempdir('TestRowAdapter.' + test_name)

        if do_adaption:
            adapted_rows = [RowAdapter.to_row(r) for r in rows]
        else:
            adapted_rows = rows

        with testutil.LocalSpark.sess() as spark:
            if schema:
                df = spark.createDataFrame(adapted_rows,
                                           schema=schema,
                                           verifySchema=False)
                # verifySchema is expensive and improperly errors on mostly
                # empty rows
            else:
                df = spark.createDataFrame(adapted_rows)
                # Automatically samples rows to get schema
            outpath = os.path.join(TEST_TEMPDIR, 'rowdata_%s' % test_name)
            df.write.parquet(outpath)

            df2 = spark.read.parquet(outpath)
            decoded_wrapped_rows = df2.collect()

            if do_adaption:
                decoded_rows = [
                    RowAdapter.from_row(row) for row in decoded_wrapped_rows
                ]
                # We can't do assert sorted(rows) == sorted(decoded_rows)
                # because numpy syntactic sugar breaks __eq__, so use pprint,
                # which is safe for our tests
                import pprint

                def sorted_row_str(rowz):
                    if self._is_spark_2x():
                        # Spark 2.x has non-stable sorting semantics for Row
                        if len(rowz) > 1:
                            rowz = sorted(rowz, key=lambda r: r.id)
                        return pprint.pformat(rowz)
                    else:
                        return pprint.pformat(sorted(rowz))

                assert sorted_row_str(rows) == sorted_row_str(decoded_rows)

            return df
Example #7
    def test_rowadapter_cloudpickled_callable(self):
        from oarphpy.spark import CloudpickeledCallable

        def moof():
            return 'moof'

        cc_moof = CloudpickeledCallable(moof)
        assert cc_moof() == moof()

        cc_empty = CloudpickeledCallable()
        assert cc_empty == CloudpickeledCallable.empty()
        with pytest.raises(Exception):
            cc_empty()

        rows = [
            Row(id=0, f=cc_empty),
            Row(id=1, f=cc_moof),
        ]
        df = self._check_serialization(rows)
        assert _select_distinct(df, 'f.__pyclass__') == [
            'oarphpy.spark.CloudpickeledCallableData'
        ]

        # Ensure that invoking the functions still works
        from oarphpy.spark import RowAdapter
        decoded = [RowAdapter.from_row(r) for r in df.orderBy('id').collect()]
        with pytest.raises(Exception):
            decoded[0].f()
        assert decoded[1].f() == moof()
Example #8
    def test_rowadapter_numpy_packed(self):
        import numpy as np
        from oarphpy.spark import TENSOR_AUTO_PACK_MIN_KBYTES

        N = int(TENSOR_AUTO_PACK_MIN_KBYTES *
                (2**10) / np.dtype(int).itemsize) + 1
        expect_packed = np.reshape(np.array(range(2 * N)), (2, N))

        rows = [
            Row(id=0, x=expect_packed),
            Row(id=1, x=(expect_packed + 1)),
        ]
        df = self._check_serialization(rows)
        assert _select_distinct(df,
                                'x.__pyclass__') == ['oarphpy.spark.Tensor']

        # Verify that we actually have a column of packed values
        bin_data = df.select('*').first().x.values_packed
        assert len(
            bin_data) == expect_packed.size * expect_packed.dtype.itemsize
        # For ints, usually 8 bytes per int * 2 * N

        # _check_serialization() verifies that the data gets decoded as numpy
        # arrays, but just to make things visible in this test:
        from oarphpy.spark import RowAdapter
        decoded = [RowAdapter.from_row(r) for r in df.orderBy('id').collect()]
        np.testing.assert_equal(decoded[0].x, expect_packed)
        np.testing.assert_equal(decoded[1].x, expect_packed + 1)
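
Continuing from `df` and `expect_packed` above, a sketch of rebuilding the packed array by hand. This assumes `values_packed` holds the raw C-order bytes of the array (which the length check above suggests); the `shape` and `dtype` field names come from the Tensor schema asserted in the later schema-deduction examples.

# Sketch (assumption): decode the packed buffer directly with numpy.
t = df.orderBy('id').first().x
rebuilt = np.frombuffer(t.values_packed, dtype=np.dtype(t.dtype))
rebuilt = rebuilt.reshape(t.shape)
np.testing.assert_equal(rebuilt, expect_packed)  # matches row id=0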
Example #9
    def test_rowadapter_numpy_unpacked(self):
        import numpy as np

        # RowAdapter translates Numpy arrays to an oarphpy Tensor object that affords
        # SQL-based inspection for small arrays (and uses a more efficient row- or
        # column-major packed encoding for large arrays; see next test)
        rows = [
            Row(id=0, x=np.array([1, 2, 3])),
            Row(id=1, x=np.array([4, 5, 6])),
        ]
        df = self._check_serialization(rows)
        assert _select_distinct(df,
                                'x.__pyclass__') == ['oarphpy.spark.Tensor']
        EXPECTED = """
        id                                                     x
    0   0  (oarphpy.spark.Tensor, [3], int64, C, [1, 2, 3], [])
    1   1  (oarphpy.spark.Tensor, [3], int64, C, [4, 5, 6], [])
    """
        self._pandas_compare_str(df.orderBy('id').toPandas(), EXPECTED)

        # _check_serialization() verifies that the data gets decoded as numpy
        # arrays, but just to make things visible in this test:
        from oarphpy.spark import RowAdapter
        decoded = [RowAdapter.from_row(r) for r in df.orderBy('id').collect()]
        np.testing.assert_equal(decoded[0].x, np.array([1, 2, 3]))
        np.testing.assert_equal(decoded[1].x, np.array([4, 5, 6]))
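
Because small arrays are stored unpacked, the Tensor struct is directly queryable with ordinary Spark column expressions. A brief sketch continuing from `df` above (the `shape`, `dtype`, and `values` field names match the Tensor schema asserted in the later examples):

# Sketch: inspect the unpacked Tensor struct with plain column expressions.
df.select('id', 'x.shape', 'x.dtype', 'x.values').orderBy('id').show()
assert df.where(df['x.values'][0] == 4).count() == 1  # only row id=1 starts with 4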
Example #10
    def test_rowadapter_unslotted(self):

        # RowAdapter will deserialize and re-create Python objects, using the
        # `Unslotted` class defined at runtime.
        rows = [
            Row(id=0, x=Unslotted(v=4)),
            Row(id=1, x=Unslotted(v=5)),
        ]
        df = self._check_serialization(rows)

        # NB: The check above also verifies that `Unslotted` instances are
        # re-created and compares them via __eq__ or pprint.pformat().  But just
        # to make things visible in this test:
        from oarphpy.spark import RowAdapter
        decoded = sorted(RowAdapter.from_row(r) for r in df.collect())
        assert [r.x for r in decoded] == [Unslotted(v=4), Unslotted(v=5)]

        # RowAdapter records the (full) class name in the table as the
        # `__pyclass__` attribute of each value of the `x` column.
        assert _select_distinct(
            df, 'x.__pyclass__') == ['oarphpy_test.test_spark.Unslotted']

        # RowAdapter encodes objects as structs (even though in Python objects are
        # very dict-like).
        self._check_schema(rows, [
            ('id', 'bigint'),
            ('x', 'struct<__pyclass__:string,v:bigint>'),
        ])
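
Since objects are encoded as structs, their attributes are addressable as ordinary columns. A brief sketch continuing from `df` above (field names match the schema just asserted; not part of the original test):

# Sketch: query object attributes and the recorded class name directly.
assert sorted(r[0] for r in df.select('x.v').collect()) == [4, 5]
assert df.where(
    df['x.__pyclass__'] == 'oarphpy_test.test_spark.Unslotted').count() == 2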
Example #11
def test_row_adapter():
    import numpy as np

    from pyspark.sql import Row

    from oarphpy.spark import RowAdapter

    rows = [
        Row(
            id=1,
            np_number=np.float32(1.),
            a=np.array([1]),
            b={'foo': np.array([[1]], dtype=np.uint8)},
            c=[np.array([[[1.]], [[2.]], [[3.]]])],
            d=Slotted(foo=5, bar="abc", _not_hidden=1),
            e=[Slotted(foo=6, bar="def", _not_hidden=1)],
            f=Unslotted(meow=4, _not_hidden=1, __hidden=2),
            g=Unslotted(),  # Intentionally empty; adapter should set nothing
            h=Row(i=1, j=2),
        ),

        # Include a mostly empty row below to exercise Spark type validation.
        # Spark will ensure the row below and row above have the same schema;
        # note that `None` (or 'null') is only allowed for Struct / Row types.
        Row(
            id=2,
            np_number=np.float32(2.),
            a=np.array([]),
            b={},
            c=[],
            d=None,
            e=[],
            f=None,
            g=None,
            h=Row(i=3, j=3),
        ),
    ]

    with testutil.LocalSpark.sess() as spark:

        ## Test basic round-trip serialization and adaptation
        _check_serialization(spark, rows, 'basic')

        ## Test Schema Deduction
        mostly_empty = Row(
            id=2,
            np_number=None,
            a=None,
            b={},
            c=[],
            d=None,
            e=[],
            f=None,
            g=None,
            h=None,
        )
        mostly_empty_adapted = RowAdapter.to_row(mostly_empty)

        # Spark can't deduce schema from the empty-ish row ...
        with pytest.raises(ValueError) as excinfo:
            df = spark.createDataFrame([mostly_empty_adapted])
        assert "Some of types cannot be determined" in str(excinfo.value)

        # ... but this works if we tell it the schema!
        schema = RowAdapter.to_schema(rows[0])
        df = spark.createDataFrame([mostly_empty_adapted],
                                   schema=schema,
                                   verifySchema=False)

        EXPECTED_SCHEMA = [
            ('a',
             'struct<__pyclass__:string,dtype:string,order:string,shape:array<bigint>,values:array<bigint>>'
             ),
            ('b',
             'map<string,struct<__pyclass__:string,dtype:string,order:string,shape:array<bigint>,values:array<bigint>>>'
             ),
            ('c',
             'array<struct<__pyclass__:string,dtype:string,order:string,shape:array<bigint>,values:array<double>>>'
             ),
            ('d',
             'struct<__pyclass__:string,_not_hidden:bigint,bar:string,foo:bigint>'
             ),
            ('e',
             'array<struct<__pyclass__:string,_not_hidden:bigint,bar:string,foo:bigint>>'
             ),
            ('f', 'struct<__pyclass__:string,_not_hidden:bigint,meow:bigint>'),
            ('g', 'struct<__pyclass__:string>'),
            ('h', 'struct<i:bigint,j:bigint>'),
            ('id', 'bigint'),
            ('np_number', 'double'),
        ]
        assert df.dtypes == EXPECTED_SCHEMA

        # Check that pyspark retains the empty values in `mostly_empty`
        for colname in sorted(df.columns):
            values = df.select(colname).collect()
            assert len(values) == 1
            assert mostly_empty[colname] == values[0][colname]

        # ... and we can also read/write the empty-ish row!
        _check_serialization(spark, [mostly_empty],
                             'with_schema',
                             schema=schema)
Example #12
    def test_rowadapter_complex(self):
        from oarphpy.spark import RowAdapter

        # A large-ish example that covers the above cases in aggregate
        rows = [
            Row(
                id=1,
                np_number=np.float32(1.),
                a=np.array([1]),
                b={'foo': np.array([[1]], dtype=np.uint8)},
                c=[np.array([[[1.]], [[2.]], [[3.]]])],
                d=Slotted(foo=5, bar="abc", _not_hidden=1),
                e=[Slotted(foo=6, bar="def", _not_hidden=1)],
                f=Unslotted(meow=4, _not_hidden=1, __hidden=2),
                g=Unslotted(),  # Intentionally empty; adapter should set nothing
                h=Row(i=1, j=2),
            ),

            # Include a mostly empty row below to exercise Spark type validation.
            # Spark will ensure the row below and row above have the same schema;
            # note that `None` (or 'null') is only allowed for Struct / Row types.
            Row(
                id=2,
                np_number=np.float32(2.),
                a=np.array([]),
                b={},
                c=[],
                d=None,
                e=[],
                f=None,
                g=None,
                h=Row(i=3, j=3),
            ),
        ]

        df = self._check_serialization(rows)
        EXPECTED_ALL = """
                                                                                  0                                                1
    id                                                                            1                                                2
    np_number                                                                   1.0                                              2.0
    a                                (oarphpy.spark.Tensor, [1], int64, C, [1], [])  (oarphpy.spark.Tensor, [0], float64, C, [], [])
    b              {'foo': ('oarphpy.spark.Tensor', [1, 1], 'uint8', 'C', [1], [])}                                               {}
    c          [(oarphpy.spark.Tensor, [3, 1, 1], float64, C, [1.0, 2.0, 3.0], [])]                                               []
    d                                  (oarphpy_test.test_spark.Slotted, 5, abc, 1)                                             None
    e                                [(oarphpy_test.test_spark.Slotted, 6, def, 1)]                                               []
    f                                     (oarphpy_test.test_spark.Unslotted, 4, 1)                                             None
    g                                          (oarphpy_test.test_spark.Unslotted,)                                             None
    h                                                                        (1, 2)                                           (3, 3)
    """

        # DEPRECATED: pyspark 2.x is deprecated
        import pyspark
        if pyspark.__version__.startswith('2.'):
            EXPECTED_ALL = """
                                                                                  0                                                1
      id                                                                            1                                                2
      np_number                                                                     1                                                2
      a                                (oarphpy.spark.Tensor, [1], int64, C, [1], [])  (oarphpy.spark.Tensor, [0], float64, C, [], [])
      b              {'foo': ('oarphpy.spark.Tensor', [1, 1], 'uint8', 'C', [1], [])}                                               {}
      c          [(oarphpy.spark.Tensor, [3, 1, 1], float64, C, [1.0, 2.0, 3.0], [])]                                               []
      d                                  (oarphpy_test.test_spark.Slotted, 5, abc, 1)                                             None
      e                                [(oarphpy_test.test_spark.Slotted, 6, def, 1)]                                               []
      f                                     (oarphpy_test.test_spark.Unslotted, 4, 1)                                             None
      g                                          (oarphpy_test.test_spark.Unslotted,)                                             None
      h                                                                        (1, 2)                                           (3, 3)
      """

        self._pandas_compare_str(df.orderBy('id').toPandas().T, EXPECTED_ALL)

        # Test Schema Deduction
        mostly_empty = Row(
            id=2,
            np_number=None,
            a=None,
            b={},
            c=[],
            d=None,
            e=[],
            f=None,
            g=None,
            h=None,
        )
        mostly_empty_adapted = RowAdapter.to_row(mostly_empty)

        # Spark can't deduce schema from the empty-ish row ...
        with pytest.raises(ValueError) as excinfo:
            self._check_serialization([mostly_empty_adapted],
                                      do_adaption=False)
        assert "Some of types cannot be determined" in str(excinfo.value)

        # ... but this works if we tell it the schema!
        schema = RowAdapter.to_schema(rows[0])
        self._check_serialization([mostly_empty_adapted], schema=schema)

        # Let's check that RowAdapter schema deduction works as expected
        EXPECTED_SCHEMA = [
            ('id', 'bigint'),
            ('np_number', 'double'),
            ('a',
             'struct<__pyclass__:string,shape:array<bigint>,dtype:string,order:string,values:array<bigint>,values_packed:binary>'
             ),
            ('b',
             'map<string,struct<__pyclass__:string,shape:array<bigint>,dtype:string,order:string,values:array<bigint>,values_packed:binary>>'
             ),
            ('c',
             'array<struct<__pyclass__:string,shape:array<bigint>,dtype:string,order:string,values:array<double>,values_packed:binary>>'
             ),
            ('d',
             'struct<__pyclass__:string,foo:bigint,bar:string,_not_hidden:bigint>'
             ),
            ('e',
             'array<struct<__pyclass__:string,foo:bigint,bar:string,_not_hidden:bigint>>'
             ),
            ('f', 'struct<__pyclass__:string,meow:bigint,_not_hidden:bigint>'),
            ('g', 'struct<__pyclass__:string>'),
            ('h', 'struct<i:bigint,j:bigint>'),
        ]
        self._check_schema(rows, EXPECTED_SCHEMA)