Example #1
    def test_rowadapter_cloudpickled_callable(self):
        from oarphpy.spark import CloudpickeledCallable

        def moof():
            return 'moof'

        cc_moof = CloudpickeledCallable(moof)
        assert cc_moof() == moof()

        cc_empty = CloudpickeledCallable()
        assert cc_empty == CloudpickeledCallable.empty()
        with pytest.raises(Exception):
            cc_empty()

        rows = [
            Row(id=0, f=cc_empty),
            Row(id=1, f=cc_moof),
        ]
        df = self._check_serialization(rows)
        assert _select_distinct(df, 'f.__pyclass__') == [
            'oarphpy.spark.CloudpickeledCallableData'
        ]

        # Ensuring invoking the functions still works
        from oarphpy.spark import RowAdapter
        decoded = [RowAdapter.from_row(r) for r in df.orderBy('id').collect()]
        with pytest.raises(Exception):
            decoded[0].f()
        assert decoded[1].f() == moof()
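The _select_distinct helper used here (and in several of the examples below) is not shown on this page. A minimal sketch of what it might look like, inferred only from how the tests call it; the implementation below is an assumption, just the name and call pattern come from the examples:

def _select_distinct(df, colname):
    # Hypothetical implementation: collect the distinct values of one
    # (possibly nested) column, e.g. 'f.__pyclass__', as a sorted list.
    return sorted(r[0] for r in df.select(colname).distinct().collect())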
Example #2
def _check_serialization(spark, rows, testname, schema=None):
    from oarphpy import util
    from oarphpy.spark import RowAdapter

    TEST_TEMPDIR = testutil.test_tempdir('spark_row_adapter_test')

    adapted_rows = [RowAdapter.to_row(r) for r in rows]
    if schema:
        df = spark.createDataFrame(adapted_rows,
                                   schema=schema,
                                   verifySchema=False)
        # verifySchema is expensive and improperly errors on mostly empty rows
    else:
        df = spark.createDataFrame(adapted_rows)
        # Automatically samples rows to get schema
    outpath = os.path.join(TEST_TEMPDIR, 'rowdata_%s' % testname)
    df.write.parquet(outpath)

    df2 = spark.read.parquet(outpath)
    decoded_wrapped_rows = df2.collect()

    decoded_rows = [RowAdapter.from_row(row) for row in decoded_wrapped_rows]

    # We can't do assert sorted(rows) == sorted(decoded_rows)
    # because numpy syntactic sugar breaks ==
    import pprint

    def sorted_row_str(rowz):
        return pprint.pformat(sorted(rowz, key=lambda row: row['id']))

    assert sorted_row_str(rows) == sorted_row_str(decoded_rows)
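The pprint-based comparison is needed because numpy overloads == to be elementwise, so rows containing arrays cannot be compared (or sorted) with plain equality. A quick standalone illustration of the problem:

import numpy as np

a = np.array([1, 2, 3])
b = np.array([1, 2, 3])

print(a == b)   # elementwise comparison: [ True  True  True], not a single bool

# Using that result where Python needs a single bool (e.g. a bare assert) raises:
#   ValueError: The truth value of an array with more than one element is ambiguous.
# Comparing pprint.pformat() strings of the rows sidesteps the issue entirely.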
Example #3
    def test_rowadapter_numpy_unpacked(self):
        import numpy as np

        # RowAdapter translates Numpy arrays to an oarphpy Tensor object that affords
        # SQL-based inspection for small arrays (and uses a more efficient row- or
        # column-major packed encoding for large arrays; see next test)
        rows = [
            Row(id=0, x=np.array([1, 2, 3])),
            Row(id=1, x=np.array([4, 5, 6])),
        ]
        df = self._check_serialization(rows)
        assert _select_distinct(df,
                                'x.__pyclass__') == ['oarphpy.spark.Tensor']
        EXPECTED = """
        id                                                     x
    0   0  (oarphpy.spark.Tensor, [3], int64, C, [1, 2, 3], [])
    1   1  (oarphpy.spark.Tensor, [3], int64, C, [4, 5, 6], [])
    """
        self._pandas_compare_str(df.orderBy('id').toPandas(), EXPECTED)

        # _check_serialization() verifies that the data gets decoded as numpy
        # arrays, but just to make things visible in this test:
        from oarphpy.spark import RowAdapter
        decoded = [RowAdapter.from_row(r) for r in df.orderBy('id').collect()]
        np.testing.assert_equal(decoded[0].x, np.array([1, 2, 3]))
        np.testing.assert_equal(decoded[1].x, np.array([4, 5, 6]))
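Because small arrays keep their values in an ordinary array column, the encoded tensors can be inspected directly with the DataFrame API (or SQL). A hypothetical sketch, continuing from the df above; the 'values' field name is inferred from the Tensor tuple in EXPECTED and from values_packed in the next example, so treat it as an assumption:

from pyspark.sql import functions as F

# Select rows whose first tensor element is at least 4; expect only id=1.
small = df.filter(F.col('x.values')[0] >= 4).select('id', 'x.values')
small.show()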
Example #4
    def test_rowadapter_numpy_packed(self):
        import numpy as np
        from oarphpy.spark import TENSOR_AUTO_PACK_MIN_KBYTES

        N = int(TENSOR_AUTO_PACK_MIN_KBYTES *
                (2**10) / np.dtype(int).itemsize) + 1
        expect_packed = np.reshape(np.array(range(2 * N)), (2, N))

        rows = [
            Row(id=0, x=expect_packed),
            Row(id=1, x=(expect_packed + 1)),
        ]
        df = self._check_serialization(rows)
        assert _select_distinct(df,
                                'x.__pyclass__') == ['oarphpy.spark.Tensor']

        # Verify that we actually have a column of packed values
        bin_data = df.select('*').first().x.values_packed
        assert len(
            bin_data) == expect_packed.size * expect_packed.dtype.itemsize
        # For ints, usually 8 bytes per int * 2 * N

        # _check_serialization() verifies that the data gets decoded as numpy
        # arrays, but just to make things visible in this test:
        from oarphpy.spark import RowAdapter
        decoded = [RowAdapter.from_row(r) for r in df.orderBy('id').collect()]
        np.testing.assert_equal(decoded[0].x, expect_packed)
        np.testing.assert_equal(decoded[1].x, expect_packed + 1)
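For illustration only: the length check above suggests values_packed is the array's raw buffer, so the original array can also be reconstructed by hand (RowAdapter.from_row does this for you). The dtype and C order below are taken from the previous example's output and are assumptions:

import numpy as np

t = df.orderBy('id').first().x
arr = np.frombuffer(bytes(t.values_packed), dtype=np.int64)  # assumes int64, C-order buffer
np.testing.assert_equal(arr.reshape(expect_packed.shape), expect_packed)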
Example #5
    def test_rowadapter_unslotted(self):

        # RowAdapter will deserialize and re-create Python objects, using the
        # `Unslotted` class defined at runtime.
        rows = [
            Row(id=0, x=Unslotted(v=4)),
            Row(id=1, x=Unslotted(v=5)),
        ]
        df = self._check_serialization(rows)

        # NB: The check above also checks that `Unslotted` instances are created
        # and checks equality via __eq__ or pprint.pformat().  But just to make
        # things visible in this test:
        from oarphpy.spark import RowAdapter
        decoded = sorted(RowAdapter.from_row(r) for r in df.collect())
        assert [r.x for r in decoded] == [Unslotted(v=4), Unslotted(v=5)]

        # RowAdapter records the (full) class name in the table as the
        # `__pyclass__` attribute of each value of the `x` column.
        assert _select_distinct(
            df, 'x.__pyclass__') == ['oarphpy_test.test_spark.Unslotted']

        # RowAdapter encodes objects as structs (even though in Python objects are
        # very dict-like).
        self._check_schema(rows, [
            ('id', 'bigint'),
            ('x', 'struct<__pyclass__:string,v:bigint>'),
        ])
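The Unslotted fixture is defined elsewhere in the test module (oarphpy_test.test_spark) and is not shown on this page. A minimal sketch of what such a class might look like, based only on how it is used above (plain attributes in __dict__, no __slots__, value-based equality); the actual fixture may differ:

class Unslotted(object):
    # Hypothetical fixture: an ordinary class whose attributes live in
    # __dict__, which is what lets RowAdapter discover and encode them.
    def __init__(self, **kwargs):
        for k, v in kwargs.items():
            setattr(self, k, v)

    def __eq__(self, other):
        return isinstance(other, Unslotted) and self.__dict__ == other.__dict__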
Example #6
    def test_nonadapted_input(self):
        from oarphpy.spark import RowAdapter

        # RowAdapter leaves bare data input unchanged
        BARE_VALUES = True, 1, 1.0, "moof", bytes(b"moof")
        for datum in BARE_VALUES:
            assert RowAdapter.to_row(datum) == datum
            assert RowAdapter.from_row(datum) == datum
Example #7
    def _check_raw_adaption(self, raw_expected):
        from oarphpy.spark import RowAdapter

        for raw_data, expected_row in raw_expected:
            actual_row = RowAdapter.to_row(raw_data)
            assert actual_row == expected_row

            actual_data = RowAdapter.from_row(expected_row)
            assert actual_data == raw_data
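A hypothetical call, just to make the (raw_data, expected_row) pair contract concrete; per the previous example's bare-value behavior, these inputs adapt to themselves:

self._check_raw_adaption([
    (1, 1),              # bare values adapt to themselves
    ("moof", "moof"),
])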
Example #8
    def test_pesky_numpy(self):
        import numpy as np
        from oarphpy.spark import RowAdapter

        # RowAdapter translates pesky numpy-boxed numbers ...
        assert RowAdapter.to_row(Row(x=np.float32(1.))) == Row(x=1.)

        # ... but only one way! In practice, just don't save boxed numbers in rows.
        assert RowAdapter.from_row(
            Row(x=np.float32(1.))) == Row(x=np.float32(1.))
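In practice the easy fix is to unbox numpy scalars before building rows, e.g. with .item() (or float()/int()):

import numpy as np
from pyspark.sql import Row

boxed = np.float32(1.)
row = Row(x=boxed.item())   # .item() converts the numpy scalar to a plain Python float
assert isinstance(row.x, float) and row.x == 1.0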
Example #9
    def _check_serialization(self, rows, schema=None, do_adaption=True):
        import inspect
        from oarphpy import util
        from oarphpy.spark import RowAdapter

        test_name = inspect.stack()[1][3]

        TEST_TEMPDIR = testutil.test_tempdir('TestRowAdapter.' + test_name)

        if do_adaption:
            adapted_rows = [RowAdapter.to_row(r) for r in rows]
        else:
            adapted_rows = rows

        with testutil.LocalSpark.sess() as spark:
            if schema:
                df = spark.createDataFrame(adapted_rows,
                                           schema=schema,
                                           verifySchema=False)
                # verifySchema is expensive and improperly errors on mostly
                # empty rows
            else:
                df = spark.createDataFrame(adapted_rows)
                # Automatically samples rows to get schema
            outpath = os.path.join(TEST_TEMPDIR, 'rowdata_%s' % test_name)
            df.write.parquet(outpath)

            df2 = spark.read.parquet(outpath)
            decoded_wrapped_rows = df2.collect()

            if do_adaption:
                decoded_rows = [
                    RowAdapter.from_row(row) for row in decoded_wrapped_rows
                ]
                # We can't do assert sorted(rows) == sorted(decoded_rows)
                # because numpy syntactic sugar breaks __eq__, so use pprint,
                # which is safe for our tests
                import pprint

                def sorted_row_str(rowz):
                    if self._is_spark_2x():
                        # Spark 2.x has non-stable sorting semantics for Row
                        if len(rowz) > 1:
                            rowz = sorted(rowz, key=lambda r: r.id)
                        return pprint.pformat(rowz)
                    else:
                        return pprint.pformat(sorted(rowz))

                assert sorted_row_str(rows) == sorted_row_str(decoded_rows)

            return df
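The _check_schema helper called in the Unslotted example above is also not shown on this page. A hypothetical sketch, inferred from its call site; the expected pairs match the format of pyspark's DataFrame.dtypes:

    def _check_schema(self, rows, expected_dtypes):
        from oarphpy.spark import RowAdapter

        adapted_rows = [RowAdapter.to_row(r) for r in rows]
        with testutil.LocalSpark.sess() as spark:
            df = spark.createDataFrame(adapted_rows)
            # DataFrame.dtypes is a list of (column, type-string) pairs, e.g.
            # ('x', 'struct<__pyclass__:string,v:bigint>')
            assert df.dtypes == expected_dtypes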