def _check_schema(self, rows, expected_schema):
  from oarphpy.spark import RowAdapter

  schema = RowAdapter.to_schema(rows[0])
  adapted_rows = [RowAdapter.to_row(r) for r in rows]
  with testutil.LocalSpark.sess() as spark:
    # verifySchema is expensive and improperly errors on mostly empty rows
    df = spark.createDataFrame(
            adapted_rows, schema=schema, verifySchema=False)
    if self._is_spark_2x():
      # Spark 2.x returns schema values in a different order, so we do a
      # more flexible test
      def tokenize(s):
        import re
        return sorted(re.split('[<>,]+', s))
      actual = dict((col, tokenize(s)) for col, s in df.dtypes)
      expected = dict((col, tokenize(s)) for col, s in expected_schema)
      assert actual == expected
    else:
      # Tests are written for Spark 3.x
      assert df.dtypes == expected_schema
  return schema

def _check_serialization(spark, rows, testname, schema=None):
  from oarphpy import util
  from oarphpy.spark import RowAdapter

  TEST_TEMPDIR = testutil.test_tempdir('spark_row_adapter_test')

  adapted_rows = [RowAdapter.to_row(r) for r in rows]
  if schema:
    # verifySchema is expensive and improperly errors on mostly empty rows
    df = spark.createDataFrame(
            adapted_rows, schema=schema, verifySchema=False)
  else:
    # Automatically samples the rows to deduce a schema
    df = spark.createDataFrame(adapted_rows)

  outpath = os.path.join(TEST_TEMPDIR, 'rowdata_%s' % testname)
  df.write.parquet(outpath)

  df2 = spark.read.parquet(outpath)
  decoded_wrapped_rows = df2.collect()
  decoded_rows = [RowAdapter.from_row(row) for row in decoded_wrapped_rows]

  # We can't simply assert sorted(rows) == sorted(decoded_rows) because
  # numpy syntactic sugar breaks ==, so compare pretty-printed strings
  import pprint
  def sorted_row_str(rowz):
    return pprint.pformat(sorted(rowz, key=lambda row: row['id']))
  assert sorted_row_str(rows) == sorted_row_str(decoded_rows)

def test_nonadapted_input(self):
  from oarphpy.spark import RowAdapter

  # RowAdapter leaves bare data input unchanged
  BARE_VALUES = (True, 1, 1.0, "moof", bytes(b"moof"))
  for datum in BARE_VALUES:
    assert RowAdapter.to_row(datum) == datum
    assert RowAdapter.from_row(datum) == datum

def _check_raw_adaption(self, raw_expected):
  from oarphpy.spark import RowAdapter
  for raw_data, expected_row in raw_expected:
    actual_row = RowAdapter.to_row(raw_data)
    assert actual_row == expected_row
    actual_data = RowAdapter.from_row(expected_row)
    assert actual_data == raw_data

def test_pesky_numpy(self):
  import numpy as np
  from oarphpy.spark import RowAdapter

  # RowAdapter translates pesky numpy-boxed numbers ...
  assert RowAdapter.to_row(Row(x=np.float32(1.))) == Row(x=1.)

  # ... but only one way!  In practice, just don't save boxed numbers in
  # rows.
  assert RowAdapter.from_row(Row(x=np.float32(1.))) == Row(x=np.float32(1.))

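# The test above shows that boxed numpy scalars only adapt one way.  As a
# minimal sketch (an assumption, not oarphpy API; the helper name
# `_unbox_np_scalar` is hypothetical), callers can unbox numpy scalars
# themselves before building rows: every numpy scalar type derives from
# `np.generic`, and `.item()` returns the equivalent pure-Python value.
def _unbox_np_scalar(v):
  import numpy as np
  # e.g. np.float32(1.) -> 1.0 (a plain Python float); non-numpy values
  # pass through unchanged
  return v.item() if isinstance(v, np.generic) else v

# Usage sketch: Row(x=_unbox_np_scalar(np.float32(1.))) == Row(x=1.0)
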
def _check_serialization(self, rows, schema=None, do_adaption=True):
  import inspect
  from oarphpy import util
  from oarphpy.spark import RowAdapter

  test_name = inspect.stack()[1][3]
  TEST_TEMPDIR = testutil.test_tempdir('TestRowAdapter.' + test_name)

  if do_adaption:
    adapted_rows = [RowAdapter.to_row(r) for r in rows]
  else:
    adapted_rows = rows

  with testutil.LocalSpark.sess() as spark:
    if schema:
      # verifySchema is expensive and improperly errors on mostly empty
      # rows
      df = spark.createDataFrame(
              adapted_rows, schema=schema, verifySchema=False)
    else:
      # Automatically samples the rows to deduce a schema
      df = spark.createDataFrame(adapted_rows)

    outpath = os.path.join(TEST_TEMPDIR, 'rowdata_%s' % test_name)
    df.write.parquet(outpath)

    df2 = spark.read.parquet(outpath)
    decoded_wrapped_rows = df2.collect()

    if do_adaption:
      decoded_rows = [
        RowAdapter.from_row(row) for row in decoded_wrapped_rows
      ]

      # We can't simply assert sorted(rows) == sorted(decoded_rows) because
      # numpy syntactic sugar breaks __eq__, so use pprint, which is safe
      # for our tests
      import pprint
      def sorted_row_str(rowz):
        if self._is_spark_2x():
          # Spark 2.x has non-stable sorting semantics for Row
          if len(rowz) > 1:
            rowz = sorted(rowz, key=lambda r: r.id)
          return pprint.pformat(rowz)
        else:
          return pprint.pformat(sorted(rowz))
      assert sorted_row_str(rows) == sorted_row_str(decoded_rows)

    return df

def test_rowadapter_cloudpickled_callable(self):
  from oarphpy.spark import CloudpickeledCallable

  def moof():
    return 'moof'

  cc_moof = CloudpickeledCallable(moof)
  assert cc_moof() == moof()

  cc_empty = CloudpickeledCallable()
  assert cc_empty == CloudpickeledCallable.empty()
  with pytest.raises(Exception):
    cc_empty()

  rows = [
    Row(id=0, f=cc_empty),
    Row(id=1, f=cc_moof),
  ]
  df = self._check_serialization(rows)
  assert _select_distinct(df, 'f.__pyclass__') == [
    'oarphpy.spark.CloudpickeledCallableData'
  ]

  # Ensure that invoking the decoded functions still works
  from oarphpy.spark import RowAdapter
  decoded = [RowAdapter.from_row(r) for r in df.orderBy('id').collect()]
  with pytest.raises(Exception):
    decoded[0].f()
  assert decoded[1].f() == moof()

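# For context, a minimal sketch of the mechanism that (we assume) underlies
# CloudpickeledCallable: cloudpickle can serialize even locally-defined
# functions to bytes and restore them as callables, which plain pickle
# cannot do for non-module-level functions.  This does not depict oarphpy's
# internal field layout.
def _cloudpickle_roundtrip_sketch():
  import cloudpickle

  def local_fn():
    return 'moof'

  buf = cloudpickle.dumps(local_fn)   # callable -> bytes
  restored = cloudpickle.loads(buf)   # bytes -> callable
  assert restored() == 'moof'
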
def test_rowadapter_numpy_packed(self):
  import numpy as np
  from oarphpy.spark import TENSOR_AUTO_PACK_MIN_KBYTES

  N = int(TENSOR_AUTO_PACK_MIN_KBYTES * (2**10) / np.dtype(int).itemsize) + 1
  expect_packed = np.reshape(np.array(range(2 * N)), (2, N))
  rows = [
    Row(id=0, x=expect_packed),
    Row(id=1, x=(expect_packed + 1)),
  ]
  df = self._check_serialization(rows)
  assert _select_distinct(df, 'x.__pyclass__') == ['oarphpy.spark.Tensor']

  # Verify that we actually have a column of packed values
  bin_data = df.select('*').first().x.values_packed
  # For ints, this is usually 8 bytes per int * 2 * N
  assert len(bin_data) == expect_packed.size * expect_packed.dtype.itemsize

  # _check_serialization() verifies that the data gets decoded as numpy
  # arrays, but just to make things visible in this test:
  from oarphpy.spark import RowAdapter
  decoded = [RowAdapter.from_row(r) for r in df.orderBy('id').collect()]
  np.testing.assert_equal(decoded[0].x, expect_packed)
  np.testing.assert_equal(decoded[1].x, expect_packed + 1)

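# A minimal sketch of predicting whether an array gets the packed encoding.
# The exact threshold comparison (>= vs >) is inferred from the test above,
# and the helper name `_expect_packed_encoding` is hypothetical.
def _expect_packed_encoding(arr):
  from oarphpy.spark import TENSOR_AUTO_PACK_MIN_KBYTES
  # Packing kicks in once the array's byte size reaches
  # TENSOR_AUTO_PACK_MIN_KBYTES kilobytes
  return arr.nbytes >= TENSOR_AUTO_PACK_MIN_KBYTES * (2**10)
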
def test_rowadapter_numpy_unpacked(self):
  import numpy as np

  # RowAdapter translates numpy arrays to an oarphpy Tensor object that
  # affords SQL-based inspection for small arrays (and uses a more efficient
  # row- or column-major packed encoding for large arrays; see the previous
  # test)
  rows = [
    Row(id=0, x=np.array([1, 2, 3])),
    Row(id=1, x=np.array([4, 5, 6])),
  ]
  df = self._check_serialization(rows)
  assert _select_distinct(df, 'x.__pyclass__') == ['oarphpy.spark.Tensor']

  EXPECTED = """
     id                                                      x
  0   0  (oarphpy.spark.Tensor, [3], int64, C, [1, 2, 3], [])
  1   1  (oarphpy.spark.Tensor, [3], int64, C, [4, 5, 6], [])
  """
  self._pandas_compare_str(df.orderBy('id').toPandas(), EXPECTED)

  # _check_serialization() verifies that the data gets decoded as numpy
  # arrays, but just to make things visible in this test:
  from oarphpy.spark import RowAdapter
  decoded = [RowAdapter.from_row(r) for r in df.orderBy('id').collect()]
  np.testing.assert_equal(decoded[0].x, np.array([1, 2, 3]))
  np.testing.assert_equal(decoded[1].x, np.array([4, 5, 6]))

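# To make the Tensor adaption visible in isolation, a tiny standalone sketch
# (assuming to_row() / from_row() are inverses outside of Spark, as the
# bare-value tests above suggest): a small array round-trips with no Spark
# session at all.
def _tensor_roundtrip_sketch():
  import numpy as np
  from oarphpy.spark import RowAdapter
  adapted = RowAdapter.to_row(Row(x=np.array([1, 2, 3])))
  decoded = RowAdapter.from_row(adapted)
  np.testing.assert_equal(decoded.x, np.array([1, 2, 3]))
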
def test_rowadapter_unslotted(self):
  # RowAdapter will deserialize and re-create Python objects, using the
  # `Unslotted` class defined at runtime.
  rows = [
    Row(id=0, x=Unslotted(v=4)),
    Row(id=1, x=Unslotted(v=5)),
  ]
  df = self._check_serialization(rows)

  # NB: The check above also checks that `Unslotted` instances are created
  # and checks equality via __eq__ or pprint.pformat().  But just to make
  # things visible in this test:
  from oarphpy.spark import RowAdapter
  decoded = sorted(RowAdapter.from_row(r) for r in df.collect())
  assert [r.x for r in decoded] == [Unslotted(v=4), Unslotted(v=5)]

  # RowAdapter records the (full) class name in the table as the
  # `__pyclass__` attribute of each value of the `x` column.
  assert _select_distinct(
    df, 'x.__pyclass__') == ['oarphpy_test.test_spark.Unslotted']

  # RowAdapter encodes objects as structs (even though in Python objects
  # are very dict-like).
  self._check_schema(rows, [
    ('id', 'bigint'),
    ('x', 'struct<__pyclass__:string,v:bigint>'),
  ])

def test_row_adapter():
  import numpy as np
  from pyspark.sql import Row
  from oarphpy.spark import RowAdapter

  rows = [
    Row(
      id=1,
      np_number=np.float32(1.),
      a=np.array([1]),
      b={'foo': np.array([[1]], dtype=np.uint8)},
      c=[np.array([[[1.]], [[2.]], [[3.]]])],
      d=Slotted(foo=5, bar="abc", _not_hidden=1),
      e=[Slotted(foo=6, bar="def", _not_hidden=1)],
      f=Unslotted(meow=4, _not_hidden=1, __hidden=2),
      g=Unslotted(),  # Intentionally empty; adapter should set nothing
      h=Row(i=1, j=2),
    ),

    # Include a mostly empty row below to exercise Spark type validation.
    # Spark will ensure the row below and row above have the same schema;
    # note that `None` (or 'null') is only allowed for Struct / Row types.
    Row(
      id=2,
      np_number=np.float32(2.),
      a=np.array([]),
      b={},
      c=[],
      d=None,
      e=[],
      f=None,
      g=None,
      h=Row(i=3, j=3),
    ),
  ]

  with testutil.LocalSpark.sess() as spark:
    ## Test basic round-trip serialization and adaptation
    _check_serialization(spark, rows, 'basic')

    ## Test Schema Deduction
    mostly_empty = Row(
      id=2,
      np_number=None,
      a=None,
      b={},
      c=[],
      d=None,
      e=[],
      f=None,
      g=None,
      h=None,
    )
    mostly_empty_adapted = RowAdapter.to_row(mostly_empty)

    # Spark can't deduce schema from the empty-ish row ...
    with pytest.raises(ValueError) as excinfo:
      df = spark.createDataFrame([mostly_empty_adapted])
    assert "Some of types cannot be determined" in str(excinfo.value)

    # ... but this works if we tell it the schema!
    schema = RowAdapter.to_schema(rows[0])
    df = spark.createDataFrame(
          [mostly_empty_adapted], schema=schema, verifySchema=False)

    EXPECTED_SCHEMA = [
      ('a',
       'struct<__pyclass__:string,dtype:string,order:string,shape:array<bigint>,values:array<bigint>>'),
      ('b',
       'map<string,struct<__pyclass__:string,dtype:string,order:string,shape:array<bigint>,values:array<bigint>>>'),
      ('c',
       'array<struct<__pyclass__:string,dtype:string,order:string,shape:array<bigint>,values:array<double>>>'),
      ('d',
       'struct<__pyclass__:string,_not_hidden:bigint,bar:string,foo:bigint>'),
      ('e',
       'array<struct<__pyclass__:string,_not_hidden:bigint,bar:string,foo:bigint>>'),
      ('f', 'struct<__pyclass__:string,_not_hidden:bigint,meow:bigint>'),
      ('g', 'struct<__pyclass__:string>'),
      ('h', 'struct<i:bigint,j:bigint>'),
      ('id', 'bigint'),
      ('np_number', 'double'),
    ]
    assert df.dtypes == EXPECTED_SCHEMA

    # Check that pyspark retains the empty values in `mostly_empty`
    for colname in sorted(df.columns):
      values = df.select(colname).collect()
      assert len(values) == 1
      assert mostly_empty[colname] == values[0][colname]

    # ... and we can also read/write the empty-ish row!
    _check_serialization(spark, [mostly_empty], 'with_schema', schema=schema)

def test_rowadapter_complex(self):
  import numpy as np
  from oarphpy.spark import RowAdapter

  # A large-ish example that covers the above cases in aggregate
  rows = [
    Row(
      id=1,
      np_number=np.float32(1.),
      a=np.array([1]),
      b={'foo': np.array([[1]], dtype=np.uint8)},
      c=[np.array([[[1.]], [[2.]], [[3.]]])],
      d=Slotted(foo=5, bar="abc", _not_hidden=1),
      e=[Slotted(foo=6, bar="def", _not_hidden=1)],
      f=Unslotted(meow=4, _not_hidden=1, __hidden=2),
      g=Unslotted(),  # Intentionally empty; adapter should set nothing
      h=Row(i=1, j=2),
    ),

    # Include a mostly empty row below to exercise Spark type validation.
    # Spark will ensure the row below and row above have the same schema;
    # note that `None` (or 'null') is only allowed for Struct / Row types.
    Row(
      id=2,
      np_number=np.float32(2.),
      a=np.array([]),
      b={},
      c=[],
      d=None,
      e=[],
      f=None,
      g=None,
      h=Row(i=3, j=3),
    ),
  ]
  df = self._check_serialization(rows)

  EXPECTED_ALL = """
                                                                      0                                                 1
  id                                                                  1                                                 2
  np_number                                                         1.0                                               2.0
  a                       (oarphpy.spark.Tensor, [1], int64, C, [1], [])   (oarphpy.spark.Tensor, [0], float64, C, [], [])
  b     {'foo': ('oarphpy.spark.Tensor', [1, 1], 'uint8', 'C', [1], [])}                                                {}
  c  [(oarphpy.spark.Tensor, [3, 1, 1], float64, C, [1.0, 2.0, 3.0], [])]                                               []
  d                          (oarphpy_test.test_spark.Slotted, 5, abc, 1)                                             None
  e                        [(oarphpy_test.test_spark.Slotted, 6, def, 1)]                                               []
  f                             (oarphpy_test.test_spark.Unslotted, 4, 1)                                             None
  g                                  (oarphpy_test.test_spark.Unslotted,)                                             None
  h                                                                (1, 2)                                            (3, 3)
  """

  # DEPRECATED: pyspark 2.x is deprecated
  import pyspark
  if pyspark.__version__.startswith('2.'):
    EXPECTED_ALL = """
                                                                        0                                                 1
    id                                                                  1                                                 2
    np_number                                                           1                                                 2
    a                       (oarphpy.spark.Tensor, [1], int64, C, [1], [])   (oarphpy.spark.Tensor, [0], float64, C, [], [])
    b     {'foo': ('oarphpy.spark.Tensor', [1, 1], 'uint8', 'C', [1], [])}                                                {}
    c  [(oarphpy.spark.Tensor, [3, 1, 1], float64, C, [1.0, 2.0, 3.0], [])]                                               []
    d                          (oarphpy_test.test_spark.Slotted, 5, abc, 1)                                             None
    e                        [(oarphpy_test.test_spark.Slotted, 6, def, 1)]                                               []
    f                             (oarphpy_test.test_spark.Unslotted, 4, 1)                                             None
    g                                  (oarphpy_test.test_spark.Unslotted,)                                             None
    h                                                                (1, 2)                                            (3, 3)
    """

  self._pandas_compare_str(df.orderBy('id').toPandas().T, EXPECTED_ALL)

  ## Test Schema Deduction
  mostly_empty = Row(
    id=2,
    np_number=None,
    a=None,
    b={},
    c=[],
    d=None,
    e=[],
    f=None,
    g=None,
    h=None,
  )
  mostly_empty_adapted = RowAdapter.to_row(mostly_empty)

  # Spark can't deduce schema from the empty-ish row ...
  with pytest.raises(ValueError) as excinfo:
    self._check_serialization([mostly_empty_adapted], do_adaption=False)
  assert "Some of types cannot be determined" in str(excinfo.value)

  # ... but this works if we tell it the schema!
  schema = RowAdapter.to_schema(rows[0])
  self._check_serialization([mostly_empty_adapted], schema=schema)

  # Let's check that RowAdapter schema deduction works as expected
  EXPECTED_SCHEMA = [
    ('id', 'bigint'),
    ('np_number', 'double'),
    ('a',
     'struct<__pyclass__:string,shape:array<bigint>,dtype:string,order:string,values:array<bigint>,values_packed:binary>'),
    ('b',
     'map<string,struct<__pyclass__:string,shape:array<bigint>,dtype:string,order:string,values:array<bigint>,values_packed:binary>>'),
    ('c',
     'array<struct<__pyclass__:string,shape:array<bigint>,dtype:string,order:string,values:array<double>,values_packed:binary>>'),
    ('d',
     'struct<__pyclass__:string,foo:bigint,bar:string,_not_hidden:bigint>'),
    ('e',
     'array<struct<__pyclass__:string,foo:bigint,bar:string,_not_hidden:bigint>>'),
    ('f', 'struct<__pyclass__:string,meow:bigint,_not_hidden:bigint>'),
    ('g', 'struct<__pyclass__:string>'),
    ('h', 'struct<i:bigint,j:bigint>'),
  ]
  self._check_schema(rows, EXPECTED_SCHEMA)