def test_struct_array_field():
    ty = pa.struct([pa.field('x', pa.int16()),
                    pa.field('y', pa.float32())])
    a = pa.array([(1, 2.5), (3, 4.5), (5, 6.5)], type=ty)

    x0 = a.field(0)
    y0 = a.field(1)
    x1 = a.field(-2)
    y1 = a.field(-1)
    x2 = a.field('x')
    y2 = a.field('y')

    assert isinstance(x0, pa.lib.Int16Array)
    assert isinstance(y1, pa.lib.FloatArray)
    assert x0.equals(pa.array([1, 3, 5], type=pa.int16()))
    assert y0.equals(pa.array([2.5, 4.5, 6.5], type=pa.float32()))
    assert x0.equals(x1)
    assert x0.equals(x2)
    assert y0.equals(y1)
    assert y0.equals(y2)

    for invalid_index in [None, pa.int16()]:
        with pytest.raises(TypeError):
            a.field(invalid_index)

    for invalid_index in [3, -3]:
        with pytest.raises(IndexError):
            a.field(invalid_index)

    for invalid_name in ['z', '']:
        with pytest.raises(KeyError):
            a.field(invalid_name)
def test_convert_options():
    cls = ConvertOptions
    opts = cls()

    assert opts.check_utf8 is True
    opts.check_utf8 = False
    assert opts.check_utf8 is False

    assert opts.strings_can_be_null is False
    opts.strings_can_be_null = True
    assert opts.strings_can_be_null is True

    assert opts.column_types == {}
    # Pass column_types as mapping
    opts.column_types = {'b': pa.int16(), 'c': pa.float32()}
    assert opts.column_types == {'b': pa.int16(), 'c': pa.float32()}
    opts.column_types = {'v': 'int16', 'w': 'null'}
    assert opts.column_types == {'v': pa.int16(), 'w': pa.null()}
    # Pass column_types as schema
    schema = pa.schema([('a', pa.int32()), ('b', pa.string())])
    opts.column_types = schema
    assert opts.column_types == {'a': pa.int32(), 'b': pa.string()}
    # Pass column_types as sequence
    opts.column_types = [('x', pa.binary())]
    assert opts.column_types == {'x': pa.binary()}

    with pytest.raises(TypeError, match='DataType expected'):
        opts.column_types = {'a': None}
    with pytest.raises(TypeError):
        opts.column_types = 0

    assert isinstance(opts.null_values, list)
    assert '' in opts.null_values
    assert 'N/A' in opts.null_values
    opts.null_values = ['xxx', 'yyy']
    assert opts.null_values == ['xxx', 'yyy']

    assert isinstance(opts.true_values, list)
    opts.true_values = ['xxx', 'yyy']
    assert opts.true_values == ['xxx', 'yyy']

    assert isinstance(opts.false_values, list)
    opts.false_values = ['xxx', 'yyy']
    assert opts.false_values == ['xxx', 'yyy']

    opts = cls(check_utf8=False, column_types={'a': pa.null()},
               null_values=['N', 'nn'], true_values=['T', 'tt'],
               false_values=['F', 'ff'], strings_can_be_null=True)
    assert opts.check_utf8 is False
    assert opts.column_types == {'a': pa.null()}
    assert opts.null_values == ['N', 'nn']
    assert opts.false_values == ['F', 'ff']
    assert opts.true_values == ['T', 'tt']
    assert opts.strings_can_be_null is True
def test_dictionary_type():
    ty0 = pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c']))
    assert ty0.index_type == pa.int32()
    assert isinstance(ty0.dictionary, pa.Array)
    assert ty0.dictionary.to_pylist() == ['a', 'b', 'c']
    assert ty0.ordered is False

    ty1 = pa.dictionary(pa.float32(), pa.array([1.0, 2.0]), ordered=True)
    assert ty1.index_type == pa.float32()
    assert isinstance(ty1.dictionary, pa.Array)
    assert ty1.dictionary.to_pylist() == [1.0, 2.0]
    assert ty1.ordered is True
def test_column_flatten():
    ty = pa.struct([pa.field('x', pa.int16()),
                    pa.field('y', pa.float32())])
    a = pa.array([(1, 2.5), (3, 4.5), (5, 6.5)], type=ty)
    col = pa.Column.from_array('foo', a)
    x, y = col.flatten()
    assert x == pa.column('foo.x', pa.array([1, 3, 5], type=pa.int16()))
    assert y == pa.column('foo.y', pa.array([2.5, 4.5, 6.5],
                                            type=pa.float32()))
    # Empty column
    a = pa.array([], type=ty)
    col = pa.Column.from_array('foo', a)
    x, y = col.flatten()
    assert x == pa.column('foo.x', pa.array([], type=pa.int16()))
    assert y == pa.column('foo.y', pa.array([], type=pa.float32()))
def test_float32_integer_coerce_representable_range():
    f32 = np.float32
    valid_values = [f32(1.5), 1 << 24, -(1 << 24)]
    invalid_values = [f32(1.5), (1 << 24) + 1]
    invalid_values2 = [f32(1.5), -((1 << 24) + 1)]

    # it works
    pa.array(valid_values, type=pa.float32())

    # it fails
    with pytest.raises(ValueError):
        pa.array(invalid_values, type=pa.float32())

    with pytest.raises(ValueError):
        pa.array(invalid_values2, type=pa.float32())
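# Context for the boundary used above, as a minimal sketch (assuming only
# numpy): float32 has a 24-bit significand, so every integer up to 2**24
# round-trips exactly, while 2**24 + 1 rounds to the nearest representable
# value. That is why 1 << 24 is the cutoff in the test.
import numpy as np

assert int(np.float32(1 << 24)) == (1 << 24)        # exactly representable
assert int(np.float32((1 << 24) + 1)) == (1 << 24)  # rounds back down to 2**24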
def test_empty_cast():
    types = [
        pa.null(),
        pa.bool_(),
        pa.int8(),
        pa.int16(),
        pa.int32(),
        pa.int64(),
        pa.uint8(),
        pa.uint16(),
        pa.uint32(),
        pa.uint64(),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.date32(),
        pa.date64(),
        pa.binary(),
        pa.binary(length=4),
        pa.string(),
    ]
    for (t1, t2) in itertools.product(types, types):
        try:
            # ARROW-4766: Ensure that casts between supported types don't
            # segfault on empty arrays of common types
            pa.array([], type=t1).cast(t2)
        except pa.lib.ArrowNotImplementedError:
            continue
def test_struct_array_slice():
    # ARROW-2311: slicing nested arrays needs special care
    ty = pa.struct([pa.field('a', pa.int8()),
                    pa.field('b', pa.float32())])
    arr = pa.array([(1, 2.5), (3, 4.5), (5, 6.5)], type=ty)
    assert arr[1:].to_pylist() == [{'a': 3, 'b': 4.5},
                                   {'a': 5, 'b': 6.5}]
def test_float_nulls(self):
    num_values = 100

    null_mask = np.random.randint(0, 10, size=num_values) < 3
    dtypes = [('f4', pa.float32()), ('f8', pa.float64())]
    names = ['f4', 'f8']
    expected_cols = []

    arrays = []
    fields = []
    for name, arrow_dtype in dtypes:
        values = np.random.randn(num_values).astype(name)

        arr = pa.array(values, from_pandas=True, mask=null_mask)
        arrays.append(arr)
        fields.append(pa.field(name, arrow_dtype))
        values[null_mask] = np.nan

        expected_cols.append(values)

    ex_frame = pd.DataFrame(dict(zip(names, expected_cols)),
                            columns=names)

    table = pa.Table.from_arrays(arrays, names)
    assert table.schema.equals(pa.schema(fields))
    result = table.to_pandas()
    tm.assert_frame_equal(result, ex_frame)
def test_type_to_pandas_dtype():
    M8_ns = np.dtype('datetime64[ns]')
    cases = [
        (pa.null(), np.float64),
        (pa.bool_(), np.bool_),
        (pa.int8(), np.int8),
        (pa.int16(), np.int16),
        (pa.int32(), np.int32),
        (pa.int64(), np.int64),
        (pa.uint8(), np.uint8),
        (pa.uint16(), np.uint16),
        (pa.uint32(), np.uint32),
        (pa.uint64(), np.uint64),
        (pa.float16(), np.float16),
        (pa.float32(), np.float32),
        (pa.float64(), np.float64),
        (pa.date32(), M8_ns),
        (pa.date64(), M8_ns),
        (pa.timestamp('ms'), M8_ns),
        (pa.binary(), np.object_),
        (pa.binary(12), np.object_),
        (pa.string(), np.object_),
        (pa.list_(pa.int8()), np.object_),
    ]
    for arrow_type, numpy_type in cases:
        assert arrow_type.to_pandas_dtype() == numpy_type
def test_orcfile_empty():
    from pyarrow import orc

    f = orc.ORCFile(path_for_orc_example('TestOrcFile.emptyFile'))
    table = f.read()
    assert table.num_rows == 0

    schema = table.schema
    expected_schema = pa.schema([
        ('boolean1', pa.bool_()),
        ('byte1', pa.int8()),
        ('short1', pa.int16()),
        ('int1', pa.int32()),
        ('long1', pa.int64()),
        ('float1', pa.float32()),
        ('double1', pa.float64()),
        ('bytes1', pa.binary()),
        ('string1', pa.string()),
        ('middle', pa.struct([
            ('list', pa.list_(pa.struct([
                ('int1', pa.int32()),
                ('string1', pa.string()),
            ]))),
        ])),
        ('list', pa.list_(pa.struct([
            ('int1', pa.int32()),
            ('string1', pa.string()),
        ]))),
        ('map', pa.list_(pa.struct([
            ('key', pa.string()),
            ('value', pa.struct([
                ('int1', pa.int32()),
                ('string1', pa.string()),
            ])),
        ]))),
    ])
    assert schema == expected_schema
def test_is_integer():
    signed_ints = [pa.int8(), pa.int16(), pa.int32(), pa.int64()]
    unsigned_ints = [pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()]

    for t in signed_ints + unsigned_ints:
        assert types.is_integer(t)

    for t in signed_ints:
        assert types.is_signed_integer(t)
        assert not types.is_unsigned_integer(t)

    for t in unsigned_ints:
        assert types.is_unsigned_integer(t)
        assert not types.is_signed_integer(t)

    assert not types.is_integer(pa.float32())
    assert not types.is_signed_integer(pa.float32())
def test_table_flatten():
    ty1 = pa.struct([pa.field('x', pa.int16()),
                     pa.field('y', pa.float32())])
    ty2 = pa.struct([pa.field('nest', ty1)])
    a = pa.array([(1, 2.5), (3, 4.5)], type=ty1)
    b = pa.array([((11, 12.5),), ((13, 14.5),)], type=ty2)
    c = pa.array([False, True], type=pa.bool_())

    table = pa.Table.from_arrays([a, b, c], names=['a', 'b', 'c'])
    t2 = table.flatten()
    t2._validate()
    expected = pa.Table.from_arrays([
        pa.array([1, 3], type=pa.int16()),
        pa.array([2.5, 4.5], type=pa.float32()),
        pa.array([(11, 12.5), (13, 14.5)], type=ty1),
        c],
        names=['a.x', 'a.y', 'b.nest', 'c'])
    assert t2.equals(expected)
def dataframe_with_arrays(include_index=False):
    """
    Dataframe with numpy arrays columns of every possible primitive type.

    Returns
    -------
    df: pandas.DataFrame
    schema: pyarrow.Schema
        Arrow schema definition that is in line with the constructed df.
    """
    dtypes = [('i1', pa.int8()), ('i2', pa.int16()),
              ('i4', pa.int32()), ('i8', pa.int64()),
              ('u1', pa.uint8()), ('u2', pa.uint16()),
              ('u4', pa.uint32()), ('u8', pa.uint64()),
              ('f4', pa.float32()), ('f8', pa.float64())]

    arrays = OrderedDict()
    fields = []
    for dtype, arrow_dtype in dtypes:
        fields.append(pa.field(dtype, pa.list_(arrow_dtype)))
        arrays[dtype] = [
            np.arange(10, dtype=dtype),
            np.arange(5, dtype=dtype),
            None,
            np.arange(1, dtype=dtype)
        ]

    fields.append(pa.field('str', pa.list_(pa.string())))
    arrays['str'] = [
        np.array([u"1", u"ä"], dtype="object"),
        None,
        np.array([u"1"], dtype="object"),
        np.array([u"1", u"2", u"3"], dtype="object")
    ]

    fields.append(pa.field('datetime64', pa.list_(pa.timestamp('ms'))))
    arrays['datetime64'] = [
        np.array(['2007-07-13T01:23:34.123456789',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
        None,
        None,
        np.array(['2007-07-13T02',
                  None,
                  '2010-08-13T05:46:57.437699912'],
                 dtype='datetime64[ms]'),
    ]

    if include_index:
        fields.append(pa.field('__index_level_0__', pa.int64()))

    df = pd.DataFrame(arrays)
    schema = pa.schema(fields)

    return df, schema
def json_to_parquet(data, output, schema):
    column_data = {}
    array_data = []

    for row in data:
        for column in schema.names:
            _col = column_data.get(column, [])
            _col.append(row.get(column))
            column_data[column] = _col

    for column in schema:
        _col = column_data.get(column.name)
        if isinstance(column.type, pa.lib.TimestampType):
            _converted_col = []
            for t in _col:
                try:
                    _converted_col.append(pd.to_datetime(t))
                except pd._libs.tslib.OutOfBoundsDatetime:
                    _converted_col.append(pd.Timestamp.max)
            array_data.append(pa.Array.from_pandas(
                pd.to_datetime(_converted_col), type=pa.timestamp('ms')))
        # Float types are ambiguous for conversions, need to specify the
        # exact type
        elif column.type.id == pa.float64().id:
            array_data.append(pa.array(_col, type=pa.float64()))
        elif column.type.id == pa.float32().id:
            # Python doesn't have a native float32 type and PyArrow cannot
            # cast float64 -> float32
            _col = pd.to_numeric(_col, downcast='float')
            array_data.append(pa.Array.from_pandas(_col, type=pa.float32()))
        elif column.type.id == pa.int64().id:
            array_data.append(pa.array([int(ele) for ele in _col],
                                       type=pa.int64()))
        else:
            array_data.append(pa.array(_col, type=column.type))

    batch = pa.RecordBatch.from_arrays(array_data, schema.names)
    table = pa.Table.from_batches([batch])

    pq.write_table(table, output, compression='SNAPPY',
                   coerce_timestamps='ms')
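# A hypothetical invocation of json_to_parquet (the rows, file name and
# column names here are illustrative, not from any library): JSON-like dicts
# plus an explicit pyarrow schema that controls the column types, including
# the float32 downcast branch above.
import pyarrow as pa

rows = [
    {"name": "a", "score": 1.5, "count": 3},
    {"name": "b", "score": 2.5, "count": 7},
]
schema = pa.schema([
    ("name", pa.string()),
    ("score", pa.float32()),
    ("count", pa.int64()),
])
json_to_parquet(rows, "example.parquet", schema)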
def test_type_for_alias():
    cases = [
        ('i1', pa.int8()),
        ('int8', pa.int8()),
        ('i2', pa.int16()),
        ('int16', pa.int16()),
        ('i4', pa.int32()),
        ('int32', pa.int32()),
        ('i8', pa.int64()),
        ('int64', pa.int64()),
        ('u1', pa.uint8()),
        ('uint8', pa.uint8()),
        ('u2', pa.uint16()),
        ('uint16', pa.uint16()),
        ('u4', pa.uint32()),
        ('uint32', pa.uint32()),
        ('u8', pa.uint64()),
        ('uint64', pa.uint64()),
        ('f4', pa.float32()),
        ('float32', pa.float32()),
        ('f8', pa.float64()),
        ('float64', pa.float64()),
        ('date32', pa.date32()),
        ('date64', pa.date64()),
        ('string', pa.string()),
        ('str', pa.string()),
        ('binary', pa.binary()),
        ('time32[s]', pa.time32('s')),
        ('time32[ms]', pa.time32('ms')),
        ('time64[us]', pa.time64('us')),
        ('time64[ns]', pa.time64('ns')),
        ('timestamp[s]', pa.timestamp('s')),
        ('timestamp[ms]', pa.timestamp('ms')),
        ('timestamp[us]', pa.timestamp('us')),
        ('timestamp[ns]', pa.timestamp('ns')),
    ]
    for val, expected in cases:
        assert pa.type_for_alias(val) == expected
def test_recordbatch_pickle():
    data = [
        pa.array(range(5)),
        pa.array([-10, -5, 0, 5, 10])
    ]
    schema = pa.schema([pa.field('ints', pa.int8()),
                        pa.field('floats', pa.float32()),
                        ]).add_metadata({b'foo': b'bar'})
    batch = pa.RecordBatch.from_arrays(data, schema)

    result = pickle.loads(pickle.dumps(batch))
    assert result.equals(batch)
    assert result.schema == schema
def test_float_no_nulls(self):
    data = {}
    fields = []
    dtypes = [('f4', pa.float32()), ('f8', pa.float64())]
    num_values = 100

    for numpy_dtype, arrow_dtype in dtypes:
        values = np.random.randn(num_values)
        data[numpy_dtype] = values.astype(numpy_dtype)
        fields.append(pa.field(numpy_dtype, arrow_dtype))

    df = pd.DataFrame(data)
    schema = pa.schema(fields)
    self._check_pandas_roundtrip(df, expected_schema=schema)
def test_take_indices_types():
    arr = pa.array(range(5))

    for indices_type in ['uint8', 'int8', 'uint16', 'int16',
                         'uint32', 'int32', 'uint64', 'int64']:
        indices = pa.array([0, 4, 2, None], type=indices_type)
        result = arr.take(indices)
        expected = pa.array([0, 4, 2, None])
        assert result.equals(expected)

    for indices_type in [pa.float32(), pa.float64()]:
        indices = pa.array([0, 4, 2], type=indices_type)
        with pytest.raises(TypeError):
            arr.take(indices)
def test_array_from_py_float32():
    data = [[1.2, 3.4], [9.0, 42.0]]

    t = pa.float32()

    arr1 = pa.array(data[0], type=t)
    arr2 = pa.array(data, type=pa.list_(t))

    expected1 = np.array(data[0], dtype=np.float32)
    expected2 = pd.Series([np.array(data[0], dtype=np.float32),
                           np.array(data[1], dtype=np.float32)])

    assert arr1.type == t
    assert arr1.equals(pa.array(expected1))
    assert arr2.equals(pa.array(expected2))
def test_struct_array_flatten():
    ty = pa.struct([pa.field('x', pa.int16()),
                    pa.field('y', pa.float32())])
    a = pa.array([(1, 2.5), (3, 4.5), (5, 6.5)], type=ty)
    xs, ys = a.flatten()
    assert xs.type == pa.int16()
    assert ys.type == pa.float32()
    assert xs.to_pylist() == [1, 3, 5]
    assert ys.to_pylist() == [2.5, 4.5, 6.5]
    xs, ys = a[1:].flatten()
    assert xs.to_pylist() == [3, 5]
    assert ys.to_pylist() == [4.5, 6.5]

    a = pa.array([(1, 2.5), None, (3, 4.5)], type=ty)
    xs, ys = a.flatten()
    assert xs.to_pylist() == [1, None, 3]
    assert ys.to_pylist() == [2.5, None, 4.5]
    xs, ys = a[1:].flatten()
    assert xs.to_pylist() == [None, 3]
    assert ys.to_pylist() == [None, 4.5]

    a = pa.array([(1, None), (2, 3.5), (None, 4.5)], type=ty)
    xs, ys = a.flatten()
    assert xs.to_pylist() == [1, 2, None]
    assert ys.to_pylist() == [None, 3.5, 4.5]
    xs, ys = a[1:].flatten()
    assert xs.to_pylist() == [2, None]
    assert ys.to_pylist() == [3.5, 4.5]

    a = pa.array([(1, None), None, (None, 2.5)], type=ty)
    xs, ys = a.flatten()
    assert xs.to_pylist() == [1, None, None]
    assert ys.to_pylist() == [None, None, 2.5]
    xs, ys = a[1:].flatten()
    assert xs.to_pylist() == [None, None]
    assert ys.to_pylist() == [None, 2.5]
def test_column_types(self):
    # Ask for specific column types in ConvertOptions
    opts = ConvertOptions(column_types={'b': 'float32',
                                        'c': 'string',
                                        'd': 'boolean',
                                        'zz': 'null'})
    rows = b"a,b,c,d\n1,2,3,true\n4,-5,6,false\n"
    table = self.read_bytes(rows, convert_options=opts)
    schema = pa.schema([('a', pa.int64()),
                        ('b', pa.float32()),
                        ('c', pa.string()),
                        ('d', pa.bool_())])
    expected = {
        'a': [1, 4],
        'b': [2.0, -5.0],
        'c': ["3", "6"],
        'd': [True, False],
    }
    assert table.schema == schema
    assert table.to_pydict() == expected

    # Pass column_types as schema
    opts = ConvertOptions(
        column_types=pa.schema([('b', pa.float32()),
                                ('c', pa.string()),
                                ('d', pa.bool_()),
                                ('zz', pa.bool_())]))
    table = self.read_bytes(rows, convert_options=opts)
    assert table.schema == schema
    assert table.to_pydict() == expected

    # One of the columns in column_types fails converting
    rows = b"a,b,c,d\n1,XXX,3,true\n4,-5,6,false\n"
    with pytest.raises(pa.ArrowInvalid) as exc:
        self.read_bytes(rows, convert_options=opts)
    err = str(exc.value)
    assert "In column #1: " in err
    assert "CSV conversion error to float: invalid value 'XXX'" in err
def test_struct_value_subscripting(self):
    ty = pa.struct([pa.field('x', pa.int16()),
                    pa.field('y', pa.float32())])
    arr = pa.array([(1, 2.5), (3, 4.5), (5, 6.5)], type=ty)

    assert arr[0]['x'] == 1
    assert arr[0]['y'] == 2.5
    assert arr[1]['x'] == 3
    assert arr[1]['y'] == 4.5
    assert arr[2]['x'] == 5
    assert arr[2]['y'] == 6.5

    with pytest.raises(IndexError):
        arr[4]['non-existent']

    with pytest.raises(KeyError):
        arr[0]['non-existent']
def _from_jvm_float_type(jvm_type):
    """
    Convert a JVM float type to its Python equivalent.

    Parameters
    ----------
    jvm_type: org.apache.arrow.vector.types.pojo.ArrowType$FloatingPoint

    Returns
    -------
    typ: pyarrow.DataType
    """
    precision = jvm_type.getPrecision().toString()
    if precision == 'HALF':
        return pa.float16()
    elif precision == 'SINGLE':
        return pa.float32()
    elif precision == 'DOUBLE':
        return pa.float64()
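# A minimal sketch of how _from_jvm_float_type dispatches, using stand-in
# objects (purely illustrative, not real JVM classes) that mimic the
# getPrecision().toString() call chain the function expects.
class _FakePrecision:
    def __init__(self, name):
        self._name = name

    def toString(self):  # noqa: N802 - mimics the JVM method name
        return self._name


class _FakeJvmFloatType:
    def __init__(self, name):
        self._precision = _FakePrecision(name)

    def getPrecision(self):  # noqa: N802 - mimics the JVM method name
        return self._precision


assert _from_jvm_float_type(_FakeJvmFloatType('SINGLE')) == pa.float32()
assert _from_jvm_float_type(_FakeJvmFloatType('DOUBLE')) == pa.float64()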
def test_from_numpy_dtype():
    cases = [
        (np.dtype('bool'), pa.bool_()),
        (np.dtype('int8'), pa.int8()),
        (np.dtype('int16'), pa.int16()),
        (np.dtype('int32'), pa.int32()),
        (np.dtype('int64'), pa.int64()),
        (np.dtype('uint8'), pa.uint8()),
        (np.dtype('uint16'), pa.uint16()),
        (np.dtype('uint32'), pa.uint32()),
        (np.dtype('float16'), pa.float16()),
        (np.dtype('float32'), pa.float32()),
        (np.dtype('float64'), pa.float64()),
        (np.dtype('U'), pa.string()),
        (np.dtype('S'), pa.binary()),
        (np.dtype('datetime64[s]'), pa.timestamp('s')),
        (np.dtype('datetime64[ms]'), pa.timestamp('ms')),
        (np.dtype('datetime64[us]'), pa.timestamp('us')),
        (np.dtype('datetime64[ns]'), pa.timestamp('ns'))
    ]
    for dt, pt in cases:
        result = pa.from_numpy_dtype(dt)
        assert result == pt

    # Things convertible to numpy dtypes work
    assert pa.from_numpy_dtype('U') == pa.string()
    assert pa.from_numpy_dtype(np.unicode) == pa.string()
    assert pa.from_numpy_dtype('int32') == pa.int32()
    assert pa.from_numpy_dtype(bool) == pa.bool_()

    with pytest.raises(NotImplementedError):
        pa.from_numpy_dtype(np.dtype('O'))

    with pytest.raises(TypeError):
        pa.from_numpy_dtype('not_convertible_to_dtype')
def get_many_types():
    # Returning them from a function is required because the pa.dictionary
    # type holds a pyarrow array, and
    # test_array.py::test_total_bytes_allocated checks that the default
    # memory pool has zero allocated bytes.
    return (
        pa.null(),
        pa.bool_(),
        pa.int32(),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.timestamp('us'),
        pa.timestamp('us', tz='UTC'),
        pa.timestamp('us', tz='Europe/Paris'),
        pa.float16(),
        pa.float32(),
        pa.float64(),
        pa.decimal128(19, 4),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.int32()),
        pa.struct([pa.field('a', pa.int32()),
                   pa.field('b', pa.int8()),
                   pa.field('c', pa.string())]),
        pa.struct([pa.field('a', pa.int32(), nullable=False),
                   pa.field('b', pa.int8(), nullable=False),
                   pa.field('c', pa.string())]),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_DENSE),
        pa.union([pa.field('a', pa.binary(10)),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_SPARSE),
        pa.union([pa.field('a', pa.binary(10), nullable=False),
                  pa.field('b', pa.string())],
                 mode=pa.lib.UnionMode_SPARSE),
        pa.dictionary(pa.int32(), pa.string())
    )
try:
    from scipy.sparse import csr_matrix, coo_matrix
except ImportError:
    coo_matrix = None
    csr_matrix = None

try:
    import sparse
except ImportError:
    sparse = None


tensor_type_pairs = [
    ('i1', pa.int8()),
    ('i2', pa.int16()),
    ('i4', pa.int32()),
    ('i8', pa.int64()),
    ('u1', pa.uint8()),
    ('u2', pa.uint16()),
    ('u4', pa.uint32()),
    ('u8', pa.uint64()),
    ('f2', pa.float16()),
    ('f4', pa.float32()),
    ('f8', pa.float64())
]


@pytest.mark.parametrize('sparse_tensor_type', [
    pa.SparseCSRMatrix,
    pa.SparseCSCMatrix,
    pa.SparseCOOTensor,
    pa.SparseCSFTensor,
])
def test_sparse_tensor_attrs(sparse_tensor_type):
    data = np.array([
        [8, 0, 2, 0, 0, 0],
        [0, 0, 0, 0, 0, 5],
        [3, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 4, 6],
    ])
from tfx_bsl.tfxio import tensor_to_arrow
from google.protobuf import text_format
from absl.testing import absltest
from absl.testing import parameterized
from tensorflow_metadata.proto.v0 import schema_pb2

_TF_TYPE_TO_ARROW_TYPE = {
    tf.int8: pa.int8(),
    tf.int16: pa.int16(),
    tf.int32: pa.int32(),
    tf.int64: pa.int64(),
    tf.uint8: pa.uint8(),
    tf.uint16: pa.uint16(),
    tf.uint32: pa.uint32(),
    tf.uint64: pa.uint64(),
    tf.float32: pa.float32(),
    tf.float64: pa.float64(),
    tf.string: pa.large_binary(),
}

_ROW_PARTITION_DTYPES = {"INT64": np.int64, "INT32": np.int32}


def _make_2d_dense_tensor_test_cases():
    result = []
    for tf_type, arrow_type in _TF_TYPE_TO_ARROW_TYPE.items():
        if tf_type == tf.string:
            tensor = tf.constant([[b"1", b"2"], [b"3", b"4"]],
                                 dtype=tf.string)
            expected_array = pa.array([[b"1", b"2"], [b"3", b"4"]],
                                      type=pa.large_list(arrow_type))
        else:
import pyarrow as pa

import deepr as dpr

try:
    import pandas as pd
except ImportError as e:
    print(f"Pandas needs to be installed for MovieLens {e}")


LOGGER = logging.getLogger(__name__)

COLUMNS = ["uid", "user", "input", "target"]

SCHEMA = pa.schema([
    ("uid", pa.int64()),
    ("user", pa.list_(pa.float32())),
    ("input", pa.list_(pa.int64())),
    ("target", pa.list_(pa.int64())),
])


@dataclass
class Predict(dpr.jobs.Job):
    """Compute MovieLens predictions."""

    path_saved_model: str
    path_predictions: str
    input_fn: Callable[[], tf.data.Dataset]
    prepro_fn: Callable[[tf.data.Dataset, str], tf.data.Dataset]

    def run(self):
        pa.field('b', pa.int8()),
        pa.field('c', pa.string())])
    ]

    in_dict = {}
    for i, type_ in enumerate(types):
        assert hash(type_) == hash(type_)
        in_dict[type_] = i
        assert in_dict[type_] == i


@pytest.mark.parametrize('t,check_func', [
    (pa.date32(), types.is_date32),
    (pa.date64(), types.is_date64),
    (pa.time32('s'), types.is_time32),
    (pa.time64('ns'), types.is_time64),
    (pa.int8(), types.is_int8),
    (pa.int16(), types.is_int16),
    (pa.int32(), types.is_int32),
    (pa.int64(), types.is_int64),
    (pa.uint8(), types.is_uint8),
    (pa.uint16(), types.is_uint16),
    (pa.uint32(), types.is_uint32),
    (pa.uint64(), types.is_uint64),
    (pa.float16(), types.is_float16),
    (pa.float32(), types.is_float32),
    (pa.float64(), types.is_float64)
])
def test_exact_primitive_types(t, check_func):
    assert check_func(t)
def test_is_floating():
    for t in [pa.float16(), pa.float32(), pa.float64()]:
        assert types.is_floating(t)

    assert not types.is_floating(pa.int32())
    (pa.binary(3), [b'abc', b'bcd', b'cde', b'def', b'efg']),
    (pa.list_(pa.int8()), [[1, 2], [3, 4], [5, 6], None, [9, 16]]),
    (pa.large_list(pa.int16()), [[1], [2, 3, 4], [5, 6], None, [9, 16]]),
    (pa.struct([('a', pa.int8()), ('b', pa.int8())]), [
        {'a': 1, 'b': 2}, None, {'a': 3, 'b': 4}, None, {'a': 5, 'b': 6}]),
]

numerical_arrow_types = [
    pa.int8(),
    pa.int16(),
    pa.int64(),
    pa.uint8(),
    pa.uint16(),
    pa.uint64(),
    pa.float32(),
    pa.float64()
]


@pytest.mark.parametrize('arrow_type', numerical_arrow_types)
def test_sum_array(arrow_type):
    arr = pa.array([1, 2, 3, 4], type=arrow_type)
    assert arr.sum().as_py() == 10
    assert pc.sum(arr).as_py() == 10

    arr = pa.array([], type=arrow_type)
    assert arr.sum().as_py() is None  # noqa: E711


@pytest.mark.parametrize('arrow_type', numerical_arrow_types)
# om = jpype.JClass('com.fasterxml.jackson.databind.ObjectMapper')()
# field = …  # Code to instantiate the field
# jvm_spec = om.writeValueAsString(field)
@pytest.mark.parametrize('pa_type,jvm_spec', [
    (pa.null(), '{"name":"null"}'),
    (pa.bool_(), '{"name":"bool"}'),
    (pa.int8(), '{"name":"int","bitWidth":8,"isSigned":true}'),
    (pa.int16(), '{"name":"int","bitWidth":16,"isSigned":true}'),
    (pa.int32(), '{"name":"int","bitWidth":32,"isSigned":true}'),
    (pa.int64(), '{"name":"int","bitWidth":64,"isSigned":true}'),
    (pa.uint8(), '{"name":"int","bitWidth":8,"isSigned":false}'),
    (pa.uint16(), '{"name":"int","bitWidth":16,"isSigned":false}'),
    (pa.uint32(), '{"name":"int","bitWidth":32,"isSigned":false}'),
    (pa.uint64(), '{"name":"int","bitWidth":64,"isSigned":false}'),
    (pa.float16(), '{"name":"floatingpoint","precision":"HALF"}'),
    (pa.float32(), '{"name":"floatingpoint","precision":"SINGLE"}'),
    (pa.float64(), '{"name":"floatingpoint","precision":"DOUBLE"}'),
    (pa.time32('s'), '{"name":"time","unit":"SECOND","bitWidth":32}'),
    (pa.time32('ms'), '{"name":"time","unit":"MILLISECOND","bitWidth":32}'),
    (pa.time64('us'), '{"name":"time","unit":"MICROSECOND","bitWidth":64}'),
    (pa.time64('ns'), '{"name":"time","unit":"NANOSECOND","bitWidth":64}'),
    (pa.timestamp('s'), '{"name":"timestamp","unit":"SECOND",'
        '"timezone":null}'),
    (pa.timestamp('ms'), '{"name":"timestamp","unit":"MILLISECOND",'
        '"timezone":null}'),
    (pa.timestamp('us'), '{"name":"timestamp","unit":"MICROSECOND",'
        '"timezone":null}'),
    (pa.timestamp('ns'), '{"name":"timestamp","unit":"NANOSECOND",'
        '"timezone":null}'),
    (pa.timestamp('ns', tz='UTC'), '{"name":"timestamp","unit":"NANOSECOND"'
        ',"timezone":"UTC"}'),
def test_concat_tables_with_promotion_error():
    t1 = pa.Table.from_arrays(
        [pa.array([1, 2], type=pa.int64())], ["f"])
    t2 = pa.Table.from_arrays(
        [pa.array([1, 2], type=pa.float32())], ["f"])

    with pytest.raises(pa.ArrowInvalid):
        pa.concat_tables([t1, t2], promote=True)
def test_mixed_sequence_errors():
    with pytest.raises(ValueError, match="tried to convert to boolean"):
        pa.array([True, 'foo'], type=pa.bool_())

    with pytest.raises(ValueError, match="tried to convert to float32"):
        pa.array([1.5, 'foo'], type=pa.float32())

    with pytest.raises(ValueError, match="tried to convert to double"):
        pa.array([1.5, 'foo'])


@parametrize_with_iterable_types
@pytest.mark.parametrize("np_scalar,pa_type", [
    (np.float16, pa.float16()),
    (np.float32, pa.float32()),
    (np.float64, pa.float64())
])
@pytest.mark.parametrize("from_pandas", [True, False])
def test_sequence_numpy_double(seq, np_scalar, pa_type, from_pandas):
    data = [np_scalar(1.5), np_scalar(1), None, np_scalar(2.5), None, np.nan]
    arr = pa.array(seq(data), from_pandas=from_pandas)
    assert len(arr) == 6
    if from_pandas:
        assert arr.null_count == 3
    else:
        assert arr.null_count == 2
    if from_pandas:
        # The NaN is skipped in type inference, otherwise it forces a
        # float64 promotion
        assert arr.type == pa_type
    else:
import apache_beam as beam
import pyarrow as pa
import six
from tfx_bsl.coders import example_coder
from tfx_bsl.tfxio import record_based_tfxio
from tfx_bsl.tfxio import tensor_adapter
from tfx_bsl.tfxio import tensor_representation_util
from tfx_bsl.tfxio import tfxio
from typing import List, Optional, Text

from tensorflow_metadata.proto.v0 import schema_pb2

_FEATURE_TYPE_TO_ARROW_TYPE = {
    schema_pb2.FeatureType.BYTES: pa.list_(pa.binary()),
    schema_pb2.FeatureType.INT: pa.list_(pa.int64()),
    schema_pb2.FeatureType.FLOAT: pa.list_(pa.float32()),
}


@six.add_metaclass(abc.ABCMeta)
class _TFExampleRecordBase(record_based_tfxio.RecordBasedTFXIO):
    """Base class for TFXIO implementations for record based tf.Examples."""

    def __init__(self,
                 schema: Optional[schema_pb2.Schema] = None,
                 raw_record_column_name: Optional[Text] = None):
        super(_TFExampleRecordBase, self).__init__(raw_record_column_name)
        self._schema = schema

    def SupportAttachingRawRecords(self) -> bool:
        return True
from collections.abc import Sequence

import cupy as cp
import numpy as np
import pandas as pd
import pyarrow as pa
from pandas.core.dtypes.common import infer_dtype_from_object
from pandas.core.dtypes.dtypes import CategoricalDtype, CategoricalDtypeType

import cudf
from cudf._lib.scalar import Scalar

_NA_REP = "<NA>"

_np_pa_dtypes = {
    np.float64: pa.float64(),
    np.float32: pa.float32(),
    np.int64: pa.int64(),
    np.longlong: pa.int64(),
    np.int32: pa.int32(),
    np.int16: pa.int16(),
    np.int8: pa.int8(),
    np.bool_: pa.int8(),
    np.uint64: pa.uint64(),
    np.uint32: pa.uint32(),
    np.uint16: pa.uint16(),
    np.uint8: pa.uint8(),
    np.datetime64: pa.date64(),
    np.object_: pa.string(),
    np.str_: pa.string(),
}
class DataMapping:
    """
    Map primary data between different supported data frameworks, preserving
    equivalent data types.

    DataMapping is for primary data. To map metadata types and values, use
    :py:class:`TypeMapping <tracdap.rt.impl.type_system.TypeMapping>` and
    :py:class:`MetadataCodec <tracdap.rt.impl.type_system.MetadataCodec>`.
    """

    __log = _util.logger_for_namespace(
        _DataInternal.__module__ + ".DataMapping")

    # Matches TRAC_ARROW_TYPE_MAPPING in ArrowSchema, tracdap-lib-data

    __TRAC_DECIMAL_PRECISION = 38
    __TRAC_DECIMAL_SCALE = 12
    __TRAC_TIMESTAMP_UNIT = "ms"
    __TRAC_TIMESTAMP_ZONE = None

    __TRAC_TO_ARROW_BASIC_TYPE_MAPPING = {
        _meta.BasicType.BOOLEAN: pa.bool_(),
        _meta.BasicType.INTEGER: pa.int64(),
        _meta.BasicType.FLOAT: pa.float64(),
        _meta.BasicType.DECIMAL: pa.decimal128(__TRAC_DECIMAL_PRECISION,
                                               __TRAC_DECIMAL_SCALE),
        _meta.BasicType.STRING: pa.utf8(),
        _meta.BasicType.DATE: pa.date32(),
        _meta.BasicType.DATETIME: pa.timestamp(__TRAC_TIMESTAMP_UNIT,
                                               __TRAC_TIMESTAMP_ZONE)
    }

    # Check the Pandas dtypes for handling floats are available before
    # setting up the type mapping
    __PANDAS_FLOAT_DTYPE_CHECK = _DataInternal.float_dtype_check()
    __PANDAS_DATETIME_TYPE = pd.to_datetime([]).dtype

    # Only partial mapping is possible, decimal and temporal dtypes cannot
    # be mapped this way
    __ARROW_TO_PANDAS_TYPE_MAPPING = {
        pa.bool_(): pd.BooleanDtype(),
        pa.int8(): pd.Int8Dtype(),
        pa.int16(): pd.Int16Dtype(),
        pa.int32(): pd.Int32Dtype(),
        pa.int64(): pd.Int64Dtype(),
        pa.uint8(): pd.UInt8Dtype(),
        pa.uint16(): pd.UInt16Dtype(),
        pa.uint32(): pd.UInt32Dtype(),
        pa.uint64(): pd.UInt64Dtype(),
        pa.float16(): pd.Float32Dtype(),
        pa.float32(): pd.Float32Dtype(),
        pa.float64(): pd.Float64Dtype(),
        pa.utf8(): pd.StringDtype()
    }

    @staticmethod
    def arrow_to_python_type(arrow_type: pa.DataType) -> type:

        if pa.types.is_boolean(arrow_type):
            return bool

        if pa.types.is_integer(arrow_type):
            return int

        if pa.types.is_floating(arrow_type):
            return float

        if pa.types.is_decimal(arrow_type):
            return decimal.Decimal

        if pa.types.is_string(arrow_type):
            return str

        if pa.types.is_date(arrow_type):
            return dt.date

        if pa.types.is_timestamp(arrow_type):
            return dt.datetime

        raise _ex.ETracInternal(
            f"No Python type mapping available for Arrow type [{arrow_type}]")

    @classmethod
    def python_to_arrow_type(cls, python_type: type) -> pa.DataType:

        if python_type == bool:
            return pa.bool_()

        if python_type == int:
            return pa.int64()

        if python_type == float:
            return pa.float64()

        if python_type == decimal.Decimal:
            return pa.decimal128(cls.__TRAC_DECIMAL_PRECISION,
                                 cls.__TRAC_DECIMAL_SCALE)

        if python_type == str:
            return pa.utf8()

        if python_type == dt.date:
            return pa.date32()

        if python_type == dt.datetime:
            return pa.timestamp(cls.__TRAC_TIMESTAMP_UNIT,
                                cls.__TRAC_TIMESTAMP_ZONE)

        raise _ex.ETracInternal(
            f"No Arrow type mapping available for Python type [{python_type}]")

    @classmethod
    def trac_to_arrow_type(cls, trac_type: _meta.TypeDescriptor) -> pa.DataType:
        return cls.trac_to_arrow_basic_type(trac_type.basicType)

    @classmethod
    def trac_to_arrow_basic_type(
            cls, trac_basic_type: _meta.BasicType) -> pa.DataType:

        arrow_type = cls.__TRAC_TO_ARROW_BASIC_TYPE_MAPPING.get(trac_basic_type)

        if arrow_type is None:
            raise _ex.ETracInternal(
                f"No Arrow type mapping available for TRAC type "
                f"[{trac_basic_type}]")

        return arrow_type

    @classmethod
    def trac_to_arrow_schema(cls, trac_schema: _meta.SchemaDefinition) -> pa.Schema:

        if trac_schema.schemaType != _meta.SchemaType.TABLE:
            raise _ex.ETracInternal(
                f"Schema type [{trac_schema.schemaType}] cannot be converted "
                f"for Apache Arrow")
        arrow_fields = [
            (f.fieldName, cls.trac_to_arrow_basic_type(f.fieldType))
            for f in trac_schema.table.fields]

        return pa.schema(arrow_fields, metadata={})

    @classmethod
    def trac_arrow_decimal_type(cls) -> pa.Decimal128Type:
        return pa.decimal128(cls.__TRAC_DECIMAL_PRECISION,
                             cls.__TRAC_DECIMAL_SCALE)

    @classmethod
    def pandas_datetime_type(cls):
        return cls.__PANDAS_DATETIME_TYPE

    @classmethod
    def view_to_pandas(cls, view: DataView, part: DataPartKey) -> pd.DataFrame:

        deltas = view.parts.get(part)

        # Sanity checks

        if not view.arrow_schema:
            raise _ex.ETracInternal("Data view schema not set")

        if not deltas:
            raise _ex.ETracInternal(
                f"Data view for part [{part.opaque_key}] does not contain "
                f"any items")

        if len(deltas) == 1:
            return cls.item_to_pandas(deltas[0])

        batches = {
            batch
            for delta in deltas
            for batch in (
                delta.batches if delta.batches else delta.table.to_batches())}

        table = pa.Table.from_batches(batches)  # noqa
        return table.to_pandas()

    @classmethod
    def item_to_pandas(cls, item: DataItem) -> pd.DataFrame:

        if item.pandas is not None:
            return item.pandas.copy()

        if item.table is not None:
            return cls.arrow_to_pandas(item.table)

        if item.batches is not None:
            table = pa.Table.from_batches(item.batches, item.schema)  # noqa
            return cls.arrow_to_pandas(table)

        raise _ex.ETracInternal("Data item does not contain any usable data")

    @classmethod
    def arrow_to_pandas(cls, table: pa.Table) -> pd.DataFrame:

        return table.to_pandas(
            ignore_metadata=True,  # noqa
            date_as_object=False,  # noqa
            timestamp_as_object=False,  # noqa
            types_mapper=cls.__ARROW_TO_PANDAS_TYPE_MAPPING.get)

    @classmethod
    def pandas_to_view(cls, df: pd.DataFrame,
                       prior_view: DataView, part: DataPartKey):

        item = cls.pandas_to_item(df, prior_view.arrow_schema)
        return cls.add_item_to_view(prior_view, part, item)

    @classmethod
    def pandas_to_item(cls, df: pd.DataFrame,
                       schema: tp.Optional[pa.Schema]) -> DataItem:

        table = cls.pandas_to_arrow(df, schema)
        return DataItem(table.schema, table)

    @classmethod
    def pandas_to_arrow(cls, df: pd.DataFrame,
                        schema: tp.Optional[pa.Schema] = None) -> pa.Table:

        # Here we convert the whole Pandas df and then pass it to conformance
        # An optimization would be to filter columns before applying
        # conformance. To do this, we'd need the case-insensitive field
        # matching logic, including output of warnings

        # Also, note that schema is not applied in from_pandas
        # This is because the conformance logic allows for a wider range of
        # conversions. Applying the schema directly would fail for some
        # types where casting is possible

        if len(df) == 0:
            df_schema = pa.Schema.from_pandas(df, preserve_index=False)  # noqa
            table = pa.Table.from_batches(list(), df_schema)  # noqa
        else:
            table = pa.Table.from_pandas(df, preserve_index=False)  # noqa

        # If there is no explicit schema, give back the table exactly as it
        # was received from Pandas. There could be an option here to coerce
        # types to the appropriate TRAC standard types
        # E.g. unsigned int 32 -> signed int 64, TRAC standard integer type

        if schema is None:
            return table
        else:
            return DataConformance.conform_to_schema(table, schema, df.dtypes)

    @classmethod
    def add_item_to_view(cls, view: DataView, part: DataPartKey,
                         item: DataItem) -> DataView:

        prior_deltas = view.parts.get(part) or list()
        deltas = [*prior_deltas, item]
        parts = {**view.parts, part: deltas}

        return DataView(view.trac_schema, view.arrow_schema, parts)
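# A short sketch of the DataMapping type round trip (assuming the tracdap
# runtime modules above are importable): Python float maps to Arrow float64,
# while any Arrow floating-point type, including float32, maps back to the
# Python float type.
assert DataMapping.python_to_arrow_type(float) == pa.float64()
assert DataMapping.arrow_to_python_type(pa.float32()) is float
assert DataMapping.arrow_to_python_type(pa.float64()) is float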
from tensorflow_model_analysis.slicer import slicer_lib as slicer
from tensorflow_model_analysis.writers import writer

_PARQUET_FORMAT = 'parquet'
_TFRECORD_FORMAT = 'tfrecord'
_SUPPORTED_FORMATS = (_PARQUET_FORMAT, _TFRECORD_FORMAT)
_SLICE_KEY_PARQUET_COLUMN_NAME = 'slice_key'
_SERIALIZED_VALUE_PARQUET_COLUMN_NAME = 'serialized_value'
_SINGLE_SLICE_KEYS_PARQUET_FIELD_NAME = 'single_slice_specs'
_SLICE_KEY_ARROW_TYPE = pa.struct([(pa.field(
    _SINGLE_SLICE_KEYS_PARQUET_FIELD_NAME,
    pa.list_(
        pa.struct([
            pa.field('column', pa.string()),
            pa.field('bytes_value', pa.binary()),
            pa.field('float_value', pa.float32()),
            pa.field('int64_value', pa.int64())
        ]))))])
_SLICED_PARQUET_SCHEMA = pa.schema([
    pa.field(_SLICE_KEY_PARQUET_COLUMN_NAME, _SLICE_KEY_ARROW_TYPE),
    pa.field(_SERIALIZED_VALUE_PARQUET_COLUMN_NAME, pa.binary())
])
_UNSLICED_PARQUET_SCHEMA = pa.schema(
    [pa.field(_SERIALIZED_VALUE_PARQUET_COLUMN_NAME, pa.binary())])
_SliceKeyDictPythonType = Dict[Text, List[Dict[Text, Union[bytes, float,
                                                           int]]]]


def _match_all_files(file_path: Text) -> Text:
    """Return expression to match all files at given path."""
    _check_cast_case(case)


def test_simple_type_construction():
    result = pa.lib.TimestampType()
    with pytest.raises(TypeError):
        str(result)


@pytest.mark.parametrize(
    ('type', 'expected'),
    [(pa.null(), 'float64'),
     (pa.bool_(), 'bool'),
     (pa.int8(), 'int8'),
     (pa.int16(), 'int16'),
     (pa.int32(), 'int32'),
     (pa.int64(), 'int64'),
     (pa.uint8(), 'uint8'),
     (pa.uint16(), 'uint16'),
     (pa.uint32(), 'uint32'),
     (pa.uint64(), 'uint64'),
     (pa.float16(), 'float16'),
     (pa.float32(), 'float32'),
     (pa.float64(), 'float64'),
     (pa.date32(), 'date'),
     (pa.date64(), 'date'),
     (pa.binary(), 'bytes'),
     (pa.binary(length=4), 'bytes'),
     (pa.string(), 'unicode'),
     (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
     (pa.decimal(18, 3), 'decimal'),
     (pa.timestamp('ms'), 'datetime'),
     (pa.timestamp('us', 'UTC'), 'datetimetz'),
     (pa.time32('s'), 'time'),
     (pa.time64('us'), 'time')])
def test_logical_type(type, expected):
    assert get_logical_type(type) == expected


def test_array_conversions_no_sentinel_values():
    arr = np.array([1, 2, 3, 4], dtype='int8')
    refcount = sys.getrefcount(arr)
    arr2 = pa.array(arr)  # noqa
    assert sys.getrefcount(arr) == (refcount + 1)
import deepr as dpr

try:
    import pandas as pd
except ImportError as e:
    print(f"Pandas needs to be installed for MovieLens {e}")


LOGGER = logging.getLogger(__name__)

COLUMNS = ["uid", "user", "target"]

SCHEMA = pa.schema([
    ("uid", pa.int64()),
    ("user", pa.list_(pa.float32())),
    ("target", pa.list_(pa.int64()))
])


@dataclass
class Predict(dpr.jobs.Job):
    """Compute MovieLens predictions."""

    path_saved_model: str
    path_predictions: str
    input_fn: Callable[[], tf.data.Dataset]
    prepro_fn: Callable[[tf.data.Dataset, str], tf.data.Dataset]

    def run(self):
        LOGGER.info(f"Computing predictions from {self.path_saved_model}")
        predictor = dpr.predictors.SavedModelPredictor(
            path=dpr.predictors.get_latest_saved_model(self.path_saved_model)
def to_arrow_type(dt):
    """ Convert Spark data type to pyarrow type """
    from distutils.version import LooseVersion
    import pyarrow as pa

    if type(dt) == BooleanType:
        arrow_type = pa.bool_()
    elif type(dt) == ByteType:
        arrow_type = pa.int8()
    elif type(dt) == ShortType:
        arrow_type = pa.int16()
    elif type(dt) == IntegerType:
        arrow_type = pa.int32()
    elif type(dt) == LongType:
        arrow_type = pa.int64()
    elif type(dt) == FloatType:
        arrow_type = pa.float32()
    elif type(dt) == DoubleType:
        arrow_type = pa.float64()
    elif type(dt) == DecimalType:
        arrow_type = pa.decimal128(dt.precision, dt.scale)
    elif type(dt) == StringType:
        arrow_type = pa.string()
    elif type(dt) == BinaryType:
        arrow_type = pa.binary()
    elif type(dt) == DateType:
        arrow_type = pa.date32()
    elif type(dt) == TimestampType:
        # Timestamps should be in UTC, JVM Arrow timestamps require a
        # timezone to be read
        arrow_type = pa.timestamp('us', tz='UTC')
    elif type(dt) == TimestampNTZType:
        arrow_type = pa.timestamp('us', tz=None)
    elif type(dt) == ArrayType:
        if type(dt.elementType) in [StructType, TimestampType]:
            raise TypeError("Unsupported type in conversion to Arrow: " +
                            str(dt))
        arrow_type = pa.list_(to_arrow_type(dt.elementType))
    elif type(dt) == MapType:
        if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
            raise TypeError(
                "MapType is only supported with pyarrow 2.0.0 and above")
        if type(dt.keyType) in [StructType, TimestampType] or \
                type(dt.valueType) in [StructType, TimestampType]:
            raise TypeError("Unsupported type in conversion to Arrow: " +
                            str(dt))
        arrow_type = pa.map_(to_arrow_type(dt.keyType),
                             to_arrow_type(dt.valueType))
    elif type(dt) == StructType:
        if any(type(field.dataType) == StructType for field in dt):
            raise TypeError(
                "Nested StructType not supported in conversion to Arrow")
        fields = [
            pa.field(field.name, to_arrow_type(field.dataType),
                     nullable=field.nullable)
            for field in dt
        ]
        arrow_type = pa.struct(fields)
    elif type(dt) == NullType:
        arrow_type = pa.null()
    else:
        raise TypeError("Unsupported type in conversion to Arrow: " + str(dt))
    return arrow_type
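# A quick sanity check of the mapping above (assumes a pyspark installation):
# FloatType maps to float32, and ArrayType(FloatType) to list<float32>.
from pyspark.sql.types import ArrayType, FloatType

assert to_arrow_type(FloatType()) == pa.float32()
assert to_arrow_type(ArrayType(FloatType())) == pa.list_(pa.float32())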
@pytest.mark.pandas
@pytest.mark.parametrize(
    ('data', 'type', 'physical_type', 'min_value', 'max_value',
     'null_count', 'num_values', 'distinct_count'),
    [
        ([1, 2, 2, None, 4], pa.uint8(), 'INT32', 1, 4, 1, 4, 0),
        ([1, 2, 2, None, 4], pa.uint16(), 'INT32', 1, 4, 1, 4, 0),
        ([1, 2, 2, None, 4], pa.uint32(), 'INT32', 1, 4, 1, 4, 0),
        ([1, 2, 2, None, 4], pa.uint64(), 'INT64', 1, 4, 1, 4, 0),
        ([-1, 2, 2, None, 4], pa.int8(), 'INT32', -1, 4, 1, 4, 0),
        ([-1, 2, 2, None, 4], pa.int16(), 'INT32', -1, 4, 1, 4, 0),
        ([-1, 2, 2, None, 4], pa.int32(), 'INT32', -1, 4, 1, 4, 0),
        ([-1, 2, 2, None, 4], pa.int64(), 'INT64', -1, 4, 1, 4, 0),
        ([-1.1, 2.2, 2.3, None, 4.4], pa.float32(),
         'FLOAT', -1.1, 4.4, 1, 4, 0),
        ([-1.1, 2.2, 2.3, None, 4.4], pa.float64(),
         'DOUBLE', -1.1, 4.4, 1, 4, 0),
        (['', 'b', chr(1000), None, 'aaa'], pa.binary(),
         'BYTE_ARRAY', b'', chr(1000).encode('utf-8'), 1, 4, 0),
        ([True, False, False, True, True], pa.bool_(),
         'BOOLEAN', False, True, 0, 5, 0),
        ([b'\x00', b'b', b'12', None, b'aaa'], pa.binary(),
         'BYTE_ARRAY', b'\x00', b'b', 1, 4, 0),
    ])
def test_parquet_column_statistics_api(data, type, physical_type, min_value,
                                       max_value, null_count, num_values,
                                       distinct_count):
    df = pd.DataFrame({'data': data})
    schema = pa.schema([pa.field('data', type)])
    table = pa.Table.from_pandas(df, schema=schema, safe=False)
@pytest.mark.parametrize(
    ('type', 'expected'),
    [
        (pa.null(), 'empty'),
        (pa.bool_(), 'bool'),
        (pa.int8(), 'int8'),
        (pa.int16(), 'int16'),
        (pa.int32(), 'int32'),
        (pa.int64(), 'int64'),
        (pa.uint8(), 'uint8'),
        (pa.uint16(), 'uint16'),
        (pa.uint32(), 'uint32'),
        (pa.uint64(), 'uint64'),
        (pa.float16(), 'float16'),
        (pa.float32(), 'float32'),
        (pa.float64(), 'float64'),
        (pa.date32(), 'date'),
        (pa.date64(), 'date'),
        (pa.binary(), 'bytes'),
        (pa.binary(length=4), 'bytes'),
        (pa.string(), 'unicode'),
        (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
        (pa.decimal128(18, 3), 'decimal'),
        (pa.timestamp('ms'), 'datetime'),
        (pa.timestamp('us', 'UTC'), 'datetimetz'),
        (pa.time32('s'), 'time'),
        (pa.time64('us'), 'time')
    ]
)
def test_logical_type(type, expected):
def create_float32():
    import pyarrow as pa
    return convert(pa.array([np.float32(1), np.float32(2)],
                            type=pa.float32()))
def main():
    # https://arrow.apache.org/docs/python/api/datatypes.html
    my_schema = pa.schema([
        # skip null
        ('c_bool', pa.bool_()),
        ('c_int8', pa.int8()),
        ('c_int16', pa.int16()),
        ('c_int32', pa.int32()),
        ('c_int64', pa.int64()),
        ('c_uint8', pa.uint8()),
        ('c_uint16', pa.uint16()),
        ('c_uint32', pa.uint32()),
        ('c_uint64', pa.uint64()),
        # skip ('c_float16', pa.float16()),
        ('c_float32', pa.float32()),
        ('c_float64', pa.float64()),
        ('c_time32', pa.time32('ms')),
        ('c_time64', pa.time64('ns')),
        ('c_timestamp', pa.timestamp('ms')),
        ('c_date32', pa.date32()),
        ('c_date64', pa.date64()),
        # skip binary
        ('c_string', pa.string()),
        # skip utf8
        # skip large_binary
        # skip large_string
        # skip large_utf8
        ('c_decimal128_8_3', pa.decimal128(8, 3))
        # skip list_
        # skip large_list
        # skip struct
        # skip dictionary
        # skip field
        # skip schema
        # skip from_numpy_dtype
    ])

    c_bool = pa.array([False, True, False], type=pa.bool_())
    c_int8 = pa.array([1, 2, 3], type=pa.int8())
    c_int16 = pa.array([1, 2, 3], type=pa.int16())
    c_int32 = pa.array([1, 2, 3], type=pa.int32())
    c_int64 = pa.array([1, 2, 3], type=pa.int64())
    c_uint8 = pa.array([1, 2, 3], type=pa.uint8())
    c_uint16 = pa.array([1, 2, 3], type=pa.uint16())
    c_uint32 = pa.array([1, 2, 3], type=pa.uint32())
    c_uint64 = pa.array([1, 2, 3], type=pa.uint64())
    # c_float16 = pa.array([np.float16(1.0), np.float16(2.0),
    #                       np.float16(3.0)], type=pa.float16())
    c_float32 = pa.array([1.0, 2.0, 3.0], type=pa.float32())
    c_float64 = pa.array([1.0, 2.0, 3.0], type=pa.float64())
    c_time32 = pa.array([1, 2, 3], type=pa.time32('ms'))
    c_time64 = pa.array([1, 2, 3], type=pa.time64('ns'))
    c_timestamp = pa.array([
        datetime(2019, 9, 3, 9, 0, 0),
        datetime(2019, 9, 3, 10, 0, 0),
        datetime(2019, 9, 3, 11, 0, 0)
    ], type=pa.timestamp('ms'))
    c_date32 = pa.array([
        datetime(2019, 9, 3, 9, 0, 0),
        datetime(2019, 9, 3, 10, 0, 0),
        datetime(2019, 9, 3, 11, 0, 0)
    ], type=pa.date32())
    c_date64 = pa.array([
        datetime(2019, 9, 3, 9, 0, 0),
        datetime(2019, 9, 3, 10, 0, 0),
        datetime(2019, 9, 3, 11, 0, 0)
    ], type=pa.date64())
    c_string = pa.array(
        ['*****@*****.**', '*****@*****.**', '*****@*****.**'],
        type=pa.string()
    )
    c_decimal128_8_3 = pa.array([1, 2, 3], type=pa.decimal128(8, 3))

    batch = pa.RecordBatch.from_arrays(
        [c_bool,
         c_int8, c_int16, c_int32, c_int64,
         c_uint8, c_uint16, c_uint32, c_uint64,
         # c_float16,
         c_float32, c_float64,
         c_time32, c_time64, c_timestamp, c_date32, c_date64,
         c_string, c_decimal128_8_3],
        schema=my_schema
    )

    table = pa.Table.from_batches([batch])
    pq.write_table(table, 'example.parquet')
    pa.int8(), pa.int16(), pa.int32(), pa.int64()
])

unsigned_integer_types = st.sampled_from([
    pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()
])

integer_types = st.one_of(signed_integer_types, unsigned_integer_types)

floating_types = st.sampled_from([
    pa.float16(), pa.float32(), pa.float64()
])

decimal_type = st.builds(
    pa.decimal128,
    precision=st.integers(min_value=1, max_value=38),
    scale=st.integers(min_value=1, max_value=38)
)

numeric_types = st.one_of(integer_types, floating_types, decimal_type)

date_types = st.sampled_from([
    pa.date32(), pa.date64()
])

time_types = st.sampled_from([
    pa.time32('s'),
            }
            feature {
              key: "float_feature_1"
              value { float_list { value: [ 4.0 ] } }
            }
            feature {
              key: "float_feature_2"
              value { float_list { value: [ 5.0, 6.0 ] } }
            }
            feature {
              key: "str_feature_1"
              value { bytes_list { value: [ 'female' ] } }
            }
            feature {
              key: "str_feature_2"
              value { bytes_list { value: [ 'string', 'list' ] } }
            }
        }
        ''',
        'decoded_table': pa.Table.from_arrays([
            pa.array([[0]], pa.list_(pa.int64())),
            pa.array([[1, 2, 3]], pa.list_(pa.int64())),
            pa.array([[4.0]], pa.list_(pa.float32())),
            pa.array([[5.0, 6.0]], pa.list_(pa.float32())),
            pa.array([[b'female']], pa.list_(pa.binary())),
            pa.array([[b'string', b'list']], pa.list_(pa.binary()))
        ], ['int_feature_1', 'int_feature_2', 'float_feature_1',
            'float_feature_2', 'str_feature_1', 'str_feature_2'])
    },
]
""" features { feature { key: "x" value { bytes_list { value: [] } } } feature { key: "y" value { float_list { value: [] } } } feature { key: "z" value { int64_list { value: [] } } } } """, ] _DECODE_CASES = [ dict(testcase_name="without_schema_simple", schema_text_proto=None, examples_text_proto=_TEST_EXAMPLES, expected=pa.RecordBatch.from_arrays([ pa.array([None, None, [1.0], None], type=pa.large_list(pa.float32())), pa.array([None, None, None, None], type=pa.null()), pa.array([[b"a", b"b"], None, None, []], type=pa.large_list(pa.large_binary())), pa.array([[1.0, 2.0], None, None, []], type=pa.large_list(pa.float32())), pa.array([[4, 5], None, None, []], type=pa.large_list(pa.int64())) ], ["v", "w", "x", "y", "z"])), dict(testcase_name="with_schema_simple", schema_text_proto=""" feature { name: "x" type: BYTES } feature { name: "y"
import pyarrow as pa
import pyarrow.types as types


MANY_TYPES = [
    pa.null(),
    pa.bool_(),
    pa.int32(),
    pa.time32('s'),
    pa.time64('us'),
    pa.date32(),
    pa.timestamp('us'),
    pa.timestamp('us', tz='UTC'),
    pa.timestamp('us', tz='Europe/Paris'),
    pa.float16(),
    pa.float32(),
    pa.float64(),
    pa.decimal128(19, 4),
    pa.string(),
    pa.binary(),
    pa.binary(10),
    pa.list_(pa.int32()),
    pa.struct([
        pa.field('a', pa.int32()),
        pa.field('b', pa.int8()),
        pa.field('c', pa.string())
    ]),
    pa.union([pa.field('a', pa.binary(10)),
              pa.field('b', pa.string())],
             mode=pa.lib.UnionMode_DENSE),
    pa.union([pa.field('a', pa.binary(10)),
]

if not pa_version_under1p01:
    import pyarrow as pa

    UNSIGNED_INT_PYARROW_DTYPES = [
        pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()
    ]
    SIGNED_INT_PYARROW_DTYPES = [
        pa.int8(), pa.int16(), pa.int32(), pa.int64()
    ]
    ALL_INT_PYARROW_DTYPES = (
        UNSIGNED_INT_PYARROW_DTYPES + SIGNED_INT_PYARROW_DTYPES
    )

    FLOAT_PYARROW_DTYPES = [pa.float32(), pa.float64()]
    STRING_PYARROW_DTYPES = [pa.string(), pa.utf8()]

    TIME_PYARROW_DTYPES = [
        pa.time32("s"),
        pa.time32("ms"),
        pa.time64("us"),
        pa.time64("ns"),
    ]
    DATE_PYARROW_DTYPES = [pa.date32(), pa.date64()]
    DATETIME_PYARROW_DTYPES = [
        pa.timestamp(unit=unit, tz=tz)
        for unit in ["s", "ms", "us", "ns"]
        for tz in [None, "UTC", "US/Pacific", "US/Eastern"]
    ]
    TIMEDELTA_PYARROW_DTYPES = [
        pa.duration(unit) for unit in ["s", "ms", "us", "ns"]
def test_sql(redshift_table, postgresql_table, mysql_table,
             databases_parameters, db_type):
    if db_type == "postgresql":
        table = postgresql_table
    elif db_type == "mysql":
        table = mysql_table
    else:
        table = redshift_table
    df = get_df()
    if db_type == "redshift":
        df.drop(["binary"], axis=1, inplace=True)
    engine = wr.catalog.get_engine(
        connection=f"aws-data-wrangler-{db_type}", echo=False)
    index = True if engine.name == "redshift" else False
    wr.db.to_sql(
        df=df,
        con=engine,
        name=table,
        schema=databases_parameters[db_type]["schema"],
        if_exists="replace",
        index=index,
        index_label=None,
        chunksize=None,
        method=None,
        dtype={"iint32": sqlalchemy.types.Integer},
    )
    df = wr.db.read_sql_query(
        sql=f"SELECT * FROM {databases_parameters[db_type]['schema']}.{table}",
        con=engine)
    ensure_data_types(df, has_list=False)
    engine = wr.db.get_engine(
        db_type=db_type,
        host=databases_parameters[db_type]["host"],
        port=databases_parameters[db_type]["port"],
        database=databases_parameters[db_type]["database"],
        user=databases_parameters["user"],
        password=databases_parameters["password"],
        echo=False,
    )
    dfs = wr.db.read_sql_query(
        sql=f"SELECT * FROM {databases_parameters[db_type]['schema']}.{table}",
        con=engine,
        chunksize=1,
        dtype={
            "iint8": pa.int8(),
            "iint16": pa.int16(),
            "iint32": pa.int32(),
            "iint64": pa.int64(),
            "float": pa.float32(),
            "double": pa.float64(),
            "decimal": pa.decimal128(3, 2),
            "string_object": pa.string(),
            "string": pa.string(),
            "date": pa.date32(),
            "timestamp": pa.timestamp(unit="ns"),
            "binary": pa.binary(),
            "category": pa.float64(),
        },
    )
    for df in dfs:
        ensure_data_types(df, has_list=False)
    if db_type != "redshift":
        account_id = boto3.client("sts").get_caller_identity().get("Account")
        engine = wr.catalog.get_engine(
            connection=f"aws-data-wrangler-{db_type}", catalog_id=account_id)
        wr.db.to_sql(
            df=pd.DataFrame({"col0": [1, 2, 3]}, dtype="Int32"),
            con=engine,
            name=table,
            schema=databases_parameters[db_type]["schema"],
            if_exists="replace",
            index=True,
            index_label="index",
        )
        schema = None
        if db_type == "postgresql":
            schema = databases_parameters[db_type]["schema"]
        df = wr.db.read_sql_table(
            con=engine, table=table, schema=schema, index_col="index")
        assert df.shape == (3, 1)
"INT64", pyarrow.int32().id: "INT64", pyarrow.int64().id: "INT64", pyarrow.uint8().id: "INT64", pyarrow.uint16().id: "INT64", pyarrow.uint32().id: "INT64", pyarrow.uint64().id: "INT64", pyarrow.float16().id: "FLOAT64", pyarrow.float32().id: "FLOAT64", pyarrow.float64().id: "FLOAT64", pyarrow.time32("ms").id: "TIME", pyarrow.time64("ns").id: "TIME", pyarrow.timestamp("ns").id: "TIMESTAMP", pyarrow.date32().id: "DATE", pyarrow.date64().id: "DATETIME", # because millisecond resolution pyarrow.binary().id: "BYTES",
def clean_data_common(self, processed_data, raw_data):
    """Fix the type and default value of each extracted field

    This routine is common to all services. It ensures that all the
    missing fields, as defined by the schema, are added to the records
    extracted. Furthermore, each field is set to the specified type.
    """

    # Build default data structure
    schema_rec = {}
    def_vals = self._get_default_vals()

    ptype_map = {
        pa.string(): str,
        pa.int32(): int,
        pa.int64(): int,
        pa.float32(): float,
        pa.float64(): float,
        pa.date64(): float,
        pa.list_(pa.string()): list,
        pa.list_(pa.int64()): list,
        pa.bool_(): bool,
    }

    for fld in self.schema:
        default = def_vals[fld.type]
        schema_rec.update({fld.name: default})

    if isinstance(raw_data, list):
        read_from = raw_data[0]
    else:
        read_from = raw_data

    # pylint: disable=too-many-nested-blocks
    for entry in processed_data or []:
        entry.update({"hostname": read_from["hostname"]})
        entry.update({"namespace": read_from["namespace"]})
        entry.update({"timestamp": read_from["timestamp"]})
        entry.update({"sqvers": self.version})
        for fld, val in schema_rec.items():
            if fld not in entry:
                if fld == "active":
                    entry.update({fld: True})
                else:
                    entry.update({fld: val})
            else:
                fld_type = self.schema.field(fld).type
                if not isinstance(entry[fld], ptype_map[fld_type]):
                    try:
                        entry[fld] = ptype_map[fld_type](entry[fld])
                    except (ValueError, TypeError):
                        entry[fld] = val
                elif isinstance(entry[fld], list):
                    for i, ele in enumerate(entry[fld]):
                        if not isinstance(
                                ele, ptype_map[fld_type.value_type]):
                            try:
                                if ptype_map[fld_type.value_type] == int:
                                    entry[fld][i] = int(entry[fld][i])
                                elif ptype_map[fld_type.value_type] == str:
                                    entry[fld][i] = str(entry[fld][i])
                                else:
                                    raise ValueError
                            except (ValueError, TypeError):
                                entry[fld][i] = val

    return processed_data
""" Copyright (C) 2018 Anthony Potappel, The Netherlands. All Rights Reserved. This work is licensed under the terms of the MIT license (for details, see attached LICENSE file). """ import pyarrow as pa _ENDIANNESS = '<' _DTYPES_CONV = { _ENDIANNESS + 'f2': pa.float16(), _ENDIANNESS + 'f4': pa.float32(), _ENDIANNESS + 'f8': pa.float64(), _ENDIANNESS + 'i2': pa.int16(), _ENDIANNESS + 'i4': pa.int32(), _ENDIANNESS + 'i8': pa.int64(), _ENDIANNESS + 'u2': pa.uint16(), _ENDIANNESS + 'u4': pa.uint32(), _ENDIANNESS + 'u8': pa.uint64(), '|i1': pa.int8(), '|u1': pa.uint8(), } _DTYPES_CONV_STR = { "float16": pa.float16(), "float32": pa.float32(), "float64": pa.float64(), "int16": pa.int16(), "int32": pa.int32(), "int64": pa.int64(), "uint16": pa.uint16(),
bool_type = st.just(pa.bool_())
binary_type = st.just(pa.binary())
string_type = st.just(pa.string())
large_binary_type = st.just(pa.large_binary())
large_string_type = st.just(pa.large_string())

signed_integer_types = st.sampled_from(
    [pa.int8(), pa.int16(), pa.int32(), pa.int64()])
unsigned_integer_types = st.sampled_from(
    [pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()])
integer_types = st.one_of(signed_integer_types, unsigned_integer_types)

floating_types = st.sampled_from([pa.float16(), pa.float32(), pa.float64()])
decimal_type = st.builds(
    pa.decimal128,
    precision=st.integers(min_value=1, max_value=38),
    scale=st.integers(min_value=1, max_value=38))
numeric_types = st.one_of(integer_types, floating_types, decimal_type)

date_types = st.sampled_from([pa.date32(), pa.date64()])
time_types = st.sampled_from(
    [pa.time32('s'), pa.time32('ms'), pa.time64('us'), pa.time64('ns')])
timestamp_types = st.builds(
    pa.timestamp,
    unit=st.sampled_from(['s', 'ms', 'us', 'ns']),
    tz=tzst.timezones())
temporal_types = st.one_of(date_types, time_types, timestamp_types)
    dict(
        testcase_name='simple',
        input_lines=['1,2.0,hello', '5,12.34,world'],
        column_names=['int_feature', 'float_feature', 'str_feature'],
        expected_csv_cells=[
            [b'1', b'2.0', b'hello'],
            [b'5', b'12.34', b'world'],
        ],
        expected_types=[
            csv_decoder.ColumnType.INT,
            csv_decoder.ColumnType.FLOAT,
            csv_decoder.ColumnType.STRING,
        ],
        expected_record_batch=pa.RecordBatch.from_arrays([
            pa.array([[1], [5]], pa.list_(pa.int64())),
            pa.array([[2.0], [12.34]], pa.list_(pa.float32())),
            pa.array([[b'hello'], [b'world']], pa.list_(pa.binary()))
        ], ['int_feature', 'float_feature', 'str_feature'])),
    dict(
        testcase_name='missing_values',
        input_lines=[',,', '1,,hello', ',12.34,'],
        column_names=['f1', 'f2', 'f3'],
        expected_csv_cells=[
            [b'', b'', b''],
            [b'1', b'', b'hello'],
            [b'', b'12.34', b''],
        ],
        expected_types=[
            csv_decoder.ColumnType.INT,
            csv_decoder.ColumnType.FLOAT,
            csv_decoder.ColumnType.STRING,