def test_map_from_dicts():
    """Maps built from {'key': ..., 'value': ...} dicts round-trip via pa.array."""
    data = [
        [{'key': b'a', 'value': 1}, {'key': b'b', 'value': 2}],
        [{'key': b'c', 'value': 3}],
        [{'key': b'd', 'value': 4}, {'key': b'e', 'value': 5},
         {'key': b'f', 'value': None}],
        [{'key': b'g', 'value': 7}],
    ]
    map_type = pa.map_(pa.binary(), pa.int32())
    expected = [[(item['key'], item['value']) for item in pairs]
                for pairs in data]
    assert pa.array(expected, type=map_type).to_pylist() == expected

    # With omitted values
    data[1] = None
    expected[1] = None
    assert pa.array(expected, type=map_type).to_pylist() == expected

    # Invalid dictionary
    for pairs in ([{'value': 5}], [{}], [{'k': 1, 'v': 2}]):
        with pytest.raises(ValueError, match="Invalid Map"):
            pa.array([pairs], type=pa.map_('i4', 'i4'))

    # Invalid dictionary types
    for pairs in ([{'key': '1', 'value': 5}], [{'key': {'value': 2}}]):
        with pytest.raises(TypeError, match="integer is required"):
            pa.array([pairs], type=pa.map_('i4', 'i4'))
def pyarrow_datatype_from_dict(json_dict: Dict) -> pyarrow.DataType:
    """
    Create a DataType in PyArrow format from a Schema json format.

    :param json_dict: the DataType in json format
    :return: the DataType in PyArrow format
    """
    # FIX: removed a stray no-op string literal ("") that followed the
    # docstring in the original.
    type_class = json_dict["type"]["name"]
    if type_class == "dictionary":
        # Dictionary-encoded entries are rendered as a map of
        # index type -> first child type.
        key_type = pyarrow_datatype_from_dict(json_dict["dictionary"]["indexType"])
        value_type = pyarrow_datatype_from_dict(json_dict["children"][0])
        return pyarrow.map_(key_type, value_type)
    elif type_class == "list":
        element_type = pyarrow_datatype_from_dict(json_dict["children"][0])
        return pyarrow.list_(element_type)
    elif type_class == "struct":
        fields = [
            pyarrow_field_from_dict(field) for field in json_dict["children"]
        ]
        return pyarrow.struct(fields)
    elif type_class in ("int", "float", "date"):
        # e.g. "int" + bitWidth 32 -> alias "int32"
        return pyarrow.type_for_alias(
            f'{type_class}{json_dict["type"]["bitWidth"]}')
    elif type_class == "time":
        type_info = json_dict["type"]
        if type_info["unit"] == "MICROSECOND":
            unit = "us"
        elif type_info["unit"] == "NANOSECOND":
            unit = "ns"
        elif type_info["unit"] == "MILLISECOND":
            unit = "ms"
        else:
            unit = "s"
        return pyarrow.type_for_alias(
            f'{type_class}{type_info["bitWidth"]}[{unit}]')
    elif type_class == "timestamp":
        type_info = json_dict["type"]
        # FIX: default first so `unit` is always bound — the original left it
        # undefined (NameError) whenever the "unit" key was missing.
        unit = "ns"
        if "unit" in type_info:
            if type_info["unit"] == "MICROSECOND":
                unit = "us"
            elif type_info["unit"] == "NANOSECOND":
                unit = "ns"
            elif type_info["unit"] == "MILLISECOND":
                unit = "ms"
            elif type_info["unit"] == "SECOND":
                unit = "s"
        return pyarrow.type_for_alias(f"{type_class}[{unit}]")
    elif type_class.startswith("decimal"):
        type_info = json_dict["type"]
        return pyarrow.decimal128(precision=type_info["precision"],
                                  scale=type_info["scale"])
    else:
        # Plain scalar types ("bool", "utf8", ...) resolve by alias name.
        return pyarrow.type_for_alias(type_class)
def test_map_from_tuples():
    """Maps built from (key, value) tuples round-trip via pa.array."""
    expected = [
        [(b'a', 1), (b'b', 2)],
        [(b'c', 3)],
        [(b'd', 4), (b'e', 5), (b'f', None)],
        [(b'g', 7)],
    ]
    map_type = pa.map_(pa.binary(), pa.int32())
    assert pa.array(expected, type=map_type).to_pylist() == expected

    # With omitted values
    expected[1] = None
    assert pa.array(expected, type=map_type).to_pylist() == expected

    # Invalid tuple size
    for pairs in ([(5, )], [()], [('5', 'foo', True)]):
        with pytest.raises(ValueError, match="(?i)tuple size"):
            pa.array([pairs], type=pa.map_('i4', 'i4'))
def test_type_schema_pickling():
    """Types, fields, and a schema built from them survive pickling."""
    cases = [
        pa.int8(),
        pa.string(),
        pa.binary(),
        pa.binary(10),
        pa.list_(pa.string()),
        pa.map_(pa.string(), pa.int8()),
        pa.struct([pa.field('a', 'int8'), pa.field('b', 'string')]),
        pa.union([pa.field('a', pa.int8()), pa.field('b', pa.int16())],
                 pa.lib.UnionMode_SPARSE),
        pa.union([pa.field('a', pa.int8()), pa.field('b', pa.int16())],
                 pa.lib.UnionMode_DENSE),
        pa.time32('s'),
        pa.time64('us'),
        pa.date32(),
        pa.date64(),
        pa.timestamp('ms'),
        pa.timestamp('ns'),
        pa.decimal128(12, 2),
        pa.decimal256(76, 38),
        pa.field('a', 'string', metadata={b'foo': b'bar'}),
        pa.list_(pa.field("element", pa.int64())),
        pa.large_list(pa.field("element", pa.int64())),
        pa.map_(pa.field("key", pa.string(), nullable=False),
                pa.field("value", pa.int8())),
    ]

    # Every case individually round-trips through pickle.
    for val in cases:
        assert pickle.loads(pickle.dumps(val)) == val

    # A schema assembled from the same cases round-trips too; bare types
    # are wrapped into synthetic fields first.
    fields = [
        f if isinstance(f, pa.Field) else pa.field('_f{}'.format(i), f)
        for i, f in enumerate(cases)
    ]
    schema = pa.schema(fields, metadata={b'foo': b'bar'})
    assert pickle.loads(pickle.dumps(schema)) == schema
def to_arrow_type(dt):
    """ Convert Spark data type to pyarrow type """
    from distutils.version import LooseVersion
    import pyarrow as pa

    # Parameterless Spark types map one-to-one onto Arrow types.
    flat_conversions = {
        BooleanType: pa.bool_(),
        ByteType: pa.int8(),
        ShortType: pa.int16(),
        IntegerType: pa.int32(),
        LongType: pa.int64(),
        FloatType: pa.float32(),
        DoubleType: pa.float64(),
        StringType: pa.string(),
        BinaryType: pa.binary(),
        DateType: pa.date32(),
        NullType: pa.null(),
    }
    spark_class = type(dt)
    if spark_class in flat_conversions:
        return flat_conversions[spark_class]
    if spark_class == DecimalType:
        return pa.decimal128(dt.precision, dt.scale)
    if spark_class == TimestampType:
        # Timestamps should be in UTC, JVM Arrow timestamps require a timezone to be read
        return pa.timestamp('us', tz='UTC')
    if spark_class == ArrayType:
        if type(dt.elementType) in [StructType, TimestampType]:
            raise TypeError("Unsupported type in conversion to Arrow: " + str(dt))
        return pa.list_(to_arrow_type(dt.elementType))
    if spark_class == MapType:
        if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
            raise TypeError("MapType is only supported with pyarrow 2.0.0 and above")
        if type(dt.keyType) in [StructType, TimestampType] or \
                type(dt.valueType) in [StructType, TimestampType]:
            raise TypeError("Unsupported type in conversion to Arrow: " + str(dt))
        return pa.map_(to_arrow_type(dt.keyType), to_arrow_type(dt.valueType))
    if spark_class == StructType:
        if any(type(field.dataType) == StructType for field in dt):
            raise TypeError("Nested StructType not supported in conversion to Arrow")
        return pa.struct([
            pa.field(field.name, to_arrow_type(field.dataType),
                     nullable=field.nullable)
            for field in dt
        ])
    raise TypeError("Unsupported type in conversion to Arrow: " + str(dt))
def test_map():
    """MapScalar supports len, as_py, repr, and (negative) indexing."""
    ty = pa.map_(pa.string(), pa.int8())
    pairs = [('a', 1), ('b', 2)]
    s = pa.scalar(pairs, type=ty)

    assert isinstance(s, pa.MapScalar)
    assert isinstance(s.values, pa.Array)
    assert len(s) == 2
    assert s.as_py() == pairs
    assert repr(s) == "<pyarrow.MapScalar: [('a', 1), ('b', 2)]>"

    # Positive and negative indexing yield (key, value) scalar pairs.
    assert s[1] == (pa.scalar('b', type=pa.string()),
                    pa.scalar(2, type=pa.int8()))
    assert s[-1] == s[1]
    assert s[-2] == s[0]

    # Out-of-range access raises in both directions.
    for bad_index in (-3, 2):
        with pytest.raises(IndexError):
            s[bad_index]
def athena2pyarrow(dtype: str) -> pa.DataType:  # pylint: disable=too-many-return-statements,too-many-branches
    """Athena to PyArrow data types conversion.

    :param dtype: Athena type string (e.g. "int", "decimal(10,2)", "array<int>").
    :return: the equivalent PyArrow DataType.
    :raises exceptions.UnsupportedType: when the type has no PyArrow equivalent.
    """
    # Keep the original spelling: nested array/struct/map payloads are sliced
    # out of it below (the lowered copy has its spaces stripped). Assigning it
    # unconditionally also avoids a NameError the original had for mixed-case
    # complex types such as "Array<int>".
    orig_dtype: str = dtype
    dtype = dtype.lower().replace(" ", "")
    if dtype == "tinyint":
        return pa.int8()
    if dtype == "smallint":
        return pa.int16()
    if dtype in ("int", "integer"):
        return pa.int32()
    if dtype == "bigint":
        return pa.int64()
    if dtype in ("float", "real"):
        return pa.float32()
    if dtype == "double":
        return pa.float64()
    if dtype == "boolean":
        return pa.bool_()
    if dtype == "string" or dtype.startswith("char") or dtype.startswith("varchar"):
        return pa.string()
    if dtype == "timestamp":
        return pa.timestamp(unit="ns")
    if dtype == "date":
        return pa.date32()
    # FIX: the original tested `dtype in ("binary" or "varbinary")`, which is
    # a substring check against "binary" and never matched "varbinary".
    if dtype in ("binary", "varbinary"):
        return pa.binary()
    if dtype.startswith("decimal"):
        precision, scale = dtype.replace("decimal(", "").replace(")", "").split(sep=",")
        return pa.decimal128(precision=int(precision), scale=int(scale))
    if dtype.startswith("array"):
        # strip the leading "array<" and the trailing ">"
        return pa.list_(value_type=athena2pyarrow(dtype=orig_dtype[6:-1]), list_size=-1)
    if dtype.startswith("struct"):
        # strip "struct<...>" and split "name:type" pairs
        return pa.struct(
            [(f.split(":", 1)[0], athena2pyarrow(f.split(":", 1)[1])) for f in _split_struct(orig_dtype[7:-1])]
        )
    if dtype.startswith("map"):
        # strip "map<...>" and split into key/value type strings
        parts: List[str] = _split_map(s=orig_dtype[4:-1])
        return pa.map_(athena2pyarrow(parts[0]), athena2pyarrow(parts[1]))
    raise exceptions.UnsupportedType(f"Unsupported Athena type: {dtype}")
def test_read_pandas_map_fields(tempdir):
    # ARROW-10140 - table created from Pandas with mapping fields
    frame = pd.DataFrame({
        'col1': pd.Series([
            [('id', 'something'), ('value2', 'else')],
            [('id', 'something2'), ('value', 'else2')],
        ]),
        'col2': pd.Series(['foo', 'bar']),
    })
    path = tempdir / 'data.parquet'

    # Build a schema where col1 is a string->string map and write it out.
    map_type = pa.map_(pa.string(), pa.string())
    schema = pa.schema([pa.field('col1', map_type),
                        pa.field('col2', pa.string())])
    _write_table(pa.Table.from_pandas(frame, schema), path)

    # Reading back through pandas must reproduce the original frame.
    round_tripped = pq.read_pandas(path).to_pandas()
    tm.assert_frame_equal(round_tripped, frame)
def test_to_column_info():
    """to_column_info maps each Arrow field to an Athena-style column
    descriptor (Name/Nullable/Precision/Scale/Type)."""
    schema = pa.schema([
        pa.field("col_boolean", pa.bool_()),
        pa.field("col_tinyint", pa.int32()),
        pa.field("col_smallint", pa.int32()),
        pa.field("col_int", pa.int32()),
        pa.field("col_bigint", pa.int64()),
        pa.field("col_float", pa.float32()),
        pa.field("col_double", pa.float64()),
        pa.field("col_string", pa.string()),
        pa.field("col_varchar", pa.string()),
        pa.field("col_timestamp", pa.timestamp("ns")),
        pa.field("col_date", pa.date32()),
        pa.field("col_binary", pa.binary()),
        pa.field("col_array", pa.list_(pa.field("array_element", pa.int32()))),
        pa.field("col_map", pa.map_(pa.int32(), pa.field("entries", pa.int32()))),
        pa.field(
            "col_struct",
            pa.struct([pa.field("a", pa.int32()), pa.field("b", pa.int32())]),
        ),
        pa.field("col_decimal", pa.decimal128(10, 1)),
    ])
    # Expected output: one descriptor per field, in schema order. Precision
    # values mirror what the converter emits per type (e.g. 10 for 32-bit
    # integers, 19 for bigint, 2147483647 for varchar).
    assert to_column_info(schema) == (
        {
            "Name": "col_boolean",
            "Nullable": "NULLABLE",
            "Precision": 0,
            "Scale": 0,
            "Type": "boolean",
        },
        {
            "Name": "col_tinyint",
            "Nullable": "NULLABLE",
            "Precision": 10,
            "Scale": 0,
            "Type": "integer",
        },
        {
            "Name": "col_smallint",
            "Nullable": "NULLABLE",
            "Precision": 10,
            "Scale": 0,
            "Type": "integer",
        },
        {
            "Name": "col_int",
            "Nullable": "NULLABLE",
            "Precision": 10,
            "Scale": 0,
            "Type": "integer",
        },
        {
            "Name": "col_bigint",
            "Nullable": "NULLABLE",
            "Precision": 19,
            "Scale": 0,
            "Type": "bigint",
        },
        {
            "Name": "col_float",
            "Nullable": "NULLABLE",
            "Precision": 17,
            "Scale": 0,
            "Type": "float",
        },
        {
            "Name": "col_double",
            "Nullable": "NULLABLE",
            "Precision": 17,
            "Scale": 0,
            "Type": "double",
        },
        {
            "Name": "col_string",
            "Nullable": "NULLABLE",
            "Precision": 2147483647,
            "Scale": 0,
            "Type": "varchar",
        },
        {
            "Name": "col_varchar",
            "Nullable": "NULLABLE",
            "Precision": 2147483647,
            "Scale": 0,
            "Type": "varchar",
        },
        {
            "Name": "col_timestamp",
            "Nullable": "NULLABLE",
            "Precision": 3,
            "Scale": 0,
            "Type": "timestamp",
        },
        {
            "Name": "col_date",
            "Nullable": "NULLABLE",
            "Precision": 0,
            "Scale": 0,
            "Type": "date",
        },
        {
            "Name": "col_binary",
            "Nullable": "NULLABLE",
            "Precision": 1073741824,
            "Scale": 0,
            "Type": "varbinary",
        },
        {
            "Name": "col_array",
            "Nullable": "NULLABLE",
            "Precision": 0,
            "Scale": 0,
            "Type": "array",
        },
        {
            "Name": "col_map",
            "Nullable": "NULLABLE",
            "Precision": 0,
            "Scale": 0,
            "Type": "map",
        },
        {
            "Name": "col_struct",
            "Nullable": "NULLABLE",
            "Precision": 0,
            "Scale": 0,
            "Type": "row",
        },
        {
            "Name": "col_decimal",
            "Nullable": "NULLABLE",
            "Precision": 10,
            "Scale": 1,
            "Type": "decimal",
        },
    )
def map_types(draw, key_strategy=primitive_types, item_strategy=primitive_types):
    """Hypothesis strategy body: draw a pa.map_ type whose key is non-null."""
    drawn_key = draw(key_strategy)
    # Map keys may not be the null type; discard such draws.
    h.assume(not pa.types.is_null(drawn_key))
    drawn_value = draw(item_strategy)
    return pa.map_(drawn_key, drawn_value)
def generate_test_parquet():
    """Generate ogr/data/parquet/test.parquet exercising (nearly) every
    Arrow column type.

    Every column has 5 rows; most insert a null at index 2 so null handling
    is covered. The file is written uncompressed with row groups of 3.
    NOTE(review): relies on a module-level ``wkt_epsg_4326`` constant for the
    "geo" schema metadata — defined elsewhere in this file.
    """
    import pyarrow as pa
    import datetime
    import decimal
    import json
    import pandas as pd
    import pathlib
    import pyarrow.parquet as pq
    import struct

    # --- scalar boolean / integer / float columns (null at index 2) ---
    boolean = pa.array([True, False, None, False, True], type=pa.bool_())
    uint8 = pa.array([None if i == 2 else 1 + i for i in range(5)],
                     type=pa.uint8())
    int8 = pa.array([None if i == 2 else -2 + i for i in range(5)],
                    type=pa.int8())
    uint16 = pa.array([None if i == 2 else 1 + i * 10000 for i in range(5)],
                      type=pa.uint16())
    int16 = pa.array(
        [None if i == 2 else -20000 + i * 10000 for i in range(5)],
        type=pa.int16())
    uint32 = pa.array(
        [None if i == 2 else 1 + i * 1000000000 for i in range(5)],
        type=pa.uint32())
    int32 = pa.array(
        [None if i == 2 else -2000000000 + i * 1000000000 for i in range(5)],
        type=pa.int32())
    uint64 = pa.array(
        [None if i == 2 else 1 + i * 100000000000 for i in range(5)],
        type=pa.uint64())
    int64 = pa.array([
        None if i == 2 else -200000000000 + i * 100000000000 for i in range(5)
    ], type=pa.int64())
    float32 = pa.array([None if i == 2 else 1.5 + i for i in range(5)],
                       type=pa.float32())
    float64 = pa.array([None if i == 2 else 1.5 + i for i in range(5)],
                       type=pa.float64())

    # --- string columns ---
    string = pa.array(["abcd", "", None, "c", "d"], type=pa.string())
    large_string = pa.array(["abcd", "", None, "c", "d"],
                            type=pa.large_string())

    # --- timestamps with a few fixed UTC offsets, plus a naive one ---
    gmt_plus_2 = datetime.timezone(datetime.timedelta(hours=2))
    timestamp_ms_gmt_plus_2 = pa.array([
        pd.Timestamp(year=2019, month=1, day=1, hour=14,
                     nanosecond=500 * 1e6, tz=gmt_plus_2)
    ] * 5, type=pa.timestamp('ms', tz=gmt_plus_2))
    gmt = datetime.timezone(datetime.timedelta(hours=0))
    timestamp_ms_gmt = pa.array([
        pd.Timestamp(year=2019, month=1, day=1, hour=14,
                     nanosecond=500 * 1e6, tz=gmt)
    ] * 5, type=pa.timestamp('ms', tz=gmt))
    gmt_minus_0215 = datetime.timezone(datetime.timedelta(hours=-2.25))
    timestamp_ms_gmt_minus_0215 = pa.array([
        pd.Timestamp(year=2019, month=1, day=1, hour=14,
                     nanosecond=500 * 1e6, tz=gmt_minus_0215)
    ] * 5, type=pa.timestamp('ms', tz=gmt_minus_0215))
    timestamp_s_no_tz = pa.array([
        pd.Timestamp(year=2019, month=1, day=1, hour=14, nanosecond=500 * 1e6)
    ] * 5, type=pa.timestamp('s'))

    # --- time / date / duration columns ---
    time32_s = pa.array([3600 + 120 + 3, None, 3, 4, 5], type=pa.time32('s'))
    time32_ms = pa.array([(3600 + 120 + 3) * 1000 + 456, 2, 3, 4, 5],
                         type=pa.time32('ms'))
    time64_us = pa.array([(3600 + 120 + 3) * 1e6, None, 3, 4, 5],
                         type=pa.time64('us'))
    time64_ns = pa.array([(3600 + 120 + 3) * 1e9 + 456, 2, 3, 4, 5],
                         type=pa.time64('ns'))
    date32 = pa.array([1, 2, 3, 4, 5], type=pa.date32())
    date64 = pa.array([86400 * 1000, 2, 3, 4, 5], type=pa.date64())
    # Durations are built but excluded from `names` below.
    duration_s = pa.array([1, 2, 3, 4, 5], type=pa.duration('s'))
    duration_ms = pa.array([1, 2, 3, 4, 5], type=pa.duration('ms'))

    # --- binary and decimal columns ---
    binary = pa.array([b'\x00\x01'] * 5, type=pa.binary())
    large_binary = pa.array([b'\x00\x01'] * 5, type=pa.large_binary())
    fixed_size_binary = pa.array([b'\x00\x01'] * 5, type=pa.binary(2))
    decimal128 = pa.array([
        decimal.Decimal('1234.567'),
        decimal.Decimal('-1234.567'), None,
        decimal.Decimal('1234.567'),
        decimal.Decimal('-1234.567')
    ], type=pa.decimal128(7, 3))
    decimal256 = pa.array([
        decimal.Decimal('1234.567'),
        decimal.Decimal('-1234.567'), None,
        decimal.Decimal('1234.567'),
        decimal.Decimal('-1234.567')
    ], type=pa.decimal256(7, 3))

    # --- variable-size list columns: row i has i elements, first is null ---
    list_boolean = pa.array([
        None if i == 2 else [
            None if j == 0 else True if (j % 2) == 0 else False
            for j in range(i)
        ] for i in range(5)
    ], type=pa.list_(pa.bool_()))
    list_uint8 = pa.array([
        None if i == 2 else
        [None if j == 0 else j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ], type=pa.list_(pa.uint8()))
    list_int8 = pa.array([
        None if i == 2 else
        [None if j == 0 else j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ], type=pa.list_(pa.int8()))
    list_uint16 = pa.array([
        None if i == 2 else
        [None if j == 0 else j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ], type=pa.list_(pa.uint16()))
    list_int16 = pa.array([
        None if i == 2 else
        [None if j == 0 else j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ], type=pa.list_(pa.int16()))
    list_uint32 = pa.array([
        None if i == 2 else
        [None if j == 0 else j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ], type=pa.list_(pa.uint32()))
    list_int32 = pa.array([
        None if i == 2 else
        [None if j == 0 else j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ], type=pa.list_(pa.int32()))
    list_uint64 = pa.array([
        None if i == 2 else
        [None if j == 0 else j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ], type=pa.list_(pa.uint64()))
    list_int64 = pa.array([
        None if i == 2 else
        [None if j == 0 else j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ], type=pa.list_(pa.int64()))
    list_float32 = pa.array([
        None if i == 2 else
        [None if j == 0 else 0.5 + j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ], type=pa.list_(pa.float32()))
    list_float64 = pa.array([
        None if i == 2 else
        [None if j == 0 else 0.5 + j + i * (i - 1) // 2 for j in range(i)]
        for i in range(5)
    ], type=pa.list_(pa.float64()))
    list_string = pa.array([
        None if i == 2 else [
            "".join(["%c" % (65 + j + k) for k in range(1 + j)])
            for j in range(i)
        ] for i in range(5)
    ])

    # --- fixed-size (length 2) list columns ---
    fixed_size_list_boolean = pa.array(
        [[True, False], [False, True], [True, False], [False, True],
         [True, False]], type=pa.list_(pa.bool_(), 2))
    fixed_size_list_uint8 = pa.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                                     type=pa.list_(pa.uint8(), 2))
    fixed_size_list_int8 = pa.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                                    type=pa.list_(pa.int8(), 2))
    fixed_size_list_uint16 = pa.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                                      type=pa.list_(pa.uint16(), 2))
    fixed_size_list_int16 = pa.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                                     type=pa.list_(pa.int16(), 2))
    fixed_size_list_uint32 = pa.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                                      type=pa.list_(pa.uint32(), 2))
    fixed_size_list_int32 = pa.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                                     type=pa.list_(pa.int32(), 2))
    fixed_size_list_uint64 = pa.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                                      type=pa.list_(pa.uint64(), 2))
    fixed_size_list_int64 = pa.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                                     type=pa.list_(pa.int64(), 2))
    fixed_size_list_float32 = pa.array(
        [[0, None], [2, 3], [4, 5], [6, 7], [8, 9]],
        type=pa.list_(pa.float32(), 2))
    fixed_size_list_float64 = pa.array(
        [[0, None], [2, 3], [4, 5], [6, 7], [8, 9]],
        type=pa.list_(pa.float64(), 2))
    fixed_size_list_string = pa.array(
        [["a", "b"], ["c", "d"], ["e", "f"], ["g", "h"], ["i", "j"]],
        type=pa.list_(pa.string(), 2))

    # --- struct column (same nested value repeated on all rows) ---
    struct_field = pa.array([{
        "a": 1,
        "b": 2.5,
        "c": {
            "d": "e",
            "f": "g"
        },
        "h": [5, 6],
        "i": 3
    }] * 5)
    #struct_val = { "a": 5 }
    #for i in range(123):
    #    struct_val = { "a": struct_val }
    #struct_field = pa.array([struct_val] * 5)

    # --- map columns keyed by string; rows: 2 pairs, 1 pair, null, [], [] ---
    map_boolean = pa.array([[('x', None), ('y', True)], [('z', True)], None,
                            [], []], type=pa.map_(pa.string(), pa.bool_()))
    map_uint8 = pa.array([[('x', 1), ('y', None)], [('z', 3)], None, [], []],
                         type=pa.map_(pa.string(), pa.uint8()))
    map_int8 = pa.array([[('x', 1), ('y', None)], [('z', 3)], None, [], []],
                        type=pa.map_(pa.string(), pa.int8()))
    map_uint16 = pa.array([[('x', 1), ('y', None)], [('z', 3)], None, [], []],
                          type=pa.map_(pa.string(), pa.uint16()))
    map_int16 = pa.array([[('x', 1), ('y', None)], [('z', 3)], None, [], []],
                         type=pa.map_(pa.string(), pa.int16()))
    map_uint32 = pa.array([[('x', 4 * 1000 * 1000 * 1000), ('y', None)],
                           [('z', 3)], None, [], []],
                          type=pa.map_(pa.string(), pa.uint32()))
    map_int32 = pa.array([[('x', 2 * 1000 * 1000 * 1000), ('y', None)],
                          [('z', 3)], None, [], []],
                         type=pa.map_(pa.string(), pa.int32()))
    map_uint64 = pa.array([[('x', 4 * 1000 * 1000 * 1000 * 1000), ('y', None)],
                           [('z', 3)], None, [], []],
                          type=pa.map_(pa.string(), pa.uint64()))
    map_int64 = pa.array([[('x', -2 * 1000 * 1000 * 1000 * 1000), ('y', None)],
                          [('z', 3)], None, [], []],
                         type=pa.map_(pa.string(), pa.int64()))
    map_float32 = pa.array([[('x', 1.5), ('y', None)], [('z', 3)], None, [],
                            []], type=pa.map_(pa.string(), pa.float32()))
    map_float64 = pa.array([[('x', 1.5), ('y', None)], [('z', 3)], None, [],
                            []], type=pa.map_(pa.string(), pa.float64()))
    map_string = pa.array([[('x', 'x_val'), ('y', None)], [('z', 'z_val')],
                           None, [], []],
                          type=pa.map_(pa.string(), pa.string()))

    # --- dictionary-encoded column ---
    indices = pa.array([0, 1, 2, None, 2])
    dictionary = pa.array(['foo', 'bar', 'baz'])
    # NOTE(review): shadows the builtin `dict`; kept for column-name parity.
    dict = pa.DictionaryArray.from_arrays(indices, dictionary)

    # Built but excluded from `names` below.
    map_list = pa.array([[('x', []), ('y', [])], [('z', [])], None, [], []],
                        type=pa.map_(pa.string(), pa.list_(pa.uint32())))

    # --- WKB point geometries (little-endian POINT(i 2)), null at index 1 ---
    geometry = pa.array([
        None if i == 1 else
        (b'\x01\x01\x00\x00\x00' + struct.pack('<dd', i, 2))
        for i in range(5)
    ], type=pa.binary())

    # Column order of the output table; commented entries stay out on purpose.
    names = [
        "boolean", "uint8", "int8", "uint16", "int16", "uint32", "int32",
        "uint64", "int64", "float32", "float64", "string", "large_string",
        "timestamp_ms_gmt", "timestamp_ms_gmt_plus_2",
        "timestamp_ms_gmt_minus_0215", "timestamp_s_no_tz", "time32_s",
        "time32_ms", "time64_us", "time64_ns", "date32", "date64",
        # "duration_s",
        # "duration_ms",
        "binary", "large_binary", "fixed_size_binary", "decimal128",
        "decimal256", "list_boolean", "list_uint8", "list_int8",
        "list_uint16", "list_int16", "list_uint32", "list_int32",
        "list_uint64", "list_int64", "list_float32", "list_float64",
        "list_string", "fixed_size_list_boolean", "fixed_size_list_uint8",
        "fixed_size_list_int8", "fixed_size_list_uint16",
        "fixed_size_list_int16", "fixed_size_list_uint32",
        "fixed_size_list_int32", "fixed_size_list_uint64",
        "fixed_size_list_int64", "fixed_size_list_float32",
        "fixed_size_list_float64", "fixed_size_list_string", "struct_field",
        "map_boolean", "map_uint8", "map_int8", "map_uint16", "map_int16",
        "map_uint32", "map_int32", "map_uint64", "map_int64", "map_float32",
        "map_float64", "map_string",
        # "map_list",
        "dict", "geometry",
    ]
    # Assemble the table by looking each column up in this function's locals.
    locals_ = locals()
    table = pa.table([locals_[x] for x in names], names=names)

    # Attach GeoParquet-style "geo" metadata describing the geometry column.
    my_schema = table.schema.with_metadata({
        "geo": json.dumps({
            "version": "0.1.0",
            "primary_column": "geometry",
            "columns": {
                "geometry": {
                    'crs': wkt_epsg_4326,
                    'bbox': [0, 2, 4, 2],
                    'encoding': 'WKB'
                }
            }
        })
    })
    table = table.cast(my_schema)
    HERE = pathlib.Path(__file__).parent
    pq.write_table(table, HERE / "ogr/data/parquet/test.parquet",
                   compression='NONE', row_group_size=3)
] ), pa.struct( [ pa.field("a", pa.int32(), nullable=False), pa.field("b", pa.int8(), nullable=False), pa.field("c", pa.string()), ] ), pa.dictionary(pa.int8(), pa.string()), ] _unsupported_pyarrow_types = [ pa.decimal256(76, 38), pa.duration("s"), pa.map_(pa.string(), pa.int32()), pa.union( [pa.field("a", pa.binary(10)), pa.field("b", pa.string())], mode=pa.lib.UnionMode_DENSE, ), pa.union( [pa.field("a", pa.binary(10)), pa.field("b", pa.string())], mode=pa.lib.UnionMode_DENSE, type_codes=[4, 8], ), pa.union( [pa.field("a", pa.binary(10)), pa.field("b", pa.string())], mode=pa.lib.UnionMode_SPARSE, ), pa.union( [
class TestAbstractFileParserStatics:
    """Tests for AbstractFileParser's static JSON<->PyArrow type mappers."""

    @pytest.mark.parametrize(  # testing all datatypes as laid out here: https://json-schema.org/understanding-json-schema/reference/type.html
        "input_json_type, output_pyarrow_type",
        [
            ("string", pa.large_string()),
            ("number", pa.float64()),
            ("integer", pa.int64()),
            ("object", pa.large_string()),
            ("array", pa.large_string()),
            ("boolean", pa.bool_()),
            ("null", pa.large_string()),
        ],
    )
    def test_json_type_to_pyarrow_type(self, input_json_type: str, output_pyarrow_type: Any) -> None:
        # Json -> PyArrow direction
        LOGGER.info(f"asserting that JSON type '{input_json_type}' converts to PyArrow type '{output_pyarrow_type}'...")
        assert AbstractFileParser.json_type_to_pyarrow_type(input_json_type) == output_pyarrow_type

    @pytest.mark.parametrize(  # testing all datatypes as laid out here: https://arrow.apache.org/docs/python/api/datatypes.html
        "input_pyarrow_types, output_json_type",
        [
            ((pa.null(),), "string"),  # null type
            ((pa.bool_(),), "boolean"),  # boolean type
            (
                (pa.int8(), pa.int16(), pa.int32(), pa.int64(), pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()),
                "integer",
            ),  # integer types
            ((pa.float16(), pa.float32(), pa.float64(), pa.decimal128(5, 10), pa.decimal256(3, 8)), "number"),  # number types
            ((pa.time32("s"), pa.time64("ns"), pa.timestamp("ms"), pa.date32(), pa.date64()), "string"),  # temporal types
            ((pa.binary(), pa.large_binary()), "string"),  # binary types
            ((pa.string(), pa.utf8(), pa.large_string(), pa.large_utf8()), "string"),  # string types
            ((pa.list_(pa.string()), pa.large_list(pa.timestamp("us"))), "string"),  # array types
            ((pa.map_(pa.string(), pa.float32()), pa.dictionary(pa.int16(), pa.list_(pa.string()))), "string"),  # object types
        ],
    )
    def test_json_type_to_pyarrow_type_reverse(self, input_pyarrow_types: Tuple[Any], output_json_type: str) -> None:
        # PyArrow -> Json direction (reverse=True)
        for typ in input_pyarrow_types:
            LOGGER.info(f"asserting that PyArrow type '{typ}' converts to JSON type '{output_json_type}'...")
            assert AbstractFileParser.json_type_to_pyarrow_type(typ, reverse=True) == output_json_type

    @pytest.mark.parametrize(  # if expecting fail, put pyarrow_schema as None
        "json_schema, pyarrow_schema",
        [
            (
                {"a": "string", "b": "number", "c": "integer", "d": "object", "e": "array", "f": "boolean", "g": "null"},
                {
                    "a": pa.large_string(),
                    "b": pa.float64(),
                    "c": pa.int64(),
                    "d": pa.large_string(),
                    "e": pa.large_string(),
                    "f": pa.bool_(),
                    "g": pa.large_string(),
                },
            ),
            ({"single_column": "object"}, {"single_column": pa.large_string()}),
            ({}, {}),
            ({"a": "NOT A REAL TYPE", "b": "another fake type"}, {"a": pa.large_string(), "b": pa.large_string()}),
            (["string", "object"], None),  # bad input type
        ],
    )
    def test_json_schema_to_pyarrow_schema(self, json_schema: Mapping[str, Any], pyarrow_schema: Mapping[str, Any]) -> None:
        # Json -> PyArrow direction
        if pyarrow_schema is not None:
            assert AbstractFileParser.json_schema_to_pyarrow_schema(json_schema) == pyarrow_schema
        else:
            # Failure case: any exception is acceptable for a bad input type.
            with pytest.raises(Exception) as e_info:
                AbstractFileParser.json_schema_to_pyarrow_schema(json_schema)
                LOGGER.debug(str(e_info))

    @pytest.mark.parametrize(  # if expecting fail, put json_schema as None
        "pyarrow_schema, json_schema",
        [
            (
                {
                    "a": pa.utf8(),
                    "b": pa.float16(),
                    "c": pa.uint32(),
                    "d": pa.map_(pa.string(), pa.float32()),
                    "e": pa.bool_(),
                    "f": pa.date64(),
                },
                {"a": "string", "b": "number", "c": "integer", "d": "string", "e": "boolean", "f": "string"},
            ),
            ({"single_column": pa.int32()}, {"single_column": "integer"}),
            ({}, {}),
            ({"a": "NOT A REAL TYPE", "b": "another fake type"}, {"a": "string", "b": "string"}),
            (["string", "object"], None),  # bad input type
        ],
    )
    def test_json_schema_to_pyarrow_schema_reverse(self, pyarrow_schema: Mapping[str, Any], json_schema: Mapping[str, Any]) -> None:
        # PyArrow -> Json direction (reverse=True)
        if json_schema is not None:
            assert AbstractFileParser.json_schema_to_pyarrow_schema(pyarrow_schema, reverse=True) == json_schema
        else:
            # Failure case: any exception is acceptable for a bad input type.
            with pytest.raises(Exception) as e_info:
                AbstractFileParser.json_schema_to_pyarrow_schema(pyarrow_schema, reverse=True)
                LOGGER.debug(str(e_info))
"id": 3, "val": [("a", { "weight": 22.5, "temp": 33.1 }), ("b", { "weight": 33.6, "temp": 44.5 }), ("c", { "weight": 44.6, "temp": 55.5 })], "val2": [("vb", { "weight": 5, "temp": 10 })] }] df2 = pd.DataFrame(rows) mystruct = pa.struct( [pa.field("weight", pa.float32()), pa.field("temp", pa.float32())]) mymap = pa.map_(pa.string(), mystruct) schema = pa.schema([ pa.field('id', pa.int32()), pa.field('val', mymap), pa.field("val2", mymap) ]) print(schema) table = pa.Table.from_pandas(df2, schema) pq.write_table(table, 'test/data/nested.parquet')
def pyarrow_datatype_from_dict(json_dict: Dict[str, Any]) -> pyarrow.DataType:
    """
    Create a DataType in PyArrow format from a Schema json format.
    :param json_dict: the DataType in json format
    :return: the DataType in PyArrow format
    """
    type_class = json_dict["type"]["name"]
    if type_class == "dictionary":
        # NOTE(review): a top-level "dictionary" type is rendered as a map of
        # index type -> first child type.
        key_type = json_dict["dictionary"]["indexType"]
        value_type = json_dict["children"][0]
        key_type = pyarrow_datatype_from_dict(key_type)
        value_type = pyarrow_datatype_from_dict(value_type)
        return pyarrow.map_(key_type, value_type)
    elif "dictionary" in json_dict:
        # A nested "dictionary" key: synthesize key/value field dicts and
        # recurse. List values are wrapped as list<element: struct<...>>.
        key_type = {
            "name": "key",
            "type": json_dict["dictionary"]["indexType"],
            "nullable": json_dict["nullable"],
        }
        key = pyarrow_datatype_from_dict(key_type)
        if type_class == "list":
            value_type = {
                "name": "val",
                "type": json_dict["dictionary"]["indexType"],
                "nullable": json_dict["nullable"],
            }
            return pyarrow.map_(
                key,
                pyarrow.list_(
                    pyarrow.field(
                        "element",
                        pyarrow.struct([pyarrow_field_from_dict(value_type)]))),
            )
        value_type = {
            "name": "value",
            "type": json_dict["type"],
            "nullable": json_dict["nullable"],
        }
        return pyarrow.map_(key, pyarrow_datatype_from_dict(value_type))
    elif type_class == "list":
        field = json_dict["children"][0]
        element_type = pyarrow_datatype_from_dict(field)
        return pyarrow.list_(pyarrow.field("element", element_type))
    elif type_class == "struct":
        fields = [
            pyarrow_field_from_dict(field) for field in json_dict["children"]
        ]
        return pyarrow.struct(fields)
    elif type_class == "int":
        # e.g. "int" + bitWidth 32 -> alias "int32"
        return pyarrow.type_for_alias(
            f'{type_class}{json_dict["type"]["bitWidth"]}')
    elif type_class == "date":
        type_info = json_dict["type"]
        if type_info["unit"] == "DAY":
            return pyarrow.date32()
        else:
            return pyarrow.date64()
    elif type_class == "time":
        type_info = json_dict["type"]
        if type_info["unit"] == "MICROSECOND":
            unit = "us"
        elif type_info["unit"] == "NANOSECOND":
            unit = "ns"
        elif type_info["unit"] == "MILLISECOND":
            unit = "ms"
        else:
            unit = "s"
        return pyarrow.type_for_alias(
            f'{type_class}{type_info["bitWidth"]}[{unit}]')
    elif type_class == "timestamp":
        type_info = json_dict["type"]
        # NOTE(review): if "unit" is absent, `unit` is never bound and the
        # f-string below raises — presumably callers always supply it; verify.
        if "unit" in type_info:
            if type_info["unit"] == "MICROSECOND":
                unit = "us"
            elif type_info["unit"] == "NANOSECOND":
                unit = "ns"
            elif type_info["unit"] == "MILLISECOND":
                unit = "ms"
            elif type_info["unit"] == "SECOND":
                unit = "s"
            else:
                unit = "ns"
        return pyarrow.type_for_alias(f"{type_class}[{unit}]")
    elif type_class.startswith("decimal"):
        type_info = json_dict["type"]
        return pyarrow.decimal128(precision=type_info["precision"],
                                  scale=type_info["scale"])
    elif type_class.startswith("floatingpoint"):
        type_info = json_dict["type"]
        if type_info["precision"] == "HALF":
            return pyarrow.float16()
        elif type_info["precision"] == "SINGLE":
            return pyarrow.float32()
        elif type_info["precision"] == "DOUBLE":
            return pyarrow.float64()
    else:
        # Anything else resolves by its alias name (e.g. "bool", "utf8").
        return pyarrow.type_for_alias(type_class)
datetime(2018, 1, 5), datetime(2018, 1, 7), datetime(2018, 1, 9)], 'five': [date(2018, 1, 1), date(2018, 1, 3), date(2018, 1, 5), date(2018, 1, 7), date(2018, 1, 9)], 'six': [True, False, True, False, True]}) table3 = pa.Table.from_pandas(df3) with pq.ParquetWriter('simple/example2.parquet', table3.schema) as writer: writer.write_table(table3) # example3.parquet file mdt1 = pa.map_(pa.int32(), pa.string()) mdt2 = pa.map_(pa.date32(), pa.int16()) df = pd.DataFrame({ 'one': pd.Series([ [(1, 'foo'), (2, 'bar'), (3, 'baz')], [(4, 'test1'), (5,'test2')], ]), 'two': pd.Series([ [(date(2018, 1, 1), 10), (date(2018, 1, 2), 15)], [(date(2018, 1, 3), 20), (date(2018, 1, 4), 25)], ]), 'three': pd.Series([1, 2]), } ) schema = pa.schema([
pa.LargeBinaryValue), (b"abc", pa.binary(3), pa.FixedSizeBinaryScalar, pa.FixedSizeBinaryValue), ([1, 2, 3], None, pa.ListScalar, pa.ListValue), ([1, 2, 3, 4], pa.large_list(pa.int8()), pa.LargeListScalar, pa.LargeListValue), ([1, 2, 3, 4, 5], pa.list_(pa.int8(), 5), pa.FixedSizeListScalar, pa.FixedSizeListValue), (datetime.date.today(), None, pa.Date32Scalar, pa.Date32Value), (datetime.date.today(), pa.date64(), pa.Date64Scalar, pa.Date64Value), (datetime.datetime.now(), None, pa.TimestampScalar, pa.TimestampValue), (datetime.datetime.now().time().replace(microsecond=0), pa.time32('s'), pa.Time32Scalar, pa.Time32Value), (datetime.datetime.now().time(), None, pa.Time64Scalar, pa.Time64Value), (datetime.timedelta(days=1), None, pa.DurationScalar, pa.DurationValue), ({'a': 1, 'b': [1, 2]}, None, pa.StructScalar, pa.StructValue), ([('a', 1), ('b', 2)], pa.map_(pa.string(), pa.int8()), pa.MapScalar, pa.MapValue), ]) def test_basics(value, ty, klass, deprecated): s = pa.scalar(value, type=ty) assert isinstance(s, klass) assert s.as_py() == value assert s == pa.scalar(value, type=ty) assert s != value assert s != "else" assert hash(s) == hash(s) assert s.is_valid is True with pytest.warns(FutureWarning): assert isinstance(s, deprecated) s = pa.scalar(None, type=s.type)