def orc_type(field):
    """Translate a pyarrow type into the equivalent pyorc type object.

    Parameters
    ----------
    field
        Object accepted by the ``pa.types.is_*`` predicates. Decimal inputs
        must expose ``precision`` and ``scale``; list inputs must expose
        ``value_type``, which is converted recursively.

    Returns
    -------
    A freshly constructed pyorc type instance.

    Raises
    ------
    ValueError
        If none of the supported type predicates matches ``field``.
    """
    types = pa.types

    # Boolean and integer widths.
    if types.is_boolean(field):
        return pyorc.Boolean()
    if types.is_int8(field):
        return pyorc.TinyInt()
    if types.is_int16(field):
        return pyorc.SmallInt()
    if types.is_int32(field):
        return pyorc.Int()
    if types.is_int64(field):
        return pyorc.BigInt()

    # Floating point and fixed-point numbers.
    if types.is_float32(field):
        return pyorc.Float()
    if types.is_float64(field):
        return pyorc.Double()
    if types.is_decimal(field):
        return pyorc.Decimal(field.precision, field.scale)

    # Lists recurse on their element type.
    if types.is_list(field):
        element_type = orc_type(field.value_type)
        return pyorc.Array(element_type)

    # Temporal types.
    if types.is_timestamp(field):
        return pyorc.Timestamp()
    if types.is_date(field):
        return pyorc.Date()

    # Byte and character sequences.
    if types.is_binary(field):
        return pyorc.Binary()
    if types.is_string(field):
        return pyorc.String()

    raise ValueError('Cannot Convert %s' % field)
def test_empty_statistics():
    """Verify the ORC statistics reported for columns holding only nulls.

    A single row is written with pyorc in which every column except the
    last one is null. The statistics returned by
    ``cudf.io.orc.read_orc_statistics`` are then checked entry by entry.
    """
    orc_schema = po.Struct(
        a=po.BigInt(),
        b=po.Double(),
        c=po.String(),
        d=po.Decimal(11, 2),
        e=po.Date(),
        f=po.Timestamp(),
        g=po.Boolean(),
        h=po.Binary(),
        # One column with non null value, else cudf/pyorc readers crash
        i=po.BigInt(),
    )

    # One row: null everywhere except the trailing column, which holds 1.
    null_columns = len(orc_schema.fields) - 1
    row = (None,) * null_columns + (1,)

    buff = BytesIO()
    with po.Writer(buff, orc_schema) as writer:
        writer.write(row)

    # Check for both file and stripe stats
    for stats in cudf.io.orc.read_orc_statistics([buff]):
        first = stats[0]

        # Similar expected stats for the first 6 columns in this case
        for col_name in ascii_lowercase[:6]:
            col_stats = first[col_name]
            assert col_stats.get("number_of_values") == 0
            assert col_stats.get("has_null") is True
            assert col_stats.get("minimum") is None
            assert col_stats.get("maximum") is None

        for col_name in ascii_lowercase[:3]:
            assert first[col_name].get("sum") == 0

        # Sum for decimal column is a string
        assert first["d"].get("sum") == "0"

        bool_stats = first["g"]
        assert bool_stats.get("number_of_values") == 0
        assert bool_stats.get("has_null") is True
        assert bool_stats.get("true_count") == 0
        assert bool_stats.get("false_count") == 0

        binary_stats = first["h"]
        assert binary_stats.get("number_of_values") == 0
        assert binary_stats.get("has_null") is True
        assert binary_stats.get("sum") == 0

        # The only populated column: a single value equal to 1.
        filled_stats = first["i"]
        assert filled_stats.get("number_of_values") == 1
        assert filled_stats.get("has_null") is False
        assert filled_stats.get("minimum") == 1
        assert filled_stats.get("maximum") == 1
        assert filled_stats.get("sum") == 1
cudf.dtype("<M8[ms]"): {"type": "long", "logicalType": "timestamp-millis"}, cudf.dtype("<M8[us]"): {"type": "long", "logicalType": "timestamp-micros"}, } PANDAS_TO_ORC_TYPES = { cudf.dtype("int8"): pyorc.TinyInt(), pd.Int8Dtype(): pyorc.TinyInt(), pd.Int16Dtype(): pyorc.SmallInt(), pd.Int32Dtype(): pyorc.Int(), pd.Int64Dtype(): pyorc.BigInt(), pd.BooleanDtype(): pyorc.Boolean(), cudf.dtype("bool_"): pyorc.Boolean(), cudf.dtype("int16"): pyorc.SmallInt(), cudf.dtype("int32"): pyorc.Int(), cudf.dtype("int64"): pyorc.BigInt(), cudf.dtype("O"): pyorc.String(), pd.StringDtype(): pyorc.String(), cudf.dtype("float32"): pyorc.Float(), cudf.dtype("float64"): pyorc.Double(), cudf.dtype("<M8[ns]"): pyorc.Timestamp(), cudf.dtype("<M8[ms]"): pyorc.Timestamp(), cudf.dtype("<M8[us]"): pyorc.Timestamp(), } ORC_TO_PANDAS_TYPES = { pyorc.TinyInt().name: pd.Int8Dtype(), pyorc.Int().name: pd.Int32Dtype(), pyorc.Boolean().name: pd.BooleanDtype(), pyorc.SmallInt().name: pd.Int16Dtype(), pyorc.BigInt().name: pd.Int64Dtype(), pyorc.String().name: cudf.dtype("O"),
"logicalType": "timestamp-micros" }, } PANDAS_TO_ORC_TYPES = { np.dtype("int8"): pyorc.TinyInt(), pd.Int8Dtype(): pyorc.TinyInt(), pd.Int16Dtype(): pyorc.SmallInt(), pd.Int32Dtype(): pyorc.Int(), pd.Int64Dtype(): pyorc.BigInt(), pd.BooleanDtype(): pyorc.Boolean(), np.dtype("bool_"): pyorc.Boolean(), np.dtype("int16"): pyorc.SmallInt(), np.dtype("int32"): pyorc.Int(), np.dtype("int64"): pyorc.BigInt(), np.dtype("O"): pyorc.String(), pd.StringDtype(): pyorc.String(), np.dtype("float32"): pyorc.Float(), np.dtype("float64"): pyorc.Double(), np.dtype("<M8[ns]"): pyorc.Timestamp(), np.dtype("<M8[ms]"): pyorc.Timestamp(), np.dtype("<M8[us]"): pyorc.Timestamp(), } def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None): obj._current_params = {} num_rows = obj._rand(obj._max_rows) num_cols = obj._rand(obj._max_columns) dtypes_meta = []
def gen_map_buff(size=10000):
    """Build an in-memory ORC file whose columns are randomly filled maps.

    Three map columns keyed by a single ASCII letter are generated:

    * ``lvl1_map``: values are big integers,
    * ``lvl2_map``: values are arrays of big integers,
    * ``lvl2_struct_map``: values are structs with fields ``a`` and ``b``.

    Every row, every map value and every integer may independently be
    ``None``. Both random sources are seeded with 1, so repeated calls with
    the same ``size`` generate the same data.

    Parameters
    ----------
    size : int, default 10000
        Number of rows to generate.

    Returns
    -------
    io.BytesIO
        Buffer holding the uncompressed ORC data written by pyorc.
    """
    from string import ascii_letters as al

    rd = random.Random(1)
    np.random.seed(seed=1)

    # NOTE: the helpers below keep the exact order in which the two random
    # generators are consumed (the candidate value is always built before
    # the None/value choice is drawn), so the generated data is unchanged.
    def _optional(value):
        # Pick either None or the already generated value.
        return rd.choice([None, value])

    def _rand_int():
        # Optional integer in [1, 1500).
        return _optional(np.random.randint(1, 1500))

    def _rand_int_list():
        # Optional list of five optional integers.
        return _optional([_rand_int() for _ in range(5)])

    def _rand_int_pair():
        # Optional (a, b) pair of optional integers, i.e. a struct value.
        return _optional((_rand_int(), _rand_int()))

    def _rand_map(make_value):
        # Optional map written as two (letter key, value) pairs.
        return _optional([(rd.choice(al), make_value()) for _ in range(2)])

    schema = po.Struct(
        lvl1_map=po.Map(key=po.String(), value=po.BigInt()),
        lvl2_map=po.Map(key=po.String(), value=po.Array(po.BigInt())),
        lvl2_struct_map=po.Map(
            key=po.String(),
            value=po.Struct(a=po.BigInt(), b=po.BigInt()),
        ),
    )

    # Columns are generated one after the other (not row by row) so the
    # random streams are consumed in the same sequence as before.
    lvl1_map = [_rand_map(_rand_int) for _ in range(size)]
    lvl2_map = [_rand_map(_rand_int_list) for _ in range(size)]
    lvl2_struct_map = [_rand_map(_rand_int_pair) for _ in range(size)]

    pdf = pd.DataFrame({
        "lvl1_map": lvl1_map,
        "lvl2_map": lvl2_map,
        "lvl2_struct_map": lvl2_struct_map,
    })

    # Rows whose first column came back from pandas as pd.NA are replaced
    # by a one-element (None,) tuple; every other row is written as is.
    rows = [
        (None, ) if row[0] is pd.NA else row
        for row in pdf.itertuples(index=False, name=None)
    ]

    buff = BytesIO()
    # The context manager closes the writer even if writing a row fails.
    with po.Writer(
        buff,
        schema,
        stripe_size=1024,
        compression=po.CompressionKind.NONE,
    ) as writer:
        writer.writerows(rows)

    return buff