def orc_type(field):
    """Translate a pyarrow type into the matching pyorc type object.

    ``field`` is handed to the ``pa.types.is_*`` predicates; for decimal
    types its ``precision`` and ``scale`` are read, and for list types its
    ``value_type`` is converted recursively.

    Raises:
        ValueError: if none of the supported predicates matches.
    """
    # Ordered (predicate, builder) pairs; the first matching predicate wins.
    conversions = (
        (pa.types.is_boolean, lambda t: pyorc.Boolean()),
        (pa.types.is_int8, lambda t: pyorc.TinyInt()),
        (pa.types.is_int16, lambda t: pyorc.SmallInt()),
        (pa.types.is_int32, lambda t: pyorc.Int()),
        (pa.types.is_int64, lambda t: pyorc.BigInt()),
        (pa.types.is_float32, lambda t: pyorc.Float()),
        (pa.types.is_float64, lambda t: pyorc.Double()),
        (pa.types.is_decimal, lambda t: pyorc.Decimal(t.precision, t.scale)),
        # Lists nest, so the element type goes back through orc_type.
        (pa.types.is_list, lambda t: pyorc.Array(orc_type(t.value_type))),
        (pa.types.is_timestamp, lambda t: pyorc.Timestamp()),
        (pa.types.is_date, lambda t: pyorc.Date()),
        (pa.types.is_binary, lambda t: pyorc.Binary()),
        (pa.types.is_string, lambda t: pyorc.String()),
    )
    for matches, build in conversions:
        if matches(field):
            return build(field)
    raise ValueError('Cannot Convert %s' % field)
def generate_list_struct_buff(size=28000):
    """Write ``size`` rows of nested list/struct data to an in-memory ORC file.

    Both random sources are seeded with 1 (``random.Random(1)`` and
    ``np.random.seed(1)``), so the generated rows are reproducible. The
    order in which values are drawn below is significant: reordering the
    column construction would change the generated data.

    Columns written:
        lvl3_list          list<list<list<bigint>>>, nullable at the top
        lvl1_list          list<bigint>
        lvl1_struct        struct<a:bigint, b:bigint>, nullable
        lvl2_struct        struct<a:bigint, lvl1_struct:struct<c, d>>
        list_nests_struct  list<list<struct<a:bigint, b:bigint>>>
        struct_nests_list  struct<struct:struct<a, b>, list:list<bigint>>

    Returns:
        The ``BytesIO`` buffer the ORC writer wrote into (no seek is
        performed here after the writer is closed).
    """
    rng = random.Random(1)
    np.random.seed(seed=1)
    buffer = BytesIO()

    def pair_struct(first, second):
        # Two-field struct of bigints with the given field names.
        return po.Struct(**{first: po.BigInt(), second: po.BigInt()})

    orc_schema = po.Struct(
        lvl3_list=po.Array(po.Array(po.Array(po.BigInt()))),
        lvl1_list=po.Array(po.BigInt()),
        lvl1_struct=pair_struct("a", "b"),
        lvl2_struct=po.Struct(
            a=po.BigInt(),
            lvl1_struct=pair_struct("c", "d"),
        ),
        list_nests_struct=po.Array(po.Array(pair_struct("a", "b"))),
        struct_nests_list=po.Struct(
            struct=pair_struct("a", "b"),
            list=po.Array(po.BigInt()),
        ),
    )

    def nullable(value):
        # ``value`` is fully built before the coin flip, so the numpy
        # stream is consumed even when None ends up being chosen.
        return rng.choice([None, value])

    def innermost_list():
        return [
            nullable(np.random.randint(1, 3))
            for _ in range(np.random.randint(1, 3))
        ]

    def middle_list():
        return [innermost_list() for _ in range(np.random.randint(0, 3))]

    def outer_list():
        return [middle_list() for _ in range(np.random.randint(0, 3))]

    lvl3_list = [nullable(outer_list()) for _ in range(size)]

    lvl1_list = [
        [
            nullable(np.random.randint(0, 3))
            for _ in range(np.random.randint(1, 4))
        ]
        for _ in range(size)
    ]

    lvl1_struct = [
        nullable((np.random.randint(0, 3), np.random.randint(0, 3)))
        for _ in range(size)
    ]

    lvl2_struct = [
        nullable(
            (
                nullable(np.random.randint(0, 3)),
                (
                    nullable(np.random.randint(0, 3)),
                    np.random.randint(0, 3),
                ),
            )
        )
        for _ in range(size)
    ]

    # Inner structs are sampled from the already generated lvl1_struct rows.
    list_nests_struct = [
        [
            [rng.choice(lvl1_struct), rng.choice(lvl1_struct)]
            for _ in range(np.random.randint(1, 4))
        ]
        for _ in range(size)
    ]

    struct_nests_list = list(zip(lvl1_struct, lvl1_list))

    columns = {
        "lvl3_list": lvl3_list,
        "lvl1_list": lvl1_list,
        "lvl1_struct": lvl1_struct,
        "lvl2_struct": lvl2_struct,
        "list_nests_struct": list_nests_struct,
        "struct_nests_list": struct_nests_list,
    }
    frame = pd.DataFrame(columns)

    # A row whose first cell is pd.NA is replaced by a single-None tuple.
    rows = [
        (None,) if row[0] is pd.NA else row
        for row in frame.itertuples(index=False, name=None)
    ]

    orc_writer = po.Writer(buffer, orc_schema, stripe_size=1024)
    orc_writer.writerows(rows)
    orc_writer.close()
    return buffer
def gen_map_buff(size=10000):
    """Write ``size`` rows of map-typed data to an in-memory ORC file.

    Both random sources are seeded with 1 (``random.Random(1)`` and
    ``np.random.seed(1)``), so the generated rows are reproducible. The
    order in which values are drawn below is significant: reordering the
    column construction would change the generated data.

    Columns written (every map has string keys):
        lvl1_map         map<string, bigint>
        lvl2_map         map<string, list<bigint>>
        lvl2_struct_map  map<string, struct<a:bigint, b:bigint>>

    Each cell is either None or a list of two ``(key, value)`` pairs whose
    key is a single ASCII letter. The file is written uncompressed.

    Returns:
        The ``BytesIO`` buffer the ORC writer wrote into (no seek is
        performed here after the writer is closed).
    """
    from string import ascii_letters as al

    rng = random.Random(1)
    np.random.seed(seed=1)
    buffer = BytesIO()

    orc_schema = po.Struct(
        lvl1_map=po.Map(key=po.String(), value=po.BigInt()),
        lvl2_map=po.Map(key=po.String(), value=po.Array(po.BigInt())),
        lvl2_struct_map=po.Map(
            key=po.String(),
            value=po.Struct(a=po.BigInt(), b=po.BigInt()),
        ),
    )

    def nullable(value):
        # ``value`` is fully built before the coin flip, so the numpy
        # stream is consumed even when None ends up being chosen.
        return rng.choice([None, value])

    def nullable_int():
        return nullable(np.random.randint(1, 1500))

    def nullable_int_list():
        return nullable([nullable_int() for _ in range(5)])

    def nullable_int_pair():
        return nullable((nullable_int(), nullable_int()))

    def nullable_map(make_value):
        # The key is drawn before the value for each of the two entries.
        entries = [(rng.choice(al), make_value()) for _ in range(2)]
        return nullable(entries)

    lvl1_map = [nullable_map(nullable_int) for _ in range(size)]
    lvl2_map = [nullable_map(nullable_int_list) for _ in range(size)]
    lvl2_struct_map = [nullable_map(nullable_int_pair) for _ in range(size)]

    columns = {
        "lvl1_map": lvl1_map,
        "lvl2_map": lvl2_map,
        "lvl2_struct_map": lvl2_struct_map,
    }
    frame = pd.DataFrame(columns)

    # A row whose first cell is pd.NA is replaced by a single-None tuple.
    rows = [
        (None, ) if row[0] is pd.NA else row
        for row in frame.itertuples(index=False, name=None)
    ]

    orc_writer = po.Writer(
        buffer,
        orc_schema,
        stripe_size=1024,
        compression=po.CompressionKind.NONE,
    )
    orc_writer.writerows(rows)
    orc_writer.close()
    return buffer