def orc_type(field):
    """Return the pyorc type object that corresponds to a pyarrow type.

    ``field`` is handed directly to the ``pa.types.is_*`` predicates, and for
    decimal and list inputs its ``precision``/``scale`` or ``value_type``
    attributes are read (presumably a ``pyarrow.DataType`` despite the
    parameter name -- confirm against callers).  List types are converted
    recursively through their ``value_type``.

    Raises:
        ValueError: if none of the supported predicates accepts ``field``.
    """
    arrow_checks = pa.types

    # Ordered (predicate, builder) pairs.  They are tried top to bottom and
    # the first predicate that accepts ``field`` decides the result.  The
    # builders are lambdas so that nothing from pyorc is touched until a
    # predicate has actually matched.
    conversions = (
        (arrow_checks.is_boolean, lambda arrow_type: pyorc.Boolean()),
        (arrow_checks.is_int8, lambda arrow_type: pyorc.TinyInt()),
        (arrow_checks.is_int16, lambda arrow_type: pyorc.SmallInt()),
        (arrow_checks.is_int32, lambda arrow_type: pyorc.Int()),
        (arrow_checks.is_int64, lambda arrow_type: pyorc.BigInt()),
        (arrow_checks.is_float32, lambda arrow_type: pyorc.Float()),
        (arrow_checks.is_float64, lambda arrow_type: pyorc.Double()),
        (
            arrow_checks.is_decimal,
            lambda arrow_type: pyorc.Decimal(
                arrow_type.precision, arrow_type.scale
            ),
        ),
        (
            arrow_checks.is_list,
            # Element type is converted with the same rules, recursively.
            lambda arrow_type: pyorc.Array(orc_type(arrow_type.value_type)),
        ),
        (arrow_checks.is_timestamp, lambda arrow_type: pyorc.Timestamp()),
        (arrow_checks.is_date, lambda arrow_type: pyorc.Date()),
        (arrow_checks.is_binary, lambda arrow_type: pyorc.Binary()),
        (arrow_checks.is_string, lambda arrow_type: pyorc.String()),
    )

    for accepts, build in conversions:
        if accepts(field):
            return build(field)

    raise ValueError('Cannot Convert %s' % field)
# NOTE(review): the statements below are collapsed onto single physical lines,
# and the leading "}" on the next line closes a literal whose opening is not
# visible in this part of the file -- confirm against the original layout.
#
# PANDAS_TO_ORC_TYPES: lookup table keyed by dtype objects (cudf.dtype(...)
# results and pandas extension dtypes such as pd.Int8Dtype()) whose values
# are pyorc type instances.  The three datetime64 resolutions listed
# ("ns", "ms", "us") all map to pyorc.Timestamp().
# ORC_TO_PANDAS_TYPES (opened at the end of the same line) is keyed by the
# `.name` of a pyorc type instance; integer and boolean entries hold pandas
# extension dtypes, string/float/double entries hold cudf.dtype(...) values.
# No Timestamp entry is visible in this table -- TODO confirm that is intended.
} PANDAS_TO_ORC_TYPES = { cudf.dtype("int8"): pyorc.TinyInt(), pd.Int8Dtype(): pyorc.TinyInt(), pd.Int16Dtype(): pyorc.SmallInt(), pd.Int32Dtype(): pyorc.Int(), pd.Int64Dtype(): pyorc.BigInt(), pd.BooleanDtype(): pyorc.Boolean(), cudf.dtype("bool_"): pyorc.Boolean(), cudf.dtype("int16"): pyorc.SmallInt(), cudf.dtype("int32"): pyorc.Int(), cudf.dtype("int64"): pyorc.BigInt(), cudf.dtype("O"): pyorc.String(), pd.StringDtype(): pyorc.String(), cudf.dtype("float32"): pyorc.Float(), cudf.dtype("float64"): pyorc.Double(), cudf.dtype("<M8[ns]"): pyorc.Timestamp(), cudf.dtype("<M8[ms]"): pyorc.Timestamp(), cudf.dtype("<M8[us]"): pyorc.Timestamp(), } ORC_TO_PANDAS_TYPES = { pyorc.TinyInt().name: pd.Int8Dtype(), pyorc.Int().name: pd.Int32Dtype(), pyorc.Boolean().name: pd.BooleanDtype(), pyorc.SmallInt().name: pd.Int16Dtype(), pyorc.BigInt().name: pd.Int64Dtype(), pyorc.String().name: cudf.dtype("O"), pyorc.Float().name: cudf.dtype("float32"), pyorc.Double().name: cudf.dtype("float64"),
# The "}" that starts the next line closes the ORC_TO_PANDAS_TYPES literal
# opened just above.  The rest of that line binds both tables again, this
# time using np.dtype(...) wherever the previous line used cudf.dtype(...).
# NOTE(review): if these lines run in order, the rebinding replaces the
# cudf.dtype-based tables -- presumably these are alternative revisions of
# the same definitions; confirm which one is meant to remain.
} PANDAS_TO_ORC_TYPES = { np.dtype("int8"): pyorc.TinyInt(), pd.Int8Dtype(): pyorc.TinyInt(), pd.Int16Dtype(): pyorc.SmallInt(), pd.Int32Dtype(): pyorc.Int(), pd.Int64Dtype(): pyorc.BigInt(), pd.BooleanDtype(): pyorc.Boolean(), np.dtype("bool_"): pyorc.Boolean(), np.dtype("int16"): pyorc.SmallInt(), np.dtype("int32"): pyorc.Int(), np.dtype("int64"): pyorc.BigInt(), np.dtype("O"): pyorc.String(), pd.StringDtype(): pyorc.String(), np.dtype("float32"): pyorc.Float(), np.dtype("float64"): pyorc.Double(), np.dtype("<M8[ns]"): pyorc.Timestamp(), np.dtype("<M8[ms]"): pyorc.Timestamp(), np.dtype("<M8[us]"): pyorc.Timestamp(), } ORC_TO_PANDAS_TYPES = { pyorc.TinyInt().name: pd.Int8Dtype(), pyorc.Int().name: pd.Int32Dtype(), pyorc.Boolean().name: pd.BooleanDtype(), pyorc.SmallInt().name: pd.Int16Dtype(), pyorc.BigInt().name: pd.Int64Dtype(), pyorc.String().name: np.dtype("O"), pyorc.Float().name: np.dtype("float32"), pyorc.Double().name: np.dtype("float64"),
# The "}" that starts the next line closes the second ORC_TO_PANDAS_TYPES
# literal.  PANDAS_TO_ORC_TYPES is then bound once more with the same
# np.dtype-keyed entries as on the previous line.
# _generate_rand_meta begins on that line but continues past the visible
# text: the part shown resets obj._current_params to an empty dict, draws
# num_rows and num_cols via obj._rand(obj._max_rows) and
# obj._rand(obj._max_columns), creates an empty dtypes_meta list and opens a
# loop over range(num_cols) whose body is not visible here.
} PANDAS_TO_ORC_TYPES = { np.dtype("int8"): pyorc.TinyInt(), pd.Int8Dtype(): pyorc.TinyInt(), pd.Int16Dtype(): pyorc.SmallInt(), pd.Int32Dtype(): pyorc.Int(), pd.Int64Dtype(): pyorc.BigInt(), pd.BooleanDtype(): pyorc.Boolean(), np.dtype("bool_"): pyorc.Boolean(), np.dtype("int16"): pyorc.SmallInt(), np.dtype("int32"): pyorc.Int(), np.dtype("int64"): pyorc.BigInt(), np.dtype("O"): pyorc.String(), pd.StringDtype(): pyorc.String(), np.dtype("float32"): pyorc.Float(), np.dtype("float64"): pyorc.Double(), np.dtype("<M8[ns]"): pyorc.Timestamp(), np.dtype("<M8[ms]"): pyorc.Timestamp(), np.dtype("<M8[us]"): pyorc.Timestamp(), } def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None): obj._current_params = {} num_rows = obj._rand(obj._max_rows) num_cols = obj._rand(obj._max_columns) dtypes_meta = [] for _ in range(num_cols):