Example #1
0
def orc_type(field):
    """Return the pyorc type object corresponding to an Arrow data type.

    ``field`` is handed to the ``pa.types.is_*`` predicates (``pa`` is
    presumably pyarrow -- the import is not visible here) in a fixed
    order, and the first predicate that accepts it decides the result.

    Decimal types carry their ``precision`` and ``scale`` over to the
    pyorc type; list types are converted recursively from their
    ``value_type``.

    Raises:
        ValueError: if none of the supported predicates accepts ``field``.
    """
    # Ordered (predicate, builder) pairs. Every builder receives the Arrow
    # type so that parameterised ORC types can read what they need from it.
    converters = (
        (pa.types.is_boolean, lambda _: pyorc.Boolean()),
        (pa.types.is_int8, lambda _: pyorc.TinyInt()),
        (pa.types.is_int16, lambda _: pyorc.SmallInt()),
        (pa.types.is_int32, lambda _: pyorc.Int()),
        (pa.types.is_int64, lambda _: pyorc.BigInt()),
        (pa.types.is_float32, lambda _: pyorc.Float()),
        (pa.types.is_float64, lambda _: pyorc.Double()),
        (
            pa.types.is_decimal,
            lambda dec: pyorc.Decimal(dec.precision, dec.scale),
        ),
        # Lists recurse on the element type.
        (pa.types.is_list, lambda lst: pyorc.Array(orc_type(lst.value_type))),
        (pa.types.is_timestamp, lambda _: pyorc.Timestamp()),
        (pa.types.is_date, lambda _: pyorc.Date()),
        (pa.types.is_binary, lambda _: pyorc.Binary()),
        (pa.types.is_string, lambda _: pyorc.String()),
    )

    for accepts, build in converters:
        if accepts(field):
            return build(field)

    raise ValueError('Cannot Convert %s' % field)
Example #2
0
}

# Lookup table from a column dtype to the pyorc type object used for it.
# Keys are ``cudf.dtype(...)`` results and pandas extension dtypes; each
# value is a separately constructed pyorc type instance.
PANDAS_TO_ORC_TYPES = {
    # Integer and boolean dtypes: the ``cudf.dtype`` key and the pandas
    # extension dtype of the same width map to the same ORC type.
    cudf.dtype("int8"): pyorc.TinyInt(),
    pd.Int8Dtype(): pyorc.TinyInt(),
    pd.Int16Dtype(): pyorc.SmallInt(),
    pd.Int32Dtype(): pyorc.Int(),
    pd.Int64Dtype(): pyorc.BigInt(),
    pd.BooleanDtype(): pyorc.Boolean(),
    cudf.dtype("bool_"): pyorc.Boolean(),
    cudf.dtype("int16"): pyorc.SmallInt(),
    cudf.dtype("int32"): pyorc.Int(),
    cudf.dtype("int64"): pyorc.BigInt(),
    # Object-dtype columns and pandas' string dtype are both written as
    # ORC strings.
    cudf.dtype("O"): pyorc.String(),
    pd.StringDtype(): pyorc.String(),
    cudf.dtype("float32"): pyorc.Float(),
    cudf.dtype("float64"): pyorc.Double(),
    # The ns, ms and us datetime64 units all map to the one ORC
    # Timestamp type.
    cudf.dtype("<M8[ns]"): pyorc.Timestamp(),
    cudf.dtype("<M8[ms]"): pyorc.Timestamp(),
    cudf.dtype("<M8[us]"): pyorc.Timestamp(),
}

ORC_TO_PANDAS_TYPES = {
    pyorc.TinyInt().name: pd.Int8Dtype(),
    pyorc.Int().name: pd.Int32Dtype(),
    pyorc.Boolean().name: pd.BooleanDtype(),
    pyorc.SmallInt().name: pd.Int16Dtype(),
    pyorc.BigInt().name: pd.Int64Dtype(),
    pyorc.String().name: cudf.dtype("O"),
    pyorc.Float().name: cudf.dtype("float32"),
    pyorc.Double().name: cudf.dtype("float64"),
Example #3
0
}

# Lookup table from a column dtype to the pyorc type object used for it.
# Keys are ``np.dtype(...)`` objects and pandas extension dtypes; each
# value is a separately constructed pyorc type instance.
PANDAS_TO_ORC_TYPES = {
    # Integer and boolean dtypes: the ``np.dtype`` key and the pandas
    # extension dtype of the same width map to the same ORC type.
    np.dtype("int8"): pyorc.TinyInt(),
    pd.Int8Dtype(): pyorc.TinyInt(),
    pd.Int16Dtype(): pyorc.SmallInt(),
    pd.Int32Dtype(): pyorc.Int(),
    pd.Int64Dtype(): pyorc.BigInt(),
    pd.BooleanDtype(): pyorc.Boolean(),
    np.dtype("bool_"): pyorc.Boolean(),
    np.dtype("int16"): pyorc.SmallInt(),
    np.dtype("int32"): pyorc.Int(),
    np.dtype("int64"): pyorc.BigInt(),
    # Object-dtype columns and pandas' string dtype are both written as
    # ORC strings.
    np.dtype("O"): pyorc.String(),
    pd.StringDtype(): pyorc.String(),
    np.dtype("float32"): pyorc.Float(),
    np.dtype("float64"): pyorc.Double(),
    # The ns, ms and us datetime64 units all map to the one ORC
    # Timestamp type.
    np.dtype("<M8[ns]"): pyorc.Timestamp(),
    np.dtype("<M8[ms]"): pyorc.Timestamp(),
    np.dtype("<M8[us]"): pyorc.Timestamp(),
}

ORC_TO_PANDAS_TYPES = {
    pyorc.TinyInt().name: pd.Int8Dtype(),
    pyorc.Int().name: pd.Int32Dtype(),
    pyorc.Boolean().name: pd.BooleanDtype(),
    pyorc.SmallInt().name: pd.Int16Dtype(),
    pyorc.BigInt().name: pd.Int64Dtype(),
    pyorc.String().name: np.dtype("O"),
    pyorc.Float().name: np.dtype("float32"),
    pyorc.Double().name: np.dtype("float64"),
Example #4
0
}

# Lookup table from a column dtype to the pyorc type object used for it.
# Keys are ``np.dtype(...)`` objects and pandas extension dtypes; each
# value is a separately constructed pyorc type instance.
PANDAS_TO_ORC_TYPES = {
    # Integer and boolean dtypes: the ``np.dtype`` key and the pandas
    # extension dtype of the same width map to the same ORC type.
    np.dtype("int8"): pyorc.TinyInt(),
    pd.Int8Dtype(): pyorc.TinyInt(),
    pd.Int16Dtype(): pyorc.SmallInt(),
    pd.Int32Dtype(): pyorc.Int(),
    pd.Int64Dtype(): pyorc.BigInt(),
    pd.BooleanDtype(): pyorc.Boolean(),
    np.dtype("bool_"): pyorc.Boolean(),
    np.dtype("int16"): pyorc.SmallInt(),
    np.dtype("int32"): pyorc.Int(),
    np.dtype("int64"): pyorc.BigInt(),
    # Object-dtype columns and pandas' string dtype are both written as
    # ORC strings.
    np.dtype("O"): pyorc.String(),
    pd.StringDtype(): pyorc.String(),
    np.dtype("float32"): pyorc.Float(),
    np.dtype("float64"): pyorc.Double(),
    # The ns, ms and us datetime64 units all map to the one ORC
    # Timestamp type.
    np.dtype("<M8[ns]"): pyorc.Timestamp(),
    np.dtype("<M8[ms]"): pyorc.Timestamp(),
    np.dtype("<M8[us]"): pyorc.Timestamp(),
}


def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None):
    obj._current_params = {}
    num_rows = obj._rand(obj._max_rows)
    num_cols = obj._rand(obj._max_columns)

    dtypes_meta = []

    for _ in range(num_cols):