def _numpy_and_codec_from_arrow_type(field_type):
    """Return the numpy (or Python) type used to hold values of a pyarrow type.

    List types are resolved recursively to the type of their elements.

    NOTE(review): a function with this same name is defined again later in
    this file and returns a ``(codec, np_type)`` tuple; if both definitions
    live in one module the later one shadows this one -- confirm which is
    intended.

    :param field_type: a pyarrow data type.
    :return: a numpy scalar type, or ``Decimal`` for decimal columns.
    :raises ValueError: if ``field_type`` is not one of the handled types.
    """
    from pyarrow import types

    # np.str_ / np.bytes_ replace np.unicode_ / np.string_: the old aliases
    # were removed in NumPy 2.0, while the new names work on older releases.
    scalar_types = (
        (types.is_int8, np.int8),
        (types.is_int16, np.int16),
        (types.is_int32, np.int32),
        (types.is_int64, np.int64),
        (types.is_string, np.str_),
        (types.is_boolean, np.bool_),
        (types.is_float32, np.float32),
        (types.is_float64, np.float64),
        (types.is_binary, np.bytes_),
        (types.is_fixed_size_binary, np.bytes_),
        (types.is_date, np.datetime64),
        (types.is_timestamp, np.datetime64),
    )
    for matches, np_type in scalar_types:
        if matches(field_type):
            return np_type

    if types.is_decimal(field_type):
        return Decimal

    if types.is_list(field_type):
        # A list column is described by the type of its elements.
        return _numpy_and_codec_from_arrow_type(field_type.value_type)

    raise ValueError('Cannot auto-create unischema due to unsupported column type {}'.format(field_type))
def convertPyArrowTypeToGlueType(pyarrowType: pa.DataType) -> str:
    """Return the Glue type name that corresponds to a pyarrow data type.

    The checks are applied in order and the first match wins. Types that
    match none of the checks (for example 8-bit integers) are returned as
    ``str(pyarrowType)``.

    :param pyarrowType: the pyarrow data type to translate.
    :return: the Glue type name as a string.
    """
    if (types.is_string(pyarrowType)
            or types.is_unicode(pyarrowType)
            or types.is_large_string(pyarrowType)
            or types.is_large_unicode(pyarrowType)):
        return 'string'
    if types.is_int64(pyarrowType) or types.is_uint64(pyarrowType):
        return 'bigint'
    if types.is_binary(pyarrowType):
        return 'binary'
    if types.is_boolean(pyarrowType):
        return 'boolean'
    if (types.is_date(pyarrowType)
            or types.is_date32(pyarrowType)
            or types.is_date64(pyarrowType)):
        return 'date'
    if types.is_decimal(pyarrowType):
        # NOTE(review): precision/scale are hard-coded rather than taken from
        # the pyarrow type -- confirm this is intended.
        return 'decimal(16,2)'
    if types.is_float64(pyarrowType):
        # This branch used to contain the bare string 'return double', which
        # is a no-op expression; return the type name explicitly instead.
        return 'double'
    if types.is_float16(pyarrowType) or types.is_float32(pyarrowType):
        return 'float'
    if (types.is_int16(pyarrowType)
            or types.is_int32(pyarrowType)
            or types.is_uint16(pyarrowType)
            or types.is_uint32(pyarrowType)):
        return 'int'
    if types.is_map(pyarrowType):
        return 'map'
    if types.is_struct(pyarrowType):
        return 'struct'
    if types.is_timestamp(pyarrowType):
        return 'timestamp'
    if types.is_union(pyarrowType):
        return 'union'
    # No explicit mapping: fall back to pyarrow's own name for the type.
    return str(pyarrowType)
def _numpy_and_codec_from_arrow_type(field_type):
    """Return the ``(codec, numpy type)`` pair for a pyarrow data type.

    Scalar types get a ``ScalarCodec`` wrapping the matching type object.
    List types get ``None`` as the codec and the numpy type of their
    elements, resolved recursively.

    :param field_type: a pyarrow data type.
    :return: tuple ``(codec, np_type)``; ``codec`` is ``None`` for lists and
        ``np_type`` is ``Decimal`` for decimal columns.
    :raises ValueError: if ``field_type`` is not one of the handled types.
    """
    from pyarrow import types

    if types.is_int8(field_type):
        return ScalarCodec(ByteType()), np.int8
    if types.is_int16(field_type):
        return ScalarCodec(ShortType()), np.int16
    if types.is_int32(field_type):
        return ScalarCodec(IntegerType()), np.int32
    if types.is_int64(field_type):
        return ScalarCodec(LongType()), np.int64
    if types.is_string(field_type):
        # np.str_ replaces np.unicode_, which was removed in NumPy 2.0.
        return ScalarCodec(StringType()), np.str_
    if types.is_boolean(field_type):
        return ScalarCodec(BooleanType()), np.bool_
    if types.is_float32(field_type):
        return ScalarCodec(FloatType()), np.float32
    if types.is_float64(field_type):
        return ScalarCodec(DoubleType()), np.float64
    if types.is_decimal(field_type):
        # Carry the column's own precision and scale into the codec.
        decimal_codec = ScalarCodec(DecimalType(field_type.precision, field_type.scale))
        return decimal_codec, Decimal
    if types.is_binary(field_type) or types.is_fixed_size_binary(field_type):
        # np.bytes_ replaces np.string_, which was removed in NumPy 2.0.
        return ScalarCodec(StringType()), np.bytes_
    if types.is_date(field_type):
        return ScalarCodec(DateType()), np.datetime64
    if types.is_timestamp(field_type):
        return ScalarCodec(TimestampType()), np.datetime64
    if types.is_list(field_type):
        # Only the element numpy type is kept; lists carry no codec.
        _, element_np_type = _numpy_and_codec_from_arrow_type(field_type.value_type)
        return None, element_np_type

    raise ValueError(
        'Cannot auto-create unischema due to unsupported column type {}'.format(field_type))
def test_is_temporal_date_time_timestamp():
    """Check the temporal predicates on date, time and timestamp types.

    Every date, time and timestamp type must be temporal, each must satisfy
    only its own family's predicate, and a plain integer type must not be
    temporal.
    """
    dates = [pa.date32(), pa.date64()]
    times = [pa.time32('s'), pa.time64('ns')]
    timestamps = [pa.timestamp('ms')]

    for temporal_type in dates + times + timestamps:
        assert types.is_temporal(temporal_type)

    # (members, predicate that must hold, predicates that must not hold)
    families = (
        (dates, types.is_date, (types.is_time, types.is_timestamp)),
        (times, types.is_time, (types.is_date, types.is_timestamp)),
        (timestamps, types.is_timestamp, (types.is_date, types.is_time)),
    )
    for members, own_predicate, other_predicates in families:
        for member in members:
            assert own_predicate(member)
            for other_predicate in other_predicates:
                assert not other_predicate(member)

    assert not types.is_temporal(pa.int32())