def _numpy_and_codec_from_arrow_type(field_type):
    """Map a pyarrow data type to the numpy type used to represent its values.

    List types are resolved recursively to the numpy type of their value type.

    :param field_type: A pyarrow data type.
    :return: A numpy scalar type, or ``Decimal`` for Arrow decimal types.
    :raises ValueError: If ``field_type`` is not one of the handled types.
    """
    from pyarrow import types

    # ``np.str_`` and ``np.bytes_`` are the very same classes as the legacy
    # ``np.unicode_`` and ``np.string_`` aliases on NumPy 1.x; the legacy
    # spellings were removed in NumPy 2.0, so the canonical names are used.
    scalar_mappings = (
        (types.is_int8, np.int8),
        (types.is_int16, np.int16),
        (types.is_int32, np.int32),
        (types.is_int64, np.int64),
        (types.is_string, np.str_),
        (types.is_boolean, np.bool_),
        (types.is_float32, np.float32),
        (types.is_float64, np.float64),
        (types.is_decimal, Decimal),
        (types.is_binary, np.bytes_),
        (types.is_fixed_size_binary, np.bytes_),
        (types.is_date, np.datetime64),
        (types.is_timestamp, np.datetime64),
    )
    for matches, np_type in scalar_mappings:
        if matches(field_type):
            return np_type

    if types.is_list(field_type):
        # A list column is described by the numpy type of its elements.
        return _numpy_and_codec_from_arrow_type(field_type.value_type)

    raise ValueError('Cannot auto-create unischema due to unsupported column type {}'.format(field_type))
def convertPyArrowTypeToGlueType(pyarrowType: pa.DataType) -> str:
    """Return the Glue type string corresponding to a pyarrow data type.

    Checks are applied in order and the first match wins. Types that match
    none of the checks are returned as ``str(pyarrowType)``.

    Args:
        pyarrowType: The pyarrow data type to translate.

    Returns:
        The type name as a string (e.g. ``'string'``, ``'bigint'``,
        ``'double'``).
    """
    if (types.is_string(pyarrowType) or types.is_unicode(pyarrowType)
            or types.is_large_string(pyarrowType)
            or types.is_large_unicode(pyarrowType)):
        return 'string'
    if types.is_int64(pyarrowType) or types.is_uint64(pyarrowType):
        return 'bigint'
    if types.is_binary(pyarrowType):
        return 'binary'
    if types.is_boolean(pyarrowType):
        return 'boolean'
    if (types.is_date(pyarrowType) or types.is_date32(pyarrowType)
            or types.is_date64(pyarrowType)):
        return 'date'
    if types.is_decimal(pyarrowType):
        # NOTE(review): precision/scale are hard-coded and do not reflect the
        # Arrow type's own precision/scale - confirm this is intended.
        return 'decimal(16,2)'
    if types.is_float64(pyarrowType):
        # Previously this branch was the bare string expression
        # 'return double', which did nothing and fell through to the
        # str() fallback at the end of the function.
        return 'double'
    if types.is_float16(pyarrowType) or types.is_float32(pyarrowType):
        return 'float'
    if (types.is_int16(pyarrowType) or types.is_int32(pyarrowType)
            or types.is_uint16(pyarrowType) or types.is_uint32(pyarrowType)):
        return 'int'
    if types.is_map(pyarrowType):
        return 'map'
    if types.is_struct(pyarrowType):
        return 'struct'
    if types.is_timestamp(pyarrowType):
        return 'timestamp'
    if types.is_union(pyarrowType):
        return 'union'
    # Fallback: use pyarrow's own string rendering of the type.
    return str(pyarrowType)
def _is_zero_copy_only(pa_type: pa.DataType) -> bool:
    """
    Decide the ``zero_copy_only`` argument for ``.to_numpy()`` on a pyarrow array.

    Converting a pyarrow array to numpy can only skip the copy for some types,
    so the caller needs to know up front which value to pass. The answer is
    ``True`` for every primitive type except booleans, and ``False`` otherwise.

    # primitive types are types for which the physical representation is the same in arrow and in numpy
    # https://github.com/wesm/arrow/blob/c07b9b48cf3e0bbbab493992a492ae47e5b04cad/python/pyarrow/types.pxi#L821
    # see https://arrow.apache.org/docs/python/generated/pyarrow.Array.html#pyarrow.Array.to_numpy
    # and https://issues.apache.org/jira/browse/ARROW-2871?jql=text%20~%20%22boolean%20to_numpy%22
    """
    if not is_primitive(pa_type):
        return False
    # Booleans are primitive but still cannot be converted without a copy.
    return not is_boolean(pa_type)
def from_arrow_type(at):
    """ Convert pyarrow type to Spark data type.

    :param at: the pyarrow data type to convert.
    :return: the corresponding Spark data type instance. Dictionary types map
        to the Spark type of their value type; list, map and struct types are
        converted recursively.
    :raises TypeError: if the type is unsupported, if a list or map contains
        timestamps, if a struct contains a nested struct, or if a map type is
        used with pyarrow older than 2.0.0.
    """
    import pyarrow as pa
    import pyarrow.types as types

    if types.is_boolean(at):
        spark_type = BooleanType()
    elif types.is_int8(at):
        spark_type = ByteType()
    elif types.is_int16(at):
        spark_type = ShortType()
    elif types.is_int32(at):
        spark_type = IntegerType()
    elif types.is_int64(at):
        spark_type = LongType()
    elif types.is_float32(at):
        spark_type = FloatType()
    elif types.is_float64(at):
        spark_type = DoubleType()
    elif types.is_decimal(at):
        spark_type = DecimalType(precision=at.precision, scale=at.scale)
    elif types.is_string(at):
        spark_type = StringType()
    elif types.is_binary(at):
        spark_type = BinaryType()
    elif types.is_date32(at):
        spark_type = DateType()
    elif types.is_timestamp(at):
        spark_type = TimestampType()
    elif types.is_list(at):
        if types.is_timestamp(at.value_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = ArrayType(from_arrow_type(at.value_type))
    elif types.is_map(at):
        # Only the major version matters for the ">= 2.0.0" requirement, so it
        # is compared directly. This avoids distutils.version.LooseVersion:
        # distutils was removed from the standard library in Python 3.12, and
        # importing it unconditionally broke every call, not just map types.
        if int(pa.__version__.split(".")[0]) < 2:
            raise TypeError("MapType is only supported with pyarrow 2.0.0 and above")
        if types.is_timestamp(at.key_type) or types.is_timestamp(at.item_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = MapType(from_arrow_type(at.key_type), from_arrow_type(at.item_type))
    elif types.is_struct(at):
        if any(types.is_struct(field.type) for field in at):
            raise TypeError("Nested StructType not supported in conversion from Arrow: " + str(at))
        spark_type = StructType(
            [StructField(field.name, from_arrow_type(field.type), nullable=field.nullable)
             for field in at])
    elif types.is_dictionary(at):
        # Dictionary-encoded columns are described by their value type.
        spark_type = from_arrow_type(at.value_type)
    elif types.is_null(at):
        spark_type = NullType()
    else:
        raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
    return spark_type
def _numpy_and_codec_from_arrow_type(field_type):
    """Derive the codec and numpy type for a pyarrow data type.

    For list types the numpy type is that of the list's value type (resolved
    recursively) and no codec is assigned.

    :param field_type: A pyarrow data type.
    :return: A ``(codec, np_type)`` tuple. ``codec`` is a ``ScalarCodec``
        wrapping the matching Spark type, or ``None`` for list types.
        ``np_type`` is a numpy scalar type, or ``Decimal`` for decimal types.
    :raises ValueError: If ``field_type`` is not one of the handled types.
    """
    from pyarrow import types

    if types.is_int8(field_type):
        np_type = np.int8
        codec = ScalarCodec(ByteType())
    elif types.is_int16(field_type):
        np_type = np.int16
        codec = ScalarCodec(ShortType())
    elif types.is_int32(field_type):
        np_type = np.int32
        codec = ScalarCodec(IntegerType())
    elif types.is_int64(field_type):
        np_type = np.int64
        codec = ScalarCodec(LongType())
    elif types.is_string(field_type):
        # np.str_ is the same class as the legacy np.unicode_ alias on
        # NumPy 1.x; the alias was removed in NumPy 2.0.
        np_type = np.str_
        codec = ScalarCodec(StringType())
    elif types.is_boolean(field_type):
        np_type = np.bool_
        codec = ScalarCodec(BooleanType())
    elif types.is_float32(field_type):
        np_type = np.float32
        codec = ScalarCodec(FloatType())
    elif types.is_float64(field_type):
        np_type = np.float64
        codec = ScalarCodec(DoubleType())
    elif types.is_decimal(field_type):
        np_type = Decimal
        codec = ScalarCodec(DecimalType(field_type.precision, field_type.scale))
    elif types.is_binary(field_type) or types.is_fixed_size_binary(field_type):
        # np.bytes_ is the same class as the legacy np.string_ alias on
        # NumPy 1.x; the alias was removed in NumPy 2.0.
        np_type = np.bytes_
        codec = ScalarCodec(StringType())
    elif types.is_date(field_type):
        np_type = np.datetime64
        codec = ScalarCodec(DateType())
    elif types.is_timestamp(field_type):
        np_type = np.datetime64
        codec = ScalarCodec(TimestampType())
    elif types.is_list(field_type):
        # Only the element numpy type is kept; the element codec is discarded.
        _, np_type = _numpy_and_codec_from_arrow_type(field_type.value_type)
        codec = None
    else:
        raise ValueError(
            'Cannot auto-create unischema due to unsupported column type {}'.format(field_type))
    return codec, np_type
def from_arrow_type(at):
    """ Translate a pyarrow data type into the matching Spark data type.

    List and struct types are converted recursively; dictionary types are
    converted via their value type. A ``TypeError`` is raised for unsupported
    types, for lists of timestamps and for structs that contain structs.
    """
    import pyarrow.types as types

    unsupported_message = "Unsupported type in conversion from Arrow: "

    # Scalar types: each check returns as soon as it matches.
    if types.is_boolean(at):
        return BooleanType()
    if types.is_int8(at):
        return ByteType()
    if types.is_int16(at):
        return ShortType()
    if types.is_int32(at):
        return IntegerType()
    if types.is_int64(at):
        return LongType()
    if types.is_float32(at):
        return FloatType()
    if types.is_float64(at):
        return DoubleType()
    if types.is_decimal(at):
        return DecimalType(precision=at.precision, scale=at.scale)
    if types.is_string(at):
        return StringType()
    if types.is_binary(at):
        return BinaryType()
    if types.is_date32(at):
        return DateType()
    if types.is_timestamp(at):
        return TimestampType()

    # Container types.
    if types.is_list(at):
        element_type = at.value_type
        if types.is_timestamp(element_type):
            raise TypeError(unsupported_message + str(at))
        return ArrayType(from_arrow_type(element_type))

    if types.is_struct(at):
        for member in at:
            if types.is_struct(member.type):
                raise TypeError(
                    "Nested StructType not supported in conversion from Arrow: " + str(at))
        struct_fields = []
        for member in at:
            struct_fields.append(
                StructField(member.name, from_arrow_type(member.type), nullable=member.nullable))
        return StructType(struct_fields)

    if types.is_dictionary(at):
        return from_arrow_type(at.value_type)

    raise TypeError(unsupported_message + str(at))
def to_numpy(self):
    """Return the array's contents as a numpy array of shape ``(len(self), *self.type.shape)``.

    The storage is flattened once per dimension of the extension type,
    converted to numpy, then reshaped.
    """
    storage: pa.ListArray = self.storage
    # Flatten once per dimension. The previous per-dimension ``size``
    # accumulator was never read, so it has been removed.
    for _ in range(self.type.ndims):
        storage = storage.flatten()

    # zero copy is available for all primitive types except booleans
    # https://github.com/wesm/arrow/blob/c07b9b48cf3e0bbbab493992a492ae47e5b04cad/python/pyarrow/types.pxi#L821
    # see https://arrow.apache.org/docs/python/generated/pyarrow.Array.html#pyarrow.Array.to_numpy
    # and https://issues.apache.org/jira/browse/ARROW-2871?jql=text%20~%20%22boolean%20to_numpy%22
    zero_copy_only = is_primitive(storage.type) and not is_boolean(storage.type)
    numpy_arr = storage.to_numpy(zero_copy_only=zero_copy_only)
    return numpy_arr.reshape(len(self), *self.type.shape)
def test_is_boolean():
    """``types.is_boolean`` accepts the bool type and rejects int8."""
    boolean_type = pa.bool_()
    assert types.is_boolean(boolean_type)

    non_boolean_type = pa.int8()
    assert not types.is_boolean(non_boolean_type)
def is_possible_feature(arrow_type: DataType) -> bool:
    """Tell whether a data type could be an ML feature.

    A type qualifies when it is boolean, string, or numeric (per ``is_num``).
    """
    # Explicit short-circuit: stop at the first check that succeeds.
    verdict = is_boolean(arrow_type)
    if not verdict:
        verdict = is_string(arrow_type)
    if not verdict:
        verdict = is_num(arrow_type)
    return verdict
def is_possible_cat(arrow_type: DataType, /) -> bool:
    """Tell whether a data type could be categorical.

    A type qualifies when it is boolean, string, or numeric (per ``is_num``).
    """
    for type_check in (is_boolean, is_string):
        outcome = type_check(arrow_type)
        if outcome:
            return outcome
    # Neither boolean nor string: the numeric check decides.
    return is_num(arrow_type)
def is_possible_cat(arrow_type):
    """Return whether the type is boolean, string, or numeric (per ``is_num``)."""
    boolean_hit = is_boolean(arrow_type)
    if boolean_hit:
        return boolean_hit

    string_hit = is_string(arrow_type)
    if string_hit:
        return string_hit

    return is_num(arrow_type)